{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "# Bab 07 — Supervised Learning Playground\n",
        "\n",
        "Notebook pendamping Bab 7. Semua implementasi manual memakai Python standard library.\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "#!/usr/bin/env python3\n",
        "\"\"\"Bab 07 — Supervised Learning Playground.\n",
        "\n",
        "Standard library only. Educational implementations, not production replacements.\n",
        "Compares: majority baseline, kNN, Gaussian Naive Bayes, logistic regression,\n",
        "decision stump, ensemble stumps, and perceptron.\n",
        "\"\"\"\n",
        "from __future__ import annotations\n",
        "\n",
        "import math\n",
        "import random\n",
        "from statistics import mean, pstdev\n",
        "\n",
        "# Fixed seed for the module-level RNG, which make_dataset and train_test_split draw from.\n",
        "SEED = 42\n",
        "random.seed(SEED)\n",
        "\n",
        "# Alias for one feature row: a plain list of floats.\n",
        "Vector = list[float]\n",
        "\n",
        "\n",
        "def make_dataset(n: int = 80) -> list[dict[str, float | int]]:\n",
        "    \"\"\"Generate ``n`` synthetic product records using the module-level RNG.\n",
        "\n",
        "    Every record carries four features (rating, discount, clicks, price_rel),\n",
        "    a binary ``bought`` label (noisy linear score above 4.7) and an integer\n",
        "    ``demand`` target that is clipped at zero.\n",
        "    \"\"\"\n",
        "    dataset = []\n",
        "    for _ in range(n):\n",
        "        # The order of these draws fixes the random stream, so it must not change.\n",
        "        rating = round(random.uniform(2.5, 5.0), 2)\n",
        "        discount = round(random.uniform(0.0, 0.45), 2)\n",
        "        clicks = random.randint(0, 12)\n",
        "        price_rel = round(random.uniform(0.2, 1.0), 2)\n",
        "        noise = random.uniform(-0.35, 0.35)\n",
        "\n",
        "        purchase_score = 1.25 * rating + 3.0 * discount + 0.23 * clicks - 2.2 * price_rel + noise\n",
        "        raw_demand = 8 + 8 * rating + 30 * discount + 2 * clicks - 12 * price_rel + noise * 5\n",
        "        record = {\n",
        "            \"rating\": rating,\n",
        "            \"discount\": discount,\n",
        "            \"clicks\": clicks,\n",
        "            \"price_rel\": price_rel,\n",
        "            \"bought\": int(purchase_score > 4.7),\n",
        "            \"demand\": max(0, int(raw_demand)),\n",
        "        }\n",
        "        dataset.append(record)\n",
        "    return dataset\n",
        "\n",
        "\n",
        "def features(row: dict[str, float | int]) -> Vector:\n",
        "    \"\"\"Return the four model inputs of ``row`` as floats, in a fixed column order.\"\"\"\n",
        "    columns = (\"rating\", \"discount\", \"clicks\", \"price_rel\")\n",
        "    return [float(row[name]) for name in columns]\n",
        "\n",
        "\n",
        "def labels(rows: list[dict[str, float | int]]) -> list[int]:\n",
        "    \"\"\"Collect the ``bought`` field of every row as an int, preserving row order.\"\"\"\n",
        "    targets = []\n",
        "    for record in rows:\n",
        "        targets.append(int(record[\"bought\"]))\n",
        "    return targets\n",
        "\n",
        "\n",
        "def train_test_split(rows: list[dict[str, float | int]], test_ratio: float = 0.25) -> tuple[list[dict[str, float | int]], list[dict[str, float | int]]]:\n",
        "    \"\"\"Shuffle a copy of ``rows`` with the module-level RNG and cut it in two.\n",
        "\n",
        "    The first ``int(len(rows) * (1 - test_ratio))`` shuffled rows become the\n",
        "    training part and the remainder the test part. ``rows`` is left untouched.\n",
        "    \"\"\"\n",
        "    shuffled = rows[:]\n",
        "    random.shuffle(shuffled)\n",
        "    n_train = int(len(shuffled) * (1 - test_ratio))\n",
        "    train_part = shuffled[:n_train]\n",
        "    test_part = shuffled[n_train:]\n",
        "    return train_part, test_part\n",
        "\n",
        "\n",
        "def standardize_fit(x_train: list[Vector]) -> tuple[Vector, Vector]:\n",
        "    \"\"\"Compute the per-column mean and population standard deviation.\n",
        "\n",
        "    A column whose standard deviation is zero gets 1.0 instead, so the values\n",
        "    can be used as divisors without a zero check.\n",
        "    \"\"\"\n",
        "    mus = []\n",
        "    sigmas = []\n",
        "    for column in zip(*x_train):\n",
        "        mus.append(mean(column))\n",
        "        spread = pstdev(column)\n",
        "        sigmas.append(spread if spread else 1.0)\n",
        "    return mus, sigmas\n",
        "\n",
        "\n",
        "def standardize_transform(x_rows: list[Vector], mus: Vector, sigmas: Vector) -> list[Vector]:\n",
        "    \"\"\"Apply ``(value - mean) / std`` column-wise to every row and return new rows.\"\"\"\n",
        "    scaled = []\n",
        "    for row in x_rows:\n",
        "        scaled.append([(value - mu) / sigma for value, mu, sigma in zip(row, mus, sigmas)])\n",
        "    return scaled\n",
        "\n",
        "\n",
        "def confusion_matrix(y_true: list[int], y_pred: list[int]) -> dict[str, int]:\n",
        "    \"\"\"Count TP/FP/TN/FN for binary labels, with 1 as the positive class.\n",
        "\n",
        "    Labels are paired with ``zip``; a pair whose values are not 0 or 1 is not\n",
        "    counted in any cell.\n",
        "    \"\"\"\n",
        "    tp = fp = tn = fn = 0\n",
        "    for actual, predicted in zip(y_true, y_pred):\n",
        "        if predicted == 1:\n",
        "            if actual == 1:\n",
        "                tp += 1\n",
        "            elif actual == 0:\n",
        "                fp += 1\n",
        "        elif predicted == 0:\n",
        "            if actual == 0:\n",
        "                tn += 1\n",
        "            elif actual == 1:\n",
        "                fn += 1\n",
        "    return {\"TP\": tp, \"FP\": fp, \"TN\": tn, \"FN\": fn}\n",
        "\n",
        "\n",
        "def safe_div(a: float, b: float) -> float:\n",
        "    \"\"\"Divide ``a`` by ``b``, returning 0.0 instead of raising when ``b`` is zero.\"\"\"\n",
        "    if b == 0:\n",
        "        return 0.0\n",
        "    return a / b\n",
        "\n",
        "\n",
        "def metrics(y_true: list[int], y_pred: list[int]) -> dict[str, float | int]:\n",
        "    \"\"\"Return the confusion-matrix counts plus accuracy, precision, recall and F1.\n",
        "\n",
        "    Every ratio goes through ``safe_div``, so an empty denominator yields 0.0\n",
        "    rather than an exception.\n",
        "    \"\"\"\n",
        "    counts = confusion_matrix(y_true, y_pred)\n",
        "    true_pos, false_pos = counts[\"TP\"], counts[\"FP\"]\n",
        "    true_neg, false_neg = counts[\"TN\"], counts[\"FN\"]\n",
        "    total = true_pos + false_pos + true_neg + false_neg\n",
        "    precision = safe_div(true_pos, true_pos + false_pos)\n",
        "    recall = safe_div(true_pos, true_pos + false_neg)\n",
        "\n",
        "    # Start from the raw counts so the key order stays TP, FP, TN, FN, then the ratios.\n",
        "    report = dict(counts)\n",
        "    report[\"accuracy\"] = safe_div(true_pos + true_neg, total)\n",
        "    report[\"precision\"] = precision\n",
        "    report[\"recall\"] = recall\n",
        "    report[\"f1\"] = safe_div(2 * precision * recall, precision + recall)\n",
        "    return report\n",
        "\n",
        "\n",
        "def print_metrics(name: str, y_true: list[int], y_pred: list[int]) -> None:\n",
        "    \"\"\"Print a titled, underlined metric report for one model.\n",
        "\n",
        "    Float values are shown with three decimals; integer counts are shown as-is.\n",
        "    \"\"\"\n",
        "    report = metrics(y_true, y_pred)\n",
        "    print(f\"\\n{name}\")\n",
        "    print(\"-\" * len(name))\n",
        "    for key in (\"TP\", \"FP\", \"TN\", \"FN\", \"accuracy\", \"precision\", \"recall\", \"f1\"):\n",
        "        value = report[key]\n",
        "        if isinstance(value, float):\n",
        "            line = f\"{key:>9}: {value:.3f}\"\n",
        "        else:\n",
        "            line = f\"{key:>9}: {value}\"\n",
        "        print(line)\n",
        "\n",
        "\n",
        "def majority_predict(y_train: list[int], n: int) -> list[int]:\n",
        "    \"\"\"Predict the most frequent training label for ``n`` samples.\n",
        "\n",
        "    A tie, or an empty training set, resolves to class 1.\n",
        "    \"\"\"\n",
        "    positives = sum(y_train)\n",
        "    half = len(y_train) / 2\n",
        "    majority = 1 if positives >= half else 0\n",
        "    return [majority] * n\n",
        "\n",
        "\n",
        "def euclidean(a: Vector, b: Vector) -> float:\n",
        "    \"\"\"Return the Euclidean (L2) distance between two vectors.\n",
        "\n",
        "    Coordinates are paired with ``zip``, so extra trailing coordinates of the\n",
        "    longer vector are ignored.\n",
        "    \"\"\"\n",
        "    squared_gaps = ((left - right) ** 2 for left, right in zip(a, b))\n",
        "    return math.sqrt(sum(squared_gaps))\n",
        "\n",
        "\n",
        "def knn_predict(x_train: list[Vector], y_train: list[int], x_test: list[Vector], k: int = 5) -> list[int]:\n",
        "    \"\"\"Classify each test row by majority vote among its ``k`` nearest training rows.\n",
        "\n",
        "    Distance is ``euclidean``. A tie goes to class 1. When ``k`` is larger than\n",
        "    the number of training rows, all of them vote and the majority is taken over\n",
        "    the rows that actually voted rather than over ``k``. With no training rows\n",
        "    every prediction is 0.\n",
        "\n",
        "    Raises:\n",
        "        ValueError: if ``k`` is smaller than 1.\n",
        "    \"\"\"\n",
        "    if k < 1:\n",
        "        raise ValueError(\"k must be at least 1\")\n",
        "    # The (features, label) pairing does not depend on the test row, so build it once.\n",
        "    labelled = list(zip(x_train, y_train))\n",
        "    preds = []\n",
        "    for row in x_test:\n",
        "        neighbors = sorted(labelled, key=lambda pair: euclidean(row, pair[0]))[:k]\n",
        "        positives = sum(label for _, label in neighbors)\n",
        "        # Compare against the number of voters, which can be smaller than k.\n",
        "        preds.append(1 if neighbors and positives >= len(neighbors) / 2 else 0)\n",
        "    return preds\n",
        "\n",
        "\n",
        "def gaussian_nb_fit(x_train: list[Vector], y_train: list[int]) -> dict[int, tuple[float, Vector, Vector]]:\n",
        "    \"\"\"Fit Gaussian Naive Bayes: per class prior, feature means and standard deviations.\n",
        "\n",
        "    Only classes that actually occur in the training labels get an entry. A\n",
        "    class without rows would otherwise be stored with no feature statistics and\n",
        "    a zero prior, which gives it a constant score that can beat the real class\n",
        "    for unusual rows. A zero standard deviation is floored to 1e-6 so the\n",
        "    log-density stays finite.\n",
        "\n",
        "    Returns:\n",
        "        Mapping ``class -> (prior, means, std_devs)``, inserted in ascending\n",
        "        class order.\n",
        "\n",
        "    Raises:\n",
        "        ValueError: if there is no (row, label) pair to learn from.\n",
        "    \"\"\"\n",
        "    pairs = list(zip(x_train, y_train))\n",
        "    if not pairs:\n",
        "        raise ValueError(\"x_train and y_train must not be empty\")\n",
        "    total = len(pairs)\n",
        "    model = {}\n",
        "    # Iterate over the observed labels, so every fitted class has at least one row.\n",
        "    for cls in sorted({label for _, label in pairs}):\n",
        "        rows = [x for x, label in pairs if label == cls]\n",
        "        cols = list(zip(*rows))\n",
        "        mus = [mean(c) for c in cols]\n",
        "        sigmas = [pstdev(c) or 1e-6 for c in cols]\n",
        "        model[cls] = (len(rows) / total, mus, sigmas)\n",
        "    return model\n",
        "\n",
        "\n",
        "def normal_log_pdf(x: float, mu: float, sigma: float) -> float:\n",
        "    \"\"\"Log-density of a normal distribution at ``x``, up to an additive constant.\n",
        "\n",
        "    The ``-0.5 * log(2 * pi)`` term is left out.\n",
        "    \"\"\"\n",
        "    log_scale = math.log(sigma)\n",
        "    deviation = x - mu\n",
        "    return -log_scale - (deviation ** 2) / (2 * sigma * sigma)\n",
        "\n",
        "\n",
        "def gaussian_nb_predict(model: dict[int, tuple[float, Vector, Vector]], x_test: list[Vector]) -> list[int]:\n",
        "    \"\"\"Predict, for each test row, the class with the highest log-posterior score.\n",
        "\n",
        "    The score of a class is ``log(prior + 1e-12)`` plus the sum of the\n",
        "    per-feature ``normal_log_pdf`` values. On equal scores the class stored\n",
        "    first in ``model`` wins.\n",
        "    \"\"\"\n",
        "\n",
        "    def log_posterior(row, params):\n",
        "        prior, mus, sigmas = params\n",
        "        return math.log(prior + 1e-12) + sum(normal_log_pdf(v, m, s) for v, m, s in zip(row, mus, sigmas))\n",
        "\n",
        "    preds = []\n",
        "    for row in x_test:\n",
        "        scores = {cls: log_posterior(row, params) for cls, params in model.items()}\n",
        "        preds.append(max(scores, key=scores.get))\n",
        "    return preds\n",
        "\n",
        "\n",
        "def sigmoid(z: float) -> float:\n",
        "    \"\"\"Logistic function ``1 / (1 + e^-z)`` with ``z`` clamped to [-40, 40].\n",
        "\n",
        "    The clamp keeps ``math.exp`` away from overflow for large negative inputs.\n",
        "    \"\"\"\n",
        "    capped = min(40, z)\n",
        "    clamped = max(-40, capped)\n",
        "    return 1 / (1 + math.exp(-clamped))\n",
        "\n",
        "\n",
        "def logistic_fit(x_train: list[Vector], y_train: list[int], lr: float = 0.15, epochs: int = 300) -> tuple[Vector, float]:\n",
        "    \"\"\"Train logistic regression with full-batch gradient descent.\n",
        "\n",
        "    Weights and bias start at zero. Each epoch accumulates the gradient of the\n",
        "    log-loss over all training rows, averages it, and takes one step of size\n",
        "    ``lr``. Returns ``(weights, bias)``.\n",
        "    \"\"\"\n",
        "    n_features = len(x_train[0])\n",
        "    n_samples = len(x_train)\n",
        "    w = [0.0] * n_features\n",
        "    b = 0.0\n",
        "    for _ in range(epochs):\n",
        "        grad_w = [0.0] * n_features\n",
        "        grad_b = 0.0\n",
        "        for x, y in zip(x_train, y_train):\n",
        "            # Residual between the predicted probability and the 0/1 target.\n",
        "            residual = sigmoid(sum(wi * xi for wi, xi in zip(w, x)) + b) - y\n",
        "            for j in range(n_features):\n",
        "                grad_w[j] += residual * x[j]\n",
        "            grad_b += residual\n",
        "        w = [wi - lr * gw / n_samples for wi, gw in zip(w, grad_w)]\n",
        "        b -= lr * grad_b / n_samples\n",
        "    return w, b\n",
        "\n",
        "\n",
        "def logistic_predict(w: Vector, b: float, x_test: list[Vector], threshold: float = 0.5) -> list[int]:\n",
        "    \"\"\"Return 1 for each row whose predicted probability is at least ``threshold``, else 0.\"\"\"\n",
        "    preds = []\n",
        "    for x in x_test:\n",
        "        probability = sigmoid(sum(wi * xi for wi, xi in zip(w, x)) + b)\n",
        "        preds.append(int(probability >= threshold))\n",
        "    return preds\n",
        "\n",
        "\n",
        "def stump_fit(x_train: list[Vector], y_train: list[int]) -> tuple[int, float, int]:\n",
        "    \"\"\"Search every feature, observed threshold and polarity for the most accurate rule.\n",
        "\n",
        "    A rule predicts 1 when ``polarity * value >= polarity * threshold``.\n",
        "    Candidates are tried in feature order, ascending threshold order and\n",
        "    polarity order (1, then -1); only a strictly better training accuracy\n",
        "    replaces the current best. Returns ``(feature_index, threshold, polarity)``.\n",
        "    \"\"\"\n",
        "    n_labels = len(y_train)\n",
        "    best_rule = (0, 0.0, 1)\n",
        "    best_acc = -1.0\n",
        "    for feature_idx in range(len(x_train[0])):\n",
        "        column = [row[feature_idx] for row in x_train]\n",
        "        for threshold in sorted(set(column)):\n",
        "            for polarity in (1, -1):\n",
        "                hits = 0\n",
        "                for value, y in zip(column, y_train):\n",
        "                    guess = 1 if polarity * value >= polarity * threshold else 0\n",
        "                    if guess == y:\n",
        "                        hits += 1\n",
        "                acc = hits / n_labels\n",
        "                if acc > best_acc:\n",
        "                    best_acc = acc\n",
        "                    best_rule = (feature_idx, threshold, polarity)\n",
        "    return best_rule\n",
        "\n",
        "\n",
        "def stump_predict(stump: tuple[int, float, int], x_test: list[Vector]) -> list[int]:\n",
        "    \"\"\"Apply a ``(feature_index, threshold, polarity)`` rule to every test row.\n",
        "\n",
        "    A row is labelled 1 when ``polarity * value >= polarity * threshold``.\n",
        "    \"\"\"\n",
        "    feature_idx, threshold, polarity = stump\n",
        "    preds = []\n",
        "    for row in x_test:\n",
        "        fires = polarity * row[feature_idx] >= polarity * threshold\n",
        "        preds.append(1 if fires else 0)\n",
        "    return preds\n",
        "\n",
        "\n",
        "def ensemble_stumps_predict(x_train: list[Vector], y_train: list[int], x_test: list[Vector]) -> list[int]:\n",
        "    \"\"\"Majority vote of five decision stumps, each fitted on a bootstrap resample.\n",
        "\n",
        "    The resamples are drawn with seeds 1 to 5. Each one uses its own\n",
        "    ``random.Random(seed)`` generator, which yields the same indices as\n",
        "    reseeding the module-level RNG would, but leaves that shared RNG untouched\n",
        "    for whatever runs afterwards. A test row is labelled 1 when at least half of\n",
        "    the stumps vote 1.\n",
        "    \"\"\"\n",
        "    n_train = len(x_train)\n",
        "    stumps = []\n",
        "    for seed in [1, 2, 3, 4, 5]:\n",
        "        rng = random.Random(seed)\n",
        "        # Sample row indices with replacement, as many as there are training rows.\n",
        "        sample_idx = [rng.randrange(n_train) for _ in range(n_train)]\n",
        "        xs = [x_train[i] for i in sample_idx]\n",
        "        ys = [y_train[i] for i in sample_idx]\n",
        "        stumps.append(stump_fit(xs, ys))\n",
        "    votes = [stump_predict(stump, x_test) for stump in stumps]\n",
        "    return [1 if sum(vote[i] for vote in votes) >= len(stumps) / 2 else 0 for i in range(len(x_test))]\n",
        "\n",
        "\n",
        "def perceptron_fit(x_train: list[Vector], y_train: list[int], lr: float = 0.1, epochs: int = 30) -> tuple[Vector, float]:\n",
        "    \"\"\"Train a perceptron with the classic mistake-driven update rule.\n",
        "\n",
        "    Labels are mapped to +1 (for 1) and -1 (for anything else). A row triggers\n",
        "    an update whenever ``target * score <= 0``, so a score of exactly zero also\n",
        "    counts as a mistake. Returns ``(weights, bias)``.\n",
        "    \"\"\"\n",
        "    w = [0.0] * len(x_train[0])\n",
        "    b = 0.0\n",
        "    samples = [(x, 1 if y == 1 else -1) for x, y in zip(x_train, y_train)]\n",
        "    for _ in range(epochs):\n",
        "        for x, target in samples:\n",
        "            activation = sum(wi * xi for wi, xi in zip(w, x)) + b\n",
        "            misclassified = target * activation <= 0\n",
        "            if not misclassified:\n",
        "                continue\n",
        "            step = lr * target\n",
        "            w = [wi + step * xi for wi, xi in zip(w, x)]\n",
        "            b += step\n",
        "    return w, b\n",
        "\n",
        "\n",
        "def perceptron_predict(w: Vector, b: float, x_test: list[Vector]) -> list[int]:\n",
        "    \"\"\"Label each row 1 when its linear score ``w . x + b`` is non-negative, else 0.\"\"\"\n",
        "    preds = []\n",
        "    for x in x_test:\n",
        "        activation = sum(wi * xi for wi, xi in zip(w, x)) + b\n",
        "        preds.append(1 if activation >= 0 else 0)\n",
        "    return preds\n",
        "\n",
        "\n",
        "def mae(preds: list[float], actuals: list[float]) -> float:\n",
        "    \"\"\"Mean absolute error between predictions and actual values, paired by position.\"\"\"\n",
        "    abs_errors = [abs(p - a) for p, a in zip(preds, actuals)]\n",
        "    return mean(abs_errors)\n",
        "\n",
        "\n",
        "def main() -> None:\n",
        "    \"\"\"Run the complete playground end to end and print every report.\n",
        "\n",
        "    Builds the dataset, splits and standardizes it, evaluates each classifier on\n",
        "    the test split, and finishes with the constant-demand regression baseline.\n",
        "    This cell only defines the function; it does not call it.\n",
        "    \"\"\"\n",
        "    print(\"Bab 07 — Supervised Learning Playground\")\n",
        "    print(\"=\" * 68)\n",
        "\n",
        "    # Data preparation: the scaling statistics come from the training split only.\n",
        "    train, test = train_test_split(make_dataset())\n",
        "    x_train_raw = [features(r) for r in train]\n",
        "    x_test_raw = [features(r) for r in test]\n",
        "    y_train = labels(train)\n",
        "    y_test = labels(test)\n",
        "    mus, sigmas = standardize_fit(x_train_raw)\n",
        "    x_train = standardize_transform(x_train_raw, mus, sigmas)\n",
        "    x_test = standardize_transform(x_test_raw, mus, sigmas)\n",
        "    print(f\"data: train={len(train)} test={len(test)} positif_train={sum(y_train)} positif_test={sum(y_test)}\")\n",
        "\n",
        "    # Classifiers, every one scored on the same test split.\n",
        "    baseline_pred = majority_predict(y_train, len(test))\n",
        "    print_metrics(\"Majority baseline\", y_test, baseline_pred)\n",
        "    for k in (1, 3, 5):\n",
        "        print_metrics(f\"kNN k={k}\", y_test, knn_predict(x_train, y_train, x_test, k=k))\n",
        "    nb_model = gaussian_nb_fit(x_train, y_train)\n",
        "    print_metrics(\"Gaussian Naive Bayes\", y_test, gaussian_nb_predict(nb_model, x_test))\n",
        "    weights, bias = logistic_fit(x_train, y_train)\n",
        "    for threshold in (0.4, 0.5, 0.6):\n",
        "        print_metrics(f\"Logistic regression threshold={threshold}\", y_test, logistic_predict(weights, bias, x_test, threshold))\n",
        "    best_stump = stump_fit(x_train, y_train)\n",
        "    print_metrics(\"Decision stump\", y_test, stump_predict(best_stump, x_test))\n",
        "    print_metrics(\"Ensemble stumps\", y_test, ensemble_stumps_predict(x_train, y_train, x_test))\n",
        "    p_weights, p_bias = perceptron_fit(x_train, y_train)\n",
        "    print_metrics(\"Perceptron linear\", y_test, perceptron_predict(p_weights, p_bias, x_test))\n",
        "\n",
        "    # Regression baseline: predict the mean training demand for every test row.\n",
        "    mean_demand = mean(float(r[\"demand\"]) for r in train)\n",
        "    demand_test = [float(r[\"demand\"]) for r in test]\n",
        "    print(\"\\nRegression baseline\")\n",
        "    print(\"-------------------\")\n",
        "    baseline_mae = mae([mean_demand] * len(test), demand_test)\n",
        "    print(f\"prediksi demand konstan={mean_demand:.2f} MAE={baseline_mae:.3f}\")\n",
        "\n",
        "    print(\"\\nCatatan: implementasi manual ini untuk belajar. Untuk produksi, gunakan library teruji dan validasi lebih ketat.\")\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## 1. Dataset dan split\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Re-seed so that re-running this cell on its own rebuilds exactly the same data and split.\n",
        "random.seed(SEED)\n",
        "rows = make_dataset()\n",
        "train, test = train_test_split(rows)\n",
        "x_train_raw, x_test_raw = [features(r) for r in train], [features(r) for r in test]\n",
        "y_train, y_test = labels(train), labels(test)\n",
        "# Scaling statistics are fitted on the training rows only, then applied to both splits.\n",
        "mus, sigmas = standardize_fit(x_train_raw)\n",
        "x_train = standardize_transform(x_train_raw, mus, sigmas)\n",
        "x_test = standardize_transform(x_test_raw, mus, sigmas)\n",
        "assert len(train) + len(test) == len(rows)\n",
        "print(len(train), len(test), sum(y_train), sum(y_test))\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## 2. Baseline dan kNN\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Reference point: always predict the majority training class.\n",
        "print_metrics(\"Majority baseline\", y_test, majority_predict(y_train, len(test)))\n",
        "# Same kNN classifier evaluated for several neighbourhood sizes.\n",
        "for k in [1,3,5]:\n",
        "    print_metrics(f\"kNN k={k}\", y_test, knn_predict(x_train, y_train, x_test, k))\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## 3. Naive Bayes\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Fit per-class statistics on the standardized training features, then score the test split.\n",
        "nb = gaussian_nb_fit(x_train, y_train)\n",
        "print_metrics(\"Gaussian Naive Bayes\", y_test, gaussian_nb_predict(nb, x_test))\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## 4. Logistic regression dan threshold\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Train once, then reuse the same weights while varying only the decision threshold.\n",
        "w,b = logistic_fit(x_train, y_train)\n",
        "for threshold in [0.4,0.5,0.6]:\n",
        "    print_metrics(f\"Logistic threshold={threshold}\", y_test, logistic_predict(w,b,x_test,threshold))\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## 5. Decision stump dan ensemble\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# The fitted stump is a (feature index, threshold, polarity) tuple.\n",
        "stump = stump_fit(x_train, y_train)\n",
        "print(stump)\n",
        "print_metrics(\"Decision stump\", y_test, stump_predict(stump, x_test))\n",
        "# Vote of several stumps, each fitted on a bootstrap resample of the training rows.\n",
        "print_metrics(\"Ensemble stumps\", y_test, ensemble_stumps_predict(x_train, y_train, x_test))\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## 6. Perceptron linear\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Perceptron trained with its default learning rate and epoch count.\n",
        "pw,pb = perceptron_fit(x_train, y_train)\n",
        "print_metrics(\"Perceptron\", y_test, perceptron_predict(pw,pb,x_test))\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## 7. Regression baseline\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Constant predictor: use the mean training demand for every test row and report its MAE.\n",
        "mean_demand = mean(float(r[\"demand\"]) for r in train)\n",
        "demand_test = [float(r[\"demand\"]) for r in test]\n",
        "print(mean_demand, mae([mean_demand]*len(test), demand_test))\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## 8. Challenge\n",
        "\n",
        "Ubah k, threshold, learning rate, atau noise dataset. Catat perubahan precision, recall, dan F1.\n"
      ]
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "name": "python",
      "pygments_lexer": "ipython3"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 5
}