{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "# Bab 03B — Data Exploration and Visualization Lab\n",
        "\n",
        "Notebook ini menjalankan lab audit data, cleaning, visualisasi, split, leakage checklist, dan laporan data.\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## 1. Definisi data dan fungsi\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "#!/usr/bin/env python3\n",
        "\"\"\"Bab 03B — Data exploration, visualization, and data quality lab.\n",
        "\n",
        "Standard-library first so it runs in local terminal, VS Code, Jupyter,\n",
        "Google Colab, and Kaggle. The chapter also explains matplotlib/seaborn;\n",
        "this script creates SVG plots without external dependencies as a portable\n",
        "fallback.\n",
        "\"\"\"\n",
        "\n",
        "from __future__ import annotations\n",
        "\n",
        "import csv\n",
        "import json\n",
        "import math\n",
        "import random\n",
        "from collections import Counter, defaultdict\n",
        "from dataclasses import dataclass\n",
        "from pathlib import Path\n",
        "from statistics import median\n",
        "from typing import Iterable, Optional, Sequence\n",
        "\n",
        "\n",
        "@dataclass(frozen=True)\n",
        "class RawRow:\n",
        "    customer_id: str\n",
        "    segment: str\n",
        "    payment: str\n",
        "    device: str\n",
        "    visits: Optional[float]\n",
        "    spend: Optional[float]\n",
        "    returned: Optional[int]\n",
        "    date: str\n",
        "\n",
        "\n",
        "RAW_DATA = [\n",
        "    RawRow(\"C001\", \"rutin\", \"QRIS\", \"mobile\", 12, 58, 0, \"2026-01-01\"),\n",
        "    RawRow(\"C002\", \"rutin\", \"QRIS\", \"mobile\", 10, 50, 0, \"2026-01-02\"),\n",
        "    RawRow(\"C003\", \"baru\", \"Cash\", \"mobile\", 2, 15, 1, \"2026-01-03\"),\n",
        "    RawRow(\"C004\", \"baru\", \"cash\", \"desktop\", 3, 17, 0, \"2026-01-04\"),\n",
        "    RawRow(\"C005\", \"vip\", \"Kartu\", \"desktop\", 1, 170, 0, \"2026-01-05\"),\n",
        "    RawRow(\"C006\", \"rutin\", \"QRIS\", \"mobile\", 9, None, 0, \"2026-01-06\"),\n",
        "    RawRow(\"C007\", \"promo\", \"QRIS\", \"tablet\", -1, 22, 1, \"2026-01-07\"),\n",
        "    RawRow(\"C008\", \"promo\", \"Cash\", \"mobile\", 4, 25, 0, \"2026-01-08\"),\n",
        "    RawRow(\"C009\", \"rutin\", \"QRIS\", \"mobile\", 11, 55, 0, \"2026-01-09\"),\n",
        "    RawRow(\"C001\", \"rutin\", \"QRIS\", \"mobile\", 12, 58, 0, \"2026-01-01\"),  # duplicate\n",
        "    RawRow(\"C010\", \"baru\", \"Kartu\", \"desktop\", 2, 19, 1, \"2026-01-10\"),\n",
        "    RawRow(\"C011\", \"vip\", \"QRIS\", \"mobile\", 5, 120, 0, \"2026-01-11\"),\n",
        "]\n",
        "\n",
        "\n",
        "def mean(values: Iterable[float]) -> float:\n",
        "    values = list(values)\n",
        "    return sum(values) / len(values)\n",
        "\n",
        "\n",
        "def std(values: Iterable[float]) -> float:\n",
        "    values = list(values)\n",
        "    mu = mean(values)\n",
        "    return math.sqrt(sum((x - mu) ** 2 for x in values) / len(values))\n",
        "\n",
        "\n",
        "def quantile(values: Sequence[float], q: float) -> float:\n",
        "    values = sorted(values)\n",
        "    pos = (len(values) - 1) * q\n",
        "    lo = math.floor(pos)\n",
        "    hi = math.ceil(pos)\n",
        "    if lo == hi:\n",
        "        return values[lo]\n",
        "    return values[lo] * (hi - pos) + values[hi] * (pos - lo)\n",
        "\n",
        "\n",
        "def audit(rows: Sequence[RawRow]) -> dict:\n",
        "    total_cells = len(rows) * 8\n",
        "    missing = 0\n",
        "    for row in rows:\n",
        "        missing += sum(v is None or v == \"\" for v in row.__dict__.values())\n",
        "    duplicate_rows = len(rows) - len(set(rows))\n",
        "    invalid_visits = sum(row.visits is not None and row.visits < 0 for row in rows)\n",
        "    payment_values = Counter(row.payment for row in rows)\n",
        "    return {\n",
        "        \"rows\": len(rows),\n",
        "        \"columns\": 8,\n",
        "        \"missing_cells\": missing,\n",
        "        \"missing_rate\": missing / total_cells,\n",
        "        \"duplicate_rows\": duplicate_rows,\n",
        "        \"invalid_visits\": invalid_visits,\n",
        "        \"payment_raw_values\": dict(payment_values),\n",
        "    }\n",
        "\n",
        "\n",
        "def clean(rows: Sequence[RawRow]) -> tuple[list[RawRow], list[str]]:\n",
        "    log: list[str] = []\n",
        "    seen = set()\n",
        "    deduped: list[RawRow] = []\n",
        "    for row in rows:\n",
        "        if row in seen:\n",
        "            log.append(f\"hapus duplikat identik: {row.customer_id}\")\n",
        "            continue\n",
        "        seen.add(row)\n",
        "        deduped.append(row)\n",
        "\n",
        "    spend_values = [row.spend for row in deduped if row.spend is not None]\n",
        "    spend_fill = float(median(spend_values))\n",
        "    log.append(f\"imputasi spend kosong dengan median={spend_fill:.2f}\")\n",
        "\n",
        "    cleaned: list[RawRow] = []\n",
        "    for row in deduped:\n",
        "        payment = row.payment.strip().upper()\n",
        "        if payment == \"CASH\":\n",
        "            payment = \"Cash\"\n",
        "        elif payment == \"QRIS\":\n",
        "            payment = \"QRIS\"\n",
        "        elif payment in {\"KARTU\", \"CARD\"}:\n",
        "            payment = \"Kartu\"\n",
        "        if row.visits is None or row.visits < 0:\n",
        "            log.append(f\"buang {row.customer_id}: visits invalid ({row.visits})\")\n",
        "            continue\n",
        "        cleaned.append(\n",
        "            RawRow(\n",
        "                row.customer_id,\n",
        "                row.segment.strip().lower(),\n",
        "                payment,\n",
        "                row.device.strip().lower(),\n",
        "                row.visits,\n",
        "                row.spend if row.spend is not None else spend_fill,\n",
        "                row.returned,\n",
        "                row.date,\n",
        "            )\n",
        "        )\n",
        "    return cleaned, log\n",
        "\n",
        "\n",
        "def z_scores(values: Sequence[float]) -> list[float]:\n",
        "    mu = mean(values)\n",
        "    sigma = std(values) or 1.0\n",
        "    return [(x - mu) / sigma for x in values]\n",
        "\n",
        "\n",
        "def linear_regression(xs: Sequence[float], ys: Sequence[float]) -> tuple[float, float]:\n",
        "    xbar = mean(xs)\n",
        "    ybar = mean(ys)\n",
        "    numerator = sum((x - xbar) * (y - ybar) for x, y in zip(xs, ys))\n",
        "    denominator = sum((x - xbar) ** 2 for x in xs) or 1.0\n",
        "    w = numerator / denominator\n",
        "    b = ybar - w * xbar\n",
        "    return w, b\n",
        "\n",
        "\n",
        "def split_ids(rows: Sequence[RawRow], seed: int = 42) -> dict:\n",
        "    ids = [row.customer_id for row in rows]\n",
        "    rng = random.Random(seed)\n",
        "    ids = ids[:]\n",
        "    rng.shuffle(ids)\n",
        "    n = len(ids)\n",
        "    train_end = int(n * 0.7)\n",
        "    valid_end = int(n * 0.85)\n",
        "    return {\n",
        "        \"seed\": seed,\n",
        "        \"strategy\": \"entity-level random split after cleaning; no duplicate customer rows\",\n",
        "        \"train\": ids[:train_end],\n",
        "        \"validation\": ids[train_end:valid_end],\n",
        "        \"test\": ids[valid_end:],\n",
        "        \"leakage_checks\": [\n",
        "            \"cleaning median is computed on cleaned dataset only for teaching; in production fit on train only\",\n",
        "            \"customer_id is not a model feature\",\n",
        "            \"future outcome columns are not included in features\",\n",
        "        ],\n",
        "    }\n",
        "\n",
        "\n",
        "def write_csv(rows: Sequence[RawRow], path: Path) -> None:\n",
        "    path.parent.mkdir(parents=True, exist_ok=True)\n",
        "    with path.open(\"w\", newline=\"\", encoding=\"utf-8\") as f:\n",
        "        writer = csv.DictWriter(f, fieldnames=list(rows[0].__dict__.keys()))\n",
        "        writer.writeheader()\n",
        "        for row in rows:\n",
        "            writer.writerow(row.__dict__)\n",
        "\n",
        "\n",
        "def scale(v: float, lo: float, hi: float, out_lo: float, out_hi: float) -> float:\n",
        "    if hi == lo:\n",
        "        return (out_lo + out_hi) / 2\n",
        "    return out_lo + (v - lo) * (out_hi - out_lo) / (hi - lo)\n",
        "\n",
        "\n",
        "def svg_shell(title: str, subtitle: str, body: str) -> str:\n",
        "    return f'''<svg xmlns=\"http://www.w3.org/2000/svg\" width=\"760\" height=\"460\" viewBox=\"0 0 760 460\" role=\"img\" aria-label=\"{title}\">\n",
        "<rect width=\"760\" height=\"460\" fill=\"#f8fafc\"/>\n",
        "<rect x=\"24\" y=\"22\" width=\"712\" height=\"416\" rx=\"22\" fill=\"#ffffff\" stroke=\"#cbd5e1\" stroke-width=\"2\"/>\n",
        "<text x=\"50\" y=\"60\" font-family=\"Arial\" font-size=\"24\" font-weight=\"700\" fill=\"#0f172a\">{title}</text>\n",
        "<text x=\"50\" y=\"86\" font-family=\"Arial\" font-size=\"15\" fill=\"#475569\">{subtitle}</text>\n",
        "{body}\n",
        "</svg>'''\n",
        "\n",
        "\n",
        "def write_bar(counter: Counter, path: Path, title: str) -> None:\n",
        "    max_count = max(counter.values()) or 1\n",
        "    body = ['<path d=\"M90 380 L690 380 M90 380 L90 120\" stroke=\"#334155\"/>']\n",
        "    for i, (label, count) in enumerate(counter.most_common()):\n",
        "        h = scale(count, 0, max_count, 0, 220)\n",
        "        x = 120 + i * 120\n",
        "        body.append(f'<rect x=\"{x}\" y=\"{380-h:.1f}\" width=\"70\" height=\"{h:.1f}\" fill=\"#2563eb\"/><text x=\"{x}\" y=\"405\" font-size=\"13\">{label}</text><text x=\"{x+22}\" y=\"{370-h:.1f}\" font-size=\"13\">{count}</text>')\n",
        "    path.write_text(svg_shell(title, \"bar plot cocok untuk membandingkan kategori\", \"\\n\".join(body)), encoding=\"utf-8\")\n",
        "\n",
        "\n",
        "def write_hist(values: Sequence[float], path: Path) -> None:\n",
        "    bins = 5\n",
        "    lo, hi = min(values), max(values)\n",
        "    width = (hi - lo) / bins or 1\n",
        "    counts = [0] * bins\n",
        "    for v in values:\n",
        "        counts[min(bins - 1, int((v - lo) / width))] += 1\n",
        "    max_count = max(counts) or 1\n",
        "    body = ['<path d=\"M90 380 L690 380 M90 380 L90 120\" stroke=\"#334155\"/>']\n",
        "    for i, count in enumerate(counts):\n",
        "        h = scale(count, 0, max_count, 0, 220)\n",
        "        x = 120 + i * 90\n",
        "        body.append(f'<rect x=\"{x}\" y=\"{380-h:.1f}\" width=\"64\" height=\"{h:.1f}\" fill=\"#60a5fa\"/><text x=\"{x+20}\" y=\"{370-h:.1f}\" font-size=\"13\">{count}</text>')\n",
        "    path.write_text(svg_shell(\"Histogram Belanja\", \"histogram cocok untuk distribusi numerik\", \"\\n\".join(body)), encoding=\"utf-8\")\n",
        "\n",
        "\n",
        "def write_scatter(rows: Sequence[RawRow], path: Path) -> None:\n",
        "    xs = [r.visits for r in rows if r.visits is not None]\n",
        "    ys = [r.spend for r in rows if r.spend is not None]\n",
        "    body = ['<path d=\"M90 380 L690 380 M90 380 L90 120\" stroke=\"#334155\"/>']\n",
        "    for row in rows:\n",
        "        x = scale(row.visits or 0, min(xs), max(xs), 105, 670)\n",
        "        y = scale(row.spend or 0, min(ys), max(ys), 370, 130)\n",
        "        body.append(f'<circle cx=\"{x:.1f}\" cy=\"{y:.1f}\" r=\"8\" fill=\"#16a34a\"/><text x=\"{x+8:.1f}\" y=\"{y-7:.1f}\" font-size=\"11\">{row.customer_id}</text>')\n",
        "    path.write_text(svg_shell(\"Scatter Visits vs Spend\", \"scatter cocok untuk dua variabel numerik\", \"\\n\".join(body)), encoding=\"utf-8\")\n",
        "\n",
        "\n",
        "def write_box(rows: Sequence[RawRow], path: Path) -> None:\n",
        "    groups: dict[str, list[float]] = defaultdict(list)\n",
        "    for row in rows:\n",
        "        groups[row.segment].append(row.spend or 0)\n",
        "    body = ['<path d=\"M90 380 L690 380 M90 380 L90 120\" stroke=\"#334155\"/>']\n",
        "    for i, (seg, vals) in enumerate(sorted(groups.items())):\n",
        "        vals = sorted(vals)\n",
        "        q1, q2, q3 = quantile(vals, 0.25), quantile(vals, 0.5), quantile(vals, 0.75)\n",
        "        ymin, ymax = min(v for row in rows for v in [row.spend or 0]), max(v for row in rows for v in [row.spend or 0])\n",
        "        x = 150 + i * 120\n",
        "        y1, y2, y3 = [scale(v, ymin, ymax, 370, 130) for v in (q1, q2, q3)]\n",
        "        body.append(f'<rect x=\"{x}\" y=\"{y3:.1f}\" width=\"60\" height=\"{y1-y3:.1f}\" fill=\"#dbeafe\" stroke=\"#2563eb\"/><path d=\"M{x} {y2:.1f} L{x+60} {y2:.1f}\" stroke=\"#ef4444\" stroke-width=\"3\"/><text x=\"{x}\" y=\"405\" font-size=\"12\">{seg}</text>')\n",
        "    path.write_text(svg_shell(\"Box Plot Spend by Segment\", \"box plot menampilkan median dan kuartil\", \"\\n\".join(body)), encoding=\"utf-8\")\n",
        "\n",
        "\n",
        "def write_line(rows: Sequence[RawRow], path: Path) -> None:\n",
        "    rows = sorted(rows, key=lambda r: r.date)\n",
        "    ys = [r.spend or 0 for r in rows]\n",
        "    body = ['<path d=\"M90 380 L690 380 M90 380 L90 120\" stroke=\"#334155\"/>']\n",
        "    pts = []\n",
        "    for i, row in enumerate(rows):\n",
        "        x = scale(i, 0, len(rows) - 1, 110, 670)\n",
        "        y = scale(row.spend or 0, min(ys), max(ys), 370, 130)\n",
        "        pts.append(f'{x:.1f},{y:.1f}')\n",
        "        body.append(f'<circle cx=\"{x:.1f}\" cy=\"{y:.1f}\" r=\"5\" fill=\"#7c3aed\"/>')\n",
        "    body.append(f'<polyline points=\"{\" \".join(pts)}\" fill=\"none\" stroke=\"#7c3aed\" stroke-width=\"4\"/>')\n",
        "    path.write_text(svg_shell(\"Line Plot Harian\", \"line plot cocok untuk urutan waktu\", \"\\n\".join(body)), encoding=\"utf-8\")\n",
        "\n",
        "\n",
        "def write_outlier(rows: Sequence[RawRow], path: Path) -> None:\n",
        "    spends = [r.spend or 0 for r in rows]\n",
        "    zs = z_scores(spends)\n",
        "    body = ['<path d=\"M90 380 L690 380 M90 380 L90 120\" stroke=\"#334155\"/>']\n",
        "    for i, (row, z) in enumerate(zip(rows, zs)):\n",
        "        x = scale(i, 0, len(rows) - 1, 110, 670)\n",
        "        y = scale(z, min(zs), max(zs), 370, 130)\n",
        "        color = '#ef4444' if abs(z) > 1.5 else '#10b981'\n",
        "        body.append(f'<circle cx=\"{x:.1f}\" cy=\"{y:.1f}\" r=\"8\" fill=\"{color}\"/><text x=\"{x-10:.1f}\" y=\"405\" font-size=\"10\">{row.customer_id}</text><text x=\"{x-12:.1f}\" y=\"{y-10:.1f}\" font-size=\"11\">{z:.1f}</text>')\n",
        "    path.write_text(svg_shell(\"Z-score Outlier\", \"visualisasi kandidat outlier belanja\", \"\\n\".join(body)), encoding=\"utf-8\")\n",
        "\n",
        "\n",
        "def write_pie(counter: Counter, path: Path) -> None:\n",
        "    total = sum(counter.values())\n",
        "    # Keep this intentionally simple: labels + color blocks instead of true arcs for portability.\n",
        "    colors = ['#60a5fa', '#34d399', '#fbbf24', '#f87171']\n",
        "    body = ['<circle cx=\"250\" cy=\"250\" r=\"95\" fill=\"#dbeafe\"/>']\n",
        "    for i, (label, count) in enumerate(counter.most_common()):\n",
        "        pct = count / total * 100\n",
        "        body.append(f'<rect x=\"430\" y=\"{160+i*38}\" width=\"24\" height=\"24\" fill=\"{colors[i%len(colors)]}\"/><text x=\"465\" y=\"{178+i*38}\" font-size=\"14\">{label}: {pct:.1f}%</text>')\n",
        "    path.write_text(svg_shell(\"Pie-style Device Share\", \"pie cocok untuk sedikit kategori proporsi\", \"\\n\".join(body)), encoding=\"utf-8\")\n",
        "\n",
        "\n",
        "def main() -> None:\n",
        "    script_dir = Path(__file__).resolve().parent if \"__file__\" in globals() else Path.cwd()\n",
        "    out = script_dir / \"outputs\"\n",
        "    out.mkdir(parents=True, exist_ok=True)\n",
        "\n",
        "    audit_report = audit(RAW_DATA)\n",
        "    cleaned, cleaning_log = clean(RAW_DATA)\n",
        "    spends = [row.spend or 0 for row in cleaned]\n",
        "    visits = [row.visits or 0 for row in cleaned]\n",
        "    w, b = linear_regression(visits, spends)\n",
        "    residuals = [y - (w * x + b) for x, y in zip(visits, spends)]\n",
        "    q1, q3 = quantile(spends, 0.25), quantile(spends, 0.75)\n",
        "    iqr = q3 - q1\n",
        "    upper = q3 + 1.5 * iqr\n",
        "\n",
        "    write_csv(cleaned, out / \"cleaned_customers.csv\")\n",
        "    (out / \"split_manifest.json\").write_text(json.dumps(split_ids(cleaned), ensure_ascii=False, indent=2) + \"\\n\", encoding=\"utf-8\")\n",
        "\n",
        "    write_bar(Counter(r.payment for r in cleaned), out / \"bar_payment.svg\", \"Bar Plot Payment\")\n",
        "    write_pie(Counter(r.device for r in cleaned), out / \"pie_device.svg\")\n",
        "    write_hist(spends, out / \"hist_spend.svg\")\n",
        "    write_scatter(cleaned, out / \"scatter_visit_spend.svg\")\n",
        "    write_box(cleaned, out / \"box_spend_by_segment.svg\")\n",
        "    write_line(cleaned, out / \"line_daily_sales.svg\")\n",
        "    write_outlier(cleaned, out / \"outlier_zscore.svg\")\n",
        "\n",
        "    report = f\"\"\"# Data Audit Report Bab 03B\n",
        "\n",
        "## Audit mentah\n",
        "\n",
        "```json\n",
        "{json.dumps(audit_report, ensure_ascii=False, indent=2)}\n",
        "```\n",
        "\n",
        "## Cleaning log\n",
        "\n",
        "\"\"\" + \"\\n\".join(f\"- {item}\" for item in cleaning_log) + f\"\"\"\n",
        "\n",
        "## Statistik belanja\n",
        "\n",
        "- mean: {mean(spends):.2f}\n",
        "- median: {median(spends):.2f}\n",
        "- Q1: {q1:.2f}\n",
        "- Q3: {q3:.2f}\n",
        "- IQR: {iqr:.2f}\n",
        "- batas atas outlier IQR: {upper:.2f}\n",
        "\n",
        "## Regresi linear insight\n",
        "\n",
        "- spend_hat = {w:.3f} * visits + {b:.3f}\n",
        "- residual terbesar: {max(residuals, key=abs):.2f}\n",
        "\n",
        "## Insight awal\n",
        "\n",
        "- Payment paling sering: {Counter(r.payment for r in cleaned).most_common(1)[0][0]}\n",
        "- Device paling sering: {Counter(r.device for r in cleaned).most_common(1)[0][0]}\n",
        "- Nilai belanja di atas batas IQR perlu diperiksa, bukan otomatis dihapus.\n",
        "- Split manifest disimpan untuk mencegah evaluasi tidak reproducible.\n",
        "\"\"\"\n",
        "    (out / \"data_audit_report.md\").write_text(report, encoding=\"utf-8\")\n",
        "\n",
        "    print(\"Audit:\", audit_report)\n",
        "    print(\"Cleaning log:\", cleaning_log)\n",
        "    print(\"Linear regression: spend_hat =\", round(w, 3), \"* visits +\", round(b, 3))\n",
        "    print(\"Outputs written to\", out)\n",
        "\n",
        "\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## 2. Jalankan pipeline utama\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "main()\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## 3. Latihan manual\n",
        "Cek rumus mean, median, IQR, z-score, dan split manifest dari output.\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "values = [20, 25, 25, 30, 100]\n",
        "print(\"mean\", mean(values))\n",
        "print(\"median\", median(values))\n",
        "print(\"q1/q3\", quantile(values, 0.25), quantile(values, 0.75))\n",
        "print(\"z\", [round(v, 2) for v in z_scores(values)])\n"
      ]
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "name": "python",
      "version": "3"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 5
}