#!/usr/bin/env python3
"""Bab 8 — Unsupervised learning + data preparation playground.

Kode ini memakai Python standard library agar bisa dijalankan dari terminal,
VS Code, Jupyter, Google Colab, dan Kaggle tanpa instalasi tambahan.

Yang dipraktikkan:
- audit data mentah,
- cleansing: duplikasi, nilai negatif, missing value,
- preprocessing: standardisasi fitur,
- EDA: ringkasan statistik dan plot SVG,
- regresi linear sederhana untuk membaca tren dan residual,
- k-means dari nol,
- inertia, PCA 2D mini, cosine similarity,
- anomaly detection dengan z-score.
"""

from __future__ import annotations

import html
import math
import random
from dataclasses import dataclass
from pathlib import Path
from typing import Iterable, List, Optional, Sequence, Tuple


@dataclass(frozen=True)
class RawCustomer:
    """One customer row as it arrives, before any cleaning.

    Every numeric field may be ``None`` to represent a missing value. The
    dataclass is frozen, so instances are hashable; the audit and cleaning
    steps rely on that to detect exact duplicate rows with a ``set``.
    """

    # Customer display name; not required to be unique.
    name: str
    # Visit count; may be missing or (invalidly) negative in raw data.
    visits: Optional[float]
    # Amount spent; may be missing.
    spend: Optional[float]
    # Coffee-milk share as a percentage; cleaning only accepts 0-100.
    coffee_milk_percent: Optional[float]


@dataclass(frozen=True)
class Customer:
    """A customer row whose three numeric features are all present.

    This is the record type produced by ``clean_customers``; unlike
    ``RawCustomer`` none of the numeric fields is optional.
    """

    # Customer display name.
    name: str
    # Visit count (first feature column in ``as_matrix``).
    visits: float
    # Amount spent (second feature column in ``as_matrix``).
    spend: float
    # Coffee-milk share as a percentage (third feature column in ``as_matrix``).
    coffee_milk_percent: float


# Small demo dataset that deliberately contains the data-quality problems the
# script demonstrates: a negative value, a missing value, an outlier and an
# exact duplicate row.
RAW_CUSTOMERS = [
    RawCustomer("Ayu", 10, 50, 80),
    RawCustomer("Bima", 11, 52, 75),
    RawCustomer("Citra", 2, 15, 10),
    RawCustomer("Dedi", -3, 17, 20),  # invalid: negative visit count
    RawCustomer("Eka", 9, None, 78),  # missing spend
    RawCustomer("Fajar", 4, 18, 25),
    RawCustomer("Gita", 1, 90, 5),  # candidate spend anomaly
    RawCustomer("Ayu", 10, 50, 80),  # exact duplicate of the first row
    RawCustomer("Hani", 12, 58, 82),
]


def mean(values: Iterable[float]) -> float:
    """Return the arithmetic mean of ``values``.

    The iterable is materialized once, so generators are accepted. An empty
    input raises ``ZeroDivisionError``.
    """
    data = tuple(values)
    total = sum(data)
    return total / len(data)


def median(values: Iterable[float]) -> float:
    """Return the median of ``values``.

    For an odd count this is the middle element of the sorted data; for an
    even count it is the average of the two middle elements.
    """
    ordered = sorted(values)
    half, is_odd = divmod(len(ordered), 2)
    if is_odd:
        return ordered[half]
    lower, upper = ordered[half - 1], ordered[half]
    return (lower + upper) / 2


def population_std(values: Iterable[float]) -> float:
    """Return the population standard deviation of ``values``.

    The squared deviations from the mean are divided by ``n`` (not ``n - 1``)
    before taking the square root.
    """
    data = list(values)
    center = mean(data)
    squared_deviations = [(x - center) ** 2 for x in data]
    return math.sqrt(sum(squared_deviations) / len(data))


def audit_raw_data(rows: Sequence[RawCustomer]) -> dict:
    """Summarize data-quality problems in the raw rows without changing them.

    Args:
        rows: Raw customer rows. Rows must be hashable because exact
            duplicates are counted by collapsing the rows into a ``set``.

    Returns:
        A dict with these keys:

        - ``rows``: number of rows.
        - ``missing_cells``: count of ``None`` values across the three numeric
          fields (visits, spend, coffee_milk_percent).
        - ``missing_rate``: ``missing_cells`` divided by the number of numeric
          cells, or ``0.0`` for an empty dataset.
        - ``negative_visits``: rows whose visit count is present and below 0.
        - ``duplicate_rows``: rows that repeat an earlier, identical row.
    """
    # Three numeric fields per row are inspected for missing values.
    total_cells = len(rows) * 3
    missing = sum(
        value is None
        for row in rows
        for value in (row.visits, row.spend, row.coffee_milk_percent)
    )
    negative_visits = sum((row.visits is not None and row.visits < 0) for row in rows)
    duplicate_rows = len(rows) - len(set(rows))
    return {
        "rows": len(rows),
        "missing_cells": missing,
        # An empty dataset has no cells, so avoid dividing by zero.
        "missing_rate": missing / total_cells if total_cells else 0.0,
        "negative_visits": negative_visits,
        "duplicate_rows": duplicate_rows,
    }


def clean_customers(rows: Sequence[RawCustomer]) -> Tuple[List[Customer], List[str]]:
    """Clean raw rows and return the cleaned customers plus an action log.

    Steps, in order:

    1. Drop rows that are identical to an earlier row (the first one is kept).
    2. Compute the median of the known ``spend`` values as the fill value for
       rows whose spend is missing.
    3. Drop rows with a missing visit count or coffee-milk percentage, a
       negative visit count, or a percentage outside 0-100.
    4. Drop rows whose spend is missing when no known spend exists to impute
       from (instead of crashing on the median of an empty list).

    Args:
        rows: Raw customer rows; must be hashable for duplicate detection.

    Returns:
        ``(cleaned, log)`` where ``cleaned`` holds the surviving rows as
        ``Customer`` objects with float features, and ``log`` lists every
        cleaning action in the order it was taken.
    """
    log: List[str] = []
    seen = set()
    deduped: List[RawCustomer] = []
    for row in rows:
        if row in seen:
            log.append(f"hapus duplikat identik: {row.name}")
            continue
        seen.add(row)
        deduped.append(row)

    # NOTE: the median is taken over every de-duplicated row, including rows
    # that the validity checks below will discard.
    valid_spend = [row.spend for row in deduped if row.spend is not None]
    spend_fill: Optional[float] = median(valid_spend) if valid_spend else None
    needs_fill = any(row.spend is None for row in deduped)
    # Only claim an imputation in the log when one can actually happen.
    if needs_fill and spend_fill is not None:
        log.append(f"imputasi spend kosong dengan median={spend_fill:.2f}")

    cleaned: List[Customer] = []
    for row in deduped:
        if row.visits is None or row.coffee_milk_percent is None:
            log.append(f"buang {row.name}: visits/kopi_susu kosong")
            continue
        if row.visits < 0:
            log.append(f"buang {row.name}: kunjungan negatif ({row.visits})")
            continue
        if not (0 <= row.coffee_milk_percent <= 100):
            log.append(f"buang {row.name}: persen kopi susu di luar 0-100")
            continue
        spend = row.spend if row.spend is not None else spend_fill
        if spend is None:
            # No row has a known spend, so there is nothing to impute from.
            log.append(f"buang {row.name}: spend kosong dan tidak ada nilai untuk imputasi")
            continue
        cleaned.append(
            Customer(
                row.name,
                float(row.visits),
                float(spend),
                float(row.coffee_milk_percent),
            )
        )
    return cleaned, log


def as_matrix(customers: Sequence[Customer]) -> List[List[float]]:
    """Convert customers into feature rows ``[visits, spend, coffee_milk_percent]``."""
    rows: List[List[float]] = []
    for person in customers:
        rows.append([person.visits, person.spend, person.coffee_milk_percent])
    return rows


def transpose(matrix: Sequence[Sequence[float]]) -> List[List[float]]:
    """Return the columns of ``matrix`` as a list of lists.

    Rows are combined with ``zip``, so ragged input is truncated to the
    shortest row.
    """
    return list(map(list, zip(*matrix)))


def standardize(matrix: Sequence[Sequence[float]]) -> Tuple[List[List[float]], List[float], List[float]]:
    """Scale every column to zero mean and unit population standard deviation.

    A column whose standard deviation is zero is divided by 1.0 instead, so
    constant features become all zeros rather than raising.

    Returns:
        ``(scaled, means, stds)``: the standardized rows, the per-column
        means, and the per-column divisors that were used.
    """
    means: List[float] = []
    stds: List[float] = []
    for column in transpose(matrix):
        means.append(mean(column))
        stds.append(population_std(column) or 1.0)

    scaled = [
        [(value - center) / spread for value, center, spread in zip(row, means, stds)]
        for row in matrix
    ]
    return scaled, means, stds


def euclidean(a: Sequence[float], b: Sequence[float]) -> float:
    """Return the Euclidean distance between ``a`` and ``b``.

    Coordinates are paired with ``zip``; extra trailing coordinates of the
    longer vector are ignored.
    """
    squared_gaps = [(p - q) ** 2 for p, q in zip(a, b)]
    return math.sqrt(sum(squared_gaps))


def cosine_similarity(a: Sequence[float], b: Sequence[float]) -> float:
    """Return the cosine of the angle between vectors ``a`` and ``b``.

    If either vector has zero length the similarity is undefined; 0.0 is
    returned in that case instead of dividing by zero.
    """
    products = [x * y for x, y in zip(a, b)]
    dot = sum(products)
    length_a = math.sqrt(sum(x * x for x in a))
    length_b = math.sqrt(sum(y * y for y in b))
    if not (length_a and length_b):
        return 0.0
    return dot / (length_a * length_b)


def centroid(rows: Sequence[Sequence[float]]) -> List[float]:
    """Return the component-wise mean of ``rows`` (one value per column)."""
    columns = transpose(rows)
    return list(map(mean, columns))


def kmeans(matrix: Sequence[Sequence[float]], k: int = 2, iterations: int = 12, seed: int = 42):
    """Cluster the rows of ``matrix`` into ``k`` groups with k-means.

    The initial centers are ``k`` distinct rows sampled with a generator
    seeded by ``seed``. Each iteration assigns every row to its nearest
    center (Euclidean distance, ties go to the lowest cluster id) and then
    moves each center to the centroid of its members. A cluster that ends up
    with no members keeps its previous center. Iteration stops early once the
    centers no longer change.

    Args:
        matrix: Feature rows to cluster.
        k: Number of clusters; sampling raises ``ValueError`` if ``k`` exceeds
            the number of rows.
        iterations: Maximum number of assign/update rounds.
        seed: Seed for the initial center selection.

    Returns:
        ``(assignments, centers, history)`` where ``assignments[i]`` is the
        cluster id of row ``i``, ``centers`` are the final centers, and
        ``history`` holds one ``(assignments, centers)`` snapshot per
        iteration that was run.
    """
    # A private generator yields the same draws as seeding the module-level
    # one, but does not reseed the process-wide random state as a side effect.
    rng = random.Random(seed)
    centers = [list(row) for row in rng.sample(list(matrix), k)]
    assignments = [0 for _ in matrix]
    history = []

    for _ in range(iterations):
        # Assignment step: nearest center for every row.
        assignments = []
        for row in matrix:
            distances = [euclidean(row, center) for center in centers]
            assignments.append(min(range(k), key=lambda idx: distances[idx]))

        # Update step: move each center to the centroid of its members.
        new_centers = []
        for cluster_id in range(k):
            members = [row for row, assigned in zip(matrix, assignments) if assigned == cluster_id]
            new_centers.append(centroid(members) if members else centers[cluster_id])
        history.append((assignments[:], [c[:] for c in new_centers]))
        if new_centers == centers:
            break
        centers = new_centers

    return assignments, centers, history


def inertia(matrix: Sequence[Sequence[float]], assignments: Sequence[int], centers: Sequence[Sequence[float]]) -> float:
    """Return the sum of squared distances from each row to its assigned center."""
    squared_distances = [
        euclidean(point, centers[label]) ** 2
        for point, label in zip(matrix, assignments)
    ]
    return sum(squared_distances)


def covariance_2d(points: Sequence[Sequence[float]]) -> Tuple[float, float, float]:
    """Return the population covariance terms of 2-D ``points``.

    Only the first two coordinates of every point are used.

    Returns:
        ``(var_x, cov_xy, var_y)``, each averaged over the number of points.
    """
    xs = [point[0] for point in points]
    ys = [point[1] for point in points]
    x_center = mean(xs)
    y_center = mean(ys)

    dx = [x - x_center for x in xs]
    dy = [y - y_center for y in ys]

    var_x = mean([d ** 2 for d in dx])
    var_y = mean([d ** 2 for d in dy])
    cov_xy = mean([u * v for u, v in zip(dx, dy)])
    return var_x, cov_xy, var_y


def first_principal_component_2d(points: Sequence[Sequence[float]]) -> Tuple[List[float], float]:
    """Return the first principal component of 2-D ``points``.

    The largest eigenvalue of the 2x2 covariance matrix is computed in closed
    form from its trace and determinant. When the covariance term is
    effectively zero the matrix is already diagonal, so the axis with the
    larger variance is returned (the x axis on a tie).

    Returns:
        ``(direction, eigenvalue)`` where ``direction`` is a unit-length
        ``[x, y]`` vector and ``eigenvalue`` is the variance along it.
    """
    var_x, cov_xy, var_y = covariance_2d(points)
    trace = var_x + var_y
    determinant = var_x * var_y - cov_xy * cov_xy
    # Clamp at zero: rounding can push the discriminant slightly negative.
    discriminant = math.sqrt(max(0.0, trace * trace - 4 * determinant))
    largest = (trace + discriminant) / 2

    if abs(cov_xy) > 1e-12:
        vx, vy = cov_xy, largest - var_x
    elif var_x >= var_y:
        vx, vy = 1.0, 0.0
    else:
        vx, vy = 0.0, 1.0

    length = math.sqrt(vx ** 2 + vy ** 2) or 1.0
    return [vx / length, vy / length], largest


def project_2d(points: Sequence[Sequence[float]], component: Sequence[float]) -> List[float]:
    """Project mean-centered 2-D ``points`` onto ``component``.

    Each point must have exactly two coordinates. The result is one scalar
    per point: the dot product of the centered point with ``component``.
    """
    x_center = mean([point[0] for point in points])
    y_center = mean([point[1] for point in points])

    scores: List[float] = []
    for x, y in points:
        scores.append((x - x_center) * component[0] + (y - y_center) * component[1])
    return scores


def z_scores(values: Sequence[float]) -> List[float]:
    """Return the z-score of every value (population standard deviation).

    If all values are equal the standard deviation is zero; 1.0 is used as
    the divisor instead, so every score is 0.0.
    """
    center = mean(values)
    spread = population_std(values)
    if not spread:
        spread = 1.0
    return [(value - center) / spread for value in values]


def linear_regression(xs: Sequence[float], ys: Sequence[float]) -> Tuple[float, float]:
    """Fit ``y = w * x + b`` by ordinary least squares.

    If every ``x`` is identical the denominator is zero; 1.0 is used instead,
    which yields a slope of 0 and an intercept equal to the mean of ``ys``.

    Returns:
        ``(w, b)``: slope and intercept.
    """
    x_center = mean(xs)
    y_center = mean(ys)

    cross_terms = [(x - x_center) * (y - y_center) for x, y in zip(xs, ys)]
    x_squares = [(x - x_center) ** 2 for x in xs]

    spread = sum(x_squares)
    if not spread:
        spread = 1.0

    slope = sum(cross_terms) / spread
    intercept = y_center - slope * x_center
    return slope, intercept


def residuals(xs: Sequence[float], ys: Sequence[float], w: float, b: float) -> List[float]:
    """Return ``actual - predicted`` for every ``(x, y)`` pair under ``y = w * x + b``."""
    errors: List[float] = []
    for x, actual in zip(xs, ys):
        predicted = w * x + b
        errors.append(actual - predicted)
    return errors


def scale(value: float, lo: float, hi: float, out_lo: float, out_hi: float) -> float:
    """Map ``value`` linearly from the range ``[lo, hi]`` to ``[out_lo, out_hi]``.

    A degenerate input range (``lo == hi``) maps everything to the midpoint
    of the output range.
    """
    if lo != hi:
        offset = (value - lo) * (out_hi - out_lo) / (hi - lo)
        return out_lo + offset
    return (out_lo + out_hi) / 2


def write_svg(path: Path, body: str, title: str, subtitle: str) -> None:
    """Write a 720x420 SVG card with a title, a subtitle and ``body`` markup.

    Args:
        path: Destination file; missing parent directories are created.
        body: SVG markup inserted verbatim below the subtitle. The caller is
            responsible for it being well-formed.
        title: Plain text used for the heading and the ``aria-label``.
        subtitle: Plain text shown below the heading.
    """
    # Title and subtitle are plain text, so XML-special characters (&, <, >,
    # quotes) must be escaped to keep the document well-formed.
    safe_title = html.escape(title)
    safe_subtitle = html.escape(subtitle)
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(
        f'''<svg xmlns="http://www.w3.org/2000/svg" width="720" height="420" viewBox="0 0 720 420" role="img" aria-label="{safe_title}">
<rect width="720" height="420" fill="#f8fafc"/>
<rect x="24" y="22" width="672" height="376" rx="22" fill="#ffffff" stroke="#cbd5e1" stroke-width="2"/>
<text x="50" y="58" font-family="Arial" font-size="24" font-weight="700" fill="#0f172a">{safe_title}</text>
<text x="50" y="84" font-family="Arial" font-size="15" fill="#475569">{safe_subtitle}</text>
{body}
</svg>''',
        encoding="utf-8",
    )


def plot_scatter_clusters(customers: Sequence[Customer], assignments: Sequence[int], out: Path) -> None:
    """Write an SVG scatter plot of visits (x) against spend (y), colored by cluster.

    Args:
        customers: Customers to plot; each becomes one labeled circle.
        assignments: Cluster id per customer, paired by position. Ids beyond
            the palette size wrap around.
        out: Destination SVG path.
    """
    xs = [c.visits for c in customers]
    ys = [c.spend for c in customers]
    colors = ["#2563eb", "#16a34a", "#f97316", "#9333ea"]
    parts = ['<path d="M80 340 L650 340 M80 340 L80 105" stroke="#334155"/>']
    if xs:
        # The axis ranges do not change per point, so compute them once
        # instead of rescanning all customers for every circle.
        x_lo, x_hi = min(xs), max(xs)
        y_lo, y_hi = min(ys), max(ys)
        for c, cluster_id in zip(customers, assignments):
            x = scale(c.visits, x_lo, x_hi, 95, 640)
            # The output range is reversed because SVG y grows downward.
            y = scale(c.spend, y_lo, y_hi, 330, 115)
            fill = colors[cluster_id % len(colors)]
            # Names are free text; escape them so "&" or "<" cannot break the SVG.
            label = html.escape(c.name)
            parts.append(f'<circle cx="{x:.1f}" cy="{y:.1f}" r="9" fill="{fill}"/><text x="{x+10:.1f}" y="{y-8:.1f}" font-size="12">{label}</text>')
    parts.append('<text x="310" y="380" font-size="14">kunjungan</text><text x="26" y="230" font-size="14" transform="rotate(-90 26,230)">belanja</text>')
    write_svg(out, "\n".join(parts), "Scatter Cluster Pelanggan", "Setiap titik adalah pelanggan; warna adalah cluster k-means")


def plot_histogram(values: Sequence[float], out: Path) -> None:
    """Write an SVG histogram of ``values`` using five equal-width bins.

    The largest value is clamped into the last bin. If all values are equal
    the bin width falls back to 1, so everything lands in the first bin.
    ``values`` must not be empty.
    """
    bin_count = 5
    lowest = min(values)
    highest = max(values)
    bin_width = (highest - lowest) / bin_count or 1

    tallies = [0] * bin_count
    for value in values:
        position = int((value - lowest) / bin_width)
        tallies[min(bin_count - 1, position)] += 1
    tallest = max(tallies) or 1

    parts = ['<path d="M80 340 L650 340 M80 340 L80 105" stroke="#334155"/>']
    for index, tally in enumerate(tallies):
        bar_height = scale(tally, 0, tallest, 0, 210)
        left = 110 + index * 95
        # Bars grow upward from the x axis at y=340.
        top = 340 - bar_height
        parts.append(f'<rect x="{left}" y="{top:.1f}" width="65" height="{bar_height:.1f}" fill="#60a5fa"/><text x="{left+20}" y="{top-8:.1f}" font-size="12">{tally}</text>')
    parts.append('<text x="300" y="380" font-size="14">rentang belanja</text>')

    write_svg(out, "\n".join(parts), "Histogram Belanja", "Melihat distribusi satu fitur sebelum model")


def plot_regression(customers: Sequence[Customer], w: float, b: float, out: Path) -> None:
    """Write an SVG of spend against visits with the fitted line and residuals.

    Args:
        customers: Customers to plot; must not be empty (the axis range is
            taken with ``min``/``max``).
        w: Slope of the fitted line ``spend = w * visits + b``.
        b: Intercept of the fitted line.
        out: Destination SVG path.
    """
    xs = [c.visits for c in customers]
    ys = [c.spend for c in customers]
    x_min, x_max = min(xs), max(xs)
    # Include the line's endpoints in the y range so the trend line always
    # stays inside the plot area.
    y_values = ys + [w * x_min + b, w * x_max + b]
    # The y range is the same for every point; compute it once.
    y_min, y_max = min(y_values), max(y_values)
    parts = ['<path d="M80 340 L650 340 M80 340 L80 105" stroke="#334155"/>']
    x1 = scale(x_min, x_min, x_max, 95, 640)
    y1 = scale(w * x_min + b, y_min, y_max, 330, 115)
    x2 = scale(x_max, x_min, x_max, 95, 640)
    y2 = scale(w * x_max + b, y_min, y_max, 330, 115)
    parts.append(f'<path d="M{x1:.1f} {y1:.1f} L{x2:.1f} {y2:.1f}" stroke="#ef4444" stroke-width="4"/>')
    for c in customers:
        x = scale(c.visits, x_min, x_max, 95, 640)
        y = scale(c.spend, y_min, y_max, 330, 115)
        # Predicted spend for this customer; the dashed segment from the
        # point to this height visualizes the residual.
        yhat = scale(w * c.visits + b, y_min, y_max, 330, 115)
        # Names are free text; escape them so "&" or "<" cannot break the SVG.
        label = html.escape(c.name)
        parts.append(f'<circle cx="{x:.1f}" cy="{y:.1f}" r="8" fill="#2563eb"/><path d="M{x:.1f} {y:.1f} L{x:.1f} {yhat:.1f}" stroke="#f97316" stroke-dasharray="4 4"/><text x="{x+9:.1f}" y="{y-6:.1f}" font-size="12">{label}</text>')
    write_svg(out, "\n".join(parts), "Regresi Linear dan Residual", "Garis merah adalah tren; garis putus-putus adalah residual")


def plot_anomaly_z(customers: Sequence[Customer], z_values: Sequence[float], out: Path) -> None:
    """Write an SVG strip of per-customer z-scores, highlighting large ones.

    Points are placed left to right in input order at a fixed 70-unit
    spacing. A point is drawn red when ``|z| >= 1.5`` and green otherwise.

    Args:
        customers: Customers, paired by position with ``z_values``.
        z_values: One z-score per customer.
        out: Destination SVG path.
    """
    parts = ['<path d="M80 340 L650 340 M80 340 L80 105" stroke="#334155"/>']
    if len(z_values):
        # The vertical range is the same for every point; compute it once.
        z_min, z_max = min(z_values), max(z_values)
        for i, (c, z) in enumerate(zip(customers, z_values)):
            x = 110 + i * 70
            y = scale(z, z_min, z_max, 330, 120)
            color = "#ef4444" if abs(z) >= 1.5 else "#10b981"
            # Names are free text; escape them so "&" or "<" cannot break the SVG.
            label = html.escape(c.name)
            parts.append(f'<circle cx="{x}" cy="{y:.1f}" r="9" fill="{color}"/><text x="{x-14}" y="360" font-size="12">{label}</text><text x="{x-12}" y="{y-12:.1f}" font-size="12">{z:.1f}</text>')
    parts.append('<text x="92" y="113" font-size="13" fill="#ef4444">|z| besar = perlu diperiksa</text>')
    write_svg(out, "\n".join(parts), "Z-score Anomali Belanja", "Titik merah bukan vonis; hanya sinyal investigasi")


def print_table(customers: Sequence[Customer], assignments: Sequence[int], z_spend: Sequence[float], res: Sequence[float]) -> None:
    """Print one aligned row per customer with features, cluster, z-score and residual.

    The four sequences are paired by position; output stops at the shortest.
    """
    header = "Nama     kunjungan belanja kopi_susu% cluster z_belanja residual"
    rule = "-" * 74
    print(header)
    print(rule)
    for person, label, z_value, error in zip(customers, assignments, z_spend, res):
        cells = (
            f"{person.name:<8}",
            f"{person.visits:>8.0f}",
            f"{person.spend:>7.0f}",
            f"{person.coffee_milk_percent:>10.0f}",
            f"{label:>7}",
            f"{z_value:>9.2f}",
            f"{error:>8.2f}",
        )
        print(" ".join(cells))


def main() -> None:
    """Run the whole demo: audit, clean, model, print results and write SVG plots.

    The plots are written to an ``outputs`` directory next to this script, or
    under the current working directory when ``__file__`` is not defined.
    """

    def rounded(numbers, digits):
        # Shorten float lists for display only; computations use full precision.
        return [round(number, digits) for number in numbers]

    print("=== Bab 8: Unsupervised + Data Preparation Playground ===")
    quality_report = audit_raw_data(RAW_CUSTOMERS)
    print("\nAudit data mentah:", quality_report)

    customers, actions = clean_customers(RAW_CUSTOMERS)
    print("\nLog cleansing:")
    for action in actions:
        print("-", action)

    features = as_matrix(customers)
    scaled, feature_means, feature_stds = standardize(features)
    print("\nFitur: visits, spend, coffee_milk_percent")
    print("Mean fitur:", rounded(feature_means, 2))
    print("Std fitur:", rounded(feature_stds, 2))

    # Unscaled distances between the first three cleaned customers, using
    # only visits and spend.
    first_xy = [customers[0].visits, customers[0].spend]
    second_xy = [customers[1].visits, customers[1].spend]
    third_xy = [customers[2].visits, customers[2].spend]
    distance_first_second = euclidean(first_xy, second_xy)
    distance_first_third = euclidean(first_xy, third_xy)
    print("\nJarak mentah Ayu-Bima:", round(distance_first_second, 2))
    print("Jarak mentah Ayu-Citra:", round(distance_first_third, 2))

    visit_counts = [customer.visits for customer in customers]
    spend_values = [customer.spend for customer in customers]
    slope, intercept = linear_regression(visit_counts, spend_values)
    fit_errors = residuals(visit_counts, spend_values, slope, intercept)
    print("\nRegresi linear insight: spend_hat = w*visits + b")
    print("w:", round(slope, 3), "b:", round(intercept, 3))
    print("Residual:", rounded(fit_errors, 2))

    labels, centers, _history = kmeans(scaled, k=2, iterations=12, seed=7)
    cluster_inertia = inertia(scaled, labels, centers)
    spend_z = z_scores(spend_values)
    print("\nK-means assignments:", labels)
    print("Inertia:", round(cluster_inertia, 3))
    print("\nTabel hasil:")
    print_table(customers, labels, spend_z, fit_errors)

    # Mini PCA on the first two scaled features: visits and spend.
    plane = [[row[0], row[1]] for row in scaled]
    direction, eigenvalue = first_principal_component_2d(plane)
    scores = project_2d(plane, direction)
    print("\nPCA 2D mini")
    print("Komponen utama pertama:", rounded(direction, 3))
    print("Eigenvalue:", round(eigenvalue, 3))
    print("Proyeksi:", rounded(scores, 3))

    print("\nCosine similarity [1,1] vs [2,2]:", round(cosine_similarity([1, 1], [2, 2]), 3))
    print("Cosine similarity [1,0] vs [0,1]:", round(cosine_similarity([1, 0], [0, 1]), 3))

    # __file__ is not defined in every environment, so fall back to the
    # current working directory when it is missing.
    if "__file__" in globals():
        base_dir = Path(__file__).resolve().parent
    else:
        base_dir = Path.cwd()
    output_dir = base_dir / "outputs"
    plot_scatter_clusters(customers, labels, output_dir / "scatter_clusters.svg")
    plot_histogram(spend_values, output_dir / "histogram_spend.svg")
    plot_regression(customers, slope, intercept, output_dir / "linear_regression_residuals.svg")
    plot_anomaly_z(customers, spend_z, output_dir / "anomaly_zscore.svg")
    print("\nPlot SVG dibuat di:", output_dir)

    print("\nInterpretasi aman:")
    for note in (
        "- Cleansing adalah keputusan analitis; catat lognya.",
        "- Cluster adalah alat eksplorasi, bukan label kebenaran.",
        "- Residual besar dan |z| besar adalah sinyal investigasi, bukan vonis.",
        "- Nama cluster harus netral, misalnya 'rutin bernilai sedang'.",
    ):
        print(note)


# Run the demo only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()
