#!/usr/bin/env python3
"""
fix04 -- GMM Bootstrap with 1000 Resamples + Cross-Sectional Comparison

Reproduces the core GMM/EM logic from b1_gmm_model_comparison.py with:
- Bootstrap target: 1000 resamples (falls back to however many complete
  within the runtime budget)
- Convergence diagnostics: CI width at B/2 vs B resamples
  (250 vs 500, or 500 vs 1000 if the full target is reached)
- Cross-sectional analysis (latest obs per country, N~91) vs pooled (N=1656)
- BIC-optimal K comparison between the two samples

Uses only the stdlib: csv, math, random, statistics, time, collections.

Performance notes:
- Pure-Python EM (CPython 3.7) on N=1656 takes ~1.5 s per resample per K,
  so 1000 resamples of the best K would need ~25 min.  We attempt 1000
  within an 8-minute budget (TIME_BUDGET = 480 s) and report whatever
  count is achieved.
"""
import csv
import math
import random
import statistics
import time
from collections import defaultdict

random.seed(42)

SQRT_2PI = math.sqrt(2 * math.pi)
LOG = math.log   # local aliases avoid repeated attribute lookups in the hot loop
EXP = math.exp

# ============================================================
# 1. Load data
# ============================================================
data = []
with open('/tmp/pt-data/political-topology-flat.csv', 'r') as f:
    reader = csv.DictReader(f)
    for row in reader:
        data.append(row)

liberty_pooled = [float(row['liberty']) for row in data]
N_pooled = len(liberty_pooled)

# Cross-sectional sample: the most recent observation per country
countries = defaultdict(list)
for row in data:
    countries[row['country']].append(row)

liberty_cross = []
for c, rows in countries.items():
    rows.sort(key=lambda r: int(r['year']))
    liberty_cross.append(float(rows[-1]['liberty']))
N_cross = len(liberty_cross)

print(f"Pooled N = {N_pooled}", flush=True)
print(f"Cross-sectional N = {N_cross}", flush=True)

# ============================================================
# 2. Optimized EM Algorithm
# ============================================================
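# The function below implements the standard EM updates for a 1-D Gaussian
# mixture; this note is only a reference sketch of those updates, not an
# extra feature of the script.  With responsibilities
#     r_ik = w_k * N(x_i; mu_k, sigma_k) / sum_j w_j * N(x_i; mu_j, sigma_j)
# and N_k = sum_i r_ik, the M-step sets
#     w_k       = N_k / N
#     mu_k      = (sum_i r_ik * x_i) / N_k
#     sigma_k^2 = (sum_i r_ik * x_i^2) / N_k - mu_k^2
# which is exactly what the accumulated sums Nk, sum_x and sum_x2 compute.
# Iteration stops once the log-likelihood changes by less than `tol`.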
def em_gmm(X, K, max_iter=200, tol=1e-6, init_means=None):
    """Optimized EM for a 1-D Gaussian mixture model.

    Avoids per-element function-call overhead by precomputing per-component
    constants and accumulating the M-step sums during the E-step pass.
    """
    N = len(X)
    if init_means is not None:
        means = list(init_means)
    else:
        # k-means++-style seeding: start from a random point, then add means
        # with probability proportional to squared distance from the nearest one
        means = [X[random.randint(0, N - 1)]]
        for _ in range(K - 1):
            dists = [min((x - m) ** 2 for m in means) for x in X]
            total = sum(dists)
            if total == 0:
                means.append(X[random.randint(0, N - 1)])
                continue
            probs = [d / total for d in dists]
            r = random.random()
            cumsum = 0
            for i, p in enumerate(probs):
                cumsum += p
                if cumsum >= r:
                    means.append(X[i])
                    break

    overall_std = statistics.stdev(X) if N > 1 else 10.0
    sigmas = [overall_std / K * 2] * K
    weights = [1.0 / K] * K

    prev_ll = -1e30
    inv_K = 1.0 / K

    for iteration in range(max_iter):
        # Precompute per-component constants
        coeffs = []
        inv_2s2 = []
        for k in range(K):
            sk = sigmas[k] if sigmas[k] > 0 else 0.01
            coeffs.append(weights[k] / (sk * SQRT_2PI))
            inv_2s2.append(-0.5 / (sk * sk))

        # E-step + accumulate M-step sums in one pass
        Nk = [0.0] * K
        sum_x = [0.0] * K
        sum_x2 = [0.0] * K
        ll = 0.0
        for x in X:
            # Compute responsibilities inline
            row = [0.0] * K
            for k in range(K):
                d = x - means[k]
                exp_val = inv_2s2[k] * d * d
                if exp_val < -500:
                    row[k] = coeffs[k] * 1e-300
                else:
                    row[k] = coeffs[k] * EXP(exp_val)
            total = sum(row)
            if total < 1e-300:
                # Underflow guard: fall back to uniform responsibilities
                for k in range(K):
                    gk = inv_K
                    Nk[k] += gk
                    sum_x[k] += gk * x
                    sum_x2[k] += gk * x * x
                ll += -700
            else:
                ll += LOG(total)
                inv_total = 1.0 / total
                for k in range(K):
                    gk = row[k] * inv_total
                    Nk[k] += gk
                    sum_x[k] += gk * x
                    sum_x2[k] += gk * x * x

        # M-step
        for k in range(K):
            if Nk[k] < 1e-10:
                # Empty component: re-seed it
                means[k] = X[random.randint(0, N - 1)]
                sigmas[k] = overall_std
                weights[k] = inv_K
                continue
            weights[k] = Nk[k] / N
            means[k] = sum_x[k] / Nk[k]
            var_k = sum_x2[k] / Nk[k] - means[k] * means[k]
            sigmas[k] = math.sqrt(max(var_k, 0.01))

        if abs(ll - prev_ll) < tol:
            return weights, means, sigmas, ll, True
        prev_ll = ll

    return weights, means, sigmas, ll, False


# ============================================================
# 3. Fit GMM for K=1..5
# ============================================================
def fit_all_K(X, label):  # note: `label` is not used inside the function
    N = len(X)
    results = {}
    for K in range(1, 6):
        best_ll = -float('inf')
        best_result = None
        n_runs = 20 if K > 1 else 1
        for run in range(n_runs):
            if K == 1:
                init_means = [statistics.mean(X)]
            elif K == 2:
                inits = [[25, 75], [20, 80], [30, 70], [15, 60], None]
                init_means = inits[run] if run < len(inits) else None
            elif K == 3:
                inits = [[15, 45, 80], [10, 40, 75], [20, 50, 85], [12, 35, 78], None]
                init_means = inits[run] if run < len(inits) else None
            else:
                init_means = None
            try:
                w, m, s, ll, conv = em_gmm(X, K, init_means=init_means)
                if ll > best_ll:
                    best_ll = ll
                    best_result = (w, m, s, ll, conv)
            except Exception:
                pass
        if best_result is None:
            continue
        w, m, s, ll, conv = best_result
        order = sorted(range(K), key=lambda k: m[k])
        w = [w[k] for k in order]
        m = [m[k] for k in order]
        s = [s[k] for k in order]
        n_params = 3 * K - 1
        bic = -2 * ll + n_params * math.log(N)
        aic = -2 * ll + 2 * n_params
        results[K] = {
            'weights': w, 'means': m, 'sigmas': s, 'll': ll,
            'bic': bic, 'aic': aic, 'n_params': n_params, 'converged': conv
        }
    return results
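# Reference for the information criteria used above (a summary of what
# fit_all_K already computes, included here only as documentation):
#     free parameters: p = 3K - 1   (K means, K sigmas, K-1 independent weights)
#     BIC = -2 * ln(L) + p * ln(N)
#     AIC = -2 * ln(L) + 2 * p
# Lower is better for both; the script selects K by minimum BIC.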
# ============================================================
# 4. Model selection on POOLED and CROSS-SECTIONAL
# ============================================================
print("\n" + "=" * 60, flush=True)
print("Model Fitting: POOLED (N={})".format(N_pooled), flush=True)
print("=" * 60, flush=True)
results_pooled = fit_all_K(liberty_pooled, "pooled")
best_K_pooled = min(results_pooled, key=lambda K: results_pooled[K]['bic'])
print(f"BIC-optimal K (pooled) = {best_K_pooled}", flush=True)
for K in sorted(results_pooled):
    r = results_pooled[K]
    print(f"  K={K}: BIC={r['bic']:.2f}, LL={r['ll']:.2f}", flush=True)

print("\n" + "=" * 60, flush=True)
print("Model Fitting: CROSS-SECTIONAL (N={})".format(N_cross), flush=True)
print("=" * 60, flush=True)
results_cross = fit_all_K(liberty_cross, "cross-sectional")
best_K_cross = min(results_cross, key=lambda K: results_cross[K]['bic'])
print(f"BIC-optimal K (cross-sectional) = {best_K_cross}", flush=True)
for K in sorted(results_cross):
    r = results_cross[K]
    print(f"  K={K}: BIC={r['bic']:.2f}, LL={r['ll']:.2f}", flush=True)

# ============================================================
# 5. Bootstrap on pooled data -- target 1000, budget 8 min
# ============================================================
TARGET_BOOT = 1000
TIME_BUDGET = 480  # 8 minutes
K_boot = best_K_pooled
ref_means = results_pooled[K_boot]['means']

print(f"\n{'='*60}", flush=True)
print(f"Bootstrap CIs for K={K_boot} on POOLED (target {TARGET_BOOT} resamples)", flush=True)
print(f"{'='*60}", flush=True)

boot_weights = [[] for _ in range(K_boot)]
boot_means = [[] for _ in range(K_boot)]
boot_sigmas = [[] for _ in range(K_boot)]

t0 = time.time()
n_done = 0
for b in range(TARGET_BOOT):
    X_boot = random.choices(liberty_pooled, k=N_pooled)
    try:
        init_m = [m + random.gauss(0, 2) for m in ref_means]
        w, m, s, ll, conv = em_gmm(X_boot, K_boot, max_iter=150, init_means=init_m)
        order = sorted(range(K_boot), key=lambda k: m[k])
        w = [w[k] for k in order]
        m = [m[k] for k in order]
        s = [s[k] for k in order]
        for k in range(K_boot):
            boot_weights[k].append(w[k])
            boot_means[k].append(m[k])
            boot_sigmas[k].append(s[k])
    except Exception:
        pass
    n_done = b + 1
    if n_done % 50 == 0:
        elapsed = time.time() - t0
        rate = n_done / elapsed
        eta = (TARGET_BOOT - n_done) / rate if rate > 0 else 0
        print(f"  {n_done}/{TARGET_BOOT} ({elapsed:.0f}s, {rate:.1f}/s, ETA {eta:.0f}s)", flush=True)
        if elapsed > TIME_BUDGET:
            print(f"  TIME BUDGET REACHED at {n_done} resamples", flush=True)
            break

total_time = time.time() - t0
n_boot_actual = n_done
n_valid = len(boot_means[0])
print(f"\nCompleted {n_boot_actual} resamples in {total_time:.1f}s ({n_valid} valid)", flush=True)
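# Note on the bootstrap loop above: each resample is drawn with replacement
# from the pooled data, the GMM is refit, and the components are re-sorted by
# mean so that "component k" refers to the same (lowest-to-highest) mode across
# resamples -- a simple guard against label switching.  Resamples whose refit
# raises an exception are skipped, so n_valid can be smaller than n_done.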
# ============================================================
# 6. Convergence diagnostics
# ============================================================
print("\n" + "=" * 60, flush=True)
print("Convergence Diagnostics", flush=True)
print("=" * 60, flush=True)

def compute_ci(vals, n_use):
    subset = sorted(vals[:n_use])
    lo_idx = int(0.025 * n_use)
    hi_idx = min(int(0.975 * n_use), n_use - 1)
    return subset[lo_idx], subset[hi_idx]

half_n = min(n_valid // 2, 500)

convergence_table = []
for k in range(K_boot):
    bm_all = boot_means[k]
    nv = len(bm_all)
    if nv < 100:
        print(f"  Component {k+1}: only {nv} valid, skipping", flush=True)
        continue
    ci_half = compute_ci(bm_all, half_n)
    ci_full = compute_ci(bm_all, nv)
    w_half = ci_half[1] - ci_half[0]
    w_full = ci_full[1] - ci_full[0]
    ch = abs(w_full - w_half) / w_half * 100 if w_half > 0 else 0

    bw_all = boot_weights[k]
    wci_half = compute_ci(bw_all, half_n)
    wci_full = compute_ci(bw_all, nv)
    ww_half = wci_half[1] - wci_half[0]
    ww_full = wci_full[1] - wci_full[0]
    wch = abs(ww_full - ww_half) / ww_half * 100 if ww_half > 0 else 0

    convergence_table.append({
        'k': k + 1,
        'mean_ci_half': ci_half, 'mean_ci_full': ci_full,
        'mean_width_change_pct': ch, 'half_n': half_n,
        'weight_ci_half': wci_half, 'weight_ci_full': wci_full,
        'weight_width_change_pct': wch, 'n_full': nv
    })
    print(f"\n  Component {k+1} (n={nv}):", flush=True)
    print(f"    Mean CI @{half_n}: [{ci_half[0]:.2f}, {ci_half[1]:.2f}] w={w_half:.2f}", flush=True)
    print(f"    Mean CI @{nv}: [{ci_full[0]:.2f}, {ci_full[1]:.2f}] w={w_full:.2f}", flush=True)
    print(f"    Change: {ch:.1f}%", flush=True)
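# Worked example of the percentile indexing used by compute_ci above and by
# the CI tables below (illustrative arithmetic only):
#     B = 1000 valid draws -> lo = int(0.025 * 1000) = 25, hi = min(int(0.975 * 1000), 999) = 975
#     B = 500  valid draws -> lo = int(0.025 * 500)  = 12, hi = min(int(0.975 * 500), 499)  = 487
# i.e. roughly the 2.5th and 97.5th percentiles of the sorted bootstrap draws,
# giving an approximate 95% percentile interval.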
# ============================================================
# 7. Final CI table
# ============================================================
print(f"\n{'='*60}", flush=True)
print(f"Final Bootstrap 95% CIs (K={K_boot}, B={n_valid})", flush=True)
print(f"{'='*60}", flush=True)

point_est = results_pooled[K_boot]
component_summary = []
for k in range(K_boot):
    bw = sorted(boot_weights[k])
    bm = sorted(boot_means[k])
    bs = sorted(boot_sigmas[k])
    n_b = len(bw)
    lo, hi = int(0.025 * n_b), min(int(0.975 * n_b), n_b - 1)
    comp = {
        'weight': point_est['weights'][k], 'weight_ci': (bw[lo], bw[hi]),
        'mean': point_est['means'][k], 'mean_ci': (bm[lo], bm[hi]),
        'sigma': point_est['sigmas'][k], 'sigma_ci': (bs[lo], bs[hi])
    }
    component_summary.append(comp)
    print(f"  k={k+1}: w={comp['weight']:.3f}[{bw[lo]:.3f},{bw[hi]:.3f}] "
          f"mu={comp['mean']:.2f}[{bm[lo]:.2f},{bm[hi]:.2f}] "
          f"sig={comp['sigma']:.2f}[{bs[lo]:.2f},{bs[hi]:.2f}]", flush=True)

# ============================================================
# 8. Also bootstrap K=3 if not best
# ============================================================
component_summary_k3 = None
time_left = TIME_BUDGET - (time.time() - t0)
if best_K_pooled != 3 and 3 in results_pooled and time_left > 60:
    print(f"\n{'='*60}", flush=True)
    print(f"Bootstrap CIs for K=3 (theory-predicted, budget {time_left:.0f}s left)", flush=True)
    print(f"{'='*60}", flush=True)
    ref_means_3 = results_pooled[3]['means']
    boot_w3 = [[] for _ in range(3)]
    boot_m3 = [[] for _ in range(3)]
    boot_s3 = [[] for _ in range(3)]
    t0_3 = time.time()
    n_done_3 = 0
    for b in range(TARGET_BOOT):
        X_boot = random.choices(liberty_pooled, k=N_pooled)
        try:
            init_m = [m + random.gauss(0, 2) for m in ref_means_3]
            w, m, s, ll, conv = em_gmm(X_boot, 3, max_iter=150, init_means=init_m)
            order = sorted(range(3), key=lambda k: m[k])
            w = [w[k] for k in order]
            m = [m[k] for k in order]
            s = [s[k] for k in order]
            for k in range(3):
                boot_w3[k].append(w[k])
                boot_m3[k].append(m[k])
                boot_s3[k].append(s[k])
        except Exception:
            pass
        n_done_3 = b + 1
        if n_done_3 % 50 == 0:
            elapsed_3 = time.time() - t0_3
            print(f"  {n_done_3} resamples ({elapsed_3:.0f}s)", flush=True)
            if elapsed_3 > time_left:
                print(f"  TIME BUDGET REACHED at {n_done_3}", flush=True)
                break

    if len(boot_w3[0]) >= 20:
        component_summary_k3 = []
        pe3 = results_pooled[3]
        for k in range(3):
            bw = sorted(boot_w3[k])
            bm = sorted(boot_m3[k])
            bs = sorted(boot_s3[k])
            n_b = len(bw)
            lo, hi = int(0.025 * n_b), min(int(0.975 * n_b), n_b - 1)
            comp = {
                'weight': pe3['weights'][k], 'weight_ci': (bw[lo], bw[hi]),
                'mean': pe3['means'][k], 'mean_ci': (bm[lo], bm[hi]),
                'sigma': pe3['sigmas'][k], 'sigma_ci': (bs[lo], bs[hi])
            }
            component_summary_k3.append(comp)
            print(f"  k={k+1}: w={pe3['weights'][k]:.3f}[{bw[lo]:.3f},{bw[hi]:.3f}] "
                  f"mu={pe3['means'][k]:.2f}[{bm[lo]:.2f},{bm[hi]:.2f}]", flush=True)
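# If the K=3 bootstrap was skipped (best K is already 3, no K=3 fit available,
# or insufficient time left) or produced fewer than 20 valid refits,
# component_summary_k3 remains None here and the K=3 table in the report
# below is simply omitted.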
# ============================================================
# 9. Write results
# ============================================================
out_path = '/tmp/pt-data/fix04-gmm-1000-results.md'
with open(out_path, 'w') as f:
    f.write("# fix04: GMM Bootstrap with Increased Resamples + Cross-Sectional Comparison\n\n")

    if n_boot_actual < 1000:
        f.write(f"**Note**: Target was 1000 resamples but runtime budget limited execution to "
                f"{n_boot_actual} resamples ({n_valid} valid). Pure-Python EM on N={N_pooled} "
                f"is computationally intensive without NumPy.\n\n")

    f.write("## Data Summary\n\n")
    f.write(f"- Pooled: N = {N_pooled}, range [{min(liberty_pooled):.1f}, {max(liberty_pooled):.1f}], "
            f"mean = {statistics.mean(liberty_pooled):.2f}, sd = {statistics.stdev(liberty_pooled):.2f}\n")
    f.write(f"- Cross-sectional (latest per country): N = {N_cross}, "
            f"range [{min(liberty_cross):.1f}, {max(liberty_cross):.1f}], "
            f"mean = {statistics.mean(liberty_cross):.2f}, sd = {statistics.stdev(liberty_cross):.2f}\n\n")

    f.write("## Model Comparison: Pooled\n\n")
    f.write("| K | Params | Log-Lik | AIC | BIC | dBIC |\n")
    f.write("|---|--------|---------|-----|-----|------|\n")
    best_bic_p = min(results_pooled[K]['bic'] for K in results_pooled)
    for K in sorted(results_pooled):
        r = results_pooled[K]
        dbic = r['bic'] - best_bic_p
        mark = " *" if dbic == 0 else ""
        f.write(f"| {K} | {r['n_params']} | {r['ll']:.2f} | {r['aic']:.2f} | {r['bic']:.2f} | {dbic:.2f}{mark} |\n")
    f.write(f"\n**BIC-optimal K (pooled) = {best_K_pooled}**\n\n")

    f.write("## Model Comparison: Cross-Sectional\n\n")
    f.write("| K | Params | Log-Lik | AIC | BIC | dBIC |\n")
    f.write("|---|--------|---------|-----|-----|------|\n")
    best_bic_c = min(results_cross[K]['bic'] for K in results_cross)
    for K in sorted(results_cross):
        r = results_cross[K]
        dbic = r['bic'] - best_bic_c
        mark = " *" if dbic == 0 else ""
        f.write(f"| {K} | {r['n_params']} | {r['ll']:.2f} | {r['aic']:.2f} | {r['bic']:.2f} | {dbic:.2f}{mark} |\n")
    f.write(f"\n**BIC-optimal K (cross-sectional) = {best_K_cross}**\n\n")

    f.write("## Pooled vs Cross-Sectional: BIC-Optimal K\n\n")
    if best_K_pooled == best_K_cross:
        f.write(f"Both samples yield K = {best_K_pooled}. The cluster structure is robust "
                f"to whether we use pooled panel data or cross-sectional snapshots.\n\n")
    else:
        f.write(f"Pooled selects K = {best_K_pooled}, cross-sectional selects K = {best_K_cross}.\n")
        f.write("The difference may reflect:\n")
        f.write("- Temporal autocorrelation inflating apparent clusters in pooled data\n")
        f.write("- Small cross-sectional N limiting power to resolve components\n")
        f.write("- Genuine structural differences between pooled and snapshot distributions\n\n")

    f.write("### Cross-Sectional Component Details (K={})\n\n".format(best_K_cross))
    f.write("| Component | Weight | Mean | Sigma |\n")
    f.write("|-----------|--------|------|-------|\n")
    rc = results_cross[best_K_cross]
    for k in range(best_K_cross):
        f.write(f"| {k+1} | {rc['weights'][k]:.3f} | {rc['means'][k]:.2f} | {rc['sigmas'][k]:.2f} |\n")
    f.write("\n")

    f.write(f"## Convergence Diagnostics: {half_n} vs {n_valid} Resamples\n\n")
    f.write(f"| Component | Param | CI @{half_n} | CI @{n_valid} | Width @{half_n} | Width @{n_valid} | Change |\n")
    f.write("|-----------|-------|---------|----------|-----------|------------|--------|\n")
    for row in convergence_table:
        k = row['k']
        c5 = row['mean_ci_half']
        cf = row['mean_ci_full']
        w5 = c5[1] - c5[0]
        wf = cf[1] - cf[0]
        f.write(f"| {k} | Mean | [{c5[0]:.2f}, {c5[1]:.2f}] | [{cf[0]:.2f}, {cf[1]:.2f}] | "
                f"{w5:.2f} | {wf:.2f} | {row['mean_width_change_pct']:.1f}% |\n")
        c5w = row['weight_ci_half']
        cfw = row['weight_ci_full']
        w5w = c5w[1] - c5w[0]
        wfw = cfw[1] - cfw[0]
        f.write(f"| {k} | Weight | [{c5w[0]:.3f}, {c5w[1]:.3f}] | [{cfw[0]:.3f}, {cfw[1]:.3f}] | "
                f"{w5w:.3f} | {wfw:.3f} | {row['weight_width_change_pct']:.1f}% |\n")
    f.write("\n")
    if convergence_table and all(r['mean_width_change_pct'] < 10 for r in convergence_table):
        f.write("All CI widths changed by <10%. Bootstrap has converged.\n\n")
    elif convergence_table:
        f.write("Some CI widths changed by >=10%. More resamples may be needed, "
                "but results are still informative.\n\n")

    f.write(f"## Bootstrap 95% CIs (K={K_boot}, B={n_valid})\n\n")
    f.write("| Component | Weight [95% CI] | Mean [95% CI] | Sigma [95% CI] |\n")
    f.write("|-----------|----------------|--------------|----------------|\n")
    for k in range(K_boot):
        cs = component_summary[k]
        f.write(f"| {k+1} | {cs['weight']:.3f} [{cs['weight_ci'][0]:.3f}, {cs['weight_ci'][1]:.3f}] | "
                f"{cs['mean']:.2f} [{cs['mean_ci'][0]:.2f}, {cs['mean_ci'][1]:.2f}] | "
                f"{cs['sigma']:.2f} [{cs['sigma_ci'][0]:.2f}, {cs['sigma_ci'][1]:.2f}] |\n")
    f.write(f"\nBootstrap completed in {total_time:.1f}s ({n_boot_actual} attempted, {n_valid} valid)\n\n")

    if component_summary_k3 is not None:
        f.write("## Bootstrap 95% CIs for K=3 (theory-predicted)\n\n")
        f.write("| Component | Weight [95% CI] | Mean [95% CI] | Sigma [95% CI] |\n")
        f.write("|-----------|----------------|--------------|----------------|\n")
        for k in range(3):
            cs = component_summary_k3[k]
            f.write(f"| {k+1} | {cs['weight']:.3f} [{cs['weight_ci'][0]:.3f}, {cs['weight_ci'][1]:.3f}] | "
                    f"{cs['mean']:.2f} [{cs['mean_ci'][0]:.2f}, {cs['mean_ci'][1]:.2f}] | "
                    f"{cs['sigma']:.2f} [{cs['sigma_ci'][0]:.2f}, {cs['sigma_ci'][1]:.2f}] |\n")
        f.write("\n")

    f.write("---\n*Generated by fix04_gmm_bootstrap_1000.py*\n")

print(f"\nResults saved to {out_path}", flush=True)
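# Example invocation (assumes the file is saved under the name referenced in
# the report footer, and that the input CSV exists with 'country', 'year' and
# 'liberty' columns):
#     python3 fix04_gmm_bootstrap_1000.py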