#!/usr/bin/env python3
"""
fix04 -- GMM Bootstrap with 1000 Resamples + Cross-Sectional Comparison

Reproduces the core GMM/EM logic from b1_gmm_model_comparison.py with:
- Bootstrap target: 1000 resamples (falls back to however many complete
  within the runtime budget)
- Convergence diagnostics: CI width at B/2 vs B resamples
  (250 vs 500, or 500 vs 1000 if the full target is reached)
- Cross-sectional analysis (latest obs per country, N~91) vs pooled (N=1656)
- BIC-optimal K comparison between the two samples

Uses only the stdlib: csv, math, random, statistics, time, collections.

Performance notes:
- Pure-Python EM (CPython 3.7) on N=1656 takes ~1.5 s per resample per K,
  so 1000 resamples of the best K would need ~25 min.  We attempt 1000
  within an 8-minute budget (TIME_BUDGET = 480 s) and report whatever
  count is achieved.
"""
import csv
import math
import random
import statistics
import time
from collections import defaultdict

random.seed(42)

SQRT_2PI = math.sqrt(2 * math.pi)
LOG = math.log   # local aliases avoid repeated attribute lookups in the hot loop
EXP = math.exp

# ============================================================
# 1. Load data
# ============================================================
data = []
with open('/tmp/pt-data/political-topology-flat.csv', 'r') as f:
    reader = csv.DictReader(f)
    for row in reader:
        data.append(row)

liberty_pooled = [float(row['liberty']) for row in data]
N_pooled = len(liberty_pooled)

# Cross-sectional sample: the most recent observation per country
countries = defaultdict(list)
for row in data:
    countries[row['country']].append(row)

liberty_cross = []
for c, rows in countries.items():
    rows.sort(key=lambda r: int(r['year']))
    liberty_cross.append(float(rows[-1]['liberty']))
N_cross = len(liberty_cross)

print(f"Pooled N = {N_pooled}", flush=True)
print(f"Cross-sectional N = {N_cross}", flush=True)

# ============================================================
# 2. Optimized EM Algorithm
# ============================================================
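# The function below implements the standard EM updates for a 1-D Gaussian
# mixture; this note is only a reference sketch of those updates, not an
# extra feature of the script.  With responsibilities
#     r_ik = w_k * N(x_i; mu_k, sigma_k) / sum_j w_j * N(x_i; mu_j, sigma_j)
# and N_k = sum_i r_ik, the M-step sets
#     w_k       = N_k / N
#     mu_k      = (sum_i r_ik * x_i) / N_k
#     sigma_k^2 = (sum_i r_ik * x_i^2) / N_k - mu_k^2
# which is exactly what the accumulated sums Nk, sum_x and sum_x2 compute.
# Iteration stops once the log-likelihood changes by less than `tol`.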
def em_gmm(X, K, max_iter=200, tol=1e-6, init_means=None):
    """Optimized EM for a 1-D Gaussian mixture model.

    Avoids per-element function-call overhead by precomputing per-component
    constants and accumulating the M-step sums during the E-step pass.
    """
    N = len(X)
    if init_means is not None:
        means = list(init_means)
    else:
        # k-means++-style seeding: start from a random point, then add means
        # with probability proportional to squared distance from the nearest one
        means = [X[random.randint(0, N - 1)]]
        for _ in range(K - 1):
            dists = [min((x - m) ** 2 for m in means) for x in X]
            total = sum(dists)
            if total == 0:
                means.append(X[random.randint(0, N - 1)])
                continue
            probs = [d / total for d in dists]
            r = random.random()
            cumsum = 0
            for i, p in enumerate(probs):
                cumsum += p
                if cumsum >= r:
                    means.append(X[i])
                    break

    overall_std = statistics.stdev(X) if N > 1 else 10.0
    sigmas = [overall_std / K * 2] * K
    weights = [1.0 / K] * K

    prev_ll = -1e30
    inv_K = 1.0 / K

    for iteration in range(max_iter):
        # Precompute per-component constants
        coeffs = []
        inv_2s2 = []
        for k in range(K):
            sk = sigmas[k] if sigmas[k] > 0 else 0.01
            coeffs.append(weights[k] / (sk * SQRT_2PI))
            inv_2s2.append(-0.5 / (sk * sk))

        # E-step + accumulate M-step sums in one pass
        Nk = [0.0] * K
        sum_x = [0.0] * K
        sum_x2 = [0.0] * K
        ll = 0.0
        for x in X:
            # Compute responsibilities inline
            row = [0.0] * K
            for k in range(K):
                d = x - means[k]
                exp_val = inv_2s2[k] * d * d
                if exp_val < -500:
                    row[k] = coeffs[k] * 1e-300
                else:
                    row[k] = coeffs[k] * EXP(exp_val)
            total = sum(row)
            if total < 1e-300:
                # Underflow guard: fall back to uniform responsibilities
                for k in range(K):
                    gk = inv_K
                    Nk[k] += gk
                    sum_x[k] += gk * x
                    sum_x2[k] += gk * x * x
                ll += -700
            else:
                ll += LOG(total)
                inv_total = 1.0 / total
                for k in range(K):
                    gk = row[k] * inv_total
                    Nk[k] += gk
                    sum_x[k] += gk * x
                    sum_x2[k] += gk * x * x

        # M-step
        for k in range(K):
            if Nk[k] < 1e-10:
                # Empty component: re-seed it
                means[k] = X[random.randint(0, N - 1)]
                sigmas[k] = overall_std
                weights[k] = inv_K
                continue
            weights[k] = Nk[k] / N
            means[k] = sum_x[k] / Nk[k]
            var_k = sum_x2[k] / Nk[k] - means[k] * means[k]
            sigmas[k] = math.sqrt(max(var_k, 0.01))

        if abs(ll - prev_ll) < tol:
            return weights, means, sigmas, ll, True
        prev_ll = ll

    return weights, means, sigmas, ll, False


# ============================================================
# 3. Fit GMM for K=1..5
# ============================================================
def fit_all_K(X, label):  # note: `label` is not used inside the function
    N = len(X)
    results = {}
    for K in range(1, 6):
        best_ll = -float('inf')
        best_result = None
        n_runs = 20 if K > 1 else 1
        for run in range(n_runs):
            if K == 1:
                init_means = [statistics.mean(X)]
            elif K == 2:
                inits = [[25, 75], [20, 80], [30, 70], [15, 60], None]
                init_means = inits[run] if run < len(inits) else None
            elif K == 3:
                inits = [[15, 45, 80], [10, 40, 75], [20, 50, 85], [12, 35, 78], None]
                init_means = inits[run] if run < len(inits) else None
            else:
                init_means = None
            try:
                w, m, s, ll, conv = em_gmm(X, K, init_means=init_means)
                if ll > best_ll:
                    best_ll = ll
                    best_result = (w, m, s, ll, conv)
            except Exception:
                pass
        if best_result is None:
            continue
        w, m, s, ll, conv = best_result
        order = sorted(range(K), key=lambda k: m[k])
        w = [w[k] for k in order]
        m = [m[k] for k in order]
        s = [s[k] for k in order]
        n_params = 3 * K - 1
        bic = -2 * ll + n_params * math.log(N)
        aic = -2 * ll + 2 * n_params
        results[K] = {
            'weights': w, 'means': m, 'sigmas': s, 'll': ll,
            'bic': bic, 'aic': aic, 'n_params': n_params, 'converged': conv
        }
    return results
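# Reference for the information criteria used above (a summary of what
# fit_all_K already computes, included here only as documentation):
#     free parameters: p = 3K - 1   (K means, K sigmas, K-1 independent weights)
#     BIC = -2 * ln(L) + p * ln(N)
#     AIC = -2 * ln(L) + 2 * p
# Lower is better for both; the script selects K by minimum BIC.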
# ============================================================
# 4. Model selection on POOLED and CROSS-SECTIONAL
# ============================================================
print("\n" + "=" * 60, flush=True)
print("Model Fitting: POOLED (N={})".format(N_pooled), flush=True)
print("=" * 60, flush=True)
results_pooled = fit_all_K(liberty_pooled, "pooled")
best_K_pooled = min(results_pooled, key=lambda K: results_pooled[K]['bic'])
print(f"BIC-optimal K (pooled) = {best_K_pooled}", flush=True)
for K in sorted(results_pooled):
    r = results_pooled[K]
    print(f"  K={K}: BIC={r['bic']:.2f}, LL={r['ll']:.2f}", flush=True)

print("\n" + "=" * 60, flush=True)
print("Model Fitting: CROSS-SECTIONAL (N={})".format(N_cross), flush=True)
print("=" * 60, flush=True)
results_cross = fit_all_K(liberty_cross, "cross-sectional")
best_K_cross = min(results_cross, key=lambda K: results_cross[K]['bic'])
print(f"BIC-optimal K (cross-sectional) = {best_K_cross}", flush=True)
for K in sorted(results_cross):
    r = results_cross[K]
    print(f"  K={K}: BIC={r['bic']:.2f}, LL={r['ll']:.2f}", flush=True)

# ============================================================
# 5. Bootstrap on pooled data -- target 1000, budget 8 min
# ============================================================
TARGET_BOOT = 1000
TIME_BUDGET = 480  # 8 minutes
K_boot = best_K_pooled
ref_means = results_pooled[K_boot]['means']

print(f"\n{'='*60}", flush=True)
print(f"Bootstrap CIs for K={K_boot} on POOLED (target {TARGET_BOOT} resamples)", flush=True)
print(f"{'='*60}", flush=True)

boot_weights = [[] for _ in range(K_boot)]
boot_means = [[] for _ in range(K_boot)]
boot_sigmas = [[] for _ in range(K_boot)]

t0 = time.time()
n_done = 0
for b in range(TARGET_BOOT):
    X_boot = random.choices(liberty_pooled, k=N_pooled)
    try:
        init_m = [m + random.gauss(0, 2) for m in ref_means]
        w, m, s, ll, conv = em_gmm(X_boot, K_boot, max_iter=150, init_means=init_m)
        order = sorted(range(K_boot), key=lambda k: m[k])
        w = [w[k] for k in order]
        m = [m[k] for k in order]
        s = [s[k] for k in order]
        for k in range(K_boot):
            boot_weights[k].append(w[k])
            boot_means[k].append(m[k])
            boot_sigmas[k].append(s[k])
    except Exception:
        pass
    n_done = b + 1
    if n_done % 50 == 0:
        elapsed = time.time() - t0
        rate = n_done / elapsed
        eta = (TARGET_BOOT - n_done) / rate if rate > 0 else 0
        print(f"  {n_done}/{TARGET_BOOT} ({elapsed:.0f}s, {rate:.1f}/s, ETA {eta:.0f}s)", flush=True)
        if elapsed > TIME_BUDGET:
            print(f"  TIME BUDGET REACHED at {n_done} resamples", flush=True)
            break

total_time = time.time() - t0
n_boot_actual = n_done
n_valid = len(boot_means[0])
print(f"\nCompleted {n_boot_actual} resamples in {total_time:.1f}s ({n_valid} valid)", flush=True)
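# Note on the bootstrap loop above: each resample is drawn with replacement
# from the pooled data, the GMM is refit, and the components are re-sorted by
# mean so that "component k" refers to the same (lowest-to-highest) mode across
# resamples -- a simple guard against label switching.  Resamples whose refit
# raises an exception are skipped, so n_valid can be smaller than n_done.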
# ============================================================
# 6. Convergence diagnostics
# ============================================================
print("\n" + "=" * 60, flush=True)
print("Convergence Diagnostics", flush=True)
print("=" * 60, flush=True)

def compute_ci(vals, n_use):
    subset = sorted(vals[:n_use])
    lo_idx = int(0.025 * n_use)
    hi_idx = min(int(0.975 * n_use), n_use - 1)
    return subset[lo_idx], subset[hi_idx]

half_n = min(n_valid // 2, 500)

convergence_table = []
for k in range(K_boot):
    bm_all = boot_means[k]
    nv = len(bm_all)
    if nv < 100:
        print(f"  Component {k+1}: only {nv} valid, skipping", flush=True)
        continue
    ci_half = compute_ci(bm_all, half_n)
    ci_full = compute_ci(bm_all, nv)
    w_half = ci_half[1] - ci_half[0]
    w_full = ci_full[1] - ci_full[0]
    ch = abs(w_full - w_half) / w_half * 100 if w_half > 0 else 0

    bw_all = boot_weights[k]
    wci_half = compute_ci(bw_all, half_n)
    wci_full = compute_ci(bw_all, nv)
    ww_half = wci_half[1] - wci_half[0]
    ww_full = wci_full[1] - wci_full[0]
    wch = abs(ww_full - ww_half) / ww_half * 100 if ww_half > 0 else 0

    convergence_table.append({
        'k': k + 1,
        'mean_ci_half': ci_half, 'mean_ci_full': ci_full,
        'mean_width_change_pct': ch, 'half_n': half_n,
        'weight_ci_half': wci_half, 'weight_ci_full': wci_full,
        'weight_width_change_pct': wch, 'n_full': nv
    })
    print(f"\n  Component {k+1} (n={nv}):", flush=True)
    print(f"    Mean CI @{half_n}: [{ci_half[0]:.2f}, {ci_half[1]:.2f}] w={w_half:.2f}", flush=True)
    print(f"    Mean CI @{nv}: [{ci_full[0]:.2f}, {ci_full[1]:.2f}] w={w_full:.2f}", flush=True)
    print(f"    Change: {ch:.1f}%", flush=True)
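# Worked example of the percentile indexing used by compute_ci above and by
# the CI tables below (illustrative arithmetic only):
#     B = 1000 valid draws -> lo = int(0.025 * 1000) = 25, hi = min(int(0.975 * 1000), 999) = 975
#     B = 500  valid draws -> lo = int(0.025 * 500)  = 12, hi = min(int(0.975 * 500), 499)  = 487
# i.e. roughly the 2.5th and 97.5th percentiles of the sorted bootstrap draws,
# giving an approximate 95% percentile interval.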
# ============================================================
# 7. Final CI table
# ============================================================
print(f"\n{'='*60}", flush=True)
print(f"Final Bootstrap 95% CIs (K={K_boot}, B={n_valid})", flush=True)
print(f"{'='*60}", flush=True)

point_est = results_pooled[K_boot]
component_summary = []
for k in range(K_boot):
    bw = sorted(boot_weights[k])
    bm = sorted(boot_means[k])
    bs = sorted(boot_sigmas[k])
    n_b = len(bw)
    lo, hi = int(0.025 * n_b), min(int(0.975 * n_b), n_b - 1)
    comp = {
        'weight': point_est['weights'][k], 'weight_ci': (bw[lo], bw[hi]),
        'mean': point_est['means'][k], 'mean_ci': (bm[lo], bm[hi]),
        'sigma': point_est['sigmas'][k], 'sigma_ci': (bs[lo], bs[hi])
    }
    component_summary.append(comp)
    print(f"  k={k+1}: w={comp['weight']:.3f}[{bw[lo]:.3f},{bw[hi]:.3f}] "
          f"mu={comp['mean']:.2f}[{bm[lo]:.2f},{bm[hi]:.2f}] "
          f"sig={comp['sigma']:.2f}[{bs[lo]:.2f},{bs[hi]:.2f}]", flush=True)

# ============================================================
# 8. Also bootstrap K=3 if not best
# ============================================================
component_summary_k3 = None
time_left = TIME_BUDGET - (time.time() - t0)
if best_K_pooled != 3 and 3 in results_pooled and time_left > 60:
    print(f"\n{'='*60}", flush=True)
    print(f"Bootstrap CIs for K=3 (theory-predicted, budget {time_left:.0f}s left)", flush=True)
    print(f"{'='*60}", flush=True)
    ref_means_3 = results_pooled[3]['means']
    boot_w3 = [[] for _ in range(3)]
    boot_m3 = [[] for _ in range(3)]
    boot_s3 = [[] for _ in range(3)]
    t0_3 = time.time()
    n_done_3 = 0
    for b in range(TARGET_BOOT):
        X_boot = random.choices(liberty_pooled, k=N_pooled)
        try:
            init_m = [m + random.gauss(0, 2) for m in ref_means_3]
            w, m, s, ll, conv = em_gmm(X_boot, 3, max_iter=150, init_means=init_m)
            order = sorted(range(3), key=lambda k: m[k])
            w = [w[k] for k in order]
            m = [m[k] for k in order]
            s = [s[k] for k in order]
            for k in range(3):
                boot_w3[k].append(w[k])
                boot_m3[k].append(m[k])
                boot_s3[k].append(s[k])
        except Exception:
            pass
        n_done_3 = b + 1
        if n_done_3 % 50 == 0:
            elapsed_3 = time.time() - t0_3
            print(f"  {n_done_3} resamples ({elapsed_3:.0f}s)", flush=True)
            if elapsed_3 > time_left:
                print(f"  TIME BUDGET REACHED at {n_done_3}", flush=True)
                break

    if len(boot_w3[0]) >= 20:
        component_summary_k3 = []
        pe3 = results_pooled[3]
        for k in range(3):
            bw = sorted(boot_w3[k])
            bm = sorted(boot_m3[k])
            bs = sorted(boot_s3[k])
            n_b = len(bw)
            lo, hi = int(0.025 * n_b), min(int(0.975 * n_b), n_b - 1)
            comp = {
                'weight': pe3['weights'][k], 'weight_ci': (bw[lo], bw[hi]),
                'mean': pe3['means'][k], 'mean_ci': (bm[lo], bm[hi]),
                'sigma': pe3['sigmas'][k], 'sigma_ci': (bs[lo], bs[hi])
            }
            component_summary_k3.append(comp)
            print(f"  k={k+1}: w={pe3['weights'][k]:.3f}[{bw[lo]:.3f},{bw[hi]:.3f}] "
                  f"mu={pe3['means'][k]:.2f}[{bm[lo]:.2f},{bm[hi]:.2f}]", flush=True)
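# If the K=3 bootstrap was skipped (best K is already 3, no K=3 fit available,
# or insufficient time left) or produced fewer than 20 valid refits,
# component_summary_k3 remains None here and the K=3 table in the report
# below is simply omitted.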
# ============================================================
# 9. Write results
# ============================================================
out_path = '/tmp/pt-data/fix04-gmm-1000-results.md'
with open(out_path, 'w') as f:
    f.write("# fix04: GMM Bootstrap with Increased Resamples + Cross-Sectional Comparison\n\n")

    if n_boot_actual < 1000:
        f.write(f"**Note**: Target was 1000 resamples but runtime budget limited execution to "
                f"{n_boot_actual} resamples ({n_valid} valid). Pure-Python EM on N={N_pooled} "
                f"is computationally intensive without NumPy.\n\n")

    f.write("## Data Summary\n\n")
    f.write(f"- Pooled: N = {N_pooled}, range [{min(liberty_pooled):.1f}, {max(liberty_pooled):.1f}], "
            f"mean = {statistics.mean(liberty_pooled):.2f}, sd = {statistics.stdev(liberty_pooled):.2f}\n")
    f.write(f"- Cross-sectional (latest per country): N = {N_cross}, "
            f"range [{min(liberty_cross):.1f}, {max(liberty_cross):.1f}], "
            f"mean = {statistics.mean(liberty_cross):.2f}, sd = {statistics.stdev(liberty_cross):.2f}\n\n")

    f.write("## Model Comparison: Pooled\n\n")
    f.write("| K | Params | Log-Lik | AIC | BIC | dBIC |\n")
    f.write("|---|--------|---------|-----|-----|------|\n")
    best_bic_p = min(results_pooled[K]['bic'] for K in results_pooled)
    for K in sorted(results_pooled):
        r = results_pooled[K]
        dbic = r['bic'] - best_bic_p
        mark = " *" if dbic == 0 else ""
        f.write(f"| {K} | {r['n_params']} | {r['ll']:.2f} | {r['aic']:.2f} | {r['bic']:.2f} | {dbic:.2f}{mark} |\n")
    f.write(f"\n**BIC-optimal K (pooled) = {best_K_pooled}**\n\n")

    f.write("## Model Comparison: Cross-Sectional\n\n")
    f.write("| K | Params | Log-Lik | AIC | BIC | dBIC |\n")
    f.write("|---|--------|---------|-----|-----|------|\n")
    best_bic_c = min(results_cross[K]['bic'] for K in results_cross)
    for K in sorted(results_cross):
        r = results_cross[K]
        dbic = r['bic'] - best_bic_c
        mark = " *" if dbic == 0 else ""
        f.write(f"| {K} | {r['n_params']} | {r['ll']:.2f} | {r['aic']:.2f} | {r['bic']:.2f} | {dbic:.2f}{mark} |\n")
    f.write(f"\n**BIC-optimal K (cross-sectional) = {best_K_cross}**\n\n")

    f.write("## Pooled vs Cross-Sectional: BIC-Optimal K\n\n")
    if best_K_pooled == best_K_cross:
        f.write(f"Both samples yield K = {best_K_pooled}. The cluster structure is robust "
                f"to whether we use pooled panel data or cross-sectional snapshots.\n\n")
    else:
        f.write(f"Pooled selects K = {best_K_pooled}, cross-sectional selects K = {best_K_cross}.\n")
        f.write("The difference may reflect:\n")
        f.write("- Temporal autocorrelation inflating apparent clusters in pooled data\n")
        f.write("- Small cross-sectional N limiting power to resolve components\n")
        f.write("- Genuine structural differences between pooled and snapshot distributions\n\n")

    f.write("### Cross-Sectional Component Details (K={})\n\n".format(best_K_cross))
    f.write("| Component | Weight | Mean | Sigma |\n")
    f.write("|-----------|--------|------|-------|\n")
    rc = results_cross[best_K_cross]
    for k in range(best_K_cross):
        f.write(f"| {k+1} | {rc['weights'][k]:.3f} | {rc['means'][k]:.2f} | {rc['sigmas'][k]:.2f} |\n")
    f.write("\n")

    f.write(f"## Convergence Diagnostics: {half_n} vs {n_valid} Resamples\n\n")
    f.write(f"| Component | Param | CI @{half_n} | CI @{n_valid} | Width @{half_n} | Width @{n_valid} | Change |\n")
    f.write("|-----------|-------|---------|----------|-----------|------------|--------|\n")
    for row in convergence_table:
        k = row['k']
        c5 = row['mean_ci_half']
        cf = row['mean_ci_full']
        w5 = c5[1] - c5[0]
        wf = cf[1] - cf[0]
        f.write(f"| {k} | Mean | [{c5[0]:.2f}, {c5[1]:.2f}] | [{cf[0]:.2f}, {cf[1]:.2f}] | "
                f"{w5:.2f} | {wf:.2f} | {row['mean_width_change_pct']:.1f}% |\n")
        c5w = row['weight_ci_half']
        cfw = row['weight_ci_full']
        w5w = c5w[1] - c5w[0]
        wfw = cfw[1] - cfw[0]
        f.write(f"| {k} | Weight | [{c5w[0]:.3f}, {c5w[1]:.3f}] | [{cfw[0]:.3f}, {cfw[1]:.3f}] | "
                f"{w5w:.3f} | {wfw:.3f} | {row['weight_width_change_pct']:.1f}% |\n")
    f.write("\n")
    if convergence_table and all(r['mean_width_change_pct'] < 10 for r in convergence_table):
        f.write("All CI widths changed by <10%. Bootstrap has converged.\n\n")
    elif convergence_table:
        f.write("Some CI widths changed by >=10%. More resamples may be needed, "
                "but results are still informative.\n\n")

    f.write(f"## Bootstrap 95% CIs (K={K_boot}, B={n_valid})\n\n")
    f.write("| Component | Weight [95% CI] | Mean [95% CI] | Sigma [95% CI] |\n")
    f.write("|-----------|----------------|--------------|----------------|\n")
    for k in range(K_boot):
        cs = component_summary[k]
        f.write(f"| {k+1} | {cs['weight']:.3f} [{cs['weight_ci'][0]:.3f}, {cs['weight_ci'][1]:.3f}] | "
                f"{cs['mean']:.2f} [{cs['mean_ci'][0]:.2f}, {cs['mean_ci'][1]:.2f}] | "
                f"{cs['sigma']:.2f} [{cs['sigma_ci'][0]:.2f}, {cs['sigma_ci'][1]:.2f}] |\n")
    f.write(f"\nBootstrap completed in {total_time:.1f}s ({n_boot_actual} attempted, {n_valid} valid)\n\n")

    if component_summary_k3 is not None:
        f.write("## Bootstrap 95% CIs for K=3 (theory-predicted)\n\n")
        f.write("| Component | Weight [95% CI] | Mean [95% CI] | Sigma [95% CI] |\n")
        f.write("|-----------|----------------|--------------|----------------|\n")
        for k in range(3):
            cs = component_summary_k3[k]
            f.write(f"| {k+1} | {cs['weight']:.3f} [{cs['weight_ci'][0]:.3f}, {cs['weight_ci'][1]:.3f}] | "
                    f"{cs['mean']:.2f} [{cs['mean_ci'][0]:.2f}, {cs['mean_ci'][1]:.2f}] | "
                    f"{cs['sigma']:.2f} [{cs['sigma_ci'][0]:.2f}, {cs['sigma_ci'][1]:.2f}] |\n")
        f.write("\n")

    f.write("---\n*Generated by fix04_gmm_bootstrap_1000.py*\n")

print(f"\nResults saved to {out_path}", flush=True)
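# Example invocation (assumes the file is saved under the name referenced in
# the report footer, and that the input CSV exists with 'country', 'year' and
# 'liberty' columns):
#     python3 fix04_gmm_bootstrap_1000.py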