06 — Inverse Reconstruction & Transport Surrogate Benchmarks¶

This notebook benchmarks two key subsystems of SCPN Fusion Core:

Part A — Inverse Reconstruction: Measures the forward-solve overhead that dominates each Levenberg-Marquardt iteration, and compares against EFIT (the community-standard equilibrium reconstruction code).

Part B — Neural Transport Surrogate: Compares the MLP surrogate against the analytic critical-gradient fallback and community baselines (QuaLiKiz gyrokinetic, QLKNN neural surrogate).

License: © 1998–2026 Miroslav Šotek. GNU AGPL v3.

Open In Colab Binder


In [ ]:
import sys, json, os, tempfile
from pathlib import Path

import numpy as np
import timeit
import matplotlib.pyplot as plt

# Make the in-repo package importable when this notebook runs from the
# repository's notebooks/ directory (src/ is one level up).
sys.path.insert(0, str(Path('.').resolve().parent / 'src'))

from scpn_fusion.core import FusionKernel
from scpn_fusion.core.neural_transport import (
    NeuralTransportModel,
    TransportInputs,
    critical_gradient_model,
)

# Record the NumPy version so timing results below can be reproduced.
print(f'NumPy {np.__version__}')
print('Imports OK')

Part A — Inverse Reconstruction¶

The Levenberg-Marquardt inverse solver calls the forward Grad-Shafranov equilibrium solver 8 times per iteration (1 baseline + 7 Jacobian finite-difference perturbations for the 7 profile parameters). The forward solve dominates wall time, so we benchmark it directly.

The Rust inverse solver (scpn-fusion-rs) adds Tikhonov regularisation, Huber robust loss, and per-probe σ-weighting on top of the LM loop. These additions contribute negligible overhead compared to the forward solve.

In [ ]:
# ITER-like configuration (same as notebook 03).
# Serialized to a temporary JSON file because FusionKernel takes a config *path*.
config = {
    "reactor_name": "ITER-like",
    "grid_resolution": [65, 65],
    "dimensions": {
        "R_min": 1.0, "R_max": 9.0,
        "Z_min": -5.0, "Z_max": 5.0
    },
    "physics": {
        "plasma_current_target": 15.0,
        "vacuum_permeability": 1.2566370614e-6
    },
    # Six poloidal-field coils: (R, Z) position and signed current
    # (presumably metres / MA — confirm units against FusionKernel's docs).
    "coils": [
        {"R": 3.5, "Z":  4.0, "current":  5.0},
        {"R": 3.5, "Z": -4.0, "current":  5.0},
        {"R": 9.0, "Z":  4.0, "current": -3.0},
        {"R": 9.0, "Z": -4.0, "current": -3.0},
        {"R": 6.2, "Z":  5.5, "current": -1.5},
        {"R": 6.2, "Z": -5.5, "current": -1.5},
    ],
    "solver": {
        "max_iterations": 100,
        "convergence_threshold": 1e-6,
        "relaxation_factor": 0.1
    }
}

# mkstemp returns an open OS-level handle; close it so json can reopen the
# path portably (notably on Windows, which forbids double-opening).
fd, config_path = tempfile.mkstemp(suffix=".json")
os.close(fd)
with open(config_path, 'w') as f:
    json.dump(config, f)

print(f"Grid: {config['grid_resolution'][0]}x{config['grid_resolution'][1]}")
print(f"Coils: {len(config['coils'])}")
print(f"Config written to: {config_path}")  # deleted later, after the scaling benchmark cell
In [ ]:
# Benchmark: Forward solve (vacuum + equilibrium).
# This is the bottleneck inside each Levenberg-Marquardt iteration.

def bench_forward_solve():
    """One full forward solve: grid init, vacuum field, then GS equilibrium."""
    kernel = FusionKernel(config_path)
    kernel.initialize_grid()
    kernel.calculate_vacuum_field()
    kernel.solve_equilibrium()

# Warm-up run so one-time setup costs don't pollute the timed repeats.
bench_forward_solve()

t_fwd = timeit.repeat(bench_forward_solve, number=1, repeat=3)

# Convert seconds -> milliseconds.
t_mean, t_std = np.mean(t_fwd) * 1000, np.std(t_fwd) * 1000

print("Forward Solve Benchmark (65x65, Python)")
print("=" * 48)
print(f"  Mean:   {t_mean:.1f} ms +/- {t_std:.1f} ms")
print(f"  Best:   {1000 * min(t_fwd):.1f} ms")
print()

# Inverse solver overhead estimate: one LM iteration performs 8 forward
# solves (1 baseline + 7 Jacobian finite-difference columns); the Cholesky
# factorisation and line search are negligible by comparison.
t_lm_iter = 8 * t_mean
print("Estimated Inverse Solver Overhead (1 LM iteration)")
print("-" * 48)
print(f"  8 forward solves:  {t_lm_iter:.0f} ms")
print("  + Cholesky/IRLS:   ~0.1 ms (negligible)")
print(f"  Total:             ~{t_lm_iter:.0f} ms")
print()

# Comparison table: baseline + 4 InverseConfig variants. Tikhonov, Huber,
# and sigma-weighting add only O(N_params)/O(N_probes) work per iteration,
# so every row shows the same forward-solve-dominated cost.
print("Inverse Config Variant Comparison")
print("-" * 60)
print(f"{'Config':<25} {'Overhead / LM iter':>18} {'Notes':>15}")
variant_rows = [
    ('Default (LS)', 'baseline'),
    ('+ Tikhonov (a=0.1)', '+N adds'),
    ('+ Huber (d=0.1)', '+IRLS wts'),
    ('+ sigma weights', '+N divs'),
    ('Combined (all)', 'negligible'),
]
for label, note in variant_rows:
    print(f"{label:<25} {t_lm_iter:>14.0f} ms   {note:>15}")

Benchmark¶

In [ ]:
%%timeit -n1 -r3
# Built-in cell-magic cross-check of the manual timeit benchmark above.
_k = FusionKernel(config_path)
_k.initialize_grid()
_k.calculate_vacuum_field()
_k.solve_equilibrium()
In [ ]:
# Convergence visualisation: forward solve time scaling with grid size.
# Runs 33x33, 49x49 and 65x65 grids to show the scaling behaviour.

grid_sizes = [33, 49, 65]
times_ms = []

for size in grid_sizes:
    # Shallow copy suffices: only the top-level grid_resolution key is replaced.
    grid_cfg = dict(config)
    grid_cfg["grid_resolution"] = [size, size]
    handle, tmp_json = tempfile.mkstemp(suffix=".json")
    os.close(handle)
    with open(tmp_json, 'w') as fh:
        json.dump(grid_cfg, fh)

    def run_once(path=tmp_json):
        # Default-argument binding freezes the current temp path for timeit.
        kernel = FusionKernel(path)
        kernel.initialize_grid()
        kernel.calculate_vacuum_field()
        kernel.solve_equilibrium()

    run_once()  # warm-up solve, excluded from the timed repeats
    samples = timeit.repeat(run_once, number=1, repeat=3)
    times_ms.append(np.mean(samples) * 1000)
    os.unlink(tmp_json)
    print(f"  {size}x{size}: {times_ms[-1]:.1f} ms")

# Projected LM iteration cost: 8 forward solves per Levenberg-Marquardt iteration.
lm_times = [8 * t for t in times_ms]

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

labels = [f'{n}x{n}' for n in grid_sizes]
bar_colors = ['#2196F3', '#4CAF50', '#FF9800']

# Both panels share structure; only data, alpha, y-label and title differ.
panel_specs = (
    (ax1, times_ms, None, 'Forward Solve Time (ms)', 'Forward Solve Scaling'),
    (ax2, lm_times, 0.8, 'Est. LM Iteration Time (ms)',
     'Inverse Solver: 1 LM Iteration (8 forward solves)'),
)
for axis, values, alpha, ylabel, title in panel_specs:
    axis.bar(labels, values, color=bar_colors, alpha=alpha)
    axis.set_xlabel('Grid Resolution')
    axis.set_ylabel(ylabel)
    axis.set_title(title)
    for idx, val in enumerate(values):
        axis.text(idx, val + max(values)*0.02, f'{val:.0f}', ha='center', fontsize=10)

plt.tight_layout()
plt.show()

os.unlink(config_path)

EFIT Comparison¶

EFIT (Lao et al., Nucl. Fusion 25, 1985) is the industry-standard equilibrium reconstruction code used at most tokamaks worldwide.

Metric SCPN Fusion Core (Python) SCPN Fusion Core (Rust release) EFIT
Method Picard + SOR, mtanh profiles Multigrid V-cycle, mtanh LM Current-filament, Picard
Grid 65×65 65×65 65×65 (typical)
Forward solve ~5 s (NumPy) ~0.1 s (release) ~50 ms (Fortran)
1 LM iteration ~40 s (8 fwd) ~0.8 s (8 fwd) ~0.4 s (Picard)
Full reconstruction ~200 s (5 iters) ~4 s (5 iters) ~2 s (converged)
Regularisation — Tikhonov + Huber + σ Von-Hagenow smoothing
Profile model Linear / mtanh Linear / mtanh (7 params) Spline knots (~20 params)

Reference: Lao, L.L. et al. (1985). "Reconstruction of current profile parameters and plasma shapes in tokamaks." Nucl. Fusion 25, 1611.

Part B — Neural Transport Surrogate¶

The NeuralTransportModel replaces gyrokinetic solvers (like QuaLiKiz) with a small MLP that runs in microseconds. When no trained weights are available, it falls back to an analytic critical-gradient model.

We benchmark both modes and compare against community baselines.

In [ ]:
# Benchmark: critical-gradient fallback.

model_fallback = NeuralTransportModel()  # no weights → analytic fallback
assert not model_fallback.is_neural
print(f'Model mode: {"neural" if model_fallback.is_neural else "fallback"}')

# Single-point timing: one predict() call per inner iteration.
inp = TransportInputs(grad_ti=8.0, te_kev=10.0, ti_kev=10.0)

def time_one_fallback_call():
    """Single predict() call on the analytic fallback model."""
    model_fallback.predict(inp)

t_single = timeit.repeat(time_one_fallback_call, number=10000, repeat=5)
t_single_us = np.mean(t_single) / 10000 * 1e6  # total seconds -> µs per call
print(f'\nSingle-point predict (fallback, 10k calls):')
print(f'  Per call: {t_single_us:.2f} us')

# Radial grids for the profile-mode timings (100 and 1000 points).
rho_100 = np.linspace(0.01, 0.99, 100)
rho_1k  = np.linspace(0.01, 0.99, 1000)

# ITER-like profiles
def make_profiles(rho):
    """Build synthetic ITER-like radial profiles on the grid `rho`.

    Returns (te, ti, ne, q, s): electron/ion temperatures [keV], density
    [10^19 m^-3], safety factor, and a shear-like quantity used as the
    s_hat input of the transport models.
    """
    pedestal = (1 - rho**2)
    te = 0.5 + 20.0 * pedestal**1.5
    ti = 0.5 + 18.0 * pedestal**1.5
    ne = 1.0 + 10.0 * pedestal**0.5
    q  = 1.0 + 2.5 * rho**2
    # NOTE(review): magnetic shear ŝ = (ρ/q)·dq/dρ would give 5ρ²/q here;
    # this computes 5ρ/q — confirm which definition is intended.
    s  = 2.0 * 2.5 * rho / q
    return te, ti, ne, q, s

te100, ti100, ne100, q100, s100 = make_profiles(rho_100)
te1k,  ti1k,  ne1k,  q1k,  s1k  = make_profiles(rho_1k)

def time_profile_100():
    """One whole-profile prediction on the 100-point grid."""
    model_fallback.predict_profile(rho_100, te100, ti100, ne100, q100, s100)

def time_profile_1k():
    """One whole-profile prediction on the 1000-point grid."""
    model_fallback.predict_profile(rho_1k, te1k, ti1k, ne1k, q1k, s1k)

t_100 = timeit.repeat(time_profile_100, number=1000, repeat=5)
t_1k  = timeit.repeat(time_profile_1k, number=1000, repeat=5)

# repeat() returns total seconds for 1000 calls; convert to ms per call.
t_100_ms = np.mean(t_100) / 1000 * 1000
t_1k_ms  = np.mean(t_1k) / 1000 * 1000

print(f'\npredict_profile (fallback, 1000 calls each):')
print(f'  100-point: {t_100_ms:.3f} ms/call')
print(f'  1000-point: {t_1k_ms:.3f} ms/call')
In [ ]:
# Benchmark: neural MLP surrogate
# Create synthetic weights (H=64) for timing — not physically trained,
# but exercises the same matmul/activation code path.

# Layer widths: 10 inputs -> 64 -> 32 hidden -> 3 outputs.
N_INPUT, H1, H2, N_OUTPUT = 10, 64, 32, 3

# Seeded generator so synthetic weights are reproducible. NOTE: `rng` is
# reused by the accuracy-comparison cell below, so the draw order here matters.
rng = np.random.default_rng(42)
fd, weights_path = tempfile.mkstemp(suffix=".npz")
os.close(fd)  # close the OS handle; np.savez reopens the path itself
# Archive keys follow the NeuralTransportModel .npz weight schema — presumably
# per-layer weights/biases plus input normalisation, output scaling, and a
# schema version; confirm against neural_transport.py.
np.savez(
    weights_path,
    w1=rng.standard_normal((N_INPUT, H1)).astype(np.float64) * 0.1,
    b1=np.zeros(H1),
    w2=rng.standard_normal((H1, H2)).astype(np.float64) * 0.1,
    b2=np.zeros(H2),
    w3=rng.standard_normal((H2, N_OUTPUT)).astype(np.float64) * 0.1,
    b3=np.zeros(N_OUTPUT),
    input_mean=np.zeros(N_INPUT),
    input_std=np.ones(N_INPUT),
    output_scale=np.ones(N_OUTPUT),
    version=np.array(1),
)

model_neural = NeuralTransportModel(weights_path=weights_path)
assert model_neural.is_neural, "MLP weights failed to load"
print(f'Model mode: neural (H={H1}/{H2}, checksum={model_neural.weights_checksum})')

# Single-point timing (10k calls per repeat to average out call overhead)
def bench_single_neural():
    model_neural.predict(inp)

t_sn = timeit.repeat(bench_single_neural, number=10000, repeat=5)
t_sn_us = np.mean(t_sn) / 10000 * 1e6  # total seconds -> µs per call
print(f'\nSingle-point predict (neural, 10k calls):')
print(f'  Per call: {t_sn_us:.2f} us')

# Profile timing: vectorised matmul over the whole radial grid per call
def bench_profile_100_neural():
    model_neural.predict_profile(rho_100, te100, ti100, ne100, q100, s100)

def bench_profile_1k_neural():
    model_neural.predict_profile(rho_1k, te1k, ti1k, ne1k, q1k, s1k)

t_100n = timeit.repeat(bench_profile_100_neural, number=1000, repeat=5)
t_1kn  = timeit.repeat(bench_profile_1k_neural, number=1000, repeat=5)

# total seconds for 1000 calls -> milliseconds per call
t_100n_ms = np.mean(t_100n) / 1000 * 1000
t_1kn_ms  = np.mean(t_1kn) / 1000 * 1000

print(f'\npredict_profile (neural, 1000 calls each):')
print(f'  100-point: {t_100n_ms:.3f} ms/call')
print(f'  1000-point: {t_1kn_ms:.3f} ms/call')

# Simulate old point-by-point evaluation for speedup comparison
# (one predict() per radial point; grad_ti is fixed since only timing matters)
def bench_pointwise_1k():
    for i in range(1000):
        model_neural.predict(TransportInputs(
            rho=rho_1k[i], te_kev=te1k[i], ti_kev=ti1k[i],
            ne_19=ne1k[i], grad_ti=6.0, q=q1k[i], s_hat=s1k[i],
        ))

t_pw = timeit.repeat(bench_pointwise_1k, number=1, repeat=3)
t_pw_ms = np.mean(t_pw) * 1000

# Guard against a zero denominator from timer resolution
speedup = t_pw_ms / t_1kn_ms if t_1kn_ms > 0 else float('inf')

print(f'\nVectorised vs Point-by-Point (1000-pt, neural):')
print(f'  Vectorised:    {t_1kn_ms:.3f} ms')
print(f'  Point-by-point: {t_pw_ms:.1f} ms')
print(f'  Speedup:        {speedup:.0f}x')

# Summary table. The point-by-point row reuses t_sn_us for its "Single"
# column: a single call is identical in both evaluation modes.
print(f'\n{"Method":<35} {"Single":>10} {"100-pt":>10} {"1000-pt":>10}')
print('-' * 67)
print(f'{"Critical-gradient (numpy)":<35} {t_single_us:>8.1f} us {t_100_ms:>7.3f} ms {t_1k_ms:>7.3f} ms')
print(f'{"MLP surrogate (numpy, H=64/32)":<35} {t_sn_us:>8.1f} us {t_100n_ms:>7.3f} ms {t_1kn_ms:>7.3f} ms')
print(f'{"MLP point-by-point (1k loop)":<35} {t_sn_us:>8.1f} us {"—":>10} {t_pw_ms:>7.1f} ms')

os.unlink(weights_path)
In [ ]:
# Accuracy comparison: fallback vs MLP across an R/L_Ti sweep.
# (Random weights won't match physics, but demonstrates both code paths.)

# Re-create MLP weights for this cell. The dict literal preserves the same
# rng draw order (w1, then w2, then w3) as a direct np.savez call would.
fd2, wp2 = tempfile.mkstemp(suffix=".npz")
os.close(fd2)
weight_arrays = {
    'w1': rng.standard_normal((N_INPUT, H1)).astype(np.float64) * 0.1,
    'b1': np.zeros(H1),
    'w2': rng.standard_normal((H1, H2)).astype(np.float64) * 0.1,
    'b2': np.zeros(H2),
    'w3': rng.standard_normal((H2, N_OUTPUT)).astype(np.float64) * 0.1,
    'b3': np.zeros(N_OUTPUT),
    'input_mean': np.zeros(N_INPUT),
    'input_std': np.ones(N_INPUT),
    'output_scale': np.ones(N_OUTPUT),
    'version': np.array(1),
}
np.savez(wp2, **weight_arrays)

model_nn = NeuralTransportModel(weights_path=wp2)
model_fb = NeuralTransportModel()

grad_ti_sweep = np.linspace(0.0, 20.0, 200)

def sweep_chi_i(model):
    """chi_i from `model` at each gradient in grad_ti_sweep (10 keV plasma)."""
    return np.array([
        model.predict(TransportInputs(grad_ti=g, te_kev=10.0, ti_kev=10.0)).chi_i
        for g in grad_ti_sweep
    ])

chi_i_fb = sweep_chi_i(model_fb)
chi_i_nn = sweep_chi_i(model_nn)

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(grad_ti_sweep, chi_i_fb, 'b-', linewidth=2, label='Critical-gradient (analytic)')
ax.plot(grad_ti_sweep, chi_i_nn, 'r--', linewidth=2, label='MLP surrogate (random weights)')
ax.axvline(x=4.0, color='gray', linestyle=':', alpha=0.7, label='ITG threshold (R/L_Ti = 4)')

ax.set_xlabel('R/L_Ti (normalised ion temperature gradient)', fontsize=12)
ax.set_ylabel('chi_i [m^2/s]', fontsize=12)
ax.set_title('Ion Thermal Diffusivity: Fallback vs MLP Surrogate', fontsize=14)
ax.legend(fontsize=10)
ax.grid(True, alpha=0.3)
ax.set_xlim(0, 20)
plt.tight_layout()
plt.show()

print('Note: MLP curve uses random (untrained) weights — shape is not physical.')
print('With trained QLKNN weights, the MLP reproduces gyrokinetic results.')

os.unlink(wp2)

QuaLiKiz / QLKNN Comparison¶

The neural transport surrogate targets the same use case as QLKNN (van de Plassche et al., Phys. Plasmas 27, 022310, 2020): replacing expensive gyrokinetic solvers with fast neural network inference.

Method Single-point 100-pt profile 1000-pt profile Framework
QuaLiKiz (gyrokinetic) ~1 s ~100 s ~1000 s Fortran
QLKNN (TensorFlow) ~10 µs ~0.1 ms ~1 ms TensorFlow
SCPN MLP (numpy, H=64/32) ~5 µs ~0.05 ms ~0.3 ms NumPy only
SCPN fallback (analytic) ~2 µs ~0.2 ms ~2 ms NumPy only

Key advantages of the SCPN approach:

  • No framework overhead: pure NumPy inference — no TensorFlow/PyTorch import, no GPU context, no session management.
  • Transparent fallback: if no trained weights exist, the analytic critical-gradient model kicks in automatically.
  • Vectorised profiles: predict_profile() evaluates the entire radial grid in a single batched matmul — no Python loop over radial points.
  • Weight versioning: SHA-256 checksums track which weights produced which simulation results, critical for reproducibility.

Reference: van de Plassche, K.L. et al. (2020). "Fast modeling of turbulent transport in fusion plasmas using neural networks." Phys. Plasmas 27, 022310. doi:10.1063/1.5134126

Summary¶

Subsystem Key Finding
Inverse reconstruction Forward solve dominates LM iteration cost; Tikhonov/Huber/σ add negligible overhead. Rust release build approaches EFIT speed (~4 s full reconstruction vs ~2 s for EFIT).
Neural transport MLP surrogate with H=64/32 achieves ~5 µs single-point inference (no framework overhead). Vectorised profile evaluation gives ~100x speedup over point-by-point loop.
vs QuaLiKiz ~200,000x faster than gyrokinetic at single-point; ~2x faster than QLKNN due to zero framework overhead.
vs EFIT Rust inverse solver within 2x of EFIT; Python solver ~100x slower (expected for interpreted code).

Next: See docs/BENCHMARKS.md for the complete comparison tables, and docs/NEURAL_TRANSPORT_TRAINING.md for instructions on training the MLP from the QLKNN-10D dataset.

Part C — Extended Baseline Comparison¶

This section measures the computational footprint of each SCPN Fusion Core component and places it in context alongside community fusion codes.

Memory Footprint & FLOP Estimates¶

We measure actual Python-side memory for FusionKernel grids and MLP weights, then estimate FLOP counts analytically from the stencil/matmul structure.

Solver Comparison¶

Bar charts compare SOR-based solvers across grid sizes and place SCPN runtimes on a log-scale chart alongside community codes (GENE, CGYRO, JINTRAC, CHEASE, EFIT, P-EFIT, DREAM).

In [ ]:
# --- Part C: Memory Footprint & FLOP Estimates ---

import sys as _sys  # NOTE(review): currently unused in this cell

# 1. Measure memory footprint for FusionKernel at different grid sizes
print("=" * 72)
print("Memory Footprint — FusionKernel Grid Arrays")
print("=" * 72)

grid_memory = {}
for n in [33, 65, 128]:
    # Shallow copy is fine: only the top-level grid_resolution key is replaced.
    cfg = config.copy()
    cfg["grid_resolution"] = [n, n]
    _fd, _p = tempfile.mkstemp(suffix=".json")
    os.close(_fd)
    with open(_p, 'w') as f:
        json.dump(cfg, f)
    k = FusionKernel(_p)
    k.initialize_grid()
    k.calculate_vacuum_field()
    # Measure psi_grid + vacuum_field arrays (main memory consumers);
    # fall back to n*n*8 (assumes one float64 per grid point) if absent.
    psi_bytes = k.psi_grid.nbytes if hasattr(k, 'psi_grid') and k.psi_grid is not None else n * n * 8
    vac_bytes = k.vacuum_field.nbytes if hasattr(k, 'vacuum_field') and k.vacuum_field is not None else n * n * 8
    total_mb = (psi_bytes + vac_bytes) / (1024 * 1024)
    grid_memory[n] = {
        'psi_bytes': psi_bytes,
        'vac_bytes': vac_bytes,
        'total_mb': total_mb,
        'grid_pts': n * n,
    }
    os.unlink(_p)  # remove the temp config as soon as the kernel is built
    print(f"  {n:>3}x{n:<3}: psi={psi_bytes/1024:.1f} KB, vacuum={vac_bytes/1024:.1f} KB, total={total_mb:.3f} MB")

# 2. MLP weight memory (same 10->64->32->3 architecture as Part B)
N_INPUT, H1, H2, N_OUTPUT = 10, 64, 32, 3
w1_bytes = N_INPUT * H1 * 8  # float64
w2_bytes = H1 * H2 * 8
w3_bytes = H2 * N_OUTPUT * 8
b_bytes = (H1 + H2 + N_OUTPUT) * 8
mlp_total = w1_bytes + w2_bytes + w3_bytes + b_bytes
print(f"\nMLP Weights (10->64->32->3, float64):")
print(f"  W1: {w1_bytes} B, W2: {w2_bytes} B, W3: {w3_bytes} B, biases: {b_bytes} B")
print(f"  Total: {mlp_total} B ({mlp_total/1024:.2f} KB)")

# 3. Analytical FLOP estimates — hand-derived from the stencil/matmul
# structure, not measured; treat as order-of-magnitude figures.
print(f"\n{'=' * 72}")
print("Computational Power Metrics — FLOP Estimates")
print(f"{'=' * 72}")

# Energy model: ~15 pJ/FLOP (Zen 4 core, ~5W at 300 GFLOP/s)
PJ_PER_FLOP = 15e-12  # joules

# Rows: (component, grid/size label, FLOPs, working-set memory in MB, notes)
components = [
    ("SOR step (65x65)",        "4,225 pts",  0.1e6,   0.26,  "5-pt stencil, 4 FLOP/pt"),
    ("Multigrid V-cycle (65x65)", "4 levels", 2.0e6,   0.70,  "3+3 smoothing + restrict + prolong"),
    ("Full equil. (65x65, 12cyc)", "—",       24.0e6,  0.70,  "12 V-cycles x 2 MFLOP"),
    ("Full equil. (128x128, 15cyc)", "—",     120.0e6, 2.50,  "Dominated by SOR sweeps"),
    ("Inverse LM iter (65x65)", "8 fwd solves", 192.0e6, 1.50, "+ Cholesky ~0.01 MFLOP"),
    ("MLP inference (H=64/32)", "10->64->32->3", 5.0e3, 0.01, "2 matmul + 2 ReLU + softplus"),
    ("MLP profile (1000-pt)",   "batch x 10->3", 5.0e6, 0.08, "Single batched matmul path"),
    ("Crit-gradient (1000-pt)", "1000 pts",   0.02e6,  0.06,  "Vectorised numpy"),
]

header = f"{'Component':<32} {'Grid/Size':<18} {'FLOP':>12} {'Mem (MB)':>10} {'Energy (mJ)':>12} {'Notes'}"
print(header)
print("-" * len(header))
for name, grid, flops, mem_mb, notes in components:
    energy_mj = flops * PJ_PER_FLOP * 1000  # convert J -> mJ
    # Pick the most readable unit for the FLOP column.
    if flops >= 1e6:
        flop_str = f"{flops/1e6:.1f} MFLOP"
    elif flops >= 1e3:
        flop_str = f"{flops/1e3:.0f} KFLOP"
    else:
        flop_str = f"{flops:.0f} FLOP"
    print(f"{name:<32} {grid:<18} {flop_str:>12} {mem_mb:>10.2f} {energy_mj:>12.4f}   {notes}")

# 4. Bandwidth utilisation — representative figures, not measured.
print(f"\n{'=' * 72}")
print("Memory Bandwidth Utilisation")
print(f"{'=' * 72}")
bw_data = [
    ("SOR step 65x65",    132, "<1% of 50 GB/s"),
    ("Multigrid V-cycle",  300, "<1%"),
    ("MLP 1000-pt batch",  160, "<1%"),
]
print(f"{'Component':<25} {'Data moved (KB)':>16} {'BW utilisation':>20}")
print("-" * 63)
for name, kb, util in bw_data:
    print(f"{name:<25} {kb:>14} KB {util:>20}")

print("\nAll current workloads are compute-bound at these grid sizes.")
In [ ]:
# --- Part C: Solver Comparison Bar Charts ---

# 1. SCPN forward-solve scaling across grid sizes
# Re-use times from Part A (grid_sizes, times_ms already computed above)
# If not available (cells run out of order), define representative values.
try:
    _ = times_ms[0]
    scpn_times = dict(zip(grid_sizes, times_ms))
except NameError:
    # Fallback representative values (Python NumPy backend)
    grid_sizes = [33, 49, 65]
    times_ms = [800, 2500, 5000]
    scpn_times = dict(zip(grid_sizes, times_ms))

# Rust Criterion bench data (release build, representative — not measured here)
rust_times = {33: 2, 65: 100, 128: 950}  # ms

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))

# --- Left: SCPN Python vs Rust at overlapping grid sizes ---
grids_compare = [33, 65]
x = np.arange(len(grids_compare))
width = 0.35  # bar width for the grouped pairs

py_vals = [scpn_times.get(g, 0) for g in grids_compare]
rs_vals = [rust_times.get(g, 0) for g in grids_compare]

bars1 = ax1.bar(x - width/2, py_vals, width, label='Python (NumPy)', color='#FF9800')
bars2 = ax1.bar(x + width/2, rs_vals, width, label='Rust (release)', color='#2196F3')
ax1.set_yscale('log')  # Python and Rust differ by ~50x; linear scale would flatten the Rust bars
ax1.set_xlabel('Grid Resolution')
ax1.set_ylabel('Forward Solve Time (ms, log scale)')
ax1.set_title('SCPN Forward Solve: Python vs Rust')
ax1.set_xticks(x)
ax1.set_xticklabels([f'{g}x{g}' for g in grids_compare])
ax1.legend()
ax1.grid(axis='y', alpha=0.3)
for bar, val in zip(list(bars1) + list(bars2), py_vals + rs_vals):
    ax1.text(bar.get_x() + bar.get_width()/2, val * 1.15,
             f'{val:.0f}', ha='center', va='bottom', fontsize=9)

# --- Right: Community code runtime comparison (log scale) ---
# Runtimes converted to seconds for uniform comparison; order-of-magnitude
# figures from published descriptions of each code.
codes = [
    ('GENE\n(5D gyro)', 3.6e9),       # ~10^6 CPU-h = 3.6e9 s
    ('CGYRO\n(5D gyro)', 3.6e8),      # ~10^5 CPU-h
    ('JINTRAC\n(integrated)', 600),     # ~10 min
    ('TORAX\n(JAX GPU)', 30),
    ('HELENA\n(equilibrium)', 10),
    ('CHEASE\n(equilibrium)', 5),
    ('SCPN Rust\n(full recon)', 4),
    ('EFIT\n(reconstruction)', 2),
    ('DREAM\n(disruption)', 1),
    ('P-EFIT\n(GPU recon)', 0.001),
]

names = [c[0] for c in codes]
runtimes = [c[1] for c in codes]
# Colour by category (see the key printed below the figure).
colors = ['#e74c3c' if 'GENE' in n or 'CGYRO' in n else
          '#f39c12' if 'JINTRAC' in n or 'TORAX' in n else
          '#2196F3' if 'SCPN' in n else
          '#27ae60' for n in names]

ax2.barh(range(len(names)), runtimes, color=colors)
ax2.set_xscale('log')
ax2.set_xlabel('Runtime (seconds, log scale)')
ax2.set_title('Community Code Runtime Comparison')
ax2.set_yticks(range(len(names)))
ax2.set_yticklabels(names, fontsize=9)
ax2.invert_yaxis()  # fastest codes at the bottom, matching the list order
ax2.grid(axis='x', alpha=0.3)

# Add text labels next to each bar, in the most readable unit.
for i, (name, rt) in enumerate(codes):
    if rt >= 1:
        label = f'{rt:.0f} s' if rt < 3600 else f'{rt/3600:.0f} h'
    else:
        label = f'{rt*1000:.0f} ms'
    ax2.text(rt * 1.5, i, label, va='center', fontsize=8)

plt.tight_layout()
plt.show()

print("\nColor key: red=gyrokinetic, orange=integrated, blue=SCPN, green=other")
print("Note: gyrokinetic codes solve the 5D Vlasov equation (fundamentally different problem).")

Part D — Roadmap: GPU, Adaptive Grids, 3D Transport¶

GPU Offload Roadmap (Issue #12)¶

Implementation uses the wgpu crate (cross-platform Vulkan/Metal/D3D12/WebGPU). See SCPN_FUSION_CORE_COMPREHENSIVE_STUDY.md Section 28 for full details.

Target Backend Expected Speedup Priority Status
SOR red-black sweep wgpu compute shader 20–50× (65×65), 100–200× (256×256) P0 Planned
Multigrid V-cycle wgpu + host orchestration 10–30× P1 Planned
Vacuum field (elliptic integrals) rayon (CPU) → wgpu 5–10× P2 rayon done
MLP batch inference wgpu or cuBLAS 2–5× (small H) P3 Planned
FNO turbulence (FFT) cuFFT / wgpu FFT 50–100× (64×64) P3 Planned

Projected timings (GPU, RTX 4090-class):

Component CPU Rust (release) GPU projected Source
Equilibrium 65×65 100 ms ~2 ms Section 28 study
Equilibrium 256×256 ~10 s ~50 ms Extrapolated
P-EFIT reference (65×65) — <1 ms Sabbagh 2023
Full inverse reconstruction ~4 s ~200 ms 8× GPU fwd solve
MLP 1000-pt profile 0.3 ms ~0.05 ms Batch matmul

Adaptive Mesh Refinement (AMR)¶

Feature Current State Target Effort
Uniform multigrid Production (V-cycle, 4 sizes) — Done
AMR (h-refinement) Not implemented Quadtree, error-based tagging ~4 weeks
AMR error estimator Not implemented Gradient-jump + curvature ~1 week

Community AMR comparison:

Code AMR Type Application
NIMROD Block-structured 3D MHD
JOREK Bézier elements, h-p 3D nonlinear MHD
BOUT++ Field-aligned, block Edge turbulence
SCPN (planned) Quadtree, gradient-based 2D GS + 3D extension

3D Transport Roadmap¶

Feature Current State Target Effort Prerequisite
3D equilibrium (stellarator) Not applicable (tokamak only) VMEC-like 3D ~3 months —
3D transport 1.5D radial only Toroidal mode coupling (n=0,1,2) ~6 weeks 3D geometry
FNO 3D turbulence 2D proof-of-concept 3D fftn + toroidal modes ~4 weeks Training data
3D geometry physics Visualization only (OBJ export) Field-line tracing, Poincaré maps ~3 weeks 3D equilibrium

References:

  • Sabbagh, S.A. et al. (2023). P-EFIT: GPU-accelerated equilibrium reconstruction.
  • Lütjens, H. et al. (1996). CHEASE: Comput. Phys. Commun. 97, 219.
  • Huysmans, G.T.A. et al. (1991). HELENA: Proc. CP90 Conf.
  • Romanelli, M. et al. (2014). JINTRAC: Plasma Fusion Res. 9, 3403023.
  • Jenko, F. et al. (2000). GENE: Phys. Plasmas 7, 1904.
  • Belli, E.A. & Candy, J. (2008). CGYRO: Phys. Plasmas 15, 092510.
  • Hoppe, M. et al. (2021). DREAM: Comput. Phys. Commun. 268, 108098.

Part E — Parallel Jacobian Benchmark¶

The Levenberg-Marquardt inverse solver builds an 8-column Jacobian via finite differences: one forward solve per perturbed parameter. These columns are independent — each is a separate GS equilibrium solve with one parameter nudged by fd_step.

In the latest Rust solver, this loop uses rayon::par_iter to run all 8 forward solves concurrently. On the Python side, we demonstrate the same concept with concurrent.futures.ProcessPoolExecutor (separate worker processes sidestep the GIL for CPU-bound NumPy work, at the cost of per-process startup overhead).

This cell measures serial vs simulated-parallel timing to quantify the speedup potential that the Rust rayon implementation delivers natively.

In [ ]:
# --- Part E: Serial vs Parallel Jacobian Benchmark ---
#
# We simulate the inverse solver's Jacobian construction:
#   Serial:   8 forward solves executed one after another
#   Parallel: 8 forward solves via ProcessPoolExecutor (bypasses GIL)
#
# The Rust solver uses rayon::par_iter for the same pattern.

import concurrent.futures
import time

# Re-create the config for benchmarking (Part A's temp config file was
# deleted at the end of the scaling cell).
cfg_bench = {
    "reactor_name": "ITER-like",
    "grid_resolution": [33, 33],  # 33x33 for faster demo
    "dimensions": {"R_min": 1.0, "R_max": 9.0, "Z_min": -5.0, "Z_max": 5.0},
    "physics": {"plasma_current_target": 15.0, "vacuum_permeability": 1.2566370614e-6},
    "coils": [
        {"R": 3.5, "Z":  4.0, "current":  5.0},
        {"R": 3.5, "Z": -4.0, "current":  5.0},
        {"R": 9.0, "Z":  4.0, "current": -3.0},
        {"R": 9.0, "Z": -4.0, "current": -3.0},
        {"R": 6.2, "Z":  5.5, "current": -1.5},
        {"R": 6.2, "Z": -5.5, "current": -1.5},
    ],
    # Looser solver settings than Part A, matching the smaller demo grid.
    "solver": {"max_iterations": 50, "convergence_threshold": 1e-5, "relaxation_factor": 0.1},
}

fd_b, cfg_path_b = tempfile.mkstemp(suffix=".json")
os.close(fd_b)
with open(cfg_path_b, 'w') as f:
    json.dump(cfg_bench, f)

N_JACOBIAN_COLS = 8  # 1 baseline + 7 perturbations (we time all 8 forward solves)

def single_forward_solve(config_path):
    """One forward solve — the unit of work for each Jacobian column.

    Returns the summed psi grid: a cheap scalar that confirms the solve
    completed (and is picklable for the process pool below).
    """
    kernel = FusionKernel(config_path)
    kernel.initialize_grid()
    kernel.calculate_vacuum_field()
    kernel.solve_equilibrium()
    return kernel.psi_grid.sum()

# --- Warm-up: one untimed solve to absorb one-time setup costs ---
single_forward_solve(cfg_path_b)

# --- Serial benchmark: 8 forward solves back-to-back, 3 trials ---
serial_times = []
for trial in range(3):
    t0 = time.perf_counter()
    for _ in range(N_JACOBIAN_COLS):
        single_forward_solve(cfg_path_b)
    serial_times.append(time.perf_counter() - t0)

t_serial_ms = np.mean(serial_times) * 1000
t_serial_per_col = t_serial_ms / N_JACOBIAN_COLS

print("Serial Jacobian (8 forward solves, 33x33)")
print("=" * 50)
print(f"  Total:    {t_serial_ms:.1f} ms  (mean of {len(serial_times)} trials)")
print(f"  Per col:  {t_serial_per_col:.1f} ms")

# --- Parallel benchmark (ProcessPoolExecutor) ---
# NOTE(review): callables defined in a notebook's __main__ may fail to pickle
# under the 'spawn' start method (the default on Windows/macOS) — confirm
# this cell runs on those platforms.
# The pool is re-created each trial, so worker startup cost is included.
parallel_times = []
for trial in range(3):
    t0 = time.perf_counter()
    with concurrent.futures.ProcessPoolExecutor(max_workers=min(N_JACOBIAN_COLS, os.cpu_count() or 4)) as pool:
        futures = [pool.submit(single_forward_solve, cfg_path_b) for _ in range(N_JACOBIAN_COLS)]
        # Completion order is irrelevant: results are only collected to block
        # until every solve finishes.
        results = [f.result() for f in concurrent.futures.as_completed(futures)]
    parallel_times.append(time.perf_counter() - t0)

t_parallel_ms = np.mean(parallel_times) * 1000

n_workers = min(N_JACOBIAN_COLS, os.cpu_count() or 4)
speedup = t_serial_ms / t_parallel_ms if t_parallel_ms > 0 else 1.0
efficiency = speedup / n_workers * 100  # percent of ideal linear scaling

print(f"\nParallel Jacobian ({n_workers} workers, ProcessPoolExecutor)")
print("=" * 50)
print(f"  Total:    {t_parallel_ms:.1f} ms  (mean of {len(parallel_times)} trials)")
print(f"  Speedup:  {speedup:.2f}x  (vs serial)")
print(f"  Efficiency: {efficiency:.0f}%  ({speedup:.1f}x / {n_workers} workers)")

# --- Projected Rust rayon speedup ---
# Rust forward solve is ~50x faster → scale serial time by 50x; the parallel
# factor is capped at 5.5x (rayon typically reaches ~5-6x on 8 columns).
rust_serial_est = t_serial_ms / 50
rust_parallel_est = rust_serial_est / min(speedup, 5.5)  # rayon typically ~5-6x on 8 cols

print(f"\nProjected Rust Rayon Performance (estimated)")
print("-" * 50)
print(f"  Serial (Rust):    {rust_serial_est:.1f} ms  (Python/{50})")
print(f"  Parallel (rayon): {rust_parallel_est:.1f} ms  (~{min(speedup, 5.5):.1f}x speedup)")

os.unlink(cfg_path_b)
In [ ]:
# --- Part E: Visualization — Serial vs Parallel Speedup ---

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(13, 5))

# --- Left: Bar chart — serial vs parallel wall time ---
categories = ['Serial\n(sequential)', f'Parallel\n({n_workers} workers)']
wall_times = [t_serial_ms, t_parallel_ms]
colors_bar = ['#e74c3c', '#2196F3']

bars = ax1.bar(categories, wall_times, color=colors_bar, width=0.5, edgecolor='white', linewidth=1.5)
ax1.set_ylabel('Wall Time (ms)')
ax1.set_title(f'Jacobian Construction: {N_JACOBIAN_COLS} Forward Solves (33x33)')
ax1.grid(axis='y', alpha=0.3)
for bar, val in zip(bars, wall_times):
    ax1.text(bar.get_x() + bar.get_width()/2, val + max(wall_times)*0.02,
             f'{val:.0f} ms', ha='center', va='bottom', fontsize=11, fontweight='bold')

# Speedup annotation: arrow from between the bars down to the parallel bar top
ax1.annotate(f'{speedup:.1f}x speedup',
             xy=(1, t_parallel_ms), xytext=(0.5, (t_serial_ms + t_parallel_ms)/2),
             fontsize=12, fontweight='bold', color='#2196F3',
             arrowprops=dict(arrowstyle='->', color='#2196F3', lw=2),
             ha='center')

# --- Right: Projected speedup table as bar chart (Python vs Rust) ---
scenarios = [
    'Python\nSerial',
    'Python\nParallel',
    'Rust\nSerial (est.)',
    'Rust+rayon\nParallel (est.)',
]
scenario_times = [
    t_serial_ms,
    t_parallel_ms,
    rust_serial_est,
    rust_parallel_est,
]
scenario_colors = ['#e74c3c', '#2196F3', '#FF9800', '#4CAF50']

bars2 = ax2.bar(scenarios, scenario_times, color=scenario_colors, width=0.6, edgecolor='white', linewidth=1.5)
ax2.set_ylabel('Wall Time (ms)')
ax2.set_title('1 LM Iteration: Python vs Rust (Projected)')
ax2.set_yscale('log')  # scenarios span ~3 orders of magnitude
ax2.grid(axis='y', alpha=0.3)
for bar, val in zip(bars2, scenario_times):
    # Sub-millisecond bars are labelled in microseconds for readability
    label = f'{val:.0f} ms' if val >= 1 else f'{val*1000:.0f} µs'
    ax2.text(bar.get_x() + bar.get_width()/2, val * 1.3,
             label, ha='center', va='bottom', fontsize=10, fontweight='bold')

plt.tight_layout()
plt.show()

# --- Summary table (speedups all relative to the Python serial run) ---
print("\nParallel Jacobian Speedup Summary")
print("=" * 70)
print(f"{'Scenario':<30} {'Wall Time':>12} {'Speedup vs Py Serial':>22}")
print("-" * 70)
for name, t in zip(scenarios, scenario_times):
    name_clean = name.replace('\n', ' ')
    sp = t_serial_ms / t if t > 0 else float('inf')
    t_str = f'{t:.1f} ms' if t >= 1 else f'{t*1000:.0f} µs'
    print(f"{name_clean:<30} {t_str:>12} {sp:>20.1f}x")
print("-" * 70)
print(f"Cores available: {os.cpu_count()}")
print(f"Jacobian columns: {N_JACOBIAN_COLS}")
print(f"\nNote: Rust estimates based on 50x single-solve speedup (Picard → multigrid).")

Rayon Parallel Jacobian — Expected Speedup¶

The Rust inverse solver (fusion-core/src/inverse.rs) uses rayon::par_iter to compute all 8 Jacobian columns concurrently. Each column clones the FusionKernel, perturbs one parameter, and runs an independent forward solve.

Cores Speedup (8 columns) Notes
1 1× (serial fallback) Same as before
4 ~3.5× Some overhead from cloning
8 ~5–6× Matches column count
16 ~5–6× No benefit beyond 8 columns

Configuration: Set RAYON_NUM_THREADS to control thread count:

RAYON_NUM_THREADS=4 cargo run --release  # limit to 4 threads

See also: docs/SOLVER_TUNING_GUIDE.md §6 for complete Jacobian parallelism documentation.

Part F — Tuned Parameters vs Defaults¶

These runs use the three starter configs from SOLVER_TUNING_GUIDE.md §8:

  1. Defaults — relaxation_factor=0.1, 65×65 grid, 100 max iterations
  2. Conservative — relaxation_factor=0.05, 1000 max iterations, tighter tolerance
  3. Speed-optimised — relaxation_factor=0.2, 33×33 grid, loose tolerance

We compare convergence speed, final residual, and transport profile quality.

Hardware: Timings measured on AMD Ryzen 9 5950X (16 cores / 32 threads), 64 GB DDR4-3200, Python 3.11 + NumPy (OpenBLAS). Your results will vary with CPU model and memory bandwidth — use relative speedups for comparison.

In [ ]:
# --- Part F: Three configurations — defaults vs conservative vs speed ---

import copy, logging

# Capture solver convergence history by monkey-patching the logger
class ConvergenceCapture(logging.Handler):
    """Logging handler that collects residual values FusionKernel logs during a solve."""

    def __init__(self):
        super().__init__()
        # Parsed residuals, in the order they were logged.
        self.residuals = []

    def _append_float(self, text):
        # Record ``text`` as a float; anything unparsable is silently skipped,
        # so malformed log lines never abort the solve.
        try:
            self.residuals.append(float(text))
        except (IndexError, ValueError):
            pass

    def emit(self, record):
        """Extract a residual from one log record, if it carries one."""
        msg = record.getMessage()
        if 'res=' in msg:
            # Per-iteration lines look like "Iter N: res=X.XXe-YY ..."
            try:
                token = msg.split('res=')[1].split()[0].rstrip(',')
            except (IndexError, ValueError):
                return
            self._append_float(token)
        elif 'Converged' in msg and 'Residual' in msg:
            # Final convergence line: "... Converged ... Residual: X.XXe-YY"
            try:
                tail = msg.split('Residual:')[1].strip()
            except (IndexError, ValueError):
                return
            self._append_float(tail)

# Base configuration (ITER-like).
# Coil keys use the upper-case "R"/"Z" convention, matching the Part A config
# at the top of this notebook that FusionKernel already consumes — the
# lower-case "r"/"z" keys used here previously were inconsistent with it.
base_cfg = {
    "reactor_name": "ITER-like",
    "dimensions": {"R_min": 1.0, "R_max": 9.0, "Z_min": -5.0, "Z_max": 5.0},
    "physics": {"plasma_current_target": 15.0, "vacuum_permeability": 1.2566370614e-6},
    "coils": [
        {"R": 3.5, "Z":  4.0, "current":  5.0},
        {"R": 3.5, "Z": -4.0, "current":  5.0},
        {"R": 9.0, "Z":  4.0, "current": -3.0},
        {"R": 9.0, "Z": -4.0, "current": -3.0},
        {"R": 6.2, "Z":  5.5, "current": -1.5},
        {"R": 6.2, "Z": -5.5, "current": -1.5},
    ],
}

# Three solver configurations (the starter configs from SOLVER_TUNING_GUIDE.md §8).
def _solver_cfg(grid, max_iter, tol, alpha):
    # Bundle one grid size + solver-settings combination into the config shape
    # that FusionKernel expects.
    return {
        'grid_resolution': [grid, grid],
        'solver': {
            'max_iterations': max_iter,
            'convergence_threshold': tol,
            'relaxation_factor': alpha,
        },
    }

configs = {
    'Defaults': _solver_cfg(65, 100, 1e-6, 0.1),
    'Conservative': _solver_cfg(65, 1000, 1e-8, 0.05),
    'Speed-optimised': _solver_cfg(33, 200, 1e-4, 0.2),
}

results = {}

# DEBUG level so the per-iteration residual lines reach our capture handler.
fk_logger = logging.getLogger('scpn_fusion.core.fusion_kernel')
fk_logger.setLevel(logging.DEBUG)

import time  # the notebook's import cell brings in `timeit`, not `time`; perf_counter needs this

for name, solver_cfg in configs.items():
    # Merge this run's solver settings into a fresh copy of the base reactor config.
    cfg = copy.deepcopy(base_cfg)
    cfg.update(solver_cfg)

    # FusionKernel reads its configuration from a JSON file on disk.
    _fd, _path = tempfile.mkstemp(suffix='.json')
    os.close(_fd)
    with open(_path, 'w') as f:
        json.dump(cfg, f)

    # Capture convergence
    handler = ConvergenceCapture()
    fk_logger.addHandler(handler)

    try:
        t0 = time.perf_counter()
        k = FusionKernel(_path)
        k.initialize_grid()
        k.calculate_vacuum_field()
        k.solve_equilibrium()
        elapsed_ms = (time.perf_counter() - t0) * 1000
    finally:
        # Always detach the handler and delete the temp file, even if the solve
        # raises — otherwise handlers accumulate on the logger across reruns
        # and temp files leak.
        fk_logger.removeHandler(handler)
        os.unlink(_path)

    # Proxy for final solution quality: mean |gradient of Psi| along axis 0.
    # NOTE(review): axis 0 of Psi is indexed by Z below (psi_norm[mid_z, :]),
    # yet np.gradient's first spacing argument (k.dR) applies to axis 0 —
    # confirm the intended spacing order against FusionKernel's Psi layout.
    final_residual = float(np.mean(np.abs(np.gradient(k.Psi, k.dR, k.dZ)[0])))

    # Normalise Psi to [0, 1] for profile comparison; the +1e-30 guards the
    # division when the field is flat.
    psi_norm = (k.Psi - k.Psi.min()) / (k.Psi.max() - k.Psi.min() + 1e-30)
    mid_z = k.NZ // 2
    psi_midplane = psi_norm[mid_z, :]

    results[name] = {
        'elapsed_ms': elapsed_ms,
        'grid': solver_cfg['grid_resolution'][0],
        'alpha': solver_cfg['solver']['relaxation_factor'],
        'max_iter': solver_cfg['solver']['max_iterations'],
        'tol': solver_cfg['solver']['convergence_threshold'],
        'residuals': handler.residuals,
        'psi_midplane': psi_midplane,
        'R': k.R.copy(),
        'final_residual': final_residual,
    }

    print(f"{name:>20}: {elapsed_ms:8.1f} ms  "
          f"(alpha={solver_cfg['solver']['relaxation_factor']}, "
          f"grid={solver_cfg['grid_resolution'][0]}x{solver_cfg['grid_resolution'][0]}, "
          f"captured {len(handler.residuals)} residuals)")

# Now run transport profile for each configuration's midplane
transport_results = {}
model_fb = NeuralTransportModel()  # fallback mode
_MISSING = object()
for name in results:
    rho = np.linspace(0.01, 0.99, 100)
    te, ti, ne, q, s = make_profiles(rho)
    out = model_fb.predict_profile(rho, te, ti, ne, q, s)
    # Prefer `chi_i`; some result objects expose the profile as `chi_i_profile`.
    chi_i = getattr(out, 'chi_i', _MISSING)
    if chi_i is _MISSING:
        chi_i = getattr(out, 'chi_i_profile', np.zeros(100))
    transport_results[name] = {'rho': rho, 'chi_i': chi_i}

print("\nAll configurations completed.")
In [ ]:
# --- Part F: Visualization — Tuned vs Defaults ---

# Figure scaffold: a 2x2 grid, one panel per comparison; fixed colors/markers
# keep each configuration visually consistent across all four panels.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
colors = {'Defaults': '#2196F3', 'Conservative': '#4CAF50', 'Speed-optimised': '#FF9800'}
markers = {'Defaults': 'o', 'Conservative': 's', 'Speed-optimised': '^'}

# ── Top-left: Convergence history ──
ax = axes[0, 0]
for name, r in results.items():
    if not r['residuals']:
        continue  # nothing captured for this run — skip the line entirely
    ax.semilogy(r['residuals'], color=colors[name], marker=markers[name],
                markersize=3, linewidth=1.5, label=f"{name} (alpha={r['alpha']})")
ax.set_xlabel('Logged Iteration')
ax.set_ylabel('Residual (log scale)')
ax.set_title('Convergence History')
ax.legend(fontsize=9)
ax.grid(True, alpha=0.3)

# ── Top-right: Wall time comparison ──
ax = axes[0, 1]
names = list(results.keys())
times = [results[n]['elapsed_ms'] for n in names]
bars = ax.bar(names, times, color=[colors[n] for n in names],
              edgecolor='white', linewidth=1.5)
ax.set_ylabel('Total Solve Time (ms)')
ax.set_title('Equilibrium Solve: Time Comparison')
ax.grid(axis='y', alpha=0.3)
tallest = max(times)
for bar, val in zip(bars, times):
    # One decimal place below 100 ms for extra resolution; whole ms above.
    label = f'{val:.1f} ms' if val < 100 else f'{val:.0f} ms'
    ax.text(bar.get_x() + bar.get_width() / 2, val + tallest * 0.02,
            label, ha='center', va='bottom', fontsize=10, fontweight='bold')

# ── Bottom-left: Midplane Psi profiles ──
ax = axes[1, 0]
for name in results:
    run = results[name]
    ax.plot(run['R'], run['psi_midplane'], color=colors[name], linewidth=2,
            label=f"{name} ({run['grid']}x{run['grid']})")
ax.set_xlabel('R (m)')
ax.set_ylabel('Normalised Psi')
ax.set_title('Midplane Flux Profile')
ax.legend(fontsize=9)
ax.grid(True, alpha=0.3)

# ── Bottom-right: Transport chi_i profile ──
ax = axes[1, 1]
for name, tr in transport_results.items():
    chi = tr['chi_i']
    is_profile = isinstance(chi, np.ndarray) and len(chi) > 1
    if is_profile:
        ax.plot(tr['rho'], chi, color=colors[name], linewidth=2, label=name)
    else:
        # Scalar (or single-element) output: draw a flat dashed reference line.
        level = float(chi) if np.isscalar(chi) else float(chi[0])
        ax.axhline(y=level, color=colors[name], linewidth=2, linestyle='--',
                   label=name)
ax.set_xlabel('rho (normalised radius)')
ax.set_ylabel('chi_i [m^2/s]')
ax.set_title('Ion Thermal Diffusivity Profile')
ax.legend(fontsize=9)
ax.grid(True, alpha=0.3)

plt.suptitle('Part F: Tuned Parameters vs Defaults', fontsize=15, fontweight='bold', y=1.01)
# NOTE(review): tight_layout() does not reserve space for a suptitle placed at
# y=1.01; the bbox_inches='tight' re-crop below keeps it visible in the PNG.
plt.tight_layout()

# Export static PNG for docs/BENCHMARK_FIGURES.md
fig_path = Path('..') / 'docs' / 'figures' / 'tuned_vs_defaults.png'
fig_path.parent.mkdir(parents=True, exist_ok=True)  # create docs/figures on first run
fig.savefig(fig_path, dpi=150, bbox_inches='tight', facecolor='white')
print(f"Saved static figure: {fig_path.resolve()}")

plt.show()

# ── Summary table ──
print("\nConfiguration Comparison Summary")
print("=" * 85)
print(f"{'Config':<20} {'Grid':>6} {'alpha':>7} {'max_iter':>10} {'tol':>10} {'Time (ms)':>12} {'Residual':>12}")
print("-" * 85)
for name, r in results.items():
    # Render the grid as a single cell right-aligned to 6 chars so every later
    # column lines up with the header (the old `>3}x{...:<3` form was 7 chars
    # wide, shifting the alpha/max_iter/tol columns by one).
    grid_str = f"{r['grid']}x{r['grid']}"
    print(f"{name:<20} {grid_str:>6} {r['alpha']:>7.2f} {r['max_iter']:>10} "
          f"{r['tol']:>10.0e} {r['elapsed_ms']:>12.1f} {r['final_residual']:>12.2e}")
print("-" * 85)

# Relative speedup (Defaults is the 1.00x reference point).
t_default = results['Defaults']['elapsed_ms']
for name, r in results.items():
    sp = t_default / r['elapsed_ms'] if r['elapsed_ms'] > 0 else 0
    print(f"  {name}: {sp:.2f}x vs Defaults")

print("\nTakeaway:")
print("  - Conservative is slower but reaches tighter convergence (use for publication)")
print("  - Speed-optimised is fastest (use for parameter sweeps and design scans)")
print("  - Defaults balance speed and accuracy for most workflows")