p1/find_optimal_window.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Find Optimal Data Window for K ≈ 3650

Enumerate all possible starting years to find which data window
produces K closest to the physical limit (3650 launches/year).
"""

import pandas as pd
import numpy as np
from scipy.optimize import curve_fit
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

plt.rcParams['font.sans-serif'] = ['Arial Unicode MS', 'SimHei', 'DejaVu Sans']
plt.rcParams['axes.unicode_minus'] = False


# Richards model
def richards(t, K, r, t0, v):
    exp_term = np.exp(-r * (t - t0))
    exp_term = np.clip(exp_term, 1e-10, 1e10)
    return K / np.power(1 + exp_term, 1/v)


def load_data(filepath="rocket_launch_counts.csv"):
    df = pd.read_csv(filepath)
    df = df.rename(columns={"YDate": "year", "Total": "launches"})
    df["year"] = pd.to_numeric(df["year"], errors="coerce")
    df["launches"] = pd.to_numeric(df["launches"], errors="coerce")
    df = df.dropna(subset=["year", "launches"])
    df = df[(df["year"] >= 1957) & (df["year"] <= 2025)]
    df = df.astype({"year": int, "launches": int})
    df = df.sort_values("year").reset_index(drop=True)
    return df


def fit_richards(data, base_year=1957):
    """Fit unconstrained Richards model"""
    years = data["year"].values
    launches = data["launches"].values
    t = (years - base_year).astype(float)
    
    p0 = [5000.0, 0.08, 80.0, 2.0]
    bounds = ([500, 0.005, 10, 0.2], [100000, 1.0, 200, 10.0])
    
    try:
        popt, pcov = curve_fit(richards, t, launches, p0=p0, bounds=bounds, maxfev=100000)
        y_pred = richards(t, *popt)
        ss_res = np.sum((launches - y_pred) ** 2)
        ss_tot = np.sum((launches - np.mean(launches)) ** 2)
        r_squared = 1 - (ss_res / ss_tot) if ss_tot > 0 else 0
        
        return {
            "success": True,
            "K": popt[0],
            "r": popt[1],
            "t0": popt[2],
            "v": popt[3],
            "r_squared": r_squared,
            "n_points": len(data),
        }
    except:
        return {"success": False}


def main():
    print("=" * 80)
    print("枚举起始年份，寻找 K ≈ 3650 的数据窗口")
    print("=" * 80)
    
    df = load_data()
    print(f"数据范围: {df['year'].min()} - {df['year'].max()}\n")
    
    target_K = 3650
    end_year = 2025
    
    # Enumerate all starting years
    results = []
    
    for start_year in range(1957, 2020):
        data = df[(df["year"] >= start_year) & (df["year"] <= end_year)].copy()
        
        if len(data) < 5:  # Need at least 5 data points
            continue
        
        result = fit_richards(data)
        
        if result["success"]:
            diff = abs(result["K"] - target_K)
            results.append({
                "start_year": start_year,
                "end_year": end_year,
                "years_span": end_year - start_year + 1,
                "n_points": result["n_points"],
                "K": result["K"],
                "r_squared": result["r_squared"],
                "diff_from_target": diff,
                "ratio": result["K"] / target_K,
            })
    
    # Sort by difference from target
    results_df = pd.DataFrame(results)
    results_df = results_df.sort_values("diff_from_target")
    
    # Print full table
    print("完整枚举结果表（按与目标值 3650 的差距排序）:")
    print("-" * 80)
    print(f"{'起始年份':>8} | {'时间跨度':>8} | {'数据点':>6} | {'K值':>10} | {'R²':>8} | {'K/3650':>8} | {'差距':>10}")
    print("-" * 80)
    
    for _, row in results_df.iterrows():
        marker = "★" if row["diff_from_target"] < 500 else ""
        print(f"{int(row['start_year']):>8} | {int(row['years_span']):>6}年 | {int(row['n_points']):>6} | "
              f"{row['K']:>10.0f} | {row['r_squared']:>8.4f} | {row['ratio']:>8.2f}x | {row['diff_from_target']:>10.0f} {marker}")
    
    # Find closest matches
    print("\n" + "=" * 80)
    print("最接近 K = 3650 的数据窗口 (Top 10)")
    print("=" * 80)
    
    top10 = results_df.head(10)
    print(f"\n{'排名':>4} | {'起始年份':>8} | {'K值':>10} | {'R²':>8} | {'差距':>10} | {'评价':<20}")
    print("-" * 75)
    
    for i, (_, row) in enumerate(top10.iterrows(), 1):
        if row["r_squared"] < 0.5:
            comment = "❌ 拟合差，不可信"
        elif row["r_squared"] < 0.8:
            comment = "⚠️ 拟合一般"
        else:
            comment = "✅ 拟合良好"
        
        print(f"{i:>4} | {int(row['start_year']):>8} | {row['K']:>10.0f} | {row['r_squared']:>8.4f} | "
              f"{row['diff_from_target']:>10.0f} | {comment:<20}")
    
    # Analysis
    print("\n" + "=" * 80)
    print("分析结论")
    print("=" * 80)
    
    # Find best with good R²
    good_fit = results_df[results_df["r_squared"] >= 0.8]
    if len(good_fit) > 0:
        best_good = good_fit.iloc[0]
        print(f"\n在 R² ≥ 0.8 的条件下，最接近 K=3650 的窗口:")
        print(f"  起始年份: {int(best_good['start_year'])}")
        print(f"  K = {best_good['K']:.0f}")
        print(f"  R² = {best_good['r_squared']:.4f}")
        print(f"  差距: {best_good['diff_from_target']:.0f}")
    
    # Summary
    print("\n关键发现:")
    print("-" * 40)
    
    # Check if any good fit gives K near 3650
    near_target = results_df[(results_df["diff_from_target"] < 1000) & (results_df["r_squared"] >= 0.7)]
    
    if len(near_target) == 0:
        print("  ⚠️ 没有任何数据窗口能在良好拟合(R²≥0.7)的条件下得到 K≈3650")
        print("  ⚠️ 所有良好拟合的窗口都给出 K >> 3650 或 K << 3650")
        print("\n  这说明:")
        print("  • K=3650 不是数据自然支持的结论")
        print("  • K=3650 来自物理约束，而非统计预测")
        print("  • 论文中应该明确说明这是'物理上限'而非'数据预测'")
    else:
        print(f"  找到 {len(near_target)} 个窗口使 K 接近 3650:")
        for _, row in near_target.iterrows():
            print(f"    {int(row['start_year'])}-2025: K={row['K']:.0f}, R²={row['r_squared']:.4f}")
    
    # Generate visualization
    print("\n" + "=" * 80)
    print("生成可视化...")
    print("=" * 80)
    
    fig, axes = plt.subplots(1, 2, figsize=(14, 6))
    
    # Plot 1: K vs Start Year
    ax1 = axes[0]
    colors = ['#27AE60' if r2 >= 0.8 else '#F39C12' if r2 >= 0.5 else '#E74C3C' 
              for r2 in results_df["r_squared"]]
    
    ax1.scatter(results_df["start_year"], results_df["K"], c=colors, s=60, alpha=0.7, edgecolor='black')
    ax1.axhline(3650, color='blue', ls='--', lw=2, label='Target K=3650')
    ax1.axhline(3650*0.9, color='blue', ls=':', lw=1, alpha=0.5)
    ax1.axhline(3650*1.1, color='blue', ls=':', lw=1, alpha=0.5)
    
    ax1.set_xlabel("Starting Year", fontsize=11)
    ax1.set_ylabel("K (Carrying Capacity)", fontsize=11)
    ax1.set_title("K vs Starting Year\n(Color: Green=R²≥0.8, Yellow=R²≥0.5, Red=R²<0.5)", fontsize=12)
    ax1.legend()
    ax1.grid(True, alpha=0.3)
    ax1.set_xlim(1955, 2020)
    
    # Plot 2: R² vs Start Year
    ax2 = axes[1]
    ax2.scatter(results_df["start_year"], results_df["r_squared"], c=colors, s=60, alpha=0.7, edgecolor='black')
    ax2.axhline(0.8, color='green', ls='--', lw=1.5, label='R²=0.8 (Good)')
    ax2.axhline(0.5, color='orange', ls=':', lw=1.5, label='R²=0.5 (Acceptable)')
    
    ax2.set_xlabel("Starting Year", fontsize=11)
    ax2.set_ylabel("R² (Goodness of Fit)", fontsize=11)
    ax2.set_title("Model Fit Quality vs Starting Year", fontsize=12)
    ax2.legend()
    ax2.grid(True, alpha=0.3)
    ax2.set_xlim(1955, 2020)
    ax2.set_ylim(-0.1, 1.1)
    
    plt.tight_layout()
    plt.savefig("find_optimal_window.png", dpi=150, bbox_inches='tight')
    plt.close()
    print("图表已保存: find_optimal_window.png")
    
    # Save to CSV
    results_df.to_csv("window_enumeration.csv", index=False)
    print("数据已保存: window_enumeration.csv")
    
    print("\n" + "=" * 80)
    print("分析完成!")
    print("=" * 80)
    
    return results_df


if __name__ == "__main__":
    results = main()
plane: fig 2026-02-02 21:47:52 +08:00			`#!/usr/bin/env python3`
			`# -- coding: utf-8 --`
			`"""`
			`Find Optimal Data Window for K ≈ 3650`

			`Enumerate all possible starting years to find which data window`
			`produces K closest to the physical limit (3650 launches/year).`
			`"""`

			`import pandas as pd`
			`import numpy as np`
			`from scipy.optimize import curve_fit`
			`import matplotlib`
			`matplotlib.use('Agg')`
			`import matplotlib.pyplot as plt`
			`import warnings`
			`warnings.filterwarnings('ignore')`

			`plt.rcParams['font.sans-serif'] = ['Arial Unicode MS', 'SimHei', 'DejaVu Sans']`
			`plt.rcParams['axes.unicode_minus'] = False`


			`# Richards model`
			`def richards(t, K, r, t0, v):`
			`exp_term = np.exp(-r * (t - t0))`
			`exp_term = np.clip(exp_term, 1e-10, 1e10)`
			`return K / np.power(1 + exp_term, 1/v)`


			`def load_data(filepath="rocket_launch_counts.csv"):`
			`df = pd.read_csv(filepath)`
			`df = df.rename(columns={"YDate": "year", "Total": "launches"})`
			`df["year"] = pd.to_numeric(df["year"], errors="coerce")`
			`df["launches"] = pd.to_numeric(df["launches"], errors="coerce")`
			`df = df.dropna(subset=["year", "launches"])`
			`df = df[(df["year"] >= 1957) & (df["year"] <= 2025)]`
			`df = df.astype({"year": int, "launches": int})`
			`df = df.sort_values("year").reset_index(drop=True)`
			`return df`


			`def fit_richards(data, base_year=1957):`
			`"""Fit unconstrained Richards model"""`
			`years = data["year"].values`
			`launches = data["launches"].values`
			`t = (years - base_year).astype(float)`

			`p0 = [5000.0, 0.08, 80.0, 2.0]`
			`bounds = ([500, 0.005, 10, 0.2], [100000, 1.0, 200, 10.0])`

			`try:`
			`popt, pcov = curve_fit(richards, t, launches, p0=p0, bounds=bounds, maxfev=100000)`
			`y_pred = richards(t, *popt)`
			`ss_res = np.sum((launches - y_pred) ** 2)`
			`ss_tot = np.sum((launches - np.mean(launches)) ** 2)`
			`r_squared = 1 - (ss_res / ss_tot) if ss_tot > 0 else 0`

			`return {`
			`"success": True,`
			`"K": popt[0],`
			`"r": popt[1],`
			`"t0": popt[2],`
			`"v": popt[3],`
			`"r_squared": r_squared,`
			`"n_points": len(data),`
			`}`
			`except:`
			`return {"success": False}`


			`def main():`
			`print("=" * 80)`
			`print("枚举起始年份，寻找 K ≈ 3650 的数据窗口")`
			`print("=" * 80)`

			`df = load_data()`
			`print(f"数据范围: {df['year'].min()} - {df['year'].max()}\n")`

			`target_K = 3650`
			`end_year = 2025`

			`# Enumerate all starting years`
			`results = []`

			`for start_year in range(1957, 2020):`
			`data = df[(df["year"] >= start_year) & (df["year"] <= end_year)].copy()`

			`if len(data) < 5: # Need at least 5 data points`
			`continue`

			`result = fit_richards(data)`

			`if result["success"]:`
			`diff = abs(result["K"] - target_K)`
			`results.append({`
			`"start_year": start_year,`
			`"end_year": end_year,`
			`"years_span": end_year - start_year + 1,`
			`"n_points": result["n_points"],`
			`"K": result["K"],`
			`"r_squared": result["r_squared"],`
			`"diff_from_target": diff,`
			`"ratio": result["K"] / target_K,`
			`})`

			`# Sort by difference from target`
			`results_df = pd.DataFrame(results)`
			`results_df = results_df.sort_values("diff_from_target")`

			`# Print full table`
			`print("完整枚举结果表（按与目标值 3650 的差距排序）:")`
			`print("-" * 80)`
			`print(f"{'起始年份':>8} \| {'时间跨度':>8} \| {'数据点':>6} \| {'K值':>10} \| {'R²':>8} \| {'K/3650':>8} \| {'差距':>10}")`
			`print("-" * 80)`

			`for _, row in results_df.iterrows():`
			`marker = "★" if row["diff_from_target"] < 500 else ""`
			`print(f"{int(row['start_year']):>8} \| {int(row['years_span']):>6}年 \| {int(row['n_points']):>6} \| "`
			`f"{row['K']:>10.0f} \| {row['r_squared']:>8.4f} \| {row['ratio']:>8.2f}x \| {row['diff_from_target']:>10.0f} {marker}")`

			`# Find closest matches`
			`print("\n" + "=" * 80)`
			`print("最接近 K = 3650 的数据窗口 (Top 10)")`
			`print("=" * 80)`

			`top10 = results_df.head(10)`
			`print(f"\n{'排名':>4} \| {'起始年份':>8} \| {'K值':>10} \| {'R²':>8} \| {'差距':>10} \| {'评价':<20}")`
			`print("-" * 75)`

			`for i, (_, row) in enumerate(top10.iterrows(), 1):`
			`if row["r_squared"] < 0.5:`
			`comment = "❌ 拟合差，不可信"`
			`elif row["r_squared"] < 0.8:`
			`comment = "⚠️ 拟合一般"`
			`else:`
			`comment = "✅ 拟合良好"`

			`print(f"{i:>4} \| {int(row['start_year']):>8} \| {row['K']:>10.0f} \| {row['r_squared']:>8.4f} \| "`
			`f"{row['diff_from_target']:>10.0f} \| {comment:<20}")`

			`# Analysis`
			`print("\n" + "=" * 80)`
			`print("分析结论")`
			`print("=" * 80)`

			`# Find best with good R²`
			`good_fit = results_df[results_df["r_squared"] >= 0.8]`
			`if len(good_fit) > 0:`
			`best_good = good_fit.iloc[0]`
			`print(f"\n在 R² ≥ 0.8 的条件下，最接近 K=3650 的窗口:")`
			`print(f" 起始年份: {int(best_good['start_year'])}")`
			`print(f" K = {best_good['K']:.0f}")`
			`print(f" R² = {best_good['r_squared']:.4f}")`
			`print(f" 差距: {best_good['diff_from_target']:.0f}")`

			`# Summary`
			`print("\n关键发现:")`
			`print("-" * 40)`

			`# Check if any good fit gives K near 3650`
			`near_target = results_df[(results_df["diff_from_target"] < 1000) & (results_df["r_squared"] >= 0.7)]`

			`if len(near_target) == 0:`
			`print(" ⚠️ 没有任何数据窗口能在良好拟合(R²≥0.7)的条件下得到 K≈3650")`
			`print(" ⚠️ 所有良好拟合的窗口都给出 K >> 3650 或 K << 3650")`
			`print("\n 这说明:")`
			`print(" • K=3650 不是数据自然支持的结论")`
			`print(" • K=3650 来自物理约束，而非统计预测")`
			`print(" • 论文中应该明确说明这是'物理上限'而非'数据预测'")`
			`else:`
			`print(f" 找到 {len(near_target)} 个窗口使 K 接近 3650:")`
			`for _, row in near_target.iterrows():`
			`print(f" {int(row['start_year'])}-2025: K={row['K']:.0f}, R²={row['r_squared']:.4f}")`

			`# Generate visualization`
			`print("\n" + "=" * 80)`
			`print("生成可视化...")`
			`print("=" * 80)`

			`fig, axes = plt.subplots(1, 2, figsize=(14, 6))`

			`# Plot 1: K vs Start Year`
			`ax1 = axes[0]`
			`colors = ['#27AE60' if r2 >= 0.8 else '#F39C12' if r2 >= 0.5 else '#E74C3C'`
			`for r2 in results_df["r_squared"]]`

			`ax1.scatter(results_df["start_year"], results_df["K"], c=colors, s=60, alpha=0.7, edgecolor='black')`
			`ax1.axhline(3650, color='blue', ls='--', lw=2, label='Target K=3650')`
			`ax1.axhline(3650*0.9, color='blue', ls=':', lw=1, alpha=0.5)`
			`ax1.axhline(3650*1.1, color='blue', ls=':', lw=1, alpha=0.5)`

			`ax1.set_xlabel("Starting Year", fontsize=11)`
			`ax1.set_ylabel("K (Carrying Capacity)", fontsize=11)`
			`ax1.set_title("K vs Starting Year\n(Color: Green=R²≥0.8, Yellow=R²≥0.5, Red=R²<0.5)", fontsize=12)`
			`ax1.legend()`
			`ax1.grid(True, alpha=0.3)`
			`ax1.set_xlim(1955, 2020)`

			`# Plot 2: R² vs Start Year`
			`ax2 = axes[1]`
			`ax2.scatter(results_df["start_year"], results_df["r_squared"], c=colors, s=60, alpha=0.7, edgecolor='black')`
			`ax2.axhline(0.8, color='green', ls='--', lw=1.5, label='R²=0.8 (Good)')`
			`ax2.axhline(0.5, color='orange', ls=':', lw=1.5, label='R²=0.5 (Acceptable)')`

			`ax2.set_xlabel("Starting Year", fontsize=11)`
			`ax2.set_ylabel("R² (Goodness of Fit)", fontsize=11)`
			`ax2.set_title("Model Fit Quality vs Starting Year", fontsize=12)`
			`ax2.legend()`
			`ax2.grid(True, alpha=0.3)`
			`ax2.set_xlim(1955, 2020)`
			`ax2.set_ylim(-0.1, 1.1)`

			`plt.tight_layout()`
			`plt.savefig("find_optimal_window.png", dpi=150, bbox_inches='tight')`
			`plt.close()`
			`print("图表已保存: find_optimal_window.png")`

			`# Save to CSV`
			`results_df.to_csv("window_enumeration.csv", index=False)`
			`print("数据已保存: window_enumeration.csv")`

			`print("\n" + "=" * 80)`
			`print("分析完成!")`
			`print("=" * 80)`

			`return results_df`


			`if __name__ == "__main__":`
			`results = main()`