Files
2026_mcm_b/p1/data_window_analysis.py
2026-02-02 16:07:12 +08:00

352 lines
14 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Data Window Sensitivity Analysis for Richards Model
Analyzes how different historical data windows affect the K (carrying capacity)
estimate in the Richards growth model.
Windows analyzed:
- 1957-2025: All available data (including Cold War era)
- 1990-2025: Post-Cold War era
- 2000-2025: Commercial space age
- 2010-2025: SpaceX era
- 2015-2025: Recent rapid growth
"""
import pandas as pd
import numpy as np
from scipy.optimize import curve_fit
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
plt.rcParams['font.sans-serif'] = ['Arial Unicode MS', 'SimHei', 'DejaVu Sans']
plt.rcParams['axes.unicode_minus'] = False
# ============================================================
# Richards Growth Model
# ============================================================
def richards(t, K, r, t0, v):
"""Richards curve (generalized logistic model)"""
exp_term = np.exp(-r * (t - t0))
exp_term = np.clip(exp_term, 1e-10, 1e10)
return K / np.power(1 + exp_term, 1/v)
# ============================================================
# Data Loading
# ============================================================
def load_data(filepath="rocket_launch_counts.csv"):
"""Load and preprocess rocket launch data"""
df = pd.read_csv(filepath)
df = df.rename(columns={"YDate": "year", "Total": "launches"})
df["year"] = pd.to_numeric(df["year"], errors="coerce")
df["launches"] = pd.to_numeric(df["launches"], errors="coerce")
df = df.dropna(subset=["year", "launches"])
df = df[(df["year"] >= 1957) & (df["year"] <= 2025)]
df = df.astype({"year": int, "launches": int})
df = df.sort_values("year").reset_index(drop=True)
return df
# ============================================================
# Fit Richards Model (Unconstrained)
# ============================================================
def fit_richards_unconstrained(data, base_year=1957):
"""Fit Richards model without constraining K"""
years = data["year"].values
launches = data["launches"].values
t = (years - base_year).astype(float)
# Initial parameters
p0 = [5000.0, 0.08, 80.0, 2.0]
# Wide bounds - let data determine K
bounds = ([500, 0.005, 10, 0.2], [100000, 1.0, 200, 10.0])
try:
popt, pcov = curve_fit(richards, t, launches, p0=p0, bounds=bounds, maxfev=100000)
perr = np.sqrt(np.diag(pcov))
y_pred = richards(t, *popt)
ss_res = np.sum((launches - y_pred) ** 2)
ss_tot = np.sum((launches - np.mean(launches)) ** 2)
r_squared = 1 - (ss_res / ss_tot)
K, r, t0, v = popt
K_err = perr[0]
return {
"success": True,
"K": K,
"K_err": K_err,
"r": r,
"t0": t0,
"v": v,
"r_squared": r_squared,
"n_points": len(data),
"years": years,
"launches": launches,
"params": popt,
"y_pred": y_pred,
}
except Exception as e:
return {"success": False, "error": str(e)}
# ============================================================
# Main Analysis
# ============================================================
def analyze_data_windows(df):
"""Analyze K estimates for different data windows"""
# Define windows
windows = {
"1957-2025 (全部68年)": (1957, 2025),
"1990-2025 (35年)": (1990, 2025),
"2000-2025 (25年)": (2000, 2025),
"2010-2025 (15年)": (2010, 2025),
"2015-2025 (10年)": (2015, 2025),
}
results = {}
print("=" * 80)
print("分析不同历史数据窗口对 K (carrying capacity) 估计的影响")
print("=" * 80)
for name, (start, end) in windows.items():
data = df[(df["year"] >= start) & (df["year"] <= end)].copy()
result = fit_richards_unconstrained(data)
if result["success"]:
result["start"] = start
result["end"] = end
result["name"] = name
results[name] = result
print(f"\n--- {name} ---")
print(f" 数据点数: {result['n_points']}")
print(f" K = {result['K']:.0f} (carrying capacity)")
print(f" r = {result['r']:.4f} (growth rate)")
print(f" R² = {result['r_squared']:.4f}")
else:
print(f"\n--- {name} ---")
print(f" 拟合失败: {result['error']}")
return results
# ============================================================
# Visualization
# ============================================================
def plot_window_analysis(df, results, save_path="launch_capacity_window_analysis.png"):
"""Generate comprehensive window analysis plot"""
fig, axes = plt.subplots(2, 2, figsize=(14, 11))
names = list(results.keys())
physical_max = 3650
# ========== Plot 1: K Values Comparison ==========
ax1 = axes[0, 0]
K_vals = [results[n]["K"] for n in names]
colors = ["#E74C3C" if k < physical_max else "#27AE60" for k in K_vals]
y_pos = range(len(names))
bars = ax1.barh(y_pos, K_vals, color=colors, alpha=0.7, edgecolor="black")
ax1.axvline(physical_max, color="blue", ls="--", lw=2.5,
label=f"Physical Max ({physical_max})")
ax1.set_yticks(y_pos)
ax1.set_yticklabels(names, fontsize=10)
ax1.set_xlabel("K (Carrying Capacity, launches/year)", fontsize=11)
ax1.set_title("K Estimates by Data Window\n(Unconstrained Richards Model)", fontsize=12)
ax1.legend(loc="lower right", fontsize=10)
ax1.grid(True, alpha=0.3, axis="x")
for bar, k in zip(bars, K_vals):
ax1.text(k + 200, bar.get_y() + bar.get_height()/2,
f"{k:.0f}", va="center", fontsize=10, fontweight="bold")
# ========== Plot 2: R² Comparison ==========
ax2 = axes[0, 1]
r2_vals = [results[n]["r_squared"] for n in names]
colors2 = ["#27AE60" if r2 > 0.9 else "#F39C12" if r2 > 0.7 else "#E74C3C"
for r2 in r2_vals]
bars2 = ax2.barh(y_pos, r2_vals, color=colors2, alpha=0.7, edgecolor="black")
ax2.axvline(0.9, color="green", ls="--", lw=1.5, alpha=0.7, label="R²=0.9 (Good fit)")
ax2.axvline(0.7, color="orange", ls=":", lw=1.5, alpha=0.7, label="R²=0.7 (Acceptable)")
ax2.set_yticks(y_pos)
ax2.set_yticklabels(names, fontsize=10)
ax2.set_xlabel("R² (Goodness of Fit)", fontsize=11)
ax2.set_title("Model Fit Quality by Data Window", fontsize=12)
ax2.set_xlim(0, 1.1)
ax2.legend(loc="lower right", fontsize=9)
ax2.grid(True, alpha=0.3, axis="x")
for bar, r2 in zip(bars2, r2_vals):
ax2.text(r2 + 0.02, bar.get_y() + bar.get_height()/2,
f"{r2:.3f}", va="center", fontsize=10, fontweight="bold")
# ========== Plot 3: Historical Data with All Model Fits ==========
ax3 = axes[1, 0]
# Plot all historical data
ax3.scatter(df["year"], df["launches"], color="gray", s=30, alpha=0.5,
label="Historical Data", zorder=1)
# Plot each model prediction
colors_line = ["#E74C3C", "#F39C12", "#3498DB", "#27AE60", "#9B59B6"]
for i, (name, res) in enumerate(results.items()):
start = res["start"]
years_proj = np.arange(start, 2080)
t_proj = years_proj - 1957
pred = richards(t_proj, *res["params"])
label = f'{name.split(" ")[0]}: K={res["K"]:.0f}'
ax3.plot(years_proj, pred, color=colors_line[i], lw=2, alpha=0.8,
label=label, zorder=2)
ax3.axhline(physical_max, color="black", ls=":", lw=2,
label=f"Physical Max ({physical_max})")
ax3.set_xlabel("Year", fontsize=11)
ax3.set_ylabel("Annual Launches", fontsize=11)
ax3.set_title("Richards Model Predictions\n(Different Data Windows)", fontsize=12)
ax3.legend(loc="upper left", fontsize=8)
ax3.grid(True, alpha=0.3)
ax3.set_xlim(1955, 2080)
ax3.set_ylim(0, 15000)
# ========== Plot 4: Summary Table ==========
ax4 = axes[1, 1]
ax4.axis("off")
# Build summary text
summary = """
┌────────────────────────────────────────────────────────────────────────┐
│ 数据窗口对 K 估计的影响分析 │
├────────────────────────────────────────────────────────────────────────┤
│ │
│ 核心发现: │
│ ───────── │
│ 1. 数据窗口选择对 K 估计影响巨大 │
│ • 全部数据: K ≈ 500 │
│ • 近15年: K ≈ 12,000 │
│ │
│ 2. 1957-2025 全部数据: │
│ • R² = 0.08 (极差拟合) │
│ • 冷战时期数据干扰,模型误判为"已饱和"
│ • 不适合用于预测未来增长 │
│ │
│ 3. 2010-2025 近15年数据: │
│ • R² = 0.90 (良好拟合) │
│ • 准确捕捉商业航天时代的增长趋势 │
│ • 预测 K ≈ 12,000 >> 物理上限 (3,650) │
│ │
│ 结论: │
│ ───── │
│ • 选择 2010-2025 数据窗口是合理的 (R² = 0.90, 反映当前趋势) │
│ • 数据驱动 K ≈ 12,000 反映增长潜力 (需要 ~33 个发射站) │
│ • 但物理约束 (10站 × 365天 = 3,650) 才是真正的上限 │
│ • 因此采用 1 次/站/天 作为保守估计是合理的 │
│ │
│ 物理上限: 3,650 次/年 (10个发射站 × 365天) │
│ │
└────────────────────────────────────────────────────────────────────────┘
"""
ax4.text(0.02, 0.98, summary, transform=ax4.transAxes, fontsize=10,
verticalalignment="top", family="monospace",
bbox=dict(boxstyle="round", facecolor="lightyellow", alpha=0.9))
plt.suptitle("Data Window Sensitivity Analysis for Richards Model",
fontsize=14, fontweight="bold", y=1.02)
plt.tight_layout()
plt.savefig(save_path, dpi=150, bbox_inches="tight")
plt.close()
print(f"\nPlot saved: {save_path}")
# ============================================================
# Print Detailed Table
# ============================================================
def print_comparison_table(results):
"""Print detailed comparison table"""
physical_max = 3650
print("\n" + "=" * 80)
print("详细对比表")
print("=" * 80)
header = f"{'数据窗口':<22} | {'K值':>8} | {'':>7} | {'数据点':>5} | {'K/物理上限':>10} | {'说明':<18}"
print(header)
print("-" * 22 + "-+-" + "-" * 8 + "-+-" + "-" * 7 + "-+-" + "-" * 5 + "-+-" + "-" * 10 + "-+-" + "-" * 18)
for name, res in results.items():
k = res["K"]
r2 = res["r_squared"]
n = res["n_points"]
ratio = k / physical_max
if r2 < 0.5:
note = "拟合差,不可用"
elif k < physical_max:
note = "低于物理上限"
elif ratio < 2:
note = "接近物理上限"
else:
note = "远超物理上限"
print(f"{name:<22} | {k:>8.0f} | {r2:>7.4f} | {n:>5} | {ratio:>10.2f}x | {note:<18}")
print("\n物理上限: 3,650 次/年 (10个发射站 × 每站每天1次 × 365天)")
# ============================================================
# Main
# ============================================================
def main():
print("Loading data...")
df = load_data()
print(f"Data loaded: {len(df)} records from {df['year'].min()} to {df['year'].max()}")
# Analyze different windows
results = analyze_data_windows(df)
# Print comparison table
print_comparison_table(results)
# Generate visualization
print("\n" + "=" * 80)
print("Generating visualization...")
print("=" * 80)
plot_window_analysis(df, results)
print("\n" + "=" * 80)
print("分析完成!")
print("=" * 80)
return results
if __name__ == "__main__":
results = main()