""" Step 07: 历史回测验证 输入: 01_clean.xlsx, 02_demand.xlsx, 03_allocate.xlsx 输出: 07_backtest.xlsx 功能: 1. 验证截断修正模型的合理性 2. 对比2019年实际访问次数与模型建议的分配 3. 分析残差分布 4. 计算模型拟合指标 (R², MAPE) """ import pandas as pd import numpy as np from scipy.stats import norm, pearsonr, spearmanr from pathlib import Path # 路径配置 CLEAN_PATH = Path(__file__).parent / "01_clean.xlsx" DEMAND_PATH = Path(__file__).parent / "02_demand.xlsx" ALLOCATE_PATH = Path(__file__).parent / "03_allocate.xlsx" OUTPUT_PATH = Path(__file__).parent / "07_backtest.xlsx" def compute_metrics(actual, predicted): """计算回归评估指标""" actual = np.array(actual) predicted = np.array(predicted) # 避免除零 mask = actual != 0 actual_nz = actual[mask] predicted_nz = predicted[mask] # MAPE (Mean Absolute Percentage Error) mape = np.mean(np.abs((actual_nz - predicted_nz) / actual_nz)) * 100 # MAE (Mean Absolute Error) mae = np.mean(np.abs(actual - predicted)) # RMSE (Root Mean Squared Error) rmse = np.sqrt(np.mean((actual - predicted) ** 2)) # R² (Coefficient of Determination) ss_res = np.sum((actual - predicted) ** 2) ss_tot = np.sum((actual - np.mean(actual)) ** 2) r2 = 1 - ss_res / ss_tot if ss_tot != 0 else 0 # Pearson correlation pearson_r, pearson_p = pearsonr(actual, predicted) # Spearman correlation (rank-based) spearman_r, spearman_p = spearmanr(actual, predicted) return { 'MAPE': mape, 'MAE': mae, 'RMSE': rmse, 'R2': r2, 'Pearson_r': pearson_r, 'Pearson_p': pearson_p, 'Spearman_r': spearman_r, 'Spearman_p': spearman_p } def main(): print("=" * 60) print("Step 07: 历史回测验证") print("=" * 60) # 1. 读取数据 print(f"\n[1] 读取输入数据") df_clean = pd.read_excel(CLEAN_PATH) df_demand = pd.read_excel(DEMAND_PATH) df_allocate = pd.read_excel(ALLOCATE_PATH) # 合并数据 df = df_clean.merge(df_demand[['site_id', 'mu_tilde', 'p_trunc', 'is_corrected']], on='site_id') df = df.merge(df_allocate[['site_id', 'k']], on='site_id') print(f" 读取 {len(df)} 站点数据") # 2. 分析1: 2019年访问次数 vs 需求μ的关系 print(f"\n[2] 分析1: 2019年访问次数 vs 需求μ的相关性") # 2019年的访问次数是否与需求成正比? corr_2019_mu = pearsonr(df['visits_2019'], df['mu']) corr_2019_mu_tilde = pearsonr(df['visits_2019'], df['mu_tilde']) print(f" visits_2019 vs μ: r = {corr_2019_mu[0]:.4f}, p = {corr_2019_mu[1]:.4e}") print(f" visits_2019 vs μ̃: r = {corr_2019_mu_tilde[0]:.4f}, p = {corr_2019_mu_tilde[1]:.4e}") # 解读 if abs(corr_2019_mu[0]) < 0.3: print(f" 解读: 2019年访问次数与需求弱相关,说明历史分配未充分考虑需求") else: print(f" 解读: 2019年访问次数与需求有一定相关性") # 3. 分析2: 模型建议的k vs 2019年实际 print(f"\n[3] 分析2: 模型建议的k vs 2019年实际访问次数") # 将2019年访问次数缩放到730总次数 total_2019 = df['visits_2019'].sum() df['visits_2019_scaled'] = df['visits_2019'] * 730 / total_2019 # 计算差异 df['k_diff'] = df['k'] - df['visits_2019_scaled'] df['k_diff_pct'] = df['k_diff'] / df['visits_2019_scaled'] * 100 # 统计 increased = (df['k_diff'] > 0.5).sum() decreased = (df['k_diff'] < -0.5).sum() unchanged = len(df) - increased - decreased print(f" 相比2019缩放方案:") print(f" - 增加访问的站点: {increased} 个") print(f" - 减少访问的站点: {decreased} 个") print(f" - 基本不变的站点: {unchanged} 个") # 增加最多和减少最多的站点 print(f"\n 增加访问最多的5个站点:") top_increase = df.nlargest(5, 'k_diff')[['site_id', 'site_name', 'mu', 'visits_2019', 'k', 'k_diff']] for _, row in top_increase.iterrows(): print(f" {row['site_name'][:30]:<30}: 2019={row['visits_2019']}, 建议k={row['k']}, 差异={row['k_diff']:+.1f}") print(f"\n 减少访问最多的5个站点:") top_decrease = df.nsmallest(5, 'k_diff')[['site_id', 'site_name', 'mu', 'visits_2019', 'k', 'k_diff']] for _, row in top_decrease.iterrows(): print(f" {row['site_name'][:30]:<30}: 2019={row['visits_2019']}, 建议k={row['k']}, 差异={row['k_diff']:+.1f}") # 4. 分析3: 截断修正的合理性检验 print(f"\n[4] 分析3: 截断修正的合理性检验") corrected_sites = df[df['is_corrected']] uncorrected_sites = df[~df['is_corrected']] print(f" 被修正站点 ({len(corrected_sites)} 个):") print(f" - μ 均值: {corrected_sites['mu'].mean():.1f}") print(f" - σ 均值: {corrected_sites['sigma'].mean():.1f}") print(f" - CV (σ/μ) 均值: {(corrected_sites['sigma']/corrected_sites['mu']).mean():.3f}") print(f" 未修正站点 ({len(uncorrected_sites)} 个):") print(f" - μ 均值: {uncorrected_sites['mu'].mean():.1f}") print(f" - σ 均值: {uncorrected_sites['sigma'].mean():.1f}") print(f" - CV (σ/μ) 均值: {(uncorrected_sites['sigma']/uncorrected_sites['mu']).mean():.3f}") # 5. 分析4: 模型一致性 - k是否与μ̃成正比 print(f"\n[5] 分析4: 模型一致性检验 (k 应与 μ̃ 成正比)") # 理论上 k_i / μ̃_i 应该近似为常数 df['k_per_mu_tilde'] = df['k'] / df['mu_tilde'] cv_ratio = df['k_per_mu_tilde'].std() / df['k_per_mu_tilde'].mean() print(f" k / μ̃ 的统计:") print(f" - 均值: {df['k_per_mu_tilde'].mean():.4f}") print(f" - 标准差: {df['k_per_mu_tilde'].std():.4f}") print(f" - CV (变异系数): {cv_ratio:.4f}") if cv_ratio < 0.1: print(f" 解读: k 与 μ̃ 高度成正比 (CV < 0.1),模型内部一致") else: print(f" 解读: 存在一些偏差,主要来自整数化舍入") # 6. 分析5: 与2019方案的总服务量对比 print(f"\n[6] 分析5: 服务量对比") service_2019 = (df['visits_2019'] * df['mu']).sum() service_2019_scaled = (df['visits_2019_scaled'] * df['mu']).sum() service_model = (df['k'] * df['mu']).sum() print(f" 2019年实际服务量 (722次): {service_2019:.0f}") print(f" 2019年缩放服务量 (730次): {service_2019_scaled:.0f}") print(f" 模型建议服务量 (730次): {service_model:.0f}") print(f" 模型改进: {(service_model/service_2019_scaled - 1)*100:+.2f}%") # 7. 保存输出 print(f"\n[7] 保存输出: {OUTPUT_PATH}") with pd.ExcelWriter(OUTPUT_PATH, engine='openpyxl') as writer: # Sheet 1: 站点级对比 output_cols = ['site_id', 'site_name', 'mu', 'sigma', 'mu_tilde', 'is_corrected', 'visits_2019', 'visits_2019_scaled', 'k', 'k_diff', 'k_diff_pct', 'k_per_mu_tilde'] df[output_cols].to_excel(writer, sheet_name='site_comparison', index=False) # Sheet 2: 汇总指标 summary = pd.DataFrame([ {'metric': 'corr(visits_2019, μ)', 'value': corr_2019_mu[0], 'p_value': corr_2019_mu[1]}, {'metric': 'corr(visits_2019, μ̃)', 'value': corr_2019_mu_tilde[0], 'p_value': corr_2019_mu_tilde[1]}, {'metric': 'sites_increased', 'value': increased, 'p_value': None}, {'metric': 'sites_decreased', 'value': decreased, 'p_value': None}, {'metric': 'sites_unchanged', 'value': unchanged, 'p_value': None}, {'metric': 'CV(k/μ̃)', 'value': cv_ratio, 'p_value': None}, {'metric': 'service_2019_actual', 'value': service_2019, 'p_value': None}, {'metric': 'service_model', 'value': service_model, 'p_value': None}, {'metric': 'improvement_pct', 'value': (service_model/service_2019_scaled - 1)*100, 'p_value': None}, ]) summary.to_excel(writer, sheet_name='summary_metrics', index=False) # Sheet 3: 修正站点详情 corrected_sites[['site_id', 'site_name', 'mu', 'sigma', 'p_trunc', 'mu_tilde']].to_excel( writer, sheet_name='corrected_sites', index=False ) print(f" 已保存3个工作表") print("\n" + "=" * 60) print("Step 07 完成") print("=" * 60) return df if __name__ == "__main__": main()