Files
mcm-mfp/analyze_visits.py
2026-01-17 11:26:04 +08:00

136 lines
4.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
Analysis: is the total number of visits determined by the average demand per visit?

Uses correlation analysis and regression analysis.
"""
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
# Load the site data from the Excel workbook.
df = pd.read_excel('prob/MFP Regular Sites 2019.xlsx')

# Pull out the three columns the rest of the script works with.
visits, avg_demand, std_demand = (
    df['Number of Visits in 2019'],
    df['Average Demand per Visit'],
    df['StDev(Demand per Visit)'],
)

# Section banner followed by the sample size.
print("=" * 60, "数据基本统计", "=" * 60, sep="\n")
print(f"样本数量: {len(visits)}")

# Mean and standard deviation of the total visit count.
visits_mean, visits_sd = visits.mean(), visits.std()
print("\n访问总次数:")
print(f" 均值: {visits_mean:.2f}, 标准差: {visits_sd:.2f}")

# Mean and standard deviation of the average demand per visit.
demand_mean, demand_sd = avg_demand.mean(), avg_demand.std()
print("\n每次访问平均需求量:")
print(f" 均值: {demand_mean:.2f}, 标准差: {demand_sd:.2f}")
# 1. Pearson correlation between average demand per visit and visit count.
print("\n" + "=" * 60, "1. 皮尔逊相关系数分析", "=" * 60, sep="\n")

r, p_value = stats.pearsonr(avg_demand, visits)
r_squared = r ** 2  # coefficient of determination of the one-variable fit

print(f"相关系数 r = {r:.4f}")
print(f"p值 = {p_value:.4e}")
print(f"决定系数 R² = {r_squared:.4f} (可解释{r_squared * 100:.1f}%的变异)")

# Report significance at the 5% level.
print(
    "结论: p < 0.05, 相关性显著"
    if p_value < 0.05
    else "结论: p >= 0.05, 相关性不显著"
)
# 2. Simple linear regression of visit count on average demand per visit.
print("\n" + "=" * 60, "2. 线性回归分析 (访问次数 ~ 平均需求量)", "=" * 60, sep="\n")

# Read the fit by attribute name rather than by tuple position.
linear_fit = stats.linregress(avg_demand, visits)
slope, intercept = linear_fit.slope, linear_fit.intercept
r_val, p_val, std_err = linear_fit.rvalue, linear_fit.pvalue, linear_fit.stderr

print(f"回归方程: 访问次数 = {slope:.4f} × 平均需求量 + {intercept:.4f}")
print(f"斜率标准误: {std_err:.4f}")
print(f"p值: {p_val:.4e}")
# 3. Demand standard deviation as a supplementary analysis.
print("\n" + "=" * 60, "3. 标准差辅助分析", "=" * 60, sep="\n")

# Coefficient of variation (CV) = standard deviation / mean,
# a measure of relative dispersion.
cv = std_demand / avg_demand
cv_mean, cv_low, cv_high = cv.mean(), cv.min(), cv.max()
print("变异系数 (CV = 标准差/均值) 统计:")
print(f" 均值: {cv_mean:.4f}")
print(f" 范围: {cv_low:.4f} - {cv_high:.4f}")

# Correlation between the standard deviation and the visit count,
# restricted to the rows where a standard deviation is present.
has_std = std_demand.notna()
r_std, p_std = stats.pearsonr(std_demand[has_std], visits[has_std])
print(f"\n标准差与访问次数的相关系数: r = {r_std:.4f}, p = {p_std:.4e}")
# 4. Multiple regression (average demand + standard deviation -> visit count).
print("\n" + "=" * 60, "4. 多元回归分析 (同时考虑平均需求量和标准差)", "=" * 60, sep="\n")

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

# Prepare the data: keep only rows whose standard deviation is not missing.
mask = std_demand.notna()
X = np.column_stack((avg_demand[mask], std_demand[mask]))
y = visits[mask]

model = LinearRegression().fit(X, y)
y_pred = model.predict(X)

# R² computed by hand as 1 - SS_res / SS_tot.
residual = y - y_pred
deviation = y - y.mean()
ss_res = np.sum(residual ** 2)
ss_tot = np.sum(deviation ** 2)
r2_multi = 1 - ss_res / ss_tot

coef_avg, coef_std = model.coef_
print(f"多元 R² = {r2_multi:.4f} (可解释{r2_multi * 100:.1f}%的变异)")
print(f"系数: 平均需求量 = {coef_avg:.4f}, 标准差 = {coef_std:.4f}")
print(f"截距: {model.intercept_:.4f}")
# 5. Summary of the findings.
print("\n" + "=" * 60)
print("综合结论")
print("=" * 60)

# Classify the correlation by |r|: below 0.3 weak, below 0.7 moderate,
# otherwise strong. The weak and strong labels used to be empty strings,
# which made those two branches indistinguishable in the printed summary.
abs_r = abs(r)
if abs_r < 0.3:
    strength = "弱"
elif abs_r < 0.7:
    strength = "中等"
else:
    strength = "强"

# Sign of the relationship. Both alternatives used to be empty strings, so
# the summary never said whether the correlation was positive or negative.
direction = "正" if r > 0 else "负"

print(f"• 平均需求量与访问次数呈{strength}{direction}相关 (r={r:.3f})")
print(f"• 平均需求量仅能解释访问次数{r**2*100:.1f}%的变异")
print(f"• 加入标准差后可解释{r2_multi*100:.1f}%的变异")

# Overall verdict: average demand counts as a major driver only when it
# explains at least 25% of the variance in visit counts.
if r**2 < 0.25:
    print("• 结论: 访问总次数主要不由每次访问平均需求量决定")
else:
    print("• 结论: 每次访问平均需求量对访问总次数有较大影响")
# Plots: scatter with the fitted line (left) and residuals (right).
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
ax1, ax2 = axes

# Marker styling and x-axis label shared by both panels.
point_style = dict(alpha=0.6, edgecolors='black', linewidth=0.5)
x_label = 'Average Demand per Visit (每次访问平均需求量)'

# Left panel: scatter plot plus regression line.
ax1.scatter(avg_demand, visits, **point_style)
x_line = np.linspace(avg_demand.min(), avg_demand.max(), 100)
y_line = slope * x_line + intercept
ax1.plot(x_line, y_line, 'r-', linewidth=2, label=f'回归线 (R²={r**2:.3f})')
ax1.set_xlabel(x_label)
ax1.set_ylabel('Number of Visits (访问总次数)')
ax1.set_title('访问次数 vs 平均需求量')
ax1.legend()
ax1.grid(True, alpha=0.3)

# Right panel: residuals of the simple linear fit against the predictor.
fitted = slope * avg_demand + intercept
residuals = visits - fitted
ax2.scatter(avg_demand, residuals, **point_style)
ax2.axhline(y=0, color='r', linestyle='--', linewidth=2)
ax2.set_xlabel(x_label)
ax2.set_ylabel('Residuals (残差)')
ax2.set_title('残差分析')
ax2.grid(True, alpha=0.3)

# Lay out the panels and write the figure to disk.
plt.tight_layout()
plt.savefig('analysis_result.png', dpi=150, bbox_inches='tight')
print("\n图表已保存至 analysis_result.png")