add: plots
This commit is contained in:
135
analyze_visits.py
Normal file
135
analyze_visits.py
Normal file
@@ -0,0 +1,135 @@
|
||||
"""
|
||||
Analysis: is the total number of visits determined by the average demand per visit?
Uses correlation analysis and regression analysis.
|
||||
"""
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from scipy import stats
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
# Load the source spreadsheet (the path is relative to the working directory).
df = pd.read_excel('prob/MFP Regular Sites 2019.xlsx')

# Pull out the three columns every later section works with.
visits, avg_demand, std_demand = (
    df[column_name]
    for column_name in (
        'Number of Visits in 2019',
        'Average Demand per Visit',
        'StDev(Demand per Visit)',
    )
)
|
||||
|
||||
# Descriptive statistics: sample size, then mean / standard deviation of the
# visit counts and of the average demand per visit.
rule = "=" * 60
print(rule, "数据基本统计", rule, sep="\n")
print(f"样本数量: {len(visits)}")

print("\n访问总次数:")
print(f" 均值: {visits.mean():.2f}, 标准差: {visits.std():.2f}")

print("\n每次访问平均需求量:")
print(f" 均值: {avg_demand.mean():.2f}, 标准差: {avg_demand.std():.2f}")
|
||||
|
||||
# 1. Pearson correlation between average demand per visit and visit count.
rule = "=" * 60
print("\n" + rule, "1. 皮尔逊相关系数分析", rule, sep="\n")

# r and p_value are reused by the summary and plotting sections further down.
r, p_value = stats.pearsonr(avg_demand, visits)
print(f"相关系数 r = {r:.4f}")
print(f"p值 = {p_value:.4e}")
print(f"决定系数 R² = {r**2:.4f} (可解释{r**2*100:.1f}%的变异)")

# Report significance at the 0.05 level.
print(
    "结论: p < 0.05, 相关性显著"
    if p_value < 0.05
    else "结论: p >= 0.05, 相关性不显著"
)
|
||||
|
||||
# 2. Simple linear regression: visit count ~ average demand per visit.
rule = "=" * 60
print("\n" + rule, "2. 线性回归分析 (访问次数 ~ 平均需求量)", rule, sep="\n")

# slope and intercept are reused later to draw the fitted line and residuals.
fit = stats.linregress(avg_demand, visits)
slope, intercept = fit.slope, fit.intercept
r_val, p_val, std_err = fit.rvalue, fit.pvalue, fit.stderr

print(f"回归方程: 访问次数 = {slope:.4f} × 平均需求量 + {intercept:.4f}")
print(f"斜率标准误: {std_err:.4f}")
print(f"p值: {p_val:.4e}")
|
||||
|
||||
# 3. Supplementary analysis based on the standard deviation of demand.
rule = "=" * 60
print("\n" + rule, "3. 标准差辅助分析", rule, sep="\n")

# Coefficient of variation (CV) = standard deviation / mean; it measures
# relative dispersion.
cv = std_demand / avg_demand
print("变异系数 (CV = 标准差/均值) 统计:")
print(f" 均值: {cv.mean():.4f}")
print(f" 范围: {cv.min():.4f} - {cv.max():.4f}")

# Correlate the standard deviation with the visit count, restricted to the
# rows where a standard deviation is actually present.
has_std = std_demand.notna()
r_std, p_std = stats.pearsonr(std_demand[has_std], visits[has_std])
print(f"\n标准差与访问次数的相关系数: r = {r_std:.4f}, p = {p_std:.4e}")
|
||||
|
||||
# 4. Multiple regression: (average demand, standard deviation) -> visit count.
rule = "=" * 60
print("\n" + rule, "4. 多元回归分析 (同时考虑平均需求量和标准差)", rule, sep="\n")

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

# Prepare the data: keep only rows whose standard deviation is not missing.
mask = std_demand.notna()
X = np.column_stack((avg_demand[mask], std_demand[mask]))
y = visits[mask]

model = LinearRegression().fit(X, y)
y_pred = model.predict(X)

# R² computed by hand: 1 - (residual sum of squares / total sum of squares).
residual = y - y_pred
centered = y - y.mean()
ss_res = np.sum(residual ** 2)
ss_tot = np.sum(centered ** 2)
r2_multi = 1 - ss_res / ss_tot

print(f"多元 R² = {r2_multi:.4f} (可解释{r2_multi*100:.1f}%的变异)")
print(f"系数: 平均需求量 = {model.coef_[0]:.4f}, 标准差 = {model.coef_[1]:.4f}")
print(f"截距: {model.intercept_:.4f}")
|
||||
|
||||
# 5. Summary of the findings.
rule = "=" * 60
print("\n" + rule, "综合结论", rule, sep="\n")

# Label the correlation strength from |r|: below 0.3 weak, below 0.7
# moderate, otherwise strong.
magnitude = abs(r)
strength = "弱" if magnitude < 0.3 else ("中等" if magnitude < 0.7 else "强")

# Sign of the correlation; anything that is not strictly positive is
# reported as negative.
if r > 0:
    direction = "正"
else:
    direction = "负"

print(f"• 平均需求量与访问次数呈{strength}{direction}相关 (r={r:.3f})")
print(f"• 平均需求量仅能解释访问次数{r**2*100:.1f}%的变异")
print(f"• 加入标准差后可解释{r2_multi*100:.1f}%的变异")

# Overall verdict: below 25% explained variance, average demand is reported
# as not being the main driver of the visit count.
print(
    "• 结论: 访问总次数主要不由每次访问平均需求量决定"
    if r**2 < 0.25
    else "• 结论: 每次访问平均需求量对访问总次数有较大影响"
)
|
||||
|
||||
# Plots: scatter with the fitted regression line (left) and residuals (right),
# saved to analysis_result.png.

# The titles, axis labels and legend below contain Chinese text. Matplotlib's
# default font has no CJK glyphs, so without this the text is drawn as empty
# boxes. Put common CJK-capable fonts first and keep the existing defaults as
# the fallback for machines that have none of them installed.
plt.rcParams['font.sans-serif'] = [
    'SimHei',
    'Microsoft YaHei',
    'PingFang SC',
    'Noto Sans CJK SC',
    'WenQuanYi Micro Hei',
    'Arial Unicode MS',
    *plt.rcParams['font.sans-serif'],
]
# Many CJK fonts lack the Unicode minus sign (U+2212); use the ASCII hyphen so
# negative tick labels (e.g. on the residual axis) still render.
plt.rcParams['axes.unicode_minus'] = False

fig, axes = plt.subplots(1, 2, figsize=(12, 5))
ax1, ax2 = axes

# Left panel: scatter plot + regression line from the simple linear fit.
ax1.scatter(avg_demand, visits, alpha=0.6, edgecolors='black', linewidth=0.5)
x_line = np.linspace(avg_demand.min(), avg_demand.max(), 100)
y_line = slope * x_line + intercept
ax1.plot(x_line, y_line, 'r-', linewidth=2, label=f'回归线 (R²={r**2:.3f})')
ax1.set_xlabel('Average Demand per Visit (每次访问平均需求量)')
ax1.set_ylabel('Number of Visits (访问总次数)')
ax1.set_title('访问次数 vs 平均需求量')
ax1.legend()
ax1.grid(True, alpha=0.3)

# Right panel: residuals (observed visits minus fitted values) against the
# predictor, with a dashed zero line for reference.
residuals = visits - (slope * avg_demand + intercept)
ax2.scatter(avg_demand, residuals, alpha=0.6, edgecolors='black', linewidth=0.5)
ax2.axhline(y=0, color='r', linestyle='--', linewidth=2)
ax2.set_xlabel('Average Demand per Visit (每次访问平均需求量)')
ax2.set_ylabel('Residuals (残差)')
ax2.set_title('残差分析')
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('analysis_result.png', dpi=150, bbox_inches='tight')
print("\n图表已保存至 analysis_result.png")
|
||||
Reference in New Issue
Block a user