2026-01-19 10:14:46 +08:00
|
|
|
|
"""
|
|
|
|
|
|
Step 01: Data cleaning and standardization

Input:  ../data.xlsx (raw data)
Output: 01_clean.xlsx (cleaned, standardized data)

What this step does:
1. Read the raw data.
2. Keep the valid columns and rename them to the standard field names.
3. Generate site_id (1-70).
4. Check for missing values and data quality.
|
|
|
|
|
|
"""
|
|
|
|
|
|
|
2026-01-19 01:40:19 +08:00
|
|
|
|
import pandas as pd
|
2026-01-19 10:14:46 +08:00
|
|
|
|
import numpy as np
|
|
|
|
|
|
from pathlib import Path
|
|
|
|
|
|
|
|
|
|
|
|
# Path configuration: the raw workbook is looked up one directory above this
# script, and the cleaned workbook is written next to the script.
_SCRIPT_DIR = Path(__file__).parent
INPUT_PATH = _SCRIPT_DIR.parent / "data.xlsx"
OUTPUT_PATH = _SCRIPT_DIR / "01_clean.xlsx"

# Column mapping: raw column name -> standard field name.
# Only the columns listed here are kept in the cleaned output.
COLUMN_MAPPING = {
    "Site Name": "site_name",
    "latitude": "lat",
    "longitude": "lon",
    "Number of Visits in 2019": "visits_2019",
    "Average Demand per Visit": "mu",
    "StDev(Demand per Visit)": "sigma",
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _select_and_rename(df_raw):
    """Return a new frame holding only the mapped columns, under standard names.

    Raises:
        KeyError: if ``df_raw`` lacks any column listed in ``COLUMN_MAPPING``.
    """
    # Check up front so the error names exactly which columns are absent,
    # instead of surfacing pandas' generic "not in index" message.
    absent = [col for col in COLUMN_MAPPING if col not in df_raw.columns]
    if absent:
        raise KeyError(
            f"raw data is missing required column(s) {absent}; "
            f"available columns: {list(df_raw.columns)}"
        )
    return df_raw[list(COLUMN_MAPPING)].rename(columns=COLUMN_MAPPING)


def _report_missing(df):
    """Print the per-column missing-value counts, or a note that there are none."""
    print(" 缺失值统计:")
    missing = df.isnull().sum()
    for col, count in missing.items():
        if count > 0:
            print(f" - {col}: {count} 个缺失值")
    if missing.sum() == 0:
        print(" - 无缺失值")


def _report_statistics(df):
    """Print summary statistics for the ``mu``, ``sigma`` and ``visits_2019`` columns."""
    mu = df['mu']
    sigma = df['sigma']
    visits = df['visits_2019']

    print("\n[5] 关键字段统计:")
    print(f" 站点数: {len(df)}")
    print(" μ (单次服务人数均值):")
    print(f" - 范围: [{mu.min():.1f}, {mu.max():.1f}]")
    print(f" - 均值: {mu.mean():.1f}")
    # NOTE(review): the meaning of the 250 threshold is not visible in this
    # file -- confirm with the downstream steps before changing it.
    print(f" - μ > 250 的站点数: {(mu > 250).sum()}")
    print(" σ (单次服务人数标准差):")
    print(f" - 范围: [{sigma.min():.1f}, {sigma.max():.1f}]")
    print(" 2019年访问次数:")
    print(f" - 总计: {visits.sum()}")
    print(f" - 范围: [{visits.min()}, {visits.max()}]")


def main():
    """Run step 01: load the raw workbook, standardize it, report, and save.

    Reads ``INPUT_PATH`` with ``pd.read_excel``, keeps only the columns named
    in ``COLUMN_MAPPING`` (renamed to their standard field names), prepends a
    1-based sequential ``site_id`` column, prints a missing-value report and
    summary statistics, writes the result to ``OUTPUT_PATH`` and prints a
    preview of the first rows.

    Returns:
        pandas.DataFrame: the cleaned table that was written to ``OUTPUT_PATH``.

    Raises:
        KeyError: if the raw workbook lacks any column listed in
            ``COLUMN_MAPPING``.
    """
    banner = "=" * 60
    print(banner)
    print("Step 01: 数据清洗与标准化")
    print(banner)

    # 1. Read the raw data.
    print(f"\n[1] 读取原始数据: {INPUT_PATH}")
    df_raw = pd.read_excel(INPUT_PATH)
    print(f" 原始数据: {df_raw.shape[0]} 行, {df_raw.shape[1]} 列")

    # 2. Select the mapped columns and rename them.
    print("\n[2] 选择有效列并重命名")
    df = _select_and_rename(df_raw)

    # 3. Generate site_id. The upper bound in the message is derived from the
    #    data rather than hard-coded, so the log stays truthful if the number
    #    of rows changes.
    print(f"\n[3] 生成 site_id (1-{len(df)})")
    df.insert(0, 'site_id', range(1, len(df) + 1))

    # 4. Data quality check (missing values).
    print("\n[4] 数据质量检查")
    _report_missing(df)

    # 5. Summary statistics for the key fields.
    _report_statistics(df)

    # 6. Save the cleaned table.
    print(f"\n[6] 保存输出: {OUTPUT_PATH}")
    df.to_excel(OUTPUT_PATH, index=False)
    print(f" 已保存 {len(df)} 条记录")

    # 7. Show the first five rows.
    print("\n[7] 输出数据预览 (前5行):")
    print(df.head().to_string(index=False))

    print("\n" + banner)
    print("Step 01 完成")
    print(banner)

    return df
|
2026-01-19 01:40:19 +08:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Run the cleaning step only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|