Files
mcm-mfp/task1/01_clean.py
2026-01-19 10:14:46 +08:00

93 lines
2.7 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
Step 01: 数据清洗与标准化
输入: ../data.xlsx (原始数据)
输出: 01_clean.xlsx (清洗后的标准化数据)
功能:
1. 读取原始数据
2. 保留有效列并重命名为标准字段名
3. 生成 site_id (1-70)
4. 检查缺失值和数据质量
"""
import pandas as pd
import numpy as np
from pathlib import Path
# Path configuration: the raw workbook lives one directory above this script,
# and the cleaned workbook is written next to the script itself.
_SCRIPT_DIR = Path(__file__).parent
INPUT_PATH = _SCRIPT_DIR.parent / "data.xlsx"
OUTPUT_PATH = _SCRIPT_DIR / "01_clean.xlsx"

# Column mapping: raw column header -> standardized field name.
# Only the columns listed here are carried over into the cleaned table.
COLUMN_MAPPING = {
    "Site Name": "site_name",
    "latitude": "lat",
    "longitude": "lon",
    "Number of Visits in 2019": "visits_2019",
    "Average Demand per Visit": "mu",
    "StDev(Demand per Visit)": "sigma",
}
def main():
    """Clean the raw site workbook and write the standardized version.

    Reads ``INPUT_PATH``, keeps only the columns named in ``COLUMN_MAPPING``
    (renamed to their standard field names), prepends a 1-based ``site_id``
    column, prints a missing-value report and summary statistics, and saves
    the result to ``OUTPUT_PATH``.

    Returns:
        pandas.DataFrame: The cleaned table that was written to disk.

    Raises:
        KeyError: If the raw workbook lacks any column listed in
            ``COLUMN_MAPPING``.
    """
    banner = "=" * 60
    print(banner)
    print("Step 01: 数据清洗与标准化")
    print(banner)

    # 1. Load the raw data.
    print(f"\n[1] 读取原始数据: {INPUT_PATH}")
    df_raw = pd.read_excel(INPUT_PATH)
    n_rows, n_cols = df_raw.shape
    print(f" 原始数据: {n_rows} 行, {n_cols} 列")

    # 2. Select the mapped columns and rename them.
    print("\n[2] 选择有效列并重命名")
    required = list(COLUMN_MAPPING)
    absent = [name for name in required if name not in df_raw.columns]
    if absent:
        # Fail early with the exact column names instead of pandas' generic
        # "not in index" KeyError.
        raise KeyError(f"原始数据缺少必需的列: {absent}")
    df = df_raw[required].rename(columns=COLUMN_MAPPING)

    # 3. Generate site_id as a 1-based running index in row order.
    n_sites = len(df)
    print(f"\n[3] 生成 site_id (1-{n_sites})")
    df.insert(0, 'site_id', range(1, n_sites + 1))

    # 4. Data-quality check: report per-column missing-value counts.
    print("\n[4] 数据质量检查")
    print(" 缺失值统计:")
    missing = df.isnull().sum()
    incomplete = missing[missing > 0]
    for col, count in incomplete.items():
        print(f" - {col}: {count} 个缺失值")
    if incomplete.empty:
        print(" - 无缺失值")

    # 5. Summary statistics for the key fields.
    mu = df['mu']
    sigma = df['sigma']
    visits = df['visits_2019']
    print("\n[5] 关键字段统计:")
    print(f" 站点数: {n_sites}")
    print(" μ (单次服务人数均值):")
    print(f" - 范围: [{mu.min():.1f}, {mu.max():.1f}]")
    print(f" - 均值: {mu.mean():.1f}")
    print(f" - μ > 250 的站点数: {(mu > 250).sum()}")
    print(" σ (单次服务人数标准差):")
    print(f" - 范围: [{sigma.min():.1f}, {sigma.max():.1f}]")
    print(" 2019年访问次数:")
    print(f" - 总计: {visits.sum()}")
    print(f" - 范围: [{visits.min()}, {visits.max()}]")

    # 6. Save the cleaned table (without the pandas index column).
    print(f"\n[6] 保存输出: {OUTPUT_PATH}")
    df.to_excel(OUTPUT_PATH, index=False)
    print(f" 已保存 {n_sites} 条记录")

    # 7. Preview the first five rows.
    print("\n[7] 输出数据预览 (前5行):")
    print(df.head().to_string(index=False))

    print("\n" + banner)
    print("Step 01 完成")
    print(banner)
    return df
# Run the cleaning step only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()