P1: rebuild
This commit is contained in:
@@ -1,58 +1,92 @@
|
||||
"""
|
||||
Step 01: 数据清洗与标准化
|
||||
|
||||
输入: ../data.xlsx (原始数据)
|
||||
输出: 01_clean.xlsx (清洗后的标准化数据)
|
||||
|
||||
功能:
|
||||
1. 读取原始数据
|
||||
2. 保留有效列并重命名为标准字段名
|
||||
3. 生成 site_id (1-70)
|
||||
4. 检查缺失值和数据质量
|
||||
"""
|
||||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from pathlib import Path
|
||||
|
||||
# 路径配置
|
||||
INPUT_PATH = Path(__file__).parent.parent / "data.xlsx"
|
||||
OUTPUT_PATH = Path(__file__).parent / "01_clean.xlsx"
|
||||
|
||||
# 列名映射: 原始列名 -> 标准列名
|
||||
COLUMN_MAPPING = {
|
||||
'Site Name': 'site_name',
|
||||
'latitude': 'lat',
|
||||
'longitude': 'lon',
|
||||
'Number of Visits in 2019': 'visits_2019',
|
||||
'Average Demand per Visit': 'mu',
|
||||
'StDev(Demand per Visit)': 'sigma'
|
||||
}
|
||||
|
||||
|
||||
INPUT_XLSX = "data.xlsx"
|
||||
OUTPUT_XLSX = "task1/01_clean.xlsx"
|
||||
SHEET_NAME = "addresses2019 updated"
|
||||
def main():
|
||||
print("=" * 60)
|
||||
print("Step 01: 数据清洗与标准化")
|
||||
print("=" * 60)
|
||||
|
||||
# 1. 读取原始数据
|
||||
print(f"\n[1] 读取原始数据: {INPUT_PATH}")
|
||||
df_raw = pd.read_excel(INPUT_PATH)
|
||||
print(f" 原始数据: {df_raw.shape[0]} 行, {df_raw.shape[1]} 列")
|
||||
|
||||
def main() -> None:
|
||||
df_raw = pd.read_excel(INPUT_XLSX, sheet_name=SHEET_NAME)
|
||||
# 2. 选择并重命名列
|
||||
print(f"\n[2] 选择有效列并重命名")
|
||||
df = df_raw[list(COLUMN_MAPPING.keys())].copy()
|
||||
df = df.rename(columns=COLUMN_MAPPING)
|
||||
|
||||
required = [
|
||||
"Site Name",
|
||||
"latitude",
|
||||
"longitude",
|
||||
"Number of Visits in 2019",
|
||||
"Average Demand per Visit",
|
||||
"StDev(Demand per Visit)",
|
||||
]
|
||||
missing = [c for c in required if c not in df_raw.columns]
|
||||
if missing:
|
||||
raise ValueError(f"Missing required columns: {missing}")
|
||||
# 3. 生成 site_id
|
||||
print(f"\n[3] 生成 site_id (1-70)")
|
||||
df.insert(0, 'site_id', range(1, len(df) + 1))
|
||||
|
||||
df = df_raw[required].copy()
|
||||
df = df.rename(
|
||||
columns={
|
||||
"Site Name": "site_name",
|
||||
"latitude": "lat",
|
||||
"longitude": "lon",
|
||||
"Number of Visits in 2019": "visits_2019",
|
||||
"Average Demand per Visit": "mu_clients_per_visit",
|
||||
"StDev(Demand per Visit)": "sd_clients_per_visit",
|
||||
}
|
||||
)
|
||||
# 4. 数据质量检查
|
||||
print(f"\n[4] 数据质量检查")
|
||||
print(f" 缺失值统计:")
|
||||
missing = df.isnull().sum()
|
||||
for col, count in missing.items():
|
||||
if count > 0:
|
||||
print(f" - {col}: {count} 个缺失值")
|
||||
if missing.sum() == 0:
|
||||
print(f" - 无缺失值")
|
||||
|
||||
df.insert(0, "site_id", range(1, len(df) + 1))
|
||||
# 5. 数据统计摘要
|
||||
print(f"\n[5] 关键字段统计:")
|
||||
print(f" 站点数: {len(df)}")
|
||||
print(f" μ (单次服务人数均值):")
|
||||
print(f" - 范围: [{df['mu'].min():.1f}, {df['mu'].max():.1f}]")
|
||||
print(f" - 均值: {df['mu'].mean():.1f}")
|
||||
print(f" - μ > 250 的站点数: {(df['mu'] > 250).sum()}")
|
||||
print(f" σ (单次服务人数标准差):")
|
||||
print(f" - 范围: [{df['sigma'].min():.1f}, {df['sigma'].max():.1f}]")
|
||||
print(f" 2019年访问次数:")
|
||||
print(f" - 总计: {df['visits_2019'].sum()}")
|
||||
print(f" - 范围: [{df['visits_2019'].min()}, {df['visits_2019'].max()}]")
|
||||
|
||||
numeric_cols = ["lat", "lon", "visits_2019", "mu_clients_per_visit", "sd_clients_per_visit"]
|
||||
for col in numeric_cols:
|
||||
df[col] = pd.to_numeric(df[col], errors="coerce")
|
||||
# 6. 保存输出
|
||||
print(f"\n[6] 保存输出: {OUTPUT_PATH}")
|
||||
df.to_excel(OUTPUT_PATH, index=False)
|
||||
print(f" 已保存 {len(df)} 条记录")
|
||||
|
||||
if df["site_name"].isna().any():
|
||||
raise ValueError("Found missing site_name values.")
|
||||
if df[numeric_cols].isna().any().any():
|
||||
bad = df[df[numeric_cols].isna().any(axis=1)][["site_id", "site_name"] + numeric_cols]
|
||||
raise ValueError(f"Found missing numeric values:\n{bad}")
|
||||
if (df["mu_clients_per_visit"] < 0).any() or (df["sd_clients_per_visit"] < 0).any():
|
||||
raise ValueError("Found negative mu/sd values; expected nonnegative.")
|
||||
if (df["visits_2019"] <= 0).any():
|
||||
raise ValueError("Found non-positive visits_2019; expected >0 for all 70 regular sites.")
|
||||
# 7. 显示前5行
|
||||
print(f"\n[7] 输出数据预览 (前5行):")
|
||||
print(df.head().to_string(index=False))
|
||||
|
||||
with pd.ExcelWriter(OUTPUT_XLSX, engine="openpyxl") as writer:
|
||||
df.to_excel(writer, index=False, sheet_name="sites")
|
||||
print("\n" + "=" * 60)
|
||||
print("Step 01 完成")
|
||||
print("=" * 60)
|
||||
|
||||
return df
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
|
||||
Binary file not shown.
BIN
task1/02_demand.xlsx
Normal file
BIN
task1/02_demand.xlsx
Normal file
Binary file not shown.
140
task1/02_demand_correction.py
Normal file
140
task1/02_demand_correction.py
Normal file
@@ -0,0 +1,140 @@
|
||||
"""
|
||||
Step 02: 截断回归修正 - 估计真实需求
|
||||
|
||||
输入: 01_clean.xlsx
|
||||
输出: 02_demand.xlsx
|
||||
|
||||
功能:
|
||||
1. 识别被容量截断的高需求站点
|
||||
2. 使用截断正态模型修正,估计真实需求 μ̃
|
||||
3. 输出修正前后的对比
|
||||
|
||||
核心假设:
|
||||
- 观测到的 μ_i 是真实需求在容量 C 下的截断观测
|
||||
- 真实需求服从正态分布 N(μ̃_i, σ̃_i²)
|
||||
"""
|
||||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from scipy.stats import norm
|
||||
from pathlib import Path
|
||||
|
||||
# 路径配置
|
||||
INPUT_PATH = Path(__file__).parent / "01_clean.xlsx"
|
||||
OUTPUT_PATH = Path(__file__).parent / "02_demand.xlsx"
|
||||
|
||||
# 模型参数
|
||||
C = 400 # 有效容量上限 (基于 μ_max = 396.6)
|
||||
P_TRUNC_THRESHOLD = 0.02 # 截断概率阈值 (调低以捕获更多潜在截断站点)
|
||||
|
||||
|
||||
def truncation_correction(mu: float, sigma: float, C: float = 400) -> tuple:
|
||||
"""
|
||||
截断回归修正
|
||||
|
||||
Args:
|
||||
mu: 观测均值
|
||||
sigma: 观测标准差
|
||||
C: 有效容量上限
|
||||
|
||||
Returns:
|
||||
(mu_tilde, p_trunc, is_corrected)
|
||||
- mu_tilde: 修正后的真实需求估计
|
||||
- p_trunc: 截断概率
|
||||
- is_corrected: 是否进行了修正
|
||||
"""
|
||||
if sigma <= 0 or np.isnan(sigma):
|
||||
return mu, 0.0, False
|
||||
|
||||
# 计算截断概率: P(D > C)
|
||||
z = (C - mu) / sigma
|
||||
p_trunc = 1 - norm.cdf(z)
|
||||
|
||||
if p_trunc < P_TRUNC_THRESHOLD:
|
||||
# 截断概率低于阈值,视为未截断
|
||||
return mu, p_trunc, False
|
||||
else:
|
||||
# 截断修正: 使用 Mills ratio 近似
|
||||
# E[D | D > C] = μ + σ * φ(z) / (1 - Φ(z))
|
||||
# 修正后: μ̃ ≈ μ * (1 + α * p_trunc)
|
||||
# 这里使用简化的线性修正
|
||||
correction_factor = 1 + 0.4 * p_trunc
|
||||
mu_tilde = mu * correction_factor
|
||||
return mu_tilde, p_trunc, True
|
||||
|
||||
|
||||
def main():
|
||||
print("=" * 60)
|
||||
print("Step 02: 截断回归修正 - 估计真实需求")
|
||||
print("=" * 60)
|
||||
|
||||
# 1. 读取清洗后的数据
|
||||
print(f"\n[1] 读取输入: {INPUT_PATH}")
|
||||
df = pd.read_excel(INPUT_PATH)
|
||||
print(f" 读取 {len(df)} 条记录")
|
||||
|
||||
# 2. 显示参数
|
||||
print(f"\n[2] 模型参数:")
|
||||
print(f" 有效容量上限 C = {C}")
|
||||
print(f" 截断概率阈值 = {P_TRUNC_THRESHOLD}")
|
||||
|
||||
# 3. 应用截断修正
|
||||
print(f"\n[3] 应用截断修正...")
|
||||
results = []
|
||||
for _, row in df.iterrows():
|
||||
mu_tilde, p_trunc, is_corrected = truncation_correction(
|
||||
row['mu'], row['sigma'], C
|
||||
)
|
||||
results.append({
|
||||
'mu_tilde': mu_tilde,
|
||||
'p_trunc': p_trunc,
|
||||
'is_corrected': is_corrected
|
||||
})
|
||||
|
||||
df_result = pd.DataFrame(results)
|
||||
df['mu_tilde'] = df_result['mu_tilde']
|
||||
df['p_trunc'] = df_result['p_trunc']
|
||||
df['is_corrected'] = df_result['is_corrected']
|
||||
|
||||
# 4. 修正统计
|
||||
n_corrected = df['is_corrected'].sum()
|
||||
print(f"\n[4] 修正统计:")
|
||||
print(f" 被修正的站点数: {n_corrected} / {len(df)}")
|
||||
|
||||
if n_corrected > 0:
|
||||
corrected_sites = df[df['is_corrected']]
|
||||
print(f"\n 被修正站点详情:")
|
||||
print(f" {'site_id':<8} {'site_name':<35} {'μ':>8} {'σ':>8} {'p_trunc':>8} {'μ̃':>10} {'修正幅度':>10}")
|
||||
print(f" {'-'*8} {'-'*35} {'-'*8} {'-'*8} {'-'*8} {'-'*10} {'-'*10}")
|
||||
for _, row in corrected_sites.iterrows():
|
||||
correction_pct = (row['mu_tilde'] - row['mu']) / row['mu'] * 100
|
||||
print(f" {row['site_id']:<8} {row['site_name'][:35]:<35} {row['mu']:>8.1f} {row['sigma']:>8.1f} {row['p_trunc']:>8.3f} {row['mu_tilde']:>10.1f} {correction_pct:>9.1f}%")
|
||||
|
||||
# 5. 修正前后对比
|
||||
print(f"\n[5] 修正前后对比:")
|
||||
print(f" 修正前 μ 总和: {df['mu'].sum():.1f}")
|
||||
print(f" 修正后 μ̃ 总和: {df['mu_tilde'].sum():.1f}")
|
||||
print(f" 增幅: {(df['mu_tilde'].sum() / df['mu'].sum() - 1) * 100:.2f}%")
|
||||
|
||||
# 6. 保存输出
|
||||
print(f"\n[6] 保存输出: {OUTPUT_PATH}")
|
||||
# 选择输出列
|
||||
output_cols = ['site_id', 'site_name', 'lat', 'lon', 'visits_2019',
|
||||
'mu', 'sigma', 'mu_tilde', 'p_trunc', 'is_corrected']
|
||||
df[output_cols].to_excel(OUTPUT_PATH, index=False)
|
||||
print(f" 已保存 {len(df)} 条记录")
|
||||
|
||||
# 7. 输出预览
|
||||
print(f"\n[7] 输出数据预览 (μ 最高的10个站点):")
|
||||
top10 = df.nlargest(10, 'mu')[['site_id', 'site_name', 'mu', 'sigma', 'p_trunc', 'mu_tilde', 'is_corrected']]
|
||||
print(top10.to_string(index=False))
|
||||
|
||||
print("\n" + "=" * 60)
|
||||
print("Step 02 完成")
|
||||
print("=" * 60)
|
||||
|
||||
return df
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Binary file not shown.
@@ -1,59 +0,0 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import math
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
|
||||
INPUT_XLSX = "task1/01_clean.xlsx"
|
||||
OUTPUT_XLSX = "task1/02_neighbor.xlsx"
|
||||
|
||||
RHO_MILES_LIST = [10.0, 20.0, 30.0]
|
||||
|
||||
|
||||
def haversine_miles(lat1: float, lon1: float, lat2: float, lon2: float) -> float:
|
||||
r_miles = 3958.7613
|
||||
phi1 = math.radians(lat1)
|
||||
phi2 = math.radians(lat2)
|
||||
dphi = math.radians(lat2 - lat1)
|
||||
dlambda = math.radians(lon2 - lon1)
|
||||
|
||||
a = math.sin(dphi / 2.0) ** 2 + math.cos(phi1) * math.cos(phi2) * math.sin(dlambda / 2.0) ** 2
|
||||
c = 2.0 * math.atan2(math.sqrt(a), math.sqrt(1.0 - a))
|
||||
return r_miles * c
|
||||
|
||||
|
||||
def main() -> None:
|
||||
sites = pd.read_excel(INPUT_XLSX, sheet_name="sites")
|
||||
|
||||
lat = sites["lat"].to_numpy(float)
|
||||
lon = sites["lon"].to_numpy(float)
|
||||
mu = sites["mu_clients_per_visit"].to_numpy(float)
|
||||
|
||||
n = len(sites)
|
||||
dist = np.zeros((n, n), dtype=float)
|
||||
for i in range(n):
|
||||
for j in range(i + 1, n):
|
||||
d = haversine_miles(lat[i], lon[i], lat[j], lon[j])
|
||||
dist[i, j] = d
|
||||
dist[j, i] = d
|
||||
|
||||
out = sites.copy()
|
||||
out["neighbor_demand_mu_self"] = mu.copy()
|
||||
for rho in RHO_MILES_LIST:
|
||||
w = np.exp(-(dist**2) / (2.0 * rho * rho))
|
||||
# D_i(rho) = sum_j mu_j * exp(-dist(i,j)^2 / (2 rho^2))
|
||||
out[f"neighbor_demand_mu_rho_{int(rho)}mi"] = w @ mu
|
||||
|
||||
dist_df = pd.DataFrame(dist, columns=sites["site_id"].tolist())
|
||||
dist_df.insert(0, "site_id", sites["site_id"])
|
||||
|
||||
with pd.ExcelWriter(OUTPUT_XLSX, engine="openpyxl") as writer:
|
||||
out.to_excel(writer, index=False, sheet_name="sites")
|
||||
dist_df.to_excel(writer, index=False, sheet_name="dist_miles")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
155
task1/03_allocate.py
Normal file
155
task1/03_allocate.py
Normal file
@@ -0,0 +1,155 @@
|
||||
"""
|
||||
Step 03: 频次分配 - Hamilton最大余数法
|
||||
|
||||
输入: 02_demand.xlsx
|
||||
输出: 03_allocate.xlsx
|
||||
|
||||
功能:
|
||||
1. 按真实需求 μ̃ 比例分配年度访问次数 k_i
|
||||
2. 使用 Hamilton 方法保证整数分配且总和 = N
|
||||
3. 满足覆盖约束: k_i >= 1
|
||||
|
||||
分配原则:
|
||||
- 先给每个站点分配1次 (覆盖约束)
|
||||
- 剩余 N-70 次按 μ̃ 比例分配
|
||||
"""
|
||||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from pathlib import Path
|
||||
|
||||
# 路径配置
|
||||
INPUT_PATH = Path(__file__).parent / "02_demand.xlsx"
|
||||
OUTPUT_PATH = Path(__file__).parent / "03_allocate.xlsx"
|
||||
|
||||
# 分配参数
|
||||
N_TOTAL = 730 # 年度总访问次数 (365天 × 2站点/天)
|
||||
MIN_VISITS = 1 # 每站点最少访问次数 (覆盖约束)
|
||||
|
||||
|
||||
def hamilton_allocation(total: int, weights: list) -> list:
|
||||
"""
|
||||
Hamilton最大余数法整数分配
|
||||
|
||||
Args:
|
||||
total: 待分配总数
|
||||
weights: 各项权重
|
||||
|
||||
Returns:
|
||||
整数分配结果列表
|
||||
"""
|
||||
n = len(weights)
|
||||
w_sum = sum(weights)
|
||||
|
||||
# 连续配额
|
||||
quotas = [total * w / w_sum for w in weights]
|
||||
|
||||
# 下取整
|
||||
floors = [int(q) for q in quotas]
|
||||
remainders = [q - f for q, f in zip(quotas, floors)]
|
||||
|
||||
# 剩余席位按余数从大到小分配
|
||||
leftover = total - sum(floors)
|
||||
indices = sorted(range(n), key=lambda i: -remainders[i])
|
||||
for i in indices[:leftover]:
|
||||
floors[i] += 1
|
||||
|
||||
return floors
|
||||
|
||||
|
||||
def main():
|
||||
print("=" * 60)
|
||||
print("Step 03: 频次分配 - Hamilton最大余数法")
|
||||
print("=" * 60)
|
||||
|
||||
# 1. 读取需求修正后的数据
|
||||
print(f"\n[1] 读取输入: {INPUT_PATH}")
|
||||
df = pd.read_excel(INPUT_PATH)
|
||||
print(f" 读取 {len(df)} 条记录")
|
||||
|
||||
n_sites = len(df)
|
||||
|
||||
# 2. 显示参数
|
||||
print(f"\n[2] 分配参数:")
|
||||
print(f" 年度总访问次数 N = {N_TOTAL}")
|
||||
print(f" 站点数 = {n_sites}")
|
||||
print(f" 覆盖约束: 每站点至少 {MIN_VISITS} 次")
|
||||
print(f" 剩余可分配次数 = {N_TOTAL} - {n_sites} × {MIN_VISITS} = {N_TOTAL - n_sites * MIN_VISITS}")
|
||||
|
||||
# 3. Hamilton分配
|
||||
print(f"\n[3] 执行Hamilton分配...")
|
||||
|
||||
# 权重 = 修正后的真实需求 μ̃
|
||||
weights = df['mu_tilde'].tolist()
|
||||
|
||||
# 分配剩余次数
|
||||
extra_visits = N_TOTAL - n_sites * MIN_VISITS
|
||||
k_extra = hamilton_allocation(extra_visits, weights)
|
||||
|
||||
# 总访问次数 = 基础 + 额外
|
||||
df['k'] = [MIN_VISITS + ke for ke in k_extra]
|
||||
|
||||
# 4. 验证分配结果
|
||||
print(f"\n[4] 分配结果验证:")
|
||||
print(f" 总访问次数: Σk_i = {df['k'].sum()} (应为 {N_TOTAL})")
|
||||
print(f" 最小访问次数: min(k_i) = {df['k'].min()} (应 >= {MIN_VISITS})")
|
||||
print(f" 最大访问次数: max(k_i) = {df['k'].max()}")
|
||||
print(f" 访问次数范围: [{df['k'].min()}, {df['k'].max()}]")
|
||||
|
||||
assert df['k'].sum() == N_TOTAL, f"总访问次数不等于{N_TOTAL}"
|
||||
assert df['k'].min() >= MIN_VISITS, f"存在站点访问次数少于{MIN_VISITS}"
|
||||
|
||||
# 5. 计算满足率
|
||||
# r_i = k_i * μ_i / μ̃_i (年度服务量 / 真实需求)
|
||||
df['annual_service'] = df['k'] * df['mu'] # 年度预期服务量
|
||||
df['r'] = df['annual_service'] / df['mu_tilde'] # 满足率代理
|
||||
|
||||
print(f"\n[5] 满足率统计:")
|
||||
print(f" 满足率 r = k × μ / μ̃")
|
||||
print(f" r 均值: {df['r'].mean():.2f}")
|
||||
print(f" r 标准差: {df['r'].std():.2f}")
|
||||
print(f" r 范围: [{df['r'].min():.2f}, {df['r'].max():.2f}]")
|
||||
print(f" r 变异系数: {df['r'].std() / df['r'].mean():.4f}")
|
||||
|
||||
# 6. 分配结果分布
|
||||
print(f"\n[6] 访问次数分布:")
|
||||
k_counts = df['k'].value_counts().sort_index()
|
||||
for k_val, count in k_counts.items():
|
||||
print(f" k = {k_val:2d}: {count:2d} 个站点 {'█' * count}")
|
||||
|
||||
# 7. 与2019年对比
|
||||
print(f"\n[7] 与2019年访问次数对比:")
|
||||
df['k_2019_scaled'] = df['visits_2019'] * N_TOTAL / df['visits_2019'].sum()
|
||||
df['k_diff'] = df['k'] - df['k_2019_scaled']
|
||||
print(f" 2019年总访问次数: {df['visits_2019'].sum()}")
|
||||
print(f" 2019年缩放后总次数: {df['k_2019_scaled'].sum():.1f}")
|
||||
print(f" 新方案 vs 2019缩放:")
|
||||
print(f" - 增加访问的站点: {(df['k_diff'] > 0.5).sum()} 个")
|
||||
print(f" - 减少访问的站点: {(df['k_diff'] < -0.5).sum()} 个")
|
||||
print(f" - 基本不变的站点: {((df['k_diff'] >= -0.5) & (df['k_diff'] <= 0.5)).sum()} 个")
|
||||
|
||||
# 8. 保存输出
|
||||
print(f"\n[8] 保存输出: {OUTPUT_PATH}")
|
||||
output_cols = ['site_id', 'site_name', 'lat', 'lon', 'visits_2019',
|
||||
'mu', 'sigma', 'mu_tilde', 'k', 'annual_service', 'r']
|
||||
df[output_cols].to_excel(OUTPUT_PATH, index=False)
|
||||
print(f" 已保存 {len(df)} 条记录")
|
||||
|
||||
# 9. 输出预览
|
||||
print(f"\n[9] 分配结果预览 (k 最高的10个站点):")
|
||||
top10 = df.nlargest(10, 'k')[['site_id', 'site_name', 'mu', 'mu_tilde', 'k', 'annual_service', 'r']]
|
||||
print(top10.to_string(index=False))
|
||||
|
||||
print(f"\n 分配结果预览 (k 最低的10个站点):")
|
||||
bottom10 = df.nsmallest(10, 'k')[['site_id', 'site_name', 'mu', 'mu_tilde', 'k', 'annual_service', 'r']]
|
||||
print(bottom10.to_string(index=False))
|
||||
|
||||
print("\n" + "=" * 60)
|
||||
print("Step 03 完成")
|
||||
print("=" * 60)
|
||||
|
||||
return df
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Binary file not shown.
@@ -1,301 +0,0 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import math
|
||||
from dataclasses import dataclass
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
|
||||
INPUT_XLSX = "task1/02_neighbor.xlsx"
|
||||
OUTPUT_XLSX = "task1/03_allocate.xlsx"
|
||||
|
||||
N_TOTAL_2021 = 730 # scenario B: 2 sites/day * 365 days
|
||||
K_MIN = 1
|
||||
|
||||
RHO_COLS = [
|
||||
("self", "neighbor_demand_mu_self"),
|
||||
("rho10", "neighbor_demand_mu_rho_10mi"),
|
||||
("rho20", "neighbor_demand_mu_rho_20mi"),
|
||||
("rho30", "neighbor_demand_mu_rho_30mi"),
|
||||
]
|
||||
|
||||
|
||||
def gini(x: np.ndarray) -> float:
|
||||
x = np.asarray(x, dtype=float)
|
||||
if np.any(x < 0):
|
||||
raise ValueError("Gini expects nonnegative values.")
|
||||
if np.allclose(x.sum(), 0.0):
|
||||
return 0.0
|
||||
n = len(x)
|
||||
diff_sum = np.abs(x.reshape(-1, 1) - x.reshape(1, -1)).sum()
|
||||
return diff_sum / (2.0 * n * n * x.mean())
|
||||
|
||||
|
||||
def largest_remainder_allocation(weights: np.ndarray, total: int, k_min: int) -> np.ndarray:
|
||||
if total < k_min * len(weights):
|
||||
raise ValueError("total too small for k_min constraint.")
|
||||
w = np.asarray(weights, dtype=float)
|
||||
if np.any(w < 0):
|
||||
raise ValueError("weights must be nonnegative.")
|
||||
if np.allclose(w.sum(), 0.0):
|
||||
# Purely equal split
|
||||
base = np.full(len(w), total // len(w), dtype=int)
|
||||
base[: total - base.sum()] += 1
|
||||
return np.maximum(base, k_min)
|
||||
|
||||
remaining = total - k_min * len(w)
|
||||
w_norm = w / w.sum()
|
||||
raw = remaining * w_norm
|
||||
flo = np.floor(raw).astype(int)
|
||||
k = flo + k_min
|
||||
|
||||
need = total - k.sum()
|
||||
if need > 0:
|
||||
frac = raw - flo
|
||||
order = np.argsort(-frac, kind="stable")
|
||||
k[order[:need]] += 1
|
||||
elif need < 0:
|
||||
frac = raw - flo
|
||||
order = np.argsort(frac, kind="stable")
|
||||
to_remove = -need
|
||||
for idx in order:
|
||||
if to_remove == 0:
|
||||
break
|
||||
if k[idx] > k_min:
|
||||
k[idx] -= 1
|
||||
to_remove -= 1
|
||||
if k.sum() != total:
|
||||
raise RuntimeError("Failed to adjust allocation to exact total.")
|
||||
|
||||
if k.sum() != total:
|
||||
raise RuntimeError("Allocation did not match total.")
|
||||
if (k < k_min).any():
|
||||
raise RuntimeError("Allocation violated k_min.")
|
||||
return k
|
||||
|
||||
|
||||
def feasibility_sum_for_t(t: float, d: np.ndarray, mu: np.ndarray, k_min: int) -> int:
|
||||
# Constraints: k_i >= max(k_min, ceil(t * D_i / mu_i))
|
||||
if t < 0:
|
||||
return math.inf
|
||||
req = np.ceil((t * d) / mu).astype(int)
|
||||
req = np.maximum(req, k_min)
|
||||
return int(req.sum())
|
||||
|
||||
|
||||
def max_min_service_level(d: np.ndarray, mu: np.ndarray, n_total: int, k_min: int) -> tuple[float, np.ndarray]:
|
||||
if (mu <= 0).any():
|
||||
raise ValueError("mu must be positive to define service level.")
|
||||
if (d <= 0).any():
|
||||
raise ValueError("neighbor demand proxy D must be positive.")
|
||||
|
||||
# Upper bound for t: if all visits assigned to best site, service level there is (n_total*mu_i/d_i).
|
||||
# For max-min, t cannot exceed min_i (n_total*mu_i/d_i) but k_min can also bind; use conservative.
|
||||
t_hi = float(np.min((n_total * mu) / d))
|
||||
t_lo = 0.0
|
||||
|
||||
# Binary search for max feasible t
|
||||
for _ in range(60):
|
||||
t_mid = (t_lo + t_hi) / 2.0
|
||||
s = feasibility_sum_for_t(t_mid, d=d, mu=mu, k_min=k_min)
|
||||
if s <= n_total:
|
||||
t_lo = t_mid
|
||||
else:
|
||||
t_hi = t_mid
|
||||
|
||||
t_star = t_lo
|
||||
k_base = np.ceil((t_star * d) / mu).astype(int)
|
||||
k_base = np.maximum(k_base, k_min)
|
||||
|
||||
if k_base.sum() > n_total:
|
||||
# Numerical edge case: back off to ensure feasibility
|
||||
t_star *= 0.999
|
||||
k_base = np.ceil((t_star * d) / mu).astype(int)
|
||||
k_base = np.maximum(k_base, k_min)
|
||||
if k_base.sum() > n_total:
|
||||
raise RuntimeError("Failed to construct feasible k at t*.")
|
||||
|
||||
return t_star, k_base
|
||||
|
||||
|
||||
def distribute_remaining_fair(k: np.ndarray, mu: np.ndarray, d: np.ndarray, remaining: int) -> np.ndarray:
|
||||
# Greedy: repeatedly allocate to smallest current s_i = k_i*mu_i/d_i
|
||||
k_out = k.copy()
|
||||
for _ in range(remaining):
|
||||
s = (k_out * mu) / d
|
||||
idx = int(np.argmin(s))
|
||||
k_out[idx] += 1
|
||||
return k_out
|
||||
|
||||
|
||||
def distribute_remaining_efficient(k: np.ndarray, mu: np.ndarray, remaining: int) -> np.ndarray:
|
||||
# Greedy: allocate to highest mu_i (maximizes sum k_i*mu_i)
|
||||
k_out = k.copy()
|
||||
order = np.argsort(-mu, kind="stable")
|
||||
for t in range(remaining):
|
||||
k_out[order[t % len(order)]] += 1
|
||||
return k_out
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class AllocationResult:
|
||||
method: str
|
||||
scenario: str
|
||||
k: np.ndarray
|
||||
t_star: float | None
|
||||
|
||||
|
||||
def compute_metrics(k: np.ndarray, mu: np.ndarray, d: np.ndarray) -> dict[str, float]:
|
||||
s = (k * mu) / d
|
||||
return {
|
||||
"k_sum": float(k.sum()),
|
||||
"total_expected_clients": float(np.dot(k, mu)),
|
||||
"service_level_min": float(s.min()),
|
||||
"service_level_mean": float(s.mean()),
|
||||
"service_level_gini": float(gini(s)),
|
||||
"service_level_cv": float(s.std(ddof=0) / (s.mean() + 1e-12)),
|
||||
"k_gini": float(gini(k.astype(float))),
|
||||
}
|
||||
|
||||
|
||||
def improve_efficiency_under_gini_cap(
|
||||
*,
|
||||
k_start: np.ndarray,
|
||||
mu: np.ndarray,
|
||||
d: np.ndarray,
|
||||
n_total: int,
|
||||
k_min: int,
|
||||
gini_cap: float,
|
||||
max_iters: int = 20000,
|
||||
) -> np.ndarray:
|
||||
if k_start.sum() != n_total:
|
||||
raise ValueError("k_start must sum to n_total.")
|
||||
if (k_start < k_min).any():
|
||||
raise ValueError("k_start violates k_min.")
|
||||
|
||||
k = k_start.copy()
|
||||
s = (k * mu) / d
|
||||
g = gini(s)
|
||||
if g <= gini_cap:
|
||||
return k
|
||||
|
||||
for _ in range(max_iters):
|
||||
s = (k * mu) / d
|
||||
g = gini(s)
|
||||
if g <= gini_cap:
|
||||
break
|
||||
|
||||
donor_candidates = np.argsort(-s, kind="stable")[:10]
|
||||
receiver_candidates = np.argsort(s, kind="stable")[:10]
|
||||
|
||||
best_move = None
|
||||
best_score = None
|
||||
|
||||
for donor in donor_candidates:
|
||||
if k[donor] <= k_min:
|
||||
continue
|
||||
for receiver in receiver_candidates:
|
||||
if donor == receiver:
|
||||
continue
|
||||
|
||||
k2 = k.copy()
|
||||
k2[donor] -= 1
|
||||
k2[receiver] += 1
|
||||
s2 = (k2 * mu) / d
|
||||
g2 = gini(s2)
|
||||
if g2 >= g:
|
||||
continue
|
||||
|
||||
eff_loss = mu[donor] - mu[receiver]
|
||||
if eff_loss < 0:
|
||||
# This move increases effectiveness and improves fairness; prefer strongly.
|
||||
score = (g - g2) * 1e9 + (-eff_loss)
|
||||
else:
|
||||
score = (g - g2) / (eff_loss + 1e-9)
|
||||
|
||||
if best_score is None or score > best_score:
|
||||
best_score = score
|
||||
best_move = (donor, receiver, k2)
|
||||
|
||||
if best_move is None:
|
||||
# No improving local swap found; stop.
|
||||
break
|
||||
k = best_move[2]
|
||||
|
||||
if k.sum() != n_total or (k < k_min).any():
|
||||
raise RuntimeError("Post-optimization allocation violated constraints.")
|
||||
return k
|
||||
|
||||
|
||||
def main() -> None:
|
||||
sites = pd.read_excel(INPUT_XLSX, sheet_name="sites")
|
||||
mu = sites["mu_clients_per_visit"].to_numpy(float)
|
||||
visits_2019 = sites["visits_2019"].to_numpy(int)
|
||||
|
||||
results: list[AllocationResult] = []
|
||||
metrics_rows: list[dict[str, object]] = []
|
||||
|
||||
for scenario, dcol in RHO_COLS:
|
||||
d = sites[dcol].to_numpy(float)
|
||||
|
||||
# Baseline: scaled 2019 to sum to N_TOTAL_2021
|
||||
k_2019_scaled = largest_remainder_allocation(
|
||||
weights=visits_2019.astype(float), total=N_TOTAL_2021, k_min=K_MIN
|
||||
)
|
||||
results.append(AllocationResult(method="baseline_2019_scaled", scenario=scenario, k=k_2019_scaled, t_star=None))
|
||||
|
||||
# Max-min baseline k achieving best possible min service level under sum constraint.
|
||||
t_star, k_base = max_min_service_level(d=d, mu=mu, n_total=N_TOTAL_2021, k_min=K_MIN)
|
||||
remaining = N_TOTAL_2021 - int(k_base.sum())
|
||||
|
||||
k_fair = distribute_remaining_fair(k_base, mu=mu, d=d, remaining=remaining)
|
||||
results.append(AllocationResult(method="fairness_waterfill", scenario=scenario, k=k_fair, t_star=t_star))
|
||||
|
||||
k_eff = distribute_remaining_efficient(k_base, mu=mu, remaining=remaining)
|
||||
results.append(AllocationResult(method="efficiency_post_fair", scenario=scenario, k=k_eff, t_star=t_star))
|
||||
|
||||
# Proportional to surrounding demand proxy D (uses "surrounding communities demand" directly)
|
||||
k_D = largest_remainder_allocation(weights=d, total=N_TOTAL_2021, k_min=K_MIN)
|
||||
results.append(AllocationResult(method="proportional_D", scenario=scenario, k=k_D, t_star=None))
|
||||
|
||||
# Simple proportional fairness: k proportional to D/mu gives near-constant s in continuous relaxation.
|
||||
k_prop = largest_remainder_allocation(weights=d / mu, total=N_TOTAL_2021, k_min=K_MIN)
|
||||
results.append(AllocationResult(method="proportional_D_over_mu", scenario=scenario, k=k_prop, t_star=None))
|
||||
|
||||
# Pure efficiency: k proportional to mu (with k_min)
|
||||
k_mu = largest_remainder_allocation(weights=mu, total=N_TOTAL_2021, k_min=K_MIN)
|
||||
results.append(AllocationResult(method="proportional_mu", scenario=scenario, k=k_mu, t_star=None))
|
||||
|
||||
# Efficiency-maximizing subject to "no worse fairness than baseline_2019_scaled" (auditable cap)
|
||||
gini_cap = compute_metrics(k_2019_scaled, mu=mu, d=d)["service_level_gini"]
|
||||
k_mu_capped = improve_efficiency_under_gini_cap(
|
||||
k_start=k_mu, mu=mu, d=d, n_total=N_TOTAL_2021, k_min=K_MIN, gini_cap=float(gini_cap)
|
||||
)
|
||||
results.append(AllocationResult(method="proportional_mu_gini_capped_to_2019", scenario=scenario, k=k_mu_capped, t_star=None))
|
||||
|
||||
# Assemble per-site output
|
||||
out_sites = sites[["site_id", "site_name", "lat", "lon", "visits_2019", "mu_clients_per_visit", "sd_clients_per_visit"]].copy()
|
||||
|
||||
for res in results:
|
||||
col_k = f"k_{res.method}_{res.scenario}"
|
||||
out_sites[col_k] = res.k
|
||||
|
||||
d = sites[[c for s, c in RHO_COLS if s == res.scenario][0]].to_numpy(float)
|
||||
s_vals = (res.k * mu) / d
|
||||
out_sites[f"s_{res.method}_{res.scenario}"] = s_vals
|
||||
|
||||
m = compute_metrics(res.k, mu=mu, d=d)
|
||||
row = {"scenario": res.scenario, "method": res.method, "t_star": res.t_star}
|
||||
row.update(m)
|
||||
metrics_rows.append(row)
|
||||
|
||||
metrics_df = pd.DataFrame(metrics_rows).sort_values(["scenario", "method"]).reset_index(drop=True)
|
||||
|
||||
with pd.ExcelWriter(OUTPUT_XLSX, engine="openpyxl") as writer:
|
||||
out_sites.to_excel(writer, index=False, sheet_name="allocations")
|
||||
metrics_df.to_excel(writer, index=False, sheet_name="metrics")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
233
task1/04_evaluate.py
Normal file
233
task1/04_evaluate.py
Normal file
@@ -0,0 +1,233 @@
|
||||
"""
|
||||
Step 04: 评估指标计算
|
||||
|
||||
输入: 03_allocate.xlsx
|
||||
输出: 04_metrics.xlsx
|
||||
|
||||
功能:
|
||||
1. 计算有效性指标 (E1, E2)
|
||||
2. 计算公平性指标 (F1, F2, F3)
|
||||
3. 与基线方案对比
|
||||
4. 输出综合评估报告
|
||||
|
||||
指标定义:
|
||||
- E1: 原始总服务量 = Σ k_i × μ_i
|
||||
- E2: 质量加权服务量 = Σ k_i × μ_i × q(μ_i), q(μ) = min(1, 250/μ)
|
||||
- F1: 满足率基尼系数 Gini(r)
|
||||
- F2: 最差站点满足率 min(r)
|
||||
- F3: 满足率变异系数 CV(r)
|
||||
"""
|
||||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from pathlib import Path
|
||||
|
||||
# 路径配置
|
||||
INPUT_PATH = Path(__file__).parent / "03_allocate.xlsx"
|
||||
OUTPUT_PATH = Path(__file__).parent / "04_metrics.xlsx"
|
||||
|
||||
# 评估参数
|
||||
C_BAR = 250 # 典型服务量 (质量折扣阈值)
|
||||
|
||||
|
||||
def quality_discount(mu: float, c_bar: float = C_BAR) -> float:
|
||||
"""
|
||||
质量折扣因子: 当 μ > c_bar 时,服务质量打折
|
||||
|
||||
q(μ) = min(1, c_bar / μ)
|
||||
"""
|
||||
return min(1.0, c_bar / mu) if mu > 0 else 1.0
|
||||
|
||||
|
||||
def gini_coefficient(values: list) -> float:
|
||||
"""
|
||||
计算基尼系数
|
||||
|
||||
Gini = Σ_i Σ_j |x_i - x_j| / (2n Σ_i x_i)
|
||||
"""
|
||||
values = np.array(values)
|
||||
n = len(values)
|
||||
if n == 0 or values.sum() == 0:
|
||||
return 0.0
|
||||
|
||||
# 排序后计算
|
||||
sorted_values = np.sort(values)
|
||||
cumsum = np.cumsum(sorted_values)
|
||||
return (2 * np.sum((np.arange(1, n + 1) * sorted_values)) - (n + 1) * cumsum[-1]) / (n * cumsum[-1])
|
||||
|
||||
|
||||
def compute_metrics(df: pd.DataFrame, method_name: str) -> dict:
|
||||
"""计算单个方案的所有指标"""
|
||||
# 有效性指标
|
||||
E1 = (df['k'] * df['mu']).sum()
|
||||
E2 = (df['k'] * df['mu'] * df['mu'].apply(quality_discount)).sum()
|
||||
|
||||
# 满足率
|
||||
r = df['k'] * df['mu'] / df['mu_tilde']
|
||||
|
||||
# 公平性指标
|
||||
F1 = gini_coefficient(r.tolist())
|
||||
F2 = r.min()
|
||||
F3 = r.std() / r.mean()
|
||||
|
||||
return {
|
||||
'method': method_name,
|
||||
'E1_total_service': E1,
|
||||
'E2_quality_weighted': E2,
|
||||
'F1_gini': F1,
|
||||
'F2_min_satisfaction': F2,
|
||||
'F3_cv_satisfaction': F3
|
||||
}
|
||||
|
||||
|
||||
def compute_baseline_uniform(df: pd.DataFrame, n_total: int = 730) -> pd.DataFrame:
|
||||
"""基线1: 均匀分配"""
|
||||
df_baseline = df.copy()
|
||||
k_uniform = n_total // len(df)
|
||||
remainder = n_total - k_uniform * len(df)
|
||||
df_baseline['k'] = k_uniform
|
||||
# 将余数分配给前 remainder 个站点
|
||||
df_baseline.iloc[:remainder, df_baseline.columns.get_loc('k')] += 1
|
||||
return df_baseline
|
||||
|
||||
|
||||
def compute_baseline_2019_scaled(df: pd.DataFrame, n_total: int = 730) -> pd.DataFrame:
|
||||
"""基线2: 2019年访问次数等比例缩放"""
|
||||
df_baseline = df.copy()
|
||||
total_2019 = df['visits_2019'].sum()
|
||||
# 按比例缩放
|
||||
k_float = df['visits_2019'] * n_total / total_2019
|
||||
k_floor = k_float.astype(int)
|
||||
remainder = n_total - k_floor.sum()
|
||||
# 按余数分配
|
||||
remainders = k_float - k_floor
|
||||
indices = remainders.nlargest(remainder).index
|
||||
k_floor.loc[indices] += 1
|
||||
df_baseline['k'] = k_floor
|
||||
return df_baseline
|
||||
|
||||
|
||||
def compute_baseline_mu_raw(df: pd.DataFrame, n_total: int = 730) -> pd.DataFrame:
|
||||
"""基线3: 按原始μ比例分配 (无截断修正)"""
|
||||
df_baseline = df.copy()
|
||||
# 覆盖约束
|
||||
k_base = 1
|
||||
extra = n_total - len(df) * k_base
|
||||
# 按μ比例分配额外次数
|
||||
weights = df['mu'].tolist()
|
||||
w_sum = sum(weights)
|
||||
quotas = [extra * w / w_sum for w in weights]
|
||||
floors = [int(q) for q in quotas]
|
||||
remainders = [q - f for q, f in zip(quotas, floors)]
|
||||
leftover = extra - sum(floors)
|
||||
indices = sorted(range(len(weights)), key=lambda i: -remainders[i])
|
||||
for i in indices[:leftover]:
|
||||
floors[i] += 1
|
||||
df_baseline['k'] = [k_base + f for f in floors]
|
||||
return df_baseline
|
||||
|
||||
|
||||
def main():
    """Step 04 driver: compute effectiveness/fairness metrics for the
    recommended allocation and three baselines, print a comparison, and
    write the results to the step-04 workbook.

    Reads the allocation workbook at INPUT_PATH; writes OUTPUT_PATH.
    Relies on sibling helpers compute_metrics / quality_discount and the
    module constants C_BAR, INPUT_PATH, OUTPUT_PATH (defined elsewhere
    in this file). Returns the metrics summary DataFrame.
    """
    print("=" * 60)
    print("Step 04: 评估指标计算")
    print("=" * 60)

    # 1. Load the allocation results produced by step 03.
    print(f"\n[1] 读取输入: {INPUT_PATH}")
    df = pd.read_excel(INPUT_PATH)
    print(f"  读取 {len(df)} 条记录")

    # 2. Metrics for the recommended plan (proportional to corrected μ̃).
    print(f"\n[2] 计算推荐方案指标 (按μ̃比例分配)")
    metrics_recommended = compute_metrics(df, 'Recommended (μ̃ proportional)')
    for key, value in metrics_recommended.items():
        if key != 'method':
            print(f"  {key}: {value:.4f}")

    # 3. Metrics for the three baseline plans.
    print(f"\n[3] 计算基线方案指标")

    # Baseline 1: uniform allocation across all sites.
    df_b1 = compute_baseline_uniform(df)
    metrics_b1 = compute_metrics(df_b1, 'Baseline 1: Uniform')
    print(f"\n  基线1 (均匀分配, k={df_b1['k'].iloc[0]}~{df_b1['k'].max()}):")
    for key, value in metrics_b1.items():
        if key != 'method':
            print(f"  {key}: {value:.4f}")

    # Baseline 2: 2019 visit counts rescaled to the 2021 budget.
    df_b2 = compute_baseline_2019_scaled(df)
    metrics_b2 = compute_metrics(df_b2, 'Baseline 2: 2019 Scaled')
    print(f"\n  基线2 (2019年缩放):")
    for key, value in metrics_b2.items():
        if key != 'method':
            print(f"  {key}: {value:.4f}")

    # Baseline 3: proportional to raw μ, without truncation correction.
    df_b3 = compute_baseline_mu_raw(df)
    metrics_b3 = compute_metrics(df_b3, 'Baseline 3: μ proportional (no correction)')
    print(f"\n  基线3 (按原始μ比例, 无截断修正):")
    for key, value in metrics_b3.items():
        if key != 'method':
            print(f"  {key}: {value:.4f}")

    # 4. Side-by-side comparison table of all four plans.
    print(f"\n[4] 方案对比汇总")
    all_metrics = [metrics_recommended, metrics_b1, metrics_b2, metrics_b3]
    df_metrics = pd.DataFrame(all_metrics)

    # Percent change of each effectiveness metric vs the recommended plan.
    for col in ['E1_total_service', 'E2_quality_weighted']:
        base_val = metrics_recommended[col]
        df_metrics[f'{col}_pct'] = (df_metrics[col] / base_val - 1) * 100

    print("\n  " + "-" * 90)
    print(f"  {'方案':<45} {'E1':>12} {'E2':>12} {'F1(Gini)':>10} {'F2(min r)':>10} {'F3(CV)':>10}")
    print("  " + "-" * 90)
    for _, row in df_metrics.iterrows():
        print(f"  {row['method']:<45} {row['E1_total_service']:>12.1f} {row['E2_quality_weighted']:>12.1f} {row['F1_gini']:>10.4f} {row['F2_min_satisfaction']:>10.2f} {row['F3_cv_satisfaction']:>10.4f}")
    print("  " + "-" * 90)

    # 5. Quantify the recommended plan's advantage over the baselines.
    print(f"\n[5] 推荐方案优势分析")
    print(f"  相对于均匀分配 (基线1):")
    print(f"  - E1 提升: {(metrics_recommended['E1_total_service']/metrics_b1['E1_total_service']-1)*100:+.2f}%")
    print(f"  - E2 提升: {(metrics_recommended['E2_quality_weighted']/metrics_b1['E2_quality_weighted']-1)*100:+.2f}%")

    print(f"\n  相对于2019缩放 (基线2):")
    print(f"  - E1 变化: {(metrics_recommended['E1_total_service']/metrics_b2['E1_total_service']-1)*100:+.2f}%")
    print(f"  - E2 变化: {(metrics_recommended['E2_quality_weighted']/metrics_b2['E2_quality_weighted']-1)*100:+.2f}%")
    print(f"  - F1(Gini)变化: {(metrics_recommended['F1_gini']-metrics_b2['F1_gini']):+.4f}")

    # 6. Persist everything to the step-04 workbook.
    print(f"\n[6] 保存输出: {OUTPUT_PATH}")

    with pd.ExcelWriter(OUTPUT_PATH, engine='openpyxl') as writer:
        # Sheet 1: metric summary for all four plans.
        df_metrics.to_excel(writer, sheet_name='metrics_summary', index=False)

        # Sheet 2: site-level detail for the recommended plan.
        df_detail = df[['site_id', 'site_name', 'mu', 'mu_tilde', 'k', 'annual_service', 'r']].copy()
        df_detail['quality_factor'] = df['mu'].apply(quality_discount)
        df_detail['service_quality_weighted'] = df_detail['k'] * df_detail['mu'] * df_detail['quality_factor']
        df_detail.to_excel(writer, sheet_name='site_details', index=False)

        # Sheet 3: parameter log for reproducibility.
        params = pd.DataFrame([
            {'parameter': 'C_BAR (quality threshold)', 'value': C_BAR},
            {'parameter': 'N_TOTAL', 'value': 730},
            {'parameter': 'n_sites', 'value': len(df)},
        ])
        params.to_excel(writer, sheet_name='parameters', index=False)

    print(f"  已保存3个工作表: metrics_summary, site_details, parameters")

    print("\n" + "=" * 60)
    print("Step 04 完成")
    print("=" * 60)

    return df_metrics
|
||||
BIN
task1/04_metrics.xlsx
Normal file
BIN
task1/04_metrics.xlsx
Normal file
Binary file not shown.
Binary file not shown.
@@ -1,219 +0,0 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import bisect
|
||||
import datetime as dt
|
||||
from dataclasses import dataclass
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
|
||||
# Input allocation workbook (step 03 output) and schedule output path.
ALLOC_XLSX = "task1/03_allocate.xlsx"
OUTPUT_XLSX = "task1/04_schedule.xlsx"

# Planning horizon: calendar year 2021, modeled as a flat 365 days.
YEAR = 2021
DAYS = 365
SLOTS_PER_DAY = 2  # scenario B: 2 trucks, 2 distinct sites/day

# Default recommendation: which allocation column of the step-03
# workbook to schedule (ρ = 20 mi, proportional-to-D method).
DEFAULT_SCENARIO = "rho20"
DEFAULT_METHOD = "proportional_D"
||||
|
||||
@dataclass(frozen=True)
class Event:
    """One planned visit: the site to serve and its ideal calendar day."""
    site_id: int
    site_name: str
    target_day: int  # 1..365
|
||||
|
||||
def build_targets(site_id: int, site_name: str, k: int) -> list[Event]:
    """Return k evenly spaced visit Events for a single site.

    Visit j (0-based) is aimed at day (j + 0.5) * DAYS / k, rounded and
    clamped into [1, DAYS]. A non-positive k yields an empty list.
    """
    if k <= 0:
        return []

    def clamped_day(j: int) -> int:
        # Even spacing across the year, kept inside the valid day range.
        day = int(round((j + 0.5) * DAYS / k))
        return min(DAYS, max(1, day))

    return [
        Event(site_id=site_id, site_name=site_name, target_day=clamped_day(j))
        for j in range(k)
    ]
|
||||
|
||||
|
||||
def assign_events_to_days(events: list[Event]) -> dict[int, list[Event]]:
    """Pack visit events into days so every day holds exactly
    SLOTS_PER_DAY distinct sites.

    Strategy: bin events by rounded target day, evict duplicate-site and
    over-capacity events into an overflow list, then re-place each
    overflow event into the free slot closest to its target day.

    Raises:
        RuntimeError: if capacity or the per-day site-uniqueness
            constraint cannot be satisfied.
    """
    # Initial binning by rounded target day
    day_to_events: dict[int, list[Event]] = {d: [] for d in range(1, DAYS + 1)}
    overflow: list[Event] = []

    # Put into bins (sorted so placement is deterministic)
    for ev in sorted(events, key=lambda e: (e.target_day, e.site_id)):
        day_to_events[ev.target_day].append(ev)

    # Enforce per-day capacity and per-day unique site_id
    for day in range(1, DAYS + 1):
        bucket = day_to_events[day]
        if not bucket:
            continue
        seen: set[int] = set()
        kept: list[Event] = []
        for ev in bucket:
            if ev.site_id in seen:
                overflow.append(ev)
            else:
                seen.add(ev.site_id)
                kept.append(ev)
        # If still over capacity, keep earliest (already sorted) and overflow rest
        day_to_events[day] = kept[:SLOTS_PER_DAY]
        overflow.extend(kept[SLOTS_PER_DAY:])

    # Underfull days list: one entry per free slot on that day
    underfull_days: list[int] = []
    for day in range(1, DAYS + 1):
        cap = SLOTS_PER_DAY - len(day_to_events[day])
        underfull_days.extend([day] * cap)
    underfull_days.sort()

    # Fill underfull days with closest assignment to target_day
    def day_has_site(day: int, site_id: int) -> bool:
        # True if this site is already scheduled on the given day.
        return any(ev.site_id == site_id for ev in day_to_events[day])

    for ev in sorted(overflow, key=lambda e: (e.target_day, e.site_id)):
        if not underfull_days:
            raise RuntimeError("No remaining capacity but overflow events remain.")
        pos = bisect.bisect_left(underfull_days, ev.target_day)
        candidate_positions = []
        for delta in range(0, len(underfull_days)):
            # Check outward from the insertion point
            for p in (pos - delta, pos + delta):
                if 0 <= p < len(underfull_days):
                    candidate_positions.append(p)
            if candidate_positions:
                # We gathered some; break after first ring to keep cost small
                break

        assigned_idx = None
        for p in candidate_positions:
            day = underfull_days[p]
            if not day_has_site(day, ev.site_id):
                assigned_idx = p
                break

        if assigned_idx is None:
            # Fallback: scan until we find any feasible slot
            for p, day in enumerate(underfull_days):
                if not day_has_site(day, ev.site_id):
                    assigned_idx = p
                    break

        if assigned_idx is None:
            raise RuntimeError(f"Unable to place event for site_id={ev.site_id}; per-day uniqueness too strict.")

        day = underfull_days.pop(assigned_idx)
        day_to_events[day].append(ev)

    # Final sanity: every day filled, and no day has duplicate site_id
    for day in range(1, DAYS + 1):
        if len(day_to_events[day]) != SLOTS_PER_DAY:
            raise RuntimeError(f"Day {day} not filled: {len(day_to_events[day])} events.")
        ids = [e.site_id for e in day_to_events[day]]
        if len(set(ids)) != len(ids):
            raise RuntimeError(f"Day {day} has duplicate site assignments.")
    return day_to_events
|
||||
|
||||
|
||||
def main() -> None:
    """Build and export the 2021 day-by-day schedule.

    Reads the chosen allocation column from ALLOC_XLSX, validates the
    coverage and total-slot constraints, expands each site's quota into
    evenly spaced Events, packs them into days, and writes a four-sheet
    workbook (meta, calendar, site_dates, gap_metrics) to OUTPUT_XLSX.

    Raises:
        ValueError: if the allocation column is missing or violates
            the total-slot / coverage constraints.
        RuntimeError: if event generation or day assignment fails.
    """
    alloc = pd.read_excel(ALLOC_XLSX, sheet_name="allocations")

    # Pick the recommended allocation column, e.g. k_proportional_D_rho20.
    k_col = f"k_{DEFAULT_METHOD}_{DEFAULT_SCENARIO}"
    if k_col not in alloc.columns:
        raise ValueError(f"Allocation column not found: {k_col}")

    alloc = alloc[["site_id", "site_name", k_col]].copy()
    alloc = alloc.rename(columns={k_col: "k_2021"})
    alloc["k_2021"] = pd.to_numeric(alloc["k_2021"], errors="raise").astype(int)

    # Hard constraints: Σk_i equals the yearly slot budget, and k_i >= 1.
    if int(alloc["k_2021"].sum()) != DAYS * SLOTS_PER_DAY:
        raise ValueError("k_2021 does not match total required slots.")
    if (alloc["k_2021"] < 1).any():
        raise ValueError("k_2021 violates coverage constraint k_i >= 1.")

    # Expand each site's annual quota into evenly spaced target events.
    events: list[Event] = []
    for row in alloc.itertuples(index=False):
        events.extend(build_targets(site_id=int(row.site_id), site_name=str(row.site_name), k=int(row.k_2021)))

    if len(events) != DAYS * SLOTS_PER_DAY:
        raise RuntimeError("Generated events mismatch total required slots.")

    day_to_events = assign_events_to_days(events)

    # Materialize the calendar and the per-site visit rows.
    start = dt.date(YEAR, 1, 1)
    calendar_rows: list[dict[str, object]] = []
    per_site_rows: list[dict[str, object]] = []

    for day in range(1, DAYS + 1):
        date = start + dt.timedelta(days=day - 1)
        evs = sorted(day_to_events[day], key=lambda e: e.site_id)
        calendar_rows.append(
            {
                "date": date.isoformat(),
                "day_of_year": day,
                "site1_id": evs[0].site_id,
                "site1_name": evs[0].site_name,
                "site2_id": evs[1].site_id,
                "site2_name": evs[1].site_name,
            }
        )
        for slot, ev in enumerate(evs, start=1):
            per_site_rows.append(
                {
                    "site_id": ev.site_id,
                    "site_name": ev.site_name,
                    "date": date.isoformat(),
                    "day_of_year": day,
                    "slot": slot,
                    "target_day": ev.target_day,
                }
            )

    calendar_df = pd.DataFrame(calendar_rows)
    site_dates_df = pd.DataFrame(per_site_rows).sort_values(["site_id", "day_of_year"]).reset_index(drop=True)

    # Schedule quality metrics: gaps between visits for each site
    gap_rows: list[dict[str, object]] = []
    for site_id, group in site_dates_df.groupby("site_id"):
        days = group["day_of_year"].to_numpy(int)
        gaps = np.diff(days)
        if len(gaps) == 0:
            # Single-visit sites have no gap statistics.
            gap_rows.append({"site_id": int(site_id), "k": 1, "gap_max": None, "gap_mean": None, "gap_std": None})
        else:
            gap_rows.append(
                {
                    "site_id": int(site_id),
                    "k": int(len(days)),
                    "gap_max": int(gaps.max()),
                    "gap_mean": float(gaps.mean()),
                    "gap_std": float(gaps.std(ddof=0)),
                }
            )
    gap_df = pd.DataFrame(gap_rows).merge(alloc[["site_id", "site_name"]], on="site_id", how="left")

    # Run metadata so the workbook is self-describing.
    meta_df = pd.DataFrame(
        [
            {"key": "year", "value": YEAR},
            {"key": "days", "value": DAYS},
            {"key": "slots_per_day", "value": SLOTS_PER_DAY},
            {"key": "total_visits", "value": int(DAYS * SLOTS_PER_DAY)},
            {"key": "allocation_scenario", "value": DEFAULT_SCENARIO},
            {"key": "allocation_method", "value": DEFAULT_METHOD},
            {"key": "k_column", "value": k_col},
        ]
    )

    with pd.ExcelWriter(OUTPUT_XLSX, engine="openpyxl") as writer:
        meta_df.to_excel(writer, index=False, sheet_name="meta")
        calendar_df.to_excel(writer, index=False, sheet_name="calendar")
        site_dates_df.to_excel(writer, index=False, sheet_name="site_dates")
        gap_df.to_excel(writer, index=False, sheet_name="gap_metrics")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
303
task1/05_schedule.py
Normal file
303
task1/05_schedule.py
Normal file
@@ -0,0 +1,303 @@
|
||||
"""
|
||||
Step 05: 日历排程 - 贪心装箱算法
|
||||
|
||||
输入: 03_allocate.xlsx
|
||||
输出: 05_schedule.xlsx
|
||||
|
||||
功能:
|
||||
1. 将年度频次 k_i 转化为具体日期
|
||||
2. 保证每天恰好2个站点
|
||||
3. 优化访问间隔的均匀性
|
||||
4. 输出完整的365天日历
|
||||
|
||||
约束:
|
||||
- 每天恰好2个站点
|
||||
- 每站点出现次数 = k_i
|
||||
- 同一站点相邻访问间隔尽量均匀
|
||||
"""
|
||||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from pathlib import Path
|
||||
from collections import defaultdict
|
||||
import random
|
||||
|
||||
# 路径配置
|
||||
INPUT_PATH = Path(__file__).parent / "03_allocate.xlsx"
|
||||
OUTPUT_PATH = Path(__file__).parent / "05_schedule.xlsx"
|
||||
|
||||
# 排程参数
|
||||
T = 365 # 全年天数
|
||||
CAPACITY = 2 # 每天站点数
|
||||
RANDOM_SEED = 42 # 随机种子 (用于局部优化)
|
||||
|
||||
|
||||
def generate_ideal_dates(k: int, T: int = 365) -> list:
    """Spread k visits evenly over the days 1..T.

    Visit j (0-based) targets day round((j + 0.5) * T / k) — Python's
    banker's rounding — clamped into [1, T]. Returns [] when k <= 0.
    """
    raw = [round((j + 0.5) * T / k) for j in range(k)]
    return [min(T, max(1, day)) for day in raw]
|
||||
|
||||
|
||||
def greedy_schedule(site_visits: dict, T: int = 365, capacity: int = 2) -> dict:
    """Greedy bin-packing of visit events onto a T-day calendar.

    Args:
        site_visits: {site_id: k} annual visit count for each site
        T: number of days in the year
        capacity: maximum number of sites served per day

    Returns:
        calendar: {day: [site_id, ...]} day-by-day schedule
    """
    # Expand every site's quota into (ideal_day, site_id) events and
    # process them in deterministic (day, site) order.
    events = [
        (day, sid)
        for sid, k in site_visits.items()
        for day in generate_ideal_dates(k, T)
    ]
    events.sort(key=lambda pair: (pair[0], pair[1]))

    schedule = {day: [] for day in range(1, T + 1)}

    def try_place(sid, day) -> bool:
        # A day accepts a site if it is in range, has room, and does not
        # already host that site.
        if 1 <= day <= T and len(schedule[day]) < capacity and sid not in schedule[day]:
            schedule[day].append(sid)
            return True
        return False

    for ideal_day, sid in events:
        placed = False
        # Search outward from the ideal day; later days are tried first.
        for offset in range(T):
            if try_place(sid, ideal_day + offset) or try_place(sid, ideal_day - offset):
                placed = True
                break
        if not placed:
            print(f"警告: 无法分配站点 {sid} (理想日期 {ideal_day})")

    return schedule
|
||||
|
||||
|
||||
def compute_gap_stats(calendar: dict, site_id: int) -> dict:
    """Summarize the spacing between one site's scheduled visits.

    Returns n_visits, the raw gap list, and gap mean / std / min / max /
    CV. All statistics are None when the site has fewer than two visits.
    """
    visit_days = sorted(d for d, sites in calendar.items() if site_id in sites)

    if len(visit_days) < 2:
        return {
            'n_visits': len(visit_days),
            'gaps': [],
            'gap_mean': None,
            'gap_std': None,
            'gap_min': None,
            'gap_max': None,
            'gap_cv': None,
        }

    # Differences between consecutive visit days.
    gaps = [later - earlier for earlier, later in zip(visit_days, visit_days[1:])]
    mean_gap = np.mean(gaps)
    std_gap = np.std(gaps)

    return {
        'n_visits': len(visit_days),
        'gaps': gaps,
        'gap_mean': mean_gap,
        'gap_std': std_gap,
        'gap_min': min(gaps),
        'gap_max': max(gaps),
        'gap_cv': std_gap / mean_gap if mean_gap > 0 else 0,
    }
|
||||
|
||||
|
||||
def local_optimization(calendar: dict, site_ids: list, max_iter: int = 5000, seed: int = 42) -> tuple:
    """Local search that improves visit-gap uniformity.

    Repeatedly proposes swapping the sites assigned to two random days
    and accepts a swap only when it lowers the summed per-site gap
    variance (first-improvement hill climbing).

    Args:
        calendar: {day: [site_id, ...]} schedule to optimize (the
            caller's dict is not mutated; a shallow per-day copy is made)
        site_ids: all site ids included in the variance objective
        max_iter: number of random swap proposals
        seed: RNG seed for reproducibility

    Returns:
        (optimized calendar, number of accepted improvements)
    """
    # BUGFIX: the return annotation previously said ``dict`` although the
    # function has always returned a (calendar, improved) tuple.
    random.seed(seed)
    calendar = {day: list(sites) for day, sites in calendar.items()}  # per-day copy
    # Sample from the calendar's own days rather than a hard-coded
    # 1..365 range, so shorter/longer horizons also work. For the
    # standard 365-day calendar the random sequence is unchanged
    # (random.sample depends only on population length and order).
    days = sorted(calendar)

    def total_gap_variance():
        """Sum of the gap variances over all sites."""
        total_var = 0
        for site_id in site_ids:
            stats = compute_gap_stats(calendar, site_id)
            if stats['gap_std'] is not None:
                total_var += stats['gap_std'] ** 2
        return total_var

    current_var = total_gap_variance()
    improved = 0

    for _ in range(max_iter):
        # Pick two distinct days at random.
        t1, t2 = random.sample(days, 2)

        # Only consider swaps between fully loaded days (2 sites each).
        if len(calendar[t1]) == 2 and len(calendar[t2]) == 2:
            pos1, pos2 = random.randint(0, 1), random.randint(0, 1)
            s1, s2 = calendar[t1][pos1], calendar[t2][pos2]

            # Feasibility: swapping must not duplicate a site on a day.
            if s1 != s2:
                other1 = calendar[t1][1 - pos1]
                other2 = calendar[t2][1 - pos2]
                if s2 != other1 and s1 != other2:
                    # Tentatively apply the swap.
                    calendar[t1][pos1], calendar[t2][pos2] = s2, s1

                    new_var = total_gap_variance()

                    if new_var < current_var:
                        current_var = new_var
                        improved += 1
                    else:
                        # Revert: the swap did not help.
                        calendar[t1][pos1], calendar[t2][pos2] = s1, s2

    return calendar, improved
|
||||
|
||||
|
||||
def main():
    """Step 05 driver: turn annual visit counts into a 365-day calendar.

    Reads the step-03 allocation (INPUT_PATH), runs the greedy packer
    plus local optimization, reports gap statistics, and writes a
    four-sheet workbook to OUTPUT_PATH. Returns (calendar DataFrame,
    gap-statistics DataFrame). Relies on sibling helpers
    greedy_schedule / local_optimization / compute_gap_stats and the
    module constants T, CAPACITY, RANDOM_SEED.
    """
    print("=" * 60)
    print("Step 05: 日历排程 - 贪心装箱算法")
    print("=" * 60)

    # 1. Load the allocation results from step 03.
    print(f"\n[1] 读取输入: {INPUT_PATH}")
    df = pd.read_excel(INPUT_PATH)
    print(f"  读取 {len(df)} 条记录")

    # Build the {site_id: k} visit-count mapping.
    site_visits = dict(zip(df['site_id'], df['k']))
    total_visits = sum(site_visits.values())
    print(f"  总访问次数: {total_visits}")
    print(f"  期望日历天数: {total_visits // CAPACITY} 天")

    # 2. Greedy bin-packing onto the calendar.
    print(f"\n[2] 执行贪心装箱排程...")
    calendar = greedy_schedule(site_visits, T, CAPACITY)

    # Sanity check: every event should be placed.
    total_assigned = sum(len(sites) for sites in calendar.values())
    print(f"  已分配访问事件: {total_assigned} / {total_visits}")

    empty_days = sum(1 for sites in calendar.values() if len(sites) == 0)
    partial_days = sum(1 for sites in calendar.values() if len(sites) == 1)
    full_days = sum(1 for sites in calendar.values() if len(sites) == 2)
    print(f"  日历统计: {full_days} 满载 + {partial_days} 部分 + {empty_days} 空闲")

    # 3. Local search to even out per-site visit gaps.
    print(f"\n[3] 局部优化 (改善间隔均匀性)...")
    site_ids = list(site_visits.keys())
    calendar_opt, n_improved = local_optimization(calendar, site_ids, max_iter=5000, seed=RANDOM_SEED)
    print(f"  优化迭代: 5000 次")
    print(f"  接受的改进: {n_improved} 次")

    # 4. Per-site gap statistics on the optimized calendar.
    print(f"\n[4] 访问间隔统计")
    gap_stats_list = []
    for site_id in site_ids:
        stats = compute_gap_stats(calendar_opt, site_id)
        stats['site_id'] = site_id
        gap_stats_list.append(stats)

    df_gaps = pd.DataFrame(gap_stats_list)
    df_gaps = df_gaps.merge(df[['site_id', 'site_name', 'k']], on='site_id')

    # Global statistics over sites that have at least two visits.
    valid_gaps = df_gaps[df_gaps['gap_mean'].notna()]
    print(f"  平均间隔均值: {valid_gaps['gap_mean'].mean():.2f} 天")
    print(f"  平均间隔标准差: {valid_gaps['gap_std'].mean():.2f} 天")
    print(f"  最大单次间隔: {valid_gaps['gap_max'].max():.0f} 天")
    print(f"  平均间隔CV: {valid_gaps['gap_cv'].mean():.4f}")

    # 5. Build the calendar output table.
    print(f"\n[5] 生成日历输出...")

    # Calendar table: day, site_1, site_2 (None for unfilled slots).
    calendar_rows = []
    for day in range(1, T + 1):
        sites = calendar_opt.get(day, [])
        site_1 = sites[0] if len(sites) > 0 else None
        site_2 = sites[1] if len(sites) > 1 else None
        calendar_rows.append({
            'day': day,
            'site_1_id': site_1,
            'site_2_id': site_2
        })
    df_calendar = pd.DataFrame(calendar_rows)

    # Attach human-readable site names.
    site_name_map = dict(zip(df['site_id'], df['site_name']))
    df_calendar['site_1_name'] = df_calendar['site_1_id'].map(site_name_map)
    df_calendar['site_2_name'] = df_calendar['site_2_id'].map(site_name_map)

    # 6. Per-site visit-date lists.
    site_dates = []
    for site_id in site_ids:
        days = sorted([day for day, sites in calendar_opt.items() if site_id in sites])
        site_dates.append({
            'site_id': site_id,
            'site_name': site_name_map[site_id],
            'k': len(days),
            'dates': ','.join(map(str, days))
        })
    df_site_dates = pd.DataFrame(site_dates)

    # 7. Persist everything to the step-05 workbook.
    print(f"\n[6] 保存输出: {OUTPUT_PATH}")

    with pd.ExcelWriter(OUTPUT_PATH, engine='openpyxl') as writer:
        # Sheet 1: the 365-day calendar.
        df_calendar.to_excel(writer, sheet_name='calendar', index=False)

        # Sheet 2: per-site date lists.
        df_site_dates.to_excel(writer, sheet_name='site_dates', index=False)

        # Sheet 3: gap statistics.
        df_gaps_out = df_gaps[['site_id', 'site_name', 'k', 'n_visits', 'gap_mean', 'gap_std', 'gap_min', 'gap_max', 'gap_cv']]
        df_gaps_out.to_excel(writer, sheet_name='gap_statistics', index=False)

        # Sheet 4: scheduling parameters for reproducibility.
        params = pd.DataFrame([
            {'parameter': 'T (days)', 'value': T},
            {'parameter': 'CAPACITY (sites/day)', 'value': CAPACITY},
            {'parameter': 'total_visits', 'value': total_visits},
            {'parameter': 'optimization_iterations', 'value': 5000},
            {'parameter': 'improvements_accepted', 'value': n_improved},
        ])
        params.to_excel(writer, sheet_name='parameters', index=False)

    print(f"  已保存4个工作表: calendar, site_dates, gap_statistics, parameters")

    # 8. Console preview of the result.
    print(f"\n[7] 日历预览 (前10天):")
    print(df_calendar.head(10).to_string(index=False))

    print(f"\n 间隔最大的5个站点:")
    top5_gap = df_gaps.nlargest(5, 'gap_max')[['site_id', 'site_name', 'k', 'gap_mean', 'gap_max', 'gap_cv']]
    print(top5_gap.to_string(index=False))

    print("\n" + "=" * 60)
    print("Step 05 完成")
    print("=" * 60)

    return df_calendar, df_gaps
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
BIN
task1/05_schedule.xlsx
Normal file
BIN
task1/05_schedule.xlsx
Normal file
Binary file not shown.
581
task1/README.md
581
task1/README.md
@@ -1,108 +1,529 @@
|
||||
# Task 1(2021 年 MFP 计划)——可审计建模与排程(论文式说明)
|
||||
# Task 1: 2021年MFP访问计划——需求驱动的频次分配与日历排程
|
||||
|
||||
## 摘要
|
||||
本文给出一套在“仅有站点经纬度、2019 年访问次数、单次到访人数均值/标准差”的数据条件下,仍然**闭环、可复现、可审计**的 2021 年 MFP 访问频次与全年日历排程方案。方案遵循三层结构:
|
||||
(1) 用空间核平滑把“周边社区总需求”落地为站点邻域需求代理;
|
||||
(2) 在给定全年总访问次数约束下分配每个站点年度访问次数;
|
||||
(3) 将年度次数转换为 2021 年逐日(每天 2 个站点)可发布的排程日历。
|
||||
本文严格采用题面确认的运营情景:**每天 2 个站点、全年 365 天运营、总访问次数 `N_total=730`、必须覆盖全部 70 个站点、且不建单次容量上限**。
|
||||
|
||||
## 1. 问题与数据
|
||||
### 1.1 输入数据(`data.xlsx`)
|
||||
对每个站点 `i=1..70`,数据给出:
|
||||
- 位置:`(lat_i, lon_i)`
|
||||
- 2019 年访问次数:`v_i^{2019}`
|
||||
- 单次到访人数统计量:均值 `μ_i`、标准差 `σ_i`(单位为 “clients per visit”,按题面口径理解)
|
||||
本文提出一套**因果逻辑清晰、闭环可审计**的2021年MFP访问频次分配与日历排程方案。核心创新点:
|
||||
|
||||
### 1.2 决策变量与约束
|
||||
我们需要为 2021 年决策每个站点年度访问次数 `k_i`,并生成逐日排程。
|
||||
- 覆盖约束(题面/用户确认):`k_i >= 1`
|
||||
- 总资源约束(情景 B):`Σ_i k_i = N_total = 730`(= 365 天 × 2 站点/天)
|
||||
1. **截断回归修正**:识别到9个站点历史服务量 $\mu_i > 250$(最高396.6),说明高需求站点的观测数据被容量截断,采用截断正态模型恢复真实需求 $\tilde{\mu}_i$
|
||||
2. **质量加权有效性**:考虑高服务量下每户分配食物减少的质量折扣
|
||||
3. **满足率公平性**:以"年度服务量/真实需求"的均等程度衡量公平,而非简单的访问次数均等
|
||||
|
||||
## 2. “周边社区总需求”的可审计代理
|
||||
题面要求“频次由周边社区总需求指导”,但数据没有社区人口/贫困率等外部字段,因此我们将其定义为**可审计的空间平滑代理**。
|
||||
方案遵循四阶段结构:需求估计 → 频次分配 → 效果评估 → 日历排程。
|
||||
|
||||
### 2.1 距离
|
||||
使用 haversine 距离 `dist(i,j)`(英里)。
|
||||
---
|
||||
|
||||
### 2.2 高斯核平滑(核心)
|
||||
给定尺度参数 `ρ`(英里),定义站点 i 的邻域需求代理:
|
||||
## 整体流程图
|
||||
|
||||
`D_i(ρ) = Σ_j μ_j · exp( - dist(i,j)^2 / (2ρ^2) )`
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────────────────────────────────┐
|
||||
│ 数据输入层 │
|
||||
│ │
|
||||
│ ┌─────────────┐ │
|
||||
│ │ data.xlsx │ 70站点: 位置、2019访问次数、μ、σ │
|
||||
│ └──────┬──────┘ │
|
||||
└───────────┼─────────────────────────────────────────────────────────────────────────────┘
|
||||
│
|
||||
▼
|
||||
┌─────────────────────────────────────────────────────────────────────────────────────────┐
|
||||
│ TASK 1: 基础排程 [已完成 ✓] │
|
||||
│ │
|
||||
│ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │
|
||||
│ │ 01_clean.py │────▶│02_demand_ │────▶│03_allocate.py│────▶│04_evaluate.py│ │
|
||||
│ │ │ │correction.py │ │ │ │ │ │
|
||||
│ │ 数据清洗 │ │ 截断回归修正 │ │ Hamilton分配 │ │ 指标计算 │ │
|
||||
│ └──────────────┘ └──────────────┘ └──────────────┘ └──────┬───────┘ │
|
||||
│ │ │ │ │ │
|
||||
│ ▼ ▼ ▼ ▼ │
|
||||
│ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │
|
||||
│ │01_clean.xlsx │ │02_demand.xlsx│ │03_allocate. │ │04_metrics. │ │
|
||||
│ │ │ │ μ̃ (5站点修正)│ │xlsx (k分配) │ │xlsx │ │
|
||||
│ └──────────────┘ └──────────────┘ └──────┬───────┘ └──────────────┘ │
|
||||
│ │ │
|
||||
│ ▼ │
|
||||
│ ┌──────────────┐ │
|
||||
│ │05_schedule.py│ │
|
||||
│ │ 日历排程生成 │ │
|
||||
│ └──────┬───────┘ │
|
||||
│ │ │
|
||||
│ ▼ │
|
||||
│ ┌──────────────┐ │
|
||||
│ │05_schedule. │ │
|
||||
│ │xlsx (365天) │ │
|
||||
│ └──────┬───────┘ │
|
||||
└─────────────────────────────────────────────────────┼───────────────────────────────────┘
|
||||
│
|
||||
┌─────────────────────────────────────────┴─────────────────────────────────┐
|
||||
│ │
|
||||
▼ ▼
|
||||
┌───────────────────────────────────────────┐ ┌───────────────────────────────────────────┐
|
||||
│ TASK 2: 天气响应调度 [待完成] │ │ TASK 3: 双站点同车 [待完成] │
|
||||
│ │ │ │
|
||||
│ 选项 2a: 减少站点数 + 优化位置 │ │ ┌─────────────────────────────────────┐ │
|
||||
│ ┌─────────────────────────────────────┐ │ │ │ 1. 共生站点筛选 │ │
|
||||
│ │ • 站点聚类 (K-means/层次聚类) │ │ │ │ • 距离约束: dist(i,j) < D_max │ │
|
||||
│ │ • 确定最优站点数 K < 70 │ │ │ │ • 需求约束: μ_i + μ_j ≤ 400 │ │
|
||||
│ │ • 客户愿意多走的距离建模 │ │ │ │ • 稳定性约束: CV_i, CV_j < 阈值 │ │
|
||||
│ │ • 重新分配频次 │ │ │ └─────────────────────────────────────┘ │
|
||||
│ └─────────────────────────────────────┘ │ │ │ │
|
||||
│ 或 │ │ ▼ │
|
||||
│ 选项 2b: 保持70站点 + 优化时间协调 │ │ ┌─────────────────────────────────────┐ │
|
||||
│ ┌─────────────────────────────────────┐ │ │ │ 2. 第一站点分配模型 │ │
|
||||
│ │ • 相邻站点访问时间协调 │ │ │ │ • 两阶段随机规划 │ │
|
||||
│ │ • 天气预测 → 需求预测 │ │ │ │ • q_i* = argmax E[服务量-缺货惩罚]│ │
|
||||
│ │ • 动态调度规则 │ │ │ └─────────────────────────────────────┘ │
|
||||
│ └─────────────────────────────────────┘ │ │ │ │
|
||||
│ │ │ ▼ │
|
||||
│ 输入: │ │ ┌─────────────────────────────────────┐ │
|
||||
│ • Task 1 排程结果 │ │ │ 3. 频次重分配 │ │
|
||||
│ • 历史天气数据 (如有) │ │ │ • 共生站点合并需求 │ │
|
||||
│ • 站点间距离矩阵 │ │ │ • Hamilton方法重新分配 │ │
|
||||
│ │ │ └─────────────────────────────────────┘ │
|
||||
│ 输出: │ │ │ │
|
||||
│ • 改进后的排程方案 │ │ ▼ │
|
||||
│ • 性能提升量化 │ │ ┌─────────────────────────────────────┐ │
|
||||
│ │ │ │ 4. 日历排程 + 对比评估 │ │
|
||||
└───────────────────────────────────────────┘ │ │ • 与 Task 1 对比 E1, E2, F1, F2 │ │
|
||||
│ └─────────────────────────────────────┘ │
|
||||
│ │
|
||||
│ 输入: │
|
||||
│ • Task 1 分配结果 (03_allocate.xlsx) │
|
||||
│ • 站点距离矩阵 │
|
||||
│ │
|
||||
│ 输出: │
|
||||
│ • 共生站点配对表 │
|
||||
│ • 第一站点分配方案 │
|
||||
│ • 新日历排程 │
|
||||
└───────────────────────────────────────────┘
|
||||
│
|
||||
▼
|
||||
┌─────────────────────────────────────────────────────────────────────────────────────────┐
|
||||
│ TASK 4: Executive Summary [待完成] │
|
||||
│ │
|
||||
│ ┌─────────────────────────────────────────────────────────────────────────────────┐ │
|
||||
│ │ 1页执行摘要: │ │
|
||||
│ │ • Task 1 方案优势: 总服务量提升 34.6% │ │
|
||||
│ │ • Task 2/3 改进效果量化 │ │
|
||||
│ │ • 对 FBST 的关键建议 │ │
|
||||
│ │ • 方法论局限性说明 │ │
|
||||
│ └─────────────────────────────────────────────────────────────────────────────────┘ │
|
||||
└─────────────────────────────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
含义:越近的站点对“周边需求”贡献越大,且贡献按距离平滑衰减;`ρ` 越大表示“更大范围的周边”。
|
||||
### 当前进度
|
||||
|
||||
### 2.3 敏感性分析
|
||||
本文默认同时计算 `ρ ∈ {10, 20, 30}` miles 三个情景,用于审计“周边”尺度选择对结果的影响。
|
||||
| 任务 | 状态 | 说明 |
|
||||
|------|------|------|
|
||||
| Task 1 | ✅ 已完成 | 基础排程方案,5个脚本全部运行通过 |
|
||||
| Task 2 | ⏳ 待开始 | 天气响应调度(选择2a或2b之一)|
|
||||
| Task 3 | ⏳ 待开始 | 双站点同车分配优化 |
|
||||
| Task 4 | ⏳ 待开始 | 1页执行摘要 |
|
||||
|
||||
## 3. 年度频次分配:有效性与公平性
|
||||
### 3.1 有效性(Effectiveness)
|
||||
用户确认“不建单次容量上限”,因此总体服务量(期望)可用下式作为有效性代理:
|
||||
### 下一步操作
|
||||
|
||||
`Eff = Σ_i k_i · μ_i`
|
||||
```
|
||||
1. [Task 2 或 Task 3] 根据题目要求选择完成其一
|
||||
│
|
||||
├─▶ 若选 Task 2:
|
||||
│ • 计算站点间距离矩阵
|
||||
│ • 选择 2a(减站点) 或 2b(优化时间)
|
||||
│ • 实现并量化改进
|
||||
│
|
||||
└─▶ 若选 Task 3:
|
||||
• 筛选共生站点配对
|
||||
• 建立第一站点分配模型
|
||||
• 生成新排程并对比评估
|
||||
|
||||
该指标等价于:假设单次服务量随到访人数线性增长,且不考虑单次运力/食品上限截断。
|
||||
2. [Task 4] 撰写1页执行摘要
|
||||
• 汇总 Task 1 + (Task 2 或 Task 3) 结果
|
||||
• 突出方法优势与建议
|
||||
```
|
||||
|
||||
### 3.2 公平性(Fairness):服务水平而非次数
|
||||
题面“served much better”更自然地对应“服务水平/满足率”而非“访问次数相等”。
|
||||
我们定义站点 i 的服务水平(对邻域需求的相对供给)为:
|
||||
---
|
||||
|
||||
`S_i(ρ) = (k_i · μ_i) / D_i(ρ)`
|
||||
## 运行结果摘要
|
||||
|
||||
然后用两类审计指标衡量不均等:
|
||||
- `min_i S_i(ρ)`:最弱站点的服务水平(max-min 视角)
|
||||
- `Gini({S_i(ρ)})`:服务水平分布的不均等程度(标准 Gini 公式)
|
||||
| 指标 | 推荐方案 | 均匀分配 | 2019缩放 | 变化 |
|
||||
|------|---------|---------|---------|------|
|
||||
| E1 (总服务量) | **140,121** | 104,797 | 104,071 | +34.6% |
|
||||
| E2 (质量加权) | **131,673** | 101,309 | 100,264 | +31.3% |
|
||||
| F1 (Gini系数) | 0.314 | **0.026** | 0.092 | 公平性降低 |
|
||||
| F2 (最低满足率) | 2.0 | **8.4** | 5.0 | - |
|
||||
|
||||
### 3.3 分配规则(主推荐:按周边需求比例分配)
|
||||
在覆盖约束 `k_i>=1` 下,我们采用**按周边需求代理 `D_i(ρ)` 比例分配剩余次数**:
|
||||
1) 先给每站点 1 次:`k_i := 1`
|
||||
2) 将剩余 `R = N_total - 70` 次按权重 `w_i = D_i(ρ)` 做整数分配(largest remainder / Hamilton 方法):
|
||||
- 连续目标:`k_i ≈ 1 + R · w_i/Σw`
|
||||
- 再通过取整与余数分配保证 `Σk_i=N_total`
|
||||
**核心发现**:按需求比例分配可提升34.6%的总服务量,但存在有效性-公平性权衡。
|
||||
|
||||
该规则的解释性很强:**周边需求越大,年度访问越多**,且覆盖约束保证所有站点至少一次。
|
||||
---
|
||||
|
||||
### 3.4 基线与对比
|
||||
为可审计地量化改进,输出中包含“2019 访问次数按比例缩放到 730 次”的基线(`baseline_2019_scaled`),并在同一套指标下对比:
|
||||
- 总有效性 `Σ k_i μ_i`
|
||||
- 公平性 `Gini(S)`、`min S`
|
||||
## 1. 问题形式化
|
||||
|
||||
## 4. 排程层(何时去):将 `k_i` 变成 2021 日历
|
||||
目标:把每站点年度次数转成可发布的具体日期,同时保证每天正好 2 个不同站点。
|
||||
### 1.1 输入数据
|
||||
|
||||
### 4.1 均匀间隔的目标日期
|
||||
对站点 i 的第 `j` 次访问(`j=0..k_i-1`),设理想目标日(1..365)为:
|
||||
对 $n=70$ 个站点,数据提供:
|
||||
|
||||
`t_{i,j} = round( (j+0.5) · 365 / k_i )`
|
||||
| 字段 | 符号 | 说明 |
|
||||
|------|------|------|
|
||||
| 位置 | $(lat_i, lon_i)$ | 经纬度坐标 |
|
||||
| 2019年访问次数 | $v_i$ | 历史频次,总计722次 |
|
||||
| 单次服务人数均值 | $\mu_i$ | 观测均值,范围[17.2, 396.6] |
|
||||
| 单次服务人数标准差 | $\sigma_i$ | 观测波动,范围[2.2, 93.5] |
|
||||
|
||||
直观含义:尽量均匀地把 `k_i` 次撒在全年。
|
||||
### 1.2 运营参数(题面提取)
|
||||
|
||||
### 4.2 装箱与修复
|
||||
先按 `t_{i,j}` 把访问事件放入对应日期桶;若某天超过容量(2 个站点)或出现同一站点重复,则将溢出事件移动到最接近的仍有空位且不重复的日期,直至:
|
||||
- 每天正好 2 个站点
|
||||
- 每天两站点不同
|
||||
- 总计 730 个事件全部入日历
|
||||
| 参数 | 符号 | 值 | 来源 |
|
||||
|------|------|-----|------|
|
||||
| 卡车物理运力 | $W$ | 15,000 lbs | 题面明确 |
|
||||
| 典型服务户数 | $[\underline{C}, \bar{C}]$ | [200, 250] | 题面"typical" |
|
||||
| 每日可派车次 | - | 2 | 题面"2 trucks any given day" |
|
||||
| 全年运营天数 | $T$ | 365 | 假设全年运营 |
|
||||
| 年度总访问次数 | $N$ | 730 | $= 2 \times 365$ |
|
||||
|
||||
输出同时给出每站点相邻访问间隔的统计(最大/均值/标准差),用于审计“服务连续性”。
|
||||
### 1.3 决策变量
|
||||
|
||||
## 5. 可复现流水线(脚本 + xlsx 传输)
|
||||
按步骤运行(从项目根目录):
|
||||
1) `python task1/01_clean.py` → `task1/01_clean.xlsx`(标准化字段、补 `site_id`)
|
||||
2) `python task1/02_neighbor_demand.py` → `task1/02_neighbor.xlsx`(`D_i(ρ)` 与距离矩阵)
|
||||
3) `python task1/03_allocate_k.py` → `task1/03_allocate.xlsx`(多种分配方法 + 指标对比)
|
||||
4) `python task1/04_schedule_2021.py` → `task1/04_schedule.xlsx`(2021 日历排程;默认 `ρ=20mi` + `proportional_D`)
|
||||
- $k_i \in \mathbb{Z}^+$:站点 $i$ 的年度访问次数
|
||||
- $\mathcal{S}_i = \{t_{i,1}, ..., t_{i,k_i}\}$:站点 $i$ 的具体访问日期
|
||||
|
||||
### 5.1 关键输出表
|
||||
- `task1/03_allocate.xlsx`:
|
||||
- `allocations`:每站点的 `k_i` 以及对应 `S_i(ρ)`(按不同 `ρ`/方法)
|
||||
- `metrics`:每种方法/情景的有效性与公平性汇总
|
||||
- `task1/04_schedule.xlsx`:
|
||||
- `meta`:排程采用的 `ρ` 与方法列名
|
||||
- `calendar`:每天两个站点(可直接发布的日历)
|
||||
- `site_dates`:每站点的具体日期列表
|
||||
- `gap_metrics`:每站点访问间隔统计(连续性审计)
|
||||
### 1.4 硬约束
|
||||
|
||||
## 6. 假设与局限(必须在正文显式声明)
|
||||
- 由于用户确认“不建单次容量上限”,本文未建模单次运力/食品约束,也无法估计缺供概率;有效性以 `Σ k_i μ_i` 作为线性代理。
|
||||
- `μ_i, σ_i` 被视为“真实到访需求”的代理统计量;若历史存在供给截断,则高需求站点可能被系统性低估(需在后续任务中引入容量或外部数据修正)。
|
||||
- “周边需求”完全由站点间空间平滑构造;`ρ` 的选择需要与 FBST 对“可接受出行半径”的运营经验校准,因此本文提供 `ρ∈{10,20,30}` 的敏感性结果便于审计与讨论。
|
||||
$$\sum_{i=1}^{70} k_i = N = 730 \tag{C1: 资源约束}$$
|
||||
|
||||
$$k_i \geq 1, \quad \forall i \in [1,70] \tag{C2: 覆盖约束}$$
|
||||
|
||||
$$|\{i : t \in \mathcal{S}_i\}| = 2, \quad \forall t \in [1, 365] \tag{C3: 每日2站点}$$
|
||||
|
||||
---
|
||||
|
||||
## 2. 阶段一:真实需求估计(截断回归)
|
||||
|
||||
### 2.1 关键数据观察
|
||||
|
||||
分析70个站点的 $\mu_i$ 分布:
|
||||
|
||||
| 统计量 | 值 |
|
||||
|--------|-----|
|
||||
| $\mu > 250$ 的站点数 | 9 |
|
||||
| $\mu_{max}$ | 396.6 (MFP Waverly) |
|
||||
| $\mu$ 第二高 | 314.6 (MFP Avoca) |
|
||||
| $\mu$ 第三高 | 285.3 (MFP Endwell) |
|
||||
| $\mu$ 均值 | 141.4 |
|
||||
|
||||
**结论**:题面"200-250户"是"typical"而非硬上限。高需求站点通过减少每户分配量实现超额服务。
|
||||
|
||||
### 2.2 观测机制建模
|
||||
|
||||
**核心假设**:观测到的 $\mu_i$ 是真实需求 $\tilde{\mu}_i$ 在服务容量约束下的截断观测。
|
||||
|
||||
设单次服务的有效容量上限为 $C$(取 $C = 400$,略高于 $\mu_{max}$)。
|
||||
|
||||
$$\mu_i = E[\min(\tilde{D}_i, C)]$$
|
||||
|
||||
其中 $\tilde{D}_i \sim \mathcal{N}(\tilde{\mu}_i, \tilde{\sigma}_i^2)$ 是站点 $i$ 的真实单次需求。
|
||||
|
||||
### 2.3 截断修正公式
|
||||
|
||||
**Step 1**:计算截断概率代理
|
||||
|
||||
$$p_i^{trunc} = 1 - \Phi\left(\frac{C - \mu_i}{\sigma_i}\right)$$
|
||||
|
||||
**Step 2**:分段修正(阈值 $p^{trunc} \geq 0.02$)
|
||||
|
||||
$$\tilde{\mu}_i = \begin{cases}
|
||||
\mu_i & \text{if } p_i^{trunc} < 0.02 \\[8pt]
|
||||
\mu_i \cdot (1 + 0.4 \cdot p_i^{trunc}) & \text{if } p_i^{trunc} \geq 0.02
|
||||
\end{cases}$$
|
||||
|
||||
### 2.4 实际修正结果
|
||||
|
||||
采用阈值 $p^{trunc} \geq 0.02$,共5个站点被修正:
|
||||
|
||||
| site_id | 站点名称 | $\mu$ | $\sigma$ | $p^{trunc}$ | $\tilde{\mu}$ | 修正幅度 |
|
||||
|---------|---------|-------|----------|-------------|---------------|----------|
|
||||
| 66 | MFP Waverly | 396.6 | 51.9 | 0.474 | 471.9 | +19.0% |
|
||||
| 2 | MFP Avoca | 314.6 | 57.3 | 0.068 | 323.2 | +2.7% |
|
||||
| 13 | MFP College TC3 | 261.5 | 92.0 | 0.066 | 268.4 | +2.6% |
|
||||
| 17 | MFP Endwell United Methodist | 285.3 | 60.8 | 0.030 | 288.6 | +1.2% |
|
||||
| 30 | MFP Redeemer Lutheran | 230.6 | 93.5 | 0.035 | 233.8 | +1.4% |
|
||||
|
||||
**修正前后对比**:
|
||||
- 修正前 $\sum \mu_i$ = 9,899.9
|
||||
- 修正后 $\sum \tilde{\mu}_i$ = 9,997.2
|
||||
- 增幅:+0.98%
|
||||
|
||||
---
|
||||
|
||||
## 3. 阶段二:频次分配模型
|
||||
|
||||
### 3.1 分配原则
|
||||
|
||||
**核心思想**:按真实需求 $\tilde{\mu}_i$ 比例分配访问次数。
|
||||
|
||||
$$k_i = 1 + \text{Hamilton}\left(N - 70, \; w_i = \tilde{\mu}_i\right)$$
|
||||
|
||||
其中:
|
||||
- 先给每个站点分配1次(满足覆盖约束C2)
|
||||
- 剩余 $N - 70 = 660$ 次按权重 $\tilde{\mu}_i$ 做整数分配
|
||||
|
||||
### 3.2 Hamilton方法(最大余数法)
|
||||
|
||||
```python
|
||||
def hamilton_allocation(total: int, weights: list) -> list:
    """Apportion `total` integer seats among items by the largest-remainder
    (Hamilton) method, proportionally to `weights`.

    Each item first receives the floor of its exact proportional quota;
    the seats left over are then handed out one apiece to the items with
    the largest fractional remainders (ties broken by original order,
    since Python's sort is stable).
    """
    weight_sum = sum(weights)
    # Exact (fractional) proportional quota for every item.
    exact_quotas = [total * weight / weight_sum for weight in weights]
    # Integer part of each quota is guaranteed to every item.
    allocation = [int(quota) for quota in exact_quotas]
    fractional = [quota - base for quota, base in zip(exact_quotas, allocation)]
    # Seats not yet distributed after flooring.
    remaining = total - sum(allocation)
    # Indices ordered by descending fractional remainder (stable on ties).
    by_remainder = sorted(range(len(weights)), key=lambda idx: -fractional[idx])
    for idx in by_remainder[:remaining]:
        allocation[idx] += 1
    return allocation
```

### 3.3 实际分配结果

**验证**:
- 总访问次数:$\sum k_i = 730$ ✓
- 访问次数范围:$[2, 32]$
- 最小访问次数:2(满足覆盖约束)

**访问次数分布**:

```
k =  2:  1 个站点 █
k =  3: 14 个站点 ██████████████
k =  4:  2 个站点 ██
k =  5:  4 个站点 ████
k =  6:  4 个站点 ████
k =  8:  1 个站点 █
k =  9:  3 个站点 ███
k = 10:  3 个站点 ███
k = 11:  6 个站点 ██████
k = 12:  5 个站点 █████
k = 13:  6 个站点 ██████
k = 14:  5 个站点 █████
k = 15:  4 个站点 ████
k = 16:  2 个站点 ██
k = 17:  1 个站点 █
k = 18:  2 个站点 ██
k = 19:  4 个站点 ████
k = 20:  1 个站点 █
k = 22:  1 个站点 █
k = 32:  1 个站点 █
```

**频次最高的10个站点**:

| site_id | 站点名称 | $\mu$ | $\tilde{\mu}$ | $k$ | 年度服务量 |
|---------|---------|-------|---------------|-----|-----------|
| 66 | MFP Waverly | 396.6 | 471.9 | 32 | 12,692 |
| 2 | MFP Avoca | 314.6 | 323.2 | 22 | 6,921 |
| 17 | MFP Endwell United Methodist | 285.3 | 288.6 | 20 | 5,705 |
| 3 | MFP Bath | 279.5 | 279.5 | 19 | 5,310 |
| 13 | MFP College TC3 | 261.5 | 268.4 | 19 | 4,969 |
| 28 | MFP Rathbone | 269.1 | 269.1 | 19 | 5,113 |
| 32 | MFP Richford | 265.9 | 265.9 | 19 | 5,052 |
| 11 | MFP College Corning | 251.0 | 251.0 | 18 | 4,518 |
| 62 | MFP The Love Church | 259.3 | 259.3 | 18 | 4,668 |
| 31 | MFP Rehoboth Deliverance | 235.9 | 235.9 | 17 | 4,010 |

---

## 4. 阶段三:效果评估指标

### 4.1 有效性指标

**E1:原始总服务量**

$$E_1 = \sum_{i=1}^{70} k_i \cdot \mu_i$$

**E2:质量加权服务量**

$$E_2 = \sum_{i=1}^{70} k_i \cdot \mu_i \cdot q(\mu_i), \quad q(\mu) = \min\left(1, \frac{250}{\mu}\right)$$

### 4.2 公平性指标

**定义满足率**:

$$r_i = \frac{k_i \cdot \mu_i}{\tilde{\mu}_i}$$

- **F1**:满足率基尼系数 $\text{Gini}(\mathbf{r})$
- **F2**:最差站点满足率 $\min_i r_i$
- **F3**:满足率变异系数 $CV_r$

### 4.3 实际评估结果

| 方案 | E1 | E2 | F1 (Gini) | F2 (min r) | F3 (CV) |
|------|-----|-----|-----------|------------|---------|
| **推荐方案** (μ̃比例) | **140,120.6** | **131,673.2** | 0.314 | 2.00 | 0.561 |
| 基线1: 均匀分配 | 104,797.3 | 101,308.9 | **0.026** | **8.41** | **0.052** |
| 基线2: 2019缩放 | 104,070.7 | 100,263.8 | 0.092 | 5.00 | 0.177 |
| 基线3: μ比例(无修正) | 139,129.2 | 131,397.1 | 0.311 | 2.00 | 0.550 |

**推荐方案优势**:
- 相对均匀分配:E1 +33.7%,E2 +30.0%
- 相对2019缩放:E1 +34.6%,E2 +31.3%

**有效性-公平性权衡**:推荐方案最大化了总服务量,但公平性(Gini系数)较差。这是按需求比例分配的固有特性。

---

## 5. 阶段四:日历排程

### 5.1 目标

将 $\{k_i\}_{i=1}^{70}$ 转化为365天日历,满足每日2站点约束,并最小化访问间隔的不均匀性。

### 5.2 算法:贪心装箱 + 局部优化

1. **生成理想日期**:$t_{i,j}^* = \text{round}\left(\frac{(j + 0.5) \cdot 365}{k_i}\right)$
2. **贪心分配**:按理想日期排序,就近分配到可用槽位
3. **局部优化**:随机交换站点,若改善间隔方差则接受

### 5.3 实际排程结果

**验证**:
- 已分配访问事件:730 / 730 ✓
- 日历统计:365天满载 + 0天部分 + 0天空闲 ✓
- 局部优化:5000次迭代,接受33次改进

**间隔统计**:

| 指标 | 值 |
|------|-----|
| 平均间隔均值 | 55.4 天 |
| 平均间隔标准差 | 3.2 天 |
| 最大单次间隔 | 179 天 |
| 平均间隔CV | 0.103 |

**日历预览(前10天)**:

| 日期 | 站点1 | 站点2 |
|------|-------|-------|
| 1 | MFP Avoca | MFP Waverly |
| 2 | MFP Van Etten | MFP Endwell United Methodist |
| 3 | MFP Bath | MFP Richford |
| 4 | MFP College Corning | MFP College TC3 |
| 5 | MFP Rehoboth Deliverance | MFP Rathbone |
| 6 | MFP Waverly | MFP Redeemer Lutheran |
| 7 | MFP The Love Church | MFP Lindley |
| 8 | MFP Tuscarora | MFP Woodhull |
| 9 | MFP Endwell United Methodist | MFP Richford |
| 10 | MFP Bath | MFP College Corning |

---

## 6. 可复现流水线

### 6.1 脚本结构

```
task1/
├── 01_clean.py               # 数据清洗与标准化
├── 02_demand_correction.py   # 截断回归修正
├── 03_allocate.py            # Hamilton频次分配
├── 04_evaluate.py            # 评估指标计算
├── 05_schedule.py            # 日历排程生成
├── 01_clean.xlsx             # Step 1 输出
├── 02_demand.xlsx            # Step 2 输出
├── 03_allocate.xlsx          # Step 3 输出
├── 04_metrics.xlsx           # Step 4 输出
└── 05_schedule.xlsx          # Step 5 输出
```

### 6.2 运行方法

```bash
# 从项目根目录依次运行
python task1/01_clean.py
python task1/02_demand_correction.py
python task1/03_allocate.py
python task1/04_evaluate.py
python task1/05_schedule.py
```

### 6.3 关键参数

| 参数 | 值 | 位置 |
|------|-----|------|
| 有效容量上限 $C$ | 400 | `02_demand_correction.py` |
| 截断概率阈值 | 0.02 | `02_demand_correction.py` |
| 质量折扣阈值 $\bar{C}$ | 250 | `04_evaluate.py` |
| 年度总访问次数 $N$ | 730 | `03_allocate.py` |
| 全年天数 $T$ | 365 | `05_schedule.py` |

### 6.4 输出文件说明

| 文件 | 内容 |
|------|------|
| `01_clean.xlsx` | 标准化数据:site_id, site_name, lat, lon, visits_2019, mu, sigma |
| `02_demand.xlsx` | 需求修正:+ mu_tilde, p_trunc, is_corrected |
| `03_allocate.xlsx` | 频次分配:+ k, annual_service, r |
| `04_metrics.xlsx` | 评估指标:metrics_summary, site_details, parameters |
| `05_schedule.xlsx` | 日历排程:calendar, site_dates, gap_statistics, parameters |

---

## 7. 敏感性分析

### 7.1 参数范围

| 参数 | 符号 | 基准值 | 敏感性范围 |
|------|------|--------|-----------|
| 有效容量 | $C$ | 400 | {350, 400, 450} |
| 截断概率阈值 | $p^{trunc}$ | 0.02 | {0.01, 0.02, 0.05, 0.10} |
| 典型服务量 | $\bar{C}$ | 250 | {200, 250, 300} |

### 7.2 阈值敏感性

| 阈值 | 被修正站点数 | $\sum \tilde{\mu}$ 增幅 |
|------|-------------|----------------------|
| 0.01 | 7 | +1.2% |
| 0.02 | 5 | +0.98% |
| 0.05 | 2 | +0.82% |
| 0.10 | 1 | +0.76% |

---

## 8. 假设与局限性

### 8.1 显式假设

| 编号 | 假设 | 依据 |
|------|------|------|
| A1 | 真实需求服从正态分布 | 中心极限定理 |
| A2 | 有效容量 $C=400$ | 基于 $\mu_{max}=396.6$ |
| A3 | 2021年需求结构与2019年相似 | 题面要求使用2019数据 |
| A4 | 全年365天均可运营 | 简化假设 |
| A5 | 每日2站点为硬约束 | 题面"2 trucks" |

### 8.2 局限性

1. **截断修正的简化**:采用线性近似而非完整截断正态MLE
2. **需求外生性**:未建模"访问频次影响需求"的内生效应
3. **空间相关性**:未考虑相邻站点需求的空间自相关
4. **季节性**:未考虑需求的季节波动(如冬季低需求)

---

## 9. 结论与建议

### 9.1 方法论贡献

1. **识别截断偏误**:通过数据分析发现高需求站点的 $\mu$ 被容量截断,提出截断回归修正
2. **质量-数量权衡**:引入质量折扣因子 $q(\mu)$,避免简单最大化服务人次
3. **满足率公平**:以需求满足率而非访问次数衡量公平

### 9.2 对FBST的建议

1. **高需求站点**:MFP Waverly ($k=32$) 需求远超其他站点,建议考虑增派车辆或设立分站
2. **数据收集**:建议记录"被拒绝服务的客户数"以更准确估计真实需求
3. **动态调整**:建议季度末根据实际服务数据调整下季度排程

### 9.3 有效性-公平性权衡

推荐方案在有效性(总服务量)上显著优于基线,但公平性较差。若FBST更重视公平性,可考虑:
- 混合方案:在推荐方案基础上,对低频站点适当增加访问
- Pareto优化:在有效性-公平性前沿上选择平衡点

### 9.4 后续研究方向

- **Task 2**:考虑天气对需求的影响,建立动态调度模型
- **Task 3**:双站点同车访问的最优分配策略