# mcm-mfp/p3_kmin.py
"""
Task 3: k-min allocation with ordered two-stop pairing.
- Uses 2019 data to allocate visit frequencies.
- Pairs are drawn from ordered_pairs_allocation_k6_cap250.csv (ordered i->j).
- Total annual trips (paired + single) are fixed to N_TARGET via fixed-point adjustment.
"""
from __future__ import annotations
import argparse
import os
import math
from typing import Dict, List, Tuple
import numpy as np
import pandas as pd
# Matplotlib is optional: fall back gracefully when it is not installed.
try:
    import matplotlib
    matplotlib.use("Agg")  # non-interactive backend; safe on headless machines
    import matplotlib.pyplot as plt
    _HAS_MPL = True
except ModuleNotFoundError:
    plt = None  # sentinel so downstream code can check availability
    _HAS_MPL = False
INPUT_XLSX = "prob/MFP Regular Sites 2019.xlsx"  # 2019 site demand workbook
INPUT_PAIRS = "data/ordered_pairs_allocation_k6_cap250.csv"  # precomputed ordered i->j pairs
OUTPUT_DIR = "data"  # default directory for CSV/PNG outputs
C_OPT = 250  # per-trip capacity (upper clip bound for families served)
N_TARGET = 730  # target number of physical trips per year (paired + single)
ALPHA = 0.6  # penalty weight on unmet demand in the effectiveness score
BETA = 0.2  # penalty weight on wasted capacity in the effectiveness score
N_SIMS = 2000  # NOTE(review): unused in this file — likely left from a Monte Carlo variant
RANDOM_SEED = 606  # NOTE(review): unused in this file
def gini_coefficient(values: np.ndarray) -> float:
    """Gini coefficient of a non-negative sample.

    Non-finite entries are dropped and negatives clipped to zero first.
    Returns 0.0 for an empty or zero-sum sample.
    """
    vals = np.asarray(values, dtype=float)
    vals = vals[np.isfinite(vals)]
    if vals.size == 0:
        return 0.0
    vals = np.clip(vals, 0, None)
    total = vals.sum()
    if total <= 0:
        return 0.0
    ordered = np.sort(vals)
    n = ordered.size
    ranks = np.arange(1, n + 1, dtype=float)
    # Standard rank-weighted form: G = 2*sum(i*x_i)/(n*sum(x)) - (n+1)/n.
    weighted_sum = (ranks * ordered).sum()
    return float(2.0 * weighted_sum / (n * total) - (n + 1.0) / n)
def _norm_pdf(z):
return np.exp(-0.5 * z * z) / np.sqrt(2.0 * np.pi)
def _norm_cdf(z):
z = np.asarray(z, dtype=float)
erf_vec = np.vectorize(math.erf, otypes=[float])
return 0.5 * (1.0 + erf_vec(z / np.sqrt(2.0)))
def expected_clipped_normal(mu, sigma, lower=0.0, upper=1.0):
    """Elementwise E[clip(X, lower, upper)] for X ~ Normal(mu, sigma).

    Entries with sigma == 0 are point masses and reduce to clip(mu, lower, upper).
    Raises ValueError when lower > upper.
    """
    mu = np.asarray(mu, dtype=float)
    sigma = np.asarray(sigma, dtype=float)
    lo = float(lower)
    hi = float(upper)
    if lo > hi:
        raise ValueError("lower must be <= upper")
    result = np.empty_like(mu, dtype=float)
    spread = sigma > 0
    # Degenerate (zero-variance) normals: just clip the mean.
    result[~spread] = np.clip(mu[~spread], lo, hi)
    if np.any(spread):
        m = mu[spread]
        s = sigma[spread]
        z_hi = (hi - m) / s
        z_lo = (lo - m) / s
        erf_elementwise = np.vectorize(math.erf, otypes=[float])
        cdf_hi = 0.5 * (1.0 + erf_elementwise(z_hi / np.sqrt(2.0)))
        cdf_lo = 0.5 * (1.0 + erf_elementwise(z_lo / np.sqrt(2.0)))
        pdf_hi = np.exp(-0.5 * z_hi * z_hi) / np.sqrt(2.0 * np.pi)
        pdf_lo = np.exp(-0.5 * z_lo * z_lo) / np.sqrt(2.0 * np.pi)
        # Partial expectations: E[X; X <= t] = mu*Phi(z_t) - sigma*phi(z_t).
        part_hi = m * cdf_hi - s * pdf_hi
        part_lo = m * cdf_lo - s * pdf_lo
        # Three pieces: mass clipped to lo, the interior, and mass clipped to hi.
        result[spread] = lo * cdf_lo + (part_hi - part_lo) + hi * (1.0 - cdf_hi)
    return result
def _find_col(df: pd.DataFrame, candidates: List[str]) -> str:
for name in candidates:
if name in df.columns:
return name
lower_map = {c.lower(): c for c in df.columns}
for name in candidates:
key = name.lower()
if key in lower_map:
return lower_map[key]
raise ValueError(f"Missing required column. Tried: {candidates}")
def load_sites(path: str) -> pd.DataFrame:
    """Load the 2019 site workbook and normalize it.

    Returns a DataFrame with columns: site_name, mu, sigma, visits_2019,
    site_idx (1-based row order) and TotalDemand (mu * visits_2019).
    Missing sigma values default to 0; missing mu/visits raise ValueError.
    """
    raw = pd.read_excel(path)
    selected = [
        _find_col(raw, ["Site Name", "site name", "site"]),
        _find_col(raw, ["Average Demand per Visit", "average demand per visit", "avg demand"]),
        _find_col(raw, ["StDev(Demand per Visit)", "stdev(demand per visit)", "stdev", "std"]),
        _find_col(raw, ["Number of Visits in 2019", "number of visits in 2019", "visits"]),
    ]
    sites = raw[selected].copy()
    sites.columns = ["site_name", "mu", "sigma", "visits_2019"]
    sites["mu"] = pd.to_numeric(sites["mu"], errors="coerce")
    sites["sigma"] = pd.to_numeric(sites["sigma"], errors="coerce").fillna(0.0)
    sites["visits_2019"] = pd.to_numeric(sites["visits_2019"], errors="coerce")
    bad_rows = sites[["mu", "visits_2019"]].isna().any(axis=1)
    if bad_rows.any():
        raise ValueError(f"Missing mu/visits_2019 for {int(bad_rows.sum())} rows.")
    sites = sites.reset_index(drop=True)
    sites["site_idx"] = np.arange(1, len(sites) + 1, dtype=int)
    sites["TotalDemand"] = sites["mu"] * sites["visits_2019"]
    return sites
def allocate_visits(df: pd.DataFrame, k_min_real: float, n_total: int) -> np.ndarray:
    """Distribute n_total visits across sites with a fractional per-site minimum.

    Sites are ranked by TotalDemand ascending; the fractional part of
    k_min_real decides how many of the highest-demand sites get ceil(k_min)
    instead of floor(k_min). The remaining budget is split proportionally to
    demand, then a +/-1 greedy pass absorbs rounding drift. Returns visits in
    site_idx order, or None when the minimums alone exceed n_total.
    """
    ranked = df.sort_values("TotalDemand").reset_index(drop=False)
    n_sites = len(ranked)
    lo_k = int(np.floor(k_min_real))
    hi_k = int(np.ceil(k_min_real))
    frac_part = k_min_real - lo_k
    count_hi = int(round(n_sites * frac_part))
    count_lo = n_sites - count_hi
    base = np.array([lo_k] * count_lo + [hi_k] * count_hi, dtype=int)
    reserved = int(base.sum())
    free_budget = int(n_total - reserved)
    if free_budget < 0:
        return None  # the minimums are infeasible at this budget
    weights = ranked["TotalDemand"] / ranked["TotalDemand"].sum()
    alloc = (base + free_budget * weights.values).round().astype(int)
    alloc = np.maximum(alloc, base)
    drift = int(n_total - alloc.sum())
    if drift != 0:
        # Positive drift: top up the highest-weight sites first.
        # Negative drift: trim the lowest-weight sites first.
        order = weights.sort_values(ascending=(drift < 0)).index.tolist()
        step = int(np.sign(drift))
        for pos in order[:abs(drift)]:
            alloc[pos] += step
    mapped = ranked[["site_idx"]].copy()
    mapped["AllocatedVisits"] = alloc
    return mapped.sort_values("site_idx")["AllocatedVisits"].to_numpy(dtype=int)
def assign_pairs(pairs_df: pd.DataFrame, visits: np.ndarray) -> Tuple[pd.DataFrame, np.ndarray]:
    """Greedily turn per-site visits into ordered two-stop trips.

    Walks pairs_df in row order (assumed pre-sorted by preference with a
    RangeIndex — row labels are used as positions) and gives each pair i->j
    as many trips as both endpoints still have available.
    Returns (copy of pairs_df with a 'pair_count' column, leftover singles).
    """
    leftovers = visits.astype(int).copy()
    counts = np.zeros(len(pairs_df), dtype=int)
    for pos, row in pairs_df.iterrows():
        first = int(row["site_i_idx"]) - 1
        second = int(row["site_j_idx"]) - 1
        take = int(min(leftovers[first], leftovers[second]))
        if take <= 0:
            continue
        counts[pos] = take
        leftovers[first] -= take
        leftovers[second] -= take
    paired = pairs_df.copy()
    paired["pair_count"] = counts
    return paired, leftovers
def _compute_metrics(
    sites: pd.DataFrame,
    visits: np.ndarray,
    pairs_with_counts: pd.DataFrame,
    singles: np.ndarray,
    *,
    alpha: float,
    beta: float,
    capacity: float,
) -> Dict[str, object]:
    """Score an allocation against annual per-site demand.

    Parameters
    ----------
    sites : table with mu / sigma / TotalDemand per site, in site_idx order.
    visits : total visits per site (not read directly — singles plus the pair
        counts already partition it).
    pairs_with_counts : ordered pairs carrying 'pair_count' trips each, the
        capacity split 'q_opt' for the first stop, and precomputed per-stop
        served means ('served_i_mean' / 'served_j_mean').
    singles : unpaired visits per site.
    alpha, beta : penalty weights on unmet demand and wasted capacity.
    capacity : per-trip capacity (clip bound for single-stop service).

    Returns a dict mixing scalar summary metrics with per-site arrays
    ('score_per_site', 'annual_eff', 'pair_first', 'pair_second').
    """
    n = len(sites)
    mu = sites["mu"].to_numpy(dtype=float)
    sigma = sites["sigma"].to_numpy(dtype=float)
    demand = sites["TotalDemand"].to_numpy(dtype=float)
    # Single-stop trips: expected families served is the clipped-normal mean.
    eff_single = expected_clipped_normal(mu, sigma, lower=0.0, upper=capacity)
    served_single = singles * eff_single
    cap_single = singles * capacity
    pair_first = np.zeros(n, dtype=float)
    pair_second = np.zeros(n, dtype=float)
    served_first = np.zeros(n, dtype=float)
    served_second = np.zeros(n, dtype=float)
    cap_first = np.zeros(n, dtype=float)
    cap_second = np.zeros(n, dtype=float)
    # Paired trips: the first stop i gets q_opt units of capacity,
    # the second stop j gets the remainder (capacity - q_opt).
    for _, row in pairs_with_counts.iterrows():
        count = int(row["pair_count"])
        if count <= 0:
            continue
        i = int(row["site_i_idx"]) - 1
        j = int(row["site_j_idx"]) - 1
        q_opt = float(row["q_opt"])
        served_i = float(row["served_i_mean"])
        served_j = float(row["served_j_mean"])
        pair_first[i] += count
        pair_second[j] += count
        served_first[i] += count * served_i
        served_second[j] += count * served_j
        cap_first[i] += count * q_opt
        cap_second[j] += count * (capacity - q_opt)
    annual_eff = served_single + served_first + served_second
    cap_total = cap_single + cap_first + cap_second
    # Per-site score = coverage ratio minus weighted unmet/waste penalties,
    # clipped into [0, 1]; zero-demand sites score 0 by construction.
    with np.errstate(divide="ignore", invalid="ignore"):
        base = np.where(demand > 0, annual_eff / demand, 0.0)
        unmet = np.where(demand > 0, np.maximum(0.0, demand - annual_eff) / demand, 0.0)
        waste = np.where(cap_total > 0, np.maximum(0.0, cap_total - annual_eff) / cap_total, 0.0)
    score = np.clip(base - alpha * unmet - beta * waste, 0.0, 1.0)
    bottom_n = max(1, int(np.ceil(n * 0.10)))  # size of the worst-10% tail
    total_served = float(annual_eff.sum())
    total_demand = float(demand.sum())
    total_unmet = float(np.maximum(0.0, demand - annual_eff).sum())
    total_waste = float(np.maximum(0.0, cap_total - annual_eff).sum())
    return {
        "effectiveness": float(score.mean()),
        "min_eff": float(score.min()),
        "bottom10_eff": float(np.sort(score)[:bottom_n].mean()),
        "gini_eff": float(gini_coefficient(score)),
        "std_eff": float(score.std()),
        "total_unmet": total_unmet,
        "total_waste": total_waste,
        "total_served": total_served,
        "total_demand": total_demand,
        "serve_ratio": float(total_served / total_demand) if total_demand > 0 else 0.0,
        "score_per_site": score,
        "annual_eff": annual_eff,
        "pair_first": pair_first,
        "pair_second": pair_second,
    }
def allocate_with_pairs(
    sites: pd.DataFrame,
    pairs_df: pd.DataFrame,
    k_min: float,
    *,
    n_target: int,
    capacity: float,
    max_iter: int = 30,
) -> Tuple[np.ndarray, np.ndarray, pd.DataFrame, int]:
    """Fixed-point search for the visit budget that yields n_target trips.

    Each paired trip covers two site-visits with one truck run, so the visit
    budget must equal n_target + (number of paired trips). Iterates until the
    budget stabilizes or max_iter is exhausted (the last iterate is returned
    in that case). Returns (visits, singles, pairs_with_counts, budget), or
    (None, None, None, None) when the k_min minimums are infeasible.
    """
    budget = int(n_target)
    for _ in range(max_iter):
        visits = allocate_visits(sites, k_min, budget)
        if visits is None:
            return None, None, None, None
        pairs_with_counts, singles = assign_pairs(pairs_df, visits)
        paired_trips = int(pairs_with_counts["pair_count"].sum())
        updated = int(n_target + paired_trips)
        if updated == budget:
            break
        budget = updated
    return visits, singles, pairs_with_counts, budget
def select_kmin(results: pd.DataFrame) -> float:
    """Pick a k_min from the sweep results.

    Preference: the first row (sweep order) whose effectiveness Gini is below
    0.2; otherwise the k_min with the highest mean effectiveness.
    """
    fair = results.loc[results["gini_eff"] < 0.2, "k_min"]
    if not fair.empty:
        return float(fair.iloc[0])
    best_row = results["effectiveness"].idxmax()
    return float(results.loc[best_row, "k_min"])
def plot_results(results: pd.DataFrame, output_dir: str) -> float:
    """Draw the 4x2 diagnostic panel over the k_min sweep and save the PNG.

    Saves "p3_kmin_effectiveness.png" under output_dir and returns the
    selected k_min (same rule as select_kmin). Raises RuntimeError when
    matplotlib is unavailable.
    """
    if not _HAS_MPL:
        # User-facing Chinese message: matplotlib is missing, install it before plotting.
        raise RuntimeError("缺少依赖: matplotlib无法绘图。请先安装 matplotlib 再运行绘图部分。")
    fig, axes = plt.subplots(4, 2, figsize=(12, 13))
    selected_k = select_kmin(results)
    # Row numerically closest to the selected k_min (sweep values are floats).
    selected_idx = (results["k_min"] - selected_k).abs().idxmin()
    selected_eff = float(results.loc[selected_idx, "effectiveness"])
    selected_label = f"Selected k_min={selected_k:.1f}"
    # Panel (0, 0): mean effectiveness, with the selected point highlighted.
    ax = axes[0, 0]
    ax.plot(results["k_min"], results["effectiveness"], "b-", lw=2)
    ax.axvline(selected_k, color="r", ls="--", label=selected_label)
    ax.scatter([selected_k], [selected_eff], c="r", s=100, zorder=5)
    ax.set_xlabel("k_min")
    ax.set_ylabel("Mean Effectiveness")
    ax.set_title("Mean Effectiveness vs k_min")
    ax.legend()
    ax.grid(True, alpha=0.3)
    # Panel (0, 1): equity tail — mean score of the worst 10% of sites.
    ax = axes[0, 1]
    ax.plot(results["k_min"], results["bottom10_eff"], "m-", lw=2)
    ax.axvline(selected_k, color="r", ls="--")
    ax.set_xlabel("k_min")
    ax.set_ylabel("Bottom 10% Mean Effectiveness")
    ax.set_title("Bottom 10% Mean Effectiveness vs k_min")
    ax.grid(True, alpha=0.3)
    # Panel (1, 0): total served families vs the (constant) total demand line.
    ax = axes[1, 0]
    ax.plot(results["k_min"], results["total_served"] / 1000, "c-", lw=2)
    ax.axhline(results["total_demand"].iloc[0] / 1000, color="gray", ls=":", label="Total Demand")
    ax.axvline(selected_k, color="r", ls="--")
    ax.set_xlabel("k_min")
    ax.set_ylabel("Served Families (×1000)")
    ax.set_title("Total Served vs k_min")
    ax.legend()
    ax.grid(True, alpha=0.3)
    # Panel (1, 1): worst single-site effectiveness.
    ax = axes[1, 1]
    ax.plot(results["k_min"], results["min_eff"], "g-", lw=2)
    ax.axvline(selected_k, color="r", ls="--")
    ax.set_xlabel("k_min")
    ax.set_ylabel("Min Effectiveness")
    ax.set_title("Worst Site Effectiveness vs k_min")
    ax.grid(True, alpha=0.3)
    # Panel (2, 0): the unmet-demand / wasted-capacity trade-off.
    ax = axes[2, 0]
    ax.plot(results["k_min"], results["unmet"] / 1000, "r-", lw=2, label="Unmet")
    ax.plot(results["k_min"], results["waste"] / 1000, "b-", lw=2, label="Waste")
    ax.axvline(selected_k, color="gray", ls="--")
    ax.set_xlabel("k_min")
    ax.set_ylabel("Families (×1000)")
    ax.set_title("Unmet Demand vs Wasted Capacity")
    ax.legend()
    ax.grid(True, alpha=0.3)
    # Panel (2, 1): dispersion of per-site effectiveness.
    ax = axes[2, 1]
    ax.plot(results["k_min"], results["std_eff"], color="tab:orange", lw=2)
    ax.axvline(selected_k, color="gray", ls="--")
    ax.set_xlabel("k_min")
    ax.set_ylabel("Std Effectiveness")
    ax.set_title("Effectiveness Std vs k_min")
    ax.grid(True, alpha=0.3)
    # Panel (3, 0): Gini with the 0.2 fairness threshold used by select_kmin.
    ax = axes[3, 0]
    ax.plot(results["k_min"], results["gini_eff"], color="tab:purple", lw=2)
    ax.axhline(0.2, color="gray", ls=":", lw=1)
    ax.axvline(selected_k, color="r", ls="--")
    ax.set_xlabel("k_min")
    ax.set_ylabel("Gini Coefficient")
    ax.set_title("Gini (Effectiveness) vs k_min")
    ax.grid(True, alpha=0.3)
    # Panel (3, 1): intentionally blank.
    axes[3, 1].axis("off")
    plt.tight_layout()
    os.makedirs(output_dir, exist_ok=True)
    plt.savefig(os.path.join(output_dir, "p3_kmin_effectiveness.png"), dpi=150)
    plt.close(fig)
    return selected_k
def main() -> None:
    """CLI driver: sweep k_min, score each allocation, pick one, write outputs."""
    parser = argparse.ArgumentParser(description="Task 3 k-min allocation with two-stop pairing.")
    parser.add_argument("--input-xlsx", default=INPUT_XLSX)
    parser.add_argument("--input-pairs", default=INPUT_PAIRS)
    parser.add_argument("--output-dir", default=OUTPUT_DIR)
    parser.add_argument("--kmin-start", type=float, default=1.0)
    parser.add_argument("--kmin-end", type=float, default=10.0)
    parser.add_argument("--kmin-step", type=float, default=0.1)
    parser.add_argument("--capacity", type=float, default=C_OPT)
    parser.add_argument("--n-target", type=int, default=N_TARGET)
    parser.add_argument("--alpha", type=float, default=ALPHA)
    parser.add_argument("--beta", type=float, default=BETA)
    args = parser.parse_args()
    sites = load_sites(args.input_xlsx)
    pairs = pd.read_csv(args.input_pairs)
    # Fail fast if the pairs CSV does not match the expected schema.
    required_cols = {
        "site_i_idx",
        "site_j_idx",
        "score_mean",
        "q_opt",
        "served_i_mean",
        "served_j_mean",
        "distance_miles",
    }
    missing = required_cols.difference(pairs.columns)
    if missing:
        raise ValueError(f"Missing columns in pairs CSV: {sorted(missing)}")
    # Greedy pairing preference: highest score first, ties by shortest distance.
    # reset_index matters: assign_pairs uses row labels as array positions.
    pairs = pairs.sort_values(
        ["score_mean", "distance_miles"], ascending=[False, True]
    ).reset_index(drop=True)
    # +1e-9 keeps kmin_end inclusive despite float-step accumulation.
    k_range = np.arange(args.kmin_start, args.kmin_end + 1e-9, args.kmin_step)
    results = []
    for k_min in k_range:
        visits, singles, pairs_with_counts, n_total_guess = allocate_with_pairs(
            sites,
            pairs,
            float(k_min),
            n_target=args.n_target,
            capacity=args.capacity,
        )
        if visits is None:
            # Reserved per-site minimums exceed the trip budget: infeasible k_min.
            continue
        metrics = _compute_metrics(
            sites,
            visits,
            pairs_with_counts,
            singles,
            alpha=args.alpha,
            beta=args.beta,
            capacity=args.capacity,
        )
        pair_total = int(pairs_with_counts["pair_count"].sum())
        # Each paired trip covers two site-visits: physical trips = visits - pairs.
        total_trips = int(visits.sum() - pair_total)
        row = {
            "k_min": float(k_min),
            "effectiveness": metrics["effectiveness"],
            "min_eff": metrics["min_eff"],
            "bottom10_eff": metrics["bottom10_eff"],
            "gini_eff": metrics["gini_eff"],
            "std_eff": metrics["std_eff"],
            "unmet": metrics["total_unmet"],
            "waste": metrics["total_waste"],
            "total_served": metrics["total_served"],
            "total_demand": metrics["total_demand"],
            "serve_ratio": metrics["serve_ratio"],
            "total_visits_allocated": int(visits.sum()),
            "pair_trips": pair_total,
            "total_trips": total_trips,
            "n_total_guess": int(n_total_guess),
        }
        results.append(row)
    results_df = pd.DataFrame(results)
    if len(results_df) == 0:
        raise RuntimeError("No feasible k_min values found.")
    best_k = select_kmin(results_df)
    # NOTE(review): best_idx is computed but never used below.
    best_idx = (results_df["k_min"] - best_k).abs().idxmin()
    # Re-run the allocation at the selected k_min to produce detailed outputs.
    visits, singles, pairs_with_counts, n_total_guess = allocate_with_pairs(
        sites,
        pairs,
        float(best_k),
        n_target=args.n_target,
        capacity=args.capacity,
    )
    metrics = _compute_metrics(
        sites,
        visits,
        pairs_with_counts,
        singles,
        alpha=args.alpha,
        beta=args.beta,
        capacity=args.capacity,
    )
    pair_total = int(pairs_with_counts["pair_count"].sum())
    total_trips = int(visits.sum() - pair_total)
    site_rows = pd.DataFrame(
        {
            "site_idx": sites["site_idx"],
            "site_name": sites["site_name"],
            "total_visits_allocated": visits,
            "single_visits": singles,
            "paired_first": metrics["pair_first"].astype(int),
            "paired_second": metrics["pair_second"].astype(int),
            "paired_total": (metrics["pair_first"] + metrics["pair_second"]).astype(int),
        }
    )
    # Only pairs that actually received trips are written out.
    pairs_out = pairs_with_counts.loc[pairs_with_counts["pair_count"] > 0].copy()
    pairs_out = pairs_out.sort_values(["pair_count", "score_mean"], ascending=[False, False])
    os.makedirs(args.output_dir, exist_ok=True)
    results_df.to_csv(os.path.join(args.output_dir, "p3_kmin_data.csv"), index=False)
    site_rows.to_csv(os.path.join(args.output_dir, "p3_kmin_sites.csv"), index=False)
    pairs_out.to_csv(os.path.join(args.output_dir, "p3_kmin_pairs.csv"), index=False)
    if _HAS_MPL:
        plot_results(results_df, args.output_dir)
    else:
        # User-facing Chinese message: matplotlib missing, plotting skipped, CSVs still saved.
        print("未检测到 matplotlib跳过绘图仍会保存CSV结果")
    print(f"Best k_min={best_k:.1f} (total_trips={total_trips}, pair_trips={pair_total})")
    # NOTE(review): this summary hard-codes the default "data/" paths even when
    # --output-dir overrides them — confirm whether that is intended.
    print(
        "Saved: data/p3_kmin_data.csv, data/p3_kmin_sites.csv, "
        "data/p3_kmin_pairs.csv, data/p3_kmin_effectiveness.png"
    )
# Script entry point.
if __name__ == "__main__":
    main()