Files
mcm-mfp/p3_kmin.py
2026-01-18 17:06:19 +08:00

529 lines
17 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
Task 3: k-min allocation with ordered two-stop pairing.
- Uses 2019 data to allocate visit frequencies.
- Pairs are drawn from ordered_pairs_allocation_k6_cap250.csv (ordered i->j).
- Total annual trips (paired + single) are fixed to N_TARGET via fixed-point adjustment.
"""
from __future__ import annotations
import argparse
import os
import math
from typing import Dict, List, Tuple
import numpy as np
import pandas as pd
# matplotlib is optional: plotting is skipped gracefully when it is missing.
try:
    import matplotlib
    matplotlib.use("Agg")  # headless backend — figures go to files only
    import matplotlib.pyplot as plt
    _HAS_MPL = True
except ModuleNotFoundError:
    plt = None
    _HAS_MPL = False
INPUT_XLSX = "prob/MFP Regular Sites 2019.xlsx"  # 2019 per-site demand workbook
INPUT_PAIRS = "data/ordered_pairs_allocation_k6_cap250.csv"  # ordered i->j pair candidates
OUTPUT_DIR = "data"  # default destination for CSV/PNG outputs
C_OPT = 250  # per-trip capacity; used as the clip bound and the q_opt split total
N_TARGET = 730  # fixed annual trip budget (see module docstring)
ALPHA = 0.6  # score penalty weight on unmet-demand ratio
BETA = 0.2  # score penalty weight on wasted-capacity ratio
N_SIMS = 2000  # NOTE(review): unused in this file — possibly shared with sibling scripts
RANDOM_SEED = 606  # NOTE(review): unused in this file
def gini_coefficient(values: np.ndarray) -> float:
    """Return the Gini coefficient of a non-negative vector (0 = perfect equality).

    Non-finite entries are dropped and negatives clipped to zero; an empty or
    all-zero vector yields 0.0.
    """
    vec = np.asarray(values, dtype=float)
    vec = vec[np.isfinite(vec)]
    if vec.size == 0:
        return 0.0
    vec = np.clip(vec, 0, None)
    total = vec.sum()
    if total <= 0:
        return 0.0
    ordered = np.sort(vec)
    count = ordered.size
    ranks = np.arange(1, count + 1, dtype=float)
    # Standard rank-weighted formula: G = 2*sum(i*x_(i)) / (n*sum(x)) - (n+1)/n
    weighted_sum = (ranks * ordered).sum()
    return float(2.0 * weighted_sum / (count * total) - (count + 1.0) / count)
def _norm_pdf(z):
return np.exp(-0.5 * z * z) / np.sqrt(2.0 * np.pi)
def _norm_cdf(z):
z = np.asarray(z, dtype=float)
erf_vec = np.vectorize(math.erf, otypes=[float])
return 0.5 * (1.0 + erf_vec(z / np.sqrt(2.0)))
def expected_clipped_normal(mu, sigma, lower=0.0, upper=1.0):
    """Elementwise E[clip(X, lower, upper)] for X ~ Normal(mu, sigma).

    Closed form: lower*P(X<=lower) + E[X; lower<X<=upper] + upper*P(X>upper),
    using E[X; X<=t] = mu*Phi(z_t) - sigma*phi(z_t) with z_t = (t-mu)/sigma.
    Entries with sigma == 0 degenerate to clip(mu, lower, upper).

    Raises:
        ValueError: if lower > upper.
    """
    mu = np.asarray(mu, dtype=float)
    sigma = np.asarray(sigma, dtype=float)
    lo = float(lower)
    hi = float(upper)
    if lo > hi:
        raise ValueError("lower must be <= upper")
    result = np.empty_like(mu, dtype=float)
    stochastic = sigma > 0
    # Degenerate case: a point mass at mu, clipped to the interval.
    result[~stochastic] = np.clip(mu[~stochastic], lo, hi)
    if np.any(stochastic):
        m = mu[stochastic]
        s = sigma[stochastic]
        z_hi = (hi - m) / s
        z_lo = (lo - m) / s
        erf_elementwise = np.vectorize(math.erf, otypes=[float])
        cdf_hi = 0.5 * (1.0 + erf_elementwise(z_hi / np.sqrt(2.0)))
        cdf_lo = 0.5 * (1.0 + erf_elementwise(z_lo / np.sqrt(2.0)))
        pdf_hi = np.exp(-0.5 * z_hi * z_hi) / np.sqrt(2.0 * np.pi)
        pdf_lo = np.exp(-0.5 * z_lo * z_lo) / np.sqrt(2.0 * np.pi)
        partial_hi = m * cdf_hi - s * pdf_hi  # E[X; X <= hi]
        partial_lo = m * cdf_lo - s * pdf_lo  # E[X; X <= lo]
        result[stochastic] = lo * cdf_lo + (partial_hi - partial_lo) + hi * (1.0 - cdf_hi)
    return result
def _find_col(df: pd.DataFrame, candidates: List[str]) -> str:
for name in candidates:
if name in df.columns:
return name
lower_map = {c.lower(): c for c in df.columns}
for name in candidates:
key = name.lower()
if key in lower_map:
return lower_map[key]
raise ValueError(f"Missing required column. Tried: {candidates}")
def load_sites(path: str) -> pd.DataFrame:
    """Load the 2019 regular-sites workbook into a normalized DataFrame.

    Output columns: site_name, mu (avg demand/visit), sigma (stdev, NaN -> 0),
    visits_2019, site_idx (1-based), TotalDemand (mu * visits_2019).

    Raises:
        ValueError: when any row lacks a numeric mu or visits_2019.
    """
    raw = pd.read_excel(path)
    name_col = _find_col(raw, ["Site Name", "site name", "site"])
    mu_col = _find_col(raw, ["Average Demand per Visit", "average demand per visit", "avg demand"])
    sigma_col = _find_col(raw, ["StDev(Demand per Visit)", "stdev(demand per visit)", "stdev", "std"])
    visits_col = _find_col(raw, ["Number of Visits in 2019", "number of visits in 2019", "visits"])
    sites = raw[[name_col, mu_col, sigma_col, visits_col]].copy()
    sites.columns = ["site_name", "mu", "sigma", "visits_2019"]
    sites["mu"] = pd.to_numeric(sites["mu"], errors="coerce")
    # Missing stdev is treated as deterministic demand.
    sites["sigma"] = pd.to_numeric(sites["sigma"], errors="coerce").fillna(0.0)
    sites["visits_2019"] = pd.to_numeric(sites["visits_2019"], errors="coerce")
    incomplete = sites[["mu", "visits_2019"]].isna().any(axis=1)
    if incomplete.any():
        missing = sites[incomplete]
        raise ValueError(f"Missing mu/visits_2019 for {len(missing)} rows.")
    sites = sites.reset_index(drop=True)
    sites["site_idx"] = np.arange(1, len(sites) + 1, dtype=int)
    sites["TotalDemand"] = sites["mu"] * sites["visits_2019"]
    return sites
def allocate_visits(df: pd.DataFrame, k_min_real: float, n_total: int) -> np.ndarray:
    """Allocate n_total visits across sites with a per-site minimum of ~k_min_real.

    Sites are ranked by TotalDemand; the fractional part of k_min_real decides
    how many (higher-demand) sites get ceil(k_min_real) instead of the floor.
    The remaining budget is distributed proportionally to TotalDemand, then a
    rounding correction brings the total back to exactly n_total.

    Returns the allocation in site_idx order, or None when the guaranteed
    minimums alone exceed n_total (infeasible).
    """
    df_sorted = df.sort_values("TotalDemand").reset_index(drop=False)
    n = len(df_sorted)
    k_floor = int(np.floor(k_min_real))
    k_ceil = int(np.ceil(k_min_real))
    frac = k_min_real - k_floor
    n_ceil = int(round(n * frac))
    n_floor = n - n_ceil
    # Lower-demand sites get the floor minimum; higher-demand sites the ceil.
    k_base = np.array([k_floor] * n_floor + [k_ceil] * n_ceil, dtype=int)
    n_reserved = int(k_base.sum())
    n_free = int(n_total - n_reserved)
    if n_free < 0:
        return None  # infeasible: minimum visits exceed the budget
    weights = df_sorted["TotalDemand"] / df_sorted["TotalDemand"].sum()
    allocated = (k_base + n_free * weights.values).round().astype(int)
    allocated = np.maximum(allocated, k_base)
    diff = int(n_total - allocated.sum())
    if diff > 0:
        # Rounding under-shot the budget: top up the highest-weight sites.
        for idx in weights.sort_values(ascending=False).index[:diff]:
            allocated[idx] += 1
    elif diff < 0:
        # Rounding over-shot: trim the lowest-weight sites first, but never
        # below k_base (bug fix: the previous version could decrement a site
        # below its guaranteed minimum). The max() clamp above guarantees
        # total slack >= -diff, so the loop always terminates.
        excess = -diff
        trim_order = weights.sort_values(ascending=True).index
        while excess > 0:
            trimmed = False
            for idx in trim_order:
                if excess == 0:
                    break
                if allocated[idx] > k_base[idx]:
                    allocated[idx] -= 1
                    excess -= 1
                    trimmed = True
            if not trimmed:
                break  # defensive: no slack remains
    alloc_sorted = df_sorted[["site_idx"]].copy()
    alloc_sorted["AllocatedVisits"] = allocated
    alloc = alloc_sorted.sort_values("site_idx")["AllocatedVisits"].to_numpy(dtype=int)
    return alloc
def assign_pairs(pairs_df: pd.DataFrame, visits: np.ndarray) -> Tuple[pd.DataFrame, np.ndarray]:
    """Greedily convert allocated visits into two-stop pairings.

    Rows of pairs_df are consumed in order; each pair i->j absorbs
    min(remaining_i, remaining_j) visits from both endpoints (site indices are
    1-based in the CSV). Returns a copy of pairs_df with a 'pair_count' column
    plus the leftover single visits per site.
    """
    leftovers = visits.astype(int).copy()
    counts = np.zeros(len(pairs_df), dtype=int)
    for row_idx, pair in pairs_df.iterrows():
        first = int(pair["site_i_idx"]) - 1
        second = int(pair["site_j_idx"]) - 1
        pairable = int(min(leftovers[first], leftovers[second]))
        if pairable <= 0:
            continue
        counts[row_idx] = pairable
        leftovers[first] -= pairable
        leftovers[second] -= pairable
    paired = pairs_df.copy()
    paired["pair_count"] = counts
    return paired, leftovers
def _compute_metrics(
    sites: pd.DataFrame,
    visits: np.ndarray,
    pairs_with_counts: pd.DataFrame,
    singles: np.ndarray,
    *,
    alpha: float,
    beta: float,
    capacity: float,
) -> Dict[str, float]:
    """Score an allocation: per-site effectiveness plus aggregate service stats.

    Effectiveness per site = clip(served/demand - alpha*unmet_ratio
    - beta*waste_ratio, 0, 1). Returns scalar summaries alongside the
    per-site arrays used by the caller for CSV output.

    Note: `visits` is accepted for call-site symmetry but the computation uses
    only `singles` and the pair counts.
    """
    n_sites = len(sites)
    mu = sites["mu"].to_numpy(dtype=float)
    sigma = sites["sigma"].to_numpy(dtype=float)
    demand = sites["TotalDemand"].to_numpy(dtype=float)
    # Expected families served per single-stop visit (demand clipped at capacity).
    per_visit_single = expected_clipped_normal(mu, sigma, lower=0.0, upper=capacity)
    served_single = singles * per_visit_single
    cap_single = singles * capacity
    first_counts = np.zeros(n_sites, dtype=float)
    second_counts = np.zeros(n_sites, dtype=float)
    first_served = np.zeros(n_sites, dtype=float)
    second_served = np.zeros(n_sites, dtype=float)
    first_cap = np.zeros(n_sites, dtype=float)
    second_cap = np.zeros(n_sites, dtype=float)
    for _, pair in pairs_with_counts.iterrows():
        cnt = int(pair["pair_count"])
        if cnt <= 0:
            continue
        a = int(pair["site_i_idx"]) - 1
        b = int(pair["site_j_idx"]) - 1
        # q_opt is the capacity reserved for the first stop; the second stop
        # receives the remainder.
        split = float(pair["q_opt"])
        first_counts[a] += cnt
        second_counts[b] += cnt
        first_served[a] += cnt * float(pair["served_i_mean"])
        second_served[b] += cnt * float(pair["served_j_mean"])
        first_cap[a] += cnt * split
        second_cap[b] += cnt * (capacity - split)
    annual_eff = served_single + first_served + second_served
    cap_total = cap_single + first_cap + second_cap
    with np.errstate(divide="ignore", invalid="ignore"):
        base = np.where(demand > 0, annual_eff / demand, 0.0)
        unmet = np.where(demand > 0, np.maximum(0.0, demand - annual_eff) / demand, 0.0)
        waste = np.where(cap_total > 0, np.maximum(0.0, cap_total - annual_eff) / cap_total, 0.0)
    score = np.clip(base - alpha * unmet - beta * waste, 0.0, 1.0)
    bottom_n = max(1, int(np.ceil(n_sites * 0.10)))
    total_served = float(annual_eff.sum())
    total_demand = float(demand.sum())
    total_unmet = float(np.maximum(0.0, demand - annual_eff).sum())
    total_waste = float(np.maximum(0.0, cap_total - annual_eff).sum())
    return {
        "effectiveness": float(score.mean()),
        "min_eff": float(score.min()),
        "bottom10_eff": float(np.sort(score)[:bottom_n].mean()),
        "gini_eff": float(gini_coefficient(score)),
        "std_eff": float(score.std()),
        "total_unmet": total_unmet,
        "total_waste": total_waste,
        "total_served": total_served,
        "total_demand": total_demand,
        "serve_ratio": float(total_served / total_demand) if total_demand > 0 else 0.0,
        "score_per_site": score,
        "annual_eff": annual_eff,
        "pair_first": first_counts,
        "pair_second": second_counts,
    }
def allocate_with_pairs(
    sites: pd.DataFrame,
    pairs_df: pd.DataFrame,
    k_min: float,
    *,
    n_target: int,
    capacity: float,
    max_iter: int = 30,
) -> Tuple[np.ndarray, np.ndarray, pd.DataFrame, int]:
    """Fixed-point visit allocation with two-stop pairing.

    Every paired trip covers two site visits with one physical trip, so the
    visit budget is n_target plus the number of pairings; iterate until the
    budget stabilizes (or max_iter is reached, returning the last iterate).

    Returns (visits, singles, pairs_with_counts, visit_budget), or four Nones
    when the per-site minimum makes the allocation infeasible.

    Note: `capacity` is unused here; kept for call-site symmetry.
    """
    budget = int(n_target)
    visits = singles = paired = None
    for _ in range(max_iter):
        visits = allocate_visits(sites, k_min, budget)
        if visits is None:
            return None, None, None, None
        paired, singles = assign_pairs(pairs_df, visits)
        next_budget = int(n_target + int(paired["pair_count"].sum()))
        if next_budget == budget:
            break
        budget = next_budget
    return visits, singles, paired, budget
def select_kmin(results: pd.DataFrame) -> float:
    """Choose k_min: the first row with Gini below 0.2 if any (results are in
    ascending k_min order), otherwise the k_min maximizing mean effectiveness."""
    equitable = results.loc[results["gini_eff"] < 0.2]
    if not equitable.empty:
        return float(equitable["k_min"].iloc[0])
    best_row = results["effectiveness"].idxmax()
    return float(results.loc[best_row, "k_min"])
def plot_results(results: pd.DataFrame, output_dir: str) -> float:
    """Render the 4x2 diagnostic grid of metrics vs k_min and save it to
    output_dir/p3_kmin_effectiveness.png at 150 dpi.

    Returns the selected k_min (via select_kmin) so callers can reuse it.

    Raises:
        RuntimeError: when matplotlib failed to import at module load.
    """
    if not _HAS_MPL:
        raise RuntimeError("缺少依赖: matplotlib无法绘图。请先安装 matplotlib 再运行绘图部分。")
    fig, axes = plt.subplots(4, 2, figsize=(12, 13))
    selected_k = select_kmin(results)
    # Grid row numerically closest to the selected k_min (guards against
    # float-grid mismatch when marking the selection).
    selected_idx = (results["k_min"] - selected_k).abs().idxmin()
    selected_eff = float(results.loc[selected_idx, "effectiveness"])
    selected_label = f"Selected k_min={selected_k:.1f}"
    # Panel (0,0): mean effectiveness, selected k_min highlighted.
    ax = axes[0, 0]
    ax.plot(results["k_min"], results["effectiveness"], "b-", lw=2)
    ax.axvline(selected_k, color="r", ls="--", label=selected_label)
    ax.scatter([selected_k], [selected_eff], c="r", s=100, zorder=5)
    ax.set_xlabel("k_min")
    ax.set_ylabel("Mean Effectiveness")
    ax.set_title("Mean Effectiveness vs k_min")
    ax.legend()
    ax.grid(True, alpha=0.3)
    # Panel (0,1): mean effectiveness of the worst 10% of sites.
    ax = axes[0, 1]
    ax.plot(results["k_min"], results["bottom10_eff"], "m-", lw=2)
    ax.axvline(selected_k, color="r", ls="--")
    ax.set_xlabel("k_min")
    ax.set_ylabel("Bottom 10% Mean Effectiveness")
    ax.set_title("Bottom 10% Mean Effectiveness vs k_min")
    ax.grid(True, alpha=0.3)
    # Panel (1,0): total served (thousands) against the fixed total demand.
    ax = axes[1, 0]
    ax.plot(results["k_min"], results["total_served"] / 1000, "c-", lw=2)
    ax.axhline(results["total_demand"].iloc[0] / 1000, color="gray", ls=":", label="Total Demand")
    ax.axvline(selected_k, color="r", ls="--")
    ax.set_xlabel("k_min")
    ax.set_ylabel("Served Families (×1000)")
    ax.set_title("Total Served vs k_min")
    ax.legend()
    ax.grid(True, alpha=0.3)
    # Panel (1,1): worst single-site effectiveness.
    ax = axes[1, 1]
    ax.plot(results["k_min"], results["min_eff"], "g-", lw=2)
    ax.axvline(selected_k, color="r", ls="--")
    ax.set_xlabel("k_min")
    ax.set_ylabel("Min Effectiveness")
    ax.set_title("Worst Site Effectiveness vs k_min")
    ax.grid(True, alpha=0.3)
    # Panel (2,0): unmet demand vs wasted capacity, both in thousands.
    ax = axes[2, 0]
    ax.plot(results["k_min"], results["unmet"] / 1000, "r-", lw=2, label="Unmet")
    ax.plot(results["k_min"], results["waste"] / 1000, "b-", lw=2, label="Waste")
    ax.axvline(selected_k, color="gray", ls="--")
    ax.set_xlabel("k_min")
    ax.set_ylabel("Families (×1000)")
    ax.set_title("Unmet Demand vs Wasted Capacity")
    ax.legend()
    ax.grid(True, alpha=0.3)
    # Panel (2,1): dispersion of per-site effectiveness.
    ax = axes[2, 1]
    ax.plot(results["k_min"], results["std_eff"], color="tab:orange", lw=2)
    ax.axvline(selected_k, color="gray", ls="--")
    ax.set_xlabel("k_min")
    ax.set_ylabel("Std Effectiveness")
    ax.set_title("Effectiveness Std vs k_min")
    ax.grid(True, alpha=0.3)
    # Panel (3,0): Gini of effectiveness; dotted line marks the 0.2 threshold
    # used by select_kmin.
    ax = axes[3, 0]
    ax.plot(results["k_min"], results["gini_eff"], color="tab:purple", lw=2)
    ax.axhline(0.2, color="gray", ls=":", lw=1)
    ax.axvline(selected_k, color="r", ls="--")
    ax.set_xlabel("k_min")
    ax.set_ylabel("Gini Coefficient")
    ax.set_title("Gini (Effectiveness) vs k_min")
    ax.grid(True, alpha=0.3)
    # Bottom-right panel intentionally blank.
    axes[3, 1].axis("off")
    plt.tight_layout()
    os.makedirs(output_dir, exist_ok=True)
    plt.savefig(os.path.join(output_dir, "p3_kmin_effectiveness.png"), dpi=150)
    plt.close(fig)
    return selected_k
def main() -> None:
    """CLI entry point: sweep k_min over a grid, score each allocation, select
    the best k_min, and write per-site / per-pair CSVs plus the diagnostic plot
    (when matplotlib is available)."""
    parser = argparse.ArgumentParser(description="Task 3 k-min allocation with two-stop pairing.")
    parser.add_argument("--input-xlsx", default=INPUT_XLSX)
    parser.add_argument("--input-pairs", default=INPUT_PAIRS)
    parser.add_argument("--output-dir", default=OUTPUT_DIR)
    parser.add_argument("--kmin-start", type=float, default=1.0)
    parser.add_argument("--kmin-end", type=float, default=10.0)
    parser.add_argument("--kmin-step", type=float, default=0.1)
    parser.add_argument("--capacity", type=float, default=C_OPT)
    parser.add_argument("--n-target", type=int, default=N_TARGET)
    parser.add_argument("--alpha", type=float, default=ALPHA)
    parser.add_argument("--beta", type=float, default=BETA)
    args = parser.parse_args()
    sites = load_sites(args.input_xlsx)
    pairs = pd.read_csv(args.input_pairs)
    required_cols = {
        "site_i_idx",
        "site_j_idx",
        "score_mean",
        "q_opt",
        "served_i_mean",
        "served_j_mean",
        "distance_miles",
    }
    missing = required_cols.difference(pairs.columns)
    if missing:
        raise ValueError(f"Missing columns in pairs CSV: {sorted(missing)}")
    # Greedy pairing priority: best score first, shorter distance breaks ties.
    pairs = pairs.sort_values(
        ["score_mean", "distance_miles"], ascending=[False, True]
    ).reset_index(drop=True)
    # 1e-9 slack keeps the endpoint in the grid despite float stepping.
    k_range = np.arange(args.kmin_start, args.kmin_end + 1e-9, args.kmin_step)
    results = []
    for k_min in k_range:
        visits, singles, pairs_with_counts, n_total_guess = allocate_with_pairs(
            sites,
            pairs,
            float(k_min),
            n_target=args.n_target,
            capacity=args.capacity,
        )
        if visits is None:
            continue  # infeasible k_min: the per-site minimums exceed the budget
        metrics = _compute_metrics(
            sites,
            visits,
            pairs_with_counts,
            singles,
            alpha=args.alpha,
            beta=args.beta,
            capacity=args.capacity,
        )
        pair_total = int(pairs_with_counts["pair_count"].sum())
        # Each pairing merges two visits into one physical trip.
        total_trips = int(visits.sum() - pair_total)
        results.append(
            {
                "k_min": float(k_min),
                "effectiveness": metrics["effectiveness"],
                "min_eff": metrics["min_eff"],
                "bottom10_eff": metrics["bottom10_eff"],
                "gini_eff": metrics["gini_eff"],
                "std_eff": metrics["std_eff"],
                "unmet": metrics["total_unmet"],
                "waste": metrics["total_waste"],
                "total_served": metrics["total_served"],
                "total_demand": metrics["total_demand"],
                "serve_ratio": metrics["serve_ratio"],
                "total_visits_allocated": int(visits.sum()),
                "pair_trips": pair_total,
                "total_trips": total_trips,
                "n_total_guess": int(n_total_guess),
            }
        )
    results_df = pd.DataFrame(results)
    if results_df.empty:
        raise RuntimeError("No feasible k_min values found.")
    best_k = select_kmin(results_df)
    # Re-run the allocation at the chosen k_min to recover per-site detail.
    visits, singles, pairs_with_counts, n_total_guess = allocate_with_pairs(
        sites,
        pairs,
        float(best_k),
        n_target=args.n_target,
        capacity=args.capacity,
    )
    if visits is None:
        # Defensive: best_k was drawn from a feasible sweep row, so this
        # should be unreachable; fail loudly rather than crash downstream.
        raise RuntimeError(f"Selected k_min={best_k} is unexpectedly infeasible.")
    metrics = _compute_metrics(
        sites,
        visits,
        pairs_with_counts,
        singles,
        alpha=args.alpha,
        beta=args.beta,
        capacity=args.capacity,
    )
    pair_total = int(pairs_with_counts["pair_count"].sum())
    total_trips = int(visits.sum() - pair_total)
    site_rows = pd.DataFrame(
        {
            "site_idx": sites["site_idx"],
            "site_name": sites["site_name"],
            "total_visits_allocated": visits,
            "single_visits": singles,
            "paired_first": metrics["pair_first"].astype(int),
            "paired_second": metrics["pair_second"].astype(int),
            "paired_total": (metrics["pair_first"] + metrics["pair_second"]).astype(int),
        }
    )
    pairs_out = pairs_with_counts.loc[pairs_with_counts["pair_count"] > 0].copy()
    pairs_out = pairs_out.sort_values(["pair_count", "score_mean"], ascending=[False, False])
    os.makedirs(args.output_dir, exist_ok=True)
    saved_files = [
        os.path.join(args.output_dir, "p3_kmin_data.csv"),
        os.path.join(args.output_dir, "p3_kmin_sites.csv"),
        os.path.join(args.output_dir, "p3_kmin_pairs.csv"),
    ]
    results_df.to_csv(saved_files[0], index=False)
    site_rows.to_csv(saved_files[1], index=False)
    pairs_out.to_csv(saved_files[2], index=False)
    if _HAS_MPL:
        plot_results(results_df, args.output_dir)
        saved_files.append(os.path.join(args.output_dir, "p3_kmin_effectiveness.png"))
    else:
        print("未检测到 matplotlib跳过绘图仍会保存CSV结果")
    print(f"Best k_min={best_k:.1f} (total_trips={total_trips}, pair_trips={pair_total})")
    # Bug fix: the old message hard-coded "data/" paths and always listed the
    # PNG even when --output-dir differed or plotting was skipped.
    print("Saved: " + ", ".join(saved_files))
# Script entry point: run the CLI only when executed directly, not on import.
if __name__ == "__main__":
    main()