"""
Task 3: k-min allocation with ordered two-stop pairing.

- Uses 2019 data to allocate visit frequencies.
- Pairs are drawn from ordered_pairs_allocation_k6_cap250.csv (ordered i->j).
- Total annual trips (paired + single) are fixed to N_TARGET via fixed-point
  adjustment.
"""

from __future__ import annotations

import argparse
import os
import math
from typing import Dict, List, Optional, Tuple

import numpy as np
import pandas as pd

try:
    import matplotlib

    matplotlib.use("Agg")
    import matplotlib.pyplot as plt

    _HAS_MPL = True
except ModuleNotFoundError:
    # Plotting is optional; CSV outputs are still produced without matplotlib.
    plt = None
    _HAS_MPL = False

INPUT_XLSX = "prob/MFP Regular Sites 2019.xlsx"
INPUT_PAIRS = "data/ordered_pairs_allocation_k6_cap250.csv"
OUTPUT_DIR = "data"

C_OPT = 250
N_TARGET = 730
ALPHA = 0.6
BETA = 0.2
N_SIMS = 2000
RANDOM_SEED = 606


def gini_coefficient(values: np.ndarray) -> float:
    """Return the Gini coefficient of *values*.

    Non-finite entries are dropped and negative entries are clipped to zero.
    Returns 0.0 when nothing remains or when the clipped total is not positive.
    """
    x = np.asarray(values, dtype=float)
    x = x[np.isfinite(x)]
    if x.size == 0:
        return 0.0
    x = np.clip(x, 0, None)
    total = x.sum()
    if total <= 0:
        return 0.0
    x_sorted = np.sort(x)
    n = x_sorted.size
    idx = np.arange(1, n + 1, dtype=float)
    return float((2.0 * (idx * x_sorted).sum()) / (n * total) - (n + 1.0) / n)


def _norm_pdf(z):
    """Standard normal density evaluated at *z*."""
    return np.exp(-0.5 * z * z) / np.sqrt(2.0 * np.pi)


def _norm_cdf(z):
    """Standard normal CDF evaluated element-wise at *z* via ``math.erf``."""
    z = np.asarray(z, dtype=float)
    # otypes is given explicitly so that empty inputs are handled by vectorize.
    erf_vec = np.vectorize(math.erf, otypes=[float])
    return 0.5 * (1.0 + erf_vec(z / np.sqrt(2.0)))


def expected_clipped_normal(mu, sigma, lower=0.0, upper=1.0):
    """Return E[clip(X, lower, upper)] for X ~ Normal(mu, sigma).

    *mu* and *sigma* are broadcast against each other. Entries whose sigma is
    not strictly positive are treated as deterministic and simply clipped.

    Raises:
        ValueError: if ``lower > upper``.
    """
    lower = float(lower)
    upper = float(upper)
    if lower > upper:
        raise ValueError("lower must be <= upper")

    mu_arr, sigma_arr = np.broadcast_arrays(
        np.asarray(mu, dtype=float), np.asarray(sigma, dtype=float)
    )
    # Start from the deterministic answer; np.array() guarantees a writable
    # ndarray even when np.clip hands back a scalar for 0-d input.
    out = np.array(np.clip(mu_arr, lower, upper), dtype=float)

    mask = sigma_arr > 0
    if np.any(mask):
        m = mu_arr[mask]
        s = sigma_arr[mask]
        z_u = (upper - m) / s
        z_l = (lower - m) / s
        Phi_u = _norm_cdf(z_u)
        Phi_l = _norm_cdf(z_l)
        phi_u = _norm_pdf(z_u)
        phi_l = _norm_pdf(z_l)
        # Partial expectation E[X; X <= a] = m * Phi(z_a) - s * phi(z_a).
        ex_le_u = m * Phi_u - s * phi_u
        ex_le_l = m * Phi_l - s * phi_l
        p_le_l = Phi_l
        p_gt_u = 1.0 - Phi_u
        out[mask] = lower * p_le_l + (ex_le_u - ex_le_l) + upper * p_gt_u
    return out


def _find_col(df: pd.DataFrame, candidates: List[str]) -> str:
    """Return the first column of *df* matching one of *candidates*.

    Exact matches are tried first, then case-insensitive matches.

    Raises:
        ValueError: if no candidate matches any column.
    """
    for name in candidates:
        if name in df.columns:
            return name
    lower_map = {c.lower(): c for c in df.columns}
    for name in candidates:
        key = name.lower()
        if key in lower_map:
            return lower_map[key]
    raise ValueError(f"Missing required column. Tried: {candidates}")


def load_sites(path: str) -> pd.DataFrame:
    """Load the site table from the Excel file at *path*.

    The result has columns ``site_name``, ``mu``, ``sigma``, ``visits_2019``,
    ``site_idx`` (1-based row number) and ``TotalDemand`` (mu * visits_2019).
    A missing/non-numeric sigma becomes 0.0.

    Raises:
        ValueError: if a required column is missing, or if mu / visits_2019
            is missing or non-numeric for any row.
    """
    df = pd.read_excel(path)
    col_site = _find_col(df, ["Site Name", "site name", "site"])
    col_mu = _find_col(
        df, ["Average Demand per Visit", "average demand per visit", "avg demand"]
    )
    col_sigma = _find_col(
        df, ["StDev(Demand per Visit)", "stdev(demand per visit)", "stdev", "std"]
    )
    col_visits = _find_col(
        df, ["Number of Visits in 2019", "number of visits in 2019", "visits"]
    )

    out = df[[col_site, col_mu, col_sigma, col_visits]].copy()
    out.columns = ["site_name", "mu", "sigma", "visits_2019"]
    out["mu"] = pd.to_numeric(out["mu"], errors="coerce")
    out["sigma"] = pd.to_numeric(out["sigma"], errors="coerce").fillna(0.0)
    out["visits_2019"] = pd.to_numeric(out["visits_2019"], errors="coerce")

    if out[["mu", "visits_2019"]].isna().any().any():
        missing = out[out[["mu", "visits_2019"]].isna().any(axis=1)]
        raise ValueError(f"Missing mu/visits_2019 for {len(missing)} rows.")

    out = out.reset_index(drop=True)
    out["site_idx"] = np.arange(1, len(out) + 1, dtype=int)
    out["TotalDemand"] = out["mu"] * out["visits_2019"]
    return out


def allocate_visits(
    df: pd.DataFrame, k_min_real: float, n_total: int
) -> Optional[np.ndarray]:
    """Distribute *n_total* visits over the sites in *df*.

    Each site first receives a floor of ``floor(k_min_real)`` or
    ``ceil(k_min_real)`` visits (the ceil share goes to the sites with the
    largest ``TotalDemand``, in proportion to the fractional part). The rest
    of the budget is spread proportionally to ``TotalDemand`` and rounded;
    any rounding surplus/deficit is then corrected one visit at a time.

    Args:
        df: must contain ``TotalDemand`` and ``site_idx`` columns.
        k_min_real: real-valued minimum visits per site.
        n_total: total number of visits to hand out.

    Returns:
        Integer visit counts ordered by ``site_idx``, or ``None`` when the
        floors alone already exceed *n_total*.
    """
    df_sorted = df.sort_values("TotalDemand").reset_index(drop=False)
    n = len(df_sorted)

    k_floor = int(np.floor(k_min_real))
    k_ceil = int(np.ceil(k_min_real))
    frac = k_min_real - k_floor
    n_ceil = int(round(n * frac))
    n_floor = n - n_ceil
    # df_sorted is ascending in demand, so the ceil floors land on the
    # highest-demand sites.
    k_base = np.array([k_floor] * n_floor + [k_ceil] * n_ceil, dtype=int)

    n_free = int(n_total - int(k_base.sum()))
    if n_free < 0:
        return None
    if n == 0:
        return np.zeros(0, dtype=int)

    demand = df_sorted["TotalDemand"]
    total_demand = demand.sum()
    if total_demand > 0:
        weights = demand / total_demand
    else:
        # No demand signal at all: avoid NaN weights and share evenly.
        weights = pd.Series(np.full(n, 1.0 / n), index=df_sorted.index)

    allocated = (k_base + n_free * weights.values).round().astype(int)
    allocated = np.maximum(allocated, k_base)

    diff = int(n_total - allocated.sum())
    if diff != 0:
        # Deficit: give to the heaviest sites first.
        # Surplus: take from the lightest sites first, but never below the
        # guaranteed floor.
        order = weights.sort_values(ascending=(diff < 0)).index.tolist()
        step = 1 if diff > 0 else -1
        while diff != 0:
            changed = False
            for pos in order:
                if diff == 0:
                    break
                if step < 0 and allocated[pos] <= k_base[pos]:
                    continue
                allocated[pos] += step
                diff -= step
                changed = True
            if not changed:
                # Nothing left to adjust; stop rather than spin forever.
                break

    alloc_sorted = df_sorted[["site_idx"]].copy()
    alloc_sorted["AllocatedVisits"] = allocated
    alloc = alloc_sorted.sort_values("site_idx")["AllocatedVisits"].to_numpy(dtype=int)
    return alloc


def assign_pairs(
    pairs_df: pd.DataFrame, visits: np.ndarray
) -> Tuple[pd.DataFrame, np.ndarray]:
    """Greedily convert per-site visits into ordered two-stop pair trips.

    Rows of *pairs_df* are processed in their current order. Each row takes
    as many trips as both of its sites still have visits for.

    Args:
        pairs_df: must contain 1-based ``site_i_idx`` and ``site_j_idx``.
        visits: visits per site, positioned by ``site_idx - 1``.

    Returns:
        ``(pairs, remaining)`` where *pairs* is a copy of *pairs_df* with an
        added ``pair_count`` column and *remaining* holds the visits per site
        that were not absorbed by any pair.
    """
    remaining = np.asarray(visits).astype(int)
    pair_counts = np.zeros(len(pairs_df), dtype=int)

    site_i = pairs_df["site_i_idx"].to_numpy()
    site_j = pairs_df["site_j_idx"].to_numpy()
    # Enumerate positions instead of trusting the frame's index labels, which
    # are only positional when the caller has reset the index.
    for pos, (raw_i, raw_j) in enumerate(zip(site_i, site_j)):
        i = int(raw_i) - 1
        j = int(raw_j) - 1
        if i == j:
            # A site paired with itself would be decremented twice.
            continue
        t = int(min(remaining[i], remaining[j]))
        if t <= 0:
            continue
        pair_counts[pos] = t
        remaining[i] -= t
        remaining[j] -= t

    out = pairs_df.copy()
    out["pair_count"] = pair_counts
    return out, remaining


def _compute_metrics(
    sites: pd.DataFrame,
    visits: np.ndarray,
    pairs_with_counts: pd.DataFrame,
    singles: np.ndarray,
    *,
    alpha: float,
    beta: float,
    capacity: float,
) -> Dict[str, float]:
    """Compute per-site and aggregate effectiveness metrics for an allocation.

    Per site, ``score = clip(served/demand - alpha*unmet - beta*waste, 0, 1)``
    where ``unmet`` is the unserved share of demand and ``waste`` the unused
    share of deployed capacity.

    Args:
        sites: needs ``mu``, ``sigma`` and ``TotalDemand`` columns.
        visits: accepted for signature compatibility; not read here.
        pairs_with_counts: pair table with ``pair_count``, ``site_i_idx``,
            ``site_j_idx``, ``q_opt``, ``served_i_mean``, ``served_j_mean``.
        singles: single-stop visits per site.
        alpha: penalty weight on the unmet share.
        beta: penalty weight on the waste share.
        capacity: capacity per trip.

    Returns:
        Dict of scalar summaries plus the arrays ``score_per_site``,
        ``annual_eff``, ``pair_first`` and ``pair_second``.
    """
    n = len(sites)
    mu = sites["mu"].to_numpy(dtype=float)
    sigma = sites["sigma"].to_numpy(dtype=float)
    demand = sites["TotalDemand"].to_numpy(dtype=float)

    eff_single = expected_clipped_normal(mu, sigma, lower=0.0, upper=capacity)
    served_single = singles * eff_single
    cap_single = singles * capacity

    pair_first = np.zeros(n, dtype=float)
    pair_second = np.zeros(n, dtype=float)
    served_first = np.zeros(n, dtype=float)
    served_second = np.zeros(n, dtype=float)
    cap_first = np.zeros(n, dtype=float)
    cap_second = np.zeros(n, dtype=float)

    pair_columns = [
        pairs_with_counts[name].to_numpy()
        for name in (
            "pair_count",
            "site_i_idx",
            "site_j_idx",
            "q_opt",
            "served_i_mean",
            "served_j_mean",
        )
    ]
    for raw_count, raw_i, raw_j, raw_q, raw_si, raw_sj in zip(*pair_columns):
        count = int(raw_count)
        if count <= 0:
            continue
        i = int(raw_i) - 1
        j = int(raw_j) - 1
        q_opt = float(raw_q)
        served_i = float(raw_si)
        served_j = float(raw_sj)
        pair_first[i] += count
        pair_second[j] += count
        served_first[i] += count * served_i
        served_second[j] += count * served_j
        # The first stop is credited q_opt of the capacity, the second the rest.
        cap_first[i] += count * q_opt
        cap_second[j] += count * (capacity - q_opt)

    annual_eff = served_single + served_first + served_second
    cap_total = cap_single + cap_first + cap_second

    # np.where evaluates both branches, so silence the 0/0 warnings.
    with np.errstate(divide="ignore", invalid="ignore"):
        base = np.where(demand > 0, annual_eff / demand, 0.0)
        unmet = np.where(
            demand > 0, np.maximum(0.0, demand - annual_eff) / demand, 0.0
        )
        waste = np.where(
            cap_total > 0, np.maximum(0.0, cap_total - annual_eff) / cap_total, 0.0
        )
    score = np.clip(base - alpha * unmet - beta * waste, 0.0, 1.0)

    bottom_n = max(1, int(np.ceil(n * 0.10)))
    total_served = float(annual_eff.sum())
    total_demand = float(demand.sum())
    total_unmet = float(np.maximum(0.0, demand - annual_eff).sum())
    total_waste = float(np.maximum(0.0, cap_total - annual_eff).sum())

    return {
        "effectiveness": float(score.mean()),
        "min_eff": float(score.min()),
        "bottom10_eff": float(np.sort(score)[:bottom_n].mean()),
        "gini_eff": float(gini_coefficient(score)),
        "std_eff": float(score.std()),
        "total_unmet": total_unmet,
        "total_waste": total_waste,
        "total_served": total_served,
        "total_demand": total_demand,
        "serve_ratio": float(total_served / total_demand) if total_demand > 0 else 0.0,
        "score_per_site": score,
        "annual_eff": annual_eff,
        "pair_first": pair_first,
        "pair_second": pair_second,
    }


def allocate_with_pairs(
    sites: pd.DataFrame,
    pairs_df: pd.DataFrame,
    k_min: float,
    *,
    n_target: int,
    capacity: float,
    max_iter: int = 30,
) -> Tuple[
    Optional[np.ndarray],
    Optional[np.ndarray],
    Optional[pd.DataFrame],
    Optional[int],
]:
    """Allocate visits and pairs so that total trips equal *n_target*.

    A pair trip covers two site visits, so the visit budget is iterated as
    ``visits = n_target + pair_trips`` until it stops changing or *max_iter*
    is reached.

    Args:
        sites: site table accepted by ``allocate_visits``.
        pairs_df: candidate pairs accepted by ``assign_pairs``.
        k_min: real-valued minimum visits per site.
        n_target: desired number of trips (pair trips + single visits).
        capacity: accepted for signature compatibility; not read here.
        max_iter: maximum number of fixed-point iterations.

    Returns:
        ``(visits, singles, pairs_with_counts, n_total_guess)`` where
        *n_total_guess* is the visit budget that produced the returned
        allocation, or four ``None`` values when *k_min* is infeasible.

    Raises:
        ValueError: if ``max_iter < 1``.
    """
    if max_iter < 1:
        raise ValueError("max_iter must be >= 1")

    guess = int(n_target)
    result = (None, None, None, None)
    for _ in range(max_iter):
        visits = allocate_visits(sites, k_min, guess)
        if visits is None:
            return None, None, None, None
        pairs_with_counts, singles = assign_pairs(pairs_df, visits)
        # Record the budget actually used, so a non-converged run still
        # reports numbers that belong together.
        result = (visits, singles, pairs_with_counts, guess)
        pair_total = int(pairs_with_counts["pair_count"].sum())
        next_guess = int(n_target + pair_total)
        if next_guess == guess:
            break
        guess = next_guess
    return result


def select_kmin(results: pd.DataFrame) -> float:
    """Pick the k_min to report from the sweep table *results*.

    Returns the first row's ``k_min`` whose ``gini_eff`` is below 0.2; if no
    row qualifies, the ``k_min`` with the highest ``effectiveness``.
    """
    gini_candidates = results.loc[results["gini_eff"] < 0.2, "k_min"]
    if len(gini_candidates) > 0:
        return float(gini_candidates.iloc[0])
    idx = results["effectiveness"].idxmax()
    return float(results.loc[idx, "k_min"])


def plot_results(results: pd.DataFrame, output_dir: str) -> float:
    """Plot the sweep metrics and save ``p3_kmin_effectiveness.png``.

    Args:
        results: sweep table as built in ``main``.
        output_dir: directory for the PNG (created if missing).

    Returns:
        The k_min chosen by ``select_kmin``.

    Raises:
        RuntimeError: if matplotlib is not installed.
    """
    if not _HAS_MPL:
        raise RuntimeError("缺少依赖: matplotlib(无法绘图)。请先安装 matplotlib 再运行绘图部分。")

    fig, axes = plt.subplots(4, 2, figsize=(12, 13))
    selected_k = select_kmin(results)
    selected_idx = (results["k_min"] - selected_k).abs().idxmin()
    selected_eff = float(results.loc[selected_idx, "effectiveness"])
    selected_label = f"Selected k_min={selected_k:.1f}"

    ax = axes[0, 0]
    ax.plot(results["k_min"], results["effectiveness"], "b-", lw=2)
    ax.axvline(selected_k, color="r", ls="--", label=selected_label)
    ax.scatter([selected_k], [selected_eff], c="r", s=100, zorder=5)
    ax.set_xlabel("k_min")
    ax.set_ylabel("Mean Effectiveness")
    ax.set_title("Mean Effectiveness vs k_min")
    ax.legend()
    ax.grid(True, alpha=0.3)

    ax = axes[0, 1]
    ax.plot(results["k_min"], results["bottom10_eff"], "m-", lw=2)
    ax.axvline(selected_k, color="r", ls="--")
    ax.set_xlabel("k_min")
    ax.set_ylabel("Bottom 10% Mean Effectiveness")
    ax.set_title("Bottom 10% Mean Effectiveness vs k_min")
    ax.grid(True, alpha=0.3)

    ax = axes[1, 0]
    ax.plot(results["k_min"], results["total_served"] / 1000, "c-", lw=2)
    ax.axhline(
        results["total_demand"].iloc[0] / 1000,
        color="gray",
        ls=":",
        label="Total Demand",
    )
    ax.axvline(selected_k, color="r", ls="--")
    ax.set_xlabel("k_min")
    ax.set_ylabel("Served Families (×1000)")
    ax.set_title("Total Served vs k_min")
    ax.legend()
    ax.grid(True, alpha=0.3)

    ax = axes[1, 1]
    ax.plot(results["k_min"], results["min_eff"], "g-", lw=2)
    ax.axvline(selected_k, color="r", ls="--")
    ax.set_xlabel("k_min")
    ax.set_ylabel("Min Effectiveness")
    ax.set_title("Worst Site Effectiveness vs k_min")
    ax.grid(True, alpha=0.3)

    ax = axes[2, 0]
    ax.plot(results["k_min"], results["unmet"] / 1000, "r-", lw=2, label="Unmet")
    ax.plot(results["k_min"], results["waste"] / 1000, "b-", lw=2, label="Waste")
    ax.axvline(selected_k, color="gray", ls="--")
    ax.set_xlabel("k_min")
    ax.set_ylabel("Families (×1000)")
    ax.set_title("Unmet Demand vs Wasted Capacity")
    ax.legend()
    ax.grid(True, alpha=0.3)

    ax = axes[2, 1]
    ax.plot(results["k_min"], results["std_eff"], color="tab:orange", lw=2)
    ax.axvline(selected_k, color="gray", ls="--")
    ax.set_xlabel("k_min")
    ax.set_ylabel("Std Effectiveness")
    ax.set_title("Effectiveness Std vs k_min")
    ax.grid(True, alpha=0.3)

    ax = axes[3, 0]
    ax.plot(results["k_min"], results["gini_eff"], color="tab:purple", lw=2)
    # Reference line at the Gini threshold used by select_kmin.
    ax.axhline(0.2, color="gray", ls=":", lw=1)
    ax.axvline(selected_k, color="r", ls="--")
    ax.set_xlabel("k_min")
    ax.set_ylabel("Gini Coefficient")
    ax.set_title("Gini (Effectiveness) vs k_min")
    ax.grid(True, alpha=0.3)

    axes[3, 1].axis("off")

    plt.tight_layout()
    os.makedirs(output_dir, exist_ok=True)
    plt.savefig(os.path.join(output_dir, "p3_kmin_effectiveness.png"), dpi=150)
    plt.close(fig)
    return selected_k


def main() -> None:
    """Sweep k_min, pick the best value, and write CSV (and PNG) outputs.

    Raises:
        ValueError: if the pairs CSV lacks a required column.
        RuntimeError: if no k_min in the sweep is feasible.
    """
    parser = argparse.ArgumentParser(
        description="Task 3 k-min allocation with two-stop pairing."
    )
    parser.add_argument("--input-xlsx", default=INPUT_XLSX)
    parser.add_argument("--input-pairs", default=INPUT_PAIRS)
    parser.add_argument("--output-dir", default=OUTPUT_DIR)
    parser.add_argument("--kmin-start", type=float, default=1.0)
    parser.add_argument("--kmin-end", type=float, default=10.0)
    parser.add_argument("--kmin-step", type=float, default=0.1)
    parser.add_argument("--capacity", type=float, default=C_OPT)
    parser.add_argument("--n-target", type=int, default=N_TARGET)
    parser.add_argument("--alpha", type=float, default=ALPHA)
    parser.add_argument("--beta", type=float, default=BETA)
    args = parser.parse_args()

    if args.kmin_step <= 0:
        parser.error("--kmin-step must be positive")

    sites = load_sites(args.input_xlsx)
    pairs = pd.read_csv(args.input_pairs)

    required_cols = {
        "site_i_idx",
        "site_j_idx",
        "score_mean",
        "q_opt",
        "served_i_mean",
        "served_j_mean",
        "distance_miles",
    }
    missing = required_cols.difference(pairs.columns)
    if missing:
        raise ValueError(f"Missing columns in pairs CSV: {sorted(missing)}")

    # Greedy pairing consumes rows in order: best score first, shorter
    # distance breaking ties.
    pairs = pairs.sort_values(
        ["score_mean", "distance_miles"], ascending=[False, True]
    ).reset_index(drop=True)

    # The small epsilon keeps kmin_end itself inside the sweep.
    k_range = np.arange(args.kmin_start, args.kmin_end + 1e-9, args.kmin_step)
    results = []

    for k_min in k_range:
        visits, singles, pairs_with_counts, n_total_guess = allocate_with_pairs(
            sites,
            pairs,
            float(k_min),
            n_target=args.n_target,
            capacity=args.capacity,
        )
        if visits is None:
            # Floors alone exceed the budget for this k_min.
            continue

        metrics = _compute_metrics(
            sites,
            visits,
            pairs_with_counts,
            singles,
            alpha=args.alpha,
            beta=args.beta,
            capacity=args.capacity,
        )
        pair_total = int(pairs_with_counts["pair_count"].sum())
        # Each pair trip covers two visits, so trips = visits - pair trips.
        total_trips = int(visits.sum() - pair_total)

        row = {
            "k_min": float(k_min),
            "effectiveness": metrics["effectiveness"],
            "min_eff": metrics["min_eff"],
            "bottom10_eff": metrics["bottom10_eff"],
            "gini_eff": metrics["gini_eff"],
            "std_eff": metrics["std_eff"],
            "unmet": metrics["total_unmet"],
            "waste": metrics["total_waste"],
            "total_served": metrics["total_served"],
            "total_demand": metrics["total_demand"],
            "serve_ratio": metrics["serve_ratio"],
            "total_visits_allocated": int(visits.sum()),
            "pair_trips": pair_total,
            "total_trips": total_trips,
            "n_total_guess": int(n_total_guess),
        }
        results.append(row)

    results_df = pd.DataFrame(results)
    if len(results_df) == 0:
        raise RuntimeError("No feasible k_min values found.")

    best_k = select_kmin(results_df)

    # Re-run the allocation at the chosen k_min to get per-site detail.
    visits, singles, pairs_with_counts, n_total_guess = allocate_with_pairs(
        sites,
        pairs,
        float(best_k),
        n_target=args.n_target,
        capacity=args.capacity,
    )
    metrics = _compute_metrics(
        sites,
        visits,
        pairs_with_counts,
        singles,
        alpha=args.alpha,
        beta=args.beta,
        capacity=args.capacity,
    )
    pair_total = int(pairs_with_counts["pair_count"].sum())
    total_trips = int(visits.sum() - pair_total)

    site_rows = pd.DataFrame(
        {
            "site_idx": sites["site_idx"],
            "site_name": sites["site_name"],
            "total_visits_allocated": visits,
            "single_visits": singles,
            "paired_first": metrics["pair_first"].astype(int),
            "paired_second": metrics["pair_second"].astype(int),
            "paired_total": (
                metrics["pair_first"] + metrics["pair_second"]
            ).astype(int),
        }
    )

    pairs_out = pairs_with_counts.loc[pairs_with_counts["pair_count"] > 0].copy()
    pairs_out = pairs_out.sort_values(
        ["pair_count", "score_mean"], ascending=[False, False]
    )

    os.makedirs(args.output_dir, exist_ok=True)
    data_path = os.path.join(args.output_dir, "p3_kmin_data.csv")
    sites_path = os.path.join(args.output_dir, "p3_kmin_sites.csv")
    pairs_path = os.path.join(args.output_dir, "p3_kmin_pairs.csv")
    results_df.to_csv(data_path, index=False)
    site_rows.to_csv(sites_path, index=False)
    pairs_out.to_csv(pairs_path, index=False)
    saved = [data_path, sites_path, pairs_path]

    if _HAS_MPL:
        plot_results(results_df, args.output_dir)
        saved.append(os.path.join(args.output_dir, "p3_kmin_effectiveness.png"))
    else:
        print("未检测到 matplotlib,跳过绘图(仍会保存CSV结果)。")

    print(
        f"Best k_min={best_k:.1f} (total_trips={total_trips}, pair_trips={pair_total})"
    )
    # Report only the files that were really written, at their real location.
    print("Saved: " + ", ".join(saved))


if __name__ == "__main__":
    main()