"""
Task 3: k-min allocation with ordered two-stop pairing.

- Uses 2019 data to allocate visit frequencies.
- Pairs are drawn from ordered_pairs_allocation_k6_cap250.csv (ordered i->j).
- Total annual trips (paired + single) are fixed to N_TARGET via fixed-point adjustment.
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import argparse
|
||
import os
|
||
import math
|
||
from typing import Dict, List, Tuple
|
||
|
||
import numpy as np
|
||
import pandas as pd
|
||
|
||
try:
|
||
import matplotlib
|
||
|
||
matplotlib.use("Agg")
|
||
import matplotlib.pyplot as plt
|
||
|
||
_HAS_MPL = True
|
||
except ModuleNotFoundError:
|
||
plt = None
|
||
_HAS_MPL = False
|
||
|
||
|
||
INPUT_XLSX = "prob/MFP Regular Sites 2019.xlsx"  # 2019 per-site demand workbook
INPUT_PAIRS = "data/ordered_pairs_allocation_k6_cap250.csv"  # ordered (i -> j) pair candidates
OUTPUT_DIR = "data"  # default directory for CSV/PNG outputs

C_OPT = 250  # capacity per trip (families), also the clip ceiling per visit
N_TARGET = 730  # target total annual trips (paired + single)
ALPHA = 0.6  # penalty weight on unmet demand in the effectiveness score
BETA = 0.2  # penalty weight on wasted capacity in the effectiveness score
N_SIMS = 2000  # NOTE(review): not referenced anywhere in this file — confirm still needed
RANDOM_SEED = 606  # NOTE(review): not referenced anywhere in this file — confirm still needed
|
||
|
||
|
||
def gini_coefficient(values: np.ndarray) -> float:
    """Return the Gini coefficient of the finite, non-negative entries of *values*.

    Non-finite entries are dropped and negatives are clipped to zero; an empty
    or all-zero input yields 0.0 (perfect equality by convention).
    """
    data = np.asarray(values, dtype=float)
    data = np.clip(data[np.isfinite(data)], 0.0, None)
    if data.size == 0 or data.sum() <= 0:
        return 0.0

    ordered = np.sort(data)
    count = ordered.size
    ranks = np.arange(1.0, count + 1.0)
    # Rank-weighted form: G = 2*sum(i*x_i)/(n*sum(x)) - (n+1)/n.
    weighted = float(np.dot(ranks, ordered))
    return float(2.0 * weighted / (count * ordered.sum()) - (count + 1.0) / count)
|
||
|
||
|
||
def _norm_pdf(z):
|
||
return np.exp(-0.5 * z * z) / np.sqrt(2.0 * np.pi)
|
||
|
||
|
||
def _norm_cdf(z):
|
||
z = np.asarray(z, dtype=float)
|
||
erf_vec = np.vectorize(math.erf, otypes=[float])
|
||
return 0.5 * (1.0 + erf_vec(z / np.sqrt(2.0)))
|
||
|
||
|
||
def expected_clipped_normal(mu, sigma, lower=0.0, upper=1.0):
    """Expected value of clip(X, lower, upper) for X ~ Normal(mu, sigma), elementwise.

    Entries with sigma <= 0 are treated as deterministic and simply clipped.
    Raises ValueError when lower > upper.
    """
    mean = np.asarray(mu, dtype=float)
    std = np.asarray(sigma, dtype=float)
    lo = float(lower)
    hi = float(upper)

    if lo > hi:
        raise ValueError("lower must be <= upper")

    result = np.empty_like(mean, dtype=float)
    stochastic = std > 0
    # Degenerate entries (sigma <= 0): the clipped value is just clip(mu).
    result[~stochastic] = np.clip(mean[~stochastic], lo, hi)

    if np.any(stochastic):
        m = mean[stochastic]
        s = std[stochastic]

        z_hi = (hi - m) / s
        z_lo = (lo - m) / s

        cdf_hi = _norm_cdf(z_hi)
        cdf_lo = _norm_cdf(z_lo)
        pdf_hi = _norm_pdf(z_hi)
        pdf_lo = _norm_pdf(z_lo)

        # Partial expectation E[X; X <= t] = m*Phi(z_t) - s*phi(z_t).
        partial_hi = m * cdf_hi - s * pdf_hi
        partial_lo = m * cdf_lo - s * pdf_lo

        # lower*P(X <= lower) + E[X; lower < X <= upper] + upper*P(X > upper)
        result[stochastic] = lo * cdf_lo + (partial_hi - partial_lo) + hi * (1.0 - cdf_hi)

    return result
|
||
|
||
|
||
def _find_col(df: pd.DataFrame, candidates: List[str]) -> str:
|
||
for name in candidates:
|
||
if name in df.columns:
|
||
return name
|
||
lower_map = {c.lower(): c for c in df.columns}
|
||
for name in candidates:
|
||
key = name.lower()
|
||
if key in lower_map:
|
||
return lower_map[key]
|
||
raise ValueError(f"Missing required column. Tried: {candidates}")
|
||
|
||
|
||
def load_sites(path: str) -> pd.DataFrame:
    """Load the 2019 site workbook and normalize it into a tidy frame.

    Returns columns: site_name, mu, sigma, visits_2019, site_idx (1-based),
    and TotalDemand (= mu * visits_2019). Raises ValueError when mu or
    visits_2019 cannot be parsed for any row.
    """
    raw = pd.read_excel(path)

    # Resolve the (possibly case-varying) source column names.
    col_map = {
        "site_name": _find_col(raw, ["Site Name", "site name", "site"]),
        "mu": _find_col(raw, ["Average Demand per Visit", "average demand per visit", "avg demand"]),
        "sigma": _find_col(raw, ["StDev(Demand per Visit)", "stdev(demand per visit)", "stdev", "std"]),
        "visits_2019": _find_col(raw, ["Number of Visits in 2019", "number of visits in 2019", "visits"]),
    }

    sites = raw[list(col_map.values())].copy()
    sites.columns = list(col_map.keys())
    sites["mu"] = pd.to_numeric(sites["mu"], errors="coerce")
    # Missing sigma is interpreted as deterministic demand.
    sites["sigma"] = pd.to_numeric(sites["sigma"], errors="coerce").fillna(0.0)
    sites["visits_2019"] = pd.to_numeric(sites["visits_2019"], errors="coerce")

    bad_rows = sites[["mu", "visits_2019"]].isna().any(axis=1)
    if bad_rows.any():
        raise ValueError(f"Missing mu/visits_2019 for {int(bad_rows.sum())} rows.")

    sites = sites.reset_index(drop=True)
    sites["site_idx"] = np.arange(1, len(sites) + 1, dtype=int)
    sites["TotalDemand"] = sites["mu"] * sites["visits_2019"]
    return sites
|
||
|
||
|
||
def allocate_visits(df: pd.DataFrame, k_min_real: float, n_total: int) -> np.ndarray | None:
    """Distribute *n_total* annual visits across sites with a (fractional) minimum.

    Sites are processed in ascending TotalDemand order. Each gets a base quota
    of floor(k_min_real) or ceil(k_min_real) visits (the ceil share matches the
    fractional part), and the remaining budget is spread proportionally to
    TotalDemand. Returns per-site counts in original site_idx order, or None
    when the base quotas alone exceed *n_total* (infeasible k_min).
    """
    # Sort ascending by demand; drop=False keeps the original index around.
    df_sorted = df.sort_values("TotalDemand").reset_index(drop=False)
    n = len(df_sorted)

    k_floor = int(np.floor(k_min_real))
    k_ceil = int(np.ceil(k_min_real))
    frac = k_min_real - k_floor

    # Number of sites receiving the higher (ceil) base quota.
    n_ceil = int(round(n * frac))
    n_floor = n - n_ceil

    # The first n_floor entries (lowest TotalDemand) get the floor quota.
    k_base = np.array([k_floor] * n_floor + [k_ceil] * n_ceil, dtype=int)

    n_reserved = int(k_base.sum())
    n_free = int(n_total - n_reserved)
    if n_free < 0:
        return None

    # Spread the free budget proportionally to demand, round, and never let a
    # site drop below its base quota.
    weights = df_sorted["TotalDemand"] / df_sorted["TotalDemand"].sum()
    allocated = (k_base + n_free * weights.values).round().astype(int)
    allocated = np.maximum(allocated, k_base)

    # Rounding can leave a small surplus/deficit; nudge one visit at a time.
    # NOTE(review): when diff < 0 the decrements target the smallest-weight
    # sites and may push them below k_base — confirm this is intended.
    diff = int(n_total - allocated.sum())
    if diff != 0:
        sorted_idx = weights.sort_values(ascending=(diff < 0)).index.tolist()
        for idx in sorted_idx[:abs(diff)]:
            allocated[idx] += int(np.sign(diff))

    # Restore original site order via the stable 1-based site_idx.
    alloc_sorted = df_sorted[["site_idx"]].copy()
    alloc_sorted["AllocatedVisits"] = allocated
    alloc = alloc_sorted.sort_values("site_idx")["AllocatedVisits"].to_numpy(dtype=int)
    return alloc
|
||
|
||
|
||
def assign_pairs(pairs_df: pd.DataFrame, visits: np.ndarray) -> Tuple[pd.DataFrame, np.ndarray]:
    """Greedily convert visits into two-stop trips along the given ordered pairs.

    Pairs are consumed in row order; each pair absorbs as many trips as both
    endpoints still have unassigned visits for. Returns a copy of *pairs_df*
    with a new 'pair_count' column, plus the leftover single visits per site.
    """
    leftover = visits.astype(int).copy()
    counts = np.zeros(len(pairs_df), dtype=int)

    for label, row in pairs_df.iterrows():
        first = int(row["site_i_idx"]) - 1  # indices in the CSV are 1-based
        second = int(row["site_j_idx"]) - 1
        take = int(min(leftover[first], leftover[second]))
        if take > 0:
            counts[label] = take
            leftover[first] -= take
            leftover[second] -= take

    annotated = pairs_df.copy()
    annotated["pair_count"] = counts
    return annotated, leftover
|
||
|
||
|
||
def _compute_metrics(
    sites: pd.DataFrame,
    visits: np.ndarray,
    pairs_with_counts: pd.DataFrame,
    singles: np.ndarray,
    *,
    alpha: float,
    beta: float,
    capacity: float,
) -> Dict[str, float]:
    """Score an allocation per site and in aggregate.

    Per-site score = clip(served/demand - alpha*unmet_frac - beta*waste_frac, 0, 1).
    Returns scalar summaries plus the per-site arrays used downstream
    (score_per_site, annual_eff, pair_first, pair_second).
    """
    n_sites = len(sites)
    mu_arr = sites["mu"].to_numpy(dtype=float)
    sigma_arr = sites["sigma"].to_numpy(dtype=float)
    demand_arr = sites["TotalDemand"].to_numpy(dtype=float)

    # Single-stop trips: full capacity at one site, demand clipped to [0, capacity].
    per_visit = expected_clipped_normal(mu_arr, sigma_arr, lower=0.0, upper=capacity)
    served_single = singles * per_visit
    cap_single = singles * capacity

    # Two-stop trips: accumulate counts, served families, and the q_opt /
    # (capacity - q_opt) capacity split onto each pair's first / second stop.
    pair_first = np.zeros(n_sites, dtype=float)
    pair_second = np.zeros(n_sites, dtype=float)
    served_first = np.zeros(n_sites, dtype=float)
    served_second = np.zeros(n_sites, dtype=float)
    cap_first = np.zeros(n_sites, dtype=float)
    cap_second = np.zeros(n_sites, dtype=float)

    for _, pair_row in pairs_with_counts.iterrows():
        n_trips = int(pair_row["pair_count"])
        if n_trips <= 0:
            continue
        first = int(pair_row["site_i_idx"]) - 1
        second = int(pair_row["site_j_idx"]) - 1
        split = float(pair_row["q_opt"])

        pair_first[first] += n_trips
        pair_second[second] += n_trips
        served_first[first] += n_trips * float(pair_row["served_i_mean"])
        served_second[second] += n_trips * float(pair_row["served_j_mean"])
        cap_first[first] += n_trips * split
        cap_second[second] += n_trips * (capacity - split)

    annual_eff = served_single + served_first + served_second
    cap_total = cap_single + cap_first + cap_second

    # Zero-demand / zero-capacity sites contribute 0 to each fraction.
    with np.errstate(divide="ignore", invalid="ignore"):
        base = np.where(demand_arr > 0, annual_eff / demand_arr, 0.0)
        unmet = np.where(demand_arr > 0, np.maximum(0.0, demand_arr - annual_eff) / demand_arr, 0.0)
        waste = np.where(cap_total > 0, np.maximum(0.0, cap_total - annual_eff) / cap_total, 0.0)

    score = np.clip(base - alpha * unmet - beta * waste, 0.0, 1.0)
    bottom_n = max(1, int(np.ceil(n_sites * 0.10)))  # size of the bottom-decile group

    total_served = float(annual_eff.sum())
    total_demand = float(demand_arr.sum())

    return {
        "effectiveness": float(score.mean()),
        "min_eff": float(score.min()),
        "bottom10_eff": float(np.sort(score)[:bottom_n].mean()),
        "gini_eff": float(gini_coefficient(score)),
        "std_eff": float(score.std()),
        "total_unmet": float(np.maximum(0.0, demand_arr - annual_eff).sum()),
        "total_waste": float(np.maximum(0.0, cap_total - annual_eff).sum()),
        "total_served": total_served,
        "total_demand": total_demand,
        "serve_ratio": float(total_served / total_demand) if total_demand > 0 else 0.0,
        "score_per_site": score,
        "annual_eff": annual_eff,
        "pair_first": pair_first,
        "pair_second": pair_second,
    }
|
||
|
||
|
||
def allocate_with_pairs(
    sites: pd.DataFrame,
    pairs_df: pd.DataFrame,
    k_min: float,
    *,
    n_target: int,
    capacity: float,
    max_iter: int = 30,
) -> Tuple[np.ndarray, np.ndarray, pd.DataFrame, int]:
    """Fixed-point search for a visit total whose trip count hits *n_target*.

    A paired trip covers two visits in one trip, so trips = visits - pairs.
    Iterate: allocate `guess` visits, pair them greedily, then set the next
    guess to n_target + (pairs formed); stop when the guess is stable.
    Returns (visits, singles, pairs_with_counts, final_guess), or
    (None, None, None, None) when allocation is infeasible for this k_min.
    """
    guess = int(n_target)

    for _ in range(max_iter):
        visits = allocate_visits(sites, k_min, guess)
        if visits is None:
            return None, None, None, None

        paired, singles = assign_pairs(pairs_df, visits)
        n_paired = int(paired["pair_count"].sum())
        updated = int(n_target + n_paired)
        if updated == guess:
            return visits, singles, paired, guess
        guess = updated

    # Not converged within max_iter: hand back the last iterate.
    return visits, singles, paired, guess
|
||
|
||
|
||
def select_kmin(results: pd.DataFrame) -> float:
    """Pick k_min: the first row with Gini < 0.2, else the row maximizing
    mean effectiveness."""
    equitable = results.loc[results["gini_eff"] < 0.2, "k_min"]
    if not equitable.empty:
        return float(equitable.iloc[0])
    best_row = results["effectiveness"].idxmax()
    return float(results.loc[best_row, "k_min"])
|
||
|
||
|
||
def plot_results(results: pd.DataFrame, output_dir: str) -> float:
    """Render the 4x2 diagnostic panel of metrics vs k_min.

    Saves p3_kmin_effectiveness.png under *output_dir* and returns the
    selected k_min (as chosen by select_kmin). Raises RuntimeError when
    matplotlib is not installed.
    """
    if not _HAS_MPL:
        # Message (Chinese): "Missing dependency: matplotlib (cannot plot).
        # Please install matplotlib before running the plotting part."
        raise RuntimeError("缺少依赖: matplotlib(无法绘图)。请先安装 matplotlib 再运行绘图部分。")

    fig, axes = plt.subplots(4, 2, figsize=(12, 13))

    # Highlight the selected k_min on every panel.
    selected_k = select_kmin(results)
    selected_idx = (results["k_min"] - selected_k).abs().idxmin()
    selected_eff = float(results.loc[selected_idx, "effectiveness"])
    selected_label = f"Selected k_min={selected_k:.1f}"

    # Panel (0,0): mean effectiveness.
    ax = axes[0, 0]
    ax.plot(results["k_min"], results["effectiveness"], "b-", lw=2)
    ax.axvline(selected_k, color="r", ls="--", label=selected_label)
    ax.scatter([selected_k], [selected_eff], c="r", s=100, zorder=5)
    ax.set_xlabel("k_min")
    ax.set_ylabel("Mean Effectiveness")
    ax.set_title("Mean Effectiveness vs k_min")
    ax.legend()
    ax.grid(True, alpha=0.3)

    # Panel (0,1): mean effectiveness of the bottom decile of sites.
    ax = axes[0, 1]
    ax.plot(results["k_min"], results["bottom10_eff"], "m-", lw=2)
    ax.axvline(selected_k, color="r", ls="--")
    ax.set_xlabel("k_min")
    ax.set_ylabel("Bottom 10% Mean Effectiveness")
    ax.set_title("Bottom 10% Mean Effectiveness vs k_min")
    ax.grid(True, alpha=0.3)

    # Panel (1,0): total served families, with total demand as reference line.
    ax = axes[1, 0]
    ax.plot(results["k_min"], results["total_served"] / 1000, "c-", lw=2)
    ax.axhline(results["total_demand"].iloc[0] / 1000, color="gray", ls=":", label="Total Demand")
    ax.axvline(selected_k, color="r", ls="--")
    ax.set_xlabel("k_min")
    ax.set_ylabel("Served Families (×1000)")
    ax.set_title("Total Served vs k_min")
    ax.legend()
    ax.grid(True, alpha=0.3)

    # Panel (1,1): worst-site effectiveness.
    ax = axes[1, 1]
    ax.plot(results["k_min"], results["min_eff"], "g-", lw=2)
    ax.axvline(selected_k, color="r", ls="--")
    ax.set_xlabel("k_min")
    ax.set_ylabel("Min Effectiveness")
    ax.set_title("Worst Site Effectiveness vs k_min")
    ax.grid(True, alpha=0.3)

    # Panel (2,0): unmet demand vs wasted capacity trade-off.
    ax = axes[2, 0]
    ax.plot(results["k_min"], results["unmet"] / 1000, "r-", lw=2, label="Unmet")
    ax.plot(results["k_min"], results["waste"] / 1000, "b-", lw=2, label="Waste")
    ax.axvline(selected_k, color="gray", ls="--")
    ax.set_xlabel("k_min")
    ax.set_ylabel("Families (×1000)")
    ax.set_title("Unmet Demand vs Wasted Capacity")
    ax.legend()
    ax.grid(True, alpha=0.3)

    # Panel (2,1): spread (std) of the per-site scores.
    ax = axes[2, 1]
    ax.plot(results["k_min"], results["std_eff"], color="tab:orange", lw=2)
    ax.axvline(selected_k, color="gray", ls="--")
    ax.set_xlabel("k_min")
    ax.set_ylabel("Std Effectiveness")
    ax.set_title("Effectiveness Std vs k_min")
    ax.grid(True, alpha=0.3)

    # Panel (3,0): Gini, with the 0.2 selection threshold marked.
    ax = axes[3, 0]
    ax.plot(results["k_min"], results["gini_eff"], color="tab:purple", lw=2)
    ax.axhline(0.2, color="gray", ls=":", lw=1)
    ax.axvline(selected_k, color="r", ls="--")
    ax.set_xlabel("k_min")
    ax.set_ylabel("Gini Coefficient")
    ax.set_title("Gini (Effectiveness) vs k_min")
    ax.grid(True, alpha=0.3)

    # Panel (3,1) is intentionally left blank.
    axes[3, 1].axis("off")

    plt.tight_layout()
    os.makedirs(output_dir, exist_ok=True)
    plt.savefig(os.path.join(output_dir, "p3_kmin_effectiveness.png"), dpi=150)
    plt.close(fig)

    return selected_k
|
||
|
||
|
||
def main() -> None:
    """CLI entry point: sweep k_min, score each allocation, save CSVs and plot.

    Pipeline: load sites + ordered pair candidates, sweep k_min over the
    requested range, score each feasible allocation, select the best k_min,
    re-run it, and write per-k, per-site, and per-pair CSVs (plus a PNG when
    matplotlib is available).
    """
    parser = argparse.ArgumentParser(description="Task 3 k-min allocation with two-stop pairing.")
    parser.add_argument("--input-xlsx", default=INPUT_XLSX)
    parser.add_argument("--input-pairs", default=INPUT_PAIRS)
    parser.add_argument("--output-dir", default=OUTPUT_DIR)
    parser.add_argument("--kmin-start", type=float, default=1.0)
    parser.add_argument("--kmin-end", type=float, default=10.0)
    parser.add_argument("--kmin-step", type=float, default=0.1)
    parser.add_argument("--capacity", type=float, default=C_OPT)
    parser.add_argument("--n-target", type=int, default=N_TARGET)
    parser.add_argument("--alpha", type=float, default=ALPHA)
    parser.add_argument("--beta", type=float, default=BETA)
    args = parser.parse_args()

    sites = load_sites(args.input_xlsx)
    pairs = pd.read_csv(args.input_pairs)
    # Validate the pairs CSV schema up front.
    required_cols = {
        "site_i_idx",
        "site_j_idx",
        "score_mean",
        "q_opt",
        "served_i_mean",
        "served_j_mean",
        "distance_miles",
    }
    missing = required_cols.difference(pairs.columns)
    if missing:
        raise ValueError(f"Missing columns in pairs CSV: {sorted(missing)}")

    # Greedy pairing order: best score first, shorter distance breaks ties.
    pairs = pairs.sort_values(
        ["score_mean", "distance_miles"], ascending=[False, True]
    ).reset_index(drop=True)

    # +1e-9 so the endpoint survives float stepping.
    k_range = np.arange(args.kmin_start, args.kmin_end + 1e-9, args.kmin_step)
    results = []

    for k_min in k_range:
        visits, singles, pairs_with_counts, n_total_guess = allocate_with_pairs(
            sites,
            pairs,
            float(k_min),
            n_target=args.n_target,
            capacity=args.capacity,
        )
        if visits is None:
            # Infeasible k_min (minimum quotas exceed the visit budget): skip.
            continue

        metrics = _compute_metrics(
            sites,
            visits,
            pairs_with_counts,
            singles,
            alpha=args.alpha,
            beta=args.beta,
            capacity=args.capacity,
        )

        # Each paired trip covers two visits, so trips = visits - pairs.
        pair_total = int(pairs_with_counts["pair_count"].sum())
        total_trips = int(visits.sum() - pair_total)
        row = {
            "k_min": float(k_min),
            "effectiveness": metrics["effectiveness"],
            "min_eff": metrics["min_eff"],
            "bottom10_eff": metrics["bottom10_eff"],
            "gini_eff": metrics["gini_eff"],
            "std_eff": metrics["std_eff"],
            "unmet": metrics["total_unmet"],
            "waste": metrics["total_waste"],
            "total_served": metrics["total_served"],
            "total_demand": metrics["total_demand"],
            "serve_ratio": metrics["serve_ratio"],
            "total_visits_allocated": int(visits.sum()),
            "pair_trips": pair_total,
            "total_trips": total_trips,
            "n_total_guess": int(n_total_guess),
        }
        results.append(row)

    results_df = pd.DataFrame(results)
    if len(results_df) == 0:
        raise RuntimeError("No feasible k_min values found.")

    best_k = select_kmin(results_df)
    # NOTE(review): best_idx is computed but never used below — confirm removable.
    best_idx = (results_df["k_min"] - best_k).abs().idxmin()

    # Re-run the winning k_min to recover its full allocation detail.
    visits, singles, pairs_with_counts, n_total_guess = allocate_with_pairs(
        sites,
        pairs,
        float(best_k),
        n_target=args.n_target,
        capacity=args.capacity,
    )
    metrics = _compute_metrics(
        sites,
        visits,
        pairs_with_counts,
        singles,
        alpha=args.alpha,
        beta=args.beta,
        capacity=args.capacity,
    )

    pair_total = int(pairs_with_counts["pair_count"].sum())
    total_trips = int(visits.sum() - pair_total)

    # Per-site breakdown of the selected allocation.
    site_rows = pd.DataFrame(
        {
            "site_idx": sites["site_idx"],
            "site_name": sites["site_name"],
            "total_visits_allocated": visits,
            "single_visits": singles,
            "paired_first": metrics["pair_first"].astype(int),
            "paired_second": metrics["pair_second"].astype(int),
            "paired_total": (metrics["pair_first"] + metrics["pair_second"]).astype(int),
        }
    )

    # Only pairs that actually received trips, busiest first.
    pairs_out = pairs_with_counts.loc[pairs_with_counts["pair_count"] > 0].copy()
    pairs_out = pairs_out.sort_values(["pair_count", "score_mean"], ascending=[False, False])

    os.makedirs(args.output_dir, exist_ok=True)
    results_df.to_csv(os.path.join(args.output_dir, "p3_kmin_data.csv"), index=False)
    site_rows.to_csv(os.path.join(args.output_dir, "p3_kmin_sites.csv"), index=False)
    pairs_out.to_csv(os.path.join(args.output_dir, "p3_kmin_pairs.csv"), index=False)

    if _HAS_MPL:
        plot_results(results_df, args.output_dir)
    else:
        # Message (Chinese): "matplotlib not detected; skipping plots
        # (CSV results are still saved)."
        print("未检测到 matplotlib,跳过绘图(仍会保存CSV结果)。")

    print(f"Best k_min={best_k:.1f} (total_trips={total_trips}, pair_trips={pair_total})")
    # NOTE(review): this summary hardcodes the "data/" prefix even when
    # --output-dir points elsewhere — confirm whether it should use args.output_dir.
    print(
        "Saved: data/p3_kmin_data.csv, data/p3_kmin_sites.csv, "
        "data/p3_kmin_pairs.csv, data/p3_kmin_effectiveness.png"
    )
|
||
|
||
|
||
# Script entry point.
if __name__ == "__main__":
    main()
|