commit 44b5b3182586a5f25814838cdc6d6744ae9addff Author: ntnt Date: Mon Jan 19 01:40:19 2026 +0800 P1 diff --git a/task1/01_clean.py b/task1/01_clean.py new file mode 100644 index 0000000..6d63e74 --- /dev/null +++ b/task1/01_clean.py @@ -0,0 +1,58 @@ +import pandas as pd + + +INPUT_XLSX = "data.xlsx" +OUTPUT_XLSX = "task1/01_clean.xlsx" +SHEET_NAME = "addresses2019 updated" + + +def main() -> None: + df_raw = pd.read_excel(INPUT_XLSX, sheet_name=SHEET_NAME) + + required = [ + "Site Name", + "latitude", + "longitude", + "Number of Visits in 2019", + "Average Demand per Visit", + "StDev(Demand per Visit)", + ] + missing = [c for c in required if c not in df_raw.columns] + if missing: + raise ValueError(f"Missing required columns: {missing}") + + df = df_raw[required].copy() + df = df.rename( + columns={ + "Site Name": "site_name", + "latitude": "lat", + "longitude": "lon", + "Number of Visits in 2019": "visits_2019", + "Average Demand per Visit": "mu_clients_per_visit", + "StDev(Demand per Visit)": "sd_clients_per_visit", + } + ) + + df.insert(0, "site_id", range(1, len(df) + 1)) + + numeric_cols = ["lat", "lon", "visits_2019", "mu_clients_per_visit", "sd_clients_per_visit"] + for col in numeric_cols: + df[col] = pd.to_numeric(df[col], errors="coerce") + + if df["site_name"].isna().any(): + raise ValueError("Found missing site_name values.") + if df[numeric_cols].isna().any().any(): + bad = df[df[numeric_cols].isna().any(axis=1)][["site_id", "site_name"] + numeric_cols] + raise ValueError(f"Found missing numeric values:\n{bad}") + if (df["mu_clients_per_visit"] < 0).any() or (df["sd_clients_per_visit"] < 0).any(): + raise ValueError("Found negative mu/sd values; expected nonnegative.") + if (df["visits_2019"] <= 0).any(): + raise ValueError("Found non-positive visits_2019; expected >0 for all 70 regular sites.") + + with pd.ExcelWriter(OUTPUT_XLSX, engine="openpyxl") as writer: + df.to_excel(writer, index=False, sheet_name="sites") + + +if __name__ == "__main__": + 
main() + diff --git a/task1/01_clean.xlsx b/task1/01_clean.xlsx new file mode 100644 index 0000000..accd722 Binary files /dev/null and b/task1/01_clean.xlsx differ diff --git a/task1/02_neighbor.xlsx b/task1/02_neighbor.xlsx new file mode 100644 index 0000000..126638e Binary files /dev/null and b/task1/02_neighbor.xlsx differ diff --git a/task1/02_neighbor_demand.py b/task1/02_neighbor_demand.py new file mode 100644 index 0000000..f3af973 --- /dev/null +++ b/task1/02_neighbor_demand.py @@ -0,0 +1,59 @@ +from __future__ import annotations + +import math + +import numpy as np +import pandas as pd + + +INPUT_XLSX = "task1/01_clean.xlsx" +OUTPUT_XLSX = "task1/02_neighbor.xlsx" + +RHO_MILES_LIST = [10.0, 20.0, 30.0] + + +def haversine_miles(lat1: float, lon1: float, lat2: float, lon2: float) -> float: + r_miles = 3958.7613 + phi1 = math.radians(lat1) + phi2 = math.radians(lat2) + dphi = math.radians(lat2 - lat1) + dlambda = math.radians(lon2 - lon1) + + a = math.sin(dphi / 2.0) ** 2 + math.cos(phi1) * math.cos(phi2) * math.sin(dlambda / 2.0) ** 2 + c = 2.0 * math.atan2(math.sqrt(a), math.sqrt(1.0 - a)) + return r_miles * c + + +def main() -> None: + sites = pd.read_excel(INPUT_XLSX, sheet_name="sites") + + lat = sites["lat"].to_numpy(float) + lon = sites["lon"].to_numpy(float) + mu = sites["mu_clients_per_visit"].to_numpy(float) + + n = len(sites) + dist = np.zeros((n, n), dtype=float) + for i in range(n): + for j in range(i + 1, n): + d = haversine_miles(lat[i], lon[i], lat[j], lon[j]) + dist[i, j] = d + dist[j, i] = d + + out = sites.copy() + out["neighbor_demand_mu_self"] = mu.copy() + for rho in RHO_MILES_LIST: + w = np.exp(-(dist**2) / (2.0 * rho * rho)) + # D_i(rho) = sum_j mu_j * exp(-dist(i,j)^2 / (2 rho^2)) + out[f"neighbor_demand_mu_rho_{int(rho)}mi"] = w @ mu + + dist_df = pd.DataFrame(dist, columns=sites["site_id"].tolist()) + dist_df.insert(0, "site_id", sites["site_id"]) + + with pd.ExcelWriter(OUTPUT_XLSX, engine="openpyxl") as writer: + 
out.to_excel(writer, index=False, sheet_name="sites") + dist_df.to_excel(writer, index=False, sheet_name="dist_miles") + + +if __name__ == "__main__": + main() + diff --git a/task1/03_allocate.xlsx b/task1/03_allocate.xlsx new file mode 100644 index 0000000..e586260 Binary files /dev/null and b/task1/03_allocate.xlsx differ diff --git a/task1/03_allocate_k.py b/task1/03_allocate_k.py new file mode 100644 index 0000000..5ea797d --- /dev/null +++ b/task1/03_allocate_k.py @@ -0,0 +1,301 @@ +from __future__ import annotations + +import math +from dataclasses import dataclass + +import numpy as np +import pandas as pd + + +INPUT_XLSX = "task1/02_neighbor.xlsx" +OUTPUT_XLSX = "task1/03_allocate.xlsx" + +N_TOTAL_2021 = 730 # scenario B: 2 sites/day * 365 days +K_MIN = 1 + +RHO_COLS = [ + ("self", "neighbor_demand_mu_self"), + ("rho10", "neighbor_demand_mu_rho_10mi"), + ("rho20", "neighbor_demand_mu_rho_20mi"), + ("rho30", "neighbor_demand_mu_rho_30mi"), +] + + +def gini(x: np.ndarray) -> float: + x = np.asarray(x, dtype=float) + if np.any(x < 0): + raise ValueError("Gini expects nonnegative values.") + if np.allclose(x.sum(), 0.0): + return 0.0 + n = len(x) + diff_sum = np.abs(x.reshape(-1, 1) - x.reshape(1, -1)).sum() + return diff_sum / (2.0 * n * n * x.mean()) + + +def largest_remainder_allocation(weights: np.ndarray, total: int, k_min: int) -> np.ndarray: + if total < k_min * len(weights): + raise ValueError("total too small for k_min constraint.") + w = np.asarray(weights, dtype=float) + if np.any(w < 0): + raise ValueError("weights must be nonnegative.") + if np.allclose(w.sum(), 0.0): + # Purely equal split + base = np.full(len(w), total // len(w), dtype=int) + base[: total - base.sum()] += 1 + return np.maximum(base, k_min) + + remaining = total - k_min * len(w) + w_norm = w / w.sum() + raw = remaining * w_norm + flo = np.floor(raw).astype(int) + k = flo + k_min + + need = total - k.sum() + if need > 0: + frac = raw - flo + order = np.argsort(-frac, kind="stable") 
+ k[order[:need]] += 1 + elif need < 0: + frac = raw - flo + order = np.argsort(frac, kind="stable") + to_remove = -need + for idx in order: + if to_remove == 0: + break + if k[idx] > k_min: + k[idx] -= 1 + to_remove -= 1 + if k.sum() != total: + raise RuntimeError("Failed to adjust allocation to exact total.") + + if k.sum() != total: + raise RuntimeError("Allocation did not match total.") + if (k < k_min).any(): + raise RuntimeError("Allocation violated k_min.") + return k + + +def feasibility_sum_for_t(t: float, d: np.ndarray, mu: np.ndarray, k_min: int) -> int: + # Constraints: k_i >= max(k_min, ceil(t * D_i / mu_i)) + if t < 0: + return math.inf + req = np.ceil((t * d) / mu).astype(int) + req = np.maximum(req, k_min) + return int(req.sum()) + + +def max_min_service_level(d: np.ndarray, mu: np.ndarray, n_total: int, k_min: int) -> tuple[float, np.ndarray]: + if (mu <= 0).any(): + raise ValueError("mu must be positive to define service level.") + if (d <= 0).any(): + raise ValueError("neighbor demand proxy D must be positive.") + + # Upper bound for t: if all visits assigned to best site, service level there is (n_total*mu_i/d_i). + # For max-min, t cannot exceed min_i (n_total*mu_i/d_i) but k_min can also bind; use conservative. 
+ t_hi = float(np.min((n_total * mu) / d)) + t_lo = 0.0 + + # Binary search for max feasible t + for _ in range(60): + t_mid = (t_lo + t_hi) / 2.0 + s = feasibility_sum_for_t(t_mid, d=d, mu=mu, k_min=k_min) + if s <= n_total: + t_lo = t_mid + else: + t_hi = t_mid + + t_star = t_lo + k_base = np.ceil((t_star * d) / mu).astype(int) + k_base = np.maximum(k_base, k_min) + + if k_base.sum() > n_total: + # Numerical edge case: back off to ensure feasibility + t_star *= 0.999 + k_base = np.ceil((t_star * d) / mu).astype(int) + k_base = np.maximum(k_base, k_min) + if k_base.sum() > n_total: + raise RuntimeError("Failed to construct feasible k at t*.") + + return t_star, k_base + + +def distribute_remaining_fair(k: np.ndarray, mu: np.ndarray, d: np.ndarray, remaining: int) -> np.ndarray: + # Greedy: repeatedly allocate to smallest current s_i = k_i*mu_i/d_i + k_out = k.copy() + for _ in range(remaining): + s = (k_out * mu) / d + idx = int(np.argmin(s)) + k_out[idx] += 1 + return k_out + + +def distribute_remaining_efficient(k: np.ndarray, mu: np.ndarray, remaining: int) -> np.ndarray: + # Greedy: allocate to highest mu_i (maximizes sum k_i*mu_i) + k_out = k.copy() + order = np.argsort(-mu, kind="stable") + for t in range(remaining): + k_out[order[t % len(order)]] += 1 + return k_out + + +@dataclass(frozen=True) +class AllocationResult: + method: str + scenario: str + k: np.ndarray + t_star: float | None + + +def compute_metrics(k: np.ndarray, mu: np.ndarray, d: np.ndarray) -> dict[str, float]: + s = (k * mu) / d + return { + "k_sum": float(k.sum()), + "total_expected_clients": float(np.dot(k, mu)), + "service_level_min": float(s.min()), + "service_level_mean": float(s.mean()), + "service_level_gini": float(gini(s)), + "service_level_cv": float(s.std(ddof=0) / (s.mean() + 1e-12)), + "k_gini": float(gini(k.astype(float))), + } + + +def improve_efficiency_under_gini_cap( + *, + k_start: np.ndarray, + mu: np.ndarray, + d: np.ndarray, + n_total: int, + k_min: int, + gini_cap: 
float, + max_iters: int = 20000, +) -> np.ndarray: + if k_start.sum() != n_total: + raise ValueError("k_start must sum to n_total.") + if (k_start < k_min).any(): + raise ValueError("k_start violates k_min.") + + k = k_start.copy() + s = (k * mu) / d + g = gini(s) + if g <= gini_cap: + return k + + for _ in range(max_iters): + s = (k * mu) / d + g = gini(s) + if g <= gini_cap: + break + + donor_candidates = np.argsort(-s, kind="stable")[:10] + receiver_candidates = np.argsort(s, kind="stable")[:10] + + best_move = None + best_score = None + + for donor in donor_candidates: + if k[donor] <= k_min: + continue + for receiver in receiver_candidates: + if donor == receiver: + continue + + k2 = k.copy() + k2[donor] -= 1 + k2[receiver] += 1 + s2 = (k2 * mu) / d + g2 = gini(s2) + if g2 >= g: + continue + + eff_loss = mu[donor] - mu[receiver] + if eff_loss < 0: + # This move increases effectiveness and improves fairness; prefer strongly. + score = (g - g2) * 1e9 + (-eff_loss) + else: + score = (g - g2) / (eff_loss + 1e-9) + + if best_score is None or score > best_score: + best_score = score + best_move = (donor, receiver, k2) + + if best_move is None: + # No improving local swap found; stop. 
+ break + k = best_move[2] + + if k.sum() != n_total or (k < k_min).any(): + raise RuntimeError("Post-optimization allocation violated constraints.") + return k + + +def main() -> None: + sites = pd.read_excel(INPUT_XLSX, sheet_name="sites") + mu = sites["mu_clients_per_visit"].to_numpy(float) + visits_2019 = sites["visits_2019"].to_numpy(int) + + results: list[AllocationResult] = [] + metrics_rows: list[dict[str, object]] = [] + + for scenario, dcol in RHO_COLS: + d = sites[dcol].to_numpy(float) + + # Baseline: scaled 2019 to sum to N_TOTAL_2021 + k_2019_scaled = largest_remainder_allocation( + weights=visits_2019.astype(float), total=N_TOTAL_2021, k_min=K_MIN + ) + results.append(AllocationResult(method="baseline_2019_scaled", scenario=scenario, k=k_2019_scaled, t_star=None)) + + # Max-min baseline k achieving best possible min service level under sum constraint. + t_star, k_base = max_min_service_level(d=d, mu=mu, n_total=N_TOTAL_2021, k_min=K_MIN) + remaining = N_TOTAL_2021 - int(k_base.sum()) + + k_fair = distribute_remaining_fair(k_base, mu=mu, d=d, remaining=remaining) + results.append(AllocationResult(method="fairness_waterfill", scenario=scenario, k=k_fair, t_star=t_star)) + + k_eff = distribute_remaining_efficient(k_base, mu=mu, remaining=remaining) + results.append(AllocationResult(method="efficiency_post_fair", scenario=scenario, k=k_eff, t_star=t_star)) + + # Proportional to surrounding demand proxy D (uses "surrounding communities demand" directly) + k_D = largest_remainder_allocation(weights=d, total=N_TOTAL_2021, k_min=K_MIN) + results.append(AllocationResult(method="proportional_D", scenario=scenario, k=k_D, t_star=None)) + + # Simple proportional fairness: k proportional to D/mu gives near-constant s in continuous relaxation. 
+ k_prop = largest_remainder_allocation(weights=d / mu, total=N_TOTAL_2021, k_min=K_MIN) + results.append(AllocationResult(method="proportional_D_over_mu", scenario=scenario, k=k_prop, t_star=None)) + + # Pure efficiency: k proportional to mu (with k_min) + k_mu = largest_remainder_allocation(weights=mu, total=N_TOTAL_2021, k_min=K_MIN) + results.append(AllocationResult(method="proportional_mu", scenario=scenario, k=k_mu, t_star=None)) + + # Efficiency-maximizing subject to "no worse fairness than baseline_2019_scaled" (auditable cap) + gini_cap = compute_metrics(k_2019_scaled, mu=mu, d=d)["service_level_gini"] + k_mu_capped = improve_efficiency_under_gini_cap( + k_start=k_mu, mu=mu, d=d, n_total=N_TOTAL_2021, k_min=K_MIN, gini_cap=float(gini_cap) + ) + results.append(AllocationResult(method="proportional_mu_gini_capped_to_2019", scenario=scenario, k=k_mu_capped, t_star=None)) + + # Assemble per-site output + out_sites = sites[["site_id", "site_name", "lat", "lon", "visits_2019", "mu_clients_per_visit", "sd_clients_per_visit"]].copy() + + for res in results: + col_k = f"k_{res.method}_{res.scenario}" + out_sites[col_k] = res.k + + d = sites[[c for s, c in RHO_COLS if s == res.scenario][0]].to_numpy(float) + s_vals = (res.k * mu) / d + out_sites[f"s_{res.method}_{res.scenario}"] = s_vals + + m = compute_metrics(res.k, mu=mu, d=d) + row = {"scenario": res.scenario, "method": res.method, "t_star": res.t_star} + row.update(m) + metrics_rows.append(row) + + metrics_df = pd.DataFrame(metrics_rows).sort_values(["scenario", "method"]).reset_index(drop=True) + + with pd.ExcelWriter(OUTPUT_XLSX, engine="openpyxl") as writer: + out_sites.to_excel(writer, index=False, sheet_name="allocations") + metrics_df.to_excel(writer, index=False, sheet_name="metrics") + + +if __name__ == "__main__": + main() diff --git a/task1/04_schedule.xlsx b/task1/04_schedule.xlsx new file mode 100644 index 0000000..66dcb7d Binary files /dev/null and b/task1/04_schedule.xlsx differ diff --git 
# a/task1/04_schedule_2021.py b/task1/04_schedule_2021.py (new file; the
# "diff --git" header of the mangled dump is split across lines here)
# ======================= task1/04_schedule_2021.py =======================
# Reconstructed from the newline-mangled `git show` dump; the original
# `from __future__ import annotations` is omitted because the annotation
# syntax used is native on Python 3.10+.
import bisect
import datetime as dt
from dataclasses import dataclass

import numpy as np
import pandas as pd


ALLOC_XLSX = "task1/03_allocate.xlsx"
OUTPUT_XLSX = "task1/04_schedule.xlsx"

YEAR = 2021
DAYS = 365
SLOTS_PER_DAY = 2  # scenario B: 2 trucks, 2 distinct sites/day

# Default recommendation
DEFAULT_SCENARIO = "rho20"
DEFAULT_METHOD = "proportional_D"


@dataclass(frozen=True)
class Event:
    # One planned visit of a truck to a site.
    site_id: int
    site_name: str
    target_day: int  # ideal day of year, 1..365


def build_targets(site_id: int, site_name: str, k: int) -> list[Event]:
    """Spread k visits evenly over the year.

    Visit j (0-based) targets day round((j+0.5)*365/k), clipped to 1..365.
    Returns an empty list for k <= 0.
    """
    if k <= 0:
        return []
    targets: list[Event] = []
    for j in range(k):
        # Even spacing: place j-th visit at (j+0.5)*DAYS/k
        t = int(round((j + 0.5) * DAYS / k))
        t = max(1, min(DAYS, t))
        targets.append(Event(site_id=site_id, site_name=site_name, target_day=t))
    return targets


def assign_events_to_days(events: list[Event]) -> dict[int, list[Event]]:
    """Pack events into days: exactly SLOTS_PER_DAY per day, distinct sites per day.

    Strategy: bin by rounded target day; spill capacity/duplicate overflow;
    then re-place each overflow event into the underfull day closest to its
    target that does not already host the same site.

    Raises:
        RuntimeError: if the events cannot be packed under these constraints.
    """
    # Initial binning by rounded target day
    day_to_events: dict[int, list[Event]] = {d: [] for d in range(1, DAYS + 1)}
    overflow: list[Event] = []

    for ev in sorted(events, key=lambda e: (e.target_day, e.site_id)):
        day_to_events[ev.target_day].append(ev)

    # Enforce per-day capacity and per-day unique site_id
    for day in range(1, DAYS + 1):
        bucket = day_to_events[day]
        if not bucket:
            continue
        seen: set[int] = set()
        kept: list[Event] = []
        for ev in bucket:
            if ev.site_id in seen:
                overflow.append(ev)
            else:
                seen.add(ev.site_id)
                kept.append(ev)
        # If still over capacity, keep earliest (already sorted) and overflow rest
        day_to_events[day] = kept[:SLOTS_PER_DAY]
        overflow.extend(kept[SLOTS_PER_DAY:])

    # One entry per free slot, sorted so bisect can find the nearest day.
    underfull_days: list[int] = []
    for day in range(1, DAYS + 1):
        cap = SLOTS_PER_DAY - len(day_to_events[day])
        underfull_days.extend([day] * cap)
    underfull_days.sort()

    def day_has_site(day: int, site_id: int) -> bool:
        return any(ev.site_id == site_id for ev in day_to_events[day])

    # Fill underfull days with the assignment closest to each target_day.
    for ev in sorted(overflow, key=lambda e: (e.target_day, e.site_id)):
        if not underfull_days:
            raise RuntimeError("No remaining capacity but overflow events remain.")
        pos = bisect.bisect_left(underfull_days, ev.target_day)
        candidate_positions = []
        for delta in range(0, len(underfull_days)):
            # Check outward from the insertion point.
            # FIX: at delta == 0, pos-delta and pos+delta are the same index;
            # the original appended it twice.  Dedupe the ring.
            ring = (pos,) if delta == 0 else (pos - delta, pos + delta)
            for p in ring:
                if 0 <= p < len(underfull_days):
                    candidate_positions.append(p)
            if candidate_positions:
                # We gathered some; break after first ring to keep cost small
                break

        assigned_idx = None
        for p in candidate_positions:
            day = underfull_days[p]
            if not day_has_site(day, ev.site_id):
                assigned_idx = p
                break

        if assigned_idx is None:
            # Fallback: scan until we find any feasible slot
            for p, day in enumerate(underfull_days):
                if not day_has_site(day, ev.site_id):
                    assigned_idx = p
                    break

        if assigned_idx is None:
            raise RuntimeError(f"Unable to place event for site_id={ev.site_id}; per-day uniqueness too strict.")

        day = underfull_days.pop(assigned_idx)
        day_to_events[day].append(ev)

    # Final sanity: every day filled, and no day has duplicate site_id
    for day in range(1, DAYS + 1):
        if len(day_to_events[day]) != SLOTS_PER_DAY:
            raise RuntimeError(f"Day {day} not filled: {len(day_to_events[day])} events.")
        ids = [e.site_id for e in day_to_events[day]]
        if len(set(ids)) != len(ids):
            raise RuntimeError(f"Day {day} has duplicate site assignments.")
    return day_to_events


def main() -> None:
    """Turn the chosen allocation column into a publishable 2021 calendar.

    Writes ``meta``, ``calendar`` (2 sites per day), ``site_dates`` and
    ``gap_metrics`` (per-site visit-gap statistics) sheets.
    """
    alloc = pd.read_excel(ALLOC_XLSX, sheet_name="allocations")

    k_col = f"k_{DEFAULT_METHOD}_{DEFAULT_SCENARIO}"
    if k_col not in alloc.columns:
        raise ValueError(f"Allocation column not found: {k_col}")

    alloc = alloc[["site_id", "site_name", k_col]].copy()
    alloc = alloc.rename(columns={k_col: "k_2021"})
    alloc["k_2021"] = pd.to_numeric(alloc["k_2021"], errors="raise").astype(int)

    if int(alloc["k_2021"].sum()) != DAYS * SLOTS_PER_DAY:
        raise ValueError("k_2021 does not match total required slots.")
    if (alloc["k_2021"] < 1).any():
        raise ValueError("k_2021 violates coverage constraint k_i >= 1.")

    events: list[Event] = []
    for row in alloc.itertuples(index=False):
        events.extend(build_targets(site_id=int(row.site_id), site_name=str(row.site_name), k=int(row.k_2021)))

    if len(events) != DAYS * SLOTS_PER_DAY:
        raise RuntimeError("Generated events mismatch total required slots.")

    day_to_events = assign_events_to_days(events)

    start = dt.date(YEAR, 1, 1)
    calendar_rows: list[dict[str, object]] = []
    per_site_rows: list[dict[str, object]] = []

    for day in range(1, DAYS + 1):
        date = start + dt.timedelta(days=day - 1)
        evs = sorted(day_to_events[day], key=lambda e: e.site_id)
        calendar_rows.append(
            {
                "date": date.isoformat(),
                "day_of_year": day,
                "site1_id": evs[0].site_id,
                "site1_name": evs[0].site_name,
                "site2_id": evs[1].site_id,
                "site2_name": evs[1].site_name,
            }
        )
        for slot, ev in enumerate(evs, start=1):
            per_site_rows.append(
                {
                    "site_id": ev.site_id,
                    "site_name": ev.site_name,
                    "date": date.isoformat(),
                    "day_of_year": day,
                    "slot": slot,
                    "target_day": ev.target_day,
                }
            )

    calendar_df = pd.DataFrame(calendar_rows)
    site_dates_df = pd.DataFrame(per_site_rows).sort_values(["site_id", "day_of_year"]).reset_index(drop=True)

    # Schedule quality metrics: gaps between visits for each site
    gap_rows: list[dict[str, object]] = []
    for site_id, group in site_dates_df.groupby("site_id"):
        days = group["day_of_year"].to_numpy(int)
        gaps = np.diff(days)
        if len(gaps) == 0:
            gap_rows.append({"site_id": int(site_id), "k": 1, "gap_max": None, "gap_mean": None, "gap_std": None})
        else:
            gap_rows.append(
                {
                    "site_id": int(site_id),
                    "k": int(len(days)),
                    "gap_max": int(gaps.max()),
                    "gap_mean": float(gaps.mean()),
                    "gap_std": float(gaps.std(ddof=0)),
                }
            )
    gap_df = pd.DataFrame(gap_rows).merge(alloc[["site_id", "site_name"]], on="site_id", how="left")

    meta_df = pd.DataFrame(
        [
            {"key": "year", "value": YEAR},
            {"key": "days", "value": DAYS},
            {"key": "slots_per_day", "value": SLOTS_PER_DAY},
            {"key": "total_visits", "value": int(DAYS * SLOTS_PER_DAY)},
            {"key": "allocation_scenario", "value": DEFAULT_SCENARIO},
            {"key": "allocation_method", "value": DEFAULT_METHOD},
            {"key": "k_column", "value": k_col},
        ]
    )

    with pd.ExcelWriter(OUTPUT_XLSX, engine="openpyxl") as writer:
        meta_df.to_excel(writer, index=False, sheet_name="meta")
        calendar_df.to_excel(writer, index=False, sheet_name="calendar")
        site_dates_df.to_excel(writer, index=False, sheet_name="site_dates")
        gap_df.to_excel(writer, index=False, sheet_name="gap_metrics")


if __name__ == "__main__":
    main()


# =========================================================================
# task1/README.md (new file in the same commit) — content preserved
# verbatim below; kept as a module-level string so this reconstructed span
# remains valid Python.
# =========================================================================
TASK1_README_MD = """
# Task 1(2021 年 MFP 计划)——可审计建模与排程(论文式说明)

## 摘要
本文给出一套在“仅有站点经纬度、2019 年访问次数、单次到访人数均值/标准差”的数据条件下,仍然**闭环、可复现、可审计**的 2021 年 MFP 访问频次与全年日历排程方案。方案遵循三层结构:
(1) 用空间核平滑把“周边社区总需求”落地为站点邻域需求代理;
(2) 在给定全年总访问次数约束下分配每个站点年度访问次数;
(3) 将年度次数转换为 2021 年逐日(每天 2 个站点)可发布的排程日历。
本文严格采用题面确认的运营情景:**每天 2 个站点、全年 365 天运营、总访问次数 `N_total=730`、必须覆盖全部 70 个站点、且不建单次容量上限**。

## 1. 问题与数据
### 1.1 输入数据(`data.xlsx`)
对每个站点 `i=1..70`,数据给出:
- 位置:`(lat_i, lon_i)`
- 2019 年访问次数:`v_i^{2019}`
- 单次到访人数统计量:均值 `μ_i`、标准差 `σ_i`(单位为 “clients per visit”,按题面口径理解)

### 1.2 决策变量与约束
我们需要为 2021 年决策每个站点年度访问次数 `k_i`,并生成逐日排程。
- 覆盖约束(题面/用户确认):`k_i >= 1`
- 总资源约束(情景 B):`Σ_i k_i = N_total = 730`(= 365 天 × 2 站点/天)

## 2. “周边社区总需求”的可审计代理
题面要求“频次由周边社区总需求指导”,但数据没有社区人口/贫困率等外部字段,因此我们将其定义为**可审计的空间平滑代理**。

### 2.1 距离
使用 haversine 距离 `dist(i,j)`(英里)。

### 2.2 高斯核平滑(核心)
给定尺度参数 `ρ`(英里),定义站点 i 的邻域需求代理:

`D_i(ρ) = Σ_j μ_j · exp( - dist(i,j)^2 / (2ρ^2) )`

含义:越近的站点对“周边需求”贡献越大,且贡献按距离平滑衰减;`ρ` 越大表示“更大范围的周边”。

### 2.3 敏感性分析
本文默认同时计算 `ρ ∈ {10, 20, 30}` miles 三个情景,用于审计“周边”尺度选择对结果的影响。

## 3. 年度频次分配:有效性与公平性
### 3.1 有效性(Effectiveness)
用户确认“不建单次容量上限”,因此总体服务量(期望)可用下式作为有效性代理:

`Eff = Σ_i k_i · μ_i`

该指标等价于:假设单次服务量随到访人数线性增长,且不考虑单次运力/食品上限截断。

### 3.2 公平性(Fairness):服务水平而非次数
题面“served much better”更自然地对应“服务水平/满足率”而非“访问次数相等”。
我们定义站点 i 的服务水平(对邻域需求的相对供给)为:

`S_i(ρ) = (k_i · μ_i) / D_i(ρ)`

然后用两类审计指标衡量不均等:
- `min_i S_i(ρ)`:最弱站点的服务水平(max-min 视角)
- `Gini({S_i(ρ)})`:服务水平分布的不均等程度(标准 Gini 公式)

### 3.3 分配规则(主推荐:按周边需求比例分配)
在覆盖约束 `k_i>=1` 下,我们采用**按周边需求代理 `D_i(ρ)` 比例分配剩余次数**:
1) 先给每站点 1 次:`k_i := 1`
2) 将剩余 `R = N_total - 70` 次按权重 `w_i = D_i(ρ)` 做整数分配(largest remainder / Hamilton 方法):
   - 连续目标:`k_i ≈ 1 + R · w_i/Σw`
   - 再通过取整与余数分配保证 `Σk_i=N_total`

该规则的解释性很强:**周边需求越大,年度访问越多**,且覆盖约束保证所有站点至少一次。

### 3.4 基线与对比
为可审计地量化改进,输出中包含“2019 访问次数按比例缩放到 730 次”的基线(`baseline_2019_scaled`),并在同一套指标下对比:
- 总有效性 `Σ k_i μ_i`
- 公平性 `Gini(S)`、`min S`

## 4. 排程层(何时去):将 `k_i` 变成 2021 日历
目标:把每站点年度次数转成可发布的具体日期,同时保证每天正好 2 个不同站点。

### 4.1 均匀间隔的目标日期
对站点 i 的第 `j` 次访问(`j=0..k_i-1`),设理想目标日(1..365)为:

`t_{i,j} = round( (j+0.5) · 365 / k_i )`

直观含义:尽量均匀地把 `k_i` 次撒在全年。

### 4.2 装箱与修复
先按 `t_{i,j}` 把访问事件放入对应日期桶;若某天超过容量(2 个站点)或出现同一站点重复,则将溢出事件移动到最接近的仍有空位且不重复的日期,直至:
- 每天正好 2 个站点
- 每天两站点不同
- 总计 730 个事件全部入日历

输出同时给出每站点相邻访问间隔的统计(最大/均值/标准差),用于审计“服务连续性”。

## 5. 可复现流水线(脚本 + xlsx 传输)
按步骤运行(从项目根目录):
1) `python task1/01_clean.py` → `task1/01_clean.xlsx`(标准化字段、补 `site_id`)
2) `python task1/02_neighbor_demand.py` → `task1/02_neighbor.xlsx`(`D_i(ρ)` 与距离矩阵)
3) `python task1/03_allocate_k.py` → `task1/03_allocate.xlsx`(多种分配方法 + 指标对比)
4) `python task1/04_schedule_2021.py` → `task1/04_schedule.xlsx`(2021 日历排程;默认 `ρ=20mi` + `proportional_D`)

### 5.1 关键输出表
- `task1/03_allocate.xlsx`:
  - `allocations`:每站点的 `k_i` 以及对应 `S_i(ρ)`(按不同 `ρ`/方法)
  - `metrics`:每种方法/情景的有效性与公平性汇总
- `task1/04_schedule.xlsx`:
  - `meta`:排程采用的 `ρ` 与方法列名
  - `calendar`:每天两个站点(可直接发布的日历)
  - `site_dates`:每站点的具体日期列表
  - `gap_metrics`:每站点访问间隔统计(连续性审计)

## 6. 假设与局限(必须在正文显式声明)
- 由于用户确认“不建单次容量上限”,本文未建模单次运力/食品约束,也无法估计缺供概率;有效性以 `Σ k_i μ_i` 作为线性代理。
- `μ_i, σ_i` 被视为“真实到访需求”的代理统计量;若历史存在供给截断,则高需求站点可能被系统性低估(需在后续任务中引入容量或外部数据修正)。
- “周边需求”完全由站点间空间平滑构造;`ρ` 的选择需要与 FBST 对“可接受出行半径”的运营经验校准,因此本文提供 `ρ∈{10,20,30}` 的敏感性结果便于审计与讨论。
"""