"""Extract, rename and validate the site columns of an Excel sheet.

Reads sheet ``SHEET_NAME`` from ``INPUT_XLSX``, keeps only the required
columns under short snake_case names, adds a 1-based ``site_id``, validates
the values and writes the result to sheet ``"sites"`` of ``OUTPUT_XLSX``.
"""

from pathlib import Path

import pandas as pd

INPUT_XLSX = "data.xlsx"
OUTPUT_XLSX = "task1/01_clean.xlsx"
SHEET_NAME = "addresses2019 updated"

# Source column name -> output column name. The key order is also the
# column order of the cleaned frame (after the leading ``site_id``).
COLUMN_RENAMES = {
    "Site Name": "site_name",
    "latitude": "lat",
    "longitude": "lon",
    "Number of Visits in 2019": "visits_2019",
    "Average Demand per Visit": "mu_clients_per_visit",
    "StDev(Demand per Visit)": "sd_clients_per_visit",
}

# Output columns that must hold numeric values.
NUMERIC_COLS = [
    "lat",
    "lon",
    "visits_2019",
    "mu_clients_per_visit",
    "sd_clients_per_visit",
]


def _validate_sites(df: pd.DataFrame) -> None:
    """Raise ``ValueError`` on the first failed sanity check of *df*.

    Checks, in order: no missing ``site_name``; no missing value in any of
    ``NUMERIC_COLS``; no negative mu/sd; strictly positive ``visits_2019``.
    """
    if df["site_name"].isna().any():
        raise ValueError("Found missing site_name values.")

    row_has_gap = df[NUMERIC_COLS].isna().any(axis=1)
    if row_has_gap.any():
        # Show the offending rows so they can be located in the workbook.
        bad = df.loc[row_has_gap, ["site_id", "site_name"] + NUMERIC_COLS]
        raise ValueError(f"Found missing numeric values:\n{bad}")

    if (df["mu_clients_per_visit"] < 0).any() or (df["sd_clients_per_visit"] < 0).any():
        raise ValueError("Found negative mu/sd values; expected nonnegative.")

    if (df["visits_2019"] <= 0).any():
        raise ValueError("Found non-positive visits_2019; expected >0 for all 70 regular sites.")


def clean_sites(df_raw: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned, validated copy of the raw site table.

    The input frame is not modified. The result has the columns
    ``site_id`` (1..n in row order) followed by the renamed columns from
    ``COLUMN_RENAMES``; the columns in ``NUMERIC_COLS`` are converted with
    ``pd.to_numeric(errors="coerce")``, so unparseable cells become NaN and
    are then reported as missing numeric values.

    Raises:
        ValueError: if a required column is absent, or if any validation
            check fails.
    """
    required = list(COLUMN_RENAMES)
    missing = [c for c in required if c not in df_raw.columns]
    if missing:
        raise ValueError(f"Missing required columns: {missing}")

    df = df_raw[required].copy().rename(columns=COLUMN_RENAMES)
    df.insert(0, "site_id", range(1, len(df) + 1))

    for col in NUMERIC_COLS:
        df[col] = pd.to_numeric(df[col], errors="coerce")

    _validate_sites(df)
    return df


def main() -> None:
    """Read the input workbook, clean it and write the output workbook."""
    df_raw = pd.read_excel(INPUT_XLSX, sheet_name=SHEET_NAME)
    df = clean_sites(df_raw)

    # ExcelWriter does not create missing directories; make sure the
    # output folder exists so a fresh checkout does not fail at the end.
    Path(OUTPUT_XLSX).parent.mkdir(parents=True, exist_ok=True)

    with pd.ExcelWriter(OUTPUT_XLSX, engine="openpyxl") as writer:
        df.to_excel(writer, index=False, sheet_name="sites")


if __name__ == "__main__":
    main()