P1
This commit is contained in:
58
task1/01_clean.py
Normal file
58
task1/01_clean.py
Normal file
@@ -0,0 +1,58 @@
|
||||
from pathlib import Path

import pandas as pd
|
||||
|
||||
|
||||
# Path of the Excel workbook that main() reads.
INPUT_XLSX = "data.xlsx"
|
||||
# Path of the Excel workbook that main() writes the cleaned table to.
OUTPUT_XLSX = "task1/01_clean.xlsx"
|
||||
# Name of the worksheet inside INPUT_XLSX that main() loads.
SHEET_NAME = "addresses2019 updated"
|
||||
|
||||
|
||||
# Maps each required column of the source sheet to its name in the cleaned
# output.  Keeping the required list and the rename mapping in one place means
# the two can never drift apart.
_COLUMN_MAP = {
    "Site Name": "site_name",
    "latitude": "lat",
    "longitude": "lon",
    "Number of Visits in 2019": "visits_2019",
    "Average Demand per Visit": "mu_clients_per_visit",
    "StDev(Demand per Visit)": "sd_clients_per_visit",
}

# Cleaned columns that must hold a number in every row.
_NUMERIC_COLS = [
    "lat",
    "lon",
    "visits_2019",
    "mu_clients_per_visit",
    "sd_clients_per_visit",
]


def _clean_sites(df_raw: pd.DataFrame) -> pd.DataFrame:
    """Select, rename and validate the site columns of the raw sheet.

    Args:
        df_raw: The sheet as loaded by ``pd.read_excel``.

    Returns:
        A new frame with a 1-based ``site_id`` column followed by the renamed
        columns of ``_COLUMN_MAP``; the numeric columns are coerced to numbers.

    Raises:
        ValueError: If a required column is absent, a site name is missing or
            blank, a numeric cell is missing or not parseable, a mu/sd value
            is negative, or a visit count is not positive.
    """
    missing = [c for c in _COLUMN_MAP if c not in df_raw.columns]
    if missing:
        raise ValueError(f"Missing required columns: {missing}")

    df = df_raw[list(_COLUMN_MAP)].copy()
    df = df.rename(columns=_COLUMN_MAP)
    df.insert(0, "site_id", range(1, len(df) + 1))

    # Unparseable cells become NaN here so the check below reports them
    # together with genuinely empty cells.
    for col in _NUMERIC_COLS:
        df[col] = pd.to_numeric(df[col], errors="coerce")

    # isna() alone lets empty / whitespace-only names through, so treat those
    # as missing as well.
    names = df["site_name"]
    blank_name = names.isna() | names.astype(str).str.strip().eq("")
    if blank_name.any():
        raise ValueError("Found missing site_name values.")

    has_nan = df[_NUMERIC_COLS].isna().any(axis=1)
    if has_nan.any():
        bad = df.loc[has_nan, ["site_id", "site_name", *_NUMERIC_COLS]]
        # to_string() avoids the truncation/wrapping the default DataFrame
        # repr applies, so every offending row is visible in the error.
        raise ValueError(f"Found missing numeric values:\n{bad.to_string()}")

    if (df["mu_clients_per_visit"] < 0).any() or (df["sd_clients_per_visit"] < 0).any():
        raise ValueError("Found negative mu/sd values; expected nonnegative.")
    if (df["visits_2019"] <= 0).any():
        raise ValueError("Found non-positive visits_2019; expected >0 for all 70 regular sites.")

    return df


def main() -> None:
    """Read the raw site sheet, clean and validate it, and write the result.

    Loads sheet ``SHEET_NAME`` from ``INPUT_XLSX``, runs ``_clean_sites`` on
    it, and writes the cleaned table to sheet ``"sites"`` of ``OUTPUT_XLSX``.

    Raises:
        ValueError: Propagated from ``_clean_sites`` when the data is invalid.
    """
    df_raw = pd.read_excel(INPUT_XLSX, sheet_name=SHEET_NAME)
    df = _clean_sites(df_raw)

    # Create the destination folder if needed so the write does not fail on a
    # fresh checkout or an unexpected working directory layout.
    Path(OUTPUT_XLSX).parent.mkdir(parents=True, exist_ok=True)
    with pd.ExcelWriter(OUTPUT_XLSX, engine="openpyxl") as writer:
        df.to_excel(writer, index=False, sheet_name="sites")
|
||||
|
||||
|
||||
# Run the cleaning step only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    main()
|
||||
|
||||
Reference in New Issue
Block a user