modify: cleanup project structure and docs
This commit is contained in:
139
services/forecast.py
Normal file
139
services/forecast.py
Normal file
@@ -0,0 +1,139 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import warnings
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import streamlit as st
|
||||
import statsmodels.api as sm
|
||||
from statsmodels.tsa.arima.model import ARIMA
|
||||
from statsmodels.tools.sm_exceptions import ValueWarning
|
||||
from sklearn.neighbors import KNeighborsRegressor
|
||||
from sklearn.svm import SVR
|
||||
from sklearn.pipeline import make_pipeline
|
||||
from sklearn.preprocessing import StandardScaler
|
||||
|
||||
from config.settings import ARIMA_P, ARIMA_D, ARIMA_Q, MAX_PRE_DAYS
|
||||
|
||||
|
||||
@st.cache_data(show_spinner=False)
def evaluate_arima_model(series, arima_order):
    """Fit an ARIMA(p, d, q) model on *series* and return its AIC.

    A failed fit yields ``float("inf")`` so grid searches can rank an
    un-fittable order as the worst possible candidate.
    """
    try:
        fitted = ARIMA(series, order=arima_order).fit()
        return fitted.aic
    except Exception:
        return float("inf")
|
||||
|
||||
|
||||
@st.cache_data(show_spinner=False)
def arima_forecast_with_grid_search(accident_series: pd.Series,
                                    start_date: pd.Timestamp,
                                    horizon: int = 30,
                                    p_values: list = tuple(ARIMA_P),
                                    d_values: list = tuple(ARIMA_D),
                                    q_values: list = tuple(ARIMA_Q)) -> pd.DataFrame:
    """Grid-search ARIMA orders by AIC and forecast ``horizon`` days.

    Parameters
    ----------
    accident_series : daily accident counts; gaps are filled with 0.
    start_date : first date of the returned forecast index.
    horizon : number of days to forecast.
    p_values / d_values / q_values : candidate (p, d, q) components.

    Returns
    -------
    DataFrame indexed by date with the model's ``summary_frame`` columns;
    the ``mean`` column is renamed to ``forecast``.
    """
    series = accident_series.asfreq('D').fillna(0)
    start_date = pd.to_datetime(start_date)

    warnings.filterwarnings("ignore", category=ValueWarning)
    best_score, best_cfg = float("inf"), None
    for p in p_values:
        for d in d_values:
            for q in q_values:
                order = (p, d, q)
                try:
                    aic = evaluate_arima_model(series, order)
                    if aic < best_score:
                        best_score, best_cfg = aic, order
                except Exception:
                    continue

    # BUGFIX: if every candidate order failed to fit, best_cfg stayed None
    # and ARIMA(series, order=None) raised.  Fall back to ARIMA(1, 0, 1).
    if best_cfg is None:
        best_cfg = (1, 0, 1)

    model = ARIMA(series, order=best_cfg)
    fit = model.fit()
    forecast_index = pd.date_range(start=start_date, periods=horizon, freq='D')
    res = fit.get_forecast(steps=horizon)
    df = res.summary_frame()
    df.index = forecast_index
    df.index.name = 'date'
    df.rename(columns={'mean': 'forecast'}, inplace=True)
    return df
|
||||
|
||||
|
||||
def knn_forecast_counterfactual(accident_series: pd.Series,
                                intervention_date: pd.Timestamp,
                                lookback: int = 14,
                                horizon: int = 30):
    """Recursive KNN forecast of the counterfactual (no-intervention) series.

    Trains on lagged daily counts strictly before ``intervention_date`` and
    rolls the prediction forward one day at a time for ``horizon`` days.
    Returns ``(predictions, None)``, or ``(None, None)`` when there is not
    enough pre-intervention history.
    """
    daily = accident_series.asfreq('D').fillna(0)
    intervention_date = pd.to_datetime(intervention_date).normalize()

    frame = pd.DataFrame({'y': daily})
    for lag in range(1, lookback + 1):
        frame[f'lag_{lag}'] = frame['y'].shift(lag)

    cutoff = intervention_date - pd.Timedelta(days=1)
    train = frame.loc[:cutoff].dropna()
    if len(train) < 5:
        return None, None
    model = KNeighborsRegressor(n_neighbors=5)
    model.fit(train.filter(like='lag_').values, train['y'].values)

    history = frame.loc[:cutoff, 'y'].tolist()
    predictions = []
    for _ in range(horizon):
        if len(history) < lookback:
            return None, None
        # Most recent day first, matching the lag_1..lag_n feature layout.
        features = np.array(history[-lookback:][::-1]).reshape(1, -1)
        next_value = model.predict(features)[0]
        predictions.append(next_value)
        history.append(next_value)

    pred_index = pd.date_range(intervention_date, periods=horizon, freq='D')
    return pd.Series(predictions, index=pred_index, name='knn_pred'), None
|
||||
|
||||
|
||||
def fit_and_extrapolate(series: pd.Series,
                        intervention_date: pd.Timestamp,
                        days: int = 30,
                        max_pre_days: int = MAX_PRE_DAYS):
    """Fit Poisson-GLM and SVR trend models on the pre-intervention window
    and extrapolate both for ``days`` days from ``intervention_date``.

    Returns ``(glm_pred, svr_pred, residuals)`` where each prediction is a
    daily ``pd.Series`` (or ``None`` if that model's fit failed) and
    ``residuals`` is ``actual - svr_pred`` over the post window (``None``
    when SVR failed).  Returns ``(None, None, None)`` when fewer than 3
    pre-intervention days are available.
    """
    series = series.asfreq('D').fillna(0)
    # Normalise both sides to tz-naive midnight timestamps so label slicing
    # lines up exactly.
    series.index = pd.to_datetime(series.index).tz_localize(None).normalize()
    intervention_date = pd.to_datetime(intervention_date).tz_localize(None).normalize()

    pre = series.loc[:intervention_date - pd.Timedelta(days=1)]
    if len(pre) > max_pre_days:
        # Cap the training window so very old history does not dominate.
        pre = pre.iloc[-max_pre_days:]
    if len(pre) < 3:
        return None, None, None

    # Integer time axis: 0..n-1 for fitting, n..n+days-1 for the forecast.
    x_pre = np.arange(len(pre))
    x_future = np.arange(len(pre), len(pre) + days)

    try:
        # Quadratic-trend Poisson GLM (appropriate for non-negative counts).
        X_pre_glm = sm.add_constant(np.column_stack([x_pre, x_pre**2]))
        glm = sm.GLM(pre.values, X_pre_glm, family=sm.families.Poisson())
        glm_res = glm.fit()
        X_future_glm = sm.add_constant(np.column_stack([x_future, x_future**2]))
        glm_pred = glm_res.predict(X_future_glm)
    except Exception:
        glm_pred = None

    try:
        # RBF support-vector regression on the same scaled time axis.
        svr = make_pipeline(StandardScaler(), SVR(kernel='rbf', C=10, gamma=0.1))
        svr.fit(x_pre.reshape(-1, 1), pre.values)
        svr_pred = svr.predict(x_future.reshape(-1, 1))
    except Exception:
        svr_pred = None

    post_index = pd.date_range(intervention_date, periods=days, freq='D')

    glm_pred = pd.Series(glm_pred, index=post_index, name='glm_pred') if glm_pred is not None else None
    svr_pred = pd.Series(svr_pred, index=post_index, name='svr_pred') if svr_pred is not None else None

    post = series.reindex(post_index)
    residuals = None
    if svr_pred is not None:
        # Negative residuals mean fewer accidents than the model expected.
        residuals = pd.Series(post.values - svr_pred[:len(post)], index=post_index, name='residual')

    return glm_pred, svr_pred, residuals
|
||||
|
||||
227
services/hotspot.py
Normal file
227
services/hotspot.py
Normal file
@@ -0,0 +1,227 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from datetime import datetime
|
||||
from typing import Iterable
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
LOCATION_KEYWORDS: tuple[str, ...] = (
|
||||
"路",
|
||||
"道",
|
||||
"街",
|
||||
"巷",
|
||||
"路口",
|
||||
"交叉口",
|
||||
"大道",
|
||||
"公路",
|
||||
"口",
|
||||
)
|
||||
AREA_KEYWORDS: tuple[str, ...] = (
|
||||
"新城",
|
||||
"临城",
|
||||
"千岛",
|
||||
"翁山",
|
||||
"海天",
|
||||
"海宇",
|
||||
"定沈",
|
||||
"滨海",
|
||||
"港岛",
|
||||
"体育",
|
||||
"长升",
|
||||
"金岛",
|
||||
"桃湾",
|
||||
)
|
||||
|
||||
LOCATION_MAPPING: dict[str, str] = {
|
||||
"新城千岛路": "千岛路",
|
||||
"千岛路海天大道": "千岛路海天大道口",
|
||||
"海天大道千岛路": "千岛路海天大道口",
|
||||
"新城翁山路": "翁山路",
|
||||
"翁山路金岛路": "翁山路金岛路口",
|
||||
"海天大道临长路": "海天大道临长路口",
|
||||
"定沈路卫生医院门口": "定沈路医院段",
|
||||
"翁山路海城路西口": "翁山路海城路口",
|
||||
"海宇道路口": "海宇道",
|
||||
"海天大道路口": "海天大道",
|
||||
"定沈路交叉路口": "定沈路",
|
||||
"千岛路路口": "千岛路",
|
||||
"体育路路口": "体育路",
|
||||
"金岛路路口": "金岛路",
|
||||
}
|
||||
|
||||
SEVERITY_MAP: dict[str, int] = {"财损": 1, "伤人": 2, "亡人": 4}
|
||||
|
||||
|
||||
def _extract_road_info(location: str | float | None) -> str:
    """Pull a road/area name out of a free-text accident location.

    Returns the first whitespace/comma-separated token containing a known
    road or area keyword, the full text when a keyword matches but no token
    does, and the text truncated to 20 characters otherwise.  Missing
    values map to ``"未知路段"``.
    """
    if pd.isna(location):
        return "未知路段"
    text = str(location)
    for keyword in LOCATION_KEYWORDS + AREA_KEYWORDS:
        if keyword not in text:
            continue
        tokens = text.replace(",", " ").replace(",", " ").split()
        for token in tokens:
            if keyword in token:
                return token
        return text
    return text[:20] if len(text) > 20 else text
|
||||
|
||||
|
||||
def prepare_hotspot_dataset(accident_records: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of the accident records ready for hotspot analysis.

    Ensures the required text columns exist (filling gaps with defaults),
    derives an integer ``severity`` score when absent, drops rows without a
    parseable accident time, and adds a ``standardized_location`` column.
    """
    prepared = accident_records.copy()
    defaults: dict[str, str] = {
        "道路类型": "未知道路类型",
        "路口路段类型": "未知路段",
        "事故具体地点": "未知路段",
        "事故类型": "财损",
        "所在街道": "未知街道",
    }
    for name, fallback in defaults.items():
        if name in prepared.columns:
            prepared[name] = prepared[name].fillna(fallback)
        else:
            prepared[name] = fallback

    if "severity" not in prepared.columns:
        prepared["severity"] = prepared["事故类型"].map(SEVERITY_MAP).fillna(1).astype(int)

    prepared["事故时间"] = pd.to_datetime(prepared["事故时间"], errors="coerce")
    prepared = prepared.dropna(subset=["事故时间"]).sort_values("事故时间").reset_index(drop=True)
    prepared["standardized_location"] = (
        prepared["事故具体地点"].apply(_extract_road_info).replace(LOCATION_MAPPING)
    )
    return prepared
|
||||
|
||||
|
||||
def analyze_hotspot_frequency(df: pd.DataFrame, time_window: str = "7D") -> pd.DataFrame:
    """Aggregate accidents per standardized location, overall and recent.

    ``time_window`` (a pandas offset string) defines the "recent" period
    ending at the newest accident timestamp.  The result is sorted by
    recent count, then total count, both descending.
    """
    latest = df["事故时间"].max()
    recent_cutoff = latest - pd.Timedelta(time_window)

    overall_stats = df.groupby("standardized_location").agg(
        accident_count=("事故时间", "count"),
        last_accident=("事故时间", "max"),
        main_accident_type=("事故类型", _mode_fallback),
        main_road_type=("道路类型", _mode_fallback),
        main_intersection_type=("路口路段类型", _mode_fallback),
        total_severity=("severity", "sum"),
    )

    recent_stats = (
        df[df["事故时间"] >= recent_cutoff]
        .groupby("standardized_location")
        .agg(
            recent_count=("事故时间", "count"),
            recent_accident_type=("事故类型", _mode_fallback),
            recent_severity=("severity", "sum"),
        )
    )

    merged = overall_stats.merge(recent_stats, left_index=True, right_index=True, how="left")
    # Locations with no recent accidents get zero counts; any remaining NaN
    # (e.g. recent_accident_type) becomes an empty string.
    merged = merged.fillna({"recent_count": 0, "recent_severity": 0}).fillna("")
    merged["recent_count"] = merged["recent_count"].astype(int)
    merged["trend_ratio"] = merged["recent_count"] / merged["accident_count"]
    merged["days_since_last"] = (latest - merged["last_accident"]).dt.days.astype(int)
    merged["avg_severity"] = merged["total_severity"] / merged["accident_count"]
    return merged.sort_values(["recent_count", "accident_count"], ascending=False)
|
||||
|
||||
|
||||
def calculate_hotspot_risk_score(hotspot_df: pd.DataFrame) -> pd.DataFrame:
    """Score each hotspot 0-100 and bucket it into a risk level.

    Component scores: frequency (0-40, relative to the busiest location),
    trend (0-30), severity (5/15/20 by dominant accident type) and urgency
    (0-10, decaying over 30 days since the last accident).  The result is
    sorted by ``risk_score`` descending; an empty frame is returned as-is.
    """
    scored = hotspot_df.copy()
    if scored.empty:
        return scored

    peak = scored["accident_count"].max()
    scored["frequency_score"] = (scored["accident_count"] / peak * 40).clip(0, 40)
    scored["trend_score"] = (scored["trend_ratio"] * 30).clip(0, 30)
    severity_points = {"财损": 5, "伤人": 15, "亡人": 20}
    scored["severity_score"] = scored["main_accident_type"].map(severity_points).fillna(5)
    scored["urgency_score"] = ((30 - scored["days_since_last"]) / 30 * 10).clip(0, 10)
    scored["risk_score"] = (
        scored["frequency_score"]
        + scored["trend_score"]
        + scored["severity_score"]
        + scored["urgency_score"]
    )
    thresholds = [
        scored["risk_score"] >= 70,
        scored["risk_score"] >= 50,
        scored["risk_score"] >= 30,
    ]
    labels = ["高风险", "中风险", "低风险"]
    scored["risk_level"] = np.select(thresholds, labels, default="一般风险")
    return scored.sort_values("risk_score", ascending=False)
|
||||
|
||||
|
||||
def generate_hotspot_strategies(
|
||||
hotspot_df: pd.DataFrame, time_period: str = "本周"
|
||||
) -> list[dict[str, str | float]]:
|
||||
strategies: list[dict[str, str | float]] = []
|
||||
for location_name, location_data in hotspot_df.iterrows():
|
||||
accident_count = float(location_data["accident_count"])
|
||||
recent_count = float(location_data.get("recent_count", 0))
|
||||
accident_type = str(location_data.get("main_accident_type", "财损"))
|
||||
intersection_type = str(location_data.get("main_intersection_type", "普通路段"))
|
||||
trend_ratio = float(location_data.get("trend_ratio", 0))
|
||||
risk_level = str(location_data.get("risk_level", "一般风险"))
|
||||
|
||||
base_info = f"{time_period}对【{location_name}】"
|
||||
data_support = (
|
||||
f"(近期{int(recent_count)}起,累计{int(accident_count)}起,{accident_type}为主)"
|
||||
)
|
||||
|
||||
strategy_parts: list[str] = []
|
||||
if "信号灯" in intersection_type:
|
||||
if accident_type == "财损":
|
||||
strategy_parts.extend(["加强闯红灯查处", "优化信号配时", "整治不按规定让行"])
|
||||
else:
|
||||
strategy_parts.extend(["完善人行过街设施", "加强非机动车管理", "设置警示标志"])
|
||||
elif "普通路段" in intersection_type:
|
||||
strategy_parts.extend(["加强巡逻管控", "整治违法停车", "设置限速标志"])
|
||||
else:
|
||||
strategy_parts.extend(["分析事故成因", "制定综合整治方案"])
|
||||
|
||||
if risk_level == "高风险":
|
||||
strategy_parts.extend(["列为重点整治路段", "开展专项整治行动"])
|
||||
elif risk_level == "中风险":
|
||||
strategy_parts.append("加强日常监管")
|
||||
|
||||
if trend_ratio > 0.4:
|
||||
strategy_parts.append("近期重点监控")
|
||||
|
||||
strategy_text = (
|
||||
base_info + "," + ",".join(strategy_parts) + data_support
|
||||
if strategy_parts
|
||||
else base_info + "加强交通安全管理" + data_support
|
||||
)
|
||||
|
||||
strategies.append(
|
||||
{
|
||||
"location": location_name,
|
||||
"strategy": strategy_text,
|
||||
"risk_level": risk_level,
|
||||
"accident_count": accident_count,
|
||||
"recent_count": recent_count,
|
||||
}
|
||||
)
|
||||
return strategies
|
||||
|
||||
|
||||
def serialise_datetime_columns(df: pd.DataFrame, columns: Iterable[str]) -> pd.DataFrame:
    """Return a copy with the named datetime columns rendered as strings.

    Columns that are absent or not datetime-typed are left untouched; the
    output format is ``YYYY-MM-DD HH:MM:SS``.
    """
    out = df.copy()
    present = (
        name for name in columns
        if name in out.columns and pd.api.types.is_datetime64_any_dtype(out[name])
    )
    for name in present:
        out[name] = out[name].dt.strftime("%Y-%m-%d %H:%M:%S")
    return out
|
||||
|
||||
|
||||
def _mode_fallback(series: pd.Series) -> str:
    """Most frequent value of *series*, as a string.

    Empty input yields ``""``; when no mode exists, the first element is
    used instead.
    """
    if series.empty:
        return ""
    modes = series.mode()
    if modes.empty:
        return str(series.iloc[0])
    return str(modes.iloc[0])
|
||||
|
||||
270
services/io.py
Normal file
270
services/io.py
Normal file
@@ -0,0 +1,270 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from typing import Iterable, Mapping
|
||||
|
||||
import pandas as pd
|
||||
import streamlit as st
|
||||
|
||||
COLUMN_ALIASES: Mapping[str, str] = {
|
||||
'事故发生时间': '事故时间',
|
||||
'发生时间': '事故时间',
|
||||
'时间': '事故时间',
|
||||
'街道': '所在街道',
|
||||
'所属街道': '所在街道',
|
||||
'所属辖区': '所在区县',
|
||||
'辖区街道': '所在街道',
|
||||
'事故发生地点': '事故地点',
|
||||
'事故地址': '事故地点',
|
||||
'事故位置': '事故地点',
|
||||
'事故具体地址': '事故具体地点',
|
||||
'案件类型': '事故类型',
|
||||
'事故类别': '事故类型',
|
||||
'事故性质': '事故类型',
|
||||
'事故类型1': '事故类型',
|
||||
}
|
||||
|
||||
ACCIDENT_TYPE_NORMALIZATION: Mapping[str, str] = {
|
||||
'财产损失': '财损',
|
||||
'财产损失事故': '财损',
|
||||
'一般程序': '伤人',
|
||||
'一般程序事故': '伤人',
|
||||
'伤人事故': '伤人',
|
||||
'造成人员受伤': '伤人',
|
||||
'造成人员死亡': '亡人',
|
||||
'死亡事故': '亡人',
|
||||
'亡人事故': '亡人',
|
||||
'亡人死亡': '亡人',
|
||||
'号': '财损',
|
||||
}
|
||||
|
||||
REGION_FROM_LOCATION_PATTERN = re.compile(r'([一-龥]{2,8}(街道|新区|开发区|镇|区))')
|
||||
|
||||
REGION_NORMALIZATION: Mapping[str, str] = {
|
||||
'临城中队': '临城街道',
|
||||
'临城新区': '临城街道',
|
||||
'临城': '临城街道',
|
||||
'新城': '临城街道',
|
||||
'千岛中队': '千岛街道',
|
||||
'千岛新区': '千岛街道',
|
||||
'千岛': '千岛街道',
|
||||
'沈家门中队': '沈家门街道',
|
||||
'沈家门': '沈家门街道',
|
||||
'普陀城区': '沈家门街道',
|
||||
'普陀': '沈家门街道',
|
||||
}
|
||||
|
||||
|
||||
def _clean_text(series: pd.Series) -> pd.Series:
    """Strip whitespace and turn obvious null placeholders into NA."""
    stripped = series.astype(str).str.strip()
    placeholders = {'', 'nan', 'NaN', 'None', 'NULL', '<NA>', '无', '—'}
    return stripped.mask(stripped.isin(placeholders))
|
||||
|
||||
|
||||
def _maybe_seek_start(file_obj) -> None:
    """Rewind *file_obj* to position 0 when it supports seeking.

    Errors are swallowed so non file-like objects pass through untouched.
    """
    seek = getattr(file_obj, "seek", None)
    if seek is None:
        return
    try:
        seek(0)
    except Exception:  # pragma: no cover - guard against non file-likes
        pass
|
||||
|
||||
|
||||
def _prepare_sheet(df: pd.DataFrame) -> pd.DataFrame:
    """Standardise a single sheet from the accident-data workbook.

    Strips column names, relocates the header row when it is embedded in
    the data (legacy multi-sheet exports), and applies ``COLUMN_ALIASES``.
    Returns an empty frame for ``None``/empty input.
    """
    if df is None or df.empty:
        return pd.DataFrame()

    sheet = df.copy()
    # Normalise column names first
    sheet.columns = [str(col).strip() for col in sheet.columns]
    # If the expected time column is still not recognised, attempt to locate
    # the real header row inside the data and promote it.
    if '事故时间' not in sheet.columns and '事故发生时间' not in sheet.columns:
        header_row = None
        for idx, row in sheet.iterrows():
            values = [str(cell).strip() for cell in row.tolist()]
            if '事故时间' in values or '事故发生时间' in values or '报警时间' in values:
                header_row = idx
                break
        if header_row is not None:
            # Promote the located row to header and drop everything above it.
            sheet.columns = [str(x).strip() for x in sheet.iloc[header_row].tolist()]
            sheet = sheet.iloc[header_row + 1 :].reset_index(drop=True)
            sheet.columns = [str(col).strip() for col in sheet.columns]

    # Apply aliases after potential header relocation
    sheet = sheet.rename(columns={src: dst for src, dst in COLUMN_ALIASES.items() if src in sheet.columns})

    return sheet
|
||||
|
||||
|
||||
def _coalesce_columns(df: pd.DataFrame, columns: Iterable[str]) -> pd.Series:
    """First non-null cleaned value across *columns*, row by row.

    Columns are consulted in the given order; missing columns are skipped.
    """
    combined = pd.Series(pd.NA, index=df.index, dtype="object")
    for name in columns:
        if name not in df.columns:
            continue
        combined = combined.fillna(_clean_text(df[name]))
    return combined
|
||||
|
||||
|
||||
def _infer_region_from_location(location: str) -> str | None:
    """Extract an administrative-area name (街道/新区/…) from a location.

    Returns ``None`` for missing/blank input or when no area pattern
    matches.
    """
    if pd.isna(location):
        return None
    text = str(location).strip()
    if not text:
        return None
    match = REGION_FROM_LOCATION_PATTERN.search(text)
    return match.group(1) if match else None
|
||||
|
||||
|
||||
def _normalise_region_series(series: pd.Series) -> pd.Series:
    """Map raw region labels onto canonical street names, leaving NA as-is."""
    def _canonical(value):
        if pd.isna(value):
            return value
        return REGION_NORMALIZATION.get(value, value)

    return series.map(_canonical)
|
||||
|
||||
|
||||
def load_accident_records(accident_file, *, require_location: bool = False) -> pd.DataFrame:
    """
    Load accident records from the updated Excel template.

    The function supports workbooks with a single sheet (e.g. sample/事故处理/事故2021-2022.xlsx)
    as well as legacy multi-sheet formats where the header row might sit within the data.

    Parameters
    ----------
    accident_file : path or file-like object accepted by ``pd.read_excel``.
    require_location : when True, a usable location column becomes mandatory
        and rows without one are dropped.

    Returns
    -------
    DataFrame sorted by 事故时间 with normalised 所在街道 / 事故类型 /
    事故具体地点 columns and an integer ``severity`` score per row.

    Raises
    ------
    ValueError : when no valid records or required columns are found.
    """
    _maybe_seek_start(accident_file)
    # sheet_name=None reads every sheet; each is standardised independently.
    sheets = pd.read_excel(accident_file, sheet_name=None)
    if isinstance(sheets, dict):
        frames = [frame for frame in ( _prepare_sheet(df) for df in sheets.values() ) if not frame.empty]
    else:  # pragma: no cover - pandas only returns dict when sheet_name=None, but keep guard
        frames = [_prepare_sheet(sheets)]

    if not frames:
        raise ValueError("未在上传的事故数据中检测到有效的事故记录,请确认文件内容。")

    accident_df = pd.concat(frames, ignore_index=True)

    # Normalise columns of interest: fall back to the alarm time when the
    # accident-time column is absent.
    if '事故时间' not in accident_df.columns and '报警时间' in accident_df.columns:
        accident_df['事故时间'] = accident_df['报警时间']

    if '事故时间' not in accident_df.columns:
        raise ValueError("事故数据缺少“事故时间”字段,请确认模板是否为最新版本。")

    accident_df['事故时间'] = pd.to_datetime(accident_df['事故时间'], errors='coerce')

    # Location harmonisation (used for both region inference and hotspot analysis)
    location_columns_available = [col for col in ['事故具体地点', '事故地点'] if col in accident_df.columns]
    location_series = _coalesce_columns(accident_df, ['事故具体地点', '事故地点'])

    # Region handling
    region = _coalesce_columns(accident_df, ['所在街道', '所属街道', '所在区县', '辖区中队'])
    # Infer region from location fields when still missing; as a last resort
    # use the cleaned location text itself.
    if region.isna().any():
        inferred = location_series.map(_infer_region_from_location)
        region = region.fillna(inferred)
        region = region.fillna(_clean_text(location_series))

    region_clean = _clean_text(region)
    accident_df['所在街道'] = _normalise_region_series(region_clean)

    # Accident type normalisation.
    accident_type = _coalesce_columns(accident_df, ['事故类型', '事故类别', '事故性质'])
    accident_type = accident_type.replace(ACCIDENT_TYPE_NORMALIZATION)
    # NOTE(review): replace() runs again after _clean_text so values that only
    # match a normalisation key once stripped are still mapped.
    accident_type = _clean_text(accident_type).replace(ACCIDENT_TYPE_NORMALIZATION)
    accident_df['事故类型'] = accident_type.fillna('财损')

    # Location column harmonisation
    if require_location and not location_columns_available and location_series.isna().all():
        raise ValueError("事故数据缺少“事故具体地点”字段,请确认模板是否与 sample/事故处理 中示例一致。")
    accident_df['事故具体地点'] = _clean_text(location_series)

    # Drop records with missing core fields
    subset = ['事故时间', '所在街道', '事故类型']
    if require_location:
        subset.append('事故具体地点')
    accident_df = accident_df.dropna(subset=subset)

    # Severity score: 财损=1, 伤人=2, 亡人=4; unknown types default to 1.
    severity_map = {'财损': 1, '伤人': 2, '亡人': 4}
    accident_df['severity'] = accident_df['事故类型'].map(severity_map).fillna(1).astype(int)

    accident_df = accident_df.sort_values('事故时间').reset_index(drop=True)

    return accident_df
|
||||
|
||||
|
||||
@st.cache_data(show_spinner=False)
def load_and_clean_data(accident_file, strategy_file):
    """Load accident and strategy workbooks into tidy frames.

    Returns ``(accident_data, strategy_df)``: the accident frame carries
    ``date_time`` / ``region`` / ``category`` / ``severity`` columns, the
    strategy frame ``date_time`` / ``strategy_type``.

    Raises ``ValueError`` when the strategy workbook lacks a required
    column.
    """
    records = load_accident_records(accident_file)
    accident_data = records.rename(
        columns={'事故时间': 'date_time', '所在街道': 'region', '事故类型': 'category'}
    )

    _maybe_seek_start(strategy_file)
    strategies = pd.read_excel(strategy_file)
    strategies = strategies.rename(columns=lambda col: str(col).strip())
    if '发布时间' not in strategies.columns:
        raise ValueError("策略数据缺少“发布时间”字段,请确认文件格式。")

    strategies['发布时间'] = pd.to_datetime(strategies['发布时间'], errors='coerce')
    if '交通策略类型' not in strategies.columns:
        raise ValueError("策略数据缺少“交通策略类型”字段,请确认文件格式。")

    strategies['交通策略类型'] = _clean_text(strategies['交通策略类型'])
    strategies = strategies.dropna(subset=['发布时间', '交通策略类型'])

    accident_data = accident_data[['date_time', 'region', 'category', 'severity']]
    strategies = strategies[['发布时间', '交通策略类型']].rename(
        columns={'发布时间': 'date_time', '交通策略类型': 'strategy_type'}
    )

    return accident_data, strategies
|
||||
|
||||
|
||||
@st.cache_data(show_spinner=False)
def aggregate_daily_data(accident_data: pd.DataFrame, strategy_data: pd.DataFrame) -> pd.DataFrame:
    """Combine accidents and strategies into one continuous daily frame.

    The result has one row per calendar day with ``accident_count`` and
    ``severity`` (zero-filled for quiet days) plus ``strategy_type``, the
    list of strategies published that day (empty list when none).
    """
    def _as_list(value):
        return value if isinstance(value, list) else []

    accidents = accident_data.copy()
    strategies = strategy_data.copy()

    accidents['date'] = accidents['date_time'].dt.date
    daily_accidents = accidents.groupby('date').agg(
        accident_count=('date_time', 'count'),
        severity=('severity', 'sum')
    )
    daily_accidents.index = pd.to_datetime(daily_accidents.index)

    strategies['date'] = strategies['date_time'].dt.date
    daily_strategies = strategies.groupby('date')['strategy_type'].apply(list)
    daily_strategies.index = pd.to_datetime(daily_strategies.index)

    combined = daily_accidents.join(daily_strategies, how='left')
    combined['strategy_type'] = combined['strategy_type'].apply(_as_list)
    # asfreq('D') inserts NaN rows for days with no data; re-fill afterwards.
    combined = combined.asfreq('D')
    combined[['accident_count', 'severity']] = combined[['accident_count', 'severity']].fillna(0)
    combined['strategy_type'] = combined['strategy_type'].apply(_as_list)
    return combined
|
||||
|
||||
|
||||
@st.cache_data(show_spinner=False)
def aggregate_daily_data_by_region(accident_data: pd.DataFrame, strategy_data: pd.DataFrame) -> pd.DataFrame:
    """Daily accident aggregates per region, with shared strategy lists.

    Returns a frame indexed by (region, date) covering every region for
    every day in the observed range; days without accidents are
    zero-filled.  ``strategy_type`` holds the list of strategies published
    that day — strategies carry no region, so all regions share the same
    list for a given date.
    """
    df = accident_data.copy()
    df['date'] = df['date_time'].dt.date
    g = df.groupby(['region', 'date']).agg(
        accident_count=('date_time', 'count'),
        severity=('severity', 'sum')
    )
    # Convert the date level from python dates to Timestamps so the reindex
    # against pd.date_range below matches.
    g.index = g.index.set_levels([g.index.levels[0], pd.to_datetime(g.index.levels[1])])
    g = g.sort_index()

    s = strategy_data.copy()
    s['date'] = s['date_time'].dt.date
    daily_strategies = s.groupby('date')['strategy_type'].apply(list)
    daily_strategies.index = pd.to_datetime(daily_strategies.index)

    # Build the dense (region x day) grid over the full observed date span.
    regions = g.index.get_level_values(0).unique()
    dates = pd.date_range(g.index.get_level_values(1).min(), g.index.get_level_values(1).max(), freq='D')
    full_index = pd.MultiIndex.from_product([regions, dates], names=['region', 'date'])
    g = g.reindex(full_index).fillna(0)

    # Attach that day's strategy list (empty list when none was published).
    strat_map = daily_strategies.to_dict()
    g = g.assign(strategy_type=[strat_map.get(d, []) for d in g.index.get_level_values('date')])
    return g
|
||||
86
services/metrics.py
Normal file
86
services/metrics.py
Normal file
@@ -0,0 +1,86 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from sklearn.metrics import mean_absolute_error, mean_squared_error
|
||||
from statsmodels.tsa.arima.model import ARIMA
|
||||
import streamlit as st
|
||||
|
||||
|
||||
@st.cache_data(show_spinner=False)
def evaluate_models(series: pd.Series,
                    horizon: int = 30,
                    lookback: int = 14,
                    p_values: range = range(0, 4),
                    d_values: range = range(0, 2),
                    q_values: range = range(0, 4)) -> pd.DataFrame:
    """
    Hold-out evaluation (last ``horizon`` days as the validation set)
    comparing ARIMA / KNN / GLM / SVR on a daily accident-count series.

    Reports MAE, RMSE and MAPE per model, sorted ascending by RMSE.
    Raises ``ValueError`` when the series is too short to hold out
    ``horizon`` days.
    """
    series = series.asfreq('D').fillna(0)
    if len(series) <= horizon + 10:
        raise ValueError("序列太短,无法留出 %d 天进行评估。" % horizon)

    train, test = series.iloc[:-horizon], series.iloc[-horizon:]

    def _to_series_like(pred, a_index):
        # Align predictions (Series or array-like) with the actuals' index.
        if isinstance(pred, pd.Series):
            return pred.reindex(a_index)
        return pd.Series(pred, index=a_index)

    def _metrics(a: pd.Series, p) -> dict:
        p = _to_series_like(p, a.index).astype(float)
        a = a.astype(float)
        mae = mean_absolute_error(a, p)
        try:
            # Older sklearn supports squared=False; newer releases removed it,
            # in which case we take the square root manually.
            rmse = mean_squared_error(a, p, squared=False)
        except TypeError:
            rmse = mean_squared_error(a, p) ** 0.5
        # Zero actuals become NaN so nanmean skips them (avoids div-by-zero).
        mape = np.nanmean(np.abs((a - p) / np.where(a == 0, np.nan, a))) * 100
        return {"MAE": mae, "RMSE": rmse, "MAPE": mape}

    results = {}

    # ARIMA: AIC grid search fitted on the training split only.
    best_aic, best_order = float('inf'), (1, 0, 1)
    for p in p_values:
        for d in d_values:
            for q in q_values:
                try:
                    aic = ARIMA(train, order=(p, d, q)).fit().aic
                    if aic < best_aic:
                        best_aic, best_order = aic, (p, d, q)
                except Exception:
                    continue
    arima_train = train.asfreq('D').fillna(0)
    arima_pred = ARIMA(arima_train, order=best_order).fit().forecast(steps=horizon)
    results['ARIMA'] = _metrics(test, arima_pred)

    # Import local utilities to avoid circular dependencies
    from services.forecast import knn_forecast_counterfactual, fit_and_extrapolate

    try:
        # KNN/GLM/SVR treat the first held-out day as the "intervention"
        # so they train only on the training split internally.
        knn_pred, _ = knn_forecast_counterfactual(series,
                                                  train.index[-1] + pd.Timedelta(days=1),
                                                  lookback=lookback,
                                                  horizon=horizon)
        if knn_pred is not None:
            results['KNN'] = _metrics(test, knn_pred)
    except Exception:
        pass

    try:
        glm_pred, svr_pred, _ = fit_and_extrapolate(series,
                                                    train.index[-1] + pd.Timedelta(days=1),
                                                    days=horizon)
        if glm_pred is not None:
            results['GLM'] = _metrics(test, glm_pred)
        if svr_pred is not None:
            results['SVR'] = _metrics(test, svr_pred)
    except Exception:
        pass

    return (pd.DataFrame(results)
            .T.sort_values('RMSE')
            .round(3))
|
||||
|
||||
157
services/strategy.py
Normal file
157
services/strategy.py
Normal file
@@ -0,0 +1,157 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import pandas as pd
|
||||
import streamlit as st
|
||||
|
||||
from services.forecast import fit_and_extrapolate, arima_forecast_with_grid_search
|
||||
from config.settings import MIN_PRE_DAYS, MAX_PRE_DAYS
|
||||
|
||||
|
||||
def evaluate_strategy_effectiveness(actual_series: pd.Series,
                                    counterfactual_series: pd.Series,
                                    severity_series: pd.Series,
                                    strategy_date: pd.Timestamp,
                                    window: int = 30):
    """Judge a strategy by comparing actuals against the counterfactual.

    Over a ``window``-day post period starting at ``strategy_date``:

    - ``count_effective``: actuals beat the counterfactual on at least
      half of the days;
    - ``severity_effective``: post-window severity total dropped versus
      the preceding window of the same length;
    - ``F1``: relative accident reduction vs the counterfactual;
    - ``F2``: relative severity reduction vs the pre window;
    - ``safety_state``: '一级' (F1 and F2 > 0.5), '二级' (F1 > 0.3),
      else '三级'.

    Returns ``(count_effective, severity_effective, (F1, F2), safety_state)``;
    an empty post window yields ``(False, False, (0.0, 0.0), '三级')``.
    """
    strategy_date = pd.to_datetime(strategy_date)
    window_end = strategy_date + pd.Timedelta(days=window - 1)
    pre_start = strategy_date - pd.Timedelta(days=window)
    pre_end = strategy_date - pd.Timedelta(days=1)

    pre_sev = severity_series.loc[pre_start:pre_end].sum()
    post_sev = severity_series.loc[strategy_date:window_end].sum()

    actual_post = actual_series.loc[strategy_date:window_end]
    counter_post = counterfactual_series.loc[strategy_date:window_end].reindex(actual_post.index)

    n_days = len(actual_post)
    if n_days == 0:
        return False, False, (0.0, 0.0), '三级'

    days_below = (actual_post < counter_post).sum()
    count_effective = days_below >= (n_days / 2)
    severity_effective = post_sev < pre_sev

    cf_total = counter_post.sum()
    F1 = (cf_total - actual_post.sum()) / cf_total if cf_total > 0 else 0.0
    F2 = (pre_sev - post_sev) / pre_sev if pre_sev > 0 else 0.0

    if F1 > 0.5 and F2 > 0.5:
        safety_state = '一级'
    elif F1 > 0.3:
        safety_state = '二级'
    else:
        safety_state = '三级'
    return count_effective, severity_effective, (F1, F2), safety_state
|
||||
|
||||
|
||||
@st.cache_data(show_spinner=False)
def generate_output_and_recommendations(combined_data: pd.DataFrame,
                                        strategy_types: list,
                                        region: str = '全市',
                                        horizon: int = 30):
    """Evaluate each strategy type against a counterfactual forecast and
    recommend the best one.

    For every strategy type present in ``combined_data['strategy_type']``,
    an intervention date with enough pre-history is chosen, a counterfactual
    series is built (SVR preferred, then GLM, then an ARIMA fallback) and
    the strategy is scored via ``evaluate_strategy_effectiveness``.  A
    secondary pass with a fixed 14-day window runs when the first pass
    produced no results at all.

    Returns ``(results, recommendation)``: ``results`` maps strategy type
    to a metrics dict; ``recommendation`` is a human-readable string naming
    the strategy with the highest ``adaptability`` (F1 + F2).
    """
    results = {}
    combined_data = combined_data.copy().asfreq('D')
    combined_data[['accident_count','severity']] = combined_data[['accident_count','severity']].fillna(0)
    combined_data['strategy_type'] = combined_data['strategy_type'].apply(lambda x: x if isinstance(x, list) else [])

    acc_full = combined_data['accident_count']
    sev_full = combined_data['severity']

    # Longest history the trend models are allowed to fit on.
    max_fit_days = max(horizon + 60, MAX_PRE_DAYS)

    for strategy in strategy_types:
        has_strategy = combined_data['strategy_type'].apply(lambda x: strategy in x)
        if not has_strategy.any():
            continue
        candidate_dates = has_strategy[has_strategy].index
        intervention_date = None
        fit_start_dt = None
        # Prefer the earliest occurrence with at least MIN_PRE_DAYS of
        # pre-intervention history inside the fitting window.
        for dt in candidate_dates:
            fit_start_dt = max(acc_full.index.min(), dt - pd.Timedelta(days=max_fit_days))
            pre_hist = acc_full.loc[fit_start_dt:dt - pd.Timedelta(days=1)]
            if len(pre_hist) >= MIN_PRE_DAYS:
                intervention_date = dt
                break
        if intervention_date is None:
            # No occurrence has enough history; fall back to the first one.
            intervention_date = candidate_dates[0]
            fit_start_dt = max(acc_full.index.min(), intervention_date - pd.Timedelta(days=max_fit_days))

        acc = acc_full.loc[fit_start_dt:]
        sev = sev_full.loc[fit_start_dt:]
        # Shrink the evaluation window to the post-intervention data we
        # actually have, but never below one week.
        horizon_eff = max(7, min(horizon, len(acc.loc[intervention_date:]) ))

        glm_pred, svr_pred, residuals = fit_and_extrapolate(acc, intervention_date, days=horizon_eff)

        # Counterfactual preference: SVR, then GLM, then an ARIMA fallback.
        counter = None
        if svr_pred is not None:
            counter = svr_pred
        elif glm_pred is not None:
            counter = glm_pred
        else:
            try:
                arima_df = arima_forecast_with_grid_search(acc.loc[:intervention_date],
                                                           start_date=intervention_date + pd.Timedelta(days=1),
                                                           horizon=horizon_eff)
                counter = pd.Series(arima_df['forecast'].values, index=arima_df.index, name='cf_arima')
                residuals = (acc.reindex(counter.index) - counter)
            except Exception:
                counter = None
        if counter is None:
            continue

        count_eff, sev_eff, (F1, F2), state = evaluate_strategy_effectiveness(
            actual_series=acc,
            counterfactual_series=counter,
            severity_series=sev,
            strategy_date=intervention_date,
            window=horizon_eff
        )
        # effect_strength: mean (actual - counterfactual); negative is good.
        results[strategy] = {
            'effect_strength': float(residuals.dropna().mean()) if residuals is not None else 0.0,
            'adaptability': float(F1 + F2),
            'count_effective': bool(count_eff),
            'severity_effective': bool(sev_eff),
            'safety_state': state,
            'F1': float(F1),
            'F2': float(F2),
            'intervention_date': str(intervention_date.date())
        }

    # Secondary attempt with 14-day window if no results
    if not results:
        for strategy in strategy_types:
            has_strategy = combined_data['strategy_type'].apply(lambda x: strategy in x)
            if not has_strategy.any():
                continue
            # Use the first occurrence unconditionally with the full history.
            intervention_date = has_strategy[has_strategy].index[0]
            glm_pred, svr_pred, residuals = fit_and_extrapolate(acc_full, intervention_date, days=14)
            counter = None
            if svr_pred is not None:
                counter = svr_pred
            elif glm_pred is not None:
                counter = glm_pred
            else:
                try:
                    arima_df = arima_forecast_with_grid_search(acc_full.loc[:intervention_date],
                                                               start_date=intervention_date + pd.Timedelta(days=1),
                                                               horizon=14)
                    counter = pd.Series(arima_df['forecast'].values, index=arima_df.index, name='cf_arima')
                    residuals = (acc_full.reindex(counter.index) - counter)
                except Exception:
                    counter = None
            if counter is None:
                continue
            count_eff, sev_eff, (F1, F2), state = evaluate_strategy_effectiveness(
                actual_series=acc_full,
                counterfactual_series=counter,
                severity_series=sev_full,
                strategy_date=intervention_date,
                window=14
            )
            results[strategy] = {
                'effect_strength': float(residuals.dropna().mean()) if residuals is not None else 0.0,
                'adaptability': float(F1 + F2),
                'count_effective': bool(count_eff),
                'severity_effective': bool(sev_eff),
                'safety_state': state,
                'F1': float(F1),
                'F2': float(F2),
                'intervention_date': str(intervention_date.date())
            }

    best_strategy = max(results, key=lambda x: results[x]['adaptability']) if results else None
    recommendation = f"建议在{region}区域长期实施策略类型 {best_strategy}" if best_strategy else "无足够数据推荐策略"
    return results, recommendation
|
||||
|
||||
Reference in New Issue
Block a user