first commit

This commit is contained in:
2026-03-05 11:50:15 +08:00
commit b1b14fd964
45 changed files with 7779 additions and 0 deletions

View File

@@ -0,0 +1 @@
# Services package marker.

View File

@@ -0,0 +1,70 @@
from io import BytesIO
from pathlib import Path
from openpyxl import Workbook, load_workbook
# Ordered (field key, Excel header label) pairs. The template, import and
# export helpers in this module all iterate this list, so its order is the
# column order of every generated sheet and its labels are the headers that
# the importer recognises.
QUESTION_COLUMNS = [
    ("chapter", "章节"),
    ("primary_knowledge", "一级知识点"),
    ("secondary_knowledge", "二级知识点"),
    ("question_type", "题目类型"),
    ("difficulty", "难度"),
    ("stem", "题干"),
    ("option_a", "选项A"),
    ("option_b", "选项B"),
    ("option_c", "选项C"),
    ("option_d", "选项D"),
    ("answer", "正确答案"),
    ("explanation", "解析"),
    ("notes", "备注"),
    ("source_file", "来源文件"),
]
def create_template_bytes() -> bytes:
    """Build the import template workbook and return it as ``.xlsx`` bytes.

    The workbook has one sheet titled ``questions`` containing the header
    labels from ``QUESTION_COLUMNS`` followed by a single sample row.
    """
    header_row = [label for _, label in QUESTION_COLUMNS]
    sample_row = [
        "函数",
        "",
        "函数性质",
        "单选",
        "",
        "示例题干",
        "A",
        "B",
        "C",
        "D",
        "A",
        "示例解析",
        "",
        "template.xlsx",
    ]

    workbook = Workbook()
    sheet = workbook.active
    sheet.title = "questions"
    for row in (header_row, sample_row):
        sheet.append(row)

    # Serialise to memory so the caller can stream the bytes directly.
    stream = BytesIO()
    workbook.save(stream)
    return stream.getvalue()
def parse_excel_file(path: Path) -> list[dict]:
    """Read question rows from the active sheet of an Excel workbook.

    The first row is treated as the header. Each header label is matched
    against the labels in ``QUESTION_COLUMNS``; columns with an unknown label
    are ignored. Every returned dict contains all keys from
    ``QUESTION_COLUMNS``, with cell values converted via ``str`` and missing
    or empty cells stored as ``""``.

    Args:
        path: Location of the ``.xlsx`` file to read.

    Returns:
        One dict per non-blank data row whose ``stem`` value is non-empty.
        An empty list is returned when the sheet has no rows at all.
    """
    wb = load_workbook(path, read_only=True)
    try:
        ws = wb.active
        rows = list(ws.iter_rows(values_only=True))
    finally:
        # Read-only workbooks keep the underlying file open until closed.
        wb.close()

    if not rows:
        return []

    headers = [str(v).strip() if v is not None else "" for v in rows[0]]
    key_map = {label: key for key, label in QUESTION_COLUMNS}
    # Resolve each column position to its field key once (None = unknown).
    column_keys = [key_map.get(label) for label in headers]

    result: list[dict] = []
    for row in rows[1:]:
        # Skip rows that are empty or contain only blank cells.
        if all(v is None or str(v).strip() == "" for v in row):
            continue
        item = {key: "" for key, _ in QUESTION_COLUMNS}
        # zip() stops at the shorter side, so cells beyond the header width
        # are ignored just like cells under unrecognised headers.
        for key, val in zip(column_keys, row):
            if key:
                item[key] = "" if val is None else str(val)
        if item["stem"]:
            result.append(item)
    return result
def export_excel_bytes(items: list[dict]) -> bytes:
    """Serialise question dicts into an ``.xlsx`` workbook and return its bytes.

    The sheet is titled ``questions``. Its first row holds the labels from
    ``QUESTION_COLUMNS``; each following row holds one item's values in the
    same column order, with ``""`` written for keys the item lacks.
    """
    field_keys = [key for key, _ in QUESTION_COLUMNS]
    header_labels = [label for _, label in QUESTION_COLUMNS]

    workbook = Workbook()
    sheet = workbook.active
    sheet.title = "questions"
    sheet.append(header_labels)
    for record in items:
        sheet.append([record.get(field, "") for field in field_keys])

    stream = BytesIO()
    workbook.save(stream)
    return stream.getvalue()

View File

@@ -0,0 +1,20 @@
import shutil
from pathlib import Path
from fastapi import UploadFile
from backend.config import settings
def ensure_upload_dir() -> Path:
    """Return the upload directory from ``settings.upload_dir``.

    The directory, including any missing parents, is created when it does
    not exist yet; an already existing directory is left as is.
    """
    upload_root = Path(settings.upload_dir)
    upload_root.mkdir(exist_ok=True, parents=True)
    return upload_root
def save_upload(upload_file: UploadFile) -> Path:
    """Persist an uploaded file inside the upload directory.

    The client-supplied filename is untrusted: only its final path component
    is used, so values such as ``../../x`` or ``/etc/x`` cannot place the
    file outside the upload directory.

    Args:
        upload_file: The incoming upload whose ``file`` stream is copied.

    Returns:
        Path of the file written inside the upload directory.

    Raises:
        ValueError: If the upload has no usable filename (missing, empty,
            ``.`` or ``..`` after stripping directories).
    """
    # Treat backslashes as separators too, so Windows-style client paths are
    # reduced to their last component regardless of the server platform.
    raw_name = (upload_file.filename or "").replace("\\", "/")
    safe_name = Path(raw_name).name
    if safe_name in ("", ".", ".."):
        raise ValueError(f"非法的上传文件名: {upload_file.filename!r}")

    target_path = ensure_upload_dir() / safe_name
    with target_path.open("wb") as buffer:
        shutil.copyfileobj(upload_file.file, buffer)
    return target_path

167
backend/services/parser.py Normal file
View File

@@ -0,0 +1,167 @@
import json
import re
import subprocess
from pathlib import Path
import requests
from backend.config import settings
class DMXAPIService:
    """Extract question records from a document via the configured model API.

    Non-PDF inputs are converted to PDF with pandoc, the PDF is uploaded to a
    temporary file host, and the resulting URL is sent to the model endpoint
    together with an extraction prompt. The model's text answer is parsed as
    a JSON array of question dicts keyed by the Chinese field names.
    """

    def __init__(self) -> None:
        """Read the API key, model name and endpoint URL from ``settings``."""
        self.api_key = settings.api_key
        self.model_name = settings.model_name
        self.api_url = settings.dmxapi_url

    def parse_file(self, file_path: str) -> list[dict]:
        """Parse one document and return the extracted questions.

        Args:
            file_path: Path of the document; anything without a ``.pdf``
                suffix is first converted to a scratch PDF.

        Returns:
            The list of question dicts produced by the model.

        Raises:
            ValueError: On conversion, upload, API or response-format errors.
        """
        path = Path(file_path)
        is_pdf = path.suffix.lower() == ".pdf"
        pdf_path = path if is_pdf else self._convert_to_pdf(path)
        try:
            file_url = self._upload_to_temp_host(pdf_path)
            return self._parse_with_file_url(file_url, path.name)
        finally:
            # Only the scratch PDF created by the conversion is removed; the
            # caller's own file is never touched.
            if pdf_path != path:
                pdf_path.unlink(missing_ok=True)

    def _upload_to_temp_host(self, path: Path) -> str:
        """Upload ``path`` to a temporary file host and return a download URL.

        file.io is tried first; on any failure tmpfiles.org is used instead.

        Raises:
            ValueError: If the fallback host rejects the upload.
        """
        # NOTE(review): both hosts are public third-party services, so the
        # document content leaves this system — confirm this is acceptable
        # for the data being processed.
        try:
            with path.open("rb") as f:
                response = requests.post("https://file.io", files={"file": f}, timeout=60)
            if response.status_code == 200:
                body = response.json()
                if body.get("success"):
                    return body["link"]
        except Exception:
            # Deliberate best-effort: whatever goes wrong with the primary
            # host, fall through to the backup host below.
            pass

        with path.open("rb") as f:
            response = requests.post(
                "https://tmpfiles.org/api/v1/upload", files={"file": f}, timeout=60
            )
        if response.status_code != 200:
            raise ValueError(f"上传失败: {response.text}")
        data = response.json()
        if data.get("status") != "success":
            raise ValueError(f"上传失败: {data}")
        # Rewrite the returned page URL into its direct-download form.
        return data["data"]["url"].replace("tmpfiles.org/", "tmpfiles.org/dl/")

    def _convert_to_pdf(self, path: Path) -> Path:
        """Convert ``path`` to a scratch PDF with pandoc and return its path.

        xelatex (with a CJK main font) is tried first, then weasyprint.

        Raises:
            ValueError: If both pandoc invocations exit with a non-zero code.
        """
        import uuid  # local import: only needed to name the scratch file

        # Use a unique sibling name so that an existing "<stem>.pdf" next to
        # the input is neither overwritten here nor deleted by parse_file.
        pdf_path = path.with_name(f"{path.stem}.{uuid.uuid4().hex}.pdf")
        base_cmd = ["pandoc", str(path), "-o", str(pdf_path)]
        engine_options = [
            ["--pdf-engine=xelatex", "-V", "CJKmainfont=PingFang SC"],
            ["--pdf-engine=weasyprint"],
        ]
        result = None
        for options in engine_options:
            result = subprocess.run(
                base_cmd + options, capture_output=True, text=True, timeout=90
            )
            if result.returncode == 0:
                return pdf_path
        # Drop any partial output left behind by the failed attempts.
        pdf_path.unlink(missing_ok=True)
        raise ValueError(f"文件转 PDF 失败: {result.stderr}")

    def _parse_with_file_url(self, file_url: str, original_filename: str) -> list[dict]:
        """Send the file URL and prompt to the API and return parsed questions.

        Raises:
            ValueError: If no API key is configured, the HTTP status is not
                200, or the response cannot be turned into questions.
        """
        if not self.api_key:
            raise ValueError("未配置 API_KEY无法调用 DMXAPI")
        payload = {
            "model": self.model_name,
            "input": [
                {
                    "role": "user",
                    "content": [
                        {"type": "input_file", "file_url": file_url},
                        {
                            "type": "input_text",
                            "text": self._build_instruction(original_filename),
                        },
                    ],
                }
            ],
        }
        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
        }
        response = requests.post(
            self.api_url, headers=headers, data=json.dumps(payload), timeout=180
        )
        if response.status_code != 200:
            raise ValueError(f"DMXAPI 请求失败: {response.status_code} {response.text}")
        return self._extract_questions(response.json())

    def _build_instruction(self, filename: str) -> str:
        """Return the extraction prompt, mentioning the original file name."""
        return (
            f'请从文件"{filename}"中提取所有题目信息并以JSON数组格式返回。\n'
            "提取字段题干、选项A、选项B、选项C、选项D、正确答案、解析、备注。\n"
            "数学公式统一转为 LaTeX内联公式使用 $...$,独立公式使用 $$...$$。\n"
            "答案统一大写字母,清理“答案:”等前缀。\n"
            "仅返回 JSON 数组,不要返回额外说明。"
        )

    def _extract_questions(self, response: dict) -> list[dict]:
        """Pull the model's text out of an API response and parse it.

        Optional fields missing from a question are filled with ``""``.

        Raises:
            ValueError: If the response status is not ``completed``, no text
                output is present, or the text is not a valid question array.
        """
        if response.get("status") != "completed":
            raise ValueError(f"响应状态异常: {response.get('status')}")
        text = None
        for item in response.get("output", []):
            if item.get("type") != "message":
                continue
            for content in item.get("content", []):
                if content.get("type") == "output_text":
                    # First text part of this message; a later message in the
                    # output list, if any, replaces it.
                    text = content.get("text")
                    break
        if not text:
            raise ValueError("未在响应中找到文本内容")
        questions = self._parse_json(text)
        for q in questions:
            for field in ["选项A", "选项B", "选项C", "选项D", "解析", "备注"]:
                q.setdefault(field, "")
        return questions

    def _parse_json(self, text: str) -> list[dict]:
        """Parse the JSON array embedded in ``text`` and normalise answers.

        Everything from the first ``[`` to the last ``]`` is decoded. Each
        element must be an object with ``题干`` and ``正确答案``; the answer
        is upper-cased and stripped of every character outside ``A-D0-9``.

        Raises:
            ValueError: If no bracketed array is found, the JSON is invalid
                or not a list, or an element is not an object with the
                required fields.
        """
        start_idx = text.find("[")
        end_idx = text.rfind("]")
        # Also reject a closing bracket that precedes the opening one.
        if start_idx < 0 or end_idx < start_idx:
            raise ValueError(f"未找到 JSON 数组: {text[:200]}")
        data = json.loads(text[start_idx : end_idx + 1])
        if not isinstance(data, list):
            raise ValueError("解析结果不是数组")
        for index, item in enumerate(data):
            # Non-object elements (strings, numbers, nested lists) cannot
            # carry the required fields, so they are rejected the same way.
            if not isinstance(item, dict) or "题干" not in item or "正确答案" not in item:
                raise ValueError(f"{index + 1} 题缺少题干或正确答案")
            item["正确答案"] = re.sub(r"[^A-D0-9]", "", str(item["正确答案"]).upper())
        return data
def extract_metadata(filename: str, chapter_separator: str = "") -> dict:
    """Derive question metadata from an upload's file name.

    The file stem is split on the first separator found among ``+``, space,
    ``-`` and ``_`` (checked in that order). The pieces are read positionally
    as secondary knowledge point, raw question type and difficulty; missing
    pieces become ``""``. The raw type is translated through
    ``settings.type_map`` and kept unchanged when it has no mapping.

    Args:
        filename: Original file name, with or without directories.
        chapter_separator: Optional delimiter inside the secondary knowledge
            point; when given and present, the text before it becomes the
            chapter. The default is the empty literal the previous code
            used, which made ``str.split`` raise ``ValueError`` on every
            call. NOTE(review): the intended delimiter is not recoverable
            from this file — confirm and pass it explicitly.

    Returns:
        Dict with ``chapter``, ``secondary_knowledge``, ``question_type``,
        ``difficulty`` and ``source_file``. ``chapter`` falls back to
        ``"未分类"`` when it would otherwise be empty.
    """
    basename = Path(filename).stem
    parts = [basename]
    for sep in ("+", " ", "-", "_"):
        if sep in basename:
            parts = basename.split(sep)
            break

    # Pad to three positions so short names yield empty trailing fields.
    secondary, raw_type, difficulty = (p.strip() for p in (parts + ["", ""])[:3])
    mapped_type = settings.type_map.get(raw_type, raw_type)

    chapter = secondary
    # An empty separator cannot be split on; only split when one is supplied.
    if chapter_separator and chapter_separator in secondary:
        chapter = secondary.split(chapter_separator)[0]
    chapter = chapter or "未分类"

    return {
        "chapter": chapter,
        "secondary_knowledge": secondary,
        "question_type": mapped_type,
        "difficulty": difficulty,
        "source_file": filename,
    }