first commit

2026-03-05 11:50:15 +08:00
commit b1b14fd964
45 changed files with 7779 additions and 0 deletions
--- a/backend/services/init.py
+++ b/backend/services/init.py
@@ -0,0 +1 @@
+# Services package marker.
--- a/backend/services/excel_service.py
+++ b/backend/services/excel_service.py
@@ -0,0 +1,70 @@
+from io import BytesIO
+from pathlib import Path
+
+from openpyxl import Workbook, load_workbook
+
+# Ordered (internal key, spreadsheet header label) pairs.
+# The order here is the column order used when writing sheets; the labels are
+# what is matched against the header row when reading them back.
+QUESTION_COLUMNS: list[tuple[str, str]] = [
+    # Classification
+    ("chapter", "章节"),
+    ("primary_knowledge", "一级知识点"),
+    ("secondary_knowledge", "二级知识点"),
+    ("question_type", "题目类型"),
+    ("difficulty", "难度"),
+    # Question body
+    ("stem", "题干"),
+    ("option_a", "选项A"),
+    ("option_b", "选项B"),
+    ("option_c", "选项C"),
+    ("option_d", "选项D"),
+    # Answer and commentary
+    ("answer", "正确答案"),
+    ("explanation", "解析"),
+    ("notes", "备注"),
+    # Provenance
+    ("source_file", "来源文件"),
+]
+
+
+def create_template_bytes() -> bytes:
+    """Build the import template workbook and return it as raw .xlsx bytes.
+
+    The workbook has a single sheet named "questions" containing the header
+    labels from QUESTION_COLUMNS followed by one example row.
+    """
+    header_row = [label for _, label in QUESTION_COLUMNS]
+    sample_row = [
+        "函数",
+        "",
+        "函数性质",
+        "单选",
+        "中",
+        "示例题干",
+        "A",
+        "B",
+        "C",
+        "D",
+        "A",
+        "示例解析",
+        "",
+        "template.xlsx",
+    ]
+
+    workbook = Workbook()
+    sheet = workbook.active
+    sheet.title = "questions"
+    for row in (header_row, sample_row):
+        sheet.append(row)
+
+    # Serialise in memory so the caller gets bytes without touching disk.
+    stream = BytesIO()
+    workbook.save(stream)
+    return stream.getvalue()
+
+
+def parse_excel_file(path: Path) -> list[dict]:
+    """Read question rows from the active sheet of an .xlsx file.
+
+    The first row is treated as the header; each header cell is matched
+    (after stripping whitespace) against the labels in QUESTION_COLUMNS.
+    Columns with unknown headers are ignored.
+
+    Args:
+        path: Location of the workbook to read.
+
+    Returns:
+        One dict per data row, keyed by every key in QUESTION_COLUMNS, with
+        all values converted to ``str`` (missing cells become ``""``). Rows
+        that are entirely blank, or whose "stem" ends up empty, are skipped.
+        An empty sheet yields an empty list.
+    """
+    label_to_key = {label: key for key, label in QUESTION_COLUMNS}
+    workbook = load_workbook(path, read_only=True)
+    try:
+        # Stream the rows instead of materialising the whole sheet.
+        rows = workbook.active.iter_rows(values_only=True)
+        header = next(rows, None)
+        if header is None:
+            return []
+
+        # Resolve each column position to its internal key once, up front.
+        column_keys = [
+            label_to_key.get(str(cell).strip()) if cell is not None else None
+            for cell in header
+        ]
+
+        result: list[dict] = []
+        for row in rows:
+            if all(cell is None or str(cell).strip() == "" for cell in row):
+                continue
+            item = {key: "" for key, _ in QUESTION_COLUMNS}
+            # zip() drops cells that lie beyond the header row.
+            for key, cell in zip(column_keys, row):
+                if key:
+                    item[key] = "" if cell is None else str(cell)
+            if item["stem"]:
+                result.append(item)
+        return result
+    finally:
+        # A read-only workbook keeps the source file open until closed.
+        workbook.close()
+
+
+def export_excel_bytes(items: list[dict]) -> bytes:
+    """Serialise question dicts into an .xlsx workbook and return its bytes.
+
+    The sheet is named "questions"; its first row holds the labels from
+    QUESTION_COLUMNS and each following row holds one item, in column order.
+    Keys missing from an item are written as empty strings.
+    """
+    keys = [key for key, _ in QUESTION_COLUMNS]
+    labels = [label for _, label in QUESTION_COLUMNS]
+
+    workbook = Workbook()
+    sheet = workbook.active
+    sheet.title = "questions"
+    sheet.append(labels)
+    for record in items:
+        sheet.append([record.get(key, "") for key in keys])
+
+    stream = BytesIO()
+    workbook.save(stream)
+    return stream.getvalue()
--- a/backend/services/file_utils.py
+++ b/backend/services/file_utils.py
@@ -0,0 +1,20 @@
+import shutil
+from pathlib import Path
+
+from fastapi import UploadFile
+
+from backend.config import settings
+
+
+def ensure_upload_dir() -> Path:
+    """Return the configured upload directory, creating it if necessary.
+
+    Missing parent directories are created as well; an already existing
+    directory is not an error.
+    """
+    upload_root = Path(settings.upload_dir)
+    upload_root.mkdir(parents=True, exist_ok=True)
+    return upload_root
+
+
+def save_upload(upload_file: UploadFile) -> Path:
+    """Store an uploaded file in the upload directory and return its path.
+
+    Only the final path component of the client-supplied filename is used,
+    so names such as ``../../x`` or ``/etc/x`` cannot place the file outside
+    the upload directory. An existing file with the same name is overwritten.
+
+    Args:
+        upload_file: The incoming upload; its ``filename`` and ``file``
+            attributes are read.
+
+    Returns:
+        Path of the file written inside the upload directory.
+
+    Raises:
+        ValueError: If the upload carries no usable filename.
+    """
+    raw_name = upload_file.filename or ""
+    # The name is untrusted input. Treat backslashes as separators too, so a
+    # Windows-style path is also reduced to its last component.
+    safe_name = Path(raw_name.replace("\\", "/")).name
+    if safe_name in ("", ".", ".."):
+        raise ValueError(f"无效的文件名: {raw_name!r}")
+
+    target_path = ensure_upload_dir() / safe_name
+    with target_path.open("wb") as buffer:
+        shutil.copyfileobj(upload_file.file, buffer)
+    return target_path
--- a/backend/services/parser.py
+++ b/backend/services/parser.py
@@ -0,0 +1,167 @@
+import json
+import re
+import subprocess
+from pathlib import Path
+
+import requests
+
+from backend.config import settings
+
+
+class DMXAPIService:
+    """Extract question dicts from a document by way of the DMXAPI endpoint.
+
+    Pipeline used by :meth:`parse_file`: convert the document to PDF with
+    pandoc when it is not already a PDF, upload the PDF to a temporary file
+    host to obtain a URL, send that URL plus an extraction instruction to the
+    configured API, then pull the JSON array of questions out of the reply.
+    """
+
+    def __init__(self) -> None:
+        """Read the API key, model name and endpoint URL from settings."""
+        self.api_key = settings.api_key
+        self.model_name = settings.model_name
+        self.api_url = settings.dmxapi_url
+
+    def parse_file(self, file_path: str) -> list[dict]:
+        """Run the full extraction pipeline on one file.
+
+        Args:
+            file_path: Path of the document to parse.
+
+        Returns:
+            The question dicts produced by :meth:`_extract_questions`.
+
+        Raises:
+            ValueError: If no API key is configured, or conversion, upload,
+                the API call or response parsing fails.
+        """
+        # Fail fast: without a key the API call cannot succeed, so do not
+        # convert or upload anything first.
+        self._require_api_key()
+
+        path = Path(file_path)
+        if path.suffix.lower() != ".pdf":
+            pdf_path = self._convert_to_pdf(path)
+        else:
+            pdf_path = path
+
+        try:
+            file_url = self._upload_to_temp_host(pdf_path)
+            return self._parse_with_file_url(file_url, path.name)
+        finally:
+            # Only remove the PDF we generated, never the caller's file.
+            if pdf_path != path:
+                pdf_path.unlink(missing_ok=True)
+
+    def _require_api_key(self) -> None:
+        """Raise ValueError when no API key is configured."""
+        if not self.api_key:
+            raise ValueError("未配置 API_KEY，无法调用 DMXAPI")
+
+    def _upload_to_temp_host(self, path: Path) -> str:
+        """Upload *path* to a temporary file host and return a download URL.
+
+        file.io is tried first on a best-effort basis; any network or
+        response-format problem there falls through to tmpfiles.org.
+
+        Raises:
+            ValueError: If the tmpfiles.org upload is rejected.
+        """
+        # NOTE(review): both hosts are third-party services outside this
+        # project; confirm that sending uploaded documents there is acceptable.
+        try:
+            with path.open("rb") as f:
+                response = requests.post("https://file.io", files={"file": f}, timeout=60)
+            if response.status_code == 200:
+                body = response.json()
+                if isinstance(body, dict) and body.get("success") and body.get("link"):
+                    return body["link"]
+        except (requests.RequestException, ValueError):
+            # Network failure or a non-JSON reply: use the fallback host.
+            pass
+
+        with path.open("rb") as f:
+            response = requests.post("https://tmpfiles.org/api/v1/upload", files={"file": f}, timeout=60)
+        if response.status_code != 200:
+            raise ValueError(f"上传失败: {response.text}")
+        data = response.json()
+        if data.get("status") != "success":
+            raise ValueError(f"上传失败: {data}")
+        # Rewrite the returned page URL into its "/dl/" form.
+        return data["data"]["url"].replace("tmpfiles.org/", "tmpfiles.org/dl/")
+
+    def _convert_to_pdf(self, path: Path) -> Path:
+        """Convert *path* to PDF with pandoc and return the new file's path.
+
+        xelatex is tried first, then weasyprint. The output goes to a freshly
+        created, uniquely named file next to the source, so an existing
+        ``<stem>.pdf`` in the same directory is never overwritten or deleted.
+
+        Raises:
+            ValueError: If both engines fail, time out, or pandoc cannot be
+                started. The temporary output file is removed in that case.
+        """
+        import tempfile  # local import: only this method needs it
+
+        with tempfile.NamedTemporaryFile(
+            suffix=".pdf", prefix="converted-", dir=path.parent, delete=False
+        ) as handle:
+            pdf_path = Path(handle.name)
+
+        engine_options = (
+            ["--pdf-engine=xelatex", "-V", "CJKmainfont=PingFang SC"],
+            ["--pdf-engine=weasyprint"],
+        )
+        detail = ""
+        cause = None
+        for options in engine_options:
+            cmd = ["pandoc", str(path), "-o", str(pdf_path), *options]
+            try:
+                result = subprocess.run(cmd, capture_output=True, text=True, timeout=90)
+            except (OSError, subprocess.TimeoutExpired) as exc:
+                # pandoc missing or hung: record it and try the next engine.
+                detail, cause = str(exc), exc
+                continue
+            if result.returncode == 0:
+                return pdf_path
+            detail, cause = result.stderr, None
+
+        pdf_path.unlink(missing_ok=True)
+        raise ValueError(f"文件转 PDF 失败: {detail}") from cause
+
+    def _parse_with_file_url(self, file_url: str, original_filename: str) -> list[dict]:
+        """Ask the API to extract questions from the file at *file_url*.
+
+        Args:
+            file_url: URL the API should fetch the PDF from.
+            original_filename: Name mentioned in the instruction text.
+
+        Raises:
+            ValueError: If no API key is configured, the HTTP status is not
+                200, or the response cannot be turned into questions.
+        """
+        self._require_api_key()
+
+        payload = {
+            "model": self.model_name,
+            "input": [
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "input_file", "file_url": file_url},
+                        {"type": "input_text", "text": self._build_instruction(original_filename)},
+                    ],
+                }
+            ],
+        }
+        headers = {
+            "Authorization": f"Bearer {self.api_key}",
+            "Content-Type": "application/json",
+        }
+        response = requests.post(
+            self.api_url, headers=headers, data=json.dumps(payload), timeout=180
+        )
+        if response.status_code != 200:
+            raise ValueError(f"DMXAPI 请求失败: {response.status_code} {response.text}")
+        return self._extract_questions(response.json())
+
+    def _build_instruction(self, filename: str) -> str:
+        """Return the extraction prompt sent alongside the file."""
+        return f"""请从文件\"{filename}\"中提取所有题目信息，并以JSON数组格式返回。
+
+提取字段：题干、选项A、选项B、选项C、选项D、正确答案、解析、备注。
+数学公式统一转为 LaTeX，内联公式使用 $...$，独立公式使用 $$...$$。
+答案统一大写字母，清理“答案:”等前缀。
+仅返回 JSON 数组，不要返回额外说明。"""
+
+    def _extract_questions(self, response: dict) -> list[dict]:
+        """Pull the question list out of a decoded API response.
+
+        The response must have ``status == "completed"``. The text is taken
+        from the first ``output_text`` entry of a ``message`` item; when
+        several message items carry one, the last such item wins.
+
+        Returns:
+            Parsed questions, each guaranteed to have the option, explanation
+            and notes fields (defaulting to ``""``).
+
+        Raises:
+            ValueError: On an unexpected status, missing text, or text that
+                does not contain a valid question array.
+        """
+        if response.get("status") != "completed":
+            raise ValueError(f"响应状态异常: {response.get('status')}")
+        text = None
+        for item in response.get("output", []):
+            if item.get("type") != "message":
+                continue
+            for content in item.get("content", []):
+                if content.get("type") == "output_text":
+                    text = content.get("text")
+                    break  # leaves only this message; later messages still override
+        if not text:
+            raise ValueError("未在响应中找到文本内容")
+        questions = self._parse_json(text)
+        for q in questions:
+            for field in ["选项A", "选项B", "选项C", "选项D", "解析", "备注"]:
+                q.setdefault(field, "")
+        return questions
+
+    def _parse_json(self, text: str) -> list[dict]:
+        """Parse the JSON array embedded in *text* and normalise answers.
+
+        Everything from the first ``[`` to the last ``]`` is decoded. Every
+        element must be an object with both "题干" and "正确答案"; the answer
+        is upper-cased and stripped of every character other than A-D and
+        digits.
+
+        Raises:
+            ValueError: If no array is found, decoding fails, or an element
+                is not an object with the required fields.
+        """
+        start_idx = text.find("[")
+        end_idx = text.rfind("]")
+        # Also rejects a "]" that appears only before the first "[".
+        if start_idx < 0 or end_idx < start_idx:
+            raise ValueError(f"未找到 JSON 数组: {text[:200]}")
+        data = json.loads(text[start_idx : end_idx + 1])
+        if not isinstance(data, list):
+            raise ValueError("解析结果不是数组")
+        for index, item in enumerate(data, start=1):
+            if not isinstance(item, dict) or "题干" not in item or "正确答案" not in item:
+                raise ValueError(f"第 {index} 题缺少题干或正确答案")
+            # NOTE(review): answers with no A-D letter or digit become ""
+            # here; confirm that is intended for non-choice question types.
+            item["正确答案"] = re.sub(r"[^A-D0-9]", "", str(item["正确答案"]).upper())
+        return data
+
+
+def extract_metadata(filename: str) -> dict:
+    """Derive question metadata from a file name.
+
+    The name without its extension is split on the first of "+", " ", "-",
+    "_" (checked in that order) that occurs in it. The first three pieces are
+    read as secondary knowledge point, question type and difficulty; missing
+    pieces become "". The type is looked up in ``settings.type_map`` and kept
+    as-is when there is no mapping. The chapter is the part of the secondary
+    knowledge point before the first "的", or "未分类" when that is empty.
+    """
+    stem = Path(filename).stem
+
+    # Pick the highest-priority delimiter that actually appears in the stem.
+    delimiter = next((sep for sep in ("+", " ", "-", "_") if sep in stem), None)
+    pieces = [stem] if delimiter is None else stem.split(delimiter)
+
+    # Take up to three stripped pieces and pad the rest with empty strings.
+    fields = [piece.strip() for piece in pieces[:3]]
+    fields.extend([""] * (3 - len(fields)))
+    secondary, raw_type, difficulty = fields
+
+    # partition() returns the whole string when "的" is absent.
+    chapter = secondary.partition("的")[0] or "未分类"
+
+    return {
+        "chapter": chapter,
+        "secondary_knowledge": secondary,
+        "question_type": settings.type_map.get(raw_type, raw_type),
+        "difficulty": difficulty,
+        "source_file": filename,
+    }