first commit

2026-03-05 11:50:15 +08:00
commit b1b14fd964
45 changed files with 7779 additions and 0 deletions
--- a/backend/services/parser.py
+++ b/backend/services/parser.py
@@ -0,0 +1,167 @@
+import json
+import logging
+import re
+import subprocess
+import tempfile
+from pathlib import Path
+
+import requests
+
+from backend.config import settings
+
+
+class DMXAPIService:
+    def __init__(self) -> None:
+        self.api_key = settings.api_key
+        self.model_name = settings.model_name
+        self.api_url = settings.dmxapi_url
+
+    def parse_file(self, file_path: str) -> list[dict]:
+        path = Path(file_path)
+        if path.suffix.lower() != ".pdf":
+            pdf_path = self._convert_to_pdf(path)
+        else:
+            pdf_path = path
+
+        try:
+            file_url = self._upload_to_temp_host(pdf_path)
+            questions = self._parse_with_file_url(file_url, path.name)
+        finally:
+            if pdf_path != path and pdf_path.exists():
+                pdf_path.unlink(missing_ok=True)
+
+        return questions
+
+    def _upload_to_temp_host(self, path: Path) -> str:
+        try:
+            with path.open("rb") as f:
+                response = requests.post("https://file.io", files={"file": f}, timeout=60)
+            if response.status_code == 200 and response.json().get("success"):
+                return response.json()["link"]
+        except Exception:
+            pass
+
+        with path.open("rb") as f:
+            response = requests.post("https://tmpfiles.org/api/v1/upload", files={"file": f}, timeout=60)
+        if response.status_code != 200:
+            raise ValueError(f"上传失败: {response.text}")
+        data = response.json()
+        if data.get("status") != "success":
+            raise ValueError(f"上传失败: {data}")
+        return data["data"]["url"].replace("tmpfiles.org/", "tmpfiles.org/dl/")
+
+    def _convert_to_pdf(self, path: Path) -> Path:
+        pdf_path = path.with_suffix(".pdf")
+        pdf_path.unlink(missing_ok=True)
+
+        cmd = [
+            "pandoc",
+            str(path),
+            "-o",
+            str(pdf_path),
+            "--pdf-engine=xelatex",
+            "-V",
+            "CJKmainfont=PingFang SC",
+        ]
+        result = subprocess.run(cmd, capture_output=True, text=True, timeout=90)
+        if result.returncode != 0:
+            fallback = [
+                "pandoc",
+                str(path),
+                "-o",
+                str(pdf_path),
+                "--pdf-engine=weasyprint",
+            ]
+            result = subprocess.run(fallback, capture_output=True, text=True, timeout=90)
+            if result.returncode != 0:
+                raise ValueError(f"文件转 PDF 失败: {result.stderr}")
+        return pdf_path
+
+    def _parse_with_file_url(self, file_url: str, original_filename: str) -> list[dict]:
+        if not self.api_key:
+            raise ValueError("未配置 API_KEY，无法调用 DMXAPI")
+
+        payload = {
+            "model": self.model_name,
+            "input": [
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "input_file", "file_url": file_url},
+                        {"type": "input_text", "text": self._build_instruction(original_filename)},
+                    ],
+                }
+            ],
+        }
+        headers = {
+            "Authorization": f"Bearer {self.api_key}",
+            "Content-Type": "application/json",
+        }
+        response = requests.post(
+            self.api_url, headers=headers, data=json.dumps(payload), timeout=180
+        )
+        if response.status_code != 200:
+            raise ValueError(f"DMXAPI 请求失败: {response.status_code} {response.text}")
+        return self._extract_questions(response.json())
+
+    def _build_instruction(self, filename: str) -> str:
+        return f"""请从文件\"{filename}\"中提取所有题目信息，并以JSON数组格式返回。
+
+提取字段：题干、选项A、选项B、选项C、选项D、正确答案、解析、备注。
+数学公式统一转为 LaTeX，内联公式使用 $...$，独立公式使用 $$...$$。
+答案统一大写字母，清理“答案:”等前缀。
+仅返回 JSON 数组，不要返回额外说明。"""
+
+    def _extract_questions(self, response: dict) -> list[dict]:
+        if response.get("status") != "completed":
+            raise ValueError(f"响应状态异常: {response.get('status')}")
+        text = None
+        for item in response.get("output", []):
+            if item.get("type") == "message":
+                for content in item.get("content", []):
+                    if content.get("type") == "output_text":
+                        text = content.get("text")
+                        break
+        if not text:
+            raise ValueError("未在响应中找到文本内容")
+        questions = self._parse_json(text)
+        for q in questions:
+            for field in ["选项A", "选项B", "选项C", "选项D", "解析", "备注"]:
+                q.setdefault(field, "")
+        return questions
+
+    def _parse_json(self, text: str) -> list[dict]:
+        start_idx = text.find("[")
+        end_idx = text.rfind("]")
+        if start_idx < 0 or end_idx < 0:
+            raise ValueError(f"未找到 JSON 数组: {text[:200]}")
+        json_str = text[start_idx : end_idx + 1]
+        data = json.loads(json_str)
+        if not isinstance(data, list):
+            raise ValueError("解析结果不是数组")
+        for index, item in enumerate(data):
+            if "题干" not in item or "正确答案" not in item:
+                raise ValueError(f"第 {index + 1} 题缺少题干或正确答案")
+            item["正确答案"] = re.sub(r"[^A-D0-9]", "", str(item.get("正确答案", "")).upper())
+        return data
+
+
+def extract_metadata(filename: str) -> dict:
+    basename = Path(filename).stem
+    separators = ["+", " ", "-", "_"]
+    parts = [basename]
+    for sep in separators:
+        if sep in basename:
+            parts = basename.split(sep)
+            break
+    secondary = parts[0].strip() if len(parts) > 0 else ""
+    raw_type = parts[1].strip() if len(parts) > 1 else ""
+    difficulty = parts[2].strip() if len(parts) > 2 else ""
+    mapped_type = settings.type_map.get(raw_type, raw_type)
+    chapter = secondary.split("的")[0] if "的" in secondary else secondary
+    chapter = chapter or "未分类"
+    return {
+        "chapter": chapter,
+        "secondary_knowledge": secondary,
+        "question_type": mapped_type,
+        "difficulty": difficulty,
+        "source_file": filename,
+    }