Files
problem-bank/backend/services/parser.py
2026-03-05 11:50:15 +08:00

168 lines
6.2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import json
import re
import subprocess
from pathlib import Path
import requests
from backend.config import settings
class DMXAPIService:
    """Turn a document file into a list of question records via DMXAPI.

    The pipeline implemented by :meth:`parse_file` is: convert the file to
    PDF with pandoc when it is not already a PDF, upload the PDF to a
    temporary file host, send the resulting URL to the configured API
    endpoint, and parse the JSON array found in the model's text output.
    """

    def __init__(self) -> None:
        """Read the API key, model name and endpoint URL from ``settings``."""
        self.api_key = settings.api_key
        self.model_name = settings.model_name
        self.api_url = settings.dmxapi_url

    def parse_file(self, file_path: str) -> list[dict]:
        """Extract question records from the document at *file_path*.

        Args:
            file_path: Path of the source document. Anything whose suffix is
                not ``.pdf`` (case-insensitive) is converted to PDF first.

        Returns:
            The question dicts produced by :meth:`_extract_questions`.

        Raises:
            ValueError: If conversion, upload, the API call or JSON parsing
                fails.
        """
        path = Path(file_path)
        is_pdf = path.suffix.lower() == ".pdf"
        pdf_path = path if is_pdf else self._convert_to_pdf(path)
        try:
            file_url = self._upload_to_temp_host(pdf_path)
            return self._parse_with_file_url(file_url, path.name)
        finally:
            # Only remove the PDF we generated ourselves, never the caller's
            # original file.
            if pdf_path != path:
                pdf_path.unlink(missing_ok=True)

    def _upload_to_temp_host(self, path: Path) -> str:
        """Upload *path* to a temporary file host and return a download URL.

        ``file.io`` is tried first; on any failure the upload is retried
        against ``tmpfiles.org``.

        NOTE(review): this sends the document to third-party hosts; confirm
        that is acceptable for the content being processed.

        Raises:
            ValueError: If the fallback host rejects the upload.
        """
        try:
            with path.open("rb") as f:
                response = requests.post("https://file.io", files={"file": f}, timeout=60)
            if response.status_code == 200:
                body = response.json()
                if isinstance(body, dict) and body.get("success"):
                    return body["link"]
        except (requests.RequestException, ValueError, KeyError):
            # The primary host is best-effort: network errors, a non-JSON
            # body or a missing "link" key all fall through to the fallback.
            pass

        with path.open("rb") as f:
            response = requests.post("https://tmpfiles.org/api/v1/upload", files={"file": f}, timeout=60)
        if response.status_code != 200:
            raise ValueError(f"上传失败: {response.text}")
        data = response.json()
        if data.get("status") != "success":
            raise ValueError(f"上传失败: {data}")
        # Rewrite the returned page URL into its "/dl/" form.
        return data["data"]["url"].replace("tmpfiles.org/", "tmpfiles.org/dl/")

    def _convert_to_pdf(self, path: Path) -> Path:
        """Convert *path* to a sibling ``.pdf`` file using pandoc.

        The xelatex engine (with a CJK main font) is tried first and
        weasyprint second. A timeout counts as a failed attempt, so the
        second engine still gets a chance when the first one hangs.

        NOTE(review): any existing file at ``path.with_suffix(".pdf")`` is
        deleted before conversion; confirm inputs never sit next to a PDF of
        the same stem that must be kept.

        Returns:
            Path of the generated PDF.

        Raises:
            ValueError: If every engine fails or times out.
        """
        pdf_path = path.with_suffix(".pdf")
        pdf_path.unlink(missing_ok=True)

        base_cmd = ["pandoc", str(path), "-o", str(pdf_path)]
        engine_args = (
            ["--pdf-engine=xelatex", "-V", "CJKmainfont=PingFang SC"],
            ["--pdf-engine=weasyprint"],
        )

        detail = ""
        for extra in engine_args:
            try:
                result = subprocess.run(
                    base_cmd + extra, capture_output=True, text=True, timeout=90
                )
            except subprocess.TimeoutExpired as exc:
                detail = str(exc)
                continue
            if result.returncode == 0:
                return pdf_path
            detail = result.stderr

        # Do not leave a partially written PDF behind.
        pdf_path.unlink(missing_ok=True)
        raise ValueError(f"文件转 PDF 失败: {detail}")

    def _parse_with_file_url(self, file_url: str, original_filename: str) -> list[dict]:
        """POST the file URL plus the extraction prompt to the API endpoint.

        Args:
            file_url: URL of the uploaded PDF.
            original_filename: Name of the source file, quoted in the prompt.

        Returns:
            The question dicts extracted from the API response.

        Raises:
            ValueError: If no API key is configured or the HTTP status is
                not 200.
        """
        if not self.api_key:
            raise ValueError("未配置 API_KEY无法调用 DMXAPI")

        payload = {
            "model": self.model_name,
            "input": [
                {
                    "role": "user",
                    "content": [
                        {"type": "input_file", "file_url": file_url},
                        {"type": "input_text", "text": self._build_instruction(original_filename)},
                    ],
                }
            ],
        }
        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
        }
        response = requests.post(
            self.api_url, headers=headers, data=json.dumps(payload), timeout=180
        )
        if response.status_code != 200:
            raise ValueError(f"DMXAPI 请求失败: {response.status_code} {response.text}")
        return self._extract_questions(response.json())

    def _build_instruction(self, filename: str) -> str:
        """Return the extraction prompt sent to the model for *filename*."""
        lines = [
            f'请从文件"{filename}"中提取所有题目信息并以JSON数组格式返回。',
            "提取字段题干、选项A、选项B、选项C、选项D、正确答案、解析、备注。",
            "数学公式统一转为 LaTeX内联公式使用 $...$,独立公式使用 $$...$$。",
            "答案统一大写字母,清理“答案:”等前缀。",
            "仅返回 JSON 数组,不要返回额外说明。",
        ]
        return "\n".join(lines)

    @staticmethod
    def _first_output_text(response: dict):
        """Return the first non-empty ``output_text`` in *response*, or None."""
        for item in response.get("output", []):
            if item.get("type") != "message":
                continue
            for part in item.get("content", []):
                if part.get("type") == "output_text" and part.get("text"):
                    return part.get("text")
        return None

    def _extract_questions(self, response: dict) -> list[dict]:
        """Pull the question list out of an API response body.

        The first non-empty ``output_text`` part of a ``message`` output item
        is used; later items cannot overwrite it.

        Raises:
            ValueError: If the status is not ``"completed"``, no text is
                present, or the text does not contain a valid question array.
        """
        status = response.get("status")
        if status != "completed":
            raise ValueError(f"响应状态异常: {status}")

        text = self._first_output_text(response)
        if not text:
            raise ValueError("未在响应中找到文本内容")

        questions = self._parse_json(text)
        # Optional fields default to an empty string so every record has the
        # same set of keys.
        for question in questions:
            for field in ("选项A", "选项B", "选项C", "选项D", "解析", "备注"):
                question.setdefault(field, "")
        return questions

    def _parse_json(self, text: str) -> list[dict]:
        """Parse the JSON array embedded in *text* and normalise answers.

        Everything from the first ``[`` to the last ``]`` is decoded, which
        tolerates surrounding prose or code fences. Each element must be an
        object containing ``题干`` and ``正确答案``; the answer is upper-cased
        and stripped of every character outside ``A-D`` and ``0-9``.

        Raises:
            ValueError: If no array is found, the JSON is invalid
                (``json.JSONDecodeError`` is a ``ValueError``), or an element
                is not an object with the required keys.
        """
        start_idx = text.find("[")
        end_idx = text.rfind("]")
        # end_idx < start_idx also covers "no closing bracket" (-1).
        if start_idx < 0 or end_idx < start_idx:
            raise ValueError(f"未找到 JSON 数组: {text[:200]}")

        data = json.loads(text[start_idx : end_idx + 1])
        if not isinstance(data, list):
            raise ValueError("解析结果不是数组")

        for index, item in enumerate(data):
            # Reject non-dict elements here; the key checks below would
            # otherwise run as substring tests on a str element.
            if not isinstance(item, dict) or "题干" not in item or "正确答案" not in item:
                raise ValueError(f"{index + 1} 题缺少题干或正确答案")
            item["正确答案"] = re.sub(r"[^A-D0-9]", "", str(item.get("正确答案", "")).upper())
        return data
def extract_metadata(
    filename: str,
    *,
    chapter_separator: str = "",
    type_map: "dict[str, str] | None" = None,
) -> dict:
    """Derive question metadata from the parts of a file name.

    The file stem is split on the first of ``+``, space, ``-`` or ``_`` that
    occurs in it. The resulting fields are read positionally as secondary
    knowledge point, question type and difficulty; missing fields become
    empty strings.

    Args:
        filename: File name or path; only its stem is inspected.
        chapter_separator: Delimiter between the chapter and the rest of the
            first field. The default is empty, which means the whole first
            field is used as the chapter.
            NOTE(review): the delimiter literal in this function was an empty
            string, which made ``str.split`` raise ``ValueError`` for every
            input; it looks like a lost non-ASCII character. Pass the real
            delimiter here (or restore the default) once it is known.
        type_map: Mapping from the raw type text to the stored type. When
            ``None``, ``settings.type_map`` is used. Unknown types are kept
            as-is.

    Returns:
        A dict with the keys ``chapter``, ``secondary_knowledge``,
        ``question_type``, ``difficulty`` and ``source_file``.
    """
    stem = Path(filename).stem

    parts = [stem]
    for sep in ("+", " ", "-", "_"):
        if sep in stem:
            parts = stem.split(sep)
            break

    # Pad to exactly three positional fields so short names need no
    # per-field length checks.
    fields = [part.strip() for part in parts[:3]]
    fields += [""] * (3 - len(fields))
    secondary, raw_type, difficulty = fields

    if type_map is None:
        type_map = settings.type_map
    mapped_type = type_map.get(raw_type, raw_type)

    # str.split rejects an empty separator, so only split when one is set.
    if chapter_separator:
        chapter = secondary.split(chapter_separator)[0]
    else:
        chapter = secondary
    chapter = chapter or "未分类"

    return {
        "chapter": chapter,
        "secondary_knowledge": secondary,
        "question_type": mapped_type,
        "difficulty": difficulty,
        "source_file": filename,
    }