first commit
This commit is contained in:
167
backend/services/parser.py
Normal file
167
backend/services/parser.py
Normal file
@@ -0,0 +1,167 @@
|
||||
import json
|
||||
import re
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
import requests
|
||||
|
||||
from backend.config import settings
|
||||
|
||||
|
||||
class DMXAPIService:
    """Extract structured question records from a document through DMXAPI.

    The pipeline is: convert the input to PDF when it is not one already,
    upload the PDF to a temporary public file host, ask the model to read the
    file by URL, and parse the JSON array found in the model's text reply.
    """

    def __init__(self) -> None:
        # API key, model name and endpoint URL are all read from the
        # project-level ``settings`` object.
        self.api_key = settings.api_key
        self.model_name = settings.model_name
        self.api_url = settings.dmxapi_url

    def parse_file(self, file_path: str) -> list[dict]:
        """Parse the document at *file_path* into a list of question dicts.

        Non-PDF inputs are converted to a temporary PDF first; that temporary
        file is always removed afterwards, whether parsing succeeded or not.

        Raises:
            ValueError: when conversion, upload, the API call or the parsing
                of the model reply fails.
        """
        path = Path(file_path)
        already_pdf = path.suffix.lower() == ".pdf"
        pdf_path = path if already_pdf else self._convert_to_pdf(path)

        try:
            file_url = self._upload_to_temp_host(pdf_path)
            # The model is told the ORIGINAL file name, not the PDF's name.
            return self._parse_with_file_url(file_url, path.name)
        finally:
            # Only delete files this method created, never the caller's input.
            if pdf_path != path:
                pdf_path.unlink(missing_ok=True)

    def _upload_to_temp_host(self, path: Path) -> str:
        """Upload *path* to a temporary file host and return a download URL.

        file.io is tried first on a best-effort basis; any network or decoding
        problem there falls through to tmpfiles.org, whose failures are fatal.

        Raises:
            ValueError: when the tmpfiles.org upload is rejected or returns an
                unexpected payload.
        """
        try:
            with path.open("rb") as f:
                response = requests.post("https://file.io", files={"file": f}, timeout=60)
            if response.status_code == 200:
                data = response.json()
                if isinstance(data, dict) and data.get("success") and data.get("link"):
                    return data["link"]
        except (requests.RequestException, ValueError):
            # Best-effort primary host: network errors and non-JSON bodies
            # simply mean "use the secondary host below".
            pass

        with path.open("rb") as f:
            response = requests.post("https://tmpfiles.org/api/v1/upload", files={"file": f}, timeout=60)
        if response.status_code != 200:
            raise ValueError(f"上传失败: {response.text}")

        data = response.json()
        if not isinstance(data, dict) or data.get("status") != "success":
            raise ValueError(f"上传失败: {data}")
        try:
            url = data["data"]["url"]
        except (KeyError, TypeError) as err:
            # Report a malformed success payload the same way as any other
            # upload failure instead of leaking a KeyError to the caller.
            raise ValueError(f"上传失败: {data}") from err

        # Rewrite the returned page URL into its "/dl/" form; only the host
        # prefix (first occurrence) is rewritten.
        return url.replace("tmpfiles.org/", "tmpfiles.org/dl/", 1)

    def _convert_to_pdf(self, path: Path) -> Path:
        """Convert *path* to PDF with pandoc and return the PDF's location.

        The output goes to a freshly created temporary file rather than next
        to the input, so an existing ``<stem>.pdf`` beside the input is never
        overwritten or deleted. xelatex (with a CJK font) is tried first and
        weasyprint second.

        Raises:
            ValueError: when both pandoc invocations exit with a non-zero code.
        """
        import tempfile  # local import keeps this block self-contained

        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
            pdf_path = Path(tmp.name)

        base_cmd = ["pandoc", str(path), "-o", str(pdf_path)]
        attempts = [
            [*base_cmd, "--pdf-engine=xelatex", "-V", "CJKmainfont=PingFang SC"],
            [*base_cmd, "--pdf-engine=weasyprint"],
        ]
        try:
            for cmd in attempts:
                result = subprocess.run(cmd, capture_output=True, text=True, timeout=90)
                if result.returncode == 0:
                    return pdf_path
            raise ValueError(f"文件转 PDF 失败: {result.stderr}")
        except BaseException:
            # Do not leave the placeholder/partial output behind on failure.
            pdf_path.unlink(missing_ok=True)
            raise

    def _parse_with_file_url(self, file_url: str, original_filename: str) -> list[dict]:
        """Send the uploaded file's URL to the API and return parsed questions.

        Raises:
            ValueError: when no API key is configured, the HTTP status is not
                200, or the reply cannot be turned into question dicts.
        """
        if not self.api_key:
            raise ValueError("未配置 API_KEY,无法调用 DMXAPI")

        payload = {
            "model": self.model_name,
            "input": [
                {
                    "role": "user",
                    "content": [
                        {"type": "input_file", "file_url": file_url},
                        {"type": "input_text", "text": self._build_instruction(original_filename)},
                    ],
                }
            ],
        }
        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
        }
        response = requests.post(
            self.api_url, headers=headers, data=json.dumps(payload), timeout=180
        )
        if response.status_code != 200:
            raise ValueError(f"DMXAPI 请求失败: {response.status_code} {response.text}")
        return self._extract_questions(response.json())

    def _build_instruction(self, filename: str) -> str:
        """Return the extraction prompt sent to the model for *filename*."""
        # Built from explicit lines so the prompt text does not depend on how
        # this source file happens to be indented.
        lines = [
            f'请从文件"{filename}"中提取所有题目信息,并以JSON数组格式返回。',
            "",
            "提取字段:题干、选项A、选项B、选项C、选项D、正确答案、解析、备注。",
            "数学公式统一转为 LaTeX,内联公式使用 $...$,独立公式使用 $$...$$。",
            "答案统一大写字母,清理“答案:”等前缀。",
            "仅返回 JSON 数组,不要返回额外说明。",
        ]
        return "\n".join(lines)

    def _extract_questions(self, response: dict) -> list[dict]:
        """Pull the model's text out of *response* and parse it into questions.

        The first non-empty ``output_text`` part of the first ``message`` item
        that has one is used. Optional fields missing from a question are
        filled with empty strings.

        Raises:
            ValueError: when the status is not ``"completed"``, no text is
                present, or the text does not contain a valid question array.
        """
        if response.get("status") != "completed":
            raise ValueError(f"响应状态异常: {response.get('status')}")

        text = next(
            (
                part.get("text")
                for item in response.get("output") or []
                if item.get("type") == "message"
                for part in item.get("content") or []
                if part.get("type") == "output_text" and part.get("text")
            ),
            None,
        )
        if not text:
            raise ValueError("未在响应中找到文本内容")

        questions = self._parse_json(text)
        for q in questions:
            for field in ["选项A", "选项B", "选项C", "选项D", "解析", "备注"]:
                q.setdefault(field, "")
        return questions

    def _parse_json(self, text: str) -> list[dict]:
        """Parse the outermost ``[...]`` span of *text* as a list of questions.

        Each item must be an object containing both "题干" and "正确答案"; the
        answer is normalised in place (see ``_normalize_answer``).

        Raises:
            ValueError: when no array is found, the JSON is invalid
                (``json.JSONDecodeError`` is a ``ValueError``), or an item is
                not an object with the two required keys.
        """
        start_idx = text.find("[")
        end_idx = text.rfind("]")
        # ``end_idx < start_idx`` also covers the "no closing bracket" case,
        # because ``rfind`` returns -1 then.
        if start_idx < 0 or end_idx < start_idx:
            raise ValueError(f"未找到 JSON 数组: {text[:200]}")

        data = json.loads(text[start_idx : end_idx + 1])
        if not isinstance(data, list):
            raise ValueError("解析结果不是数组")

        for index, item in enumerate(data, start=1):
            if not isinstance(item, dict) or "题干" not in item or "正确答案" not in item:
                raise ValueError(f"第 {index} 题缺少题干或正确答案")
            item["正确答案"] = self._normalize_answer(item["正确答案"])
        return data

    @staticmethod
    def _normalize_answer(raw: object) -> str:
        """Reduce *raw* to upper-case option letters A-D and digits only.

        A leading label such as "答案:", "正确答案:" or "Answer:" is removed
        first so that letters inside the label (the "A" of "ANSWER") are not
        mistaken for a selected option.
        """
        upper = str(raw).upper()
        without_label = re.sub(r"^\s*(?:正确)?(?:答案|ANSWER)\s*[::]?\s*", "", upper)
        return re.sub(r"[^A-D0-9]", "", without_label)
|
||||
|
||||
|
||||
def extract_metadata(filename: str) -> dict:
    """Derive question metadata from the parts of a file name.

    The file stem is split on the first separator, in the priority order
    "+", space, "-", "_", that occurs in it. The first three pieces (each
    stripped of surrounding whitespace) are read as secondary knowledge
    point, raw question type and difficulty; missing pieces become "".

    The raw type is looked up in ``settings.type_map`` and kept unchanged when
    there is no mapping. The chapter is the part of the secondary knowledge
    point before the first "的", or "未分类" when that part is empty.

    Returns:
        A dict with the keys "chapter", "secondary_knowledge",
        "question_type", "difficulty" and "source_file" (the *filename*
        argument as given).
    """
    stem = Path(filename).stem

    # Only the highest-priority separator present in the stem is used.
    delimiter = next(
        (candidate for candidate in ("+", " ", "-", "_") if candidate in stem),
        None,
    )
    pieces = [stem] if delimiter is None else stem.split(delimiter)

    # Keep at most three pieces and pad the rest with empty strings.
    fields = [piece.strip() for piece in pieces[:3]]
    fields += [""] * (3 - len(fields))
    knowledge, type_label, level = fields

    return {
        # partition() yields the whole string when "的" is absent.
        "chapter": knowledge.partition("的")[0] or "未分类",
        "secondary_knowledge": knowledge,
        "question_type": settings.type_map.get(type_label, type_label),
        "difficulty": level,
        "source_file": filename,
    }
|
||||
Reference in New Issue
Block a user