168 lines
6.2 KiB
Python
168 lines
6.2 KiB
Python
import json
import logging
import re
import subprocess
import tempfile
from pathlib import Path

import requests

from backend.config import settings
|
||
|
||
|
||
class DMXAPIService:
    """Extract exam questions from a document through the DMXAPI endpoint.

    Pipeline: convert the input to PDF when it is not one already (using
    ``pandoc``), upload the PDF to a temporary file host to obtain a URL,
    then ask the model to read that URL and return the questions as a JSON
    array, which is validated and normalised here.

    All failures detected by this class are reported as ``ValueError``.
    """

    _logger = logging.getLogger(__name__)

    # Keys that every returned question is guaranteed to carry.
    _OPTIONAL_FIELDS = ("选项A", "选项B", "选项C", "选项D", "解析", "备注")

    # Any run of ASCII letters containing a letter outside A-D is a word
    # ("ANSWER", "THE", "FALSE"), not an answer. It has to be removed as a
    # whole; otherwise its A-D letters would leak into the answer
    # ("Answer: B" would become "AB").
    _ANSWER_WORD_RE = re.compile(r"[A-Z]*[E-Z][A-Z]*")
    _ANSWER_NOISE_RE = re.compile(r"[^A-D0-9]")

    # Seconds granted to each pandoc invocation.
    _CONVERT_TIMEOUT = 90

    def __init__(self) -> None:
        """Read the API key, model name and endpoint URL from ``settings``."""
        self.api_key = settings.api_key
        self.model_name = settings.model_name
        self.api_url = settings.dmxapi_url

    def parse_file(self, file_path: str) -> list[dict]:
        """Parse a document and return its questions.

        Args:
            file_path: Path of the document. Anything whose suffix is not
                ``.pdf`` (case-insensitive) is converted to PDF first.

        Returns:
            A list of question dicts (see ``_extract_questions``).

        Raises:
            ValueError: The API key is missing, or conversion, upload or the
                model request failed.
        """
        # Fail before converting/uploading anything: without a key the
        # upload to a third-party host would be pointless.
        self._require_api_key()

        path = Path(file_path)
        needs_conversion = path.suffix.lower() != ".pdf"
        pdf_path = self._convert_to_pdf(path) if needs_conversion else path

        try:
            file_url = self._upload_to_temp_host(pdf_path)
            return self._parse_with_file_url(file_url, path.name)
        finally:
            # Only the temporary conversion result is removed, never the
            # caller's own file.
            if needs_conversion:
                pdf_path.unlink(missing_ok=True)

    def _require_api_key(self) -> None:
        """Raise ``ValueError`` when no API key is configured."""
        if not self.api_key:
            raise ValueError("未配置 API_KEY,无法调用 DMXAPI")

    def _upload_to_temp_host(self, path: Path) -> str:
        """Upload ``path`` to a temporary file host and return a download URL.

        file.io is tried first on a best-effort basis; tmpfiles.org is the
        fallback whose failure is reported to the caller.

        Raises:
            ValueError: The fallback host rejected the upload.
        """
        # NOTE(review): both hosts are public third-party services, so the
        # document leaves the machine here - confirm this is acceptable for
        # the data being processed.
        try:
            with path.open("rb") as f:
                response = requests.post("https://file.io", files={"file": f}, timeout=60)
            if response.status_code == 200:
                primary = response.json()
                if primary.get("success"):
                    return primary["link"]
        except Exception as exc:
            # Deliberate best-effort boundary: any problem with the primary
            # host just means "use the fallback", but it is no longer silent.
            self._logger.warning("file.io upload failed, falling back to tmpfiles.org: %s", exc)

        with path.open("rb") as f:
            response = requests.post("https://tmpfiles.org/api/v1/upload", files={"file": f}, timeout=60)
        if response.status_code != 200:
            raise ValueError(f"上传失败: {response.text}")
        data = response.json()
        if data.get("status") != "success":
            raise ValueError(f"上传失败: {data}")
        # Rewrite the returned page URL into its "/dl/" form; only the host
        # part (first occurrence) is touched.
        return data["data"]["url"].replace("tmpfiles.org/", "tmpfiles.org/dl/", 1)

    def _convert_to_pdf(self, path: Path) -> Path:
        """Convert ``path`` to PDF with pandoc and return the new file's path.

        The PDF is written to a fresh temporary file instead of next to the
        source, so an existing ``<stem>.pdf`` beside the input is never
        overwritten or deleted. The caller owns the returned file and is
        responsible for removing it.

        xelatex (with a CJK font) is tried first, weasyprint second; a
        timeout of the first engine still lets the second one run.

        Raises:
            ValueError: pandoc could not be started, or both engines failed.
        """
        with tempfile.NamedTemporaryFile(
            prefix=f"{path.stem}_", suffix=".pdf", delete=False
        ) as tmp:
            pdf_path = Path(tmp.name)

        base_cmd = ["pandoc", str(path), "-o", str(pdf_path)]
        engine_args = (
            ["--pdf-engine=xelatex", "-V", "CJKmainfont=PingFang SC"],
            ["--pdf-engine=weasyprint"],
        )

        last_error = ""
        for extra in engine_args:
            try:
                result = subprocess.run(
                    base_cmd + extra,
                    capture_output=True,
                    text=True,
                    timeout=self._CONVERT_TIMEOUT,
                )
            except subprocess.TimeoutExpired:
                last_error = f"pandoc timed out after {self._CONVERT_TIMEOUT}s"
                continue
            except OSError as exc:
                # pandoc itself cannot be executed; trying another engine
                # would fail the same way.
                pdf_path.unlink(missing_ok=True)
                raise ValueError(f"文件转 PDF 失败: {exc}") from exc
            if result.returncode == 0:
                return pdf_path
            last_error = result.stderr

        pdf_path.unlink(missing_ok=True)
        raise ValueError(f"文件转 PDF 失败: {last_error}")

    def _parse_with_file_url(self, file_url: str, original_filename: str) -> list[dict]:
        """Ask the model to read ``file_url`` and return the parsed questions.

        Args:
            file_url: URL the model endpoint can download the PDF from.
            original_filename: Name quoted in the prompt text.

        Raises:
            ValueError: The API key is missing, the endpoint did not answer
                with HTTP 200, or the answer could not be parsed.
        """
        self._require_api_key()

        payload = {
            "model": self.model_name,
            "input": [
                {
                    "role": "user",
                    "content": [
                        {"type": "input_file", "file_url": file_url},
                        {"type": "input_text", "text": self._build_instruction(original_filename)},
                    ],
                }
            ],
        }
        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
        }
        response = requests.post(
            self.api_url, headers=headers, data=json.dumps(payload), timeout=180
        )
        if response.status_code != 200:
            raise ValueError(f"DMXAPI 请求失败: {response.status_code} {response.text}")
        return self._extract_questions(response.json())

    def _build_instruction(self, filename: str) -> str:
        """Return the (Chinese) extraction prompt that names ``filename``."""
        # Continuation lines stay flush-left so that no indentation ends up
        # inside the prompt text.
        return f"""请从文件\"{filename}\"中提取所有题目信息,并以JSON数组格式返回。

提取字段:题干、选项A、选项B、选项C、选项D、正确答案、解析、备注。
数学公式统一转为 LaTeX,内联公式使用 $...$,独立公式使用 $$...$$。
答案统一大写字母,清理“答案:”等前缀。
仅返回 JSON 数组,不要返回额外说明。"""

    def _extract_questions(self, response: dict) -> list[dict]:
        """Pull the question list out of a decoded endpoint response.

        The first non-empty ``output_text`` found in a ``message`` output
        item is parsed; every question then gets empty-string defaults for
        the option, explanation and remark fields it lacks.

        Raises:
            ValueError: The response status is not ``"completed"``, no text
                was found, or the text is not a valid question array.
        """
        status = response.get("status")
        if status != "completed":
            raise ValueError(f"响应状态异常: {status}")

        text = self._first_output_text(response)
        if not text:
            raise ValueError("未在响应中找到文本内容")

        questions = self._parse_json(text)
        for question in questions:
            for field in self._OPTIONAL_FIELDS:
                question.setdefault(field, "")
        return questions

    @staticmethod
    def _first_output_text(response: dict) -> str:
        """Return the first non-empty output text, or ``""`` when none exists."""
        for item in response.get("output") or []:
            if item.get("type") != "message":
                continue
            for content in item.get("content") or []:
                if content.get("type") == "output_text" and content.get("text"):
                    # Returning here stops both loops, so later messages
                    # cannot overwrite the text already found.
                    return content["text"]
        return ""

    @classmethod
    def _normalize_answer(cls, raw: object) -> str:
        """Reduce an answer value to its upper-case A-D letters and digits."""
        upper = str(raw).upper()
        without_words = cls._ANSWER_WORD_RE.sub("", upper)
        return cls._ANSWER_NOISE_RE.sub("", without_words)

    def _parse_json(self, text: str) -> list[dict]:
        """Parse the JSON array embedded in ``text`` and validate its items.

        The array is taken from the first ``[`` to the last ``]``, which
        tolerates surrounding prose or code fences. Each item must be an
        object holding "题干" and "正确答案"; the answer is normalised in
        place.

        Raises:
            ValueError: No array delimiters were found, the slice is not
                valid JSON (``json.JSONDecodeError``), or an item is not an
                object with both required keys.
        """
        start_idx = text.find("[")
        end_idx = text.rfind("]")
        # end_idx < start_idx also covers "no closing bracket at all".
        if start_idx < 0 or end_idx < start_idx:
            raise ValueError(f"未找到 JSON 数组: {text[:200]}")

        data = json.loads(text[start_idx : end_idx + 1])
        if not isinstance(data, list):
            raise ValueError("解析结果不是数组")

        for index, item in enumerate(data):
            if not isinstance(item, dict) or "题干" not in item or "正确答案" not in item:
                raise ValueError(f"第 {index + 1} 题缺少题干或正确答案")
            item["正确答案"] = self._normalize_answer(item["正确答案"])
        return data
|
||
|
||
|
||
def extract_metadata(filename: str) -> dict:
    """Derive question metadata from the parts of a file name.

    The file stem is split on the first separator that occurs in it, tried
    in the order ``+``, space, ``-``, ``_``. The first three parts, stripped
    of surrounding whitespace, are the secondary knowledge point, the raw
    question type and the difficulty; missing parts become ``""``.

    Args:
        filename: File name or path; only its stem is analysed.

    Returns:
        A dict with the keys ``chapter`` (the text before the first "的" of
        the secondary knowledge point, or "未分类" when that is empty),
        ``secondary_knowledge``, ``question_type`` (the raw type mapped
        through ``settings.type_map``, unchanged when unmapped),
        ``difficulty`` and ``source_file`` (``filename`` as given).
    """
    stem = Path(filename).stem

    # Pick the highest-priority delimiter present; without one the whole
    # stem is a single part.
    delimiter = next((d for d in ("+", " ", "-", "_") if d in stem), None)
    pieces = [stem] if delimiter is None else stem.split(delimiter)

    # Normalise to exactly three stripped fields, padding with "".
    fields = [piece.strip() for piece in pieces[:3]]
    fields += [""] * (3 - len(fields))
    secondary, raw_type, difficulty = fields

    # partition() yields the whole string when "的" is absent.
    chapter = secondary.partition("的")[0] or "未分类"

    return {
        "chapter": chapter,
        "secondary_knowledge": secondary,
        "question_type": settings.type_map.get(raw_type, raw_type),
        "difficulty": difficulty,
        "source_file": filename,
    }
|