first commit
This commit is contained in:
1
backend/services/__init__.py
Normal file
1
backend/services/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
# Services package marker.
|
||||
70
backend/services/excel_service.py
Normal file
70
backend/services/excel_service.py
Normal file
@@ -0,0 +1,70 @@
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
|
||||
from openpyxl import Workbook, load_workbook
|
||||
|
||||
# Ordered (item key, spreadsheet header label) pairs. The order defines the
# column order of generated workbooks; the labels are matched against the
# header row when a workbook is parsed.
QUESTION_COLUMNS: list[tuple[str, str]] = [
    # Classification
    ("chapter", "章节"),
    ("primary_knowledge", "一级知识点"),
    ("secondary_knowledge", "二级知识点"),
    ("question_type", "题目类型"),
    ("difficulty", "难度"),
    # Question body and options
    ("stem", "题干"),
    ("option_a", "选项A"),
    ("option_b", "选项B"),
    ("option_c", "选项C"),
    ("option_d", "选项D"),
    # Answer and annotations
    ("answer", "正确答案"),
    ("explanation", "解析"),
    ("notes", "备注"),
    ("source_file", "来源文件"),
]
|
||||
|
||||
|
||||
def create_template_bytes() -> bytes:
    """Build the import template workbook and return it as raw bytes.

    The workbook has a single sheet named ``questions`` containing the header
    labels from ``QUESTION_COLUMNS`` followed by one example row.
    """
    header_labels = [label for _key, label in QUESTION_COLUMNS]
    # Example row; values are positional and follow the QUESTION_COLUMNS order.
    example_row = [
        "函数",
        "",
        "函数性质",
        "单选",
        "中",
        "示例题干",
        "A",
        "B",
        "C",
        "D",
        "A",
        "示例解析",
        "",
        "template.xlsx",
    ]

    workbook = Workbook()
    sheet = workbook.active
    sheet.title = "questions"
    for line in (header_labels, example_row):
        sheet.append(line)

    stream = BytesIO()
    workbook.save(stream)
    return stream.getvalue()
|
||||
|
||||
|
||||
def parse_excel_file(path: Path) -> list[dict]:
    """Read question rows from the active sheet of an Excel workbook.

    The first row is treated as the header. Header cells are matched (after
    stripping whitespace) against the labels in ``QUESTION_COLUMNS``; columns
    with unknown labels are ignored. Every returned item contains all keys
    from ``QUESTION_COLUMNS``, with missing or empty cells represented as
    ``""`` and all other values converted with ``str``.

    Args:
        path: Location of the workbook to read.

    Returns:
        One dict per data row that has a non-blank ``stem``; an empty list if
        the sheet has no rows at all.
    """
    key_by_label = {label: key for key, label in QUESTION_COLUMNS}

    wb = load_workbook(path, read_only=True)
    try:
        ws = wb.active
        # Stream rows instead of materialising the whole sheet in memory.
        rows = ws.iter_rows(values_only=True)
        header_row = next(rows, None)
        if header_row is None:
            return []

        # Resolve each column position to its item key once (None = unknown).
        column_keys = [
            key_by_label.get("" if cell is None else str(cell).strip())
            for cell in header_row
        ]

        result: list[dict] = []
        for row in rows:
            # Skip rows that are empty or contain only blank cells.
            if all(cell is None or str(cell).strip() == "" for cell in row):
                continue
            item = {key: "" for key, _ in QUESTION_COLUMNS}
            # zip() stops at the shorter sequence, so cells beyond the header
            # width are ignored.
            for key, cell in zip(column_keys, row):
                if key is not None:
                    item[key] = "" if cell is None else str(cell)
            # A question without a stem is unusable; whitespace-only counts
            # as missing, consistent with the blank-row check above.
            if item["stem"].strip():
                result.append(item)
        return result
    finally:
        # Read-only workbooks keep the underlying file open until closed.
        wb.close()
|
||||
|
||||
|
||||
def export_excel_bytes(items: list[dict]) -> bytes:
    """Serialise question items to an Excel workbook and return its bytes.

    The workbook has a single sheet named ``questions``: a header row with
    the labels from ``QUESTION_COLUMNS`` followed by one row per item. Keys
    missing from an item are written as ``""``.
    """
    keys = [key for key, _label in QUESTION_COLUMNS]
    labels = [label for _key, label in QUESTION_COLUMNS]

    workbook = Workbook()
    sheet = workbook.active
    sheet.title = "questions"
    sheet.append(labels)
    for record in items:
        sheet.append([record.get(key, "") for key in keys])

    stream = BytesIO()
    workbook.save(stream)
    return stream.getvalue()
|
||||
20
backend/services/file_utils.py
Normal file
20
backend/services/file_utils.py
Normal file
@@ -0,0 +1,20 @@
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
|
||||
from fastapi import UploadFile
|
||||
|
||||
from backend.config import settings
|
||||
|
||||
|
||||
def ensure_upload_dir() -> Path:
    """Return the configured upload directory, creating it if necessary.

    The location is read from ``settings.upload_dir``; missing parent
    directories are created as well.
    """
    upload_root = Path(settings.upload_dir)
    upload_root.mkdir(exist_ok=True, parents=True)
    return upload_root
|
||||
|
||||
|
||||
def save_upload(upload_file: UploadFile) -> Path:
    """Store an uploaded file in the upload directory and return its path.

    The client-supplied filename is untrusted, so only its final path
    component is used; this prevents ``../`` sequences or absolute paths from
    writing outside the upload directory. An existing file with the same name
    is overwritten.

    Args:
        upload_file: The upload whose ``file`` stream is copied to disk.

    Returns:
        Path of the stored file inside the upload directory.

    Raises:
        ValueError: If the upload has no usable filename.
    """
    raw_name = upload_file.filename or ""
    # Normalise Windows separators first so "C:\\dir\\x.docx" and "..\\x" are
    # reduced to their final component on POSIX hosts too.
    safe_name = Path(raw_name.replace("\\", "/")).name
    if safe_name in {"", ".", ".."}:
        raise ValueError(f"上传文件名无效: {raw_name!r}")

    target_path = ensure_upload_dir() / safe_name
    with target_path.open("wb") as buffer:
        shutil.copyfileobj(upload_file.file, buffer)
    return target_path
|
||||
167
backend/services/parser.py
Normal file
167
backend/services/parser.py
Normal file
@@ -0,0 +1,167 @@
|
||||
import json
|
||||
import re
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
import requests
|
||||
|
||||
from backend.config import settings
|
||||
|
||||
|
||||
class DMXAPIService:
    """Extract question data from documents by sending them to DMXAPI.

    A document is converted to PDF when needed, uploaded to a temporary file
    host to obtain a URL, and that URL is passed to the model together with an
    extraction instruction. The model's reply is parsed as a JSON array of
    question dicts keyed by the Chinese field names used in the instruction.
    """

    def __init__(self) -> None:
        """Read API key, model name and endpoint URL from ``settings``."""
        self.api_key = settings.api_key
        self.model_name = settings.model_name
        self.api_url = settings.dmxapi_url

    def parse_file(self, file_path: str) -> list[dict]:
        """Extract questions from the document at ``file_path``.

        Non-PDF inputs are converted to a temporary PDF first; that temporary
        file is removed afterwards whether or not extraction succeeded.

        Args:
            file_path: Path of the document to process.

        Returns:
            The list of question dicts produced by ``_extract_questions``.

        Raises:
            ValueError: If conversion, upload, the API request or response
                parsing fails.
        """
        path = Path(file_path)
        is_pdf = path.suffix.lower() == ".pdf"
        pdf_path = path if is_pdf else self._convert_to_pdf(path)

        try:
            file_url = self._upload_to_temp_host(pdf_path)
            return self._parse_with_file_url(file_url, path.name)
        finally:
            # Only delete the PDF we generated, never the caller's own file.
            if pdf_path != path:
                pdf_path.unlink(missing_ok=True)

    def _upload_to_temp_host(self, path: Path) -> str:
        """Upload ``path`` to a temporary file host and return a download URL.

        file.io is tried first on a best-effort basis; any failure there falls
        through to tmpfiles.org, whose errors are reported to the caller.

        NOTE(review): the document is sent to public third-party hosts —
        confirm this is acceptable for the data being processed.

        Raises:
            ValueError: If the tmpfiles.org upload does not succeed.
        """
        try:
            with path.open("rb") as f:
                response = requests.post("https://file.io", files={"file": f}, timeout=60)
            if response.status_code == 200:
                body = response.json()
                if isinstance(body, dict) and body.get("success"):
                    return body["link"]
        except (requests.RequestException, OSError, ValueError, KeyError):
            # Deliberate best effort: network errors, non-JSON bodies or an
            # unexpected payload shape just mean "use the fallback host".
            pass

        with path.open("rb") as f:
            response = requests.post("https://tmpfiles.org/api/v1/upload", files={"file": f}, timeout=60)
        if response.status_code != 200:
            raise ValueError(f"上传失败: {response.text}")
        data = response.json()
        if data.get("status") != "success":
            raise ValueError(f"上传失败: {data}")
        # Rewrite the returned page URL into its "/dl/" form.
        return data["data"]["url"].replace("tmpfiles.org/", "tmpfiles.org/dl/")

    def _convert_to_pdf(self, path: Path) -> Path:
        """Convert ``path`` to a PDF with pandoc and return the PDF's path.

        The output goes to a uniquely named temporary file next to the input,
        so an existing ``<stem>.pdf`` in the same directory is never
        overwritten or deleted. The xelatex engine is tried first, then
        weasyprint. The caller is responsible for deleting the returned file.

        Raises:
            ValueError: If both pandoc invocations exit with a non-zero code.
        """
        import tempfile  # local import keeps this class self-contained

        with tempfile.NamedTemporaryFile(
            prefix=f"{path.stem}-", suffix=".pdf", dir=path.parent, delete=False
        ) as tmp:
            pdf_path = Path(tmp.name)

        engine_options = (
            # NOTE(review): "PingFang SC" looks like a platform-specific
            # font; confirm it exists on the deployment host.
            ["--pdf-engine=xelatex", "-V", "CJKmainfont=PingFang SC"],
            ["--pdf-engine=weasyprint"],
        )
        try:
            stderr = ""
            for options in engine_options:
                result = subprocess.run(
                    ["pandoc", str(path), "-o", str(pdf_path), *options],
                    capture_output=True,
                    text=True,
                    timeout=90,
                )
                if result.returncode == 0:
                    return pdf_path
                stderr = result.stderr
            raise ValueError(f"文件转 PDF 失败: {stderr}")
        except Exception:
            # Do not leave a stray temporary file behind on failure.
            pdf_path.unlink(missing_ok=True)
            raise

    def _parse_with_file_url(self, file_url: str, original_filename: str) -> list[dict]:
        """Ask the model to extract questions from the file at ``file_url``.

        Args:
            file_url: URL of the uploaded PDF.
            original_filename: Name of the original document, quoted in the
                instruction text.

        Raises:
            ValueError: If no API key is configured, the HTTP status is not
                200, or the response cannot be parsed into questions.
        """
        if not self.api_key:
            raise ValueError("未配置 API_KEY,无法调用 DMXAPI")

        payload = {
            "model": self.model_name,
            "input": [
                {
                    "role": "user",
                    "content": [
                        {"type": "input_file", "file_url": file_url},
                        {"type": "input_text", "text": self._build_instruction(original_filename)},
                    ],
                }
            ],
        }
        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
        }
        response = requests.post(
            self.api_url, headers=headers, data=json.dumps(payload), timeout=180
        )
        if response.status_code != 200:
            raise ValueError(f"DMXAPI 请求失败: {response.status_code} {response.text}")
        return self._extract_questions(response.json())

    def _build_instruction(self, filename: str) -> str:
        """Return the extraction prompt sent to the model for ``filename``."""
        return f"""请从文件\"{filename}\"中提取所有题目信息,并以JSON数组格式返回。

提取字段:题干、选项A、选项B、选项C、选项D、正确答案、解析、备注。
数学公式统一转为 LaTeX,内联公式使用 $...$,独立公式使用 $$...$$。
答案统一大写字母,清理“答案:”等前缀。
仅返回 JSON 数组,不要返回额外说明。"""

    def _extract_questions(self, response: dict) -> list[dict]:
        """Pull the question list out of a decoded API response.

        The first non-empty ``output_text`` found in a ``message`` output item
        is parsed; optional fields missing from a question default to ``""``.

        Raises:
            ValueError: If the response status is not ``"completed"``, no text
                is present, or the text is not a valid question array.
        """
        if response.get("status") != "completed":
            raise ValueError(f"响应状态异常: {response.get('status')}")

        text = None
        for item in response.get("output", []):
            if item.get("type") != "message":
                continue
            for content in item.get("content", []):
                if content.get("type") == "output_text" and content.get("text"):
                    text = content["text"]
                    break
            if text:
                # Stop at the first text found so a later message cannot
                # silently replace it.
                break
        if not text:
            raise ValueError("未在响应中找到文本内容")

        questions = self._parse_json(text)
        for q in questions:
            for field in ["选项A", "选项B", "选项C", "选项D", "解析", "备注"]:
                q.setdefault(field, "")
        return questions

    def _parse_json(self, text: str) -> list[dict]:
        """Parse the JSON array embedded in ``text`` and normalise answers.

        Everything from the first ``[`` to the last ``]`` is decoded. Each
        element must be an object containing ``题干`` and ``正确答案``; the
        answer is upper-cased and stripped of every character other than
        ``A``-``D`` and digits.

        Raises:
            ValueError: If no array is found, decoding fails
                (``json.JSONDecodeError`` is a ``ValueError``), or an element
                is not an object with the required fields.
        """
        start_idx = text.find("[")
        end_idx = text.rfind("]")
        # Also rejects a "]" that appears before the first "[".
        if start_idx < 0 or end_idx < start_idx:
            raise ValueError(f"未找到 JSON 数组: {text[:200]}")

        data = json.loads(text[start_idx : end_idx + 1])
        if not isinstance(data, list):
            raise ValueError("解析结果不是数组")

        for index, item in enumerate(data, start=1):
            # Non-dict elements (numbers, strings, ...) cannot carry fields.
            if not isinstance(item, dict) or "题干" not in item or "正确答案" not in item:
                raise ValueError(f"第 {index} 题缺少题干或正确答案")
            item["正确答案"] = re.sub(r"[^A-D0-9]", "", str(item["正确答案"]).upper())
        return data
|
||||
|
||||
|
||||
def extract_metadata(filename: str) -> dict:
    """Derive question metadata from a file name.

    The file stem is split on the first separator (tried in the order ``+``,
    space, ``-``, ``_``) that occurs in it. The resulting parts are read
    positionally as secondary knowledge point, question type and difficulty;
    missing parts become ``""``. The question type is translated through
    ``settings.type_map`` when it has an entry there. The chapter is the text
    of the secondary knowledge point before its first ``的``, or ``未分类``
    when that is empty.

    Args:
        filename: File name (or path) to analyse; returned unchanged as
            ``source_file``.

    Returns:
        Dict with ``chapter``, ``secondary_knowledge``, ``question_type``,
        ``difficulty`` and ``source_file``.
    """
    stem = Path(filename).stem

    # Split on the highest-priority separator present; otherwise one part.
    chosen = next((sep for sep in ("+", " ", "-", "_") if sep in stem), None)
    pieces = [stem] if chosen is None else stem.split(chosen)

    # Pad so the three positional fields can be unpacked unconditionally.
    fields = [piece.strip() for piece in pieces[:3]]
    fields += [""] * (3 - len(fields))
    secondary, raw_type, difficulty = fields

    # partition() yields the whole string when "的" is absent.
    chapter = secondary.partition("的")[0] or "未分类"

    return {
        "chapter": chapter,
        "secondary_knowledge": secondary,
        "question_type": settings.type_map.get(raw_type, raw_type),
        "difficulty": difficulty,
        "source_file": filename,
    }
|
||||
Reference in New Issue
Block a user