update: uploads

This commit is contained in:
2026-03-06 15:52:34 +08:00
parent b1b14fd964
commit f9b9b821df
19 changed files with 1333 additions and 106 deletions

View File

@@ -1,6 +1,8 @@
import json
import re
import shutil
import subprocess
import tempfile
from pathlib import Path
import requests
@@ -8,11 +10,13 @@ import requests
from backend.config import settings
class DMXAPIService:
class OpenAICompatibleParserService:
def __init__(self) -> None:
    """Read the API key, model name and endpoint URL from ``settings``."""
    self.api_key = settings.api_key
    self.model_name = settings.model_name
    # Endpoint of the OpenAI-compatible API that requests are posted to.
    self.api_url = settings.openai_api_url
def parse_file(self, file_path: str) -> list[dict]:
path = Path(file_path)
@@ -51,33 +55,94 @@ class DMXAPIService:
def _convert_to_pdf(self, path: Path) -> Path:
    """Convert ``path`` to a PDF next to it using pandoc.

    A ``.doc`` input is first converted to ``.docx`` via
    ``_convert_doc_to_docx``. pandoc is then run with the ``xelatex``
    engine and, if that exits non-zero, once more with ``weasyprint``.

    Args:
        path: The document to convert.

    Returns:
        ``path`` with its suffix replaced by ``.pdf``.

    Raises:
        ValueError: If both PDF engines fail (the message carries the stderr
            of the last attempt), or if the ``.doc`` pre-conversion fails.
    """
    pdf_path = path.with_suffix(".pdf")
    # Remove any stale output so a failed run cannot leave an old PDF behind.
    pdf_path.unlink(missing_ok=True)

    source_path = path
    temp_dir: str | None = None
    if path.suffix.lower() == ".doc":
        source_path, temp_dir = self._convert_doc_to_docx(path)

    # Engine-specific pandoc arguments, in the order they are attempted.
    engine_options = (
        ["--pdf-engine=xelatex", "-V", "CJKmainfont=PingFang SC"],
        ["--pdf-engine=weasyprint"],
    )
    try:
        result = None
        for options in engine_options:
            cmd = ["pandoc", str(source_path), "-o", str(pdf_path), *options]
            result = subprocess.run(cmd, capture_output=True, text=True, timeout=90)
            if result.returncode == 0:
                return pdf_path
        raise ValueError(f"文件转 PDF 失败: {result.stderr}")
    finally:
        # The temporary directory only exists when a .doc was pre-converted.
        if temp_dir:
            shutil.rmtree(temp_dir, ignore_errors=True)
def _convert_doc_to_docx(self, path: Path) -> tuple[Path, str]:
    """Convert a ``.doc`` file to ``.docx`` inside a fresh temporary directory.

    ``soffice`` is tried first and ``textutil`` second. A tool that is not
    found on ``PATH``, exits non-zero, produces no output file, or times out
    is recorded as a failure and the next tool is tried.

    Args:
        path: The ``.doc`` file to convert.

    Returns:
        ``(converted_path, temp_dir)``. The caller is responsible for
        removing ``temp_dir`` once the converted file is no longer needed.

    Raises:
        ValueError: If no tool produced the ``.docx`` file. The message lists
            the failure reason of every tool. The temporary directory is
            removed before the error propagates.
    """
    temp_dir = tempfile.mkdtemp(prefix="pb_doc_convert_")
    converted_path = Path(temp_dir) / f"{path.stem}.docx"
    # soffice is only given the output directory, so its result is expected
    # at <stem>.docx; textutil is given the exact output path.
    converters = (
        (
            "soffice",
            ["soffice", "--headless", "--convert-to", "docx", "--outdir", temp_dir, str(path)],
        ),
        (
            "textutil",
            ["textutil", "-convert", "docx", "-output", str(converted_path), str(path)],
        ),
    )
    convert_errors: list[str] = []
    try:
        for tool, cmd in converters:
            if not shutil.which(tool):
                convert_errors.append(f"{tool}: 未安装")
                continue
            try:
                result = subprocess.run(cmd, capture_output=True, text=True, timeout=120)
            except subprocess.TimeoutExpired:
                # A hung converter must not prevent the next one from running.
                convert_errors.append(f"{tool}: 转换超时")
                continue
            if result.returncode == 0 and converted_path.exists():
                return converted_path, temp_dir
            convert_errors.append(f"{tool}: {(result.stderr or result.stdout).strip()}")
        raise ValueError(
            "检测到 .doc 文件pandoc 不支持直接转换。"
            "已尝试自动转换为 .docx 但失败。请先把 .doc 另存为 .docx 后重试。"
            f" 详细信息: {' | '.join(convert_errors)}"
        )
    except BaseException:
        # On failure the caller never receives temp_dir, so clean it up here.
        shutil.rmtree(temp_dir, ignore_errors=True)
        raise
def _parse_with_file_url(self, file_url: str, original_filename: str) -> list[dict]:
if not self.api_key:
raise ValueError("未配置 API_KEY无法调用 DMXAPI")
raise ValueError("未配置 API_KEY无法调用 OpenAI 兼容接口")
payload = {
"model": self.model_name,
@@ -99,7 +164,7 @@ class DMXAPIService:
self.api_url, headers=headers, data=json.dumps(payload), timeout=180
)
if response.status_code != 200:
raise ValueError(f"DMXAPI 请求失败: {response.status_code} {response.text}")
raise ValueError(f"OpenAI 兼容接口请求失败: {response.status_code} {response.text}")
return self._extract_questions(response.json())
def _build_instruction(self, filename: str) -> str: