update: uploads
This commit is contained in:
@@ -18,3 +18,14 @@ def save_upload(upload_file: UploadFile) -> Path:
|
||||
with target_path.open("wb") as buffer:
|
||||
shutil.copyfileobj(upload_file.file, buffer)
|
||||
return target_path
|
||||
|
||||
|
||||
def save_upload_for_job(job_id: int, seq: int, upload_file: UploadFile) -> Path:
    """Persist an uploaded file under upload_dir/<job_id>/<seq>_<name>.

    Prefixing the stored name with the item sequence number keeps every
    path unique within a job, so two uploads sharing the same original
    filename can never overwrite each other.
    """
    job_dir = ensure_upload_dir() / str(job_id)
    job_dir.mkdir(parents=True, exist_ok=True)
    original_name = upload_file.filename or "file"
    destination = job_dir / f"{seq}_{original_name}"
    with destination.open("wb") as sink:
        shutil.copyfileobj(upload_file.file, sink)
    return destination
|
||||
|
||||
180
backend/services/import_queue_service.py
Normal file
180
backend/services/import_queue_service.py
Normal file
@@ -0,0 +1,180 @@
|
||||
"""
|
||||
Import queue: single-consumer FIFO worker and job execution.
|
||||
Run run_worker_loop() in a background thread; on startup call reset_stale_running_jobs().
|
||||
"""
|
||||
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
import time
|
||||
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from backend.database import SessionLocal
|
||||
from backend.models import (
|
||||
JOB_STATUS_FAILED,
|
||||
JOB_STATUS_QUEUED,
|
||||
JOB_STATUS_RUNNING,
|
||||
JOB_STATUS_SUCCESS,
|
||||
ImportHistory,
|
||||
ImportJob,
|
||||
ImportJobItem,
|
||||
Question,
|
||||
)
|
||||
from backend.repositories import import_job_repo as repo
|
||||
from backend.services.excel_service import parse_excel_file
|
||||
from backend.services.parser import OpenAICompatibleParserService, extract_metadata
|
||||
|
||||
|
||||
def _build_ai_rows(path: Path) -> list[dict]:
    """Parse *path* with the AI parser and map each question to a DB row dict.

    File-level fields (chapter, difficulty, ...) come from the filename
    metadata; per-question fields come from the parser's Chinese-keyed dicts,
    defaulting to "" when a key is absent.
    """
    service = OpenAICompatibleParserService()
    meta = extract_metadata(path.name)
    parsed = service.parse_file(str(path))
    return [
        {
            "chapter": meta["chapter"],
            "primary_knowledge": "",
            "secondary_knowledge": meta["secondary_knowledge"],
            "question_type": meta["question_type"],
            "difficulty": meta["difficulty"],
            "stem": entry.get("题干", ""),
            "option_a": entry.get("选项A", ""),
            "option_b": entry.get("选项B", ""),
            "option_c": entry.get("选项C", ""),
            "option_d": entry.get("选项D", ""),
            "answer": entry.get("正确答案", ""),
            "explanation": entry.get("解析", ""),
            "notes": entry.get("备注", ""),
            "source_file": meta["source_file"],
        }
        for entry in parsed
    ]
|
||||
|
||||
|
||||
def _process_one_item(
    db: Session,
    job: ImportJob,
    item: ImportJobItem,
    method: str,
) -> None:
    """Process one job item end-to-end and record its outcome.

    Marks the item as running, parses its stored file with the selected
    method ("excel" or the AI parser), inserts the resulting Question rows
    plus an ImportHistory record, then updates item/job counters. Any
    failure is caught, rolled back, and recorded so the worker can move on
    to the next item instead of aborting the job.
    """
    path = Path(item.stored_path)
    filename = item.filename
    # Publish progress first so status pollers see the current file/index
    # while this item is being parsed.
    job.current_file = filename
    job.current_index = item.seq
    job.updated_at = datetime.utcnow()
    item.status = JOB_STATUS_RUNNING
    item.started_at = datetime.utcnow()
    db.commit()

    try:
        if method == "excel":
            # Restrict to openpyxl-compatible extensions; reject early with
            # a user-facing error rather than a parser traceback.
            if path.suffix.lower() not in [".xlsx", ".xlsm", ".xltx", ".xltm"]:
                raise ValueError("仅支持 Excel 文件")
            rows = parse_excel_file(path)
        else:
            rows = _build_ai_rows(path)

        questions = [Question(**row) for row in rows]
        if questions:
            db.add_all(questions)
        # A history row is written even for zero-question files.
        db.add(
            ImportHistory(
                filename=filename,
                method=method,
                question_count=len(questions),
                status="success",
            )
        )
        db.commit()

        # Separate commit for counters: questions/history are already
        # durable before progress bookkeeping is updated.
        item.status = JOB_STATUS_SUCCESS
        item.question_count = len(questions)
        item.ended_at = datetime.utcnow()
        job.success_count += 1
        job.processed += 1
        job.updated_at = datetime.utcnow()
        db.commit()
    except Exception as exc:
        # Discard any partially-added rows from the failed attempt before
        # recording the failure.
        db.rollback()
        db.add(
            ImportHistory(
                filename=filename,
                method=method,
                question_count=0,
                status="failed",
            )
        )
        db.commit()
        item.status = JOB_STATUS_FAILED
        item.error = str(exc)
        item.ended_at = datetime.utcnow()
        job.failed_count += 1
        job.processed += 1
        job.updated_at = datetime.utcnow()
        db.commit()
|
||||
|
||||
|
||||
def process_job(db: Session, job_id: int) -> None:
    """Execute a single job: process all items in order, then set job terminal status."""
    job = repo.get_job(db, job_id)
    # Only run jobs the worker has actually claimed (status must be 'running').
    if not job or job.status != JOB_STATUS_RUNNING:
        return
    method = job.method
    items = sorted(job.items, key=lambda x: x.seq)
    # Resume: ensure processed/success_count/failed_count reflect already-completed items
    job.processed = sum(1 for it in items if it.status in (JOB_STATUS_SUCCESS, JOB_STATUS_FAILED))
    job.success_count = sum(1 for it in items if it.status == JOB_STATUS_SUCCESS)
    job.failed_count = sum(1 for it in items if it.status == JOB_STATUS_FAILED)
    db.commit()
    for item in items:
        # Skip items finished on a previous run (resume after restart).
        if item.status in (JOB_STATUS_SUCCESS, JOB_STATUS_FAILED):
            continue
        _process_one_item(db, job, item, method)
        # Reload counters that _process_one_item committed on this session.
        db.refresh(job)

    # Re-fetch before finalizing in case the row changed underneath us.
    job = repo.get_job(db, job_id)
    if not job:
        return
    # Terminal status: the job is 'failed' only when nothing succeeded;
    # partial success still counts as a successful job run.
    if job.failed_count > 0 and job.success_count == 0:
        job.status = JOB_STATUS_FAILED
        job.error = "部分或全部文件处理失败"
    else:
        job.status = JOB_STATUS_SUCCESS
        job.error = ""
    job.ended_at = datetime.utcnow()
    job.current_file = ""
    job.updated_at = datetime.utcnow()
    db.commit()
|
||||
|
||||
|
||||
def reset_stale_running_jobs(db: Session) -> int:
    """On startup: requeue any job left in 'running' by a previous process.

    Returns the number of jobs moved back to 'queued' so the worker loop
    can pick them up again.
    """
    stale_jobs = (
        db.query(ImportJob)
        .filter(ImportJob.status == JOB_STATUS_RUNNING)
        .all()
    )
    for stale in stale_jobs:
        stale.status = JOB_STATUS_QUEUED
    if stale_jobs:
        db.commit()
    return len(stale_jobs)
|
||||
|
||||
|
||||
def run_worker_loop(interval_seconds: float = 1.0, stop_event=None) -> None:
    """
    Single-consumer FIFO loop. Call from a background thread.

    Claims the oldest queued job, processes it, then repeats; sleeps for
    *interval_seconds* when the queue is empty or after an unexpected error.

    Args:
        interval_seconds: Idle/back-off sleep between polls.
        stop_event: Optional ``threading.Event``-like object. When set, the
            loop exits after finishing the current iteration, enabling a
            clean shutdown (default ``None`` preserves the original
            run-forever behavior).
    """
    while stop_event is None or not stop_event.is_set():
        # Fresh session per iteration: avoids stale identity-map state and
        # guarantees the connection is returned to the pool via close().
        db = SessionLocal()
        try:
            job = repo.claim_oldest_queued(db)
            if job:
                process_job(db, job.id)
            else:
                time.sleep(interval_seconds)
        except Exception:
            # Keep the worker alive on unexpected errors: db is always bound
            # here (assigned before the try), so roll back the broken
            # transaction and back off before retrying.
            db.rollback()
            time.sleep(interval_seconds)
        finally:
            db.close()
|
||||
@@ -1,6 +1,8 @@
|
||||
import json
|
||||
import re
|
||||
import shutil
|
||||
import subprocess
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
import requests
|
||||
@@ -8,11 +10,13 @@ import requests
|
||||
from backend.config import settings
|
||||
|
||||
|
||||
class DMXAPIService:
|
||||
|
||||
|
||||
class OpenAICompatibleParserService:
|
||||
def __init__(self) -> None:
|
||||
self.api_key = settings.api_key
|
||||
self.model_name = settings.model_name
|
||||
self.api_url = settings.dmxapi_url
|
||||
self.api_url = settings.openai_api_url
|
||||
|
||||
def parse_file(self, file_path: str) -> list[dict]:
|
||||
path = Path(file_path)
|
||||
@@ -51,33 +55,94 @@ class DMXAPIService:
|
||||
def _convert_to_pdf(self, path: Path) -> Path:
|
||||
pdf_path = path.with_suffix(".pdf")
|
||||
pdf_path.unlink(missing_ok=True)
|
||||
source_path = path
|
||||
temp_dir: str | None = None
|
||||
|
||||
cmd = [
|
||||
"pandoc",
|
||||
str(path),
|
||||
"-o",
|
||||
str(pdf_path),
|
||||
"--pdf-engine=xelatex",
|
||||
"-V",
|
||||
"CJKmainfont=PingFang SC",
|
||||
]
|
||||
result = subprocess.run(cmd, capture_output=True, text=True, timeout=90)
|
||||
if result.returncode != 0:
|
||||
fallback = [
|
||||
if path.suffix.lower() == ".doc":
|
||||
source_path, temp_dir = self._convert_doc_to_docx(path)
|
||||
|
||||
try:
|
||||
cmd = [
|
||||
"pandoc",
|
||||
str(path),
|
||||
str(source_path),
|
||||
"-o",
|
||||
str(pdf_path),
|
||||
"--pdf-engine=weasyprint",
|
||||
"--pdf-engine=xelatex",
|
||||
"-V",
|
||||
"CJKmainfont=PingFang SC",
|
||||
]
|
||||
result = subprocess.run(fallback, capture_output=True, text=True, timeout=90)
|
||||
result = subprocess.run(cmd, capture_output=True, text=True, timeout=90)
|
||||
if result.returncode != 0:
|
||||
raise ValueError(f"文件转 PDF 失败: {result.stderr}")
|
||||
return pdf_path
|
||||
fallback = [
|
||||
"pandoc",
|
||||
str(source_path),
|
||||
"-o",
|
||||
str(pdf_path),
|
||||
"--pdf-engine=weasyprint",
|
||||
]
|
||||
result = subprocess.run(fallback, capture_output=True, text=True, timeout=90)
|
||||
if result.returncode != 0:
|
||||
raise ValueError(f"文件转 PDF 失败: {result.stderr}")
|
||||
return pdf_path
|
||||
finally:
|
||||
if temp_dir:
|
||||
shutil.rmtree(temp_dir, ignore_errors=True)
|
||||
|
||||
def _convert_doc_to_docx(self, path: Path) -> tuple[Path, str]:
    """Convert a legacy .doc file to .docx inside a fresh temp directory.

    Tries LibreOffice (``soffice``) first, then ``textutil`` (macOS).
    Returns ``(converted_path, temp_dir)``; the caller is responsible for
    removing *temp_dir*. Raises ValueError with the collected per-tool
    errors when no converter is available or both conversions fail.
    """
    temp_dir = tempfile.mkdtemp(prefix="pb_doc_convert_")
    converted_path = Path(temp_dir) / f"{path.stem}.docx"
    convert_errors: list[str] = []

    if shutil.which("soffice"):
        result = subprocess.run(
            [
                "soffice",
                "--headless",
                "--convert-to",
                "docx",
                "--outdir",
                temp_dir,
                str(path),
            ],
            capture_output=True,
            text=True,
            timeout=120,
        )
        # soffice can exit 0 without producing output, so verify the file too.
        if result.returncode == 0 and converted_path.exists():
            return converted_path, temp_dir
        convert_errors.append(f"soffice: {(result.stderr or result.stdout).strip()}")
    else:
        convert_errors.append("soffice: 未安装")

    if shutil.which("textutil"):
        result = subprocess.run(
            [
                "textutil",
                "-convert",
                "docx",
                "-output",
                str(converted_path),
                str(path),
            ],
            capture_output=True,
            text=True,
            timeout=120,
        )
        if result.returncode == 0 and converted_path.exists():
            return converted_path, temp_dir
        convert_errors.append(f"textutil: {(result.stderr or result.stdout).strip()}")
    else:
        convert_errors.append("textutil: 未安装")

    # Both converters unavailable or failed: surface every tool's error so
    # the user can see why and fall back to saving as .docx manually.
    raise ValueError(
        "检测到 .doc 文件,pandoc 不支持直接转换。"
        "已尝试自动转换为 .docx 但失败。请先把 .doc 另存为 .docx 后重试。"
        f" 详细信息: {' | '.join(convert_errors)}"
    )
|
||||
|
||||
def _parse_with_file_url(self, file_url: str, original_filename: str) -> list[dict]:
|
||||
if not self.api_key:
|
||||
raise ValueError("未配置 API_KEY,无法调用 DMXAPI")
|
||||
raise ValueError("未配置 API_KEY,无法调用 OpenAI 兼容接口")
|
||||
|
||||
payload = {
|
||||
"model": self.model_name,
|
||||
@@ -99,7 +164,7 @@ class DMXAPIService:
|
||||
self.api_url, headers=headers, data=json.dumps(payload), timeout=180
|
||||
)
|
||||
if response.status_code != 200:
|
||||
raise ValueError(f"DMXAPI 请求失败: {response.status_code} {response.text}")
|
||||
raise ValueError(f"OpenAI 兼容接口请求失败: {response.status_code} {response.text}")
|
||||
return self._extract_questions(response.json())
|
||||
|
||||
def _build_instruction(self, filename: str) -> str:
|
||||
|
||||
Reference in New Issue
Block a user