update: uploads

2026-03-06 15:52:34 +08:00
parent b1b14fd964
commit f9b9b821df
19 changed files with 1333 additions and 106 deletions

View File

@@ -18,3 +18,14 @@ def save_upload(upload_file: UploadFile) -> Path:
    with target_path.open("wb") as buffer:
        shutil.copyfileobj(upload_file.file, buffer)
    return target_path


def save_upload_for_job(job_id: int, seq: int, upload_file: UploadFile) -> Path:
    """Save file with unique path under upload_dir/job_id/ to avoid overwrites."""
    base = ensure_upload_dir() / str(job_id)
    base.mkdir(parents=True, exist_ok=True)
    name = upload_file.filename or "file"
    target_path = base / f"{seq}_{name}"
    with target_path.open("wb") as buffer:
        shutil.copyfileobj(upload_file.file, buffer)
    return target_path
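
A note on usage: the endpoint that creates an import job would call save_upload_for_job once per uploaded file, passing the item's sequence number so same-named files cannot overwrite each other. A minimal sketch, assuming a FastAPI route; the route path, module path, and job-creation step are not part of this diff:

# Hypothetical endpoint sketch — only save_upload_for_job comes from this commit.
from fastapi import APIRouter, UploadFile

from backend.services.file_service import save_upload_for_job  # module path assumed

router = APIRouter()


@router.post("/api/import-jobs")  # route path assumed
async def create_import_job(files: list[UploadFile]) -> dict:
    job_id = 1  # in practice: the id of a freshly created ImportJob row
    stored = []
    for seq, upload in enumerate(files, start=1):
        # each file lands at upload_dir/<job_id>/<seq>_<original name>
        stored.append(str(save_upload_for_job(job_id, seq, upload)))
    return {"job_id": job_id, "files": stored}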

View File

@@ -0,0 +1,180 @@
"""
Import queue: single-consumer FIFO worker and job execution.
Run run_worker_loop() in a background thread; on startup call reset_stale_running_jobs().
"""
from datetime import datetime
from pathlib import Path
import time

from sqlalchemy.orm import Session

from backend.database import SessionLocal
from backend.models import (
    JOB_STATUS_FAILED,
    JOB_STATUS_QUEUED,
    JOB_STATUS_RUNNING,
    JOB_STATUS_SUCCESS,
    ImportHistory,
    ImportJob,
    ImportJobItem,
    Question,
)
from backend.repositories import import_job_repo as repo
from backend.services.excel_service import parse_excel_file
from backend.services.parser import OpenAICompatibleParserService, extract_metadata


def _build_ai_rows(path: Path) -> list[dict]:
    parser = OpenAICompatibleParserService()
    metadata = extract_metadata(path.name)
    questions = parser.parse_file(str(path))
    rows = []
    for q in questions:
        rows.append(
            {
                "chapter": metadata["chapter"],
                "primary_knowledge": "",
                "secondary_knowledge": metadata["secondary_knowledge"],
                "question_type": metadata["question_type"],
                "difficulty": metadata["difficulty"],
                "stem": q.get("题干", ""),
                "option_a": q.get("选项A", ""),
                "option_b": q.get("选项B", ""),
                "option_c": q.get("选项C", ""),
                "option_d": q.get("选项D", ""),
                "answer": q.get("正确答案", ""),
                "explanation": q.get("解析", ""),
                "notes": q.get("备注", ""),
                "source_file": metadata["source_file"],
            }
        )
    return rows


def _process_one_item(
    db: Session,
    job: ImportJob,
    item: ImportJobItem,
    method: str,
) -> None:
    path = Path(item.stored_path)
    filename = item.filename
    job.current_file = filename
    job.current_index = item.seq
    job.updated_at = datetime.utcnow()
    item.status = JOB_STATUS_RUNNING
    item.started_at = datetime.utcnow()
    db.commit()
    try:
        if method == "excel":
            if path.suffix.lower() not in [".xlsx", ".xlsm", ".xltx", ".xltm"]:
                raise ValueError("仅支持 Excel 文件")
            rows = parse_excel_file(path)
        else:
            rows = _build_ai_rows(path)
        questions = [Question(**row) for row in rows]
        if questions:
            db.add_all(questions)
        db.add(
            ImportHistory(
                filename=filename,
                method=method,
                question_count=len(questions),
                status="success",
            )
        )
        db.commit()
        item.status = JOB_STATUS_SUCCESS
        item.question_count = len(questions)
        item.ended_at = datetime.utcnow()
        job.success_count += 1
        job.processed += 1
        job.updated_at = datetime.utcnow()
        db.commit()
    except Exception as exc:
        db.rollback()
        db.add(
            ImportHistory(
                filename=filename,
                method=method,
                question_count=0,
                status="failed",
            )
        )
        db.commit()
        item.status = JOB_STATUS_FAILED
        item.error = str(exc)
        item.ended_at = datetime.utcnow()
        job.failed_count += 1
        job.processed += 1
        job.updated_at = datetime.utcnow()
        db.commit()


def process_job(db: Session, job_id: int) -> None:
    """Execute a single job: process all items in order, then set job terminal status."""
    job = repo.get_job(db, job_id)
    if not job or job.status != JOB_STATUS_RUNNING:
        return
    method = job.method
    items = sorted(job.items, key=lambda x: x.seq)
    # Resume: ensure processed/success_count/failed_count reflect already-completed items
    job.processed = sum(1 for it in items if it.status in (JOB_STATUS_SUCCESS, JOB_STATUS_FAILED))
    job.success_count = sum(1 for it in items if it.status == JOB_STATUS_SUCCESS)
    job.failed_count = sum(1 for it in items if it.status == JOB_STATUS_FAILED)
    db.commit()
    for item in items:
        if item.status in (JOB_STATUS_SUCCESS, JOB_STATUS_FAILED):
            continue
        _process_one_item(db, job, item, method)
        db.refresh(job)
    job = repo.get_job(db, job_id)
    if not job:
        return
    if job.failed_count > 0 and job.success_count == 0:
        job.status = JOB_STATUS_FAILED
        job.error = "部分或全部文件处理失败"
    else:
        job.status = JOB_STATUS_SUCCESS
        job.error = ""
    job.ended_at = datetime.utcnow()
    job.current_file = ""
    job.updated_at = datetime.utcnow()
    db.commit()


def reset_stale_running_jobs(db: Session) -> int:
    """On startup: set any job left in 'running' back to 'queued' so worker can pick it up."""
    count = 0
    for job in db.query(ImportJob).filter(ImportJob.status == JOB_STATUS_RUNNING).all():
        job.status = JOB_STATUS_QUEUED
        count += 1
    if count:
        db.commit()
    return count


def run_worker_loop(interval_seconds: float = 1.0) -> None:
    """
    Single-consumer FIFO loop. Call from a background thread.
    Claims oldest queued job, processes it, then repeats. Sleeps when no job.
    """
    while True:
        db = SessionLocal()
        try:
            job = repo.claim_oldest_queued(db)
            if job:
                process_job(db, job.id)
            else:
                time.sleep(interval_seconds)
        except Exception:
            if db:
                db.rollback()
            time.sleep(interval_seconds)
        finally:
            db.close()
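
The module docstring above is the whole contract: start exactly one background thread running run_worker_loop(), and call reset_stale_running_jobs() first so jobs interrupted by a restart are re-queued rather than stuck in 'running'. A minimal startup sketch, assuming a FastAPI lifespan hook and an import path for this module; neither is shown in this diff:

# Hypothetical wiring — reset_stale_running_jobs/run_worker_loop come from this commit;
# the lifespan hook and module path are assumptions.
import threading
from contextlib import asynccontextmanager

from fastapi import FastAPI

from backend.database import SessionLocal
from backend.services.import_queue import reset_stale_running_jobs, run_worker_loop  # path assumed


@asynccontextmanager
async def lifespan(app: FastAPI):
    db = SessionLocal()
    try:
        reset_stale_running_jobs(db)  # re-queue anything left 'running' by a crash
    finally:
        db.close()
    # daemon thread: the single consumer dies with the process and never blocks shutdown
    threading.Thread(target=run_worker_loop, daemon=True).start()
    yield


app = FastAPI(lifespan=lifespan)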

View File

@@ -1,6 +1,8 @@
import json
import re
+import shutil
import subprocess
+import tempfile
from pathlib import Path

import requests
@@ -8,11 +10,13 @@ import requests
from backend.config import settings


-class DMXAPIService:
+class OpenAICompatibleParserService:
    def __init__(self) -> None:
        self.api_key = settings.api_key
        self.model_name = settings.model_name
-        self.api_url = settings.dmxapi_url
+        self.api_url = settings.openai_api_url

    def parse_file(self, file_path: str) -> list[dict]:
        path = Path(file_path)
@@ -51,33 +55,94 @@ class DMXAPIService:
    def _convert_to_pdf(self, path: Path) -> Path:
        pdf_path = path.with_suffix(".pdf")
        pdf_path.unlink(missing_ok=True)
+        source_path = path
+        temp_dir: str | None = None
-        cmd = [
-            "pandoc",
-            str(path),
-            "-o",
-            str(pdf_path),
-            "--pdf-engine=xelatex",
-            "-V",
-            "CJKmainfont=PingFang SC",
-        ]
-        result = subprocess.run(cmd, capture_output=True, text=True, timeout=90)
-        if result.returncode != 0:
-            fallback = [
+        if path.suffix.lower() == ".doc":
+            source_path, temp_dir = self._convert_doc_to_docx(path)
+        try:
+            cmd = [
                "pandoc",
-                str(path),
+                str(source_path),
                "-o",
                str(pdf_path),
-                "--pdf-engine=weasyprint",
+                "--pdf-engine=xelatex",
+                "-V",
+                "CJKmainfont=PingFang SC",
            ]
-            result = subprocess.run(fallback, capture_output=True, text=True, timeout=90)
+            result = subprocess.run(cmd, capture_output=True, text=True, timeout=90)
            if result.returncode != 0:
-                raise ValueError(f"文件转 PDF 失败: {result.stderr}")
-        return pdf_path
+                fallback = [
+                    "pandoc",
+                    str(source_path),
+                    "-o",
+                    str(pdf_path),
+                    "--pdf-engine=weasyprint",
+                ]
+                result = subprocess.run(fallback, capture_output=True, text=True, timeout=90)
+                if result.returncode != 0:
+                    raise ValueError(f"文件转 PDF 失败: {result.stderr}")
+            return pdf_path
+        finally:
+            if temp_dir:
+                shutil.rmtree(temp_dir, ignore_errors=True)

+    def _convert_doc_to_docx(self, path: Path) -> tuple[Path, str]:
+        temp_dir = tempfile.mkdtemp(prefix="pb_doc_convert_")
+        converted_path = Path(temp_dir) / f"{path.stem}.docx"
+        convert_errors: list[str] = []
+        if shutil.which("soffice"):
+            result = subprocess.run(
+                [
+                    "soffice",
+                    "--headless",
+                    "--convert-to",
+                    "docx",
+                    "--outdir",
+                    temp_dir,
+                    str(path),
+                ],
+                capture_output=True,
+                text=True,
+                timeout=120,
+            )
+            if result.returncode == 0 and converted_path.exists():
+                return converted_path, temp_dir
+            convert_errors.append(f"soffice: {(result.stderr or result.stdout).strip()}")
+        else:
+            convert_errors.append("soffice: 未安装")
+        if shutil.which("textutil"):
+            result = subprocess.run(
+                [
+                    "textutil",
+                    "-convert",
+                    "docx",
+                    "-output",
+                    str(converted_path),
+                    str(path),
+                ],
+                capture_output=True,
+                text=True,
+                timeout=120,
+            )
+            if result.returncode == 0 and converted_path.exists():
+                return converted_path, temp_dir
+            convert_errors.append(f"textutil: {(result.stderr or result.stdout).strip()}")
+        else:
+            convert_errors.append("textutil: 未安装")
+        raise ValueError(
+            "检测到 .doc 文件，pandoc 不支持直接转换。"
+            "已尝试自动转换为 .docx 但失败。请先把 .doc 另存为 .docx 后重试。"
+            f" 详细信息: {' | '.join(convert_errors)}"
+        )

    def _parse_with_file_url(self, file_url: str, original_filename: str) -> list[dict]:
        if not self.api_key:
-            raise ValueError("未配置 API_KEY，无法调用 DMXAPI")
+            raise ValueError("未配置 API_KEY，无法调用 OpenAI 兼容接口")

        payload = {
            "model": self.model_name,
@@ -99,7 +164,7 @@ class DMXAPIService:
            self.api_url, headers=headers, data=json.dumps(payload), timeout=180
        )
        if response.status_code != 200:
-            raise ValueError(f"DMXAPI 请求失败: {response.status_code} {response.text}")
+            raise ValueError(f"OpenAI 兼容接口请求失败: {response.status_code} {response.text}")
        return self._extract_questions(response.json())

    def _build_instruction(self, filename: str) -> str:
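
On configuration: the only setting change visible here is the rename from settings.dmxapi_url to settings.openai_api_url; backend/config.py itself is not part of this excerpt. A rough sketch of the fields the service reads — the env-var names and defaults below are assumptions; only api_key, model_name, and openai_api_url are taken from the code above:

# Hypothetical shape of backend.config.settings; any OpenAI-compatible
# chat-completions endpoint should work for openai_api_url.
import os
from dataclasses import dataclass


@dataclass
class Settings:
    api_key: str = os.getenv("API_KEY", "")  # the error message above suggests an API_KEY env var
    model_name: str = os.getenv("MODEL_NAME", "gpt-4o")  # default model id assumed
    openai_api_url: str = os.getenv(
        "OPENAI_API_URL", "https://api.openai.com/v1/chat/completions"
    )


settings = Settings()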