# problem-bank/backend/services/parser.py

import json
import re
import shutil
import subprocess
import tempfile
from pathlib import Path

import requests

from backend.config import settings


class OpenAICompatibleParserService:
    """Extract question records from a document via an OpenAI-compatible API.

    ``parse_file`` drives the pipeline: a non-PDF input is converted to PDF
    with pandoc, the PDF is uploaded to a public temporary file host, and the
    resulting URL is posted to the configured endpoint together with an
    extraction prompt. The text of the reply is parsed as a JSON array of
    question dicts.
    """

    # Optional keys that every returned question dict is guaranteed to carry.
    _OPTIONAL_FIELDS = ("选项A", "选项B", "选项C", "选项D", "解析", "备注")

    def __init__(self) -> None:
        """Read the API key, model name and endpoint URL from ``settings``."""
        self.api_key = settings.api_key
        self.model_name = settings.model_name
        self.api_url = settings.openai_api_url

    def parse_file(self, file_path: str) -> list[dict]:
        """Parse the document at *file_path* into a list of question dicts.

        Args:
            file_path: Path of the source document. Anything whose suffix is
                not ``.pdf`` (case-insensitive) is converted to PDF first.

        Returns:
            The question dicts produced by ``_extract_questions``.

        Raises:
            ValueError: If conversion, upload, the API call or parsing of the
                reply fails.
        """
        path = Path(file_path)
        is_pdf = path.suffix.lower() == ".pdf"
        pdf_path = path if is_pdf else self._convert_to_pdf(path)

        try:
            file_url = self._upload_to_temp_host(pdf_path)
            return self._parse_with_file_url(file_url, path.name)
        finally:
            # Only a PDF generated here is removed; a PDF supplied by the
            # caller is left in place.
            if pdf_path != path:
                pdf_path.unlink(missing_ok=True)

    def _upload_to_temp_host(self, path: Path) -> str:
        """Upload *path* to a temporary file host and return a download URL.

        file.io is tried first; on any failure tmpfiles.org is used instead.

        NOTE(review): both hosts are public third-party services, so the
        document leaves this machine — confirm that is acceptable for the
        data being parsed.

        Raises:
            ValueError: If the tmpfiles.org fallback also fails.
        """
        try:
            with path.open("rb") as f:
                response = requests.post("https://file.io", files={"file": f}, timeout=60)
            if response.status_code == 200:
                body = response.json()
                if body.get("success"):
                    return body["link"]
        except Exception:
            # Deliberate best-effort: whatever goes wrong with the first host
            # (network error, bad JSON, unexpected payload), fall through to
            # the second host instead of failing the whole parse.
            pass

        with path.open("rb") as f:
            response = requests.post("https://tmpfiles.org/api/v1/upload", files={"file": f}, timeout=60)
        if response.status_code != 200:
            raise ValueError(f"上传失败: {response.text}")
        data = response.json()
        if data.get("status") != "success":
            raise ValueError(f"上传失败: {data}")
        try:
            url = data["data"]["url"]
        except (KeyError, TypeError) as err:
            # A "success" payload without the expected URL is still a failed
            # upload; report it with the same error type as the other cases.
            raise ValueError(f"上传失败: {data}") from err
        # Rewrite the returned page URL into its "/dl/" form.
        return url.replace("tmpfiles.org/", "tmpfiles.org/dl/")

    def _convert_to_pdf(self, path: Path) -> Path:
        """Convert *path* to a PDF next to it and return the PDF path.

        ``.doc`` inputs are first converted to ``.docx`` in a temporary
        directory. pandoc is run with the xelatex engine and, if that exits
        non-zero, again with the weasyprint engine.

        Raises:
            ValueError: If both pandoc invocations fail, or if the ``.doc``
                pre-conversion fails.
        """
        pdf_path = path.with_suffix(".pdf")
        # NOTE(review): this deletes any pre-existing "<stem>.pdf" sitting
        # beside the input, even one this service did not create — confirm
        # inputs live in a directory where that is safe.
        pdf_path.unlink(missing_ok=True)
        source_path = path
        temp_dir: str | None = None

        if path.suffix.lower() == ".doc":
            source_path, temp_dir = self._convert_doc_to_docx(path)

        base_cmd = ["pandoc", str(source_path), "-o", str(pdf_path)]
        try:
            result = subprocess.run(
                [*base_cmd, "--pdf-engine=xelatex", "-V", "CJKmainfont=PingFang SC"],
                capture_output=True,
                text=True,
                timeout=90,
            )
            if result.returncode != 0:
                result = subprocess.run(
                    [*base_cmd, "--pdf-engine=weasyprint"],
                    capture_output=True,
                    text=True,
                    timeout=90,
                )
                if result.returncode != 0:
                    raise ValueError(f"文件转 PDF 失败: {result.stderr}")
            return pdf_path
        finally:
            # The intermediate .docx is no longer needed once pandoc has run.
            if temp_dir:
                shutil.rmtree(temp_dir, ignore_errors=True)

    def _convert_doc_to_docx(self, path: Path) -> tuple[Path, str]:
        """Convert a ``.doc`` file to ``.docx`` inside a fresh temp directory.

        ``soffice`` is tried first, then ``textutil``; a tool that is not on
        PATH is skipped.

        Returns:
            ``(converted_path, temp_dir)``. On success the caller owns
            *temp_dir* and must delete it.

        Raises:
            ValueError: If no available tool produced the ``.docx``. The
                temporary directory is removed before the error propagates.
        """
        temp_dir = tempfile.mkdtemp(prefix="pb_doc_convert_")
        converted_path = Path(temp_dir) / f"{path.stem}.docx"
        converters = (
            (
                "soffice",
                ["soffice", "--headless", "--convert-to", "docx", "--outdir", temp_dir, str(path)],
            ),
            (
                "textutil",
                ["textutil", "-convert", "docx", "-output", str(converted_path), str(path)],
            ),
        )
        convert_errors: list[str] = []

        try:
            for tool, cmd in converters:
                if not shutil.which(tool):
                    convert_errors.append(f"{tool}: 未安装")
                    continue
                result = subprocess.run(cmd, capture_output=True, text=True, timeout=120)
                if result.returncode == 0 and converted_path.exists():
                    return converted_path, temp_dir
                convert_errors.append(f"{tool}: {(result.stderr or result.stdout).strip()}")

            raise ValueError(
                "检测到 .doc 文件，pandoc 不支持直接转换。"
                "已尝试自动转换为 .docx 但失败。请先把 .doc 另存为 .docx 后重试。"
                f" 详细信息: {' | '.join(convert_errors)}"
            )
        except BaseException:
            # Ownership of temp_dir only passes to the caller on success, so
            # every failure path (including a subprocess timeout) must remove
            # it here or it is leaked.
            shutil.rmtree(temp_dir, ignore_errors=True)
            raise

    def _parse_with_file_url(self, file_url: str, original_filename: str) -> list[dict]:
        """Ask the configured endpoint to extract questions from *file_url*.

        Args:
            file_url: URL of the uploaded document.
            original_filename: Name of the source file, quoted in the prompt.

        Raises:
            ValueError: If no API key is configured, the endpoint does not
                answer with HTTP 200, or the reply cannot be parsed.
        """
        if not self.api_key:
            raise ValueError("未配置 API_KEY，无法调用 OpenAI 兼容接口")

        payload = {
            "model": self.model_name,
            "input": [
                {
                    "role": "user",
                    "content": [
                        {"type": "input_file", "file_url": file_url},
                        {"type": "input_text", "text": self._build_instruction(original_filename)},
                    ],
                }
            ],
        }
        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
        }
        response = requests.post(
            self.api_url, headers=headers, data=json.dumps(payload), timeout=180
        )
        if response.status_code != 200:
            raise ValueError(f"OpenAI 兼容接口请求失败: {response.status_code} {response.text}")
        return self._extract_questions(response.json())

    def _build_instruction(self, filename: str) -> str:
        """Return the extraction prompt sent to the model, quoting *filename*."""
        return f"""请从文件\"{filename}\"中提取所有题目信息，并以JSON数组格式返回。

提取字段：题干、选项A、选项B、选项C、选项D、正确答案、解析、备注。
数学公式统一转为 LaTeX，内联公式使用 $...$，独立公式使用 $$...$$。
答案统一大写字母，清理“答案:”等前缀。
仅返回 JSON 数组，不要返回额外说明。"""

    @staticmethod
    def _first_output_text(response: dict) -> str:
        """Return the first non-empty ``output_text`` in *response*, or ``""``."""
        for item in response.get("output") or []:
            if item.get("type") != "message":
                continue
            for content in item.get("content") or []:
                if content.get("type") == "output_text" and content.get("text"):
                    return content["text"]
        return ""

    def _extract_questions(self, response: dict) -> list[dict]:
        """Turn an API response body into a list of question dicts.

        The first non-empty ``output_text`` found in the ``message`` items of
        ``response["output"]`` is parsed; later items cannot override it.
        Each question is then given empty-string defaults for the optional
        fields.

        Raises:
            ValueError: If ``status`` is not ``"completed"``, no text is
                found, or the text does not hold a valid question array.
        """
        status = response.get("status")
        if status != "completed":
            raise ValueError(f"响应状态异常: {status}")

        text = self._first_output_text(response)
        if not text:
            raise ValueError("未在响应中找到文本内容")

        questions = self._parse_json(text)
        for q in questions:
            for field in self._OPTIONAL_FIELDS:
                q.setdefault(field, "")
        return questions

    def _parse_json(self, text: str) -> list[dict]:
        """Parse the JSON array embedded in *text* and normalize answers.

        Everything from the first ``[`` to the last ``]`` is decoded, so
        surrounding prose or code fences are tolerated. Each item must be an
        object with the keys ``题干`` and ``正确答案``; the answer is
        upper-cased and stripped of every character other than A-D and 0-9.

        Raises:
            ValueError: If no array is found, decoding fails
                (``json.JSONDecodeError`` is a ``ValueError``), or an item is
                not an object or lacks a required key.
        """
        start_idx = text.find("[")
        end_idx = text.rfind("]")
        # end_idx < start_idx also covers a missing "]" (-1) and a "]" that
        # only appears before the first "[".
        if start_idx < 0 or end_idx < start_idx:
            raise ValueError(f"未找到 JSON 数组: {text[:200]}")

        data = json.loads(text[start_idx : end_idx + 1])
        if not isinstance(data, list):
            raise ValueError("解析结果不是数组")

        for index, item in enumerate(data, start=1):
            if not isinstance(item, dict):
                # Without this guard a string or number item would surface as
                # a TypeError instead of the ValueError callers expect.
                raise ValueError(f"第 {index} 题格式错误: 应为 JSON 对象")
            if "题干" not in item or "正确答案" not in item:
                raise ValueError(f"第 {index} 题缺少题干或正确答案")
            item["正确答案"] = re.sub(r"[^A-D0-9]", "", str(item["正确答案"]).upper())
        return data


def extract_metadata(filename: str) -> dict:
    """Derive question metadata from the stem of *filename*.

    The stem is split on the first separator it contains, checked in the
    order ``+``, space, ``-``, ``_``. The resulting pieces are read
    positionally as secondary knowledge point, raw question type and
    difficulty; missing pieces become empty strings. The raw type is
    translated through ``settings.type_map`` when it has an entry there. The
    chapter is the text before the first "的" in the secondary knowledge
    point, or "未分类" when that is empty.

    Returns:
        A dict with the keys ``chapter``, ``secondary_knowledge``,
        ``question_type``, ``difficulty`` and ``source_file``.
    """
    stem = Path(filename).stem

    # Only the highest-priority separator present is used for splitting.
    delimiter = next((sep for sep in ("+", " ", "-", "_") if sep in stem), None)
    pieces = [stem] if delimiter is None else stem.split(delimiter)

    # Keep the first three pieces, padding with "" when fewer are present.
    fields = [piece.strip() for piece in pieces[:3]]
    fields.extend([""] * (3 - len(fields)))
    secondary, raw_type, difficulty = fields

    return {
        "chapter": secondary.partition("的")[0] or "未分类",
        "secondary_knowledge": secondary,
        "question_type": settings.type_map.get(raw_type, raw_type),
        "difficulty": difficulty,
        "source_file": filename,
    }