first commit

2026-03-05 11:50:15 +08:00
commit b1b14fd964
45 changed files with 7779 additions and 0 deletions
--- a/analysis.py
+++ b/analysis.py
@@ -0,0 +1,550 @@
+#!/usr/bin/env python3
+"""
+Direct parsing through DMXAPI - no local text extraction required.
+
+The file is uploaded to a temporary hosting service and then analysed via
+DMXAPI's `responses` endpoint.
+"""
+import sys
+import json
+import requests
+from pathlib import Path
+
+# Put this file's directory on the import path before importing `config`
+sys.path.insert(0, str(Path(__file__).parent))
+
+from config import settings
+from config.database import get_db_manager
+
+
+class DMXAPIParser:
+    """Extract exam questions from a document through DMXAPI.
+
+    Workflow (see `parse_file`): convert the document to PDF when needed,
+    upload the PDF to a temporary public host, ask the model to return the
+    questions as a JSON array, and parse that array.
+    """
+
+    def __init__(self):
+        self.api_key = settings.API_KEY
+        self.model_name = settings.MODEL_NAME
+        self.api_url = "https://www.dmxapi.cn/v1/responses"
+
+    def upload_to_temp_host(self, file_path):
+        """
+        Upload a file to a temporary hosting service and return its URL.
+
+        file.io is tried first; if that fails for any reason, tmpfiles.org
+        is used as a fallback.
+
+        Args:
+            file_path: Path of the file to upload.
+
+        Returns:
+            Publicly reachable URL of the uploaded file.
+
+        Raises:
+            Exception: If both hosting services fail. The fallback error is
+                attached as the cause.
+        """
+        path = Path(file_path)
+
+        print(f'  上传PDF到临时托管服务...')
+        print(f'  文件: {path.name}')
+        print(f'  大小: {path.stat().st_size / 1024:.2f} KB')
+
+        try:
+            file_url = self._upload_to_file_io(file_path)
+        except Exception as e:
+            # Best-effort fallback: any failure of the primary host
+            # (network, HTTP status, unexpected payload) triggers it.
+            print(f'  ✗ file.io上传失败: {e}')
+            print('  尝试备用服务...')
+            try:
+                file_url = self._upload_to_tmpfiles(file_path)
+            except Exception as e2:
+                raise Exception(f'所有上传服务均失败: {e2}') from e2
+            print(f'  ✓ 上传成功（备用服务）')
+        else:
+            print(f'  ✓ 上传成功')
+
+        print(f'  URL: {file_url}')
+        return file_url
+
+    def _upload_to_file_io(self, file_path):
+        """Upload to file.io and return the link from its JSON reply."""
+        with open(file_path, 'rb') as f:
+            response = requests.post(
+                'https://file.io',
+                files={'file': f},
+                timeout=60
+            )
+
+        if response.status_code != 200:
+            raise Exception(f'HTTP错误: {response.status_code} - {response.text}')
+
+        result = response.json()
+        if not result.get('success'):
+            raise Exception(f'上传失败: {result}')
+        return result['link']
+
+    def _upload_to_tmpfiles(self, file_path):
+        """Upload to tmpfiles.org and return a direct-download URL."""
+        with open(file_path, 'rb') as f:
+            response = requests.post(
+                'https://tmpfiles.org/api/v1/upload',
+                files={'file': f},
+                timeout=60
+            )
+
+        if response.status_code == 200:
+            result = response.json()
+            if result.get('status') == 'success':
+                url = result['data']['url']
+                # Rewrite the returned page URL into its /dl/ form; only the
+                # first occurrence (the host part) is rewritten.
+                return url.replace('tmpfiles.org/', 'tmpfiles.org/dl/', 1)
+
+        raise Exception(f'备用服务也失败: {response.text}')
+
+    def parse_file(self, file_path):
+        """
+        Analyse a file with DMXAPI.
+
+        Non-PDF input is first converted to a temporary PDF, which is removed
+        again when this method finishes - including when upload or analysis
+        fails.
+
+        Args:
+            file_path: Path of the document to analyse.
+
+        Returns:
+            List of parsed question dicts.
+        """
+        print(f'\n[2/4] 准备文件并调用DMXAPI...')
+
+        path = Path(file_path)
+
+        # Anything that is not already a PDF goes through pandoc first
+        converted = path.suffix.lower() != '.pdf'
+        if converted:
+            print(f'  检测到 {path.suffix} 文件，使用pandoc转换为PDF')
+            pdf_path = self._convert_to_pdf(file_path)
+        else:
+            print(f'  检测到PDF文件，直接使用')
+            pdf_path = file_path
+
+        try:
+            file_url = self.upload_to_temp_host(pdf_path)
+            return self._parse_with_file_url(file_url, path.name)
+        finally:
+            # Only ever delete a PDF this call generated, never the input
+            if converted and str(pdf_path) != str(file_path):
+                try:
+                    Path(pdf_path).unlink()
+                    print(f'  ✓ 已清理临时PDF文件')
+                except OSError:
+                    # Cleanup is best effort; a leftover temp file is harmless
+                    pass
+
+    def _convert_to_pdf(self, file_path):
+        """
+        Convert a document to PDF with pandoc.
+
+        The PDF is written under a unique name in the system temp directory,
+        so an existing `<name>.pdf` next to the source file is never
+        overwritten or deleted. xelatex is tried first, then weasyprint.
+
+        Args:
+            file_path: Path of the source document.
+
+        Returns:
+            Path (str) of the generated PDF.
+
+        Raises:
+            Exception: If pandoc is missing, times out, both engines fail,
+                or no PDF was produced.
+        """
+        import subprocess
+        import tempfile
+        import uuid
+
+        path = Path(file_path)
+        pdf_path = Path(tempfile.gettempdir()) / f'{path.stem}_{uuid.uuid4().hex}.pdf'
+
+        def run_pandoc(engine_args):
+            return subprocess.run(
+                ['pandoc', str(path), '-o', str(pdf_path), *engine_args],
+                capture_output=True,
+                text=True,
+                timeout=60
+            )
+
+        print(f'  运行: pandoc (使用xelatex引擎)')
+
+        succeeded = False
+        try:
+            # xelatex handles CJK text; the font chosen here is a macOS font
+            result = run_pandoc([
+                '--pdf-engine=xelatex',
+                '-V', 'CJKmainfont=PingFang SC'
+            ])
+
+            if result.returncode != 0:
+                xelatex_error = result.stderr
+                print(f'  xelatex失败，尝试weasyprint引擎...')
+                result = run_pandoc(['--pdf-engine=weasyprint'])
+
+                if result.returncode != 0:
+                    raise Exception(
+                        f'所有PDF引擎均失败\nxelatex错误: {xelatex_error}'
+                        f'\nweasyprint错误: {result.stderr}'
+                    )
+
+            if not pdf_path.exists():
+                raise Exception('PDF文件未生成')
+
+            print(f'  ✓ 转换成功: {pdf_path.name}')
+            succeeded = True
+            return str(pdf_path)
+
+        except FileNotFoundError as e:
+            raise Exception('未找到pandoc命令，请先安装pandoc\n  macOS: brew install pandoc\n  Ubuntu: sudo apt install pandoc') from e
+        except subprocess.TimeoutExpired as e:
+            raise Exception('pandoc转换超时') from e
+        finally:
+            if not succeeded:
+                # Do not leave a partial PDF behind after a failed run
+                try:
+                    pdf_path.unlink()
+                except OSError:
+                    pass
+
+    def _parse_with_file_url(self, file_url, original_filename):
+        """
+        Ask the model to analyse the PDF located at `file_url`.
+
+        Args:
+            file_url: Public URL of the uploaded PDF.
+            original_filename: Name of the original document, quoted in the
+                instruction text sent to the model.
+
+        Returns:
+            List of parsed question dicts.
+
+        Raises:
+            Exception: If the API answers with a non-200 status.
+            ValueError: If the response cannot be turned into questions.
+        """
+        print(f'  文件URL: {file_url}')
+        print(f'  模型: {self.model_name}')
+        print(f'  正在分析...')
+
+        # One user message carrying the file reference plus the instruction
+        payload = {
+            "model": self.model_name,
+            "input": [{
+                "role": "user",
+                "content": [
+                    {
+                        "type": "input_file",
+                        "file_url": file_url
+                    },
+                    {
+                        "type": "input_text",
+                        "text": self._build_instruction(original_filename)
+                    }
+                ]
+            }]
+        }
+
+        headers = {
+            "Authorization": f"Bearer {self.api_key}",
+            "Content-Type": "application/json"
+        }
+
+        response = requests.post(
+            self.api_url,
+            headers=headers,
+            data=json.dumps(payload),
+            timeout=180  # 3-minute timeout
+        )
+
+        if response.status_code != 200:
+            raise Exception(f'API请求失败: {response.status_code} - {response.text}')
+
+        questions = self._extract_questions(response.json())
+
+        print(f'  ✓ 解析完成，共 {len(questions)} 个题目')
+
+        return questions
+
+    def _build_instruction(self, filename):
+        """Build the extraction prompt sent to the model.
+
+        The prompt is an f-string, so literal braces in the LaTeX and JSON
+        examples are written doubled.
+        """
+        return f"""请从文件"{filename}"中提取所有题目信息，并以JSON数组格式返回。
+
+**提取要求：**
+
+1. **题目识别**：
+   - 如果文档中有"修改试题"作为分隔符，请以此分割题目
+   - 否则根据题目序号、题干、选项、答案、解析的结构识别每个题目
+
+2. **字段提取**：
+   - 题干：题目的问题部分
+   - 选项A、选项B、选项C、选项D：选择题的选项（填空题或解答题为空字符串）
+   - 正确答案：答案内容（单选如"A"，多选如"ABD"）
+   - 解析：题目的解答过程
+   - 备注：提取"难度 难"或"属性:共享 难度:易 采用:是"等元数据
+
+3. **数学公式格式要求（重要）：**
+   - **所有数学公式必须转换为LaTeX格式**
+   - 内联公式使用 $...$ 包围
+   - 独立公式使用 $$...$$ 包围
+   - 示例：
+     * "𝑥²" → "$x^2$"
+     * "sin x" → "$\\sin x$"
+     * "∫₀¹ x dx" → "$\\int_0^1 x \\, dx$"
+     * "f(x) = x² + 1" → "$f(x) = x^2 + 1$"
+     * 分数：a/b → "$\\frac{{a}}{{b}}$"
+     * 根号：√x → "$\\sqrt{{x}}$"
+     * 上下标：xₙ → "$x_n$"，x² → "$x^2$"
+
+4. **其他格式要求**：
+   - 答案统一为大写字母（选择题）
+   - 移除答案前缀（如"答案："、"选"等）
+   - 保留其他文本格式
+
+**返回格式（只返回JSON数组）：**
+
+```json
+[
+  {{
+    "题干": "下列函数中定义域为R的是",
+    "选项A": "$y = \\frac{{1}}{{x}}$",
+    "选项B": "$y = \\sqrt{{x}}$",
+    "选项C": "$y = x^2$",
+    "选项D": "$y = \\ln(x)$",
+    "正确答案": "C",
+    "解析": "$x^2$对所有实数都有定义",
+    "备注": "难度 易"
+  }}
+]
+```
+
+**注意：请确保所有数学符号、公式都转换为LaTeX格式！**"""
+
+    def _extract_questions(self, response):
+        """
+        Pull the question list out of a decoded DMXAPI response.
+
+        The first non-empty `output_text` part found in a `message` output
+        item is parsed as JSON. Optional fields missing from a question are
+        filled with empty strings.
+
+        Args:
+            response: Decoded JSON body of the API response (dict).
+
+        Returns:
+            List of question dicts.
+
+        Raises:
+            ValueError: If the status is not 'completed', no text is present,
+                or the text is not a valid question array.
+        """
+        if response.get('status') != 'completed':
+            raise ValueError(f'响应状态异常: {response.get("status")}')
+
+        text_content = None
+        # `or []` tolerates explicit nulls in the payload
+        for item in response.get('output') or []:
+            if item.get('type') != 'message':
+                continue
+            for part in item.get('content') or []:
+                # Skip empty text parts instead of stopping at them
+                if part.get('type') == 'output_text' and part.get('text'):
+                    text_content = part['text']
+                    break
+            if text_content:
+                break
+
+        if not text_content:
+            raise ValueError('未找到文本内容')
+
+        questions = self._parse_json(text_content)
+
+        # Fill in optional fields the model left out
+        for q in questions:
+            for field in ('选项A', '选项B', '选项C', '选项D', '解析', '备注'):
+                q.setdefault(field, '')
+
+        return questions
+
+    def _parse_json(self, text):
+        """
+        Parse the JSON question array contained in the model's reply.
+
+        A fenced ```json block is tried first, so brackets in surrounding
+        prose cannot confuse the search; the whole text is the fallback. In
+        each candidate the span from the first '[' to the last ']' is decoded.
+
+        Args:
+            text: Raw text returned by the model.
+
+        Returns:
+            List of question dicts, each containing '题干' and '正确答案'.
+
+        Raises:
+            ValueError: If no array is found, decoding fails, or an element
+                is not an object with the required fields.
+        """
+        candidates = []
+        fence = '```json'
+        fence_start = text.find(fence)
+        if fence_start != -1:
+            body_start = fence_start + len(fence)
+            body_end = text.find('```', body_start)
+            candidates.append(text[body_start:] if body_end == -1 else text[body_start:body_end])
+        candidates.append(text)
+
+        decode_error = None
+        for candidate in candidates:
+            start_idx = candidate.find('[')
+            end_idx = candidate.rfind(']')
+            if start_idx == -1 or end_idx <= start_idx:
+                continue
+
+            json_str = candidate[start_idx:end_idx + 1]
+            try:
+                questions = json.loads(json_str)
+            except json.JSONDecodeError as e:
+                # Remember the failure and try the next candidate, if any
+                decode_error = ValueError(f'JSON解析失败: {e}\nJSON: {json_str[:300]}...')
+                decode_error.__cause__ = e
+                continue
+
+            if not isinstance(questions, list):
+                raise ValueError('解析结果不是数组')
+
+            # Validate required fields
+            for i, q in enumerate(questions):
+                if not isinstance(q, dict):
+                    raise ValueError(f'第 {i+1} 个题目不是JSON对象')
+                if '题干' not in q or '正确答案' not in q:
+                    raise ValueError(f'第 {i+1} 个题目缺少必填字段')
+
+            return questions
+
+        if decode_error is not None:
+            raise decode_error
+        raise ValueError(f'未找到JSON数组\n文本: {text[:200]}...')
+
+
+def extract_metadata(filename):
+    """
+    Derive question metadata from a file name.
+
+    The stem is split on the first of '+', ' ', '-', '_' that occurs in it;
+    the segments are read as knowledge point, question type and difficulty.
+    Empty segments (from leading or repeated separators) are dropped so they
+    cannot shift the fields.
+
+    Args:
+        filename: File name or path; only the stem is inspected.
+
+    Returns:
+        Dict with the keys '章节', '二级知识点', '题目类型', '难度' and
+        '文件路径' (the `filename` argument as given).
+    """
+    from config.settings import TYPE_MAP
+
+    basename = Path(filename).stem
+
+    # Split on the first separator present, in priority order
+    parts = [basename]
+    for sep in ('+', ' ', '-', '_'):
+        if sep in basename:
+            parts = basename.split(sep)
+            break
+
+    # Discard blank segments such as those produced by "a  b" or "a++b"
+    parts = [part.strip() for part in parts if part.strip()]
+
+    secondary_knowledge = parts[0] if parts else ''
+    question_type_raw = parts[1] if len(parts) > 1 else ''
+    difficulty = parts[2] if len(parts) > 2 else ''
+
+    # Map the raw type label; keep the raw label when it is already valid
+    # but the mapping produced something that is not
+    question_type = TYPE_MAP.get(question_type_raw, question_type_raw)
+    valid_types = ['单选', '多选', '不定项', '填空', '解答']
+    if question_type not in valid_types and question_type_raw in valid_types:
+        question_type = question_type_raw
+
+    # Chapter: the text before the first '的', or the whole knowledge point
+    chapter = secondary_knowledge.split('的')[0] or '未分类'
+
+    return {
+        '章节': chapter,
+        '二级知识点': secondary_knowledge,
+        '题目类型': question_type,
+        '难度': difficulty,
+        '文件路径': filename
+    }
+
+
+def save_to_database(questions, metadata):
+    """
+    Insert the parsed questions into the `questions` table.
+
+    Answers of choice questions are normalised to upper-case option letters
+    (characters other than A-D and digits are removed). Answers of other
+    questions (fill-in, worked solutions) are stored as written, because the
+    letter filter would destroy free-text or LaTeX answers.
+
+    Args:
+        questions: List of question dicts as produced by the parser.
+        metadata: Dict with '章节', '二级知识点', '题目类型', '难度' and
+            '文件路径'.
+
+    Returns:
+        List of the values returned by `db.execute` for each insert.
+    """
+    print(f'  保存题目到数据库...')
+
+    import re
+    db = get_db_manager()
+    saved_ids = []
+
+    choice_types = ('单选', '多选', '不定项')
+    option_fields = ('选项A', '选项B', '选项C', '选项D')
+
+    sql = """
+        INSERT INTO questions
+        (章节, 一级知识点, 二级知识点, 题目类型, 难度, 题干,
+         选项A, 选项B, 选项C, 选项D, 正确答案, 解析, 备注, 文件路径)
+        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+        """
+
+    for i, q_data in enumerate(questions, 1):
+        # The model may return null or a non-string value for the answer
+        raw_answer = q_data.get('正确答案')
+        answer = '' if raw_answer is None else str(raw_answer).strip()
+
+        # Treat the question as multiple choice when the file-level type says
+        # so or when the question itself carries option text
+        is_choice = (
+            metadata['题目类型'] in choice_types
+            or any(q_data.get(field) for field in option_fields)
+        )
+        if is_choice:
+            answer = re.sub(r'[^A-D0-9]', '', answer.upper())
+
+        params = (
+            metadata['章节'], '', metadata['二级知识点'],
+            metadata['题目类型'], metadata['难度'], q_data.get('题干', ''),
+            q_data.get('选项A', ''), q_data.get('选项B', ''),
+            q_data.get('选项C', ''), q_data.get('选项D', ''),
+            answer, q_data.get('解析', ''),
+            q_data.get('备注', ''), metadata['文件路径']
+        )
+
+        question_id = db.execute(sql, params)
+        saved_ids.append(question_id)
+        print(f'  ✓ 题目 {i}/{len(questions)}, ID={question_id}')
+
+    return saved_ids
+
+
+def display_results(questions, metadata, saved_ids):
+    """
+    Print a summary of the processed file.
+
+    Shows the file-level metadata, a short preview of every question
+    (stem, option A if present, answer, remark if present) and finally the
+    total number of rows in the `questions` table.
+
+    Args:
+        questions: List of question dicts.
+        metadata: Dict with '文件路径', '章节', '二级知识点', '题目类型', '难度'.
+        saved_ids: IDs matching `questions` by position; when empty, each
+            question is shown as not saved.
+    """
+    divider = '=' * 60
+
+    print(f'\n[4/4] 完成')
+    print(divider)
+    print(f'文件: {metadata["文件路径"]}')
+    print(f'\n元数据:')
+    print(f'  章节: {metadata["章节"]}')
+    print(f'  知识点: {metadata["二级知识点"]}')
+    print(f'  题型: {metadata["题目类型"]}')
+    print(f'  难度: {metadata["难度"]}')
+
+    print(f'\n题目列表 (共 {len(questions)} 个):')
+    print(divider)
+
+    for number, question in enumerate(questions, 1):
+        shown_id = saved_ids[number - 1] if saved_ids else "未保存"
+        print(f'\n题目 {number} (ID={shown_id}):')
+
+        # Previews are truncated; the ellipsis is always appended
+        stem_preview = question["题干"][:60]
+        print(f'  题干: {stem_preview}...')
+
+        option_a = question.get('选项A')
+        if option_a:
+            print(f'  选项A: {option_a[:40]}...')
+
+        print(f'  答案: {question["正确答案"]}')
+
+        remark = question.get('备注')
+        if remark:
+            print(f'  备注: {remark}')
+
+    print(divider)
+    print(f'\n✓ 所有题目已保存到数据库')
+
+    # Database-wide row count
+    stats = get_db_manager().execute("SELECT COUNT(*) as count FROM questions", fetch_one=True)
+    print(f'✓ 数据库中共有 {stats["count"]} 个题目')
+
+
+def main():
+    """
+    Command-line entry point.
+
+    Expects the document path as the first argument, runs the parse ->
+    metadata -> save -> display pipeline, and exits with status 0 on success
+    or 1 on a usage error, a missing file, an empty result or any failure.
+    """
+    if len(sys.argv) < 2:
+        # Show the name the script was actually started with rather than a
+        # hard-coded one that goes stale when the file is renamed
+        script = Path(sys.argv[0]).name or 'analysis.py'
+        print(f'用法: python {script} <文件路径>')
+        print('\n示例:')
+        print(f'  python {script} "函数的周期性 单选题 难.docx"')
+        print(f'  python {script} test.pdf')
+        print('\n说明:')
+        print('  - 支持格式: PDF, Word (.docx, .doc)')
+        print('  - 文件名格式: 知识点+题型+难度.扩展名')
+        print('  - 不在本地解析，直接上传到DMXAPI分析')
+        sys.exit(1)
+
+    file_path = sys.argv[1]
+    path = Path(file_path)
+
+    if not path.exists():
+        print(f'✗ 文件不存在: {file_path}')
+        sys.exit(1)
+
+    print('='*60)
+    print('DMXAPI 文件解析 (无本地解析)')
+    print('='*60)
+
+    try:
+        parser = DMXAPIParser()
+
+        # Conversion, upload and DMXAPI analysis all happen in parse_file
+        questions = parser.parse_file(file_path)
+
+        if not questions:
+            print('\n✗ 未解析到任何题目')
+            sys.exit(1)
+
+        # [3/4] Metadata comes from the file name only
+        print(f'\n[3/4] 提取元数据...')
+        metadata = extract_metadata(path.name)
+
+        # [4/4] Persist and report
+        saved_ids = save_to_database(questions, metadata)
+        display_results(questions, metadata, saved_ids)
+
+        print('\n✓ 处理完成！')
+        sys.exit(0)
+
+    except Exception as e:
+        # Top-level boundary: report, print the traceback, exit non-zero.
+        # The sys.exit() calls above raise SystemExit, which is not caught here.
+        print(f'\n✗ 处理失败: {e}')
+        import traceback
+        traceback.print_exc()
+        sys.exit(1)
+
+
+# Run the CLI only when this file is executed as a script, not on import
+if __name__ == '__main__':
+    main()