first commit
This commit is contained in:
550
analysis.py
Normal file
550
analysis.py
Normal file
@@ -0,0 +1,550 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
DMXAPI直接解析 - 无需本地文本提取
|
||||
将文件上传到临时托管服务,然后使用DMXAPI的responses接口分析
|
||||
"""
|
||||
import sys
|
||||
import json
|
||||
import requests
|
||||
from pathlib import Path
|
||||
|
||||
# 添加项目路径
|
||||
sys.path.insert(0, str(Path(__file__).parent))
|
||||
|
||||
from config import settings
|
||||
from config.database import get_db_manager
|
||||
|
||||
|
||||
class DMXAPIParser:
    """Extract exam questions from a document via the DMXAPI ``responses`` API.

    The document is not parsed locally. It is converted to PDF when needed,
    uploaded to a temporary public file host, and the resulting URL is sent
    to the model together with an extraction prompt. The model's reply is
    expected to contain a JSON array of question objects.
    """

    def __init__(self):
        """Load the API key and model name from the project ``settings``."""
        self.api_key = settings.API_KEY
        self.model_name = settings.MODEL_NAME
        self.api_url = "https://www.dmxapi.cn/v1/responses"

    def upload_to_temp_host(self, file_path):
        """Upload a file to a temporary public host and return its URL.

        file.io is tried first; on any failure the upload is retried against
        tmpfiles.org.

        Args:
            file_path: Path of the file to upload.

        Returns:
            Public URL of the uploaded file.

        Raises:
            Exception: If the fallback service fails as well (chained to the
                underlying error).
        """
        path = Path(file_path)

        print(f' 上传PDF到临时托管服务...')
        print(f' 文件: {path.name}')
        print(f' 大小: {path.stat().st_size / 1024:.2f} KB')

        # Primary host: file.io (per the original author's note: links are
        # valid for 24 hours and removed after one download).
        try:
            with open(file_path, 'rb') as f:
                response = requests.post(
                    'https://file.io',
                    files={'file': f},
                    timeout=60
                )

            if response.status_code != 200:
                raise Exception(f'HTTP错误: {response.status_code} - {response.text}')

            result = response.json()
            if not result.get('success'):
                raise Exception(f'上传失败: {result}')

            file_url = result['link']
            print(f' ✓ 上传成功')
            print(f' URL: {file_url}')
            return file_url
        except Exception as e:
            # Deliberately broad: any failure here just triggers the fallback.
            print(f' ✗ file.io上传失败: {e}')
            print(' 尝试备用服务...')

        # Fallback host: tmpfiles.org (per the original author's note: files
        # are kept for 7 days).
        try:
            with open(file_path, 'rb') as f:
                response = requests.post(
                    'https://tmpfiles.org/api/v1/upload',
                    files={'file': f},
                    timeout=60
                )

            if response.status_code == 200:
                result = response.json()
                if result.get('status') == 'success':
                    # The API returns a landing-page URL; inserting "dl/"
                    # after the host turns it into a direct-download link.
                    url = result['data']['url']
                    file_url = url.replace('tmpfiles.org/', 'tmpfiles.org/dl/', 1)
                    print(f' ✓ 上传成功(备用服务)')
                    print(f' URL: {file_url}')
                    return file_url

            raise Exception(f'备用服务也失败: {response.text}')
        except Exception as e2:
            raise Exception(f'所有上传服务均失败: {e2}') from e2

    def parse_file(self, file_path):
        """Analyse a file with DMXAPI and return the extracted questions.

        Non-PDF inputs are first converted to a temporary PDF, which is
        removed again whether or not the upload/analysis succeeds.

        Args:
            file_path: Path of the document to analyse.

        Returns:
            List of question dicts parsed from the model response.
        """
        print(f'\n[2/4] 准备文件并调用DMXAPI...')

        path = Path(file_path)

        # Anything that is not already a PDF goes through pandoc first.
        converted = path.suffix.lower() != '.pdf'
        if converted:
            print(f' 检测到 {path.suffix} 文件,使用pandoc转换为PDF')
            pdf_path = self._convert_to_pdf(file_path)
        else:
            print(f' 检测到PDF文件,直接使用')
            pdf_path = file_path

        try:
            file_url = self.upload_to_temp_host(pdf_path)
            return self._parse_with_file_url(file_url, path.name)
        finally:
            # Best-effort removal of the PDF we generated ourselves; the
            # caller's own file is never touched.
            if converted:
                try:
                    Path(pdf_path).unlink()
                    print(f' ✓ 已清理临时PDF文件')
                except OSError as cleanup_error:
                    print(f' ! 临时PDF清理失败: {cleanup_error}')

    def _convert_to_pdf(self, file_path):
        """Convert a document to PDF with pandoc.

        The xelatex engine is tried first (with a CJK font so Chinese text
        renders); if pandoc exits non-zero, weasyprint is tried instead.
        The output is written to a fresh temporary file so that an existing
        ``<name>.pdf`` next to the input is never overwritten or deleted.

        Args:
            file_path: Path of the source document.

        Returns:
            Path (str) of the generated PDF. The caller owns the file and is
            responsible for deleting it.

        Raises:
            Exception: If pandoc is not installed, times out, both engines
                fail, or no PDF content was produced.
        """
        import subprocess
        import tempfile

        path = Path(file_path)

        # Reserve a unique output path; pandoc overwrites the empty file.
        with tempfile.NamedTemporaryFile(
            prefix=f'{path.stem}_', suffix='.pdf', delete=False
        ) as placeholder:
            pdf_path = Path(placeholder.name)

        print(f' 运行: pandoc (使用xelatex引擎)')

        succeeded = False
        try:
            # xelatex handles Chinese text when given a CJK main font.
            result = subprocess.run(
                [
                    'pandoc',
                    str(path),
                    '-o', str(pdf_path),
                    '--pdf-engine=xelatex',
                    '-V', 'CJKmainfont=PingFang SC'  # macOS Chinese font
                ],
                capture_output=True,
                text=True,
                timeout=60
            )

            if result.returncode != 0:
                # Keep the xelatex diagnostics before `result` is reused.
                xelatex_error = result.stderr
                print(f' xelatex失败,尝试weasyprint引擎...')
                result = subprocess.run(
                    [
                        'pandoc',
                        str(path),
                        '-o', str(pdf_path),
                        '--pdf-engine=weasyprint'
                    ],
                    capture_output=True,
                    text=True,
                    timeout=60
                )

                if result.returncode != 0:
                    raise Exception(
                        f'所有PDF引擎均失败\nxelatex错误: {xelatex_error}'
                        f'\nweasyprint错误: {result.stderr}'
                    )

            # The placeholder always exists, so also require actual content.
            if not pdf_path.exists() or pdf_path.stat().st_size == 0:
                raise Exception('PDF文件未生成')

            print(f' ✓ 转换成功: {pdf_path.name}')
            succeeded = True
            return str(pdf_path)

        except FileNotFoundError as exc:
            raise Exception('未找到pandoc命令,请先安装pandoc\n macOS: brew install pandoc\n Ubuntu: sudo apt install pandoc') from exc
        except subprocess.TimeoutExpired as exc:
            raise Exception('pandoc转换超时') from exc
        finally:
            # Do not leave an empty/partial temp file behind on failure.
            if not succeeded:
                try:
                    pdf_path.unlink()
                except OSError:
                    pass

    def _parse_with_file_url(self, file_url, original_filename):
        """Ask the model to analyse the PDF at ``file_url``.

        Args:
            file_url: Publicly reachable URL of the PDF.
            original_filename: Name of the original document, quoted in the
                prompt sent to the model.

        Returns:
            List of question dicts extracted from the response.

        Raises:
            Exception: If the API answers with a non-200 status code.
            ValueError: If the response cannot be turned into questions.
        """
        print(f' 文件URL: {file_url}')
        print(f' 模型: {self.model_name}')
        print(f' 正在分析...')

        # One user message carrying the file reference plus the instructions.
        payload = {
            "model": self.model_name,
            "input": [{
                "role": "user",
                "content": [
                    {
                        "type": "input_file",
                        "file_url": file_url
                    },
                    {
                        "type": "input_text",
                        "text": self._build_instruction(original_filename)
                    }
                ]
            }]
        }

        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json"
        }

        response = requests.post(
            self.api_url,
            headers=headers,
            data=json.dumps(payload),
            timeout=180  # 3-minute timeout
        )

        if response.status_code != 200:
            raise Exception(f'API请求失败: {response.status_code} - {response.text}')

        questions = self._extract_questions(response.json())

        print(f' ✓ 解析完成,共 {len(questions)} 个题目')

        return questions

    def _build_instruction(self, filename):
        """Build the extraction prompt sent to the model.

        Args:
            filename: Document name interpolated into the first line.

        Returns:
            The prompt text. Doubled braces in the template are f-string
            escapes and come out as single literal braces.
        """
        # The template lines start at column 0 on purpose: everything inside
        # the triple-quoted literal, including leading whitespace, is sent.
        return f"""请从文件"{filename}"中提取所有题目信息,并以JSON数组格式返回。

**提取要求:**

1. **题目识别**:
- 如果文档中有"修改试题"作为分隔符,请以此分割题目
- 否则根据题目序号、题干、选项、答案、解析的结构识别每个题目

2. **字段提取**:
- 题干:题目的问题部分
- 选项A、选项B、选项C、选项D:选择题的选项(填空题或解答题为空字符串)
- 正确答案:答案内容(单选如"A",多选如"ABD")
- 解析:题目的解答过程
- 备注:提取"难度 难"或"属性:共享 难度:易 采用:是"等元数据

3. **数学公式格式要求(重要):**
- **所有数学公式必须转换为LaTeX格式**
- 内联公式使用 $...$ 包围
- 独立公式使用 $$...$$ 包围
- 示例:
* "𝑥²" → "$x^2$"
* "sin x" → "$\\sin x$"
* "∫₀¹ x dx" → "$\\int_0^1 x \\, dx$"
* "f(x) = x² + 1" → "$f(x) = x^2 + 1$"
* 分数:a/b → "$\\frac{{a}}{{b}}$"
* 根号:√x → "$\\sqrt{{x}}$"
* 上下标:xₙ → "$x_n$",x² → "$x^2$"

4. **其他格式要求**:
- 答案统一为大写字母(选择题)
- 移除答案前缀(如"答案:"、"选"等)
- 保留其他文本格式

**返回格式(只返回JSON数组):**

```json
[
{{
"题干": "下列函数中定义域为R的是",
"选项A": "$y = \\frac{{1}}{{x}}$",
"选项B": "$y = \\sqrt{{x}}$",
"选项C": "$y = x^2$",
"选项D": "$y = \\ln(x)$",
"正确答案": "C",
"解析": "$x^2$对所有实数都有定义",
"备注": "难度 易"
}}
]
```

**注意:请确保所有数学符号、公式都转换为LaTeX格式!**"""

    def _extract_questions(self, response):
        """Pull the question list out of a decoded DMXAPI response.

        Args:
            response: Decoded JSON body of the API response (a dict).

        Returns:
            List of question dicts; the optional fields (options A-D,
            explanation, remark) are filled with ``''`` when absent.

        Raises:
            ValueError: If the status is not ``'completed'``, no output text
                is present, or the text does not hold a valid question array.
        """
        if response.get('status') != 'completed':
            raise ValueError(f'响应状态异常: {response.get("status")}')

        # Use the first "output_text" part of the first message that has
        # non-empty text.
        text_content = None
        for item in response.get('output', []):
            if item.get('type') != 'message':
                continue
            for part in item.get('content', []):
                if part.get('type') == 'output_text':
                    text_content = part.get('text', '')
                    break
            if text_content:
                break

        if not text_content:
            raise ValueError('未找到文本内容')

        questions = self._parse_json(text_content)

        # Guarantee that every optional field exists.
        for q in questions:
            for field in ('选项A', '选项B', '选项C', '选项D', '解析', '备注'):
                q.setdefault(field, '')

        return questions

    def _parse_json(self, text):
        """Locate and decode the JSON array of questions inside ``text``.

        Fenced code blocks are tried first so that square brackets in the
        surrounding prose cannot be mistaken for the array; the whole text is
        used as the last candidate. Within a candidate the span from the
        first ``[`` to the last ``]`` is decoded.

        Args:
            text: Raw text returned by the model.

        Returns:
            List of question dicts, each containing at least the question
            stem and the correct answer.

        Raises:
            ValueError: If no array is found, it is not valid JSON, or an
                item is not an object with the required fields.
        """
        import re

        candidates = [
            match.group(1)
            for match in re.finditer(r'```[^\n]*\n(.*?)```', text, re.DOTALL)
        ]
        candidates.append(text)

        decode_failure = None
        for candidate in candidates:
            start_idx = candidate.find('[')
            end_idx = candidate.rfind(']')
            if start_idx == -1 or end_idx < start_idx:
                continue

            json_str = candidate[start_idx:end_idx + 1]
            try:
                questions = json.loads(json_str)
            except json.JSONDecodeError as e:
                # Remember the first failure, but keep trying other candidates.
                if decode_failure is None:
                    decode_failure = (
                        f'JSON解析失败: {e}\nJSON: {json_str[:300]}...', e
                    )
                continue

            if not isinstance(questions, list):
                raise ValueError('解析结果不是数组')

            # Every item must be an object carrying the required fields.
            for i, q in enumerate(questions):
                if not isinstance(q, dict) or '题干' not in q or '正确答案' not in q:
                    raise ValueError(f'第 {i+1} 个题目缺少必填字段')

            return questions

        if decode_failure is not None:
            message, cause = decode_failure
            raise ValueError(message) from cause

        raise ValueError(f'未找到JSON数组\n文本: {text[:200]}...')
|
||||
|
||||
|
||||
def extract_metadata(filename):
    """Derive question metadata from a file name.

    The stem is expected to look like ``<knowledge point><sep><type><sep>
    <difficulty>`` where ``<sep>`` is one of ``+``, space, ``-`` or ``_``.
    Only the first of these separators (in that priority order) that occurs
    in the stem is used for splitting.

    Args:
        filename: File name or path; only its stem is inspected.

    Returns:
        Dict with the keys ``章节`` (chapter), ``二级知识点`` (secondary
        knowledge point), ``题目类型`` (question type), ``难度`` (difficulty)
        and ``文件路径`` (the ``filename`` argument, unchanged).
    """
    from config.settings import TYPE_MAP

    basename = Path(filename).stem

    parts = [basename]
    for sep in ('+', ' ', '-', '_'):
        if sep in basename:
            parts = basename.split(sep)
            break

    # Repeated separators (e.g. a double space) produce empty tokens that
    # would shift the type/difficulty into the wrong slot, so drop them.
    parts = [token.strip() for token in parts if token.strip()]

    secondary_knowledge = parts[0] if parts else ''
    question_type_raw = parts[1] if len(parts) > 1 else ''
    difficulty = parts[2] if len(parts) > 2 else ''

    # Map the raw label to a canonical type; fall back to the raw label when
    # the mapping yields something unknown but the raw label is itself valid.
    question_type = TYPE_MAP.get(question_type_raw, question_type_raw)
    valid_types = ['单选', '多选', '不定项', '填空', '解答']
    if question_type not in valid_types and question_type_raw in valid_types:
        question_type = question_type_raw

    # The chapter is the text before the first "的" (the whole knowledge
    # point when there is none); an empty result becomes "未分类".
    chapter = secondary_knowledge.split('的')[0] or '未分类'

    return {
        '章节': chapter,
        '二级知识点': secondary_knowledge,
        '题目类型': question_type,
        '难度': difficulty,
        '文件路径': filename
    }
|
||||
|
||||
|
||||
def save_to_database(questions, metadata):
    """Insert the parsed questions into the ``questions`` table.

    Choice answers are normalised to upper-case letters/digits only.
    Answers of fill-in (``填空``) and worked-solution (``解答``) questions
    are free text and are stored as-is (stripped), because the choice filter
    would otherwise delete almost all of their content.

    Args:
        questions: Question dicts as produced by the parser.
        metadata: Dict from ``extract_metadata``.

    Returns:
        List of the values returned by ``db.execute`` for each insert
        (presumably the new row ids - confirm against the DB manager).
    """
    print(f' 保存题目到数据库...')

    import re
    db = get_db_manager()
    saved_ids = []

    # Free-text question types must not go through the A-D/0-9 filter.
    free_text_answer = metadata['题目类型'] in ('填空', '解答')

    sql = """
        INSERT INTO questions
        (章节, 一级知识点, 二级知识点, 题目类型, 难度, 题干,
        选项A, 选项B, 选项C, 选项D, 正确答案, 解析, 备注, 文件路径)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
    """

    for i, q_data in enumerate(questions, 1):
        # The model may return null or a non-string value for the answer.
        raw_answer = q_data.get('正确答案')
        answer = '' if raw_answer is None else str(raw_answer).strip()
        if not free_text_answer:
            answer = re.sub(r'[^A-D0-9]', '', answer.upper())

        params = (
            metadata['章节'], '', metadata['二级知识点'],
            metadata['题目类型'], metadata['难度'], q_data.get('题干', ''),
            q_data.get('选项A', ''), q_data.get('选项B', ''),
            q_data.get('选项C', ''), q_data.get('选项D', ''),
            answer, q_data.get('解析', ''),
            q_data.get('备注', ''), metadata['文件路径']
        )

        question_id = db.execute(sql, params)
        saved_ids.append(question_id)
        print(f' ✓ 题目 {i}/{len(questions)}, ID={question_id}')

    return saved_ids
|
||||
|
||||
|
||||
def display_results(questions, metadata, saved_ids):
    """Print a summary of the run: metadata, each question, and DB totals.

    Args:
        questions: Question dicts that were parsed.
        metadata: Dict with the file-level metadata.
        saved_ids: Values returned by the inserts, aligned with
            ``questions``; when empty, a "not saved" label is shown instead.
    """
    divider = '=' * 60

    print('\n[4/4] 完成')
    print(divider)
    print(f'文件: {metadata["文件路径"]}')

    print('\n元数据:')
    metadata_rows = (
        (' 章节', '章节'),
        (' 知识点', '二级知识点'),
        (' 题型', '题目类型'),
        (' 难度', '难度'),
    )
    for label, key in metadata_rows:
        print(f'{label}: {metadata[key]}')

    print(f'\n题目列表 (共 {len(questions)} 个):')
    print(divider)

    for index, question in enumerate(questions):
        shown_id = saved_ids[index] if saved_ids else "未保存"
        print(f'\n题目 {index + 1} (ID={shown_id}):')
        # Stem and first option are truncated for a compact listing.
        print(f' 题干: {question["题干"][:60]}...')
        option_a = question.get('选项A')
        if option_a:
            print(f' 选项A: {option_a[:40]}...')
        print(f' 答案: {question["正确答案"]}')
        remark = question.get('备注')
        if remark:
            print(f' 备注: {remark}')

    print(divider)
    print('\n✓ 所有题目已保存到数据库')

    # Overall row count of the questions table.
    total = get_db_manager().execute(
        "SELECT COUNT(*) as count FROM questions", fetch_one=True
    )
    print(f'✓ 数据库中共有 {total["count"]} 个题目')
|
||||
|
||||
|
||||
def main():
    """Command-line entry point: parse one file and store its questions.

    Expects the document path as the first argument. Exits with status 0 on
    success and 1 on usage errors, a missing file, an empty parse result or
    any processing failure (the traceback is printed in that case).
    """
    # Show the real script name in the usage text instead of a hard-coded
    # one that can drift from the file name.
    prog = Path(sys.argv[0]).name or 'dmxapi_parse.py'

    if len(sys.argv) < 2:
        print(f'用法: python {prog} <文件路径>')
        print('\n示例:')
        print(f' python {prog} "函数的周期性 单选题 难.docx"')
        print(f' python {prog} test.pdf')
        print('\n说明:')
        print(' - 支持格式: PDF, Word (.docx, .doc)')
        print(' - 文件名格式: 知识点+题型+难度.扩展名')
        print(' - 不在本地解析,直接上传到DMXAPI分析')
        sys.exit(1)

    file_path = sys.argv[1]
    path = Path(file_path)

    if not path.exists():
        print(f'✗ 文件不存在: {file_path}')
        sys.exit(1)

    print('='*60)
    print('DMXAPI 文件解析 (无本地解析)')
    print('='*60)

    try:
        parser = DMXAPIParser()

        # Conversion, upload and the DMXAPI call all happen in parse_file.
        questions = parser.parse_file(file_path)

        if not questions:
            print('\n✗ 未解析到任何题目')
            sys.exit(1)

        # [3/4] Metadata comes from the file name only.
        print(f'\n[3/4] 提取元数据...')
        metadata = extract_metadata(path.name)

        # [4/4] Persist, then report.
        saved_ids = save_to_database(questions, metadata)
        display_results(questions, metadata, saved_ids)

        print('\n✓ 处理完成!')
        sys.exit(0)

    except Exception as e:
        # SystemExit is not an Exception subclass, so the sys.exit() calls
        # above pass through this handler untouched.
        print(f'\n✗ 处理失败: {e}')
        import traceback
        traceback.print_exc()
        sys.exit(1)
|
||||
|
||||
|
||||
# Run the command-line entry point only when executed as a script.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user