fix: update keywords_match

2026-01-18 18:25:36 +08:00
parent 29f6e25f70
commit 4ed90734df
7 changed files with 1406 additions and 269 deletions

.env.example (new file, 50 lines)

@@ -0,0 +1,50 @@
# ========================================
# Example environment variable configuration
# Copy this file to .env and fill in real values
# ========================================
# Default LLM API type (openai | dmx | dify | ollama)
export LLM_API_TYPE="dify"
# ========== OpenAI API configuration ==========
# export OPENAI_API_KEY="sk-your-openai-api-key"
# export OPENAI_BASE_URL="https://api.openai.com/v1"
# export OPENAI_MODEL="gpt-4o-mini"
# ========== DMX API configuration (OpenAI compatible) ==========
export DMX_API_KEY="your-dmx-api-key"
export DMX_BASE_URL="https://www.dmxapi.cn"
export DMX_MODEL="gpt-4o-mini"
# ========== Dify API configuration ==========
# Used by image_batch_recognizer.py by default
export DIFY_API_KEY="app-your-dify-api-key"
export DIFY_BASE_URL="https://your-dify-server:4433"
export DIFY_USER_ID="default-user"
export DIFY_MODEL="dify-chatflow"
# ========== Ollama local model configuration ==========
# export OLLAMA_BASE_URL="http://localhost:11434"
# export OLLAMA_MODEL="qwen2.5:7b"
# ========== Anthropic API configuration ==========
# export ANTHROPIC_API_KEY="sk-ant-your-anthropic-api-key"
# export ANTHROPIC_BASE_URL="https://api.anthropic.com"
# export ANTHROPIC_MODEL="claude-3-5-sonnet-20241022"
# ========================================
# Standalone configuration for verify_high_confidence.py
# ========================================
# Uses the VERIFY_ prefix to stay separate from image_batch_recognizer.py
# Falls back to the general configuration above if unset
# VERIFY_API_TYPE=dmx
# VERIFY_API_KEY=your-api-key
# VERIFY_BASE_URL=https://api.example.com
# VERIFY_MODEL=gpt-4o-mini
# VERIFY_USER_ID=default-user
# ========================================
# Custom prompt (optional)
# ========================================
# export VISION_ANALYSIS_PROMPT="your custom image-analysis prompt..."

CLAUDE.md (318 lines changed)

@@ -7,9 +7,10 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co
This is a drug risk monitoring and data processing system for detecting controlled substances in text and image data from e-commerce platforms, darknet sources, and social media.
**Core Capabilities:**
1. **CAS Number Matching**: Extract and match chemical CAS numbers from text using regex patterns (supports multiple formats)
2. **Keyword Matching**: High-performance multi-mode keyword matching (fuzzy, CAS)
3. **Keyword Expansion**: LLM-powered expansion of chemical/drug names to include variants, abbreviations, and aliases
1. **Image Recognition**: Batch image analysis using LLM APIs (OpenAI, Anthropic, DMX, Dify) for OCR and risk detection
2. **Keyword Matching**: Multi-mode keyword matching with CAS number extraction and exact matching
3. **LLM Verification**: Secondary verification of high-confidence unmatched records using LLM
4. **Data Collection**: Merge and consolidate results from batch processing
## Running Scripts
@@ -18,28 +19,24 @@ All scripts must be run from the `scripts/` directory:
```bash
cd scripts/
# Quick start (recommended for testing)
python3 quick_start.py
# Image batch recognition (mock mode for testing)
python3 image_batch_recognizer.py --mock --limit 5
# CAS number matching
python3 match_cas_numbers.py
# Image recognition with API
python3 image_batch_recognizer.py --api-type dify --limit 10
# Multi-mode keyword matching (default: both modes)
# Collect and merge xlsx files from batch output
python3 collect_xlsx.py
# Multi-mode keyword matching (default: cas + exact)
python3 keyword_matcher.py
# Single mode matching
python3 keyword_matcher.py -m cas # CAS number only
python3 keyword_matcher.py -m fuzzy --threshold 90 # Fuzzy matching only
python3 keyword_matcher.py -m exact # Exact matching only
# Use larger keyword database
python3 keyword_matcher.py -k ../data/input/keyword_all.xlsx
# Keyword expansion (mock mode, no API)
python3 expand_keywords_with_llm.py -m
# Keyword expansion (with OpenAI API)
export OPENAI_API_KEY="sk-..."
python3 expand_keywords_with_llm.py ../data/input/keywords.xlsx
# Verify high-confidence unmatched records
python3 verify_high_confidence.py -o original.xlsx -m matched.xlsx --mock
```
## Dependencies
@@ -49,221 +46,130 @@ python3 expand_keywords_with_llm.py ../data/input/keywords.xlsx
pip install pandas openpyxl
```
**Optional (for fuzzy keyword matching):**
**Optional:**
```bash
pip install rapidfuzz
pip install pyahocorasick # 5x faster exact matching
pip install requests # Required for Dify API
pip install tqdm # Progress bars
pip install openai # For OpenAI-compatible APIs in verify script
```
**Optional (for LLM keyword expansion):**
## Environment Configuration
Copy `.env.example` to `.env` and configure API keys:
```bash
pip install openai anthropic
# Default API type (openai | dmx | dify | ollama)
LLM_API_TYPE="dify"
# DMX API (OpenAI compatible)
DMX_API_KEY="your-key"
DMX_BASE_URL="https://www.dmxapi.cn"
DMX_MODEL="gpt-4o-mini"
# Dify API (used by image_batch_recognizer.py)
DIFY_API_KEY="app-xxx"
DIFY_BASE_URL="https://your-dify-server:4433"
DIFY_USER_ID="default-user"
# Separate config for verify_high_confidence.py (VERIFY_ prefix)
VERIFY_API_TYPE="dmx"
VERIFY_API_KEY="your-key"
VERIFY_BASE_URL="https://api.example.com"
VERIFY_MODEL="gpt-4o-mini"
```
## Data Flow Architecture
All scripts use relative paths from `scripts/` directory:
```
Input: ../data/input/
clickin_text_img.xlsx (2779 rows: text + image paths)
keywords.xlsx (22 rows, basic keyword list)
keyword_all.xlsx (1659 rows, 1308 unique CAS numbers)
Output: ../data/output/
keyword_matched_results.xlsx (multi-mode merged results)
cas_matched_results_final.xlsx
test_keywords_expanded_rows.xlsx
Images: ../data/images/ (1955 JPG files, 84MB)
data/
├── input/                      # Source data
│   ├── clickin_text_img.xlsx   # Text + image paths
│   └── keywords.xlsx           # Keyword database
├── images/                     # Image files for recognition
├── batch_output/               # Per-folder recognition results
│   └── {name}/results.xlsx
├── data_all/                   # Original data by source
│   └── {name}_text_img.xlsx
├── collected_xlsx/             # Merged results (collect_xlsx.py output)
└── output/                     # Final processed results
```
**Processing Pipeline:**
```
Raw data collection -> Text extraction (OCR/LLM) ->
Feature matching (CAS/keywords) -> Data cleaning ->
Risk determination
1. image_batch_recognizer.py → batch_output/{name}/results.xlsx
2. collect_xlsx.py → Merge results.xlsx with {name}_text_img.xlsx → collected_xlsx/
3. keyword_matcher.py → Match keywords in text → output/keyword_matched_results.xlsx
4. verify_high_confidence.py → LLM verify unmatched high-confidence → *_llm_verified.xlsx
```
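Concretely, one pass over a single source chains the four scripts like this (a sketch, assuming the default paths above and the `clickin` dataset used in the README examples):

```bash
cd scripts/
python3 image_batch_recognizer.py --api-type dify --limit 100   # -> ../data/batch_output/{name}/results.xlsx
python3 collect_xlsx.py                                         # -> ../data/collected_xlsx/{name}.xlsx
python3 keyword_matcher.py \
    -t ../data/collected_xlsx/clickin.xlsx \
    -o ../data/output/clickin_matched.xlsx
python3 verify_high_confidence.py \
    -o ../data/collected_xlsx/clickin.xlsx \
    -m ../data/output/clickin_matched.xlsx
```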
## Key Technical Details
## Key Scripts
### 1. CAS Number Matching (`match_cas_numbers.py`)
- Supports multiple formats: `123-45-6`, `123 45 6`, `123 - 45 - 6`
- Auto-normalizes to standard format `XXX-XX-X`
- Uses regex pattern: `\b\d{2,7}[\s\-]+\d{2}[\s\-]+\d\b`
- Dual-mode: `"regex"` for CAS matching, `"keywords"` for keyword matching
### keyword_matcher.py
### 2. Keyword Matching (`keyword_matcher.py`) - REFACTORED
Two detection modes with Strategy Pattern architecture:
**Architecture:**
- Strategy Pattern with `KeywordMatcher` base class
- Concrete matchers: `CASRegexMatcher`, `FuzzyMatcher`
- Factory Pattern for matcher creation
- Dataclass-based result handling
1. **CAS Number Recognition (`-m cas`)**
- Regex pattern: `\b(\d{2,7})[\s\-._]?(\d{2})[\s\-._]?(\d)\b`
- Supports formats: `123-45-6`, `123 45 6`, `123456`, `123.45.6`
- Auto-normalizes to standard `XXX-XX-X` format
- Source column: `CAS号` (a normalization sketch follows after this list)
**Two Detection Modes:**
1. **CAS Number Recognition (CAS号识别)**
- Uses `CASRegexMatcher` with comprehensive regex pattern
- Supports formats: `123-45-6`, `123 45 6`, `12345 6`, `123456`, `123.45.6`, `123_45_6`
- Auto-normalizes all formats to standard `XXX-XX-X`
- Regex: `\b(\d{2,7})[\s\-._]?(\d{2})[\s\-._]?(\d)\b`
- Extracts CAS from text, normalizes, compares with keyword database
- Source columns: `CAS号`
2. **Fuzzy Matching (模糊匹配)**
- Uses `FuzzyMatcher` with RapidFuzz library
- Default threshold: 85 (configurable via `--threshold`)
- Scoring function: `partial_ratio`
2. **Exact Matching (`-m exact`)**
- Uses Aho-Corasick automaton (if pyahocorasick installed) or regex with word boundaries
- Source columns: `中文名`, `英文名`, `CAS号`, `简称`, `可能名称`
- **Note**: Fuzzy matching covers all cases that exact matching would find, making exact mode redundant
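To make the extraction and normalization concrete, here is a minimal sketch built on the regex quoted above (`extract_cas` is an illustrative helper, not necessarily the exact function in `keyword_matcher.py`):

```python
import re

# Pattern from the docs above: 2-7 digits, 2 digits, 1 digit,
# each group separated by an optional space/hyphen/dot/underscore.
CAS_PATTERN = re.compile(r"\b(\d{2,7})[\s\-._]?(\d{2})[\s\-._]?(\d)\b")

def extract_cas(text):
    """Extract CAS candidates and normalize them to XXX-XX-X."""
    return [f"{a}-{b}-{c}" for a, b, c in CAS_PATTERN.findall(text)]

print(extract_cas("CAS 537 46 2, also written 537.46.2"))
# ['537-46-2', '537-46-2']
```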
**Multi-Mode Result Merging:**
- Automatically merges results from multiple modes
- Deduplicates by row index
- Combines matched keywords with ` | ` separator
- Adds `匹配模式` column showing which modes matched (e.g., "CAS号识别 + 模糊匹配"); a minimal merging sketch follows
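A minimal sketch of that merge, assuming each mode yields a DataFrame of matched rows indexed by the original row number (the function shape is illustrative, not the exact `merge_mode_results` signature):

```python
import pandas as pd

def merge_mode_results(results):
    """results: dict mapping a mode label (e.g. 'CAS号识别') to its matched-row DataFrame."""
    merged = {}
    for mode, df in results.items():
        for idx, row in df.iterrows():
            entry = merged.setdefault(idx, {"匹配到的关键词": [], "匹配模式": []})
            entry["匹配到的关键词"].append(row["匹配到的关键词"])
            entry["匹配模式"].append(mode)
    out = pd.DataFrame.from_dict(merged, orient="index")
    out["匹配到的关键词"] = out["匹配到的关键词"].str.join(" | ")  # combine keywords
    out["匹配模式"] = out["匹配模式"].str.join(" + ")              # e.g. "CAS号识别 + 模糊匹配"
    return out.sort_index()                                        # one row per original index
```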
**Command-Line Options:**
```bash
-k, --keywords # Path to keywords file (default: ../data/input/keywords.xlsx)
-t, --text # Path to text file (default: ../data/input/clickin_text_img.xlsx)
-o, --output # Output file path (default: ../data/output/keyword_matched_results.xlsx)
-c, --text-column # Column containing text to search (default: "文本")
-m, --modes # Modes to run: cas, fuzzy (default: both)
--threshold # Fuzzy matching threshold 0-100 (default: 85)
--separator # Keyword separator in cells (default: "|||")
```
**Performance:**
- With keyword_all.xlsx (1308 CAS numbers):
- CAS mode: 255 rows matched (9.18%)
- Fuzzy mode: 513 rows matched (18.46%)
- Merged (both modes): ~516 unique rows
**Uses `|||` separator:**
- Chemical names contain commas, hyphens, slashes, semicolons
- Triple pipe avoids conflicts with chemical nomenclature
- Example: `甲基苯丙胺|||冰毒|||Methamphetamine|||MA`
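Splitting such a cell is then unambiguous; a small illustration of what the keyword loader does:

```python
cell = "甲基苯丙胺|||冰毒|||Methamphetamine|||MA"
keywords = [part.strip() for part in cell.split("|||") if part.strip()]
# ['甲基苯丙胺', '冰毒', 'Methamphetamine', 'MA']
```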
### 3. Keyword Expansion (`expand_keywords_with_llm.py`)
- Expands Chinese names, English names, abbreviations
- Supports OpenAI and Anthropic APIs
- Mock mode available for testing without API costs
- Output formats: compact (single row with `|||` separators) or expanded (one name per row)
## Configuration Patterns
Scripts use command-line arguments (keyword_matcher.py) or in-file configuration blocks:
```python
# ========== Configuration ==========
keywords_file = "../data/input/keywords.xlsx"
text_file = "../data/input/clickin_text_img.xlsx"
keywords_column = "中文名"
text_column = "文本"
separator = "|||"
output_file = "../data/output/results.xlsx"
# =============================
```
## Excel File Schemas
**Input - clickin_text_img.xlsx:**
- Columns: `文本` (text), image paths, metadata
- 2779 rows of scraped e-commerce/social media data
**Input - keywords.xlsx:**
- Columns: `中文名`, `英文名`, `CAS号`, `简称`, `备注`, `可能名称`
- `可能名称` contains multiple keywords separated by `|||`
- 22 rows (small test dataset)
**Input - keyword_all.xlsx:**
- Same schema as keywords.xlsx
- 1659 rows with 1308 unique CAS numbers
- Production keyword database
**Output - Multi-mode matched (keyword_matched_results.xlsx):**
- Adds columns:
- `匹配到的关键词` (matched keywords, separated by ` | `)
- `匹配模式` (matching modes, e.g., "CAS号识别 + 模糊匹配")
- Preserves all original columns
- Deduplicated across all modes
**Output - CAS matched:**
- Adds column: `匹配到的CAS号` (matched CAS numbers)
- Preserves all original columns
- Typical match rate: ~9-11% (255-303/2779 rows)
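For downstream analysis the matched output can be read back directly with pandas (a sketch using the default output path):

```python
import pandas as pd

df = pd.read_excel("../data/output/keyword_matched_results.xlsx")
print(df["匹配模式"].value_counts())          # which mode combinations fired
keywords = (df["匹配到的关键词"]
            .str.split(" | ", regex=False)    # results use ' | ' between keywords
            .explode())
print(keywords.value_counts().head(10))       # most frequent matched keywords
```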
## Common Modifications
**To change input/output paths:**
Use command-line arguments for `keyword_matcher.py`:
```bash
python3 keyword_matcher.py -k /path/to/keywords.xlsx -t /path/to/text.xlsx -o /path/to/output.xlsx
```
Or edit the configuration block in other scripts' `main()` function.
**To switch between CAS and keyword matching:**
In `match_cas_numbers.py`, change `match_mode = "regex"` to `match_mode = "keywords"`.
In `keyword_matcher.py`, use `-m` flag:
```bash
python3 keyword_matcher.py -m cas # CAS only
python3 keyword_matcher.py -m fuzzy # Fuzzy only
```
**To adjust fuzzy matching sensitivity:**
```bash
python3 keyword_matcher.py -m fuzzy --threshold 90 # Stricter (fewer matches)
python3 keyword_matcher.py -m fuzzy --threshold 70 # More lenient (more matches)
```
**To use different LLM APIs:**
```bash
# OpenAI (default)
python3 expand_keywords_with_llm.py input.xlsx
# Anthropic
python3 expand_keywords_with_llm.py input.xlsx -a anthropic
```
## Code Architecture Highlights
### keyword_matcher.py Design Patterns
1. **Strategy Pattern**: Different matching algorithms (`KeywordMatcher` subclasses)
2. **Template Method**: Common matching workflow in base class `match()` method
3. **Factory Pattern**: `create_matcher()` selects appropriate matcher
4. **Dependency Injection**: Optional dependency (rapidfuzz) handled gracefully
**Multi-column text matching:**
- Automatically detects and combines `detected_text` and `文本` columns
- Use `-c col1 col2` to specify custom columns
**Class Hierarchy:**
```
KeywordMatcher (ABC)
├── CASRegexMatcher # Regex-based CAS number extraction
└── FuzzyMatcher # RapidFuzz partial_ratio matching
├── CASRegexMatcher # Regex CAS extraction + normalization
├── RegexExactMatcher # Word-boundary exact matching
├── AhoCorasickMatcher # Fast multi-pattern matching
└── SetMatcher # Simple substring matching
```
**Data Flow:**
```
1. Load keywords -> load_keywords_for_mode()
2. Create matcher -> create_matcher()
3. Match text -> matcher.match()
├── _prepare() (build automaton, etc.)
└── For each row:
├── _match_single_text()
└── _format_matches()
4. Save results -> save_results()
5. If multiple modes -> merge_mode_results()
```
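Reduced to a skeleton, that flow looks roughly as follows (a sketch; the real signatures and result types in `keyword_matcher.py` are richer):

```python
from abc import ABC, abstractmethod

class KeywordMatcher(ABC):
    """Template method: match() fixes the workflow, subclasses fill in the steps."""

    def match(self, df, keywords, text_columns):
        self._prepare(keywords)                 # e.g. build an automaton
        hits = {}
        for idx in range(len(df)):
            # combine_text_columns() is the helper defined in the module
            text = combine_text_columns(df.iloc[idx], text_columns)
            if not text:
                continue
            matches = self._match_single_text(text, keywords)
            if matches:
                hits[idx] = self._format_matches(matches)
        return hits

    def _prepare(self, keywords):               # optional hook
        pass

    @abstractmethod
    def _match_single_text(self, text, keywords):
        ...

    def _format_matches(self, matches):
        return " | ".join(sorted(matches))
```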
### verify_high_confidence.py
## Data Sensitivity
Compares keyword_matcher output with original data to find high-confidence rows that weren't matched, then uses LLM for secondary verification.
This codebase handles sensitive data related to controlled substances monitoring. The data includes:
- Chemical compound names (Chinese and English)
- CAS registry numbers
- Image data from suspected illegal substance trading platforms
- All data is for legitimate law enforcement/research purposes
- Uses `VERIFY_` prefixed env vars (separate from image_batch_recognizer.py)
- Supports: OpenAI, DMX, Dify, Ollama, Mock modes
- Input columns: `raw_response`, `文本`
Do not commit actual data files or API keys to version control.
### collect_xlsx.py
Merges batch recognition results with original data:
- Matches by image filename (handles both Windows `\` and Unix `/` paths)
- Adds original columns (`文本`, metadata) to recognition results
### image_batch_recognizer.py
Batch image recognition with multiple API backends:
- Supports: OpenAI, Anthropic, DMX, Dify, Mock
- Outputs: `detected_text`, `detected_objects`, `sensitive_items`, `summary`, `confidence`
- Parallel processing with `--max-workers`
## Excel File Schemas
**keywords.xlsx columns:**
- `中文名`, `英文名`, `CAS号`, `简称`, `备注`, `可能名称`
- `可能名称` uses `|||` separator for multiple values
**Recognition output columns:**
- `image_name`, `image_path`, `detected_text`, `detected_objects`
- `sensitive_items`, `summary`, `confidence`, `raw_response`
**Matched output adds:**
- `匹配到的关键词` (matched keywords, ` | ` separated)
- `匹配模式` (e.g., "CAS号识别 + 精确匹配")
## Key Conventions
- Triple pipe `|||` separator in keyword cells (avoids conflicts with chemical names)
- Match result separator: ` | `
- All scripts use relative paths from `scripts/` directory
- Configuration priority: command-line args > VERIFY_* env > general env > defaults (a sketch follows below)
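A sketch of that resolution order for one setting (variable names follow `.env.example`; the helper name is illustrative):

```python
import os

def resolve_api_type(cli_value=None):
    """Priority: command-line arg > VERIFY_* env > general env > default."""
    return (cli_value
            or os.getenv("VERIFY_API_TYPE")
            or os.getenv("LLM_API_TYPE")
            or "openai").lower()
```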

README.md (204 lines changed)

@@ -18,6 +18,8 @@ python3 keyword_matcher.py
python3 image_batch_recognizer.py --mock --limit 5
```
---
## Feature 1: Keyword Matching
Identify CAS numbers and keywords in text.
@@ -45,6 +47,9 @@ python3 keyword_matcher.py \
-k ../data/input/keyword_all.xlsx \
-t ../data/input/clickin_text_img.xlsx \
-o ../data/output/results.xlsx
# Specify multiple text columns
python3 keyword_matcher.py -c detected_text 文本
```
### Parameters
@@ -54,19 +59,15 @@ python3 keyword_matcher.py \
| `-k, --keywords` | Keywords file | `../data/input/keywords.xlsx` |
| `-t, --text` | Text file | `../data/input/clickin_text_img.xlsx` |
| `-o, --output` | Output file | `../data/output/keyword_matched_results.xlsx` |
| `-c, --text-column` | Text column name | `文本` |
| `-c, --text-columns` | Text column names (multiple supported) | auto-detect `detected_text` and `文本` |
| `-m, --modes` | Match modes | `cas exact` |
| `--separator` | Keyword separator | `\|\|\|` |
### Output
Each mode generates a separate file:
- `keyword_matched_results_cas.xlsx` - CAS number matching results
- `keyword_matched_results_exact.xlsx` - exact matching results
Output columns:
- `匹配到的关键词` - list of matched keywords
- `匹配模式` - matching mode used
Outputs a single merged result file containing the following columns:
- `匹配到的关键词` - list of matched keywords (` | ` separated)
- `匹配模式` - matching modes used (e.g., "CAS号识别 + 精确匹配")
---
@@ -82,6 +83,9 @@ cd scripts/
# Mock mode (no API needed, for testing)
python3 image_batch_recognizer.py --mock --limit 5
# Use the Dify API
python3 image_batch_recognizer.py --api-type dify --limit 10
# Use the OpenAI API
python3 image_batch_recognizer.py --api-type openai --limit 10
@@ -89,7 +93,7 @@ python3 image_batch_recognizer.py --api-type openai --limit 10
python3 image_batch_recognizer.py --api-type dmx --limit 10
# Parallel processing
python3 image_batch_recognizer.py --api-type openai --max-workers 3
python3 image_batch_recognizer.py --api-type dify --max-workers 3
```
### Parameters
@@ -98,7 +102,7 @@ python3 image_batch_recognizer.py --api-type openai --max-workers 3
|------|------|--------|
| `--image-dir` | Image directory | `../data/images` |
| `--output` | Output file | `../data/output/image_recognition_results.xlsx` |
| `--api-type` | API type | read from `.env` |
| `--api-type` | API type (openai/dmx/dify/anthropic/mock) | read from `.env` |
| `--model` | Model name | read from `.env` |
| `--limit` | Max number to process | unlimited |
| `--offset` | Skip the first N images | 0 |
@@ -106,26 +110,6 @@ python3 image_batch_recognizer.py --api-type openai --max-workers 3
| `--mock` | Mock mode | no |
| `--recursive` | Recurse into subdirectories | no |
### API Configuration
Copy the `.env` configuration file and fill it in:
```bash
cp config.env.example .env
```
Example `.env`:
```
OPENAI_API_KEY=sk-...
OPENAI_MODEL=gpt-4o-mini
DMX_API_KEY=sk-dmx-...
DMX_BASE_URL=https://www.dmxapi.cn
DMX_MODEL=gpt-5-mini
LLM_API_TYPE=openai
```
### Output
The output Excel contains the following columns:
@@ -133,27 +117,163 @@ LLM_API_TYPE=openai
- `detected_objects` - object descriptions
- `sensitive_items` - sensitive elements
- `summary` - risk summary
- `confidence` - confidence
- `confidence` - confidence (High/Medium/Low)
- `raw_response` - raw LLM response
---
## Feature 3: Data Collection and Merging
Collect batch-processing results and merge them with the original data.
```bash
cd scripts/
# Merge the data in batch_output and data_all (default)
python3 collect_xlsx.py
# Specify the output directory
python3 collect_xlsx.py -o ../data/merged
# Dry-run mode (preview without executing)
python3 collect_xlsx.py -n
# Copy only, without merging the original data
python3 collect_xlsx.py --no-merge
```
Merge logic (see the sketch below):
- Read recognition results from `data/batch_output/{name}/results.xlsx`
- Match and merge with `data/data_all/{name}_text_img.xlsx` by image filename
- Write the output to `data/collected_xlsx/{name}.xlsx`
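A condensed sketch of this join (the filename extraction mirrors `extract_image_name` in `collect_xlsx.py`; `图片` is one of several candidate path columns the script tries):

```python
import pandas as pd

results = pd.read_excel("results.xlsx")             # recognition output with image_name
original = pd.read_excel("clickin_text_img.xlsx")   # original data with an image-path column

# Reduce Windows/Unix paths to the bare filename to use as the join key
original["_image_name"] = (original["图片"].astype(str)
                           .str.replace("\\", "/", regex=False)
                           .str.split("/").str[-1])
merged = results.merge(
    original.drop_duplicates("_image_name"),
    left_on="image_name", right_on="_image_name", how="left",
)
```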
---
## Feature 4: High-Confidence Record Verification
Runs LLM secondary verification on records that keyword matching missed but whose confidence is High/Medium.
```bash
cd scripts/
# Full example
python3 verify_high_confidence.py \
    -o ../data/pho_analysis_merged/clickin.xlsx \
    -m ../data/output/clickin_matched.xlsx \
    -r ../data/output/clickin_verify.xlsx
# Test in mock mode (no API calls)
python3 verify_high_confidence.py \
    -o ../data/pho_analysis_merged/clickin.xlsx \
    -m ../data/output/clickin_matched.xlsx \
    --mock --limit 5
# Use the DMX API
python3 verify_high_confidence.py -o original.xlsx -m matched.xlsx --api dmx
```
### Parameters
| Option | Description | Default |
|------|------|--------|
| `-o, --original` | Original Excel file | required |
| `-m, --matched` | keyword_matcher result file | required |
| `-r, --result` | Output file | `{original filename}_llm_verified.xlsx` |
| `--api` | API type (openai/dmx/dify/ollama) | read from `.env` |
| `--mock` | Mock mode | no |
| `--confidence` | Confidence levels to verify | `High Medium` |
| `--limit` | Max records to verify | all |
### Output
The output Excel keeps the original columns and adds the following verification columns:
- `llm_is_risky` - whether the record involves risk (boolean)
- `llm_substances` - substance names or CAS numbers involved (` | ` separated)
- `llm_risk_level` - risk level (高/中/低)
- `llm_reason` - reasoning behind the verdict
- `llm_raw_response` - raw LLM response
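These columns make post-filtering straightforward (a sketch; the file name follows the default naming rule above):

```python
import pandas as pd

verified = pd.read_excel("clickin_llm_verified.xlsx")
risky = verified[verified["llm_is_risky"] == True]   # compare explicitly: the column may hold NaN
print(risky[["llm_risk_level", "llm_substances", "llm_reason"]].head())
```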
---
## API Configuration
Copy `.env.example` and fill it in:
```bash
cp .env.example .env
```
Example `.env`:
```bash
# Default API type
LLM_API_TYPE="dify"
# DMX API (OpenAI compatible)
DMX_API_KEY="sk-xxx"
DMX_BASE_URL="https://www.dmxapi.cn"
DMX_MODEL="gpt-4o-mini"
# Dify API (used for image recognition)
DIFY_API_KEY="app-xxx"
DIFY_BASE_URL="https://your-dify-server:4433"
DIFY_USER_ID="default-user"
# OpenAI API
OPENAI_API_KEY="sk-xxx"
OPENAI_MODEL="gpt-4o-mini"
# Standalone config for verify_high_confidence.py (VERIFY_ prefix)
VERIFY_API_TYPE="dmx"
VERIFY_API_KEY="sk-xxx"
VERIFY_BASE_URL="https://www.dmxapi.cn"
VERIFY_MODEL="gpt-4o-mini"
```
---
## Directory Structure
```
20251126_s2/
chem-risk-detect/
├── scripts/
│   ├── keyword_matcher.py            # Keyword matching
│   ├── image_batch_recognizer.py     # Image recognition
│   ├── collect_xlsx.py               # Data collection & merging
│   ├── verify_high_confidence.py     # LLM secondary verification
│   ├── run.sh                        # Batch job management
│   └── run_batch_background.sh       # Background runner
├── data/
│   ├── input/                        # Input data
│   │   ├── clickin_text_img.xlsx     # Text data
│   │   └── keywords.xlsx             # Keyword database
│   ├── output/                       # Output results
│   └── images/                       # Image files
│   ├── images/                       # Image files
│   ├── batch_output/                 # Batch-processing output
│   │   └── {name}/results.xlsx
│   ├── data_all/                     # Original data
│   │   └── {name}_text_img.xlsx
│   ├── collected_xlsx/               # Merged data
│   └── output/                       # Final output results
├── .env                              # API configuration
└── config.env.example                # Config template
└── .env.example                      # Config template
```
---
## Processing Pipeline
```
1. Image recognition
   image_batch_recognizer.py → batch_output/{name}/results.xlsx
2. Data merging
   collect_xlsx.py → merge results.xlsx + {name}_text_img.xlsx → collected_xlsx/
3. Keyword matching
   keyword_matcher.py → output/keyword_matched_results.xlsx
4. Secondary verification
   verify_high_confidence.py → verify unmatched high-confidence records → *_llm_verified.xlsx
```
---
@@ -173,7 +293,7 @@ cd scripts/
Set the parameters:
```bash
API_TYPE=openai MAX_WORKERS=3 ./run.sh start
API_TYPE=dify MAX_WORKERS=3 ./run.sh start
```
---
@@ -184,10 +304,11 @@ API_TYPE=openai MAX_WORKERS=3 ./run.sh start
# Required
pip install pandas openpyxl
# Optional (performance)
pip install pyahocorasick # Faster keyword matching
# Optional (performance and features)
pip install pyahocorasick # Keyword-matching speedup (5x)
pip install requests # Required for the Dify API
pip install tqdm # Progress bars
pip install requests # HTTP requests
pip install openai # OpenAI-compatible APIs for the verify script
```
---
@@ -203,9 +324,12 @@ pip install requests # HTTP 请求
**Q: Can the output separator be changed?**
Use the `--separator` option; the default `|||` avoids conflicts with chemical names.
**Q: Can the verify script and image_batch_recognizer use different APIs?**
Yes. The verify script reads `VERIFY_`-prefixed environment variables and is configured independently of the other scripts.
---
## Support
## Requirements
- Python 3.7+
- Get help with `python3 script.py -h`

scripts/batch_keyword_match.sh (new executable file, 107 lines)

@@ -0,0 +1,107 @@
#!/bin/bash
# Batch keyword matching script
# Processes every xlsx file in data/pho_analysis_merged/
set -e

# Resolve the directory this script lives in
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
PROJECT_DIR="$(dirname "$SCRIPT_DIR")"

# Directory configuration
INPUT_DIR="$PROJECT_DIR/data/pho_analysis_merged"
OUTPUT_DIR="$PROJECT_DIR/data/output"
KEYWORDS_FILE="$PROJECT_DIR/data/keywords/keywords_all.xlsx"

# Colored output
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
RED='\033[0;31m'
NC='\033[0m' # No Color

echo "=============================================="
echo "Batch keyword matching"
echo "=============================================="
echo "Input directory: $INPUT_DIR"
echo "Output directory: $OUTPUT_DIR"
echo "Keywords file: $KEYWORDS_FILE"
echo ""

# Check the input directory
if [ ! -d "$INPUT_DIR" ]; then
    echo -e "${RED}Error: input directory does not exist: $INPUT_DIR${NC}"
    exit 1
fi

# Check the keywords file
if [ ! -f "$KEYWORDS_FILE" ]; then
    echo -e "${YELLOW}Warning: keywords file does not exist: $KEYWORDS_FILE${NC}"
    echo "The default keywords file will be used"
    KEYWORDS_FILE=""
fi

# Create the output directory
mkdir -p "$OUTPUT_DIR"

# Counters; incremented with assignments because ((var++)) returns a
# nonzero status when var is 0, which would abort the script under set -e
total=0
success=0
failed=0

# Gather all xlsx files
files=("$INPUT_DIR"/*.xlsx)

# Check that at least one file exists
if [ ! -e "${files[0]}" ]; then
    echo -e "${YELLOW}No xlsx files found${NC}"
    exit 0
fi

# Count the total
for f in "${files[@]}"; do
    if [ -f "$f" ]; then
        total=$((total + 1))
    fi
done

echo "Found $total file(s) to process"
echo "----------------------------------------------"

# Process each file
current=0
for input_file in "${files[@]}"; do
    if [ ! -f "$input_file" ]; then
        continue
    fi
    current=$((current + 1))
    # File name without the extension
    filename=$(basename "$input_file" .xlsx)
    output_file="$OUTPUT_DIR/${filename}_matched.xlsx"
    echo -e "\n[$current/$total] Processing: $filename"
    # Build the command
    cmd="python3 $SCRIPT_DIR/keyword_matcher.py -t \"$input_file\" -o \"$output_file\""
    if [ -n "$KEYWORDS_FILE" ]; then
        cmd="$cmd -k \"$KEYWORDS_FILE\""
    fi
    # Run the match
    if eval "$cmd"; then
        echo -e "${GREEN} ✓ Done: ${filename}_matched.xlsx${NC}"
        success=$((success + 1))
    else
        echo -e "${RED} ✗ Failed: $filename${NC}"
        failed=$((failed + 1))
    fi
done

# Summary
echo ""
echo "=============================================="
echo "Processing finished"
echo "=============================================="
echo -e "Total: $total | ${GREEN}Succeeded: $success${NC} | ${RED}Failed: $failed${NC}"
echo "Output directory: $OUTPUT_DIR"

scripts/collect_xlsx.py (new file, 315 lines)

@@ -0,0 +1,315 @@
#!/usr/bin/env python3
"""
收集并合并 xlsx 文件
功能:
1. 从 data/batch_output 子文件夹收集 results.xlsx图片分析结果
2. 与 data/data_all 中对应的原始数据({name}_text_img.xlsx合并
3. 通过图片名关联两个数据源
4. 保存合并后的文件到目标目录
用法:
python3 collect_xlsx.py # 默认合并并输出
python3 collect_xlsx.py -o ../data/merged # 指定输出目录
python3 collect_xlsx.py --no-merge # 不合并,只复制
python3 collect_xlsx.py -n # 预览模式
"""
import argparse
from pathlib import Path
from typing import Optional, Tuple, List
import pandas as pd
def extract_image_name(path: str) -> str:
    """
    Extract the image filename from a full path.
    Supports both Windows and Unix path formats.
    """
    if pd.isna(path):
        return ""
    path_str = str(path).strip()
    if not path_str:
        return ""
    # Handle both Windows (\) and Unix (/) path separators:
    # normalize everything to /, then take the last component
    normalized = path_str.replace("\\", "/")
    filename = normalized.split("/")[-1]
    return filename
def merge_xlsx_files(
    results_file: Path,
    original_file: Path,
    results_image_col: str = "image_name",
    original_image_cols: list = None,
    original_text_col: str = "文本"
) -> Tuple[pd.DataFrame, dict]:
    """
    Merge analysis results with the original data.

    Args:
        results_file: analysis results file (batch_output/.../results.xlsx)
        original_file: original data file (data_all/..._text_img.xlsx)
        results_image_col: image-name column in the results file
        original_image_cols: candidate image-path columns in the original file (by priority)
        original_text_col: text column in the original file

    Returns:
        The merged DataFrame and a stats dict.
    """
    if original_image_cols is None:
        original_image_cols = ["图片_新", "图片", "图片链接"]
    # Read the files
    results_df = pd.read_excel(results_file)
    original_df = pd.read_excel(original_file)
    stats = {
        "results_rows": len(results_df),
        "original_rows": len(original_df),
        "merged_rows": 0,
        "unmatched_results": 0,
        "original_columns_added": [],
        "image_col_used": None
    }
    # Find a usable image column
    image_col = None
    for col in original_image_cols:
        if col in original_df.columns:
            image_col = col
            break
    if image_col is None:
        raise ValueError(f"No image column found in the original file; tried: {original_image_cols}")
    stats["image_col_used"] = image_col
    # Extract image names from the original data as the join key
    original_df["_image_name"] = original_df[image_col].apply(extract_image_name)
    # Deduplicate: the original data may repeat images; keep the first occurrence
    original_dedup = original_df.drop_duplicates(subset=["_image_name"], keep="first")
    # Decide which original columns to add (exclude image-path columns and the temp key)
    exclude_cols = set(original_image_cols + ["_image_name"])
    original_cols_to_add = [col for col in original_df.columns
                            if col not in exclude_cols
                            and col not in results_df.columns]
    stats["original_columns_added"] = original_cols_to_add
    # Build a map from image name to original-data row
    original_map = original_dedup.set_index("_image_name")[original_cols_to_add].to_dict("index")
    # Merge: add the original-data columns to the results
    merged_df = results_df.copy()
    # Initialize the new columns
    for col in original_cols_to_add:
        merged_df[col] = None
    # Match and fill row by row
    matched_count = 0
    for idx, row in merged_df.iterrows():
        image_name = row[results_image_col]
        if image_name in original_map:
            for col in original_cols_to_add:
                merged_df.at[idx, col] = original_map[image_name].get(col)
            matched_count += 1
    stats["merged_rows"] = len(merged_df)
    stats["matched_count"] = matched_count
    stats["unmatched_results"] = len(merged_df) - matched_count
    return merged_df, stats
def collect_and_merge_xlsx(
    source_dir: str,
    data_all_dir: str,
    output_dir: str,
    merge: bool = True,
    dry_run: bool = False
) -> List[dict]:
    """
    Collect and merge xlsx files.

    Args:
        source_dir: path to the batch_output directory
        data_all_dir: path to the data_all directory
        output_dir: path to the output directory
        merge: whether to merge in the original data
        dry_run: preview mode

    Returns:
        A list of per-folder processing results.
    """
    source_path = Path(source_dir)
    data_all_path = Path(data_all_dir)
    output_path = Path(output_dir)
    if not source_path.exists():
        print(f"Error: source directory does not exist: {source_dir}")
        return []
    # Create the output directory
    if not dry_run:
        output_path.mkdir(parents=True, exist_ok=True)
    results = []
    # Walk the subfolders
    for folder in sorted(source_path.iterdir()):
        if not folder.is_dir():
            continue
        folder_name = folder.name
        results_file = folder / "results.xlsx"
        if not results_file.exists():
            continue
        # Output file name
        output_file = output_path / f"{folder_name}.xlsx"
        # Look up the corresponding original data file
        original_file = data_all_path / f"{folder_name}_text_img.xlsx"
        result_info = {
            "folder": folder_name,
            "results_file": str(results_file),
            "original_file": str(original_file) if original_file.exists() else None,
            "output_file": str(output_file),
            "merged": False,
            "stats": {}
        }
        if dry_run:
            if merge and original_file.exists():
                print(f"[preview] merge: {folder_name}/results.xlsx + {folder_name}_text_img.xlsx -> {folder_name}.xlsx")
            else:
                print(f"[preview] copy: {folder_name}/results.xlsx -> {folder_name}.xlsx")
            results.append(result_info)
            continue
        # Merge or copy
        if merge and original_file.exists():
            try:
                merged_df, stats = merge_xlsx_files(results_file, original_file)
                merged_df.to_excel(output_file, index=False, engine="openpyxl")
                result_info["merged"] = True
                result_info["stats"] = stats
                print(f"Merged: {folder_name}")
                print(f"  - analysis rows: {stats['results_rows']}")
                print(f"  - original rows: {stats['original_rows']}")
                print(f"  - matched: {stats['matched_count']}")
                print(f"  - columns added: {stats['original_columns_added']}")
            except Exception as e:
                print(f"Merge failed for {folder_name}: {e}")
                # Fall back to copy mode
                import shutil
                shutil.copy2(results_file, output_file)
                print(f"  Fell back to copy mode")
        else:
            # Copy only, no merge
            import shutil
            shutil.copy2(results_file, output_file)
            if merge and not original_file.exists():
                print(f"Copied: {folder_name} (original data missing: {folder_name}_text_img.xlsx)")
            else:
                print(f"Copied: {folder_name}")
        results.append(result_info)
    return results
def main():
    parser = argparse.ArgumentParser(
        description="Collect and merge the xlsx files in batch_output and data_all",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
    python3 collect_xlsx.py                     # merge and write with defaults
    python3 collect_xlsx.py -o ../data/merged   # specify the output directory
    python3 collect_xlsx.py --no-merge          # copy only, no merging
    python3 collect_xlsx.py -n                  # dry-run mode
"""
    )
    parser.add_argument(
        "-s", "--source",
        default="../data/batch_output",
        help="batch_output directory (default: ../data/batch_output)"
    )
    parser.add_argument(
        "-d", "--data-all",
        default="../data/data_all",
        help="data_all directory (default: ../data/data_all)"
    )
    parser.add_argument(
        "-o", "--output",
        default="../data/collected_xlsx",
        help="output directory (default: ../data/collected_xlsx)"
    )
    parser.add_argument(
        "--no-merge",
        action="store_true",
        help="do not merge original data; only copy the analysis results"
    )
    parser.add_argument(
        "-n", "--dry-run",
        action="store_true",
        help="preview mode; print actions without executing them"
    )
    args = parser.parse_args()
    # Convert to absolute paths
    script_dir = Path(__file__).parent
    source_dir = (script_dir / args.source).resolve()
    data_all_dir = (script_dir / args.data_all).resolve()
    output_dir = (script_dir / args.output).resolve()
    print("=" * 60)
    print("Collect and merge xlsx files")
    print("=" * 60)
    print(f"Analysis results dir: {source_dir}")
    print(f"Original data dir: {data_all_dir}")
    print(f"Output dir: {output_dir}")
    print(f"Merge mode: {'off' if args.no_merge else 'on'}")
    print("-" * 60)
    results = collect_and_merge_xlsx(
        str(source_dir),
        str(data_all_dir),
        str(output_dir),
        merge=not args.no_merge,
        dry_run=args.dry_run
    )
    print("-" * 60)
    merged_count = sum(1 for r in results if r.get("merged"))
    print(f"Processed {len(results)} file(s)")
    if not args.no_merge:
        print(f"  - merged: {merged_count}")
        print(f"  - copied only: {len(results) - merged_count}")

if __name__ == "__main__":
    main()

scripts/keyword_matcher.py

@@ -47,6 +47,18 @@ MODE_LABELS = {
"exact": "精确匹配",
}
# Common text column names (in priority order)
COMMON_TEXT_COLUMNS = [
    "detected_text",  # new format (image analysis results)
    "文本",           # old format / original text after merging
    "text",
    "content",
    "summary",
]

# Default multi-column matching combination
DEFAULT_TEXT_COLUMNS = ["detected_text", "文本"]
# ========== Data classes ==========
@dataclass
@@ -136,6 +148,71 @@ def split_value(value: str, separator: str) -> List[str]:
    return [part.strip() for part in parts if part and part.strip()]
def detect_text_columns(
    df: pd.DataFrame,
    specified_columns: Optional[List[str]] = None
) -> List[str]:
    """
    Detect and validate text column names.

    Args:
        df: the DataFrame
        specified_columns: column names specified by the user

    Returns: the list of text columns that exist.
    Raises: ValueError if no suitable column can be found.
    """
    # If the user specified columns
    if specified_columns:
        available = [col for col in specified_columns if col in df.columns]
        missing = [col for col in specified_columns if col not in df.columns]
        if missing:
            print(f"Warning: the following specified columns do not exist: {missing}")
        if available:
            return available
        else:
            print("Warning: none of the specified columns exist; trying auto-detection...")
    # Auto-detect: prefer the default multi-column combination
    available_default = [col for col in DEFAULT_TEXT_COLUMNS if col in df.columns]
    if available_default:
        print(f"Auto-detected text columns: {available_default}")
        return available_default
    # Fallback: use the first common column found
    for col in COMMON_TEXT_COLUMNS:
        if col in df.columns:
            print(f"Auto-detected text column: ['{col}']")
            return [col]
    # Nothing found; raise
    raise ValueError(
        f"Could not auto-detect a text column. Available columns: {df.columns.tolist()}\n"
        f"Please specify text column names with -c"
    )
def combine_text_columns(row: pd.Series, text_columns: List[str]) -> str:
    """
    Combine text from multiple columns.

    Args:
        row: one DataFrame row
        text_columns: column names to combine

    Returns: the combined text (newline-separated).
    """
    texts = []
    for col in text_columns:
        val = row.get(col)
        if pd.notna(val) and str(val).strip():
            texts.append(str(val).strip())
    return "\n".join(texts)
def load_keywords_for_mode(
    df: pd.DataFrame,
    mode: str,
@@ -205,22 +282,32 @@ class KeywordMatcher(ABC):
        self,
        df: pd.DataFrame,
        keywords: Set[str],
        text_column: str
        text_columns: List[str]
    ) -> MatchResult:
        """Run the match (template method)"""
        """Run the match (template method).

        Args:
            df: the DataFrame
            keywords: the keyword set
            text_columns: text column names (multiple supported)
        """
        print(f"Starting match (using {self.name})...")
        print(f"Searching columns: {text_columns}")
        self._prepare(keywords)
        matched_indices = []
        matched_keywords_list = []
        start_time = time.time()
        for idx, text in enumerate(df[text_column]):
            if pd.isna(text):
        for idx in range(len(df)):
            row = df.iloc[idx]
            # Combine text from multiple columns
            combined_text = combine_text_columns(row, text_columns)
            if not combined_text:
                continue
            text_str = str(text)
            matches = self._match_single_text(text_str, keywords)
            matches = self._match_single_text(combined_text, keywords)
            if matches:
                matched_indices.append(idx)
@@ -435,22 +522,36 @@ def preview_results(result_df: pd.DataFrame, num_rows: int = 5) -> None:
def perform_matching(
    df: pd.DataFrame,
    keywords: Set[str],
    text_column: str,
    text_columns: List[str],
    output_file: str,
    algorithm: str = "auto",
    mode: str = None
) -> Optional[pd.DataFrame]:
    """Run the full matching workflow"""
    """Run the full matching workflow.

    Args:
        df: the DataFrame
        keywords: the keyword set
        text_columns: text column names (multiple supported)
        output_file: output file path
        algorithm: matching algorithm
        mode: matching mode
    """
    # Validate that the columns exist
    if text_column not in df.columns:
    missing_cols = [col for col in text_columns if col not in df.columns]
    if missing_cols:
        print(f"Warning: the following columns do not exist: {missing_cols}")
        text_columns = [col for col in text_columns if col in df.columns]
    if not text_columns:
        print(f"Available columns: {df.columns.tolist()}")
        raise ValueError(f"'{text_column}' does not exist")
        raise ValueError("No usable text columns")
    print(f"The text file contains {len(df)} rows\n")
    # Create the matcher and run it
    matcher = create_matcher(algorithm, mode=mode)
    result = matcher.match(df, keywords, text_column)
    result = matcher.match(df, keywords, text_columns)
    # Print statistics
    print_statistics(result)
@@ -465,7 +566,7 @@ def process_single_mode(
    keywords_df: pd.DataFrame,
    text_df: pd.DataFrame,
    mode: str,
    text_column: str,
    text_columns: List[str],
    output_file: Path,
    separator: str = SEPARATOR,
    save_to_file: bool = True
@@ -473,6 +574,9 @@ def process_single_mode(
"""
处理单个检测模式
参数:
text_columns: 文本列名列表(支持多列)
返回:匹配结果 DataFrame包含原始索引
"""
    mode_lower = mode.lower()
@@ -501,7 +605,7 @@ def process_single_mode(
    result_df = perform_matching(
        df=text_df,
        keywords=keywords,
        text_column=text_column,
        text_columns=text_columns,
        output_file=temp_output,
        algorithm=algorithm,
        mode=mode_lower  # pass the mode parameter through
@@ -528,11 +632,15 @@ def run_multiple_modes(
    keywords_file: Path,
    text_file: Path,
    output_file: Path,
    text_column: str,
    text_columns: Optional[List[str]],
    modes: List[str],
    separator: str = SEPARATOR
) -> None:
"""运行多个检测模式,合并结果到单一文件"""
"""运行多个检测模式,合并结果到单一文件
参数:
text_columns: 文本列名列表支持多列None 表示自动检测
"""
# 验证文件存在
if not keywords_file.exists():
raise FileNotFoundError(f"找不到关键词文件: {keywords_file}")
@@ -546,7 +654,10 @@ def run_multiple_modes(
print(f"正在加载文本文件: {text_file}")
text_df = pd.read_excel(text_file)
print(f"文本列: {text_column}\n")
# 自动检测或验证文本列
actual_text_columns = detect_text_columns(text_df, text_columns)
print(f"使用文本列: {actual_text_columns}\n")
    # Validate the modes
    if not modes:
@@ -568,7 +679,7 @@ def run_multiple_modes(
            keywords_df=keywords_df,
            text_df=text_df,
            mode=mode_lower,
            text_column=text_column,
            text_columns=actual_text_columns,
            output_file=output_file,  # unused when save_to_file=False
            separator=separator,
            save_to_file=False  # do not save to a separate per-mode file
@@ -668,7 +779,7 @@ def parse_args():
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
    # Use the default configuration (both modes)
    # Use the default configuration (auto-detect the detected_text and 文本 columns)
    python keyword_matcher.py

    # Run CAS number recognition only
@@ -677,6 +788,12 @@ def parse_args():
    # Run exact matching only
    python keyword_matcher.py -m exact

    # Specify a single text column
    python keyword_matcher.py -c detected_text

    # Specify multiple text columns
    python keyword_matcher.py -c detected_text 文本 summary

    # Specify custom file paths
    python keyword_matcher.py -k ../data/input/keywords.xlsx -t ../data/input/text.xlsx
    """
@@ -701,10 +818,11 @@ def parse_args():
    )
    parser.add_argument(
        '-c', '--text-column',
        '-c', '--text-columns',
        nargs='+',
        type=str,
        default='文本',
        help='Text column name (default: 文本)'
        default=None,
        help='Text column names, multiple supported (default: auto-detect detected_text and 文本)'
    )
    parser.add_argument(
@@ -759,7 +877,7 @@ def main():
        keywords_file=keywords_file,
        text_file=text_file,
        output_file=output_file,
        text_column=args.text_column,
        text_columns=args.text_columns,
        modes=args.modes,
        separator=args.separator
    )

scripts/verify_high_confidence.py (new file, 517 lines)

@@ -0,0 +1,517 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
验证高置信度未匹配记录
功能:比对 keyword_matcher 结果与原始 Excel找出高置信度未匹配行调用 LLM 二次验证。
用法:
python3 verify_high_confidence.py -o original.xlsx -m matched.xlsx
python3 verify_high_confidence.py -o original.xlsx -m matched.xlsx --mock --limit 5
"""
import argparse
import json
import os
import sys
import time
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from pathlib import Path
from typing import List, Optional
import pandas as pd
# Optional dependencies
try:
    import openai
    HAS_OPENAI = True
except ImportError:
    HAS_OPENAI = False

try:
    import requests
    import urllib3
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
    HAS_REQUESTS = True
except ImportError:
    HAS_REQUESTS = False
# ========== Constants and configuration ==========
CONFIDENCE_LEVELS = ["High", "Medium"]
REQUEST_DELAY = 0.5

# Environment variable mapping: api_type -> (key_env, url_env, model_env, default_model)
ENV_MAPPING = {
    "openai": ("OPENAI_API_KEY", "OPENAI_BASE_URL", "OPENAI_MODEL", "gpt-4o-mini"),
    "dmx": ("DMX_API_KEY", "DMX_BASE_URL", "DMX_MODEL", "gpt-4o-mini"),
    "dify": ("DIFY_API_KEY", "DIFY_BASE_URL", "DIFY_MODEL", "dify-chatflow"),
    "ollama": (None, "OLLAMA_BASE_URL", "OLLAMA_MODEL", "qwen2.5:7b"),
}
SYSTEM_PROMPT = """你是一位化学品风险识别专家。请分析文本内容,判断是否涉及管制化学品、毒品前体或非法药物交易。
请以 JSON 格式回答,包含以下字段:
- is_risky: 布尔值,是否涉及风险
- substances: 数组,涉及的具体物质名称或 CAS 号
- risk_level: 字符串,风险等级(高/中/低)
- reason: 字符串,判定理由(简要)
示例输出:
{"is_risky": true, "substances": ["甲基苯丙胺", "CAS 537-46-2"], "risk_level": "", "reason": "文本中明确提到毒品名称和交易信息"}
"""
USER_PROMPT_TEMPLATE = """请分析以下内容是否涉及管制化学品或毒品:
【图片分析结果】
{raw_response}
【原始文本】
{original_text}
请以 JSON 格式输出分析结果。"""
# ========== Data classes ==========

@dataclass
class VerifyConfig:
    api_type: str = "openai"
    api_key: str = ""
    base_url: Optional[str] = None
    model: str = "gpt-4o-mini"
    user_id: str = "default-user"

@dataclass
class VerificationResult:
    is_risky: Optional[bool] = None
    substances: List[str] = field(default_factory=list)
    risk_level: str = ""
    reason: str = ""
    raw_response: str = ""

    def to_columns(self) -> dict:
        return {
            "llm_is_risky": self.is_risky,
            "llm_substances": " | ".join(self.substances) if self.substances else "",
            "llm_risk_level": self.risk_level,
            "llm_reason": self.reason,
            "llm_raw_response": self.raw_response,
        }
# ========== Utility functions ==========

def load_env_file(env_path: str) -> None:
    """Load environment variables from a .env file"""
    env_file = Path(env_path)
    if not env_file.exists():
        return
    print(f"Loading environment config: {env_file}")
    with open(env_file, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith("#"):
                continue
            if line.startswith("export "):
                line = line[7:]
            if "=" in line:
                key, _, value = line.partition("=")
                os.environ[key.strip()] = value.strip().strip('"').strip("'")
def get_config() -> VerifyConfig:
    """Build the verification config; VERIFY_-prefixed variables take priority"""
    api_type = (os.getenv("VERIFY_API_TYPE") or os.getenv("LLM_API_TYPE") or "openai").lower()
    mapping = ENV_MAPPING.get(api_type, (None, None, None, "gpt-4o-mini"))
    key_env, url_env, model_env, default_model = mapping
    return VerifyConfig(
        api_type=api_type,
        api_key=os.getenv("VERIFY_API_KEY") or (os.getenv(key_env) if key_env else "") or "",
        base_url=os.getenv("VERIFY_BASE_URL") or (os.getenv(url_env) if url_env else None),
        model=os.getenv("VERIFY_MODEL") or (os.getenv(model_env) if model_env else default_model) or default_model,
        user_id=os.getenv("VERIFY_USER_ID") or os.getenv("DIFY_USER_ID") or "default-user",
    )
def parse_json_response(content: str) -> dict:
    """Extract JSON from an LLM response (handles markdown code fences)"""
    # Strip markdown code fences
    if "```json" in content:
        start = content.find("```json") + 7
        end = content.find("```", start)
        content = content[start:end].strip()
    elif "```" in content:
        start = content.find("```") + 3
        end = content.find("```", start)
        content = content[start:end].strip()
    try:
        start = content.find("{")
        end = content.rfind("}") + 1
        if start >= 0 and end > start:
            return json.loads(content[start:end])
    except json.JSONDecodeError:
        pass
    return {"is_risky": None, "substances": [], "risk_level": "未知", "reason": "JSON parsing failed"}
def build_prompt(row: pd.Series, max_len: int = 3000) -> str:
    """Build the user prompt"""
    raw = str(row.get("raw_response", "") or "")
    text = str(row.get("文本", "") or "")
    if len(raw) > max_len:
        raw = raw[:max_len] + "...(截断)"
    if len(text) > max_len:
        text = text[:max_len] + "...(截断)"
    return USER_PROMPT_TEMPLATE.format(raw_response=raw, original_text=text)
# ========== Verifier classes ==========

class LLMVerifier(ABC):
    """Abstract base class for LLM verifiers"""

    @abstractmethod
    def verify(self, row: pd.Series) -> VerificationResult:
        pass
class OpenAIVerifier(LLMVerifier):
    """Verifier for OpenAI-compatible APIs (supports OpenAI, DMX, Ollama)"""

    def __init__(self, config: VerifyConfig):
        if not HAS_OPENAI:
            raise ImportError("Please install openai: pip install openai")
        if config.api_type != "ollama" and not config.api_key:
            raise ValueError("No API key provided")
        base_url = config.base_url
        if config.api_type == "ollama":
            base_url = (config.base_url or "http://localhost:11434") + "/v1"
        self.client = openai.OpenAI(
            api_key=config.api_key or "ollama",
            base_url=base_url,
        )
        self.model = config.model

    def verify(self, row: pd.Series) -> VerificationResult:
        prompt = build_prompt(row)
        try:
            response = self.client.chat.completions.create(
                model=self.model,
                messages=[
                    {"role": "system", "content": SYSTEM_PROMPT},
                    {"role": "user", "content": prompt},
                ],
                temperature=0.1,
                max_tokens=500,
            )
            content = response.choices[0].message.content or ""
            if response.choices[0].finish_reason != "stop":
                return VerificationResult(
                    risk_level="错误",
                    reason=f"Incomplete response (finish_reason={response.choices[0].finish_reason})",
                    raw_response=content,
                )
            parsed = parse_json_response(content)
            return VerificationResult(
                is_risky=parsed.get("is_risky"),
                substances=parsed.get("substances", []),
                risk_level=parsed.get("risk_level", ""),
                reason=parsed.get("reason", ""),
                raw_response=content,
            )
        except Exception as e:
            return VerificationResult(risk_level="错误", reason=f"API call failed: {e}", raw_response=str(e))
class DifyVerifier(LLMVerifier):
    """Dify API verifier"""

    def __init__(self, config: VerifyConfig):
        if not HAS_REQUESTS:
            raise ImportError("Please install requests: pip install requests")
        if not config.api_key:
            raise ValueError("No Dify API key provided")
        self.base_url = (config.base_url or "").rstrip("/")
        self.api_key = config.api_key
        self.user_id = config.user_id

    def verify(self, row: pd.Series) -> VerificationResult:
        prompt = f"{SYSTEM_PROMPT}\n\n{build_prompt(row)}"
        try:
            resp = requests.post(
                f"{self.base_url}/v1/chat-messages",
                headers={"Authorization": f"Bearer {self.api_key}", "Content-Type": "application/json"},
                json={"inputs": {}, "query": prompt, "response_mode": "blocking", "user": self.user_id},
                timeout=120,
                verify=False,
            )
            resp.raise_for_status()
            content = resp.json().get("answer", "")
            parsed = parse_json_response(content)
            return VerificationResult(
                is_risky=parsed.get("is_risky"),
                substances=parsed.get("substances", []),
                risk_level=parsed.get("risk_level", ""),
                reason=parsed.get("reason", ""),
                raw_response=content,
            )
        except Exception as e:
            return VerificationResult(risk_level="错误", reason=f"Dify call failed: {e}", raw_response=str(e))
class MockVerifier(LLMVerifier):
    """Mock verifier (for testing)"""

    RISK_KEYWORDS = [
        "毒品", "非法", "管制", "药物", "化学品", "CAS", "阿片", "芬太尼",
        "冰毒", "大麻", "可卡因", "海洛因", "摇头丸", "麻黄碱",
        "fentanyl", "methamphetamine", "cocaine", "heroin", "mdma",
        "ketamine", "lsd", "precursor", "controlled",
    ]

    def verify(self, row: pd.Series) -> VerificationResult:
        all_text = f"{row.get('raw_response', '')} {row.get('文本', '')}".lower()
        found = [kw for kw in self.RISK_KEYWORDS if kw.lower() in all_text]
        is_risky = len(found) > 0
        return VerificationResult(
            is_risky=is_risky,
            substances=found[:5],
            risk_level="高" if is_risky else "低",
            reason=f"Mock mode - found keywords: {found[:3]}" if is_risky else "Mock mode - no risk keywords found",
            raw_response="(mock)",
        )
def create_verifier(config: VerifyConfig) -> LLMVerifier:
    """Create a verifier from the configuration"""
    if config.api_type == "mock":
        return MockVerifier()
    elif config.api_type == "dify":
        return DifyVerifier(config)
    elif config.api_type in ("openai", "dmx", "ollama"):
        return OpenAIVerifier(config)
    else:
        raise ValueError(f"Unsupported API type: {config.api_type}")
# ========== Data processing ==========

def load_excel(file_path: Path) -> pd.DataFrame:
    """Load an Excel file"""
    if not file_path.exists():
        raise FileNotFoundError(f"File does not exist: {file_path}")
    return pd.read_excel(file_path)

def find_unmatched(
    original_df: pd.DataFrame,
    matched_df: pd.DataFrame,
    confidence_col: str = "confidence",
    confidence_levels: List[str] = None,
) -> pd.DataFrame:
    """Find high-confidence rows that were not matched by keywords"""
    levels = confidence_levels or CONFIDENCE_LEVELS
    if confidence_col not in original_df.columns:
        print(f"Warning: column '{confidence_col}' does not exist in the original file")
        print(f"Available columns: {original_df.columns.tolist()}")
        return pd.DataFrame()
    # Indices of high-confidence rows
    conf_lower = original_df[confidence_col].astype(str).str.lower()
    levels_lower = [l.lower() for l in levels]
    high_conf_idx = set(original_df[conf_lower.isin(levels_lower)].index)
    matched_idx = set(matched_df.index)
    unmatched_idx = high_conf_idx - matched_idx
    # Statistics
    print(f"\n{'='*50}")
    print("Data comparison statistics")
    print(f"{'='*50}")
    print(f"Total rows in original data: {len(original_df)}")
    print(f"High-confidence ({'/'.join(levels)}) rows: {len(high_conf_idx)}")
    print(f"Rows matched by keywords: {len(matched_idx)}")
    print(f"High-confidence and matched: {len(high_conf_idx & matched_idx)}")
    print(f"High-confidence but unmatched (to verify): {len(unmatched_idx)}")
    print(f"{'='*50}\n")
    if not unmatched_idx:
        return pd.DataFrame()
    return original_df.loc[list(unmatched_idx)].copy()
def verify_batch(df: pd.DataFrame, verifier: LLMVerifier, delay: float = REQUEST_DELAY, limit: int = 0) -> pd.DataFrame:
    """Verify records in batch"""
    if limit > 0:
        df = df.head(limit)
    total = len(df)
    print(f"Starting LLM verification of {total} record(s)...")
    print("-" * 50)
    results = []
    start_time = time.time()
    for i, (idx, row) in enumerate(df.iterrows()):
        if (i + 1) % 10 == 0 or i == 0 or i == total - 1:
            elapsed = time.time() - start_time
            speed = (i + 1) / elapsed if elapsed > 0 else 0
            print(f"Progress: {i + 1}/{total} ({(i+1)/total*100:.1f}%) - speed: {speed:.1f} records/s")
        result = verifier.verify(row)
        results.append({"original_index": idx, **result.to_columns()})
        if delay > 0 and i < total - 1:
            time.sleep(delay)
    results_df = pd.DataFrame(results).set_index("original_index")
    verified_df = df.copy()
    for col in results_df.columns:
        verified_df[col] = results_df[col]
    return verified_df
# ========== Result output ==========

def save_results(df: pd.DataFrame, output_file: Path, risky_only: bool = False) -> None:
    """Save the results"""
    if risky_only and "llm_is_risky" in df.columns:
        df = df[df["llm_is_risky"] == True]
    df.to_excel(output_file, index=False, engine="openpyxl")
    print(f"\nSaved {len(df)} record(s) to: {output_file}")

def print_summary(df: pd.DataFrame) -> None:
    """Print a verification summary"""
    print(f"\n{'='*50}")
    print("Verification summary")
    print(f"{'='*50}")
    total = len(df)
    if "llm_is_risky" not in df.columns:
        print(f"Total records: {total}")
        return
    risky = (df["llm_is_risky"] == True).sum()
    not_risky = (df["llm_is_risky"] == False).sum()
    unknown = total - risky - not_risky
    print(f"Total verified: {total}")
    print(f" ├─ LLM verdict risky: {risky} ({risky/total*100:.1f}%)")
    print(f" ├─ LLM verdict not risky: {not_risky} ({not_risky/total*100:.1f}%)")
    if unknown > 0:
        print(f" └─ failed/unknown: {unknown}")
    if "llm_risk_level" in df.columns:
        print(f"\nRisk level distribution:")
        for level, count in df["llm_risk_level"].value_counts().items():
            print(f" - {level}: {count}")
    print(f"{'='*50}")
# ========== CLI ==========
def parse_args():
    parser = argparse.ArgumentParser(
        description="Verify high-confidence unmatched records",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
    python3 verify_high_confidence.py -o original.xlsx -m matched.xlsx
    python3 verify_high_confidence.py -o original.xlsx -m matched.xlsx --mock --limit 5
    python3 verify_high_confidence.py -o original.xlsx -m matched.xlsx --api dmx --model gpt-4o-mini
""",
    )
    parser.add_argument("-o", "--original", required=True, help="path to the original Excel file")
    parser.add_argument("-m", "--matched", required=True, help="path to the keyword_matcher result file")
    parser.add_argument("-r", "--result", help="output file path (default: {original name}_llm_verified.xlsx)")
    parser.add_argument("--env-file", help="path to the env file (default: ../.env)")
    parser.add_argument("--api", choices=["openai", "dmx", "dify", "ollama"], help="LLM API type")
    parser.add_argument("--model", help="LLM model name")
    parser.add_argument("--base-url", help="API base URL")
    parser.add_argument("--api-key", help="API key")
    parser.add_argument("--mock", action="store_true", help="use mock mode (no API calls)")
    parser.add_argument("--confidence", nargs="+", default=["High", "Medium"], help="confidence levels to verify")
    parser.add_argument("--confidence-col", default="confidence", help="confidence column name")
    parser.add_argument("--delay", type=float, default=REQUEST_DELAY, help="delay between API requests, in seconds")
    parser.add_argument("--limit", type=int, default=0, help="max records to verify (0 = all)")
    parser.add_argument("--risky-only", action="store_true", help="save only records judged risky")
    return parser.parse_args()
def main():
    args = parse_args()
    # Load .env
    base_dir = Path(__file__).resolve().parent
    env_file = args.env_file or str(base_dir.parent / ".env")
    load_env_file(env_file)
    # Get the configuration
    config = get_config()
    # Command-line overrides
    if args.mock:
        config.api_type = "mock"
    elif args.api:
        config.api_type = args.api
    if args.model:
        config.model = args.model
    if args.base_url:
        config.base_url = args.base_url
    if args.api_key:
        config.api_key = args.api_key
    # File paths
    original_file = Path(args.original)
    matched_file = Path(args.matched)
    result_file = Path(args.result) if args.result else original_file.parent / f"{original_file.stem}_llm_verified.xlsx"
    print("=" * 60)
    print("High-confidence unmatched record verification")
    print("=" * 60)
    print(f"Original file: {original_file}")
    print(f"Matched results: {matched_file}")
    print(f"Output file: {result_file}")
    print(f"Confidence levels: {args.confidence}")
    print(f"API type: {config.api_type}")
    print(f"Model: {config.model}")
    if config.base_url:
        print(f"Base URL: {config.base_url}")
    # Load the data
    print("\nLoading data...")
    original_df = load_excel(original_file)
    matched_df = load_excel(matched_file)
    # Find unmatched high-confidence rows
    unmatched_df = find_unmatched(original_df, matched_df, args.confidence_col, args.confidence)
    if unmatched_df.empty:
        print("\nAll high-confidence rows were matched by keywords; nothing to verify.")
        return
    # Create the verifier
    try:
        verifier = create_verifier(config)
    except (ImportError, ValueError) as e:
        print(f"\nError: {e}")
        sys.exit(1)
    # Run verification
    verified_df = verify_batch(unmatched_df, verifier, delay=args.delay, limit=args.limit)
    # Print the summary and save
    print_summary(verified_df)
    save_results(verified_df, result_file, args.risky_only)
    print("\n✓ Verification complete!")

if __name__ == "__main__":
    main()