fix: update keywords_match
This commit is contained in:
50
.env.example
Normal file
50
.env.example
Normal file
@@ -0,0 +1,50 @@
|
|||||||
|
# ========================================
|
||||||
|
# 环境变量配置示例
|
||||||
|
# 复制此文件为 .env 并填入实际值
|
||||||
|
# ========================================
|
||||||
|
|
||||||
|
# 默认 LLM API 类型 (openai | dmx | dify | ollama)
|
||||||
|
export LLM_API_TYPE="dify"
|
||||||
|
|
||||||
|
# ========== OpenAI API 配置 ==========
|
||||||
|
# export OPENAI_API_KEY="sk-your-openai-api-key"
|
||||||
|
# export OPENAI_BASE_URL="https://api.openai.com/v1"
|
||||||
|
# export OPENAI_MODEL="gpt-4o-mini"
|
||||||
|
|
||||||
|
# ========== DMX API 配置 (OpenAI 兼容) ==========
|
||||||
|
export DMX_API_KEY="your-dmx-api-key"
|
||||||
|
export DMX_BASE_URL="https://www.dmxapi.cn"
|
||||||
|
export DMX_MODEL="gpt-4o-mini"
|
||||||
|
|
||||||
|
# ========== Dify API 配置 ==========
|
||||||
|
# image_batch_recognizer.py 默认使用
|
||||||
|
export DIFY_API_KEY="app-your-dify-api-key"
|
||||||
|
export DIFY_BASE_URL="https://your-dify-server:4433"
|
||||||
|
export DIFY_USER_ID="default-user"
|
||||||
|
export DIFY_MODEL="dify-chatflow"
|
||||||
|
|
||||||
|
# ========== Ollama 本地模型配置 ==========
|
||||||
|
# export OLLAMA_BASE_URL="http://localhost:11434"
|
||||||
|
# export OLLAMA_MODEL="qwen2.5:7b"
|
||||||
|
|
||||||
|
# ========== Anthropic API 配置 ==========
|
||||||
|
# export ANTHROPIC_API_KEY="sk-ant-your-anthropic-api-key"
|
||||||
|
# export ANTHROPIC_BASE_URL="https://api.anthropic.com"
|
||||||
|
# export ANTHROPIC_MODEL="claude-3-5-sonnet-20241022"
|
||||||
|
|
||||||
|
# ========================================
|
||||||
|
# verify_high_confidence.py 独立配置
|
||||||
|
# ========================================
|
||||||
|
# 使用 VERIFY_ 前缀,与 image_batch_recognizer.py 区分
|
||||||
|
# 如果不设置,会回退到上面的通用配置
|
||||||
|
|
||||||
|
# VERIFY_API_TYPE=dmx
|
||||||
|
# VERIFY_API_KEY=your-api-key
|
||||||
|
# VERIFY_BASE_URL=https://api.example.com
|
||||||
|
# VERIFY_MODEL=gpt-4o-mini
|
||||||
|
# VERIFY_USER_ID=default-user
|
||||||
|
|
||||||
|
# ========================================
|
||||||
|
# 自定义 Prompt (可选)
|
||||||
|
# ========================================
|
||||||
|
# export VISION_ANALYSIS_PROMPT="你的自定义图片分析提示词..."
|
||||||
320
CLAUDE.md
320
CLAUDE.md
@@ -7,9 +7,10 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co
|
|||||||
This is a drug risk monitoring and data processing system for detecting controlled substances in text and image data from e-commerce platforms, darknet sources, and social media.
|
This is a drug risk monitoring and data processing system for detecting controlled substances in text and image data from e-commerce platforms, darknet sources, and social media.
|
||||||
|
|
||||||
**Core Capabilities:**
|
**Core Capabilities:**
|
||||||
1. **CAS Number Matching**: Extract and match chemical CAS numbers from text using regex patterns (supports multiple formats)
|
1. **Image Recognition**: Batch image analysis using LLM APIs (OpenAI, Anthropic, DMX, Dify) for OCR and risk detection
|
||||||
2. **Keyword Matching**: High-performance multi-mode keyword matching (fuzzy, CAS)
|
2. **Keyword Matching**: Multi-mode keyword matching with CAS number extraction and exact matching
|
||||||
3. **Keyword Expansion**: LLM-powered expansion of chemical/drug names to include variants, abbreviations, and aliases
|
3. **LLM Verification**: Secondary verification of high-confidence unmatched records using LLM
|
||||||
|
4. **Data Collection**: Merge and consolidate results from batch processing
|
||||||
|
|
||||||
## Running Scripts
|
## Running Scripts
|
||||||
|
|
||||||
@@ -18,28 +19,24 @@ All scripts must be run from the `scripts/` directory:
|
|||||||
```bash
|
```bash
|
||||||
cd scripts/
|
cd scripts/
|
||||||
|
|
||||||
# Quick start (recommended for testing)
|
# Image batch recognition (mock mode for testing)
|
||||||
python3 quick_start.py
|
python3 image_batch_recognizer.py --mock --limit 5
|
||||||
|
|
||||||
# CAS number matching
|
# Image recognition with API
|
||||||
python3 match_cas_numbers.py
|
python3 image_batch_recognizer.py --api-type dify --limit 10
|
||||||
|
|
||||||
# Multi-mode keyword matching (default: both modes)
|
# Collect and merge xlsx files from batch output
|
||||||
|
python3 collect_xlsx.py
|
||||||
|
|
||||||
|
# Multi-mode keyword matching (default: cas + exact)
|
||||||
python3 keyword_matcher.py
|
python3 keyword_matcher.py
|
||||||
|
|
||||||
# Single mode matching
|
# Single mode matching
|
||||||
python3 keyword_matcher.py -m cas # CAS number only
|
python3 keyword_matcher.py -m cas # CAS number only
|
||||||
python3 keyword_matcher.py -m fuzzy --threshold 90 # Fuzzy matching only
|
python3 keyword_matcher.py -m exact # Exact matching only
|
||||||
|
|
||||||
# Use larger keyword database
|
# Verify high-confidence unmatched records
|
||||||
python3 keyword_matcher.py -k ../data/input/keyword_all.xlsx
|
python3 verify_high_confidence.py -o original.xlsx -m matched.xlsx --mock
|
||||||
|
|
||||||
# Keyword expansion (mock mode, no API)
|
|
||||||
python3 expand_keywords_with_llm.py -m
|
|
||||||
|
|
||||||
# Keyword expansion (with OpenAI API)
|
|
||||||
export OPENAI_API_KEY="sk-..."
|
|
||||||
python3 expand_keywords_with_llm.py ../data/input/keywords.xlsx
|
|
||||||
```
|
```
|
||||||
|
|
||||||
## Dependencies
|
## Dependencies
|
||||||
@@ -49,221 +46,130 @@ python3 expand_keywords_with_llm.py ../data/input/keywords.xlsx
|
|||||||
pip install pandas openpyxl
|
pip install pandas openpyxl
|
||||||
```
|
```
|
||||||
|
|
||||||
**Optional (for fuzzy keyword matching):**
|
**Optional:**
|
||||||
```bash
|
```bash
|
||||||
pip install rapidfuzz
|
pip install pyahocorasick # 5x faster exact matching
|
||||||
|
pip install requests # Required for Dify API
|
||||||
|
pip install tqdm # Progress bars
|
||||||
|
pip install openai # For OpenAI-compatible APIs in verify script
|
||||||
```
|
```
|
||||||
|
|
||||||
**Optional (for LLM keyword expansion):**
|
## Environment Configuration
|
||||||
|
|
||||||
|
Copy `.env.example` to `.env` and configure API keys:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
pip install openai anthropic
|
# Default API type (openai | dmx | dify | ollama)
|
||||||
|
LLM_API_TYPE="dify"
|
||||||
|
|
||||||
|
# DMX API (OpenAI compatible)
|
||||||
|
DMX_API_KEY="your-key"
|
||||||
|
DMX_BASE_URL="https://www.dmxapi.cn"
|
||||||
|
DMX_MODEL="gpt-4o-mini"
|
||||||
|
|
||||||
|
# Dify API (used by image_batch_recognizer.py)
|
||||||
|
DIFY_API_KEY="app-xxx"
|
||||||
|
DIFY_BASE_URL="https://your-dify-server:4433"
|
||||||
|
DIFY_USER_ID="default-user"
|
||||||
|
|
||||||
|
# Separate config for verify_high_confidence.py (VERIFY_ prefix)
|
||||||
|
VERIFY_API_TYPE="dmx"
|
||||||
|
VERIFY_API_KEY="your-key"
|
||||||
|
VERIFY_BASE_URL="https://api.example.com"
|
||||||
|
VERIFY_MODEL="gpt-4o-mini"
|
||||||
```
|
```
|
||||||
|
|
||||||
## Data Flow Architecture
|
## Data Flow Architecture
|
||||||
|
|
||||||
All scripts use relative paths from `scripts/` directory:
|
|
||||||
|
|
||||||
```
|
```
|
||||||
Input: ../data/input/
|
data/
|
||||||
clickin_text_img.xlsx (2779 rows: text + image paths)
|
├── input/ # Source data
|
||||||
keywords.xlsx (22 rows, basic keyword list)
|
│ ├── clickin_text_img.xlsx # Text + image paths
|
||||||
keyword_all.xlsx (1659 rows, 1308 unique CAS numbers)
|
│ └── keywords.xlsx # Keyword database
|
||||||
|
├── images/ # Image files for recognition
|
||||||
Output: ../data/output/
|
├── batch_output/ # Per-folder recognition results
|
||||||
keyword_matched_results.xlsx (multi-mode merged results)
|
│ └── {name}/results.xlsx
|
||||||
cas_matched_results_final.xlsx
|
├── data_all/ # Original data by source
|
||||||
test_keywords_expanded_rows.xlsx
|
│ └── {name}_text_img.xlsx
|
||||||
|
├── collected_xlsx/ # Merged results (collect_xlsx.py output)
|
||||||
Images: ../data/images/ (1955 JPG files, 84MB)
|
└── output/ # Final processed results
|
||||||
```
|
```
|
||||||
|
|
||||||
**Processing Pipeline:**
|
**Processing Pipeline:**
|
||||||
```
|
```
|
||||||
Raw data collection -> Text extraction (OCR/LLM) ->
|
1. image_batch_recognizer.py → batch_output/{name}/results.xlsx
|
||||||
Feature matching (CAS/keywords) -> Data cleaning ->
|
2. collect_xlsx.py → Merge results.xlsx with {name}_text_img.xlsx → collected_xlsx/
|
||||||
Risk determination
|
3. keyword_matcher.py → Match keywords in text → output/keyword_matched_results.xlsx
|
||||||
|
4. verify_high_confidence.py → LLM verify unmatched high-confidence → *_llm_verified.xlsx
|
||||||
```
|
```
|
||||||
|
|
||||||
## Key Technical Details
|
## Key Scripts
|
||||||
|
|
||||||
### 1. CAS Number Matching (`match_cas_numbers.py`)
|
### keyword_matcher.py
|
||||||
- Supports multiple formats: `123-45-6`, `123 45 6`, `123 - 45 - 6`
|
|
||||||
- Auto-normalizes to standard format `XXX-XX-X`
|
|
||||||
- Uses regex pattern: `\b\d{2,7}[\s\-]+\d{2}[\s\-]+\d\b`
|
|
||||||
- Dual-mode: `"regex"` for CAS matching, `"keywords"` for keyword matching
|
|
||||||
|
|
||||||
### 2. Keyword Matching (`keyword_matcher.py`) - REFACTORED
|
Two detection modes with Strategy Pattern architecture:
|
||||||
|
|
||||||
**Architecture:**
|
1. **CAS Number Recognition (`-m cas`)**
|
||||||
- Strategy Pattern with `KeywordMatcher` base class
|
- Regex pattern: `\b(\d{2,7})[\s\-._]?(\d{2})[\s\-._]?(\d)\b`
|
||||||
- Concrete matchers: `CASRegexMatcher`, `FuzzyMatcher`
|
- Supports formats: `123-45-6`, `123 45 6`, `123456`, `123.45.6`
|
||||||
- Factory Pattern for matcher creation
|
- Auto-normalizes to standard `XXX-XX-X` format
|
||||||
- Dataclass-based result handling
|
- Source column: `CAS号`
|
||||||
|
|
||||||
**Two Detection Modes:**
|
2. **Exact Matching (`-m exact`)**
|
||||||
|
- Uses Aho-Corasick automaton (if pyahocorasick installed) or regex with word boundaries
|
||||||
1. **CAS Number Recognition (CAS号识别)**
|
|
||||||
- Uses `CASRegexMatcher` with comprehensive regex pattern
|
|
||||||
- Supports formats: `123-45-6`, `123 45 6`, `12345 6`, `123456`, `123.45.6`, `123_45_6`
|
|
||||||
- Auto-normalizes all formats to standard `XXX-XX-X`
|
|
||||||
- Regex: `\b(\d{2,7})[\s\-._]?(\d{2})[\s\-._]?(\d)\b`
|
|
||||||
- Extracts CAS from text, normalizes, compares with keyword database
|
|
||||||
- Source columns: `CAS号`
|
|
||||||
|
|
||||||
2. **Fuzzy Matching (模糊匹配)**
|
|
||||||
- Uses `FuzzyMatcher` with RapidFuzz library
|
|
||||||
- Default threshold: 85 (configurable via `--threshold`)
|
|
||||||
- Scoring function: `partial_ratio`
|
|
||||||
- Source columns: `中文名`, `英文名`, `CAS号`, `简称`, `可能名称`
|
- Source columns: `中文名`, `英文名`, `CAS号`, `简称`, `可能名称`
|
||||||
- **Note**: Fuzzy matching covers all cases that exact matching would find, making exact mode redundant
|
|
||||||
|
|
||||||
**Multi-Mode Result Merging:**
|
**Multi-column text matching:**
|
||||||
- Automatically merges results from multiple modes
|
- Automatically detects and combines `detected_text` and `文本` columns
|
||||||
- Deduplicates by row index
|
- Use `-c col1 col2` to specify custom columns
|
||||||
- Combines matched keywords with ` | ` separator
|
|
||||||
- Adds `匹配模式` column showing which modes matched (e.g., "CAS号识别 + 模糊匹配")
|
|
||||||
|
|
||||||
**Command-Line Options:**
|
|
||||||
```bash
|
|
||||||
-k, --keywords # Path to keywords file (default: ../data/input/keywords.xlsx)
|
|
||||||
-t, --text # Path to text file (default: ../data/input/clickin_text_img.xlsx)
|
|
||||||
-o, --output # Output file path (default: ../data/output/keyword_matched_results.xlsx)
|
|
||||||
-c, --text-column # Column containing text to search (default: "文本")
|
|
||||||
-m, --modes # Modes to run: cas, fuzzy (default: both)
|
|
||||||
--threshold # Fuzzy matching threshold 0-100 (default: 85)
|
|
||||||
--separator # Keyword separator in cells (default: "|||")
|
|
||||||
```
|
|
||||||
|
|
||||||
**Performance:**
|
|
||||||
- With keyword_all.xlsx (1308 CAS numbers):
|
|
||||||
- CAS mode: 255 rows matched (9.18%)
|
|
||||||
- Fuzzy mode: 513 rows matched (18.46%)
|
|
||||||
- Merged (both modes): ~516 unique rows
|
|
||||||
|
|
||||||
**Uses `|||` separator:**
|
|
||||||
- Chemical names contain commas, hyphens, slashes, semicolons
|
|
||||||
- Triple pipe avoids conflicts with chemical nomenclature
|
|
||||||
- Example: `甲基苯丙胺|||冰毒|||Methamphetamine|||MA`
|
|
||||||
|
|
||||||
### 3. Keyword Expansion (`expand_keywords_with_llm.py`)
|
|
||||||
- Expands Chinese names, English names, abbreviations
|
|
||||||
- Supports OpenAI and Anthropic APIs
|
|
||||||
- Mock mode available for testing without API costs
|
|
||||||
- Output formats: compact (single row with `|||` separators) or expanded (one name per row)
|
|
||||||
|
|
||||||
## Configuration Patterns
|
|
||||||
|
|
||||||
Scripts use command-line arguments (keyword_matcher.py) or in-file configuration blocks:
|
|
||||||
|
|
||||||
```python
|
|
||||||
# ========== Configuration ==========
|
|
||||||
keywords_file = "../data/input/keywords.xlsx"
|
|
||||||
text_file = "../data/input/clickin_text_img.xlsx"
|
|
||||||
keywords_column = "中文名"
|
|
||||||
text_column = "文本"
|
|
||||||
separator = "|||"
|
|
||||||
output_file = "../data/output/results.xlsx"
|
|
||||||
# =============================
|
|
||||||
```
|
|
||||||
|
|
||||||
## Excel File Schemas
|
|
||||||
|
|
||||||
**Input - clickin_text_img.xlsx:**
|
|
||||||
- Columns: `文本` (text), image paths, metadata
|
|
||||||
- 2779 rows of scraped e-commerce/social media data
|
|
||||||
|
|
||||||
**Input - keywords.xlsx:**
|
|
||||||
- Columns: `中文名`, `英文名`, `CAS号`, `简称`, `备注`, `可能名称`
|
|
||||||
- `可能名称` contains multiple keywords separated by `|||`
|
|
||||||
- 22 rows (small test dataset)
|
|
||||||
|
|
||||||
**Input - keyword_all.xlsx:**
|
|
||||||
- Same schema as keywords.xlsx
|
|
||||||
- 1659 rows with 1308 unique CAS numbers
|
|
||||||
- Production keyword database
|
|
||||||
|
|
||||||
**Output - Multi-mode matched (keyword_matched_results.xlsx):**
|
|
||||||
- Adds columns:
|
|
||||||
- `匹配到的关键词` (matched keywords, separated by ` | `)
|
|
||||||
- `匹配模式` (matching modes, e.g., "CAS号识别 + 模糊匹配")
|
|
||||||
- Preserves all original columns
|
|
||||||
- Deduplicated across all modes
|
|
||||||
|
|
||||||
**Output - CAS matched:**
|
|
||||||
- Adds column: `匹配到的CAS号` (matched CAS numbers)
|
|
||||||
- Preserves all original columns
|
|
||||||
- Typical match rate: ~9-11% (255-303/2779 rows)
|
|
||||||
|
|
||||||
## Common Modifications
|
|
||||||
|
|
||||||
**To change input/output paths:**
|
|
||||||
Use command-line arguments for `keyword_matcher.py`:
|
|
||||||
```bash
|
|
||||||
python3 keyword_matcher.py -k /path/to/keywords.xlsx -t /path/to/text.xlsx -o /path/to/output.xlsx
|
|
||||||
```
|
|
||||||
|
|
||||||
Or edit the configuration block in other scripts' `main()` function.
|
|
||||||
|
|
||||||
**To switch between CAS and keyword matching:**
|
|
||||||
In `match_cas_numbers.py`, change `match_mode = "regex"` to `match_mode = "keywords"`.
|
|
||||||
|
|
||||||
In `keyword_matcher.py`, use `-m` flag:
|
|
||||||
```bash
|
|
||||||
python3 keyword_matcher.py -m cas # CAS only
|
|
||||||
python3 keyword_matcher.py -m fuzzy # Fuzzy only
|
|
||||||
```
|
|
||||||
|
|
||||||
**To adjust fuzzy matching sensitivity:**
|
|
||||||
```bash
|
|
||||||
python3 keyword_matcher.py -m fuzzy --threshold 90 # Stricter (fewer matches)
|
|
||||||
python3 keyword_matcher.py -m fuzzy --threshold 70 # More lenient (more matches)
|
|
||||||
```
|
|
||||||
|
|
||||||
**To use different LLM APIs:**
|
|
||||||
```bash
|
|
||||||
# OpenAI (default)
|
|
||||||
python3 expand_keywords_with_llm.py input.xlsx
|
|
||||||
|
|
||||||
# Anthropic
|
|
||||||
python3 expand_keywords_with_llm.py input.xlsx -a anthropic
|
|
||||||
```
|
|
||||||
|
|
||||||
## Code Architecture Highlights
|
|
||||||
|
|
||||||
### keyword_matcher.py Design Patterns
|
|
||||||
|
|
||||||
1. **Strategy Pattern**: Different matching algorithms (`KeywordMatcher` subclasses)
|
|
||||||
2. **Template Method**: Common matching workflow in base class `match()` method
|
|
||||||
3. **Factory Pattern**: `create_matcher()` selects appropriate matcher
|
|
||||||
4. **Dependency Injection**: Optional dependency (rapidfuzz) handled gracefully
|
|
||||||
|
|
||||||
**Class Hierarchy:**
|
**Class Hierarchy:**
|
||||||
```
|
```
|
||||||
KeywordMatcher (ABC)
|
KeywordMatcher (ABC)
|
||||||
├── CASRegexMatcher # Regex-based CAS number extraction
|
├── CASRegexMatcher # Regex CAS extraction + normalization
|
||||||
└── FuzzyMatcher # RapidFuzz partial_ratio matching
|
├── RegexExactMatcher # Word-boundary exact matching
|
||||||
|
├── AhoCorasickMatcher # Fast multi-pattern matching
|
||||||
|
└── SetMatcher # Simple substring matching
|
||||||
```
|
```
|
||||||
|
|
||||||
**Data Flow:**
|
### verify_high_confidence.py
|
||||||
```
|
|
||||||
1. Load keywords -> load_keywords_for_mode()
|
|
||||||
2. Create matcher -> create_matcher()
|
|
||||||
3. Match text -> matcher.match()
|
|
||||||
├── _prepare() (build automaton, etc.)
|
|
||||||
└── For each row:
|
|
||||||
├── _match_single_text()
|
|
||||||
└── _format_matches()
|
|
||||||
4. Save results -> save_results()
|
|
||||||
5. If multiple modes -> merge_mode_results()
|
|
||||||
```
|
|
||||||
|
|
||||||
## Data Sensitivity
|
Compares keyword_matcher output with original data to find high-confidence rows that weren't matched, then uses LLM for secondary verification.
|
||||||
|
|
||||||
This codebase handles sensitive data related to controlled substances monitoring. The data includes:
|
- Uses `VERIFY_` prefixed env vars (separate from image_batch_recognizer.py)
|
||||||
- Chemical compound names (Chinese and English)
|
- Supports: OpenAI, DMX, Dify, Ollama, Mock modes
|
||||||
- CAS registry numbers
|
- Input columns: `raw_response`, `文本`
|
||||||
- Image data from suspected illegal substance trading platforms
|
|
||||||
- All data is for legitimate law enforcement/research purposes
|
|
||||||
|
|
||||||
Do not commit actual data files or API keys to version control.
|
### collect_xlsx.py
|
||||||
- to memorize
|
|
||||||
|
Merges batch recognition results with original data:
|
||||||
|
- Matches by image filename (handles both Windows `\` and Unix `/` paths)
|
||||||
|
- Adds original columns (`文本`, metadata) to recognition results
|
||||||
|
|
||||||
|
### image_batch_recognizer.py
|
||||||
|
|
||||||
|
Batch image recognition with multiple API backends:
|
||||||
|
- Supports: OpenAI, Anthropic, DMX, Dify, Mock
|
||||||
|
- Outputs: `detected_text`, `detected_objects`, `sensitive_items`, `summary`, `confidence`
|
||||||
|
- Parallel processing with `--max-workers`
|
||||||
|
|
||||||
|
## Excel File Schemas
|
||||||
|
|
||||||
|
**keywords.xlsx columns:**
|
||||||
|
- `中文名`, `英文名`, `CAS号`, `简称`, `备注`, `可能名称`
|
||||||
|
- `可能名称` uses `|||` separator for multiple values
|
||||||
|
|
||||||
|
**Recognition output columns:**
|
||||||
|
- `image_name`, `image_path`, `detected_text`, `detected_objects`
|
||||||
|
- `sensitive_items`, `summary`, `confidence`, `raw_response`
|
||||||
|
|
||||||
|
**Matched output adds:**
|
||||||
|
- `匹配到的关键词` (matched keywords, ` | ` separated)
|
||||||
|
- `匹配模式` (e.g., "CAS号识别 + 精确匹配")
|
||||||
|
|
||||||
|
## Key Conventions
|
||||||
|
|
||||||
|
- Triple pipe `|||` separator in keyword cells (avoids conflicts with chemical names)
|
||||||
|
- Match result separator: ` | `
|
||||||
|
- All scripts use relative paths from `scripts/` directory
|
||||||
|
- Configuration priority: command-line args > VERIFY_* env > general env > defaults
|
||||||
|
|||||||
204
README.md
204
README.md
@@ -18,6 +18,8 @@ python3 keyword_matcher.py
|
|||||||
python3 image_batch_recognizer.py --mock --limit 5
|
python3 image_batch_recognizer.py --mock --limit 5
|
||||||
```
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
## 功能 1:关键词匹配
|
## 功能 1:关键词匹配
|
||||||
|
|
||||||
从文本中识别 CAS 号和关键词。
|
从文本中识别 CAS 号和关键词。
|
||||||
@@ -45,6 +47,9 @@ python3 keyword_matcher.py \
|
|||||||
-k ../data/input/keyword_all.xlsx \
|
-k ../data/input/keyword_all.xlsx \
|
||||||
-t ../data/input/clickin_text_img.xlsx \
|
-t ../data/input/clickin_text_img.xlsx \
|
||||||
-o ../data/output/results.xlsx
|
-o ../data/output/results.xlsx
|
||||||
|
|
||||||
|
# 指定多个文本列
|
||||||
|
python3 keyword_matcher.py -c detected_text 文本
|
||||||
```
|
```
|
||||||
|
|
||||||
### 参数说明
|
### 参数说明
|
||||||
@@ -54,19 +59,15 @@ python3 keyword_matcher.py \
|
|||||||
| `-k, --keywords` | 关键词文件 | `../data/input/keywords.xlsx` |
|
| `-k, --keywords` | 关键词文件 | `../data/input/keywords.xlsx` |
|
||||||
| `-t, --text` | 文本文件 | `../data/input/clickin_text_img.xlsx` |
|
| `-t, --text` | 文本文件 | `../data/input/clickin_text_img.xlsx` |
|
||||||
| `-o, --output` | 输出文件 | `../data/output/keyword_matched_results.xlsx` |
|
| `-o, --output` | 输出文件 | `../data/output/keyword_matched_results.xlsx` |
|
||||||
| `-c, --text-column` | 文本列名 | `文本` |
|
| `-c, --text-columns` | 文本列名(支持多列) | 自动检测 `detected_text` 和 `文本` |
|
||||||
| `-m, --modes` | 匹配模式 | `cas exact` |
|
| `-m, --modes` | 匹配模式 | `cas exact` |
|
||||||
| `--separator` | 关键词分隔符 | `\|\|\|` |
|
| `--separator` | 关键词分隔符 | `\|\|\|` |
|
||||||
|
|
||||||
### 输出说明
|
### 输出说明
|
||||||
|
|
||||||
每种模式生成独立文件:
|
输出合并后的匹配结果文件,包含以下列:
|
||||||
- `keyword_matched_results_cas.xlsx` - CAS 号匹配结果
|
- `匹配到的关键词` - 匹配的关键词列表(` | ` 分隔)
|
||||||
- `keyword_matched_results_exact.xlsx` - 精确匹配结果
|
- `匹配模式` - 使用的匹配模式(如 "CAS号识别 + 精确匹配")
|
||||||
|
|
||||||
输出列:
|
|
||||||
- `匹配到的关键词` - 匹配的关键词列表
|
|
||||||
- `匹配模式` - 使用的匹配模式
|
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
@@ -82,6 +83,9 @@ cd scripts/
|
|||||||
# 模拟模式(无需 API,用于测试)
|
# 模拟模式(无需 API,用于测试)
|
||||||
python3 image_batch_recognizer.py --mock --limit 5
|
python3 image_batch_recognizer.py --mock --limit 5
|
||||||
|
|
||||||
|
# 使用 Dify API
|
||||||
|
python3 image_batch_recognizer.py --api-type dify --limit 10
|
||||||
|
|
||||||
# 使用 OpenAI API
|
# 使用 OpenAI API
|
||||||
python3 image_batch_recognizer.py --api-type openai --limit 10
|
python3 image_batch_recognizer.py --api-type openai --limit 10
|
||||||
|
|
||||||
@@ -89,7 +93,7 @@ python3 image_batch_recognizer.py --api-type openai --limit 10
|
|||||||
python3 image_batch_recognizer.py --api-type dmx --limit 10
|
python3 image_batch_recognizer.py --api-type dmx --limit 10
|
||||||
|
|
||||||
# 并行处理
|
# 并行处理
|
||||||
python3 image_batch_recognizer.py --api-type openai --max-workers 3
|
python3 image_batch_recognizer.py --api-type dify --max-workers 3
|
||||||
```
|
```
|
||||||
|
|
||||||
### 参数说明
|
### 参数说明
|
||||||
@@ -98,7 +102,7 @@ python3 image_batch_recognizer.py --api-type openai --max-workers 3
|
|||||||
|------|------|--------|
|
|------|------|--------|
|
||||||
| `--image-dir` | 图片目录 | `../data/images` |
|
| `--image-dir` | 图片目录 | `../data/images` |
|
||||||
| `--output` | 输出文件 | `../data/output/image_recognition_results.xlsx` |
|
| `--output` | 输出文件 | `../data/output/image_recognition_results.xlsx` |
|
||||||
| `--api-type` | API 类型 | 从 `.env` 读取 |
|
| `--api-type` | API 类型 (openai/dmx/dify/anthropic/mock) | 从 `.env` 读取 |
|
||||||
| `--model` | 模型名称 | 从 `.env` 读取 |
|
| `--model` | 模型名称 | 从 `.env` 读取 |
|
||||||
| `--limit` | 最大处理数 | 无限制 |
|
| `--limit` | 最大处理数 | 无限制 |
|
||||||
| `--offset` | 跳过前 N 张 | 0 |
|
| `--offset` | 跳过前 N 张 | 0 |
|
||||||
@@ -106,26 +110,6 @@ python3 image_batch_recognizer.py --api-type openai --max-workers 3
|
|||||||
| `--mock` | 模拟模式 | 否 |
|
| `--mock` | 模拟模式 | 否 |
|
||||||
| `--recursive` | 递归子目录 | 否 |
|
| `--recursive` | 递归子目录 | 否 |
|
||||||
|
|
||||||
### API 配置
|
|
||||||
|
|
||||||
复制 `.env` 配置文件并填写:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
cp config.env.example .env
|
|
||||||
```
|
|
||||||
|
|
||||||
`.env` 示例:
|
|
||||||
```
|
|
||||||
OPENAI_API_KEY=sk-...
|
|
||||||
OPENAI_MODEL=gpt-4o-mini
|
|
||||||
|
|
||||||
DMX_API_KEY=sk-dmx-...
|
|
||||||
DMX_BASE_URL=https://www.dmxapi.cn
|
|
||||||
DMX_MODEL=gpt-5-mini
|
|
||||||
|
|
||||||
LLM_API_TYPE=openai
|
|
||||||
```
|
|
||||||
|
|
||||||
### 输出说明
|
### 输出说明
|
||||||
|
|
||||||
输出 Excel 包含以下列:
|
输出 Excel 包含以下列:
|
||||||
@@ -133,27 +117,163 @@ LLM_API_TYPE=openai
|
|||||||
- `detected_objects` - 物品描述
|
- `detected_objects` - 物品描述
|
||||||
- `sensitive_items` - 敏感要素
|
- `sensitive_items` - 敏感要素
|
||||||
- `summary` - 风险摘要
|
- `summary` - 风险摘要
|
||||||
- `confidence` - 置信度
|
- `confidence` - 置信度 (High/Medium/Low)
|
||||||
|
- `raw_response` - 原始 LLM 响应
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 功能 3:数据收集与合并
|
||||||
|
|
||||||
|
收集批处理结果并与原始数据合并。
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd scripts/
|
||||||
|
|
||||||
|
# 默认合并 batch_output 和 data_all 中的数据
|
||||||
|
python3 collect_xlsx.py
|
||||||
|
|
||||||
|
# 指定输出目录
|
||||||
|
python3 collect_xlsx.py -o ../data/merged
|
||||||
|
|
||||||
|
# 预览模式(不执行)
|
||||||
|
python3 collect_xlsx.py -n
|
||||||
|
|
||||||
|
# 仅复制,不合并原始数据
|
||||||
|
python3 collect_xlsx.py --no-merge
|
||||||
|
```
|
||||||
|
|
||||||
|
合并逻辑:
|
||||||
|
- 从 `data/batch_output/{name}/results.xlsx` 读取识别结果
|
||||||
|
- 与 `data/data_all/{name}_text_img.xlsx` 按图片名匹配合并
|
||||||
|
- 输出到 `data/collected_xlsx/{name}.xlsx`
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 功能 4:高置信度记录验证
|
||||||
|
|
||||||
|
对关键词未匹配但置信度为 High/Medium 的记录进行 LLM 二次验证。
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd scripts/
|
||||||
|
|
||||||
|
# 完整示例
|
||||||
|
python3 verify_high_confidence.py \
|
||||||
|
-o ../data/pho_analysis_merged/clickin.xlsx \
|
||||||
|
-m ../data/output/clickin_matched.xlsx \
|
||||||
|
-r ../data/output/clickin_verify.xlsx
|
||||||
|
|
||||||
|
# Mock 模式测试(不调用 API)
|
||||||
|
python3 verify_high_confidence.py \
|
||||||
|
-o ../data/pho_analysis_merged/clickin.xlsx \
|
||||||
|
-m ../data/output/clickin_matched.xlsx \
|
||||||
|
--mock --limit 5
|
||||||
|
|
||||||
|
# 使用 DMX API
|
||||||
|
python3 verify_high_confidence.py -o original.xlsx -m matched.xlsx --api dmx
|
||||||
|
```
|
||||||
|
|
||||||
|
### 参数说明
|
||||||
|
|
||||||
|
| 参数 | 说明 | 默认值 |
|
||||||
|
|------|------|--------|
|
||||||
|
| `-o, --original` | 原始 Excel 文件 | 必填 |
|
||||||
|
| `-m, --matched` | keyword_matcher 匹配结果 | 必填 |
|
||||||
|
| `-r, --result` | 输出文件 | `{原始文件名}_llm_verified.xlsx` |
|
||||||
|
| `--api` | API 类型 (openai/dmx/dify/ollama) | 从 `.env` 读取 |
|
||||||
|
| `--mock` | 模拟模式 | 否 |
|
||||||
|
| `--confidence` | 验证的置信度级别 | `High Medium` |
|
||||||
|
| `--limit` | 限制验证条数 | 全部 |
|
||||||
|
|
||||||
|
### 输出说明
|
||||||
|
|
||||||
|
输出 Excel 在原始列基础上添加以下验证结果列:
|
||||||
|
- `llm_is_risky` - 是否涉及风险(布尔值)
|
||||||
|
- `llm_substances` - 涉及的物质名称或 CAS 号(` | ` 分隔)
|
||||||
|
- `llm_risk_level` - 风险等级(高/中/低)
|
||||||
|
- `llm_reason` - 判定理由
|
||||||
|
- `llm_raw_response` - LLM 原始响应
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## API 配置
|
||||||
|
|
||||||
|
复制 `.env.example` 并填写:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cp .env.example .env
|
||||||
|
```
|
||||||
|
|
||||||
|
`.env` 示例:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# 默认 API 类型
|
||||||
|
LLM_API_TYPE="dify"
|
||||||
|
|
||||||
|
# DMX API(OpenAI 兼容)
|
||||||
|
DMX_API_KEY="sk-xxx"
|
||||||
|
DMX_BASE_URL="https://www.dmxapi.cn"
|
||||||
|
DMX_MODEL="gpt-4o-mini"
|
||||||
|
|
||||||
|
# Dify API(图片识别使用)
|
||||||
|
DIFY_API_KEY="app-xxx"
|
||||||
|
DIFY_BASE_URL="https://your-dify-server:4433"
|
||||||
|
DIFY_USER_ID="default-user"
|
||||||
|
|
||||||
|
# OpenAI API
|
||||||
|
OPENAI_API_KEY="sk-xxx"
|
||||||
|
OPENAI_MODEL="gpt-4o-mini"
|
||||||
|
|
||||||
|
# verify_high_confidence.py 独立配置(VERIFY_ 前缀)
|
||||||
|
VERIFY_API_TYPE="dmx"
|
||||||
|
VERIFY_API_KEY="sk-xxx"
|
||||||
|
VERIFY_BASE_URL="https://www.dmxapi.cn"
|
||||||
|
VERIFY_MODEL="gpt-4o-mini"
|
||||||
|
```
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## 目录结构
|
## 目录结构
|
||||||
|
|
||||||
```
|
```
|
||||||
20251126_s2/
|
chem-risk-detect/
|
||||||
├── scripts/
|
├── scripts/
|
||||||
│ ├── keyword_matcher.py # 关键词匹配
|
│ ├── keyword_matcher.py # 关键词匹配
|
||||||
│ ├── image_batch_recognizer.py # 图片识别
|
│ ├── image_batch_recognizer.py # 图片识别
|
||||||
|
│ ├── collect_xlsx.py # 数据收集合并
|
||||||
|
│ ├── verify_high_confidence.py # LLM 二次验证
|
||||||
│ ├── run.sh # 批处理管理
|
│ ├── run.sh # 批处理管理
|
||||||
│ └── run_batch_background.sh # 后台运行
|
│ └── run_batch_background.sh # 后台运行
|
||||||
├── data/
|
├── data/
|
||||||
│ ├── input/ # 输入数据
|
│ ├── input/ # 输入数据
|
||||||
│ │ ├── clickin_text_img.xlsx # 文本数据
|
│ │ ├── clickin_text_img.xlsx # 文本数据
|
||||||
│ │ └── keywords.xlsx # 关键词库
|
│ │ └── keywords.xlsx # 关键词库
|
||||||
│ ├── output/ # 输出结果
|
│ ├── images/ # 图片文件
|
||||||
│ └── images/ # 图片文件
|
│ ├── batch_output/ # 批处理输出
|
||||||
|
│ │ └── {name}/results.xlsx
|
||||||
|
│ ├── data_all/ # 原始数据
|
||||||
|
│ │ └── {name}_text_img.xlsx
|
||||||
|
│ ├── collected_xlsx/ # 合并后数据
|
||||||
|
│ └── output/ # 最终输出结果
|
||||||
├── .env # API 配置
|
├── .env # API 配置
|
||||||
└── config.env.example # 配置模板
|
└── .env.example # 配置模板
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 处理流程
|
||||||
|
|
||||||
|
```
|
||||||
|
1. 图片识别
|
||||||
|
image_batch_recognizer.py → batch_output/{name}/results.xlsx
|
||||||
|
|
||||||
|
2. 数据合并
|
||||||
|
collect_xlsx.py → 合并 results.xlsx + {name}_text_img.xlsx → collected_xlsx/
|
||||||
|
|
||||||
|
3. 关键词匹配
|
||||||
|
keyword_matcher.py → output/keyword_matched_results.xlsx
|
||||||
|
|
||||||
|
4. 二次验证
|
||||||
|
verify_high_confidence.py → 验证未匹配的高置信度记录 → *_llm_verified.xlsx
|
||||||
```
|
```
|
||||||
|
|
||||||
---
|
---
|
||||||
@@ -173,7 +293,7 @@ cd scripts/
|
|||||||
|
|
||||||
设置参数:
|
设置参数:
|
||||||
```bash
|
```bash
|
||||||
API_TYPE=openai MAX_WORKERS=3 ./run.sh start
|
API_TYPE=dify MAX_WORKERS=3 ./run.sh start
|
||||||
```
|
```
|
||||||
|
|
||||||
---
|
---
|
||||||
@@ -184,10 +304,11 @@ API_TYPE=openai MAX_WORKERS=3 ./run.sh start
|
|||||||
# 必需
|
# 必需
|
||||||
pip install pandas openpyxl
|
pip install pandas openpyxl
|
||||||
|
|
||||||
# 可选(提升性能)
|
# 可选(提升性能和功能)
|
||||||
pip install pyahocorasick # 关键词匹配加速
|
pip install pyahocorasick # 关键词匹配加速(5x)
|
||||||
|
pip install requests # Dify API 必需
|
||||||
pip install tqdm # 进度条
|
pip install tqdm # 进度条
|
||||||
pip install requests # HTTP 请求
|
pip install openai # verify 脚本的 OpenAI 兼容 API
|
||||||
```
|
```
|
||||||
|
|
||||||
---
|
---
|
||||||
@@ -203,9 +324,12 @@ pip install requests # HTTP 请求
|
|||||||
**Q: 输出的分隔符能改吗?**
|
**Q: 输出的分隔符能改吗?**
|
||||||
使用 `--separator` 参数,默认 `|||` 不与化学名称冲突。
|
使用 `--separator` 参数,默认 `|||` 不与化学名称冲突。
|
||||||
|
|
||||||
|
**Q: verify 脚本和 image_batch_recognizer 能用不同的 API 吗?**
|
||||||
|
可以。verify 脚本使用 `VERIFY_` 前缀的环境变量,与其他脚本独立配置。
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## 技术支持
|
## 技术要求
|
||||||
|
|
||||||
- Python 3.7+
|
- Python 3.7+
|
||||||
- 查看帮助:`python3 script.py -h`
|
- 查看帮助:`python3 script.py -h`
|
||||||
|
|||||||
107
scripts/batch_keyword_match.sh
Executable file
107
scripts/batch_keyword_match.sh
Executable file
@@ -0,0 +1,107 @@
|
|||||||
|
#!/bin/bash
# Batch keyword matching script.
# Runs keyword_matcher.py over every .xlsx file in data/pho_analysis_merged/.

set -e

# Resolve script and project directories (script may be invoked from anywhere).
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
PROJECT_DIR="$(dirname "$SCRIPT_DIR")"

# Directory layout
INPUT_DIR="$PROJECT_DIR/data/pho_analysis_merged"
OUTPUT_DIR="$PROJECT_DIR/data/output"
KEYWORDS_FILE="$PROJECT_DIR/data/keywords/keywords_all.xlsx"

# ANSI colors for terminal output
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
RED='\033[0;31m'
NC='\033[0m' # No Color

echo "=============================================="
echo "批量关键词匹配"
echo "=============================================="
echo "输入目录: $INPUT_DIR"
echo "输出目录: $OUTPUT_DIR"
echo "关键词文件: $KEYWORDS_FILE"
echo ""

# Abort early when the input directory is missing.
if [ ! -d "$INPUT_DIR" ]; then
    echo -e "${RED}错误: 输入目录不存在: $INPUT_DIR${NC}"
    exit 1
fi

# Fall back to the matcher's built-in default when the keyword file is absent.
if [ ! -f "$KEYWORDS_FILE" ]; then
    echo -e "${YELLOW}警告: 关键词文件不存在: $KEYWORDS_FILE${NC}"
    echo "将使用默认关键词文件"
    KEYWORDS_FILE=""
fi

mkdir -p "$OUTPUT_DIR"

# Counters.
# NOTE: use var=$((var + 1)) instead of ((var++)). Under `set -e`,
# ((var++)) evaluates to the pre-increment value, so the first increment
# from 0 returns exit status 1 and aborts the whole script.
total=0
success=0
failed=0

files=("$INPUT_DIR"/*.xlsx)

# Glob matched nothing (the pattern stays literal) -> nothing to do.
if [ ! -e "${files[0]}" ]; then
    echo -e "${YELLOW}没有找到 xlsx 文件${NC}"
    exit 0
fi

# Count regular files.
for f in "${files[@]}"; do
    if [ -f "$f" ]; then
        total=$((total + 1))
    fi
done

echo "找到 $total 个文件待处理"
echo "----------------------------------------------"

current=0
for input_file in "${files[@]}"; do
    if [ ! -f "$input_file" ]; then
        continue
    fi

    current=$((current + 1))

    # Derive the output name from the input basename.
    # (The original had a garbled $(unknown) command substitution here,
    # which emits an error and aborts under `set -e`.)
    filename=$(basename "$input_file" .xlsx)
    output_file="$OUTPUT_DIR/${filename}_matched.xlsx"

    echo -e "\n[$current/$total] 处理: $filename"

    # Build the command as an array: safe quoting, no eval needed.
    cmd=(python3 "$SCRIPT_DIR/keyword_matcher.py" -t "$input_file" -o "$output_file")
    if [ -n "$KEYWORDS_FILE" ]; then
        cmd+=(-k "$KEYWORDS_FILE")
    fi

    # Run the matcher; the if-condition keeps failures from tripping `set -e`.
    if "${cmd[@]}"; then
        echo -e "${GREEN} ✓ 完成: ${filename}_matched.xlsx${NC}"
        success=$((success + 1))
    else
        echo -e "${RED} ✗ 失败: $filename${NC}"
        failed=$((failed + 1))
    fi
done

# Summary
echo ""
echo "=============================================="
echo "处理完成"
echo "=============================================="
echo -e "总计: $total | ${GREEN}成功: $success${NC} | ${RED}失败: $failed${NC}"
echo "输出目录: $OUTPUT_DIR"
|
||||||
315
scripts/collect_xlsx.py
Normal file
315
scripts/collect_xlsx.py
Normal file
@@ -0,0 +1,315 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
收集并合并 xlsx 文件
|
||||||
|
|
||||||
|
功能:
|
||||||
|
1. 从 data/batch_output 子文件夹收集 results.xlsx(图片分析结果)
|
||||||
|
2. 与 data/data_all 中对应的原始数据({name}_text_img.xlsx)合并
|
||||||
|
3. 通过图片名关联两个数据源
|
||||||
|
4. 保存合并后的文件到目标目录
|
||||||
|
|
||||||
|
用法:
|
||||||
|
python3 collect_xlsx.py # 默认合并并输出
|
||||||
|
python3 collect_xlsx.py -o ../data/merged # 指定输出目录
|
||||||
|
python3 collect_xlsx.py --no-merge # 不合并,只复制
|
||||||
|
python3 collect_xlsx.py -n # 预览模式
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Optional, Tuple, List
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
|
||||||
|
def extract_image_name(path: str) -> str:
    """Return the bare file name from a full image path.

    Handles both Windows (``\\``) and Unix (``/``) separators; NaN and
    empty/whitespace-only inputs map to an empty string.
    """
    if pd.isna(path):
        return ""

    trimmed = str(path).strip()
    if not trimmed:
        return ""

    # Normalize to forward slashes, then keep everything after the last one.
    return trimmed.replace("\\", "/").rsplit("/", 1)[-1]
|
||||||
|
|
||||||
|
|
||||||
|
def merge_xlsx_files(
    results_file: Path,
    original_file: Path,
    results_image_col: str = "image_name",
    original_image_cols: Optional[List[str]] = None,
    original_text_col: str = "文本"
) -> Tuple[pd.DataFrame, dict]:
    """Merge image-analysis results with the matching original data.

    Args:
        results_file: analysis-results workbook (batch_output/.../results.xlsx)
        original_file: original-data workbook (data_all/..._text_img.xlsx)
        results_image_col: image-name column in the results file
        original_image_cols: candidate image-path columns in the original
            file, tried in priority order (defaults to 图片_新/图片/图片链接)
        original_text_col: text column in the original file — kept for
            interface compatibility; not used by the current implementation

    Returns:
        Tuple of (merged DataFrame, statistics dict).

    Raises:
        ValueError: if none of ``original_image_cols`` exists in the
            original file.
    """
    if original_image_cols is None:
        original_image_cols = ["图片_新", "图片", "图片链接"]

    results_df = pd.read_excel(results_file)
    original_df = pd.read_excel(original_file)

    stats = {
        "results_rows": len(results_df),
        "original_rows": len(original_df),
        "merged_rows": 0,
        "unmatched_results": 0,
        "original_columns_added": [],
        "image_col_used": None
    }

    # Pick the first image-path column that actually exists.
    image_col = next((col for col in original_image_cols if col in original_df.columns), None)
    if image_col is None:
        raise ValueError(f"原始文件中未找到图片列,尝试过: {original_image_cols}")
    stats["image_col_used"] = image_col

    # Derive the bare image file name as the join key.
    original_df["_image_name"] = original_df[image_col].apply(extract_image_name)

    # The original data may contain duplicate images; keep the first row.
    original_dedup = original_df.drop_duplicates(subset=["_image_name"], keep="first")

    # Columns to copy over: everything except image-path/helper columns and
    # columns that already exist in the results file.
    exclude_cols = set(original_image_cols + ["_image_name"])
    original_cols_to_add = [col for col in original_df.columns
                            if col not in exclude_cols
                            and col not in results_df.columns]
    stats["original_columns_added"] = original_cols_to_add

    merged_df = results_df.copy()

    # Vectorized fill: map each added column through an image-name index
    # instead of iterating rows (the previous per-row loop was O(rows*cols)
    # Python-level work). Unmatched rows get NaN, which to_excel writes as
    # empty cells just like the previous None placeholders.
    lookup = original_dedup.set_index("_image_name")[original_cols_to_add]
    keys = merged_df[results_image_col]
    matched_mask = keys.isin(lookup.index)
    for col in original_cols_to_add:
        merged_df[col] = keys.map(lookup[col])

    matched_count = int(matched_mask.sum())
    stats["merged_rows"] = len(merged_df)
    stats["matched_count"] = matched_count
    stats["unmatched_results"] = len(merged_df) - matched_count

    return merged_df, stats
|
||||||
|
|
||||||
|
|
||||||
|
def collect_and_merge_xlsx(
    source_dir: str,
    data_all_dir: str,
    output_dir: str,
    merge: bool = True,
    dry_run: bool = False
) -> List[dict]:
    """
    Collect results.xlsx files from batch_output subfolders and merge each
    with its matching original-data workbook.

    Args:
        source_dir: batch_output directory path
        data_all_dir: data_all directory path
        output_dir: output directory path
        merge: whether to merge original data into each result
        dry_run: preview mode — print planned actions without executing

    Returns:
        List of per-folder result dicts (paths, merged flag, statistics)
    """
    source_path = Path(source_dir)
    data_all_path = Path(data_all_dir)
    output_path = Path(output_dir)

    if not source_path.exists():
        print(f"错误: 源目录不存在: {source_dir}")
        return []

    # Create the output directory (skipped in preview mode).
    if not dry_run:
        output_path.mkdir(parents=True, exist_ok=True)

    results = []

    # Walk the batch_output subfolders in a stable (sorted) order.
    for folder in sorted(source_path.iterdir()):
        if not folder.is_dir():
            continue

        folder_name = folder.name
        results_file = folder / "results.xlsx"

        # Skip folders that have no analysis results.
        if not results_file.exists():
            continue

        # Output file name mirrors the folder name.
        output_file = output_path / f"{folder_name}.xlsx"

        # Matching original-data file follows the {name}_text_img.xlsx convention.
        original_file = data_all_path / f"{folder_name}_text_img.xlsx"

        result_info = {
            "folder": folder_name,
            "results_file": str(results_file),
            "original_file": str(original_file) if original_file.exists() else None,
            "output_file": str(output_file),
            "merged": False,
            "stats": {}
        }

        if dry_run:
            if merge and original_file.exists():
                print(f"[预览] 合并: {folder_name}/results.xlsx + {folder_name}_text_img.xlsx -> {folder_name}.xlsx")
            else:
                print(f"[预览] 复制: {folder_name}/results.xlsx -> {folder_name}.xlsx")
            results.append(result_info)
            continue

        # Merge when requested and the original data exists; otherwise copy.
        if merge and original_file.exists():
            try:
                merged_df, stats = merge_xlsx_files(results_file, original_file)
                merged_df.to_excel(output_file, index=False, engine="openpyxl")

                result_info["merged"] = True
                result_info["stats"] = stats

                print(f"已合并: {folder_name}")
                print(f" - 分析结果: {stats['results_rows']} 行")
                print(f" - 原始数据: {stats['original_rows']} 行")
                print(f" - 匹配成功: {stats['matched_count']} 行")
                print(f" - 添加列: {stats['original_columns_added']}")

            except Exception as e:
                print(f"合并失败 {folder_name}: {e}")
                # Fall back to a plain copy so the result is still collected.
                import shutil
                shutil.copy2(results_file, output_file)
                print(f" 已回退到复制模式")
        else:
            # Copy only, no merge.
            import shutil
            shutil.copy2(results_file, output_file)

            if merge and not original_file.exists():
                print(f"已复制: {folder_name} (原始数据不存在: {folder_name}_text_img.xlsx)")
            else:
                print(f"已复制: {folder_name}")

        results.append(result_info)

    return results
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """CLI entry point: parse arguments, resolve paths, run the collection."""
    # RawDescriptionHelpFormatter keeps the epilog examples verbatim.
    parser = argparse.ArgumentParser(
        description="收集并合并 batch_output 和 data_all 中的 xlsx 文件",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
示例:
  python3 collect_xlsx.py                      # 默认合并并输出
  python3 collect_xlsx.py -o ../data/merged    # 指定输出目录
  python3 collect_xlsx.py --no-merge           # 不合并,只复制
  python3 collect_xlsx.py -n                   # 预览模式
"""
    )

    parser.add_argument(
        "-s", "--source",
        default="../data/batch_output",
        help="batch_output 目录路径 (默认: ../data/batch_output)"
    )

    parser.add_argument(
        "-d", "--data-all",
        default="../data/data_all",
        help="data_all 目录路径 (默认: ../data/data_all)"
    )

    parser.add_argument(
        "-o", "--output",
        default="../data/collected_xlsx",
        help="输出目录路径 (默认: ../data/collected_xlsx)"
    )

    parser.add_argument(
        "--no-merge",
        action="store_true",
        help="不合并原始数据,只复制分析结果"
    )

    parser.add_argument(
        "-n", "--dry-run",
        action="store_true",
        help="预览模式,只打印不执行"
    )

    args = parser.parse_args()

    # Resolve the (relative) CLI paths against this script's own directory,
    # so the tool behaves the same regardless of the caller's CWD.
    script_dir = Path(__file__).parent
    source_dir = (script_dir / args.source).resolve()
    data_all_dir = (script_dir / args.data_all).resolve()
    output_dir = (script_dir / args.output).resolve()

    print("=" * 60)
    print("收集并合并 xlsx 文件")
    print("=" * 60)
    print(f"分析结果目录: {source_dir}")
    print(f"原始数据目录: {data_all_dir}")
    print(f"输出目录: {output_dir}")
    print(f"合并模式: {'否' if args.no_merge else '是'}")
    print("-" * 60)

    results = collect_and_merge_xlsx(
        str(source_dir),
        str(data_all_dir),
        str(output_dir),
        merge=not args.no_merge,
        dry_run=args.dry_run
    )

    print("-" * 60)
    # Summary: how many folders were merged vs. merely copied.
    merged_count = sum(1 for r in results if r.get("merged"))
    print(f"共处理 {len(results)} 个文件")
    if not args.no_merge:
        print(f" - 合并成功: {merged_count}")
        print(f" - 仅复制: {len(results) - merged_count}")


if __name__ == "__main__":
    main()
|
||||||
@@ -47,6 +47,18 @@ MODE_LABELS = {
|
|||||||
"exact": "精确匹配",
|
"exact": "精确匹配",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Common text column names, tried in priority order during auto-detection.
COMMON_TEXT_COLUMNS = [
    "detected_text",  # new format (image-analysis results)
    "文本",  # legacy format / original text carried over by the merge step
    "text",
    "content",
    "summary",
]

# Default multi-column combination searched together when none is specified.
DEFAULT_TEXT_COLUMNS = ["detected_text", "文本"]
|
||||||
|
|
||||||
|
|
||||||
# ========== 数据类 ==========
|
# ========== 数据类 ==========
|
||||||
@dataclass
|
@dataclass
|
||||||
@@ -136,6 +148,71 @@ def split_value(value: str, separator: str) -> List[str]:
|
|||||||
return [part.strip() for part in parts if part and part.strip()]
|
return [part.strip() for part in parts if part and part.strip()]
|
||||||
|
|
||||||
|
|
||||||
|
def detect_text_columns(
    df: pd.DataFrame,
    specified_columns: Optional[List[str]] = None
) -> List[str]:
    """Detect and validate the text column names to search.

    Parameters:
        df: the data frame being matched
        specified_columns: column names supplied by the user, if any

    Returns:
        The list of text column names that actually exist in ``df``.

    Raises:
        ValueError: when no suitable text column can be found.
    """
    columns = df.columns

    # Honor user-specified columns first, dropping any that do not exist.
    if specified_columns:
        present, absent = [], []
        for name in specified_columns:
            (present if name in columns else absent).append(name)

        if absent:
            print(f"警告: 以下指定的列不存在: {absent}")

        if present:
            return present
        print("警告: 所有指定的列都不存在,尝试自动检测...")

    # Auto-detect: prefer the default multi-column combination.
    defaults = [name for name in DEFAULT_TEXT_COLUMNS if name in columns]
    if defaults:
        print(f"自动检测到文本列: {defaults}")
        return defaults

    # Fall back to the first common column that exists.
    for name in COMMON_TEXT_COLUMNS:
        if name in columns:
            print(f"自动检测到文本列: ['{name}']")
            return [name]

    # Nothing usable — give the caller the full column list to choose from.
    raise ValueError(
        f"无法自动检测文本列。可用列: {df.columns.tolist()}\n"
        f"请使用 -c 参数指定文本列名"
    )
|
||||||
|
|
||||||
|
|
||||||
|
def combine_text_columns(row: pd.Series, text_columns: List[str]) -> str:
    """Concatenate the non-empty text of several columns.

    Parameters:
        row: one DataFrame row
        text_columns: names of the columns to combine

    Returns:
        The stripped cell values joined with newlines; NaN and
        empty/whitespace-only cells are skipped.
    """
    stripped = (str(row.get(name)).strip()
                for name in text_columns
                if pd.notna(row.get(name)))
    return "\n".join(piece for piece in stripped if piece)
|
||||||
|
|
||||||
|
|
||||||
def load_keywords_for_mode(
|
def load_keywords_for_mode(
|
||||||
df: pd.DataFrame,
|
df: pd.DataFrame,
|
||||||
mode: str,
|
mode: str,
|
||||||
@@ -205,22 +282,32 @@ class KeywordMatcher(ABC):
|
|||||||
self,
|
self,
|
||||||
df: pd.DataFrame,
|
df: pd.DataFrame,
|
||||||
keywords: Set[str],
|
keywords: Set[str],
|
||||||
text_column: str
|
text_columns: List[str]
|
||||||
) -> MatchResult:
|
) -> MatchResult:
|
||||||
"""执行匹配(模板方法)"""
|
"""执行匹配(模板方法)
|
||||||
|
|
||||||
|
参数:
|
||||||
|
df: 数据框
|
||||||
|
keywords: 关键词集合
|
||||||
|
text_columns: 文本列名列表(支持多列)
|
||||||
|
"""
|
||||||
print(f"开始匹配(使用{self.name})...")
|
print(f"开始匹配(使用{self.name})...")
|
||||||
|
print(f"搜索列: {text_columns}")
|
||||||
self._prepare(keywords)
|
self._prepare(keywords)
|
||||||
|
|
||||||
matched_indices = []
|
matched_indices = []
|
||||||
matched_keywords_list = []
|
matched_keywords_list = []
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
|
|
||||||
for idx, text in enumerate(df[text_column]):
|
for idx in range(len(df)):
|
||||||
if pd.isna(text):
|
row = df.iloc[idx]
|
||||||
|
# 合并多列文本
|
||||||
|
combined_text = combine_text_columns(row, text_columns)
|
||||||
|
|
||||||
|
if not combined_text:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
text_str = str(text)
|
matches = self._match_single_text(combined_text, keywords)
|
||||||
matches = self._match_single_text(text_str, keywords)
|
|
||||||
|
|
||||||
if matches:
|
if matches:
|
||||||
matched_indices.append(idx)
|
matched_indices.append(idx)
|
||||||
@@ -435,22 +522,36 @@ def preview_results(result_df: pd.DataFrame, num_rows: int = 5) -> None:
|
|||||||
def perform_matching(
|
def perform_matching(
|
||||||
df: pd.DataFrame,
|
df: pd.DataFrame,
|
||||||
keywords: Set[str],
|
keywords: Set[str],
|
||||||
text_column: str,
|
text_columns: List[str],
|
||||||
output_file: str,
|
output_file: str,
|
||||||
algorithm: str = "auto",
|
algorithm: str = "auto",
|
||||||
mode: str = None
|
mode: str = None
|
||||||
) -> Optional[pd.DataFrame]:
|
) -> Optional[pd.DataFrame]:
|
||||||
"""执行完整的匹配流程"""
|
"""执行完整的匹配流程
|
||||||
|
|
||||||
|
参数:
|
||||||
|
df: 数据框
|
||||||
|
keywords: 关键词集合
|
||||||
|
text_columns: 文本列名列表(支持多列)
|
||||||
|
output_file: 输出文件路径
|
||||||
|
algorithm: 匹配算法
|
||||||
|
mode: 匹配模式
|
||||||
|
"""
|
||||||
# 验证列存在
|
# 验证列存在
|
||||||
if text_column not in df.columns:
|
missing_cols = [col for col in text_columns if col not in df.columns]
|
||||||
|
if missing_cols:
|
||||||
|
print(f"警告: 以下列不存在: {missing_cols}")
|
||||||
|
text_columns = [col for col in text_columns if col in df.columns]
|
||||||
|
|
||||||
|
if not text_columns:
|
||||||
print(f"可用列名: {df.columns.tolist()}")
|
print(f"可用列名: {df.columns.tolist()}")
|
||||||
raise ValueError(f"列 '{text_column}' 不存在")
|
raise ValueError("没有可用的文本列")
|
||||||
|
|
||||||
print(f"文本文件共有 {len(df)} 行数据\n")
|
print(f"文本文件共有 {len(df)} 行数据\n")
|
||||||
|
|
||||||
# 创建匹配器并执行匹配
|
# 创建匹配器并执行匹配
|
||||||
matcher = create_matcher(algorithm, mode=mode)
|
matcher = create_matcher(algorithm, mode=mode)
|
||||||
result = matcher.match(df, keywords, text_column)
|
result = matcher.match(df, keywords, text_columns)
|
||||||
|
|
||||||
# 输出统计信息
|
# 输出统计信息
|
||||||
print_statistics(result)
|
print_statistics(result)
|
||||||
@@ -465,7 +566,7 @@ def process_single_mode(
|
|||||||
keywords_df: pd.DataFrame,
|
keywords_df: pd.DataFrame,
|
||||||
text_df: pd.DataFrame,
|
text_df: pd.DataFrame,
|
||||||
mode: str,
|
mode: str,
|
||||||
text_column: str,
|
text_columns: List[str],
|
||||||
output_file: Path,
|
output_file: Path,
|
||||||
separator: str = SEPARATOR,
|
separator: str = SEPARATOR,
|
||||||
save_to_file: bool = True
|
save_to_file: bool = True
|
||||||
@@ -473,6 +574,9 @@ def process_single_mode(
|
|||||||
"""
|
"""
|
||||||
处理单个检测模式
|
处理单个检测模式
|
||||||
|
|
||||||
|
参数:
|
||||||
|
text_columns: 文本列名列表(支持多列)
|
||||||
|
|
||||||
返回:匹配结果 DataFrame(包含原始索引)
|
返回:匹配结果 DataFrame(包含原始索引)
|
||||||
"""
|
"""
|
||||||
mode_lower = mode.lower()
|
mode_lower = mode.lower()
|
||||||
@@ -501,7 +605,7 @@ def process_single_mode(
|
|||||||
result_df = perform_matching(
|
result_df = perform_matching(
|
||||||
df=text_df,
|
df=text_df,
|
||||||
keywords=keywords,
|
keywords=keywords,
|
||||||
text_column=text_column,
|
text_columns=text_columns,
|
||||||
output_file=temp_output,
|
output_file=temp_output,
|
||||||
algorithm=algorithm,
|
algorithm=algorithm,
|
||||||
mode=mode_lower # 传递模式参数
|
mode=mode_lower # 传递模式参数
|
||||||
@@ -528,11 +632,15 @@ def run_multiple_modes(
|
|||||||
keywords_file: Path,
|
keywords_file: Path,
|
||||||
text_file: Path,
|
text_file: Path,
|
||||||
output_file: Path,
|
output_file: Path,
|
||||||
text_column: str,
|
text_columns: Optional[List[str]],
|
||||||
modes: List[str],
|
modes: List[str],
|
||||||
separator: str = SEPARATOR
|
separator: str = SEPARATOR
|
||||||
) -> None:
|
) -> None:
|
||||||
"""运行多个检测模式,合并结果到单一文件"""
|
"""运行多个检测模式,合并结果到单一文件
|
||||||
|
|
||||||
|
参数:
|
||||||
|
text_columns: 文本列名列表(支持多列),None 表示自动检测
|
||||||
|
"""
|
||||||
# 验证文件存在
|
# 验证文件存在
|
||||||
if not keywords_file.exists():
|
if not keywords_file.exists():
|
||||||
raise FileNotFoundError(f"找不到关键词文件: {keywords_file}")
|
raise FileNotFoundError(f"找不到关键词文件: {keywords_file}")
|
||||||
@@ -546,7 +654,10 @@ def run_multiple_modes(
|
|||||||
|
|
||||||
print(f"正在加载文本文件: {text_file}")
|
print(f"正在加载文本文件: {text_file}")
|
||||||
text_df = pd.read_excel(text_file)
|
text_df = pd.read_excel(text_file)
|
||||||
print(f"文本列: {text_column}\n")
|
|
||||||
|
# 自动检测或验证文本列
|
||||||
|
actual_text_columns = detect_text_columns(text_df, text_columns)
|
||||||
|
print(f"使用文本列: {actual_text_columns}\n")
|
||||||
|
|
||||||
# 验证模式
|
# 验证模式
|
||||||
if not modes:
|
if not modes:
|
||||||
@@ -568,7 +679,7 @@ def run_multiple_modes(
|
|||||||
keywords_df=keywords_df,
|
keywords_df=keywords_df,
|
||||||
text_df=text_df,
|
text_df=text_df,
|
||||||
mode=mode_lower,
|
mode=mode_lower,
|
||||||
text_column=text_column,
|
text_columns=actual_text_columns,
|
||||||
output_file=output_file, # 这个参数在 save_to_file=False 时不使用
|
output_file=output_file, # 这个参数在 save_to_file=False 时不使用
|
||||||
separator=separator,
|
separator=separator,
|
||||||
save_to_file=False # 不保存到单独文件
|
save_to_file=False # 不保存到单独文件
|
||||||
@@ -668,7 +779,7 @@ def parse_args():
|
|||||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||||
epilog="""
|
epilog="""
|
||||||
示例:
|
示例:
|
||||||
# 使用默认配置(两种模式)
|
# 使用默认配置(自动检测 detected_text 和 文本 列)
|
||||||
python keyword_matcher.py
|
python keyword_matcher.py
|
||||||
|
|
||||||
# 仅执行 CAS 号识别
|
# 仅执行 CAS 号识别
|
||||||
@@ -677,6 +788,12 @@ def parse_args():
|
|||||||
# 仅执行精确匹配
|
# 仅执行精确匹配
|
||||||
python keyword_matcher.py -m exact
|
python keyword_matcher.py -m exact
|
||||||
|
|
||||||
|
# 指定单个文本列
|
||||||
|
python keyword_matcher.py -c detected_text
|
||||||
|
|
||||||
|
# 指定多个文本列
|
||||||
|
python keyword_matcher.py -c detected_text 文本 summary
|
||||||
|
|
||||||
# 指定自定义文件路径
|
# 指定自定义文件路径
|
||||||
python keyword_matcher.py -k ../data/input/keywords.xlsx -t ../data/input/text.xlsx
|
python keyword_matcher.py -k ../data/input/keywords.xlsx -t ../data/input/text.xlsx
|
||||||
"""
|
"""
|
||||||
@@ -701,10 +818,11 @@ def parse_args():
|
|||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
'-c', '--text-column',
|
'-c', '--text-columns',
|
||||||
|
nargs='+',
|
||||||
type=str,
|
type=str,
|
||||||
default='文本',
|
default=None,
|
||||||
help='文本列名 (默认: 文本)'
|
help='文本列名,支持多列 (默认: 自动检测 detected_text 和 文本)'
|
||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
@@ -759,7 +877,7 @@ def main():
|
|||||||
keywords_file=keywords_file,
|
keywords_file=keywords_file,
|
||||||
text_file=text_file,
|
text_file=text_file,
|
||||||
output_file=output_file,
|
output_file=output_file,
|
||||||
text_column=args.text_column,
|
text_columns=args.text_columns,
|
||||||
modes=args.modes,
|
modes=args.modes,
|
||||||
separator=args.separator
|
separator=args.separator
|
||||||
)
|
)
|
||||||
|
|||||||
517
scripts/verify_high_confidence.py
Normal file
517
scripts/verify_high_confidence.py
Normal file
@@ -0,0 +1,517 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
验证高置信度未匹配记录
|
||||||
|
|
||||||
|
功能:比对 keyword_matcher 结果与原始 Excel,找出高置信度未匹配行,调用 LLM 二次验证。
|
||||||
|
|
||||||
|
用法:
|
||||||
|
python3 verify_high_confidence.py -o original.xlsx -m matched.xlsx
|
||||||
|
python3 verify_high_confidence.py -o original.xlsx -m matched.xlsx --mock --limit 5
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
from abc import ABC, abstractmethod
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import List, Optional
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
# 可选依赖
|
||||||
|
try:
|
||||||
|
import openai
|
||||||
|
HAS_OPENAI = True
|
||||||
|
except ImportError:
|
||||||
|
HAS_OPENAI = False
|
||||||
|
|
||||||
|
try:
|
||||||
|
import requests
|
||||||
|
import urllib3
|
||||||
|
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
|
||||||
|
HAS_REQUESTS = True
|
||||||
|
except ImportError:
|
||||||
|
HAS_REQUESTS = False
|
||||||
|
|
||||||
|
# ========== Constants & configuration ==========
# Confidence labels treated as "high confidence" — presumably compared
# against a confidence column when selecting rows to re-verify;
# NOTE(review): the usage is outside this excerpt, confirm in the caller.
CONFIDENCE_LEVELS = ["High", "Medium"]
# Delay between requests in seconds (simple rate limiting —
# NOTE(review): usage not visible in this excerpt, confirm).
REQUEST_DELAY = 0.5

# Environment-variable mapping: api_type -> (key_env, url_env, model_env, default_model)
ENV_MAPPING = {
    "openai": ("OPENAI_API_KEY", "OPENAI_BASE_URL", "OPENAI_MODEL", "gpt-4o-mini"),
    "dmx": ("DMX_API_KEY", "DMX_BASE_URL", "DMX_MODEL", "gpt-4o-mini"),
    "dify": ("DIFY_API_KEY", "DIFY_BASE_URL", "DIFY_MODEL", "dify-chatflow"),
    "ollama": (None, "OLLAMA_BASE_URL", "OLLAMA_MODEL", "qwen2.5:7b"),  # no API key env
}
|
||||||
|
|
||||||
|
SYSTEM_PROMPT = """你是一位化学品风险识别专家。请分析文本内容,判断是否涉及管制化学品、毒品前体或非法药物交易。
|
||||||
|
|
||||||
|
请以 JSON 格式回答,包含以下字段:
|
||||||
|
- is_risky: 布尔值,是否涉及风险
|
||||||
|
- substances: 数组,涉及的具体物质名称或CAS号
|
||||||
|
- risk_level: 字符串,风险等级(高/中/低)
|
||||||
|
- reason: 字符串,判定理由(简要)
|
||||||
|
|
||||||
|
示例输出:
|
||||||
|
{"is_risky": true, "substances": ["甲基苯丙胺", "CAS 537-46-2"], "risk_level": "高", "reason": "文本中明确提到毒品名称和交易信息"}
|
||||||
|
"""
|
||||||
|
|
||||||
|
USER_PROMPT_TEMPLATE = """请分析以下内容是否涉及管制化学品或毒品:
|
||||||
|
|
||||||
|
【图片分析结果】
|
||||||
|
{raw_response}
|
||||||
|
|
||||||
|
【原始文本】
|
||||||
|
{original_text}
|
||||||
|
|
||||||
|
请以 JSON 格式输出分析结果。"""
|
||||||
|
|
||||||
|
|
||||||
|
# ========== 数据类 ==========
|
||||||
|
@dataclass
class VerifyConfig:
    """Runtime configuration for the LLM verifier, resolved from env vars."""
    # Provider id: "openai" | "dmx" | "dify" | "ollama"
    api_type: str = "openai"
    # API key; may legitimately stay empty for ollama
    api_key: str = ""
    # Provider base URL; None lets the client library use its default
    base_url: Optional[str] = None
    # Model identifier passed to the provider
    model: str = "gpt-4o-mini"
    # End-user id forwarded to Dify
    user_id: str = "default-user"
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class VerificationResult:
    """Outcome of a single LLM verification call."""
    # None means the model gave no usable verdict (parse or API failure)
    is_risky: Optional[bool] = None
    # Substance names / CAS numbers reported by the model
    substances: List[str] = field(default_factory=list)
    # Risk level string (高/中/低 per the prompt), or 错误/未知 on failure
    risk_level: str = ""
    # Model's short justification for the verdict
    reason: str = ""
    # Verbatim model reply, kept for auditing
    raw_response: str = ""

    def to_columns(self) -> dict:
        """Flatten the result into llm_* columns for the output table."""
        return {
            "llm_is_risky": self.is_risky,
            "llm_substances": " | ".join(self.substances) if self.substances else "",
            "llm_risk_level": self.risk_level,
            "llm_reason": self.reason,
            "llm_raw_response": self.raw_response,
        }
|
||||||
|
|
||||||
|
|
||||||
|
# ========== 工具函数 ==========
|
||||||
|
def load_env_file(env_path: str) -> None:
    """Load KEY=VALUE pairs from a .env file into ``os.environ``.

    Silently does nothing when the file is missing. Supports an optional
    ``export `` prefix, ``#`` comment lines, and single/double quoting of
    values; lines without ``=`` are ignored.
    """
    env_file = Path(env_path)
    if not env_file.exists():
        return

    print(f"加载环境配置: {env_file}")
    for raw in env_file.read_text(encoding="utf-8").splitlines():
        entry = raw.strip()
        # Skip blanks and comments.
        if not entry or entry.startswith("#"):
            continue
        # Tolerate shell-style "export KEY=..." lines.
        if entry.startswith("export "):
            entry = entry[7:]
        if "=" not in entry:
            continue
        key, _, value = entry.partition("=")
        os.environ[key.strip()] = value.strip().strip('"').strip("'")
|
||||||
|
|
||||||
|
|
||||||
|
def get_config() -> VerifyConfig:
    """Build the verifier configuration from environment variables.

    ``VERIFY_``-prefixed variables take precedence; otherwise values fall
    back to the per-provider variables listed in ``ENV_MAPPING``.
    """
    def env(name: Optional[str]) -> Optional[str]:
        # Tolerate a missing variable name (e.g. ollama defines no key env).
        return os.getenv(name) if name else None

    api_type = (os.getenv("VERIFY_API_TYPE") or os.getenv("LLM_API_TYPE") or "openai").lower()
    key_env, url_env, model_env, default_model = ENV_MAPPING.get(
        api_type, (None, None, None, "gpt-4o-mini")
    )

    return VerifyConfig(
        api_type=api_type,
        api_key=os.getenv("VERIFY_API_KEY") or env(key_env) or "",
        base_url=os.getenv("VERIFY_BASE_URL") or env(url_env),
        model=os.getenv("VERIFY_MODEL") or env(model_env) or default_model,
        user_id=os.getenv("VERIFY_USER_ID") or os.getenv("DIFY_USER_ID") or "default-user",
    )
|
||||||
|
|
||||||
|
|
||||||
|
def parse_json_response(content: str) -> dict:
    """Extract a JSON object from an LLM reply.

    Strips an optional markdown code fence (```json ... ``` or ``` ... ```),
    then parses the outermost ``{...}`` span so surrounding chatter is
    ignored. Returns a fixed fallback dict when no valid JSON is found.
    """
    # Strip a markdown code fence if present, preferring the ```json form.
    fence = "```json" if "```json" in content else ("```" if "```" in content else None)
    if fence is not None:
        start = content.find(fence) + len(fence)
        end = content.find("```", start)
        # An unterminated fence previously produced end == -1, which sliced
        # off the reply's last character; run to the end of the text instead.
        if end == -1:
            end = len(content)
        content = content[start:end].strip()

    # Parse the outermost {...} span.
    start = content.find("{")
    end = content.rfind("}") + 1
    if start >= 0 and end > start:
        try:
            return json.loads(content[start:end])
        except json.JSONDecodeError:
            pass

    return {"is_risky": None, "substances": [], "risk_level": "未知", "reason": "JSON 解析失败"}
|
||||||
|
|
||||||
|
|
||||||
|
def build_prompt(row: pd.Series, max_len: int = 3000) -> str:
    """Render the user prompt for one record.

    Pulls the image-analysis output (``raw_response``) and the original
    text (``文本``) from the row, truncating each to ``max_len`` characters
    before substituting into the prompt template.
    """
    def clipped(column: str) -> str:
        value = str(row.get(column, "") or "")
        if len(value) > max_len:
            value = value[:max_len] + "...(截断)"
        return value

    return USER_PROMPT_TEMPLATE.format(
        raw_response=clipped("raw_response"),
        original_text=clipped("文本"),
    )
|
||||||
|
|
||||||
|
|
||||||
|
# ========== 验证器类 ==========
|
||||||
|
class LLMVerifier(ABC):
    """Abstract base class for LLM verifiers."""

    @abstractmethod
    def verify(self, row: pd.Series) -> VerificationResult:
        """Verify a single record and return the structured verdict."""
        pass
|
||||||
|
|
||||||
|
|
||||||
|
class OpenAIVerifier(LLMVerifier):
    """Verifier backed by an OpenAI-compatible chat API (OpenAI, DMX, Ollama)."""

    def __init__(self, config: VerifyConfig):
        # Fail fast when the optional dependency or credentials are missing.
        if not HAS_OPENAI:
            raise ImportError("请安装 openai: pip install openai")
        if config.api_type != "ollama" and not config.api_key:
            raise ValueError("未提供 API Key")

        base_url = config.base_url
        if config.api_type == "ollama":
            # Ollama exposes an OpenAI-compatible endpoint under /v1.
            base_url = (config.base_url or "http://localhost:11434") + "/v1"

        # Ollama accepts any placeholder key, hence the "ollama" fallback.
        self.client = openai.OpenAI(
            api_key=config.api_key or "ollama",
            base_url=base_url,
        )
        self.model = config.model

    def verify(self, row: pd.Series) -> VerificationResult:
        """Send one record to the chat API and parse the JSON verdict."""
        prompt = build_prompt(row)
        try:
            response = self.client.chat.completions.create(
                model=self.model,
                messages=[
                    {"role": "system", "content": SYSTEM_PROMPT},
                    {"role": "user", "content": prompt},
                ],
                temperature=0.1,
                max_tokens=500,
            )
            content = response.choices[0].message.content or ""
            # Treat truncated/filtered replies as errors instead of
            # attempting to parse partial JSON.
            if response.choices[0].finish_reason != "stop":
                return VerificationResult(
                    risk_level="错误",
                    reason=f"响应不完整 (finish_reason={response.choices[0].finish_reason})",
                    raw_response=content,
                )
            parsed = parse_json_response(content)
            return VerificationResult(
                is_risky=parsed.get("is_risky"),
                substances=parsed.get("substances", []),
                risk_level=parsed.get("risk_level", ""),
                reason=parsed.get("reason", ""),
                raw_response=content,
            )
        except Exception as e:
            # Surface API failures as a structured error result, not a crash.
            return VerificationResult(risk_level="错误", reason=f"API 调用失败: {e}", raw_response=str(e))
|
||||||
|
|
||||||
|
|
||||||
|
class DifyVerifier(LLMVerifier):
    """Verifier that calls a Dify chat application over HTTP."""

    def __init__(self, config: VerifyConfig):
        if not HAS_REQUESTS:
            raise ImportError("请安装 requests: pip install requests")
        if not config.api_key:
            raise ValueError("未提供 Dify API Key")
        self.base_url = (config.base_url or "").rstrip("/")
        self.api_key = config.api_key
        self.user_id = config.user_id

    def verify(self, row: pd.Series) -> VerificationResult:
        """POST the combined prompt to Dify and parse the JSON answer."""
        # Dify chat-messages has no separate system role, so both prompts
        # are concatenated into a single query.
        query = f"{SYSTEM_PROMPT}\n\n{build_prompt(row)}"
        try:
            resp = requests.post(
                f"{self.base_url}/v1/chat-messages",
                headers={"Authorization": f"Bearer {self.api_key}", "Content-Type": "application/json"},
                json={"inputs": {}, "query": query, "response_mode": "blocking", "user": self.user_id},
                timeout=120,
                # NOTE(review): TLS certificate verification is disabled,
                # presumably for a self-signed internal server — confirm
                # this is intentional before using against public hosts.
                verify=False,
            )
            resp.raise_for_status()
            answer = resp.json().get("answer", "")
            fields = parse_json_response(answer)
            return VerificationResult(
                is_risky=fields.get("is_risky"),
                substances=fields.get("substances", []),
                risk_level=fields.get("risk_level", ""),
                reason=fields.get("reason", ""),
                raw_response=answer,
            )
        except Exception as e:
            # Network/HTTP/JSON failures become an "error" result, not a crash.
            return VerificationResult(risk_level="错误", reason=f"Dify 调用失败: {e}", raw_response=str(e))
|
||||||
|
|
||||||
|
|
||||||
|
class MockVerifier(LLMVerifier):
    """Offline verifier for testing: flags rows by keyword lookup only."""

    # Keywords that mark a record as risky (matched case-insensitively).
    RISK_KEYWORDS = [
        "毒品", "非法", "管制", "药物", "化学品", "CAS", "阿片", "芬太尼",
        "冰毒", "大麻", "可卡因", "海洛因", "摇头丸", "麻黄碱",
        "fentanyl", "methamphetamine", "cocaine", "heroin", "mdma",
        "ketamine", "lsd", "precursor", "controlled",
    ]

    def verify(self, row: pd.Series) -> VerificationResult:
        """Scan the row's text fields for risk keywords; no API call is made."""
        haystack = f"{row.get('raw_response', '')} {row.get('文本', '')}".lower()
        hits = [kw for kw in self.RISK_KEYWORDS if kw.lower() in haystack]
        risky = bool(hits)
        return VerificationResult(
            is_risky=risky,
            substances=hits[:5],
            risk_level="中" if risky else "低",
            reason=f"Mock模式 - 发现关键词: {hits[:3]}" if risky else "Mock模式 - 未发现风险关键词",
            raw_response="(mock)",
        )
|
||||||
|
|
||||||
|
|
||||||
|
def create_verifier(config: VerifyConfig) -> LLMVerifier:
    """Instantiate the verifier matching ``config.api_type``.

    Raises:
        ValueError: if the API type is not one of mock/dify/openai/dmx/ollama.
    """
    api_type = config.api_type
    if api_type == "mock":
        return MockVerifier()
    if api_type == "dify":
        return DifyVerifier(config)
    if api_type in ("openai", "dmx", "ollama"):
        return OpenAIVerifier(config)
    raise ValueError(f"不支持的 API 类型: {config.api_type}")
|
||||||
|
|
||||||
|
|
||||||
|
# ========== 数据处理 ==========
|
||||||
|
def load_excel(file_path: Path) -> pd.DataFrame:
    """Read an Excel workbook into a DataFrame.

    Raises:
        FileNotFoundError: if *file_path* does not exist.
    """
    if file_path.exists():
        return pd.read_excel(file_path)
    raise FileNotFoundError(f"文件不存在: {file_path}")
|
||||||
|
|
||||||
|
|
||||||
|
def find_unmatched(
    original_df: pd.DataFrame,
    matched_df: pd.DataFrame,
    confidence_col: str = "confidence",
    confidence_levels: "Optional[List[str]]" = None,
) -> pd.DataFrame:
    """Return high-confidence rows of *original_df* that are absent from *matched_df*.

    Args:
        original_df: Full recognition output; must contain *confidence_col*.
        matched_df: Keyword-matcher hits; its index values are assumed to be
            a subset of *original_df*'s index (shared row labels).
        confidence_col: Name of the confidence column in *original_df*.
        confidence_levels: Levels (compared case-insensitively) that count as
            "high confidence"; falls back to the module-wide CONFIDENCE_LEVELS.

    Returns:
        A copy of the unmatched high-confidence rows in ascending index order,
        or an empty DataFrame if the column is missing or nothing is left.
    """
    levels = confidence_levels or CONFIDENCE_LEVELS

    if confidence_col not in original_df.columns:
        print(f"警告: 原始文件中不存在 '{confidence_col}' 列")
        print(f"可用列: {original_df.columns.tolist()}")
        return pd.DataFrame()

    # Compare confidence labels case-insensitively.
    conf_lower = original_df[confidence_col].astype(str).str.lower()
    levels_lower = [lvl.lower() for lvl in levels]
    high_conf_idx = set(original_df[conf_lower.isin(levels_lower)].index)
    matched_idx = set(matched_df.index)
    unmatched_idx = high_conf_idx - matched_idx

    # Summary statistics for the operator.
    print(f"\n{'='*50}")
    print("数据比对统计")
    print(f"{'='*50}")
    print(f"原始数据总行数: {len(original_df)}")
    print(f"高置信度 ({'/'.join(levels)}) 行数: {len(high_conf_idx)}")
    print(f"关键词匹配到的行数: {len(matched_idx)}")
    print(f"高置信度中已匹配: {len(high_conf_idx & matched_idx)}")
    print(f"高置信度中未匹配 (需验证): {len(unmatched_idx)}")
    print(f"{'='*50}\n")

    if not unmatched_idx:
        return pd.DataFrame()
    # Sort the labels: sets are unordered, so without this the output row
    # order (and thus the saved file) would be nondeterministic.
    return original_df.loc[sorted(unmatched_idx)].copy()
|
||||||
|
|
||||||
|
|
||||||
|
def verify_batch(df: pd.DataFrame, verifier: LLMVerifier, delay: float = REQUEST_DELAY, limit: int = 0) -> pd.DataFrame:
    """Run *verifier* over every row of *df* and attach the result columns.

    Args:
        df: Rows to verify.
        verifier: Any LLMVerifier implementation.
        delay: Seconds to sleep between calls (skipped after the last row).
        limit: If > 0, verify only the first *limit* rows.

    Returns:
        A copy of (the possibly truncated) *df* with one extra column per
        VerificationResult field, aligned on the original index.
    """
    if limit > 0:
        df = df.head(limit)

    total = len(df)
    print(f"开始 LLM 验证,共 {total} 条记录...")
    print("-" * 50)

    # Guard the empty case: pd.DataFrame([]).set_index("original_index")
    # below would raise KeyError on a column-less frame.
    if total == 0:
        return df.copy()

    results = []
    start_time = time.time()

    for i, (idx, row) in enumerate(df.iterrows()):
        # Progress line on the first, last, and every tenth record.
        if (i + 1) % 10 == 0 or i == 0 or i == total - 1:
            elapsed = time.time() - start_time
            speed = (i + 1) / elapsed if elapsed > 0 else 0
            print(f"进度: {i + 1}/{total} ({(i+1)/total*100:.1f}%) - 速度: {speed:.1f} 条/秒")

        result = verifier.verify(row)
        results.append({"original_index": idx, **result.to_columns()})

        # Throttle between API requests, but not after the final one.
        if delay > 0 and i < total - 1:
            time.sleep(delay)

    # Align results back onto the source rows by their original index labels.
    results_df = pd.DataFrame(results).set_index("original_index")
    verified_df = df.copy()
    for col in results_df.columns:
        verified_df[col] = results_df[col]
    return verified_df
|
||||||
|
|
||||||
|
|
||||||
|
# ========== 结果输出 ==========
|
||||||
|
def save_results(df: pd.DataFrame, output_file: Path, risky_only: bool = False) -> None:
    """Persist verification results to an Excel file.

    When *risky_only* is set and the LLM verdict column exists, only rows the
    LLM flagged as risky are written.
    """
    to_save = df
    if risky_only and "llm_is_risky" in to_save.columns:
        # Comparison with == True deliberately drops None/NaN verdicts too.
        to_save = to_save[to_save["llm_is_risky"] == True]
    to_save.to_excel(output_file, index=False, engine="openpyxl")
    print(f"\n已保存 {len(to_save)} 条记录到: {output_file}")
|
||||||
|
|
||||||
|
|
||||||
|
def print_summary(df: pd.DataFrame) -> None:
    """Print an aggregate summary of the LLM verification columns in *df*."""
    print(f"\n{'='*50}")
    print("验证结果摘要")
    print(f"{'='*50}")

    total = len(df)
    # Without the verdict column — or with zero rows — the percentage lines
    # below are meaningless (and risky/total would divide by zero).
    if "llm_is_risky" not in df.columns or total == 0:
        print(f"总记录数: {total}")
        return

    risky = (df["llm_is_risky"] == True).sum()
    not_risky = (df["llm_is_risky"] == False).sum()
    # Rows whose verdict is neither True nor False (None/NaN/parse failure).
    unknown = total - risky - not_risky

    print(f"总验证数: {total}")
    print(f" ├─ LLM 判定有风险: {risky} ({risky/total*100:.1f}%)")
    print(f" ├─ LLM 判定无风险: {not_risky} ({not_risky/total*100:.1f}%)")
    if unknown > 0:
        print(f" └─ 判定失败/未知: {unknown}")

    if "llm_risk_level" in df.columns:
        print(f"\n风险等级分布:")
        for level, count in df["llm_risk_level"].value_counts().items():
            print(f" - {level}: {count}")
    print(f"{'='*50}")
|
||||||
|
|
||||||
|
|
||||||
|
# ========== CLI ==========
|
||||||
|
def parse_args():
    """Parse command-line arguments for the verification CLI.

    Returns:
        argparse.Namespace with input/output paths, LLM backend options,
        confidence filtering options, and batch-control knobs.
    """
    parser = argparse.ArgumentParser(
        description="验证高置信度未匹配记录",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
示例:
  python3 verify_high_confidence.py -o original.xlsx -m matched.xlsx
  python3 verify_high_confidence.py -o original.xlsx -m matched.xlsx --mock --limit 5
  python3 verify_high_confidence.py -o original.xlsx -m matched.xlsx --api dmx --model gpt-4o-mini
""",
    )

    # Input / output file paths.
    parser.add_argument("-o", "--original", required=True, help="原始 Excel 文件路径")
    parser.add_argument("-m", "--matched", required=True, help="keyword_matcher 匹配结果文件路径")
    parser.add_argument("-r", "--result", help="输出结果文件路径 (默认: 原始文件名_llm_verified.xlsx)")

    # LLM backend selection and credentials (CLI overrides env config).
    parser.add_argument("--env-file", help="环境变量文件路径 (默认: ../.env)")
    parser.add_argument("--api", choices=["openai", "dmx", "dify", "ollama"], help="LLM API 类型")
    parser.add_argument("--model", help="LLM 模型名称")
    parser.add_argument("--base-url", help="API base URL")
    parser.add_argument("--api-key", help="API Key")
    parser.add_argument("--mock", action="store_true", help="使用 mock 模式(不调用 API)")

    # Confidence filtering: which rows are considered "high confidence".
    parser.add_argument("--confidence", nargs="+", default=["High", "Medium"], help="需要验证的置信度级别")
    parser.add_argument("--confidence-col", default="confidence", help="置信度列名")

    # Batch-control knobs: throttling, sampling, output filtering.
    parser.add_argument("--delay", type=float, default=REQUEST_DELAY, help="API 请求间隔秒数")
    parser.add_argument("--limit", type=int, default=0, help="限制验证条数 (0=全部)")
    parser.add_argument("--risky-only", action="store_true", help="只保存有风险的记录")

    return parser.parse_args()
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """CLI entry point: load config, diff the two files, run LLM verification.

    Pipeline: parse args -> load .env -> build config (CLI flags override
    env) -> find high-confidence rows missed by the keyword matcher ->
    verify them with the selected LLM backend -> print summary and save.
    """
    args = parse_args()

    # Load environment variables (.env next to the project root by default).
    base_dir = Path(__file__).resolve().parent
    env_file = args.env_file or str(base_dir.parent / ".env")
    load_env_file(env_file)

    # Build the base configuration from the environment.
    config = get_config()

    # Command-line flags override environment configuration.
    # --mock wins over --api so tests never hit a real endpoint.
    if args.mock:
        config.api_type = "mock"
    elif args.api:
        config.api_type = args.api
    if args.model:
        config.model = args.model
    if args.base_url:
        config.base_url = args.base_url
    if args.api_key:
        config.api_key = args.api_key

    # Resolve input/output file paths.
    original_file = Path(args.original)
    matched_file = Path(args.matched)
    result_file = Path(args.result) if args.result else original_file.parent / f"{original_file.stem}_llm_verified.xlsx"

    print("=" * 60)
    print("高置信度未匹配记录验证")
    print("=" * 60)
    print(f"原始文件: {original_file}")
    print(f"匹配结果: {matched_file}")
    print(f"输出文件: {result_file}")
    print(f"置信度级别: {args.confidence}")
    print(f"API 类型: {config.api_type}")
    print(f"模型: {config.model}")
    if config.base_url:
        print(f"Base URL: {config.base_url}")

    # Load both Excel files.
    print("\n正在加载数据...")
    original_df = load_excel(original_file)
    matched_df = load_excel(matched_file)

    # Find high-confidence rows that the keyword matcher did not catch.
    unmatched_df = find_unmatched(original_df, matched_df, args.confidence_col, args.confidence)

    if unmatched_df.empty:
        print("\n所有高置信度行都已被关键词匹配,无需验证。")
        return

    # Create the verifier; missing deps/keys are user errors, not tracebacks.
    try:
        verifier = create_verifier(config)
    except (ImportError, ValueError) as e:
        print(f"\n错误: {e}")
        sys.exit(1)

    # Run the LLM verification batch.
    verified_df = verify_batch(unmatched_df, verifier, delay=args.delay, limit=args.limit)

    # Print the summary and persist the results.
    print_summary(verified_df)
    save_results(verified_df, result_file, args.risky_only)
    print("\n✓ 验证完成!")
|
||||||
|
|
||||||
|
|
||||||
|
# Script entry point.
if __name__ == "__main__":
    main()
|
||||||
Reference in New Issue
Block a user