fix: update keywords_match
This commit is contained in:
107
scripts/batch_keyword_match.sh
Executable file
107
scripts/batch_keyword_match.sh
Executable file
@@ -0,0 +1,107 @@
|
||||
#!/bin/bash
# Batch keyword-matching script.
#
# Runs keyword_matcher.py (located next to this script) once for every
# regular *.xlsx file in <project>/data/pho_analysis_merged/ and writes the
# result to <project>/data/output/<name>_matched.xlsx.
#
# Usage:   batch_keyword_match.sh        (takes no arguments)
# Exit:    1 if the input directory is missing; 0 otherwise, including when
#          individual files fail (failures are only reported in the summary).

set -euo pipefail

# Directory containing this script, and the project root one level above it.
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
PROJECT_DIR="$(dirname "$SCRIPT_DIR")"

# Directory configuration
INPUT_DIR="$PROJECT_DIR/data/pho_analysis_merged"
OUTPUT_DIR="$PROJECT_DIR/data/output"
KEYWORDS_FILE="$PROJECT_DIR/data/keywords/keywords_all.xlsx"

# Coloured output. The escapes are stored as real ESC bytes ($'...') so that
# messages can be printed without `echo -e`, which would also interpret
# backslash sequences occurring inside paths and file names.
GREEN=$'\033[0;32m'
YELLOW=$'\033[1;33m'
RED=$'\033[0;31m'
NC=$'\033[0m' # No Color

echo "=============================================="
echo "批量关键词匹配"
echo "=============================================="
echo "输入目录: $INPUT_DIR"
echo "输出目录: $OUTPUT_DIR"
echo "关键词文件: $KEYWORDS_FILE"
echo ""

# Check the input directory
if [ ! -d "$INPUT_DIR" ]; then
  echo "${RED}错误: 输入目录不存在: $INPUT_DIR${NC}"
  exit 1
fi

# Check the keywords file; when it is missing, -k is simply not passed to
# keyword_matcher.py.
if [ ! -f "$KEYWORDS_FILE" ]; then
  echo "${YELLOW}警告: 关键词文件不存在: $KEYWORDS_FILE${NC}"
  echo "将使用默认关键词文件"
  KEYWORDS_FILE=""
fi

# Create the output directory
mkdir -p "$OUTPUT_DIR"

# Statistics
total=0
success=0
failed=0

# Collect every regular *.xlsx file in one pass. An unmatched glob stays a
# literal pattern and is rejected by the -f test, as are directories and
# broken symlinks, so no separate "first element exists" check is needed.
xlsx_files=()
for candidate in "$INPUT_DIR"/*.xlsx; do
  if [ -f "$candidate" ]; then
    xlsx_files+=("$candidate")
    # Plain assignment instead of ((total++)): the latter returns status 1
    # when the old value is 0, which aborts the script under `set -e`.
    total=$((total + 1))
  fi
done

# Nothing to do
if [ "$total" -eq 0 ]; then
  echo "${YELLOW}没有找到 xlsx 文件${NC}"
  exit 0
fi

echo "找到 $total 个文件待处理"
echo "----------------------------------------------"

# Process each file
current=0
for input_file in "${xlsx_files[@]}"; do
  current=$((current + 1))

  # File name without the .xlsx extension
  filename=$(basename "$input_file" .xlsx)
  output_file="$OUTPUT_DIR/${filename}_matched.xlsx"

  printf '\n[%s/%s] 处理: %s\n' "$current" "$total" "$filename"

  # Build the command as an argv array so paths containing spaces, quotes or
  # shell metacharacters are passed through verbatim (no eval / re-parsing).
  cmd=(python3 "$SCRIPT_DIR/keyword_matcher.py" -t "$input_file" -o "$output_file")
  if [ -n "$KEYWORDS_FILE" ]; then
    cmd+=(-k "$KEYWORDS_FILE")
  fi

  # Run the matcher; being the `if` condition, a failure here does not trip
  # `set -e`, so the remaining files are still processed.
  if "${cmd[@]}"; then
    echo "${GREEN} ✓ 完成: ${filename}_matched.xlsx${NC}"
    success=$((success + 1))
  else
    echo "${RED} ✗ 失败: $filename${NC}"
    failed=$((failed + 1))
  fi
done

# Summary
echo ""
echo "=============================================="
echo "处理完成"
echo "=============================================="
echo "总计: $total | ${GREEN}成功: $success${NC} | ${RED}失败: $failed${NC}"
echo "输出目录: $OUTPUT_DIR"
|
||||
Reference in New Issue
Block a user