fix: update keywords_match

2026-01-18 18:25:36 +08:00
parent 29f6e25f70
commit 4ed90734df
7 changed files with 1406 additions and 269 deletions
--- a/scripts/keyword_matcher.py
+++ b/scripts/keyword_matcher.py
@@ -47,6 +47,18 @@ MODE_LABELS = {
    "exact": "精确匹配",
 }

+# Common text column names (listed in priority order)
+COMMON_TEXT_COLUMNS = [
+    "detected_text",  # new format (image analysis results)
+    "文本",           # old format / merged original text
+    "text",
+    "content",
+    "summary",
+]
+
+# Default combination of columns used for multi-column matching
+DEFAULT_TEXT_COLUMNS = ["detected_text", "文本"]
+

 # ========== 数据类 ==========
@dataclass
@@ -136,6 +148,71 @@ def split_value(value: str, separator: str) -> List[str]:
    return [part.strip() for part in parts if part and part.strip()]


+def detect_text_columns(
+    df: pd.DataFrame,
+    specified_columns: Optional[List[str]] = None
+) -> List[str]:
+    """
+    Detect and validate the text column names to search.

+    Args:
+        df: the data frame whose columns are inspected
+        specified_columns: column names requested by the user, or None

+    Returns: the specified columns that exist in df, else auto-detected ones

+    Raises: ValueError if no suitable column can be found
+    """
+    # The user specified column names: keep only those present in df
+    if specified_columns:
+        available = [col for col in specified_columns if col in df.columns]
+        missing = [col for col in specified_columns if col not in df.columns]

+        if missing:
+            print(f"警告: 以下指定的列不存在: {missing}")

+        if available:
+            return available
+        else:
+            print("警告: 所有指定的列都不存在，尝试自动检测...")

+    # Auto-detect: prefer the default multi-column combination
+    available_default = [col for col in DEFAULT_TEXT_COLUMNS if col in df.columns]
+    if available_default:
+        print(f"自动检测到文本列: {available_default}")
+        return available_default

+    # Fallback: use the first common column name that is present
+    for col in COMMON_TEXT_COLUMNS:
+        if col in df.columns:
+            print(f"自动检测到文本列: ['{col}']")
+            return [col]

+    # Nothing matched: raise, listing the available columns
+    raise ValueError(
+        f"无法自动检测文本列。可用列: {df.columns.tolist()}\n"
+        f"请使用 -c 参数指定文本列名"
+    )
+
+
+def combine_text_columns(row: pd.Series, text_columns: List[str]) -> str:
+    """
+    Merge the text content of several columns into one string.

+    Args:
+        row: one row of a DataFrame
+        text_columns: names of the columns to merge

+    Returns: the non-blank, stripped values joined by newlines ("" if none)
+    """
+    texts = []
+    for col in text_columns:
+        val = row.get(col)  # Series.get yields None when the label is absent
+        if pd.notna(val) and str(val).strip():  # skip NaN/None and blank values
+            texts.append(str(val).strip())
+    return "\n".join(texts)
+
+
 def load_keywords_for_mode(
    df: pd.DataFrame,
    mode: str,
@@ -205,22 +282,32 @@ class KeywordMatcher(ABC):
        self,
        df: pd.DataFrame,
        keywords: Set[str],
-        text_column: str
+        text_columns: List[str]
    ) -> MatchResult:
-        """执行匹配（模板方法）"""
+        """执行匹配（模板方法）
+
+        参数：
+            df: 数据框
+            keywords: 关键词集合
+            text_columns: 文本列名列表（支持多列）
+        """
        print(f"开始匹配（使用{self.name}）...")
+        print(f"搜索列: {text_columns}")
        self._prepare(keywords)

        matched_indices = []
        matched_keywords_list = []
        start_time = time.time()

-        for idx, text in enumerate(df[text_column]):
-            if pd.isna(text):
+        for idx in range(len(df)):
+            row = df.iloc[idx]
+            # 合并多列文本
+            combined_text = combine_text_columns(row, text_columns)
+
+            if not combined_text:
                continue

-            text_str = str(text)
-            matches = self._match_single_text(text_str, keywords)
+            matches = self._match_single_text(combined_text, keywords)

            if matches:
                matched_indices.append(idx)
@@ -435,22 +522,36 @@ def preview_results(result_df: pd.DataFrame, num_rows: int = 5) -> None:
 def perform_matching(
    df: pd.DataFrame,
    keywords: Set[str],
-    text_column: str,
+    text_columns: List[str],
    output_file: str,
    algorithm: str = "auto",
    mode: str = None
 ) -> Optional[pd.DataFrame]:
-    """执行完整的匹配流程"""
+    """执行完整的匹配流程
+
+    参数：
+        df: 数据框
+        keywords: 关键词集合
+        text_columns: 文本列名列表（支持多列）
+        output_file: 输出文件路径
+        algorithm: 匹配算法
+        mode: 匹配模式
+    """
    # 验证列存在
-    if text_column not in df.columns:
+    missing_cols = [col for col in text_columns if col not in df.columns]
+    if missing_cols:
+        print(f"警告: 以下列不存在: {missing_cols}")
+        text_columns = [col for col in text_columns if col in df.columns]
+
+    if not text_columns:
        print(f"可用列名: {df.columns.tolist()}")
-        raise ValueError(f"列 '{text_column}' 不存在")
+        raise ValueError("没有可用的文本列")

    print(f"文本文件共有 {len(df)} 行数据\n")

    # 创建匹配器并执行匹配
    matcher = create_matcher(algorithm, mode=mode)
-    result = matcher.match(df, keywords, text_column)
+    result = matcher.match(df, keywords, text_columns)

    # 输出统计信息
    print_statistics(result)
@@ -465,7 +566,7 @@ def process_single_mode(
    keywords_df: pd.DataFrame,
    text_df: pd.DataFrame,
    mode: str,
-    text_column: str,
+    text_columns: List[str],
    output_file: Path,
    separator: str = SEPARATOR,
    save_to_file: bool = True
@@ -473,6 +574,9 @@ def process_single_mode(
    """
    处理单个检测模式

+    参数：
+        text_columns: 文本列名列表（支持多列）
+
    返回：匹配结果 DataFrame（包含原始索引）
    """
    mode_lower = mode.lower()
@@ -501,7 +605,7 @@ def process_single_mode(
    result_df = perform_matching(
        df=text_df,
        keywords=keywords,
-        text_column=text_column,
+        text_columns=text_columns,
        output_file=temp_output,
        algorithm=algorithm,
        mode=mode_lower  # 传递模式参数
@@ -528,11 +632,15 @@ def run_multiple_modes(
    keywords_file: Path,
    text_file: Path,
    output_file: Path,
-    text_column: str,
+    text_columns: Optional[List[str]],
    modes: List[str],
    separator: str = SEPARATOR
 ) -> None:
-    """运行多个检测模式，合并结果到单一文件"""
+    """运行多个检测模式，合并结果到单一文件
+
+    参数：
+        text_columns: 文本列名列表（支持多列），None 表示自动检测
+    """
    # 验证文件存在
    if not keywords_file.exists():
        raise FileNotFoundError(f"找不到关键词文件: {keywords_file}")
@@ -546,7 +654,10 @@ def run_multiple_modes(

    print(f"正在加载文本文件: {text_file}")
    text_df = pd.read_excel(text_file)
-    print(f"文本列: {text_column}\n")
+
+    # 自动检测或验证文本列
+    actual_text_columns = detect_text_columns(text_df, text_columns)
+    print(f"使用文本列: {actual_text_columns}\n")

    # 验证模式
    if not modes:
@@ -568,7 +679,7 @@ def run_multiple_modes(
            keywords_df=keywords_df,
            text_df=text_df,
            mode=mode_lower,
-            text_column=text_column,
+            text_columns=actual_text_columns,
            output_file=output_file,  # 这个参数在 save_to_file=False 时不使用
            separator=separator,
            save_to_file=False  # 不保存到单独文件
@@ -668,7 +779,7 @@ def parse_args():
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
 示例:
-  # 使用默认配置（两种模式）
+  # 使用默认配置（自动检测 detected_text 和 文本 列）
  python keyword_matcher.py

  # 仅执行 CAS 号识别
@@ -677,6 +788,12 @@ def parse_args():
  # 仅执行精确匹配
  python keyword_matcher.py -m exact

+  # 指定单个文本列
+  python keyword_matcher.py -c detected_text
+
+  # 指定多个文本列
+  python keyword_matcher.py -c detected_text 文本 summary
+
  # 指定自定义文件路径
  python keyword_matcher.py -k ../data/input/keywords.xlsx -t ../data/input/text.xlsx
        """
@@ -701,10 +818,11 @@ def parse_args():
    )

    parser.add_argument(
-        '-c', '--text-column',
+        '-c', '--text-columns',
+        nargs='+',
        type=str,
-        default='文本',
-        help='文本列名 (默认: 文本)'
+        default=None,
+        help='文本列名，支持多列 (默认: 自动检测 detected_text 和 文本)'
    )

    parser.add_argument(
@@ -759,7 +877,7 @@ def main():
            keywords_file=keywords_file,
            text_file=text_file,
            output_file=output_file,
-            text_column=args.text_column,
+            text_columns=args.text_columns,
            modes=args.modes,
            separator=args.separator
        )