fix ocr

2026-03-12 12:32:29 +08:00
parent c0f9ddabbf
commit 470446fa6f
18 changed files with 591 additions and 142 deletions
--- a/backend/app/workers/ocr_tasks.py
+++ b/backend/app/workers/ocr_tasks.py
@@ -1,6 +1,7 @@
 """Celery tasks for OCR processing of uploaded screenshots."""
 import asyncio
 import logging
+import json
 from uuid import UUID

 from app.workers.celery_app import celery_app
@@ -20,18 +21,19 @@ def _run_async(coro):
@celery_app.task(name="app.workers.ocr_tasks.process_image_ocr", bind=True, max_retries=3)
 def process_image_ocr(self, image_id: str):
    """Process a single image: classify page, extract fields, save to DB."""
-    _run_async(_process(image_id))
+    _run_async(process_image_ocr_async(image_id))


-async def _process(image_id_str: str):
+async def process_image_ocr_async(image_id_str: str):
    from app.core.database import async_session_factory
+    from sqlalchemy import delete
    from app.models.evidence_image import EvidenceImage, OcrStatus
    from app.models.ocr_block import OcrBlock
+    from app.models.transaction import TransactionRecord
    from app.services.ocr_service import classify_page, extract_transaction_fields
    from app.services.parser_service import parse_extracted_fields

    image_id = UUID(image_id_str)
-
    async with async_session_factory() as db:
        image = await db.get(EvidenceImage, image_id)
        if not image:
@@ -40,23 +42,39 @@ async def _process(image_id_str: str):

        image.ocr_status = OcrStatus.processing
        await db.flush()
+        # Re-running OCR for this image replaces its old OCR blocks and transaction records.
+        await db.execute(delete(OcrBlock).where(OcrBlock.image_id == image.id))
+        await db.execute(delete(TransactionRecord).where(TransactionRecord.evidence_image_id == image.id))

        try:
            source_app, page_type = await classify_page(image.file_path)
            image.source_app = source_app
            image.page_type = page_type

-            raw_fields = await extract_transaction_fields(image.file_path, source_app, page_type)
+            raw_fields, raw_ocr_text = await extract_transaction_fields(image.file_path, source_app, page_type)

-            # save raw OCR block
-            block = OcrBlock(
-                image_id=image.id,
-                content=str(raw_fields),
-                bbox={},
-                seq_order=0,
-                confidence=raw_fields.get("confidence", 0.5) if isinstance(raw_fields, dict) else 0.5,
-            )
-            db.add(block)
+            is_empty_extract = raw_fields is None or raw_fields == {} or raw_fields == []
+
+            # save raw OCR block (direct model output for debugging)
+            raw_block_content = (raw_ocr_text or "").strip()
+            if raw_block_content:
+                block = OcrBlock(
+                    image_id=image.id,
+                    content=raw_block_content,
+                    bbox={},
+                    seq_order=0,
+                    confidence=raw_fields.get("confidence", 0.5) if isinstance(raw_fields, dict) else 0.5,
+                )
+                db.add(block)
+            elif not is_empty_extract:
+                block = OcrBlock(
+                    image_id=image.id,
+                    content=json.dumps(raw_fields, ensure_ascii=False),
+                    bbox={},
+                    seq_order=0,
+                    confidence=raw_fields.get("confidence", 0.5) if isinstance(raw_fields, dict) else 0.5,
+                )
+                db.add(block)

            # parse into transaction records
            records = parse_extracted_fields(raw_fields, image.case_id, image.id, source_app)