fix ocr

2026-03-12 12:32:29 +08:00
parent c0f9ddabbf
commit 470446fa6f
18 changed files with 591 additions and 142 deletions
--- a/backend/app/api/v1/images.py
+++ b/backend/app/api/v1/images.py
@@ -1,15 +1,16 @@
 from uuid import UUID
+import asyncio

-from fastapi import APIRouter, Depends, UploadFile, File, HTTPException, Query
+from fastapi import APIRouter, Depends, UploadFile, File, HTTPException
 from fastapi.responses import FileResponse
 from sqlalchemy.ext.asyncio import AsyncSession

 from app.core.config import settings
 from app.core.database import get_db
-from app.models.evidence_image import EvidenceImage, SourceApp, PageType
+from app.models.evidence_image import EvidenceImage, SourceApp, PageType, OcrStatus
 from app.repositories.image_repo import ImageRepository
 from app.repositories.case_repo import CaseRepository
-from app.schemas.image import ImageOut, ImageDetailOut, OcrFieldCorrection
+from app.schemas.image import ImageOut, ImageDetailOut, OcrFieldCorrection, CaseOcrStartIn
 from app.utils.hash import sha256_file
 from app.utils.file_storage import save_upload

@@ -32,9 +33,11 @@ async def upload_images(

    for f in files:
        data = await f.read()
-        file_hash = sha256_file(data)
+        raw_hash = sha256_file(data)
+        # Scope hash by case to avoid cross-case unique conflicts while still deduplicating inside one case.
+        scoped_hash = f"{raw_hash}:{case_id}"

-        existing = await img_repo.find_by_hash(file_hash)
+        existing = await img_repo.find_by_hash_in_case(case_id, [raw_hash, scoped_hash])
        if existing:
            results.append(existing)
            continue
@@ -44,7 +47,7 @@ async def upload_images(
            case_id=case_id,
            file_path=file_path,
            thumb_path=thumb_path,
-            file_hash=file_hash,
+            file_hash=scoped_hash,
            file_size=len(data),
        )
        image = await img_repo.create(image)
@@ -53,14 +56,11 @@ async def upload_images(
    case.image_count = await img_repo.count_by_case(case_id)
    await db.flush()

-    # trigger OCR tasks (non-blocking)
-    from app.workers.ocr_tasks import process_image_ocr
+    # trigger OCR tasks in-process background (non-blocking for API response)
+    from app.workers.ocr_tasks import process_image_ocr_async
    for img in results:
        if img.ocr_status.value == "pending":
-            try:
-                process_image_ocr.delay(str(img.id))
-            except Exception:
-                pass
+            asyncio.create_task(process_image_ocr_async(str(img.id)))

    return results

@@ -73,7 +73,21 @@ async def list_images(
    db: AsyncSession = Depends(get_db),
 ):
    repo = ImageRepository(db)
-    return await repo.list_by_case(case_id, source_app=source_app, page_type=page_type)
+    images = await repo.list_by_case(case_id, source_app=source_app, page_type=page_type)
+    return [
+        ImageOut(
+            id=img.id,
+            case_id=img.case_id,
+            url=f"/api/v1/images/{img.id}/file",
+            thumb_url=f"/api/v1/images/{img.id}/file",
+            source_app=img.source_app,
+            page_type=img.page_type,
+            ocr_status=img.ocr_status,
+            file_hash=img.file_hash,
+            uploaded_at=img.uploaded_at,
+        )
+        for img in images
+    ]


@router.get("/images/{image_id}", response_model=ImageDetailOut)
@@ -128,3 +142,73 @@ async def get_image_file(image_id: UUID, db: AsyncSession = Depends(get_db)):
    if not full_path.exists():
        raise HTTPException(404, "文件不存在")
    return FileResponse(full_path)
+
+
+# Strong references to in-flight OCR tasks. The event loop only holds weak
+# references to tasks, so an unreferenced task can be garbage-collected before
+# it finishes; each task removes itself from this set once it is done.
+_ocr_background_tasks: set[asyncio.Task] = set()
+
+
+def _spawn_ocr_task(image_id: str) -> None:
+    """Schedule OCR for one image on the running loop and keep the task alive."""
+    # Function-scope import kept from the original code (presumably avoids a circular import).
+    from app.workers.ocr_tasks import process_image_ocr_async
+
+    task = asyncio.create_task(process_image_ocr_async(image_id))
+    _ocr_background_tasks.add(task)
+    task.add_done_callback(_ocr_background_tasks.discard)
+
+
+@router.post("/cases/{case_id}/ocr/start")
+async def start_case_ocr(
+    case_id: UUID,
+    payload: CaseOcrStartIn | None = None,
+    db: AsyncSession = Depends(get_db),
+):
+    """Schedule in-process background OCR for images of one case.
+
+    When ``payload.image_ids`` is non-empty, the images returned by
+    ``ImageRepository.list_by_ids_in_case`` are marked ``processing`` and
+    committed before their tasks start. Otherwise the candidates come from
+    ``ImageRepository.list_for_ocr`` with the ``include_done`` flag
+    (``False`` when no payload is sent).
+
+    Returns a dict with ``caseId``, ``submitted``, ``totalCandidates`` and a
+    user-facing ``message``.
+
+    Raises:
+        HTTPException: 404 when the case does not exist.
+    """
+    case = await CaseRepository(db).get(case_id)
+    if not case:
+        raise HTTPException(404, "案件不存在")
+
+    repo = ImageRepository(db)
+    requested_ids = payload.image_ids if payload else []
+    if requested_ids:
+        images = await repo.list_by_ids_in_case(case_id, requested_ids)
+        # Explicit re-run: mark the selected images as processing immediately
+        # so the frontend can reflect the transition without a full refresh.
+        for img in images:
+            img.ocr_status = OcrStatus.processing
+        # Read the ids before committing: depending on the session's
+        # expire_on_commit setting, loaded attributes may be expired afterwards.
+        target_ids = [str(img.id) for img in images]
+        await db.flush()
+        await db.commit()
+    else:
+        include_done = payload.include_done if payload else False
+        images = await repo.list_for_ocr(case_id, include_done=include_done)
+        target_ids = [str(img.id) for img in images]
+
+    for image_id in target_ids:
+        _spawn_ocr_task(image_id)
+
+    submitted = len(target_ids)
+    return {
+        "caseId": str(case_id),
+        "submitted": submitted,
+        "totalCandidates": len(images),
+        "message": f"已提交 {submitted} 张截图的 OCR 任务",
+    }