From ce537bb3dca9431822a8062dc8c50d76af9d1843 Mon Sep 17 00:00:00 2001 From: ntnt Date: Thu, 12 Mar 2026 19:57:30 +0800 Subject: [PATCH] fix: bugs-01 --- backend/.env.example | 1 + backend/app/api/v1/images.py | 26 ++++++++++++++++-------- backend/app/core/config.py | 1 + backend/app/rules/dedup_rules.py | 26 ++++++------------------ backend/app/services/matching_service.py | 13 ++++++------ backend/app/workers/ocr_tasks.py | 18 ++++++++++++++++ backend/tests/test_rules.py | 14 +++++++++++-- 7 files changed, 62 insertions(+), 37 deletions(-) diff --git a/backend/.env.example b/backend/.env.example index 99d38bf..2367b88 100644 --- a/backend/.env.example +++ b/backend/.env.example @@ -4,3 +4,4 @@ REDIS_URL=redis://localhost:6379/0 UPLOAD_DIR=./uploads SECRET_KEY=dev-secret-key DEBUG=true +OCR_PARALLELISM=4 diff --git a/backend/app/api/v1/images.py b/backend/app/api/v1/images.py index 5490708..58f0dab 100644 --- a/backend/app/api/v1/images.py +++ b/backend/app/api/v1/images.py @@ -57,10 +57,14 @@ async def upload_images( await db.flush() # trigger OCR tasks in-process background (non-blocking for API response) - from app.workers.ocr_tasks import process_image_ocr_async - for img in results: - if img.ocr_status.value == "pending": - asyncio.create_task(process_image_ocr_async(str(img.id))) + from app.workers.ocr_tasks import process_images_ocr_batch_async + pending_ids = [str(img.id) for img in results if img.ocr_status.value == "pending"] + if pending_ids: + asyncio.create_task( + process_images_ocr_batch_async( + pending_ids, settings.OCR_PARALLELISM + ) + ) return results @@ -169,12 +173,16 @@ async def start_case_ocr( else: images = await repo.list_for_ocr(case_id, include_done=include_done) - from app.workers.ocr_tasks import process_image_ocr_async + from app.workers.ocr_tasks import process_images_ocr_batch_async - submitted = 0 - for img in images: - asyncio.create_task(process_image_ocr_async(str(img.id))) - submitted += 1 + image_ids_to_run = [str(img.id) for img in images] + submitted = len(image_ids_to_run) + if image_ids_to_run: + asyncio.create_task( + process_images_ocr_batch_async( + image_ids_to_run, settings.OCR_PARALLELISM + ) + ) return { "caseId": str(case_id), diff --git a/backend/app/core/config.py b/backend/app/core/config.py index f5443da..01b4306 100644 --- a/backend/app/core/config.py +++ b/backend/app/core/config.py @@ -18,6 +18,7 @@ class Settings(BaseSettings): OCR_API_KEY: str = "" OCR_API_URL: str = "" OCR_MODEL: str = "" + OCR_PARALLELISM: int = 4 LLM_API_KEY: str = "" LLM_API_URL: str = "" LLM_MODEL: str = "" diff --git a/backend/app/rules/dedup_rules.py b/backend/app/rules/dedup_rules.py index 1af81f4..04dd726 100644 --- a/backend/app/rules/dedup_rules.py +++ b/backend/app/rules/dedup_rules.py @@ -1,32 +1,18 @@ """Transaction deduplication rules. -Determines whether two transaction records likely represent the same -underlying financial event captured from different screenshots / pages. +Only marks records as duplicate when there is strong deterministic evidence. +Highly similar records (same amount/time/counterparty) are intentionally kept +for manual review to avoid filtering out potential scam brushing transactions. """ -from datetime import timedelta from app.models.transaction import TransactionRecord -TIME_WINDOW = timedelta(minutes=5) - def is_duplicate_pair(a: TransactionRecord, b: TransactionRecord) -> bool: - # Rule 1: exact order_no match + # Rule 1: exact order_no match (strong deterministic signal). if a.order_no and b.order_no and a.order_no == b.order_no: return True - # Rule 2: same amount + close time + same account tail - if ( - float(a.amount) == float(b.amount) - and a.trade_time - and b.trade_time - and abs(a.trade_time - b.trade_time) <= TIME_WINDOW - ): - if a.self_account_tail_no and b.self_account_tail_no: - if a.self_account_tail_no == b.self_account_tail_no: - return True - # same counterparty and close time is also strong signal - if a.counterparty_name and a.counterparty_name == b.counterparty_name: - return True - + # Intentionally do NOT deduplicate by amount/time similarity. + # Those records should enter the review stage for human confirmation. return False diff --git a/backend/app/services/matching_service.py b/backend/app/services/matching_service.py index 140fcc7..50144b3 100644 --- a/backend/app/services/matching_service.py +++ b/backend/app/services/matching_service.py @@ -1,13 +1,14 @@ """Transaction deduplication and matching engine. -Multi-layer strategy: - 1. Exact order_no match - 2. Amount + time-window + account-tail match - 3. Fuzzy text similarity (placeholder for LLM-assisted matching) +Strategy: + 1. Deduplicate only by exact order_no match + 2. Mark transit (self-transfer) records for exclusion from fraud totals + +Note: + Highly similar records (same amount/time/counterparty) are preserved so they + can be reviewed by officers in the assessment review stage. """ from uuid import UUID -from datetime import timedelta - from sqlalchemy.ext.asyncio import AsyncSession from app.models.transaction import TransactionRecord diff --git a/backend/app/workers/ocr_tasks.py b/backend/app/workers/ocr_tasks.py index 8667f68..32222b5 100644 --- a/backend/app/workers/ocr_tasks.py +++ b/backend/app/workers/ocr_tasks.py @@ -24,6 +24,24 @@ def process_image_ocr(self, image_id: str): _run_async(process_image_ocr_async(image_id)) +async def process_images_ocr_batch_async(image_ids: list[str], max_concurrency: int) -> None: + """Process many images with bounded OCR concurrency.""" + if not image_ids: + return + concurrency = max(1, max_concurrency) + semaphore = asyncio.Semaphore(concurrency) + + async def _run_one(image_id: str) -> None: + async with semaphore: + try: + await process_image_ocr_async(image_id) + except Exception: + # Keep batch processing alive even if one image fails. + logger.exception("Image %s OCR failed in batch", image_id) + + await asyncio.gather(*[_run_one(image_id) for image_id in image_ids]) + + async def process_image_ocr_async(image_id_str: str): from app.core.database import async_session_factory from sqlalchemy import delete diff --git a/backend/tests/test_rules.py b/backend/tests/test_rules.py index b929f08..5aacf59 100644 --- a/backend/tests/test_rules.py +++ b/backend/tests/test_rules.py @@ -37,14 +37,14 @@ class TestDedupRules: b = _make_tx(order_no="ORD002", counterparty_name="B", self_account_tail_no="2222") assert not is_duplicate_pair(a, b) - def test_same_amount_close_time_same_tail(self): + def test_same_amount_close_time_same_tail_should_not_dedup(self): a = _make_tx(order_no="", amount=5000) b = _make_tx( order_no="", amount=5000, trade_time=datetime(2026, 3, 8, 10, 3, tzinfo=timezone.utc), ) - assert is_duplicate_pair(a, b) + assert not is_duplicate_pair(a, b) def test_same_amount_far_time(self): a = _make_tx(order_no="", amount=5000) @@ -55,6 +55,16 @@ class TestDedupRules: ) assert not is_duplicate_pair(a, b) + def test_same_amount_close_time_same_counterparty_should_not_dedup(self): + a = _make_tx(order_no="", amount=8000, counterparty_name="刷单账户A") + b = _make_tx( + order_no="", + amount=8000, + counterparty_name="刷单账户A", + trade_time=datetime(2026, 3, 8, 10, 2, tzinfo=timezone.utc), + ) + assert not is_duplicate_pair(a, b) + class TestTransitRules: def test_keyword_match(self):