fix: bugs-01

This commit is contained in:
2026-03-12 19:57:30 +08:00
parent 9e609f89a3
commit ce537bb3dc
7 changed files with 62 additions and 37 deletions

View File

@@ -4,3 +4,4 @@ REDIS_URL=redis://localhost:6379/0
UPLOAD_DIR=./uploads
SECRET_KEY=dev-secret-key
DEBUG=true
OCR_PARALLELISM=4

View File

@@ -57,10 +57,14 @@ async def upload_images(
await db.flush()
# trigger OCR tasks in-process background (non-blocking for API response)
from app.workers.ocr_tasks import process_image_ocr_async
for img in results:
if img.ocr_status.value == "pending":
asyncio.create_task(process_image_ocr_async(str(img.id)))
from app.workers.ocr_tasks import process_images_ocr_batch_async
pending_ids = [str(img.id) for img in results if img.ocr_status.value == "pending"]
if pending_ids:
asyncio.create_task(
process_images_ocr_batch_async(
pending_ids, settings.OCR_PARALLELISM
)
)
return results
@@ -169,12 +173,16 @@ async def start_case_ocr(
else:
images = await repo.list_for_ocr(case_id, include_done=include_done)
from app.workers.ocr_tasks import process_image_ocr_async
from app.workers.ocr_tasks import process_images_ocr_batch_async
submitted = 0
for img in images:
asyncio.create_task(process_image_ocr_async(str(img.id)))
submitted += 1
image_ids_to_run = [str(img.id) for img in images]
submitted = len(image_ids_to_run)
if image_ids_to_run:
asyncio.create_task(
process_images_ocr_batch_async(
image_ids_to_run, settings.OCR_PARALLELISM
)
)
return {
"caseId": str(case_id),

View File

@@ -18,6 +18,7 @@ class Settings(BaseSettings):
OCR_API_KEY: str = ""
OCR_API_URL: str = ""
OCR_MODEL: str = ""
OCR_PARALLELISM: int = 4
LLM_API_KEY: str = ""
LLM_API_URL: str = ""
LLM_MODEL: str = ""

View File

@@ -1,32 +1,18 @@
"""Transaction deduplication rules.
Determines whether two transaction records likely represent the same
underlying financial event captured from different screenshots / pages.
Only marks records as duplicate when there is strong deterministic evidence.
Highly similar records (same amount/time/counterparty) are intentionally kept
for manual review to avoid filtering out potential scam brushing transactions.
"""
from datetime import timedelta
from app.models.transaction import TransactionRecord
TIME_WINDOW = timedelta(minutes=5)
def is_duplicate_pair(a: "TransactionRecord", b: "TransactionRecord") -> bool:
    """Return True only when *a* and *b* deterministically identify the same transaction.

    The single accepted proof of duplication is an exact ``order_no`` match on
    both records. Near-duplicates (same amount, close trade time, same
    counterparty or account tail) are deliberately NOT collapsed here: they are
    kept so officers can confirm them in the manual review stage, since
    auto-filtering could hide scam "brushing" transaction patterns.

    Args:
        a: First transaction record.
        b: Second transaction record.

    Returns:
        True when both order numbers are non-empty and equal, else False.
    """
    # Rule 1: exact order_no match (strong deterministic signal).
    # Both sides must carry a non-empty order number; empty/None never matches.
    if a.order_no and b.order_no and a.order_no == b.order_no:
        return True
    # Intentionally do NOT deduplicate by amount/time similarity.
    # Those records should enter the review stage for human confirmation.
    return False

View File

@@ -1,13 +1,14 @@
"""Transaction deduplication and matching engine.
Multi-layer strategy:
1. Exact order_no match
2. Amount + time-window + account-tail match
3. Fuzzy text similarity (placeholder for LLM-assisted matching)
Strategy:
1. Deduplicate only by exact order_no match
2. Mark transit (self-transfer) records for exclusion from fraud totals
Note:
Highly similar records (same amount/time/counterparty) are preserved so they
can be reviewed by officers in the assessment review stage.
"""
from uuid import UUID
from datetime import timedelta
from sqlalchemy.ext.asyncio import AsyncSession
from app.models.transaction import TransactionRecord

View File

@@ -24,6 +24,24 @@ def process_image_ocr(self, image_id: str):
_run_async(process_image_ocr_async(image_id))
async def process_images_ocr_batch_async(image_ids: list[str], max_concurrency: int) -> None:
    """Run OCR for every image id with a hard cap on concurrent tasks.

    Each image is processed independently; a failure on one image is logged
    and swallowed so it cannot abort the remainder of the batch.
    """
    if not image_ids:
        return
    # The semaphore bounds how many OCR coroutines run simultaneously.
    # Clamp to at least 1 so a non-positive config value cannot deadlock us.
    gate = asyncio.Semaphore(max(1, max_concurrency))

    async def _guarded(one_id: str) -> None:
        async with gate:
            try:
                await process_image_ocr_async(one_id)
            except Exception:
                # Keep batch processing alive even if one image fails.
                logger.exception("Image %s OCR failed in batch", one_id)

    await asyncio.gather(*(_guarded(one_id) for one_id in image_ids))
async def process_image_ocr_async(image_id_str: str):
from app.core.database import async_session_factory
from sqlalchemy import delete

View File

@@ -37,14 +37,14 @@ class TestDedupRules:
b = _make_tx(order_no="ORD002", counterparty_name="B", self_account_tail_no="2222")
assert not is_duplicate_pair(a, b)
# NOTE(review): diff-rendered pair — the first `def` line below appears to be
# the pre-commit test name and the second its post-commit rename; verify
# against the repository history.
def test_same_amount_close_time_same_tail(self):
def test_same_amount_close_time_same_tail_should_not_dedup(self):
# Two records with identical amount and trade times ~3 minutes apart.
a = _make_tx(order_no="", amount=5000)
b = _make_tx(
order_no="",
amount=5000,
trade_time=datetime(2026, 3, 8, 10, 3, tzinfo=timezone.utc),
)
# Old expectation (removed): amount + close-time pairs were deduplicated.
assert is_duplicate_pair(a, b)
# New expectation (added): such pairs are kept for manual review.
assert not is_duplicate_pair(a, b)
def test_same_amount_far_time(self):
a = _make_tx(order_no="", amount=5000)
@@ -55,6 +55,16 @@ class TestDedupRules:
)
assert not is_duplicate_pair(a, b)
# Same amount, same counterparty, trade times ~2 minutes apart: must NOT be
# treated as duplicates — the dedup rules only collapse exact order_no
# matches, so near-duplicates like this reach the manual review stage.
def test_same_amount_close_time_same_counterparty_should_not_dedup(self):
a = _make_tx(order_no="", amount=8000, counterparty_name="刷单账户A")
b = _make_tx(
order_no="",
amount=8000,
counterparty_name="刷单账户A",
trade_time=datetime(2026, 3, 8, 10, 2, tzinfo=timezone.utc),
)
assert not is_duplicate_pair(a, b)
class TestTransitRules:
def test_keyword_match(self):