fix: bugs-01

This commit is contained in:
2026-03-12 19:57:30 +08:00
parent 9e609f89a3
commit ce537bb3dc
7 changed files with 62 additions and 37 deletions

View File

@@ -4,3 +4,4 @@ REDIS_URL=redis://localhost:6379/0
UPLOAD_DIR=./uploads UPLOAD_DIR=./uploads
SECRET_KEY=dev-secret-key SECRET_KEY=dev-secret-key
DEBUG=true DEBUG=true
OCR_PARALLELISM=4

View File

@@ -57,10 +57,14 @@ async def upload_images(
await db.flush() await db.flush()
# trigger OCR tasks in-process background (non-blocking for API response) # trigger OCR tasks in-process background (non-blocking for API response)
from app.workers.ocr_tasks import process_image_ocr_async from app.workers.ocr_tasks import process_images_ocr_batch_async
for img in results: pending_ids = [str(img.id) for img in results if img.ocr_status.value == "pending"]
if img.ocr_status.value == "pending": if pending_ids:
asyncio.create_task(process_image_ocr_async(str(img.id))) asyncio.create_task(
process_images_ocr_batch_async(
pending_ids, settings.OCR_PARALLELISM
)
)
return results return results
@@ -169,12 +173,16 @@ async def start_case_ocr(
else: else:
images = await repo.list_for_ocr(case_id, include_done=include_done) images = await repo.list_for_ocr(case_id, include_done=include_done)
from app.workers.ocr_tasks import process_image_ocr_async from app.workers.ocr_tasks import process_images_ocr_batch_async
submitted = 0 image_ids_to_run = [str(img.id) for img in images]
for img in images: submitted = len(image_ids_to_run)
asyncio.create_task(process_image_ocr_async(str(img.id))) if image_ids_to_run:
submitted += 1 asyncio.create_task(
process_images_ocr_batch_async(
image_ids_to_run, settings.OCR_PARALLELISM
)
)
return { return {
"caseId": str(case_id), "caseId": str(case_id),

View File

@@ -18,6 +18,7 @@ class Settings(BaseSettings):
OCR_API_KEY: str = "" OCR_API_KEY: str = ""
OCR_API_URL: str = "" OCR_API_URL: str = ""
OCR_MODEL: str = "" OCR_MODEL: str = ""
OCR_PARALLELISM: int = 4
LLM_API_KEY: str = "" LLM_API_KEY: str = ""
LLM_API_URL: str = "" LLM_API_URL: str = ""
LLM_MODEL: str = "" LLM_MODEL: str = ""

View File

@@ -1,32 +1,18 @@
"""Transaction deduplication rules. """Transaction deduplication rules.
Determines whether two transaction records likely represent the same Only marks records as duplicate when there is strong deterministic evidence.
underlying financial event captured from different screenshots / pages. Highly similar records (same amount/time/counterparty) are intentionally kept
for manual review to avoid filtering out potential scam brushing transactions.
""" """
from datetime import timedelta
from app.models.transaction import TransactionRecord from app.models.transaction import TransactionRecord
TIME_WINDOW = timedelta(minutes=5)
def is_duplicate_pair(a: TransactionRecord, b: TransactionRecord) -> bool: def is_duplicate_pair(a: TransactionRecord, b: TransactionRecord) -> bool:
# Rule 1: exact order_no match # Rule 1: exact order_no match (strong deterministic signal).
if a.order_no and b.order_no and a.order_no == b.order_no: if a.order_no and b.order_no and a.order_no == b.order_no:
return True return True
# Rule 2: same amount + close time + same account tail # Intentionally do NOT deduplicate by amount/time similarity.
if ( # Those records should enter the review stage for human confirmation.
float(a.amount) == float(b.amount)
and a.trade_time
and b.trade_time
and abs(a.trade_time - b.trade_time) <= TIME_WINDOW
):
if a.self_account_tail_no and b.self_account_tail_no:
if a.self_account_tail_no == b.self_account_tail_no:
return True
# same counterparty and close time is also strong signal
if a.counterparty_name and a.counterparty_name == b.counterparty_name:
return True
return False return False

View File

@@ -1,13 +1,14 @@
"""Transaction deduplication and matching engine. """Transaction deduplication and matching engine.
Multi-layer strategy: Strategy:
1. Exact order_no match 1. Deduplicate only by exact order_no match
2. Amount + time-window + account-tail match 2. Mark transit (self-transfer) records for exclusion from fraud totals
3. Fuzzy text similarity (placeholder for LLM-assisted matching)
Note:
Highly similar records (same amount/time/counterparty) are preserved so they
can be reviewed by officers in the assessment review stage.
""" """
from uuid import UUID from uuid import UUID
from datetime import timedelta
from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.ext.asyncio import AsyncSession
from app.models.transaction import TransactionRecord from app.models.transaction import TransactionRecord

View File

@@ -24,6 +24,24 @@ def process_image_ocr(self, image_id: str):
_run_async(process_image_ocr_async(image_id)) _run_async(process_image_ocr_async(image_id))
async def process_images_ocr_batch_async(image_ids: list[str], max_concurrency: int) -> None:
"""Process many images with bounded OCR concurrency."""
if not image_ids:
return
concurrency = max(1, max_concurrency)
semaphore = asyncio.Semaphore(concurrency)
async def _run_one(image_id: str) -> None:
async with semaphore:
try:
await process_image_ocr_async(image_id)
except Exception:
# Keep batch processing alive even if one image fails.
logger.exception("Image %s OCR failed in batch", image_id)
await asyncio.gather(*[_run_one(image_id) for image_id in image_ids])
async def process_image_ocr_async(image_id_str: str): async def process_image_ocr_async(image_id_str: str):
from app.core.database import async_session_factory from app.core.database import async_session_factory
from sqlalchemy import delete from sqlalchemy import delete

View File

@@ -37,14 +37,14 @@ class TestDedupRules:
b = _make_tx(order_no="ORD002", counterparty_name="B", self_account_tail_no="2222") b = _make_tx(order_no="ORD002", counterparty_name="B", self_account_tail_no="2222")
assert not is_duplicate_pair(a, b) assert not is_duplicate_pair(a, b)
def test_same_amount_close_time_same_tail(self): def test_same_amount_close_time_same_tail_should_not_dedup(self):
a = _make_tx(order_no="", amount=5000) a = _make_tx(order_no="", amount=5000)
b = _make_tx( b = _make_tx(
order_no="", order_no="",
amount=5000, amount=5000,
trade_time=datetime(2026, 3, 8, 10, 3, tzinfo=timezone.utc), trade_time=datetime(2026, 3, 8, 10, 3, tzinfo=timezone.utc),
) )
assert is_duplicate_pair(a, b) assert not is_duplicate_pair(a, b)
def test_same_amount_far_time(self): def test_same_amount_far_time(self):
a = _make_tx(order_no="", amount=5000) a = _make_tx(order_no="", amount=5000)
@@ -55,6 +55,16 @@ class TestDedupRules:
) )
assert not is_duplicate_pair(a, b) assert not is_duplicate_pair(a, b)
def test_same_amount_close_time_same_counterparty_should_not_dedup(self):
a = _make_tx(order_no="", amount=8000, counterparty_name="刷单账户A")
b = _make_tx(
order_no="",
amount=8000,
counterparty_name="刷单账户A",
trade_time=datetime(2026, 3, 8, 10, 2, tzinfo=timezone.utc),
)
assert not is_duplicate_pair(a, b)
class TestTransitRules: class TestTransitRules:
def test_keyword_match(self): def test_keyword_match(self):