fix: bugs-01
This commit is contained in:
@@ -4,3 +4,4 @@ REDIS_URL=redis://localhost:6379/0
|
||||
UPLOAD_DIR=./uploads
|
||||
SECRET_KEY=dev-secret-key
|
||||
DEBUG=true
|
||||
OCR_PARALLELISM=4
|
||||
|
||||
@@ -57,10 +57,14 @@ async def upload_images(
|
||||
await db.flush()
|
||||
|
||||
# trigger OCR tasks in-process background (non-blocking for API response)
|
||||
from app.workers.ocr_tasks import process_image_ocr_async
|
||||
for img in results:
|
||||
if img.ocr_status.value == "pending":
|
||||
asyncio.create_task(process_image_ocr_async(str(img.id)))
|
||||
from app.workers.ocr_tasks import process_images_ocr_batch_async
|
||||
pending_ids = [str(img.id) for img in results if img.ocr_status.value == "pending"]
|
||||
if pending_ids:
|
||||
asyncio.create_task(
|
||||
process_images_ocr_batch_async(
|
||||
pending_ids, settings.OCR_PARALLELISM
|
||||
)
|
||||
)
|
||||
|
||||
return results
|
||||
|
||||
@@ -169,12 +173,16 @@ async def start_case_ocr(
|
||||
else:
|
||||
images = await repo.list_for_ocr(case_id, include_done=include_done)
|
||||
|
||||
from app.workers.ocr_tasks import process_image_ocr_async
|
||||
from app.workers.ocr_tasks import process_images_ocr_batch_async
|
||||
|
||||
submitted = 0
|
||||
for img in images:
|
||||
asyncio.create_task(process_image_ocr_async(str(img.id)))
|
||||
submitted += 1
|
||||
image_ids_to_run = [str(img.id) for img in images]
|
||||
submitted = len(image_ids_to_run)
|
||||
if image_ids_to_run:
|
||||
asyncio.create_task(
|
||||
process_images_ocr_batch_async(
|
||||
image_ids_to_run, settings.OCR_PARALLELISM
|
||||
)
|
||||
)
|
||||
|
||||
return {
|
||||
"caseId": str(case_id),
|
||||
|
||||
@@ -18,6 +18,7 @@ class Settings(BaseSettings):
|
||||
OCR_API_KEY: str = ""
|
||||
OCR_API_URL: str = ""
|
||||
OCR_MODEL: str = ""
|
||||
OCR_PARALLELISM: int = 4
|
||||
LLM_API_KEY: str = ""
|
||||
LLM_API_URL: str = ""
|
||||
LLM_MODEL: str = ""
|
||||
|
||||
@@ -1,32 +1,18 @@
|
||||
"""Transaction deduplication rules.
|
||||
|
||||
Determines whether two transaction records likely represent the same
|
||||
underlying financial event captured from different screenshots / pages.
|
||||
Only marks records as duplicate when there is strong deterministic evidence.
|
||||
Highly similar records (same amount/time/counterparty) are intentionally kept
|
||||
for manual review to avoid filtering out potential scam brushing transactions.
|
||||
"""
|
||||
from datetime import timedelta
|
||||
|
||||
from app.models.transaction import TransactionRecord
|
||||
|
||||
TIME_WINDOW = timedelta(minutes=5)
|
||||
|
||||
|
||||
def is_duplicate_pair(a: TransactionRecord, b: TransactionRecord) -> bool:
|
||||
# Rule 1: exact order_no match
|
||||
# Rule 1: exact order_no match (strong deterministic signal).
|
||||
if a.order_no and b.order_no and a.order_no == b.order_no:
|
||||
return True
|
||||
|
||||
# Rule 2: same amount + close time + same account tail
|
||||
if (
|
||||
float(a.amount) == float(b.amount)
|
||||
and a.trade_time
|
||||
and b.trade_time
|
||||
and abs(a.trade_time - b.trade_time) <= TIME_WINDOW
|
||||
):
|
||||
if a.self_account_tail_no and b.self_account_tail_no:
|
||||
if a.self_account_tail_no == b.self_account_tail_no:
|
||||
return True
|
||||
# same counterparty and close time is also strong signal
|
||||
if a.counterparty_name and a.counterparty_name == b.counterparty_name:
|
||||
return True
|
||||
|
||||
# Intentionally do NOT deduplicate by amount/time similarity.
|
||||
# Those records should enter the review stage for human confirmation.
|
||||
return False
|
||||
|
||||
@@ -1,13 +1,14 @@
|
||||
"""Transaction deduplication and matching engine.
|
||||
|
||||
Multi-layer strategy:
|
||||
1. Exact order_no match
|
||||
2. Amount + time-window + account-tail match
|
||||
3. Fuzzy text similarity (placeholder for LLM-assisted matching)
|
||||
Strategy:
|
||||
1. Deduplicate only by exact order_no match
|
||||
2. Mark transit (self-transfer) records for exclusion from fraud totals
|
||||
|
||||
Note:
|
||||
Highly similar records (same amount/time/counterparty) are preserved so they
|
||||
can be reviewed by officers in the assessment review stage.
|
||||
"""
|
||||
from uuid import UUID
|
||||
from datetime import timedelta
|
||||
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.models.transaction import TransactionRecord
|
||||
|
||||
@@ -24,6 +24,24 @@ def process_image_ocr(self, image_id: str):
|
||||
_run_async(process_image_ocr_async(image_id))
|
||||
|
||||
|
||||
async def process_images_ocr_batch_async(image_ids: list[str], max_concurrency: int) -> None:
|
||||
"""Process many images with bounded OCR concurrency."""
|
||||
if not image_ids:
|
||||
return
|
||||
concurrency = max(1, max_concurrency)
|
||||
semaphore = asyncio.Semaphore(concurrency)
|
||||
|
||||
async def _run_one(image_id: str) -> None:
|
||||
async with semaphore:
|
||||
try:
|
||||
await process_image_ocr_async(image_id)
|
||||
except Exception:
|
||||
# Keep batch processing alive even if one image fails.
|
||||
logger.exception("Image %s OCR failed in batch", image_id)
|
||||
|
||||
await asyncio.gather(*[_run_one(image_id) for image_id in image_ids])
|
||||
|
||||
|
||||
async def process_image_ocr_async(image_id_str: str):
|
||||
from app.core.database import async_session_factory
|
||||
from sqlalchemy import delete
|
||||
|
||||
@@ -37,14 +37,14 @@ class TestDedupRules:
|
||||
b = _make_tx(order_no="ORD002", counterparty_name="B", self_account_tail_no="2222")
|
||||
assert not is_duplicate_pair(a, b)
|
||||
|
||||
def test_same_amount_close_time_same_tail(self):
|
||||
def test_same_amount_close_time_same_tail_should_not_dedup(self):
|
||||
a = _make_tx(order_no="", amount=5000)
|
||||
b = _make_tx(
|
||||
order_no="",
|
||||
amount=5000,
|
||||
trade_time=datetime(2026, 3, 8, 10, 3, tzinfo=timezone.utc),
|
||||
)
|
||||
assert is_duplicate_pair(a, b)
|
||||
assert not is_duplicate_pair(a, b)
|
||||
|
||||
def test_same_amount_far_time(self):
|
||||
a = _make_tx(order_no="", amount=5000)
|
||||
@@ -55,6 +55,16 @@ class TestDedupRules:
|
||||
)
|
||||
assert not is_duplicate_pair(a, b)
|
||||
|
||||
def test_same_amount_close_time_same_counterparty_should_not_dedup(self):
|
||||
a = _make_tx(order_no="", amount=8000, counterparty_name="刷单账户A")
|
||||
b = _make_tx(
|
||||
order_no="",
|
||||
amount=8000,
|
||||
counterparty_name="刷单账户A",
|
||||
trade_time=datetime(2026, 3, 8, 10, 2, tzinfo=timezone.utc),
|
||||
)
|
||||
assert not is_duplicate_pair(a, b)
|
||||
|
||||
|
||||
class TestTransitRules:
|
||||
def test_keyword_match(self):
|
||||
|
||||
Reference in New Issue
Block a user