fix: bugs-01
This commit is contained in:
@@ -4,3 +4,4 @@ REDIS_URL=redis://localhost:6379/0
|
|||||||
UPLOAD_DIR=./uploads
|
UPLOAD_DIR=./uploads
|
||||||
SECRET_KEY=dev-secret-key
|
SECRET_KEY=dev-secret-key
|
||||||
DEBUG=true
|
DEBUG=true
|
||||||
|
OCR_PARALLELISM=4
|
||||||
|
|||||||
@@ -57,10 +57,14 @@ async def upload_images(
|
|||||||
await db.flush()
|
await db.flush()
|
||||||
|
|
||||||
# trigger OCR tasks in-process background (non-blocking for API response)
|
# trigger OCR tasks in-process background (non-blocking for API response)
|
||||||
from app.workers.ocr_tasks import process_image_ocr_async
|
from app.workers.ocr_tasks import process_images_ocr_batch_async
|
||||||
for img in results:
|
pending_ids = [str(img.id) for img in results if img.ocr_status.value == "pending"]
|
||||||
if img.ocr_status.value == "pending":
|
if pending_ids:
|
||||||
asyncio.create_task(process_image_ocr_async(str(img.id)))
|
asyncio.create_task(
|
||||||
|
process_images_ocr_batch_async(
|
||||||
|
pending_ids, settings.OCR_PARALLELISM
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
return results
|
return results
|
||||||
|
|
||||||
@@ -169,12 +173,16 @@ async def start_case_ocr(
|
|||||||
else:
|
else:
|
||||||
images = await repo.list_for_ocr(case_id, include_done=include_done)
|
images = await repo.list_for_ocr(case_id, include_done=include_done)
|
||||||
|
|
||||||
from app.workers.ocr_tasks import process_image_ocr_async
|
from app.workers.ocr_tasks import process_images_ocr_batch_async
|
||||||
|
|
||||||
submitted = 0
|
image_ids_to_run = [str(img.id) for img in images]
|
||||||
for img in images:
|
submitted = len(image_ids_to_run)
|
||||||
asyncio.create_task(process_image_ocr_async(str(img.id)))
|
if image_ids_to_run:
|
||||||
submitted += 1
|
asyncio.create_task(
|
||||||
|
process_images_ocr_batch_async(
|
||||||
|
image_ids_to_run, settings.OCR_PARALLELISM
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"caseId": str(case_id),
|
"caseId": str(case_id),
|
||||||
|
|||||||
@@ -18,6 +18,7 @@ class Settings(BaseSettings):
|
|||||||
OCR_API_KEY: str = ""
|
OCR_API_KEY: str = ""
|
||||||
OCR_API_URL: str = ""
|
OCR_API_URL: str = ""
|
||||||
OCR_MODEL: str = ""
|
OCR_MODEL: str = ""
|
||||||
|
OCR_PARALLELISM: int = 4
|
||||||
LLM_API_KEY: str = ""
|
LLM_API_KEY: str = ""
|
||||||
LLM_API_URL: str = ""
|
LLM_API_URL: str = ""
|
||||||
LLM_MODEL: str = ""
|
LLM_MODEL: str = ""
|
||||||
|
|||||||
@@ -1,32 +1,18 @@
|
|||||||
"""Transaction deduplication rules.
|
"""Transaction deduplication rules.
|
||||||
|
|
||||||
Determines whether two transaction records likely represent the same
|
Only marks records as duplicate when there is strong deterministic evidence.
|
||||||
underlying financial event captured from different screenshots / pages.
|
Highly similar records (same amount/time/counterparty) are intentionally kept
|
||||||
|
for manual review to avoid filtering out potential scam brushing transactions.
|
||||||
"""
|
"""
|
||||||
from datetime import timedelta
|
|
||||||
|
|
||||||
from app.models.transaction import TransactionRecord
|
from app.models.transaction import TransactionRecord
|
||||||
|
|
||||||
TIME_WINDOW = timedelta(minutes=5)
|
|
||||||
|
|
||||||
|
|
||||||
def is_duplicate_pair(a: TransactionRecord, b: TransactionRecord) -> bool:
|
def is_duplicate_pair(a: TransactionRecord, b: TransactionRecord) -> bool:
|
||||||
# Rule 1: exact order_no match
|
# Rule 1: exact order_no match (strong deterministic signal).
|
||||||
if a.order_no and b.order_no and a.order_no == b.order_no:
|
if a.order_no and b.order_no and a.order_no == b.order_no:
|
||||||
return True
|
return True
|
||||||
|
|
||||||
# Rule 2: same amount + close time + same account tail
|
# Intentionally do NOT deduplicate by amount/time similarity.
|
||||||
if (
|
# Those records should enter the review stage for human confirmation.
|
||||||
float(a.amount) == float(b.amount)
|
|
||||||
and a.trade_time
|
|
||||||
and b.trade_time
|
|
||||||
and abs(a.trade_time - b.trade_time) <= TIME_WINDOW
|
|
||||||
):
|
|
||||||
if a.self_account_tail_no and b.self_account_tail_no:
|
|
||||||
if a.self_account_tail_no == b.self_account_tail_no:
|
|
||||||
return True
|
|
||||||
# same counterparty and close time is also strong signal
|
|
||||||
if a.counterparty_name and a.counterparty_name == b.counterparty_name:
|
|
||||||
return True
|
|
||||||
|
|
||||||
return False
|
return False
|
||||||
|
|||||||
@@ -1,13 +1,14 @@
|
|||||||
"""Transaction deduplication and matching engine.
|
"""Transaction deduplication and matching engine.
|
||||||
|
|
||||||
Multi-layer strategy:
|
Strategy:
|
||||||
1. Exact order_no match
|
1. Deduplicate only by exact order_no match
|
||||||
2. Amount + time-window + account-tail match
|
2. Mark transit (self-transfer) records for exclusion from fraud totals
|
||||||
3. Fuzzy text similarity (placeholder for LLM-assisted matching)
|
|
||||||
|
Note:
|
||||||
|
Highly similar records (same amount/time/counterparty) are preserved so they
|
||||||
|
can be reviewed by officers in the assessment review stage.
|
||||||
"""
|
"""
|
||||||
from uuid import UUID
|
from uuid import UUID
|
||||||
from datetime import timedelta
|
|
||||||
|
|
||||||
from sqlalchemy.ext.asyncio import AsyncSession
|
from sqlalchemy.ext.asyncio import AsyncSession
|
||||||
|
|
||||||
from app.models.transaction import TransactionRecord
|
from app.models.transaction import TransactionRecord
|
||||||
|
|||||||
@@ -24,6 +24,24 @@ def process_image_ocr(self, image_id: str):
|
|||||||
_run_async(process_image_ocr_async(image_id))
|
_run_async(process_image_ocr_async(image_id))
|
||||||
|
|
||||||
|
|
||||||
|
async def process_images_ocr_batch_async(image_ids: list[str], max_concurrency: int) -> None:
|
||||||
|
"""Process many images with bounded OCR concurrency."""
|
||||||
|
if not image_ids:
|
||||||
|
return
|
||||||
|
concurrency = max(1, max_concurrency)
|
||||||
|
semaphore = asyncio.Semaphore(concurrency)
|
||||||
|
|
||||||
|
async def _run_one(image_id: str) -> None:
|
||||||
|
async with semaphore:
|
||||||
|
try:
|
||||||
|
await process_image_ocr_async(image_id)
|
||||||
|
except Exception:
|
||||||
|
# Keep batch processing alive even if one image fails.
|
||||||
|
logger.exception("Image %s OCR failed in batch", image_id)
|
||||||
|
|
||||||
|
await asyncio.gather(*[_run_one(image_id) for image_id in image_ids])
|
||||||
|
|
||||||
|
|
||||||
async def process_image_ocr_async(image_id_str: str):
|
async def process_image_ocr_async(image_id_str: str):
|
||||||
from app.core.database import async_session_factory
|
from app.core.database import async_session_factory
|
||||||
from sqlalchemy import delete
|
from sqlalchemy import delete
|
||||||
|
|||||||
@@ -37,14 +37,14 @@ class TestDedupRules:
|
|||||||
b = _make_tx(order_no="ORD002", counterparty_name="B", self_account_tail_no="2222")
|
b = _make_tx(order_no="ORD002", counterparty_name="B", self_account_tail_no="2222")
|
||||||
assert not is_duplicate_pair(a, b)
|
assert not is_duplicate_pair(a, b)
|
||||||
|
|
||||||
def test_same_amount_close_time_same_tail(self):
|
def test_same_amount_close_time_same_tail_should_not_dedup(self):
|
||||||
a = _make_tx(order_no="", amount=5000)
|
a = _make_tx(order_no="", amount=5000)
|
||||||
b = _make_tx(
|
b = _make_tx(
|
||||||
order_no="",
|
order_no="",
|
||||||
amount=5000,
|
amount=5000,
|
||||||
trade_time=datetime(2026, 3, 8, 10, 3, tzinfo=timezone.utc),
|
trade_time=datetime(2026, 3, 8, 10, 3, tzinfo=timezone.utc),
|
||||||
)
|
)
|
||||||
assert is_duplicate_pair(a, b)
|
assert not is_duplicate_pair(a, b)
|
||||||
|
|
||||||
def test_same_amount_far_time(self):
|
def test_same_amount_far_time(self):
|
||||||
a = _make_tx(order_no="", amount=5000)
|
a = _make_tx(order_no="", amount=5000)
|
||||||
@@ -55,6 +55,16 @@ class TestDedupRules:
|
|||||||
)
|
)
|
||||||
assert not is_duplicate_pair(a, b)
|
assert not is_duplicate_pair(a, b)
|
||||||
|
|
||||||
|
def test_same_amount_close_time_same_counterparty_should_not_dedup(self):
|
||||||
|
a = _make_tx(order_no="", amount=8000, counterparty_name="刷单账户A")
|
||||||
|
b = _make_tx(
|
||||||
|
order_no="",
|
||||||
|
amount=8000,
|
||||||
|
counterparty_name="刷单账户A",
|
||||||
|
trade_time=datetime(2026, 3, 8, 10, 2, tzinfo=timezone.utc),
|
||||||
|
)
|
||||||
|
assert not is_duplicate_pair(a, b)
|
||||||
|
|
||||||
|
|
||||||
class TestTransitRules:
|
class TestTransitRules:
|
||||||
def test_keyword_match(self):
|
def test_keyword_match(self):
|
||||||
|
|||||||
Reference in New Issue
Block a user