feat(prospecting): add batch delay + fix Celery error_message field
- Add PROSPECTING_BATCH_DELAY_SECONDS config (default 1.0s) — polite delay between prospects in batch scans to avoid rate limiting
- Apply delay to all 5 batch API endpoints and all Celery tasks
- Fix Celery tasks: error_message → error_log (matches model field)
- Add batch-scanning.md docs with rate limiting guide, scaling estimates for 70k+ URL imports, and pipeline order recommendations

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -8,6 +8,7 @@ catch "batch" as a string before trying to parse it as int → 422.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import time
|
||||
|
||||
from fastapi import APIRouter, Depends, Path, Query
|
||||
from fastapi.responses import HTMLResponse
|
||||
@@ -15,6 +16,7 @@ from sqlalchemy.orm import Session
|
||||
|
||||
from app.api.deps import get_current_admin_api
|
||||
from app.core.database import get_db
|
||||
from app.modules.prospecting.config import config as prospecting_config
|
||||
from app.modules.prospecting.models import JobType
|
||||
from app.modules.prospecting.schemas.enrichment import (
|
||||
ContactScrapeResponse,
|
||||
@@ -45,6 +47,12 @@ router = APIRouter(prefix="/enrichment")
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _batch_delay():
    """Pause between prospects in a batch scan to avoid rate limiting.

    Sleeps for ``prospecting_config.batch_delay_seconds`` seconds when that
    value is greater than zero; otherwise returns immediately.
    """
    pause_seconds = prospecting_config.batch_delay_seconds
    if pause_seconds > 0:
        time.sleep(pause_seconds)
|
||||
|
||||
|
||||
# ── Batch endpoints (must be before /{prospect_id} routes) ──────────────────
|
||||
|
||||
|
||||
@@ -58,10 +66,12 @@ def http_check_batch(
|
||||
job = stats_service.create_job(db,JobType.HTTP_CHECK)
|
||||
prospects = prospect_service.get_pending_http_check(db, limit=limit)
|
||||
results = []
|
||||
for prospect in prospects:
|
||||
for i, prospect in enumerate(prospects):
|
||||
result = enrichment_service.check_http(db, prospect)
|
||||
results.append(HttpCheckBatchItem(domain=prospect.domain_name, **result))
|
||||
stats_service.complete_job(job,processed=len(results))
|
||||
if i < len(prospects) - 1:
|
||||
_batch_delay()
|
||||
stats_service.complete_job(job, processed=len(results))
|
||||
db.commit()
|
||||
return HttpCheckBatchResponse(processed=len(results), results=results)
|
||||
|
||||
@@ -76,11 +86,13 @@ def tech_scan_batch(
|
||||
job = stats_service.create_job(db,JobType.TECH_SCAN)
|
||||
prospects = prospect_service.get_pending_tech_scan(db, limit=limit)
|
||||
count = 0
|
||||
for prospect in prospects:
|
||||
for i, prospect in enumerate(prospects):
|
||||
result = enrichment_service.scan_tech_stack(db, prospect)
|
||||
if result:
|
||||
count += 1
|
||||
stats_service.complete_job(job,processed=len(prospects))
|
||||
if i < len(prospects) - 1:
|
||||
_batch_delay()
|
||||
stats_service.complete_job(job, processed=len(prospects))
|
||||
db.commit()
|
||||
return ScanBatchResponse(processed=len(prospects), successful=count)
|
||||
|
||||
@@ -95,11 +107,13 @@ def performance_scan_batch(
|
||||
job = stats_service.create_job(db,JobType.PERFORMANCE_SCAN)
|
||||
prospects = prospect_service.get_pending_performance_scan(db, limit=limit)
|
||||
count = 0
|
||||
for prospect in prospects:
|
||||
for i, prospect in enumerate(prospects):
|
||||
result = enrichment_service.scan_performance(db, prospect)
|
||||
if result:
|
||||
count += 1
|
||||
stats_service.complete_job(job,processed=len(prospects))
|
||||
if i < len(prospects) - 1:
|
||||
_batch_delay()
|
||||
stats_service.complete_job(job, processed=len(prospects))
|
||||
db.commit()
|
||||
return ScanBatchResponse(processed=len(prospects), successful=count)
|
||||
|
||||
@@ -114,11 +128,13 @@ def contact_scrape_batch(
|
||||
job = stats_service.create_job(db,JobType.CONTACT_SCRAPE)
|
||||
prospects = prospect_service.get_pending_contact_scrape(db, limit=limit)
|
||||
count = 0
|
||||
for prospect in prospects:
|
||||
for i, prospect in enumerate(prospects):
|
||||
contacts = enrichment_service.scrape_contacts(db, prospect)
|
||||
if contacts:
|
||||
count += 1
|
||||
stats_service.complete_job(job,processed=len(prospects))
|
||||
if i < len(prospects) - 1:
|
||||
_batch_delay()
|
||||
stats_service.complete_job(job, processed=len(prospects))
|
||||
db.commit()
|
||||
return ScanBatchResponse(processed=len(prospects), successful=count)
|
||||
|
||||
@@ -133,10 +149,12 @@ def security_audit_batch(
|
||||
job = stats_service.create_job(db, JobType.SECURITY_AUDIT)
|
||||
prospects = prospect_service.get_pending_security_audit(db, limit=limit)
|
||||
count = 0
|
||||
for prospect in prospects:
|
||||
for i, prospect in enumerate(prospects):
|
||||
result = security_audit_service.run_audit(db, prospect)
|
||||
if result:
|
||||
count += 1
|
||||
if i < len(prospects) - 1:
|
||||
_batch_delay()
|
||||
stats_service.complete_job(job, processed=len(prospects))
|
||||
db.commit()
|
||||
return ScanBatchResponse(processed=len(prospects), successful=count)
|
||||
|
||||
Reference in New Issue
Block a user