feat(prospecting): add batch delay + fix Celery error_message field

- Add PROSPECTING_BATCH_DELAY_SECONDS config (default 1.0s) — polite
  delay between prospects in batch scans to avoid rate limiting
- Apply delay to all 5 batch API endpoints and all Celery tasks
- Fix Celery tasks: error_message → error_log (matches model field)
- Add batch-scanning.md docs with rate limiting guide, scaling estimates
  for 70k+ URL imports, and pipeline order recommendations

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-01 21:55:24 +02:00
parent 30f3dae5a3
commit 50a4fc38a7
4 changed files with 120 additions and 15 deletions

View File

@@ -8,6 +8,7 @@ catch "batch" as a string before trying to parse it as int → 422.
"""
import logging
import time
from fastapi import APIRouter, Depends, Path, Query
from fastapi.responses import HTMLResponse
@@ -15,6 +16,7 @@ from sqlalchemy.orm import Session
from app.api.deps import get_current_admin_api
from app.core.database import get_db
from app.modules.prospecting.config import config as prospecting_config
from app.modules.prospecting.models import JobType
from app.modules.prospecting.schemas.enrichment import (
ContactScrapeResponse,
@@ -45,6 +47,12 @@ router = APIRouter(prefix="/enrichment")
logger = logging.getLogger(__name__)
def _batch_delay():
    """Pause between consecutive prospects in a batch scan.

    Sleeps for ``prospecting_config.batch_delay_seconds`` seconds so that
    back-to-back scans are spaced out (intended to avoid rate limiting).
    When the configured value is not greater than zero, no sleep happens
    and the call returns immediately.
    """
    pause_seconds = prospecting_config.batch_delay_seconds
    # Only a strictly positive value triggers a sleep; zero or a negative
    # setting leaves the batch running without any pause.
    if pause_seconds > 0:
        time.sleep(pause_seconds)
# ── Batch endpoints (must be before /{prospect_id} routes) ──────────────────
@@ -58,10 +66,12 @@ def http_check_batch(
job = stats_service.create_job(db,JobType.HTTP_CHECK)
prospects = prospect_service.get_pending_http_check(db, limit=limit)
results = []
for prospect in prospects:
for i, prospect in enumerate(prospects):
result = enrichment_service.check_http(db, prospect)
results.append(HttpCheckBatchItem(domain=prospect.domain_name, **result))
stats_service.complete_job(job,processed=len(results))
if i < len(prospects) - 1:
_batch_delay()
stats_service.complete_job(job, processed=len(results))
db.commit()
return HttpCheckBatchResponse(processed=len(results), results=results)
@@ -76,11 +86,13 @@ def tech_scan_batch(
job = stats_service.create_job(db,JobType.TECH_SCAN)
prospects = prospect_service.get_pending_tech_scan(db, limit=limit)
count = 0
for prospect in prospects:
for i, prospect in enumerate(prospects):
result = enrichment_service.scan_tech_stack(db, prospect)
if result:
count += 1
stats_service.complete_job(job,processed=len(prospects))
if i < len(prospects) - 1:
_batch_delay()
stats_service.complete_job(job, processed=len(prospects))
db.commit()
return ScanBatchResponse(processed=len(prospects), successful=count)
@@ -95,11 +107,13 @@ def performance_scan_batch(
job = stats_service.create_job(db,JobType.PERFORMANCE_SCAN)
prospects = prospect_service.get_pending_performance_scan(db, limit=limit)
count = 0
for prospect in prospects:
for i, prospect in enumerate(prospects):
result = enrichment_service.scan_performance(db, prospect)
if result:
count += 1
stats_service.complete_job(job,processed=len(prospects))
if i < len(prospects) - 1:
_batch_delay()
stats_service.complete_job(job, processed=len(prospects))
db.commit()
return ScanBatchResponse(processed=len(prospects), successful=count)
@@ -114,11 +128,13 @@ def contact_scrape_batch(
job = stats_service.create_job(db,JobType.CONTACT_SCRAPE)
prospects = prospect_service.get_pending_contact_scrape(db, limit=limit)
count = 0
for prospect in prospects:
for i, prospect in enumerate(prospects):
contacts = enrichment_service.scrape_contacts(db, prospect)
if contacts:
count += 1
stats_service.complete_job(job,processed=len(prospects))
if i < len(prospects) - 1:
_batch_delay()
stats_service.complete_job(job, processed=len(prospects))
db.commit()
return ScanBatchResponse(processed=len(prospects), successful=count)
@@ -133,10 +149,12 @@ def security_audit_batch(
job = stats_service.create_job(db, JobType.SECURITY_AUDIT)
prospects = prospect_service.get_pending_security_audit(db, limit=limit)
count = 0
for prospect in prospects:
for i, prospect in enumerate(prospects):
result = security_audit_service.run_audit(db, prospect)
if result:
count += 1
if i < len(prospects) - 1:
_batch_delay()
stats_service.complete_job(job, processed=len(prospects))
db.commit()
return ScanBatchResponse(processed=len(prospects), successful=count)