- New scrape_content() method in enrichment_service: extracts meta
description, H1/H2 headings, paragraphs, images (filtered for size),
social links, service items, and detected languages using BeautifulSoup
- Scans 6 pages per prospect: /, /about, /a-propos, /services,
/nos-services, /contact
- Results stored as JSON in prospect.scraped_content_json
- New endpoints: POST /content-scrape/{id} and /content-scrape/batch
- Added to full_enrichment pipeline (Step 5, before security audit)
- CONTENT_SCRAPE job type for scan-jobs tracking
- "Content Scrape" batch button on scan-jobs page
- Add beautifulsoup4 to requirements.txt
Tested on batirenovation-strasbourg.fr: extracted 30 headings,
21 paragraphs, 13 images.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
358 lines
13 KiB
Python
358 lines
13 KiB
Python
# app/modules/prospecting/routes/api/admin_enrichment.py
|
|
"""
Admin API routes for the enrichment/scanning pipeline.

NOTE: Batch routes MUST be defined before /{prospect_id} routes.
FastAPI matches routes in definition order, so a /{prospect_id} route
declared first would capture the literal path segment "batch", fail to
parse it as an int, and respond with 422.
"""
|
|
|
|
import logging
|
|
import time
|
|
|
|
from fastapi import APIRouter, Depends, Path, Query
|
|
from fastapi.responses import HTMLResponse
|
|
from sqlalchemy.orm import Session
|
|
|
|
from app.api.deps import get_current_admin_api
|
|
from app.core.database import get_db
|
|
from app.modules.prospecting.config import config as prospecting_config
|
|
from app.modules.prospecting.models import JobType
|
|
from app.modules.prospecting.schemas.enrichment import (
|
|
ContactScrapeResponse,
|
|
FullEnrichmentResponse,
|
|
HttpCheckBatchItem,
|
|
HttpCheckBatchResponse,
|
|
HttpCheckResult,
|
|
ScanBatchResponse,
|
|
ScanSingleResponse,
|
|
ScoreComputeBatchResponse,
|
|
)
|
|
from app.modules.prospecting.schemas.security_audit import (
|
|
SecurityAuditSingleResponse,
|
|
)
|
|
from app.modules.prospecting.services.enrichment_service import enrichment_service
|
|
from app.modules.prospecting.services.prospect_service import prospect_service
|
|
from app.modules.prospecting.services.scoring_service import scoring_service
|
|
from app.modules.prospecting.services.security_audit_service import (
|
|
security_audit_service,
|
|
)
|
|
from app.modules.prospecting.services.security_report_service import (
|
|
security_report_service,
|
|
)
|
|
from app.modules.prospecting.services.stats_service import stats_service
|
|
from app.modules.tenancy.schemas.auth import UserContext
|
|
|
|
router = APIRouter(prefix="/enrichment")
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
def _batch_delay():
    """Pause between consecutive prospects of a batch scan to avoid rate limiting.

    Sleeps for ``prospecting_config.batch_delay_seconds`` seconds. Does
    nothing when that setting is not greater than zero.
    """
    pause_seconds = prospecting_config.batch_delay_seconds
    if pause_seconds > 0:
        time.sleep(pause_seconds)
|
|
|
|
|
|
# ── Batch endpoints (must be before /{prospect_id} routes) ──────────────────
|
|
|
|
|
|
@router.post("/http-check/batch", response_model=HttpCheckBatchResponse)
def http_check_batch(
    limit: int = Query(100, ge=1, le=500),
    db: Session = Depends(get_db),
    current_admin: UserContext = Depends(get_current_admin_api),
):
    """Run the HTTP check for prospects that are still pending one.

    Creates an HTTP_CHECK job, checks up to ``limit`` pending prospects
    (pausing between consecutive prospects), marks the job complete with
    the number of results, commits, and returns the per-domain results.
    """
    job = stats_service.create_job(db, JobType.HTTP_CHECK)
    pending = prospect_service.get_pending_http_check(db, limit=limit)

    items = []
    for position, prospect in enumerate(pending):
        # Pause before every prospect except the first, i.e. only *between* prospects.
        if position:
            _batch_delay()
        outcome = enrichment_service.check_http(db, prospect)
        items.append(HttpCheckBatchItem(domain=prospect.domain_name, **outcome))

    processed = len(items)
    stats_service.complete_job(job, processed=processed)
    db.commit()
    return HttpCheckBatchResponse(processed=processed, results=items)
|
|
|
|
|
|
@router.post("/tech-scan/batch", response_model=ScanBatchResponse)
def tech_scan_batch(
    limit: int = Query(100, ge=1, le=500),
    db: Session = Depends(get_db),
    current_admin: UserContext = Depends(get_current_admin_api),
):
    """Run the technology scan for prospects that are still pending one.

    Creates a TECH_SCAN job, scans up to ``limit`` pending prospects
    (pausing between consecutive prospects), marks the job complete,
    commits, and reports how many prospects were processed and how many
    scans returned a truthy result.
    """
    job = stats_service.create_job(db, JobType.TECH_SCAN)
    pending = prospect_service.get_pending_tech_scan(db, limit=limit)

    successful = 0
    for position, prospect in enumerate(pending):
        # Pause before every prospect except the first, i.e. only *between* prospects.
        if position:
            _batch_delay()
        if enrichment_service.scan_tech_stack(db, prospect):
            successful += 1

    total = len(pending)
    stats_service.complete_job(job, processed=total)
    db.commit()
    return ScanBatchResponse(processed=total, successful=successful)
|
|
|
|
|
|
@router.post("/performance/batch", response_model=ScanBatchResponse)
def performance_scan_batch(
    limit: int = Query(50, ge=1, le=200),
    db: Session = Depends(get_db),
    current_admin: UserContext = Depends(get_current_admin_api),
):
    """Run the performance scan for prospects that are still pending one.

    Creates a PERFORMANCE_SCAN job, scans up to ``limit`` pending
    prospects (pausing between consecutive prospects), marks the job
    complete, commits, and reports how many prospects were processed and
    how many scans returned a truthy result.
    """
    job = stats_service.create_job(db, JobType.PERFORMANCE_SCAN)
    pending = prospect_service.get_pending_performance_scan(db, limit=limit)

    successful = 0
    for position, prospect in enumerate(pending):
        # Pause before every prospect except the first, i.e. only *between* prospects.
        if position:
            _batch_delay()
        if enrichment_service.scan_performance(db, prospect):
            successful += 1

    total = len(pending)
    stats_service.complete_job(job, processed=total)
    db.commit()
    return ScanBatchResponse(processed=total, successful=successful)
|
|
|
|
|
|
@router.post("/contacts/batch", response_model=ScanBatchResponse)
def contact_scrape_batch(
    limit: int = Query(50, ge=1, le=200),
    db: Session = Depends(get_db),
    current_admin: UserContext = Depends(get_current_admin_api),
):
    """Scrape contacts for prospects that are still pending a contact scrape.

    Creates a CONTACT_SCRAPE job, scrapes up to ``limit`` pending
    prospects (pausing between consecutive prospects), marks the job
    complete, commits, and reports how many prospects were processed and
    for how many the scrape returned a truthy (non-empty) result.
    """
    job = stats_service.create_job(db, JobType.CONTACT_SCRAPE)
    pending = prospect_service.get_pending_contact_scrape(db, limit=limit)

    successful = 0
    for position, prospect in enumerate(pending):
        # Pause before every prospect except the first, i.e. only *between* prospects.
        if position:
            _batch_delay()
        if enrichment_service.scrape_contacts(db, prospect):
            successful += 1

    total = len(pending)
    stats_service.complete_job(job, processed=total)
    db.commit()
    return ScanBatchResponse(processed=total, successful=successful)
|
|
|
|
|
|
@router.post("/content-scrape/batch", response_model=ScanBatchResponse)
def content_scrape_batch(
    limit: int = Query(50, ge=1, le=200),
    db: Session = Depends(get_db),
    current_admin: UserContext = Depends(get_current_admin_api),
):
    """Scrape page content for prospects that are still pending a content scrape.

    Creates a CONTENT_SCRAPE job, scrapes up to ``limit`` pending
    prospects (pausing between consecutive prospects), marks the job
    complete, commits, and reports how many prospects were processed and
    for how many the scrape returned a truthy result.
    """
    job = stats_service.create_job(db, JobType.CONTENT_SCRAPE)
    pending = prospect_service.get_pending_content_scrape(db, limit=limit)

    successful = 0
    for position, prospect in enumerate(pending):
        # Pause before every prospect except the first, i.e. only *between* prospects.
        if position:
            _batch_delay()
        if enrichment_service.scrape_content(db, prospect):
            successful += 1

    total = len(pending)
    stats_service.complete_job(job, processed=total)
    db.commit()
    return ScanBatchResponse(processed=total, successful=successful)
|
|
|
|
|
|
@router.post("/security-audit/batch", response_model=ScanBatchResponse)
def security_audit_batch(
    limit: int = Query(50, ge=1, le=200),
    db: Session = Depends(get_db),
    current_admin: UserContext = Depends(get_current_admin_api),
):
    """Run the security audit for prospects that are still pending one.

    Creates a SECURITY_AUDIT job, audits up to ``limit`` pending
    prospects (pausing between consecutive prospects), marks the job
    complete, commits, and reports how many prospects were processed and
    how many audits returned a truthy result.
    """
    job = stats_service.create_job(db, JobType.SECURITY_AUDIT)
    pending = prospect_service.get_pending_security_audit(db, limit=limit)

    successful = 0
    for position, prospect in enumerate(pending):
        # Pause before every prospect except the first, i.e. only *between* prospects.
        if position:
            _batch_delay()
        if security_audit_service.run_audit(db, prospect):
            successful += 1

    total = len(pending)
    stats_service.complete_job(job, processed=total)
    db.commit()
    return ScanBatchResponse(processed=total, successful=successful)
|
|
|
|
|
|
@router.post("/score-compute/batch", response_model=ScoreComputeBatchResponse)
def compute_scores_batch(
    limit: int = Query(500, ge=1, le=5000),
    db: Session = Depends(get_db),
    current_admin: UserContext = Depends(get_current_admin_api),
):
    """Compute or recompute prospect scores in bulk.

    Creates a SCORE_COMPUTE job, delegates to
    ``scoring_service.compute_all`` with the given ``limit``, marks the
    job complete with the returned count, commits, and returns that count.
    """
    job = stats_service.create_job(db, JobType.SCORE_COMPUTE)
    scored = scoring_service.compute_all(db, limit=limit)
    stats_service.complete_job(job, processed=scored)
    db.commit()
    return ScoreComputeBatchResponse(scored=scored)
|
|
|
|
|
|
# ── Report endpoints ────────────────────────────────────────────────────────
|
|
|
|
|
|
@router.get("/security-audit/report/{prospect_id}", response_class=HTMLResponse)
def security_audit_report(
    prospect_id: int = Path(...),
    db: Session = Depends(get_db),
    current_admin: UserContext = Depends(get_current_admin_api),
):
    """Generate the branded HTML security audit report for one prospect.

    Raises:
        ResourceNotFoundException: if the prospect has no security audit.
    """
    prospect = prospect_service.get_by_id(db, prospect_id)
    audit = prospect.security_audit

    # Guard clause: a report can only be rendered from an existing audit.
    if not audit:
        from app.exceptions.base import ResourceNotFoundException

        raise ResourceNotFoundException("SecurityAudit", str(prospect_id))

    report_html = security_report_service.generate_html_report(
        audit=audit,
        domain=prospect.domain_name,
    )
    return HTMLResponse(content=report_html)
|
|
|
|
|
|
# ── Single-prospect endpoints ───────────────────────────────────────────────
|
|
|
|
|
|
@router.post("/http-check/{prospect_id}", response_model=HttpCheckResult)
def http_check_single(
    prospect_id: int = Path(...),
    db: Session = Depends(get_db),
    current_admin: UserContext = Depends(get_current_admin_api),
):
    """Run the HTTP connectivity check for one prospect and commit the outcome.

    The mapping returned by ``enrichment_service.check_http`` is expanded
    into the ``HttpCheckResult`` response.
    """
    target = prospect_service.get_by_id(db, prospect_id)
    check_fields = enrichment_service.check_http(db, target)
    db.commit()
    return HttpCheckResult(**check_fields)
|
|
|
|
|
|
@router.post("/tech-scan/{prospect_id}", response_model=ScanSingleResponse)
def tech_scan_single(
    prospect_id: int = Path(...),
    db: Session = Depends(get_db),
    current_admin: UserContext = Depends(get_current_admin_api),
):
    """Run the technology scan for one prospect and commit the outcome.

    The response's ``profile`` flag is True when the scan returned
    anything other than ``None``.
    """
    target = prospect_service.get_by_id(db, prospect_id)
    tech_profile = enrichment_service.scan_tech_stack(db, target)
    db.commit()

    has_profile = tech_profile is not None
    return ScanSingleResponse(domain=target.domain_name, profile=has_profile)
|
|
|
|
|
|
@router.post("/performance/{prospect_id}", response_model=ScanSingleResponse)
def performance_scan_single(
    prospect_id: int = Path(...),
    db: Session = Depends(get_db),
    current_admin: UserContext = Depends(get_current_admin_api),
):
    """Run the PageSpeed audit for one prospect and commit the outcome.

    The response's ``profile`` flag is True when the scan returned
    anything other than ``None``.
    """
    target = prospect_service.get_by_id(db, prospect_id)
    perf_profile = enrichment_service.scan_performance(db, target)
    db.commit()

    has_profile = perf_profile is not None
    return ScanSingleResponse(domain=target.domain_name, profile=has_profile)
|
|
|
|
|
|
@router.post("/contacts/{prospect_id}", response_model=ContactScrapeResponse)
def scrape_contacts_single(
    prospect_id: int = Path(...),
    db: Session = Depends(get_db),
    current_admin: UserContext = Depends(get_current_admin_api),
):
    """Scrape contacts for one prospect, commit, and report how many were found."""
    target = prospect_service.get_by_id(db, prospect_id)
    found = enrichment_service.scrape_contacts(db, target)
    db.commit()
    return ContactScrapeResponse(
        domain=target.domain_name,
        contacts_found=len(found),
    )
|
|
|
|
|
|
@router.post("/security-audit/{prospect_id}", response_model=SecurityAuditSingleResponse)
def security_audit_single(
    prospect_id: int = Path(...),
    db: Session = Depends(get_db),
    current_admin: UserContext = Depends(get_current_admin_api),
):
    """Run the security audit for one prospect and commit the outcome.

    When the audit service returns a falsy result, the response falls
    back to score 0, grade "F" and zero findings.
    """
    target = prospect_service.get_by_id(db, prospect_id)
    audit = security_audit_service.run_audit(db, target)
    db.commit()

    if audit:
        # Total findings across all four severity levels.
        total_findings = sum(
            (
                audit.findings_count_critical,
                audit.findings_count_high,
                audit.findings_count_medium,
                audit.findings_count_low,
            )
        )
        score, grade = audit.score, audit.grade
    else:
        total_findings, score, grade = 0, 0, "F"

    return SecurityAuditSingleResponse(
        domain=target.domain_name,
        score=score,
        grade=grade,
        findings_count=total_findings,
    )
|
|
|
|
|
|
@router.post("/content-scrape/{prospect_id}", response_model=ScanSingleResponse)
def content_scrape_single(
    prospect_id: int = Path(...),
    db: Session = Depends(get_db),
    current_admin: UserContext = Depends(get_current_admin_api),
):
    """Scrape page content for one prospect and commit the outcome.

    The response's ``profile`` flag is True when the scrape returned
    anything other than ``None``.
    """
    target = prospect_service.get_by_id(db, prospect_id)
    scraped = enrichment_service.scrape_content(db, target)
    db.commit()

    has_content = scraped is not None
    return ScanSingleResponse(domain=target.domain_name, profile=has_content)
|
|
|
|
|
|
@router.post("/full/{prospect_id}", response_model=FullEnrichmentResponse)
def full_enrichment(
    prospect_id: int = Path(...),
    db: Session = Depends(get_db),
    current_admin: UserContext = Depends(get_current_admin_api),
):
    """Run the full enrichment pipeline for one prospect.

    Order: HTTP check, tech scan, performance scan, contact scrape,
    content scrape, security audit, then score computation. Every step
    after the HTTP check is skipped unless ``prospect.has_website`` is
    truthy at that point. The session is committed once, at the end.
    """
    prospect = prospect_service.get_by_id(db, prospect_id)

    # 1. HTTP check
    enrichment_service.check_http(db, prospect)

    # 2. Tech scan (websites only)
    tech_profile = (
        enrichment_service.scan_tech_stack(db, prospect)
        if prospect.has_website
        else None
    )

    # 3. Performance scan (websites only)
    perf_profile = (
        enrichment_service.scan_performance(db, prospect)
        if prospect.has_website
        else None
    )

    # 4. Contact scrape (websites only)
    contacts = (
        enrichment_service.scrape_contacts(db, prospect)
        if prospect.has_website
        else []
    )

    # 5. Content scrape (websites only); result is not part of the response
    if prospect.has_website:
        enrichment_service.scrape_content(db, prospect)

    # 6. Security audit (websites only); result is not part of the response
    if prospect.has_website:
        security_audit_service.run_audit(db, prospect)

    # 7. Score: reload the prospect from the database before scoring
    db.refresh(prospect)
    score = scoring_service.compute_score(db, prospect)
    db.commit()

    return FullEnrichmentResponse(
        domain=prospect.domain_name,
        has_website=prospect.has_website,
        tech_scanned=tech_profile is not None,
        perf_scanned=perf_profile is not None,
        contacts_found=len(contacts),
        score=score.score,
        lead_tier=score.lead_tier,
    )
|