feat(prospecting): add content scraping for POC builder (Workstream 3A)
- New scrape_content() method in enrichment_service: extracts meta
description, H1/H2 headings, paragraphs, images (filtered for size),
social links, service items, and detected languages using BeautifulSoup
- Scans 6 pages per prospect: /, /about, /a-propos, /services,
/nos-services, /contact
- Results stored as JSON in prospect.scraped_content_json
- New endpoints: POST /content-scrape/{id} and /content-scrape/batch
- Added to full_enrichment pipeline (Step 5, before security audit)
- CONTENT_SCRAPE job type for scan-jobs tracking
- "Content Scrape" batch button on scan-jobs page
- Add beautifulsoup4 to requirements.txt
Tested on batirenovation-strasbourg.fr: extracted 30 headings,
21 paragraphs, 13 images.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -139,6 +139,27 @@ def contact_scrape_batch(
|
||||
return ScanBatchResponse(processed=len(prospects), successful=count)
|
||||
|
||||
|
||||
@router.post("/content-scrape/batch", response_model=ScanBatchResponse)
def content_scrape_batch(
    limit: int = Query(50, ge=1, le=200),
    db: Session = Depends(get_db),
    current_admin: UserContext = Depends(get_current_admin_api),
):
    """Scrape page content for pending prospects.

    Creates a CONTENT_SCRAPE job, runs the content scraper over up to
    ``limit`` pending prospects, and reports how many scrapes succeeded.
    """
    job = stats_service.create_job(db, JobType.CONTENT_SCRAPE)
    prospects = prospect_service.get_pending_content_scrape(db, limit=limit)
    total = len(prospects)

    successful = 0
    for position, prospect in enumerate(prospects, start=1):
        if enrichment_service.scrape_content(db, prospect):
            successful += 1
        # Throttle between prospects, but skip the delay after the last one.
        if position < total:
            _batch_delay()

    stats_service.complete_job(job, processed=total)
    db.commit()
    return ScanBatchResponse(processed=total, successful=successful)
|
||||
|
||||
|
||||
@router.post("/security-audit/batch", response_model=ScanBatchResponse)
|
||||
def security_audit_batch(
|
||||
limit: int = Query(50, ge=1, le=200),
|
||||
@@ -272,6 +293,19 @@ def security_audit_single(
|
||||
)
|
||||
|
||||
|
||||
@router.post("/content-scrape/{prospect_id}", response_model=ScanSingleResponse)
def content_scrape_single(
    prospect_id: int = Path(...),
    db: Session = Depends(get_db),
    current_admin: UserContext = Depends(get_current_admin_api),
):
    """Scrape page content for a single prospect.

    Returns the prospect's domain and whether the scrape produced a result.

    Raises:
        HTTPException: 404 if no prospect exists with ``prospect_id``.
    """
    prospect = prospect_service.get_by_id(db, prospect_id)
    if prospect is None:
        # Without this guard an unknown id crashes on prospect.domain_name
        # (AttributeError -> HTTP 500) instead of returning a clean 404.
        # Local import: the file's top-of-file imports are not visible in
        # this chunk, so we don't assume HTTPException is already in scope.
        from fastapi import HTTPException

        raise HTTPException(status_code=404, detail="Prospect not found")
    result = enrichment_service.scrape_content(db, prospect)
    db.commit()
    return ScanSingleResponse(domain=prospect.domain_name, profile=result is not None)
|
||||
|
||||
|
||||
@router.post("/full/{prospect_id}", response_model=FullEnrichmentResponse)
|
||||
def full_enrichment(
|
||||
prospect_id: int = Path(...),
|
||||
@@ -299,11 +333,15 @@ def full_enrichment(
|
||||
if prospect.has_website:
|
||||
contacts = enrichment_service.scrape_contacts(db, prospect)
|
||||
|
||||
# Step 5: Security audit (if has website)
|
||||
# Step 5: Content scrape (if has website)
|
||||
if prospect.has_website:
|
||||
enrichment_service.scrape_content(db, prospect)
|
||||
|
||||
# Step 6: Security audit (if has website)
|
||||
if prospect.has_website:
|
||||
security_audit_service.run_audit(db, prospect)
|
||||
|
||||
# Step 6: Compute score
|
||||
# Step 7: Compute score
|
||||
db.refresh(prospect)
|
||||
score = scoring_service.compute_score(db, prospect)
|
||||
db.commit()
|
||||
|
||||
Reference in New Issue
Block a user