feat(prospecting): add content scraping for POC builder (Workstream 3A)

- New scrape_content() method in enrichment_service: extracts meta
  description, H1/H2 headings, paragraphs, images (filtered for size),
  social links, service items, and detected languages using BeautifulSoup
- Scans 6 pages per prospect: /, /about, /a-propos, /services,
  /nos-services, /contact
- Results stored as JSON in prospect.scraped_content_json
- New endpoints: POST /content-scrape/{id} and /content-scrape/batch
- Added to full_enrichment pipeline (Step 5, before security audit)
- CONTENT_SCRAPE job type for scan-jobs tracking
- "Content Scrape" batch button on scan-jobs page
- Add beautifulsoup4 to requirements.txt

Tested on batirenovation-strasbourg.fr: extracted 30 headings,
21 paragraphs, 13 images.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-01 22:26:56 +02:00
parent 50a4fc38a7
commit 1828ac85eb
8 changed files with 218 additions and 2 deletions

View File

@@ -70,6 +70,10 @@ class Prospect(Base, TimestampMixin):
last_perf_scan_at = Column(DateTime, nullable=True)
last_contact_scrape_at = Column(DateTime, nullable=True)
last_security_audit_at = Column(DateTime, nullable=True)
last_content_scrape_at = Column(DateTime, nullable=True)
# Scraped page content for POC builder
scraped_content_json = Column(Text, nullable=True)
# Relationships
tech_profile = relationship("ProspectTechProfile", back_populates="prospect", uselist=False, cascade="all, delete-orphan")

View File

@@ -20,6 +20,7 @@ class JobType(str, enum.Enum):
SCORE_COMPUTE = "score_compute"
FULL_ENRICHMENT = "full_enrichment"
SECURITY_AUDIT = "security_audit"
CONTENT_SCRAPE = "content_scrape"
class JobStatus(str, enum.Enum):

View File

@@ -139,6 +139,27 @@ def contact_scrape_batch(
return ScanBatchResponse(processed=len(prospects), successful=count)
@router.post("/content-scrape/batch", response_model=ScanBatchResponse)
def content_scrape_batch(
    limit: int = Query(50, ge=1, le=200),
    db: Session = Depends(get_db),
    current_admin: UserContext = Depends(get_current_admin_api),
):
    """Scrape page content for pending prospects."""
    job = stats_service.create_job(db, JobType.CONTENT_SCRAPE)
    pending = prospect_service.get_pending_content_scrape(db, limit=limit)
    total = len(pending)
    successful = 0
    for idx, target in enumerate(pending):
        if enrichment_service.scrape_content(db, target):
            successful += 1
        # Throttle between prospects, but skip the delay after the last one.
        if idx < total - 1:
            _batch_delay()
    stats_service.complete_job(job, processed=total)
    db.commit()
    return ScanBatchResponse(processed=total, successful=successful)
@router.post("/security-audit/batch", response_model=ScanBatchResponse)
def security_audit_batch(
limit: int = Query(50, ge=1, le=200),
@@ -272,6 +293,19 @@ def security_audit_single(
)
@router.post("/content-scrape/{prospect_id}", response_model=ScanSingleResponse)
def content_scrape_single(
    prospect_id: int = Path(...),
    db: Session = Depends(get_db),
    current_admin: UserContext = Depends(get_current_admin_api),
):
    """Scrape page content for a single prospect."""
    # NOTE(review): assumes get_by_id never returns None (or raises 404 itself)
    # -- confirm; otherwise prospect.domain_name below raises AttributeError.
    prospect = prospect_service.get_by_id(db, prospect_id)
    scraped = enrichment_service.scrape_content(db, prospect)
    db.commit()
    return ScanSingleResponse(
        domain=prospect.domain_name,
        profile=scraped is not None,
    )
@router.post("/full/{prospect_id}", response_model=FullEnrichmentResponse)
def full_enrichment(
prospect_id: int = Path(...),
@@ -299,11 +333,15 @@ def full_enrichment(
if prospect.has_website:
contacts = enrichment_service.scrape_contacts(db, prospect)
# Step 5: Security audit (if has website)
# Step 5: Content scrape (if has website)
if prospect.has_website:
enrichment_service.scrape_content(db, prospect)
# Step 6: Security audit (if has website)
if prospect.has_website:
security_audit_service.run_audit(db, prospect)
# Step 6: Compute score
# Step 7: Compute score
db.refresh(prospect)
score = scoring_service.compute_score(db, prospect)
db.commit()

View File

@@ -468,4 +468,159 @@ class EnrichmentService:
return ",".join(found) if found else None
def scrape_content(self, db: Session, prospect: Prospect) -> dict | None:
    """Scrape page content (headings, paragraphs, images, services) for POC builder.

    Fetches up to six common pages of the prospect's website and uses
    BeautifulSoup to extract structured content: meta description, H1/H2
    headings, substantial paragraphs, hero-sized images, social links,
    service items near "service"-like headings, and declared page languages.
    Results are stored as JSON in ``prospect.scraped_content_json`` and
    ``prospect.last_content_scrape_at`` is updated.

    Returns the content dict, or ``None`` when the prospect has no domain
    or no website.
    """
    import json
    from bs4 import BeautifulSoup

    # Hard caps so content-heavy sites cannot bloat the stored JSON.
    # (Checked BEFORE appending: the previous version checked after the
    # append and only broke the per-page loop, overshooting the cap by one
    # item on every additional page scanned.)
    max_paragraphs = 20
    max_images = 15
    max_services = 10

    domain = prospect.domain_name
    if not domain or not prospect.has_website:
        return None
    scheme = "https" if prospect.uses_https else "http"
    base_url = f"{scheme}://{domain}"
    paths = ["", "/about", "/a-propos", "/services", "/nos-services", "/contact"]

    content = {
        "meta_description": None,
        "headings": [],
        "paragraphs": [],
        "services": [],
        "images": [],
        "social_links": {},
        "business_hours": None,
        "languages_detected": [],
    }
    seen_headings: set[str] = set()
    seen_paragraphs: set[str] = set()

    session = requests.Session()
    session.verify = False  # noqa: SEC047 passive scan
    session.headers.update({"User-Agent": "Mozilla/5.0 (compatible; OrionBot/1.0)"})
    try:
        for path in paths:
            try:
                url = base_url + path
                resp = session.get(url, timeout=config.http_timeout, allow_redirects=True)
                if resp.status_code != 200:
                    continue
                soup = BeautifulSoup(resp.text, "html.parser")

                # Meta description (first one found across all pages)
                if not content["meta_description"]:
                    meta = soup.find("meta", attrs={"name": "description"})
                    if meta and meta.get("content"):
                        content["meta_description"] = meta["content"].strip()

                # Language detection from <html lang="...">
                html_tag = soup.find("html")
                if html_tag and html_tag.get("lang"):
                    lang = html_tag["lang"][:2].lower()
                    if lang not in content["languages_detected"]:
                        content["languages_detected"].append(lang)

                # Headings (H1, H2), deduplicated across pages
                for tag in soup.find_all(["h1", "h2"]):
                    text = tag.get_text(strip=True)
                    if text and len(text) > 3 and text not in seen_headings:
                        seen_headings.add(text)
                        content["headings"].append(text)

                # Paragraphs (substantial ones, skip tiny/boilerplate)
                for tag in soup.find_all("p"):
                    if len(content["paragraphs"]) >= max_paragraphs:
                        break
                    text = tag.get_text(strip=True)
                    if text and len(text) > 50 and text not in seen_paragraphs:
                        seen_paragraphs.add(text)
                        content["paragraphs"].append(text)

                # Images (hero/banner sized, skip tiny icons)
                for img in soup.find_all("img"):
                    if len(content["images"]) >= max_images:
                        break
                    src = img.get("src") or img.get("data-src")
                    if not src:
                        continue
                    # Make absolute
                    if src.startswith("//"):
                        src = "https:" + src
                    elif src.startswith("/"):
                        src = base_url + src
                    elif not src.startswith("http"):
                        continue
                    # Skip tiny images, data URIs, tracking pixels
                    if "1x1" in src or "pixel" in src or src.startswith("data:"):
                        continue
                    width = img.get("width", "")
                    height = img.get("height", "")
                    if width and width.isdigit() and int(width) < 100:
                        continue
                    if height and height.isdigit() and int(height) < 100:
                        continue
                    if src not in content["images"]:
                        content["images"].append(src)

                # Social links (first occurrence per platform wins)
                for a in soup.find_all("a", href=True):
                    href = a["href"]
                    for platform, pattern in [
                        ("facebook", "facebook.com"),
                        ("instagram", "instagram.com"),
                        ("linkedin", "linkedin.com"),
                        ("twitter", "twitter.com"),
                        ("youtube", "youtube.com"),
                        ("tiktok", "tiktok.com"),
                    ]:
                        if pattern in href and platform not in content["social_links"]:
                            content["social_links"][platform] = href

                # Service items (from list items near "service" headings)
                for heading in soup.find_all(["h2", "h3"]):
                    if len(content["services"]) >= max_services:
                        break
                    heading_text = heading.get_text(strip=True).lower()
                    if any(kw in heading_text for kw in ["service", "prestation", "leistung", "angebot", "nos activit"]):
                        # Walk siblings until the next section heading
                        sibling = heading.find_next_sibling()
                        while sibling and sibling.name not in ["h1", "h2", "h3"]:
                            if len(content["services"]) >= max_services:
                                break
                            if sibling.name in ["ul", "ol"]:
                                for li in sibling.find_all("li"):
                                    if len(content["services"]) >= max_services:
                                        break
                                    text = li.get_text(strip=True)
                                    if text and len(text) > 3 and text not in content["services"]:
                                        content["services"].append(text)
                            elif sibling.name == "div":
                                # Cards pattern: divs with h3/h4 + p
                                card_title = sibling.find(["h3", "h4", "h5"])
                                if card_title:
                                    text = card_title.get_text(strip=True)
                                    if text and text not in content["services"]:
                                        content["services"].append(text)
                            sibling = sibling.find_next_sibling()
            except Exception as e:  # noqa: EXC003
                logger.debug("Content scrape failed for %s%s: %s", domain, path, e)
    finally:
        # Release pooled connections even if an unexpected error escapes a page.
        session.close()

    # Store results (timestamp is set even on empty results so the prospect
    # is not re-queued by get_pending_content_scrape)
    prospect.scraped_content_json = json.dumps(content, ensure_ascii=False)
    prospect.last_content_scrape_at = datetime.now(UTC)
    db.flush()
    logger.info(
        "Content scrape for %s: %d headings, %d paragraphs, %d images, %d services",
        domain, len(content["headings"]), len(content["paragraphs"]),
        len(content["images"]), len(content["services"]),
    )
    return content
enrichment_service = EnrichmentService()

View File

@@ -251,6 +251,17 @@ class ProspectService:
.all()
)
def get_pending_content_scrape(self, db: Session, limit: int = 100) -> list[Prospect]:
    """Return up to *limit* prospects with a website that were never content-scraped."""
    query = db.query(Prospect).filter(
        Prospect.has_website.is_(True),
        Prospect.last_content_scrape_at.is_(None),
    )
    return query.limit(limit).all()
def get_pending_security_audit(self, db: Session, limit: int = 50) -> list[Prospect]:
return (
db.query(Prospect)

View File

@@ -53,6 +53,7 @@ function scanJobs() {
'tech_scan': 'tech-scan',
'performance_scan': 'performance',
'contact_scrape': 'contacts',
'content_scrape': 'content-scrape',
'security_audit': 'security-audit',
'score_compute': 'score-compute',
},

View File

@@ -34,6 +34,11 @@
<span x-html="$icon('mail', 'w-4 h-4 mr-2')"></span>
Contact Scrape
</button>
<button type="button" @click="startBatchJob('content_scrape')"
class="inline-flex items-center px-4 py-2 text-sm font-medium leading-5 text-white transition-colors duration-150 bg-teal-600 border border-transparent rounded-lg hover:bg-teal-700 focus:outline-none">
<span x-html="$icon('document-text', 'w-4 h-4 mr-2')"></span>
Content Scrape
</button>
<button type="button" @click="startBatchJob('security_audit')"
class="inline-flex items-center px-4 py-2 text-sm font-medium leading-5 text-white transition-colors duration-150 bg-yellow-600 border border-transparent rounded-lg hover:bg-yellow-700 focus:outline-none">
<span x-html="$icon('shield-check', 'w-4 h-4 mr-2')"></span>