feat(prospecting): add content scraping for POC builder (Workstream 3A)
- New scrape_content() method in enrichment_service: extracts meta
description, H1/H2 headings, paragraphs, images (filtered for size),
social links, service items, and detected languages using BeautifulSoup
- Scans 6 pages per prospect: /, /about, /a-propos, /services,
/nos-services, /contact
- Results stored as JSON in prospect.scraped_content_json
- New endpoints: POST /content-scrape/{id} and /content-scrape/batch
- Added to full_enrichment pipeline (Step 5, before security audit)
- CONTENT_SCRAPE job type for scan-jobs tracking
- "Content Scrape" batch button on scan-jobs page
- Add beautifulsoup4 to requirements.txt
Tested on batirenovation-strasbourg.fr: extracted 30 headings,
21 paragraphs, 13 images.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -70,6 +70,10 @@ class Prospect(Base, TimestampMixin):
|
||||
# Timestamps of the most recent run of each scan type for this prospect;
# NULL means that scan has never been run (the batch endpoints use this
# to select pending prospects).
last_perf_scan_at = Column(DateTime, nullable=True)
last_contact_scrape_at = Column(DateTime, nullable=True)
last_security_audit_at = Column(DateTime, nullable=True)
last_content_scrape_at = Column(DateTime, nullable=True)

# Scraped page content for POC builder
# JSON document produced by EnrichmentService.scrape_content
# (meta description, headings, paragraphs, images, services, ...).
scraped_content_json = Column(Text, nullable=True)

# Relationships
# One-to-one tech profile; cascade deletes it with the prospect.
tech_profile = relationship("ProspectTechProfile", back_populates="prospect", uselist=False, cascade="all, delete-orphan")
|
||||
|
||||
@@ -20,6 +20,7 @@ class JobType(str, enum.Enum):
|
||||
# Values are the string identifiers used in the API and in job records
# (the scan-jobs page maps them to endpoint names).
SCORE_COMPUTE = "score_compute"
FULL_ENRICHMENT = "full_enrichment"
SECURITY_AUDIT = "security_audit"
# Page-content scraping for the POC builder.
CONTENT_SCRAPE = "content_scrape"
|
||||
|
||||
|
||||
class JobStatus(str, enum.Enum):
|
||||
|
||||
@@ -139,6 +139,27 @@ def contact_scrape_batch(
|
||||
return ScanBatchResponse(processed=len(prospects), successful=count)
|
||||
|
||||
|
||||
@router.post("/content-scrape/batch", response_model=ScanBatchResponse)
def content_scrape_batch(
    limit: int = Query(50, ge=1, le=200),
    db: Session = Depends(get_db),
    current_admin: UserContext = Depends(get_current_admin_api),
):
    """Scrape page content for pending prospects.

    Creates a CONTENT_SCRAPE job record, runs the content scraper over up
    to ``limit`` prospects that have never been scraped, pausing between
    prospects, then completes the job and commits.
    """
    job = stats_service.create_job(db, JobType.CONTENT_SCRAPE)
    pending = prospect_service.get_pending_content_scrape(db, limit=limit)

    successes = 0
    for index, target in enumerate(pending):
        if index:  # polite delay between prospects, none before the first
            _batch_delay()
        if enrichment_service.scrape_content(db, target):
            successes += 1

    stats_service.complete_job(job, processed=len(pending))
    db.commit()
    return ScanBatchResponse(processed=len(pending), successful=successes)
|
||||
|
||||
|
||||
@router.post("/security-audit/batch", response_model=ScanBatchResponse)
|
||||
def security_audit_batch(
|
||||
limit: int = Query(50, ge=1, le=200),
|
||||
@@ -272,6 +293,19 @@ def security_audit_single(
|
||||
)
|
||||
|
||||
|
||||
@router.post("/content-scrape/{prospect_id}", response_model=ScanSingleResponse)
def content_scrape_single(
    prospect_id: int = Path(...),
    db: Session = Depends(get_db),
    current_admin: UserContext = Depends(get_current_admin_api),
):
    """Scrape page content for a single prospect.

    Returns the prospect's domain and whether the scrape produced content.
    Responds with 404 when the prospect id does not exist (previously an
    unknown id dereferenced ``None`` and surfaced as a 500).
    """
    # Local import keeps this fix self-contained; the file already depends
    # on fastapi (router, Query, Depends, Path).
    from fastapi import HTTPException

    prospect = prospect_service.get_by_id(db, prospect_id)
    if prospect is None:
        raise HTTPException(status_code=404, detail="Prospect not found")
    result = enrichment_service.scrape_content(db, prospect)
    db.commit()
    return ScanSingleResponse(domain=prospect.domain_name, profile=result is not None)
|
||||
|
||||
|
||||
@router.post("/full/{prospect_id}", response_model=FullEnrichmentResponse)
|
||||
def full_enrichment(
|
||||
prospect_id: int = Path(...),
|
||||
@@ -299,11 +333,15 @@ def full_enrichment(
|
||||
if prospect.has_website:
|
||||
contacts = enrichment_service.scrape_contacts(db, prospect)
|
||||
|
||||
# Step 5: Security audit (if has website)
|
||||
# Step 5: Content scrape (if has website)
|
||||
if prospect.has_website:
|
||||
enrichment_service.scrape_content(db, prospect)
|
||||
|
||||
# Step 6: Security audit (if has website)
|
||||
if prospect.has_website:
|
||||
security_audit_service.run_audit(db, prospect)
|
||||
|
||||
# Step 6: Compute score
|
||||
# Step 7: Compute score
|
||||
db.refresh(prospect)
|
||||
score = scoring_service.compute_score(db, prospect)
|
||||
db.commit()
|
||||
|
||||
@@ -468,4 +468,159 @@ class EnrichmentService:
|
||||
return ",".join(found) if found else None
|
||||
|
||||
|
||||
def scrape_content(self, db: Session, prospect: Prospect) -> dict | None:
    """Scrape page content (headings, paragraphs, images, services) for POC builder.

    Uses BeautifulSoup to extract structured content from the prospect's
    website. Stores results as JSON in prospect.scraped_content_json and
    stamps prospect.last_content_scrape_at (flushed, not committed, here).

    Returns the collected content dict, or None when the prospect has no
    domain or no website.
    """
    import json

    from bs4 import BeautifulSoup

    domain = prospect.domain_name
    if not domain or not prospect.has_website:
        return None

    # Hard caps so one prospect cannot bloat the stored JSON. Checked
    # BEFORE appending so the limits are exact — the previous version
    # tested after appending and could overshoot (per page for paragraphs
    # and images; unboundedly for services when a single <ul> was long).
    max_paragraphs = 20
    max_images = 15
    max_services = 10

    scheme = "https" if prospect.uses_https else "http"
    base_url = f"{scheme}://{domain}"
    # Common content pages, English and French variants.
    paths = ["", "/about", "/a-propos", "/services", "/nos-services", "/contact"]

    content = {
        "meta_description": None,
        "headings": [],
        "paragraphs": [],
        "services": [],
        "images": [],
        "social_links": {},
        "business_hours": None,
        "languages_detected": [],
    }

    # Dedupe across pages: nav/footer text repeats on every page.
    seen_headings = set()
    seen_paragraphs = set()

    # Context manager guarantees the session is closed even if an error
    # escapes the per-page try/except below (the old code closed manually).
    with requests.Session() as session:
        session.verify = False  # noqa: SEC047 passive scan
        session.headers.update({"User-Agent": "Mozilla/5.0 (compatible; OrionBot/1.0)"})

        for path in paths:
            try:
                url = base_url + path
                resp = session.get(url, timeout=config.http_timeout, allow_redirects=True)
                if resp.status_code != 200:
                    continue

                soup = BeautifulSoup(resp.text, "html.parser")

                # Meta description (first one found)
                if not content["meta_description"]:
                    meta = soup.find("meta", attrs={"name": "description"})
                    if meta and meta.get("content"):
                        content["meta_description"] = meta["content"].strip()

                # Language detection via <html lang="...">, first two chars.
                html_tag = soup.find("html")
                if html_tag and html_tag.get("lang"):
                    lang = html_tag["lang"][:2].lower()
                    if lang not in content["languages_detected"]:
                        content["languages_detected"].append(lang)

                # Headings (H1, H2); len > 3 filters icon/glyph-only tags.
                for tag in soup.find_all(["h1", "h2"]):
                    text = tag.get_text(strip=True)
                    if text and len(text) > 3 and text not in seen_headings:
                        seen_headings.add(text)
                        content["headings"].append(text)

                # Paragraphs (substantial ones, skip tiny/boilerplate)
                for tag in soup.find_all("p"):
                    if len(content["paragraphs"]) >= max_paragraphs:
                        break
                    text = tag.get_text(strip=True)
                    if text and len(text) > 50 and text not in seen_paragraphs:
                        seen_paragraphs.add(text)
                        content["paragraphs"].append(text)

                # Images (hero/banner sized, skip tiny icons)
                for img in soup.find_all("img"):
                    if len(content["images"]) >= max_images:
                        break
                    src = img.get("src") or img.get("data-src")
                    if not src:
                        continue
                    # Make absolute
                    if src.startswith("//"):
                        src = "https:" + src
                    elif src.startswith("/"):
                        src = base_url + src
                    elif not src.startswith("http"):
                        continue
                    # Skip tiny images, data URIs, tracking pixels
                    if "1x1" in src or "pixel" in src or src.startswith("data:"):
                        continue
                    width = img.get("width", "")
                    height = img.get("height", "")
                    if width and width.isdigit() and int(width) < 100:
                        continue
                    if height and height.isdigit() and int(height) < 100:
                        continue
                    if src not in content["images"]:
                        content["images"].append(src)

                # Social links (first hit per platform wins)
                for a in soup.find_all("a", href=True):
                    href = a["href"]
                    for platform, pattern in [
                        ("facebook", "facebook.com"),
                        ("instagram", "instagram.com"),
                        ("linkedin", "linkedin.com"),
                        ("twitter", "twitter.com"),
                        ("youtube", "youtube.com"),
                        ("tiktok", "tiktok.com"),
                    ]:
                        if pattern in href and platform not in content["social_links"]:
                            content["social_links"][platform] = href

                # Service items (from list items near "service" headings);
                # keywords cover FR ("prestation", "nos activit") and DE
                # ("leistung", "angebot") sites as well.
                for heading in soup.find_all(["h2", "h3"]):
                    if len(content["services"]) >= max_services:
                        break
                    heading_text = heading.get_text(strip=True).lower()
                    if any(kw in heading_text for kw in ["service", "prestation", "leistung", "angebot", "nos activit"]):
                        # Walk siblings until the next section heading.
                        sibling = heading.find_next_sibling()
                        while sibling and sibling.name not in ["h1", "h2", "h3"]:
                            if sibling.name in ["ul", "ol"]:
                                for li in sibling.find_all("li"):
                                    if len(content["services"]) >= max_services:
                                        break
                                    text = li.get_text(strip=True)
                                    if text and len(text) > 3 and text not in content["services"]:
                                        content["services"].append(text)
                            elif sibling.name == "div":
                                # Cards pattern: divs with h3/h4 + p
                                card_title = sibling.find(["h3", "h4", "h5"])
                                if card_title and len(content["services"]) < max_services:
                                    text = card_title.get_text(strip=True)
                                    if text and text not in content["services"]:
                                        content["services"].append(text)
                            sibling = sibling.find_next_sibling()

            except Exception as e:  # noqa: EXC003
                # Best-effort scrape: one failing page must not abort the rest.
                logger.debug("Content scrape failed for %s%s: %s", domain, path, e)

    # Store results
    prospect.scraped_content_json = json.dumps(content, ensure_ascii=False)
    prospect.last_content_scrape_at = datetime.now(UTC)
    db.flush()

    logger.info(
        "Content scrape for %s: %d headings, %d paragraphs, %d images, %d services",
        domain, len(content["headings"]), len(content["paragraphs"]),
        len(content["images"]), len(content["services"]),
    )
    return content
|
||||
|
||||
|
||||
# Module-level singleton shared by the API routers and batch jobs.
enrichment_service = EnrichmentService()
|
||||
|
||||
@@ -251,6 +251,17 @@ class ProspectService:
|
||||
.all()
|
||||
)
|
||||
|
||||
def get_pending_content_scrape(self, db: Session, limit: int = 100) -> list[Prospect]:
    """Return up to ``limit`` prospects that have a website but have
    never been content-scraped (``last_content_scrape_at`` is NULL)."""
    pending = (
        db.query(Prospect)
        .filter(Prospect.has_website.is_(True))
        .filter(Prospect.last_content_scrape_at.is_(None))
    )
    return pending.limit(limit).all()
|
||||
|
||||
def get_pending_security_audit(self, db: Session, limit: int = 50) -> list[Prospect]:
|
||||
return (
|
||||
db.query(Prospect)
|
||||
|
||||
@@ -53,6 +53,7 @@ function scanJobs() {
|
||||
'tech_scan': 'tech-scan',
|
||||
'performance_scan': 'performance',
|
||||
'contact_scrape': 'contacts',
|
||||
'content_scrape': 'content-scrape',
|
||||
'security_audit': 'security-audit',
|
||||
'score_compute': 'score-compute',
|
||||
},
|
||||
|
||||
@@ -34,6 +34,11 @@
|
||||
<span x-html="$icon('mail', 'w-4 h-4 mr-2')"></span>
|
||||
Contact Scrape
|
||||
</button>
|
||||
<button type="button" @click="startBatchJob('content_scrape')"
|
||||
class="inline-flex items-center px-4 py-2 text-sm font-medium leading-5 text-white transition-colors duration-150 bg-teal-600 border border-transparent rounded-lg hover:bg-teal-700 focus:outline-none">
|
||||
<span x-html="$icon('document-text', 'w-4 h-4 mr-2')"></span>
|
||||
Content Scrape
|
||||
</button>
|
||||
<button type="button" @click="startBatchJob('security_audit')"
|
||||
class="inline-flex items-center px-4 py-2 text-sm font-medium leading-5 text-white transition-colors duration-150 bg-yellow-600 border border-transparent rounded-lg hover:bg-yellow-700 focus:outline-none">
|
||||
<span x-html="$icon('shield-check', 'w-4 h-4 mr-2')"></span>
|
||||
|
||||
Reference in New Issue
Block a user