feat(prospecting): add content scraping for POC builder (Workstream 3A)
- New scrape_content() method in enrichment_service: extracts meta
description, H1/H2 headings, paragraphs, images (filtered for size),
social links, service items, and detected languages using BeautifulSoup
- Scans 6 pages per prospect: /, /about, /a-propos, /services,
/nos-services, /contact
- Results stored as JSON in prospect.scraped_content_json
- New endpoints: POST /content-scrape/{id} and /content-scrape/batch
- Added to full_enrichment pipeline (Step 5, before security audit)
- CONTENT_SCRAPE job type for scan-jobs tracking
- "Content Scrape" batch button on scan-jobs page
- Add beautifulsoup4 to requirements.txt
Tested on batirenovation-strasbourg.fr: extracted 30 headings,
21 paragraphs, 13 images.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -70,6 +70,10 @@ class Prospect(Base, TimestampMixin):
|
|||||||
last_perf_scan_at = Column(DateTime, nullable=True)
|
last_perf_scan_at = Column(DateTime, nullable=True)
|
||||||
last_contact_scrape_at = Column(DateTime, nullable=True)
|
last_contact_scrape_at = Column(DateTime, nullable=True)
|
||||||
last_security_audit_at = Column(DateTime, nullable=True)
|
last_security_audit_at = Column(DateTime, nullable=True)
|
||||||
|
last_content_scrape_at = Column(DateTime, nullable=True)
|
||||||
|
|
||||||
|
# Scraped page content for POC builder
|
||||||
|
scraped_content_json = Column(Text, nullable=True)
|
||||||
|
|
||||||
# Relationships
|
# Relationships
|
||||||
tech_profile = relationship("ProspectTechProfile", back_populates="prospect", uselist=False, cascade="all, delete-orphan")
|
tech_profile = relationship("ProspectTechProfile", back_populates="prospect", uselist=False, cascade="all, delete-orphan")
|
||||||
|
|||||||
@@ -20,6 +20,7 @@ class JobType(str, enum.Enum):
|
|||||||
SCORE_COMPUTE = "score_compute"
|
SCORE_COMPUTE = "score_compute"
|
||||||
FULL_ENRICHMENT = "full_enrichment"
|
FULL_ENRICHMENT = "full_enrichment"
|
||||||
SECURITY_AUDIT = "security_audit"
|
SECURITY_AUDIT = "security_audit"
|
||||||
|
CONTENT_SCRAPE = "content_scrape"
|
||||||
|
|
||||||
|
|
||||||
class JobStatus(str, enum.Enum):
|
class JobStatus(str, enum.Enum):
|
||||||
|
|||||||
@@ -139,6 +139,27 @@ def contact_scrape_batch(
|
|||||||
return ScanBatchResponse(processed=len(prospects), successful=count)
|
return ScanBatchResponse(processed=len(prospects), successful=count)
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/content-scrape/batch", response_model=ScanBatchResponse)
|
||||||
|
def content_scrape_batch(
|
||||||
|
limit: int = Query(50, ge=1, le=200),
|
||||||
|
db: Session = Depends(get_db),
|
||||||
|
current_admin: UserContext = Depends(get_current_admin_api),
|
||||||
|
):
|
||||||
|
"""Scrape page content for pending prospects."""
|
||||||
|
job = stats_service.create_job(db, JobType.CONTENT_SCRAPE)
|
||||||
|
prospects = prospect_service.get_pending_content_scrape(db, limit=limit)
|
||||||
|
count = 0
|
||||||
|
for i, prospect in enumerate(prospects):
|
||||||
|
result = enrichment_service.scrape_content(db, prospect)
|
||||||
|
if result:
|
||||||
|
count += 1
|
||||||
|
if i < len(prospects) - 1:
|
||||||
|
_batch_delay()
|
||||||
|
stats_service.complete_job(job, processed=len(prospects))
|
||||||
|
db.commit()
|
||||||
|
return ScanBatchResponse(processed=len(prospects), successful=count)
|
||||||
|
|
||||||
|
|
||||||
@router.post("/security-audit/batch", response_model=ScanBatchResponse)
|
@router.post("/security-audit/batch", response_model=ScanBatchResponse)
|
||||||
def security_audit_batch(
|
def security_audit_batch(
|
||||||
limit: int = Query(50, ge=1, le=200),
|
limit: int = Query(50, ge=1, le=200),
|
||||||
@@ -272,6 +293,19 @@ def security_audit_single(
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/content-scrape/{prospect_id}", response_model=ScanSingleResponse)
|
||||||
|
def content_scrape_single(
|
||||||
|
prospect_id: int = Path(...),
|
||||||
|
db: Session = Depends(get_db),
|
||||||
|
current_admin: UserContext = Depends(get_current_admin_api),
|
||||||
|
):
|
||||||
|
"""Scrape page content for a single prospect."""
|
||||||
|
prospect = prospect_service.get_by_id(db, prospect_id)
|
||||||
|
result = enrichment_service.scrape_content(db, prospect)
|
||||||
|
db.commit()
|
||||||
|
return ScanSingleResponse(domain=prospect.domain_name, profile=result is not None)
|
||||||
|
|
||||||
|
|
||||||
@router.post("/full/{prospect_id}", response_model=FullEnrichmentResponse)
|
@router.post("/full/{prospect_id}", response_model=FullEnrichmentResponse)
|
||||||
def full_enrichment(
|
def full_enrichment(
|
||||||
prospect_id: int = Path(...),
|
prospect_id: int = Path(...),
|
||||||
@@ -299,11 +333,15 @@ def full_enrichment(
|
|||||||
if prospect.has_website:
|
if prospect.has_website:
|
||||||
contacts = enrichment_service.scrape_contacts(db, prospect)
|
contacts = enrichment_service.scrape_contacts(db, prospect)
|
||||||
|
|
||||||
# Step 5: Security audit (if has website)
|
# Step 5: Content scrape (if has website)
|
||||||
|
if prospect.has_website:
|
||||||
|
enrichment_service.scrape_content(db, prospect)
|
||||||
|
|
||||||
|
# Step 6: Security audit (if has website)
|
||||||
if prospect.has_website:
|
if prospect.has_website:
|
||||||
security_audit_service.run_audit(db, prospect)
|
security_audit_service.run_audit(db, prospect)
|
||||||
|
|
||||||
# Step 6: Compute score
|
# Step 7: Compute score
|
||||||
db.refresh(prospect)
|
db.refresh(prospect)
|
||||||
score = scoring_service.compute_score(db, prospect)
|
score = scoring_service.compute_score(db, prospect)
|
||||||
db.commit()
|
db.commit()
|
||||||
|
|||||||
@@ -468,4 +468,159 @@ class EnrichmentService:
|
|||||||
return ",".join(found) if found else None
|
return ",".join(found) if found else None
|
||||||
|
|
||||||
|
|
||||||
|
def scrape_content(self, db: Session, prospect: Prospect) -> dict | None:
    """Scrape page content (headings, paragraphs, images, services) for POC builder.

    Fetches up to six common pages on the prospect's website and uses
    BeautifulSoup to extract structured content: meta description, H1/H2
    headings, substantial paragraphs, large images, social links, service
    items, and detected languages.  Results are stored as JSON in
    ``prospect.scraped_content_json`` and the scrape time is recorded in
    ``prospect.last_content_scrape_at`` (flushed, not committed — the
    caller owns the transaction).

    Args:
        db: Active SQLAlchemy session.
        prospect: Prospect whose website should be scraped.

    Returns:
        The extracted content dict, or ``None`` when the prospect has no
        domain or no website.
    """
    import json

    from bs4 import BeautifulSoup

    # Collection caps keep the stored JSON payload small and predictable.
    max_paragraphs = 20
    max_images = 15
    max_services = 10

    domain = prospect.domain_name
    if not domain or not prospect.has_website:
        return None

    scheme = "https" if prospect.uses_https else "http"
    base_url = f"{scheme}://{domain}"
    # Common landing/about/services/contact paths (EN + FR variants).
    paths = ["", "/about", "/a-propos", "/services", "/nos-services", "/contact"]

    session = requests.Session()
    session.verify = False  # noqa: SEC047 passive scan
    session.headers.update({"User-Agent": "Mozilla/5.0 (compatible; OrionBot/1.0)"})

    content = {
        "meta_description": None,
        "headings": [],
        "paragraphs": [],
        "services": [],
        "images": [],
        "social_links": {},
        "business_hours": None,
        "languages_detected": [],
    }

    # Dedupe across pages — shared nav/footer text repeats on every page.
    seen_headings = set()
    seen_paragraphs = set()

    for path in paths:
        try:
            url = base_url + path
            resp = session.get(url, timeout=config.http_timeout, allow_redirects=True)
            if resp.status_code != 200:
                continue

            soup = BeautifulSoup(resp.text, "html.parser")

            # Meta description (first one found wins).
            if not content["meta_description"]:
                meta = soup.find("meta", attrs={"name": "description"})
                if meta and meta.get("content"):
                    content["meta_description"] = meta["content"].strip()

            # Language detection from <html lang="...">.
            html_tag = soup.find("html")
            if html_tag and html_tag.get("lang"):
                lang = html_tag["lang"][:2].lower()
                if lang not in content["languages_detected"]:
                    content["languages_detected"].append(lang)

            # Headings (H1, H2); skip trivial 1–3 character fragments.
            for tag in soup.find_all(["h1", "h2"]):
                text = tag.get_text(strip=True)
                if text and len(text) > 3 and text not in seen_headings:
                    seen_headings.add(text)
                    content["headings"].append(text)

            # Paragraphs (substantial ones, skip tiny/boilerplate).
            # BUGFIX: check the cap *before* appending — the old post-append
            # check let every additional page push one more item past the cap.
            for tag in soup.find_all("p"):
                if len(content["paragraphs"]) >= max_paragraphs:
                    break
                text = tag.get_text(strip=True)
                if text and len(text) > 50 and text not in seen_paragraphs:
                    seen_paragraphs.add(text)
                    content["paragraphs"].append(text)

            # Images (hero/banner sized, skip tiny icons).
            for img in soup.find_all("img"):
                if len(content["images"]) >= max_images:  # BUGFIX: cap before append
                    break
                src = img.get("src") or img.get("data-src")
                if not src:
                    continue
                # Make absolute.
                if src.startswith("//"):
                    src = "https:" + src
                elif src.startswith("/"):
                    src = base_url + src
                elif not src.startswith("http"):
                    continue
                # Skip tiny images, data URIs, tracking pixels.
                if "1x1" in src or "pixel" in src or src.startswith("data:"):
                    continue
                width = img.get("width", "")
                height = img.get("height", "")
                if width and width.isdigit() and int(width) < 100:
                    continue
                if height and height.isdigit() and int(height) < 100:
                    continue
                if src not in content["images"]:
                    content["images"].append(src)

            # Social links (first link found per platform).
            for a in soup.find_all("a", href=True):
                href = a["href"]
                for platform, pattern in [
                    ("facebook", "facebook.com"),
                    ("instagram", "instagram.com"),
                    ("linkedin", "linkedin.com"),
                    ("twitter", "twitter.com"),
                    ("youtube", "youtube.com"),
                    ("tiktok", "tiktok.com"),
                ]:
                    if pattern in href and platform not in content["social_links"]:
                        content["social_links"][platform] = href

            # Service items (from list items / cards near "service" headings).
            for heading in soup.find_all(["h2", "h3"]):
                if len(content["services"]) >= max_services:
                    break
                heading_text = heading.get_text(strip=True).lower()
                if any(kw in heading_text for kw in ["service", "prestation", "leistung", "angebot", "nos activit"]):
                    # Walk siblings until the next section heading.
                    sibling = heading.find_next_sibling()
                    while sibling and sibling.name not in ["h1", "h2", "h3"]:
                        if len(content["services"]) >= max_services:
                            break
                        if sibling.name in ["ul", "ol"]:
                            # BUGFIX: enforce the cap inside the <li> loop too;
                            # previously one long list could blow far past 10.
                            for li in sibling.find_all("li"):
                                if len(content["services"]) >= max_services:
                                    break
                                text = li.get_text(strip=True)
                                if text and len(text) > 3 and text not in content["services"]:
                                    content["services"].append(text)
                        elif sibling.name == "div":
                            # Cards pattern: divs with h3/h4/h5 titles.
                            card_title = sibling.find(["h3", "h4", "h5"])
                            if card_title:
                                text = card_title.get_text(strip=True)
                                if text and text not in content["services"]:
                                    content["services"].append(text)
                        sibling = sibling.find_next_sibling()

        except Exception as e:  # noqa: EXC003 — best-effort per page
            logger.debug("Content scrape failed for %s%s: %s", domain, path, e)

    session.close()

    # Store results.
    prospect.scraped_content_json = json.dumps(content, ensure_ascii=False)
    prospect.last_content_scrape_at = datetime.now(UTC)
    db.flush()

    logger.info(
        "Content scrape for %s: %d headings, %d paragraphs, %d images, %d services",
        domain, len(content["headings"]), len(content["paragraphs"]),
        len(content["images"]), len(content["services"]),
    )
    return content
|
||||||
|
|
||||||
|
|
||||||
enrichment_service = EnrichmentService()
|
enrichment_service = EnrichmentService()
|
||||||
|
|||||||
@@ -251,6 +251,17 @@ class ProspectService:
|
|||||||
.all()
|
.all()
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def get_pending_content_scrape(self, db: Session, limit: int = 100) -> list[Prospect]:
    """Return prospects eligible for a content scrape.

    Eligible means ``has_website`` is true and ``last_content_scrape_at``
    is still NULL (never scraped).

    Args:
        db: Active SQLAlchemy session.
        limit: Maximum number of prospects to return (default 100).

    Returns:
        Up to ``limit`` matching Prospect rows.
    """
    return (
        db.query(Prospect)
        .filter(
            Prospect.has_website.is_(True),
            Prospect.last_content_scrape_at.is_(None),
        )
        .limit(limit)
        .all()
    )
|
||||||
|
|
||||||
def get_pending_security_audit(self, db: Session, limit: int = 50) -> list[Prospect]:
|
def get_pending_security_audit(self, db: Session, limit: int = 50) -> list[Prospect]:
|
||||||
return (
|
return (
|
||||||
db.query(Prospect)
|
db.query(Prospect)
|
||||||
|
|||||||
@@ -53,6 +53,7 @@ function scanJobs() {
|
|||||||
'tech_scan': 'tech-scan',
|
'tech_scan': 'tech-scan',
|
||||||
'performance_scan': 'performance',
|
'performance_scan': 'performance',
|
||||||
'contact_scrape': 'contacts',
|
'contact_scrape': 'contacts',
|
||||||
|
'content_scrape': 'content-scrape',
|
||||||
'security_audit': 'security-audit',
|
'security_audit': 'security-audit',
|
||||||
'score_compute': 'score-compute',
|
'score_compute': 'score-compute',
|
||||||
},
|
},
|
||||||
|
|||||||
@@ -34,6 +34,11 @@
|
|||||||
<span x-html="$icon('mail', 'w-4 h-4 mr-2')"></span>
|
<span x-html="$icon('mail', 'w-4 h-4 mr-2')"></span>
|
||||||
Contact Scrape
|
Contact Scrape
|
||||||
</button>
|
</button>
|
||||||
|
<button type="button" @click="startBatchJob('content_scrape')"
|
||||||
|
class="inline-flex items-center px-4 py-2 text-sm font-medium leading-5 text-white transition-colors duration-150 bg-teal-600 border border-transparent rounded-lg hover:bg-teal-700 focus:outline-none">
|
||||||
|
<span x-html="$icon('document-text', 'w-4 h-4 mr-2')"></span>
|
||||||
|
Content Scrape
|
||||||
|
</button>
|
||||||
<button type="button" @click="startBatchJob('security_audit')"
|
<button type="button" @click="startBatchJob('security_audit')"
|
||||||
class="inline-flex items-center px-4 py-2 text-sm font-medium leading-5 text-white transition-colors duration-150 bg-yellow-600 border border-transparent rounded-lg hover:bg-yellow-700 focus:outline-none">
|
class="inline-flex items-center px-4 py-2 text-sm font-medium leading-5 text-white transition-colors duration-150 bg-yellow-600 border border-transparent rounded-lg hover:bg-yellow-700 focus:outline-none">
|
||||||
<span x-html="$icon('shield-check', 'w-4 h-4 mr-2')"></span>
|
<span x-html="$icon('shield-check', 'w-4 h-4 mr-2')"></span>
|
||||||
|
|||||||
@@ -21,6 +21,7 @@ python-multipart==0.0.20
|
|||||||
# Data processing
|
# Data processing
|
||||||
pandas==2.2.3
|
pandas==2.2.3
|
||||||
requests==2.32.3
|
requests==2.32.3
|
||||||
|
beautifulsoup4==4.14.3
|
||||||
|
|
||||||
# Image processing
|
# Image processing
|
||||||
Pillow>=10.0.0
|
Pillow>=10.0.0
|
||||||
|
|||||||
Reference in New Issue
Block a user