diff --git a/app/modules/prospecting/models/prospect.py b/app/modules/prospecting/models/prospect.py
index 490f1852..78e68e30 100644
--- a/app/modules/prospecting/models/prospect.py
+++ b/app/modules/prospecting/models/prospect.py
@@ -70,6 +70,10 @@ class Prospect(Base, TimestampMixin):
last_perf_scan_at = Column(DateTime, nullable=True)
last_contact_scrape_at = Column(DateTime, nullable=True)
last_security_audit_at = Column(DateTime, nullable=True)
+    last_content_scrape_at = Column(DateTime, nullable=True)
+
+    # Scraped page content for POC builder — JSON-serialized dict written by
+    # EnrichmentService.scrape_content (headings, paragraphs, images, services, ...).
+    # NOTE(review): new columns need a schema migration — confirm one accompanies this change.
+    scraped_content_json = Column(Text, nullable=True)
# Relationships
tech_profile = relationship("ProspectTechProfile", back_populates="prospect", uselist=False, cascade="all, delete-orphan")
diff --git a/app/modules/prospecting/models/scan_job.py b/app/modules/prospecting/models/scan_job.py
index 85885bd2..a13efa44 100644
--- a/app/modules/prospecting/models/scan_job.py
+++ b/app/modules/prospecting/models/scan_job.py
@@ -20,6 +20,7 @@ class JobType(str, enum.Enum):
SCORE_COMPUTE = "score_compute"
FULL_ENRICHMENT = "full_enrichment"
SECURITY_AUDIT = "security_audit"
+    CONTENT_SCRAPE = "content_scrape"  # page-content scrape feeding the POC builder
class JobStatus(str, enum.Enum):
diff --git a/app/modules/prospecting/routes/api/admin_enrichment.py b/app/modules/prospecting/routes/api/admin_enrichment.py
index 86b5de17..38b36d99 100644
--- a/app/modules/prospecting/routes/api/admin_enrichment.py
+++ b/app/modules/prospecting/routes/api/admin_enrichment.py
@@ -139,6 +139,27 @@ def contact_scrape_batch(
return ScanBatchResponse(processed=len(prospects), successful=count)
+@router.post("/content-scrape/batch", response_model=ScanBatchResponse)
+def content_scrape_batch(
+    limit: int = Query(50, ge=1, le=200),
+    db: Session = Depends(get_db),
+    current_admin: UserContext = Depends(get_current_admin_api),
+):
+    """Scrape page content for pending prospects.
+
+    Processes up to ``limit`` prospects that still lack scraped content
+    (ProspectService.get_pending_content_scrape), tracking progress in a
+    CONTENT_SCRAPE scan job. Mirrors the sibling batch endpoints
+    (contact-scrape, security-audit), including the inter-request delay.
+    """
+    job = stats_service.create_job(db, JobType.CONTENT_SCRAPE)
+    prospects = prospect_service.get_pending_content_scrape(db, limit=limit)
+    count = 0
+    for i, prospect in enumerate(prospects):
+        result = enrichment_service.scrape_content(db, prospect)
+        if result:
+            count += 1
+        # Throttle between prospects; no delay needed after the last one.
+        if i < len(prospects) - 1:
+            _batch_delay()
+    stats_service.complete_job(job, processed=len(prospects))
+    db.commit()
+    return ScanBatchResponse(processed=len(prospects), successful=count)
+
+
@router.post("/security-audit/batch", response_model=ScanBatchResponse)
def security_audit_batch(
limit: int = Query(50, ge=1, le=200),
@@ -272,6 +293,19 @@ def security_audit_single(
)
+@router.post("/content-scrape/{prospect_id}", response_model=ScanSingleResponse)
+def content_scrape_single(
+    prospect_id: int = Path(...),
+    db: Session = Depends(get_db),
+    current_admin: UserContext = Depends(get_current_admin_api),
+):
+    """Scrape page content for a single prospect.
+
+    Returns ``profile=False`` with an empty domain when the id is unknown,
+    instead of crashing on ``prospect.domain_name``.
+    """
+    prospect = prospect_service.get_by_id(db, prospect_id)
+    # Guard: get_by_id can return None for an unknown id; without this the
+    # endpoint raised AttributeError (HTTP 500) on prospect.domain_name.
+    if prospect is None:
+        return ScanSingleResponse(domain="", profile=False)
+    result = enrichment_service.scrape_content(db, prospect)
+    db.commit()
+    return ScanSingleResponse(domain=prospect.domain_name, profile=result is not None)
+
+
@router.post("/full/{prospect_id}", response_model=FullEnrichmentResponse)
def full_enrichment(
prospect_id: int = Path(...),
@@ -299,11 +333,15 @@ def full_enrichment(
if prospect.has_website:
contacts = enrichment_service.scrape_contacts(db, prospect)
- # Step 5: Security audit (if has website)
+ # Step 5: Content scrape (if has website)
+ if prospect.has_website:
+ enrichment_service.scrape_content(db, prospect)
+
+ # Step 6: Security audit (if has website)
if prospect.has_website:
security_audit_service.run_audit(db, prospect)
- # Step 6: Compute score
+ # Step 7: Compute score
db.refresh(prospect)
score = scoring_service.compute_score(db, prospect)
db.commit()
diff --git a/app/modules/prospecting/services/enrichment_service.py b/app/modules/prospecting/services/enrichment_service.py
index a8f787f7..250ec3fb 100644
--- a/app/modules/prospecting/services/enrichment_service.py
+++ b/app/modules/prospecting/services/enrichment_service.py
@@ -468,4 +468,159 @@ class EnrichmentService:
return ",".join(found) if found else None
+    def scrape_content(self, db: Session, prospect: Prospect) -> dict | None:
+        """Scrape page content (headings, paragraphs, images, services) for POC builder.
+
+        Fetches the homepage plus a few common content pages and extracts
+        structured content with BeautifulSoup. Results are stored as JSON in
+        ``prospect.scraped_content_json``; ``last_content_scrape_at`` is
+        stamped even when every fetch failed, so the prospect is not retried
+        forever by the pending-scrape batch query.
+
+        Returns the content dict, or ``None`` when the prospect has no usable
+        website.
+        """
+        import json
+
+        from bs4 import BeautifulSoup
+
+        domain = prospect.domain_name
+        if not domain or not prospect.has_website:
+            return None
+
+        scheme = "https" if prospect.uses_https else "http"
+        base_url = f"{scheme}://{domain}"
+        # Homepage plus common EN/FR page slugs.
+        paths = ["", "/about", "/a-propos", "/services", "/nos-services", "/contact"]
+
+        session = requests.Session()
+        session.verify = False  # noqa: SEC047 passive scan
+        session.headers.update({"User-Agent": "Mozilla/5.0 (compatible; OrionBot/1.0)"})
+
+        content = {
+            "meta_description": None,
+            "headings": [],
+            "paragraphs": [],
+            "services": [],
+            "images": [],
+            "social_links": {},
+            "business_hours": None,
+            "languages_detected": [],
+        }
+
+        seen_headings = set()
+        seen_paragraphs = set()
+
+        try:
+            for path in paths:
+                try:
+                    url = base_url + path
+                    resp = session.get(url, timeout=config.http_timeout, allow_redirects=True)
+                    if resp.status_code != 200:
+                        continue
+
+                    soup = BeautifulSoup(resp.text, "html.parser")
+                    self._extract_meta(soup, content)
+                    self._extract_text(soup, content, seen_headings, seen_paragraphs)
+                    self._extract_images(soup, content, base_url)
+                    self._extract_social_links(soup, content)
+                    self._extract_services(soup, content)
+                except Exception as e:  # noqa: EXC003
+                    logger.debug("Content scrape failed for %s%s: %s", domain, path, e)
+        finally:
+            # Release pooled connections even if something outside the
+            # per-page handler raises (original close was not exception-safe).
+            session.close()
+
+        # Store results; timestamp set unconditionally — see docstring.
+        prospect.scraped_content_json = json.dumps(content, ensure_ascii=False)
+        prospect.last_content_scrape_at = datetime.now(UTC)
+        db.flush()
+
+        logger.info(
+            "Content scrape for %s: %d headings, %d paragraphs, %d images, %d services",
+            domain, len(content["headings"]), len(content["paragraphs"]),
+            len(content["images"]), len(content["services"]),
+        )
+        return content
+
+    def _extract_meta(self, soup, content: dict) -> None:
+        """Collect the first meta description and the <html lang> code."""
+        if not content["meta_description"]:
+            meta = soup.find("meta", attrs={"name": "description"})
+            if meta and meta.get("content"):
+                content["meta_description"] = meta["content"].strip()
+        html_tag = soup.find("html")
+        if html_tag and html_tag.get("lang"):
+            lang = html_tag["lang"][:2].lower()
+            if lang not in content["languages_detected"]:
+                content["languages_detected"].append(lang)
+
+    def _extract_text(self, soup, content: dict, seen_headings: set, seen_paragraphs: set) -> None:
+        """Collect deduplicated H1/H2 headings and substantial paragraphs (max 20)."""
+        for tag in soup.find_all(["h1", "h2"]):
+            text = tag.get_text(strip=True)
+            if text and len(text) > 3 and text not in seen_headings:
+                seen_headings.add(text)
+                content["headings"].append(text)
+        for tag in soup.find_all("p"):
+            if len(content["paragraphs"]) >= 20:
+                break
+            text = tag.get_text(strip=True)
+            # Skip tiny/boilerplate paragraphs.
+            if text and len(text) > 50 and text not in seen_paragraphs:
+                seen_paragraphs.add(text)
+                content["paragraphs"].append(text)
+
+    def _extract_images(self, soup, content: dict, base_url: str) -> None:
+        """Collect up to 15 absolute image URLs, skipping icons and tracking pixels."""
+        for img in soup.find_all("img"):
+            if len(content["images"]) >= 15:
+                break
+            src = img.get("src") or img.get("data-src")
+            if not src:
+                continue
+            # Filter data URIs / tracking pixels BEFORE resolving: the
+            # original checked data: after resolution, where it was dead code.
+            if src.startswith("data:") or "1x1" in src or "pixel" in src:
+                continue
+            # Resolve protocol-relative and root-relative URLs to absolute.
+            if src.startswith("//"):
+                src = "https:" + src
+            elif src.startswith("/"):
+                src = base_url + src
+            elif not src.startswith("http"):
+                continue
+            # Skip declared-tiny images (icons); width/height attrs may be absent.
+            width = img.get("width", "")
+            height = img.get("height", "")
+            if width.isdigit() and int(width) < 100:
+                continue
+            if height.isdigit() and int(height) < 100:
+                continue
+            if src not in content["images"]:
+                content["images"].append(src)
+
+    def _extract_social_links(self, soup, content: dict) -> None:
+        """Record the first link found for each known social platform."""
+        platforms = [
+            ("facebook", "facebook.com"),
+            ("instagram", "instagram.com"),
+            ("linkedin", "linkedin.com"),
+            ("twitter", "twitter.com"),
+            ("youtube", "youtube.com"),
+            ("tiktok", "tiktok.com"),
+        ]
+        for a in soup.find_all("a", href=True):
+            href = a["href"]
+            for platform, pattern in platforms:
+                if pattern in href and platform not in content["social_links"]:
+                    content["social_links"][platform] = href
+
+    def _extract_services(self, soup, content: dict) -> None:
+        """Collect up to 10 service names from lists/cards under service-like headings."""
+        keywords = ["service", "prestation", "leistung", "angebot", "nos activit"]
+        for heading in soup.find_all(["h2", "h3"]):
+            # Enforce the cap across headings: the original break only exited
+            # the sibling walk of one heading, so the cap could be exceeded.
+            if len(content["services"]) >= 10:
+                break
+            heading_text = heading.get_text(strip=True).lower()
+            if not any(kw in heading_text for kw in keywords):
+                continue
+            # Walk following siblings until the next section heading.
+            sibling = heading.find_next_sibling()
+            while sibling is not None and sibling.name not in ["h1", "h2", "h3"]:
+                if sibling.name in ["ul", "ol"]:
+                    for li in sibling.find_all("li"):
+                        text = li.get_text(strip=True)
+                        if text and len(text) > 3 and text not in content["services"]:
+                            content["services"].append(text)
+                elif sibling.name == "div":
+                    # Cards pattern: div containing an h3/h4/h5 title.
+                    card_title = sibling.find(["h3", "h4", "h5"])
+                    if card_title:
+                        text = card_title.get_text(strip=True)
+                        if text and text not in content["services"]:
+                            content["services"].append(text)
+                if len(content["services"]) >= 10:
+                    break
+                sibling = sibling.find_next_sibling()
enrichment_service = EnrichmentService()
diff --git a/app/modules/prospecting/services/prospect_service.py b/app/modules/prospecting/services/prospect_service.py
index 3183a60b..09abf8d5 100644
--- a/app/modules/prospecting/services/prospect_service.py
+++ b/app/modules/prospecting/services/prospect_service.py
@@ -251,6 +251,17 @@ class ProspectService:
.all()
)
+    def get_pending_content_scrape(self, db: Session, limit: int = 100) -> list[Prospect]:
+        """Prospects with a website whose content has never been scraped.
+
+        NOTE(review): no ORDER BY, so batch selection order is
+        backend-dependent — confirm this matches the sibling pending queries.
+        """
+        return (
+            db.query(Prospect)
+            .filter(
+                Prospect.has_website.is_(True),
+                Prospect.last_content_scrape_at.is_(None),
+            )
+            .limit(limit)
+            .all()
+        )
+
def get_pending_security_audit(self, db: Session, limit: int = 50) -> list[Prospect]:
return (
db.query(Prospect)
diff --git a/app/modules/prospecting/static/admin/js/scan-jobs.js b/app/modules/prospecting/static/admin/js/scan-jobs.js
index c806e99d..3fb68d5f 100644
--- a/app/modules/prospecting/static/admin/js/scan-jobs.js
+++ b/app/modules/prospecting/static/admin/js/scan-jobs.js
@@ -53,6 +53,7 @@ function scanJobs() {
'tech_scan': 'tech-scan',
'performance_scan': 'performance',
'contact_scrape': 'contacts',
+ 'content_scrape': 'content-scrape',
'security_audit': 'security-audit',
'score_compute': 'score-compute',
},
diff --git a/app/modules/prospecting/templates/prospecting/admin/scan-jobs.html b/app/modules/prospecting/templates/prospecting/admin/scan-jobs.html
index 7e2161c1..a7d44e5b 100644
--- a/app/modules/prospecting/templates/prospecting/admin/scan-jobs.html
+++ b/app/modules/prospecting/templates/prospecting/admin/scan-jobs.html
@@ -34,6 +34,11 @@
Contact Scrape
+