From 1828ac85eb212a2b283607ca7ad303fdc1d35326 Mon Sep 17 00:00:00 2001
From: Samir Boulahtit <samir.boulahtit@wizard.lu>
Date: Wed, 1 Apr 2026 22:26:56 +0200
Subject: [PATCH] feat(prospecting): add content scraping for POC builder
 (Workstream 3A)

- New scrape_content() method in enrichment_service: extracts meta
  description, H1/H2 headings, paragraphs, images (filtered for size),
  social links, service items, and detected languages using BeautifulSoup
- Scans 6 pages per prospect: /, /about, /a-propos, /services,
  /nos-services, /contact
- Results stored as JSON in prospect.scraped_content_json
- New endpoints: POST /content-scrape/{id} and /content-scrape/batch
- Added to full_enrichment pipeline (Step 5, before security audit)
- CONTENT_SCRAPE job type for scan-jobs tracking
- "Content Scrape" batch button on scan-jobs page
- Add beautifulsoup4 to requirements.txt

Tested on batirenovation-strasbourg.fr: extracted 30 headings,
21 paragraphs, 13 images.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 app/modules/prospecting/models/prospect.py    |   4 +
 app/modules/prospecting/models/scan_job.py    |   1 +
 .../routes/api/admin_enrichment.py            |  42 ++++-
 .../services/enrichment_service.py            | 155 ++++++++++++++++++
 .../prospecting/services/prospect_service.py  |  11 ++
 .../prospecting/static/admin/js/scan-jobs.js  |   1 +
 .../prospecting/admin/scan-jobs.html          |   5 +
 requirements.txt                              |   1 +
 8 files changed, 218 insertions(+), 2 deletions(-)

diff --git a/app/modules/prospecting/models/prospect.py b/app/modules/prospecting/models/prospect.py
index 490f1852..78e68e30 100644
--- a/app/modules/prospecting/models/prospect.py
+++ b/app/modules/prospecting/models/prospect.py
@@ -70,6 +70,10 @@ class Prospect(Base, TimestampMixin):
     last_perf_scan_at = Column(DateTime, nullable=True)
     last_contact_scrape_at = Column(DateTime, nullable=True)
     last_security_audit_at = Column(DateTime, nullable=True)
+    last_content_scrape_at = Column(DateTime, nullable=True)
+
+    # Scraped page content for POC builder
+    scraped_content_json = Column(Text, nullable=True)
 
     # Relationships
     tech_profile = relationship("ProspectTechProfile", back_populates="prospect", uselist=False, cascade="all, delete-orphan")
diff --git a/app/modules/prospecting/models/scan_job.py b/app/modules/prospecting/models/scan_job.py
index 85885bd2..a13efa44 100644
--- a/app/modules/prospecting/models/scan_job.py
+++ b/app/modules/prospecting/models/scan_job.py
@@ -20,6 +20,7 @@ class JobType(str, enum.Enum):
     SCORE_COMPUTE = "score_compute"
     FULL_ENRICHMENT = "full_enrichment"
     SECURITY_AUDIT = "security_audit"
+    CONTENT_SCRAPE = "content_scrape"
 
 
 class JobStatus(str, enum.Enum):
diff --git a/app/modules/prospecting/routes/api/admin_enrichment.py b/app/modules/prospecting/routes/api/admin_enrichment.py
index 86b5de17..38b36d99 100644
--- a/app/modules/prospecting/routes/api/admin_enrichment.py
+++ b/app/modules/prospecting/routes/api/admin_enrichment.py
@@ -139,6 +139,27 @@ def contact_scrape_batch(
     return ScanBatchResponse(processed=len(prospects), successful=count)
 
 
+@router.post("/content-scrape/batch", response_model=ScanBatchResponse)
+def content_scrape_batch(
+    limit: int = Query(50, ge=1, le=200),
+    db: Session = Depends(get_db),
+    current_admin: UserContext = Depends(get_current_admin_api),
+):
+    """Run the content scrape over up to `limit` pending prospects, tracked as a CONTENT_SCRAPE job."""
+    job = stats_service.create_job(db, JobType.CONTENT_SCRAPE)
+    pending = prospect_service.get_pending_content_scrape(db, limit=limit)
+    total = len(pending)
+    succeeded = 0
+    for position, prospect in enumerate(pending, start=1):
+        if enrichment_service.scrape_content(db, prospect):
+            succeeded += 1
+        if position < total:  # pause between prospects, but not after the last one
+            _batch_delay()
+    stats_service.complete_job(job, processed=total)
+    db.commit()
+    return ScanBatchResponse(processed=total, successful=succeeded)
+
+
 @router.post("/security-audit/batch", response_model=ScanBatchResponse)
 def security_audit_batch(
     limit: int = Query(50, ge=1, le=200),
@@ -272,6 +293,19 @@ def security_audit_single(
     )
 
 
+@router.post("/content-scrape/{prospect_id}", response_model=ScanSingleResponse)
+def content_scrape_single(
+    prospect_id: int = Path(...),
+    db: Session = Depends(get_db),
+    current_admin: UserContext = Depends(get_current_admin_api),
+):
+    """Content-scrape one prospect by id; `profile` reports whether the scrape returned a result."""
+    target = prospect_service.get_by_id(db, prospect_id)
+    scraped = enrichment_service.scrape_content(db, target)
+    db.commit()
+    return ScanSingleResponse(domain=target.domain_name, profile=scraped is not None)
+
+
 @router.post("/full/{prospect_id}", response_model=FullEnrichmentResponse)
 def full_enrichment(
     prospect_id: int = Path(...),
@@ -299,11 +333,15 @@ def full_enrichment(
     if prospect.has_website:
         contacts = enrichment_service.scrape_contacts(db, prospect)
 
-    # Step 5: Security audit (if has website)
+    # Step 5: Content scrape (if has website)
+    if prospect.has_website:
+        enrichment_service.scrape_content(db, prospect)
+
+    # Step 6: Security audit (if has website)
     if prospect.has_website:
         security_audit_service.run_audit(db, prospect)
 
-    # Step 6: Compute score
+    # Step 7: Compute score
     db.refresh(prospect)
     score = scoring_service.compute_score(db, prospect)
     db.commit()
diff --git a/app/modules/prospecting/services/enrichment_service.py b/app/modules/prospecting/services/enrichment_service.py
index a8f787f7..250ec3fb 100644
--- a/app/modules/prospecting/services/enrichment_service.py
+++ b/app/modules/prospecting/services/enrichment_service.py
@@ -468,4 +468,159 @@ class EnrichmentService:
         return ",".join(found) if found else None
 
 
+    def scrape_content(self, db: Session, prospect: Prospect) -> dict | None:
+        """Scrape page content (headings, paragraphs, images, services) for POC builder.
+
+        Fetches a fixed list of paths, parses each page with BeautifulSoup and stores the merged result as JSON in
+        prospect.scraped_content_json. Paragraphs/images/services are capped at 20/15/10 in total. Returns the dict, or None if skipped.
+        """
+        import json
+
+        from bs4 import BeautifulSoup
+
+        domain = prospect.domain_name
+        if not domain or not prospect.has_website:
+            return None
+
+        scheme = "https" if prospect.uses_https else "http"
+        base_url = f"{scheme}://{domain}"
+        paths = ["", "/about", "/a-propos", "/services", "/nos-services", "/contact"]
+
+        session = requests.Session()
+        session.verify = False  # noqa: SEC047 passive scan
+        session.headers.update({"User-Agent": "Mozilla/5.0 (compatible; OrionBot/1.0)"})
+
+        content = {
+            "meta_description": None,
+            "headings": [],
+            "paragraphs": [],
+            "services": [],
+            "images": [],
+            "social_links": {},
+            "business_hours": None,
+            "languages_detected": [],
+        }
+
+        seen_headings = set()
+        seen_paragraphs = set()
+
+        for path in paths:
+            try:
+                url = base_url + path
+                resp = session.get(url, timeout=config.http_timeout, allow_redirects=True)
+                if resp.status_code != 200:
+                    continue
+
+                soup = BeautifulSoup(resp.text, "html.parser")
+
+                # Meta description (first one found across the scanned pages)
+                if not content["meta_description"]:
+                    meta = soup.find("meta", attrs={"name": "description"})
+                    if meta and meta.get("content"):
+                        content["meta_description"] = meta["content"].strip()
+
+                # Language detection (first two characters of <html lang>, lower-cased)
+                html_tag = soup.find("html")
+                if html_tag and html_tag.get("lang"):
+                    lang = html_tag["lang"][:2].lower()
+                    if lang not in content["languages_detected"]:
+                        content["languages_detected"].append(lang)
+
+                # Headings (H1, H2), de-duplicated across pages
+                for tag in soup.find_all(["h1", "h2"]):
+                    text = tag.get_text(strip=True)
+                    if text and len(text) > 3 and text not in seen_headings:
+                        seen_headings.add(text)
+                        content["headings"].append(text)
+
+                # Paragraphs (substantial ones only); cap checked first so later pages cannot exceed 20
+                for tag in soup.find_all("p"):
+                    if len(content["paragraphs"]) >= 20:
+                        break
+                    text = tag.get_text(strip=True)
+                    if text and len(text) > 50 and text not in seen_paragraphs:
+                        seen_paragraphs.add(text)
+                        content["paragraphs"].append(text)
+
+                # Images (hero/banner sized, skip tiny icons); cap checked first so later pages cannot exceed 15
+                for img in soup.find_all("img"):
+                    if len(content["images"]) >= 15:
+                        break
+                    src = img.get("src") or img.get("data-src")
+                    if not src:
+                        continue
+                    # Make absolute; other relative forms (e.g. "img/a.png") and data: URIs are skipped
+                    if src.startswith("//"):
+                        src = "https:" + src
+                    elif src.startswith("/"):
+                        src = base_url + src
+                    elif not src.startswith("http"):
+                        continue
+                    # Skip tiny images, data URIs, tracking pixels
+                    if "1x1" in src or "pixel" in src or src.startswith("data:"):
+                        continue
+                    width = img.get("width", "")
+                    height = img.get("height", "")
+                    if width and width.isdigit() and int(width) < 100:
+                        continue
+                    if height and height.isdigit() and int(height) < 100:
+                        continue
+                    if src not in content["images"]:
+                        content["images"].append(src)
+
+                # Social links (first matching href per platform wins)
+                for a in soup.find_all("a", href=True):
+                    href = a["href"]
+                    for platform, pattern in [
+                        ("facebook", "facebook.com"),
+                        ("instagram", "instagram.com"),
+                        ("linkedin", "linkedin.com"),
+                        ("twitter", "twitter.com"),
+                        ("youtube", "youtube.com"),
+                        ("tiktok", "tiktok.com"),
+                    ]:
+                        if pattern in href and platform not in content["social_links"]:
+                            content["social_links"][platform] = href
+
+                # Service items (from list items near "service" headings), capped at 10 in total
+                for heading in soup.find_all(["h2", "h3"]):
+                    heading_text = heading.get_text(strip=True).lower()
+                    if any(kw in heading_text for kw in ["service", "prestation", "leistung", "angebot", "nos activit"]):
+                        # Look for list items or cards after this heading, up to the next heading
+                        sibling = heading.find_next_sibling()
+                        while sibling and sibling.name not in ["h1", "h2", "h3"]:
+                            if len(content["services"]) >= 10:
+                                break
+                            if sibling.name in ["ul", "ol"]:
+                                for li in sibling.find_all("li"):
+                                    text = li.get_text(strip=True)
+                                    if text and len(text) > 3 and text not in content["services"] and len(content["services"]) < 10:
+                                        content["services"].append(text)
+                            elif sibling.name == "div":
+                                # Cards pattern: divs with h3/h4 + p
+                                card_title = sibling.find(["h3", "h4", "h5"])
+                                if card_title:
+                                    text = card_title.get_text(strip=True)
+                                    if text and text not in content["services"]:
+                                        content["services"].append(text)
+                            sibling = sibling.find_next_sibling()
+
+            except Exception as e:  # noqa: EXC003
+                logger.debug("Content scrape failed for %s%s: %s", domain, path, e)
+
+        session.close()
+
+        # Store results (the scrape timestamp is set even when no page could be fetched)
+        prospect.scraped_content_json = json.dumps(content, ensure_ascii=False)
+        prospect.last_content_scrape_at = datetime.now(UTC)
+        db.flush()
+
+        logger.info(
+            "Content scrape for %s: %d headings, %d paragraphs, %d images, %d services",
+            domain, len(content["headings"]), len(content["paragraphs"]),
+            len(content["images"]), len(content["services"]),
+        )
+        return content
+
+
 enrichment_service = EnrichmentService()
diff --git a/app/modules/prospecting/services/prospect_service.py b/app/modules/prospecting/services/prospect_service.py
index 3183a60b..09abf8d5 100644
--- a/app/modules/prospecting/services/prospect_service.py
+++ b/app/modules/prospecting/services/prospect_service.py
@@ -251,6 +251,17 @@ class ProspectService:
             .all()
         )
 
+    def get_pending_content_scrape(self, db: Session, limit: int = 100) -> list[Prospect]:
+        """Return up to `limit` prospects that have a website and no content scrape timestamp yet."""
+        has_site = Prospect.has_website.is_(True)
+        never_scraped = Prospect.last_content_scrape_at.is_(None)
+        pending = (
+            db.query(Prospect)
+            .filter(has_site, never_scraped)
+            .limit(limit)
+        )
+        return pending.all()
+
     def get_pending_security_audit(self, db: Session, limit: int = 50) -> list[Prospect]:
         return (
             db.query(Prospect)
diff --git a/app/modules/prospecting/static/admin/js/scan-jobs.js b/app/modules/prospecting/static/admin/js/scan-jobs.js
index c806e99d..3fb68d5f 100644
--- a/app/modules/prospecting/static/admin/js/scan-jobs.js
+++ b/app/modules/prospecting/static/admin/js/scan-jobs.js
@@ -53,6 +53,7 @@ function scanJobs() {
             'tech_scan': 'tech-scan',
             'performance_scan': 'performance',
             'contact_scrape': 'contacts',
+            'content_scrape': 'content-scrape',
             'security_audit': 'security-audit',
             'score_compute': 'score-compute',
         },
diff --git a/app/modules/prospecting/templates/prospecting/admin/scan-jobs.html b/app/modules/prospecting/templates/prospecting/admin/scan-jobs.html
index 7e2161c1..a7d44e5b 100644
--- a/app/modules/prospecting/templates/prospecting/admin/scan-jobs.html
+++ b/app/modules/prospecting/templates/prospecting/admin/scan-jobs.html
@@ -34,6 +34,11 @@
             <span x-html="$icon('mail', 'w-4 h-4 mr-2')"></span>
             Contact Scrape
         </button>
+        <button type="button" @click="startBatchJob('content_scrape')"
+                class="inline-flex items-center px-4 py-2 text-sm font-medium leading-5 text-white transition-colors duration-150 bg-teal-600 border border-transparent rounded-lg hover:bg-teal-700 focus:outline-none">
+            <span x-html="$icon('document-text', 'w-4 h-4 mr-2')"></span>
+            Content Scrape
+        </button>
         <button type="button" @click="startBatchJob('security_audit')"
                 class="inline-flex items-center px-4 py-2 text-sm font-medium leading-5 text-white transition-colors duration-150 bg-yellow-600 border border-transparent rounded-lg hover:bg-yellow-700 focus:outline-none">
             <span x-html="$icon('shield-check', 'w-4 h-4 mr-2')"></span>
diff --git a/requirements.txt b/requirements.txt
index b5302d5d..23ae6d82 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -21,6 +21,7 @@ python-multipart==0.0.20
 # Data processing
 pandas==2.2.3
 requests==2.32.3
+beautifulsoup4==4.14.3
 
 # Image processing
 Pillow>=10.0.0