From 1828ac85eb212a2b283607ca7ad303fdc1d35326 Mon Sep 17 00:00:00 2001 From: Samir Boulahtit Date: Wed, 1 Apr 2026 22:26:56 +0200 Subject: [PATCH] feat(prospecting): add content scraping for POC builder (Workstream 3A) - New scrape_content() method in enrichment_service: extracts meta description, H1/H2 headings, paragraphs, images (filtered for size), social links, service items, and detected languages using BeautifulSoup - Scans 6 pages per prospect: /, /about, /a-propos, /services, /nos-services, /contact - Results stored as JSON in prospect.scraped_content_json - New endpoints: POST /content-scrape/{id} and /content-scrape/batch - Added to full_enrichment pipeline (Step 5, before security audit) - CONTENT_SCRAPE job type for scan-jobs tracking - "Content Scrape" batch button on scan-jobs page - Add beautifulsoup4 to requirements.txt Tested on batirenovation-strasbourg.fr: extracted 30 headings, 21 paragraphs, 13 images. Co-Authored-By: Claude Opus 4.6 (1M context) --- app/modules/prospecting/models/prospect.py | 4 + app/modules/prospecting/models/scan_job.py | 1 + .../routes/api/admin_enrichment.py | 42 ++++- .../services/enrichment_service.py | 155 ++++++++++++++++++ .../prospecting/services/prospect_service.py | 11 ++ .../prospecting/static/admin/js/scan-jobs.js | 1 + .../prospecting/admin/scan-jobs.html | 5 + requirements.txt | 1 + 8 files changed, 218 insertions(+), 2 deletions(-) diff --git a/app/modules/prospecting/models/prospect.py b/app/modules/prospecting/models/prospect.py index 490f1852..78e68e30 100644 --- a/app/modules/prospecting/models/prospect.py +++ b/app/modules/prospecting/models/prospect.py @@ -70,6 +70,10 @@ class Prospect(Base, TimestampMixin): last_perf_scan_at = Column(DateTime, nullable=True) last_contact_scrape_at = Column(DateTime, nullable=True) last_security_audit_at = Column(DateTime, nullable=True) + last_content_scrape_at = Column(DateTime, nullable=True) + + # Scraped page content for POC builder + scraped_content_json = Column(Text, nullable=True) # Relationships tech_profile = relationship("ProspectTechProfile", back_populates="prospect", uselist=False, cascade="all, delete-orphan") diff --git a/app/modules/prospecting/models/scan_job.py b/app/modules/prospecting/models/scan_job.py index 85885bd2..a13efa44 100644 --- a/app/modules/prospecting/models/scan_job.py +++ b/app/modules/prospecting/models/scan_job.py @@ -20,6 +20,7 @@ class JobType(str, enum.Enum): SCORE_COMPUTE = "score_compute" FULL_ENRICHMENT = "full_enrichment" SECURITY_AUDIT = "security_audit" + CONTENT_SCRAPE = "content_scrape" class JobStatus(str, enum.Enum): diff --git a/app/modules/prospecting/routes/api/admin_enrichment.py b/app/modules/prospecting/routes/api/admin_enrichment.py index 86b5de17..38b36d99 100644 --- a/app/modules/prospecting/routes/api/admin_enrichment.py +++ b/app/modules/prospecting/routes/api/admin_enrichment.py @@ -139,6 +139,27 @@ def contact_scrape_batch( return ScanBatchResponse(processed=len(prospects), successful=count) +@router.post("/content-scrape/batch", response_model=ScanBatchResponse) +def content_scrape_batch( + limit: int = Query(50, ge=1, le=200), + db: Session = Depends(get_db), + current_admin: UserContext = Depends(get_current_admin_api), +): + """Scrape page content for pending prospects.""" + job = stats_service.create_job(db, JobType.CONTENT_SCRAPE) + prospects = prospect_service.get_pending_content_scrape(db, limit=limit) + count = 0 + for i, prospect in enumerate(prospects): + result = enrichment_service.scrape_content(db, prospect) + if result: + count += 1 + if i < len(prospects) - 1: + _batch_delay() + stats_service.complete_job(job, processed=len(prospects)) + db.commit() + return ScanBatchResponse(processed=len(prospects), successful=count) + + @router.post("/security-audit/batch", response_model=ScanBatchResponse) def security_audit_batch( limit: int = Query(50, ge=1, le=200), @@ -272,6 +293,19 @@ def security_audit_single( ) +@router.post("/content-scrape/{prospect_id}", response_model=ScanSingleResponse) +def content_scrape_single( + prospect_id: int = Path(...), + db: Session = Depends(get_db), + current_admin: UserContext = Depends(get_current_admin_api), +): + """Scrape page content for a single prospect.""" + prospect = prospect_service.get_by_id(db, prospect_id) + result = enrichment_service.scrape_content(db, prospect) + db.commit() + return ScanSingleResponse(domain=prospect.domain_name, profile=result is not None) + + @router.post("/full/{prospect_id}", response_model=FullEnrichmentResponse) def full_enrichment( prospect_id: int = Path(...), @@ -299,11 +333,15 @@ def full_enrichment( if prospect.has_website: contacts = enrichment_service.scrape_contacts(db, prospect) - # Step 5: Security audit (if has website) + # Step 5: Content scrape (if has website) + if prospect.has_website: + enrichment_service.scrape_content(db, prospect) + + # Step 6: Security audit (if has website) if prospect.has_website: security_audit_service.run_audit(db, prospect) - # Step 6: Compute score + # Step 7: Compute score db.refresh(prospect) score = scoring_service.compute_score(db, prospect) db.commit() diff --git a/app/modules/prospecting/services/enrichment_service.py b/app/modules/prospecting/services/enrichment_service.py index a8f787f7..250ec3fb 100644 --- a/app/modules/prospecting/services/enrichment_service.py +++ b/app/modules/prospecting/services/enrichment_service.py @@ -468,4 +468,159 @@ class EnrichmentService: return ",".join(found) if found else None + def scrape_content(self, db: Session, prospect: Prospect) -> dict | None: + """Scrape page content (headings, paragraphs, images, services) for POC builder. + + Uses BeautifulSoup to extract structured content from the prospect's + website. Stores results as JSON in prospect.scraped_content_json. + """ + import json + + from bs4 import BeautifulSoup + + domain = prospect.domain_name + if not domain or not prospect.has_website: + return None + + scheme = "https" if prospect.uses_https else "http" + base_url = f"{scheme}://{domain}" + paths = ["", "/about", "/a-propos", "/services", "/nos-services", "/contact"] + + session = requests.Session() + session.verify = False # noqa: SEC047 passive scan + session.headers.update({"User-Agent": "Mozilla/5.0 (compatible; OrionBot/1.0)"}) + + content = { + "meta_description": None, + "headings": [], + "paragraphs": [], + "services": [], + "images": [], + "social_links": {}, + "business_hours": None, + "languages_detected": [], + } + + seen_headings = set() + seen_paragraphs = set() + + for path in paths: + try: + url = base_url + path + resp = session.get(url, timeout=config.http_timeout, allow_redirects=True) + if resp.status_code != 200: + continue + + soup = BeautifulSoup(resp.text, "html.parser") + + # Meta description (first one found) + if not content["meta_description"]: + meta = soup.find("meta", attrs={"name": "description"}) + if meta and meta.get("content"): + content["meta_description"] = meta["content"].strip() + + # Language detection + html_tag = soup.find("html") + if html_tag and html_tag.get("lang"): + lang = html_tag["lang"][:2].lower() + if lang not in content["languages_detected"]: + content["languages_detected"].append(lang) + + # Headings (H1, H2) + for tag in soup.find_all(["h1", "h2"]): + text = tag.get_text(strip=True) + if text and len(text) > 3 and text not in seen_headings: + seen_headings.add(text) + content["headings"].append(text) + + # Paragraphs (substantial ones, skip tiny/boilerplate) + for tag in soup.find_all("p"): + text = tag.get_text(strip=True) + if text and len(text) > 50 and text not in seen_paragraphs: + seen_paragraphs.add(text) + content["paragraphs"].append(text) + if len(content["paragraphs"]) >= 20: + break + + # Images (hero/banner sized, skip tiny icons) + for img in soup.find_all("img"): + src = img.get("src") or img.get("data-src") + if not src: + continue + # Make absolute + if src.startswith("//"): + src = "https:" + src + elif src.startswith("/"): + src = base_url + src + elif not src.startswith("http"): + continue + # Skip tiny images, data URIs, tracking pixels + if "1x1" in src or "pixel" in src or src.startswith("data:"): + continue + width = img.get("width", "") + height = img.get("height", "") + if width and width.isdigit() and int(width) < 100: + continue + if height and height.isdigit() and int(height) < 100: + continue + if src not in content["images"]: + content["images"].append(src) + if len(content["images"]) >= 15: + break + + # Social links + for a in soup.find_all("a", href=True): + href = a["href"] + for platform, pattern in [ + ("facebook", "facebook.com"), + ("instagram", "instagram.com"), + ("linkedin", "linkedin.com"), + ("twitter", "twitter.com"), + ("youtube", "youtube.com"), + ("tiktok", "tiktok.com"), + ]: + if pattern in href and platform not in content["social_links"]: + content["social_links"][platform] = href + + # Service items (from list items near "service" headings) + for heading in soup.find_all(["h2", "h3"]): + heading_text = heading.get_text(strip=True).lower() + if any(kw in heading_text for kw in ["service", "prestation", "leistung", "angebot", "nos activit"]): + # Look for list items or cards after this heading + sibling = heading.find_next_sibling() + while sibling and sibling.name not in ["h1", "h2", "h3"]: + if sibling.name in ["ul", "ol"]: + for li in sibling.find_all("li"): + text = li.get_text(strip=True) + if text and len(text) > 3 and text not in content["services"]: + content["services"].append(text) + elif sibling.name == "div": + # Cards pattern: divs with h3/h4 + p + card_title = sibling.find(["h3", "h4", "h5"]) + if card_title: + text = card_title.get_text(strip=True) + if text and text not in content["services"]: + content["services"].append(text) + sibling = sibling.find_next_sibling() + if len(content["services"]) >= 10: + break + + except Exception as e: # noqa: EXC003 + logger.debug("Content scrape failed for %s%s: %s", domain, path, e) + + session.close() + + # Store results + prospect.scraped_content_json = json.dumps(content, ensure_ascii=False) + prospect.last_content_scrape_at = datetime.now(UTC) + db.flush() + + logger.info( + "Content scrape for %s: %d headings, %d paragraphs, %d images, %d services", + domain, len(content["headings"]), len(content["paragraphs"]), + len(content["images"]), len(content["services"]), + ) + return content + + enrichment_service = EnrichmentService() diff --git a/app/modules/prospecting/services/prospect_service.py b/app/modules/prospecting/services/prospect_service.py index 3183a60b..09abf8d5 100644 --- a/app/modules/prospecting/services/prospect_service.py +++ b/app/modules/prospecting/services/prospect_service.py @@ -251,6 +251,17 @@ class ProspectService: .all() ) + def get_pending_content_scrape(self, db: Session, limit: int = 100) -> list[Prospect]: + return ( + db.query(Prospect) + .filter( + Prospect.has_website.is_(True), + Prospect.last_content_scrape_at.is_(None), + ) + .limit(limit) + .all() + ) + def get_pending_security_audit(self, db: Session, limit: int = 50) -> list[Prospect]: return ( db.query(Prospect) diff --git a/app/modules/prospecting/static/admin/js/scan-jobs.js b/app/modules/prospecting/static/admin/js/scan-jobs.js index c806e99d..3fb68d5f 100644 --- a/app/modules/prospecting/static/admin/js/scan-jobs.js +++ b/app/modules/prospecting/static/admin/js/scan-jobs.js @@ -53,6 +53,7 @@ function scanJobs() { 'tech_scan': 'tech-scan', 'performance_scan': 'performance', 'contact_scrape': 'contacts', + 'content_scrape': 'content-scrape', 'security_audit': 'security-audit', 'score_compute': 'score-compute', }, diff --git a/app/modules/prospecting/templates/prospecting/admin/scan-jobs.html b/app/modules/prospecting/templates/prospecting/admin/scan-jobs.html index 7e2161c1..a7d44e5b 100644 --- a/app/modules/prospecting/templates/prospecting/admin/scan-jobs.html +++ b/app/modules/prospecting/templates/prospecting/admin/scan-jobs.html @@ -34,6 +34,11 @@ Contact Scrape +