Files
orion/app/modules/prospecting/services/enrichment_service.py
Samir Boulahtit 754bfca87d
Some checks failed
CI / validate (push) Has been cancelled
CI / dependency-scanning (push) Has been cancelled
CI / docs (push) Has been cancelled
CI / deploy (push) Has been cancelled
CI / ruff (push) Successful in 13s
CI / pytest (push) Has been cancelled
fix(prospecting): fix contact scraper and add address extraction
- Fix contact_type column: Enum(ContactType) → String(20) to match the
  migration (fixes "type contacttype does not exist" on insert)
- Rewrite scrape_contacts with structured-first approach:
  Phase 1: tel:/mailto: href extraction (high confidence)
  Phase 2: regex fallback with SVG/script stripping, international phone
           pattern (requires + prefix, min 10 digits)
  Phase 3: address extraction from Schema.org JSON-LD, <address> tags,
           and European street address regex (FR/DE/EN street keywords)
- URL-decode email values, strip tags to plain text for cross-element
  address matching
- Add /mentions-legales to scanned paths

Tested on batirenovation-strasbourg.fr: finds 3 contacts (email, phone,
address) vs 120+ false positives and a crash before.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-30 21:18:43 +02:00

458 lines
18 KiB
Python

# app/modules/prospecting/services/enrichment_service.py
"""
Enrichment service for prospect scanning pipeline.
Migrated from marketing-.lu-domains/app/services/enrichment_service.py.
Performs passive HTTP checks, technology detection, performance audits,
and contact scraping for digital prospects.
Uses `requests` (sync) to match Orion's tech stack.
"""
import logging
import re
import socket
import ssl
from datetime import UTC, datetime
import requests
from sqlalchemy.orm import Session
from app.modules.prospecting.config import config
from app.modules.prospecting.models import (
Prospect,
ProspectContact,
ProspectPerformanceProfile,
ProspectTechProfile,
)
logger = logging.getLogger(__name__)
# CMS detection patterns.
# Maps a CMS name to regex fragments that are searched for in the page HTML.
# Dict order matters: _detect_cms returns the first CMS with a matching pattern.
# NOTE(review): scan_tech_stack lower-cases the HTML before matching, so any
# pattern in these tables that contains upper-case letters can only match if
# the search itself is case-insensitive.
CMS_PATTERNS = {
"wordpress": [r"wp-content", r"wp-includes", r"wordpress"],
"drupal": [r"drupal", r"sites/default", r"sites/all"],
"joomla": [r"/media/jui/", r"joomla", r"/components/com_"],
"shopify": [r"cdn\.shopify\.com", r"shopify"],
"wix": [r"wix\.com", r"wixstatic\.com", r"parastorage\.com"],
"squarespace": [r"squarespace\.com", r"sqsp\.com"],
"webflow": [r"webflow\.com", r"webflow\.io"],
"typo3": [r"typo3", r"/typo3conf/"],
"prestashop": [r"prestashop", r"/modules/ps_"],
"magento": [r"magento", r"mage/", r"/static/version"],
}
# JavaScript framework detection patterns.
# _detect_js_framework returns the first framework (in dict order) with a match.
# NOTE(review): bare words such as "react" and "angular" are unanchored, so they
# also match inside longer words (e.g. "reaction", "rectangular").
JS_FRAMEWORK_PATTERNS = {
"react": [r"react", r"__NEXT_DATA__", r"_next/"],
"vue": [r"vue\.js", r"vue\.min\.js", r"__vue__"],
"angular": [r"angular", r"ng-version"],
"jquery": [r"jquery"],
"alpine": [r"alpine\.js", r"alpinejs"],
}
# Analytics / tracking detection patterns.
# Unlike the two tables above, _detect_analytics reports every tool that has at
# least one matching pattern, joined with commas.
ANALYTICS_PATTERNS = {
"google_analytics": [r"google-analytics\.com", r"gtag/js", r"ga\.js"],
"google_tag_manager": [r"googletagmanager\.com", r"gtm\.js"],
"matomo": [r"matomo", r"piwik"],
"facebook_pixel": [r"facebook\.net/en_US/fbevents"],
}
class EnrichmentService:
"""Service for prospect enrichment via passive scanning."""
def check_http(self, db: Session, prospect: Prospect) -> dict:
"""Check HTTP connectivity for a prospect's domain."""
result = {
"has_website": False,
"uses_https": False,
"http_status_code": None,
"redirect_url": None,
"error": None,
}
domain = prospect.domain_name
if not domain:
result["error"] = "No domain name"
return result
# Try HTTPS first, then HTTP
for scheme in ["https", "http"]:
try:
url = f"{scheme}://{domain}"
response = requests.get(
url,
timeout=config.http_timeout,
allow_redirects=True,
verify=False, # noqa: SEC047 passive scan, not sending sensitive data
)
result["has_website"] = True
result["uses_https"] = scheme == "https"
result["http_status_code"] = response.status_code
if response.url != url:
result["redirect_url"] = str(response.url)
break
except requests.exceptions.Timeout:
result["error"] = f"Timeout on {scheme}"
except requests.exceptions.RequestException as e:
result["error"] = str(e)
if scheme == "https":
continue
break
# Update prospect
prospect.has_website = result["has_website"]
prospect.uses_https = result["uses_https"]
prospect.http_status_code = result["http_status_code"]
prospect.redirect_url = result["redirect_url"]
prospect.last_http_check_at = datetime.now(UTC)
if result["has_website"]:
prospect.status = "active"
db.flush()
return result
def scan_tech_stack(self, db: Session, prospect: Prospect) -> ProspectTechProfile | None:
"""Scan technology stack from prospect's website HTML."""
domain = prospect.domain_name
if not domain or not prospect.has_website:
return None
scheme = "https" if prospect.uses_https else "http"
url = f"{scheme}://{domain}"
try:
response = requests.get(
url,
timeout=config.http_timeout,
allow_redirects=True,
verify=False, # noqa: SEC047 passive scan, not sending sensitive data
)
html = response.text.lower()
headers = dict(response.headers)
cms = self._detect_cms(html)
js_framework = self._detect_js_framework(html)
analytics = self._detect_analytics(html)
server = headers.get("Server", "").split("/")[0] if "Server" in headers else None
server_version = None
if server and "/" in headers.get("Server", ""):
server_version = headers["Server"].split("/", 1)[1].strip()
# SSL certificate check
has_valid_cert = None
cert_issuer = None
cert_expires_at = None
if prospect.uses_https:
try:
ctx = ssl.create_default_context()
with ctx.wrap_socket(
socket.create_connection((domain, 443), timeout=5),
server_hostname=domain,
) as sock:
cert = sock.getpeercert()
has_valid_cert = True
cert_issuer = dict(x[0] for x in cert.get("issuer", [()])).get("organizationName")
not_after = cert.get("notAfter")
if not_after:
cert_expires_at = datetime.strptime(not_after, "%b %d %H:%M:%S %Y %Z")
except Exception: # noqa: EXC003
has_valid_cert = False
# Upsert tech profile
profile = prospect.tech_profile
if not profile:
profile = ProspectTechProfile(prospect_id=prospect.id)
db.add(profile)
profile.cms = cms
profile.server = server
profile.server_version = server_version
profile.js_framework = js_framework
profile.analytics = analytics
profile.has_valid_cert = has_valid_cert
profile.cert_issuer = cert_issuer
profile.cert_expires_at = cert_expires_at
profile.scan_source = "basic_http"
prospect.last_tech_scan_at = datetime.now(UTC)
db.flush()
return profile
except Exception as e: # noqa: EXC003
logger.error("Tech scan failed for %s: %s", domain, e)
if prospect.tech_profile:
prospect.tech_profile.scan_error = str(e)
prospect.last_tech_scan_at = datetime.now(UTC)
db.flush()
return None
def scan_performance(self, db: Session, prospect: Prospect) -> ProspectPerformanceProfile | None:
"""Run PageSpeed Insights audit for a prospect's website."""
domain = prospect.domain_name
if not domain or not prospect.has_website:
return None
scheme = "https" if prospect.uses_https else "http"
url = f"{scheme}://{domain}"
api_url = "https://www.googleapis.com/pagespeedonline/v5/runPagespeed"
params = {
"url": url,
"strategy": "mobile",
"category": ["performance", "accessibility", "best-practices", "seo"],
}
if config.pagespeed_api_key:
params["key"] = config.pagespeed_api_key
try:
response = requests.get(api_url, params=params, timeout=60)
data = response.json()
lighthouse = data.get("lighthouseResult", {})
categories = lighthouse.get("categories", {})
audits = lighthouse.get("audits", {})
perf_score = int((categories.get("performance", {}).get("score") or 0) * 100)
accessibility = int((categories.get("accessibility", {}).get("score") or 0) * 100)
best_practices = int((categories.get("best-practices", {}).get("score") or 0) * 100)
seo = int((categories.get("seo", {}).get("score") or 0) * 100)
# Upsert performance profile
profile = prospect.performance_profile
if not profile:
profile = ProspectPerformanceProfile(prospect_id=prospect.id)
db.add(profile)
profile.performance_score = perf_score
profile.accessibility_score = accessibility
profile.best_practices_score = best_practices
profile.seo_score = seo
# Core Web Vitals
fcp = audits.get("first-contentful-paint", {}).get("numericValue")
profile.first_contentful_paint_ms = int(fcp) if fcp else None
lcp = audits.get("largest-contentful-paint", {}).get("numericValue")
profile.largest_contentful_paint_ms = int(lcp) if lcp else None
tbt = audits.get("total-blocking-time", {}).get("numericValue")
profile.total_blocking_time_ms = int(tbt) if tbt else None
cls_val = audits.get("cumulative-layout-shift", {}).get("numericValue")
profile.cumulative_layout_shift = cls_val
si = audits.get("speed-index", {}).get("numericValue")
profile.speed_index = int(si) if si else None
tti = audits.get("interactive", {}).get("numericValue")
profile.time_to_interactive_ms = int(tti) if tti else None
# Mobile-friendly check
viewport = audits.get("viewport", {}).get("score")
profile.viewport_configured = viewport == 1 if viewport is not None else None
profile.is_mobile_friendly = profile.viewport_configured
profile.scan_strategy = "mobile"
prospect.last_perf_scan_at = datetime.now(UTC)
db.flush()
return profile
except Exception as e: # noqa: EXC003
logger.error("Performance scan failed for %s: %s", domain, e)
prospect.last_perf_scan_at = datetime.now(UTC)
db.flush()
return None
def scrape_contacts(self, db: Session, prospect: Prospect) -> list[ProspectContact]:
"""Scrape email and phone contacts from prospect's website.
Uses a two-phase approach:
1. Structured extraction from <a href="tel:..."> and <a href="mailto:..."> (high confidence)
2. Regex fallback for emails and international phone numbers (stricter filtering)
"""
from urllib.parse import unquote
domain = prospect.domain_name
if not domain or not prospect.has_website:
return []
scheme = "https" if prospect.uses_https else "http"
base_url = f"{scheme}://{domain}"
paths = ["", "/contact", "/kontakt", "/impressum", "/about", "/mentions-legales"]
# Structured patterns (from <a href> tags)
tel_pattern = re.compile(r'href=["\']tel:([^"\'>\s]+)', re.IGNORECASE)
mailto_pattern = re.compile(r'href=["\']mailto:([^"\'>\s?]+)', re.IGNORECASE)
# Regex fallback patterns
email_regex = re.compile(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}")
# International phone: requires + prefix to avoid matching random digit sequences
phone_regex = re.compile(
r"\+\d{1,3}[\s.-]?\(?\d{1,4}\)?[\s.-]?\d{2,4}[\s.-]?\d{2,4}(?:[\s.-]?\d{2,4})?"
)
false_positive_domains = {
"example.com", "email.com", "domain.com", "wordpress.org",
"w3.org", "schema.org", "sentry.io", "googleapis.com",
}
found_emails: set[str] = set()
found_phones: set[str] = set()
contacts: list[ProspectContact] = []
def _add_email(email: str, url: str, source: str) -> None:
email = unquote(email).strip().lower()
email_domain = email.split("@")[1] if "@" in email else ""
if email_domain in false_positive_domains or email in found_emails:
return
found_emails.add(email)
contacts.append(ProspectContact(
prospect_id=prospect.id,
contact_type="email",
value=email,
source_url=url,
source_element=source,
))
def _add_phone(phone: str, url: str, source: str) -> None:
phone_clean = re.sub(r"[\s.()\-]", "", phone)
if len(phone_clean) < 10 or phone_clean in found_phones:
return
found_phones.add(phone_clean)
contacts.append(ProspectContact(
prospect_id=prospect.id,
contact_type="phone",
value=phone_clean,
source_url=url,
source_element=source,
))
found_addresses: set[str] = set()
def _add_address(address: str, url: str, source: str) -> None:
address = re.sub(r"\s+", " ", address).strip()
if len(address) < 10 or address in found_addresses:
return
found_addresses.add(address)
contacts.append(ProspectContact(
prospect_id=prospect.id,
contact_type="address",
value=address,
source_url=url,
source_element=source,
))
session = requests.Session()
session.verify = False # noqa: SEC047 passive scan, not sending sensitive data
session.headers.update({"User-Agent": "Mozilla/5.0 (compatible; OrionBot/1.0)"})
for path in paths:
try:
url = base_url + path
response = session.get(url, timeout=config.http_timeout, allow_redirects=True)
if response.status_code != 200:
continue
html = response.text
# Phase 1: structured extraction from href attributes
for phone in tel_pattern.findall(html):
_add_phone(unquote(phone), url, "tel_href")
for email in mailto_pattern.findall(html):
_add_email(email, url, "mailto_href")
# Phase 2: regex fallback — strip SVG/script content first
text_html = re.sub(r"<(svg|script|style)[^>]*>.*?</\1>", "", html, flags=re.DOTALL | re.IGNORECASE)
for email in email_regex.findall(text_html):
_add_email(email, url, "regex")
for phone in phone_regex.findall(text_html):
_add_phone(phone, url, "regex")
# Phase 3: address extraction
# 3a: Schema.org JSON-LD
for m in re.finditer(r'"streetAddress"\s*:\s*"([^"]+)"', html):
parts = [m.group(1)]
# Try to find locality/postal near the same JSON block
block_end = html[m.end():m.end() + 200]
locality = re.search(r'"addressLocality"\s*:\s*"([^"]+)"', block_end)
postal = re.search(r'"postalCode"\s*:\s*"([^"]+)"', block_end)
if postal:
parts.append(postal.group(1))
if locality:
parts.append(locality.group(1))
_add_address(", ".join(parts), url, "schema_org")
# 3b: <address> HTML tag
for addr_match in re.finditer(r"<address[^>]*>(.*?)</address>", html, re.DOTALL | re.IGNORECASE):
clean = re.sub(r"<[^>]+>", " ", addr_match.group(1))
clean = re.sub(r"\s+", " ", clean).strip()
if clean:
_add_address(clean, url, "address_tag")
# 3c: European street address pattern (number + street keyword + postal code + city)
# Strip tags to plain text (replace tags with spaces for cross-element matching)
plain = re.sub(r"<[^>]+>", " ", text_html)
plain = re.sub(r"\s+", " ", plain)
street_keywords = (
r"(?:rue|avenue|boulevard|allée|impasse|chemin|place|route|passage|quai|"
r"straße|strasse|stra[ßs]e|weg|platz|gasse|" # German
r"street|road|lane|drive|way)" # English
)
addr_pattern = re.compile(
rf"\d{{1,4}}[\s,]+{street_keywords}\s[^<]{{3,60}}?\d{{4,5}}\s+[A-ZÀ-Ü][a-zà-ü]{{2,}}",
re.IGNORECASE,
)
for m in addr_pattern.finditer(plain):
_add_address(m.group(), url, "regex")
except Exception as e: # noqa: EXC003
logger.debug("Contact scrape failed for %s%s: %s", domain, path, e)
session.close()
# Save contacts (replace existing auto-scraped ones)
db.query(ProspectContact).filter(
ProspectContact.prospect_id == prospect.id,
ProspectContact.source_element.in_(["regex", "tel_href", "mailto_href", "schema_org", "address_tag"]),
).delete()
db.add_all(contacts)
# Mark first email and phone as primary
for c in contacts:
if c.contact_type == "email":
c.is_primary = True
break
for c in contacts:
if c.contact_type == "phone":
c.is_primary = True
break
prospect.last_contact_scrape_at = datetime.now(UTC)
db.flush()
return contacts
def _detect_cms(self, html: str) -> str | None:
for cms, patterns in CMS_PATTERNS.items():
for pattern in patterns:
if re.search(pattern, html):
return cms
return None
def _detect_js_framework(self, html: str) -> str | None:
for framework, patterns in JS_FRAMEWORK_PATTERNS.items():
for pattern in patterns:
if re.search(pattern, html):
return framework
return None
def _detect_analytics(self, html: str) -> str | None:
found = []
for tool, patterns in ANALYTICS_PATTERNS.items():
for pattern in patterns:
if re.search(pattern, html):
found.append(tool)
break
return ",".join(found) if found else None
# Module-level service instance, created when this module is imported.
enrichment_service = EnrichmentService()