feat(prospecting): add complete prospecting module for lead discovery and scoring
Some checks failed
Some checks failed
Migrates scanning pipeline from marketing-.lu-domains app into Orion module. Supports digital (domain scan) and offline (manual capture) lead channels with enrichment, scoring, campaign management, and interaction tracking. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
369
app/modules/prospecting/services/enrichment_service.py
Normal file
369
app/modules/prospecting/services/enrichment_service.py
Normal file
@@ -0,0 +1,369 @@
|
||||
# app/modules/prospecting/services/enrichment_service.py
|
||||
"""
|
||||
Enrichment service for prospect scanning pipeline.
|
||||
|
||||
Migrated from marketing-.lu-domains/app/services/enrichment_service.py.
|
||||
Performs passive HTTP checks, technology detection, performance audits,
|
||||
and contact scraping for digital prospects.
|
||||
|
||||
Uses `requests` (sync) to match Orion's tech stack.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import re
|
||||
import socket
|
||||
import ssl
|
||||
from datetime import UTC, datetime
|
||||
|
||||
import requests
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from app.modules.prospecting.config import config
|
||||
from app.modules.prospecting.models import (
|
||||
Prospect,
|
||||
ProspectContact,
|
||||
ProspectPerformanceProfile,
|
||||
ProspectTechProfile,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Detection tables.
#
# Each table maps a technology name to a list of regular expressions. The
# expressions are applied with ``re.search`` against the page HTML, which the
# scanner lower-cases first, so every pattern here must be written in lower
# case. Dict order matters for the CMS and JS-framework tables: the first
# entry with a matching pattern wins.

# CMS detection patterns
CMS_PATTERNS = {
    "wordpress": [r"wp-content", r"wp-includes", r"wordpress"],
    "drupal": [r"drupal", r"sites/default", r"sites/all"],
    "joomla": [r"/media/jui/", r"joomla", r"/components/com_"],
    "shopify": [r"cdn\.shopify\.com", r"shopify"],
    "wix": [r"wix\.com", r"wixstatic\.com", r"parastorage\.com"],
    "squarespace": [r"squarespace\.com", r"sqsp\.com"],
    "webflow": [r"webflow\.com", r"webflow\.io"],
    "typo3": [r"typo3", r"/typo3conf/"],
    "prestashop": [r"prestashop", r"/modules/ps_"],
    # The lookbehind keeps "mage/" from matching inside "image/" (as in
    # type="image/png"), which would otherwise tag almost any page that has
    # no other CMS signal as Magento.
    "magento": [r"magento", r"(?<![a-z0-9])mage/", r"/static/version"],
}

# JavaScript framework detection patterns
JS_FRAMEWORK_PATTERNS = {
    "react": [r"react", r"__NEXT_DATA__", r"_next/"],
    "vue": [r"vue\.js", r"vue\.min\.js", r"__vue__"],
    "angular": [r"angular", r"ng-version"],
    "jquery": [r"jquery"],
    "alpine": [r"alpine\.js", r"alpinejs"],
}

# Analytics / tracking tool detection patterns
ANALYTICS_PATTERNS = {
    "google_analytics": [r"google-analytics\.com", r"gtag/js", r"ga\.js"],
    "google_tag_manager": [r"googletagmanager\.com", r"gtm\.js"],
    "matomo": [r"matomo", r"piwik"],
    "facebook_pixel": [r"facebook\.net/en_US/fbevents"],
}
|
||||
|
||||
|
||||
class EnrichmentService:
|
||||
"""Service for prospect enrichment via passive scanning."""
|
||||
|
||||
def check_http(self, db: Session, prospect: Prospect) -> dict:
|
||||
"""Check HTTP connectivity for a prospect's domain."""
|
||||
result = {
|
||||
"has_website": False,
|
||||
"uses_https": False,
|
||||
"http_status_code": None,
|
||||
"redirect_url": None,
|
||||
"error": None,
|
||||
}
|
||||
|
||||
domain = prospect.domain_name
|
||||
if not domain:
|
||||
result["error"] = "No domain name"
|
||||
return result
|
||||
|
||||
# Try HTTPS first, then HTTP
|
||||
for scheme in ["https", "http"]:
|
||||
try:
|
||||
url = f"{scheme}://{domain}"
|
||||
response = requests.get(
|
||||
url,
|
||||
timeout=config.http_timeout,
|
||||
allow_redirects=True,
|
||||
verify=False, # noqa: SEC047 passive scan, not sending sensitive data
|
||||
)
|
||||
result["has_website"] = True
|
||||
result["uses_https"] = scheme == "https"
|
||||
result["http_status_code"] = response.status_code
|
||||
if response.url != url:
|
||||
result["redirect_url"] = str(response.url)
|
||||
break
|
||||
except requests.exceptions.Timeout:
|
||||
result["error"] = f"Timeout on {scheme}"
|
||||
except requests.exceptions.RequestException as e:
|
||||
result["error"] = str(e)
|
||||
if scheme == "https":
|
||||
continue
|
||||
break
|
||||
|
||||
# Update prospect
|
||||
prospect.has_website = result["has_website"]
|
||||
prospect.uses_https = result["uses_https"]
|
||||
prospect.http_status_code = result["http_status_code"]
|
||||
prospect.redirect_url = result["redirect_url"]
|
||||
prospect.last_http_check_at = datetime.now(UTC)
|
||||
|
||||
if result["has_website"]:
|
||||
prospect.status = "active"
|
||||
|
||||
db.commit()
|
||||
return result
|
||||
|
||||
def scan_tech_stack(self, db: Session, prospect: Prospect) -> ProspectTechProfile | None:
|
||||
"""Scan technology stack from prospect's website HTML."""
|
||||
domain = prospect.domain_name
|
||||
if not domain or not prospect.has_website:
|
||||
return None
|
||||
|
||||
scheme = "https" if prospect.uses_https else "http"
|
||||
url = f"{scheme}://{domain}"
|
||||
|
||||
try:
|
||||
response = requests.get(
|
||||
url,
|
||||
timeout=config.http_timeout,
|
||||
allow_redirects=True,
|
||||
verify=False, # noqa: SEC047 passive scan, not sending sensitive data
|
||||
)
|
||||
html = response.text.lower()
|
||||
headers = dict(response.headers)
|
||||
|
||||
cms = self._detect_cms(html)
|
||||
js_framework = self._detect_js_framework(html)
|
||||
analytics = self._detect_analytics(html)
|
||||
server = headers.get("Server", "").split("/")[0] if "Server" in headers else None
|
||||
server_version = None
|
||||
if server and "/" in headers.get("Server", ""):
|
||||
server_version = headers["Server"].split("/", 1)[1].strip()
|
||||
|
||||
# SSL certificate check
|
||||
has_valid_cert = None
|
||||
cert_issuer = None
|
||||
cert_expires_at = None
|
||||
if prospect.uses_https:
|
||||
try:
|
||||
ctx = ssl.create_default_context()
|
||||
with ctx.wrap_socket(
|
||||
socket.create_connection((domain, 443), timeout=5),
|
||||
server_hostname=domain,
|
||||
) as sock:
|
||||
cert = sock.getpeercert()
|
||||
has_valid_cert = True
|
||||
cert_issuer = dict(x[0] for x in cert.get("issuer", [()])).get("organizationName")
|
||||
not_after = cert.get("notAfter")
|
||||
if not_after:
|
||||
cert_expires_at = datetime.strptime(not_after, "%b %d %H:%M:%S %Y %Z")
|
||||
except Exception:
|
||||
has_valid_cert = False
|
||||
|
||||
# Upsert tech profile
|
||||
profile = prospect.tech_profile
|
||||
if not profile:
|
||||
profile = ProspectTechProfile(prospect_id=prospect.id)
|
||||
db.add(profile)
|
||||
|
||||
profile.cms = cms
|
||||
profile.server = server
|
||||
profile.server_version = server_version
|
||||
profile.js_framework = js_framework
|
||||
profile.analytics = analytics
|
||||
profile.has_valid_cert = has_valid_cert
|
||||
profile.cert_issuer = cert_issuer
|
||||
profile.cert_expires_at = cert_expires_at
|
||||
profile.scan_source = "basic_http"
|
||||
|
||||
prospect.last_tech_scan_at = datetime.now(UTC)
|
||||
db.commit()
|
||||
return profile
|
||||
|
||||
except Exception as e:
|
||||
logger.error("Tech scan failed for %s: %s", domain, e)
|
||||
if prospect.tech_profile:
|
||||
prospect.tech_profile.scan_error = str(e)
|
||||
prospect.last_tech_scan_at = datetime.now(UTC)
|
||||
db.commit()
|
||||
return None
|
||||
|
||||
def scan_performance(self, db: Session, prospect: Prospect) -> ProspectPerformanceProfile | None:
|
||||
"""Run PageSpeed Insights audit for a prospect's website."""
|
||||
domain = prospect.domain_name
|
||||
if not domain or not prospect.has_website:
|
||||
return None
|
||||
|
||||
scheme = "https" if prospect.uses_https else "http"
|
||||
url = f"{scheme}://{domain}"
|
||||
|
||||
api_url = "https://www.googleapis.com/pagespeedonline/v5/runPagespeed"
|
||||
params = {
|
||||
"url": url,
|
||||
"strategy": "mobile",
|
||||
"category": ["performance", "accessibility", "best-practices", "seo"],
|
||||
}
|
||||
if config.pagespeed_api_key:
|
||||
params["key"] = config.pagespeed_api_key
|
||||
|
||||
try:
|
||||
response = requests.get(api_url, params=params, timeout=60)
|
||||
data = response.json()
|
||||
|
||||
lighthouse = data.get("lighthouseResult", {})
|
||||
categories = lighthouse.get("categories", {})
|
||||
audits = lighthouse.get("audits", {})
|
||||
|
||||
perf_score = int((categories.get("performance", {}).get("score") or 0) * 100)
|
||||
accessibility = int((categories.get("accessibility", {}).get("score") or 0) * 100)
|
||||
best_practices = int((categories.get("best-practices", {}).get("score") or 0) * 100)
|
||||
seo = int((categories.get("seo", {}).get("score") or 0) * 100)
|
||||
|
||||
# Upsert performance profile
|
||||
profile = prospect.performance_profile
|
||||
if not profile:
|
||||
profile = ProspectPerformanceProfile(prospect_id=prospect.id)
|
||||
db.add(profile)
|
||||
|
||||
profile.performance_score = perf_score
|
||||
profile.accessibility_score = accessibility
|
||||
profile.best_practices_score = best_practices
|
||||
profile.seo_score = seo
|
||||
|
||||
# Core Web Vitals
|
||||
fcp = audits.get("first-contentful-paint", {}).get("numericValue")
|
||||
profile.first_contentful_paint_ms = int(fcp) if fcp else None
|
||||
lcp = audits.get("largest-contentful-paint", {}).get("numericValue")
|
||||
profile.largest_contentful_paint_ms = int(lcp) if lcp else None
|
||||
tbt = audits.get("total-blocking-time", {}).get("numericValue")
|
||||
profile.total_blocking_time_ms = int(tbt) if tbt else None
|
||||
cls_val = audits.get("cumulative-layout-shift", {}).get("numericValue")
|
||||
profile.cumulative_layout_shift = cls_val
|
||||
si = audits.get("speed-index", {}).get("numericValue")
|
||||
profile.speed_index = int(si) if si else None
|
||||
tti = audits.get("interactive", {}).get("numericValue")
|
||||
profile.time_to_interactive_ms = int(tti) if tti else None
|
||||
|
||||
# Mobile-friendly check
|
||||
viewport = audits.get("viewport", {}).get("score")
|
||||
profile.viewport_configured = viewport == 1 if viewport is not None else None
|
||||
profile.is_mobile_friendly = profile.viewport_configured
|
||||
profile.scan_strategy = "mobile"
|
||||
|
||||
prospect.last_perf_scan_at = datetime.now(UTC)
|
||||
db.commit()
|
||||
return profile
|
||||
|
||||
except Exception as e:
|
||||
logger.error("Performance scan failed for %s: %s", domain, e)
|
||||
prospect.last_perf_scan_at = datetime.now(UTC)
|
||||
db.commit()
|
||||
return None
|
||||
|
||||
def scrape_contacts(self, db: Session, prospect: Prospect) -> list[ProspectContact]:
|
||||
"""Scrape email and phone contacts from prospect's website."""
|
||||
domain = prospect.domain_name
|
||||
if not domain or not prospect.has_website:
|
||||
return []
|
||||
|
||||
scheme = "https" if prospect.uses_https else "http"
|
||||
base_url = f"{scheme}://{domain}"
|
||||
paths = ["", "/contact", "/kontakt", "/impressum", "/about"]
|
||||
|
||||
email_pattern = re.compile(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}")
|
||||
phone_pattern = re.compile(r"(?:\+352|00352)?[\s.-]?\d{2,3}[\s.-]?\d{2,3}[\s.-]?\d{2,3}")
|
||||
|
||||
false_positive_domains = {"example.com", "email.com", "domain.com", "wordpress.org", "w3.org", "schema.org"}
|
||||
found_emails = set()
|
||||
found_phones = set()
|
||||
contacts = []
|
||||
|
||||
session = requests.Session()
|
||||
session.verify = False # noqa: SEC047 passive scan, not sending sensitive data
|
||||
session.headers.update({"User-Agent": "Mozilla/5.0 (compatible; OrionBot/1.0)"})
|
||||
|
||||
for path in paths:
|
||||
try:
|
||||
url = base_url + path
|
||||
response = session.get(url, timeout=config.http_timeout, allow_redirects=True)
|
||||
if response.status_code != 200:
|
||||
continue
|
||||
html = response.text
|
||||
|
||||
for email in email_pattern.findall(html):
|
||||
email_domain = email.split("@")[1].lower()
|
||||
if email_domain not in false_positive_domains and email not in found_emails:
|
||||
found_emails.add(email)
|
||||
contacts.append(ProspectContact(
|
||||
prospect_id=prospect.id,
|
||||
contact_type="email",
|
||||
value=email.lower(),
|
||||
source_url=url,
|
||||
source_element="regex",
|
||||
))
|
||||
|
||||
for phone in phone_pattern.findall(html):
|
||||
phone_clean = re.sub(r"[\s.-]", "", phone)
|
||||
if len(phone_clean) >= 8 and phone_clean not in found_phones:
|
||||
found_phones.add(phone_clean)
|
||||
contacts.append(ProspectContact(
|
||||
prospect_id=prospect.id,
|
||||
contact_type="phone",
|
||||
value=phone_clean,
|
||||
source_url=url,
|
||||
source_element="regex",
|
||||
))
|
||||
except Exception as e:
|
||||
logger.debug("Contact scrape failed for %s%s: %s", domain, path, e)
|
||||
|
||||
session.close()
|
||||
|
||||
# Save contacts (replace existing auto-scraped ones)
|
||||
db.query(ProspectContact).filter(
|
||||
ProspectContact.prospect_id == prospect.id,
|
||||
ProspectContact.source_element == "regex",
|
||||
).delete()
|
||||
|
||||
for contact in contacts:
|
||||
db.add(contact)
|
||||
|
||||
# Mark first email and phone as primary
|
||||
if contacts:
|
||||
for c in contacts:
|
||||
if c.contact_type == "email":
|
||||
c.is_primary = True
|
||||
break
|
||||
for c in contacts:
|
||||
if c.contact_type == "phone":
|
||||
c.is_primary = True
|
||||
break
|
||||
|
||||
prospect.last_contact_scrape_at = datetime.now(UTC)
|
||||
db.commit()
|
||||
return contacts
|
||||
|
||||
def _detect_cms(self, html: str) -> str | None:
|
||||
for cms, patterns in CMS_PATTERNS.items():
|
||||
for pattern in patterns:
|
||||
if re.search(pattern, html):
|
||||
return cms
|
||||
return None
|
||||
|
||||
def _detect_js_framework(self, html: str) -> str | None:
|
||||
for framework, patterns in JS_FRAMEWORK_PATTERNS.items():
|
||||
for pattern in patterns:
|
||||
if re.search(pattern, html):
|
||||
return framework
|
||||
return None
|
||||
|
||||
def _detect_analytics(self, html: str) -> str | None:
|
||||
found = []
|
||||
for tool, patterns in ANALYTICS_PATTERNS.items():
|
||||
for pattern in patterns:
|
||||
if re.search(pattern, html):
|
||||
found.append(tool)
|
||||
break
|
||||
return ",".join(found) if found else None
|
||||
|
||||
|
||||
# Module-level EnrichmentService instance, created once at import time.
enrichment_service = EnrichmentService()
|
||||
Reference in New Issue
Block a user