Some checks failed
Move db.commit() from services to API endpoints and Celery tasks. Services now use db.flush() only; endpoints own the transaction. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
370 lines
14 KiB
Python
370 lines
14 KiB
Python
# app/modules/prospecting/services/enrichment_service.py
|
|
"""
|
|
Enrichment service for prospect scanning pipeline.
|
|
|
|
Migrated from marketing-.lu-domains/app/services/enrichment_service.py.
|
|
Performs passive HTTP checks, technology detection, performance audits,
|
|
and contact scraping for digital prospects.
|
|
|
|
Uses `requests` (sync) to match Orion's tech stack.
|
|
"""
|
|
|
|
import logging
|
|
import re
|
|
import socket
|
|
import ssl
|
|
from datetime import UTC, datetime
|
|
|
|
import requests
|
|
from sqlalchemy.orm import Session
|
|
|
|
from app.modules.prospecting.config import config
|
|
from app.modules.prospecting.models import (
|
|
Prospect,
|
|
ProspectContact,
|
|
ProspectPerformanceProfile,
|
|
ProspectTechProfile,
|
|
)
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# CMS detection patterns.
#
# Maps a CMS identifier to the regex fragments that reveal it in page HTML.
# The patterns are searched in HTML that scan_tech_stack has already
# lower-cased, so every pattern must be written in lower case. Dict order is
# the detection priority: the first CMS with a matching pattern wins.
CMS_PATTERNS = {
    "wordpress": [r"wp-content", r"wp-includes", r"wordpress"],
    "drupal": [r"drupal", r"sites/default", r"sites/all"],
    "joomla": [r"/media/jui/", r"joomla", r"/components/com_"],
    "shopify": [r"cdn\.shopify\.com", r"shopify"],
    "wix": [r"wix\.com", r"wixstatic\.com", r"parastorage\.com"],
    "squarespace": [r"squarespace\.com", r"sqsp\.com"],
    "webflow": [r"webflow\.com", r"webflow\.io"],
    "typo3": [r"typo3", r"/typo3conf/"],
    "prestashop": [r"prestashop", r"/modules/ps_"],
    # `\b` keeps "mage/" from matching inside "image/" (favicon MIME types,
    # data URIs), which would otherwise flag almost any page as Magento.
    "magento": [r"magento", r"\bmage/", r"/static/version"],
}
|
|
|
|
# JavaScript framework detection patterns.
#
# Searched in HTML that scan_tech_stack has already lower-cased, so every
# pattern must be lower case (an upper-case literal can never match). Dict
# order is the detection priority: the first framework with a hit wins.
JS_FRAMEWORK_PATTERNS = {
    # "__next_data__" / "_next/" are Next.js markers, reported as React.
    "react": [r"react", r"__next_data__", r"_next/"],
    "vue": [r"vue\.js", r"vue\.min\.js", r"__vue__"],
    "angular": [r"angular", r"ng-version"],
    "jquery": [r"jquery"],
    "alpine": [r"alpine\.js", r"alpinejs"],
}
|
|
|
|
# Analytics / tracking tool detection patterns.
#
# Searched in HTML that scan_tech_stack has already lower-cased, so every
# pattern must be lower case. Unlike the CMS and framework tables, all tools
# with at least one matching pattern are reported, in dict order.
ANALYTICS_PATTERNS = {
    "google_analytics": [r"google-analytics\.com", r"gtag/js", r"ga\.js"],
    "google_tag_manager": [r"googletagmanager\.com", r"gtm\.js"],
    "matomo": [r"matomo", r"piwik"],
    # Lower-cased form of ".../en_US/fbevents.js" to match the lowered HTML.
    "facebook_pixel": [r"facebook\.net/en_us/fbevents"],
}
|
|
|
|
|
|
class EnrichmentService:
|
|
"""Service for prospect enrichment via passive scanning."""
|
|
|
|
def check_http(self, db: Session, prospect: Prospect) -> dict:
|
|
"""Check HTTP connectivity for a prospect's domain."""
|
|
result = {
|
|
"has_website": False,
|
|
"uses_https": False,
|
|
"http_status_code": None,
|
|
"redirect_url": None,
|
|
"error": None,
|
|
}
|
|
|
|
domain = prospect.domain_name
|
|
if not domain:
|
|
result["error"] = "No domain name"
|
|
return result
|
|
|
|
# Try HTTPS first, then HTTP
|
|
for scheme in ["https", "http"]:
|
|
try:
|
|
url = f"{scheme}://{domain}"
|
|
response = requests.get(
|
|
url,
|
|
timeout=config.http_timeout,
|
|
allow_redirects=True,
|
|
verify=False, # noqa: SEC047 passive scan, not sending sensitive data
|
|
)
|
|
result["has_website"] = True
|
|
result["uses_https"] = scheme == "https"
|
|
result["http_status_code"] = response.status_code
|
|
if response.url != url:
|
|
result["redirect_url"] = str(response.url)
|
|
break
|
|
except requests.exceptions.Timeout:
|
|
result["error"] = f"Timeout on {scheme}"
|
|
except requests.exceptions.RequestException as e:
|
|
result["error"] = str(e)
|
|
if scheme == "https":
|
|
continue
|
|
break
|
|
|
|
# Update prospect
|
|
prospect.has_website = result["has_website"]
|
|
prospect.uses_https = result["uses_https"]
|
|
prospect.http_status_code = result["http_status_code"]
|
|
prospect.redirect_url = result["redirect_url"]
|
|
prospect.last_http_check_at = datetime.now(UTC)
|
|
|
|
if result["has_website"]:
|
|
prospect.status = "active"
|
|
|
|
db.flush()
|
|
return result
|
|
|
|
def scan_tech_stack(self, db: Session, prospect: Prospect) -> ProspectTechProfile | None:
|
|
"""Scan technology stack from prospect's website HTML."""
|
|
domain = prospect.domain_name
|
|
if not domain or not prospect.has_website:
|
|
return None
|
|
|
|
scheme = "https" if prospect.uses_https else "http"
|
|
url = f"{scheme}://{domain}"
|
|
|
|
try:
|
|
response = requests.get(
|
|
url,
|
|
timeout=config.http_timeout,
|
|
allow_redirects=True,
|
|
verify=False, # noqa: SEC047 passive scan, not sending sensitive data
|
|
)
|
|
html = response.text.lower()
|
|
headers = dict(response.headers)
|
|
|
|
cms = self._detect_cms(html)
|
|
js_framework = self._detect_js_framework(html)
|
|
analytics = self._detect_analytics(html)
|
|
server = headers.get("Server", "").split("/")[0] if "Server" in headers else None
|
|
server_version = None
|
|
if server and "/" in headers.get("Server", ""):
|
|
server_version = headers["Server"].split("/", 1)[1].strip()
|
|
|
|
# SSL certificate check
|
|
has_valid_cert = None
|
|
cert_issuer = None
|
|
cert_expires_at = None
|
|
if prospect.uses_https:
|
|
try:
|
|
ctx = ssl.create_default_context()
|
|
with ctx.wrap_socket(
|
|
socket.create_connection((domain, 443), timeout=5),
|
|
server_hostname=domain,
|
|
) as sock:
|
|
cert = sock.getpeercert()
|
|
has_valid_cert = True
|
|
cert_issuer = dict(x[0] for x in cert.get("issuer", [()])).get("organizationName")
|
|
not_after = cert.get("notAfter")
|
|
if not_after:
|
|
cert_expires_at = datetime.strptime(not_after, "%b %d %H:%M:%S %Y %Z")
|
|
except Exception:
|
|
has_valid_cert = False
|
|
|
|
# Upsert tech profile
|
|
profile = prospect.tech_profile
|
|
if not profile:
|
|
profile = ProspectTechProfile(prospect_id=prospect.id)
|
|
db.add(profile)
|
|
|
|
profile.cms = cms
|
|
profile.server = server
|
|
profile.server_version = server_version
|
|
profile.js_framework = js_framework
|
|
profile.analytics = analytics
|
|
profile.has_valid_cert = has_valid_cert
|
|
profile.cert_issuer = cert_issuer
|
|
profile.cert_expires_at = cert_expires_at
|
|
profile.scan_source = "basic_http"
|
|
|
|
prospect.last_tech_scan_at = datetime.now(UTC)
|
|
db.flush()
|
|
return profile
|
|
|
|
except Exception as e:
|
|
logger.error("Tech scan failed for %s: %s", domain, e)
|
|
if prospect.tech_profile:
|
|
prospect.tech_profile.scan_error = str(e)
|
|
prospect.last_tech_scan_at = datetime.now(UTC)
|
|
db.flush()
|
|
return None
|
|
|
|
def scan_performance(self, db: Session, prospect: Prospect) -> ProspectPerformanceProfile | None:
|
|
"""Run PageSpeed Insights audit for a prospect's website."""
|
|
domain = prospect.domain_name
|
|
if not domain or not prospect.has_website:
|
|
return None
|
|
|
|
scheme = "https" if prospect.uses_https else "http"
|
|
url = f"{scheme}://{domain}"
|
|
|
|
api_url = "https://www.googleapis.com/pagespeedonline/v5/runPagespeed"
|
|
params = {
|
|
"url": url,
|
|
"strategy": "mobile",
|
|
"category": ["performance", "accessibility", "best-practices", "seo"],
|
|
}
|
|
if config.pagespeed_api_key:
|
|
params["key"] = config.pagespeed_api_key
|
|
|
|
try:
|
|
response = requests.get(api_url, params=params, timeout=60)
|
|
data = response.json()
|
|
|
|
lighthouse = data.get("lighthouseResult", {})
|
|
categories = lighthouse.get("categories", {})
|
|
audits = lighthouse.get("audits", {})
|
|
|
|
perf_score = int((categories.get("performance", {}).get("score") or 0) * 100)
|
|
accessibility = int((categories.get("accessibility", {}).get("score") or 0) * 100)
|
|
best_practices = int((categories.get("best-practices", {}).get("score") or 0) * 100)
|
|
seo = int((categories.get("seo", {}).get("score") or 0) * 100)
|
|
|
|
# Upsert performance profile
|
|
profile = prospect.performance_profile
|
|
if not profile:
|
|
profile = ProspectPerformanceProfile(prospect_id=prospect.id)
|
|
db.add(profile)
|
|
|
|
profile.performance_score = perf_score
|
|
profile.accessibility_score = accessibility
|
|
profile.best_practices_score = best_practices
|
|
profile.seo_score = seo
|
|
|
|
# Core Web Vitals
|
|
fcp = audits.get("first-contentful-paint", {}).get("numericValue")
|
|
profile.first_contentful_paint_ms = int(fcp) if fcp else None
|
|
lcp = audits.get("largest-contentful-paint", {}).get("numericValue")
|
|
profile.largest_contentful_paint_ms = int(lcp) if lcp else None
|
|
tbt = audits.get("total-blocking-time", {}).get("numericValue")
|
|
profile.total_blocking_time_ms = int(tbt) if tbt else None
|
|
cls_val = audits.get("cumulative-layout-shift", {}).get("numericValue")
|
|
profile.cumulative_layout_shift = cls_val
|
|
si = audits.get("speed-index", {}).get("numericValue")
|
|
profile.speed_index = int(si) if si else None
|
|
tti = audits.get("interactive", {}).get("numericValue")
|
|
profile.time_to_interactive_ms = int(tti) if tti else None
|
|
|
|
# Mobile-friendly check
|
|
viewport = audits.get("viewport", {}).get("score")
|
|
profile.viewport_configured = viewport == 1 if viewport is not None else None
|
|
profile.is_mobile_friendly = profile.viewport_configured
|
|
profile.scan_strategy = "mobile"
|
|
|
|
prospect.last_perf_scan_at = datetime.now(UTC)
|
|
db.flush()
|
|
return profile
|
|
|
|
except Exception as e:
|
|
logger.error("Performance scan failed for %s: %s", domain, e)
|
|
prospect.last_perf_scan_at = datetime.now(UTC)
|
|
db.flush()
|
|
return None
|
|
|
|
def scrape_contacts(self, db: Session, prospect: Prospect) -> list[ProspectContact]:
|
|
"""Scrape email and phone contacts from prospect's website."""
|
|
domain = prospect.domain_name
|
|
if not domain or not prospect.has_website:
|
|
return []
|
|
|
|
scheme = "https" if prospect.uses_https else "http"
|
|
base_url = f"{scheme}://{domain}"
|
|
paths = ["", "/contact", "/kontakt", "/impressum", "/about"]
|
|
|
|
email_pattern = re.compile(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}")
|
|
phone_pattern = re.compile(r"(?:\+352|00352)?[\s.-]?\d{2,3}[\s.-]?\d{2,3}[\s.-]?\d{2,3}")
|
|
|
|
false_positive_domains = {"example.com", "email.com", "domain.com", "wordpress.org", "w3.org", "schema.org"}
|
|
found_emails = set()
|
|
found_phones = set()
|
|
contacts = []
|
|
|
|
session = requests.Session()
|
|
session.verify = False # noqa: SEC047 passive scan, not sending sensitive data
|
|
session.headers.update({"User-Agent": "Mozilla/5.0 (compatible; OrionBot/1.0)"})
|
|
|
|
for path in paths:
|
|
try:
|
|
url = base_url + path
|
|
response = session.get(url, timeout=config.http_timeout, allow_redirects=True)
|
|
if response.status_code != 200:
|
|
continue
|
|
html = response.text
|
|
|
|
for email in email_pattern.findall(html):
|
|
email_domain = email.split("@")[1].lower()
|
|
if email_domain not in false_positive_domains and email not in found_emails:
|
|
found_emails.add(email)
|
|
contacts.append(ProspectContact(
|
|
prospect_id=prospect.id,
|
|
contact_type="email",
|
|
value=email.lower(),
|
|
source_url=url,
|
|
source_element="regex",
|
|
))
|
|
|
|
for phone in phone_pattern.findall(html):
|
|
phone_clean = re.sub(r"[\s.-]", "", phone)
|
|
if len(phone_clean) >= 8 and phone_clean not in found_phones:
|
|
found_phones.add(phone_clean)
|
|
contacts.append(ProspectContact(
|
|
prospect_id=prospect.id,
|
|
contact_type="phone",
|
|
value=phone_clean,
|
|
source_url=url,
|
|
source_element="regex",
|
|
))
|
|
except Exception as e:
|
|
logger.debug("Contact scrape failed for %s%s: %s", domain, path, e)
|
|
|
|
session.close()
|
|
|
|
# Save contacts (replace existing auto-scraped ones)
|
|
db.query(ProspectContact).filter(
|
|
ProspectContact.prospect_id == prospect.id,
|
|
ProspectContact.source_element == "regex",
|
|
).delete()
|
|
|
|
for contact in contacts:
|
|
db.add(contact)
|
|
|
|
# Mark first email and phone as primary
|
|
if contacts:
|
|
for c in contacts:
|
|
if c.contact_type == "email":
|
|
c.is_primary = True
|
|
break
|
|
for c in contacts:
|
|
if c.contact_type == "phone":
|
|
c.is_primary = True
|
|
break
|
|
|
|
prospect.last_contact_scrape_at = datetime.now(UTC)
|
|
db.flush()
|
|
return contacts
|
|
|
|
def _detect_cms(self, html: str) -> str | None:
|
|
for cms, patterns in CMS_PATTERNS.items():
|
|
for pattern in patterns:
|
|
if re.search(pattern, html):
|
|
return cms
|
|
return None
|
|
|
|
def _detect_js_framework(self, html: str) -> str | None:
|
|
for framework, patterns in JS_FRAMEWORK_PATTERNS.items():
|
|
for pattern in patterns:
|
|
if re.search(pattern, html):
|
|
return framework
|
|
return None
|
|
|
|
def _detect_analytics(self, html: str) -> str | None:
|
|
found = []
|
|
for tool, patterns in ANALYTICS_PATTERNS.items():
|
|
for pattern in patterns:
|
|
if re.search(pattern, html):
|
|
found.append(tool)
|
|
break
|
|
return ",".join(found) if found else None
|
|
|
|
|
|
# Module-level EnrichmentService instance, created once at import time.
enrichment_service = EnrichmentService()
|