# orion/app/modules/prospecting/services/enrichment_service.py

# app/modules/prospecting/services/enrichment_service.py
"""
Enrichment service for prospect scanning pipeline.

Migrated from marketing-.lu-domains/app/services/enrichment_service.py.
Performs passive HTTP checks, technology detection, performance audits,
and contact scraping for digital prospects.

Uses `requests` (sync) to match Orion's tech stack.
"""

import logging
import re
import socket
import ssl
from datetime import UTC, datetime

import requests
from sqlalchemy.orm import Session

from app.modules.prospecting.config import config
from app.modules.prospecting.models import (
    Prospect,
    ProspectContact,
    ProspectPerformanceProfile,
    ProspectTechProfile,
)

# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)

# CMS detection patterns.
# Maps a CMS name to regex fragments that EnrichmentService._detect_cms
# searches for in the page HTML. Entries are tried in insertion order and the
# first CMS with any matching pattern is returned, so the order of this dict
# is the detection priority.
CMS_PATTERNS = {
    "wordpress": [r"wp-content", r"wp-includes", r"wordpress"],
    "drupal": [r"drupal", r"sites/default", r"sites/all"],
    "joomla": [r"/media/jui/", r"joomla", r"/components/com_"],
    "shopify": [r"cdn\.shopify\.com", r"shopify"],
    "wix": [r"wix\.com", r"wixstatic\.com", r"parastorage\.com"],
    "squarespace": [r"squarespace\.com", r"sqsp\.com"],
    "webflow": [r"webflow\.com", r"webflow\.io"],
    "typo3": [r"typo3", r"/typo3conf/"],
    "prestashop": [r"prestashop", r"/modules/ps_"],
    "magento": [r"magento", r"mage/", r"/static/version"],
}

# JavaScript framework detection patterns.
# Maps a framework name to regex fragments searched for in the page HTML by
# EnrichmentService._detect_js_framework; the first framework (in insertion
# order) with any matching pattern is returned.
# NOTE: scan_tech_stack lower-cases the HTML before detection, so a pattern
# containing uppercase characters (e.g. "__NEXT_DATA__") can only match when
# the matcher ignores case.
JS_FRAMEWORK_PATTERNS = {
    "react": [r"react", r"__NEXT_DATA__", r"_next/"],
    "vue": [r"vue\.js", r"vue\.min\.js", r"__vue__"],
    "angular": [r"angular", r"ng-version"],
    "jquery": [r"jquery"],
    "alpine": [r"alpine\.js", r"alpinejs"],
}

# Analytics tool detection patterns.
# Maps a tool name to regex fragments searched for in the page HTML by
# EnrichmentService._detect_analytics. Unlike CMS/framework detection, every
# tool with at least one matching pattern is reported (comma-joined).
# NOTE: scan_tech_stack lower-cases the HTML before detection, so a pattern
# containing uppercase characters (e.g. "en_US") can only match when the
# matcher ignores case.
ANALYTICS_PATTERNS = {
    "google_analytics": [r"google-analytics\.com", r"gtag/js", r"ga\.js"],
    "google_tag_manager": [r"googletagmanager\.com", r"gtm\.js"],
    "matomo": [r"matomo", r"piwik"],
    "facebook_pixel": [r"facebook\.net/en_US/fbevents"],
}


class EnrichmentService:
    """Service for prospect enrichment via passive scanning."""

    def check_http(self, db: Session, prospect: Prospect) -> dict:
        """Check HTTP connectivity for a prospect's domain.

        Requests ``https://<domain>`` first and falls back to
        ``http://<domain>`` when the HTTPS attempt times out or fails. The
        outcome is written onto ``prospect`` and flushed through ``db``.

        Args:
            db: Session used to flush the updated prospect.
            prospect: Prospect whose ``domain_name`` is probed; its HTTP
                fields, ``last_http_check_at`` and (on success) ``status``
                are updated in place.

        Returns:
            Dict with the keys ``has_website``, ``uses_https``,
            ``http_status_code``, ``redirect_url`` and ``error``. ``error``
            is ``None`` whenever one of the schemes answered. When the
            prospect has no domain name the dict is returned with ``error``
            set and the prospect is left untouched.
        """
        result = {
            "has_website": False,
            "uses_https": False,
            "http_status_code": None,
            "redirect_url": None,
            "error": None,
        }

        domain = prospect.domain_name
        if not domain:
            result["error"] = "No domain name"
            return result

        # Try HTTPS first, then HTTP
        for scheme in ("https", "http"):
            url = f"{scheme}://{domain}"
            try:
                response = requests.get(
                    url,
                    timeout=config.http_timeout,
                    allow_redirects=True,
                    verify=False,  # noqa: SEC047 passive scan, not sending sensitive data
                )
            except requests.exceptions.Timeout:
                # Remember the failure and fall through to the next scheme
                result["error"] = f"Timeout on {scheme}"
                continue
            except requests.exceptions.RequestException as e:
                result["error"] = str(e)
                continue

            result["has_website"] = True
            result["uses_https"] = scheme == "https"
            result["http_status_code"] = response.status_code
            # requests normalises an empty path to "/", so comparing
            # response.url with the requested URL reports a redirect for every
            # site. The redirect chain in response.history is the reliable
            # signal that a redirect actually happened.
            if response.history:
                result["redirect_url"] = str(response.url)
            # Drop any error recorded by a failed HTTPS attempt: the site
            # answered, so the overall check succeeded.
            result["error"] = None
            break

        # Update prospect
        prospect.has_website = result["has_website"]
        prospect.uses_https = result["uses_https"]
        prospect.http_status_code = result["http_status_code"]
        prospect.redirect_url = result["redirect_url"]
        prospect.last_http_check_at = datetime.now(UTC)

        if result["has_website"]:
            prospect.status = "active"

        db.flush()
        return result

    def scan_tech_stack(self, db: Session, prospect: Prospect) -> ProspectTechProfile | None:
        """Scan technology stack from prospect's website HTML.

        Fetches the prospect's home page, detects CMS / JS framework /
        analytics from the lower-cased HTML, reads the ``Server`` response
        header and, for HTTPS sites, performs a verified TLS handshake to
        collect certificate details. The result is upserted into the
        prospect's tech profile and flushed.

        Args:
            db: Session used to add/flush the profile.
            prospect: Prospect to scan; must have a domain name and
                ``has_website`` set, otherwise nothing is done.

        Returns:
            The created or updated tech profile, or ``None`` when the
            prospect is not scannable or the scan failed. On failure the
            error text is stored on an existing profile (if any) and
            ``last_tech_scan_at`` is still updated.
        """
        domain = prospect.domain_name
        if not domain or not prospect.has_website:
            return None

        scheme = "https" if prospect.uses_https else "http"
        url = f"{scheme}://{domain}"

        try:
            response = requests.get(
                url,
                timeout=config.http_timeout,
                allow_redirects=True,
                verify=False,  # noqa: SEC047 passive scan, not sending sensitive data
            )
            html = response.text.lower()

            cms = self._detect_cms(html)
            js_framework = self._detect_js_framework(html)
            analytics = self._detect_analytics(html)

            # Read the header from response.headers directly: it is
            # case-insensitive, whereas a plain dict copy would miss servers
            # that send a lower-case "server:" header.
            server = None
            server_version = None
            server_header = response.headers.get("Server")
            if server_header is not None:
                # "nginx/1.25.3" -> ("nginx", "1.25.3"); "cloudflare" -> ("cloudflare", None)
                server, slash, version = server_header.partition("/")
                if server and slash:
                    server_version = version.strip()

            # SSL certificate check
            has_valid_cert = None
            cert_issuer = None
            cert_expires_at = None
            if prospect.uses_https:
                cert = {}
                try:
                    ctx = ssl.create_default_context()
                    # Both sockets are context-managed so the connection is
                    # released even when the handshake fails.
                    with socket.create_connection((domain, 443), timeout=5) as raw_sock:
                        with ctx.wrap_socket(raw_sock, server_hostname=domain) as tls_sock:
                            cert = tls_sock.getpeercert() or {}
                    has_valid_cert = True
                except Exception:  # noqa: EXC003
                    # Any connection or verification failure counts as "no valid cert"
                    has_valid_cert = False

                if has_valid_cert:
                    # "issuer" is a sequence of RDNs, each a tuple of (key, value)
                    # pairs; a certificate without issuer data must not be
                    # reported as invalid.
                    issuer = dict(rdn[0] for rdn in cert.get("issuer", ()) if rdn)
                    cert_issuer = issuer.get("organizationName")
                    not_after = cert.get("notAfter")
                    if not_after:
                        try:
                            # Naive datetime parsed from e.g. "Jun  1 12:00:00 2025 GMT"
                            cert_expires_at = datetime.strptime(not_after, "%b %d %H:%M:%S %Y %Z")
                        except ValueError:
                            # Unparseable expiry does not invalidate the handshake result
                            cert_expires_at = None

            # Upsert tech profile
            profile = prospect.tech_profile
            if not profile:
                profile = ProspectTechProfile(prospect_id=prospect.id)
                db.add(profile)

            profile.cms = cms
            profile.server = server
            profile.server_version = server_version
            profile.js_framework = js_framework
            profile.analytics = analytics
            profile.has_valid_cert = has_valid_cert
            profile.cert_issuer = cert_issuer
            profile.cert_expires_at = cert_expires_at
            profile.scan_source = "basic_http"
            # Clear the error left behind by an earlier failed scan
            profile.scan_error = None

            prospect.last_tech_scan_at = datetime.now(UTC)
            db.flush()
            return profile

        except Exception as e:  # noqa: EXC003
            logger.error("Tech scan failed for %s: %s", domain, e)
            if prospect.tech_profile:
                prospect.tech_profile.scan_error = str(e)
            prospect.last_tech_scan_at = datetime.now(UTC)
            db.flush()
            return None

    def scan_performance(self, db: Session, prospect: Prospect) -> ProspectPerformanceProfile | None:
        """Run PageSpeed Insights audit for a prospect's website.

        Calls the PageSpeed Insights v5 API with the mobile strategy, then
        upserts category scores (0-100), timing metrics and the viewport
        check into the prospect's performance profile.

        Args:
            db: Session used to add/flush the profile.
            prospect: Prospect to audit; must have a domain name and
                ``has_website`` set, otherwise nothing is done.

        Returns:
            The created or updated performance profile, or ``None`` when the
            prospect is not scannable or the audit failed (HTTP error,
            response without Lighthouse data, network failure).
            ``last_perf_scan_at`` is updated in both the success and the
            failure case.
        """
        domain = prospect.domain_name
        if not domain or not prospect.has_website:
            return None

        scheme = "https" if prospect.uses_https else "http"
        url = f"{scheme}://{domain}"

        api_url = "https://www.googleapis.com/pagespeedonline/v5/runPagespeed"
        params = {
            "url": url,
            "strategy": "mobile",
            "category": ["performance", "accessibility", "best-practices", "seo"],
        }
        if config.pagespeed_api_key:
            params["key"] = config.pagespeed_api_key

        try:
            response = requests.get(api_url, params=params, timeout=60)
            # An error response carries no Lighthouse data; without this check
            # it would be stored as a genuine all-zero audit.
            response.raise_for_status()
            data = response.json()

            lighthouse = data.get("lighthouseResult")
            if not lighthouse:
                raise ValueError("PageSpeed response contains no lighthouseResult")
            categories = lighthouse.get("categories") or {}
            audits = lighthouse.get("audits") or {}

            def _percent(category: str) -> int:
                """Convert a 0-1 category score to 0-100 (missing score -> 0)."""
                score = (categories.get(category) or {}).get("score")
                # round(), not int(): 0.57 * 100 is 56.999... and would truncate to 56
                return round((score or 0) * 100)

            def _numeric(audit_id: str) -> float | None:
                """Return the audit's raw numericValue, or None when absent."""
                return (audits.get(audit_id) or {}).get("numericValue")

            def _millis(audit_id: str) -> int | None:
                """Return the audit's numericValue as int, keeping a real 0."""
                value = _numeric(audit_id)
                return int(value) if value is not None else None

            # Upsert performance profile
            profile = prospect.performance_profile
            if not profile:
                profile = ProspectPerformanceProfile(prospect_id=prospect.id)
                db.add(profile)

            profile.performance_score = _percent("performance")
            profile.accessibility_score = _percent("accessibility")
            profile.best_practices_score = _percent("best-practices")
            profile.seo_score = _percent("seo")

            # Core Web Vitals
            profile.first_contentful_paint_ms = _millis("first-contentful-paint")
            profile.largest_contentful_paint_ms = _millis("largest-contentful-paint")
            profile.total_blocking_time_ms = _millis("total-blocking-time")
            profile.cumulative_layout_shift = _numeric("cumulative-layout-shift")
            profile.speed_index = _millis("speed-index")
            profile.time_to_interactive_ms = _millis("interactive")

            # Mobile-friendly check
            viewport = (audits.get("viewport") or {}).get("score")
            profile.viewport_configured = viewport == 1 if viewport is not None else None
            profile.is_mobile_friendly = profile.viewport_configured
            profile.scan_strategy = "mobile"

            prospect.last_perf_scan_at = datetime.now(UTC)
            db.flush()
            return profile

        except Exception as e:  # noqa: EXC003
            logger.error("Performance scan failed for %s: %s", domain, e)
            prospect.last_perf_scan_at = datetime.now(UTC)
            db.flush()
            return None

    def scrape_contacts(self, db: Session, prospect: Prospect) -> list[ProspectContact]:
        """Scrape email, phone and address contacts from prospect's website.

        Fetches the home page plus a fixed list of contact-style paths and
        runs three extraction phases on every page that answers with 200:
        1. Structured extraction from <a href="tel:..."> and <a href="mailto:..."> (high confidence)
        2. Regex fallback for emails and international phone numbers (stricter filtering)
        3. Postal addresses from Schema.org JSON-LD, <address> tags and a
           European street-address pattern

        Values are de-duplicated across pages. The prospect's previously
        auto-scraped contacts are deleted and replaced by the new ones; the
        first email and the first phone found are marked as primary.

        Args:
            db: Session used to delete old and add new contacts.
            prospect: Prospect to scrape; must have a domain name and
                ``has_website`` set, otherwise nothing is done.

        Returns:
            The newly created contacts (empty when nothing was found or the
            prospect is not scannable).
        """
        from urllib.parse import unquote

        domain = prospect.domain_name
        if not domain or not prospect.has_website:
            return []

        scheme = "https" if prospect.uses_https else "http"
        base_url = f"{scheme}://{domain}"
        paths = ["", "/contact", "/kontakt", "/impressum", "/about", "/mentions-legales"]

        # Structured patterns (from <a href> tags)
        tel_pattern = re.compile(r'href=["\']tel:([^"\'>\s]+)', re.IGNORECASE)
        mailto_pattern = re.compile(r'href=["\']mailto:([^"\'>\s?]+)', re.IGNORECASE)

        # Regex fallback patterns
        email_regex = re.compile(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}")
        # International phone: requires + prefix to avoid matching random digit sequences
        phone_regex = re.compile(
            r"\+\d{1,3}[\s.-]?\(?\d{1,4}\)?[\s.-]?\d{2,4}[\s.-]?\d{2,4}(?:[\s.-]?\d{2,4})?"
        )

        # Inline SVG/script/style blocks are removed before the regex fallback
        embedded_block_pattern = re.compile(
            r"<(svg|script|style)[^>]*>.*?</\1>", re.DOTALL | re.IGNORECASE
        )

        # Address patterns, compiled once instead of once per fetched page
        street_address_pattern = re.compile(r'"streetAddress"\s*:\s*"([^"]+)"')
        locality_pattern = re.compile(r'"addressLocality"\s*:\s*"([^"]+)"')
        postal_pattern = re.compile(r'"postalCode"\s*:\s*"([^"]+)"')
        address_tag_pattern = re.compile(r"<address[^>]*>(.*?)</address>", re.DOTALL | re.IGNORECASE)
        tag_pattern = re.compile(r"<[^>]+>")
        whitespace_pattern = re.compile(r"\s+")
        # European street address pattern (number + street keyword + postal code + city)
        street_keywords = (
            r"(?:rue|avenue|boulevard|allée|impasse|chemin|place|route|passage|quai|"
            r"straße|strasse|stra[ßs]e|weg|platz|gasse|"  # German
            r"street|road|lane|drive|way)"  # English
        )
        addr_pattern = re.compile(
            rf"\d{{1,4}}[\s,]+{street_keywords}\s[^<]{{3,60}}?\d{{4,5}}\s+[A-ZÀ-Ü][a-zà-ü]{{2,}}",
            re.IGNORECASE,
        )

        false_positive_domains = {
            "example.com", "email.com", "domain.com", "wordpress.org",
            "w3.org", "schema.org", "sentry.io", "googleapis.com",
        }
        # Asset file names such as "logo@2x.png" satisfy the email regex
        asset_suffixes = (".png", ".jpg", ".jpeg", ".gif", ".webp", ".svg")

        found_emails: set[str] = set()
        found_phones: set[str] = set()
        found_addresses: set[str] = set()
        contacts: list[ProspectContact] = []

        def _add_email(email: str, url: str, source: str) -> None:
            email = unquote(email).strip().lower()
            local_part, _, email_domain = email.partition("@")
            # A usable address needs text on both sides of the "@"
            # (rejects values like "mailto:info")
            if not local_part or not email_domain:
                return
            if email.endswith(asset_suffixes):
                return
            if email_domain in false_positive_domains or email in found_emails:
                return
            found_emails.add(email)
            contacts.append(ProspectContact(
                prospect_id=prospect.id,
                contact_type="email",
                value=email,
                source_url=url,
                source_element=source,
            ))

        def _add_phone(phone: str, url: str, source: str) -> None:
            # Normalise by dropping spaces, dots, parentheses and dashes
            phone_clean = re.sub(r"[\s.()\-]", "", phone)
            if len(phone_clean) < 10 or phone_clean in found_phones:
                return
            found_phones.add(phone_clean)
            contacts.append(ProspectContact(
                prospect_id=prospect.id,
                contact_type="phone",
                value=phone_clean,
                source_url=url,
                source_element=source,
            ))

        def _add_address(address: str, url: str, source: str) -> None:
            address = re.sub(r"\s+", " ", address).strip()
            if len(address) < 10 or address in found_addresses:
                return
            found_addresses.add(address)
            contacts.append(ProspectContact(
                prospect_id=prospect.id,
                contact_type="address",
                value=address,
                source_url=url,
                source_element=source,
            ))

        # The context manager closes the session even if something unexpected escapes the loop
        with requests.Session() as session:
            session.verify = False  # noqa: SEC047 passive scan, not sending sensitive data
            session.headers.update({"User-Agent": "Mozilla/5.0 (compatible; OrionBot/1.0)"})

            for path in paths:
                url = base_url + path
                try:
                    response = session.get(url, timeout=config.http_timeout, allow_redirects=True)
                    if response.status_code != 200:
                        continue
                    html = response.text

                    # Phase 1: structured extraction from href attributes
                    for phone in tel_pattern.findall(html):
                        _add_phone(unquote(phone), url, "tel_href")

                    for email in mailto_pattern.findall(html):
                        _add_email(email, url, "mailto_href")

                    # Phase 2: regex fallback — strip SVG/script content first
                    text_html = embedded_block_pattern.sub("", html)

                    for email in email_regex.findall(text_html):
                        _add_email(email, url, "regex")

                    for phone in phone_regex.findall(text_html):
                        _add_phone(phone, url, "regex")

                    # Phase 3: address extraction
                    # 3a: Schema.org JSON-LD
                    for m in street_address_pattern.finditer(html):
                        parts = [m.group(1)]
                        # Try to find locality/postal near the same JSON block
                        block_end = html[m.end():m.end() + 200]
                        locality = locality_pattern.search(block_end)
                        postal = postal_pattern.search(block_end)
                        if postal:
                            parts.append(postal.group(1))
                        if locality:
                            parts.append(locality.group(1))
                        _add_address(", ".join(parts), url, "schema_org")

                    # 3b: <address> HTML tag
                    for addr_match in address_tag_pattern.finditer(html):
                        clean = tag_pattern.sub(" ", addr_match.group(1))
                        clean = whitespace_pattern.sub(" ", clean).strip()
                        if clean:
                            _add_address(clean, url, "address_tag")

                    # 3c: European street address pattern
                    # Strip tags to plain text (replace tags with spaces for cross-element matching)
                    plain = tag_pattern.sub(" ", text_html)
                    plain = whitespace_pattern.sub(" ", plain)
                    for m in addr_pattern.finditer(plain):
                        _add_address(m.group(), url, "regex")

                except Exception as e:  # noqa: EXC003
                    # Best effort: a failing page must not abort the remaining paths
                    logger.debug("Contact scrape failed for %s%s: %s", domain, path, e)

        # Save contacts (replace existing auto-scraped ones)
        db.query(ProspectContact).filter(
            ProspectContact.prospect_id == prospect.id,
            ProspectContact.source_element.in_(["regex", "tel_href", "mailto_href", "schema_org", "address_tag"]),
        ).delete()

        db.add_all(contacts)

        # Mark first email and phone as primary
        for contact_type in ("email", "phone"):
            first = next((c for c in contacts if c.contact_type == contact_type), None)
            if first is not None:
                first.is_primary = True

        prospect.last_contact_scrape_at = datetime.now(UTC)
        db.flush()
        return contacts

    def _detect_cms(self, html: str) -> str | None:
        """Return the first CMS whose patterns match ``html``, else ``None``.

        CMS entries are tried in ``CMS_PATTERNS`` order and the search stops
        at the first pattern that matches. Matching is case-sensitive.
        """
        return next(
            (
                name
                for name, signatures in CMS_PATTERNS.items()
                if any(re.search(signature, html) for signature in signatures)
            ),
            None,
        )

    def _detect_js_framework(self, html: str) -> str | None:
        """Return the first JS framework whose patterns match ``html``.

        Frameworks are tried in ``JS_FRAMEWORK_PATTERNS`` order; ``None`` is
        returned when nothing matches.

        Matching ignores case: scan_tech_stack passes lower-cased HTML, so a
        case-sensitive search could never hit a pattern containing uppercase
        characters such as ``__NEXT_DATA__``.
        """
        for framework, patterns in JS_FRAMEWORK_PATTERNS.items():
            if any(re.search(pattern, html, re.IGNORECASE) for pattern in patterns):
                return framework
        return None

    def _detect_analytics(self, html: str) -> str | None:
        """Return all analytics tools detected in ``html`` as a comma-joined string.

        Every tool in ``ANALYTICS_PATTERNS`` with at least one matching
        pattern is reported once, in dict order; ``None`` is returned when no
        tool matches.

        Matching ignores case: scan_tech_stack passes lower-cased HTML, so a
        case-sensitive search could never hit a pattern containing uppercase
        characters such as ``facebook\\.net/en_US/fbevents``.
        """
        found = [
            tool
            for tool, patterns in ANALYTICS_PATTERNS.items()
            if any(re.search(pattern, html, re.IGNORECASE) for pattern in patterns)
        ]
        return ",".join(found) if found else None


# Module-level instance created at import time. EnrichmentService defines no
# instance attributes, so this object carries no state of its own.
enrichment_service = EnrichmentService()