feat(prospecting): implement security audit pipeline (Workstream 2A)

Complete security audit integration into the enrichment pipeline:

Backend:
- SecurityAuditService with 7 passive checks: HTTPS, SSL cert, security
  headers, exposed files, cookies, server info, technology detection
- Constants file with SECURITY_HEADERS, EXPOSED_PATHS, SEVERITY_SCORES
- SecurityAuditResponse schema with JSON field validators + aliases
- Endpoints: POST /security-audit/{id}, POST /security-audit/batch
- Added to full_enrichment pipeline (Step 5, before scoring)
- get_pending_security_audit() query in prospect_service

Frontend:
- Security tab on prospect detail page with grade badge (A+ to F),
  score/100, severity counts, HTTPS/SSL status, missing headers,
  exposed files, technologies, and full findings list
- "Run Security Audit" button with loading state
- "Security Audit" batch button on scan-jobs page

Tested on batirenovation-strasbourg.fr: Grade D (50/100), 11 issues
found (missing headers, exposed wp-login, server version disclosure).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-01 20:58:11 +02:00
parent 59b0d8977a
commit 4c750f0268
10 changed files with 812 additions and 1 deletions

View File

@@ -251,6 +251,17 @@ class ProspectService:
.all()
)
def get_pending_security_audit(self, db: Session, limit: int = 50) -> list[Prospect]:
    """Return up to *limit* prospects that have a website but were never security-audited."""
    pending = (
        db.query(Prospect)
        .filter(Prospect.has_website.is_(True))
        .filter(Prospect.last_security_audit_at.is_(None))
    )
    return pending.limit(limit).all()
def count_by_status(self, db: Session) -> dict[str, int]:
    """Return prospect counts grouped by status, keyed by the status' string value."""
    rows = db.query(Prospect.status, func.count(Prospect.id)).group_by(Prospect.status).all()  # noqa: SVC-005 - prospecting is platform-scoped, not store-scoped
    counts: dict[str, int] = {}
    for status, count in rows:
        # Enum statuses expose .value; fall back to str() for raw strings.
        key = status.value if hasattr(status, "value") else str(status)
        counts[key] = count
    return counts

View File

@@ -0,0 +1,75 @@
# app/modules/prospecting/services/security_audit_constants.py
"""
Constants for security audit checks.
Structural data used by SecurityAuditService. Translations for report
generation are kept in the standalone script (scripts/security-audit/audit.py)
until Phase 2B (report service) migrates them.
"""
# Points deducted from a perfect starting score of 100 per finding severity.
# "info" findings are recorded but never cost points.
SEVERITY_SCORES = dict(critical=15, high=10, medium=5, low=2, info=0)
# Security headers to check. Each row: (header name, severity when the
# header is missing, concrete risk its absence creates).
_HEADER_RISKS = (
    ("Strict-Transport-Security", "high", "MITM attacks, session hijacking via HTTP downgrade"),
    ("Content-Security-Policy", "high", "XSS attacks, script injection, data theft"),
    ("X-Frame-Options", "medium", "Clickjacking attacks via invisible iframes"),
    ("X-Content-Type-Options", "medium", "MIME type confusion, content injection"),
    ("Referrer-Policy", "low", "URL parameter leakage to third parties"),
    ("Permissions-Policy", "low", "Unrestricted browser API access (camera, mic, location)"),
    ("X-XSS-Protection", "info", "Legacy XSS filter not configured"),
)
SECURITY_HEADERS = {
    name: {"severity": severity, "impact": impact}
    for name, severity, impact in _HEADER_RISKS
}
# Paths to check for exposed sensitive files/directories.
# Each entry: (path, human-readable description, default severity).
# NOTE: scan order is the list order; severities must be SEVERITY_SCORES keys.
# /robots.txt, /.well-known/security.txt, /sitemap.xml and /api/ receive
# special handling in SecurityAuditService._check_exposed_files instead of
# a plain "Exposed:" finding.
EXPOSED_PATHS: list[tuple[str, str, str]] = [
    # Secrets / VCS leftovers — critical
    ("/.env", "Environment file (database passwords, API keys)", "critical"),
    ("/.git/config", "Git repository (full source code)", "critical"),
    ("/.git/HEAD", "Git repository HEAD", "critical"),
    ("/.htpasswd", "Password file", "critical"),
    # Admin panels (see ADMIN_PATHS) and database managers — high
    ("/wp-admin/", "WordPress admin panel", "high"),
    ("/wp-login.php", "WordPress login page", "high"),
    ("/administrator/", "Joomla admin panel", "high"),
    ("/admin/", "Admin panel", "high"),
    ("/admin/login", "Admin login page", "high"),
    ("/phpmyadmin/", "phpMyAdmin (database manager)", "high"),
    # Backups and database dumps — high
    ("/backup/", "Backup directory", "high"),
    ("/backup.zip", "Backup archive", "high"),
    ("/backup.sql", "Database backup", "high"),
    ("/db.sql", "Database dump", "high"),
    ("/dump.sql", "Database dump", "high"),
    # Server configuration / diagnostics — medium
    ("/.htaccess", "Server configuration", "medium"),
    ("/web.config", "IIS configuration", "medium"),
    ("/server-status", "Apache server status", "medium"),
    ("/server-info", "Apache server info", "medium"),
    ("/info.php", "PHP info page", "medium"),
    ("/phpinfo.php", "PHP info page", "medium"),
    ("/graphql", "GraphQL endpoint", "medium"),
    ("/debug/", "Debug endpoint", "medium"),
    ("/elmah.axd", ".NET error log", "medium"),
    ("/trace.axd", ".NET trace log", "medium"),
    # Version-disclosure files — low
    ("/readme.html", "CMS readme (reveals version)", "low"),
    ("/license.txt", "CMS license (reveals version)", "low"),
    ("/CHANGELOG.md", "Changelog (reveals version)", "low"),
    # Informational probes (special-cased or never reported)
    ("/robots.txt", "Robots file", "info"),
    ("/.well-known/security.txt", "Security contact file", "info"),
    ("/sitemap.xml", "Sitemap", "info"),
    ("/crossdomain.xml", "Flash cross-domain policy", "low"),
    ("/api/", "API endpoint", "info"),
]
# Admin-panel paths get dedicated wording in findings (always severity
# "high") rather than the generic "Exposed: <path>" message.
ADMIN_PATHS = {
    "/wp-admin/",
    "/wp-login.php",
    "/administrator/",
    "/admin/",
    "/admin/login",
}
# Substrings inside robots.txt "Disallow:" rules that hint at sensitive
# areas worth flagging (matched case-insensitively by the audit service).
ROBOTS_SENSITIVE_PATTERNS = [
    "admin",
    "backup",
    "private",
    "secret",
    "staging",
    "test",
    "dev",
    "internal",
    "api",
    "config",
    "database",
    "panel",
    "dashboard",
    "login",
    "cgi-bin",
]

View File

@@ -0,0 +1,443 @@
# app/modules/prospecting/services/security_audit_service.py
"""
Security audit service for prospect websites.
Performs passive security checks (HTTPS, SSL, headers, exposed files,
cookies, server info, technology detection) and stores results as
ProspectSecurityAudit. All checks are read-only — no active exploitation.
Migrated from scripts/security-audit/audit.py into the enrichment pipeline.
"""
import json
import logging
import re
import socket
import ssl
from datetime import UTC, datetime
import requests
from sqlalchemy.orm import Session
from app.modules.prospecting.models import Prospect, ProspectSecurityAudit
from app.modules.prospecting.services.security_audit_constants import (
ADMIN_PATHS,
EXPOSED_PATHS,
ROBOTS_SENSITIVE_PATTERNS,
SECURITY_HEADERS,
SEVERITY_SCORES,
)
logger = logging.getLogger(__name__)

# Per-request timeout (seconds) applied to every HTTP request and the raw
# TLS socket connection.
REQUEST_TIMEOUT = 10
# Realistic browser User-Agent so naive bot blocking does not skew results.
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
)
class SecurityAuditService:
    """Runs passive security checks against a prospect's website.

    All checks are read-only (GET requests plus one TLS handshake); no
    active exploitation is attempted. Results are persisted as a
    ProspectSecurityAudit row via ``_save_audit``.
    """

    def run_audit(self, db: Session, prospect: Prospect) -> ProspectSecurityAudit | None:
        """Run all security checks and store results.

        Returns the saved ProspectSecurityAudit, or None when the prospect
        has no domain or ``has_website`` is falsy. When the site is
        unreachable, a score-0 / grade-"F" audit with ``scan_error`` set is
        stored instead (so the prospect stops appearing as pending).
        """
        domain = prospect.domain_name
        if not domain or not prospect.has_website:
            return None
        # Start from the scheme recorded during enrichment; a redirect on
        # the first fetch may replace `url` with the final location below.
        scheme = "https" if prospect.uses_https else "http"
        url = f"{scheme}://{domain}"
        findings = []
        technologies = []
        score = 100  # negative findings deduct per SEVERITY_SCORES
        has_https = None
        has_valid_ssl = None
        ssl_expires_at = None
        missing_headers = []
        exposed_files = []
        session = requests.Session()
        session.headers["User-Agent"] = USER_AGENT
        session.verify = True
        session.max_redirects = 5
        # Fetch the page
        response = None
        html_content = ""
        try:
            response = session.get(url, timeout=REQUEST_TIMEOUT, allow_redirects=True)
            html_content = response.text
            if response.url != url:
                # Later checks run against the final (post-redirect) URL.
                url = response.url
        except requests.exceptions.SSLError:
            findings.append(self._finding("Weak SSL/TLS configuration", "critical", "transport",
                                          "Server supports outdated encryption protocols"))
            # Retry without certificate verification so the rest of the
            # audit can still run against the broken-TLS site.
            try:
                session.verify = False  # noqa: SEC047 fallback for broken SSL
                response = session.get(url, timeout=REQUEST_TIMEOUT, allow_redirects=True)
                html_content = response.text
            except Exception:
                pass  # best-effort: all checks below tolerate response=None
        except requests.exceptions.RequestException as e:
            logger.warning("Security audit: cannot reach %s: %s", domain, e)
            # Persist the failure so get_pending_security_audit() does not
            # pick this prospect up again forever.
            return self._save_audit(db, prospect, score=0, grade="F", findings=findings,
                                    scan_error=f"Cannot reach website: {e}",
                                    technologies=technologies)
        # Run checks — each returns its findings plus any extracted facts.
        https_findings, has_https = self._check_https(url, html_content)
        findings.extend(https_findings)
        ssl_findings, has_valid_ssl, ssl_expires_at = self._check_ssl(domain)
        findings.extend(ssl_findings)
        header_findings, missing_headers = self._check_headers(response)
        findings.extend(header_findings)
        server_findings, server_techs = self._check_server_info(response)
        findings.extend(server_findings)
        technologies.extend(server_techs)
        tech_findings, detected_techs = self._check_technology(html_content, response)
        findings.extend(tech_findings)
        technologies.extend(detected_techs)
        cookie_findings = self._check_cookies(response)
        findings.extend(cookie_findings)
        exposed_findings, exposed_files = self._check_exposed_files(domain, scheme, session)
        findings.extend(exposed_findings)
        session.close()
        # Calculate score — only negative (non-positive) findings deduct,
        # and the score is floored at 0.
        for f in findings:
            if not f.get("is_positive", False):
                score = max(0, score - SEVERITY_SCORES.get(f["severity"], 0))
        grade = self._calculate_grade(score)
        return self._save_audit(
            db, prospect,
            score=score, grade=grade, findings=findings,
            has_https=has_https, has_valid_ssl=has_valid_ssl,
            ssl_expires_at=ssl_expires_at,
            missing_headers=missing_headers, exposed_files=exposed_files,
            technologies=technologies,
        )
# ── Check methods ───────────────────────────────────────────────────────
def _check_https(self, url: str, html_content: str) -> tuple[list[dict], bool | None]:
"""Check HTTPS configuration."""
findings = []
from urllib.parse import urlparse
parsed = urlparse(url)
has_https = parsed.scheme == "https"
if has_https:
findings.append(self._finding("HTTPS enabled", "info", "transport",
"Website uses encrypted connections", is_positive=True))
# Check mixed content
http_resources = re.findall(r'(src|href|action)=["\']http://[^"\']+["\']', html_content, re.IGNORECASE)
if http_resources:
findings.append(self._finding("Mixed content detected", "medium", "transport",
"HTTPS site loads resources over insecure HTTP"))
else:
findings.append(self._finding("No HTTPS", "critical", "transport",
"Website transmits all data in plain text"))
return findings, has_https
def _check_ssl(self, domain: str) -> tuple[list[dict], bool | None, datetime | None]:
    """Inspect the SSL certificate on <domain>:443.

    Returns (findings, has_valid_ssl, ssl_expires_at). When no TLS
    endpoint is reachable all three stay empty/None — the HTTPS check
    already reports the missing encryption.
    """
    findings: list[dict] = []
    valid: bool | None = None
    expires_at: datetime | None = None
    try:
        context = ssl.create_default_context()
        with socket.create_connection((domain, 443), timeout=REQUEST_TIMEOUT) as raw_sock, \
                context.wrap_socket(raw_sock, server_hostname=domain) as tls_sock:
            cert = tls_sock.getpeercert()
            # Certificate notAfter is e.g. "Oct 21 07:28:00 2025 GMT".
            not_after = datetime.strptime(cert["notAfter"], "%b %d %H:%M:%S %Y %Z").replace(tzinfo=UTC)
            days_left = (not_after - datetime.now(UTC)).days
            expires_at = not_after
            expiry_str = not_after.strftime("%Y-%m-%d")
            if days_left < 0:
                valid = False
                findings.append(self._finding("SSL certificate expired", "critical", "transport",
                                              f"Certificate expired on {expiry_str}"))
            elif days_left < 30:
                valid = True
                findings.append(self._finding(f"SSL expires in {days_left} days", "high", "transport",
                                              f"Certificate expires on {expiry_str}"))
            else:
                valid = True
                findings.append(self._finding("SSL certificate valid", "info", "transport",
                                              f"Valid until {expiry_str} ({days_left} days)",
                                              is_positive=True))
            # Flag handshakes that negotiated a deprecated protocol.
            protocol = tls_sock.version()
            if protocol in ("TLSv1", "TLSv1.1", "SSLv3", "SSLv2"):
                findings.append(self._finding("Weak TLS version", "high", "transport",
                                              f"Server supports outdated protocol: {protocol}"))
    except ssl.SSLCertVerificationError:
        valid = False
        findings.append(self._finding("SSL certificate invalid", "critical", "transport",
                                      "Certificate verification failed"))
    except (TimeoutError, ConnectionRefusedError, OSError):
        pass  # no TLS endpoint; already covered by the HTTPS check
    return findings, valid, expires_at
def _check_headers(self, response) -> tuple[list[dict], list[str]]:
    """Report which SECURITY_HEADERS are present on / missing from the response.

    Returns (findings, missing_header_names); both empty when there is no
    response to inspect.
    """
    findings: list[dict] = []
    missing: list[str] = []
    if not response:
        return findings, missing
    for header_name, config in SECURITY_HEADERS.items():
        if header_name not in response.headers:
            missing.append(header_name)
            findings.append(self._finding(f"Missing: {header_name}", config["severity"], "headers",
                                          config["impact"]))
        else:
            findings.append(self._finding(f"Header present: {header_name}", "info", "headers",
                                          header_name, is_positive=True))
    return findings, missing
def _check_server_info(self, response) -> tuple[list[dict], list[str]]:
"""Check for server version disclosure."""
findings = []
technologies = []
if not response:
return findings, technologies
server = response.headers.get("Server", "")
x_powered = response.headers.get("X-Powered-By", "")
info_parts = []
if server:
info_parts.append(server)
technologies.append(server)
if x_powered:
info_parts.append(f"X-Powered-By: {x_powered}")
technologies.append(x_powered)
if info_parts:
has_version = bool(re.search(r"\d+\.\d+", " ".join(info_parts)))
severity = "medium" if has_version else "low"
findings.append(self._finding("Server version exposed", severity, "config",
" | ".join(info_parts)))
return findings, technologies
def _check_technology(self, html_content: str, response) -> tuple[list[dict], list[str]]:
"""Detect CMS and technology stack."""
findings = []
technologies = []
if not html_content:
return findings, technologies
# WordPress
wp_indicators = ["wp-content/", "wp-includes/", 'name="generator" content="WordPress']
if any(ind in html_content for ind in wp_indicators):
version = "unknown"
ver_match = re.search(r'content="WordPress\s+([\d.]+)"', html_content)
if ver_match:
version = ver_match.group(1)
severity = "medium" if version != "unknown" else "low"
findings.append(self._finding(f"WordPress detected (v{version})", severity, "technology",
"Version publicly visible" if version != "unknown" else "CMS detected"))
technologies.append(f"WordPress {version}")
# Joomla
if "/media/jui/" in html_content or "Joomla" in html_content:
findings.append(self._finding("Joomla detected", "low", "technology", "CMS detected"))
technologies.append("Joomla")
# Drupal
if "Drupal" in html_content or "/sites/default/" in html_content:
findings.append(self._finding("Drupal detected", "low", "technology", "CMS detected"))
technologies.append("Drupal")
# Hosted platforms (not vulnerable in the same way)
if "wix.com" in html_content:
technologies.append("Wix")
if "squarespace.com" in html_content:
technologies.append("Squarespace")
if "cdn.shopify.com" in html_content:
technologies.append("Shopify")
return findings, technologies
def _check_cookies(self, response) -> list[dict]:
"""Check cookie security flags."""
findings = []
if not response:
return findings
set_cookie_headers = response.headers.get("Set-Cookie", "")
if not set_cookie_headers:
return findings
has_insecure = False
has_no_httponly = False
has_no_samesite = False
for cookie in set_cookie_headers.split(","):
cookie_lower = cookie.lower()
if "secure" not in cookie_lower:
has_insecure = True
if "httponly" not in cookie_lower:
has_no_httponly = True
if "samesite" not in cookie_lower:
has_no_samesite = True
if has_insecure:
findings.append(self._finding("Cookies lack Secure flag", "medium", "cookies",
"Session cookies can be intercepted over HTTP"))
if has_no_httponly:
findings.append(self._finding("Cookies lack HttpOnly flag", "medium", "cookies",
"Cookies accessible to JavaScript (XSS risk)"))
if has_no_samesite:
findings.append(self._finding("Cookies lack SameSite attribute", "low", "cookies",
"Vulnerable to cross-site request attacks"))
return findings
def _check_exposed_files(self, domain: str, scheme: str, session) -> tuple[list[dict], list[str]]:
    """Check for exposed sensitive files and directories.

    Probes every entry of EXPOSED_PATHS with a redirect-less GET and flags
    HTTP 200 responses. /robots.txt and /.well-known/security.txt are
    analysed separately instead of being reported as exposures;
    /sitemap.xml and /api/ are probed but never reported.

    Returns (findings, exposed_paths).
    NOTE(review): a 200 status alone is a heuristic — sites serving
    soft-404 pages will yield false positives; confirm if results look noisy.
    """
    findings = []
    exposed = []
    base = f"{scheme}://{domain}"
    security_txt_found = False
    robots_content = None
    for path, description, default_severity in EXPOSED_PATHS:
        try:
            # allow_redirects=False: a redirect (e.g. to a login page) is
            # not counted as an exposure.
            resp = session.get(f"{base}{path}", timeout=REQUEST_TIMEOUT, allow_redirects=False)
            if path == "/.well-known/security.txt" and resp.status_code == 200:
                security_txt_found = True
                continue
            if path == "/robots.txt" and resp.status_code == 200:
                robots_content = resp.text
                continue
            if path == "/sitemap.xml" or path == "/api/":
                continue
            if resp.status_code == 200:
                if path in ADMIN_PATHS:
                    findings.append(self._finding(f"Admin panel exposed: {path}", "high", "exposure",
                                                  f"Admin login at {base}{path} is publicly accessible"))
                else:
                    findings.append(self._finding(f"Exposed: {path}", default_severity, "exposure",
                                                  f"{description} is publicly accessible"))
                exposed.append(path)
        except Exception:
            continue  # per-path probes are best-effort; skip network errors
    # Security.txt check
    if not security_txt_found:
        findings.append(self._finding("No security.txt", "info", "exposure",
                                      "No /.well-known/security.txt for responsible disclosure"))
    # Robots.txt analysis
    if robots_content:
        disallowed = re.findall(r"Disallow:\s*(.+)", robots_content, re.IGNORECASE)
        sensitive_found = []
        for path in disallowed:
            path = path.strip()
            if any(pattern in path.lower() for pattern in ROBOTS_SENSITIVE_PATTERNS):
                sensitive_found.append(path)
        if sensitive_found:
            # Cap the detail at five example paths to keep findings short.
            findings.append(self._finding("Robots.txt reveals sensitive paths", "low", "exposure",
                                          f"Disallowed paths: {', '.join(sensitive_found[:5])}"))
    return findings, exposed
# ── Helpers ──────────────────────────────────────────────────────────────
@staticmethod
def _finding(title: str, severity: str, category: str, detail: str, is_positive: bool = False) -> dict:
"""Create a finding dict."""
return {
"title": title,
"severity": severity,
"category": category,
"detail": detail,
"is_positive": is_positive,
}
@staticmethod
def _calculate_grade(score: int) -> str:
if score >= 95:
return "A+"
if score >= 85:
return "A"
if score >= 70:
return "B"
if score >= 55:
return "C"
if score >= 40:
return "D"
return "F"
def _save_audit(
    self, db: Session, prospect: Prospect, *,
    score: int, grade: str, findings: list[dict],
    has_https: bool | None = None, has_valid_ssl: bool | None = None,
    ssl_expires_at: datetime | None = None,
    missing_headers: list[str] | None = None,
    exposed_files: list[str] | None = None,
    technologies: list[str] | None = None,
    scan_error: str | None = None,
) -> ProspectSecurityAudit:
    """Upsert the ProspectSecurityAudit row for *prospect* and stamp
    ``last_security_audit_at``. List fields are serialized to JSON."""
    audit = prospect.security_audit
    if not audit:
        audit = ProspectSecurityAudit(prospect_id=prospect.id)
        db.add(audit)
    audit.score = score
    audit.grade = grade
    audit.findings_json = json.dumps(findings)
    audit.has_https = has_https
    audit.has_valid_ssl = has_valid_ssl
    audit.ssl_expires_at = ssl_expires_at
    audit.missing_headers_json = json.dumps(missing_headers or [])
    audit.exposed_files_json = json.dumps(exposed_files or [])
    audit.technologies_json = json.dumps(technologies or [])
    audit.scan_error = scan_error
    # Denormalized per-severity counts; positive findings are excluded.
    negative = [f for f in findings if not f.get("is_positive")]
    tally = {"critical": 0, "high": 0, "medium": 0, "low": 0, "info": 0}
    for f in negative:
        severity = f["severity"]
        if severity in tally:
            tally[severity] += 1
    audit.findings_count_critical = tally["critical"]
    audit.findings_count_high = tally["high"]
    audit.findings_count_medium = tally["medium"]
    audit.findings_count_low = tally["low"]
    audit.findings_count_info = tally["info"]
    prospect.last_security_audit_at = datetime.now(UTC)
    db.flush()
    logger.info("Security audit for %s: score=%d grade=%s (%d findings)",
                prospect.domain_name, score, grade, len(negative))
    return audit
# Module-level singleton shared by the endpoints and the enrichment pipeline.
security_audit_service = SecurityAuditService()