Files
orion/app/modules/prospecting/models/prospect.py
Samir Boulahtit 1828ac85eb feat(prospecting): add content scraping for POC builder (Workstream 3A)
- New scrape_content() method in enrichment_service: extracts meta
  description, H1/H2 headings, paragraphs, images (filtered for size),
  social links, service items, and detected languages using BeautifulSoup
- Scans 6 pages per prospect: /, /about, /a-propos, /services,
  /nos-services, /contact
- Results stored as JSON in prospect.scraped_content_json
- New endpoints: POST /content-scrape/{id} and /content-scrape/batch
- Added to full_enrichment pipeline (Step 5, before security audit)
- CONTENT_SCRAPE job type for scan-jobs tracking
- "Content Scrape" batch button on scan-jobs page
- Add beautifulsoup4 to requirements.txt

Tested on batirenovation-strasbourg.fr: extracted 30 headings,
21 paragraphs, 13 images.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-01 22:26:56 +02:00

89 lines
3.4 KiB
Python

# app/modules/prospecting/models/prospect.py
"""
Prospect model - core entity for lead discovery.
Supports two channels:
- digital: discovered via domain scanning (.lu domains)
- offline: manually captured (street encounters, networking)
"""
import enum
from sqlalchemy import Boolean, Column, DateTime, Enum, Float, Integer, String, Text
from sqlalchemy.orm import relationship
from app.core.database import Base
from models.database.base import TimestampMixin
class ProspectChannel(str, enum.Enum):
    """Channel through which a prospect was discovered.

    Members also subclass ``str``, so each one compares equal to its
    lowercase string value (``ProspectChannel.DIGITAL == "digital"``).
    """

    # Discovered via domain scanning.
    DIGITAL = "digital"
    # Captured manually (street encounters, networking).
    OFFLINE = "offline"
class ProspectStatus(str, enum.Enum):
    """Status values that can be assigned to a prospect.

    Members also subclass ``str``, so each one compares equal to its
    lowercase string value (``ProspectStatus.PENDING == "pending"``).
    """

    PENDING = "pending"
    ACTIVE = "active"
    INACTIVE = "inactive"
    PARKED = "parked"
    ERROR = "error"
    CONTACTED = "contacted"
    CONVERTED = "converted"
class Prospect(Base, TimestampMixin):
    """A business prospect (potential client), mapped to the ``prospects`` table.

    ``channel`` records which ``ProspectChannel`` the row came from. When not
    supplied, ``channel`` defaults to ``DIGITAL``, ``status`` to ``PENDING``
    and ``country`` to ``"LU"``.
    """

    __tablename__ = "prospects"

    # Identity and classification
    id = Column(Integer, primary_key=True, index=True)
    channel = Column(
        Enum(ProspectChannel),
        nullable=False,
        default=ProspectChannel.DIGITAL,
    )
    business_name = Column(String(255), nullable=True)
    # Unique when present; NULL is allowed.
    domain_name = Column(String(255), nullable=True, unique=True, index=True)
    status = Column(
        Enum(ProspectStatus),
        nullable=False,
        default=ProspectStatus.PENDING,
    )
    source = Column(String(100), nullable=True)

    # Website status (digital channel)
    has_website = Column(Boolean, nullable=True)
    uses_https = Column(Boolean, nullable=True)
    http_status_code = Column(Integer, nullable=True)
    redirect_url = Column(Text, nullable=True)

    # Location (offline channel)
    address = Column(String(500), nullable=True)
    city = Column(String(100), nullable=True)
    postal_code = Column(String(10), nullable=True)
    country = Column(String(2), nullable=False, default="LU")

    # Notes and metadata
    notes = Column(Text, nullable=True)
    tags = Column(Text, nullable=True)  # JSON string of tags

    # Capture info
    # Plain integer column: no foreign-key constraint is declared here.
    captured_by_user_id = Column(Integer, nullable=True)
    location_lat = Column(Float, nullable=True)
    location_lng = Column(Float, nullable=True)

    # Scan timestamps (one per scan type, NULL when not set)
    last_http_check_at = Column(DateTime, nullable=True)
    last_tech_scan_at = Column(DateTime, nullable=True)
    last_perf_scan_at = Column(DateTime, nullable=True)
    last_contact_scrape_at = Column(DateTime, nullable=True)
    last_security_audit_at = Column(DateTime, nullable=True)
    last_content_scrape_at = Column(DateTime, nullable=True)

    # Scraped page content for POC builder
    scraped_content_json = Column(Text, nullable=True)

    # Relationships: scalar (uselist=False) children. All children use the
    # "all, delete-orphan" cascade.
    tech_profile = relationship(
        "ProspectTechProfile",
        back_populates="prospect",
        uselist=False,
        cascade="all, delete-orphan",
    )
    performance_profile = relationship(
        "ProspectPerformanceProfile",
        back_populates="prospect",
        uselist=False,
        cascade="all, delete-orphan",
    )
    security_audit = relationship(
        "ProspectSecurityAudit",
        back_populates="prospect",
        uselist=False,
        cascade="all, delete-orphan",
    )
    score = relationship(
        "ProspectScore",
        back_populates="prospect",
        uselist=False,
        cascade="all, delete-orphan",
    )

    # Relationships: collection children, same cascade.
    contacts = relationship(
        "ProspectContact",
        back_populates="prospect",
        cascade="all, delete-orphan",
    )
    interactions = relationship(
        "ProspectInteraction",
        back_populates="prospect",
        cascade="all, delete-orphan",
    )

    @property
    def display_name(self) -> str:
        """Return a human-readable label for this prospect.

        Prefers ``business_name``, then ``domain_name``; when both are empty
        or ``None`` it falls back to ``"Prospect #<id>"``.
        """
        for candidate in (self.business_name, self.domain_name):
            if candidate:
                return candidate
        return f"Prospect #{self.id}"