- New scrape_content() method in enrichment_service: extracts meta
description, H1/H2 headings, paragraphs, images (filtered for size),
social links, service items, and detected languages using BeautifulSoup
- Scans 6 pages per prospect: /, /about, /a-propos, /services,
/nos-services, /contact
- Results stored as JSON in prospect.scraped_content_json
- New endpoints: POST /content-scrape/{id} and /content-scrape/batch
- Added to full_enrichment pipeline (Step 5, before security audit)
- CONTENT_SCRAPE job type for scan-jobs tracking
- "Content Scrape" batch button on scan-jobs page
- Add beautifulsoup4 to requirements.txt
Tested on batirenovation-strasbourg.fr: extracted 30 headings,
21 paragraphs, 13 images.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
63 lines
1.9 KiB
Python
63 lines
1.9 KiB
Python
# app/modules/prospecting/models/scan_job.py
|
|
"""
|
|
Scan job tracking for batch enrichment operations.
|
|
"""
|
|
|
|
import enum
|
|
|
|
from sqlalchemy import Column, DateTime, Enum, Integer, String, Text
|
|
|
|
from app.core.database import Base
|
|
from models.database.base import TimestampMixin
|
|
|
|
|
|
class JobType(str, enum.Enum):
    """Kind of operation recorded by a scan job.

    Members also subclass ``str``, so each one compares equal to its
    lowercase value (for example ``JobType.IMPORT == "import"``).
    Declaration order is preserved because enum iteration follows it.
    """

    IMPORT = "import"
    HTTP_CHECK = "http_check"
    TECH_SCAN = "tech_scan"
    PERFORMANCE_SCAN = "performance_scan"
    CONTACT_SCRAPE = "contact_scrape"
    SCORE_COMPUTE = "score_compute"
    FULL_ENRICHMENT = "full_enrichment"
    SECURITY_AUDIT = "security_audit"
    CONTENT_SCRAPE = "content_scrape"
|
|
|
|
|
|
class JobStatus(str, enum.Enum):
    """Status values a scan job can hold.

    Members also subclass ``str``, so each one compares equal to its
    lowercase value (for example ``JobStatus.PENDING == "pending"``).
    """

    PENDING = "pending"
    RUNNING = "running"
    COMPLETED = "completed"
    FAILED = "failed"
    CANCELLED = "cancelled"
|
|
|
|
|
|
class ProspectScanJob(Base, TimestampMixin):
    """Tracks batch scanning operations.

    One row per job: what kind of job it is, its current status, item
    counters, start/completion timestamps, and free-form text payloads
    (configuration, result summary, error log).
    """

    __tablename__ = "prospect_scan_jobs"

    id = Column(Integer, primary_key=True, index=True)
    job_type = Column(Enum(JobType), nullable=False)
    status = Column(Enum(JobStatus), nullable=False, default=JobStatus.PENDING)

    # Item counters; all default to 0 when the row is inserted.
    total_items = Column(Integer, nullable=False, default=0)
    processed_items = Column(Integer, nullable=False, default=0)
    failed_items = Column(Integer, nullable=False, default=0)
    skipped_items = Column(Integer, nullable=False, default=0)

    # Both timestamps are nullable, so a job may have neither set.
    started_at = Column(DateTime, nullable=True)
    completed_at = Column(DateTime, nullable=True)

    config = Column(Text, nullable=True)  # JSON string
    result_summary = Column(Text, nullable=True)  # JSON string
    error_log = Column(Text, nullable=True)
    source_file = Column(String(500), nullable=True)

    # NOTE(review): presumably the id of the Celery task running this job;
    # confirm against the code that writes it.
    celery_task_id = Column(String(255), nullable=True)

    @property
    def progress_percent(self) -> float:
        """Return processed items as a percentage of the total, to 1 decimal.

        Returns 0.0 when ``total_items`` is zero or not yet populated.
        Column defaults are applied at INSERT time, so a freshly constructed
        instance that has not been flushed still has ``None`` counters;
        those are treated as zero rather than raising ``TypeError``.
        """
        total = self.total_items
        if not total:
            return 0.0
        done = self.processed_items or 0
        return round(done / total * 100, 1)
|