fix(prospecting): fix contact scraper and add address extraction

- Fix contact_type column: Enum(ContactType) → String(20) to match the migration (fixes "type contacttype does not exist" on insert)
- Rewrite scrape_contacts with structured-first approach:
    Phase 1: tel:/mailto: href extraction (high confidence)
    Phase 2: regex fallback with SVG/script stripping, international phone pattern (requires + prefix, min 10 digits)
    Phase 3: address extraction from Schema.org JSON-LD, <address> tags, and European street address regex (FR/DE/EN street keywords)
- URL-decode email values, strip tags to plain text for cross-element address matching
- Add /mentions-legales to scanned paths

Tested on batirenovation-strasbourg.fr: finds 3 contacts (email, phone, address) vs 120+ false positives and a crash before.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-30 21:18:43 +02:00
parent 1decb4572c
commit 754bfca87d
2 changed files with 62 additions and 3 deletions
--- a/app/modules/prospecting/services/enrichment_service.py
+++ b/app/modules/prospecting/services/enrichment_service.py
@@ -323,6 +323,21 @@ class EnrichmentService:
                source_element=source,
            ))
        found_addresses: set[str] = set()
        def _add_address(address: str, url: str, source: str) -> None:
            """Queue a scraped postal address as a contact row, once per distinct value.

            Args:
                address: Raw address text; runs of whitespace are collapsed to a
                    single space and the ends are trimmed before any check.
                url: Page URL the address was found on (stored as ``source_url``).
                source: Label of the extraction method (stored as
                    ``source_element``).

            Candidates shorter than 10 characters after normalisation, or already
            present in ``found_addresses``, are silently ignored.
            """
            normalized = re.sub(r"\s+", " ", address).strip()
            # Too short to be a usable address.
            if len(normalized) < 10:
                return
            # Already recorded during this scrape.
            if normalized in found_addresses:
                return
            found_addresses.add(normalized)
            record = ProspectContact(
                prospect_id=prospect.id,
                contact_type="address",
                value=normalized,
                source_url=url,
                source_element=source,
            )
            contacts.append(record)
        session = requests.Session()
        session.verify = False  # noqa: SEC047 passive scan, not sending sensitive data
        session.headers.update({"User-Agent": "Mozilla/5.0 (compatible; OrionBot/1.0)"})
@@ -351,6 +366,43 @@ class EnrichmentService:
                for phone in phone_regex.findall(text_html):
                    _add_phone(phone, url, "regex")
                # Phase 3: address extraction
                # 3a: Schema.org JSON-LD
                for m in re.finditer(r'"streetAddress"\s*:\s*"([^"]+)"', html):
                    parts = [m.group(1)]
                    # Try to find locality/postal near the same JSON block
                    block_end = html[m.end():m.end() + 200]
                    locality = re.search(r'"addressLocality"\s*:\s*"([^"]+)"', block_end)
                    postal = re.search(r'"postalCode"\s*:\s*"([^"]+)"', block_end)
                    if postal:
                        parts.append(postal.group(1))
                    if locality:
                        parts.append(locality.group(1))
                    _add_address(", ".join(parts), url, "schema_org")
                # 3b: <address> HTML tag
                for addr_match in re.finditer(r"<address[^>]*>(.*?)</address>", html, re.DOTALL | re.IGNORECASE):
                    clean = re.sub(r"<[^>]+>", " ", addr_match.group(1))
                    clean = re.sub(r"\s+", " ", clean).strip()
                    if clean:
                        _add_address(clean, url, "address_tag")
                # 3c: European street address pattern (number + street keyword + postal code + city)
                # Strip tags to plain text (replace tags with spaces for cross-element matching)
                plain = re.sub(r"<[^>]+>", " ", text_html)
                plain = re.sub(r"\s+", " ", plain)
                street_keywords = (
                    r"(?:rue|avenue|boulevard|allée|impasse|chemin|place|route|passage|quai|"
                    r"straße|strasse|stra[ßs]e|weg|platz|gasse|"  # German
                    r"street|road|lane|drive|way)"  # English
                )
                addr_pattern = re.compile(
                    rf"\d{{1,4}}[\s,]+{street_keywords}\s[^<]{{3,60}}?\d{{4,5}}\s+[A-ZÀ-Ü][a-zà-ü]{{2,}}",
                    re.IGNORECASE,
                )
                for m in addr_pattern.finditer(plain):
                    _add_address(m.group(), url, "regex")
            except Exception as e:  # noqa: EXC003
                logger.debug("Contact scrape failed for %s%s: %s", domain, path, e)
@@ -359,7 +411,7 @@ class EnrichmentService:
        # Save contacts (replace existing auto-scraped ones)
        db.query(ProspectContact).filter(
            ProspectContact.prospect_id == prospect.id,
-            ProspectContact.source_element.in_(["regex", "tel_href", "mailto_href"]),
+            ProspectContact.source_element.in_(["regex", "tel_href", "mailto_href", "schema_org", "address_tag"]),
        ).delete()
        db.add_all(contacts)
--- a/app/modules/tenancy/templates/tenancy/merchant/team.html
+++ b/app/modules/tenancy/templates/tenancy/merchant/team.html
@@ -472,9 +472,16 @@
                            <p class="text-sm font-medium text-gray-800 dark:text-gray-200" x-text="store.store_name"></p>
                            <p class="text-xs text-gray-400 font-mono" x-text="store.store_code"></p>
                        </div>
-                        <span class="px-2 py-1 text-xs rounded-full bg-purple-100 dark:bg-purple-900 text-purple-700 dark:text-purple-300"
+                        <div class="flex items-center gap-2">
                            <span class="px-2 py-0.5 text-xs rounded-full"
                                  :class="store.is_pending
                                      ? 'bg-orange-100 text-orange-700 dark:bg-orange-900 dark:text-orange-200'
                                      : 'bg-green-100 text-green-700 dark:bg-green-900 dark:text-green-200'"
                                  x-text="store.is_pending ? '{{ _('common.pending') }}' : '{{ _('common.active') }}'"></span>
                            <span class="px-2 py-0.5 text-xs rounded-full bg-purple-100 dark:bg-purple-900 text-purple-700 dark:text-purple-300"
                                  x-text="store.role_name || 'Owner'"></span>
                        </div>
                    </div>
                </template>
            </div>
        </div>