fix(prospecting): fix contact scraper and add address extraction
Some checks failed
CI / validate (push) Has been cancelled
CI / dependency-scanning (push) Has been cancelled
CI / docs (push) Has been cancelled
CI / deploy (push) Has been cancelled
CI / ruff (push) Successful in 13s
CI / pytest (push) Has been cancelled

- Fix contact_type column: Enum(ContactType) → String(20) to match the
  migration (fixes "type contacttype does not exist" on insert)
- Rewrite scrape_contacts with structured-first approach:
  Phase 1: tel:/mailto: href extraction (high confidence)
  Phase 2: regex fallback with SVG/script stripping, international phone
           pattern (requires + prefix, min 10 digits)
  Phase 3: address extraction from Schema.org JSON-LD, <address> tags,
           and European street address regex (FR/DE/EN street keywords)
- URL-decode email values, strip tags to plain text for cross-element
  address matching
- Add /mentions-legales to scanned paths

Tested on batirenovation-strasbourg.fr: finds 3 contacts (email, phone,
address) vs 120+ false positives and a crash before.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-03-30 21:18:43 +02:00
parent 1decb4572c
commit 754bfca87d
2 changed files with 62 additions and 3 deletions

View File

@@ -323,6 +323,21 @@ class EnrichmentService:
source_element=source, source_element=source,
)) ))
found_addresses: set[str] = set()
def _add_address(address: str, url: str, source: str) -> None:
    """Append a scraped address to ``contacts`` unless it is short or repeated.

    Runs of whitespace in ``address`` are collapsed to single spaces and the
    ends are trimmed before any check is made. The normalized value is
    ignored when it has fewer than 10 characters or is already present in
    ``found_addresses``.

    Args:
        address: Raw address text as extracted from the page.
        url: Stored on the contact as ``source_url``.
        source: Stored on the contact as ``source_element``.
    """
    normalized = re.sub(r"\s+", " ", address).strip()
    # Guard: values under 10 characters are dropped.
    if len(normalized) < 10:
        return
    # Guard: the same normalized text is only recorded once.
    if normalized in found_addresses:
        return
    found_addresses.add(normalized)
    record = ProspectContact(
        prospect_id=prospect.id,
        contact_type="address",
        value=normalized,
        source_url=url,
        source_element=source,
    )
    contacts.append(record)
session = requests.Session() session = requests.Session()
session.verify = False # noqa: SEC047 passive scan, not sending sensitive data session.verify = False # noqa: SEC047 passive scan, not sending sensitive data
session.headers.update({"User-Agent": "Mozilla/5.0 (compatible; OrionBot/1.0)"}) session.headers.update({"User-Agent": "Mozilla/5.0 (compatible; OrionBot/1.0)"})
@@ -351,6 +366,43 @@ class EnrichmentService:
for phone in phone_regex.findall(text_html): for phone in phone_regex.findall(text_html):
_add_phone(phone, url, "regex") _add_phone(phone, url, "regex")
# Phase 3: address extraction
# Three passes run in order (3a, 3b, 3c); every candidate is handed to
# _add_address together with the page URL and a source label.
# 3a: Schema.org JSON-LD
# The keys are located with a regex over the page markup rather than a JSON
# parser, so the captured values are used exactly as they appear in the page.
# NOTE(review): JSON string escapes (e.g. \u00e9, \") are not decoded here — confirm this is acceptable.
for m in re.finditer(r'"streetAddress"\s*:\s*"([^"]+)"', html):
parts = [m.group(1)]
# Try to find locality/postal near the same JSON block
# Only the 200 characters that follow the streetAddress match are searched,
# so keys placed before it or further away are not picked up.
block_end = html[m.end():m.end() + 200]
locality = re.search(r'"addressLocality"\s*:\s*"([^"]+)"', block_end)
postal = re.search(r'"postalCode"\s*:\s*"([^"]+)"', block_end)
# Resulting order is: street, postal code, locality (missing parts are skipped).
if postal:
parts.append(postal.group(1))
if locality:
parts.append(locality.group(1))
_add_address(", ".join(parts), url, "schema_org")
# 3b: <address> HTML tag
# Inner markup is replaced by spaces and whitespace is collapsed; only a
# non-empty remainder is submitted.
for addr_match in re.finditer(r"<address[^>]*>(.*?)</address>", html, re.DOTALL | re.IGNORECASE):
clean = re.sub(r"<[^>]+>", " ", addr_match.group(1))
clean = re.sub(r"\s+", " ", clean).strip()
if clean:
_add_address(clean, url, "address_tag")
# 3c: European street address pattern (number + street keyword + postal code + city)
# Strip tags to plain text (replace tags with spaces for cross-element matching)
# text_html is produced outside this hunk — presumably the page markup with
# SVG/script content already removed; verify against the earlier phases.
plain = re.sub(r"<[^>]+>", " ", text_html)
plain = re.sub(r"\s+", " ", plain)
# Alternation of street-type words; the keyword must directly follow the
# house number in the pattern below.
# NOTE(review): "straße" is listed explicitly and is also matched by
# "stra[ßs]e", so the first alternative looks redundant.
street_keywords = (
r"(?:rue|avenue|boulevard|allée|impasse|chemin|place|route|passage|quai|"
r"straße|strasse|stra[ßs]e|weg|platz|gasse|" # German
r"street|road|lane|drive|way)" # English
)
# Shape: 1-4 digits, whitespace/commas, a street keyword, one whitespace
# character, a lazy run of 3-60 non-"<" characters, a 4-5 digit postal
# code, whitespace, then a city-like word of at least three letters.
# NOTE(review): re.IGNORECASE applies to the whole pattern, so the
# upper-case/lower-case split in the trailing city classes is not enforced.
addr_pattern = re.compile(
rf"\d{{1,4}}[\s,]+{street_keywords}\s[^<]{{3,60}}?\d{{4,5}}\s+[A-ZÀ-Ü][a-zà-ü]{{2,}}",
re.IGNORECASE,
)
# The whole match text is stored, labelled with the generic "regex" source.
for m in addr_pattern.finditer(plain):
_add_address(m.group(), url, "regex")
except Exception as e: # noqa: EXC003 except Exception as e: # noqa: EXC003
logger.debug("Contact scrape failed for %s%s: %s", domain, path, e) logger.debug("Contact scrape failed for %s%s: %s", domain, path, e)
@@ -359,7 +411,7 @@ class EnrichmentService:
# Save contacts (replace existing auto-scraped ones) # Save contacts (replace existing auto-scraped ones)
db.query(ProspectContact).filter( db.query(ProspectContact).filter(
ProspectContact.prospect_id == prospect.id, ProspectContact.prospect_id == prospect.id,
ProspectContact.source_element.in_(["regex", "tel_href", "mailto_href"]), ProspectContact.source_element.in_(["regex", "tel_href", "mailto_href", "schema_org", "address_tag"]),
).delete() ).delete()
db.add_all(contacts) db.add_all(contacts)

View File

@@ -472,9 +472,16 @@
<p class="text-sm font-medium text-gray-800 dark:text-gray-200" x-text="store.store_name"></p> <p class="text-sm font-medium text-gray-800 dark:text-gray-200" x-text="store.store_name"></p>
<p class="text-xs text-gray-400 font-mono" x-text="store.store_code"></p> <p class="text-xs text-gray-400 font-mono" x-text="store.store_code"></p>
</div> </div>
<span class="px-2 py-1 text-xs rounded-full bg-purple-100 dark:bg-purple-900 text-purple-700 dark:text-purple-300" <div class="flex items-center gap-2">
<span class="px-2 py-0.5 text-xs rounded-full"
:class="store.is_pending
? 'bg-orange-100 text-orange-700 dark:bg-orange-900 dark:text-orange-200'
: 'bg-green-100 text-green-700 dark:bg-green-900 dark:text-green-200'"
x-text="store.is_pending ? '{{ _('common.pending') }}' : '{{ _('common.active') }}'"></span>
<span class="px-2 py-0.5 text-xs rounded-full bg-purple-100 dark:bg-purple-900 text-purple-700 dark:text-purple-300"
x-text="store.role_name || 'Owner'"></span> x-text="store.role_name || 'Owner'"></span>
</div> </div>
</div>
</template> </template>
</div> </div>
</div> </div>