fix(prospecting): fix contact scraper and add address extraction
Some checks failed
Some checks failed
- Fix contact_type column: Enum(ContactType) → String(20) to match the
migration (fixes "type contacttype does not exist" on insert)
- Rewrite scrape_contacts with structured-first approach:
Phase 1: tel:/mailto: href extraction (high confidence)
Phase 2: regex fallback with SVG/script stripping, international phone
pattern (requires + prefix, min 10 digits)
Phase 3: address extraction from Schema.org JSON-LD, <address> tags,
and European street address regex (FR/DE/EN street keywords)
- URL-decode email values, strip tags to plain text for cross-element
address matching
- Add /mentions-legales to scanned paths
Tested on batirenovation-strasbourg.fr: now finds 3 contacts (email, phone,
address); previously it produced 120+ false positives and crashed.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -323,6 +323,21 @@ class EnrichmentService:
|
||||
source_element=source,
|
||||
))
|
||||
|
||||
found_addresses: set[str] = set()
|
||||
|
||||
def _add_address(address: str, url: str, source: str) -> None:
    """Record a scraped address as a contact, once per distinct value.

    Whitespace runs in ``address`` are collapsed to single spaces and the
    result is trimmed before any check. Values shorter than 10 characters,
    or already present in the enclosing ``found_addresses`` set, are
    ignored. Otherwise the value is remembered and a ``ProspectContact``
    with ``contact_type="address"`` is appended to the enclosing
    ``contacts`` collection.

    Args:
        address: Raw address text as extracted from the page.
        url: Stored on the contact as ``source_url``.
        source: Stored on the contact as ``source_element``.
    """
    normalized = re.sub(r"\s+", " ", address).strip()

    # Reject values under 10 characters.
    if len(normalized) < 10:
        return
    # Exact-match de-duplication on the normalized text.
    if normalized in found_addresses:
        return

    found_addresses.add(normalized)
    record = ProspectContact(
        prospect_id=prospect.id,
        contact_type="address",
        value=normalized,
        source_url=url,
        source_element=source,
    )
    contacts.append(record)
|
||||
|
||||
session = requests.Session()
|
||||
session.verify = False # noqa: SEC047 passive scan, not sending sensitive data
|
||||
session.headers.update({"User-Agent": "Mozilla/5.0 (compatible; OrionBot/1.0)"})
|
||||
@@ -351,6 +366,43 @@ class EnrichmentService:
|
||||
for phone in phone_regex.findall(text_html):
|
||||
_add_phone(phone, url, "regex")
|
||||
|
||||
# Phase 3: address extraction
|
||||
# 3a: Schema.org JSON-LD
|
||||
for m in re.finditer(r'"streetAddress"\s*:\s*"([^"]+)"', html):
|
||||
parts = [m.group(1)]
|
||||
# Try to find locality/postal near the same JSON block
|
||||
block_end = html[m.end():m.end() + 200]
|
||||
locality = re.search(r'"addressLocality"\s*:\s*"([^"]+)"', block_end)
|
||||
postal = re.search(r'"postalCode"\s*:\s*"([^"]+)"', block_end)
|
||||
if postal:
|
||||
parts.append(postal.group(1))
|
||||
if locality:
|
||||
parts.append(locality.group(1))
|
||||
_add_address(", ".join(parts), url, "schema_org")
|
||||
|
||||
# 3b: <address> HTML tag
|
||||
for addr_match in re.finditer(r"<address[^>]*>(.*?)</address>", html, re.DOTALL | re.IGNORECASE):
|
||||
clean = re.sub(r"<[^>]+>", " ", addr_match.group(1))
|
||||
clean = re.sub(r"\s+", " ", clean).strip()
|
||||
if clean:
|
||||
_add_address(clean, url, "address_tag")
|
||||
|
||||
# 3c: European street address pattern (number + street keyword + postal code + city)
|
||||
# Strip tags to plain text (replace tags with spaces for cross-element matching)
|
||||
plain = re.sub(r"<[^>]+>", " ", text_html)
|
||||
plain = re.sub(r"\s+", " ", plain)
|
||||
street_keywords = (
|
||||
r"(?:rue|avenue|boulevard|allée|impasse|chemin|place|route|passage|quai|"
|
||||
r"straße|strasse|stra[ßs]e|weg|platz|gasse|" # German
|
||||
r"street|road|lane|drive|way)" # English
|
||||
)
|
||||
addr_pattern = re.compile(
|
||||
rf"\d{{1,4}}[\s,]+{street_keywords}\s[^<]{{3,60}}?\d{{4,5}}\s+[A-ZÀ-Ü][a-zà-ü]{{2,}}",
|
||||
re.IGNORECASE,
|
||||
)
|
||||
for m in addr_pattern.finditer(plain):
|
||||
_add_address(m.group(), url, "regex")
|
||||
|
||||
except Exception as e: # noqa: EXC003
|
||||
logger.debug("Contact scrape failed for %s%s: %s", domain, path, e)
|
||||
|
||||
@@ -359,7 +411,7 @@ class EnrichmentService:
|
||||
# Save contacts (replace existing auto-scraped ones)
|
||||
db.query(ProspectContact).filter(
|
||||
ProspectContact.prospect_id == prospect.id,
|
||||
ProspectContact.source_element.in_(["regex", "tel_href", "mailto_href"]),
|
||||
ProspectContact.source_element.in_(["regex", "tel_href", "mailto_href", "schema_org", "address_tag"]),
|
||||
).delete()
|
||||
|
||||
db.add_all(contacts)
|
||||
|
||||
@@ -472,8 +472,15 @@
|
||||
<p class="text-sm font-medium text-gray-800 dark:text-gray-200" x-text="store.store_name"></p>
|
||||
<p class="text-xs text-gray-400 font-mono" x-text="store.store_code"></p>
|
||||
</div>
|
||||
<span class="px-2 py-1 text-xs rounded-full bg-purple-100 dark:bg-purple-900 text-purple-700 dark:text-purple-300"
|
||||
x-text="store.role_name || 'Owner'"></span>
|
||||
<div class="flex items-center gap-2">
|
||||
<span class="px-2 py-0.5 text-xs rounded-full"
|
||||
:class="store.is_pending
|
||||
? 'bg-orange-100 text-orange-700 dark:bg-orange-900 dark:text-orange-200'
|
||||
: 'bg-green-100 text-green-700 dark:bg-green-900 dark:text-green-200'"
|
||||
x-text="store.is_pending ? '{{ _('common.pending') }}' : '{{ _('common.active') }}'"></span>
|
||||
<span class="px-2 py-0.5 text-xs rounded-full bg-purple-100 dark:bg-purple-900 text-purple-700 dark:text-purple-300"
|
||||
x-text="store.role_name || 'Owner'"></span>
|
||||
</div>
|
||||
</div>
|
||||
</template>
|
||||
</div>
|
||||
|
||||
Reference in New Issue
Block a user