fix(prospecting): fix contact scraper and add address extraction
Some checks failed
Some checks failed
- Fix contact_type column: Enum(ContactType) → String(20) to match the
migration (fixes "type contacttype does not exist" on insert)
- Rewrite scrape_contacts with a structured-first approach:
Phase 1: tel:/mailto: href extraction (high confidence)
Phase 2: regex fallback with SVG/script stripping, international phone
pattern (requires + prefix, min 10 digits)
Phase 3: address extraction from Schema.org JSON-LD, <address> tags,
and a European street-address regex (FR/DE/EN street keywords)
- URL-decode email values, strip tags to plain text for cross-element
address matching
- Add /mentions-legales to scanned paths
Tested on batirenovation-strasbourg.fr: the scraper now finds 3 contacts
(email, phone, address), versus 120+ false positives and a crash before this change.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -323,6 +323,21 @@ class EnrichmentService:
|
|||||||
source_element=source,
|
source_element=source,
|
||||||
))
|
))
|
||||||
|
|
||||||
|
found_addresses: set[str] = set()
|
||||||
|
|
||||||
|
def _add_address(address: str, url: str, source: str) -> None:
|
||||||
|
address = re.sub(r"\s+", " ", address).strip()
|
||||||
|
if len(address) < 10 or address in found_addresses:
|
||||||
|
return
|
||||||
|
found_addresses.add(address)
|
||||||
|
contacts.append(ProspectContact(
|
||||||
|
prospect_id=prospect.id,
|
||||||
|
contact_type="address",
|
||||||
|
value=address,
|
||||||
|
source_url=url,
|
||||||
|
source_element=source,
|
||||||
|
))
|
||||||
|
|
||||||
session = requests.Session()
|
session = requests.Session()
|
||||||
session.verify = False # noqa: SEC047 passive scan, not sending sensitive data
|
session.verify = False # noqa: SEC047 passive scan, not sending sensitive data
|
||||||
session.headers.update({"User-Agent": "Mozilla/5.0 (compatible; OrionBot/1.0)"})
|
session.headers.update({"User-Agent": "Mozilla/5.0 (compatible; OrionBot/1.0)"})
|
||||||
@@ -351,6 +366,43 @@ class EnrichmentService:
|
|||||||
for phone in phone_regex.findall(text_html):
|
for phone in phone_regex.findall(text_html):
|
||||||
_add_phone(phone, url, "regex")
|
_add_phone(phone, url, "regex")
|
||||||
|
|
||||||
|
# Phase 3: address extraction
|
||||||
|
# 3a: Schema.org JSON-LD
|
||||||
|
for m in re.finditer(r'"streetAddress"\s*:\s*"([^"]+)"', html):
|
||||||
|
parts = [m.group(1)]
|
||||||
|
# Try to find locality/postal near the same JSON block
|
||||||
|
block_end = html[m.end():m.end() + 200]
|
||||||
|
locality = re.search(r'"addressLocality"\s*:\s*"([^"]+)"', block_end)
|
||||||
|
postal = re.search(r'"postalCode"\s*:\s*"([^"]+)"', block_end)
|
||||||
|
if postal:
|
||||||
|
parts.append(postal.group(1))
|
||||||
|
if locality:
|
||||||
|
parts.append(locality.group(1))
|
||||||
|
_add_address(", ".join(parts), url, "schema_org")
|
||||||
|
|
||||||
|
# 3b: <address> HTML tag
|
||||||
|
for addr_match in re.finditer(r"<address[^>]*>(.*?)</address>", html, re.DOTALL | re.IGNORECASE):
|
||||||
|
clean = re.sub(r"<[^>]+>", " ", addr_match.group(1))
|
||||||
|
clean = re.sub(r"\s+", " ", clean).strip()
|
||||||
|
if clean:
|
||||||
|
_add_address(clean, url, "address_tag")
|
||||||
|
|
||||||
|
# 3c: European street address pattern (number + street keyword + postal code + city)
|
||||||
|
# Strip tags to plain text (replace tags with spaces for cross-element matching)
|
||||||
|
plain = re.sub(r"<[^>]+>", " ", text_html)
|
||||||
|
plain = re.sub(r"\s+", " ", plain)
|
||||||
|
street_keywords = (
|
||||||
|
r"(?:rue|avenue|boulevard|allée|impasse|chemin|place|route|passage|quai|"
|
||||||
|
r"straße|strasse|stra[ßs]e|weg|platz|gasse|" # German
|
||||||
|
r"street|road|lane|drive|way)" # English
|
||||||
|
)
|
||||||
|
addr_pattern = re.compile(
|
||||||
|
rf"\d{{1,4}}[\s,]+{street_keywords}\s[^<]{{3,60}}?\d{{4,5}}\s+[A-ZÀ-Ü][a-zà-ü]{{2,}}",
|
||||||
|
re.IGNORECASE,
|
||||||
|
)
|
||||||
|
for m in addr_pattern.finditer(plain):
|
||||||
|
_add_address(m.group(), url, "regex")
|
||||||
|
|
||||||
except Exception as e: # noqa: EXC003
|
except Exception as e: # noqa: EXC003
|
||||||
logger.debug("Contact scrape failed for %s%s: %s", domain, path, e)
|
logger.debug("Contact scrape failed for %s%s: %s", domain, path, e)
|
||||||
|
|
||||||
@@ -359,7 +411,7 @@ class EnrichmentService:
|
|||||||
# Save contacts (replace existing auto-scraped ones)
|
# Save contacts (replace existing auto-scraped ones)
|
||||||
db.query(ProspectContact).filter(
|
db.query(ProspectContact).filter(
|
||||||
ProspectContact.prospect_id == prospect.id,
|
ProspectContact.prospect_id == prospect.id,
|
||||||
ProspectContact.source_element.in_(["regex", "tel_href", "mailto_href"]),
|
ProspectContact.source_element.in_(["regex", "tel_href", "mailto_href", "schema_org", "address_tag"]),
|
||||||
).delete()
|
).delete()
|
||||||
|
|
||||||
db.add_all(contacts)
|
db.add_all(contacts)
|
||||||
|
|||||||
@@ -472,9 +472,16 @@
|
|||||||
<p class="text-sm font-medium text-gray-800 dark:text-gray-200" x-text="store.store_name"></p>
|
<p class="text-sm font-medium text-gray-800 dark:text-gray-200" x-text="store.store_name"></p>
|
||||||
<p class="text-xs text-gray-400 font-mono" x-text="store.store_code"></p>
|
<p class="text-xs text-gray-400 font-mono" x-text="store.store_code"></p>
|
||||||
</div>
|
</div>
|
||||||
<span class="px-2 py-1 text-xs rounded-full bg-purple-100 dark:bg-purple-900 text-purple-700 dark:text-purple-300"
|
<div class="flex items-center gap-2">
|
||||||
|
<span class="px-2 py-0.5 text-xs rounded-full"
|
||||||
|
:class="store.is_pending
|
||||||
|
? 'bg-orange-100 text-orange-700 dark:bg-orange-900 dark:text-orange-200'
|
||||||
|
: 'bg-green-100 text-green-700 dark:bg-green-900 dark:text-green-200'"
|
||||||
|
x-text="store.is_pending ? '{{ _('common.pending') }}' : '{{ _('common.active') }}'"></span>
|
||||||
|
<span class="px-2 py-0.5 text-xs rounded-full bg-purple-100 dark:bg-purple-900 text-purple-700 dark:text-purple-300"
|
||||||
x-text="store.role_name || 'Owner'"></span>
|
x-text="store.role_name || 'Owner'"></span>
|
||||||
</div>
|
</div>
|
||||||
|
</div>
|
||||||
</template>
|
</template>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|||||||
Reference in New Issue
Block a user