diff --git a/app/modules/prospecting/services/enrichment_service.py b/app/modules/prospecting/services/enrichment_service.py index 2e3f672a..e4a7a251 100644 --- a/app/modules/prospecting/services/enrichment_service.py +++ b/app/modules/prospecting/services/enrichment_service.py @@ -323,6 +323,21 @@ class EnrichmentService: source_element=source, )) + found_addresses: set[str] = set() + + def _add_address(address: str, url: str, source: str) -> None: + address = re.sub(r"\s+", " ", address).strip() + if len(address) < 10 or address in found_addresses: + return + found_addresses.add(address) + contacts.append(ProspectContact( + prospect_id=prospect.id, + contact_type="address", + value=address, + source_url=url, + source_element=source, + )) + session = requests.Session() session.verify = False # noqa: SEC047 passive scan, not sending sensitive data session.headers.update({"User-Agent": "Mozilla/5.0 (compatible; OrionBot/1.0)"}) @@ -351,6 +366,43 @@ class EnrichmentService: for phone in phone_regex.findall(text_html): _add_phone(phone, url, "regex") + # Phase 3: address extraction + # 3a: Schema.org JSON-LD + for m in re.finditer(r'"streetAddress"\s*:\s*"([^"]+)"', html): + parts = [m.group(1)] + # Try to find locality/postal near the same JSON block + block_end = html[m.end():m.end() + 200] + locality = re.search(r'"addressLocality"\s*:\s*"([^"]+)"', block_end) + postal = re.search(r'"postalCode"\s*:\s*"([^"]+)"', block_end) + if postal: + parts.append(postal.group(1)) + if locality: + parts.append(locality.group(1)) + _add_address(", ".join(parts), url, "schema_org") + + # 3b:
<address> HTML tag + for addr_match in re.finditer(r"<address[^>]*>(.*?)</address>", html, re.DOTALL | re.IGNORECASE): + clean = re.sub(r"<[^>]+>", " ", addr_match.group(1)) + clean = re.sub(r"\s+", " ", clean).strip() + if clean: + _add_address(clean, url, "address_tag") + + # 3c: European street address pattern (number + street keyword + postal code + city) + # Strip tags to plain text (replace tags with spaces for cross-element matching) + plain = re.sub(r"<[^>]+>", " ", text_html) + plain = re.sub(r"\s+", " ", plain) + street_keywords = ( + r"(?:rue|avenue|boulevard|allée|impasse|chemin|place|route|passage|quai|" + r"straße|strasse|stra[ßs]e|weg|platz|gasse|" # German + r"street|road|lane|drive|way)" # English + ) + addr_pattern = re.compile( + rf"\d{{1,4}}[\s,]+{street_keywords}\s[^<]{{3,60}}?\d{{4,5}}\s+[A-ZÀ-Ü][a-zà-ü]{{2,}}", + re.IGNORECASE, + ) + for m in addr_pattern.finditer(plain): + _add_address(m.group(), url, "regex") + except Exception as e: # noqa: EXC003 logger.debug("Contact scrape failed for %s%s: %s", domain, path, e) @@ -359,7 +411,7 @@ class EnrichmentService: # Save contacts (replace existing auto-scraped ones) db.query(ProspectContact).filter( ProspectContact.prospect_id == prospect.id, - ProspectContact.source_element.in_(["regex", "tel_href", "mailto_href"]), + ProspectContact.source_element.in_(["regex", "tel_href", "mailto_href", "schema_org", "address_tag"]), ).delete() db.add_all(contacts) diff --git a/app/modules/tenancy/templates/tenancy/merchant/team.html b/app/modules/tenancy/templates/tenancy/merchant/team.html index 08da142e..15775d86 100644 --- a/app/modules/tenancy/templates/tenancy/merchant/team.html +++ b/app/modules/tenancy/templates/tenancy/merchant/team.html @@ -472,8 +472,15 @@

- +
+ + +