Problem: - Ruff removed 'from app.core.database import Base' from models/database/base.py - Import appeared "unused" (F401) but was actually a critical re-export - Caused ImportError: cannot import name 'Base' at runtime - Re-export pattern: import in one file to export from package Solution: 1. Added F401 ignore for models/database/base.py in pyproject.toml 2. Created scripts/verify_critical_imports.py verification script 3. Integrated verification into make check and CI pipeline 4. Updated documentation with explanation New Verification Script: - Checks all critical re-export imports exist - Detects import variations (parentheses, 'as' clauses) - Handles SQLAlchemy declarative_base alternatives - Runs as part of make check automatically Protected Files: - models/database/base.py - Re-exports Base for all models - models/__init__.py - Exports Base for Alembic - models/database/__init__.py - Exports Base from package - All __init__.py files (already protected) Makefile Changes: - make verify-imports - Run import verification - make check - Now includes verify-imports - make ci - Includes verify-imports in pipeline Documentation Updated: - Code quality guide explains re-export protection - Pre-commit workflow includes verification - Examples of why re-exports matter This prevents future issues where linters remove seemingly "unused" imports that are actually critical for application structure. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
155 lines
5.1 KiB
Python
155 lines
5.1 KiB
Python
# utils/data_processing.py
|
|
"""Data processing utilities for GTIN validation and price parsing.
|
|
|
|
This module provides classes and functions for:
|
|
- GTIN (Global Trade Item Number) validation and normalization
|
|
- Price parsing with currency detection
|
|
- Data cleaning and validation utilities
|
|
"""
|
|
|
|
import logging
|
|
import re
|
|
|
|
import pandas as pd
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class GTINProcessor:
|
|
"""Handles GTIN normalization and validation."""
|
|
|
|
VALID_LENGTHS = [8, 12, 13, 14] # List of valid GTIN lengths
|
|
|
|
def normalize(self, gtin_value: any) -> str | None:
|
|
"""
|
|
Normalize GTIN to proper format.
|
|
|
|
Args:
|
|
gtin_value (any): The GTIN value to be normalized.
|
|
|
|
Returns:
|
|
Optional[str]: The normalized GTIN string or None if the input is invalid.
|
|
"""
|
|
if not gtin_value or pd.isna(gtin_value):
|
|
return None
|
|
|
|
gtin_str = str(gtin_value).strip()
|
|
if not gtin_str:
|
|
return None
|
|
|
|
# Remove decimal point (e.g., "889698116923.0" -> "889698116923")
|
|
if "." in gtin_str:
|
|
gtin_str = gtin_str.split(".")[0]
|
|
|
|
# Keep only digits
|
|
gtin_clean = "".join(filter(str.isdigit, gtin_str))
|
|
|
|
if not gtin_clean:
|
|
return None
|
|
|
|
# Validate and normalize length
|
|
length = len(gtin_clean)
|
|
|
|
if length in self.VALID_LENGTHS:
|
|
# Standard lengths - pad appropriately
|
|
if length == 8:
|
|
return gtin_clean.zfill(8) # EAN-8
|
|
if length == 12:
|
|
return gtin_clean.zfill(12) # UPC-A
|
|
if length == 13:
|
|
return gtin_clean.zfill(13) # EAN-13
|
|
if length == 14:
|
|
return gtin_clean.zfill(14) # GTIN-14
|
|
|
|
elif length > 14:
|
|
# Too long - truncate to EAN-13
|
|
logger.warning(f"GTIN too long, truncating: {gtin_clean}")
|
|
return gtin_clean[-13:]
|
|
|
|
elif 0 < length < 8:
|
|
# Too short - pad to EAN-13
|
|
logger.warning(f"GTIN too short, padding: {gtin_clean}")
|
|
return gtin_clean.zfill(13)
|
|
|
|
logger.warning(f"Invalid GTIN format: '{gtin_value}'")
|
|
return None
|
|
|
|
def validate(self, gtin: str) -> bool:
|
|
"""
|
|
Validate the GTIN format.
|
|
|
|
Args:
|
|
gtin (str): The GTIN string to be validated.
|
|
|
|
Returns:
|
|
bool: True if the GTIN is valid, False otherwise.
|
|
"""
|
|
if not gtin:
|
|
return False
|
|
return len(gtin) in self.VALID_LENGTHS and gtin.isdigit()
|
|
|
|
|
|
class PriceProcessor:
|
|
"""Handles price parsing and currency extraction."""
|
|
|
|
CURRENCY_PATTERNS = {
|
|
# Amount followed by currency
|
|
r"([0-9.,]+)\s*(EUR|€)": lambda m: (m.group(1), "EUR"),
|
|
r"([0-9.,]+)\s*(USD|\$)": lambda m: (m.group(1), "USD"),
|
|
r"([0-9.,]+)\s*(GBP|£)": lambda m: (m.group(1), "GBP"),
|
|
r"([0-9.,]+)\s*(CHF)": lambda m: (m.group(1), "CHF"),
|
|
r"([0-9.,]+)\s*(CAD|AUD|JPY|¥)": lambda m: (m.group(1), m.group(2).upper()),
|
|
# Currency followed by amount
|
|
r"(EUR|€)\s*([0-9.,]+)": lambda m: (m.group(2), "EUR"),
|
|
r"(USD|\$)\s*([0-9.,]+)": lambda m: (m.group(2), "USD"),
|
|
r"(GBP|£)\s*([0-9.,]+)": lambda m: (m.group(2), "GBP"),
|
|
# Generic 3-letter currency codes
|
|
r"([0-9.,]+)\s*([A-Z]{3})": lambda m: (m.group(1), m.group(2)),
|
|
r"([A-Z]{3})\s*([0-9.,]+)": lambda m: (m.group(2), m.group(1)),
|
|
}
|
|
|
|
def parse_price_currency(self, price_str: any) -> tuple[str | None, str | None]:
|
|
"""
|
|
Parse a price string to extract the numeric value and currency.
|
|
|
|
Args:
|
|
price_str (any): The price string to be parsed.
|
|
|
|
Returns:
|
|
Tuple[Optional[str], Optional[str]]: A tuple containing the parsed price and currency, or None if parsing fails.
|
|
"""
|
|
if not price_str or pd.isna(price_str):
|
|
return None, None
|
|
|
|
price_str = str(price_str).strip()
|
|
if not price_str:
|
|
return None, None
|
|
|
|
# Try each pattern
|
|
for pattern, extract_func in self.CURRENCY_PATTERNS.items():
|
|
match = re.search(pattern, price_str, re.IGNORECASE)
|
|
if match:
|
|
try:
|
|
price_val, currency_val = extract_func(match)
|
|
# Normalize price (remove spaces, handle comma as decimal)
|
|
price_val = price_val.replace(" ", "").replace(",", ".")
|
|
# Validate numeric
|
|
float(price_val)
|
|
return price_val, currency_val.upper()
|
|
except (ValueError, AttributeError):
|
|
continue
|
|
|
|
# Fallback: extract just numbers
|
|
number_match = re.search(r"([0-9.,]+)", price_str)
|
|
if number_match:
|
|
try:
|
|
price_val = number_match.group(1).replace(",", ".")
|
|
float(price_val) # Validate
|
|
return price_val, None
|
|
except ValueError:
|
|
pass
|
|
|
|
# If we get here, parsing failed completely
|
|
logger.error(f"Could not parse price: '{price_str}'")
|
|
raise ValueError(f"Invalid price format: '{price_str}'")
|