# utils/data_processing.py """Data processing utilities for GTIN validation and price parsing. This module provides classes and functions for: - GTIN (Global Trade Item Number) validation and normalization - Price parsing with currency detection - Data cleaning and validation utilities """ import logging import re import pandas as pd logger = logging.getLogger(__name__) class GTINProcessor: """Handles GTIN normalization and validation.""" VALID_LENGTHS = [8, 12, 13, 14] # List of valid GTIN lengths def normalize(self, gtin_value: any) -> str | None: """ Normalize GTIN to proper format. Args: gtin_value (any): The GTIN value to be normalized. Returns: Optional[str]: The normalized GTIN string or None if the input is invalid. """ if not gtin_value or pd.isna(gtin_value): return None gtin_str = str(gtin_value).strip() if not gtin_str: return None # Remove decimal point (e.g., "889698116923.0" -> "889698116923") if "." in gtin_str: gtin_str = gtin_str.split(".")[0] # Keep only digits gtin_clean = "".join(filter(str.isdigit, gtin_str)) if not gtin_clean: return None # Validate and normalize length length = len(gtin_clean) if length in self.VALID_LENGTHS: # Standard lengths - pad appropriately if length == 8: return gtin_clean.zfill(8) # EAN-8 if length == 12: return gtin_clean.zfill(12) # UPC-A if length == 13: return gtin_clean.zfill(13) # EAN-13 if length == 14: return gtin_clean.zfill(14) # GTIN-14 elif length > 14: # Too long - truncate to EAN-13 logger.warning(f"GTIN too long, truncating: {gtin_clean}") return gtin_clean[-13:] elif 0 < length < 8: # Too short - pad to EAN-13 logger.warning(f"GTIN too short, padding: {gtin_clean}") return gtin_clean.zfill(13) logger.warning(f"Invalid GTIN format: '{gtin_value}'") return None def validate(self, gtin: str) -> bool: """ Validate the GTIN format. Args: gtin (str): The GTIN string to be validated. Returns: bool: True if the GTIN is valid, False otherwise. """ if not gtin: return False return len(gtin) in self.VALID_LENGTHS and gtin.isdigit() class PriceProcessor: """Handles price parsing and currency extraction.""" CURRENCY_PATTERNS = { # Amount followed by currency r"([0-9.,]+)\s*(EUR|€)": lambda m: (m.group(1), "EUR"), r"([0-9.,]+)\s*(USD|\$)": lambda m: (m.group(1), "USD"), r"([0-9.,]+)\s*(GBP|£)": lambda m: (m.group(1), "GBP"), r"([0-9.,]+)\s*(CHF)": lambda m: (m.group(1), "CHF"), r"([0-9.,]+)\s*(CAD|AUD|JPY|¥)": lambda m: (m.group(1), m.group(2).upper()), # Currency followed by amount r"(EUR|€)\s*([0-9.,]+)": lambda m: (m.group(2), "EUR"), r"(USD|\$)\s*([0-9.,]+)": lambda m: (m.group(2), "USD"), r"(GBP|£)\s*([0-9.,]+)": lambda m: (m.group(2), "GBP"), # Generic 3-letter currency codes r"([0-9.,]+)\s*([A-Z]{3})": lambda m: (m.group(1), m.group(2)), r"([A-Z]{3})\s*([0-9.,]+)": lambda m: (m.group(2), m.group(1)), } def parse_price_currency(self, price_str: any) -> tuple[str | None, str | None]: """ Parse a price string to extract the numeric value and currency. Args: price_str (any): The price string to be parsed. Returns: Tuple[Optional[str], Optional[str]]: A tuple containing the parsed price and currency, or None if parsing fails. """ if not price_str or pd.isna(price_str): return None, None price_str = str(price_str).strip() if not price_str: return None, None # Try each pattern for pattern, extract_func in self.CURRENCY_PATTERNS.items(): match = re.search(pattern, price_str, re.IGNORECASE) if match: try: price_val, currency_val = extract_func(match) # Normalize price (remove spaces, handle comma as decimal) price_val = price_val.replace(" ", "").replace(",", ".") # Validate numeric float(price_val) return price_val, currency_val.upper() except (ValueError, AttributeError): continue # Fallback: extract just numbers number_match = re.search(r"([0-9.,]+)", price_str) if number_match: try: price_val = number_match.group(1).replace(",", ".") float(price_val) # Validate return price_val, None except ValueError: pass # If we get here, parsing failed completely logger.error(f"Could not parse price: '{price_str}'") raise ValueError(f"Invalid price format: '{price_str}'")