# orion/app/utils/data_processing.py

# utils/data_processing.py
"""Data processing utilities for GTIN validation and price parsing.

This module provides classes and functions for:
- GTIN (Global Trade Item Number) validation and normalization
- Price parsing with currency detection
- Data cleaning and validation utilities
"""

import logging
import re

import pandas as pd

logger = logging.getLogger(__name__)


class GTINProcessor:
    """Handles GTIN normalization and validation."""

    VALID_LENGTHS = [8, 12, 13, 14]  # List of valid GTIN lengths

    def normalize(self, gtin_value: any) -> str | None:
        """
        Normalize GTIN to proper format.

        Args:
            gtin_value (any): The GTIN value to be normalized.

        Returns:
            Optional[str]: The normalized GTIN string or None if the input is invalid.
        """
        if not gtin_value or pd.isna(gtin_value):
            return None

        gtin_str = str(gtin_value).strip()
        if not gtin_str:
            return None

        # Remove decimal point (e.g., "889698116923.0" -> "889698116923")
        if "." in gtin_str:
            gtin_str = gtin_str.split(".")[0]

        # Keep only digits
        gtin_clean = "".join(filter(str.isdigit, gtin_str))

        if not gtin_clean:
            return None

        # Validate and normalize length
        length = len(gtin_clean)

        if length in self.VALID_LENGTHS:
            # Standard lengths - return as-is (already valid)
            return gtin_clean

        if length > 14:
            # Too long - truncate to EAN-13
            logger.debug(f"GTIN too long ({length} digits), truncating: {gtin_clean}")
            return gtin_clean[-13:]

        if 0 < length < 14:
            # Non-standard length - pad to EAN-13 (European standard)
            # EAN-13 is the international standard used in Europe and most of the world
            # UPC-A (12 digits) is primarily US/Canada
            logger.debug(
                f"GTIN non-standard ({length} digits), padding to EAN-13: {gtin_clean}"
            )
            return gtin_clean.zfill(13)

        logger.warning(f"Invalid GTIN format: '{gtin_value}'")
        return None

    def validate(self, gtin: str) -> bool:
        """
        Validate the GTIN format.

        Args:
            gtin (str): The GTIN string to be validated.

        Returns:
            bool: True if the GTIN is valid, False otherwise.
        """
        if not gtin:
            return False
        return len(gtin) in self.VALID_LENGTHS and gtin.isdigit()


class PriceProcessor:
    """Handles price parsing and currency extraction."""

    CURRENCY_PATTERNS = {
        # Amount followed by currency
        r"([0-9.,]+)\s*(EUR|€)": lambda m: (m.group(1), "EUR"),
        r"([0-9.,]+)\s*(USD|\$)": lambda m: (m.group(1), "USD"),
        r"([0-9.,]+)\s*(GBP|£)": lambda m: (m.group(1), "GBP"),
        r"([0-9.,]+)\s*(CHF)": lambda m: (m.group(1), "CHF"),
        r"([0-9.,]+)\s*(CAD|AUD|JPY|¥)": lambda m: (m.group(1), m.group(2).upper()),
        # Currency followed by amount
        r"(EUR|€)\s*([0-9.,]+)": lambda m: (m.group(2), "EUR"),
        r"(USD|\$)\s*([0-9.,]+)": lambda m: (m.group(2), "USD"),
        r"(GBP|£)\s*([0-9.,]+)": lambda m: (m.group(2), "GBP"),
        # Generic 3-letter currency codes
        r"([0-9.,]+)\s*([A-Z]{3})": lambda m: (m.group(1), m.group(2)),
        r"([A-Z]{3})\s*([0-9.,]+)": lambda m: (m.group(2), m.group(1)),
    }

    def parse_price_currency(self, price_str: any) -> tuple[str | None, str | None]:
        """
        Parse a price string to extract the numeric value and currency.

        Args:
            price_str (any): The price string to be parsed.

        Returns:
            Tuple[Optional[str], Optional[str]]: A tuple containing the parsed price and currency, or None if parsing fails.
        """
        if not price_str or pd.isna(price_str):
            return None, None

        price_str = str(price_str).strip()
        if not price_str:
            return None, None

        # Try each pattern
        for pattern, extract_func in self.CURRENCY_PATTERNS.items():
            match = re.search(pattern, price_str, re.IGNORECASE)
            if match:
                try:
                    price_val, currency_val = extract_func(match)
                    # Normalize price (remove spaces, handle comma as decimal)
                    price_val = price_val.replace(" ", "").replace(",", ".")
                    # Validate numeric
                    float(price_val)
                    return price_val, currency_val.upper()
                except (ValueError, AttributeError):
                    continue

        # Fallback: extract just numbers
        number_match = re.search(r"([0-9.,]+)", price_str)
        if number_match:
            try:
                price_val = number_match.group(1).replace(",", ".")
                float(price_val)  # Validate
                return price_val, None
            except ValueError:
                pass

        # If we get here, parsing failed completely
        logger.error(f"Could not parse price: '{price_str}'")
        raise ValueError(f"Invalid price format: '{price_str}'")