Files
orion/app/utils/data_processing.py
Samir Boulahtit 238c1ec9b8 refactor: modernize code quality tooling with Ruff
- Replace black, isort, and flake8 with Ruff (all-in-one linter and formatter)
- Add comprehensive pyproject.toml configuration
- Simplify Makefile code quality targets
- Configure exclusions for venv/.venv in pyproject.toml
- Auto-fix 1,359 linting issues across codebase

Benefits:
- Much faster builds (Ruff is written in Rust)
- Single tool replaces multiple tools
- More comprehensive rule set (UP, B, C4, SIM, PIE, RET, Q)
- All configuration centralized in pyproject.toml
- Better import sorting and formatting consistency

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-28 19:37:38 +01:00

157 lines
5.1 KiB
Python

# utils/data_processing.py
"""Data processing utilities for GTIN validation and price parsing.
This module provides classes and functions for:
- GTIN (Global Trade Item Number) validation and normalization
- Price parsing with currency detection
- Data cleaning and validation utilities
"""
import logging
import re
import pandas as pd
logger = logging.getLogger(__name__)
class GTINProcessor:
"""Handles GTIN normalization and validation."""
VALID_LENGTHS = [8, 12, 13, 14] # List of valid GTIN lengths
def normalize(self, gtin_value: any) -> str | None:
"""
Normalize GTIN to proper format.
Args:
gtin_value (any): The GTIN value to be normalized.
Returns:
Optional[str]: The normalized GTIN string or None if the input is invalid.
"""
if not gtin_value or pd.isna(gtin_value):
return None
gtin_str = str(gtin_value).strip()
if not gtin_str:
return None
# Remove decimal point (e.g., "889698116923.0" -> "889698116923")
if "." in gtin_str:
gtin_str = gtin_str.split(".")[0]
# Keep only digits
gtin_clean = "".join(filter(str.isdigit, gtin_str))
if not gtin_clean:
return None
# Validate and normalize length
length = len(gtin_clean)
if length in self.VALID_LENGTHS:
# Standard lengths - pad appropriately
if length == 8:
return gtin_clean.zfill(8) # EAN-8
if length == 12:
return gtin_clean.zfill(12) # UPC-A
if length == 13:
return gtin_clean.zfill(13) # EAN-13
if length == 14:
return gtin_clean.zfill(14) # GTIN-14
elif length > 14:
# Too long - truncate to EAN-13
logger.warning(f"GTIN too long, truncating: {gtin_clean}")
return gtin_clean[-13:]
elif 0 < length < 8:
# Too short - pad to EAN-13
logger.warning(f"GTIN too short, padding: {gtin_clean}")
return gtin_clean.zfill(13)
logger.warning(f"Invalid GTIN format: '{gtin_value}'")
return None
def validate(self, gtin: str) -> bool:
"""
Validate the GTIN format.
Args:
gtin (str): The GTIN string to be validated.
Returns:
bool: True if the GTIN is valid, False otherwise.
"""
if not gtin:
return False
return len(gtin) in self.VALID_LENGTHS and gtin.isdigit()
class PriceProcessor:
"""Handles price parsing and currency extraction."""
CURRENCY_PATTERNS = {
# Amount followed by currency
r"([0-9.,]+)\s*(EUR|€)": lambda m: (m.group(1), "EUR"),
r"([0-9.,]+)\s*(USD|\$)": lambda m: (m.group(1), "USD"),
r"([0-9.,]+)\s*(GBP|£)": lambda m: (m.group(1), "GBP"),
r"([0-9.,]+)\s*(CHF)": lambda m: (m.group(1), "CHF"),
r"([0-9.,]+)\s*(CAD|AUD|JPY|¥)": lambda m: (m.group(1), m.group(2).upper()),
# Currency followed by amount
r"(EUR|€)\s*([0-9.,]+)": lambda m: (m.group(2), "EUR"),
r"(USD|\$)\s*([0-9.,]+)": lambda m: (m.group(2), "USD"),
r"(GBP|£)\s*([0-9.,]+)": lambda m: (m.group(2), "GBP"),
# Generic 3-letter currency codes
r"([0-9.,]+)\s*([A-Z]{3})": lambda m: (m.group(1), m.group(2)),
r"([A-Z]{3})\s*([0-9.,]+)": lambda m: (m.group(2), m.group(1)),
}
def parse_price_currency(
self, price_str: any
) -> tuple[str | None, str | None]:
"""
Parse a price string to extract the numeric value and currency.
Args:
price_str (any): The price string to be parsed.
Returns:
Tuple[Optional[str], Optional[str]]: A tuple containing the parsed price and currency, or None if parsing fails.
"""
if not price_str or pd.isna(price_str):
return None, None
price_str = str(price_str).strip()
if not price_str:
return None, None
# Try each pattern
for pattern, extract_func in self.CURRENCY_PATTERNS.items():
match = re.search(pattern, price_str, re.IGNORECASE)
if match:
try:
price_val, currency_val = extract_func(match)
# Normalize price (remove spaces, handle comma as decimal)
price_val = price_val.replace(" ", "").replace(",", ".")
# Validate numeric
float(price_val)
return price_val, currency_val.upper()
except (ValueError, AttributeError):
continue
# Fallback: extract just numbers
number_match = re.search(r"([0-9.,]+)", price_str)
if number_match:
try:
price_val = number_match.group(1).replace(",", ".")
float(price_val) # Validate
return price_val, None
except ValueError:
pass
# If we get here, parsing failed completely
logger.error(f"Could not parse price: '{price_str}'")
raise ValueError(f"Invalid price format: '{price_str}'")