fixing DQ issues

This commit is contained in:
2025-09-14 15:47:38 +02:00
parent 3eb18ef91e
commit 0ce708cf09
27 changed files with 430 additions and 214 deletions

View File

@@ -1,4 +1,12 @@
# utils/data_processing.py
"""Data processing utilities for GTIN validation and price parsing.
This module provides classes and functions for:
- GTIN (Global Trade Item Number) validation and normalization
- Price parsing with currency detection
- Data cleaning and validation utilities
"""
import logging
import re
from typing import Optional, Tuple
@@ -9,14 +17,15 @@ logger = logging.getLogger(__name__)
class GTINProcessor:
"""Handles GTIN normalization and validation"""
"""Handles GTIN normalization and validation."""
VALID_LENGTHS = [8, 12, 13, 14]
def normalize(self, gtin_value: any) -> Optional[str]:
"""
Normalize GTIN to proper format
Returns None for invalid GTINs
Normalize GTIN to proper format.
Returns None for invalid GTINs.
"""
if not gtin_value or pd.isna(gtin_value):
return None
@@ -63,14 +72,14 @@ class GTINProcessor:
return None
def validate(self, gtin: str) -> bool:
    """Validate GTIN format.

    A value is accepted only when it is a non-empty string made up
    entirely of ASCII digits whose length is one of ``VALID_LENGTHS``.
    No check-digit verification is performed here.

    Args:
        gtin: Candidate GTIN string.

    Returns:
        True if the value has a valid length and contains only the
        digits 0-9, otherwise False. Non-string or empty input yields
        False rather than raising.
    """
    # Guard against None, empty strings and non-string values (e.g. ints
    # coming straight from a spreadsheet), for which len() would raise.
    if not isinstance(gtin, str) or not gtin:
        return False
    # str.isdigit() alone is True for non-ASCII digit characters such as
    # superscripts or Arabic-Indic digits; isascii() restricts to 0-9.
    return (
        len(gtin) in self.VALID_LENGTHS
        and gtin.isascii()
        and gtin.isdigit()
    )
class PriceProcessor:
"""Handles price parsing and currency extraction"""
"""Handles price parsing and currency extraction."""
CURRENCY_PATTERNS = {
# Amount followed by currency
@@ -92,7 +101,8 @@ class PriceProcessor:
self, price_str: any
) -> Tuple[Optional[str], Optional[str]]:
"""
Parse price string into (price, currency) tuple
Parse price string into (price, currency) tuple.
Returns (None, None) if parsing fails.
"""
if not price_str or pd.isna(price_str):