Fix data-quality (DQ) issues: complete docstrings and shorten over-long log lines

This commit is contained in:
2025-09-14 15:47:38 +02:00
parent 3eb18ef91e
commit 0ce708cf09
27 changed files with 430 additions and 214 deletions

View File

@@ -1,4 +1,12 @@
# utils/csv_processor.py
"""CSV processor utilities ....
This module provides classes and functions for:
- ....
- ....
- ....
"""
import logging
from datetime import datetime
from io import StringIO
@@ -15,7 +23,7 @@ logger = logging.getLogger(__name__)
class CSVProcessor:
"""Handles CSV import with robust parsing and batching"""
"""Handles CSV import with robust parsing and batching."""
ENCODINGS = ["utf-8", "latin-1", "iso-8859-1", "cp1252", "utf-8-sig"]
@@ -75,13 +83,14 @@ class CSVProcessor:
}
def __init__(self):
    """Initialize the CSV processor with its data-cleaning helpers.

    Creates a ``GTINProcessor`` and a ``PriceProcessor`` that the
    row-processing methods delegate to for GTIN normalization and
    price/currency parsing.
    """
    # NOTE(review): import is deferred to call time — presumably to avoid a
    # circular import between utils.csv_processor and utils.data_processing;
    # confirm before moving it to module level.
    from utils.data_processing import GTINProcessor, PriceProcessor

    self.gtin_processor = GTINProcessor()
    self.price_processor = PriceProcessor()
def download_csv(self, url: str) -> str:
"""Download and decode CSV with multiple encoding attempts"""
"""Download and decode CSV with multiple encoding attempts."""
try:
response = requests.get(url, timeout=30)
response.raise_for_status()
@@ -107,8 +116,7 @@ class CSVProcessor:
raise
def parse_csv(self, csv_content: str) -> pd.DataFrame:
"""Parse CSV with multiple separator attempts"""
"""Parse CSV with multiple separator attempts."""
for config in self.PARSING_CONFIGS:
try:
df = pd.read_csv(
@@ -127,7 +135,7 @@ class CSVProcessor:
raise pd.errors.ParserError("Could not parse CSV with any configuration")
def normalize_columns(self, df: pd.DataFrame) -> pd.DataFrame:
"""Normalize column names using mapping"""
"""Normalize column names using mapping."""
# Clean column names
df.columns = df.columns.str.strip()
@@ -138,7 +146,7 @@ class CSVProcessor:
return df
def _clean_row_data(self, row_data: Dict[str, Any]) -> Dict[str, Any]:
"""Process a single row with data normalization"""
"""Process a single row with data normalization."""
# Handle NaN values
processed_data = {k: (v if pd.notna(v) else None) for k, v in row_data.items()}
@@ -182,7 +190,7 @@ class CSVProcessor:
self, url: str, marketplace: str, shop_name: str, batch_size: int, db: Session
) -> Dict[str, Any]:
"""
Process CSV from URL with marketplace and shop information
Process CSV from URL with marketplace and shop information.
Args:
url: URL to the CSV file
@@ -194,7 +202,6 @@ class CSVProcessor:
Returns:
Dictionary with processing results
"""
logger.info(
f"Starting marketplace CSV import from {url} for {marketplace} -> {shop_name}"
)
@@ -239,13 +246,14 @@ class CSVProcessor:
db: Session,
batch_num: int,
) -> Dict[str, int]:
"""Process a batch of CSV rows with marketplace information"""
"""Process a batch of CSV rows with marketplace information."""
imported = 0
updated = 0
errors = 0
logger.info(
f"Processing batch {batch_num} with {len(batch_df)} rows for {marketplace} -> {shop_name}"
f"Processing batch {batch_num} with {len(batch_df)} rows for "
f"{marketplace} -> {shop_name}"
)
for index, row in batch_df.iterrows():
@@ -285,7 +293,8 @@ class CSVProcessor:
existing_product.updated_at = datetime.utcnow()
updated += 1
logger.debug(
f"Updated product {product_data['product_id']} for {marketplace} and shop {shop_name}"
f"Updated product {product_data['product_id']} for "
f"{marketplace} and shop {shop_name}"
)
else:
# Create new product
@@ -299,8 +308,8 @@ class CSVProcessor:
db.add(new_product)
imported += 1
logger.debug(
f"Imported new product {product_data['product_id']} for {marketplace} and shop "
f"{shop_name}"
f"Imported new product {product_data['product_id']} "
f"for {marketplace} and shop {shop_name}"
)
except Exception as e:

View File

@@ -1,4 +1,12 @@
# utils/data_processing.py
"""Data processing utilities for GTIN validation and price parsing.
This module provides classes and functions for:
- GTIN (Global Trade Item Number) validation and normalization
- Price parsing with currency detection
- Data cleaning and validation utilities
"""
import logging
import re
from typing import Optional, Tuple
@@ -9,14 +17,15 @@ logger = logging.getLogger(__name__)
class GTINProcessor:
"""Handles GTIN normalization and validation"""
"""Handles GTIN normalization and validation."""
VALID_LENGTHS = [8, 12, 13, 14]
def normalize(self, gtin_value: any) -> Optional[str]:
"""
Normalize GTIN to proper format
Returns None for invalid GTINs
Normalize GTIN to proper format.
Returns None for invalid GTINs.
"""
if not gtin_value or pd.isna(gtin_value):
return None
@@ -63,14 +72,14 @@ class GTINProcessor:
return None
def validate(self, gtin: str) -> bool:
    """Return ``True`` when *gtin* is non-empty, all digits, and one of the
    accepted GTIN lengths (see ``VALID_LENGTHS``)."""
    if not gtin:
        # Empty string (or None) can never be a valid GTIN.
        return False
    length_ok = len(gtin) in self.VALID_LENGTHS
    return length_ok and gtin.isdigit()
class PriceProcessor:
"""Handles price parsing and currency extraction"""
"""Handles price parsing and currency extraction."""
CURRENCY_PATTERNS = {
# Amount followed by currency
@@ -92,7 +101,8 @@ class PriceProcessor:
self, price_str: any
) -> Tuple[Optional[str], Optional[str]]:
"""
Parse price string into (price, currency) tuple
Parse price string into (price, currency) tuple.
Returns (None, None) if parsing fails
"""
if not price_str or pd.isna(price_str):

View File

@@ -1,4 +1,12 @@
# utils/database.py
"""Database utilities ....
This module provides classes and functions for:
- ....
- ....
- ....
"""
import logging
from sqlalchemy import create_engine
@@ -9,7 +17,7 @@ logger = logging.getLogger(__name__)
def get_db_engine(database_url: str):
"""Create database engine with connection pooling"""
"""Create database engine with connection pooling."""
if database_url.startswith("sqlite"):
# SQLite configuration
engine = create_engine(
@@ -26,10 +34,10 @@ def get_db_engine(database_url: str):
echo=False,
)
logger.info(f"Database engine created for: {database_url.split('@')[0]}@...")
logger.info(f"Database engine created for: " f"{database_url.split('@')[0]}@...")
return engine
def get_session_local(engine):
    """Build and return a SQLAlchemy session factory bound to *engine*.

    Sessions produced by the factory require explicit commits
    (``autocommit=False``) and do not autoflush pending changes
    (``autoflush=False``).
    """
    session_factory = sessionmaker(
        autocommit=False,
        autoflush=False,
        bind=engine,
    )
    return session_factory