Fix data-quality (DQ) issues: complete docstrings (PEP 257 periods), add module header, and wrap over-long log messages in utils/csv_processor.py

This commit is contained in:
2025-09-14 15:47:38 +02:00
parent 3eb18ef91e
commit 0ce708cf09
27 changed files with 430 additions and 214 deletions

View File

@@ -1,4 +1,12 @@
# utils/csv_processor.py
"""CSV processor utilities ....
This module provides classes and functions for:
- ....
- ....
- ....
"""
import logging
from datetime import datetime
from io import StringIO
@@ -15,7 +23,7 @@ logger = logging.getLogger(__name__)
class CSVProcessor:
"""Handles CSV import with robust parsing and batching"""
"""Handles CSV import with robust parsing and batching."""
ENCODINGS = ["utf-8", "latin-1", "iso-8859-1", "cp1252", "utf-8-sig"]
@@ -75,13 +83,14 @@ class CSVProcessor:
}
def __init__(self):
"""Class constructor."""
from utils.data_processing import GTINProcessor, PriceProcessor
self.gtin_processor = GTINProcessor()
self.price_processor = PriceProcessor()
def download_csv(self, url: str) -> str:
"""Download and decode CSV with multiple encoding attempts"""
"""Download and decode CSV with multiple encoding attempts."""
try:
response = requests.get(url, timeout=30)
response.raise_for_status()
@@ -107,8 +116,7 @@ class CSVProcessor:
raise
def parse_csv(self, csv_content: str) -> pd.DataFrame:
"""Parse CSV with multiple separator attempts"""
"""Parse CSV with multiple separator attempts."""
for config in self.PARSING_CONFIGS:
try:
df = pd.read_csv(
@@ -127,7 +135,7 @@ class CSVProcessor:
raise pd.errors.ParserError("Could not parse CSV with any configuration")
def normalize_columns(self, df: pd.DataFrame) -> pd.DataFrame:
"""Normalize column names using mapping"""
"""Normalize column names using mapping."""
# Clean column names
df.columns = df.columns.str.strip()
@@ -138,7 +146,7 @@ class CSVProcessor:
return df
def _clean_row_data(self, row_data: Dict[str, Any]) -> Dict[str, Any]:
"""Process a single row with data normalization"""
"""Process a single row with data normalization."""
# Handle NaN values
processed_data = {k: (v if pd.notna(v) else None) for k, v in row_data.items()}
@@ -182,7 +190,7 @@ class CSVProcessor:
self, url: str, marketplace: str, shop_name: str, batch_size: int, db: Session
) -> Dict[str, Any]:
"""
Process CSV from URL with marketplace and shop information
Process CSV from URL with marketplace and shop information.
Args:
url: URL to the CSV file
@@ -194,7 +202,6 @@ class CSVProcessor:
Returns:
Dictionary with processing results
"""
logger.info(
f"Starting marketplace CSV import from {url} for {marketplace} -> {shop_name}"
)
@@ -239,13 +246,14 @@ class CSVProcessor:
db: Session,
batch_num: int,
) -> Dict[str, int]:
"""Process a batch of CSV rows with marketplace information"""
"""Process a batch of CSV rows with marketplace information."""
imported = 0
updated = 0
errors = 0
logger.info(
f"Processing batch {batch_num} with {len(batch_df)} rows for {marketplace} -> {shop_name}"
f"Processing batch {batch_num} with {len(batch_df)} rows for "
f"{marketplace} -> {shop_name}"
)
for index, row in batch_df.iterrows():
@@ -285,7 +293,8 @@ class CSVProcessor:
existing_product.updated_at = datetime.utcnow()
updated += 1
logger.debug(
f"Updated product {product_data['product_id']} for {marketplace} and shop {shop_name}"
f"Updated product {product_data['product_id']} for "
f"{marketplace} and shop {shop_name}"
)
else:
# Create new product
@@ -299,8 +308,8 @@ class CSVProcessor:
db.add(new_product)
imported += 1
logger.debug(
f"Imported new product {product_data['product_id']} for {marketplace} and shop "
f"{shop_name}"
f"Imported new product {product_data['product_id']} "
f"for {marketplace} and shop {shop_name}"
)
except Exception as e: