Fix data-quality (DQ) issues: complete docstrings and shorten over-long log lines

This commit is contained in:
2025-09-14 15:47:38 +02:00
parent 3eb18ef91e
commit 0ce708cf09
27 changed files with 430 additions and 214 deletions

View File

@@ -1,4 +1,12 @@
# utils/csv_processor.py
"""CSV processor utilities ....
This module provides classes and functions for:
- ....
- ....
- ....
"""
import logging
from datetime import datetime
from io import StringIO
@@ -15,7 +23,7 @@ logger = logging.getLogger(__name__)
class CSVProcessor:
"""Handles CSV import with robust parsing and batching"""
"""Handles CSV import with robust parsing and batching."""
ENCODINGS = ["utf-8", "latin-1", "iso-8859-1", "cp1252", "utf-8-sig"]
@@ -75,13 +83,14 @@ class CSVProcessor:
}
def __init__(self):
    """Initialize the CSV processor with its data-cleaning helpers.

    Creates a ``GTINProcessor`` and a ``PriceProcessor`` that the
    row-processing methods delegate to for GTIN normalization and
    price/currency parsing.
    """
    # NOTE(review): import is deferred to call time — presumably to avoid a
    # circular import between utils.csv_processor and utils.data_processing;
    # confirm before moving it to module level.
    from utils.data_processing import GTINProcessor, PriceProcessor

    self.gtin_processor = GTINProcessor()
    self.price_processor = PriceProcessor()
def download_csv(self, url: str) -> str:
"""Download and decode CSV with multiple encoding attempts"""
"""Download and decode CSV with multiple encoding attempts."""
try:
response = requests.get(url, timeout=30)
response.raise_for_status()
@@ -107,8 +116,7 @@ class CSVProcessor:
raise
def parse_csv(self, csv_content: str) -> pd.DataFrame:
"""Parse CSV with multiple separator attempts"""
"""Parse CSV with multiple separator attempts."""
for config in self.PARSING_CONFIGS:
try:
df = pd.read_csv(
@@ -127,7 +135,7 @@ class CSVProcessor:
raise pd.errors.ParserError("Could not parse CSV with any configuration")
def normalize_columns(self, df: pd.DataFrame) -> pd.DataFrame:
"""Normalize column names using mapping"""
"""Normalize column names using mapping."""
# Clean column names
df.columns = df.columns.str.strip()
@@ -138,7 +146,7 @@ class CSVProcessor:
return df
def _clean_row_data(self, row_data: Dict[str, Any]) -> Dict[str, Any]:
"""Process a single row with data normalization"""
"""Process a single row with data normalization."""
# Handle NaN values
processed_data = {k: (v if pd.notna(v) else None) for k, v in row_data.items()}
@@ -182,7 +190,7 @@ class CSVProcessor:
self, url: str, marketplace: str, shop_name: str, batch_size: int, db: Session
) -> Dict[str, Any]:
"""
Process CSV from URL with marketplace and shop information
Process CSV from URL with marketplace and shop information.
Args:
url: URL to the CSV file
@@ -194,7 +202,6 @@ class CSVProcessor:
Returns:
Dictionary with processing results
"""
logger.info(
f"Starting marketplace CSV import from {url} for {marketplace} -> {shop_name}"
)
@@ -239,13 +246,14 @@ class CSVProcessor:
db: Session,
batch_num: int,
) -> Dict[str, int]:
"""Process a batch of CSV rows with marketplace information"""
"""Process a batch of CSV rows with marketplace information."""
imported = 0
updated = 0
errors = 0
logger.info(
f"Processing batch {batch_num} with {len(batch_df)} rows for {marketplace} -> {shop_name}"
f"Processing batch {batch_num} with {len(batch_df)} rows for "
f"{marketplace} -> {shop_name}"
)
for index, row in batch_df.iterrows():
@@ -285,7 +293,8 @@ class CSVProcessor:
existing_product.updated_at = datetime.utcnow()
updated += 1
logger.debug(
f"Updated product {product_data['product_id']} for {marketplace} and shop {shop_name}"
f"Updated product {product_data['product_id']} for "
f"{marketplace} and shop {shop_name}"
)
else:
# Create new product
@@ -299,8 +308,8 @@ class CSVProcessor:
db.add(new_product)
imported += 1
logger.debug(
f"Imported new product {product_data['product_id']} for {marketplace} and shop "
f"{shop_name}"
f"Imported new product {product_data['product_id']} "
f"for {marketplace} and shop {shop_name}"
)
except Exception as e:

View File

@@ -1,4 +1,12 @@
# utils/data_processing.py
"""Data processing utilities for GTIN validation and price parsing.
This module provides classes and functions for:
- GTIN (Global Trade Item Number) validation and normalization
- Price parsing with currency detection
- Data cleaning and validation utilities
"""
import logging
import re
from typing import Optional, Tuple
@@ -9,14 +17,15 @@ logger = logging.getLogger(__name__)
class GTINProcessor:
"""Handles GTIN normalization and validation"""
"""Handles GTIN normalization and validation."""
VALID_LENGTHS = [8, 12, 13, 14]
def normalize(self, gtin_value: any) -> Optional[str]:
"""
Normalize GTIN to proper format
Returns None for invalid GTINs
Normalize GTIN to proper format.
Returns None for invalid GTINs.
"""
if not gtin_value or pd.isna(gtin_value):
return None
@@ -63,14 +72,14 @@ class GTINProcessor:
return None
def validate(self, gtin: str) -> bool:
    """Return ``True`` when *gtin* is non-empty, all digits, and one of the
    accepted GTIN lengths (see ``VALID_LENGTHS``)."""
    if not gtin:
        # Empty string (or None) can never be a valid GTIN.
        return False
    length_ok = len(gtin) in self.VALID_LENGTHS
    return length_ok and gtin.isdigit()
class PriceProcessor:
"""Handles price parsing and currency extraction"""
"""Handles price parsing and currency extraction."""
CURRENCY_PATTERNS = {
# Amount followed by currency
@@ -92,7 +101,8 @@ class PriceProcessor:
self, price_str: any
) -> Tuple[Optional[str], Optional[str]]:
"""
Parse price string into (price, currency) tuple
Parse price string into (price, currency) tuple.
Returns (None, None) if parsing fails
"""
if not price_str or pd.isna(price_str):

View File

@@ -1,4 +1,12 @@
# utils/database.py
"""Database utilities ....
This module provides classes and functions for:
- ....
- ....
- ....
"""
import logging
from sqlalchemy import create_engine
@@ -9,7 +17,7 @@ logger = logging.getLogger(__name__)
def get_db_engine(database_url: str):
"""Create database engine with connection pooling"""
"""Create database engine with connection pooling."""
if database_url.startswith("sqlite"):
# SQLite configuration
engine = create_engine(
@@ -26,10 +34,10 @@ def get_db_engine(database_url: str):
echo=False,
)
logger.info(f"Database engine created for: {database_url.split('@')[0]}@...")
logger.info(f"Database engine created for: " f"{database_url.split('@')[0]}@...")
return engine
def get_session_local(engine):
    """Build and return a SQLAlchemy session factory bound to *engine*.

    Sessions produced by the factory require explicit commits
    (``autocommit=False``) and do not autoflush pending changes
    (``autoflush=False``).
    """
    session_factory = sessionmaker(
        autocommit=False,
        autoflush=False,
        bind=engine,
    )
    return session_factory