code quality run

This commit is contained in:
2025-09-13 21:58:54 +02:00
parent 0dfd885847
commit 3eb18ef91e
63 changed files with 1802 additions and 1289 deletions

View File

@@ -1,14 +1,15 @@
# utils/csv_processor.py
import logging
from datetime import datetime
from io import StringIO
from typing import Any, Dict
import pandas as pd
import requests
from io import StringIO
from typing import Dict, Any
from sqlalchemy import literal
from sqlalchemy.orm import Session
from models.database_models import Product
from datetime import datetime
import logging
logger = logging.getLogger(__name__)
@@ -16,67 +17,66 @@ logger = logging.getLogger(__name__)
class CSVProcessor:
"""Handles CSV import with robust parsing and batching"""
ENCODINGS = ['utf-8', 'latin-1', 'iso-8859-1', 'cp1252', 'utf-8-sig']
ENCODINGS = ["utf-8", "latin-1", "iso-8859-1", "cp1252", "utf-8-sig"]
PARSING_CONFIGS = [
# Try auto-detection first
{'sep': None, 'engine': 'python'},
{"sep": None, "engine": "python"},
# Try semicolon (common in European CSVs)
{'sep': ';', 'engine': 'python'},
{"sep": ";", "engine": "python"},
# Try comma
{'sep': ',', 'engine': 'python'},
{"sep": ",", "engine": "python"},
# Try tab
{'sep': '\t', 'engine': 'python'},
{"sep": "\t", "engine": "python"},
]
COLUMN_MAPPING = {
# Standard variations
'id': 'product_id',
'ID': 'product_id',
'Product ID': 'product_id',
'name': 'title',
'Name': 'title',
'product_name': 'title',
'Product Name': 'title',
"id": "product_id",
"ID": "product_id",
"Product ID": "product_id",
"name": "title",
"Name": "title",
"product_name": "title",
"Product Name": "title",
# Google Shopping feed standard
'g:id': 'product_id',
'g:title': 'title',
'g:description': 'description',
'g:link': 'link',
'g:image_link': 'image_link',
'g:availability': 'availability',
'g:price': 'price',
'g:brand': 'brand',
'g:gtin': 'gtin',
'g:mpn': 'mpn',
'g:condition': 'condition',
'g:adult': 'adult',
'g:multipack': 'multipack',
'g:is_bundle': 'is_bundle',
'g:age_group': 'age_group',
'g:color': 'color',
'g:gender': 'gender',
'g:material': 'material',
'g:pattern': 'pattern',
'g:size': 'size',
'g:size_type': 'size_type',
'g:size_system': 'size_system',
'g:item_group_id': 'item_group_id',
'g:google_product_category': 'google_product_category',
'g:product_type': 'product_type',
'g:custom_label_0': 'custom_label_0',
'g:custom_label_1': 'custom_label_1',
'g:custom_label_2': 'custom_label_2',
'g:custom_label_3': 'custom_label_3',
'g:custom_label_4': 'custom_label_4',
"g:id": "product_id",
"g:title": "title",
"g:description": "description",
"g:link": "link",
"g:image_link": "image_link",
"g:availability": "availability",
"g:price": "price",
"g:brand": "brand",
"g:gtin": "gtin",
"g:mpn": "mpn",
"g:condition": "condition",
"g:adult": "adult",
"g:multipack": "multipack",
"g:is_bundle": "is_bundle",
"g:age_group": "age_group",
"g:color": "color",
"g:gender": "gender",
"g:material": "material",
"g:pattern": "pattern",
"g:size": "size",
"g:size_type": "size_type",
"g:size_system": "size_system",
"g:item_group_id": "item_group_id",
"g:google_product_category": "google_product_category",
"g:product_type": "product_type",
"g:custom_label_0": "custom_label_0",
"g:custom_label_1": "custom_label_1",
"g:custom_label_2": "custom_label_2",
"g:custom_label_3": "custom_label_3",
"g:custom_label_4": "custom_label_4",
# Handle complex shipping column
'shipping(country:price:max_handling_time:min_transit_time:max_transit_time)': 'shipping'
"shipping(country:price:max_handling_time:min_transit_time:max_transit_time)": "shipping",
}
def __init__(self) -> None:
    """Create the CSV processor and attach its data-cleaning helpers.

    GTINProcessor normalizes GTIN values; PriceProcessor parses
    price/currency strings. They are imported here rather than at module
    top — presumably to avoid a circular import; confirm before hoisting.
    """
    from utils.data_processing import GTINProcessor, PriceProcessor

    self.price_processor = PriceProcessor()
    self.gtin_processor = GTINProcessor()
@@ -98,7 +98,7 @@ class CSVProcessor:
continue
# Fallback with error ignoring
decoded_content = content.decode('utf-8', errors='ignore')
decoded_content = content.decode("utf-8", errors="ignore")
logger.warning("Used UTF-8 with error ignoring for CSV decoding")
return decoded_content
@@ -113,11 +113,11 @@ class CSVProcessor:
try:
df = pd.read_csv(
StringIO(csv_content),
on_bad_lines='skip',
on_bad_lines="skip",
quotechar='"',
skip_blank_lines=True,
skipinitialspace=True,
**config
**config,
)
logger.info(f"Successfully parsed CSV with config: {config}")
return df
@@ -143,42 +143,43 @@ class CSVProcessor:
processed_data = {k: (v if pd.notna(v) else None) for k, v in row_data.items()}
# Process GTIN
if processed_data.get('gtin'):
processed_data['gtin'] = self.gtin_processor.normalize(processed_data['gtin'])
if processed_data.get("gtin"):
processed_data["gtin"] = self.gtin_processor.normalize(
processed_data["gtin"]
)
# Process price and currency
if processed_data.get('price'):
parsed_price, currency = self.price_processor.parse_price_currency(processed_data['price'])
processed_data['price'] = parsed_price
processed_data['currency'] = currency
if processed_data.get("price"):
parsed_price, currency = self.price_processor.parse_price_currency(
processed_data["price"]
)
processed_data["price"] = parsed_price
processed_data["currency"] = currency
# Process sale_price
if processed_data.get('sale_price'):
parsed_sale_price, _ = self.price_processor.parse_price_currency(processed_data['sale_price'])
processed_data['sale_price'] = parsed_sale_price
if processed_data.get("sale_price"):
parsed_sale_price, _ = self.price_processor.parse_price_currency(
processed_data["sale_price"]
)
processed_data["sale_price"] = parsed_sale_price
# Clean MPN (remove .0 endings)
if processed_data.get('mpn'):
mpn_str = str(processed_data['mpn']).strip()
if mpn_str.endswith('.0'):
processed_data['mpn'] = mpn_str[:-2]
if processed_data.get("mpn"):
mpn_str = str(processed_data["mpn"]).strip()
if mpn_str.endswith(".0"):
processed_data["mpn"] = mpn_str[:-2]
# Handle multipack type conversion
if processed_data.get('multipack') is not None:
if processed_data.get("multipack") is not None:
try:
processed_data['multipack'] = int(float(processed_data['multipack']))
processed_data["multipack"] = int(float(processed_data["multipack"]))
except (ValueError, TypeError):
processed_data['multipack'] = None
processed_data["multipack"] = None
return processed_data
async def process_marketplace_csv_from_url(
self,
url: str,
marketplace: str,
shop_name: str,
batch_size: int,
db: Session
self, url: str, marketplace: str, shop_name: str, batch_size: int, db: Session
) -> Dict[str, Any]:
"""
Process CSV from URL with marketplace and shop information
@@ -194,7 +195,9 @@ class CSVProcessor:
Dictionary with processing results
"""
logger.info(f"Starting marketplace CSV import from {url} for {marketplace} -> {shop_name}")
logger.info(
f"Starting marketplace CSV import from {url} for {marketplace} -> {shop_name}"
)
# Download and parse CSV
csv_content = self.download_csv(url)
df = self.parse_csv(csv_content)
@@ -208,40 +211,42 @@ class CSVProcessor:
# Process in batches
for i in range(0, len(df), batch_size):
batch_df = df.iloc[i:i + batch_size]
batch_df = df.iloc[i : i + batch_size]
batch_result = await self._process_marketplace_batch(
batch_df, marketplace, shop_name, db, i // batch_size + 1
)
imported += batch_result['imported']
updated += batch_result['updated']
errors += batch_result['errors']
imported += batch_result["imported"]
updated += batch_result["updated"]
errors += batch_result["errors"]
logger.info(f"Processed batch {i // batch_size + 1}: {batch_result}")
return {
'total_processed': imported + updated + errors,
'imported': imported,
'updated': updated,
'errors': errors,
'marketplace': marketplace,
'shop_name': shop_name
"total_processed": imported + updated + errors,
"imported": imported,
"updated": updated,
"errors": errors,
"marketplace": marketplace,
"shop_name": shop_name,
}
async def _process_marketplace_batch(
self,
batch_df: pd.DataFrame,
marketplace: str,
shop_name: str,
db: Session,
batch_num: int
self,
batch_df: pd.DataFrame,
marketplace: str,
shop_name: str,
db: Session,
batch_num: int,
) -> Dict[str, int]:
"""Process a batch of CSV rows with marketplace information"""
imported = 0
updated = 0
errors = 0
logger.info(f"Processing batch {batch_num} with {len(batch_df)} rows for {marketplace} -> {shop_name}")
logger.info(
f"Processing batch {batch_num} with {len(batch_df)} rows for {marketplace} -> {shop_name}"
)
for index, row in batch_df.iterrows():
try:
@@ -249,42 +254,54 @@ class CSVProcessor:
product_data = self._clean_row_data(row.to_dict())
# Add marketplace and shop information
product_data['marketplace'] = marketplace
product_data['shop_name'] = shop_name
product_data["marketplace"] = marketplace
product_data["shop_name"] = shop_name
# Validate required fields
if not product_data.get('product_id'):
if not product_data.get("product_id"):
logger.warning(f"Row {index}: Missing product_id, skipping")
errors += 1
continue
if not product_data.get('title'):
if not product_data.get("title"):
logger.warning(f"Row {index}: Missing title, skipping")
errors += 1
continue
# Check if product exists
existing_product = db.query(Product).filter(
Product.product_id == literal(product_data['product_id'])
).first()
existing_product = (
db.query(Product)
.filter(Product.product_id == literal(product_data["product_id"]))
.first()
)
if existing_product:
# Update existing product
for key, value in product_data.items():
if key not in ['id', 'created_at'] and hasattr(existing_product, key):
if key not in ["id", "created_at"] and hasattr(
existing_product, key
):
setattr(existing_product, key, value)
existing_product.updated_at = datetime.utcnow()
updated += 1
logger.debug(f"Updated product {product_data['product_id']} for {marketplace} and shop {shop_name}")
logger.debug(
f"Updated product {product_data['product_id']} for {marketplace} and shop {shop_name}"
)
else:
# Create new product
filtered_data = {k: v for k, v in product_data.items()
if k not in ['id', 'created_at', 'updated_at'] and hasattr(Product, k)}
filtered_data = {
k: v
for k, v in product_data.items()
if k not in ["id", "created_at", "updated_at"]
and hasattr(Product, k)
}
new_product = Product(**filtered_data)
db.add(new_product)
imported += 1
logger.debug(f"Imported new product {product_data['product_id']} for {marketplace} and shop "
f"{shop_name}")
logger.debug(
f"Imported new product {product_data['product_id']} for {marketplace} and shop "
f"{shop_name}"
)
except Exception as e:
logger.error(f"Error processing row: {e}")
@@ -303,8 +320,4 @@ class CSVProcessor:
imported = 0
updated = 0
return {
'imported': imported,
'updated': updated,
'errors': errors
}
return {"imported": imported, "updated": updated, "errors": errors}