Initial commit
utils/csv_processor.py (new file, +253 lines)

@@ -0,0 +1,253 @@
# utils/csv_processor.py

import pandas as pd
import requests
from io import StringIO
from typing import Dict, Any, Optional
from sqlalchemy.orm import Session
from models.database_models import Product
from datetime import datetime
import logging

logger = logging.getLogger(__name__)


class CSVProcessor:
    """Handles CSV import with robust parsing and batching"""

    # Ordered strict-first: 'latin-1' decodes any byte sequence, so it must
    # come last or the encodings after it can never be reached.
    ENCODINGS = ['utf-8-sig', 'utf-8', 'cp1252', 'iso-8859-1', 'latin-1']

    COLUMN_MAPPING = {
        # Standard variations
        'id': 'product_id',
        'ID': 'product_id',
        'Product ID': 'product_id',
        'name': 'title',
        'Name': 'title',
        'product_name': 'title',
        'Product Name': 'title',

        # Google Shopping feed standard
        'g:id': 'product_id',
        'g:title': 'title',
        'g:description': 'description',
        'g:link': 'link',
        'g:image_link': 'image_link',
        'g:availability': 'availability',
        'g:price': 'price',
        'g:brand': 'brand',
        'g:gtin': 'gtin',
        'g:mpn': 'mpn',
        'g:condition': 'condition',
        'g:adult': 'adult',
        'g:multipack': 'multipack',
        'g:is_bundle': 'is_bundle',
        'g:age_group': 'age_group',
        'g:color': 'color',
        'g:gender': 'gender',
        'g:material': 'material',
        'g:pattern': 'pattern',
        'g:size': 'size',
        'g:size_type': 'size_type',
        'g:size_system': 'size_system',
        'g:item_group_id': 'item_group_id',
        'g:google_product_category': 'google_product_category',
        'g:product_type': 'product_type',
        'g:custom_label_0': 'custom_label_0',
        'g:custom_label_1': 'custom_label_1',
        'g:custom_label_2': 'custom_label_2',
        'g:custom_label_3': 'custom_label_3',
        'g:custom_label_4': 'custom_label_4',

        # Handle complex shipping column
        'shipping(country:price:max_handling_time:min_transit_time:max_transit_time)': 'shipping'
    }

    def __init__(self):
        from utils.data_processing import GTINProcessor, PriceProcessor
        self.gtin_processor = GTINProcessor()
        self.price_processor = PriceProcessor()

    def download_csv(self, url: str) -> str:
        """Download and decode CSV with multiple encoding attempts"""
        try:
            response = requests.get(url, timeout=30)
            response.raise_for_status()

            content = response.content

            # Try different encodings
            for encoding in self.ENCODINGS:
                try:
                    decoded_content = content.decode(encoding)
                    logger.info(f"Successfully decoded CSV with encoding: {encoding}")
                    return decoded_content
                except UnicodeDecodeError:
                    continue

            # Fallback with error ignoring
            decoded_content = content.decode('utf-8', errors='ignore')
            logger.warning("Used UTF-8 with error ignoring for CSV decoding")
            return decoded_content

        except requests.RequestException as e:
            logger.error(f"Error downloading CSV: {e}")
            raise

    def parse_csv(self, csv_content: str) -> pd.DataFrame:
        """Parse CSV with multiple separator attempts"""
        parsing_configs = [
            # Try auto-detection first
            {'sep': None, 'engine': 'python'},
            # Try semicolon (common in European CSVs)
            {'sep': ';', 'engine': 'python'},
            # Try comma
            {'sep': ',', 'engine': 'python'},
            # Try tab
            {'sep': '\t', 'engine': 'python'},
        ]

        for config in parsing_configs:
            try:
                df = pd.read_csv(
                    StringIO(csv_content),
                    on_bad_lines='skip',
                    quotechar='"',
                    skip_blank_lines=True,
                    skipinitialspace=True,
                    **config
                )
                logger.info(f"Successfully parsed CSV with config: {config}")
                return df
            except pd.errors.ParserError:
                continue

        raise pd.errors.ParserError("Could not parse CSV with any configuration")

    def normalize_columns(self, df: pd.DataFrame) -> pd.DataFrame:
        """Normalize column names using mapping"""
        # Clean column names
        df.columns = df.columns.str.strip()

        # Apply mapping
        df = df.rename(columns=self.COLUMN_MAPPING)

        logger.info(f"Normalized columns: {list(df.columns)}")
        return df

    def process_row(self, row_data: Dict[str, Any]) -> Dict[str, Any]:
        """Process a single row with data normalization"""
        # Handle NaN values
        processed_data = {k: (v if pd.notna(v) else None) for k, v in row_data.items()}

        # Process GTIN
        if processed_data.get('gtin'):
            processed_data['gtin'] = self.gtin_processor.normalize(processed_data['gtin'])

        # Process price and currency
        if processed_data.get('price'):
            parsed_price, currency = self.price_processor.parse_price_currency(processed_data['price'])
            processed_data['price'] = parsed_price
            processed_data['currency'] = currency

        # Process sale_price
        if processed_data.get('sale_price'):
            parsed_sale_price, _ = self.price_processor.parse_price_currency(processed_data['sale_price'])
            processed_data['sale_price'] = parsed_sale_price

        # Clean MPN (remove .0 endings)
        if processed_data.get('mpn'):
            mpn_str = str(processed_data['mpn']).strip()
            if mpn_str.endswith('.0'):
                processed_data['mpn'] = mpn_str[:-2]

        # Handle multipack type conversion
        if processed_data.get('multipack') is not None:
            try:
                processed_data['multipack'] = int(float(processed_data['multipack']))
            except (ValueError, TypeError):
                processed_data['multipack'] = None

        return processed_data
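
    # Illustrative example (assuming PriceProcessor.parse_price_currency
    # returns an (amount, currency) tuple, as the calls above expect):
    #
    #   processor.process_row({'product_id': 'SKU-1', 'title': 'Mug',
    #                          'price': '19.99 EUR', 'mpn': 123.0})
    #   -> {'product_id': 'SKU-1', 'title': 'Mug', 'price': 19.99,
    #       'currency': 'EUR', 'mpn': '123'}
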
    async def process_csv_from_url(self, url: str, batch_size: int, db: Session) -> Dict[str, int]:
        """Process CSV import with batching"""
        # Download and parse CSV
        csv_content = self.download_csv(url)
        df = self.parse_csv(csv_content)
        df = self.normalize_columns(df)

        logger.info(f"Processing CSV with {len(df)} rows")

        imported = 0
        updated = 0
        errors = 0

        # Process in batches
        for i in range(0, len(df), batch_size):
            batch_df = df.iloc[i:i + batch_size]
            batch_imported, batch_updated, batch_errors = self._process_batch(batch_df, db)

            imported += batch_imported
            updated += batch_updated
            errors += batch_errors

            # Commit batch
            try:
                db.commit()
                logger.info(
                    f"Processed batch {i // batch_size + 1}: +{batch_imported} imported, "
                    f"+{batch_updated} updated, +{batch_errors} errors")
            except Exception as e:
                db.rollback()
                logger.error(f"Batch commit failed: {e}")
                errors += len(batch_df)

        return {
            "imported": imported,
            "updated": updated,
            "errors": errors,
            "total_processed": imported + updated + errors
        }

    def _process_batch(self, df_batch: pd.DataFrame, db: Session) -> tuple:
        """Process a single batch of rows"""
        imported = 0
        updated = 0
        errors = 0

        for _, row in df_batch.iterrows():
            try:
                product_data = self.process_row(row.to_dict())

                # Validate required fields
                product_id = product_data.get('product_id')
                title = product_data.get('title')

                if not product_id or not title:
                    errors += 1
                    continue

                # Check for existing product
                existing_product = db.query(Product).filter(
                    Product.product_id == product_id
                ).first()

                if existing_product:
                    # Update existing
                    for key, value in product_data.items():
                        if key not in ['id', 'created_at'] and hasattr(existing_product, key):
                            setattr(existing_product, key, value)
                    existing_product.updated_at = datetime.utcnow()
                    updated += 1
                else:
                    # Create new
                    filtered_data = {k: v for k, v in product_data.items()
                                     if k not in ['id', 'created_at', 'updated_at'] and hasattr(Product, k)}
                    new_product = Product(**filtered_data)
                    db.add(new_product)
                    imported += 1

            except Exception as e:
                logger.error(f"Error processing row: {e}")
                errors += 1

        return imported, updated, errors
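
# A minimal usage sketch (illustrative; `SessionLocal` and the feed URL are
# assumptions, not part of this module):
#
#   import asyncio
#   from database import SessionLocal  # hypothetical SQLAlchemy session factory
#
#   async def run_import():
#       db = SessionLocal()
#       try:
#           stats = await CSVProcessor().process_csv_from_url(
#               "https://example.com/feed.csv", batch_size=500, db=db)
#           logger.info("Import finished: %s", stats)
#       finally:
#           db.close()
#
#   asyncio.run(run_import())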