# utils/csv_processor.py
import pandas as pd
import requests
from io import StringIO
from typing import Dict, Any, Optional
from sqlalchemy.orm import Session
from models.database_models import Product
from datetime import datetime
import logging

logger = logging.getLogger(__name__)


class CSVProcessor:
    """Handles CSV import with robust parsing and batching.

    Downloads a product feed from a URL, parses it tolerantly (multiple
    encodings and separators), normalizes column names to the canonical
    ``Product`` field names, and upserts rows into the database in batches.
    """

    # Encodings attempted, in order, when decoding a downloaded feed.
    ENCODINGS = ['utf-8', 'latin-1', 'iso-8859-1', 'cp1252', 'utf-8-sig']

    # Maps vendor/feed column headers to canonical Product field names.
    COLUMN_MAPPING = {
        # Standard variations
        'id': 'product_id',
        'ID': 'product_id',
        'Product ID': 'product_id',
        'name': 'title',
        'Name': 'title',
        'product_name': 'title',
        'Product Name': 'title',
        # Google Shopping feed standard
        'g:id': 'product_id',
        'g:title': 'title',
        'g:description': 'description',
        'g:link': 'link',
        'g:image_link': 'image_link',
        'g:availability': 'availability',
        'g:price': 'price',
        'g:brand': 'brand',
        'g:gtin': 'gtin',
        'g:mpn': 'mpn',
        'g:condition': 'condition',
        'g:adult': 'adult',
        'g:multipack': 'multipack',
        'g:is_bundle': 'is_bundle',
        'g:age_group': 'age_group',
        'g:color': 'color',
        'g:gender': 'gender',
        'g:material': 'material',
        'g:pattern': 'pattern',
        'g:size': 'size',
        'g:size_type': 'size_type',
        'g:size_system': 'size_system',
        'g:item_group_id': 'item_group_id',
        'g:google_product_category': 'google_product_category',
        'g:product_type': 'product_type',
        'g:custom_label_0': 'custom_label_0',
        'g:custom_label_1': 'custom_label_1',
        'g:custom_label_2': 'custom_label_2',
        'g:custom_label_3': 'custom_label_3',
        'g:custom_label_4': 'custom_label_4',
        # Handle complex shipping column
        'shipping(country:price:max_handling_time:min_transit_time:max_transit_time)': 'shipping',
    }

    def __init__(self):
        # Imported at call time, presumably to avoid a circular import at
        # module load — TODO confirm before hoisting to module level.
        from utils.data_processing import GTINProcessor, PriceProcessor
        self.gtin_processor = GTINProcessor()
        self.price_processor = PriceProcessor()

    def download_csv(self, url: str) -> str:
        """Download a CSV feed and decode it, trying several encodings.

        Args:
            url: HTTP(S) location of the feed.

        Returns:
            The decoded CSV text. Falls back to UTF-8 with undecodable
            bytes dropped if no listed encoding decodes cleanly.

        Raises:
            requests.RequestException: on network/HTTP failure (logged and
                re-raised).
        """
        try:
            response = requests.get(url, timeout=30)
            response.raise_for_status()
            content = response.content

            # Try different encodings in preference order.
            for encoding in self.ENCODINGS:
                try:
                    decoded_content = content.decode(encoding)
                    logger.info(f"Successfully decoded CSV with encoding: {encoding}")
                    return decoded_content
                except UnicodeDecodeError:
                    continue

            # Last resort: lossy decode so the import can still proceed.
            decoded_content = content.decode('utf-8', errors='ignore')
            logger.warning("Used UTF-8 with error ignoring for CSV decoding")
            return decoded_content
        except requests.RequestException as e:
            logger.error(f"Error downloading CSV: {e}")
            raise

    def parse_csv(self, csv_content: str) -> pd.DataFrame:
        """Parse CSV text, trying several separators until one succeeds.

        Attempts pandas' separator auto-detection first, then the common
        explicit separators. Malformed lines are skipped rather than
        aborting the parse.

        Raises:
            pd.errors.ParserError: if no configuration can parse the text.
        """
        parsing_configs = [
            # Try auto-detection first (sep=None requires the python engine).
            {'sep': None, 'engine': 'python'},
            # Try semicolon (common in European CSVs)
            {'sep': ';', 'engine': 'python'},
            # Try comma
            {'sep': ',', 'engine': 'python'},
            # Try tab
            {'sep': '\t', 'engine': 'python'},
        ]

        for config in parsing_configs:
            try:
                df = pd.read_csv(
                    StringIO(csv_content),
                    on_bad_lines='skip',
                    quotechar='"',
                    skip_blank_lines=True,
                    skipinitialspace=True,
                    **config
                )
                logger.info(f"Successfully parsed CSV with config: {config}")
                return df
            except pd.errors.ParserError:
                continue

        raise pd.errors.ParserError("Could not parse CSV with any configuration")

    def normalize_columns(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of *df* with headers stripped and mapped to
        canonical ``Product`` field names via COLUMN_MAPPING.

        Unknown columns pass through unchanged. (Non-mutating: the caller's
        DataFrame is left untouched.)
        """
        # Strip surrounding whitespace, then apply the canonical mapping.
        # rename() returns a new frame, so the input is not modified in place.
        df = df.rename(columns=lambda c: c.strip()).rename(columns=self.COLUMN_MAPPING)
        logger.info(f"Normalized columns: {list(df.columns)}")
        return df

    def process_row(self, row_data: Dict[str, Any]) -> Dict[str, Any]:
        """Normalize a single feed row into Product-ready values.

        - NaN values become None.
        - GTIN is normalized via GTINProcessor.
        - price / sale_price strings are split into numeric value (+ currency
          for price).
        - MPNs that pandas read as floats lose their trailing ``.0``.
        - multipack is coerced to int (None if not coercible).
        """
        # Replace pandas NaN with None so the ORM stores proper NULLs.
        processed_data = {k: (v if pd.notna(v) else None) for k, v in row_data.items()}

        # Process GTIN
        if processed_data.get('gtin'):
            processed_data['gtin'] = self.gtin_processor.normalize(processed_data['gtin'])

        # Process price and currency
        if processed_data.get('price'):
            parsed_price, currency = self.price_processor.parse_price_currency(processed_data['price'])
            processed_data['price'] = parsed_price
            processed_data['currency'] = currency

        # Process sale_price (currency is assumed to match 'price', so discarded)
        if processed_data.get('sale_price'):
            parsed_sale_price, _ = self.price_processor.parse_price_currency(processed_data['sale_price'])
            processed_data['sale_price'] = parsed_sale_price

        # Clean MPN: pandas reads numeric-looking MPNs as floats ("123.0").
        if processed_data.get('mpn'):
            mpn_str = str(processed_data['mpn']).strip()
            if mpn_str.endswith('.0'):
                processed_data['mpn'] = mpn_str[:-2]

        # Coerce multipack to int; go through float to accept "2.0"-style input.
        if processed_data.get('multipack') is not None:
            try:
                processed_data['multipack'] = int(float(processed_data['multipack']))
            except (ValueError, TypeError):
                processed_data['multipack'] = None

        return processed_data

    async def process_csv_from_url(self, url: str, batch_size: int, db: Session) -> Dict[str, int]:
        """Import a CSV feed from *url* into the database in batches.

        NOTE(review): declared ``async`` but contains no ``await`` and does
        blocking I/O (requests + synchronous Session); consider offloading to
        a thread executor. Kept async so existing awaiting callers still work.

        Args:
            url: feed location.
            batch_size: number of rows per commit.
            db: SQLAlchemy session used for the upserts.

        Returns:
            Dict with 'imported', 'updated', 'errors' and 'total_processed'
            counts. Rows in a batch whose commit fails are counted as errors
            (the whole batch is rolled back).
        """
        # Download and parse CSV
        csv_content = self.download_csv(url)
        df = self.parse_csv(csv_content)
        df = self.normalize_columns(df)

        logger.info(f"Processing CSV with {len(df)} rows")

        imported = 0
        updated = 0
        errors = 0

        # Process in batches
        for i in range(0, len(df), batch_size):
            batch_df = df.iloc[i:i + batch_size]
            batch_imported, batch_updated, batch_errors = self._process_batch(batch_df, db)

            # Commit batch. Counters are only accumulated after a successful
            # commit; otherwise a rolled-back batch would be double-counted
            # (once as imported/updated, once again as errors).
            try:
                db.commit()
                imported += batch_imported
                updated += batch_updated
                errors += batch_errors
                logger.info(
                    f"Processed batch {i // batch_size + 1}: +{batch_imported} imported, +{batch_updated} updated, +{batch_errors} errors")
            except Exception as e:
                db.rollback()
                logger.error(f"Batch commit failed: {e}")
                # Every row in the batch was rolled back, so none landed.
                errors += len(batch_df)

        return {
            "imported": imported,
            "updated": updated,
            "errors": errors,
            "total_processed": imported + updated + errors
        }

    def _process_batch(self, df_batch: pd.DataFrame, db: Session) -> tuple:
        """Upsert one batch of rows into the session (no commit here).

        Returns:
            (imported, updated, errors) counts for the batch. Rows missing
            product_id or title are counted as errors and skipped.
        """
        imported = 0
        updated = 0
        errors = 0

        for _, row in df_batch.iterrows():
            try:
                product_data = self.process_row(row.to_dict())

                # Validate required fields
                product_id = product_data.get('product_id')
                title = product_data.get('title')
                if not product_id or not title:
                    errors += 1
                    continue

                # Check for existing product
                existing_product = db.query(Product).filter(
                    Product.product_id == product_id
                ).first()

                if existing_product:
                    # Update existing; never overwrite identity/audit columns.
                    for key, value in product_data.items():
                        if key not in ['id', 'created_at'] and hasattr(existing_product, key):
                            setattr(existing_product, key, value)
                    # NOTE(review): datetime.utcnow() is deprecated (naive
                    # timestamp); switch to datetime.now(timezone.utc) once
                    # the column's tz-awareness is confirmed.
                    existing_product.updated_at = datetime.utcnow()
                    updated += 1
                else:
                    # Create new: keep only keys that are real Product columns.
                    filtered_data = {k: v for k, v in product_data.items()
                                     if k not in ['id', 'created_at', 'updated_at'] and hasattr(Product, k)}
                    new_product = Product(**filtered_data)
                    db.add(new_product)
                    imported += 1

            except Exception as e:
                logger.error(f"Error processing row: {e}")
                errors += 1

        return imported, updated, errors