# utils/csv_processor.py
import pandas as pd
import requests
from io import StringIO
from typing import Dict, Any, Optional
from sqlalchemy.orm import Session
from models.database_models import Product
from datetime import datetime
import logging

logger = logging.getLogger(__name__)


class CSVProcessor:
    """Handles CSV import with robust parsing and batching.

    Downloads a product feed, decodes it despite unknown encodings,
    parses it despite unknown separators, normalizes column names
    (including Google Shopping ``g:``-prefixed headers) and upserts
    rows into the ``Product`` table in batches.
    """

    # Decode attempts, in order. 'utf-8-sig' must precede 'utf-8' so a UTF-8
    # BOM is stripped instead of leaking into the first column name, and
    # 'latin-1' must come last: it accepts every possible byte sequence, so
    # any encoding listed after it would be unreachable dead code.
    ENCODINGS = ['utf-8-sig', 'utf-8', 'cp1252', 'iso-8859-1', 'latin-1']

    # Maps feed header variants to canonical Product attribute names.
    COLUMN_MAPPING = {
        # Standard variations
        'id': 'product_id',
        'ID': 'product_id',
        'Product ID': 'product_id',
        'name': 'title',
        'Name': 'title',
        'product_name': 'title',
        'Product Name': 'title',

        # Google Shopping feed standard
        'g:id': 'product_id',
        'g:title': 'title',
        'g:description': 'description',
        'g:link': 'link',
        'g:image_link': 'image_link',
        'g:availability': 'availability',
        'g:price': 'price',
        'g:brand': 'brand',
        'g:gtin': 'gtin',
        'g:mpn': 'mpn',
        'g:condition': 'condition',
        'g:adult': 'adult',
        'g:multipack': 'multipack',
        'g:is_bundle': 'is_bundle',
        'g:age_group': 'age_group',
        'g:color': 'color',
        'g:gender': 'gender',
        'g:material': 'material',
        'g:pattern': 'pattern',
        'g:size': 'size',
        'g:size_type': 'size_type',
        'g:size_system': 'size_system',
        'g:item_group_id': 'item_group_id',
        'g:google_product_category': 'google_product_category',
        'g:product_type': 'product_type',
        'g:custom_label_0': 'custom_label_0',
        'g:custom_label_1': 'custom_label_1',
        'g:custom_label_2': 'custom_label_2',
        'g:custom_label_3': 'custom_label_3',
        'g:custom_label_4': 'custom_label_4',

        # Handle complex shipping column
        'shipping(country:price:max_handling_time:min_transit_time:max_transit_time)': 'shipping'
    }

    def __init__(self):
        # Function-scope import — presumably to break a circular import
        # between utils modules; confirm before hoisting to module level.
        from utils.data_processing import GTINProcessor, PriceProcessor
        self.gtin_processor = GTINProcessor()
        self.price_processor = PriceProcessor()

    def download_csv(self, url: str) -> str:
        """Download a CSV and decode it, trying several encodings in order.

        Returns the decoded text. Re-raises ``requests.RequestException``
        on network/HTTP errors (after logging).
        """
        try:
            response = requests.get(url, timeout=30)
            response.raise_for_status()

            content = response.content

            # Try encodings in ENCODINGS order; the first clean decode wins.
            for encoding in self.ENCODINGS:
                try:
                    decoded_content = content.decode(encoding)
                    logger.info(f"Successfully decoded CSV with encoding: {encoding}")
                    return decoded_content
                except UnicodeDecodeError:
                    continue

            # Unreachable in practice ('latin-1' decodes any byte sequence),
            # kept as a defensive fallback in case ENCODINGS changes.
            decoded_content = content.decode('utf-8', errors='ignore')
            logger.warning("Used UTF-8 with error ignoring for CSV decoding")
            return decoded_content

        except requests.RequestException as e:
            logger.error(f"Error downloading CSV: {e}")
            raise

    def parse_csv(self, csv_content: str) -> pd.DataFrame:
        """Parse CSV text, trying auto-detection then common separators.

        A wrong separator can still "parse" by mashing every field into a
        single column, so the first multi-column result is preferred; a
        single-column parse is kept only as a last-resort fallback (it may
        be a legitimate one-column file).

        Raises:
            pd.errors.ParserError: if no configuration parses the content.
        """
        parsing_configs = [
            # Auto-detection via csv.Sniffer (requires the python engine)
            {'sep': None, 'engine': 'python'},
            # Semicolon (common in European CSVs)
            {'sep': ';', 'engine': 'python'},
            # Comma
            {'sep': ',', 'engine': 'python'},
            # Tab
            {'sep': '\t', 'engine': 'python'},
        ]

        fallback_df: Optional[pd.DataFrame] = None
        for config in parsing_configs:
            try:
                df = pd.read_csv(
                    StringIO(csv_content),
                    on_bad_lines='skip',
                    quotechar='"',
                    skip_blank_lines=True,
                    skipinitialspace=True,
                    **config
                )
            except pd.errors.ParserError:
                continue

            if df.shape[1] > 1:
                logger.info(f"Successfully parsed CSV with config: {config}")
                return df

            # Single column: possibly the wrong separator. Remember the first
            # such result and keep trying the remaining configurations.
            if fallback_df is None:
                fallback_df = df

        if fallback_df is not None:
            logger.info("Parsed CSV as a single-column frame; no separator yielded more columns")
            return fallback_df

        raise pd.errors.ParserError("Could not parse CSV with any configuration")

    def normalize_columns(self, df: pd.DataFrame) -> pd.DataFrame:
        """Normalize column names using COLUMN_MAPPING.

        Strips surrounding whitespace from headers, applies the mapping,
        and drops duplicate columns that the mapping may create (e.g. a
        feed containing both 'id' and 'Product ID'), keeping the first.
        """
        # Clean column names
        df.columns = df.columns.str.strip()

        # Apply mapping
        df = df.rename(columns=self.COLUMN_MAPPING)

        # Renaming can collapse several source headers onto one canonical
        # name; keep only the first occurrence so row.to_dict() is stable.
        df = df.loc[:, ~df.columns.duplicated()]

        logger.info(f"Normalized columns: {list(df.columns)}")
        return df

    def process_row(self, row_data: Dict[str, Any]) -> Dict[str, Any]:
        """Normalize a single row dict: NaN -> None, GTIN/price parsing,
        pandas float artifacts ('123.0' MPNs), and multipack int coercion."""
        # Replace pandas NaN markers with None so the ORM stores NULLs
        processed_data = {k: (v if pd.notna(v) else None) for k, v in row_data.items()}

        # Process GTIN
        if processed_data.get('gtin'):
            processed_data['gtin'] = self.gtin_processor.normalize(processed_data['gtin'])

        # Process price and currency (e.g. "12.99 EUR" -> 12.99, "EUR")
        if processed_data.get('price'):
            parsed_price, currency = self.price_processor.parse_price_currency(processed_data['price'])
            processed_data['price'] = parsed_price
            processed_data['currency'] = currency

        # Process sale_price (currency discarded; assumed same as 'price')
        if processed_data.get('sale_price'):
            parsed_sale_price, _ = self.price_processor.parse_price_currency(processed_data['sale_price'])
            processed_data['sale_price'] = parsed_sale_price

        # Clean MPN: pandas reads numeric MPNs as floats, producing '123.0'
        if processed_data.get('mpn'):
            mpn_str = str(processed_data['mpn']).strip()
            if mpn_str.endswith('.0'):
                processed_data['mpn'] = mpn_str[:-2]

        # Coerce multipack to int; unparseable values become None
        if processed_data.get('multipack') is not None:
            try:
                processed_data['multipack'] = int(float(processed_data['multipack']))
            except (ValueError, TypeError):
                processed_data['multipack'] = None

        return processed_data

    async def process_csv_from_url(self, url: str, batch_size: int, db: Session) -> Dict[str, int]:
        """Download, parse and upsert a CSV feed in batches.

        Returns a summary dict: imported / updated / errors / total_processed.

        NOTE(review): despite being ``async``, every operation here
        (requests.get, pandas parsing, sync Session) is blocking — this
        will stall the event loop; consider offloading to a thread pool.
        """
        # Download and parse CSV
        csv_content = self.download_csv(url)
        df = self.parse_csv(csv_content)
        df = self.normalize_columns(df)

        logger.info(f"Processing CSV with {len(df)} rows")

        imported = 0
        updated = 0
        errors = 0

        # Process in batches; each batch is committed independently so one
        # bad batch does not roll back the whole import.
        for i in range(0, len(df), batch_size):
            batch_df = df.iloc[i:i + batch_size]
            batch_imported, batch_updated, batch_errors = self._process_batch(batch_df, db)

            imported += batch_imported
            updated += batch_updated
            errors += batch_errors

            # Commit batch
            try:
                db.commit()
                logger.info(
                    f"Processed batch {i // batch_size + 1}: +{batch_imported} imported, +{batch_updated} updated, +{batch_errors} errors")
            except Exception as e:
                db.rollback()
                logger.error(f"Batch commit failed: {e}")
                # The whole batch is lost on rollback; its earlier
                # imported/updated tallies are intentionally kept as-is.
                errors += len(batch_df)

        return {
            "imported": imported,
            "updated": updated,
            "errors": errors,
            "total_processed": imported + updated + errors
        }

    def _process_batch(self, df_batch: pd.DataFrame, db: Session) -> tuple:
        """Upsert one batch of rows; returns (imported, updated, errors).

        Rows missing product_id or title are counted as errors and skipped.
        NOTE(review): one existence query per row (N+1). A batched
        ``IN (...)`` prefetch would be faster but changes behavior for
        duplicate product_ids within a batch (autoflush currently makes
        the second occurrence an update) — left as-is deliberately.
        """
        imported = 0
        updated = 0
        errors = 0

        for _, row in df_batch.iterrows():
            try:
                product_data = self.process_row(row.to_dict())

                # Validate required fields
                product_id = product_data.get('product_id')
                title = product_data.get('title')

                if not product_id or not title:
                    errors += 1
                    continue

                # Check for existing product
                existing_product = db.query(Product).filter(
                    Product.product_id == product_id
                ).first()

                if existing_product:
                    # Update existing; never overwrite primary key / created_at
                    for key, value in product_data.items():
                        if key not in ['id', 'created_at'] and hasattr(existing_product, key):
                            setattr(existing_product, key, value)
                    # utcnow() is deprecated in 3.12+; kept for naive-UTC
                    # compatibility with the existing schema — TODO migrate
                    existing_product.updated_at = datetime.utcnow()
                    updated += 1
                else:
                    # Create new; keep only columns the model actually has
                    filtered_data = {k: v for k, v in product_data.items()
                                     if k not in ['id', 'created_at', 'updated_at'] and hasattr(Product, k)}
                    new_product = Product(**filtered_data)
                    db.add(new_product)
                    imported += 1

            except Exception as e:
                logger.error(f"Error processing row: {e}")
                errors += 1

        return imported, updated, errors
|