- Auto-fixed 4,496 lint issues (import sorting, modern syntax, etc.) - Added ignore rules for patterns intentional in this codebase: E402 (late imports), E712 (SQLAlchemy filters), B904 (raise from), SIM108/SIM105/SIM117 (readability preferences) - Added per-file ignores for tests and scripts - Excluded broken scripts/rename_terminology.py (has curly quotes) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
574 lines
21 KiB
Python
574 lines
21 KiB
Python
# app/utils/csv_processor.py
|
|
"""CSV processor utilities for marketplace product imports.
|
|
|
|
This module provides classes and functions for:
|
|
- Downloading and parsing CSV files with multiple encoding support
|
|
- Normalizing column names to match database schema
|
|
- Creating/updating MarketplaceProduct records with translations
|
|
"""
|
|
|
|
import logging
|
|
import re
|
|
from datetime import UTC, datetime
|
|
from io import StringIO
|
|
from typing import Any
|
|
|
|
import pandas as pd
|
|
import requests
|
|
from sqlalchemy import literal
|
|
from sqlalchemy.orm import Session
|
|
|
|
from app.modules.marketplace.models import (
|
|
MarketplaceImportError,
|
|
MarketplaceProduct,
|
|
MarketplaceProductTranslation,
|
|
)
|
|
from app.utils.money import euros_to_cents
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class CSVProcessor:
    """Handles CSV import with robust parsing and batching.

    Downloads marketplace CSV feeds, normalizes their headers to the
    database schema, and upserts ``MarketplaceProduct`` rows together with
    per-language ``MarketplaceProductTranslation`` rows, one batch at a
    time. Row-level failures are optionally recorded as
    ``MarketplaceImportError`` rows when an ``import_job_id`` is supplied.
    """

    # Byte encodings attempted, in order, when decoding a downloaded feed.
    ENCODINGS = ["utf-8", "latin-1", "iso-8859-1", "cp1252", "utf-8-sig"]

    # pandas.read_csv keyword sets attempted, in order, until one parses.
    # The python engine is required for sep=None (sniffing) and tolerates
    # ragged rows better than the C engine.
    PARSING_CONFIGS = [
        # Try auto-detection first
        {"sep": None, "engine": "python"},
        # Try semicolon (common in European CSVs)
        {"sep": ";", "engine": "python"},
        # Try comma
        {"sep": ",", "engine": "python"},
        # Try tab
        {"sep": "\t", "engine": "python"},
    ]

    # Fields that belong to the translation table, not MarketplaceProduct
    TRANSLATION_FIELDS = {"title", "description", "short_description"}

    # Maps raw CSV header names (plain variants and Google Shopping
    # "g:"-prefixed fields) to MarketplaceProduct column names.
    COLUMN_MAPPING = {
        # Standard variations
        "id": "marketplace_product_id",
        "ID": "marketplace_product_id",
        "MarketplaceProduct ID": "marketplace_product_id",
        "name": "title",
        "Name": "title",
        "product_name": "title",
        "MarketplaceProduct Name": "title",
        # Google Shopping feed standard
        "g:id": "marketplace_product_id",
        "g:title": "title",
        "g:description": "description",
        "g:link": "link",
        "g:image_link": "image_link",
        "g:availability": "availability",
        "g:price": "price",
        "g:brand": "brand",
        "g:gtin": "gtin",
        "g:mpn": "mpn",
        "g:condition": "condition",
        "g:adult": "adult",
        "g:multipack": "multipack",
        "g:is_bundle": "is_bundle",
        "g:age_group": "age_group",
        "g:color": "color",
        "g:gender": "gender",
        "g:material": "material",
        "g:pattern": "pattern",
        "g:size": "size",
        "g:size_type": "size_type",
        "g:size_system": "size_system",
        "g:item_group_id": "item_group_id",
        "g:google_product_category": "google_product_category",
        "g:product_type": "product_type_raw",  # Maps to product_type_raw (renamed)
        "product_type": "product_type_raw",  # Also map plain product_type
        "g:custom_label_0": "custom_label_0",
        "g:custom_label_1": "custom_label_1",
        "g:custom_label_2": "custom_label_2",
        "g:custom_label_3": "custom_label_3",
        "g:custom_label_4": "custom_label_4",
        # Handle complex shipping column
        "shipping(country:price:max_handling_time:min_transit_time:max_transit_time)": "shipping",
    }

    def __init__(self) -> None:
        """Initialize GTIN and price helpers.

        The import is function-local, presumably to avoid a circular
        import with app.utils.data_processing — TODO confirm.
        """
        from app.utils.data_processing import GTINProcessor, PriceProcessor

        self.gtin_processor = GTINProcessor()
        self.price_processor = PriceProcessor()

    def download_csv(self, url: str) -> str:
        """Download and decode CSV with multiple encoding attempts.

        Args:
            url: Location of the CSV feed.

        Returns:
            Decoded CSV text. If no candidate encoding in ENCODINGS
            decodes cleanly, falls back to UTF-8 with undecodable bytes
            silently dropped (logged as a warning).

        Raises:
            requests.RequestException: If the request fails or the server
                returns an HTTP error status.
        """
        try:
            response = requests.get(url, timeout=30)
            response.raise_for_status()

            content = response.content

            # Try different encodings
            for encoding in self.ENCODINGS:
                try:
                    decoded_content = content.decode(encoding)
                    logger.info(f"Successfully decoded CSV with encoding: {encoding}")
                    return decoded_content
                except UnicodeDecodeError:
                    continue

            # Fallback with error ignoring
            decoded_content = content.decode("utf-8", errors="ignore")
            logger.warning("Used UTF-8 with error ignoring for CSV decoding")
            return decoded_content

        except requests.RequestException as e:
            logger.error(f"Error downloading CSV: {e}")
            raise

    def parse_csv(self, csv_content: str) -> pd.DataFrame:
        """Parse CSV with multiple separator attempts.

        Tries each entry of PARSING_CONFIGS in order; malformed rows are
        skipped rather than aborting the parse.

        Args:
            csv_content: Decoded CSV text.

        Returns:
            The first successfully parsed DataFrame.

        Raises:
            pandas.errors.ParserError: If every configuration fails.
        """
        for config in self.PARSING_CONFIGS:
            try:
                df = pd.read_csv(
                    StringIO(csv_content),
                    on_bad_lines="skip",
                    quotechar='"',
                    skip_blank_lines=True,
                    skipinitialspace=True,
                    **config,
                )
                logger.info(f"Successfully parsed CSV with config: {config}")
                return df
            except pd.errors.ParserError:
                continue

        raise pd.errors.ParserError("Could not parse CSV with any configuration")

    def normalize_columns(self, df: pd.DataFrame) -> pd.DataFrame:
        """Normalize column names using mapping.

        Strips surrounding whitespace from headers, then renames known
        variants via COLUMN_MAPPING. Unknown columns are left untouched.

        NOTE(review): if a feed contains two headers that map to the same
        target (e.g. both "id" and "g:id"), the result has duplicate
        column names — confirm upstream feeds never do this.
        """
        # Clean column names
        df.columns = df.columns.str.strip()

        # Apply mapping
        df = df.rename(columns=self.COLUMN_MAPPING)

        logger.info(f"Normalized columns: {list(df.columns)}")
        return df

    def _parse_price_to_cents(self, price_str: str | None) -> int | None:
        """Parse price string like '19.99 EUR' to integer cents.

        Uses the money utility for precise conversion.
        Example: '19.99 EUR' -> 1999

        Args:
            price_str: Price string with optional currency

        Returns:
            Price in integer cents, or None if parsing fails
        """
        if not price_str:
            return None

        # Extract numeric value
        numbers = re.findall(r"[\d.,]+", str(price_str))
        if numbers:
            # NOTE(review): values with a thousands separator such as
            # "1.234,56" become "1.234.56" after this swap and will fail
            # to convert (returning None) — confirm feeds never use them.
            num_str = numbers[0].replace(",", ".")
            try:
                # Convert euros to cents using money utility
                return euros_to_cents(num_str)
            except (ValueError, TypeError):
                pass
        return None

    def _clean_row_data(self, row_data: dict[str, Any]) -> dict[str, Any]:
        """Process a single row with data normalization.

        Converts pandas NaN to None, normalizes GTIN, splits price /
        sale_price strings into parsed value, integer cents, and currency,
        strips float artifacts from MPN, and coerces multipack to int.

        Args:
            row_data: One CSV row as produced by Series.to_dict().

        Returns:
            A new dict with normalized values; the input is not mutated.
        """
        # Handle NaN values
        processed_data = {k: (v if pd.notna(v) else None) for k, v in row_data.items()}

        # Process GTIN
        if processed_data.get("gtin"):
            processed_data["gtin"] = self.gtin_processor.normalize(
                processed_data["gtin"]
            )

        # Process price and currency
        if processed_data.get("price"):
            parsed_price, currency = self.price_processor.parse_price_currency(
                processed_data["price"]
            )
            # Store both raw price string and numeric value in cents
            raw_price = processed_data["price"]
            processed_data["price"] = parsed_price
            processed_data["price_cents"] = self._parse_price_to_cents(raw_price)
            processed_data["currency"] = currency

        # Process sale_price
        if processed_data.get("sale_price"):
            raw_sale_price = processed_data["sale_price"]
            parsed_sale_price, _ = self.price_processor.parse_price_currency(
                processed_data["sale_price"]
            )
            processed_data["sale_price"] = parsed_sale_price
            processed_data["sale_price_cents"] = self._parse_price_to_cents(
                raw_sale_price
            )

        # Clean MPN (remove .0 endings)
        # Numeric-looking MPNs read by pandas as floats stringify as "123.0".
        if processed_data.get("mpn"):
            mpn_str = str(processed_data["mpn"]).strip()
            if mpn_str.endswith(".0"):
                processed_data["mpn"] = mpn_str[:-2]

        # Handle multipack type conversion
        # float() first so values like "2.0" survive the int conversion.
        if processed_data.get("multipack") is not None:
            try:
                processed_data["multipack"] = int(float(processed_data["multipack"]))
            except (ValueError, TypeError):
                processed_data["multipack"] = None

        return processed_data

    def _extract_translation_data(self, product_data: dict[str, Any]) -> dict[str, Any]:
        """Extract translation fields from product data.

        Returns a dict with title, description, etc. that belong
        in the translation table. Removes these fields from product_data in place.
        """
        translation_data = {}
        for field in self.TRANSLATION_FIELDS:
            if field in product_data:
                translation_data[field] = product_data.pop(field)
        return translation_data

    def _create_or_update_translation(
        self,
        db: Session,
        marketplace_product: MarketplaceProduct,
        translation_data: dict[str, Any],
        language: str = "en",
        source_file: str | None = None,
    ) -> None:
        """Create or update a translation record for the marketplace product.

        No-ops when ``translation_data`` has no title. Looks up an existing
        (product, language) translation: if found, overwrites its matching
        attributes and bumps ``updated_at``; otherwise inserts a new row.
        Does not commit — the caller owns the transaction.

        Args:
            db: Active SQLAlchemy session.
            marketplace_product: Parent product (must already have an id,
                i.e. be flushed).
            translation_data: Fields from TRANSLATION_FIELDS.
            language: Language code of the translation.
            source_file: Optional feed filename recorded on the row.
        """
        if not translation_data.get("title"):
            # Title is required for translations
            return

        # Check if translation exists
        existing_translation = (
            db.query(MarketplaceProductTranslation)
            .filter(
                MarketplaceProductTranslation.marketplace_product_id
                == marketplace_product.id,
                MarketplaceProductTranslation.language == language,
            )
            .first()
        )

        if existing_translation:
            # Update existing translation
            for key, value in translation_data.items():
                if hasattr(existing_translation, key):
                    setattr(existing_translation, key, value)
            existing_translation.updated_at = datetime.now(UTC)
            if source_file:
                existing_translation.source_file = source_file
        else:
            # Create new translation
            new_translation = MarketplaceProductTranslation(
                marketplace_product_id=marketplace_product.id,
                language=language,
                title=translation_data.get("title"),
                description=translation_data.get("description"),
                short_description=translation_data.get("short_description"),
                source_file=source_file,
            )
            db.add(new_translation)

    async def process_marketplace_csv_from_url(
        self,
        url: str,
        marketplace: str,
        store_name: str,
        batch_size: int,
        db: Session,
        language: str = "en",
        import_job_id: int | None = None,
    ) -> dict[str, Any]:
        """
        Process CSV from URL with marketplace and store information.

        Downloads, parses, and normalizes the feed, then delegates each
        slice of ``batch_size`` rows to ``_process_marketplace_batch``
        (which commits per batch) and aggregates the per-batch counters.

        Args:
            url: URL to the CSV file
            marketplace: Name of the marketplace (e.g., 'Letzshop', 'Amazon')
            store_name: Name of the store
            batch_size: Number of rows to process in each batch
            db: Database session
            language: Language code for translations (default: 'en')
            import_job_id: ID of the import job for error tracking (optional)

        Returns:
            Dictionary with processing results
        """
        logger.info(
            f"Starting marketplace CSV import from {url} for {marketplace} -> {store_name} (lang={language})"
        )
        # Download and parse CSV
        csv_content = self.download_csv(url)
        df = self.parse_csv(csv_content)
        df = self.normalize_columns(df)

        logger.info(f"Processing CSV with {len(df)} rows and {len(df.columns)} columns")

        imported = 0
        updated = 0
        errors = 0

        # Extract source file name from URL
        source_file = url.split("/")[-1] if "/" in url else url

        # Process in batches
        for i in range(0, len(df), batch_size):
            batch_df = df.iloc[i : i + batch_size]
            # Calculate base row number for this batch (1-indexed for user-friendly display)
            base_row_num = i + 2  # +2 for header row and 1-indexing
            batch_result = await self._process_marketplace_batch(
                batch_df,
                marketplace,
                store_name,
                db,
                i // batch_size + 1,
                language=language,
                source_file=source_file,
                import_job_id=import_job_id,
                base_row_num=base_row_num,
            )

            imported += batch_result["imported"]
            updated += batch_result["updated"]
            errors += batch_result["errors"]

            logger.info(f"Processed batch {i // batch_size + 1}: {batch_result}")

        return {
            "total_processed": imported + updated + errors,
            "imported": imported,
            "updated": updated,
            "errors": errors,
            "marketplace": marketplace,
            "store_name": store_name,
            "language": language,
        }

    def _create_import_error(
        self,
        db: Session,
        import_job_id: int,
        row_number: int,
        error_type: str,
        error_message: str,
        identifier: str | None = None,
        row_data: dict | None = None,
    ) -> None:
        """Create an import error record in the database.

        The record is only added to the session; it is committed (or
        rolled back) together with the surrounding batch.

        Args:
            db: Active SQLAlchemy session.
            import_job_id: Job the error belongs to.
            row_number: 1-indexed CSV row (including header offset).
            error_type: Short machine-readable category.
            error_message: Human-readable description.
            identifier: Best-effort product identifier for the row.
            row_data: Raw row dict; trimmed to key fields before storage.
        """
        # Limit row_data size to prevent huge JSON storage
        if row_data:
            # Keep only key fields for review
            limited_data = {
                k: v
                for k, v in row_data.items()
                if k
                in [
                    "marketplace_product_id",
                    "title",
                    "gtin",
                    "mpn",
                    "sku",
                    "brand",
                    "price",
                    "availability",
                    "link",
                ]
                and v is not None
                and str(v).strip()
            }
            row_data = limited_data if limited_data else None

        error_record = MarketplaceImportError(
            import_job_id=import_job_id,
            row_number=row_number,
            identifier=identifier,
            error_type=error_type,
            error_message=error_message,
            row_data=row_data,
        )
        db.add(error_record)

    async def _process_marketplace_batch(
        self,
        batch_df: pd.DataFrame,
        marketplace: str,
        store_name: str,
        db: Session,
        batch_num: int,
        language: str = "en",
        source_file: str | None = None,
        import_job_id: int | None = None,
        base_row_num: int = 2,
    ) -> dict[str, int]:
        """Process a batch of CSV rows with marketplace information.

        For each row: clean the data, split out translation fields,
        validate required identifiers, then insert or update the product
        and its translation. The whole batch is committed at the end;
        on commit failure everything is rolled back and the entire batch
        is counted as errors.

        NOTE(review): a rollback also discards any MarketplaceImportError
        rows added for this batch — confirm that is acceptable.

        Args:
            batch_df: Slice of the normalized feed DataFrame.
            marketplace: Marketplace name stamped onto each product.
            store_name: Store name stamped onto each product.
            db: Active SQLAlchemy session.
            batch_num: 1-based batch index (logging only).
            language: Language code for translations.
            source_file: Feed filename recorded on translations.
            import_job_id: Enables per-row error records when set.
            base_row_num: CSV row number of the batch's first row.

        Returns:
            Counters: {"imported": ..., "updated": ..., "errors": ...}.
        """
        imported = 0
        updated = 0
        errors = 0

        logger.info(
            f"Processing batch {batch_num} with {len(batch_df)} rows for "
            f"{marketplace} -> {store_name}"
        )

        for batch_idx, (_index, row) in enumerate(batch_df.iterrows()):
            row_number = base_row_num + batch_idx
            row_dict = row.to_dict()

            try:
                # Convert row to dictionary and clean up
                product_data = self._clean_row_data(row_dict)

                # Extract translation fields BEFORE processing product
                translation_data = self._extract_translation_data(product_data)

                # Add marketplace and store information
                product_data["marketplace"] = marketplace
                product_data["store_name"] = store_name

                # Get identifier for error tracking
                identifier = (
                    product_data.get("marketplace_product_id")
                    or product_data.get("gtin")
                    or product_data.get("mpn")
                )

                # Validate required fields
                if not product_data.get("marketplace_product_id"):
                    logger.warning(
                        f"Row {row_number}: Missing marketplace_product_id, skipping"
                    )
                    if import_job_id:
                        self._create_import_error(
                            db=db,
                            import_job_id=import_job_id,
                            row_number=row_number,
                            error_type="missing_id",
                            error_message="Missing marketplace_product_id - product cannot be identified",
                            identifier=identifier,
                            row_data=row_dict,
                        )
                    errors += 1
                    continue

                # Title is now required in translation_data
                if not translation_data.get("title"):
                    logger.warning(f"Row {row_number}: Missing title, skipping")
                    if import_job_id:
                        self._create_import_error(
                            db=db,
                            import_job_id=import_job_id,
                            row_number=row_number,
                            error_type="missing_title",
                            error_message="Missing title - product title is required",
                            identifier=product_data.get("marketplace_product_id"),
                            row_data=row_dict,
                        )
                    errors += 1
                    continue

                # Check if product exists
                # literal() keeps the comparison as a bound parameter.
                existing_product = (
                    db.query(MarketplaceProduct)
                    .filter(
                        MarketplaceProduct.marketplace_product_id
                        == literal(product_data["marketplace_product_id"])
                    )
                    .first()
                )

                if existing_product:
                    # Update existing product (only non-translation fields)
                    for key, value in product_data.items():
                        if key not in ["id", "created_at"] and hasattr(
                            existing_product, key
                        ):
                            setattr(existing_product, key, value)
                    existing_product.updated_at = datetime.now(UTC)

                    # Update or create translation
                    self._create_or_update_translation(
                        db,
                        existing_product,
                        translation_data,
                        language=language,
                        source_file=source_file,
                    )

                    updated += 1
                    logger.debug(
                        f"Updated product {product_data['marketplace_product_id']} for "
                        f"{marketplace} and store {store_name}"
                    )
                else:
                    # Create new product (filter to valid model fields)
                    filtered_data = {
                        k: v
                        for k, v in product_data.items()
                        if k not in ["id", "created_at", "updated_at"]
                        and hasattr(MarketplaceProduct, k)
                    }
                    new_product = MarketplaceProduct(**filtered_data)
                    db.add(new_product)
                    db.flush()  # Get the ID for the translation

                    # Create translation for new product
                    self._create_or_update_translation(
                        db,
                        new_product,
                        translation_data,
                        language=language,
                        source_file=source_file,
                    )

                    imported += 1
                    logger.debug(
                        f"Imported new product {product_data['marketplace_product_id']} "
                        f"for {marketplace} and store {store_name}"
                    )

            except Exception as e:
                logger.error(f"Error processing row {row_number}: {e}")
                if import_job_id:
                    self._create_import_error(
                        db=db,
                        import_job_id=import_job_id,
                        row_number=row_number,
                        error_type="processing_error",
                        error_message=str(e),
                        identifier=row_dict.get("marketplace_product_id")
                        or row_dict.get("id"),
                        row_data=row_dict,
                    )
                errors += 1
                continue

        # Commit the batch
        try:
            db.commit()
            logger.info(f"Batch {batch_num} committed successfully")
        except Exception as e:
            logger.error(f"Failed to commit batch {batch_num}: {e}")
            db.rollback()
            # Count all rows in this batch as errors
            errors = len(batch_df)
            imported = 0
            updated = 0

        return {"imported": imported, "updated": updated, "errors": errors}
|