code quality run

This commit is contained in:
2025-09-13 21:58:54 +02:00
parent 0dfd885847
commit 3eb18ef91e
63 changed files with 1802 additions and 1289 deletions

View File

@@ -1,14 +1,15 @@
# utils/csv_processor.py
import logging
from datetime import datetime
from io import StringIO
from typing import Any, Dict
import pandas as pd
import requests
from io import StringIO
from typing import Dict, Any
from sqlalchemy import literal
from sqlalchemy.orm import Session
from models.database_models import Product
from datetime import datetime
import logging
logger = logging.getLogger(__name__)
@@ -16,67 +17,66 @@ logger = logging.getLogger(__name__)
class CSVProcessor:
"""Handles CSV import with robust parsing and batching"""
ENCODINGS = ['utf-8', 'latin-1', 'iso-8859-1', 'cp1252', 'utf-8-sig']
ENCODINGS = ["utf-8", "latin-1", "iso-8859-1", "cp1252", "utf-8-sig"]
PARSING_CONFIGS = [
# Try auto-detection first
{'sep': None, 'engine': 'python'},
{"sep": None, "engine": "python"},
# Try semicolon (common in European CSVs)
{'sep': ';', 'engine': 'python'},
{"sep": ";", "engine": "python"},
# Try comma
{'sep': ',', 'engine': 'python'},
{"sep": ",", "engine": "python"},
# Try tab
{'sep': '\t', 'engine': 'python'},
{"sep": "\t", "engine": "python"},
]
COLUMN_MAPPING = {
# Standard variations
'id': 'product_id',
'ID': 'product_id',
'Product ID': 'product_id',
'name': 'title',
'Name': 'title',
'product_name': 'title',
'Product Name': 'title',
"id": "product_id",
"ID": "product_id",
"Product ID": "product_id",
"name": "title",
"Name": "title",
"product_name": "title",
"Product Name": "title",
# Google Shopping feed standard
'g:id': 'product_id',
'g:title': 'title',
'g:description': 'description',
'g:link': 'link',
'g:image_link': 'image_link',
'g:availability': 'availability',
'g:price': 'price',
'g:brand': 'brand',
'g:gtin': 'gtin',
'g:mpn': 'mpn',
'g:condition': 'condition',
'g:adult': 'adult',
'g:multipack': 'multipack',
'g:is_bundle': 'is_bundle',
'g:age_group': 'age_group',
'g:color': 'color',
'g:gender': 'gender',
'g:material': 'material',
'g:pattern': 'pattern',
'g:size': 'size',
'g:size_type': 'size_type',
'g:size_system': 'size_system',
'g:item_group_id': 'item_group_id',
'g:google_product_category': 'google_product_category',
'g:product_type': 'product_type',
'g:custom_label_0': 'custom_label_0',
'g:custom_label_1': 'custom_label_1',
'g:custom_label_2': 'custom_label_2',
'g:custom_label_3': 'custom_label_3',
'g:custom_label_4': 'custom_label_4',
"g:id": "product_id",
"g:title": "title",
"g:description": "description",
"g:link": "link",
"g:image_link": "image_link",
"g:availability": "availability",
"g:price": "price",
"g:brand": "brand",
"g:gtin": "gtin",
"g:mpn": "mpn",
"g:condition": "condition",
"g:adult": "adult",
"g:multipack": "multipack",
"g:is_bundle": "is_bundle",
"g:age_group": "age_group",
"g:color": "color",
"g:gender": "gender",
"g:material": "material",
"g:pattern": "pattern",
"g:size": "size",
"g:size_type": "size_type",
"g:size_system": "size_system",
"g:item_group_id": "item_group_id",
"g:google_product_category": "google_product_category",
"g:product_type": "product_type",
"g:custom_label_0": "custom_label_0",
"g:custom_label_1": "custom_label_1",
"g:custom_label_2": "custom_label_2",
"g:custom_label_3": "custom_label_3",
"g:custom_label_4": "custom_label_4",
# Handle complex shipping column
'shipping(country:price:max_handling_time:min_transit_time:max_transit_time)': 'shipping'
"shipping(country:price:max_handling_time:min_transit_time:max_transit_time)": "shipping",
}
def __init__(self) -> None:
    """Create the CSV processor and attach its data-cleaning helpers.

    GTINProcessor normalizes GTIN values; PriceProcessor parses
    price/currency strings. They are imported here rather than at module
    top — presumably to avoid a circular import; confirm before hoisting.
    """
    from utils.data_processing import GTINProcessor, PriceProcessor

    self.price_processor = PriceProcessor()
    self.gtin_processor = GTINProcessor()
@@ -98,7 +98,7 @@ class CSVProcessor:
continue
# Fallback with error ignoring
decoded_content = content.decode('utf-8', errors='ignore')
decoded_content = content.decode("utf-8", errors="ignore")
logger.warning("Used UTF-8 with error ignoring for CSV decoding")
return decoded_content
@@ -113,11 +113,11 @@ class CSVProcessor:
try:
df = pd.read_csv(
StringIO(csv_content),
on_bad_lines='skip',
on_bad_lines="skip",
quotechar='"',
skip_blank_lines=True,
skipinitialspace=True,
**config
**config,
)
logger.info(f"Successfully parsed CSV with config: {config}")
return df
@@ -143,42 +143,43 @@ class CSVProcessor:
processed_data = {k: (v if pd.notna(v) else None) for k, v in row_data.items()}
# Process GTIN
if processed_data.get('gtin'):
processed_data['gtin'] = self.gtin_processor.normalize(processed_data['gtin'])
if processed_data.get("gtin"):
processed_data["gtin"] = self.gtin_processor.normalize(
processed_data["gtin"]
)
# Process price and currency
if processed_data.get('price'):
parsed_price, currency = self.price_processor.parse_price_currency(processed_data['price'])
processed_data['price'] = parsed_price
processed_data['currency'] = currency
if processed_data.get("price"):
parsed_price, currency = self.price_processor.parse_price_currency(
processed_data["price"]
)
processed_data["price"] = parsed_price
processed_data["currency"] = currency
# Process sale_price
if processed_data.get('sale_price'):
parsed_sale_price, _ = self.price_processor.parse_price_currency(processed_data['sale_price'])
processed_data['sale_price'] = parsed_sale_price
if processed_data.get("sale_price"):
parsed_sale_price, _ = self.price_processor.parse_price_currency(
processed_data["sale_price"]
)
processed_data["sale_price"] = parsed_sale_price
# Clean MPN (remove .0 endings)
if processed_data.get('mpn'):
mpn_str = str(processed_data['mpn']).strip()
if mpn_str.endswith('.0'):
processed_data['mpn'] = mpn_str[:-2]
if processed_data.get("mpn"):
mpn_str = str(processed_data["mpn"]).strip()
if mpn_str.endswith(".0"):
processed_data["mpn"] = mpn_str[:-2]
# Handle multipack type conversion
if processed_data.get('multipack') is not None:
if processed_data.get("multipack") is not None:
try:
processed_data['multipack'] = int(float(processed_data['multipack']))
processed_data["multipack"] = int(float(processed_data["multipack"]))
except (ValueError, TypeError):
processed_data['multipack'] = None
processed_data["multipack"] = None
return processed_data
async def process_marketplace_csv_from_url(
self,
url: str,
marketplace: str,
shop_name: str,
batch_size: int,
db: Session
self, url: str, marketplace: str, shop_name: str, batch_size: int, db: Session
) -> Dict[str, Any]:
"""
Process CSV from URL with marketplace and shop information
@@ -194,7 +195,9 @@ class CSVProcessor:
Dictionary with processing results
"""
logger.info(f"Starting marketplace CSV import from {url} for {marketplace} -> {shop_name}")
logger.info(
f"Starting marketplace CSV import from {url} for {marketplace} -> {shop_name}"
)
# Download and parse CSV
csv_content = self.download_csv(url)
df = self.parse_csv(csv_content)
@@ -208,40 +211,42 @@ class CSVProcessor:
# Process in batches
for i in range(0, len(df), batch_size):
batch_df = df.iloc[i:i + batch_size]
batch_df = df.iloc[i : i + batch_size]
batch_result = await self._process_marketplace_batch(
batch_df, marketplace, shop_name, db, i // batch_size + 1
)
imported += batch_result['imported']
updated += batch_result['updated']
errors += batch_result['errors']
imported += batch_result["imported"]
updated += batch_result["updated"]
errors += batch_result["errors"]
logger.info(f"Processed batch {i // batch_size + 1}: {batch_result}")
return {
'total_processed': imported + updated + errors,
'imported': imported,
'updated': updated,
'errors': errors,
'marketplace': marketplace,
'shop_name': shop_name
"total_processed": imported + updated + errors,
"imported": imported,
"updated": updated,
"errors": errors,
"marketplace": marketplace,
"shop_name": shop_name,
}
async def _process_marketplace_batch(
self,
batch_df: pd.DataFrame,
marketplace: str,
shop_name: str,
db: Session,
batch_num: int
self,
batch_df: pd.DataFrame,
marketplace: str,
shop_name: str,
db: Session,
batch_num: int,
) -> Dict[str, int]:
"""Process a batch of CSV rows with marketplace information"""
imported = 0
updated = 0
errors = 0
logger.info(f"Processing batch {batch_num} with {len(batch_df)} rows for {marketplace} -> {shop_name}")
logger.info(
f"Processing batch {batch_num} with {len(batch_df)} rows for {marketplace} -> {shop_name}"
)
for index, row in batch_df.iterrows():
try:
@@ -249,42 +254,54 @@ class CSVProcessor:
product_data = self._clean_row_data(row.to_dict())
# Add marketplace and shop information
product_data['marketplace'] = marketplace
product_data['shop_name'] = shop_name
product_data["marketplace"] = marketplace
product_data["shop_name"] = shop_name
# Validate required fields
if not product_data.get('product_id'):
if not product_data.get("product_id"):
logger.warning(f"Row {index}: Missing product_id, skipping")
errors += 1
continue
if not product_data.get('title'):
if not product_data.get("title"):
logger.warning(f"Row {index}: Missing title, skipping")
errors += 1
continue
# Check if product exists
existing_product = db.query(Product).filter(
Product.product_id == literal(product_data['product_id'])
).first()
existing_product = (
db.query(Product)
.filter(Product.product_id == literal(product_data["product_id"]))
.first()
)
if existing_product:
# Update existing product
for key, value in product_data.items():
if key not in ['id', 'created_at'] and hasattr(existing_product, key):
if key not in ["id", "created_at"] and hasattr(
existing_product, key
):
setattr(existing_product, key, value)
existing_product.updated_at = datetime.utcnow()
updated += 1
logger.debug(f"Updated product {product_data['product_id']} for {marketplace} and shop {shop_name}")
logger.debug(
f"Updated product {product_data['product_id']} for {marketplace} and shop {shop_name}"
)
else:
# Create new product
filtered_data = {k: v for k, v in product_data.items()
if k not in ['id', 'created_at', 'updated_at'] and hasattr(Product, k)}
filtered_data = {
k: v
for k, v in product_data.items()
if k not in ["id", "created_at", "updated_at"]
and hasattr(Product, k)
}
new_product = Product(**filtered_data)
db.add(new_product)
imported += 1
logger.debug(f"Imported new product {product_data['product_id']} for {marketplace} and shop "
f"{shop_name}")
logger.debug(
f"Imported new product {product_data['product_id']} for {marketplace} and shop "
f"{shop_name}"
)
except Exception as e:
logger.error(f"Error processing row: {e}")
@@ -303,8 +320,4 @@ class CSVProcessor:
imported = 0
updated = 0
return {
'imported': imported,
'updated': updated,
'errors': errors
}
return {"imported": imported, "updated": updated, "errors": errors}