code quality run
This commit is contained in:
@@ -1,14 +1,15 @@
|
||||
# utils/csv_processor.py
|
||||
import logging
|
||||
from datetime import datetime
|
||||
from io import StringIO
|
||||
from typing import Any, Dict
|
||||
|
||||
import pandas as pd
|
||||
import requests
|
||||
from io import StringIO
|
||||
from typing import Dict, Any
|
||||
|
||||
from sqlalchemy import literal
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from models.database_models import Product
|
||||
from datetime import datetime
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -16,67 +17,66 @@ logger = logging.getLogger(__name__)
|
||||
class CSVProcessor:
|
||||
"""Handles CSV import with robust parsing and batching"""
|
||||
|
||||
ENCODINGS = ['utf-8', 'latin-1', 'iso-8859-1', 'cp1252', 'utf-8-sig']
|
||||
ENCODINGS = ["utf-8", "latin-1", "iso-8859-1", "cp1252", "utf-8-sig"]
|
||||
|
||||
PARSING_CONFIGS = [
|
||||
# Try auto-detection first
|
||||
{'sep': None, 'engine': 'python'},
|
||||
{"sep": None, "engine": "python"},
|
||||
# Try semicolon (common in European CSVs)
|
||||
{'sep': ';', 'engine': 'python'},
|
||||
{"sep": ";", "engine": "python"},
|
||||
# Try comma
|
||||
{'sep': ',', 'engine': 'python'},
|
||||
{"sep": ",", "engine": "python"},
|
||||
# Try tab
|
||||
{'sep': '\t', 'engine': 'python'},
|
||||
{"sep": "\t", "engine": "python"},
|
||||
]
|
||||
|
||||
COLUMN_MAPPING = {
|
||||
# Standard variations
|
||||
'id': 'product_id',
|
||||
'ID': 'product_id',
|
||||
'Product ID': 'product_id',
|
||||
'name': 'title',
|
||||
'Name': 'title',
|
||||
'product_name': 'title',
|
||||
'Product Name': 'title',
|
||||
|
||||
"id": "product_id",
|
||||
"ID": "product_id",
|
||||
"Product ID": "product_id",
|
||||
"name": "title",
|
||||
"Name": "title",
|
||||
"product_name": "title",
|
||||
"Product Name": "title",
|
||||
# Google Shopping feed standard
|
||||
'g:id': 'product_id',
|
||||
'g:title': 'title',
|
||||
'g:description': 'description',
|
||||
'g:link': 'link',
|
||||
'g:image_link': 'image_link',
|
||||
'g:availability': 'availability',
|
||||
'g:price': 'price',
|
||||
'g:brand': 'brand',
|
||||
'g:gtin': 'gtin',
|
||||
'g:mpn': 'mpn',
|
||||
'g:condition': 'condition',
|
||||
'g:adult': 'adult',
|
||||
'g:multipack': 'multipack',
|
||||
'g:is_bundle': 'is_bundle',
|
||||
'g:age_group': 'age_group',
|
||||
'g:color': 'color',
|
||||
'g:gender': 'gender',
|
||||
'g:material': 'material',
|
||||
'g:pattern': 'pattern',
|
||||
'g:size': 'size',
|
||||
'g:size_type': 'size_type',
|
||||
'g:size_system': 'size_system',
|
||||
'g:item_group_id': 'item_group_id',
|
||||
'g:google_product_category': 'google_product_category',
|
||||
'g:product_type': 'product_type',
|
||||
'g:custom_label_0': 'custom_label_0',
|
||||
'g:custom_label_1': 'custom_label_1',
|
||||
'g:custom_label_2': 'custom_label_2',
|
||||
'g:custom_label_3': 'custom_label_3',
|
||||
'g:custom_label_4': 'custom_label_4',
|
||||
|
||||
"g:id": "product_id",
|
||||
"g:title": "title",
|
||||
"g:description": "description",
|
||||
"g:link": "link",
|
||||
"g:image_link": "image_link",
|
||||
"g:availability": "availability",
|
||||
"g:price": "price",
|
||||
"g:brand": "brand",
|
||||
"g:gtin": "gtin",
|
||||
"g:mpn": "mpn",
|
||||
"g:condition": "condition",
|
||||
"g:adult": "adult",
|
||||
"g:multipack": "multipack",
|
||||
"g:is_bundle": "is_bundle",
|
||||
"g:age_group": "age_group",
|
||||
"g:color": "color",
|
||||
"g:gender": "gender",
|
||||
"g:material": "material",
|
||||
"g:pattern": "pattern",
|
||||
"g:size": "size",
|
||||
"g:size_type": "size_type",
|
||||
"g:size_system": "size_system",
|
||||
"g:item_group_id": "item_group_id",
|
||||
"g:google_product_category": "google_product_category",
|
||||
"g:product_type": "product_type",
|
||||
"g:custom_label_0": "custom_label_0",
|
||||
"g:custom_label_1": "custom_label_1",
|
||||
"g:custom_label_2": "custom_label_2",
|
||||
"g:custom_label_3": "custom_label_3",
|
||||
"g:custom_label_4": "custom_label_4",
|
||||
# Handle complex shipping column
|
||||
'shipping(country:price:max_handling_time:min_transit_time:max_transit_time)': 'shipping'
|
||||
"shipping(country:price:max_handling_time:min_transit_time:max_transit_time)": "shipping",
|
||||
}
|
||||
|
||||
def __init__(self):
|
||||
from utils.data_processing import GTINProcessor, PriceProcessor
|
||||
|
||||
self.gtin_processor = GTINProcessor()
|
||||
self.price_processor = PriceProcessor()
|
||||
|
||||
@@ -98,7 +98,7 @@ class CSVProcessor:
|
||||
continue
|
||||
|
||||
# Fallback with error ignoring
|
||||
decoded_content = content.decode('utf-8', errors='ignore')
|
||||
decoded_content = content.decode("utf-8", errors="ignore")
|
||||
logger.warning("Used UTF-8 with error ignoring for CSV decoding")
|
||||
return decoded_content
|
||||
|
||||
@@ -113,11 +113,11 @@ class CSVProcessor:
|
||||
try:
|
||||
df = pd.read_csv(
|
||||
StringIO(csv_content),
|
||||
on_bad_lines='skip',
|
||||
on_bad_lines="skip",
|
||||
quotechar='"',
|
||||
skip_blank_lines=True,
|
||||
skipinitialspace=True,
|
||||
**config
|
||||
**config,
|
||||
)
|
||||
logger.info(f"Successfully parsed CSV with config: {config}")
|
||||
return df
|
||||
@@ -143,42 +143,43 @@ class CSVProcessor:
|
||||
processed_data = {k: (v if pd.notna(v) else None) for k, v in row_data.items()}
|
||||
|
||||
# Process GTIN
|
||||
if processed_data.get('gtin'):
|
||||
processed_data['gtin'] = self.gtin_processor.normalize(processed_data['gtin'])
|
||||
if processed_data.get("gtin"):
|
||||
processed_data["gtin"] = self.gtin_processor.normalize(
|
||||
processed_data["gtin"]
|
||||
)
|
||||
|
||||
# Process price and currency
|
||||
if processed_data.get('price'):
|
||||
parsed_price, currency = self.price_processor.parse_price_currency(processed_data['price'])
|
||||
processed_data['price'] = parsed_price
|
||||
processed_data['currency'] = currency
|
||||
if processed_data.get("price"):
|
||||
parsed_price, currency = self.price_processor.parse_price_currency(
|
||||
processed_data["price"]
|
||||
)
|
||||
processed_data["price"] = parsed_price
|
||||
processed_data["currency"] = currency
|
||||
|
||||
# Process sale_price
|
||||
if processed_data.get('sale_price'):
|
||||
parsed_sale_price, _ = self.price_processor.parse_price_currency(processed_data['sale_price'])
|
||||
processed_data['sale_price'] = parsed_sale_price
|
||||
if processed_data.get("sale_price"):
|
||||
parsed_sale_price, _ = self.price_processor.parse_price_currency(
|
||||
processed_data["sale_price"]
|
||||
)
|
||||
processed_data["sale_price"] = parsed_sale_price
|
||||
|
||||
# Clean MPN (remove .0 endings)
|
||||
if processed_data.get('mpn'):
|
||||
mpn_str = str(processed_data['mpn']).strip()
|
||||
if mpn_str.endswith('.0'):
|
||||
processed_data['mpn'] = mpn_str[:-2]
|
||||
if processed_data.get("mpn"):
|
||||
mpn_str = str(processed_data["mpn"]).strip()
|
||||
if mpn_str.endswith(".0"):
|
||||
processed_data["mpn"] = mpn_str[:-2]
|
||||
|
||||
# Handle multipack type conversion
|
||||
if processed_data.get('multipack') is not None:
|
||||
if processed_data.get("multipack") is not None:
|
||||
try:
|
||||
processed_data['multipack'] = int(float(processed_data['multipack']))
|
||||
processed_data["multipack"] = int(float(processed_data["multipack"]))
|
||||
except (ValueError, TypeError):
|
||||
processed_data['multipack'] = None
|
||||
processed_data["multipack"] = None
|
||||
|
||||
return processed_data
|
||||
|
||||
async def process_marketplace_csv_from_url(
|
||||
self,
|
||||
url: str,
|
||||
marketplace: str,
|
||||
shop_name: str,
|
||||
batch_size: int,
|
||||
db: Session
|
||||
self, url: str, marketplace: str, shop_name: str, batch_size: int, db: Session
|
||||
) -> Dict[str, Any]:
|
||||
"""
|
||||
Process CSV from URL with marketplace and shop information
|
||||
@@ -194,7 +195,9 @@ class CSVProcessor:
|
||||
Dictionary with processing results
|
||||
"""
|
||||
|
||||
logger.info(f"Starting marketplace CSV import from {url} for {marketplace} -> {shop_name}")
|
||||
logger.info(
|
||||
f"Starting marketplace CSV import from {url} for {marketplace} -> {shop_name}"
|
||||
)
|
||||
# Download and parse CSV
|
||||
csv_content = self.download_csv(url)
|
||||
df = self.parse_csv(csv_content)
|
||||
@@ -208,40 +211,42 @@ class CSVProcessor:
|
||||
|
||||
# Process in batches
|
||||
for i in range(0, len(df), batch_size):
|
||||
batch_df = df.iloc[i:i + batch_size]
|
||||
batch_df = df.iloc[i : i + batch_size]
|
||||
batch_result = await self._process_marketplace_batch(
|
||||
batch_df, marketplace, shop_name, db, i // batch_size + 1
|
||||
)
|
||||
|
||||
imported += batch_result['imported']
|
||||
updated += batch_result['updated']
|
||||
errors += batch_result['errors']
|
||||
imported += batch_result["imported"]
|
||||
updated += batch_result["updated"]
|
||||
errors += batch_result["errors"]
|
||||
|
||||
logger.info(f"Processed batch {i // batch_size + 1}: {batch_result}")
|
||||
|
||||
return {
|
||||
'total_processed': imported + updated + errors,
|
||||
'imported': imported,
|
||||
'updated': updated,
|
||||
'errors': errors,
|
||||
'marketplace': marketplace,
|
||||
'shop_name': shop_name
|
||||
"total_processed": imported + updated + errors,
|
||||
"imported": imported,
|
||||
"updated": updated,
|
||||
"errors": errors,
|
||||
"marketplace": marketplace,
|
||||
"shop_name": shop_name,
|
||||
}
|
||||
|
||||
async def _process_marketplace_batch(
|
||||
self,
|
||||
batch_df: pd.DataFrame,
|
||||
marketplace: str,
|
||||
shop_name: str,
|
||||
db: Session,
|
||||
batch_num: int
|
||||
self,
|
||||
batch_df: pd.DataFrame,
|
||||
marketplace: str,
|
||||
shop_name: str,
|
||||
db: Session,
|
||||
batch_num: int,
|
||||
) -> Dict[str, int]:
|
||||
"""Process a batch of CSV rows with marketplace information"""
|
||||
imported = 0
|
||||
updated = 0
|
||||
errors = 0
|
||||
|
||||
logger.info(f"Processing batch {batch_num} with {len(batch_df)} rows for {marketplace} -> {shop_name}")
|
||||
logger.info(
|
||||
f"Processing batch {batch_num} with {len(batch_df)} rows for {marketplace} -> {shop_name}"
|
||||
)
|
||||
|
||||
for index, row in batch_df.iterrows():
|
||||
try:
|
||||
@@ -249,42 +254,54 @@ class CSVProcessor:
|
||||
product_data = self._clean_row_data(row.to_dict())
|
||||
|
||||
# Add marketplace and shop information
|
||||
product_data['marketplace'] = marketplace
|
||||
product_data['shop_name'] = shop_name
|
||||
product_data["marketplace"] = marketplace
|
||||
product_data["shop_name"] = shop_name
|
||||
|
||||
# Validate required fields
|
||||
if not product_data.get('product_id'):
|
||||
if not product_data.get("product_id"):
|
||||
logger.warning(f"Row {index}: Missing product_id, skipping")
|
||||
errors += 1
|
||||
continue
|
||||
|
||||
if not product_data.get('title'):
|
||||
if not product_data.get("title"):
|
||||
logger.warning(f"Row {index}: Missing title, skipping")
|
||||
errors += 1
|
||||
continue
|
||||
|
||||
# Check if product exists
|
||||
existing_product = db.query(Product).filter(
|
||||
Product.product_id == literal(product_data['product_id'])
|
||||
).first()
|
||||
existing_product = (
|
||||
db.query(Product)
|
||||
.filter(Product.product_id == literal(product_data["product_id"]))
|
||||
.first()
|
||||
)
|
||||
|
||||
if existing_product:
|
||||
# Update existing product
|
||||
for key, value in product_data.items():
|
||||
if key not in ['id', 'created_at'] and hasattr(existing_product, key):
|
||||
if key not in ["id", "created_at"] and hasattr(
|
||||
existing_product, key
|
||||
):
|
||||
setattr(existing_product, key, value)
|
||||
existing_product.updated_at = datetime.utcnow()
|
||||
updated += 1
|
||||
logger.debug(f"Updated product {product_data['product_id']} for {marketplace} and shop {shop_name}")
|
||||
logger.debug(
|
||||
f"Updated product {product_data['product_id']} for {marketplace} and shop {shop_name}"
|
||||
)
|
||||
else:
|
||||
# Create new product
|
||||
filtered_data = {k: v for k, v in product_data.items()
|
||||
if k not in ['id', 'created_at', 'updated_at'] and hasattr(Product, k)}
|
||||
filtered_data = {
|
||||
k: v
|
||||
for k, v in product_data.items()
|
||||
if k not in ["id", "created_at", "updated_at"]
|
||||
and hasattr(Product, k)
|
||||
}
|
||||
new_product = Product(**filtered_data)
|
||||
db.add(new_product)
|
||||
imported += 1
|
||||
logger.debug(f"Imported new product {product_data['product_id']} for {marketplace} and shop "
|
||||
f"{shop_name}")
|
||||
logger.debug(
|
||||
f"Imported new product {product_data['product_id']} for {marketplace} and shop "
|
||||
f"{shop_name}"
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error processing row: {e}")
|
||||
@@ -303,8 +320,4 @@ class CSVProcessor:
|
||||
imported = 0
|
||||
updated = 0
|
||||
|
||||
return {
|
||||
'imported': imported,
|
||||
'updated': updated,
|
||||
'errors': errors
|
||||
}
|
||||
return {"imported": imported, "updated": updated, "errors": errors}
|
||||
|
||||
@@ -1,8 +1,9 @@
|
||||
# utils/data_processing.py
|
||||
import re
|
||||
import pandas as pd
|
||||
from typing import Tuple, Optional
|
||||
import logging
|
||||
import re
|
||||
from typing import Optional, Tuple
|
||||
|
||||
import pandas as pd
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -25,11 +26,11 @@ class GTINProcessor:
|
||||
return None
|
||||
|
||||
# Remove decimal point (e.g., "889698116923.0" -> "889698116923")
|
||||
if '.' in gtin_str:
|
||||
gtin_str = gtin_str.split('.')[0]
|
||||
if "." in gtin_str:
|
||||
gtin_str = gtin_str.split(".")[0]
|
||||
|
||||
# Keep only digits
|
||||
gtin_clean = ''.join(filter(str.isdigit, gtin_str))
|
||||
gtin_clean = "".join(filter(str.isdigit, gtin_str))
|
||||
|
||||
if not gtin_clean:
|
||||
return None
|
||||
@@ -73,23 +74,23 @@ class PriceProcessor:
|
||||
|
||||
CURRENCY_PATTERNS = {
|
||||
# Amount followed by currency
|
||||
r'([0-9.,]+)\s*(EUR|€)': lambda m: (m.group(1), 'EUR'),
|
||||
r'([0-9.,]+)\s*(USD|\$)': lambda m: (m.group(1), 'USD'),
|
||||
r'([0-9.,]+)\s*(GBP|£)': lambda m: (m.group(1), 'GBP'),
|
||||
r'([0-9.,]+)\s*(CHF)': lambda m: (m.group(1), 'CHF'),
|
||||
r'([0-9.,]+)\s*(CAD|AUD|JPY|¥)': lambda m: (m.group(1), m.group(2).upper()),
|
||||
|
||||
r"([0-9.,]+)\s*(EUR|€)": lambda m: (m.group(1), "EUR"),
|
||||
r"([0-9.,]+)\s*(USD|\$)": lambda m: (m.group(1), "USD"),
|
||||
r"([0-9.,]+)\s*(GBP|£)": lambda m: (m.group(1), "GBP"),
|
||||
r"([0-9.,]+)\s*(CHF)": lambda m: (m.group(1), "CHF"),
|
||||
r"([0-9.,]+)\s*(CAD|AUD|JPY|¥)": lambda m: (m.group(1), m.group(2).upper()),
|
||||
# Currency followed by amount
|
||||
r'(EUR|€)\s*([0-9.,]+)': lambda m: (m.group(2), 'EUR'),
|
||||
r'(USD|\$)\s*([0-9.,]+)': lambda m: (m.group(2), 'USD'),
|
||||
r'(GBP|£)\s*([0-9.,]+)': lambda m: (m.group(2), 'GBP'),
|
||||
|
||||
r"(EUR|€)\s*([0-9.,]+)": lambda m: (m.group(2), "EUR"),
|
||||
r"(USD|\$)\s*([0-9.,]+)": lambda m: (m.group(2), "USD"),
|
||||
r"(GBP|£)\s*([0-9.,]+)": lambda m: (m.group(2), "GBP"),
|
||||
# Generic 3-letter currency codes
|
||||
r'([0-9.,]+)\s*([A-Z]{3})': lambda m: (m.group(1), m.group(2)),
|
||||
r'([A-Z]{3})\s*([0-9.,]+)': lambda m: (m.group(2), m.group(1)),
|
||||
r"([0-9.,]+)\s*([A-Z]{3})": lambda m: (m.group(1), m.group(2)),
|
||||
r"([A-Z]{3})\s*([0-9.,]+)": lambda m: (m.group(2), m.group(1)),
|
||||
}
|
||||
|
||||
def parse_price_currency(self, price_str: any) -> Tuple[Optional[str], Optional[str]]:
|
||||
def parse_price_currency(
|
||||
self, price_str: any
|
||||
) -> Tuple[Optional[str], Optional[str]]:
|
||||
"""
|
||||
Parse price string into (price, currency) tuple
|
||||
Returns (None, None) if parsing fails
|
||||
@@ -108,7 +109,7 @@ class PriceProcessor:
|
||||
try:
|
||||
price_val, currency_val = extract_func(match)
|
||||
# Normalize price (remove spaces, handle comma as decimal)
|
||||
price_val = price_val.replace(' ', '').replace(',', '.')
|
||||
price_val = price_val.replace(" ", "").replace(",", ".")
|
||||
# Validate numeric
|
||||
float(price_val)
|
||||
return price_val, currency_val.upper()
|
||||
@@ -116,10 +117,10 @@ class PriceProcessor:
|
||||
continue
|
||||
|
||||
# Fallback: extract just numbers
|
||||
number_match = re.search(r'([0-9.,]+)', price_str)
|
||||
number_match = re.search(r"([0-9.,]+)", price_str)
|
||||
if number_match:
|
||||
try:
|
||||
price_val = number_match.group(1).replace(',', '.')
|
||||
price_val = number_match.group(1).replace(",", ".")
|
||||
float(price_val) # Validate
|
||||
return price_val, None
|
||||
except ValueError:
|
||||
|
||||
@@ -1,20 +1,19 @@
|
||||
# utils/database.py
|
||||
import logging
|
||||
|
||||
from sqlalchemy import create_engine
|
||||
from sqlalchemy.orm import sessionmaker
|
||||
from sqlalchemy.pool import QueuePool
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def get_db_engine(database_url: str):
|
||||
"""Create database engine with connection pooling"""
|
||||
if database_url.startswith('sqlite'):
|
||||
if database_url.startswith("sqlite"):
|
||||
# SQLite configuration
|
||||
engine = create_engine(
|
||||
database_url,
|
||||
connect_args={"check_same_thread": False},
|
||||
echo=False
|
||||
database_url, connect_args={"check_same_thread": False}, echo=False
|
||||
)
|
||||
else:
|
||||
# PostgreSQL configuration with connection pooling
|
||||
@@ -24,7 +23,7 @@ def get_db_engine(database_url: str):
|
||||
pool_size=10,
|
||||
max_overflow=20,
|
||||
pool_pre_ping=True,
|
||||
echo=False
|
||||
echo=False,
|
||||
)
|
||||
|
||||
logger.info(f"Database engine created for: {database_url.split('@')[0]}@...")
|
||||
|
||||
Reference in New Issue
Block a user