fixing DQ issues
utils/csv_processor.py

@@ -1,4 +1,12 @@
 # utils/csv_processor.py
+"""CSV processor utilities ....
+
+This module provides classes and functions for:
+- ....
+- ....
+- ....
+"""
+
 import logging
 from datetime import datetime
 from io import StringIO
@@ -15,7 +23,7 @@ logger = logging.getLogger(__name__)
 
 
 class CSVProcessor:
-    """Handles CSV import with robust parsing and batching"""
+    """Handles CSV import with robust parsing and batching."""
 
     ENCODINGS = ["utf-8", "latin-1", "iso-8859-1", "cp1252", "utf-8-sig"]
 
@@ -75,13 +83,14 @@ class CSVProcessor:
     }
 
     def __init__(self):
+        """Class constructor."""
         from utils.data_processing import GTINProcessor, PriceProcessor
 
         self.gtin_processor = GTINProcessor()
         self.price_processor = PriceProcessor()
 
     def download_csv(self, url: str) -> str:
-        """Download and decode CSV with multiple encoding attempts"""
+        """Download and decode CSV with multiple encoding attempts."""
         try:
             response = requests.get(url, timeout=30)
             response.raise_for_status()
@@ -107,8 +116,7 @@ class CSVProcessor:
             raise
 
     def parse_csv(self, csv_content: str) -> pd.DataFrame:
-        """Parse CSV with multiple separator attempts"""
-
+        """Parse CSV with multiple separator attempts."""
         for config in self.PARSING_CONFIGS:
             try:
                 df = pd.read_csv(
@@ -127,7 +135,7 @@ class CSVProcessor:
         raise pd.errors.ParserError("Could not parse CSV with any configuration")
 
     def normalize_columns(self, df: pd.DataFrame) -> pd.DataFrame:
-        """Normalize column names using mapping"""
+        """Normalize column names using mapping."""
         # Clean column names
         df.columns = df.columns.str.strip()
 
@@ -138,7 +146,7 @@ class CSVProcessor:
         return df
 
     def _clean_row_data(self, row_data: Dict[str, Any]) -> Dict[str, Any]:
-        """Process a single row with data normalization"""
+        """Process a single row with data normalization."""
         # Handle NaN values
         processed_data = {k: (v if pd.notna(v) else None) for k, v in row_data.items()}
 
@@ -182,7 +190,7 @@ class CSVProcessor:
         self, url: str, marketplace: str, shop_name: str, batch_size: int, db: Session
     ) -> Dict[str, Any]:
         """
-        Process CSV from URL with marketplace and shop information
+        Process CSV from URL with marketplace and shop information.
 
         Args:
             url: URL to the CSV file
@@ -194,7 +202,6 @@ class CSVProcessor:
         Returns:
             Dictionary with processing results
         """
-
         logger.info(
             f"Starting marketplace CSV import from {url} for {marketplace} -> {shop_name}"
         )
@@ -239,13 +246,14 @@ class CSVProcessor:
         db: Session,
         batch_num: int,
     ) -> Dict[str, int]:
-        """Process a batch of CSV rows with marketplace information"""
+        """Process a batch of CSV rows with marketplace information."""
         imported = 0
         updated = 0
         errors = 0
 
         logger.info(
-            f"Processing batch {batch_num} with {len(batch_df)} rows for {marketplace} -> {shop_name}"
+            f"Processing batch {batch_num} with {len(batch_df)} rows for "
+            f"{marketplace} -> {shop_name}"
         )
 
         for index, row in batch_df.iterrows():
@@ -285,7 +293,8 @@ class CSVProcessor:
                     existing_product.updated_at = datetime.utcnow()
                     updated += 1
                     logger.debug(
-                        f"Updated product {product_data['product_id']} for {marketplace} and shop {shop_name}"
+                        f"Updated product {product_data['product_id']} for "
+                        f"{marketplace} and shop {shop_name}"
                     )
                 else:
                     # Create new product
@@ -299,8 +308,8 @@ class CSVProcessor:
                     db.add(new_product)
                     imported += 1
                     logger.debug(
-                        f"Imported new product {product_data['product_id']} for {marketplace} and shop "
-                        f"{shop_name}"
+                        f"Imported new product {product_data['product_id']} "
+                        f"for {marketplace} and shop {shop_name}"
                    )
 
            except Exception as e:
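For context on the hunks above: CSVProcessor decodes downloads by trying several encodings in order. The sketch below illustrates that fallback idea only; decode_with_fallback is a hypothetical stand-in, not the actual body of download_csv, which this diff does not show.

# Illustrative sketch (not from csv_processor.py): encoding fallback
# implied by CSVProcessor.ENCODINGS above.
from typing import Optional

ENCODINGS = ["utf-8", "latin-1", "iso-8859-1", "cp1252", "utf-8-sig"]


def decode_with_fallback(raw: bytes) -> Optional[str]:
    """Return the first successful decode, trying each candidate encoding in order."""
    for encoding in ENCODINGS:
        try:
            return raw.decode(encoding)
        except UnicodeDecodeError:
            continue
    return None


# A latin-1 byte sequence fails strict UTF-8 and falls through to latin-1.
# Note that latin-1 accepts any byte, so it effectively acts as a catch-all.
print(decode_with_fallback("caf\xe9 au lait, 9,99".encode("latin-1")))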
utils/data_processing.py

@@ -1,4 +1,12 @@
 # utils/data_processing.py
+"""Data processing utilities for GTIN validation and price parsing.
+
+This module provides classes and functions for:
+- GTIN (Global Trade Item Number) validation and normalization
+- Price parsing with currency detection
+- Data cleaning and validation utilities
+"""
+
 import logging
 import re
 from typing import Optional, Tuple
@@ -9,14 +17,15 @@ logger = logging.getLogger(__name__)
 
 
 class GTINProcessor:
-    """Handles GTIN normalization and validation"""
+    """Handles GTIN normalization and validation."""
 
     VALID_LENGTHS = [8, 12, 13, 14]
 
     def normalize(self, gtin_value: any) -> Optional[str]:
         """
-        Normalize GTIN to proper format
-        Returns None for invalid GTINs
+        Normalize GTIN to proper format.
+
+        Returns None for invalid GTINs.
         """
         if not gtin_value or pd.isna(gtin_value):
             return None
@@ -63,14 +72,14 @@ class GTINProcessor:
         return None
 
     def validate(self, gtin: str) -> bool:
-        """Validate GTIN format"""
+        """Validate GTIN format."""
         if not gtin:
             return False
         return len(gtin) in self.VALID_LENGTHS and gtin.isdigit()
 
 
 class PriceProcessor:
-    """Handles price parsing and currency extraction"""
+    """Handles price parsing and currency extraction."""
 
     CURRENCY_PATTERNS = {
         # Amount followed by currency
@@ -92,7 +101,8 @@ class PriceProcessor:
         self, price_str: any
     ) -> Tuple[Optional[str], Optional[str]]:
         """
-        Parse price string into (price, currency) tuple
+        Parse price string into (price, currency) tuple.
+
         Returns (None, None) if parsing fails
         """
         if not price_str or pd.isna(price_str):
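The validate() hunk above amounts to a length-and-digits check. A minimal sketch of that rule follows; is_valid_gtin is a hypothetical stand-alone name, and check-digit verification (a common extra step) is not part of what this diff shows.

# Sketch of the rule in GTINProcessor.validate: all digits, length 8/12/13/14.
VALID_LENGTHS = [8, 12, 13, 14]


def is_valid_gtin(gtin: str) -> bool:
    return bool(gtin) and gtin.isdigit() and len(gtin) in VALID_LENGTHS


assert is_valid_gtin("4006381333931")       # 13-digit EAN-13
assert not is_valid_gtin("12345")           # unsupported length
assert not is_valid_gtin("4006381333ABC")   # non-numeric characters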
utils/database.py

@@ -1,4 +1,12 @@
 # utils/database.py
+"""Database utilities ....
+
+This module provides classes and functions for:
+- ....
+- ....
+- ....
+"""
+
 import logging
 
 from sqlalchemy import create_engine
@@ -9,7 +17,7 @@ logger = logging.getLogger(__name__)
 
 
 def get_db_engine(database_url: str):
-    """Create database engine with connection pooling"""
+    """Create database engine with connection pooling."""
     if database_url.startswith("sqlite"):
         # SQLite configuration
         engine = create_engine(
@@ -26,10 +34,10 @@ def get_db_engine(database_url: str):
             echo=False,
         )
 
-    logger.info(f"Database engine created for: {database_url.split('@')[0]}@...")
+    logger.info(f"Database engine created for: " f"{database_url.split('@')[0]}@...")
     return engine
 
 
 def get_session_local(engine):
-    """Create session factory"""
+    """Create session factory."""
    return sessionmaker(autocommit=False, autoflush=False, bind=engine)
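A short usage sketch for the two helpers touched above, assuming the module is importable as utils.database; the SQLite URL is only an example value.

# Usage sketch (assumptions: utils.database is on the import path; any real
# DSN works the same way as the example URL below).
from utils.database import get_db_engine, get_session_local

engine = get_db_engine("sqlite:///./example.db")
SessionLocal = get_session_local(engine)

# Typical lifecycle for the sessionmaker factory returned above.
db = SessionLocal()
try:
    pass  # issue queries against db here
finally:
    db.close()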