| """Generate realistic e-commerce demo data for the Enterprise AI Assistant.""" |
|
|
| import random |
| import sqlite3 |
| import string |
| from datetime import datetime, timedelta |
| from pathlib import Path |
|
|
| from logger.logging import get_logger |
|
|
| logger = get_logger(__name__) |
|
|
| |
| CATEGORIES = { |
| "Electronics": { |
| "subcategories": ["Smartphones", "Laptops", "Headphones", "Tablets", "Cameras"], |
| "brands": ["TechPro", "NovaTech", "DigitalEdge", "SmartLine", "PixelCore"], |
| "price_range": (49.99, 1999.99), |
| "cost_ratio": (0.4, 0.6), |
| }, |
| "Clothing": { |
| "subcategories": ["T-Shirts", "Jeans", "Jackets", "Dresses", "Shoes"], |
| "brands": ["UrbanStyle", "ClassicWear", "TrendSet", "ActiveFit", "EcoThread"], |
| "price_range": (19.99, 299.99), |
| "cost_ratio": (0.3, 0.5), |
| }, |
| "Home & Kitchen": { |
| "subcategories": ["Cookware", "Furniture", "Decor", "Appliances", "Bedding"], |
| "brands": [ |
| "HomeEssentials", |
| "CozyLiving", |
| "KitchenPro", |
| "ModernNest", |
| "PureLiving", |
| ], |
| "price_range": (14.99, 899.99), |
| "cost_ratio": (0.35, 0.55), |
| }, |
| "Books": { |
| "subcategories": [ |
| "Fiction", |
| "Non-Fiction", |
| "Technical", |
| "Self-Help", |
| "Children", |
| ], |
| "brands": ["PageTurner", "MindGrowth", "TechReads", "StoryHouse", "KidsWorld"], |
| "price_range": (7.99, 59.99), |
| "cost_ratio": (0.2, 0.4), |
| }, |
| "Sports": { |
| "subcategories": ["Fitness", "Outdoor", "Team Sports", "Yoga", "Running"], |
| "brands": ["FitGear", "TrailBlazer", "ProSport", "ZenFlex", "SpeedRun"], |
| "price_range": (9.99, 499.99), |
| "cost_ratio": (0.3, 0.5), |
| }, |
| "Beauty": { |
| "subcategories": ["Skincare", "Makeup", "Haircare", "Fragrance", "Wellness"], |
| "brands": ["GlowUp", "PureSkin", "LuxBeauty", "NatureGlow", "VitalCare"], |
| "price_range": (5.99, 199.99), |
| "cost_ratio": (0.15, 0.35), |
| }, |
| "Toys": { |
| "subcategories": [ |
| "Educational", |
| "Action Figures", |
| "Board Games", |
| "Building Sets", |
| "Dolls", |
| ], |
| "brands": ["FunLearn", "HeroWorld", "GameMaster", "BuildIt", "DreamPlay"], |
| "price_range": (9.99, 149.99), |
| "cost_ratio": (0.25, 0.45), |
| }, |
| "Office": { |
| "subcategories": [ |
| "Supplies", |
| "Furniture", |
| "Technology", |
| "Organization", |
| "Writing", |
| ], |
| "brands": ["WorkSmart", "DeskPro", "OfficeTech", "NeatSpace", "PenCraft"], |
| "price_range": (4.99, 599.99), |
| "cost_ratio": (0.3, 0.5), |
| }, |
| "Garden": { |
| "subcategories": [ |
| "Tools", |
| "Plants", |
| "Outdoor Furniture", |
| "Lighting", |
| "Irrigation", |
| ], |
| "brands": ["GreenThumb", "GardenPro", "OutdoorLife", "SunGlow", "AquaGrow"], |
| "price_range": (7.99, 399.99), |
| "cost_ratio": (0.3, 0.5), |
| }, |
| "Automotive": { |
| "subcategories": ["Accessories", "Parts", "Tools", "Electronics", "Care"], |
| "brands": ["AutoPro", "DriveMax", "MechTools", "CarTech", "ShineGuard"], |
| "price_range": (9.99, 499.99), |
| "cost_ratio": (0.35, 0.55), |
| }, |
| } |
|
|
| FIRST_NAMES = [ |
| "James", |
| "Mary", |
| "Robert", |
| "Patricia", |
| "John", |
| "Jennifer", |
| "Michael", |
| "Linda", |
| "David", |
| "Elizabeth", |
| "William", |
| "Barbara", |
| "Richard", |
| "Susan", |
| "Joseph", |
| "Jessica", |
| "Thomas", |
| "Sarah", |
| "Christopher", |
| "Karen", |
| "Charles", |
| "Lisa", |
| "Daniel", |
| "Nancy", |
| "Matthew", |
| "Betty", |
| "Anthony", |
| "Margaret", |
| "Mark", |
| "Sandra", |
| "Donald", |
| "Ashley", |
| "Steven", |
| "Kimberly", |
| "Paul", |
| "Emily", |
| "Andrew", |
| "Donna", |
| "Joshua", |
| "Michelle", |
| "Kenneth", |
| "Carol", |
| "Kevin", |
| "Amanda", |
| "Brian", |
| "Dorothy", |
| "George", |
| "Melissa", |
| "Timothy", |
| "Deborah", |
| "Ronald", |
| "Stephanie", |
| "Edward", |
| "Rebecca", |
| "Jason", |
| "Sharon", |
| "Jeffrey", |
| "Laura", |
| "Ryan", |
| "Cynthia", |
| "Jacob", |
| "Kathleen", |
| "Gary", |
| "Amy", |
| "Nicholas", |
| "Angela", |
| "Eric", |
| "Shirley", |
| "Jonathan", |
| "Anna", |
| "Stephen", |
| "Brenda", |
| ] |
|
|
| LAST_NAMES = [ |
| "Smith", |
| "Johnson", |
| "Williams", |
| "Brown", |
| "Jones", |
| "Garcia", |
| "Miller", |
| "Davis", |
| "Rodriguez", |
| "Martinez", |
| "Hernandez", |
| "Lopez", |
| "Gonzalez", |
| "Wilson", |
| "Anderson", |
| "Thomas", |
| "Taylor", |
| "Moore", |
| "Jackson", |
| "Martin", |
| "Lee", |
| "Perez", |
| "Thompson", |
| "White", |
| "Harris", |
| "Sanchez", |
| "Clark", |
| "Ramirez", |
| "Lewis", |
| "Robinson", |
| "Walker", |
| "Young", |
| "Allen", |
| "King", |
| "Wright", |
| "Scott", |
| "Torres", |
| "Nguyen", |
| "Hill", |
| "Flores", |
| "Green", |
| "Adams", |
| "Nelson", |
| "Baker", |
| "Hall", |
| "Rivera", |
| "Campbell", |
| "Mitchell", |
| "Carter", |
| ] |
|
|
| CITIES = [ |
| ("New York", "NY"), |
| ("Los Angeles", "CA"), |
| ("Chicago", "IL"), |
| ("Houston", "TX"), |
| ("Phoenix", "AZ"), |
| ("Philadelphia", "PA"), |
| ("San Antonio", "TX"), |
| ("San Diego", "CA"), |
| ("Dallas", "TX"), |
| ("San Jose", "CA"), |
| ("Austin", "TX"), |
| ("Jacksonville", "FL"), |
| ("Fort Worth", "TX"), |
| ("Columbus", "OH"), |
| ("Charlotte", "NC"), |
| ("San Francisco", "CA"), |
| ("Indianapolis", "IN"), |
| ("Seattle", "WA"), |
| ("Denver", "CO"), |
| ("Nashville", "TN"), |
| ("Portland", "OR"), |
| ("Boston", "MA"), |
| ("Atlanta", "GA"), |
| ("Miami", "FL"), |
| ("Minneapolis", "MN"), |
| ("Detroit", "MI"), |
| ("Tampa", "FL"), |
| ("Pittsburgh", "PA"), |
| ] |
|
|
| REVIEW_TITLES_POSITIVE = [ |
| "Excellent product!", |
| "Highly recommend!", |
| "Best purchase ever", |
| "Great value for money", |
| "Exceeded expectations", |
| "Love it!", |
| "Perfect for my needs", |
| "Outstanding quality", |
| "Would buy again", |
| "Fantastic!", |
| "Very satisfied", |
| "Top notch", |
| ] |
|
|
| REVIEW_TITLES_NEGATIVE = [ |
| "Disappointed", |
| "Not as expected", |
| "Poor quality", |
| "Would not recommend", |
| "Broke after a week", |
| "Waste of money", |
| "Terrible experience", |
| "Not worth it", |
| "Very dissatisfied", |
| ] |
|
|
| REVIEW_BODIES_POSITIVE = [ |
| "This product exceeded my expectations in every way. The build quality is excellent and it works perfectly.", |
| "I've been using this for a few weeks now and I'm very happy with my purchase. Highly recommended!", |
| "Great product at a fair price. Shipping was fast and packaging was secure.", |
| "Exactly what I was looking for. The quality is impressive for this price point.", |
| "This is my second purchase of this product. That should tell you how good it is!", |
| "Bought this as a gift and the recipient loved it. Will definitely buy from this brand again.", |
| ] |
|
|
| REVIEW_BODIES_NEGATIVE = [ |
| "The product arrived damaged and customer support was unhelpful. Very disappointed.", |
| "Quality is much lower than what was advertised. The materials feel cheap.", |
| "Stopped working after just two weeks of normal use. Not worth the money.", |
| "The product looks nothing like the pictures. I'm returning this.", |
| ] |
|
|
|
|
| def generate_tracking_number(): |
| """Generate a realistic tracking number.""" |
| prefix = random.choice(["1Z", "94", "92", "JD"]) |
| digits = "".join(random.choices(string.digits, k=12)) |
| return f"{prefix}{digits}" |
|
|
|
|
| def seed_database(db_path: str = "database/ecommerce.db"): |
| """Seed the database with realistic e-commerce data.""" |
| random.seed(42) |
|
|
| db_path = Path(db_path) |
| db_path.parent.mkdir(parents=True, exist_ok=True) |
|
|
| |
| schema_path = Path(__file__).parent / "schema.sql" |
| with open(schema_path, "r") as f: |
| schema_sql = f.read() |
|
|
| conn = sqlite3.connect(str(db_path)) |
| cursor = conn.cursor() |
| cursor.executescript(schema_sql) |
|
|
| logger.info("Schema created, seeding data...") |
|
|
| |
| customers = [] |
| used_emails = set() |
| segments = ["Consumer", "Corporate", "Enterprise"] |
| segment_weights = [0.6, 0.3, 0.1] |
|
|
| for i in range(500): |
| first = random.choice(FIRST_NAMES) |
| last = random.choice(LAST_NAMES) |
|
|
| |
| base_email = f"{first.lower()}.{last.lower()}" |
| email = f"{base_email}@example.com" |
| suffix = 1 |
| while email in used_emails: |
| email = f"{base_email}{suffix}@example.com" |
| suffix += 1 |
| used_emails.add(email) |
|
|
| city, state = random.choice(CITIES) |
| segment = random.choices(segments, weights=segment_weights, k=1)[0] |
| zip_code = f"{random.randint(10000, 99999)}" |
| phone = f"({random.randint(200, 999)}) {random.randint(200, 999)}-{random.randint(1000, 9999)}" |
| address = f"{random.randint(1, 9999)} {random.choice(['Main', 'Oak', 'Elm', 'Park', 'Cedar', 'Maple', 'Pine', 'Lake', 'Hill', 'River'])} {random.choice(['St', 'Ave', 'Blvd', 'Dr', 'Ln', 'Rd'])}" |
|
|
| |
| days_ago = random.randint(1, 730) |
| created_at = datetime.now() - timedelta(days=days_ago) |
|
|
| customers.append( |
| ( |
| first, |
| last, |
| email, |
| phone, |
| address, |
| city, |
| state, |
| zip_code, |
| "US", |
| segment, |
| 0, |
| created_at.strftime("%Y-%m-%d %H:%M:%S"), |
| ) |
| ) |
|
|
| cursor.executemany( |
| "INSERT INTO customers (first_name, last_name, email, phone, address, city, state, zip_code, country, segment, lifetime_value, created_at) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", |
| customers, |
| ) |
| logger.info(f"Inserted {len(customers)} customers") |
|
|
| |
| products = [] |
| product_id = 0 |
| for category, info in CATEGORIES.items(): |
| for _ in range(20): |
| product_id += 1 |
| subcat = random.choice(info["subcategories"]) |
| brand = random.choice(info["brands"]) |
| name = f"{brand} {subcat} {random.choice(['Pro', 'Plus', 'Max', 'Lite', 'Ultra', 'Classic', 'Elite', 'Eco', 'Prime', 'Basic'])} {random.choice(['X', 'V2', 'SE', '2024', 'Series', ''])}" |
| name = name.strip() |
|
|
| price = round(random.uniform(*info["price_range"]), 2) |
| cost_ratio = random.uniform(*info["cost_ratio"]) |
| cost = round(price * cost_ratio, 2) |
| stock = random.randint(0, 500) |
| rating = round(random.uniform(2.5, 5.0), 1) |
| review_count = random.randint(0, 300) |
|
|
| days_ago = random.randint(1, 730) |
| created_at = datetime.now() - timedelta(days=days_ago) |
|
|
| products.append( |
| ( |
| name, |
| category, |
| subcat, |
| brand, |
| price, |
| cost, |
| stock, |
| rating, |
| review_count, |
| 1, |
| created_at.strftime("%Y-%m-%d %H:%M:%S"), |
| ) |
| ) |
|
|
| cursor.executemany( |
| "INSERT INTO products (name, category, subcategory, brand, price, cost, stock_quantity, rating, review_count, is_active, created_at) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", |
| products, |
| ) |
| logger.info(f"Inserted {len(products)} products") |
|
|
| |
| orders = [] |
| statuses = [ |
| "Pending", |
| "Processing", |
| "Shipped", |
| "Delivered", |
| "Cancelled", |
| "Returned", |
| ] |
| status_weights = [0.05, 0.05, 0.1, 0.65, 0.1, 0.05] |
| payment_methods = [ |
| "Credit Card", |
| "Debit Card", |
| "PayPal", |
| "Bank Transfer", |
| "Gift Card", |
| ] |
| payment_weights = [0.4, 0.2, 0.25, 0.1, 0.05] |
|
|
| for _ in range(2000): |
| customer_id = random.randint(1, 500) |
| days_ago = random.randint(1, 365) |
| order_date = datetime.now() - timedelta( |
| days=days_ago, hours=random.randint(0, 23), minutes=random.randint(0, 59) |
| ) |
| status = random.choices(statuses, weights=status_weights, k=1)[0] |
| payment = random.choices(payment_methods, weights=payment_weights, k=1)[0] |
| shipping_cost = round(random.choice([0, 4.99, 7.99, 9.99, 14.99]), 2) |
| discount = round(random.choice([0, 0, 0, 5.0, 10.0, 15.0, 20.0, 25.0]), 2) |
|
|
| city, state = random.choice(CITIES) |
| shipping_address = f"{random.randint(1, 9999)} {random.choice(['Main', 'Oak', 'Elm'])} St, {city}, {state}" |
| tracking = ( |
| generate_tracking_number() if status in ["Shipped", "Delivered"] else None |
| ) |
|
|
| orders.append( |
| ( |
| customer_id, |
| order_date.strftime("%Y-%m-%d %H:%M:%S"), |
| status, |
| 0, |
| discount, |
| shipping_cost, |
| payment, |
| shipping_address, |
| tracking, |
| ) |
| ) |
|
|
| cursor.executemany( |
| "INSERT INTO orders (customer_id, order_date, status, total_amount, discount_amount, shipping_cost, payment_method, shipping_address, tracking_number) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)", |
| orders, |
| ) |
| logger.info(f"Inserted {len(orders)} orders") |
|
|
| |
| order_items = [] |
| order_totals = {} |
|
|
| for _ in range(5000): |
| order_id = random.randint(1, 2000) |
| product_id = random.randint(1, 200) |
|
|
| |
| cursor.execute("SELECT price FROM products WHERE product_id = ?", (product_id,)) |
| row = cursor.fetchone() |
| unit_price = row[0] if row else 29.99 |
|
|
| quantity = random.choices( |
| [1, 2, 3, 4, 5], weights=[0.5, 0.25, 0.15, 0.05, 0.05], k=1 |
| )[0] |
| discount_pct = random.choice([0, 0, 0, 0, 5, 10, 15, 20]) |
|
|
| line_total = unit_price * quantity * (1 - discount_pct / 100) |
| order_totals[order_id] = order_totals.get(order_id, 0) + line_total |
|
|
| order_items.append((order_id, product_id, quantity, unit_price, discount_pct)) |
|
|
| cursor.executemany( |
| "INSERT INTO order_items (order_id, product_id, quantity, unit_price, discount_percent) VALUES (?, ?, ?, ?, ?)", |
| order_items, |
| ) |
| logger.info(f"Inserted {len(order_items)} order items") |
|
|
| |
| for order_id, total in order_totals.items(): |
| cursor.execute( |
| "UPDATE orders SET total_amount = ? WHERE order_id = ?", |
| (round(total, 2), order_id), |
| ) |
|
|
| |
| cursor.execute(""" |
| UPDATE customers SET lifetime_value = ( |
| SELECT COALESCE(SUM(o.total_amount), 0) FROM orders o |
| WHERE o.customer_id = customers.customer_id AND o.status != 'Cancelled' |
| ) |
| """) |
|
|
| |
| reviews = [] |
| for _ in range(1500): |
| product_id = random.randint(1, 200) |
| customer_id = random.randint(1, 500) |
| rating = random.choices( |
| [1, 2, 3, 4, 5], weights=[0.05, 0.08, 0.12, 0.35, 0.40], k=1 |
| )[0] |
|
|
| if rating >= 4: |
| title = random.choice(REVIEW_TITLES_POSITIVE) |
| body = random.choice(REVIEW_BODIES_POSITIVE) |
| else: |
| title = random.choice(REVIEW_TITLES_NEGATIVE) |
| body = random.choice(REVIEW_BODIES_NEGATIVE) |
|
|
| helpful_votes = random.randint(0, 50) |
| verified = random.choices([1, 0], weights=[0.8, 0.2], k=1)[0] |
| days_ago = random.randint(1, 365) |
| created_at = datetime.now() - timedelta(days=days_ago) |
|
|
| reviews.append( |
| ( |
| product_id, |
| customer_id, |
| rating, |
| title, |
| body, |
| helpful_votes, |
| verified, |
| created_at.strftime("%Y-%m-%d %H:%M:%S"), |
| ) |
| ) |
|
|
| cursor.executemany( |
| "INSERT INTO reviews (product_id, customer_id, rating, title, body, helpful_votes, verified_purchase, created_at) VALUES (?, ?, ?, ?, ?, ?, ?, ?)", |
| reviews, |
| ) |
| logger.info(f"Inserted {len(reviews)} reviews") |
|
|
| |
| inv_logs = [] |
| change_types = ["Restock", "Sale", "Return", "Adjustment", "Damaged"] |
| change_weights = [0.2, 0.5, 0.1, 0.15, 0.05] |
|
|
| for _ in range(3000): |
| product_id = random.randint(1, 200) |
| change_type = random.choices(change_types, weights=change_weights, k=1)[0] |
|
|
| if change_type == "Restock": |
| qty_change = random.randint(10, 200) |
| elif change_type == "Sale": |
| qty_change = -random.randint(1, 10) |
| elif change_type == "Return": |
| qty_change = random.randint(1, 5) |
| elif change_type == "Adjustment": |
| qty_change = random.randint(-20, 20) |
| else: |
| qty_change = -random.randint(1, 5) |
|
|
| prev_stock = random.randint(10, 500) |
| new_stock = max(0, prev_stock + qty_change) |
| notes = f"{change_type}: {'Added' if qty_change > 0 else 'Removed'} {abs(qty_change)} units" |
|
|
| days_ago = random.randint(1, 365) |
| created_at = datetime.now() - timedelta(days=days_ago) |
|
|
| inv_logs.append( |
| ( |
| product_id, |
| change_type, |
| qty_change, |
| prev_stock, |
| new_stock, |
| notes, |
| created_at.strftime("%Y-%m-%d %H:%M:%S"), |
| ) |
| ) |
|
|
| cursor.executemany( |
| "INSERT INTO inventory_log (product_id, change_type, quantity_change, previous_stock, new_stock, notes, created_at) VALUES (?, ?, ?, ?, ?, ?, ?)", |
| inv_logs, |
| ) |
| logger.info(f"Inserted {len(inv_logs)} inventory log entries") |
|
|
| conn.commit() |
| conn.close() |
|
|
| logger.info(f"Database seeded successfully at {db_path}") |
| logger.info( |
| f"Summary: 500 customers, 200 products, 2000 orders, 5000 order items, 1500 reviews, 3000 inventory logs" |
| ) |
|
|
|
|
| if __name__ == "__main__": |
| seed_database() |
|
|