#!/usr/bin/env python3 """ Generate realistic, high-quality synthetic data for the retail analytics SQL database. Outputs INSERT statements to seed_data.sql. Respects FK constraints and business patterns (seasonality, regional bias, price tiers). Uses faker for names/emails. Run: python3 generate_seed_sql.py """ import random from datetime import date, timedelta from pathlib import Path from faker import Faker Faker.seed(42) random.seed(42) fake = Faker() OUTPUT_FILE = Path(__file__).parent / "seed_data.sql" # --------------------------------------------------------------------------- # 1. Static: regions & product_categories (exact lists) # --------------------------------------------------------------------------- REGIONS = [ ("North America", "US"), ("North America", "CA"), ("North America", "MX"), ("South America", "BR"), ("Europe West", "FR"), ("Europe West", "DE"), ("UK & Ireland", "GB"), ("Asia Pacific", "JP"), ("Asia Pacific", "IN"), ("ANZ", "AU"), ("Middle East", "AE"), ("Africa", "ZA"), ] # Region IDs 1-12. For customer distribution: 1,2,3=NA (45%), 5,6,7=Europe (30%), 8,9=Asia (15%), 4,10,11,12=RoW (10%) REGION_IDS_NA = [1, 2, 3] REGION_IDS_EUROPE = [5, 6, 7] REGION_IDS_ASIA = [8, 9] REGION_IDS_ROW = [4, 10, 11, 12] CATEGORIES = [ "Electronics", "Computers", "Clothing", "Home & Garden", "Sports & Outdoors", "Toys & Games", "Books & Media", "Health & Beauty", "Office Supplies", "Automotive", "Pet Supplies", "Groceries", ] # Price ranges (min, max) per category index 0..11 CATEGORY_PRICE_RANGES = { 0: (200, 2500), # Electronics 1: (200, 2500), # Computers 2: (20, 150), # Clothing 3: (15, 180), # Home & Garden 4: (25, 200), # Sports & Outdoors 5: (10, 120), # Toys & Games 6: (8, 80), # Books & Media 7: (5, 100), # Health & Beauty 8: (2, 50), # Office Supplies 9: (30, 400), # Automotive 10: (5, 80), # Pet Supplies 11: (2, 50), # Groceries } # Meaningful product name templates per category (index 0..11) PRODUCT_TEMPLATES = { 0: ["Pro Wireless Headphones", "Smart Watch Series", "Bluetooth Speaker", "USB-C Hub", "Noise-Cancelling Earbuds", "Tablet Stand", "HD Webcam", "Portable Charger", "Gaming Mouse", "Mechanical Keyboard"], 1: ["Gaming Laptop 15\"", "Ultrabook 14\"", "All-in-One PC", "External SSD 1TB", "Monitor 27\"", "Docking Station", "Wireless Router", "Laptop Sleeve", "USB-C Dock", "NAS 4-Bay"], 2: ["Cotton T-Shirt", "Denim Jacket", "Running Shorts", "Wool Sweater", "Summer Dress", "Casual Blazer", "Athletic Leggings", "Polo Shirt", "Winter Coat", "Cargo Pants"], 3: ["LED Desk Lamp", "Plant Pot Set", "Kitchen Tool Set", "Throw Pillow", "Garden Hose", "Storage Bins", "Curtains", "Coffee Maker", "Blender", "Vacuum Cleaner"], 4: ["Yoga Mat", "Dumbbells Set", "Running Shoes", "Cycling Helmet", "Camping Tent", "Water Bottle", "Resistance Bands", "Fitness Tracker", "Tennis Racket", "Sleeping Bag"], 5: ["Board Game", "Building Blocks Set", "Action Figure", "Puzzle 500pc", "Art Supplies Kit", "Stuffed Animal", "Remote Car", "Educational Kit", "Card Game", "Outdoor Playset"], 6: ["Bestseller Novel", "Cookbook", "Children's Book", "Magazine Subscription", "Audiobook", "Reference Guide", "Comic Book", "Journal", "Puzzle Book", "Travel Guide"], 7: ["Moisturizer SPF 30", "Shampoo & Conditioner", "Face Cream", "Sunscreen Lotion", "Hand Soap", "Body Lotion", "Hair Serum", "Lip Balm Pack", "Vitamin C Serum", "Scented Candle"], 8: ["Sticky Notes Pack", "Ballpoint Pens 12pk", "Notebook A4", "File Folders", "Stapler", "Highlighters", "Paper Clips", "Desk Organizer", "Printer Paper", "Label Maker"], 9: ["Car Phone Mount", "Dash Cam", "Car Vacuum", "Seat Cover", "Snow Brush", "Tire Inflator", "Jump Starter", "Car Organizer", "Air Freshener", "Floor Mats"], 10: ["Dog Food 5kg", "Cat Litter", "Pet Bowl", "Chew Toy", "Grooming Brush", "Aquarium Filter", "Bird Feeder", "Flea Collar", "Pet Bed", "Treat Pouch"], 11: ["Organic Cereal", "Whole Grain Bread", "Olive Oil 500ml", "Pasta Pack", "Canned Tomatoes", "Honey Jar", "Nut Mix", "Granola", "Rice 2kg", "Coffee Beans"], } RETURN_REASONS = ["Defective", "Wrong item", "Changed mind", "Size fit issue", "Not as described"] def sql_escape(s): """Escape single quotes for SQL.""" if s is None: return "NULL" return "'" + str(s).replace("'", "''") + "'" def write_sql_header(f): f.write("-- Retail analytics seed data (generated by generate_seed_sql.py)\n") f.write("-- Idempotent: truncate then insert. Run against existing schema.\n\n") f.write("TRUNCATE TABLE returns, sales, promotions, products, customers, product_categories, regions RESTART IDENTITY CASCADE;\n\n") def emit_regions(f): f.write("-- Regions (static)\n") for name, country in REGIONS: f.write(f"INSERT INTO regions (name, country) VALUES ({sql_escape(name)}, {sql_escape(country)});\n") f.write("\n") def emit_categories(f): f.write("-- Product categories (static)\n") for c in CATEGORIES: f.write(f"INSERT INTO product_categories (name) VALUES ({sql_escape(c)});\n") f.write("\n") def emit_products(f): f.write("-- Products (10-20 per category, price by tier, meaningful names)\n") product_id = 0 products_by_cat = [] # list of (id, category_id, name, base_price) for later for cat_idx, cat_name in enumerate(CATEGORIES): n = random.randint(10, 20) low, high = CATEGORY_PRICE_RANGES[cat_idx] templates = PRODUCT_TEMPLATES[cat_idx] for i in range(n): product_id += 1 name = random.choice(templates) if n > len(templates): name = f"{name} {fake.word().capitalize()} {random.randint(1, 99)}" base_price = round(random.uniform(low, high), 2) f.write(f"INSERT INTO products (id, name, category_id, base_price) VALUES ({product_id}, {sql_escape(name)}, {cat_idx + 1}, {base_price});\n") products_by_cat.append((product_id, cat_idx + 1, name, base_price)) f.write("\n") return products_by_cat def emit_customers(f): # Distribution: 45% NA, 30% Europe, 15% Asia, 10% RoW. Growth: more created recently (last 2 years). f.write("-- Customers (~400, regional bias, growth trend over 2 years)\n") weights = [0.45, 0.30, 0.15, 0.10] pools = [REGION_IDS_NA, REGION_IDS_EUROPE, REGION_IDS_ASIA, REGION_IDS_ROW] end_date = date.today() start_date = end_date - timedelta(days=730) # Bias created_at toward recent: weight by (days_ago)^{-0.5} so more in last months day_offsets = list(range(730)) weights_days = [(730 - d) ** 0.5 for d in day_offsets] total_w = sum(weights_days) weights_days = [w / total_w for w in weights_days] for i in range(400): rid = random.choices(pools, weights=weights)[0] region_id = random.choice(rid) name = fake.name() email = fake.email() d = random.choices(day_offsets, weights=weights_days)[0] created_at = start_date + timedelta(days=d) f.write(f"INSERT INTO customers (id, name, email, region_id, created_at) VALUES ({i + 1}, {sql_escape(name)}, {sql_escape(email)}, {region_id}, '{created_at}');\n") f.write("\n") return 400 def emit_promotions(f): # 10-15 promotions. Codes: SUMMER2023 (15%), BLACKFRIDAY (30%), WELCOME (10%). end_date > start_date. f.write("-- Promotions (10-15, code types, end_date > start_date)\n") code_specs = [ ("SUMMER2023", 15), ("SUMMER2024", 15), ("BLACKFRIDAY", 30), ("CYBERMONDAY", 25), ("WELCOME", 10), ("WELCOME10", 10), ("SAVE20", 20), ("FLASH25", 25), ("HOLIDAY2023", 20), ("HOLIDAY2024", 20), ("NEWYEAR", 15), ("SPRINGSALE", 15), ("FALLSALE", 15), ("MEMBERS", 10), ("VIP30", 30), ] chosen = random.sample(code_specs, random.randint(10, 15)) promo_windows = [] for i, (code, pct) in enumerate(chosen): start = fake.date_between(date(2022, 1, 1), date(2024, 6, 1)) end = start + timedelta(days=random.randint(7, 60)) if end > date(2025, 12, 31): end = date(2025, 12, 31) promo_windows.append((start, end, pct)) f.write(f"INSERT INTO promotions (id, code, discount_pct, start_date, end_date) VALUES ({i + 1}, {sql_escape(code)}, {pct}, '{start}', '{end}');\n") f.write("\n") return len(chosen), promo_windows def emit_sales(f, n_customers, products_list, promo_windows): # 2000 sales. Dates last 2 years, +30% volume Nov/Dec. Quantity 90% 1-3, 10% 4-10. amount = base_price * quantity; optionally apply promo. f.write("-- Sales (2000, seasonality Nov/Dec, quantity 1-3 dominant, amount = base_price * quantity, optional promo)\n") end_date = date.today() start_date = end_date - timedelta(days=730) # Build list of (sale_date, weight). Nov/Dec get 1.3x weight. date_weights = [] for d in range(730): dt = start_date + timedelta(days=d) w = 1.3 if dt.month in (11, 12) else 1.0 date_weights.append((dt, w)) total_dw = sum(w for _, w in date_weights) date_weights = [(dt, w / total_dw) for dt, w in date_weights] # Quantity: 90% 1-3, 10% 4-10 qty_choices = list(range(1, 4)) + list(range(4, 11)) qty_weights = [0.30, 0.30, 0.30] + [0.10 / 7] * 7 sales_rows = [] for sale_id in range(1, 2001): customer_id = random.randint(1, n_customers) prod = random.choice(products_list) pid, cat_id, name, base_price = prod[0], prod[1], prod[2], prod[3] quantity = random.choices(qty_choices, weights=qty_weights)[0] sale_date = random.choices([dt for dt, w in date_weights], weights=[w for dt, w in date_weights])[0] amount = round(base_price * quantity, 2) # Occasionally apply a promotion if sale_date falls in a promo window if random.random() < 0.15 and promo_windows: for start_p, end_p, discount_pct in promo_windows: if start_p <= sale_date <= end_p: amount = round(amount * (1 - discount_pct / 100), 2) break sales_rows.append((sale_id, customer_id, pid, quantity, amount, sale_date)) for row in sales_rows: sid, cid, pid, qty, amt, dt = row f.write(f"INSERT INTO sales (id, customer_id, product_id, quantity, amount, sale_date) VALUES ({sid}, {cid}, {pid}, {qty}, {amt}, '{dt}');\n") f.write("\n") return sales_rows def emit_returns(f, sales_rows): # 5-8% of sales. return_date 3-30 days after sale_date. amount = original sale amount. reason from list. f.write("-- Returns (5-8% of sales, return_date 3-30 days after sale, amount = sale amount)\n") total_sales = len(sales_rows) n_returns = random.randint(int(total_sales * 0.05), int(total_sales * 0.08)) chosen_sales = random.sample(sales_rows, n_returns) return_id = 0 for sale_id, customer_id, product_id, quantity, amount, sale_date in chosen_sales: return_id += 1 delta = random.randint(3, 30) return_date = sale_date + timedelta(days=delta) reason = random.choice(RETURN_REASONS) f.write(f"INSERT INTO returns (id, sale_id, amount, reason, return_date) VALUES ({return_id}, {sale_id}, {amount}, {sql_escape(reason)}, '{return_date}');\n") f.write("\n") # Reset sequences so future INSERTs without explicit id work f.write("-- Reset sequences after explicit IDs\n") for table in ("products", "customers", "sales", "returns", "promotions"): f.write(f"SELECT setval(pg_get_serial_sequence('{table}', 'id'), COALESCE((SELECT MAX(id) FROM {table}), 1));\n") f.write("\n") def main(): with open(OUTPUT_FILE, "w") as f: write_sql_header(f) emit_regions(f) emit_categories(f) products_list = emit_products(f) n_customers = emit_customers(f) n_promos, promo_windows = emit_promotions(f) sales_rows = emit_sales(f, n_customers, products_list, promo_windows) emit_returns(f, sales_rows) print(f"Written {OUTPUT_FILE.absolute()}") print("Run against your DB with: psql $NEON_DATABASE_URL -f seed_data.sql") print("Or from Python: cursor.execute(open('seed_data.sql').read())") if __name__ == "__main__": main()