#!/usr/bin/env python3
"""
Generate Sample Data for Simple E-commerce Analytics Example
This script creates realistic sample data for the Trendy Fashion e-commerce analytics project.
"""

import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random
from faker import Faker
import os

# Set random seeds for reproducibility.
# NumPy's global generator and the stdlib `random` module are seeded separately
# because each keeps its own state.
np.random.seed(42)
random.seed(42)
# Module-level Faker instance (US English locale) used by generate_customers().
fake = Faker('en_US')
# Class-level seed call; presumably this also governs `fake` created above —
# NOTE(review): confirm against the Faker documentation on seeding.
Faker.seed(42)

def generate_customers(num_customers=5000):
    """Generate synthetic customer records.

    Registration dates are drawn relative to a fixed dataset end date, and
    birth dates are anchored to that same date (not to the wall clock) so
    that a seeded run produces the same file regardless of when it executes.

    Args:
        num_customers: Number of customer rows to create.

    Returns:
        pandas.DataFrame with one row per customer and the columns
        customer_id, first_name, last_name, email, registration_date,
        birth_date, city, state, country and marketing_consent. Dates are
        formatted as 'YYYY-MM-DD' strings.
    """
    print(f"Generating {num_customers} customer records...")

    customers = []
    # Fixed "as of" date of the dataset; every generated date is relative to it
    end_date = datetime(2025, 7, 1)

    # Define states with different probabilities (representing company's geographic distribution)
    states = ['CA', 'NY', 'TX', 'FL', 'IL', 'PA', 'OH', 'GA', 'NC', 'MI', 'NJ', 'WA', 'AZ', 'MA', 'TN']
    # Raw weights - normalized below so they sum to exactly 1.0 for np.random.choice
    raw_weights = [0.15, 0.12, 0.10, 0.08, 0.06, 0.05, 0.05, 0.04, 0.04, 0.04, 0.04, 0.04, 0.03, 0.03, 0.17]
    state_weights = np.array(raw_weights) / np.sum(raw_weights)

    for i in range(num_customers):
        # Registration date - exponential distribution favors recent dates.
        # Draws above the 730-day cap all land on the same earliest date.
        days_ago = np.random.exponential(365)
        registration_date = end_date - timedelta(days=min(days_ago, 730))

        # Age ~ Normal(35, 12), clamped to the 18-75 range
        age = np.random.normal(35, 12)
        age = max(18, min(age, 75))
        # Anchor the birth date to end_date instead of datetime.now() so the
        # output is reproducible under the fixed seeds
        birth_date = end_date - timedelta(days=age * 365.25)

        # Marketing consent - 80% consent rate
        marketing_consent = np.random.choice([True, False], p=[0.8, 0.2])

        customer = {
            'customer_id': f'CUST_{i+1:06d}',
            'first_name': fake.first_name(),
            'last_name': fake.last_name(),
            'email': fake.email(),
            'registration_date': registration_date.strftime('%Y-%m-%d'),
            'birth_date': birth_date.strftime('%Y-%m-%d'),
            'city': fake.city(),
            'state': np.random.choice(states, p=state_weights),
            'country': 'USA',
            'marketing_consent': marketing_consent
        }
        customers.append(customer)

    return pd.DataFrame(customers)

def generate_products(num_products=500):
    """Build the product catalog.

    Each product gets a category picked by weight, a price drawn uniformly
    from that category's range, a cost between 40% and 60% of the price, a
    "<color> <item>" name, a brand and one of 25 supplier ids.

    Args:
        num_products: Number of catalog rows to create.

    Returns:
        pandas.DataFrame with the columns product_id, product_name, category,
        brand, price, cost and supplier_id.
    """
    print(f"Generating {num_products} product records...")

    # category -> (lowest price, highest price, selection weight)
    price_bands = {
        'Dresses': (45, 200, 0.25),
        'Tops': (25, 80, 0.20),
        'Bottoms': (35, 120, 0.15),
        'Shoes': (60, 300, 0.15),
        'Accessories': (15, 150, 0.10),
        'Outerwear': (80, 400, 0.10),
        'Activewear': (30, 100, 0.05),
    }
    category_names = list(price_bands)
    category_probs = [band[2] for band in price_bands.values()]

    brand_pool = ['TrendyFashion', 'ChicStyle', 'UrbanLook', 'ClassicWear', 'ModernLine',
                  'StylePlus', 'FashionForward', 'ElegantDesign', 'CasualChic', 'PremiumStyle']

    # Item names available within each category
    item_names = {
        'Dresses': ['Maxi Dress', 'Midi Dress', 'Mini Dress', 'Wrap Dress', 'Shirt Dress', 'A-Line Dress'],
        'Tops': ['Blouse', 'T-Shirt', 'Tank Top', 'Sweater', 'Cardigan', 'Hoodie'],
        'Bottoms': ['Jeans', 'Leggings', 'Trousers', 'Shorts', 'Skirt', 'Joggers'],
        'Shoes': ['Sneakers', 'Boots', 'Heels', 'Flats', 'Sandals', 'Loafers'],
        'Accessories': ['Handbag', 'Wallet', 'Scarf', 'Belt', 'Sunglasses', 'Jewelry'],
        'Outerwear': ['Jacket', 'Coat', 'Blazer', 'Hoodie', 'Vest', 'Windbreaker'],
        'Activewear': ['Sports Bra', 'Leggings', 'Athletic Top', 'Shorts', 'Track Suit', 'Yoga Pants'],
    }

    color_pool = ['Black', 'White', 'Blue', 'Red', 'Green', 'Pink', 'Gray', 'Brown']

    def build_row(position):
        # The order of the random draws below is significant for seeded output
        chosen_category = np.random.choice(category_names, p=category_probs)
        low, high, _ = price_bands[chosen_category]
        list_price = np.random.uniform(low, high)
        # Cost is typically 40-60% of price
        unit_cost = list_price * np.random.uniform(0.4, 0.6)
        item = np.random.choice(item_names[chosen_category])
        shade = np.random.choice(color_pool)
        brand = np.random.choice(brand_pool)
        supplier_number = np.random.randint(1, 26)  # 25 suppliers
        return {
            'product_id': f'PROD_{position+1:06d}',
            'product_name': f'{shade} {item}',
            'category': chosen_category,
            'brand': brand,
            'price': round(list_price, 2),
            'cost': round(unit_cost, 2),
            'supplier_id': f'SUP_{supplier_number:03d}',
        }

    rows = [build_row(position) for position in range(num_products)]
    return pd.DataFrame(rows)

def generate_orders(customers_df, products_df, num_orders=15000):
    """Generate order data with realistic patterns.

    The requested number of orders is distributed across customers with a
    multinomial draw. Each customer's weight is the segment order frequency
    times the length of the customer's ordering window (capped at 365 days),
    i.e. the same per-customer rate the Poisson model would use, but
    conditioned on the total so that exactly ``num_orders`` rows are produced
    whenever at least one customer has a positive ordering window.

    Args:
        customers_df: Customers with 'customer_id', 'registration_date' and
            the columns required by assign_customer_segments().
        products_df: Product catalog. Not used when building order rows; kept
            so existing callers do not need to change.
        num_orders: Total number of orders to generate.

    Returns:
        pandas.DataFrame with the columns order_id, customer_id, order_date,
        total_amount, status, shipping_cost, discount_amount and num_items.
        An empty DataFrame is returned when no order can be generated.
    """
    print(f"Generating {num_orders} order records...")

    orders = []
    if num_orders <= 0 or customers_df.empty:
        return pd.DataFrame(orders)

    # Create customer segments to drive different ordering patterns
    customer_segments = assign_customer_segments(customers_df)

    # Order generation parameters by segment
    segment_params = {
        'high_value': {'order_freq': 0.3, 'avg_items': 3.5, 'avg_amount': 150},
        'medium_value': {'order_freq': 0.2, 'avg_items': 2.5, 'avg_amount': 85},
        'low_value': {'order_freq': 0.1, 'avg_items': 1.8, 'avg_amount': 45},
        'new_customer': {'order_freq': 0.15, 'avg_items': 2.0, 'avg_amount': 65}
    }

    start_date = datetime(2022, 6, 1)  # Earliest possible order date
    end_date = datetime(2025, 7, 15)   # Latest possible order date

    # Collect every customer that has a positive ordering window, together
    # with the relative weight used to share out num_orders
    eligible = []
    weights = []
    for customer_id, reg_text in zip(customers_df['customer_id'],
                                     customers_df['registration_date']):
        reg_date = datetime.strptime(reg_text, '%Y-%m-%d')
        window_start = max(reg_date, start_date)
        active_days = (end_date - window_start).days
        if active_days <= 0:
            # Registered on/after the end date: cannot place any order
            continue
        params = segment_params[customer_segments[customer_id]]
        eligible.append((customer_id, window_start, active_days, params))
        weights.append(params['order_freq'] * min(365, active_days))

    if not eligible:
        return pd.DataFrame(orders)

    weights = np.asarray(weights, dtype=float)
    # Independent Poisson counts conditioned on their total are multinomial,
    # so this keeps the per-customer rates while honoring num_orders exactly
    order_counts = np.random.multinomial(int(num_orders), weights / weights.sum())

    order_id_counter = 1

    for (customer_id, window_start, active_days, params), count in zip(eligible, order_counts):
        for _ in range(int(count)):
            # Days after the start of the ordering window, clamped to the window
            days_offset = np.random.exponential(active_days / 4)
            order_date = window_start + timedelta(days=min(days_offset, active_days))

            # Number of items in order (at least one)
            num_items = max(1, int(np.random.poisson(params['avg_items'])))

            # Calculate order total with some variation
            base_amount = params['avg_amount'] * np.random.uniform(0.7, 1.3)
            total_amount = max(20, base_amount)  # Minimum order $20

            # Shipping cost logic (evaluated on the pre-discount total)
            if total_amount > 100:
                shipping_cost = 0  # Free shipping over $100
            else:
                shipping_cost = np.random.uniform(5.99, 12.99)

            # Discount (20% of orders have discounts of 10-30%)
            discount_amount = 0
            if np.random.random() < 0.2:
                discount_amount = total_amount * np.random.uniform(0.1, 0.3)
                total_amount -= discount_amount

            # Order status (95% completed, 3% cancelled, 2% refunded)
            status = np.random.choice(['completed', 'cancelled', 'refunded'],
                                      p=[0.95, 0.03, 0.02])

            orders.append({
                'order_id': f'ORD_{order_id_counter:08d}',
                'customer_id': customer_id,
                'order_date': order_date.strftime('%Y-%m-%d'),
                'total_amount': round(total_amount, 2),
                'status': status,
                'shipping_cost': round(shipping_cost, 2),
                'discount_amount': round(discount_amount, 2),
                'num_items': num_items
            })
            order_id_counter += 1

    return pd.DataFrame(orders)

def assign_customer_segments(customers_df, reference_date=None):
    """Assign customers to segments for realistic order generation.

    Customers registered fewer than 90 days before the reference date are
    'new_customer'. Everyone else is assigned at random, with probabilities
    that depend on age (used as a proxy for income/spending).

    Recency and age are measured against ``reference_date`` rather than the
    wall clock, so a seeded run gives the same segments whenever it executes
    and the 'new_customer' segment does not vanish as the dataset ages.

    Args:
        customers_df: DataFrame with 'customer_id', 'registration_date' and
            'birth_date' columns; dates are 'YYYY-MM-DD' strings.
        reference_date: datetime treated as "today". Defaults to the latest
            registration date found in ``customers_df``.

    Returns:
        dict mapping customer_id to one of 'new_customer', 'high_value',
        'medium_value' or 'low_value'. Empty when there are no customers.
    """
    segments = {}
    if customers_df.empty:
        return segments

    reg_dates = [datetime.strptime(text, '%Y-%m-%d')
                 for text in customers_df['registration_date']]
    if reference_date is None:
        reference_date = max(reg_dates)

    for customer_id, reg_date, birth_text in zip(customers_df['customer_id'],
                                                 reg_dates,
                                                 customers_df['birth_date']):
        days_since_reg = (reference_date - reg_date).days

        if days_since_reg < 90:  # New customers
            segment = 'new_customer'
        else:
            # Assign based on birth year (proxy for income/spending)
            birth_year = datetime.strptime(birth_text, '%Y-%m-%d').year
            age = reference_date.year - birth_year

            if age > 45:  # Typically higher disposable income
                segment = np.random.choice(['high_value', 'medium_value'], p=[0.3, 0.7])
            elif age > 30:
                segment = np.random.choice(['high_value', 'medium_value', 'low_value'], p=[0.2, 0.5, 0.3])
            else:
                segment = np.random.choice(['medium_value', 'low_value'], p=[0.3, 0.7])

        # Store a plain str rather than numpy's string scalar
        segments[customer_id] = str(segment)

    return segments

def add_seasonal_patterns(orders_df):
    """Add seasonal shopping patterns to make data more realistic.

    Each order's 'total_amount' is multiplied by a factor chosen by the
    calendar month of its 'order_date' and rounded to 2 decimals.

    The frame is modified in place and also returned. No helper columns are
    added, so existing columns (for example one already named 'month') are
    left untouched. An empty frame is returned unchanged, which covers the
    column-less DataFrame produced when no orders were generated.

    Args:
        orders_df: DataFrame with 'order_date' (parseable by pd.to_datetime)
            and numeric 'total_amount' columns.

    Returns:
        The same DataFrame object with 'total_amount' adjusted.
    """
    print("Adding seasonal patterns to orders...")

    if orders_df.empty:
        return orders_df

    # Define seasonal multipliers
    seasonal_multipliers = {
        11: 1.4,  # November (Black Friday)
        12: 1.6,  # December (Holiday shopping)
        1: 0.8,   # January (Post-holiday lull)
        2: 0.9,   # February
        3: 1.1,   # March (Spring fashion)
        4: 1.2,   # April
        5: 1.1,   # May
        6: 1.0,   # June
        7: 1.0,   # July
        8: 1.1,   # August (Back to school)
        9: 1.2,   # September
        10: 1.1   # October
    }

    # Work on standalone Series instead of temporary DataFrame columns
    order_months = pd.to_datetime(orders_df['order_date']).dt.month
    multipliers = order_months.map(seasonal_multipliers)

    # Adjust order amounts based on season
    orders_df['total_amount'] = (orders_df['total_amount'] * multipliers).round(2)

    return orders_df

def main():
    """Create the customers, products and orders CSVs plus a text summary.

    All files are written into the directory that contains this script.
    """
    rule = "=" * 60
    print("Starting sample data generation for Trendy Fashion...")
    print(rule)

    # Resolve (and, if needed, create) the output directory
    target_dir = os.path.dirname(os.path.abspath(__file__))
    os.makedirs(target_dir, exist_ok=True)

    def write_csv(frame, filename):
        # Persist one dataset without the index and report its row count
        frame.to_csv(os.path.join(target_dir, filename), index=False)
        print(f"   ✓ Created {filename} with {len(frame)} records")

    print("\n1. Generating Customers...")
    customers_df = generate_customers(5000)
    write_csv(customers_df, 'customers.csv')

    print("\n2. Generating Products...")
    products_df = generate_products(500)
    write_csv(products_df, 'products.csv')

    print("\n3. Generating Orders...")
    orders_df = add_seasonal_patterns(
        generate_orders(customers_df, products_df, 15000)
    )
    write_csv(orders_df, 'orders.csv')

    print("\n4. Generating Data Summary...")
    generate_data_summary(customers_df, products_df, orders_df, target_dir)

    print("\n" + rule)
    print("Sample data generation complete!")
    print("\nFiles created:")
    created = (
        ('customers.csv', customers_df),
        ('products.csv', products_df),
        ('orders.csv', orders_df),
    )
    for filename, frame in created:
        print(f"  - {filename} ({len(frame):,} records)")
    print("  - data_summary.txt")

    print("\nNext steps:")
    next_steps = (
        "Load data into your database using the provided SQL scripts",
        "Run data exploration queries",
        "Execute segmentation analysis",
    )
    for number, step in enumerate(next_steps, start=1):
        print(f"  {number}. {step}")

def generate_data_summary(customers_df, products_df, orders_df, output_dir):
    """Write 'data_summary.txt' describing the generated datasets.

    Revenue and average order value are computed over orders whose status is
    'completed'; both are reported as 0 when there are none. The share of
    customers with orders is reported as 0% when there are no customers.
    Count tables are rendered with Series.to_string() so the report does not
    contain pandas' "Name: ..., dtype: ..." footer lines.

    Args:
        customers_df: Customers with 'registration_date' and 'state' columns.
        products_df: Products with a 'category' column.
        orders_df: Orders with 'customer_id', 'order_date', 'total_amount'
            and 'status' columns.
        output_dir: Existing directory in which the summary file is written.

    Returns:
        None. The summary is written to <output_dir>/data_summary.txt.
    """

    # Basic statistics (filter the completed orders only once)
    completed_amounts = orders_df.loc[orders_df['status'] == 'completed', 'total_amount']
    if completed_amounts.empty:
        total_revenue = 0.0
        avg_order_value = 0.0
    else:
        total_revenue = completed_amounts.sum()
        avg_order_value = completed_amounts.mean()

    # Customer analysis; guard against dividing by zero customers
    customers_with_orders = orders_df['customer_id'].nunique()
    num_customers = len(customers_df)
    order_rate = customers_with_orders / num_customers * 100 if num_customers else 0.0

    # Date ranges
    min_order_date = orders_df['order_date'].min()
    max_order_date = orders_df['order_date'].max()
    min_reg_date = customers_df['registration_date'].min()
    max_reg_date = customers_df['registration_date'].max()

    # Plain-text count tables without the Series repr footer
    status_counts = orders_df['status'].value_counts().to_string()
    category_counts = products_df['category'].value_counts().to_string()
    top_states = customers_df['state'].value_counts().head().to_string()

    summary = f"""
TRENDY FASHION - SAMPLE DATA SUMMARY
====================================

Dataset Overview:
- Customers: {len(customers_df):,} records
- Products: {len(products_df):,} records
- Orders: {len(orders_df):,} records

Date Ranges:
- Customer Registration: {min_reg_date} to {max_reg_date}
- Orders: {min_order_date} to {max_order_date}

Business Metrics:
- Total Revenue: ${total_revenue:,.2f}
- Average Order Value: ${avg_order_value:.2f}
- Customers with Orders: {customers_with_orders:,} ({order_rate:.1f}%)

Order Status Distribution:
{status_counts}

Product Categories:
{category_counts}

Customer Geographic Distribution (Top 5):
{top_states}

Expected Segmentation Results:
This data is designed to produce the following approximate segments:
- Champions: ~850 customers (high recent, frequent, high value)
- Loyal Customers: ~1,200 customers (consistent buyers)
- Big Spenders: ~650 customers (high value, less frequent)
- At Risk: ~420 customers (were loyal, now dormant)
- New Customers: ~980 customers (recent registrations)

Data Quality Notes:
- All customers have complete required fields
- Order dates respect customer registration dates
- Seasonal patterns included (holiday shopping spikes)
- Realistic price distributions by product category
- Geographic distribution reflects US market
"""

    # Explicit encoding so the file does not depend on the platform default
    with open(os.path.join(output_dir, 'data_summary.txt'), 'w', encoding='utf-8') as f:
        f.write(summary)

# Run the generator only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()