In [None]:
"""
# Advanced Data Science Toolkit
# This comprehensive guide includes examples of:
# - Data generation/loading
# - Data Preparation and Cleaning (NEW SECTION)
# - EDA (Exploratory Data Analysis)
# - Feature engineering
# - Visualization with Matplotlib only
# - Statistical analysis with SciPy
# - Regression, Classification, and Clustering techniques
# - Time series forecasting with Prophet
# - Advanced modeling with CatBoost
# - Advanced Data Manipulation (Pivot, Groupby, Concat, Join, Map, Zip)
"""

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Removed import seaborn as sns
from scipy import stats
from scipy.cluster import hierarchy
from scipy.spatial.distance import pdist
from sklearn.datasets import fetch_california_housing, load_wine, load_diabetes
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer, KNNImputer # KNNImputer is useful for imputation
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingRegressor
from sklearn.svm import SVR, SVC
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.metrics import (mean_squared_error, r2_score, accuracy_score, confusion_matrix,
 classification_report, silhouette_score, roc_curve, roc_auc_score,
 precision_recall_curve, average_precision_score)
import catboost as cb
from prophet import Prophet
import warnings
import datetime as dt
from dateutil.relativedelta import relativedelta
import itertools # Useful for combinations/permutations

# Silence library warnings so the notebook output stays readable.
warnings.filterwarnings(action='ignore')

# Default figure size and font size applied to every Matplotlib figure below.
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 12,
})

In [None]:
# ## 1. Data Generation and Loading

# %% tags=[]
def _inject_missing(frame, column, rate):
    """Blank out a random share of one column to mimic real-world gaps.

    One uniform number per row is drawn from NumPy's global random state and
    ``frame[column]`` is set to NaN wherever the draw is below ``rate``.
    The frame is modified in place. Nothing is assigned when no row is
    selected, which also covers an empty frame.
    """
    mask = np.random.random(len(frame)) < rate
    if mask.any():
        frame.loc[mask, column] = np.nan


def generate_synthetic_ecommerce_data(n_customers=1000, start_date='2023-01-01', end_date='2023-12-31', seed=42):
    """
    Generate a synthetic e-commerce dataset with:
    - Customer demographics
    - Purchase history
    - Product information
    - Marketing campaign exposure
    - Website behavior

    A small share of values in several columns is replaced by NaN so the
    result can be used for data-cleaning exercises.

    Parameters:
    -----------
    n_customers : int
        Number of customers to generate
    start_date : str
        Start date for transaction history
    end_date : str
        End date for transaction history (must not be earlier than start_date)
    seed : int or None
        Value passed to ``np.random.seed``. The default (42) reproduces the
        historical output of this function. Note that this reseeds NumPy's
        *global* random state as a side effect.

    Returns:
    --------
    dict of pandas DataFrames containing:
    - customers: Customer demographic information
    - products: Product catalog
    - transactions: Purchase records
    - website_visits: Web analytics data
    - marketing: Marketing campaign information
    - campaign_exposures: Records of customer exposure to campaigns

    Raises:
    -------
    ValueError
        If end_date is earlier than start_date.
    """
    np.random.seed(seed)  # For reproducibility

    # Generate date range for transactions
    start = pd.to_datetime(start_date)
    end = pd.to_datetime(end_date)
    days = (end - start).days + 1
    if days < 1:
        raise ValueError(
            f"end_date ({end_date!r}) must not be earlier than start_date ({start_date!r})"
        )

    # NOTE: the order of the np.random calls below is deliberate; changing it
    # would change the data generated for a given seed.

    # ------------------------------------------------------------------
    # Customer demographics
    # ------------------------------------------------------------------
    age_groups = ['18-24', '25-34', '35-44', '45-54', '55-64', '65+']
    genders = ['Male', 'Female', 'Non-binary', 'Prefer not to say']
    locations = ['North', 'South', 'East', 'West', 'Central']
    segments = ['New', 'Regular', 'VIP', 'Churned', 'Reactivated']

    customers = pd.DataFrame({
        'customer_id': range(1, n_customers + 1),
        'age_group': np.random.choice(age_groups, n_customers, p=[0.15, 0.25, 0.25, 0.15, 0.1, 0.1]),
        'gender': np.random.choice(genders, n_customers, p=[0.45, 0.48, 0.05, 0.02]),
        'location': np.random.choice(locations, n_customers),
        'signup_date': [start + pd.Timedelta(days=np.random.randint(0, days)) for _ in range(n_customers)],
        'customer_segment': np.random.choice(segments, n_customers, p=[0.2, 0.5, 0.1, 0.15, 0.05]),
    })

    # Add some realistic customer metrics
    customers['lifetime_value'] = np.random.gamma(shape=5, scale=50, size=n_customers)
    customers['avg_order_value'] = np.random.gamma(shape=3, scale=20, size=n_customers)
    # Stored as float from the start: NaNs are injected below, and assigning
    # NaN into an integer column relies on silent upcasting, which recent
    # pandas releases deprecate.
    customers['loyalty_score'] = np.random.randint(1, 101, size=n_customers).astype(float)
    customers['recency_days'] = np.random.geometric(p=0.1, size=n_customers)

    # Add missing values to simulate real-world data
    _inject_missing(customers, 'age_group', 0.05)
    _inject_missing(customers, 'gender', 0.02)
    _inject_missing(customers, 'lifetime_value', 0.03)
    _inject_missing(customers, 'loyalty_score', 0.01)

    # ------------------------------------------------------------------
    # Product catalog
    # ------------------------------------------------------------------
    product_categories = ['Electronics', 'Clothing', 'Home', 'Beauty', 'Sports', 'Books', 'Food', 'Toys']
    n_products = 100

    products = pd.DataFrame({
        'product_id': range(1, n_products + 1),
        'product_name': [f'Product_{i}' for i in range(1, n_products + 1)],
        'category': np.random.choice(product_categories, n_products),
        'price': np.random.uniform(5, 500, n_products).round(2),
        'stock_level': np.random.randint(0, 1000, n_products),
        'rating': np.random.uniform(1, 5, n_products).round(1),
        'discount_eligible': np.random.choice([True, False], n_products, p=[0.3, 0.7])
    })
    _inject_missing(products, 'price', 0.05)
    _inject_missing(products, 'rating', 0.02)

    # Per-product lookups built once (after the NaN injection) so the
    # transaction loop does not rescan the product table for every item.
    price_by_product = dict(zip(products['product_id'], products['price']))
    discount_eligible_by_product = dict(zip(products['product_id'], products['discount_eligible']))

    # ------------------------------------------------------------------
    # Transactions (purchase history)
    # ------------------------------------------------------------------
    transaction_columns = [
        'transaction_id', 'customer_id', 'product_id', 'date', 'quantity',
        'unit_price', 'discount', 'total_price', 'payment_method',
    ]
    transactions_list = []

    for cust_id in customers['customer_id']:
        # Each customer makes between 0 and 10 purchases, in chronological order
        n_purchases = np.random.randint(0, 11)
        purchase_dates = sorted(
            start + pd.Timedelta(days=np.random.randint(0, days)) for _ in range(n_purchases)
        )

        for date in purchase_dates:
            # Each purchase can include 1-5 products
            n_items = np.random.randint(1, 6)
            products_bought = np.random.choice(products['product_id'], n_items)

            for prod_id in products_bought:
                product_price = price_by_product[prod_id]
                if pd.isna(product_price):
                    # Skip the item if the product price is missing
                    continue

                # Only discount-eligible products can receive a random discount
                if discount_eligible_by_product[prod_id]:
                    discount = np.random.choice([0, 0.1, 0.2, 0.3], p=[0.4, 0.3, 0.2, 0.1])
                else:
                    discount = 0

                quantity = np.random.randint(1, 4)

                transactions_list.append({
                    'transaction_id': len(transactions_list) + 1,
                    'customer_id': cust_id,
                    'product_id': prod_id,
                    'date': date,
                    'quantity': quantity,
                    'unit_price': product_price,
                    'discount': discount,
                    'total_price': quantity * product_price * (1 - discount),
                    'payment_method': np.random.choice(['Credit Card', 'PayPal', 'Apple Pay', 'Bank Transfer'], p=[0.6, 0.2, 0.15, 0.05])
                })

    # Explicit columns keep the schema intact even when no transaction was generated
    transactions = pd.DataFrame(transactions_list, columns=transaction_columns)
    _inject_missing(transactions, 'total_price', 0.01)
    _inject_missing(transactions, 'payment_method', 0.005)

    # ------------------------------------------------------------------
    # Web behavior data
    # ------------------------------------------------------------------
    visit_types = ['Product View', 'Cart Addition', 'Checkout Initiation', 'Purchase', 'Blog View', 'Search']
    devices = ['Desktop', 'Mobile', 'Tablet']
    channels = ['Organic Search', 'Direct', 'Referral', 'Social Media', 'Email', 'Paid Search']

    # Generate 5x more website visits than transactions
    n_visits = len(transactions) * 5

    website_visits = pd.DataFrame({
        'visit_id': range(1, n_visits + 1),
        'customer_id': np.random.choice(customers['customer_id'], n_visits),
        'timestamp': [start + pd.Timedelta(seconds=np.random.randint(0, days * 86400)) for _ in range(n_visits)],
        'visit_type': np.random.choice(visit_types, n_visits, p=[0.4, 0.2, 0.1, 0.1, 0.1, 0.1]),
        # 20% of visits carry NaN as product_id (non-product pages)
        'product_id': np.random.choice(list(products['product_id']) + [np.nan], n_visits, p=[0.8 / n_products] * n_products + [0.2]),
        'device': np.random.choice(devices, n_visits, p=[0.5, 0.4, 0.1]),
        'channel': np.random.choice(channels, n_visits),
        'session_duration': np.random.exponential(300, n_visits).round(),  # in seconds
        'pages_viewed': np.random.geometric(p=0.3, size=n_visits)
    })

    # Sort by timestamp before injecting gaps (the masks are positional)
    website_visits = website_visits.sort_values('timestamp').reset_index(drop=True)
    _inject_missing(website_visits, 'session_duration', 0.04)
    _inject_missing(website_visits, 'device', 0.03)

    # ------------------------------------------------------------------
    # Marketing campaigns
    # ------------------------------------------------------------------
    campaign_types = ['Email', 'Social Media', 'Search Ads', 'Display Ads', 'Influencer', 'SMS']
    campaign_names = [f'Campaign_{i}' for i in range(1, 11)]
    n_campaigns = len(campaign_names)
    # Campaigns start in the first half of the period; max(..., 1) keeps the
    # draw valid for a one-day range.
    latest_start_offset = max(days // 2, 1)

    marketing = pd.DataFrame({
        'campaign_id': range(1, n_campaigns + 1),
        'campaign_name': campaign_names,
        'campaign_type': np.random.choice(campaign_types, n_campaigns),
        'start_date': [start + pd.Timedelta(days=np.random.randint(0, latest_start_offset)) for _ in range(n_campaigns)],
        'budget': np.random.uniform(1000, 10000, n_campaigns).round(2),
        'target_segment': np.random.choice(segments + ['All'], n_campaigns)
    })

    # Calculate end dates (2 weeks to 2 months after start date)
    marketing['end_date'] = marketing['start_date'] + pd.to_timedelta(np.random.randint(14, 61, n_campaigns), unit='D')
    _inject_missing(marketing, 'budget', 0.1)

    # Per-campaign lookups built once for the exposure loop
    campaign_window = {
        camp_id: (camp_start, camp_end)
        for camp_id, camp_start, camp_end in zip(
            marketing['campaign_id'], marketing['start_date'], marketing['end_date']
        )
    }
    campaign_name_by_id = dict(zip(marketing['campaign_id'], marketing['campaign_name']))

    # ------------------------------------------------------------------
    # Campaign exposure
    # ------------------------------------------------------------------
    exposure_columns = ['exposure_id', 'customer_id', 'campaign_id', 'exposure_date', 'interaction']
    exposure_list = []

    for cust_id in customers['customer_id']:
        # Each customer is exposed to 0-5 distinct campaigns
        n_exposures = np.random.randint(0, 6)
        if n_exposures == 0:
            continue
        campaign_ids = np.random.choice(marketing['campaign_id'], n_exposures, replace=False)

        for camp_id in campaign_ids:
            campaign_start, campaign_end = campaign_window[camp_id]

            # Ensure exposure date is within campaign dates
            if campaign_start <= campaign_end:
                delta = (campaign_end - campaign_start).days
                exposure_date = campaign_start + pd.Timedelta(days=np.random.randint(0, delta + 1))
            else:
                exposure_date = campaign_start  # Fallback if end < start

            interaction = np.random.choice(['Viewed', 'Clicked', 'Converted', 'Ignored'], p=[0.4, 0.3, 0.1, 0.2])

            exposure_list.append({
                'exposure_id': len(exposure_list) + 1,
                'customer_id': cust_id,
                'campaign_id': camp_id,
                'exposure_date': exposure_date,
                'interaction': interaction
            })

    # Explicit columns keep the schema intact even when nobody was exposed
    campaign_exposures = pd.DataFrame(exposure_list, columns=exposure_columns)

    # Add campaign names for easier reference (campaign_id is unique in
    # `marketing`, so a lookup is equivalent to a left join on campaign_id)
    campaign_exposures['campaign_name'] = campaign_exposures['campaign_id'].map(campaign_name_by_id)
    _inject_missing(campaign_exposures, 'interaction', 0.02)

    return {
        'customers': customers,
        'products': products,
        'transactions': transactions,
        'website_visits': website_visits,
        'marketing': marketing,
        'campaign_exposures': campaign_exposures
    }

# Build the synthetic e-commerce tables (a dict of DataFrames keyed by table name).
ecommerce_data = generate_synthetic_ecommerce_data(n_customers=1000)

# Show the size, the first two rows and the column names of every table.
print("\n=== SYNTHETIC E-COMMERCE DATASET OVERVIEW ===")
for name, df in ecommerce_data.items():
    print(f"\n{name.upper()} - {len(df)} rows, {len(df.columns)} columns")
    print(df.head(2))
    print("Columns: " + ', '.join(df.columns))

# Real-world scikit-learn datasets, loaded for comparison with the synthetic data.
print("\n=== LOADING REAL-WORLD DATASETS ===")

# California housing (regression); target is the median house value in 100k USD.
housing_data = fetch_california_housing()
housing_df = pd.DataFrame(
    housing_data['data'], columns=housing_data['feature_names']
).assign(target=housing_data['target'])
print("\nCalifornia Housing Dataset:", f"Shape: {housing_df.shape}", housing_df.head(2), sep="\n")

# Wine (classification); target is the wine type (0, 1, 2).
wine_data = load_wine()
wine_df = pd.DataFrame(
    wine_data['data'], columns=wine_data['feature_names']
).assign(target=wine_data['target'])
print("\nWine Dataset:", f"Shape: {wine_df.shape}", wine_df.head(2), sep="\n")

# Diabetes (regression, smaller); target is a quantitative measure of disease
# progression one year after baseline.
diabetes_data = load_diabetes()
diabetes_df = pd.DataFrame(
    diabetes_data['data'], columns=diabetes_data['feature_names']
).assign(target=diabetes_data['target'])
print("\nDiabetes Dataset:", f"Shape: {diabetes_df.shape}", diabetes_df.head(2), sep="\n")


In [None]:
# ## 2. Data Preparation and Cleaning

# %% tags=[]
print("\n" + '=' * 80)
print("PART 2: DATA PREPARATION AND CLEANING")
print('=' * 80)

# Work on independent copies so the cleaning examples never alter the
# generated tables stored in `ecommerce_data`.
(
    customers_clean,
    transactions_clean,
    products_clean,
    website_visits_clean,
    campaign_exposures_clean,
) = (
    ecommerce_data[table_name].copy()
    for table_name in ('customers', 'transactions', 'products', 'website_visits', 'campaign_exposures')
)

print("\n--- Data Cleaning Examples on Synthetic E-commerce Data ---")

# --- 2.1 Identifying Missing Values ---
print("\n--- 2.1 Identifying Missing Values ---")
# Per-column count of missing values for each working copy
for _table_label, _table in (
    ("Customers", customers_clean),
    ("Transactions", transactions_clean),
    ("Products", products_clean),
    ("Website Visits", website_visits_clean),
    ("Campaign Exposures", campaign_exposures_clean),
):
    print(f"\nMissing values in {_table_label} DataFrame:")
    print(_table.isnull().sum())

# Summarize missing values for a given DataFrame.
# The helper below prints a per-column table (count and percentage) instead of
# drawing a heatmap, so it needs nothing beyond pandas.
def print_missing_values_summary(df, title):
    """Print the count and percentage of missing values per column.

    Only columns that contain at least one missing value are listed, ordered
    from the highest to the lowest percentage. When no column has missing
    values, a short message is printed instead of a table.

    Parameters:
    -----------
    df : pandas DataFrame
        Table to inspect.
    title : str
        Label used in the printed heading.
    """
    null_counts = df.isnull().sum()
    summary = pd.DataFrame({
        'Count': null_counts,
        'Percent': (null_counts / len(df) * 100).round(2),
    })
    # Keep only the columns that actually have gaps, worst first
    summary = summary.loc[summary['Count'] > 0].sort_values(by='Percent', ascending=False)

    print(f"\nMissing Values Summary - {title}:")
    if summary.empty:
        print("No missing values found.")
        return
    print(summary)

# Example usage:
# print_missing_values_summary(customers_clean, "Customers")
# print_missing_values_summary(transactions_clean, "Transactions")


# --- 2.2 Handling Missing Values ---
print("\n--- 2.2 Handling Missing Values ---")

# Example 1: Dropping rows with missing values
print("\nExample: Dropping rows with missing values in 'gender' or 'age_group' (Customers)")
initial_rows = customers_clean.shape[0]
customers_dropped_rows = customers_clean.dropna(subset=['gender', 'age_group']).copy()
print(f"Initial rows: {initial_rows}, Rows after dropping NaNs in specified columns: {customers_dropped_rows.shape[0]}")

# Example 2: Dropping columns with missing values (e.g., if a column has too many NaNs)
# A mostly-empty column is created for the demonstration. It is added to a
# separate copy so `customers_clean` itself is never modified: re-running or
# interrupting this cell cannot leave a stray 'sparse_column' behind.
customers_with_sparse = customers_clean.copy()
customers_with_sparse['sparse_column'] = np.where(
    np.random.random(len(customers_with_sparse)) > 0.9, 1.0, np.nan  # roughly 10% non-NaN
)
print("\nExample: Dropping 'sparse_column' if > 50% NaNs (Customers)")
initial_cols = customers_with_sparse.shape[1]
# `thresh` is the minimum number of NON-missing values a column needs to be
# kept, and is documented as an integer; rounding up preserves the
# "drop if more than 50% missing" rule for odd row counts.
threshold = int(np.ceil(len(customers_with_sparse) * 0.5))
customers_dropped_cols = customers_with_sparse.dropna(axis=1, thresh=threshold).copy()
print(f"Initial columns: {initial_cols}, Columns after dropping sparse column: {customers_dropped_cols.shape[1]}")


In [None]:
# Example 3: Filling missing values with a constant value
print("\nExample: Filling missing 'payment_method' with 'Unknown' (Transactions)")
print("Before filling NaNs in payment_method:")
print(transactions_clean['payment_method'].value_counts(dropna=False))
transactions_filled_constant = transactions_clean.copy()
# Assign the filled column back instead of calling fillna(..., inplace=True)
# on `frame[col]`: the in-place call operates on an intermediate object and
# does not update the frame when pandas Copy-on-Write is active.
transactions_filled_constant['payment_method'] = (
    transactions_filled_constant['payment_method'].fillna('Unknown')
)
print("\nAfter filling NaNs with 'Unknown':")
print(transactions_filled_constant['payment_method'].value_counts(dropna=False))

In [None]:
# Example 4: Filling missing numerical values with the mean
print("\nExample: Filling missing 'lifetime_value' with the mean (Customers)")
print("Mean of lifetime_value before filling:", customers_clean['lifetime_value'].mean().round(2))
customers_filled_mean = customers_clean.copy()
mean_lifetime_value = customers_filled_mean['lifetime_value'].mean()
# Assign the result back rather than using a chained fillna(..., inplace=True),
# which does not update the frame when pandas Copy-on-Write is active.
customers_filled_mean['lifetime_value'] = (
    customers_filled_mean['lifetime_value'].fillna(mean_lifetime_value)
)
print("Missing values in lifetime_value after filling:", customers_filled_mean['lifetime_value'].isnull().sum())
print("Mean of lifetime_value after filling:", customers_filled_mean['lifetime_value'].mean().round(2)) # Mean should be the same

In [None]:
# Example 5: Filling missing numerical values with the median
print("\nExample: Filling missing 'price' with the median (Products)")
print("Median of price before filling:", products_clean['price'].median().round(2))
products_filled_median = products_clean.copy()
median_price = products_filled_median['price'].median()
# Assign the result back rather than using a chained fillna(..., inplace=True),
# which does not update the frame when pandas Copy-on-Write is active.
products_filled_median['price'] = products_filled_median['price'].fillna(median_price)
print("Missing values in price after filling:", products_filled_median['price'].isnull().sum())
print("Median of price after filling:", products_filled_median['price'].median().round(2)) # Median should be the same

In [None]:
# Example 6: Filling missing categorical values with a constant value (e.g., 'Unknown')
print("\nExample: Filling missing 'device' with 'Unknown' (Website Visits)")
print("Value counts of device before filling:")
print(website_visits_clean['device'].value_counts(dropna=False))
website_visits_filled_constant = website_visits_clean.copy()
# Assign the result back rather than using a chained fillna(..., inplace=True),
# which does not update the frame when pandas Copy-on-Write is active.
website_visits_filled_constant['device'] = (
    website_visits_filled_constant['device'].fillna('Unknown')
)
print("\nAfter filling NaNs with 'Unknown':")
print(website_visits_filled_constant['device'].value_counts(dropna=False))

In [None]:
# Example 7: Filling missing values using forward fill (ffill)
# Useful for time series or ordered data, fills with the previous valid observation
print("\nExample: Filling missing 'total_price' using forward fill (Transactions - requires sorting)")
# Ensure data is sorted by date for meaningful ffill
transactions_clean_sorted = transactions_clean.sort_values('date').copy()
print("Missing values in total_price before ffill:", transactions_clean_sorted['total_price'].isnull().sum())
transactions_filled_ffill = transactions_clean_sorted.copy()
# Series.ffill() replaces the deprecated fillna(method='ffill'); the result is
# assigned back because a chained inplace=True call does not update the frame
# when pandas Copy-on-Write is active.
transactions_filled_ffill['total_price'] = transactions_filled_ffill['total_price'].ffill()
print("Missing values in total_price after ffill:", transactions_filled_ffill['total_price'].isnull().sum())


In [None]:
# Example 8: Filling missing values using backward fill (bfill)
# Fills with the next valid observation
print("\nExample: Filling remaining missing 'total_price' using backward fill (Transactions)")
print("Missing values in total_price before bfill:", transactions_filled_ffill['total_price'].isnull().sum())
transactions_filled_bfill = transactions_filled_ffill.copy()
# Series.bfill() replaces the deprecated fillna(method='bfill'); the result is
# assigned back because a chained inplace=True call does not update the frame
# when pandas Copy-on-Write is active.
transactions_filled_bfill['total_price'] = transactions_filled_bfill['total_price'].bfill()
print("Missing values in total_price after bfill:", transactions_filled_bfill['total_price'].isnull().sum())
# Note: ffill alone leaves leading NaNs untouched and bfill alone leaves trailing
# NaNs untouched; applying both (as done here) only leaves NaNs if the whole column is NaN.


In [None]:
# Example 9: Imputing missing numerical values using KNNImputer
# Each missing entry is estimated from the k most similar rows.
print("\nExample: Imputing missing numerical values using KNNImputer (Customers)")
# Numeric columns of the customers table that contain at least one NaN
numeric_customers = customers_clean.select_dtypes(include=np.number)
numeric_cols_with_missing = [
    column for column in numeric_customers.columns if numeric_customers[column].isnull().any()
]
print(f"Numerical columns with missing values: {numeric_cols_with_missing}")

if not numeric_cols_with_missing:
    print("No numerical columns with missing values to demonstrate KNNImputer on Customers DataFrame.")
else:
    knn_imputer = KNNImputer(n_neighbors=5)  # Use 5 nearest neighbors

    # Only the selected columns are passed to the imputer; it returns a plain
    # array with the gaps filled in.
    customers_knn_imputed_values = knn_imputer.fit_transform(customers_clean[numeric_cols_with_missing])

    # Write the imputed values into a copy, leaving `customers_clean` untouched
    customers_imputed_knn = customers_clean.copy()
    customers_imputed_knn[numeric_cols_with_missing] = customers_knn_imputed_values

    print("Missing values in numerical columns before KNNImputer:")
    print(customers_clean[numeric_cols_with_missing].isnull().sum())
    print("\nMissing values in numerical columns after KNNImputer:")
    print(customers_imputed_knn[numeric_cols_with_missing].isnull().sum())


# --- 2.3 Handling Duplicate Data ---
print("\n--- 2.3 Handling Duplicate Data ---")

# Count rows that are exact copies of an earlier row
print(f"\nChecking for duplicate rows (Customers): {customers_clean.duplicated().sum()}")

# Example: Removing duplicate rows
# The first customer row is appended a second time to have something to remove.
duplicate_row = customers_clean.iloc[0].copy()
customers_with_duplicates = pd.concat(
    [customers_clean, pd.DataFrame([duplicate_row])],
    ignore_index=True,
)
print(f"\nCustomers DataFrame shape BEFORE adding duplicate: {customers_clean.shape}")
print(f"Customers DataFrame shape AFTER adding duplicate: {customers_with_duplicates.shape}")

# drop_duplicates keeps the first occurrence of every repeated row
customers_deduplicated = customers_with_duplicates.drop_duplicates().copy()
print(f"Customers DataFrame shape AFTER removing duplicates: {customers_deduplicated.shape}")

# Check for duplicates based on specific columns
customer_id_duplicates = customers_clean.duplicated(subset=['customer_id']).sum()
print(f"\nChecking for duplicates based on 'customer_id' (should be 0 if customer_id is unique key): {customer_id_duplicates}")
# In the generated transactions every row (one purchased item) receives its own
# transaction_id, so an exact duplicate row would mean every column repeats.
print(f"Checking for exact duplicate rows in Transactions: {transactions_clean.duplicated().sum()}")


# --- 2.4 Handling Outliers ---
print("\n--- 2.4 Handling Outliers ---")
# Outlier detection was covered in EDA (using IQR).
# Handling strategies depend on the nature of the data and the analysis goal.

print("\nOutlier Handling Strategies (Examples):")

# Example 1: Removing outliers (based on a threshold, e.g., from IQR or Z-score)
print("\nExample: Removing outliers in 'lifetime_value' (Customers) based on IQR")
Q1 = customers_clean['lifetime_value'].quantile(0.25)
Q3 = customers_clean['lifetime_value'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

print(f" IQR Outlier bounds for lifetime_value: [{lower_bound:.2f}, {upper_bound:.2f}]")

# Keep rows whose 'lifetime_value' lies inside the bounds (inclusive).
# Rows where the value is MISSING are kept as well: a comparison with NaN is
# always False, so a plain bounds filter would silently drop them and report
# them as removed outliers. Missing values are handled in section 2.2 instead.
lifetime_value_in_bounds = customers_clean['lifetime_value'].between(lower_bound, upper_bound)
lifetime_value_missing = customers_clean['lifetime_value'].isna()
customers_no_outliers_iqr = customers_clean[lifetime_value_in_bounds | lifetime_value_missing].copy()

print(f" Initial rows (Customers): {customers_clean.shape[0]}")
print(f" Rows after removing lifetime_value outliers (IQR): {customers_no_outliers_iqr.shape[0]}")


# Example 2: Capping/Winsorizing outliers (replacing outliers with the boundary value)
print("\nExample: Capping outliers in 'session_duration' (Website Visits)")
# The 1st and 99th percentiles serve as the lower and upper caps
lower_bound_cap = website_visits_clean['session_duration'].quantile(0.01)
upper_bound_cap = website_visits_clean['session_duration'].quantile(0.99)

print(f" Capping bounds for session_duration: [{lower_bound_cap:.2f}, {upper_bound_cap:.2f}]")

# Values below the lower cap are raised to it and values above the upper cap
# are lowered to it; everything in between is left as is.
website_visits_capped = website_visits_clean.copy()
website_visits_capped['session_duration'] = website_visits_capped['session_duration'].clip(
    lower=lower_bound_cap,
    upper=upper_bound_cap,
)

print(
    " Original session_duration min/max:",
    website_visits_clean['session_duration'].min(),
    "/",
    website_visits_clean['session_duration'].max(),
)
print(
    " Capped session_duration min/max:",
    website_visits_capped['session_duration'].min(),
    "/",
    website_visits_capped['session_duration'].max(),
)


# Example 3: Transforming data (e.g., log transformation for skewed data)
print("\nExample: Log transformation of 'total_price' (Transactions) for skewed distribution")
# Skewness of the raw values, for comparison with the transformed column
skew_before = transactions_clean['total_price'].skew().round(2)
print(" Skewness of total_price before transformation:", skew_before)

# log1p(x) = log(1 + x), which stays defined for x = 0.
# The new column is stored on `transactions_clean` itself.
transactions_clean['log_total_price'] = np.log1p(transactions_clean['total_price'])

skew_after = transactions_clean['log_total_price'].skew().round(2)
print(" Skewness of log_total_price after transformation:", skew_after)
print(" Transactions DataFrame with 'log_total_price':")
print(transactions_clean[['total_price', 'log_total_price']].head())
# Note: missing total_price values stay missing in the transformed column;
# handle NaNs before the transformation if that is not wanted.


# --- 2.5 Handling Inconsistent Data Formats / Types ---
print("\n--- 2.5 Handling Inconsistent Data Formats / Types ---")
# This often involves converting columns to the correct data types (e.g., object to numeric, object to datetime)
# or standardizing string formats.

print("\nExamples of Data Type Conversion (already done for dates in Part 1/EDA):")
# The date columns are created as datetime values by the data generator, so no
# conversion is needed here. For data read from files it would look like:
# transactions_clean['date'] = pd.to_datetime(transactions_clean['date'])
# customers_clean['signup_date'] = pd.to_datetime(customers_clean['signup_date'])
# website_visits_clean['timestamp'] = pd.to_datetime(website_visits_clean['timestamp'])
# campaign_exposures_clean['exposure_date'] = pd.to_datetime(campaign_exposures_clean['exposure_date'])
print(" Date columns are already converted to datetime objects.")
print(" Example: transactions_clean['date'].dtype:", transactions_clean['date'].dtype)


# Example: Standardizing string formats (e.g., converting to lowercase, removing whitespace)
# A few inconsistently formatted values are planted in a standalone copy of
# 'payment_method'. Working on a separate Series means `transactions_clean`
# never receives (and never has to drop) temporary columns.
payment_method_inconsistent = transactions_clean['payment_method'].copy()
# Positional assignment to rows 1-3: unlike label-based `.loc`, it can never
# append new rows when a label is absent; positions beyond the data are skipped.
for _position, _messy_value in enumerate([' credit card ', 'PayPal ', 'APPLE PAY'], start=1):
    if _position < len(payment_method_inconsistent):
        payment_method_inconsistent.iloc[_position] = _messy_value

print("\nExample: Standardizing 'payment_method_inconsistent' string format")
print("Before standardization:")
print(payment_method_inconsistent.unique())

payment_method_standardized = payment_method_inconsistent.str.lower().str.strip()

print("\nAfter standardization (lowercase and strip whitespace):")
print(payment_method_standardized.unique())


# Example: Converting data types (e.g., from object to numeric if applicable)
# If a numeric column was read as object due to non-numeric characters (e.g., '$', ','),
# you would clean it and convert.
# Example (conceptual):
# df['numeric_col'] = df['numeric_col'].str.replace('$', '', regex=False).str.replace(',', '', regex=False)
# df['numeric_col'] = pd.to_numeric(df['numeric_col'], errors='coerce') # errors='coerce' turns unparseable values into NaN

print("\nData preparation and cleaning examples complete.")
print(f"{'='*80}")


In [None]:
# ## 3. Comprehensive Exploratory Data Analysis (EDA)

# %% tags=[]
# The EDA section now becomes Part 3 and will use the potentially cleaned dataframes
# (depending on which cleaning steps were applied before calling the EDA function).
# For demonstration purposes, we will call EDA on the original data first,
# then you could call it on cleaned dataframes if you apply cleaning steps beforehand.

def _eda_pretty_label(column):
    """Return a display label for a column, e.g. 'avg_order_value' -> 'Avg Order Value'."""
    return str(column).replace("_", " ").title()


def _eda_overview(df):
    """Section 3.1: print shape, first/last rows and dtype / non-null information."""
    print("\n--- 3.1 Data Overview and Structure ---")
    print(f"\nShape: {df.shape[0]} rows, {df.shape[1]} columns")
    print("\nFirst 5 rows:")
    print(df.head())
    print("\nLast 5 rows:")
    print(df.tail())
    print("\nColumn information (dtype, non-null counts):")
    df.info()  # info() writes to stdout itself and returns None


def _eda_quality(name, df):
    """Section 3.2: missing values (table + bar chart), duplicate rows, categorical uniques."""
    print("\n--- 3.2 Data Quality Checks ---")
    missing_per_column = df.isnull().sum()
    print("\nMissing values per column:")
    print(missing_per_column)

    # Only columns that actually contain missing values are charted.
    missing_counts = missing_per_column[missing_per_column > 0].sort_values(ascending=False)
    if not missing_counts.empty:
        plt.figure(figsize=(10, 6))
        missing_counts.plot(kind='bar', color='skyblue')
        plt.title(f'Missing Values per Column in {str(name).upper()}')
        plt.xlabel('Columns')
        plt.ylabel('Number of Missing Values')
        plt.xticks(rotation=45, ha='right')
        plt.tight_layout()
        plt.show()
    else:
        print("No missing values to visualize.")

    duplicate_count = df.duplicated().sum()
    print("\nNumber of duplicate rows:", duplicate_count)
    if duplicate_count > 0:
        print("Sample of duplicate rows:")
        # keep=False marks the first occurrence too, so originals and copies are shown together.
        print(df[df.duplicated(keep=False)].sort_values(by=list(df.columns)).head())

    print("\nUnique values in categorical columns (first 10 if many):")
    for col in df.select_dtypes(include='object').columns:
        unique_vals = df[col].unique()
        if len(unique_vals) > 10:
            print(f"  {col} ({len(unique_vals)} unique): {unique_vals[:10]}...")
        else:
            print(f"  {col} ({len(unique_vals)} unique): {unique_vals}")


def _eda_descriptive(df, categorical_cols):
    """Section 3.3: describe() summaries, skipping summaries that pandas cannot compute."""
    print("\n--- 3.3 Descriptive Statistics ---")
    print("\nDescriptive statistics for numerical columns:")
    if df.shape[1] > 0:
        print(df.describe())
    else:
        print("No columns to describe.")

    print("\nDescriptive statistics for categorical columns:")
    # describe(include='object') raises ValueError when the frame has no object columns.
    if not categorical_cols.empty:
        print(df.describe(include='object'))
    else:
        print("No categorical (object) columns to describe.")


def _eda_numeric_distributions(df, numerical_cols):
    """Section 3.4: histogram + box plot per numeric column, then skewness and kurtosis."""
    print("\n--- 3.4 Distribution Analysis (Numerical Columns) ---")
    if numerical_cols.empty:
        return

    print("\nHistograms and Box Plots for Numerical Columns:")
    for col in numerical_cols:
        label = _eda_pretty_label(col)
        values = df[col].dropna()
        plt.figure(figsize=(14, 6))

        # Left panel: histogram
        plt.subplot(1, 2, 1)
        plt.hist(values, bins=30, color='lightcoral', edgecolor='black')
        plt.title(f'Distribution of {label}')
        plt.xlabel(label)
        plt.ylabel('Frequency')

        # Right panel: horizontal box plot
        plt.subplot(1, 2, 2)
        plt.boxplot(values, vert=False, patch_artist=True, boxprops=dict(facecolor='lightgreen'))
        plt.title(f'Box Plot of {label}')
        plt.xlabel(label)

        plt.tight_layout()
        plt.show()

    print("\nSkewness and Kurtosis for Numerical Columns:")
    skewness = df[numerical_cols].skew().sort_values(ascending=False)
    kurtosis = df[numerical_cols].kurtosis().sort_values(ascending=False)
    # The two Series are aligned on column name when combined.
    stats_df = pd.DataFrame({'Skewness': skewness, 'Kurtosis': kurtosis})
    print(stats_df)


def _eda_categorical_distributions(df, categorical_cols):
    """Section 3.5: value counts (NaN included) and a bar chart per object column."""
    print("\n--- 3.5 Distribution Analysis (Categorical Columns) ---")
    if categorical_cols.empty:
        return

    print("\nValue Counts and Bar Plots for Categorical Columns:")
    for col in categorical_cols:
        label = _eda_pretty_label(col)
        counts = df[col].value_counts(dropna=False)  # keep NaN as its own bucket
        print(f"\nValue counts for '{col}':")
        print(counts)

        plt.figure(figsize=(10, 6))
        counts.plot(kind='bar', color='teal', edgecolor='black')
        plt.title(f'Distribution of {label}')
        plt.xlabel(label)
        plt.ylabel('Count')
        plt.xticks(rotation=45, ha='right')
        plt.tight_layout()
        plt.show()


def _eda_relationships(df, numerical_cols):
    """Section 3.6 (part 1): correlation matrix plus two column-specific examples."""
    print("\n--- 3.6 Relationship Analysis (Examples) ---")

    # A correlation matrix needs at least two numeric columns.
    if len(numerical_cols) > 1:
        print("\nCorrelation Matrix for Numerical Columns:")
        correlation_matrix = df[numerical_cols].corr()
        print(correlation_matrix)

        plt.figure(figsize=(10, 8))
        plt.matshow(correlation_matrix, fignum=plt.gcf().number, cmap='coolwarm', aspect='auto')
        plt.xticks(range(len(numerical_cols)), numerical_cols, rotation=90)
        plt.yticks(range(len(numerical_cols)), numerical_cols)
        plt.colorbar(label='Correlation Coefficient')
        plt.title('Correlation Matrix', fontsize=16)
        # Annotate every cell with its coefficient.
        for i in range(len(numerical_cols)):
            for j in range(len(numerical_cols)):
                plt.text(j, i, f"{correlation_matrix.iloc[i, j]:.2f}",
                         ha="center", va="center", color="black", fontsize=8)
        plt.tight_layout()
        plt.show()

    # Categorical vs numerical: only when both columns are present.
    if 'gender' in df.columns and 'avg_order_value' in df.columns:
        print("\nAverage Avg Order Value by Gender:")
        avg_order_by_gender = (
            df.groupby('gender')['avg_order_value'].mean().dropna().sort_values(ascending=False)
        )
        print(avg_order_by_gender)

        plt.figure(figsize=(8, 5))
        avg_order_by_gender.plot(kind='bar', color='purple', edgecolor='black')
        plt.title('Average Order Value by Gender')
        plt.xlabel('Gender')
        plt.ylabel('Average Order Value')
        plt.xticks(rotation=0)
        plt.tight_layout()
        plt.show()

    # Categorical vs categorical: only when both columns are present.
    if 'customer_segment' in df.columns and 'location' in df.columns:
        print("\nCustomer Segment Distribution by Location (Cross-tabulation):")
        cross_tab = pd.crosstab(df['customer_segment'], df['location'])
        print(cross_tab)

        cross_tab.plot(kind='bar', stacked=True, figsize=(12, 7))
        plt.title('Customer Segment Distribution by Location')
        plt.xlabel('Customer Segment')
        plt.ylabel('Count')
        plt.xticks(rotation=45, ha='right')
        plt.legend(title='Location')
        plt.tight_layout()
        plt.show()


def _eda_time_based(name, df):
    """Section 3.6 (part 2): time-based plots, computed without modifying `df`."""
    date_cols = df.select_dtypes(include='datetime64[ns]').columns
    if date_cols.empty:
        return

    print("\nTime-based Analysis Examples:")
    for date_col in date_cols:
        print(f"\nAnalysis based on '{date_col}':")
        # Time components are derived on the fly from this Series; nothing is written
        # back to `df`, so caller columns named 'year'/'month'/'hour' are never clobbered.
        timestamps = df[date_col]

        # Transaction plots are tied to the 'date' column itself so they are drawn once,
        # not once per datetime column.
        if name == 'transactions' and date_col == 'date':
            print("\nTransaction Count Over Time:")
            transactions_over_time = df.groupby(timestamps.dt.date).size()
            plt.figure(figsize=(14, 6))
            transactions_over_time.plot(kind='line', marker='o', linestyle='-', color='darkblue')
            plt.title('Daily Transaction Count Over Time')
            plt.xlabel('Date')
            plt.ylabel('Number of Transactions')
            plt.grid(True)
            plt.tight_layout()
            plt.show()

            if 'total_price' in df.columns:
                print("\nTotal Sales by Month:")
                # Group on monthly periods instead of resample('M'): the 'M' offset alias
                # is deprecated in recent pandas, while the period alias is stable.
                monthly_sales = df.groupby(timestamps.dt.to_period('M'))['total_price'].sum()
                if monthly_sales.empty:
                    print("No dated transactions to aggregate.")
                else:
                    # Re-insert months without sales as 0, matching resample's output.
                    all_months = pd.period_range(
                        monthly_sales.index.min(), monthly_sales.index.max(), freq='M'
                    )
                    monthly_sales = monthly_sales.reindex(all_months, fill_value=0)
                    plt.figure(figsize=(12, 6))
                    monthly_sales.plot(kind='bar', color='forestgreen', edgecolor='black')
                    plt.title('Total Sales by Month')
                    plt.xlabel('Month')
                    plt.ylabel('Total Sales')
                    plt.xticks(rotation=45, ha='right')
                    plt.tight_layout()
                    plt.show()

        if name == 'website_visits' and date_col == 'timestamp':
            print("\nWebsite Visits by Hour of Day:")
            visits_by_hour = timestamps.dt.hour.value_counts().sort_index()
            plt.figure(figsize=(10, 6))
            visits_by_hour.plot(kind='bar', color='indianred', edgecolor='black')
            plt.title('Website Visits by Hour of Day')
            plt.xlabel('Hour of Day')
            plt.ylabel('Number of Visits')
            plt.xticks(rotation=0)
            plt.tight_layout()
            plt.show()


def perform_comprehensive_eda(data_dict, dataset_name="E-commerce"):
    """
    Performs comprehensive exploratory data analysis on the datasets.

    For every DataFrame in `data_dict` this prints and plots, in order: structure
    overview (3.1), data quality checks (3.2), descriptive statistics (3.3),
    numerical distributions (3.4), categorical distributions (3.5) and relationship /
    time-based examples (3.6). The input DataFrames are only read, never modified.

    Parameters:
    -----------
    data_dict : dict of pandas DataFrames
        A dictionary mapping a dataset name to its DataFrame. The names
        'transactions' and 'website_visits' enable the dataset-specific time plots.
    dataset_name : str
        Name of the dataset for printing titles.

    Returns:
    --------
    None. All results are printed or shown as Matplotlib figures.
    """
    print(f"\n{'='*80}")
    print(f"PART 3: COMPREHENSIVE EXPLORATORY DATA ANALYSIS (EDA) - {dataset_name.upper()}")
    print(f"{'='*80}")

    for name, df in data_dict.items():
        print(f"\n--- EDA for {str(name).upper()} ---")

        # Column groups are computed once and shared by the section helpers.
        numerical_cols = df.select_dtypes(include=np.number).columns
        categorical_cols = df.select_dtypes(include='object').columns

        _eda_overview(df)
        _eda_quality(name, df)
        _eda_descriptive(df, categorical_cols)
        _eda_numeric_distributions(df, numerical_cols)
        _eda_categorical_distributions(df, categorical_cols)
        _eda_relationships(df, numerical_cols)
        _eda_time_based(name, df)

    print(f"\n{'='*80}")
    print(f"EDA COMPLETE FOR {dataset_name.upper()}")
    print(f"{'='*80}")


In [None]:
# ## 4. Advanced Data Manipulation with Pandas

# %% tags=[]
print("\n" + "=" * 80)
print("PART 4: ADVANCED DATA MANIPULATION WITH PANDAS")
print("=" * 80)

# Independent working copies of each synthetic e-commerce table, so the manipulation
# examples below never alter the frames stored in ecommerce_data.
(
    customers_manip,
    transactions_manip,
    products_manip,
    website_visits_manip,
    marketing_manip,
    campaign_exposures_manip,
) = (
    ecommerce_data[table_name].copy()
    for table_name in (
        'customers',
        'transactions',
        'products',
        'website_visits',
        'marketing',
        'campaign_exposures',
    )
)


print("\n--- Advanced Data Manipulation Examples ---")

# --- 4.1 Merging/Joining DataFrames ---
print("\n--- 4.1 Merging/Joining DataFrames ---")

# Inner join: each transaction gains its customer's attributes; transactions whose
# customer_id has no match in the customers table are dropped.
print("\nExample: Merging Transactions with Customers (Inner Join)")
transactions_customers = transactions_manip.merge(
    customers_manip[['customer_id', 'age_group', 'gender', 'location', 'signup_date']],
    how='inner',
    on='customer_id',
)
print("Shape after merging transactions and customers:", transactions_customers.shape)
print(transactions_customers.head())

# Left join: every transaction is kept; product details are filled in where available.
print("\nExample: Left Joining Transactions with Products")
transactions_products = transactions_manip.merge(
    products_manip[['product_id', 'product_name', 'category', 'price']],
    how='left',
    on='product_id',
)
print("Shape after left joining transactions and products:", transactions_products.shape)
print(transactions_products.head())

# Chained joins: transactions -> customers -> products, both as left joins.
print("\nExample: Merging Transactions, Customers, and Products")
transactions_full = (
    transactions_manip
    .merge(customers_manip[['customer_id', 'age_group', 'gender', 'location']],
           how='left', on='customer_id')
    .merge(products_manip[['product_id', 'product_name', 'category']],
           how='left', on='product_id')
)
print("Shape after merging transactions, customers, and products:", transactions_full.shape)
print(transactions_full.head())

# Left join again, because a visit's product_id can be NaN.
print("\nExample: Left Joining Website Visits with Products")
website_visits_products = website_visits_manip.merge(
    products_manip[['product_id', 'product_name', 'category']],
    how='left',
    on='product_id',
)
print("Shape after left joining website visits and products:", website_visits_products.shape)
print(website_visits_products.head())

# Attach campaign type and budget to every exposure record.
print("\nExample: Merging Campaign Exposures with Marketing Details")
campaign_exposures_marketing = campaign_exposures_manip.merge(
    marketing_manip[['campaign_id', 'campaign_type', 'budget']],
    how='left',
    on='campaign_id',
)
print("Shape after merging campaign exposures and marketing:", campaign_exposures_marketing.shape)
print(campaign_exposures_marketing.head())


# --- 4.2 Concatenating DataFrames ---
print("\n--- 4.2 Concatenating DataFrames ---")

# Stacking frames that share the same columns: generate a second, smaller customer
# batch and shift its IDs past the largest existing ID so they stay unique.
customers_batch2 = generate_synthetic_ecommerce_data(n_customers=50)['customers']
customers_batch2 = customers_batch2.assign(
    customer_id=customers_batch2['customer_id'] + customers_manip['customer_id'].max()
)

print("\nExample: Concatenating two Customer DataFrames")
print("Shape of original customers:", customers_manip.shape)
print("Shape of second customer batch:", customers_batch2.shape)

# ignore_index=True discards both original indexes and renumbers the rows from 0.
customers_combined = pd.concat((customers_manip, customers_batch2), ignore_index=True)
print("Shape after concatenation:", customers_combined.shape)
print(customers_combined.tail())  # the tail rows come from the second batch

# Stacking frames with different columns: non-shared columns are padded with NaN.
# Rarely meaningful in practice; shown only to illustrate how concat aligns columns.
print("\nExample: Concatenating Customers and Products (different columns)")
customers_products_concat = pd.concat((customers_manip, products_manip), ignore_index=True)
print("Shape after concatenating customers and products:", customers_products_concat.shape)
print(customers_products_concat.head())
print(customers_products_concat.tail())  # NaNs appear where columns do not overlap


# --- 4.3 Grouping and Aggregating Data (GroupBy) ---
print("\n--- 4.3 Grouping and Aggregating Data (GroupBy) ---")

# One row per customer with the sum of their transaction totals.
print("\nExample: Total Sales by Customer")
customer_sales = (
    transactions_manip.groupby('customer_id')['total_price']
    .sum()
    .rename('total_sales')
    .reset_index()
)
print(customer_sales.head())

# One row per payment method with the mean transaction total.
print("\nExample: Average Transaction Value by Payment Method")
avg_txn_by_payment = (
    transactions_manip.groupby('payment_method')['total_price']
    .mean()
    .rename('average_transaction_value')
    .reset_index()
)
print(avg_txn_by_payment.head())

# Several aggregates at once via named aggregation.
print("\nExample: Transaction Count and Total Quantity per Product")
product_summary = (
    transactions_manip.groupby('product_id')
    .agg(
        transaction_count=pd.NamedAgg(column='transaction_id', aggfunc='count'),
        total_quantity=pd.NamedAgg(column='quantity', aggfunc='sum'),
    )
    .reset_index()
)
print(product_summary.head())

# Grouping on two keys: customer and calendar month of the transaction.
print("\nExample: Total Sales by Customer and Month")
transactions_manip['date'] = pd.to_datetime(transactions_manip['date'])
transactions_manip['month'] = transactions_manip['date'].dt.to_period('M')  # temporary key

monthly_customer_sales = (
    transactions_manip.groupby(['customer_id', 'month'])['total_price']
    .sum()
    .rename('monthly_sales')
    .reset_index()
)
print(monthly_customer_sales.head())
del transactions_manip['month']  # remove the temporary grouping key again


# --- 4.4 Pivoting Data ---
print("\n--- 4.4 Pivoting Data ---")

# Total sales per (location, gender), built from the transactions/customers merge above.
print("\nExample: Pivot Table - Total Sales by Location and Gender")
if {'location', 'gender', 'total_price'}.issubset(transactions_customers.columns):
    sales_pivot = transactions_customers.pivot_table(
        index='location',
        columns='gender',
        values='total_price',
        aggfunc='sum',
        fill_value=0,  # combinations that never occur show 0 instead of NaN
    )
    print(sales_pivot)
else:
    print("Required columns ('location', 'gender', 'total_price') not available in transactions_customers for pivoting.")


# Mean session duration per (device, channel).
print("\nExample: Pivot Table - Average Session Duration by Device and Channel")
if {'device', 'channel', 'session_duration'}.issubset(website_visits_manip.columns):
    session_pivot = website_visits_manip.pivot_table(
        index='device',
        columns='channel',
        values='session_duration',
        aggfunc='mean',
        fill_value=0,
    )
    print(session_pivot.round(2))
else:
    print("Required columns ('device', 'channel', 'session_duration') not available in website_visits_manip for pivoting.")


# --- 4.5 Melting Data ---
print("\n--- 4.5 Melting Data ---")
# melt() is the inverse of pivoting: it reshapes wide data into long format.

# Turn the wide sales pivot (one column per gender) back into one row per
# (location, gender) pair.
print("\nExample: Melting the Sales Pivot Table")
sales_pivot_reset = sales_pivot.reset_index()  # 'location' moves from the index to a column
sales_melted = sales_pivot_reset.melt(
    id_vars=['location'],      # kept as an identifier column
    var_name='gender',         # receives the former column headers
    value_name='total_sales',  # receives the former cell values
)
print(sales_melted.head())  # every gender column of the pivot appears here, not only Male/Female


# --- 4.6 Applying Functions (Apply, Map, ApplyMap) ---
print("\n--- 4.6 Applying Functions (Apply, Map, ApplyMap) ---")

# Example (Apply): run a function along one axis of the DataFrame (per row or per column).
print("\nExample: Applying a function to calculate discount amount per transaction item")
# discount amount = quantity * unit_price * discount
# Row-wise apply is used here purely to demonstrate the API; for real workloads the
# vectorized form `df['quantity'] * df['unit_price'] * df['discount']` is much faster.
transactions_manip['discount_amount'] = transactions_manip.apply(
    lambda row: row['quantity'] * row['unit_price'] * row['discount'],
    axis=1  # axis=1 passes one row at a time to the function
)
print(transactions_manip[['quantity', 'unit_price', 'discount', 'discount_amount', 'total_price']].head())
# Sanity check (optional): total_price should equal quantity * unit_price - discount_amount
# print((transactions_manip['quantity'] * transactions_manip['unit_price'] - transactions_manip['discount_amount'] - transactions_manip['total_price']).abs().sum()) # Should be close to 0


# Example (Map): translate the values of a Series through a dictionary or function.
print("\nExample: Mapping age group to a numerical category (Customers)")
age_group_mapping = {
    '18-24': 1, '25-34': 2, '35-44': 3, '45-54': 4, '55-64': 5, '65+': 6
}
customers_manip['age_group_numeric'] = customers_manip['age_group'].map(age_group_mapping)
print(customers_manip[['age_group', 'age_group_numeric']].head())
# Note: map() yields NaN for every value that is not a key of the dictionary
# (including values that were already NaN).


# Example (element-wise): apply a function to every single cell of a DataFrame.
# DataFrame.applymap was renamed to DataFrame.map in pandas 2.1. DataFrame.apply is
# NOT an element-wise tool: it only accepts axis=0 or axis=1, and axis=None raises
# "ValueError: No axis named None".
print("\nExample: Applying a function element-wise (DataFrame.map, formerly applymap)")
# Round every float in a small DataFrame to one decimal place.
temp_df = transactions_manip[['unit_price', 'total_price']].head()
print("\nOriginal small DataFrame:")
print(temp_df)

# Use DataFrame.map when this pandas version has it, otherwise fall back to applymap.
elementwise = temp_df.map if hasattr(temp_df, 'map') else temp_df.applymap
rounded_temp_df = elementwise(lambda value: round(value, 1))
# For plain rounding the vectorized `temp_df.round(1)` gives the same result.
print("\nRounded small DataFrame:")
print(rounded_temp_df)


# --- 4.7 Using Zip ---
print("\n--- 4.7 Using Zip ---")
# zip is a Python built-in function, not a Pandas method, but often used with Pandas.
# It pairs up the elements of several iterables position by position.

# Example: Creating a new column by combining values from two columns
print("\nExample: Creating a 'product_info' string by zipping product name and category")
# zip walks the two columns in lockstep, yielding one (name, category) pair per row.
# This avoids iterrows(), which builds a full Series object for every row.
product_info_list = [
    f"{product_name} ({category})"
    for product_name, category in zip(products_manip['product_name'], products_manip['category'])
]

products_manip['product_info'] = product_info_list
print(products_manip[['product_name', 'category', 'product_info']].head())

# A more "Pandas-idiomatic" way using string concatenation directly on Series:
# products_manip['product_info_pandas'] = products_manip['product_name'] + ' (' + products_manip['category'] + ')'
# print(products_manip[['product_name', 'category', 'product_info_pandas']].head())


print("\nAdvanced data manipulation examples complete.")
print(f"{'='*80}")


# %% [markdown]
# ## 5. Feature Engineering

# %% tags=[]
print("\n" + "=" * 80)
print("PART 5: FEATURE ENGINEERING")
print("=" * 80)

# Fresh copies of the synthetic e-commerce tables for the feature engineering examples;
# the products table is included for product-level features.
(
    customers_fe,
    transactions_fe,
    website_visits_fe,
    campaign_exposures_fe,
    products_fe,
) = (
    ecommerce_data[table_name].copy()
    for table_name in (
        'customers',
        'transactions',
        'website_visits',
        'campaign_exposures',
        'products',
    )
)


print("\n--- Feature Engineering Examples ---")


In [None]:
# --- 5.1 Creating Time-Based Features ---
print("\n--- 5.1 Creating Time-Based Features ---")

# Parse every date-like column to datetime before using the .dt accessor.
customers_fe = customers_fe.assign(signup_date=pd.to_datetime(customers_fe['signup_date']))
transactions_fe = transactions_fe.assign(date=pd.to_datetime(transactions_fe['date']))
website_visits_fe = website_visits_fe.assign(timestamp=pd.to_datetime(website_visits_fe['timestamp']))
campaign_exposures_fe = campaign_exposures_fe.assign(
    exposure_date=pd.to_datetime(campaign_exposures_fe['exposure_date'])
)

# Reference point for "days since" features: the most recent transaction date.
# (website_visits_fe['timestamp'].max() would be an alternative.)
latest_date = transactions_fe['date'].max()


# Calendar components of the signup date, plus account age in days.
print("\nExample: Features from Customer Signup Date")
customers_fe = customers_fe.assign(
    signup_year=lambda d: d['signup_date'].dt.year,
    signup_month=lambda d: d['signup_date'].dt.month,
    signup_day_of_week=lambda d: d['signup_date'].dt.dayofweek,  # Monday=0, Sunday=6
    signup_day_of_year=lambda d: d['signup_date'].dt.dayofyear,
    signup_week_of_year=lambda d: d['signup_date'].dt.isocalendar().week.astype(int),  # ISO week
    signup_quarter=lambda d: d['signup_date'].dt.quarter,
    days_since_signup=lambda d: (latest_date - d['signup_date']).dt.days,
)

print(customers_fe[['signup_date', 'signup_year', 'signup_month', 'signup_day_of_week',
                    'signup_day_of_year', 'signup_week_of_year', 'signup_quarter',
                    'days_since_signup']].head())


# Calendar components of the transaction date and a weekend flag.
print("\nExample: Features from Transaction Date")
transactions_fe = transactions_fe.assign(
    transaction_year=lambda d: d['date'].dt.year,
    transaction_month=lambda d: d['date'].dt.month,
    transaction_day_of_week=lambda d: d['date'].dt.dayofweek,
    transaction_hour=lambda d: d['date'].dt.hour,
    # dayofweek 5 and 6 are Saturday and Sunday -> 1, every other day -> 0
    is_weekend=lambda d: (d['transaction_day_of_week'] >= 5).astype(int),
)

print(transactions_fe[['date', 'transaction_year', 'transaction_month', 'transaction_day_of_week',
                       'transaction_hour', 'is_weekend']].head())


# Calendar and clock components of the visit timestamp.
print("\nExample: Features from Website Visit Timestamp")
website_visits_fe = website_visits_fe.assign(
    visit_year=lambda d: d['timestamp'].dt.year,
    visit_month=lambda d: d['timestamp'].dt.month,
    visit_day_of_week=lambda d: d['timestamp'].dt.dayofweek,
    visit_hour=lambda d: d['timestamp'].dt.hour,
    visit_minute=lambda d: d['timestamp'].dt.minute,
)

print(website_visits_fe[['timestamp', 'visit_year', 'visit_month', 'visit_day_of_week',
                         'visit_hour', 'visit_minute']].head())


In [None]:
# --- 5.2 Aggregation-Based Features ---
print("\n--- 5.2 Aggregation-Based Features ---")
# Create features by aggregating data from one table onto another (e.g., transaction stats per customer)

# Example: Aggregate transaction data onto the customer table
print("\nExample: Aggregating Transaction Data onto Customer Table")
# One row per customer: transaction counts, spend statistics, item counts and purchase dates.
customer_transaction_agg = transactions_fe.groupby('customer_id').agg(
    total_transactions=('transaction_id', 'nunique'),  # distinct transaction IDs per customer
    total_spend=('total_price', 'sum'),
    avg_transaction_value=('total_price', 'mean'),
    min_transaction_value=('total_price', 'min'),
    max_transaction_value=('total_price', 'max'),
    total_items_bought=('quantity', 'sum'),
    avg_items_per_transaction=('quantity', 'mean'),
    first_purchase_date=('date', 'min'),
    last_purchase_date=('date', 'max'),
    distinct_products_bought=('product_id', 'nunique')
).reset_index()

# Left merge keeps customers without any transaction; their aggregates are NaN/NaT.
customers_fe = pd.merge(customers_fe, customer_transaction_agg, on='customer_id', how='left')

# Recency, Frequency, Monetary (RFM) features
# Recency: days between the customer's last purchase and latest_date
customers_fe['recency'] = (latest_date - customers_fe['last_purchase_date']).dt.days
# Frequency: total number of transactions (already available as total_transactions)
# Monetary: total spend (already available as total_spend)

print(customers_fe[['customer_id', 'total_transactions', 'total_spend', 'avg_transaction_value',
                    'recency', 'first_purchase_date', 'last_purchase_date', 'distinct_products_bought']].head())

# Fill NaN values for customers with no transactions (0 for counts/sums).
# The result is assigned back instead of calling `customers_fe[col].fillna(..., inplace=True)`:
# that chained in-place form operates on a temporary Series, triggers a FutureWarning in
# pandas 2.x and leaves the DataFrame unchanged once Copy-on-Write is active.
fill_cols_transaction = ['total_transactions', 'total_spend', 'total_items_bought',
                         'distinct_products_bought']
customers_fe[fill_cols_transaction] = customers_fe[fill_cols_transaction].fillna(0)
# avg_transaction_value stays NaN for customers with 0 transactions, which is appropriate.
# Recency is NaN for customers without a purchase; use "days since signup + 1" as a fallback.
recency_fallback = (latest_date - customers_fe['signup_date']).dt.days + 1
customers_fe['recency'] = customers_fe['recency'].fillna(recency_fallback)


# Example: Aggregate website visit data onto the customer table
print("\nExample: Aggregating Website Visit Data onto Customer Table")
# One row per customer: visit counts, session/page statistics, last visit and variety of
# devices and channels used.
customer_visit_agg = website_visits_fe.groupby('customer_id').agg(
    total_visits=('visit_id', 'nunique'),
    total_session_duration=('session_duration', 'sum'),
    avg_session_duration=('session_duration', 'mean'),
    total_pages_viewed=('pages_viewed', 'sum'),
    avg_pages_per_visit=('pages_viewed', 'mean'),
    last_visit_timestamp=('timestamp', 'max'),
    distinct_devices_used=('device', 'nunique'),
    distinct_channels_used=('channel', 'nunique')
).reset_index()

# Left merge keeps customers that never visited the website.
customers_fe = pd.merge(customers_fe, customer_visit_agg, on='customer_id', how='left')
print(customers_fe[['customer_id', 'total_visits', 'avg_session_duration', 'total_pages_viewed',
                    'last_visit_timestamp', 'distinct_devices_used']].head())

# Fill NaNs for customers with no website visits (counts and sums become 0; the averages
# and the last-visit timestamp intentionally stay missing).
# Assign the filled columns back: `customers_fe[col].fillna(0, inplace=True)` works on a
# temporary Series and does not update the DataFrame under Copy-on-Write.
fill_cols_visit = ['total_visits', 'total_session_duration', 'total_pages_viewed',
                   'distinct_devices_used', 'distinct_channels_used']
customers_fe[fill_cols_visit] = customers_fe[fill_cols_visit].fillna(0)


# Example: Aggregate transaction data onto the product table
print("\nExample: Aggregating Transaction Data onto Product Table")
# One row per product: revenue, units sold, mean line total and breadth of demand.
product_transaction_agg = transactions_fe.groupby('product_id').agg(
    total_sales_product=('total_price', 'sum'),
    total_quantity_sold=('quantity', 'sum'),
    avg_price_sold=('total_price', 'mean'),  # mean of total_price, i.e. after discounts
    num_transactions_product=('transaction_id', 'nunique'),
    num_customers_bought=('customer_id', 'nunique')
).reset_index()

# Left merge keeps products that were never sold.
products_fe = pd.merge(products_fe, product_transaction_agg, on='product_id', how='left')
print(products_fe[['product_id', 'product_name', 'total_sales_product', 'total_quantity_sold',
                   'num_customers_bought']].head())

# Fill NaNs for products with no sales (avg_price_sold intentionally stays NaN).
# Assign the filled columns back: `products_fe[col].fillna(0, inplace=True)` works on a
# temporary Series and does not update the DataFrame under Copy-on-Write.
fill_cols_product = ['total_sales_product', 'total_quantity_sold', 'num_transactions_product', 'num_customers_bought']
products_fe[fill_cols_product] = products_fe[fill_cols_product].fillna(0)


In [None]:
# --- 5.3 Creating Interaction Features ---
print("\n--- 5.3 Creating Interaction Features ---")
# Combine existing features to create new ones that capture interactions.

# Example: Interaction between quantity and unit_price with the discount applied explicitly
transactions_fe['price_after_discount'] = transactions_fe['unit_price'] * (1 - transactions_fe['discount'])
transactions_fe['quantity_x_price_after_discount'] = transactions_fe['quantity'] * transactions_fe['price_after_discount']
# Optional check against total_price:
# print((transactions_fe['quantity_x_price_after_discount'] - transactions_fe['total_price']).abs().sum()) # Check if they are close


# Example: Interaction between age_group and loyalty_score (Customers)
# This might be useful if certain age groups respond differently based on loyalty.
# The numeric age group from section 4.6 was added to customers_manip, not to
# customers_fe, so it is (re)built here; without this step the example never ran.
if 'age_group_numeric' not in customers_fe.columns and 'age_group' in customers_fe.columns:
    # NOTE(review): bucket labels mirror the mapping used in section 4.6 - confirm they
    # match the labels produced by the data generator; unknown labels map to NaN.
    age_group_order = {'18-24': 1, '25-34': 2, '35-44': 3, '45-54': 4, '55-64': 5, '65+': 6}
    customers_fe['age_group_numeric'] = customers_fe['age_group'].map(age_group_order)

if 'age_group_numeric' in customers_fe.columns and 'loyalty_score' in customers_fe.columns:
    # Impute missing values only for the product, using local Series so that no
    # temporary columns are added to customers_fe.
    loyalty_score_filled = customers_fe['loyalty_score'].fillna(customers_fe['loyalty_score'].mean())
    age_group_numeric_filled = customers_fe['age_group_numeric'].fillna(customers_fe['age_group_numeric'].median())

    customers_fe['age_loyalty_interaction'] = age_group_numeric_filled * loyalty_score_filled
    print("\nExample: Age Group * Loyalty Score Interaction Feature (Customers)")
    print(customers_fe[['age_group', 'age_group_numeric', 'loyalty_score', 'age_loyalty_interaction']].head())
else:
    print("\nSkipping age/loyalty interaction: 'age_group' or 'loyalty_score' column not available.")


# --- 5.4 Creating Polynomial Features ---
print("\n--- 5.4 Creating Polynomial Features ---")
# Polynomial terms (squares and pairwise products of existing features) are a common
# way to let linear/regression models capture non-linear effects.

# Squares and cross-product of 'lifetime_value' and 'avg_order_value' (Customers)
print("\nExample: Polynomial Features for Lifetime Value and Avg Order Value (Customers)")
# Missing values are replaced by the column mean inside each expression, so the
# original columns keep their NaNs and no helper columns are left behind.
customers_fe = customers_fe.assign(
    lifetime_value_sq=lambda d: d['lifetime_value'].fillna(d['lifetime_value'].mean()) ** 2,
    avg_order_value_sq=lambda d: d['avg_order_value'].fillna(d['avg_order_value'].mean()) ** 2,
    lifetime_avg_order_interaction=lambda d: (
        d['lifetime_value'].fillna(d['lifetime_value'].mean())
        * d['avg_order_value'].fillna(d['avg_order_value'].mean())
    ),
)

print(customers_fe[['lifetime_value', 'avg_order_value', 'lifetime_value_sq',
                    'avg_order_value_sq', 'lifetime_avg_order_interaction']].head())


In [None]:
# --- 5.5 Encoding Categorical Features ---
print("\n--- 5.5 Encoding Categorical Features ---")
# Convert categorical variables into a numerical format that can be used by machine learning models.

# Example: One-Hot Encoding for 'gender' and 'location' (Customers)
print("\nExample: One-Hot Encoding 'gender' and 'location' (Customers)")
# Missing values are mapped to an explicit 'Unknown' category so they are
# neither dropped nor silently encoded as all-zero rows.
# The filled columns live only on a temporary copy; customers_fe is not modified.
ohe_input = customers_fe.assign(
    gender_filled=customers_fe['gender'].fillna('Unknown'),
    location_filled=customers_fe['location'].fillna('Unknown'),
)

customers_encoded_ohe = pd.get_dummies(
    ohe_input,
    columns=['gender_filled', 'location_filled'],
    prefix=['gender', 'location'],  # Prefix for the new indicator columns
    dummy_na=False  # Set to True if you want NaN to be its own category (if not filled)
)

# get_dummies removes the encoded source columns ('gender_filled',
# 'location_filled') from its result, and which indicator columns exist depends
# on the categories actually present in the data (e.g. 'gender_Unknown' only
# appears when 'gender' had NaNs). Discover them instead of hard-coding names.
ohe_indicator_cols = [col for col in customers_encoded_ohe.columns if col not in ohe_input.columns]
gender_indicator_cols = [col for col in ohe_indicator_cols if col.startswith('gender_')]
location_indicator_cols = [col for col in ohe_indicator_cols if col.startswith('location_')]

print(customers_encoded_ohe[['gender'] + gender_indicator_cols
                            + ['location'] + location_indicator_cols].head())


# Example: Label Encoding for 'customer_segment' (Customers)
print("\nExample: Label Encoding 'customer_segment' (Customers)")
# LabelEncoder maps each distinct category to an integer code. Suited to
# ordinal data or tree-based models that do not read the codes as magnitudes.
# NaNs become an explicit 'Unknown' category before encoding.
segment_filled = customers_fe['customer_segment'].fillna('Unknown')

label_encoder = LabelEncoder()
customers_fe['customer_segment_encoded'] = label_encoder.fit_transform(segment_filled)

# The filled values are shown for comparison only; they are not stored on customers_fe.
segment_preview = customers_fe.assign(customer_segment_filled=segment_filled)
print(segment_preview[['customer_segment', 'customer_segment_filled', 'customer_segment_encoded']].head())

segment_codes = label_encoder.transform(label_encoder.classes_)
print("Mapping of encoded values:", dict(zip(label_encoder.classes_, segment_codes)))


In [None]:
# --- 5.6 Binning Numerical Features ---
print("\n--- 5.6 Binning Numerical Features ---")
# Binning turns a continuous numerical variable into discrete, labelled ranges.

# Example: Binning 'loyalty_score' into loyalty levels (Customers)
print("\nExample: Binning 'loyalty_score' into Loyalty Levels (Customers)")
# Bin edges and their labels: [0, 20], (20, 50], (50, 80], (80, 100]
bins = [0, 20, 50, 80, 100]
labels = ['Low', 'Medium', 'High', 'Very High']

# Mean-impute missing scores so every customer lands in a bin.
loyalty_score_imputed = customers_fe['loyalty_score'].fillna(customers_fe['loyalty_score'].mean())

customers_fe['loyalty_level'] = pd.cut(
    loyalty_score_imputed,
    bins=bins,
    labels=labels,
    right=True,
    include_lowest=True,
)

# The imputed score is displayed next to the raw one but not kept on customers_fe.
loyalty_preview = customers_fe.assign(loyalty_score_filled=loyalty_score_imputed)
print(loyalty_preview[['loyalty_score', 'loyalty_score_filled', 'loyalty_level']].head())
print("\nValue counts for 'loyalty_level':")
print(customers_fe['loyalty_level'].value_counts(dropna=False))  # Check distribution


# Example: Binning 'avg_session_duration' into session duration categories (Website Visits)
print("\nExample: Binning 'avg_session_duration' into Session Duration Categories (Website Visits)")
# Relies on the per-customer 'avg_session_duration' aggregate being present on customers_fe.
if 'avg_session_duration' in customers_fe.columns:
    # Median imputation: more robust than the mean for skewed durations.
    session_duration_imputed = customers_fe['avg_session_duration'].fillna(
        customers_fe['avg_session_duration'].median()
    )

    # 0-60s, 60-300s, >300s. The last edge is open-ended (np.inf) rather than the
    # observed maximum: using the maximum makes the edges non-monotonic (and
    # pd.cut raise) whenever no session is longer than 300s.
    session_bins = [0, 60, 300, np.inf]
    session_labels = ['Short', 'Medium', 'Long']

    customers_fe['session_duration_category'] = pd.cut(
        session_duration_imputed,
        bins=session_bins,
        labels=session_labels,
        right=True,
        include_lowest=True,
    )

    # Show the imputed value alongside the raw one without storing it on customers_fe.
    session_preview = customers_fe.assign(avg_session_duration_filled=session_duration_imputed)
    print(session_preview[['avg_session_duration', 'avg_session_duration_filled', 'session_duration_category']].head())
    print("\nValue counts for 'session_duration_category':")
    print(customers_fe['session_duration_category'].value_counts(dropna=False))
else:
    print("\n'avg_session_duration' not found in customers_fe. Skipping binning example.")


In [None]:
# --- 5.7 Feature Scaling ---
print("\n--- 5.7 Feature Scaling ---")
# Normalize or standardize numerical features so they have a similar scale.
# Important for algorithms sensitive to feature scales (e.g., SVM, KNN, K-Means, Linear Regression with regularization).

# Example: Standard Scaling for 'total_spend' and 'recency' (Customers)
print("\nExample: Standard Scaling for 'total_spend' and 'recency' (Customers)")
# Use the customers_fe DataFrame with aggregated features
features_to_scale_std = ['total_spend', 'recency']

# Keep only the requested features that actually exist; warn about the rest.
present_std_features = []
for col in features_to_scale_std:
    if col in customers_fe.columns:
        present_std_features.append(col)
    else:
        print(f"Warning: Column '{col}' not found in customers_fe for scaling.")

if present_std_features:
    # Mean-impute on a temporary frame: the scaler cannot be fed a mix of NaNs
    # we want treated as "average", and the raw columns must stay untouched.
    std_input = customers_fe[present_std_features]
    std_input = std_input.fillna(std_input.mean())

    scaler_std = StandardScaler()
    std_scaled_values = scaler_std.fit_transform(std_input)

    # Store each scaled feature under '<feature>_scaled_std'.
    for position, col in enumerate(present_std_features):
        customers_fe[f'{col}_scaled_std'] = std_scaled_values[:, position]

    # Preview only columns that exist, so a missing feature (already warned
    # about above) does not turn into a KeyError here.
    std_preview_cols = [name for col in present_std_features for name in (col, f'{col}_scaled_std')]
    print(customers_fe[std_preview_cols].head())
else:
    print("No columns available for Standard Scaling example.")


# Example: Min-Max Scaling for 'loyalty_score' and 'avg_order_value' (Customers)
print("\nExample: Min-Max Scaling for 'loyalty_score' and 'avg_order_value' (Customers)")
features_to_scale_minmax = ['loyalty_score', 'avg_order_value']

# Keep only the requested features that actually exist; warn about the rest.
present_minmax_features = []
for col in features_to_scale_minmax:
    if col in customers_fe.columns:
        present_minmax_features.append(col)
    else:
        print(f"Warning: Column '{col}' not found in customers_fe for scaling.")

if present_minmax_features:
    # Mean-impute on a temporary frame so the raw columns stay untouched.
    minmax_input = customers_fe[present_minmax_features]
    minmax_input = minmax_input.fillna(minmax_input.mean())

    scaler_minmax = MinMaxScaler()
    minmax_scaled_values = scaler_minmax.fit_transform(minmax_input)

    # Store each scaled feature under '<feature>_scaled_minmax'.
    for position, col in enumerate(present_minmax_features):
        customers_fe[f'{col}_scaled_minmax'] = minmax_scaled_values[:, position]

    # Preview only columns that exist, so a missing feature (already warned
    # about above) does not turn into a KeyError here.
    minmax_preview_cols = [name for col in present_minmax_features for name in (col, f'{col}_scaled_minmax')]
    print(customers_fe[minmax_preview_cols].head())
else:
    print("No columns available for Min-Max Scaling example.")


# --- 5.8 Dimensionality Reduction (PCA, t-SNE) ---
print("\n--- 5.8 Dimensionality Reduction (PCA, t-SNE) ---")
# Reduce the number of features while retaining important information.

# Example: PCA on numerical features (Customers)
print("\nExample: PCA on Numerical Features (Customers)")
# Start from every numeric column, then remove IDs and derived/aggregated/scaled
# features so the projection is built from base attributes only.
numerical_features_for_pca = customers_fe.select_dtypes(include=np.number).columns.tolist()
exclude_cols_pca = ['customer_id', 'age_group_numeric', 'total_transactions', 'total_spend',
                    'avg_transaction_value', 'min_transaction_value', 'max_transaction_value',
                    'total_items_bought', 'avg_items_per_transaction', 'recency',
                    'total_visits', 'total_session_duration', 'avg_session_duration',
                    'total_pages_viewed', 'avg_pages_per_visit', 'distinct_devices_used',
                    'distinct_channels_used', 'lifetime_value_sq', 'avg_order_value_sq',
                    'lifetime_avg_order_interaction', 'age_loyalty_interaction',
                    'total_spend_scaled_std', 'recency_scaled_std', 'loyalty_score_scaled_minmax',
                    'avg_order_value_scaled_minmax']  # Exclude scaled/derived features

numerical_features_for_pca = [col for col in numerical_features_for_pca if col not in exclude_cols_pca]

print(f"Features used for PCA: {numerical_features_for_pca}")

# Defined unconditionally so later code can test for a 'PCA1' column even when
# PCA is skipped below (previously that path left the name undefined).
customers_fe_pca = customers_fe.copy()

if numerical_features_for_pca:
    X_pca = customers_fe[numerical_features_for_pca].dropna()  # PCA cannot handle NaNs

    if X_pca.empty:
        print("No non-null data available for PCA after dropping NaNs in selected columns.")
    elif min(X_pca.shape) < 2:
        # A 2-component PCA needs at least 2 rows and 2 columns.
        print(f"Need at least 2 samples and 2 features for a 2-component PCA; got shape {X_pca.shape}. Skipping PCA.")
    else:
        # Standardize first: PCA is variance-based and therefore scale-sensitive.
        scaler_pca = StandardScaler()
        X_pca_scaled = scaler_pca.fit_transform(X_pca)

        # Project onto the first two principal components.
        pca = PCA(n_components=2)
        X_pca_result = pca.fit_transform(X_pca_scaled)

        # Re-attach the components by index; rows dropped for NaNs get NaN components.
        pca_results_df = pd.DataFrame(X_pca_result, columns=['PCA1', 'PCA2'], index=X_pca.index)
        customers_fe_pca = customers_fe_pca.merge(pca_results_df, left_index=True, right_index=True, how='left')

        print("\nCustomers DataFrame with PCA components:")
        print(customers_fe_pca[['customer_id'] + numerical_features_for_pca + ['PCA1', 'PCA2']].head())

        # Visualize PCA results (scatter plot)
        plt.figure(figsize=(10, 8))
        plt.scatter(customers_fe_pca['PCA1'], customers_fe_pca['PCA2'], alpha=0.5)
        plt.title('PCA of Customer Numerical Features')
        plt.xlabel('Principal Component 1')
        plt.ylabel('Principal Component 2')
        plt.grid(True)
        plt.show()

        print("\nExplained variance ratio by principal components:", pca.explained_variance_ratio_)
        print("Total explained variance by 2 components:", pca.explained_variance_ratio_.sum().round(4))
else:
    print("No suitable numerical features found for PCA after exclusions.")


# Example: t-SNE on numerical features (Customers)
print("\nExample: t-SNE on Numerical Features (Customers)")
# t-SNE is computationally more expensive and typically used for visualization.
# It reuses the standardized matrix from the PCA example (X_pca_scaled / X_pca),
# subsampled to at most 200 customers to keep the run time short.
# The globals() check avoids a NameError when the PCA example did not run.
pca_available = 'customers_fe_pca' in globals() and 'PCA1' in customers_fe_pca.columns
if pca_available:
    if len(X_pca_scaled) > 200:
        # Seeded generator so the same 200 customers are drawn on every run.
        tsne_rng = np.random.default_rng(42)
        sample_indices = tsne_rng.choice(X_pca_scaled.shape[0], 200, replace=False)
        X_tsne_sample = X_pca_scaled[sample_indices]
        tsne_original_indices = X_pca.iloc[sample_indices].index  # Original index labels of the sample
        print("Using a random sample of 200 customers for t-SNE.")
    else:
        X_tsne_sample = X_pca_scaled
        tsne_original_indices = X_pca.index
        print(f"Using all {len(X_tsne_sample)} customers for t-SNE.")

    if len(X_tsne_sample) > 1:  # t-SNE requires at least 2 samples
        # perplexity must be strictly smaller than the number of samples.
        tsne_perplexity = min(30, len(X_tsne_sample) - 1)
        # The iteration-count argument was renamed from 'n_iter' to 'max_iter'
        # in newer scikit-learn releases; use whichever this version exposes.
        tsne_iter_param = 'max_iter' if 'max_iter' in TSNE().get_params() else 'n_iter'
        tsne = TSNE(n_components=2, random_state=42, perplexity=tsne_perplexity,
                    **{tsne_iter_param: 300})
        X_tsne_result = tsne.fit_transform(X_tsne_sample)

        # Carry the original index so the embedding can be joined back to customers.
        tsne_results_df = pd.DataFrame(X_tsne_result, columns=['TSNE1', 'TSNE2'], index=tsne_original_indices)

        # Left join onto the PCA frame; customers outside the sample get NaN.
        customers_fe_tsne = customers_fe_pca.merge(tsne_results_df, left_index=True, right_index=True, how='left')

        print("\nCustomers DataFrame with t-SNE components (for sampled data):")
        # dropna() first: the sample may not include the first rows of the frame.
        print(customers_fe_tsne[['customer_id', 'TSNE1', 'TSNE2']].dropna().head())

        # Visualize t-SNE results (scatter plot); NaN rows are simply not drawn.
        plt.figure(figsize=(10, 8))
        plt.scatter(customers_fe_tsne['TSNE1'], customers_fe_tsne['TSNE2'], alpha=0.6)
        plt.title('t-SNE of Customer Numerical Features (Sampled Data)')
        plt.xlabel('t-SNE Component 1')
        plt.ylabel('t-SNE Component 2')
        plt.grid(True)
        plt.show()
    else:
        print("Not enough samples (need at least 2) to perform t-SNE.")
else:
    print("PCA results not available. Skipping t-SNE example.")


print("\nFeature engineering examples complete.")
print(f"{'='*80}")


In [None]:
# ## 6. Regression Analysis

# %% tags=[]
print(f"\n{'='*80}")
print("PART 6: REGRESSION ANALYSIS")
print(f"{'='*80}")

# Regression examples use the California Housing data held in housing_df.
# Target: 'target' (median house value, in units of $100k).
# Features: housing and demographic metrics.

print("\n--- Regression Analysis Examples (using California Housing Dataset) ---")

# housing_df is expected to exist already (built in Part 1), e.g.:
# housing_df = pd.DataFrame(housing_data.data, columns=housing_data.feature_names)
# housing_df['target'] = housing_data.target

# --- 6.1 Data Preparation for Regression ---
print("\n--- 6.1 Data Preparation for Regression ---")

# Target vector and feature matrix
y = housing_df['target']
X = housing_df.drop(columns='target')

# Hold out 20% of the rows for testing (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(f"Training data shape: {X_train.shape}, {y_train.shape}")
print(f"Testing data shape: {X_test.shape}, {y_test.shape}")

# All columns are treated as numerical; there are no categorical features here.
numerical_features = list(X.columns)
categorical_features = []

# Preprocessing: standardize the numerical columns. A ColumnTransformer is used
# even though there is a single column group, so the same pattern extends to
# mixed-type data. Columns not listed are passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[('num', StandardScaler(), numerical_features)],
    remainder='passthrough',
)


In [None]:
# --- 6.2 Linear Regression ---
print("\n--- 6.2 Linear Regression ---")

# Baseline: shared preprocessing step followed by ordinary least squares.
pipeline_lr = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
])

# Fit on the training split, then predict the held-out split.
y_pred_lr = pipeline_lr.fit(X_train, y_train).predict(X_test)

# Test-set metrics
mse_lr = mean_squared_error(y_test, y_pred_lr)
r2_lr = r2_score(y_test, y_pred_lr)
rmse_lr = np.sqrt(mse_lr)

print("\nLinear Regression Results:")
print(f"  Mean Squared Error (MSE): {mse_lr:.4f}")
print(f"  Root Mean Squared Error (RMSE): {rmse_lr:.4f}")
print(f"  R-squared (R2): {r2_lr:.4f}")

# Actual vs predicted scatter; the dashed diagonal marks perfect predictions.
fig_lr, ax_lr = plt.subplots(figsize=(10, 6))
ax_lr.scatter(y_test, y_pred_lr, alpha=0.5)
actual_range = [y_test.min(), y_test.max()]
ax_lr.plot(actual_range, actual_range, 'k--', lw=2)
ax_lr.set_title('Linear Regression: Actual vs Predicted House Values')
ax_lr.set_xlabel('Actual House Value ($100k)')
ax_lr.set_ylabel('Predicted House Value ($100k)')
ax_lr.grid(True)
plt.show()


In [None]:
# --- 6.3 Ridge Regression (with Regularization) ---
print("\n--- 6.3 Ridge Regression (with Regularization) ---")

# Same preprocessing, L2-penalized linear model (alpha = regularization strength).
pipeline_ridge = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', Ridge(alpha=1.0)),
])

# Fit on the training split, then predict the held-out split.
y_pred_ridge = pipeline_ridge.fit(X_train, y_train).predict(X_test)

# Test-set metrics
mse_ridge = mean_squared_error(y_test, y_pred_ridge)
r2_ridge = r2_score(y_test, y_pred_ridge)
rmse_ridge = np.sqrt(mse_ridge)

print("\nRidge Regression Results (alpha=1.0):")
print(f"  Mean Squared Error (MSE): {mse_ridge:.4f}")
print(f"  Root Mean Squared Error (RMSE): {rmse_ridge:.4f}")
print(f"  R-squared (R2): {r2_ridge:.4f}")


In [None]:
# --- 6.4 Lasso Regression (with Regularization) ---
print("\n--- 6.4 Lasso Regression (with Regularization) ---")

# Same preprocessing, L1-penalized linear model (alpha = regularization strength).
pipeline_lasso = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', Lasso(alpha=0.1)),
])

# Fit on the training split, then predict the held-out split.
y_pred_lasso = pipeline_lasso.fit(X_train, y_train).predict(X_test)

# Test-set metrics
mse_lasso = mean_squared_error(y_test, y_pred_lasso)
r2_lasso = r2_score(y_test, y_pred_lasso)
rmse_lasso = np.sqrt(mse_lasso)

print("\nLasso Regression Results (alpha=0.1):")
print(f"  Mean Squared Error (MSE): {mse_lasso:.4f}")
print(f"  Root Mean Squared Error (RMSE): {rmse_lasso:.4f}")
print(f"  R-squared (R2): {r2_lasso:.4f}")


In [None]:
# --- 6.5 ElasticNet Regression (Combination of Ridge and Lasso) ---
print("\n--- 6.5 ElasticNet Regression (Combination of Ridge and Lasso) ---")

# Same preprocessing, mixed L1/L2 penalty: alpha sets the overall strength,
# l1_ratio the share of the L1 part.
pipeline_enet = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', ElasticNet(alpha=0.1, l1_ratio=0.5)),
])

# Fit on the training split, then predict the held-out split.
y_pred_enet = pipeline_enet.fit(X_train, y_train).predict(X_test)

# Test-set metrics
mse_enet = mean_squared_error(y_test, y_pred_enet)
r2_enet = r2_score(y_test, y_pred_enet)
rmse_enet = np.sqrt(mse_enet)

print("\nElasticNet Regression Results (alpha=0.1, l1_ratio=0.5):")
print(f"  Mean Squared Error (MSE): {mse_enet:.4f}")
print(f"  Root Mean Squared Error (RMSE): {rmse_enet:.4f}")
print(f"  R-squared (R2): {r2_enet:.4f}")


In [None]:
# --- 6.6 Random Forest Regressor ---
print("\n--- 6.6 Random Forest Regressor ---")

# Same preprocessing, followed by a small (10-tree) random forest with a fixed seed.
pipeline_rf = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=10, random_state=42)),
])

# Fit on the training split, then predict the held-out split.
y_pred_rf = pipeline_rf.fit(X_train, y_train).predict(X_test)

# Test-set metrics
mse_rf = mean_squared_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)
rmse_rf = np.sqrt(mse_rf)

print("\nRandom Forest Regressor Results:")
print(f"  Mean Squared Error (MSE): {mse_rf:.4f}")
print(f"  Root Mean Squared Error (RMSE): {rmse_rf:.4f}")
print(f"  R-squared (R2): {r2_rf:.4f}")

# Feature importances reported by the fitted forest
print("\nRandom Forest Feature Importances:")
# The preprocessor only scales the numerical columns, so the regressor's inputs
# line up one-to-one with numerical_features.
feature_names = numerical_features
importances = pipeline_rf.named_steps['regressor'].feature_importances_
feature_importances = pd.Series(importances, index=feature_names).sort_values(ascending=False)
print(feature_importances)

# Bar chart of the importances, largest first
fig_rf_imp, ax_rf_imp = plt.subplots(figsize=(10, 6))
feature_importances.plot(kind='bar', color='darkorange', ax=ax_rf_imp)
ax_rf_imp.set_title('Random Forest Regressor Feature Importances')
ax_rf_imp.set_xlabel('Features')
ax_rf_imp.set_ylabel('Importance')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()


In [None]:
# --- 6.7 Gradient Boosting Regressor ---
print("\n--- 6.7 Gradient Boosting Regressor ---")

# Same preprocessing, followed by gradient-boosted trees
# (100 stages of depth-3 trees, learning rate 0.1, fixed seed).
gbr_estimator = GradientBoostingRegressor(
    n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42
)
pipeline_gbr = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', gbr_estimator),
])

# Fit on the training split, then predict the held-out split.
y_pred_gbr = pipeline_gbr.fit(X_train, y_train).predict(X_test)

# Test-set metrics
mse_gbr = mean_squared_error(y_test, y_pred_gbr)
r2_gbr = r2_score(y_test, y_pred_gbr)
rmse_gbr = np.sqrt(mse_gbr)

print("\nGradient Boosting Regressor Results:")
print(f"  Mean Squared Error (MSE): {mse_gbr:.4f}")
print(f"  Root Mean Squared Error (RMSE): {rmse_gbr:.4f}")
print(f"  R-squared (R2): {r2_gbr:.4f}")


In [None]:
# --- 6.8 Support Vector Regressor (SVR) ---
print("\n--- 6.8 Support Vector Regressor (SVR) ---")

# Same preprocessing, followed by an RBF-kernel SVR.
pipeline_svr = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', SVR(kernel='rbf', C=1.0, epsilon=0.1)),
])

# Fitting an SVR can be slow on larger datasets, so progress is announced.
print("\nTraining SVR (this might take a moment)...")
pipeline_svr.fit(X_train, y_train)
print("SVR training complete.")

# Predict the held-out split
y_pred_svr = pipeline_svr.predict(X_test)

# Test-set metrics
mse_svr = mean_squared_error(y_test, y_pred_svr)
r2_svr = r2_score(y_test, y_pred_svr)
rmse_svr = np.sqrt(mse_svr)

print("\nSupport Vector Regressor (SVR) Results:")
print(f"  Mean Squared Error (MSE): {mse_svr:.4f}")
print(f"  Root Mean Squared Error (RMSE): {rmse_svr:.4f}")
print(f"  R-squared (R2): {r2_svr:.4f}")


In [None]:
# --- 6.9 K-Nearest Neighbors Regressor ---
print("\n--- 6.9 K-Nearest Neighbors Regressor ---")

# Same preprocessing, followed by a 5-neighbor KNN regressor.
pipeline_knn_r = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', KNeighborsRegressor(n_neighbors=5)),
])

# Fit on the training split, then predict the held-out split.
y_pred_knn_r = pipeline_knn_r.fit(X_train, y_train).predict(X_test)

# Test-set metrics
mse_knn_r = mean_squared_error(y_test, y_pred_knn_r)
r2_knn_r = r2_score(y_test, y_pred_knn_r)
rmse_knn_r = np.sqrt(mse_knn_r)

print("\nK-Nearest Neighbors Regressor Results (n_neighbors=5):")
print(f"  Mean Squared Error (MSE): {mse_knn_r:.4f}")
print(f"  Root Mean Squared Error (RMSE): {rmse_knn_r:.4f}")
print(f"  R-squared (R2): {r2_knn_r:.4f}")


In [None]:
# --- 6.10 Cross-Validation for Model Evaluation ---
print("\n--- 6.10 Cross-Validation for Model Evaluation ---")

# K-fold cross-validation gives a more robust performance estimate than a
# single train/test split.

# Linear Regression pipeline, scored by R-squared over 5 folds
print("\nCross-validation results for Linear Regression (R-squared):")
cv_scores_lr = cross_val_score(pipeline_lr, X, y, cv=5, scoring='r2')
print(f"  R-squared scores: {cv_scores_lr}")
print(f"  Mean R-squared: {cv_scores_lr.mean():.4f}")
print(f"  Standard Deviation of R-squared: {cv_scores_lr.std():.4f}")

# Random Forest pipeline, scored by negated MSE over 5 folds
print("\nCross-validation results for Random Forest Regressor (Negative MSE):")
# scikit-learn scorers follow a higher-is-better convention, so MSE is exposed
# as 'neg_mean_squared_error'; flip the sign and take the root to get RMSE.
cv_scores_rf_mse = cross_val_score(
    pipeline_rf, X, y, cv=5, scoring='neg_mean_squared_error'
)
cv_rmse_rf = np.sqrt(-cv_scores_rf_mse)
print(f"  RMSE scores: {cv_rmse_rf}")
print(f"  Mean RMSE: {cv_rmse_rf.mean():.4f}")
print(f"  Standard Deviation of RMSE: {cv_rmse_rf.std():.4f}")


In [None]:
# --- 6.11 Hyperparameter Tuning (Example with GridSearchCV) ---
print("\n--- 6.11 Hyperparameter Tuning (Example with GridSearchCV) ---")

# Example: exhaustive search over the Ridge regularization strength
print("\nExample: Hyperparameter Tuning for Ridge Regression (alpha) using GridSearchCV")

# Candidate values; the 'regressor__' prefix routes the parameter to the
# pipeline step named 'regressor'.
param_grid_ridge = {'regressor__alpha': [0.1, 1.0, 10.0, 100.0]}

# 5-fold search scored by negated MSE, run on all available cores
grid_search_ridge = GridSearchCV(
    pipeline_ridge,
    param_grid_ridge,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)

print("Performing GridSearchCV for Ridge (this might take a moment)...")
grid_search_ridge.fit(X_train, y_train)
print("GridSearchCV complete.")

# Best configuration and its cross-validated score (sign flipped for RMSE)
print(f"\nBest parameters for Ridge: {grid_search_ridge.best_params_}")
print(f"Best cross-validation negative MSE: {grid_search_ridge.best_score_:.4f}")
print(f"Best cross-validation RMSE: {np.sqrt(-grid_search_ridge.best_score_):.4f}")

# Score the refit best estimator on the held-out test split
best_ridge_model = grid_search_ridge.best_estimator_
y_pred_best_ridge = best_ridge_model.predict(X_test)
mse_best_ridge = mean_squared_error(y_test, y_pred_best_ridge)
r2_best_ridge = r2_score(y_test, y_pred_best_ridge)
rmse_best_ridge = np.sqrt(mse_best_ridge)

print("\nBest Ridge Model Results on Test Set:")
print(f"  Mean Squared Error (MSE): {mse_best_ridge:.4f}")
print(f"  Root Mean Squared Error (RMSE): {rmse_best_ridge:.4f}")
print(f"  R-squared (R2): {r2_best_ridge:.4f}")


# Example: randomized search over a few Random Forest parameters (for speed)
print("\nExample: Hyperparameter Tuning for Random Forest Regressor using RandomizedSearchCV")

# Values to sample from; the 'regressor__' prefix targets the pipeline's
# 'regressor' step.
param_dist_rf = {
    'regressor__n_estimators': [50, 100, 200],
    'regressor__max_depth': [None, 10, 20, 30],
    'regressor__min_samples_split': [2, 5, 10],
    'regressor__min_samples_leaf': [1, 2, 4],
}

# 10 sampled combinations, 5-fold CV each, scored by negated MSE, all cores
random_search_rf = RandomizedSearchCV(
    pipeline_rf,
    param_dist_rf,
    n_iter=10,
    cv=5,
    scoring='neg_mean_squared_error',
    random_state=42,
    n_jobs=-1,
)

print("\nPerforming RandomizedSearchCV for Random Forest (this might take a moment)...")
random_search_rf.fit(X_train, y_train)
print("RandomizedSearchCV complete.")

# Best configuration and its cross-validated score (sign flipped for RMSE)
print(f"\nBest parameters for Random Forest: {random_search_rf.best_params_}")
print(f"Best cross-validation negative MSE: {random_search_rf.best_score_:.4f}")
print(f"Best cross-validation RMSE: {np.sqrt(-random_search_rf.best_score_):.4f}")

# Score the refit best estimator on the held-out test split
best_rf_model = random_search_rf.best_estimator_
y_pred_best_rf = best_rf_model.predict(X_test)
mse_best_rf = mean_squared_error(y_test, y_pred_best_rf)
r2_best_rf = r2_score(y_test, y_pred_best_rf)
rmse_best_rf = np.sqrt(mse_best_rf)

print("\nBest Random Forest Model Results on Test Set:")
print(f"  Mean Squared Error (MSE): {mse_best_rf:.4f}")
print(f"  Root Mean Squared Error (RMSE): {rmse_best_rf:.4f}")
print(f"  R-squared (R2): {r2_best_rf:.4f}")


print("\nRegression analysis examples complete.")
print(f"{'='*80}")


In [None]:
# ## 7. Classification Analysis

# %% tags=[]
print(f"\n{'='*80}")
print("PART 7: CLASSIFICATION ANALYSIS")
print(f"{'='*80}")

# Classification examples use the Wine data held in wine_df.
# Target: 'target' (wine type: 0, 1, or 2).
# Features: chemical properties of the wine.

print("\n--- Classification Analysis Examples (using Wine Dataset) ---")

# wine_df is expected to exist already (built in Part 1), e.g.:
# wine_data = load_wine()
# wine_df = pd.DataFrame(wine_data.data, columns=wine_data.feature_names)
# wine_df['target'] = wine_data.target

# --- 7.1 Data Preparation for Classification ---
print("\n--- 7.1 Data Preparation for Classification ---")

# Target vector and feature matrix
y_clf = wine_df['target']
X_clf = wine_df.drop(columns='target')

# Hold out 20% for testing; stratify on the target so both splits keep the
# same class proportions.
X_train_clf, X_test_clf, y_train_clf, y_test_clf = train_test_split(
    X_clf, y_clf, test_size=0.2, random_state=42, stratify=y_clf
)

print(f"Training data shape: {X_train_clf.shape}, {y_train_clf.shape}")
print(f"Testing data shape: {X_test_clf.shape}, {y_test_clf.shape}")
print("\nClass distribution in training set:")
print(y_train_clf.value_counts(normalize=True))
print("\nClass distribution in testing set:")
print(y_test_clf.value_counts(normalize=True))


# All columns are treated as numerical; there are no categorical features here.
numerical_features_clf = list(X_clf.columns)
categorical_features_clf = []

# Preprocessing: standardize the numerical columns; anything else passes through.
preprocessor_clf = ColumnTransformer(
    transformers=[('num', StandardScaler(), numerical_features_clf)],
    remainder='passthrough',
)


In [None]:
# --- 7.2 Logistic Regression ---
print("\n--- 7.2 Logistic Regression ---")

# Pipeline: shared preprocessing followed by logistic regression.
# The default 'lbfgs' solver fits a true multinomial model for this 3-class
# target. The previous configuration (solver='liblinear' with an explicit
# multi_class argument) relies on options that newer scikit-learn releases
# deprecate: 'multi_class' is being removed and 'liblinear' is no longer
# supported for multiclass problems. max_iter is raised from the default so
# the solver has room to converge.
pipeline_lr_clf = Pipeline(steps=[
    ('preprocessor', preprocessor_clf),
    ('classifier', LogisticRegression(max_iter=1000, random_state=42)),
])

# Train the model
pipeline_lr_clf.fit(X_train_clf, y_train_clf)

# Class predictions and per-class probabilities (the latter feed ROC AUC)
y_pred_lr_clf = pipeline_lr_clf.predict(X_test_clf)
y_prob_lr_clf = pipeline_lr_clf.predict_proba(X_test_clf)

# Evaluate the model
accuracy_lr_clf = accuracy_score(y_test_clf, y_pred_lr_clf)
conf_matrix_lr_clf = confusion_matrix(y_test_clf, y_pred_lr_clf)
class_report_lr_clf = classification_report(y_test_clf, y_pred_lr_clf)

print("\nLogistic Regression Results:")
print(f"  Accuracy: {accuracy_lr_clf:.4f}")
print("\n  Confusion Matrix:")
print(conf_matrix_lr_clf)
print("\n  Classification Report:")
print(class_report_lr_clf)

# Multi-class ROC AUC needs an averaging scheme; use macro-averaged One-vs-Rest.
try:
    roc_auc_lr_clf = roc_auc_score(y_test_clf, y_prob_lr_clf, multi_class='ovr', average='macro')
    print(f"  Macro-averaged ROC AUC (One-vs-Rest): {roc_auc_lr_clf:.4f}")
except ValueError as e:
    # roc_auc_score raises ValueError when it cannot score the given labels/probabilities.
    print(f"  Could not calculate ROC AUC: {e}")


In [None]:
# --- 7.3 Random Forest Classifier ---
print("\n--- 7.3 Random Forest Classifier ---")

# Shared preprocessing followed by a 100-tree random forest (fixed seed).
pipeline_rf_clf = Pipeline([
    ('preprocessor', preprocessor_clf),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42)),
])

# Fit on the training split
pipeline_rf_clf.fit(X_train_clf, y_train_clf)

# Class predictions and per-class probabilities for the test split
y_pred_rf_clf = pipeline_rf_clf.predict(X_test_clf)
y_prob_rf_clf = pipeline_rf_clf.predict_proba(X_test_clf)

# Test-set metrics
accuracy_rf_clf = accuracy_score(y_test_clf, y_pred_rf_clf)
conf_matrix_rf_clf = confusion_matrix(y_test_clf, y_pred_rf_clf)
class_report_rf_clf = classification_report(y_test_clf, y_pred_rf_clf)

print("\nRandom Forest Classifier Results:")
print(f"  Accuracy: {accuracy_rf_clf:.4f}")
print("\n  Confusion Matrix:")
print(conf_matrix_rf_clf)
print("\n  Classification Report:")
print(class_report_rf_clf)

# Macro-averaged One-vs-Rest ROC AUC; a ValueError is reported, not raised.
try:
    roc_auc_rf_clf = roc_auc_score(
        y_test_clf, y_prob_rf_clf, multi_class='ovr', average='macro'
    )
    print(f"  Macro-averaged ROC AUC (One-vs-Rest): {roc_auc_rf_clf:.4f}")
except ValueError as e:
    print(f"  Could not calculate ROC AUC: {e}")


In [None]:
# --- 7.4 Support Vector Classifier (SVC) ---
print("\n--- 7.4 Support Vector Classifier (SVC) ---")

# Shared preprocessing followed by an RBF-kernel SVC.
# probability=True is required for predict_proba to be available.
pipeline_svc = Pipeline([
    ('preprocessor', preprocessor_clf),
    ('classifier', SVC(kernel='rbf', probability=True, random_state=42)),
])

# Fitting an SVC can be slow, so progress is announced.
print("\nTraining SVC (this might take a moment)...")
pipeline_svc.fit(X_train_clf, y_train_clf)
print("SVC training complete.")

# Class predictions and per-class probabilities for the test split
y_pred_svc = pipeline_svc.predict(X_test_clf)
y_prob_svc = pipeline_svc.predict_proba(X_test_clf)

# Test-set metrics
accuracy_svc = accuracy_score(y_test_clf, y_pred_svc)
conf_matrix_svc = confusion_matrix(y_test_clf, y_pred_svc)
class_report_svc = classification_report(y_test_clf, y_pred_svc)

print("\nSupport Vector Classifier (SVC) Results:")
print(f"  Accuracy: {accuracy_svc:.4f}")
print("\n  Confusion Matrix:")
print(conf_matrix_svc)
print("\n  Classification Report:")
print(class_report_svc)

# Macro-averaged One-vs-Rest ROC AUC; a ValueError is reported, not raised.
try:
    roc_auc_svc = roc_auc_score(
        y_test_clf, y_prob_svc, multi_class='ovr', average='macro'
    )
    print(f"  Macro-averaged ROC AUC (One-vs-Rest): {roc_auc_svc:.4f}")
except ValueError as e:
    print(f"  Could not calculate ROC AUC: {e}")


In [None]:
# --- 7.5 K-Nearest Neighbors Classifier ---
print("\n--- 7.5 K-Nearest Neighbors Classifier ---")

# 5-nearest-neighbour classifier behind the `preprocessor_clf` transformer.
knn_steps = [
    ('preprocessor', preprocessor_clf),
    ('classifier', KNeighborsClassifier(n_neighbors=5)),
]
pipeline_knn_c = Pipeline(steps=knn_steps)

# Fit on the training split.
pipeline_knn_c.fit(X_train_clf, y_train_clf)

# Held-out predictions: per-class probabilities and hard labels.
y_prob_knn_c = pipeline_knn_c.predict_proba(X_test_clf)
y_pred_knn_c = pipeline_knn_c.predict(X_test_clf)

# Summary metrics derived from the hard-label predictions.
class_report_knn_c = classification_report(y_test_clf, y_pred_knn_c)
conf_matrix_knn_c = confusion_matrix(y_test_clf, y_pred_knn_c)
accuracy_knn_c = accuracy_score(y_test_clf, y_pred_knn_c)

print("\nK-Nearest Neighbors Classifier Results (n_neighbors=5):")
print(f"  Accuracy: {accuracy_knn_c:.4f}")
print(
    "\n  Confusion Matrix:",
    conf_matrix_knn_c,
    "\n  Classification Report:",
    class_report_knn_c,
    sep="\n",
)

# Macro-averaged one-vs-rest AUC; a ValueError from roc_auc_score is reported
# instead of aborting the cell.
try:
    roc_auc_knn_c = roc_auc_score(y_test_clf, y_prob_knn_c, average='macro', multi_class='ovr')
except ValueError as e:
    print(f"  Could not calculate ROC AUC: {e}")
else:
    print(f"  Macro-averaged ROC AUC (One-vs-Rest): {roc_auc_knn_c:.4f}")


In [None]:

# --- 7.6 Cross-Validation for Classification ---
print("\n--- 7.6 Cross-Validation for Classification ---")

# Logistic-regression pipeline: 5-fold cross-validation on the full
# classification data (X_clf, y_clf), scored by accuracy.
print("\nCross-validation results for Logistic Regression (Accuracy):")
cv_scores_lr_clf = cross_val_score(
    estimator=pipeline_lr_clf, X=X_clf, y=y_clf, scoring='accuracy', cv=5
)
print(
    f"  Accuracy scores: {cv_scores_lr_clf}",
    f"  Mean Accuracy: {cv_scores_lr_clf.mean():.4f}",
    f"  Standard Deviation of Accuracy: {cv_scores_lr_clf.std():.4f}",
    sep="\n",
)

# Random-forest pipeline: same 5-fold scheme, scored by macro-averaged F1.
print("\nCross-validation results for Random Forest Classifier (F1-score - macro):")
cv_scores_rf_f1 = cross_val_score(
    estimator=pipeline_rf_clf, X=X_clf, y=y_clf, scoring='f1_macro', cv=5
)
print(
    f"  F1-macro scores: {cv_scores_rf_f1}",
    f"  Mean F1-macro: {cv_scores_rf_f1.mean():.4f}",
    f"  Standard Deviation of F1-macro: {cv_scores_rf_f1.std():.4f}",
    sep="\n",
)

In [None]:
# --- 7.7 Hyperparameter Tuning (Example with GridSearchCV) ---
print("\n--- 7.7 Hyperparameter Tuning (Example with GridSearchCV) ---")

# ---- Exhaustive search over SVC kernel and C ----
print("\nExample: Hyperparameter Tuning for SVC using GridSearchCV")

# The `classifier__` prefix routes each parameter to the pipeline's
# 'classifier' step.
param_grid_svc = dict(
    classifier__kernel=['linear', 'rbf'],
    classifier__C=[0.1, 1, 10],
)

# 5-fold CV on accuracy, using all available cores.
grid_search_svc = GridSearchCV(
    estimator=pipeline_svc,
    param_grid=param_grid_svc,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)

print("Performing GridSearchCV for SVC (this might take a moment)...")
grid_search_svc.fit(X_train_clf, y_train_clf)
print("GridSearchCV complete.")

# Winning configuration and its cross-validated score.
print(f"\nBest parameters for SVC: {grid_search_svc.best_params_}")
print(f"Best cross-validation accuracy: {grid_search_svc.best_score_:.4f}")

# Score the selected estimator on the held-out test split.
best_svc_model = grid_search_svc.best_estimator_
y_pred_best_svc = best_svc_model.predict(X_test_clf)
accuracy_best_svc = accuracy_score(y_test_clf, y_pred_best_svc)

print("\nBest SVC Model Results on Test Set:")
print(f"  Accuracy: {accuracy_best_svc:.4f}")


# ---- Randomized search over a subset of Random Forest parameters ----
print("\nExample: Hyperparameter Tuning for Random Forest Classifier using RandomizedSearchCV")

# Candidate values to sample from for each parameter.
param_dist_rf_clf = dict(
    classifier__n_estimators=[50, 100, 200, 300],
    classifier__max_depth=[None, 5, 10, 15],
    classifier__min_samples_split=[2, 5, 10],
    classifier__min_samples_leaf=[1, 2, 4],
    classifier__criterion=['gini', 'entropy'],
)

# 20 sampled combinations, 5-fold CV on accuracy, seeded for repeatability.
random_search_rf_clf = RandomizedSearchCV(
    estimator=pipeline_rf_clf,
    param_distributions=param_dist_rf_clf,
    n_iter=20,
    scoring='accuracy',
    cv=5,
    random_state=42,
    n_jobs=-1,
)

print("\nPerforming RandomizedSearchCV for Random Forest Classifier (this might take a moment)...")
random_search_rf_clf.fit(X_train_clf, y_train_clf)
print("RandomizedSearchCV complete.")

# Winning configuration and its cross-validated score.
print(f"\nBest parameters for Random Forest Classifier: {random_search_rf_clf.best_params_}")
print(f"Best cross-validation accuracy: {random_search_rf_clf.best_score_:.4f}")

# Score the selected estimator on the held-out test split.
best_rf_clf_model = random_search_rf_clf.best_estimator_
y_pred_best_rf_clf = best_rf_clf_model.predict(X_test_clf)
accuracy_best_rf_clf = accuracy_score(y_test_clf, y_pred_best_rf_clf)

print("\nBest Random Forest Classifier Model Results on Test Set:")
print(f"  Accuracy: {accuracy_best_rf_clf:.4f}")


In [None]:
# --- 7.8 Visualizing Classification Results (Example: ROC Curve for Binary Classification) ---
print("\n--- 7.8 Visualizing Classification Results (Example: ROC Curve for Binary Classification) ---")
# ROC curves are defined for a binary target. For a multi-class problem they
# are drawn one-vs-rest (or one-vs-one). This cell plots "class 1 vs the rest"
# from the Logistic Regression probabilities.

# Binary ground truth: 1 where the true class is 1, else 0.
y_test_clf_binary = (y_test_clf == 1).astype(int)

# `y_prob_lr_clf` holds one probability column per class (produced earlier in
# the notebook). Look up which column belongs to class 1 via the fitted
# classifier's `classes_`, then slice that column out.
lr_fitted_classes = pipeline_lr_clf.named_steps['classifier'].classes_
class_1_index = list(lr_fitted_classes).index(1)
y_prob_lr_clf_positive = y_prob_lr_clf[:, class_1_index]


# ROC points and the area under the curve.
fpr, tpr, thresholds = roc_curve(y_test_clf_binary, y_prob_lr_clf_positive)
roc_auc = roc_auc_score(y_test_clf_binary, y_prob_lr_clf_positive)

fig_roc, ax_roc = plt.subplots(figsize=(8, 6))
ax_roc.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
ax_roc.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')  # chance diagonal
ax_roc.set_xlim([0.0, 1.0])
ax_roc.set_ylim([0.0, 1.05])
ax_roc.set_xlabel('False Positive Rate')
ax_roc.set_ylabel('True Positive Rate')
ax_roc.set_title('Receiver Operating Characteristic (ROC) Curve (Class 1 vs Rest)')
ax_roc.legend(loc="lower right")
ax_roc.grid(True)
plt.show()


# Example: Precision-Recall Curve (Binary Classification)
print("\n--- Example: Precision-Recall Curve (Binary Classification) ---")
# Same binary target and class-1 probabilities as the ROC plot above.
precision, recall, _ = precision_recall_curve(y_test_clf_binary, y_prob_lr_clf_positive)
average_precision = average_precision_score(y_test_clf_binary, y_prob_lr_clf_positive)

fig_pr, ax_pr = plt.subplots(figsize=(8, 6))
ax_pr.plot(recall, precision, color='blue', lw=2, label=f'Precision-Recall curve (AP = {average_precision:.2f})')
ax_pr.set_xlabel('Recall')
ax_pr.set_ylabel('Precision')
ax_pr.set_title('Precision-Recall Curve (Class 1 vs Rest)')
ax_pr.legend(loc="lower left")
ax_pr.grid(True)
plt.show()


print("\nClassification analysis examples complete.")
print('=' * 80)


# %% [markdown]


In [None]:
# ## 8. Clustering Analysis

# %% tags=[]
section_rule = '=' * 80
print(f"\n{section_rule}", "PART 8: CLUSTERING ANALYSIS", section_rule, sep="\n")

# The clustering examples run on the Wine features only: the known class
# column is removed so the algorithms have to discover structure themselves.

print("\n--- Clustering Analysis Examples (using Wine Dataset features) ---")

# Independent copy of every Wine column except 'target'.
X_cluster = wine_df.drop(columns=['target']).copy()


In [None]:
# --- 8.1 Data Preparation for Clustering ---
print("\n--- 8.1 Data Preparation for Clustering ---")

# Distance-based clustering is sensitive to feature scale, so every feature
# is standardized before clustering.
scaler_cluster = StandardScaler().fit(X_cluster)
X_scaled_cluster = scaler_cluster.transform(X_cluster)

# DataFrame view of the scaled array, labelled with the original column names.
X_scaled_cluster_df = pd.DataFrame(data=X_scaled_cluster, columns=X_cluster.columns)

print("Scaled features for clustering (first 5 rows):")
print(X_scaled_cluster_df.head())


In [None]:
# --- 8.2 K-Means Clustering ---
print("\n--- 8.2 K-Means Clustering ---")

# ---- Elbow method: inertia for K = 1..10 ----
print("\nDetermining optimal number of clusters using the Elbow Method (K-Means)...")
k_range = range(1, 11)
sse = []  # inertia per K: sum of squared distances to the nearest centroid

for k in k_range:
    # n_init=10 restarts guard against a poor local minimum.
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10).fit(X_scaled_cluster)
    sse.append(kmeans.inertia_)

fig_elbow, ax_elbow = plt.subplots(figsize=(8, 5))
ax_elbow.plot(k_range, sse, marker='o', linestyle='-')
ax_elbow.set_title('Elbow Method for Optimal K (K-Means)')
ax_elbow.set_xlabel('Number of Clusters (K)')
ax_elbow.set_ylabel('Sum of Squared Errors (SSE)')
ax_elbow.set_xticks(k_range)
ax_elbow.grid(True)
plt.show()

print("Based on the elbow method, choose a K where the decrease in SSE starts to slow down.")
# K=3 is used below; the Wine data has 3 known classes.

# ---- Final K-Means fit with the chosen K ----
optimal_k = 3
kmeans_model = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
clusters_kmeans = kmeans_model.fit_predict(X_scaled_cluster)

# Store the labels next to both the raw and the scaled features.
X_cluster['kmeans_cluster'] = clusters_kmeans
X_scaled_cluster_df['kmeans_cluster'] = clusters_kmeans

print(f"\nK-Means Clustering Results (K={optimal_k}):")
print("\nCluster distribution:")
print(X_cluster['kmeans_cluster'].value_counts())

# Silhouette score: an intrinsic measure (no ground-truth labels involved) of
# how close each sample is to its own cluster versus the others. Higher means
# better-separated clusters. It is only defined for 1 < K < n_samples.
if 1 < optimal_k < len(X_scaled_cluster):
    silhouette_avg_kmeans = silhouette_score(X_scaled_cluster, clusters_kmeans)
    print(f"\nSilhouette Score (K-Means, K={optimal_k}): {silhouette_avg_kmeans:.4f}")
else:
    print("\nSilhouette Score cannot be calculated for K=1 or K=n_samples.")


# ---- 2-D view: project onto the first two principal components ----
print("\nVisualizing K-Means Clusters (using first 2 PCA components for visualization)...")
pca_cluster_viz = PCA(n_components=2)
X_pca_cluster_viz = pca_cluster_viz.fit_transform(X_scaled_cluster)

fig_kmeans, ax_kmeans = plt.subplots(figsize=(10, 8))
# One point per sample, coloured by its K-Means label.
scatter = ax_kmeans.scatter(
    X_pca_cluster_viz[:, 0],
    X_pca_cluster_viz[:, 1],
    c=clusters_kmeans,
    cmap='viridis',
    alpha=0.7,
    s=50,
)
ax_kmeans.set_title(f'K-Means Clustering (K={optimal_k}) visualized with PCA')
ax_kmeans.set_xlabel('Principal Component 1')
ax_kmeans.set_ylabel('Principal Component 2')
fig_kmeans.colorbar(scatter, ax=ax_kmeans, label='Cluster')
ax_kmeans.grid(True)
plt.show()

# ---- Optional sanity check against the known Wine classes ----
print("\nComparing K-Means Clusters to Actual Wine Classes:")
# Rows: true class; columns: assigned cluster.
comparison_kmeans_actual = pd.crosstab(wine_df['target'], X_cluster['kmeans_cluster'])
print(comparison_kmeans_actual)


In [None]:
# --- 8.3 Hierarchical Clustering ---
print("\n--- 8.3 Hierarchical Clustering ---")

# Agglomerative clustering cut at 3 clusters (same count as the K-Means run).
n_clusters_hierarchical = 3
agg_clustering = AgglomerativeClustering(n_clusters=n_clusters_hierarchical)
clusters_hierarchical = agg_clustering.fit_predict(X_scaled_cluster)

# Keep the labels alongside the raw features.
X_cluster['hierarchical_cluster'] = clusters_hierarchical

print(f"\nHierarchical Clustering Results (Number of Clusters={n_clusters_hierarchical}):")
print("\nCluster distribution:")
print(X_cluster['hierarchical_cluster'].value_counts())

# Silhouette score is only defined for 1 < n_clusters < n_samples.
if 1 < n_clusters_hierarchical < len(X_scaled_cluster):
    silhouette_avg_hierarchical = silhouette_score(X_scaled_cluster, clusters_hierarchical)
    print(f"\nSilhouette Score (Hierarchical, Clusters={n_clusters_hierarchical}): {silhouette_avg_hierarchical:.4f}")
else:
    print("\nSilhouette Score cannot be calculated for 1 or n_samples clusters.")


# Scatter on the PCA projection computed in the K-Means cell.
fig_hier, ax_hier = plt.subplots(figsize=(10, 8))
scatter_hierarchical = ax_hier.scatter(
    X_pca_cluster_viz[:, 0],
    X_pca_cluster_viz[:, 1],
    c=clusters_hierarchical,
    cmap='viridis',
    alpha=0.7,
    s=50,
)
ax_hier.set_title(f'Hierarchical Clustering (Clusters={n_clusters_hierarchical}) visualized with PCA')
ax_hier.set_xlabel('Principal Component 1')
ax_hier.set_ylabel('Principal Component 2')
fig_hier.colorbar(scatter_hierarchical, ax=ax_hier, label='Cluster')
ax_hier.grid(True)
plt.show()

# Optional sanity check against the known Wine classes
# (rows: true class; columns: assigned cluster).
print("\nComparing Hierarchical Clusters to Actual Wine Classes:")
comparison_hierarchical_actual = pd.crosstab(wine_df['target'], X_cluster['hierarchical_cluster'])
print(comparison_hierarchical_actual)


# ---- Dendrogram of the full merge hierarchy ----
print("\nGenerating Dendrogram (this might take a moment for larger datasets)...")
# Condensed pairwise Euclidean distances, then Ward linkage (variance-minimizing).
distance_matrix = pdist(X_scaled_cluster, metric='euclidean')
linkage_matrix = hierarchy.linkage(distance_matrix, method='ward')

fig_dendro, ax_dendro = plt.subplots(figsize=(14, 8))
dendrogram = hierarchy.dendrogram(
    linkage_matrix,
    labels=wine_df.index.tolist(),  # leaves are labelled with the wine_df index
    leaf_rotation=90,
    leaf_font_size=8,
    ax=ax_dendro,
)
ax_dendro.set_title('Hierarchical Clustering Dendrogram')
ax_dendro.set_xlabel('Sample Index')
ax_dendro.set_ylabel('Distance')
plt.show()
print("The dendrogram helps visualize the merging of clusters.")


# --- 8.4 DBSCAN Clustering ---
print("\n--- 8.4 DBSCAN Clustering ---")
# DBSCAN is density-based: it does not take a cluster count, only a
# neighbourhood radius (eps) and a minimum neighbourhood size (min_samples).
# Samples that belong to no dense region are labelled -1 (noise).

# Both parameters are named once so the progress message and the model can
# never drift apart when they are tuned.
# NOTE(review): eps=0.5 on standardized multi-feature data may be too small and
# label most points as noise - confirm with a k-distance ("elbow") plot.
dbscan_eps = 0.5
dbscan_min_samples = 5
print(f"\nApplying DBSCAN Clustering (using eps={dbscan_eps}, min_samples={dbscan_min_samples})...")
dbscan = DBSCAN(eps=dbscan_eps, min_samples=dbscan_min_samples)
clusters_dbscan = dbscan.fit_predict(X_scaled_cluster)

# Keep the labels alongside the raw features (-1 indicates noise).
X_cluster['dbscan_cluster'] = clusters_dbscan

print("\nDBSCAN Clustering Results:")
print("\nCluster distribution (including noise -1):")
print(X_cluster['dbscan_cluster'].value_counts())

# ---- Silhouette score, computed on non-noise points only ----
found_labels = set(clusters_dbscan)
has_noise = -1 in found_labels
n_real_clusters = len(found_labels - {-1})
non_noise_indices = clusters_dbscan != -1

if n_real_clusters > 1 and has_noise:
    # Drop noise before scoring: noise is not a cluster.
    silhouette_avg_dbscan = silhouette_score(
        X_scaled_cluster[non_noise_indices], clusters_dbscan[non_noise_indices]
    )
    print(f"\nSilhouette Score (DBSCAN, excluding noise): {silhouette_avg_dbscan:.4f}")
elif n_real_clusters > 1:
    silhouette_avg_dbscan = silhouette_score(X_scaled_cluster, clusters_dbscan)
    print(f"\nSilhouette Score (DBSCAN): {silhouette_avg_dbscan:.4f}")
elif n_real_clusters == 1 and has_noise:
    print("\nNot enough non-noise clusters (>1) to calculate Silhouette Score.")
else:
    print("\nSilhouette Score cannot be calculated (only one cluster or all noise).")


# ---- Scatter on the PCA projection computed in the K-Means cell ----
plt.figure(figsize=(10, 8))
# Sorted so colours and legend entries come out in a stable order
# (iterating a bare set gives no ordering guarantee).
unique_labels = sorted(found_labels)
colors = [plt.cm.viridis(each) for each in np.linspace(0, 1, len(unique_labels))]

for k, col in zip(unique_labels, colors):
    if k == -1:
        # Black used for noise.
        col = [0, 0, 0, 1]

    class_member_mask = (clusters_dbscan == k)

    xy = X_pca_cluster_viz[class_member_mask]
    plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
             markeredgecolor='k', markersize=6, label=f'Cluster {k}')

plt.title('DBSCAN Clustering visualized with PCA')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend()
plt.grid(True)
plt.show()

# Optional sanity check against the known Wine classes. The -1 column, if
# present, counts samples DBSCAN left unassigned.
print("\nComparing DBSCAN Clusters to Actual Wine Classes:")
comparison_dbscan_actual = pd.crosstab(wine_df['target'], X_cluster['dbscan_cluster'])
print(comparison_dbscan_actual)


print("\nClustering analysis examples complete.")
print(f"{'='*80}")


In [None]:
# ## 9. Advanced Modeling with CatBoost

# %% tags=[]
section_rule = '=' * 80
print(f"\n{section_rule}", "PART 9: ADVANCED MODELING WITH CATBOOST", section_rule, sep="\n")

# CatBoost is a gradient-boosting library built on decision trees, with
# native support for categorical features.
#
# The examples use the synthetic e-commerce customers and start with a
# regression task: predicting each customer's 'total_spend'.
#
# Input is the `customers_fe` DataFrame (the engineered-feature version of
# the customer table). The categorical columns are passed to CatBoost by name
# rather than being pre-encoded.

print("\n--- CatBoost Modeling Examples (Predicting Customer Total Spend) ---")


In [None]:
# --- 9.1 Data Preparation for CatBoost ---
print("\n--- 9.1 Data Preparation for CatBoost ---")

# Build the feature matrix from the engineered customer data by dropping:
# identifiers, the target, columns the original author flagged as derived or
# redundant, and artefacts of earlier notebook sections (scaled copies,
# encodings, projections, cluster labels).
# errors='ignore' tolerates columns that were never created in this run.
X_cb = customers_fe.drop(columns=[
    'customer_id', 'lifetime_value', 'avg_order_value', 'loyalty_score', 'recency_days',  # Original features
    'signup_date', 'first_purchase_date', 'last_purchase_date', 'last_visit_timestamp',  # Date columns
    'total_spend',  # Target variable
    'lifetime_value_sq', 'avg_order_value_sq', 'lifetime_avg_order_interaction',  # Polynomial features
    'age_loyalty_interaction',  # Interaction feature
    'total_spend_scaled_std', 'recency_scaled_std', 'loyalty_score_scaled_minmax', 'avg_order_value_scaled_minmax',  # Scaled features (trees don't need scaling)
    'age_group_numeric',  # Numeric encoding of age group (keep the original categorical instead)
    'customer_segment_encoded',  # Label-encoded segment (keep the original categorical instead)
    'loyalty_level', 'session_duration_category',  # Binned features
    'PCA1', 'PCA2', 'TSNE1', 'TSNE2',  # Dimensionality reduction features
    'kmeans_cluster', 'hierarchical_cluster', 'dbscan_cluster'  # Clustering labels
], errors='ignore').copy()

y_cb = customers_fe['total_spend'].copy()

# Categorical columns are detected on the full matrix, before splitting, so
# train and test are treated identically. Both plain-object and pandas
# 'category' columns count as categorical.
categorical_features_cb_names = X_cb.select_dtypes(include=['object', 'category']).columns.tolist()

# CatBoost accepts NaN in numeric features but requires categorical values to
# be strings or integers. Missing categories therefore become an explicit
# 'missing' level and every categorical column is stored as plain strings.
for cat_col in categorical_features_cb_names:
    as_object = X_cb[cat_col].astype(object)
    X_cb[cat_col] = as_object.where(as_object.notna(), 'missing').astype(str)

print(f"Features shape: {X_cb.shape}")
print(f"Target shape: {y_cb.shape}")
print("\nFeatures used for CatBoost:")
print(X_cb.columns.tolist())


# 80/20 train/test split, seeded for repeatability.
X_train_cb, X_test_cb, y_train_cb, y_test_cb = train_test_split(X_cb, y_cb, test_size=0.2, random_state=42)

print(f"Training data shape: {X_train_cb.shape}, {y_train_cb.shape}")
print(f"Testing data shape: {X_test_cb.shape}, {y_test_cb.shape}")

# These names are handed to CatBoost via `cat_features` in the training cell.
print(f"\nIdentified categorical features for CatBoost: {categorical_features_cb_names}")


In [None]:
# --- 9.2 Training a CatBoost Regressor ---
print("\n--- 9.2 Training a CatBoost Regressor ---")

# Early stopping needs a monitoring set. Using the test set for that would
# let the test data pick the number of trees and make the reported test
# metrics optimistic, so a validation slice is carved out of the training
# data instead and the test set is only touched for the final evaluation.
X_fit_cb, X_val_cb, y_fit_cb, y_val_cb = train_test_split(
    X_train_cb, y_train_cb, test_size=0.2, random_state=42
)

# Up to 100 trees of depth 6, optimizing and monitoring RMSE. Logging is
# silenced here (a single verbosity setting instead of conflicting ones on
# the constructor and on fit).
catboost_regressor = cb.CatBoostRegressor(
    iterations=100,
    learning_rate=0.1,
    depth=6,
    loss_function='RMSE',
    eval_metric='RMSE',
    random_state=42,
    verbose=False
)

# Train, telling CatBoost which columns are categorical.
print("\nTraining CatBoost Regressor...")
catboost_regressor.fit(
    X_fit_cb,
    y_fit_cb,
    cat_features=categorical_features_cb_names,
    eval_set=(X_val_cb, y_val_cb),  # validation slice drives early stopping
    early_stopping_rounds=10  # stop after 10 rounds without validation improvement
)
print("CatBoost Regressor training complete.")


# Predict on the untouched test set.
y_pred_cb = catboost_regressor.predict(X_test_cb)

# Test-set error metrics.
mse_cb = mean_squared_error(y_test_cb, y_pred_cb)
rmse_cb = np.sqrt(mse_cb)
r2_cb = r2_score(y_test_cb, y_pred_cb)

print("\nCatBoost Regressor Results on Test Set:")
print(f"  Mean Squared Error (MSE): {mse_cb:.4f}")
print(f"  Root Mean Squared Error (RMSE): {rmse_cb:.4f}")
print(f"  R-squared (R2): {r2_cb:.4f}")


In [None]:
# --- 9.3 CatBoost Feature Importance ---
print("\n--- 9.3 CatBoost Feature Importance ---")

# prettified=True returns a table; the 'Feature Id' and 'Importances'
# columns are the ones used below.
feature_importances_cb = catboost_regressor.get_feature_importance(prettified=True)
print("\nCatBoost Feature Importances:")
print(feature_importances_cb)

# Bar chart, most important feature first.
feature_importances_cb_sorted = feature_importances_cb.sort_values(by='Importances', ascending=False)

fig_importance, ax_importance = plt.subplots(figsize=(12, 8))
ax_importance.bar(
    feature_importances_cb_sorted['Feature Id'],
    feature_importances_cb_sorted['Importances'],
    color='steelblue',
)
ax_importance.set_title('CatBoost Regressor Feature Importances')
ax_importance.set_xlabel('Features')
ax_importance.set_ylabel('Importance')
plt.xticks(rotation=45, ha='right')  # slanted labels so long feature names stay readable
plt.tight_layout()
plt.show()


In [None]:

# --- 9.4 Training a CatBoost Classifier (Example) ---
print("\n--- 9.4 Training a CatBoost Classifier (Example) ---")
# Binary classification task built from the customers data: is the customer
# in the 'VIP' segment?

# The target is derived as a standalone Series instead of being written back
# into `customers_fe`. Mutating the shared frame would make an extra 'is_vip'
# column leak into the regression features of section 9.1 on any re-run.
y_cb_clf = (customers_fe['customer_segment'] == 'VIP').astype(int).rename('is_vip')

# Same exclusions as the regression example, plus the segment column the
# target is derived from. 'is_vip' stays in the list (harmless thanks to
# errors='ignore') in case an earlier run already added it to the frame.
X_cb_clf = customers_fe.drop(columns=[
    'customer_id', 'lifetime_value', 'avg_order_value', 'loyalty_score', 'recency_days',  # Original features
    'signup_date', 'first_purchase_date', 'last_purchase_date', 'last_visit_timestamp',  # Date columns
    'total_spend',  # Original regression target
    'is_vip',  # Classification target, if present
    'customer_segment',  # Source of the target
    'lifetime_value_sq', 'avg_order_value_sq', 'lifetime_avg_order_interaction',
    'age_loyalty_interaction',
    'total_spend_scaled_std', 'recency_scaled_std', 'loyalty_score_scaled_minmax', 'avg_order_value_scaled_minmax',
    'age_group_numeric', 'customer_segment_encoded',
    'loyalty_level', 'session_duration_category',
    'PCA1', 'PCA2', 'TSNE1', 'TSNE2',
    'kmeans_cluster', 'hierarchical_cluster', 'dbscan_cluster'
], errors='ignore').copy()

# Categorical columns (object or pandas 'category') are detected before the
# split. CatBoost requires categorical values to be strings or integers, so
# missing categories become an explicit 'missing' level.
categorical_features_cb_clf_names = X_cb_clf.select_dtypes(include=['object', 'category']).columns.tolist()
for cat_col in categorical_features_cb_clf_names:
    as_object = X_cb_clf[cat_col].astype(object)
    X_cb_clf[cat_col] = as_object.where(as_object.notna(), 'missing').astype(str)

print(f"\nFeatures shape for Classifier: {X_cb_clf.shape}")
print(f"Target shape for Classifier: {y_cb_clf.shape}")
print("\nClass distribution for 'is_vip':")
print(y_cb_clf.value_counts())

# Stratified 80/20 train/test split keeps the VIP share equal in both parts.
X_train_cb_clf, X_test_cb_clf, y_train_cb_clf, y_test_cb_clf = train_test_split(
    X_cb_clf, y_cb_clf, test_size=0.2, random_state=42, stratify=y_cb_clf
)

print(f"\nTraining data shape for Classifier: {X_train_cb_clf.shape}, {y_train_cb_clf.shape}")
print(f"Testing data shape for Classifier: {X_test_cb_clf.shape}, {y_test_cb_clf.shape}")

print(f"\nIdentified categorical features for CatBoost Classifier: {categorical_features_cb_clf_names}")

# Early stopping is monitored on a validation slice of the training data, so
# the test set is only used for the final evaluation and its metrics are not
# biased by model selection.
X_fit_cb_clf, X_val_cb_clf, y_fit_cb_clf, y_val_cb_clf = train_test_split(
    X_train_cb_clf, y_train_cb_clf, test_size=0.2, random_state=42, stratify=y_train_cb_clf
)

# Up to 100 trees of depth 6, Logloss objective, accuracy as the monitored
# metric; logging silenced with a single verbosity setting.
catboost_classifier = cb.CatBoostClassifier(
    iterations=100,
    learning_rate=0.1,
    depth=6,
    loss_function='Logloss',
    eval_metric='Accuracy',  # Or 'AUC'
    random_state=42,
    verbose=False
)

print("\nTraining CatBoost Classifier...")
catboost_classifier.fit(
    X_fit_cb_clf,
    y_fit_cb_clf,
    cat_features=categorical_features_cb_clf_names,
    eval_set=(X_val_cb_clf, y_val_cb_clf),  # validation slice drives early stopping
    early_stopping_rounds=10
)
print("CatBoost Classifier training complete.")

# Hard labels plus the probability of the positive class (column 1).
y_pred_cb_clf = catboost_classifier.predict(X_test_cb_clf)
y_prob_cb_clf = catboost_classifier.predict_proba(X_test_cb_clf)[:, 1]

# Test-set metrics.
accuracy_cb_clf = accuracy_score(y_test_cb_clf, y_pred_cb_clf)
conf_matrix_cb_clf = confusion_matrix(y_test_cb_clf, y_pred_cb_clf)
class_report_cb_clf = classification_report(y_test_cb_clf, y_pred_cb_clf)
roc_auc_cb_clf = roc_auc_score(y_test_cb_clf, y_prob_cb_clf)

print("\nCatBoost Classifier Results on Test Set:")
print(f"  Accuracy: {accuracy_cb_clf:.4f}")
print("\n  Confusion Matrix:")
print(conf_matrix_cb_clf)
print("\n  Classification Report:")
print(class_report_cb_clf)
print(f"  ROC AUC Score: {roc_auc_cb_clf:.4f}")


# ROC curve for the positive (VIP) class.
fpr_cb, tpr_cb, thresholds_cb = roc_curve(y_test_cb_clf, y_prob_cb_clf)

plt.figure(figsize=(8, 6))
plt.plot(fpr_cb, tpr_cb, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc_cb_clf:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')  # chance diagonal
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('CatBoost Classifier ROC Curve (Is VIP)')
plt.legend(loc="lower right")
plt.grid(True)
plt.show()


print("\nAdvanced modeling with CatBoost examples complete.")
print(f"{'='*80}")


In [None]:
# ## 10. Time Series Forecasting with Prophet

# %% tags=[]
section_rule = '=' * 80
print(f"\n{section_rule}", "PART 10: TIME SERIES FORECASTING WITH PROPHET", section_rule, sep="\n")

# Prophet is a time-series forecasting library aimed at series with strong
# seasonal patterns and several seasons of history.
#
# The example forecasts total daily sales from the synthetic e-commerce
# transactions.

print("\n--- Time Series Forecasting with Prophet (Forecasting Daily Sales) ---")


In [None]:
# --- 10.1 Data Preparation for Prophet ---
print("\n--- 10.1 Data Preparation for Prophet ---")

# Prophet expects a two-column frame: 'ds' (datetime) and 'y' (the numeric
# value to forecast).

# Make sure the transaction timestamps are real datetimes.
transactions_fe['date'] = pd.to_datetime(transactions_fe['date'])

# Sum `total_price` per calendar day, relabel the two resulting columns to
# Prophet's names, then turn the calendar dates back into datetimes.
transaction_day = transactions_fe['date'].dt.date
daily_sales = (
    transactions_fe
    .groupby(transaction_day)['total_price']
    .sum()
    .reset_index()
    .set_axis(['ds', 'y'], axis=1)
)
daily_sales['ds'] = pd.to_datetime(daily_sales['ds'])

print("Prepared daily sales data for Prophet (first 5 rows):")
print(daily_sales.head())
print(f"\nTotal number of days with sales data: {len(daily_sales)}")


In [None]:
# --- 10.2 Training the Prophet Model ---
print("\n--- 10.2 Training the Prophet Model ---")

# Seasonality switches for the model: yearly and weekly on, daily off
# (the series is already aggregated to one value per day).
prophet_seasonality = {
    'yearly_seasonality': True,
    'weekly_seasonality': True,
    'daily_seasonality': False,
}
model_prophet = Prophet(**prophet_seasonality)

# Fit on the historical daily sales.
print("\nTraining Prophet model...")
model_prophet.fit(daily_sales)
print("Prophet model training complete.")


In [None]:
# --- 10.3 Making Future Predictions ---
print("\n--- 10.3 Making Future Predictions ---")

# Date frame to predict on, built by the model with 30 extra periods
# requested beyond the fitted history.
future_dates = model_prophet.make_future_dataframe(periods=30)

print("\nFuture dates DataFrame (first 5 rows):")
print(future_dates.head())
print("\nFuture dates DataFrame (last 5 rows):")
print(future_dates.tail())


# Predict for every date in the frame.
print("\nMaking future predictions...")
forecast = model_prophet.predict(future_dates)
print("Future predictions complete.")

# Show only the point forecast ('yhat') and its interval bounds
# ('yhat_lower', 'yhat_upper') next to the date.
forecast_summary_cols = ['ds', 'yhat', 'yhat_lower', 'yhat_upper']

print("\nForecast DataFrame (first 5 rows):")
print(forecast[forecast_summary_cols].head())
print("\nForecast DataFrame (last 5 rows):")
print(forecast[forecast_summary_cols].tail())


In [None]:

# --- 10.4 Visualizing the Forecast ---
print("\n--- 10.4 Visualizing the Forecast ---")

# Prophet's built-in forecast plot; the title and axis labels are then set on
# the figure's axes.
print("\nPlotting forecast...")
fig1 = model_prophet.plot(forecast)
forecast_ax = fig1.gca()
forecast_ax.set_title('Daily Sales Forecast with Prophet')
forecast_ax.set_xlabel('Date')
forecast_ax.set_ylabel('Total Sales')
plt.show()
print("Forecast plot displayed.")

# Prophet's component plot (trend and the enabled seasonalities).
print("\nPlotting forecast components...")
fig2 = model_prophet.plot_components(forecast)
plt.show()
print("Forecast components plot displayed.")


In [None]:
# --- 10.5 Evaluating Forecast Performance (Optional) ---
print("\n--- 10.5 Evaluating Forecast Performance (Optional) ---")
# Prophet's `cross_validation` performs rolling-origin evaluation: it refits
# the model at a series of cutoff dates and forecasts forward from each one.
#   initial - length of the first training window
#   period  - spacing between successive cutoff dates
#   horizon - how far ahead each forecast is evaluated
#
# Setup used here: the final ~90 days of history are reserved for evaluation
# (everything before them is the initial training window), each forecast
# looks 30 days ahead, and cutoffs are placed 30 days apart.
print("\nPerforming Prophet cross-validation (this might take a moment)...")

data_start_date = daily_sales['ds'].min()
data_end_date = daily_sales['ds'].max()
initial_date = data_end_date - pd.Timedelta(days=90)
horizon_date = pd.Timedelta(days=30)

# First observed day on or after the 90-days-before-end mark. That mark itself
# may have no sales row, so snap to the next date that does.
initial_cutoff = daily_sales.loc[daily_sales['ds'] >= initial_date, 'ds'].min()

# Prophet takes these durations as strings such as '90 days'.
initial_days = (initial_cutoff - data_start_date).days
initial_prophet = f"{initial_days} days"
period_prophet = f"{horizon_date.days} days"
horizon_prophet = f"{horizon_date.days} days"


if initial_days <= 0:
    # History spans 90 days or less: nothing would be left to train on.
    print("\nSkipping Prophet cross-validation: not enough history before the 90-day evaluation window.")
else:
    try:
        from prophet.diagnostics import cross_validation, performance_metrics

        # One refit per cutoff, run in parallel worker processes.
        df_cv = cross_validation(
            model_prophet,
            initial=initial_prophet,
            period=period_prophet,
            horizon=horizon_prophet,
            parallel="processes",
        )

        print("\nProphet Cross-Validation Results (first 5 rows):")
        print(df_cv.head())

        # Aggregate the per-forecast errors into summary metrics.
        df_p = performance_metrics(df_cv)

        print("\nProphet Performance Metrics (first 5 rows):")
        print(df_p.head())

        # Optional: plot a metric against the forecast horizon.
        # from prophet.plot import plot_cross_validation_metric
        # fig3 = plot_cross_validation_metric(df_cv, metric='rmse')
        # plt.title('Prophet Cross-Validation RMSE vs Horizon')
        # plt.show()

    except ImportError:
        print("\nProphet diagnostics (cross_validation, performance_metrics) not available. Skipping evaluation example.")
    except ValueError as e:
        print(f"\nCould not perform Prophet cross-validation: {e}. Check initial, period, and horizon settings relative to your data.")
    except Exception as e:
        # Best-effort demo: report anything else (e.g. worker-process failures)
        # rather than aborting the notebook.
        print(f"\nAn error occurred during Prophet cross-validation: {e}")


print("\nTime series forecasting with Prophet examples complete.")
print(f"{'='*80}")


# %% [markdown]
# ## Conclusion
#
# This document provided a comprehensive overview and practical examples of various advanced data science techniques using Python libraries like Pandas, NumPy, Matplotlib, SciPy, Scikit-learn, CatBoost, and Prophet. It covered data generation, cleaning, EDA, feature engineering, regression, classification, clustering, advanced modeling, and time series forecasting, all within a single environment.
#
# This toolkit serves as a valuable resource for senior data product analysts to enhance their analytical capabilities and tackle complex data challenges. Remember to adapt these techniques to your specific datasets and business problems.
