{ "cells": [ { "cell_type": "code", "execution_count": null, "id": "864ae46f", "metadata": {}, "outputs": [], "source": [ "\"\"\"\n", "# Advanced Data Science Toolkit\n", "# This comprehensive guide includes examples of:\n", "# - Data generation/loading\n", "# - Data Preparation and Cleaning (NEW SECTION)\n", "# - EDA (Exploratory Data Analysis)\n", "# - Feature engineering\n", "# - Visualization with Matplotlib only\n", "# - Statistical analysis with SciPy\n", "# - Regression, Classification, and Clustering techniques\n", "# - Time series forecasting with Prophet\n", "# - Advanced modeling with CatBoost\n", "# - Advanced Data Manipulation (Pivot, Groupby, Concat, Join, Map, Zip)\n", "\"\"\"" ] }, { "cell_type": "code", "execution_count": null, "id": "f25f57f7", "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "# Removed import seaborn as sns\n", "from scipy import stats\n", "from scipy.cluster import hierarchy\n", "from scipy.spatial.distance import pdist\n", "from sklearn.datasets import fetch_california_housing, load_wine, load_diabetes\n", "from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV\n", "from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, LabelEncoder\n", "from sklearn.compose import ColumnTransformer\n", "from sklearn.pipeline import Pipeline\n", "from sklearn.impute import SimpleImputer, KNNImputer # KNNImputer is useful for imputation\n", "from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression\n", "from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, LogisticRegression\n", "from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingRegressor\n", "from sklearn.svm import SVR, SVC\n", "from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier\n", "from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering\n", "from sklearn.decomposition import PCA\n", "from sklearn.manifold import TSNE\n", "from sklearn.metrics import (mean_squared_error, r2_score, accuracy_score, confusion_matrix,\n", " classification_report, silhouette_score, roc_curve, roc_auc_score,\n", " precision_recall_curve, average_precision_score)\n", "import catboost as cb\n", "from prophet import Prophet\n", "import warnings\n", "import datetime as dt\n", "from dateutil.relativedelta import relativedelta\n", "import itertools # Useful for combinations/permutations\n", "\n", "# Suppress warnings for cleaner output\n", "warnings.filterwarnings('ignore')\n", "\n", "# Set aesthetics for plots using Matplotlib defaults\n", "plt.rcParams['figure.figsize'] = (12, 8)\n", "plt.rcParams['font.size'] = 12" ] }, { "cell_type": "code", "execution_count": null, "id": "7735d99d", "metadata": {}, "outputs": [], "source": [ "# ## 1. 
Data Generation and Loading\n", "\n", "# %% tags=[]\n", "def generate_synthetic_ecommerce_data(n_customers=1000, start_date='2023-01-01', end_date='2023-12-31'):\n", " \"\"\"\n", " Generate synthetic e-commerce dataset with:\n", " - Customer demographics\n", " - Purchase history\n", " - Product information\n", " - Marketing campaign exposure\n", " - Website behavior\n", "\n", " Parameters:\n", " -----------\n", " n_customers : int\n", " Number of customers to generate\n", " start_date : str\n", " Start date for transaction history\n", " end_date : str\n", " End date for transaction history\n", "\n", " Returns:\n", " --------\n", " dict of pandas DataFrames containing:\n", " - customers: Customer demographic information\n", " - products: Product catalog\n", " - transactions: Purchase records\n", " - website_visits: Web analytics data\n", " - marketing: Marketing campaign information\n", " - campaign_exposures: Records of customer exposure to campaigns\n", " \"\"\"\n", " np.random.seed(42) # For reproducibility\n", "\n", " # Generate date range for transactions\n", " start = pd.to_datetime(start_date)\n", " end = pd.to_datetime(end_date)\n", " days = (end - start).days + 1\n", "\n", " # Customer demographics\n", " age_groups = ['18-24', '25-34', '35-44', '45-54', '55-64', '65+']\n", " genders = ['Male', 'Female', 'Non-binary', 'Prefer not to say']\n", " locations = ['North', 'South', 'East', 'West', 'Central']\n", " segments = ['New', 'Regular', 'VIP', 'Churned', 'Reactivated']\n", "\n", " customers = pd.DataFrame({\n", " 'customer_id': range(1, n_customers + 1),\n", " 'age_group': np.random.choice(age_groups, n_customers, p=[0.15, 0.25, 0.25, 0.15, 0.1, 0.1]),\n", " 'gender': np.random.choice(genders, n_customers, p=[0.45, 0.48, 0.05, 0.02]),\n", " 'location': np.random.choice(locations, n_customers),\n", " 'signup_date': [start + pd.Timedelta(days=np.random.randint(0, days)) for _ in range(n_customers)],\n", " 'customer_segment': np.random.choice(segments, n_customers, p=[0.2, 0.5, 0.1, 0.15, 0.05]),\n", " })\n", "\n", " # Add some realistic customer metrics\n", " customers['lifetime_value'] = np.random.gamma(shape=5, scale=50, size=n_customers)\n", " customers['avg_order_value'] = np.random.gamma(shape=3, scale=20, size=n_customers)\n", " customers['loyalty_score'] = np.random.randint(1, 101, size=n_customers)\n", " customers['recency_days'] = np.random.geometric(p=0.1, size=n_customers)\n", "\n", " # Add missing values to simulate real-world data\n", " mask = np.random.random(n_customers) < 0.05\n", " customers.loc[mask, 'age_group'] = np.nan\n", " mask = np.random.random(n_customers) < 0.02\n", " customers.loc[mask, 'gender'] = np.nan\n", " # Add some missing values to numeric columns too\n", " mask = np.random.random(n_customers) < 0.03\n", " customers.loc[mask, 'lifetime_value'] = np.nan\n", " mask = np.random.random(n_customers) < 0.01\n", " customers.loc[mask, 'loyalty_score'] = np.nan\n", "\n", "\n", " # Product catalog\n", " product_categories = ['Electronics', 'Clothing', 'Home', 'Beauty', 'Sports', 'Books', 'Food', 'Toys']\n", " n_products = 100\n", "\n", " products = pd.DataFrame({\n", " 'product_id': range(1, n_products + 1),\n", " 'product_name': [f'Product_{i}' for i in range(1, n_products + 1)],\n", " 'category': np.random.choice(product_categories, n_products),\n", " 'price': np.random.uniform(5, 500, n_products).round(2),\n", " 'stock_level': np.random.randint(0, 1000, n_products),\n", " 'rating': np.random.uniform(1, 5, n_products).round(1),\n", " 'discount_eligible': 
np.random.choice([True, False], n_products, p=[0.3, 0.7])\n", " })\n", " # Add some missing values to product data\n", " mask = np.random.random(n_products) < 0.05\n", " products.loc[mask, 'price'] = np.nan\n", " mask = np.random.random(n_products) < 0.02\n", " products.loc[mask, 'rating'] = np.nan\n", "\n", "\n", " # Transactions (purchase history)\n", " # Let's assume each customer makes between 0 and 10 purchases\n", " transactions_list = []\n", "\n", " for cust_id in customers['customer_id']:\n", " n_purchases = np.random.randint(0, 11)\n", " if n_purchases > 0:\n", " purchase_dates = [start + pd.Timedelta(days=np.random.randint(0, days)) for _ in range(n_purchases)]\n", " purchase_dates.sort() # Ensure chronological order\n", "\n", " for date in purchase_dates:\n", " # Each purchase can include 1-5 products\n", " n_items = np.random.randint(1, 6)\n", " products_bought = np.random.choice(products['product_id'], n_items)\n", "\n", " for prod_id in products_bought:\n", " # Handle potential NaN in product price before using it\n", " product_price_series = products.loc[products['product_id'] == prod_id, 'price']\n", " if not product_price_series.empty and not pd.isna(product_price_series.values[0]):\n", " product_price = product_price_series.values[0]\n", " else:\n", " # Skip transaction if product price is missing\n", " continue\n", "\n", " # Apply random discount\n", " if products.loc[products['product_id'] == prod_id, 'discount_eligible'].values[0]:\n", " discount = np.random.choice([0, 0.1, 0.2, 0.3], p=[0.4, 0.3, 0.2, 0.1])\n", " else:\n", " discount = 0\n", "\n", " quantity = np.random.randint(1, 4)\n", "\n", " transaction = {\n", " 'transaction_id': len(transactions_list) + 1,\n", " 'customer_id': cust_id,\n", " 'product_id': prod_id,\n", " 'date': date,\n", " 'quantity': quantity,\n", " 'unit_price': product_price,\n", " 'discount': discount,\n", " 'total_price': quantity * product_price * (1 - discount),\n", " 'payment_method': np.random.choice(['Credit Card', 'PayPal', 'Apple Pay', 'Bank Transfer'], p=[0.6, 0.2, 0.15, 0.05])\n", " }\n", " transactions_list.append(transaction)\n", "\n", " transactions = pd.DataFrame(transactions_list)\n", " # Add some missing values to transaction data\n", " mask = np.random.random(len(transactions)) < 0.01\n", " transactions.loc[mask, 'total_price'] = np.nan\n", " mask = np.random.random(len(transactions)) < 0.005\n", " transactions.loc[mask, 'payment_method'] = np.nan\n", "\n", "\n", " # Web behavior data\n", " visit_types = ['Product View', 'Cart Addition', 'Checkout Initiation', 'Purchase', 'Blog View', 'Search']\n", " devices = ['Desktop', 'Mobile', 'Tablet']\n", " channels = ['Organic Search', 'Direct', 'Referral', 'Social Media', 'Email', 'Paid Search']\n", "\n", " # Generate 5x more website visits than transactions\n", " n_visits = len(transactions) * 5\n", "\n", " website_visits = pd.DataFrame({\n", " 'visit_id': range(1, n_visits + 1),\n", " 'customer_id': np.random.choice(customers['customer_id'], n_visits),\n", " 'timestamp': [start + pd.Timedelta(seconds=np.random.randint(0, days * 86400)) for _ in range(n_visits)],\n", " 'visit_type': np.random.choice(visit_types, n_visits, p=[0.4, 0.2, 0.1, 0.1, 0.1, 0.1]),\n", " 'product_id': np.random.choice(list(products['product_id']) + [np.nan], n_visits, p=[0.8/n_products] * n_products + [0.2]), # Allow None/NaN for non-product pages\n", " 'device': np.random.choice(devices, n_visits, p=[0.5, 0.4, 0.1]),\n", " 'channel': np.random.choice(channels, n_visits),\n", " 'session_duration': 
np.random.exponential(300, n_visits).round(), # in seconds\n", " 'pages_viewed': np.random.geometric(p=0.3, size=n_visits)\n", " })\n", "\n", " # Sort by timestamp\n", " website_visits = website_visits.sort_values('timestamp').reset_index(drop=True)\n", " # Add some missing values to website visits\n", " mask = np.random.random(len(website_visits)) < 0.04\n", " website_visits.loc[mask, 'session_duration'] = np.nan\n", " mask = np.random.random(len(website_visits)) < 0.03\n", " website_visits.loc[mask, 'device'] = np.nan\n", "\n", "\n", " # Marketing campaigns\n", " campaign_types = ['Email', 'Social Media', 'Search Ads', 'Display Ads', 'Influencer', 'SMS']\n", " campaign_names = [f'Campaign_{i}' for i in range(1, 11)]\n", "\n", " marketing = pd.DataFrame({\n", " 'campaign_id': range(1, len(campaign_names) + 1),\n", " 'campaign_name': campaign_names,\n", " 'campaign_type': np.random.choice(campaign_types, len(campaign_names)),\n", " 'start_date': [start + pd.Timedelta(days=np.random.randint(0, days//2)) for _ in range(len(campaign_names))],\n", " 'budget': np.random.uniform(1000, 10000, len(campaign_names)).round(2),\n", " 'target_segment': np.random.choice(segments + ['All'], len(campaign_names))\n", " })\n", "\n", " # Calculate end dates (2 weeks to 2 months after start date)\n", " marketing['end_date'] = marketing['start_date'] + pd.to_timedelta(np.random.randint(14, 61, len(campaign_names)), unit='d')\n", " # Add some missing values to marketing data\n", " mask = np.random.random(len(marketing)) < 0.1\n", " marketing.loc[mask, 'budget'] = np.nan\n", " \n", " \n", " \n", "\n", "\n", " # Campaign exposure\n", " exposure_list = []\n", " for cust_id in customers['customer_id']:\n", " # Each customer is exposed to 0-5 campaigns\n", " n_exposures = np.random.randint(0, 6)\n", " if n_exposures > 0:\n", " campaign_ids = np.random.choice(marketing['campaign_id'], n_exposures, replace=False)\n", "\n", " for camp_id in campaign_ids:\n", " campaign_start = pd.to_datetime(marketing.loc[marketing['campaign_id'] == camp_id, 'start_date'].values[0])\n", " campaign_end = pd.to_datetime(marketing.loc[marketing['campaign_id'] == camp_id, 'end_date'].values[0])\n", "\n", " # Ensure exposure date is within campaign dates\n", " \n", " \n", " if campaign_start <= campaign_end:\n", " delta = (campaign_end - campaign_start).days # Properly extract days\n", " exposure_date = campaign_start + pd.Timedelta(days=np.random.randint(0, delta + 1))\n", " else:\n", " exposure_date = campaign_start # Fallback if end < start\n", "\n", "\n", " interaction = np.random.choice(['Viewed', 'Clicked', 'Converted', 'Ignored'], p=[0.4, 0.3, 0.1, 0.2])\n", "\n", " exposure = {\n", " 'exposure_id': len(exposure_list) + 1,\n", " 'customer_id': cust_id,\n", " 'campaign_id': camp_id,\n", " 'exposure_date': exposure_date,\n", " 'interaction': interaction\n", " }\n", " exposure_list.append(exposure)\n", "\n", " campaign_exposures = pd.DataFrame(exposure_list)\n", "\n", " # Add campaign names for easier reference\n", " campaign_exposures = campaign_exposures.merge(\n", " marketing[['campaign_id', 'campaign_name']],\n", " on='campaign_id',\n", " how='left'\n", " )\n", " # Add some missing values to campaign exposures\n", " mask = np.random.random(len(campaign_exposures)) < 0.02\n", " campaign_exposures.loc[mask, 'interaction'] = np.nan\n", "\n", "\n", " return {\n", " 'customers': customers,\n", " 'products': products,\n", " 'transactions': transactions,\n", " 'website_visits': website_visits,\n", " 'marketing': marketing,\n", " 
'campaign_exposures': campaign_exposures\n", " }\n", "\n", "# Generate synthetic e-commerce data\n", "ecommerce_data = generate_synthetic_ecommerce_data(n_customers=1000)\n", "\n", "# Display dataset sample overview\n", "print(\"\\n=== SYNTHETIC E-COMMERCE DATASET OVERVIEW ===\")\n", "for name, df in ecommerce_data.items():\n", " print(f\"\\n{name.upper()} - {df.shape[0]} rows, {df.shape[1]} columns\")\n", " print(df.head(2))\n", " print(f\"Columns: {', '.join(df.columns)}\")\n", "\n", "# Let's also load real-world datasets for comparison\n", "print(\"\\n=== LOADING REAL-WORLD DATASETS ===\")\n", "\n", "# Load California housing dataset for regression example\n", "housing_data = fetch_california_housing()\n", "housing_df = pd.DataFrame(housing_data.data, columns=housing_data.feature_names)\n", "housing_df['target'] = housing_data.target # Median house value in 100k USD\n", "print(\"\\nCalifornia Housing Dataset:\")\n", "print(f\"Shape: {housing_df.shape}\")\n", "print(housing_df.head(2))\n", "\n", "# Load wine dataset for classification example\n", "wine_data = load_wine()\n", "wine_df = pd.DataFrame(wine_data.data, columns=wine_data.feature_names)\n", "wine_df['target'] = wine_data.target # Wine type (0, 1, 2)\n", "print(\"\\nWine Dataset:\")\n", "print(f\"Shape: {wine_df.shape}\")\n", "print(wine_df.head(2))\n", "\n", "# Load diabetes dataset for regression example (smaller dataset)\n", "diabetes_data = load_diabetes()\n", "diabetes_df = pd.DataFrame(diabetes_data.data, columns=diabetes_data.feature_names)\n", "diabetes_df['target'] = diabetes_data.target # Quantitative measure of disease progression one year after baseline\n", "print(\"\\nDiabetes Dataset:\")\n", "print(f\"Shape: {diabetes_df.shape}\")\n", "print(diabetes_df.head(2))\n" ] }, { "cell_type": "code", "execution_count": null, "id": "564dbc01", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "3237fb73", "metadata": {}, "outputs": [], "source": [ "# ## 2. 
Data Preparation and Cleaning\n", "\n", "# %% tags=[]\n", "print(f\"\\n{'='*80}\")\n", "print(f\"PART 2: DATA PREPARATION AND CLEANING\")\n", "print(f\"{'='*80}\")\n", "\n", "# Use a copy of the synthetic e-commerce data for cleaning examples\n", "customers_clean = ecommerce_data['customers'].copy()\n", "transactions_clean = ecommerce_data['transactions'].copy()\n", "products_clean = ecommerce_data['products'].copy()\n", "website_visits_clean = ecommerce_data['website_visits'].copy()\n", "campaign_exposures_clean = ecommerce_data['campaign_exposures'].copy()\n", "\n", "print(\"\\n--- Data Cleaning Examples on Synthetic E-commerce Data ---\")\n", "\n", "# --- 2.1 Identifying Missing Values ---\n", "print(\"\\n--- 2.1 Identifying Missing Values ---\")\n", "# Check for missing values in each DataFrame\n", "print(\"\\nMissing values in Customers DataFrame:\")\n", "print(customers_clean.isnull().sum())\n", "\n", "print(\"\\nMissing values in Transactions DataFrame:\")\n", "print(transactions_clean.isnull().sum())\n", "\n", "print(\"\\nMissing values in Products DataFrame:\")\n", "print(products_clean.isnull().sum())\n", "\n", "print(\"\\nMissing values in Website Visits DataFrame:\")\n", "print(website_visits_clean.isnull().sum())\n", "\n", "print(\"\\nMissing values in Campaign Exposures DataFrame:\")\n", "print(campaign_exposures_clean.isnull().sum())\n", "\n", "# Visualize missing values (as done in EDA)\n", "# Use a function to visualize missing values for a given dataframe\n", "# Removed seaborn heatmap and replaced with a simple print of missing values\n", "def print_missing_values_summary(df, title):\n", " \"\"\"Prints a summary of missing values.\"\"\"\n", " missing = df.isnull().sum()\n", " missing_percent = (missing / len(df) * 100).round(2)\n", " missing_df = pd.DataFrame({'Count': missing, 'Percent': missing_percent})\n", " missing_df = missing_df[missing_df['Count'] > 0].sort_values(by='Percent', ascending=False) # Show only columns with missing values, sorted\n", "\n", " print(f\"\\nMissing Values Summary - {title}:\")\n", " if not missing_df.empty:\n", " print(missing_df)\n", " else:\n", " print(\"No missing values found.\")\n", "\n", "# Example usage:\n", "# print_missing_values_summary(customers_clean, \"Customers\")\n", "# print_missing_values_summary(transactions_clean, \"Transactions\")\n", "\n", "\n", "# --- 2.2 Handling Missing Values ---\n", "print(\"\\n--- 2.2 Handling Missing Values ---\")\n", "\n", "# Example 1: Dropping rows with missing values\n", "print(\"\\nExample: Dropping rows with missing values in 'gender' or 'age_group' (Customers)\")\n", "initial_rows = customers_clean.shape[0]\n", "customers_dropped_rows = customers_clean.dropna(subset=['gender', 'age_group']).copy()\n", "print(f\"Initial rows: {initial_rows}, Rows after dropping NaNs in specified columns: {customers_dropped_rows.shape[0]}\")\n", "\n", "# Example 2: Dropping columns with missing values (e.g., if a column has too many NaNs)\n", "# Let's artificially create a column with many NaNs for demonstration\n", "customers_clean['sparse_column'] = np.nan\n", "customers_clean.loc[np.random.random(len(customers_clean)) > 0.9, 'sparse_column'] = 1 # Add a few non-NaNs\n", "print(\"\\nExample: Dropping 'sparse_column' if > 50% NaNs (Customers)\")\n", "initial_cols = customers_clean.shape[1]\n", "threshold = len(customers_clean) * 0.5 # Drop if more than 50% missing\n", "customers_dropped_cols = customers_clean.dropna(axis=1, thresh=threshold).copy()\n", "print(f\"Initial columns: {initial_cols}, Columns 
after dropping sparse column: {customers_dropped_cols.shape[1]}\")\n", "# Remove the artificially created column for subsequent examples\n", "customers_clean.drop(columns=['sparse_column'], inplace=True)\n" ] }, { "cell_type": "code", "execution_count": null, "id": "54e3d974", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "2b0562c8", "metadata": {}, "outputs": [], "source": [ "# Example 3: Filling missing values with a constant value\n", "print(\"\\nExample: Filling missing 'payment_method' with 'Unknown' (Transactions)\")\n", "print(\"Before filling NaNs in payment_method:\")\n", "print(transactions_clean['payment_method'].value_counts(dropna=False))\n", "transactions_filled_constant = transactions_clean.copy()\n", "transactions_filled_constant['payment_method'].fillna('Unknown', inplace=True)\n", "print(\"\\nAfter filling NaNs with 'Unknown':\")\n", "print(transactions_filled_constant['payment_method'].value_counts(dropna=False))\n" ] }, { "cell_type": "code", "execution_count": null, "id": "df560ef1", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "56f963ee", "metadata": {}, "outputs": [], "source": [ "# Example 4: Filling missing numerical values with the mean\n", "print(\"\\nExample: Filling missing 'lifetime_value' with the mean (Customers)\")\n", "print(\"Mean of lifetime_value before filling:\", customers_clean['lifetime_value'].mean().round(2))\n", "customers_filled_mean = customers_clean.copy()\n", "mean_lifetime_value = customers_filled_mean['lifetime_value'].mean()\n", "customers_filled_mean['lifetime_value'].fillna(mean_lifetime_value, inplace=True)\n", "print(\"Missing values in lifetime_value after filling:\", customers_filled_mean['lifetime_value'].isnull().sum())\n", "print(\"Mean of lifetime_value after filling:\", customers_filled_mean['lifetime_value'].mean().round(2)) # Mean should be the same" ] }, { "cell_type": "code", "execution_count": null, "id": "3d667d50", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "a63d9a98", "metadata": {}, "outputs": [], "source": [ "# Example 5: Filling missing numerical values with the median\n", "print(\"\\nExample: Filling missing 'price' with the median (Products)\")\n", "print(\"Median of price before filling:\", products_clean['price'].median().round(2))\n", "products_filled_median = products_clean.copy()\n", "median_price = products_filled_median['price'].median()\n", "products_filled_median['price'].fillna(median_price, inplace=True)\n", "print(\"Missing values in price after filling:\", products_filled_median['price'].isnull().sum())\n", "print(\"Median of price after filling:\", products_filled_median['price'].median().round(2)) # Median should be the same" ] }, { "cell_type": "code", "execution_count": null, "id": "12ce0439", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "b5f834ce", "metadata": {}, "outputs": [], "source": [ "# Example 6: Filling missing categorical values with a constant value (e.g., 'Unknown')\n", "print(\"\\nExample: Filling missing 'device' with 'Unknown' (Website Visits)\")\n", "print(\"Value counts of device before filling:\")\n", "print(website_visits_clean['device'].value_counts(dropna=False))\n", "website_visits_filled_constant = website_visits_clean.copy()\n", "website_visits_filled_constant['device'].fillna('Unknown', inplace=True)\n", "print(\"\\nAfter filling NaNs with 
'Unknown':\")\n", "print(website_visits_filled_constant['device'].value_counts(dropna=False))" ] }, { "cell_type": "code", "execution_count": null, "id": "63b0e6d3", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "7399818c", "metadata": {}, "outputs": [], "source": [ "# Example 7: Filling missing values using forward fill (ffill)\n", "# Useful for time series or ordered data, fills with the previous valid observation\n", "print(\"\\nExample: Filling missing 'total_price' using forward fill (Transactions - requires sorting)\")\n", "# Ensure data is sorted by date for meaningful ffill\n", "transactions_clean_sorted = transactions_clean.sort_values('date').copy()\n", "print(\"Missing values in total_price before ffill:\", transactions_clean_sorted['total_price'].isnull().sum())\n", "transactions_filled_ffill = transactions_clean_sorted.copy()\n", "transactions_filled_ffill['total_price'].fillna(method='ffill', inplace=True)\n", "print(\"Missing values in total_price after ffill:\", transactions_filled_ffill['total_price'].isnull().sum())\n" ] }, { "cell_type": "code", "execution_count": null, "id": "a94a7cd1", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "71b7d9ed", "metadata": {}, "outputs": [], "source": [ "# Example 8: Filling missing values using backward fill (bfill)\n", "# Fills with the next valid observation\n", "print(\"\\nExample: Filling remaining missing 'total_price' using backward fill (Transactions)\")\n", "print(\"Missing values in total_price before bfill:\", transactions_filled_ffill['total_price'].isnull().sum())\n", "transactions_filled_bfill = transactions_filled_ffill.copy()\n", "transactions_filled_bfill['total_price'].fillna(method='bfill', inplace=True)\n", "print(\"Missing values in total_price after bfill:\", transactions_filled_bfill['total_price'].isnull().sum())\n", "# Note: ffill and bfill might still leave NaNs at the very beginning/end of the series if they start/end with NaNs.\n" ] }, { "cell_type": "code", "execution_count": null, "id": "ea26173f", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "2ce7e456", "metadata": {}, "outputs": [], "source": [ "# Example 9: Imputing missing numerical values using KNNImputer\n", "# Imputes missing values using the k-Nearest Neighbors approach.\n", "print(\"\\nExample: Imputing missing numerical values using KNNImputer (Customers)\")\n", "# Select numerical columns with missing values\n", "numeric_cols_with_missing = customers_clean.select_dtypes(include=np.number).columns[customers_clean.select_dtypes(include=np.number).isnull().any()].tolist()\n", "print(f\"Numerical columns with missing values: {numeric_cols_with_missing}\")\n", "\n", "if numeric_cols_with_missing:\n", " knn_imputer = KNNImputer(n_neighbors=5) # Use 5 nearest neighbors\n", "\n", " # Fit and transform the selected columns\n", " # KNNImputer works on numpy arrays, so select the columns first\n", " customers_knn_imputed_values = knn_imputer.fit_transform(customers_clean[numeric_cols_with_missing])\n", "\n", " # Replace the original columns with the imputed values\n", " customers_imputed_knn = customers_clean.copy()\n", " customers_imputed_knn[numeric_cols_with_missing] = customers_knn_imputed_values\n", "\n", " print(\"Missing values in numerical columns before KNNImputer:\")\n", " print(customers_clean[numeric_cols_with_missing].isnull().sum())\n", " print(\"\\nMissing values in numerical 
columns after KNNImputer:\")\n", " print(customers_imputed_knn[numeric_cols_with_missing].isnull().sum())\n", "else:\n", " print(\"No numerical columns with missing values to demonstrate KNNImputer on Customers DataFrame.\")\n", "\n", "\n", "# --- 2.3 Handling Duplicate Data ---\n", "print(\"\\n--- 2.3 Handling Duplicate Data ---\")\n", "\n", "# Check for duplicate rows (already done in EDA, but shown here as a cleaning step)\n", "print(\"\\nChecking for duplicate rows (Customers):\", customers_clean.duplicated().sum())\n", "\n", "# Example: Removing duplicate rows\n", "# Let's artificially create a duplicate row for demonstration\n", "duplicate_row = customers_clean.iloc[0].copy()\n", "customers_with_duplicates = pd.concat([customers_clean, pd.DataFrame([duplicate_row])], ignore_index=True)\n", "print(f\"\\nCustomers DataFrame shape BEFORE adding duplicate: {customers_clean.shape}\")\n", "# Fix: The variable name 'customers_deduplicated' was used before it was assigned.\n", "# Let's print the shape of the temporary DataFrame with duplicates first.\n", "print(f\"Customers DataFrame shape AFTER adding duplicate: {customers_with_duplicates.shape}\")\n", "\n", "\n", "customers_deduplicated = customers_with_duplicates.drop_duplicates().copy()\n", "print(f\"Customers DataFrame shape AFTER removing duplicates: {customers_deduplicated.shape}\")\n", "\n", "# Check for duplicates based on specific columns\n", "print(\"\\nChecking for duplicates based on 'customer_id' (should be 0 if customer_id is unique key):\", customers_clean.duplicated(subset=['customer_id']).sum())\n", "# In transactions, multiple rows per transaction_id are expected if a transaction has multiple items.\n", "# Duplicates would mean identical rows including quantity, product_id, etc.\n", "print(\"Checking for exact duplicate rows in Transactions:\", transactions_clean.duplicated().sum())\n", "\n", "\n", "# --- 2.4 Handling Outliers ---\n", "print(\"\\n--- 2.4 Handling Outliers ---\")\n", "# Outlier detection was covered in EDA (using IQR).\n", "# Handling strategies depend on the nature of the data and the analysis goal.\n", "\n", "print(\"\\nOutlier Handling Strategies (Examples):\")\n", "\n", "# Example 1: Removing outliers (based on a threshold, e.g., from IQR or Z-score)\n", "print(\"\\nExample: Removing outliers in 'lifetime_value' (Customers) based on IQR\")\n", "Q1 = customers_clean['lifetime_value'].quantile(0.25)\n", "Q3 = customers_clean['lifetime_value'].quantile(0.75)\n", "IQR = Q3 - Q1\n", "lower_bound = Q1 - 1.5 * IQR\n", "upper_bound = Q3 + 1.5 * IQR\n", "\n", "print(f\" IQR Outlier bounds for lifetime_value: [{lower_bound:.2f}, {upper_bound:.2f}]\")\n", "\n", "# Filter out rows where 'lifetime_value' is outside the bounds\n", "customers_no_outliers_iqr = customers_clean[\n", " (customers_clean['lifetime_value'] >= lower_bound) & (customers_clean['lifetime_value'] <= upper_bound)\n", "].copy()\n", "\n", "print(f\" Initial rows (Customers): {customers_clean.shape[0]}\")\n", "print(f\" Rows after removing lifetime_value outliers (IQR): {customers_no_outliers_iqr.shape[0]}\")\n", "\n", "\n", "# Example 2: Capping/Winsorizing outliers (replacing outliers with the boundary value)\n", "print(\"\\nExample: Capping outliers in 'session_duration' (Website Visits)\")\n", "# Calculate bounds (e.g., using quantiles)\n", "lower_bound_cap = website_visits_clean['session_duration'].quantile(0.01) # 1st percentile\n", "upper_bound_cap = website_visits_clean['session_duration'].quantile(0.99) # 99th percentile\n", "\n", 
"print(f\" Capping bounds for session_duration: [{lower_bound_cap:.2f}, {upper_bound_cap:.2f}]\")\n", "\n", "website_visits_capped = website_visits_clean.copy()\n", "# Cap values below lower_bound_cap\n", "website_visits_capped['session_duration'] = website_visits_capped['session_duration'].clip(lower=lower_bound_cap)\n", "# Cap values above upper_bound_cap\n", "website_visits_capped['session_duration'] = website_visits_capped['session_duration'].clip(upper=upper_bound_cap)\n", "\n", "print(\" Original session_duration min/max:\", website_visits_clean['session_duration'].min(), \"/\", website_visits_clean['session_duration'].max())\n", "print(\" Capped session_duration min/max:\", website_visits_capped['session_duration'].min(), \"/\", website_visits_capped['session_duration'].max())\n", "\n", "\n", "# Example 3: Transforming data (e.g., log transformation for skewed data)\n", "print(\"\\nExample: Log transformation of 'total_price' (Transactions) for skewed distribution\")\n", "# Check skewness first\n", "print(\" Skewness of total_price before transformation:\", transactions_clean['total_price'].skew().round(2))\n", "# Apply log transformation (add a small constant to handle zero/negative values if necessary)\n", "transactions_clean['log_total_price'] = np.log1p(transactions_clean['total_price']) # log1p = log(1+x)\n", "\n", "print(\" Skewness of log_total_price after transformation:\", transactions_clean['log_total_price'].skew().round(2))\n", "print(\" Transactions DataFrame with 'log_total_price':\")\n", "print(transactions_clean[['total_price', 'log_total_price']].head())\n", "# Note: Remember to handle NaNs before transformation if needed.\n", "\n", "\n", "# --- 2.5 Handling Inconsistent Data Formats / Types ---\n", "print(\"\\n--- 2.5 Handling Inconsistent Data Formats / Types ---\")\n", "# This often involves converting columns to the correct data types (e.g., object to numeric, object to datetime)\n", "# or standardizing string formats.\n", "\n", "print(\"\\nExamples of Data Type Conversion (already done for dates in Part 1/EDA):\")\n", "# Ensure date columns are datetime objects (already done in Part 1/EDA)\n", "# transactions_clean['date'] = pd.to_datetime(transactions_clean['date'])\n", "# customers_clean['signup_date'] = pd.to_datetime(customers_clean['signup_date'])\n", "# website_visits_clean['timestamp'] = pd.to_datetime(website_visits_clean['timestamp'])\n", "# campaign_exposures_clean['exposure_date'] = pd.to_datetime(campaign_exposures_clean['exposure_date'])\n", "print(\" Date columns are already converted to datetime objects.\")\n", "print(\" Example: transactions_clean['date'].dtype:\", transactions_clean['date'].dtype)\n", "\n", "\n", "# Example: Standardizing string formats (e.g., converting to lowercase, removing whitespace)\n", "# Let's artificially add some inconsistent formats to 'payment_method'\n", "transactions_clean['payment_method_inconsistent'] = transactions_clean['payment_method']\n", "transactions_clean.loc[1, 'payment_method_inconsistent'] = ' credit card '\n", "transactions_clean.loc[2, 'payment_method_inconsistent'] = 'PayPal '\n", "transactions_clean.loc[3, 'payment_method_inconsistent'] = 'APPLE PAY'\n", "\n", "print(\"\\nExample: Standardizing 'payment_method_inconsistent' string format\")\n", "print(\"Before standardization:\")\n", "print(transactions_clean['payment_method_inconsistent'].unique())\n", "\n", "transactions_clean['payment_method_standardized'] = transactions_clean['payment_method_inconsistent'].str.lower().str.strip()\n", "\n", 
"print(\"\\nAfter standardization (lowercase and strip whitespace):\")\n", "print(transactions_clean['payment_method_standardized'].unique())\n", "transactions_clean.drop(columns=['payment_method_inconsistent', 'payment_method_standardized'], inplace=True) # Clean up temp columns\n", "\n", "\n", "# Example: Converting data types (e.g., from object to numeric if applicable)\n", "# If a numeric column was read as object due to non-numeric characters (e.g., '$', ','),\n", "# you would clean it and convert.\n", "# Example (conceptual):\n", "# df['numeric_col'] = df['numeric_col'].str.replace('$', '', regex=False).str.replace(',', '', regex=False)\n", "# df['numeric_col'] = pd.to_numeric(df['numeric_col'], errors='coerce') # errors='coerce' turns unparseable values into NaN\n", "\n", "print(\"\\nData preparation and cleaning examples complete.\")\n", "print(f\"{'='*80}\")\n" ] }, { "cell_type": "code", "execution_count": null, "id": "e499d876", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "4df22c5c", "metadata": {}, "outputs": [], "source": [ "# ## 3. Comprehensive Exploratory Data Analysis (EDA)\n", "\n", "# %% tags=[]\n", "# The EDA section now becomes Part 3 and will use the potentially cleaned dataframes\n", "# (depending on which cleaning steps were applied before calling the EDA function).\n", "# For demonstration purposes, we will call EDA on the original data first,\n", "# then you could call it on cleaned dataframes if you apply cleaning steps beforehand.\n", "\n", "def perform_comprehensive_eda(data_dict, dataset_name=\"E-commerce\"):\n", " \"\"\"\n", " Performs comprehensive exploratory data analysis on the datasets.\n", " Includes data quality checks, descriptive statistics, and visualizations.\n", "\n", " Parameters:\n", " -----------\n", " data_dict : dict of pandas DataFrames\n", " A dictionary containing the datasets.\n", " dataset_name : str\n", " Name of the dataset for printing titles.\n", " \"\"\"\n", " print(f\"\\n{'='*80}\")\n", " print(f\"PART 3: COMPREHENSIVE EXPLORATORY DATA ANALYSIS (EDA) - {dataset_name.upper()}\")\n", " print(f\"{'='*80}\")\n", "\n", " for name, df in data_dict.items():\n", " print(f\"\\n--- EDA for {name.upper()} ---\")\n", "\n", " # --- 3.1 Data Overview and Structure ---\n", " print(\"\\n--- 3.1 Data Overview and Structure ---\")\n", " print(f\"\\nShape: {df.shape[0]} rows, {df.shape[1]} columns\")\n", " print(\"\\nFirst 5 rows:\")\n", " print(df.head())\n", " print(\"\\nLast 5 rows:\")\n", " print(df.tail())\n", " print(\"\\nColumn information (dtype, non-null counts):\")\n", " df.info()\n", "\n", " # --- 3.2 Data Quality Checks ---\n", " print(\"\\n--- 3.2 Data Quality Checks ---\")\n", " print(\"\\nMissing values per column:\")\n", " print(df.isnull().sum())\n", "\n", " # Visualize missing values (using Matplotlib)\n", " # Removed seaborn heatmap\n", " missing_counts = df.isnull().sum()\n", " missing_counts = missing_counts[missing_counts > 0].sort_values(ascending=False)\n", " if not missing_counts.empty:\n", " plt.figure(figsize=(10, 6))\n", " missing_counts.plot(kind='bar', color='skyblue')\n", " plt.title(f'Missing Values per Column in {name.upper()}')\n", " plt.xlabel('Columns')\n", " plt.ylabel('Number of Missing Values')\n", " plt.xticks(rotation=45, ha='right')\n", " plt.tight_layout()\n", " plt.show()\n", " else:\n", " print(\"No missing values to visualize.\")\n", "\n", "\n", " print(\"\\nNumber of duplicate rows:\", df.duplicated().sum())\n", " if df.duplicated().sum() > 0:\n", 
" print(\"Sample of duplicate rows:\")\n", " print(df[df.duplicated(keep=False)].sort_values(by=list(df.columns)).head()) # Show both original and duplicate\n", "\n", " # Check for unique values in categorical columns\n", " print(\"\\nUnique values in categorical columns (first 10 if many):\")\n", " for col in df.select_dtypes(include='object').columns:\n", " unique_vals = df[col].unique()\n", " if len(unique_vals) > 10:\n", " print(f\"  {col} ({len(unique_vals)} unique): {unique_vals[:10]}...\")\n", " else:\n", " print(f\"  {col} ({len(unique_vals)} unique): {unique_vals}\")\n", "\n", " # --- 3.3 Descriptive Statistics ---\n", " print(\"\\n--- 3.3 Descriptive Statistics ---\")\n", " print(\"\\nDescriptive statistics for numerical columns:\")\n", " print(df.describe())\n", " print(\"\\nDescriptive statistics for categorical columns:\")\n", " print(df.describe(include='object'))\n", "\n", " # --- 3.4 Distribution Analysis (Numerical Columns) ---\n", " print(\"\\n--- 3.4 Distribution Analysis (Numerical Columns) ---\")\n", " numerical_cols = df.select_dtypes(include=np.number).columns\n", " if not numerical_cols.empty:\n", " print(\"\\nHistograms and Box Plots for Numerical Columns:\")\n", " for col in numerical_cols:\n", " plt.figure(figsize=(14, 6))\n", "\n", " # Histogram\n", " plt.subplot(1, 2, 1)\n", " plt.hist(df[col].dropna(), bins=30, color='lightcoral', edgecolor='black')\n", " plt.title(f'Distribution of {col.replace(\"_\", \" \").title()}')\n", " plt.xlabel(col.replace(\"_\", \" \").title())\n", " plt.ylabel('Frequency')\n", "\n", " # Box Plot\n", " plt.subplot(1, 2, 2)\n", " plt.boxplot(df[col].dropna(), vert=False, patch_artist=True, boxprops=dict(facecolor='lightgreen'))\n", " plt.title(f'Box Plot of {col.replace(\"_\", \" \").title()}')\n", " plt.xlabel(col.replace(\"_\", \" \").title())\n", "\n", " plt.tight_layout()\n", " plt.show()\n", "\n", " # Check for skewness and kurtosis\n", " print(\"\\nSkewness and Kurtosis for Numerical Columns:\")\n", " skewness = df[numerical_cols].skew().sort_values(ascending=False)\n", " kurtosis = df[numerical_cols].kurtosis().sort_values(ascending=False)\n", " stats_df = pd.DataFrame({'Skewness': skewness, 'Kurtosis': kurtosis})\n", " print(stats_df)\n", "\n", "\n", " # --- 3.5 Distribution Analysis (Categorical Columns) ---\n", " print(\"\\n--- 3.5 Distribution Analysis (Categorical Columns) ---\")\n", " categorical_cols = df.select_dtypes(include='object').columns\n", " if not categorical_cols.empty:\n", " print(\"\\nValue Counts and Bar Plots for Categorical Columns:\")\n", " for col in categorical_cols:\n", " print(f\"\\nValue counts for '{col}':\")\n", " print(df[col].value_counts(dropna=False)) # Include NaN counts\n", "\n", " # Bar Plot\n", " plt.figure(figsize=(10, 6))\n", " df[col].value_counts(dropna=False).plot(kind='bar', color='teal', edgecolor='black')\n", " plt.title(f'Distribution of {col.replace(\"_\", \" \").title()}')\n", " plt.xlabel(col.replace(\"_\", \" \").title())\n", " plt.ylabel('Count')\n", " plt.xticks(rotation=45, ha='right')\n", " plt.tight_layout()\n", " plt.show()\n", "\n", " # --- 3.6 Relationship Analysis (Examples) ---\n", " print(\"\\n--- 3.6 Relationship Analysis (Examples) ---\")\n", "\n", " # Example: Correlation matrix for numerical features (where applicable)\n", " if not numerical_cols.empty and len(numerical_cols) > 1:\n", " print(\"\\nCorrelation Matrix for Numerical Columns:\")\n", " correlation_matrix = df[numerical_cols].corr()\n", " print(correlation_matrix)\n", "\n", " # Visualize correlation 
matrix (using Matplotlib)\n", " # Removed seaborn heatmap\n", " plt.figure(figsize=(10, 8))\n", " plt.matshow(correlation_matrix, fignum=plt.gcf().number, cmap='coolwarm', aspect='auto')\n", " plt.xticks(range(len(numerical_cols)), numerical_cols, rotation=90)\n", " plt.yticks(range(len(numerical_cols)), numerical_cols)\n", " plt.colorbar(label='Correlation Coefficient')\n", " plt.title('Correlation Matrix', fontsize=16)\n", " # Add correlation values as text\n", " for i in range(len(numerical_cols)):\n", " for j in range(len(numerical_cols)):\n", " plt.text(j, i, f\"{correlation_matrix.iloc[i, j]:.2f}\",\n", " ha=\"center\", va=\"center\", color=\"black\", fontsize=8)\n", " plt.tight_layout()\n", " plt.show()\n", "\n", "\n", " # Example: Relationship between a categorical and numerical variable (e.g., avg_order_value by gender)\n", " if 'gender' in df.columns and 'avg_order_value' in df.columns:\n", " print(\"\\nAverage Avg Order Value by Gender:\")\n", " avg_order_by_gender = df.groupby('gender')['avg_order_value'].mean().dropna().sort_values(ascending=False)\n", " print(avg_order_by_gender)\n", "\n", " # Visualize (Bar plot)\n", " plt.figure(figsize=(8, 5))\n", " avg_order_by_gender.plot(kind='bar', color='purple', edgecolor='black')\n", " plt.title('Average Order Value by Gender')\n", " plt.xlabel('Gender')\n", " plt.ylabel('Average Order Value')\n", " plt.xticks(rotation=0)\n", " plt.tight_layout()\n", " plt.show()\n", "\n", " # Example: Relationship between two categorical variables (e.g., customer_segment and location)\n", " if 'customer_segment' in df.columns and 'location' in df.columns:\n", " print(\"\\nCustomer Segment Distribution by Location (Cross-tabulation):\")\n", " cross_tab = pd.crosstab(df['customer_segment'], df['location'])\n", " print(cross_tab)\n", "\n", " # Visualize (Stacked Bar Plot)\n", " # Removed seaborn heatmap\n", " cross_tab.plot(kind='bar', stacked=True, figsize=(12, 7))\n", " plt.title('Customer Segment Distribution by Location')\n", " plt.xlabel('Customer Segment')\n", " plt.ylabel('Count')\n", " plt.xticks(rotation=45, ha='right')\n", " plt.legend(title='Location')\n", " plt.tight_layout()\n", " plt.show()\n", "\n", " # Example: Time-based analysis (if date/timestamp columns exist)\n", " date_cols = df.select_dtypes(include='datetime64[ns]').columns\n", " if not date_cols.empty:\n", " print(\"\\nTime-based Analysis Examples:\")\n", " for date_col in date_cols:\n", " print(f\"\\nAnalysis based on '{date_col}':\")\n", " df[date_col] = pd.to_datetime(df[date_col]) # Ensure it's datetime\n", "\n", " # Extract time components\n", " df['year'] = df[date_col].dt.year\n", " df['month'] = df[date_col].dt.month\n", " df['day_of_week'] = df[date_col].dt.dayofweek # Monday=0, Sunday=6\n", " df['hour'] = df[date_col].dt.hour\n", "\n", " # Example: Count of events over time (e.g., transactions over time)\n", " if name == 'transactions' and 'date' in df.columns:\n", " print(\"\\nTransaction Count Over Time:\")\n", " transactions_over_time = df.groupby(df['date'].dt.date).size()\n", " plt.figure(figsize=(14, 6))\n", " transactions_over_time.plot(kind='line', marker='o', linestyle='-', color='darkblue')\n", " plt.title('Daily Transaction Count Over Time')\n", " plt.xlabel('Date')\n", " plt.ylabel('Number of Transactions')\n", " plt.grid(True)\n", " plt.tight_layout()\n", " plt.show()\n", "\n", " # Example: Aggregation by month (e.g., total sales by month)\n", " if name == 'transactions' and 'date' in df.columns and 'total_price' in df.columns:\n", " print(\"\\nTotal 
Sales by Month:\")\n", " monthly_sales = df.resample('M', on='date')['total_price'].sum()\n", " plt.figure(figsize=(12, 6))\n", " monthly_sales.plot(kind='bar', color='forestgreen', edgecolor='black')\n", " plt.title('Total Sales by Month')\n", " plt.xlabel('Month')\n", " plt.ylabel('Total Sales')\n", " plt.xticks(rotation=45, ha='right')\n", " plt.tight_layout()\n", " plt.show()\n", "\n", " # Example: Website visits by hour of day\n", " if name == 'website_visits' and 'timestamp' in df.columns:\n", " print(\"\\nWebsite Visits by Hour of Day:\")\n", " visits_by_hour = df['hour'].value_counts().sort_index()\n", " plt.figure(figsize=(10, 6))\n", " visits_by_hour.plot(kind='bar', color='indianred', edgecolor='black')\n", " plt.title('Website Visits by Hour of Day')\n", " plt.xlabel('Hour of Day')\n", " plt.ylabel('Number of Visits')\n", " plt.xticks(rotation=0)\n", " plt.tight_layout()\n", " plt.show()\n", "\n", " # Clean up temporary time columns\n", " df.drop(columns=['year', 'month', 'day_of_week', 'hour'], inplace=True, errors='ignore')\n", "\n", " print(f\"\\n{'='*80}\")\n", " print(f\"EDA COMPLETE FOR {dataset_name.upper()}\")\n", " print(f\"{'='*80}\")\n" ] }, { "cell_type": "code", "execution_count": null, "id": "90c677ac", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "90f40f36", "metadata": {}, "outputs": [], "source": [ "# ## 4. Advanced Data Manipulation with Pandas\n", "\n", "# %% tags=[]\n", "print(f\"\\n{'='*80}\")\n", "print(f\"PART 4: ADVANCED DATA MANIPULATION WITH PANDAS\")\n", "print(f\"{'='*80}\")\n", "\n", "# Use the synthetic e-commerce data for manipulation examples\n", "customers_manip = ecommerce_data['customers'].copy()\n", "transactions_manip = ecommerce_data['transactions'].copy()\n", "products_manip = ecommerce_data['products'].copy()\n", "website_visits_manip = ecommerce_data['website_visits'].copy()\n", "marketing_manip = ecommerce_data['marketing'].copy()\n", "campaign_exposures_manip = ecommerce_data['campaign_exposures'].copy()\n", "\n", "\n", "print(\"\\n--- Advanced Data Manipulation Examples ---\")\n", "\n", "# --- 4.1 Merging/Joining DataFrames ---\n", "print(\"\\n--- 4.1 Merging/Joining DataFrames ---\")\n", "\n", "# Example: Merge customers and transactions to get customer info for each transaction\n", "print(\"\\nExample: Merging Transactions with Customers (Inner Join)\")\n", "# Ensure date columns are datetime objects before merging/joining on them if needed,\n", "# or if doing time-based joins. 
Here we join on customer_id.\n", "transactions_customers = pd.merge(\n", " transactions_manip,\n", " customers_manip[['customer_id', 'age_group', 'gender', 'location', 'signup_date']],\n", " on='customer_id',\n", " how='inner' # Only include transactions where the customer exists in the customers table\n", ")\n", "print(\"Shape after merging transactions and customers:\", transactions_customers.shape)\n", "print(transactions_customers.head())\n", "\n", "# Example: Left join transactions with products to get product details for each transaction\n", "print(\"\\nExample: Left Joining Transactions with Products\")\n", "transactions_products = pd.merge(\n", " transactions_manip,\n", " products_manip[['product_id', 'product_name', 'category', 'price']],\n", " on='product_id',\n", " how='left' # Keep all transactions, add product info where available\n", ")\n", "print(\"Shape after left joining transactions and products:\", transactions_products.shape)\n", "print(transactions_products.head())\n", "\n", "# Example: Combining multiple joins (e.g., transactions + customers + products)\n", "print(\"\\nExample: Merging Transactions, Customers, and Products\")\n", "# Start with transactions, merge customers, then merge products\n", "transactions_full = pd.merge(transactions_manip, customers_manip[['customer_id', 'age_group', 'gender', 'location']], on='customer_id', how='left')\n", "transactions_full = pd.merge(transactions_full, products_manip[['product_id', 'product_name', 'category']], on='product_id', how='left')\n", "print(\"Shape after merging transactions, customers, and products:\", transactions_full.shape)\n", "print(transactions_full.head())\n", "\n", "# Example: Merging website visits with products (using left join as product_id can be NaN)\n", "print(\"\\nExample: Left Joining Website Visits with Products\")\n", "website_visits_products = pd.merge(\n", " website_visits_manip,\n", " products_manip[['product_id', 'product_name', 'category']],\n", " on='product_id',\n", " how='left'\n", ")\n", "print(\"Shape after left joining website visits and products:\", website_visits_products.shape)\n", "print(website_visits_products.head())\n", "\n", "# Example: Merging campaign exposures with marketing campaign details\n", "print(\"\\nExample: Merging Campaign Exposures with Marketing Details\")\n", "campaign_exposures_marketing = pd.merge(\n", " campaign_exposures_manip,\n", " marketing_manip[['campaign_id', 'campaign_type', 'budget']],\n", " on='campaign_id',\n", " how='left'\n", ")\n", "print(\"Shape after merging campaign exposures and marketing:\", campaign_exposures_marketing.shape)\n", "print(campaign_exposures_marketing.head())\n", "\n", "\n", "# --- 4.2 Concatenating DataFrames ---\n", "print(\"\\n--- 4.2 Concatenating DataFrames ---\")\n", "\n", "# Example: Concatenating two (or more) DataFrames with the same columns\n", "# Let's create a small second batch of synthetic customer data\n", "customers_batch2 = generate_synthetic_ecommerce_data(n_customers=50)['customers']\n", "customers_batch2['customer_id'] = customers_batch2['customer_id'] + customers_manip['customer_id'].max() # Ensure unique IDs\n", "\n", "print(\"\\nExample: Concatenating two Customer DataFrames\")\n", "print(\"Shape of original customers:\", customers_manip.shape)\n", "print(\"Shape of second customer batch:\", customers_batch2.shape)\n", "\n", "customers_combined = pd.concat([customers_manip, customers_batch2], ignore_index=True) # ignore_index=True resets the index\n", "print(\"Shape after concatenation:\", 
customers_combined.shape)\n", "print(customers_combined.tail()) # Show rows from the second batch\n", "\n", "# Example: Concatenating DataFrames with different columns (will result in NaNs)\n", "# Let's concatenate customers and products (different structures)\n", "print(\"\\nExample: Concatenating Customers and Products (different columns)\")\n", "# This is generally not a meaningful operation unless you have a specific reason\n", "# (e.g., stacking data from different sources with some overlapping columns).\n", "customers_products_concat = pd.concat([customers_manip, products_manip], ignore_index=True)\n", "print(\"Shape after concatenating customers and products:\", customers_products_concat.shape)\n", "print(customers_products_concat.head())\n", "print(customers_products_concat.tail()) # Notice the NaNs where columns don't overlap\n", "\n", "\n", "# --- 4.3 Grouping and Aggregating Data (GroupBy) ---\n", "print(\"\\n--- 4.3 Grouping and Aggregating Data (GroupBy) ---\")\n", "\n", "# Example: Total sales by customer\n", "print(\"\\nExample: Total Sales by Customer\")\n", "customer_sales = transactions_manip.groupby('customer_id')['total_price'].sum().reset_index()\n", "customer_sales.rename(columns={'total_price': 'total_sales'}, inplace=True)\n", "print(customer_sales.head())\n", "\n", "# Example: Average transaction value by payment method\n", "print(\"\\nExample: Average Transaction Value by Payment Method\")\n", "avg_txn_by_payment = transactions_manip.groupby('payment_method')['total_price'].mean().reset_index()\n", "avg_txn_by_payment.rename(columns={'total_price': 'average_transaction_value'}, inplace=True)\n", "print(avg_txn_by_payment.head())\n", "\n", "# Example: Number of transactions and total quantity sold per product\n", "print(\"\\nExample: Transaction Count and Total Quantity per Product\")\n", "product_summary = transactions_manip.groupby('product_id').agg(\n", " transaction_count=('transaction_id', 'count'), # Count occurrences of transaction_id\n", " total_quantity=('quantity', 'sum')\n", ").reset_index()\n", "print(product_summary.head())\n", "\n", "# Example: Grouping by multiple columns (e.g., total sales by customer and month)\n", "print(\"\\nExample: Total Sales by Customer and Month\")\n", "# Ensure 'date' is datetime and extract month\n", "transactions_manip['date'] = pd.to_datetime(transactions_manip['date'])\n", "transactions_manip['month'] = transactions_manip['date'].dt.to_period('M')\n", "\n", "monthly_customer_sales = transactions_manip.groupby(['customer_id', 'month'])['total_price'].sum().reset_index()\n", "monthly_customer_sales.rename(columns={'total_price': 'monthly_sales'}, inplace=True)\n", "print(monthly_customer_sales.head())\n", "transactions_manip.drop(columns=['month'], inplace=True) # Clean up temp column\n", "\n", "\n", "# --- 4.4 Pivoting Data ---\n", "print(\"\\n--- 4.4 Pivoting Data ---\")\n", "\n", "# Example: Pivot table to show total sales by location and gender\n", "print(\"\\nExample: Pivot Table - Total Sales by Location and Gender\")\n", "# Need a DataFrame with location, gender, and total_price\n", "# Let's use the transactions_customers merged DataFrame from earlier\n", "if 'location' in transactions_customers.columns and 'gender' in transactions_customers.columns and 'total_price' in transactions_customers.columns:\n", " sales_pivot = pd.pivot_table(\n", " transactions_customers,\n", " values='total_price',\n", " index='location',\n", " columns='gender',\n", " aggfunc='sum',\n", " fill_value=0 # Fill missing combinations with 
0\n", " )\n", " print(sales_pivot)\n", "else:\n", " print(\"Required columns ('location', 'gender', 'total_price') not available in transactions_customers for pivoting.\")\n", "\n", "\n", "# Example: Pivot table to show average session duration by device and channel\n", "print(\"\\nExample: Pivot Table - Average Session Duration by Device and Channel\")\n", "if 'device' in website_visits_manip.columns and 'channel' in website_visits_manip.columns and 'session_duration' in website_visits_manip.columns:\n", " session_pivot = pd.pivot_table(\n", " website_visits_manip,\n", " values='session_duration',\n", " index='device',\n", " columns='channel',\n", " aggfunc='mean',\n", " fill_value=0\n", " )\n", " print(session_pivot.round(2))\n", "else:\n", " print(\"Required columns ('device', 'channel', 'session_duration') not available in website_visits_manip for pivoting.\")\n", "\n", "\n", "# --- 4.5 Melting Data ---\n", "print(\"\\n--- 4.5 Melting Data ---\")\n", "# Melting is the opposite of pivoting; it transforms wide format data into long format.\n", "\n", "# Example: Melt the sales_pivot table back into long format\n", "print(\"\\nExample: Melting the Sales Pivot Table\")\n", "# Reset index to turn index 'location' into a column\n", "sales_pivot_reset = sales_pivot.reset_index()\n", "sales_melted = pd.melt(\n", " sales_pivot_reset,\n", " id_vars=['location'], # Columns to keep as is\n", " var_name='gender', # Name for the new column created from old column names\n", " value_name='total_sales' # Name for the new column created from old column values\n", ")\n", "print(sales_melted.head()) # Note: This will include the 'Non-binary' and 'Prefer not to say' genders as well\n", "\n", "\n", "# --- 4.6 Applying Functions (Apply, Map, ApplyMap) ---\n", "print(\"\\n--- 4.6 Applying Functions (Apply, Map, ApplyMap) ---\")\n", "\n", "# Example (Apply): Apply a function along an axis of the DataFrame (row or column)\n", "print(\"\\nExample: Applying a function to calculate discount amount per transaction item\")\n", "# Calculate discount amount = quantity * unit_price * discount\n", "transactions_manip['discount_amount'] = transactions_manip.apply(\n", " lambda row: row['quantity'] * row['unit_price'] * row['discount'],\n", " axis=1 # Apply the function row-wise\n", ")\n", "print(transactions_manip[['quantity', 'unit_price', 'discount', 'discount_amount', 'total_price']].head())\n", "# Verify: total_price should be (quantity * unit_price) - discount_amount\n", "# print((transactions_manip['quantity'] * transactions_manip['unit_price'] - transactions_manip['discount_amount'] - transactions_manip['total_price']).abs().sum()) # Should be close to 0\n", "\n", "\n", "# Example (Map): Map values of a Series based on a dictionary or function\n", "print(\"\\nExample: Mapping age group to a numerical category (Customers)\")\n", "age_group_mapping = {\n", " '18-24': 1, '25-34': 2, '35-44': 3, '45-54': 4, '55-64': 5, '65+': 6\n", "}\n", "# Use .loc to avoid SettingWithCopyWarning if customers_manip is a slice\n", "customers_manip.loc[:, 'age_group_numeric'] = customers_manip['age_group'].map(age_group_mapping)\n", "print(customers_manip[['age_group', 'age_group_numeric']].head())\n", "# Note: Map will result in NaN for any values in 'age_group' not in the mapping dictionary (including original NaNs)\n", "\n", "\n", "# Example (ApplyMap - deprecated, use Apply with axis=None or element-wise operations):\n", "# ApplyMap applies a function element-wise across the entire DataFrame.\n", "# Modern Pandas prefers 
element-wise operations or DataFrame.map, the pandas >= 2.1 replacement for applymap.\n",
"print(\"\\nExample: Applying a function element-wise (using DataFrame.round / DataFrame.map)\")\n",
"# Let's say we want to round all float values in a small DataFrame\n",
"temp_df = transactions_manip[['unit_price', 'total_price']].head()\n",
"print(\"\\nOriginal small DataFrame:\")\n",
"print(temp_df)\n",
"\n",
"# DataFrame.round is the simplest element-wise option here; on pandas >= 2.1,\n",
"# temp_df.map(lambda x: round(x, 1)) is equivalent (DataFrame.apply expects axis=0 or axis=1).\n",
"rounded_temp_df = temp_df.round(1) # Rounds every element to 1 decimal place\n",
"print(\"\\nRounded small DataFrame:\")\n",
"print(rounded_temp_df)\n",
"\n",
"\n",
"# --- 4.7 Using Zip ---\n",
"print(\"\\n--- 4.7 Using Zip ---\")\n",
"# zip is a Python built-in function, not a Pandas method, but often used with Pandas.\n",
"# It pairs elements from multiple iterables.\n",
"\n",
"# Example: Creating a new column by combining values from two columns\n",
"print(\"\\nExample: Creating a 'product_info' string by zipping product name and category\")\n",
"# zip pairs the two Series element by element, which avoids the slower iterrows() loop\n",
"product_info_list = []\n",
"for name, category in zip(products_manip['product_name'], products_manip['category']):\n",
"    product_info_list.append(f\"{name} ({category})\")\n",
"\n",
"products_manip['product_info'] = product_info_list\n",
"print(products_manip[['product_name', 'category', 'product_info']].head())\n",
"\n",
"# A more \"Pandas-idiomatic\" way using string formatting directly on Series:\n",
"# products_manip['product_info_pandas'] = products_manip['product_name'] + ' (' + products_manip['category'] + ')'\n",
"# print(products_manip[['product_name', 'category', 'product_info_pandas']].head())\n",
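"\n",
"# (Added sketch) zip + dict + map: build a product_id -> product_name lookup from the products\n",
"# table and map it onto the transactions table. The 'product_name_lookup' dict and the new\n",
"# 'product_name' column are illustrative additions, not part of the original walkthrough.\n",
"product_name_lookup = dict(zip(products_manip['product_id'], products_manip['product_name']))\n",
"transactions_manip['product_name'] = transactions_manip['product_id'].map(product_name_lookup)\n",
"print(transactions_manip[['product_id', 'product_name']].head())\n",
"\n",
"\n",
"print(\"\\nAdvanced data manipulation examples complete.\")\n",
"print(f\"{'='*80}\")\n",
"\n",
"\n",
"# %% [markdown]\n",
"# ## 5. 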
Feature Engineering\n", "\n", "# %% tags=[]\n", "print(f\"\\n{'='*80}\")\n", "print(f\"PART 5: FEATURE ENGINEERING\")\n", "print(f\"{'='*80}\")\n", "\n", "# Use copies of the synthetic e-commerce data for feature engineering examples\n", "customers_fe = ecommerce_data['customers'].copy()\n", "transactions_fe = ecommerce_data['transactions'].copy()\n", "website_visits_fe = ecommerce_data['website_visits'].copy()\n", "campaign_exposures_fe = ecommerce_data['campaign_exposures'].copy()\n", "products_fe = ecommerce_data['products'].copy() # Might be useful for product-based features\n", "\n", "\n", "print(\"\\n--- Feature Engineering Examples ---\")\n" ] }, { "cell_type": "code", "execution_count": null, "id": "f41f899e", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "69b8942d", "metadata": {}, "outputs": [], "source": [ "# --- 5.1 Creating Time-Based Features ---\n", "print(\"\\n--- 5.1 Creating Time-Based Features ---\")\n", "\n", "# Ensure date columns are datetime objects\n", "customers_fe['signup_date'] = pd.to_datetime(customers_fe['signup_date'])\n", "transactions_fe['date'] = pd.to_datetime(transactions_fe['date'])\n", "website_visits_fe['timestamp'] = pd.to_datetime(website_visits_fe['timestamp'])\n", "campaign_exposures_fe['exposure_date'] = pd.to_datetime(campaign_exposures_fe['exposure_date'])\n", "\n", "\n", "# Example: Features from signup_date (Customers)\n", "print(\"\\nExample: Features from Customer Signup Date\")\n", "customers_fe['signup_year'] = customers_fe['signup_date'].dt.year\n", "customers_fe['signup_month'] = customers_fe['signup_date'].dt.month\n", "customers_fe['signup_day_of_week'] = customers_fe['signup_date'].dt.dayofweek # Monday=0, Sunday=6\n", "customers_fe['signup_day_of_year'] = customers_fe['signup_date'].dt.dayofyear\n", "customers_fe['signup_week_of_year'] = customers_fe['signup_date'].dt.isocalendar().week.astype(int) # ISO week number\n", "customers_fe['signup_quarter'] = customers_fe['signup_date'].dt.quarter\n", "\n", "# Days since signup (relative to a fixed date, e.g., end of data)\n", "latest_date = transactions_fe['date'].max() # Or website_visits_fe['timestamp'].max()\n", "customers_fe['days_since_signup'] = (latest_date - customers_fe['signup_date']).dt.days\n", "\n", "print(customers_fe[['signup_date', 'signup_year', 'signup_month', 'signup_day_of_week',\n", " 'signup_day_of_year', 'signup_week_of_year', 'signup_quarter',\n", " 'days_since_signup']].head())\n", "\n", "\n", "# Example: Features from transaction date (Transactions)\n", "print(\"\\nExample: Features from Transaction Date\")\n", "transactions_fe['transaction_year'] = transactions_fe['date'].dt.year\n", "transactions_fe['transaction_month'] = transactions_fe['date'].dt.month\n", "transactions_fe['transaction_day_of_week'] = transactions_fe['date'].dt.dayofweek\n", "transactions_fe['transaction_hour'] = transactions_fe['date'].dt.hour\n", "transactions_fe['is_weekend'] = (transactions_fe['transaction_day_of_week'] >= 5).astype(int) # 1 if weekend, 0 otherwise\n", "\n", "print(transactions_fe[['date', 'transaction_year', 'transaction_month', 'transaction_day_of_week',\n", " 'transaction_hour', 'is_weekend']].head())\n", "\n", "\n", "# Example: Features from website visit timestamp (Website Visits)\n", "print(\"\\nExample: Features from Website Visit Timestamp\")\n", "website_visits_fe['visit_year'] = website_visits_fe['timestamp'].dt.year\n", "website_visits_fe['visit_month'] = website_visits_fe['timestamp'].dt.month\n", 
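"# (Added sketch) Cyclical encoding: month and hour wrap around, so a sin/cos pair often\n",
"# represents them better than the raw integer (December and January end up close together).\n",
"# The 'visit_month_sin' / 'visit_month_cos' columns below are illustrative additions.\n",
"website_visits_fe['visit_month_sin'] = np.sin(2 * np.pi * website_visits_fe['visit_month'] / 12)\n",
"website_visits_fe['visit_month_cos'] = np.cos(2 * np.pi * website_visits_fe['visit_month'] / 12)\n",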
"website_visits_fe['visit_day_of_week'] = website_visits_fe['timestamp'].dt.dayofweek\n", "website_visits_fe['visit_hour'] = website_visits_fe['timestamp'].dt.hour\n", "website_visits_fe['visit_minute'] = website_visits_fe['timestamp'].dt.minute\n", "\n", "print(website_visits_fe[['timestamp', 'visit_year', 'visit_month', 'visit_day_of_week',\n", " 'visit_hour', 'visit_minute']].head())\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "id": "0eba39ee", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "bcc53908", "metadata": {}, "outputs": [], "source": [ "# --- 5.2 Aggregation-Based Features ---\n", "print(\"\\n--- 5.2 Aggregation-Based Features ---\")\n", "# Create features by aggregating data from one table onto another (e.g., transaction stats per customer)\n", "\n", "# Example: Aggregate transaction data onto the customer table\n", "print(\"\\nExample: Aggregating Transaction Data onto Customer Table\")\n", "# Calculate total number of transactions, total spend, average spend per transaction, etc. per customer\n", "customer_transaction_agg = transactions_fe.groupby('customer_id').agg(\n", " total_transactions=('transaction_id', 'nunique'), # Count unique transaction IDs per customer\n", " total_spend=('total_price', 'sum'),\n", " avg_transaction_value=('total_price', 'mean'),\n", " min_transaction_value=('total_price', 'min'),\n", " max_transaction_value=('total_price', 'max'),\n", " total_items_bought=('quantity', 'sum'),\n", " avg_items_per_transaction=('quantity', 'mean'),\n", " first_purchase_date=('date', 'min'),\n", " last_purchase_date=('date', 'max'),\n", " distinct_products_bought=('product_id', 'nunique')\n", ").reset_index()\n", "\n", "# Merge these aggregated features back into the customers DataFrame\n", "customers_fe = pd.merge(customers_fe, customer_transaction_agg, on='customer_id', how='left')\n", "\n", "# Calculate Recency, Frequency, Monetary (RFM) features\n", "# Recency: Days since last purchase (relative to latest_date)\n", "customers_fe['recency'] = (latest_date - customers_fe['last_purchase_date']).dt.days\n", "# Frequency: Total number of transactions (already calculated as total_transactions)\n", "# Monetary: Total spend (already calculated as total_spend)\n", "\n", "print(customers_fe[['customer_id', 'total_transactions', 'total_spend', 'avg_transaction_value',\n", " 'recency', 'first_purchase_date', 'last_purchase_date', 'distinct_products_bought']].head())\n", "\n", "# Fill NaN values for customers with no transactions (e.g., 0 for counts/sums, NaN for dates/means)\n", "customers_fe['total_transactions'].fillna(0, inplace=True)\n", "customers_fe['total_spend'].fillna(0, inplace=True)\n", "customers_fe['total_items_bought'].fillna(0, inplace=True)\n", "customers_fe['distinct_products_bought'].fillna(0, inplace=True)\n", "# avg_transaction_value will remain NaN for customers with 0 transactions, which is appropriate.\n", "# Recency will be NaN for customers with no last_purchase_date; you might fill this with a large value or handle separately.\n", "customers_fe['recency'].fillna((latest_date - customers_fe['signup_date']).dt.days + 1, inplace=True) # Fill with days since signup + 1\n", "\n", "\n", "# Example: Aggregate website visit data onto the customer table\n", "print(\"\\nExample: Aggregating Website Visit Data onto Customer Table\")\n", "customer_visit_agg = website_visits_fe.groupby('customer_id').agg(\n", " total_visits=('visit_id', 'nunique'),\n", " 
total_session_duration=('session_duration', 'sum'),\n", " avg_session_duration=('session_duration', 'mean'),\n", " total_pages_viewed=('pages_viewed', 'sum'),\n", " avg_pages_per_visit=('pages_viewed', 'mean'),\n", " last_visit_timestamp=('timestamp', 'max'),\n", " distinct_devices_used=('device', 'nunique'),\n", " distinct_channels_used=('channel', 'nunique')\n", ").reset_index()\n", "\n", "# Merge into customers DataFrame\n", "customers_fe = pd.merge(customers_fe, customer_visit_agg, on='customer_id', how='left')\n", "print(customers_fe[['customer_id', 'total_visits', 'avg_session_duration', 'total_pages_viewed',\n", " 'last_visit_timestamp', 'distinct_devices_used']].head())\n", "\n", "# Fill NaNs for customers with no website visits\n", "fill_cols_visit = ['total_visits', 'total_session_duration', 'total_pages_viewed',\n", " 'distinct_devices_used', 'distinct_channels_used']\n", "for col in fill_cols_visit:\n", " customers_fe[col].fillna(0, inplace=True)\n", "\n", "\n", "# Example: Aggregate transaction data onto the product table\n", "print(\"\\nExample: Aggregating Transaction Data onto Product Table\")\n", "product_transaction_agg = transactions_fe.groupby('product_id').agg(\n", " total_sales_product=('total_price', 'sum'),\n", " total_quantity_sold=('quantity', 'sum'),\n", " avg_price_sold=('total_price', 'mean'), # Average price including discounts\n", " num_transactions_product=('transaction_id', 'nunique'),\n", " num_customers_bought=('customer_id', 'nunique')\n", ").reset_index()\n", "\n", "# Merge into products DataFrame\n", "products_fe = pd.merge(products_fe, product_transaction_agg, on='product_id', how='left')\n", "print(products_fe[['product_id', 'product_name', 'total_sales_product', 'total_quantity_sold',\n", " 'num_customers_bought']].head())\n", "\n", "# Fill NaNs for products with no sales\n", "fill_cols_product = ['total_sales_product', 'total_quantity_sold', 'num_transactions_product', 'num_customers_bought']\n", "for col in fill_cols_product:\n", " products_fe[col].fillna(0, inplace=True)\n" ] }, { "cell_type": "code", "execution_count": null, "id": "1a2f655f", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "6c0dd061", "metadata": {}, "outputs": [], "source": [ "# --- 5.3 Creating Interaction Features ---\n", "print(\"\\n--- 5.3 Creating Interaction Features ---\")\n", "# Combine existing features to create new ones that capture interactions.\n", "\n", "# Example: Interaction between quantity and unit_price (already exists as total_price before discount)\n", "# Let's create one that includes discount effect explicitly\n", "transactions_fe['price_after_discount'] = transactions_fe['unit_price'] * (1 - transactions_fe['discount'])\n", "transactions_fe['quantity_x_price_after_discount'] = transactions_fe['quantity'] * transactions_fe['price_after_discount']\n", "# This should be equal to total_price (before any potential NaNs were introduced)\n", "# print((transactions_fe['quantity_x_price_after_discount'] - transactions_fe['total_price']).abs().sum()) # Check if they are close\n", "\n", "\n", "# Example: Interaction between age_group and loyalty_score (Customers)\n", "# This might be useful if certain age groups respond differently based on loyalty.\n", "# Need to handle categorical 'age_group' - could use one-hot encoding or map to numerical first.\n", "# Using the numeric age group created earlier:\n", "if 'age_group_numeric' in customers_fe.columns and 'loyalty_score' in customers_fe.columns:\n", " # Handle 
potential NaNs in loyalty_score and age_group_numeric before multiplication\n",
"    customers_fe['loyalty_score_filled'] = customers_fe['loyalty_score'].fillna(customers_fe['loyalty_score'].mean()) # Impute NaNs for interaction\n",
"    customers_fe['age_group_numeric_filled'] = customers_fe['age_group_numeric'].fillna(customers_fe['age_group_numeric'].median()) # Impute NaNs\n",
"\n",
"    customers_fe['age_loyalty_interaction'] = customers_fe['age_group_numeric_filled'] * customers_fe['loyalty_score_filled']\n",
"    print(\"\\nExample: Age Group * Loyalty Score Interaction Feature (Customers)\")\n",
"    print(customers_fe[['age_group', 'age_group_numeric', 'loyalty_score', 'age_loyalty_interaction']].head())\n",
"    customers_fe.drop(columns=['loyalty_score_filled', 'age_group_numeric_filled'], inplace=True) # Clean up temp columns\n",
"\n",
"\n",
"# --- 5.4 Creating Polynomial Features ---\n",
"print(\"\\n--- 5.4 Creating Polynomial Features ---\")\n",
"# Create polynomial combinations of existing features (e.g., feature^2, feature1 * feature2).\n",
"# Often used in regression.\n",
"\n",
"# Example: Polynomial features for 'lifetime_value' and 'avg_order_value' (Customers)\n",
"print(\"\\nExample: Polynomial Features for Lifetime Value and Avg Order Value (Customers)\")\n",
"# Handle NaNs before creating polynomial features\n",
"customers_fe['lifetime_value_filled'] = customers_fe['lifetime_value'].fillna(customers_fe['lifetime_value'].mean())\n",
"customers_fe['avg_order_value_filled'] = customers_fe['avg_order_value'].fillna(customers_fe['avg_order_value'].mean())\n",
"\n",
"customers_fe['lifetime_value_sq'] = customers_fe['lifetime_value_filled'] ** 2\n",
"customers_fe['avg_order_value_sq'] = customers_fe['avg_order_value_filled'] ** 2\n",
"customers_fe['lifetime_avg_order_interaction'] = customers_fe['lifetime_value_filled'] * customers_fe['avg_order_value_filled']\n",
"\n",
"print(customers_fe[['lifetime_value', 'avg_order_value', 'lifetime_value_sq',\n",
"                    'avg_order_value_sq', 'lifetime_avg_order_interaction']].head())\n",
"customers_fe.drop(columns=['lifetime_value_filled', 'avg_order_value_filled'], inplace=True) # Clean up temp columns\n" ] }, { "cell_type": "code", "execution_count": null, "id": "8968b295", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "d028dcba", "metadata": {}, "outputs": [], "source": [
"# --- 5.5 Encoding Categorical Features ---\n",
"print(\"\\n--- 5.5 Encoding Categorical Features ---\")\n",
"# Convert categorical variables into a numerical format that can be used by machine learning models.\n",
"\n",
"# Example: One-Hot Encoding for 'gender' and 'location' (Customers)\n",
"print(\"\\nExample: One-Hot Encoding 'gender' and 'location' (Customers)\")\n",
"# Handle NaNs before encoding if you don't want them to become a separate category or be dropped.\n",
"# Option 1: Fill NaNs\n",
"customers_fe['gender_filled'] = customers_fe['gender'].fillna('Unknown')\n",
"customers_fe['location_filled'] = customers_fe['location'].fillna('Unknown')\n",
"\n",
"customers_encoded_ohe = pd.get_dummies(\n",
"    customers_fe,\n",
"    columns=['gender_filled', 'location_filled'],\n",
"    prefix=['gender', 'location'], # Prefix for the new columns\n",
"    dummy_na=False # Set to True if you want NaN to be its own category (if not filled)\n",
")\n",
"# get_dummies drops the encoded source columns ('gender_filled', 'location_filled'), and\n",
"# 'location' has no missing values, so a 'location_Unknown' dummy may not be created.\n",
"# Make the preview below robust by creating any missing 'Unknown' dummy columns.\n",
"for col in ['gender_Unknown', 'location_Unknown']:\n",
"    if col not in customers_encoded_ohe.columns:\n",
"        customers_encoded_ohe[col] = 0\n",
"\n",
"print(customers_encoded_ohe[['gender', 'gender_Female', 'gender_Male', 'gender_Non-binary', 'gender_Prefer not to say', 'gender_Unknown',\n",
"                             'location', 
'location_Central', 'location_East', 'location_North', 'location_South', 'location_West', 'location_Unknown']].head())\n", "customers_fe.drop(columns=['gender_filled', 'location_filled'], inplace=True) # Clean up temp columns\n", "\n", "\n", "# Example: Label Encoding for 'customer_segment' (Customers)\n", "print(\"\\nExample: Label Encoding 'customer_segment' (Customers)\")\n", "# Label Encoding assigns a unique integer to each category. Useful for ordinal data or tree-based models.\n", "# Handle NaNs before encoding if needed.\n", "customers_fe['customer_segment_filled'] = customers_fe['customer_segment'].fillna('Unknown')\n", "\n", "label_encoder = LabelEncoder()\n", "customers_fe['customer_segment_encoded'] = label_encoder.fit_transform(customers_fe['customer_segment_filled'])\n", "print(customers_fe[['customer_segment', 'customer_segment_filled', 'customer_segment_encoded']].head())\n", "print(\"Mapping of encoded values:\", dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_))))\n", "customers_fe.drop(columns=['customer_segment_filled'], inplace=True) # Clean up temp column\n" ] }, { "cell_type": "code", "execution_count": null, "id": "21997160", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "0f0b5ae9", "metadata": {}, "outputs": [], "source": [ "# --- 5.6 Binning Numerical Features ---\n", "print(\"\\n--- 5.6 Binning Numerical Features ---\")\n", "# Convert a continuous numerical variable into discrete bins (categorical).\n", "\n", "# Example: Binning 'loyalty_score' into loyalty levels (Customers)\n", "print(\"\\nExample: Binning 'loyalty_score' into Loyalty Levels (Customers)\")\n", "# Define bin edges and labels\n", "bins = [0, 20, 50, 80, 100]\n", "labels = ['Low', 'Medium', 'High', 'Very High']\n", "\n", "# Handle NaNs in loyalty_score before binning\n", "customers_fe['loyalty_score_filled'] = customers_fe['loyalty_score'].fillna(customers_fe['loyalty_score'].mean())\n", "\n", "customers_fe['loyalty_level'] = pd.cut(customers_fe['loyalty_score_filled'], bins=bins, labels=labels, right=True, include_lowest=True)\n", "print(customers_fe[['loyalty_score', 'loyalty_score_filled', 'loyalty_level']].head())\n", "print(\"\\nValue counts for 'loyalty_level':\")\n", "print(customers_fe['loyalty_level'].value_counts(dropna=False)) # Check distribution\n", "customers_fe.drop(columns=['loyalty_score_filled'], inplace=True) # Clean up temp column\n", "\n", "\n", "# Example: Binning 'avg_session_duration' into session duration categories (Website Visits)\n", "print(\"\\nExample: Binning 'avg_session_duration' into Session Duration Categories (Website Visits)\")\n", "# Need to calculate avg_session_duration per customer first if not already done\n", "# (Assuming we use the customers_fe DataFrame which now has this aggregated feature)\n", "if 'avg_session_duration' in customers_fe.columns:\n", " # Define bins (e.g., Short, Medium, Long)\n", " # Handle NaNs before binning\n", " customers_fe['avg_session_duration_filled'] = customers_fe['avg_session_duration'].fillna(customers_fe['avg_session_duration'].median()) # Use median for skewed data\n", "\n", " session_bins = [0, 60, 300, customers_fe['avg_session_duration_filled'].max()] # 0-60s, 60-300s, >300s\n", " session_labels = ['Short', 'Medium', 'Long']\n", "\n", " customers_fe['session_duration_category'] = pd.cut(customers_fe['avg_session_duration_filled'],\n", " bins=session_bins, labels=session_labels,\n", " right=True, include_lowest=True)\n", " 
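# (Added sketch) pd.qcut is the quantile-based counterpart of pd.cut: it chooses bin edges\n",
"    # so that each bin holds roughly the same number of customers. labels=False returns integer\n",
"    # bin codes, and duplicates='drop' keeps this robust if the median-imputed values create\n",
"    # repeated edges. 'session_duration_quartile' is an illustrative extra column.\n",
"    customers_fe['session_duration_quartile'] = pd.qcut(\n",
"        customers_fe['avg_session_duration_filled'], q=4, labels=False, duplicates='drop')\n",
"    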
print(customers_fe[['avg_session_duration', 'avg_session_duration_filled', 'session_duration_category']].head())\n", " print(\"\\nValue counts for 'session_duration_category':\")\n", " print(customers_fe['session_duration_category'].value_counts(dropna=False))\n", " customers_fe.drop(columns=['avg_session_duration_filled'], inplace=True) # Clean up temp column\n", "else:\n", " print(\"\\n'avg_session_duration' not found in customers_fe. Skipping binning example.\")\n" ] }, { "cell_type": "code", "execution_count": null, "id": "a941bdbe", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "eee9c52a", "metadata": {}, "outputs": [], "source": [ "# --- 5.7 Feature Scaling ---\n", "print(\"\\n--- 5.7 Feature Scaling ---\")\n", "# Normalize or standardize numerical features so they have a similar scale.\n", "# Important for algorithms sensitive to feature scales (e.g., SVM, KNN, K-Means, Linear Regression with regularization).\n", "\n", "# Example: Standard Scaling for 'total_spend' and 'recency' (Customers)\n", "print(\"\\nExample: Standard Scaling for 'total_spend' and 'recency' (Customers)\")\n", "# Use the customers_fe DataFrame with aggregated features\n", "features_to_scale_std = ['total_spend', 'recency']\n", "\n", "# Handle NaNs before scaling (e.g., impute)\n", "for col in features_to_scale_std:\n", " if col in customers_fe.columns:\n", " customers_fe[f'{col}_filled_for_scaling'] = customers_fe[col].fillna(customers_fe[col].mean())\n", " else:\n", " print(f\"Warning: Column '{col}' not found in customers_fe for scaling.\")\n", "\n", "# Select the filled columns for scaling\n", "cols_for_std_scaling = [f'{col}_filled_for_scaling' for col in features_to_scale_std if f'{col}_filled_for_scaling' in customers_fe.columns]\n", "\n", "if cols_for_std_scaling:\n", " scaler_std = StandardScaler()\n", " customers_fe[cols_for_std_scaling] = scaler_std.fit_transform(customers_fe[cols_for_std_scaling])\n", "\n", " # Rename scaled columns for clarity\n", " for col in features_to_scale_std:\n", " if f'{col}_filled_for_scaling' in customers_fe.columns:\n", " customers_fe[f'{col}_scaled_std'] = customers_fe[f'{col}_filled_for_scaling']\n", " customers_fe.drop(columns=[f'{col}_filled_for_scaling'], inplace=True)\n", "\n", " print(customers_fe[['total_spend', 'total_spend_scaled_std', 'recency', 'recency_scaled_std']].head())\n", "else:\n", " print(\"No columns available for Standard Scaling example.\")\n", "\n", "\n", "# Example: Min-Max Scaling for 'loyalty_score' and 'avg_order_value' (Customers)\n", "print(\"\\nExample: Min-Max Scaling for 'loyalty_score' and 'avg_order_value' (Customers)\")\n", "features_to_scale_minmax = ['loyalty_score', 'avg_order_value']\n", "\n", "# Handle NaNs before scaling (e.g., impute)\n", "for col in features_to_scale_minmax:\n", " if col in customers_fe.columns:\n", " customers_fe[f'{col}_filled_for_scaling'] = customers_fe[col].fillna(customers_fe[col].mean())\n", " else:\n", " print(f\"Warning: Column '{col}' not found in customers_fe for scaling.\")\n", "\n", "# Select the filled columns for scaling\n", "cols_for_minmax_scaling = [f'{col}_filled_for_scaling' for col in features_to_scale_minmax if f'{col}_filled_for_scaling' in customers_fe.columns]\n", "\n", "if cols_for_minmax_scaling:\n", " scaler_minmax = MinMaxScaler()\n", " customers_fe[cols_for_minmax_scaling] = scaler_minmax.fit_transform(customers_fe[cols_for_minmax_scaling])\n", "\n", " # Rename scaled columns for clarity\n", " for col in 
features_to_scale_minmax:\n", " if f'{col}_filled_for_scaling' in customers_fe.columns:\n", " customers_fe[f'{col}_scaled_minmax'] = customers_fe[f'{col}_filled_for_scaling']\n", " customers_fe.drop(columns=[f'{col}_filled_for_scaling'], inplace=True)\n", "\n", " print(customers_fe[['loyalty_score', 'loyalty_score_scaled_minmax', 'avg_order_value', 'avg_order_value_scaled_minmax']].head())\n", "else:\n", " print(\"No columns available for Min-Max Scaling example.\")\n", "\n", "\n", "# --- 5.8 Dimensionality Reduction (PCA, t-SNE) ---\n", "print(\"\\n--- 5.8 Dimensionality Reduction (PCA, t-SNE) ---\")\n", "# Reduce the number of features while retaining important information.\n", "\n", "# Example: PCA on numerical features (Customers)\n", "print(\"\\nExample: PCA on Numerical Features (Customers)\")\n", "# Select numerical features for PCA (excluding IDs and target-like features if applicable)\n", "numerical_features_for_pca = customers_fe.select_dtypes(include=np.number).columns.tolist()\n", "# Exclude ID columns and potentially features derived from aggregation that might be target-like\n", "exclude_cols_pca = ['customer_id', 'age_group_numeric', 'total_transactions', 'total_spend',\n", " 'avg_transaction_value', 'min_transaction_value', 'max_transaction_value',\n", " 'total_items_bought', 'avg_items_per_transaction', 'recency',\n", " 'total_visits', 'total_session_duration', 'avg_session_duration',\n", " 'total_pages_viewed', 'avg_pages_per_visit', 'distinct_devices_used',\n", " 'distinct_channels_used', 'lifetime_value_sq', 'avg_order_value_sq',\n", " 'lifetime_avg_order_interaction', 'age_loyalty_interaction',\n", " 'total_spend_scaled_std', 'recency_scaled_std', 'loyalty_score_scaled_minmax',\n", " 'avg_order_value_scaled_minmax'] # Exclude scaled/derived features\n", "\n", "numerical_features_for_pca = [col for col in numerical_features_for_pca if col not in exclude_cols_pca]\n", "\n", "print(f\"Features used for PCA: {numerical_features_for_pca}\")\n", "\n", "if numerical_features_for_pca:\n", " X_pca = customers_fe[numerical_features_for_pca].dropna() # Handle NaNs for PCA\n", "\n", " if not X_pca.empty:\n", " # Scale the data before applying PCA\n", " scaler_pca = StandardScaler()\n", " X_pca_scaled = scaler_pca.fit_transform(X_pca)\n", "\n", " # Apply PCA\n", " pca = PCA(n_components=2) # Reduce to 2 principal components\n", " X_pca_result = pca.fit_transform(X_pca_scaled)\n", "\n", " # Add PCA components back to the DataFrame (aligning by index)\n", " customers_fe_pca = customers_fe.copy()\n", " # Create a temporary DataFrame for PCA results with the original index\n", " pca_results_df = pd.DataFrame(X_pca_result, columns=['PCA1', 'PCA2'], index=X_pca.index)\n", " # Merge using the index\n", " customers_fe_pca = customers_fe_pca.merge(pca_results_df, left_index=True, right_index=True, how='left')\n", "\n", "\n", " print(\"\\nCustomers DataFrame with PCA components:\")\n", " print(customers_fe_pca[['customer_id'] + numerical_features_for_pca + ['PCA1', 'PCA2']].head())\n", "\n", " # Visualize PCA results (scatter plot)\n", " plt.figure(figsize=(10, 8))\n", " # Use original DataFrame to include categorical info for coloring if desired\n", " plt.scatter(customers_fe_pca['PCA1'], customers_fe_pca['PCA2'], alpha=0.5) # Simple scatter\n", " plt.title('PCA of Customer Numerical Features')\n", " plt.xlabel('Principal Component 1')\n", " plt.ylabel('Principal Component 2')\n", " plt.grid(True)\n", " plt.show()\n", "\n", " print(\"\\nExplained variance ratio by principal components:\", 
pca.explained_variance_ratio_)\n", " print(\"Total explained variance by 2 components:\", pca.explained_variance_ratio_.sum().round(4))\n", "\n", " else:\n", " print(\"No non-null data available for PCA after dropping NaNs in selected columns.\")\n", "else:\n", " print(\"No suitable numerical features found for PCA after exclusions.\")\n", "\n", "\n", "# Example: t-SNE on numerical features (Customers)\n", "print(\"\\nExample: t-SNE on Numerical Features (Customers)\")\n", "# t-SNE is computationally more expensive and typically used for visualization.\n", "# Use a smaller subset of data for demonstration if the dataset is large.\n", "# Let's use the same scaled data from PCA example, but maybe a subset of customers.\n", "if 'PCA1' in customers_fe_pca.columns: # Check if PCA was successful and created the PCA DataFrame\n", " # Use the scaled data X_pca_scaled, subsetting if necessary\n", " # For demonstration, let's use a random sample of 200 customers\n", " if len(X_pca_scaled) > 200:\n", " sample_indices = np.random.choice(X_pca_scaled.shape[0], 200, replace=False)\n", " X_tsne_sample = X_pca_scaled[sample_indices]\n", " tsne_original_indices = X_pca.iloc[sample_indices].index # Get original indices of the sample\n", " print(f\"Using a random sample of 200 customers for t-SNE.\")\n", " else:\n", " X_tsne_sample = X_pca_scaled\n", " tsne_original_indices = X_pca.index\n", " print(f\"Using all {len(X_tsne_sample)} customers for t-SNE.\")\n", "\n", "\n", " if len(X_tsne_sample) > 1: # t-SNE requires at least 2 samples\n", " # Apply t-SNE\n", " tsne = TSNE(n_components=2, random_state=42, perplexity=30, n_iter=300) # Common parameters\n", " X_tsne_result = tsne.fit_transform(X_tsne_sample)\n", "\n", " # Create a DataFrame for t-SNE results with the original sample indices\n", " tsne_results_df = pd.DataFrame(X_tsne_result, columns=['TSNE1', 'TSNE2'], index=tsne_original_indices)\n", "\n", " # Merge t-SNE results back to the main customers DataFrame (using the temporary PCA dataframe for convenience)\n", " customers_fe_tsne = customers_fe_pca.merge(tsne_results_df, left_index=True, right_index=True, how='left')\n", "\n", " print(\"\\nCustomers DataFrame with t-SNE components (for sampled data):\")\n", " # Display head might not show TSNE if sample didn't include first rows\n", " print(customers_fe_tsne[['customer_id', 'TSNE1', 'TSNE2']].dropna().head())\n", "\n", "\n", " # Visualize t-SNE results (scatter plot)\n", " plt.figure(figsize=(10, 8))\n", " # Plot only the sampled data points that have TSNE results\n", " plt.scatter(customers_fe_tsne['TSNE1'], customers_fe_tsne['TSNE2'], alpha=0.6)\n", " plt.title('t-SNE of Customer Numerical Features (Sampled Data)')\n", " plt.xlabel('t-SNE Component 1')\n", " plt.ylabel('t-SNE Component 2')\n", " plt.grid(True)\n", " plt.show()\n", " else:\n", " print(\"Not enough samples (need at least 2) to perform t-SNE.\")\n", "else:\n", " print(\"PCA results not available. Skipping t-SNE example.\")\n", "\n", "\n", "print(\"\\nFeature engineering examples complete.\")\n", "print(f\"{'='*80}\")\n" ] }, { "cell_type": "code", "execution_count": null, "id": "8ab88600", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "03c2bca3", "metadata": {}, "outputs": [], "source": [ "# ## 6. 
Regression Analysis\n", "\n", "# %% tags=[]\n", "print(f\"\\n{'='*80}\")\n", "print(f\"PART 6: REGRESSION ANALYSIS\")\n", "print(f\"{'='*80}\")\n", "\n", "# Use the California Housing dataset for regression examples\n", "# Target variable: 'target' (Median house value in 100k USD)\n", "# Features: Various housing and demographic metrics\n", "\n", "print(\"\\n--- Regression Analysis Examples (using California Housing Dataset) ---\")\n", "\n", "# Load the dataset (already loaded in Part 1)\n", "# housing_df = pd.DataFrame(housing_data.data, columns=housing_data.feature_names)\n", "# housing_df['target'] = housing_data.target\n", "\n", "# --- 6.1 Data Preparation for Regression ---\n", "print(\"\\n--- 6.1 Data Preparation for Regression ---\")\n", "\n", "# Separate features (X) and target (y)\n", "X = housing_df.drop('target', axis=1)\n", "y = housing_df['target']\n", "\n", "# Split data into training and testing sets\n", "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n", "\n", "print(f\"Training data shape: {X_train.shape}, {y_train.shape}\")\n", "print(f\"Testing data shape: {X_test.shape}, {y_test.shape}\")\n", "\n", "# Identify numerical and categorical features (California Housing is all numerical)\n", "numerical_features = X.columns.tolist()\n", "categorical_features = [] # No categorical features in this dataset\n", "\n", "# Create a preprocessing pipeline (scaling numerical features)\n", "# Use ColumnTransformer even if only numerical for good practice with mixed types\n", "preprocessor = ColumnTransformer(\n", " transformers=[\n", " ('num', StandardScaler(), numerical_features)\n", " ],\n", " remainder='passthrough' # Keep other columns (none in this case)\n", ")\n" ] }, { "cell_type": "code", "execution_count": null, "id": "bbe7fa8e", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "848a534f", "metadata": {}, "outputs": [], "source": [ "# --- 6.2 Linear Regression ---\n", "print(\"\\n--- 6.2 Linear Regression ---\")\n", "\n", "# Create a pipeline with preprocessing and Linear Regression\n", "pipeline_lr = Pipeline(steps=[('preprocessor', preprocessor),\n", " ('regressor', LinearRegression())])\n", "\n", "# Train the model\n", "pipeline_lr.fit(X_train, y_train)\n", "\n", "# Make predictions\n", "y_pred_lr = pipeline_lr.predict(X_test)\n", "\n", "# Evaluate the model\n", "mse_lr = mean_squared_error(y_test, y_pred_lr)\n", "rmse_lr = np.sqrt(mse_lr)\n", "r2_lr = r2_score(y_test, y_pred_lr)\n", "\n", "print(f\"\\nLinear Regression Results:\")\n", "print(f\"  Mean Squared Error (MSE): {mse_lr:.4f}\")\n", "print(f\"  Root Mean Squared Error (RMSE): {rmse_lr:.4f}\")\n", "print(f\"  R-squared (R2): {r2_lr:.4f}\")\n", "\n", "# Visualize predictions vs actual (scatter plot)\n", "plt.figure(figsize=(10, 6))\n", "plt.scatter(y_test, y_pred_lr, alpha=0.5)\n", "plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=2) # Diagonal line\n", "plt.title('Linear Regression: Actual vs Predicted House Values')\n", "plt.xlabel('Actual House Value ($100k)')\n", "plt.ylabel('Predicted House Value ($100k)')\n", "plt.grid(True)\n", "plt.show()\n" ] }, { "cell_type": "code", "execution_count": null, "id": "fa3c8ea5", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "6725e934", "metadata": {}, "outputs": [], "source": [ "# --- 6.3 Ridge Regression (with Regularization) ---\n", "print(\"\\n--- 6.3 Ridge Regression (with Regularization) 
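---\")\n",
"\n",
"# (Added illustrative sketch, not part of the original walkthrough) Effect of the regularization\n",
"# strength: a larger alpha shrinks the coefficient vector harder. 'alpha_demo', 'demo_ridge' and\n",
"# 'coef_norm' are throwaway names used only for this comparison.\n",
"for alpha_demo in [0.01, 1.0, 100.0]:\n",
"    demo_ridge = Pipeline(steps=[('preprocessor', preprocessor),\n",
"                                 ('regressor', Ridge(alpha=alpha_demo))])\n",
"    demo_ridge.fit(X_train, y_train)\n",
"    coef_norm = np.linalg.norm(demo_ridge.named_steps['regressor'].coef_)\n",
"    print(f\"  alpha={alpha_demo}: L2 norm of coefficients = {coef_norm:.3f}\")\n",
"\n",
"print(\"Now fitting the full Ridge pipeline 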
---\")\n", "\n", "# Create a pipeline with preprocessing and Ridge Regression\n", "pipeline_ridge = Pipeline(steps=[('preprocessor', preprocessor),\n", " ('regressor', Ridge(alpha=1.0))]) # alpha is the regularization strength\n", "\n", "# Train the model\n", "pipeline_ridge.fit(X_train, y_train)\n", "\n", "# Make predictions\n", "y_pred_ridge = pipeline_ridge.predict(X_test)\n", "\n", "# Evaluate the model\n", "mse_ridge = mean_squared_error(y_test, y_pred_ridge)\n", "rmse_ridge = np.sqrt(mse_ridge)\n", "r2_ridge = r2_score(y_test, y_pred_ridge)\n", "\n", "print(f\"\\nRidge Regression Results (alpha=1.0):\")\n", "print(f\"  Mean Squared Error (MSE): {mse_ridge:.4f}\")\n", "print(f\"  Root Mean Squared Error (RMSE): {rmse_ridge:.4f}\")\n", "print(f\"  R-squared (R2): {r2_ridge:.4f}\")\n" ] }, { "cell_type": "code", "execution_count": null, "id": "160cd7b3", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "c1934c0c", "metadata": {}, "outputs": [], "source": [ "# --- 6.4 Lasso Regression (with Regularization) ---\n", "print(\"\\n--- 6.4 Lasso Regression (with Regularization) ---\")\n", "\n", "# Create a pipeline with preprocessing and Lasso Regression\n", "pipeline_lasso = Pipeline(steps=[('preprocessor', preprocessor),\n", " ('regressor', Lasso(alpha=0.1))]) # alpha is the regularization strength\n", "\n", "# Train the model\n", "pipeline_lasso.fit(X_train, y_train)\n", "\n", "# Make predictions\n", "y_pred_lasso = pipeline_lasso.predict(X_test)\n", "\n", "# Evaluate the model\n", "mse_lasso = mean_squared_error(y_test, y_pred_lasso)\n", "rmse_lasso = np.sqrt(mse_lasso)\n", "r2_lasso = r2_score(y_test, y_pred_lasso)\n", "\n", "print(f\"\\nLasso Regression Results (alpha=0.1):\")\n", "print(f\"  Mean Squared Error (MSE): {mse_lasso:.4f}\")\n", "print(f\"  Root Mean Squared Error (RMSE): {rmse_lasso:.4f}\")\n", "print(f\"  R-squared (R2): {r2_lasso:.4f}\")\n" ] }, { "cell_type": "code", "execution_count": null, "id": "9ea75792", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "06f86a7b", "metadata": {}, "outputs": [], "source": [ "# --- 6.5 ElasticNet Regression (Combination of Ridge and Lasso) ---\n", "print(\"\\n--- 6.5 ElasticNet Regression (Combination of Ridge and Lasso) ---\")\n", "\n", "# Create a pipeline with preprocessing and ElasticNet Regression\n", "pipeline_enet = Pipeline(steps=[('preprocessor', preprocessor),\n", " ('regressor', ElasticNet(alpha=0.1, l1_ratio=0.5))]) # alpha and l1_ratio\n", "\n", "# Train the model\n", "pipeline_enet.fit(X_train, y_train)\n", "\n", "# Make predictions\n", "y_pred_enet = pipeline_enet.predict(X_test)\n", "\n", "# Evaluate the model\n", "mse_enet = mean_squared_error(y_test, y_pred_enet)\n", "rmse_enet = np.sqrt(mse_enet)\n", "r2_enet = r2_score(y_test, y_pred_enet)\n", "\n", "print(f\"\\nElasticNet Regression Results (alpha=0.1, l1_ratio=0.5):\")\n", "print(f\"  Mean Squared Error (MSE): {mse_enet:.4f}\")\n", "print(f\"  Root Mean Squared Error (RMSE): {rmse_enet:.4f}\")\n", "print(f\"  R-squared (R2): {r2_enet:.4f}\")\n" ] }, { "cell_type": "code", "execution_count": null, "id": "4b7fb4c1", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "61ba9966", "metadata": {}, "outputs": [], "source": [ "# --- 6.6 Random Forest Regressor ---\n", "print(\"\\n--- 6.6 Random Forest Regressor ---\")\n", "\n", "# Create a pipeline with preprocessing and Random Forest Regressor\n", "pipeline_rf = 
Pipeline(steps=[('preprocessor', preprocessor),\n", " ('regressor', RandomForestRegressor(n_estimators=10, random_state=42))])\n", "\n", "# Train the model\n", "pipeline_rf.fit(X_train, y_train)\n", "\n", "# Make predictions\n", "y_pred_rf = pipeline_rf.predict(X_test)\n", "\n", "# Evaluate the model\n", "mse_rf = mean_squared_error(y_test, y_pred_rf)\n", "rmse_rf = np.sqrt(mse_rf)\n", "r2_rf = r2_score(y_test, y_pred_rf)\n", "\n", "print(f\"\\nRandom Forest Regressor Results:\")\n", "print(f\"  Mean Squared Error (MSE): {mse_rf:.4f}\")\n", "print(f\"  Root Mean Squared Error (RMSE): {rmse_rf:.4f}\")\n", "print(f\"  R-squared (R2): {r2_rf:.4f}\")\n", "\n", "# Feature Importance (from Random Forest)\n", "print(\"\\nRandom Forest Feature Importances:\")\n", "# Get feature names after preprocessing\n", "feature_names = numerical_features # Since no categorical features were one-hot encoded\n", "importances = pipeline_rf.named_steps['regressor'].feature_importances_\n", "feature_importances = pd.Series(importances, index=feature_names).sort_values(ascending=False)\n", "print(feature_importances)\n", "\n", "# Visualize Feature Importances\n", "plt.figure(figsize=(10, 6))\n", "feature_importances.plot(kind='bar', color='darkorange')\n", "plt.title('Random Forest Regressor Feature Importances')\n", "plt.xlabel('Features')\n", "plt.ylabel('Importance')\n", "plt.xticks(rotation=45, ha='right')\n", "plt.tight_layout()\n", "plt.show()\n" ] }, { "cell_type": "code", "execution_count": null, "id": "9866146e", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "4e14c2db", "metadata": {}, "outputs": [], "source": [ "# --- 6.7 Gradient Boosting Regressor ---\n", "print(\"\\n--- 6.7 Gradient Boosting Regressor ---\")\n", "\n", "# Create a pipeline with preprocessing and Gradient Boosting Regressor\n", "pipeline_gbr = Pipeline(steps=[('preprocessor', preprocessor),\n", " ('regressor', GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42))])\n", "\n", "# Train the model\n", "pipeline_gbr.fit(X_train, y_train)\n", "\n", "# Make predictions\n", "y_pred_gbr = pipeline_gbr.predict(X_test)\n", "\n", "# Evaluate the model\n", "mse_gbr = mean_squared_error(y_test, y_pred_gbr)\n", "rmse_gbr = np.sqrt(mse_gbr)\n", "r2_gbr = r2_score(y_test, y_pred_gbr)\n", "\n", "print(f\"\\nGradient Boosting Regressor Results:\")\n", "print(f\"  Mean Squared Error (MSE): {mse_gbr:.4f}\")\n", "print(f\"  Root Mean Squared Error (RMSE): {rmse_gbr:.4f}\")\n", "print(f\"  R-squared (R2): {r2_gbr:.4f}\")\n" ] }, { "cell_type": "code", "execution_count": null, "id": "c7cecf35", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "c056abc2", "metadata": {}, "outputs": [], "source": [ "# --- 6.8 Support Vector Regressor (SVR) ---\n", "print(\"\\n--- 6.8 Support Vector Regressor (SVR) ---\")\n", "\n", "# Create a pipeline with preprocessing and SVR\n", "pipeline_svr = Pipeline(steps=[('preprocessor', preprocessor),\n", " ('regressor', SVR(kernel='rbf', C=1.0, epsilon=0.1))]) # RBF kernel is common\n", "\n", "# Train the model (SVR can be slower on larger datasets)\n", "print(\"\\nTraining SVR (this might take a moment)...\")\n", "pipeline_svr.fit(X_train, y_train)\n", "print(\"SVR training complete.\")\n", "\n", "# Make predictions\n", "y_pred_svr = pipeline_svr.predict(X_test)\n", "\n", "# Evaluate the model\n", "mse_svr = mean_squared_error(y_test, y_pred_svr)\n", "rmse_svr = np.sqrt(mse_svr)\n", "r2_svr 
= r2_score(y_test, y_pred_svr)\n", "\n", "print(f\"\\nSupport Vector Regressor (SVR) Results:\")\n", "print(f\"  Mean Squared Error (MSE): {mse_svr:.4f}\")\n", "print(f\"  Root Mean Squared Error (RMSE): {rmse_svr:.4f}\")\n", "print(f\"  R-squared (R2): {r2_svr:.4f}\")\n" ] }, { "cell_type": "code", "execution_count": null, "id": "620f1225", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "ae531aad", "metadata": {}, "outputs": [], "source": [ "# --- 6.9 K-Nearest Neighbors Regressor ---\n", "print(\"\\n--- 6.9 K-Nearest Neighbors Regressor ---\")\n", "\n", "# Create a pipeline with preprocessing and KNN Regressor\n", "pipeline_knn_r = Pipeline(steps=[('preprocessor', preprocessor),\n", " ('regressor', KNeighborsRegressor(n_neighbors=5))]) # Use 5 neighbors\n", "\n", "# Train the model\n", "pipeline_knn_r.fit(X_train, y_train)\n", "\n", "# Make predictions\n", "y_pred_knn_r = pipeline_knn_r.predict(X_test)\n", "\n", "# Evaluate the model\n", "mse_knn_r = mean_squared_error(y_test, y_pred_knn_r)\n", "rmse_knn_r = np.sqrt(mse_knn_r)\n", "r2_knn_r = r2_score(y_test, y_pred_knn_r)\n", "\n", "print(f\"\\nK-Nearest Neighbors Regressor Results (n_neighbors=5):\")\n", "print(f\"  Mean Squared Error (MSE): {mse_knn_r:.4f}\")\n", "print(f\"  Root Mean Squared Error (RMSE): {rmse_knn_r:.4f}\")\n", "print(f\"  R-squared (R2): {r2_knn_r:.4f}\")\n" ] }, { "cell_type": "code", "execution_count": null, "id": "bc4ca8ef", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "e379b7e9", "metadata": {}, "outputs": [], "source": [ "# --- 6.10 Cross-Validation for Model Evaluation ---\n", "print(\"\\n--- 6.10 Cross-Validation for Model Evaluation ---\")\n", "\n", "# Use cross-validation to get a more robust estimate of model performance\n", "# Example using Linear Regression pipeline\n", "print(\"\\nCross-validation results for Linear Regression (R-squared):\")\n", "cv_scores_lr = cross_val_score(pipeline_lr, X, y, cv=5, scoring='r2') # 5-fold cross-validation\n", "print(f\"  R-squared scores: {cv_scores_lr}\")\n", "print(f\"  Mean R-squared: {cv_scores_lr.mean():.4f}\")\n", "print(f\"  Standard Deviation of R-squared: {cv_scores_lr.std():.4f}\")\n", "\n", "# Example using Random Forest Regressor pipeline\n", "print(\"\\nCross-validation results for Random Forest Regressor (Negative MSE):\")\n", "# Use negative MSE because cross_val_score maximizes the score, so we minimize negative MSE\n", "cv_scores_rf_mse = cross_val_score(pipeline_rf, X, y, cv=5, scoring='neg_mean_squared_error')\n", "# Convert back to positive MSE and take the square root for RMSE\n", "cv_rmse_rf = np.sqrt(-cv_scores_rf_mse)\n", "print(f\"  RMSE scores: {cv_rmse_rf}\")\n", "print(f\"  Mean RMSE: {cv_rmse_rf.mean():.4f}\")\n", "print(f\"  Standard Deviation of RMSE: {cv_rmse_rf.std():.4f}\")\n" ] }, { "cell_type": "code", "execution_count": null, "id": "2afd4b1e", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "8c98e99a", "metadata": {}, "outputs": [], "source": [ "# --- 6.11 Hyperparameter Tuning (Example with GridSearchCV) ---\n", "print(\"\\n--- 6.11 Hyperparameter Tuning (Example with GridSearchCV) ---\")\n", "\n", "# Example: Tuning Ridge Regression alpha\n", "print(\"\\nExample: Hyperparameter Tuning for Ridge Regression (alpha) using GridSearchCV\")\n", "\n", "# Define the parameter grid to search\n", "param_grid_ridge = {\n", " 'regressor__alpha': [0.1, 1.0, 10.0, 100.0] # Note the 
'regressor__' prefix for pipeline steps\n", "}\n", "\n", "# Create GridSearchCV object\n", "grid_search_ridge = GridSearchCV(pipeline_ridge, param_grid_ridge, cv=5, scoring='neg_mean_squared_error', n_jobs=-1) # Use all available cores\n", "\n", "# Perform the grid search\n", "print(\"Performing GridSearchCV for Ridge (this might take a moment)...\")\n", "grid_search_ridge.fit(X_train, y_train)\n", "print(\"GridSearchCV complete.\")\n", "\n", "# Get the best parameters and best score\n", "print(f\"\\nBest parameters for Ridge: {grid_search_ridge.best_params_}\")\n", "print(f\"Best cross-validation negative MSE: {grid_search_ridge.best_score_:.4f}\")\n", "print(f\"Best cross-validation RMSE: {np.sqrt(-grid_search_ridge.best_score_):.4f}\")\n", "\n", "# Evaluate on the test set with the best model\n", "best_ridge_model = grid_search_ridge.best_estimator_\n", "y_pred_best_ridge = best_ridge_model.predict(X_test)\n", "mse_best_ridge = mean_squared_error(y_test, y_pred_best_ridge)\n", "rmse_best_ridge = np.sqrt(mse_best_ridge)\n", "r2_best_ridge = r2_score(y_test, y_pred_best_ridge)\n", "\n", "print(f\"\\nBest Ridge Model Results on Test Set:\")\n", "print(f\"  Mean Squared Error (MSE): {mse_best_ridge:.4f}\")\n", "print(f\"  Root Mean Squared Error (RMSE): {rmse_best_ridge:.4f}\")\n", "print(f\"  R-squared (R2): {r2_best_ridge:.4f}\")\n", "\n", "\n", "# Example: Tuning Random Forest Regressor (subset of parameters for speed)\n", "print(\"\\nExample: Hyperparameter Tuning for Random Forest Regressor using RandomizedSearchCV\")\n", "\n", "# Define the parameter distribution to sample from\n", "param_dist_rf = {\n", " 'regressor__n_estimators': [50, 100, 200],\n", " 'regressor__max_depth': [None, 10, 20, 30],\n", " 'regressor__min_samples_split': [2, 5, 10],\n", " 'regressor__min_samples_leaf': [1, 2, 4]\n", "}\n", "\n", "# Create RandomizedSearchCV object\n", "random_search_rf = RandomizedSearchCV(pipeline_rf, param_dist_rf, n_iter=10, cv=5, scoring='neg_mean_squared_error', random_state=42, n_jobs=-1) # Sample 10 combinations\n", "\n", "# Perform the randomized search\n", "print(\"\\nPerforming RandomizedSearchCV for Random Forest (this might take a moment)...\")\n", "random_search_rf.fit(X_train, y_train)\n", "print(\"RandomizedSearchCV complete.\")\n", "\n", "# Get the best parameters and best score\n", "print(f\"\\nBest parameters for Random Forest: {random_search_rf.best_params_}\")\n", "print(f\"Best cross-validation negative MSE: {random_search_rf.best_score_:.4f}\")\n", "print(f\"Best cross-validation RMSE: {np.sqrt(-random_search_rf.best_score_):.4f}\")\n", "\n", "# Evaluate on the test set with the best model\n", "best_rf_model = random_search_rf.best_estimator_\n", "y_pred_best_rf = best_rf_model.predict(X_test)\n", "mse_best_rf = mean_squared_error(y_test, y_pred_best_rf)\n", "rmse_best_rf = np.sqrt(mse_best_rf)\n", "r2_best_rf = r2_score(y_test, y_pred_best_rf)\n", "\n", "print(f\"\\nBest Random Forest Model Results on Test Set:\")\n", "print(f\"  Mean Squared Error (MSE): {mse_best_rf:.4f}\")\n", "print(f\"  Root Mean Squared Error (RMSE): {rmse_best_rf:.4f}\")\n", "print(f\"  R-squared (R2): {r2_best_rf:.4f}\")\n", "\n", "\n", "print(\"\\nRegression analysis examples complete.\")\n", "print(f\"{'='*80}\")\n" ] }, { "cell_type": "code", "execution_count": null, "id": "505c1555", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "02c7fe7b", "metadata": {}, "outputs": [], "source": [ " ## 7. 
Classification Analysis\n", "\n", "# %% tags=[]\n", "print(f\"\\n{'='*80}\")\n", "print(f\"PART 7: CLASSIFICATION ANALYSIS\")\n", "print(f\"{'='*80}\")\n", "\n", "# Use the Wine dataset for classification examples\n", "# Target variable: 'target' (Wine type: 0, 1, or 2)\n", "# Features: Various chemical properties of wine\n", "\n", "print(\"\\n--- Classification Analysis Examples (using Wine Dataset) ---\")\n", "\n", "# Load the dataset (already loaded in Part 1)\n", "# wine_data = load_wine()\n", "# wine_df = pd.DataFrame(wine_data.data, columns=wine_data.feature_names)\n", "# wine_df['target'] = wine_data.target\n", "\n", "# --- 7.1 Data Preparation for Classification ---\n", "print(\"\\n--- 7.1 Data Preparation for Classification ---\")\n", "\n", "# Separate features (X) and target (y)\n", "X_clf = wine_df.drop('target', axis=1)\n", "y_clf = wine_df['target']\n", "\n", "# Split data into training and testing sets\n", "X_train_clf, X_test_clf, y_train_clf, y_test_clf = train_test_split(X_clf, y_clf, test_size=0.2, random_state=42, stratify=y_clf) # Stratify to maintain class distribution\n", "\n", "print(f\"Training data shape: {X_train_clf.shape}, {y_train_clf.shape}\")\n", "print(f\"Testing data shape: {X_test_clf.shape}, {y_test_clf.shape}\")\n", "print(\"\\nClass distribution in training set:\")\n", "print(y_train_clf.value_counts(normalize=True))\n", "print(\"\\nClass distribution in testing set:\")\n", "print(y_test_clf.value_counts(normalize=True))\n", "\n", "\n", "# Identify numerical and categorical features (Wine dataset is all numerical)\n", "numerical_features_clf = X_clf.columns.tolist()\n", "categorical_features_clf = [] # No categorical features in this dataset\n", "\n", "# Create a preprocessing pipeline (scaling numerical features)\n", "preprocessor_clf = ColumnTransformer(\n", " transformers=[\n", " ('num', StandardScaler(), numerical_features_clf)\n", " ],\n", " remainder='passthrough'\n", ")\n" ] }, { "cell_type": "code", "execution_count": null, "id": "a345d06c", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "8edf4f49", "metadata": {}, "outputs": [], "source": [ "# --- 7.2 Logistic Regression ---\n", "print(\"\\n--- 7.2 Logistic Regression ---\")\n", "\n", "# Create a pipeline with preprocessing and Logistic Regression\n", "pipeline_lr_clf = Pipeline(steps=[('preprocessor', preprocessor_clf),\n", " ('classifier', LogisticRegression(multi_class='auto', solver='liblinear', random_state=42))]) # Use liblinear for smaller datasets\n", "\n", "# Train the model\n", "pipeline_lr_clf.fit(X_train_clf, y_train_clf)\n", "\n", "# Make predictions\n", "y_pred_lr_clf = pipeline_lr_clf.predict(X_test_clf)\n", "y_prob_lr_clf = pipeline_lr_clf.predict_proba(X_test_clf) # Probabilities for ROC/AUC\n", "\n", "# Evaluate the model\n", "accuracy_lr_clf = accuracy_score(y_test_clf, y_pred_lr_clf)\n", "conf_matrix_lr_clf = confusion_matrix(y_test_clf, y_pred_lr_clf)\n", "class_report_lr_clf = classification_report(y_test_clf, y_pred_lr_clf)\n", "\n", "print(f\"\\nLogistic Regression Results:\")\n", "print(f\"  Accuracy: {accuracy_lr_clf:.4f}\")\n", "print(\"\\n  Confusion Matrix:\")\n", "print(conf_matrix_lr_clf)\n", "print(\"\\n  Classification Report:\")\n", "print(class_report_lr_clf)\n", "\n", "# For multi-class, ROC AUC is calculated differently (e.g., One-vs-Rest or One-vs-One)\n", "# Let's calculate macro-averaged ROC AUC\n", "try:\n", " roc_auc_lr_clf = roc_auc_score(y_test_clf, y_prob_lr_clf, multi_class='ovr', 
average='macro')\n", " print(f\"  Macro-averaged ROC AUC (One-vs-Rest): {roc_auc_lr_clf:.4f}\")\n", "except ValueError as e:\n", " print(f\"  Could not calculate ROC AUC: {e}\") # May happen with very few samples per class\n" ] }, { "cell_type": "code", "execution_count": null, "id": "6b711466", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "8db26b04", "metadata": {}, "outputs": [], "source": [ "# --- 7.3 Random Forest Classifier ---\n", "print(\"\\n--- 7.3 Random Forest Classifier ---\")\n", "\n", "# Create a pipeline with preprocessing and Random Forest Classifier\n", "pipeline_rf_clf = Pipeline(steps=[('preprocessor', preprocessor_clf),\n", " ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))])\n", "\n", "# Train the model\n", "pipeline_rf_clf.fit(X_train_clf, y_train_clf)\n", "\n", "# Make predictions\n", "y_pred_rf_clf = pipeline_rf_clf.predict(X_test_clf)\n", "y_prob_rf_clf = pipeline_rf_clf.predict_proba(X_test_clf)\n", "\n", "# Evaluate the model\n", "accuracy_rf_clf = accuracy_score(y_test_clf, y_pred_rf_clf)\n", "conf_matrix_rf_clf = confusion_matrix(y_test_clf, y_pred_rf_clf)\n", "class_report_rf_clf = classification_report(y_test_clf, y_pred_rf_clf)\n", "\n", "print(f\"\\nRandom Forest Classifier Results:\")\n", "print(f\"  Accuracy: {accuracy_rf_clf:.4f}\")\n", "print(\"\\n  Confusion Matrix:\")\n", "print(conf_matrix_rf_clf)\n", "print(\"\\n  Classification Report:\")\n", "print(class_report_rf_clf)\n", "\n", "try:\n", " roc_auc_rf_clf = roc_auc_score(y_test_clf, y_prob_rf_clf, multi_class='ovr', average='macro')\n", " print(f\"  Macro-averaged ROC AUC (One-vs-Rest): {roc_auc_rf_clf:.4f}\")\n", "except ValueError as e:\n", " print(f\"  Could not calculate ROC AUC: {e}\")\n" ] }, { "cell_type": "code", "execution_count": null, "id": "0c48a133", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "b581eb49", "metadata": {}, "outputs": [], "source": [ "# --- 7.4 Support Vector Classifier (SVC) ---\n", "print(\"\\n--- 7.4 Support Vector Classifier (SVC) ---\")\n", "\n", "# Create a pipeline with preprocessing and SVC\n", "pipeline_svc = Pipeline(steps=[('preprocessor', preprocessor_clf),\n", " ('classifier', SVC(kernel='rbf', probability=True, random_state=42))]) # probability=True needed for predict_proba\n", "\n", "# Train the model (SVC can be slower)\n", "print(\"\\nTraining SVC (this might take a moment)...\")\n", "pipeline_svc.fit(X_train_clf, y_train_clf)\n", "print(\"SVC training complete.\")\n", "\n", "# Make predictions\n", "y_pred_svc = pipeline_svc.predict(X_test_clf)\n", "y_prob_svc = pipeline_svc.predict_proba(X_test_clf)\n", "\n", "# Evaluate the model\n", "accuracy_svc = accuracy_score(y_test_clf, y_pred_svc)\n", "conf_matrix_svc = confusion_matrix(y_test_clf, y_pred_svc)\n", "class_report_svc = classification_report(y_test_clf, y_pred_svc)\n", "\n", "print(f\"\\nSupport Vector Classifier (SVC) Results:\")\n", "print(f\"  Accuracy: {accuracy_svc:.4f}\")\n", "print(\"\\n  Confusion Matrix:\")\n", "print(conf_matrix_svc)\n", "print(\"\\n  Classification Report:\")\n", "print(class_report_svc)\n", "\n", "try:\n", " roc_auc_svc = roc_auc_score(y_test_clf, y_prob_svc, multi_class='ovr', average='macro')\n", " print(f\"  Macro-averaged ROC AUC (One-vs-Rest): {roc_auc_svc:.4f}\")\n", "except ValueError as e:\n", " print(f\"  Could not calculate ROC AUC: {e}\")\n" ] }, { "cell_type": "code", "execution_count": null, "id": "9e5ce052", "metadata": 
{}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "77f2c2ba", "metadata": {}, "outputs": [], "source": [ "# --- 7.5 K-Nearest Neighbors Classifier ---\n", "print(\"\\n--- 7.5 K-Nearest Neighbors Classifier ---\")\n", "\n", "# Create a pipeline with preprocessing and KNN Classifier\n", "pipeline_knn_c = Pipeline(steps=[('preprocessor', preprocessor_clf),\n", " ('classifier', KNeighborsClassifier(n_neighbors=5))]) # Use 5 neighbors\n", "\n", "# Train the model\n", "pipeline_knn_c.fit(X_train_clf, y_train_clf)\n", "\n", "# Make predictions\n", "y_pred_knn_c = pipeline_knn_c.predict(X_test_clf)\n", "y_prob_knn_c = pipeline_knn_c.predict_proba(X_test_clf)\n", "\n", "# Evaluate the model\n", "accuracy_knn_c = accuracy_score(y_test_clf, y_pred_knn_c)\n", "conf_matrix_knn_c = confusion_matrix(y_test_clf, y_pred_knn_c)\n", "class_report_knn_c = classification_report(y_test_clf, y_pred_knn_c)\n", "\n", "print(f\"\\nK-Nearest Neighbors Classifier Results (n_neighbors=5):\")\n", "print(f\"  Accuracy: {accuracy_knn_c:.4f}\")\n", "print(\"\\n  Confusion Matrix:\")\n", "print(conf_matrix_knn_c)\n", "print(\"\\n  Classification Report:\")\n", "print(class_report_knn_c)\n", "\n", "try:\n", " roc_auc_knn_c = roc_auc_score(y_test_clf, y_prob_knn_c, multi_class='ovr', average='macro')\n", " print(f\"  Macro-averaged ROC AUC (One-vs-Rest): {roc_auc_knn_c:.4f}\")\n", "except ValueError as e:\n", " print(f\"  Could not calculate ROC AUC: {e}\")\n" ] }, { "cell_type": "code", "execution_count": null, "id": "91d5beb1", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "a31a4f20", "metadata": {}, "outputs": [], "source": [ "\n", "# --- 7.6 Cross-Validation for Classification ---\n", "print(\"\\n--- 7.6 Cross-Validation for Classification ---\")\n", "\n", "# Example using Logistic Regression pipeline\n", "print(\"\\nCross-validation results for Logistic Regression (Accuracy):\")\n", "cv_scores_lr_clf = cross_val_score(pipeline_lr_clf, X_clf, y_clf, cv=5, scoring='accuracy') # 5-fold cross-validation\n", "print(f\"  Accuracy scores: {cv_scores_lr_clf}\")\n", "print(f\"  Mean Accuracy: {cv_scores_lr_clf.mean():.4f}\")\n", "print(f\"  Standard Deviation of Accuracy: {cv_scores_lr_clf.std():.4f}\")\n", "\n", "# Example using Random Forest Classifier pipeline\n", "print(\"\\nCross-validation results for Random Forest Classifier (F1-score - macro):\")\n", "cv_scores_rf_f1 = cross_val_score(pipeline_rf_clf, X_clf, y_clf, cv=5, scoring='f1_macro') # Macro-averaged F1-score\n", "print(f\"  F1-macro scores: {cv_scores_rf_f1}\")\n", "print(f\"  Mean F1-macro: {cv_scores_rf_f1.mean():.4f}\")\n", "print(f\"  Standard Deviation of F1-macro: {cv_scores_rf_f1.std():.4f}\")" ] }, { "cell_type": "code", "execution_count": null, "id": "0c9be063", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "d86a6649", "metadata": {}, "outputs": [], "source": [ "# --- 7.7 Hyperparameter Tuning (Example with GridSearchCV) ---\n", "print(\"\\n--- 7.7 Hyperparameter Tuning (Example with GridSearchCV) ---\")\n", "\n", "# Example: Tuning SVC kernel and C parameter\n", "print(\"\\nExample: Hyperparameter Tuning for SVC using GridSearchCV\")\n", "\n", "# Define the parameter grid to search\n", "param_grid_svc = {\n", " 'classifier__kernel': ['linear', 'rbf'],\n", " 'classifier__C': [0.1, 1, 10]\n", "}\n", "\n", "# Create GridSearchCV object\n", "grid_search_svc = GridSearchCV(pipeline_svc, param_grid_svc, cv=5, 
scoring='accuracy', n_jobs=-1)\n", "\n", "# Perform the grid search\n", "print(\"Performing GridSearchCV for SVC (this might take a moment)...\")\n", "grid_search_svc.fit(X_train_clf, y_train_clf)\n", "print(\"GridSearchCV complete.\")\n", "\n", "# Get the best parameters and best score\n", "print(f\"\\nBest parameters for SVC: {grid_search_svc.best_params_}\")\n", "print(f\"Best cross-validation accuracy: {grid_search_svc.best_score_:.4f}\")\n", "\n", "# Evaluate on the test set with the best model\n", "best_svc_model = grid_search_svc.best_estimator_\n", "y_pred_best_svc = best_svc_model.predict(X_test_clf)\n", "accuracy_best_svc = accuracy_score(y_test_clf, y_pred_best_svc)\n", "\n", "print(f\"\\nBest SVC Model Results on Test Set:\")\n", "print(f\"  Accuracy: {accuracy_best_svc:.4f}\")\n", "\n", "\n", "# Example: Tuning Random Forest Classifier (subset of parameters)\n", "print(\"\\nExample: Hyperparameter Tuning for Random Forest Classifier using RandomizedSearchCV\")\n", "\n", "# Define the parameter distribution to sample from\n", "param_dist_rf_clf = {\n", " 'classifier__n_estimators': [50, 100, 200, 300],\n", " 'classifier__max_depth': [None, 5, 10, 15],\n", " 'classifier__min_samples_split': [2, 5, 10],\n", " 'classifier__min_samples_leaf': [1, 2, 4],\n", " 'classifier__criterion': ['gini', 'entropy']\n", "}\n", "\n", "# Create RandomizedSearchCV object\n", "random_search_rf_clf = RandomizedSearchCV(pipeline_rf_clf, param_dist_rf_clf, n_iter=20, cv=5, scoring='accuracy', random_state=42, n_jobs=-1) # Sample 20 combinations\n", "\n", "# Perform the randomized search\n", "print(\"\\nPerforming RandomizedSearchCV for Random Forest Classifier (this might take a moment)...\")\n", "random_search_rf_clf.fit(X_train_clf, y_train_clf)\n", "print(\"RandomizedSearchCV complete.\")\n", "\n", "# Get the best parameters and best score\n", "print(f\"\\nBest parameters for Random Forest Classifier: {random_search_rf_clf.best_params_}\")\n", "print(f\"Best cross-validation accuracy: {random_search_rf_clf.best_score_:.4f}\")\n", "\n", "# Evaluate on the test set with the best model\n", "best_rf_clf_model = random_search_rf_clf.best_estimator_\n", "y_pred_best_rf_clf = best_rf_clf_model.predict(X_test_clf)\n", "accuracy_best_rf_clf = accuracy_score(y_test_clf, y_pred_best_rf_clf)\n", "\n", "print(f\"\\nBest Random Forest Classifier Model Results on Test Set:\")\n", "print(f\"  Accuracy: {accuracy_best_rf_clf:.4f}\")\n" ] }, { "cell_type": "code", "execution_count": null, "id": "923696d0", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "ee82d11f", "metadata": {}, "outputs": [], "source": [ "# --- 7.8 Visualizing Classification Results (Example: ROC Curve for Binary Classification) ---\n", "print(\"\\n--- 7.8 Visualizing Classification Results (Example: ROC Curve for Binary Classification) ---\")\n", "# ROC curves are typically for binary classification.\n", "# For multi-class, you can plot one-vs-rest or one-vs-one ROC curves.\n", "# Let's demonstrate a One-vs-Rest ROC curve for class 1 vs all others using Logistic Regression.\n", "\n", "# Need binary target for ROC curve plotting\n", "# Let's create a binary target: Class 1 vs Not Class 1\n", "y_test_clf_binary = (y_test_clf == 1).astype(int)\n", "# Get probabilities for the positive class (Class 1)\n", "# y_prob_lr_clf is already calculated as predict_proba, which gives probabilities for each class.\n", "# We need the probability of belonging to class 1.\n", "# Find the column index 
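of the positive class before slicing the predict_proba output.\n", "# Hedged aside (sketch only; label_binarize is not imported in this notebook):\n", "#   from sklearn.preprocessing import label_binarize\n", "#   y_test_bin = label_binarize(y_test_clf, classes=pipeline_lr_clf.named_steps['classifier'].classes_)\n", "#   roc_curve could then be applied per class, pairing each column of y_test_bin with the matching column of y_prob_lr_clf.\n", "# The required index is the one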
corresponding to class 1 in the classes_ attribute of the fitted classifier\n", "class_1_index = list(pipeline_lr_clf.named_steps['classifier'].classes_).index(1)\n", "y_prob_lr_clf_positive = y_prob_lr_clf[:, class_1_index]\n", "\n", "\n", "# Calculate ROC curve points\n", "fpr, tpr, thresholds = roc_curve(y_test_clf_binary, y_prob_lr_clf_positive)\n", "roc_auc = roc_auc_score(y_test_clf_binary, y_prob_lr_clf_positive)\n", "\n", "plt.figure(figsize=(8, 6))\n", "plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')\n", "plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--') # Diagonal random guess line\n", "plt.xlim([0.0, 1.0])\n", "plt.ylim([0.0, 1.05])\n", "plt.xlabel('False Positive Rate')\n", "plt.ylabel('True Positive Rate')\n", "plt.title('Receiver Operating Characteristic (ROC) Curve (Class 1 vs Rest)')\n", "plt.legend(loc=\"lower right\")\n", "plt.grid(True)\n", "plt.show()\n", "\n", "\n", "# Example: Precision-Recall Curve (Binary Classification)\n", "print(\"\\n--- Example: Precision-Recall Curve (Binary Classification) ---\")\n", "# Using the same binary target and probabilities as for the ROC curve\n", "precision, recall, _ = precision_recall_curve(y_test_clf_binary, y_prob_lr_clf_positive)\n", "average_precision = average_precision_score(y_test_clf_binary, y_prob_lr_clf_positive)\n", "\n", "plt.figure(figsize=(8, 6))\n", "plt.plot(recall, precision, color='blue', lw=2, label=f'Precision-Recall curve (AP = {average_precision:.2f})')\n", "plt.xlabel('Recall')\n", "plt.ylabel('Precision')\n", "plt.title('Precision-Recall Curve (Class 1 vs Rest)')\n", "plt.legend(loc=\"lower left\")\n", "plt.grid(True)\n", "plt.show()\n", "\n", "\n", "print(\"\\nClassification analysis examples complete.\")\n", "print(f\"{'='*80}\")\n", "\n", "\n", "# %% [markdown]\n" ] }, { "cell_type": "code", "execution_count": null, "id": "940cafe1", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "ca3728e6", "metadata": {}, "outputs": [], "source": [ "# ## 8. 
Clustering Analysis\n", "\n", "# %% tags=[]\n", "print(f\"\\n{'='*80}\")\n", "print(f\"PART 8: CLUSTERING ANALYSIS\")\n", "print(f\"{'='*80}\")\n", "\n", "# Use the Wine dataset (without the target variable) for clustering examples\n", "# We will try to find clusters in the data based on the features.\n", "\n", "print(\"\\n--- Clustering Analysis Examples (using Wine Dataset features) ---\")\n", "\n", "# Use the features from the Wine dataset\n", "X_cluster = wine_df.drop('target', axis=1).copy()\n" ] }, { "cell_type": "code", "execution_count": null, "id": "256cf97f", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "ea0815f3", "metadata": {}, "outputs": [], "source": [ "# --- 8.1 Data Preparation for Clustering ---\n", "print(\"\\n--- 8.1 Data Preparation for Clustering ---\")\n", "\n", "# Clustering algorithms are sensitive to the scale of features.\n", "# Standardize the features.\n", "scaler_cluster = StandardScaler()\n", "X_scaled_cluster = scaler_cluster.fit_transform(X_cluster)\n", "X_scaled_cluster_df = pd.DataFrame(X_scaled_cluster, columns=X_cluster.columns)\n", "\n", "print(\"Scaled features for clustering (first 5 rows):\")\n", "print(X_scaled_cluster_df.head())\n" ] }, { "cell_type": "code", "execution_count": null, "id": "566c382c", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "ad57a9d1", "metadata": {}, "outputs": [], "source": [ "# --- 8.2 K-Means Clustering ---\n", "print(\"\\n--- 8.2 K-Means Clustering ---\")\n", "\n", "# Determine the optimal number of clusters (Elbow Method)\n", "print(\"\\nDetermining optimal number of clusters using the Elbow Method (K-Means)...\")\n", "sse = [] # Sum of squared errors\n", "k_range = range(1, 11) # Try K from 1 to 10\n", "\n", "for k in k_range:\n", " kmeans = KMeans(n_clusters=k, random_state=42, n_init=10) # n_init to avoid local minima\n", " kmeans.fit(X_scaled_cluster)\n", " sse.append(kmeans.inertia_) # inertia_ is the sum of squared distances to the nearest centroid\n", "\n", "# Plot the Elbow Method graph\n", "plt.figure(figsize=(8, 5))\n", "plt.plot(k_range, sse, marker='o', linestyle='-')\n", "plt.title('Elbow Method for Optimal K (K-Means)')\n", "plt.xlabel('Number of Clusters (K)')\n", "plt.ylabel('Sum of Squared Errors (SSE)')\n", "plt.xticks(k_range)\n", "plt.grid(True)\n", "plt.show()\n", "\n", "print(\"Based on the elbow method, choose a K where the decrease in SSE starts to slow down.\")\n", "# For the Wine dataset, K=3 is often a good choice as there are 3 actual classes.\n", "\n", "# Perform K-Means clustering with the chosen K (e.g., K=3)\n", "optimal_k = 3\n", "kmeans_model = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)\n", "clusters_kmeans = kmeans_model.fit_predict(X_scaled_cluster)\n", "\n", "# Add cluster labels to the original (or scaled) DataFrame\n", "X_cluster['kmeans_cluster'] = clusters_kmeans\n", "X_scaled_cluster_df['kmeans_cluster'] = clusters_kmeans # Add to scaled for visualization\n", "\n", "print(f\"\\nK-Means Clustering Results (K={optimal_k}):\")\n", "print(\"\\nCluster distribution:\")\n", "print(X_cluster['kmeans_cluster'].value_counts())\n", "\n", "# Evaluate clustering performance (Silhouette Score - requires actual labels if available, or just intrinsic evaluation)\n", "# Silhouette Score measures how similar an object is to its own cluster compared to other clusters.\n", "# A higher Silhouette Score indicates better-defined clusters.\n", "if optimal_k > 1 and optimal_k < 
len(X_scaled_cluster): # Silhouette score requires > 1 cluster and < n_samples\n", " silhouette_avg_kmeans = silhouette_score(X_scaled_cluster, clusters_kmeans)\n", " print(f\"\\nSilhouette Score (K-Means, K={optimal_k}): {silhouette_avg_kmeans:.4f}\")\n", "else:\n", " print(\"\\nSilhouette Score cannot be calculated for K=1 or K=n_samples.\")\n", "\n", "\n", "# Visualize K-Means clusters (requires dimensionality reduction or selecting 2 features)\n", "print(\"\\nVisualizing K-Means Clusters (using first 2 PCA components for visualization)...\")\n", "# Apply PCA to reduce dimensions for plotting\n", "pca_cluster_viz = PCA(n_components=2)\n", "X_pca_cluster_viz = pca_cluster_viz.fit_transform(X_scaled_cluster)\n", "\n", "plt.figure(figsize=(10, 8))\n", "# Scatter plot colored by K-Means cluster labels\n", "scatter = plt.scatter(X_pca_cluster_viz[:, 0], X_pca_cluster_viz[:, 1], c=clusters_kmeans, cmap='viridis', alpha=0.7, s=50)\n", "plt.title(f'K-Means Clustering (K={optimal_k}) visualized with PCA')\n", "plt.xlabel('Principal Component 1')\n", "plt.ylabel('Principal Component 2')\n", "plt.colorbar(scatter, label='Cluster')\n", "plt.grid(True)\n", "plt.show()\n", "\n", "# Optional: Compare K-Means clusters to actual labels (since we have them for Wine dataset)\n", "print(\"\\nComparing K-Means Clusters to Actual Wine Classes:\")\n", "# Create a cross-tabulation\n", "comparison_kmeans_actual = pd.crosstab(wine_df['target'], X_cluster['kmeans_cluster'])\n", "print(comparison_kmeans_actual)\n", "# This shows how well the K-Means clusters align with the true classes.\n" ] }, { "cell_type": "code", "execution_count": null, "id": "1fdc9cae", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "c3baf752", "metadata": {}, "outputs": [], "source": [ "# --- 8.3 Hierarchical Clustering ---\n", "print(\"\\n--- 8.3 Hierarchical Clustering ---\")\n", "\n", "# Perform Agglomerative Hierarchical Clustering\n", "# Choose number of clusters (e.g., 3, based on K-Means/domain knowledge)\n", "n_clusters_hierarchical = 3\n", "agg_clustering = AgglomerativeClustering(n_clusters=n_clusters_hierarchical)\n", "clusters_hierarchical = agg_clustering.fit_predict(X_scaled_cluster)\n", "\n", "# Add cluster labels to the DataFrame\n", "X_cluster['hierarchical_cluster'] = clusters_hierarchical\n", "\n", "print(f\"\\nHierarchical Clustering Results (Number of Clusters={n_clusters_hierarchical}):\")\n", "print(\"\\nCluster distribution:\")\n", "print(X_cluster['hierarchical_cluster'].value_counts())\n", "\n", "# Evaluate Silhouette Score\n", "if n_clusters_hierarchical > 1 and n_clusters_hierarchical < len(X_scaled_cluster):\n", " silhouette_avg_hierarchical = silhouette_score(X_scaled_cluster, clusters_hierarchical)\n", " print(f\"\\nSilhouette Score (Hierarchical, Clusters={n_clusters_hierarchical}): {silhouette_avg_hierarchical:.4f}\")\n", "else:\n", " print(\"\\nSilhouette Score cannot be calculated for 1 or n_samples clusters.\")\n", "\n", "\n", "# Visualize Hierarchical Clusters (using the same PCA components)\n", "plt.figure(figsize=(10, 8))\n", "scatter_hierarchical = plt.scatter(X_pca_cluster_viz[:, 0], X_pca_cluster_viz[:, 1], c=clusters_hierarchical, cmap='viridis', alpha=0.7, s=50)\n", "plt.title(f'Hierarchical Clustering (Clusters={n_clusters_hierarchical}) visualized with PCA')\n", "plt.xlabel('Principal Component 1')\n", "plt.ylabel('Principal Component 2')\n", "plt.colorbar(scatter_hierarchical, label='Cluster')\n", "plt.grid(True)\n", "plt.show()\n", "\n", "# 
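Hedged add-on (not in the original walkthrough): compare linkage strategies by silhouette score.\n", "# Assumes X_scaled_cluster and n_clusters_hierarchical are defined above in this section.\n", "for linkage_method in ['ward', 'complete', 'average']:\n", " labels_tmp = AgglomerativeClustering(n_clusters=n_clusters_hierarchical, linkage=linkage_method).fit_predict(X_scaled_cluster)\n", " print(f\"  Silhouette ({linkage_method} linkage): {silhouette_score(X_scaled_cluster, labels_tmp):.4f}\")\n", "\n", "#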
Optional: Compare Hierarchical clusters to actual labels\n", "print(\"\\nComparing Hierarchical Clusters to Actual Wine Classes:\")\n", "comparison_hierarchical_actual = pd.crosstab(wine_df['target'], X_cluster['hierarchical_cluster'])\n", "print(comparison_hierarchical_actual)\n", "\n", "\n", "# Dendrogram (for visualizing hierarchical structure)\n", "print(\"\\nGenerating Dendrogram (this might take a moment for larger datasets)...\")\n", "# Calculate the distance matrix\n", "distance_matrix = pdist(X_scaled_cluster, metric='euclidean')\n", "# Perform hierarchical clustering\n", "linkage_matrix = hierarchy.linkage(distance_matrix, method='ward') # 'ward' minimizes variance\n", "\n", "# Plot the dendrogram\n", "plt.figure(figsize=(14, 8))\n", "dendrogram = hierarchy.dendrogram(linkage_matrix,\n", " leaf_rotation=90, # Rotate leaf labels\n", " leaf_font_size=8, # Font size for leaf labels\n", " labels=wine_df.index.tolist() # Use original index or sample index\n", " )\n", "plt.title('Hierarchical Clustering Dendrogram')\n", "plt.xlabel('Sample Index')\n", "plt.ylabel('Distance')\n", "plt.show()\n", "print(\"The dendrogram helps visualize the merging of clusters.\")\n", "\n", "\n", "# --- 8.4 DBSCAN Clustering ---\n", "print(\"\\n--- 8.4 DBSCAN Clustering ---\")\n", "# DBSCAN is a density-based clustering algorithm. It doesn't require specifying the number of clusters beforehand.\n", "# It requires two parameters: epsilon (eps) and minimum samples (min_samples).\n", "\n", "# Example: Apply DBSCAN\n", "# Choosing eps and min_samples can be tricky and often requires domain knowledge or trial and error.\n", "# A common approach for eps is to calculate the distance to the k-th nearest neighbor for each point\n", "# and look for an \"elbow\" in the sorted distances.\n", "print(\"\\nApplying DBSCAN Clustering (using eps=0.5, min_samples=5)...\")\n", "dbscan = DBSCAN(eps=0.5, min_samples=5)\n", "clusters_dbscan = dbscan.fit_predict(X_scaled_cluster)\n", "\n", "# Add cluster labels to the DataFrame (-1 indicates noise)\n", "X_cluster['dbscan_cluster'] = clusters_dbscan\n", "\n", "print(\"\\nDBSCAN Clustering Results:\")\n", "print(\"\\nCluster distribution (including noise -1):\")\n", "print(X_cluster['dbscan_cluster'].value_counts())\n", "\n", "# Evaluate Silhouette Score (excluding noise points)\n", "# Only calculate if there is more than one cluster found (excluding noise)\n", "if len(set(clusters_dbscan)) > 1 and -1 in set(clusters_dbscan): # More than 1 cluster and noise present\n", " # Calculate silhouette score only for points that are not noise (-1)\n", " non_noise_indices = clusters_dbscan != -1\n", " if len(set(clusters_dbscan[non_noise_indices])) > 1: # Ensure there's more than one cluster among non-noise points\n", " silhouette_avg_dbscan = silhouette_score(X_scaled_cluster[non_noise_indices], clusters_dbscan[non_noise_indices])\n", " print(f\"\\nSilhouette Score (DBSCAN, excluding noise): {silhouette_avg_dbscan:.4f}\")\n", " else:\n", " print(\"\\nNot enough non-noise clusters (>1) to calculate Silhouette Score.\")\n", "elif len(set(clusters_dbscan)) > 1: # More than 1 cluster, no noise\n", " silhouette_avg_dbscan = silhouette_score(X_scaled_cluster, clusters_dbscan)\n", " print(f\"\\nSilhouette Score (DBSCAN): {silhouette_avg_dbscan:.4f}\")\n", "else:\n", " print(\"\\nSilhouette Score cannot be calculated (only one cluster or all noise).\")\n", "\n", "\n", "# Visualize DBSCAN Clusters (using the same PCA components)\n", "plt.figure(figsize=(10, 8))\n", "# Color noise points 
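(label -1) in black.\n", "# Hedged aside (sketch only; NearestNeighbors is not imported above): eps can be guided by a k-distance plot, e.g.\n", "#   from sklearn.neighbors import NearestNeighbors\n", "#   knn_dists, _ = NearestNeighbors(n_neighbors=5).fit(X_scaled_cluster).kneighbors(X_scaled_cluster)\n", "#   plt.plot(np.sort(knn_dists[:, -1]))  # look for the elbow in the sorted k-distance curve\n", "# Below, noise points are drawn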
differently if desired\n", "unique_labels = set(clusters_dbscan)\n", "colors = [plt.cm.viridis(each) for each in np.linspace(0, 1, len(unique_labels))]\n", "\n", "for k, col in zip(unique_labels, colors):\n", " if k == -1:\n", " # Black used for noise.\n", " col = [0, 0, 0, 1]\n", "\n", " class_member_mask = (clusters_dbscan == k)\n", "\n", " xy = X_pca_cluster_viz[class_member_mask]\n", " plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),\n", " markeredgecolor='k', markersize=6, label=f'Cluster {k}')\n", "\n", "plt.title('DBSCAN Clustering visualized with PCA')\n", "plt.xlabel('Principal Component 1')\n", "plt.ylabel('Principal Component 2')\n", "plt.legend()\n", "plt.grid(True)\n", "plt.show()\n", "\n", "# Optional: Compare DBSCAN clusters to actual labels\n", "print(\"\\nComparing DBSCAN Clusters to Actual Wine Classes:\")\n", "# Note: Noise points (-1) are not assigned to a class by DBSCAN.\n", "comparison_dbscan_actual = pd.crosstab(wine_df['target'], X_cluster['dbscan_cluster'])\n", "print(comparison_dbscan_actual)\n", "\n", "\n", "print(\"\\nClustering analysis examples complete.\")\n", "print(f\"{'='*80}\")\n" ] }, { "cell_type": "code", "execution_count": null, "id": "130256b9", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "6145c5cd", "metadata": {}, "outputs": [], "source": [ "# ## 9. Advanced Modeling with CatBoost\n", "\n", "# %% tags=[]\n", "print(f\"\\n{'='*80}\")\n", "print(f\"PART 9: ADVANCED MODELING WITH CATBOOST\")\n", "print(f\"{'='*80}\")\n", "\n", "# CatBoost is a gradient boosting on decision trees library.\n", "# It is known for handling categorical features effectively and often provides good performance out-of-the-box.\n", "\n", "# Use the synthetic e-commerce data for CatBoost examples.\n", "# Let's build a model to predict customer's 'total_spend'. This is a regression task.\n", "\n", "# Use the customer_fe DataFrame which contains engineered features.\n", "# Ensure NaNs are handled before training CatBoost. 
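As a hedged illustration only (not applied below), numeric NaNs could be imputed up front:\n", "#   num_cols = customers_fe.select_dtypes(include='number').columns\n", "#   customers_fe[num_cols] = SimpleImputer(strategy='median').fit_transform(customers_fe[num_cols])\n", "# Here, though,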
CatBoost can handle NaNs internally,\n", "# but explicit imputation or handling might be preferred depending on the strategy.\n", "# For this example, we'll let CatBoost handle them, but specify categorical features.\n", "\n", "print(\"\\n--- CatBoost Modeling Examples (Predicting Customer Total Spend) ---\")\n" ] }, { "cell_type": "code", "execution_count": null, "id": "1cab16e2", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "a79fb378", "metadata": {}, "outputs": [], "source": [ "# --- 9.1 Data Preparation for CatBoost ---\n", "print(\"\\n--- 9.1 Data Preparation for CatBoost ---\")\n", "\n", "# Select features (X) and target (y) from the engineered customer data\n", "# Exclude ID columns, the target itself, and potentially features directly derived from the target.\n", "# Also exclude original columns if engineered/scaled versions are used.\n", "X_cb = customers_fe.drop(columns=[\n", " 'customer_id', 'lifetime_value', 'avg_order_value', 'loyalty_score', 'recency_days', # Original features\n", " 'signup_date', 'first_purchase_date', 'last_purchase_date', 'last_visit_timestamp', # Date columns\n", " 'total_spend', # Target variable\n", " 'lifetime_value_sq', 'avg_order_value_sq', 'lifetime_avg_order_interaction', # Polynomial features (optional to include)\n", " 'age_loyalty_interaction', # Interaction feature (optional to include)\n", " 'total_spend_scaled_std', 'recency_scaled_std', 'loyalty_score_scaled_minmax', 'avg_order_value_scaled_minmax', # Scaled features (CatBoost doesn't need scaling)\n", " 'age_group_numeric', # Numeric encoding of age group (CatBoost prefers original categorical)\n", " 'customer_segment_encoded', # Label encoded segment (CatBoost prefers original categorical)\n", " 'loyalty_level', 'session_duration_category', # Binned features (optional to include)\n", " 'PCA1', 'PCA2', 'TSNE1', 'TSNE2', # Dimensionality reduction features\n", " 'kmeans_cluster', 'hierarchical_cluster', 'dbscan_cluster' # Clustering labels\n", "], errors='ignore').copy() # Use errors='ignore' in case some columns weren't created\n", "\n", "y_cb = customers_fe['total_spend'].copy()\n", "\n", "print(f\"Features shape: {X_cb.shape}\")\n", "print(f\"Target shape: {y_cb.shape}\")\n", "print(\"\\nFeatures used for CatBoost:\")\n", "print(X_cb.columns.tolist())\n", "\n", "\n", "# Split data into training and testing sets\n", "X_train_cb, X_test_cb, y_train_cb, y_test_cb = train_test_split(X_cb, y_cb, test_size=0.2, random_state=42)\n", "\n", "print(f\"Training data shape: {X_train_cb.shape}, {y_train_cb.shape}\")\n", "print(f\"Testing data shape: {X_test_cb.shape}, {y_test_cb.shape}\")\n", "\n", "# Identify categorical features for CatBoost\n", "# CatBoost can directly handle categorical features if you tell it which columns are categorical.\n", "# It uses a special encoding technique (ordered boosting).\n", "categorical_features_cb_names = X_train_cb.select_dtypes(include='object').columns.tolist()\n", "print(f\"\\nIdentified categorical features for CatBoost: {categorical_features_cb_names}\")\n" ] }, { "cell_type": "code", "execution_count": null, "id": "0108a4aa", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "7036c509", "metadata": {}, "outputs": [], "source": [ "# --- 9.2 Training a CatBoost Regressor ---\n", "print(\"\\n--- 9.2 Training a CatBoost Regressor ---\")\n", "\n", "# Create a CatBoost Regressor model\n", "catboost_regressor = cb.CatBoostRegressor(\n", " iterations=100, # 
Number of boosting iterations (trees)\n", " learning_rate=0.1,\n", " depth=6,\n", " loss_function='RMSE', # Root Mean Squared Error\n", " eval_metric='RMSE', # Metric for evaluation during training\n", " random_state=42,\n", " verbose=10 # Print training progress every 10 iterations\n", ")\n", "\n", "# Train the model\n", "# Pass the categorical feature names to the model\n", "print(\"\\nTraining CatBoost Regressor...\")\n", "catboost_regressor.fit(\n", " X_train_cb,\n", " y_train_cb,\n", " cat_features=categorical_features_cb_names, # Specify categorical features\n", " eval_set=(X_test_cb, y_test_cb), # Use test set for evaluation during training\n", " early_stopping_rounds=10, # Stop early if validation metric doesn't improve for 10 rounds\n", " verbose=False # Suppress verbose output during fit for cleaner final output\n", ")\n", "print(\"CatBoost Regressor training complete.\")\n", "\n", "\n", "# Make predictions\n", "y_pred_cb = catboost_regressor.predict(X_test_cb)\n", "\n", "# Evaluate the model\n", "mse_cb = mean_squared_error(y_test_cb, y_pred_cb)\n", "rmse_cb = np.sqrt(mse_cb)\n", "r2_cb = r2_score(y_test_cb, y_pred_cb)\n", "\n", "print(f\"\\nCatBoost Regressor Results on Test Set:\")\n", "print(f\"  Mean Squared Error (MSE): {mse_cb:.4f}\")\n", "print(f\"  Root Mean Squared Error (RMSE): {rmse_cb:.4f}\")\n", "print(f\"  R-squared (R2): {r2_cb:.4f}\")\n" ] }, { "cell_type": "code", "execution_count": null, "id": "026369a0", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "bb228a7e", "metadata": {}, "outputs": [], "source": [ "# --- 9.3 CatBoost Feature Importance ---\n", "print(\"\\n--- 9.3 CatBoost Feature Importance ---\")\n", "\n", "# Get feature importances\n", "feature_importances_cb = catboost_regressor.get_feature_importance(prettified=True)\n", "print(\"\\nCatBoost Feature Importances:\")\n", "print(feature_importances_cb)\n", "\n", "# Visualize Feature Importances\n", "plt.figure(figsize=(12, 8))\n", "# Sort features by importance for plotting\n", "feature_importances_cb_sorted = feature_importances_cb.sort_values(by='Importances', ascending=False)\n", "plt.bar(feature_importances_cb_sorted['Feature Id'], feature_importances_cb_sorted['Importances'], color='steelblue')\n", "plt.title('CatBoost Regressor Feature Importances')\n", "plt.xlabel('Features')\n", "plt.ylabel('Importance')\n", "plt.xticks(rotation=45, ha='right')\n", "plt.tight_layout()\n", "plt.show()\n" ] }, { "cell_type": "code", "execution_count": null, "id": "e6404e11", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "3941f913", "metadata": {}, "outputs": [], "source": [ "\n", "# --- 9.4 Training a CatBoost Classifier (Example) ---\n", "print(\"\\n--- 9.4 Training a CatBoost Classifier (Example) ---\")\n", "# Let's create a binary classification task from the customers data, e.g., predicting if a customer is 'VIP'.\n", "\n", "# Create a binary target variable 'is_vip'\n", "customers_fe['is_vip'] = (customers_fe['customer_segment'] == 'VIP').astype(int)\n", "\n", "# Select features (X) and the new binary target (y)\n", "X_cb_clf = customers_fe.drop(columns=[\n", " 'customer_id', 'lifetime_value', 'avg_order_value', 'loyalty_score', 'recency_days', # Original features\n", " 'signup_date', 'first_purchase_date', 'last_purchase_date', 'last_visit_timestamp', # Date columns\n", " 'total_spend', # Original regression target\n", " 'is_vip', # New classification target\n", " 'customer_segment', # Original 
categorical feature (target source)\n", " 'lifetime_value_sq', 'avg_order_value_sq', 'lifetime_avg_order_interaction',\n", " 'age_loyalty_interaction',\n", " 'total_spend_scaled_std', 'recency_scaled_std', 'loyalty_score_scaled_minmax', 'avg_order_value_scaled_minmax',\n", " 'age_group_numeric', 'customer_segment_encoded',\n", " 'loyalty_level', 'session_duration_category',\n", " 'PCA1', 'PCA2', 'TSNE1', 'TSNE2',\n", " 'kmeans_cluster', 'hierarchical_cluster', 'dbscan_cluster'\n", "], errors='ignore').copy()\n", "\n", "y_cb_clf = customers_fe['is_vip'].copy()\n", "\n", "print(f\"\\nFeatures shape for Classifier: {X_cb_clf.shape}\")\n", "print(f\"Target shape for Classifier: {y_cb_clf.shape}\")\n", "print(\"\\nClass distribution for 'is_vip':\")\n", "print(y_cb_clf.value_counts())\n", "\n", "# Split data\n", "X_train_cb_clf, X_test_cb_clf, y_train_cb_clf, y_test_cb_clf = train_test_split(X_cb_clf, y_cb_clf, test_size=0.2, random_state=42, stratify=y_cb_clf)\n", "\n", "print(f\"\\nTraining data shape for Classifier: {X_train_cb_clf.shape}, {y_train_cb_clf.shape}\")\n", "print(f\"Testing data shape for Classifier: {X_test_cb_clf.shape}, {y_test_cb_clf.shape}\")\n", "\n", "# Identify categorical features for the classifier\n", "categorical_features_cb_clf_names = X_train_cb_clf.select_dtypes(include='object').columns.tolist()\n", "print(f\"\\nIdentified categorical features for CatBoost Classifier: {categorical_features_cb_clf_names}\")\n", "\n", "\n", "# Create a CatBoost Classifier model\n", "catboost_classifier = cb.CatBoostClassifier(\n", " iterations=100,\n", " learning_rate=0.1,\n", " depth=6,\n", " loss_function='Logloss', # Logloss for binary classification\n", " eval_metric='Accuracy', # Or 'AUC'\n", " random_state=42,\n", " verbose=10 # Print training progress\n", ")\n", "\n", "# Train the classifier\n", "print(\"\\nTraining CatBoost Classifier...\")\n", "catboost_classifier.fit(\n", " X_train_cb_clf,\n", " y_train_cb_clf,\n", " cat_features=categorical_features_cb_clf_names,\n", " eval_set=(X_test_cb_clf, y_test_cb_clf),\n", " early_stopping_rounds=10,\n", " verbose=False # Suppress verbose output\n", ")\n", "print(\"CatBoost Classifier training complete.\")\n", "\n", "# Make predictions\n", "y_pred_cb_clf = catboost_classifier.predict(X_test_cb_clf)\n", "y_prob_cb_clf = catboost_classifier.predict_proba(X_test_cb_clf)[:, 1] # Probability of the positive class (1)\n", "\n", "# Evaluate the classifier\n", "accuracy_cb_clf = accuracy_score(y_test_cb_clf, y_pred_cb_clf)\n", "conf_matrix_cb_clf = confusion_matrix(y_test_cb_clf, y_pred_cb_clf)\n", "class_report_cb_clf = classification_report(y_test_cb_clf, y_pred_cb_clf)\n", "roc_auc_cb_clf = roc_auc_score(y_test_cb_clf, y_prob_cb_clf)\n", "\n", "print(f\"\\nCatBoost Classifier Results on Test Set:\")\n", "print(f\"  Accuracy: {accuracy_cb_clf:.4f}\")\n", "print(\"\\n  Confusion Matrix:\")\n", "print(conf_matrix_cb_clf)\n", "print(\"\\n  Classification Report:\")\n", "print(class_report_cb_clf)\n", "print(f\"  ROC AUC Score: {roc_auc_cb_clf:.4f}\")\n", "\n", "\n", "# Visualize ROC Curve for the classifier\n", "fpr_cb, tpr_cb, thresholds_cb = roc_curve(y_test_cb_clf, y_prob_cb_clf)\n", "\n", "plt.figure(figsize=(8, 6))\n", "plt.plot(fpr_cb, tpr_cb, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc_cb_clf:.2f})')\n", "plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')\n", "plt.xlim([0.0, 1.0])\n", "plt.ylim([0.0, 1.05])\n", "plt.xlabel('False Positive Rate')\n", "plt.ylabel('True Positive Rate')\n", 
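"# Hedged note: 'is_vip' is a minority class, so a precision-recall curve is often more informative here;\n", "# precision_recall_curve(y_test_cb_clf, y_prob_cb_clf) and average_precision_score(y_test_cb_clf, y_prob_cb_clf)\n", "# could be plotted exactly as in section 7.8.\n",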
"plt.title('CatBoost Classifier ROC Curve (Is VIP)')\n", "plt.legend(loc=\"lower right\")\n", "plt.grid(True)\n", "plt.show()\n", "\n", "\n", "print(\"\\nAdvanced modeling with CatBoost examples complete.\")\n", "print(f\"{'='*80}\")\n" ] }, { "cell_type": "code", "execution_count": null, "id": "d9b464c7", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "1c753fe9", "metadata": {}, "outputs": [], "source": [ "# ## 10. Time Series Forecasting with Prophet\n", "\n", "# %% tags=[]\n", "print(f\"\\n{'='*80}\")\n", "print(f\"PART 10: TIME SERIES FORECASTING WITH PROPHET\")\n", "print(f\"{'='*80}\")\n", "\n", "# Prophet is a time series forecasting model developed by Facebook.\n", "# It is designed for time series data that has strong seasonal effects and several seasons of historical data.\n", "\n", "# Use the synthetic e-commerce transactions data to forecast total daily sales.\n", "\n", "print(\"\\n--- Time Series Forecasting with Prophet (Forecasting Daily Sales) ---\")\n" ] }, { "cell_type": "code", "execution_count": null, "id": "14cb399a", "metadata": {}, "outputs": [], "source": [ "# --- 10.1 Data Preparation for Prophet ---\n", "print(\"\\n--- 10.1 Data Preparation for Prophet ---\")\n", "\n", "# Prophet requires the input DataFrame to have two columns:\n", "# 'ds' (datetime) and 'y' (numerical value to forecast).\n", "\n", "# Aggregate the transaction data to get total daily sales\n", "# Ensure 'date' is datetime\n", "transactions_fe['date'] = pd.to_datetime(transactions_fe['date'])\n", "\n", "# Group by date and sum total_price\n", "daily_sales = transactions_fe.groupby(transactions_fe['date'].dt.date)['total_price'].sum().reset_index()\n", "\n", "# Rename columns to 'ds' and 'y'\n", "daily_sales.columns = ['ds', 'y']\n", "\n", "# Convert 'ds' column to datetime objects (if not already)\n", "daily_sales['ds'] = pd.to_datetime(daily_sales['ds'])\n", "\n", "print(\"Prepared daily sales data for Prophet (first 5 rows):\")\n", "print(daily_sales.head())\n", "print(f\"\\nTotal number of days with sales data: {len(daily_sales)}\")\n" ] }, { "cell_type": "code", "execution_count": null, "id": "c130f3b1", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "e03c5c6c", "metadata": {}, "outputs": [], "source": [ "# --- 10.2 Training the Prophet Model ---\n", "print(\"\\n--- 10.2 Training the Prophet Model ---\")\n", "\n", "# Create a Prophet model instance\n", "# You can add parameters to model seasonality, holidays, etc.\n", "# For this example, we'll use default seasonality (additive) and include yearly and weekly seasonality.\n", "model_prophet = Prophet(\n", " yearly_seasonality=True,\n", " weekly_seasonality=True,\n", " daily_seasonality=False # Daily seasonality is often less pronounced at a daily aggregation level\n", ")\n", "\n", "# Fit the model to the historical data\n", "print(\"\\nTraining Prophet model...\")\n", "model_prophet.fit(daily_sales)\n", "print(\"Prophet model training complete.\")\n" ] }, { "cell_type": "code", "execution_count": null, "id": "8f07e7d7", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "59ffd805", "metadata": {}, "outputs": [], "source": [ "# --- 10.3 Making Future Predictions ---\n", "print(\"\\n--- 10.3 Making Future Predictions ---\")\n", "\n", "# Create a DataFrame with future dates to forecast\n", "# Let's forecast for the next 30 days\n", "future_dates = 
model_prophet.make_future_dataframe(periods=30)\n", "\n", "print(f\"\\nFuture dates DataFrame (first 5 rows):\")\n", "print(future_dates.head())\n", "print(f\"\\nFuture dates DataFrame (last 5 rows):\")\n", "print(future_dates.tail())\n", "\n", "\n", "# Make predictions on the future dates\n", "print(\"\\nMaking future predictions...\")\n", "forecast = model_prophet.predict(future_dates)\n", "print(\"Future predictions complete.\")\n", "\n", "# The forecast DataFrame contains the predictions ('yhat'),\n", "# lower and upper bounds ('yhat_lower', 'yhat_upper'), and components (trend, seasonality).\n", "\n", "print(\"\\nForecast DataFrame (first 5 rows):\")\n", "print(forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].head())\n", "print(\"\\nForecast DataFrame (last 5 rows):\")\n", "print(forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].tail())\n" ] }, { "cell_type": "code", "execution_count": null, "id": "6539a06e", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "06c5da7d", "metadata": {}, "outputs": [], "source": [ "\n", "# --- 10.4 Visualizing the Forecast ---\n", "print(\"\\n--- 10.4 Visualizing the Forecast ---\")\n", "\n", "# Prophet provides a built-in plotting utility\n", "print(\"\\nPlotting forecast...\")\n", "fig1 = model_prophet.plot(forecast)\n", "plt.title('Daily Sales Forecast with Prophet')\n", "plt.xlabel('Date')\n", "plt.ylabel('Total Sales')\n", "plt.show()\n", "print(\"Forecast plot displayed.\")\n", "\n", "# Visualize forecast components (trend, seasonality)\n", "print(\"\\nPlotting forecast components...\")\n", "fig2 = model_prophet.plot_components(forecast)\n", "plt.show()\n", "print(\"Forecast components plot displayed.\")\n" ] }, { "cell_type": "code", "execution_count": null, "id": "5d18c592", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "2b29925c", "metadata": {}, "outputs": [], "source": [ "# --- 10.5 Evaluating Forecast Performance (Optional) ---\n", "print(\"\\n--- 10.5 Evaluating Forecast Performance (Optional) ---\")\n", "# To evaluate performance, you would typically split your historical data into training and validation sets.\n", "# Prophet has cross-validation utilities for this.\n", "\n", "# Example: Performing cross-validation with Prophet\n", "# We'll simulate forecasting performance over a rolling window.\n", "# Initial training period: 'initial'\n", "# Period to forecast: 'period'\n", "# Frequency of evaluation: 'horizon'\n", "\n", "# Let's use the last 90 days as the initial training data, forecast 30 days ahead,\n", "# and evaluate every 30 days.\n", "print(\"\\nPerforming Prophet cross-validation (this might take a moment)...\")\n", "\n", "# Calculate initial and horizon based on your data range\n", "# Assuming data ends on latest_date from Part 5\n", "data_end_date = daily_sales['ds'].max()\n", "initial_date = data_end_date - pd.Timedelta(days=90)\n", "horizon_date = pd.Timedelta(days=30)\n", "\n", "# Find the index of the initial date in the daily_sales DataFrame\n", "initial_index = daily_sales[daily_sales['ds'] >= initial_date].index.min()\n", "if pd.isna(initial_index):\n", " print(\"Warning: Initial date for cross-validation is outside the data range. 
Adjusting.\")\n", " initial_index = len(daily_sales) - 90 # Use the last 90 days if date calculation fails\n", "\n", "initial_cutoff = daily_sales.iloc[initial_index]['ds']\n", "\n", "# Calculate initial, period, and horizon in Prophet's required format (strings like '90 days')\n", "initial_prophet = f\"{(initial_cutoff - daily_sales['ds'].min()).days} days\"\n", "period_prophet = \"30 days\"\n", "horizon_prophet = \"30 days\"\n", "\n", "\n", "try:\n", " from prophet.diagnostics import cross_validation, performance_metrics\n", "\n", " # Perform cross-validation\n", " df_cv = cross_validation(model_prophet, initial=initial_prophet, period=period_prophet, horizon=horizon_prophet, parallel=\"processes\")\n", "\n", " print(\"\\nProphet Cross-Validation Results (first 5 rows):\")\n", " print(df_cv.head())\n", "\n", " # Calculate performance metrics\n", " df_p = performance_metrics(df_cv)\n", "\n", " print(\"\\nProphet Performance Metrics (first 5 rows):\")\n", " print(df_p.head())\n", "\n", " # You can plot performance metrics vs horizon\n", " # from prophet.plot import plot_cross_validation_metric\n", " # fig3 = plot_cross_validation_metric(df_cv, metric='rmse')\n", " # plt.title('Prophet Cross-Validation RMSE vs Horizon')\n", " # plt.show()\n", "\n", "except ImportError:\n", " print(\"\\nProphet diagnostics (cross_validation, performance_metrics) not available. Skipping evaluation example.\")\n", "except ValueError as e:\n", " print(f\"\\nCould not perform Prophet cross-validation: {e}. Check initial, period, and horizon settings relative to your data.\")\n", "except Exception as e:\n", " print(f\"\\nAn error occurred during Prophet cross-validation: {e}\")\n", "\n", "\n", "print(\"\\nTime series forecasting with Prophet examples complete.\")\n", "print(f\"{'='*80}\")\n", "\n", "\n", "# %% [markdown]\n", "# ## Conclusion\n", "#\n", "# This document provided a comprehensive overview and practical examples of various advanced data science techniques using Python libraries like Pandas, NumPy, Matplotlib, SciPy, Scikit-learn, CatBoost, and Prophet. It covered data generation, cleaning, EDA, feature engineering, regression, classification, clustering, advanced modeling, and time series forecasting, all within a single environment.\n", "#\n", "# This toolkit serves as a valuable resource for senior data product analysts to enhance their analytical capabilities and tackle complex data challenges. 
Remember to adapt these techniques to your specific datasets and business problems.\n" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.13" } }, "nbformat": 4, "nbformat_minor": 5 }