Project Objective: Predictive Modeling of Travel Insurance Purchases
This project aims to develop a predictive model for identifying potential travel insurance customers among air travelers. The analysis focuses on key demographic and behavioral factors that influence the likelihood of insurance purchase.
Dataset Features:
- Demographic: Age, Annual Income, Number of Family Members
- Health: Presence of Chronic Diseases
- Travel Behavior: Frequent Flyer Status, International Travel History
- Socioeconomic: Employment Type, Education Level
Methodology:
- Exploratory Data Analysis (EDA) to uncover feature distributions and correlations
- Feature selection and engineering to optimize predictive power
- Implementation and comparison of multiple machine learning algorithms
- Model performance evaluation using metrics such as accuracy, precision, recall, and F1-score
- Interpretation of model predictions using SHAP (SHapley Additive exPlanations) values
The project seeks to provide actionable insights for targeted marketing strategies in the travel insurance sector, leveraging machine learning techniques to identify high-probability customers.
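For reference, the classification metrics above follow the standard definitions: precision = TP / (TP + FP), recall = TP / (TP + FN), and F1 = 2 * precision * recall / (precision + recall), with TP, FP, and FN denoting true positives, false positives, and false negatives.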
Business Stakeholders and Objectives
Stakeholders:
- Insurance Company Executives
- Travel Agency Marketing Teams
- Airline Customer Relationship Managers
Objectives:
- Improve overall customer satisfaction by offering relevant travel insurance
- Identify new market segments or underserved customer groups
- Enhance cross-selling of travel insurance with flight bookings
- Develop personalized insurance package recommendations for customers
- Identify high-value customers who are likely to purchase additional services
- Reduce potential customer complaints related to travel issues that could be mitigated by insurance
import sys
sys.path.append('/kaggle/input/athieldsv25315/')
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
from sqlalchemy import create_engine
import sqlite3
import pandas as pd
import polars as pl
pl.Config.set_tbl_cols(20)
import numpy as np
from skimpy import skim
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight')
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
import scipy.stats as stats
from scipy.stats import mannwhitneyu, bootstrap
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, cross_validate, train_test_split, GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, StackingClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, make_scorer, roc_auc_score, balanced_accuracy_score, ConfusionMatrixDisplay
from sklearn.inspection import permutation_importance, PartialDependenceDisplay, partial_dependence
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from xgboost import XGBClassifier
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
import shap
from utils.functions import (
    set_dark_mode,
    set_white_mode,
    column_mapping,
    BackwardEliminationLogistic,
    define_x_and_y,
    plot_predicted_vs_residuals,
    plot_leverage,
    plot_cooks_distance,
    calculate_vif_polars,
    perform_cross_validation,
    plot_roc_auc,
    plot_confusion_matrix,
    plot_cv_results,
    plot_shap_summary,
    plot_feature_importance_comparison,
)
def apply_plot_style(mode="white"):
    if mode == "dark":
        set_dark_mode()
    else:
        set_white_mode()

apply_plot_style("white")
travel_data = pl.read_csv("/kaggle/input/travel-insurance-prediction-data/TravelInsurancePrediction.csv")
print("Column names:", travel_data.columns)
print("Shape:", travel_data.shape)
print("Description", travel_data.describe())
travel_data.head(2)
Column names: ['', 'Age', 'Employment Type', 'GraduateOrNot', 'AnnualIncome', 'FamilyMembers', 'ChronicDiseases', 'FrequentFlyer', 'EverTravelledAbroad', 'TravelInsurance']
Shape: (1987, 10)
Description (condensed from the describe() table): 1987 rows, zero nulls in every column. Age: mean 29.65, std 2.91, min 25, median 29, max 35. AnnualIncome: mean 932,762.96, std 376,855.68, min 300,000, median 900,000, max 1,800,000. FamilyMembers: mean 4.75, min 2, median 5, max 9. ChronicDiseases: mean 0.2778. TravelInsurance: mean 0.3573. String columns: Employment Type ("Government Sector" / "Private Sector/Self Employed"), GraduateOrNot, FrequentFlyer, and EverTravelledAbroad (all "Yes"/"No").
|   | Age | Employment Type | GraduateOrNot | AnnualIncome | FamilyMembers | ChronicDiseases | FrequentFlyer | EverTravelledAbroad | TravelInsurance |
|---|-----|-----------------|---------------|--------------|---------------|-----------------|---------------|---------------------|-----------------|
| 0 | 31 | "Government Sector" | "Yes" | 400000 | 6 | 1 | "No" | "No" | 0 |
| 1 | 31 | "Private Sector/Self Employed" | "Yes" | 1250000 | 7 | 0 | "No" | "No" | 0 |
is_duplicated = travel_data.is_duplicated()
duplicates_df = travel_data.filter(is_duplicated)
duplicates_count = duplicates_df.group_by(['', 'Age', 'Employment Type', 'GraduateOrNot', 'AnnualIncome', 'FamilyMembers', 'ChronicDiseases', 'FrequentFlyer', 'EverTravelledAbroad', 'TravelInsurance']).count()
print(duplicates_count)
shape: (0, 11); the duplicate check returns an empty frame, so the dataset contains no duplicated rows.
null_counts = travel_data.select(pl.all().has_nulls())
print("Null counts:", null_counts)
print("\nDescription:", travel_data.describe())
Null counts: shape (1, 10); every column returns false, so there are no nulls.
Description: identical to the describe() summary shown above.
skim(travel_data)
skimpy summary (condensed): 1987 rows, 10 columns (6 int64, 4 string), no missing values in any column. Numeric columns: unnamed index 0-1986; Age mean 29.65 (range 25-35); AnnualIncome mean 932,800 (range 300,000-1,800,000); FamilyMembers mean 4.75 (range 2-9); ChronicDiseases mean 0.2778; TravelInsurance mean 0.3573. String columns: Employment Type ranges from "Government Sector" to "Private Sector/Self Employed"; GraduateOrNot, FrequentFlyer, and EverTravelledAbroad are all "No"/"Yes".
fig, axes = plt.subplots(3, 3, figsize=(18, 6))
plt.suptitle("Dataset variable distributions")
axes = axes.flatten()
columns_to_analyse = travel_data.columns[1:]
for i, column in enumerate(columns_to_analyse):
    sns.histplot(data=travel_data, x=column, ax=axes[i])
    axes[i].set_title(column)
plt.tight_layout()
plt.show()
columns = travel_data.columns
filtered_columns = [col for col in columns if col != '']
corr = pl.DataFrame(
    {col: [travel_data.select(pl.corr(col, other, method="spearman")).item() for other in filtered_columns]
     for col in filtered_columns}
)
corr_np = corr.to_numpy()
mask = np.triu(np.ones_like(corr_np, dtype=bool))
fig, axes = plt.subplots(1, 2, figsize=(24, 8))
sns.set_style("darkgrid")
sns.heatmap(corr_np, annot=True, mask=mask, cmap="Blues", ax=axes[0])
axes[0].set_title("Full Correlation Matrix")
corr_filtered_np = np.where(np.abs(corr_np) > 0.3, corr_np, np.nan)
sns.heatmap(corr_filtered_np, annot=True, mask=mask, cmap="Blues", ax=axes[1])
axes[1].set_title("Restricted Correlation Matrix")
plt.show()
There seems to be a correlation between the AnnualIncome and TravelInsurance columns. We'll check the significance of this correlation. Since one column contains continuous numerical data and the other binary categorical data, and neither column is normally distributed, we'll use the Mann-Whitney U test (also known as the Wilcoxon rank-sum test) for that purpose.
Target Population
The target population for this study consists of potential air travelers who may consider purchasing travel insurance. This population includes individuals of various ages, income levels, and travel experiences.
Statistical Hypothesis
Annual Income and Travel Insurance Purchase
Null Hypothesis (H0): There is no significant difference in annual income between those who purchase travel insurance and those who do not.
Alternative Hypothesis (H1): There is a significant difference in annual income between those who purchase travel insurance and those who do not.
Significance Level
We will use a significance level (α) of 0.05.
Conducting Tests
Since the data is not normally distributed, we'll use a non-parametric test, the Mann-Whitney U test (the Wilcoxon rank-sum test).
travel_data_corr = travel_data.with_columns(
    pl.col('TravelInsurance')
)
income_insured = travel_data_corr.filter(pl.col('TravelInsurance') == 1)['AnnualIncome'].to_numpy()
income_not_insured = travel_data_corr.filter(pl.col('TravelInsurance') == 0)['AnnualIncome'].to_numpy()
u_statistic, p_value_income = stats.mannwhitneyu(income_insured, income_not_insured)
print(f'H1 - Annual Income: U Statistic = {u_statistic}, p-value = {p_value_income}')
H1 - Annual Income: U Statistic = 670230.5, p-value = 3.020101045488567e-70
Constructing Confidence Intervals
For non-parametric tests, we can use bootstrap method to construct confidence intervals.
def median_diff(x, y):
    return np.median(x) - np.median(y)
income_ci = bootstrap((income_insured, income_not_insured), median_diff, confidence_level=0.95, random_state=42, method='percentile')
print(f"95% CI for median income difference: ({income_ci.confidence_interval.low}, {income_ci.confidence_interval.high})")
95% CI for median income difference: (400000.0, 500000.0)
Results Interpretation
Annual Income and Travel Insurance Purchase: The Mann-Whitney U test resulted in a p-value of 3.020101045488567e-70 and a U statistic of 670230.5. As the p-value is less than the significance level of 0.05, we reject the null hypothesis: there is strong evidence of a significant difference in annual income between those who purchase travel insurance and those who do not.
Confidence Interval: The 95% confidence interval for the difference in median annual income between those who purchase travel insurance and those who do not is (400000.0, 500000.0).
Lower Bound (400000.0): the median income of purchasers is estimated to be at least 400,000 higher than that of non-purchasers. Upper Bound (500000.0): the median income difference is estimated to be no more than 500,000.
This interval provides a range of plausible values for the true population difference, at the 95% confidence level.
Data Preprocessing | Initial Train-Test Split | Scaling the Features
Before conducting any multivariate modeling, we will scale the data to ensure comparability across variables with different units and categories. Scaling is essential because our dataset includes features in various units and binary forms, which can lead to biased model coefficients if left unscaled. By standardizing the X matrix, we normalize the range of each feature, allowing us to accurately compare unit changes across all variables.
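Concretely, standardization maps each feature value x to z = (x - mean) / std, with the mean and standard deviation estimated on the training split only, so every scaled feature ends up with zero mean and unit variance.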
Initial data preprocessing
num_cols = ['Age', 'AnnualIncome', 'FamilyMembers']
binary_cols = ['Employment Type', 'GraduateOrNot', 'ChronicDiseases', 'FrequentFlyer', 'EverTravelledAbroad', 'TravelInsurance']
travel_data = column_mapping(travel_data, binary_cols, num_cols)
Splitting the data | Stratification
X = travel_data.drop("TravelInsurance")
y = travel_data["TravelInsurance"].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
Feature scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.to_numpy())
X_test_scaled = scaler.transform(X_test.to_numpy())
X_train_scaled = pl.DataFrame(X_train_scaled, schema=X_train.columns)
X_test_scaled = pl.DataFrame(X_test_scaled, schema=X_test.columns)
Logistic Regression | Feature Selection via Backward Elimination
In this section, we will develop a logistic regression model to predict travel insurance possession using a set of selected features. We will employ the backward elimination technique to refine the model. The backward elimination process is as follows:
Backward Elimination Process:
- Select a Significance Level: Choose a significance level (typically 0.05) to decide whether a feature should be removed.
- Initial Model Fitting: Fit the logistic regression model using all available features.
- Identify Feature with Highest P-value: Determine which feature has the highest p-value (i.e., the least statistically significant).
- Remove the Feature: Exclude the feature with the highest p-value if it exceeds the chosen significance level.
- Iterate: Refit the model with the remaining features and repeat steps 3-4 until all remaining features have p-values below the significance level.
This iterative process ensures that we retain only the most significant predictors in the model, improving its performance and interpretability.
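For orientation, here is a minimal sketch of that loop, assuming a pandas feature matrix and statsmodels' Logit; the project's own BackwardEliminationLogistic helper is used in the next cell, so the names below are illustrative rather than its actual implementation:
import pandas as pd
import statsmodels.api as sm

def backward_elimination_sketch(X: pd.DataFrame, y, alpha=0.05):
    # Iteratively drop the least significant feature until all p-values <= alpha
    cols = list(X.columns)
    model = None
    while cols:
        model = sm.Logit(y, sm.add_constant(X[cols])).fit(disp=0)
        pvals = model.pvalues.drop('const')   # ignore the intercept
        worst = pvals.idxmax()                # least significant remaining feature
        if pvals[worst] <= alpha:
            break                             # every remaining feature is significant
        cols.remove(worst)                    # drop it and refit
    return X[cols], model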
X_train_pandas = X_train.to_pandas()
model_1 = BackwardEliminationLogistic()
X_selected, ols = model_1.backward_elimination(X_train_pandas, y_train)
print(ols.summary())
print(f"\nRegression ran {model_1.regression_counter} times.")
X_selected = pl.from_pandas(X_selected)
Optimization terminated successfully (current function values 0.5159-0.5171, 6 iterations per refit).

Logit Regression Results
Dep. Variable: y                No. Observations: 1589
Model: Logit                    Df Residuals: 1583
Method: MLE                     Df Model: 5
Pseudo R-squ.: 0.2068           Log-Likelihood: -821.73
converged: True                 LL-Null: -1035.9
Covariance Type: nonrobust      LLR p-value: 2.220e-90

                        coef   std err        z    P>|z|   [0.025   0.975]
const                -1.0948     0.074  -14.851    0.000   -1.239   -0.950
Age                   0.2138     0.060    3.559    0.000    0.096    0.332
AnnualIncome          0.6040     0.072    8.439    0.000    0.464    0.744
FamilyMembers         0.2364     0.061    3.882    0.000    0.117    0.356
FrequentFlyer         0.4279     0.153    2.788    0.005    0.127    0.729
EverTravelledAbroad   1.7968     0.175   10.250    0.000    1.453    2.140

Regression ran 4 times.
The backward elimination process ran 4 times and ended with a model of 5 predictors.
The features Employment Type, GraduateOrNot, and ChronicDiseases were dropped because their p-values exceeded the 0.05 significance level during the backward elimination process. This means there is insufficient evidence that these features are significant predictors of the target variable (TravelInsurance) in the presence of the other features.
Creating new Logistic Regression with selected features
X_train_np = X_selected.to_numpy()
y_train_np = y_train
log_reg = LogisticRegression()
Checking the Assumptions of Binary Logistic Regression
In this section we'll check for the following assumptions of the Binary Logistic Regression model:
- Linearity of the Logit
- Independence of Errors
- Absence of Multicollinearity
- Large sample size
Diagnostic Plots
Linearity of the Logit | Independence of Errors
plot_predicted_vs_residuals(ols)
The two dotted lines suggest there might be a systematic bias in the model's predictions. For example:
- A line from 1.00 to 0.00 suggests that as predicted probabilities increase, residuals decrease from 1.00 to 0.00.
- A line from 0.00 to -1.00 suggests that as predicted probabilities decrease, residuals decrease from 0.00 to -1.00.
This pattern could indicate potential issues such as non-linearity or missing higher-order terms in the model.
High Leverage and Influential Points
plot_leverage(ols)
Number of high leverage points: 81
- The scatterplot suggests that most observations have relatively low leverage (between 0.001 and 0.004), meaning they do not strongly influence the model's coefficients.
- However, there are some observations (those with leverage up to 0.016) that exert a stronger influence on the model. These points may be outliers or have unusual combinations of predictor values.
plot_cooks_distance(ols)
Number of influential points: 78
- The majority of observations have low Cook's distances (between 0 and 0.001), indicating they have minimal impact on the model's coefficients.
- Observations with higher Cook's distances (up to 0.005) are more influential and could potentially affect the regression coefficients if they were removed from the dataset.
Calculating VIF (Variance Inflation Factor)
vif_results = calculate_vif_polars(X)
print("Variance Inflation Factors:")
for feature, vif in vif_results:
    print(f"{feature}: {vif}")
Variance Inflation Factors:
: -0.08883026831545057
Age: 1.0000295148133003
Employment Type: 1.000242544180209
GraduateOrNot: 1.000372147396212
AnnualIncome: 0.9997501574819097
FamilyMembers: 0.9999282558910093
ChronicDiseases: 1.0000518180962314
FrequentFlyer: 1.0000524811515374
EverTravelledAbroad: 1.000031340043701
The VIF values are all close to 1, suggesting there is no multicollinearity between the features.
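For reference, the VIF for feature j is VIF_j = 1 / (1 - R_j^2), where R_j^2 is the R-squared from regressing feature j on all the other features; values near 1 mean a feature is essentially uncorrelated with the rest, while values above roughly 5-10 are the usual cause for concern.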
Cross-Validation
X_train_np = X_selected.to_numpy()
y_train_np = y_train
perform_cross_validation(log_reg, X_train_np, y_train_np)
Stratified cross-validation results for LogisticRegression:
Accuracy: 0.7709 (+/- 0.0196)
Precision: 0.7910 (+/- 0.0661)
Recall: 0.4895 (+/- 0.0317)
F1: 0.6043 (+/- 0.0314)
Roc_auc: 0.7085 (+/- 0.0198)
Balanced_accuracy: 0.7085 (+/- 0.0198)
{'fit_time': array([0.0026958 , 0.00131083, 0.00118184, 0.00156784, 0.00111198]), 'score_time': array([0.00258923, 0.0029211 , 0.00187802, 0.00294399, 0.00194407]), 'test_accuracy': array([0.7672956 , 0.78930818, 0.7672956 , 0.77044025, 0.76025237]), 'train_accuracy': array([0.77734068, 0.78206137, 0.77104642, 0.77498033, 0.77437107]), 'test_precision': array([0.76 , 0.84057971, 0.78571429, 0.81538462, 0.75342466]), 'train_precision': array([0.80935252, 0.80204778, 0.7985348 , 0.78571429, 0.81111111]), 'test_recall': array([0.50442478, 0.50877193, 0.48245614, 0.46491228, 0.48672566]), 'train_recall': array([0.49450549, 0.51762115, 0.48017621, 0.50881057, 0.48131868]), 'test_f1': array([0.60638298, 0.63387978, 0.59782609, 0.59217877, 0.59139785]), 'train_f1': array([0.61391542, 0.6291834 , 0.5997249 , 0.61764706, 0.60413793]), 'test_roc_auc': array([0.70830995, 0.72742518, 0.70446336, 0.70304438, 0.69924518]), 'train_roc_auc': array([0.71477726, 0.72331486, 0.70642838, 0.71584959, 0.70944759]), 'test_balanced_accuracy': array([0.70830995, 0.72742518, 0.70446336, 0.70304438, 0.69924518]), 'train_balanced_accuracy': array([0.71477726, 0.72331486, 0.70642838, 0.71584959, 0.70944759])}
The cross-validation results indicate that the model has reasonably good performance, with an average test accuracy of around 77.10%. This suggests that the model is likely to perform well when making predictions on new data, assuming the data follows a similar distribution as the training and validation sets.
Evaluation of Model Performance and Assumptions:
Non-linearity or Missing Higher-Order Terms:
- Observation: The presence of two dotted lines rather than residuals randomly scattered around a horizontal line at zero suggests potential issues with non-linearity or missing higher-order terms.
Leverage:
- Observation: Most observations have relatively low leverage (0.001 to 0.004), indicating they do not strongly influence the model's coefficients. However, there are 81 high leverage points.
Cook's Distance:
- Observation: The majority of observations have low Cook's distances (0 to 0.001), indicating minimal impact on the model's coefficients. However, there are 78 influential points.
Absence of Multicollinearity:
- Observation: The VIF values are all very close to 1, indicating no multicollinearity issues across the features.
The binary logistic regression model shows some issues with non-linearity and a few high leverage and influential points. However, multicollinearity does not appear to be a problem. To improve the model, we will consider adding higher-order terms, investigating and possibly addressing high leverage and influential points, conducting additional diagnostic checks, and comparing alternative models.
Planned Model Improvements
Addressing Non-Linearity:
- Polynomial and Interaction Terms:
- Polynomial terms (e.g., X^2, X^3) and interaction terms (e.g., X1 * X2) will be introduced to capture complex relationships.
- Non-Linear Transformations:
- Non-linear transformations such as splines or logarithmic transformations will be used to model non-linear relationships.
If Linearity is Met:
Handling High Leverage and Influential Points:
- High Leverage Points: Data points with high leverage will be examined, and potential data entry errors or outliers will be removed or transformed.
- Robust Regression Techniques: Robust regression methods, such as Ridge or Lasso regression, will be utilized to mitigate the influence of outliers.
Additional Diagnostic Checks:
- Residual Analysis: Further residual analysis will be conducted to ensure normal distribution and homoscedasticity (constant variance) of residuals.
- Model Validation: Cross-validation techniques will be employed to assess the model’s generalizability and to prevent overfitting.
If Linearity is Not Met:
- Model Comparison:
- Alternative Models: Alternative models such as Random Forests, Gradient Boosting Machines, or Support Vector Machines will be considered to capture complex relationships.
- Model Selection Criteria: AIC, BIC, or cross-validated performance metrics will be used to compare and select the best model.
Addressing Higher-Order Terms | Adding Polynomial Interactions
Scaling features and creating polynomial ones
features = X_selected.columns
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_selected.to_numpy())
pf = PolynomialFeatures(degree=2, include_bias=False)
X_train_poly = pf.fit_transform(X_train_scaled)
try:
    feature_names = pf.get_feature_names_out(features)
except AttributeError:
    try:
        feature_names = pf.get_feature_names(features)
    except AttributeError:
        feature_names = [f'Feature_{i}' for i in range(X_train_poly.shape[1])]
features_to_keep = [name for name in feature_names if 'const' not in name]
indices_to_keep = [list(feature_names).index(name) for name in features_to_keep]
X_train_poly = X_train_poly[:, indices_to_keep]
feature_names = features_to_keep
Plotting polynomial features
n_features = len(feature_names)
n_cols = 4
n_rows = (n_features + n_cols - 1) // n_cols
fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 5*n_rows))
plt.subplots_adjust(hspace=0.8, wspace=0.3)
axes = axes.flatten()
for i, column in enumerate(feature_names):
    sns.histplot(X_train_poly[:, i], kde=True, ax=axes[i])
    axes[i].set_title(column, fontsize=8, wrap=True)
    axes[i].set_xlabel('Scaled Value', fontsize=8)
    axes[i].tick_params(axis='both', which='major', labelsize=6)
for i in range(n_features, len(axes)):
    fig.delaxes(axes[i])
plt.tight_layout()
plt.show()
Training logistic regression with polynomial features
# Note: this cross-validation scores the scaled base features;
# the polynomial fit and its coefficients follow below
cv_results_log = perform_cross_validation(log_reg, X_train_scaled, y_train)
log_reg.fit(X_train_poly, y_train)
for name, coef in zip(feature_names, log_reg.coef_[0]):
    print(f"{name}: {coef}")
Stratified cross-validation results for LogisticRegression:
Accuracy: 0.7728 (+/- 0.0192)
Precision: 0.7999 (+/- 0.0686)
Recall: 0.4877 (+/- 0.0374)
F1: 0.6054 (+/- 0.0326)
Roc_auc: 0.7096 (+/- 0.0200)
Balanced_accuracy: 0.7096 (+/- 0.0200)

Age: 0.19666137862250174
AnnualIncome: 0.7216330357238003
FamilyMembers: 0.1884566148980259
FrequentFlyer: -0.0013173897374804643
EverTravelledAbroad: 0.07260729985100645
Age^2: 0.3135731145486218
Age AnnualIncome: -0.1828757605800485
Age FamilyMembers: 0.4884880914533743
Age FrequentFlyer: 0.13975083075798644
Age EverTravelledAbroad: -0.01463312658028261
AnnualIncome^2: -0.01272457972103866
AnnualIncome FamilyMembers: 0.04418655212026896
AnnualIncome FrequentFlyer: 0.35492276085835567
AnnualIncome EverTravelledAbroad: 0.6728286871126578
FamilyMembers^2: 0.045254356834092624
FamilyMembers FrequentFlyer: 0.09494707136026524
FamilyMembers EverTravelledAbroad: -0.03224368220222625
FrequentFlyer^2: -0.0074010664394127835
FrequentFlyer EverTravelledAbroad: -0.01236217679132796
EverTravelledAbroad^2: 0.11072598816598153
Insights:
Model Performance: The logistic regression model with polynomial features achieved a mean accuracy of 0.7728 (77.28%) with a standard deviation of 0.0192. This is a good performance, indicating that the model correctly predicts travel insurance purchases about 77% of the time.
Feature Importance: The coefficients represent the importance and direction of influence for each feature. Positive coefficients increase the likelihood of purchasing travel insurance, while negative coefficients decrease it.
a. Original Features:
- AnnualIncome (0.7216): Has the strongest positive influence. Higher income strongly increases the likelihood of purchasing travel insurance.
- Age (0.1967) and FamilyMembers (0.1885): Both have moderate positive influences, suggesting older individuals and those with larger families are more likely to purchase travel insurance.
- EverTravelledAbroad (0.0726): Has a small positive influence.
- FrequentFlyer (-0.0013): Has very little influence on its own.
b. Polynomial and Interaction Terms:
Positive interactions:
- AnnualIncome EverTravelledAbroad (0.6728): Strong positive interaction, suggesting that high-income individuals who have traveled abroad are much more likely to purchase insurance.
- Age FamilyMembers (0.4885): Strong positive interaction, indicating that older individuals with larger families are more likely to buy insurance.
- AnnualIncome FrequentFlyer (0.3549): Positive interaction, suggesting that frequent flyers with higher incomes are more likely to purchase insurance.
- Age^2 (0.3136): Positive quadratic effect, implying that the likelihood of purchasing insurance increases more rapidly with age.
- EverTravelledAbroad^2 (0.1107): Moderate positive quadratic effect for travel experience.
Some notable negative interactions:
- Age AnnualIncome (-0.1829): Slight negative interaction, possibly suggesting that the effect of income on insurance purchase decreases somewhat with age.
- FrequentFlyer EverTravelledAbroad (-0.0124): Slight negative interaction, which might indicate that frequent flyers who have traveled abroad are slightly less likely to purchase additional insurance.
Key Takeaways:
- Income is the most influential factor, especially when combined with travel experience.
- Age and family size are important, particularly when considered together.
- Travel experience (EverTravelledAbroad) becomes more influential when combined with income.
- The effect of age is non-linear, with older individuals more likely to purchase insurance.
- Frequent flyer status alone isn't very influential, but becomes more important when combined with income.
The polynomial features have slightly improved the model's performance, but they also make the model more complex and potentially less interpretable. It's important to balance this gain in accuracy against the need for a model that makes practical sense in the context of travel insurance sales.
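Since these are logistic regression coefficients on standardized features, exponentiating a coefficient gives the multiplicative change in the odds of purchase per one standard deviation increase in the feature: for AnnualIncome, exp(0.7216) is approximately 2.06, i.e. roughly a doubling of the odds.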
Assessing Linearity:
It's important to confirm whether the non-linearity issue persists even after adding polynomial features. This can be assessed by plotting the log-odds against each predictor variable.
Calculating predicted probabilities and log-odds, and plotting log-odds against features
log_reg.fit(X_train_poly, y_train)
y_pred_proba = log_reg.predict_proba(X_train_poly)[:, 1]
log_odds = np.log(y_pred_proba / (1 - y_pred_proba))
fig, axes = plt.subplots(5, 4, figsize=(20, 25))
plt.subplots_adjust(hspace=0.5, wspace=0.3)
for i, column in enumerate(feature_names):
    row = i // 4
    col = i % 4
    axes[row, col].scatter(X_train_poly[:, i], log_odds, alpha=0.5)
    axes[row, col].set_title(column, fontsize=10)
    axes[row, col].set_xlabel('Feature Value')
    axes[row, col].set_ylabel('Log-odds')
    axes[row, col].tick_params(axis='both', which='major', labelsize=8)
for i in range(len(feature_names), 20):
    row = i // 4
    col = i % 4
    fig.delaxes(axes[row, col])
plt.tight_layout()
plt.show()
Creating the interaction terms for the training data
features = ['EverTravelledAbroad', 'AnnualIncome', 'FrequentFlyer', 'FamilyMembers', 'Age']
# Select the model features by name; positional indexing into X_train would
# mislabel the columns, since X_train keeps the original column order
X_train_pl = X_train.select(features)
X_train_pl = X_train_pl.with_columns([
    (pl.col('AnnualIncome') * pl.col('Age')).alias('Income_Age_Interaction'),
    (pl.col('FamilyMembers') * pl.col('Age')).alias('Family_Age_Interaction'),
    (pl.col('EverTravelledAbroad') * pl.col('AnnualIncome')).alias('Travel_Income_Interaction')
])
X_train_interactions = X_train_pl.to_numpy()
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_interactions)
feature_names = features + ['Income_Age_Interaction', 'Family_Age_Interaction', 'Travel_Income_Interaction']
Fitting the logistic regression and performing cross-validation
cv_results = perform_cross_validation(log_reg, X_train_scaled, y_train)
log_reg.fit(X_train_scaled, y_train)
for feature, coef in zip(feature_names, log_reg.coef_[0]):
    print(f"{feature}: {coef}")
Stratified cross-validation results for LogisticRegression:
Accuracy: 0.7527 (+/- 0.0446)
Precision: 0.7114 (+/- 0.0892)
Recall: 0.5210 (+/- 0.0740)
F1: 0.6008 (+/- 0.0706)
Roc_auc: 0.7012 (+/- 0.0478)
Balanced_accuracy: 0.7012 (+/- 0.0478)

EverTravelledAbroad: 0.059905223881319795
AnnualIncome: 0.3822206216113748
FrequentFlyer: 0.12393231974257071
FamilyMembers: 0.0005970294444291089
Age: 1.0549572694413316
Income_Age_Interaction: -0.2687246026861914
Family_Age_Interaction: -0.1450955804312615
Travel_Income_Interaction: -0.09019558953038333
Evaluate on the test set
X_test_pl = X_test.select(features)
X_test_pl = X_test_pl.with_columns([
    (pl.col('AnnualIncome') * pl.col('Age')).alias('Income_Age_Interaction'),
    (pl.col('FamilyMembers') * pl.col('Age')).alias('Family_Age_Interaction'),
    (pl.col('EverTravelledAbroad') * pl.col('AnnualIncome')).alias('Travel_Income_Interaction')
])
X_test_interactions = X_test_pl.to_numpy()
# Reuse the scaler fitted on the training data; refitting on the test set would leak information
X_test_scaled_poly = scaler.transform(X_test_interactions)
test_accuracy = log_reg.score(X_test_scaled_poly, y_test)
print(f"Test set accuracy: {test_accuracy:.4f}")
Test set accuracy: 0.7462
Logistic Regression Model Diagnostics and Assumption Checking
Fitting the model and calculating log-odds
log_reg.fit(X_train_scaled, y_train)
y_pred_proba_train = log_reg.predict_proba(X_train_scaled)[:, 1]
log_odds_train = np.log(y_pred_proba_train / (1 - y_pred_proba_train))
Performing Box-Tidwell test
def box_tidwell_test(X, y, feature_names):
    test_results = []
    for i, col in enumerate(feature_names):
        X_col = X[:, i]
        # Shift to strictly positive values before taking the log
        X_col_log = np.log(X_col - X_col.min() + 0.000001)
        X_interaction = X_col * X_col_log
        X_test_poly = sm.add_constant(np.column_stack((X, X_interaction)))
        try:
            model = sm.Logit(y, X_test_poly)
            fit_results = model.fit(disp=0)
            p_value = fit_results.pvalues[-1]
        except np.linalg.LinAlgError:
            # Fall back to a rank correlation if the fit is singular
            correlation, p_value = stats.spearmanr(X_col, y)
        print(f"{col} - Box-Tidwell test p-value: {p_value:.4f}")
        test_results.append((col, p_value))
    return test_results

feature_names = features + ['Income_Age_Interaction', 'Family_Age_Interaction', 'Travel_Income_Interaction']
box_tidwell_results = box_tidwell_test(X_train_scaled, y_train, feature_names)
significant_nonlinear = [col for col, p_value in box_tidwell_results if p_value < 0.05]
print("Features with significant non-linear relationships:")
for col in significant_nonlinear:
    print(col)
EverTravelledAbroad - Box-Tidwell test p-value: 0.9459
AnnualIncome - Box-Tidwell test p-value: 0.0015
FrequentFlyer - Box-Tidwell test p-value: 1.0000
FamilyMembers - Box-Tidwell test p-value: 1.0000
Age - Box-Tidwell test p-value: 0.0433
Income_Age_Interaction - Box-Tidwell test p-value: 0.0000
Family_Age_Interaction - Box-Tidwell test p-value: 0.0024
Travel_Income_Interaction - Box-Tidwell test p-value: 0.7743
Features with significant non-linear relationships:
AnnualIncome
Age
Income_Age_Interaction
Family_Age_Interaction
The Box-Tidwell test checks for linearity in the logit for each predictor. A p-value < 0.05 suggests a significant non-linear relationship.
The features with significant non-linear relationships are AnnualIncome, Age, Income_Age_Interaction, and Family_Age_Interaction. This suggests that these variables have a complex, non-linear relationship with the log-odds of the outcome (buying travel insurance).
Plotting log-odds against predictors
fig, axes = plt.subplots(3, 3, figsize=(15, 15))
axes = axes.ravel()
for i, col in enumerate(feature_names):
    axes[i].scatter(X_train_scaled[:, i], log_odds_train, alpha=0.5)
    axes[i].set_xlabel(col)
    axes[i].set_ylabel('Log-odds')
    axes[i].set_title(f'Log-odds vs {col}')
plt.tight_layout()
plt.show()
Despite scaling, polynomial features, interaction terms, and a reasonably high mean accuracy, the model still violates the linearity assumption. We can't force linearity where it doesn't exist; non-linear models may be more appropriate for capturing the complex relationships in the data. We will compare several of the most frequently used ones on mean accuracy.
Non-Linear Models
To capture complex, non-linear relationships within the data, models that do not assume linearity will be employed. These models inherently handle non-linear interactions without necessitating explicit feature transformations. The performance of several popular non-linear models will be evaluated.
models = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'SVM': SVC(kernel='rbf', probability=True, random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Neural Network': MLPClassifier(hidden_layer_sizes=(100,), max_iter=1000, random_state=42),
    'XGBoost': XGBClassifier(eval_metric='logloss', random_state=42)
}
results = {}
for name, model in models.items():
    cv_results = perform_cross_validation(model, X_train_np, y_train_np)
    results[name] = cv_results
plot_cv_results(results)
Stratified cross-validation results for RandomForestClassifier:
Accuracy: 0.8049 (+/- 0.0423), Precision: 0.7803 (+/- 0.0582), Recall: 0.6321 (+/- 0.1088), F1: 0.6974 (+/- 0.0823), Roc_auc: 0.7666 (+/- 0.0551), Balanced_accuracy: 0.7666 (+/- 0.0551)
Stratified cross-validation results for SVC:
Accuracy: 0.8194 (+/- 0.0320), Precision: 0.8769 (+/- 0.0448), Recall: 0.5757 (+/- 0.0902), F1: 0.6941 (+/- 0.0714), Roc_auc: 0.7653 (+/- 0.0439), Balanced_accuracy: 0.7653 (+/- 0.0439)
Stratified cross-validation results for DecisionTreeClassifier:
Accuracy: 0.7886 (+/- 0.0437), Precision: 0.7490 (+/- 0.0702), Recall: 0.6144 (+/- 0.0860), F1: 0.6746 (+/- 0.0736), Roc_auc: 0.7499 (+/- 0.0515), Balanced_accuracy: 0.7499 (+/- 0.0515)
Stratified cross-validation results for MLPClassifier:
Accuracy: 0.8301 (+/- 0.0299), Precision: 0.8804 (+/- 0.0248), Recall: 0.6074 (+/- 0.0959), F1: 0.7177 (+/- 0.0666), Roc_auc: 0.7807 (+/- 0.0441), Balanced_accuracy: 0.7807 (+/- 0.0441)
Stratified cross-validation results for XGBClassifier:
Accuracy: 0.8169 (+/- 0.0432), Precision: 0.8133 (+/- 0.0314), Recall: 0.6320 (+/- 0.1288), F1: 0.7098 (+/- 0.0903), Roc_auc: 0.7758 (+/- 0.0619), Balanced_accuracy: 0.7758 (+/- 0.0619)
While the neural network shows the highest mean accuracy in this initial comparison, several caveats apply:
- with less than 2000 rows, the dataset is relatively small for neural networks, which typically perform best with large amounts of data
- they are slower to train than other models, especially for smaller datasets where the added complexity might not be justified
- with a small dataset, there's a higher risk of overfitting with complex models like neural networks
- due to their black box nature, they are often difficult to interpret
Given these considerations and the performance results, a justified selection is:
- SVM (Mean Accuracy: 0.8194)
- XGBoost (Mean Accuracy: 0.8169)
- Random Forest (Mean Accuracy: 0.8049)
Justification for selection:
- Neural Network (Mean Accuracy: 0.8301): despite showing the best performance across all metrics, including the highest ROC AUC (0.7807) and balanced accuracy, it is set aside for the reasons above: the small dataset, the overfitting risk, and its black-box nature.
- SVM: Second-highest accuracy and the best precision (0.8769), which could be valuable if minimizing false positives is important.
- XGBoost: Strong overall performance, known for its effectiveness in a wide range of problems, and provides feature importance. It also shows a good balance between precision and recall.
- Random Forest: Slightly outperformed by the other models in this case, but it provides good interpretability through feature importance, which could be valuable depending on the specific needs of the project.
The Decision Tree model (Mean Accuracy: 0.7886) shows the lowest performance among these models, which is expected, as it's generally less powerful than ensemble methods like Random Forest or XGBoost.
It's worth noting that all models show relatively high accuracy (above 0.78) and good ROC AUC scores (above 0.75), indicating that they all perform reasonably well on this dataset. The choice between them might depend on factors such as interpretability needs, computational resources, and the specific balance between precision and recall required for the business problem at hand.
Plotting ROC-AUC curve and confusion matrix
selected_features = ['EverTravelledAbroad', 'AnnualIncome', 'FrequentFlyer', 'FamilyMembers', 'Age']
X_selected_train = X_train[selected_features]
X_selected_test = X_test[selected_features]
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_selected_train)
X_test_scaled = scaler.transform(X_selected_test)
best_model_names = ['SVM', 'XGBoost', 'Random Forest']
best_models = {name: models[name] for name in best_model_names}
for name, model in best_models.items():
    model.fit(X_train_scaled, y_train)
plot_roc_auc(best_models, X_test_scaled, y_test)
plot_confusion_matrix(best_models, X_test_scaled, y_test)
The Receiver Operating Characteristic (ROC) curves and associated Area Under the Curve (AUC) values provide insights into the discriminative power of the models:
- Random Forest and XGBoost demonstrate superior performance with identical AUC values of 0.76, indicating a 76% probability that the model will rank a randomly chosen positive instance higher than a randomly chosen negative one.
- SVM exhibits slightly lower discriminative ability with an AUC of 0.73.
Confusion matrix analysis reveals nuanced performance characteristics:
- Random Forest shows a sensitivity (True Positive Rate) of 78.18% (86/(86+24)) and the highest specificity (True Negative Rate) of 80.56% (232/(232+56)).
- XGBoost presents a balanced performance, with a sensitivity of 81.37% (83/(83+19)) and a specificity of 80.07% (237/(237+59)).
- SVM demonstrates the highest sensitivity of 82.11% (78/(78+17)) but the lowest specificity of 78.88% (239/(239+64)).
The trade-off between sensitivity and specificity is evident across models, with SVM favoring sensitivity, Random Forest prioritizing specificity, and XGBoost offering a compromise between the two metrics.
Given the similar AUC values for Random Forest and XGBoost, the choice between these models may depend on whether the use case prioritizes minimizing false positives (favoring Random Forest) or achieving a balance between false positives and false negatives (favoring XGBoost).
Hyperparameter Tuning & Model Interpretation:
For the best performing models, we will use SHAP (SHapley Additive exPlanations) values to interpret the model's decisions. This will provide insights into how each feature contributes to the predictions, even in a non-linear model.
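SHAP values decompose each individual prediction additively: f(x) = phi_0 + sum_i phi_i, where phi_0 is the expected model output over the background data and each phi_i is the contribution of feature i for that instance. The waterfall plots below visualize exactly this decomposition.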
Data split, defining and training models with GridSearchCV, plotting SHAP summary, and force plots
selected_features = ['EverTravelledAbroad', 'AnnualIncome', 'FrequentFlyer', 'FamilyMembers', 'Age']
rf_param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}
xgb_param_grid = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.01, 0.1],
    'n_estimators': [100, 200],
    'subsample': [0.8, 0.9]
}
svm_param_grid = {
    'C': [0.1, 1, 10],
    'gamma': ['scale', 'auto'],
    'kernel': ['rbf', 'poly']
}
rf_grid = GridSearchCV(RandomForestClassifier(random_state=42), rf_param_grid, cv=5, n_jobs=-1)
rf_grid.fit(X_train_scaled, y_train)
xgb_grid = GridSearchCV(XGBClassifier(eval_metric='logloss', random_state=42),
                        xgb_param_grid, cv=5, n_jobs=-1)
xgb_grid.fit(X_train_scaled, y_train)
svm_grid = GridSearchCV(SVC(probability=True, random_state=42), svm_param_grid, cv=5, n_jobs=-1)
svm_grid.fit(X_train_scaled, y_train)
X_selected_test_np = X_selected_test.to_numpy()
plot_shap_summary(rf_grid.best_estimator_, X_selected_test_np, selected_features, "Random Forest SHAP Values")
plot_shap_summary(xgb_grid.best_estimator_, X_selected_test_np, selected_features, "XGBoost SHAP Values")
plot_shap_summary(svm_grid.best_estimator_, X_selected_test_np[:100], selected_features, "SVM SHAP Values")
shap.initjs()
for name, model in [("Random Forest", rf_grid.best_estimator_),
                    ("XGBoost", xgb_grid.best_estimator_),
                    ("SVM", svm_grid.best_estimator_)]:
    print(f"\n{name} Waterfall Plot:")
    if isinstance(model, (RandomForestClassifier, XGBClassifier)):
        explainer = shap.TreeExplainer(model)
        shap_values = explainer.shap_values(X_selected_test_np)
        # Older SHAP versions return a list with one array per class
        if isinstance(shap_values, list):
            shap_vals = shap_values[1][0, :]           # first instance, positive class
            base_val = explainer.expected_value[1]
        else:
            shap_vals = shap_values[0, :]
            base_val = explainer.expected_value
        if hasattr(base_val, '__len__'):
            base_val = float(base_val[0])
        else:
            base_val = float(base_val)
        explanation = shap.Explanation(values=shap_vals,
                                       base_values=base_val,
                                       feature_names=selected_features)
        if explanation.values.ndim > 1:
            explanation.values = explanation.values[:, 0]
        shap.plots.waterfall(explanation)
    else:
        # SVM has no tree structure, so use the model-agnostic KernelExplainer
        # on a 100-row background sample to keep runtime manageable
        explainer = shap.KernelExplainer(model.predict_proba, X_selected_test_np[:100])
        shap_values = explainer.shap_values(X_selected_test_np[:100])
        if isinstance(shap_values, list):
            shap_vals = shap_values[1][0, :]
            base_val = explainer.expected_value[1]
        else:
            shap_vals = shap_values[0, :]
            base_val = explainer.expected_value
        if hasattr(base_val, '__len__'):
            base_val = float(base_val[0])
        else:
            base_val = float(base_val)
        explanation = shap.Explanation(values=shap_vals,
                                       base_values=base_val,
                                       feature_names=selected_features)
        if explanation.values.ndim > 1:
            explanation.values = explanation.values[:, 0]
        shap.plots.waterfall(explanation)
plt.show()
Random Forest Waterfall Plot:
XGBoost Waterfall Plot:
SVM Waterfall Plot:
Reading the force plots, almost all values are blue, which suggests that the features are generally pushing the predictions towards not purchasing insurance.
This could happen if:
- The model is predicting mostly negative outcomes
- There's an imbalance in the dataset (more people not purchasing insurance than purchasing)
- The model is not well-calibrated
All of these cases will be checked.
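As a quick first check on the imbalance question, the class balance of the training labels can be inspected directly (a small sketch; y_train is the numpy target array defined earlier):
import numpy as np

# Share of each class in the training labels; per describe(), TravelInsurance
# has a mean of about 0.357, so roughly a 64/36 split is expected
unique, counts = np.unique(y_train, return_counts=True)
print(dict(zip(unique, counts / counts.sum())))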
Applying SMOTE to the training data, defining models and their parameter grids, training and calibrating models, and evaluating them
features = ['EverTravelledAbroad', 'AnnualIncome', 'FrequentFlyer', 'FamilyMembers', 'Age']
if isinstance(X_train, pl.DataFrame):
    X_train_np = X_train.select(features).to_numpy()
else:
    X_train_np = X_train
if isinstance(y_train, pl.DataFrame) or isinstance(y_train, pl.Series):
    y_train_np = y_train.to_numpy().flatten()
else:
    y_train_np = y_train
y_train_np = y_train_np.astype(int)
if isinstance(X_test, pl.DataFrame):
    X_test_np = X_test.select(features).to_numpy()
else:
    X_test_np = X_test
if isinstance(y_test, pl.DataFrame) or isinstance(y_test, pl.Series):
    y_test_np = y_test.to_numpy().flatten()
else:
    y_test_np = y_test
y_test_np = y_test_np.astype(int)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_np)
X_test_scaled = scaler.transform(X_test_np)
models = {
    'RandomForest': (RandomForestClassifier(random_state=42), {
        'classifier__n_estimators': [100, 200],
        'classifier__max_depth': [None, 10, 20],
        'classifier__min_samples_split': [2, 5],
        'classifier__min_samples_leaf': [1, 2]
    }),
    'XGBoost': (XGBClassifier(eval_metric='logloss', random_state=42), {
        'classifier__max_depth': [3, 4, 5],
        'classifier__learning_rate': [0.01, 0.1],
        'classifier__n_estimators': [100, 200],
        'classifier__subsample': [0.8, 0.9]
    }),
    'SVM': (SVC(probability=True, random_state=42), {
        'classifier__C': [0.1, 1, 10],
        'classifier__gamma': ['scale', 'auto'],
        'classifier__kernel': ['rbf', 'poly']
    })
}
calibrated_models = {}
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for name, (model, param_grid) in models.items():
    # Oversample only inside each training fold via the imblearn pipeline
    pipeline = ImbPipeline([
        ('smote', SMOTE(random_state=42)),
        ('classifier', model)
    ])
    grid = GridSearchCV(pipeline, param_grid, cv=cv, n_jobs=-1)
    grid.fit(X_train_scaled, y_train_np)
    calibrated = CalibratedClassifierCV(grid.best_estimator_, cv=cv, method='sigmoid')
    calibrated.fit(X_train_scaled, y_train_np)
    calibrated_models[name] = calibrated
def evaluate_model(model, X, y, name):
    y_pred = model.predict(X)
    print(f"\n{name} Classification Report:")
    print(classification_report(y, y_pred))
    print(f"{name} prediction balance:", np.bincount(y_pred) / len(y_pred))

for name, model in calibrated_models.items():
    evaluate_model(model, X_test_scaled, y_test_np, name)
plot_roc_auc(calibrated_models, X_test_scaled, y_test)
RandomForest Classification Report:
              precision    recall  f1-score   support
           0       0.80      0.95      0.87       256
           1       0.86      0.57      0.69       142
    accuracy                           0.81       398
   macro avg       0.83      0.76      0.78       398
weighted avg       0.82      0.81      0.80       398
RandomForest prediction balance: [0.7638191 0.2361809]

XGBoost Classification Report:
              precision    recall  f1-score   support
           0       0.79      0.96      0.87       256
           1       0.90      0.54      0.68       142
    accuracy                           0.81       398
   macro avg       0.84      0.75      0.77       398
weighted avg       0.83      0.81      0.80       398
XGBoost prediction balance: [0.7839196 0.2160804]

SVM Classification Report:
              precision    recall  f1-score   support
           0       0.79      0.91      0.85       256
           1       0.79      0.57      0.66       142
    accuracy                           0.79       398
   macro avg       0.79      0.74      0.76       398
weighted avg       0.79      0.79      0.78       398
SVM prediction balance: [0.74120603 0.25879397]
features = ['EverTravelledAbroad', 'AnnualIncome', 'FrequentFlyer', 'FamilyMembers', 'Age']
# Select the features by name and reuse the scaler fitted on the training data,
# since the calibrated models were trained on scaled inputs
X_test_selected = scaler.transform(X_test.select(features).to_numpy())
rf_calibrated = calibrated_models['RandomForest']
xgb_calibrated = calibrated_models['XGBoost']
svm_calibrated = calibrated_models['SVM']
def calculate_metrics(y_true, y_pred_proba, threshold=0.5):
    y_pred = (y_pred_proba >= threshold).astype(int)
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    conf_matrix = confusion_matrix(y_true, y_pred)
    return accuracy, precision, recall, f1, conf_matrix
results = {
    "Calibrated Random Forest": {},
    "Calibrated XGBoost": {},
    "Calibrated SVM": {}
}
for name, model in [("Calibrated Random Forest", rf_calibrated),
                    ("Calibrated XGBoost", xgb_calibrated),
                    ("Calibrated SVM", svm_calibrated)]:
    y_pred_proba = model.predict_proba(X_test_selected)[:, 1]
    results[name][0.5] = calculate_metrics(y_test, y_pred_proba, 0.5)
    results[name][0.3] = calculate_metrics(y_test, y_pred_proba, 0.3)
plt.style.use('seaborn-v0_8-darkgrid')
fig = plt.figure(figsize=(20, 18))
fig.suptitle("Model Performance Metrics and Confusion Matrices", fontsize=20, color='navy')
gs = fig.add_gridspec(3, 3, width_ratios=[3, 1, 1], height_ratios=[1, 1, 1])
for idx, (model, thresholds) in enumerate(results.items()):
    ax_metrics = fig.add_subplot(gs[idx, 0])
    ax_conf_matrix_03 = fig.add_subplot(gs[idx, 1])
    ax_conf_matrix_05 = fig.add_subplot(gs[idx, 2])
    x = np.arange(4)
    width = 0.35
    metrics_05 = thresholds[0.5][:4]
    metrics_03 = thresholds[0.3][:4]
    ax_metrics.bar(x - width/2, metrics_05, width, label='Threshold 0.5', color='royalblue')
    ax_metrics.bar(x + width/2, metrics_03, width, label='Threshold 0.3', color='skyblue')
    ax_metrics.set_ylabel('Score', color='navy')
    ax_metrics.set_title(model, color='navy')
    ax_metrics.set_xticks(x)
    ax_metrics.set_xticklabels(["Accuracy", "Precision", "Recall", "F1 Score"], color='navy')
    ax_metrics.legend()
    ax_metrics.set_ylim(0, 1)
    for i, v in enumerate(metrics_05):
        ax_metrics.text(i - width/2, v + 0.01, f'{v:.2f}', ha='center', va='bottom', color='navy')
    for i, v in enumerate(metrics_03):
        ax_metrics.text(i + width/2, v + 0.01, f'{v:.2f}', ha='center', va='bottom', color='navy')
    conf_matrix_03 = thresholds[0.3][4]
    conf_matrix_05 = thresholds[0.5][4]
    disp_03 = ConfusionMatrixDisplay(confusion_matrix=conf_matrix_03)
    disp_03.plot(ax=ax_conf_matrix_03, cmap='Blues', colorbar=False)
    ax_conf_matrix_03.set_title(f"{model} (Threshold 0.3)", color='navy')
    disp_05 = ConfusionMatrixDisplay(confusion_matrix=conf_matrix_05)
    disp_05.plot(ax=ax_conf_matrix_05, cmap='Blues', colorbar=False)
    ax_conf_matrix_05.set_title(f"{model} (Threshold 0.5)", color='navy')
plt.tight_layout(rect=[0, 0, 1, 0.96])
plt.show()
Key Observations:
- XGBoost shows the best overall performance with the highest AUC and true negatives, and lowest false positives.
- Random Forest and SVM are tied in true positives and false negatives, but Random Forest has fewer false positives.
- All models prioritize precision over recall, indicating a conservative approach in predicting insurance purchases.
- XGBoost is the most precise, while Random Forest and SVM have slightly better recall.
In summary, XGBoost appears to be the most balanced model, with Random Forest following closely. SVM, while competitive in some aspects, shows the lowest overall performance among the three models.
Model Comparison:
- XGBoost has the highest AUC, indicating the best overall performance in distinguishing between classes. Random Forest follows closely, while SVM lags behind.
- Precision vs. Recall: All models prioritize precision over recall, meaning they are better at avoiding false positives than catching all true positives. This is evident from the relatively low number of false positives and higher number of false negatives across all models.
- Business Implications: If the cost of false positives (predicting someone will buy insurance when they won't) is high, XGBoost or Random Forest would be preferable due to their higher precision. If catching more true positives is critical, you might need to adjust the decision threshold or explore other techniques to improve recall.
- Threshold Adjustment: Lowering the decision threshold improves recall at the expense of precision. This trade-off should be weighed against the business costs associated with false positives and false negatives; the sketch below makes it concrete.
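To visualize this trade-off, the following minimal sketch plots precision and recall across candidate thresholds. It assumes the calibrated XGBoost model, X_test_selected, and y_test from the cells above; the dashed line marks the alternative 0.3 threshold used in the comparison.
from sklearn.metrics import precision_recall_curve
y_scores = xgb_calibrated.predict_proba(X_test_selected)[:, 1]
precisions, recalls, thresholds = precision_recall_curve(y_test, y_scores)
plt.figure(figsize=(8, 5))
# precision_recall_curve returns one more precision/recall value than thresholds
plt.plot(thresholds, precisions[:-1], label='Precision')
plt.plot(thresholds, recalls[:-1], label='Recall')
plt.axvline(0.3, linestyle='--', color='gray', label='Threshold 0.3')
plt.xlabel('Decision threshold')
plt.ylabel('Score')
plt.legend()
plt.show()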
¶
The following analysis is primarily applicable to the tree-based models (Random Forest and XGBoost).
Plotting feature distributions and importance, correlation analysis, and adjusting the decision threshold
features = ['EverTravelledAbroad', 'AnnualIncome', 'FrequentFlyer', 'FamilyMembers', 'Age']
# Select the five retained features by name (the original positional indexing was a no-op)
X_test_selected = X_test.select(features).to_numpy()
fig, axes = plt.subplots(2, 3, figsize=(20, 14))
fig.suptitle("Feature Distributions", fontsize=16)
axes = axes.flatten()
rf_calibrated = calibrated_models['RandomForest']
xgb_calibrated = calibrated_models['XGBoost']
svm_calibrated = calibrated_models['SVM']
for i, feature in enumerate(features):
    sns.histplot(data=pd.DataFrame({'feature': X_test_selected[:, i], 'target': y_test}),
                 x='feature', hue='target', kde=True, ax=axes[i])
    axes[i].set_title(f"Distribution of {feature}")
fig.delaxes(axes[5])
plt.tight_layout()
plt.show()
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(20, 6))
fig.suptitle("Feature Importance", fontsize=16)
def plot_feature_importance(model, X, y, features, ax, title):
    # Permutation importance: the drop in score when a feature is shuffled
    result = permutation_importance(model, X, y, n_repeats=10, random_state=42)
    sorted_idx = result.importances_mean.argsort()
    ax.barh(range(len(features)), result.importances_mean[sorted_idx])
    ax.set_yticks(range(len(features)))
    ax.set_yticklabels([features[i] for i in sorted_idx])
    ax.set_title(title)
plot_feature_importance(rf_calibrated, X_test_selected, y_test, features, ax1, "Random Forest")
plot_feature_importance(xgb_calibrated, X_test_selected, y_test, features, ax2, "XGBoost")
correlation = np.zeros(len(features))
for i, feature in enumerate(features):
    correlation[i] = np.corrcoef(X_test_selected[:, i], y_test)[0, 1]
ax3.barh(range(len(features)), correlation)
ax3.set_yticks(range(len(features)))
ax3.set_yticklabels(features)
ax3.set_title("Correlation with Target")
plt.tight_layout()
plt.show()
¶
Partial dependence plots (PDPs) show how the predicted probability changes as one or two features vary, averaging out the effects of the remaining features.
features = ['EverTravelledAbroad', 'AnnualIncome', 'FrequentFlyer', 'FamilyMembers', 'Age']
X_test_selected = X_test.select(features).to_numpy()
models = [rf_grid.best_estimator_, xgb_grid.best_estimator_, svm_grid.best_estimator_]
model_names = ['Random Forest', 'XGBoost', 'SVM']
plot_feature_importance_comparison(models, features, model_names, X_test_selected, y_test)
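The cell above compares feature importances; the PDPs themselves can be drawn with scikit-learn's PartialDependenceDisplay (already imported). A minimal sketch, assuming the tuned rf_grid / xgb_grid estimators and the numpy X_test_selected from the cell above:
for model, name in [(rf_grid.best_estimator_, 'Random Forest'),
                    (xgb_grid.best_estimator_, 'XGBoost')]:
    fig, ax = plt.subplots(figsize=(16, 8))
    # A single bounding axis: sklearn draws one PDP panel per feature inside it
    PartialDependenceDisplay.from_estimator(
        model, X_test_selected, features=range(len(features)),
        feature_names=features, ax=ax)
    fig.suptitle(f"Partial Dependence: {name}")
    plt.tight_layout()
    plt.show()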
¶
- Expand Grid Search: Include more parameters and a wider range of values.
- Random Search: Useful when one has a large parameter space.
- Bayesian Optimization: More efficient for finding optimal hyperparameters.
In this case, random search is applied because each model has a large parameter space to explore. Random search samples this space efficiently, often finding good solutions faster than an exhaustive grid search. With 100 iterations per model, we balance thorough exploration with computational cost. This approach is particularly useful with a mix of continuous and discrete parameters, and when it is unclear which parameter ranges matter most. It also gives a good starting point for identifying promising parameter values, which can be refined further if needed.
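Bayesian optimization is not applied here, but for reference a minimal sketch with scikit-optimize's BayesSearchCV (an extra dependency, not installed in this notebook) would look roughly like this for the XGBoost space defined below:
from skopt import BayesSearchCV
from skopt.space import Integer, Real
xgb_bayes = BayesSearchCV(
    XGBClassifier(eval_metric='logloss', random_state=42),
    {
        'n_estimators': Integer(100, 500),
        'max_depth': Integer(3, 8),
        'learning_rate': Real(0.01, 0.3, prior='log-uniform'),
        'subsample': Real(0.5, 1.0)
    },
    n_iter=50, cv=3, n_jobs=-1, random_state=42
)
# xgb_bayes.fit(X_train_scaled, y_train_np)  # then inspect xgb_bayes.best_params_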
Defining parameter grids for each model and performing hyperparameter tuning with RandomizedSearchCV
rf_param_grid = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}
xgb_param_grid = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_depth': [3, 4, 5, 6, 7, 8],
    'learning_rate': [0.01, 0.1, 0.2, 0.3],
    'subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
}
svm_param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': ['scale', 'auto', 0.1, 1],
    'kernel': ['rbf', 'poly', 'sigmoid']
}
def tune_model(model, param_grid, X_train, y_train, model_name):
    # Note: the SVM grid has only 48 combinations, so fewer than n_iter candidates are sampled there
    random_search = RandomizedSearchCV(model, param_grid, n_iter=100, cv=3, n_jobs=-1, verbose=0, random_state=42)
    random_search.fit(X_train, y_train)
    print(f"\n{model_name} Best Parameters:")
    for param, value in random_search.best_params_.items():
        print(f"{param}: {value}")
    print(f"{model_name} Best Score: {random_search.best_score_:.4f}")
    return random_search.best_estimator_
print("Performing hyperparameter tuning...")
rf_best = tune_model(RandomForestClassifier(random_state=42), rf_param_grid, X_train_scaled, y_train_np, "Random Forest")
xgb_best = tune_model(XGBClassifier(eval_metric='logloss', random_state=42),
                      xgb_param_grid, X_train_scaled, y_train_np, "XGBoost")
svm_best = tune_model(SVC(probability=True, random_state=42), svm_param_grid, X_train_scaled, y_train_np, "SVM")
print("\nHyperparameter tuning completed.")
Performing hyperparameter tuning...

Random Forest Best Parameters:
n_estimators: 400
min_samples_split: 5
min_samples_leaf: 2
max_depth: 10
bootstrap: True
Random Forest Best Score: 0.8338

XGBoost Best Parameters:
subsample: 0.6
n_estimators: 300
max_depth: 7
learning_rate: 0.01
colsample_bytree: 0.9
XGBoost Best Score: 0.8383

SVM Best Parameters:
kernel: rbf
gamma: auto
C: 10
SVM Best Score: 0.8276

Hyperparameter tuning completed.
Creating final models with the best hyperparameters and training them
def create_model_with_params(model_class, best_params, **kwargs):
    # Copy the tuned parameters and override/extend them with any explicit kwargs
    params = best_params.copy()
    params.update(kwargs)
    return model_class(**params)
rf_final = create_model_with_params(RandomForestClassifier, rf_best.get_params(), random_state=42)
xgb_final = create_model_with_params(XGBClassifier, xgb_best.get_params(), random_state=42)
svm_final = create_model_with_params(SVC, svm_best.get_params(), probability=True, random_state=42)
rf_final.fit(X_train_scaled, y_train_np)
xgb_final.fit(X_train_scaled, y_train_np)
svm_final.fit(X_train_scaled, y_train_np)
SVC(C=10, gamma='auto', probability=True, random_state=42)
¶
- Voting Classifier: Combining predictions from multiple models.
- Stacking Methods: Using predictions from base models as features for a meta-model.
Creating and fitting the Voting Classifier, evaluating the ensemble and individual models
voting_clf = VotingClassifier(
    estimators=[('rf', rf_final), ('xgb', xgb_final), ('svm', svm_final)],
    voting='soft'
)
voting_clf.fit(X_train_scaled, y_train_np)
models = {
    'Ensemble': voting_clf,
    'Random Forest': rf_final,
    'XGBoost': xgb_final,
    'SVM': svm_final
}
print("Model Performances:")
print("-" * 40)
for name, model in models.items():
    y_pred = model.predict(X_test_scaled)
    accuracy = accuracy_score(y_test_np, y_pred)
    print(f"{name:<15} Accuracy: {accuracy:.4f}")
print("\nEnsemble Detailed Performance:")
print("-" * 40)
y_pred_ensemble = voting_clf.predict(X_test_scaled)
print(classification_report(y_test, y_pred_ensemble))
Model Performances:
----------------------------------------
Ensemble        Accuracy: 0.8141
Random Forest   Accuracy: 0.8241
XGBoost         Accuracy: 0.8141
SVM             Accuracy: 0.7990

Ensemble Detailed Performance:
----------------------------------------
              precision    recall  f1-score   support

           0       0.79      0.96      0.87       256
           1       0.89      0.55      0.68       142

    accuracy                           0.81       398
   macro avg       0.84      0.76      0.77       398
weighted avg       0.83      0.81      0.80       398
Defining the base models and meta-learner, creating Stacking Ensemble 1 (all models) and Stacking Ensemble 2 (RF and XGB only), fitting the stacking ensembles, evaluating them against the individual models, and performing cross-validation for a more robust evaluation
base_models = [
    ('rf', rf_calibrated),
    ('xgb', xgb_calibrated),
    ('svm', svm_calibrated)
]
meta_learner = LogisticRegression()
stacking_clf1 = StackingClassifier(
    estimators=base_models,
    final_estimator=meta_learner,
    cv=5
)
stacking_clf2 = StackingClassifier(
    estimators=[('rf', rf_calibrated), ('xgb', xgb_calibrated)],
    final_estimator=meta_learner,
    cv=5
)
stacking_clf1.fit(X_train_scaled, y_train_np)
stacking_clf2.fit(X_train_scaled, y_train_np)
def evaluate_model(model, X, y):
    # Test-set accuracy plus the full per-class report as a dict
    y_pred = model.predict(X)
    accuracy = accuracy_score(y, y_pred)
    report = classification_report(y, y_pred, output_dict=True)
    return accuracy, report
models_to_evaluate = {
    'Random Forest': rf_calibrated,
    'XGBoost': xgb_calibrated,
    'SVM': svm_calibrated,
    'Stacking Ensemble (All Models)': stacking_clf1,
    'Stacking Ensemble (RF + XGB)': stacking_clf2
}
performance_results = {}
cv_results = {}
for name, model in models_to_evaluate.items():
    accuracy, report = evaluate_model(model, X_test_scaled, y_test_np)
    performance_results[name] = {'accuracy': accuracy, 'report': report}
    cv_results[name] = perform_cross_validation(model, X_train_scaled, y_train_np)
n_models = len(performance_results)
n_rows = (n_models + 1) // 2
fig, axes = plt.subplots(n_rows, 2, figsize=(20, 6 * n_rows))
fig.suptitle("Model Performance Metrics", fontsize=20, color='navy')
axes = axes.flatten()
colors = plt.cm.Blues(np.linspace(0.3, 0.9, n_models))
for idx, (name, results) in enumerate(performance_results.items()):
    ax = axes[idx]
    report = results['report']
    accuracy = results['accuracy']
    labels = ['Precision', 'Recall', 'F1-Score']
    class_0_metrics = [report['0'][label.lower()] for label in labels]
    class_1_metrics = [report['1'][label.lower()] for label in labels]
    x = np.arange(len(labels))
    width = 0.35
    ax.bar(x - width/2, class_0_metrics, width, label='Class 0', color=colors[idx], alpha=0.7)
    ax.bar(x + width/2, class_1_metrics, width, label='Class 1', color=colors[idx], alpha=0.4)
    ax.set_ylabel('Score', color='navy')
    ax.set_title(f"{name} (Accuracy: {accuracy:.4f})", color='navy')
    ax.set_xticks(x)
    ax.set_xticklabels(labels, color='navy')
    ax.legend()
    ax.set_ylim(0, 1)
for idx in range(n_models, len(axes)):
    fig.delaxes(axes[idx])
plt.tight_layout(rect=[0, 0, 1, 0.96])
plt.show()
fig, ax = plt.subplots(figsize=(12, 8))
cv_metrics = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc', 'balanced_accuracy']
x = np.arange(len(cv_metrics))
for idx, (name, results) in enumerate(cv_results.items()):
    means = [results[f'test_{metric}'].mean() for metric in cv_metrics]
    stds = [results[f'test_{metric}'].std() * 2 for metric in cv_metrics]
    ax.errorbar(x, means, yerr=stds, label=name, capsize=5, marker='o', color=colors[idx])
ax.set_xticks(x)
ax.set_xticklabels(cv_metrics, color='navy')
ax.set_ylabel('Score', color='navy')
ax.set_title('Cross-Validation Results', color='navy')
ax.legend()
plt.tight_layout()
plt.show()
Stratified cross-validation results for CalibratedClassifierCV:
Accuracy: 0.8351 (+/- 0.0401)
Precision: 0.9018 (+/- 0.0249)
Recall: 0.6039 (+/- 0.1076)
F1: 0.7223 (+/- 0.0862)
Roc_auc: 0.7838 (+/- 0.0549)
Balanced_accuracy: 0.7838 (+/- 0.0549)

Stratified cross-validation results for CalibratedClassifierCV:
Accuracy: 0.8389 (+/- 0.0309)
Precision: 0.9307 (+/- 0.0183)
Recall: 0.5933 (+/- 0.0852)
F1: 0.7239 (+/- 0.0677)
Roc_auc: 0.7844 (+/- 0.0426)
Balanced_accuracy: 0.7844 (+/- 0.0426)

Stratified cross-validation results for CalibratedClassifierCV:
Accuracy: 0.8125 (+/- 0.0504)
Precision: 0.8133 (+/- 0.0760)
Recall: 0.6163 (+/- 0.1010)
F1: 0.7007 (+/- 0.0905)
Roc_auc: 0.7689 (+/- 0.0606)
Balanced_accuracy: 0.7689 (+/- 0.0606)

Stratified cross-validation results for StackingClassifier:
Accuracy: 0.8395 (+/- 0.0316)
Precision: 0.9262 (+/- 0.0143)
Recall: 0.5986 (+/- 0.0889)
F1: 0.7263 (+/- 0.0700)
Roc_auc: 0.7861 (+/- 0.0441)
Balanced_accuracy: 0.7861 (+/- 0.0441)

Stratified cross-validation results for StackingClassifier:
Accuracy: 0.8402 (+/- 0.0314)
Precision: 0.9287 (+/- 0.0117)
Recall: 0.5986 (+/- 0.0889)
F1: 0.7271 (+/- 0.0698)
Roc_auc: 0.7866 (+/- 0.0440)
Balanced_accuracy: 0.7866 (+/- 0.0440)
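The summaries above are printed by the perform_cross_validation helper imported from utils.functions, whose source is not shown here. A plausible minimal version that matches the printed format (an assumption, not the actual helper) is:
def perform_cross_validation_sketch(model, X, y, n_splits=5):
    # Hypothetical stand-in for utils.functions.perform_cross_validation
    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    scoring = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc', 'balanced_accuracy']
    results = cross_validate(model, X, y, cv=cv, scoring=scoring)
    print(f"Stratified cross-validation results for {type(model).__name__}:")
    for metric in scoring:
        scores = results[f'test_{metric}']
        print(f"{metric.capitalize()}: {scores.mean():.4f} (+/- {scores.std() * 2:.4f})")
    return results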
Interpretation:
Individual Model Performance:
- The calibrated Random Forest and XGBoost perform equally well, each reaching 81.41% accuracy on the test set.
- The calibrated SVM lags slightly behind at 79.15%.
- All models show high precision for class 1 (buying insurance) but lower recall, indicating they're conservative in predicting insurance purchases.
Ensemble Performance:
- Both stacking ensembles (all models and RF+XGB) achieve a slightly higher accuracy of 81.66% compared to the best individual models.
- The performance metrics for both ensembles are identical, suggesting that including SVM in the ensemble doesn't provide additional benefit on the test set.
Cross-Validation Results:
- The Stacking Ensemble (RF + XGB) shows the highest mean accuracy (84.02%) with a small standard deviation, indicating good and stable performance across folds.
- The Stacking Ensemble (All Models) is very close behind (83.95%).
- XGBoost and Random Forest follow closely (83.89% and 83.51% respectively).
- SVM consistently underperforms (81.25%) compared to the other models.
Key Observations:
Model Consistency: RF and XGB show identical performance on the test set, suggesting they're capturing similar patterns in the data. The ensembles slightly outperform the individual models.
Class Imbalance: All models show high recall for class 0 (not buying insurance) but lower recall for class 1, reflecting the class imbalance in the dataset: most people do not buy travel insurance.
Ensemble Effectiveness: The stacking ensembles provide a small improvement over the best individual models in both test set performance and cross-validation, indicating that the additional complexity offers a slight advantage for this problem.
Generalization: Cross-validation scores are slightly higher than test-set performance for all models; the gap is small, which points to reasonable generalization, with the difference most likely due to sampling variability between the CV folds and the held-out test set.
Precision-Recall Trade-off: All models show a trade-off between precision and recall for class 1, with high precision but lower recall. This suggests the models are more cautious about predicting insurance purchases, potentially minimizing false positives at the cost of missing some actual purchases.
Model Selection: Given the superior cross-validation performance, the Stacking Ensemble (RF + XGB) could be considered the best model. However, the performance difference is small, so the choice between this ensemble and individual RF or XGB models may depend on computational resources and interpretability requirements.
Data Preprocessing and Exploratory Data Analysis:
- Dataset contains no null values or duplicates.
- Features are predominantly binary or non-normally distributed.
- Significant correlation observed between AnnualIncome and TravelInsurance.
- Logistic Regression with backward elimination identified five key features: EverTravelledAbroad, AnnualIncome, FrequentFlyer, FamilyMembers, and Age.
Feature Engineering and Initial Modeling:
- Logistic Regression violated the linearity-of-log-odds assumption despite attempts at polynomial transformation and scaling.
- High mean accuracy was achieved, but the model was deemed unsuitable for predictions due to the assumption violation.
- Decision made to transition to non-linear methods.
Model Development:
- SVM, XGBoost, and Random Forest selected for further testing due to high performance on small datasets.
- SHAP analysis revealed that most features generally push predictions towards not buying insurance, reflecting the class imbalance in the data.
- Model calibration maintained high precision but lower recall across all models.
- Random Forest most sensitive to threshold adjustment (0.30), improving recall at the cost of precision.
- XGBoost and SVM showed more defined decision boundaries, less sensitive to threshold changes.
- Partial Dependence Plots (PDPs) revealed non-linear patterns and potential feature interactions.
- Hyperparameter tuning via random search improved model performance by 0.01 to 0.05.
Ensemble Methods:
- Stacking ensembles outperformed individual models in both test set and cross-validation performance.
- Both stacking ensembles (all models and RF+XGB) achieved 81.66% accuracy on the test set, slightly higher than the best individual models.
Model Evaluation:
- Cross-validation results: Stacking Ensemble (RF + XGB) (84.02% ± 0.0314), Stacking Ensemble (All Models) (83.95% ± 0.0316), XGBoost (83.89% ± 0.0309), Random Forest (83.51% ± 0.0401), SVM (81.25% ± 0.0504).
- All models show high recall for class 0 (not buying insurance) but lower recall for class 1.
Key Findings:
- Stacking ensembles show the best performance in cross-validation, suggesting they capture patterns more effectively than individual models.
- Class imbalance is reflected in model predictions - high precision, lower recall for insurance purchases across all models.
- Ensembles provided a slight improvement over the best individual models in both test set and cross-validation performance.
- Cross-validation results are slightly higher than test set performance for all models; the gap is small and most likely reflects sampling variability rather than overfitting.
- The Stacking Ensemble (RF + XGB) performs slightly better than the ensemble including SVM, suggesting that SVM may not contribute significantly to the ensemble's performance.
Business Implications:
- Models conservative in predicting insurance purchases, suitable if false positives are costly.
- Marketing focus suggested for high-income individuals with international travel experience, older individuals with larger families, and frequent flyers with higher incomes.
Future Work:
- Considering XGBoost for deployment due to its simplicity and stable cross-validation performance.
- Experimenting with classification threshold adjustment to balance precision and recall.
- Exploring advanced techniques for handling class imbalance (a minimal sketch follows below).
- Investigating feature interactions and non-linear relationships identified in the PDPs.
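As a starting point for the class-imbalance item above, here is a minimal sketch using the SMOTE and ImbPipeline imports already loaded; it assumes X_train_scaled and y_train_np from the earlier cells, and oversampling happens inside each CV fold to avoid leakage.
imb_pipeline = ImbPipeline([
    ('smote', SMOTE(random_state=42)),  # oversample the minority class within each fold
    ('xgb', XGBClassifier(eval_metric='logloss', random_state=42))
])
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(imb_pipeline, X_train_scaled, y_train_np, cv=cv, scoring='f1')
print(f"SMOTE + XGBoost CV F1: {scores.mean():.4f} (+/- {scores.std() * 2:.4f})")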