import pickle
import warnings

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np

import pandas as pd
import plotly.express as px
import shap
import seaborn as sns
import xgboost as xgb
from interpret import show
from interpret.glassbox import ExplainableBoostingClassifier
from optuna.integration.sklearn import OptunaSearchCV
from optuna.distributions import IntDistribution
from sklearn.compose import make_column_transformer
from sklearn.dummy import DummyClassifier
from sklearn.experimental import enable_halving_search_cv
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import mutual_info_classif
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import (
    cross_validate,
    train_test_split,
    GridSearchCV,
    StratifiedKFold
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import (
    FunctionTransformer,
    KBinsDiscretizer,
    RobustScaler,
    OneHotEncoder
)
from sklearn.svm import SVC
from sklearn.utils import resample
from sklearn.tree import DecisionTreeClassifier

Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)


# Load the dataset and drop the customer identifier column.
df_full = pd.read_parquet("input/dataset.pq")
df_full.drop(columns=["customer_id"], inplace=True)

# Convert object columns to ``category`` only where that is actually smaller.
# ``deep=True`` is required for a meaningful comparison: without it, object
# columns are measured by their pointer array only and the string payload is
# ignored.
objects = df_full.select_dtypes("object")
object_columns = objects.columns
categories = objects.astype("category")
to_change = object_columns[
    categories.memory_usage(index=False, deep=True)
    < objects.memory_usage(index=False, deep=True)
]
df_full[to_change] = categories[to_change]

# Feature matrix and target.
X, y = df_full.drop(columns=["credit_score"]), df_full["credit_score"]


# Display descriptive statistics for the loaded dataset.
df_full.describe()


# Count the distinct values in each column.
df_full.nunique()

age                           258
occupation                     16
annual_income               12489
monthly_inhand_salary       10579
num_bank_accounts             174
num_credit_card               284
interest_rate                 300
num_of_loan                    77
delay_from_due_date            73
num_of_delayed_payment        129
changed_credit_limit         2985
num_credit_inquiries          201
outstanding_debt            12203
credit_utilization_ratio    12500
credit_history_age             34
payment_of_min_amount           3
total_emi_per_month         11226
amount_invested_monthly     11353
payment_behaviour               7
monthly_balance             12145
credit_score                    2
dtype: int64


# Inspect the dtype of every column.
df_full.dtypes

age                          float64
occupation                  category
annual_income                float64
monthly_inhand_salary        float64
num_bank_accounts              int64
num_credit_card                int64
interest_rate                  int64
num_of_loan                  float64
delay_from_due_date            int64
num_of_delayed_payment       float64
changed_credit_limit         float64
num_credit_inquiries         float64
outstanding_debt             float64
credit_utilization_ratio     float64
credit_history_age           float64
payment_of_min_amount       category
total_emi_per_month          float64
amount_invested_monthly      float64
payment_behaviour           category
monthly_balance              float64
credit_score                   int64
dtype: object


# One stacked histogram per float64/int64 column, coloured by credit_score.
df_numeric = df_full.select_dtypes(include=["float64", "int64"])
n_panels = len(df_numeric.columns)
fig, axes = plt.subplots(nrows=n_panels, figsize=(10, 10 * n_panels))
histogram_options = dict(data=df_numeric, hue="credit_score", multiple="stack")
for ax, col in zip(axes, df_numeric.columns):
    try:
        sns.histplot(x=col, ax=ax, **histogram_options)
    except ValueError:
        # Keep going with the remaining columns; just report the failure.
        warnings.warn(f"Could not plot {col}")

Could not plot monthly_balance


def cleanup_transform(X):
    """Return a copy of ``X`` with the payment columns encoded numerically.

    The input frame is not modified. The following columns are added or
    replaced on the copy:

    * ``payment_behaviour_low_spent``: 1 when ``payment_behaviour`` contains
      ``"Low_spent"``, else 0.
    * ``payment_behaviour_value``: -1 for ``"Small_value"``, +1 for
      ``"Large_value"`` (``"High_value"`` is also accepted), 0 otherwise.
    * ``payment_behaviour_na``: 1 when ``payment_behaviour`` equals the
      placeholder ``"!@9#%8"``, else 0.
    * ``payment_of_min_amount``: ``"No"``/``"Yes"``/``"NM"`` mapped to
      0.0/1.0/-1.0 and cast to float.

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain ``payment_behaviour`` and ``payment_of_min_amount``.

    Returns
    -------
    pandas.DataFrame
        The transformed copy.

    Raises
    ------
    ValueError
        If ``payment_of_min_amount`` holds a label that is neither one of the
        known codes nor convertible to float.
    """
    X_ = X.copy()

    # Work on plain strings so categorical dtype and missing values are both
    # handled; ``na=False`` makes missing entries match nothing.
    behaviour = X_["payment_behaviour"].astype(str)
    low_spent = behaviour.str.contains("Low_spent", regex=False, na=False)
    small_value = behaviour.str.contains("Small_value", regex=False, na=False)
    # The labels use "Large_value" for the top tier; "High_value" alone never
    # matched, which left the +1 level unused.
    large_value = behaviour.str.contains(
        "Large_value", regex=False, na=False
    ) | behaviour.str.contains("High_value", regex=False, na=False)

    X_["payment_behaviour_low_spent"] = low_spent.astype(int)
    # "Small_value" takes precedence, as in the original if/else chain.
    X_["payment_behaviour_value"] = (
        (large_value & ~small_value).astype(int) - small_value.astype(int)
    )
    X_["payment_behaviour_na"] = (behaviour == "!@9#%8").astype(int)

    # Assign the result back instead of a chained ``replace(inplace=True)``,
    # which is a no-op under pandas Copy-on-Write. Unknown values are left
    # untouched so the float cast still fails loudly on unexpected labels.
    codes = {"No": 0.0, "Yes": 1.0, "NM": -1.0}
    X_["payment_of_min_amount"] = (
        X_["payment_of_min_amount"]
        .astype(object)
        .map(lambda value: codes.get(value, value))
        .astype(float)
    )
    return X_


# Wrap the cleanup function so it can be used as a pipeline step.
cleanup = FunctionTransformer(cleanup_transform)


one_hot_columns = ["occupation", "payment_behaviour"]
one_hot = OneHotEncoder(
    drop="first",
    sparse_output=False,
    handle_unknown="ignore",
    dtype=int,
)
scaled_columns = X.select_dtypes(include=["float64", "int64"]).columns
# Cleanup first, then one-hot encode the two categorical columns and scale the
# float64/int64 columns; every other column is passed through unchanged.
transformer = make_pipeline(
    cleanup,
    make_column_transformer(
        (one_hot, one_hot_columns),
        # StandardScaler is not used because it is sensitive to outliers.
        (RobustScaler(), scaled_columns),
        remainder="passthrough",
    ),
)
transformer.set_output(transform="pandas")

With transform="pandas", `func` should return a DataFrame to follow the set_output API.

Pipeline(steps=[('functiontransformer',
                 FunctionTransformer(func=<function cleanup_transform at 0x0000018F0C217B50>)),
                ('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(drop='first',
                                                                dtype=<class 'int'>,
                                                                handle_unknown='ignore',
                                                                sparse_output=False),
                                                  ['occupation',
                                                   'payment_behaviour']),
                                                 ('robustscaler'...
                                                  Index(['age', 'annual_income', 'monthly_inhand_salary', 'num_bank_accounts',
       'num_credit_card', 'interest_rate', 'num_of_loan',
       'delay_from_due_date', 'num_of_delayed_payment', 'changed_credit_limit',
       'num_credit_inquiries', 'outstanding_debt', 'credit_utilization_ratio',
       'credit_history_age', 'total_emi_per_month', 'amount_invested_monthly',
       'monthly_balance'],
      dtype='object'))]))])

Pipeline(steps=[('functiontransformer',
                 FunctionTransformer(func=<function cleanup_transform at 0x0000018F0C217B50>)),
                ('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(drop='first',
                                                                dtype=<class 'int'>,
                                                                handle_unknown='ignore',
                                                                sparse_output=False),
                                                  ['occupation',
                                                   'payment_behaviour']),
                                                 ('robustscaler'...
                                                  Index(['age', 'annual_income', 'monthly_inhand_salary', 'num_bank_accounts',
       'num_credit_card', 'interest_rate', 'num_of_loan',
       'delay_from_due_date', 'num_of_delayed_payment', 'changed_credit_limit',
       'num_credit_inquiries', 'outstanding_debt', 'credit_utilization_ratio',
       'credit_history_age', 'total_emi_per_month', 'amount_invested_monthly',
       'monthly_balance'],
      dtype='object'))]))])

FunctionTransformer(func=<function cleanup_transform at 0x0000018F0C217B50>)

ColumnTransformer(remainder='passthrough',
                  transformers=[('onehotencoder',
                                 OneHotEncoder(drop='first',
                                               dtype=<class 'int'>,
                                               handle_unknown='ignore',
                                               sparse_output=False),
                                 ['occupation', 'payment_behaviour']),
                                ('robustscaler', RobustScaler(),
                                 Index(['age', 'annual_income', 'monthly_inhand_salary', 'num_bank_accounts',
       'num_credit_card', 'interest_rate', 'num_of_loan',
       'delay_from_due_date', 'num_of_delayed_payment', 'changed_credit_limit',
       'num_credit_inquiries', 'outstanding_debt', 'credit_utilization_ratio',
       'credit_history_age', 'total_emi_per_month', 'amount_invested_monthly',
       'monthly_balance'],
      dtype='object'))])

['occupation', 'payment_behaviour']

OneHotEncoder(drop='first', dtype=<class 'int'>, handle_unknown='ignore',
              sparse_output=False)

Index(['age', 'annual_income', 'monthly_inhand_salary', 'num_bank_accounts',
       'num_credit_card', 'interest_rate', 'num_of_loan',
       'delay_from_due_date', 'num_of_delayed_payment', 'changed_credit_limit',
       'num_credit_inquiries', 'outstanding_debt', 'credit_utilization_ratio',
       'credit_history_age', 'total_emi_per_month', 'amount_invested_monthly',
       'monthly_balance'],
      dtype='object')

RobustScaler()


# Fit the preprocessing pipeline on the full feature matrix and preview it.
X_transformed = transformer.fit_transform(X)
X_transformed.head()


# Stratified 80/20 hold-out split plus a shuffled stratified splitter for CV.
split_options = dict(test_size=0.2, random_state=0, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)
kf = StratifiedKFold(shuffle=True, random_state=0)


# Adversarial validation: label train rows 0 and test rows 1, then check how
# well a classifier can tell them apart (AUC measured on the fitted data).
X_adv = pd.concat([X_train, X_test])
y_adv = pd.Series(
    np.repeat([0.0, 1.0], [len(X_train), len(X_test)]),
    index=X_adv.index,
)
ebmclf = ExplainableBoostingClassifier(random_state=0)
ebmclf.fit(X_adv, y_adv)
adv_scores = ebmclf.predict_proba(X_adv)[:, 1]
roc_auc_score(y_adv, adv_scores)

Missing values detected. Our visualizations do not currently display missing values. To retain the glassbox nature of the model you need to either set the missing values to an extreme value like -1000 that will be visible on the graphs, or manually examine the missing value score in ebm.term_scores_[term_index][0]

0.6059684


# Render the global explanation of the fitted EBM.
show(ebmclf.explain_global())


# Cross-validated adversarial validation: for every fold produced by ``kf``,
# fit a LightGBM classifier to separate train rows from test rows and record
# the out-of-fold AUC and the feature with the highest importance.
# Rows are collected per fold so the report always has exactly as many rows as
# the splitter yields, instead of a hard-coded five.
fold_labels, fold_rows = [], []
for i, (train_index, test_index) in enumerate(kf.split(X_adv, y_adv)):
    X_train_cv, X_test_cv = X_adv.iloc[train_index], X_adv.iloc[test_index]
    y_train_cv, y_test_cv = y_adv.iloc[train_index], y_adv.iloc[test_index]
    lgbclf = lgb.LGBMClassifier(random_state=0)
    lgbclf.fit(X_train_cv, y_train_cv)
    pred = lgbclf.predict_proba(X_test_cv)[:, 1]
    fold_labels.append(f"Fold {i}")
    fold_rows.append(
        {
            "auc": roc_auc_score(y_test_cv, pred),
            "most_important_feature": X_train_cv.columns[
                lgbclf.feature_importances_.argmax()
            ],
        }
    )
df_adv_report = pd.DataFrame(
    fold_rows,
    index=fold_labels,
    columns=["auc", "most_important_feature"],
)
df_adv_report


# Count missing values per transformed column, largest first.
X_transformed.isna().sum().sort_values(ascending=False)

robustscaler__monthly_inhand_salary                                  1916
robustscaler__credit_history_age                                     1120
robustscaler__num_of_delayed_payment                                  840
robustscaler__amount_invested_monthly                                 586
robustscaler__monthly_balance                                         355
robustscaler__num_credit_inquiries                                    257
robustscaler__changed_credit_limit                                    254
onehotencoder__occupation_Architect                                     0
robustscaler__num_bank_accounts                                         0
robustscaler__num_credit_card                                           0
robustscaler__interest_rate                                             0
robustscaler__num_of_loan                                               0
robustscaler__delay_from_due_date                                       0
robustscaler__outstanding_debt                                          0
onehotencoder__occupation_Developer                                     0
robustscaler__credit_utilization_ratio                                  0
robustscaler__total_emi_per_month                                       0
remainder__payment_of_min_amount                                        0
remainder__payment_behaviour_low_spent                                  0
remainder__payment_behaviour_value                                      0
robustscaler__annual_income                                             0
robustscaler__age                                                       0
onehotencoder__payment_behaviour_Low_spent_Small_value_payments         0
onehotencoder__payment_behaviour_Low_spent_Medium_value_payments        0
onehotencoder__occupation_Doctor                                        0
onehotencoder__occupation_Engineer                                      0
onehotencoder__occupation_Entrepreneur                                  0
onehotencoder__occupation_Journalist                                    0
onehotencoder__occupation_Lawyer                                        0
onehotencoder__occupation_Manager                                       0
onehotencoder__occupation_Mechanic                                      0
onehotencoder__occupation_Media_Manager                                 0
onehotencoder__occupation_Musician                                      0
onehotencoder__occupation_Scientist                                     0
onehotencoder__occupation_Teacher                                       0
onehotencoder__occupation_Writer                                        0
onehotencoder__occupation________                                       0
onehotencoder__payment_behaviour_High_spent_Large_value_payments        0
onehotencoder__payment_behaviour_High_spent_Medium_value_payments       0
onehotencoder__payment_behaviour_High_spent_Small_value_payments        0
onehotencoder__payment_behaviour_Low_spent_Large_value_payments         0
remainder__payment_behaviour_na                                         0
dtype: int64


# Columns of the transformed frame that contain at least one missing value.
has_missing = X_transformed.isna().any()
null_columns = X_transformed.columns[has_missing]


# Median-impute those columns (adding missing-value indicator columns) and
# pass every other column through with its name unchanged.
median_with_flags = SimpleImputer(strategy="median", add_indicator=True)
imputer = make_column_transformer(
    (median_with_flags, null_columns),
    remainder="passthrough",
    verbose_feature_names_out=False,
)

imputer.set_output(transform="pandas")
X_transformed_imputed = imputer.fit_transform(X_transformed)
X_transformed_imputed.head()


# Mutual information between each preprocessed feature and the target.
# Integer-typed columns are flagged as discrete for the estimator.
discrete = X_transformed_imputed.dtypes == int
# The estimator is randomised, so it is seeded like every other stochastic
# step in this notebook to make the ranking reproducible.
mi = mutual_info_classif(
    X_transformed_imputed,
    y,
    discrete_features=discrete.to_numpy(),
    random_state=0,
)
mi = pd.Series(mi, index=X_transformed_imputed.columns).sort_values()
mi.plot.barh(figsize=(10, 20));


def _simple_name(name):
    """Strip the transformer prefix from a column name.

    Only the first ``"__"`` is treated as the separator, so names that contain
    further double underscores are kept intact. Missing-indicator columns keep
    a ``missingindicator_`` marker so they do not collide with the imputed
    column they were derived from.
    """
    prefix, separator, rest = name.partition("__")
    if not separator:
        return name
    if prefix.startswith("missingindicator_"):
        return f"missingindicator_{rest}"
    return rest


# Correlation heatmap of the five features with the highest mutual information.
mi_top_five = mi.sort_values(ascending=False).head(5).index
mi_simple_names = mi_top_five.map(_simple_name)
X_simple_names = X_transformed_imputed.rename(columns=_simple_name)
corr = X_simple_names[mi_simple_names].corr()
px.imshow(corr)


# Joint distribution of scaled outstanding debt vs. interest rate, coloured by
# the target. The hue is taken from ``y`` restricted to the rows actually
# present in ``X_transformed_imputed`` so the two always line up, whichever
# data the frame was last fitted on (``y_train`` only covers the training
# rows). Assumes the index labels are unique.
sns.jointplot(
    data=X_transformed_imputed,
    x="robustscaler__outstanding_debt",
    y="robustscaler__interest_rate",
    hue=y.loc[X_transformed_imputed.index],
);


# Engineered feature: outstanding debt multiplied by the interest rate.
# ``assign`` returns new frames instead of writing into the objects returned
# by ``train_test_split``, which avoids chained-assignment warnings.
X_train = X_train.assign(
    debt_x_interest_rate=X_train["outstanding_debt"] * X_train["interest_rate"]
)
X_test = X_test.assign(
    debt_x_interest_rate=X_test["outstanding_debt"] * X_test["interest_rate"]
)

# The scaler's column list was fixed before this feature existed, so without
# this step the new column would fall into the passthrough remainder and reach
# the models unscaled. ``union`` leaves the list unchanged if the column is
# already present, so re-running this cell is safe.
column_transformer = transformer.named_steps["columntransformer"]
column_transformer.transformers = [
    (
        name,
        step,
        columns.union(["debt_x_interest_rate"], sort=False)
        if name == "robustscaler"
        else columns,
    )
    for name, step, columns in column_transformer.transformers
]

# Refit the preprocessing on the training split only.
X_transformed = transformer.fit_transform(X_train)
X_transformed_imputed = imputer.fit_transform(X_transformed)


# Bin every non-integer column into up to 256 ordinal bins; integer columns
# are passed through unchanged and column names are kept as they are.
continuous_columns = X_transformed_imputed.columns[
    X_transformed_imputed.dtypes != int
]
binner = KBinsDiscretizer(
    n_bins=256, encode="ordinal", random_state=0, dtype=np.float32
)
discretizer = make_column_transformer(
    (binner, continuous_columns),
    remainder="passthrough",
    verbose_feature_names_out=False,
)
discretizer.set_output(transform="pandas")
discretizer.fit_transform(X_transformed_imputed).head()

C:\Users\David\PycharmProjects\laboratorio-mds\venv\lib\site-packages\sklearn\preprocessing\_discretization.py:313: UserWarning:

Bins whose width are too small (i.e., <= 1e-8) in feature 0 are removed. Consider decreasing the number of bins.

C:\Users\David\PycharmProjects\laboratorio-mds\venv\lib\site-packages\sklearn\preprocessing\_discretization.py:313: UserWarning:

Bins whose width are too small (i.e., <= 1e-8) in feature 1 are removed. Consider decreasing the number of bins.

C:\Users\David\PycharmProjects\laboratorio-mds\venv\lib\site-packages\sklearn\preprocessing\_discretization.py:313: UserWarning:

Bins whose width are too small (i.e., <= 1e-8) in feature 2 are removed. Consider decreasing the number of bins.

C:\Users\David\PycharmProjects\laboratorio-mds\venv\lib\site-packages\sklearn\preprocessing\_discretization.py:313: UserWarning:

Bins whose width are too small (i.e., <= 1e-8) in feature 3 are removed. Consider decreasing the number of bins.

C:\Users\David\PycharmProjects\laboratorio-mds\venv\lib\site-packages\sklearn\preprocessing\_discretization.py:313: UserWarning:

Bins whose width are too small (i.e., <= 1e-8) in feature 4 are removed. Consider decreasing the number of bins.

C:\Users\David\PycharmProjects\laboratorio-mds\venv\lib\site-packages\sklearn\preprocessing\_discretization.py:313: UserWarning:

Bins whose width are too small (i.e., <= 1e-8) in feature 5 are removed. Consider decreasing the number of bins.

C:\Users\David\PycharmProjects\laboratorio-mds\venv\lib\site-packages\sklearn\preprocessing\_discretization.py:313: UserWarning:

Bins whose width are too small (i.e., <= 1e-8) in feature 6 are removed. Consider decreasing the number of bins.

C:\Users\David\PycharmProjects\laboratorio-mds\venv\lib\site-packages\sklearn\preprocessing\_discretization.py:313: UserWarning:

Bins whose width are too small (i.e., <= 1e-8) in feature 7 are removed. Consider decreasing the number of bins.

C:\Users\David\PycharmProjects\laboratorio-mds\venv\lib\site-packages\sklearn\preprocessing\_discretization.py:313: UserWarning:

Bins whose width are too small (i.e., <= 1e-8) in feature 8 are removed. Consider decreasing the number of bins.

C:\Users\David\PycharmProjects\laboratorio-mds\venv\lib\site-packages\sklearn\preprocessing\_discretization.py:313: UserWarning:

Bins whose width are too small (i.e., <= 1e-8) in feature 9 are removed. Consider decreasing the number of bins.

C:\Users\David\PycharmProjects\laboratorio-mds\venv\lib\site-packages\sklearn\preprocessing\_discretization.py:313: UserWarning:

Bins whose width are too small (i.e., <= 1e-8) in feature 10 are removed. Consider decreasing the number of bins.

C:\Users\David\PycharmProjects\laboratorio-mds\venv\lib\site-packages\sklearn\preprocessing\_discretization.py:313: UserWarning:

Bins whose width are too small (i.e., <= 1e-8) in feature 11 are removed. Consider decreasing the number of bins.

C:\Users\David\PycharmProjects\laboratorio-mds\venv\lib\site-packages\sklearn\preprocessing\_discretization.py:313: UserWarning:

Bins whose width are too small (i.e., <= 1e-8) in feature 12 are removed. Consider decreasing the number of bins.

C:\Users\David\PycharmProjects\laboratorio-mds\venv\lib\site-packages\sklearn\preprocessing\_discretization.py:313: UserWarning:

Bins whose width are too small (i.e., <= 1e-8) in feature 13 are removed. Consider decreasing the number of bins.

C:\Users\David\PycharmProjects\laboratorio-mds\venv\lib\site-packages\sklearn\preprocessing\_discretization.py:313: UserWarning:

Bins whose width are too small (i.e., <= 1e-8) in feature 14 are removed. Consider decreasing the number of bins.

C:\Users\David\PycharmProjects\laboratorio-mds\venv\lib\site-packages\sklearn\preprocessing\_discretization.py:313: UserWarning:

Bins whose width are too small (i.e., <= 1e-8) in feature 16 are removed. Consider decreasing the number of bins.

C:\Users\David\PycharmProjects\laboratorio-mds\venv\lib\site-packages\sklearn\preprocessing\_discretization.py:313: UserWarning:

Bins whose width are too small (i.e., <= 1e-8) in feature 17 are removed. Consider decreasing the number of bins.

C:\Users\David\PycharmProjects\laboratorio-mds\venv\lib\site-packages\sklearn\preprocessing\_discretization.py:313: UserWarning:

Bins whose width are too small (i.e., <= 1e-8) in feature 18 are removed. Consider decreasing the number of bins.

C:\Users\David\PycharmProjects\laboratorio-mds\venv\lib\site-packages\sklearn\preprocessing\_discretization.py:313: UserWarning:

Bins whose width are too small (i.e., <= 1e-8) in feature 19 are removed. Consider decreasing the number of bins.

C:\Users\David\PycharmProjects\laboratorio-mds\venv\lib\site-packages\sklearn\preprocessing\_discretization.py:313: UserWarning:

Bins whose width are too small (i.e., <= 1e-8) in feature 20 are removed. Consider decreasing the number of bins.

C:\Users\David\PycharmProjects\laboratorio-mds\venv\lib\site-packages\sklearn\preprocessing\_discretization.py:313: UserWarning:

Bins whose width are too small (i.e., <= 1e-8) in feature 23 are removed. Consider decreasing the number of bins.

C:\Users\David\PycharmProjects\laboratorio-mds\venv\lib\site-packages\sklearn\preprocessing\_discretization.py:313: UserWarning:

Bins whose width are too small (i.e., <= 1e-8) in feature 24 are removed. Consider decreasing the number of bins.

C:\Users\David\PycharmProjects\laboratorio-mds\venv\lib\site-packages\sklearn\preprocessing\_discretization.py:313: UserWarning:

Bins whose width are too small (i.e., <= 1e-8) in feature 26 are removed. Consider decreasing the number of bins.


# Preprocessing followed by imputation, emitting pandas frames.
transformer_with_imputer = make_pipeline(transformer, imputer)
transformer_with_imputer.set_output(transform="pandas")

# Keyword arguments shared by the class-weighted, seeded estimators.
balanced = dict(random_state=0, class_weight="balanced")

dummy = DummyClassifier(strategy="stratified", random_state=0)
logreg = make_pipeline(
    transformer_with_imputer,
    LogisticRegression(n_jobs=-2, solver="sag", **balanced),
)
knn = make_pipeline(
    transformer_with_imputer,
    KNeighborsClassifier(n_jobs=-2, algorithm="kd_tree"),
)
dt = make_pipeline(
    transformer,  # no imputation needed
    DecisionTreeClassifier(**balanced),
)
svc = make_pipeline(
    transformer_with_imputer,
    SVC(probability=True, **balanced),
)
rf = make_pipeline(
    transformer_with_imputer,
    discretizer,
    RandomForestClassifier(n_jobs=-2, **balanced),
)

# These do not need the transformation pipeline, but the cleanup step is still
# used because it encodes the features in a useful way.
lgbclf = make_pipeline(cleanup, lgb.LGBMClassifier(random_state=0, n_jobs=-2))
xgbclf = make_pipeline(
    cleanup,
    xgb.XGBClassifier(
        random_state=0,
        enable_categorical=True,
        n_jobs=-2,
        tree_method="hist",
    ),
)

# extra
# Median-impute the numeric columns of X_train; other columns pass through.
simple_imputer = make_column_transformer(
    (SimpleImputer(strategy="median"), X_train.select_dtypes("number").columns),
    remainder="passthrough",
    verbose_feature_names_out=False,
)
simple_imputer.set_output(transform="pandas")
ebmclf = make_pipeline(
    cleanup,
    simple_imputer,
    ExplainableBoostingClassifier(random_state=0),
)

# Registry of every candidate, keyed by the name used in the reports.
models = {
    "Dummy": dummy,
    "LogisticRegression": logreg,
    "KNeighborsClassifier": knn,
    "DecisionTreeClassifier": dt,
    "SVC": svc,
    "RandomForestClassifier": rf,
    "LightGBMClassifier": lgbclf,
    "XGBClassifier": xgbclf,
    "ExplainableBoostingClassifier": ebmclf,
}

C:\Users\David\PycharmProjects\laboratorio-mds\venv\lib\site-packages\sklearn\preprocessing\_function_transformer.py:345: UserWarning:

With transform="pandas", `func` should return a DataFrame to follow the set_output API.


# Cross-validate every candidate on the training split, storing the raw
# per-fold results and printing the per-metric means. Warnings are silenced
# for the duration of the loop.
report = {}
metrics = ["accuracy", "precision", "recall", "f1"]
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    for name, model in models.items():
        underline = "-" * len(name)
        print(name, f"\n{underline}")
        report[name] = cv = cross_validate(
            model, X_train, y_train, scoring=metrics, cv=kf
        )
        print(pd.DataFrame(cv).mean(), "\n")

Dummy 
-----
fit_time          0.002401
score_time        0.013203
test_accuracy     0.595300
test_precision    0.304727
test_recall       0.315406
test_f1           0.309974
dtype: float64 

LogisticRegression 
------------------
fit_time          0.781788
score_time        0.035207
test_accuracy     0.503900
test_precision    0.339810
test_recall       0.710666
test_f1           0.452906
dtype: float64 

KNeighborsClassifier 
--------------------
fit_time          0.263283
score_time        0.392879
test_accuracy     0.736400
test_precision    0.555156
test_recall       0.430942
test_f1           0.485148
dtype: float64 

DecisionTreeClassifier 
----------------------
fit_time          0.801782
score_time        0.030407
test_accuracy     0.719400
test_precision    0.514518
test_recall       0.559000
test_f1           0.534165
dtype: float64 

SVC 
---
fit_time          32.356499
score_time         2.994010
test_accuracy      0.721400
test_precision     0.116783
test_recall        0.115972
test_f1            0.116376
dtype: float64 

RandomForestClassifier 
----------------------
fit_time          0.911007
score_time        0.079618
test_accuracy     0.782900
test_precision    0.671177
test_recall       0.485069
test_f1           0.562645
dtype: float64 

LightGBMClassifier 
------------------
fit_time          0.179041
score_time        0.019804
test_accuracy     0.788800
test_precision    0.665182
test_recall       0.537801
test_f1           0.594285
dtype: float64 

XGBClassifier 
-------------
fit_time          0.244056
score_time        0.021805
test_accuracy     0.774600
test_precision    0.637945
test_recall       0.504498
test_f1           0.563165
dtype: float64 

ExplainableBoostingClassifier 
-----------------------------
fit_time          3.938530
score_time        0.031607
test_accuracy     0.791700
test_precision    0.676979
test_recall       0.530861
test_f1           0.594524
dtype: float64


# Per-fold F1 scores, one column per model; persisted as the baseline and
# summarised by mean score, best first.
f1_by_model = {}
for model_name, fold_scores in report.items():
    f1_by_model[model_name] = fold_scores["test_f1"]
f1_vals = pd.DataFrame(f1_by_model)
with open("output/f1_scores_baseline.pkl", "wb") as baseline_file:
    pickle.dump(f1_vals, baseline_file)
f1_vals.mean().sort_values(ascending=False)

ExplainableBoostingClassifier    0.594524
LightGBMClassifier               0.594285
XGBClassifier                    0.563165
RandomForestClassifier           0.562645
DecisionTreeClassifier           0.534165
KNeighborsClassifier             0.485148
LogisticRegression               0.452906
Dummy                            0.309974
SVC                              0.116376
dtype: float64


# Box plot of the per-fold F1 scores, models ordered by ascending mean.
order_index = f1_vals.mean().sort_values().index
boxplot_options = dict(figsize=(20, 10), rot=45, showmeans=True, fontsize=20)
fig = f1_vals.loc[:, order_index].boxplot(**boxplot_options)
fig.set_title("F1 scores for each model")
fig.set_ylabel("F1 score");


# Hyper-parameter grid for the ExplainableBoostingClassifier.
ebm_grid = dict(
    max_leaves=[2, 3, 5],
    max_bins=[32, 256, 1024],
)
# Hyper-parameter grid for the LightGBM classifier.
lgb_grid = dict(
    extra_trees=[True, False],
    max_bin=[127, 255, 511],
    num_leaves=[31, 63, 127],
    boosting_type=["gbdt", "dart", "goss"],
)


def get_gridsearch(pipeline_or_model, params, *, cv=None, scoring="f1"):
    """Build an unfitted ``GridSearchCV`` for a bare model or a pipeline.

    Parameters
    ----------
    pipeline_or_model
        Estimator to tune. When it is a ``Pipeline``, the estimator being
        tuned is taken to be its last step.
    params : dict
        Mapping of hyperparameter name to the list of values to try, written
        without any step prefix.
    cv
        Cross-validation strategy forwarded to ``GridSearchCV``. ``None``
        (the default) uses the notebook-level splitter ``kf``.
    scoring
        Scoring forwarded to ``GridSearchCV``; defaults to ``"f1"``.

    Returns
    -------
    GridSearchCV
        The configured search; the caller is responsible for calling ``fit``.
    """
    if isinstance(pipeline_or_model, Pipeline):
        # Pipeline parameters are addressed as "<step name>__<parameter>",
        # so prefix every key with the name of the final step.
        model_name = pipeline_or_model.steps[-1][0]
        model_params = {
            f"{model_name}__{k}": v for k, v in params.items()
        }
    else:
        model_params = params
    return GridSearchCV(
        pipeline_or_model,
        model_params,
        # `kf` is only looked up when no explicit splitter is given.
        cv=kf if cv is None else cv,
        scoring=scoring,
    )


# Exhaustive grid search over `ebm_grid` for the EBM estimator.
ebm_best = get_gridsearch(ebmclf, ebm_grid)
with warnings.catch_warnings():
    # Suppress every warning emitted while the candidates are being fitted;
    # the previous filters are restored when the block exits.
    warnings.simplefilter("ignore")
    ebm_best.fit(X_train, y_train)


# Exhaustive grid search over `lgb_grid` for the LightGBM estimator.
lgb_best = get_gridsearch(lgbclf, lgb_grid)
with warnings.catch_warnings():
    # Suppress every warning emitted while the candidates are being fitted;
    # the previous filters are restored when the block exits.
    warnings.simplefilter("ignore")
    lgb_best.fit(X_train, y_train)


# Best cross-validated F1 reached by each grid search, one row per model.
scores = pd.DataFrame(
    [
        ("EBM", ebm_best.best_score_),
        ("LGBM", lgb_best.best_score_),
    ],
    columns=["Model", "F1"],
)
scores


# Optuna search space: integer ranges for the EBM step's hyperparameters.
# Keys carry the step-name prefix because the estimator is a pipeline.
ebm_opt_grid = {
    "explainableboostingclassifier__max_leaves": IntDistribution(low=2, high=5),
    "explainableboostingclassifier__max_bins": IntDistribution(low=32, high=1024),
}

# Same CV splitter and metric as the grid searches; the search stops after
# 15 trials or once the 600 s timeout is reached.
ebm_best_opt = OptunaSearchCV(
    estimator=ebmclf,
    param_distributions=ebm_opt_grid,
    cv=kf,
    scoring="f1",
    n_trials=15,
    timeout=600,
    random_state=0,
)
ebm_best_opt.fit(X_train, y_train)

C:\Users\David\AppData\Local\Temp\ipykernel_3560\3095283966.py:7: ExperimentalWarning:

OptunaSearchCV is experimental (supported from v0.17.0). The interface can change in the future.

[I 2023-07-22 19:29:56,318] A new study created in memory with name: no-name-89ea242c-e8a7-4903-94c3-54f8e92e220b
[I 2023-07-22 19:30:18,595] Trial 0 finished with value: 0.593441503766231 and parameters: {'explainableboostingclassifier__max_leaves': 3, 'explainableboostingclassifier__max_bins': 712}. Best is trial 0 with value: 0.593441503766231.
[I 2023-07-22 19:30:37,727] Trial 1 finished with value: 0.5920699817119327 and parameters: {'explainableboostingclassifier__max_leaves': 5, 'explainableboostingclassifier__max_bins': 226}. Best is trial 0 with value: 0.593441503766231.
[I 2023-07-22 19:30:58,229] Trial 2 finished with value: 0.5906126224969078 and parameters: {'explainableboostingclassifier__max_leaves': 4, 'explainableboostingclassifier__max_bins': 1005}. Best is trial 0 with value: 0.593441503766231.
[I 2023-07-22 19:31:23,568] Trial 3 finished with value: 0.5917231275105216 and parameters: {'explainableboostingclassifier__max_leaves': 2, 'explainableboostingclassifier__max_bins': 728}. Best is trial 0 with value: 0.593441503766231.
[I 2023-07-22 19:31:45,103] Trial 4 finished with value: 0.588672126407887 and parameters: {'explainableboostingclassifier__max_leaves': 5, 'explainableboostingclassifier__max_bins': 823}. Best is trial 0 with value: 0.593441503766231.
[I 2023-07-22 19:32:04,801] Trial 5 finished with value: 0.5912780602733626 and parameters: {'explainableboostingclassifier__max_leaves': 5, 'explainableboostingclassifier__max_bins': 545}. Best is trial 0 with value: 0.593441503766231.
[I 2023-07-22 19:32:32,062] Trial 6 finished with value: 0.5935375771688933 and parameters: {'explainableboostingclassifier__max_leaves': 2, 'explainableboostingclassifier__max_bins': 326}. Best is trial 6 with value: 0.5935375771688933.
[I 2023-07-22 19:32:57,357] Trial 7 finished with value: 0.5946450384909048 and parameters: {'explainableboostingclassifier__max_leaves': 2, 'explainableboostingclassifier__max_bins': 109}. Best is trial 7 with value: 0.5946450384909048.
[I 2023-07-22 19:33:17,958] Trial 8 finished with value: 0.5900746782293191 and parameters: {'explainableboostingclassifier__max_leaves': 5, 'explainableboostingclassifier__max_bins': 686}. Best is trial 7 with value: 0.5946450384909048.
[I 2023-07-22 19:33:40,546] Trial 9 finished with value: 0.5937622941186419 and parameters: {'explainableboostingclassifier__max_leaves': 3, 'explainableboostingclassifier__max_bins': 990}. Best is trial 7 with value: 0.5946450384909048.
[I 2023-07-22 19:34:06,822] Trial 10 finished with value: 0.5895373773527226 and parameters: {'explainableboostingclassifier__max_leaves': 2, 'explainableboostingclassifier__max_bins': 73}. Best is trial 7 with value: 0.5946450384909048.
[I 2023-07-22 19:34:27,199] Trial 11 finished with value: 0.5952078317800227 and parameters: {'explainableboostingclassifier__max_leaves': 3, 'explainableboostingclassifier__max_bins': 401}. Best is trial 11 with value: 0.5952078317800227.
[I 2023-07-22 19:34:48,835] Trial 12 finished with value: 0.5922397324137995 and parameters: {'explainableboostingclassifier__max_leaves': 3, 'explainableboostingclassifier__max_bins': 356}. Best is trial 11 with value: 0.5952078317800227.
[I 2023-07-22 19:35:07,770] Trial 13 finished with value: 0.5927716158273906 and parameters: {'explainableboostingclassifier__max_leaves': 4, 'explainableboostingclassifier__max_bins': 35}. Best is trial 11 with value: 0.5952078317800227.
[I 2023-07-22 19:35:33,932] Trial 14 finished with value: 0.5934436930707016 and parameters: {'explainableboostingclassifier__max_leaves': 2, 'explainableboostingclassifier__max_bins': 468}. Best is trial 11 with value: 0.5952078317800227.

OptunaSearchCV(cv=StratifiedKFold(n_splits=5, random_state=0, shuffle=True),
               estimator=Pipeline(steps=[('functiontransformer',
                                          FunctionTransformer(func=<function cleanup_transform at 0x0000018F0C217B50>)),
                                         ('columntransformer',
                                          ColumnTransformer(remainder='passthrough',
                                                            transformers=[('simpleimputer',
                                                                           SimpleImputer(strategy='median'),
                                                                           Index(['age', 'annual_inco...
                                                            verbose_feature_names_out=False)),
                                         ('explainableboostingclassifier',
                                          ExplainableBoostingClassifier(random_state=0))]),
               n_jobs=1, n_trials=15,
               param_distributions={'explainableboostingclassifier__max_bins': IntDistribution(high=1024, log=False, low=32, step=1),
                                    'explainableboostingclassifier__max_leaves': IntDistribution(high=5, log=False, low=2, step=1)},
               random_state=0, scoring='f1', timeout=600)

OptunaSearchCV(cv=StratifiedKFold(n_splits=5, random_state=0, shuffle=True),
               estimator=Pipeline(steps=[('functiontransformer',
                                          FunctionTransformer(func=<function cleanup_transform at 0x0000018F0C217B50>)),
                                         ('columntransformer',
                                          ColumnTransformer(remainder='passthrough',
                                                            transformers=[('simpleimputer',
                                                                           SimpleImputer(strategy='median'),
                                                                           Index(['age', 'annual_inco...
                                                            verbose_feature_names_out=False)),
                                         ('explainableboostingclassifier',
                                          ExplainableBoostingClassifier(random_state=0))]),
               n_jobs=1, n_trials=15,
               param_distributions={'explainableboostingclassifier__max_bins': IntDistribution(high=1024, log=False, low=32, step=1),
                                    'explainableboostingclassifier__max_leaves': IntDistribution(high=5, log=False, low=2, step=1)},
               random_state=0, scoring='f1', timeout=600)

Pipeline(steps=[('functiontransformer',
                 FunctionTransformer(func=<function cleanup_transform at 0x0000018F0C217B50>)),
                ('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('simpleimputer',
                                                  SimpleImputer(strategy='median'),
                                                  Index(['age', 'annual_income', 'monthly_inhand_salary', 'num_bank_accounts',
       'num_credit_card', 'interest_rate', 'n...
       'delay_from_due_date', 'num_of_delayed_payment', 'changed_credit_limit',
       'num_credit_inquiries', 'outstanding_debt', 'credit_utilization_ratio',
       'credit_history_age', 'total_emi_per_month', 'amount_invested_monthly',
       'monthly_balance', 'debt_x_interest_rate'],
      dtype='object'))],
                                   verbose_feature_names_out=False)),
                ('explainableboostingclassifier',
                 ExplainableBoostingClassifier(random_state=0))])

FunctionTransformer(func=<function cleanup_transform at 0x0000018F0C217B50>)

ColumnTransformer(remainder='passthrough',
                  transformers=[('simpleimputer',
                                 SimpleImputer(strategy='median'),
                                 Index(['age', 'annual_income', 'monthly_inhand_salary', 'num_bank_accounts',
       'num_credit_card', 'interest_rate', 'num_of_loan',
       'delay_from_due_date', 'num_of_delayed_payment', 'changed_credit_limit',
       'num_credit_inquiries', 'outstanding_debt', 'credit_utilization_ratio',
       'credit_history_age', 'total_emi_per_month', 'amount_invested_monthly',
       'monthly_balance', 'debt_x_interest_rate'],
      dtype='object'))],
                  verbose_feature_names_out=False)

Index(['age', 'annual_income', 'monthly_inhand_salary', 'num_bank_accounts',
       'num_credit_card', 'interest_rate', 'num_of_loan',
       'delay_from_due_date', 'num_of_delayed_payment', 'changed_credit_limit',
       'num_credit_inquiries', 'outstanding_debt', 'credit_utilization_ratio',
       'credit_history_age', 'total_emi_per_month', 'amount_invested_monthly',
       'monthly_balance', 'debt_x_interest_rate'],
      dtype='object')

SimpleImputer(strategy='median')

passthrough


# Persist the fitted Optuna search so it can be reloaded without re-running
# the trials.
# NOTE(review): pickle stores plain functions by reference, and the pipeline's
# FunctionTransformer wraps `cleanup_transform`; that function presumably has
# to be defined/importable wherever this file is unpickled — confirm.
with open("output/ebm_opt.pkl", "wb") as f:
    pickle.dump(ebm_best_opt, f)


# Evaluate the Optuna-tuned EBM on the hold-out split.
ebm_best_pred = ebm_best_opt.predict(X_test)
ebm_test_report = classification_report(y_true=y_test, y_pred=ebm_best_pred)
print(ebm_test_report)

              precision    recall  f1-score   support

           0       0.83      0.90      0.86      1780
           1       0.69      0.53      0.60       720

    accuracy                           0.80      2500
   macro avg       0.76      0.72      0.73      2500
weighted avg       0.79      0.80      0.79      2500


# Evaluate the grid-searched LightGBM model on the hold-out split.
lgb_pred = lgb_best.predict(X_test)
lgb_test_report = classification_report(y_true=y_test, y_pred=lgb_pred)
print(lgb_test_report)

              precision    recall  f1-score   support

           0       0.83      0.90      0.86      1780
           1       0.68      0.53      0.60       720

    accuracy                           0.79      2500
   macro avg       0.75      0.72      0.73      2500
weighted avg       0.78      0.79      0.79      2500


# Split the best pipeline found by Optuna into its preprocessing steps and
# the final classifier, looking each step up by name.
ebm = ebm_best_opt.best_estimator_
ebm_steps = ebm.named_steps

# Preprocessing only: cleanup function followed by the column transformer.
ebm_processor = make_pipeline(
    ebm_steps["functiontransformer"],
    ebm_steps["columntransformer"],
)
# The tuned classifier on its own, for the interpretability calls below.
ebm_optimized = ebm_steps["explainableboostingclassifier"]


# Build the global explanation of the tuned EBM and render it with
# interpret's `show`.
explanation = ebm_optimized.explain_global(name="EBM")
show(explanation)


# Display the preprocessing-only pipeline (cleanup function + column transformer).
ebm_processor

ColumnTransformer(remainder='passthrough',
                  transformers=[('simpleimputer',
                                 SimpleImputer(strategy='median'),
                                 Index(['age', 'annual_income', 'monthly_inhand_salary', 'num_bank_accounts',
       'num_credit_card', 'interest_rate', 'num_of_loan',
       'delay_from_due_date', 'num_of_delayed_payment', 'changed_credit_limit',
       'num_credit_inquiries', 'outstanding_debt', 'credit_utilization_ratio',
       'credit_history_age', 'total_emi_per_month', 'amount_invested_monthly',
       'monthly_balance', 'debt_x_interest_rate'],
      dtype='object'))],
                  verbose_feature_names_out=False)

ColumnTransformer(remainder='passthrough',
                  transformers=[('simpleimputer',
                                 SimpleImputer(strategy='median'),
                                 Index(['age', 'annual_income', 'monthly_inhand_salary', 'num_bank_accounts',
       'num_credit_card', 'interest_rate', 'num_of_loan',
       'delay_from_due_date', 'num_of_delayed_payment', 'changed_credit_limit',
       'num_credit_inquiries', 'outstanding_debt', 'credit_utilization_ratio',
       'credit_history_age', 'total_emi_per_month', 'amount_invested_monthly',
       'monthly_balance', 'debt_x_interest_rate'],
      dtype='object'))],
                  verbose_feature_names_out=False)

Index(['age', 'annual_income', 'monthly_inhand_salary', 'num_bank_accounts',
       'num_credit_card', 'interest_rate', 'num_of_loan',
       'delay_from_due_date', 'num_of_delayed_payment', 'changed_credit_limit',
       'num_credit_inquiries', 'outstanding_debt', 'credit_utilization_ratio',
       'credit_history_age', 'total_emi_per_month', 'amount_invested_monthly',
       'monthly_balance', 'debt_x_interest_rate'],
      dtype='object')

SimpleImputer(strategy='median')

['occupation', 'payment_of_min_amount', 'payment_behaviour']

passthrough


# Fit the preprocessing steps on the training split only, then apply them to
# the hold-out split, so test rows never influence the fitted transformers.
ebm_processor.fit(X_train)
X_test_imputed = ebm_processor.transform(X_test)

# Draw 10 distinct preprocessed test rows (no replacement), stratified by the
# test labels and with a fixed seed, to explain individually.
sample = resample(
    X_test_imputed, n_samples=10, random_state=0, stratify=y_test, replace=False
)
show(ebm_optimized.explain_local(sample, name="EBM"))


# Inspect the preprocessed hold-out features. This display cell comes after
# the assignment so the notebook also runs cleanly from top to bottom.
X_test_imputed


# Importance of every EBM term, labelled with the term's name.
term_scores = ebm_optimized.term_importances()
importances = pd.Series(data=term_scores, index=ebm_optimized.term_names_)

# Least important first (displayed as the cell output).
importances.sort_values()

payment_behaviour_na                            0.014403
num_of_loan & changed_credit_limit              0.022509
changed_credit_limit & debt_x_interest_rate     0.025894
delay_from_due_date & outstanding_debt          0.026092
occupation                                      0.026904
credit_utilization_ratio                        0.027281
age                                             0.028067
interest_rate & delay_from_due_date             0.028104
delay_from_due_date & changed_credit_limit      0.028465
monthly_inhand_salary                           0.028906
annual_income                                   0.029548
num_bank_accounts & delay_from_due_date         0.038550
delay_from_due_date & num_of_delayed_payment    0.039826
payment_of_min_amount                           0.040442
changed_credit_limit & outstanding_debt         0.040586
payment_behaviour_value                         0.040983
num_credit_card & outstanding_debt              0.048976
num_bank_accounts                               0.061203
total_emi_per_month                             0.070795
credit_history_age & debt_x_interest_rate       0.072409
num_of_loan                                     0.075231
monthly_balance                                 0.083501
payment_behaviour_low_spent                     0.087697
num_credit_inquiries                            0.089706
credit_history_age                              0.095484
num_of_delayed_payment                          0.103030
amount_invested_monthly                         0.103435
payment_behaviour                               0.123510
changed_credit_limit                            0.141983
num_credit_card                                 0.169403
interest_rate                                   0.175796
delay_from_due_date                             0.200758
outstanding_debt                                0.244936
debt_x_interest_rate                            0.295801
dtype: float64


# `visualize` takes the positional index of a term, so look the two terms of
# interest up by name first.
term_names = ebm_optimized.term_names_
age_index = term_names.index("age")
occupation_index = term_names.index("occupation")

# Plot the global explanation for the "age" term.
global_explanation = ebm_optimized.explain_global(name="EBM")
global_explanation.visualize(age_index)


# Plot the global explanation for the term at `occupation_index`.
ebm_optimized.explain_global(name="EBM").visualize(occupation_index)

	age	annual_income	monthly_inhand_salary	num_bank_accounts	num_credit_card	interest_rate	num_of_loan	delay_from_due_date	num_of_delayed_payment	changed_credit_limit	num_credit_inquiries	outstanding_debt	credit_utilization_ratio	credit_history_age	total_emi_per_month	amount_invested_monthly	monthly_balance	credit_score
count	12500.000000	1.250000e+04	10584.000000	12500.000000	12500.000000	12500.000000	12500.000000	12500.000000	11660.00000	12246.000000	12243.000000	12500.000000	12500.000000	11380.000000	12500.000000	11914.000000	1.214500e+04	12500.000000
mean	105.771840	1.616206e+05	4186.634963	16.939920	23.172720	73.213360	3.099440	21.060880	32.93542	10.398582	26.292330	1426.220376	32.349265	18.230404	1488.394291	638.798715	-2.744614e+22	0.288160
std	664.502705	1.297842e+06	3173.690362	114.350815	132.005866	468.682227	65.105277	14.863091	237.43768	6.799253	181.821031	1155.169458	5.156815	8.302078	8561.449910	2049.195193	3.024684e+24	0.452924
min	-500.000000	7.005930e+03	303.645417	-1.000000	0.000000	1.000000	-100.000000	-5.000000	-3.00000	-6.490000	0.000000	0.230000	20.100770	0.000000	0.000000	0.000000	-3.333333e+26	0.000000
25%	25.000000	1.945333e+04	1622.408646	3.000000	4.000000	8.000000	1.000000	10.000000	9.00000	5.370000	4.000000	566.072500	28.066517	12.000000	31.496968	73.736810	2.701501e+02	0.000000
50%	33.000000	3.757238e+04	3087.595000	6.000000	5.000000	14.000000	3.000000	18.000000	14.00000	9.410000	6.000000	1166.155000	32.418953	18.000000	72.887628	134.093193	3.393885e+02	0.000000
75%	42.000000	7.269021e+04	5967.937500	7.000000	7.000000	20.000000	5.000000	28.000000	18.00000	14.940000	10.000000	1945.962500	36.623650	25.000000	169.634826	261.664256	4.714245e+02	1.000000
max	8678.000000	2.383470e+07	15204.633333	1756.000000	1499.000000	5789.000000	1495.000000	67.000000	4293.00000	36.970000	2554.000000	4998.070000	48.199824	33.000000	81971.000000	10000.000000	1.463792e+03	1.000000

	auc	most_important_feature
Fold 0	0.482397	changed_credit_limit
Fold 1	0.489064	changed_credit_limit
Fold 2	0.515999	monthly_inhand_salary
Fold 3	0.502982	monthly_inhand_salary
Fold 4	0.518786	monthly_inhand_salary

	robustscaler__monthly_inhand_salary	robustscaler__num_of_delayed_payment	robustscaler__changed_credit_limit	robustscaler__num_credit_inquiries	robustscaler__credit_history_age	robustscaler__amount_invested_monthly	robustscaler__monthly_balance	...	robustscaler__interest_rate	robustscaler__num_of_loan	robustscaler__delay_from_due_date	robustscaler__outstanding_debt	robustscaler__credit_utilization_ratio	robustscaler__total_emi_per_month	remainder__payment_of_min_amount	remainder__payment_behaviour_low_spent	remainder__payment_behaviour_value
0	-0.290586	-0.888889	0.194357	-0.333333	0.000000	-0.581650	0.093085	...	-0.916667	0.25	-0.833333	-0.258118	-0.991589	-0.168764	0.0	0	0
1	-0.011416	-1.111111	-0.416928	-0.666667	0.692308	0.451297	0.082920	...	-0.666667	-0.50	-0.833333	-0.406645	0.060172	-0.391431	0.0	1	-1
2	2.094020	-0.888889	-0.241379	-0.500000	0.000000	52.498488	2.762925	...	-0.500000	0.00	-0.555556	0.099178	0.696004	1.260369	0.0	0	-1
3	-0.109332	-0.555556	-0.775340	-0.333333	-0.076923	-0.045102	0.197878	...	-0.833333	-25.75	-0.777778	-0.386766	-0.594409	-0.408810	0.0	0	-1
4	-0.053914	0.111111	-0.713689	-0.333333	1.000000	0.251361	0.122278	...	-0.750000	-25.75	-0.944444	-0.161096	-0.766148	-0.527644	1.0	0	-1

	age	annual_income	monthly_inhand_salary	num_bank_accounts	num_credit_card	interest_rate	num_of_loan	delay_from_due_date	num_of_delayed_payment	changed_credit_limit	...	outstanding_debt	credit_utilization_ratio	credit_history_age	total_emi_per_month	amount_invested_monthly	monthly_balance	debt_x_interest_rate	occupation	payment_of_min_amount	payment_behaviour
118	34.0	60938.130	5163.177500	10.0	8.0	31.0	8.0	26.0	21.0	17.49	...	3947.24	36.591278	5.0	378.304673	140.425626	269.716274	122364.44	Doctor	Yes	Low_spent_Large_value_payments
8211	6991.0	14792.570	1181.714167	10.0	7.0	20.0	8.0	54.0	21.0	18.80	...	4955.69	29.747241	9.0	54.287798	64.352298	259.531321	99113.80	Writer	Yes	High_spent_Small_value_payments
11146	30.0	40113.080	3086.756667	3.0	7.0	1.0	4.0	2.0	0.0	5.88	...	25.78	32.164295	20.0	76.335329	211.712167	280.628171	25.78	Lawyer	No	High_spent_Small_value_payments
9525	4212.0	68561.310	5594.442500	6.0	5.0	12.0	2.0	30.0	15.0	8.72	...	93.67	39.179590	30.0	58.021274	144.225256	607.197720	1124.04	Writer	Yes	!@9#%8
8240	14.0	9654.115	620.509583	6.0	7.0	18.0	3.0	60.0	17.0	14.30	...	2081.23	34.767476	9.0	20.327734	30.755410	300.967815	37462.14	Doctor	Yes	Low_spent_Small_value_payments
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
2301	43.0	8719.000	653.583333	9.0	5.0	19.0	6.0	40.0	10.0	15.58	...	1643.18	35.939044	8.0	34.671917	53.954000	266.732417	31220.42	Teacher	Yes	Low_spent_Small_value_payments
4101	25.0	20247.610	1733.300833	8.0	3.0	15.0	4.0	8.0	11.0	18.87	...	1300.16	33.552778	26.0	46.610685	161.712836	255.006562	19502.40	Entrepreneur	NM	!@9#%8
6030	22.0	43925.880	3889.490000	1.0	3.0	7.0	0.0	-1.0	6.0	8.84	...	107.51	25.155414	19.0	0.000000	10000.000000	256.996591	752.57	Scientist	No	Low_spent_Small_value_payments
3889	28.0	19841.000	1756.416667	6.0	10.0	17.0	5.0	51.0	22.0	17.88	...	2538.06	35.922630	5.0	79.742135	74.879969	291.019563	43147.02	Writer	Yes	Low_spent_Large_value_payments
9351	22.0	120368.320	10210.693333	1.0	7.0	12.0	3.0	1.0	9.0	8.87	...	785.01	39.002121	21.0	288.949692	338.302379	643.817262	9420.12	Mechanic	No	High_spent_Medium_value_payments

Proyecto: Riesgo en el Banco Giturra¶

Cuerpo Docente:¶

Equipo: SUPER IMPORTANTE¶

Reglas¶

Presentación del Problema¶

Instalación de Librerías y Carga de Datos.¶

1. Introducción [0.5 puntos]¶

2. Carga de datos y Análisis Exploratorio de Datos [Sin puntaje]¶

Explicaciones de cada variable¶

3. Preparación de Datos [0.5 puntos]¶

3.1 Preprocesamiento con `ColumnTransformer`¶

3.2 Holdout¶

3.3 Datos nulos.¶

3.4 Feature Engineering [Bonus - 0.5 puntos]¶

4. Baseline [1.5 puntos]¶

5. Optimización del Modelo [1.5 puntos]¶

6. Interpretabilidad [1.0 puntos]¶

7. Concluir [1.0 puntos]¶

	robustscaler__monthly_inhand_salary	robustscaler__num_of_delayed_payment	robustscaler__changed_credit_limit	robustscaler__num_credit_inquiries	robustscaler__credit_history_age	robustscaler__amount_invested_monthly	robustscaler__monthly_balance	missingindicator_robustscaler__num_of_delayed_payment	...	onehotencoder__payment_behaviour_High_spent_Medium_value_payments	onehotencoder__payment_behaviour_Low_spent_Medium_value_payments	onehotencoder__payment_behaviour_Low_spent_Small_value_payments	remainder__payment_behaviour_low_spent	remainder__payment_behaviour_na
8746	3.0	27.0	240.0	6.0	1.0	0.0	36.0	0.0	...	1	0	0	0	0
9550	44.0	17.0	210.0	12.0	9.0	115.0	51.0	1.0	...	0	0	1	1	0
6961	214.0	5.0	47.0	6.0	31.0	228.0	234.0	0.0	...	0	0	0	0	1
10085	45.0	30.0	122.0	12.0	15.0	82.0	70.0	0.0	...	0	1	0	1	0
2864	141.0	17.0	24.0	10.0	18.0	66.0	190.0	1.0	...	1	0	0	0	0

	Model	F1
0	EBM	0.594524
1	LGBM	0.595015

Proyecto: Riesgo en el Banco Giturra¶

Cuerpo Docente:¶

Equipo: SUPER IMPORTANTE¶

Reglas¶

Presentación del Problema¶

Instalación de Librerías y Carga de Datos.¶

1. Introducción [0.5 puntos]¶

2. Carga de datos y Análisis Exploratorio de Datos [Sin puntaje]¶

Explicaciones de cada variable¶

3. Preparación de Datos [0.5 puntos]¶

3.1 Preprocesamiento con ColumnTransformer¶

3.2 Holdout¶

3.3 Datos nulos.¶

3.4 Feature Engineering [Bonus - 0.5 puntos]¶

4. Baseline [1.5 puntos]¶

5. Optimización del Modelo [1.5 puntos]¶

6. Interpretabilidad [1.0 puntos]¶

7. Concluir [1.0 puntos]¶

3.1 Preprocesamiento con `ColumnTransformer`¶