import pandas as pd
from pandas.api.types import is_numeric_dtype
import kagglehub
from kagglehub import KaggleDatasetAdapter
from sklearn.model_selection import (
train_test_split,
StratifiedKFold,
cross_val_score,
GridSearchCV,
)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import (
accuracy_score,
classification_report,
precision_score,
recall_score,
confusion_matrix,
make_scorer,
ConfusionMatrixDisplay,
roc_curve,
roc_auc_score,
precision_recall_curve,
average_precision_score,
)
import matplotlib.pyplot as plt
import numpy as np
Part 0: Read Data¶
!pip -q install "kagglehub[pandas-datasets]"
import kagglehub
from kagglehub import KaggleDatasetAdapter
df = kagglehub.dataset_load(
KaggleDatasetAdapter.PANDAS,
"avineshprabhakaran/loan-eligibility-prediction",
"Loan Eligibility Prediction.csv"
)
df.head()
Using Colab cache for faster access to the 'loan-eligibility-prediction' dataset.
| | Customer_ID | Gender | Married | Dependents | Education | Self_Employed | Applicant_Income | Coapplicant_Income | Loan_Amount | Loan_Amount_Term | Credit_History | Property_Area | Loan_Status |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 569 | Female | No | 0 | Graduate | No | 2378 | 0.0 | 9 | 360 | 1 | Urban | N |
| 1 | 15 | Male | Yes | 2 | Graduate | No | 1299 | 1086.0 | 17 | 120 | 1 | Urban | Y |
| 2 | 95 | Male | No | 0 | Not Graduate | No | 3620 | 0.0 | 25 | 120 | 1 | Semiurban | Y |
| 3 | 134 | Male | Yes | 0 | Graduate | Yes | 3459 | 0.0 | 25 | 120 | 1 | Semiurban | Y |
| 4 | 556 | Male | Yes | 1 | Graduate | No | 5468 | 1032.0 | 26 | 360 | 1 | Semiurban | Y |
Part 1: Data Preparation¶
Drop Customer_ID since it's just an identifier and not useful for prediction
if "Customer_ID" in df.columns:
df = df.drop(columns=["Customer_ID"])
df.head()
| | Gender | Married | Dependents | Education | Self_Employed | Applicant_Income | Coapplicant_Income | Loan_Amount | Loan_Amount_Term | Credit_History | Property_Area | Loan_Status |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Female | No | 0 | Graduate | No | 2378 | 0.0 | 9 | 360 | 1 | Urban | N |
| 1 | Male | Yes | 2 | Graduate | No | 1299 | 1086.0 | 17 | 120 | 1 | Urban | Y |
| 2 | Male | No | 0 | Not Graduate | No | 3620 | 0.0 | 25 | 120 | 1 | Semiurban | Y |
| 3 | Male | Yes | 0 | Graduate | Yes | 3459 | 0.0 | 25 | 120 | 1 | Semiurban | Y |
| 4 | Male | Yes | 1 | Graduate | No | 5468 | 1032.0 | 26 | 360 | 1 | Semiurban | Y |
Check missing values
df.isna().sum()
| Column | Missing count |
|---|---|
| Gender | 0 |
| Married | 0 |
| Dependents | 0 |
| Education | 0 |
| Self_Employed | 0 |
| Applicant_Income | 0 |
| Coapplicant_Income | 0 |
| Loan_Amount | 0 |
| Loan_Amount_Term | 0 |
| Credit_History | 0 |
| Property_Area | 0 |
| Loan_Status | 0 |
Inspect the value counts of the binary/ordinal columns before encoding them as integers
int_like_cols = [
"Gender",
"Married",
"Education",
"Self_Employed",
"Property_Area",
"Loan_Status",
"Credit_History",
]
for col in int_like_cols:
if col in df.columns:
print(f"\nColumn: {col}")
print(df[col].value_counts(dropna=False).head(10))
Column: Gender
Gender
Male      499
Female    115
Name: count, dtype: int64

Column: Married
Married
Yes    399
No     215
Name: count, dtype: int64

Column: Education
Education
Graduate        480
Not Graduate    134
Name: count, dtype: int64

Column: Self_Employed
Self_Employed
No     523
Yes     91
Name: count, dtype: int64

Column: Property_Area
Property_Area
Semiurban    233
Urban        202
Rural        179
Name: count, dtype: int64

Column: Loan_Status
Loan_Status
Y    422
N    192
Name: count, dtype: int64

Column: Credit_History
Credit_History
1    522
0     92
Name: count, dtype: int64
# Mapping dictionaries for encoding the categorical columns as integers
category_maps = {
    "Gender": {"Male": 0, "Female": 1},
    "Married": {"No": 0, "Yes": 1},
    "Education": {"Not Graduate": 0, "Graduate": 1},
    "Self_Employed": {"No": 0, "Yes": 1},
    "Property_Area": {"Rural": 0, "Semiurban": 1, "Urban": 2},
    "Loan_Status": {"N": 0, "Y": 1},
}
# Apply each mapping only where the column exists and is still string-typed;
# Series.map avoids the pandas FutureWarning about silent downcasting in replace
for col, mapping in category_maps.items():
    if col in df.columns and df[col].dtype == object:
        df[col] = df[col].map(mapping)
# safely cast to integer type
int_like_cols = [
"Gender",
"Married",
"Education",
"Self_Employed",
"Property_Area",
"Loan_Status",
"Credit_History",
]
for col in int_like_cols:
if col in df.columns:
df[col] = pd.to_numeric(df[col], errors="coerce").astype("Int64")
df.dtypes
| Column | Dtype |
|---|---|
| Gender | Int64 |
| Married | Int64 |
| Dependents | int64 |
| Education | Int64 |
| Self_Employed | Int64 |
| Applicant_Income | int64 |
| Coapplicant_Income | float64 |
| Loan_Amount | int64 |
| Loan_Amount_Term | int64 |
| Credit_History | Int64 |
| Property_Area | Int64 |
| Loan_Status | Int64 |
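As a quick sanity check (not part of the original run; a sketch assuming int_like_cols is still in scope), confirm the coercion introduced no missing values, since errors="coerce" would have turned any unmapped value into <NA>:
# Sanity check (sketch): coercion turns unmapped values into <NA>
assert df[int_like_cols].isna().sum().sum() == 0, "unexpected <NA> after encoding"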
Add engineered features: total_income, loan_to_income, emi (loan amount spread evenly over the term, ignoring interest), and the loan term in years
if "Applicant_Income" in df.columns and "Coapplicant_Income" in df.columns:
df["total_income"] = df["Applicant_Income"] + df["Coapplicant_Income"]
if "Loan_Amount" in df.columns and "total_income" in df.columns:
df["loan_to_income"] = df["Loan_Amount"] / df["total_income"]
if "Loan_Amount" in df.columns and "Loan_Amount_Term" in df.columns:
df["emi"] = df["Loan_Amount"] / df["Loan_Amount_Term"]
if "Loan_Amount_Term" in df.columns:
df["Loan_Amount_Term_Years"] = df["Loan_Amount_Term"] / 12.0
df[["Loan_Amount", "total_income", "loan_to_income", "emi", "Loan_Amount_Term_Years"]].head()
| | Loan_Amount | total_income | loan_to_income | emi | Loan_Amount_Term_Years |
|---|---|---|---|---|---|
| 0 | 9 | 2378.0 | 0.003785 | 0.025000 | 30.0 |
| 1 | 17 | 2385.0 | 0.007128 | 0.141667 | 10.0 |
| 2 | 25 | 3620.0 | 0.006906 | 0.208333 | 10.0 |
| 3 | 25 | 3459.0 | 0.007228 | 0.208333 | 10.0 |
| 4 | 26 | 6500.0 | 0.004000 | 0.072222 | 30.0 |
df.head()
| | Gender | Married | Dependents | Education | Self_Employed | Applicant_Income | Coapplicant_Income | Loan_Amount | Loan_Amount_Term | Credit_History | Property_Area | Loan_Status | total_income | loan_to_income | emi | Loan_Amount_Term_Years |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 0 | 1 | 0 | 2378 | 0.0 | 9 | 360 | 1 | 2 | 0 | 2378.0 | 0.003785 | 0.025000 | 30.0 |
| 1 | 0 | 1 | 2 | 1 | 0 | 1299 | 1086.0 | 17 | 120 | 1 | 2 | 1 | 2385.0 | 0.007128 | 0.141667 | 10.0 |
| 2 | 0 | 0 | 0 | 0 | 0 | 3620 | 0.0 | 25 | 120 | 1 | 1 | 1 | 3620.0 | 0.006906 | 0.208333 | 10.0 |
| 3 | 0 | 1 | 0 | 1 | 1 | 3459 | 0.0 | 25 | 120 | 1 | 1 | 1 | 3459.0 | 0.007228 | 0.208333 | 10.0 |
| 4 | 0 | 1 | 1 | 1 | 0 | 5468 | 1032.0 | 26 | 360 | 1 | 1 | 1 | 6500.0 | 0.004000 | 0.072222 | 30.0 |
Setup
# Features and target
X = df.drop(columns=["Loan_Status"])
y = df["Loan_Status"]
# Train-test split (stratified to keep class balance)
X_train, X_test, y_train, y_test = train_test_split(
X, y,
test_size=0.2,
stratify=y,
random_state=42
)
# Stratified 5-fold cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
print("Train shape:", X_train.shape, "Test shape:", X_test.shape)
y.value_counts(normalize=True)
Train shape: (491, 15) Test shape: (123, 15)
| Loan_Status | proportion |
|---|---|
| 1 | 0.687296 |
| 0 | 0.312704 |
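With roughly 68.7% of loans approved, always predicting "approved" already yields about 0.687 accuracy; a small sketch to make that baseline explicit:
# Majority-class baseline: approve every loan
print("Majority-class baseline accuracy: {:.4f}".format(y.value_counts(normalize=True).max()))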
Part 2: Model¶
For the banker's use case, I focus on maximizing precision for approved loans (class 1): the costliest error is approving a loan that should have been rejected.
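Concretely, precision on class 1 is TP / (TP + FP): the share of model-approved loans that were truly eligible. A toy illustration with hypothetical labels (not model output):
# Hypothetical example: 10 approvals, 8 truly eligible -> precision = 0.8
y_true_toy = [1, 1, 1, 1, 1, 1, 1, 1, 0, 0]
y_pred_toy = [1] * 10
print(precision_score(y_true_toy, y_pred_toy, pos_label=1))  # 0.8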
SVM
svm_baseline = Pipeline(steps=[
("scaler", StandardScaler()),
("model", SVC(kernel="rbf", C=1.0, gamma="scale"))
])
cv_scores = cross_val_score(
svm_baseline, X_train, y_train,
cv=cv,
scoring="accuracy"
)
print("Baseline SVM - CV accuracies:", cv_scores)
print("Baseline SVM - Mean CV accuracy: {:.4f} ± {:.4f}".format(cv_scores.mean(), cv_scores.std()))
Baseline SVM - CV accuracies: [0.81818182 0.76530612 0.80612245 0.80612245 0.79591837]
Baseline SVM - Mean CV accuracy: 0.7983 ± 0.0180
Fit baseline SVM and evaluate with banker-focused metrics
# Fit baseline model
svm_baseline.fit(X_train, y_train)
# Predictions on test data
y_pred_baseline = svm_baseline.predict(X_test)
# Standard metrics
test_acc_baseline = accuracy_score(y_test, y_pred_baseline)
print("Baseline SVM - Test accuracy: {:.4f}".format(test_acc_baseline))
print("\nBaseline SVM - Classification report:\n",
classification_report(y_test, y_pred_baseline))
# Banker-centric metrics for class 1 (approved)
precision_approved = precision_score(y_test, y_pred_baseline, pos_label=1)
recall_approved = recall_score(y_test, y_pred_baseline, pos_label=1)
print("Baseline SVM - Precision (class 1 - approved): {:.4f}".format(precision_approved))
print("Baseline SVM - Recall (class 1 - approved): {:.4f}".format(recall_approved))
Baseline SVM - Test accuracy: 0.8049
Baseline SVM - Classification report:
precision recall f1-score support
0.0 0.79 0.50 0.61 38
1.0 0.81 0.94 0.87 85
accuracy 0.80 123
macro avg 0.80 0.72 0.74 123
weighted avg 0.80 0.80 0.79 123
Baseline SVM - Precision (class 1 - approved): 0.8081
Baseline SVM - Recall (class 1 - approved): 0.9412
Hyperparameter tuning for banker using precision on class 1
# Custom scorer: precision for class 1 (approved)
precision_scorer = make_scorer(precision_score, pos_label=1)
svm_pipeline = Pipeline(steps=[
("scaler", StandardScaler()),
("model", SVC(kernel="rbf")) # no class_weight for stricter approvals
])
param_grid = {
"model__C": [0.1, 1, 10, 50, 100],
"model__gamma": [0.01, 0.1, "scale", "auto"]
}
grid_search_precision = GridSearchCV(
svm_pipeline,
param_grid,
cv=cv,
scoring=precision_scorer, # <-- banker objective
n_jobs=-1,
verbose=1
)
grid_search_precision.fit(X_train, y_train)
print("Best params for BANKER (precision, class 1):", grid_search_precision.best_params_)
print("Best CV precision (class 1): {:.4f}".format(grid_search_precision.best_score_))
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best params for BANKER (precision, class 1): {'model__C': 50, 'model__gamma': 0.01}
Best CV precision (class 1): 0.7931
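To gauge how close the runner-up settings were, the grid's full CV table can be inspected; a sketch using GridSearchCV's cv_results_:
# Sketch: top parameter settings ranked by mean CV precision on class 1
cv_results = pd.DataFrame(grid_search_precision.cv_results_)
cols = ["param_model__C", "param_model__gamma", "mean_test_score", "std_test_score"]
print(cv_results[cols].sort_values("mean_test_score", ascending=False).head())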
Evaluate the banker-tuned SVM on the held-out test set
svm_banker = grid_search_precision.best_estimator_
# GridSearchCV (refit=True by default) has already refit the best estimator
# on the full training set, so this explicit fit is a harmless repeat
svm_banker.fit(X_train, y_train)
# Predictions on test data
y_pred_banker = svm_banker.predict(X_test)
# Overall accuracy
test_acc_banker = accuracy_score(y_test, y_pred_banker)
print("Banker SVM - Test accuracy: {:.4f}".format(test_acc_banker))
# Full classification report
print("\nBanker SVM - Classification report:\n",
classification_report(y_test, y_pred_banker))
# Banker-focused metrics: precision & recall on approved loans (class 1)
precision_approved_banker = precision_score(y_test, y_pred_banker, pos_label=1)
recall_approved_banker = recall_score(y_test, y_pred_banker, pos_label=1)
print("Banker SVM - Precision (class 1 - approved): {:.4f}".format(precision_approved_banker))
print("Banker SVM - Recall (class 1 - approved): {:.4f}".format(recall_approved_banker))
Banker SVM - Test accuracy: 0.8130
Banker SVM - Classification report:
precision recall f1-score support
0.0 0.80 0.53 0.63 38
1.0 0.82 0.94 0.87 85
accuracy 0.81 123
macro avg 0.81 0.73 0.75 123
weighted avg 0.81 0.81 0.80 123
Banker SVM - Precision (class 1 - approved): 0.8163
Banker SVM - Recall (class 1 - approved): 0.9412
Record metrics
model_results = []
model_results.append({
"Model": "Baseline SVM (RBF)",
"CV Score": cv_scores.mean(), # accuracy CV from svm_baseline
"CV Scoring": "accuracy",
"Test Accuracy": test_acc_baseline,
"Test Bank Precision": precision_approved,
"Test Bank Recall": recall_approved,
})
# Banker SVM metrics (CV used bank_precision via GridSearchCV)
model_results.append({
"Model": "Banker SVM (RBF)",
"CV Score": grid_search_precision.best_score_, # bank precision (class 1)
"CV Scoring": "bank_precision (class 1)",
"Test Accuracy": test_acc_banker,
"Test Bank Precision": precision_approved_banker,
"Test Bank Recall": recall_approved_banker,
})
Logistic Regression
Define bank-side metric
# Bank-side scorer: high precision on approved loans (class 1)
bank_precision_scorer = make_scorer(
precision_score,
pos_label=1
)
Define logistic regression pipeline (bank-biased)
log_reg_clf = Pipeline(steps=[
("scaler", StandardScaler()),
("model", LogisticRegression(
max_iter=1000,
class_weight={0: 1.5, 1: 1.0}, # bank: be extra careful about bad loans
solver='lbfgs'
))
])
Cross-validation focusing on bank precision
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
log_reg_cv_scores = cross_val_score(
log_reg_clf,
X_train,
y_train,
cv=cv,
scoring=bank_precision_scorer
)
print("Logistic Regression - Bank precision (CV scores):", log_reg_cv_scores)
print("Mean bank precision (CV):", log_reg_cv_scores.mean())
print("Std:", log_reg_cv_scores.std())
Logistic Regression - Bank precision (CV scores): [0.79518072 0.78571429 0.78481013 0.79518072 0.77108434]
Mean bank precision (CV): 0.7863940390858188
Std: 0.008851773102915926
Fit on train, evaluate on test (bank-focused view)
# Fit on training data
log_reg_clf.fit(X_train, y_train)
# Predict with default 0.5 threshold
y_pred = log_reg_clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=['N (Not Approved)', 'Y (Approved)']))
# Bank-side metrics explicitly
bank_precision = precision_score(y_test, y_pred, pos_label=1)
bank_recall = recall_score(y_test, y_pred, pos_label=1)
print(f"\nBank-side precision for approved loans (Y): {bank_precision:.3f}")
print(f"Bank-side recall for approved loans (Y): {bank_recall:.3f}")
# Confusion matrix to see types of errors
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
cm_df = pd.DataFrame(
cm,
index=['Actual N', 'Actual Y'],
columns=['Pred N', 'Pred Y']
)
print("\nConfusion Matrix:")
print(cm_df)
Accuracy: 0.7967479674796748
Classification Report:
precision recall f1-score support
N (Not Approved) 0.78 0.47 0.59 38
Y (Approved) 0.80 0.94 0.86 85
accuracy 0.80 123
macro avg 0.79 0.71 0.73 123
weighted avg 0.79 0.80 0.78 123
Bank-side precision for approved loans (Y): 0.800
Bank-side recall for approved loans (Y): 0.941
Confusion Matrix:
Pred N Pred Y
Actual N 18 20
Actual Y 5 80
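Since the bank favors precision over recall, the default 0.5 threshold is itself tunable; a sketch (assuming the fitted log_reg_clf above) of how stricter thresholds trade recall for precision:
# Sketch: stricter approval thresholds on the predicted probability of class 1
proba_approved = log_reg_clf.predict_proba(X_test)[:, 1]
for threshold in [0.5, 0.6, 0.7]:
    y_pred_t = (proba_approved >= threshold).astype(int)
    print("threshold={:.1f}  precision={:.3f}  recall={:.3f}".format(
        threshold,
        precision_score(y_test, y_pred_t, pos_label=1),
        recall_score(y_test, y_pred_t, pos_label=1),
    ))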
Record metrics
log_reg_test_accuracy = accuracy_score(y_test, y_pred)
model_results.append({
"Model": "Logistic Regression",
"CV Score": log_reg_cv_scores.mean(),
"CV Scoring": "bank_precision (class 1)",
"Test Accuracy": log_reg_test_accuracy,
"Test Bank Precision": bank_precision,
"Test Bank Recall": bank_recall,
})
Decision Tree
Define baseline decision tree
dt_clf = DecisionTreeClassifier(
criterion="gini",
max_depth=None,
min_samples_split=2,
min_samples_leaf=1,
random_state=42,
)
Cross-validation (bank precision)
dt_cv_scores = cross_val_score(
dt_clf,
X_train,
y_train,
cv=cv,
scoring=bank_precision_scorer,
)
print("Decision Tree – bank precision (CV scores):", dt_cv_scores)
print(
"Decision Tree – mean bank precision (CV): {:.4f} ± {:.4f}".format(
dt_cv_scores.mean(), dt_cv_scores.std()
)
)
Decision Tree – bank precision (CV scores): [0.76470588 0.81428571 0.79104478 0.75714286 0.84615385]
Decision Tree – mean bank precision (CV): 0.7947 ± 0.0327
Hyperparameter tuning with GridSearchCV
dt_param_grid = {
"max_depth": [3, 4, 5, 6, None],
"min_samples_split": [2, 5, 10],
"min_samples_leaf": [1, 2, 4],
}
dt_grid_search = GridSearchCV(
estimator=DecisionTreeClassifier(random_state=42),
param_grid=dt_param_grid,
scoring=bank_precision_scorer,
cv=cv,
n_jobs=-1,
)
dt_grid_search.fit(X_train, y_train)
print("Best Decision Tree params:", dt_grid_search.best_params_)
print("Best CV bank precision:", dt_grid_search.best_score_)
Best Decision Tree params: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 10}
Best CV bank precision: 0.8118972693972694
Train best tree & evaluate on test set
best_dt = dt_grid_search.best_estimator_
best_dt.fit(X_train, y_train)  # harmless repeat: GridSearchCV already refit the best tree on the training data
y_pred_dt = best_dt.predict(X_test)
print("Decision Tree – test accuracy: {:.4f}".format(
accuracy_score(y_test, y_pred_dt)
))
print("\nDecision Tree – classification report:")
print(classification_report(
y_test,
y_pred_dt,
target_names=["0 (Not Approved)", "1 (Approved)"],
))
# Bank-focused metrics (approved loans = class 1)
dt_bank_precision = precision_score(y_test, y_pred_dt, pos_label=1)
dt_bank_recall = recall_score(y_test, y_pred_dt, pos_label=1)
print("\nDecision Tree – bank precision (class 1 – approved): {:.4f}".format(dt_bank_precision))
print("Decision Tree – bank recall (class 1 – approved): {:.4f}".format(dt_bank_recall))
Decision Tree – test accuracy: 0.6992
Decision Tree – classification report:
precision recall f1-score support
0 (Not Approved) 0.51 0.58 0.54 38
1 (Approved) 0.80 0.75 0.78 85
accuracy 0.70 123
macro avg 0.66 0.67 0.66 123
weighted avg 0.71 0.70 0.70 123
Decision Tree – bank precision (class 1 – approved): 0.8000
Decision Tree – bank recall (class 1 – approved): 0.7529
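To see which inputs drive the tuned tree's decisions, its impurity-based feature importances can be ranked; a sketch with feature names taken from X_train:
# Sketch: top features by impurity-based importance in the tuned tree
importances = pd.Series(best_dt.feature_importances_, index=X_train.columns)
print(importances.sort_values(ascending=False).head(5))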
Record metrics
dt_test_accuracy = accuracy_score(y_test, y_pred_dt)
model_results.append({
"Model": "Decision Tree",
"CV Score": dt_cv_scores.mean(),
"CV Scoring": "bank_precision (class 1)",
"Test Accuracy": dt_test_accuracy,
"Test Bank Precision": dt_bank_precision,
"Test Bank Recall": dt_bank_recall,
})
k-Nearest Neighbors
Define KNN pipeline
knn_pipeline = Pipeline(steps=[
("scaler", StandardScaler()),
("model", KNeighborsClassifier())
])
knn_pipeline
Pipeline(steps=[('scaler', StandardScaler()),
                ('model', KNeighborsClassifier())])
Define hyperparameter grid & GridSearchCV
# Hyperparameter grid for KNN
knn_param_grid = {
"model__n_neighbors": [3, 5, 7, 9, 11],
"model__weights": ["uniform", "distance"],
"model__p": [1, 2], # 1 = Manhattan, 2 = Euclidean
}
# Grid search using bank-side precision scorer (class 1 = approved)
knn_grid_search = GridSearchCV(
estimator=knn_pipeline,
param_grid=knn_param_grid,
scoring=bank_precision_scorer, # defined earlier
cv=cv, # same StratifiedKFold as other models
n_jobs=-1,
verbose=1,
)
knn_grid_search
GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=True),
             estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('model', KNeighborsClassifier())]),
             n_jobs=-1,
             param_grid={'model__n_neighbors': [3, 5, 7, 9, 11],
                         'model__p': [1, 2],
                         'model__weights': ['uniform', 'distance']},
             scoring=make_scorer(precision_score, response_method='predict', pos_label=1),
             verbose=1)
Fit GridSearchCV and inspect best parameters
# Fit the grid search on the training data
knn_grid_search.fit(X_train, y_train)
print("Best KNN params:", knn_grid_search.best_params_)
print("Best CV bank precision (class 1): {:.4f}".format(knn_grid_search.best_score_))
# Best estimator from the search
best_knn = knn_grid_search.best_estimator_
best_knn
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best KNN params: {'model__n_neighbors': 7, 'model__p': 2, 'model__weights': 'uniform'}
Best CV bank precision (class 1): 0.7837
Pipeline(steps=[('scaler', StandardScaler()),
                ('model', KNeighborsClassifier(n_neighbors=7))])
Evaluate the tuned KNN on the test set (GridSearchCV has already refit it on the training data)
# Predictions on the test set
y_pred_knn = best_knn.predict(X_test)
# Overall test accuracy
knn_test_acc = accuracy_score(y_test, y_pred_knn)
print("KNN – test accuracy: {:.4f}".format(knn_test_acc))
# Full classification report
print("\nKNN – classification report:\n",
classification_report(y_test, y_pred_knn))
# Bank-focused metrics (approved loans = class 1)
knn_precision_approved = precision_score(y_test, y_pred_knn, pos_label=1)
knn_recall_approved = recall_score(y_test, y_pred_knn, pos_label=1)
print("KNN – precision (class 1 – approved): {:.4f}".format(knn_precision_approved))
print("KNN – recall (class 1 – approved): {:.4f}".format(knn_recall_approved))
KNN – test accuracy: 0.7724
KNN – classification report:
precision recall f1-score support
0.0 0.73 0.42 0.53 38
1.0 0.78 0.93 0.85 85
accuracy 0.77 123
macro avg 0.75 0.68 0.69 123
weighted avg 0.77 0.77 0.75 123
KNN – precision (class 1 – approved): 0.7822
KNN – recall (class 1 – approved): 0.9294
Record metrics
model_results.append({
"Model": "KNN (best params)",
"CV Score": knn_grid_search.best_score_,
"CV Scoring": "bank_precision (class 1)",
"Test Accuracy": knn_test_acc,
"Test Bank Precision": knn_precision_approved,
"Test Bank Recall": knn_recall_approved,
})
Part 3: Model Comparison¶
Build a summary DataFrame
results_df = pd.DataFrame(model_results)
# Drop Baseline SVM from the comparison
results_df = results_df[results_df["Model"] != "Baseline SVM (RBF)"]
# Sort and reset index
results_df = (
results_df
.sort_values(by="Test Bank Precision", ascending=False)
.reset_index(drop=True)
)
results_df
| | Model | CV Score | CV Scoring | Test Accuracy | Test Bank Precision | Test Bank Recall |
|---|---|---|---|---|---|---|
| 0 | Banker SVM (RBF) | 0.793068 | bank_precision (class 1) | 0.813008 | 0.816327 | 0.941176 |
| 1 | Logistic Regression | 0.786394 | bank_precision (class 1) | 0.796748 | 0.800000 | 0.941176 |
| 2 | Decision Tree | 0.794667 | bank_precision (class 1) | 0.699187 | 0.800000 | 0.752941 |
| 3 | KNN (best params) | 0.783696 | bank_precision (class 1) | 0.772358 | 0.782178 | 0.929412 |
# Main models you want to compare
model_dict = {
"Banker SVM (RBF)": svm_banker,
"Logistic Regression": log_reg_clf,
"Decision Tree (tuned)": best_dt,
"KNN (best params)": best_knn,
}
# Human-readable class names (your y is 0/1 with 1 = approved)
class_names = ["N (Not Approved)", "Y (Approved)"]
for name, model in model_dict.items():
print("=" * 70)
print(f"MODEL: {name}")
print("=" * 70)
# Predict class labels on the test set
y_pred_model = model.predict(X_test)
# --- Classification report ---
print("\nClassification report:")
print(
classification_report(
y_test,
y_pred_model,
target_names=class_names
)
)
# --- Confusion matrix (numeric + table) ---
cm = confusion_matrix(y_test, y_pred_model, labels=[0, 1])
print("\nConfusion Matrix (table):")
cm_df = pd.DataFrame(
cm,
index=[f"Actual {c}" for c in class_names],
columns=[f"Pred {c}" for c in class_names],
)
print(cm_df)
# --- Confusion matrix plot ---
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)
disp.plot()
plt.title(f"Confusion Matrix – {name}")
plt.tight_layout()
plt.savefig("confusion_matrix.png", dpi=300, bbox_inches='tight')
plt.show()
======================================================================
MODEL: Banker SVM (RBF)
======================================================================
Classification report:
precision recall f1-score support
N (Not Approved) 0.80 0.53 0.63 38
Y (Approved) 0.82 0.94 0.87 85
accuracy 0.81 123
macro avg 0.81 0.73 0.75 123
weighted avg 0.81 0.81 0.80 123
Confusion Matrix (table):
Pred N (Not Approved) Pred Y (Approved)
Actual N (Not Approved) 20 18
Actual Y (Approved) 5 80
======================================================================
MODEL: Logistic Regression
======================================================================
Classification report:
precision recall f1-score support
N (Not Approved) 0.78 0.47 0.59 38
Y (Approved) 0.80 0.94 0.86 85
accuracy 0.80 123
macro avg 0.79 0.71 0.73 123
weighted avg 0.79 0.80 0.78 123
Confusion Matrix (table):
Pred N (Not Approved) Pred Y (Approved)
Actual N (Not Approved) 18 20
Actual Y (Approved) 5 80
======================================================================
MODEL: Decision Tree (tuned)
======================================================================
Classification report:
precision recall f1-score support
N (Not Approved) 0.51 0.58 0.54 38
Y (Approved) 0.80 0.75 0.78 85
accuracy 0.70 123
macro avg 0.66 0.67 0.66 123
weighted avg 0.71 0.70 0.70 123
Confusion Matrix (table):
Pred N (Not Approved) Pred Y (Approved)
Actual N (Not Approved) 22 16
Actual Y (Approved) 21 64
======================================================================
MODEL: KNN (best params)
======================================================================
Classification report:
precision recall f1-score support
N (Not Approved) 0.73 0.42 0.53 38
Y (Approved) 0.78 0.93 0.85 85
accuracy 0.77 123
macro avg 0.75 0.68 0.69 123
weighted avg 0.77 0.77 0.75 123
Confusion Matrix (table):
Pred N (Not Approved) Pred Y (Approved)
Actual N (Not Approved) 16 22
Actual Y (Approved) 6 79
ROC curves + Precision–Recall curves for all main models
model_dict = {
"Banker SVM (RBF)": svm_banker,
"Logistic Regression": log_reg_clf,
"Decision Tree (tuned)": best_dt,
"KNN (best params)": best_knn,
}
# -----------------------
# ROC curves (AUC)
# -----------------------
plt.figure(figsize=(6, 4))
for name, model in model_dict.items():
# Use probability for class 1 if available;
# otherwise use decision_function (SVM case)
if hasattr(model, "predict_proba"):
y_scores = model.predict_proba(X_test)[:, 1]
elif hasattr(model, "decision_function"):
y_scores = model.decision_function(X_test)
else:
print(f"[WARN] {name} has no predict_proba/decision_function; skipping ROC.")
continue
fpr, tpr, _ = roc_curve(y_test, y_scores)
auc = roc_auc_score(y_test, y_scores)
plt.plot(fpr, tpr, label=f"{name} (AUC = {auc:.3f})")
plt.plot([0, 1], [0, 1], linestyle="--") # random baseline
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curves – Loan Approval Models")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.savefig("ROC.png", dpi=300, bbox_inches='tight')
plt.show()
# -----------------------
# Precision–Recall curves
# -----------------------
plt.figure(figsize=(6, 4))
for name, model in model_dict.items():
if hasattr(model, "predict_proba"):
y_scores = model.predict_proba(X_test)[:, 1]
elif hasattr(model, "decision_function"):
y_scores = model.decision_function(X_test)
else:
print(f"[WARN] {name} has no predict_proba/decision_function; skipping PR.")
continue
precision, recall, _ = precision_recall_curve(y_test, y_scores)
ap = average_precision_score(y_test, y_scores)
plt.plot(recall, precision, label=f"{name} (AP = {ap:.3f})")
plt.xlabel("Recall (class 1 = approved)")
plt.ylabel("Precision (class 1 = approved)")
plt.title("Precision–Recall Curves – Loan Approval Models")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.savefig("ROC.png", dpi=300, bbox_inches='tight')
plt.show()
Bar chart of Test Bank Precision (main business metric)
# Bar chart: Test Bank Precision by model (zoomed y-axis)
results_sorted_precision = results_df.sort_values(
by="Test Bank Precision",
ascending=False
)
plt.figure(figsize=(6, 4))
plt.bar(
results_sorted_precision["Model"],
results_sorted_precision["Test Bank Precision"],
)
# --- Zoom the y-axis around the data range ---
min_val = results_sorted_precision["Test Bank Precision"].min()
max_val = results_sorted_precision["Test Bank Precision"].max()
delta = max_val - min_val
if delta == 0:
# all models identical – keep a tiny window around the value
plt.ylim(min_val - 0.01, max_val + 0.01)
else:
margin = delta * 0.3 # 30% padding around the range
plt.ylim(min_val - margin, max_val + margin)
# ---------------------------------------------
plt.ylabel("Test Bank Precision (class 1 – approved)")
plt.title("Comparison of Bank Precision Across Models")
plt.xticks(rotation=20)
plt.tight_layout()
plt.savefig("bar_chart.png", dpi=300, bbox_inches='tight')
plt.show()
Side-by-side bars: Precision vs Recall for each model
# Grouped bar chart: Test Bank Precision vs Test Bank Recall by model
x = np.arange(len(results_df["Model"])) # label locations
width = 0.35 # bar width
plt.figure(figsize=(7, 4))
plt.bar(
x - width/2,
results_df["Test Bank Precision"],
width,
label="Bank Precision (class 1)",
)
plt.bar(
x + width/2,
results_df["Test Bank Recall"],
width,
label="Bank Recall (class 1)",
)
plt.xticks(x, results_df["Model"], rotation=20)
plt.ylabel("Score")
plt.title("Precision vs Recall for Approved Loans (class 1)")
plt.legend()
plt.tight_layout()
plt.savefig("side_by_side.png", dpi=300, bbox_inches='tight')
plt.show()
Scatter plot: Accuracy vs Bank Precision
# Scatter plot: Test Accuracy vs Test Bank Precision
plt.figure(figsize=(6, 4))
plt.scatter(
results_df["Test Accuracy"],
results_df["Test Bank Precision"],
)
for _, row in results_df.iterrows():
plt.annotate(
row["Model"],
(row["Test Accuracy"], row["Test Bank Precision"]),
textcoords="offset points",
xytext=(5, 5),
)
plt.xlabel("Test Accuracy")
plt.ylabel("Test Bank Precision (class 1 – approved)")
plt.title("Accuracy vs Bank Precision by Model")
plt.tight_layout()
plt.savefig("scatter_plot.png", dpi=300, bbox_inches='tight')
plt.show()