import pandas as pd
from pandas.api.types import is_numeric_dtype
import kagglehub
from kagglehub import KaggleDatasetAdapter
from sklearn.model_selection import (
train_test_split,
StratifiedKFold,
cross_val_score,
GridSearchCV,
)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import (
accuracy_score,
classification_report,
precision_score,
recall_score,
confusion_matrix,
make_scorer,
ConfusionMatrixDisplay,
roc_curve,
roc_auc_score,
precision_recall_curve,
average_precision_score,
)
import matplotlib.pyplot as plt
import numpy as np
Part 0: Read Data¶
!pip -q install "kagglehub[pandas-datasets]"
import kagglehub
from kagglehub import KaggleDatasetAdapter
df = kagglehub.dataset_load(
KaggleDatasetAdapter.PANDAS,
"avineshprabhakaran/loan-eligibility-prediction",
"Loan Eligibility Prediction.csv"
)
df.head()
Using Colab cache for faster access to the 'loan-eligibility-prediction' dataset.
| | Customer_ID | Gender | Married | Dependents | Education | Self_Employed | Applicant_Income | Coapplicant_Income | Loan_Amount | Loan_Amount_Term | Credit_History | Property_Area | Loan_Status |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 569 | Female | No | 0 | Graduate | No | 2378 | 0.0 | 9 | 360 | 1 | Urban | N |
| 1 | 15 | Male | Yes | 2 | Graduate | No | 1299 | 1086.0 | 17 | 120 | 1 | Urban | Y |
| 2 | 95 | Male | No | 0 | Not Graduate | No | 3620 | 0.0 | 25 | 120 | 1 | Semiurban | Y |
| 3 | 134 | Male | Yes | 0 | Graduate | Yes | 3459 | 0.0 | 25 | 120 | 1 | Semiurban | Y |
| 4 | 556 | Male | Yes | 1 | Graduate | No | 5468 | 1032.0 | 26 | 360 | 1 | Semiurban | Y |
Part 1: Data Preparation¶
Drop Customer_ID since it's just an identifier and not useful for prediction
if "Customer_ID" in df.columns:
df = df.drop(columns=["Customer_ID"])
df.head()
| | Gender | Married | Dependents | Education | Self_Employed | Applicant_Income | Coapplicant_Income | Loan_Amount | Loan_Amount_Term | Credit_History | Property_Area | Loan_Status |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Female | No | 0 | Graduate | No | 2378 | 0.0 | 9 | 360 | 1 | Urban | N |
| 1 | Male | Yes | 2 | Graduate | No | 1299 | 1086.0 | 17 | 120 | 1 | Urban | Y |
| 2 | Male | No | 0 | Not Graduate | No | 3620 | 0.0 | 25 | 120 | 1 | Semiurban | Y |
| 3 | Male | Yes | 0 | Graduate | Yes | 3459 | 0.0 | 25 | 120 | 1 | Semiurban | Y |
| 4 | Male | Yes | 1 | Graduate | No | 5468 | 1032.0 | 26 | 360 | 1 | Semiurban | Y |
Check missing values
df.isna().sum()
| Column | Missing count |
|---|---|
| Gender | 0 |
| Married | 0 |
| Dependents | 0 |
| Education | 0 |
| Self_Employed | 0 |
| Applicant_Income | 0 |
| Coapplicant_Income | 0 |
| Loan_Amount | 0 |
| Loan_Amount_Term | 0 |
| Credit_History | 0 |
| Property_Area | 0 |
| Loan_Status | 0 |
Inspect the value counts of the binary/ordinal columns before encoding them as integers
int_like_cols = [
"Gender",
"Married",
"Education",
"Self_Employed",
"Property_Area",
"Loan_Status",
"Credit_History",
]
for col in int_like_cols:
if col in df.columns:
print(f"\nColumn: {col}")
print(df[col].value_counts(dropna=False).head(10))
Column: Gender
Gender
Male      499
Female    115
Name: count, dtype: int64

Column: Married
Married
Yes    399
No     215
Name: count, dtype: int64

Column: Education
Education
Graduate        480
Not Graduate    134
Name: count, dtype: int64

Column: Self_Employed
Self_Employed
No     523
Yes     91
Name: count, dtype: int64

Column: Property_Area
Property_Area
Semiurban    233
Urban        202
Rural        179
Name: count, dtype: int64

Column: Loan_Status
Loan_Status
Y    422
N    192
Name: count, dtype: int64

Column: Credit_History
Credit_History
1    522
0     92
Name: count, dtype: int64
# Mapping dictionaries for encoding the categorical columns as integers
category_maps = {
    "Gender": {"Male": 0, "Female": 1},
    "Married": {"No": 0, "Yes": 1},
    "Education": {"Not Graduate": 0, "Graduate": 1},
    "Self_Employed": {"No": 0, "Yes": 1},
    "Property_Area": {"Rural": 0, "Semiurban": 1, "Urban": 2},
    "Loan_Status": {"N": 0, "Y": 1},
}
# Apply each mapping only where the column exists and is still string-typed;
# Series.map avoids the pandas FutureWarning about silent downcasting in replace
for col, mapping in category_maps.items():
    if col in df.columns and df[col].dtype == object:
        df[col] = df[col].map(mapping)
# safely cast to integer type
int_like_cols = [
"Gender",
"Married",
"Education",
"Self_Employed",
"Property_Area",
"Loan_Status",
"Credit_History",
]
for col in int_like_cols:
if col in df.columns:
df[col] = pd.to_numeric(df[col], errors="coerce").astype("Int64")
df.dtypes
| Column | Dtype |
|---|---|
| Gender | Int64 |
| Married | Int64 |
| Dependents | int64 |
| Education | Int64 |
| Self_Employed | Int64 |
| Applicant_Income | int64 |
| Coapplicant_Income | float64 |
| Loan_Amount | int64 |
| Loan_Amount_Term | int64 |
| Credit_History | Int64 |
| Property_Area | Int64 |
| Loan_Status | Int64 |
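As a quick sanity check (not part of the original run; a sketch assuming int_like_cols is still in scope), confirm the coercion introduced no missing values, since errors="coerce" would have turned any unmapped value into <NA>:
# Sanity check (sketch): coercion turns unmapped values into <NA>
assert df[int_like_cols].isna().sum().sum() == 0, "unexpected <NA> after encoding"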
Add engineered features: total_income, loan_to_income, emi (loan amount spread evenly over the term, ignoring interest), and the loan term in years
if "Applicant_Income" in df.columns and "Coapplicant_Income" in df.columns:
df["total_income"] = df["Applicant_Income"] + df["Coapplicant_Income"]
if "Loan_Amount" in df.columns and "total_income" in df.columns:
df["loan_to_income"] = df["Loan_Amount"] / df["total_income"]
if "Loan_Amount" in df.columns and "Loan_Amount_Term" in df.columns:
df["emi"] = df["Loan_Amount"] / df["Loan_Amount_Term"]
if "Loan_Amount_Term" in df.columns:
df["Loan_Amount_Term_Years"] = df["Loan_Amount_Term"] / 12.0
df[["Loan_Amount", "total_income", "loan_to_income", "emi", "Loan_Amount_Term_Years"]].head()
| | Loan_Amount | total_income | loan_to_income | emi | Loan_Amount_Term_Years |
|---|---|---|---|---|---|
| 0 | 9 | 2378.0 | 0.003785 | 0.025000 | 30.0 |
| 1 | 17 | 2385.0 | 0.007128 | 0.141667 | 10.0 |
| 2 | 25 | 3620.0 | 0.006906 | 0.208333 | 10.0 |
| 3 | 25 | 3459.0 | 0.007228 | 0.208333 | 10.0 |
| 4 | 26 | 6500.0 | 0.004000 | 0.072222 | 30.0 |
df.head()
| | Gender | Married | Dependents | Education | Self_Employed | Applicant_Income | Coapplicant_Income | Loan_Amount | Loan_Amount_Term | Credit_History | Property_Area | Loan_Status | total_income | loan_to_income | emi | Loan_Amount_Term_Years |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 0 | 1 | 0 | 2378 | 0.0 | 9 | 360 | 1 | 2 | 0 | 2378.0 | 0.003785 | 0.025000 | 30.0 |
| 1 | 0 | 1 | 2 | 1 | 0 | 1299 | 1086.0 | 17 | 120 | 1 | 2 | 1 | 2385.0 | 0.007128 | 0.141667 | 10.0 |
| 2 | 0 | 0 | 0 | 0 | 0 | 3620 | 0.0 | 25 | 120 | 1 | 1 | 1 | 3620.0 | 0.006906 | 0.208333 | 10.0 |
| 3 | 0 | 1 | 0 | 1 | 1 | 3459 | 0.0 | 25 | 120 | 1 | 1 | 1 | 3459.0 | 0.007228 | 0.208333 | 10.0 |
| 4 | 0 | 1 | 1 | 1 | 0 | 5468 | 1032.0 | 26 | 360 | 1 | 1 | 1 | 6500.0 | 0.004000 | 0.072222 | 30.0 |
Setup
# Features and target
X = df.drop(columns=["Loan_Status"])
y = df["Loan_Status"]
# Train-test split (stratified to keep class balance)
X_train, X_test, y_train, y_test = train_test_split(
X, y,
test_size=0.2,
stratify=y,
random_state=42
)
# Stratified 5-fold cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
print("Train shape:", X_train.shape, "Test shape:", X_test.shape)
y.value_counts(normalize=True)
Train shape: (491, 15) Test shape: (123, 15)
| Loan_Status | proportion |
|---|---|
| 1 | 0.687296 |
| 0 | 0.312704 |
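With roughly 68.7% of loans approved, always predicting "approved" already yields about 0.687 accuracy; a small sketch to make that baseline explicit:
# Majority-class baseline: approve every loan
print("Majority-class baseline accuracy: {:.4f}".format(y.value_counts(normalize=True).max()))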
Part 2: Model¶
For the banker's use case, I focus on maximizing precision for approved loans (class 1): the costliest error is approving a loan that should have been rejected.
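Concretely, precision on class 1 is TP / (TP + FP): the share of model-approved loans that were truly eligible. A toy illustration with hypothetical labels (not model output):
# Hypothetical example: 10 approvals, 8 truly eligible -> precision = 0.8
y_true_toy = [1, 1, 1, 1, 1, 1, 1, 1, 0, 0]
y_pred_toy = [1] * 10
print(precision_score(y_true_toy, y_pred_toy, pos_label=1))  # 0.8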
SVM
svm_baseline = Pipeline(steps=[
("scaler", StandardScaler()),
("model", SVC(kernel="rbf", C=1.0, gamma="scale"))
])
cv_scores = cross_val_score(
svm_baseline, X_train, y_train,
cv=cv,
scoring="accuracy"
)
print("Baseline SVM - CV accuracies:", cv_scores)
print("Baseline SVM - Mean CV accuracy: {:.4f} ± {:.4f}".format(cv_scores.mean(), cv_scores.std()))
Baseline SVM - CV accuracies: [0.81818182 0.76530612 0.80612245 0.80612245 0.79591837]
Baseline SVM - Mean CV accuracy: 0.7983 ± 0.0180
Fit baseline SVM and evaluate with banker-focused metrics
# Fit baseline model
svm_baseline.fit(X_train, y_train)
# Predictions on test data
y_pred_baseline = svm_baseline.predict(X_test)
# Standard metrics
test_acc_baseline = accuracy_score(y_test, y_pred_baseline)
print("Baseline SVM - Test accuracy: {:.4f}".format(test_acc_baseline))
print("\nBaseline SVM - Classification report:\n",
classification_report(y_test, y_pred_baseline))
# Banker-centric metrics for class 1 (approved)
precision_approved = precision_score(y_test, y_pred_baseline, pos_label=1)
recall_approved = recall_score(y_test, y_pred_baseline, pos_label=1)
print("Baseline SVM - Precision (class 1 - approved): {:.4f}".format(precision_approved))
print("Baseline SVM - Recall (class 1 - approved): {:.4f}".format(recall_approved))
Baseline SVM - Test accuracy: 0.8049
Baseline SVM - Classification report:
precision recall f1-score support
0.0 0.79 0.50 0.61 38
1.0 0.81 0.94 0.87 85
accuracy 0.80 123
macro avg 0.80 0.72 0.74 123
weighted avg 0.80 0.80 0.79 123
Baseline SVM - Precision (class 1 - approved): 0.8081
Baseline SVM - Recall (class 1 - approved): 0.9412
Hyperparameter tuning for banker using precision on class 1
# Custom scorer: precision for class 1 (approved)
precision_scorer = make_scorer(precision_score, pos_label=1)
svm_pipeline = Pipeline(steps=[
("scaler", StandardScaler()),
("model", SVC(kernel="rbf")) # no class_weight for stricter approvals
])
param_grid = {
"model__C": [0.1, 1, 10, 50, 100],
"model__gamma": [0.01, 0.1, "scale", "auto"]
}
grid_search_precision = GridSearchCV(
svm_pipeline,
param_grid,
cv=cv,
scoring=precision_scorer, # <-- banker objective
n_jobs=-1,
verbose=1
)
grid_search_precision.fit(X_train, y_train)
print("Best params for BANKER (precision, class 1):", grid_search_precision.best_params_)
print("Best CV precision (class 1): {:.4f}".format(grid_search_precision.best_score_))
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best params for BANKER (precision, class 1): {'model__C': 50, 'model__gamma': 0.01}
Best CV precision (class 1): 0.7931
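To gauge how close the runner-up settings were, the grid's full CV table can be inspected; a sketch using GridSearchCV's cv_results_:
# Sketch: top parameter settings ranked by mean CV precision on class 1
cv_results = pd.DataFrame(grid_search_precision.cv_results_)
cols = ["param_model__C", "param_model__gamma", "mean_test_score", "std_test_score"]
print(cv_results[cols].sort_values("mean_test_score", ascending=False).head())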
Evaluate the banker-tuned SVM on the held-out test set
svm_banker = grid_search_precision.best_estimator_
# GridSearchCV (refit=True by default) has already refit the best estimator
# on the full training set, so this explicit fit is a harmless repeat
svm_banker.fit(X_train, y_train)
# Predictions on test data
y_pred_banker = svm_banker.predict(X_test)
# Overall accuracy
test_acc_banker = accuracy_score(y_test, y_pred_banker)
print("Banker SVM - Test accuracy: {:.4f}".format(test_acc_banker))
# Full classification report
print("\nBanker SVM - Classification report:\n",
classification_report(y_test, y_pred_banker))
# Banker-focused metrics: precision & recall on approved loans (class 1)
precision_approved_banker = precision_score(y_test, y_pred_banker, pos_label=1)
recall_approved_banker = recall_score(y_test, y_pred_banker, pos_label=1)
print("Banker SVM - Precision (class 1 - approved): {:.4f}".format(precision_approved_banker))
print("Banker SVM - Recall (class 1 - approved): {:.4f}".format(recall_approved_banker))
Banker SVM - Test accuracy: 0.8130
Banker SVM - Classification report:
precision recall f1-score support
0.0 0.80 0.53 0.63 38
1.0 0.82 0.94 0.87 85
accuracy 0.81 123
macro avg 0.81 0.73 0.75 123
weighted avg 0.81 0.81 0.80 123
Banker SVM - Precision (class 1 - approved): 0.8163
Banker SVM - Recall (class 1 - approved): 0.9412
Record metrics
model_results = []
model_results.append({
"Model": "Baseline SVM (RBF)",
"CV Score": cv_scores.mean(), # accuracy CV from svm_baseline
"CV Scoring": "accuracy",
"Test Accuracy": test_acc_baseline,
"Test Bank Precision": precision_approved,
"Test Bank Recall": recall_approved,
})
# Banker SVM metrics (CV used bank_precision via GridSearchCV)
model_results.append({
"Model": "Banker SVM (RBF)",
"CV Score": grid_search_precision.best_score_, # bank precision (class 1)
"CV Scoring": "bank_precision (class 1)",
"Test Accuracy": test_acc_banker,
"Test Bank Precision": precision_approved_banker,
"Test Bank Recall": recall_approved_banker,
})
Logistic Regression
Define bank-side metric
# Bank-side scorer: high precision on approved loans (class 1)
bank_precision_scorer = make_scorer(
precision_score,
pos_label=1
)
Define logistic regression pipeline (bank-biased)
log_reg_clf = Pipeline(steps=[
("scaler", StandardScaler()),
("model", LogisticRegression(
max_iter=1000,
class_weight={0: 1.5, 1: 1.0}, # bank: be extra careful about bad loans
solver='lbfgs'
))
])
Cross-validation focusing on bank precision
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
log_reg_cv_scores = cross_val_score(
log_reg_clf,
X_train,
y_train,
cv=cv,
scoring=bank_precision_scorer
)
print("Logistic Regression - Bank precision (CV scores):", log_reg_cv_scores)
print("Mean bank precision (CV):", log_reg_cv_scores.mean())
print("Std:", log_reg_cv_scores.std())
Logistic Regression - Bank precision (CV scores): [0.79518072 0.78571429 0.78481013 0.79518072 0.77108434]
Mean bank precision (CV): 0.7863940390858188
Std: 0.008851773102915926
Fit on train, evaluate on test (bank-focused view)
# Fit on training data
log_reg_clf.fit(X_train, y_train)
# Predict with default 0.5 threshold
y_pred = log_reg_clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=['N (Not Approved)', 'Y (Approved)']))
# Bank-side metrics explicitly
bank_precision = precision_score(y_test, y_pred, pos_label=1)
bank_recall = recall_score(y_test, y_pred, pos_label=1)
print(f"\nBank-side precision for approved loans (Y): {bank_precision:.3f}")
print(f"Bank-side recall for approved loans (Y): {bank_recall:.3f}")
# Confusion matrix to see types of errors
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
cm_df = pd.DataFrame(
cm,
index=['Actual N', 'Actual Y'],
columns=['Pred N', 'Pred Y']
)
print("\nConfusion Matrix:")
print(cm_df)
Accuracy: 0.7967479674796748
Classification Report:
precision recall f1-score support
N (Not Approved) 0.78 0.47 0.59 38
Y (Approved) 0.80 0.94 0.86 85
accuracy 0.80 123
macro avg 0.79 0.71 0.73 123
weighted avg 0.79 0.80 0.78 123
Bank-side precision for approved loans (Y): 0.800
Bank-side recall for approved loans (Y): 0.941
Confusion Matrix:
Pred N Pred Y
Actual N 18 20
Actual Y 5 80
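Since the bank favors precision over recall, the default 0.5 threshold is itself tunable; a sketch (assuming the fitted log_reg_clf above) of how stricter thresholds trade recall for precision:
# Sketch: stricter approval thresholds on the predicted probability of class 1
proba_approved = log_reg_clf.predict_proba(X_test)[:, 1]
for threshold in [0.5, 0.6, 0.7]:
    y_pred_t = (proba_approved >= threshold).astype(int)
    print("threshold={:.1f}  precision={:.3f}  recall={:.3f}".format(
        threshold,
        precision_score(y_test, y_pred_t, pos_label=1),
        recall_score(y_test, y_pred_t, pos_label=1),
    ))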
Record metrics
log_reg_test_accuracy = accuracy_score(y_test, y_pred)
model_results.append({
"Model": "Logistic Regression",
"CV Score": log_reg_cv_scores.mean(),
"CV Scoring": "bank_precision (class 1)",
"Test Accuracy": log_reg_test_accuracy,
"Test Bank Precision": bank_precision,
"Test Bank Recall": bank_recall,
})
Decision Tree
Define baseline decision tree
dt_clf = DecisionTreeClassifier(
criterion="gini",
max_depth=None,
min_samples_split=2,
min_samples_leaf=1,
random_state=42,
)
Cross-validation (bank precision)
dt_cv_scores = cross_val_score(
dt_clf,
X_train,
y_train,
cv=cv,
scoring=bank_precision_scorer,
)
print("Decision Tree – bank precision (CV scores):", dt_cv_scores)
print(
"Decision Tree – mean bank precision (CV): {:.4f} ± {:.4f}".format(
dt_cv_scores.mean(), dt_cv_scores.std()
)
)
Decision Tree – bank precision (CV scores): [0.76470588 0.81428571 0.79104478 0.75714286 0.84615385]
Decision Tree – mean bank precision (CV): 0.7947 ± 0.0327
Hyperparameter tuning with GridSearchCV
dt_param_grid = {
"max_depth": [3, 4, 5, 6, None],
"min_samples_split": [2, 5, 10],
"min_samples_leaf": [1, 2, 4],
}
dt_grid_search = GridSearchCV(
estimator=DecisionTreeClassifier(random_state=42),
param_grid=dt_param_grid,
scoring=bank_precision_scorer,
cv=cv,
n_jobs=-1,
)
dt_grid_search.fit(X_train, y_train)
print("Best Decision Tree params:", dt_grid_search.best_params_)
print("Best CV bank precision:", dt_grid_search.best_score_)
Best Decision Tree params: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 10}
Best CV bank precision: 0.8118972693972694
Train best tree & evaluate on test set
best_dt = dt_grid_search.best_estimator_
best_dt.fit(X_train, y_train)  # harmless repeat: GridSearchCV already refit the best tree on the training data
y_pred_dt = best_dt.predict(X_test)
print("Decision Tree – test accuracy: {:.4f}".format(
accuracy_score(y_test, y_pred_dt)
))
print("\nDecision Tree – classification report:")
print(classification_report(
y_test,
y_pred_dt,
target_names=["0 (Not Approved)", "1 (Approved)"],
))
# Bank-focused metrics (approved loans = class 1)
dt_bank_precision = precision_score(y_test, y_pred_dt, pos_label=1)
dt_bank_recall = recall_score(y_test, y_pred_dt, pos_label=1)
print("\nDecision Tree – bank precision (class 1 – approved): {:.4f}".format(dt_bank_precision))
print("Decision Tree – bank recall (class 1 – approved): {:.4f}".format(dt_bank_recall))
Decision Tree – test accuracy: 0.6992
Decision Tree – classification report:
precision recall f1-score support
0 (Not Approved) 0.51 0.58 0.54 38
1 (Approved) 0.80 0.75 0.78 85
accuracy 0.70 123
macro avg 0.66 0.67 0.66 123
weighted avg 0.71 0.70 0.70 123
Decision Tree – bank precision (class 1 – approved): 0.8000
Decision Tree – bank recall (class 1 – approved): 0.7529
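To see which inputs drive the tuned tree's decisions, its impurity-based feature importances can be ranked; a sketch with feature names taken from X_train:
# Sketch: top features by impurity-based importance in the tuned tree
importances = pd.Series(best_dt.feature_importances_, index=X_train.columns)
print(importances.sort_values(ascending=False).head(5))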
Record metrics
dt_test_accuracy = accuracy_score(y_test, y_pred_dt)
model_results.append({
"Model": "Decision Tree",
"CV Score": dt_cv_scores.mean(),
"CV Scoring": "bank_precision (class 1)",
"Test Accuracy": dt_test_accuracy,
"Test Bank Precision": dt_bank_precision,
"Test Bank Recall": dt_bank_recall,
})
k-Nearest Neighbors
Define KNN pipeline
knn_pipeline = Pipeline(steps=[
("scaler", StandardScaler()),
("model", KNeighborsClassifier())
])
knn_pipeline
Pipeline(steps=[('scaler', StandardScaler()),
                ('model', KNeighborsClassifier())])
Define hyperparameter grid & GridSearchCV
# Hyperparameter grid for KNN
knn_param_grid = {
"model__n_neighbors": [3, 5, 7, 9, 11],
"model__weights": ["uniform", "distance"],
"model__p": [1, 2], # 1 = Manhattan, 2 = Euclidean
}
# Grid search using bank-side precision scorer (class 1 = approved)
knn_grid_search = GridSearchCV(
estimator=knn_pipeline,
param_grid=knn_param_grid,
scoring=bank_precision_scorer, # defined earlier
cv=cv, # same StratifiedKFold as other models
n_jobs=-1,
verbose=1,
)
knn_grid_search
GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=True),
             estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('model', KNeighborsClassifier())]),
             n_jobs=-1,
             param_grid={'model__n_neighbors': [3, 5, 7, 9, 11],
                         'model__p': [1, 2],
                         'model__weights': ['uniform', 'distance']},
             scoring=make_scorer(precision_score, response_method='predict', pos_label=1),
             verbose=1)
Fit GridSearchCV and inspect best parameters
# Fit the grid search on the training data
knn_grid_search.fit(X_train, y_train)
print("Best KNN params:", knn_grid_search.best_params_)
print("Best CV bank precision (class 1): {:.4f}".format(knn_grid_search.best_score_))
# Best estimator from the search
best_knn = knn_grid_search.best_estimator_
best_knn
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best KNN params: {'model__n_neighbors': 7, 'model__p': 2, 'model__weights': 'uniform'}
Best CV bank precision (class 1): 0.7837
Pipeline(steps=[('scaler', StandardScaler()),
                ('model', KNeighborsClassifier(n_neighbors=7))])
Evaluate the tuned KNN on the test set (GridSearchCV has already refit it on the training data)
# Predictions on the test set
y_pred_knn = best_knn.predict(X_test)
# Overall test accuracy
knn_test_acc = accuracy_score(y_test, y_pred_knn)
print("KNN – test accuracy: {:.4f}".format(knn_test_acc))
# Full classification report
print("\nKNN – classification report:\n",
classification_report(y_test, y_pred_knn))
# Bank-focused metrics (approved loans = class 1)
knn_precision_approved = precision_score(y_test, y_pred_knn, pos_label=1)
knn_recall_approved = recall_score(y_test, y_pred_knn, pos_label=1)
print("KNN – precision (class 1 – approved): {:.4f}".format(knn_precision_approved))
print("KNN – recall (class 1 – approved): {:.4f}".format(knn_recall_approved))
KNN – test accuracy: 0.7724
KNN – classification report:
precision recall f1-score support
0.0 0.73 0.42 0.53 38
1.0 0.78 0.93 0.85 85
accuracy 0.77 123
macro avg 0.75 0.68 0.69 123
weighted avg 0.77 0.77 0.75 123
KNN – precision (class 1 – approved): 0.7822
KNN – recall (class 1 – approved): 0.9294
Record metrics
model_results.append({
"Model": "KNN (best params)",
"CV Score": knn_grid_search.best_score_,
"CV Scoring": "bank_precision (class 1)",
"Test Accuracy": knn_test_acc,
"Test Bank Precision": knn_precision_approved,
"Test Bank Recall": knn_recall_approved,
})
Part 3: Model Comparison¶
Build a summary DataFrame
results_df = pd.DataFrame(model_results)
# Drop Baseline SVM from the comparison
results_df = results_df[results_df["Model"] != "Baseline SVM (RBF)"]
# Sort and reset index
results_df = (
results_df
.sort_values(by="Test Bank Precision", ascending=False)
.reset_index(drop=True)
)
results_df
| | Model | CV Score | CV Scoring | Test Accuracy | Test Bank Precision | Test Bank Recall |
|---|---|---|---|---|---|---|
| 0 | Banker SVM (RBF) | 0.793068 | bank_precision (class 1) | 0.813008 | 0.816327 | 0.941176 |
| 1 | Logistic Regression | 0.786394 | bank_precision (class 1) | 0.796748 | 0.800000 | 0.941176 |
| 2 | Decision Tree | 0.794667 | bank_precision (class 1) | 0.699187 | 0.800000 | 0.752941 |
| 3 | KNN (best params) | 0.783696 | bank_precision (class 1) | 0.772358 | 0.782178 | 0.929412 |
# Main models you want to compare
model_dict = {
"Banker SVM (RBF)": svm_banker,
"Logistic Regression": log_reg_clf,
"Decision Tree (tuned)": best_dt,
"KNN (best params)": best_knn,
}
# Human-readable class names (your y is 0/1 with 1 = approved)
class_names = ["N (Not Approved)", "Y (Approved)"]
for name, model in model_dict.items():
print("=" * 70)
print(f"MODEL: {name}")
print("=" * 70)
# Predict class labels on the test set
y_pred_model = model.predict(X_test)
# --- Classification report ---
print("\nClassification report:")
print(
classification_report(
y_test,
y_pred_model,
target_names=class_names
)
)
# --- Confusion matrix (numeric + table) ---
cm = confusion_matrix(y_test, y_pred_model, labels=[0, 1])
print("\nConfusion Matrix (table):")
cm_df = pd.DataFrame(
cm,
index=[f"Actual {c}" for c in class_names],
columns=[f"Pred {c}" for c in class_names],
)
print(cm_df)
# --- Confusion matrix plot ---
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)
disp.plot()
plt.title(f"Confusion Matrix – {name}")
plt.tight_layout()
plt.savefig("confusion_matrix.png", dpi=300, bbox_inches='tight')
plt.show()
======================================================================
MODEL: Banker SVM (RBF)
======================================================================
Classification report:
precision recall f1-score support
N (Not Approved) 0.80 0.53 0.63 38
Y (Approved) 0.82 0.94 0.87 85
accuracy 0.81 123
macro avg 0.81 0.73 0.75 123
weighted avg 0.81 0.81 0.80 123
Confusion Matrix (table):
Pred N (Not Approved) Pred Y (Approved)
Actual N (Not Approved) 20 18
Actual Y (Approved) 5 80
======================================================================
MODEL: Logistic Regression
======================================================================
Classification report:
precision recall f1-score support
N (Not Approved) 0.78 0.47 0.59 38
Y (Approved) 0.80 0.94 0.86 85
accuracy 0.80 123
macro avg 0.79 0.71 0.73 123
weighted avg 0.79 0.80 0.78 123
Confusion Matrix (table):
Pred N (Not Approved) Pred Y (Approved)
Actual N (Not Approved) 18 20
Actual Y (Approved) 5 80
======================================================================
MODEL: Decision Tree (tuned)
======================================================================
Classification report:
precision recall f1-score support
N (Not Approved) 0.51 0.58 0.54 38
Y (Approved) 0.80 0.75 0.78 85
accuracy 0.70 123
macro avg 0.66 0.67 0.66 123
weighted avg 0.71 0.70 0.70 123
Confusion Matrix (table):
Pred N (Not Approved) Pred Y (Approved)
Actual N (Not Approved) 22 16
Actual Y (Approved) 21 64
======================================================================
MODEL: KNN (best params)
======================================================================
Classification report:
precision recall f1-score support
N (Not Approved) 0.73 0.42 0.53 38
Y (Approved) 0.78 0.93 0.85 85
accuracy 0.77 123
macro avg 0.75 0.68 0.69 123
weighted avg 0.77 0.77 0.75 123
Confusion Matrix (table):
Pred N (Not Approved) Pred Y (Approved)
Actual N (Not Approved) 16 22
Actual Y (Approved) 6 79
ROC curves + Precision–Recall curves for all main models
model_dict = {
"Banker SVM (RBF)": svm_banker,
"Logistic Regression": log_reg_clf,
"Decision Tree (tuned)": best_dt,
"KNN (best params)": best_knn,
}
# -----------------------
# ROC curves (AUC)
# -----------------------
plt.figure(figsize=(6, 4))
for name, model in model_dict.items():
# Use probability for class 1 if available;
# otherwise use decision_function (SVM case)
if hasattr(model, "predict_proba"):
y_scores = model.predict_proba(X_test)[:, 1]
elif hasattr(model, "decision_function"):
y_scores = model.decision_function(X_test)
else:
print(f"[WARN] {name} has no predict_proba/decision_function; skipping ROC.")
continue
fpr, tpr, _ = roc_curve(y_test, y_scores)
auc = roc_auc_score(y_test, y_scores)
plt.plot(fpr, tpr, label=f"{name} (AUC = {auc:.3f})")
plt.plot([0, 1], [0, 1], linestyle="--") # random baseline
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curves – Loan Approval Models")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.savefig("ROC.png", dpi=300, bbox_inches='tight')
plt.show()
# -----------------------
# Precision–Recall curves
# -----------------------
plt.figure(figsize=(6, 4))
for name, model in model_dict.items():
if hasattr(model, "predict_proba"):
y_scores = model.predict_proba(X_test)[:, 1]
elif hasattr(model, "decision_function"):
y_scores = model.decision_function(X_test)
else:
print(f"[WARN] {name} has no predict_proba/decision_function; skipping PR.")
continue
precision, recall, _ = precision_recall_curve(y_test, y_scores)
ap = average_precision_score(y_test, y_scores)
plt.plot(recall, precision, label=f"{name} (AP = {ap:.3f})")
plt.xlabel("Recall (class 1 = approved)")
plt.ylabel("Precision (class 1 = approved)")
plt.title("Precision–Recall Curves – Loan Approval Models")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.savefig("ROC.png", dpi=300, bbox_inches='tight')
plt.show()
Bar chart of Test Bank Precision (main business metric)
# Bar chart: Test Bank Precision by model (zoomed y-axis)
results_sorted_precision = results_df.sort_values(
by="Test Bank Precision",
ascending=False
)
plt.figure(figsize=(6, 4))
plt.bar(
results_sorted_precision["Model"],
results_sorted_precision["Test Bank Precision"],
)
# --- Zoom the y-axis around the data range ---
min_val = results_sorted_precision["Test Bank Precision"].min()
max_val = results_sorted_precision["Test Bank Precision"].max()
delta = max_val - min_val
if delta == 0:
# all models identical – keep a tiny window around the value
plt.ylim(min_val - 0.01, max_val + 0.01)
else:
margin = delta * 0.3 # 30% padding around the range
plt.ylim(min_val - margin, max_val + margin)
# ---------------------------------------------
plt.ylabel("Test Bank Precision (class 1 – approved)")
plt.title("Comparison of Bank Precision Across Models")
plt.xticks(rotation=20)
plt.tight_layout()
plt.savefig("bar_chart.png", dpi=300, bbox_inches='tight')
plt.show()
Side-by-side bars: Precision vs Recall for each model
# Grouped bar chart: Test Bank Precision vs Test Bank Recall by model
x = np.arange(len(results_df["Model"])) # label locations
width = 0.35 # bar width
plt.figure(figsize=(7, 4))
plt.bar(
x - width/2,
results_df["Test Bank Precision"],
width,
label="Bank Precision (class 1)",
)
plt.bar(
x + width/2,
results_df["Test Bank Recall"],
width,
label="Bank Recall (class 1)",
)
plt.xticks(x, results_df["Model"], rotation=20)
plt.ylabel("Score")
plt.title("Precision vs Recall for Approved Loans (class 1)")
plt.legend()
plt.tight_layout()
plt.savefig("side_by_side.png", dpi=300, bbox_inches='tight')
plt.show()
Scatter plot: Accuracy vs Bank Precision
# Scatter plot: Test Accuracy vs Test Bank Precision
plt.figure(figsize=(6, 4))
plt.scatter(
results_df["Test Accuracy"],
results_df["Test Bank Precision"],
)
for _, row in results_df.iterrows():
plt.annotate(
row["Model"],
(row["Test Accuracy"], row["Test Bank Precision"]),
textcoords="offset points",
xytext=(5, 5),
)
plt.xlabel("Test Accuracy")
plt.ylabel("Test Bank Precision (class 1 – approved)")
plt.title("Accuracy vs Bank Precision by Model")
plt.tight_layout()
plt.savefig("scatter_plot.png", dpi=300, bbox_inches='tight')
plt.show()