#Import Libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import tree
from zipfile import ZipFile
import urllib.request
from io import BytesIO
# Download the project archive and load both customer CSV files from it.
# Context managers make sure the HTTP response, the zip archive and the
# member file handles are closed once the data has been read into memory.
with urllib.request.urlopen('https://s3.amazonaws.com/projex.dezyre.com/classification-algorithms-for-digital-transformation-in-banking/materials/data.zip') as folder:
    archive_bytes = BytesIO(folder.read())

with ZipFile(archive_bytes) as zipfile:
    zipfile.namelist()
    ## ['data/Data1.csv', 'data/Data2.csv']
    # Load customer data present in CSV file
    with zipfile.open("data/Data1.csv") as csv_file:
        data1 = pd.read_csv(csv_file)
    with zipfile.open("data/Data2.csv") as csv_file:
        data2 = pd.read_csv(csv_file)

print(data1.shape)
## (5000, 8)
print(data2.shape)
## (5000, 7)
# Combine both tables on the shared customer ID
cust_data = data1.merge(data2, how='inner', on='ID')
print(cust_data.shape)
## (5000, 14)
# Explore data types
cust_data.dtypes
## ID int64
## Age int64
## CustomerSince int64
## HighestSpend int64
## ZipCode int64
## HiddenScore int64
## MonthlyAverageSpend float64
## Level int64
## Mortgage int64
## Security int64
## FixedDepositAccount int64
## InternetBanking int64
## CreditCard int64
## LoanOnCard float64
## dtype: object
# Remove rows containing null values, then explore the remaining size.
# The recorded shape below (4980 rows, down from 5000) is only reached after
# this step.
cust_data = cust_data.dropna()
cust_data.shape
## (4980, 14)
# Inspect how the records are distributed across the target class.
sns.countplot(data=cust_data, x='LoanOnCard');
loan_flags = cust_data['LoanOnCard']
n_true = int((loan_flags == 1.0).sum())
n_false = int((loan_flags == 0.0).sum())
n_labelled = n_true + n_false
print("Number of true cases: {0} ({1:2.2f}%)".format(n_true, (n_true / n_labelled) * 100))
## Number of true cases: 480 (9.64%)
print("Number of false cases: {0} ({1:2.2f}%)".format(n_false, (n_false / n_labelled) * 100))
## Number of false cases: 4500 (90.36%)
# Scatter plot of "MonthlyAverageSpend" against "HighestSpend" (log-scaled
# x axis), with points coloured by the target class.
g = sns.scatterplot(
    data=cust_data,
    x="HighestSpend",
    y="MonthlyAverageSpend",
    hue="LoanOnCard",
    legend='full',
)
g.set(xscale="log")
## [None]
# For each feature, draw its histogram side by side for the two target
# classes: LoanOnCard == 0.0 on the left, LoanOnCard == 1.0 on the right.
for feature in ('Mortgage', 'FixedDepositAccount'):
    fig, ax = plt.subplots(1, 2)
    for panel, loan_flag in zip(ax, (0.0, 1.0)):
        sns.histplot(cust_data.loc[cust_data.LoanOnCard == loan_flag, feature], ax = panel)
    plt.show()
# Plot one histogram per column, leaving out the last column via the [0:-1] slice.
columns = list(cust_data)[0:-1] # Every column except the last one (the LoanOnCard target)
cust_data[columns].hist(stacked=False, bins=100, figsize=(12,30), layout=(14,2));
# Histograms of all selected columns; the 14x2 layout has more axes than columns, so the surplus axes stay empty
## array([[<AxesSubplot: title={'center': 'ID'}>,
## <AxesSubplot: title={'center': 'Age'}>],
## [<AxesSubplot: title={'center': 'CustomerSince'}>,
## <AxesSubplot: title={'center': 'HighestSpend'}>],
## [<AxesSubplot: title={'center': 'ZipCode'}>,
## <AxesSubplot: title={'center': 'HiddenScore'}>],
## [<AxesSubplot: title={'center': 'MonthlyAverageSpend'}>,
## <AxesSubplot: title={'center': 'Level'}>],
## [<AxesSubplot: title={'center': 'Mortgage'}>,
## <AxesSubplot: title={'center': 'Security'}>],
## [<AxesSubplot: title={'center': 'FixedDepositAccount'}>,
## <AxesSubplot: title={'center': 'InternetBanking'}>],
## [<AxesSubplot: title={'center': 'CreditCard'}>, <AxesSubplot: >],
## [<AxesSubplot: >, <AxesSubplot: >],
## [<AxesSubplot: >, <AxesSubplot: >],
## [<AxesSubplot: >, <AxesSubplot: >],
## [<AxesSubplot: >, <AxesSubplot: >],
## [<AxesSubplot: >, <AxesSubplot: >],
## [<AxesSubplot: >, <AxesSubplot: >],
## [<AxesSubplot: >, <AxesSubplot: >]], dtype=object)
# Pairwise relationships between all columns, coloured by the target class
sns.pairplot(cust_data, hue='LoanOnCard', height=3)
# ZipCode is removed before the correlation analysis
cust_data = cust_data.drop('ZipCode', axis=1)
# Correlation analysis
corr = cust_data.corr()
corr
## ID Age ... CreditCard LoanOnCard
## ID 1.000000 -0.010682 ... 0.015741 -0.027188
## Age -0.010682 1.000000 ... 0.007344 -0.008147
## CustomerSince -0.010366 0.994208 ... 0.008779 -0.007801
## HighestSpend -0.020739 -0.054951 ... -0.002780 0.502626
## HiddenScore -0.015721 -0.045289 ... 0.010784 0.061761
## MonthlyAverageSpend -0.026419 -0.051896 ... -0.006577 0.366912
## Level 0.021763 0.042750 ... -0.011766 0.137010
## Mortgage -0.015546 -0.013272 ... -0.007600 0.141947
## Security -0.017160 0.000323 ... -0.014518 0.021982
## FixedDepositAccount -0.008690 0.007744 ... 0.278924 0.316131
## InternetBanking -0.003940 0.011227 ... 0.004960 0.006034
## CreditCard 0.015741 0.007344 ... 1.000000 0.002536
## LoanOnCard -0.027188 -0.008147 ... 0.002536 1.000000
##
## [13 rows x 13 columns]
# Heatmap of the pairwise correlations; every cell is annotated with its
# value rounded to two decimals.
fig, ax = plt.subplots(figsize=(10, 10))
heatmap_options = dict(annot=True, fmt='.2f', linewidths=0.05, cmap="magma")
sns.heatmap(cust_data.corr(), ax=ax, **heatmap_options)
plt.show()
# We will use 70% of the data for training and 30% for testing.
from sklearn.model_selection import train_test_split
# Predictor feature columns: every column except the target.
# NOTE(review): the ID column is still part of the predictors here; confirm
# whether an identifier should be used as a model feature.
X = cust_data.drop(columns='LoanOnCard')
# Predicted class (1=True, 0=False)
Y = cust_data.LoanOnCard
# Hold out 30% of the rows for testing; random_state=1 is just a fixed seed
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, random_state=1, test_size=0.3
)
x_train.head()
## ID Age ... InternetBanking CreditCard
## 1479 1480 28 ... 0 0
## 1727 1728 52 ... 0 1
## 2843 2844 27 ... 1 1
## 4106 4107 48 ... 0 0
## 1768 1769 43 ... 0 0
##
## [5 rows x 12 columns]
# import model and metrics
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, precision_score, recall_score, f1_score
# Fit the model on train
model = LogisticRegression(solver="liblinear")
model.fit(x_train, y_train)
## LogisticRegression(solver='liblinear')
# predict on test
y_predict = model.predict(x_test)
# Collect the fitted coefficients and the intercept in one table
coef_df = pd.DataFrame(model.coef_)
coef_df['intercept'] = model.intercept_
print(coef_df)
## 0 1 2 ... 10 11 intercept
## 0 -0.00004 -0.469698 0.464513 ... -0.572026 -0.912256 -0.537507
##
## [1 rows x 13 columns]
model_score = model.score(x_test, y_test)
print(model_score)
## 0.9437751004016064
# performance
print(f'Accuracy Score: {accuracy_score(y_test,y_predict)}')
## Accuracy Score: 0.9437751004016064
print(f'Confusion Matrix: \n{confusion_matrix(y_test, y_predict)}')
## Confusion Matrix:
## [[1321 20]
## [ 64 89]]
# NOTE(review): the AUC below is computed from the hard class labels returned
# by predict(); consider passing predict_proba scores instead - confirm intent.
print(f'Area Under Curve: {roc_auc_score(y_test, y_predict)}')
## Area Under Curve: 0.7833925516515331
print(f'Recall score: {recall_score(y_test,y_predict)}')
## Recall score: 0.5816993464052288
print(f'Precision score: {precision_score(y_test,y_predict)}')
## Precision score: 0.8165137614678899
print(f'f1 score: {f1_score(y_test,y_predict)}')
## f1 score: 0.6793893129770994
# define class weights: class 1 is weighted twice as heavily as class 0
w = {0:1, 1:2}
# Fit the model on train
model_weighted = LogisticRegression(solver="liblinear", class_weight=w)
model_weighted.fit(x_train, y_train)
## LogisticRegression(class_weight={0: 1, 1: 2}, solver='liblinear')
# predict on test
y_predict = model_weighted.predict(x_test)
# performance
print(f'Accuracy Score: {accuracy_score(y_test,y_predict)}')
## Accuracy Score: 0.9390896921017403
print(f'Confusion Matrix: \n{confusion_matrix(y_test, y_predict)}')
## Confusion Matrix:
## [[1301 40]
## [ 51 102]]
print(f'Area Under Curve: {roc_auc_score(y_test, y_predict)}')
## Area Under Curve: 0.8184190902311708
print(f'Recall score: {recall_score(y_test,y_predict)}')
## Recall score: 0.6666666666666666
print(f'Precision score: {precision_score(y_test,y_predict)}')
## Precision score: 0.7183098591549296
print(f'f1 score: {f1_score(y_test,y_predict)}')
## f1 score: 0.6915254237288136
from sklearn.naive_bayes import GaussianNB # using Gaussian algorithm from Naive Bayes
from sklearn import metrics
# create the model
diab_model = GaussianNB()
diab_model.fit(x_train, y_train)
## GaussianNB()
# Accuracy on the training split
diab_train_predict = diab_model.predict(x_train)
print("Model Accuracy: {0:.4f}".format(metrics.accuracy_score(y_train, diab_train_predict)))
## Model Accuracy: 0.8907
print()
# Accuracy on the test split
y_predict = diab_model.predict(x_test)
print("Model Accuracy: {0:.4f}".format(metrics.accuracy_score(y_test, y_predict)))
## Model Accuracy: 0.8829
print()
# performance
print(f'Accuracy Score: {accuracy_score(y_test,y_predict)}')
## Accuracy Score: 0.8828647925033467
print(f'Confusion Matrix: \n{confusion_matrix(y_test, y_predict)}')
## Confusion Matrix:
## [[1232 109]
## [ 66 87]]
print(f'Area Under Curve: {roc_auc_score(y_test, y_predict)}')
## Area Under Curve: 0.7436724130368032
print(f'Recall score: {recall_score(y_test,y_predict)}')
## Recall score: 0.5686274509803921
# Gaussian Naive Bayes with explicit class priors instead of priors
# estimated from the training data.
# NOTE(review): priors presumably follow the class order (0, 1), so 0.9 is
# assigned to the minority class 1 - confirm this is intended.
diab_model_cp = GaussianNB(priors=[0.1, 0.9])
# y_train is passed directly, like in the other fits; Series.ravel() is
# deprecated in recent pandas versions.
diab_model_cp.fit(x_train, y_train)
## GaussianNB(priors=[0.1, 0.9])
y_predict = diab_model_cp.predict(x_test)
# performance
print(f'Accuracy Score: {accuracy_score(y_test,y_predict)}')
## Accuracy Score: 0.8159303882195449
print(f'Confusion Matrix: \n{confusion_matrix(y_test, y_predict)}')
## Confusion Matrix:
## [[1078 263]
## [ 12 141]]
print(f'Area Under Curve: {roc_auc_score(y_test, y_predict)}')
## Area Under Curve: 0.8627231653287712
print(f'Recall score: {recall_score(y_test,y_predict)}')
## Recall score: 0.9215686274509803
from sklearn import svm
# Support vector classifier trained on the unscaled features
clf = svm.SVC(gamma=0.25, C=10)
clf.fit(x_train , y_train)
## SVC(C=10, gamma=0.25)
y_predict = clf.predict(x_test)
# performance: the confusion matrix below shows that no test row is
# predicted as class 1, hence recall, precision and f1 of 0.0
print(f'Accuracy Score: {accuracy_score(y_test,y_predict)}')
## Accuracy Score: 0.8975903614457831
print(f'Confusion Matrix: \n{confusion_matrix(y_test, y_predict)}')
## Confusion Matrix:
## [[1341 0]
## [ 153 0]]
print(f'Area Under Curve: {roc_auc_score(y_test, y_predict)}')
## Area Under Curve: 0.5
print(f'Recall score: {recall_score(y_test,y_predict)}')
## Recall score: 0.0
print(f'Precision score: {precision_score(y_test,y_predict)}')
## Precision score: 0.0
##
## C:\Users\Erick Yegon\AppData\Roaming\Python\Python39\site-packages\sklearn\metrics\_classification.py:1334: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.
## _warn_prf(average, modifier, msg_start, len(result))
print(f'f1 score: {f1_score(y_test,y_predict)}')
## f1 score: 0.0
# Standardise every predictor column with scipy's zscore.
# NOTE(review): the scaling is applied to the full feature table X before it
# is split into train and test - confirm this is acceptable for evaluation.
from scipy.stats import zscore
XScaled = X.apply(zscore) # convert all attributes to Z scale
# Summary statistics of the scaled columns
XScaled.describe()
## ID Age ... InternetBanking CreditCard
## count 4.980000e+03 4.980000e+03 ... 4.980000e+03 4.980000e+03
## mean -9.131473e-17 -9.488171e-17 ... 6.705925e-17 -9.060133e-17
## std 1.000100e+00 1.000100e+00 ... 1.000100e+00 1.000100e+00
## min -1.738927e+00 -1.949969e+00 ... -1.217601e+00 -6.459012e-01
## 25% -8.655847e-01 -9.031279e-01 ... -1.217601e+00 -6.459012e-01
## 50% 1.075332e-04 -3.076058e-02 ... 8.212871e-01 -6.459012e-01
## 75% 8.657997e-01 8.416067e-01 ... 8.212871e-01 1.548224e+00
## max 1.731492e+00 1.888448e+00 ... 8.212871e-01 1.548224e+00
##
## [8 rows x 12 columns]
# Same split parameters as before, this time on the z-scored features
x_trains, x_tests, y_trains, y_tests = train_test_split(XScaled, Y, test_size=0.3, random_state=1)
clf = svm.SVC(gamma=0.25, C=10)
clf.fit(x_trains , y_trains)
## SVC(C=10, gamma=0.25)
y_predicts = clf.predict(x_tests)
# performance
print(f'Accuracy Score: {accuracy_score(y_tests,y_predicts)}')
## Accuracy Score: 0.9692101740294511
print(f'Confusion Matrix: \n{confusion_matrix(y_tests, y_predicts)}')
## Confusion Matrix:
## [[1333 8]
## [ 38 115]]
print(f'Area Under Curve: {roc_auc_score(y_tests, y_predicts)}')
## Area Under Curve: 0.8728341448436198
print(f'Recall score: {recall_score(y_tests,y_predicts)}')
## Recall score: 0.7516339869281046
print(f'Precision score: {precision_score(y_tests,y_predicts)}')
## Precision score: 0.9349593495934959
print(f'f1 score: {f1_score(y_tests,y_predicts)}')
## f1 score: 0.8333333333333333
# Build decision tree model (no depth limit)
from sklearn.tree import DecisionTreeClassifier
dTree = DecisionTreeClassifier(criterion = 'gini', random_state=1)
dTree.fit(x_train, y_train)
## DecisionTreeClassifier(random_state=1)
# Scoring our DT: accuracy on the training split, then on the test split
print(dTree.score(x_train, y_train))
## 1.0
print(dTree.score(x_test, y_test))
## 0.9792503346720214
y_predict = dTree.predict(x_test)
# performance
print(f'Accuracy Score: {accuracy_score(y_test,y_predict)}')
## Accuracy Score: 0.9792503346720214
print(f'Confusion Matrix: \n{confusion_matrix(y_test, y_predict)}')
## Confusion Matrix:
## [[1327 14]
## [ 17 136]]
print(f'Area Under Curve: {roc_auc_score(y_test, y_predict)}')
## Area Under Curve: 0.9392244593586876
print(f'Recall score: {recall_score(y_test,y_predict)}')
## Recall score: 0.8888888888888888
print(f'Precision score: {precision_score(y_test,y_predict)}')
## Precision score: 0.9066666666666666
print(f'f1 score: {f1_score(y_test,y_predict)}')
## f1 score: 0.8976897689768976
# Reducing over fitting (Regularization): cap the tree depth at 5
dTreeR = DecisionTreeClassifier(criterion = 'gini', max_depth = 5, random_state=1)
dTreeR.fit(x_train, y_train)
## DecisionTreeClassifier(max_depth=5, random_state=1)
# Accuracy on the training split, then on the test split
print(dTreeR.score(x_train, y_train))
## 0.9896729776247849
print(dTreeR.score(x_test, y_test))
## 0.9832663989290495
y_predictR = dTreeR.predict(x_test)
# performance
print(f'Accuracy Score: {accuracy_score(y_test,y_predictR)}')
## Accuracy Score: 0.9832663989290495
print(f'Confusion Matrix: \n{confusion_matrix(y_test, y_predictR)}')
## Confusion Matrix:
## [[1335 6]
## [ 19 134]]
print(f'Area Under Curve: {roc_auc_score(y_test, y_predictR)}')
## Area Under Curve: 0.9356713602667018
print(f'Recall score: {recall_score(y_test,y_predictR)}')
## Recall score: 0.8758169934640523
print(f'Precision score: {precision_score(y_test,y_predictR)}')
## Precision score: 0.9571428571428572
print(f'f1 score: {f1_score(y_test,y_predictR)}')
## f1 score: 0.9146757679180887
# Decision Tree Visualize: fit a shallow tree (depth 3) and save its plot
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree
dTreeR3 = DecisionTreeClassifier(criterion = 'gini', max_depth = 3, random_state=1)
dTreeR3.fit(x_train, y_train)
## DecisionTreeClassifier(max_depth=3, random_state=1)
fn = list(x_train)  # feature names, taken from the training columns
cn = ['0', '1']  # class labels shown in the plot
fig, axes = plt.subplots(nrows = 1,ncols = 1,figsize = (4, 4), dpi=300)
# Draw on the axes created above instead of relying on the implicit current axes
plot_tree(dTreeR3, feature_names = fn, class_names=cn, filled = True, ax=axes)
fig.savefig('tree.png')
# Random forest with default settings (fixed seed), trained on the unscaled split
from sklearn.ensemble import RandomForestClassifier
rfcl = RandomForestClassifier(random_state=1)
rfcl = rfcl.fit(x_train, y_train)
# Predict on the test split
y_predict = rfcl.predict(x_test)
# performance
print(f'Accuracy Score: {accuracy_score(y_test,y_predict)}')
## Accuracy Score: 0.9846050870147256
print(f'Confusion Matrix: \n{confusion_matrix(y_test, y_predict)}')
## Confusion Matrix:
## [[1339 2]
## [ 21 132]]
print(f'Area Under Curve: {roc_auc_score(y_test, y_predict)}')
## Area Under Curve: 0.9306268368644999
print(f'Recall score: {recall_score(y_test,y_predict)}')
## Recall score: 0.8627450980392157
print(f'Precision score: {precision_score(y_test,y_predict)}')
## Precision score: 0.9850746268656716
print(f'f1 score: {f1_score(y_test,y_predict)}')
## f1 score: 0.9198606271777003
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from collections import Counter
# summarize class distribution
counter = Counter(Y)
print(counter)
## Counter({0.0: 4500, 1.0: 480})
# define pipeline: oversample the minority class with SMOTE, then randomly
# undersample the majority class. Both steps get a fixed seed so that the
# resampled data set is reproducible between runs.
over = SMOTE(sampling_strategy=0.3, random_state=1)
under = RandomUnderSampler(sampling_strategy=0.5, random_state=1)
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)
# transform the dataset
Xb, Yb = pipeline.fit_resample(XScaled, Y)
# summarize the new class distribution
counter = Counter(Yb)
print(counter)
## Counter({0.0: 2700, 1.0: 1350})
# NOTE(review): the split below is taken from the already resampled data, so
# the test part also contains SMOTE-generated rows; consider splitting first
# and resampling only the training part.
x_trainb, x_testb, y_trainb, y_testb = train_test_split(Xb, Yb, test_size=0.3, random_state=1)
# 1 is just any random seed number
# Support vector classifier trained on the resampled (balanced) split
clf = svm.SVC(gamma=0.25, C=10)
clf.fit(x_trainb , y_trainb)
## SVC(C=10, gamma=0.25)
y_predictb = clf.predict(x_testb)
# performance
print(f'Accuracy Score: {accuracy_score(y_testb,y_predictb)}')
## Accuracy Score: 0.9827160493827161
print(f'Confusion Matrix: \n{confusion_matrix(y_testb, y_predictb)}')
## Confusion Matrix:
## [[804 11]
## [ 10 390]]
print(f'Area Under Curve: {roc_auc_score(y_testb, y_predictb)}')
## Area Under Curve: 0.9807515337423313
print(f'Recall score: {recall_score(y_testb,y_predictb)}')
## Recall score: 0.975
print(f'Precision score: {precision_score(y_testb,y_predictb)}')
## Precision score: 0.972568578553616
print(f'f1 score: {f1_score(y_testb,y_predictb)}')
## f1 score: 0.9737827715355805
# Random forest (fixed seed) trained on the resampled split; rebinds rfcl
rfcl = RandomForestClassifier(random_state=1)
rfcl = rfcl.fit(x_trainb, y_trainb)
# Predict on the resampled test split
y_predict = rfcl.predict(x_testb)
# performance
print(f'Accuracy Score: {accuracy_score(y_testb,y_predict)}')
## Accuracy Score: 0.9827160493827161
print(f'Confusion Matrix: \n{confusion_matrix(y_testb, y_predict)}')
## Confusion Matrix:
## [[807 8]
## [ 13 387]]
print(f'Area Under Curve: {roc_auc_score(y_testb, y_predict)}')
## Area Under Curve: 0.9788420245398772
print(f'Recall score: {recall_score(y_testb,y_predict)}')
## Recall score: 0.9675
print(f'Precision score: {precision_score(y_testb,y_predict)}')
## Precision score: 0.979746835443038
print(f'f1 score: {f1_score(y_testb,y_predict)}')
## f1 score: 0.9735849056603773
from sklearn.model_selection import GridSearchCV
# Exhaustive search over C, gamma and kernel for the SVC on the resampled
# training data; refit=True retrains the best combination at the end.
param_grid = {'C': [0.1,1, 10, 100], 'gamma': [1,0.1,0.25,0.01],'kernel': ['rbf', 'poly', 'sigmoid']}
grid = GridSearchCV(svm.SVC(),param_grid,refit=True,verbose=2)
grid.fit(x_trainb,y_trainb)
## GridSearchCV(estimator=SVC(), param_grid={'C': [0.1, 1, 10, 100], 'gamma': [1, 0.1, 0.25, 0.01], 'kernel': ['rbf', 'poly', 'sigmoid']}, verbose=2)
print(grid.best_estimator_)
## SVC(C=10, gamma=0.1)
# Pickle model file
import pickle
filename = 'finalized_model.sav'
# Context managers close the file handles once the model is written / read
with open(filename, 'wb') as model_file:
    pickle.dump(rfcl, model_file)
# Checking the pickle model.
# NOTE: pickle.load must only be used on files from a trusted source; here it
# reads back the file written just above.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)
result = loaded_model.predict(x_testb)
# performance
print(f'Accuracy Score: {accuracy_score(y_testb,result)}')
## Accuracy Score: 0.9827160493827161
print(f'Confusion Matrix: \n{confusion_matrix(y_testb, result)}')
## Confusion Matrix:
## [[807 8]
## [ 13 387]]
print(f'Area Under Curve: {roc_auc_score(y_testb, result)}')
## Area Under Curve: 0.9788420245398772
print(f'Recall score: {recall_score(y_testb,result)}')
## Recall score: 0.9675
print(f'Precision score: {precision_score(y_testb,result)}')
## Precision score: 0.979746835443038
print(f'f1 score: {f1_score(y_testb,result)}')
## f1 score: 0.9735849056603773
# Comment: As all data attributes are quantitative, we don't need any data transformation here.