First, create our model.
Second, choose the best records in line with our variable of interest (e.g. chosing the best 10% most likely to accept the loan). We do this by contructing lift charts or decile charts.
rm(list = ls()) # clean environment
cat("\014") # clean console
setwd("~/UNIGE/MASTERS/MaBAn (2020-2022)/Part 1/CVTDM/HW2")
library(class)
library(caret)
library(dummies)
library(tibble)
library(ggplot2)
library(ggpmisc)
library(reshape)
library(e1071)
library(emo)
# Importing the data
bank <- read.csv("UniversalBank.csv", header = TRUE, sep = ",")
t(t(names(bank)))
## [,1]
## [1,] "ID"
## [2,] "Age"
## [3,] "Experience"
## [4,] "Income"
## [5,] "ZIP.Code"
## [6,] "Family"
## [7,] "CCAvg"
## [8,] "Education"
## [9,] "Mortgage"
## [10,] "Personal.Loan"
## [11,] "Securities.Account"
## [12,] "CD.Account"
## [13,] "Online"
## [14,] "CreditCard"
str(bank)
## 'data.frame': 5000 obs. of 14 variables:
## $ ID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Age : int 25 45 39 35 35 37 53 50 35 34 ...
## $ Experience : int 1 19 15 9 8 13 27 24 10 9 ...
## $ Income : int 49 34 11 100 45 29 72 22 81 180 ...
## $ ZIP.Code : int 91107 90089 94720 94112 91330 92121 91711 93943 90089 93023 ...
## $ Family : int 4 3 1 1 4 4 2 1 3 1 ...
## $ CCAvg : num 1.6 1.5 1 2.7 1 0.4 1.5 0.3 0.6 8.9 ...
## $ Education : int 1 1 1 2 2 2 2 3 2 3 ...
## $ Mortgage : int 0 0 0 0 0 155 0 0 104 0 ...
## $ Personal.Loan : int 0 0 0 0 0 0 0 0 0 1 ...
## $ Securities.Account: int 1 1 0 0 0 0 0 0 0 0 ...
## $ CD.Account : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Online : int 0 0 0 0 0 1 1 0 1 0 ...
## $ CreditCard : int 0 0 0 0 1 0 0 1 0 0 ...
# We see that many variables are considered integers, so we must convert them into factors
bank$Education = as.factor(bank$Education)
bank$Personal.Loan = as.factor(bank$Personal.Loan)
bank$Securities.Account = as.factor(bank$Securities.Account)
bank$CD.Account = as.factor(bank$CD.Account)
bank$Online = as.factor(bank$Online)
bank$CreditCard = as.factor(bank$CreditCard)
# Now, we create dummies for our education
bank_2 <-
cbind(bank[1:7], dummy(bank$Education, sep = "_"), bank[9:14])
names(bank_2)[8:10] <- c("Education_1", "Education_2", "Education_3")
attach(bank_2)
set.seed(1)
# Partitioning the data
train.obs <- sample(rownames(bank_2), dim(bank_2)[1]*0.6)
train.set <- bank_2[train.obs, c(-1,-5)] # We take away ID and ZIP.Code
valid.obs <- setdiff(rownames(bank_2), train.obs)
valid.set <- bank_2[valid.obs, c(-1,-5)]
# Normalizing the data
normalize <- function(x) {
return ((x - min(x)) / (max(x) - min(x)))
}
train.set.norm <- as.data.frame(lapply(train.set[, c(1:5, 9)], normalize))
valid.set.norm <- as.data.frame(lapply(valid.set[, c(1:5, 9)], normalize))
# Regrouping into final dataset, with replacement of non-normalized variables
train.final <- cbind(train.set.norm, train.set[, c(6:8, 10:14)])
valid.final <- cbind(valid.set.norm, valid.set[, c(6:8, 10:14)])
# Creating new customer and then normalizing its variables :
# First, only create the numerical variables, in order to normalize
new_cust <-
data.frame(
Age = 40,
Experience = 10,
Income = 84,
Family = 2,
CCAvg = 2,
Mortgage = 0
)
norm.values <- preProcess(train.set[, c(1:5, 9)], method = "range")
new_cust_temp <- predict(norm.values, new_cust)
# Add the remaining non-numerical predictors
new_cust_final <- add_column(new_cust_temp, Education_1 = 0, Education_2 = 1, Education_3 = 0, Securities.Account = 0, CD.Account = 0, Online = 1, CreditCard =1, .after = "Mortgage")
# Building the model :
set.seed(1)
k_nn <-
knn(
train = train.final[, -10],
test = new_cust_final,
cl = train.final[, 10],
k = 1
)
k_nn # The model classifies this customer as NOT accepting the personal loan (value of "0")
## [1] 0
## Levels: 0 1
set.seed(1)
# We do k-NN this time with validation data instead of the single customer
k_nn_2 <- knn(
train = train.final[, -10],
test = valid.final[, -10],
cl = train.final[, 10],
k = 1,
prob = TRUE
)
# The following is the resulting confusion matrix
confusionMatrix(data = k_nn_2, as.factor(valid.final[, 10]), positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1769 52
## 1 26 153
##
## Accuracy : 0.961
## 95% CI : (0.9516, 0.9691)
## No Information Rate : 0.8975
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.7754
##
## Mcnemar's Test P-Value : 0.004645
##
## Sensitivity : 0.7463
## Specificity : 0.9855
## Pos Pred Value : 0.8547
## Neg Pred Value : 0.9714
## Prevalence : 0.1025
## Detection Rate : 0.0765
## Detection Prevalence : 0.0895
## Balanced Accuracy : 0.8659
##
## 'Positive' Class : 1
##
# Doing Cross-validation to find best "k"
ctrl = trainControl(method = "cv",
number = 10,
summaryFunction = twoClassSummary)
set.seed(1)
knn_cross = train(
as.factor(Personal.Loan) ~.,
data = train.final,
method = "knn",
trControl = ctrl,
metric = "Spec",
tuneGrid = expand.grid(k = seq(50)),
)
# Plotting the results and finding the maximum sensitivity
ggplot(data = knn_cross, aes(x = x, y = y)) + geom_line() + stat_peaks(col = "red", span = NULL)
# Or getting the maximum with this line of code :
which.max(knn_cross$results$Spec)
## [1] 1
train() function,
“Spec” means “Sens” and viceversa! So in the graph, the y-axis
represents the SENSITIVITY, and not the specificity…confusionMatrix(data = k_nn_2, as.factor(valid.final[, 10]), positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1769 52
## 1 26 153
##
## Accuracy : 0.961
## 95% CI : (0.9516, 0.9691)
## No Information Rate : 0.8975
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.7754
##
## Mcnemar's Test P-Value : 0.004645
##
## Sensitivity : 0.7463
## Specificity : 0.9855
## Pos Pred Value : 0.8547
## Neg Pred Value : 0.9714
## Prevalence : 0.1025
## Detection Rate : 0.0765
## Detection Prevalence : 0.0895
## Balanced Accuracy : 0.8659
##
## 'Positive' Class : 1
##
set.seed(1)
# Partitioning the data (50% train, 30% validation, 20% test)
train_2.obs <- sample(rownames(bank_2), dim(bank_2)[1]*0.5)
train_2.set <- bank_2[train_2.obs, c(-1,-5)] # We take away ID and ZIP.Code
valid_2.obs <- sample(setdiff(rownames(bank_2), train_2.obs), dim(bank_2)[1]*0.3)
valid_2.set <- bank_2[valid_2.obs, c(-1,-5)]
test_2.obs <- setdiff(rownames(bank_2), union(train_2.obs, valid_2.obs))
test_2.set <- bank_2[test_2.obs, c(-1, -5)]
# Normalizing the data
train_2.set.norm <- as.data.frame(lapply(train_2.set[, c(1:5, 9)], normalize))
valid_2.set.norm <- as.data.frame(lapply(valid_2.set[, c(1:5, 9)], normalize))
test_2.set.norm <- as.data.frame(lapply(test_2.set[, c(1:5, 9)], normalize))
# Regrouping into final dataset, with replacement of non-normalized variables
train_2.final <- cbind(train_2.set.norm, train_2.set[, c(6:8, 10:14)])
valid_2.final <- cbind(valid_2.set.norm, valid_2.set[, c(6:8, 10:14)])
test_2.final <- cbind(test_2.set.norm, test_2.set[, c(6:8, 10:14)])
set.seed(1)
# We do k-NN this time with validation data instead of the single customer
k_nn_first <- knn(
train = train_2.final[, -10],
test = valid_2.final[, -10],
cl = train_2.final[, 10],
k = 1,
prob = TRUE
)
# The following is the resulting confusion matrix
confusionMatrix(data = k_nn_first, as.factor(valid_2.final[, 10]), positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1335 33
## 1 29 103
##
## Accuracy : 0.9587
## 95% CI : (0.9473, 0.9682)
## No Information Rate : 0.9093
## P-Value [Acc > NIR] : 1.294e-13
##
## Kappa : 0.746
##
## Mcnemar's Test P-Value : 0.7032
##
## Sensitivity : 0.75735
## Specificity : 0.97874
## Pos Pred Value : 0.78030
## Neg Pred Value : 0.97588
## Prevalence : 0.09067
## Detection Rate : 0.06867
## Detection Prevalence : 0.08800
## Balanced Accuracy : 0.86805
##
## 'Positive' Class : 1
##
# And now, we do k-NN with train and test sets :
k_nn_second <- knn(
train = train_2.final[, -10],
test = test_2.final[, -10],
cl = train_2.final[, 10],
k = 1,
prob = TRUE
)
# The resulting confusion matrix is :
confusionMatrix(data = k_nn_second, as.factor(test_2.final[, 10]), positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 870 23
## 1 18 89
##
## Accuracy : 0.959
## 95% CI : (0.9448, 0.9704)
## No Information Rate : 0.888
## P-Value [Acc > NIR] : 6.875e-16
##
## Kappa : 0.7898
##
## Mcnemar's Test P-Value : 0.5322
##
## Sensitivity : 0.7946
## Specificity : 0.9797
## Pos Pred Value : 0.8318
## Neg Pred Value : 0.9742
## Prevalence : 0.1120
## Detection Rate : 0.0890
## Detection Prevalence : 0.1070
## Balanced Accuracy : 0.8872
##
## 'Positive' Class : 1
##
# Selecting only CreditCard and Online
new_bank <- bank[, c(10, 13, 14)]
set.seed(1)
# Partitioning the data
train_3.obs <- sample(rownames(new_bank), dim(new_bank)[1]*0.6)
train_3.set <- new_bank[train_3.obs, ]
attach(train_3.set)
valid_3.obs <- setdiff(rownames(new_bank), train_3.obs)
valid_3.set <- new_bank[valid_3.obs, ]
attach(valid_3.set)
# Creating a new "sum" variable :
train_3.set$Sum <- rowSums(sapply(train_3.set, as.numeric))
# Putting all the columns together :
mlt <- melt(train_3.set, id=c("Online", "CreditCard", "Personal.Loan"), measure=c("Sum"))
# Finally, creating the pivot table :
cast(mlt, CreditCard + Personal.Loan ~ Online, subset=variable=="Sum", margins=c("grand_row", "grand_col"))
## Aggregation requires fun.aggregate: length used as default
## CreditCard Personal.Loan 0 1 (all)
## 1 0 0 805 1119 1924
## 2 0 1 79 119 198
## 3 1 0 332 469 801
## 4 1 1 30 47 77
## 5 (all) (all) 1246 1754 3000
table() is easier to do, as
shown in the following few lines of code :bank_ordered <- data.frame("CreditCard" = train_3.set$CreditCard, "Online" = train_3.set$Online, "Personal.Loan" = train_3.set$Personal.Loan)
table(bank_ordered)
## , , Personal.Loan = 0
##
## Online
## CreditCard 0 1
## 0 805 1119
## 1 332 469
##
## , , Personal.Loan = 1
##
## Online
## CreditCard 0 1
## 0 79 119
## 1 30 47
First, find the total population consisting of ONLY having both CC = 1 and Online = 1. In this case, it is 469 + 47 = 516.
Second, find how many cases are favorable. In this case, it is 47, cause we are looking for Personal.Loan = 1.
Finally, since all probabilities are based on (# Favorable cases / Total # of cases), just compute 47/516. The result is thus 0.0916 (approximately).
\[\frac{P(CC = 1, Online = 1|PL = 1)*P(PL=1)}{P(CC = 1, Online = 1|PL = 0)*P(PL=0)+ P(CC = 1, Online = 1|PL = 1)*P(PL=1)}\]
\[P(PL=1) =\frac{198 + 77}{3000}\]
\[P(PL=0) =\frac{1924 + 801}{3000}\]
\[P(CC = 1, Online = 1|PL = 1) =
\frac{47}{275}\] \[P(CC = 1, Online =
1|PL = 0) = \frac{469}{2725}\]
\[P(PL = 1 | CC = 1, Online = 1) = 0.0916\]
# Loan as function of Online
table(Personal.Loan, Online)
## Online
## Personal.Loan 0 1
## 0 690 1105
## 1 80 125
# Loan as function of CreditCard
table(Personal.Loan, CreditCard)
## CreditCard
## Personal.Loan 0 1
## 0 1269 526
## 1 139 66
\[ P(CC = 1|PL=1) = 0.322\]
\[ P(Online=1|PL=1) = 0.61\]
\[ P(PL=1) = 0.0916\]
\[ P(CC = 1|PL=0) = 0.294\] \[ P(Online=1|PL=0) = 0.583\] \[ P(PL=0) = 0.908\]
\[\frac{P(CC = 1|PL = 1)*P(PL=1)*P(Online=1|PL=1)}{P(CC = 1|PL = 1)*P(PL=1)*P(Online=1|PL=1)+ P(CC = 1|PL = 0)*P(PL=0)*P(Online=1|PL=0)}\]
\[P(PL= 1 | CC = 1, Online = 1) = \frac{0.017992}{0.017992+0.15563301} = 0.1036\]
# Running the Naïve Bayes :
NB <- naiveBayes(Personal.Loan ~., data = train_3.set)
NB
##
## Naive Bayes Classifier for Discrete Predictors
##
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
##
## A-priori probabilities:
## Y
## 0 1
## 0.90833333 0.09166667
##
## Conditional probabilities:
## Online
## Y 0 1
## 0 0.4172477 0.5827523
## 1 0.3963636 0.6036364
##
## CreditCard
## Y 0 1
## 0 0.706055 0.293945
## 1 0.720000 0.280000
##
## Sum
## Y [,1] [,2]
## 0 3.876697 0.6726704
## 1 4.883636 0.6680300
# Creating a new record :
new_obs <- data.frame(Online = factor(1), CreditCard = factor(1), Sum = 2)
# Predicting this new record using our Naïve Bayes classifier :
predict(NB, newdata = new_obs)
## [1] 0
## Levels: 0 1