# Load data
bank <- read.csv("UniversalBank.csv")
names(bank)
## [1] "ID" "Age" "Experience"
## [4] "Income" "ZIP.Code" "Family"
## [7] "CCAvg" "Education" "Mortgage"
## [10] "Personal.Loan" "Securities.Account" "CD.Account"
## [13] "Online" "CreditCard"
head(bank,10)
## ID Age Experience Income ZIP.Code Family CCAvg Education Mortgage
## 1 1 25 1 49 91107 4 1.6 1 0
## 2 2 45 19 34 90089 3 1.5 1 0
## 3 3 39 15 11 94720 1 1.0 1 0
## 4 4 35 9 100 94112 1 2.7 2 0
## 5 5 35 8 45 91330 4 1.0 2 0
## 6 6 37 13 29 92121 4 0.4 2 155
## 7 7 53 27 72 91711 2 1.5 2 0
## 8 8 50 24 22 93943 1 0.3 3 0
## 9 9 35 10 81 90089 3 0.6 2 104
## 10 10 34 9 180 93023 1 8.9 3 0
## Personal.Loan Securities.Account CD.Account Online CreditCard
## 1 0 1 0 0 0
## 2 0 1 0 0 0
## 3 0 0 0 0 0
## 4 0 0 0 0 0
## 5 0 0 0 0 1
## 6 0 0 0 1 0
## 7 0 0 0 1 0
## 8 0 0 0 0 1
## 9 0 0 0 1 0
## 10 1 0 0 0 0
str(bank)
## 'data.frame': 5000 obs. of 14 variables:
## $ ID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Age : int 25 45 39 35 35 37 53 50 35 34 ...
## $ Experience : int 1 19 15 9 8 13 27 24 10 9 ...
## $ Income : int 49 34 11 100 45 29 72 22 81 180 ...
## $ ZIP.Code : int 91107 90089 94720 94112 91330 92121 91711 93943 90089 93023 ...
## $ Family : int 4 3 1 1 4 4 2 1 3 1 ...
## $ CCAvg : num 1.6 1.5 1 2.7 1 0.4 1.5 0.3 0.6 8.9 ...
## $ Education : int 1 1 1 2 2 2 2 3 2 3 ...
## $ Mortgage : int 0 0 0 0 0 155 0 0 104 0 ...
## $ Personal.Loan : int 0 0 0 0 0 0 0 0 0 1 ...
## $ Securities.Account: int 1 1 0 0 0 0 0 0 0 0 ...
## $ CD.Account : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Online : int 0 0 0 0 0 1 1 0 1 0 ...
## $ CreditCard : int 0 0 0 0 1 0 0 1 0 0 ...
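# Optional sanity check (sketch, not in the original run): scan the numeric
# ranges for oddities, e.g. any negative Experience values.
summary(bank$Experience)
sum(bank$Experience < 0)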
# Adapted from Lab 3 partial solution v7.
# 1.1 Clean up data ------------------------------------------------------------
# Drop the 'ID' and 'ZIP.Code' columns (columns 1 and 5):
bank <- bank[ , -c(1, 5)]
names(bank)
## [1] "Age" "Experience" "Income"
## [4] "Family" "CCAvg" "Education"
## [7] "Mortgage" "Personal.Loan" "Securities.Account"
## [10] "CD.Account" "Online" "CreditCard"
# Reorder variables. Put the response last.
bank <- bank[ , c(1:7, 9:12, 8)]
head(bank)
## Age Experience Income Family CCAvg Education Mortgage Securities.Account
## 1 25 1 49 4 1.6 1 0 1
## 2 45 19 34 3 1.5 1 0 1
## 3 39 15 11 1 1.0 1 0 0
## 4 35 9 100 1 2.7 2 0 0
## 5 35 8 45 4 1.0 2 0 0
## 6 37 13 29 4 0.4 2 155 0
## CD.Account Online CreditCard Personal.Loan
## 1 0 0 0 0
## 2 0 0 0 0
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0 0 1 0
## 6 0 1 0 0
# Set categorical variables as factors:
bank$Education <- as.factor(bank$Education)
bank$Securities.Account <- as.factor(bank$Securities.Account)
bank$CD.Account <- as.factor(bank$CD.Account)
bank$Online <- as.factor(bank$Online)
bank$CreditCard <- as.factor(bank$CreditCard)
bank$Personal.Loan <- factor(bank$Personal.Loan,
levels = c("0", "1"),
labels = c("No", "Yes"))
table(bank$Personal.Loan)
##
## No Yes
## 4520 480
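# An equivalent, more compact form (sketch): convert all five categorical
# columns in one pass. Re-running it here is harmless, since as.factor() on
# a factor is a no-op.
cat_vars <- c("Education", "Securities.Account", "CD.Account",
              "Online", "CreditCard")
bank[cat_vars] <- lapply(bank[cat_vars], as.factor)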
# 1.2 Set training and validation sets ------------------------------------
set.seed(666)
train_index <- sample(1:nrow(bank), 0.6 * nrow(bank))
valid_index <- setdiff(1:nrow(bank), train_index)
train <- bank[train_index, ]
valid <- bank[valid_index, ]
nrow(train)
## [1] 3000
nrow(valid)
## [1] 2000
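# Optional check (sketch, not in the original run): the random split should
# roughly preserve the overall 9.6% "Yes" rate in each partition.
prop.table(table(train$Personal.Loan))
prop.table(table(valid$Personal.Loan))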
# 2. Define new customer --------------------------------------------------
new_cust <- data.frame(Age = 40,
Experience = 10,
Income = 84,
Family = 2,
CCAvg = 2,
Education = 2,
Mortgage = 0,
Securities.Account = 0,
CD.Account = 0,
Online = 1,
CreditCard = 1)
# Set categorical variables as factors, using the levels from `bank` so this
# one-row data frame keeps all levels (a single-level factor would otherwise
# break predict() later).
new_cust$Education <- factor(new_cust$Education,
                             levels = levels(bank$Education))
new_cust$Securities.Account <- factor(new_cust$Securities.Account,
                                      levels = levels(bank$Securities.Account))
new_cust$CD.Account <- factor(new_cust$CD.Account,
                              levels = levels(bank$CD.Account))
new_cust$Online <- factor(new_cust$Online,
                          levels = levels(bank$Online))
new_cust$CreditCard <- factor(new_cust$CreditCard,
                              levels = levels(bank$CreditCard))
new_cust
## Age Experience Income Family CCAvg Education Mortgage Securities.Account
## 1 40 10 84 2 2 2 0 0
## CD.Account Online CreditCard
## 1 0 1 1
# 3. Prepare for kNN -------------------------------------------------------------
# Normalisation, only for the numerical variables. preProcess() comes from
# caret, which also supplies knn3() and confusionMatrix() below.
library(caret)
norm_values <- preProcess(train[, -c(6, 8:12)],
                          method = c("center",
                                     "scale"))
# Then normalise the training and validation sets.
train_norm <- predict(norm_values, train)
valid_norm <- predict(norm_values, valid)
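# Quick sanity check (sketch, not in the original run): the centred/scaled
# training columns should have mean ~0 and sd 1; the validation columns will
# only be approximately standardised, because they reuse the training means
# and standard deviations.
round(sapply(train_norm[, c("Age", "Income", "CCAvg")], mean), 3)
round(sapply(train_norm[, c("Age", "Income", "CCAvg")], sd), 3)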
# 4. Train kNN for predictions ----------------------------------------
# 4.1 k = 3 ---------------------------------------------------------------
# Train k = 3
knn_model_k3 <- caret::knn3(Personal.Loan ~ .,
data = train_norm, k = 3)
knn_model_k3
## 3-nearest neighbor model
## Training set outcome distribution:
##
## No Yes
## 2692 308
# Predict training set
knn_pred_k3_train <- predict(knn_model_k3,
newdata = train_norm[, -c(12)],
type = "class")
head(knn_pred_k3_train)
## [1] No No No No No No
## Levels: No Yes
# Evaluate
confusionMatrix(knn_pred_k3_train, as.factor(train_norm[, 12]),
positive = "Yes")
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 2688 70
## Yes 4 238
##
## Accuracy : 0.9753
## 95% CI : (0.9691, 0.9806)
## No Information Rate : 0.8973
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.8521
##
## Mcnemar's Test P-Value : 4.153e-14
##
## Sensitivity : 0.77273
## Specificity : 0.99851
## Pos Pred Value : 0.98347
## Neg Pred Value : 0.97462
## Prevalence : 0.10267
## Detection Rate : 0.07933
## Detection Prevalence : 0.08067
## Balanced Accuracy : 0.88562
##
## 'Positive' Class : Yes
##
# 4.2 k = 5 ---------------------------------------------------------------
# Train k = 5
knn_model_k5 <- caret::knn3(Personal.Loan ~ .,
data = train_norm, k = 5)
knn_model_k5
## 5-nearest neighbor model
## Training set outcome distribution:
##
## No Yes
## 2692 308
# Predict training set
knn_pred_k5_train <- predict(knn_model_k5,
newdata = train_norm[, -c(12)],
type = "class")
head(knn_pred_k5_train)
## [1] No No No No No No
## Levels: No Yes
# Evaluate
confusionMatrix(knn_pred_k5_train, as.factor(train_norm[, 12]),
positive = "Yes")
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 2687 95
## Yes 5 213
##
## Accuracy : 0.9667
## 95% CI : (0.9596, 0.9728)
## No Information Rate : 0.8973
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.7922
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.69156
## Specificity : 0.99814
## Pos Pred Value : 0.97706
## Neg Pred Value : 0.96585
## Prevalence : 0.10267
## Detection Rate : 0.07100
## Detection Prevalence : 0.07267
## Balanced Accuracy : 0.84485
##
## 'Positive' Class : Yes
##
# 4.3 k = 7 ---------------------------------------------------------------
# Train k = 7
knn_model_k7 <- caret::knn3(Personal.Loan ~ .,
data = train_norm, k = 7)
knn_model_k7
## 7-nearest neighbor model
## Training set outcome distribution:
##
## No Yes
## 2692 308
# Predict training set
knn_pred_k7_train <- predict(knn_model_k7,
newdata = train_norm[, -c(12)],
type = "class")
head(knn_pred_k7_train)
## [1] No No No No No No
## Levels: No Yes
# Evaluate
confusionMatrix(knn_pred_k7_train, as.factor(train_norm[, 12]),
positive = "Yes")
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 2691 123
## Yes 1 185
##
## Accuracy : 0.9587
## 95% CI : (0.9509, 0.9655)
## No Information Rate : 0.8973
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.728
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.60065
## Specificity : 0.99963
## Pos Pred Value : 0.99462
## Neg Pred Value : 0.95629
## Prevalence : 0.10267
## Detection Rate : 0.06167
## Detection Prevalence : 0.06200
## Balanced Accuracy : 0.80014
##
## 'Positive' Class : Yes
##
# 4.4 Predict the validation set ----------------------------------------------
# Use k = 3 on the validation set
knn_pred_k3_valid <- predict(knn_model_k3,
newdata = valid_norm[, -c(12)],
type = "class")
head(knn_pred_k3_valid)
## [1] No No No No No No
## Levels: No Yes
# Evaluate
confusionMatrix(knn_pred_k3_valid, as.factor(valid_norm[, 12]),
positive = "Yes")
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 1824 73
## Yes 4 99
##
## Accuracy : 0.9615
## 95% CI : (0.9521, 0.9695)
## No Information Rate : 0.914
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.7007
##
## Mcnemar's Test P-Value : 9.239e-15
##
## Sensitivity : 0.5756
## Specificity : 0.9978
## Pos Pred Value : 0.9612
## Neg Pred Value : 0.9615
## Prevalence : 0.0860
## Detection Rate : 0.0495
## Detection Prevalence : 0.0515
## Balanced Accuracy : 0.7867
##
## 'Positive' Class : Yes
##
library(ROSE)
## Loaded ROSE 0.0-4
ROSE::roc.curve(valid_norm$Personal.Loan,
knn_pred_k3_valid)
## Area under the curve (AUC): 0.787
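# Note (sketch, not in the original run): passing hard class labels to
# roc.curve() scores only a single operating point. The predicted "Yes"
# probabilities from knn3 trace the full curve instead:
knn_prob_k3_valid <- predict(knn_model_k3,
                             newdata = valid_norm[, -c(12)],
                             type = "prob")
ROSE::roc.curve(valid_norm$Personal.Loan,
                knn_prob_k3_valid[, "Yes"])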
# Use k = 5 on the validation set
knn_pred_k5_valid <- predict(knn_model_k5,
newdata = valid_norm[, -c(12)],
type = "class")
head(knn_pred_k5_valid)
## [1] No No No No No No
## Levels: No Yes
# Evaluate
confusionMatrix(knn_pred_k5_valid, as.factor(valid_norm[, 12]),
positive = "Yes")
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 1823 87
## Yes 5 85
##
## Accuracy : 0.954
## 95% CI : (0.9439, 0.9628)
## No Information Rate : 0.914
## P-Value [Acc > NIR] : 2.771e-12
##
## Kappa : 0.6268
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.4942
## Specificity : 0.9973
## Pos Pred Value : 0.9444
## Neg Pred Value : 0.9545
## Prevalence : 0.0860
## Detection Rate : 0.0425
## Detection Prevalence : 0.0450
## Balanced Accuracy : 0.7457
##
## 'Positive' Class : Yes
##
ROSE::roc.curve(valid_norm$Personal.Loan,
knn_pred_k5_valid)
## Area under the curve (AUC): 0.746
# Use k = 7 on the validation set
knn_pred_k7_valid <- predict(knn_model_k7,
newdata = valid_norm[, -c(12)],
type = "class")
head(knn_pred_k7_valid)
## [1] No No No No No No
## Levels: No Yes
# Evaluate
confusionMatrix(knn_pred_k7_valid, as.factor(valid_norm[, 12]),
positive = "Yes")
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 1826 97
## Yes 2 75
##
## Accuracy : 0.9505
## 95% CI : (0.9401, 0.9596)
## No Information Rate : 0.914
## P-Value [Acc > NIR] : 2.389e-10
##
## Kappa : 0.5801
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.4360
## Specificity : 0.9989
## Pos Pred Value : 0.9740
## Neg Pred Value : 0.9496
## Prevalence : 0.0860
## Detection Rate : 0.0375
## Detection Prevalence : 0.0385
## Balanced Accuracy : 0.7175
##
## 'Positive' Class : Yes
##
ROSE::roc.curve(valid_norm$Personal.Loan,
knn_pred_k7_valid)
## Area under the curve (AUC): 0.717
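# A compact alternative (sketch, not in the original run): tune k in one
# loop instead of fitting each model by hand, then pick the k with the
# highest validation accuracy (or AUC).
k_grid <- c(1, 3, 5, 7, 9)
valid_acc <- sapply(k_grid, function(k) {
  m <- caret::knn3(Personal.Loan ~ ., data = train_norm, k = k)
  p <- predict(m, newdata = valid_norm[, -c(12)], type = "class")
  mean(p == valid_norm$Personal.Loan)
})
data.frame(k = k_grid, validation_accuracy = valid_acc)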
# 5. Use kNN for the new customer, k = 5 ---------------------------------------
# Note: on the validation set, k = 3 actually posted higher accuracy (0.9615
# vs 0.954) and AUC (0.787 vs 0.746), so k = 5 is a judgment call rather
# than the clear winner. Re-confirm the k = 5 model on the validation set:
knn_cust_k5_valid <- predict(knn_model_k5,
                             newdata = valid_norm[, -c(12)],
                             type = "class")
head(knn_cust_k5_valid)
## [1] No No No No No No
## Levels: No Yes
confusionMatrix(knn_cust_k5_valid, as.factor(valid_norm[, 12]),
positive = "Yes")
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 1823 87
## Yes 5 85
##
## Accuracy : 0.954
## 95% CI : (0.9439, 0.9628)
## No Information Rate : 0.914
## P-Value [Acc > NIR] : 2.771e-12
##
## Kappa : 0.6268
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.4942
## Specificity : 0.9973
## Pos Pred Value : 0.9444
## Neg Pred Value : 0.9545
## Prevalence : 0.0860
## Detection Rate : 0.0425
## Detection Prevalence : 0.0450
## Balanced Accuracy : 0.7457
##
## 'Positive' Class : Yes
##
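# The block above re-confirms the k = 5 model on the validation set; the new
# customer defined in section 2 still has to be classified. A sketch of that
# step (not run in the original, so no output is shown): apply the same
# normalisation, then predict the class and the class probabilities.
new_cust_norm <- predict(norm_values, new_cust)
predict(knn_model_k5, newdata = new_cust_norm, type = "class")
predict(knn_model_k5, newdata = new_cust_norm, type = "prob")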
# 6. Answers --------------------------------------------------------------
# "How good is the model based on your evaluation?"
# On the validation set, the k = 5 model reaches 95.4% accuracy, an
# improvement over the No Information Rate (NIR) of 91.4%.
# The model is excellent at identifying "No" customers: 99.73% of actual
# "No"s are classified correctly (specificity), and 95.45% of predicted
# "No"s are correct (negative predictive value).
# It is substantially weaker at catching "Yes" customers: only 49.42% of
# actual loan acceptors are identified (sensitivity).
# Precision (PPV) is 94.44%, so when the model does predict "Yes" it is
# almost certainly right; however, the low detection prevalence (4.5%)
# shows that it rarely predicts "Yes" in the first place.
# Balanced accuracy is only 74.57%, dragged down by the weak sensitivity;
# I would want at least 80-85% before drawing practical conclusions.
# Overall, the model has clear strengths and weaknesses: high accuracy and
# high positive/negative predictive values, but it flags too few "Yes"
# customers, and its sensitivity is sub-optimal.
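# One lever on the weak sensitivity (sketch, not in the original run): score
# the validation set with probabilities and lower the "Yes" cutoff below the
# implicit 0.5 used by type = "class". The 0.3 cutoff here is illustrative.
knn_prob_k5_valid <- predict(knn_model_k5,
                             newdata = valid_norm[, -c(12)],
                             type = "prob")
pred_cutoff <- factor(ifelse(knn_prob_k5_valid[, "Yes"] >= 0.3, "Yes", "No"),
                      levels = c("No", "Yes"))
confusionMatrix(pred_cutoff, valid_norm$Personal.Loan, positive = "Yes")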