#load data
bank <- read.csv("UniversalBank.csv")
names(bank)
##  [1] "ID"                 "Age"                "Experience"        
##  [4] "Income"             "ZIP.Code"           "Family"            
##  [7] "CCAvg"              "Education"          "Mortgage"          
## [10] "Personal.Loan"      "Securities.Account" "CD.Account"        
## [13] "Online"             "CreditCard"
head(bank,10)
##    ID Age Experience Income ZIP.Code Family CCAvg Education Mortgage
## 1   1  25          1     49    91107      4   1.6         1        0
## 2   2  45         19     34    90089      3   1.5         1        0
## 3   3  39         15     11    94720      1   1.0         1        0
## 4   4  35          9    100    94112      1   2.7         2        0
## 5   5  35          8     45    91330      4   1.0         2        0
## 6   6  37         13     29    92121      4   0.4         2      155
## 7   7  53         27     72    91711      2   1.5         2        0
## 8   8  50         24     22    93943      1   0.3         3        0
## 9   9  35         10     81    90089      3   0.6         2      104
## 10 10  34          9    180    93023      1   8.9         3        0
##    Personal.Loan Securities.Account CD.Account Online CreditCard
## 1              0                  1          0      0          0
## 2              0                  1          0      0          0
## 3              0                  0          0      0          0
## 4              0                  0          0      0          0
## 5              0                  0          0      0          1
## 6              0                  0          0      1          0
## 7              0                  0          0      1          0
## 8              0                  0          0      0          1
## 9              0                  0          0      1          0
## 10             1                  0          0      0          0
str(bank)
## 'data.frame':    5000 obs. of  14 variables:
##  $ ID                : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Age               : int  25 45 39 35 35 37 53 50 35 34 ...
##  $ Experience        : int  1 19 15 9 8 13 27 24 10 9 ...
##  $ Income            : int  49 34 11 100 45 29 72 22 81 180 ...
##  $ ZIP.Code          : int  91107 90089 94720 94112 91330 92121 91711 93943 90089 93023 ...
##  $ Family            : int  4 3 1 1 4 4 2 1 3 1 ...
##  $ CCAvg             : num  1.6 1.5 1 2.7 1 0.4 1.5 0.3 0.6 8.9 ...
##  $ Education         : int  1 1 1 2 2 2 2 3 2 3 ...
##  $ Mortgage          : int  0 0 0 0 0 155 0 0 104 0 ...
##  $ Personal.Loan     : int  0 0 0 0 0 0 0 0 0 1 ...
##  $ Securities.Account: int  1 1 0 0 0 0 0 0 0 0 ...
##  $ CD.Account        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Online            : int  0 0 0 0 0 1 1 0 1 0 ...
##  $ CreditCard        : int  0 0 0 0 1 0 0 1 0 0 ...
#Adapted from Lab 3 partial solution v7.

#1.1: Clean up data:

# Dropping the 'ID' and 'ZIP.Code' columns:
bank <- bank[ , -c(1, 5)]
names(bank)
##  [1] "Age"                "Experience"         "Income"            
##  [4] "Family"             "CCAvg"              "Education"         
##  [7] "Mortgage"           "Personal.Loan"      "Securities.Account"
## [10] "CD.Account"         "Online"             "CreditCard"
# Reorder variables. Put the response last.

bank <- bank[ , c(1:7, 9:12, 8)]
head(bank)
##   Age Experience Income Family CCAvg Education Mortgage Securities.Account
## 1  25          1     49      4   1.6         1        0                  1
## 2  45         19     34      3   1.5         1        0                  1
## 3  39         15     11      1   1.0         1        0                  0
## 4  35          9    100      1   2.7         2        0                  0
## 5  35          8     45      4   1.0         2        0                  0
## 6  37         13     29      4   0.4         2      155                  0
##   CD.Account Online CreditCard Personal.Loan
## 1          0      0          0             0
## 2          0      0          0             0
## 3          0      0          0             0
## 4          0      0          0             0
## 5          0      0          1             0
## 6          0      1          0             0
# Set categorical variables as factor:

bank$Education <- as.factor(bank$Education)
bank$Securities.Account <- as.factor(bank$Securities.Account)
bank$CD.Account <- as.factor(bank$CD.Account) 
bank$Online <- as.factor(bank$Online) 
bank$CreditCard <- as.factor(bank$CreditCard) 

bank$Personal.Loan <- factor(bank$Personal.Loan,
                              levels = c("0", "1"),
                              labels = c("No", "Yes"))

table(bank$Personal.Loan)
## 
##   No  Yes 
## 4520  480
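
# Note the class imbalance: only 480 of 5,000 customers (9.6%) accepted the
# personal loan, so "Yes" is the minority class. A quick proportion check
# (a sketch, no output shown):
prop.table(table(bank$Personal.Loan))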
# 1.2. Set training and validation sets -------------------------------------

set.seed(666)

train_index <- sample(1:nrow(bank), 0.6 * nrow(bank))
valid_index <- setdiff(1:nrow(bank), train_index)

train <- bank[train_index, ]
valid <- bank[valid_index, ]



nrow(train)
## [1] 3000
nrow(valid)
## [1] 2000
# 4. Define new customer --------------------------------------------------

new_cust <- data.frame(Age = 40,
                       Experience = 10,
                       Income = 84,
                       Family = 2,
                       CCAvg = 2,
                       Education = 2,
                       Mortgage = 0,
                       Securities.Account = 0,
                       CD.Account = 0,
                       Online = 1,
                       CreditCard = 1)

# Set categorical variables as factor.

new_cust$Education <- as.factor(new_cust$Education)
new_cust$Securities.Account <- as.factor(new_cust$Securities.Account)
new_cust$CD.Account <- as.factor(new_cust$CD.Account) 
new_cust$Online <- as.factor(new_cust$Online) 
new_cust$CreditCard <- as.factor(new_cust$CreditCard) 

new_cust
##   Age Experience Income Family CCAvg Education Mortgage Securities.Account
## 1  40         10     84      2     2         2        0                  0
##   CD.Account Online CreditCard
## 1          0      1          1
# 5. prepare for kNN. -------------------------------------------------------------

# Normalisation, only for the numerical variables. caret is needed here for
# preProcess(), and later for knn3() and confusionMatrix().
library(caret)

norm_values <- preProcess(train[, -c(6, 8:12)],
                          method = c("center",
                                     "scale"))

# Then normalise the training and validation sets.

train_norm <- predict(norm_values, train)
valid_norm <- predict(norm_values, valid)
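
# "center" and "scale" turn each numeric predictor into a z-score,
# z = (x - mean_train) / sd_train, using statistics estimated on the training
# set only. A quick sanity check (a sketch, no output shown): the scaled
# training columns should have mean ~0 and sd ~1.
num_cols <- c("Age", "Experience", "Income", "Family", "CCAvg", "Mortgage")
colMeans(train_norm[, num_cols])
apply(train_norm[, num_cols], 2, sd)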

# 7. Train kNN for predictions ----------------------------------------


# 7.1 k = 3 ---------------------------------------------------------------

# Train k = 3
knn_model_k3 <- caret::knn3(Personal.Loan ~ ., 
                            data = train_norm, k = 3)
knn_model_k3
## 3-nearest neighbor model
## Training set outcome distribution:
## 
##   No  Yes 
## 2692  308
# Predict training set

knn_pred_k3_train <- predict(knn_model_k3, 
                             newdata = train_norm[, -c(12)], 
                             type = "class")
head(knn_pred_k3_train)
## [1] No No No No No No
## Levels: No Yes
# Evaluate
confusionMatrix(knn_pred_k3_train, as.factor(train_norm[, 12]),
                positive = "Yes")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   No  Yes
##        No  2688   70
##        Yes    4  238
##                                           
##                Accuracy : 0.9753          
##                  95% CI : (0.9691, 0.9806)
##     No Information Rate : 0.8973          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.8521          
##                                           
##  Mcnemar's Test P-Value : 4.153e-14       
##                                           
##             Sensitivity : 0.77273         
##             Specificity : 0.99851         
##          Pos Pred Value : 0.98347         
##          Neg Pred Value : 0.97462         
##              Prevalence : 0.10267         
##          Detection Rate : 0.07933         
##    Detection Prevalence : 0.08067         
##       Balanced Accuracy : 0.88562         
##                                           
##        'Positive' Class : Yes             
## 
# 7.2 k = 5 ---------------------------------------------------------------

# train k = 5
knn_model_k5 <- caret::knn3(Personal.Loan ~ ., 
                            data = train_norm, k = 5)
knn_model_k5
## 5-nearest neighbor model
## Training set outcome distribution:
## 
##   No  Yes 
## 2692  308
# Predict training set
knn_pred_k5_train <- predict(knn_model_k5, 
                             newdata = train_norm[, -c(12)], 
                             type = "class")
head(knn_pred_k5_train)
## [1] No No No No No No
## Levels: No Yes
# Evaluate
confusionMatrix(knn_pred_k5_train, as.factor(train_norm[, 12]),
                positive = "Yes")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   No  Yes
##        No  2687   95
##        Yes    5  213
##                                           
##                Accuracy : 0.9667          
##                  95% CI : (0.9596, 0.9728)
##     No Information Rate : 0.8973          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.7922          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.69156         
##             Specificity : 0.99814         
##          Pos Pred Value : 0.97706         
##          Neg Pred Value : 0.96585         
##              Prevalence : 0.10267         
##          Detection Rate : 0.07100         
##    Detection Prevalence : 0.07267         
##       Balanced Accuracy : 0.84485         
##                                           
##        'Positive' Class : Yes             
## 
# 7.3 k = 7 ---------------------------------------------------------------

# Train k = 7
knn_model_k7 <- caret::knn3(Personal.Loan ~ ., 
                            data = train_norm, k = 7)
knn_model_k7
## 7-nearest neighbor model
## Training set outcome distribution:
## 
##   No  Yes 
## 2692  308
# Predict training set
knn_pred_k7_train <- predict(knn_model_k7, 
                             newdata = train_norm[, -c(12)], 
                             type = "class")
head(knn_pred_k7_train)
## [1] No No No No No No
## Levels: No Yes
# Evaluate
confusionMatrix(knn_pred_k7_train, as.factor(train_norm[, 12]),
                positive = "Yes")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   No  Yes
##        No  2691  123
##        Yes    1  185
##                                           
##                Accuracy : 0.9587          
##                  95% CI : (0.9509, 0.9655)
##     No Information Rate : 0.8973          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.728           
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.60065         
##             Specificity : 0.99963         
##          Pos Pred Value : 0.99462         
##          Neg Pred Value : 0.95629         
##              Prevalence : 0.10267         
##          Detection Rate : 0.06167         
##    Detection Prevalence : 0.06200         
##       Balanced Accuracy : 0.80014         
##                                           
##        'Positive' Class : Yes             
## 
# 7.4 predict validation set ----------------------------------------------


# use k = 3

# Predict validation set

knn_pred_k3_valid <- predict(knn_model_k3, 
                             newdata = valid_norm[, -c(12)], 
                             type = "class")
head(knn_pred_k3_valid)
## [1] No No No No No No
## Levels: No Yes
# Evaluate
confusionMatrix(knn_pred_k3_valid, as.factor(valid_norm[, 12]),
                positive = "Yes")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   No  Yes
##        No  1824   73
##        Yes    4   99
##                                           
##                Accuracy : 0.9615          
##                  95% CI : (0.9521, 0.9695)
##     No Information Rate : 0.914           
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.7007          
##                                           
##  Mcnemar's Test P-Value : 9.239e-15       
##                                           
##             Sensitivity : 0.5756          
##             Specificity : 0.9978          
##          Pos Pred Value : 0.9612          
##          Neg Pred Value : 0.9615          
##              Prevalence : 0.0860          
##          Detection Rate : 0.0495          
##    Detection Prevalence : 0.0515          
##       Balanced Accuracy : 0.7867          
##                                           
##        'Positive' Class : Yes             
## 
library(ROSE)
## Loaded ROSE 0.0-4
ROSE::roc.curve(valid_norm$Personal.Loan, 
                knn_pred_k3_valid)

## Area under the curve (AUC): 0.787
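
# Note: roc.curve() above receives hard class labels, so the AUC reflects a
# single operating point. A sketch (not run above, assuming the same fitted
# model) using the positive-class probabilities instead would give the usual
# probability-based ROC:
knn_prob_k3_valid <- predict(knn_model_k3,
                             newdata = valid_norm[, -12],
                             type = "prob")
ROSE::roc.curve(valid_norm$Personal.Loan, knn_prob_k3_valid[, "Yes"])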
# use k = 5
# Predict validation set
knn_pred_k5_valid <- predict(knn_model_k5, 
                             newdata = valid_norm[, -c(12)], 
                             type = "class")
head(knn_pred_k5_valid)
## [1] No No No No No No
## Levels: No Yes
# Evaluate
confusionMatrix(knn_pred_k5_valid, as.factor(valid_norm[, 12]),
                positive = "Yes")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   No  Yes
##        No  1823   87
##        Yes    5   85
##                                           
##                Accuracy : 0.954           
##                  95% CI : (0.9439, 0.9628)
##     No Information Rate : 0.914           
##     P-Value [Acc > NIR] : 2.771e-12       
##                                           
##                   Kappa : 0.6268          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.4942          
##             Specificity : 0.9973          
##          Pos Pred Value : 0.9444          
##          Neg Pred Value : 0.9545          
##              Prevalence : 0.0860          
##          Detection Rate : 0.0425          
##    Detection Prevalence : 0.0450          
##       Balanced Accuracy : 0.7457          
##                                           
##        'Positive' Class : Yes             
## 
library(ROSE)

ROSE::roc.curve(valid_norm$Personal.Loan, 
                knn_pred_k5_valid)

## Area under the curve (AUC): 0.746
# use k = 7
# Predict validation set
knn_pred_k7_valid <- predict(knn_model_k7, 
                             newdata = valid_norm[, -c(12)], 
                             type = "class")
head(knn_pred_k7_valid)
## [1] No No No No No No
## Levels: No Yes
# Evaluate
confusionMatrix(knn_pred_k7_valid, as.factor(valid_norm[, 12]),
                positive = "Yes")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   No  Yes
##        No  1826   97
##        Yes    2   75
##                                           
##                Accuracy : 0.9505          
##                  95% CI : (0.9401, 0.9596)
##     No Information Rate : 0.914           
##     P-Value [Acc > NIR] : 2.389e-10       
##                                           
##                   Kappa : 0.5801          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.4360          
##             Specificity : 0.9989          
##          Pos Pred Value : 0.9740          
##          Neg Pred Value : 0.9496          
##              Prevalence : 0.0860          
##          Detection Rate : 0.0375          
##    Detection Prevalence : 0.0385          
##       Balanced Accuracy : 0.7175          
##                                           
##        'Positive' Class : Yes             
## 
library(ROSE)

ROSE::roc.curve(valid_norm$Personal.Loan, 
                knn_pred_k7_valid)

## Area under the curve (AUC): 0.717
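
# To make the k comparison above easier to repeat, a small helper loop
# (a sketch, assuming the same train_norm / valid_norm objects) can evaluate
# the candidate k values on the validation set in one pass:
for (k in c(3, 5, 7)) {
  m    <- caret::knn3(Personal.Loan ~ ., data = train_norm, k = k)
  pred <- predict(m, newdata = valid_norm[, -12], type = "class")
  cm   <- caret::confusionMatrix(pred, valid_norm$Personal.Loan, positive = "Yes")
  cat("k =", k,
      "accuracy =", round(cm$overall["Accuracy"], 4),
      "sensitivity =", round(cm$byClass["Sensitivity"], 4), "\n")
}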
# 8. use kNN for new customer, k = 5 (best model) -------------------------------------------------------------

knn_cust_k5_valid <- predict(knn_model_k5,
                             newdata = valid_norm[, -c(12)],
                             type = "class")

head(knn_cust_k5_valid)
## [1] No No No No No No
## Levels: No Yes
confusionMatrix(knn_cust_k5_valid, as.factor(valid_norm[, 12]),
                positive = "Yes")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   No  Yes
##        No  1823   87
##        Yes    5   85
##                                           
##                Accuracy : 0.954           
##                  95% CI : (0.9439, 0.9628)
##     No Information Rate : 0.914           
##     P-Value [Acc > NIR] : 2.771e-12       
##                                           
##                   Kappa : 0.6268          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.4942          
##             Specificity : 0.9973          
##          Pos Pred Value : 0.9444          
##          Neg Pred Value : 0.9545          
##              Prevalence : 0.0860          
##          Detection Rate : 0.0425          
##    Detection Prevalence : 0.0450          
##       Balanced Accuracy : 0.7457          
##                                           
##        'Positive' Class : Yes             
## 
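
# The block above re-scores the validation set with the k = 5 model. A minimal
# sketch for scoring the new customer defined in step 4 (assuming norm_values
# and knn_model_k5 are still in memory) might look like this:
new_cust_norm <- predict(norm_values, new_cust)

# Align the factor levels with the training data so the model matrix matches.
factor_cols <- c("Education", "Securities.Account", "CD.Account",
                 "Online", "CreditCard")
for (col in factor_cols) {
  new_cust_norm[[col]] <- factor(new_cust_norm[[col]],
                                 levels = levels(train_norm[[col]]))
}

predict(knn_model_k5, newdata = new_cust_norm, type = "class")  # predicted class
predict(knn_model_k5, newdata = new_cust_norm, type = "prob")   # class probabilities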
# 9. Answers --------------------------------------------------------------

#"How good is the model based on your evaluation?"

#At k = 5, the model achieves 95.4% accuracy on the validation set, an improvement over the No Information Rate (NIR) of 91.4%.

#The model is excellent at identifying "no" customers: 99.73% of actual "no" values are classified correctly (specificity).

#It is substantially weaker at catching "yes" customers: only 49.42% of actual "yes" values are identified (sensitivity).

#Precision (PPV) is 94.44%, so when the model does predict "yes", it is almost certainly right.

#However, the low sensitivity, together with a detection prevalence of only 4.5%, shows that the model rarely predicts "yes" in the first place.

#Finally, balanced accuracy is only 74.57%. I would want this to reach at least 80-85% before drawing practical conclusions from the model.

#Overall, the model has clear strengths and weaknesses: high overall accuracy and strong positive/negative predictive values, but it predicts "yes" too rarely, which leaves sensitivity and balanced accuracy sub-optimal.
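
# As a sanity check, the headline figures can be recomputed by hand from the
# k = 5 validation confusion matrix (TP = 85, FP = 5, FN = 87, TN = 1823):
TP <- 85; FP <- 5; FN <- 87; TN <- 1823
TP / (TP + FN)                          # sensitivity       ~ 0.4942
TN / (TN + FP)                          # specificity       ~ 0.9973
TP / (TP + FP)                          # PPV (precision)   ~ 0.9444
TN / (TN + FN)                          # NPV               ~ 0.9545
(TP / (TP + FN) + TN / (TN + FP)) / 2   # balanced accuracy ~ 0.7457
(TP + TN) / (TP + TN + FP + FN)         # accuracy          ~ 0.954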
