# 1A
library(e1071)
library(psych)
library(caret)
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
## 
##     %+%, alpha
## Loading required package: lattice
library(rminer)
library(rmarkdown)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.1     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ lubridate 1.9.2     ✔ tibble    3.2.1
## ✔ purrr     1.0.1     ✔ tidyr     1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ ggplot2::%+%()   masks psych::%+%()
## ✖ ggplot2::alpha() masks psych::alpha()
## ✖ dplyr::filter()  masks stats::filter()
## ✖ dplyr::lag()     masks stats::lag()
## ✖ purrr::lift()    masks caret::lift()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(C50)
library(matrixStats)
## 
## Attaching package: 'matrixStats'
## 
## The following object is masked from 'package:dplyr':
## 
##     count
library(knitr)

# Record and set the working directory (a no-op as written, kept so the
# file path below is explicit when knitting in a cloud environment)
cloud_wd <- getwd()
setwd(cloud_wd)

cd <- read.csv(file = "CD_additional_modified.csv", stringsAsFactors = TRUE)

str(cd)
## 'data.frame':    4119 obs. of  21 variables:
##  $ age           : int  30 39 25 38 47 32 32 41 31 35 ...
##  $ job           : Factor w/ 12 levels "admin.","blue-collar",..: 2 8 8 8 1 8 1 3 8 2 ...
##  $ marital       : Factor w/ 4 levels "divorced","married",..: 2 3 2 2 2 3 3 2 1 2 ...
##  $ education     : Factor w/ 8 levels "basic.4y","basic.6y",..: 3 4 4 3 7 7 7 7 6 3 ...
##  $ default       : Factor w/ 3 levels "no","unknown",..: 1 1 1 1 1 1 1 2 1 2 ...
##  $ housing       : Factor w/ 3 levels "no","unknown",..: 3 1 3 2 3 1 3 3 1 1 ...
##  $ loan          : Factor w/ 3 levels "no","unknown",..: 1 1 1 2 1 1 1 1 1 1 ...
##  $ contact       : Factor w/ 2 levels "cellular","telephone": 1 2 2 2 1 1 1 1 1 2 ...
##  $ month         : Factor w/ 10 levels "apr","aug","dec",..: 7 7 5 5 8 10 10 8 8 7 ...
##  $ day_of_week   : Factor w/ 5 levels "fri","mon","thu",..: 1 1 5 1 2 3 2 2 4 3 ...
##  $ duration      : int  487 346 227 17 58 128 290 44 68 170 ...
##  $ campaign      : int  2 4 1 3 1 3 4 2 1 1 ...
##  $ pdays         : int  999 999 999 999 999 999 999 999 999 999 ...
##  $ previous      : int  0 0 0 0 0 2 0 0 1 0 ...
##  $ poutcome      : Factor w/ 3 levels "failure","nonexistent",..: 2 2 2 2 2 1 2 2 1 2 ...
##  $ emp.var.rate  : num  -1.8 1.1 1.4 1.4 -0.1 -1.1 -1.1 -0.1 -0.1 1.1 ...
##  $ cons.price.idx: num  92.9 94 94.5 94.5 93.2 ...
##  $ cons.conf.idx : num  -46.2 -36.4 -41.8 -41.8 -42 -37.5 -37.5 -42 -42 -36.4 ...
##  $ euribor3m     : num  1.31 4.86 4.96 4.96 4.19 ...
##  $ nr.employed   : num  5099 5191 5228 5228 5196 ...
##  $ y             : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
# 1B
set.seed(100)

inTrain <- createDataPartition(cd$y, p = 0.7, list = FALSE)

train_set <- cd[inTrain, ]
test_set  <- cd[-inTrain, ]

# 1C
prop.table(table(train_set$y))
## 
##      no     yes 
## 0.89043 0.10957
prop.table(table(test_set$y))
## 
##        no       yes 
## 0.8906883 0.1093117

1 Simple Decision Tree Training and Testing

# 2A
train_model1 <- C5.0(formula = y ~ ., data = train_set)
train_model1
## 
## Call:
## C5.0.formula(formula = y ~ ., data = train_set)
## 
## Classification Tree
## Number of samples: 2884 
## Number of predictors: 20 
## 
## Tree size: 38 
## 
## Non-standard options: attempt to group attributes
train1_prediction <- predict(train_model1, train_set)
test1_prediction <- predict(train_model1, test_set)

mmetric(train_set$y, train1_prediction, metric="CONF")
## $res
## NULL
## 
## $conf
##       pred
## target   no  yes
##    no  2516   52
##    yes  140  176
## 
## $roc
## NULL
## 
## $lift
## NULL
mmetric(test_set$y, test1_prediction, metric="CONF")
## $res
## NULL
## 
## $conf
##       pred
## target   no  yes
##    no  1060   40
##    yes   76   59
## 
## $roc
## NULL
## 
## $lift
## NULL
mmetric(train_set$y, train1_prediction, metric=c("ACC","TPR","PRECISION","F1"))
##        ACC       TPR1       TPR2 PRECISION1 PRECISION2        F11        F12 
##   93.34258   97.97508   55.69620   94.72892   77.19298   96.32466   64.70588
mmetric(test_set$y, test1_prediction, metric=c("ACC","TPR","PRECISION","F1"))
##        ACC       TPR1       TPR2 PRECISION1 PRECISION2        F11        F12 
##   90.60729   96.36364   43.70370   93.30986   59.59596   94.81216   50.42735
# 2B
train_model2 <- C5.0(formula = y ~ ., control = C5.0Control(CF = 0.1), data = train_set)
train_model2
## 
## Call:
## C5.0.formula(formula = y ~ ., data = train_set, control = C5.0Control(CF = 0.1))
## 
## Classification Tree
## Number of samples: 2884 
## Number of predictors: 20 
## 
## Tree size: 12 
## 
## Non-standard options: attempt to group attributes, confidence level: 0.1
plot(train_model2)

train2_prediction <- predict(train_model2, train_set)
test2_prediction <- predict(train_model2, test_set)

mmetric(train_set$y, train2_prediction, metric="CONF")
## $res
## NULL
## 
## $conf
##       pred
## target   no  yes
##    no  2512   56
##    yes  173  143
## 
## $roc
## NULL
## 
## $lift
## NULL
mmetric(test_set$y, test2_prediction, metric="CONF")
## $res
## NULL
## 
## $conf
##       pred
## target   no  yes
##    no  1061   39
##    yes   90   45
## 
## $roc
## NULL
## 
## $lift
## NULL
mmetric(train_set$y, train2_prediction, metric=c("ACC","TPR","PRECISION","F1"))
##        ACC       TPR1       TPR2 PRECISION1 PRECISION2        F11        F12 
##   92.05964   97.81931   45.25316   93.55680   71.85930   95.64059   55.53398
mmetric(test_set$y, test2_prediction, metric=c("ACC","TPR","PRECISION","F1"))
##        ACC       TPR1       TPR2 PRECISION1 PRECISION2        F11        F12 
##   89.55466   96.45455   33.33333   92.18071   53.57143   94.26921   41.09589

2 Simple Naïve Bayes Model Training and Testing

# 3A
cd_w1_nb <- naiveBayes(y ~ ., data = train_set)
#cd_w1_nb

predicted_cd_w1 <- predict(cd_w1_nb, train_set)
mmetric(train_set$y, predicted_cd_w1, metric="CONF")
## $res
## NULL
## 
## $conf
##       pred
## target   no  yes
##    no  2336  232
##    yes  122  194
## 
## $roc
## NULL
## 
## $lift
## NULL
mmetric(train_set$y, predicted_cd_w1, metric=c("ACC","TPR","PRECISION","F1"))
##        ACC       TPR1       TPR2 PRECISION1 PRECISION2        F11        F12 
##   87.72538   90.96573   61.39241   95.03662   45.53991   92.95663   52.29111
# 3B
cd_w2_nb <- naiveBayes(y ~ . - nr.employed, data = train_set)
#cd_w2_nb

predicted_cd_w2 <- predict(cd_w2_nb, train_set)
mmetric(train_set$y, predicted_cd_w2, metric="CONF")
## $res
## NULL
## 
## $conf
##       pred
## target   no  yes
##    no  2374  194
##    yes  139  177
## 
## $roc
## NULL
## 
## $lift
## NULL
mmetric(train_set$y, predicted_cd_w2, metric=c("ACC","TPR","PRECISION","F1"))
##        ACC       TPR1       TPR2 PRECISION1 PRECISION2        F11        F12 
##   88.45354   92.44548   56.01266   94.46876   47.70889   93.44617   51.52838

3 Cross-validation

# 4A
# Generic k-fold cross-validation: trains the supplied `classification`
# function on each training fold and scores the held-out fold.
cv_function <- function(df, target, nFolds, seedVal, classification, metrics_list)
{
  set.seed(seedVal)
  # Stratified folds built on the target column (passed as a column index)
  folds <- createFolds(df[, target], nFolds)
  
  cv_results <- lapply(folds, function(x)
  { 
    # Split predictors and target for this fold
    train <- df[-x, -target]
    test  <- df[x, -target]
    
    train_target <- df[-x, target]
    test_target  <- df[x, target]
    
    classification_model <- classification(train, train_target) 
    
    pred <- predict(classification_model, test)
    
    # Score the held-out fold on the caller-supplied metrics
    return(mmetric(test_target, pred, metrics_list))
    
  })
  
  cv_results_m <- as.matrix(as.data.frame(cv_results))
  
  # Per-metric mean and standard deviation across folds
  cv_mean <- as.matrix(rowMeans(cv_results_m))
  colnames(cv_mean) <- "Mean"
  
  cv_sd <- as.matrix(rowSds(cv_results_m))
  colnames(cv_sd) <- "Sd"
  
  cv_all <- cbind(cv_results_m, cv_mean, cv_sd)
  
  kable(cv_all, digits = 2)
}

# 4B 4C 4D
df <- cd
target <- 21                  # column index of the target variable y
nFolds <- 3
seedVal <- 500
classification <- naiveBayes  # learner passed into cv_function
metrics_list <- c("ACC","PRECISION","TPR","F1")

cv_function(df, target, nFolds, seedVal, classification, metrics_list)
|            | Fold1 | Fold2 | Fold3 |  Mean |   Sd |
|:-----------|------:|------:|------:|------:|-----:|
| ACC        | 88.49 | 86.53 | 86.31 | 87.11 | 1.20 |
| PRECISION1 | 95.79 | 94.74 | 94.42 | 94.98 | 0.72 |
| PRECISION2 | 48.10 | 42.06 | 40.87 | 43.67 | 3.88 |
| TPR1       | 91.09 | 89.85 | 89.94 | 90.29 | 0.69 |
| TPR2       | 67.33 | 59.60 | 56.67 | 61.20 | 5.51 |
| F11        | 93.38 | 92.23 | 92.13 | 92.58 | 0.69 |
| F12        | 56.11 | 49.32 | 47.49 | 50.97 | 4.54 |
# 5A
nFolds <- 5
cv_function(df, target, nFolds, seedVal, classification, metrics_list)
|            | Fold1 | Fold2 | Fold3 | Fold4 | Fold5 |  Mean |   Sd |
|:-----------|------:|------:|------:|------:|------:|------:|-----:|
| ACC        | 86.53 | 87.50 | 84.83 | 87.85 | 88.35 | 87.01 | 1.39 |
| PRECISION1 | 93.57 | 95.13 | 95.38 | 95.41 | 95.18 | 94.93 | 0.77 |
| PRECISION2 | 40.37 | 45.24 | 38.56 | 46.03 | 47.46 | 43.53 | 3.85 |
| TPR1       | 91.14 | 90.59 | 87.19 | 90.72 | 91.55 | 90.24 | 1.74 |
| TPR2       | 48.89 | 62.64 | 65.56 | 64.44 | 62.22 | 60.75 | 6.77 |
| F11        | 92.34 | 92.80 | 91.10 | 93.01 | 93.33 | 92.52 | 0.87 |
| F12        | 44.22 | 52.53 | 48.56 | 53.70 | 53.85 | 50.57 | 4.15 |
nFolds <- 10
cv_function(df, target, nFolds, seedVal, classification, metrics_list)
|            | Fold01 | Fold02 | Fold03 | Fold04 | Fold05 | Fold06 | Fold07 | Fold08 | Fold09 | Fold10 |  Mean |   Sd |
|:-----------|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|------:|-----:|
| ACC        |  88.11 |  84.95 |  86.86 |  86.65 |  86.65 |  86.17 |  86.41 |  89.81 |  89.59 |  86.37 | 87.16 | 1.54 |
| PRECISION1 |  94.92 |  93.70 |  94.57 |  94.83 |  94.57 |  94.03 |  94.81 |  96.30 |  97.37 |  94.80 | 94.99 | 1.08 |
| PRECISION2 |  46.55 |  36.51 |  42.62 |  42.19 |  41.94 |  40.00 |  41.54 |  52.46 |  52.11 |  41.54 | 43.75 | 5.13 |
| TPR1       |  91.55 |  89.10 |  90.44 |  89.92 |  90.19 |  90.19 |  89.65 |  92.10 |  90.74 |  89.62 | 90.35 | 0.91 |
| TPR2       |  60.00 |  51.11 |  57.78 |  60.00 |  57.78 |  53.33 |  60.00 |  71.11 |  80.43 |  60.00 | 61.15 | 8.57 |
| F11        |  93.20 |  91.34 |  92.46 |  92.31 |  92.33 |  92.07 |  92.16 |  94.15 |  93.94 |  92.13 | 92.61 | 0.88 |
| F12        |  52.43 |  42.59 |  49.06 |  49.54 |  48.60 |  45.71 |  49.09 |  60.38 |  63.25 |  49.09 | 50.97 | 6.30 |
assign("classification", C5.0)
cv_function(df, target, nFolds, seedVal, classification, metrics_list)
|            | Fold01 | Fold02 | Fold03 | Fold04 | Fold05 | Fold06 | Fold07 | Fold08 | Fold09 | Fold10 |  Mean |   Sd |
|:-----------|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|------:|-----:|
| ACC        |  90.78 |  90.29 |  88.81 |  89.56 |  91.02 |  90.05 |  92.48 |  91.75 |  89.83 |  92.46 | 90.70 | 1.23 |
| PRECISION1 |  94.10 |  93.83 |  92.55 |  93.55 |  94.12 |  94.54 |  94.21 |  93.70 |  92.88 |  94.67 | 93.81 | 0.68 |
| PRECISION2 |  58.97 |  56.41 |  48.57 |  52.50 |  60.53 |  54.35 |  71.88 |  67.74 |  55.88 |  69.44 | 59.63 | 7.74 |
| TPR1       |  95.64 |  95.37 |  95.08 |  94.82 |  95.91 |  94.28 |  97.55 |  97.28 |  95.91 |  96.99 | 95.88 | 1.09 |
| TPR2       |  51.11 |  48.89 |  37.78 |  46.67 |  51.11 |  55.56 |  51.11 |  46.67 |  41.30 |  55.56 | 48.57 | 5.71 |
| F11        |  94.86 |  94.59 |  93.80 |  94.18 |  95.01 |  94.41 |  95.85 |  95.45 |  94.37 |  95.82 | 94.83 | 0.70 |
| F12        |  54.76 |  52.38 |  42.50 |  49.41 |  55.42 |  54.95 |  59.74 |  55.26 |  47.50 |  61.73 | 53.37 | 5.70 |
nFolds <- 5
cv_function(df, target, nFolds, seedVal, classification, metrics_list)
|            | Fold1 | Fold2 | Fold3 | Fold4 | Fold5 |  Mean |   Sd |
|:-----------|------:|------:|------:|------:|------:|------:|-----:|
| ACC        | 91.63 | 90.05 | 89.44 | 89.79 | 91.87 | 90.56 | 1.11 |
| PRECISION1 | 94.04 | 93.57 | 93.66 | 92.42 | 94.53 | 93.64 | 0.78 |
| PRECISION2 | 65.22 | 55.84 | 51.81 | 55.17 | 65.33 | 58.67 | 6.22 |
| TPR1       | 96.73 | 95.36 | 94.55 | 96.45 | 96.46 | 95.91 | 0.92 |
| TPR2       | 50.00 | 47.25 | 47.78 | 35.56 | 54.44 | 47.01 | 7.00 |
| F11        | 95.37 | 94.46 | 94.10 | 94.39 | 95.48 | 94.76 | 0.62 |
| F12        | 56.60 | 51.19 | 49.71 | 43.24 | 59.39 | 52.03 | 6.29 |

4 Reflection

The first models built were decision trees. From A2, I knew that changing the confidence factor (CF) changes the complexity of the model: lowering CF to 0.1 pruned the tree size from 38 to 12, and the simpler model had lower accuracy on both the train and test sets. The second set of models used naïve Bayes. Overall training accuracy increased from 87.73 to 88.45 after removing one predictor ("nr.employed"), which suggests that some predictors are not conditionally independent but correlated with one another, violating the naïve Bayes assumption. A step like building a correlation heat map of the predictors could therefore help improve a naïve Bayes model, as sketched below. Naïve Bayes was also fast and efficient to train and evaluate. In addition to adjusting hyperparameters and the feature set, cross-validation was practiced to guard against overfitting. Ten-fold cross-validation gives a more reliable view of performance, but I think 3-fold and 5-fold are sufficient for a dataset of only a few thousand observations.
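As a concrete follow-up to the correlation point, here is a minimal sketch of that check, assuming the `cd` data frame from 1A is still in memory; the 0.7 cutoff is an illustrative choice rather than a tuned value.

num_cols <- sapply(cd, is.numeric)   # keep only the numeric predictors
cor_mat  <- cor(cd[, num_cols])      # pairwise Pearson correlations

# caret::findCorrelation() names predictors correlated with another predictor
# above the cutoff; these are candidates to drop before refitting naiveBayes()
findCorrelation(cor_mat, cutoff = 0.7, names = TRUE)

# Quick base-R heat map of the correlation matrix
heatmap(cor_mat, symm = TRUE)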