# 1A
library(e1071)
library(psych)
library(caret)
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
## Loading required package: lattice
library(rminer)
library(rmarkdown)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.1 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ lubridate 1.9.2 ✔ tibble 3.2.1
## ✔ purrr 1.0.1 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ ggplot2::%+%() masks psych::%+%()
## ✖ ggplot2::alpha() masks psych::alpha()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::lift() masks caret::lift()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(C50)
library(matrixStats)
##
## Attaching package: 'matrixStats'
##
## The following object is masked from 'package:dplyr':
##
## count
library(knitr)
# Work from the current directory, where the CSV file is expected to live
cloud_wd <- getwd()
setwd(cloud_wd)
cd <- read.csv(file = "CD_additional_modified.csv", stringsAsFactors = TRUE)
str(cd)
## 'data.frame': 4119 obs. of 21 variables:
## $ age : int 30 39 25 38 47 32 32 41 31 35 ...
## $ job : Factor w/ 12 levels "admin.","blue-collar",..: 2 8 8 8 1 8 1 3 8 2 ...
## $ marital : Factor w/ 4 levels "divorced","married",..: 2 3 2 2 2 3 3 2 1 2 ...
## $ education : Factor w/ 8 levels "basic.4y","basic.6y",..: 3 4 4 3 7 7 7 7 6 3 ...
## $ default : Factor w/ 3 levels "no","unknown",..: 1 1 1 1 1 1 1 2 1 2 ...
## $ housing : Factor w/ 3 levels "no","unknown",..: 3 1 3 2 3 1 3 3 1 1 ...
## $ loan : Factor w/ 3 levels "no","unknown",..: 1 1 1 2 1 1 1 1 1 1 ...
## $ contact : Factor w/ 2 levels "cellular","telephone": 1 2 2 2 1 1 1 1 1 2 ...
## $ month : Factor w/ 10 levels "apr","aug","dec",..: 7 7 5 5 8 10 10 8 8 7 ...
## $ day_of_week : Factor w/ 5 levels "fri","mon","thu",..: 1 1 5 1 2 3 2 2 4 3 ...
## $ duration : int 487 346 227 17 58 128 290 44 68 170 ...
## $ campaign : int 2 4 1 3 1 3 4 2 1 1 ...
## $ pdays : int 999 999 999 999 999 999 999 999 999 999 ...
## $ previous : int 0 0 0 0 0 2 0 0 1 0 ...
## $ poutcome : Factor w/ 3 levels "failure","nonexistent",..: 2 2 2 2 2 1 2 2 1 2 ...
## $ emp.var.rate : num -1.8 1.1 1.4 1.4 -0.1 -1.1 -1.1 -0.1 -0.1 1.1 ...
## $ cons.price.idx: num 92.9 94 94.5 94.5 93.2 ...
## $ cons.conf.idx : num -46.2 -36.4 -41.8 -41.8 -42 -37.5 -37.5 -42 -42 -36.4 ...
## $ euribor3m : num 1.31 4.86 4.96 4.96 4.19 ...
## $ nr.employed : num 5099 5191 5228 5228 5196 ...
## $ y : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
# 1B
set.seed(100)
inTrain <- createDataPartition(cd$y, p = 0.7, list = FALSE) # stratified 70/30 split on y
train_set <- cd[inTrain,]
test_set <- cd[-inTrain,]
# 1C
prop.table(table(train_set$y))
##
## no yes
## 0.89043 0.10957
prop.table(table(test_set$y))
##
## no yes
## 0.8906883 0.1093117
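Both splits preserve the full dataset's roughly 89/11 class balance because createDataPartition samples within each level of y. A quick sanity check against the unsplit data (not part of the original chunk):

# The stratified partitions above should both match this overall class balance
round(prop.table(table(cd$y)), 4)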
## Simple Decision Tree

### Training and Testing
# 2A
train_model1 <- C5.0(formula = y ~ ., data = train_set)
train_model1
##
## Call:
## C5.0.formula(formula = y ~ ., data = train_set)
##
## Classification Tree
## Number of samples: 2884
## Number of predictors: 20
##
## Tree size: 38
##
## Non-standard options: attempt to group attributes
train1_prediction <- predict(train_model1, train_set)
test1_prediction <- predict(train_model1, test_set)
mmetric(train_set$y, train1_prediction, metric="CONF")
## $res
## NULL
##
## $conf
## pred
## target no yes
## no 2516 52
## yes 140 176
##
## $roc
## NULL
##
## $lift
## NULL
mmetric(test_set$y, test1_prediction, metric="CONF")
## $res
## NULL
##
## $conf
## pred
## target no yes
## no 1060 40
## yes 76 59
##
## $roc
## NULL
##
## $lift
## NULL
mmetric(train_set$y, train1_prediction, metric=c("ACC","TPR","PRECISION","F1"))
## ACC TPR1 TPR2 PRECISION1 PRECISION2 F11 F12
## 93.34258 97.97508 55.69620 94.72892 77.19298 96.32466 64.70588
mmetric(test_set$y, test1_prediction, metric=c("ACC","TPR","PRECISION","F1"))
## ACC TPR1 TPR2 PRECISION1 PRECISION2 F11 F12
## 90.60729 96.36364 43.70370 93.30986 59.59596 94.81216 50.42735
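In mmetric's output the numeric suffixes index the factor levels of y, so TPR2 and PRECISION2 are the recall and precision of the "yes" class. A minimal sketch recomputing the class-2 metrics by hand from the test confusion matrix above:

# Recompute the "yes"-class metrics from the test confusion matrix (a sketch,
# not part of the original chunks); results match TPR2, PRECISION2, and F12.
conf <- table(target = test_set$y, pred = test1_prediction)
tpr2 <- 100 * conf["yes", "yes"] / sum(conf["yes", ])  # 59 / 135 ~ 43.70
prec2 <- 100 * conf["yes", "yes"] / sum(conf[, "yes"]) # 59 / 99  ~ 59.60
f12 <- 2 * tpr2 * prec2 / (tpr2 + prec2)               # ~ 50.43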
# 2B
train_model2 <- C5.0(formula = y ~ ., control = C5.0Control(CF = 0.1), data = train_set)
train_model2
##
## Call:
## C5.0.formula(formula = y ~ ., data = train_set, control = C5.0Control(CF = 0.1))
##
## Classification Tree
## Number of samples: 2884
## Number of predictors: 20
##
## Tree size: 12
##
## Non-standard options: attempt to group attributes, confidence level: 0.1
plot(train_model2)
(Figure: plot of the pruned 12-node C5.0 tree, train_model2.)
train2_prediction <- predict(train_model2, train_set)
test2_prediction <- predict(train_model2, test_set)
mmetric(train_set$y, train2_prediction, metric="CONF")
## $res
## NULL
##
## $conf
## pred
## target no yes
## no 2512 56
## yes 173 143
##
## $roc
## NULL
##
## $lift
## NULL
mmetric(test_set$y, test2_prediction, metric="CONF")
## $res
## NULL
##
## $conf
## pred
## target no yes
## no 1061 39
## yes 90 45
##
## $roc
## NULL
##
## $lift
## NULL
mmetric(train_set$y, train2_prediction, metric=c("ACC","TPR","PRECISION","F1"))
## ACC TPR1 TPR2 PRECISION1 PRECISION2 F11 F12
## 92.05964 97.81931 45.25316 93.55680 71.85930 95.64059 55.53398
mmetric(test_set$y, test2_prediction, metric=c("ACC","TPR","PRECISION","F1"))
## ACC TPR1 TPR2 PRECISION1 PRECISION2 F11 F12
## 89.55466 96.45455 33.33333 92.18071 53.57143 94.26921 41.09589
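Lowering CF makes C5.0 prune more aggressively: the default (0.25) produced a 38-node tree, while CF = 0.1 produced 12 nodes, trading some accuracy for a much simpler model. A small sketch for exploring that trade-off; the CF values here are illustrative:

# Sketch: tree size shrinks as the confidence factor is lowered
for (cf in c(0.25, 0.10, 0.05)) {
  m <- C5.0(y ~ ., data = train_set, control = C5.0Control(CF = cf))
  cat("CF =", cf, "-> tree size:", m$size, "\n")
}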
## Simple Naïve Bayes

### Model Training and Testing
# 3A
cd_w1_nb <- naiveBayes(y ~ ., data = train_set)
#cd_w1_nb
predicted_cd_w1 <- predict(cd_w1_nb, train_set)
mmetric(train_set$y, predicted_cd_w1, metric="CONF")
## $res
## NULL
##
## $conf
## pred
## target no yes
## no 2336 232
## yes 122 194
##
## $roc
## NULL
##
## $lift
## NULL
mmetric(train_set$y, predicted_cd_w1, metric=c("ACC","TPR","PRECISION","F1"))
## ACC TPR1 TPR2 PRECISION1 PRECISION2 F11 F12
## 87.72538 90.96573 61.39241 95.03662 45.53991 92.95663 52.29111
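The commented-out print of cd_w1_nb would dump every class-conditional table; a lighter-weight peek at a single predictor (the one removed in 3B below) looks like this:

# e1071::naiveBayes stores class-conditional distributions in $tables;
# for a numeric predictor, each row holds that class's mean and sd.
cd_w1_nb$tables[["nr.employed"]]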
# 3B
cd_w2_nb <- naiveBayes(y ~ . -nr.employed, data = train_set)
#cd_w2_nb
predicted_cd_w2 <- predict(cd_w2_nb, train_set)
mmetric(train_set$y, predicted_cd_w2, metric="CONF")
## $res
## NULL
##
## $conf
## pred
## target no yes
## no 2374 194
## yes 139 177
##
## $roc
## NULL
##
## $lift
## NULL
mmetric(train_set$y, predicted_cd_w2, metric=c("ACC","TPR","PRECISION","F1"))
## ACC TPR1 TPR2 PRECISION1 PRECISION2 F11 F12
## 88.45354 92.44548 56.01266 94.46876 47.70889 93.44617 51.52838
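One plausible reason dropping nr.employed helps: naïve Bayes assumes the predictors are conditionally independent given y, so redundant predictors get double-counted as evidence. A minimal check (a sketch, not part of the original analysis) of how nr.employed co-varies with the other numeric predictors:

# High absolute correlations here suggest nr.employed duplicates information
# already carried by the other economic indicators
num_cols <- sapply(train_set, is.numeric)
round(cor(train_set[, num_cols])["nr.employed", ], 2)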
## Cross-validation
# 4A
cv_function <- function(df, target, nFolds, seedVal, classification, metrics_list)
{
  set.seed(seedVal)
  # Stratified folds built on the target column; each element holds test-row indices
  folds <- createFolds(df[, target], nFolds)
  cv_results <- lapply(folds, function(x)
  {
    # Hold out fold x for testing and train on the remaining rows
    train <- df[-x, -target]
    test <- df[x, -target]
    train_target <- df[-x, target]
    test_target <- df[x, target]
    classification_model <- classification(train, train_target)
    pred <- predict(classification_model, test)
    # Use the metrics requested by the caller (the original hard-coded this list)
    return(mmetric(test_target, pred, metrics_list))
  })
  # One column per fold, plus the per-metric mean and standard deviation
  cv_results_m <- as.matrix(as.data.frame(cv_results))
  cv_mean <- as.matrix(rowMeans(cv_results_m))
  colnames(cv_mean) <- "Mean"
  cv_sd <- as.matrix(rowSds(cv_results_m))
  colnames(cv_sd) <- "Sd"
  cv_all <- cbind(cv_results_m, cv_mean, cv_sd)
  kable(cv_all, digits = 2)
}
# 4B 4C 4D
df <- cd
target <- 21 # column index of the label y
nFolds <- 3
seedVal <- 500
classification <- naiveBayes # learner function handed to cv_function
metrics_list <- c("ACC","PRECISION","TPR","F1")
cv_function(df, target, nFolds, seedVal, classification, metrics_list)
| Metric     | Fold1 | Fold2 | Fold3 |  Mean |   Sd |
|:-----------|------:|------:|------:|------:|-----:|
| ACC        | 88.49 | 86.53 | 86.31 | 87.11 | 1.20 |
| PRECISION1 | 95.79 | 94.74 | 94.42 | 94.98 | 0.72 |
| PRECISION2 | 48.10 | 42.06 | 40.87 | 43.67 | 3.88 |
| TPR1       | 91.09 | 89.85 | 89.94 | 90.29 | 0.69 |
| TPR2       | 67.33 | 59.60 | 56.67 | 61.20 | 5.51 |
| F11        | 93.38 | 92.23 | 92.13 | 92.58 | 0.69 |
| F12        | 56.11 | 49.32 | 47.49 | 50.97 | 4.54 |
# 5A
nFolds <- 5
cv_function(df, target, nFolds, seedVal, classification, metrics_list)
| Metric     | Fold1 | Fold2 | Fold3 | Fold4 | Fold5 |  Mean |   Sd |
|:-----------|------:|------:|------:|------:|------:|------:|-----:|
| ACC        | 86.53 | 87.50 | 84.83 | 87.85 | 88.35 | 87.01 | 1.39 |
| PRECISION1 | 93.57 | 95.13 | 95.38 | 95.41 | 95.18 | 94.93 | 0.77 |
| PRECISION2 | 40.37 | 45.24 | 38.56 | 46.03 | 47.46 | 43.53 | 3.85 |
| TPR1       | 91.14 | 90.59 | 87.19 | 90.72 | 91.55 | 90.24 | 1.74 |
| TPR2       | 48.89 | 62.64 | 65.56 | 64.44 | 62.22 | 60.75 | 6.77 |
| F11        | 92.34 | 92.80 | 91.10 | 93.01 | 93.33 | 92.52 | 0.87 |
| F12        | 44.22 | 52.53 | 48.56 | 53.70 | 53.85 | 50.57 | 4.15 |
nFolds <- 10
cv_function(df, target, nFolds, seedVal, classification, metrics_list)
| Metric     | Fold01 | Fold02 | Fold03 | Fold04 | Fold05 | Fold06 | Fold07 | Fold08 | Fold09 | Fold10 |  Mean |   Sd |
|:-----------|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|------:|-----:|
| ACC        |  88.11 |  84.95 |  86.86 |  86.65 |  86.65 |  86.17 |  86.41 |  89.81 |  89.59 |  86.37 | 87.16 | 1.54 |
| PRECISION1 |  94.92 |  93.70 |  94.57 |  94.83 |  94.57 |  94.03 |  94.81 |  96.30 |  97.37 |  94.80 | 94.99 | 1.08 |
| PRECISION2 |  46.55 |  36.51 |  42.62 |  42.19 |  41.94 |  40.00 |  41.54 |  52.46 |  52.11 |  41.54 | 43.75 | 5.13 |
| TPR1       |  91.55 |  89.10 |  90.44 |  89.92 |  90.19 |  90.19 |  89.65 |  92.10 |  90.74 |  89.62 | 90.35 | 0.91 |
| TPR2       |  60.00 |  51.11 |  57.78 |  60.00 |  57.78 |  53.33 |  60.00 |  71.11 |  80.43 |  60.00 | 61.15 | 8.57 |
| F11        |  93.20 |  91.34 |  92.46 |  92.31 |  92.33 |  92.07 |  92.16 |  94.15 |  93.94 |  92.13 | 92.61 | 0.88 |
| F12        |  52.43 |  42.59 |  49.06 |  49.54 |  48.60 |  45.71 |  49.09 |  60.38 |  63.25 |  49.09 | 50.97 | 6.30 |
classification <- C5.0 # switch the learner to a C5.0 decision tree
cv_function(df, target, nFolds, seedVal, classification, metrics_list)
| Metric     | Fold01 | Fold02 | Fold03 | Fold04 | Fold05 | Fold06 | Fold07 | Fold08 | Fold09 | Fold10 |  Mean |   Sd |
|:-----------|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|------:|-----:|
| ACC        |  90.78 |  90.29 |  88.81 |  89.56 |  91.02 |  90.05 |  92.48 |  91.75 |  89.83 |  92.46 | 90.70 | 1.23 |
| PRECISION1 |  94.10 |  93.83 |  92.55 |  93.55 |  94.12 |  94.54 |  94.21 |  93.70 |  92.88 |  94.67 | 93.81 | 0.68 |
| PRECISION2 |  58.97 |  56.41 |  48.57 |  52.50 |  60.53 |  54.35 |  71.88 |  67.74 |  55.88 |  69.44 | 59.63 | 7.74 |
| TPR1       |  95.64 |  95.37 |  95.08 |  94.82 |  95.91 |  94.28 |  97.55 |  97.28 |  95.91 |  96.99 | 95.88 | 1.09 |
| TPR2       |  51.11 |  48.89 |  37.78 |  46.67 |  51.11 |  55.56 |  51.11 |  46.67 |  41.30 |  55.56 | 48.57 | 5.71 |
| F11        |  94.86 |  94.59 |  93.80 |  94.18 |  95.01 |  94.41 |  95.85 |  95.45 |  94.37 |  95.82 | 94.83 | 0.70 |
| F12        |  54.76 |  52.38 |  42.50 |  49.41 |  55.42 |  54.95 |  59.74 |  55.26 |  47.50 |  61.73 | 53.37 | 5.70 |
nFolds <- 5
cv_function(df, target, nFolds, seedVal, classification, metrics_list)
| Metric     | Fold1 | Fold2 | Fold3 | Fold4 | Fold5 |  Mean |   Sd |
|:-----------|------:|------:|------:|------:|------:|------:|-----:|
| ACC        | 91.63 | 90.05 | 89.44 | 89.79 | 91.87 | 90.56 | 1.11 |
| PRECISION1 | 94.04 | 93.57 | 93.66 | 92.42 | 94.53 | 93.64 | 0.78 |
| PRECISION2 | 65.22 | 55.84 | 51.81 | 55.17 | 65.33 | 58.67 | 6.22 |
| TPR1       | 96.73 | 95.36 | 94.55 | 96.45 | 96.46 | 95.91 | 0.92 |
| TPR2       | 50.00 | 47.25 | 47.78 | 35.56 | 54.44 | 47.01 | 7.00 |
| F11        | 95.37 | 94.46 | 94.10 | 94.39 | 95.48 | 94.76 | 0.62 |
| F12        | 56.60 | 51.19 | 49.71 | 43.24 | 59.39 | 52.03 | 6.29 |
## Reflection
The first model built was a decision tree. From A2, I knew that changing the CF (confidence factor) would change the complexity of the model: lowering CF from the default 0.25 to 0.1 pruned the tree from 38 nodes to 12, and the simpler model had lower accuracy on both the train and test sets. The second model built was naïve Bayes. Its training accuracy increased from 87.73 to 88.45 after removing one of the predictors (nr.employed). This implies that some predictors are not independent of one another, which violates the naïve Bayes conditional-independence assumption; a step like creating a correlation heat map (sketched below) could flag such predictors and potentially improve a naïve Bayes model's performance. I also found naïve Bayes to be a fast and efficient model for classification tasks like this one. In addition to adjusting the hyperparameters or the dataset, cross-validation was practiced to avoid overfitting. Ten-fold cross-validation certainly gives a more accurate view of performance, but I think 3 or 5 folds are sufficient for datasets with only a few thousand observations.
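A minimal sketch of the correlation heat map suggested above, using base R's heatmap() so no extra packages are needed; clusters of highly correlated predictors are candidates for removal before refitting naïve Bayes:

# Sketch: heat map of pairwise correlations among the numeric predictors
num_cols <- sapply(cd, is.numeric)
heatmap(cor(cd[, num_cols]), symm = TRUE, margins = c(10, 10))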