# Set Up ----
# 1
library(C50)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(rminer)
library(rmarkdown)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.1 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ lubridate 1.9.2 ✔ tibble 3.2.1
## ✔ purrr 1.0.1 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::lift() masks caret::lift()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Read the balanced data set from the current working directory.
# The relative path below is already resolved against getwd(), so the
# former `setwd(cloud_wd)` call (which set the directory to itself) was a
# no-op and has been removed. `cloud_wd` is kept in case later code reads it.
cloud_wd <- getwd()
cd <- read.csv("CD_additional_balanced.csv", stringsAsFactors = FALSE)

# Inspect column types: text columns arrive as character vectors.
str(cd)
## 'data.frame': 9280 obs. of 21 variables:
## $ age : int 41 49 49 41 45 42 39 28 44 42 ...
## $ job : chr "blue-collar" "entrepreneur" "technician" "technician" ...
## $ marital : chr "divorced" "married" "married" "married" ...
## $ education : chr "basic.4y" "university.degree" "basic.9y" "professional.course" ...
## $ default : chr "unknown" "unknown" "no" "unknown" ...
## $ housing : chr "yes" "yes" "no" "yes" ...
## $ loan : chr "no" "no" "no" "no" ...
## $ contact : chr "telephone" "telephone" "telephone" "telephone" ...
## $ month : chr "may" "may" "may" "may" ...
## $ day_of_week : chr "mon" "mon" "mon" "mon" ...
## $ duration : int 1575 1042 1467 579 461 673 935 1201 1030 1623 ...
## $ campaign : int 1 1 1 1 1 2 3 1 1 1 ...
## $ pdays : int 999 999 999 999 999 999 999 999 999 999 ...
## $ previous : int 0 0 0 0 0 0 0 0 0 0 ...
## $ poutcome : chr "nonexistent" "nonexistent" "nonexistent" "nonexistent" ...
## $ emp.var.rate : num 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 ...
## $ cons.price.idx: num 94 94 94 94 94 ...
## $ cons.conf.idx : num -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 ...
## $ euribor3m : num 4.86 4.86 4.86 4.86 4.86 ...
## $ nr.employed : num 5191 5191 5191 5191 5191 ...
## $ y : chr "yes" "yes" "yes" "yes" ...
# Categorical to Factor
# Convert every text column, including the target `y`, to a factor in one
# pass instead of one assignment per column.
categorical_cols <- c(
  "job", "marital", "education", "default", "housing", "loan",
  "contact", "month", "day_of_week", "poutcome", "y"
)
cd[categorical_cols] <- lapply(cd[categorical_cols], factor)

# Confirm the conversion.
str(cd)
## 'data.frame': 9280 obs. of 21 variables:
## $ age : int 41 49 49 41 45 42 39 28 44 42 ...
## $ job : Factor w/ 12 levels "admin.","blue-collar",..: 2 3 10 10 2 2 4 12 8 10 ...
## $ marital : Factor w/ 4 levels "divorced","married",..: 1 2 2 2 2 2 2 3 2 2 ...
## $ education : Factor w/ 8 levels "basic.4y","basic.6y",..: 1 7 3 6 3 3 3 8 4 6 ...
## $ default : Factor w/ 2 levels "no","unknown": 2 2 1 2 2 1 1 2 1 1 ...
## $ housing : Factor w/ 3 levels "no","unknown",..: 3 3 1 3 3 3 3 3 3 1 ...
## $ loan : Factor w/ 3 levels "no","unknown",..: 1 1 1 1 1 3 1 3 1 1 ...
## $ contact : Factor w/ 2 levels "cellular","telephone": 2 2 2 2 2 2 2 2 2 2 ...
## $ month : Factor w/ 10 levels "apr","aug","dec",..: 7 7 7 7 7 7 7 7 7 7 ...
## $ day_of_week : Factor w/ 5 levels "fri","mon","thu",..: 2 2 2 2 2 2 2 4 4 4 ...
## $ duration : int 1575 1042 1467 579 461 673 935 1201 1030 1623 ...
## $ campaign : int 1 1 1 1 1 2 3 1 1 1 ...
## $ pdays : int 999 999 999 999 999 999 999 999 999 999 ...
## $ previous : int 0 0 0 0 0 0 0 0 0 0 ...
## $ poutcome : Factor w/ 3 levels "failure","nonexistent",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ emp.var.rate : num 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 ...
## $ cons.price.idx: num 94 94 94 94 94 ...
## $ cons.conf.idx : num -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 ...
## $ euribor3m : num 4.86 4.86 4.86 4.86 4.86 ...
## $ nr.employed : num 5191 5191 5191 5191 5191 ...
## $ y : Factor w/ 2 levels "no","yes": 2 2 2 2 2 2 2 2 2 2 ...
# Per-column summary of the prepared data frame.
summary(object = cd)
## age job marital education
## Min. :17.0 admin. :2517 divorced:1021 university.degree :3007
## 1st Qu.:31.0 blue-collar:1769 married :5338 high.school :2102
## Median :38.0 technician :1459 single :2900 professional.course:1190
## Mean :40.4 services : 773 unknown : 21 basic.9y :1177
## 3rd Qu.:48.0 management : 651 basic.4y : 895
## Max. :98.0 retired : 595 basic.6y : 458
## (Other) :1516 (Other) : 451
## default housing loan contact month
## no :7824 no :4104 no :7688 cellular :6672 may :2533
## unknown:1456 unknown: 225 unknown: 225 telephone:2608 jul :1477
## yes :4951 yes :1367 aug :1353
## jun :1169
## nov : 886
## apr : 785
## (Other):1077
## day_of_week duration campaign pdays previous
## fri:1763 Min. : 1.0 Min. : 1.000 Min. : 0.0 Min. :0.0000
## mon:1846 1st Qu.: 145.0 1st Qu.: 1.000 1st Qu.:999.0 1st Qu.:0.0000
## thu:2000 Median : 265.0 Median : 2.000 Median :999.0 Median :0.0000
## tue:1810 Mean : 387.4 Mean : 2.333 Mean :887.3 Mean :0.3153
## wed:1861 3rd Qu.: 528.0 3rd Qu.: 3.000 3rd Qu.:999.0 3rd Qu.:0.0000
## Max. :4199.0 Max. :39.000 Max. :999.0 Max. :6.0000
##
## poutcome emp.var.rate cons.price.idx cons.conf.idx
## failure :1074 Min. :-3.4000 Min. :92.20 Min. :-50.80
## nonexistent:7244 1st Qu.:-1.8000 1st Qu.:92.89 1st Qu.:-42.70
## success : 962 Median :-0.1000 Median :93.44 Median :-41.80
## Mean :-0.4963 Mean :93.48 Mean :-40.22
## 3rd Qu.: 1.4000 3rd Qu.:93.99 3rd Qu.:-36.40
## Max. : 1.4000 Max. :94.77 Max. :-26.90
##
## euribor3m nr.employed y
## Min. :0.634 Min. :4964 no :4640
## 1st Qu.:1.244 1st Qu.:5076 yes:4640
## Median :4.021 Median :5191
## Mean :2.960 Mean :5135
## 3rd Qu.:4.959 3rd Qu.:5228
## Max. :5.045 Max. :5228
##
# Decision Tree ----
# Training ----
# 4
# Model 1: confidence factor 0.95 (recorded tree size: 324).
train_model1 <- C5.0(
  formula = y ~ .,
  data = train_set,
  control = C5.0Control(
    noGlobalPruning = FALSE,
    CF = 0.95,
    earlyStopping = FALSE
  )
)
train_model1
##
## Call:
## C5.0.formula(formula = y ~ ., data = train_set, control
## = C5.0Control(noGlobalPruning = FALSE, CF = 0.95, earlyStopping = FALSE))
##
## Classification Tree
## Number of samples: 6496
## Number of predictors: 20
##
## Tree size: 324
##
## Non-standard options: attempt to group attributes, confidence level: 0.95
# Model 2: confidence factor 0.35 (recorded tree size: 154).
train_model2 <- C5.0(
  formula = y ~ .,
  data = train_set,
  control = C5.0Control(
    noGlobalPruning = FALSE,
    CF = 0.35,
    earlyStopping = FALSE
  )
)
train_model2
##
## Call:
## C5.0.formula(formula = y ~ ., data = train_set, control
## = C5.0Control(noGlobalPruning = FALSE, CF = 0.35, earlyStopping = FALSE))
##
## Classification Tree
## Number of samples: 6496
## Number of predictors: 20
##
## Tree size: 154
##
## Non-standard options: attempt to group attributes, confidence level: 0.35
# Model 3: confidence factor 0.10 (recorded tree size: 47).
train_model3 <- C5.0(
  formula = y ~ .,
  data = train_set,
  control = C5.0Control(
    noGlobalPruning = FALSE,
    CF = 0.1,
    earlyStopping = FALSE
  )
)
train_model3
##
## Call:
## C5.0.formula(formula = y ~ ., data = train_set, control
## = C5.0Control(noGlobalPruning = FALSE, CF = 0.1, earlyStopping = FALSE))
##
## Classification Tree
## Number of samples: 6496
## Number of predictors: 20
##
## Tree size: 47
##
## Non-standard options: attempt to group attributes, confidence level: 0.1
# Model 4: confidence factor 0.06 (recorded tree size: 18).
train_model4 <- C5.0(
  formula = y ~ .,
  data = train_set,
  control = C5.0Control(
    noGlobalPruning = FALSE,
    CF = 0.06,
    earlyStopping = FALSE
  )
)
train_model4
##
## Call:
## C5.0.formula(formula = y ~ ., data = train_set, control
## = C5.0Control(noGlobalPruning = FALSE, CF = 0.06, earlyStopping = FALSE))
##
## Classification Tree
## Number of samples: 6496
## Number of predictors: 20
##
## Tree size: 18
##
## Non-standard options: attempt to group attributes, confidence level: 0.06
# Model 5: confidence factor 0.03 (recorded tree size: 18).
train_model5 <- C5.0(
  formula = y ~ .,
  data = train_set,
  control = C5.0Control(
    noGlobalPruning = FALSE,
    CF = 0.03,
    earlyStopping = FALSE
  )
)
train_model5
##
## Call:
## C5.0.formula(formula = y ~ ., data = train_set, control
## = C5.0Control(noGlobalPruning = FALSE, CF = 0.03, earlyStopping = FALSE))
##
## Classification Tree
## Number of samples: 6496
## Number of predictors: 20
##
## Tree size: 18
##
## Non-standard options: attempt to group attributes, confidence level: 0.03
# Model 6: confidence factor 0.02 (recorded tree size: 16).
train_model6 <- C5.0(
  formula = y ~ .,
  data = train_set,
  control = C5.0Control(
    noGlobalPruning = FALSE,
    CF = 0.02,
    earlyStopping = FALSE
  )
)
train_model6
##
## Call:
## C5.0.formula(formula = y ~ ., data = train_set, control
## = C5.0Control(noGlobalPruning = FALSE, CF = 0.02, earlyStopping = FALSE))
##
## Classification Tree
## Number of samples: 6496
## Number of predictors: 20
##
## Tree size: 16
##
## Non-standard options: attempt to group attributes, confidence level: 0.02
# Model 7: confidence factor 0, the smallest value in this series.
# The recorded output below reports a tree size of 5 (an earlier "12 leaf"
# note on this line did not match that output).
train_model7 <- C5.0(
  formula = y ~ .,
  data = train_set,
  control = C5.0Control(
    noGlobalPruning = FALSE,
    CF = 0,
    earlyStopping = FALSE
  )
)
train_model7
##
## Call:
## C5.0.formula(formula = y ~ ., data = train_set, control
## = C5.0Control(noGlobalPruning = FALSE, CF = 0, earlyStopping = FALSE))
##
## Classification Tree
## Number of samples: 6496
## Number of predictors: 20
##
## Tree size: 5
##
## Non-standard options: attempt to group attributes, confidence level: 0
# Prediction ----
# 6
# Predicted class labels from each tree on the data it was fitted to.
train1_prediction <- predict(train_model1, newdata = train_set, type = "class")
train2_prediction <- predict(train_model2, newdata = train_set, type = "class")
train3_prediction <- predict(train_model3, newdata = train_set, type = "class")
train4_prediction <- predict(train_model4, newdata = train_set, type = "class")
train5_prediction <- predict(train_model5, newdata = train_set, type = "class")
train6_prediction <- predict(train_model6, newdata = train_set, type = "class")
train7_prediction <- predict(train_model7, newdata = train_set, type = "class")

# Predicted class labels from the same trees on the held-out test_set.
test1_prediction <- predict(train_model1, newdata = test_set, type = "class")
test2_prediction <- predict(train_model2, newdata = test_set, type = "class")
test3_prediction <- predict(train_model3, newdata = test_set, type = "class")
test4_prediction <- predict(train_model4, newdata = test_set, type = "class")
test5_prediction <- predict(train_model5, newdata = test_set, type = "class")
test6_prediction <- predict(train_model6, newdata = test_set, type = "class")
test7_prediction <- predict(train_model7, newdata = test_set, type = "class")
# Confusion Matrix ----
# 7
# Confusion matrix: model 1 on train_set (rows = target, columns = pred).
mmetric(y = train_set$y, x = train1_prediction, metric = "CONF")
## $res
## NULL
##
## $conf
## pred
## target no yes
## no 3005 243
## yes 140 3108
##
## $roc
## NULL
##
## $lift
## NULL
# Confusion matrix: model 2 on train_set (rows = target, columns = pred).
mmetric(y = train_set$y, x = train2_prediction, metric = "CONF")
## $res
## NULL
##
## $conf
## pred
## target no yes
## no 2881 367
## yes 165 3083
##
## $roc
## NULL
##
## $lift
## NULL
# Confusion matrix: model 3 on train_set (rows = target, columns = pred).
mmetric(y = train_set$y, x = train3_prediction, metric = "CONF")
## $res
## NULL
##
## $conf
## pred
## target no yes
## no 2750 498
## yes 178 3070
##
## $roc
## NULL
##
## $lift
## NULL
# Confusion matrix: model 4 on train_set (rows = target, columns = pred).
mmetric(y = train_set$y, x = train4_prediction, metric = "CONF")
## $res
## NULL
##
## $conf
## pred
## target no yes
## no 2607 641
## yes 147 3101
##
## $roc
## NULL
##
## $lift
## NULL
# Confusion matrix: model 5 on train_set (rows = target, columns = pred).
mmetric(y = train_set$y, x = train5_prediction, metric = "CONF")
## $res
## NULL
##
## $conf
## pred
## target no yes
## no 2677 571
## yes 207 3041
##
## $roc
## NULL
##
## $lift
## NULL
# Confusion matrix: model 6 on train_set (rows = target, columns = pred).
mmetric(y = train_set$y, x = train6_prediction, metric = "CONF")
## $res
## NULL
##
## $conf
## pred
## target no yes
## no 2656 592
## yes 201 3047
##
## $roc
## NULL
##
## $lift
## NULL
# Confusion matrix: model 7 on train_set (rows = target, columns = pred).
mmetric(y = train_set$y, x = train7_prediction, metric = "CONF")
## $res
## NULL
##
## $conf
## pred
## target no yes
## no 2548 700
## yes 173 3075
##
## $roc
## NULL
##
## $lift
## NULL
# Confusion matrix: model 1 on test_set (rows = target, columns = pred).
mmetric(y = test_set$y, x = test1_prediction, metric = "CONF")
## $res
## NULL
##
## $conf
## pred
## target no yes
## no 1180 212
## yes 167 1225
##
## $roc
## NULL
##
## $lift
## NULL
# Confusion matrix: model 2 on test_set (rows = target, columns = pred).
mmetric(y = test_set$y, x = test2_prediction, metric = "CONF")
## $res
## NULL
##
## $conf
## pred
## target no yes
## no 1186 206
## yes 124 1268
##
## $roc
## NULL
##
## $lift
## NULL
# Confusion matrix: model 3 on test_set (rows = target, columns = pred).
mmetric(y = test_set$y, x = test3_prediction, metric = "CONF")
## $res
## NULL
##
## $conf
## pred
## target no yes
## no 1154 238
## yes 99 1293
##
## $roc
## NULL
##
## $lift
## NULL
# Confusion matrix: model 4 on test_set (rows = target, columns = pred).
mmetric(y = test_set$y, x = test4_prediction, metric = "CONF")
## $res
## NULL
##
## $conf
## pred
## target no yes
## no 1130 262
## yes 58 1334
##
## $roc
## NULL
##
## $lift
## NULL
# Confusion matrix: model 5 on test_set (rows = target, columns = pred).
mmetric(y = test_set$y, x = test5_prediction, metric = "CONF")
## $res
## NULL
##
## $conf
## pred
## target no yes
## no 1154 238
## yes 93 1299
##
## $roc
## NULL
##
## $lift
## NULL
# Confusion matrix: model 6 on test_set (rows = target, columns = pred).
mmetric(y = test_set$y, x = test6_prediction, metric = "CONF")
## $res
## NULL
##
## $conf
## pred
## target no yes
## no 1133 259
## yes 86 1306
##
## $roc
## NULL
##
## $lift
## NULL
# Confusion matrix: model 7 on test_set (rows = target, columns = pred).
mmetric(y = test_set$y, x = test7_prediction, metric = "CONF")
## $res
## NULL
##
## $conf
## pred
## target no yes
## no 1104 288
## yes 73 1319
##
## $roc
## NULL
##
## $lift
## NULL
# Evaluation ----
# Metrics ----
# 8
# Metrics reported for every model: overall accuracy plus per-class F1,
# precision and recall (TPR). In the recorded output the suffix 1 is class
# "no" and 2 is class "yes" (e.g. TPR1 for model 1 on train_set is
# 3005 / 3248 = 92.52, the "no" row of its confusion matrix).
evaluation_metrics_vector <- c("ACC", "F1", "PRECISION", "TPR")

# Training-set performance, models 1-7.
mmetric(
  y = train_set$y, x = train1_prediction,
  metric = evaluation_metrics_vector
)
## ACC F11 F12 PRECISION1 PRECISION2 TPR1 TPR2
## 94.10406 94.00907 94.19609 95.54849 92.74843 92.51847 95.68966
mmetric(
  y = train_set$y, x = train2_prediction,
  metric = evaluation_metrics_vector
)
## ACC F11 F12 PRECISION1 PRECISION2 TPR1 TPR2
## 91.81034 91.54751 92.05733 94.58306 89.36232 88.70074 94.91995
mmetric(
  y = train_set$y, x = train3_prediction,
  metric = evaluation_metrics_vector
)
## ACC F11 F12 PRECISION1 PRECISION2 TPR1 TPR2
## 89.59360 89.05440 90.08216 93.92077 86.04260 84.66749 94.51970
mmetric(
  y = train_set$y, x = train4_prediction,
  metric = evaluation_metrics_vector
)
## ACC F11 F12 PRECISION1 PRECISION2 TPR1 TPR2
## 87.86946 86.87104 88.72675 94.66231 82.87012 80.26478 95.47414
mmetric(
  y = train_set$y, x = train5_prediction,
  metric = evaluation_metrics_vector
)
## ACC F11 F12 PRECISION1 PRECISION2 TPR1 TPR2
## 88.02340 87.31246 88.65889 92.82247 84.19158 82.41995 93.62685
mmetric(
  y = train_set$y, x = train6_prediction,
  metric = evaluation_metrics_vector
)
## ACC F11 F12 PRECISION1 PRECISION2 TPR1 TPR2
## 87.79249 87.01065 88.48555 92.96465 83.73179 81.77340 93.81158
mmetric(
  y = train_set$y, x = train7_prediction,
  metric = evaluation_metrics_vector
)
## ACC F11 F12 PRECISION1 PRECISION2 TPR1 TPR2
## 86.56096 85.37443 87.56941 93.64204 81.45695 78.44828 94.67365

# Test-set performance, models 1-7. In the recorded output, test accuracy
# is highest for model 4 (88.51), while model 1 shows the largest gap
# between training (94.10) and test (86.39) accuracy.
mmetric(
  y = test_set$y, x = test1_prediction,
  metric = evaluation_metrics_vector
)
## ACC F11 F12 PRECISION1 PRECISION2 TPR1 TPR2
## 86.38649 86.16283 86.60304 87.60208 85.24704 84.77011 88.00287
mmetric(
  y = test_set$y, x = test2_prediction,
  metric = evaluation_metrics_vector
)
## ACC F11 F12 PRECISION1 PRECISION2 TPR1 TPR2
## 88.14655 87.78682 88.48569 90.53435 86.02442 85.20115 91.09195
mmetric(
  y = test_set$y, x = test3_prediction,
  metric = evaluation_metrics_vector
)
## ACC F11 F12 PRECISION1 PRECISION2 TPR1 TPR2
## 87.89511 87.25898 88.47075 92.09896 84.45460 82.90230 92.88793
mmetric(
  y = test_set$y, x = test4_prediction,
  metric = evaluation_metrics_vector
)
## ACC F11 F12 PRECISION1 PRECISION2 TPR1 TPR2
## 88.50575 87.59690 89.29050 95.11785 83.58396 81.17816 95.83333
mmetric(
  y = test_set$y, x = test5_prediction,
  metric = evaluation_metrics_vector
)
## ACC F11 F12 PRECISION1 PRECISION2 TPR1 TPR2
## 88.11063 87.45737 88.69921 92.54210 84.51529 82.90230 93.31897
mmetric(
  y = test_set$y, x = test6_prediction,
  metric = evaluation_metrics_vector
)
## ACC F11 F12 PRECISION1 PRECISION2 TPR1 TPR2
## 87.60776 86.78667 88.33277 92.94504 83.45048 81.39368 93.82184
mmetric(
  y = test_set$y, x = test7_prediction,
  metric = evaluation_metrics_vector
)
## ACC F11 F12 PRECISION1 PRECISION2 TPR1 TPR2
## 87.03305 85.94784 87.96265 93.79779 82.07841 79.31034 94.75575
# Model Feature ----
# 9A
# Predictor importance for model 1 (default "usage" metric, in percent).
C5imp(train_model1, metric = "usage", pct = TRUE)
## Warning in (varStart + 1):length(treeDat): numerical expression has 2 elements:
## only the first used
## Overall
## duration 100.00
## nr.employed 100.00
## month 68.58
## poutcome 62.19
## contact 40.35
## emp.var.rate 33.84
## job 31.68
## euribor3m 27.96
## day_of_week 9.08
## cons.conf.idx 8.87
## education 8.81
## default 8.37
## loan 7.56
## pdays 6.59
## cons.price.idx 6.00
## campaign 4.77
## marital 3.93
## age 2.71
## housing 2.46
## previous 0.79
# Predictor importance for model 2 (default "usage" metric, in percent).
C5imp(train_model2, metric = "usage", pct = TRUE)
## Warning in (varStart + 1):length(treeDat): numerical expression has 2 elements:
## only the first used
## Overall
## duration 100.00
## nr.employed 100.00
## month 56.76
## poutcome 41.55
## emp.var.rate 32.20
## euribor3m 27.77
## contact 16.52
## cons.conf.idx 8.65
## job 7.53
## day_of_week 6.74
## pdays 6.59
## default 6.30
## loan 6.27
## cons.price.idx 5.93
## age 2.16
## housing 1.94
## education 1.20
## marital 1.15
## campaign 1.02
## previous 0.52
# Predictor importance for model 3 (default "usage" metric, in percent).
C5imp(train_model3, metric = "usage", pct = TRUE)
## Warning in (varStart + 1):length(treeDat): numerical expression has 2 elements:
## only the first used
## Overall
## duration 100.00
## nr.employed 100.00
## month 50.35
## emp.var.rate 43.60
## poutcome 41.29
## contact 16.19
## euribor3m 14.25
## cons.conf.idx 8.65
## default 6.30
## cons.price.idx 5.93
## day_of_week 5.56
## pdays 4.54
## loan 2.69
## age 1.68
## job 1.08
## education 0.58
## campaign 0.51
## marital 0.29
## previous 0.18
## housing 0.00
# Predictor importance for model 4 (default "usage" metric, in percent).
C5imp(train_model4, metric = "usage", pct = TRUE)
## Warning in (varStart + 1):length(treeDat): numerical expression has 2 elements:
## only the first used
## Overall
## duration 100.00
## nr.employed 100.00
## month 48.18
## poutcome 41.29
## euribor3m 10.28
## default 6.30
## cons.price.idx 5.93
## cons.conf.idx 5.56
## contact 4.63
## day_of_week 4.33
## age 1.42
## job 1.08
## marital 0.00
## education 0.00
## housing 0.00
## loan 0.00
## campaign 0.00
## pdays 0.00
## previous 0.00
## emp.var.rate 0.00
# Predictor importance for model 5 (default "usage" metric, in percent).
C5imp(train_model5, metric = "usage", pct = TRUE)
## Warning in (varStart + 1):length(treeDat): numerical expression has 2 elements:
## only the first used
## Overall
## duration 100.00
## nr.employed 100.00
## month 48.18
## poutcome 41.29
## contact 13.49
## cons.price.idx 6.30
## cons.conf.idx 5.93
## day_of_week 4.66
## emp.var.rate 2.63
## euribor3m 1.69
## age 1.59
## job 0.00
## marital 0.00
## education 0.00
## default 0.00
## housing 0.00
## loan 0.00
## campaign 0.00
## pdays 0.00
## previous 0.00
# Predictor importance for model 6 (default "usage" metric, in percent).
C5imp(train_model6, metric = "usage", pct = TRUE)
## Warning in (varStart + 1):length(treeDat): numerical expression has 2 elements:
## only the first used
## Overall
## nr.employed 100.00
## duration 72.64
## month 48.18
## poutcome 41.29
## contact 8.85
## cons.price.idx 6.30
## cons.conf.idx 5.93
## day_of_week 4.66
## emp.var.rate 2.63
## euribor3m 1.69
## age 1.59
## job 0.00
## marital 0.00
## education 0.00
## default 0.00
## housing 0.00
## loan 0.00
## campaign 0.00
## pdays 0.00
## previous 0.00
# Predictor importance for model 7 (default "usage" metric, in percent).
C5imp(train_model7, metric = "usage", pct = TRUE)
## Warning in (varStart + 1):length(treeDat): numerical expression has 2 elements:
## only the first used
## Overall
## nr.employed 100.00
## duration 72.64
## month 48.18
## age 0.00
## job 0.00
## marital 0.00
## education 0.00
## default 0.00
## housing 0.00
## loan 0.00
## contact 0.00
## day_of_week 0.00
## campaign 0.00
## pdays 0.00
## previous 0.00
## poutcome 0.00
## emp.var.rate 0.00
## cons.price.idx 0.00
## cons.conf.idx 0.00
## euribor3m 0.00
# 9B
# The top 4 features in the majority of the models are: duration, nr.employed, month and poutcome
# 9C
# The 2 least important features are: previous and housing