# 1A
library(e1071)
library(psych)
library(caret)
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
## 
##     %+%, alpha
## Loading required package: lattice
library(rminer)
library(rmarkdown)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.1     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ lubridate 1.9.2     ✔ tibble    3.2.1
## ✔ purrr     1.0.1     ✔ tidyr     1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ ggplot2::%+%()   masks psych::%+%()
## ✖ ggplot2::alpha() masks psych::alpha()
## ✖ dplyr::filter()  masks stats::filter()
## ✖ dplyr::lag()     masks stats::lag()
## ✖ purrr::lift()    masks caret::lift()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(C50)
library(matrixStats)
## 
## Attaching package: 'matrixStats'
## 
## The following object is masked from 'package:dplyr':
## 
##     count
library(knitr)

# Record and set the working directory (a no-op as written, kept so the
# file path below is explicit when knitting in a cloud environment)
cloud_wd <- getwd()
setwd(cloud_wd)

cd <- read.csv(file = "CD_additional_modified.csv", stringsAsFactors = TRUE)

str(cd)
## 'data.frame':    4119 obs. of  21 variables:
##  $ age           : int  30 39 25 38 47 32 32 41 31 35 ...
##  $ job           : Factor w/ 12 levels "admin.","blue-collar",..: 2 8 8 8 1 8 1 3 8 2 ...
##  $ marital       : Factor w/ 4 levels "divorced","married",..: 2 3 2 2 2 3 3 2 1 2 ...
##  $ education     : Factor w/ 8 levels "basic.4y","basic.6y",..: 3 4 4 3 7 7 7 7 6 3 ...
##  $ default       : Factor w/ 3 levels "no","unknown",..: 1 1 1 1 1 1 1 2 1 2 ...
##  $ housing       : Factor w/ 3 levels "no","unknown",..: 3 1 3 2 3 1 3 3 1 1 ...
##  $ loan          : Factor w/ 3 levels "no","unknown",..: 1 1 1 2 1 1 1 1 1 1 ...
##  $ contact       : Factor w/ 2 levels "cellular","telephone": 1 2 2 2 1 1 1 1 1 2 ...
##  $ month         : Factor w/ 10 levels "apr","aug","dec",..: 7 7 5 5 8 10 10 8 8 7 ...
##  $ day_of_week   : Factor w/ 5 levels "fri","mon","thu",..: 1 1 5 1 2 3 2 2 4 3 ...
##  $ duration      : int  487 346 227 17 58 128 290 44 68 170 ...
##  $ campaign      : int  2 4 1 3 1 3 4 2 1 1 ...
##  $ pdays         : int  999 999 999 999 999 999 999 999 999 999 ...
##  $ previous      : int  0 0 0 0 0 2 0 0 1 0 ...
##  $ poutcome      : Factor w/ 3 levels "failure","nonexistent",..: 2 2 2 2 2 1 2 2 1 2 ...
##  $ emp.var.rate  : num  -1.8 1.1 1.4 1.4 -0.1 -1.1 -1.1 -0.1 -0.1 1.1 ...
##  $ cons.price.idx: num  92.9 94 94.5 94.5 93.2 ...
##  $ cons.conf.idx : num  -46.2 -36.4 -41.8 -41.8 -42 -37.5 -37.5 -42 -42 -36.4 ...
##  $ euribor3m     : num  1.31 4.86 4.96 4.96 4.19 ...
##  $ nr.employed   : num  5099 5191 5228 5228 5196 ...
##  $ y             : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
# 1B
set.seed(100)

inTrain <- createDataPartition(cd$y, p = 0.7, list = FALSE)

train_set <- cd[inTrain, ]
test_set  <- cd[-inTrain, ]

# 1C
prop.table(table(train_set$y))
## 
##      no     yes 
## 0.89043 0.10957
prop.table(table(test_set$y))
## 
##        no       yes 
## 0.8906883 0.1093117

1 Simple Decision Tree Training and Testing

# 2A
train_model1 <- C5.0(formula = y ~ ., data = train_set)
train_model1
## 
## Call:
## C5.0.formula(formula = y ~ ., data = train_set)
## 
## Classification Tree
## Number of samples: 2884 
## Number of predictors: 20 
## 
## Tree size: 38 
## 
## Non-standard options: attempt to group attributes
train1_prediction <- predict(train_model1, train_set)
test1_prediction <- predict(train_model1, test_set)

mmetric(train_set$y, train1_prediction, metric="CONF")
## $res
## NULL
## 
## $conf
##       pred
## target   no  yes
##    no  2516   52
##    yes  140  176
## 
## $roc
## NULL
## 
## $lift
## NULL
mmetric(test_set$y, test1_prediction, metric="CONF")
## $res
## NULL
## 
## $conf
##       pred
## target   no  yes
##    no  1060   40
##    yes   76   59
## 
## $roc
## NULL
## 
## $lift
## NULL
mmetric(train_set$y, train1_prediction, metric=c("ACC","TPR","PRECISION","F1"))
##        ACC       TPR1       TPR2 PRECISION1 PRECISION2        F11        F12 
##   93.34258   97.97508   55.69620   94.72892   77.19298   96.32466   64.70588
mmetric(test_set$y, test1_prediction, metric=c("ACC","TPR","PRECISION","F1"))
##        ACC       TPR1       TPR2 PRECISION1 PRECISION2        F11        F12 
##   90.60729   96.36364   43.70370   93.30986   59.59596   94.81216   50.42735
# 2B
train_model2 <- C5.0(formula = y ~ ., control = C5.0Control(CF = 0.1), data = train_set)
train_model2
## 
## Call:
## C5.0.formula(formula = y ~ ., data = train_set, control = C5.0Control(CF = 0.1))
## 
## Classification Tree
## Number of samples: 2884 
## Number of predictors: 20 
## 
## Tree size: 12 
## 
## Non-standard options: attempt to group attributes, confidence level: 0.1
plot(train_model2)

train2_prediction <- predict(train_model2, train_set)
test2_prediction <- predict(train_model2, test_set)

mmetric(train_set$y, train2_prediction, metric="CONF")
## $res
## NULL
## 
## $conf
##       pred
## target   no  yes
##    no  2512   56
##    yes  173  143
## 
## $roc
## NULL
## 
## $lift
## NULL
mmetric(test_set$y, test2_prediction, metric="CONF")
## $res
## NULL
## 
## $conf
##       pred
## target   no  yes
##    no  1061   39
##    yes   90   45
## 
## $roc
## NULL
## 
## $lift
## NULL
mmetric(train_set$y, train2_prediction, metric=c("ACC","TPR","PRECISION","F1"))
##        ACC       TPR1       TPR2 PRECISION1 PRECISION2        F11        F12 
##   92.05964   97.81931   45.25316   93.55680   71.85930   95.64059   55.53398
mmetric(test_set$y, test2_prediction, metric=c("ACC","TPR","PRECISION","F1"))
##        ACC       TPR1       TPR2 PRECISION1 PRECISION2        F11        F12 
##   89.55466   96.45455   33.33333   92.18071   53.57143   94.26921   41.09589

2 Simple Naïve Bayes Model Training and Testing

# 3A
cd_w1_nb <- naiveBayes(y ~ ., data = train_set)
#cd_w1_nb

predicted_cd_w1 <- predict(cd_w1_nb, train_set)
mmetric(train_set$y, predicted_cd_w1, metric="CONF")
## $res
## NULL
## 
## $conf
##       pred
## target   no  yes
##    no  2336  232
##    yes  122  194
## 
## $roc
## NULL
## 
## $lift
## NULL
mmetric(train_set$y, predicted_cd_w1, metric=c("ACC","TPR","PRECISION","F1"))
##        ACC       TPR1       TPR2 PRECISION1 PRECISION2        F11        F12 
##   87.72538   90.96573   61.39241   95.03662   45.53991   92.95663   52.29111
# 3B
cd_w2_nb <- naiveBayes(y ~ . - nr.employed, data = train_set)
#cd_w2_nb

predicted_cd_w2 <- predict(cd_w2_nb, train_set)
mmetric(train_set$y, predicted_cd_w2, metric="CONF")
## $res
## NULL
## 
## $conf
##       pred
## target   no  yes
##    no  2374  194
##    yes  139  177
## 
## $roc
## NULL
## 
## $lift
## NULL
mmetric(train_set$y, predicted_cd_w2, metric=c("ACC","TPR","PRECISION","F1"))
##        ACC       TPR1       TPR2 PRECISION1 PRECISION2        F11        F12 
##   88.45354   92.44548   56.01266   94.46876   47.70889   93.44617   51.52838

3 Cross-validation

# 4A
# Generic k-fold cross-validation: trains the supplied `classification`
# function on each training fold and scores the held-out fold.
cv_function <- function(df, target, nFolds, seedVal, classification, metrics_list)
{
  set.seed(seedVal)
  # Stratified folds built on the target column (passed as a column index)
  folds <- createFolds(df[, target], nFolds)
  
  cv_results <- lapply(folds, function(x)
  { 
    # Split predictors and target for this fold
    train <- df[-x, -target]
    test  <- df[x, -target]
    
    train_target <- df[-x, target]
    test_target  <- df[x, target]
    
    classification_model <- classification(train, train_target) 
    
    pred <- predict(classification_model, test)
    
    # Score the held-out fold on the caller-supplied metrics
    return(mmetric(test_target, pred, metrics_list))
    
  })
  
  cv_results_m <- as.matrix(as.data.frame(cv_results))
  
  # Per-metric mean and standard deviation across folds
  cv_mean <- as.matrix(rowMeans(cv_results_m))
  colnames(cv_mean) <- "Mean"
  
  cv_sd <- as.matrix(rowSds(cv_results_m))
  colnames(cv_sd) <- "Sd"
  
  cv_all <- cbind(cv_results_m, cv_mean, cv_sd)
  
  kable(cv_all, digits = 2)
}

# 4B 4C 4D
df <- cd
target <- 21                  # column index of the target variable y
nFolds <- 3
seedVal <- 500
classification <- naiveBayes  # learner passed into cv_function
metrics_list <- c("ACC","PRECISION","TPR","F1")

cv_function(df, target, nFolds, seedVal, classification, metrics_list)
|            | Fold1 | Fold2 | Fold3 |  Mean |   Sd |
|:-----------|------:|------:|------:|------:|-----:|
| ACC        | 88.49 | 86.53 | 86.31 | 87.11 | 1.20 |
| PRECISION1 | 95.79 | 94.74 | 94.42 | 94.98 | 0.72 |
| PRECISION2 | 48.10 | 42.06 | 40.87 | 43.67 | 3.88 |
| TPR1       | 91.09 | 89.85 | 89.94 | 90.29 | 0.69 |
| TPR2       | 67.33 | 59.60 | 56.67 | 61.20 | 5.51 |
| F11        | 93.38 | 92.23 | 92.13 | 92.58 | 0.69 |
| F12        | 56.11 | 49.32 | 47.49 | 50.97 | 4.54 |
# 5A
nFolds <- 5
cv_function(df, target, nFolds, seedVal, classification, metrics_list)
|            | Fold1 | Fold2 | Fold3 | Fold4 | Fold5 |  Mean |   Sd |
|:-----------|------:|------:|------:|------:|------:|------:|-----:|
| ACC        | 86.53 | 87.50 | 84.83 | 87.85 | 88.35 | 87.01 | 1.39 |
| PRECISION1 | 93.57 | 95.13 | 95.38 | 95.41 | 95.18 | 94.93 | 0.77 |
| PRECISION2 | 40.37 | 45.24 | 38.56 | 46.03 | 47.46 | 43.53 | 3.85 |
| TPR1       | 91.14 | 90.59 | 87.19 | 90.72 | 91.55 | 90.24 | 1.74 |
| TPR2       | 48.89 | 62.64 | 65.56 | 64.44 | 62.22 | 60.75 | 6.77 |
| F11        | 92.34 | 92.80 | 91.10 | 93.01 | 93.33 | 92.52 | 0.87 |
| F12        | 44.22 | 52.53 | 48.56 | 53.70 | 53.85 | 50.57 | 4.15 |
nFolds <- 10
cv_function(df, target, nFolds, seedVal, classification, metrics_list)
|            | Fold01 | Fold02 | Fold03 | Fold04 | Fold05 | Fold06 | Fold07 | Fold08 | Fold09 | Fold10 |  Mean |   Sd |
|:-----------|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|------:|-----:|
| ACC        |  88.11 |  84.95 |  86.86 |  86.65 |  86.65 |  86.17 |  86.41 |  89.81 |  89.59 |  86.37 | 87.16 | 1.54 |
| PRECISION1 |  94.92 |  93.70 |  94.57 |  94.83 |  94.57 |  94.03 |  94.81 |  96.30 |  97.37 |  94.80 | 94.99 | 1.08 |
| PRECISION2 |  46.55 |  36.51 |  42.62 |  42.19 |  41.94 |  40.00 |  41.54 |  52.46 |  52.11 |  41.54 | 43.75 | 5.13 |
| TPR1       |  91.55 |  89.10 |  90.44 |  89.92 |  90.19 |  90.19 |  89.65 |  92.10 |  90.74 |  89.62 | 90.35 | 0.91 |
| TPR2       |  60.00 |  51.11 |  57.78 |  60.00 |  57.78 |  53.33 |  60.00 |  71.11 |  80.43 |  60.00 | 61.15 | 8.57 |
| F11        |  93.20 |  91.34 |  92.46 |  92.31 |  92.33 |  92.07 |  92.16 |  94.15 |  93.94 |  92.13 | 92.61 | 0.88 |
| F12        |  52.43 |  42.59 |  49.06 |  49.54 |  48.60 |  45.71 |  49.09 |  60.38 |  63.25 |  49.09 | 50.97 | 6.30 |
assign("classification", C5.0)
cv_function(df, target, nFolds, seedVal, classification, metrics_list)
|            | Fold01 | Fold02 | Fold03 | Fold04 | Fold05 | Fold06 | Fold07 | Fold08 | Fold09 | Fold10 |  Mean |   Sd |
|:-----------|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|------:|-----:|
| ACC        |  90.78 |  90.29 |  88.81 |  89.56 |  91.02 |  90.05 |  92.48 |  91.75 |  89.83 |  92.46 | 90.70 | 1.23 |
| PRECISION1 |  94.10 |  93.83 |  92.55 |  93.55 |  94.12 |  94.54 |  94.21 |  93.70 |  92.88 |  94.67 | 93.81 | 0.68 |
| PRECISION2 |  58.97 |  56.41 |  48.57 |  52.50 |  60.53 |  54.35 |  71.88 |  67.74 |  55.88 |  69.44 | 59.63 | 7.74 |
| TPR1       |  95.64 |  95.37 |  95.08 |  94.82 |  95.91 |  94.28 |  97.55 |  97.28 |  95.91 |  96.99 | 95.88 | 1.09 |
| TPR2       |  51.11 |  48.89 |  37.78 |  46.67 |  51.11 |  55.56 |  51.11 |  46.67 |  41.30 |  55.56 | 48.57 | 5.71 |
| F11        |  94.86 |  94.59 |  93.80 |  94.18 |  95.01 |  94.41 |  95.85 |  95.45 |  94.37 |  95.82 | 94.83 | 0.70 |
| F12        |  54.76 |  52.38 |  42.50 |  49.41 |  55.42 |  54.95 |  59.74 |  55.26 |  47.50 |  61.73 | 53.37 | 5.70 |
nFolds <- 5
cv_function(df, target, nFolds, seedVal, classification, metrics_list)
|            | Fold1 | Fold2 | Fold3 | Fold4 | Fold5 |  Mean |   Sd |
|:-----------|------:|------:|------:|------:|------:|------:|-----:|
| ACC        | 91.63 | 90.05 | 89.44 | 89.79 | 91.87 | 90.56 | 1.11 |
| PRECISION1 | 94.04 | 93.57 | 93.66 | 92.42 | 94.53 | 93.64 | 0.78 |
| PRECISION2 | 65.22 | 55.84 | 51.81 | 55.17 | 65.33 | 58.67 | 6.22 |
| TPR1       | 96.73 | 95.36 | 94.55 | 96.45 | 96.46 | 95.91 | 0.92 |
| TPR2       | 50.00 | 47.25 | 47.78 | 35.56 | 54.44 | 47.01 | 7.00 |
| F11        | 95.37 | 94.46 | 94.10 | 94.39 | 95.48 | 94.76 | 0.62 |
| F12        | 56.60 | 51.19 | 49.71 | 43.24 | 59.39 | 52.03 | 6.29 |

4 Reflection

The first models built were decision trees. From A2, I knew that changing the confidence factor (CF) changes the complexity of the model: lowering CF to 0.1 pruned the tree size from 38 to 12, and the simpler model had lower accuracy on both the train and test sets. The second set of models used naïve Bayes. Overall training accuracy increased from 87.73 to 88.45 after removing one predictor ("nr.employed"), which suggests that some predictors are not conditionally independent but correlated with one another, violating the naïve Bayes assumption. A step like building a correlation heat map of the predictors could therefore help improve a naïve Bayes model, as sketched below. Naïve Bayes was also fast and efficient to train and evaluate. In addition to adjusting hyperparameters and the feature set, cross-validation was practiced to guard against overfitting. Ten-fold cross-validation gives a more reliable view of performance, but I think 3-fold and 5-fold are sufficient for a dataset of only a few thousand observations.
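As a concrete follow-up to the correlation point, here is a minimal sketch of that check, assuming the `cd` data frame from 1A is still in memory; the 0.7 cutoff is an illustrative choice rather than a tuned value.

num_cols <- sapply(cd, is.numeric)   # keep only the numeric predictors
cor_mat  <- cor(cd[, num_cols])      # pairwise Pearson correlations

# caret::findCorrelation() names predictors correlated with another predictor
# above the cutoff; these are candidates to drop before refitting naiveBayes()
findCorrelation(cor_mat, cutoff = 0.7, names = TRUE)

# Quick base-R heat map of the correlation matrix
heatmap(cor_mat, symm = TRUE)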