DATA 622 - Homework 2 - Finding the Best Model
1 Load Package
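The package-loading code is not shown in this rendering; based on the functions used throughout the report (read_csv, initial_split, train, confusionMatrix, auc, kable), the loaded packages are presumably:
library(tidyverse)  # read_csv(), dplyr verbs, ggplot2, forcats, tidyr
library(rsample)    # initial_split(), training(), testing()
library(caret)      # train(), trainControl(), confusionMatrix()
library(pROC)       # multiclass.roc(), auc()
library(knitr)      # kable()
# caret additionally requires klaR (nb), rpart and randomForest to be installed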
2 Part A
2.1 Step 0
Pick any two classifiers of (SVM, Logistic, DecisionTree, NaiveBayes). Pick the heart or ecoli dataset. Heart is simpler and ecoli compounds the problem as it is NOT a balanced dataset. From a grading perspective both carry the same weight.
2.1.1 Load Dataset
The ecoli dataset is used in this project.
data <- read_csv('https://raw.githubusercontent.com/oggyluky11/DATA622-FALL-2020/main/HW2/ecoli.csv') %>%
  mutate(class = factor(class))
##
## -- Column specification --------------------------------------------------------
## cols(
## Sequence_Name = col_character(),
## mcg = col_double(),
## gvh = col_double(),
## lip = col_double(),
## chg = col_double(),
## aac = col_double(),
## alm1 = col_double(),
## alm2 = col_double(),
## class = col_character()
## )
2.1.2 Data Profiling
The histogram of the class label shows that the dataset is unbalanced.
data %>%
  ggplot(aes(x = fct_infreq(class))) +
  geom_bar(fill = 'deeppink4') +
  geom_text(stat = 'count', aes(label = ..count.., vjust = -0.3)) +
  labs(title = 'Histogram: Class Label',
       x = 'Class',
       y = 'Frequency')
The summary shows that this dataset contains no missing data and that all numerical values are scaled between 0 and 1. Because the variables lip and chg are almost constant apart from a few outliers, they are excluded from modeling.
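The summary itself comes from a chunk whose code is not shown; presumably something like:
data %>% summary()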
## Sequence_Name mcg gvh lip
## Length:336 Min. :0.0000 Min. :0.16 Min. :0.4800
## Class :character 1st Qu.:0.3400 1st Qu.:0.40 1st Qu.:0.4800
## Mode :character Median :0.5000 Median :0.47 Median :0.4800
## Mean :0.5001 Mean :0.50 Mean :0.4955
## 3rd Qu.:0.6625 3rd Qu.:0.57 3rd Qu.:0.4800
## Max. :0.8900 Max. :1.00 Max. :1.0000
##
## chg aac alm1 alm2
## Min. :0.5000 Min. :0.000 Min. :0.0300 Min. :0.0000
## 1st Qu.:0.5000 1st Qu.:0.420 1st Qu.:0.3300 1st Qu.:0.3500
## Median :0.5000 Median :0.495 Median :0.4550 Median :0.4300
## Mean :0.5015 Mean :0.500 Mean :0.5002 Mean :0.4997
## 3rd Qu.:0.5000 3rd Qu.:0.570 3rd Qu.:0.7100 3rd Qu.:0.7100
## Max. :1.0000 Max. :0.880 Max. :1.0000 Max. :0.9900
##
## class
## cp :143
## im : 77
## pp : 52
## imU : 35
## om : 20
## omL : 5
## (Other): 4
2.1.3 Pick Two Classifiers
Naive Bayes and Decision Tree are picked for Part A (fitted below with the caret methods 'nb' and 'rpart').
2.3 Step 2
Do an 80/20 split and determine the Accuracy, AUC and as many metrics as returned by the caret package (confusionMatrix). Call this the base_metric. Note down as best as you can the development (engineering) cost as well as the computing cost (elapsed time).
2.3.1 Convert Class Label to Factor Representation
# Keep a lookup table mapping each class name to its numeric factor code
data_label <- data %>%
  dplyr::select(class) %>%
  mutate(class_factor = class %>% as.numeric()) %>%
  distinct()

# Recode the class label as a factor over the numeric codes
data <- data %>%
  mutate(class = class %>%
           as.numeric() %>%
           as.factor())
data
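The lookup table data_label can later be used to map a numeric code back to its original class name; a hypothetical example:
# e.g. recover the original label behind numeric code 3
data_label %>% filter(class_factor == 3) %>% pull(class)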
2.3.2 Train Test Split
set.seed(43)
data_split <- initial_split(data = data, prop = 4/5)
data_train <- training(data_split)
data_test <- testing(data_split)
data_train
2.3.3 Wrapper to Generate Models and Metrics
formula <- class ~ mcg + gvh + aac + alm1 + alm2

train_model <- function(model.name, data.train, data.test, method, ...){
  time_in <- proc.time()
  model <- train(formula,
                 data = data.train,
                 method = method,
                 ...)
  model_pred <- predict(model, data.test)
  time_out <- proc.time()
  elapsed_time <- time_out - time_in

  # Metrics: the class label has more than two levels, so pROC's
  # multiclass AUC is needed (roc() only accepts binary responses)
  roc_auc <- multiclass.roc(data.test$class, factor(model_pred, ordered = TRUE)) %>%
    auc() %>%
    as.numeric()
  # Use the data.test argument, not the global data_test
  model_cm <- confusionMatrix(model_pred, data.test$class)

  # Average the per-class metrics, then append AUC, accuracy and elapsed time
  model_metrics <- model_cm$byClass %>%
    as.data.frame() %>%
    rownames_to_column('Class') %>%
    gather(key = 'Metrics', value = 'Value', -Class) %>%
    group_by(Metrics) %>%
    summarise(Value = mean(Value, na.rm = TRUE)) %>%
    add_row(Metrics = '_ROC_AUC', Value = roc_auc) %>%
    add_row(Metrics = '_Accuracy', Value = model_cm$overall[1]) %>%
    add_row(Metrics = '_Elapsed_Time', Value = round(elapsed_time[[3]], 2)) %>%
    cbind(Model_Name = model.name) %>%
    dplyr::select(Model_Name, everything()) %>%
    arrange(Metrics)
  return(model_metrics)
}
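The base-model runs referenced later in the comparison table (nb_base, dt_base) are not shown in this rendering; presumably they call the wrapper without a trControl argument, e.g.:
set.seed(43)
nb_base <- train_model(model.name = 'NB_Base',
                       data.train = data_train,
                       data.test = data_test,
                       method = 'nb')

set.seed(43)
dt_base <- train_model(model.name = 'DT_Base',
                       data.train = data_train,
                       data.test = data_test,
                       method = 'rpart')
nb_base
dt_base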
2.4 Step 3
Start with the original dataset and set a seed (43). Then run cross-validations of 5 and 10 folds of the model on the training set. Determine the same set of metrics and compare the cv_metrics with the base_metric. Note down as best as you can the development (engineering) cost as well as the computing cost (elapsed time).
2.4.1 Naive Bayes
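The cross-validation chunks are not shown in this rendering; presumably they mirror the bootstrap calls in Step 4 with trainControl(method = 'cv'). A sketch of the runs producing nb_5cv and nb_10cv:
set.seed(43)
nb_5cv <- train_model(model.name = 'NB_5Fold_CV',
                      data.train = data_train,
                      data.test = data_test,
                      method = 'nb',
                      trControl = trainControl(method = 'cv',
                                               number = 5,
                                               savePredictions = 'final'))
set.seed(43)
nb_10cv <- train_model(model.name = 'NB_10Fold_CV',
                       data.train = data_train,
                       data.test = data_test,
                       method = 'nb',
                       trControl = trainControl(method = 'cv',
                                                number = 10,
                                                savePredictions = 'final'))
nb_5cv
nb_10cv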
2.4.2 Decision Tree
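Likewise for the decision tree, a sketch of the runs producing dt_5cv and dt_10cv:
set.seed(43)
dt_5cv <- train_model(model.name = 'DT_5Fold_CV',
                      data.train = data_train,
                      data.test = data_test,
                      method = 'rpart',
                      trControl = trainControl(method = 'cv',
                                               number = 5,
                                               savePredictions = 'final'))
set.seed(43)
dt_10cv <- train_model(model.name = 'DT_10Fold_CV',
                       data.train = data_train,
                       data.test = data_test,
                       method = 'rpart',
                       trControl = trainControl(method = 'cv',
                                                number = 10,
                                                savePredictions = 'final'))
dt_5cv
dt_10cv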
2.5 Step 4
Start with the original dataset and set a seed (43). Then run a bootstrap of 200 resamples and compute the same set of metrics, and for each of the two classifiers build a three-column table for each experiment (base, bootstrap, cross-validated). Note down as best as you can the development (engineering) cost as well as the computing cost (elapsed time).
2.5.1 Naive Bayes
2.5.2 Bootstrap with 200 resamples
set.seed(43)
nb_bs_200 <- train_model(model.name = 'NB_Bootstrap_200',
                         data.train = data_train,
                         data.test = data_test,
                         method = 'nb',
                         trControl = trainControl(method = 'boot',
                                                  number = 200,
                                                  # saves the predictions for the optimal tuning parameters
                                                  savePredictions = 'final'))
nb_bs_200
2.5.3 Decision Tree
2.5.4 Bootstrap with 200 resamples
set.seed(43)
dt_bs_200 <- train_model(model.name = 'DT_Bootstrap_200',
                         data.train = data_train,
                         data.test = data_test,
                         method = 'rpart',
                         trControl = trainControl(method = 'boot',
                                                  number = 200,
                                                  # saves the predictions for the optimal tuning parameters
                                                  savePredictions = 'final'))
dt_bs_200
3 Part B
For the same dataset, set a seed (43) and split 80/20. Using randomForest, grow three different forests, varying the number of trees at least three times. Start with seeding and a fresh split for each forest. Note down as best as you can the development (engineering) cost as well as the computing cost (elapsed time) for each run, and compare these results with the experiment in Part A. Submit a pdf and an executable script in Python or R.
3.1 Random Forest n = 10
set.seed(43)
rf_10 <- train_model(model.name = 'RF_10',
                     data.train = data_train,
                     data.test = data_test,
                     method = 'rf',
                     ntree = 10)
rf_10
3.2 Random Forest n = 30
set.seed(43)
rf_30 <- train_model(model.name = 'RF_30',
                     data.train = data_train,
                     data.test = data_test,
                     method = 'rf',
                     ntree = 30)
rf_30
3.3 Random Forest n = 50
set.seed(43)
rf_50 <- train_model(model.name = 'RF_50',
                     data.train = data_train,
                     data.test = data_test,
                     method = 'rf',
                     ntree = 50)
rf_50
3.4 Random Forest n = 70
set.seed(43)
rf_70 <- train_model(model.name = 'RF_70',
                     data.train = data_train,
                     data.test = data_test,
                     method = 'rf',
                     ntree = 70)
rf_70
3.5 Comparison with Part A
In general, Naive Bayes from Part A has better performance in terms of Accuracy and Elapsed_Time when cross validation is applied, and matches the AUC of Random Forest in Part B. On the other hand, although Decision Tree from Part A has the smallest elapsed time when cross validation is applied, it has the worst performance in terms of both Accuracy and AUC among all three types of models.
nb_base %>%
  rbind(nb_5cv, nb_10cv, nb_bs_200,
        dt_base, dt_5cv, dt_10cv, dt_bs_200,
        rf_10, rf_30, rf_50, rf_70) %>%
  spread(key = Metrics, value = Value) %>%
  kable()
Model_Name | _Accuracy | _Elapsed_Time | _ROC_AUC | Balanced Accuracy | Detection Prevalence | Detection Rate | F1 | Neg Pred Value | Pos Pred Value | Precision | Prevalence | Recall | Sensitivity | Specificity |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
NB_Base | 0.8955224 | 1.88 | 0.9666667 | 0.8551378 | 0.125 | 0.1119403 | 0.8847985 | 0.9799227 | 0.8983281 | 0.7486068 | 0.125 | 0.7301587 | 0.7301587 | 0.9832220 |
NB_5Fold_CV | 0.8805970 | 0.64 | 0.9666667 | 0.8393074 | 0.125 | 0.1100746 | 0.8709524 | 0.9773733 | 0.9231766 | 0.6594119 | 0.125 | 0.6992063 | 0.6992063 | 0.9808250 |
NB_10Fold_CV | 0.8955224 | 0.88 | 0.9666667 | 0.8551378 | 0.125 | 0.1119403 | 0.8847985 | 0.9799227 | 0.8983281 | 0.7486068 | 0.125 | 0.7301587 | 0.7301587 | 0.9832220 |
NB_Bootstrap_200 | 0.8955224 | 11.91 | 0.9666667 | 0.8551378 | 0.125 | 0.1119403 | 0.8847985 | 0.9799227 | 0.8983281 | 0.7486068 | 0.125 | 0.7301587 | 0.7301587 | 0.9832220 |
DT_Base | 0.7014925 | 0.82 | 0.8592593 | 0.6947640 | 0.125 | 0.0876866 | 0.7624521 | 0.9430912 | 0.6764133 | 0.6764133 | 0.125 | 0.4537037 | 0.4537037 | 0.9518683 |
DT_5Fold_CV | 0.7014925 | 0.53 | 0.8592593 | 0.6947640 | 0.125 | 0.0876866 | 0.7624521 | 0.9430912 | 0.6764133 | 0.6764133 | 0.125 | 0.4537037 | 0.4537037 | 0.9518683 |
DT_10Fold_CV | 0.7014925 | 0.59 | 0.8592593 | 0.6947640 | 0.125 | 0.0876866 | 0.7624521 | 0.9430912 | 0.6764133 | 0.6764133 | 0.125 | 0.4537037 | 0.4537037 | 0.9518683 |
DT_Bootstrap_200 | 0.7014925 | 5.17 | 0.8592593 | 0.6947640 | 0.125 | 0.0876866 | 0.7624521 | 0.9430912 | 0.6764133 | 0.6764133 | 0.125 | 0.4537037 | 0.4537037 | 0.9518683 |
RF_10 | 0.8507463 | 0.91 | 0.9666667 | 0.8064502 | 0.125 | 0.1063433 | 0.8022401 | 0.9723532 | 0.7402778 | 0.7402778 | 0.125 | 0.6460317 | 0.6460317 | 0.9751514 |
RF_30 | 0.8656716 | 0.98 | 0.9666667 | 0.8268606 | 0.125 | 0.1082090 | 0.8390842 | 0.9742713 | 0.7263845 | 0.7263845 | 0.125 | 0.6825397 | 0.6825397 | 0.9783861 |
RF_50 | 0.8656716 | 1.19 | 0.9666667 | 0.8268606 | 0.125 | 0.1082090 | 0.8390842 | 0.9742713 | 0.7263845 | 0.7263845 | 0.125 | 0.6825397 | 0.6825397 | 0.9783861 |
RF_70 | 0.8507463 | 1.34 | 0.9666667 | 0.8128725 | 0.125 | 0.1063433 | 0.8144887 | 0.9717111 | 0.7212121 | 0.7212121 | 0.125 | 0.6587302 | 0.6587302 | 0.9752611 |
4 Part C
Include a summary of your findings. Which of the two methods, bootstrap vs. cv, do you recommend to your customer? And why? Be elaborate, including computing costs, engineering costs and model performance. Did you incorporate Pareto's maxim or the Razor, and how did these two heuristics influence your decision?
Three types of classifiers, Naive Bayes, Decision Tree and Random Forest, are built and their performance metrics are compared side by side. For each type of model, a base model (the plain application of the model), cross validation and bootstrap approaches are applied.
From the comparison between Part A and Part B of this project, Naive Bayes shows the best performance among the three types of models in terms of Accuracy and Elapsed Time, while matching Random Forest on AUC. On the other hand, Decision Tree has generally the worst performance in terms of Accuracy and AUC, although it has the smallest Elapsed Time when using cross validation and bootstrap.
Comparing cross validation and bootstrap, according to the metrics in Part A, both produce essentially the same Accuracy as the base model, but bootstrap consumes significantly more training time (for Naive Bayes, 11.91 s for the 200-resample bootstrap versus 0.64-0.88 s for cross validation). Therefore I would recommend cross validation to the customer.