library(caret)
library(pROC)
library(tidyverse)

Part A

Step 0

Pick any two classifiers from (SVM, Logistic Regression, Decision Tree, Naive Bayes). Pick the heart or ecoli dataset. Heart is simpler; ecoli compounds the problem because it is NOT a balanced dataset. From a grading perspective, both carry the same weight.

heart <- read.csv('https://raw.githubusercontent.com/mkivenson/Machine-Learning-Big-Data/master/heart.csv', header = TRUE, stringsAsFactors = FALSE)

heart$target <- as.factor(heart$target)
heart$sex <- as.factor(heart$sex)
heart$cp <- as.factor(heart$cp)
heart$fbs <- as.factor(heart$fbs)
heart$restecg <- as.factor(heart$restecg)
heart$exang <- as.factor(heart$exang)
heart$slope <- as.factor(heart$slope)
heart$ca <- as.factor(heart$ca)
heart$thal <- as.factor(heart$thal)
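
Since the tidyverse is already loaded, the block of as.factor() conversions above can equivalently be written in one step (this assumes dplyr >= 1.0 for across(); on older versions mutate_at() is the equivalent):

# equivalent one-step conversion of the categorical columns
heart <- heart %>%
  mutate(across(c(target, sex, cp, fbs, restecg, exang, slope, ca, thal), as.factor))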

Step 1

For each classifier, set a seed (43).

Step 2

Do an 80/20 split and determine the Accuracy, AUC, and as many metrics as returned by the caret package (confusionMatrix). Call this the base_metric. Note down, as best as you can, the development (engineering) cost as well as the computing cost (elapsed time).

# do an 80/20 split
set.seed(43)
smp_size <- floor(0.8 * nrow(heart))
train_ind <- sample(seq_len(nrow(heart)), size = smp_size)
train_heart <- heart[ train_ind,]
test_heart  <- heart[-train_ind,]
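
Because sample() draws rows without regard to class balance, an alternative worth noting is caret's createDataPartition(), which stratifies the split on the outcome (more relevant for an imbalanced set like ecoli). A minimal sketch, not used for the results below:

# optional: stratified 80/20 split on the outcome
set.seed(43)
strat_ind <- createDataPartition(heart$target, p = 0.8, list = FALSE)
train_strat <- heart[strat_ind, ]
test_strat  <- heart[-strat_ind, ]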

Function

The following function will train and score models as needed for the base models, bootstrapped models, cross-validated models, and random forest trees. Note that it references the heart, train_heart, and test_heart objects and the target column directly, so reusing it on another dataset requires substituting those names.

eval_model <- function(train_method, tr, model_name){
  # begin timer
  ptm <- proc.time()
  
  # set seed
  set.seed(43)
  
  # FOR BASE MODEL
  if (grepl("Base", model_name, fixed=TRUE)) {
    # train model
    dt_model <- train(
      form = target ~ .,
      data = train_heart,
      trControl = tr,
      method = train_method
    )
    print(dt_model)
    
    # evaluate model
    model_cm <- confusionMatrix(predict(dt_model, subset(test_heart, select = -c(target))), test_heart$target)
    
    print(paste(model_name, 'results'))
    print(model_cm)
    
    # end timer
    elapsed_time <- (proc.time() - ptm)[[3]]
  
    # determine the Accuracy, AUC and as many metrics as returned by the Caret package (confusionMatrix).
    # store results
    accuracy <- model_cm$overall[[1]]
    # AUC from hard class labels gives a single-threshold ROC; see the note after this function
    auc_val <- as.numeric(auc(roc(test_heart$target, factor(predict(dt_model, test_heart), ordered = TRUE))))
    sensitivity <- model_cm$byClass[[1]]
    specificity <- model_cm$byClass[[2]]
    precision <- model_cm$byClass[[5]]
    recall <- model_cm$byClass[[6]]
    f1 <- model_cm$byClass[[7]]
    
  # FOR BOOTSTRAP MODEL
  } else if (grepl("Boot", model_name, fixed=TRUE)){
    
    
    dt_model = train(
    form = target ~ .,
    data = heart,
    trControl = tr,
    method = train_method
    )
    
    # end timer
    elapsed_time <- (proc.time() - ptm)[[3]]
    
    accuracy <- c()
    auc_val <- c()
    sensitivity <- c()
    specificity <- c()
    precision <- c()
    recall <- c()
    f1 <- c()
    i <- 1
    
    pred_df <- dt_model$pred
    for (resample in unique(pred_df$Resample)){
      temp <- filter(pred_df, Resample == resample)
      model_cm <- confusionMatrix(temp$pred, temp$obs)
      accuracy[i] <- model_cm$overall[[1]]
      # roc() expects the observed response first, then a numeric predictor
      auc_val[[i]] <- as.numeric(auc(roc(temp$obs, as.numeric(temp$pred))))
      sensitivity[[i]] <- model_cm$byClass[[1]]
      specificity[[i]] <- model_cm$byClass[[2]]
      precision[[i]] <- model_cm$byClass[[5]]
      recall[[i]] <- model_cm$byClass[[6]]
      f1[[i]] <- model_cm$byClass[[7]]
      i <- i + 1
    }
  
    accuracy <- mean(accuracy)
    auc_val <- mean(auc_val)
    sensitivity <- mean(sensitivity)
    specificity <- mean(specificity)
    precision <- mean(precision)
    recall <- mean(recall)
    f1 <- mean(f1)
    
    
  # FOR RANDOM FOREST
  } else if (grepl("RF", model_name, fixed=TRUE)){
    # train model; ntree is parsed from the last two characters of model_name
    dt_model <- train(
      form = target ~ .,
      data = train_heart,
      trControl = tr,
      ntree = as.numeric(str_sub(model_name, start = -2)),
      method = train_method
    )
    print(dt_model)
    
    # evaluate model
    model_cm <- confusionMatrix(predict(dt_model, subset(test_heart, select = -c(target))), test_heart$target)
    
    print(paste(model_name, 'results'))
    print(model_cm)
    
    # end timer
    elapsed_time <- (proc.time() - ptm)[[3]]
  
    # determine the Accuracy, AUC and as many metrics as returned by the Caret package (confusionMatrix).
    # store results
    accuracy <- model_cm$overall[[1]]
    auc_val <- as.numeric(auc(roc(test_heart$target, factor(predict(dt_model, test_heart), ordered = TRUE))))
    sensitivity <- model_cm$byClass[[1]]
    specificity <- model_cm$byClass[[2]]
    precision <- model_cm$byClass[[5]]
    recall <- model_cm$byClass[[6]]
    f1 <- model_cm$byClass[[7]]
    
    
  # FOR CROSS VALIDATION
  } else {
    
  # train on the full dataset; caret holds each fold out internally
  dt_model <- train(
    form = target ~ .,
    data = heart,
    trControl = tr,
    method = train_method
  )

  print(dt_model)
  # confusion matrix over the out-of-fold predictions, reordered to match heart
  model_cm <- confusionMatrix(dt_model$pred[order(dt_model$pred$rowIndex),]$pred, heart$target)
  
  print(paste(model_name, 'results'))
  print(model_cm)
  
  # end timer
  elapsed_time <- (proc.time() - ptm)[[3]]

  # determine the Accuracy, AUC and as many metrics as returned by the Caret package (confusionMatrix).
  # store results
  accuracy <- model_cm$overall[[1]]
  # note: this AUC scores the 20% test rows with a model trained on all of heart,
  # so the cross-validated AUC values are likely optimistic
  auc_val <- as.numeric(auc(roc(test_heart$target, factor(predict(dt_model, test_heart), ordered = TRUE))))
  sensitivity <- model_cm$byClass[[1]]
  specificity <- model_cm$byClass[[2]]
  precision <- model_cm$byClass[[5]]
  recall <- model_cm$byClass[[6]]
  f1 <- model_cm$byClass[[7]]
  
  
  }
  
  full_results <- rbind(accuracy,
                        auc_val,
                        sensitivity,
                        specificity,
                        precision,
                        recall,
                        f1,
                        elapsed_time)
  colnames(full_results) <- c(model_name)
  return(full_results)
  
}
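
A note on the AUC values computed above: they are derived from hard class labels, which yields a ROC curve with a single operating point. The usual approach scores the positive-class probabilities instead. A minimal sketch, mirroring the base rpart call above (it assumes predict(type = "prob") support for the chosen method and uses a copy of the data with syntactically valid class names, which caret requires when classProbs = TRUE; it is not used for the results below):

# sketch: AUC from predicted class probabilities rather than hard labels
heart2 <- heart
levels(heart2$target) <- c("no", "yes")  # classProbs = TRUE needs valid R names
set.seed(43)
idx <- sample(seq_len(nrow(heart2)), size = floor(0.8 * nrow(heart2)))
prob_model <- train(target ~ ., data = heart2[idx, ], method = "rpart",
                    trControl = trainControl(method = "none", classProbs = TRUE))
probs <- predict(prob_model, heart2[-idx, ], type = "prob")[, "yes"]
as.numeric(auc(roc(heart2$target[-idx], probs)))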

Base Model Performance

Decision Tree

# DECISION TREE MODEL - BASE METRIC
dt_base <- eval_model("rpart", trainControl(method="none"), "DT Base")
## CART 
## 
## 242 samples
##  13 predictor
##   2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: None 
## [1] "DT Base results"
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0  0  0
##          1 27 34
##                                           
##                Accuracy : 0.5574          
##                  95% CI : (0.4245, 0.6845)
##     No Information Rate : 0.5574          
##     P-Value [Acc > NIR] : 0.5531          
##                                           
##                   Kappa : 0               
##                                           
##  Mcnemar's Test P-Value : 5.624e-07       
##                                           
##             Sensitivity : 0.0000          
##             Specificity : 1.0000          
##          Pos Pred Value :    NaN          
##          Neg Pred Value : 0.5574          
##              Prevalence : 0.4426          
##          Detection Rate : 0.0000          
##    Detection Prevalence : 0.0000          
##       Balanced Accuracy : 0.5000          
##                                           
##        'Positive' Class : 0               
## 

Support Vector Machine

# SUPPORT VECTOR MACHINE MODEL - BASE METRIC
svm_base <- eval_model("svmLinear", trainControl(method="none"), "SVM Base")
## Support Vector Machines with Linear Kernel 
## 
## 242 samples
##  13 predictor
##   2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: None 
## [1] "SVM Base results"
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 20  2
##          1  7 32
##                                           
##                Accuracy : 0.8525          
##                  95% CI : (0.7383, 0.9302)
##     No Information Rate : 0.5574          
##     P-Value [Acc > NIR] : 8.993e-07       
##                                           
##                   Kappa : 0.6952          
##                                           
##  Mcnemar's Test P-Value : 0.1824          
##                                           
##             Sensitivity : 0.7407          
##             Specificity : 0.9412          
##          Pos Pred Value : 0.9091          
##          Neg Pred Value : 0.8205          
##              Prevalence : 0.4426          
##          Detection Rate : 0.3279          
##    Detection Prevalence : 0.3607          
##       Balanced Accuracy : 0.8410          
##                                           
##        'Positive' Class : 0               
## 
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

5-Fold Cross Validation

Start with the original dataset and set a seed (43). Then run 5-fold and 10-fold cross validation of the model on the training set. Determine the same set of metrics and compare the cv_metrics with the base_metric. Note down, as best as you can, the development (engineering) cost as well as the computing cost (elapsed time).

Decision Tree

# DECISION TREE MODEL - 5 Cross Validation Folds
dt_5cv <- eval_model("rpart", tr = trainControl(method = "cv", number = 5, savePredictions = 'final'), "DT 5 cv")
## CART 
## 
## 303 samples
##  13 predictor
##   2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 243, 242, 242, 243, 242 
## Resampling results across tuning parameters:
## 
##   cp          Accuracy   Kappa    
##   0.03623188  0.7621311  0.5186339
##   0.03985507  0.7391803  0.4718201
##   0.48550725  0.6400000  0.2408988
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.03623188.
## [1] "DT 5 cv results"
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0  98  32
##          1  40 133
##                                           
##                Accuracy : 0.7624          
##                  95% CI : (0.7104, 0.8092)
##     No Information Rate : 0.5446          
##     P-Value [Acc > NIR] : 3.29e-15        
##                                           
##                   Kappa : 0.5187          
##                                           
##  Mcnemar's Test P-Value : 0.4094          
##                                           
##             Sensitivity : 0.7101          
##             Specificity : 0.8061          
##          Pos Pred Value : 0.7538          
##          Neg Pred Value : 0.7688          
##              Prevalence : 0.4554          
##          Detection Rate : 0.3234          
##    Detection Prevalence : 0.4290          
##       Balanced Accuracy : 0.7581          
##                                           
##        'Positive' Class : 0               
## 
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

Support Vector Machine

# SUPPORT VECTOR MACHINE MODEL - 5 Cross Validation Folds
svm_5cv <- eval_model("svmLinear", tr = trainControl(method = "cv", number = 5, savePredictions = 'final'), "SVM 5 cv")
## Support Vector Machines with Linear Kernel 
## 
## 303 samples
##  13 predictor
##   2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 243, 242, 242, 243, 242 
## Resampling results:
## 
##   Accuracy  Kappa    
##   0.835082  0.6664592
## 
## Tuning parameter 'C' was held constant at a value of 1
## [1] "SVM 5 cv results"
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 110  22
##          1  28 143
##                                          
##                Accuracy : 0.835          
##                  95% CI : (0.7883, 0.875)
##     No Information Rate : 0.5446         
##     P-Value [Acc > NIR] : <2e-16         
##                                          
##                   Kappa : 0.6661         
##                                          
##  Mcnemar's Test P-Value : 0.4795         
##                                          
##             Sensitivity : 0.7971         
##             Specificity : 0.8667         
##          Pos Pred Value : 0.8333         
##          Neg Pred Value : 0.8363         
##              Prevalence : 0.4554         
##          Detection Rate : 0.3630         
##    Detection Prevalence : 0.4356         
##       Balanced Accuracy : 0.8319         
##                                          
##        'Positive' Class : 0              
## 
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

10-Fold Cross Validation

Decision Tree

# DECISION TREE MODEL - 10 Cross Validation Folds
dt_10cv <- eval_model("rpart", trainControl(method = "cv", number = 10, savePredictions = 'final'), "DT 10 cv")
## CART 
## 
## 303 samples
##  13 predictor
##   2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 273, 272, 273, 273, 273, 273, ... 
## Resampling results across tuning parameters:
## 
##   cp          Accuracy   Kappa    
##   0.03623188  0.7493548  0.4924633
##   0.03985507  0.7493548  0.4917245
##   0.48550725  0.6560215  0.2761508
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.03985507.
## [1] "DT 10 cv results"
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0  96  34
##          1  42 131
##                                          
##                Accuracy : 0.7492         
##                  95% CI : (0.6964, 0.797)
##     No Information Rate : 0.5446         
##     P-Value [Acc > NIR] : 1.515e-13      
##                                          
##                   Kappa : 0.4919         
##                                          
##  Mcnemar's Test P-Value : 0.422          
##                                          
##             Sensitivity : 0.6957         
##             Specificity : 0.7939         
##          Pos Pred Value : 0.7385         
##          Neg Pred Value : 0.7572         
##              Prevalence : 0.4554         
##          Detection Rate : 0.3168         
##    Detection Prevalence : 0.4290         
##       Balanced Accuracy : 0.7448         
##                                          
##        'Positive' Class : 0              
## 
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

Support Vector Machine

# SUPPORT VECTOR MACHINE MODEL - 10 Cross Validation Folds
svm_10cv <- eval_model("svmLinear", trainControl(method = "cv", number = 10, savePredictions = 'final'), "SVM 10 cv")
## Support Vector Machines with Linear Kernel 
## 
## 303 samples
##  13 predictor
##   2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 273, 272, 273, 273, 273, 273, ... 
## Resampling results:
## 
##   Accuracy   Kappa   
##   0.8383871  0.671804
## 
## Tuning parameter 'C' was held constant at a value of 1
## [1] "SVM 10 cv results"
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 107  18
##          1  31 147
##                                           
##                Accuracy : 0.8383          
##                  95% CI : (0.7919, 0.8779)
##     No Information Rate : 0.5446          
##     P-Value [Acc > NIR] : < 2e-16         
##                                           
##                   Kappa : 0.6714          
##                                           
##  Mcnemar's Test P-Value : 0.08648         
##                                           
##             Sensitivity : 0.7754          
##             Specificity : 0.8909          
##          Pos Pred Value : 0.8560          
##          Neg Pred Value : 0.8258          
##              Prevalence : 0.4554          
##          Detection Rate : 0.3531          
##    Detection Prevalence : 0.4125          
##       Balanced Accuracy : 0.8331          
##                                           
##        'Positive' Class : 0               
## 
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

Bootstrap

Start with the original dataset and set a seed (43). Then run a bootstrap of 200 resamples.

Decision Tree

# DECISION TREE MODEL - BOOTSTRAP
dt_bt <- eval_model("rpart", trainControl(method="boot", number=200, savePredictions = 'final', returnResamp = 'final'), "DT Bootstrap")
print(dt_bt)
##              DT Bootstrap
## accuracy        0.7420239
## auc_val         0.7458831
## sensitivity     0.7039119
## specificity     0.7768658
## precision       0.7329613
## recall          0.7039119
## f1              0.7115944
## elapsed_time    4.6400000

Support Vector Machine

set.seed(43)

# SUPPORT VECTOR MACHINE MODEL - BOOTSTRAP
svm_bt <- eval_model("svmLinear", trainControl(method="boot", number=200, savePredictions = 'final', returnResamp = 'final'), "SVM Bootstrap")
print(svm_bt)
##              SVM Bootstrap
## accuracy         0.8233320
## auc_val          0.8254587
## sensitivity      0.7807475
## specificity      0.8612693
## precision        0.8290195
## recall           0.7807475
## f1               0.8016091
## elapsed_time    13.0000000

Results

Compute the same set of metrics for each of the two classifiers and build a three-column table for each experiment (base, bootstrap, cross-validated). Note down, as best as you can, the development (engineering) cost as well as the computing cost (elapsed time).

data.frame(cbind(dt_base, dt_5cv, dt_10cv, dt_bt, svm_base, svm_5cv, svm_10cv, svm_bt))
##               DT.Base   DT.5.cv  DT.10.cv DT.Bootstrap  SVM.Base  SVM.5.cv
## accuracy     0.557377 0.7623762 0.7491749    0.7420239 0.8524590 0.8349835
## auc_val      0.500000 0.7854031 0.7412854    0.7458831 0.8409586 0.8300654
## sensitivity  0.000000 0.7101449 0.6956522    0.7039119 0.7407407 0.7971014
## specificity  1.000000 0.8060606 0.7939394    0.7768658 0.9411765 0.8666667
## precision          NA 0.7538462 0.7384615    0.7329613 0.9090909 0.8333333
## recall       0.000000 0.7101449 0.6956522    0.7039119 0.7407407 0.7971014
## f1                 NA 0.7313433 0.7164179    0.7115944 0.8163265 0.8148148
## elapsed_time 0.610000 0.7100000 0.8300000    4.6400000 1.0400000 1.1700000
##              SVM.10.cv SVM.Bootstrap
## accuracy     0.8382838     0.8233320
## auc_val      0.8300654     0.8254587
## sensitivity  0.7753623     0.7807475
## specificity  0.8909091     0.8612693
## precision    0.8560000     0.8290195
## recall       0.7753623     0.7807475
## f1           0.8136882     0.8016091
## elapsed_time 0.7600000    13.0000000

Part B

For the same dataset, set a seed (43) and split 80/20.

# do an 80/20 split
set.seed(43)
smp_size <- floor(0.8 * nrow(heart))
train_ind <- sample(seq_len(nrow(heart)), size = smp_size)
train_heart <- heart[ train_ind,]
test_heart  <- heart[-train_ind,]

Using randomForest, grow three different forests, varying the number of trees at least three times. Start with seeding and a fresh split for each forest. Note down, as best as you can, the development (engineering) cost as well as the computing cost (elapsed time) for each run, and compare these results with the experiment in Part A. Submit a PDF and an executable script in Python or R.
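
The runs below go through eval_model, which passes ntree to caret's "rf" method. For reference, a forest can also be grown by calling randomForest directly; a minimal sketch of a single run (it assumes the randomForest package, which caret's "rf" method wraps, and reuses the split above):

# sketch: one forest grown directly with randomForest (caret's "rf" wraps this)
library(randomForest)
set.seed(43)
rf_direct <- randomForest(target ~ ., data = train_heart, ntree = 10)
confusionMatrix(predict(rf_direct, test_heart), test_heart$target)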

Random Forest with 10 Trees

rf_10 <- eval_model("rf", trainControl(), "RF 10")
## Random Forest 
## 
## 242 samples
##  13 predictor
##   2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 242, 242, 242, 242, 242, 242, ... 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##    2    0.7792646  0.5549714
##   12    0.7551120  0.5039891
##   22    0.7556300  0.5050744
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
## [1] "RF 10 results"
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 18  1
##          1  9 33
##                                           
##                Accuracy : 0.8361          
##                  95% CI : (0.7191, 0.9185)
##     No Information Rate : 0.5574          
##     P-Value [Acc > NIR] : 3.844e-06       
##                                           
##                   Kappa : 0.6573          
##                                           
##  Mcnemar's Test P-Value : 0.02686         
##                                           
##             Sensitivity : 0.6667          
##             Specificity : 0.9706          
##          Pos Pred Value : 0.9474          
##          Neg Pred Value : 0.7857          
##              Prevalence : 0.4426          
##          Detection Rate : 0.2951          
##    Detection Prevalence : 0.3115          
##       Balanced Accuracy : 0.8186          
##                                           
##        'Positive' Class : 0               
## 
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

Random Forest with 50 Trees

rf_50 <- eval_model("rf", trainControl(), "RF 50")
## Random Forest 
## 
## 242 samples
##  13 predictor
##   2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 242, 242, 242, 242, 242, 242, ... 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##    2    0.7972460  0.5896982
##   12    0.7651698  0.5239571
##   22    0.7512177  0.4950400
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
## [1] "RF 50 results"
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 19  2
##          1  8 32
##                                           
##                Accuracy : 0.8361          
##                  95% CI : (0.7191, 0.9185)
##     No Information Rate : 0.5574          
##     P-Value [Acc > NIR] : 3.844e-06       
##                                           
##                   Kappa : 0.66            
##                                           
##  Mcnemar's Test P-Value : 0.1138          
##                                           
##             Sensitivity : 0.7037          
##             Specificity : 0.9412          
##          Pos Pred Value : 0.9048          
##          Neg Pred Value : 0.8000          
##              Prevalence : 0.4426          
##          Detection Rate : 0.3115          
##    Detection Prevalence : 0.3443          
##       Balanced Accuracy : 0.8224          
##                                           
##        'Positive' Class : 0               
## 
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

Random Forest with 99 Trees

rf_99 <- eval_model("rf", trainControl(), "RF 99")
## Random Forest 
## 
## 242 samples
##  13 predictor
##   2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 242, 242, 242, 242, 242, 242, ... 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##    2    0.7968746  0.5887862
##   12    0.7688465  0.5307189
##   22    0.7515108  0.4957248
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
## [1] "RF 99 results"
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 20  2
##          1  7 32
##                                           
##                Accuracy : 0.8525          
##                  95% CI : (0.7383, 0.9302)
##     No Information Rate : 0.5574          
##     P-Value [Acc > NIR] : 8.993e-07       
##                                           
##                   Kappa : 0.6952          
##                                           
##  Mcnemar's Test P-Value : 0.1824          
##                                           
##             Sensitivity : 0.7407          
##             Specificity : 0.9412          
##          Pos Pred Value : 0.9091          
##          Neg Pred Value : 0.8205          
##              Prevalence : 0.4426          
##          Detection Rate : 0.3279          
##    Detection Prevalence : 0.3607          
##       Balanced Accuracy : 0.8410          
##                                           
##        'Positive' Class : 0               
## 
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

Random Forest Results

All three random forest models performed well: better than the decision tree models from Part A and comparable with the SVM models. However, the computation time for the random forest models is roughly double that of the SVM models.

data.frame(cbind(rf_10, rf_50, rf_99))
##                  RF.10     RF.50     RF.99
## accuracy     0.8360656 0.8360656 0.8524590
## auc_val      0.8186275 0.8077342 0.8409586
## sensitivity  0.6666667 0.7037037 0.7407407
## specificity  0.9705882 0.9411765 0.9411765
## precision    0.9473684 0.9047619 0.9090909
## recall       0.6666667 0.7037037 0.7407407
## f1           0.7826087 0.7916667 0.8163265
## elapsed_time 1.2500000 1.7900000 2.9200000

Part C

The dataframe at the end of this section shows a comprehensive summary of all models created in this exercise.

I would recommend cross validation over bootstrapping because it was less computationally expensive and yielded comparable results. Bootstrapping also had a higher development cost, as it required aggregating 200 resamples to calculate performance metrics.

Analysis

SVM Models

All of the SVM models yielded high accuracy, AUC, and F1 metrics. As suggested by the Pareto principle, the base SVM model does the majority of the work. However, the 5-fold cross-validated model has comparable results with only a slightly higher (yet still low) elapsed time; since cross validation stabilizes a model's results, the cross-validated model is preferable. The SVM 10-fold CV model does not add much accuracy or stability, so Occam's razor suggests using the simpler 5-fold CV model.

Decision Trees

The base decision tree model did not perform well; however, applying cross validation helped with parameter selection. 5-fold cross validation yields better results than 10-fold cross validation, and in less elapsed time. Per Occam's razor, the simpler solution (5-fold CV) should be used; there is no added benefit to using 10-fold CV.

Random Forest Trees

The RF model with 99 trees had the same performance as the base SVM model, but required more computational time. Therefore, the simpler model (SVM) has the advantage according to Occam’s razor.

data.frame(cbind(dt_base, dt_5cv, dt_10cv, dt_bt, svm_base, svm_5cv, svm_10cv, svm_bt, rf_10, rf_50, rf_99))
##               DT.Base   DT.5.cv  DT.10.cv DT.Bootstrap  SVM.Base  SVM.5.cv
## accuracy     0.557377 0.7623762 0.7491749    0.7420239 0.8524590 0.8349835
## auc_val      0.500000 0.7854031 0.7412854    0.7458831 0.8409586 0.8300654
## sensitivity  0.000000 0.7101449 0.6956522    0.7039119 0.7407407 0.7971014
## specificity  1.000000 0.8060606 0.7939394    0.7768658 0.9411765 0.8666667
## precision          NA 0.7538462 0.7384615    0.7329613 0.9090909 0.8333333
## recall       0.000000 0.7101449 0.6956522    0.7039119 0.7407407 0.7971014
## f1                 NA 0.7313433 0.7164179    0.7115944 0.8163265 0.8148148
## elapsed_time 0.610000 0.7100000 0.8300000    4.6400000 1.0400000 1.1700000
##              SVM.10.cv SVM.Bootstrap     RF.10     RF.50     RF.99
## accuracy     0.8382838     0.8233320 0.8360656 0.8360656 0.8524590
## auc_val      0.8300654     0.8254587 0.8186275 0.8077342 0.8409586
## sensitivity  0.7753623     0.7807475 0.6666667 0.7037037 0.7407407
## specificity  0.8909091     0.8612693 0.9705882 0.9411765 0.9411765
## precision    0.8560000     0.8290195 0.9473684 0.9047619 0.9090909
## recall       0.7753623     0.7807475 0.6666667 0.7037037 0.7407407
## f1           0.8136882     0.8016091 0.7826087 0.7916667 0.8163265
## elapsed_time 0.7600000    13.0000000 1.2500000 1.7900000 2.9200000