Data 622 HW2: Finding the Best Model
Please refer to the HW2 Document.
1 Load Packages
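The library chunk is not shown in this extract; below is a minimal sketch of the package list implied by the functions used in this report (an assumption inferred from the calls, not the original chunk).
# Assumed package list, inferred from the calls in this report:
library(tidyverse)     # read_csv(), mutate(), rename(), add_row(), spread(), rownames_to_column(), %>%
library(caret)         # train(), trainControl(), confusionMatrix(), createDataPartition()
library(pROC)          # roc(), auc()
library(randomForest)  # backend for caret's method = "rf"
# (caret loads kernlab and rpart on demand for "svmLinear" and "rpart")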
2 Part A
2.1 Step 0
For the following analysis, I picked two classifiers, SVM and Decision Tree, and the heart disease dataset.
2.1.1 Load Data
heart <- read_csv('https://raw.githubusercontent.com/shirley-wong/Data-622/main/HW2/heart.csv') %>%
  mutate(target = as.factor(target))
##
## -- Column specification ----------------------------------------------------------------------------------------------------
## cols(
##   age = col_double(),
##   sex = col_double(),
##   cp = col_double(),
##   trestbps = col_double(),
##   chol = col_double(),
##   fbs = col_double(),
##   restecg = col_double(),
##   thalach = col_double(),
##   exang = col_double(),
##   oldpeak = col_double(),
##   slope = col_double(),
##   ca = col_double(),
##   thal = col_double(),
##   target = col_double()
## )
2.3 Step 2
Do an 80/20 split and determine the Accuracy, AUC, and as many metrics as are returned by the caret package (confusionMatrix). Note down, as best as you can, the development (engineering) cost as well as the computing cost (elapsed time).
2.3.1 Train Test Split
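The split chunk is not shown in this extract; below is a minimal sketch consistent with the 80/20 split, the seed (43), and the heart_train/heart_test names used throughout (createDataPartition and the train_index name are my assumptions, not code from the original).
set.seed(43)
# Assumed approach: stratified 80/20 split on the target
train_index <- createDataPartition(heart$target, p = 0.8, list = FALSE)
heart_train <- heart[train_index, ]
heart_test  <- heart[-train_index, ]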
2.3.2 SVM
set.seed(43)
start_time <- proc.time()
### Train Model
heart_base_svm <- train(target ~ ., data = heart_train,
                        method = "svmLinear"
                        #trControl = train_control,
                        #preProcess = c("center","scale")
                        )
### Make Prediction
heart_base_svm_pred <- predict(heart_base_svm, heart_test)
end_time <- proc.time()
elapsed_time <- end_time - start_time
### Metrics
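# Note: the AUC here is computed from hard class predictions coerced to an
# ordered factor, so it reflects a single operating point rather than a
# probability-based ROC curve (the same pattern is used throughout).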
svm_base_ROC_AUC <- roc(heart_test$target, factor(heart_base_svm_pred, ordered = TRUE)) %>%
  auc() %>%
  as.numeric()
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
svm_base_cm <- confusionMatrix(heart_base_svm_pred, heart_test$target)
svm_Base_Metrics <- svm_base_cm$byClass %>%
  data.frame() %>%
  rownames_to_column('Metrics') %>%
  rename('Value' = '.') %>%
  add_row(Metrics = '**ROC_AUC', Value = svm_base_ROC_AUC) %>%
  add_row(Metrics = '***Accuracy', Value = svm_base_cm$overall[1]) %>%
  add_row(Metrics = '*Elapsed_Time', Value = round(elapsed_time[[3]], 2)) %>%
  arrange(Metrics)
svm_Base_Metrics
2.3.3 Decision Tree
set.seed(43)
start_time <- proc.time()
### Train Model
heart_base_dt <- train(target ~ ., data = heart_train,
                       method = "rpart"
                       #trControl = train_control,
                       #preProcess = c("center","scale")
                       )
### Make Prediction
heart_base_dt_pred <- predict(heart_base_dt, heart_test)
end_time <- proc.time()
elapsed_time <- end_time - start_time
### Metrics
dt_base_ROC_AUC <- roc(heart_test$target, factor(heart_base_dt_pred, ordered = TRUE)) %>%
  auc() %>%
  as.numeric()
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
dt_base_cm <- confusionMatrix(heart_base_dt_pred, heart_test$target)
dt_Base_Metrics <- dt_base_cm$byClass %>%
  data.frame() %>%
  rownames_to_column('Metrics') %>%
  rename('Value' = '.') %>%
  add_row(Metrics = '**ROC_AUC', Value = dt_base_ROC_AUC) %>%
  add_row(Metrics = '***Accuracy', Value = dt_base_cm$overall[1]) %>%
  add_row(Metrics = '*Elapsed_Time', Value = round(elapsed_time[[3]], 2)) %>%
  arrange(Metrics)
dt_Base_Metrics
2.4 Step 3
Start with the original dataset and set a seed (43). Then run 5-fold and 10-fold cross-validation of the model on the training set.
Determine the same set of metrics and compare the CV metrics with the base metrics. Note down, as best as you can, the development (engineering) cost as well as the computing cost (elapsed time).
2.4.1 SVM w/ 5-fold CV
set.seed(43)
start_time <- proc.time()
### Train Model
heart_cv_5_svm <- train(target ~ ., data = heart_train,
                        method = "svmLinear",
                        trControl = trainControl(method = 'cv',
                                                 number = 5,
                                                 savePredictions = 'final')
                        #preProcess = c("center","scale")
                        )
### Make Prediction
heart_cv_5_svm_pred <- predict(heart_cv_5_svm, heart_test)
end_time <- proc.time()
elapsed_time <- end_time - start_time
### Metrics
svm_cv_5_ROC_AUC <- roc(heart_test$target, factor(heart_cv_5_svm_pred, ordered = TRUE)) %>%
  auc() %>%
  as.numeric()
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
svm_cv_5_cm <- confusionMatrix(heart_cv_5_svm_pred, heart_test$target)
svm_cv_5_Metrics <- svm_cv_5_cm$byClass %>%
  data.frame() %>%
  rownames_to_column('Metrics') %>%
  rename('Value' = '.') %>%
  add_row(Metrics = '**ROC_AUC', Value = svm_cv_5_ROC_AUC) %>%
  add_row(Metrics = '***Accuracy', Value = svm_cv_5_cm$overall[1]) %>%
  add_row(Metrics = '*Elapsed_Time', Value = round(elapsed_time[[3]], 2)) %>%
  arrange(Metrics)
svm_cv_5_Metrics
2.4.2 SVM w/ 10-fold CV
set.seed(43)
start_time <- proc.time()
### Train Model
heart_cv_10_svm <- train(target ~ ., data = heart_train,
                         method = "svmLinear",
                         trControl = trainControl(method = 'cv',
                                                  number = 10,
                                                  savePredictions = 'final')
                         #preProcess = c("center","scale")
                         )
### Make Prediction
heart_cv_10_svm_pred <- predict(heart_cv_10_svm, heart_test)
end_time <- proc.time()
elapsed_time <- end_time - start_time
### Metrics
svm_cv_10_ROC_AUC <- roc(heart_test$target, factor(heart_cv_10_svm_pred, ordered = TRUE)) %>%
  auc() %>%
  as.numeric()
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
svm_cv_10_cm <- confusionMatrix(heart_cv_10_svm_pred, heart_test$target)
svm_cv_10_Metrics <- svm_cv_10_cm$byClass %>%
  data.frame() %>%
  rownames_to_column('Metrics') %>%
  rename('Value' = '.') %>%
  add_row(Metrics = '**ROC_AUC', Value = svm_cv_10_ROC_AUC) %>%
  add_row(Metrics = '***Accuracy', Value = svm_cv_10_cm$overall[1]) %>%
  add_row(Metrics = '*Elapsed_Time', Value = round(elapsed_time[[3]], 2)) %>%
  arrange(Metrics)
svm_cv_10_Metrics
2.4.3 SVM w/ Bootstrap
SVM with bootstrap resampling (200 resamples)
set.seed(43)
start_time <- proc.time()
### Train Model
heart_bs_200_svm <- train(target ~ ., data = heart_train,
                          method = "svmLinear",
                          trControl = trainControl(method = 'boot',
                                                   number = 200,
                                                   savePredictions = 'final')
                          #preProcess = c("center","scale")
                          )
### Make Prediction
heart_bs_200_svm_pred <- predict(heart_bs_200_svm, heart_test)
end_time <- proc.time()
elapsed_time <- end_time - start_time
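### Metrics: score each of the 200 bootstrap resamples, then average across resamples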
svm_bs_200_Metrics <- data.frame()
for(res_name in unique(heart_bs_200_svm$pred$Resample)){
  res <- filter(heart_bs_200_svm$pred, Resample == res_name)
  res_cm <- confusionMatrix(res$pred, res$obs)
  res_ROC_AUC <- roc(res$obs, factor(res$pred, ordered = TRUE)) %>%
    auc() %>%
    as.numeric()
  res_Metrics <- res_cm$byClass %>%
    data.frame() %>%
    rownames_to_column('Metrics') %>%
    rename('Value' = '.') %>%
    add_row(Metrics = '**ROC_AUC', Value = res_ROC_AUC) %>%
    add_row(Metrics = '***Accuracy', Value = res_cm$overall[1]) %>%
    add_row(Metrics = '*Elapsed_Time', Value = round(elapsed_time[[3]], 2)) %>%
    arrange(Metrics)
  svm_bs_200_Metrics <- svm_bs_200_Metrics %>%
    rbind(res_Metrics)
}
svm_bs_200_Metrics <- svm_bs_200_Metrics %>%
  group_by(Metrics) %>%
  summarise(Value = mean(Value))
svm_bs_200_Metrics
2.4.4 DT w/ 5-fold CV
Decision Tree with 5-fold CV
set.seed(43)
start_time <- proc.time()
### Train Model
heart_cv_5_dt <- train(target ~ ., data = heart_train,
                       method = "rpart",
                       trControl = trainControl(method = 'cv',
                                                number = 5,
                                                savePredictions = 'final')
                       #preProcess = c("center","scale")
                       )
### Make Prediction
heart_cv_5_dt_pred <- predict(heart_cv_5_dt, heart_test)
end_time <- proc.time()
elapsed_time <- end_time - start_time
### Metrics
dt_cv_5_ROC_AUC <- roc(heart_test$target, factor(heart_cv_5_dt_pred, ordered = TRUE)) %>%
  auc() %>%
  as.numeric()
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
dt_cv_5_cm <- confusionMatrix(heart_cv_5_dt_pred, heart_test$target)
dt_cv_5_Metrics <- dt_cv_5_cm$byClass %>%
  data.frame() %>%
  rownames_to_column('Metrics') %>%
  rename('Value' = '.') %>%
  add_row(Metrics = '**ROC_AUC', Value = dt_cv_5_ROC_AUC) %>%
  add_row(Metrics = '***Accuracy', Value = dt_cv_5_cm$overall[1]) %>%
  add_row(Metrics = '*Elapsed_Time', Value = round(elapsed_time[[3]], 2)) %>%
  arrange(Metrics)
dt_cv_5_Metrics
2.4.5 DT w/ 10-fold CV
Decision Tree with 10-fold CV
set.seed(43)
start_time <- proc.time()
### Train Model
heart_cv_10_dt <- train(target ~ ., data = heart_train,
                        method = "rpart",
                        trControl = trainControl(method = 'cv',
                                                 number = 10,
                                                 savePredictions = 'final')
                        #preProcess = c("center","scale")
                        )
### Make Prediction
heart_cv_10_dt_pred <- predict(heart_cv_10_dt, heart_test)
end_time <- proc.time()
elapsed_time <- end_time - start_time
### Metrics
dt_cv_10_ROC_AUC <- roc(heart_test$target, factor(heart_cv_10_dt_pred, ordered = TRUE)) %>%
  auc() %>%
  as.numeric()
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
dt_cv_10_cm <- confusionMatrix(heart_cv_10_dt_pred, heart_test$target)
dt_cv_10_Metrics <- dt_cv_10_cm$byClass %>%
  data.frame() %>%
  rownames_to_column('Metrics') %>%
  rename('Value' = '.') %>%
  add_row(Metrics = '**ROC_AUC', Value = dt_cv_10_ROC_AUC) %>%
  add_row(Metrics = '***Accuracy', Value = dt_cv_10_cm$overall[1]) %>%
  add_row(Metrics = '*Elapsed_Time', Value = round(elapsed_time[[3]], 2)) %>%
  arrange(Metrics)
dt_cv_10_Metrics
2.4.6 DT w/ Bootstrap
Decision Tree with bootstrap resampling (200 resamples)
set.seed(43)
start_time <- proc.time()
### Train Model
heart_bs_200_dt <- train(target ~ ., data = heart_train,
                         method = "rpart",
                         trControl = trainControl(method = 'boot',
                                                  number = 200,
                                                  savePredictions = 'final')
                         #preProcess = c("center","scale")
                         )
### Make Prediction
heart_bs_200_dt_pred <- predict(heart_bs_200_dt, heart_test)
end_time <- proc.time()
elapsed_time <- end_time - start_time
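### Metrics: score each of the 200 bootstrap resamples, then average across resamples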
dt_bs_200_Metrics <- data.frame()
for(res_name in unique(heart_bs_200_dt$pred$Resample)){
  res <- filter(heart_bs_200_dt$pred, Resample == res_name)
  res_cm <- confusionMatrix(res$pred, res$obs)
  res_ROC_AUC <- roc(res$obs, factor(res$pred, ordered = TRUE)) %>%
    auc() %>%
    as.numeric()
  res_Metrics <- res_cm$byClass %>%
    data.frame() %>%
    rownames_to_column('Metrics') %>%
    rename('Value' = '.') %>%
    add_row(Metrics = '**ROC_AUC', Value = res_ROC_AUC) %>%
    add_row(Metrics = '***Accuracy', Value = res_cm$overall[1]) %>%
    add_row(Metrics = '*Elapsed_Time', Value = round(elapsed_time[[3]], 2)) %>%
    arrange(Metrics)
  dt_bs_200_Metrics <- dt_bs_200_Metrics %>%
    rbind(res_Metrics)
}
dt_bs_200_Metrics <- dt_bs_200_Metrics %>%
  group_by(Metrics) %>%
  summarise(Value = mean(Value))
dt_bs_200_Metrics
3 Part B
For the same dataset, set a seed (43) and split 80/20.
Use randomForest to grow four different forests (n = 15, 25, 50, and 75). Note down, as best as you can, the development (engineering) cost as well as the computing cost (elapsed time) for each run, and compare these results with the experiment in Part A.
3.1 Random Forest with n = 15
set.seed(43)
start_time <- proc.time()
### Train Model
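# Note: ntree is passed through caret's '...' to randomForest::randomForest()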
heart_rf_15 <- train(target ~ ., data = heart_train,
                     method = "rf",
                     ntree = 15
                     #trControl = train_control,
                     #preProcess = c("center","scale")
                     )
### Make Prediction
heart_rf_15_pred <- predict(heart_rf_15, heart_test)
end_time <- proc.time()
elapsed_time <- end_time - start_time
### Metrics
rf_15_ROC_AUC <- roc(heart_test$target, factor(heart_rf_15_pred, ordered = TRUE)) %>%
auc() %>%
as.numeric()
rf_15_cm <- confusionMatrix(heart_rf_15_pred, heart_test$target)
rf_15_Metrics <- rf_15_cm$byClass %>%
  data.frame() %>%
  rownames_to_column('Metrics') %>%
  rename('Value' = '.') %>%
  add_row(Metrics = '**ROC_AUC', Value = rf_15_ROC_AUC) %>%
  add_row(Metrics = '***Accuracy', Value = rf_15_cm$overall[1]) %>%
  add_row(Metrics = '*Elapsed_Time', Value = round(elapsed_time[[3]], 2)) %>%
  arrange(Metrics)
rf_15_Metrics
3.2 Random Forest with n = 25
set.seed(43)
start_time <- proc.time()
### Train Model
heart_rf_25 <- train(target ~ ., data = heart_train,
                     method = "rf",
                     ntree = 25
                     #trControl = train_control,
                     #preProcess = c("center","scale")
                     )
### Make Prediction
heart_rf_25_pred <- predict(heart_rf_25, heart_test)
end_time <- proc.time()
elapsed_time <- end_time - start_time
### Metrics
rf_25_ROC_AUC <- roc(heart_test$target, factor(heart_rf_25_pred, ordered = TRUE)) %>%
auc() %>%
as.numeric()
rf_25_cm <- confusionMatrix(heart_rf_25_pred, heart_test$target)
rf_25_Metrics <- rf_25_cm$byClass %>%
  data.frame() %>%
  rownames_to_column('Metrics') %>%
  rename('Value' = '.') %>%
  add_row(Metrics = '**ROC_AUC', Value = rf_25_ROC_AUC) %>%
  add_row(Metrics = '***Accuracy', Value = rf_25_cm$overall[1]) %>%
  add_row(Metrics = '*Elapsed_Time', Value = round(elapsed_time[[3]], 2)) %>%
  arrange(Metrics)
rf_25_Metrics
3.3 Random Forest with n = 50
set.seed(43)
start_time <- proc.time()
### Train Model
heart_rf_50 <- train(target ~ ., data = heart_train,
                     method = "rf",
                     ntree = 50
                     #trControl = train_control,
                     #preProcess = c("center","scale")
                     )
### Make Prediction
heart_rf_50_pred <- predict(heart_rf_50, heart_test)
end_time <- proc.time()
elapsed_time <- end_time - start_time
### Metrics
rf_50_ROC_AUC <- roc(heart_test$target, factor(heart_rf_50_pred, ordered = TRUE)) %>%
auc() %>%
as.numeric()
rf_50_cm <- confusionMatrix(heart_rf_50_pred, heart_test$target)
rf_50_Metrics <- rf_50_cm$byClass %>%
  data.frame() %>%
  rownames_to_column('Metrics') %>%
  rename('Value' = '.') %>%
  add_row(Metrics = '**ROC_AUC', Value = rf_50_ROC_AUC) %>%
  add_row(Metrics = '***Accuracy', Value = rf_50_cm$overall[1]) %>%
  add_row(Metrics = '*Elapsed_Time', Value = round(elapsed_time[[3]], 2)) %>%
  arrange(Metrics)
rf_50_Metrics
3.4 Random Forest with n = 75
set.seed(43)
start_time <- proc.time()
### Train Model
heart_rf_75 <- train(target ~ ., data = heart_train,
                     method = "rf",
                     ntree = 75
                     #trControl = train_control,
                     #preProcess = c("center","scale")
                     )
### Make Prediction
heart_rf_75_pred <- predict(heart_rf_75, heart_test)
end_time <- proc.time()
elapsed_time <- end_time - start_time
### Metrics
rf_75_ROC_AUC <- roc(heart_test$target, factor(heart_rf_75_pred, ordered = TRUE)) %>%
auc() %>%
as.numeric()
rf_75_cm <- confusionMatrix(heart_rf_75_pred, heart_test$target)
rf_75_Metrics <- rf_75_cm$byClass %>%
  data.frame() %>%
  rownames_to_column('Metrics') %>%
  rename('Value' = '.') %>%
  add_row(Metrics = '**ROC_AUC', Value = rf_75_ROC_AUC) %>%
  add_row(Metrics = '***Accuracy', Value = rf_75_cm$overall[1]) %>%
  add_row(Metrics = '*Elapsed_Time', Value = round(elapsed_time[[3]], 2)) %>%
  arrange(Metrics)
rf_75_Metrics
3.5 Comparison
Comparing the results from Part B with the experiment in Part A, we can see that the SVM models have better accuracies on average, while the Random Forest models have better AUC values on average. The Decision Tree models have the lowest accuracies and AUC values among all 12 models. The elapsed times of the models are similar.
- Summary of all 12 models' metrics
svm_Base_Metrics %>%
  cbind(Model = 'SVM_Base') %>%
  rbind(svm_cv_5_Metrics %>% cbind(Model = 'SVM_5CV'),
        svm_cv_10_Metrics %>% cbind(Model = 'SVM_10CV'),
        svm_bs_200_Metrics %>% cbind(Model = 'SVM_Bootstrap'),
        dt_Base_Metrics %>% cbind(Model = 'DT_Base'),
        dt_cv_5_Metrics %>% cbind(Model = 'DT_5CV'),
        dt_cv_10_Metrics %>% cbind(Model = 'DT_10CV'),
        dt_bs_200_Metrics %>% cbind(Model = 'DT_Bootstrap'),
        rf_15_Metrics %>% cbind(Model = 'RF_15'),
        rf_25_Metrics %>% cbind(Model = 'RF_25'),
        rf_50_Metrics %>% cbind(Model = 'RF_50'),
        rf_75_Metrics %>% cbind(Model = 'RF_75')
        ) %>%
  spread(key = Metrics, value = Value)
4 Part C
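To make the comparison below concrete, here is a minimal sketch (not from the original report; all_models is a hypothetical name for the combined summary table built above) that pulls out the headline metrics per model:
# Hypothetical: assume the combined summary above was saved as `all_models`.
# Extract the headline metrics to back the bootstrap-vs-CV comparison below.
all_models %>%
  select(Model, Accuracy = `***Accuracy`, ROC_AUC = `**ROC_AUC`,
         Elapsed_Time = `*Elapsed_Time`) %>%
  arrange(desc(ROC_AUC), desc(Accuracy))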
Comparing the bootstrap models with the cross-validation models, the cross-validation models clearly perform better, and at a lower engineering and computing cost. The bootstrap models require aggregating metrics over 200 resamples, so their cost is much higher than that of the cross-validation approach.
Within the SVM models, the cross-validation models have better accuracies, better AUC values, and lower engineering and computing costs than the bootstrap model.
Within the Decision Tree models, the cross-validation models have better accuracies, better AUC values, and much lower engineering and computing costs than the bootstrap model.
Comparing 5-fold CV with 10-fold CV, 5-fold CV has a lower computing cost, although the two produce the same accuracies and AUC values.
Thus, I would recommend that my customers use the 5-fold cross-validation model, as it has the best metrics at the lowest cost. This also illustrates Occam's Razor: the simpler model is usually the better choice.