This page evaluates the results from the algorithm training, which is documented here.

Load Results

First, I get all of the .RData file names and load them all. Note that load() has to be given envir = .GlobalEnv here: called from inside lapply(), it would otherwise load each file into the anonymous function's environment, and the objects would never show up in the workspace.

# file names of all saved model objects in the working directory
files <- list.files()[grep("\\.RData", list.files())]
# load the objects from each file into the global environment
lapply(files, load, envir = .GlobalEnv)
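
Since load() (invisibly) returns the names of the objects it restores, a quick sanity check is possible. This is a small sketch, not part of the original analysis, and it simply re-runs the load:

# every restored object name should now exist in the global environment
loaded_names <- unlist(lapply(files, load, envir = .GlobalEnv))
all(sapply(loaded_names, exists, envir = .GlobalEnv))  # expected: TRUE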

We also need the test data.

load("../Data/split_data.RData")

Here are the objects.

objects()
##  [1] "files"              "miRNA_pvalue"       "nb_pre1_10cv"      
##  [4] "nb_pre1_10cv_pred"  "nb_pre1_10cv_prob"  "nb_pre1_5cv"       
##  [7] "nb_pre1_5cv_pred"   "nb_pre1_5cv_prob"   "nb_pre2_10cv"      
## [10] "nb_pre2_10cv_pred"  "nb_pre2_10cv_prob"  "nb_pre2_5cv"       
## [13] "nb_pre2_5cv_pred"   "nb_pre2_5cv_prob"   "rf_pre1_10cv"      
## [16] "rf_pre1_10cv_pred"  "rf_pre1_10cv_prob"  "rf_pre1_5cv"       
## [19] "rf_pre1_5cv_pred"   "rf_pre1_5cv_prob"   "rf_pre2_10cv"      
## [22] "rf_pre2_10cv_pred"  "rf_pre2_10cv_prob"  "rf_pre2_5cv"       
## [25] "rf_pre2_5cv_pred"   "rf_pre2_5cv_prob"   "svm_pre1_10cv"     
## [28] "svm_pre1_10cv_pred" "svm_pre1_10cv_prob" "svm_pre1_5cv"      
## [31] "svm_pre1_5cv_pred"  "svm_pre1_5cv_prob"  "svm_pre2_10cv"     
## [34] "svm_pre2_10cv_pred" "svm_pre2_10cv_prob" "svm_pre2_5cv"      
## [37] "svm_pre2_5cv_pred"  "svm_pre2_5cv_prob"  "test_data"         
## [40] "train_data"

Confusion Matrix

Here, I get the aptly named confusion matrix for all models.

library(caret)
predictions <- objects()[grep("_pred", objects())]
conf.matrices <- lapply(predictions, function(prediction){
  # the names are character strings, so use `get()` to fetch the actual object
  prediction <- get(prediction)
  confusionMatrix(data = prediction,
                  reference = test_data$stage,
                  positive = "late")
})
names(conf.matrices) <- predictions
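
Each element of conf.matrices is named after the prediction object it came from, so any single model can be inspected directly (usage note only; output omitted):

# full confusionMatrix() output for, e.g., the PRE1/10CV random forest
conf.matrices[["rf_pre1_10cv_pred"]]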

Accuracy

Here I extract the accuracy of each set of predictions, along with its confidence interval.

Accuracy <- sapply(conf.matrices, function(mat) mat$overall["Accuracy"])
Accuracy_Upper <- sapply(conf.matrices, function(mat) mat$overall["AccuracyUpper"])
Accuracy_Lower <- sapply(conf.matrices, function(mat) mat$overall["AccuracyLower"])
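
caret's confusionMatrix() reports a 95% exact binomial confidence interval for accuracy, so the same interval should fall out of binom.test() on the raw agreement counts. This is a cross-check sketch only, assuming the prediction factor shares the levels of test_data$stage:

# should reproduce AccuracyLower/AccuracyUpper for the PRE1/10CV random forest
pred <- rf_pre1_10cv_pred
binom.test(sum(pred == test_data$stage), length(test_data$stage))$conf.int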

Now I make a table that can be used to plot the accuracies.

Algorithm <- sapply(strsplit(predictions, "_"), function(x) x[1])
Algorithm <- toupper(Algorithm)
Preprocess <- sapply(strsplit(predictions, "_"), function(x) x[2])
Preprocess <- toupper(Preprocess)
Split <- sapply(strsplit(predictions, "_"), function(x) x[3])
Split <- toupper(Split)
accuracy.df <- data.frame(Algorithm, Preprocess, Split, Accuracy, 
                          Accuracy_Upper, Accuracy_Lower, 
                          row.names = 1:length(predictions))
accuracy.df
##    Algorithm Preprocess Split  Accuracy Accuracy_Upper Accuracy_Lower
## 1         NB       PRE1  10CV 0.6603774      0.7495536      0.5619835
## 2         NB       PRE1   5CV 0.6415094      0.7323414      0.5425503
## 3         NB       PRE2  10CV 0.6226415      0.7149780      0.5232659
## 4         NB       PRE2   5CV 0.6226415      0.7149780      0.5232659
## 5         RF       PRE1  10CV 0.7452830      0.8249489      0.6514466
## 6         RF       PRE1   5CV 0.7358491      0.8167536      0.6413293
## 7         RF       PRE2  10CV 0.7452830      0.8249489      0.6514466
## 8         RF       PRE2   5CV 0.7452830      0.8249489      0.6514466
## 9        SVM       PRE1  10CV 0.6886792      0.7750753      0.5914241
## 10       SVM       PRE1   5CV 0.7264151      0.8085095      0.6312592
## 11       SVM       PRE2  10CV 0.6792453      0.7666088      0.5815707
## 12       SVM       PRE2   5CV 0.6981132      0.7834998      0.6013186
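
The three name components could also be pulled out in a single pass (an equivalent sketch, not what was run above):

# split every prediction name once and bind the pieces into a character matrix
parts <- do.call(rbind, strsplit(predictions, "_"))
colnames(parts) <- c("Algorithm", "Preprocess", "Split", "Type")
head(parts)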

Now I make a faceted plot comparing the algorithms and preprocessing methods, with one panel for 5-fold cross-validation and one for 10-fold.

p_accuracy <- ggplot(accuracy.df, 
                     aes(x = Algorithm, y = Accuracy, ymin = Accuracy_Lower,
                         ymax = Accuracy_Upper, colour = Preprocess))
p_accuracy <- p_accuracy + facet_grid(. ~ Split)
p_accuracy <- p_accuracy + geom_point(position=position_dodge(width = .9)) 
p_accuracy <- p_accuracy + geom_errorbar(position=position_dodge(width = .9))
p_accuracy <- p_accuracy + theme(strip.background = element_rect(fill = 'khaki'))
p_accuracy                

Sensitivity and Specificity

Now I add more information to the table to plot sensitivity and specificity.

accuracy.df$Sensitivity <- sapply(conf.matrices, function(mat) mat$byClass["Sensitivity"])
accuracy.df$Specificity <- sapply(conf.matrices, function(mat) mat$byClass["Specificity"])
accuracy.df[, -c(4:6)]
##    Algorithm Preprocess Split Sensitivity Specificity
## 1         NB       PRE1  10CV   0.2682927   0.9076923
## 2         NB       PRE1   5CV   0.4878049   0.7384615
## 3         NB       PRE2  10CV   0.3658537   0.7846154
## 4         NB       PRE2   5CV   0.4390244   0.7384615
## 5         RF       PRE1  10CV   0.5121951   0.8923077
## 6         RF       PRE1   5CV   0.4878049   0.8923077
## 7         RF       PRE2  10CV   0.4878049   0.9076923
## 8         RF       PRE2   5CV   0.5365854   0.8769231
## 9        SVM       PRE1  10CV   0.3902439   0.8769231
## 10       SVM       PRE1   5CV   0.4878049   0.8769231
## 11       SVM       PRE2  10CV   0.4146341   0.8461538
## 12       SVM       PRE2   5CV   0.3902439   0.8923077
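
Because the positive class is "late", these numbers come straight from the confusion matrix counts. As a sketch (assuming the two stage levels are "early" and "late"), sensitivity and specificity for a single model can be reproduced by hand:

# confusionMatrix() tables have predictions in rows and the reference in columns
tab <- conf.matrices[["rf_pre1_10cv_pred"]]$table
tab["late", "late"] / sum(tab[, "late"])     # sensitivity: true lates called late
tab["early", "early"] / sum(tab[, "early"])  # specificity: true earlies called early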

I use tidyr to reshape the table into long format so the ggplot2 graph can be faceted by Sensitivity/Specificity.

library(tidyr)
accuracy.long.df <- gather(accuracy.df[, -c(4:6)], Measure, Value, Sensitivity:Specificity)
accuracy.long.df
##    Algorithm Preprocess Split     Measure     Value
## 1         NB       PRE1  10CV Sensitivity 0.2682927
## 2         NB       PRE1   5CV Sensitivity 0.4878049
## 3         NB       PRE2  10CV Sensitivity 0.3658537
## 4         NB       PRE2   5CV Sensitivity 0.4390244
## 5         RF       PRE1  10CV Sensitivity 0.5121951
## 6         RF       PRE1   5CV Sensitivity 0.4878049
## 7         RF       PRE2  10CV Sensitivity 0.4878049
## 8         RF       PRE2   5CV Sensitivity 0.5365854
## 9        SVM       PRE1  10CV Sensitivity 0.3902439
## 10       SVM       PRE1   5CV Sensitivity 0.4878049
## 11       SVM       PRE2  10CV Sensitivity 0.4146341
## 12       SVM       PRE2   5CV Sensitivity 0.3902439
## 13        NB       PRE1  10CV Specificity 0.9076923
## 14        NB       PRE1   5CV Specificity 0.7384615
## 15        NB       PRE2  10CV Specificity 0.7846154
## 16        NB       PRE2   5CV Specificity 0.7384615
## 17        RF       PRE1  10CV Specificity 0.8923077
## 18        RF       PRE1   5CV Specificity 0.8923077
## 19        RF       PRE2  10CV Specificity 0.9076923
## 20        RF       PRE2   5CV Specificity 0.8769231
## 21       SVM       PRE1  10CV Specificity 0.8769231
## 22       SVM       PRE1   5CV Specificity 0.8769231
## 23       SVM       PRE2  10CV Specificity 0.8461538
## 24       SVM       PRE2   5CV Specificity 0.8923077
p_sens_spec <- ggplot(accuracy.long.df, 
                     aes(x = Algorithm, y = Value, colour = Preprocess, 
                         fill = Preprocess))
p_sens_spec <- p_sens_spec + facet_grid(Measure ~ Split)
p_sens_spec <- p_sens_spec + geom_bar(stat = "identity", position="dodge")
p_sens_spec <- p_sens_spec + theme(strip.background = element_rect(fill = 'khaki'))
p_sens_spec 
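
gather() has since been superseded in tidyr; if pivot_longer() is available in the installed version, the same long table can be built with it (a sketch, not what was run above):

# equivalent reshape with the newer tidyr interface (row order may differ)
accuracy.long.df2 <- pivot_longer(accuracy.df[, -c(4:6)],
                                  cols = c(Sensitivity, Specificity),
                                  names_to = "Measure", values_to = "Value")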

ROC

Here I calculate and plot the area under the ROC curve.

library(pROC)
probs <- objects()[grep("_prob", objects())]
roc.list <- lapply(probs, function(prob, resp){
  # the names are character strings, so use `get()` to fetch the actual object
  prob <- get(prob)
  roc(response = as.factor(resp),
      predictor = prob$early,
      ## This function assumes that the second
      ## class is the event of interest, so we
      ## reverse the labels
      levels = rev(levels(as.factor(resp))))
}, resp = test_data$stage)
names(roc.list) <- probs

AUC <- sapply(roc.list, auc)
AUC_Lower <- sapply(roc.list, function(x) ci.auc(x)[1])
AUC_Upper <- sapply(roc.list, function(x) ci.auc(x)[3])
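
ci.auc() returns a three-element vector holding the lower bound, the AUC itself, and the upper bound of a 95% interval (DeLong's method by default), which is why elements 1 and 3 are taken above. For a single model the full triple can be viewed directly (usage note only):

# lower bound, AUC, and upper bound for the PRE1/10CV random forest
ci.auc(roc.list[["rf_pre1_10cv_prob"]])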

Algorithm <- sapply(strsplit(probs, "_"), function(x) x[1])
Algorithm <- toupper(Algorithm)
Preprocess <- sapply(strsplit(probs, "_"), function(x) x[2])
Preprocess <- toupper(Preprocess)
Split <- sapply(strsplit(probs, "_"), function(x) x[3])
Split <- toupper(Split)
auc.df <- data.frame(Algorithm, Preprocess, Split, AUC, 
                     AUC_Upper, AUC_Lower, 
                     row.names = 1:length(probs))
auc.df
##    Algorithm Preprocess Split       AUC AUC_Upper AUC_Lower
## 1         NB       PRE1  10CV 0.6934615 0.8023393 0.5845838
## 2         NB       PRE1   5CV 0.6720450 0.7798444 0.5642457
## 3         NB       PRE2  10CV 0.6080769 0.7176692 0.4984846
## 4         NB       PRE2   5CV 0.6484053 0.7548538 0.5419567
## 5         RF       PRE1  10CV 0.8210131 0.9050060 0.7370203
## 6         RF       PRE1   5CV 0.8309568 0.9120329 0.7498808
## 7         RF       PRE2  10CV 0.7767355 0.8685708 0.6849001
## 8         RF       PRE2   5CV 0.7883677 0.8788491 0.6978863
## 9        SVM       PRE1  10CV 0.7643527 0.8605184 0.6681871
## 10       SVM       PRE1   5CV 0.7996248 0.8856934 0.7135561
## 11       SVM       PRE2  10CV 0.7080675 0.8095625 0.6065726
## 12       SVM       PRE2   5CV 0.6604128 0.7730238 0.5478017
p_auc <- ggplot(auc.df, 
                aes(x = Algorithm, y = AUC, ymin = AUC_Lower,
                    ymax = AUC_Upper, colour = Preprocess))
p_auc <- p_auc + facet_grid(. ~ Split)
p_auc <- p_auc + geom_point(position=position_dodge(width = .9)) 
p_auc <- p_auc + geom_errorbar(position=position_dodge(width = .9))
p_auc <- p_auc + theme(strip.background = element_rect(fill = 'khaki'))
p_auc  

And here I plot the ROC curves.

# one plot per algorithm, overlaying the four preprocessing/CV combinations
for(m in c("SVM", "RF", "NB")){
  pre1_5cv <- roc.list[[paste0(tolower(m), "_pre1_5cv_prob")]]
  pre1_10cv <- roc.list[[paste0(tolower(m), "_pre1_10cv_prob")]]
  pre2_5cv <- roc.list[[paste0(tolower(m), "_pre2_5cv_prob")]]
  pre2_10cv <- roc.list[[paste0(tolower(m), "_pre2_10cv_prob")]]
  plot.roc(pre1_5cv, col = "black", main = m)
  lines.roc(pre1_10cv, col = "blue")
  lines.roc(pre2_5cv, col = "red")
  lines.roc(pre2_10cv, col = "green")
  legend("bottomright", legend=c("PRE1/5CV", "PRE1/10CV",
                                 "PRE2/5CV", "PRE2/10CV"), 
         col=c("black", "blue", "red", "green"), lwd=2)
}
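
If the installed pROC version provides ggroc(), the same comparison can also be drawn with ggplot2 instead of base graphics (a sketch under that assumption, shown here for the random forest only):

# ggplot2 version of the ROC overlay for one algorithm
rf_rocs <- roc.list[grep("^rf_", names(roc.list))]
ggroc(rf_rocs) + labs(colour = "Model")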