This page evaluates the results from the algorithm training, which is documented here.
First, I get all of the .RData
file names and load them all (you need to set the environment to .GlobalEnv
for the load()
function or the objects won’t show up).
files <- list.files()[grep("\\.RData", list.files())]
lapply(files, load, envir = .GlobalEnv)
We also need the test data.
load("../Data/split_data.RData")
Here are the objects.
objects()
## [1] "files" "miRNA_pvalue" "nb_pre1_10cv"
## [4] "nb_pre1_10cv_pred" "nb_pre1_10cv_prob" "nb_pre1_5cv"
## [7] "nb_pre1_5cv_pred" "nb_pre1_5cv_prob" "nb_pre2_10cv"
## [10] "nb_pre2_10cv_pred" "nb_pre2_10cv_prob" "nb_pre2_5cv"
## [13] "nb_pre2_5cv_pred" "nb_pre2_5cv_prob" "rf_pre1_10cv"
## [16] "rf_pre1_10cv_pred" "rf_pre1_10cv_prob" "rf_pre1_5cv"
## [19] "rf_pre1_5cv_pred" "rf_pre1_5cv_prob" "rf_pre2_10cv"
## [22] "rf_pre2_10cv_pred" "rf_pre2_10cv_prob" "rf_pre2_5cv"
## [25] "rf_pre2_5cv_pred" "rf_pre2_5cv_prob" "svm_pre1_10cv"
## [28] "svm_pre1_10cv_pred" "svm_pre1_10cv_prob" "svm_pre1_5cv"
## [31] "svm_pre1_5cv_pred" "svm_pre1_5cv_prob" "svm_pre2_10cv"
## [34] "svm_pre2_10cv_pred" "svm_pre2_10cv_prob" "svm_pre2_5cv"
## [37] "svm_pre2_5cv_pred" "svm_pre2_5cv_prob" "test_data"
## [40] "train_data"
Here, I get the aptly named confusion matrix for all models.
library(caret)
predictions <- objects()[grep("_pred", objects())]
conf.matrices <- lapply(predictions, function(prediction){
# have to use 'get()` to get the object using its name
prediction <- get(prediction)
confusionMatrix(data = prediction,
reference = test_data$stage,
positive = "late")
})
names(conf.matrices) <- predictions
Here, I get the accuracies, and their confidence intervals, for the predictions.
Accuracy <- sapply(conf.matrices, function(mat) mat$overall["Accuracy"])
Accuracy_Upper <- sapply(conf.matrices, function(mat) mat$overall["AccuracyUpper"])
Accuracy_Lower <- sapply(conf.matrices, function(mat) mat$overall["AccuracyLower"])
Now I make a table that can be used to plot the accuracies.
Algorithm <- sapply(strsplit(predictions, "_"), function(x) x[1])
Algorithm <- toupper(Algorithm)
Preprocess <- sapply(strsplit(predictions, "_"), function(x) x[2])
Preprocess <- toupper(Preprocess)
Split <- sapply(strsplit(predictions, "_"), function(x) x[3])
Split <- toupper(Split)
accuracy.df <- data.frame(Algorithm, Preprocess, Split, Accuracy,
Accuracy_Upper, Accuracy_Lower,
row.names = 1:length(predictions))
accuracy.df
## Algorithm Preprocess Split Accuracy Accuracy_Upper Accuracy_Lower
## 1 NB PRE1 10CV 0.6603774 0.7495536 0.5619835
## 2 NB PRE1 5CV 0.6415094 0.7323414 0.5425503
## 3 NB PRE2 10CV 0.6226415 0.7149780 0.5232659
## 4 NB PRE2 5CV 0.6226415 0.7149780 0.5232659
## 5 RF PRE1 10CV 0.7452830 0.8249489 0.6514466
## 6 RF PRE1 5CV 0.7358491 0.8167536 0.6413293
## 7 RF PRE2 10CV 0.7452830 0.8249489 0.6514466
## 8 RF PRE2 5CV 0.7452830 0.8249489 0.6514466
## 9 SVM PRE1 10CV 0.6886792 0.7750753 0.5914241
## 10 SVM PRE1 5CV 0.7264151 0.8085095 0.6312592
## 11 SVM PRE2 10CV 0.6792453 0.7666088 0.5815707
## 12 SVM PRE2 5CV 0.6981132 0.7834998 0.6013186
Now I make two plots that show the comparison between algorithms and preprocessing: one plot for 5-fold cross validation and one for 10-fold.
p_accuracy <- ggplot(accuracy.df,
aes(x = Algorithm, y = Accuracy, ymin = Accuracy_Lower,
ymax = Accuracy_Upper, colour = Preprocess))
p_accuracy <- p_accuracy + facet_grid(. ~ Split)
p_accuracy <- p_accuracy + geom_point(position=position_dodge(width = .9))
p_accuracy <- p_accuracy + geom_errorbar(position=position_dodge(width = .9))
p_accuracy <- p_accuracy + theme(strip.background = element_rect(fill = 'khaki'))
p_accuracy
Now I add more information to the table to plot sensitivity and specificity.
accuracy.df$Sensitivity <- sapply(conf.matrices, function(mat) mat$byClass["Sensitivity"])
accuracy.df$Specificity <- sapply(conf.matrices, function(mat) mat$byClass["Specificity"])
accuracy.df[, -c(4:6)]
## Algorithm Preprocess Split Sensitivity Specificity
## 1 NB PRE1 10CV 0.2682927 0.9076923
## 2 NB PRE1 5CV 0.4878049 0.7384615
## 3 NB PRE2 10CV 0.3658537 0.7846154
## 4 NB PRE2 5CV 0.4390244 0.7384615
## 5 RF PRE1 10CV 0.5121951 0.8923077
## 6 RF PRE1 5CV 0.4878049 0.8923077
## 7 RF PRE2 10CV 0.4878049 0.9076923
## 8 RF PRE2 5CV 0.5365854 0.8769231
## 9 SVM PRE1 10CV 0.3902439 0.8769231
## 10 SVM PRE1 5CV 0.4878049 0.8769231
## 11 SVM PRE2 10CV 0.4146341 0.8461538
## 12 SVM PRE2 5CV 0.3902439 0.8923077
I use tidyr
to make it a long table for splitting the ggplot2
graph by Sensitivity/Specificity.
library(tidyr)
accuracy.long.df <- gather(accuracy.df[, -c(4:6)], Measure, Value, Sensitivity:Specificity)
accuracy.long.df
## Algorithm Preprocess Split Measure Value
## 1 NB PRE1 10CV Sensitivity 0.2682927
## 2 NB PRE1 5CV Sensitivity 0.4878049
## 3 NB PRE2 10CV Sensitivity 0.3658537
## 4 NB PRE2 5CV Sensitivity 0.4390244
## 5 RF PRE1 10CV Sensitivity 0.5121951
## 6 RF PRE1 5CV Sensitivity 0.4878049
## 7 RF PRE2 10CV Sensitivity 0.4878049
## 8 RF PRE2 5CV Sensitivity 0.5365854
## 9 SVM PRE1 10CV Sensitivity 0.3902439
## 10 SVM PRE1 5CV Sensitivity 0.4878049
## 11 SVM PRE2 10CV Sensitivity 0.4146341
## 12 SVM PRE2 5CV Sensitivity 0.3902439
## 13 NB PRE1 10CV Specificity 0.9076923
## 14 NB PRE1 5CV Specificity 0.7384615
## 15 NB PRE2 10CV Specificity 0.7846154
## 16 NB PRE2 5CV Specificity 0.7384615
## 17 RF PRE1 10CV Specificity 0.8923077
## 18 RF PRE1 5CV Specificity 0.8923077
## 19 RF PRE2 10CV Specificity 0.9076923
## 20 RF PRE2 5CV Specificity 0.8769231
## 21 SVM PRE1 10CV Specificity 0.8769231
## 22 SVM PRE1 5CV Specificity 0.8769231
## 23 SVM PRE2 10CV Specificity 0.8461538
## 24 SVM PRE2 5CV Specificity 0.8923077
p_sens_spec <- ggplot(accuracy.long.df,
aes(x = Algorithm, y = Value, colour = Preprocess,
fill = Preprocess))
p_sens_spec <- p_sens_spec + facet_grid(Measure ~ Split)
p_sens_spec <- p_sens_spec + geom_bar(stat = "identity", position="dodge")
p_sens_spec <- p_sens_spec + theme(strip.background = element_rect(fill = 'khaki'))
p_sens_spec
Here I calculate and plot the area under the ROC curve.
library(pROC)
probs <- objects()[grep("_prob", objects())]
roc.list <- lapply(probs, function(prob, resp){
# have to use 'get()` to get the object using its name
prob <- get(prob)
roc(response = as.factor(resp),
predictor = prob$early,
## This function assumes that the second
## class is the event of interest, so we
## reverse the labels
levels = rev(levels(as.factor(resp))))
}, resp = test_data$stage)
names(roc.list) <- probs
AUC <- sapply(roc.list, auc)
AUC_Lower <- sapply(roc.list, function(x) ci.auc(x)[1])
AUC_Upper <- sapply(roc.list, function(x) ci.auc(x)[3])
Algorithm <- sapply(strsplit(probs, "_"), function(x) x[1])
Algorithm <- toupper(Algorithm)
Preprocess <- sapply(strsplit(probs, "_"), function(x) x[2])
Preprocess <- toupper(Preprocess)
Split <- sapply(strsplit(probs, "_"), function(x) x[3])
Split <- toupper(Split)
auc.df <- data.frame(Algorithm, Preprocess, Split, AUC,
AUC_Upper, AUC_Lower,
row.names = 1:length(probs))
auc.df
## Algorithm Preprocess Split AUC AUC_Upper AUC_Lower
## 1 NB PRE1 10CV 0.6934615 0.8023393 0.5845838
## 2 NB PRE1 5CV 0.6720450 0.7798444 0.5642457
## 3 NB PRE2 10CV 0.6080769 0.7176692 0.4984846
## 4 NB PRE2 5CV 0.6484053 0.7548538 0.5419567
## 5 RF PRE1 10CV 0.8210131 0.9050060 0.7370203
## 6 RF PRE1 5CV 0.8309568 0.9120329 0.7498808
## 7 RF PRE2 10CV 0.7767355 0.8685708 0.6849001
## 8 RF PRE2 5CV 0.7883677 0.8788491 0.6978863
## 9 SVM PRE1 10CV 0.7643527 0.8605184 0.6681871
## 10 SVM PRE1 5CV 0.7996248 0.8856934 0.7135561
## 11 SVM PRE2 10CV 0.7080675 0.8095625 0.6065726
## 12 SVM PRE2 5CV 0.6604128 0.7730238 0.5478017
p_auc <- ggplot(auc.df,
aes(x = Algorithm, y = AUC, ymin = AUC_Lower,
ymax = AUC_Upper, colour = Preprocess))
p_auc <- p_auc + facet_grid(. ~ Split)
p_auc <- p_auc + geom_point(position=position_dodge(width = .9))
p_auc <- p_auc + geom_errorbar(position=position_dodge(width = .9))
p_auc <- p_auc + theme(strip.background = element_rect(fill = 'khaki'))
p_auc
And here I plot the ROC curves.
for(m in c("SVM", "RF", "NB")){
# m = "SVM"
pre1_5cv <- roc.list[[paste0(tolower(m), "_pre1_5cv_prob")]]
pre1_10cv <- roc.list[[paste0(tolower(m), "_pre1_10cv_prob")]]
pre2_5cv <- roc.list[[paste0(tolower(m), "_pre2_5cv_prob")]]
pre2_10cv <- roc.list[[paste0(tolower(m), "_pre2_10cv_prob")]]
plot.roc(pre1_5cv, col = "black", main = m)
lines.roc(pre1_10cv, col = "blue")
lines.roc(pre2_5cv, col = "red")
lines.roc(pre2_10cv, col = "green")
legend("bottomright", legend=c("PRE1/5CV", "PRE1/10CV",
"PRE2/5CV", "PRE2/10CV"),
col=c("black", "blue", "red", "green"), lwd=2)
}