This Markdown file demonstrates how to simulate a training set of simulated interaction profiles in a given range of expression values and noise levels. The simulated data is used to train two classifiers: Linear Discriminant Analysis and Random forests. Finally, the performance of the classifiers is measured in independent test sets. In the comparison, we also include a deterministic match, a classification scheme which is not based on machine learning. This Markdown file may take a while to compile, depending on the size of the training and test sets.
packages = c("R.utils", "MLmetrics", "caret",
"randomForest", "plotrix", "mltest", "ggpubr")
package_check = lapply(packages, FUN = function(x) {
if (!require(x, character.only = TRUE)) {
install.packages(x, dependencies = TRUE)
library(x, character.only = TRUE)
}
})
sourceDirectory("../utils")
prof_codes = read.table("../data/profile_codes_v2.txt",header = TRUE,sep = "\t")
#This may take a while...
#range of expression values for simulations
#expr_range = c(-14, 14)
#ntimes = 400
#signal_to_noise_range = seq(1.5, 4.5, by = 0.5)
#training_set = generate_synthetic_data_v2(expr_range, ntimes, signal_to_noise_range, prof_codes = prof_codes)
#save(training_set, file = 'training_set')
# load("training_set")
#
# #train LDA
# lda_model = train(training_set[[1]][,-1],
# as.factor(training_set[[1]][,1]),
# method = "lda", preProcess = "pca")
#
# save(lda_model, file = 'lda_model')
#
#
# #training RF
# trf = tuneRF(training_set[[1]][,-1], as.factor(training_set[[1]][,1]))
#
# rf_model1 = randomForest(training_set[[1]][,-1],
# as.factor(training_set[[1]][,1]), mtry = 8)
#
# save(rf_model1, file = '../data/rf_model1')
#range of expression values for simulations
expr_range = c(-14, 14)
#number of simulated instances per profile
ntimes = 100
#generate test set with low noise
signal_to_noise_range = 4
test_low = generate_synthetic_data_v2(expr_range, ntimes,
signal_to_noise_range, prof_codes = prof_codes)
signal_to_noise_range = 2.5
test_med = generate_synthetic_data_v2(expr_range, ntimes,
signal_to_noise_range, prof_codes = prof_codes)
signal_to_noise_range = 2
test_high = generate_synthetic_data_v2(expr_range, ntimes,
signal_to_noise_range, prof_codes = prof_codes)
#loading the already trained models
load("../data/lda_model")
load("../data/rf_model1")
#basic classifier - deterministic match without machine learning
non_ML_accuracy = c(basic_classifier_accuracy(test_low,
prof_codes = prof_codes),
basic_classifier_accuracy(test_med,
prof_codes = prof_codes),
basic_classifier_accuracy(test_high,
prof_codes = prof_codes))
lda_accuracy = vector(length = 3)
#LDA accuracy with low noise
y_true = test_low[[1]][,1]
y_lda_pred = predict(lda_model, newdata = test_low[[1]][,-1])
lda_accuracy[1] = Accuracy(y_pred = y_lda_pred, y_true = y_true)
lda_metrics_low = ml_test(predicted = y_lda_pred,
true = y_true, output.as.table = T)
lda_metrics_low$noise = 'low'
#LDA accuracy with med noise
y_true = test_med[[1]][,1]
y_lda_pred = predict(lda_model, newdata = test_med[[1]][,-1])
lda_accuracy[2] = Accuracy(y_pred = y_lda_pred, y_true = y_true)
lda_metrics_med = ml_test(predicted = y_lda_pred,
true = y_true, output.as.table = T)
lda_metrics_med$noise = 'med'
#LDA accuracy with high noise
y_true = test_high[[1]][,1]
y_lda_pred = predict(lda_model, newdata = test_high[[1]][,-1])
lda_accuracy[3] = Accuracy(y_pred = y_lda_pred, y_true = y_true)
lda_metrics_high = ml_test(predicted = y_lda_pred,
true = y_true, output.as.table = T)
lda_metrics_high$noise = 'high'
lda_metrics = rbind(lda_metrics_low,
lda_metrics_med,
lda_metrics_high)
lda_metrics$classifier = 'LDA'
#RF accuracy
rf_accuracy = vector(length = 3)
#RF accuracy with low noise
y_true = test_low[[1]][,1]
y_rf_pred = predict(rf_model1, newdata = test_low[[1]][,-1])
rf_accuracy[1] = Accuracy(y_pred = y_rf_pred, y_true = y_true)
rf_metrics_low = ml_test(predicted = y_rf_pred,
true = y_true, output.as.table = T)
rf_metrics_low$noise = 'low'
#RF accuracy with med noise
y_true = test_med[[1]][,1]
y_rf_pred = predict(rf_model1, newdata = test_med[[1]][,-1])
rf_accuracy[2] = Accuracy(y_pred = y_rf_pred, y_true = y_true)
rf_metrics_med = ml_test(predicted = y_rf_pred,
true = y_true, output.as.table = T)
rf_metrics_med$noise = 'med'
#RF accuracy with high noise
y_true = test_high[[1]][,1]
y_rf_pred = predict(rf_model1, newdata = test_high[[1]][,-1])
rf_accuracy[3] = Accuracy(y_pred = y_rf_pred, y_true = y_true)
rf_metrics_high = ml_test(predicted = y_rf_pred,
true = y_true, output.as.table = T)
rf_metrics_high$noise = 'high'
rf_metrics = rbind(rf_metrics_low,
rf_metrics_med,
rf_metrics_high)
rf_metrics$classifier = 'RF'
#plot accuracy with broken y-axis
from = 10
to = 50
x = 1:3
par(cex.main = 1.5, mar = c(5, 5, 2, 2),
mgp = c(3.5, 1, 0), cex.lab = 1.3, font.lab = 1.2, font.axis = 1.2,
cex.axis = 1.2, bty = 'n', lwd = 2)
gap.plot(x, rf_accuracy*100, gap=c(from,to), type = "b", xlab="noise", ylab = '% correct classification', ylim = c(0,100), xtics = x, xticlab = c('low', 'medium', 'high'), ytics = c(0, 3, seq(60, 100, by = 20)), col = 'orange')
gap.plot(x, lda_accuracy*100, gap=c(from,to), type = "b", xlab="noise", ylab = "value", ylim = c(0,100), add = T, col = 'green')
gap.plot(x, non_ML_accuracy*100, gap=c(from,to), type = "b", xlab="noise", ylab = "value", ylim = c(0,100), add = T, col = 'black')
gap.plot(x, rep(1/123, 3)*100, gap=c(from,to), type = "b", xlab="noise", ylab = "value", ylim = c(0,100), add = T, col = 'gray50')
axis.break(2, from, breakcol = "black", style="gap")
axis.break(2, from, breakcol="black", style="slash")
dev.off()
## null device
## 1
#to generate the PDF
# pdf("accuracy.pdf", width = 3.5, height = 3.2)
# par(cex.main = 1.5, mar = c(5, 5, 2, 2),
# mgp = c(3.5, 1, 0), cex.lab = 1.3, font.lab = 1.2, font.axis = 1.2,
# cex.axis = 1.2, bty = 'n', lwd = 2)
#bind lda and RF results
metrics = rbind(lda_metrics, rf_metrics)
metrics$noise = factor(rf_metrics$noise, levels = c("low", "med", "high"))
g_precision = ggboxplot(metrics, x = "noise", y = "precision", color = "classifier", palette =c("#00AFBB", "#E7B800"))
g_recall = ggboxplot(metrics, x = "noise", y = "recall", color = "classifier", palette =c("#00AFBB", "#E7B800"))
g_precision
g_recall
#LDA multi-log loss
lda_log_loss = vector(length = 3)
#LDA multi-log loss with low noise
y_true = test_low[[1]][,1]
y_lda_pred = predict(lda_model, newdata = test_low[[1]][,-1], type = 'prob')
lda_log_loss[1] = MultiLogLoss(y_pred = data.matrix(y_lda_pred), y_true = y_true)
#LDA multi-log loss with med noise
y_true = test_med[[1]][,1]
y_lda_pred = predict(lda_model, newdata = test_med[[1]][,-1], type = 'prob')
lda_log_loss[2] = MultiLogLoss(y_pred = data.matrix(y_lda_pred), y_true = y_true)
#LDA multi-log loss with high noise
y_true = test_high[[1]][,1]
y_lda_pred = predict(lda_model, newdata = test_high[[1]][,-1], type = 'prob')
lda_log_loss[3] = MultiLogLoss(y_pred = data.matrix(y_lda_pred), y_true = y_true)
#RF multi-log loss
rf_log_loss = vector(length = 3)
#RF multi-log loss with low noise
y_true = test_low[[1]][,1]
y_rf_pred = predict(rf_model1, newdata = test_low[[1]][,-1], type = 'prob')
rf_log_loss[1] = MultiLogLoss(y_pred = data.matrix(y_rf_pred), y_true = y_true)
#RF multi-log loss with med noise
y_true = test_med[[1]][,1]
#y_true = dummy.data.frame(data.frame(factor(y_true)))
y_rf_pred = predict(rf_model1, newdata = test_med[[1]][,-1], type = 'prob')
rf_log_loss[2] = MultiLogLoss(y_pred = data.matrix(y_rf_pred), y_true = y_true)
#RF multi-log loss with high noise
y_true = test_high[[1]][,1]
y_rf_pred = predict(rf_model1, newdata = test_high[[1]][,-1], type = 'prob')
rf_log_loss[3] = MultiLogLoss(y_pred = data.matrix(y_rf_pred), y_true = y_true)
#plot multilog loss
par(cex.main = 1.5, mar = c(5, 5, 2, 2),
mgp = c(3.5, 1, 0), cex.lab = 1.3, font.lab = 1.2, font.axis = 1.2,
cex.axis = 1.2, bty = 'n', lwd = 2)
plot(1:3, lda_log_loss, ylim = c(32, 33.5), type = 'b',
col = 'black', xlab = '', ylab = 'multiclass log gain', xaxt = "n", pch = 2, lwd = 3)
axis(side = 1, at = 1:3, labels = c('low', 'medium', 'high'))
lines(1:3, rf_log_loss, type = 'b', col = 'black',
pch = 1, lwd = 3)
#to generate PDF
# pdf("multiclass_log_loss.pdf", width = 3.5, height = 3.2)
# par(cex.main = 1.5, mar = c(5, 5, 2, 2),
# mgp = c(3.5, 1, 0), cex.lab = 1.3, font.lab = 1.2, font.axis = 1.2,
# cex.axis = 1.2, bty = 'n', lwd = 2)
sessionInfo()
## R version 3.4.0 (2017-04-21)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Sierra 10.12.6
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] parallel stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] limSolve_1.5.5.2 Biobase_2.38.0 BiocGenerics_0.24.0
## [4] limma_3.34.9 ggpubr_0.2 magrittr_1.5
## [7] mltest_1.0.1 plotrix_3.7-4 randomForest_4.6-12
## [10] caret_6.0-81 ggplot2_3.1.0.9000 lattice_0.20-35
## [13] MLmetrics_1.1.1 R.utils_2.6.0 R.oo_1.22.0
## [16] R.methodsS3_1.7.1
##
## loaded via a namespace (and not attached):
## [1] Rcpp_1.0.1 lubridate_1.6.0 class_7.3-14
## [4] assertthat_0.2.0 rprojroot_1.2 digest_0.6.18
## [7] ipred_0.9-8 foreach_1.4.4 R6_2.4.0
## [10] plyr_1.8.4 backports_1.1.0 stats4_3.4.0
## [13] evaluate_0.10.1 pillar_1.3.1 rlang_0.3.1
## [16] lazyeval_0.2.1 rstudioapi_0.7 rpart_4.1-11
## [19] Matrix_1.2-9 rmarkdown_1.6 labeling_0.3
## [22] splines_3.4.0 gower_0.2.0 stringr_1.3.0
## [25] munsell_0.5.0 compiler_3.4.0 xfun_0.6
## [28] pkgconfig_2.0.2 htmltools_0.3.6 nnet_7.3-12
## [31] tidyselect_0.2.5 lpSolve_5.6.13 tibble_2.0.1
## [34] prodlim_2018.04.18 quadprog_1.5-5 codetools_0.2-15
## [37] crayon_1.3.4 dplyr_0.7.8 withr_2.1.2.9000
## [40] MASS_7.3-47 recipes_0.1.5 ModelMetrics_1.1.0
## [43] grid_3.4.0 nlme_3.1-131 gtable_0.3.0
## [46] scales_1.0.0.9000 stringi_1.1.7 reshape2_1.4.3
## [49] bindrcpp_0.2.2 timeDate_3012.100 generics_0.0.2
## [52] lava_1.6.5 iterators_1.0.10 tools_3.4.0
## [55] glue_1.3.0 purrr_0.2.4 survival_2.41-3
## [58] yaml_2.2.0 colorspace_1.4-0 knitr_1.22
## [61] bindr_0.1.1