This Markdown file demonstrates how to generate a training set of simulated interaction profiles over a given range of expression values and noise levels. The simulated data are used to train two classifiers: Linear Discriminant Analysis (LDA) and Random Forest (RF). The performance of both classifiers is then measured on independent test sets. For comparison, we also include a deterministic match, a classification scheme that is not based on machine learning. This file may take a while to compile, depending on the size of the training and test sets.

Generating a training set

#install any missing packages, then load them all
packages = c("R.utils", "MLmetrics", "caret",
             "randomForest", "plotrix", "mltest", "ggpubr")

package_check = lapply(packages, FUN = function(x) {
  if (!require(x, character.only = TRUE)) {
    install.packages(x, dependencies = TRUE)
    library(x, character.only = TRUE)
    }
  })

#source helper functions (e.g. generate_synthetic_data_v2, basic_classifier_accuracy)
sourceDirectory("../utils")

prof_codes = read.table("../data/profile_codes_v2.txt", header = TRUE, sep = "\t")

#This may take a while...

#range of expression values for simulations
#expr_range = c(-14, 14)

#number of simulated instances per profile
#ntimes = 400

#signal-to-noise ratios to sample from
#signal_to_noise_range = seq(1.5, 4.5, by = 0.5)

#training_set[[1]]: column 1 holds the profile code (class label),
#the remaining columns hold the simulated expression values
#training_set = generate_synthetic_data_v2(expr_range, ntimes,
#                                          signal_to_noise_range, prof_codes = prof_codes)

#save(training_set, file = 'training_set')

Training LDA and RF

# load("training_set")
# 
# #train LDA
# lda_model = train(training_set[[1]][,-1],
#                   as.factor(training_set[[1]][,1]),
#                   method = "lda", preProcess = "pca")
# 
# save(lda_model, file = 'lda_model')
# 
# 
# #train RF: tuneRF searches over mtry values and reports their out-of-bag (OOB) errors
# trf = tuneRF(training_set[[1]][,-1], as.factor(training_set[[1]][,1]))
# 
# #mtry = 8, presumably chosen from the tuneRF output above
# rf_model1 = randomForest(training_set[[1]][,-1],
#                 as.factor(training_set[[1]][,1]), mtry = 8)
# 
# save(rf_model1, file = '../data/rf_model1')
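Rather than hard-coding mtry, the best value could be read off tuneRF's return value, a matrix with one row per candidate (its columns are named mtry and OOBError in current randomForest versions); a minimal sketch, commented out like the chunk above since it requires trf:

#pick the mtry with the lowest out-of-bag error
#best_mtry = trf[which.min(trf[, "OOBError"]), "mtry"]
#rf_model1 = randomForest(training_set[[1]][,-1],
#                         as.factor(training_set[[1]][,1]), mtry = best_mtry)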

Generating independent test sets with variable noise

#range of expression values for simulations
expr_range = c(-14, 14)

#number of simulated instances per profile
ntimes = 100

#generate test set with low noise
signal_to_noise_range = 4
test_low = generate_synthetic_data_v2(expr_range, ntimes,
                                         signal_to_noise_range, prof_codes = prof_codes)

#generate test set with medium noise
signal_to_noise_range = 2.5
test_med = generate_synthetic_data_v2(expr_range, ntimes,
                                         signal_to_noise_range, prof_codes = prof_codes)

#generate test set with high noise
signal_to_noise_range = 2
test_high = generate_synthetic_data_v2(expr_range, ntimes,
                                         signal_to_noise_range, prof_codes = prof_codes)
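Assuming, as the evaluation code below does, that each test set stores its class labels in the first column of element [[1]], a quick sanity check is to confirm that every profile code appears ntimes times:

#sanity check: each profile code should appear ntimes (here 100) times
#table(test_low[[1]][, 1])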

Measuring classifiers’ accuracy on test sets

#loading the already trained models
load("../data/lda_model")
load("../data/rf_model1")


#basic classifier - deterministic match without machine learning
non_ML_accuracy = c(basic_classifier_accuracy(test_low, 
                                              prof_codes = prof_codes),
                    basic_classifier_accuracy(test_med, 
                                              prof_codes = prof_codes),
                    basic_classifier_accuracy(test_high, 
                                              prof_codes = prof_codes))
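basic_classifier_accuracy is defined in ../utils and not reproduced here. Purely for intuition, a deterministic match could look like the following hypothetical sketch (not the project's implementation): discretize each simulated profile and search for an exact match among the reference codes.

#hypothetical sketch of a deterministic match, for intuition only:
#discretize a profile to its sign pattern and return the index of the
#exactly matching reference code (NA if none matches)
#match_profile = function(profile, code_patterns) {
#  pattern = paste(sign(profile), collapse = "")
#  match(pattern, code_patterns)
#}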

lda_accuracy = vector(length = 3)

#LDA accuracy with low noise
y_true = test_low[[1]][,1]
y_lda_pred = predict(lda_model, newdata = test_low[[1]][,-1])
lda_accuracy[1] = Accuracy(y_pred = y_lda_pred, y_true = y_true)

lda_metrics_low = ml_test(predicted = y_lda_pred,
                          true = y_true, output.as.table = TRUE)
lda_metrics_low$noise = 'low'


#LDA accuracy with med noise
y_true = test_med[[1]][,1]
y_lda_pred = predict(lda_model, newdata = test_med[[1]][,-1])
lda_accuracy[2] = Accuracy(y_pred = y_lda_pred, y_true = y_true)

lda_metrics_med = ml_test(predicted = y_lda_pred,
                          true = y_true, output.as.table = TRUE)
lda_metrics_med$noise = 'med'


#LDA accuracy with high noise
y_true = test_high[[1]][,1]
y_lda_pred = predict(lda_model, newdata = test_high[[1]][,-1])
lda_accuracy[3] = Accuracy(y_pred = y_lda_pred, y_true = y_true)

lda_metrics_high = ml_test(predicted = y_lda_pred,
                           true = y_true, output.as.table = TRUE)
lda_metrics_high$noise = 'high'

lda_metrics = rbind(lda_metrics_low, 
                    lda_metrics_med, 
                    lda_metrics_high)

lda_metrics$classifier = 'LDA'


#RF accuracy
rf_accuracy = vector(length = 3)

#RF accuracy with low noise
y_true = test_low[[1]][,1]
y_rf_pred = predict(rf_model1, newdata = test_low[[1]][,-1])
rf_accuracy[1] = Accuracy(y_pred = y_rf_pred, y_true = y_true)

rf_metrics_low = ml_test(predicted = y_rf_pred,
                         true = y_true, output.as.table = TRUE)
rf_metrics_low$noise = 'low'

#RF accuracy with med noise
y_true = test_med[[1]][,1]
y_rf_pred = predict(rf_model1, newdata = test_med[[1]][,-1])
rf_accuracy[2] = Accuracy(y_pred = y_rf_pred, y_true = y_true)

rf_metrics_med = ml_test(predicted = y_rf_pred,
                         true = y_true, output.as.table = TRUE)
rf_metrics_med$noise = 'med'

#RF accuracy with high noise
y_true = test_high[[1]][,1]
y_rf_pred = predict(rf_model1, newdata = test_high[[1]][,-1])
rf_accuracy[3] = Accuracy(y_pred = y_rf_pred, y_true = y_true)

rf_metrics_high = ml_test(predicted = y_rf_pred,
                          true = y_true, output.as.table = TRUE)
rf_metrics_high$noise = 'high'

rf_metrics = rbind(rf_metrics_low,
                   rf_metrics_med,
                   rf_metrics_high)

rf_metrics$classifier = 'RF'
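The six evaluation blocks above differ only in the model and the test set. For illustration, the same quantities could be produced by a small helper (hypothetical; predict() dispatches correctly on both the caret and randomForest objects):

#hypothetical helper: accuracy plus per-class metrics for one model/test-set pair
evaluate_on = function(model, test_set, noise_label) {
  y_true = test_set[[1]][, 1]
  y_pred = predict(model, newdata = test_set[[1]][, -1])
  m = ml_test(predicted = y_pred, true = y_true, output.as.table = TRUE)
  m$noise = noise_label
  list(accuracy = Accuracy(y_pred = y_pred, y_true = y_true), metrics = m)
}

#e.g. evaluate_on(rf_model1, test_med, 'med')$accuracy should equal rf_accuracy[2]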


#plot accuracy with a broken y-axis (the gap spans `from` to `to` on the % scale)
from = 10
to = 50
x = 1:3

par(cex.main = 1.5, mar = c(5, 5, 2, 2),
    mgp = c(3.5, 1, 0), cex.lab = 1.3, font.lab = 1.2, font.axis = 1.2, 
    cex.axis = 1.2, bty = 'n', lwd = 2)



gap.plot(x, rf_accuracy*100, gap = c(from, to), type = "b",
         xlab = "noise", ylab = '% correct classification', ylim = c(0, 100),
         xtics = x, xticlab = c('low', 'medium', 'high'),
         ytics = c(0, 3, seq(60, 100, by = 20)), col = 'orange')

gap.plot(x, lda_accuracy*100, gap = c(from, to), type = "b", add = TRUE, col = 'green')

gap.plot(x, non_ML_accuracy*100, gap = c(from, to), type = "b", add = TRUE, col = 'black')

#chance-level baseline: one correct guess out of 123 profile classes
gap.plot(x, rep(1/123, 3)*100, gap = c(from, to), type = "b", add = TRUE, col = 'gray50')

axis.break(2, from, breakcol = "black", style="gap")
axis.break(2, from, breakcol="black", style="slash")
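The four curves are easier to tell apart with a legend (labels inferred from the plotting calls above):

legend("bottomleft", legend = c("RF", "LDA", "deterministic match", "chance (1/123)"),
       col = c('orange', 'green', 'black', 'gray50'), lty = 1, bty = "n", cex = 0.8)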

dev.off()
## null device 
##           1
#to generate the PDF: open the device, re-run the plotting calls above, then close it
# pdf("accuracy.pdf", width = 3.5, height = 3.2)
# par(cex.main = 1.5, mar = c(5, 5, 2, 2),
#     mgp = c(3.5, 1, 0), cex.lab = 1.3, font.lab = 1.2, font.axis = 1.2,
#     cex.axis = 1.2, bty = 'n', lwd = 2)
# #...gap.plot() and axis.break() calls as above...
# dev.off()

Measuring class-dependent precision and recall

#bind LDA and RF results
metrics = rbind(lda_metrics, rf_metrics)
metrics$noise = factor(metrics$noise, levels = c("low", "med", "high"))

g_precision = ggboxplot(metrics, x = "noise", y = "precision",
                        color = "classifier", palette = c("#00AFBB", "#E7B800"))

g_recall = ggboxplot(metrics, x = "noise", y = "recall",
                     color = "classifier", palette = c("#00AFBB", "#E7B800"))

g_precision

g_recall
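The two panels can also be combined into one figure with ggpubr's ggarrange:

ggarrange(g_precision, g_recall, ncol = 2, common.legend = TRUE, legend = "bottom")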

Measuring multi-log loss on test sets
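Multiclass log loss is the average negative log-probability a classifier assigns to the true class, so lower values are better. As a rough sketch of what MLmetrics::MultiLogLoss computes (assuming y_pred is a probability matrix whose columns are named after the class labels, with probabilities clipped away from 0 and 1):

#illustrative re-implementation, for intuition only
manual_mlogloss = function(y_pred, y_true) {
  eps = 1e-15
  p = pmin(pmax(y_pred, eps), 1 - eps)
  idx = cbind(seq_along(y_true), match(as.character(y_true), colnames(p)))
  -mean(log(p[idx]))
}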

#LDA multi-log loss
lda_log_loss = vector(length = 3)

#LDA multi-log loss with low noise
y_true = test_low[[1]][,1]
y_lda_pred = predict(lda_model, newdata = test_low[[1]][,-1], type = 'prob')
lda_log_loss[1] = MultiLogLoss(y_pred = data.matrix(y_lda_pred), y_true = y_true)

#LDA multi-log loss with med noise
y_true = test_med[[1]][,1]
y_lda_pred = predict(lda_model, newdata = test_med[[1]][,-1], type = 'prob')
lda_log_loss[2] = MultiLogLoss(y_pred = data.matrix(y_lda_pred), y_true = y_true)

#LDA multi-log loss with high noise
y_true = test_high[[1]][,1]
y_lda_pred = predict(lda_model, newdata = test_high[[1]][,-1], type = 'prob')
lda_log_loss[3] = MultiLogLoss(y_pred = data.matrix(y_lda_pred), y_true = y_true)



#RF multi-log loss
rf_log_loss = vector(length = 3)

#RF multi-log loss with low noise
y_true = test_low[[1]][,1]
y_rf_pred = predict(rf_model1, newdata = test_low[[1]][,-1], type = 'prob')
rf_log_loss[1] = MultiLogLoss(y_pred = data.matrix(y_rf_pred), y_true = y_true)

#RF multi-log loss with med noise
y_true = test_med[[1]][,1]
y_rf_pred = predict(rf_model1, newdata = test_med[[1]][,-1], type = 'prob')
rf_log_loss[2] = MultiLogLoss(y_pred = data.matrix(y_rf_pred), y_true = y_true)

#RF multi-log loss with high noise
y_true = test_high[[1]][,1]
y_rf_pred = predict(rf_model1, newdata = test_high[[1]][,-1], type = 'prob')
rf_log_loss[3] = MultiLogLoss(y_pred = data.matrix(y_rf_pred), y_true = y_true)

#plot multiclass log loss
par(cex.main = 1.5, mar = c(5, 5, 2, 2),
    mgp = c(3.5, 1, 0), cex.lab = 1.3, font.lab = 1.2, font.axis = 1.2, 
    cex.axis = 1.2, bty = 'n', lwd = 2)

plot(1:3, lda_log_loss, ylim = c(32, 33.5), type = 'b',
     col = 'black', xlab = 'noise', ylab = 'multiclass log loss', xaxt = "n", pch = 2, lwd = 3)
axis(side = 1, at = 1:3, labels = c('low', 'medium', 'high'))
lines(1:3, rf_log_loss, type = 'b', col = 'black',
      pch = 1, lwd = 3)
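A legend distinguishing the two point styles (pch = 2 for LDA, pch = 1 for RF, matching the calls above):

legend("topleft", legend = c("LDA", "RF"), pch = c(2, 1), lty = 1, lwd = 3, bty = "n")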

#to generate the PDF: open the device, re-run the plotting calls above, then close it
# pdf("multiclass_log_loss.pdf", width = 3.5, height = 3.2)
# par(cex.main = 1.5, mar = c(5, 5, 2, 2),
#     mgp = c(3.5, 1, 0), cex.lab = 1.3, font.lab = 1.2, font.axis = 1.2,
#     cex.axis = 1.2, bty = 'n', lwd = 2)
# #...plot(), axis() and lines() calls as above...
# dev.off()

sessionInfo()
## R version 3.4.0 (2017-04-21)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Sierra 10.12.6
## 
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] parallel  stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] limSolve_1.5.5.2    Biobase_2.38.0      BiocGenerics_0.24.0
##  [4] limma_3.34.9        ggpubr_0.2          magrittr_1.5       
##  [7] mltest_1.0.1        plotrix_3.7-4       randomForest_4.6-12
## [10] caret_6.0-81        ggplot2_3.1.0.9000  lattice_0.20-35    
## [13] MLmetrics_1.1.1     R.utils_2.6.0       R.oo_1.22.0        
## [16] R.methodsS3_1.7.1  
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_1.0.1         lubridate_1.6.0    class_7.3-14      
##  [4] assertthat_0.2.0   rprojroot_1.2      digest_0.6.18     
##  [7] ipred_0.9-8        foreach_1.4.4      R6_2.4.0          
## [10] plyr_1.8.4         backports_1.1.0    stats4_3.4.0      
## [13] evaluate_0.10.1    pillar_1.3.1       rlang_0.3.1       
## [16] lazyeval_0.2.1     rstudioapi_0.7     rpart_4.1-11      
## [19] Matrix_1.2-9       rmarkdown_1.6      labeling_0.3      
## [22] splines_3.4.0      gower_0.2.0        stringr_1.3.0     
## [25] munsell_0.5.0      compiler_3.4.0     xfun_0.6          
## [28] pkgconfig_2.0.2    htmltools_0.3.6    nnet_7.3-12       
## [31] tidyselect_0.2.5   lpSolve_5.6.13     tibble_2.0.1      
## [34] prodlim_2018.04.18 quadprog_1.5-5     codetools_0.2-15  
## [37] crayon_1.3.4       dplyr_0.7.8        withr_2.1.2.9000  
## [40] MASS_7.3-47        recipes_0.1.5      ModelMetrics_1.1.0
## [43] grid_3.4.0         nlme_3.1-131       gtable_0.3.0      
## [46] scales_1.0.0.9000  stringi_1.1.7      reshape2_1.4.3    
## [49] bindrcpp_0.2.2     timeDate_3012.100  generics_0.0.2    
## [52] lava_1.6.5         iterators_1.0.10   tools_3.4.0       
## [55] glue_1.3.0         purrr_0.2.4        survival_2.41-3   
## [58] yaml_2.2.0         colorspace_1.4-0   knitr_1.22        
## [61] bindr_0.1.1