In this R Markdown file, the results of the screening experiments are imported, and then a design with 6 factors, each having 2 levels, and 5 replications is created. The performed experiments and the design are then linked to each other. The factors are: algorithm, resampling, feature_selection, imputation_num, imputation_cat and encoding.
For convenience, we used the pacman package, since it allows installing/loading the needed packages in one step. The files are read from a local drive; you can replace the paths with your own to replicate the results.
# ---- Session setup ----
# Start from a clean session: remove every object from the global environment
# and close all open graphics devices.
rm(list = ls())
graphics.off()
# gc()

# pacman installs (when missing) and attaches packages in a single call, so
# make sure pacman itself is available first. requireNamespace() is the
# recommended availability test; scanning installed.packages() is slow and is
# discouraged for this purpose. The install is done over HTTPS.
if (!requireNamespace("pacman", quietly = TRUE)) {
  install.packages(pkgs = "pacman", repos = "https://cran.us.r-project.org")
}
library(pacman)

# Design-of-experiments packages plus the R Commander plug-ins.
p_load(
  DoE.base, FrF2, DoE.wrapper, RcmdrPlugin.Export,
  RcmdrPlugin.FactoMineR, RcmdrPlugin.HH,
  RcmdrPlugin.TeachingDemos, RcmdrPlugin.orloca, Rcmdr, conf.design,
  lhs, AlgDesign, DiceDesign, rsm, RcmdrPlugin.DoE
)
# RcmdrPlugin.DoE helps to produce the code required in DoE
# library(RcmdrPlugin.DoE)
# install.packages("RcmdrPlugin.Export") # Graphically export objects to LaTeX or HTML
# install.packages("RcmdrPlugin.FactoMineR") # Graphical User Interface for FactoMineR
# install.packages("RcmdrPlugin.HH") # Rcmdr support for the HH package
# install.packages("RcmdrPlugin.IPSUR") # Introduction to Probability and Statistics Using R
# install.packages("RcmdrPlugin.SurvivalT") # Rcmdr Survival Plug-In
# install.packages("RcmdrPlugin.TeachingDemos") # Rcmdr Teaching Demos Plug-In
# install.packages("RcmdrPlugin.epack") # Rcmdr plugin for time series
# install.packages("RcmdrPlugin.orloca") # orloca Rcmdr Plug-in
# install.packages("Rcmdr") # at the R prompt
# ---- Load the experiment results ----
# NOTE(review): absolute paths on the author's local drive; point them at your
# own copies of the result files to replicate the analysis.
experiments_glm <- read.csv(
  "C:/Users/hza0020/OneDrive - Auburn University/Transplant/BUAL-LAB/DoE/_screen_glm_results.csv"
)
experiments_rf <- read.csv(
  "C:/Users/hza0020/OneDrive - Auburn University/Transplant/BUAL-LAB/DoE/_screen_rf_results.csv"
)

# ---- Build the screening design ----
# Full factorial design: 6 two-level factors, 5 replications, randomized run
# order with a fixed seed.
screening_design <- fac.design(
  nfactors = 6, replications = 5, repeat.only = FALSE,
  blocks = 1, randomize = TRUE, seed = 2019, nlevels = c(2, 2, 2, 2, 2, 2),
  factor.names = list(
    algorithm = c("glm", "ranger"),
    resampling = c("none", "smote"),
    feature_selection = c("FFS", "LASSO"),
    imputation_num = c("DROP", "MEDIAN"),
    imputation_cat = c("DROP", "MODE"),
    encoding = c("LABEL", "HARD")
  )
)
## creator element of design.info will be different, when using the command line command!

# Make an ID for each design run by concatenating its six factor levels, and
# remember the (randomized) run order so it can be restored after the merge.
screening_design$ex.ID <- paste0(
  screening_design[["algorithm"]], screening_design[["resampling"]],
  screening_design[["feature_selection"]], screening_design[["imputation_num"]],
  screening_design[["imputation_cat"]], screening_design[["encoding"]]
)
screening_design$row.ID <- rownames(screening_design)

# ---- Merge the results of the experiments ----
experiments_screen <- rbind(experiments_glm, experiments_rf)

# Geometric mean of sensitivity and specificity as a performance measure
experiments_screen$gmean <- sqrt(
  experiments_screen$Sensitivity * experiments_screen$Specificity
)

# Same ID construction on the experiment side (stored under a different name
# so both IDs can sit side by side after the merge).
experiments_screen$ex.ID0 <- paste0(
  experiments_screen[["algorithm"]], experiments_screen[["resampling"]],
  experiments_screen[["feature_selection"]], experiments_screen[["imputation_num"]],
  experiments_screen[["imputation_cat"]], experiments_screen[["encoding"]]
)

# Align both tables on the ID, then bind the responses onto the design.
screening_design <- screening_design[order(screening_design$ex.ID), ]
experiments_screen <- experiments_screen[order(experiments_screen$ex.ID0), ]
screening_design <- cbind(
  screening_design,
  experiments_screen[c("ex.ID0", "AUC", "gmean")]
)

# Check that the right rows were combined: the design ID and the experiment ID
# bound next to it must agree on every row. (Comparing
# experiments_screen$ex.ID with experiments_screen$ex.ID0 would partial-match
# `ex.ID` to `ex.ID0`, compare the column with itself, and always pass.)
if (isTRUE(all(screening_design[["ex.ID"]] == screening_design[["ex.ID0"]]))) {
  cat("Right Matching")
} else {
  cat("Data are not matched correctly")
}
## Right Matching

# Now sort the experiments back into the original run order
screening_design <- screening_design[order(as.numeric(screening_design$row.ID)), ]
I limit the interactions to two-factor interactions: first I include all of them, and then I drop the insignificant factors/interactions step by step.
First, I perform the DoE analysis with gmean as the response variable.
# Full screening model for gmean: the six main effects plus every two-factor
# interaction. The `^2` expansion produces the same terms, in the same order,
# as writing out each pair explicitly.
results_gmean1 <- summary(
  lm.default(
    formula = gmean ~ (algorithm + resampling + feature_selection +
      imputation_num + imputation_cat + encoding)^2,
    data = screening_design
  )
)
# Coefficient table rounded to three decimals
round(results_gmean1[["coefficients"]], digits = 3)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.235 0.002 95.298 0.000
## algorithm1 -0.036 0.002 -14.736 0.000
## resampling1 0.188 0.002 76.077 0.000
## feature_selection1 0.011 0.002 4.278 0.000
## imputation_num1 0.007 0.002 2.892 0.004
## imputation_cat1 0.000 0.002 -0.104 0.917
## encoding1 -0.002 0.002 -0.800 0.424
## algorithm1:resampling1 -0.033 0.002 -13.162 0.000
## algorithm1:feature_selection1 -0.020 0.002 -8.083 0.000
## algorithm1:imputation_num1 -0.002 0.002 -0.990 0.323
## algorithm1:imputation_cat1 -0.003 0.002 -1.261 0.208
## algorithm1:encoding1 -0.003 0.002 -1.260 0.209
## resampling1:feature_selection1 -0.002 0.002 -0.947 0.344
## resampling1:imputation_num1 -0.011 0.002 -4.319 0.000
## resampling1:imputation_cat1 0.001 0.002 0.392 0.695
## resampling1:encoding1 0.000 0.002 0.130 0.896
## feature_selection1:imputation_num1 0.004 0.002 1.746 0.082
## feature_selection1:imputation_cat1 0.009 0.002 3.840 0.000
## feature_selection1:encoding1 0.006 0.002 2.388 0.018
## imputation_num1:imputation_cat1 -0.016 0.002 -6.322 0.000
## imputation_num1:encoding1 -0.003 0.002 -1.076 0.283
## imputation_cat1:encoding1 0.001 0.002 0.552 0.581
cat("\n")
# The value printed here is the *adjusted* R-squared (adj.r.squared), so the
# label says so; the later models report the plain r.squared.
cat(paste("Adjusted R-Squared is: ", results_gmean1$adj.r.squared))
## Adjusted R-Squared is: 0.9520845678058
Although imputation_cat and encoding are not significant as main effects, their interactions are significant, so we keep them. Therefore, we construct a full factorial design.
# Reduced gmean model: all six main effects plus only the two-factor
# interactions retained after the first fit. Because every main effect is
# already listed, each interaction is written with `:` instead of `*`.
results_gmean2 <- summary(
  lm.default(
    formula = gmean ~ algorithm + resampling + feature_selection +
      imputation_num + imputation_cat + encoding +
      algorithm:resampling +
      algorithm:feature_selection +
      resampling:imputation_num +
      feature_selection:imputation_cat +
      feature_selection:encoding +
      imputation_num:imputation_cat,
    data = screening_design
  )
)
# Coefficient table rounded to three decimals
round(results_gmean2[["coefficients"]], digits = 3)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.235 0.002 95.183 0.000
## algorithm1 -0.036 0.002 -14.718 0.000
## resampling1 0.188 0.002 75.986 0.000
## feature_selection1 0.011 0.002 4.273 0.000
## imputation_num1 0.007 0.002 2.888 0.004
## imputation_cat1 0.000 0.002 -0.104 0.917
## encoding1 -0.002 0.002 -0.799 0.425
## algorithm1:resampling1 -0.033 0.002 -13.146 0.000
## algorithm1:feature_selection1 -0.020 0.002 -8.073 0.000
## resampling1:imputation_num1 -0.011 0.002 -4.314 0.000
## feature_selection1:imputation_cat1 0.009 0.002 3.835 0.000
## feature_selection1:encoding1 0.006 0.002 2.385 0.018
## imputation_num1:imputation_cat1 -0.016 0.002 -6.314 0.000
# Blank line followed by the R-squared of the reduced gmean model
cat(
  "\n",
  paste("R-Squared is: ", results_gmean2[["r.squared"]]),
  sep = ""
)
## R-Squared is: 0.953776357358434
I limit the interactions to two-factor interactions: first I include all of them, and then I drop the insignificant factors/interactions step by step.
Next, I perform the DoE analysis with AUC as the response variable.
# Full screening model for AUC: the six main effects plus every two-factor
# interaction. The `^2` expansion produces the same terms, in the same order,
# as writing out each pair explicitly.
results_AUC1 <- summary(
  lm.default(
    formula = AUC ~ (algorithm + resampling + feature_selection +
      imputation_num + imputation_cat + encoding)^2,
    data = screening_design
  )
)
# Coefficient table rounded to three decimals
round(results_AUC1[["coefficients"]], digits = 3)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.615 0.001 915.871 0.000
## algorithm1 -0.003 0.001 -4.145 0.000
## resampling1 -0.001 0.001 -1.263 0.207
## feature_selection1 0.015 0.001 22.804 0.000
## imputation_num1 0.005 0.001 7.450 0.000
## imputation_cat1 0.003 0.001 5.028 0.000
## encoding1 -0.001 0.001 -1.822 0.069
## algorithm1:resampling1 -0.002 0.001 -2.959 0.003
## algorithm1:feature_selection1 -0.003 0.001 -3.992 0.000
## algorithm1:imputation_num1 0.001 0.001 0.930 0.353
## algorithm1:imputation_cat1 0.001 0.001 1.541 0.124
## algorithm1:encoding1 -0.001 0.001 -1.380 0.169
## resampling1:feature_selection1 0.000 0.001 -0.166 0.868
## resampling1:imputation_num1 0.000 0.001 0.184 0.854
## resampling1:imputation_cat1 0.000 0.001 0.423 0.673
## resampling1:encoding1 -0.001 0.001 -0.941 0.347
## feature_selection1:imputation_num1 0.005 0.001 7.425 0.000
## feature_selection1:imputation_cat1 0.003 0.001 4.248 0.000
## feature_selection1:encoding1 0.001 0.001 1.105 0.270
## imputation_num1:imputation_cat1 -0.006 0.001 -8.928 0.000
## imputation_num1:encoding1 0.001 0.001 1.417 0.157
## imputation_cat1:encoding1 0.001 0.001 1.399 0.163
# Blank line followed by the R-squared of the full AUC model
cat(
  "\n",
  paste("R-Squared is: ", results_AUC1[["r.squared"]]),
  sep = ""
)
## R-Squared is: 0.731517510223042
# Second AUC model: all six main effects plus only the two-factor interactions
# retained after the full fit. Because every main effect is already listed,
# each interaction is written with `:` instead of `*`.
results_AUC2 <- summary(
  lm.default(
    formula = AUC ~ algorithm + resampling + feature_selection +
      imputation_num + imputation_cat + encoding +
      algorithm:resampling +
      algorithm:feature_selection +
      feature_selection:imputation_num +
      feature_selection:imputation_cat +
      imputation_num:imputation_cat,
    data = screening_design
  )
)
# Coefficient table rounded to three decimals
round(results_AUC2[["coefficients"]], digits = 3)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.615 0.001 913.710 0.000
## algorithm1 -0.003 0.001 -4.135 0.000
## resampling1 -0.001 0.001 -1.260 0.209
## feature_selection1 0.015 0.001 22.750 0.000
## imputation_num1 0.005 0.001 7.433 0.000
## imputation_cat1 0.003 0.001 5.017 0.000
## encoding1 -0.001 0.001 -1.818 0.070
## algorithm1:resampling1 -0.002 0.001 -2.952 0.003
## algorithm1:feature_selection1 -0.003 0.001 -3.982 0.000
## feature_selection1:imputation_num1 0.005 0.001 7.408 0.000
## feature_selection1:imputation_cat1 0.003 0.001 4.238 0.000
## imputation_num1:imputation_cat1 -0.006 0.001 -8.907 0.000
# Blank line followed by the R-squared of the second AUC model
cat(
  "\n",
  paste("R-Squared is: ", results_AUC2[["r.squared"]]),
  sep = ""
)
## R-Squared is: 0.721194029662379
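Resampling is not significant as a main effect, but since its interaction with algorithm is significant we keep it. Encoding is neither significant nor part of a retained interaction, so it is dropped.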
# Third AUC model: same as the second one but without the encoding main
# effect. Because every remaining main effect is already listed, each
# interaction is written with `:` instead of `*`.
results_AUC3 <- summary(
  lm.default(
    formula = AUC ~ algorithm + resampling + feature_selection +
      imputation_num + imputation_cat +
      algorithm:resampling +
      algorithm:feature_selection +
      feature_selection:imputation_num +
      feature_selection:imputation_cat +
      imputation_num:imputation_cat,
    data = screening_design
  )
)
# Coefficient table rounded to three decimals
round(results_AUC3[["coefficients"]], digits = 3)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.615 0.001 910.322 0.000
## algorithm1 -0.003 0.001 -4.120 0.000
## resampling1 -0.001 0.001 -1.256 0.210
## feature_selection1 0.015 0.001 22.665 0.000
## imputation_num1 0.005 0.001 7.405 0.000
## imputation_cat1 0.003 0.001 4.998 0.000
## algorithm1:resampling1 -0.002 0.001 -2.941 0.004
## algorithm1:feature_selection1 -0.003 0.001 -3.967 0.000
## feature_selection1:imputation_num1 0.005 0.001 7.380 0.000
## feature_selection1:imputation_cat1 0.003 0.001 4.223 0.000
## imputation_num1:imputation_cat1 -0.006 0.001 -8.874 0.000
# Blank line followed by the R-squared of the third AUC model
cat(
  "\n",
  paste("R-Squared is: ", results_AUC3[["r.squared"]]),
  sep = ""
)
## R-Squared is: 0.718203162435575
The next step will be full factorial design
Auburn University, hamid@auburn.edu↩
University of Dayton, ychen4@udayton.edu↩
Miami University, fmegahed@miamioh.edu↩