Ubiqum Code Academy

This tutorial shows how to automate feature selection and model selection using loops in R. The first loop trains models with different methods, the second trains one method with different sets of variables, and the third trains every combination of the two.

Creating random data

pacman:: p_load(caret, reshape, ggplot2, dplyr)

# Seed the RNG before drawing the data so the whole dataset is reproducible
# from run to run.
set.seed(123)

# Five independent columns of 50 uniform random numbers between 0 and 100.
# y is the target; because it is unrelated to the predictors, the models
# fitted on this data are not expected to predict it well.
x <- runif(50, min = 0, max = 100)
z <- runif(50, min = 0, max = 100)
a <- runif(50, min = 0, max = 100)
b <- runif(50, min = 0, max = 100)
y <- runif(50, min = 0, max = 100)

# data.frame() names the columns after the variables directly; no need to
# go through an intermediate matrix with cbind().
df <- data.frame(x, z, a, b, y)

Generating train and test sets

# Fix the RNG seed so the train/test split is the same on every run.
set.seed(100)

# Row indices of a 70% training partition drawn with respect to y.
# list = FALSE asks for the indices as a matrix rather than a list, so they
# can be used directly for row subsetting. FALSE is spelled out because the
# F alias can be reassigned.
in_training <- createDataPartition(df$y, p = 0.7, list = FALSE)

train <- df[in_training, ]
test <- df[-in_training, ]

Loop 1 - train a model with several methods

# caret method names to compare; every model uses all predictors (y ~ .).
a <- c("lm", "rf", "knn", "svmLinear", "svmRadial")

# One slot per method, filled inside the loop and bound into a matrix
# afterwards, so the result is not re-copied on every iteration.
metrics_by_method <- vector("list", length(a))
names(metrics_by_method) <- a

for (i in a) {
  model <- train(y ~ ., data = train, method = i)

  pred <- predict(model, newdata = test)

  # postResample() takes the predictions first and the observed values
  # second; name the arguments so the order cannot be mixed up.
  pred_metric <- postResample(pred = pred, obs = test$y)

  metrics_by_method[[i]] <- pred_metric
}

# Metrics in rows, one column per method (column names come from the list).
compare_model <- do.call(cbind, metrics_by_method)

compare_model
##                   lm          rf        knn  svmLinear  svmRadial
## RMSE     29.96703306 30.14808640 29.6961717 27.2077050 27.8245310
## Rsquared  0.03317504  0.02877531  0.0515163  0.3113565  0.2903578
## MAE      25.38608621 25.66321290 25.6691583 22.4357930 23.2843748

Structure table for plot

# Reshape the metric-by-model matrix into long format: one row per
# (metric, model) pair with its value, ready for faceted plotting.
compare_model_melt <- melt(compare_model, varnames = c("metric", "model"))
# as_data_frame() is deprecated; as_tibble() is its replacement.
compare_model_melt <- as_tibble(compare_model_melt)
compare_model_melt
## # A tibble: 15 x 3
##    metric   model       value
##    <fct>    <fct>       <dbl>
##  1 RMSE     lm        30.0   
##  2 Rsquared lm         0.0332
##  3 MAE      lm        25.4   
##  4 RMSE     rf        30.1   
##  5 Rsquared rf         0.0288
##  6 MAE      rf        25.7   
##  7 RMSE     knn       29.7   
##  8 Rsquared knn        0.0515
##  9 MAE      knn       25.7   
## 10 RMSE     svmLinear 27.2   
## 11 Rsquared svmLinear  0.311 
## 12 MAE      svmLinear 22.4   
## 13 RMSE     svmRadial 27.8   
## 14 Rsquared svmRadial  0.290 
## 15 MAE      svmRadial 23.3

Plot

# Bar chart of every metric per method: one facet row per metric, each with
# a free scale.
ggplot(data = compare_model_melt, mapping = aes(x = model, y = value)) +
  geom_col() +
  facet_grid(metric ~ ., scales = "free")

Loop 2 - train a model with several combinations of variables

# Candidate model formulas, kept as strings so they can double as column
# labels in the results table.
a <- c("y ~ x + a", "y ~ z + a", "y ~ a")

# One slot per formula, filled inside the loop and bound into a matrix
# afterwards, so the result is not re-copied on every iteration.
metrics_by_formula <- vector("list", length(a))
names(metrics_by_formula) <- a

for (i in a) {
  # formula() turns the string into a formula object for train().
  model <- train(formula(i), data = train, method = "lm")

  pred <- predict(model, newdata = test)

  # postResample() takes the predictions first and the observed values
  # second; name the arguments so the order cannot be mixed up.
  pred_metric <- postResample(pred = pred, obs = test$y)

  metrics_by_formula[[i]] <- pred_metric
}

# Metrics in rows, one column per formula (column names come from the list).
compare_var <- do.call(cbind, metrics_by_formula)

compare_var
##           y ~ x + a  y ~ z + a     y ~ a
## RMSE     30.1664330 29.4225646 30.088915
## Rsquared  0.1095318  0.4121272  0.264659
## MAE      25.6668206 25.2873695 25.606911

Structure data for plot

# Long format: one row per (metric, formula) pair with its value.
compare_var_melt <- as.data.frame(
  melt(compare_var, varnames = c("metric", "model"))
)
compare_var_melt
##     metric     model      value
## 1     RMSE y ~ x + a 30.1664330
## 2 Rsquared y ~ x + a  0.1095318
## 3      MAE y ~ x + a 25.6668206
## 4     RMSE y ~ z + a 29.4225646
## 5 Rsquared y ~ z + a  0.4121272
## 6      MAE y ~ z + a 25.2873695
## 7     RMSE     y ~ a 30.0889145
## 8 Rsquared     y ~ a  0.2646590
## 9      MAE     y ~ a 25.6069114

Plot

# Bar chart of every metric per formula: one facet row per metric, each with
# a free scale.
ggplot(data = compare_var_melt, mapping = aes(x = model, y = value)) +
  geom_col() +
  facet_grid(metric ~ ., scales = "free")

Loop 3 - train a model with several combinations of variables and methods

# Candidate formulas (as strings) and caret methods; every formula is
# trained with every method.
a <- c("y ~ x + a", "y ~ z + a", "y ~ a")
b <- c("lm", "knn")

# Column labels "<formula> <method>", in the same order the nested loops
# below visit the combinations (formula varies slowest, method fastest).
names_var <- paste(rep(a, each = length(b)), rep(b, times = length(a)))

# One slot per combination, filled inside the loops and bound into a matrix
# afterwards, so the result is not re-copied on every iteration.
metrics_by_combo <- vector("list", length(names_var))
names(metrics_by_combo) <- names_var

for (i in a) {
  for (j in b) {
    # formula() turns the string into a formula object for train().
    model <- train(formula(i), data = train, method = j)

    pred <- predict(model, newdata = test)

    # postResample() takes the predictions first and the observed values
    # second; name the arguments so the order cannot be mixed up.
    pred_metric <- postResample(pred = pred, obs = test$y)

    # Store under the same label used in names_var, so no second pass over
    # the combinations is needed to name the columns.
    metrics_by_combo[[paste(i, j)]] <- pred_metric
  }
}

# Metrics in rows, one column per formula/method combination.
compare_var_mod <- do.call(cbind, metrics_by_combo)

compare_var_mod
##          y ~ x + a lm y ~ x + a knn y ~ z + a lm y ~ z + a knn  y ~ a lm
## RMSE       30.1664330    28.4684608   29.4225646  3.093764e+01 30.088915
## Rsquared    0.1095318     0.2442604    0.4121272  3.053643e-05  0.264659
## MAE        25.6668206    22.5870947   25.2873695  2.694281e+01 25.606911
##           y ~ a knn
## RMSE     33.6925664
## Rsquared  0.0941204
## MAE      27.1860601

Structure data for plot

# Long format: one row per (metric, formula/method combination) pair with
# its value.
compare_var_mod_melt <- as.data.frame(
  melt(compare_var_mod, varnames = c("metric", "model"))
)
compare_var_mod_melt
##      metric         model        value
## 1      RMSE  y ~ x + a lm 3.016643e+01
## 2  Rsquared  y ~ x + a lm 1.095318e-01
## 3       MAE  y ~ x + a lm 2.566682e+01
## 4      RMSE y ~ x + a knn 2.846846e+01
## 5  Rsquared y ~ x + a knn 2.442604e-01
## 6       MAE y ~ x + a knn 2.258709e+01
## 7      RMSE  y ~ z + a lm 2.942256e+01
## 8  Rsquared  y ~ z + a lm 4.121272e-01
## 9       MAE  y ~ z + a lm 2.528737e+01
## 10     RMSE y ~ z + a knn 3.093764e+01
## 11 Rsquared y ~ z + a knn 3.053643e-05
## 12      MAE y ~ z + a knn 2.694281e+01
## 13     RMSE      y ~ a lm 3.008891e+01
## 14 Rsquared      y ~ a lm 2.646590e-01
## 15      MAE      y ~ a lm 2.560691e+01
## 16     RMSE     y ~ a knn 3.369257e+01
## 17 Rsquared     y ~ a knn 9.412040e-02
## 18      MAE     y ~ a knn 2.718606e+01

Plot

# Bar chart of every metric per formula/method combination: one facet row
# per metric, each with a free scale.
ggplot(data = compare_var_mod_melt, mapping = aes(x = model, y = value)) +
  geom_col() +
  facet_grid(metric ~ ., scales = "free")