This tutorial shows how to automate feature selection and model selection using loops in R. The first loop trains models with different methods, the second with different sets of variables, and the third with every combination of the two.
pacman:: p_load(caret, reshape, ggplot2, dplyr)
# Seed the RNG *before* the first random draw so that the simulated data,
# the train/test partition and everything fitted afterwards are reproducible.
set.seed(100)

# Simulate 50 observations of five Uniform(0, 100) variables, each drawn
# independently. `y` is used as the response by the code below; `x`, `z`,
# `a` and `b` are the candidate predictors.
n_obs <- 50
df <- data.frame(
  x = runif(n_obs, min = 0, max = 100),
  z = runif(n_obs, min = 0, max = 100),
  a = runif(n_obs, min = 0, max = 100),
  b = runif(n_obs, min = 0, max = 100),
  y = runif(n_obs, min = 0, max = 100)
)

# Partition on the response with p = 0.7: the selected rows form the
# training set, the remaining rows the test set.
in_training <- createDataPartition(df$y, p = 0.7, list = FALSE)
train <- df[in_training, ]
test <- df[-in_training, ]
# Compare several learning methods on the full formula `y ~ .`.
# For each method: fit on the training set, predict the test set and collect
# the test-set metrics. The result is a metric-by-method matrix.
model_methods <- c("lm", "rf", "knn", "svmLinear", "svmRadial")

compare_model <- vapply(
  model_methods,
  function(method_name) {
    # `caret::` makes explicit that this is the function, not the `train`
    # data frame passed as `data`.
    fit <- caret::train(y ~ ., data = train, method = method_name)
    pred <- predict(fit, newdata = test)
    # postResample() takes the predictions first and the observed values
    # second.
    postResample(pred = pred, obs = test$y)
  },
  # One numeric value per metric; the names become the row names.
  FUN.VALUE = c(RMSE = 0, Rsquared = 0, MAE = 0)
)
compare_model
## lm rf knn svmLinear svmRadial
## RMSE 29.96703306 30.14808640 29.6961717 27.2077050 27.8245310
## Rsquared 0.03317504 0.02877531 0.0515163 0.3113565 0.2903578
## MAE 25.38608621 25.66321290 25.6691583 22.4357930 23.2843748
# Reshape the metric-by-method matrix to long format (one row per
# metric/model pair) so it can be plotted with ggplot2.
compare_model_melt <- melt(compare_model, varnames = c("metric", "model"))
# as_tibble() replaces the deprecated as_data_frame().
compare_model_melt <- as_tibble(compare_model_melt)
compare_model_melt
## # A tibble: 15 x 3
## metric model value
## <fct> <fct> <dbl>
## 1 RMSE lm 30.0
## 2 Rsquared lm 0.0332
## 3 MAE lm 25.4
## 4 RMSE rf 30.1
## 5 Rsquared rf 0.0288
## 6 MAE rf 25.7
## 7 RMSE knn 29.7
## 8 Rsquared knn 0.0515
## 9 MAE knn 25.7
## 10 RMSE svmLinear 27.2
## 11 Rsquared svmLinear 0.311
## 12 MAE svmLinear 22.4
## 13 RMSE svmRadial 27.8
## 14 Rsquared svmRadial 0.290
## 15 MAE svmRadial 23.3
# Bar chart of each metric per model: one facet row per metric, with free
# scales so every metric gets its own axis range.
ggplot(data = compare_model_melt, mapping = aes(x = model, y = value)) +
  geom_col() +
  facet_grid(metric ~ ., scales = "free")
# Compare several predictor sets using a linear model.
# For each formula: fit `lm` on the training set, predict the test set and
# collect the test-set metrics. The result is a metric-by-formula matrix.
var_formulas <- c("y ~ x + a", "y ~ z + a", "y ~ a")

compare_var <- vapply(
  var_formulas,
  function(spec) {
    # `caret::` makes explicit that this is the function, not the `train`
    # data frame passed as `data`.
    fit <- caret::train(as.formula(spec), data = train, method = "lm")
    pred <- predict(fit, newdata = test)
    # postResample() takes the predictions first and the observed values
    # second.
    postResample(pred = pred, obs = test$y)
  },
  # One numeric value per metric; the names become the row names.
  FUN.VALUE = c(RMSE = 0, Rsquared = 0, MAE = 0)
)
compare_var
## y ~ x + a y ~ z + a y ~ a
## RMSE 30.1664330 29.4225646 30.088915
## Rsquared 0.1095318 0.4121272 0.264659
## MAE 25.6668206 25.2873695 25.606911
# Long format: one row per (metric, formula) pair, kept as a plain data frame.
compare_var_melt <- compare_var %>%
  melt(varnames = c("metric", "model")) %>%
  as.data.frame()
compare_var_melt
## metric model value
## 1 RMSE y ~ x + a 30.1664330
## 2 Rsquared y ~ x + a 0.1095318
## 3 MAE y ~ x + a 25.6668206
## 4 RMSE y ~ z + a 29.4225646
## 5 Rsquared y ~ z + a 0.4121272
## 6 MAE y ~ z + a 25.2873695
## 7 RMSE y ~ a 30.0889145
## 8 Rsquared y ~ a 0.2646590
## 9 MAE y ~ a 25.6069114
# Bar chart of each metric per formula: one facet row per metric, with free
# scales so every metric gets its own axis range.
ggplot(data = compare_var_melt, mapping = aes(x = model, y = value)) +
  geom_col() +
  facet_grid(metric ~ ., scales = "free")
# Compare every combination of predictor set and learning method.
var_formulas <- c("y ~ x + a", "y ~ z + a", "y ~ a")
model_methods <- c("lm", "knn")

# expand.grid() varies its first column fastest, so listing `method` first
# yields the order (formula 1, lm), (formula 1, knn), (formula 2, lm), ...
combos <- expand.grid(
  method = model_methods,
  formula = var_formulas,
  stringsAsFactors = FALSE
)

# Column labels of the form "<formula> <method>", in the same order as the
# rows of `combos`.
names_var <- paste(combos$formula, combos$method)

# For each combination: fit on the training set, predict the test set and
# collect the test-set metrics. The result is a metric-by-combination matrix.
compare_var_mod <- vapply(
  seq_len(nrow(combos)),
  function(k) {
    # `caret::` makes explicit that this is the function, not the `train`
    # data frame passed as `data`.
    fit <- caret::train(
      as.formula(combos$formula[k]),
      data = train,
      method = combos$method[k]
    )
    pred <- predict(fit, newdata = test)
    # postResample() takes the predictions first and the observed values
    # second.
    postResample(pred = pred, obs = test$y)
  },
  # One numeric value per metric; the names become the row names.
  FUN.VALUE = c(RMSE = 0, Rsquared = 0, MAE = 0)
)
colnames(compare_var_mod) <- names_var
compare_var_mod
## y ~ x + a lm y ~ x + a knn y ~ z + a lm y ~ z + a knn y ~ a lm
## RMSE 30.1664330 28.4684608 29.4225646 3.093764e+01 30.088915
## Rsquared 0.1095318 0.2442604 0.4121272 3.053643e-05 0.264659
## MAE 25.6668206 22.5870947 25.2873695 2.694281e+01 25.606911
## y ~ a knn
## RMSE 33.6925664
## Rsquared 0.0941204
## MAE 27.1860601
# Long format: one row per (metric, formula + method) pair, kept as a plain
# data frame.
compare_var_mod_melt <- compare_var_mod %>%
  melt(varnames = c("metric", "model")) %>%
  as.data.frame()
compare_var_mod_melt
## metric model value
## 1 RMSE y ~ x + a lm 3.016643e+01
## 2 Rsquared y ~ x + a lm 1.095318e-01
## 3 MAE y ~ x + a lm 2.566682e+01
## 4 RMSE y ~ x + a knn 2.846846e+01
## 5 Rsquared y ~ x + a knn 2.442604e-01
## 6 MAE y ~ x + a knn 2.258709e+01
## 7 RMSE y ~ z + a lm 2.942256e+01
## 8 Rsquared y ~ z + a lm 4.121272e-01
## 9 MAE y ~ z + a lm 2.528737e+01
## 10 RMSE y ~ z + a knn 3.093764e+01
## 11 Rsquared y ~ z + a knn 3.053643e-05
## 12 MAE y ~ z + a knn 2.694281e+01
## 13 RMSE y ~ a lm 3.008891e+01
## 14 Rsquared y ~ a lm 2.646590e-01
## 15 MAE y ~ a lm 2.560691e+01
## 16 RMSE y ~ a knn 3.369257e+01
## 17 Rsquared y ~ a knn 9.412040e-02
## 18 MAE y ~ a knn 2.718606e+01
# Bar chart of each metric per formula/method combination: one facet row per
# metric, with free scales so every metric gets its own axis range.
ggplot(data = compare_var_mod_melt, mapping = aes(x = model, y = value)) +
  geom_col() +
  facet_grid(metric ~ ., scales = "free")