Ubiqum Code Academy

Creating random data

pacman:: p_load(caret, party, reshape, ggplot2, dplyr)

# Seed the generator before any data is drawn so that the data set, and every
# result derived from it, is the same on each run. The seed value differs from
# the one used for the train/test partition further down so that the two steps
# do not replay the same random stream.
set.seed(123)

n_obs <- 50 # number of observations per variable

# Five independent variables, each holding n_obs random numbers drawn
# uniformly from 0 to 100. `y` is used as the target by the models below.
x <- runif(n_obs, min = 0, max = 100)
z <- runif(n_obs, min = 0, max = 100)
a <- runif(n_obs, min = 0, max = 100)
b <- runif(n_obs, min = 0, max = 100)
y <- runif(n_obs, min = 0, max = 100)

# Build the data frame directly from the vectors (columns x, z, a, b, y)
# instead of going through an intermediate matrix with cbind().
df <- data.frame(x, z, a, b, y)

Split the data into training and test sets

# Fix the seed so the random train/test split is the same on every run.
set.seed(100)

# Row indices for the training set: createDataPartition() selects about 70%
# of the rows (p = 0.7) based on df$y. list = FALSE makes it return the
# indices as a matrix instead of a list, so they can index df directly.
# FALSE is spelled out because the alias `F` can be reassigned.
in_training <- createDataPartition(df$y, p = 0.7, list = FALSE)

# NOTE: `train` is also the name of caret's model-fitting function. Calls such
# as train(...) still reach the function, because R skips non-function objects
# when it looks up a name that is being called.
train <- df[in_training, ]
test <- df[-in_training, ]

Loop 1 - train a model with several methods

# Regression methods to compare; each name is passed to caret's `method`
# argument.
# NOTE: this reuses the name `a`, replacing the random vector created earlier.
# Its values were already copied into `df`, so the data are unaffected.
a <- c("lm", "rf", "knn", "svmLinear", "svmRadial")

# Fit every method on the training set and score it on the held-out test set.
# vapply() returns a matrix with one row per metric and one column per method,
# in the order of `a` and already named after the methods, so nothing has to
# be grown inside a loop, reversed, or renamed afterwards.
compare.model <- vapply(
  a,
  function(method) {
    # Same seed for every fit so the resampling is reproducible and identical
    # across methods.
    set.seed(100)

    # caret:: makes explicit that the function is meant here, while
    # `data = train` refers to the training data frame.
    model <- caret::train(y ~ ., data = train, method = method)

    pred <- predict(model, newdata = test)

    # postResample() takes the predictions first and the observed values
    # second; name the arguments so they cannot be swapped.
    postResample(pred = pred, obs = test$y)
  },
  FUN.VALUE = numeric(3) # RMSE, Rsquared and MAE
)

compare.model
##                   lm          rf         knn   svmLinear   svmRadial
## RMSE     34.07731030 36.55747844 30.86533537 36.84210580 32.27818991
## Rsquared  0.06713161  0.03756654  0.04125198  0.05728815  0.02541492
## MAE      29.93277833 30.00960300 26.24757928 32.63475051 27.23245666

Restructure the table for plotting

# Reshape the metric-by-model matrix into long format: one row per
# (metric, model) pair with the score in `value`, which is the layout the
# plotting code below expects.
compare.model.melt <- compare.model %>%
  melt(varnames = c("metric", "model")) %>%
  as.data.frame()

compare.model.melt
##      metric     model       value
## 1      RMSE        lm 34.07731030
## 2  Rsquared        lm  0.06713161
## 3       MAE        lm 29.93277833
## 4      RMSE        rf 36.55747844
## 5  Rsquared        rf  0.03756654
## 6       MAE        rf 30.00960300
## 7      RMSE       knn 30.86533537
## 8  Rsquared       knn  0.04125198
## 9       MAE       knn 26.24757928
## 10     RMSE svmLinear 36.84210580
## 11 Rsquared svmLinear  0.05728815
## 12      MAE svmLinear 32.63475051
## 13     RMSE svmRadial 32.27818991
## 14 Rsquared svmRadial  0.02541492
## 15      MAE svmRadial 27.23245666
# Draw one bar chart per metric, with one bar for each model.
for (metric_name in c("RMSE", "Rsquared", "MAE")) {
  # Keep only the rows that belong to the current metric.
  metric_rows <- subset(compare.model.melt, metric == metric_name)

  bar_chart <- ggplot(metric_rows, aes(model, value)) +
    geom_bar(stat = "identity") +
    ggtitle(metric_name)

  # Plots are not auto-printed inside a loop, so print explicitly.
  print(bar_chart)
}

Loop 2 - train a model with several combinations of variables

# Model formulas to compare, each using a different set of predictors for y.
# Inside a formula, `a` refers to the column of that name in the data.
a <- c("y ~ x + a", "y ~ z + a", "y ~ a")

# Fit a linear model for every formula on the training set and score it on the
# held-out test set. vapply() returns a matrix with one row per metric and one
# column per formula, in the order of `a` and already named after the
# formulas, so nothing has to be grown inside a loop, reversed, or renamed.
compare.var <- vapply(
  a,
  function(form_text) {
    # Same seed for every fit so the resampling is reproducible and identical
    # across formulas.
    set.seed(100)

    # caret:: makes explicit that the function is meant here, while
    # `data = train` refers to the training data frame.
    model <- caret::train(as.formula(form_text), data = train, method = "lm")

    pred <- predict(model, newdata = test)

    # postResample() takes the predictions first and the observed values
    # second; name the arguments so they cannot be swapped.
    postResample(pred = pred, obs = test$y)
  },
  FUN.VALUE = numeric(3) # RMSE, Rsquared and MAE
)

compare.var
##            y ~ x + a  y ~ z + a      y ~ a
## RMSE     31.41514469 35.6888815 31.0029861
## Rsquared  0.05745741  0.2978386  0.1308906
## MAE      27.21851663 31.6735311 27.1076748

Loop 3 - train a model with several combinations of variables and methods

# Formulas and methods to combine; every formula is fitted with every method.
a <- c("y ~ x + a", "y ~ z + a", "y ~ a")
b <- c("lm", "knn")

# One row per (formula, method) combination. expand.grid() varies its first
# argument fastest, so listing the methods first gives the order
# formula 1 with each method, then formula 2 with each method, and so on.
combos <- expand.grid(method = b, form = a, stringsAsFactors = FALSE)

# Fit each combination on the training set and score it on the held-out test
# set. The result has one row per metric and one column per combination, in
# the row order of `combos`.
compare.var.mod <- vapply(
  seq_len(nrow(combos)),
  function(k) {
    # Same seed for every fit so the resampling is reproducible and identical
    # across combinations.
    set.seed(100)

    # caret:: makes explicit that the function is meant here, while
    # `data = train` refers to the training data frame.
    model <- caret::train(
      as.formula(combos$form[k]),
      data = train,
      method = combos$method[k]
    )

    pred <- predict(model, newdata = test)

    # postResample() takes the predictions first and the observed values
    # second; name the arguments so they cannot be swapped.
    postResample(pred = pred, obs = test$y)
  },
  FUN.VALUE = numeric(3) # RMSE, Rsquared and MAE
)

# Label each column "<formula>.<method>". The labels are built from the same
# table that drove the fits, so they stay correct when `a` or `b` change
# length, and the data vector `x` is no longer overwritten to hold them.
colnames(compare.var.mod) <- paste(combos$form, combos$method, sep = ".")

compare.var.mod
##          y ~ x + a.lm y ~ x + a.knn y ~ z + a.lm y ~ z + a.knn   y ~ a.lm
## RMSE      31.41514469   32.14527450   35.6888815   34.36670813 31.0029861
## Rsquared   0.05745741    0.01075993    0.2978386    0.05701168  0.1308906
## MAE       27.21851663   26.79961609   31.6735311   29.78991248 27.1076748
##            y ~ a.knn
## RMSE     30.90856184
## Rsquared  0.03143209
## MAE      27.29233553