Test / train split
library(caret)

set.seed(100)
# 70/30 split, stratified on the outcome Volume
trainIndex <- createDataPartition(epa_complete$Volume, p = .70, list = FALSE)
training <- epa_complete[trainIndex, ]
testing  <- epa_complete[-trainIndex, ]
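As a quick sanity check (a sketch, not part of the original output), the two partitions created above can be compared against the requested 70/30 split:

nrow(training)                        # rows used to fit the model
nrow(testing)                         # rows held out for evaluation
nrow(training) / nrow(epa_complete)   # should be close to the requested 0.70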
Random forest model
# specify the resampling method: 10-fold cross-validation
# (with method = "repeatedcv", repeats defaults to 1)
fit_control <- trainControl(method = "repeatedcv", number = 10)

# fit the random forest model via the ranger package
rf_fit <- train(Volume ~ ., data = training, method = "ranger", trControl = fit_control)
rf_fit
## Random Forest
##
## 47 samples
## 28 predictors
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 1 times)
## Summary of sample sizes: 43, 42, 42, 41, 43, 42, ...
## Resampling results across tuning parameters:
##
## mtry splitrule RMSE Rsquared MAE
## 2 variance 957.2260 0.8657384 596.4313
## 2 extratrees 926.3231 0.8693426 580.5057
## 15 variance 788.1602 0.9294255 416.3653
## 15 extratrees 754.4218 0.9324144 401.0532
## 28 variance 738.5891 0.9445415 382.1764
## 28 extratrees 709.3970 0.9522243 360.7017
##
## Tuning parameter 'min.node.size' was held constant at a value of 5
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were mtry = 28, splitrule =
## extratrees and min.node.size = 5.
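With the tuned model in hand, a natural next step (sketched here, not shown in the original output) is to score the held-out test set. caret's predict() applies the final model to new data, and postResample() reports test-set RMSE, R-squared, and MAE; rf_fit and testing come from the chunks above.

# illustrative sketch: evaluate the fitted model on the hold-out set
rf_pred <- predict(rf_fit, newdata = testing)
postResample(pred = rf_pred, obs = testing$Volume)

Comparing these hold-out metrics to the cross-validated values above is a simple way to check for overfitting.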