Setup

loading

library(corrplot)
## Warning: package 'corrplot' was built under R version 3.5.3
## corrplot 0.84 loaded
library(caret)
## Warning: package 'caret' was built under R version 3.5.3
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.5.3
# Working directory: edit the path below so that it names your project
# folder, which is expected to hold the CSV inputs read by this script.
setwd('C:/Users/rober/Desktop/x_fold')
# Read the data set; the first line of the file supplies the column names.
epa <- read.csv(file = "epa.csv", header = TRUE)

setting up the environment

# Load the data only when `epa` is not already in the workspace. The same
# setwd()/read.csv() pair appears earlier in this script, so running it again
# unconditionally would just re-read the same file.
if (!exists("epa", inherits = FALSE)) {
  # point this to your project folder; it should contain the csvs
  setwd('C:/Users/rober/Desktop/x_fold')
  epa <- read.csv("epa.csv", header = TRUE)
}

Data preparation

# Keep only the rows that have a value in every column.
# NOTE: this is a blunt filter -- 15 rows are being removed; imputing the
# missing values instead would be an improvement.
has_all_values <- complete.cases(epa)
epa_complete <- epa[has_all_values, ]

# Build an encoder over every column of the cleaned data, then apply it and
# store the encoded result back as a data frame under the same name.
newDataFrame <- dummyVars(" ~ .", data = epa_complete)
epa_complete <- data.frame(
  predict(newDataFrame, newdata = epa_complete)
)

Analyze data

# Pairwise correlations between all columns of the encoded data,
# drawn as a correlation plot.
corrData <- cor(x = epa_complete)
corrplot(corr = corrData)

Model generation

Train / test split

# Fix the RNG state so the partition below is reproducible.
set.seed(100)

# Row indices for the training set: 70% of the rows, sampled on Volume.
trainIndex <- createDataPartition(
  y = epa_complete[["Volume"]],
  p = 0.70,
  list = FALSE
)

# Rows named by the index form the training set; all others are held out.
training <- epa_complete[trainIndex, ]
testing <- epa_complete[-trainIndex, ]

Random forests test

# Resampling scheme: 10-fold cross-validation, run once
# ("repeatedcv" with a single repeat).
fit_control <- trainControl(
  method = "repeatedcv",
  number = 10,
  repeats = 1
)

# Fit a random forest (ranger engine) for Volume on all other columns,
# tuned with the resampling scheme above.
rf_fit <- train(
  Volume ~ .,
  data = training,
  method = "ranger",
  trControl = fit_control
)

# Show the fit summary.
print(rf_fit)
## Random Forest 
## 
## 47 samples
## 28 predictors
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 1 times) 
## Summary of sample sizes: 43, 42, 42, 41, 43, 42, ... 
## Resampling results across tuning parameters:
## 
##   mtry  splitrule   RMSE      Rsquared   MAE     
##    2    variance    957.2260  0.8657384  596.4313
##    2    extratrees  926.3231  0.8693426  580.5057
##   15    variance    788.1602  0.9294255  416.3653
##   15    extratrees  754.4218  0.9324144  401.0532
##   28    variance    738.5891  0.9445415  382.1764
##   28    extratrees  709.3970  0.9522243  360.7017
## 
## Tuning parameter 'min.node.size' was held constant at a value of 5
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were mtry = 28, splitrule =
##  extratrees and min.node.size = 5.

Analyze model

# Plot the fitted caret model object.
plot(rf_fit)

# Predict Volume for the held-out rows.
#testing <- droplevels(testing)
rf_predict <- predict(rf_fit, newdata = testing)

# Score the predictions against the observed held-out values. Without this
# step the test split is never used and the only performance figures are
# the cross-validation estimates.
postResample(pred = rf_predict, obs = testing$Volume)

# Cross-validation metrics for every tuning-parameter combination tried.
rf_fit$results
##   mtry min.node.size  splitrule     RMSE  Rsquared      MAE   RMSESD RsquaredSD    MAESD
## 1    2             5   variance 957.2260 0.8657384 596.4313 1320.306 0.13273986 698.5362
## 2    2             5 extratrees 926.3231 0.8693426 580.5057 1297.579 0.13247718 648.7463
## 3   15             5   variance 788.1602 0.9294255 416.3653 1192.025 0.08644075 631.7812
## 4   15             5 extratrees 754.4218 0.9324144 401.0532 1214.192 0.07707140 641.7092
## 5   28             5   variance 738.5891 0.9445415 382.1764 1184.014 0.07364042 626.8581
## 6   28             5 extratrees 709.3970 0.9522243 360.7017 1271.699 0.05908924 647.4439
# Per-fold metrics (RMSE, Rsquared, MAE) recorded for the fitted model.
rf_fit[["resample"]]
##         RMSE  Rsquared        MAE    Resample
##     35.39787 0.9983615   25.15736 Fold04.Rep1
##   1475.24489 0.9747660  654.26238 Fold08.Rep1
##    423.74023 0.8725127  284.31813 Fold06.Rep1
##     30.10714 0.9967034   16.90429 Fold09.Rep1
##    739.21444 0.9325035  342.69744 Fold02.Rep1
##     32.41779 0.9967252   17.35787 Fold05.Rep1
##    165.53121 0.9317267   90.02611 Fold03.Rep1
##     57.37209 0.9915745   32.53660 Fold01.Rep1
##     54.72712 0.9947601   37.67751 Fold07.Rep1
##   4080.21690 0.8326098 2106.07893 Fold10.Rep1