Setup

Loading libraries and data

library(corrplot)
## Warning: package 'corrplot' was built under R version 3.5.3
## corrplot 0.84 loaded
library(caret)
## Warning: package 'caret' was built under R version 3.5.3
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.5.3
# Folder that contains the project's CSV files; point this to your project
# folder.
data_dir <- "C:/Users/rober/Desktop/x_fold"
# Read epa.csv through an explicit path instead of calling setwd(), so the
# working directory of the R session is left untouched.
epa <- read.csv(file.path(data_dir, "epa.csv"), header = TRUE)

Setting up the environment

# Folder that contains the project's CSV files; point this to your project
# folder.
# NOTE(review): epa.csv is already read earlier in this script, so this
# repeated load looks redundant -- confirm and drop one of the two.
data_dir <- "C:/Users/rober/Desktop/x_fold"
# Read epa.csv through an explicit path instead of calling setwd(), so the
# working directory of the R session is left untouched.
epa <- read.csv(file.path(data_dir, "epa.csv"), header = TRUE)

Data preparation

# Keep only the rows of `epa` that have no missing value in any column.
# Author's note: this could be improved, because 15 rows are dropped here.
epa_complete <- epa[stats::complete.cases(epa), ]

# Build a dummy-variable encoder over every column of the complete rows ...
newDataFrame <- caret::dummyVars(" ~ .", data = epa_complete)

# ... and replace the data with its encoded form, converted back to a
# data frame.
epa_complete <- data.frame(
  stats::predict(newDataFrame, newdata = epa_complete)
)

Analyze data

# Pairwise correlation matrix of all columns in the encoded data set.
corrData <- stats::cor(epa_complete)

# Draw the correlation matrix as a correlogram.
corrplot::corrplot(corrData)

Model generation

Test / train split

# Fix the RNG state so the partition below is reproducible.
set.seed(100)

# Sample 70% of the rows for training, based on the outcome column `Volume`;
# `list = FALSE` returns the row indices as a matrix rather than a list.
trainIndex <- createDataPartition(
  epa_complete$Volume,
  p = 0.70,
  list = FALSE
)

# Selected rows form the training set; all remaining rows form the test set.
training <- epa_complete[trainIndex, ]
testing <- epa_complete[-trainIndex, ]

Random forests test

# Resampling scheme: 10-fold cross-validation. The "repeatedcv" method is
# run with a single repeat, spelled out here instead of left implicit.
fit_control <- trainControl(
  method = "repeatedcv",
  number = 10,
  repeats = 1
)

# Fit a random forest (ranger engine) that models Volume from all other
# columns of the training set, tuned with the resampling scheme above.
rf_fit <- train(
  Volume ~ .,
  data = training,
  method = "ranger",
  trControl = fit_control
)

# Show the resampling results and the selected tuning parameters.
print(rf_fit)
## Random Forest 
## 
## 47 samples
## 28 predictors
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 1 times) 
## Summary of sample sizes: 43, 42, 42, 41, 43, 42, ... 
## Resampling results across tuning parameters:
## 
##   mtry  splitrule   RMSE      Rsquared   MAE     
##    2    variance    957.2260  0.8657384  596.4313
##    2    extratrees  926.3231  0.8693426  580.5057
##   15    variance    788.1602  0.9294255  416.3653
##   15    extratrees  754.4218  0.9324144  401.0532
##   28    variance    738.5891  0.9445415  382.1764
##   28    extratrees  709.3970  0.9522243  360.7017
## 
## Tuning parameter 'min.node.size' was held constant at a value of 5
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were mtry = 28, splitrule =
##  extratrees and min.node.size = 5.

Analyze model

# Plot the fitted caret model object (resampling results of the tuning run).
plot(rf_fit)

# Predict the outcome for the held-out test rows.
# droplevels(testing) should be unnecessary as long as `testing` contains only
# the dummy-encoded columns -- TODO confirm.
rf_predict <- predict(rf_fit, newdata = testing)

# Score the held-out predictions against the observed Volume values
# (RMSE, R-squared, MAE). Without this step the predictions above were
# computed but never evaluated.
postResample(pred = rf_predict, obs = testing$Volume)

# Resampling summary stored on the fit, followed by the per-resample metrics.
rf_fit$results
rf_fit$resample