Setup

loading

library(corrplot)

## Warning: package 'corrplot' was built under R version 3.5.3

## corrplot 0.84 loaded

library(caret)

## Warning: package 'caret' was built under R version 3.5.3

## Loading required package: lattice

## Loading required package: ggplot2

## Warning: package 'ggplot2' was built under R version 3.5.3

# Make the project folder the working directory (edit this path for your
# machine). The folder is expected to hold the CSV inputs.
setwd('C:/Users/rober/Desktop/x_fold')

# Load the EPA data set; the first row of the file supplies the column names.
epa <- read.csv(file = "epa.csv", header = TRUE)

setting up the environment

# Project folder holding the CSV inputs -- change this to your own location.
# NOTE(review): this repeats the setwd()/read.csv() calls already made in the
# "loading" chunk; confirm whether both are needed.
project_dir <- 'C:/Users/rober/Desktop/x_fold'
setwd(project_dir)

# Read the EPA data, treating the first line of the file as the header.
epa <- read.csv("epa.csv", header = TRUE)

Data preparation

# Keep only the rows that have a value in every column.
# Known limitation (per the original author's note): this discards 15 rows,
# so a gentler treatment of missing values would retain more data.
has_all_values <- complete.cases(epa)
epa_complete <- epa[has_all_values, ]

# Build a caret dummy-variable encoder over all columns, then apply it so the
# result is an all-numeric data frame (needed by cor() and the model below).
# `newDataFrame` holds the encoder object, not a data frame.
newDataFrame <- dummyVars(" ~ .", data = epa_complete)
epa_complete <- data.frame(
  predict(newDataFrame, newdata = epa_complete)
)

Analyze data

# Pairwise correlations between every column of the encoded data set.
corrData <- cor(x = epa_complete)

# Draw the correlation matrix as a corrplot.
corrplot(corr = corrData)

Model generation

test / train

# Fix the RNG seed so the partition below is reproducible.
set.seed(100)

# Sample about 70% of the rows for training, partitioning on Volume.
# list = FALSE makes createDataPartition() return row indices usable directly
# for subsetting.
trainIndex <- createDataPartition(
  y = epa_complete$Volume,
  p = 0.70,
  list = FALSE
)

# Rows selected above form the training set; all remaining rows are held out.
training <- epa_complete[trainIndex, ]
testing <- epa_complete[-trainIndex, ]

Random forests test

# Resampling scheme used for tuning: 10-fold cross-validation.
# The method is "repeatedcv" but no `repeats` is supplied; the printed summary
# reports "10 fold, repeated 1 times", i.e. a single pass of 10-fold CV.
fit_control <- trainControl(method = "repeatedcv", number = 10)

# Fit a random forest (caret method "ranger") predicting Volume from every
# other column of the training set, tuned with the scheme above.
rf_fit <- train(
  Volume ~ .,
  data = training,
  method = "ranger",
  trControl = fit_control
)

# Print the tuning summary and the selected hyper-parameters.
rf_fit

## Random Forest 
## 
## 47 samples
## 28 predictors
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 1 times) 
## Summary of sample sizes: 43, 42, 42, 41, 43, 42, ... 
## Resampling results across tuning parameters:
## 
##   mtry  splitrule   RMSE      Rsquared   MAE     
##    2    variance    957.2260  0.8657384  596.4313
##    2    extratrees  926.3231  0.8693426  580.5057
##   15    variance    788.1602  0.9294255  416.3653
##   15    extratrees  754.4218  0.9324144  401.0532
##   28    variance    738.5891  0.9445415  382.1764
##   28    extratrees  709.3970  0.9522243  360.7017
## 
## Tuning parameter 'min.node.size' was held constant at a value of 5
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were mtry = 28, splitrule =
##  extratrees and min.node.size = 5.

Analyze model

# Plot the resampled performance profile across the tuning grid.
plot(rf_fit)

# Predict Volume for the held-out rows with the final tuned model.
rf_predict <- predict(rf_fit, newdata = testing)

# Score those predictions against the observed values. Without this step the
# hold-out set is never used and the only error estimate is the
# cross-validated one from training. Returns RMSE, Rsquared and MAE.
postResample(pred = rf_predict, obs = testing$Volume)

# Cross-validated metrics (mean and SD) for every tuning combination tried.
rf_fit$results

ABCDEFGHIJ0123456789

	mtry <dbl>	min.node.size <dbl>	splitrule <fctr>	RMSE <dbl>	Rsquared <dbl>	MAE <dbl>	RMSESD <dbl>	RsquaredSD <dbl>	MAESD <dbl>
1	2	5	variance	957.2260	0.8657384	596.4313	1320.306	0.13273986	698.5362
2	2	5	extratrees	926.3231	0.8693426	580.5057	1297.579	0.13247718	648.7463
3	15	5	variance	788.1602	0.9294255	416.3653	1192.025	0.08644075	631.7812
4	15	5	extratrees	754.4218	0.9324144	401.0532	1214.192	0.07707140	641.7092
5	28	5	variance	738.5891	0.9445415	382.1764	1184.014	0.07364042	626.8581
6	28	5	extratrees	709.3970	0.9522243	360.7017	1271.699	0.05908924	647.4439

# Per-fold metrics for the selected model. Useful for spotting folds whose
# error is far from the rest, which the averaged CV figures hide.
rf_fit[["resample"]]

ABCDEFGHIJ0123456789

RMSE <dbl>	Rsquared <dbl>	MAE <dbl>	Resample <chr>
35.39787	0.9983615	25.15736	Fold04.Rep1
1475.24489	0.9747660	654.26238	Fold08.Rep1
423.74023	0.8725127	284.31813	Fold06.Rep1
30.10714	0.9967034	16.90429	Fold09.Rep1
739.21444	0.9325035	342.69744	Fold02.Rep1
32.41779	0.9967252	17.35787	Fold05.Rep1
165.53121	0.9317267	90.02611	Fold03.Rep1
57.37209	0.9915745	32.53660	Fold01.Rep1
54.72712	0.9947601	37.67751	Fold07.Rep1
4080.21690	0.8326098	2106.07893	Fold10.Rep1

R Notebook

Setup

loading

setting up the environment

Data preparation

Analyze data

Model generation

test / train

Random forests test

Analyze model