library(corrplot)
## Warning: package 'corrplot' was built under R version 3.5.3
## corrplot 0.84 loaded
library(caret)
## Warning: package 'caret' was built under R version 3.5.3
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.5.3
# Load the raw data set.
# Point setwd() at your project folder; it should contain the csvs.
# (The path below is machine-specific -- adjust it before running.)
setwd('C:/Users/rober/Desktop/x_fold')
# Read the file once; the first row of epa.csv holds the column names.
epa <- read.csv("epa.csv", header = TRUE)
# Keep only the rows that have a value in every column.
# NOTE: 15 rows are dropped here; this could be improved (e.g. by handling
# the missing values instead of discarding the rows).
rows_without_na <- complete.cases(epa)
epa_complete <- epa[rows_without_na, ]

# Build a dummy-variable encoder over all columns, then apply it so the
# resulting data frame can be passed to cor().
newDataFrame <- dummyVars(" ~ .", data = epa_complete)
encoded_values <- predict(newDataFrame, newdata = epa_complete)
epa_complete <- data.frame(encoded_values)

# Pairwise correlations between all encoded columns, shown as a corrplot.
corrData <- cor(epa_complete)
corrplot(corrData)
# Fix the RNG seed so the partition below is reproducible.
set.seed(100)

# Sample 70% of the rows (based on Volume) for training; with list = FALSE
# the row indices come back as a matrix rather than a list.
trainIndex <- createDataPartition(
  y = epa_complete[["Volume"]],
  p = 0.70,
  list = FALSE
)

# Rows picked by the partition form the training set, the rest the test set.
training <- epa_complete[trainIndex, ]
testing <- epa_complete[-trainIndex, ]
# Resampling scheme: 10-fold cross-validation.
# method = "repeatedcv" with a single repeat (the value already in effect
# when `repeats` is left out) is plain 10-fold CV.
fit_control <- trainControl(
  method = "repeatedcv",
  number = 10,
  repeats = 1
)

# Fit a random forest (ranger) predicting Volume from every other column,
# tuned with the resampling scheme above.
rf_fit <- train(
  Volume ~ .,
  data = training,
  method = "ranger",
  trControl = fit_control
)

# Print the resampling summary for each tuning-parameter combination.
rf_fit
## Random Forest
##
## 47 samples
## 28 predictors
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 1 times)
## Summary of sample sizes: 43, 42, 42, 41, 43, 42, ...
## Resampling results across tuning parameters:
##
## mtry splitrule RMSE Rsquared MAE
## 2 variance 957.2260 0.8657384 596.4313
## 2 extratrees 926.3231 0.8693426 580.5057
## 15 variance 788.1602 0.9294255 416.3653
## 15 extratrees 754.4218 0.9324144 401.0532
## 28 variance 738.5891 0.9445415 382.1764
## 28 extratrees 709.3970 0.9522243 360.7017
##
## Tuning parameter 'min.node.size' was held constant at a value of 5
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were mtry = 28, splitrule =
## extratrees and min.node.size = 5.
# Plot the resampled performance across the tuning grid.
plot(rf_fit)

# Predict Volume for the held-out test rows.
# (Kept for reference, disabled in the original script:)
#testing <- droplevels(testing)
rf_predict <- predict(rf_fit, newdata = testing)

# Per-tuning-combination resampling metrics, including their standard
# deviations.
rf_fit[["results"]]
| | mtry <dbl> | min.node.size <dbl> | splitrule <fctr> | RMSE <dbl> | Rsquared <dbl> | MAE <dbl> | RMSESD <dbl> | RsquaredSD <dbl> | MAESD <dbl> |
---|---|---|---|---|---|---|---|---|---|
1 | 2 | 5 | variance | 957.2260 | 0.8657384 | 596.4313 | 1320.306 | 0.13273986 | 698.5362 |
2 | 2 | 5 | extratrees | 926.3231 | 0.8693426 | 580.5057 | 1297.579 | 0.13247718 | 648.7463 |
3 | 15 | 5 | variance | 788.1602 | 0.9294255 | 416.3653 | 1192.025 | 0.08644075 | 631.7812 |
4 | 15 | 5 | extratrees | 754.4218 | 0.9324144 | 401.0532 | 1214.192 | 0.07707140 | 641.7092 |
5 | 28 | 5 | variance | 738.5891 | 0.9445415 | 382.1764 | 1184.014 | 0.07364042 | 626.8581 |
6 | 28 | 5 | extratrees | 709.3970 | 0.9522243 | 360.7017 | 1271.699 | 0.05908924 | 647.4439 |
# Per-fold metrics (RMSE, Rsquared, MAE) for the selected final model.
rf_fit[["resample"]]
RMSE <dbl> | Rsquared <dbl> | MAE <dbl> | Resample <chr> | |
---|---|---|---|---|
35.39787 | 0.9983615 | 25.15736 | Fold04.Rep1 | |
1475.24489 | 0.9747660 | 654.26238 | Fold08.Rep1 | |
423.74023 | 0.8725127 | 284.31813 | Fold06.Rep1 | |
30.10714 | 0.9967034 | 16.90429 | Fold09.Rep1 | |
739.21444 | 0.9325035 | 342.69744 | Fold02.Rep1 | |
32.41779 | 0.9967252 | 17.35787 | Fold05.Rep1 | |
165.53121 | 0.9317267 | 90.02611 | Fold03.Rep1 | |
57.37209 | 0.9915745 | 32.53660 | Fold01.Rep1 | |
54.72712 | 0.9947601 | 37.67751 | Fold07.Rep1 | |
4080.21690 | 0.8326098 | 2106.07893 | Fold10.Rep1 |