Data

data02 <- read.csv("DATA.02.csv", sep = ";")
head(data02)
##      Y      X1    X2   X3     X4    X5   X6
## 1 18.7 0.06151 0.515 5.97 4.8122  9.29 58.5
## 2 23.8 2.30040 0.605 6.32 2.1000 11.10 96.1
## 3 20.1 0.10612 0.428 6.10 6.3361 12.40 65.1
## 4 22.7 5.20177 0.770 6.13 2.7227 11.48 83.4
## 5 22.0 0.01096 0.389 6.45 7.3073  8.23 31.9
## 6 21.7 0.08199 0.437 6.01 5.5027 10.40 42.3
dim(data02)
## [1] 324   7
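
The rpart summary below reports "(0 missing)" at every split, which presupposes complete cases; a quick check that the data indeed has no missing values:

colSums(is.na(data02))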

Split Data

set.seed(1582)
sample <- sample(c(TRUE, FALSE), nrow(data02), replace = TRUE, prob = c(0.8, 0.2))
train  <- data02[sample, ]
test   <- data02[!sample, ]
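
Because each row is assigned to train or test independently, the realized split only approximates the 80/20 target; the actual sizes can be checked directly:

c(train = nrow(train), test = nrow(test))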

Regression Tree + Pruning

library(MASS)
library(rpart)
set.seed(1582)
tree1 <- rpart(Y ~ ., data = train, method = "anova")
summary(tree1)
## Call:
## rpart(formula = Y ~ ., data = train, method = "anova")
##   n= 256 
## 
##           CP nsplit rel error    xerror       xstd
## 1 0.47380692      0 1.0000000 1.0097923 0.12237829
## 2 0.13609725      1 0.5261931 0.6214831 0.07558561
## 3 0.10397504      2 0.3900958 0.5806328 0.07300818
## 4 0.03456334      3 0.2861208 0.4496793 0.06673566
## 5 0.02982516      4 0.2515575 0.3949889 0.06194858
## 6 0.01365219      5 0.2217323 0.3495452 0.05952178
## 7 0.01000000      6 0.2080801 0.3210478 0.05710304
## 
## Variable importance
## X5 X3 X6 X2 X4 X1 
## 33 23 14 11  9  9 
## 
## Node number 1: 256 observations,    complexity param=0.4738069
##   mean=21.9, MSE=81.87883 
##   left son=2 (180 obs) right son=3 (76 obs)
##   Primary splits:
##       X5 < 7.865    to the right, improve=0.4738069, (0 missing)
##       X3 < 6.79     to the left,  improve=0.4561387, (0 missing)
##       X2 < 0.6575   to the right, improve=0.2816845, (0 missing)
##       X1 < 6.68632  to the right, improve=0.2687285, (0 missing)
##       X4 < 2.5977   to the left,  improve=0.1832771, (0 missing)
##   Surrogate splits:
##       X3 < 6.48     to the left,  agree=0.836, adj=0.447, (0 split)
##       X6 < 41.3     to the right, agree=0.820, adj=0.395, (0 split)
##       X2 < 0.4475   to the right, agree=0.789, adj=0.289, (0 split)
##       X4 < 5.10855  to the left,  agree=0.758, adj=0.184, (0 split)
##       X1 < 0.04158  to the right, agree=0.750, adj=0.158, (0 split)
## 
## Node number 2: 180 observations,    complexity param=0.103975
##   mean=17.85278, MSE=28.49316 
##   left son=4 (82 obs) right son=5 (98 obs)
##   Primary splits:
##       X5 < 15.1     to the right, improve=0.4249400, (0 missing)
##       X1 < 6.91188  to the right, improve=0.4216407, (0 missing)
##       X4 < 2.0643   to the left,  improve=0.4153047, (0 missing)
##       X2 < 0.6575   to the right, improve=0.3909876, (0 missing)
##       X6 < 87.65    to the right, improve=0.3325035, (0 missing)
##   Surrogate splits:
##       X6 < 87.65    to the right, agree=0.828, adj=0.622, (0 split)
##       X4 < 2.37495  to the left,  agree=0.800, adj=0.561, (0 split)
##       X2 < 0.607    to the right, agree=0.794, adj=0.549, (0 split)
##       X1 < 4.341795 to the right, agree=0.761, adj=0.476, (0 split)
##       X3 < 5.705    to the left,  agree=0.728, adj=0.402, (0 split)
## 
## Node number 3: 76 observations,    complexity param=0.1360973
##   mean=31.48553, MSE=77.6415 
##   left son=6 (62 obs) right son=7 (14 obs)
##   Primary splits:
##       X3 < 7.445    to the left,  improve=0.4834521, (0 missing)
##       X5 < 3.99     to the right, improve=0.4776189, (0 missing)
##       X6 < 82.95    to the left,  improve=0.2830986, (0 missing)
##       X4 < 2.90485  to the right, improve=0.2531994, (0 missing)
##       X1 < 0.51938  to the left,  improve=0.2064658, (0 missing)
##   Surrogate splits:
##       X5 < 3.21     to the right, agree=0.855, adj=0.214, (0 split)
##       X4 < 2.90485  to the right, agree=0.842, adj=0.143, (0 split)
##       X1 < 0.51938  to the left,  agree=0.829, adj=0.071, (0 split)
##       X2 < 0.5595   to the left,  agree=0.829, adj=0.071, (0 split)
##       X6 < 82.7     to the left,  agree=0.829, adj=0.071, (0 split)
## 
## Node number 4: 82 observations,    complexity param=0.02982516
##   mean=14.04878, MSE=19.14957 
##   left son=8 (28 obs) right son=9 (54 obs)
##   Primary splits:
##       X1 < 9.08499  to the right, improve=0.3981269, (0 missing)
##       X2 < 0.603    to the right, improve=0.3277976, (0 missing)
##       X4 < 2.0037   to the left,  improve=0.3063949, (0 missing)
##       X5 < 19.645   to the right, improve=0.2516812, (0 missing)
##       X6 < 75.25    to the right, improve=0.1660532, (0 missing)
##   Surrogate splits:
##       X5 < 22.74    to the right, agree=0.744, adj=0.250, (0 split)
##       X6 < 99.65    to the right, agree=0.720, adj=0.179, (0 split)
##       X2 < 0.646    to the right, agree=0.707, adj=0.143, (0 split)
##       X3 < 4.89     to the left,  agree=0.707, adj=0.143, (0 split)
##       X4 < 1.60635  to the left,  agree=0.695, adj=0.107, (0 split)
## 
## Node number 5: 98 observations,    complexity param=0.01365219
##   mean=21.03571, MSE=14.0723 
##   left son=10 (84 obs) right son=11 (14 obs)
##   Primary splits:
##       X3 < 6.52     to the left,  improve=0.20750230, (0 missing)
##       X5 < 11.69    to the right, improve=0.12523840, (0 missing)
##       X6 < 69.3     to the right, improve=0.10426680, (0 missing)
##       X4 < 2.015    to the left,  improve=0.09741158, (0 missing)
##       X1 < 6.13403  to the right, improve=0.09052701, (0 missing)
##   Surrogate splits:
##       X2 < 0.407    to the right, agree=0.867, adj=0.071, (0 split)
## 
## Node number 6: 62 observations,    complexity param=0.03456334
##   mean=28.57419, MSE=35.90837 
##   left son=12 (33 obs) right son=13 (29 obs)
##   Primary splits:
##       X3 < 6.735    to the left,  improve=0.3254168, (0 missing)
##       X5 < 4.24     to the right, improve=0.2858515, (0 missing)
##       X4 < 3.2666   to the right, improve=0.2348542, (0 missing)
##       X6 < 78.45    to the left,  improve=0.1799732, (0 missing)
##       X1 < 0.38176  to the left,  improve=0.1369479, (0 missing)
##   Surrogate splits:
##       X5 < 5.055    to the right, agree=0.694, adj=0.345, (0 split)
##       X1 < 0.08598  to the right, agree=0.661, adj=0.276, (0 split)
##       X4 < 4.2885   to the right, agree=0.645, adj=0.241, (0 split)
##       X2 < 0.4045   to the right, agree=0.629, adj=0.207, (0 split)
##       X6 < 38.2     to the left,  agree=0.629, adj=0.207, (0 split)
## 
## Node number 7: 14 observations
##   mean=44.37857, MSE=58.69311 
## 
## Node number 8: 28 observations
##   mean=10.21429, MSE=11.74765 
## 
## Node number 9: 54 observations
##   mean=16.03704, MSE=11.41048 
## 
## Node number 10: 84 observations
##   mean=20.3381, MSE=9.644501 
## 
## Node number 11: 14 observations
##   mean=25.22143, MSE=20.19883 
## 
## Node number 12: 33 observations
##   mean=25.3697, MSE=27.11545 
## 
## Node number 13: 29 observations
##   mean=32.22069, MSE=20.93199

The summary above describes the tree before pruning; all six predictors take part in the splitting.

library(rpart.plot)
prp(tree1)

plotcp(tree1)

min_cp <- tree1$cptable[which.min(tree1$cptable[, "xerror"]), "CP"]
min_cp
## [1] 0.01

The CP value obtained for pruning is 0.01.
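
An alternative to taking the CP with the minimum cross-validated error is the 1-SE rule that plotcp visualizes: choose the simplest subtree whose xerror lies within one standard error of the minimum. A minimal sketch using the cptable above:

cptab  <- tree1$cptable
best   <- which.min(cptab[, "xerror"])
thresh <- cptab[best, "xerror"] + cptab[best, "xstd"]
# first (smallest) subtree whose xerror is below the threshold;
# with the cptable above this selects CP = 0.0137 (5 splits)
cp_1se <- cptab[which(cptab[, "xerror"] <= thresh)[1], "CP"]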

tree_prune <- prune(tree1, cp = min_cp)
rpart.plot(tree_prune)

The pruned tree is built from three predictors, X1, X3, and X5; the most important predictor is the one at the top of the tree, X5.
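
The ranking read off the plot can be cross-checked against the importance scores stored in the rpart object:

tree_prune$variable.importance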

pred1 <- predict(tree_prune, test)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
rmse1 <- RMSE(pred1, test$Y)
rmse1
## [1] 4.211731
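
caret's RMSE() is the usual root mean squared error, so the same number can be reproduced by hand:

sqrt(mean((pred1 - test$Y)^2))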

Regression Tree + Random Forest

library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
# mtry = 3 predictors tried at each split; ntree = 500 trees (the package default)
tree2 <- randomForest(Y ~ ., data = train, mtry = 3, importance = TRUE,
                      ntree = 500)
summary(tree2)
##                 Length Class  Mode     
## call              6    -none- call     
## type              1    -none- character
## predicted       256    -none- numeric  
## mse             500    -none- numeric  
## rsq             500    -none- numeric  
## oob.times       256    -none- numeric  
## importance       12    -none- numeric  
## importanceSD      6    -none- numeric  
## localImportance   0    -none- NULL     
## proximity         0    -none- NULL     
## ntree             1    -none- numeric  
## mtry              1    -none- numeric  
## forest           11    -none- list     
## coefs             0    -none- NULL     
## y               256    -none- numeric  
## test              0    -none- NULL     
## inbag             0    -none- NULL     
## terms             3    terms  call
varImpPlot(tree2, type=1)

The permutation importance plot shows that the predictors contributing most to the forest are X3, X5, and X4.
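
The numbers behind the plot are available via importance(); type = 1 gives the permutation importance (%IncMSE) that varImpPlot displayed:

importance(tree2, type = 1)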

plot(tree2)

pred2 <- predict(tree2, test)
rmse2 <- RMSE(pred2, test$Y)
rmse2
## [1] 2.895557
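
Here mtry = 3 was fixed in advance. As a sketch, tuneRF can instead search for mtry using the OOB error (ntreeTry and improve below are arbitrary illustrative choices, not values from this analysis):

set.seed(1582)
tuneRF(train[, -1], train$Y,   # Y is the first column of the data
       ntreeTry = 200, stepFactor = 2, improve = 0.01)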

Regression Tree + Gradient Boosting

library(gbm)
## Loaded gbm 2.1.9
## This version of gbm is no longer under development. Consider transitioning to gbm3, https://github.com/gbm-developers/gbm3
tree3 <- gbm(Y ~ ., data = train, distribution = "gaussian",
             n.trees = 400)
tree3
## gbm(formula = Y ~ ., distribution = "gaussian", data = train, 
##     n.trees = 400)
## A gradient boosted model with gaussian loss function.
## 400 iterations were performed.
## There were 6 predictors of which 6 had non-zero influence.
summary(tree3)

##    var   rel.inf
## X5  X5 49.623591
## X3  X3 27.318760
## X2  X2  8.049867
## X1  X1  7.453728
## X4  X4  5.025713
## X6  X6  2.528341

The relative influence in the gradient-boosted model is highest for X5, followed by X3 and then X2; the two dominant predictors, X5 and X3, are also the top splitters of the pruned regression tree.

plot(tree3)

pred3 <- predict(tree3, test, n.trees = 400)
rmse3 <- RMSE(pred3, test$Y)
rmse3
## [1] 3.273002
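
The 400 boosting iterations were fixed by hand; gbm can instead choose the iteration count by cross-validation. A minimal sketch (the 5 folds are an arbitrary choice):

set.seed(1582)
tree3_cv  <- gbm(Y ~ ., data = train, distribution = "gaussian",
                 n.trees = 400, cv.folds = 5)
best_iter <- gbm.perf(tree3_cv, method = "cv")  # CV-optimal number of trees
pred3_cv  <- predict(tree3_cv, test, n.trees = best_iter)
RMSE(pred3_cv, test$Y)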

Results

res <- data.frame(Method = c("Prune", "Random Forest", "Gradient Boosting"),
                  RMSE = c(rmse1, rmse2, rmse3))
res
##              Method     RMSE
## 1             Prune 4.211731
## 2     Random Forest 2.895557
## 3 Gradient Boosting 3.273002

The best result, i.e. the lowest test RMSE, is achieved by the Random Forest. Because each tree is grown on its own bootstrap sample and considers only a random subset of predictors at each split, the trees are only weakly correlated with one another, so averaging them reduces variance far more than pruning a single tree can; this is why Random Forest is often among the strongest tree-based baselines.
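
This variance-reduction effect can be illustrated directly: predict.all = TRUE returns each tree's prediction alongside the ensemble average, so the test RMSE of a single constituent tree can be compared with that of the forest (a sketch, not part of the analysis above):

all_pred <- predict(tree2, test, predict.all = TRUE)
RMSE(all_pred$individual[, 1], test$Y)  # one fully grown tree
RMSE(all_pred$aggregate, test$Y)        # the averaged forest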