# Load the semicolon-delimited data set and preview its first six rows.
data02 <- read.csv(file = "DATA.02.csv", sep = ";")
head(data02, n = 6L)
## Y X1 X2 X3 X4 X5 X6
## 1 18.7 0.06151 0.515 5.97 4.8122 9.29 58.5
## 2 23.8 2.30040 0.605 6.32 2.1000 11.10 96.1
## 3 20.1 0.10612 0.428 6.10 6.3361 12.40 65.1
## 4 22.7 5.20177 0.770 6.13 2.7227 11.48 83.4
## 5 22.0 0.01096 0.389 6.45 7.3073 8.23 31.9
## 6 21.7 0.08199 0.437 6.01 5.5027 10.40 42.3
# Report the size of the data set as (rows, columns).
c(nrow(data02), ncol(data02))
## [1] 324 7
# Random train/test split: every row is independently flagged TRUE with
# probability 0.8 (training) or FALSE with probability 0.2 (testing).
# The seed is fixed so the same rows are selected on every run.
set.seed(1582)
sample <- sample(
  c(TRUE, FALSE),
  size = nrow(data02),
  replace = TRUE,
  prob = c(0.8, 0.2)
)
# Rows flagged TRUE form the training set; the remaining rows form the test set.
train <- data02[sample, , drop = FALSE]
test <- data02[!sample, , drop = FALSE]
library(MASS)
library(rpart)
# Fit a regression tree (method = "anova") of Y on all remaining columns of
# the training set, then print the full node-by-node summary.
# The seed is reset right before fitting so the run is repeatable.
set.seed(1582)
tree1 <- rpart(Y ~ ., data = train, method = "anova")
summary(tree1)
## Call:
## rpart(formula = Y ~ ., data = train, method = "anova")
## n= 256
##
## CP nsplit rel error xerror xstd
## 1 0.47380692 0 1.0000000 1.0097923 0.12237829
## 2 0.13609725 1 0.5261931 0.6214831 0.07558561
## 3 0.10397504 2 0.3900958 0.5806328 0.07300818
## 4 0.03456334 3 0.2861208 0.4496793 0.06673566
## 5 0.02982516 4 0.2515575 0.3949889 0.06194858
## 6 0.01365219 5 0.2217323 0.3495452 0.05952178
## 7 0.01000000 6 0.2080801 0.3210478 0.05710304
##
## Variable importance
## X5 X3 X6 X2 X4 X1
## 33 23 14 11 9 9
##
## Node number 1: 256 observations, complexity param=0.4738069
## mean=21.9, MSE=81.87883
## left son=2 (180 obs) right son=3 (76 obs)
## Primary splits:
## X5 < 7.865 to the right, improve=0.4738069, (0 missing)
## X3 < 6.79 to the left, improve=0.4561387, (0 missing)
## X2 < 0.6575 to the right, improve=0.2816845, (0 missing)
## X1 < 6.68632 to the right, improve=0.2687285, (0 missing)
## X4 < 2.5977 to the left, improve=0.1832771, (0 missing)
## Surrogate splits:
## X3 < 6.48 to the left, agree=0.836, adj=0.447, (0 split)
## X6 < 41.3 to the right, agree=0.820, adj=0.395, (0 split)
## X2 < 0.4475 to the right, agree=0.789, adj=0.289, (0 split)
## X4 < 5.10855 to the left, agree=0.758, adj=0.184, (0 split)
## X1 < 0.04158 to the right, agree=0.750, adj=0.158, (0 split)
##
## Node number 2: 180 observations, complexity param=0.103975
## mean=17.85278, MSE=28.49316
## left son=4 (82 obs) right son=5 (98 obs)
## Primary splits:
## X5 < 15.1 to the right, improve=0.4249400, (0 missing)
## X1 < 6.91188 to the right, improve=0.4216407, (0 missing)
## X4 < 2.0643 to the left, improve=0.4153047, (0 missing)
## X2 < 0.6575 to the right, improve=0.3909876, (0 missing)
## X6 < 87.65 to the right, improve=0.3325035, (0 missing)
## Surrogate splits:
## X6 < 87.65 to the right, agree=0.828, adj=0.622, (0 split)
## X4 < 2.37495 to the left, agree=0.800, adj=0.561, (0 split)
## X2 < 0.607 to the right, agree=0.794, adj=0.549, (0 split)
## X1 < 4.341795 to the right, agree=0.761, adj=0.476, (0 split)
## X3 < 5.705 to the left, agree=0.728, adj=0.402, (0 split)
##
## Node number 3: 76 observations, complexity param=0.1360973
## mean=31.48553, MSE=77.6415
## left son=6 (62 obs) right son=7 (14 obs)
## Primary splits:
## X3 < 7.445 to the left, improve=0.4834521, (0 missing)
## X5 < 3.99 to the right, improve=0.4776189, (0 missing)
## X6 < 82.95 to the left, improve=0.2830986, (0 missing)
## X4 < 2.90485 to the right, improve=0.2531994, (0 missing)
## X1 < 0.51938 to the left, improve=0.2064658, (0 missing)
## Surrogate splits:
## X5 < 3.21 to the right, agree=0.855, adj=0.214, (0 split)
## X4 < 2.90485 to the right, agree=0.842, adj=0.143, (0 split)
## X1 < 0.51938 to the left, agree=0.829, adj=0.071, (0 split)
## X2 < 0.5595 to the left, agree=0.829, adj=0.071, (0 split)
## X6 < 82.7 to the left, agree=0.829, adj=0.071, (0 split)
##
## Node number 4: 82 observations, complexity param=0.02982516
## mean=14.04878, MSE=19.14957
## left son=8 (28 obs) right son=9 (54 obs)
## Primary splits:
## X1 < 9.08499 to the right, improve=0.3981269, (0 missing)
## X2 < 0.603 to the right, improve=0.3277976, (0 missing)
## X4 < 2.0037 to the left, improve=0.3063949, (0 missing)
## X5 < 19.645 to the right, improve=0.2516812, (0 missing)
## X6 < 75.25 to the right, improve=0.1660532, (0 missing)
## Surrogate splits:
## X5 < 22.74 to the right, agree=0.744, adj=0.250, (0 split)
## X6 < 99.65 to the right, agree=0.720, adj=0.179, (0 split)
## X2 < 0.646 to the right, agree=0.707, adj=0.143, (0 split)
## X3 < 4.89 to the left, agree=0.707, adj=0.143, (0 split)
## X4 < 1.60635 to the left, agree=0.695, adj=0.107, (0 split)
##
## Node number 5: 98 observations, complexity param=0.01365219
## mean=21.03571, MSE=14.0723
## left son=10 (84 obs) right son=11 (14 obs)
## Primary splits:
## X3 < 6.52 to the left, improve=0.20750230, (0 missing)
## X5 < 11.69 to the right, improve=0.12523840, (0 missing)
## X6 < 69.3 to the right, improve=0.10426680, (0 missing)
## X4 < 2.015 to the left, improve=0.09741158, (0 missing)
## X1 < 6.13403 to the right, improve=0.09052701, (0 missing)
## Surrogate splits:
## X2 < 0.407 to the right, agree=0.867, adj=0.071, (0 split)
##
## Node number 6: 62 observations, complexity param=0.03456334
## mean=28.57419, MSE=35.90837
## left son=12 (33 obs) right son=13 (29 obs)
## Primary splits:
## X3 < 6.735 to the left, improve=0.3254168, (0 missing)
## X5 < 4.24 to the right, improve=0.2858515, (0 missing)
## X4 < 3.2666 to the right, improve=0.2348542, (0 missing)
## X6 < 78.45 to the left, improve=0.1799732, (0 missing)
## X1 < 0.38176 to the left, improve=0.1369479, (0 missing)
## Surrogate splits:
## X5 < 5.055 to the right, agree=0.694, adj=0.345, (0 split)
## X1 < 0.08598 to the right, agree=0.661, adj=0.276, (0 split)
## X4 < 4.2885 to the right, agree=0.645, adj=0.241, (0 split)
## X2 < 0.4045 to the right, agree=0.629, adj=0.207, (0 split)
## X6 < 38.2 to the left, agree=0.629, adj=0.207, (0 split)
##
## Node number 7: 14 observations
## mean=44.37857, MSE=58.69311
##
## Node number 8: 28 observations
## mean=10.21429, MSE=11.74765
##
## Node number 9: 54 observations
## mean=16.03704, MSE=11.41048
##
## Node number 10: 84 observations
## mean=20.3381, MSE=9.644501
##
## Node number 11: 14 observations
## mean=25.22143, MSE=20.19883
##
## Node number 12: 33 observations
## mean=25.3697, MSE=27.11545
##
## Node number 13: 29 observations
## mean=32.22069, MSE=20.93199
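Diperoleh summary pohon sebelum pruning. Seluruh peubah memiliki nilai variable importance (termasuk perannya sebagai surrogate split), sedangkan splitting utama pada pohon hanya menggunakan X5, X3, dan X1.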
library(rpart.plot)
# Draw the unpruned tree and the complexity-parameter plot.
prp(tree1)
plotcp(tree1)
# Take the CP value from the cptable row with the smallest xerror.
min_cp <- with(as.data.frame(tree1$cptable), CP[which.min(xerror)])
print(min_cp)
## [1] 0.01
Diperoleh nilai CP untuk pruning sebesar 0.01
# Prune the full tree at the selected complexity parameter and plot the result.
tree_prune <- prune(tree1, cp = min_cp)
rpart.plot(tree_prune)
Diperoleh bahwa pohon dibentuk berdasarkan 3 prediktor yaitu X1, X3, dan X5 dengan prediktor terpenting merupakan prediktor dengan posisi paling atas yaitu X5.
# Predict Y for the held-out test rows with the pruned tree.
pred1 <- predict(tree_prune, newdata = test)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
# Test-set RMSE of the pruned tree (predictions vs. observed Y).
rmse1 <- RMSE(pred = pred1, obs = test$Y)
print(rmse1)
## [1] 4.211731
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
# Fit a random forest regression of Y on all predictors in the training set,
# trying 3 candidate predictors per split and storing importance measures.
# The seed is fixed so the forest can be reproduced.
set.seed(1582)
# NOTE: the argument is `ntree`. The previous spelling `ntrees` matched no
# formal argument and was silently absorbed by `...`, so the forest was grown
# with the default 500 trees instead of the intended 400. Any output recorded
# from the old call must be regenerated.
tree2 <- randomForest(
  Y ~ .,
  data = train,
  mtry = 3,
  importance = TRUE,
  ntree = 400
)
summary(tree2)
## Length Class Mode
## call 6 -none- call
## type 1 -none- character
## predicted 256 -none- numeric
## mse 500 -none- numeric
## rsq 500 -none- numeric
## oob.times 256 -none- numeric
## importance 12 -none- numeric
## importanceSD 6 -none- numeric
## localImportance 0 -none- NULL
## proximity 0 -none- NULL
## ntree 1 -none- numeric
## mtry 1 -none- numeric
## forest 11 -none- list
## coefs 0 -none- NULL
## y 256 -none- numeric
## test 0 -none- NULL
## inbag 0 -none- NULL
## terms 3 terms call
# Plot variable importance for the forest; type = 1 selects the
# permutation-based measure (%IncMSE for regression).
varImpPlot(tree2, type = 1)
Diperoleh bahwa peubah yang paling berperan dalam membentuk pohon adalah X3, X5, dan X4.
# Plot the fitted forest object, then score it on the held-out test rows.
plot(tree2)
pred2 <- predict(tree2, newdata = test)
# Test-set RMSE of the random forest (predictions vs. observed Y).
rmse2 <- RMSE(pred = pred2, obs = test$Y)
print(rmse2)
## [1] 2.895557
library(gbm)
## Loaded gbm 2.1.9
## This version of gbm is no longer under development. Consider transitioning to gbm3, https://github.com/gbm-developers/gbm3
# Fit a gradient boosted regression (gaussian loss) of Y on all predictors
# in the training set using 400 boosting iterations.
# gbm subsamples the training rows at every iteration (bag.fraction defaults
# to 0.5), so the seed is fixed to make the fit and the numbers derived from
# it reproducible. Any output recorded from an unseeded run must be
# regenerated.
set.seed(1582)
tree3 <- gbm(
  Y ~ .,
  data = train,
  distribution = "gaussian",
  n.trees = 400
)
# Print the short model description.
tree3
## gbm(formula = Y ~ ., distribution = "gaussian", data = train,
## n.trees = 400)
## A gradient boosted model with gaussian loss function.
## 400 iterations were performed.
## There were 6 predictors of which 6 had non-zero influence.
# Relative influence of each predictor, computed over all fitted iterations.
summary(tree3, n.trees = tree3$n.trees)
## var rel.inf
## X5 X5 49.623591
## X3 X3 27.318760
## X2 X2 8.049867
## X1 X1 7.453728
## X4 X4 5.025713
## X6 X6 2.528341
Diperoleh bahwa relative importance untuk pohon regresi dengan Gradient Boosting dari yang tertinggi adalah X5, X3, lalu X2 (disusul X1, X4, dan X6). Dua peubah terpenting, yaitu X5 dan X3, sama seperti pada regression tree dengan pruning.
# Plot the boosted model, then score it on the held-out test rows using
# all 400 iterations.
plot(tree3)
pred3 <- predict(tree3, newdata = test, n.trees = 400)
# Test-set RMSE of gradient boosting (predictions vs. observed Y).
rmse3 <- RMSE(pred = pred3, obs = test$Y)
print(rmse3)
## [1] 3.273002
# Collect the test-set RMSE of the three models into one comparison table.
res <- data.frame(
  Method = c("Prune", "Random Forest", "Gradient Boosting"),
  RMSE = c(rmse1, rmse2, rmse3)
)
print(res)
## Method RMSE
## 1 Prune 4.211731
## 2 Random Forest 2.895557
## 3 Gradient Boosting 3.273002
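Diperoleh bahwa hasil terbaik merupakan hasil pohon dengan Random Forest, yaitu dengan RMSE terkecil (2.90). Hal ini dapat terjadi karena pohon-pohon pada RF dibangun secara independen dari sampel bootstrap dengan subset prediktor acak pada setiap split, sehingga korelasi antarpohon berkurang dan ragam prediksi rata-ratanya mengecil, selain karena performanya yang dikenal baik dibandingkan model lain.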