In the lab, we applied random forests to the Boston data using mtry = 6 and using ntree = 25 and ntree = 500. Create a plot displaying the test error resulting from random forests on this data set for a more comprehensive range of values for mtry and ntree. You can model your plot after Figure 8.10. Describe the results obtained.
library(randomForest)
library(MASS)  # assuming the 14-column MASS version of Boston (medv is column 14)
set.seed(123)
train_index <- sample(1:nrow(Boston), nrow(Boston) / 2)
train <- Boston[train_index, ]
test <- Boston[-train_index, ]
# Build random forests as in Figure 8.10: m = p, m = p/2, and m = sqrt(p)
rf1 <- randomForest(x = train[,-14], y = train$medv, xtest = test[,-14], ytest = test$medv, mtry = ncol(Boston) - 1, ntree = 500)
rf2 <- randomForest(x = train[,-14], y = train$medv, xtest = test[,-14], ytest = test$medv, mtry = (ncol(Boston) - 1) / 2, ntree = 500)
rf3 <- randomForest(x = train[,-14], y = train$medv, xtest = test[,-14], ytest = test$medv, mtry = sqrt(ncol(Boston) - 1), ntree = 500)
plot(1:500, rf1$test$mse, col = "orange", type = "l", xlab = "Number of Trees", ylab = "Test MSE", ylim = c(0, 10))
lines(1:500, rf2$test$mse, col = "green", type = "l")
lines(1:500, rf3$test$mse, col = "maroon", type = "l")
legend("topright", c("m = p", "m = p/2", "m = sqrt(p)"), col = c("orange", "green", "maroon"), cex = 1, lty = 1)
library(tree)
library(ISLR2)  # Carseats data (also available in the ISLR package)
set.seed(123)
train <- sample(1:nrow(Carseats), nrow(Carseats) / 2)
carseats.train <- Carseats[train, ]
y.train <- Carseats[train, "Sales"]
carseats.test <- Carseats[-train, ]
y.test <- Carseats[-train, "Sales"]
tree.car <- tree(Sales ~ ., data = carseats.train)
summary(tree.car)
##
## Regression tree:
## tree(formula = Sales ~ ., data = carseats.train)
## Variables actually used in tree construction:
## [1] "ShelveLoc" "Price" "Income" "Age" "Population"
## [6] "Education" "CompPrice" "Advertising"
## Number of terminal nodes: 18
## Residual mean deviance: 2.132 = 388.1 / 182
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -4.08000 -0.92870 0.06244 0.00000 0.87020 3.71700
plot(tree.car)
text(tree.car, pretty = 0, cex = 0.5)
yhat = predict(tree.car, newdata = carseats.test)
mean((yhat - y.test)^2)
## [1] 4.395357
set.seed(1)
cv.car = cv.tree(tree.car)
plot(cv.car$size , cv.car$dev, type = 'b')
prune.car<-prune.tree(tree.car, best = 12)
plot(prune.car)
text(prune.car, pretty = 0, cex = 0.5)
yhat<- predict(prune.car, carseats.test)
mean((yhat - y.test)^2)
## [1] 4.662975
It looks like pruning to 12 terminal nodes slightly increased the test MSE (4.66 versus 4.40 for the unpruned tree).
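Rather than reading the best size off the plot, it can also be selected programmatically; a sketch using the objects above:
# Sketch: pick the subtree size with the lowest cross-validated deviance.
best.size <- cv.car$size[which.min(cv.car$dev)]
best.size
prune.best <- prune.tree(tree.car, best = best.size)
mean((predict(prune.best, carseats.test) - y.test)^2)  # test MSE at that size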
set.seed(1)
# mtry = 10 uses all 10 predictors at each split, i.e. bagging
bag.car <- randomForest(Sales ~ ., data = Carseats, subset = train,
                        mtry = 10, importance = TRUE)
yhat.bag <- predict(bag.car, newdata = carseats.test)
mean((yhat.bag - y.test)^2)
## [1] 2.719623
importance(bag.car)
## %IncMSE IncNodePurity
## CompPrice 19.45546048 157.493242
## Income 6.54808190 88.084674
## Advertising 9.35661044 76.584090
## Population -0.03332117 52.232916
## Price 46.38347866 383.642775
## ShelveLoc 50.78194122 388.510326
## Age 19.66414864 181.155199
## Education 2.95721774 54.469563
## Urban -0.22123191 8.959947
## US 1.51594678 6.052909
varImpPlot(bag.car)
It looks like ShelveLoc and Price are the two most important variables; bagging lowers the test MSE to about 2.72.
set.seed(123)
rf.car <- randomForest(Sales ~ ., data = Carseats, subset = train,
                       mtry = 5, importance = TRUE)
yhat.rf <- predict(rf.car, newdata = carseats.test)
mean((yhat.rf - y.test)^2)
## [1] 3.01707
importance(rf.car)
## %IncMSE IncNodePurity
## CompPrice 15.6874802 152.97700
## Income 6.1782093 104.83134
## Advertising 7.4822778 89.84070
## Population 0.1519680 74.70762
## Price 35.6880806 345.59391
## ShelveLoc 42.1336259 333.55620
## Age 18.3059283 204.29930
## Education 1.8670124 63.31604
## Urban -0.1467837 11.01253
## US 0.3849865 11.47038
varImpPlot(rf.car)
The random forest with m = 5 gives a test MSE of about 3.02, lower than the single regression tree but slightly higher than bagging (2.72).
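To describe the effect of m more fully, the test MSE can be traced over the whole range m = 1, ..., 10; a sketch:
# Sketch: random forest test MSE for each value of m (variables tried at each split).
m_values <- 1:10
m_mse <- sapply(m_values, function(m) {
  set.seed(123)
  rf <- randomForest(Sales ~ ., data = Carseats, subset = train, mtry = m)
  mean((predict(rf, newdata = carseats.test) - y.test)^2)
})
plot(m_values, m_mse, type = "b", xlab = "m (variables tried at each split)",
     ylab = "Test MSE")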
library(BART)
## Warning: package 'BART' was built under R version 4.3.2
## Loading required package: nlme
## Loading required package: nnet
## Loading required package: survival
x <- Carseats[, 2:11]
y <- Carseats[, "Sales"]
xtrain <- x[train, ]
ytrain <- y[train]
xtest <- x[-train, ]
ytest <- y[-train]
set.seed(1)
bartfit <- gbart(xtrain, ytrain, x.test = xtest)
## *****Calling gbart: type=1
## *****Data:
## data:n,p,np: 200, 14, 200
## y1,yn: 3.230000, 3.070000
## x1,x[n*p]: 104.000000, 1.000000
## xp1,xp[np*p]: 138.000000, 1.000000
## *****Number of Trees: 200
## *****Number of Cut Points: 63 ... 1
## *****burn,nd,thin: 100,1000,1
## *****Prior:beta,alpha,tau,nu,lambda,offset: 2,0.95,0.260569,3,0.191523,7.43
## *****sigma: 0.991574
## *****w (weights): 1.000000 ... 1.000000
## *****Dirichlet:sparse,theta,omega,a,b,rho,augment: 0,0,1,0.5,1,14,0
## *****printevery: 100
##
## MCMC
## done 0 (out of 1100)
## done 100 (out of 1100)
## done 200 (out of 1100)
## done 300 (out of 1100)
## done 400 (out of 1100)
## done 500 (out of 1100)
## done 600 (out of 1100)
## done 700 (out of 1100)
## done 800 (out of 1100)
## done 900 (out of 1100)
## done 1000 (out of 1100)
## time: 6s
## trcnt,tecnt: 1000,1000
yhat.bart <- bartfit$yhat.test.mean
mean((ytest - yhat.bart)^2)
## [1] 1.621989
BART yields the lowest test MSE of the methods tried so far, about 1.62.
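As in the lab, the gbart fit also records how many times each variable appears across the BART trees, which gives a rough measure of importance; a sketch using the fitted object above:
# Sketch: average number of appearances of each variable per posterior draw.
ord <- order(bartfit$varcount.mean, decreasing = TRUE)
bartfit$varcount.mean[ord]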
set.seed(123)
train_indices<- sample(1:nrow(OJ), 800)
train_oj <- OJ[train_indices, ]
test_oj<-OJ[-train_indices, ]
oj_tree<-tree(Purchase ~ ., data = train_oj)
summary(oj_tree)
##
## Classification tree:
## tree(formula = Purchase ~ ., data = train_oj)
## Variables actually used in tree construction:
## [1] "LoyalCH" "PriceDiff"
## Number of terminal nodes: 8
## Residual mean deviance: 0.7625 = 603.9 / 792
## Misclassification error rate: 0.165 = 132 / 800
The output indicates a training misclassification error rate of 0.165 and 8 terminal nodes; only LoyalCH and PriceDiff are used in constructing the tree.
oj_tree
## node), split, n, deviance, yval, (yprob)
## * denotes terminal node
##
## 1) root 800 1071.00 CH ( 0.60875 0.39125 )
## 2) LoyalCH < 0.5036 350 415.10 MM ( 0.28000 0.72000 )
## 4) LoyalCH < 0.276142 170 131.00 MM ( 0.12941 0.87059 )
## 8) LoyalCH < 0.0356415 56 10.03 MM ( 0.01786 0.98214 ) *
## 9) LoyalCH > 0.0356415 114 108.90 MM ( 0.18421 0.81579 ) *
## 5) LoyalCH > 0.276142 180 245.20 MM ( 0.42222 0.57778 )
## 10) PriceDiff < 0.05 74 74.61 MM ( 0.20270 0.79730 ) *
## 11) PriceDiff > 0.05 106 144.50 CH ( 0.57547 0.42453 ) *
## 3) LoyalCH > 0.5036 450 357.10 CH ( 0.86444 0.13556 )
## 6) PriceDiff < -0.39 27 32.82 MM ( 0.29630 0.70370 ) *
## 7) PriceDiff > -0.39 423 273.70 CH ( 0.90071 0.09929 )
## 14) LoyalCH < 0.705326 130 135.50 CH ( 0.78462 0.21538 )
## 28) PriceDiff < 0.145 43 58.47 CH ( 0.58140 0.41860 ) *
## 29) PriceDiff > 0.145 87 62.07 CH ( 0.88506 0.11494 ) *
## 15) LoyalCH > 0.705326 293 112.50 CH ( 0.95222 0.04778 ) *
Node 4 corresponds to the split LoyalCH < 0.276142 and contains 170 observations, with a deviance of 131. Its predicted class is MM: roughly 13% of the observations in this node are actually CH (and would therefore be misclassified), while the remaining 87% are MM.
plot(oj_tree)
text(oj_tree, pretty=0)
The splits indicate that LoyalCH and PriceDiff are the two most important variables in this model.
oj_pred <- predict(oj_tree, test_oj, type = "class")
table(oj_pred, test_oj$Purchase)
##
## oj_pred CH MM
## CH 150 34
## MM 16 70
(150+70)/270
## [1] 0.8148148
The test accuracy is about 81.5%, so the test error rate is roughly 18.5%.
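Equivalently, the test error rate can be computed directly from the predictions; a sketch:
# Sketch: test error rate for the unpruned tree.
mean(oj_pred != test_oj$Purchase)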
set.seed(7)
cv.oj <- cv.tree(oj_tree, FUN = prune.misclass)
cv.oj
## $size
## [1] 8 5 3 2 1
##
## $dev
## [1] 140 140 160 166 313
##
## $k
## [1] -Inf 0 8 11 154
##
## $method
## [1] "misclass"
##
## attr(,"class")
## [1] "prune" "tree.sequence"
Tree sizes 8 and 5 are tied for the lowest cross-validated deviance (140).
plot(cv.oj$size, cv.oj$dev, type = 'b')
(h) Which tree size corresponds to the lowest cross-validated
classification error rate?
Both sizes 5 and 8, which are tied.
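Because FUN = prune.misclass was used, cv.oj$dev reports the number of cross-validation misclassifications; dividing by the number of training observations converts these to error rates. A sketch:
# Sketch: cross-validated classification error rate for each tree size.
cv.oj$dev / nrow(train_oj)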
prune.oj<- prune.misclass(oj_tree, best = 5)
summary(prune.oj)
##
## Classification tree:
## snip.tree(tree = oj_tree, nodes = c(4L, 7L))
## Variables actually used in tree construction:
## [1] "LoyalCH" "PriceDiff"
## Number of terminal nodes: 5
## Residual mean deviance: 0.826 = 656.6 / 795
## Misclassification error rate: 0.165 = 132 / 800
plot(prune.oj)
text(prune.oj, pretty = 0)
(j) Compare the training error rates between the pruned and unpruned
trees. Which is higher?
The training error rates are essentially identical: both the pruned and unpruned trees have a misclassification error rate of 0.165.
prune_pred <- predict(prune.oj, test_oj, type = "class")
table(prune_pred, test_oj$Purchase)
##
## prune_pred CH MM
## CH 150 34
## MM 16 70
(150+70)/270
## [1] 0.8148148
The pruned tree produces the same confusion matrix as the unpruned tree, so the test error rates are identical (about 18.5%).
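A direct side-by-side check of the two test error rates; a sketch:
# Sketch: test error rates for the unpruned and pruned trees.
mean(oj_pred != test_oj$Purchase)     # unpruned
mean(prune_pred != test_oj$Purchase)  # pruned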
Hitters = na.omit(Hitters)
Hitters$Salary = log(Hitters$Salary)
set.seed(123)
train_indices<- sample(1:nrow(Hitters), 200)
train_hit<- Hitters[train_indices, ]
test_hit<-Hitters[-train_indices, ]
library(gbm)
set.seed(1)
shrinkage_values <- seq(0.01, 0.2, by = 0.01)
mse_values <- numeric(length(shrinkage_values))
for (i in seq_along(shrinkage_values)) {
  hit_boost <- gbm(formula = Salary ~ ., distribution = "gaussian", data = train_hit,
                   n.trees = 1000, shrinkage = shrinkage_values[i])
  predictions_train <- predict(hit_boost, newdata = train_hit, n.trees = 1000)
  mse_values[i] <- mean((predictions_train - train_hit$Salary)^2)
}
plot(shrinkage_values, mse_values, type = "b", xlab = "Shrinkage Parameter (λ)", ylab = "Training Set MSE", main = "Boosting: Shrinkage vs. Training Set MSE")
(d) Produce a plot with different shrinkage values on the x-axis and the
corresponding test set MSE on the y-axis.
set.seed(1)
shrinkage_values <- seq(0.01, 0.2, by = 0.01)
mse_test <- numeric(length(shrinkage_values))
for (i in seq_along(shrinkage_values)) {
  hit_boost <- gbm(formula = Salary ~ ., distribution = "gaussian", data = train_hit,
                   n.trees = 1000, shrinkage = shrinkage_values[i])
  predictions_test <- predict(hit_boost, newdata = test_hit, n.trees = 1000)
  mse_test[i] <- mean((predictions_test - test_hit$Salary)^2)
}
plot(shrinkage_values, mse_test, type = "b", xlab = "Shrinkage Parameter (λ)",
     ylab = "Test Set MSE", main = "Boosting: Shrinkage vs. Test Set MSE")
Let's find the shrinkage value that minimizes the test MSE:
which.min(mse_test)
## [1] 5
mse_test[5]
## [1] 0.182951
shrinkage_values[5]
## [1] 0.05
A shrinkage value of λ = 0.05 produces the lowest test MSE, approximately 0.18.
Let's first compare with a linear regression model:
lm_fit <- lm(Salary ~ ., data = train_hit)
pred <- predict(lm_fit, test_hit)
mean((pred - test_hit$Salary)^2)
## [1] 0.4636651
x <- model.matrix(Salary ~ ., Hitters)[, -1]
Let's try ridge regression:
set.seed(123)
y<-Hitters$Salary
x1_train<-x[1:200,]
x1_test<-x[201:263,]
y1_train<-y[1:200]
y1_test<-y[201:263]
library(glmnet)
ridge.mod <- glmnet(x1_train, y1_train, alpha = 0)
ridge.pred <- predict(ridge.mod, newx = x1_test)  # no s given: predictions for every lambda on the path
mean((ridge.pred - y1_test)^2)
## [1] 0.5145349
Boosting (test MSE ≈ 0.18) gives a substantially lower test error than both the linear model (0.46) and ridge regression (0.51).
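Because no single lambda was singled out above (the predictions were averaged over the whole lambda path), a cleaner comparison would choose lambda by cross-validation; a sketch:
# Sketch: ridge regression with lambda chosen by 10-fold cross-validation.
set.seed(123)
cv.ridge <- cv.glmnet(x1_train, y1_train, alpha = 0)
ridge.cv.pred <- predict(cv.ridge, newx = x1_test, s = "lambda.min")
mean((ridge.cv.pred - y1_test)^2)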
summary(hit_boost)  # hit_boost is the fit from the last loop iteration (shrinkage = 0.2)
## var rel.inf
## CAtBat CAtBat 18.7349427
## CRBI CRBI 9.7748552
## Assists Assists 7.5894815
## Walks Walks 6.7376674
## PutOuts PutOuts 6.6569733
## CHmRun CHmRun 6.4233218
## CRuns CRuns 5.6908153
## AtBat AtBat 5.5493044
## RBI RBI 5.3605685
## Runs Runs 5.3319071
## HmRun HmRun 4.3953566
## Errors Errors 3.9842625
## Years Years 3.7912156
## Hits Hits 2.9571114
## CHits CHits 2.7213600
## CWalks CWalks 2.6797075
## League League 0.7050832
## Division Division 0.5579991
## NewLeague NewLeague 0.3580668
CAtBat, followed by CRBI, are the two most important variables.
set.seed(1)
# mtry = 19 uses all 19 predictors at each split, i.e. bagging
bag.hit <- randomForest(Salary ~ ., data = train_hit, mtry = 19, importance = TRUE)
yhat.bag <- predict(bag.hit, newdata = test_hit)
mean((yhat.bag - test_hit$Salary)^2)
## [1] 0.1674674
importance(bag.hit)
## %IncMSE IncNodePurity
## AtBat 9.0942582 7.3808624
## Hits 4.5646002 6.1348510
## HmRun 0.8793614 1.4728781
## Runs 6.3357764 2.9861875
## RBI 8.5466292 6.3313359
## Walks 10.8810629 7.9538743
## Years 6.2213577 1.5016902
## CAtBat 20.1976310 49.1244534
## CHits 9.9028060 15.3977528
## CHmRun 6.7311147 4.7713401
## CRuns 18.6762434 42.7473717
## CRBI 9.7795240 7.2005027
## CWalks 5.4507949 5.4216838
## League -1.7684417 0.1456641
## Division -0.9509188 0.1200144
## PutOuts 0.5207492 2.2833580
## Assists 1.9050328 1.3871213
## Errors 2.1942694 1.4765472
## NewLeague -0.6469658 0.2690485
varImpPlot(bag.hit)
Not bad at all! This model's test MSE of about 0.167 is lower than that of boosting (0.18). As with boosting, CAtBat stands out as the most important variable.
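For completeness (the exercise stops at bagging), a random forest with the default m ≈ p/3 can be compared on the same split; a sketch:
# Sketch: random forest with the default mtry (about p/3 for regression).
set.seed(1)
rf.hit <- randomForest(Salary ~ ., data = train_hit, importance = TRUE)
mean((predict(rf.hit, newdata = test_hit) - test_hit$Salary)^2)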
Caravan$Purchase<-ifelse(Caravan$Purchase == "Yes", 1, 0)
train_indices<-1:1000
train_car<- Caravan[train_indices, ]
test_car<-Caravan[-train_indices, ]
set.seed(123)
# Gaussian loss on the 0/1 response; a bernoulli fit is sketched at the end of this section.
boost.car <- gbm(Purchase ~ ., data = train_car, distribution = "gaussian",
                 n.trees = 1000, shrinkage = 0.1)
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 50: PVRAAUT has no variation.
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 71: AVRAAUT has no variation.
summary(boost.car)
## var rel.inf
## MOPLHOOG MOPLHOOG 5.84547588
## MKOOPKLA MKOOPKLA 5.13986245
## MOSTYPE MOSTYPE 5.03245809
## MBERHOOG MBERHOOG 4.90296559
## MSKA MSKA 4.38878354
## MGODGE MGODGE 4.18422333
## MAUT1 MAUT1 4.09888626
## MSKC MSKC 4.09832492
## MINK3045 MINK3045 3.98730910
## MBERMIDD MBERMIDD 3.48940487
## ABRAND ABRAND 3.36540347
## PPERSAUT PPERSAUT 3.19411639
## MFWEKIND MFWEKIND 2.76553266
## PBRAND PBRAND 2.74024602
## MGODPR MGODPR 2.62913019
## MAUT2 MAUT2 2.49000540
## MRELGE MRELGE 2.24093534
## MBERARBG MBERARBG 1.96804295
## APERSAUT APERSAUT 1.91085248
## MINKM30 MINKM30 1.88666994
## MHHUUR MHHUUR 1.83311135
## MSKB1 MSKB1 1.66511872
## MINK4575 MINK4575 1.66403284
## MFGEKIND MFGEKIND 1.52363199
## MZFONDS MZFONDS 1.51940089
## MAUT0 MAUT0 1.48393368
## MFALLEEN MFALLEEN 1.42959139
## MOPLMIDD MOPLMIDD 1.35712582
## MRELOV MRELOV 1.28841041
## MZPART MZPART 1.25929968
## MINKGEM MINKGEM 1.22869319
## PWAPART PWAPART 1.12908530
## MSKB2 MSKB2 1.10051328
## MINK7512 MINK7512 1.08680660
## MGODRK MGODRK 1.06714585
## MBERARBO MBERARBO 1.04217903
## MOPLLAAG MOPLLAAG 0.98962937
## MSKD MSKD 0.82949400
## MGODOV MGODOV 0.73230725
## PBYSTAND PBYSTAND 0.70677359
## MGEMOMV MGEMOMV 0.64226934
## MBERBOER MBERBOER 0.53154042
## MOSHOOFD MOSHOOFD 0.49436570
## MHKOOP MHKOOP 0.47951406
## PLEVEN PLEVEN 0.46311789
## MAANTHUI MAANTHUI 0.41468831
## MRELSA MRELSA 0.37218441
## MINK123M MINK123M 0.36565223
## PMOTSCO PMOTSCO 0.33226742
## MGEMLEEF MGEMLEEF 0.30293182
## MBERZELF MBERZELF 0.23642419
## ALEVEN ALEVEN 0.07013109
## PWABEDR PWABEDR 0.00000000
## PWALAND PWALAND 0.00000000
## PBESAUT PBESAUT 0.00000000
## PVRAAUT PVRAAUT 0.00000000
## PAANHANG PAANHANG 0.00000000
## PTRACTOR PTRACTOR 0.00000000
## PWERKT PWERKT 0.00000000
## PBROM PBROM 0.00000000
## PPERSONG PPERSONG 0.00000000
## PGEZONG PGEZONG 0.00000000
## PWAOREG PWAOREG 0.00000000
## PZEILPL PZEILPL 0.00000000
## PPLEZIER PPLEZIER 0.00000000
## PFIETS PFIETS 0.00000000
## PINBOED PINBOED 0.00000000
## AWAPART AWAPART 0.00000000
## AWABEDR AWABEDR 0.00000000
## AWALAND AWALAND 0.00000000
## ABESAUT ABESAUT 0.00000000
## AMOTSCO AMOTSCO 0.00000000
## AVRAAUT AVRAAUT 0.00000000
## AAANHANG AAANHANG 0.00000000
## ATRACTOR ATRACTOR 0.00000000
## AWERKT AWERKT 0.00000000
## ABROM ABROM 0.00000000
## APERSONG APERSONG 0.00000000
## AGEZONG AGEZONG 0.00000000
## AWAOREG AWAOREG 0.00000000
## AZEILPL AZEILPL 0.00000000
## APLEZIER APLEZIER 0.00000000
## AFIETS AFIETS 0.00000000
## AINBOED AINBOED 0.00000000
## ABYSTAND ABYSTAND 0.00000000
Judging from the relative influence values above, MOPLHOOG and MKOOPKLA appear to be the two most important variables in this model.
probs_test<-predict(boost.car, test_car, n.trees = 1000, type = "response")
pred_test<-ifelse(probs_test > 0.2, 1, 0)
table(test_car$Purchase, pred_test)
## pred_test
## 0 1
## 0 4206 327
## 1 241 48
Of the people predicted to make a purchase, 48 / (327 + 48) ≈ 13% in fact do.
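Since Purchase is a 0/1 response, distribution = "bernoulli" is the more conventional choice for gbm (and the exercise text suggests a shrinkage of 0.01); a sketch, whose importance ranking and confusion matrix may differ from the gaussian fit above:
# Sketch: boosting with a bernoulli loss and shrinkage = 0.01 on the same split.
set.seed(123)
boost.car2 <- gbm(Purchase ~ ., data = train_car, distribution = "bernoulli",
                  n.trees = 1000, shrinkage = 0.01)
probs2 <- predict(boost.car2, test_car, n.trees = 1000, type = "response")
pred2 <- ifelse(probs2 > 0.2, 1, 0)
table(test_car$Purchase, pred2)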