library(MASS)
library(tree)
## Warning: package 'tree' was built under R version 4.3.3
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.3.3
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
In the lab, we applied random forests to the Boston data using mtry = 6 and using ntree = 25 and ntree = 500. Create a plot displaying the test error resulting from random forests on this data set for a more comprehensive range of values for mtry and ntree. You can model your plot after Figure 8.10. Describe the results obtained.
set.seed(1)
train <- sample(1:nrow(Boston), nrow(Boston) / 2)
Boston.train <- Boston[train, -14]
Boston.test <- Boston[-train, -14]
Y.train <- Boston[train, 14]
Y.test <- Boston[-train, 14]
rf.boston1 <- randomForest(Boston.train, y = Y.train, xtest = Boston.test, ytest = Y.test, mtry = ncol(Boston) - 1, ntree = 500)
rf.boston2 <- randomForest(Boston.train, y = Y.train, xtest = Boston.test, ytest = Y.test, mtry = (ncol(Boston) - 1) / 2, ntree = 500)
rf.boston3 <- randomForest(Boston.train, y = Y.train, xtest = Boston.test, ytest = Y.test, mtry = sqrt(ncol(Boston) - 1), ntree = 500)
plot(1:500, rf.boston1$test$mse, col = "green", type = "l", xlab = "Number of Trees", ylab = "Test MSE", ylim = c(18, 30))
lines(1:500, rf.boston2$test$mse, col = "red", type = "l")
lines(1:500, rf.boston3$test$mse, col = "blue", type = "l")
legend("topright", c("m = p", "m = p/2", "m = sqrt(p)"), col = c("green", "red", "blue"), cex = 1, lty = 1)
We see that the Test MSE decreases quickly as the number of trees increases and stabilizes after roughly 100 trees, with little movement or improvement beyond that point. Across the three choices of mtry, the lowest Test MSE is obtained with m = sqrt(p).
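As a quick numerical check of the plot, the smallest Test MSE attained by each choice of mtry can be read directly from the test$mse component of the three fits above (a minimal sketch; the labels m_p, m_p2 and m_sqrtp are just illustrative names).
sapply(list(m_p = rf.boston1, m_p2 = rf.boston2, m_sqrtp = rf.boston3),
       function(fit) min(fit$test$mse))  # minimum Test MSE over 1 to 500 trees for each mtry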
In the lab, a classification tree was applied to the Carseats data set after converting Sales into a qualitative response variable. Now we will seek to predict Sales using regression trees and related approaches, treating the response as a quantitative variable.
library(ISLR2)
## Warning: package 'ISLR2' was built under R version 4.3.3
##
## Attaching package: 'ISLR2'
## The following object is masked from 'package:MASS':
##
## Boston
attach(Carseats)
set.seed(1)
subset <- sample(1:nrow(Carseats), nrow(Carseats) / 2)
car.train <- Carseats[subset, ]
car.test <- Carseats[-subset, ]
car.model.train <- tree(Sales ~ ., car.train)
summary(car.model.train)
##
## Regression tree:
## tree(formula = Sales ~ ., data = car.train)
## Variables actually used in tree construction:
## [1] "ShelveLoc" "Price" "Age" "Advertising" "CompPrice"
## [6] "US"
## Number of terminal nodes: 18
## Residual mean deviance: 2.167 = 394.3 / 182
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -3.88200 -0.88200 -0.08712 0.00000 0.89590 4.09900
plot(car.model.train)
text(car.model.train, pretty = 0)
tree.prediction <- predict(car.model.train, newdata = car.test)
tree.mse <- mean((car.test$Sales - tree.prediction)^2)
tree.mse
## [1] 4.922039
We conclude that the Test MSE is about 4.92.
set.seed(1)
cv.car <- cv.tree(car.model.train)
plot(cv.car$size, cv.car$dev, xlab = "Size of Tree", ylab = "Deviance", type = "b")
prune.car <- prune.tree(car.model.train, best = 6)
plot(prune.car)
text(prune.car, pretty = 0)
prune.predict <- predict(prune.car, car.test)
mean((prune.predict - car.test$Sales)^2)
## [1] 5.318073
Guided by the cross-validation plot, we prune the tree to six terminal nodes. The resulting Test MSE is about 5.32, slightly higher than the 4.92 from the unpruned tree, so pruning does not improve predictive accuracy here.
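If we prefer to pick the pruned size programmatically rather than reading it off the plot, a minimal sketch reusing cv.car and car.model.train from above is shown here (best.size and prune.car.cv are new illustrative names).
best.size <- cv.car$size[which.min(cv.car$dev)]               # size with the smallest CV deviance
prune.car.cv <- prune.tree(car.model.train, best = best.size)
mean((predict(prune.car.cv, car.test) - car.test$Sales)^2)    # Test MSE of the CV-selected tree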
bag.car <- randomForest(Sales ~ ., car.train, importance = TRUE, mtry = 10)
importance(bag.car)
## %IncMSE IncNodePurity
## CompPrice 25.6142224 170.664874
## Income 4.5915046 91.047400
## Advertising 12.9597786 98.653166
## Population -0.7823257 57.653647
## Price 55.2302897 510.311641
## ShelveLoc 48.5480002 381.630885
## Age 16.7267043 158.261715
## Education 0.9105144 43.460189
## Urban 0.2260420 9.347446
## US 5.8455044 18.261630
bag.car.predict <- predict(bag.car, car.test)
mean((bag.car.predict - car.test$Sales)^2)
## [1] 2.588149
Using randomForest with mtry = p = 10 (i.e., bagging), the Test MSE is about 2.59, a substantial improvement over the single regression tree. The bagging model identifies Price and ShelveLoc as the two most important variables.
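For a graphical view of the same importance measures, randomForest provides varImpPlot().
varImpPlot(bag.car)  # plots %IncMSE and IncNodePurity for every predictor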
rf.car <- randomForest(Sales ~ ., car.train, importance = TRUE, mtry = sqrt(10))
importance(rf.car)
## %IncMSE IncNodePurity
## CompPrice 12.9540442 157.53376
## Income 2.1683293 129.18612
## Advertising 8.7289900 111.38250
## Population -2.5290493 102.78681
## Price 33.9482500 393.61313
## ShelveLoc 34.1358807 289.28756
## Age 12.0804387 172.03776
## Education 0.2213600 72.02479
## Urban 0.9793293 14.73763
## US 4.1072742 33.91622
rf.car.predict <- predict(rf.car, car.test)
mean((rf.car.predict - car.test$Sales)^2)
## [1] 3.054306
With a random forest using m = sqrt(p), the Test MSE increases to about 3.05, higher than the 2.59 from bagging. The most important variables, Price and ShelveLoc, are the same as in bagging. Random forests often outperform bagging by decorrelating the trees, but that is not the case for this particular split of the data.
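To see how the Test MSE moves between the random-forest and bagging extremes, a small sketch (mtry.vals and test.mse are new illustrative names) fits a forest for every mtry from 1 to p = 10 and plots the resulting test errors.
set.seed(1)
mtry.vals <- 1:10
test.mse <- sapply(mtry.vals, function(m) {
  fit <- randomForest(Sales ~ ., data = car.train, mtry = m)  # mtry = 10 corresponds to bagging
  mean((predict(fit, car.test) - car.test$Sales)^2)
})
plot(mtry.vals, test.mse, type = "b", xlab = "mtry", ylab = "Test MSE")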
library(BART)
## Warning: package 'BART' was built under R version 4.3.3
## Loading required package: nlme
## Loading required package: survival
carseats_data <- Carseats
carseats_data$ShelveLoc <- as.numeric(Carseats$ShelveLoc)
carseats_data$Urban <- ifelse(Carseats$Urban == "Yes", 1, 0)
carseats_data$US <- ifelse(Carseats$US == "Yes", 1, 0)
set.seed(1)
train <- sample(1:nrow(carseats_data), nrow(carseats_data) / 2)
x <- carseats_data[, names(carseats_data) != "Sales"]
y <- carseats_data$Sales
xtrain <- x[train, ]
ytrain <- y[train]
xtest <- x[-train, ]
ytest <- y[-train]
set.seed(1)
bartfit <- gbart(xtrain, ytrain, x.test = xtest)
## *****Calling gbart: type=1
## *****Data:
## data:n,p,np: 200, 10, 200
## y1,yn: 2.781850, 1.091850
## x1,x[n*p]: 107.000000, 1.000000
## xp1,xp[np*p]: 111.000000, 1.000000
## *****Number of Trees: 200
## *****Number of Cut Points: 63 ... 1
## *****burn,nd,thin: 100,1000,1
## *****Prior:beta,alpha,tau,nu,lambda,offset: 2,0.95,0.273474,3,0.692269,7.57815
## *****sigma: 1.885179
## *****w (weights): 1.000000 ... 1.000000
## *****Dirichlet:sparse,theta,omega,a,b,rho,augment: 0,0,1,0.5,1,10,0
## *****printevery: 100
##
## MCMC
## done 0 (out of 1100)
## done 100 (out of 1100)
## done 200 (out of 1100)
## done 300 (out of 1100)
## done 400 (out of 1100)
## done 500 (out of 1100)
## done 600 (out of 1100)
## done 700 (out of 1100)
## done 800 (out of 1100)
## done 900 (out of 1100)
## done 1000 (out of 1100)
## time: 3s
## trcnt,tecnt: 1000,1000
ord <- order(bartfit$varcount.mean, decreasing = TRUE)
bartfit$varcount.mean[ord]
## ShelveLoc Price Education CompPrice US Urban
## 31.159 29.937 24.392 23.995 22.884 22.612
## Population Age Income Advertising
## 22.380 21.764 21.291 20.900
yhat.bart <- bartfit$yhat.test.mean
mean((ytest - yhat.bart)^2)
## [1] 1.465254
BART gives a Test MSE of about 1.47, the lowest of the methods considered, and its variable-inclusion counts again rank ShelveLoc and Price highest, i.e., the same two variables appear most often across the collection of trees.
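For convenience, the Test MSEs computed above can be collected into a single named vector (a small summary sketch reusing the fitted objects; the labels are illustrative).
c(tree    = mean((predict(car.model.train, car.test) - car.test$Sales)^2),
  pruned  = mean((predict(prune.car, car.test) - car.test$Sales)^2),
  bagging = mean((predict(bag.car, car.test) - car.test$Sales)^2),
  rf      = mean((predict(rf.car, car.test) - car.test$Sales)^2),
  bart    = mean((ytest - yhat.bart)^2))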
This question uses the Caravan data set.
set.seed(1)
train <- 1:1000
Caravan$Purchase <- ifelse(Caravan$Purchase == "Yes", 1, 0)
Caravan.train <- Caravan[train, ]
Caravan.test <- Caravan[-train, ]
library(gbm)
## Warning: package 'gbm' was built under R version 4.3.3
## Loaded gbm 2.2.2
## This version of gbm is no longer under development. Consider transitioning to gbm3, https://github.com/gbm-developers/gbm3
set.seed(1)
boost.caravan <- gbm(Purchase ~ ., data = Caravan.train, distribution = "gaussian", n.trees = 1000, shrinkage = 0.01)
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 50: PVRAAUT has no variation.
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 71: AVRAAUT has no variation.
summary(boost.caravan)
## var rel.inf
## PPERSAUT PPERSAUT 13.51824557
## MKOOPKLA MKOOPKLA 10.24062778
## MOPLHOOG MOPLHOOG 7.32689780
## MBERMIDD MBERMIDD 6.35820558
## PBRAND PBRAND 4.98826360
## ABRAND ABRAND 4.54504653
## MGODGE MGODGE 4.26496875
## MINK3045 MINK3045 4.13253907
## PWAPART PWAPART 3.15612877
## MAUT1 MAUT1 2.76929763
## MOSTYPE MOSTYPE 2.56937935
## MAUT2 MAUT2 1.99879666
## MSKA MSKA 1.94618539
## MBERARBG MBERARBG 1.89917331
## PBYSTAND PBYSTAND 1.88591514
## MINKGEM MINKGEM 1.87131472
## MGODOV MGODOV 1.81673309
## MGODPR MGODPR 1.80814745
## MFWEKIND MFWEKIND 1.67884570
## MSKC MSKC 1.65075962
## MBERHOOG MBERHOOG 1.53559951
## MSKB1 MSKB1 1.43339514
## MOPLMIDD MOPLMIDD 1.10617074
## MHHUUR MHHUUR 1.09608784
## MRELGE MRELGE 1.09039794
## MINK7512 MINK7512 1.08772012
## MZFONDS MZFONDS 1.08427551
## MGODRK MGODRK 1.03126657
## MINK4575 MINK4575 1.02492795
## MZPART MZPART 0.98536712
## MRELOV MRELOV 0.80356854
## MFGEKIND MFGEKIND 0.80335689
## MBERARBO MBERARBO 0.60909852
## APERSAUT APERSAUT 0.56707821
## MGEMOMV MGEMOMV 0.55589456
## MOSHOOFD MOSHOOFD 0.55498375
## MAUT0 MAUT0 0.54748481
## PMOTSCO PMOTSCO 0.43362597
## MSKB2 MSKB2 0.43075446
## MSKD MSKD 0.42751490
## MINK123M MINK123M 0.40920707
## MINKM30 MINKM30 0.36996576
## MHKOOP MHKOOP 0.34941518
## MBERBOER MBERBOER 0.28967068
## MFALLEEN MFALLEEN 0.28877552
## MGEMLEEF MGEMLEEF 0.20084195
## MOPLLAAG MOPLLAAG 0.15750616
## MBERZELF MBERZELF 0.11203381
## PLEVEN PLEVEN 0.11030994
## MRELSA MRELSA 0.04500507
## MAANTHUI MAANTHUI 0.03322830
## PWABEDR PWABEDR 0.00000000
## PWALAND PWALAND 0.00000000
## PBESAUT PBESAUT 0.00000000
## PVRAAUT PVRAAUT 0.00000000
## PAANHANG PAANHANG 0.00000000
## PTRACTOR PTRACTOR 0.00000000
## PWERKT PWERKT 0.00000000
## PBROM PBROM 0.00000000
## PPERSONG PPERSONG 0.00000000
## PGEZONG PGEZONG 0.00000000
## PWAOREG PWAOREG 0.00000000
## PZEILPL PZEILPL 0.00000000
## PPLEZIER PPLEZIER 0.00000000
## PFIETS PFIETS 0.00000000
## PINBOED PINBOED 0.00000000
## AWAPART AWAPART 0.00000000
## AWABEDR AWABEDR 0.00000000
## AWALAND AWALAND 0.00000000
## ABESAUT ABESAUT 0.00000000
## AMOTSCO AMOTSCO 0.00000000
## AVRAAUT AVRAAUT 0.00000000
## AAANHANG AAANHANG 0.00000000
## ATRACTOR ATRACTOR 0.00000000
## AWERKT AWERKT 0.00000000
## ABROM ABROM 0.00000000
## ALEVEN ALEVEN 0.00000000
## APERSONG APERSONG 0.00000000
## AGEZONG AGEZONG 0.00000000
## AWAOREG AWAOREG 0.00000000
## AZEILPL AZEILPL 0.00000000
## APLEZIER APLEZIER 0.00000000
## AFIETS AFIETS 0.00000000
## AINBOED AINBOED 0.00000000
## ABYSTAND ABYSTAND 0.00000000
The variables PPERSAUT and MKOOPKLA are the most important, with relative influence of about 13.5% and 10.2% respectively.
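The same ranking can be extracted programmatically, since summary() on a gbm fit returns the relative-influence table as a data frame (plotit = FALSE suppresses the bar chart).
head(summary(boost.caravan, plotit = FALSE), 5)  # top five predictors by relative influence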
probs.test <- predict(boost.caravan, Caravan.test, n.trees = 1000, type = "response")
pred.test <- ifelse(probs.test > 0.2, 1, 0)
table(Caravan.test$Purchase, pred.test)
## pred.test
## 0 1
## 0 4493 40
## 1 278 11
logit.caravan <- glm(Purchase ~ ., data = Caravan.train, family = "binomial")
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
probs.test2 <- predict(logit.caravan, Caravan.test, type = "response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from rank-deficient fit; attr(*, "non-estim") has doubtful cases
pred.test2 <- ifelse(probs.test2 > 0.2, 1, 0)
table(Caravan.test$Purchase, pred.test2)
## pred.test2
## 0 1
## 0 4493 40
## 1 278 11
For the boosted model, the fraction of people predicted to make a purchase who actually do so is 11 / (40 + 11), about 0.216, far better than the roughly 6% of customers in the test set who purchase at all. Applying the same 20% cutoff to the logistic regression probabilities (probs.test2) allows the two methods to be compared on the same footing.
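The fraction quoted above can be computed directly from the boosting confusion matrix (a minimal sketch; conf is an illustrative name).
conf <- table(Caravan.test$Purchase, pred.test)
conf["1", "1"] / sum(conf[, "1"])  # proportion of predicted purchasers who actually purchase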