Applied Exercises

Exercise in section 8.4

In the lab, we applied random forests to the Boston data using mtry = 6 and using ntree = 25 and ntree = 500. Create a plot displaying the test error resulting from random forests on this data set for a more comprehensive range of values for mtry and ntree. You can model your plot after Figure 8.10. Describe the results obtained.

library(MASS)
attach(Boston)
# install.packages("randomForest")
library(randomForest)
set.seed(1)
train<-sample(1:nrow(Boston),nrow(Boston)/2)
x.train <- Boston[train,-grep("medv",colnames(Boston))]
y.train <- Boston[train,grep("medv",colnames(Boston))]
x.test <- Boston[-train,-grep("medv",colnames(Boston))]
y.test <- Boston[-train,grep("medv",colnames(Boston))]

#p/2
rf_boston1<-randomForest(x=x.train, y=y.train, xtest=x.test, ytest=y.test,mtry =6, ntree = 500)
#p <bagging>
rf_boston2<-randomForest(x=x.train, y=y.train, xtest=x.test, ytest=y.test, mtry = ncol(Boston)-1,ntree = 500)
# sqrt(p)
rf_boston3<-randomForest(x=x.train, y=y.train, xtest=x.test, ytest=y.test, mtry = sqrt(ncol(Boston)-1),ntree = 500)

plot(1:500, rf_boston1$test$mse, type='l', col='red', xlab="Number of trees", ylab = "Test MSE", ylim=c(5, 50))
lines(1:500, rf_boston2$test$mse, col = "darkgreen", type = "l")
lines(1:500, rf_boston3$test$mse, col = "blue", type = "l")
legend("topright", c("m = p", "m = p/2", "m = sqrt(p)"), col = c("darkgreen", "red", "blue"), cex = 1, lty = 1)

Test MSE is very high for a single tree, but it decreases as the number of trees increases. Besides, Test MSE for all predictors is higher than for half the predictors or the square root of the number of predictors.

In the lab, a classification tree was applied to the Carseats data set after converting Sales into a qualitative response variable. Now we will seek to predict Sales using regression trees and related approaches, treating the response as a quantitative variable.

(a). Split the data set into a training set and a test set.

library(ISLR2)
attach(Carseats)
# str(Carseats)
set.seed(1)
train<-sample(1:nrow(Carseats),nrow(Carseats)/2)
data_train<-Carseats[train,]
data_test<-Carseats[-train,]
y_test<-Carseats[-train,"Sales"]

(b). Fit a regression tree to the training set. Plot the tree, and interpret the results. What test MSE do you obtain?

#install.packages("tree")
library(tree)
set.seed(1)
rt<-tree(Sales~., data_train)
plot(rt)
text(rt,pretty=0)

summary(rt)

## 
## Regression tree:
## tree(formula = Sales ~ ., data = data_train)
## Variables actually used in tree construction:
## [1] "ShelveLoc"   "Price"       "Age"         "Advertising" "CompPrice"  
## [6] "US"         
## Number of terminal nodes:  18 
## Residual mean deviance:  2.167 = 394.3 / 182 
## Distribution of residuals:
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -3.88200 -0.88200 -0.08712  0.00000  0.89590  4.09900

car.pred <- predict(rt, newdata = data_test)
mean((car.pred - data_test$Sales)^2)

## [1] 4.922039

ShelveLoc seems to be the most important factor for Sales, and the price is the second variables we need to consider. The test MSE is 4.92

(c). Use cross-validation in order to determine the optimal level of tree complexity. Does pruning the tree improve the test MSE?

set.seed(1)
cv.car <- cv.tree (rt)
plot(cv.car$size , cv.car$dev, type = "b")

prune.car <- prune.tree(rt, best = 7)
plot(prune.car)
text(prune.car, pretty = 0)

car.pred_p <- predict(prune.car, newdata = data_test)
mean((car.pred_p - data_test$Sales)^2)

## [1] 4.861001

Pruning the tree improve the test MSE slightly in our case.

(d). Use the bagging approach in order to analyze this data. What test MSE do you obtain? Use the importance() function to determine which variables are most important.

set.seed(1)
b.car<-randomForest(Sales~.,data=Carseats, subset = train, mtry = col(Carseats)-1, importance = TRUE )
b.car

## 
## Call:
##  randomForest(formula = Sales ~ ., data = Carseats, mtry = col(Carseats) -      1, importance = TRUE, subset = train) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 1
## 
##           Mean of squared residuals: 4.963872
##                     % Var explained: 36.87

y_hat<-predict(b.car, newdata = data_test)
mean((y_hat-y_test)^2)

## [1] 4.747522

importance(b.car)

##                 %IncMSE IncNodePurity
## CompPrice    6.18470194     113.03142
## Income       0.19683846     105.61571
## Advertising  7.74042647      79.82425
## Population  -5.14440528      97.28034
## Price       21.89468946     203.65198
## ShelveLoc   21.54002668     177.28749
## Age          9.42187647     133.20928
## Education   -0.07673498      78.31426
## Urban        2.21886591      18.13972
## US           7.18678895      41.29833

(e). Use random forests to analyze this data. What test MSE do you obtain? Use the importance() function to determine which variables are most important. Describe the effect of m, the number of variables considered at each split, on the error rate obtained.

rf_car1<-randomForest(Sales~.,data_train,mtry =3, ntree = 500)
importance(rf_car1)

##             IncNodePurity
## CompPrice       153.04597
## Income          126.40122
## Advertising     112.88248
## Population      103.06024
## Price           383.14009
## ShelveLoc       297.08873
## Age             179.18864
## Education        73.60257
## Urban            17.11908
## US               32.01022

pred <- predict(rf_car1, newdata=data_test)
test.mse <- mean((pred - data_test$Sales)^2)
test.mse

## [1] 3.01168

rf_car2<-randomForest(Sales~.,data_train,mtry = 6,ntree = 500)
importance(rf_car2)

##             IncNodePurity
## CompPrice       162.06104
## Income          103.84505
## Advertising     106.55464
## Population       72.10733
## Price           469.57893
## ShelveLoc       339.68455
## Age             163.24846
## Education        55.28522
## Urban            10.36672
## US               20.75900

pred <- predict(rf_car2, newdata=data_test)
test.mse <- mean((pred - data_test$Sales)^2)
test.mse

## [1] 2.652791

The test MSE is smaller for model with m=5, compared with model m=3 in this question.

(f). (Optional question) Now analyze the data using BART, and report your results.

# install.packages("BART")
library(BART)
x<-Carseats[,-grep("Sales",colnames(Carseats))]
y<-Carseats[,"Sales"]
x_train<-x[train,]
y_train<-y[train]
x_test<-x[-train,]
y_test<-y[-train]
set.seed(1)
barfit<-gbart(x_train,y_train,x.test=x_test)

## *****Calling gbart: type=1
## *****Data:
## data:n,p,np: 200, 14, 200
## y1,yn: 2.781850, 1.091850
## x1,x[n*p]: 107.000000, 1.000000
## xp1,xp[np*p]: 111.000000, 1.000000
## *****Number of Trees: 200
## *****Number of Cut Points: 63 ... 1
## *****burn,nd,thin: 100,1000,1
## *****Prior:beta,alpha,tau,nu,lambda,offset: 2,0.95,0.273474,3,0.23074,7.57815
## *****sigma: 1.088371
## *****w (weights): 1.000000 ... 1.000000
## *****Dirichlet:sparse,theta,omega,a,b,rho,augment: 0,0,1,0.5,1,14,0
## *****printevery: 100
## 
## MCMC
## done 0 (out of 1100)
## done 100 (out of 1100)
## done 200 (out of 1100)
## done 300 (out of 1100)
## done 400 (out of 1100)
## done 500 (out of 1100)
## done 600 (out of 1100)
## done 700 (out of 1100)
## done 800 (out of 1100)
## done 900 (out of 1100)
## done 1000 (out of 1100)
## time: 2s
## trcnt,tecnt: 1000,1000

yhat.bart<-barfit$yhat.test.mean
mean((y_test-yhat.bart)^2)

## [1] 1.450842

This problem involves the OJ data set which is part of the ISLR2 package.

(a). Create a training set containing a random sample of 800 observations, and a test set containing the remaining observations.

library(ISLR2)
attach(OJ)
train<-sample(1:nrow(OJ),800,replace = FALSE)
oj_train<-OJ[train,]
oj_test<-OJ[-train,]

(b). Fit a tree to the training data, with Purchase as the response and the other variables as predictors. Use the summary() function to produce summary statistics about the tree, and describe the results obtained. What is the training error rate? How many terminal nodes does the tree have?

library(tree)
set.seed(1)
t_oj<-tree(Purchase~.,oj_train)
summary(t_oj)

## 
## Classification tree:
## tree(formula = Purchase ~ ., data = oj_train)
## Variables actually used in tree construction:
## [1] "LoyalCH"        "PriceDiff"      "SpecialCH"      "WeekofPurchase"
## Number of terminal nodes:  9 
## Residual mean deviance:  0.7036 = 556.6 / 791 
## Misclassification error rate: 0.1425 = 114 / 800

The training error rate is the mis-classification error rate 0.16, and terminal nodes of tree is 7

(c). Type in the name of the tree object in order to get a detailed text output. Pick one of the terminal nodes, and interpret the information displayed.

t_oj

## node), split, n, deviance, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 800 1067.00 CH ( 0.61375 0.38625 )  
##    2) LoyalCH < 0.5036 349  410.60 MM ( 0.27507 0.72493 )  
##      4) LoyalCH < 0.136698 109   46.46 MM ( 0.05505 0.94495 ) *
##      5) LoyalCH > 0.136698 240  317.60 MM ( 0.37500 0.62500 )  
##       10) PriceDiff < 0.195 103  101.40 MM ( 0.19417 0.80583 )  
##         20) SpecialCH < 0.5 89   70.39 MM ( 0.13483 0.86517 ) *
##         21) SpecialCH > 0.5 14   19.12 CH ( 0.57143 0.42857 ) *
##       11) PriceDiff > 0.195 137  189.90 CH ( 0.51095 0.48905 )  
##         22) WeekofPurchase < 247.5 54   63.81 MM ( 0.27778 0.72222 ) *
##         23) WeekofPurchase > 247.5 83  106.10 CH ( 0.66265 0.33735 ) *
##    3) LoyalCH > 0.5036 451  338.40 CH ( 0.87583 0.12417 )  
##      6) LoyalCH < 0.754144 183  206.40 CH ( 0.74863 0.25137 )  
##       12) PriceDiff < 0.265 105  140.50 CH ( 0.60952 0.39048 )  
##         24) PriceDiff < -0.35 15   15.01 MM ( 0.20000 0.80000 ) *
##         25) PriceDiff > -0.35 90  113.10 CH ( 0.67778 0.32222 ) *
##       13) PriceDiff > 0.265 78   37.15 CH ( 0.93590 0.06410 ) *
##      7) LoyalCH > 0.754144 268   85.39 CH ( 0.96269 0.03731 ) *

When we only focus on the fist node from left of the tree, we can describe like that if [Customer brand loyalty for CH] is less than 0.50395, then we need to check whether this [Customer brand loyalty for CH] is also less than 0.276142. Customer are expected to purchase MM Orange Juice if the answer to the above question is still “yes”.

(d). Create a plot of the tree, and interpret the results.

plot(t_oj)
text(t_oj,pretty = 0)

When we predict the customers purchase intention, the most important variable is [Customer brand loyalty for CH], and the difference in sales price of these two brand of orange juice.

(e). Predict the response on the test data, and produce a confusion matrix comparing the test labels to the predicted test labels. What is the test error rate?

ytest<-oj_test$Purchase
pred<-predict(t_oj,oj_test,type="class")
table(pred,ytest)

##     ytest
## pred  CH  MM
##   CH 141  41
##   MM  21  67

test_err<-(31+16)/nrow(oj_test)
test_err

## [1] 0.1740741

(f). Apply the cv.tree() function to the training set in order to determine the optimal tree size.

set.seed(1)
cv_oj<-cv.tree(t_oj,FUN=prune.misclass)

(g). Produce a plot with tree size on the \(x\)-axis and cross-validated classification error rate on the \(y\)-axis.

plot(cv_oj$size,cv_oj$dev,type="b")

(h). Which tree size corresponds to the lowest cross-validated classification error rate? As the tree size increases, the cross-validated classification error rate will always decrease, but after 4, the classification error rate change very slightly. We may consider about choose 4 as size of our final tree.

(i). Produce a pruned tree corresponding to the optimal tree size obtained using cross-validation. If cross-validation does not lead to selection of a pruned tree, then create a pruned tree with five terminal nodes.

prune_oj<-prune.misclass(t_oj,best = 4)
plot(prune_oj)
text(prune_oj,pretty=0)

(j). Compare the training error rates between the pruned and unpruned trees. Which is higher?

summary(t_oj)

## 
## Classification tree:
## tree(formula = Purchase ~ ., data = oj_train)
## Variables actually used in tree construction:
## [1] "LoyalCH"        "PriceDiff"      "SpecialCH"      "WeekofPurchase"
## Number of terminal nodes:  9 
## Residual mean deviance:  0.7036 = 556.6 / 791 
## Misclassification error rate: 0.1425 = 114 / 800

summary(prune_oj)

## 
## Classification tree:
## snip.tree(tree = t_oj, nodes = c(10L, 3L))
## Variables actually used in tree construction:
## [1] "LoyalCH"        "PriceDiff"      "WeekofPurchase"
## Number of terminal nodes:  5 
## Residual mean deviance:  0.8254 = 656.2 / 795 
## Misclassification error rate: 0.1562 = 125 / 800

We can get the value of train error rate from [Misclassification error rate]. 0.1731 for pruned tree and 0.16 for unpruned tree, so higher for pruned tree.

(k). Compare the test error rates between the pruned and unpruned trees. Which is higher?

pred_prune<-predict(prune_oj,oj_test,type="class")
pred<-predict(t_oj,oj_test,type="class")
ytest<-oj_test$Purchase
table(pred,ytest)

##     ytest
## pred  CH  MM
##   CH 141  41
##   MM  21  67

table(pred_prune,ytest)

##           ytest
## pred_prune  CH  MM
##         CH 138  41
##         MM  24  67

te<-(31+16)/270
te_pruned<-(37+15)/270
te

## [1] 0.1740741

te_pruned

## [1] 0.1925926

Higher test error rate after pruning.

We now use boosting to predict Salary in the Hitters data set.

(a). Remove the observations for whom the salary information is unknown, and then log-transform the salaries.

attach(Hitters)
str(Hitters)

## 'data.frame':    322 obs. of  20 variables:
##  $ AtBat    : int  293 315 479 496 321 594 185 298 323 401 ...
##  $ Hits     : int  66 81 130 141 87 169 37 73 81 92 ...
##  $ HmRun    : int  1 7 18 20 10 4 1 0 6 17 ...
##  $ Runs     : int  30 24 66 65 39 74 23 24 26 49 ...
##  $ RBI      : int  29 38 72 78 42 51 8 24 32 66 ...
##  $ Walks    : int  14 39 76 37 30 35 21 7 8 65 ...
##  $ Years    : int  1 14 3 11 2 11 2 3 2 13 ...
##  $ CAtBat   : int  293 3449 1624 5628 396 4408 214 509 341 5206 ...
##  $ CHits    : int  66 835 457 1575 101 1133 42 108 86 1332 ...
##  $ CHmRun   : int  1 69 63 225 12 19 1 0 6 253 ...
##  $ CRuns    : int  30 321 224 828 48 501 30 41 32 784 ...
##  $ CRBI     : int  29 414 266 838 46 336 9 37 34 890 ...
##  $ CWalks   : int  14 375 263 354 33 194 24 12 8 866 ...
##  $ League   : Factor w/ 2 levels "A","N": 1 2 1 2 2 1 2 1 2 1 ...
##  $ Division : Factor w/ 2 levels "E","W": 1 2 2 1 1 2 1 2 2 1 ...
##  $ PutOuts  : int  446 632 880 200 805 282 76 121 143 0 ...
##  $ Assists  : int  33 43 82 11 40 421 127 283 290 0 ...
##  $ Errors   : int  20 10 14 3 4 25 7 9 19 0 ...
##  $ Salary   : num  NA 475 480 500 91.5 750 70 100 75 1100 ...
##  $ NewLeague: Factor w/ 2 levels "A","N": 1 2 1 2 2 1 1 1 2 1 ...

sum(is.na(Salary))==sum(is.na(Hitters))

## [1] TRUE

h<-na.omit(Hitters)
log_salary<-log(h$Salary)
h<-data.frame(h,log_salary)

(b). Create a training set consisting of the first 200 observations, and a test set consisting of the remaining observations.

h_train<-h[1:200,]
h_test<-h[-c(1:200),]

(c). Perform boosting on the training set with 1,000 trees for a range of values of the shrinkage parameter \(λ\). Produce a plot with different shrinkage values on the \(x\)-axis and the corresponding training set MSE on the \(y\)-axis.

# install.packages("gbm")
library(gbm)
set.seed(1)
ytrain<-h_train$log_salary
lam=10^(seq(-2,0,0.1))
m=vector()
for (i in 1:length(lam)){
  
boost_h<-gbm(log_salary~.,data=h_train,distribution = "gaussian", shrinkage = lam[i], n.trees=1000)
yhat.boost <- predict (boost_h, h_train, n.trees = 1000)
m=append(m,mean ((yhat.boost - ytrain)^2))

}
plot(lam,m,xlab = "lambda",ylab = "MSE for train data")
lines(lam,m)

(d). Produce a plot with different shrinkage values on the \(x\)-axis and the corresponding test set MSE on the \(y\)-axis.

ytest<-h_test$log_salary
set.seed(1)
lam=10^(seq(-2,0,0.1))
m=vector()
for (i in c(1:length(lam))){
  
  boost_h<-gbm(log_salary~.,data=h_train,distribution = "gaussian", shrinkage = lam[i], n.trees=1000)
yhat.boost <- predict (boost_h, h_test,n.trees = 1000)
m=append(m,mean ((yhat.boost - ytest)^2))

}
plot(lam,m,xlab = "lambda",ylab = "MSE for test data")
lines(lam,m)

min(m)

## [1] 0.002498284

(e). Compare the test MSE of boosting to the test MSE that results from applying two of the regression approaches seen in Chapters 3 and 6.

# Linear regression
set.seed(1)
lm_model<-lm(log_salary~.,h_train)
lm_pred<-predict(lm_model,h_test)
lm_te<-mean((lm_pred-h_test$log_salary)^2)
lm_te

## [1] 0.1050892

# Lasso Regression
library(glmnet)
set.seed(1)
x_mat<-model.matrix(log_salary~.,h_train)
y_mat<-model.matrix(log_salary~.,h_test)
y=h_train$log_salary
lasso_model<-glmnet(x_mat,y,alpha=1) # alpha=1 for Lasso and 0 for Ridge
lasso_pred<-predict(lasso_model,s=0.01,y_mat)# s is lambda here
lasso_te<-mean((lasso_pred-h_test$log_salary)^2)
lasso_te

## [1] 0.1087135

The test MSE of Lasso model and Linear regression model is very close,0.1087135 and 0.1050892 respectively, while boosting model pwns the much smaller test MSE 0.002498284.

(f). Which variables appear to be the most important predictors in the boosted model?

boosted.model = gbm(Salary~., data = h_train, 
distribution = "gaussian",n.trees = 1000, 
shrinkage = lam[which.min(m)])

summary(boosted.model)

Hits seem to be the most important based on the result.

(g). Now apply bagging to the training set. What is the test set MSE for this approach?

library(randomForest)
set.seed(1)
bag_model = randomForest(log_salary~., data = h_train, distribution = "gaussian", n.trees = 1000, 
shrinkage = lam[which.min(m)],mtry = 19,importance = TRUE)
bag_pred = predict(bag_model, h_test)
bag_te = mean((bag_pred - h_test$log_salary)^2)
bag_te

## [1] 0.0002454343

This question uses the Caravan data set.

(a). Create a training set consisting of the first 1,000 observations, and a test set consisting of the remaining observations.

# install.packages("ISLR")
library(ISLR)
attach(Caravan)
Caravan$Purchase <- ifelse(Caravan$Purchase == "Yes", 1, 0)
c_train<-Caravan[1:1000,]
c_test<-Caravan[-c(1:1000),]

(b). Fit a boosting model to the training set with Purchase as the response and the other variables as predictors. Use 1,000 trees, and a shrinkage value of 0.01. Which predictors appear to be the most important?

library (gbm)
set.seed (1)
boost_c <- gbm (Purchase~ ., data = c_train,
distribution = "gaussian", n.trees = 1000,
shrinkage = 0.01)
summary(boost_c)

PBRAND seems to be the most important based on the result.

(c). Use the boosting model to predict the response on the test data. Predict that a person will make a purchase if the estimated proability of purchase is greater than 20%. Form a confusion matrix. What fraction of the people predicted to make a purchase do in fact make one? How does this compare with the results obtained from applying KNN or logistic regression to this data set?

probs.test <- predict(boost_c, c_test, n.trees = 1000, type = "response")
pred.test <- ifelse(probs.test > 0.2, 1,0)
table(c_test$Purchase, pred.test)

##    pred.test
##        0    1
##   0 4493   40
##   1  278   11

11/51

## [1] 0.2156863

The fraction of people predicted to make a purchase that in fact make one is 0.2156863.

logit.caravan <- glm(Purchase ~ ., data = c_train, family = "binomial")
probs.test2 <- predict(logit.caravan, c_test, type = "response")
pred.test2 <- ifelse(probs.test > 0.2, 1, 0)
table(c_test$Purchase, pred.test2)

##    pred.test2
##        0    1
##   0 4493   40
##   1  278   11

For logistic regression, the fraction of people predicted to make a purchase that in fact make one is still 0.2156863, same as boosting.

Econ 7035, Spring 2022, Assignment 5

ZHANG Guiling 22432264

2023/4/12

Applied Exercises

Exercise in section 8.4