library(MASS)
attach(Boston)
# install.packages("randomForest")
library(randomForest)
set.seed(1)
train<-sample(1:nrow(Boston),nrow(Boston)/2)
x.train <- Boston[train,-grep("medv",colnames(Boston))]
y.train <- Boston[train,grep("medv",colnames(Boston))]
x.test <- Boston[-train,-grep("medv",colnames(Boston))]
y.test <- Boston[-train,grep("medv",colnames(Boston))]
#p/2
rf_boston1<-randomForest(x=x.train, y=y.train, xtest=x.test, ytest=y.test,mtry =6, ntree = 500)
#p <bagging>
rf_boston2<-randomForest(x=x.train, y=y.train, xtest=x.test, ytest=y.test, mtry = ncol(Boston)-1,ntree = 500)
# sqrt(p)
rf_boston3<-randomForest(x=x.train, y=y.train, xtest=x.test, ytest=y.test, mtry = sqrt(ncol(Boston)-1),ntree = 500)
plot(1:500, rf_boston1$test$mse, type='l', col='red', xlab="Number of trees", ylab = "Test MSE", ylim=c(5, 50))
lines(1:500, rf_boston2$test$mse, col = "darkgreen", type = "l")
lines(1:500, rf_boston3$test$mse, col = "blue", type = "l")
legend("topright", c("m = p", "m = p/2", "m = sqrt(p)"), col = c("darkgreen", "red", "blue"), cex = 1, lty = 1)
Test MSE is very high for a single tree, but it decreases as the number
of trees increases. Besides, Test MSE for all predictors is higher than
for half the predictors or the square root of the number of
predictors.
(a). Split the data set into a training set and a test set.
library(ISLR2)
attach(Carseats)
# str(Carseats)
set.seed(1)
train<-sample(1:nrow(Carseats),nrow(Carseats)/2)
data_train<-Carseats[train,]
data_test<-Carseats[-train,]
y_test<-Carseats[-train,"Sales"]
(b). Fit a regression tree to the training set. Plot the tree, and interpret the results. What test MSE do you obtain?
#install.packages("tree")
library(tree)
set.seed(1)
rt<-tree(Sales~., data_train)
plot(rt)
text(rt,pretty=0)
summary(rt)
##
## Regression tree:
## tree(formula = Sales ~ ., data = data_train)
## Variables actually used in tree construction:
## [1] "ShelveLoc" "Price" "Age" "Advertising" "CompPrice"
## [6] "US"
## Number of terminal nodes: 18
## Residual mean deviance: 2.167 = 394.3 / 182
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -3.88200 -0.88200 -0.08712 0.00000 0.89590 4.09900
car.pred <- predict(rt, newdata = data_test)
mean((car.pred - data_test$Sales)^2)
## [1] 4.922039
ShelveLoc seems to be the most important factor for Sales, and the price is the second variables we need to consider. The test MSE is 4.92
(c). Use cross-validation in order to determine the optimal level of tree complexity. Does pruning the tree improve the test MSE?
set.seed(1)
cv.car <- cv.tree (rt)
plot(cv.car$size , cv.car$dev, type = "b")
prune.car <- prune.tree(rt, best = 7)
plot(prune.car)
text(prune.car, pretty = 0)
car.pred_p <- predict(prune.car, newdata = data_test)
mean((car.pred_p - data_test$Sales)^2)
## [1] 4.861001
Pruning the tree improve the test MSE slightly in our case.
(d). Use the bagging approach in order to analyze this data. What test MSE do you obtain? Use the importance() function to determine which variables are most important.
set.seed(1)
b.car<-randomForest(Sales~.,data=Carseats, subset = train, mtry = col(Carseats)-1, importance = TRUE )
b.car
##
## Call:
## randomForest(formula = Sales ~ ., data = Carseats, mtry = col(Carseats) - 1, importance = TRUE, subset = train)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 1
##
## Mean of squared residuals: 4.963872
## % Var explained: 36.87
y_hat<-predict(b.car, newdata = data_test)
mean((y_hat-y_test)^2)
## [1] 4.747522
importance(b.car)
## %IncMSE IncNodePurity
## CompPrice 6.18470194 113.03142
## Income 0.19683846 105.61571
## Advertising 7.74042647 79.82425
## Population -5.14440528 97.28034
## Price 21.89468946 203.65198
## ShelveLoc 21.54002668 177.28749
## Age 9.42187647 133.20928
## Education -0.07673498 78.31426
## Urban 2.21886591 18.13972
## US 7.18678895 41.29833
(e). Use random forests to analyze this data. What test MSE do you obtain? Use the importance() function to determine which variables are most important. Describe the effect of m, the number of variables considered at each split, on the error rate obtained.
rf_car1<-randomForest(Sales~.,data_train,mtry =3, ntree = 500)
importance(rf_car1)
## IncNodePurity
## CompPrice 153.04597
## Income 126.40122
## Advertising 112.88248
## Population 103.06024
## Price 383.14009
## ShelveLoc 297.08873
## Age 179.18864
## Education 73.60257
## Urban 17.11908
## US 32.01022
pred <- predict(rf_car1, newdata=data_test)
test.mse <- mean((pred - data_test$Sales)^2)
test.mse
## [1] 3.01168
rf_car2<-randomForest(Sales~.,data_train,mtry = 6,ntree = 500)
importance(rf_car2)
## IncNodePurity
## CompPrice 162.06104
## Income 103.84505
## Advertising 106.55464
## Population 72.10733
## Price 469.57893
## ShelveLoc 339.68455
## Age 163.24846
## Education 55.28522
## Urban 10.36672
## US 20.75900
pred <- predict(rf_car2, newdata=data_test)
test.mse <- mean((pred - data_test$Sales)^2)
test.mse
## [1] 2.652791
The test MSE is smaller for model with m=5, compared with model m=3 in this question.
(f). (Optional question) Now analyze the data using BART, and report your results.
# install.packages("BART")
library(BART)
x<-Carseats[,-grep("Sales",colnames(Carseats))]
y<-Carseats[,"Sales"]
x_train<-x[train,]
y_train<-y[train]
x_test<-x[-train,]
y_test<-y[-train]
set.seed(1)
barfit<-gbart(x_train,y_train,x.test=x_test)
## *****Calling gbart: type=1
## *****Data:
## data:n,p,np: 200, 14, 200
## y1,yn: 2.781850, 1.091850
## x1,x[n*p]: 107.000000, 1.000000
## xp1,xp[np*p]: 111.000000, 1.000000
## *****Number of Trees: 200
## *****Number of Cut Points: 63 ... 1
## *****burn,nd,thin: 100,1000,1
## *****Prior:beta,alpha,tau,nu,lambda,offset: 2,0.95,0.273474,3,0.23074,7.57815
## *****sigma: 1.088371
## *****w (weights): 1.000000 ... 1.000000
## *****Dirichlet:sparse,theta,omega,a,b,rho,augment: 0,0,1,0.5,1,14,0
## *****printevery: 100
##
## MCMC
## done 0 (out of 1100)
## done 100 (out of 1100)
## done 200 (out of 1100)
## done 300 (out of 1100)
## done 400 (out of 1100)
## done 500 (out of 1100)
## done 600 (out of 1100)
## done 700 (out of 1100)
## done 800 (out of 1100)
## done 900 (out of 1100)
## done 1000 (out of 1100)
## time: 2s
## trcnt,tecnt: 1000,1000
yhat.bart<-barfit$yhat.test.mean
mean((y_test-yhat.bart)^2)
## [1] 1.450842
(a). Create a training set containing a random sample of 800 observations, and a test set containing the remaining observations.
library(ISLR2)
attach(OJ)
train<-sample(1:nrow(OJ),800,replace = FALSE)
oj_train<-OJ[train,]
oj_test<-OJ[-train,]
(b). Fit a tree to the training data, with Purchase as the response and the other variables as predictors. Use the summary() function to produce summary statistics about the tree, and describe the results obtained. What is the training error rate? How many terminal nodes does the tree have?
library(tree)
set.seed(1)
t_oj<-tree(Purchase~.,oj_train)
summary(t_oj)
##
## Classification tree:
## tree(formula = Purchase ~ ., data = oj_train)
## Variables actually used in tree construction:
## [1] "LoyalCH" "PriceDiff" "SpecialCH" "WeekofPurchase"
## Number of terminal nodes: 9
## Residual mean deviance: 0.7036 = 556.6 / 791
## Misclassification error rate: 0.1425 = 114 / 800
The training error rate is the mis-classification error rate 0.16, and terminal nodes of tree is 7
(c). Type in the name of the tree object in order to get a detailed text output. Pick one of the terminal nodes, and interpret the information displayed.
t_oj
## node), split, n, deviance, yval, (yprob)
## * denotes terminal node
##
## 1) root 800 1067.00 CH ( 0.61375 0.38625 )
## 2) LoyalCH < 0.5036 349 410.60 MM ( 0.27507 0.72493 )
## 4) LoyalCH < 0.136698 109 46.46 MM ( 0.05505 0.94495 ) *
## 5) LoyalCH > 0.136698 240 317.60 MM ( 0.37500 0.62500 )
## 10) PriceDiff < 0.195 103 101.40 MM ( 0.19417 0.80583 )
## 20) SpecialCH < 0.5 89 70.39 MM ( 0.13483 0.86517 ) *
## 21) SpecialCH > 0.5 14 19.12 CH ( 0.57143 0.42857 ) *
## 11) PriceDiff > 0.195 137 189.90 CH ( 0.51095 0.48905 )
## 22) WeekofPurchase < 247.5 54 63.81 MM ( 0.27778 0.72222 ) *
## 23) WeekofPurchase > 247.5 83 106.10 CH ( 0.66265 0.33735 ) *
## 3) LoyalCH > 0.5036 451 338.40 CH ( 0.87583 0.12417 )
## 6) LoyalCH < 0.754144 183 206.40 CH ( 0.74863 0.25137 )
## 12) PriceDiff < 0.265 105 140.50 CH ( 0.60952 0.39048 )
## 24) PriceDiff < -0.35 15 15.01 MM ( 0.20000 0.80000 ) *
## 25) PriceDiff > -0.35 90 113.10 CH ( 0.67778 0.32222 ) *
## 13) PriceDiff > 0.265 78 37.15 CH ( 0.93590 0.06410 ) *
## 7) LoyalCH > 0.754144 268 85.39 CH ( 0.96269 0.03731 ) *
When we only focus on the fist node from left of the tree, we can describe like that if [Customer brand loyalty for CH] is less than 0.50395, then we need to check whether this [Customer brand loyalty for CH] is also less than 0.276142. Customer are expected to purchase MM Orange Juice if the answer to the above question is still “yes”.
(d). Create a plot of the tree, and interpret the results.
plot(t_oj)
text(t_oj,pretty = 0)
When we predict the customers purchase intention, the most important
variable is [Customer brand loyalty for CH], and the difference in sales
price of these two brand of orange juice.
(e). Predict the response on the test data, and produce a confusion matrix comparing the test labels to the predicted test labels. What is the test error rate?
ytest<-oj_test$Purchase
pred<-predict(t_oj,oj_test,type="class")
table(pred,ytest)
## ytest
## pred CH MM
## CH 141 41
## MM 21 67
test_err<-(31+16)/nrow(oj_test)
test_err
## [1] 0.1740741
(f). Apply the cv.tree() function to the training set in order to determine the optimal tree size.
set.seed(1)
cv_oj<-cv.tree(t_oj,FUN=prune.misclass)
(g). Produce a plot with tree size on the \(x\)-axis and cross-validated classification error rate on the \(y\)-axis.
plot(cv_oj$size,cv_oj$dev,type="b")
(h). Which tree size corresponds to the lowest cross-validated classification error rate? As the tree size increases, the cross-validated classification error rate will always decrease, but after 4, the classification error rate change very slightly. We may consider about choose 4 as size of our final tree.
(i). Produce a pruned tree corresponding to the optimal tree size obtained using cross-validation. If cross-validation does not lead to selection of a pruned tree, then create a pruned tree with five terminal nodes.
prune_oj<-prune.misclass(t_oj,best = 4)
plot(prune_oj)
text(prune_oj,pretty=0)
(j). Compare the training error rates between the pruned and unpruned trees. Which is higher?
summary(t_oj)
##
## Classification tree:
## tree(formula = Purchase ~ ., data = oj_train)
## Variables actually used in tree construction:
## [1] "LoyalCH" "PriceDiff" "SpecialCH" "WeekofPurchase"
## Number of terminal nodes: 9
## Residual mean deviance: 0.7036 = 556.6 / 791
## Misclassification error rate: 0.1425 = 114 / 800
summary(prune_oj)
##
## Classification tree:
## snip.tree(tree = t_oj, nodes = c(10L, 3L))
## Variables actually used in tree construction:
## [1] "LoyalCH" "PriceDiff" "WeekofPurchase"
## Number of terminal nodes: 5
## Residual mean deviance: 0.8254 = 656.2 / 795
## Misclassification error rate: 0.1562 = 125 / 800
We can get the value of train error rate from [Misclassification error rate]. 0.1731 for pruned tree and 0.16 for unpruned tree, so higher for pruned tree.
(k). Compare the test error rates between the pruned and unpruned trees. Which is higher?
pred_prune<-predict(prune_oj,oj_test,type="class")
pred<-predict(t_oj,oj_test,type="class")
ytest<-oj_test$Purchase
table(pred,ytest)
## ytest
## pred CH MM
## CH 141 41
## MM 21 67
table(pred_prune,ytest)
## ytest
## pred_prune CH MM
## CH 138 41
## MM 24 67
te<-(31+16)/270
te_pruned<-(37+15)/270
te
## [1] 0.1740741
te_pruned
## [1] 0.1925926
Higher test error rate after pruning.
(a). Remove the observations for whom the salary information is unknown, and then log-transform the salaries.
attach(Hitters)
str(Hitters)
## 'data.frame': 322 obs. of 20 variables:
## $ AtBat : int 293 315 479 496 321 594 185 298 323 401 ...
## $ Hits : int 66 81 130 141 87 169 37 73 81 92 ...
## $ HmRun : int 1 7 18 20 10 4 1 0 6 17 ...
## $ Runs : int 30 24 66 65 39 74 23 24 26 49 ...
## $ RBI : int 29 38 72 78 42 51 8 24 32 66 ...
## $ Walks : int 14 39 76 37 30 35 21 7 8 65 ...
## $ Years : int 1 14 3 11 2 11 2 3 2 13 ...
## $ CAtBat : int 293 3449 1624 5628 396 4408 214 509 341 5206 ...
## $ CHits : int 66 835 457 1575 101 1133 42 108 86 1332 ...
## $ CHmRun : int 1 69 63 225 12 19 1 0 6 253 ...
## $ CRuns : int 30 321 224 828 48 501 30 41 32 784 ...
## $ CRBI : int 29 414 266 838 46 336 9 37 34 890 ...
## $ CWalks : int 14 375 263 354 33 194 24 12 8 866 ...
## $ League : Factor w/ 2 levels "A","N": 1 2 1 2 2 1 2 1 2 1 ...
## $ Division : Factor w/ 2 levels "E","W": 1 2 2 1 1 2 1 2 2 1 ...
## $ PutOuts : int 446 632 880 200 805 282 76 121 143 0 ...
## $ Assists : int 33 43 82 11 40 421 127 283 290 0 ...
## $ Errors : int 20 10 14 3 4 25 7 9 19 0 ...
## $ Salary : num NA 475 480 500 91.5 750 70 100 75 1100 ...
## $ NewLeague: Factor w/ 2 levels "A","N": 1 2 1 2 2 1 1 1 2 1 ...
sum(is.na(Salary))==sum(is.na(Hitters))
## [1] TRUE
h<-na.omit(Hitters)
log_salary<-log(h$Salary)
h<-data.frame(h,log_salary)
(b). Create a training set consisting of the first 200 observations, and a test set consisting of the remaining observations.
h_train<-h[1:200,]
h_test<-h[-c(1:200),]
(c). Perform boosting on the training set with 1,000 trees for a range of values of the shrinkage parameter \(λ\). Produce a plot with different shrinkage values on the \(x\)-axis and the corresponding training set MSE on the \(y\)-axis.
# install.packages("gbm")
library(gbm)
set.seed(1)
ytrain<-h_train$log_salary
lam=10^(seq(-2,0,0.1))
m=vector()
for (i in 1:length(lam)){
boost_h<-gbm(log_salary~.,data=h_train,distribution = "gaussian", shrinkage = lam[i], n.trees=1000)
yhat.boost <- predict (boost_h, h_train, n.trees = 1000)
m=append(m,mean ((yhat.boost - ytrain)^2))
}
plot(lam,m,xlab = "lambda",ylab = "MSE for train data")
lines(lam,m)
(d). Produce a plot with different shrinkage values on the \(x\)-axis and the corresponding test set MSE on the \(y\)-axis.
ytest<-h_test$log_salary
set.seed(1)
lam=10^(seq(-2,0,0.1))
m=vector()
for (i in c(1:length(lam))){
boost_h<-gbm(log_salary~.,data=h_train,distribution = "gaussian", shrinkage = lam[i], n.trees=1000)
yhat.boost <- predict (boost_h, h_test,n.trees = 1000)
m=append(m,mean ((yhat.boost - ytest)^2))
}
plot(lam,m,xlab = "lambda",ylab = "MSE for test data")
lines(lam,m)
min(m)
## [1] 0.002498284
(e). Compare the test MSE of boosting to the test MSE that results from applying two of the regression approaches seen in Chapters 3 and 6.
# Linear regression
set.seed(1)
lm_model<-lm(log_salary~.,h_train)
lm_pred<-predict(lm_model,h_test)
lm_te<-mean((lm_pred-h_test$log_salary)^2)
lm_te
## [1] 0.1050892
# Lasso Regression
library(glmnet)
set.seed(1)
x_mat<-model.matrix(log_salary~.,h_train)
y_mat<-model.matrix(log_salary~.,h_test)
y=h_train$log_salary
lasso_model<-glmnet(x_mat,y,alpha=1) # alpha=1 for Lasso and 0 for Ridge
lasso_pred<-predict(lasso_model,s=0.01,y_mat)# s is lambda here
lasso_te<-mean((lasso_pred-h_test$log_salary)^2)
lasso_te
## [1] 0.1087135
The test MSE of Lasso model and Linear regression model is very close,0.1087135 and 0.1050892 respectively, while boosting model pwns the much smaller test MSE 0.002498284.
(f). Which variables appear to be the most important predictors in the boosted model?
boosted.model = gbm(Salary~., data = h_train,
distribution = "gaussian",n.trees = 1000,
shrinkage = lam[which.min(m)])
summary(boosted.model)
Hits seem to be the most important based on the result.
(g). Now apply bagging to the training set. What is the test set MSE for this approach?
library(randomForest)
set.seed(1)
bag_model = randomForest(log_salary~., data = h_train, distribution = "gaussian", n.trees = 1000,
shrinkage = lam[which.min(m)],mtry = 19,importance = TRUE)
bag_pred = predict(bag_model, h_test)
bag_te = mean((bag_pred - h_test$log_salary)^2)
bag_te
## [1] 0.0002454343
(a). Create a training set consisting of the first 1,000 observations, and a test set consisting of the remaining observations.
# install.packages("ISLR")
library(ISLR)
attach(Caravan)
Caravan$Purchase <- ifelse(Caravan$Purchase == "Yes", 1, 0)
c_train<-Caravan[1:1000,]
c_test<-Caravan[-c(1:1000),]
(b). Fit a boosting model to the training set with Purchase as the response and the other variables as predictors. Use 1,000 trees, and a shrinkage value of 0.01. Which predictors appear to be the most important?
library (gbm)
set.seed (1)
boost_c <- gbm (Purchase~ ., data = c_train,
distribution = "gaussian", n.trees = 1000,
shrinkage = 0.01)
summary(boost_c)
PBRAND seems to be the most important based on the result.
(c). Use the boosting model to predict the response on the test data. Predict that a person will make a purchase if the estimated proability of purchase is greater than 20%. Form a confusion matrix. What fraction of the people predicted to make a purchase do in fact make one? How does this compare with the results obtained from applying KNN or logistic regression to this data set?
probs.test <- predict(boost_c, c_test, n.trees = 1000, type = "response")
pred.test <- ifelse(probs.test > 0.2, 1,0)
table(c_test$Purchase, pred.test)
## pred.test
## 0 1
## 0 4493 40
## 1 278 11
11/51
## [1] 0.2156863
The fraction of people predicted to make a purchase that in fact make one is 0.2156863.
logit.caravan <- glm(Purchase ~ ., data = c_train, family = "binomial")
probs.test2 <- predict(logit.caravan, c_test, type = "response")
pred.test2 <- ifelse(probs.test > 0.2, 1, 0)
table(c_test$Purchase, pred.test2)
## pred.test2
## 0 1
## 0 4493 40
## 1 278 11
For logistic regression, the fraction of people predicted to make a purchase that in fact make one is still 0.2156863, same as boosting.