############
# Chapter 8 Homework
# Gabrielle Allen
# Dr. Coberly
# MATH 5553
############
# Problem 8
# In the lab, a classification tree was applied to the Carseats data set after
# converting Sales into a qualitative response variable. Now we will seek to
# predict Sales using regression trees and related approaches, treating the
# response as a quantitative variable.
# Part a: Split the data set into a training and test set.
library(tree)
library(ISLR)
attach(Carseats)
set.seed(1)
train <- sample(1:nrow(Carseats), nrow(Carseats)/2)
car.train <- Carseats[train, ]
car.test <- Carseats[-train, ]
# Part b: Fit a regression tree to the training set. Plot the tree, and interpret
# the results. What test MSE do you obtain?
car.tree <- tree(Sales ~ ., data = car.train)
plot(car.tree)
text(car.tree, pretty=0)

summary(car.tree)
##
## Regression tree:
## tree(formula = Sales ~ ., data = car.train)
## Variables actually used in tree construction:
## [1] "ShelveLoc" "Price" "Age" "Advertising" "Income"
## [6] "CompPrice"
## Number of terminal nodes: 18
## Residual mean deviance: 2.36 = 429.5 / 182
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -4.2570 -1.0360 0.1024 0.0000 0.9301 3.9130
car.pred <- predict(car.tree, newdata = car.test)
mean((car.pred - car.test$Sales)^2)
## [1] 4.148897
# So the test MSE for the unpruned tree is 4.148897.
# Part c: Use cross-validation in order to determine the optimal level of tree
# complexity. Does pruning the tree improve the test MSE?
set.seed(1)
car.cv <- cv.tree(car.tree)
par(mfrow = c(1, 2))
plot(car.cv$size, car.cv$dev, type = "b")
plot(car.cv$k, car.cv$dev, type = "b")

par(mfrow = c(1,1))
# From the plot, a tree size of 7 appears to give the lowest cross-validated deviance.
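# As a check, the same choice can be read off programmatically by taking the tree
# size with the smallest cross-validated deviance (a minimal sketch; with this
# seed it may or may not agree exactly with the value read from the plot):
car.cv$size[which.min(car.cv$dev)]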
prune.car <- prune.tree(car.tree, best = 7)
plot(prune.car)
text(prune.car, pretty = 0)

predict.prune <- predict(prune.car, newdata = car.test)
mean((predict.prune - car.test$Sales)^2)
## [1] 5.340397
# In this case, pruning the tree actually makes the test MSE worse - it rises from
# about 4.15 to 5.34. This isn't always the case: depending on the training and
# test split, the pruned tree can give a lower test MSE than the unpruned tree
# (when the unpruned tree has overfit the training data).
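# A minimal sketch (using a few different seeds, an assumption not in the original
# analysis) to illustrate that the pruned-versus-unpruned comparison depends on
# the particular split:
for (s in 2:4) {
  set.seed(s)
  tr <- sample(1:nrow(Carseats), nrow(Carseats) / 2)
  fit.s <- tree(Sales ~ ., data = Carseats[tr, ])
  prune.s <- prune.tree(fit.s, best = 7)
  test.s <- Carseats[-tr, ]
  cat("seed", s,
      "unpruned:", round(mean((predict(fit.s, test.s) - test.s$Sales)^2), 2),
      "pruned:", round(mean((predict(prune.s, test.s) - test.s$Sales)^2), 2), "\n")
}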
# Part d: Use the bagging approach in order to analyze this data. What test MSE do
# you obtain? Use the importance() function to determine which variables are most
# important.
library(randomForest)
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
set.seed(1)
bag.car <- randomForest(Sales ~ ., data = Carseats, subset = train, mtry = 10,
importance = TRUE)
bag.car
##
## Call:
## randomForest(formula = Sales ~ ., data = Carseats, mtry = 10, importance = TRUE, subset = train)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 10
##
## Mean of squared residuals: 2.825834
## % Var explained: 62.98
predict.bag <- predict(bag.car, newdata = car.test)
mean((predict.bag - car.test$Sales)^2)
## [1] 2.554292
# So in this case the test MSE is lower - 2.55, below both the pruned tree and the
# original regression tree. This is what we expect: by averaging many trees grown
# on bootstrap samples, bagging keeps the low bias of deep trees while reducing
# their variance.
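# A minimal sketch: for a regression forest, the fitted object stores the
# out-of-bag MSE after each tree (bag.car$mse), which gives a quick picture of
# how the error settles down as more trees are averaged.
plot(bag.car$mse, type = "l", xlab = "Number of trees", ylab = "OOB MSE")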
importance(bag.car)
## %IncMSE IncNodePurity
## CompPrice 14.032030 129.568747
## Income 5.523038 75.448682
## Advertising 13.571285 131.246840
## Population 1.968853 63.042648
## Price 56.863812 504.158108
## ShelveLoc 44.720455 323.055042
## Age 22.225468 194.915976
## Education 4.823966 40.810991
## Urban -1.902185 8.746566
## US 6.632887 14.599565
varImpPlot(bag.car)

# As seen above, it looks like the price of the carseat and where it is located on
# the shelf are the most important predictors of how a carseat will sell. Age,
# competitor price, and advertising budget also appear to have an effect, but all
# other variables seem to be less important.
# Part e: Use random forests to analyze this data. What test MSE do you obtain?
# Use the importance() function to determine which variables are most important.
# Describe the effect of m, the number of variables considered at each split, on
# the error rate obtained.
set.seed(1)
rf.car1 <- randomForest(Sales ~ ., data = Carseats, subset = train, mtry = 1,
importance = TRUE)
set.seed(1)
pred.rf1 <- predict(rf.car1, newdata = car.test)
mean((pred.rf1 - car.test$Sales)^2)
## [1] 5.07948
# So when m = 1, the MSE is 5.07948.
set.seed(1)
rf.car2 <- randomForest(Sales ~ ., data = Carseats, subset = train, mtry = 2,
importance = TRUE)
set.seed(1)
pred.rf2 <- predict(rf.car2, newdata = car.test)
mean((pred.rf2 - car.test$Sales)^2)
## [1] 3.712865
# So when m = 2, the MSE is 3.712865, lower than when m = 1.
set.seed(1)
rf.car <- randomForest(Sales ~ ., data = Carseats, subset = train, mtry = 3,
importance = TRUE)
set.seed(1)
pred.rf <- predict(rf.car, newdata = car.test)
mean((pred.rf - car.test$Sales)^2)
## [1] 3.30763
# So the test MSE (3.30763) is slightly larger for the random forest with m = 3
# than it is for bagging, but lower than the MSE for the pruned tree and the
# original tree. Note that m = 3 is roughly the square root of p here, since
# p = 10 predictors and sqrt(10) is about 3.16.
set.seed(1)
rf.car4 <- randomForest(Sales ~ ., data = Carseats, subset = train, mtry = 4,
importance = TRUE)
set.seed(1)
pred.rf4 <- predict(rf.car4, newdata = car.test)
mean((pred.rf4 - car.test$Sales)^2)
## [1] 2.998498
# So when m = 4, the MSE is 2.998498, lower than when m = 3. Since sqrt(10) lies
# between 3 and 4, m = 4 is the other natural rounding of the usual sqrt(p) rule
# of thumb.
set.seed(1)
rf.car5 <- randomForest(Sales ~ ., data = Carseats, subset = train, mtry = 5,
importance = TRUE)
set.seed(1)
pred.rf5 <- predict(rf.car5, newdata = car.test)
mean((pred.rf5 - car.test$Sales)^2)
## [1] 2.835178
# So when m = 5, the MSE is 2.835178, lower than when m = 4.
set.seed(1)
rf.car6 <- randomForest(Sales ~ ., data = Carseats, subset = train, mtry = 6,
importance = TRUE)
set.seed(1)
pred.rf6 <- predict(rf.car6, newdata = car.test)
mean((pred.rf6 - car.test$Sales)^2)
## [1] 2.786896
# So when m = 6, the MSE is 2.786896, lower than when m = 5.
set.seed(1)
rf.car7 <- randomForest(Sales ~ ., data = Carseats, subset = train, mtry = 7,
importance = TRUE)
set.seed(1)
pred.rf7 <- predict(rf.car7, newdata = car.test)
mean((pred.rf7 - car.test$Sales)^2)
## [1] 2.677964
# So when m = 7, the MSE is 2.677964, lower than when m = 6.
set.seed(1)
rf.car8 <- randomForest(Sales ~ ., data = Carseats, subset = train, mtry = 8,
importance = TRUE)
set.seed(1)
pred.rf8 <- predict(rf.car8, newdata = car.test)
mean((pred.rf8 - car.test$Sales)^2)
## [1] 2.610887
# So when m = 8, the MSE is 2.610887, lower than when m = 7.
set.seed(1)
rf.car9 <- randomForest(Sales ~ ., data = Carseats, subset = train, mtry = 9,
importance = TRUE)
set.seed(1)
pred.rf9 <- predict(rf.car9, newdata = car.test)
mean((pred.rf9 - car.test$Sales)^2)
## [1] 2.615076
# So when m = 9, the MSE is 2.615076, very slightly higher than when m = 8.
# In this particular case, as m grows toward 10 (the total number of predictor
# variables), the test MSE generally decreases, though not strictly at every step.
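# A minimal sketch that repeats the fits above in a single loop, so the test MSE
# can be compared across every value of m at once (same seed, training set, and
# settings as above, so it should reproduce the values reported above):
mse.m <- numeric(10)
for (m in 1:10) {
  set.seed(1)
  fit.m <- randomForest(Sales ~ ., data = Carseats, subset = train, mtry = m,
                        importance = TRUE)
  mse.m[m] <- mean((predict(fit.m, newdata = car.test) - car.test$Sales)^2)
}
plot(1:10, mse.m, type = "b", xlab = "m (mtry)", ylab = "Test MSE")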
importance(rf.car)
## %IncMSE IncNodePurity
## CompPrice 9.849043 134.17665
## Income 3.534622 130.84360
## Advertising 13.075334 139.40128
## Population -0.612195 100.34668
## Price 35.530402 380.27956
## ShelveLoc 31.015873 234.62966
## Age 16.680174 202.18673
## Education 2.563741 68.75977
## Urban -1.569224 16.99182
## US 6.400241 31.12594
varImpPlot(rf.car)
# Here, again, it looks like price and shelf location are the most important
# predictors, with age and advertising also carrying some weight. Competitors'
# prices seem to have less of an impact than they did in the bagging model, while
# income seems to matter somewhat more in the random forest model than it did in
# the bagging model.
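# A minimal sketch: placing the %IncMSE columns from the bagging model and the
# m = 3 random forest side by side makes the comparison above easier to read.
cbind(bagging = importance(bag.car)[, "%IncMSE"],
      rf.m3 = importance(rf.car)[, "%IncMSE"])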
rm(list = ls()) # clear all variables from this problem
######################
# Problem 10
# We now use boosting to predict Salary in the Hitters data set.
# Part a: Remove the observations for whom the salary information is unknown, and
# then log-transform the salaries.
library(ISLR)
library(tree)
sum(is.na(Hitters))
## [1] 59
sum(is.na(Hitters$Salary))
## [1] 59
hitters <- na.omit(Hitters)
# Because there are 59 NAs in Salary and only 59 NAs in the entire Hitters data
# set, all of the missing values are in Salary, so omitting rows with NAs removes
# exactly the observations with unknown salaries and nothing else.
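# A minimal sketch: counting NAs per column confirms that Salary is the only
# variable with missing values, so na.omit() drops exactly those 59 rows.
colSums(is.na(Hitters))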
sum(is.na(hitters$Salary)) # Verify that there are no more NAs
## [1] 0
hitters$Salary <- log(hitters$Salary)
# Part b: Create a training set consisting of the first 200 observations, and a
# test set consisting of the remaining observations.
train <- 1:200
hitters.train <- hitters[train, ]
hitters.test <- hitters[-train, ]
# Part c: Perform boosting on the training set with 1,000 trees for a range of
# values of the shrinkage parameter lambda. Produce a plot with different shrinkage
# values on the x-axis and the corresponding training set MSE on the y-axis.
library(gbm)
## Loading required package: survival
## Loading required package: lattice
## Loading required package: splines
## Loading required package: parallel
## Loaded gbm 2.1.3

set.seed(1)
boost.hitters <- gbm(Salary ~ ., data = hitters.train, distribution = "gaussian",
n.trees = 1000)
summary(boost.hitters)

## var rel.inf
## CAtBat CAtBat 43.47416260
## CHits CHits 15.48904323
## CWalks CWalks 13.61631801
## CRuns CRuns 11.61751925
## CRBI CRBI 10.89695717
## CHmRun CHmRun 1.55981216
## RBI RBI 1.02130939
## Hits Hits 0.97488648
## Years Years 0.79991936
## AtBat AtBat 0.41816217
## Walks Walks 0.09783338
## Runs Runs 0.03407680
## HmRun HmRun 0.00000000
## League League 0.00000000
## Division Division 0.00000000
## PutOuts PutOuts 0.00000000
## Assists Assists 0.00000000
## Errors Errors 0.00000000
## NewLeague NewLeague 0.00000000
# It looks like important variables include CAtBat (career at bats), CHits (career
# hits), CWalks (career walks), CRuns (career runs), and CRBI (career RBI).
grid <- 10^seq(-10, 0, by = 0.05) # shrinkage values from 1e-10 to 1
mse.train <- vector(mode = "numeric")
for (k in 1:length(grid)) {
  boost.hitters.train <- gbm(Salary ~ ., data = hitters.train,
                             distribution = "gaussian", n.trees = 1000,
                             shrinkage = grid[k])
  pred.boost.train <- predict(boost.hitters.train, newdata = hitters.train,
                              n.trees = 1000)
  mse.train[k] <- mean((pred.boost.train - hitters.train$Salary)^2)
}
plot(grid, mse.train, ylab = "Training MSE", xlab = "Shrinkage Values",
type = "b")

min(mse.train)
## [1] 0.0007340565
grid[which.min(mse.train)]
## [1] 0.8912509
# So the smallest training MSE is 0.0007340565, and that occurs when the shrinkage
# value is 0.8912509. The training MSE keeps shrinking as the shrinkage parameter
# grows because, with 1,000 trees, a larger shrinkage lets boosting fit the
# training data ever more closely; this says nothing about test performance.
# Part d: Produce a plot with different shrinkage values on the x-axis and
# the corresponding test set MSE on the y-axis.
set.seed(1)
mse.test <- vector(mode = "numeric")
for (k in 1:length(grid)) {
  boost.hitters.test <- gbm(Salary ~ ., data = hitters.train,
                            distribution = "gaussian", n.trees = 1000,
                            shrinkage = grid[k])
  pred.boost.test <- predict(boost.hitters.test, newdata = hitters.test,
                             n.trees = 1000)
  mse.test[k] <- mean((pred.boost.test - hitters.test$Salary)^2)
}
plot(grid, mse.test, ylab = "Test MSE", xlab = "Shrinkage Values",
type = "b")

min(mse.test)
## [1] 0.2504916
grid[which.min(mse.test)]
## [1] 0.07943282
# The smallest test MSE is 0.2504916, and that occurs when the shrinkage parameter
# is 0.07943282.
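# A minimal sketch: because the grid is logarithmic, overlaying the training and
# test MSE on a log-scaled shrinkage axis makes the pattern easier to see - the
# training error keeps falling while the test error typically bottoms out.
plot(grid, mse.test, log = "x", type = "l", xlab = "Shrinkage (log scale)",
     ylab = "MSE", ylim = range(c(mse.train, mse.test)))
lines(grid, mse.train, lty = 2)
legend("topright", legend = c("Test MSE", "Training MSE"), lty = c(1, 2))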
# Part e: Compare the test MSE of boosting to the test MSE that results from
# applying the regression approach seen in chapter 3.
hitters.lm <- lm(Salary ~ ., data = hitters.train)
plot(hitters.lm)




hitters.lm.pred <- predict(hitters.lm, newdata = hitters.test)
mean((hitters.lm.pred - hitters.test$Salary)^2)
## [1] 0.4917959
plot(grid, mse.test, ylab = "Test MSE", xlab = "Shrinkage Values", type = "b")
abline(h = 0.4917959) # horizontal line at the linear model's test MSE

# So the test MSE for the linear model is 0.4917959 - nearly twice the smallest
# test MSE for boosting. The horizontal line in the plot marks the linear model's
# test MSE.
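# A minimal sketch: refitting the boosted model at the best shrinkage value found
# above and scoring it on the test set confirms the comparison directly (the exact
# number can differ slightly from the loop's value because the random-number
# stream is not identical).
set.seed(1)
boost.best <- gbm(Salary ~ ., data = hitters.train, distribution = "gaussian",
                  n.trees = 1000, shrinkage = grid[which.min(mse.test)])
mean((predict(boost.best, newdata = hitters.test, n.trees = 1000) -
        hitters.test$Salary)^2)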
# Part f: Which variables appear to be the most important predictors in the
# boosted model?
summary(boost.hitters)

## var rel.inf
## CAtBat CAtBat 43.47416260
## CHits CHits 15.48904323
## CWalks CWalks 13.61631801
## CRuns CRuns 11.61751925
## CRBI CRBI 10.89695717
## CHmRun CHmRun 1.55981216
## RBI RBI 1.02130939
## Hits Hits 0.97488648
## Years Years 0.79991936
## AtBat AtBat 0.41816217
## Walks Walks 0.09783338
## Runs Runs 0.03407680
## HmRun HmRun 0.00000000
## League League 0.00000000
## Division Division 0.00000000
## PutOuts PutOuts 0.00000000
## Assists Assists 0.00000000
## Errors Errors 0.00000000
## NewLeague NewLeague 0.00000000
# It appears that the number of times at bat over a player's career (CAtBat) is by
# far the most important factor. The number of career hits (CHits), career walks
# (CWalks), career runs (CRuns), and career runs batted in (CRBI) are also
# important. However, for the single 1986 season, the number of home runs (HmRun),
# the league a player plays in (League), the division a player plays in (Division),
# the number of put outs (PutOuts), the number of assists (Assists), the number of
# errors (Errors), and any movement between leagues between 1986 and 1987
# (NewLeague) appear to have essentially no influence on a player's salary -
# or, at least, they contribute nothing under this particular boosted model.