############
# Chapter 8 Homework
# Gabrielle Allen
# Dr. Coberly
# MATH 5553

############
# Problem 8
# In the lab, a classification tree was applied to the Carseats data set after
# converting Sales into a qualitative response variable. Now we will seek to 
# predict Sales using regression trees and related approaches, treating the
# response as a quantitative variable.

# Part a: Split the data set into a training and test set.

library(tree)
library(ISLR)
attach(Carseats)

set.seed(1)
train <- sample(1:nrow(Carseats), nrow(Carseats)/2)
car.train <- Carseats[train, ]
car.test <- Carseats[-train, ]

# Part b: Fit a regression tree to the training set. Plot the tree, and interpret
# the results. What test MSE do you obtain?

car.tree <- tree(Sales ~ ., data = car.train)
plot(car.tree)
text(car.tree, pretty=0)

summary(car.tree)
## 
## Regression tree:
## tree(formula = Sales ~ ., data = car.train)
## Variables actually used in tree construction:
## [1] "ShelveLoc"   "Price"       "Age"         "Advertising" "Income"     
## [6] "CompPrice"  
## Number of terminal nodes:  18 
## Residual mean deviance:  2.36 = 429.5 / 182 
## Distribution of residuals:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -4.2570 -1.0360  0.1024  0.0000  0.9301  3.9130
car.pred <- predict(car.tree, newdata = car.test)
mean((car.pred - car.test$Sales)^2)
## [1] 4.148897
# So the test MSE for the unpruned regression tree is 4.148897.

# Part c: Use cross-validation in order to determine the optimal level of tree
# complexity. Does pruning the tree improve the test MSE?

set.seed(1)
car.cv <- cv.tree(car.tree)
par(mfrow = c(1, 2))
plot(car.cv$size, car.cv$dev, type = "b")
plot(car.cv$k, car.cv$dev, type = "b")

par(mfrow = c(1,1))

# Based on the cross-validation deviance plot, a tree with 7 terminal nodes looks
# like a good choice.
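
# As a quick programmatic check (a sketch using the car.cv object above), we can
# ask which size minimizes the CV deviance directly; if the minimum falls at the
# full tree, the plot still argues for a smaller, more parsimonious size.
car.cv$size[which.min(car.cv$dev)]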

prune.car <- prune.tree(car.tree, best = 7)
plot(prune.car)
text(prune.car, pretty = 0)

predict.prune <- predict(prune.car, newdata = car.test)
mean((predict.prune - car.test$Sales)^2)
## [1] 5.340397
# In this case, pruning the tree actually makes the test MSE worse - it rises from
# about 4.15 to 5.34. This isn't always the case: depending on how the data are
# split into training and test sets, the pruned tree can sometimes beat the
# unpruned tree, particularly when the unpruned tree has overfit the training data.
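
# As a side check (a sketch, not required by the problem), sweeping over a range
# of subtree sizes shows how the test MSE changes with the amount of pruning for
# this particular split.
prune.sizes <- 2:18
prune.mse <- sapply(prune.sizes, function(b) {
  pruned <- prune.tree(car.tree, best = b)
  mean((predict(pruned, newdata = car.test) - car.test$Sales)^2)
})
plot(prune.sizes, prune.mse, type = "b", xlab = "Subtree size (terminal nodes)",
     ylab = "Test MSE")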

# Part d: Use the bagging approach in order to analyze this data. What test MSE do
# you obtain? Use the importance() function to determine which variables are most
# important.

library(randomForest)
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
set.seed(1)
bag.car <- randomForest(Sales ~ ., data = Carseats, subset = train, mtry = 10,
                        importance = TRUE)
bag.car
## 
## Call:
##  randomForest(formula = Sales ~ ., data = Carseats, mtry = 10,      importance = TRUE, subset = train) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 10
## 
##           Mean of squared residuals: 2.825834
##                     % Var explained: 62.98
predict.bag <- predict(bag.car, newdata = car.test)
mean((predict.bag - car.test$Sales)^2)
## [1] 2.554292
# So in this case the test MSE is lower - 2.55, below both the pruned tree and the
# original regression tree. This is what we expect: by averaging many deep trees
# grown on bootstrap samples, bagging keeps their low bias while reducing their
# variance.
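
# As a sanity check (a sketch): a fitted randomForest object stores the running
# out-of-bag MSE in its mse component, and the final value should roughly agree
# with the held-out test MSE above (it matches the "Mean of squared residuals"
# printed above, 2.825834).
tail(bag.car$mse, 1)  # OOB estimate after all 500 trees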

importance(bag.car)
##               %IncMSE IncNodePurity
## CompPrice   14.032030    129.568747
## Income       5.523038     75.448682
## Advertising 13.571285    131.246840
## Population   1.968853     63.042648
## Price       56.863812    504.158108
## ShelveLoc   44.720455    323.055042
## Age         22.225468    194.915976
## Education    4.823966     40.810991
## Urban       -1.902185      8.746566
## US           6.632887     14.599565
varImpPlot(bag.car)

# As seen above, it looks like the price of the carseat and where it is located on
# the shelf are the most important predictors of how a carseat will sell. Age, 
# competitor price, and advertising budget also appear to have an effect, but all
# other variables seem to be less important.

# Part e: Use random forests to analyze this data. What test MSE do you obtain?
# Use the importance() function to determine which variables are most important.
# Describe the effect of m, the number of variables considered at each split, on 
# the error rate obtained.

set.seed(1)
rf.car1 <- randomForest(Sales ~ ., data = Carseats, subset = train, mtry = 1,
                        importance = TRUE)
set.seed(1)
pred.rf1 <- predict(rf.car1, newdata = car.test)
mean((pred.rf1 - car.test$Sales)^2)
## [1] 5.07948
# So when m = 1, the MSE is 5.07948.

set.seed(1)
rf.car2 <- randomForest(Sales ~ ., data = Carseats, subset = train, mtry = 2,
                        importance = TRUE)
set.seed(1)
pred.rf2 <- predict(rf.car2, newdata = car.test)
mean((pred.rf2 - car.test$Sales)^2)
## [1] 3.712865
# So when m = 2, the MSE is 3.712865, lower than when m = 1.

set.seed(1)
rf.car <- randomForest(Sales ~ ., data = Carseats, subset = train, mtry = 3,
                       importance = TRUE)
set.seed(1)
pred.rf <- predict(rf.car, newdata = car.test)
mean((pred.rf - car.test$Sales)^2)
## [1] 3.30763
# So the test MSE (3.30763) for the random forest with m = 3 is somewhat larger
# than it is for bagging, but lower than the MSE for both the pruned tree and the
# original tree. Note that m = 3 is roughly the square root of p = 10
# (sqrt(10) is about 3.16).

set.seed(1)
rf.car4 <- randomForest(Sales ~ ., data = Carseats, subset = train, mtry = 4,
                        importance = TRUE)
set.seed(1)
pred.rf4 <- predict(rf.car4, newdata = car.test)
mean((pred.rf4 - car.test$Sales)^2)
## [1] 2.998498
# So when m = 4, the MSE is 2.998498, lower than when m = 3. Since the square root
# of 10 (the number of predictor variables) lies between 3 and 4, m = 4 is the
# other natural "square root of p" choice.

set.seed(1)
rf.car5 <- randomForest(Sales ~ ., data = Carseats, subset = train, mtry = 5,
                        importance = TRUE)
set.seed(1)
pred.rf5 <- predict(rf.car5, newdata = car.test)
mean((pred.rf5 - car.test$Sales)^2)
## [1] 2.835178
# So when m = 5, the MSE is 2.835178, lower than when m = 4.

set.seed(1)
rf.car6 <- randomForest(Sales ~ ., data = Carseats, subset = train, mtry = 6,
                        importance = TRUE)
set.seed(1)
pred.rf6 <-predict(rf.car6, newdata = car.test)
mean((pred.rf6 - car.test$Sales)^2)
## [1] 2.786896
# So when m = 6, the MSE is 2.786896, lower than when m = 5.

set.seed(1)
rf.car7 <- randomForest(Sales ~ ., data = Carseats, subset = train, mtry = 7,
                        importance = TRUE)
set.seed(1)
pred.rf7 <-predict(rf.car7, newdata = car.test)
mean((pred.rf7 - car.test$Sales)^2)
## [1] 2.677964
# So when m = 7, the MSE is 2.677964, lower than when m = 6.

set.seed(1)
rf.car8 <- randomForest(Sales ~ ., data = Carseats, subset = train, mtry = 8,
                        importance = TRUE)
set.seed(1)
pred.rf8 <-predict(rf.car8, newdata = car.test)
mean((pred.rf8 - car.test$Sales)^2)
## [1] 2.610887
# So when m = 8, the MSE is 2.610887, lower than when m = 7.

set.seed(1)
rf.car9 <- randomForest(Sales ~ ., data = Carseats, subset = train, mtry = 9,
                        importance = TRUE)
set.seed(1)
pred.rf9 <-predict(rf.car9, newdata = car.test)
mean((pred.rf9 - car.test$Sales)^2)
## [1] 2.615076
# So when m = 9, the MSE is 2.615076, essentially the same as when m = 8 (in fact
# very slightly higher).

# In this particular case, as m grows toward 10 (the total number of predictor
# variables), the test MSE generally decreases, approaching the bagging result.
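
# A more compact way to do the sweep (a sketch, assuming the same split and seed
# as above) is to loop over m and collect the test MSEs in one pass:
m.values <- 1:10
rf.mse <- sapply(m.values, function(m) {
  set.seed(1)
  fit <- randomForest(Sales ~ ., data = Carseats, subset = train, mtry = m)
  mean((predict(fit, newdata = car.test) - car.test$Sales)^2)
})
plot(m.values, rf.mse, type = "b", xlab = "m (variables tried at each split)",
     ylab = "Test MSE")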

importance(rf.car)
##               %IncMSE IncNodePurity
## CompPrice    9.849043     134.17665
## Income       3.534622     130.84360
## Advertising 13.075334     139.40128
## Population  -0.612195     100.34668
## Price       35.530402     380.27956
## ShelveLoc   31.015873     234.62966
## Age         16.680174     202.18673
## Education    2.563741      68.75977
## Urban       -1.569224      16.99182
## US           6.400241      31.12594
varImpPlot(rf.car)

# Here, again, it looks like price and shelf location are the most important
# predictors, with age and advertising also carrying some weight. Competitors'
# prices appear to matter less than they did in the bagging model, while income
# appears to matter somewhat more in the random forest model than it did in the
# bagging model.

rm(list = ls()) # clear all variables from this problem

######################
# Problem 10
# We now use boosting to predict Salary in the Hitters data set.

# Part a: Remove the observations for whom the salary information is unknown, and 
# then log-transform the salaries.

library(ISLR)
library(tree)

sum(is.na(Hitters))
## [1] 59
sum(is.na(Hitters$Salary))
## [1] 59
hitters <- na.omit(Hitters)
# Because there are 59 NAs in the Salary column alone and only 59 NAs in the
# entire Hitters data set, we can omit all of the NAs without losing any other
# observations.
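
# A per-column check (a quick sketch) confirms that all of the NAs are in Salary:
colSums(is.na(Hitters))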

sum(is.na(hitters$Salary)) # Verify that there are no more NAs
## [1] 0
hitters$Salary <- log(hitters$Salary)

# Part b: Create a training set consisting of the first 200 observations, and a 
# test set consisting of the remaining observations.

train <- 1:200
hitters.train <- hitters[train, ]
hitters.test <- hitters[-train, ]

# Part c: Perform boosting on the training set with 1,000 trees for a range of 
# values of the shrinkage parameter lambda. Produce a plot with different shrinkage
# values on the x-axis and the corresponding training set MSE on the y-axis.


library(gbm)
## Loading required package: survival
## Loading required package: lattice
## Loading required package: splines
## Loading required package: parallel
## Loaded gbm 2.1.3

set.seed(1)

boost.hitters <- gbm(Salary ~ ., data = hitters.train, distribution = "gaussian",
                    n.trees = 1000)
summary(boost.hitters)

##                 var     rel.inf
## CAtBat       CAtBat 43.47416260
## CHits         CHits 15.48904323
## CWalks       CWalks 13.61631801
## CRuns         CRuns 11.61751925
## CRBI           CRBI 10.89695717
## CHmRun       CHmRun  1.55981216
## RBI             RBI  1.02130939
## Hits           Hits  0.97488648
## Years         Years  0.79991936
## AtBat         AtBat  0.41816217
## Walks         Walks  0.09783338
## Runs           Runs  0.03407680
## HmRun         HmRun  0.00000000
## League       League  0.00000000
## Division   Division  0.00000000
## PutOuts     PutOuts  0.00000000
## Assists     Assists  0.00000000
## Errors       Errors  0.00000000
## NewLeague NewLeague  0.00000000
# It looks like important variables include CAtBat (career at bats), CHits (career
# hits), CWalks (career walks), CRuns (career runs), and CRBI (career RBI).

grid <- 10^seq(-10, 0, by = 0.05)

mse.train <- vector(mode = "numeric")
for (k in 1:length(grid)){
  boost.hitters.train <- gbm(Salary ~ ., data = hitters.train, 
                             distribution = "gaussian", n.trees = 1000, 
                             shrinkage = grid[k])
  pred.boost.train <- predict(boost.hitters.train, newdata = hitters.train, 
                              n.trees = 1000)
  mse.train[k] = mean((pred.boost.train - hitters.train$Salary)^2)
}
plot(grid, mse.train, ylab = "Training MSE", xlab = "Shrinkage Values", 
     type = "b")

min(mse.train)
## [1] 0.0007340565
grid[which.min(mse.train)]
## [1] 0.8912509
# So the smallest training MSE is 0.0007340565, and that occurs when the shrinkage
# value is 0.8912509.

# Part d: Produce a plot with different shrinkage values on the x-axis and 
# the corresponding test set MSE on the y-axis.

set.seed(1)
mse.test <- vector(mode = "numeric")
for (k in 1:length(grid)){
  boost.hitters.test <- gbm(Salary ~ ., data = hitters.train, 
                            distribution = "gaussian", n.trees = 1000, 
                            shrinkage = grid[k])
  pred.boost.test <- predict(boost.hitters.test, newdata = hitters.test, 
                              n.trees = 1000)
  mse.test[k] = mean((pred.boost.test - hitters.test$Salary)^2)
}
plot(grid, mse.test, ylab = "Test MSE", xlab = "Shrinkage Values", 
     type = "b")

min(mse.test)
## [1] 0.2504916
grid[which.min(mse.test)]
## [1] 0.07943282
# The smallest test MSE is 0.2504916, and that occurs when the shrinkage parameter 
# is 0.07943282.
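
# Plotting the training and test curves together (a sketch using the mse.train
# and mse.test vectors computed above) makes the contrast clear: the training MSE
# falls to nearly zero at large lambda, while the test MSE bottoms out at a
# moderate lambda and then climbs again.
plot(grid, mse.test, type = "b", col = "red",
     ylim = range(c(mse.train, mse.test)), xlab = "Shrinkage Values",
     ylab = "MSE")
lines(grid, mse.train, type = "b", col = "blue")
legend("topright", legend = c("Test MSE", "Training MSE"),
       col = c("red", "blue"), lty = 1)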

# Part e: Compare the test MSE of boosting to the test MSE that results from
# applying the regression approach seen in chapter 3.

hitters.lm <- lm(Salary ~ ., data = hitters.train)
plot(hitters.lm)

hitters.lm.pred <- predict(hitters.lm, newdata = hitters.test)
mean((hitters.lm.pred - hitters.test$Salary)^2)
## [1] 0.4917959
plot(grid, mse.test, ylab = "Test MSE", xlab = "Shrinkage Values", type = "b")
abline(0.4917959, 0)

# So the test MSE for the linear model is 0.4917959 - nearly twice the smallest
# test MSE for boosting. The horizontal line in the plot marks the linear model's
# MSE for comparison.

# Part f: Which variables appear to be the most important predictors in the 
# boosted model?

summary(boost.hitters)

##                 var     rel.inf
## CAtBat       CAtBat 43.47416260
## CHits         CHits 15.48904323
## CWalks       CWalks 13.61631801
## CRuns         CRuns 11.61751925
## CRBI           CRBI 10.89695717
## CHmRun       CHmRun  1.55981216
## RBI             RBI  1.02130939
## Hits           Hits  0.97488648
## Years         Years  0.79991936
## AtBat         AtBat  0.41816217
## Walks         Walks  0.09783338
## Runs           Runs  0.03407680
## HmRun         HmRun  0.00000000
## League       League  0.00000000
## Division   Division  0.00000000
## PutOuts     PutOuts  0.00000000
## Assists     Assists  0.00000000
## Errors       Errors  0.00000000
## NewLeague NewLeague  0.00000000
# It appears that the number of times at bat over a player's career (CAtBat) is by
# far the most important variable. The number of career hits (CHits), career walks
# (CWalks), career runs (CRuns), and career runs batted in (CRBI) are also
# important. However, for the 1986 season itself, the number of home runs (HmRun),
# the league a player plays in (League), the division a player plays in (Division),
# the number of put outs (PutOuts), the number of assists (Assists), the number of
# errors (Errors), and the player's league at the start of 1987 (NewLeague) appear
# to have no effect at all on salary - or at least they do not under this
# particular boosted model.