p = seq(0, 1, 0.01)
# Gini index, cross-entropy, and classification error rate for a two-class node
gini = p * (1 - p) * 2
entropy = -(p * log(p) + (1 - p) * log(1 - p))
class.err = 1 - pmax(p, 1 - p)
matplot(p, cbind(gini, entropy, class.err), type = "l", lty = 1,
        col = c("pink", "red", "purple"), ylab = "Impurity measure")
legend("topleft", legend = c("Gini", "Cross-entropy", "Class. error"),
       col = c("pink", "red", "purple"), lty = 1)
library(tree)
library(ISLR)
attach(Carseats)
set.seed(1)
train <- sample(1:nrow(Carseats), nrow(Carseats)/2)
car.train <- Carseats[train, ]
car.test <- Carseats[-train, ]
#(b) Fit a regression tree to the training set. Plot the tree, and interpret the results. What test MSE do you obtain?
car.tree <- tree(Sales ~ ., data = car.train)
plot(car.tree)
text(car.tree, pretty=0)
summary(car.tree)
##
## Regression tree:
## tree(formula = Sales ~ ., data = car.train)
## Variables actually used in tree construction:
## [1] "ShelveLoc" "Price" "Age" "Advertising" "CompPrice"
## [6] "US"
## Number of terminal nodes: 18
## Residual mean deviance: 2.167 = 394.3 / 182
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -3.88200 -0.88200 -0.08712 0.00000 0.89590 4.09900
car.pred <- predict(car.tree, newdata = car.test)
mean((car.pred - car.test$Sales)^2)
## [1] 4.922039
Based on the results above, the test MSE is approximately 4.92.
#(c) Use cross-validation in order to determine the optimal level of tree complexity. Does pruning the tree improve the test MSE?
set.seed(1)
car.cv <- cv.tree(car.tree)
par(mfrow = c(1, 2))
plot(car.cv$size, car.cv$dev, type = "b")
plot(car.cv$k, car.cv$dev, type = "b")
par(mfrow = c(1,1))
prune.car <- prune.tree(car.tree, best = 7)
plot(prune.car)
text(prune.car, pretty = 0)
Based on the cross-validation plot, a tree with 7 terminal nodes looks like a good choice.
predict.prune <- predict(prune.car, newdata = car.test)
mean((predict.prune - car.test$Sales)^2)
## [1] 4.861001
Based on these results, pruning the tree improves the test MSE slightly: it drops from about 4.92 to 4.86.
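As a cross-check (a minimal sketch using the car.cv object above), the size with the lowest cross-validated deviance can be extracted programmatically rather than read off the plot; depending on the run it may not equal 7.
# Size with the smallest cross-validated deviance (may be the full tree)
best.size <- car.cv$size[which.min(car.cv$dev)]
best.size
prune.best <- prune.tree(car.tree, best = best.size)
pred.best <- predict(prune.best, newdata = car.test)
mean((pred.best - car.test$Sales)^2)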
#(d) Use the bagging approach in order to analyze this data. What test MSE do you obtain? Use the importance() function to determine which variables are most important.
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
set.seed(1)
bag.car <- randomForest(Sales ~ ., data = Carseats, subset = train, mtry = 10,
importance = TRUE)
bag.car
##
## Call:
## randomForest(formula = Sales ~ ., data = Carseats, mtry = 10, importance = TRUE, subset = train)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 10
##
## Mean of squared residuals: 2.889221
## % Var explained: 63.26
predict.bag <- predict(bag.car, newdata = car.test)
mean((predict.bag - car.test$Sales)^2)
## [1] 2.605253
Setting mtry = 10 means all 10 predictors are considered at every split, which is exactly bagging. The resulting test MSE (about 2.61) is lower than for both the pruned and the unpruned regression tree. This is expected: bagging keeps the low bias of deep, unpruned trees while reducing their variance by averaging the predictions of many trees grown on bootstrapped samples (illustrated in the sketch after the variable-importance output below).
importance(bag.car)
## %IncMSE IncNodePurity
## CompPrice 24.8888481 170.182937
## Income 4.7121131 91.264880
## Advertising 12.7692401 97.164338
## Population -1.8074075 58.244596
## Price 56.3326252 502.903407
## ShelveLoc 48.8886689 380.032715
## Age 17.7275460 157.846774
## Education 0.5962186 44.598731
## Urban 0.1728373 9.822082
## US 4.2172102 18.073863
varImpPlot(bag.car)
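The importance measures above show that Price and ShelveLoc are by far the most important predictors. To illustrate the variance-reduction argument, here is a minimal sketch (using predict.all = TRUE on the bag.car fit and the test set above) of how the test MSE falls as predictions from more and more bagged trees are averaged.
# Per-tree predictions for the test set (one column per bagged tree)
all.pred <- predict(bag.car, newdata = car.test, predict.all = TRUE)
n.trees <- ncol(all.pred$individual)
# Test MSE when averaging only the first B trees
mse.by.B <- sapply(1:n.trees, function(B) {
  avg.pred <- rowMeans(all.pred$individual[, 1:B, drop = FALSE])
  mean((avg.pred - car.test$Sales)^2)
})
plot(1:n.trees, mse.by.B, type = "l",
     xlab = "Number of trees averaged", ylab = "Test MSE")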
#(e) Use random forests to analyze this data. What test MSE do you obtain? Use the importance() function to determine which variables are most important. Describe the effect of m, the number of variables considered at each split, on the error rate obtained.
set.seed(1)
rf.car1 <- randomForest(Sales ~ ., data = Carseats, subset = train, mtry = 1,
importance = TRUE)
set.seed(1)
pred.rf1 <- predict(rf.car1, newdata = car.test)
mean((pred.rf1 - car.test$Sales)^2)
## [1] 4.747522
When m = 1, the test MSE is 4.75.
set.seed(1)
rf.car2 <- randomForest(Sales ~ ., data = Carseats, subset = train, mtry = 2,
importance = TRUE)
set.seed(1)
pred.rf2 <- predict(rf.car2, newdata = car.test)
mean((pred.rf2 - car.test$Sales)^2)
## [1] 3.401523
When m = 2, the test MSE is 3.40, which is lower than for m = 1.
set.seed(1)
rf.car <- randomForest(Sales ~ ., data = Carseats, subset = train, mtry = 3,
importance = TRUE)
set.seed(1)
pred.rf <- predict(rf.car, newdata = car.test)
mean((pred.rf - car.test$Sales)^2)
## [1] 2.960559
With m = 3, the test MSE of 2.96 is larger than for bagging, but lower than for both the pruned tree and the original tree.
set.seed(1)
rf.car4 <- randomForest(Sales ~ ., data = Carseats, subset = train, mtry = 4,
importance = TRUE)
set.seed(1)
pred.rf4 <- predict(rf.car4, newdata = car.test)
mean((pred.rf4 - car.test$Sales)^2)
## [1] 2.787584
set.seed(1)
rf.car5 <- randomForest(Sales ~ ., data = Carseats, subset = train, mtry = 5,
importance = TRUE)
set.seed(1)
pred.rf5 <- predict(rf.car5, newdata = car.test)
mean((pred.rf5 - car.test$Sales)^2)
## [1] 2.714168
set.seed(1)
rf.car6 <- randomForest(Sales ~ ., data = Carseats, subset = train, mtry = 6,
importance = TRUE)
set.seed(1)
pred.rf6 <-predict(rf.car6, newdata = car.test)
mean((pred.rf6 - car.test$Sales)^2)
## [1] 2.667767
set.seed(1)
rf.car7 <- randomForest(Sales ~ ., data = Carseats, subset = train, mtry = 7,
importance = TRUE)
set.seed(1)
pred.rf7 <-predict(rf.car7, newdata = car.test)
mean((pred.rf7 - car.test$Sales)^2)
## [1] 2.678559
set.seed(1)
rf.car8 <- randomForest(Sales ~ ., data = Carseats, subset = train, mtry = 8,
importance = TRUE)
set.seed(1)
pred.rf8 <-predict(rf.car8, newdata = car.test)
mean((pred.rf8 - car.test$Sales)^2)
## [1] 2.647116
set.seed(1)
rf.car9 <- randomForest(Sales ~ ., data = Carseats, subset = train, mtry = 9,
importance = TRUE)
set.seed(1)
pred.rf9 <-predict(rf.car9, newdata = car.test)
mean((pred.rf9 - car.test$Sales)^2)
## [1] 2.590855
Continuing down the results, the test MSE generally decreases as m increases, approaching the bagging result. A compact way to run this sweep is sketched below.
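A minimal sketch (same seed, training indices, and test set as above) that loops m from 1 to 10 and plots the resulting test MSE:
mtry.vals <- 1:10
mse.by.m <- sapply(mtry.vals, function(m) {
  set.seed(1)
  rf.fit <- randomForest(Sales ~ ., data = Carseats, subset = train, mtry = m)
  mean((predict(rf.fit, newdata = car.test) - car.test$Sales)^2)
})
plot(mtry.vals, mse.by.m, type = "b",
     xlab = "m (predictors tried at each split)", ylab = "Test MSE")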
importance(rf.car)
## %IncMSE IncNodePurity
## CompPrice 14.8840765 158.82956
## Income 4.3293950 125.64850
## Advertising 8.2215192 107.51700
## Population -0.9488134 97.06024
## Price 34.9793386 385.93142
## ShelveLoc 34.9248499 298.54210
## Age 14.3055912 178.42061
## Education 1.3117842 70.49202
## Urban -1.2680807 17.39986
## US 6.1139696 33.98963
varImpPlot(rf.car)
#(a) Remove the observations for whom the salary information is unknown, and then log-transform the salaries.
library(ISLR)
library(tree)
sum(is.na(Hitters))
## [1] 59
sum(is.na(Hitters$Salary))
## [1] 59
hitters <- na.omit(Hitters)
Since all 59 NAs in the Hitters data set occur in the Salary column, na.omit() removes only the observations with missing salaries and keeps everything else. The next chunk of code verifies that no NAs remain.
sum(is.na(hitters$Salary))
## [1] 0
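As an extra check (a minimal sketch on the original data), the NA counts can be broken down by column to confirm that they all come from Salary:
# NA count per column of the original Hitters data; only Salary should be non-zero
colSums(is.na(Hitters))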
hitters$Salary <- log(hitters$Salary)
#(b) Create a training set consisting of the first 200 observations, and a test set consisting of the remaining observations
train <- 1:200
hitters.train <- hitters[train, ]
hitters.test <- hitters[-train, ]
#(c) Perform boosting on the training set with 1,000 trees for a range of values of the shrinkage parameter λ. Produce a plot with different shrinkage values on the x-axis and the corresponding training set MSE on the y-axis.
library(gbm)
## Loaded gbm 2.1.8.1
set.seed(1)
boost.hitters <- gbm(Salary ~ ., data = hitters.train, distribution = "gaussian",
n.trees = 1000)
summary(boost.hitters)
## var rel.inf
## CAtBat CAtBat 21.3171524
## PutOuts PutOuts 10.2788377
## CRuns CRuns 9.0751254
## CRBI CRBI 7.0417974
## CHmRun CHmRun 6.1148613
## Walks Walks 6.1031023
## Assists Assists 5.0788336
## Years Years 5.0759281
## CWalks CWalks 4.3126065
## Hits Hits 4.2479119
## RBI RBI 3.8718470
## CHits CHits 3.7653292
## Runs Runs 3.6845856
## AtBat AtBat 3.4443531
## HmRun HmRun 2.9520357
## Errors Errors 2.2050936
## Division Division 0.6822530
## NewLeague NewLeague 0.5057473
## League League 0.2425989
# Grid of shrinkage (lambda) values, evenly spaced on a log scale
grid <- 10^seq(-10, 0, by = 0.05)
mse.train <- vector(mode = "numeric")
for (k in 1:length(grid)){
boost.hitters.train <- gbm(Salary ~ ., data = hitters.train,
distribution = "gaussian", n.trees = 1000,
shrinkage = grid[k])
pred.boost.train <- predict(boost.hitters.train, newdata = hitters.train,
n.trees = 1000)
mse.train[k] = mean((pred.boost.train - hitters.train$Salary)^2)
}
plot(grid, mse.train, ylab = "Training MSE", xlab = "Shrinkage Values",
type = "b")
min(mse.train)
## [1] 0.0007340565
grid[which.min(mse.train)]
## [1] 0.8912509
The smallest training MSE is 0.0007340565 when the shrinkage value is 0.8912509.
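Because the grid spans ten orders of magnitude, most of the points are compressed near zero on the linear x-axis above. A re-plot of the same data with a log-scaled x-axis (a minor variation, not part of the original output) makes the trend easier to see:
plot(grid, mse.train, log = "x", type = "b",
     xlab = "Shrinkage (log scale)", ylab = "Training MSE")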
#(d) Produce a plot with different shrinkage values on the x-axis and the corresponding test set MSE on the y-axis.
set.seed(1)
mse.test <- vector(mode = "numeric")
for (k in 1:length(grid)){
boost.hitters.test <- gbm(Salary ~ ., data = hitters.train,
distribution = "gaussian", n.trees = 1000,
shrinkage = grid[k])
pred.boost.test <- predict(boost.hitters.test, newdata = hitters.test,
n.trees = 1000)
mse.test[k] = mean((pred.boost.test - hitters.test$Salary)^2)
}
plot(grid, mse.test, ylab = "Test MSE", xlab = "Shrinkage Values",
type = "b")
min(mse.test)
## [1] 0.2504916
grid[which.min(mse.test)]
## [1] 0.07943282
The smallest test MSE is 0.2505, obtained at a shrinkage value of about 0.0794. This optimal shrinkage is much smaller than the value (about 0.89) that minimized the training MSE, which reflects the overfitting that occurs at larger shrinkage values.
#(e) Compare the test MSE of boosting to the test MSE that results from applying two of the regression approaches seen in Chapters 3 and 6.
hitters.lm <- lm(Salary ~ ., data = hitters.train)
plot(hitters.lm)
hitters.lm.pred <- predict(hitters.lm, newdata = hitters.test)
mean((hitters.lm.pred - hitters.test$Salary)^2)
## [1] 0.4917959
plot(grid, mse.test, ylab = "Test MSE", xlab = "Shrinkage Values", type = "b")
abline(0.4917959, 0)
Based on the linear model, the test MSE is 0.4918, roughly twice the smallest test MSE obtained with boosting. The horizontal line in the plot marks the linear model's test MSE.
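The exercise asks for two approaches from Chapters 3 and 6; linear regression covers Chapter 3. As a sketch of a Chapter 6 method (assuming the glmnet package is installed), ridge regression with the penalty chosen by cross-validation could be compared as well; the lasso is analogous with alpha = 1.
library(glmnet)
# Model matrices without the intercept column
x.train <- model.matrix(Salary ~ ., data = hitters.train)[, -1]
x.test  <- model.matrix(Salary ~ ., data = hitters.test)[, -1]
set.seed(1)
ridge.cv <- cv.glmnet(x.train, hitters.train$Salary, alpha = 0)
ridge.pred <- predict(ridge.cv, s = "lambda.min", newx = x.test)
mean((ridge.pred - hitters.test$Salary)^2)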
#(f) Which variables appear to be the most important predictors in the boosted model?
summary(boost.hitters)
## var rel.inf
## CAtBat CAtBat 21.3171524
## PutOuts PutOuts 10.2788377
## CRuns CRuns 9.0751254
## CRBI CRBI 7.0417974
## CHmRun CHmRun 6.1148613
## Walks Walks 6.1031023
## Assists Assists 5.0788336
## Years Years 5.0759281
## CWalks CWalks 4.3126065
## Hits Hits 4.2479119
## RBI RBI 3.8718470
## CHits CHits 3.7653292
## Runs Runs 3.6845856
## AtBat AtBat 3.4443531
## HmRun HmRun 2.9520357
## Errors Errors 2.2050936
## Division Division 0.6822530
## NewLeague NewLeague 0.5057473
## League League 0.2425989
CAtBat, the number of times at bat over a player's career, is by far the most important predictor, followed by PutOuts and CRuns.