Notebook prepared by Everton Lima

Introduction to Statistical Learning Solutions (ISLR)

Ch 8 Exercises

Table of Contents

Conceptual Exercises

Applied Exercises

Conceptual Exercises

1

pts=data.frame(c(25,25,75,75,70,80),c(75,25,75,50,25,25))

plot(pts,xlim=c(0,100),ylim=c(0,110),pch="",xlab="X1",ylab="X2",xaxt='n',yaxt='n')

lines(x = c(50,50), y = c(0,100))
lines(x = c(0,50), y = c(65,65))
lines(x = c(50,100), y = c(60,60))
lines(x = c(50,100), y = c(40,40))
lines(x = c(75,75), y = c(1,40))

text(x = 50, y = 108, labels = c("t1"), col = "red")
text(x = 0, y = 70, labels = c("t2"), col = "red")
text(x = 100, y = 65, labels = c("t3"), col = "red")
text(x = 100, y = 45, labels = c("t4"), col = "red")
text(x = 75, y = 1, labels = c("t5"), col = "red")

text(pts,labels=paste("R",1:6,sep=""))

In two dimensions, a tree corresponds to a partition of the predictor space into rectangular regions, each of which corresponds to a leaf node.

2

If only stumps are considered, then for a predictor \(X_j\) the corresponding stump takes the form: \[\hat{f}_j(x_j)=\beta_0+I(K_j<x_j)\beta_1\]

Boosting fits several stumps over the set of predictors, and a previously used predictor may be chosen again. If another stump is created for a previously chosen predictor \(X_j\), then we can write \(\hat{f}_j(x_j)\) as the sum of the two stumps: \[\hat{f}_j(x_j)=\hat{f}_j(x_j)_1 +\hat{f}_j(x_j)_2=\dot{\beta_0}+I(K_{j1}<x_j)\beta_1+I(K_{j2}<x_j)\beta_2\]

which, if \(K_{j1} < K_{j2}\), is equivalent to \[\dot{\beta_0}+I(K_{j1}<x_j<K_{j2})\beta_1+I(K_{j2}<x_j)(\beta_1+\beta_2)\]

This can be seen as adding a branch to the stump. Thus, since every function depends on a single predictor, the model takes the additive form \(f(X)=\sum^p_{j=1}f_j(X_j)\), where \(f_j(X_j)=\lambda\hat{f}_j(x_j)\) and \(\lambda\) is the shrinkage parameter.
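The additive structure can also be seen empirically with a small simulation (an illustrative sketch, not part of the exercise; the data-generating functions and object names below are made up). With interaction.depth = 1 every tree fitted by gbm is a stump on a single predictor, so the boosted model is a sum of one-variable functions and the marginal plots recover each component separately.

require(gbm)

set.seed(1)
n=500
x1=runif(n)
x2=runif(n)
# Additive truth: y = f1(x1) + f2(x2) + noise
y=sin(2*pi*x1)+(x2-0.5)^2+rnorm(n,sd=0.1)
dat=data.frame(y,x1,x2)

# interaction.depth = 1 restricts every tree to a single split (a stump)
fit=gbm(y~x1+x2,data=dat,distribution='gaussian',
        n.trees=2000,shrinkage=0.01,interaction.depth=1)

# Marginal (partial dependence) plots of the fitted model, one per predictor
plot(fit,i.var=1,n.trees=2000)
plot(fit,i.var=2,n.trees=2000)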

3

p=seq(0,1,0.01)

gini= 2*p*(1-p)
classerror= 1-pmax(p,1-p)
crossentropy= -(p*log(p)+(1-p)*log(1-p))

plot(NA,NA,xlim=c(0,1),ylim=c(0,1),xlab='p',ylab='f')

lines(p,gini,type='l')
lines(p,classerror,col='blue')
lines(p,crossentropy,col='red')

legend(x='top',legend=c('gini','class error','cross entropy'),
       col=c('black','blue','red'),lty=1,text.width = 0.22)

4

4a

This problem is best done on a piece of paper. The resulting tree has 5 leaf nodes, one for each rectangular region of the graph.

4b

X1=seq(0,3,0.1)
X2=seq(0,3,0.1)

plot(NA,NA,xlim=c(-1.2,3),ylim=c(-0.2,3),pch="",xlab="X1",ylab="X2",xaxt='n',yaxt='n')
pts=data.frame(c(1.5,-0.5,1.5,-0.25,1.50),c(2.5,1.5,1.5,0.5,0.5))

text(pts,labels=c('2.49','-1.06','0.21','-1.80','0.63'))
text(x = 1, y = -0.2, labels = c("1"), col = "red")
text(x = 0, y = 0.8, labels = c("0"), col = "red")
text(x = -1.2, y = 1, labels = c("1"), col = "red")
text(x = -1.2, y = 2, labels = c("2"), col = "red")
 
lines(x = c(-1,3), y = c(1,1))
lines(x = c(-1,3), y = c(2,2))
lines(x = c(0,0), y = c(1,2))
lines(x = c(1,1), y = c(0,1))

5

In the majority vote approach we assign the observation to the class chosen most often by the individual trees. Here we count, using a cutoff of 0.5, how many of the ten estimates classify the observation as Red. As shown below, X is assigned to the Red class (6 votes to 4).

mjr=c(0.1,0.15,0.2,0.2,0.55,0.6,0.6,0.65,0.7,0.75)
table(mjr>0.5)
## 
## FALSE  TRUE 
##     4     6

As described in the question, when the average probability is used we average the ten estimated probabilities produced by the bagged trees. The average is 0.45, which is below 0.5, so X is not assigned to the Red class.

avg=mean(mjr)
avg
## [1] 0.45

6

A regression tree partitions the predictor space by recursive binary splitting. At each step the algorithm considers every possible split on every predictor and chooses the split that achieves the greatest reduction in the RSS. This is repeated until a stopping criterion is met (for example, a minimum number of observations in each node or a minimum improvement in RSS). When predictions are made, a new observation travels down the branches of the tree, and upon reaching a leaf node the average of the training responses contained in that leaf is returned.
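To make this concrete, here is a minimal sketch (not part of the exercise) that fits a regression tree with the tree package to the Boston data, both used elsewhere in this notebook; the object names and control settings are only illustrative.

require(tree)   # recursive binary splitting
require(MASS)   # Boston data

set.seed(1)
idx=sample(1:nrow(Boston),nrow(Boston)/2)

# Each split is chosen greedily as the one giving the largest reduction in RSS;
# growth stops once the control thresholds (minimum node size, minimum deviance
# reduction) can no longer be met.
boston.tree=tree(medv~.,data=Boston,subset=idx,
                 control=tree.control(nobs=length(idx),minsize=10,mindev=0.01))

# A new observation travels down the tree and receives the mean response of the
# training observations in the leaf it reaches.
boston.pred=predict(boston.tree,Boston[-idx,])
mean((boston.pred-Boston$medv[-idx])^2)   # hold-out test MSE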

Applied Exercises

7

require(randomForest)
## Loading required package: randomForest
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
require(knitr)
## Loading required package: knitr
require(MASS)
## Loading required package: MASS
set.seed(1)
train=sample(1:nrow(Boston),nrow(Boston)/2)

train.rf=function(m.mtry=6,m.ntree=25){
  rf.boston=randomForest(medv~.,data=Boston,subset=train,mtry=m.mtry,ntree=m.ntree)
  rf.pred=predict(rf.boston,Boston[-train,])
  mse=mean((rf.pred-Boston[-train,'medv'])^2)
  mse
}
# This may take a few minutes.

mse=c()
m=c()

min.mtry=10
max.mtry=16

min.ntree=80
max.ntree=100

for(mtry in min.mtry:max.mtry){
  for(ntree in min.ntree:max.ntree){
    mse=rbind(mse,train.rf(mtry,ntree))
  }
    m=cbind(m,mse)
    mse=c()
}

rownames(m)=paste('ntree',min.ntree:max.ntree)
colnames(m)=paste('mtry',min.mtry:max.mtry)
kable(m)
mtry 10 mtry 11 mtry 12 mtry 13 mtry 14 mtry 15 mtry 16
ntree 80 21253.00 21255.46 21256.60 21254.51 21266.45 21263.31 21248.03
ntree 81 21259.58 21248.02 21247.25 21255.23 21259.82 21256.71 21257.66
ntree 82 21252.62 21252.46 21257.66 21256.55 21258.38 21253.52 21254.67
ntree 83 21261.70 21256.26 21252.17 21259.09 21254.70 21255.21 21252.04
ntree 84 21257.95 21250.39 21256.35 21252.84 21254.71 21257.51 21254.78
ntree 85 21254.83 21252.63 21260.12 21253.57 21257.85 21253.35 21254.40
ntree 86 21250.27 21261.40 21257.26 21258.94 21258.78 21255.56 21253.93
ntree 87 21266.40 21255.95 21257.92 21254.47 21255.26 21251.51 21253.59
ntree 88 21250.06 21251.44 21262.93 21254.97 21255.09 21255.29 21252.46
ntree 89 21260.38 21245.53 21256.82 21257.45 21258.35 21253.71 21256.94
ntree 90 21257.75 21255.81 21259.01 21257.02 21253.94 21250.75 21254.97
ntree 91 21263.07 21257.93 21251.67 21257.78 21255.10 21248.67 21255.10
ntree 92 21262.10 21251.81 21253.94 21255.26 21256.90 21253.74 21252.68
ntree 93 21252.34 21256.05 21252.98 21252.89 21261.52 21249.11 21256.05
ntree 94 21261.40 21251.93 21257.62 21251.49 21259.29 21259.00 21256.44
ntree 95 21254.47 21250.77 21249.82 21254.59 21256.67 21248.67 21251.23
ntree 96 21248.51 21261.74 21250.46 21256.93 21255.23 21249.72 21253.93
ntree 97 21253.65 21257.56 21250.37 21258.00 21256.40 21245.51 21263.49
ntree 98 21254.40 21252.46 21259.18 21262.46 21254.21 21256.51 21257.26
ntree 99 21258.71 21258.61 21254.59 21258.08 21255.51 21262.09 21258.87
ntree 100 21256.51 21256.90 21253.87 21260.11 21257.27 21258.65 21255.76
cols=rainbow(5)

plot(min.ntree:max.ntree,m[,1],col=cols[1],type='l',ylab='Hold out test error',xlab='ntree',lty=1)

lines(min.ntree:max.ntree,m[,2],col=cols[2],lty=2)
lines(min.ntree:max.ntree,m[,3],col=cols[3],lty=3)
lines(min.ntree:max.ntree,m[,4],col=cols[4],lty=4)
lines(min.ntree:max.ntree,m[,5],col=cols[5],lty=5)

legend(x='topright',legend=paste('mtry',min.mtry:(min.mtry+4)),col=cols,lty=1:5,ncol=1,text.width = 3)

plot(min.mtry:max.mtry,m[1,],col=cols[1],type='l',
     ylab='Hold out test error',xlab='mtry',ylim = c(21245,21270),lty=1)

lines(min.mtry:max.mtry,m[2,],col=cols[2],lty=2)
lines(min.mtry:max.mtry,m[3,],col=cols[3],lty=3)
lines(min.mtry:max.mtry,m[4,],col=cols[4],lty=4)
lines(min.mtry:max.mtry,m[5,],col=cols[5],lty=5)

legend(x='topright',legend=paste('ntree',min.ntree:(min.ntree+4)),col=cols,lty=1:5,ncol=1,text.width = 1)

8

8a

require(ISLR)
## Loading required package: ISLR
require(tree)
## Loading required package: tree
set.seed(42)
train=sample(1:nrow(Carseats),nrow(Carseats)/2)

8b

tree.carseats=tree(formula=Sales~.,data=Carseats,subset = train)
tree.pred=predict(tree.carseats,Carseats[-train,])

mean((tree.pred-Carseats[-train,'Sales'])^2)
## [1] 4.092899
plot(tree.carseats)
text(tree.carseats)

8c

tree.carseats.cv=cv.tree(tree.carseats) 
plot(tree.carseats.cv)

prune.carseats=prune.tree(tree.carseats,best=5)

plot(prune.carseats)
text(prune.carseats) 

tree.pred=predict(prune.carseats,Carseats[-train,])
mean((tree.pred-Carseats[-train,'Sales'])^2)
## [1] 4.991476
plot(tree.pred,Carseats[-train,'Sales'],xlab='prediction',ylab='actual')
abline(0,1)

From the MSE values you can see that pruning does not improve the test MSE: it increases from about 4.09 for the unpruned tree to about 4.99.

8d

require(randomForest)

d=ncol(Carseats)-1

set.seed(42)
carseats.rf=randomForest(Sales~.,data=Carseats,subset=train,mtry=d,importance=T,ntree=100)

tree.pred=predict(carseats.rf,Carseats[-train,])
mean((tree.pred-Carseats[-train,'Sales'])^2)
## [1] 2.750273
plot(carseats.rf)

varImpPlot(carseats.rf)

kable(importance(carseats.rf))
%IncMSE IncNodePurity
CompPrice 11.5170922 148.297954
Income 3.3430145 69.192503
Advertising 6.2062812 88.251686
Population -0.2522175 63.025904
Price 25.4921858 442.200618
ShelveLoc 25.8775065 422.649283
Age 7.0625363 132.928876
Education -1.3343248 30.809743
Urban -1.5756286 9.061747
US 1.3535718 7.374875

Price and ShelveLoc are clearly the most important predictors of Sales. This bagging model also achieves a much lower test MSE than the single tree, cutting it nearly in half relative to the pruned tree (2.75 versus 4.99).

8e

mse=c()

set.seed(42)

for(i in 3:10){
  carseats.rf=randomForest(Sales~.,data=Carseats,subset=train,mtry=i,importance=T,ntree=100)

  tree.pred=predict(carseats.rf,Carseats[-train,])
  mse=rbind(mse,mean((tree.pred-Carseats[-train,'Sales'])^2))
}
plot(3:10,mse,type='b')

The plot above displays the effect of mtry on the test MSE.

require(randomForest)

set.seed(42)
carseats.rf=randomForest(Sales~.,data=Carseats,subset=train,mtry=9,importance=T,ntree=100)

plot(carseats.rf)

varImpPlot(carseats.rf)

kable(importance(carseats.rf))
%IncMSE IncNodePurity
CompPrice 9.0678902 138.620007
Income 2.3427697 64.333859
Advertising 5.7404558 94.148559
Population 1.4682674 70.206086
Price 25.1750997 440.957346
ShelveLoc 23.6551259 408.393880
Age 6.2210635 136.281793
Education -0.1427487 33.300827
Urban -2.0875662 10.667281
US 0.2117698 7.227559

From the table above you can see that ShelveLoc is now the most important predictor in terms of %IncMSE (permuting it increases the out-of-bag MSE the most), with Price a close second. Moreover, while considering only 9 predictors at each split yields a lower training MSE, the test MSE is higher than with the bagging approach.

9

9a

require(ISLR)

set.seed(42)
train=sample(1:nrow(OJ),800)

OJ.train=OJ[train,]
OJ.test=OJ[-train,]

9b

OJ.tree=tree(Purchase~.,data=OJ.train)  # There is no predictor `Buy`.
summary(OJ.tree)
## 
## Classification tree:
## tree(formula = Purchase ~ ., data = OJ.train)
## Variables actually used in tree construction:
## [1] "LoyalCH"   "PriceDiff"
## Number of terminal nodes:  8 
## Residual mean deviance:  0.7484 = 592.7 / 792 
## Misclassification error rate: 0.1762 = 141 / 800

9c

OJ.tree
## node), split, n, deviance, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 800 1075.00 CH ( 0.60250 0.39750 )  
##    2) LoyalCH < 0.5036 351  405.90 MM ( 0.26496 0.73504 )  
##      4) LoyalCH < 0.0356415 63    0.00 MM ( 0.00000 1.00000 ) *
##      5) LoyalCH > 0.0356415 288  362.30 MM ( 0.32292 0.67708 )  
##       10) LoyalCH < 0.276142 115  109.30 MM ( 0.18261 0.81739 ) *
##       11) LoyalCH > 0.276142 173  234.90 MM ( 0.41618 0.58382 )  
##         22) PriceDiff < 0.05 73   76.78 MM ( 0.21918 0.78082 ) *
##         23) PriceDiff > 0.05 100  137.20 CH ( 0.56000 0.44000 ) *
##    3) LoyalCH > 0.5036 449  353.10 CH ( 0.86637 0.13363 )  
##      6) LoyalCH < 0.705326 146  175.20 CH ( 0.71233 0.28767 )  
##       12) PriceDiff < 0.265 89  121.50 CH ( 0.57303 0.42697 ) *
##       13) PriceDiff > 0.265 57   28.97 CH ( 0.92982 0.07018 ) *
##      7) LoyalCH > 0.705326 303  136.50 CH ( 0.94059 0.05941 )  
##       14) PriceDiff < -0.39 14   19.12 CH ( 0.57143 0.42857 ) *
##       15) PriceDiff > -0.39 289   99.85 CH ( 0.95848 0.04152 ) *

A possible terminal node is 22) PriceDiff < 0.05 73 76.78 MM ( 0.21918 0.78082 ) *. This node is reached through the splits 11) LoyalCH > 0.276142, 5) LoyalCH > 0.0356415 and the root branch 2) LoyalCH < 0.5036. It contains 73 training observations, has a deviance of 76.78, and predicts MM, with about 78% of the observations in the node purchasing Minute Maid.

The tree therefore predicts that a customer with moderate loyalty to Citrus Hill (LoyalCH between roughly 0.28 and 0.50) will purchase Minute Maid whenever Minute Maid's sale price is no more than 0.05 above Citrus Hill's (PriceDiff < 0.05).

9d

plot(OJ.tree)
text(OJ.tree)

From the two branches of the root split we can make some quick observations. In the left branch, where loyalty to Citrus Hill is relatively low (LoyalCH < 0.50), the model generally predicts Minute Maid; the exception is customers with moderate loyalty (LoyalCH > 0.28) for whom Citrus Hill is sufficiently cheaper (PriceDiff > 0.05), who are predicted to buy Citrus Hill. In the right branch, where loyalty to Citrus Hill is high, every terminal node predicts Citrus Hill, although the predicted probability drops when Minute Maid is sufficiently discounted (e.g. PriceDiff < -0.39).

9e

OJ.pred.train=predict(OJ.tree,OJ.train,type = 'class')
kable(table(OJ.train[,'Purchase'],OJ.pred.train))
|actual / predicted |  CH|  MM|
|:------------------|---:|---:|
|CH                 | 445|  37|
|MM                 | 104| 214|
kable(table(OJ.train[,'Purchase'],OJ.pred.train)/nrow(OJ.train))
|actual / predicted |      CH|      MM|
|:------------------|-------:|-------:|
|CH                 | 0.55625| 0.04625|
|MM                 | 0.13000| 0.26750|
OJ.pred.test=predict(OJ.tree,OJ.test,type = 'class')
kable(table(OJ.test[,'Purchase'],OJ.pred.test))
|actual / predicted |  CH|  MM|
|:------------------|---:|---:|
|CH                 | 159|  12|
|MM                 |  38|  61|
kable(table(OJ.test[,'Purchase'],OJ.pred.test)/nrow(OJ.test))
|actual / predicted |        CH|        MM|
|:------------------|---------:|---------:|
|CH                 | 0.5888889| 0.0444444|
|MM                 | 0.1407407| 0.2259259|

From the second table we can see that about 59% of the test observations are correctly classified as Citrus Hill and about 23% are correctly classified as Minute Maid, giving a test misclassification error rate of about 18.5% ((12 + 38)/270).
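As a quick check, the same rate can be recomputed from the objects defined above (conf is just a temporary name for this check):

conf=table(OJ.test[,'Purchase'],OJ.pred.test)
1-sum(diag(conf))/sum(conf)   # test misclassification rate, about 0.185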

9f-h

set.seed(42)
OJ.tree.cv=cv.tree(OJ.tree,K = 10,FUN = prune.misclass)
plot(OJ.tree.cv)

From the plot the optimal size is 2.

OJ.tree=prune.misclass(OJ.tree,best = 2)

OJ.pred.train=predict(OJ.tree,OJ.train,type = 'class')
kable(table(OJ.train[,'Purchase'],OJ.pred.train))
|actual / predicted |  CH|  MM|
|:------------------|---:|---:|
|CH                 | 389|  93|
|MM                 |  60| 258|
kable(table(OJ.train[,'Purchase'],OJ.pred.train)/nrow(OJ.train))
|actual / predicted |      CH|      MM|
|:------------------|-------:|-------:|
|CH                 | 0.48625| 0.11625|
|MM                 | 0.07500| 0.32250|
OJ.pred.test=predict(OJ.tree,OJ.test,type = 'class')
kable(table(OJ.test[,'Purchase'],OJ.pred.test))
|actual / predicted |  CH|  MM|
|:------------------|---:|---:|
|CH                 | 131|  40|
|MM                 |  21|  78|
kable(table(OJ.test[,'Purchase'],OJ.pred.test)/nrow(OJ.test))
|actual / predicted |        CH|        MM|
|:------------------|---------:|---------:|
|CH                 | 0.4851852| 0.1481481|
|MM                 | 0.0777778| 0.2888889|
plot(OJ.tree)
text(OJ.tree)

A tree pruned to 2 leaf nodes has a test misclassification rate of about 22.6% ((40 + 21)/270), compared with about 18.5% for the unpruned tree, and its training error rate (about 19.1%) is also higher than the unpruned tree's (17.6%). On this split, pruning all the way down to two leaves therefore discards useful splits rather than merely reducing overfitting.

10

10a

require(ISLR)
Hitters.unknownSal=is.na(Hitters[,"Salary"])
Hitters=Hitters[!Hitters.unknownSal,]
Hitters[,"Salary"]=log(Hitters[,"Salary"])

summary(Hitters)
##      AtBat            Hits           HmRun            Runs       
##  Min.   : 19.0   Min.   :  1.0   Min.   : 0.00   Min.   :  0.00  
##  1st Qu.:282.5   1st Qu.: 71.5   1st Qu.: 5.00   1st Qu.: 33.50  
##  Median :413.0   Median :103.0   Median : 9.00   Median : 52.00  
##  Mean   :403.6   Mean   :107.8   Mean   :11.62   Mean   : 54.75  
##  3rd Qu.:526.0   3rd Qu.:141.5   3rd Qu.:18.00   3rd Qu.: 73.00  
##  Max.   :687.0   Max.   :238.0   Max.   :40.00   Max.   :130.00  
##       RBI             Walks            Years            CAtBat       
##  Min.   :  0.00   Min.   :  0.00   Min.   : 1.000   Min.   :   19.0  
##  1st Qu.: 30.00   1st Qu.: 23.00   1st Qu.: 4.000   1st Qu.:  842.5  
##  Median : 47.00   Median : 37.00   Median : 6.000   Median : 1931.0  
##  Mean   : 51.49   Mean   : 41.11   Mean   : 7.312   Mean   : 2657.5  
##  3rd Qu.: 71.00   3rd Qu.: 57.00   3rd Qu.:10.000   3rd Qu.: 3890.5  
##  Max.   :121.00   Max.   :105.00   Max.   :24.000   Max.   :14053.0  
##      CHits            CHmRun           CRuns             CRBI       
##  Min.   :   4.0   Min.   :  0.00   Min.   :   2.0   Min.   :   3.0  
##  1st Qu.: 212.0   1st Qu.: 15.00   1st Qu.: 105.5   1st Qu.:  95.0  
##  Median : 516.0   Median : 40.00   Median : 250.0   Median : 230.0  
##  Mean   : 722.2   Mean   : 69.24   Mean   : 361.2   Mean   : 330.4  
##  3rd Qu.:1054.0   3rd Qu.: 92.50   3rd Qu.: 497.5   3rd Qu.: 424.5  
##  Max.   :4256.0   Max.   :548.00   Max.   :2165.0   Max.   :1659.0  
##      CWalks       League  Division    PutOuts          Assists     
##  Min.   :   1.0   A:139   E:129    Min.   :   0.0   Min.   :  0.0  
##  1st Qu.:  71.0   N:124   W:134    1st Qu.: 113.5   1st Qu.:  8.0  
##  Median : 174.0                    Median : 224.0   Median : 45.0  
##  Mean   : 260.3                    Mean   : 290.7   Mean   :118.8  
##  3rd Qu.: 328.5                    3rd Qu.: 322.5   3rd Qu.:192.0  
##  Max.   :1566.0                    Max.   :1377.0   Max.   :492.0  
##      Errors           Salary      NewLeague
##  Min.   : 0.000   Min.   :4.212   A:141    
##  1st Qu.: 3.000   1st Qu.:5.247   N:122    
##  Median : 7.000   Median :6.052            
##  Mean   : 8.593   Mean   :5.927            
##  3rd Qu.:13.000   3rd Qu.:6.620            
##  Max.   :32.000   Max.   :7.808

10b

Hitters.train=Hitters[1:200,]
Hitters.test=Hitters[-c(1:200),]

10c-d

require(gbm)
## Loading required package: gbm
## Loading required package: survival
## Loading required package: lattice
## Loading required package: splines
## Loading required package: parallel
## Loaded gbm 2.1.1
train.mse=c()
test.mse=c()

for(shr in seq(0,0.08,0.002)){
  Hitters.gbm=gbm(Salary~.,data=Hitters.train,shrinkage = shr,n.trees = 1000,distribution = 'gaussian')
  
  Hitters.pred=predict(Hitters.gbm,Hitters.train,n.trees = 1000)
  train.mse=rbind(train.mse,mean((Hitters.pred-Hitters.train[,'Salary'])^2))
  
  Hitters.pred=predict(Hitters.gbm,Hitters.test,n.trees = 1000)
  test.mse=rbind(test.mse,mean((Hitters.pred-Hitters.test[,'Salary'])^2))
}

plot(seq(0,0.08,0.002),train.mse,type='l',xlab='shrinkage',xlim = c(0.003,0.07),ylab='MSE')
lines(seq(0,0.08,0.002),test.mse,col='red')
legend(x='top',legend = c('train MSE','test MSE'),col=c('black','red'),lty=1,text.width = 0.005)

10e

tb=c()

Hitters.gbm=gbm(Salary~.,data=Hitters.train,shrinkage = 0.01,n.trees = 1000,distribution = 'gaussian')
Hitters.pred=predict(Hitters.gbm,Hitters.test,n.trees = 1000)
tb=cbind(tb,'Boost'=mean((Hitters.pred-Hitters.test[,'Salary'])^2))

# Ch3 - linear regression
Hitters.lm=lm(Salary~.,Hitters.train)
Hitters.pred=predict(Hitters.lm,Hitters.test)
tb=cbind(tb,'Linear'=mean((Hitters.pred-Hitters.test[,'Salary'])^2))

# Ch6 - ridge regression
require(glmnet)
## Loading required package: glmnet
## Loading required package: Matrix
## Loading required package: foreach
## Loaded glmnet 2.0-5
x = model.matrix(Salary ~ ., data = Hitters.train)
x.test = model.matrix(Salary ~ ., data = Hitters.test)
y = Hitters.train$Salary

Hitters.glm=cv.glmnet(x,y,alpha = 0)   # choose lambda by cross-validation
Hitters.pred=predict(Hitters.glm,x.test,s='lambda.min')
tb=cbind(tb,'Ridge'=mean((Hitters.pred-Hitters.test[,'Salary'])^2))

kable(tb)
|     Boost|    Linear|     Ridge|
|---------:|---------:|---------:|
| 0.2752713| 0.4917959| 0.5145349|

10f

kable(summary(Hitters.gbm),row.names = F)

var rel.inf
CAtBat 29.7469615
CHits 10.7783367
CWalks 8.6555091
CRBI 8.3272748
CRuns 7.8404149
CHmRun 6.2653705
Years 6.0026542
Hits 4.2452579
Walks 3.9946187
RBI 3.3923403
PutOuts 2.6646695
AtBat 1.9452653
HmRun 1.6876641
Errors 1.1948217
Assists 1.1196293
Runs 1.1075200
Division 0.7625687
NewLeague 0.1462979
League 0.1228249

10g

require(randomForest)
Hitters.rf=randomForest(Salary~.,data = Hitters.train,mtry=ncol(Hitters.train)-1) # bagging m=p
Hitters.pred=predict(Hitters.rf,Hitters.test)
mean((Hitters.pred-Hitters.test[,'Salary'])^2)
## [1] 0.232707

Bagging achieves a test MSE of about 0.233, slightly lower than the 0.275 obtained with boosting above.

11

11a

require(ISLR)
Caravan$Purchase=ifelse(Caravan$Purchase == "Yes",1,0)
Caravan.train=Caravan[1:1000,]
Caravan.test=Caravan[-c(1:1000),]

11b

require(gbm)
Caravan.gbm=gbm(Purchase~.,data=Caravan.train,n.trees = 1000,shrinkage = 0.01,distribution = "bernoulli")
## Warning in gbm.fit(x, y, offset = offset, distribution = distribution, w =
## w, : variable 50: PVRAAUT has no variation.
## Warning in gbm.fit(x, y, offset = offset, distribution = distribution, w =
## w, : variable 71: AVRAAUT has no variation.
kable(summary(Caravan.gbm),row.names = F)

var rel.inf
PPERSAUT 15.2804738
MKOOPKLA 9.6328218
MOPLHOOG 6.5148600
MBERMIDD 5.4610803
PBRAND 5.0235877
ABRAND 4.3515849
MGODGE 4.3333834
MINK3045 4.0177164
MOSTYPE 3.0230879
MAUT1 2.6348897
MAUT2 2.2733199
PWAPART 2.2236415
MSKA 2.0834968
MGODPR 2.0023134
MSKC 1.8299210
MBERHOOG 1.8223698
MBERARBG 1.7419077
MINKGEM 1.6665703
MGODOV 1.6356941
MFWEKIND 1.6335492
PBYSTAND 1.5406059
MRELGE 1.4323479
MINK7512 1.2719825
MSKB1 1.2004837
MGODRK 1.1870963
MRELOV 1.1229476
MHHUUR 1.0384539
MINKM30 0.9373363
MBERARBO 0.9231986
MFGEKIND 0.8635693
MBERBOER 0.7517833
MHKOOP 0.7376097
APERSAUT 0.6923477
MGEMLEEF 0.6868871
MINK4575 0.6658493
MOPLMIDD 0.6604653
MZFONDS 0.6291890
PMOTSCO 0.6019984
MAUT0 0.5908220
MSKD 0.4867065
MZPART 0.4750920
MGEMOMV 0.4280569
MINK123M 0.3610230
MOSHOOFD 0.3560752
MSKB2 0.3545356
PLEVEN 0.3176596
MFALLEEN 0.1692333
MBERZELF 0.1659141
MRELSA 0.1644603
MAANTHUI 0.0000000
MOPLLAAG 0.0000000
PWABEDR 0.0000000
PWALAND 0.0000000
PBESAUT 0.0000000
PVRAAUT 0.0000000
PAANHANG 0.0000000
PTRACTOR 0.0000000
PWERKT 0.0000000
PBROM 0.0000000
PPERSONG 0.0000000
PGEZONG 0.0000000
PWAOREG 0.0000000
PZEILPL 0.0000000
PPLEZIER 0.0000000
PFIETS 0.0000000
PINBOED 0.0000000
AWAPART 0.0000000
AWABEDR 0.0000000
AWALAND 0.0000000
ABESAUT 0.0000000
AMOTSCO 0.0000000
AVRAAUT 0.0000000
AAANHANG 0.0000000
ATRACTOR 0.0000000
AWERKT 0.0000000
ABROM 0.0000000
ALEVEN 0.0000000
APERSONG 0.0000000
AGEZONG 0.0000000
AWAOREG 0.0000000
AZEILPL 0.0000000
APLEZIER 0.0000000
AFIETS 0.0000000
AINBOED 0.0000000
ABYSTAND 0.0000000

11c

Caravan.pred=predict(Caravan.gbm,Caravan.test,n.trees = 1000,type='response')
Caravan.pred=ifelse(Caravan.pred>0.2,1,0)
kable(table(Caravan.test$Purchase,Caravan.pred))
|actual / predicted |    0|   1|
|:------------------|----:|---:|
|0                  | 4412| 121|
|1                  |  256|  33|

About 21% (33 of 154) of the people predicted to make a purchase actually make one.
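For reference, this fraction can be computed directly from the predictions above:

sum(Caravan.pred==1 & Caravan.test$Purchase==1)/sum(Caravan.pred==1)   # 33/(121+33), about 0.21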

Caravan.glm=glm(Purchase~.,family='binomial',data = Caravan.train)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
Caravan.pred=predict(Caravan.glm,Caravan.test,type='response')
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
Caravan.pred=ifelse(Caravan.pred>0.2,1,0)
kable(table(Caravan.test$Purchase,Caravan.pred))
|actual / predicted |    0|   1|
|:------------------|----:|---:|
|0                  | 4183| 350|
|1                  |  231|  58|

Logistic regression performs worse than boosting, with only about 14% (58 of 408) of the predicted purchasers actually making a purchase.

12

require(MASS)   # Boston data

set.seed(42)
Boston.train=sample(1:nrow(Boston),nrow(Boston)/2)

x.train=model.matrix(crim~.:.^2,data=Boston[Boston.train,])
y.train=Boston$crim[Boston.train]

x.test=model.matrix(crim~.:.^2,data=Boston[-Boston.train,])
y.test=Boston$crim[-Boston.train]
require(glmnet)
tb=c()
grid=seq(0,0.5,0.002)

Boston.glmnet.cv=cv.glmnet(x.train,y.train,alpha=0,lambda=grid)   
 
plot(log(Boston.glmnet.cv$lambda),Boston.glmnet.cv$cvm,type='l',
     xlab='log(lambda)',ylab='Cross Validation MSE')

Boston.glmnet=glmnet(x.train,y.train,alpha=0,lambda=Boston.glmnet.cv$lambda.min)
Boston.pred=predict(Boston.glmnet,x.test)
tb=cbind(tb,'Ridge'=mean((Boston.pred-y.test)^2))
require(tree)

Boston.tree=tree(crim~.,data=Boston,subset=Boston.train)

set.seed(42)
Boston.tree.cv=cv.tree(Boston.tree,K = 5)
plot(Boston.tree.cv)

Boston.tree=prune.tree(Boston.tree,best=4)
Boston.pred=predict(Boston.tree,Boston[-Boston.train,])
tb=cbind(tb,'Tree'=mean((Boston.pred-y.test)^2))

row.names(tb)=c('Test MSE')
kable(tb)
|         |   Ridge|     Tree|
|:--------|-------:|--------:|
|Test MSE | 55.0314| 145.6773|