Introduction to Statistical Learning Solutions (ISLR)

Ch 8 Exercises

Conceptual Exercises

Applied Exercises

Conceptual Exercises

1

pts=data.frame(c(25,25,75,75,70,80),c(75,25,75,50,25,25))

plot(pts,xlim=c(0,100),ylim=c(0,110),pch="",xlab="X1",ylab="X2",xaxt='n',yaxt='n')

lines(x = c(50,50), y = c(0,100))
lines(x = c(0,50), y = c(65,65))
lines(x = c(50,100), y = c(60,60))
lines(x = c(50,100), y = c(40,40))
lines(x = c(75,75), y = c(1,40))

text(x = 50, y = 108, labels = c("t1"), col = "red")
text(x = 0, y = 70, labels = c("t2"), col = "red")
text(x = 100, y = 65, labels = c("t3"), col = "red")
text(x = 100, y = 45, labels = c("t4"), col = "red")
text(x = 75, y = 1, labels = c("t5"), col = "red")

text(pts,labels=paste("R",1:6,sep=""))

In two dimensions a tree is a subdivision of the space into rectangular regions where each region is a leaf node.

2

If only stumps are considered, then for a predictor \(X_j\) the equation for its stump takes the form; \[\hat{f}_j(x_j)=\beta_0+I(K_j<X_j)\beta_1\]

Now boosting considers several stumps on the set of predictors where the previously chosen predictor may be chosen or not. If another stump is created for a previously chosen predictor \(X_i\), then we can define \(\hat{f}_i(x_i\) as follows; \[\hat{f}_i(x_i)=\hat{f}_i(x_i)_1 +\hat{f}_i(x_i)_2\] \[=\dot{\beta_0}+I(K_{j1}<X_{j})\beta_1+I(K_{j2}<X_j)\beta_2\]

which if \(K_{j1} < K_{j2}\) is equivalent to the equation, \[\dot{\beta_0}+I(K_{j1}<X_j<K_{j2})\beta_1+I(K_{j2}s<X_j)(\beta_1+\beta_2)\]

Which can be seen as adding a branch to the stump. Thus, since all functions solely depend on a single predictor the model takes the form of \(f(X)=\sum^p_{j=1}f_j(X_j)\). Where \(f_j(X_j)=\frac{1}{\lambda}\hat{f_j}(x_j)\).

3

p=seq(0,1,0.01)

gini= 2*p*(1-p)
classerror= 1-pmax(p,1-p)
crossentropy= -(p*log(p)+(1-p)*log(1-p))

plot(NA,NA,xlim=c(0,1),ylim=c(0,1),xlab='p',ylab='f')

lines(p,gini,type='l')
lines(p,classerror,col='blue')
lines(p,crossentropy,col='red')

legend(x='top',legend=c('gini','class error','cross entropy'),
       col=c('black','blue','red'),lty=1,text.width = 0.22)

4

4a

This problem is best done in a piece of paper. The resulting tree has 5 leaf nodes corresponding to each rectangular region of the graph.

4b

X1=seq(0,3,0.1)
X2=seq(0,3,0.1)

plot(NA,NA,xlim=c(-1.2,3),ylim=c(-0.2,3),pch="",xlab="X1",ylab="X2",xaxt='n',yaxt='n')
pts=data.frame(c(1.5,-0.5,1.5,-0.25,1.50),c(2.5,1.5,1.5,0.5,0.5))

text(pts,labels=c('2.49','-1.06','0.21','-1.80','0.63'))
text(x = 1, y = -0.2, labels = c("1"), col = "red")
text(x = 0, y = 0.8, labels = c("0"), col = "red")
text(x = -1.2, y = 1, labels = c("1"), col = "red")
text(x = -1.2, y = 2, labels = c("2"), col = "red")
 
lines(x = c(-1,3), y = c(1,1))
lines(x = c(-1,3), y = c(2,2))
lines(x = c(0,0), y = c(1,2))
lines(x = c(1,1), y = c(0,1))

5

In the majority vote approach we assign the observation to the class that occurs the most. Thus, we count the number of assignments done to each class by making use of a cutoff value. As shown below, the observation X is assigned to the Red class.

mjr=c(0.1,0.15,0.2,0.2,0.55,0.6,0.6,0.65,0.7,0.75)
table(mjr>0.5)

## 
## FALSE  TRUE 
##     4     6

As described in the question, when average probability is used the average is taken from the estimated probabilities that result from the bagging model. The average is 0.45, which determines that X does not belong to the Red class.

avg=mean(mjr)
avg

## [1] 0.45

6

A regression tree performs subdivisions of the predictor space. The algorithm considers one split at a time in the set of predictors, where the split that is chosen is the one that achieves the most reduction in the RSS. This step will be repeated until threshold values are met (or cannot be met any longer such as minimum number of observations necessary in each node). When predictions are made, new observations travel down the branches of a tree, and upon reaching a leaf node, the average of the observations contained in that leaf is returned.

Applied Exercises

7

require(randomForest)

## Loading required package: randomForest

## randomForest 4.6-12

## Type rfNews() to see new features/changes/bug fixes.

require(knitr)

## Loading required package: knitr

require(MASS)

## Loading required package: MASS

set.seed(1)
train=sample(1:nrow(Boston),nrow(Boston)/2)

train.rf=function(m.mtry=6,m.ntree=25){
  rf.boston=randomForest(medv~.,data=Boston,subset=train,m.mtry=6,ntree=m.ntree)
  rf.pred=predict(rf.boston,Boston[-train,])
  mse=mean((rf.pred-Boston[-train,])^2)
  mse
}

# This may take a few minutes.

mse=c()
m=c()

min.mtry=10
max.mtry=16

min.ntree=80
max.ntree=100

for(mtry in min.mtry:max.mtry){
  for(ntree in min.ntree:max.ntree){
    mse=rbind(mse,train.rf(mtry,ntree))
  }
    m=cbind(m,mse)
    mse=c()
}

rownames(m)=paste('ntree',min.ntree:max.ntree)
colnames(m)=paste('mtry',min.mtry:max.mtry)
kable(m)

	mtry 10	mtry 11	mtry 12	mtry 13	mtry 14	mtry 15	mtry 16
ntree 80	21253.00	21255.46	21256.60	21254.51	21266.45	21263.31	21248.03
ntree 81	21259.58	21248.02	21247.25	21255.23	21259.82	21256.71	21257.66
ntree 82	21252.62	21252.46	21257.66	21256.55	21258.38	21253.52	21254.67
ntree 83	21261.70	21256.26	21252.17	21259.09	21254.70	21255.21	21252.04
ntree 84	21257.95	21250.39	21256.35	21252.84	21254.71	21257.51	21254.78
ntree 85	21254.83	21252.63	21260.12	21253.57	21257.85	21253.35	21254.40
ntree 86	21250.27	21261.40	21257.26	21258.94	21258.78	21255.56	21253.93
ntree 87	21266.40	21255.95	21257.92	21254.47	21255.26	21251.51	21253.59
ntree 88	21250.06	21251.44	21262.93	21254.97	21255.09	21255.29	21252.46
ntree 89	21260.38	21245.53	21256.82	21257.45	21258.35	21253.71	21256.94
ntree 90	21257.75	21255.81	21259.01	21257.02	21253.94	21250.75	21254.97
ntree 91	21263.07	21257.93	21251.67	21257.78	21255.10	21248.67	21255.10
ntree 92	21262.10	21251.81	21253.94	21255.26	21256.90	21253.74	21252.68
ntree 93	21252.34	21256.05	21252.98	21252.89	21261.52	21249.11	21256.05
ntree 94	21261.40	21251.93	21257.62	21251.49	21259.29	21259.00	21256.44
ntree 95	21254.47	21250.77	21249.82	21254.59	21256.67	21248.67	21251.23
ntree 96	21248.51	21261.74	21250.46	21256.93	21255.23	21249.72	21253.93
ntree 97	21253.65	21257.56	21250.37	21258.00	21256.40	21245.51	21263.49
ntree 98	21254.40	21252.46	21259.18	21262.46	21254.21	21256.51	21257.26
ntree 99	21258.71	21258.61	21254.59	21258.08	21255.51	21262.09	21258.87
ntree 100	21256.51	21256.90	21253.87	21260.11	21257.27	21258.65	21255.76

cols=rainbow(max.mtry)

plot(min.ntree:max.ntree,m[,1],col=cols[1],type='l',ylab='Hold out test error',xlab='ntree',lty=1)

lines(min.ntree:max.ntree,m[,2],col=cols[2],lty=2)
lines(min.ntree:max.ntree,m[,3],col=cols[3],lty=3)
lines(min.ntree:max.ntree,m[,4],col=cols[4],lty=4)
lines(min.ntree:max.ntree,m[,5],col=cols[5],lty=5)

legend(x='topright',legend=paste('mtry',10:15),col=rainbow(5),lty=1:5,ncol=1,text.width = 3)

plot(min.mtry:max.mtry,m[1,],col=cols[1],type='l',
     ylab='Hold out test error',xlab='mtry',ylim = c(21245,21270),lty=1)

lines(min.mtry:max.mtry,m[2,],col=cols[2],lty=2)
lines(min.mtry:max.mtry,m[3,],col=cols[3],lty=3)
lines(min.mtry:max.mtry,m[4,],col=cols[4],lty=4)
lines(min.mtry:max.mtry,m[5,],col=cols[5],lty=5)

legend(x='topright',legend=paste('ntree',80:85),col=rainbow(5),lty=1:5,ncol=1,text.width = 1)

8

8a

require(ISLR)

## Loading required package: ISLR

require(tree)

## Loading required package: tree

set.seed(42)
train=sample(1:nrow(Carseats),nrow(Carseats)/2)

8b

tree.carseats=tree(formula=Sales~.,data=Carseats,subset = train)
tree.pred=predict(tree.carseats,Carseats[-train,])

mean((tree.pred-Carseats[-train,'Sales'])^2)

## [1] 4.092899

plot(tree.carseats)
text(tree.carseats)

8c

tree.carseats.cv=cv.tree(tree.carseats) 
plot(tree.carseats.cv)

prune.carseats=prune.tree(tree.carseats,best=5)

plot(prune.carseats)
text(prune.carseats)

tree.pred=predict(prune.carseats,Carseats[-train,])
mean((tree.pred-Carseats[-train,'Sales'])^2)

## [1] 4.991476

plot(tree.pred,Carseats[-train,'Sales'],xlab='prediction',ylab='actual')
abline(0,1)

From the MSE you can observe that it does not improve the test MSE.

8d

require(randomForest)

d=ncol(Carseats)-1

set.seed(42)
carseats.rf=randomForest(Sales~.,data=Carseats,subset=train,mtry=d,importance=T,ntree=100)

tree.pred=predict(carseats.rf,Carseats[-train,])
mean((tree.pred-Carseats[-train,'Sales'])^2)

## [1] 2.750273

plot(carseats.rf)

varImpPlot(carseats.rf)

kable(importance(carseats.rf))

	%IncMSE	IncNodePurity
CompPrice	11.5170922	148.297954
Income	3.3430145	69.192503
Advertising	6.2062812	88.251686
Population	-0.2522175	63.025904
Price	25.4921858	442.200618
ShelveLoc	25.8775065	422.649283
Age	7.0625363	132.928876
Education	-1.3343248	30.809743
Urban	-1.5756286	9.061747
US	1.3535718	7.374875

The predictor Price is clearly the most important predictor in predicting Sales. This model also achieves a much lower MSE than the previous one, with almost a half of reduction achieved in the test MSE.

8e

mse=c()

set.seed(42)

for(i in 3:10){
  carseats.rf=randomForest(Sales~.,data=Carseats,subset=train,mtry=5,importance=T,ntree=100)

  tree.pred=predict(carseats.rf,Carseats[-train,])
  mse=rbind(mse,mean((tree.pred-Carseats[-train,'Sales'])^2))
}
plot(3:10,mse,type='b')

The plot above displays the effect of mtry in the test MSE.

require(randomForest)

set.seed(42)
carseats.rf=randomForest(Sales~.,data=Carseats,subset=train,mtry=9,importance=T,ntree=100)

plot(carseats.rf)

varImpPlot(carseats.rf)

kable(importance(carseats.rf))

	%IncMSE	IncNodePurity
CompPrice	9.0678902	138.620007
Income	2.3427697	64.333859
Advertising	5.7404558	94.148559
Population	1.4682674	70.206086
Price	25.1750997	440.957346
ShelveLoc	23.6551259	408.393880
Age	6.2210635	136.281793
Education	-0.1427487	33.300827
Urban	-2.0875662	10.667281
US	0.2117698	7.227559

From the data above you can see that ShelveLoc is now the most important predictor in terms of MSE (whose absence most increase the training MSE). Moreover, while considering only 9 predictors for training each tree achieves a lower training MSE the test MSE is higher than the bagging approach.

9

9a

require(ISLR)

set.seed(42)
train=sample(1:nrow(OJ),800)

OJ.train=OJ[train,]
OJ.test=OJ[-train,]

9b

OJ.tree=tree(Purchase~.,data=OJ.train)  # There is no predictor `Buy`.
summary(OJ.tree)

## 
## Classification tree:
## tree(formula = Purchase ~ ., data = OJ.train)
## Variables actually used in tree construction:
## [1] "LoyalCH"   "PriceDiff"
## Number of terminal nodes:  8 
## Residual mean deviance:  0.7484 = 592.7 / 792 
## Misclassification error rate: 0.1762 = 141 / 800

9c

OJ.tree

## node), split, n, deviance, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 800 1075.00 CH ( 0.60250 0.39750 )  
##    2) LoyalCH < 0.5036 351  405.90 MM ( 0.26496 0.73504 )  
##      4) LoyalCH < 0.0356415 63    0.00 MM ( 0.00000 1.00000 ) *
##      5) LoyalCH > 0.0356415 288  362.30 MM ( 0.32292 0.67708 )  
##       10) LoyalCH < 0.276142 115  109.30 MM ( 0.18261 0.81739 ) *
##       11) LoyalCH > 0.276142 173  234.90 MM ( 0.41618 0.58382 )  
##         22) PriceDiff < 0.05 73   76.78 MM ( 0.21918 0.78082 ) *
##         23) PriceDiff > 0.05 100  137.20 CH ( 0.56000 0.44000 ) *
##    3) LoyalCH > 0.5036 449  353.10 CH ( 0.86637 0.13363 )  
##      6) LoyalCH < 0.705326 146  175.20 CH ( 0.71233 0.28767 )  
##       12) PriceDiff < 0.265 89  121.50 CH ( 0.57303 0.42697 ) *
##       13) PriceDiff > 0.265 57   28.97 CH ( 0.92982 0.07018 ) *
##      7) LoyalCH > 0.705326 303  136.50 CH ( 0.94059 0.05941 )  
##       14) PriceDiff < -0.39 14   19.12 CH ( 0.57143 0.42857 ) *
##       15) PriceDiff > -0.39 289   99.85 CH ( 0.95848 0.04152 ) *

A possible terminal node is PriceDiff < -0.165 27 30.90 MM ( 0.25926 0.74074 ) *. This node is a child for the splits LoyalCH < 0.764572 183 216.60 CH ( 0.72131 0.27869 ), LoyalCH > 0.5036 438 360.80 CH ( 0.85616 0.14384 ) and the root node LoyalCH < 0.5036 362 432.40 MM ( 0.28453 0.71547 ).

We know then that the tree will predict that the customer will purchase a minute maid if the price difference of Citrus Hill will 0.165 bigger, and the customer brand loyalty is in the interval (0.5,0.76).

9d

plot(OJ.tree)
text(OJ.tree)

From the two root node branches we can make some quick observations. The left branch shows that when there is a relative low customer loyalty for Citrus Hill then the Minute Maid. Otherwise, the cheaper drink will be the one purchased. Similarly, in the right sub tree this model predicts that when there is a strong preference for Citrus Hill it will always be the drink chosen. Otherwise, when the preference is not so strong the customer will purchase Minute Maid if it has a lower sale price or its list price is not significantly more than Citrus Hill. Otherwise, the Citrus Hill orange drink will be bought.

9e

OJ.pred.train=predict(OJ.tree,OJ.train,type = 'class')
kable(table(OJ.train[,'Purchase'],OJ.pred.train))

	CH	MM
CH	445	37
MM	104	214

kable(table(OJ.train[,'Purchase'],OJ.pred.train)/nrow(OJ.train))

	CH	MM
CH	0.55625	0.04625
MM	0.13000	0.26750

OJ.pred.test=predict(OJ.tree,OJ.test,type = 'class')
kable(table(OJ.test[,'Purchase'],OJ.pred.test))

	CH	MM
CH	159	12
MM	38	61

kable(table(OJ.test[,'Purchase'],OJ.pred.test)/nrow(OJ.test))

	CH	MM
CH	0.5888889	0.0444444
MM	0.1407407	0.2259259

From the second table we can see that about 48% of the Citrus Hill drinks are correctly predicted in the test data, also about 30% of the Minute Maid drinks are correctly chosen. Signifying a misclassification error rate about about 22%.

9f-h

set.seed(42)
OJ.tree.cv=cv.tree(OJ.tree,K = 10,FUN = prune.misclass)
plot(OJ.tree.cv)

From the plot the optimal size is 2.

OJ.tree=prune.misclass(OJ.tree,best = 2)

OJ.pred.train=predict(OJ.tree,OJ.train,type = 'class')
kable(table(OJ.train[,'Purchase'],OJ.pred.train))

	CH	MM
CH	389	93
MM	60	258

kable(table(OJ.train[,'Purchase'],OJ.pred.train)/nrow(OJ.train))

	CH	MM
CH	0.48625	0.11625
MM	0.07500	0.32250

OJ.pred.test=predict(OJ.tree,OJ.test,type = 'class')
kable(table(OJ.test[,'Purchase'],OJ.pred.test))

	CH	MM
CH	131	40
MM	21	78

kable(table(OJ.test[,'Purchase'],OJ.pred.test)/nrow(OJ.test))

	CH	MM
CH	0.4851852	0.1481481
MM	0.0777778	0.2888889

plot(OJ.tree)
text(OJ.tree)

A tree with 2 leaf nodes achieves a misclassification test rate of 25%. This represents a 3% improvement over an unpruned tree for the test data. However, the training error for the unpruned tree is lower. This is a good example of how easy trees can overfit the data.

10

10a

require(ISLR)
Hitters.unknownSal=is.na(Hitters[,"Salary"])
Hitters=Hitters[!Hitters.unknownSal,]
Hitters[,"Salary"]=log(Hitters[,"Salary"])

summary(Hitters)

##      AtBat            Hits           HmRun            Runs       
##  Min.   : 19.0   Min.   :  1.0   Min.   : 0.00   Min.   :  0.00  
##  1st Qu.:282.5   1st Qu.: 71.5   1st Qu.: 5.00   1st Qu.: 33.50  
##  Median :413.0   Median :103.0   Median : 9.00   Median : 52.00  
##  Mean   :403.6   Mean   :107.8   Mean   :11.62   Mean   : 54.75  
##  3rd Qu.:526.0   3rd Qu.:141.5   3rd Qu.:18.00   3rd Qu.: 73.00  
##  Max.   :687.0   Max.   :238.0   Max.   :40.00   Max.   :130.00  
##       RBI             Walks            Years            CAtBat       
##  Min.   :  0.00   Min.   :  0.00   Min.   : 1.000   Min.   :   19.0  
##  1st Qu.: 30.00   1st Qu.: 23.00   1st Qu.: 4.000   1st Qu.:  842.5  
##  Median : 47.00   Median : 37.00   Median : 6.000   Median : 1931.0  
##  Mean   : 51.49   Mean   : 41.11   Mean   : 7.312   Mean   : 2657.5  
##  3rd Qu.: 71.00   3rd Qu.: 57.00   3rd Qu.:10.000   3rd Qu.: 3890.5  
##  Max.   :121.00   Max.   :105.00   Max.   :24.000   Max.   :14053.0  
##      CHits            CHmRun           CRuns             CRBI       
##  Min.   :   4.0   Min.   :  0.00   Min.   :   2.0   Min.   :   3.0  
##  1st Qu.: 212.0   1st Qu.: 15.00   1st Qu.: 105.5   1st Qu.:  95.0  
##  Median : 516.0   Median : 40.00   Median : 250.0   Median : 230.0  
##  Mean   : 722.2   Mean   : 69.24   Mean   : 361.2   Mean   : 330.4  
##  3rd Qu.:1054.0   3rd Qu.: 92.50   3rd Qu.: 497.5   3rd Qu.: 424.5  
##  Max.   :4256.0   Max.   :548.00   Max.   :2165.0   Max.   :1659.0  
##      CWalks       League  Division    PutOuts          Assists     
##  Min.   :   1.0   A:139   E:129    Min.   :   0.0   Min.   :  0.0  
##  1st Qu.:  71.0   N:124   W:134    1st Qu.: 113.5   1st Qu.:  8.0  
##  Median : 174.0                    Median : 224.0   Median : 45.0  
##  Mean   : 260.3                    Mean   : 290.7   Mean   :118.8  
##  3rd Qu.: 328.5                    3rd Qu.: 322.5   3rd Qu.:192.0  
##  Max.   :1566.0                    Max.   :1377.0   Max.   :492.0  
##      Errors           Salary      NewLeague
##  Min.   : 0.000   Min.   :4.212   A:141    
##  1st Qu.: 3.000   1st Qu.:5.247   N:122    
##  Median : 7.000   Median :6.052            
##  Mean   : 8.593   Mean   :5.927            
##  3rd Qu.:13.000   3rd Qu.:6.620            
##  Max.   :32.000   Max.   :7.808

10b

Hitters.train=Hitters[1:200,]
Hitters.test=Hitters[-c(1:200),]

10c-d

require(gbm)

## Loading required package: gbm

## Loading required package: survival

## Loading required package: lattice

## Loading required package: splines

## Loading required package: parallel

## Loaded gbm 2.1.1

train.mse=c()
test.mse=c()

for(shr in seq(0,0.08,0.002)){
  Hitters.gbm=gbm(Salary~.,data=Hitters.train,shrinkage = shr,n.trees = 1000,distribution = 'gaussian')
  
  Hitters.pred=predict(Hitters.gbm,Hitters.train,n.trees = 1000)
  train.mse=rbind(train.mse,mean((Hitters.pred-Hitters.train[,'Salary'])^2))
  
  Hitters.pred=predict(Hitters.gbm,Hitters.test,n.trees = 1000)
  test.mse=rbind(test.mse,mean((Hitters.pred-Hitters.test[,'Salary'])^2))
}

plot(seq(0,0.08,0.002),train.mse,type='l',xlab='shrinkage',xlim = c(0.003,0.07),ylab='MSE')
lines(seq(0,0.08,0.002),test.mse,col='red')
legend(x='top',legend = c('train MSE','test MSE'),col=c('black','red'),lty=1,text.width = 0.005)

10e

tb=c()

Hitters.gbm=gbm(Salary~.,data=Hitters.train,shrinkage = 0.01,n.trees = 1000,distribution = 'gaussian')
Hitters.pred=predict(Hitters.gbm,Hitters.test,n.trees = 1000)
tb=cbind(tb,'Boost'=mean((Hitters.pred-Hitters.test[,'Salary'])^2))

# Ch3 - linear regression
Hitters.lm=lm(Salary~.,Hitters.train)
Hitters.pred=predict(Hitters.lm,Hitters.test)
tb=cbind(tb,'Linear'=mean((Hitters.pred-Hitters.test[,'Salary'])^2))

# Ch6 - ridge regression
require(glmnet)

## Loading required package: glmnet

## Loading required package: Matrix

## Loading required package: foreach

## Loaded glmnet 2.0-5

x = model.matrix(Salary ~ ., data = Hitters.train)
x.test = model.matrix(Salary ~ ., data = Hitters.test)
y = Hitters.train$Salary

Hitters.glm=glmnet(x,y,alpha = 0)
Hitters.pred=predict(Hitters.glm,x.test)
tb=cbind(tb,'Ridge'=mean((Hitters.pred-Hitters.test[,'Salary'])^2))

kable(tb)

Boost	Linear	Ridge
0.2752713	0.4917959	0.5145349

10f

kable(summary(Hitters.gbm),row.names = F)

var	rel.inf
CAtBat	29.7469615
CHits	10.7783367
CWalks	8.6555091
CRBI	8.3272748
CRuns	7.8404149
CHmRun	6.2653705
Years	6.0026542
Hits	4.2452579
Walks	3.9946187
RBI	3.3923403
PutOuts	2.6646695
AtBat	1.9452653
HmRun	1.6876641
Errors	1.1948217
Assists	1.1196293
Runs	1.1075200
Division	0.7625687
NewLeague	0.1462979
League	0.1228249

10g

require(randomForest)
Hitters.rf=randomForest(Salary~.,data = Hitters.train,mtry=ncol(Hitters.train)-1) # bagging m=p
Hitters.pred=predict(Hitters.rf,Hitters.test)
mean((Hitters.pred-Hitters.test[,'Salary'])^2)

## [1] 0.232707

The test MSE is shown above.

11

11a

require(ISLR)
Caravan$Purchase=ifelse(Caravan$Purchase == "Yes",1,0)
Caravan.train=Caravan[1:1000,]
Caravan.test=Caravan[-c(1:1000),]

11b

require(gbm)
Caravan.gbm=gbm(Purchase~.,data=Caravan.train,n.trees = 1000,shrinkage = 0.01,distribution = "bernoulli")

## Warning in gbm.fit(x, y, offset = offset, distribution = distribution, w =
## w, : variable 50: PVRAAUT has no variation.

## Warning in gbm.fit(x, y, offset = offset, distribution = distribution, w =
## w, : variable 71: AVRAAUT has no variation.

kable(summary(Caravan.gbm),row.names = F)

var	rel.inf
PPERSAUT	15.2804738
MKOOPKLA	9.6328218
MOPLHOOG	6.5148600
MBERMIDD	5.4610803
PBRAND	5.0235877
ABRAND	4.3515849
MGODGE	4.3333834
MINK3045	4.0177164
MOSTYPE	3.0230879
MAUT1	2.6348897
MAUT2	2.2733199
PWAPART	2.2236415
MSKA	2.0834968
MGODPR	2.0023134
MSKC	1.8299210
MBERHOOG	1.8223698
MBERARBG	1.7419077
MINKGEM	1.6665703
MGODOV	1.6356941
MFWEKIND	1.6335492
PBYSTAND	1.5406059
MRELGE	1.4323479
MINK7512	1.2719825
MSKB1	1.2004837
MGODRK	1.1870963
MRELOV	1.1229476
MHHUUR	1.0384539
MINKM30	0.9373363
MBERARBO	0.9231986
MFGEKIND	0.8635693
MBERBOER	0.7517833
MHKOOP	0.7376097
APERSAUT	0.6923477
MGEMLEEF	0.6868871
MINK4575	0.6658493
MOPLMIDD	0.6604653
MZFONDS	0.6291890
PMOTSCO	0.6019984
MAUT0	0.5908220
MSKD	0.4867065
MZPART	0.4750920
MGEMOMV	0.4280569
MINK123M	0.3610230
MOSHOOFD	0.3560752
MSKB2	0.3545356
PLEVEN	0.3176596
MFALLEEN	0.1692333
MBERZELF	0.1659141
MRELSA	0.1644603
MAANTHUI	0.0000000
MOPLLAAG	0.0000000
PWABEDR	0.0000000
PWALAND	0.0000000
PBESAUT	0.0000000
PVRAAUT	0.0000000
PAANHANG	0.0000000
PTRACTOR	0.0000000
PWERKT	0.0000000
PBROM	0.0000000
PPERSONG	0.0000000
PGEZONG	0.0000000
PWAOREG	0.0000000
PZEILPL	0.0000000
PPLEZIER	0.0000000
PFIETS	0.0000000
PINBOED	0.0000000
AWAPART	0.0000000
AWABEDR	0.0000000
AWALAND	0.0000000
ABESAUT	0.0000000
AMOTSCO	0.0000000
AVRAAUT	0.0000000
AAANHANG	0.0000000
ATRACTOR	0.0000000
AWERKT	0.0000000
ABROM	0.0000000
ALEVEN	0.0000000
APERSONG	0.0000000
AGEZONG	0.0000000
AWAOREG	0.0000000
AZEILPL	0.0000000
APLEZIER	0.0000000
AFIETS	0.0000000
AINBOED	0.0000000
ABYSTAND	0.0000000

11c

Caravan.pred=predict(Caravan.gbm,Caravan.test,n.trees = 1000,type='response')
Caravan.pred=ifelse(Caravan.pred>0.2,1,0)
kable(table(Caravan.test$Purchase,Caravan.pred))

	0	1
0	4412	121
1	256	33

About 20% of the people predicted to make a purchase do make one.

Caravan.glm=glm(Purchase~.,family='binomial',data = Caravan.train)

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

Caravan.pred=predict(Caravan.glm,Caravan.test,type='response')

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading

Caravan.pred=ifelse(Caravan.pred>0.2,1,0)
kable(table(Caravan.test$Purchase,Caravan.pred))

	0	1
0	4183	350
1	231	58

Logistic regression performs worse than boosting, with 14% of the purchases being correctly predicted.

12

require(ISLR)

set.seed(42)
Boston.train=sample(1:nrow(Boston),nrow(Boston)/2)

x.train=model.matrix(crim~.:.^2,data=Boston[Boston.train,])
y.train=Boston$crim[Boston.train]

x.test=model.matrix(crim~.:.^2,data=Boston[-Boston.train,])
y.test=Boston$crim[-Boston.train]

require(glmnet)
tb=c()
grid=seq(0,0.5,0.002)

Boston.glmnet.cv=cv.glmnet(x.train,y.train,alpha=0,lambda=grid)   
 
plot(log(Boston.glmnet.cv$lambda),Boston.glmnet.cv$cvm,type='l',
     xlab='log(lambda)',ylab='Cross Validation MSE')

Boston.glmnet=glmnet(x.train,y.train,alpha=0,lambda=Boston.glmnet.cv$lambda.min)
Boston.pred=predict(Boston.glmnet,x.test)
tb=cbind(tb,'Ridge'=mean((Boston.pred-y.test)^2))

require(randomForest)

Boston.tree=tree(crim~.,data=Boston,subset=Boston.train)

set.seed(42)
Boston.tree.cv=cv.tree(Boston.tree,K = 5)
plot(Boston.tree.cv)

Boston.tree=prune.tree(Boston.tree,best=4)
Boston.pred=predict(Boston.tree,Boston[Boston.train,])
tb=cbind(tb,'Tree'=mean((Boston.pred-y.test)^2))

row.names(tb)=c('Test MSE')
kable(tb)

	Ridge	Tree
Test MSE	55.0314	145.6773

Introduction to Statistical Learning Solutions (ISLR)

Ch 8 Exercises

Table of Contents

Conceptual Exercises

Applied Exercises

Conceptual Exercises

1

2

3

4

4a

4b

5

6

Applied Exercises

7

8

8a

8b

8c

8d

8e

9

9a

9b

9c

9d

9e

9f-h

10

10a

10b

10c-d

10e

10f

10g

11

11a

11b

11c

12