Notebook prepared by Everton Lima
pts=data.frame(c(25,25,75,75,70,80),c(75,25,75,50,25,25))
plot(pts,xlim=c(0,100),ylim=c(0,110),pch="",xlab="X1",ylab="X2",xaxt='n',yaxt='n')
lines(x = c(50,50), y = c(0,100))
lines(x = c(0,50), y = c(65,65))
lines(x = c(50,100), y = c(60,60))
lines(x = c(50,100), y = c(40,40))
lines(x = c(75,75), y = c(1,40))
text(x = 50, y = 108, labels = c("t1"), col = "red")
text(x = 0, y = 70, labels = c("t2"), col = "red")
text(x = 100, y = 65, labels = c("t3"), col = "red")
text(x = 100, y = 45, labels = c("t4"), col = "red")
text(x = 75, y = 1, labels = c("t5"), col = "red")
text(pts,labels=paste("R",1:6,sep=""))
In two dimensions, a tree partitions the predictor space into rectangular regions, and each region corresponds to a leaf node.
If only stumps are considered, then for a predictor \(X_j\) the stump takes the form \[\hat{f}_j(x_j)=\beta_0+I(K_j<X_j)\beta_1\]
Boosting then fits several stumps over the set of predictors, and a previously chosen predictor may be selected again. If a second stump is created for a previously chosen predictor \(X_j\), then we can write \(\hat{f}_j(x_j)\) as the sum of the two stumps: \[\hat{f}_j(x_j)=\hat{f}_j(x_j)_1 +\hat{f}_j(x_j)_2=\dot{\beta_0}+I(K_{j1}<X_j)\beta_1+I(K_{j2}<X_j)\beta_2\]
which, if \(K_{j1} < K_{j2}\), is equivalent to \[\dot{\beta_0}+I(K_{j1}<X_j<K_{j2})\beta_1+I(K_{j2}<X_j)(\beta_1+\beta_2)\]
This can be seen as adding a branch to the stump. Since every such function depends on a single predictor, the boosted model takes the additive form \(f(X)=\sum^p_{j=1}f_j(X_j)\), where \(f_j(X_j)=\lambda\hat{f}_j(x_j)\) is the shrunken sum of the stumps fit to \(X_j\).
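As a quick numerical check (not part of the original derivation), the sketch below boosts stumps with gbm on simulated data; the data, seed, and tuning values are assumed for illustration. With interaction.depth = 1 every tree splits on a single predictor, so the fitted model is additive in the predictors and the partial dependence on each \(X_j\) is a step function.
# Illustrative sketch on simulated data: boosting depth-1 trees (stumps)
# produces an additive fit f1(x1) + f2(x2).
require(gbm)
set.seed(1)
n = 500
x1 = runif(n); x2 = runif(n)
y = sin(2*pi*x1) + (x2 > 0.5) + rnorm(n, sd = 0.1)
sim = data.frame(y, x1, x2)
fit = gbm(y ~ x1 + x2, data = sim, distribution = 'gaussian',
          n.trees = 500, interaction.depth = 1, shrinkage = 0.05)
plot(fit, i.var = 'x1')  # partial dependence on x1: a step approximation of sin(2*pi*x1)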
p=seq(0,1,0.01)
gini= 2*p*(1-p)                            # Gini index for a two-class node
classerror= 1-pmax(p,1-p)                  # classification error rate
crossentropy= -(p*log(p)+(1-p)*log(1-p))   # cross-entropy (NaN at p = 0 and 1; lines() skips these points)
plot(NA,NA,xlim=c(0,1),ylim=c(0,1),xlab='p',ylab='f')
lines(p,gini,type='l')
lines(p,classerror,col='blue')
lines(p,crossentropy,col='red')
legend(x='top',legend=c('gini','class error','cross entropy'),
col=c('black','blue','red'),lty=1,text.width = 0.22)
This problem is best worked out on a piece of paper. The resulting tree has five leaf nodes, one for each rectangular region in the graph.
X1=seq(0,3,0.1)
X2=seq(0,3,0.1)
plot(NA,NA,xlim=c(-1.2,3),ylim=c(-0.2,3),pch="",xlab="X1",ylab="X2",xaxt='n',yaxt='n')
pts=data.frame(c(1.5,-0.5,1.5,-0.25,1.50),c(2.5,1.5,1.5,0.5,0.5))
text(pts,labels=c('2.49','-1.06','0.21','-1.80','0.63'))
text(x = 1, y = -0.2, labels = c("1"), col = "red")
text(x = 0, y = 0.8, labels = c("0"), col = "red")
text(x = -1.2, y = 1, labels = c("1"), col = "red")
text(x = -1.2, y = 2, labels = c("2"), col = "red")
lines(x = c(-1,3), y = c(1,1))
lines(x = c(-1,3), y = c(2,2))
lines(x = c(0,0), y = c(1,2))
lines(x = c(1,1), y = c(0,1))
Under the majority-vote approach we assign the observation to the class predicted most often by the ten bootstrapped trees. Using a 0.5 cutoff to turn each estimated probability into a class prediction, six of the ten trees predict Red, so, as shown below, X is assigned to the Red class.
mjr=c(0.1,0.15,0.2,0.2,0.55,0.6,0.6,0.65,0.7,0.75)
table(mjr>0.5)
##
## FALSE TRUE
## 4 6
As described in the question, when the average probability is used we simply average the ten estimated probabilities from the bagged trees. The average is 0.45, which is below the 0.5 cutoff, so X is not assigned to the Red class.
avg=mean(mjr)
avg
## [1] 0.45
A regression tree recursively partitions the predictor space. At each step the algorithm considers every possible split over the set of predictors and cutpoints, choosing the split that achieves the greatest reduction in the RSS. Splitting is repeated until a stopping criterion is reached (for example, a minimum number of observations per node). To make a prediction, a new observation is passed down the branches of the tree, and the mean response of the training observations in the leaf it reaches is returned.
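To make the split-selection step concrete, here is a minimal sketch (not from the original notebook) that scans the candidate cutpoints of a single predictor and returns the one minimizing the RSS of the two resulting regions; predicting medv from lstat in the Boston data is an assumed example.
# Minimal sketch: pick the cutpoint s for one predictor x that minimizes the
# RSS of the regions {x < s} and {x >= s}.
require(MASS)   # Boston data used as an assumed example
best.split=function(x,y){
  cuts=sort(unique(x))[-1]   # candidate cutpoints; drop the minimum so both sides are non-empty
  rss=sapply(cuts,function(s){
    left=y[x<s]; right=y[x>=s]
    sum((left-mean(left))^2)+sum((right-mean(right))^2)
  })
  cuts[which.min(rss)]
}
best.split(Boston$lstat,Boston$medv)  # best first cutpoint for lstat alone when predicting medv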
require(randomForest)
## Loading required package: randomForest
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
require(knitr)
## Loading required package: knitr
require(MASS)
## Loading required package: MASS
set.seed(1)
train=sample(1:nrow(Boston),nrow(Boston)/2)
train.rf=function(m.mtry=6,m.ntree=25){
  rf.boston=randomForest(medv~.,data=Boston,subset=train,mtry=m.mtry,ntree=m.ntree)
  rf.pred=predict(rf.boston,Boston[-train,])
  mse=mean((rf.pred-Boston[-train,'medv'])^2)
  mse
}
# This may take a few minutes.
mse=c()
m=c()
min.mtry=10
max.mtry=16
min.ntree=80
max.ntree=100
for(mtry in min.mtry:max.mtry){
for(ntree in min.ntree:max.ntree){
mse=rbind(mse,train.rf(mtry,ntree))
}
m=cbind(m,mse)
mse=c()
}
rownames(m)=paste('ntree',min.ntree:max.ntree)
colnames(m)=paste('mtry',min.mtry:max.mtry)
kable(m)
| | mtry 10 | mtry 11 | mtry 12 | mtry 13 | mtry 14 | mtry 15 | mtry 16 |
|---|---|---|---|---|---|---|---|
| ntree 80 | 21253.00 | 21255.46 | 21256.60 | 21254.51 | 21266.45 | 21263.31 | 21248.03 |
| ntree 81 | 21259.58 | 21248.02 | 21247.25 | 21255.23 | 21259.82 | 21256.71 | 21257.66 |
| ntree 82 | 21252.62 | 21252.46 | 21257.66 | 21256.55 | 21258.38 | 21253.52 | 21254.67 |
| ntree 83 | 21261.70 | 21256.26 | 21252.17 | 21259.09 | 21254.70 | 21255.21 | 21252.04 |
| ntree 84 | 21257.95 | 21250.39 | 21256.35 | 21252.84 | 21254.71 | 21257.51 | 21254.78 |
| ntree 85 | 21254.83 | 21252.63 | 21260.12 | 21253.57 | 21257.85 | 21253.35 | 21254.40 |
| ntree 86 | 21250.27 | 21261.40 | 21257.26 | 21258.94 | 21258.78 | 21255.56 | 21253.93 |
| ntree 87 | 21266.40 | 21255.95 | 21257.92 | 21254.47 | 21255.26 | 21251.51 | 21253.59 |
| ntree 88 | 21250.06 | 21251.44 | 21262.93 | 21254.97 | 21255.09 | 21255.29 | 21252.46 |
| ntree 89 | 21260.38 | 21245.53 | 21256.82 | 21257.45 | 21258.35 | 21253.71 | 21256.94 |
| ntree 90 | 21257.75 | 21255.81 | 21259.01 | 21257.02 | 21253.94 | 21250.75 | 21254.97 |
| ntree 91 | 21263.07 | 21257.93 | 21251.67 | 21257.78 | 21255.10 | 21248.67 | 21255.10 |
| ntree 92 | 21262.10 | 21251.81 | 21253.94 | 21255.26 | 21256.90 | 21253.74 | 21252.68 |
| ntree 93 | 21252.34 | 21256.05 | 21252.98 | 21252.89 | 21261.52 | 21249.11 | 21256.05 |
| ntree 94 | 21261.40 | 21251.93 | 21257.62 | 21251.49 | 21259.29 | 21259.00 | 21256.44 |
| ntree 95 | 21254.47 | 21250.77 | 21249.82 | 21254.59 | 21256.67 | 21248.67 | 21251.23 |
| ntree 96 | 21248.51 | 21261.74 | 21250.46 | 21256.93 | 21255.23 | 21249.72 | 21253.93 |
| ntree 97 | 21253.65 | 21257.56 | 21250.37 | 21258.00 | 21256.40 | 21245.51 | 21263.49 |
| ntree 98 | 21254.40 | 21252.46 | 21259.18 | 21262.46 | 21254.21 | 21256.51 | 21257.26 |
| ntree 99 | 21258.71 | 21258.61 | 21254.59 | 21258.08 | 21255.51 | 21262.09 | 21258.87 |
| ntree 100 | 21256.51 | 21256.90 | 21253.87 | 21260.11 | 21257.27 | 21258.65 | 21255.76 |
cols=rainbow(max.mtry)
plot(min.ntree:max.ntree,m[,1],col=cols[1],type='l',ylab='Hold out test error',xlab='ntree',lty=1)
lines(min.ntree:max.ntree,m[,2],col=cols[2],lty=2)
lines(min.ntree:max.ntree,m[,3],col=cols[3],lty=3)
lines(min.ntree:max.ntree,m[,4],col=cols[4],lty=4)
lines(min.ntree:max.ntree,m[,5],col=cols[5],lty=5)
legend(x='topright',legend=paste('mtry',10:14),col=cols[1:5],lty=1:5,ncol=1,text.width = 3)
plot(min.mtry:max.mtry,m[1,],col=cols[1],type='l',
ylab='Hold out test error',xlab='mtry',ylim = c(21245,21270),lty=1)
lines(min.mtry:max.mtry,m[2,],col=cols[2],lty=2)
lines(min.mtry:max.mtry,m[3,],col=cols[3],lty=3)
lines(min.mtry:max.mtry,m[4,],col=cols[4],lty=4)
lines(min.mtry:max.mtry,m[5,],col=cols[5],lty=5)
legend(x='topright',legend=paste('ntree',80:84),col=cols[1:5],lty=1:5,ncol=1,text.width = 1)
require(ISLR)
## Loading required package: ISLR
require(tree)
## Loading required package: tree
set.seed(42)
train=sample(1:nrow(Carseats),nrow(Carseats)/2)
tree.carseats=tree(formula=Sales~.,data=Carseats,subset = train)
tree.pred=predict(tree.carseats,Carseats[-train,])
mean((tree.pred-Carseats[-train,'Sales'])^2)
## [1] 4.092899
plot(tree.carseats)
text(tree.carseats)
tree.carseats.cv=cv.tree(tree.carseats)
plot(tree.carseats.cv)
prune.carseats=prune.tree(tree.carseats,best=5)
plot(prune.carseats)
text(prune.carseats)
tree.pred=predict(prune.carseats,Carseats[-train,])
mean((tree.pred-Carseats[-train,'Sales'])^2)
## [1] 4.991476
plot(tree.pred,Carseats[-train,'Sales'],xlab='prediction',ylab='actual')
abline(0,1)
Comparing the two test MSE values, pruning to five terminal nodes does not improve the test MSE: it rises from about 4.09 for the full tree to about 4.99 for the pruned tree.
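Rather than fixing best=5 by hand, the size with the smallest cross-validated deviance can be read off the cv.tree object; a short sketch is below (which size it selects depends on the CV run).
# Prune to the size with the lowest cross-validated deviance instead of best=5.
best.size=tree.carseats.cv$size[which.min(tree.carseats.cv$dev)]
best.size
prune.carseats.cv=prune.tree(tree.carseats,best=best.size)
mean((predict(prune.carseats.cv,Carseats[-train,])-Carseats[-train,'Sales'])^2)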
require(randomForest)
d=ncol(Carseats)-1
set.seed(42)
carseats.rf=randomForest(Sales~.,data=Carseats,subset=train,mtry=d,importance=T,ntree=100)
tree.pred=predict(carseats.rf,Carseats[-train,])
mean((tree.pred-Carseats[-train,'Sales'])^2)
## [1] 2.750273
plot(carseats.rf)
varImpPlot(carseats.rf)
kable(importance(carseats.rf))
| | %IncMSE | IncNodePurity |
|---|---|---|
| CompPrice | 11.5170922 | 148.297954 |
| Income | 3.3430145 | 69.192503 |
| Advertising | 6.2062812 | 88.251686 |
| Population | -0.2522175 | 63.025904 |
| Price | 25.4921858 | 442.200618 |
| ShelveLoc | 25.8775065 | 422.649283 |
| Age | 7.0625363 | 132.928876 |
| Education | -1.3343248 | 30.809743 |
| Urban | -1.5756286 | 9.061747 |
| US | 1.3535718 | 7.374875 |
Price and ShelveLoc are clearly the most important predictors of Sales. This bagged model also achieves a much lower test MSE than the single tree, cutting it almost in half (2.75 versus 4.99 for the pruned tree).
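As an optional extra view (not part of the original output), partialPlot from randomForest shows the marginal effect of Price on the bagged model's predicted Sales.
# Partial dependence of predicted Sales on Price under the bagged model,
# using the training rows as the background data.
partialPlot(carseats.rf,pred.data=Carseats[train,],x.var='Price',
            xlab='Price',ylab='Predicted Sales')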
mse=c()
set.seed(42)
for(i in 3:10){
  carseats.rf=randomForest(Sales~.,data=Carseats,subset=train,mtry=i,importance=T,ntree=100)
  tree.pred=predict(carseats.rf,Carseats[-train,])
  mse=rbind(mse,mean((tree.pred-Carseats[-train,'Sales'])^2))
}
plot(3:10,mse,type='b')
The plot above displays the effect of mtry on the test MSE.
require(randomForest)
set.seed(42)
carseats.rf=randomForest(Sales~.,data=Carseats,subset=train,mtry=9,importance=T,ntree=100)
plot(carseats.rf)
varImpPlot(carseats.rf)
kable(importance(carseats.rf))
| | %IncMSE | IncNodePurity |
|---|---|---|
| CompPrice | 9.0678902 | 138.620007 |
| Income | 2.3427697 | 64.333859 |
| Advertising | 5.7404558 | 94.148559 |
| Population | 1.4682674 | 70.206086 |
| Price | 25.1750997 | 440.957346 |
| ShelveLoc | 23.6551259 | 408.393880 |
| Age | 6.2210635 | 136.281793 |
| Education | -0.1427487 | 33.300827 |
| Urban | -2.0875662 | 10.667281 |
| US | 0.2117698 | 7.227559 |
From the table above, Price remains the most important predictor in terms of %IncMSE (permuting it produces the largest increase in out-of-bag MSE), with ShelveLoc close behind. Moreover, although each split now considers only 9 of the 10 predictors, the test MSE is slightly higher than with the full bagging approach.
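An alternative way to choose mtry (not used in the original) is randomForest's tuneRF, which searches over mtry values using the out-of-bag error; the starting value and step below are assumptions.
# OOB-based search over mtry; Sales is the first column of Carseats, so it is
# dropped from the predictor data frame.
set.seed(42)
tuneRF(x=Carseats[train,-1],y=Carseats[train,'Sales'],
       mtryStart=3,ntreeTry=100,stepFactor=2,improve=0.01)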
# (Assumed) train/test split: 800 randomly sampled training observations and the
# remaining 270 as the test set, as the exercise specifies; the seed is illustrative.
set.seed(42)
OJ.train.id=sample(1:nrow(OJ),800)
OJ.train=OJ[OJ.train.id,]
OJ.test=OJ[-OJ.train.id,]
OJ.tree=tree(Purchase~.,data=OJ.train) # There is no predictor `Buy`.
summary(OJ.tree)
##
## Classification tree:
## tree(formula = Purchase ~ ., data = OJ.train)
## Variables actually used in tree construction:
## [1] "LoyalCH" "PriceDiff"
## Number of terminal nodes: 8
## Residual mean deviance: 0.7484 = 592.7 / 792
## Misclassification error rate: 0.1762 = 141 / 800
OJ.tree
## node), split, n, deviance, yval, (yprob)
## * denotes terminal node
##
## 1) root 800 1075.00 CH ( 0.60250 0.39750 )
## 2) LoyalCH < 0.5036 351 405.90 MM ( 0.26496 0.73504 )
## 4) LoyalCH < 0.0356415 63 0.00 MM ( 0.00000 1.00000 ) *
## 5) LoyalCH > 0.0356415 288 362.30 MM ( 0.32292 0.67708 )
## 10) LoyalCH < 0.276142 115 109.30 MM ( 0.18261 0.81739 ) *
## 11) LoyalCH > 0.276142 173 234.90 MM ( 0.41618 0.58382 )
## 22) PriceDiff < 0.05 73 76.78 MM ( 0.21918 0.78082 ) *
## 23) PriceDiff > 0.05 100 137.20 CH ( 0.56000 0.44000 ) *
## 3) LoyalCH > 0.5036 449 353.10 CH ( 0.86637 0.13363 )
## 6) LoyalCH < 0.705326 146 175.20 CH ( 0.71233 0.28767 )
## 12) PriceDiff < 0.265 89 121.50 CH ( 0.57303 0.42697 ) *
## 13) PriceDiff > 0.265 57 28.97 CH ( 0.92982 0.07018 ) *
## 7) LoyalCH > 0.705326 303 136.50 CH ( 0.94059 0.05941 )
## 14) PriceDiff < -0.39 14 19.12 CH ( 0.57143 0.42857 ) *
## 15) PriceDiff > -0.39 289 99.85 CH ( 0.95848 0.04152 ) *
A possible terminal node is PriceDiff < -0.165 27 30.90 MM ( 0.25926 0.74074 ) *. This node is a child of the splits LoyalCH < 0.764572 183 216.60 CH ( 0.72131 0.27869 ) and LoyalCH > 0.5036 438 360.80 CH ( 0.85616 0.14384 ), which branch off the root node; the complementary branch of the root, LoyalCH < 0.5036 362 432.40 MM ( 0.28453 0.71547 ), covers the remaining observations.
The tree therefore predicts that the customer will purchase Minute Maid when Citrus Hill's sale price exceeds Minute Maid's by at least 0.165 (PriceDiff < -0.165) and the customer's loyalty to Citrus Hill lies in the interval (0.5, 0.76).
plot(OJ.tree)
text(OJ.tree)
From the two branches of the root node we can make some quick observations. In the left subtree, where customer loyalty to Citrus Hill is relatively low, Minute Maid is usually the predicted purchase; the exception is when Citrus Hill is priced competitively (PriceDiff > 0.05) and loyalty is not too low, in which case Citrus Hill is predicted. In the right subtree, where loyalty to Citrus Hill is high, the model predicts Citrus Hill in every terminal node, although the predicted probability of Citrus Hill drops when loyalty is only moderate or Minute Maid is substantially cheaper.
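To trace a single customer through the fitted tree, one can take a test observation, set LoyalCH and PriceDiff to values of interest, and ask the tree for its prediction; the values below are purely illustrative.
# Illustrative only: a hypothetical customer with moderate Citrus Hill loyalty
# and Minute Maid on sale at a lower price (assumed values).
cust=OJ.test[1,]
cust$LoyalCH=0.60
cust$PriceDiff=-0.20
predict(OJ.tree,newdata=cust,type='class')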
OJ.pred.train=predict(OJ.tree,OJ.train,type = 'class')
kable(table(OJ.train[,'Purchase'],OJ.pred.train))
| | CH | MM |
|---|---|---|
| CH | 445 | 37 |
| MM | 104 | 214 |
kable(table(OJ.train[,'Purchase'],OJ.pred.train)/nrow(OJ.train))
| | CH | MM |
|---|---|---|
| CH | 0.55625 | 0.04625 |
| MM | 0.13000 | 0.26750 |
OJ.pred.test=predict(OJ.tree,OJ.test,type = 'class')
kable(table(OJ.test[,'Purchase'],OJ.pred.test))
| | CH | MM |
|---|---|---|
| CH | 159 | 12 |
| MM | 38 | 61 |
kable(table(OJ.test[,'Purchase'],OJ.pred.test)/nrow(OJ.test))
| | CH | MM |
|---|---|---|
| CH | 0.5888889 | 0.0444444 |
| MM | 0.1407407 | 0.2259259 |
From the second table we can see that about 59% of the test observations are correctly predicted as Citrus Hill and about 23% are correctly predicted as Minute Maid, giving a test misclassification error rate of roughly 18.5%.
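Equivalently, the test error rate can be computed directly from the predicted classes (a small check, not in the original).
# Test misclassification rate computed directly from the predictions.
mean(OJ.pred.test!=OJ.test[,'Purchase'])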
set.seed(42)
OJ.tree.cv=cv.tree(OJ.tree,K = 10,FUN = prune.misclass)
plot(OJ.tree.cv)
From the plot the optimal size is 2.
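The same choice can be read off the cv.tree object programmatically; taking the smallest size that attains the minimum deviance matches the usual preference for the simplest competitive tree.
# Smallest tree size attaining the minimum cross-validated deviance (here, 2).
min(OJ.tree.cv$size[OJ.tree.cv$dev==min(OJ.tree.cv$dev)])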
OJ.tree=prune.misclass(OJ.tree,best = 2)
OJ.pred.train=predict(OJ.tree,OJ.train,type = 'class')
kable(table(OJ.train[,'Purchase'],OJ.pred.train))
| | CH | MM |
|---|---|---|
| CH | 389 | 93 |
| MM | 60 | 258 |
kable(table(OJ.train[,'Purchase'],OJ.pred.train)/nrow(OJ.train))
| | CH | MM |
|---|---|---|
| CH | 0.48625 | 0.11625 |
| MM | 0.07500 | 0.32250 |
OJ.pred.test=predict(OJ.tree,OJ.test,type = 'class')
kable(table(OJ.test[,'Purchase'],OJ.pred.test))
| | CH | MM |
|---|---|---|
| CH | 131 | 40 |
| MM | 21 | 78 |
kable(table(OJ.test[,'Purchase'],OJ.pred.test)/nrow(OJ.test))
| | CH | MM |
|---|---|---|
| CH | 0.4851852 | 0.1481481 |
| MM | 0.0777778 | 0.2888889 |
plot(OJ.tree)
text(OJ.tree)
A tree with two leaf nodes achieves a test misclassification rate of about 23%, compared with roughly 18% for the unpruned tree, and its training error (about 19%) is also slightly higher than the unpruned tree's (about 18%). Here pruning buys a much simpler tree at the cost of some accuracy; on this split of the data the extra splits of the unpruned tree carry real signal, even though trees in general can easily overfit.
require(ISLR)
Hitters.unknownSal=is.na(Hitters[,"Salary"])
Hitters=Hitters[!Hitters.unknownSal,]
Hitters[,"Salary"]=log(Hitters[,"Salary"])
summary(Hitters)
## AtBat Hits HmRun Runs
## Min. : 19.0 Min. : 1.0 Min. : 0.00 Min. : 0.00
## 1st Qu.:282.5 1st Qu.: 71.5 1st Qu.: 5.00 1st Qu.: 33.50
## Median :413.0 Median :103.0 Median : 9.00 Median : 52.00
## Mean :403.6 Mean :107.8 Mean :11.62 Mean : 54.75
## 3rd Qu.:526.0 3rd Qu.:141.5 3rd Qu.:18.00 3rd Qu.: 73.00
## Max. :687.0 Max. :238.0 Max. :40.00 Max. :130.00
## RBI Walks Years CAtBat
## Min. : 0.00 Min. : 0.00 Min. : 1.000 Min. : 19.0
## 1st Qu.: 30.00 1st Qu.: 23.00 1st Qu.: 4.000 1st Qu.: 842.5
## Median : 47.00 Median : 37.00 Median : 6.000 Median : 1931.0
## Mean : 51.49 Mean : 41.11 Mean : 7.312 Mean : 2657.5
## 3rd Qu.: 71.00 3rd Qu.: 57.00 3rd Qu.:10.000 3rd Qu.: 3890.5
## Max. :121.00 Max. :105.00 Max. :24.000 Max. :14053.0
## CHits CHmRun CRuns CRBI
## Min. : 4.0 Min. : 0.00 Min. : 2.0 Min. : 3.0
## 1st Qu.: 212.0 1st Qu.: 15.00 1st Qu.: 105.5 1st Qu.: 95.0
## Median : 516.0 Median : 40.00 Median : 250.0 Median : 230.0
## Mean : 722.2 Mean : 69.24 Mean : 361.2 Mean : 330.4
## 3rd Qu.:1054.0 3rd Qu.: 92.50 3rd Qu.: 497.5 3rd Qu.: 424.5
## Max. :4256.0 Max. :548.00 Max. :2165.0 Max. :1659.0
## CWalks League Division PutOuts Assists
## Min. : 1.0 A:139 E:129 Min. : 0.0 Min. : 0.0
## 1st Qu.: 71.0 N:124 W:134 1st Qu.: 113.5 1st Qu.: 8.0
## Median : 174.0 Median : 224.0 Median : 45.0
## Mean : 260.3 Mean : 290.7 Mean :118.8
## 3rd Qu.: 328.5 3rd Qu.: 322.5 3rd Qu.:192.0
## Max. :1566.0 Max. :1377.0 Max. :492.0
## Errors Salary NewLeague
## Min. : 0.000 Min. :4.212 A:141
## 1st Qu.: 3.000 1st Qu.:5.247 N:122
## Median : 7.000 Median :6.052
## Mean : 8.593 Mean :5.927
## 3rd Qu.:13.000 3rd Qu.:6.620
## Max. :32.000 Max. :7.808
require(gbm)
## Loading required package: gbm
## Loading required package: survival
## Loading required package: lattice
## Loading required package: splines
## Loading required package: parallel
## Loaded gbm 2.1.1
# (Assumed) split: the first 200 observations form the training set and the rest
# the test set, as the exercise specifies.
Hitters.train=Hitters[1:200,]
Hitters.test=Hitters[-(1:200),]
train.mse=c()
test.mse=c()
for(shr in seq(0,0.08,0.002)){
Hitters.gbm=gbm(Salary~.,data=Hitters.train,shrinkage = shr,n.trees = 1000,distribution = 'gaussian')
Hitters.pred=predict(Hitters.gbm,Hitters.train,n.trees = 1000)
train.mse=rbind(train.mse,mean((Hitters.pred-Hitters.train[,'Salary'])^2))
Hitters.pred=predict(Hitters.gbm,Hitters.test,n.trees = 1000)
test.mse=rbind(test.mse,mean((Hitters.pred-Hitters.test[,'Salary'])^2))
}
plot(seq(0,0.08,0.002),train.mse,type='l',xlab='shrinkage',xlim = c(0.003,0.07),ylab='MSE')
lines(seq(0,0.08,0.002),test.mse,col='red')
legend(x='top',legend = c('train MSE','test MSE'),col=c('black','red'),lty=1,text.width = 0.005)
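The shrinkage value attaining the lowest hold-out MSE can be pulled from the grid directly (the exact value depends on the run).
# Shrinkage value with the lowest test MSE over the grid searched above.
shr.grid=seq(0,0.08,0.002)
shr.grid[which.min(test.mse)]
min(test.mse)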
tb=c()
Hitters.gbm=gbm(Salary~.,data=Hitters.train,shrinkage = 0.01,n.trees = 1000,distribution = 'gaussian')
Hitters.pred=predict(Hitters.gbm,Hitters.test,n.trees = 1000)
tb=cbind(tb,'Boost'=mean((Hitters.pred-Hitters.test[,'Salary'])^2))
# Ch3 - linear regression
Hitters.lm=lm(Salary~.,Hitters.train)
Hitters.pred=predict(Hitters.lm,Hitters.test)
tb=cbind(tb,'Linear'=mean((Hitters.pred-Hitters.test[,'Salary'])^2))
# Ch6 - ridge regression
require(glmnet)
## Loading required package: glmnet
## Loading required package: Matrix
## Loading required package: foreach
## Loaded glmnet 2.0-5
x = model.matrix(Salary ~ ., data = Hitters.train)
x.test = model.matrix(Salary ~ ., data = Hitters.test)
y = Hitters.train$Salary
# Choose lambda by cross-validation and evaluate at lambda.min, rather than
# averaging predictions over the entire lambda path.
set.seed(42)
Hitters.glmnet.cv=cv.glmnet(x,y,alpha = 0)
Hitters.pred=predict(Hitters.glmnet.cv,x.test,s='lambda.min')
tb=cbind(tb,'Ridge'=mean((Hitters.pred-Hitters.test[,'Salary'])^2))
kable(tb)
| Boost | Linear | Ridge |
|---|---|---|
| 0.2752713 | 0.4917959 | 0.5145349 |
kable(summary(Hitters.gbm),row.names = F)
| var | rel.inf |
|---|---|
| CAtBat | 29.7469615 |
| CHits | 10.7783367 |
| CWalks | 8.6555091 |
| CRBI | 8.3272748 |
| CRuns | 7.8404149 |
| CHmRun | 6.2653705 |
| Years | 6.0026542 |
| Hits | 4.2452579 |
| Walks | 3.9946187 |
| RBI | 3.3923403 |
| PutOuts | 2.6646695 |
| AtBat | 1.9452653 |
| HmRun | 1.6876641 |
| Errors | 1.1948217 |
| Assists | 1.1196293 |
| Runs | 1.1075200 |
| Division | 0.7625687 |
| NewLeague | 0.1462979 |
| League | 0.1228249 |
require(ISLR)
Caravan$Purchase=ifelse(Caravan$Purchase == "Yes",1,0)
Caravan.train=Caravan[1:1000,]
Caravan.test=Caravan[-c(1:1000),]
require(gbm)
Caravan.gbm=gbm(Purchase~.,data=Caravan.train,n.trees = 1000,shrinkage = 0.01,distribution = "bernoulli")
## Warning in gbm.fit(x, y, offset = offset, distribution = distribution, w =
## w, : variable 50: PVRAAUT has no variation.
## Warning in gbm.fit(x, y, offset = offset, distribution = distribution, w =
## w, : variable 71: AVRAAUT has no variation.
kable(summary(Caravan.gbm),row.names = F)
| var | rel.inf |
|---|---|
| PPERSAUT | 15.2804738 |
| MKOOPKLA | 9.6328218 |
| MOPLHOOG | 6.5148600 |
| MBERMIDD | 5.4610803 |
| PBRAND | 5.0235877 |
| ABRAND | 4.3515849 |
| MGODGE | 4.3333834 |
| MINK3045 | 4.0177164 |
| MOSTYPE | 3.0230879 |
| MAUT1 | 2.6348897 |
| MAUT2 | 2.2733199 |
| PWAPART | 2.2236415 |
| MSKA | 2.0834968 |
| MGODPR | 2.0023134 |
| MSKC | 1.8299210 |
| MBERHOOG | 1.8223698 |
| MBERARBG | 1.7419077 |
| MINKGEM | 1.6665703 |
| MGODOV | 1.6356941 |
| MFWEKIND | 1.6335492 |
| PBYSTAND | 1.5406059 |
| MRELGE | 1.4323479 |
| MINK7512 | 1.2719825 |
| MSKB1 | 1.2004837 |
| MGODRK | 1.1870963 |
| MRELOV | 1.1229476 |
| MHHUUR | 1.0384539 |
| MINKM30 | 0.9373363 |
| MBERARBO | 0.9231986 |
| MFGEKIND | 0.8635693 |
| MBERBOER | 0.7517833 |
| MHKOOP | 0.7376097 |
| APERSAUT | 0.6923477 |
| MGEMLEEF | 0.6868871 |
| MINK4575 | 0.6658493 |
| MOPLMIDD | 0.6604653 |
| MZFONDS | 0.6291890 |
| PMOTSCO | 0.6019984 |
| MAUT0 | 0.5908220 |
| MSKD | 0.4867065 |
| MZPART | 0.4750920 |
| MGEMOMV | 0.4280569 |
| MINK123M | 0.3610230 |
| MOSHOOFD | 0.3560752 |
| MSKB2 | 0.3545356 |
| PLEVEN | 0.3176596 |
| MFALLEEN | 0.1692333 |
| MBERZELF | 0.1659141 |
| MRELSA | 0.1644603 |
| MAANTHUI | 0.0000000 |
| MOPLLAAG | 0.0000000 |
| PWABEDR | 0.0000000 |
| PWALAND | 0.0000000 |
| PBESAUT | 0.0000000 |
| PVRAAUT | 0.0000000 |
| PAANHANG | 0.0000000 |
| PTRACTOR | 0.0000000 |
| PWERKT | 0.0000000 |
| PBROM | 0.0000000 |
| PPERSONG | 0.0000000 |
| PGEZONG | 0.0000000 |
| PWAOREG | 0.0000000 |
| PZEILPL | 0.0000000 |
| PPLEZIER | 0.0000000 |
| PFIETS | 0.0000000 |
| PINBOED | 0.0000000 |
| AWAPART | 0.0000000 |
| AWABEDR | 0.0000000 |
| AWALAND | 0.0000000 |
| ABESAUT | 0.0000000 |
| AMOTSCO | 0.0000000 |
| AVRAAUT | 0.0000000 |
| AAANHANG | 0.0000000 |
| ATRACTOR | 0.0000000 |
| AWERKT | 0.0000000 |
| ABROM | 0.0000000 |
| ALEVEN | 0.0000000 |
| APERSONG | 0.0000000 |
| AGEZONG | 0.0000000 |
| AWAOREG | 0.0000000 |
| AZEILPL | 0.0000000 |
| APLEZIER | 0.0000000 |
| AFIETS | 0.0000000 |
| AINBOED | 0.0000000 |
| ABYSTAND | 0.0000000 |
Caravan.pred=predict(Caravan.gbm,Caravan.test,n.trees = 1000,type='response')
Caravan.pred=ifelse(Caravan.pred>0.2,1,0)
kable(table(Caravan.test$Purchase,Caravan.pred))
| | 0 | 1 |
|---|---|---|
| 0 | 4412 | 121 |
| 1 | 256 | 33 |
About 21% (33 of 154) of the people predicted to make a purchase actually do so.
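The fraction quoted above can be computed from the confusion matrix (a small check).
# Fraction of predicted purchasers who actually purchase: 33/(121+33).
tab=table(Caravan.test$Purchase,Caravan.pred)
tab['1','1']/sum(tab[,'1'])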
Caravan.glm=glm(Purchase~.,family='binomial',data = Caravan.train)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
Caravan.pred=predict(Caravan.glm,Caravan.test,type='response')
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
Caravan.pred=ifelse(Caravan.pred>0.2,1,0)
kable(table(Caravan.test$Purchase,Caravan.pred))
| | 0 | 1 |
|---|---|---|
| 0 | 4183 | 350 |
| 1 | 231 | 58 |
Logistic regression performs worse than boosting: only about 14% (58 of 408) of the customers it flags as purchasers actually make a purchase.
require(ISLR)
set.seed(42)
Boston.train=sample(1:nrow(Boston),nrow(Boston)/2)
x.train=model.matrix(crim~.:.^2,data=Boston[Boston.train,])
y.train=Boston$crim[Boston.train]
x.test=model.matrix(crim~.:.^2,data=Boston[-Boston.train,])
y.test=Boston$crim[-Boston.train]
require(glmnet)
tb=c()
grid=seq(0,0.5,0.002)
Boston.glmnet.cv=cv.glmnet(x.train,y.train,alpha=0,lambda=grid)
plot(log(Boston.glmnet.cv$lambda),Boston.glmnet.cv$cvm,type='l',
xlab='log(lambda)',ylab='Cross Validation MSE')
Boston.glmnet=glmnet(x.train,y.train,alpha=0,lambda=Boston.glmnet.cv$lambda.min)
Boston.pred=predict(Boston.glmnet,x.test)
tb=cbind(tb,'Ridge'=mean((Boston.pred-y.test)^2))
require(randomForest)
Boston.tree=tree(crim~.,data=Boston,subset=Boston.train)
set.seed(42)
Boston.tree.cv=cv.tree(Boston.tree,K = 5)
plot(Boston.tree.cv)
Boston.tree=prune.tree(Boston.tree,best=4)
Boston.pred=predict(Boston.tree,Boston[-Boston.train,])
tb=cbind(tb,'Tree'=mean((Boston.pred-y.test)^2))
row.names(tb)=c('Test MSE')
kable(tb)
| | Ridge | Tree |
|---|---|---|
| Test MSE | 55.0314 | 145.6773 |