## Question 3
p=seq(0,1,0.01)                            # proportion of observations in class 1
gini=2*p*(1-p)                             # Gini index (two-class case)
classerror=1-pmax(p,1-p)                   # classification error rate
crossentropy=-(p*log(p)+(1-p)*log(1-p))    # cross-entropy; NaN at p=0,1, so those endpoints are not drawn
plot(NA,NA,xlim=c(0,1),ylim=c(0,1),xlab='p',ylab='f')   # empty canvas for the three curves
lines(p,gini)
lines(p,classerror,col='blue')
lines(p,crossentropy,col='red')
legend(x='top',legend=c('gini','class error','cross entropy'),
col=c('black','blue','red'),lty=1,text.width=0.22)
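For reference, the three node-impurity measures plotted above are, with $p$ the proportion of observations in the first of two classes,

$$G = 2p(1-p), \qquad E = 1-\max(p,\,1-p), \qquad D = -p\log p - (1-p)\log(1-p).$$

All three are zero for a pure node ($p \in \{0,1\}$) and peak at $p=0.5$: Gini and classification error reach $0.5$ there, cross-entropy reaches $\log 2 \approx 0.69$.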
## Question 8
require(ISLR)    # Carseats and OJ data
require(tree)    # tree(), cv.tree(), prune.tree()
require(knitr)   # kable()
set.seed(42)
train=sample(1:nrow(Carseats),nrow(Carseats)/2)           # half the rows for training
tree.carseats=tree(Sales~.,data=Carseats,subset=train)    # regression tree on the training half
tree.pred=predict(tree.carseats,Carseats[-train,])
mean((tree.pred-Carseats[-train,'Sales'])^2)              # test MSE of the full tree
## [1] 5.686401
plot(tree.carseats)
text(tree.carseats)

tree.carseats.cv=cv.tree(tree.carseats)   # cost-complexity pruning with 10-fold CV (the default)
plot(tree.carseats.cv)
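As a small sketch (assuming the `tree.carseats.cv` object above), the subtree size minimizing the cross-validated deviance can be read off programmatically instead of from the plot:

best.size=tree.carseats.cv$size[which.min(tree.carseats.cv$dev)]   # size with the lowest CV deviance
best.size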

prune.carseats=prune.tree(tree.carseats,best=5)   # best subtree with 5 terminal nodes
plot(prune.carseats)
text(prune.carseats)

tree.pred=predict(prune.carseats,Carseats[-train,])
mean((tree.pred-Carseats[-train,'Sales'])^2)
## [1] 4.866831
plot(tree.pred,Carseats[-train,'Sales'],xlab='prediction',ylab='actual')
abline(0,1)
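Pruning to five terminal nodes lowers the test MSE from 5.69 to 4.87, so the smaller tree also generalizes better here.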

require(randomForest)
## Loading required package: randomForest
## randomForest 4.7-1
## Type rfNews() to see new features/changes/bug fixes.
d=ncol(Carseats)-1   # number of predictors (10)
set.seed(42)
carseats.rf=randomForest(Sales~.,data=Carseats,subset=train,mtry=d,importance=T,ntree=100)   # mtry=d, i.e. bagging
tree.pred=predict(carseats.rf,Carseats[-train,])
mean((tree.pred-Carseats[-train,'Sales'])^2)
## [1] 2.416614
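Bagging (trying all d=10 predictors at every split) roughly halves the test MSE again, to 2.42.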
plot(carseats.rf)

varImpPlot(carseats.rf)

kable(importance(carseats.rf))
|             |    %IncMSE | IncNodePurity |
|:------------|-----------:|--------------:|
| CompPrice   | 12.7572658 |    194.553633 |
| Income      |  3.6925644 |    114.061269 |
| Advertising |  8.4583279 |     99.943015 |
| Population  |  1.7756989 |     62.016441 |
| Price       | 24.8149447 |    428.177561 |
| ShelveLoc   | 24.9933231 |    428.448690 |
| Age         |  5.7722652 |    127.112993 |
| Education   |  1.2674696 |     43.842548 |
| Urban       | -0.2388463 |     10.176602 |
| US          | -0.0532919 |      7.770429 |
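As a quick sketch (assuming the fitted `carseats.rf` above), the same ranking can be pulled straight from the importance matrix rather than read off the table:

imp=importance(carseats.rf)                    # matrix with %IncMSE and IncNodePurity columns
imp[order(imp[,'%IncMSE'],decreasing=TRUE),]   # predictors ranked by %IncMSE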
mse=c()
set.seed(42)
for(i in 3:10){   # try mtry = 3,...,10
carseats.rf=randomForest(Sales~.,data=Carseats,subset=train,mtry=i,importance=T,ntree=100)
tree.pred=predict(carseats.rf,Carseats[-train,])
mse=c(mse,mean((tree.pred-Carseats[-train,'Sales'])^2))
}
plot(3:10,mse,type='b')
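A minimal follow-up, assuming the `mse` vector from the loop above: the best value of `mtry` can be picked programmatically.

mtry.grid=3:10
mtry.grid[which.min(mse)]   # mtry with the lowest test MSE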

set.seed(42)
carseats.rf=randomForest(Sales~.,data=Carseats,subset=train,mtry=9,importance=T,ntree=100)   # refit with mtry=9, chosen from the plot above
plot(carseats.rf)

varImpPlot(carseats.rf)

kable(importance(carseats.rf))
|             |    %IncMSE | IncNodePurity |
|:------------|-----------:|--------------:|
| CompPrice   | 12.9744625 |     197.97151 |
| Income      |  5.1316050 |     103.17128 |
| Advertising |  5.4144641 |     110.59823 |
| Population  | -0.4748975 |      63.45821 |
| Price       | 23.8930328 |     416.10410 |
| ShelveLoc   | 25.0436020 |     417.41569 |
| Age         |  4.1412728 |     133.36952 |
| Education   |  0.5282113 |      46.89271 |
| Urban       |  2.0743624 |      11.77462 |
| US          | -0.5563391 |       6.75921 |
From the tables above we can see that `ShelveLoc` is the most important predictor in terms of %IncMSE, followed closely by `Price`; the same two dominate in node purity.
## Question 9
set.seed(42)
train=sample(1:nrow(OJ),800)   # 800 of the 1070 observations for training
OJ.train=OJ[train,]
OJ.test=OJ[-train,]
OJ.tree=tree(Purchase~.,data=OJ.train)   # `Purchase` is the response; OJ has no `Buy` variable
summary(OJ.tree)
##
## Classification tree:
## tree(formula = Purchase ~ ., data = OJ.train)
## Variables actually used in tree construction:
## [1] "LoyalCH" "SalePriceMM" "PriceDiff"
## Number of terminal nodes: 8
## Residual mean deviance: 0.7392 = 585.5 / 792
## Misclassification error rate: 0.1638 = 131 / 800
OJ.tree
## node), split, n, deviance, yval, (yprob)
## * denotes terminal node
##
## 1) root 800 1066.00 CH ( 0.61500 0.38500 )
## 2) LoyalCH < 0.48285 285 296.00 MM ( 0.21404 0.78596 )
## 4) LoyalCH < 0.064156 64 0.00 MM ( 0.00000 1.00000 ) *
## 5) LoyalCH > 0.064156 221 260.40 MM ( 0.27602 0.72398 )
## 10) SalePriceMM < 2.04 128 123.50 MM ( 0.18750 0.81250 ) *
## 11) SalePriceMM > 2.04 93 125.00 MM ( 0.39785 0.60215 ) *
## 3) LoyalCH > 0.48285 515 458.10 CH ( 0.83689 0.16311 )
## 6) LoyalCH < 0.753545 230 282.70 CH ( 0.69565 0.30435 )
## 12) PriceDiff < 0.265 149 203.00 CH ( 0.57718 0.42282 )
## 24) PriceDiff < -0.165 32 38.02 MM ( 0.28125 0.71875 ) *
## 25) PriceDiff > -0.165 117 150.30 CH ( 0.65812 0.34188 )
## 50) LoyalCH < 0.703993 105 139.60 CH ( 0.61905 0.38095 ) *
## 51) LoyalCH > 0.703993 12 0.00 CH ( 1.00000 0.00000 ) *
## 13) PriceDiff > 0.265 81 47.66 CH ( 0.91358 0.08642 ) *
## 7) LoyalCH > 0.753545 285 111.70 CH ( 0.95088 0.04912 ) *
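To interpret a terminal node, take node 4: the 64 customers with `LoyalCH < 0.064156` all bought Minute Maid, so the node is pure (deviance 0) and predicts `MM` with probability 1.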
plot(OJ.tree)
text(OJ.tree)

OJ.pred.train=predict(OJ.tree,OJ.train,type = 'class')
kable(table(OJ.train[,'Purchase'],OJ.pred.train))                  # counts: rows = actual, columns = predicted
kable(table(OJ.train[,'Purchase'],OJ.pred.train)/nrow(OJ.train))   # same table as proportions of the training set

|    |      CH |      MM |
|:---|--------:|--------:|
| CH | 0.52750 | 0.08750 |
| MM | 0.07625 | 0.30875 |
OJ.pred.test=predict(OJ.tree,OJ.test,type = 'class')
kable(table(OJ.test[,'Purchase'],OJ.pred.test))
kable(table(OJ.test[,'Purchase'],OJ.pred.test)/nrow(OJ.test))
|    |        CH |        MM |
|:---|----------:|----------:|
| CH | 0.4629630 | 0.1333333 |
| MM | 0.0555556 | 0.3481481 |
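The off-diagonal cells give a test error rate of about 0.133 + 0.056 = 0.19. As a one-line check (assuming the objects above):

mean(OJ.pred.test!=OJ.test[,'Purchase'])   # test misclassification rate of the unpruned tree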
set.seed(42)
OJ.tree.cv=cv.tree(OJ.tree,K=10,FUN=prune.misclass)   # 10-fold CV guided by misclassification rate
plot(OJ.tree.cv)

OJ.tree=prune.misclass(OJ.tree,best=2)   # prune to a two-leaf tree (overwrites the full tree)
OJ.pred.train=predict(OJ.tree,OJ.train,type = 'class')
kable(table(OJ.train[,'Purchase'],OJ.pred.train))
kable(table(OJ.train[,'Purchase'],OJ.pred.train)/nrow(OJ.train))
|    |      CH |      MM |
|:---|--------:|--------:|
| CH | 0.53875 | 0.07625 |
| MM | 0.10500 | 0.28000 |
OJ.pred.test=predict(OJ.tree,OJ.test,type = 'class')
kable(table(OJ.test[,'Purchase'],OJ.pred.test))
kable(table(OJ.test[,'Purchase'],OJ.pred.test)/nrow(OJ.test))
|    |        CH |        MM |
|:---|----------:|----------:|
| CH | 0.4740741 | 0.1222222 |
| MM | 0.0962963 | 0.3074074 |
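The pruned two-leaf tree does worse on both halves: training error rises to roughly 0.105 + 0.076 = 0.18 (vs 0.164 unpruned) and test error to 0.096 + 0.122 = 0.22 (vs 0.19), so here pruning trades accuracy for interpretability. The same one-liner (assuming the objects above) confirms it:

mean(OJ.pred.test!=OJ.test[,'Purchase'])   # test misclassification rate of the pruned tree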
plot(OJ.tree)
text(OJ.tree)
