## Question 3

# Compare three node-impurity measures for a two-class problem as a function
# of p, the proportion of observations belonging to the first class.
p <- seq(0, 1, 0.01)

gini <- 2 * p * (1 - p)
classerror <- 1 - pmax(p, 1 - p)

# At p = 0 and p = 1 the formula evaluates 0 * log(0) = 0 * -Inf = NaN, which
# lines() silently skips, so the curve never reached the axis at either end.
# Replace those NaNs with the limiting value (q * log(q) -> 0 as q -> 0).
crossentropy <- -(p * log(p) + (1 - p) * log(1 - p))
crossentropy[is.nan(crossentropy)] <- 0

# Empty canvas first so that all three curves share the same axes.
plot(NA, NA, xlim = c(0, 1), ylim = c(0, 1), xlab = "p", ylab = "f")

lines(p, gini, type = "l")
lines(p, classerror, col = "blue")
lines(p, crossentropy, col = "red")

legend(x = "top", legend = c("gini", "class error", "cross entropy"),
       col = c("black", "blue", "red"), lty = 1, text.width = 0.22)

## Question 8

# Hold out half of the Carseats rows as a test set (seed fixed so the split
# is reproducible).
set.seed(42)
train <- sample(seq_len(nrow(Carseats)), nrow(Carseats) / 2)

# Grow a regression tree for Sales on the training rows and compute its
# mean squared error on the held-out rows.
tree.carseats <- tree(Sales ~ ., data = Carseats, subset = train)
tree.pred <- predict(tree.carseats, newdata = Carseats[-train, ])

mean((Carseats$Sales[-train] - tree.pred)^2)
## [1] 5.686401
plot(tree.carseats)
text(tree.carseats)

# Cross-validate over tree sizes and plot the resulting curve.
tree.carseats.cv <- cv.tree(tree.carseats)
plot(tree.carseats.cv)

# Prune back to five terminal nodes and draw the smaller tree.
prune.carseats <- prune.tree(tree.carseats, best = 5)

plot(prune.carseats)
text(prune.carseats)

# Test MSE of the pruned tree, followed by a predicted-vs-actual scatter
# with the identity line for reference.
tree.pred <- predict(prune.carseats, newdata = Carseats[-train, ])
mean((Carseats$Sales[-train] - tree.pred)^2)
## [1] 4.866831
plot(tree.pred, Carseats$Sales[-train], xlab = "prediction", ylab = "actual")
abline(a = 0, b = 1)

# Bagging: a random forest that considers every predictor at each split.
# library() is used instead of require() so that a missing package stops the
# script here rather than returning FALSE and failing at the first
# randomForest() call.
library(randomForest)
## randomForest 4.7-1
## Type rfNews() to see new features/changes/bug fixes.
d <- ncol(Carseats) - 1  # every column except the response is a candidate

set.seed(42)
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
carseats.rf <- randomForest(Sales ~ ., data = Carseats, subset = train,
                            mtry = d, importance = TRUE, ntree = 100)

# Test-set MSE of the bagged model.
tree.pred <- predict(carseats.rf, Carseats[-train, ])
mean((tree.pred - Carseats[-train, "Sales"])^2)
## [1] 2.416614
plot(carseats.rf)

varImpPlot(carseats.rf)

kable(importance(carseats.rf))
%IncMSE IncNodePurity
CompPrice 12.7572658 194.553633
Income 3.6925644 114.061269
Advertising 8.4583279 99.943015
Population 1.7756989 62.016441
Price 24.8149447 428.177561
ShelveLoc 24.9933231 428.448690
Age 5.7722652 127.112993
Education 1.2674696 43.842548
Urban -0.2388463 10.176602
US -0.0532919 7.770429
# Test MSE of a 100-tree random forest for each candidate value of mtry.
mtry.values <- 3:10
mse <- numeric(length(mtry.values))  # preallocated; one entry per mtry value

set.seed(42)

for (k in seq_along(mtry.values)) {
  # Fix: pass the loop value as mtry. It was hard-coded to 5, so every
  # iteration fitted the same configuration and the plot against 3:10 did not
  # show the effect of mtry at all.
  carseats.rf <- randomForest(Sales ~ ., data = Carseats, subset = train,
                              mtry = mtry.values[k], importance = TRUE,
                              ntree = 100)

  tree.pred <- predict(carseats.rf, Carseats[-train, ])
  mse[k] <- mean((tree.pred - Carseats[-train, "Sales"])^2)
}
plot(mtry.values, mse, type = "b", xlab = "mtry", ylab = "test MSE")

# Refit a single random forest and inspect its error curve and variable
# importance. library() fails loudly if the package is missing, whereas
# require() would only return FALSE.
library(randomForest)

set.seed(42)
# NOTE(review): mtry = 9 is hard-coded; presumably it was read off a test-MSE
# vs. mtry plot. Confirm the value against that plot.
carseats.rf <- randomForest(Sales ~ ., data = Carseats, subset = train,
                            mtry = 9, importance = TRUE, ntree = 100)

plot(carseats.rf)

varImpPlot(carseats.rf)

kable(importance(carseats.rf))
%IncMSE IncNodePurity
CompPrice 12.9744625 197.97151
Income 5.1316050 103.17128
Advertising 5.4144641 110.59823
Population -0.4748975 63.45821
Price 23.8930328 416.10410
ShelveLoc 25.0436020 417.41569
Age 4.1412728 133.36952
Education 0.5282113 46.89271
Urban 2.0743624 11.77462
US -0.5563391 6.75921

From the importance table above, ShelveLoc is the most important predictor in terms of %IncMSE (the increase in MSE when the variable is permuted), closely followed by Price.

## Question 9

# Draw 800 random rows of OJ for training; the remaining rows are the test set.
set.seed(42)
train <- sample(seq_len(nrow(OJ)), size = 800)

OJ.test <- OJ[-train, ]
OJ.train <- OJ[train, ]

# Classification tree for Purchase using all other columns as predictors.
# (There is no predictor `Buy`.)
OJ.tree <- tree(Purchase ~ ., data = OJ.train)
summary(OJ.tree)
## 
## Classification tree:
## tree(formula = Purchase ~ ., data = OJ.train)
## Variables actually used in tree construction:
## [1] "LoyalCH"     "SalePriceMM" "PriceDiff"  
## Number of terminal nodes:  8 
## Residual mean deviance:  0.7392 = 585.5 / 792 
## Misclassification error rate: 0.1638 = 131 / 800
# Show the node-by-node text listing of the fitted tree.
print(OJ.tree)
## node), split, n, deviance, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 800 1066.00 CH ( 0.61500 0.38500 )  
##    2) LoyalCH < 0.48285 285  296.00 MM ( 0.21404 0.78596 )  
##      4) LoyalCH < 0.064156 64    0.00 MM ( 0.00000 1.00000 ) *
##      5) LoyalCH > 0.064156 221  260.40 MM ( 0.27602 0.72398 )  
##       10) SalePriceMM < 2.04 128  123.50 MM ( 0.18750 0.81250 ) *
##       11) SalePriceMM > 2.04 93  125.00 MM ( 0.39785 0.60215 ) *
##    3) LoyalCH > 0.48285 515  458.10 CH ( 0.83689 0.16311 )  
##      6) LoyalCH < 0.753545 230  282.70 CH ( 0.69565 0.30435 )  
##       12) PriceDiff < 0.265 149  203.00 CH ( 0.57718 0.42282 )  
##         24) PriceDiff < -0.165 32   38.02 MM ( 0.28125 0.71875 ) *
##         25) PriceDiff > -0.165 117  150.30 CH ( 0.65812 0.34188 )  
##           50) LoyalCH < 0.703993 105  139.60 CH ( 0.61905 0.38095 ) *
##           51) LoyalCH > 0.703993 12    0.00 CH ( 1.00000 0.00000 ) *
##       13) PriceDiff > 0.265 81   47.66 CH ( 0.91358 0.08642 ) *
##      7) LoyalCH > 0.753545 285  111.70 CH ( 0.95088 0.04912 ) *
# Draw the fitted tree with its split labels.
plot(OJ.tree)
text(OJ.tree)

# Training-set confusion matrix (rows: actual, columns: predicted), first as
# counts and then as a share of all training rows.
OJ.pred.train <- predict(OJ.tree, newdata = OJ.train, type = "class")
kable(table(OJ.train$Purchase, OJ.pred.train))
CH MM
CH 422 70
MM 61 247
kable(table(OJ.train$Purchase, OJ.pred.train) / nrow(OJ.train))
CH MM
CH 0.52750 0.08750
MM 0.07625 0.30875
# Same two tables for the held-out test rows.
OJ.pred.test <- predict(OJ.tree, newdata = OJ.test, type = "class")
kable(table(OJ.test$Purchase, OJ.pred.test))
CH MM
CH 125 36
MM 15 94
kable(table(OJ.test$Purchase, OJ.pred.test) / nrow(OJ.test))
CH MM
CH 0.4629630 0.1333333
MM 0.0555556 0.3481481
# 10-fold cross-validation, pruning by misclassification count.
set.seed(42)
OJ.tree.cv <- cv.tree(OJ.tree, FUN = prune.misclass, K = 10)
plot(OJ.tree.cv)

# Replace the full tree with its two-leaf pruning (misclassification-based).
OJ.tree <- prune.misclass(OJ.tree, best = 2)

# Training-set confusion matrix for the pruned tree: counts, then proportions.
OJ.pred.train <- predict(OJ.tree, newdata = OJ.train, type = "class")
kable(table(OJ.train$Purchase, OJ.pred.train))
CH MM
CH 431 61
MM 84 224
kable(table(OJ.train$Purchase, OJ.pred.train) / nrow(OJ.train))
CH MM
CH 0.53875 0.07625
MM 0.10500 0.28000
# Test-set confusion matrix for the pruned tree: counts, then proportions.
OJ.pred.test <- predict(OJ.tree, newdata = OJ.test, type = "class")
kable(table(OJ.test$Purchase, OJ.pred.test))
CH MM
CH 128 33
MM 26 83
kable(table(OJ.test$Purchase, OJ.pred.test) / nrow(OJ.test))
CH MM
CH 0.4740741 0.1222222
MM 0.0962963 0.3074074
# Draw the pruned tree with its split labels.
plot(OJ.tree)
text(OJ.tree)