Chapter 8: 3, 8, 9

3.

p=seq(0,1,0.01)
gini= 2*p*(1-p)
classerror= 1-pmax(p,1-p)
crossentropy= -(p*log(p)+(1-p)*log(1-p))

plot(NA,NA,xlim=c(0,1),ylim=c(0,1),xlab='p',ylab='f')
lines(p,gini,type='l')
lines(p,classerror,col='green')
lines(p,crossentropy,col='violet')
legend(x='top',legend=c('gini','class error','cross entropy'),
       col=c('black','green','violet'),lty=1)

8. a)

library(ISLR)
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(tree)
set.seed(2021)
data("Carseats")
index<- sample(nrow(Carseats), size = 300)
train<- Carseats[index,]
test<- Carseats[-index,]

b)

a= tree(Sales~.,data=Carseats,subset = index)
a_pred<- predict(a, newdata = test)
mean((a_pred-Carseats[-index,'Sales'])^2)
## [1] 4.9824
plot(a)
text(a)

c)

cv_a=cv.tree(a) 
plot(cv_a)

d The MSE increases when pruning is added.

d=prune.tree(a,best=4)
plot(d)
text(d)

e=predict(d,Carseats[-index,])
mean((e-Carseats[-index,'Sales'])^2)
## [1] 5.82288

e)

library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
mse=c()
set.seed(2021)
for(i in 3:10){
  a_rf=randomForest(Sales~.,data=Carseats,subset=index,mtry=5,importance=T,ntree=100)
  a_pred=predict(a_rf,Carseats[-index,])
  mse=rbind(mse,mean((a_pred-Carseats[-index,'Sales'])^2))
}
plot(3:10,mse,type='b')

library(knitr)
varImpPlot(a_rf)

kable(importance(a_rf))
%IncMSE IncNodePurity
CompPrice 10.7166862 208.58793
Income 3.5739868 155.10388
Advertising 6.4820251 184.73183
Population -0.3774366 106.98729
Price 26.6665300 616.34004
ShelveLoc 29.7794242 742.94875
Age 6.3104003 184.76711
Education 1.0799941 63.34303
Urban -0.5050450 10.16969
US -0.4619055 12.15648

9. a)

data(OJ)

set.seed(2021)
index<- sample(nrow(OJ), 800, replace = T)
train<- OJ[index,]
test<- OJ[-index,]

b)

b= tree(Purchase ~., data = train)
summary(b)
## 
## Classification tree:
## tree(formula = Purchase ~ ., data = train)
## Variables actually used in tree construction:
## [1] "LoyalCH"       "ListPriceDiff" "DiscMM"        "DiscCH"       
## [5] "PriceDiff"    
## Number of terminal nodes:  8 
## Residual mean deviance:  0.6608 = 523.4 / 792 
## Misclassification error rate: 0.15 = 120 / 800

c) There is a terminal node “8) WeekofPurchase < 243 35 28.71 MM ( 0.14286 0.85714 )” which is a child node of “4) LoyalCH < 0.171368 108 40.49 MM ( 0.04630 0.95370 )” and “2) LoyalCH < 0.5036 338 426.50 MM ( 0.32544 0.67456 )” and the root of “1) root 800 1041.00 CH ( 0.64500 0.35500 )”.

b
## node), split, n, deviance, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 800 1050.000 CH ( 0.63500 0.36500 )  
##    2) LoyalCH < 0.482389 277  292.100 MM ( 0.22022 0.77978 )  
##      4) LoyalCH < 0.0506575 64    0.000 MM ( 0.00000 1.00000 ) *
##      5) LoyalCH > 0.0506575 213  255.100 MM ( 0.28638 0.71362 ) *
##    3) LoyalCH > 0.482389 523  433.600 CH ( 0.85468 0.14532 )  
##      6) LoyalCH < 0.738638 213  262.100 CH ( 0.69484 0.30516 )  
##       12) ListPriceDiff < 0.255 118  163.400 CH ( 0.51695 0.48305 )  
##         24) DiscMM < 0.47 99  132.800 CH ( 0.60606 0.39394 )  
##           48) DiscCH < 0.115 89  122.000 CH ( 0.56180 0.43820 ) *
##           49) DiscCH > 0.115 10    0.000 CH ( 1.00000 0.00000 ) *
##         25) DiscMM > 0.47 19    7.835 MM ( 0.05263 0.94737 ) *
##       13) ListPriceDiff > 0.255 95   54.900 CH ( 0.91579 0.08421 ) *
##      7) LoyalCH > 0.738638 310   95.060 CH ( 0.96452 0.03548 )  
##       14) PriceDiff < -0.39 7    9.561 CH ( 0.57143 0.42857 ) *
##       15) PriceDiff > -0.39 303   73.940 CH ( 0.97360 0.02640 ) *

d)

library(tidyverse)
## Registered S3 method overwritten by 'cli':
##   method     from
##   print.tree tree
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ tibble  3.0.5     ✓ dplyr   1.0.3
## ✓ tidyr   1.1.2     ✓ stringr 1.4.0
## ✓ readr   1.4.0     ✓ forcats 0.5.0
## ✓ purrr   0.3.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::combine()       masks randomForest::combine()
## x dplyr::filter()        masks stats::filter()
## x dplyr::lag()           masks stats::lag()
## x purrr::lift()          masks caret::lift()
## x randomForest::margin() masks ggplot2::margin()
plot(b) 
text(b)

e)

b_pred<- predict(b, newdata = test, type = "class")
kable(table(test[,'Purchase'],b_pred))
CH MM
CH 266 52
MM 48 150
(59+42)/(59+42+251+155)
## [1] 0.199211

f)

set.seed(2021)
f<- cv.tree(b, FUN = prune.misclass, K = 10)

g)

plot(f)

h) 2 corresponds to the best error rate.

i)

g=prune.misclass(b,best = 2)

j) Test error rate (unpruned): 0.199 Test error rate (pruned): 0.20

test_pred= predict(g, test, type= "class")
kable(table(test[,'Purchase'],test_pred))
CH MM
CH 268 50
MM 51 147
(42+57)/(42+57+236+172)
## [1] 0.1952663

k) Error rate for unpruned: 0.19 Error rate for pruned tree: 0.20

pred=predict(g,train,type = 'class')
kable(table(train[,'Purchase'],pred))
CH MM
CH 447 61
MM 76 216
(56+110)/(56+110+406+228)
## [1] 0.2075