Chapter 8: 3, 8, 9
3.
p=seq(0,1,0.01)
gini= 2*p*(1-p)
classerror= 1-pmax(p,1-p)
crossentropy= -(p*log(p)+(1-p)*log(1-p))
plot(NA,NA,xlim=c(0,1),ylim=c(0,1),xlab='p',ylab='f')
lines(p,gini,type='l')
lines(p,classerror,col='green')
lines(p,crossentropy,col='violet')
legend(x='top',legend=c('gini','class error','cross entropy'),
col=c('black','green','violet'),lty=1)
8. a)
library(ISLR)
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(tree)
set.seed(2021)
data("Carseats")
index<- sample(nrow(Carseats), size = 300)
train<- Carseats[index,]
test<- Carseats[-index,]
b)
a= tree(Sales~.,data=Carseats,subset = index)
a_pred<- predict(a, newdata = test)
mean((a_pred-Carseats[-index,'Sales'])^2)
## [1] 4.9824
plot(a)
text(a)
c)
cv_a=cv.tree(a)
plot(cv_a)
d The MSE increases when pruning is added.
d=prune.tree(a,best=4)
plot(d)
text(d)
e=predict(d,Carseats[-index,])
mean((e-Carseats[-index,'Sales'])^2)
## [1] 5.82288
e)
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
mse=c()
set.seed(2021)
for(i in 3:10){
a_rf=randomForest(Sales~.,data=Carseats,subset=index,mtry=5,importance=T,ntree=100)
a_pred=predict(a_rf,Carseats[-index,])
mse=rbind(mse,mean((a_pred-Carseats[-index,'Sales'])^2))
}
plot(3:10,mse,type='b')
library(knitr)
varImpPlot(a_rf)
kable(importance(a_rf))
| %IncMSE | IncNodePurity | |
|---|---|---|
| CompPrice | 10.7166862 | 208.58793 |
| Income | 3.5739868 | 155.10388 |
| Advertising | 6.4820251 | 184.73183 |
| Population | -0.3774366 | 106.98729 |
| Price | 26.6665300 | 616.34004 |
| ShelveLoc | 29.7794242 | 742.94875 |
| Age | 6.3104003 | 184.76711 |
| Education | 1.0799941 | 63.34303 |
| Urban | -0.5050450 | 10.16969 |
| US | -0.4619055 | 12.15648 |
9. a)
data(OJ)
set.seed(2021)
index<- sample(nrow(OJ), 800, replace = T)
train<- OJ[index,]
test<- OJ[-index,]
b)
b= tree(Purchase ~., data = train)
summary(b)
##
## Classification tree:
## tree(formula = Purchase ~ ., data = train)
## Variables actually used in tree construction:
## [1] "LoyalCH" "ListPriceDiff" "DiscMM" "DiscCH"
## [5] "PriceDiff"
## Number of terminal nodes: 8
## Residual mean deviance: 0.6608 = 523.4 / 792
## Misclassification error rate: 0.15 = 120 / 800
c) There is a terminal node “8) WeekofPurchase < 243 35 28.71 MM ( 0.14286 0.85714 )” which is a child node of “4) LoyalCH < 0.171368 108 40.49 MM ( 0.04630 0.95370 )” and “2) LoyalCH < 0.5036 338 426.50 MM ( 0.32544 0.67456 )” and the root of “1) root 800 1041.00 CH ( 0.64500 0.35500 )”.
b
## node), split, n, deviance, yval, (yprob)
## * denotes terminal node
##
## 1) root 800 1050.000 CH ( 0.63500 0.36500 )
## 2) LoyalCH < 0.482389 277 292.100 MM ( 0.22022 0.77978 )
## 4) LoyalCH < 0.0506575 64 0.000 MM ( 0.00000 1.00000 ) *
## 5) LoyalCH > 0.0506575 213 255.100 MM ( 0.28638 0.71362 ) *
## 3) LoyalCH > 0.482389 523 433.600 CH ( 0.85468 0.14532 )
## 6) LoyalCH < 0.738638 213 262.100 CH ( 0.69484 0.30516 )
## 12) ListPriceDiff < 0.255 118 163.400 CH ( 0.51695 0.48305 )
## 24) DiscMM < 0.47 99 132.800 CH ( 0.60606 0.39394 )
## 48) DiscCH < 0.115 89 122.000 CH ( 0.56180 0.43820 ) *
## 49) DiscCH > 0.115 10 0.000 CH ( 1.00000 0.00000 ) *
## 25) DiscMM > 0.47 19 7.835 MM ( 0.05263 0.94737 ) *
## 13) ListPriceDiff > 0.255 95 54.900 CH ( 0.91579 0.08421 ) *
## 7) LoyalCH > 0.738638 310 95.060 CH ( 0.96452 0.03548 )
## 14) PriceDiff < -0.39 7 9.561 CH ( 0.57143 0.42857 ) *
## 15) PriceDiff > -0.39 303 73.940 CH ( 0.97360 0.02640 ) *
d)
library(tidyverse)
## Registered S3 method overwritten by 'cli':
## method from
## print.tree tree
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ tibble 3.0.5 ✓ dplyr 1.0.3
## ✓ tidyr 1.1.2 ✓ stringr 1.4.0
## ✓ readr 1.4.0 ✓ forcats 0.5.0
## ✓ purrr 0.3.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::combine() masks randomForest::combine()
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## x purrr::lift() masks caret::lift()
## x randomForest::margin() masks ggplot2::margin()
plot(b)
text(b)
e)
b_pred<- predict(b, newdata = test, type = "class")
kable(table(test[,'Purchase'],b_pred))
| CH | MM | |
|---|---|---|
| CH | 266 | 52 |
| MM | 48 | 150 |
(59+42)/(59+42+251+155)
## [1] 0.199211
f)
set.seed(2021)
f<- cv.tree(b, FUN = prune.misclass, K = 10)
g)
plot(f)
h) 2 corresponds to the best error rate.
i)
g=prune.misclass(b,best = 2)
j) Test error rate (unpruned): 0.199 Test error rate (pruned): 0.20
test_pred= predict(g, test, type= "class")
kable(table(test[,'Purchase'],test_pred))
| CH | MM | |
|---|---|---|
| CH | 268 | 50 |
| MM | 51 | 147 |
(42+57)/(42+57+236+172)
## [1] 0.1952663
k) Error rate for unpruned: 0.19 Error rate for pruned tree: 0.20
pred=predict(g,train,type = 'class')
kable(table(train[,'Purchase'],pred))
| CH | MM | |
|---|---|---|
| CH | 447 | 61 |
| MM | 76 | 216 |
(56+110)/(56+110+406+228)
## [1] 0.2075