library(tree)
## Warning: package 'tree' was built under R version 4.1.3
library(ISLR2)
## Warning: package 'ISLR2' was built under R version 4.1.3
data("Carseats")
Consider the Gini index, classification error, and entropy in a simple classification setting with two classes. Create a single plot that displays each of these quantities as a function of ˆpm1. The x-axis should display ˆpm1, ranging from 0 to 1, and the y-axis should display the value of the Gini index, classification error, and entropy.
pm1 <- seq(0,1, 0.001)
pm2 <- 1-pm1
E <- 1-pmax(pm1, pm2)
G <- pm1*(1-pm1) + pm2*(1-pm2)
D=-pm1*log(pm1)-pm2*log(pm2)
plot(NA,NA,xlim=c(0,1),ylim=c(0,1),xlab='p',ylab='f')
lines(pm1,G,type='l')
lines(pm1,E,col='blue')
lines(pm1,D,col='red')
legend(x='top',legend=c('gini','class error','cross entropy'),
col=c('black','blue','red'),lty=1,text.width = 0.22)
In the lab, a classification tree was applied to the Carseats data set after converting Sales into a qualitative response variable. Now we will seek to predict Sales using regression trees and related approaches, treating the response as a quantitative variable. (a) Split the data set into a training set and a test set.
set.seed(2)
train <- sample(1:nrow(Carseats), 200)
Carseats.test <- Carseats[-train, "Sales"]
tree.Carseats<- tree(Sales ~., Carseats, subset=train)
summary(tree.Carseats)
##
## Regression tree:
## tree(formula = Sales ~ ., data = Carseats, subset = train)
## Variables actually used in tree construction:
## [1] "Price" "ShelveLoc" "CompPrice" "Age" "Advertising"
## [6] "Population"
## Number of terminal nodes: 14
## Residual mean deviance: 2.602 = 484 / 186
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -4.71700 -1.08700 -0.01026 0.00000 1.11300 4.06600
plot(tree.Carseats)
text(tree.Carseats, pretty=0)
yhat <- predict(tree.Carseats, newdata = Carseats[-train, ])
mean((yhat-Carseats.test)^2)
## [1] 4.471569
test MSE is 4.47.
cv.carseats <- cv.tree(tree.Carseats)
plot(cv.carseats$size, cv.carseats$dev, type="b")
# optimal level = 11
prune.carseats <- prune.tree(tree.Carseats, best=11)
plot(prune.carseats)
text(prune.carseats, pretty=0)
yhat <- predict(prune.carseats, newdata = Carseats[-train, ] )
mean((yhat-Carseats.test)^2)
## [1] 4.644345
Test error is 4.64. Pruning did not improve the test error rate.
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.1.3
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
set.seed(1)
bag.carseats <- randomForest(Sales~., data=Carseats, subset=train, mtry=10, importance=TRUE)
bag.carseats
##
## Call:
## randomForest(formula = Sales ~ ., data = Carseats, mtry = 10, importance = TRUE, subset = train)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 10
##
## Mean of squared residuals: 2.953352
## % Var explained: 60.36
yhat.bag <- predict(bag.carseats, newdata = Carseats[-train, ])
plot(yhat.bag, Carseats.test)
abline(0,1)
mean((yhat.bag-Carseats.test)^2)
## [1] 2.555523
importance(bag.carseats)
## %IncMSE IncNodePurity
## CompPrice 25.64599599 213.096278
## Income 5.67740403 76.385083
## Advertising 12.22093633 104.986108
## Population 0.59715138 61.294727
## Price 56.50749020 535.237246
## ShelveLoc 41.05022443 283.698935
## Age 9.11485795 118.103039
## Education -0.05758758 39.442533
## Urban 0.79459812 8.828001
## US 1.45384950 7.611851
varImpPlot(bag.carseats)
Test mse: 2.56
rf.carseats <- randomForest(Sales~., data=Carseats,subset=train, importance=T)
yhat.rf <- predict(rf.carseats, newdata=Carseats[-train, ])
mean((yhat.rf - Carseats.test)^2)
## [1] 3.209712
importance(rf.carseats)
## %IncMSE IncNodePurity
## CompPrice 10.8825891 157.96141
## Income 4.0892724 132.48555
## Advertising 9.8371786 126.18765
## Population -2.8000757 99.90701
## Price 36.4254627 403.76853
## ShelveLoc 30.4598617 238.61308
## Age 7.2107815 144.57156
## Education -0.0605914 69.02272
## Urban 1.3385164 14.16939
## US 1.6808343 15.63370
varImpPlot(rf.carseats)
test error: 3.21 From the plot,we see that Price and Shelveloc are the
two most important variables.
library(BART)
## Warning: package 'BART' was built under R version 4.1.3
## Loading required package: nlme
## Loading required package: nnet
## Loading required package: survival
## Warning: package 'survival' was built under R version 4.1.3
x <- Carseats[, 1:10]
y <- Carseats[, "Sales"]
xtrain <- x[train,]
ytrain <- y[train]
xtest <- x[-train, ]
ytest <- y[-train]
set.seed(1)
bartfit <- gbart(xtrain, ytrain, x.test=xtest)
## Warning in summary.lm(lm(y.train ~ ., data.frame(t(x.train), y.train))):
## essentially perfect fit: summary may be unreliable
## *****Calling gbart: type=1
## *****Data:
## data:n,p,np: 200, 13, 200
## y1,yn: 0.154000, -2.406000
## x1,x[n*p]: 7.500000, 1.000000
## xp1,xp[np*p]: 11.220000, 1.000000
## *****Number of Trees: 200
## *****Number of Cut Points: 100 ... 1
## *****burn,nd,thin: 100,1000,1
## *****Prior:beta,alpha,tau,nu,lambda,offset: 2,0.95,0.287616,3,9.47926e-32,7.346
## *****sigma: 0.000000
## *****w (weights): 1.000000 ... 1.000000
## *****Dirichlet:sparse,theta,omega,a,b,rho,augment: 0,0,1,0.5,1,13,0
## *****printevery: 100
##
## MCMC
## done 0 (out of 1100)
## done 100 (out of 1100)
## done 200 (out of 1100)
## done 300 (out of 1100)
## done 400 (out of 1100)
## done 500 (out of 1100)
## done 600 (out of 1100)
## done 700 (out of 1100)
## done 800 (out of 1100)
## done 900 (out of 1100)
## done 1000 (out of 1100)
## time: 4s
## trcnt,tecnt: 1000,1000
yhat.bart <- bartfit$yhat.test.mean
mean((ytest-yhat.bart)^2)
## [1] 0.1158053
The test error = 0.116 is lower than the test errors for the other techniques.
data(OJ)
attach(OJ)
set.seed(2)
trainOJ <- sample(1:nrow(OJ), 800)
OJ.test <- OJ[-trainOJ, ]
Purchase.test <- Purchase[-trainOJ]
oj.tree <- tree(Purchase~., OJ, subset=trainOJ)
summary(oj.tree)
##
## Classification tree:
## tree(formula = Purchase ~ ., data = OJ, subset = trainOJ)
## Variables actually used in tree construction:
## [1] "LoyalCH" "PriceDiff"
## Number of terminal nodes: 9
## Residual mean deviance: 0.7009 = 554.4 / 791
## Misclassification error rate: 0.1588 = 127 / 800
The training error rate is 0.1588, and the tree has 9 terminal nodes.
oj.tree
## node), split, n, deviance, yval, (yprob)
## * denotes terminal node
##
## 1) root 800 1068.00 CH ( 0.61250 0.38750 )
## 2) LoyalCH < 0.5036 359 422.80 MM ( 0.27577 0.72423 )
## 4) LoyalCH < 0.280875 172 127.60 MM ( 0.12209 0.87791 )
## 8) LoyalCH < 0.035047 56 10.03 MM ( 0.01786 0.98214 ) *
## 9) LoyalCH > 0.035047 116 106.60 MM ( 0.17241 0.82759 ) *
## 5) LoyalCH > 0.280875 187 254.10 MM ( 0.41711 0.58289 )
## 10) PriceDiff < 0.05 73 71.36 MM ( 0.19178 0.80822 ) *
## 11) PriceDiff > 0.05 114 156.30 CH ( 0.56140 0.43860 ) *
## 3) LoyalCH > 0.5036 441 311.80 CH ( 0.88662 0.11338 )
## 6) LoyalCH < 0.737888 168 191.10 CH ( 0.74405 0.25595 )
## 12) PriceDiff < 0.265 93 125.00 CH ( 0.60215 0.39785 )
## 24) PriceDiff < -0.35 12 10.81 MM ( 0.16667 0.83333 ) *
## 25) PriceDiff > -0.35 81 103.10 CH ( 0.66667 0.33333 ) *
## 13) PriceDiff > 0.265 75 41.82 CH ( 0.92000 0.08000 ) *
## 7) LoyalCH > 0.737888 273 65.11 CH ( 0.97436 0.02564 )
## 14) PriceDiff < -0.39 11 12.89 CH ( 0.72727 0.27273 ) *
## 15) PriceDiff > -0.39 262 41.40 CH ( 0.98473 0.01527 ) *
Node 1: The split criterion is loyalCH< 0.035047. There are 56 observations in the branch. The branch has a deviance of 10.03. The overall prediction for this branch is MM - Minute maid orange juice.
plot(oj.tree)
text(oj.tree, pretty=0)
Interpretation: 1. Loyal CH is the most important factor in determing Purchase. 2.Customers with loyal CH<0.03 and having the lowest Price diff purchased minute maid. 3.Customers with a CH value higher than the previous and having Price Difference higher than the previous purchased Citrus Hill. 4. Customers having with loyal CH value greater than 0.5 but less than 0.7 and having Price Difference less than -0.35 purchased minute maid. 5. Customers having with loyal CH value greater than 0.5 but less than 0.7 and having Price Difference greater than -0.35 purchased citrus hill. 6. Customers having with loyal CH value greater than 0.7 and having Price Difference either less than or greater than -0.39 purchased citrus hill.
oj.pred <- predict(oj.tree, OJ.test, type="class")
table(oj.pred, OJ.test$Purchase)
##
## oj.pred CH MM
## CH 148 37
## MM 15 70
(148+70)/270
## [1] 0.8074074
The test error rate is 0.807
set.seed(2)
cv.oj <- cv.tree(oj.tree, FUN=prune.misclass)
cv.carseats
## $size
## [1] 14 13 12 11 10 9 8 7 6 4 3 2 1
##
## $dev
## [1] 944.4136 960.5082 975.5682 969.8304 1018.6218 1085.5631 1114.2876
## [8] 1116.6153 1156.7491 1160.1852 1142.0335 1320.0677 1574.5229
##
## $k
## [1] -Inf 16.92509 19.38585 23.44178 29.89370 36.28493 50.16562
## [8] 54.84825 65.75957 80.79945 90.11022 179.77305 277.78708
##
## $method
## [1] "deviance"
##
## attr(,"class")
## [1] "prune" "tree.sequence"
par(mfrow=c(1,2))
plot(cv.oj$size, cv.oj$dev, type="b")
Ans: Tree size 4.
oj.prune <- prune.misclass(oj.tree,best=4)
prune.pred <- predict(oj.prune, OJ.test, type="class")
table(prune.pred, OJ.test$Purchase)
##
## prune.pred CH MM
## CH 149 42
## MM 14 65
summary(oj.prune)
##
## Classification tree:
## snip.tree(tree = oj.tree, nodes = 4:3)
## Variables actually used in tree construction:
## [1] "LoyalCH" "PriceDiff"
## Number of terminal nodes: 4
## Residual mean deviance: 0.8381 = 667.1 / 796
## Misclassification error rate: 0.1688 = 135 / 800
(149+65)/270
## [1] 0.7925926
Test error is 0.793
Compare the training error rates between the pruned and unpruned trees. Which is higher? Ans: For unpruned trees: training error was 0.1588 For pruned trees: training error = 0.1688 Pruned tree had a higher training error rate.
Compare the test error rates between the pruned and unpruned trees. Which is higher?
Ans: Pruned: 0.793 Unpruned:0.807
Unpruned trees had a higher test error rate.