Consider the Gini index, classification error, and entropy in a simple classification setting with two classes. Create a single plot that displays each of these quantities as a function of p̂m1. The x-axis should display p̂m1, ranging from 0 to 1, and the y-axis should display the value of the Gini index, classification error, and entropy. Hint: in a setting with two classes, p̂m1 = 1 − p̂m2. You could make this plot by hand, but it will be much easier to make in R.
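Writing p = p̂m1 (so 1 − p = p̂m2), the three impurity measures reduce to the following in the two-class case; these are exactly the formulas the code below implements:

Gini index: G = 2p(1 − p)
Classification error: E = 1 − max(p, 1 − p)
Cross-entropy: D = −p log(p) − (1 − p) log(1 − p)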
p <- seq(0, 1, 0.01)
gini.index <- 2 * p * (1 - p)
class.error <- 1 - pmax(p, 1 - p)
# treat 0 * log(0) as 0 so the cross-entropy is defined at the endpoints
cross.entropy <- ifelse(p == 0 | p == 1, 0, -(p * log(p) + (1 - p) * log(1 - p)))
matplot(p, cbind(gini.index, class.error, cross.entropy), type = "l", lty = 1,
        col = c("black", "orange", "blue"),
        xlab = expression(italic(hat(p))[m1]), ylab = "Quantities")
legend("topright", c("Gini index", "Classification error", "Cross-entropy"),
       col = c("black", "orange", "blue"), lty = 1)
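All three quantities are maximized at p̂m1 = 0.5, where the node is most impure: the Gini index and classification error both reach 0.5, while the cross-entropy reaches log 2 ≈ 0.693. The Gini index and cross-entropy are smooth in p̂m1, whereas the classification error has a kink at 0.5, which is visible in the plot.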
In the lab, a classification tree was applied to the Carseats data set after converting Sales into a qualitative response variable. Now we seek to predict Sales using regression trees and related approaches, treating the response as a quantitative variable.
library(ISLR)
set.seed(1)
train <- sample(1:nrow(Carseats), nrow(Carseats) / 2)
train.car <- Carseats[train, ]
test.car <- Carseats[-train, ]
The training and test sets each contain 200 observations.
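A quick sanity check on the split (Carseats has 400 rows, so each half should hold 200):
nrow(train.car)
## [1] 200
nrow(test.car)
## [1] 200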
library(tree)
tree.car <- tree(Sales ~ ., data = train.car)
summary(tree.car)
##
## Regression tree:
## tree(formula = Sales ~ ., data = train.car)
## Variables actually used in tree construction:
## [1] "ShelveLoc" "Price" "Age" "Advertising" "Income"
## [6] "CompPrice"
## Number of terminal nodes: 18
## Residual mean deviance: 2.36 = 429.5 / 182
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -4.2570 -1.0360 0.1024 0.0000 0.9301 3.9130
Plotting the tree
plot(tree.car)
text(tree.car, pretty = 0)
Test MSE:
prediction <- predict(tree.car, newdata = test.car)
mean((prediction - test.car$Sales)^2)
## [1] 4.148897
Test MSE = 4.148897.
cv.car <- cv.tree(tree.car)
plot(cv.car$size, cv.car$dev, type = "b")
tree.min <- which.min(cv.car$dev)
# mark the minimum using the corresponding size, not the index into dev
points(cv.car$size[tree.min], cv.car$dev[tree.min], col = "red", cex = 2, pch = 9)
A tree of size 8 can be chosen, so we prune to obtain an 8-node tree.
prune.car <- prune.tree(tree.car, best = 8)
plot(prune.car)
text(prune.car, pretty = 0)
Test MSE:
predict2 <- predict(prune.car, newdata = test.car)
mean((predict2 - test.car$Sales)^2)
## [1] 5.09085
Test MSE = 5.09085. Pruning increased the test MSE, so the full tree predicts better on this test set.
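To see how sensitive the test error is to the choice of size, a minimal sketch (the range 2:18 is an assumption based on the 18 terminal nodes reported above) computes the test MSE for every pruned subtree:
# test MSE for each candidate tree size, from 2 up to the full 18 nodes
sizes <- 2:18
size.mse <- sapply(sizes, function(s) {
  pruned <- prune.tree(tree.car, best = s)
  mean((predict(pruned, newdata = test.car) - test.car$Sales)^2)
})
plot(sizes, size.mse, type = "b", xlab = "Tree size", ylab = "Test MSE")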
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
# mtry = 10 considers all ten predictors at each split, i.e. bagging
bag.car <- randomForest(Sales ~ ., data = train.car, mtry = 10, ntree = 500, importance = TRUE)
predict3 <- predict(bag.car, newdata = test.car)
mean((predict3 - test.car$Sales)^2)
## [1] 2.633915
Test MSE = 2.633915. Bagging substantially improves on the single regression tree.
Important Variables
importance(bag.car)
## %IncMSE IncNodePurity
## CompPrice 16.9874366 126.852848
## Income 3.8985402 78.314126
## Advertising 16.5698586 123.702901
## Population 0.6487058 62.328851
## Price 55.3976775 514.654890
## ShelveLoc 42.7849818 319.133777
## Age 20.5135255 185.582077
## Education 3.4615211 42.253410
## Urban -2.5125087 8.700009
## US 7.3586645 18.180651
Price and ShelveLoc are the two most important variables by both %IncMSE and IncNodePurity.
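The same ranking can be shown graphically with randomForest's built-in importance plot:
varImpPlot(bag.car)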
# mtry = 3 is roughly p/3, the usual default for regression forests
rf.car <- randomForest(Sales ~ ., data = train.car, mtry = 3, ntree = 500, importance = TRUE)
predict4 <- predict(rf.car, newdata = test.car)
mean((predict4 - test.car$Sales)^2)
## [1] 3.321154
Test MSE = 3.321154. With mtry = 3 the random forest does worse than bagging here, but still better than the single tree.
Important Variables
importance(rf.car)
## %IncMSE IncNodePurity
## CompPrice 7.443405 130.87552
## Income 3.227858 127.18662
## Advertising 13.388259 139.53499
## Population -1.031306 102.32154
## Price 36.616911 369.59534
## ShelveLoc 31.284175 233.49549
## Age 17.622273 206.09959
## Education 1.454555 70.41374
## Urban -1.864781 15.13225
## US 6.193082 35.74746
Price and ShelveLoc remain the two most important variables.
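To see how the number of candidate variables per split drives the test error, one could sweep mtry from 1 (maximally decorrelated trees) to 10 (bagging); this is a sketch, and refitting ten forests takes a while:
set.seed(1)
mse.by.mtry <- sapply(1:10, function(m) {
  rf <- randomForest(Sales ~ ., data = train.car, mtry = m, ntree = 500)
  mean((predict(rf, newdata = test.car) - test.car$Sales)^2)
})
plot(1:10, mse.by.mtry, type = "b", xlab = "mtry", ylab = "Test MSE")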
This problem involves the OJ data set, which is part of the ISLR package.
set.seed(1)
train <- sample(1:nrow(OJ), 800)
train.OJ <- OJ[train, ]
test.OJ <- OJ[-train, ]
tree.OJ <- tree(Purchase ~ ., data = train.OJ)
summary(tree.OJ)
##
## Classification tree:
## tree(formula = Purchase ~ ., data = train.OJ)
## Variables actually used in tree construction:
## [1] "LoyalCH" "PriceDiff" "SpecialCH" "ListPriceDiff"
## Number of terminal nodes: 8
## Residual mean deviance: 0.7305 = 578.6 / 792
## Misclassification error rate: 0.165 = 132 / 800
The tree has 8 terminal nodes and a training misclassification error rate of 0.165.
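For a classification tree, the reported deviance is −2 Σm Σk n_mk log p̂_mk, where n_mk is the number of class-k observations in terminal node m; the residual mean deviance divides this by n − |T0| = 800 − 8 = 792, which is where the 578.6 / 792 above comes from.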
tree.OJ
## node), split, n, deviance, yval, (yprob)
## * denotes terminal node
##
## 1) root 800 1064.00 CH ( 0.61750 0.38250 )
## 2) LoyalCH < 0.508643 350 409.30 MM ( 0.27143 0.72857 )
## 4) LoyalCH < 0.264232 166 122.10 MM ( 0.12048 0.87952 )
## 8) LoyalCH < 0.0356415 57 10.07 MM ( 0.01754 0.98246 ) *
## 9) LoyalCH > 0.0356415 109 100.90 MM ( 0.17431 0.82569 ) *
## 5) LoyalCH > 0.264232 184 248.80 MM ( 0.40761 0.59239 )
## 10) PriceDiff < 0.195 83 91.66 MM ( 0.24096 0.75904 )
## 20) SpecialCH < 0.5 70 60.89 MM ( 0.15714 0.84286 ) *
## 21) SpecialCH > 0.5 13 16.05 CH ( 0.69231 0.30769 ) *
## 11) PriceDiff > 0.195 101 139.20 CH ( 0.54455 0.45545 ) *
## 3) LoyalCH > 0.508643 450 318.10 CH ( 0.88667 0.11333 )
## 6) LoyalCH < 0.764572 172 188.90 CH ( 0.76163 0.23837 )
## 12) ListPriceDiff < 0.235 70 95.61 CH ( 0.57143 0.42857 ) *
## 13) ListPriceDiff > 0.235 102 69.76 CH ( 0.89216 0.10784 ) *
## 7) LoyalCH > 0.764572 278 86.14 CH ( 0.96403 0.03597 ) *
Consider the terminal node labelled 7. The split criterion is LoyalCH > 0.764572; the branch contains 278 observations with a deviance of 86.14 and an overall prediction of CH, with about 96% of its observations actually taking the value CH.
plot(tree.OJ)
text(tree.OJ, pretty = 0)
LoyalCH appears to be the most important predictor of Purchase: it defines the top split and appears in several splits below it.
tree.pred <- predict(tree.OJ, test.OJ, type = "class")
table(tree.pred, test.OJ$Purchase)
##
## tree.pred CH MM
## CH 147 49
## MM 12 62
Test error rate:
1 - (147 + 62) / 270
## [1] 0.2259259
The test error rate is 0.2259259.
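Equivalently, the test error rate can be computed in one line instead of reading it off the confusion matrix:
mean(tree.pred != test.OJ$Purchase)
## [1] 0.2259259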
cv.OJ <- cv.tree(tree.OJ, FUN = prune.misclass)
cv.OJ
## $size
## [1] 8 5 2 1
##
## $dev
## [1] 156 156 156 306
##
## $k
## [1] -Inf 0.000000 4.666667 160.000000
##
## $method
## [1] "misclass"
##
## attr(,"class")
## [1] "prune" "tree.sequence"
plot(cv.OJ$size, cv.OJ$dev, type = "b", xlab = "Tree size", ylab = "Cross-Validated Classification Error")
Trees of size 8, 5, and 2 all achieve the minimum cross-validated error (dev = 156), so the 2-node tree is the smallest tree with the lowest classification error rate.
Pruned Tree
prune.OJ <- prune.misclass(tree.OJ, best = 2)
plot(prune.OJ)
text(prune.OJ, pretty = 0)
summary(tree.OJ)
##
## Classification tree:
## tree(formula = Purchase ~ ., data = train.OJ)
## Variables actually used in tree construction:
## [1] "LoyalCH" "PriceDiff" "SpecialCH" "ListPriceDiff"
## Number of terminal nodes: 8
## Residual mean deviance: 0.7305 = 578.6 / 792
## Misclassification error rate: 0.165 = 132 / 800
summary(prune.OJ)
##
## Classification tree:
## snip.tree(tree = tree.OJ, nodes = 3:2)
## Variables actually used in tree construction:
## [1] "LoyalCH"
## Number of terminal nodes: 2
## Residual mean deviance: 0.9115 = 727.4 / 798
## Misclassification error rate: 0.1825 = 146 / 800
The training misclassification error rate is higher for the pruned tree (0.1825 vs. 0.165 for the full tree).
prune.pred <- predict(prune.OJ, test.OJ, type = "class")
table(prune.pred, test.OJ$Purchase)
##
## prune.pred CH MM
## CH 119 30
## MM 40 81
Test error rate:
1 - (119 + 81) / 270
## [1] 0.2592593
Pruning increased the test error rate from 0.2259259 to 0.2592593, so the unpruned tree predicts better on the test set.
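For a side-by-side check, both test error rates can be recomputed directly from the stored predictions:
mean(tree.pred != test.OJ$Purchase)   # unpruned tree: 0.2259259
mean(prune.pred != test.OJ$Purchase)  # pruned tree:   0.2592593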