# Compare the three node-impurity measures for a two-class problem as a
# function of p, the proportion of observations in the first class.
p <- seq(0, 1, 0.01)
gini <- 2 * p * (1 - p)
entropy <- -(p * log(p) + (1 - p) * log(1 - p))
# At p = 0 and p = 1 the expression above is 0 * -Inf = NaN, which silently
# drops the end points of the entropy curve from the plot. The limiting value
# of x * log(x) as x -> 0 is 0, so use that instead.
entropy[is.nan(entropy)] <- 0
class.err <- 1 - pmax(p, 1 - p)
# One column per measure: Gini (red), entropy (green), classification
# error (blue).
matplot(p, cbind(gini, entropy, class.err), col = c("red", "green", "blue"))
# Split the Carseats data (ISLR package) into two halves: a random training
# sample and the remaining rows as a test set.
library(ISLR)
## Warning: package 'ISLR' was built under R version 3.6.2
# attach(Carseats) was removed: the visible code always passes `data =` or
# uses `$`, so attaching only added name-masking risk on the search path.
set.seed(420)  # fixed seed so the split is reproducible
train <- sample(nrow(Carseats), nrow(Carseats) / 2)
Carseats.train <- Carseats[train, ]
Carseats.test <- Carseats[-train, ]
# Fit an unpruned regression tree of Sales on all other columns of the
# training half, then print its summary.
library(tree)
## Warning: package 'tree' was built under R version 3.6.3
tree.carseats <- tree(formula = Sales ~ ., data = Carseats.train)
summary(object = tree.carseats)
##
## Regression tree:
## tree(formula = Sales ~ ., data = Carseats.train)
## Variables actually used in tree construction:
## [1] "ShelveLoc" "Price" "Advertising" "CompPrice" "Income"
## [6] "Age" "US"
## Number of terminal nodes: 18
## Residual mean deviance: 1.989 = 362 / 182
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -2.87900 -0.97120 -0.01688 0.00000 0.98090 4.00600
# Draw the fitted tree and label its splits; pretty = 0 keeps factor level
# names unabbreviated in the split labels.
plot(tree.carseats)
text(tree.carseats, pretty = 0)
# Score the tree on the held-out half and report the test MSE.
pred.carseats <- predict(tree.carseats, newdata = Carseats.test)
mean((pred.carseats - Carseats.test$Sales)^2)
## [1] 5.804668
The test MSE of the unpruned tree is about 5.80.
# Cross-validate the pruning sequence of the regression tree.
cv.carseats <- cv.tree(tree.carseats, FUN = prune.tree)
# Two side-by-side panels: CV deviance against tree size, and against k.
par(mfrow = c(1, 2))
plot(cv.carseats$size, cv.carseats$dev, type = "b")
plot(cv.carseats$k, cv.carseats$dev, type = "b")
# NOTE(review): this comment previously read "Best size = 9", but the call
# below prunes to 7 terminal nodes -- confirm the intended size against the
# CV plot above.
pruned.carseats <- prune.tree(tree.carseats, best = 7)
par(mfrow = c(1, 1))  # back to a single panel for the tree plot
plot(pruned.carseats)
text(pruned.carseats, pretty = 0)
# Test MSE of the pruned tree.
pred.pruned <- predict(pruned.carseats, newdata = Carseats.test)
mean((pred.pruned - Carseats.test$Sales)^2)
## [1] 5.590505
Pruning the tree gives a test MSE of about 5.59, a slight improvement over the unpruned tree's 5.80.
# Bagging: a random forest in which every predictor is a split candidate at
# each node.
library(randomForest)
## Warning: package 'randomForest' was built under R version 3.6.3
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
# mtry is derived from the data (every column except the response Sales)
# rather than hard-coded to 10, so it stays correct if columns change.
# importance = TRUE is spelled out because T is an ordinary variable that
# can be reassigned.
bag.carseats <- randomForest(Sales ~ ., data = Carseats.train,
  mtry = ncol(Carseats.train) - 1, ntree = 500, importance = TRUE)
# Test MSE of the bagged model.
bag.pred <- predict(bag.carseats, newdata = Carseats.test)
mean((Carseats.test$Sales - bag.pred)^2)
## [1] 3.203475
# Per-variable importance (%IncMSE and IncNodePurity).
importance(bag.carseats)
## %IncMSE IncNodePurity
## CompPrice 19.3293891 142.375289
## Income 9.7172257 101.151280
## Advertising 14.3091177 121.277258
## Population 1.7268708 58.159152
## Price 53.2910661 484.272326
## ShelveLoc 49.8205619 316.790453
## Age 7.2600657 99.165252
## Education -0.2298514 34.633861
## Urban -1.1816569 5.797032
## US 2.6532566 10.184712
The bagging test MSE is about 3.20, with Price, ShelveLoc, and CompPrice being the most important predictors.
# Random forest: only 5 predictors are split candidates at each node.
# importance = TRUE is spelled out because T is an ordinary variable that
# can be reassigned.
rf.carseats <- randomForest(Sales ~ ., data = Carseats.train, mtry = 5,
  ntree = 500, importance = TRUE)
# Test MSE of the random forest.
rf.pred <- predict(rf.carseats, newdata = Carseats.test)
mean((Carseats.test$Sales - rf.pred)^2)
## [1] 3.344854
# Per-variable importance (%IncMSE and IncNodePurity).
importance(rf.carseats)
## %IncMSE IncNodePurity
## CompPrice 13.1862220 135.396201
## Income 5.8183410 115.426670
## Advertising 10.8474857 136.779370
## Population 0.2648359 78.451301
## Price 44.1353013 420.848684
## ShelveLoc 39.5808402 279.267052
## Age 5.3569496 113.405360
## Education 1.4016062 47.632886
## Urban -1.0096678 7.777727
## US 3.6910967 14.432180
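With mtry = 5 the test MSE is about 3.34, slightly worse than bagging (3.20). Price and ShelveLoc remain the two most important predictors; CompPrice is third by %IncMSE, while Advertising narrowly edges it out by IncNodePurity.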
# Split the OJ data (ISLR package) into a random training sample of 800
# purchases and a test set of the remaining rows, then fit a classification
# tree for Purchase on all other columns.
library(ISLR)
# attach(OJ) was removed: the visible code always passes `data =` or uses
# `$`, so attaching only added name-masking risk on the search path.
set.seed(1013)  # fixed seed so the split is reproducible
train <- sample(nrow(OJ), 800)
OJ.train <- OJ[train, ]
OJ.test <- OJ[-train, ]
library(tree)
oj.tree <- tree(Purchase ~ ., data = OJ.train)
summary(oj.tree)
##
## Classification tree:
## tree(formula = Purchase ~ ., data = OJ.train)
## Variables actually used in tree construction:
## [1] "LoyalCH" "PriceDiff" "ListPriceDiff" "SalePriceMM"
## Number of terminal nodes: 7
## Residual mean deviance: 0.7564 = 599.8 / 793
## Misclassification error rate: 0.1612 = 129 / 800
The tree uses four variables (LoyalCH, PriceDiff, ListPriceDiff, and SalePriceMM), has 7 terminal nodes, and a training misclassification error rate of 0.1612.
# Print the full node-by-node description of the fitted tree.
print(oj.tree)
## node), split, n, deviance, yval, (yprob)
## * denotes terminal node
##
## 1) root 800 1069.00 CH ( 0.61125 0.38875 )
## 2) LoyalCH < 0.5036 344 407.30 MM ( 0.27907 0.72093 )
## 4) LoyalCH < 0.276142 163 121.40 MM ( 0.12270 0.87730 ) *
## 5) LoyalCH > 0.276142 181 246.30 MM ( 0.41989 0.58011 )
## 10) PriceDiff < 0.065 75 75.06 MM ( 0.20000 0.80000 ) *
## 11) PriceDiff > 0.065 106 144.50 CH ( 0.57547 0.42453 ) *
## 3) LoyalCH > 0.5036 456 366.30 CH ( 0.86184 0.13816 )
## 6) LoyalCH < 0.753545 189 224.30 CH ( 0.71958 0.28042 )
## 12) ListPriceDiff < 0.235 79 109.40 MM ( 0.48101 0.51899 )
## 24) SalePriceMM < 1.64 22 20.86 MM ( 0.18182 0.81818 ) *
## 25) SalePriceMM > 1.64 57 76.88 CH ( 0.59649 0.40351 ) *
## 13) ListPriceDiff > 0.235 110 75.81 CH ( 0.89091 0.10909 ) *
## 7) LoyalCH > 0.753545 267 85.31 CH ( 0.96255 0.03745 ) *
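Node 6) LoyalCH < 0.753545 189 224.30 CH ( 0.71958 0.28042 ): the splitting criterion is LoyalCH < 0.753545, 189 training observations fall in this node, the deviance is 224.30, and the predicted class is CH, with about 72% of the observations in the node being CH and 28% MM. This is an internal node; terminal nodes are the ones marked with an asterisk.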
(d) Create a plot of the tree, and interpret the results.
# Draw the classification tree and label its splits and leaves; pretty = 0
# keeps factor level names unabbreviated.
plot(x = oj.tree)
text(x = oj.tree, pretty = 0)
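LoyalCH appears to be the most important variable of the tree: the top 3 nodes all split on LoyalCH. If LoyalCH < 0.28, the tree predicts MM. If LoyalCH > 0.75, the tree predicts CH. For intermediate values of LoyalCH, the decision depends on PriceDiff (when LoyalCH is between 0.28 and 0.50) or on ListPriceDiff and SalePriceMM (when LoyalCH is between 0.50 and 0.75).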
# Predict the brand of each test purchase and cross-tabulate the actual brand
# (rows) against the predicted brand (columns).
oj.pred <- predict(oj.tree, newdata = OJ.test, type = "class")
table(OJ.test$Purchase, oj.pred)
## oj.pred
## CH MM
## CH 149 15
## MM 30 76
# The identical table() call was repeated here; report the test error rate
# instead. From the table above it is (15 + 30) / 270.
mean(oj.pred != OJ.test$Purchase)
## [1] 0.1666667
Produce a plot with tree size on the x-axis and cross-validated classification error rate on the y-axis. (The code for this step would not knit, so the plot is omitted.)
Which tree size corresponds to the lowest cross-validated classification error rate? 7
Produce a pruned tree corresponding to the optimal tree size obtained using cross-validation. If cross-validation does not lead to selection of a pruned tree, then create a pruned tree with five terminal nodes.
# NOTE(review): the unpruned tree already has 7 terminal nodes, so best = 7
# leaves it unchanged. The exercise text says to fall back to five terminal
# nodes when cross-validation does not select a smaller tree -- confirm
# whether best = 5 was intended.
oj.pruned <- prune.tree(tree = oj.tree, best = 7)
summary(object = oj.pruned)
##
## Classification tree:
## tree(formula = Purchase ~ ., data = OJ.train)
## Variables actually used in tree construction:
## [1] "LoyalCH" "PriceDiff" "ListPriceDiff" "SalePriceMM"
## Number of terminal nodes: 7
## Residual mean deviance: 0.7564 = 599.8 / 793
## Misclassification error rate: 0.1612 = 129 / 800
The training misclassification error rate is still 0.1612, because pruning to 7 terminal nodes leaves the tree unchanged.
# Test error rate of the unpruned tree: the share of test purchases whose
# predicted brand differs from the actual one.
pred.unpruned <- predict(oj.tree, newdata = OJ.test, type = "class")
misclass.unpruned <- sum(pred.unpruned != OJ.test$Purchase)
misclass.unpruned / length(pred.unpruned)
## [1] 0.1666667
# Same calculation for the pruned tree.
pred.pruned <- predict(oj.pruned, newdata = OJ.test, type = "class")
misclass.pruned <- sum(pred.pruned != OJ.test$Purchase)
misclass.pruned / length(pred.pruned)
## [1] 0.1666667
They have equal test misclassification rates (0.1667), because the pruned tree is identical to the unpruned one.