This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
# Compare three node-impurity measures for a two-class node as a function of
# p, the proportion of observations belonging to the first class.
p <- seq(0, 1, 0.01)
gini <- p * (1 - p) * 2
# x * log(x) evaluates to NaN at x = 0 (0 * -Inf). Use its limit, 0, there so
# the entropy curve is defined at the endpoints p = 0 and p = 1.
xlogx <- function(x) ifelse(x > 0, x * log(x), 0)
entropy <- -(xlogx(p) + xlogx(1 - p))
class.err <- 1 - pmax(p, 1 - p)
# One column per measure: gini (green), entropy (blue), class.err (red).
matplot(p, cbind(gini, entropy, class.err), col = c("green", "blue", "red"))
library(ISLR2)
## Warning: package 'ISLR2' was built under R version 4.1.3
library(tree)
## Warning: package 'tree' was built under R version 4.1.3
# Put the Carseats columns on the search path and fix the RNG seed so the
# random split below is reproducible.
attach(Carseats)
set.seed(1)
###a
# Draw half of the row indices at random for training; every remaining row
# goes to the test set.
n.obs <- nrow(Carseats)
train <- sample(n.obs, n.obs / 2)
Carseats.train <- Carseats[train, ]
Carseats.test <- Carseats[-train, ]
###b
# Fit a tree for Sales on all other columns of the training half and draw it
# with its split labels.
tree.carseats <- tree(Sales ~ ., data = Carseats.train)
plot(tree.carseats)
text(tree.carseats, pretty = 0)
# Mean squared error of the unpruned tree on the held-out half.
pred.carseats <- predict(tree.carseats, newdata = Carseats.test)
mean((pred.carseats - Carseats.test$Sales)^2)
## [1] 4.922039
The test MSE of the unpruned regression tree is about 4.92.
###c
# Cross-validate the pruning sequence of the fitted tree.
cv.carseats <- cv.tree(tree.carseats, FUN = prune.tree)
# Side by side: CV deviance against tree size and against the penalty k.
par(mfrow = c(1, 2))
plot(cv.carseats$size, cv.carseats$dev, type = "b")
plot(cv.carseats$k, cv.carseats$dev, type = "b")
par(mfrow = c(1, 1))
# Prune to 9 terminal nodes. The size is hard-coded here.
# NOTE(review): presumably read off the CV plot above; confirm.
pruned.carseats <- prune.tree(tree.carseats, best = 9)
plot(pruned.carseats)
text(pruned.carseats, pretty = 0)
# Mean squared error of the pruned tree on the held-out half.
pred.pruned <- predict(pruned.carseats, newdata = Carseats.test)
mean((pred.pruned - Carseats.test$Sales)^2)
## [1] 4.918134
Pruning barely changes the test MSE: it decreases only slightly, from 4.922 to 4.918.
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.1.3
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
# Bagging: a random forest in which every predictor is a split candidate at
# each node, so mtry is the number of predictors (all columns except Sales).
n.pred <- ncol(Carseats.train) - 1
# importance = TRUE (not the reassignable alias T) stores the importance
# measures that are printed below.
bag.carseats <- randomForest(Sales ~ ., data = Carseats.train, mtry = n.pred,
                             ntree = 500, importance = TRUE)
# Mean squared error of the bagged model on the held-out half.
bag.pred <- predict(bag.carseats, newdata = Carseats.test)
mean((Carseats.test$Sales - bag.pred)^2)
## [1] 2.657296
importance(bag.carseats)
## %IncMSE IncNodePurity
## CompPrice 23.07909904 171.185734
## Income 2.82081527 94.079825
## Advertising 11.43295625 99.098941
## Population -3.92119532 59.818905
## Price 54.24314632 505.887016
## ShelveLoc 46.26912996 361.962753
## Age 14.24992212 159.740422
## Education -0.07662320 46.738585
## Urban 0.08530119 8.453749
## US 4.34349223 15.157608
Using the bagging approach, the test MSE drops to about 2.66. Additionally, the Price, ShelveLoc, and CompPrice variables are the most important.
# Random forest: only 5 randomly chosen predictors are split candidates at
# each node (mtry = 5), in contrast to bagging, which considers all of them.
# importance = TRUE (not the reassignable alias T) stores the importance
# measures that are printed below.
rf.carseats <- randomForest(Sales ~ ., data = Carseats.train, mtry = 5,
                            ntree = 500, importance = TRUE)
# Mean squared error of the random forest on the held-out half.
rf.pred <- predict(rf.carseats, newdata = Carseats.test)
mean((Carseats.test$Sales - rf.pred)^2)
## [1] 2.701665
importance(rf.carseats)
## %IncMSE IncNodePurity
## CompPrice 19.8160444 162.73603
## Income 2.8940268 106.96093
## Advertising 11.6799573 106.30923
## Population -1.6998805 79.04937
## Price 46.3454015 448.33554
## ShelveLoc 40.4412189 334.33610
## Age 12.5440659 169.06125
## Education 1.0762096 55.87510
## Urban 0.5703583 13.21963
## US 5.8799999 25.59797
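The random forest test MSE is about 2.70, slightly higher than the bagging MSE of 2.66. The most important variables are Price and ShelveLoc, followed by CompPrice (third by %IncMSE) or Age (third by IncNodePurity).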
###a
library(ISLR2)
attach(OJ)
library(tree)
set.seed(1)
# A random sample of 800 rows is the training set; all other rows are held
# out as the test set.
train <- sample(nrow(OJ), 800)
OJ.train <- OJ[train, ]
OJ.test <- OJ[-train, ]
###b
# Fit a tree for Purchase using every other column as a candidate predictor,
# then summarise it.
oj.tree <- tree(Purchase ~ ., data = OJ.train)
summary(oj.tree)
##
## Classification tree:
## tree(formula = Purchase ~ ., data = OJ.train)
## Variables actually used in tree construction:
## [1] "LoyalCH" "PriceDiff" "SpecialCH" "ListPriceDiff"
## [5] "PctDiscMM"
## Number of terminal nodes: 9
## Residual mean deviance: 0.7432 = 587.8 / 791
## Misclassification error rate: 0.1588 = 127 / 800
The tree has 9 terminal nodes and a training misclassification error rate of 0.1588 (127 / 800). Only five of the predictors are actually used in its splits.
###c
# Text listing of every node: split rule, number of observations, deviance,
# predicted class and class proportions.
print(oj.tree)
## node), split, n, deviance, yval, (yprob)
## * denotes terminal node
##
## 1) root 800 1073.00 CH ( 0.60625 0.39375 )
## 2) LoyalCH < 0.5036 365 441.60 MM ( 0.29315 0.70685 )
## 4) LoyalCH < 0.280875 177 140.50 MM ( 0.13559 0.86441 )
## 8) LoyalCH < 0.0356415 59 10.14 MM ( 0.01695 0.98305 ) *
## 9) LoyalCH > 0.0356415 118 116.40 MM ( 0.19492 0.80508 ) *
## 5) LoyalCH > 0.280875 188 258.00 MM ( 0.44149 0.55851 )
## 10) PriceDiff < 0.05 79 84.79 MM ( 0.22785 0.77215 )
## 20) SpecialCH < 0.5 64 51.98 MM ( 0.14062 0.85938 ) *
## 21) SpecialCH > 0.5 15 20.19 CH ( 0.60000 0.40000 ) *
## 11) PriceDiff > 0.05 109 147.00 CH ( 0.59633 0.40367 ) *
## 3) LoyalCH > 0.5036 435 337.90 CH ( 0.86897 0.13103 )
## 6) LoyalCH < 0.764572 174 201.00 CH ( 0.73563 0.26437 )
## 12) ListPriceDiff < 0.235 72 99.81 MM ( 0.50000 0.50000 )
## 24) PctDiscMM < 0.196197 55 73.14 CH ( 0.61818 0.38182 ) *
## 25) PctDiscMM > 0.196197 17 12.32 MM ( 0.11765 0.88235 ) *
## 13) ListPriceDiff > 0.235 102 65.43 CH ( 0.90196 0.09804 ) *
## 7) LoyalCH > 0.764572 261 91.20 CH ( 0.95785 0.04215 ) *
I picked the node labeled 20, which is a terminal node (marked with *). It is reached through the split SpecialCH < 0.5 on its parent, node 10, so the splitting variable is SpecialCH and the splitting value is 0.5. The node contains 64 training observations, and the deviance of the observations in it is 51.98 (about 52). The predicted class is MM: around 14% of the observations in this node have Purchase = CH and the remaining 86% have Purchase = MM.
###d
# Draw the tree, then label its nodes; pretty = 0 is passed through to the
# labelling call unchanged.
plot(x = oj.tree)
text(x = oj.tree, pretty = 0)
LoyalCH is the splitting variable at the root and at both of its child nodes, so it is the most important predictor in this tree. If LoyalCH < 0.28, the
tree predicts MM. If LoyalCH > 0.76, the tree predicts CH.
###e
# Predicted class labels for the held-out rows, cross-tabulated against the
# observed Purchase (rows = observed, columns = predicted).
oj.pred <- predict(oj.tree, newdata = OJ.test, type = "class")
table(OJ.test$Purchase, oj.pred)
## oj.pred
## CH MM
## CH 160 8
## MM 38 64
###f
# Cross-validate the pruning sequence of the tree.
cv.oj <- cv.tree(oj.tree, FUN = prune.tree)
###g
# Cross-validated deviance for each tree size in the sequence.
plot(cv.oj$size, cv.oj$dev, type = "b", xlab = "Tree Size", ylab = "Deviance")
###h A tree size of 6 has the lowest cross-validation error
###i
# Prune the full tree back to the 6 terminal nodes chosen in (h).
oj.pruned <- prune.tree(oj.tree, best = 6)
###j
# Summary of the pruned tree, including its training misclassification rate.
summary(oj.pruned)
##
## Classification tree:
## snip.tree(tree = oj.tree, nodes = c(10L, 4L, 12L))
## Variables actually used in tree construction:
## [1] "LoyalCH" "PriceDiff" "ListPriceDiff"
## Number of terminal nodes: 6
## Residual mean deviance: 0.7919 = 628.8 / 794
## Misclassification error rate: 0.1788 = 143 / 800
The training misclassification error rate of the pruned tree is 0.1788, which is higher than the 0.1588 of the unpruned tree.
###k
# Test error rate of the unpruned tree: misclassified held-out rows divided
# by the number of predictions.
pred.unpruned <- predict(oj.tree, newdata = OJ.test, type = "class")
misclass.unpruned <- sum(pred.unpruned != OJ.test$Purchase)
misclass.unpruned / length(pred.unpruned)
## [1] 0.1703704
# Same calculation for the pruned tree.
pred.pruned <- predict(oj.pruned, newdata = OJ.test, type = "class")
misclass.pruned <- sum(pred.pruned != OJ.test$Purchase)
misclass.pruned / length(pred.pruned)
## [1] 0.1851852
The pruned tree has the higher test error rate: 0.185, compared with 0.170 for the unpruned tree.