p <- seq(0, 1, length = 100)
gini <- 2 * p * (1 - p)
error <- pmin(p, 1 - p)
entropy <- -(
  ifelse(p == 0, 0, p * log2(p)) +
  ifelse(p == 1, 0, (1 - p) * log2(1 - p))
)
plot(p, gini,
     type = "l", lwd = 2,
     ylim = c(0, 1),
     xlab = expression(hat(p)[m1]),
     ylab = "Impurity / Entropy"
)
lines(p, error, lwd = 2, lty = 2)
lines(p, entropy, lwd = 2, lty = 3)
legend("top",
       legend = c("Gini index", "Classification error", "Entropy"),
       lty = c(1, 2, 3),
       lwd = 2
)
library(ISLR)
library(tree)
library(randomForest)
install.packages("BayesTree")
library(BayesTree)
set.seed(42)
train_idx <- sample(seq_len(nrow(Carseats)), size = 0.7*nrow(Carseats))
Car_train <- Carseats[train_idx, ]
Car_test <- Carseats[-train_idx, ]
tree.carseats <- tree(Sales ~ ., data = Car_train)
install.packages("rpart.plot")
library(rpart)
library(rpart.plot)
rt <- rpart(Sales ~ ., data = Car_train, method = "anova")
rpart.plot(rt,
           type = 2,
           extra = 101,
           tweak = 1.2,
           shadow.col = "gray90",
           fallen.leaves = TRUE,
           main = "Regression Tree for Carseats (rpart.plot)")
pred.tree <- predict(tree.carseats, newdata = Car_test)
mse.tree <- mean((Car_test$Sales - pred.tree)^2)
mse.tree
## [1] 4.214264
The test MSE that I obtain for the unpruned regression tree is 4.214264.
cv.carseats <- cv.tree(tree.carseats, FUN = prune.tree)
plot(cv.carseats$size, cv.carseats$dev, type = "b",
     xlab = "Tree size (# terminal nodes)",
     ylab = "CV deviance")
best.size <- cv.carseats$size[which.min(cv.carseats$dev)]
pruned.tree <- prune.tree(tree.carseats, best = best.size)
plot(pruned.tree); text(pruned.tree, pretty = 0)
pred.pruned <- predict(pruned.tree, newdata = Car_test)
mse.pruned <- mean((Car_test$Sales - pred.pruned)^2)
mse.pruned
## [1] 4.170757
Pruning the tree slightly improved the test MSE: it decreased from 4.214264 to 4.170757.
set.seed(42)
bag.carseats <- randomForest(
  Sales ~ .,
  data = Car_train,
  mtry = ncol(Car_train) - 1,
  ntree = 500,
  importance = TRUE
)
pred.bag <- predict(bag.carseats, newdata = Car_test)
mse.bag <- mean((Car_test$Sales - pred.bag)^2)
mse.bag
## [1] 2.008767
importance(bag.carseats)
## %IncMSE IncNodePurity
## CompPrice 36.17125995 284.639884
## Income 9.17309008 122.646819
## Advertising 21.48508768 174.232464
## Population 0.07498017 83.290458
## Price 61.55615122 648.631596
## ShelveLoc 68.93170258 585.299090
## Age 18.06108045 197.675735
## Education 4.10601657 58.406892
## Urban 2.52710047 9.704960
## US 1.77709308 8.463419
varImpPlot(bag.carseats)
The test MSE that I obtain with bagging is 2.008767, a large improvement over the single pruned tree (4.17). The importance measures show that ShelveLoc and Price are by far the most influential predictors.
set.seed(42)
rf.carseats <- randomForest(
  Sales ~ .,
  data = Car_train,
  mtry = floor((ncol(Car_train) - 1) / 3),
  ntree = 500,
  importance = TRUE
)
pred.rf <- predict(rf.carseats, newdata = Car_test)
mse.rf <- mean((Car_test$Sales - pred.rf)^2)
mse.rf
## [1] 2.642094
The test MSE that I obtain with the random forest (mtry = 3) is 2.642094, which is higher than bagging on this train/test split.
importance(rf.carseats)
## %IncMSE IncNodePurity
## CompPrice 17.6317246 216.58010
## Income 5.3918611 171.93417
## Advertising 15.4372996 208.01838
## Population 1.0167576 147.55649
## Price 40.9793223 507.22018
## ShelveLoc 46.1777066 461.41692
## Age 12.0906006 235.55478
## Education 2.8932704 102.44743
## Urban 0.3027265 19.85041
## US 3.5234743 30.83360
varImpPlot(rf.carseats)
mtry.values <- c(2, 4, 6, 8, 10)
mse.by.mtry <- sapply(mtry.values, function(m) {
  rf <- randomForest(Sales ~ ., data = Car_train, mtry = m, ntree = 200)
  mean((Car_test$Sales - predict(rf, Car_test))^2)
})
data.frame(mtry = mtry.values, TestMSE = mse.by.mtry)
## mtry TestMSE
## 1 2 3.298154
## 2 4 2.414009
## 3 6 2.097258
## 4 8 2.019671
## 5 10 1.916187
From these results we can see that increasing m lowers the test error on this data: each tree becomes less biased, and the test MSE is lowest at m = 10 (i.e., bagging). When m is small, the trees are more diverse (less correlated) but individually more biased, which hurts here. The trend is easiest to see when plotted, as sketched below.
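The sketch below simply plots the mtry.values, mse.by.mtry, and mse.bag objects computed above; nothing new is fit.
plot(mtry.values, mse.by.mtry,
     type = "b", pch = 19,
     xlab = "mtry (predictors tried at each split)",
     ylab = "Test MSE",
     main = "Test MSE vs. mtry (Carseats)")
abline(h = mse.bag, lty = 2)   # bagging fit earlier (mtry = 10, ntree = 500), for reference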
set.seed(42)
bart.carseats <- bart(
  x.train = model.matrix(Sales ~ . - 1, data = Car_train),
  y.train = Car_train$Sales,
  x.test = model.matrix(Sales ~ . - 1, data = Car_test),
  ntree = 50
)
##
##
## Running BART with numeric y
##
## number of trees: 50
## Prior:
## k: 2.000000
## degrees of freedom in sigma prior: 3
## quantile in sigma prior: 0.900000
## power and base for tree prior: 2.000000 0.950000
## use quantiles for rule cut points: 0
## data:
## number of training observations: 280
## number of test observations: 120
## number of explanatory variables: 12
##
##
## Cutoff rules c in x<=c vs x>c
## Number of cutoffs: (var: number of possible c):
## (1: 100) (2: 100) (3: 100) (4: 100) (5: 100)
## (6: 100) (7: 100) (8: 100) (9: 100) (10: 100)
## (11: 100) (12: 100)
##
##
## Running mcmc loop:
## iteration: 100 (of 1100)
## iteration: 200 (of 1100)
## iteration: 300 (of 1100)
## iteration: 400 (of 1100)
## iteration: 500 (of 1100)
## iteration: 600 (of 1100)
## iteration: 700 (of 1100)
## iteration: 800 (of 1100)
## iteration: 900 (of 1100)
## iteration: 1000 (of 1100)
## iteration: 1100 (of 1100)
## time for loop: 2
##
## Tree sizes, last iteration:
## 2 2 3 2 2 2 2 2 2 4 3 2 1 3 3 3 3 2 2 2
## 2 2 2 2 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2
## 3 2 2 3 2 2 2 2 2 2
## Variable Usage, last iteration (var:count):
## (1: 10) (2: 2) (3: 7) (4: 4) (5: 11)
## (6: 7) (7: 6) (8: 2) (9: 3) (10: 2)
## (11: 1) (12: 5)
## DONE BART 11-2-2014
pred.bart <- bart.carseats$yhat.test.mean
mse.bart <- mean((Car_test$Sales - pred.bart)^2)
mse.bart
## [1] 1.249302
Based on these results, most of the 50 trees are very small (one to four terminal nodes), which is what BART is designed to produce, and the splits concentrate on the predictors with the highest variable-usage counts. The test MSE of 1.249302 is lower than that of every previous method; the comparison is collected in the sketch below.
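The comparison can be made explicit by gathering the test MSEs computed above into one table. The second part of the sketch is an assumption: it reads the per-variable split counts from the varcount component that the BayesTree bart object returns, and labels them with the model-matrix column names.
# Test MSEs of all methods fit above, side by side
data.frame(
  Method  = c("Single tree", "Pruned tree", "Bagging", "Random forest", "BART"),
  TestMSE = c(mse.tree, mse.pruned, mse.bag, mse.rf, mse.bart)
)
# Average number of splitting rules using each predictor, across posterior draws
# (assumes bart.carseats$varcount is the draws-by-variables count matrix)
vc <- colMeans(bart.carseats$varcount)
names(vc) <- colnames(model.matrix(Sales ~ . - 1, data = Car_train))
sort(vc, decreasing = TRUE)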
library(ISLR2)
##
## Attaching package: 'ISLR2'
## The following objects are masked from 'package:ISLR':
##
## Auto, Credit
library(tree)
set.seed(123)
n <- nrow(OJ)
idx <- sample(n, 800)
OJ.train <- OJ[idx, ]
OJ.test <- OJ[-idx, ]
oj.tree <- tree(Purchase ~ ., data = OJ.train)
train.pred <- predict(oj.tree, OJ.train, type = "class")
train.err <- mean(train.pred != OJ.train$Purchase)
cat("Training error rate =", round(train.err, 4), "\n")
## Training error rate = 0.165
oj.tree <- tree(Purchase ~ ., data = OJ.train)
oj.sum <- summary(oj.tree)
oj.sum
##
## Classification tree:
## tree(formula = Purchase ~ ., data = OJ.train)
## Variables actually used in tree construction:
## [1] "LoyalCH" "PriceDiff"
## Number of terminal nodes: 8
## Residual mean deviance: 0.7625 = 603.9 / 792
## Misclassification error rate: 0.165 = 132 / 800
Only two predictors, LoyalCH and PriceDiff, were actually used in building the tree, and the tree has 8 terminal nodes. The training error rate is 0.165 (132 of 800 training observations misclassified), i.e. 16.5%.
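Both reported rates can be recomputed from the summary object; a small sketch, assuming the standard dev, df, and misclass components of summary.tree:
oj.sum$dev / oj.sum$df                   # residual mean deviance: 603.9 / 792 = 0.7625
oj.sum$misclass[1] / oj.sum$misclass[2]  # misclassification rate: 132 / 800 = 0.165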
oj.tree
## node), split, n, deviance, yval, (yprob)
## * denotes terminal node
##
## 1) root 800 1071.00 CH ( 0.60875 0.39125 )
## 2) LoyalCH < 0.5036 350 415.10 MM ( 0.28000 0.72000 )
## 4) LoyalCH < 0.276142 170 131.00 MM ( 0.12941 0.87059 )
## 8) LoyalCH < 0.0356415 56 10.03 MM ( 0.01786 0.98214 ) *
## 9) LoyalCH > 0.0356415 114 108.90 MM ( 0.18421 0.81579 ) *
## 5) LoyalCH > 0.276142 180 245.20 MM ( 0.42222 0.57778 )
## 10) PriceDiff < 0.05 74 74.61 MM ( 0.20270 0.79730 ) *
## 11) PriceDiff > 0.05 106 144.50 CH ( 0.57547 0.42453 ) *
## 3) LoyalCH > 0.5036 450 357.10 CH ( 0.86444 0.13556 )
## 6) PriceDiff < -0.39 27 32.82 MM ( 0.29630 0.70370 ) *
## 7) PriceDiff > -0.39 423 273.70 CH ( 0.90071 0.09929 )
## 14) LoyalCH < 0.705326 130 135.50 CH ( 0.78462 0.21538 )
## 28) PriceDiff < 0.145 43 58.47 CH ( 0.58140 0.41860 ) *
## 29) PriceDiff > 0.145 87 62.07 CH ( 0.88506 0.11494 ) *
## 15) LoyalCH > 0.705326 293 112.50 CH ( 0.95222 0.04778 ) *
I will pick node 4 (LoyalCH < 0.276142). It contains 170 observations, has a deviance of 131.00, and predicts Minute Maid for everyone in it; only 12.94% of its observations (1 − 0.87059) actually bought CH. In short, the customers in this branch have very low CH loyalty.
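The same numbers can be read straight from the tree's frame, which stores one row per node, indexed by node number:
# n = observations, dev = deviance, yval = predicted class, yprob = class proportions
oj.tree$frame["4", ]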
plot(oj.tree)
text(oj.tree, pretty = 0, cex = 0.7)
title("Classification Tree for OJ (training set)")
We can see that CH loyalty (LoyalCH) is the dominant predictor: it forms the first several splits. Price difference matters only once a moderate level of brand loyalty is established, and a large enough price penalty (PriceDiff < -0.39) can overcome even strong loyalty. Very weak loyalty almost always defaults to MM regardless of price, unless the offer is favorable enough.
test.pred <- predict(oj.tree, OJ.test, type = "class")
cm <- table(OJ.test$Purchase, test.pred)
cm
## test.pred
## CH MM
## CH 150 16
## MM 34 70
test.err <- mean(test.pred != OJ.test$Purchase)
cat("Test error rate =", round(test.err, 4), "\n")
## Test error rate = 0.1852
The test error rate is 0.1852, or 18.52% (50 of the 270 test observations are misclassified), as confirmed below.
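A quick check that recovers the same rate from the confusion matrix cm:
1 - sum(diag(cm)) / sum(cm)   # (16 + 34) / 270 = 0.1852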
set.seed(123)
cv.oj <- cv.tree(oj.tree, FUN = prune.tree, method = "misclass")
cv.oj
## $size
## [1] 8 5 3 2 1
##
## $dev
## [1] 139 139 157 167 313
##
## $k
## [1] -Inf 0 8 11 154
##
## $method
## [1] "misclass"
##
## attr(,"class")
## [1] "prune" "tree.sequence"
# 1) Find the index of the minimum CV deviance
best_idx <- which.min(cv.oj$dev)
# 2) Look up the corresponding tree size
opt_size <- cv.oj$size[best_idx]
# 3) Compare it to the size of the full tree
full_size <- sum(oj.tree$frame$var == "<leaf>")
cat("Full tree size:", full_size, "\n",
    "CV-optimal size:", opt_size, "\n")
## Full tree size: 8
## CV-optimal size: 8
plot(cv.oj$size, cv.oj$dev, type = "b",
     xlab = "Tree size (no. terminal nodes)",
     ylab = "CV misclass. error",
     main = "CV for OJ Tree")
opt_size <- cv.oj$size[which.min(cv.oj$dev)]
cat("Optimal tree size =", opt_size, "\n")
## Optimal tree size = 8
The tree size with the lowest cross-validated classification error rate is 8 terminal nodes. (Sizes 8 and 5 actually tie at a CV deviance of 139; which.min() returns the first of the tied values, as checked below.)
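A quick check that lists every size attaining the minimum CV error:
cv.oj$size[cv.oj$dev == min(cv.oj$dev)]   # sizes tied at the minimum deviance of 139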
pruned.oj <- prune.tree(oj.tree, best = 5, method = "misclass")
plot(pruned.oj)
text(pruned.oj, pretty = 0, cex = 0.7)
title("Pruned OJ Tree")
train.pred1 <- predict(oj.tree, OJ.train, type = "class")
err1 <- mean(train.pred1 != OJ.train$Purchase)
train.pred2 <- predict(pruned.oj, OJ.train, type = "class")
err2 <- mean(train.pred2 != OJ.train$Purchase)
cat("Unpruned training error:", round(err1,4), "\n",
"Pruned training error:", round(err2,4), "\n")
## Unpruned training error: 0.165
## Pruned training error: 0.165
They are the same because pruning to 5 terminal nodes only merged leaves that already predicted the same class, so no training observation's predicted label changed, and therefore the misclassification rate did not change either. This is verified below.
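This can be checked directly by comparing the two sets of fitted labels already computed (train.pred1 and train.pred2):
# If pruning left every training label unchanged, all counts fall on the diagonal
table(unpruned = train.pred1, pruned = train.pred2)
all(train.pred1 == train.pred2)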
test.pred1 <- predict(oj.tree, OJ.test, type = "class")
test.err1 <- mean(test.pred1 != OJ.test$Purchase)
test.pred2 <- predict(pruned.oj, OJ.test, type = "class")
test.err2 <- mean(test.pred2 != OJ.test$Purchase)
cat("Unpruned test error:", round(test.err1,4), "\n",
"Pruned test error:", round(test.err2,4), "\n")
## Unpruned test error: 0.1852
## Pruned test error: 0.1852
They are the same for the same reason: the splits removed by pruning only separated leaves that agreed on the predicted class, so no test observation changed label and the test error rate stays at 0.1852.