knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)
3.
pm1 <- seq(0, 1, by = 0.01)
gini <- 2 * pm1 * (1 - pm1)
class_error <- 1 - pmax(pm1, 1 - pm1)
entropy <- ifelse(pm1 == 0 | pm1 == 1, 0,
-pm1 * log2(pm1) - (1 - pm1) * log2(1 - pm1))
plot(pm1, gini, type = "l", lwd = 2, col = "blue", ylim = c(0, 1),
ylab = "Impurity Measure", xlab = expression(p[m1]),
main = "Impurity Measures vs. Class Probability")
lines(pm1, class_error, col = "red", lwd = 2, lty = 2)
lines(pm1, entropy, col = "darkgreen", lwd = 2, lty = 3)
legend("top", legend = c("Gini", "Classification Error", "Entropy"),
col = c("blue", "red", "darkgreen"), lty = 1:3, lwd = 2)
8.
library(ISLR2)
library(tree)
library(randomForest)
library(BART)
data(Carseats)
a.
set.seed(1)
train_index <- sample(1:nrow(Carseats), nrow(Carseats)/2)
train <- Carseats[train_index, ]
test <- Carseats[-train_index, ]
b.
tree_model <- tree(Sales ~ ., data = train)
summary(tree_model)
##
## Regression tree:
## tree(formula = Sales ~ ., data = train)
## Variables actually used in tree construction:
## [1] "ShelveLoc" "Price" "Age" "Advertising" "CompPrice"
## [6] "US"
## Number of terminal nodes: 18
## Residual mean deviance: 2.167 = 394.3 / 182
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -3.88200 -0.88200 -0.08712 0.00000 0.89590 4.09900
plot(tree_model)
preds <- predict(tree_model, newdata = test)
mse <- mean((preds - test$Sales)^2)
mse
## [1] 4.922039
c.
set.seed(2)
cv_result <- cv.tree(tree_model)
plot(cv_result$size, cv_result$dev, type = "b",
xlab = "Tree Size (Number of Terminal Nodes)",
ylab = "Deviance (CV Error)",
main = "Cross-Validation Results")
best_size <- cv_result$size[which.min(cv_result$dev)]
best_size
## [1] 18
pruned_tree <- prune.tree(tree_model, best = best_size)
plot(pruned_tree)
pruned_pred <- predict(pruned_tree, newdata = test)
mse_pruned <- mean((pruned_pred - test$Sales)^2)
mse_pruned
## [1] 4.922039
d.
set.seed(1)
bag_model <- randomForest(Sales ~ ., data = train, mtry = ncol(train) - 1, importance = TRUE)
bag_pred <- predict(bag_model, newdata = test)
mse_bag <- mean((bag_pred - test$Sales)^2)
mse_bag
## [1] 2.605253
importance(bag_model)
## %IncMSE IncNodePurity
## CompPrice 24.8888481 170.182937
## Income 4.7121131 91.264880
## Advertising 12.7692401 97.164338
## Population -1.8074075 58.244596
## Price 56.3326252 502.903407
## ShelveLoc 48.8886689 380.032715
## Age 17.7275460 157.846774
## Education 0.5962186 44.598731
## Urban 0.1728373 9.822082
## US 4.2172102 18.073863
e. If a small number of variables is used, each tree is more different from the others, which can help the model avoid over fitting and make better general predictions. If m is too small, the trees may miss important variables and make worse predictions. If a large m is used, the trees become more similar, which can reduce the benefit of using a forest of many trees. In this data set, choosing a middle value for m gave the best test performance and it allowed the model to use useful information without over fitting the training data.
set.seed(1)
rf_model <- randomForest(Sales ~ ., data = train, mtry = 4, importance = TRUE)
rf_pred <- predict(rf_model, newdata = test)
mse_rf <- mean((rf_pred - test$Sales)^2)
mse_rf
## [1] 2.787584
importance(rf_model)
## %IncMSE IncNodePurity
## CompPrice 15.7891655 160.57944
## Income 4.1275509 121.12953
## Advertising 9.6425758 111.54581
## Population -1.3596645 85.92575
## Price 43.4055391 423.06225
## ShelveLoc 37.8850232 311.97119
## Age 13.8924424 174.18229
## Education 0.1960888 62.77782
## Urban 0.1393816 12.92952
## US 6.3532441 30.42255
f. Using BART to predict how many car seats a store might sell, it performed very well, with a low test error, meaning its guesses were usually close to the real numbers. BART was especially good at avoiding mistakes that come from over fitting. Overall, BART gave accurate predictions and helped highlight which store features most affect sales.
set.seed(1)
x_train <- train[, -which(names(train) == "Sales")]
y_train <- train$Sales
x_test <- test[, -which(names(test) == "Sales")]
y_test <- test$Sales
bart_model <- gbart(x.train = x_train, y.train = y_train, x.test = x_test)
## *****Calling gbart: type=1
## *****Data:
## data:n,p,np: 200, 14, 200
## y1,yn: 2.781850, 1.091850
## x1,x[n*p]: 107.000000, 1.000000
## xp1,xp[np*p]: 111.000000, 1.000000
## *****Number of Trees: 200
## *****Number of Cut Points: 63 ... 1
## *****burn,nd,thin: 100,1000,1
## *****Prior:beta,alpha,tau,nu,lambda,offset: 2,0.95,0.273474,3,0.23074,7.57815
## *****sigma: 1.088371
## *****w (weights): 1.000000 ... 1.000000
## *****Dirichlet:sparse,theta,omega,a,b,rho,augment: 0,0,1,0.5,1,14,0
## *****printevery: 100
##
## MCMC
## done 0 (out of 1100)
## done 100 (out of 1100)
## done 200 (out of 1100)
## done 300 (out of 1100)
## done 400 (out of 1100)
## done 500 (out of 1100)
## done 600 (out of 1100)
## done 700 (out of 1100)
## done 800 (out of 1100)
## done 900 (out of 1100)
## done 1000 (out of 1100)
## time: 7s
## trcnt,tecnt: 1000,1000
bart_preds <- bart_model$yhat.test.mean
mse_bart <- mean((bart_preds - y_test)^2)
mse_bart
## [1] 1.450842
9.
data(OJ)
a.
set.seed(1)
train_index <- sample(1:nrow(OJ), 800)
train <- OJ[train_index, ]
test <- OJ[-train_index, ]
b.
tree_model <- tree(Purchase ~ ., data = train)
summary(tree_model)
##
## Classification tree:
## tree(formula = Purchase ~ ., data = train)
## Variables actually used in tree construction:
## [1] "LoyalCH" "PriceDiff" "SpecialCH" "ListPriceDiff"
## [5] "PctDiscMM"
## Number of terminal nodes: 9
## Residual mean deviance: 0.7432 = 587.8 / 791
## Misclassification error rate: 0.1588 = 127 / 800
c. The tree works by asking yes-or-no questions and then splitting the data based on those answers. Each final decision point in the tree is called a terminal node, and the tree being modeled has 9 terminal nodes. After testing the tree on the same data it was trained on, it got about 85% of the answers right, which means it had a training error rate of about 15%.
plot(tree_model)
d. Evaluating node 8, the model looked at how loyal a customer is to Tropicana(CH). If their loyalty score was less than 0.0356, they were put into this group. There were 59 people in this group, and the tree predicted that most of them (about 98%) would not buy Tropicana and instead chose Minute Maid. This shows that customers who weren’t very loyal to Tropicana are still more likely to choose Minute Maid. The tree uses these kinds of patterns to help stores predict what customers are likely to buy.
tree_model
## node), split, n, deviance, yval, (yprob)
## * denotes terminal node
##
## 1) root 800 1073.00 CH ( 0.60625 0.39375 )
## 2) LoyalCH < 0.5036 365 441.60 MM ( 0.29315 0.70685 )
## 4) LoyalCH < 0.280875 177 140.50 MM ( 0.13559 0.86441 )
## 8) LoyalCH < 0.0356415 59 10.14 MM ( 0.01695 0.98305 ) *
## 9) LoyalCH > 0.0356415 118 116.40 MM ( 0.19492 0.80508 ) *
## 5) LoyalCH > 0.280875 188 258.00 MM ( 0.44149 0.55851 )
## 10) PriceDiff < 0.05 79 84.79 MM ( 0.22785 0.77215 )
## 20) SpecialCH < 0.5 64 51.98 MM ( 0.14062 0.85938 ) *
## 21) SpecialCH > 0.5 15 20.19 CH ( 0.60000 0.40000 ) *
## 11) PriceDiff > 0.05 109 147.00 CH ( 0.59633 0.40367 ) *
## 3) LoyalCH > 0.5036 435 337.90 CH ( 0.86897 0.13103 )
## 6) LoyalCH < 0.764572 174 201.00 CH ( 0.73563 0.26437 )
## 12) ListPriceDiff < 0.235 72 99.81 MM ( 0.50000 0.50000 )
## 24) PctDiscMM < 0.196196 55 73.14 CH ( 0.61818 0.38182 ) *
## 25) PctDiscMM > 0.196196 17 12.32 MM ( 0.11765 0.88235 ) *
## 13) ListPriceDiff > 0.235 102 65.43 CH ( 0.90196 0.09804 ) *
## 7) LoyalCH > 0.764572 261 91.20 CH ( 0.95785 0.04215 ) *
e.
test_preds <- predict(tree_model, newdata = test, type = "class")
conf_matrix <- table(Predicted = test_preds, Actual = test$Purchase)
print(conf_matrix)
## Actual
## Predicted CH MM
## CH 160 38
## MM 8 64
test_error_rate <- mean(test_preds != test$Purchase)
print(test_error_rate)
## [1] 0.1703704
f.
set.seed(1)
cv_result <- cv.tree(tree_model, FUN = prune.misclass)
print(cv_result)
## $size
## [1] 9 8 7 4 2 1
##
## $dev
## [1] 145 145 146 146 167 315
##
## $k
## [1] -Inf 0.000000 3.000000 4.333333 10.500000 151.000000
##
## $method
## [1] "misclass"
##
## attr(,"class")
## [1] "prune" "tree.sequence"
best_size <- cv_result$size[which.min(cv_result$dev)]
best_size
## [1] 9
g.
set.seed(1)
cv_result <- cv.tree(tree_model, FUN = prune.misclass)
plot(cv_result$size, cv_result$dev, type = "b", pch = 19,
xlab = "Tree Size (Number of Terminal Nodes)",
ylab = "CV Classification Error",
main = "Cross-Validated Error vs. Tree Size")
h. Tree size 9
i.
pruned_tree <- prune.misclass(tree_model, best = best_size)
plot(pruned_tree)
text(pruned_tree, pretty = 0)
title("Pruned Classification Tree")
j and k. They are all the same since the pruned tree and the unpruned tree are the same.
train_pred_unpruned <- predict(tree_model, newdata = train, type = "class")
train_error_unpruned <- mean(train_pred_unpruned != train$Purchase)
train_pred_pruned <- predict(pruned_tree, newdata = train, type = "class")
train_error_pruned <- mean(train_pred_pruned != train$Purchase)
test_pred_unpruned <- predict(tree_model, newdata = test, type = "class")
test_error_unpruned <- mean(test_pred_unpruned != test$Purchase)
test_pred_pruned <- predict(pruned_tree, newdata = test, type = "class")
test_error_pruned <- mean(test_pred_pruned != test$Purchase)
data.frame(
Model = c("Unpruned Tree", "Pruned Tree"),
Train_Error = c(train_error_unpruned, train_error_pruned),
Test_Error = c(test_error_unpruned, test_error_pruned)
)
## Model Train_Error Test_Error
## 1 Unpruned Tree 0.15875 0.1703704
## 2 Pruned Tree 0.15875 0.1703704