Consider the Gini index, classification error, and entropy in a simple classification setting with two classes. Create a single plot that displays each of these quantities as a function of p^m1. The x-axis should display p^m1, ranging from 0 to 1, and the y-axis should display the value of the Gini index, classification error, and entropy.
p1 <- seq(0, 1, 0.001)
p2 <- 1 - p1
classification_error <- 1 - pmax(p1, p2)
gini <- p1*(1-p1) + p2*(1-p2)
entropy <- -p1*log(p1) - p2*log(p2)
data.frame(p1, p2, classification_error, gini, entropy) %>%
pivot_longer(cols = c(classification_error, gini, entropy), names_to = "metric") %>%
ggplot(aes(x = p1, y = value, col = factor(metric))) +
geom_line(size = 1) +
scale_y_continuous(breaks = seq(0, 1, 0.1), minor_breaks = NULL) +
scale_color_hue(labels = c("Classification Error", "Entropy", "Gini")) +
labs(col = "Metric",
y = "Value",
x = "Proportion (of class '1')")
In the lab, a classification tree was applied to the Carseats data set after converting Sales into a qualitative response variable. Now we will seek to predict Sales using regression trees and related approaches, treating the response as a quantitative variable.
a). Split the data set into a training set and a test set
set.seed(1, sample.kind = "Rounding")
train_index <- sample(1:nrow(Carseats), nrow(Carseats) / 2)
train <- Carseats[train_index, ]
test <- Carseats[-train_index, ]
tree_model <- tree(Sales ~ ., train)
plot(tree_model)
text(tree_model, pretty = 0, cex = 0.7)
summary(tree_model)
##
## Regression tree:
## tree(formula = Sales ~ ., data = train)
## Variables actually used in tree construction:
## [1] "ShelveLoc" "Price" "Age" "Advertising" "Income"
## [6] "CompPrice"
## Number of terminal nodes: 18
## Residual mean deviance: 2.36 = 429.5 / 182
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -4.2570 -1.0360 0.1024 0.0000 0.9301 3.9130
test_pred <- predict(tree_model, test)
mean((test_pred - test$Sales)^2)
## [1] 4.148897
Shelf location is the first variable to split followed by Price. I would think that a third node would split at age, after that splitting doesn’t makes too much sense. Test MSE of 4.148897
set.seed(2)
cv_tree_model <- cv.tree(tree_model, K = 10)
data.frame(n_leaves = cv_tree_model$size,
CV_RSS = cv_tree_model$dev) %>%
mutate(min_CV_RSS = as.numeric(min(CV_RSS) == CV_RSS)) %>%
ggplot(aes(x = n_leaves, y = CV_RSS)) +
geom_line(col = "grey55") +
geom_point(size = 2, aes(col = factor(min_CV_RSS))) +
scale_x_continuous(breaks = seq(1, 17, 2)) +
scale_y_continuous(labels = scales::comma_format()) +
scale_color_manual(values = c("deepskyblue3", "green")) +
theme(legend.position = "none") +
labs(title = "Carseats Dataset - Regression Tree",
subtitle = "Selecting the complexity parameter with cross-validation",
x = "Terminal Nodes",
y = "CV RSS")
I would go with 9 or 10 nodes. After 9 the RSS slightly increases. Going futher than 9 nodes the diagram would become too complex to be useful.
set.seed(3)
bagged_trees_model <- randomForest(y = train$Sales,
x = train[ ,-1],
mtry = ncol(train) - 1, # 10
importance = T)
test_pred <- predict(bagged_trees_model, test)
mean((test_pred - test$Sales)^2)
## [1] 2.55751
The test MSE was lowered to 2.55751 using random forest.
importance(bagged_trees_model) %>%
as.data.frame() %>%
arrange(desc(IncNodePurity))
## %IncMSE IncNodePurity
## Price 61.0815445 510.222759
## ShelveLoc 46.5308006 320.758343
## Age 20.9071380 196.309827
## CompPrice 14.3725974 133.408852
## Advertising 14.2546456 124.572583
## Income 5.9039670 78.035000
## Population 0.5521805 62.833496
## Education 1.3675478 42.260461
## US 6.8176330 15.273874
## Urban -2.7169457 7.955904
The importance () function says to go with 10 variables.
test_MSE <- c()
i <- 1
for (Mtry in 1:10) {
set.seed(3)
rf_temp <- randomForest(y = train$Sales,
x = train[ ,-1],
mtry = Mtry,
importance = T)
test_pred <- predict(rf_temp, test)
test_MSE[i] <- mean((test_pred - test$Sales)^2)
i <- i + 1
}
data.frame(mtry = 1:10, test_MSE = test_MSE) %>%
mutate(min_test_MSE = as.numeric(min(test_MSE) == test_MSE)) %>%
ggplot(aes(x = mtry, y = test_MSE)) +
geom_line(col = "grey55") +
geom_point(size = 2, aes(col = factor(min_test_MSE))) +
scale_x_continuous(breaks = seq(1, 10), minor_breaks = NULL) +
scale_color_manual(values = c("deepskyblue3", "green")) +
theme(legend.position = "none") +
labs(title = "Carseats Dataset - Random Forests",
subtitle = "Selecting 'mtry' using the test MSE",
x = "mtry",
y = "Test MSE")
### Problem 9
This problem involves the OJ data set which is part of the ISLR2 package.
set.seed(4)
train_index <- sample(1:nrow(OJ), 800)
train <- OJ[train_index, ]
test <- OJ[-train_index, ]
tree_model <- tree(Purchase ~ ., train)
summary(tree_model)
##
## Classification tree:
## tree(formula = Purchase ~ ., data = train)
## Variables actually used in tree construction:
## [1] "LoyalCH" "SalePriceMM" "PriceDiff"
## Number of terminal nodes: 7
## Residual mean deviance: 0.7677 = 608.8 / 793
## Misclassification error rate: 0.1788 = 143 / 800
The training error rate is 17.88%. The number of terminal nodes is 7. The variables used in the split were: LoyalCH, SalePriceMM, and PriceDiff.
plot(tree_model)
text(tree_model, pretty = 0, cex = 0.7)
This tree shows people’s preference for Citrus Hill or Minute Maid drinks. The first terminal node shows loyalty, while the bottom terminal nodes show how price might sway someone’s answer.
test_pred <- predict(tree_model, test, type = "class")
table(test_pred, test_actual = test$Purchase)
## test_actual
## test_pred CH MM
## CH 153 38
## MM 15 64
1 - mean(test_pred == test$Purchase)
## [1] 0.1962963
1 - mean(test$Purchase == "CH")
## [1] 0.3777778
set.seed(5)
cv_tree_model <- cv.tree(tree_model, K = 10, FUN = prune.misclass)
cv_tree_model
## $size
## [1] 7 4 2 1
##
## $dev
## [1] 166 166 166 315
##
## $k
## [1] -Inf 0 4 164
##
## $method
## [1] "misclass"
##
## attr(,"class")
## [1] "prune" "tree.sequence"
data.frame(size = cv_tree_model$size, CV_Error = cv_tree_model$dev / nrow(train)) %>%
mutate(min_CV_Error = as.numeric(min(CV_Error) == CV_Error)) %>%
ggplot(aes(x = size, y = CV_Error)) +
geom_line(col = "grey55") +
geom_point(size = 2, aes(col = factor(min_CV_Error))) +
scale_x_continuous(breaks = seq(1, 7), minor_breaks = NULL) +
scale_y_continuous(labels = scales::percent_format()) +
scale_color_manual(values = c("deepskyblue3", "green")) +
theme(legend.position = "none") +
labs(title = "OJ Dataset - Classification Tree",
subtitle = "Selecting tree 'size' (# of terminal nodes) using cross-validation",
x = "Tree Size",
y = "CV Error")
The plot shows tree sizes of 2, 4, and 7. I will choose tree size 2.
pruned_tree_model <- prune.tree(tree_model, best = 4)
summary(pruned_tree_model)
##
## Classification tree:
## snip.tree(tree = tree_model, nodes = 4:6)
## Variables actually used in tree construction:
## [1] "LoyalCH"
## Number of terminal nodes: 4
## Residual mean deviance: 0.8413 = 669.7 / 796
## Misclassification error rate: 0.1888 = 151 / 800