library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.2 v dplyr 1.0.7
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(MASS) # Boston data
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library(ISLR) # Carseats data
library(randomForest) # random forests
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
library(tree) # trees
## Registered S3 method overwritten by 'tree':
## method from
## print.tree cli
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:randomForest':
##
## combine
## The following object is masked from 'package:dplyr':
##
## combine
library(gbm)
## Loaded gbm 2.1.8
library(gam)
## Loading required package: splines
## Loading required package: foreach
##
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
##
## accumulate, when
## Loaded gam 1.20
prop_class_1 <- seq(0, 1, 0.001)
prop_class_2 <- 1 - prop_class_1
classification_error <- 1 - pmax(prop_class_1, prop_class_2)
gini <- prop_class_1*(1-prop_class_1) + prop_class_2*(1-prop_class_2)
entropy <- -prop_class_1*log(prop_class_1) - prop_class_2*log(prop_class_2) # NaN at p = 0 and p = 1 (0 * log(0)), hence the two rows dropped in the plot warning below
data.frame(prop_class_1, prop_class_2, classification_error, gini, entropy) %>%
pivot_longer(cols = c(classification_error, gini, entropy), names_to = "metric") %>%
ggplot(aes(x = prop_class_1, y = value, col = factor(metric))) +
geom_line(size = 1) +
scale_y_continuous(breaks = seq(0, 1, 0.1), minor_breaks = NULL) +
scale_color_hue(labels = c("Classification Error", "Entropy", "Gini")) +
labs(col = "Metric",
y = "Value",
x = "Proportion (of class '1')")
## Warning: Removed 2 row(s) containing missing values (geom_path).
set.seed(2, sample.kind = "Rounding")
## Warning in set.seed(2, sample.kind = "Rounding"): non-uniform 'Rounding' sampler
## used
train_index <- sample(1:nrow(Carseats), nrow(Carseats) / 2)
train <- Carseats[train_index, ]
test <- Carseats[-train_index, ]
tree_model <- tree(Sales ~ ., train)
plot(tree_model)
text(tree_model, pretty = 0, cex = 0.7)
summary(tree_model)
##
## Regression tree:
## tree(formula = Sales ~ ., data = train)
## Variables actually used in tree construction:
## [1] "ShelveLoc" "Price" "Age" "Income" "CompPrice"
## [6] "Population" "Advertising"
## Number of terminal nodes: 17
## Residual mean deviance: 2.341 = 428.4 / 183
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -3.76700 -1.00900 -0.01558 0.00000 0.94900 3.58600
test_pred <- predict(tree_model, test)
mean((test_pred - test$Sales)^2)
## [1] 4.844991
The regression tree shows that ShelveLoc and Price are the variables with the greatest influence on car seat sales. The test MSE is 4.844991.
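Since Sales is recorded in thousands of units, the RMSE is easier to interpret than the MSE; a quick check on the same predictions:
sqrt(mean((test_pred - test$Sales)^2)) # test RMSE, roughly 2.2 (thousand units)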
set.seed(3)
cv_tree_model <- cv.tree(tree_model, K = 10)
data.frame(n_leaves = cv_tree_model$size,
CV_RSS = cv_tree_model$dev) %>%
mutate(min_CV_RSS = as.numeric(min(CV_RSS) == CV_RSS)) %>%
ggplot(aes(x = n_leaves, y = CV_RSS)) +
geom_line(col = "grey55") +
geom_point(size = 2, aes(col = factor(min_CV_RSS))) +
scale_x_continuous(breaks = seq(1, 17, 2)) +
scale_y_continuous(labels = scales::comma_format()) +
scale_color_manual(values = c("deepskyblue3", "green")) +
theme(legend.position = "none") +
labs(title = "Carseats Dataset - Regression Tree",
subtitle = "Selecting the complexity parameter with cross-validation",
x = "Terminal Nodes",
y = "CV RSS")
Cross-validation selects a tree with 17 terminal nodes, which is the same size as the unpruned tree from part b, so pruning does not change the tree and the test MSE stays at 4.844991; there is no improvement.
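For completeness, if cross-validation had favoured a smaller tree, we could prune to that size with prune.tree() and re-estimate the test MSE. A minimal sketch; the choice of 10 terminal nodes is purely illustrative:
pruned_model <- prune.tree(tree_model, best = 10) # hypothetical size, for illustration only
pruned_pred <- predict(pruned_model, test)
mean((pruned_pred - test$Sales)^2) # test MSE of the pruned tree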
set.seed(1)
bagged_trees_model <- randomForest(y = train$Sales,
x = train[ ,-1],
                                   mtry = ncol(train) - 1, # all 10 predictors considered at each split = bagging
importance = T)
test_pred <- predict(bagged_trees_model, test)
mean((test_pred - test$Sales)^2)
## [1] 2.368674
importance(bagged_trees_model) %>%
as.data.frame() %>%
rownames_to_column("varname") %>%
arrange(desc(IncNodePurity))
## varname %IncMSE IncNodePurity
## 1 Price 55.76266629 493.804969
## 2 ShelveLoc 53.87451311 446.816951
## 3 CompPrice 25.47984338 173.982449
## 4 Age 12.07366106 117.502364
## 5 Advertising 13.97464644 96.929928
## 6 Income 1.72616791 71.465227
## 7 Population 1.01449985 68.297498
## 8 Education 0.08382003 37.513944
## 9 Urban -3.06299457 6.909530
## 10 US 0.14346468 5.985091
The test MSE drops to 2.368674. As in part b, Price and ShelveLoc are the most important variables; by both importance measures they stand well above the remaining predictors.
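The same importance measures can also be plotted with randomForest's varImpPlot() (plot not shown here):
varImpPlot(bagged_trees_model, main = "Bagged trees - variable importance")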
test_MSE <- numeric(10)
for (Mtry in 1:10) {
  set.seed(1)
  rf_temp <- randomForest(y = train$Sales,
                          x = train[ ,-1],
                          mtry = Mtry,
                          importance = T)
  test_pred <- predict(rf_temp, test)
  test_MSE[Mtry] <- mean((test_pred - test$Sales)^2)
}
data.frame(mtry = 1:10, test_MSE = test_MSE) %>%
mutate(min_test_MSE = as.numeric(min(test_MSE) == test_MSE)) %>%
ggplot(aes(x = mtry, y = test_MSE)) +
geom_line(col = "grey55") +
geom_point(size = 2, aes(col = factor(min_test_MSE))) +
scale_x_continuous(breaks = seq(1, 10), minor_breaks = NULL) +
scale_color_manual(values = c("deepskyblue3", "green")) +
theme(legend.position = "none") +
labs(title = "Carseats Dataset - Random Forests",
subtitle = "Selecting 'mtry' using the test MSE",
x = "mtry",
y = "Test MSE")
The random forest with the lowest test MSE uses mtry = 10, i.e. all predictors are considered at each split, which makes it equivalent to the bagged model from before; accordingly, the test MSE is the same, 2.368674.
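The best mtry can also be extracted programmatically rather than read off the plot:
which.min(test_MSE) # value of mtry with the lowest test MSE
min(test_MSE)       # the corresponding test MSE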
set.seed(5)
train_index <- sample(1:nrow(OJ), 800)
train <- OJ[train_index, ]
test <- OJ[-train_index, ]
tree_model <- tree(Purchase ~ ., train)
summary(tree_model)
##
## Classification tree:
## tree(formula = Purchase ~ ., data = train)
## Variables actually used in tree construction:
## [1] "LoyalCH" "PriceDiff" "DiscCH"
## Number of terminal nodes: 7
## Residual mean deviance: 0.7786 = 617.4 / 793
## Misclassification error rate: 0.1838 = 147 / 800
The training misclassification error rate is 0.1838, and the tree has only 7 terminal nodes. Of the 17 available predictors, only three are used in the splits: LoyalCH, PriceDiff, and DiscCH.
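As a quick check of the predictor count:
ncol(OJ) - 1 # predictors available (all columns except the response, Purchase)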
tree_model
## node), split, n, deviance, yval, (yprob)
## * denotes terminal node
##
## 1) root 800 1064.00 CH ( 0.61750 0.38250 )
## 2) LoyalCH < 0.5036 354 435.50 MM ( 0.30508 0.69492 )
## 4) LoyalCH < 0.142213 100 45.39 MM ( 0.06000 0.94000 ) *
## 5) LoyalCH > 0.142213 254 342.20 MM ( 0.40157 0.59843 )
## 10) PriceDiff < 0.235 136 153.00 MM ( 0.25000 0.75000 ) *
## 11) PriceDiff > 0.235 118 160.80 CH ( 0.57627 0.42373 ) *
## 3) LoyalCH > 0.5036 446 352.30 CH ( 0.86547 0.13453 )
## 6) LoyalCH < 0.705699 154 189.50 CH ( 0.69481 0.30519 )
## 12) PriceDiff < 0.25 85 117.50 CH ( 0.52941 0.47059 )
## 24) DiscCH < 0.15 77 106.60 MM ( 0.48052 0.51948 ) *
## 25) DiscCH > 0.15 8 0.00 CH ( 1.00000 0.00000 ) *
## 13) PriceDiff > 0.25 69 45.30 CH ( 0.89855 0.10145 ) *
## 7) LoyalCH > 0.705699 292 106.30 CH ( 0.95548 0.04452 ) *
I chose terminal node 4. Following the split at LoyalCH < 0.5036 and then the split at LoyalCH < 0.142213, node 4 contains the 100 training observations with LoyalCH below 0.142213. Within this node 94% of purchases are MM, so the node predicts MM; its deviance is 45.39.
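We can confirm the class proportions in this node by tabulating the training observations that fall in its region; from the printed node we expect roughly 94% MM:
train %>%
  filter(LoyalCH < 0.142213) %>% # region of terminal node 4
  count(Purchase)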
plot(tree_model)
text(tree_model, pretty = 0, cex = 0.7)
Starting on the left of the tree, customers with the lowest LoyalCH are predicted to purchase MM, as you would expect. Among low-loyalty customers, a PriceDiff under 0.235 still leads to an MM purchase, but a larger PriceDiff (meaning CH is relatively cheaper) tips the prediction to CH. Customers with higher LoyalCH are predicted to purchase CH, and a high PriceDiff reinforces that. Only when PriceDiff is low and DiscCH is also low does the prediction for this higher-loyalty group switch back to MM.
test_pred <- predict(tree_model, test, type = "class")
table(test_pred, test_actual = test$Purchase)
## test_actual
## test_pred CH MM
## CH 125 32
## MM 34 79
1 - mean(test_pred == test$Purchase)
## [1] 0.2444444
The test error rate is 0.2444444, i.e. 66 of the 270 test observations are misclassified.
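As a sanity check, the same rate can be recovered from the confusion matrix above:
(32 + 34) / nrow(test) # misclassified test observations / test set size (270)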
set.seed(2)
cv_tree_model <- cv.tree(tree_model, K = 10)
cv_tree_model
## $size
## [1] 7 6 5 4 3 2 1
##
## $dev
## [1] 735.2284 734.6752 735.1345 736.3625 764.8377 781.1287 1065.3358
##
## $k
## [1] -Inf 10.91298 26.64213 28.43029 47.89357 56.45519 276.68413
##
## $method
## [1] "deviance"
##
## attr(,"class")
## [1] "prune" "tree.sequence"
data.frame(size = cv_tree_model$size, CV_Deviance = cv_tree_model$dev / nrow(train)) %>%
  mutate(min_CV_Deviance = as.numeric(min(CV_Deviance) == CV_Deviance)) %>%
  ggplot(aes(x = size, y = CV_Deviance)) +
  geom_line(col = "grey55") +
  geom_point(size = 2, aes(col = factor(min_CV_Deviance))) +
  scale_x_continuous(breaks = seq(1, 7), minor_breaks = NULL) +
  scale_color_manual(values = c("deepskyblue3", "green")) +
  theme(legend.position = "none") +
  labs(title = "OJ Dataset - Classification Tree",
       subtitle = "Selecting tree 'size' (# of terminal nodes) using cross-validation",
       x = "Tree Size",
       y = "CV Deviance (per training observation)")
The tree with 6 terminal nodes has the lowest cross-validated deviance (the $method element above shows that cv.tree() used deviance, not the classification error rate, as its criterion).
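If we instead wanted cross-validation to target the misclassification error rate directly, cv.tree() accepts FUN = prune.misclass; a minimal sketch, not run here:
set.seed(2)
cv_misclass <- cv.tree(tree_model, FUN = prune.misclass, K = 10)
cv_misclass$size # candidate tree sizes
cv_misclass$dev  # number of CV misclassifications at each size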
pruned_tree_model <- prune.tree(tree_model, best = 6)
pruned_tree_model
## node), split, n, deviance, yval, (yprob)
## * denotes terminal node
##
## 1) root 800 1064.00 CH ( 0.61750 0.38250 )
## 2) LoyalCH < 0.5036 354 435.50 MM ( 0.30508 0.69492 )
## 4) LoyalCH < 0.142213 100 45.39 MM ( 0.06000 0.94000 ) *
## 5) LoyalCH > 0.142213 254 342.20 MM ( 0.40157 0.59843 )
## 10) PriceDiff < 0.235 136 153.00 MM ( 0.25000 0.75000 ) *
## 11) PriceDiff > 0.235 118 160.80 CH ( 0.57627 0.42373 ) *
## 3) LoyalCH > 0.5036 446 352.30 CH ( 0.86547 0.13453 )
## 6) LoyalCH < 0.705699 154 189.50 CH ( 0.69481 0.30519 )
## 12) PriceDiff < 0.25 85 117.50 CH ( 0.52941 0.47059 ) *
## 13) PriceDiff > 0.25 69 45.30 CH ( 0.89855 0.10145 ) *
## 7) LoyalCH > 0.705699 292 106.30 CH ( 0.95548 0.04452 ) *
mean(predict(tree_model, type = "class") != train$Purchase)
## [1] 0.18375
This is the training error rate for the unpruned tree with 7 terminal nodes.
mean(predict(pruned_tree_model, type = "class") != train$Purchase)
## [1] 0.1875
The training error rate of the pruned tree (0.1875) is slightly higher than that of the unpruned tree (0.18375), which is expected since pruning reduces how closely the tree fits the training data.
mean(predict(tree_model, type = "class", newdata = test) != test$Purchase)
## [1] 0.2444444
mean(predict(pruned_tree_model, type = "class", newdata = test) != test$Purchase)
## [1] 0.2148148
On the test data, however, the comparison reverses: the unpruned tree has a higher error rate (0.2444444) than the pruned tree (0.2148148), so pruning improves out-of-sample performance here.
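Collecting the error rates computed above into one small table makes the comparison explicit:
# Values taken from the error rates computed above
data.frame(model       = c("Unpruned (7 terminal nodes)", "Pruned (6 terminal nodes)"),
           train_error = c(0.18375, 0.18750),
           test_error  = c(0.2444444, 0.2148148))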