Load Libraries

library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.2     v dplyr   1.0.7
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(MASS) # Boston data
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
library(ISLR) # Carseats data
library(randomForest) # random forests
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(tree) # trees
## Registered S3 method overwritten by 'tree':
##   method     from
##   print.tree cli
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:randomForest':
## 
##     combine
## The following object is masked from 'package:dplyr':
## 
##     combine
library(gbm)
## Loaded gbm 2.1.8
library(gam)
## Loading required package: splines
## Loading required package: foreach
## 
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
## 
##     accumulate, when
## Loaded gam 1.20

Question 3

# Grid of possible class-1 proportions in a node (p_hat_m1), with p_hat_m2 = 1 - p_hat_m1
prop_class_1 <- seq(0, 1, 0.001)
prop_class_2 <- 1 - prop_class_1


# Classification error: 1 - max_k(p_hat_mk)
classification_error <- 1 - pmax(prop_class_1, prop_class_2)

# Gini index: sum_k p_hat_mk * (1 - p_hat_mk)
gini <- prop_class_1*(1-prop_class_1) + prop_class_2*(1-prop_class_2)

# Cross-entropy: -sum_k p_hat_mk * log(p_hat_mk)  (NaN at p = 0 or 1)
entropy <- -prop_class_1*log(prop_class_1) - prop_class_2*log(prop_class_2)


data.frame(prop_class_1, prop_class_2, classification_error, gini, entropy) %>%
  pivot_longer(cols = c(classification_error, gini, entropy), names_to = "metric") %>%
  ggplot(aes(x = prop_class_1, y = value, col = factor(metric))) + 
  geom_line(size = 1) + 
  scale_y_continuous(breaks = seq(0, 1, 0.1), minor_breaks = NULL) + 
  scale_color_hue(labels = c("Classification Error", "Entropy", "Gini")) +
  labs(col = "Metric", 
       y = "Value", 
       x = "Proportion (of class '1')")
## Warning: Removed 2 row(s) containing missing values (geom_path).
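
The warning comes from the entropy curve: at the endpoints, 0 * log(0) evaluates to NaN in R, so two points are dropped. A minimal sketch of one way around this, adopting the usual convention that 0 * log(0) = 0 (the helper name safe_plogp is my own):

# Treat p * log(p) as 0 when p = 0 so the endpoints plot cleanly
safe_plogp <- function(p) ifelse(p == 0, 0, p * log(p))
entropy_safe <- -safe_plogp(prop_class_1) - safe_plogp(prop_class_2)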

Question 8a

set.seed(2, sample.kind = "Rounding")
## Warning in set.seed(2, sample.kind = "Rounding"): non-uniform 'Rounding' sampler
## used
train_index <- sample(1:nrow(Carseats), nrow(Carseats) / 2)
train <- Carseats[train_index, ]
test <- Carseats[-train_index, ]

Question 8b

tree_model <- tree(Sales ~ ., train)
plot(tree_model)
text(tree_model, pretty = 0, cex = 0.7)

summary(tree_model)
## 
## Regression tree:
## tree(formula = Sales ~ ., data = train)
## Variables actually used in tree construction:
## [1] "ShelveLoc"   "Price"       "Age"         "Income"      "CompPrice"  
## [6] "Population"  "Advertising"
## Number of terminal nodes:  17 
## Residual mean deviance:  2.341 = 428.4 / 183 
## Distribution of residuals:
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -3.76700 -1.00900 -0.01558  0.00000  0.94900  3.58600
test_pred <- predict(tree_model, test)
mean((test_pred - test$Sales)^2)
## [1] 4.844991

The regression tree indicates that ShelveLoc and Price are the strongest drivers of carseat sales, appearing in the top splits. The test MSE is 4.844991.
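
Since Sales is recorded in thousands of units, the square root of the test MSE puts the error back on that scale; a quick check:

# RMSE in the original units (thousands of carseats sold)
sqrt(mean((test_pred - test$Sales)^2))

This works out to roughly 2.2, i.e. a typical prediction error of about 2.2 thousand sales.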

Question 8c

set.seed(3)
cv_tree_model <- cv.tree(tree_model, K = 10)
data.frame(n_leaves = cv_tree_model$size,
           CV_RSS = cv_tree_model$dev) %>%
  mutate(min_CV_RSS = as.numeric(min(CV_RSS) == CV_RSS)) %>%
  ggplot(aes(x = n_leaves, y = CV_RSS)) +
  geom_line(col = "grey55") +
  geom_point(size = 2, aes(col = factor(min_CV_RSS))) +
  scale_x_continuous(breaks = seq(1, 17, 2)) +
  scale_y_continuous(labels = scales::comma_format()) +
  scale_color_manual(values = c("deepskyblue3", "green")) +
  theme(legend.position = "none") +
  labs(title = "Carseats Dataset - Regression Tree",
       subtitle = "Selecting the complexity parameter with cross-validation",
       x = "Terminal Nodes",
       y = "CV RSS")

Cross-validation selects a tree with 17 terminal nodes, the same size as the unpruned tree from part (b), so pruning leaves the tree unchanged and the test MSE remains 4.844991; in this case pruning does not improve the fit.
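
Had cross-validation favoured a smaller tree, the natural follow-up would be to prune to that size and re-compute the test MSE. A sketch, with best = 11 chosen purely for illustration:

# Prune the regression tree to a hypothetical smaller size and re-evaluate
pruned_model <- prune.tree(tree_model, best = 11)  # 11 is an arbitrary illustrative size
pruned_pred  <- predict(pruned_model, test)
mean((pruned_pred - test$Sales)^2)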

Question 8d

set.seed(1)
# Bagging = a random forest that considers all p = 10 predictors at every split
bagged_trees_model <- randomForest(y = train$Sales, 
                                   x = train[ ,-1],          # drop Sales (column 1), keep the 10 predictors
                                   mtry = ncol(train) - 1,   # mtry = p, i.e. bagging
                                   importance = T) 
test_pred <- predict(bagged_trees_model, test)
mean((test_pred - test$Sales)^2)
## [1] 2.368674
importance(bagged_trees_model) %>%
  as.data.frame() %>%
  rownames_to_column("varname") %>%
  arrange(desc(IncNodePurity))
##        varname     %IncMSE IncNodePurity
## 1        Price 55.76266629    493.804969
## 2    ShelveLoc 53.87451311    446.816951
## 3    CompPrice 25.47984338    173.982449
## 4          Age 12.07366106    117.502364
## 5  Advertising 13.97464644     96.929928
## 6       Income  1.72616791     71.465227
## 7   Population  1.01449985     68.297498
## 8    Education  0.08382003     37.513944
## 9        Urban -3.06299457      6.909530
## 10          US  0.14346468      5.985091

The test MSE drops to 2.368674. As in part (b), Price and ShelveLoc are the most important variables, and by a wide margin over the remaining predictors on both importance measures.
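
randomForest also ships a built-in dot chart of the same importance measures, which makes the gap between the top two variables and the rest easy to see:

# %IncMSE and IncNodePurity for the bagged model, sorted
varImpPlot(bagged_trees_model, main = "Bagged Trees - Variable Importance")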

Question 8e

# Fit a random forest for each candidate mtry (1 to 10) and record its test MSE
test_MSE <- numeric(10)

for (Mtry in 1:10) {
  set.seed(1)
  
  rf_temp <- randomForest(y = train$Sales, 
                          x = train[ ,-1], 
                          mtry = Mtry, 
                          importance = T)
  
  test_pred <- predict(rf_temp, test)
  
  test_MSE[Mtry] <- mean((test_pred - test$Sales)^2)
}

data.frame(mtry = 1:10, test_MSE = test_MSE) %>%
  mutate(min_test_MSE = as.numeric(min(test_MSE) == test_MSE)) %>%
  ggplot(aes(x = mtry, y = test_MSE)) +
  geom_line(col = "grey55") +
  geom_point(size = 2, aes(col = factor(min_test_MSE))) +
  scale_x_continuous(breaks = seq(1, 10), minor_breaks = NULL) +
  scale_color_manual(values = c("deepskyblue3", "green")) +
  theme(legend.position = "none") +
  labs(title = "Carseats Dataset - Random Forests",
       subtitle = "Selecting 'mtry' using the test MSE",
       x = "mtry",
       y = "Test MSE")

Selecting mtry by test MSE, the minimum occurs at mtry = 10, i.e. the best random forest here is exactly the bagged model from part (d) (bagging is a random forest with mtry = p). The test MSE is therefore the same, 2.368674.
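
Choosing mtry on the test set uses the test data for model selection; an alternative is the out-of-bag error that randomForest computes for free. A sketch of the same search driven by OOB MSE (for a regression forest, rf$mse holds the OOB MSE after each tree):

# Select mtry by out-of-bag MSE instead of test MSE
oob_MSE <- sapply(1:10, function(m) {
  set.seed(1)
  rf <- randomForest(y = train$Sales, x = train[ , -1], mtry = m)
  tail(rf$mse, 1)  # OOB MSE of the full forest (after the last tree)
})
which.min(oob_MSE)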

Question 9a

set.seed(5)
train_index <- sample(1:nrow(OJ), 800)
train <- OJ[train_index, ]
test <- OJ[-train_index, ]

Question 9b

tree_model <- tree(Purchase ~ ., train)
summary(tree_model)
## 
## Classification tree:
## tree(formula = Purchase ~ ., data = train)
## Variables actually used in tree construction:
## [1] "LoyalCH"   "PriceDiff" "DiscCH"   
## Number of terminal nodes:  7 
## Residual mean deviance:  0.7786 = 617.4 / 793 
## Misclassification error rate: 0.1838 = 147 / 800

The training misclassification error rate is 0.1838 and the tree has 7 terminal nodes. Of the 17 predictors, only 3 are actually used in the splits: LoyalCH, PriceDiff, and DiscCH.

Question 9c

tree_model
## node), split, n, deviance, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 800 1064.00 CH ( 0.61750 0.38250 )  
##    2) LoyalCH < 0.5036 354  435.50 MM ( 0.30508 0.69492 )  
##      4) LoyalCH < 0.142213 100   45.39 MM ( 0.06000 0.94000 ) *
##      5) LoyalCH > 0.142213 254  342.20 MM ( 0.40157 0.59843 )  
##       10) PriceDiff < 0.235 136  153.00 MM ( 0.25000 0.75000 ) *
##       11) PriceDiff > 0.235 118  160.80 CH ( 0.57627 0.42373 ) *
##    3) LoyalCH > 0.5036 446  352.30 CH ( 0.86547 0.13453 )  
##      6) LoyalCH < 0.705699 154  189.50 CH ( 0.69481 0.30519 )  
##       12) PriceDiff < 0.25 85  117.50 CH ( 0.52941 0.47059 )  
##         24) DiscCH < 0.15 77  106.60 MM ( 0.48052 0.51948 ) *
##         25) DiscCH > 0.15 8    0.00 CH ( 1.00000 0.00000 ) *
##       13) PriceDiff > 0.25 69   45.30 CH ( 0.89855 0.10145 ) *
##      7) LoyalCH > 0.705699 292  106.30 CH ( 0.95548 0.04452 ) *

I chose terminal node 4 (marked with a * above). It contains the 100 training observations with LoyalCH < 0.142213 (inside the LoyalCH < 0.5036 branch from the root), has a deviance of 45.39, and predicts Purchase = MM: 94% of the observations in this node bought MM and only 6% bought CH.
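
The figures for terminal node 4 can be verified directly from the training data, using the (rounded) split value printed above:

# Observations falling in terminal node 4: LoyalCH < 0.142213
train %>%
  filter(LoyalCH < 0.142213) %>%
  count(Purchase)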

Question 9d

plot(tree_model)
text(tree_model, pretty = 0, cex = 0.7)

The first split is on LoyalCH. On the left branch, customers with the lowest loyalty to CH (LoyalCH < 0.14) are predicted to purchase MM, as expected. For the remaining customers on that branch (LoyalCH between 0.14 and 0.50), a PriceDiff below 0.235 still leads to an MM prediction, but when PriceDiff is large (meaning CH is relatively cheaper than MM) the prediction switches to CH. On the right branch, customers with higher LoyalCH are generally predicted to purchase CH; the main exception is when PriceDiff is low and DiscCH is also low, in which case MM is predicted.

Question 9e

test_pred <- predict(tree_model, test, type = "class")
table(test_pred, test_actual = test$Purchase)
##          test_actual
## test_pred  CH  MM
##        CH 125  32
##        MM  34  79
1 - mean(test_pred == test$Purchase)
## [1] 0.2444444

The test error rate is 0.2444444 (about 24.4%).
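
Since caret is already loaded, confusionMatrix() reports the same table along with accuracy, sensitivity, and specificity in one call (CH, the first factor level, is treated as the positive class):

# Fuller summary of the same test-set predictions
confusionMatrix(data = test_pred, reference = test$Purchase)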

Question 9f

set.seed(2)
cv_tree_model <- cv.tree(tree_model, K = 10)
cv_tree_model
## $size
## [1] 7 6 5 4 3 2 1
## 
## $dev
## [1]  735.2284  734.6752  735.1345  736.3625  764.8377  781.1287 1065.3358
## 
## $k
## [1]      -Inf  10.91298  26.64213  28.43029  47.89357  56.45519 276.68413
## 
## $method
## [1] "deviance"
## 
## attr(,"class")
## [1] "prune"         "tree.sequence"

Question 9g

data.frame(size = cv_tree_model$size, CV_Deviance = cv_tree_model$dev / nrow(train)) %>%
  mutate(min_CV_Deviance = as.numeric(min(CV_Deviance) == CV_Deviance)) %>%
  ggplot(aes(x = size, y = CV_Deviance)) +
  geom_line(col = "grey55") +
  geom_point(size = 2, aes(col = factor(min_CV_Deviance))) +
  scale_x_continuous(breaks = seq(1, 7), minor_breaks = NULL) +
  scale_color_manual(values = c("deepskyblue3", "green")) +
  theme(legend.position = "none") +
  labs(title = "OJ Dataset - Classification Tree",
       subtitle = "Selecting tree 'size' (# of terminal nodes) using cross-validation",
       x = "Tree Size",
       y = "CV Deviance (per observation)")

Question 9h

A tree with 6 terminal nodes has the lowest cross-validated deviance (734.68), marginally better than the full 7-node tree.
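
Note that cv.tree() was run with its default, which cross-validates the deviance. To cross-validate the misclassification rate itself, the pruning function can be switched; a sketch:

set.seed(2)
cv_misclass <- cv.tree(tree_model, FUN = prune.misclass, K = 10)
cv_misclass$size[which.min(cv_misclass$dev)]  # here $dev counts CV misclassifications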

Question 9i

pruned_tree_model <- prune.tree(tree_model, best = 6)
pruned_tree_model
## node), split, n, deviance, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 800 1064.00 CH ( 0.61750 0.38250 )  
##    2) LoyalCH < 0.5036 354  435.50 MM ( 0.30508 0.69492 )  
##      4) LoyalCH < 0.142213 100   45.39 MM ( 0.06000 0.94000 ) *
##      5) LoyalCH > 0.142213 254  342.20 MM ( 0.40157 0.59843 )  
##       10) PriceDiff < 0.235 136  153.00 MM ( 0.25000 0.75000 ) *
##       11) PriceDiff > 0.235 118  160.80 CH ( 0.57627 0.42373 ) *
##    3) LoyalCH > 0.5036 446  352.30 CH ( 0.86547 0.13453 )  
##      6) LoyalCH < 0.705699 154  189.50 CH ( 0.69481 0.30519 )  
##       12) PriceDiff < 0.25 85  117.50 CH ( 0.52941 0.47059 ) *
##       13) PriceDiff > 0.25 69   45.30 CH ( 0.89855 0.10145 ) *
##      7) LoyalCH > 0.705699 292  106.30 CH ( 0.95548 0.04452 ) *

Question 9j

mean(predict(tree_model, type = "class") != train$Purchase)
## [1] 0.18375

This is the training error rate for the unpruned tree (7 terminal nodes).

mean(predict(pruned_tree_model, type = "class") != train$Purchase)
## [1] 0.1875

The training error rate of the pruned tree (0.1875) is slightly higher than that of the unpruned tree (0.18375), which is expected: pruning sacrifices a little training fit for a simpler model.

Question 9k

mean(predict(tree_model, type = "class", newdata = test) != test$Purchase)
## [1] 0.2444444
mean(predict(pruned_tree_model, type = "class", newdata = test) != test$Purchase)
## [1] 0.2148148

On the test data the ordering reverses: the unpruned tree has a higher error rate (0.2444444) than the pruned tree (0.2148148), so the pruned tree generalises better.
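
Pulling the four error rates together (all values taken from the outputs above):

# Training vs test error for the unpruned (7-node) and pruned (6-node) trees
data.frame(tree        = c("Unpruned (7 nodes)", "Pruned (6 nodes)"),
           train_error = c(0.18375, 0.1875),
           test_error  = c(0.2444444, 0.2148148))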