library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.1     ✔ purrr   1.0.1
## ✔ tibble  3.1.8     ✔ dplyr   1.1.0
## ✔ tidyr   1.3.0     ✔ stringr 1.5.0
## ✔ readr   2.1.4     ✔ forcats 1.0.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(ISLR)
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
library(tree)
## Warning: package 'tree' was built under R version 4.2.3
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.2.3
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
## 
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(BART)
## Warning: package 'BART' was built under R version 4.2.3
## Loading required package: nlme
## 
## Attaching package: 'nlme'
## 
## The following object is masked from 'package:dplyr':
## 
##     collapse
## 
## Loading required package: nnet
## Loading required package: survival
## 
## Attaching package: 'survival'
## 
## The following object is masked from 'package:caret':
## 
##     cluster

Question 3

  1. Consider the Gini index, classification error, and entropy in a simple classification setting with two classes. Create a single plot that displays each of these quantities as a function of ˆpm1. The x-axis should display ˆpm1, ranging from 0 to 1, and the y-axis should display the value of the Gini index, classification error, and entropy.
prop1 <- seq(0,1,.01)
prop2 <- 1 - prop1

classerror <- 1 - pmax(prop1, prop2)
gini <- prop1 * (1-prop1) + prop2 * (1-prop2)
entropy <- -prop1 * log(prop1) - prop2 * log(prop2)

data.frame(prop1, prop2, classerror, gini, entropy) %>%
  pivot_longer(cols = c(classerror,gini, entropy), names_to = "metric") %>%
  ggplot(aes(prop1, value, col = factor(metric))) +
  geom_line() +
  scale_color_hue(labels = c("Classification Error", "Entropy", "Gini")) +
  labs(col = "Metric", x = "proportion", y = "value")
## Warning: Removed 2 rows containing missing values (`geom_line()`).

Question 8

  1. In the lab, a classification tree was applied to the Carseats data set after converting Sales into a qualitative response variable. Now we will seek to predict Sales using regression trees and related approaches, treating the response as a quantitative variable.
  1. Split the data set into a training set and a test set.
set.seed(1)
car.index <- createDataPartition(Carseats$Sales, p = .75, list = FALSE, times = 1)

carTrain <- Carseats[car.index,]
carTest <- Carseats[-car.index,]
  1. Fit a regression tree to the training set. Plot the tree, and interpret the results. What test MSE do you obtain?
car.tree <- tree(Sales~., data = carTrain)

plot(car.tree)
text(car.tree, pretty = 0, cex = .5)

summary(car.tree)
## 
## Regression tree:
## tree(formula = Sales ~ ., data = carTrain)
## Variables actually used in tree construction:
## [1] "ShelveLoc"   "Price"       "Income"      "Age"         "Advertising"
## [6] "CompPrice"  
## Number of terminal nodes:  16 
## Residual mean deviance:  2.295 = 654 / 285 
## Distribution of residuals:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -3.5490 -1.0160 -0.1187  0.0000  0.9648  3.5840
car.pred.t <- predict(car.tree, carTest)
mean((car.pred.t-carTest$Sales)^2)
## [1] 4.935788

We get an MSE if 4.935, which is pretty good.

  1. Use cross-validation in order to determine the optimal level of tree complexity. Does pruning the tree improve the test MSE?
set.seed(1)
cv.cars <- cv.tree(car.tree)
plot(cv.cars$size, cv.cars$dev, type = 'b')

It appears that pruning our model down to 12 terminal nodes would lower the deviance the most.

prune.carseats <- prune.tree(car.tree, best = 12)
plot(prune.carseats)
text(prune.carseats, pretty = 0, cex = .5)

prune.pred <- predict(prune.carseats, carTest)
mean((prune.pred-carTest$Sales)^2)
## [1] 4.970822

This raised the MSE by .04.

  1. Use the bagging approach in order to analyze this data. What test MSE do you obtain? Use the importance() function to determine which variables are most important.
car.bag <- randomForest(Sales~., data = carTrain, mtry =10, ntree = 500, importance = TRUE)
bag.preds <- predict(car.bag, carTest)
mean((bag.preds - carTest$Sales)^2)
## [1] 3.179889
importance(car.bag) %>%
  as.data.frame() %>%
  rownames_to_column('variable') %>%
  arrange(desc(IncNodePurity))
##       variable    %IncMSE IncNodePurity
## 1        Price 68.2206989     743.44922
## 2    ShelveLoc 80.1387119     694.76788
## 3    CompPrice 29.7704182     215.53404
## 4  Advertising 24.7563156     185.71361
## 5          Age 20.3956172     181.77479
## 6       Income  8.8936255     104.74986
## 7   Population  0.9713052      70.37262
## 8    Education  2.8936326      61.95442
## 9           US  3.9202545      14.31920
## 10       Urban -0.7213293      10.73275

It appears that Price and ShelveLoc are the two most important variables.

  1. Use random forests to analyze this data. What test MSE do you obtain? Use the importance() function to determine which variables are most important. Describe the effect of m, the number of variables considered at each split, on the error rate obtained.
car.rf <- randomForest(Sales~., data = carTrain, mtry = 3, ntree = 500, importance = TRUE)
rf.preds <- predict(car.rf, carTest)
mean((rf.preds - carTest$Sales)^2)
## [1] 3.639842
importance(car.rf) %>%
  as.data.frame() %>%
  rownames_to_column('variable') %>%
  arrange(desc(IncNodePurity))
##       variable   %IncMSE IncNodePurity
## 1        Price 46.405424     571.77646
## 2    ShelveLoc 48.426476     552.90732
## 3          Age 16.616958     229.12277
## 4    CompPrice 13.685658     200.89817
## 5  Advertising 18.826769     200.14971
## 6       Income  2.149908     157.18510
## 7   Population -1.204152     134.97045
## 8    Education  1.874559     106.63373
## 9           US  5.656612      33.74457
## 10       Urban -2.098901      22.43028

Again, Price and ShelveLoc are the two most important variables. Increasing mtry reduces the test error, and we can see an reduction in test MSE as mtry approaches 10.

  1. Now analyze the data using BART, and report your results.
set.seed(1)

train <- sample(1:nrow(Carseats), nrow(Carseats) /2)

x <- Carseats[, 2:11]
y <- Carseats[,  'Sales']
xtrain <- x[train, ]
ytrain <- y[train]

xtest <- x[-train,]
ytest <-y[-train]

car.bart <- gbart(xtrain, ytrain, x.test = xtest)
## *****Calling gbart: type=1
## *****Data:
## data:n,p,np: 200, 14, 200
## y1,yn: 2.781850, 1.091850
## x1,x[n*p]: 107.000000, 1.000000
## xp1,xp[np*p]: 111.000000, 1.000000
## *****Number of Trees: 200
## *****Number of Cut Points: 63 ... 1
## *****burn,nd,thin: 100,1000,1
## *****Prior:beta,alpha,tau,nu,lambda,offset: 2,0.95,0.273474,3,0.23074,7.57815
## *****sigma: 1.088371
## *****w (weights): 1.000000 ... 1.000000
## *****Dirichlet:sparse,theta,omega,a,b,rho,augment: 0,0,1,0.5,1,14,0
## *****printevery: 100
## 
## MCMC
## done 0 (out of 1100)
## done 100 (out of 1100)
## done 200 (out of 1100)
## done 300 (out of 1100)
## done 400 (out of 1100)
## done 500 (out of 1100)
## done 600 (out of 1100)
## done 700 (out of 1100)
## done 800 (out of 1100)
## done 900 (out of 1100)
## done 1000 (out of 1100)
## time: 3s
## trcnt,tecnt: 1000,1000
bart.preds <- car.bart$yhat.test.mean
mean((ytest - bart.preds)^2)
## [1] 1.499118

Question 9

  1. This problem involves the OJ data set which is part of the ISLR2 package.
  1. Create a training set containing a random sample of 800 observations, and a test set containing the remaining observations.
OJ.index <- createDataPartition(OJ$Purchase, p = .75, list = FALSE, times = 1)

OJtrain <- OJ[OJ.index,]
OJtest <- OJ[-OJ.index,]
  1. Fit a tree to the training data, with Purchase as the response and the other variables as predictors. Use the summary() function to produce summary statistics about the tree, and describe the results obtained. What is the training error rate? How many terminal nodes does the tree have?
oj.tree <- tree(Purchase~., data = OJtrain)
summary(oj.tree)
## 
## Classification tree:
## tree(formula = Purchase ~ ., data = OJtrain)
## Variables actually used in tree construction:
## [1] "LoyalCH"       "PriceDiff"     "ListPriceDiff" "DiscMM"       
## Number of terminal nodes:  7 
## Residual mean deviance:  0.7389 = 588.2 / 796 
## Misclassification error rate: 0.1569 = 126 / 803

There are 7 terminal nodes and a missclassification rate of around 15%.

  1. Type in the name of the tree object in order to get a detailed text output. Pick one of the terminal nodes, and interpret the information displayed.
oj.tree
## node), split, n, deviance, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 803 1074.00 CH ( 0.61021 0.38979 )  
##    2) LoyalCH < 0.48285 290  306.10 MM ( 0.22069 0.77931 )  
##      4) LoyalCH < 0.280875 167  118.30 MM ( 0.11377 0.88623 ) *
##      5) LoyalCH > 0.280875 123  161.60 MM ( 0.36585 0.63415 )  
##       10) PriceDiff < -0.24 13    0.00 MM ( 0.00000 1.00000 ) *
##       11) PriceDiff > -0.24 110  148.80 MM ( 0.40909 0.59091 ) *
##    3) LoyalCH > 0.48285 513  467.10 CH ( 0.83041 0.16959 )  
##      6) LoyalCH < 0.764572 237  298.90 CH ( 0.67511 0.32489 )  
##       12) ListPriceDiff < 0.235 95  129.30 MM ( 0.42105 0.57895 )  
##         24) DiscMM < 0.15 56   75.84 CH ( 0.58929 0.41071 ) *
##         25) DiscMM > 0.15 39   36.71 MM ( 0.17949 0.82051 ) *
##       13) ListPriceDiff > 0.235 142  122.50 CH ( 0.84507 0.15493 ) *
##      7) LoyalCH > 0.764572 276   85.99 CH ( 0.96377 0.03623 ) *

I chose 4) to be my terminal node. This is a set of observations where LoyalCH is less than .280875. There are 167 observations with a deviance 118.3. This branch overall predicts for MM, based off of a 11-89 split.

  1. Create a plot of the tree, and interpret the results.
plot(oj.tree)
text(oj.tree, pretty = 0, cex = .8)

It seems that generally if LoyaltyCH is less than .48, it is MM, but otherwise it is CH with one node exception.

  1. Predict the response on the test data, and produce a confusion matrix comparing the test labels to the predicted test labels. What is the test error rate?
OJ.preds <- predict(oj.tree, OJtest, type = 'class')
table(OJ.preds, OJtest$Purchase)
##         
## OJ.preds  CH  MM
##       CH 125  16
##       MM  38  88
(38+16)/270
## [1] 0.2

We have a 20% test error rate.

  1. Apply the cv.tree() function to the training set in order to determine the optimal tree size.
oj.cv <- cv.tree(oj.tree, FUN = prune.misclass)
oj.cv
## $size
## [1] 7 5 2 1
## 
## $dev
## [1] 151 151 169 313
## 
## $k
## [1]       -Inf   0.000000   8.333333 162.000000
## 
## $method
## [1] "misclass"
## 
## attr(,"class")
## [1] "prune"         "tree.sequence"
  1. Produce a plot with tree size on the x-axis and cross-validated classification error rate on the y-axis.
plot(oj.cv$size, oj.cv$dev, type = 'b')

  1. Which tree size corresponds to the lowest cross-validated classification error rate?

It looks like a 5 node tree is our best bet.

  1. Produce a pruned tree corresponding to the optimal tree size obtained using cross-validation. If cross-validation does not lead to selection of a pruned tree, then create a pruned tree with five terminal nodes.
oj.prune <- prune.misclass(oj.tree, best = 5)

plot(oj.prune)
text(oj.prune, pretty = 0, cex = .7)

  1. Compare the training error rates between the pruned and unpruned trees. Which is higher?
summary(oj.tree)
## 
## Classification tree:
## tree(formula = Purchase ~ ., data = OJtrain)
## Variables actually used in tree construction:
## [1] "LoyalCH"       "PriceDiff"     "ListPriceDiff" "DiscMM"       
## Number of terminal nodes:  7 
## Residual mean deviance:  0.7389 = 588.2 / 796 
## Misclassification error rate: 0.1569 = 126 / 803
summary(oj.prune)
## 
## Classification tree:
## snip.tree(tree = oj.tree, nodes = 2L)
## Variables actually used in tree construction:
## [1] "LoyalCH"       "ListPriceDiff" "DiscMM"       
## Number of terminal nodes:  5 
## Residual mean deviance:  0.7858 = 627.1 / 798 
## Misclassification error rate: 0.1569 = 126 / 803

It appears that the pruned tree and the original tree have the same misclassification error rate.

  1. Compare the test error rates between the pruned and unpruned trees. Which is higher?
prune.preds <- predict(oj.prune, OJtest, type = 'class')
table(prune.preds, OJtest$Purchase)
##            
## prune.preds  CH  MM
##          CH 125  16
##          MM  38  88
(38+16)/270
## [1] 0.2

They again have the same test missclassification rate.