Assignment 7

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

# Load Libraries
library(ISLR2)

## Warning: package 'ISLR2' was built under R version 4.3.3

library(tree)

## Warning: package 'tree' was built under R version 4.3.3

library(randomForest)

## Warning: package 'randomForest' was built under R version 4.3.3

## randomForest 4.7-1.2

## Type rfNews() to see new features/changes/bug fixes.

library(BART)

## Warning: package 'BART' was built under R version 4.3.3

## Loading required package: nlme

## Loading required package: survival

# Problem 3. Consider the gini_index index, classification error, and entropy in a
# simple classification setting with two classes.
# Create a single plot that displays each of these quantities as a function of
# p^m1. The x axis should display p^m1,
# ranging from 0 to 1, and the y-axis should display the value of the gini_index
# index classification error, and entropy. Hint: In a setting with two classes,
# pˆm1 = 1 − pˆm2. You could make this plot by hand, but it will be much easier
# to make in R.



p=seq(0,1,0.01)

gini_index= 2 * p *(1-p)
class_error= 1 - pmax(p,1-p)
crossentropy= -(p*log(p)+(1-p)*log(1-p))

plot(NA,NA,xlim=c(0,1),ylim=c(0,1),xlab='pm1',ylab='values from 0 to 1')

lines(p,gini_index,col = 'red')
lines(p,class_error,col='blue')
lines(p,crossentropy,col='green')

legend(x='top',legend=c('gini','class error','cross entropy'),
       col=c('red','blue','green'),lty=1,text.width = 0.25)

# Problem 8 In the lab, a classification tree was applied to the Carseats data
# set after converting Sales into a qualitative response variable. Now we will
# seek to predict Sales using regression trees and related approaches,
# treating the response as a quantitative variable.

str(Carseats)

## 'data.frame':    400 obs. of  11 variables:
##  $ Sales      : num  9.5 11.22 10.06 7.4 4.15 ...
##  $ CompPrice  : num  138 111 113 117 141 124 115 136 132 132 ...
##  $ Income     : num  73 48 35 100 64 113 105 81 110 113 ...
##  $ Advertising: num  11 16 10 4 3 13 0 15 0 0 ...
##  $ Population : num  276 260 269 466 340 501 45 425 108 131 ...
##  $ Price      : num  120 83 80 97 128 72 108 120 124 124 ...
##  $ ShelveLoc  : Factor w/ 3 levels "Bad","Good","Medium": 1 2 3 3 1 1 3 2 3 3 ...
##  $ Age        : num  42 65 59 55 38 78 71 67 76 76 ...
##  $ Education  : num  17 10 12 14 13 16 15 10 10 17 ...
##  $ Urban      : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 1 2 2 1 1 ...
##  $ US         : Factor w/ 2 levels "No","Yes": 2 2 2 2 1 2 1 2 1 2 ...

# Part a) Split the data set into a training set and a test set.

set.seed(7)
train = sample(1:nrow(Carseats), nrow(Carseats)/2)
Carseats_train = Carseats[train,]
Carseats_test = Carseats[-train,]

# b) Fit a regression tree to the training set. Plot the tree, and interpret
# the results. What test MSE do you obtain?


tree_Carseats = tree(Sales~. , data = Carseats_train)
plot(tree_Carseats)
text(tree_Carseats)

# Calculate MSE
tree_pred = predict(tree_Carseats, Carseats_test)
test_mse = mean((tree_pred-Carseats_test$Sales)^2)
test_mse

## [1] 4.765435

# From the results generated the regression tree shows that the variables
# Shelve Location and Price are the first two important predictors as they
# are the core components of the tree.

# c) Use cross-validation in order to determine the optimal level of
# tree complexity. Does pruning the tree improve the test MSE?
set.seed(7)
cv_Carseats = cv.tree(tree_Carseats)
plot(cv_Carseats$size, cv_Carseats$dev, xlab = "Number of terminal nodes", ylab = "Cross validation error", type = "b")

prune_Carseats = prune.tree(tree_Carseats, best = 6)
tree_pred = predict(prune_Carseats, Carseats_test)
test_mse = mean((tree_pred-Carseats_test$Sales)^2)
test_mse

## [1] 4.448592

# In the graph there the last significant decrease in cross validation error
# is at 6 nodes. Compared to other possible nodes to prune at, pruning at 
# node 6 causes the test MSE to have the greatest decrease to a value of 
# 4.448592.

# Part d) Use the bagging approach in order to analyze this data. What
# test MSE do you obtain? Use the importance() function to determine
# which variables are most important.

set.seed(7)
bag_Carseats = randomForest(Carseats_train$Sales~., 
                            data = Carseats_train, mtry = 10, importance = TRUE)

bag_Carseats

## 
## Call:
##  randomForest(formula = Carseats_train$Sales ~ ., data = Carseats_train,      mtry = 10, importance = TRUE) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 10
## 
##           Mean of squared residuals: 2.945681
##                     % Var explained: 67.23

bagging_pred = predict(bag_Carseats, Carseats_test)
test_mse = mean((bagging_pred-Carseats_test$Sales)^2)
test_mse

## [1] 2.352134

importance(bag_Carseats)

##                %IncMSE IncNodePurity
## CompPrice   17.6664100    148.289756
## Income       0.5061627     78.178160
## Advertising 16.5454276    127.177114
## Population   0.2094257     63.036489
## Price       52.0759302    513.899493
## ShelveLoc   56.7454975    603.105977
## Age         18.1370219    167.149176
## Education    0.6632503     38.960150
## Urban       -0.8142933      4.599324
## US           1.9049544      7.827738

# The test mse is 2.352134 and the variables that are most important are 
# ShelveLoc and Price

# Part e) Use random forests to analyze this data. What test MSE do you
# obtain? Use the importance() function to determine which variables
# are most important. Describe the effect of m, the number of
# variables considered at each split, on the error rate
# obtained.

set.seed(7)
randfore_Carseats = randomForest(Carseats_train$Sales~.,data = Carseats_train, importance=TRUE)
randfore_Carseats

## 
## Call:
##  randomForest(formula = Carseats_train$Sales ~ ., data = Carseats_train,      importance = TRUE) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 3
## 
##           Mean of squared residuals: 3.299988
##                     % Var explained: 63.29

# now get the test MSE #
yhat_rf = predict(randfore_Carseats, newdata = Carseats_test)
randf_mse = mean((yhat_rf - Carseats_test$Sales)^2)
randf_mse

## [1] 2.770891

importance(randfore_Carseats)

##                %IncMSE IncNodePurity
## CompPrice    7.8250023     136.13136
## Income       0.6547144     105.42039
## Advertising 13.2914493     150.93588
## Population   1.5889887     113.39707
## Price       37.7823496     424.67020
## ShelveLoc   39.7564258     417.07473
## Age         17.2844151     221.63789
## Education    2.9320469      83.08323
## Urban        0.4656180      11.91653
## US           2.0176005      22.60699

set.seed(7)
rf_Carseats = randomForest(Carseats_train$Sales~.,
                           data = Carseats_train,mtry = 2, importance=TRUE)
yhat_rf = predict(rf_Carseats, newdata = Carseats_test)
rf_mse2 =  mean((yhat_rf-Carseats_test$Sales)^2)
rf_mse2

## [1] 3.136767

set.seed(7)
rf_Carseats = randomForest(Carseats_train$Sales~.,
                           data = Carseats_train,mtry = 6, importance=TRUE)
yhat_rf = predict(rf_Carseats, newdata = Carseats_test)
rf_mse3 = mean((yhat_rf-Carseats_test$Sales)^2)
rf_mse3

## [1] 2.425748

# With the random forest using 3 predictors, the MSE obtained is
# 2.770891 which is higher compared to the bagging approach. 
# When adjusting the value of m, it can be seen that as m increase the 
# test mean square error decreases.

# Part f)Now analyze the data using BART, and report your results.

test = Carseats[-train,"Sales"]
x = Carseats[,1:10]
y = Carseats[,"Sales"]
xtrain = x[train, ]
ytrain = y[train]
xtest = x[test, ]
ytest = y[test]

set.seed(7)
bart_fit = gbart(xtrain, ytrain, x.test=xtest)

## Warning in summary.lm(lm(y.train ~ ., data.frame(t(x.train), y.train))):
## essentially perfect fit: summary may be unreliable

## *****Calling gbart: type=1
## *****Data:
## data:n,p,np: 200, 13, 200
## y1,yn: -4.467850, 1.162150
## x1,x[n*p]: 3.070000, 1.000000
## xp1,xp[np*p]: 6.630000, 0.000000
## *****Number of Trees: 200
## *****Number of Cut Points: 100 ... 1
## *****burn,nd,thin: 100,1000,1
## *****Prior:beta,alpha,tau,nu,lambda,offset: 2,0.95,0.287616,3,5.10935e-31,7.53785
## *****sigma: 0.000000
## *****w (weights): 1.000000 ... 1.000000
## *****Dirichlet:sparse,theta,omega,a,b,rho,augment: 0,0,1,0.5,1,13,0
## *****printevery: 100
## 
## MCMC
## done 0 (out of 1100)
## done 100 (out of 1100)
## done 200 (out of 1100)
## done 300 (out of 1100)
## done 400 (out of 1100)
## done 500 (out of 1100)
## done 600 (out of 1100)
## done 700 (out of 1100)
## done 800 (out of 1100)
## done 900 (out of 1100)
## done 1000 (out of 1100)
## time: 4s
## trcnt,tecnt: 1000,1000

yhat_bart = bart_fit$yhat.test.mean
mean((ytest - yhat_bart)^2)

## [1] 0.04028545

# Problem 9 This problem involves the OJ data set which is 
# part of the ISLR2 package.
str(OJ)

## 'data.frame':    1070 obs. of  18 variables:
##  $ Purchase      : Factor w/ 2 levels "CH","MM": 1 1 1 2 1 1 1 1 1 1 ...
##  $ WeekofPurchase: num  237 239 245 227 228 230 232 234 235 238 ...
##  $ StoreID       : num  1 1 1 1 7 7 7 7 7 7 ...
##  $ PriceCH       : num  1.75 1.75 1.86 1.69 1.69 1.69 1.69 1.75 1.75 1.75 ...
##  $ PriceMM       : num  1.99 1.99 2.09 1.69 1.69 1.99 1.99 1.99 1.99 1.99 ...
##  $ DiscCH        : num  0 0 0.17 0 0 0 0 0 0 0 ...
##  $ DiscMM        : num  0 0.3 0 0 0 0 0.4 0.4 0.4 0.4 ...
##  $ SpecialCH     : num  0 0 0 0 0 0 1 1 0 0 ...
##  $ SpecialMM     : num  0 1 0 0 0 1 1 0 0 0 ...
##  $ LoyalCH       : num  0.5 0.6 0.68 0.4 0.957 ...
##  $ SalePriceMM   : num  1.99 1.69 2.09 1.69 1.69 1.99 1.59 1.59 1.59 1.59 ...
##  $ SalePriceCH   : num  1.75 1.75 1.69 1.69 1.69 1.69 1.69 1.75 1.75 1.75 ...
##  $ PriceDiff     : num  0.24 -0.06 0.4 0 0 0.3 -0.1 -0.16 -0.16 -0.16 ...
##  $ Store7        : Factor w/ 2 levels "No","Yes": 1 1 1 1 2 2 2 2 2 2 ...
##  $ PctDiscMM     : num  0 0.151 0 0 0 ...
##  $ PctDiscCH     : num  0 0 0.0914 0 0 ...
##  $ ListPriceDiff : num  0.24 0.24 0.23 0 0 0.3 0.3 0.24 0.24 0.24 ...
##  $ STORE         : num  1 1 1 1 0 0 0 0 0 0 ...

# Part a) Create a training set containing a random sample of 800 observations,
# and a test set containing the remaining observations.

set.seed(7)
model_train = sample(1:nrow(OJ), 800)
OJ_train = OJ[model_train,]
OJ_test = OJ[-model_train,]

# Part (b) Fit a tree to the training data, with Purchase as the response
# and the other variables as predictors. Use the summary() function
# to produce summary statistics about the tree, and describe the
# results obtained. What is the training error rate? How many
# terminal nodes does the tree have?

OJ_tree = tree(Purchase~., data= OJ_train)
summary(OJ_tree)

## 
## Classification tree:
## tree(formula = Purchase ~ ., data = OJ_train)
## Variables actually used in tree construction:
## [1] "LoyalCH"        "PriceDiff"      "WeekofPurchase" "ListPriceDiff" 
## [5] "DiscMM"         "STORE"         
## Number of terminal nodes:  9 
## Residual mean deviance:  0.7329 = 579.7 / 791 
## Misclassification error rate: 0.1588 = 127 / 800

# The training error is 15.88% and based on the summary function there
# are 9 terminal nodes.

# Part (c) Type in the name of the tree object in order to get a detailed
# text output. Pick one of the terminal nodes, and interpret the
# information displayed.

OJ_tree

## node), split, n, deviance, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 800 1073.00 CH ( 0.60500 0.39500 )  
##    2) LoyalCH < 0.5036 360  431.00 MM ( 0.28611 0.71389 )  
##      4) LoyalCH < 0.136698 109   57.19 MM ( 0.07339 0.92661 ) *
##      5) LoyalCH > 0.136698 251  333.00 MM ( 0.37849 0.62151 )  
##       10) PriceDiff < 0.195 110  107.30 MM ( 0.19091 0.80909 ) *
##       11) PriceDiff > 0.195 141  195.10 CH ( 0.52482 0.47518 )  
##         22) WeekofPurchase < 247.5 55   70.90 MM ( 0.34545 0.65455 ) *
##         23) WeekofPurchase > 247.5 86  112.40 CH ( 0.63953 0.36047 ) *
##    3) LoyalCH > 0.5036 440  346.80 CH ( 0.86591 0.13409 )  
##      6) LoyalCH < 0.764572 185  215.90 CH ( 0.72973 0.27027 )  
##       12) ListPriceDiff < 0.255 102  139.50 CH ( 0.56863 0.43137 )  
##         24) DiscMM < 0.47 87  113.30 CH ( 0.64368 0.35632 ) *
##         25) DiscMM > 0.47 15   11.78 MM ( 0.13333 0.86667 ) *
##       13) ListPriceDiff > 0.255 83   43.08 CH ( 0.92771 0.07229 ) *
##      7) LoyalCH > 0.764572 255   77.87 CH ( 0.96471 0.03529 )  
##       14) STORE < 1.5 136    0.00 CH ( 1.00000 0.00000 ) *
##       15) STORE > 1.5 119   63.78 CH ( 0.92437 0.07563 ) *

# At the fourth split there is a terminal node which is indicated in the
# result above. 4) LoyalCH < 0.136698 109   57.19 MM ( 0.07339 0.92661 ) *
# The split of the branch in this node is, LoyalCH < 0.136698, with
# 109 observations in the node with a final prediction of MM that is indicated.

# Part (d) Create a plot of the tree, and interpret the results.
plot(OJ_tree)
text(OJ_tree)

# From the tree results when customer loyalty is low for Citrus Hill, they
# are more than likely to buy from Minute Maid (MM) unless Citrus Hill price
# difference cost 19.5 cents less than Minute Maid. On the otherside of the
# tree when customer loyalty is high to Citrus Hill they will buy 
# the product more than likely compared to Minute Maid unless the price
# difference cost 25.5 cents less, then customers will buy form Minute maid 
# instead.

# Part (e) Predict the response on the test data, and produce a confusion
# matrix comparing the test labels to the predicted test labels.
# What is the test error rate?

pred_OJ = predict(OJ_tree, newdata = OJ_test, type = "class")
table(pred_OJ,OJ_test$Purchase)

##        
## pred_OJ  CH  MM
##      CH 151  31
##      MM  18  70

# Obtained from results above to calculate test error rate
49/270

## [1] 0.1814815

# Test error rate is 0.1814815

# Part (f) Apply the cv.tree() function to the training set in order to
# determine the optimal tree size.
set.seed(7)
cv_OJ_tree = cv.tree(OJ_tree, FUN = prune.misclass)
cv_OJ_tree

## $size
## [1] 9 8 5 2 1
## 
## $dev
## [1] 151 154 154 165 316
## 
## $k
## [1]       -Inf   0.000000   3.666667   8.000000 154.000000
## 
## $method
## [1] "misclass"
## 
## attr(,"class")
## [1] "prune"         "tree.sequence"

# Optimal tree size is 9

plot(cv_OJ_tree$size, cv_OJ_tree$dev, xlab = "Tree size",
     ylab = "Cross-validated classification error rate", type = "b")

# Part h) Which tree size corresponds to the lowest cross-validated
# classification error rate?
# Tree size with 9 terminal nodes corresponds to the lowest cross-validated
# classification error.

# Part (i) Produce a pruned tree corresponding to the optimal tree size
# obtained using cross-validation. If cross-validation does not lead
# to selection of a pruned tree, then create a pruned tree with five
# terminal nodes.

prune_OJ_tree = prune.tree(OJ_tree, best = 5)
plot(prune_OJ_tree)
text(prune_OJ_tree)

# Part (j) Compare the training error rates between the pruned and unpruned
# trees. Which is higher?

pred_prune_tree = predict(prune_OJ_tree, newdata = OJ_train, type = "class")

table(pred_prune_tree, OJ_train$Purchase)

##                
## pred_prune_tree  CH  MM
##              CH 381  59
##              MM 103 257

# The error rate on the pruning is 
162/800

## [1] 0.2025

# Part (k) Compare the test error rates between the pruned and unpruned
# trees. Which is higher?
# The test error rate of the pruned is higher than error rate of unpruned tree.

Assignment 7

Daniel O.

2025-04-30

R Markdown