This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
# Load Libraries
library(ISLR2)
## Warning: package 'ISLR2' was built under R version 4.3.3
library(tree)
## Warning: package 'tree' was built under R version 4.3.3
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.3.3
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
library(BART)
## Warning: package 'BART' was built under R version 4.3.3
## Loading required package: nlme
## Loading required package: survival
# Problem 3. Consider the gini_index index, classification error, and entropy in a
# simple classification setting with two classes.
# Create a single plot that displays each of these quantities as a function of
# p^m1. The x axis should display p^m1,
# ranging from 0 to 1, and the y-axis should display the value of the gini_index
# index classification error, and entropy. Hint: In a setting with two classes,
# pˆm1 = 1 − pˆm2. You could make this plot by hand, but it will be much easier
# to make in R.
p=seq(0,1,0.01)
gini_index= 2 * p *(1-p)
class_error= 1 - pmax(p,1-p)
crossentropy= -(p*log(p)+(1-p)*log(1-p))
plot(NA,NA,xlim=c(0,1),ylim=c(0,1),xlab='pm1',ylab='values from 0 to 1')
lines(p,gini_index,col = 'red')
lines(p,class_error,col='blue')
lines(p,crossentropy,col='green')
legend(x='top',legend=c('gini','class error','cross entropy'),
col=c('red','blue','green'),lty=1,text.width = 0.25)
# Problem 8 In the lab, a classification tree was applied to the Carseats data
# set after converting Sales into a qualitative response variable. Now we will
# seek to predict Sales using regression trees and related approaches,
# treating the response as a quantitative variable.
str(Carseats)
## 'data.frame': 400 obs. of 11 variables:
## $ Sales : num 9.5 11.22 10.06 7.4 4.15 ...
## $ CompPrice : num 138 111 113 117 141 124 115 136 132 132 ...
## $ Income : num 73 48 35 100 64 113 105 81 110 113 ...
## $ Advertising: num 11 16 10 4 3 13 0 15 0 0 ...
## $ Population : num 276 260 269 466 340 501 45 425 108 131 ...
## $ Price : num 120 83 80 97 128 72 108 120 124 124 ...
## $ ShelveLoc : Factor w/ 3 levels "Bad","Good","Medium": 1 2 3 3 1 1 3 2 3 3 ...
## $ Age : num 42 65 59 55 38 78 71 67 76 76 ...
## $ Education : num 17 10 12 14 13 16 15 10 10 17 ...
## $ Urban : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 1 2 2 1 1 ...
## $ US : Factor w/ 2 levels "No","Yes": 2 2 2 2 1 2 1 2 1 2 ...
# Part a) Split the data set into a training set and a test set.
set.seed(7)
train = sample(1:nrow(Carseats), nrow(Carseats)/2)
Carseats_train = Carseats[train,]
Carseats_test = Carseats[-train,]
# b) Fit a regression tree to the training set. Plot the tree, and interpret
# the results. What test MSE do you obtain?
tree_Carseats = tree(Sales~. , data = Carseats_train)
plot(tree_Carseats)
text(tree_Carseats)
# Calculate MSE
tree_pred = predict(tree_Carseats, Carseats_test)
test_mse = mean((tree_pred-Carseats_test$Sales)^2)
test_mse
## [1] 4.765435
# From the results generated the regression tree shows that the variables
# Shelve Location and Price are the first two important predictors as they
# are the core components of the tree.
# c) Use cross-validation in order to determine the optimal level of
# tree complexity. Does pruning the tree improve the test MSE?
set.seed(7)
cv_Carseats = cv.tree(tree_Carseats)
plot(cv_Carseats$size, cv_Carseats$dev, xlab = "Number of terminal nodes", ylab = "Cross validation error", type = "b")
prune_Carseats = prune.tree(tree_Carseats, best = 6)
tree_pred = predict(prune_Carseats, Carseats_test)
test_mse = mean((tree_pred-Carseats_test$Sales)^2)
test_mse
## [1] 4.448592
# In the graph there the last significant decrease in cross validation error
# is at 6 nodes. Compared to other possible nodes to prune at, pruning at
# node 6 causes the test MSE to have the greatest decrease to a value of
# 4.448592.
# Part d) Use the bagging approach in order to analyze this data. What
# test MSE do you obtain? Use the importance() function to determine
# which variables are most important.
set.seed(7)
bag_Carseats = randomForest(Carseats_train$Sales~.,
data = Carseats_train, mtry = 10, importance = TRUE)
bag_Carseats
##
## Call:
## randomForest(formula = Carseats_train$Sales ~ ., data = Carseats_train, mtry = 10, importance = TRUE)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 10
##
## Mean of squared residuals: 2.945681
## % Var explained: 67.23
bagging_pred = predict(bag_Carseats, Carseats_test)
test_mse = mean((bagging_pred-Carseats_test$Sales)^2)
test_mse
## [1] 2.352134
importance(bag_Carseats)
## %IncMSE IncNodePurity
## CompPrice 17.6664100 148.289756
## Income 0.5061627 78.178160
## Advertising 16.5454276 127.177114
## Population 0.2094257 63.036489
## Price 52.0759302 513.899493
## ShelveLoc 56.7454975 603.105977
## Age 18.1370219 167.149176
## Education 0.6632503 38.960150
## Urban -0.8142933 4.599324
## US 1.9049544 7.827738
# The test mse is 2.352134 and the variables that are most important are
# ShelveLoc and Price
# Part e) Use random forests to analyze this data. What test MSE do you
# obtain? Use the importance() function to determine which variables
# are most important. Describe the effect of m, the number of
# variables considered at each split, on the error rate
# obtained.
set.seed(7)
randfore_Carseats = randomForest(Carseats_train$Sales~.,data = Carseats_train, importance=TRUE)
randfore_Carseats
##
## Call:
## randomForest(formula = Carseats_train$Sales ~ ., data = Carseats_train, importance = TRUE)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 3
##
## Mean of squared residuals: 3.299988
## % Var explained: 63.29
# now get the test MSE #
yhat_rf = predict(randfore_Carseats, newdata = Carseats_test)
randf_mse = mean((yhat_rf - Carseats_test$Sales)^2)
randf_mse
## [1] 2.770891
importance(randfore_Carseats)
## %IncMSE IncNodePurity
## CompPrice 7.8250023 136.13136
## Income 0.6547144 105.42039
## Advertising 13.2914493 150.93588
## Population 1.5889887 113.39707
## Price 37.7823496 424.67020
## ShelveLoc 39.7564258 417.07473
## Age 17.2844151 221.63789
## Education 2.9320469 83.08323
## Urban 0.4656180 11.91653
## US 2.0176005 22.60699
set.seed(7)
rf_Carseats = randomForest(Carseats_train$Sales~.,
data = Carseats_train,mtry = 2, importance=TRUE)
yhat_rf = predict(rf_Carseats, newdata = Carseats_test)
rf_mse2 = mean((yhat_rf-Carseats_test$Sales)^2)
rf_mse2
## [1] 3.136767
set.seed(7)
rf_Carseats = randomForest(Carseats_train$Sales~.,
data = Carseats_train,mtry = 6, importance=TRUE)
yhat_rf = predict(rf_Carseats, newdata = Carseats_test)
rf_mse3 = mean((yhat_rf-Carseats_test$Sales)^2)
rf_mse3
## [1] 2.425748
# With the random forest using 3 predictors, the MSE obtained is
# 2.770891 which is higher compared to the bagging approach.
# When adjusting the value of m, it can be seen that as m increase the
# test mean square error decreases.
# Part f)Now analyze the data using BART, and report your results.
test = Carseats[-train,"Sales"]
x = Carseats[,1:10]
y = Carseats[,"Sales"]
xtrain = x[train, ]
ytrain = y[train]
xtest = x[test, ]
ytest = y[test]
set.seed(7)
bart_fit = gbart(xtrain, ytrain, x.test=xtest)
## Warning in summary.lm(lm(y.train ~ ., data.frame(t(x.train), y.train))):
## essentially perfect fit: summary may be unreliable
## *****Calling gbart: type=1
## *****Data:
## data:n,p,np: 200, 13, 200
## y1,yn: -4.467850, 1.162150
## x1,x[n*p]: 3.070000, 1.000000
## xp1,xp[np*p]: 6.630000, 0.000000
## *****Number of Trees: 200
## *****Number of Cut Points: 100 ... 1
## *****burn,nd,thin: 100,1000,1
## *****Prior:beta,alpha,tau,nu,lambda,offset: 2,0.95,0.287616,3,5.10935e-31,7.53785
## *****sigma: 0.000000
## *****w (weights): 1.000000 ... 1.000000
## *****Dirichlet:sparse,theta,omega,a,b,rho,augment: 0,0,1,0.5,1,13,0
## *****printevery: 100
##
## MCMC
## done 0 (out of 1100)
## done 100 (out of 1100)
## done 200 (out of 1100)
## done 300 (out of 1100)
## done 400 (out of 1100)
## done 500 (out of 1100)
## done 600 (out of 1100)
## done 700 (out of 1100)
## done 800 (out of 1100)
## done 900 (out of 1100)
## done 1000 (out of 1100)
## time: 4s
## trcnt,tecnt: 1000,1000
yhat_bart = bart_fit$yhat.test.mean
mean((ytest - yhat_bart)^2)
## [1] 0.04028545
# Problem 9 This problem involves the OJ data set which is
# part of the ISLR2 package.
str(OJ)
## 'data.frame': 1070 obs. of 18 variables:
## $ Purchase : Factor w/ 2 levels "CH","MM": 1 1 1 2 1 1 1 1 1 1 ...
## $ WeekofPurchase: num 237 239 245 227 228 230 232 234 235 238 ...
## $ StoreID : num 1 1 1 1 7 7 7 7 7 7 ...
## $ PriceCH : num 1.75 1.75 1.86 1.69 1.69 1.69 1.69 1.75 1.75 1.75 ...
## $ PriceMM : num 1.99 1.99 2.09 1.69 1.69 1.99 1.99 1.99 1.99 1.99 ...
## $ DiscCH : num 0 0 0.17 0 0 0 0 0 0 0 ...
## $ DiscMM : num 0 0.3 0 0 0 0 0.4 0.4 0.4 0.4 ...
## $ SpecialCH : num 0 0 0 0 0 0 1 1 0 0 ...
## $ SpecialMM : num 0 1 0 0 0 1 1 0 0 0 ...
## $ LoyalCH : num 0.5 0.6 0.68 0.4 0.957 ...
## $ SalePriceMM : num 1.99 1.69 2.09 1.69 1.69 1.99 1.59 1.59 1.59 1.59 ...
## $ SalePriceCH : num 1.75 1.75 1.69 1.69 1.69 1.69 1.69 1.75 1.75 1.75 ...
## $ PriceDiff : num 0.24 -0.06 0.4 0 0 0.3 -0.1 -0.16 -0.16 -0.16 ...
## $ Store7 : Factor w/ 2 levels "No","Yes": 1 1 1 1 2 2 2 2 2 2 ...
## $ PctDiscMM : num 0 0.151 0 0 0 ...
## $ PctDiscCH : num 0 0 0.0914 0 0 ...
## $ ListPriceDiff : num 0.24 0.24 0.23 0 0 0.3 0.3 0.24 0.24 0.24 ...
## $ STORE : num 1 1 1 1 0 0 0 0 0 0 ...
# Part a) Create a training set containing a random sample of 800 observations,
# and a test set containing the remaining observations.
set.seed(7)
model_train = sample(1:nrow(OJ), 800)
OJ_train = OJ[model_train,]
OJ_test = OJ[-model_train,]
# Part (b) Fit a tree to the training data, with Purchase as the response
# and the other variables as predictors. Use the summary() function
# to produce summary statistics about the tree, and describe the
# results obtained. What is the training error rate? How many
# terminal nodes does the tree have?
OJ_tree = tree(Purchase~., data= OJ_train)
summary(OJ_tree)
##
## Classification tree:
## tree(formula = Purchase ~ ., data = OJ_train)
## Variables actually used in tree construction:
## [1] "LoyalCH" "PriceDiff" "WeekofPurchase" "ListPriceDiff"
## [5] "DiscMM" "STORE"
## Number of terminal nodes: 9
## Residual mean deviance: 0.7329 = 579.7 / 791
## Misclassification error rate: 0.1588 = 127 / 800
# The training error is 15.88% and based on the summary function there
# are 9 terminal nodes.
# Part (c) Type in the name of the tree object in order to get a detailed
# text output. Pick one of the terminal nodes, and interpret the
# information displayed.
OJ_tree
## node), split, n, deviance, yval, (yprob)
## * denotes terminal node
##
## 1) root 800 1073.00 CH ( 0.60500 0.39500 )
## 2) LoyalCH < 0.5036 360 431.00 MM ( 0.28611 0.71389 )
## 4) LoyalCH < 0.136698 109 57.19 MM ( 0.07339 0.92661 ) *
## 5) LoyalCH > 0.136698 251 333.00 MM ( 0.37849 0.62151 )
## 10) PriceDiff < 0.195 110 107.30 MM ( 0.19091 0.80909 ) *
## 11) PriceDiff > 0.195 141 195.10 CH ( 0.52482 0.47518 )
## 22) WeekofPurchase < 247.5 55 70.90 MM ( 0.34545 0.65455 ) *
## 23) WeekofPurchase > 247.5 86 112.40 CH ( 0.63953 0.36047 ) *
## 3) LoyalCH > 0.5036 440 346.80 CH ( 0.86591 0.13409 )
## 6) LoyalCH < 0.764572 185 215.90 CH ( 0.72973 0.27027 )
## 12) ListPriceDiff < 0.255 102 139.50 CH ( 0.56863 0.43137 )
## 24) DiscMM < 0.47 87 113.30 CH ( 0.64368 0.35632 ) *
## 25) DiscMM > 0.47 15 11.78 MM ( 0.13333 0.86667 ) *
## 13) ListPriceDiff > 0.255 83 43.08 CH ( 0.92771 0.07229 ) *
## 7) LoyalCH > 0.764572 255 77.87 CH ( 0.96471 0.03529 )
## 14) STORE < 1.5 136 0.00 CH ( 1.00000 0.00000 ) *
## 15) STORE > 1.5 119 63.78 CH ( 0.92437 0.07563 ) *
# At the fourth split there is a terminal node which is indicated in the
# result above. 4) LoyalCH < 0.136698 109 57.19 MM ( 0.07339 0.92661 ) *
# The split of the branch in this node is, LoyalCH < 0.136698, with
# 109 observations in the node with a final prediction of MM that is indicated.
# Part (d) Create a plot of the tree, and interpret the results.
plot(OJ_tree)
text(OJ_tree)
# From the tree results when customer loyalty is low for Citrus Hill, they
# are more than likely to buy from Minute Maid (MM) unless Citrus Hill price
# difference cost 19.5 cents less than Minute Maid. On the otherside of the
# tree when customer loyalty is high to Citrus Hill they will buy
# the product more than likely compared to Minute Maid unless the price
# difference cost 25.5 cents less, then customers will buy form Minute maid
# instead.
# Part (e) Predict the response on the test data, and produce a confusion
# matrix comparing the test labels to the predicted test labels.
# What is the test error rate?
pred_OJ = predict(OJ_tree, newdata = OJ_test, type = "class")
table(pred_OJ,OJ_test$Purchase)
##
## pred_OJ CH MM
## CH 151 31
## MM 18 70
# Obtained from results above to calculate test error rate
49/270
## [1] 0.1814815
# Test error rate is 0.1814815
# Part (f) Apply the cv.tree() function to the training set in order to
# determine the optimal tree size.
set.seed(7)
cv_OJ_tree = cv.tree(OJ_tree, FUN = prune.misclass)
cv_OJ_tree
## $size
## [1] 9 8 5 2 1
##
## $dev
## [1] 151 154 154 165 316
##
## $k
## [1] -Inf 0.000000 3.666667 8.000000 154.000000
##
## $method
## [1] "misclass"
##
## attr(,"class")
## [1] "prune" "tree.sequence"
# Optimal tree size is 9
plot(cv_OJ_tree$size, cv_OJ_tree$dev, xlab = "Tree size",
ylab = "Cross-validated classification error rate", type = "b")
# Part h) Which tree size corresponds to the lowest cross-validated
# classification error rate?
# Tree size with 9 terminal nodes corresponds to the lowest cross-validated
# classification error.
# Part (i) Produce a pruned tree corresponding to the optimal tree size
# obtained using cross-validation. If cross-validation does not lead
# to selection of a pruned tree, then create a pruned tree with five
# terminal nodes.
prune_OJ_tree = prune.tree(OJ_tree, best = 5)
plot(prune_OJ_tree)
text(prune_OJ_tree)
# Part (j) Compare the training error rates between the pruned and unpruned
# trees. Which is higher?
pred_prune_tree = predict(prune_OJ_tree, newdata = OJ_train, type = "class")
table(pred_prune_tree, OJ_train$Purchase)
##
## pred_prune_tree CH MM
## CH 381 59
## MM 103 257
# The error rate on the pruning is
162/800
## [1] 0.2025
# Part (k) Compare the test error rates between the pruned and unpruned
# trees. Which is higher?
# The test error rate of the pruned is higher than error rate of unpruned tree.