library(ISLR)
## Warning: package 'ISLR' was built under R version 4.3.2
library(caret)
## Warning: package 'caret' was built under R version 4.3.2
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.3.2
## Loading required package: lattice
library(tree)
## Warning: package 'tree' was built under R version 4.3.3
library(rattle)
## Warning: package 'rattle' was built under R version 4.3.3
## Loading required package: tibble
## Loading required package: bitops
## Rattle: A free graphical interface for data science with R.
## Version 5.5.1 Copyright (c) 2006-2021 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.3.3
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:rattle':
##
## importance
## The following object is masked from 'package:ggplot2':
##
## margin
library(gbm)
## Warning: package 'gbm' was built under R version 4.3.3
## Loaded gbm 2.1.9
## This version of gbm is no longer under development. Consider transitioning to gbm3, https://github.com/gbm-developers/gbm3
library(rsample)
## Warning: package 'rsample' was built under R version 4.3.3
library(BART)
## Warning: package 'BART' was built under R version 4.3.3
## Loading required package: nlme
## Loading required package: nnet
## Loading required package: survival
##
## Attaching package: 'survival'
## The following object is masked from 'package:caret':
##
## cluster
p <- seq(0, 1, 0.01)
classification_error <- 1 - pmax(p, 1 - p)
gini_index <- 2 * p * (1 - p)
# Use the convention 0 * log(0) = 0 so entropy is defined at the endpoints p = 0 and p = 1
entropy <- ifelse(p %in% c(0, 1), 0, -(p * log(p) + (1 - p) * log(1 - p)))
matplot(p, cbind(classification_error, gini_index, entropy),
        col = c("blue", "green", "orange"), pch = 16,
        main = "Measures with 2 classes ~ Classification Tree",
        xlab = "Proportion of training observations in class 1 (p_m1)",
        ylab = "Measure Value", ylim = c(0, 1.1), lty = 1)
legend(x = "top", pch = 16, title = "Measure Values (Indicator)",
       col = c("blue", "green", "orange"),
       legend = c("Classification Error Rate", "Gini Index", "Entropy"), box.lty = 1)
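As a quick sanity check (a minimal sketch, not part of the original output), evaluating the three measures at p = 0.5 confirms that the classification error and Gini index both peak at 0.5 while the entropy peaks at log(2) ≈ 0.693, matching the plot above.
p0 <- 0.5
c(classification_error = 1 - pmax(p0, 1 - p0),
  gini_index = 2 * p0 * (1 - p0),
  entropy = -(p0 * log(p0) + (1 - p0) * log(1 - p0)))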
data("Carseats")
set.seed(1)
carseats_split_80_20 = initial_split(Carseats, strata = Sales, prop = .8)
mytrain_carseats_split_80_20 = training(carseats_split_80_20)
mytest_carseats_split_80_20 = testing (carseats_split_80_20)
treereg_carseats <- tree(Sales ~ ., data = mytrain_carseats_split_80_20)
summary(treereg_carseats)
##
## Regression tree:
## tree(formula = Sales ~ ., data = mytrain_carseats_split_80_20)
## Variables actually used in tree construction:
## [1] "ShelveLoc" "Price" "Income" "Age" "Advertising"
## [6] "CompPrice"
## Number of terminal nodes: 18
## Residual mean deviance: 2.543 = 765.4 / 301
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -5.0110 -0.9695 0.1060 0.0000 1.0080 3.6990
Six of the ten predictors (ShelveLoc, Price, Income, Age, Advertising, and CompPrice) are used in constructing the regression tree, which has 18 terminal nodes and a residual mean deviance of 2.543.
plot(treereg_carseats)
text(treereg_carseats, pretty = 0)
The tree plot shows that shelving location (ShelveLoc), a categorical predictor, appears at the top split and is the most important indicator of Sales.
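The same information can be pulled out programmatically; a small sketch using the tree object's frame component (variables used and the root split):
# Variables actually used in the fitted tree (terminal nodes are labelled "<leaf>")
setdiff(as.character(unique(treereg_carseats$frame$var)), "<leaf>")
# The first row of the frame is the root node, i.e. the top split
treereg_carseats$frame[1, c("var", "n", "dev")]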
preds_carseats = predict(treereg_carseats, newdata = mytest_carseats_split_80_20)
mytest_carseats_split_80_20_mse = mean((preds_carseats - mytest_carseats_split_80_20$Sales)^2)
print(paste("Test MSE:", mytest_carseats_split_80_20_mse))
## [1] "Test MSE: 5.24480646279063"
set.seed(1)
cv_carseats <- cv.tree(treereg_carseats)
cv_carseats
## $size
## [1] 18 17 16 15 14 13 12 10 9 8 7 6 5 4 3 2 1
##
## $dev
## [1] 1507.003 1481.050 1524.057 1523.040 1504.805 1505.384 1534.300 1552.600
## [9] 1541.317 1579.384 1587.949 1574.523 1529.546 1540.716 1745.632 2001.102
## [17] 2540.863
##
## $k
## [1] -Inf 26.21400 29.10078 33.92926 34.42837 37.16043 44.81280
## [8] 45.45044 47.46589 52.68018 53.90292 61.90044 75.32244 103.96770
## [15] 163.15419 286.39345 616.83439
##
## $method
## [1] "deviance"
##
## attr(,"class")
## [1] "prune" "tree.sequence"
In the output above, $size lists the number of terminal nodes in decreasing order, $dev gives the corresponding cross-validated deviance, and $k gives the cost-complexity pruning parameter.
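For readability, the three components can be lined up in a single table (a small sketch using the objects above):
data.frame(size = cv_carseats$size,
           cv_deviance = cv_carseats$dev,
           k = cv_carseats$k)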
cv_carseats_min = which.min(cv_carseats$dev)
tree_min_size = cv_carseats$size[cv_carseats_min]
tree_min_size
## [1] 17
plot(cv_carseats$size, cv_carseats$dev, type = "b",xlab='Size',ylab='Deviance',main='Cross Validation Error Plot')
points(tree_min_size, cv_carseats$dev[cv_carseats_min], col = "red", cex = 2, pch = 20)
cat("Red Point (Size, Deviance): ", tree_min_size, ", ", cv_carseats$dev[cv_carseats_min], "\n")
## Red Point (Size, Deviance): 17 , 1481.05
# Populate the size value for the best parameter to prune
prune_carseats <- prune.tree(treereg_carseats, best = 17)
plot(prune_carseats)
text(prune_carseats, pretty = 0)
# Pruned MSE:
prune_preds <- predict(prune_carseats, newdata = mytest_carseats_split_80_20)
prune_mse = mean((prune_preds - mytest_carseats_split_80_20$Sales)^2)
#prune_mse
print(paste("Pruned TEST MSE:", round(prune_mse, 4)))
## [1] "Pruned TEST MSE: 5.3863"
print(paste("Test MSE:", round(mytest_carseats_split_80_20_mse, 4)))
## [1] "Test MSE: 5.2448"
In fact, the test MSE slightly increased after pruning to the CV-selected size of 17 (5.3863 versus 5.2448 for the unpruned tree).
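To see how sensitive this result is to the chosen size, a rough sketch (not part of the original analysis) sweeps over a range of best values and recomputes the test MSE; note that prune.tree() may return the nearest attainable size when the requested one is not in the pruning sequence.
candidate_sizes <- 2:18
prune_test_mse <- sapply(candidate_sizes, function(s) {
  pruned <- prune.tree(treereg_carseats, best = s)
  mean((predict(pruned, newdata = mytest_carseats_split_80_20) -
          mytest_carseats_split_80_20$Sales)^2)
})
plot(candidate_sizes, prune_test_mse, type = "b",
     xlab = "Pruned tree size", ylab = "Test MSE")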
# Bagging: consider all 10 predictors at each split (mtry = p)
bag_car <- randomForest(Sales ~ ., data = mytrain_carseats_split_80_20, mtry = 10, importance = TRUE)
importance(bag_car)
## %IncMSE IncNodePurity
## CompPrice 29.4907932 252.76623
## Income 11.2268650 118.14465
## Advertising 25.3521818 209.03840
## Population -1.0199921 77.58257
## Price 77.0059274 779.50681
## ShelveLoc 73.9529633 730.25644
## Age 19.3169328 213.45063
## Education 0.3431786 62.47515
## Urban 1.2007436 12.11727
## US 4.5814787 11.22747
varImpPlot(bag_car)
bag_car_predict <- predict(bag_car, mytest_carseats_split_80_20)
mean((bag_car_predict - mytest_carseats_split_80_20$Sales)^2)
## [1] 2.918387
Bagging yields a test mean squared error (MSE) of about 2.92, a clear improvement over the single decision tree; Price and ShelveLoc remain the most important predictors.
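As a quick check on the bagged model (a sketch using standard randomForest accessors), the out-of-bag error can be compared with the test MSE and plotted against the number of trees:
# OOB MSE after the final tree, and OOB error versus number of trees
bag_car$mse[bag_car$ntree]
plot(bag_car, main = "Bagging: OOB MSE vs. number of trees")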
rf_carseats = randomForest(Sales ~ ., data = mytrain_carseats_split_80_20, mtry = 5, ntree = 500,
importance = T)
rf_pred = predict(rf_carseats, mytest_carseats_split_80_20)
mean((mytest_carseats_split_80_20$Sales - rf_pred)^2)
## [1] 2.762785
importance(rf_carseats)
## %IncMSE IncNodePurity
## CompPrice 23.0092954 244.57989
## Income 8.7939753 151.29176
## Advertising 22.5561036 230.56229
## Population -1.1421291 106.33742
## Price 63.8920093 697.70719
## ShelveLoc 63.9433216 666.27855
## Age 18.6351204 229.42474
## Education 1.2416396 80.59786
## Urban -0.8628727 16.29853
## US 5.0716029 19.97302
varImpPlot(rf_carseats)
With m = 5, the random forest lowers the test MSE to about 2.76. Changes in m affect the MSE (see the sketch below); ShelveLoc and Price continue to be the most important variables.
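A rough sweep over m (a sketch; exact numbers depend on the random seed) illustrates how the test MSE changes with the number of predictors considered at each split:
set.seed(1)
mtry_values <- 1:10
rf_sweep_mse <- sapply(mtry_values, function(m) {
  fit <- randomForest(Sales ~ ., data = mytrain_carseats_split_80_20,
                      mtry = m, ntree = 500)
  mean((predict(fit, mytest_carseats_split_80_20) -
          mytest_carseats_split_80_20$Sales)^2)
})
plot(mtry_values, rf_sweep_mse, type = "b", xlab = "m (mtry)", ylab = "Test MSE")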
set.seed(100)
train <- sample(1:nrow(Carseats), nrow(Carseats) / 2)
carseats_test <- Carseats[-train, "Sales"]
# Boosted regression trees via gbm (despite the object name, this is boosting, not a BART fit)
boost_BART_carseats <- gbm(Sales ~ ., data = Carseats[train, ],
    distribution = "gaussian", n.trees = 7000, interaction.depth = 5)
summary(boost_BART_carseats)
## var rel.inf
## ShelveLoc ShelveLoc 28.0910574
## Price Price 23.6993021
## Age Age 11.9237569
## CompPrice CompPrice 11.3677990
## Advertising Advertising 8.3828408
## Income Income 7.0214604
## Population Population 5.4484161
## Education Education 3.0956725
## US US 0.5540452
## Urban Urban 0.4156496
# Predict on the held-out half using the first 5000 of the 7000 boosted trees
boost_BART_carseats_MSE <- predict(boost_BART_carseats, newdata = Carseats[-train, ], n.trees = 5000)
mean((boost_BART_carseats_MSE - carseats_test)^2)
## [1] 2.085493
With boosting (fit here via gbm rather than BART, despite the object names), the most important predictors, ShelveLoc and Price, agree with those identified by bagging and the random forest. The test MSE drops to about 2.09, the lowest among the models considered so far.
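Since the BART package is loaded above but not actually used, a minimal BART sketch for comparison might look like the following (assuming gbart() expands the factor predictors internally via its design-matrix construction; output not shown):
x_train <- Carseats[train, setdiff(names(Carseats), "Sales")]
y_train <- Carseats[train, "Sales"]
x_test  <- Carseats[-train, setdiff(names(Carseats), "Sales")]
set.seed(100)
bart_fit <- gbart(x_train, y_train, x.test = x_test)
mean((bart_fit$yhat.test.mean - carseats_test)^2)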
attach(OJ)
set.seed(0)
train_set_OJ=sample(dim(OJ)[1], 800)
train_oj=OJ[train_set_OJ, ]
test_oj=OJ[-train_set_OJ, ]
oj_tree_fit = tree(Purchase ~ ., data = train_oj)
summary(oj_tree_fit)
##
## Classification tree:
## tree(formula = Purchase ~ ., data = train_oj)
## Variables actually used in tree construction:
## [1] "LoyalCH" "PriceDiff" "StoreID"
## Number of terminal nodes: 8
## Residual mean deviance: 0.7679 = 608.2 / 792
## Misclassification error rate: 0.1588 = 127 / 800
The fitted tree uses only LoyalCH, PriceDiff, and StoreID, and has 8 terminal nodes with a training (misclassification) error rate of 0.1588.
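The training error rate can also be recomputed directly from the fitted tree (a small sketch):
mean(predict(oj_tree_fit, newdata = train_oj, type = "class") != train_oj$Purchase)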
oj_tree_fit
## node), split, n, deviance, yval, (yprob)
## * denotes terminal node
##
## 1) root 800 1067.000 CH ( 0.61375 0.38625 )
## 2) LoyalCH < 0.48285 297 329.000 MM ( 0.24242 0.75758 )
## 4) LoyalCH < 0.0356415 60 10.170 MM ( 0.01667 0.98333 ) *
## 5) LoyalCH > 0.0356415 237 289.400 MM ( 0.29958 0.70042 )
## 10) PriceDiff < 0.49 228 268.800 MM ( 0.27632 0.72368 )
## 20) PriceDiff < -0.34 16 0.000 MM ( 0.00000 1.00000 ) *
## 21) PriceDiff > -0.34 212 258.000 MM ( 0.29717 0.70283 ) *
## 11) PriceDiff > 0.49 9 6.279 CH ( 0.88889 0.11111 ) *
## 3) LoyalCH > 0.48285 503 453.800 CH ( 0.83300 0.16700 )
## 6) LoyalCH < 0.764572 233 288.100 CH ( 0.69099 0.30901 )
## 12) PriceDiff < 0.015 83 113.600 MM ( 0.43373 0.56627 )
## 24) StoreID < 3.5 44 49.490 MM ( 0.25000 0.75000 ) *
## 25) StoreID > 3.5 39 50.920 CH ( 0.64103 0.35897 ) *
## 13) PriceDiff > 0.015 150 135.200 CH ( 0.83333 0.16667 ) *
## 7) LoyalCH > 0.764572 270 98.180 CH ( 0.95556 0.04444 ) *
Node 2, reached by the split LoyalCH < 0.48285, contains 297 observations with a deviance of 329 and predicts Purchase = MM; about 76% of the observations in this node are MM and 24% are CH. It carries no asterisk, so it is an internal node that is split further.
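The same node-level numbers can be read off the tree's frame, whose rows are indexed by node number (sketch):
oj_tree_fit$frame["2", c("var", "n", "dev", "yval")]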
plot(oj_tree_fit)
text(oj_tree_fit, pretty =0)
The plot shows that brand loyalty (LoyalCH) dominates the tree, with the price disparity between the brands (PriceDiff) and StoreID refining the predictions at the lower splits.
tree_pred = predict(oj_tree_fit, newdata = test_oj, type = "class")
confusionMatrix_tree = table(tree_pred, test_oj$Purchase)
print(confusionMatrix_tree)
##
## tree_pred CH MM
## CH 134 24
## MM 28 84
print((confusionMatrix_tree[1, 2] + confusionMatrix_tree[2, 1])/sum(confusionMatrix_tree))
## [1] 0.1925926
The test error rate for the unpruned tree is 0.1926 (about 19.3% of the test observations are misclassified).
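Since caret is already loaded, the same table plus the accuracy (1 minus the test error rate) can be obtained with its confusionMatrix() function (sketch):
confusionMatrix(tree_pred, test_oj$Purchase)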
OJ_cv_tree = cv.tree(oj_tree_fit)
OJ_cv_tree
## $size
## [1] 8 7 6 5 4 3 2 1
##
## $dev
## [1] 771.0492 748.5568 715.1965 718.5060 726.8831 741.8421 796.6811
## [8] 1072.1687
##
## $k
## [1] -Inf 10.80302 13.19440 14.31642 29.43984 39.36313 67.48591
## [8] 284.47328
##
## $method
## [1] "deviance"
##
## attr(,"class")
## [1] "prune" "tree.sequence"
The tree sizes considered range from 1 to 8 terminal nodes, with cross-validated deviances ranging from about 715.20 to 1072.17. The lowest cross-validated deviance, 715.1965, occurs at a tree size of 6, not at the full size of 8.
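Because cv.tree() was run with its default, the $dev values above are cross-validated deviances rather than misclassification counts; a sketch of cross-validating on the misclassification error directly uses FUN = prune.misclass:
set.seed(0)
OJ_cv_misclass <- cv.tree(oj_tree_fit, FUN = prune.misclass)
OJ_cv_misclass$size
OJ_cv_misclass$dev   # here $dev is the number of CV misclassifications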
plot(OJ_cv_tree$size, OJ_cv_tree$dev, type = "b")
min_dev <- min(OJ_cv_tree$dev)
min_size <- OJ_cv_tree$size[which.min(OJ_cv_tree$dev)]
cat("Minimum deviance:", min_dev, "\n")
## Minimum deviance: 715.1965
cat("Corresponding size:", min_size, "\n")
## Corresponding size: 6
A tree size of 6 corresponds to the lowest cross-validated deviance, as shown by the plot generated above.
OJ_tree_prune = prune.tree(oj_tree_fit, best = min_size)
plot(OJ_tree_prune)
text(OJ_tree_prune, pretty = 0)
tree_pred_pruned = predict(OJ_tree_prune, newdata = train_oj, type = "class")
CT_prune_train_tree = table(tree_pred_pruned, train_oj$Purchase)
print(CT_prune_train_tree)
##
## tree_pred_pruned CH MM
## CH 391 38
## MM 100 271
print((CT_prune_train_tree[1, 2] + CT_prune_train_tree[2, 1])/sum(CT_prune_train_tree))
## [1] 0.1725
The training error rate is slightly higher for the pruned tree (0.1725) than for the unpruned tree (0.1588).
tree_prune_pred_test_comp = predict(OJ_tree_prune, newdata = test_oj, type = "class")
CT_tree_prune_pred_test_comp = table(tree_prune_pred_test_comp, test_oj$Purchase)
print(CT_tree_prune_pred_test_comp)
##
## tree_prune_pred_test_comp CH MM
## CH 131 19
## MM 31 89
print((CT_tree_prune_pred_test_comp[1, 2] + CT_tree_prune_pred_test_comp[2, 1])/sum(CT_tree_prune_pred_test_comp))
## [1] 0.1851852
The test error rate is lower for the pruned tree (0.1852) than for the unpruned tree (0.1926), so pruning to six terminal nodes improves test performance while also simplifying the tree.
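Collecting the error rates reported above into one small table (sketch; values copied from the results above):
data.frame(tree = c("unpruned", "pruned (6 nodes)"),
           train_error = c(0.1588, 0.1725),
           test_error  = c(0.1926, 0.1852))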