library(ISLR2)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(tree)
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
##
## The following object is masked from 'package:dplyr':
##
## combine
##
## The following object is masked from 'package:ggplot2':
##
## margin
library(BART)
## Loading required package: nlme
##
## Attaching package: 'nlme'
##
## The following object is masked from 'package:dplyr':
##
## collapse
##
## Loading required package: survival
##
## Attaching package: 'survival'
##
## The following object is masked from 'package:caret':
##
## cluster
Consider the Gini index, classification error, and entropy in a simple classification setting with two classes. Create a single plot that displays each of these quantities as a function of p̂m1. The x-axis should display p̂m1, ranging from 0 to 1, and the y-axis should display the value of the Gini index, classification error, and entropy.
Hint: In a setting with two classes, p̂m1 = 1 − p̂m2. You could make this plot by hand, but it will be much easier to make in R.
class_1 = seq(0, 1, 0.0001)                        # p̂m1 on a fine grid from 0 to 1
class_2 = 1 - class_1                              # p̂m2 = 1 - p̂m1
classification_error = 1 - pmax(class_1, class_2)
gini = class_1 * (1 - class_1) + class_2 * (1 - class_2)
entropy = - class_1 * log(class_1) - class_2 * log(class_2)
data.frame(class_1, class_2, classification_error, gini, entropy) %>%
pivot_longer(cols = c(classification_error, gini, entropy), names_to = "measure") %>%
ggplot(aes(x = class_1, y = value, col = factor(measure))) +
geom_line() +
labs(col = "Measure") +
theme_minimal()
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_line()`).
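The warning comes from the endpoints p̂m1 = 0 and p̂m1 = 1, where 0 * log(0) evaluates to NaN and the two corresponding rows are dropped. A small sketch of one way to avoid it, using the usual convention 0 · log 0 = 0 (the helper name safe_entropy is just illustrative):
safe_entropy = function(p) {
  # return 0 at p = 0 or p = 1 instead of NaN, otherwise the binary entropy
  ifelse(p %in% c(0, 1), 0, -p * log(p) - (1 - p) * log(1 - p))
}
entropy = safe_entropy(class_1)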
In the lab, a classification tree was applied to the Carseats data set after converting Sales into a qualitative response variable. Now we will seek to predict Sales using regression trees and related approaches, treating the response as a quantitative variable.
Carseats = ISLR2::Carseats   # work with a local copy of the data set
set.seed(23)
train_index = sample(1:nrow(Carseats), nrow(Carseats) * 0.6)
train = Carseats[train_index, ]
test = Carseats[-train_index, ]
tree_model = tree(Sales ~ ., data = train)
summary(tree_model)
##
## Regression tree:
## tree(formula = Sales ~ ., data = train)
## Variables actually used in tree construction:
## [1] "ShelveLoc" "Price" "CompPrice" "Income" "Advertising"
## [6] "Age"
## Number of terminal nodes: 19
## Residual mean deviance: 2.146 = 474.3 / 221
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -4.13000 -0.74940 0.05118 0.00000 0.89410 3.67600
plot(tree_model)
text(tree_model, pretty = 0, cex = 0.5)
ShelveLoc and Price appear to be the most important predictors; they define the first splits of the tree.
preds = predict(tree_model, newdata = test)
mse = mean((preds - test$Sales)^2)
mse
## [1] 4.460833
trimmed_tree_model = cv.tree(tree_model)
plot(trimmed_tree_model$size, trimmed_tree_model$dev, type = "b")
pruned_tree_model = prune.tree(tree_model, best = 18)
preds = predict(pruned_tree_model, newdata = test)
mse = mean((preds - test$Sales)^2)
mse
## [1] 4.35041
Pruning the tree to 18 terminal nodes slightly improves the test MSE (4.35 versus 4.46 for the full tree).
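To see how the test MSE behaves across the other candidate sizes from the cross-validation run, one could prune to each size in turn (a sketch; the object names sizes and test_mse are just illustrative):
# Test MSE for each candidate subtree size from the CV run above
sizes = trimmed_tree_model$size[trimmed_tree_model$size > 1]  # skip the root-only tree
test_mse = sapply(sizes, function(s) {
  pruned = prune.tree(tree_model, best = s)
  mean((predict(pruned, newdata = test) - test$Sales)^2)
})
setNames(test_mse, sizes)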
set.seed(23)
bag_model = randomForest(Sales ~ ., data = train, mtry = 10, importance = TRUE)  # mtry = 10 uses all predictors, i.e. bagging
bag_model
##
## Call:
## randomForest(formula = Sales ~ ., data = train, mtry = 10, importance = TRUE)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 10
##
## Mean of squared residuals: 2.599092
## % Var explained: 67.9
preds = predict(bag_model, newdata = test)
plot(preds, test$Sales)
mean((preds - test$Sales)^2)
## [1] 2.216528
importance(bag_model)
## %IncMSE IncNodePurity
## CompPrice 24.830717 197.79619
## Income 11.809758 109.84085
## Advertising 26.428868 222.20544
## Population 1.103274 75.86258
## Price 61.496419 547.48336
## ShelveLoc 58.711603 498.43603
## Age 16.490336 162.72249
## Education 4.166045 50.03900
## Urban -2.213879 8.06940
## US 2.804635 12.51660
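The same importance measures can also be shown as a dot chart using randomForest's built-in plotting function:
# Dot charts of %IncMSE and IncNodePurity for the bagged model
varImpPlot(bag_model)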
set.seed(23)
rf_model = randomForest(Sales ~ ., data = train, mtry = 3, importance = TRUE)  # mtry = 3, roughly p/3
preds = predict(rf_model, newdata = test)
mean((preds - test$Sales)^2)
## [1] 3.031801
Moving from m = 3 (random forest) to m = 10 (bagging) lowers the test MSE here (3.03 versus 2.22), although using all predictors at every split makes the individual trees more highly correlated with one another; a quick check over several values of m is sketched below.
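A sketch of such a check: refit the forest for a few values of mtry and compare test MSEs (results will vary with the seed and are not shown).
# Test MSE as a function of mtry, the number of predictors tried at each split
set.seed(23)
sapply(c(2, 3, 5, 7, 10), function(m) {
  fit = randomForest(Sales ~ ., data = train, mtry = m)
  mean((predict(fit, newdata = test) - test$Sales)^2)
})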
importance(rf_model)
## %IncMSE IncNodePurity
## CompPrice 12.5185007 180.06666
## Income 9.6930436 148.82291
## Advertising 20.7103898 223.94149
## Population 2.6644176 134.22415
## Price 37.5611920 440.18254
## ShelveLoc 41.4239942 389.67498
## Age 11.3496972 185.55786
## Education -0.2200384 81.58845
## Urban -0.3943959 17.24848
## US 4.5631140 39.63352
x = Carseats[, 2:11]  # predictors
y = Carseats[, 1]     # Sales, the response
xtrain = x[train_index, ]
ytrain = y[train_index]
xtest = x[-train_index, ]
ytest = y[-train_index]
set.seed(23)
bart_model = gbart(xtrain, ytrain, x.test = xtest)
## *****Calling gbart: type=1
## *****Data:
## data:n,p,np: 240, 14, 160
## y1,yn: -2.322875, 2.757125
## x1,x[n*p]: 135.000000, 1.000000
## xp1,xp[np*p]: 138.000000, 1.000000
## *****Number of Trees: 200
## *****Number of Cut Points: 64 ... 1
## *****burn,nd,thin: 100,1000,1
## *****Prior:beta,alpha,tau,nu,lambda,offset: 2,0.95,0.281075,3,0.192662,7.68288
## *****sigma: 0.994520
## *****w (weights): 1.000000 ... 1.000000
## *****Dirichlet:sparse,theta,omega,a,b,rho,augment: 0,0,1,0.5,1,14,0
## *****printevery: 100
##
## MCMC
## done 0 (out of 1100)
## done 100 (out of 1100)
## done 200 (out of 1100)
## done 300 (out of 1100)
## done 400 (out of 1100)
## done 500 (out of 1100)
## done 600 (out of 1100)
## done 700 (out of 1100)
## done 800 (out of 1100)
## done 900 (out of 1100)
## done 1000 (out of 1100)
## time: 5s
## trcnt,tecnt: 1000,1000
preds = bart_model$yhat.test.mean
mean((ytest - preds)^2)
## [1] 1.471902
BART achieves a test MSE of about 1.47, the lowest of all the models fit so far.
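For reference, the test MSEs of all the Carseats models can be recomputed side by side from the fitted objects (a sketch; the BART predictions come from the stored yhat.test.mean):
# Test MSEs for the single tree, pruned tree, bagging, random forest, and BART
c(
  tree    = mean((predict(tree_model, newdata = test) - test$Sales)^2),
  pruned  = mean((predict(pruned_tree_model, newdata = test) - test$Sales)^2),
  bagging = mean((predict(bag_model, newdata = test) - test$Sales)^2),
  rf      = mean((predict(rf_model, newdata = test) - test$Sales)^2),
  bart    = mean((bart_model$yhat.test.mean - ytest)^2)
)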
This problem involves the OJ data set which is part of the ISLR2 package.
oj = ISLR2::OJ   # work with a local copy of the data set
set.seed(23)
train_index = sample(1:nrow(oj), 800)
train = oj[train_index, ]
test = oj[-train_index, ]
tree_model = tree(Purchase ~ ., data = train)
summary(tree_model)
##
## Classification tree:
## tree(formula = Purchase ~ ., data = train)
## Variables actually used in tree construction:
## [1] "LoyalCH" "PriceDiff" "SpecialCH"
## Number of terminal nodes: 10
## Residual mean deviance: 0.7116 = 562.2 / 790
## Misclassification error rate: 0.145 = 116 / 800
The training error rate is 14.5% and the tree has ten terminal nodes, built from LoyalCH, PriceDiff, and SpecialCH.
tree_model
## node), split, n, deviance, yval, (yprob)
## * denotes terminal node
##
## 1) root 800 1064.00 CH ( 0.61750 0.38250 )
## 2) LoyalCH < 0.48285 298 327.20 MM ( 0.23826 0.76174 )
## 4) LoyalCH < 0.142213 101 55.92 MM ( 0.07921 0.92079 ) *
## 5) LoyalCH > 0.142213 197 246.90 MM ( 0.31980 0.68020 )
## 10) PriceDiff < 0.31 154 169.80 MM ( 0.24026 0.75974 )
## 20) SpecialCH < 0.5 137 136.00 MM ( 0.19708 0.80292 ) *
## 21) SpecialCH > 0.5 17 23.03 CH ( 0.58824 0.41176 ) *
## 11) PriceDiff > 0.31 43 57.71 CH ( 0.60465 0.39535 ) *
## 3) LoyalCH > 0.48285 502 437.00 CH ( 0.84263 0.15737 )
## 6) LoyalCH < 0.705699 208 258.40 CH ( 0.68750 0.31250 )
## 12) PriceDiff < -0.165 27 25.87 MM ( 0.18519 0.81481 ) *
## 13) PriceDiff > -0.165 181 198.50 CH ( 0.76243 0.23757 )
## 26) PriceDiff < 0.265 97 125.70 CH ( 0.64948 0.35052 )
## 52) LoyalCH < 0.6864 92 114.70 CH ( 0.68478 0.31522 ) *
## 53) LoyalCH > 0.6864 5 0.00 MM ( 0.00000 1.00000 ) *
## 27) PriceDiff > 0.265 84 57.20 CH ( 0.89286 0.10714 ) *
## 7) LoyalCH > 0.705699 294 112.60 CH ( 0.95238 0.04762 )
## 14) PriceDiff < -0.39 14 19.12 CH ( 0.57143 0.42857 ) *
## 15) PriceDiff > -0.39 280 72.65 CH ( 0.97143 0.02857 ) *
Terminal node 21 corresponds to 0.142213 < LoyalCH < 0.48285, PriceDiff < 0.31, and SpecialCH > 0.5. It contains 17 observations, and the predicted class is CH with a fitted probability of about 58.8%.
plot(tree_model)
text(tree_model, pretty = 0, cex = 0.5)
preds = predict(tree_model, test, type = "class")
table(preds, test$Purchase)
##
## preds CH MM
## CH 139 37
## MM 20 74
1 - mean(preds == test$Purchase)
## [1] 0.2111111
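Since caret is already loaded, the confusion matrix, accuracy, and related statistics can also be obtained in a single call (a sketch):
# caret's confusionMatrix reports accuracy, sensitivity, specificity, etc.
confusionMatrix(preds, test$Purchase)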
set.seed(23)
trimmed_tree_model = cv.tree(tree_model, K = 10, FUN = prune.misclass)
trimmed_tree_model
## $size
## [1] 10 9 7 6 4 2 1
##
## $dev
## [1] 147 146 147 145 162 169 306
##
## $k
## [1] -Inf 0.0 2.5 3.0 4.5 8.5 156.0
##
## $method
## [1] "misclass"
##
## attr(,"class")
## [1] "prune" "tree.sequence"
plot(trimmed_tree_model$size, trimmed_tree_model$dev, type = "b")
data.frame(size = trimmed_tree_model$size, CV_Error = trimmed_tree_model$dev / nrow(train)) %>%
mutate(min_CV_Error = as.numeric(min(CV_Error) == CV_Error)) %>%
ggplot(aes(x = size, y = CV_Error)) +
geom_line(col = "grey55") +
geom_point(size = 2, aes(col = factor(min_CV_Error))) +
scale_y_continuous(labels = scales::percent_format()) +
theme_minimal() +
theme(legend.position = "none")
A tree with 6 terminal nodes gives the lowest cross-validated error, so we prune to size 6.
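The same choice can be made programmatically instead of reading it off the plot (a quick sketch using the CV object above):
# Tree size with the lowest cross-validated deviance (misclassification count)
trimmed_tree_model$size[which.min(trimmed_tree_model$dev)]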
pruned_tree_model = prune.tree(tree_model, best = 6)
pruned_tree_model
## node), split, n, deviance, yval, (yprob)
## * denotes terminal node
##
## 1) root 800 1064.00 CH ( 0.61750 0.38250 )
## 2) LoyalCH < 0.48285 298 327.20 MM ( 0.23826 0.76174 )
## 4) LoyalCH < 0.142213 101 55.92 MM ( 0.07921 0.92079 ) *
## 5) LoyalCH > 0.142213 197 246.90 MM ( 0.31980 0.68020 ) *
## 3) LoyalCH > 0.48285 502 437.00 CH ( 0.84263 0.15737 )
## 6) LoyalCH < 0.705699 208 258.40 CH ( 0.68750 0.31250 )
## 12) PriceDiff < -0.165 27 25.87 MM ( 0.18519 0.81481 ) *
## 13) PriceDiff > -0.165 181 198.50 CH ( 0.76243 0.23757 ) *
## 7) LoyalCH > 0.705699 294 112.60 CH ( 0.95238 0.04762 )
## 14) PriceDiff < -0.39 14 19.12 CH ( 0.57143 0.42857 ) *
## 15) PriceDiff > -0.39 280 72.65 CH ( 0.97143 0.02857 ) *
Un-pruned training error rate:
mean(predict(tree_model, type = "class") != train$Purchase)
## [1] 0.145
Pruned training error rate:
mean(predict(pruned_tree_model, type = "class") != train$Purchase)
## [1] 0.16625
Un-pruned test error rate:
mean(predict(tree_model, type = "class", newdata = test) != test$Purchase)
## [1] 0.2111111
Pruned test error rate:
mean(predict(pruned_tree_model, type = "class", newdata = test) != test$Purchase)
## [1] 0.1925926
The pruned tree has a slightly higher training error rate (16.6% versus 14.5%) but a lower test error rate (19.3% versus 21.1%), so pruning does better on the test data.
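Putting the four error rates side by side (a quick sketch):
# Training and test misclassification rates for the un-pruned and pruned trees
data.frame(
  tree  = c("un-pruned", "pruned"),
  train = c(mean(predict(tree_model, type = "class") != train$Purchase),
            mean(predict(pruned_tree_model, type = "class") != train$Purchase)),
  test  = c(mean(predict(tree_model, type = "class", newdata = test) != test$Purchase),
            mean(predict(pruned_tree_model, type = "class", newdata = test) != test$Purchase))
)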