library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.1 ✔ purrr 1.0.1
## ✔ tibble 3.1.8 ✔ dplyr 1.1.0
## ✔ tidyr 1.3.0 ✔ stringr 1.5.0
## ✔ readr 2.1.4 ✔ forcats 1.0.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(ISLR)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(tree)
## Warning: package 'tree' was built under R version 4.2.3
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.2.3
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
##
## The following object is masked from 'package:dplyr':
##
## combine
##
## The following object is masked from 'package:ggplot2':
##
## margin
library(BART)
## Warning: package 'BART' was built under R version 4.2.3
## Loading required package: nlme
##
## Attaching package: 'nlme'
##
## The following object is masked from 'package:dplyr':
##
## collapse
##
## Loading required package: nnet
## Loading required package: survival
##
## Attaching package: 'survival'
##
## The following object is masked from 'package:caret':
##
## cluster
prop1 <- seq(0,1,.01)
prop2 <- 1 - prop1
classerror <- 1 - pmax(prop1, prop2)
gini <- prop1 * (1-prop1) + prop2 * (1-prop2)
entropy <- -prop1 * log(prop1) - prop2 * log(prop2)
data.frame(prop1, prop2, classerror, gini, entropy) %>%
pivot_longer(cols = c(classerror,gini, entropy), names_to = "metric") %>%
ggplot(aes(prop1, value, col = factor(metric))) +
geom_line() +
scale_color_hue(labels = c("Classification Error", "Entropy", "Gini")) +
labs(col = "Metric", x = "proportion", y = "value")
## Warning: Removed 2 rows containing missing values (`geom_line()`).
set.seed(1)
car.index <- createDataPartition(Carseats$Sales, p = .75, list = FALSE, times = 1)
carTrain <- Carseats[car.index,]
carTest <- Carseats[-car.index,]
car.tree <- tree(Sales~., data = carTrain)
plot(car.tree)
text(car.tree, pretty = 0, cex = .5)
summary(car.tree)
##
## Regression tree:
## tree(formula = Sales ~ ., data = carTrain)
## Variables actually used in tree construction:
## [1] "ShelveLoc" "Price" "Income" "Age" "Advertising"
## [6] "CompPrice"
## Number of terminal nodes: 16
## Residual mean deviance: 2.295 = 654 / 285
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -3.5490 -1.0160 -0.1187 0.0000 0.9648 3.5840
car.pred.t <- predict(car.tree, carTest)
mean((car.pred.t-carTest$Sales)^2)
## [1] 4.935788
We get a test MSE of about 4.94, which is reasonably good.
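To put that number on the original scale of Sales (unit sales in thousands), we could compare the test RMSE with the spread of Sales in the held-out data. This is just a quick sketch reusing the objects defined above:
sqrt(mean((car.pred.t - carTest$Sales)^2)) # test RMSE, roughly 2.2
sd(carTest$Sales)                          # spread of Sales in the test set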
set.seed(1)
cv.cars <- cv.tree(car.tree)
plot(cv.cars$size, cv.cars$dev, type = 'b')
It appears that pruning the tree to around 12 terminal nodes gives the lowest cross-validated deviance.
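We could also read the best size straight off the cv.tree object rather than eyeballing the plot. A small sketch; note that when several sizes tie, which.min simply returns the first one listed:
cv.cars$size[which.min(cv.cars$dev)] # tree size with the lowest CV deviance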
prune.carseats <- prune.tree(car.tree, best = 12)
plot(prune.carseats)
text(prune.carseats, pretty = 0, cex = .5)
prune.pred <- predict(prune.carseats, carTest)
mean((prune.pred-carTest$Sales)^2)
## [1] 4.970822
Pruning raised the test MSE slightly, from about 4.94 to 4.97 (an increase of roughly 0.04).
# mtry = 10 uses all ten predictors at each split, so this is bagging
car.bag <- randomForest(Sales ~ ., data = carTrain, mtry = 10, ntree = 500, importance = TRUE)
bag.preds <- predict(car.bag, carTest)
mean((bag.preds - carTest$Sales)^2)
## [1] 3.179889
importance(car.bag) %>%
as.data.frame() %>%
rownames_to_column('variable') %>%
arrange(desc(IncNodePurity))
## variable %IncMSE IncNodePurity
## 1 Price 68.2206989 743.44922
## 2 ShelveLoc 80.1387119 694.76788
## 3 CompPrice 29.7704182 215.53404
## 4 Advertising 24.7563156 185.71361
## 5 Age 20.3956172 181.77479
## 6 Income 8.8936255 104.74986
## 7 Population 0.9713052 70.37262
## 8 Education 2.8936326 61.95442
## 9 US 3.9202545 14.31920
## 10 Urban -0.7213293 10.73275
It appears that Price and ShelveLoc are the two most important variables.
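The importance table can also be visualized with randomForest's varImpPlot(), which sorts the variables by each importance measure:
varImpPlot(car.bag)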
# mtry = 3 (roughly p/3) samples a subset of predictors at each split, giving a random forest
car.rf <- randomForest(Sales ~ ., data = carTrain, mtry = 3, ntree = 500, importance = TRUE)
rf.preds <- predict(car.rf, carTest)
mean((rf.preds - carTest$Sales)^2)
## [1] 3.639842
importance(car.rf) %>%
as.data.frame() %>%
rownames_to_column('variable') %>%
arrange(desc(IncNodePurity))
## variable %IncMSE IncNodePurity
## 1 Price 46.405424 571.77646
## 2 ShelveLoc 48.426476 552.90732
## 3 Age 16.616958 229.12277
## 4 CompPrice 13.685658 200.89817
## 5 Advertising 18.826769 200.14971
## 6 Income 2.149908 157.18510
## 7 Population -1.204152 134.97045
## 8 Education 1.874559 106.63373
## 9 US 5.656612 33.74457
## 10 Urban -2.098901 22.43028
Again, Price and ShelveLoc are the two most important variables. Increasing mtry reduces the test error here: the test MSE drops from about 3.64 with mtry = 3 to about 3.18 with mtry = 10 (bagging). A sketch of that comparison across the full range of mtry values is shown below.
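This is a minimal sketch, assuming we are willing to refit the forest ten times; the exact MSE values will vary with the random seed:
mtry.mse <- sapply(1:10, function(m) {
  # fit a forest with m candidate predictors per split and record its test MSE
  fit <- randomForest(Sales ~ ., data = carTrain, mtry = m, ntree = 500)
  mean((predict(fit, carTest) - carTest$Sales)^2)
})
plot(1:10, mtry.mse, type = 'b', xlab = 'mtry', ylab = 'Test MSE')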
set.seed(1)
train <- sample(1:nrow(Carseats), nrow(Carseats) / 2)
x <- Carseats[, 2:11]
y <- Carseats[, 'Sales']
xtrain <- x[train, ]
ytrain <- y[train]
xtest <- x[-train, ]
ytest <- y[-train]
car.bart <- gbart(xtrain, ytrain, x.test = xtest)
## *****Calling gbart: type=1
## *****Data:
## data:n,p,np: 200, 14, 200
## y1,yn: 2.781850, 1.091850
## x1,x[n*p]: 107.000000, 1.000000
## xp1,xp[np*p]: 111.000000, 1.000000
## *****Number of Trees: 200
## *****Number of Cut Points: 63 ... 1
## *****burn,nd,thin: 100,1000,1
## *****Prior:beta,alpha,tau,nu,lambda,offset: 2,0.95,0.273474,3,0.23074,7.57815
## *****sigma: 1.088371
## *****w (weights): 1.000000 ... 1.000000
## *****Dirichlet:sparse,theta,omega,a,b,rho,augment: 0,0,1,0.5,1,14,0
## *****printevery: 100
##
## MCMC
## done 0 (out of 1100)
## done 100 (out of 1100)
## done 200 (out of 1100)
## done 300 (out of 1100)
## done 400 (out of 1100)
## done 500 (out of 1100)
## done 600 (out of 1100)
## done 700 (out of 1100)
## done 800 (out of 1100)
## done 900 (out of 1100)
## done 1000 (out of 1100)
## time: 3s
## trcnt,tecnt: 1000,1000
bart.preds <- car.bart$yhat.test.mean
mean((ytest - bart.preds)^2)
## [1] 1.499118
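BART gives the lowest test MSE of the approaches tried here, about 1.50 versus roughly 3.2 for bagging and 3.6 for the random forest. As a rough look at which predictors BART leans on, we can sort the average number of times each variable appears across the trees in the ensemble (the varcount.mean component of the gbart fit):
ord <- order(car.bart$varcount.mean, decreasing = TRUE)
car.bart$varcount.mean[ord]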
OJ.index <- createDataPartition(OJ$Purchase, p = .75, list = FALSE, times = 1)
OJtrain <- OJ[OJ.index,]
OJtest <- OJ[-OJ.index,]
oj.tree <- tree(Purchase~., data = OJtrain)
summary(oj.tree)
##
## Classification tree:
## tree(formula = Purchase ~ ., data = OJtrain)
## Variables actually used in tree construction:
## [1] "LoyalCH" "PriceDiff" "ListPriceDiff" "DiscMM"
## Number of terminal nodes: 7
## Residual mean deviance: 0.7389 = 588.2 / 796
## Misclassification error rate: 0.1569 = 126 / 803
The tree has 7 terminal nodes and a training misclassification rate of about 15.7%.
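As a cross-check, we could compute that training error rate directly from the fitted tree; this quick sketch should reproduce the 0.1569 reported above:
mean(predict(oj.tree, OJtrain, type = 'class') != OJtrain$Purchase)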
oj.tree
## node), split, n, deviance, yval, (yprob)
## * denotes terminal node
##
## 1) root 803 1074.00 CH ( 0.61021 0.38979 )
## 2) LoyalCH < 0.48285 290 306.10 MM ( 0.22069 0.77931 )
## 4) LoyalCH < 0.280875 167 118.30 MM ( 0.11377 0.88623 ) *
## 5) LoyalCH > 0.280875 123 161.60 MM ( 0.36585 0.63415 )
## 10) PriceDiff < -0.24 13 0.00 MM ( 0.00000 1.00000 ) *
## 11) PriceDiff > -0.24 110 148.80 MM ( 0.40909 0.59091 ) *
## 3) LoyalCH > 0.48285 513 467.10 CH ( 0.83041 0.16959 )
## 6) LoyalCH < 0.764572 237 298.90 CH ( 0.67511 0.32489 )
## 12) ListPriceDiff < 0.235 95 129.30 MM ( 0.42105 0.57895 )
## 24) DiscMM < 0.15 56 75.84 CH ( 0.58929 0.41071 ) *
## 25) DiscMM > 0.15 39 36.71 MM ( 0.17949 0.82051 ) *
## 13) ListPriceDiff > 0.235 142 122.50 CH ( 0.84507 0.15493 ) *
## 7) LoyalCH > 0.764572 276 85.99 CH ( 0.96377 0.03623 ) *
I chose node 4), a terminal node containing the observations where LoyalCH is less than 0.280875. It holds 167 observations with a deviance of 118.3, and it predicts MM based on roughly an 11/89 split between CH and MM.
plot(oj.tree)
text(oj.tree, pretty = 0, cex = .8)
Generally, if LoyalCH is less than 0.48 the tree predicts MM; otherwise it predicts CH, with the exception of one terminal node (where DiscMM > 0.15) that still predicts MM.
OJ.preds <- predict(oj.tree, OJtest, type = 'class')
table(OJ.preds, OJtest$Purchase)
##
## OJ.preds CH MM
## CH 125 16
## MM 38 88
mean(OJ.preds != OJtest$Purchase)
## [1] 0.2022472
We have a test error rate of about 20%.
oj.cv <- cv.tree(oj.tree, FUN = prune.misclass)
oj.cv
## $size
## [1] 7 5 2 1
##
## $dev
## [1] 151 151 169 313
##
## $k
## [1] -Inf 0.000000 8.333333 162.000000
##
## $method
## [1] "misclass"
##
## attr(,"class")
## [1] "prune" "tree.sequence"
plot(oj.cv$size, oj.cv$dev, type = 'b')
Cross-validation gives the same minimum deviance (151) for both the 5-node and the full 7-node tree, so pruning to 5 terminal nodes is our best bet.
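As a quick check, we can pull the smallest size attaining the minimum cross-validated deviance directly from oj.cv (a small sketch; given the tie above, it returns 5 here):
min(oj.cv$size[oj.cv$dev == min(oj.cv$dev)])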
oj.prune <- prune.misclass(oj.tree, best = 5)
plot(oj.prune)
text(oj.prune, pretty = 0, cex = .7)
summary(oj.tree)
##
## Classification tree:
## tree(formula = Purchase ~ ., data = OJtrain)
## Variables actually used in tree construction:
## [1] "LoyalCH" "PriceDiff" "ListPriceDiff" "DiscMM"
## Number of terminal nodes: 7
## Residual mean deviance: 0.7389 = 588.2 / 796
## Misclassification error rate: 0.1569 = 126 / 803
summary(oj.prune)
##
## Classification tree:
## snip.tree(tree = oj.tree, nodes = 2L)
## Variables actually used in tree construction:
## [1] "LoyalCH" "ListPriceDiff" "DiscMM"
## Number of terminal nodes: 5
## Residual mean deviance: 0.7858 = 627.1 / 798
## Misclassification error rate: 0.1569 = 126 / 803
It appears that the pruned tree and the original tree have the same training misclassification error rate (0.1569), although the pruned tree has a slightly higher residual mean deviance.
prune.preds <- predict(oj.prune, OJtest, type = 'class')
table(prune.preds, OJtest$Purchase)
##
## prune.preds CH MM
## CH 125 16
## MM 38 88
mean(prune.preds != OJtest$Purchase)
## [1] 0.2022472
The pruned tree again has the same test misclassification rate of about 20% as the unpruned tree.