p = seq(0, 1, 0.05)                              # range of class-1 probabilities
gini = 2 * p * (1 - p)                           # Gini index
entropy = -(p * log(p) + (1 - p) * log(1 - p))   # cross-entropy (natural log)
class.err = 1 - pmax(p, 1 - p)                   # classification error rate
matplot(p, cbind(gini, entropy, class.err), pch = c(0, 15, 2))
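matplot() does not draw a legend by default; one can be added to label the three impurity measures (a small sketch, assuming the plotting symbols and default colours used above):
legend("bottom", legend = c("Gini index", "Entropy", "Classification error"),
       pch = c(0, 15, 2), col = 1:3)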
library(caret)   # for createDataPartition()
library(tree)    # for tree(), cv.tree(), prune.tree()
inTrain <- createDataPartition(Carseats$Sales, p = 0.5, list = FALSE)
train <- Carseats[inTrain, ]
test <- Carseats[-inTrain, ]
y.test <- test$Sales
carseat.tree <- tree(Sales ~ ., data = train)
summary(carseat.tree)
##
## Regression tree:
## tree(formula = Sales ~ ., data = train)
## Variables actually used in tree construction:
## [1] "ShelveLoc" "Price" "Income" "Age" "CompPrice"
## [6] "Advertising"
## Number of terminal nodes: 17
## Residual mean deviance: 2.272 = 418 / 184
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -4.76200 -0.95950 0.08568 0.00000 1.00600 3.55000
plot(carseat.tree)
text(carseat.tree, pretty=0)
yhat <- predict(carseat.tree, newdata=test)
carseat.MSE <- mean((yhat - y.test)^2)
The test MSE obtained for the unpruned tree was 4.5407647.
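As a rough check of the fit, the predictions can be plotted against the observed test values (a minimal sketch using the objects created above):
plot(yhat, y.test, xlab = "Predicted Sales", ylab = "Observed Sales")
abline(0, 1)  # points would fall on this line under perfect prediction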
cv.Carseats <- cv.tree(carseat.tree)
plot(cv.Carseats$size, cv.Carseats$dev, type='b')
prune.Carseats <- prune.tree(carseat.tree, best=14)
plot(prune.Carseats)
text(prune.Carseats, pretty=0)
yhat <- predict(prune.Carseats, test)
carseat.MSE2 <- mean((yhat-y.test)^2)
The test MSE of the pruned tree is 4.4820204, which is only 0.0587443 lower than the previous MSE.
We now use the bagging approach, together with the importance() function, to determine which variables are most important.
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.2.3
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
bag.Carseats <- randomForest(Sales ~ ., data = train, mtry = 10, importance = TRUE)  # mtry = 10 uses all predictors, i.e. bagging
yhat.bag <- predict(bag.Carseats, newdata=test)
bag.MSE <- mean((yhat.bag-y.test)^2)
importance(bag.Carseats)
## %IncMSE IncNodePurity
## CompPrice 20.0856500 160.997241
## Income 12.4774969 97.736143
## Advertising 15.5139101 121.667479
## Population -0.7938376 50.286078
## Price 49.7490298 416.931988
## ShelveLoc 47.1510800 363.787790
## Age 16.9282480 140.719245
## Education -0.5861685 53.488122
## Urban -1.2541817 6.804677
## US 1.1445297 9.490164
The test MSE with bagging is 2.5933931, well below the previous MSE of 4.4820204. Judging from the importance table, Price and ShelveLoc are by far the most important variables.
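The same importance measures can also be viewed graphically with varImpPlot() from the randomForest package:
varImpPlot(bag.Carseats)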
We now fit a random forest, again using the importance() function to determine which variables are most important, and consider the effect of m, the number of variables considered at each split, on the error rate obtained.
randforest.Carseats <- randomForest(Sales ~ ., data = train,
                                    mtry = floor((ncol(Carseats) - 1) / 3),  # m = p/3
                                    importance = TRUE)
yhat.randforest <- predict(randforest.Carseats, newdata=test)
randforest.MSE <- mean((yhat.randforest-y.test)^2)
importance(randforest.Carseats)
## %IncMSE IncNodePurity
## CompPrice 11.5759305 146.19780
## Income 8.3447205 139.25753
## Advertising 12.8890094 131.73946
## Population 0.2337977 101.64228
## Price 35.7432757 333.97613
## ShelveLoc 33.6579308 274.06395
## Age 11.5117569 149.71564
## Education -0.5650574 65.44572
## Urban -3.2490911 12.95234
## US 4.7378338 19.12290
The test MSE for the random forest (m = p/3) is 3.3579941, higher than bagging (m = p) on this split; Price and ShelveLoc again stand out as the most important variables. Decreasing m decorrelates the individual trees, which tends to reduce variance, but it also means fewer of the strong predictors are available at each split.
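To see the effect of m more directly, the forest can be refit over a range of mtry values and the test MSE recorded for each; a sketch reusing the train/test split created above (m.values and m.mse are illustrative names):
set.seed(1)
m.values <- 1:(ncol(Carseats) - 1)
m.mse <- sapply(m.values, function(m) {
  fit <- randomForest(Sales ~ ., data = train, mtry = m)
  mean((predict(fit, newdata = test) - y.test)^2)
})
plot(m.values, m.mse, type = "b", xlab = "m (mtry)", ylab = "Test MSE")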
library(BART)
## Warning: package 'BART' was built under R version 4.2.3
## Loading required package: nlme
## Loading required package: nnet
## Loading required package: survival
##
## Attaching package: 'survival'
## The following object is masked from 'package:caret':
##
## cluster
train <- sample(nrow(Carseats), nrow(Carseats) / 2)   # new train/test split (row indices)
x <- Carseats[, 2:11]        # predictors (Sales is column 1)
y <- Carseats[, "Sales"]     # response
xtrain <- x[train, ]
ytrain <- y[train]
xtest <- x[-train, ]
ytest <- y[-train]
bartfit <- gbart(xtrain, ytrain, x.test=xtest)
## *****Calling gbart: type=1
## *****Data:
## data:n,p,np: 200, 14, 197
## y1,yn: -3.000450, -1.940450
## x1,x[n*p]: 141.000000, 0.000000
## xp1,xp[np*p]: 132.000000, 0.000000
## *****Number of Trees: 200
## *****Number of Cut Points: 64 ... 1
## *****burn,nd,thin: 100,1000,1
## *****Prior:beta,alpha,tau,nu,lambda,offset: 2,0.95,0.284787,3,0.206647,7.15045
## *****sigma: 1.029983
## *****w (weights): 1.000000 ... 1.000000
## *****Dirichlet:sparse,theta,omega,a,b,rho,augment: 0,0,1,0.5,1,14,0
## *****printevery: 100
##
## MCMC
## done 0 (out of 1100)
## done 100 (out of 1100)
## done 200 (out of 1100)
## done 300 (out of 1100)
## done 400 (out of 1100)
## done 500 (out of 1100)
## done 600 (out of 1100)
## done 700 (out of 1100)
## done 800 (out of 1100)
## done 900 (out of 1100)
## done 1000 (out of 1100)
## time: 3s
## trcnt,tecnt: 1000,1000
yhat.bart = bartfit$yhat.test.mean
BART.MSE <- mean((ytest - yhat.bart)^2)
The test MSE with BART is 0.3068485.
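BART has no importance() method, but the fitted gbart object records how many times each predictor appears across the posterior tree draws; sorting varcount.mean gives a rough variable-importance ranking:
ord <- order(bartfit$varcount.mean, decreasing = TRUE)
bartfit$varcount.mean[ord]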
detach(Carseats)
We now turn to the OJ data set, which is part of the ISLR2 package.
library(ISLR2)
##
## Attaching package: 'ISLR2'
## The following objects are masked from 'package:ISLR':
##
## Auto, Credit
attach(OJ)
set.seed(1)
train <- sample(1:nrow(OJ), 800)
oj.train <- OJ[train,]
oj.test <- OJ[-train,]
We fit a classification tree to the training data, with Purchase as the response and the other variables as predictors, and use the summary() function to produce summary statistics about the tree. What is the training error rate? How many terminal nodes does the tree have?
oj.tree <- tree(Purchase ~ ., oj.train)
summary(oj.tree)
##
## Classification tree:
## tree(formula = Purchase ~ ., data = oj.train)
## Variables actually used in tree construction:
## [1] "LoyalCH" "PriceDiff" "SpecialCH" "ListPriceDiff"
## [5] "PctDiscMM"
## Number of terminal nodes: 9
## Residual mean deviance: 0.7432 = 587.8 / 791
## Misclassification error rate: 0.1588 = 127 / 800
The training error rate is 0.1588, and the tree has 9 terminal nodes built from 5 variables: LoyalCH,
PriceDiff, SpecialCH, ListPriceDiff, and PctDiscMM.
oj.tree
## node), split, n, deviance, yval, (yprob)
## * denotes terminal node
##
## 1) root 800 1073.00 CH ( 0.60625 0.39375 )
## 2) LoyalCH < 0.5036 365 441.60 MM ( 0.29315 0.70685 )
## 4) LoyalCH < 0.280875 177 140.50 MM ( 0.13559 0.86441 )
## 8) LoyalCH < 0.0356415 59 10.14 MM ( 0.01695 0.98305 ) *
## 9) LoyalCH > 0.0356415 118 116.40 MM ( 0.19492 0.80508 ) *
## 5) LoyalCH > 0.280875 188 258.00 MM ( 0.44149 0.55851 )
## 10) PriceDiff < 0.05 79 84.79 MM ( 0.22785 0.77215 )
## 20) SpecialCH < 0.5 64 51.98 MM ( 0.14062 0.85938 ) *
## 21) SpecialCH > 0.5 15 20.19 CH ( 0.60000 0.40000 ) *
## 11) PriceDiff > 0.05 109 147.00 CH ( 0.59633 0.40367 ) *
## 3) LoyalCH > 0.5036 435 337.90 CH ( 0.86897 0.13103 )
## 6) LoyalCH < 0.764572 174 201.00 CH ( 0.73563 0.26437 )
## 12) ListPriceDiff < 0.235 72 99.81 MM ( 0.50000 0.50000 )
## 24) PctDiscMM < 0.196196 55 73.14 CH ( 0.61818 0.38182 ) *
## 25) PctDiscMM > 0.196196 17 12.32 MM ( 0.11765 0.88235 ) *
## 13) ListPriceDiff > 0.235 102 65.43 CH ( 0.90196 0.09804 ) *
## 7) LoyalCH > 0.764572 261 91.20 CH ( 0.95785 0.04215 ) *
Node 7 is a terminal node reached when LoyalCH > 0.764572; it contains 261 observations,
has a deviance of 91.20, and predicts CH with a class proportion of about 0.958.
plot(oj.tree)
text(oj.tree, pretty = 0, cex = 0.5)
The tree is fairly balanced, with five terminal nodes on the left branch of the root and
four on the right, and LoyalCH and PriceDiff are the two leading splitting variables.
oj.pred <- predict(oj.tree, oj.test, type = "class")
table(oj.test$Purchase, oj.pred)
## oj.pred
## CH MM
## CH 160 8
## MM 38 64
OJ.testError <- 1 - mean(oj.test$Purchase == oj.pred)
The test error rate is 0.1703704.
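Equivalently, the test error can be read directly from the confusion matrix above: 8 + 38 = 46 of the 270 test observations are misclassified.
(8 + 38) / (160 + 8 + 38 + 64)  # = 46 / 270, matching OJ.testError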
We apply the cv.tree() function to the training set in order to determine the optimal tree size.
oj.cv <- cv.tree(oj.tree, FUN = prune.misclass)
oj.cv
## $size
## [1] 9 8 7 4 2 1
##
## $dev
## [1] 150 150 149 158 172 315
##
## $k
## [1] -Inf 0.000000 3.000000 4.333333 10.500000 151.000000
##
## $method
## [1] "misclass"
##
## attr(,"class")
## [1] "prune" "tree.sequence"
plot(oj.cv$size, oj.cv$dev, xlab = "Tree Size", ylab = "CV Misclassification Errors")
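The size with the lowest cross-validated error can also be extracted programmatically (best.size is an illustrative name; which.min() returns the first minimum when there are ties):
best.size <- oj.cv$size[which.min(oj.cv$dev)]
best.size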
The cross-validation results show that a tree of size 7 gives the lowest error (149 misclassifications), with sizes 8 and 9 essentially tied; a size of 4 does only slightly worse (158), so we prune to 4 terminal nodes below for a much simpler tree.
oj.pruned = prune.tree(oj.tree, best=4)
oj.pruned
## node), split, n, deviance, yval, (yprob)
## * denotes terminal node
##
## 1) root 800 1073.0 CH ( 0.60625 0.39375 )
## 2) LoyalCH < 0.5036 365 441.6 MM ( 0.29315 0.70685 )
## 4) LoyalCH < 0.280875 177 140.5 MM ( 0.13559 0.86441 ) *
## 5) LoyalCH > 0.280875 188 258.0 MM ( 0.44149 0.55851 ) *
## 3) LoyalCH > 0.5036 435 337.9 CH ( 0.86897 0.13103 )
## 6) LoyalCH < 0.764572 174 201.0 CH ( 0.73563 0.26437 ) *
## 7) LoyalCH > 0.764572 261 91.2 CH ( 0.95785 0.04215 ) *
plot(oj.pruned)
text(oj.pruned, pretty = 0, cex = 0.5)
summary(oj.pruned)
##
## Classification tree:
## snip.tree(tree = oj.tree, nodes = 4:6)
## Variables actually used in tree construction:
## [1] "LoyalCH"
## Number of terminal nodes: 4
## Residual mean deviance: 0.8678 = 690.7 / 796
## Misclassification error rate: 0.205 = 164 / 800
The pruned tree has the higher training misclassification error rate, 0.205 for
oj.pruned compared with 0.1588 for oj.tree.
pred.oj.tree <- predict(oj.tree, oj.test, type="class")
misclass.oj.tree <- sum(oj.test$Purchase != pred.oj.tree)
misclass.oj.tree/length(pred.oj.tree)
## [1] 0.1703704
pred.oj.pruned <- predict(oj.pruned, oj.test, type="class")
misclass.oj.pruned <- sum(oj.test$Purchase != pred.oj.pruned)
misclass.oj.pruned/length(pred.oj.pruned)
## [1] 0.1851852
The pruned tree also has the higher test error rate, 0.1851852, compared with 0.1703704 for the unpruned tree.