# Gini index, classification error, and entropy as functions of p, the
# proportion of observations in class 1, for a two-class setting.
p <- seq(0, 1, 0.01)
gini <- 2 * p * (1 - p)
classerror <- 1 - pmax(p, 1 - p)
entropy <- -(p * log(p) + (1 - p) * log(1 - p))
matplot(p, cbind(gini, classerror, entropy), type = "l", lty = 1,
        col = c("blue", "lightblue", "black"), ylab = "impurity")
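# A minimal sketch (assuming the matplot call above): a legend makes the three
# curves easy to tell apart; entropy is largest everywhere and the
# classification error is piecewise linear.
legend("topright", legend = c("Gini", "Classification error", "Entropy"),
       col = c("blue", "lightblue", "black"), lty = 1, bty = "n")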
### Number 8 ## (a) Split the data set into a training set and a test set.
library(ISLR2)
carseats <- Carseats
set.seed(1)
train <- sample(1:nrow(carseats), 200)
test <- Carseats[-train, ]
library(tree)
tree <- tree(Sales ~., data = carseats, subset = train)
summary(tree)
##
## Regression tree:
## tree(formula = Sales ~ ., data = carseats, subset = train)
## Variables actually used in tree construction:
## [1] "ShelveLoc" "Price" "Age" "Advertising" "CompPrice"
## [6] "US"
## Number of terminal nodes: 18
## Residual mean deviance: 2.167 = 394.3 / 182
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -3.88200 -0.88200 -0.08712 0.00000 0.89590 4.09900
plot(tree)
text(tree, pretty = 0)
treepred <- predict(tree, test)
plot(treepred , test$Sales)
abline (0, 1)
mean((test$Sales - treepred)^2)
## [1] 4.922039
# The test MSE of the unpruned regression tree is about 4.92.
set.seed(3)
cv <- cv.tree(tree)
min <- which.min(cv$dev)
min
## [1] 1
plot(cv$size, cv$dev, type = "b")
points(cv$size[min], cv$dev[min], col = "red", cex = 2, pch = 20)
# which.min() returns index 1, i.e. the largest tree in the CV sequence (the
# unpruned 18-node tree), so cross-validation does not clearly favour pruning
# here. We still prune to 9 terminal nodes to see how a smaller tree performs.
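# A minimal sketch (reusing cv from above): the CV-selected size can be read
# off programmatically instead of by eye; here it points to the full tree, so
# a 9-node tree is chosen by hand below instead.
best.size <- cv$size[which.min(cv$dev)]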
prune <- prune.tree(tree, best = 9)
plot(prune)
text(prune, pretty = 0)
predprune <- predict(prune, test)
plot(predprune , test$Sales)
abline (0, 1)
mean((test$Sales - predprune)^2)
## [1] 4.918134
# Pruning barely changes the test MSE (4.92 unpruned vs. 4.92 pruned, a difference of less than 0.01), so it does not meaningfully improve the test error. The plot of predictions against observed Sales looks essentially the same as for the unpruned tree.
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
# Bagging: consider all 10 predictors at every split (mtry = p = 10).
bag <- randomForest(Sales ~ ., data = carseats, subset = train, mtry = 10, importance = TRUE)
predbag <- predict(bag, newdata = test)
plot(predbag , test$Sales)
abline (0, 1)
mean((predbag - test$Sales)^2)
## [1] 2.617413
importance(bag)
## %IncMSE IncNodePurity
## CompPrice 25.8395521 173.028616
## Income 3.7400180 88.246346
## Advertising 11.5586859 102.750315
## Population -4.3487076 60.585131
## Price 53.0886784 510.532384
## ShelveLoc 46.4358875 372.977805
## Age 16.4441978 156.566479
## Education 2.3618726 45.308113
## Urban -0.5679812 9.234006
## US 4.9344961 16.531234
# Bagging lowers the test MSE to about 2.62, roughly half that of the single tree. The importance() output shows that Price and ShelveLoc are by far the most important variables, followed by CompPrice, Age, and Advertising. The plot shows a fairly even scatter of predictions around the 45-degree line, although a few observations are still predicted poorly.
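# A minimal sketch (reusing the bag fit above): varImpPlot() from randomForest
# draws the two importance measures side by side, which is easier to read than
# the raw importance() table.
varImpPlot(bag)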
set.seed (1)
rf <- randomForest(Sales ~., data = carseats, subset = train, mtry = 5, importance = TRUE)
predrf <- predict(rf, newdata = test)
plot(predrf, test$Sales)
abline (0, 1)
mean((predrf - test$Sales)^2)
## [1] 2.714168
importance(rf)
## %IncMSE IncNodePurity
## CompPrice 17.4126238 157.53631
## Income 2.9969399 110.40731
## Advertising 11.0485672 105.75049
## Population -1.5321044 80.73318
## Price 43.3572135 452.02367
## ShelveLoc 44.4474163 331.64508
## Age 14.5322339 176.64252
## Education 0.8237454 55.91141
## Urban -2.7805788 11.07321
## US 3.7773881 23.75322
# With mtry = 5 the random forest gives a test MSE of about 2.71, slightly worse than bagging on this split. The plot of predictions against observed Sales again shows a fairly even spread around the line, with a few outliers. ShelveLoc has the highest %IncMSE, followed closely by Price and then by CompPrice, Age, and Advertising.
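# A minimal sketch (same training split as above): the effect of the number of
# variables considered at each split can be examined by refitting the forest
# over a small grid of mtry values and recording the test MSE for each.
mtry.grid <- c(3, 5, 7, 10)
mse.by.mtry <- sapply(mtry.grid, function(m) {
  fit <- randomForest(Sales ~ ., data = carseats, subset = train, mtry = m)
  mean((predict(fit, newdata = carseats[-train, ]) - carseats$Sales[-train])^2)
})
cbind(mtry = mtry.grid, test.mse = mse.by.mtry)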
# Note: this replaces test (until now the held-out data frame) with a random permutation of 1:200, so the x[-test, ] indexing below drops rows 1-200 of carseats rather than recovering the original test split.
test <- sample(1:nrow(test), 200)
library(BART)
## Loading required package: nlme
## Loading required package: nnet
## Loading required package: survival
x <- carseats[, c(1:11)] # note: columns 1:11 include the response Sales
y <- carseats[, "Sales"]
xtrain <- x[train, ]
ytrain <- y[train]
xtest <- x[-test, ]
ytest <- y[-test]
set.seed (1)
bartfit <- gbart(xtrain, ytrain, x.test = xtest)
## *****Calling gbart: type=1
## *****Data:
## data:n,p,np: 200, 15, 200
## y1,yn: 2.781850, 1.091850
## x1,x[n*p]: 10.360000, 1.000000
## xp1,xp[np*p]: 5.560000, 1.000000
## *****Number of Trees: 200
## *****Number of Cut Points: 100 ... 1
## *****burn,nd,thin: 100,1000,1
## *****Prior:beta,alpha,tau,nu,lambda,offset: 2,0.95,0.273474,3,4.01382e-30,7.57815
## *****sigma: 0.000000
## *****w (weights): 1.000000 ... 1.000000
## *****Dirichlet:sparse,theta,omega,a,b,rho,augment: 0,0,1,0.5,1,15,0
## *****printevery: 100
##
## MCMC
## done 0 (out of 1100)
## done 100 (out of 1100)
## done 200 (out of 1100)
## done 300 (out of 1100)
## done 400 (out of 1100)
## done 500 (out of 1100)
## done 600 (out of 1100)
## done 700 (out of 1100)
## done 800 (out of 1100)
## done 900 (out of 1100)
## done 1000 (out of 1100)
## time: 3s
## trcnt,tecnt: 1000,1000
yhat.bart <- bartfit$yhat.test.mean
mean((ytest - yhat.bart)^2)
## [1] 0.08306884
plot(yhat.bart, ytest)
abline (0, 1)
ord <- order(bartfit$varcount.mean , decreasing = T)
bartfit$varcount.mean[ord]
## Sales ShelveLoc3 ShelveLoc2 US2 ShelveLoc1 Urban2
## 32.954 22.459 21.159 18.784 18.752 18.390
## Urban1 US1 Advertising Price Education CompPrice
## 15.512 14.402 13.118 10.007 8.873 7.820
## Age Population Income
## 7.394 6.673 6.099
# The reported test MSE of 0.083 is far lower than any of the tree-based fits above, but it should not be taken at face value: the predictor matrix built from columns 1:11 includes Sales itself (it appears at the top of the variable counts above), and x[-test, ] does not correspond to the original held-out split. With the response available as a predictor, BART essentially reproduces Sales, which explains the near-zero error.
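# A minimal corrected sketch (assuming the same train indices as above): drop
# Sales from the predictor matrix and score on the original held-out rows,
# following the gbart() pattern from the ISLR2 lab. The resulting test MSE
# will differ from the 0.083 reported above.
x2 <- carseats[, -1]  # all predictors, with the response Sales excluded
y2 <- carseats$Sales
set.seed(1)
bartfit2 <- gbart(x2[train, ], y2[train], x.test = x2[-train, ])
yhat2 <- bartfit2$yhat.test.mean
mean((y2[-train] - yhat2)^2)  # test MSE on the true held-out observations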
### Number 9 ## (a) Create a training set of 800 random observations from the OJ data and a test set containing the remaining observations.
set.seed(1)
library(ISLR2)
oj <- OJ
train <- sample(1:nrow(oj), 800)
test <- oj[-train, ]
library(tree)
tree <- tree(Purchase~., data = oj, subset = train)
summary(tree)
##
## Classification tree:
## tree(formula = Purchase ~ ., data = oj, subset = train)
## Variables actually used in tree construction:
## [1] "LoyalCH" "PriceDiff" "SpecialCH" "ListPriceDiff"
## [5] "PctDiscMM"
## Number of terminal nodes: 9
## Residual mean deviance: 0.7432 = 587.8 / 791
## Misclassification error rate: 0.1588 = 127 / 800
# The tree has 9 terminal nodes and a training misclassification error rate of about 15.9% (127/800).
tree
## node), split, n, deviance, yval, (yprob)
## * denotes terminal node
##
## 1) root 800 1073.00 CH ( 0.60625 0.39375 )
## 2) LoyalCH < 0.5036 365 441.60 MM ( 0.29315 0.70685 )
## 4) LoyalCH < 0.280875 177 140.50 MM ( 0.13559 0.86441 )
## 8) LoyalCH < 0.0356415 59 10.14 MM ( 0.01695 0.98305 ) *
## 9) LoyalCH > 0.0356415 118 116.40 MM ( 0.19492 0.80508 ) *
## 5) LoyalCH > 0.280875 188 258.00 MM ( 0.44149 0.55851 )
## 10) PriceDiff < 0.05 79 84.79 MM ( 0.22785 0.77215 )
## 20) SpecialCH < 0.5 64 51.98 MM ( 0.14062 0.85938 ) *
## 21) SpecialCH > 0.5 15 20.19 CH ( 0.60000 0.40000 ) *
## 11) PriceDiff > 0.05 109 147.00 CH ( 0.59633 0.40367 ) *
## 3) LoyalCH > 0.5036 435 337.90 CH ( 0.86897 0.13103 )
## 6) LoyalCH < 0.764572 174 201.00 CH ( 0.73563 0.26437 )
## 12) ListPriceDiff < 0.235 72 99.81 MM ( 0.50000 0.50000 )
## 24) PctDiscMM < 0.196196 55 73.14 CH ( 0.61818 0.38182 ) *
## 25) PctDiscMM > 0.196196 17 12.32 MM ( 0.11765 0.88235 ) *
## 13) ListPriceDiff > 0.235 102 65.43 CH ( 0.90196 0.09804 ) *
## 7) LoyalCH > 0.764572 261 91.20 CH ( 0.95785 0.04215 ) *
# I choose terminal node 8, reached when LoyalCH < 0.280875 and then LoyalCH < 0.0356415. It contains 59 observations with a deviance of 10.14; about 98% of these purchases are MM and only about 2% are CH, so the node predicts MM with high confidence.
plot(tree)
text(tree, pretty = 0)
# The plot shows the fitted tree: LoyalCH dominates the upper splits (the top three levels all split on it), while PriceDiff, SpecialCH, ListPriceDiff, and PctDiscMM only appear further down. Observations move to the left branch when the split condition holds and to the right branch otherwise.
purchasetesting <- oj$Purchase[-train]
treepred <- predict(tree, test, type = "class")
table(treepred, purchasetesting)
## purchasetesting
## treepred CH MM
## CH 160 38
## MM 8 64
(38 + 8) / 270
## [1] 0.1703704
# The tree correctly predicts 160 CH purchases and 64 MM purchases; 38 + 8 = 46 of the 270 test observations are misclassified, giving a test error rate of about 17.0%.
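# A minimal sketch (reusing the objects above): the test error rate can also
# be computed directly from the confusion matrix instead of typing the counts
# in by hand.
conf <- table(treepred, purchasetesting)
1 - sum(diag(conf)) / sum(conf)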
cv <- cv.tree(tree, FUN = prune.misclass)
cv
## $size
## [1] 9 8 7 4 2 1
##
## $dev
## [1] 150 150 149 158 172 315
##
## $k
## [1] -Inf 0.000000 3.000000 4.333333 10.500000 151.000000
##
## $method
## [1] "misclass"
##
## attr(,"class")
## [1] "prune" "tree.sequence"
# The lowest cross-validation deviance (149) occurs at a tree size of 7, with sizes 8 and 9 nearly identical (150).
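# A minimal sketch (reusing cv from above): the CV-selected size can be pulled
# out programmatically; for the output above this gives 7.
cv$size[which.min(cv$dev)]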
plot(cv$size, cv$dev, type = "b", xlab = "size", ylab = "dev")
# The plot confirms that the cross-validated deviance is lowest around a tree size of 7 and rises sharply for very small trees. Below, the tree is pruned to 6 terminal nodes, slightly smaller than the CV optimum.
prune <- prune.tree(tree,best=6)
summary(tree)
##
## Classification tree:
## tree(formula = Purchase ~ ., data = oj, subset = train)
## Variables actually used in tree construction:
## [1] "LoyalCH" "PriceDiff" "SpecialCH" "ListPriceDiff"
## [5] "PctDiscMM"
## Number of terminal nodes: 9
## Residual mean deviance: 0.7432 = 587.8 / 791
## Misclassification error rate: 0.1588 = 127 / 800
summary(prune)
##
## Classification tree:
## snip.tree(tree = tree, nodes = c(10L, 4L, 12L))
## Variables actually used in tree construction:
## [1] "LoyalCH" "PriceDiff" "ListPriceDiff"
## Number of terminal nodes: 6
## Residual mean deviance: 0.7919 = 628.8 / 794
## Misclassification error rate: 0.1788 = 143 / 800
# The training error rate of the pruned tree (17.9%) is higher than that of the unpruned tree (15.9%), as expected, since pruning removes splits that helped fit the training data.
mean(treepred!= test$Purchase)
## [1] 0.1703704
mean(predict(prune, test, type = "class")!= test$Purchase)
## [1] 0.1851852
# The test error rate of the unpruned tree is about 17.0% and that of the pruned tree is about 18.5%, so the pruned tree has the higher test error on this split.
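# A minimal sketch (numbers taken from the output above): collecting the
# training and test misclassification rates of the two trees in one small
# table makes the comparison explicit.
data.frame(
  tree  = c("unpruned (9 nodes)", "pruned (6 nodes)"),
  train = c(0.1588, 0.1788),
  test  = c(0.1704, 0.1852)
)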