library(ISLR2)
library(tree)
library(rpart)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
library(BART)
## Loading required package: nlme
## Loading required package: survival
##
## Attaching package: 'survival'
## The following object is masked from 'package:caret':
##
## cluster
Consider the Gini index, classification error, and entropy in a simple classification setting with two classes. Create a single plot that displays each of these quantities as a function of p̂_m1. The x-axis should display p̂_m1, ranging from 0 to 1, and the y-axis should display the value of the Gini index, classification error, and entropy. (Hint: In a setting with two classes, p̂_m1 = 1 − p̂_m2. You could make this plot by hand, but it will be much easier to make in R.)
par(mar = c(5, 4, 4, 8), xpd = TRUE)
p = seq(0, 1, 0.001)
gini.index = 2 * p * (1 - p)
class.error = 1 - pmax(p, 1 - p)
cross.entropy = - (p * log(p) + (1 - p) * log(1 - p)) # NaN at p = 0 and p = 1; matplot leaves those points out
matplot(p, cbind(gini.index, class.error, cross.entropy), type = "l", lty = 1, lwd = 1, col = c("red", "blue", "green"), xlab = "p_m1", ylab = "gini.index, class.error, cross.entropy")
legend("topright", inset = c(-0.25, 0),
legend = c("Gini Index", "Classification Error", "Cross Entropy"),
col = c("red", "blue", "green"), lty = 1, cex = 0.8)
In the lab, a classification tree was applied to the Carseats data set after converting Sales into a qualitative response variable. Now we will seek to predict Sales using regression trees and related approaches, treating the response as a quantitative variable.
d1 = Carseats
head(d1)
## Sales CompPrice Income Advertising Population Price ShelveLoc Age Education
## 1 9.50 138 73 11 276 120 Bad 42 17
## 2 11.22 111 48 16 260 83 Good 65 10
## 3 10.06 113 35 10 269 80 Medium 59 12
## 4 7.40 117 100 4 466 97 Medium 55 14
## 5 4.15 141 64 3 340 128 Bad 38 13
## 6 10.81 124 113 13 501 72 Bad 78 16
## Urban US
## 1 Yes Yes
## 2 Yes Yes
## 3 Yes Yes
## 4 Yes Yes
## 5 Yes No
## 6 No Yes
str(d1)
## 'data.frame': 400 obs. of 11 variables:
## $ Sales : num 9.5 11.22 10.06 7.4 4.15 ...
## $ CompPrice : num 138 111 113 117 141 124 115 136 132 132 ...
## $ Income : num 73 48 35 100 64 113 105 81 110 113 ...
## $ Advertising: num 11 16 10 4 3 13 0 15 0 0 ...
## $ Population : num 276 260 269 466 340 501 45 425 108 131 ...
## $ Price : num 120 83 80 97 128 72 108 120 124 124 ...
## $ ShelveLoc : Factor w/ 3 levels "Bad","Good","Medium": 1 2 3 3 1 1 3 2 3 3 ...
## $ Age : num 42 65 59 55 38 78 71 67 76 76 ...
## $ Education : num 17 10 12 14 13 16 15 10 10 17 ...
## $ Urban : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 1 2 2 1 1 ...
## $ US : Factor w/ 2 levels "No","Yes": 2 2 2 2 1 2 1 2 1 2 ...
set.seed(42)
carseat_train_index <- createDataPartition(d1$Sales, p = 0.7, list = FALSE)
carseat_train_data <- d1[carseat_train_index, ]
carseat_test_data <- d1[-carseat_train_index, ]
carseat_rt <- tree(Sales ~ ., data = carseat_train_data)
summary(carseat_rt)
##
## Regression tree:
## tree(formula = Sales ~ ., data = carseat_train_data)
## Variables actually used in tree construction:
## [1] "ShelveLoc" "Price" "CompPrice" "Age" "Advertising"
## [6] "Education"
## Number of terminal nodes: 17
## Residual mean deviance: 2.46 = 649.5 / 264
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -4.91700 -1.00100 0.01913 0.00000 0.96830 4.18100
plot(carseat_rt)
text(carseat_rt, pretty = 0)
carseat.pred = predict(carseat_rt, newdata = carseat_test_data)
mean((carseat.pred - carseat_test_data$Sales)^2)
## [1] 4.421924
I got a test MSE of 4.421924 from the unpruned regression tree.
set.seed(42)
cv_carseat <- cv.tree(carseat_rt)
plot(cv_carseat$size, cv_carseat$dev, type = "b", xlab = "Tree Size (Number of Terminal Nodes)", ylab = "Deviance")
best_index <- which.min(cv_carseat$dev)
cv_carseat$size[best_index]
## [1] 9
carseats.prune = prune.tree(carseat_rt, best = 9)
summary(carseats.prune)
##
## Regression tree:
## snip.tree(tree = carseat_rt, nodes = c(6L, 14L, 10L, 16L, 23L
## ))
## Variables actually used in tree construction:
## [1] "ShelveLoc" "Price" "CompPrice" "Advertising"
## Number of terminal nodes: 9
## Residual mean deviance: 3.441 = 936 / 272
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -6.80400 -1.16400 -0.09413 0.00000 1.22500 5.58400
plot(carseats.prune)
text(carseats.prune, pretty = 0)
carseat.pred = predict(carseats.prune, newdata = carseat_test_data)
mean((carseat.pred - carseat_test_data$Sales)^2)
## [1] 4.23625
In this case, pruning improved the regression tree: the test MSE decreased from 4.421924 to 4.23625 when the tree was pruned back to 9 terminal nodes.
set.seed(42)
carseats.bag <- randomForest(Sales ~ ., data = carseat_train_data, mtry = 10, importance = TRUE)
carseat.pred = predict(carseats.bag, newdata = carseat_test_data)
mean((carseat.pred - carseat_test_data$Sales)^2)
## [1] 2.063321
I got a test MSE of 2.063321.
importance(carseats.bag)
## %IncMSE IncNodePurity
## CompPrice 32.066701 251.211444
## Income 7.807017 117.174920
## Advertising 22.253304 162.647032
## Population -1.420411 67.974963
## Price 66.328927 645.348379
## ShelveLoc 68.144349 770.526801
## Age 20.056238 175.868521
## Education 4.409980 53.236309
## Urban -0.421840 11.171375
## US 2.450300 9.298957
Looking at the variable importance measures, we can see that Price and ShelveLoc are the most important variables, with CompPrice the next most important after those two.
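As an optional sketch, the same %IncMSE and IncNodePurity values shown above can be plotted with varImpPlot() from the randomForest package:
varImpPlot(carseats.bag, main = "Bagging: variable importance")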
set.seed(42)
carseats.rf <- randomForest(Sales ~ ., data = carseat_train_data, mtry = 3, importance = TRUE)
carseat.pred = predict(carseats.rf, newdata = carseat_test_data)
mean((carseat.pred - carseat_test_data$Sales)^2)
## [1] 2.403375
I got a test MSE of 2.403375, which is slightly worse than the bagging model.
importance(carseats.rf)
## %IncMSE IncNodePurity
## CompPrice 18.7356408 215.35309
## Income 6.6339752 173.26772
## Advertising 13.7031915 189.72684
## Population -1.5548213 135.25324
## Price 42.5710171 529.19502
## ShelveLoc 49.5414748 585.17801
## Age 14.6065870 238.28702
## Education 0.7162662 90.63694
## Urban -0.4856681 22.95345
## US 4.6618975 28.30025
We can see that Price and ShelveLoc are still the two most important variables. I chose mtry = 3 because of the rule of thumb that, for regression problems, mtry should be roughly one third of the number of predictors (here 10/3 ≈ 3). However, in this case the test MSE ended up slightly worse than bagging. When I tested larger values of mtry, the test MSE gradually approached the MSE of the bagging model (see the sketch below).
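A rough sketch of that check (not run above): it refits a forest for each value of mtry on the same train/test split, so the exact numbers will depend on the seed.
set.seed(42)
mtry.grid <- 1:10
test.mse <- sapply(mtry.grid, function(m) {
  # refit a random forest with m candidate predictors per split
  fit <- randomForest(Sales ~ ., data = carseat_train_data, mtry = m)
  pred <- predict(fit, newdata = carseat_test_data)
  mean((pred - carseat_test_data$Sales)^2)
})
plot(mtry.grid, test.mse, type = "b", xlab = "mtry", ylab = "Test MSE")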
x <- d1[, 2:11]
y <- d1[, "Sales"]
x_train <- x[carseat_train_index, ]
y_train <- y[carseat_train_index]
x_test <- x[-carseat_train_index, ]
y_test <- y[-carseat_train_index]
set.seed(42)
carseats.bart <- gbart(x_train, y_train, x.test = x_test)
## *****Calling gbart: type=1
## *****Data:
## data:n,p,np: 281, 14, 119
## y1,yn: 3.685623, 2.175623
## x1,x[n*p]: 111.000000, 1.000000
## xp1,xp[np*p]: 138.000000, 1.000000
## *****Number of Trees: 200
## *****Number of Cut Points: 68 ... 1
## *****burn,nd,thin: 100,1000,1
## *****Prior:beta,alpha,tau,nu,lambda,offset: 2,0.95,0.287616,3,0.196244,7.53438
## *****sigma: 1.003722
## *****w (weights): 1.000000 ... 1.000000
## *****Dirichlet:sparse,theta,omega,a,b,rho,augment: 0,0,1,0.5,1,14,0
## *****printevery: 100
##
## MCMC
## done 0 (out of 1100)
## done 100 (out of 1100)
## done 200 (out of 1100)
## done 300 (out of 1100)
## done 400 (out of 1100)
## done 500 (out of 1100)
## done 600 (out of 1100)
## done 700 (out of 1100)
## done 800 (out of 1100)
## done 900 (out of 1100)
## done 1000 (out of 1100)
## time: 2s
## trcnt,tecnt: 1000,1000
yhat.bart <- carseats.bart$yhat.test.mean
mean((y_test - yhat.bart)^2)
## [1] 1.472166
After fitting a BART model, I got a test MSE of 1.472166. This beats all of the previous models by a wide margin.
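For reference, a small recap of the test MSEs reported above (the values are copied from the output, not recomputed):
data.frame(
  model = c("Regression tree", "Pruned tree (9 nodes)", "Bagging", "Random forest (mtry = 3)", "BART"),
  test.mse = c(4.421924, 4.23625, 2.063321, 2.403375, 1.472166)
)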
This problem involves the OJ data set, which is part of the ISLR2 package.
d2 = OJ
head(d2)
## Purchase WeekofPurchase StoreID PriceCH PriceMM DiscCH DiscMM SpecialCH
## 1 CH 237 1 1.75 1.99 0.00 0.0 0
## 2 CH 239 1 1.75 1.99 0.00 0.3 0
## 3 CH 245 1 1.86 2.09 0.17 0.0 0
## 4 MM 227 1 1.69 1.69 0.00 0.0 0
## 5 CH 228 7 1.69 1.69 0.00 0.0 0
## 6 CH 230 7 1.69 1.99 0.00 0.0 0
## SpecialMM LoyalCH SalePriceMM SalePriceCH PriceDiff Store7 PctDiscMM
## 1 0 0.500000 1.99 1.75 0.24 No 0.000000
## 2 1 0.600000 1.69 1.75 -0.06 No 0.150754
## 3 0 0.680000 2.09 1.69 0.40 No 0.000000
## 4 0 0.400000 1.69 1.69 0.00 No 0.000000
## 5 0 0.956535 1.69 1.69 0.00 Yes 0.000000
## 6 1 0.965228 1.99 1.69 0.30 Yes 0.000000
## PctDiscCH ListPriceDiff STORE
## 1 0.000000 0.24 1
## 2 0.000000 0.24 1
## 3 0.091398 0.23 1
## 4 0.000000 0.00 1
## 5 0.000000 0.00 0
## 6 0.000000 0.30 0
str(d2)
## 'data.frame': 1070 obs. of 18 variables:
## $ Purchase : Factor w/ 2 levels "CH","MM": 1 1 1 2 1 1 1 1 1 1 ...
## $ WeekofPurchase: num 237 239 245 227 228 230 232 234 235 238 ...
## $ StoreID : num 1 1 1 1 7 7 7 7 7 7 ...
## $ PriceCH : num 1.75 1.75 1.86 1.69 1.69 1.69 1.69 1.75 1.75 1.75 ...
## $ PriceMM : num 1.99 1.99 2.09 1.69 1.69 1.99 1.99 1.99 1.99 1.99 ...
## $ DiscCH : num 0 0 0.17 0 0 0 0 0 0 0 ...
## $ DiscMM : num 0 0.3 0 0 0 0 0.4 0.4 0.4 0.4 ...
## $ SpecialCH : num 0 0 0 0 0 0 1 1 0 0 ...
## $ SpecialMM : num 0 1 0 0 0 1 1 0 0 0 ...
## $ LoyalCH : num 0.5 0.6 0.68 0.4 0.957 ...
## $ SalePriceMM : num 1.99 1.69 2.09 1.69 1.69 1.99 1.59 1.59 1.59 1.59 ...
## $ SalePriceCH : num 1.75 1.75 1.69 1.69 1.69 1.69 1.69 1.75 1.75 1.75 ...
## $ PriceDiff : num 0.24 -0.06 0.4 0 0 0.3 -0.1 -0.16 -0.16 -0.16 ...
## $ Store7 : Factor w/ 2 levels "No","Yes": 1 1 1 1 2 2 2 2 2 2 ...
## $ PctDiscMM : num 0 0.151 0 0 0 ...
## $ PctDiscCH : num 0 0 0.0914 0 0 ...
## $ ListPriceDiff : num 0.24 0.24 0.23 0 0 0.3 0.3 0.24 0.24 0.24 ...
## $ STORE : num 1 1 1 1 0 0 0 0 0 0 ...
set.seed(42)
oj_train_index <- sample(1:nrow(d2), 800)
oj_train_data <- d2[oj_train_index, ]
oj_test_data <- d2[-oj_train_index, ]
oj_ct <- tree(Purchase ~ ., data = oj_train_data)
summary(oj_ct)
##
## Classification tree:
## tree(formula = Purchase ~ ., data = oj_train_data)
## Variables actually used in tree construction:
## [1] "LoyalCH" "SalePriceMM" "PriceDiff"
## Number of terminal nodes: 8
## Residual mean deviance: 0.7392 = 585.5 / 792
## Misclassification error rate: 0.1638 = 131 / 800
The training error rate is 16.38%, and the tree has 8 terminal nodes.
oj_ct
## node), split, n, deviance, yval, (yprob)
## * denotes terminal node
##
## 1) root 800 1066.00 CH ( 0.61500 0.38500 )
## 2) LoyalCH < 0.48285 285 296.00 MM ( 0.21404 0.78596 )
## 4) LoyalCH < 0.064156 64 0.00 MM ( 0.00000 1.00000 ) *
## 5) LoyalCH > 0.064156 221 260.40 MM ( 0.27602 0.72398 )
## 10) SalePriceMM < 2.04 128 123.50 MM ( 0.18750 0.81250 ) *
## 11) SalePriceMM > 2.04 93 125.00 MM ( 0.39785 0.60215 ) *
## 3) LoyalCH > 0.48285 515 458.10 CH ( 0.83689 0.16311 )
## 6) LoyalCH < 0.753545 230 282.70 CH ( 0.69565 0.30435 )
## 12) PriceDiff < 0.265 149 203.00 CH ( 0.57718 0.42282 )
## 24) PriceDiff < -0.165 32 38.02 MM ( 0.28125 0.71875 ) *
## 25) PriceDiff > -0.165 117 150.30 CH ( 0.65812 0.34188 )
## 50) LoyalCH < 0.703993 105 139.60 CH ( 0.61905 0.38095 ) *
## 51) LoyalCH > 0.703993 12 0.00 CH ( 1.00000 0.00000 ) *
## 13) PriceDiff > 0.265 81 47.66 CH ( 0.91358 0.08642 ) *
## 7) LoyalCH > 0.753545 285 111.70 CH ( 0.95088 0.04912 ) *
Consider terminal node 4 in the output above (the line beginning "4)", marked with an asterisk). It contains the 64 training observations with LoyalCH < 0.064156, and its deviance is 0. The predicted class is MM: every observation in this node purchased MM, so the fitted probabilities are 0% for CH and 100% for MM.
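As a small optional sketch, the terminal nodes can also be listed directly from the tree object's frame component (documented in ?tree.object); the row names are the node numbers printed above.
# keep only terminal nodes ("<leaf>") and show their size, deviance, and predicted class
leaves <- oj_ct$frame[oj_ct$frame$var == "<leaf>", c("n", "dev", "yval")]
leaves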
plot(oj_ct)
text(oj_ct, pretty = 0)
Looking at the plot above, we can see that LoyalCH, SalePriceMM, and PriceDiff are the three variables used in the tree. LoyalCH is the most important variable, as it is the one used at the root node.
oj.pred <- predict(oj_ct, oj_test_data, type = "class")
confusionMatrix(oj.pred, oj_test_data$Purchase, positive = "MM")
## Confusion Matrix and Statistics
##
## Reference
## Prediction CH MM
## CH 125 15
## MM 36 94
##
## Accuracy : 0.8111
## 95% CI : (0.7592, 0.856)
## No Information Rate : 0.5963
## P-Value [Acc > NIR] : 3.4e-14
##
## Kappa : 0.6195
##
## Mcnemar's Test P-Value : 0.005101
##
## Sensitivity : 0.8624
## Specificity : 0.7764
## Pos Pred Value : 0.7231
## Neg Pred Value : 0.8929
## Prevalence : 0.4037
## Detection Rate : 0.3481
## Detection Prevalence : 0.4815
## Balanced Accuracy : 0.8194
##
## 'Positive' Class : MM
##
The confusion matrix reports a test accuracy of 0.8111, so the test error rate is 1 - 0.8111 = 0.1889.
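The same error rate can also be computed directly from the predictions, as a sanity check on the 1 - accuracy calculation above:
mean(oj.pred != oj_test_data$Purchase)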
set.seed(42)
cv_oj <- cv.tree(oj_ct, FUN = prune.misclass)
plot(cv_oj$size, cv_oj$dev, type = "b", xlab = "Tree Size (Number of Terminal Nodes)", ylab = "Classification Error")
A tree size of 5 corresponds to the lowest cross-validation classification error.
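This can also be read off programmatically, mirroring the step used for the regression tree above (which.min() returns the first minimum if several sizes tie):
cv_oj$size[which.min(cv_oj$dev)]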
oj.prune = prune.tree(oj_ct, best = 5)
summary(oj.prune)
##
## Classification tree:
## snip.tree(tree = oj_ct, nodes = c(5L, 12L))
## Variables actually used in tree construction:
## [1] "LoyalCH" "PriceDiff"
## Number of terminal nodes: 5
## Residual mean deviance: 0.7833 = 622.7 / 795
## Misclassification error rate: 0.1812 = 145 / 800
plot(oj.prune)
text(oj.prune, pretty = 0)
The training error rate for the pruned tree is 18.12%, which is higher than the 16.38% training error of the unpruned tree.
oj.pred <- predict(oj.prune, oj_test_data, type = "class")
confusionMatrix(oj.pred, oj_test_data$Purchase, positive = "MM")
## Confusion Matrix and Statistics
##
## Reference
## Prediction CH MM
## CH 128 26
## MM 33 83
##
## Accuracy : 0.7815
## 95% CI : (0.7274, 0.8293)
## No Information Rate : 0.5963
## P-Value [Acc > NIR] : 8.681e-11
##
## Kappa : 0.5508
##
## Mcnemar's Test P-Value : 0.4347
##
## Sensitivity : 0.7615
## Specificity : 0.7950
## Pos Pred Value : 0.7155
## Neg Pred Value : 0.8312
## Prevalence : 0.4037
## Detection Rate : 0.3074
## Detection Prevalence : 0.4296
## Balanced Accuracy : 0.7782
##
## 'Positive' Class : MM
##
The confusion matrix shows that the test error is 1 - 0.7815 = 0.2185. This again is higher than the test error for the unpruned tree, which was 0.1889.
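As a closing sketch, the two test error rates can be compared side by side (recomputing the unpruned predictions, since oj.pred now holds the pruned ones):
err.unpruned <- mean(predict(oj_ct, oj_test_data, type = "class") != oj_test_data$Purchase)
err.pruned <- mean(predict(oj.prune, oj_test_data, type = "class") != oj_test_data$Purchase)
c(unpruned = err.unpruned, pruned = err.pruned)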