p_m1 <- seq(from = 0, to = 1, length.out = 100)
gini_index <- 2 * p_m1 * (1 - p_m1)  # Gini index = sum_k p_mk (1 - p_mk) = 2 p (1 - p) for two classes
plot(p_m1, gini_index, type = 'l', main = "Gini Index vs p_m1")
p_m1 <- seq(from = 0, to = 1, length.out = 100)
classification_error <- 1 - pmax(p_m1, 1-p_m1)
plot(p_m1, classification_error, type = 'l', main = "Classification Error vs p_m1")
p_m1 <- seq(from = 0, to = 1, length.out = 100)
cross_entropy <- - p_m1 * log(p_m1) - (1 - p_m1) * log(1 - p_m1)  # NaN at p_m1 = 0 and 1; those points are simply not drawn
plot(p_m1, cross_entropy, type = 'l', main = "Cross Entropy vs p_m1")
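For comparison, the three impurity measures can also be drawn on a single plot; this is an optional sketch reusing the same grid (the cross-entropy is NaN at the endpoints, so those points are not drawn).
p_m1 <- seq(from = 0, to = 1, length.out = 100)
impurity <- cbind(gini = 2 * p_m1 * (1 - p_m1),
                  class_error = 1 - pmax(p_m1, 1 - p_m1),
                  cross_entropy = -p_m1 * log(p_m1) - (1 - p_m1) * log(1 - p_m1))
matplot(p_m1, impurity, type = "l", lty = 1, col = c("red", "blue", "darkgreen"),
        xlab = "p_m1", ylab = "Impurity", main = "Node impurity measures")
legend("top", legend = colnames(impurity), lty = 1, col = c("red", "blue", "darkgreen"))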
In the lab, a classification tree was applied to the Carseats data set after converting Sales into a qualitative response variable. Now we will seek to predict Sales using regression trees and related approaches, treating the response as a quantitative variable.
library(tree)
library(ISLR2)
library(dplyr)   # for the %>% pipe and select()
attach(Carseats)
Carseats <- Carseats %>%
  dplyr::select(Sales, CompPrice, Income, Advertising, Population, Price,
                ShelveLoc, Age, Education, Urban, US)
set.seed(1)
train <- sample(1:nrow(Carseats), nrow(Carseats)/2)
tree.carseats <- tree(Sales ~ ., Carseats, subset = train)
summary(tree.carseats)
Regression tree:
tree(formula = Sales ~ ., data = Carseats, subset = train)
Variables actually used in tree construction:
[1] "ShelveLoc" "Price" "Age" "Advertising" "CompPrice" "US"
Number of terminal nodes: 18
Residual mean deviance: 2.167 = 394.3 / 182
Distribution of residuals:
Min. 1st Qu. Median Mean 3rd Qu. Max.
-3.88200 -0.88200 -0.08712 0.00000 0.89590 4.09900
plot(tree.carseats)
text(tree.carseats, pretty = 0)
Comments: The variables actually used in constructing the tree are ShelveLoc, Price, Age, Advertising, CompPrice, and US, and the fitted tree has 18 terminal nodes with a residual mean deviance of about 2.17.
yhat <- predict(tree.carseats, newdata = Carseats[-train, ])
carseats.test <- Carseats[-train, "Sales"]
plot(yhat, carseats.test)
abline(0, 1)
mean((yhat-carseats.test)^2)
[1] 4.922039
sqrt(mean((yhat-carseats.test)^2))
[1] 2.218567
Comments: The test MSE is about 4.92, and its square root is about 2.22. Since Sales is recorded in thousands of units, the test predictions are (on average) within roughly 2,220 unit sales of the true value.
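To put that error in context, a quick look at the scale of the response (nothing here beyond base R summaries of the observed Sales values):
range(Carseats$Sales)     # smallest and largest observed Sales (in thousands of units)
summary(Carseats$Sales)   # five-number summary plus the mean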
cv.carseats <- cv.tree(tree.carseats)
plot(cv.carseats$size, cv.carseats$dev, type = "b")
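The size minimizing the cross-validated deviance can also be read off the cv.carseats object directly; a small sketch (the pruning step below uses best = 11):
best.size <- cv.carseats$size[which.min(cv.carseats$dev)]   # tree size with the lowest CV deviance
best.size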
prune.carseats <- prune.tree(tree.carseats, best = 11)   # prune to an 11-node subtree
plot(prune.carseats)
text(prune.carseats, pretty = 0)
yhat_pru <- predict(prune.carseats, newdata = Carseats[-train, ])
carseats.test <- Carseats[-train, "Sales"]
plot(yhat_pru, carseats.test)
abline(0, 1)
mean((yhat_pru - carseats.test)^2)
[1] 4.757881
Comments: Yes. Pruning the tree to 11 terminal nodes lowers the test MSE slightly, from about 4.92 to about 4.76.
library(randomForest)
set.seed(1)
bag.carseats <- randomForest(Sales ~ ., data = Carseats,
                             subset = train, mtry = 10, importance = TRUE)   # mtry = 10 uses all 10 predictors, i.e. bagging
bag.carseats
Call:
randomForest(formula = Sales ~ ., data = Carseats, mtry = 10, importance = TRUE, subset = train)
Type of random forest: regression
Number of trees: 500
No. of variables tried at each split: 10
Mean of squared residuals: 2.889221
% Var explained: 63.26
yhat.bag <- predict(bag.carseats, newdata = Carseats[-train, ])
plot(yhat.bag, carseats.test)
abline(0, 1)
mean((yhat.bag - carseats.test)^2)
[1] 2.605253
Comments: The test MSE of the bagged regression trees is 2.61, roughly 55% of the 4.76 obtained with the optimally pruned single tree.
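That 55% figure is just the ratio of the two test MSEs, which can be checked directly from the predictions already computed:
mean((yhat.bag - carseats.test)^2) / mean((yhat_pru - carseats.test)^2)   # bagging MSE relative to pruned-tree MSE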
importance(bag.carseats)
%IncMSE IncNodePurity
CompPrice 24.8888481 170.182937
Income 4.7121131 91.264880
Advertising 12.7692401 97.164338
Population -1.8074075 58.244596
Price 56.3326252 502.903407
ShelveLoc 48.8886689 380.032715
Age 17.7275460 157.846774
Education 0.5962186 44.598731
Urban 0.1728373 9.822082
US 4.2172102 18.073863
varImpPlot(bag.carseats)
Comments: The most important variable is Price, the price the company charges for car seats at each site; ShelveLoc, the quality of the shelving location, is a close second.
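Rather than reading the table, the predictors can be ranked directly by the permutation-based measure; a short sketch using the bagged fit above:
sort(importance(bag.carseats)[, "%IncMSE"], decreasing = TRUE)   # predictors ordered by importance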
set.seed(1)
rf.carseats <- randomForest(Sales ~ ., data = Carseats,
subset = train, mtry = 4, importance = TRUE) # p/3 for regression, sqrt(p) for classification
rf.carseats
Call:
randomForest(formula = Sales ~ ., data = Carseats, mtry = 4, importance = TRUE, subset = train)
Type of random forest: regression
Number of trees: 500
No. of variables tried at each split: 4
Mean of squared residuals: 3.140932
% Var explained: 60.06
yhat.rf <- predict(rf.carseats, newdata = Carseats[-train, ])
plot(yhat.rf, carseats.test)
abline(0, 1)
mean((yhat.rf - carseats.test)^2)
[1] 2.787584
Comments: With m = 4 variables considered at each split, the random forest's test MSE (2.79) is slightly larger than that of bagging (2.61).
importance(rf.carseats)
%IncMSE IncNodePurity
CompPrice 15.7891655 160.57944
Income 4.1275509 121.12953
Advertising 9.6425758 111.54581
Population -1.3596645 85.92575
Price 43.4055391 423.06225
ShelveLoc 37.8850232 311.97119
Age 13.8924424 174.18229
Education 0.1960888 62.77782
Urban 0.1393816 12.92952
US 6.3532441 30.42255
varImpPlot(rf.carseats)
Comments: The most important variable is still Price.
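For a quick side-by-side view of how the two ensembles rank the predictors, the permutation importances from the bagging and random-forest fits above can be bound together (a small sketch):
cbind(bagging = importance(bag.carseats)[, "%IncMSE"],
      rf = importance(rf.carseats)[, "%IncMSE"])   # %IncMSE under mtry = 10 vs mtry = 4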
set.seed(1)
mse <- numeric(10)   # test MSE for each value of mtry
for (m in 1:10) {
  rf.carseats <- randomForest(Sales ~ ., data = Carseats, subset = train, mtry = m)
  yhat.rf <- predict(rf.carseats, newdata = Carseats[-train, ])
  mse[m] <- mean((yhat.rf - carseats.test)^2)
}
# Plot
plot(1:10, mse, type = "b", xlab = "mtry", ylab = "Test MSE")
points(which.min(mse), mse[which.min(mse)], col = "red", pch = 19)
mse[8]
[1] 2.580009
Comments: As mtry increases, the random forest's test MSE generally decreases. The smallest test MSE occurs when 8 variables are considered at each split (mtry = 8).
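The same tuning can be done without touching the test set by using the out-of-bag error that randomForest tracks; a sketch under the same training split (rf$mse holds the OOB mean squared error after each tree, so its last element is the OOB MSE of the full 500-tree forest):
set.seed(1)
oob.mse <- numeric(10)
for (m in 1:10) {
  rf.fit <- randomForest(Sales ~ ., data = Carseats, subset = train, mtry = m)
  oob.mse[m] <- rf.fit$mse[rf.fit$ntree]   # OOB MSE using all trees
}
plot(1:10, oob.mse, type = "b", xlab = "mtry", ylab = "OOB MSE")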
library(BART)
x <- Carseats[, 1:10]   # NOTE: after the earlier select(), columns 1:10 still include Sales itself; see the corrected fit after the output below
y <- Carseats[, "Sales"]
xtrain <- x[train, ]
ytrain <- y[train]
xtest <- x[-train, ]
ytest <- y[-train]
set.seed(1)
bartfit <- gbart(xtrain, ytrain, x.test = xtest)
*****Calling gbart: type=1
*****Data:
data:n,p,np: 200, 13, 200
y1,yn: 2.781850, 1.091850
x1,x[n*p]: 10.360000, 1.000000
xp1,xp[np*p]: 11.220000, 1.000000
*****Number of Trees: 200
*****Number of Cut Points: 100 ... 1
*****burn,nd,thin: 100,1000,1
*****Prior:beta,alpha,tau,nu,lambda,offset: 2,0.95,0.273474,3,6.77063e-30,7.57815
*****sigma: 0.000000
*****w (weights): 1.000000 ... 1.000000
*****Dirichlet:sparse,theta,omega,a,b,rho,augment: 0,0,1,0.5,1,13,0
*****printevery: 100
MCMC
done 0 (out of 1100)
done 100 (out of 1100)
done 200 (out of 1100)
done 300 (out of 1100)
done 400 (out of 1100)
done 500 (out of 1100)
done 600 (out of 1100)
done 700 (out of 1100)
done 800 (out of 1100)
done 900 (out of 1100)
done 1000 (out of 1100)
time: 3s
trcnt,tecnt: 1000,1000
yhat.bart <- bartfit$yhat.test.mean
mean((ytest - yhat.bart)^2)
[1] 0.184202
Comments: In this run the test MSE of BART (0.18) is far lower than the test MSEs of random forests and bagging, but the comparison is misleading: the predictor matrix x was built from columns 1:10 of Carseats, and after the earlier select() Sales is column 1, so the response itself was included among the predictors. The corrected fit below excludes Sales before comparing BART with the other methods.
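A corrected sketch that drops the response from the predictor matrix (columns 2:11 of the selected Carseats data); the resulting test MSE is the number to compare against bagging and the random forests. The varcount.mean component, as used in the ISLR2 lab, gives a rough variable-importance measure (how often each variable appears across the BART trees).
x <- Carseats[, 2:11]                     # CompPrice ... US; Sales (column 1) is the response
y <- Carseats[, "Sales"]
set.seed(1)
bartfit2 <- gbart(x[train, ], y[train], x.test = x[-train, ])
yhat.bart2 <- bartfit2$yhat.test.mean
mean((y[-train] - yhat.bart2)^2)          # test MSE to compare with bagging / random forests
ord <- order(bartfit2$varcount.mean, decreasing = TRUE)
bartfit2$varcount.mean[ord]               # average count of appearances per variable, largest first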