p <- seq(0, 1, by = 0.01)
gini <- 2 * p * (1 - p)
class_error <- 1 - pmax(p, 1 - p)
entropy <- -p * log(p) - (1 - p) * log(1 - p)
entropy[is.nan(entropy)] <- 0
plot(p, gini, type = "l", col = "blue", ylim = c(0, 1), ylab = "Impurity", xlab = expression(hat(p)[m1]))
lines(p, class_error, col = "red")
lines(p, entropy, col = "darkgreen")
legend("top", legend = c("Gini", "Classification Error", "Entropy"), col = c("blue", "red", "darkgreen"), lty = 1)
Answer:
- The Gini index is maximized when the two classes are equally likely (p = 0.5) and drops to 0 for pure nodes.
- Classification error is also maximized at p = 0.5, but it is piecewise linear in p, so it is less sensitive to changes in the node probabilities than the other two measures.
- Entropy captures the amount of uncertainty in the node; like the Gini index it peaks at p = 0.5 (at log 2 ≈ 0.693 with the natural log used here) and vanishes for pure nodes. A quick numeric check follows.
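A minimal check, evaluating the same three formulas at p = 0.5 (Gini and classification error both give 0.5, entropy gives log 2 ≈ 0.693):

# Evaluate all three impurity measures at p = 0.5
p0 <- 0.5
c(gini = 2 * p0 * (1 - p0),
  class_error = 1 - max(p0, 1 - p0),
  entropy = -p0 * log(p0) - (1 - p0) * log(1 - p0))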
library(ISLR)          # Carseats and OJ data sets
library(tree)          # tree(), cv.tree(), prune.tree(), prune.misclass()
library(randomForest)  # bagging and random forests
set.seed(1)
# HighSales is a copy of Sales; it is excluded from every formula below so the
# response cannot leak into the predictors
Carseats$HighSales <- Carseats$Sales
train <- sample(1:nrow(Carseats), nrow(Carseats)/2)
test <- -train
y_test <- Carseats$Sales[test]
# (b) Regression Tree
reg_tree <- tree(Sales ~ . - HighSales, data = Carseats, subset = train)
summary(reg_tree)
##
## Regression tree:
## tree(formula = Sales ~ . - HighSales, data = Carseats, subset = train)
## Variables actually used in tree construction:
## [1] "ShelveLoc" "Price" "Age" "Advertising" "CompPrice"
## [6] "US"
## Number of terminal nodes: 18
## Residual mean deviance: 2.167 = 394.3 / 182
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -3.88200 -0.88200 -0.08712 0.00000 0.89590 4.09900
plot(reg_tree)
text(reg_tree, pretty = 0)
# Test MSE
pred_tree <- predict(reg_tree, Carseats[test,])
mse_tree <- mean((pred_tree - y_test)^2)
mse_tree
## [1] 4.922039
# (c) Cross-validation
cv_reg <- cv.tree(reg_tree)
plot(cv_reg$size, cv_reg$dev, type = "b", xlab = "Tree Size", ylab = "Deviance")
pruned_tree <- prune.tree(reg_tree, best = cv_reg$size[which.min(cv_reg$dev)])
summary(pruned_tree)
##
## Regression tree:
## tree(formula = Sales ~ . - HighSales, data = Carseats, subset = train)
## Variables actually used in tree construction:
## [1] "ShelveLoc" "Price" "Age" "Advertising" "CompPrice"
## [6] "US"
## Number of terminal nodes: 18
## Residual mean deviance: 2.167 = 394.3 / 182
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -3.88200 -0.88200 -0.08712 0.00000 0.89590 4.09900
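Because cross-validation picks the full 18-node tree, the pruned model above is identical to the original fit. As a sketch (the choice of 8 terminal nodes is arbitrary, not selected by CV), one could still prune to a smaller subtree and check how the test MSE reacts:

# Sketch: prune to an arbitrary smaller size and recompute the test MSE
small_tree <- prune.tree(reg_tree, best = 8)
pred_small <- predict(small_tree, Carseats[test, ])
mean((pred_small - y_test)^2)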
# (d) Bagging
# mtry = ncol(Carseats) - 2 uses all 10 predictors (everything except Sales and HighSales), i.e. bagging
bag <- randomForest(Sales ~ . - HighSales, data = Carseats, subset = train, mtry = ncol(Carseats) - 2, importance = TRUE)
pred_bag <- predict(bag, Carseats[test,])
mse_bag <- mean((pred_bag - y_test)^2)
mse_bag
## [1] 2.657296
importance(bag)
## %IncMSE IncNodePurity
## CompPrice 23.07909904 171.185734
## Income 2.82081527 94.079825
## Advertising 11.43295625 99.098941
## Population -3.92119532 59.818905
## Price 54.24314632 505.887016
## ShelveLoc 46.26912996 361.962753
## Age 14.24992212 159.740422
## Education -0.07662320 46.738585
## Urban 0.08530119 8.453749
## US 4.34349223 15.157608
# (e) Random Forest
# Default mtry (floor(p/3) for regression) gives a random forest rather than bagging
rf <- randomForest(Sales ~ . - HighSales, data = Carseats, subset = train, importance = TRUE)
pred_rf <- predict(rf, Carseats[test,])
mse_rf <- mean((pred_rf - y_test)^2)
mse_rf
## [1] 3.049406
importance(rf)
## %IncMSE IncNodePurity
## CompPrice 12.9489323 158.48521
## Income 2.2754686 129.59400
## Advertising 8.9977589 111.94374
## Population -2.2513981 102.84599
## Price 33.4226950 391.60804
## ShelveLoc 34.0233545 290.56502
## Age 12.2185108 171.83302
## Education 0.2592124 71.65413
## Urban 1.1382113 14.76798
## US 4.1925335 33.75554
Answers:
- (b) The regression tree yields a test MSE of 4.922.
- (c) Cross-validation selects the full tree (18 terminal nodes), so the "pruned" model is identical to the unpruned fit and pruning does not simplify the model on this split.
- (d) Bagging lowers the test MSE substantially, to 2.657.
- (e) The random forest with the default mtry gives a test MSE of 3.049, slightly worse than bagging here (the sketch below varies mtry to show its effect). Both importance tables identify Price and ShelveLoc as by far the most important predictors.
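To see how sensitive the forest is to mtry, a minimal sketch refits it over a grid of values and compares test MSEs on the same split (exact numbers will vary with the random seed; mtry = 10 reproduces bagging). varImpPlot() is the graphical counterpart of importance().

# Sketch: test MSE as a function of mtry; mtry = 10 corresponds to bagging
mse_by_mtry <- sapply(1:10, function(m) {
  fit <- randomForest(Sales ~ . - HighSales, data = Carseats, subset = train, mtry = m)
  mean((predict(fit, Carseats[test, ]) - y_test)^2)
})
plot(1:10, mse_by_mtry, type = "b", xlab = "mtry", ylab = "Test MSE")
# Dot chart of the two importance measures for the random forest fit
varImpPlot(rf)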
data(OJ)
set.seed(1)
train_id <- sample(1:nrow(OJ), 800)
train <- OJ[train_id, ]
test <- OJ[-train_id, ]
# (b) Fit a classification tree
oj_tree <- tree(Purchase ~ ., data = train)
summary(oj_tree)
##
## Classification tree:
## tree(formula = Purchase ~ ., data = train)
## Variables actually used in tree construction:
## [1] "LoyalCH" "PriceDiff" "SpecialCH" "ListPriceDiff"
## [5] "PctDiscMM"
## Number of terminal nodes: 9
## Residual mean deviance: 0.7432 = 587.8 / 791
## Misclassification error rate: 0.1588 = 127 / 800
# (c) Terminal Node Analysis
oj_tree
## node), split, n, deviance, yval, (yprob)
## * denotes terminal node
##
## 1) root 800 1073.00 CH ( 0.60625 0.39375 )
## 2) LoyalCH < 0.5036 365 441.60 MM ( 0.29315 0.70685 )
## 4) LoyalCH < 0.280875 177 140.50 MM ( 0.13559 0.86441 )
## 8) LoyalCH < 0.0356415 59 10.14 MM ( 0.01695 0.98305 ) *
## 9) LoyalCH > 0.0356415 118 116.40 MM ( 0.19492 0.80508 ) *
## 5) LoyalCH > 0.280875 188 258.00 MM ( 0.44149 0.55851 )
## 10) PriceDiff < 0.05 79 84.79 MM ( 0.22785 0.77215 )
## 20) SpecialCH < 0.5 64 51.98 MM ( 0.14062 0.85938 ) *
## 21) SpecialCH > 0.5 15 20.19 CH ( 0.60000 0.40000 ) *
## 11) PriceDiff > 0.05 109 147.00 CH ( 0.59633 0.40367 ) *
## 3) LoyalCH > 0.5036 435 337.90 CH ( 0.86897 0.13103 )
## 6) LoyalCH < 0.764572 174 201.00 CH ( 0.73563 0.26437 )
## 12) ListPriceDiff < 0.235 72 99.81 MM ( 0.50000 0.50000 )
## 24) PctDiscMM < 0.196196 55 73.14 CH ( 0.61818 0.38182 ) *
## 25) PctDiscMM > 0.196196 17 12.32 MM ( 0.11765 0.88235 ) *
## 13) ListPriceDiff > 0.235 102 65.43 CH ( 0.90196 0.09804 ) *
## 7) LoyalCH > 0.764572 261 91.20 CH ( 0.95785 0.04215 ) *
# (d) Plot tree
plot(oj_tree)
text(oj_tree, pretty = 0)
# (e) Predict on test set
pred_test <- predict(oj_tree, test, type = "class")
table(pred_test, test$Purchase)
##
## pred_test CH MM
## CH 160 38
## MM 8 64
test_err_unpruned <- mean(pred_test != test$Purchase)
test_err_unpruned
## [1] 0.1703704
# (f) Cross-validation for pruning
cv_oj <- cv.tree(oj_tree, FUN = prune.misclass)
# (g) Plot size vs CV error
plot(cv_oj$size, cv_oj$dev, type = "b", xlab = "Tree Size", ylab = "CV Misclass Error")
# (h) Best size
best_size <- cv_oj$size[which.min(cv_oj$dev)]
best_size
## [1] 7
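Note that which.min() returns the first index achieving the minimum, and cv.tree() typically lists sizes from largest to smallest, so ties would resolve to the larger tree; if a more parsimonious model is preferred, the smallest tied size can be taken instead:

# Smallest tree size attaining the minimum CV deviance (handles ties)
min(cv_oj$size[cv_oj$dev == min(cv_oj$dev)])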
# (i) Prune tree
pruned_oj <- prune.misclass(oj_tree, best = best_size)
plot(pruned_oj)
text(pruned_oj, pretty = 0)
# (j) Training error rates
train_pred_unpruned <- predict(oj_tree, train, type = "class")
train_err_unpruned <- mean(train_pred_unpruned != train$Purchase)
train_pred_pruned <- predict(pruned_oj, train, type = "class")
train_err_pruned <- mean(train_pred_pruned != train$Purchase)
c(train_err_unpruned = train_err_unpruned, train_err_pruned = train_err_pruned)
## train_err_unpruned train_err_pruned
## 0.15875 0.16250
# (k) Test error rates
pruned_test_pred <- predict(pruned_oj, test, type = "class")
test_err_pruned <- mean(pruned_test_pred != test$Purchase)
c(test_err_unpruned = test_err_unpruned, test_err_pruned = test_err_pruned)
## test_err_unpruned test_err_pruned
## 0.1703704 0.1629630
Answers:
- (b) The unpruned tree uses LoyalCH, PriceDiff, SpecialCH, ListPriceDiff and PctDiscMM, has 9 terminal nodes, and a training error rate of 0.159.
- (c) The printed output shows that LoyalCH dominates the splits: node 8 (LoyalCH < 0.036) is almost purely MM, while node 7 (LoyalCH > 0.765) is almost purely CH.
- (e) The test error rate of the unpruned tree is 0.170.
- (f)-(h) Cross-validation selects a best size of 7 terminal nodes.
- (j) Training error: 0.159 unpruned vs 0.163 pruned; as expected, pruning raises the training error slightly.
- (k) Test error: 0.170 unpruned vs 0.163 pruned, so the pruned tree generalizes slightly better on this split (the sketch below computes the test error at every candidate size).
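For completeness, the comparison can be extended by pruning to every candidate size returned by cv.tree() and computing the corresponding test error; a minimal sketch under the same split (size 1, the root-only tree, is skipped):

# Sketch: test error for each candidate subtree size (excluding the root-only tree)
sizes <- cv_oj$size[cv_oj$size > 1]
test_err_by_size <- sapply(sizes, function(s) {
  pt <- prune.misclass(oj_tree, best = s)
  mean(predict(pt, test, type = "class") != test$Purchase)
})
data.frame(size = sizes, test_error = test_err_by_size)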