Exercise 3: Gini, Entropy, and Classification Error

# Candidate impurity measures for a two-class node as a function of p = P(class 1)
p <- seq(0, 1, by = 0.01)
gini <- 2 * p * (1 - p)
class_error <- 1 - pmax(p, 1 - p)
entropy <- -p * log(p) - (1 - p) * log(1 - p)
entropy[is.nan(entropy)] <- 0  # treat 0 * log(0) as 0 at the endpoints

plot(p, gini, type = "l", col = "blue", ylim = c(0, 1), ylab = "Impurity", xlab = expression(hat(p)[m1]))
lines(p, class_error, col = "red")
lines(p, entropy, col = "darkgreen")
legend("top", legend = c("Gini", "Classification Error", "Entropy"), col = c("blue", "red", "darkgreen"), lty = 1)

Answer:

- The Gini index, 2p(1 - p), is maximized when the two classes are equally likely (p = 0.5) and falls to zero as the node becomes pure.
- The classification error, 1 - max(p, 1 - p), is also maximized at p = 0.5, but it is piecewise linear and therefore less sensitive to changes in node purity than the other two measures.
- Entropy, -p log(p) - (1 - p) log(1 - p), measures the uncertainty in the node; like the Gini index it peaks at p = 0.5 and is smooth, which is why both are preferred over classification error for growing trees. The values of all three measures at p = 0.5 are verified in the snippet below.
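As a quick sanity check (a minimal sketch; the values follow directly from the formulas plotted above, with entropy in natural-log units), each measure can be evaluated at p = 0.5, where all three attain their maximum:

# Evaluate each impurity measure at p = 0.5
p0 <- 0.5
c(gini = 2 * p0 * (1 - p0),                           # 0.5
  class_error = 1 - pmax(p0, 1 - p0),                 # 0.5
  entropy = -p0 * log(p0) - (1 - p0) * log(1 - p0))   # log(2), about 0.693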

Exercise 8: Carseats - Regression Trees and Ensembles

library(ISLR)          # Carseats and OJ data sets
library(tree)          # tree(), cv.tree(), prune.tree(), prune.misclass()
library(randomForest)  # randomForest() for bagging and random forests

# (a) Split the observations into a training set and a test set
set.seed(1)
Carseats$HighSales <- Carseats$Sales  # exact copy of Sales, so it is excluded from every model formula below
train <- sample(1:nrow(Carseats), nrow(Carseats)/2)
test <- -train
y_test <- Carseats$Sales[test]

# (b) Regression Tree
reg_tree <- tree(Sales ~ . - HighSales, data = Carseats, subset = train)
summary(reg_tree)
## 
## Regression tree:
## tree(formula = Sales ~ . - HighSales, data = Carseats, subset = train)
## Variables actually used in tree construction:
## [1] "ShelveLoc"   "Price"       "Age"         "Advertising" "CompPrice"  
## [6] "US"         
## Number of terminal nodes:  18 
## Residual mean deviance:  2.167 = 394.3 / 182 
## Distribution of residuals:
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -3.88200 -0.88200 -0.08712  0.00000  0.89590  4.09900
plot(reg_tree)
text(reg_tree, pretty = 0)

# Test MSE
pred_tree <- predict(reg_tree, Carseats[test,])
mse_tree <- mean((pred_tree - y_test)^2)
mse_tree
## [1] 4.922039
# (c) Cross-validation
cv_reg <- cv.tree(reg_tree)
plot(cv_reg$size, cv_reg$dev, type = "b", xlab = "Tree Size", ylab = "Deviance")

pruned_tree <- prune.tree(reg_tree, best = cv_reg$size[which.min(cv_reg$dev)])
summary(pruned_tree)
## 
## Regression tree:
## tree(formula = Sales ~ . - HighSales, data = Carseats, subset = train)
## Variables actually used in tree construction:
## [1] "ShelveLoc"   "Price"       "Age"         "Advertising" "CompPrice"  
## [6] "US"         
## Number of terminal nodes:  18 
## Residual mean deviance:  2.167 = 394.3 / 182 
## Distribution of residuals:
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -3.88200 -0.88200 -0.08712  0.00000  0.89590  4.09900
# (d) Bagging (mtry set to all 10 predictors, i.e. every column except Sales and HighSales)
bag <- randomForest(Sales ~ . - HighSales, data = Carseats, subset = train,
                    mtry = ncol(Carseats) - 2, importance = TRUE)
pred_bag <- predict(bag, Carseats[test,])
mse_bag <- mean((pred_bag - y_test)^2)
mse_bag
## [1] 2.657296
importance(bag)
##                 %IncMSE IncNodePurity
## CompPrice   23.07909904    171.185734
## Income       2.82081527     94.079825
## Advertising 11.43295625     99.098941
## Population  -3.92119532     59.818905
## Price       54.24314632    505.887016
## ShelveLoc   46.26912996    361.962753
## Age         14.24992212    159.740422
## Education   -0.07662320     46.738585
## Urban        0.08530119      8.453749
## US           4.34349223     15.157608
# (e) Random Forest with the default mtry (p/3 predictors considered at each split for regression)
rf <- randomForest(Sales ~ . - HighSales, data = Carseats, subset = train, importance = TRUE)
pred_rf <- predict(rf, Carseats[test,])
mse_rf <- mean((pred_rf - y_test)^2)
mse_rf
## [1] 3.049406
importance(rf)
##                %IncMSE IncNodePurity
## CompPrice   12.9489323     158.48521
## Income       2.2754686     129.59400
## Advertising  8.9977589     111.94374
## Population  -2.2513981     102.84599
## Price       33.4226950     391.60804
## ShelveLoc   34.0233545     290.56502
## Age         12.2185108     171.83302
## Education    0.2592124      71.65413
## Urban        1.1382113      14.76798
## US           4.1925335      33.75554

Answers:

- (b) The unpruned regression tree has 18 terminal nodes and yields a test MSE of 4.92.
- (c) Cross-validation selects the full tree (18 terminal nodes), so pruning does not simplify the model here; the pruned tree's summary is identical to the original.
- (d) Bagging lowers the test MSE to 2.66; the importance measures rank Price and ShelveLoc as by far the most influential predictors.
- (e) The random forest with the default mtry gives a test MSE of 3.05, slightly worse than bagging on this split; ShelveLoc and Price remain the most important variables (the effect of varying mtry is sketched below).
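To see how the number of variables considered at each split affects performance, a small sweep over mtry can be run (a minimal sketch reusing the train/test split above; mtry = 10 corresponds to bagging, and exact values will vary with the random seed):

# Test MSE as a function of mtry (number of predictors tried at each split)
mtry_vals <- 1:10
mse_by_mtry <- sapply(mtry_vals, function(m) {
  fit <- randomForest(Sales ~ . - HighSales, data = Carseats, subset = train, mtry = m)
  mean((predict(fit, Carseats[test, ]) - y_test)^2)
})
plot(mtry_vals, mse_by_mtry, type = "b", xlab = "mtry", ylab = "Test MSE")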

Exercise 9: OJ Data - Classification Trees

data(OJ)
set.seed(1)
train_id <- sample(1:nrow(OJ), 800)
train <- OJ[train_id, ]
test <- OJ[-train_id, ]

# (b) Fit a classification tree
oj_tree <- tree(Purchase ~ ., data = train)
summary(oj_tree)
## 
## Classification tree:
## tree(formula = Purchase ~ ., data = train)
## Variables actually used in tree construction:
## [1] "LoyalCH"       "PriceDiff"     "SpecialCH"     "ListPriceDiff"
## [5] "PctDiscMM"    
## Number of terminal nodes:  9 
## Residual mean deviance:  0.7432 = 587.8 / 791 
## Misclassification error rate: 0.1588 = 127 / 800
# (c) Terminal Node Analysis
oj_tree
## node), split, n, deviance, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 800 1073.00 CH ( 0.60625 0.39375 )  
##    2) LoyalCH < 0.5036 365  441.60 MM ( 0.29315 0.70685 )  
##      4) LoyalCH < 0.280875 177  140.50 MM ( 0.13559 0.86441 )  
##        8) LoyalCH < 0.0356415 59   10.14 MM ( 0.01695 0.98305 ) *
##        9) LoyalCH > 0.0356415 118  116.40 MM ( 0.19492 0.80508 ) *
##      5) LoyalCH > 0.280875 188  258.00 MM ( 0.44149 0.55851 )  
##       10) PriceDiff < 0.05 79   84.79 MM ( 0.22785 0.77215 )  
##         20) SpecialCH < 0.5 64   51.98 MM ( 0.14062 0.85938 ) *
##         21) SpecialCH > 0.5 15   20.19 CH ( 0.60000 0.40000 ) *
##       11) PriceDiff > 0.05 109  147.00 CH ( 0.59633 0.40367 ) *
##    3) LoyalCH > 0.5036 435  337.90 CH ( 0.86897 0.13103 )  
##      6) LoyalCH < 0.764572 174  201.00 CH ( 0.73563 0.26437 )  
##       12) ListPriceDiff < 0.235 72   99.81 MM ( 0.50000 0.50000 )  
##         24) PctDiscMM < 0.196196 55   73.14 CH ( 0.61818 0.38182 ) *
##         25) PctDiscMM > 0.196196 17   12.32 MM ( 0.11765 0.88235 ) *
##       13) ListPriceDiff > 0.235 102   65.43 CH ( 0.90196 0.09804 ) *
##      7) LoyalCH > 0.764572 261   91.20 CH ( 0.95785 0.04215 ) *
# (d) Plot tree
plot(oj_tree)
text(oj_tree, pretty = 0)

# (e) Predict on test set
pred_test <- predict(oj_tree, test, type = "class")
table(pred_test, test$Purchase)
##          
## pred_test  CH  MM
##        CH 160  38
##        MM   8  64
test_err_unpruned <- mean(pred_test != test$Purchase)
test_err_unpruned
## [1] 0.1703704
# (f) Cross-validation for pruning
cv_oj <- cv.tree(oj_tree, FUN = prune.misclass)

# (g) Plot size vs CV error
plot(cv_oj$size, cv_oj$dev, type = "b", xlab = "Tree Size", ylab = "CV Misclass Error")

# (h) Best size
best_size <- cv_oj$size[which.min(cv_oj$dev)]
best_size
## [1] 7
# (i) Prune tree
pruned_oj <- prune.misclass(oj_tree, best = best_size)
plot(pruned_oj)
text(pruned_oj, pretty = 0)

# (j) Training error rates
train_pred_unpruned <- predict(oj_tree, train, type = "class")
train_err_unpruned <- mean(train_pred_unpruned != train$Purchase)
train_pred_pruned <- predict(pruned_oj, train, type = "class")
train_err_pruned <- mean(train_pred_pruned != train$Purchase)
c(train_err_unpruned = train_err_unpruned, train_err_pruned = train_err_pruned)
## train_err_unpruned   train_err_pruned 
##            0.15875            0.16250
# (k) Test error rates
pruned_test_pred <- predict(pruned_oj, test, type = "class")
test_err_pruned <- mean(pruned_test_pred != test$Purchase)
c(test_err_unpruned = test_err_unpruned, test_err_pruned = test_err_pruned)
## test_err_unpruned   test_err_pruned 
##         0.1703704         0.1629630

Answers:

- (b) The unpruned tree has 9 terminal nodes, uses LoyalCH, PriceDiff, SpecialCH, ListPriceDiff, and PctDiscMM, and has a training error rate of 0.159.
- (c) Each terminal node can be read off the printed output; for example, node 8 contains 59 training observations with LoyalCH < 0.036 and predicts MM with estimated probability 0.983.
- (e) The test error rate of the unpruned tree is 0.170.
- (f)-(h) Cross-validation selects a tree with 7 terminal nodes.
- (j) Training error: 0.159 unpruned vs. 0.162 pruned; pruning slightly increases training error, as expected.
- (k) Test error: 0.170 unpruned vs. 0.163 pruned, so the pruned tree generalizes slightly better here (a check over all candidate sizes is sketched below).
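As an optional check (a sketch assuming oj_tree, cv_oj, and the test set from above are still in the workspace), the test error can be computed for every candidate size returned by cross-validation, not only the CV-selected one:

# Test error for each tree size considered during cross-validation
sizes <- cv_oj$size[cv_oj$size > 1]  # skip the root-only tree
test_err_by_size <- sapply(sizes, function(k) {
  pt <- prune.misclass(oj_tree, best = k)
  mean(predict(pt, test, type = "class") != test$Purchase)
})
data.frame(size = sizes, test_error = round(test_err_by_size, 4))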