options(repos = c(CRAN = "https://cloud.r-project.org"))
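For reference, with two-class node proportions $\hat p_{m1}$ and $\hat p_{m2} = 1 - \hat p_{m1}$, the quantities computed below are (standard definitions, stated here for convenience and using base-2 entropy to match the code):

$$G = \hat p_{m1}(1-\hat p_{m1}) + \hat p_{m2}(1-\hat p_{m2}), \qquad D = -\hat p_{m1}\log_2\hat p_{m1} - \hat p_{m2}\log_2\hat p_{m2}, \qquad E = 1 - \max(\hat p_{m1}, \hat p_{m2}).$$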
# Define a sequence of pm1 values
pm1 <- seq(0, 1, by = 0.01)
pm2 <- 1 - pm1
# Compute the Gini index, cross-entropy, and classification error
gini <- pm1 * (1 - pm1) + pm2 * (1 - pm2)
entropy <- -pm1 * log2(pm1) - pm2 * log2(pm2)
entropy[is.nan(entropy)] <- 0 # Handle log(0)
class_error <- 1 - pmax(pm1, pm2)
# Plot
plot(pm1, gini, type = "l", col = "red", ylim = c(0, 1), ylab = "Impurity/Error", xlab = "pm1", lwd = 2)
lines(pm1, entropy, col = "blue", lwd = 2)
lines(pm1, class_error, col = "green", lwd = 2)
legend("top", legend = c("Gini", "Entropy", "Classification Error"), col = c("red", "blue", "green"), lty = 1, lwd = 2)
**Carseats Dataset**
Load the required libraries and data:
library(ISLR2)
install.packages("tree")
library(tree)
install.packages("randomForest")
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
install.packages("gbm")
library(gbm)
## Loaded gbm 2.2.2
## This version of gbm is no longer under development. Consider transitioning to gbm3, https://github.com/gbm-developers/gbm3
set.seed(1)
# Sales is the quantitative response; split Carseats into equal training and test sets
data(Carseats)
train_indices <- sample(1:nrow(Carseats), nrow(Carseats) / 2)
train <- Carseats[train_indices, ]
test <- Carseats[-train_indices, ]
tree_model <- tree(Sales ~ ., data = train)
summary(tree_model)
##
## Regression tree:
## tree(formula = Sales ~ ., data = train)
## Variables actually used in tree construction:
## [1] "ShelveLoc" "Price" "Age" "Advertising" "CompPrice"
## [6] "US"
## Number of terminal nodes: 18
## Residual mean deviance: 2.167 = 394.3 / 182
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -3.88200 -0.88200 -0.08712 0.00000 0.89590 4.09900
plot(tree_model)
text(tree_model, pretty = 0)
# Predict and test MSE
preds <- predict(tree_model, newdata = test)
mse <- mean((preds - test$Sales)^2)
print(mse)
## [1] 4.922039
cv_results <- cv.tree(tree_model)
plot(cv_results$size, cv_results$dev, type = "b", xlab = "Tree Size", ylab = "CV Error")
# Prune tree
pruned_tree <- prune.tree(tree_model, best = cv_results$size[which.min(cv_results$dev)])
plot(pruned_tree)
text(pruned_tree, pretty = 0)
# Test MSE after pruning
preds_pruned <- predict(pruned_tree, newdata = test)
mean((preds_pruned - test$Sales)^2)
## [1] 4.922039
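The pruned test MSE equals the unpruned value: on this split, cross-validation favors the full 18-node tree, so pruning leaves the model unchanged.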
# Bagging: consider all predictors at each split (mtry = number of predictors)
bag_model <- randomForest(Sales ~ ., data = train, mtry = ncol(train) - 1, importance = TRUE)
preds_bag <- predict(bag_model, newdata = test)
mean((preds_bag - test$Sales)^2)
## [1] 2.657296
importance(bag_model)
## %IncMSE IncNodePurity
## CompPrice 23.07909904 171.185734
## Income 2.82081527 94.079825
## Advertising 11.43295625 99.098941
## Population -3.92119532 59.818905
## Price 54.24314632 505.887016
## ShelveLoc 46.26912996 361.962753
## Age 14.24992212 159.740422
## Education -0.07662320 46.738585
## Urban 0.08530119 8.453749
## US 4.34349223 15.157608
# Random forest: consider only a subset of predictors (mtry = 3) at each split
rf_model <- randomForest(Sales ~ ., data = train, mtry = 3, importance = TRUE)
preds_rf <- predict(rf_model, newdata = test)
mean((preds_rf - test$Sales)^2)
## [1] 3.049406
importance(rf_model)
## %IncMSE IncNodePurity
## CompPrice 12.9489323 158.48521
## Income 2.2754686 129.59400
## Advertising 8.9977589 111.94374
## Population -2.2513981 102.84599
## Price 33.4226950 391.60804
## ShelveLoc 34.0233545 290.56502
## Age 12.2185108 171.83302
## Education 0.2592124 71.65413
## Urban 1.1382113 14.76798
## US 4.1925335 33.75554
Try different values of mtry to study its effect; a sketch of such a sweep is given below.
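A minimal sketch of that sweep, looping over every candidate mtry on the same Carseats split (variable names here are illustrative, not from the original):
# Sweep mtry from 1 to the number of predictors and record the test MSE
p <- ncol(train) - 1
mtry_mse <- sapply(1:p, function(m) {
  fit <- randomForest(Sales ~ ., data = train, mtry = m, ntree = 500)
  mean((predict(fit, newdata = test) - test$Sales)^2)
})
plot(1:p, mtry_mse, type = "b", xlab = "mtry", ylab = "Test MSE")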
library(BART)
## Loading required package: nlme
## Loading required package: survival
# Separate the predictors from the response for BART
x_train <- train[, -which(names(train) == "Sales")]
x_test <- test[, -which(names(test) == "Sales")]
y_train <- train$Sales
bart_model <- wbart(x.train = x_train, y.train = y_train, x.test = x_test)
## *****Into main of wbart
## *****Data:
## data:n,p,np: 200, 14, 200
## y1,yn: 2.781850, 1.091850
## x1,x[n*p]: 107.000000, 1.000000
## xp1,xp[np*p]: 111.000000, 1.000000
## *****Number of Trees: 200
## *****Number of Cut Points: 63 ... 1
## *****burn and ndpost: 100, 1000
## *****Prior:beta,alpha,tau,nu,lambda: 2.000000,0.950000,0.273474,3.000000,0.230740
## *****sigma: 1.088371
## *****w (weights): 1.000000 ... 1.000000
## *****Dirichlet:sparse,theta,omega,a,b,rho,augment: 0,0,1,0.5,1,14,0
## *****nkeeptrain,nkeeptest,nkeeptestme,nkeeptreedraws: 1000,1000,1000,1000
## *****printevery: 100
## *****skiptr,skipte,skipteme,skiptreedraws: 1,1,1,1
##
## MCMC
## done 0 (out of 1100)
## done 100 (out of 1100)
## done 200 (out of 1100)
## done 300 (out of 1100)
## done 400 (out of 1100)
## done 500 (out of 1100)
## done 600 (out of 1100)
## done 700 (out of 1100)
## done 800 (out of 1100)
## done 900 (out of 1100)
## done 1000 (out of 1100)
## time: 2s
## check counts
## trcnt,tecnt,temecnt,treedrawscnt: 1000,1000,1000,1000
mean((bart_model$yhat.test.mean - test$Sales)^2)
## [1] 1.438471
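The gbm package loaded earlier is not used above; for comparison with bagging, random forests, and BART, a minimal boosting sketch on the same Carseats split (the tuning values below are illustrative assumptions, not taken from the original):
# Boosted regression trees; shrinkage and interaction depth are illustrative choices
boost_model <- gbm(Sales ~ ., data = train, distribution = "gaussian",
                   n.trees = 1000, interaction.depth = 4, shrinkage = 0.01)
preds_boost <- predict(boost_model, newdata = test, n.trees = 1000)
mean((preds_boost - test$Sales)^2)

**OJ Dataset**
Load the data and create an 800-observation training set: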
library(ISLR2)
data(OJ)
set.seed(1)
train_indices <- sample(1:nrow(OJ), 800)
train <- OJ[train_indices, ]
test <- OJ[-train_indices, ]
tree_oj <- tree(Purchase ~ ., data = train)
summary(tree_oj)
##
## Classification tree:
## tree(formula = Purchase ~ ., data = train)
## Variables actually used in tree construction:
## [1] "LoyalCH" "PriceDiff" "SpecialCH" "ListPriceDiff"
## [5] "PctDiscMM"
## Number of terminal nodes: 9
## Residual mean deviance: 0.7432 = 587.8 / 791
## Misclassification error rate: 0.1588 = 127 / 800
tree_oj # Shows splits and terminal node rules
## node), split, n, deviance, yval, (yprob)
## * denotes terminal node
##
## 1) root 800 1073.00 CH ( 0.60625 0.39375 )
## 2) LoyalCH < 0.5036 365 441.60 MM ( 0.29315 0.70685 )
## 4) LoyalCH < 0.280875 177 140.50 MM ( 0.13559 0.86441 )
## 8) LoyalCH < 0.0356415 59 10.14 MM ( 0.01695 0.98305 ) *
## 9) LoyalCH > 0.0356415 118 116.40 MM ( 0.19492 0.80508 ) *
## 5) LoyalCH > 0.280875 188 258.00 MM ( 0.44149 0.55851 )
## 10) PriceDiff < 0.05 79 84.79 MM ( 0.22785 0.77215 )
## 20) SpecialCH < 0.5 64 51.98 MM ( 0.14062 0.85938 ) *
## 21) SpecialCH > 0.5 15 20.19 CH ( 0.60000 0.40000 ) *
## 11) PriceDiff > 0.05 109 147.00 CH ( 0.59633 0.40367 ) *
## 3) LoyalCH > 0.5036 435 337.90 CH ( 0.86897 0.13103 )
## 6) LoyalCH < 0.764572 174 201.00 CH ( 0.73563 0.26437 )
## 12) ListPriceDiff < 0.235 72 99.81 MM ( 0.50000 0.50000 )
## 24) PctDiscMM < 0.196196 55 73.14 CH ( 0.61818 0.38182 ) *
## 25) PctDiscMM > 0.196196 17 12.32 MM ( 0.11765 0.88235 ) *
## 13) ListPriceDiff > 0.235 102 65.43 CH ( 0.90196 0.09804 ) *
## 7) LoyalCH > 0.764572 261 91.20 CH ( 0.95785 0.04215 ) *
plot(tree_oj)
text(tree_oj, pretty = 0)
preds <- predict(tree_oj, newdata = test, type = "class")
table(preds, test$Purchase)
##
## preds CH MM
## CH 160 38
## MM 8 64
mean(preds != test$Purchase) # Test error rate
## [1] 0.1703704
cv_oj <- cv.tree(tree_oj, FUN = prune.misclass)
plot(cv_oj$size, cv_oj$dev, type = "b")
cv_oj$size[which.min(cv_oj$dev)]
## [1] 7
# Prune to five terminal nodes for comparison (CV suggests seven)
pruned_oj <- prune.misclass(tree_oj, best = 5)
plot(pruned_oj)
text(pruned_oj, pretty = 0)
train_pred_unpruned <- predict(tree_oj, newdata = train, type = "class")
train_pred_pruned <- predict(pruned_oj, newdata = train, type = "class")
mean(train_pred_unpruned != train$Purchase)
## [1] 0.15875
mean(train_pred_pruned != train$Purchase)
## [1] 0.1625
test_pred_unpruned <- predict(tree_oj, newdata = test, type = "class")
test_pred_pruned <- predict(pruned_oj, newdata = test, type = "class")
mean(test_pred_unpruned != test$Purchase)
## [1] 0.1703704
mean(test_pred_pruned != test$Purchase)
## [1] 0.162963
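The pruned tree trades a slightly higher training error (0.1625 vs. 0.1588) for a slightly lower test error (0.1630 vs. 0.1704), consistent with pruning reducing overfitting.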