p <- seq(0, 1, by = 0.01)
gini <- 2 * p * (1 - p)

class_error <- 1 - pmax(p, 1 - p)

entropy <- -p * log2(p) - (1 - p) * log2(1 - p)
entropy[is.nan(entropy)] <- 0 

plot(p, gini, type = "l", col = "red", lwd = 2,
     ylab = "Impurity Measure", xlab = expression(hat(p)[m1]),
     main = "Impurity Measures vs. Class 1 Probability", ylim = c(0, 1))
lines(p, class_error, col = "blue", lwd = 2)
lines(p, entropy, col = "darkgreen", lwd = 2)
legend("top", legend = c("Gini Index", "Classification Error", "Entropy"),
       col = c("red", "blue", "darkgreen"), lwd = 2, bty = "n")

#8

library(ISLR2)
library(tree)
set.seed(1)

train_indices <- sample(1:nrow(Carseats), nrow(Carseats)/2)
train <- Carseats[train_indices, ]
test <- Carseats[-train_indices, ]
tree_model <- tree(Sales ~ ., data = train)
summary(tree_model)
## 
## Regression tree:
## tree(formula = Sales ~ ., data = train)
## Variables actually used in tree construction:
## [1] "ShelveLoc"   "Price"       "Age"         "Advertising" "CompPrice"  
## [6] "US"         
## Number of terminal nodes:  18 
## Residual mean deviance:  2.167 = 394.3 / 182 
## Distribution of residuals:
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -3.88200 -0.88200 -0.08712  0.00000  0.89590  4.09900
plot(tree_model)
text(tree_model, pretty = 0)

pred_tree <- predict(tree_model, newdata = test)
mse_tree <- mean((pred_tree - test$Sales)^2)
mse_tree
## [1] 4.922039

Test MSE for the unpruned regression tree: 4.922.
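
Since Sales is recorded in thousands of units, the RMSE puts this error on the original scale (a quick follow-up using mse_tree from above):

sqrt(mse_tree)  # root mean squared error, roughly 2.2 thousand units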

set.seed(1)
cv_tree <- cv.tree(tree_model)
plot(cv_tree$size, cv_tree$dev, type = "b")

pruned_tree <- prune.tree(tree_model, best = 8)  
plot(pruned_tree)
text(pruned_tree, pretty = 0)

pred_pruned <- predict(pruned_tree, newdata = test)
mse_pruned <- mean((pred_pruned - test$Sales)^2)
mse_pruned
## [1] 5.113254

Pruning to 8 terminal nodes increased the test MSE from 4.92 to 5.11, so pruning does not improve test performance here.
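
Rather than reading the best size off the plot, the CV object can be queried directly; a sketch using cv_tree from above:

# size with the lowest cross-validated deviance
best_size <- cv_tree$size[which.min(cv_tree$dev)]
best_size

# prune to that size and recompute the test MSE
pruned_cv <- prune.tree(tree_model, best = best_size)
mean((predict(pruned_cv, newdata = test) - test$Sales)^2)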

library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
set.seed(1)
bag_model <- randomForest(Sales ~ ., data = train, mtry = ncol(train) - 1, importance = TRUE)
pred_bag <- predict(bag_model, newdata = test)
mean((pred_bag - test$Sales)^2)
## [1] 2.605253
importance(bag_model)
##                %IncMSE IncNodePurity
## CompPrice   24.8888481    170.182937
## Income       4.7121131     91.264880
## Advertising 12.7692401     97.164338
## Population  -1.8074075     58.244596
## Price       56.3326252    502.903407
## ShelveLoc   48.8886689    380.032715
## Age         17.7275460    157.846774
## Education    0.5962186     44.598731
## Urban        0.1728373      9.822082
## US           4.2172102     18.073863

Bagging test MSE: 2.61, roughly half that of the single regression tree.
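
The bagging fit above uses the randomForest default of 500 trees; as a rough sensitivity check (ntree values chosen arbitrarily for illustration), the test MSE can be tracked as the number of trees grows:

for (n in c(25, 100, 500)) {
  set.seed(1)
  fit <- randomForest(Sales ~ ., data = train, mtry = ncol(train) - 1, ntree = n)
  cat("ntree =", n, " test MSE =",
      round(mean((predict(fit, newdata = test) - test$Sales)^2), 3), "\n")
}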

set.seed(1)
rf_model <- randomForest(Sales ~ ., data = train, mtry = 4, importance = TRUE) 
pred_rf <- predict(rf_model, newdata = test)
mse_rf <- mean((pred_rf - test$Sales)^2)
importance(rf_model)
##                %IncMSE IncNodePurity
## CompPrice   15.7891655     160.57944
## Income       4.1275509     121.12953
## Advertising  9.6425758     111.54581
## Population  -1.3596645      85.92575
## Price       43.4055391     423.06225
## ShelveLoc   37.8850232     311.97119
## Age         13.8924424     174.18229
## Education    0.1960888      62.77782
## Urban        0.1393816      12.92952
## US           6.3532441      30.42255
varImpPlot(rf_model)

The most important variables are Price and ShelveLoc, just as in the bagged model.
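
To see how the test error varies with m, the number of predictors considered at each split, a small grid can be swept; a sketch (mtry values chosen for illustration):

mtry_vals <- c(2, 4, 6, 8, 10)
rf_mse <- sapply(mtry_vals, function(m) {
  set.seed(1)
  fit <- randomForest(Sales ~ ., data = train, mtry = m)
  mean((predict(fit, newdata = test) - test$Sales)^2)
})
plot(mtry_vals, rf_mse, type = "b", xlab = "mtry", ylab = "Test MSE")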

library(BART)
## Loading required package: nlme
## Loading required package: survival
x_train <- data.matrix(train[, -which(names(train) == "Sales")])
y_train <- train$Sales
x_test <- data.matrix(test[, -which(names(test) == "Sales")])

set.seed(1)
bart_model <- gbart(x.train = x_train, y.train = y_train, x.test = x_test)
## *****Calling gbart: type=1
## *****Data:
## data:n,p,np: 200, 10, 200
## y1,yn: 2.781850, 1.091850
## x1,x[n*p]: 107.000000, 2.000000
## xp1,xp[np*p]: 111.000000, 2.000000
## *****Number of Trees: 200
## *****Number of Cut Points: 63 ... 1
## *****burn,nd,thin: 100,1000,1
## *****Prior:beta,alpha,tau,nu,lambda,offset: 2,0.95,0.273474,3,0.692269,7.57815
## *****sigma: 1.885179
## *****w (weights): 1.000000 ... 1.000000
## *****Dirichlet:sparse,theta,omega,a,b,rho,augment: 0,0,1,0.5,1,10,0
## *****printevery: 100
## 
## MCMC
## done 0 (out of 1100)
## done 100 (out of 1100)
## done 200 (out of 1100)
## done 300 (out of 1100)
## done 400 (out of 1100)
## done 500 (out of 1100)
## done 600 (out of 1100)
## done 700 (out of 1100)
## done 800 (out of 1100)
## done 900 (out of 1100)
## done 1000 (out of 1100)
## time: 5s
## trcnt,tecnt: 1000,1000
pred_bart <- bart_model$yhat.test.mean
mse_bart <- mean((pred_bart - test$Sales)^2)
mse_bart
## [1] 1.465254

BART test MSE: 1.465, the lowest of all the methods tried on this split.
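
Pulling the Carseats results together (a sketch reusing the objects computed above; the bagging MSE is recomputed since it was not stored):

data.frame(
  method   = c("Regression tree", "Pruned tree", "Bagging", "Random forest", "BART"),
  test_MSE = c(mse_tree, mse_pruned,
               mean((pred_bag - test$Sales)^2), mse_rf, mse_bart)
)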

#9

set.seed(1)
data(OJ)

train_indices <- sample(1:nrow(OJ), 800)
train <- OJ[train_indices, ]
test <- OJ[-train_indices, ]
tree_oj <- tree(Purchase ~ ., data = train)
summary(tree_oj)
## 
## Classification tree:
## tree(formula = Purchase ~ ., data = train)
## Variables actually used in tree construction:
## [1] "LoyalCH"       "PriceDiff"     "SpecialCH"     "ListPriceDiff"
## [5] "PctDiscMM"    
## Number of terminal nodes:  9 
## Residual mean deviance:  0.7432 = 587.8 / 791 
## Misclassification error rate: 0.1588 = 127 / 800

The tree uses only 5 of the predictors (LoyalCH, PriceDiff, SpecialCH, ListPriceDiff, PctDiscMM) and has 9 terminal nodes; the training misclassification error rate is 0.1588 and the residual mean deviance is 0.7432.

tree_oj  
## node), split, n, deviance, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 800 1073.00 CH ( 0.60625 0.39375 )  
##    2) LoyalCH < 0.5036 365  441.60 MM ( 0.29315 0.70685 )  
##      4) LoyalCH < 0.280875 177  140.50 MM ( 0.13559 0.86441 )  
##        8) LoyalCH < 0.0356415 59   10.14 MM ( 0.01695 0.98305 ) *
##        9) LoyalCH > 0.0356415 118  116.40 MM ( 0.19492 0.80508 ) *
##      5) LoyalCH > 0.280875 188  258.00 MM ( 0.44149 0.55851 )  
##       10) PriceDiff < 0.05 79   84.79 MM ( 0.22785 0.77215 )  
##         20) SpecialCH < 0.5 64   51.98 MM ( 0.14062 0.85938 ) *
##         21) SpecialCH > 0.5 15   20.19 CH ( 0.60000 0.40000 ) *
##       11) PriceDiff > 0.05 109  147.00 CH ( 0.59633 0.40367 ) *
##    3) LoyalCH > 0.5036 435  337.90 CH ( 0.86897 0.13103 )  
##      6) LoyalCH < 0.764572 174  201.00 CH ( 0.73563 0.26437 )  
##       12) ListPriceDiff < 0.235 72   99.81 MM ( 0.50000 0.50000 )  
##         24) PctDiscMM < 0.196196 55   73.14 CH ( 0.61818 0.38182 ) *
##         25) PctDiscMM > 0.196196 17   12.32 MM ( 0.11765 0.88235 ) *
##       13) ListPriceDiff > 0.235 102   65.43 CH ( 0.90196 0.09804 ) *
##      7) LoyalCH > 0.764572 261   91.20 CH ( 0.95785 0.04215 ) *

Terminal node 7 contains 261 observations and predicts CH; the fitted class probabilities are 95.8% for CH and 4.2% for MM. Reaching this node requires LoyalCH > 0.7646, indicating strong loyalty to the Citrus Hill brand.
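
The same information can be pulled straight from the fitted object: the row names of the frame are the node numbers, so node 7 can be inspected directly (a sketch):

tree_oj$frame["7", ]  # node 7: n = 261, predicted class CH, yprob about (0.958, 0.042)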

plot(tree_oj)
text(tree_oj, pretty = 0)

LoyalCH is the most important variable; the top splits of the tree all use it.
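
This can be checked directly from the fitted object (a quick sketch; frame rows are stored in depth-first order, and "<leaf>" marks a terminal node):

# split variables of the first few nodes -- the first three splits are all on LoyalCH
head(as.character(tree_oj$frame$var), 4)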

pred_test <- predict(tree_oj, newdata = test, type = "class")
conf_matrix <- table(pred_test, test$Purchase)
conf_matrix
##          
## pred_test  CH  MM
##        CH 160  38
##        MM   8  64
test_error <- mean(pred_test != test$Purchase)
test_error
## [1] 0.1703704

Test error rate: 0.1704, i.e. about 17% of the 270 test observations are misclassified.
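
Equivalently, the error rate can be read off the confusion matrix (a quick check using conf_matrix from above):

1 - sum(diag(conf_matrix)) / sum(conf_matrix)  # (38 + 8) / 270 = 0.1704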

set.seed(1)
cv_oj <- cv.tree(tree_oj, FUN = prune.misclass)
plot(cv_oj$size, cv_oj$dev, type = "b", xlab = "Tree Size", ylab = "CV Misclassification Error")

min_dev_index <- which.min(cv_oj$dev)
optimal_size <- cv_oj$size[min_dev_index]
optimal_size
## [1] 9

A tree size of 9 corresponds to the lowest cross-validated classification error rate, so the unpruned tree is already the size selected by cross-validation.
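
If several sizes tie for the minimum CV error, the smallest such tree is usually preferred; a sketch of that tie-breaking rule:

min(cv_oj$size[cv_oj$dev == min(cv_oj$dev)])  # smallest size attaining the minimum CV error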

pruned_oj <- prune.misclass(tree_oj, best = optimal_size)
plot(pruned_oj)
text(pruned_oj, pretty = 0)

set.seed(1)
train_pred_unpruned <- predict(tree_oj, newdata = train, type = "class")
train_pred_pruned <- predict(pruned_oj, newdata = train, type = "class")

train_error_unpruned <- mean(train_pred_unpruned != train$Purchase)
train_error_pruned <- mean(train_pred_pruned != train$Purchase)

train_error_unpruned
## [1] 0.15875
train_error_pruned
## [1] 0.15875

The pruned and unpruned trees have the same training error rate (0.15875): pruning to the CV-selected size of 9 leaves the tree unchanged.
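
That can be confirmed by counting terminal nodes in each tree (a quick check):

sum(tree_oj$frame$var == "<leaf>")    # terminal nodes in the unpruned tree
sum(pruned_oj$frame$var == "<leaf>")  # terminal nodes in the pruned tree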

test_pred_pruned <- predict(pruned_oj, newdata = test, type = "class")

test_error_pruned <- mean(test_pred_pruned != test$Purchase)

test_error  # from part (e)
## [1] 0.1703704
test_error_pruned
## [1] 0.1703704

The test error rates are also identical: since pruning to the CV-selected size returns the original tree, the predictions do not change.