Decision Trees

Question 7

library(randomForest)
library(ISLR2)
library(tidyverse)
library(tree)
set.seed(1)
# 50/50 train/test split of the Boston data
train_idx <- sample(1:nrow(Boston), nrow(Boston)/2)
train <- Boston[train_idx, ]
test <- Boston[-train_idx, ]

# Grid of (mtry, ntree) combinations to evaluate on the test set
mtry_vals <- 1:12
ntree_vals <- c(25, 100, 250, 500)
results <- expand.grid(mtry = mtry_vals, ntree = ntree_vals)
results$test_mse <- NA

# Fit a random forest for each (mtry, ntree) pair and record its test MSE
for (i in 1:nrow(results)) {
  fit <- randomForest(medv ~ ., data = train,
                      mtry = results$mtry[i],
                      ntree = results$ntree[i])
  preds <- predict(fit, newdata = test)
  results$test_mse[i] <- mean((preds - test$medv)^2)
}

# Plot test MSE against mtry for each ntree (ggplot2 is already loaded via tidyverse)

ggplot(results, aes(x = mtry, y = test_mse, color = as.factor(ntree))) +
  geom_line(linewidth = 1.1) +
  labs(title = "Test MSE vs. mtry for different ntree values",
       x = "mtry", y = "Test MSE", color = "ntree") +
  theme_minimal()

We observe that an mtry of around 2 or 3 yields the lowest test MSE for all of the numbers of trees considered. As mtry increases toward p = 12, i.e., toward bagging, the trees become more correlated and the test error steadily increases.
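As a quick check on this reading of the plot, a small dplyr summary (a sketch; it reuses the `results` grid computed above) picks out the best mtry for each ntree:

# For each ntree, report the mtry with the lowest test MSE
results %>%
  group_by(ntree) %>%
  slice_min(test_mse, n = 1) %>%
  arrange(ntree)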

Question 8

#a)
# 70/30 train/test split of the Carseats data
train_indices <- sample(1:nrow(Carseats), size = 0.7 * nrow(Carseats))
train_set <- Carseats[train_indices, ]
test_set <- Carseats[-train_indices, ]
#b)

tree_model <- tree(Sales ~ ., data = train_set)
plot(tree_model)
text(tree_model, pretty = 0)

tree_preds <- predict(tree_model, newdata = test_set)


test_mse <- mean((tree_preds - test_set$Sales)^2)
test_mse
## [1] 4.417454

The tree is fairly deep, with many terminal nodes. The test MSE is about 4.42.
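"Many terminal nodes" can be quantified: in a tree object, leaves are the rows of `frame` whose splitting variable is "<leaf>". A minimal sketch:

# Count the terminal nodes of the unpruned tree
sum(tree_model$frame$var == "<leaf>")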

#c)

# Choose the subtree size by cross-validated cost-complexity pruning
cv_tree <- cv.tree(tree_model)


plot(cv_tree$size, cv_tree$dev, type = "b", xlab = "Tree Size", ylab = "Deviance")

pruned_tree <- prune.tree(tree_model, best = cv_tree$size[which.min(cv_tree$dev)])


plot(pruned_tree)
text(pruned_tree, pretty = 0)

pruned_tree_preds <- predict(pruned_tree, newdata = test_set)


pruned_test_mse <- mean((pruned_tree_preds - test_set$Sales)^2)
pruned_test_mse
## [1] 4.572979

The pruned tree gives a slightly higher test MSE (4.57 vs. 4.42). Pruning may have removed some useful splits, leading to mild underfitting.
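To see whether cross-validation picked a reasonable size, one can prune to every candidate size and compare test errors directly. A sketch (size 1 is skipped, since prune.tree cannot produce a single-node tree):

# Test MSE for each candidate tree size from the CV run
sizes <- cv_tree$size[cv_tree$size > 1]
size_mse <- sapply(sizes, function(s) {
  pt <- prune.tree(tree_model, best = s)
  mean((predict(pt, newdata = test_set) - test_set$Sales)^2)
})
plot(sizes, size_mse, type = "b", xlab = "Terminal nodes", ylab = "Test MSE")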

#d)
set.seed(1)
# Bagging is a random forest with mtry = p (all 10 predictors tried at each split)
bagging_model <- randomForest(Sales ~ ., data = train_set, mtry = ncol(train_set) - 1, importance = TRUE)


bagging_preds <- predict(bagging_model, newdata = test_set)


bagging_mse <- mean((bagging_preds - test_set$Sales)^2)
bagging_mse
## [1] 2.669637
importance(bagging_model)
##                %IncMSE IncNodePurity
## CompPrice   32.6596414    247.328054
## Income      14.7515801    130.944978
## Advertising 23.5984697    156.075304
## Population  -0.5546660     83.186230
## Price       65.8234351    562.687705
## ShelveLoc   74.6603552    695.433319
## Age         22.2480466    219.188112
## Education    5.2878833     63.313801
## Urban        0.4913305      9.764823
## US           1.9844981      8.139650
varImpPlot(bagging_model)

Price and ShelveLoc appear to be the most important predictors, as suggested by both importance measures. The test MSE obtained is 2.67, a substantial improvement over the single tree.
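Sorting the importance matrix by %IncMSE makes the ranking explicit (a small sketch reusing the matrix printed above):

# Rank predictors by permutation importance
imp <- importance(bagging_model)
imp[order(imp[, "%IncMSE"], decreasing = TRUE), ]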

#e)
# Random forests use m < p predictors per split; here m = 4 out of p = 10
set.seed(1)
rf_model <- randomForest(Sales ~ ., data = train_set, mtry = 4, importance = TRUE)


rf_preds <- predict(rf_model, newdata = test_set)


rf_mse <- mean((rf_preds - test_set$Sales)^2)
rf_mse
## [1] 2.687564
importance(rf_model)
##                %IncMSE IncNodePurity
## CompPrice   20.4821766     204.83212
## Income       8.4780968     173.04153
## Advertising 17.8520077     199.14167
## Population   0.1630113     123.74688
## Price       47.8792267     510.15519
## ShelveLoc   54.3368739     572.31743
## Age         19.5309367     246.00952
## Education    4.2617674      83.34553
## Urban       -1.4702934      13.48611
## US           4.3159791      22.36719
varImpPlot(rf_model)

After lowering mtry to 4, our test MSE increased slightly to 2.69. Because each split considers only a random subset of predictors, strong variables such as Price and ShelveLoc are sometimes unavailable, which can add a little bias; the sweep below shows how the test MSE varies over the full range of mtry.

mtry_vals <- 1:(ncol(train_set)-1)
mse_vals <- numeric(length(mtry_vals))

set.seed(1)
for (i in mtry_vals) {
  model <- randomForest(Sales ~ ., data = train_set, mtry = i)
  preds <- predict(model, newdata = test_set)
  mse_vals[i] <- mean((preds - test_set$Sales)^2)
}

# Plot MSE vs mtry
plot(mtry_vals, mse_vals, type = "b", pch = 19, col = "blue",
     xlab = "mtry", ylab = "Test MSE", main = "Effect of mtry on Test MSE")

#f)
library(BART)
# gbart expects numeric or factor inputs; convert any character columns to factors
train_set[] <- lapply(train_set, function(x) if (is.character(x)) as.factor(x) else x)
test_set[] <- lapply(test_set, function(x) if (is.character(x)) as.factor(x) else x)


x.train <- model.matrix(Sales ~ ., data = train_set)[, -1]
y.train <- train_set$Sales

x.test <- model.matrix(Sales ~ ., data = test_set)[, -1]
y.test <- test_set$Sales


set.seed(1)
bart_fit <- gbart(x.train, y.train, x.test)
## *****Calling gbart: type=1
## *****Data:
## data:n,p,np: 280, 11, 120
## y1,yn: -4.480036, -1.210036
## x1,x[n*p]: 143.000000, 1.000000
## xp1,xp[np*p]: 113.000000, 1.000000
## *****Number of Trees: 200
## *****Number of Cut Points: 66 ... 1
## *****burn,nd,thin: 100,1000,1
## *****Prior:beta,alpha,tau,nu,lambda,offset: 2,0.95,0.287616,3,0.200063,7.41004
## *****sigma: 1.013442
## *****w (weights): 1.000000 ... 1.000000
## *****Dirichlet:sparse,theta,omega,a,b,rho,augment: 0,0,1,0.5,1,11,0
## *****printevery: 100
## 
## MCMC
## done 0 (out of 1100)
## done 100 (out of 1100)
## done 200 (out of 1100)
## done 300 (out of 1100)
## done 400 (out of 1100)
## done 500 (out of 1100)
## done 600 (out of 1100)
## done 700 (out of 1100)
## done 800 (out of 1100)
## done 900 (out of 1100)
## done 1000 (out of 1100)
## time: 4s
## trcnt,tecnt: 1000,1000
bart_preds <- bart_fit$yhat.test.mean


bart_mse <- mean((bart_preds - y.test)^2)
bart_mse
## [1] 1.395154

The BART test MSE is 1.40, which is considerably lower than the random forest and bagging models.
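Gathering the test MSEs computed in parts (b) through (f) into one table makes the comparison explicit:

# Side-by-side comparison of the test MSEs computed above
mse_summary <- data.frame(
  model = c("Unpruned tree", "Pruned tree", "Bagging", "Random forest", "BART"),
  test_mse = c(test_mse, pruned_test_mse, bagging_mse, rf_mse, bart_mse)
)
mse_summary[order(mse_summary$test_mse), ]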

Question 11

# a) and b)


library(gbm)
library(caret)
# Remove predictors with little or no variation (near-zero variance)
Caravan_nopurchase <- Caravan[, -which(names(Caravan) == "Purchase")]
Caravan_clean <- Caravan_nopurchase[, -nearZeroVar(Caravan_nopurchase)]


train_indices <- 1:1000
test_indices <- -(1:1000)

train_X <- Caravan_clean[train_indices, ]
test_X <- Caravan_clean[test_indices, ]


train_Y <- Caravan$Purchase[train_indices]
test_Y <- Caravan$Purchase[test_indices]


# gbm with distribution = "bernoulli" needs a 0/1 numeric response
train_X$boost_Y <- as.numeric(train_Y == "Yes")


set.seed(1)
boost_model <- gbm(boost_Y ~ ., 
                   data = train_X,
                   distribution = "bernoulli",
                   n.trees = 1000,
                   shrinkage = 0.01,
                   verbose = FALSE)


summary(boost_model)

##               var     rel.inf
## PPERSAUT PPERSAUT 14.78154443
## MKOOPKLA MKOOPKLA  9.58234146
## MOPLHOOG MOPLHOOG  7.30943159
## MBERMIDD MBERMIDD  6.26752903
## PBRAND     PBRAND  4.74371471
## MGODGE     MGODGE  4.61018236
## ABRAND     ABRAND  4.55122064
## MINK3045 MINK3045  4.46936571
## MOSTYPE   MOSTYPE  2.76087330
## MAUT1       MAUT1  2.72184919
## PWAPART   PWAPART  2.71558336
## MSKA         MSKA  2.30109468
## MBERARBG MBERARBG  2.09032579
## MGODPR     MGODPR  2.06716115
## MAUT2       MAUT2  2.03420559
## MSKC         MSKC  1.94142407
## MINKGEM   MINKGEM  1.93778386
## MBERHOOG MBERHOOG  1.83038211
## MGODOV     MGODOV  1.78233230
## MFWEKIND MFWEKIND  1.48605203
## MSKB1       MSKB1  1.37904822
## MRELGE     MRELGE  1.26164627
## MINK4575 MINK4575  1.19290438
## MINK7512 MINK7512  1.04213440
## MZFONDS   MZFONDS  0.98597870
## MGODRK     MGODRK  0.91988987
## MOPLMIDD MOPLMIDD  0.91193943
## MOSHOOFD MOSHOOFD  0.90304382
## MHKOOP     MHKOOP  0.89303214
## MRELOV     MRELOV  0.86210534
## MFGEKIND MFGEKIND  0.86110052
## MAUT0       MAUT0  0.72041902
## MZPART     MZPART  0.70973738
## APERSAUT APERSAUT  0.62142035
## MINKM30   MINKM30  0.61435773
## MSKB2       MSKB2  0.54325864
## MINK123M MINK123M  0.51966269
## MGEMOMV   MGEMOMV  0.48882515
## MHHUUR     MHHUUR  0.45107353
## MBERARBO MBERARBO  0.43692224
## MSKD         MSKD  0.40295863
## MBERBOER MBERBOER  0.36700341
## MGEMLEEF MGEMLEEF  0.33869234
## MFALLEEN MFALLEEN  0.28568749
## MBERZELF MBERZELF  0.21003527
## MOPLLAAG MOPLLAAG  0.05450073
## MAANTHUI MAANTHUI  0.03822492
## MRELSA     MRELSA  0.00000000
## AWAPART   AWAPART  0.00000000
## ABROM       ABROM  0.00000000

PPERSAUT appears to be the most important predictor, with by far the highest relative influence (about 14.8%).
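To see how this predictor acts on the fitted model, gbm's plot method draws a partial dependence curve; a minimal sketch:

# Partial dependence of the boosted model on PPERSAUT
plot(boost_model, i.var = "PPERSAUT", n.trees = 1000)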

#c)

boost_probs <- predict(boost_model, newdata = test_X, n.trees = 1000, type = "response")


boost_pred <- ifelse(boost_probs > 0.2, "Yes", "No")


conf_matrix <- table(Predicted = boost_pred, Actual = test_Y)
print(conf_matrix)
##          Actual
## Predicted   No  Yes
##       No  4406  258
##       Yes  127   31
correct_yes <- sum(boost_pred == "Yes" & test_Y == "Yes")
total_predicted_yes <- sum(boost_pred == "Yes")

fraction_correct_yes <- correct_yes / total_predicted_yes
cat("Fraction of predicted 'Yes' that were correct:", round(fraction_correct_yes, 3), "\n")
## Fraction of predicted 'Yes' that were correct: 0.196

About 20% of the people we predicted would make a purchase actually did (31 of the 158 predicted 'Yes').
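The 20% cutoff is a choice, not a given; a sketch sweeping a few cutoffs (reusing `boost_probs` from above) shows the trade-off between precision and the number of customers flagged:

# Precision and number of flagged customers at several probability cutoffs
cutoffs <- c(0.10, 0.15, 0.20, 0.25, 0.30)
sapply(cutoffs, function(ct) {
  flagged <- boost_probs > ct
  c(cutoff = ct,
    n_flagged = sum(flagged),
    precision = sum(flagged & test_Y == "Yes") / max(sum(flagged), 1))
})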

# logistic comparison and KNN comparison


glm_model <- glm(Purchase ~ ., data = Caravan[1:1000, ], family = binomial)


glm_probs <- predict(glm_model, Caravan[-(1:1000), ], type = "response")


glm_pred <- ifelse(glm_probs > 0.2, "Yes", "No")


glm_cm <- table(Predicted = glm_pred, Actual = test_Y)
print(glm_cm)
##          Actual
## Predicted   No  Yes
##       No  4183  231
##       Yes  350   58
glm_correct_yes <- sum(glm_pred == "Yes" & test_Y == "Yes")
glm_total_predicted_yes <- sum(glm_pred == "Yes")
glm_precision <- glm_correct_yes / glm_total_predicted_yes
cat("Logistic Regression - Precision:", round(glm_precision, 3), "\n")
## Logistic Regression - Precision: 0.142

Logistic regression's precision (0.142) is lower than boosting's (0.196): it flags far more customers (408 vs. 158), but a smaller fraction of them actually purchase.

library(class)


standardized_X <- scale(Caravan[, -86])  # Exclude Purchase column
train_X_std <- standardized_X[1:1000, ]
test_X_std <- standardized_X[-(1:1000), ]


knn_pred <- knn(train = train_X_std, test = test_X_std, cl = train_Y, k = 5)


knn_cm <- table(Predicted = knn_pred, Actual = test_Y)
print(knn_cm)
##          Actual
## Predicted   No  Yes
##       No  4506  279
##       Yes   27   10
# Precision
knn_correct_yes <- sum(knn_pred == "Yes" & test_Y == "Yes")
knn_total_predicted_yes <- sum(knn_pred == "Yes")
knn_precision <- knn_correct_yes / knn_total_predicted_yes
cat("KNN - Precision:", round(knn_precision, 3), "\n")
## KNN - Precision: 0.27

We got the highest precision, 0.27, with KNN, although it flags only 37 customers (catching 10 actual purchasers), so its higher precision comes with much lower coverage.
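For reference, the three precision values computed above side by side:

# Summary of precision across the three classifiers
precision_summary <- data.frame(
  model = c("Boosting (cutoff 0.2)", "Logistic regression (cutoff 0.2)", "KNN (k = 5)"),
  precision = round(c(fraction_correct_yes, glm_precision, knn_precision), 3)
)
precision_summary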