library(randomForest)
library(ISLR2)
library(tidyverse)
library(tree)
set.seed(1)
train_idx <- sample(1:nrow(Boston), nrow(Boston)/2)
train <- Boston[train_idx, ]
test <- Boston[-train_idx, ]
mtry_vals <- 1:12
ntree_vals <- c(25, 100, 250, 500)
results <- expand.grid(mtry = mtry_vals, ntree = ntree_vals)
results$test_mse <- NA
for (i in 1:nrow(results)) {
  fit <- randomForest(medv ~ ., data = train,
                      mtry = results$mtry[i],
                      ntree = results$ntree[i])
  preds <- predict(fit, newdata = test)
  results$test_mse[i] <- mean((preds - test$medv)^2)
}
# Plot
library(ggplot2)
ggplot(results, aes(x = mtry, y = test_mse, color = as.factor(ntree))) +
  geom_line(linewidth = 1.1) +
  labs(title = "Test MSE vs. mtry for different ntree values",
       x = "mtry", y = "Test MSE", color = "ntree") +
  theme_minimal()
We observe that mtry values of around 2 or 3 give (nearly) the lowest test MSE for each of the four ntree values. As mtry increases toward p = 12 the model approaches bagging, and the test error generally increases because the trees become more highly correlated.
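Rather than eyeballing the plot, we can also read the minimizing combination straight off the results grid (a quick check using only the objects built above):
# Grid row with the lowest test MSE
results[which.min(results$test_mse), ]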
#a)
train_indices <- sample(1:nrow(Carseats),size = 0.7 * nrow(Carseats))
train_set <- Carseats[train_indices,]
test_set <- Carseats[-train_indices,]
#b)
tree_model <- tree(Sales ~ ., data = train_set)
plot(tree_model)
text(tree_model, pretty = 0)
tree_preds <- predict(tree_model, newdata = test_set)
test_mse <- mean((tree_preds - test_set$Sales)^2)
test_mse
## [1] 4.417454
The tree is quite dense, with many terminal nodes. The test MSE is about 4.42.
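To quantify how dense the tree is, summary() reports the terminal-node count directly (a quick check on the tree_model fit above):
# Number of terminal nodes in the unpruned tree
summary(tree_model)$size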
#c)
cv_tree <- cv.tree(tree_model)
plot(cv_tree$size, cv_tree$dev, type = "b", xlab = "Tree Size", ylab = "Deviance")
pruned_tree <- prune.tree(tree_model, best = cv_tree$size[which.min(cv_tree$dev)])
plot(pruned_tree)
text(pruned_tree, pretty = 0)
pruned_tree_preds <- predict(pruned_tree, newdata = test_set)
pruned_test_mse <- mean((pruned_tree_preds - test_set$Sales)^2)
pruned_test_mse
## [1] 4.572979
The pruned tree gives a slightly higher test MSE (4.57 vs. 4.42). Cross-validation chose a smaller tree, but pruning may have removed splits that captured genuine structure, introducing some underfitting.
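One way to see how aggressive the pruning was is to compare terminal-node counts before and after (again using only objects already defined):
# Terminal nodes: unpruned vs. pruned
c(unpruned = summary(tree_model)$size, pruned = summary(pruned_tree)$size)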
#d)
set.seed(1)
bagging_model <- randomForest(Sales ~ ., data = train_set, mtry = ncol(train_set) - 1, importance = TRUE)
bagging_preds <- predict(bagging_model, newdata = test_set)
bagging_mse <- mean((bagging_preds - test_set$Sales)^2)
bagging_mse
## [1] 2.669637
importance(bagging_model)
## %IncMSE IncNodePurity
## CompPrice 32.6596414 247.328054
## Income 14.7515801 130.944978
## Advertising 23.5984697 156.075304
## Population -0.5546660 83.186230
## Price 65.8234351 562.687705
## ShelveLoc 74.6603552 695.433319
## Age 22.2480466 219.188112
## Education 5.2878833 63.313801
## Urban 0.4913305 9.764823
## US 1.9844981 8.139650
varImpPlot(bagging_model)
Price and ShelveLoc appear to be the most important predictors, as both the %IncMSE and IncNodePurity measures suggest. The bagging test MSE is 2.67, a clear improvement over the single tree.
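The importance matrix can also be sorted so the ranking is explicit rather than read off the plot (a small convenience step, not part of the original output):
# Predictors ranked by permutation importance (%IncMSE)
imp <- importance(bagging_model)
imp[order(imp[, "%IncMSE"], decreasing = TRUE), ]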
#e)
# Random forests use m < p; here we try m = 4 of the p = 10 predictors
set.seed(1)
rf_model <- randomForest(Sales ~ ., data = train_set, mtry = 4, importance = TRUE)
rf_preds <- predict(rf_model, newdata = test_set)
rf_mse <- mean((rf_preds - test_set$Sales)^2)
rf_mse
## [1] 2.687564
importance(rf_model)
## %IncMSE IncNodePurity
## CompPrice 20.4821766 204.83212
## Income 8.4780968 173.04153
## Advertising 17.8520077 199.14167
## Population 0.1630113 123.74688
## Price 47.8792267 510.15519
## ShelveLoc 54.3368739 572.31743
## Age 19.5309367 246.00952
## Education 4.2617674 83.34553
## Urban -1.4702934 13.48611
## US 4.3159791 22.36719
varImpPlot(rf_model)
Lowering mtry to 4 increased the test MSE slightly, to 2.69 from 2.67 for bagging. With a small m, strong predictors such as Price and ShelveLoc are sometimes excluded from the candidate split variables, which can add bias. The loop below sweeps over the full range of mtry values to show the effect.
mtry_vals <- 1:(ncol(train_set)-1)
mse_vals <- numeric(length(mtry_vals))
set.seed(1)
for (i in mtry_vals) {
  model <- randomForest(Sales ~ ., data = train_set, mtry = i)
  preds <- predict(model, newdata = test_set)
  mse_vals[i] <- mean((preds - test_set$Sales)^2)
}
# Plot MSE vs mtry
plot(mtry_vals, mse_vals, type = "b", pch = 19, col = "blue",
     xlab = "mtry", ylab = "Test MSE", main = "Effect of mtry on Test MSE")
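To report the winner explicitly rather than reading it off the plot (using only mtry_vals and mse_vals from above):
# Best mtry and its test MSE
mtry_vals[which.min(mse_vals)]
min(mse_vals)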
#f)
library(BART)
train_set[] <- lapply(train_set, function(x) if (is.character(x)) as.factor(x) else x)
test_set[] <- lapply(test_set, function(x) if (is.character(x)) as.factor(x) else x)
x.train <- model.matrix(Sales ~ ., data = train_set)[, -1]
y.train <- train_set$Sales
x.test <- model.matrix(Sales ~ ., data = test_set)[, -1]
y.test <- test_set$Sales
set.seed(1)
bart_fit <- gbart(x.train, y.train, x.test)
## *****Calling gbart: type=1
## *****Data:
## data:n,p,np: 280, 11, 120
## y1,yn: -4.480036, -1.210036
## x1,x[n*p]: 143.000000, 1.000000
## xp1,xp[np*p]: 113.000000, 1.000000
## *****Number of Trees: 200
## *****Number of Cut Points: 66 ... 1
## *****burn,nd,thin: 100,1000,1
## *****Prior:beta,alpha,tau,nu,lambda,offset: 2,0.95,0.287616,3,0.200063,7.41004
## *****sigma: 1.013442
## *****w (weights): 1.000000 ... 1.000000
## *****Dirichlet:sparse,theta,omega,a,b,rho,augment: 0,0,1,0.5,1,11,0
## *****printevery: 100
##
## MCMC
## done 0 (out of 1100)
## done 100 (out of 1100)
## done 200 (out of 1100)
## done 300 (out of 1100)
## done 400 (out of 1100)
## done 500 (out of 1100)
## done 600 (out of 1100)
## done 700 (out of 1100)
## done 800 (out of 1100)
## done 900 (out of 1100)
## done 1000 (out of 1100)
## time: 4s
## trcnt,tecnt: 1000,1000
bart_preds <- bart_fit$yhat.test.mean
bart_mse <- mean((bart_preds - y.test)^2)
bart_mse
## [1] 1.395154
The BART test MSE is about 1.40, considerably lower than both the random forest and bagging models.
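A compact side-by-side comparison of every model fit on Carseats so far (built from the MSE objects computed above):
# Test MSEs of all Carseats models fit above
data.frame(model = c("tree", "pruned tree", "bagging", "random forest", "BART"),
           test_mse = c(test_mse, pruned_test_mse, bagging_mse, rf_mse, bart_mse))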
# a) and b)
library(gbm)
## Loaded gbm 2.2.2
## This version of gbm is no longer under development. Consider transitioning to gbm3, https://github.com/gbm-developers/gbm3
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
##
## cluster
## The following object is masked from 'package:purrr':
##
## lift
# Remove predictors with little or no variation (near-zero variance)
Caravan_nopurchase <- Caravan[, -which(names(Caravan) == "Purchase")]
Caravan_clean <- Caravan_nopurchase[, -nearZeroVar(Caravan_nopurchase)]
train_indices <- 1:1000
test_indices <- -(1:1000)
train_X <- Caravan_clean[train_indices, ]
test_X <- Caravan_clean[test_indices, ]
train_Y <- Caravan$Purchase[train_indices]
test_Y <- Caravan$Purchase[test_indices]
train_X$boost_Y <- as.numeric(train_Y == "Yes")
set.seed(1)
boost_model <- gbm(boost_Y ~ .,
                   data = train_X,
                   distribution = "bernoulli",
                   n.trees = 1000,
                   shrinkage = 0.01,
                   verbose = FALSE)
summary(boost_model)
## var rel.inf
## PPERSAUT PPERSAUT 14.78154443
## MKOOPKLA MKOOPKLA 9.58234146
## MOPLHOOG MOPLHOOG 7.30943159
## MBERMIDD MBERMIDD 6.26752903
## PBRAND PBRAND 4.74371471
## MGODGE MGODGE 4.61018236
## ABRAND ABRAND 4.55122064
## MINK3045 MINK3045 4.46936571
## MOSTYPE MOSTYPE 2.76087330
## MAUT1 MAUT1 2.72184919
## PWAPART PWAPART 2.71558336
## MSKA MSKA 2.30109468
## MBERARBG MBERARBG 2.09032579
## MGODPR MGODPR 2.06716115
## MAUT2 MAUT2 2.03420559
## MSKC MSKC 1.94142407
## MINKGEM MINKGEM 1.93778386
## MBERHOOG MBERHOOG 1.83038211
## MGODOV MGODOV 1.78233230
## MFWEKIND MFWEKIND 1.48605203
## MSKB1 MSKB1 1.37904822
## MRELGE MRELGE 1.26164627
## MINK4575 MINK4575 1.19290438
## MINK7512 MINK7512 1.04213440
## MZFONDS MZFONDS 0.98597870
## MGODRK MGODRK 0.91988987
## MOPLMIDD MOPLMIDD 0.91193943
## MOSHOOFD MOSHOOFD 0.90304382
## MHKOOP MHKOOP 0.89303214
## MRELOV MRELOV 0.86210534
## MFGEKIND MFGEKIND 0.86110052
## MAUT0 MAUT0 0.72041902
## MZPART MZPART 0.70973738
## APERSAUT APERSAUT 0.62142035
## MINKM30 MINKM30 0.61435773
## MSKB2 MSKB2 0.54325864
## MINK123M MINK123M 0.51966269
## MGEMOMV MGEMOMV 0.48882515
## MHHUUR MHHUUR 0.45107353
## MBERARBO MBERARBO 0.43692224
## MSKD MSKD 0.40295863
## MBERBOER MBERBOER 0.36700341
## MGEMLEEF MGEMLEEF 0.33869234
## MFALLEEN MFALLEEN 0.28568749
## MBERZELF MBERZELF 0.21003527
## MOPLLAAG MOPLLAAG 0.05450073
## MAANTHUI MAANTHUI 0.03822492
## MRELSA MRELSA 0.00000000
## AWAPART AWAPART 0.00000000
## ABROM ABROM 0.00000000
PPERSAUT appears to be the most important predictor, with the highest relative influence (about 14.8%).
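If only the top of the ranking is needed, summary() can be truncated and its plot suppressed (plotit is a standard summary.gbm argument):
# Top five predictors by relative influence
head(summary(boost_model, plotit = FALSE), 5)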
#c)
boost_probs <- predict(boost_model, newdata = test_X, n.trees = 1000, type = "response")
boost_pred <- ifelse(boost_probs > 0.2, "Yes", "No")
conf_matrix <- table(Predicted = boost_pred, Actual = test_Y)
print(conf_matrix)
## Actual
## Predicted No Yes
## No 4406 258
## Yes 127 31
correct_yes <- sum(boost_pred == "Yes" & test_Y == "Yes")
total_predicted_yes <- sum(boost_pred == "Yes")
fraction_correct_yes <- correct_yes / total_predicted_yes
cat("Fraction of predicted 'Yes' that were correct:", round(fraction_correct_yes, 3), "\n")
## Fraction of predicted 'Yes' that were correct: 0.196
About 20% of the people we predicted would purchase actually did (31 of 158).
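The 20% cutoff is a choice, so it is worth checking how precision moves with it (a quick sketch reusing boost_probs from above):
# Precision of the boosting model at a few alternative cutoffs
for (cut in c(0.10, 0.15, 0.20, 0.25, 0.30)) {
  pred <- ifelse(boost_probs > cut, "Yes", "No")
  cat("cutoff", cut, "- predicted Yes:", sum(pred == "Yes"),
      "- precision:", round(mean(test_Y[pred == "Yes"] == "Yes"), 3), "\n")
}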
# logistic comparison and KNN comparison
glm_model <- glm(Purchase ~ ., data = Caravan[1:1000, ], family = binomial)
glm_probs <- predict(glm_model, Caravan[-(1:1000), ], type = "response")
glm_pred <- ifelse(glm_probs > 0.2, "Yes", "No")
glm_cm <- table(Predicted = glm_pred, Actual = test_Y)
print(glm_cm)
## Actual
## Predicted No Yes
## No 4183 231
## Yes 350 58
glm_correct_yes <- sum(glm_pred == "Yes" & test_Y == "Yes")
glm_total_predicted_yes <- sum(glm_pred == "Yes")
glm_precision <- glm_correct_yes / glm_total_predicted_yes
cat("Logistic Regression - Precision:", round(glm_precision, 3), "\n")
## Logistic Regression - Precision: 0.142
Logistic regression's precision (0.142) is lower than boosting's (0.196) at the same 0.2 cutoff.
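Since the same precision calculation recurs for each model, a small helper keeps the comparisons identical (the helper name is ours, not from the exercise):
# Hypothetical helper: fraction of predicted positives that are true positives
precision <- function(pred, actual, positive = "Yes") {
  sum(pred == positive & actual == positive) / sum(pred == positive)
}
precision(boost_pred, test_Y)  # boosting: ~0.196
precision(glm_pred, test_Y)    # logistic: ~0.142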
library(class)
library(caret)
standardized_X <- scale(Caravan[, -86]) # Exclude Purchase column
train_X_std <- standardized_X[1:1000, ]
test_X_std <- standardized_X[-(1:1000), ]
knn_pred <- knn(train = train_X_std, test = test_X_std, cl = train_Y, k = 5)
knn_cm <- table(Predicted = knn_pred, Actual = test_Y)
print(knn_cm)
## Actual
## Predicted No Yes
## No 4506 279
## Yes 27 10
# Precision
knn_correct_yes <- sum(knn_pred == "Yes" & test_Y == "Yes")
knn_total_predicted_yes <- sum(knn_pred == "Yes")
knn_precision <- knn_correct_yes / knn_total_predicted_yes
cat("KNN - Precision:", round(knn_precision, 3), "\n")
## KNN - Precision: 0.27
KNN with k = 5 gives the highest precision, 0.27, although it flags far fewer customers as "Yes" (37) than boosting does (158).
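Precision for KNN also depends on k, so a quick sweep gives context for the k = 5 result (ties in knn() are broken at random, hence the seed):
# Precision for a few values of k
set.seed(1)
for (k in c(1, 3, 5, 10)) {
  pred <- knn(train = train_X_std, test = test_X_std, cl = train_Y, k = k)
  cat("k =", k, "- precision:",
      round(sum(pred == "Yes" & test_Y == "Yes") / sum(pred == "Yes"), 3), "\n")
}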