# Install/load packages if needed
# install.packages(c("MASS", "randomForest", "ggplot2", "reshape2"))
library(MASS) # for the Boston data
library(randomForest) # for random forests
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
##
## margin
library(reshape2)
set.seed(1)
# 1) Split Boston data into training and test sets
train_idx = sample(1:nrow(Boston), nrow(Boston)/2)
Boston_train = Boston[train_idx, ]
Boston_test = Boston[-train_idx, ]
# 2) We'll loop over multiple values of mtry and ntree
mtry_values = c(2, 4, 6, 8, 10, 12, 13) # some range up to p=13
ntree_values = c(25, 50, 100, 200, 300, 500)
# We'll store Test MSE in a matrix
test_mse = matrix(NA, nrow=length(mtry_values), ncol=length(ntree_values),
                  dimnames=list(mtry_values, ntree_values))
# 3) Train random forests for each (mtry, ntree) and record Test MSE
for(i in seq_along(mtry_values)) {
  for(j in seq_along(ntree_values)) {
    rf_fit = randomForest(medv ~ ., data=Boston_train,
                          mtry=mtry_values[i],
                          ntree=ntree_values[j])
    yhat_test = predict(rf_fit, newdata=Boston_test)
    test_mse[i,j] = mean((yhat_test - Boston_test$medv)^2)
  }
}
# 4) Reshape the matrix for ggplot
df = data.frame(
  expand.grid(mtry=mtry_values, ntree=ntree_values),
  TestMSE = as.vector(test_mse)
)
# 5) Plot (Test MSE) vs (ntree), grouping by mtry
ggplot(df, aes(x=ntree, y=TestMSE, color=factor(mtry))) +
  geom_line() + geom_point() +
  labs(x="Number of Trees (ntree)", y="Test MSE", color="mtry") +
  ggtitle("Random Forest on Boston: Test MSE vs. ntree and mtry")
-From the plot we can see that the test MSE falls and then levels off as ntree grows, confirming that adding trees reduces variance until a plateau is reached, so there is little benefit beyond a few hundred trees. A moderate mtry (rather than the smallest value or all 13 predictors) appears to give the lowest test MSE, and the plot lets us trade predictive accuracy against computation when choosing both tuning parameters; the quick check below pulls out the best combination.
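-As a quick check, the (mtry, ntree) combination with the lowest test MSE can be read off the test_mse matrix directly (a small sketch, reusing the objects defined above):
# Locate the minimizing (mtry, ntree) pair in the test_mse matrix
best_idx = which(test_mse == min(test_mse), arr.ind = TRUE)
c(mtry = mtry_values[best_idx[1, "row"]],
  ntree = ntree_values[best_idx[1, "col"]],
  TestMSE = min(test_mse))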
library(ISLR)
set.seed(1)
myCarseats = Carseats
#Split
train_idx = sample(1:nrow(myCarseats), nrow(myCarseats) / 2)
Carseats_train = myCarseats[train_idx, ]
Carseats_test = myCarseats[-train_idx, ]
dim(Carseats_train); dim(Carseats_test)
## [1] 200 11
## [1] 200 11
library(tree)
tree_carseats = tree(Sales ~ ., data=Carseats_train)
summary(tree_carseats)
##
## Regression tree:
## tree(formula = Sales ~ ., data = Carseats_train)
## Variables actually used in tree construction:
## [1] "ShelveLoc" "Price" "Age" "Advertising" "CompPrice"
## [6] "US"
## Number of terminal nodes: 18
## Residual mean deviance: 2.167 = 394.3 / 182
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -3.88200 -0.88200 -0.08712 0.00000 0.89590 4.09900
plot(tree_carseats)
text(tree_carseats, pretty=0)
yhat_test = predict(tree_carseats, newdata=Carseats_test)
mse_test = mean((Carseats_test$Sales - yhat_test)^2)
mse_test
## [1] 4.922039
set.seed(2)
cv_carseats = cv.tree(tree_carseats) # By default, dev = RSS for regression
cv_carseats
## $size
## [1] 18 17 16 15 14 13 12 11 10 8 7 6 5 4 3 2 1
##
## $dev
## [1] 951.8396 977.9143 1024.6533 990.3214 1004.0541 1010.9744 1028.3245
## [8] 1049.2791 1049.2791 1055.0354 1080.4359 1087.1440 1088.6387 1245.5574
## [15] 1260.6489 1309.6971 1606.3932
##
## $k
## [1] -Inf 16.99544 20.56322 25.01730 25.57104 28.01938 30.36962
## [8] 31.56747 31.80816 40.75445 44.44673 52.57126 76.21881 99.59459
## [15] 116.69889 159.79501 337.60153
##
## $method
## [1] "deviance"
##
## attr(,"class")
## [1] "prune" "tree.sequence"
# Plot cross-validation results
plot(cv_carseats$size, cv_carseats$dev, type="b",
     xlab="Tree Size (number of leaves)",
     ylab="Deviance (RSS)")
# Choose the tree size with the lowest CV deviance (here the full tree, size 18)
prune_size = cv_carseats$size[which.min(cv_carseats$dev)]
pruned_carseats = prune.tree(tree_carseats, best=prune_size)
# Visualize the pruned tree
plot(pruned_carseats)
text(pruned_carseats, pretty=0)
yhat_pruned = predict(pruned_carseats, newdata=Carseats_test)
mse_pruned = mean((Carseats_test$Sales - yhat_pruned)^2)
mse_pruned
## [1] 4.922039
-The test MSE is unchanged at 4.922039: cross-validation selected the full tree (18 terminal nodes), so the "pruned" tree is identical to the unpruned one and pruning does not improve test performance here.
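-If a smaller, more interpretable tree were preferred anyway, a specific size could be forced and its test MSE compared (a sketch; the choice of 5 terminal nodes here is arbitrary):
# Prune to a hypothetical smaller size and re-check test MSE
pruned_small = prune.tree(tree_carseats, best = 5)
yhat_small = predict(pruned_small, newdata = Carseats_test)
mean((Carseats_test$Sales - yhat_small)^2)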
library(randomForest)
#Fit a bagged model
bag_carseats = randomForest(Sales ~ .,
                            data=Carseats_train,
                            mtry = ncol(Carseats_train) - 1, # use all 10 predictors => bagging
                            ntree=500,
                            importance=TRUE # to measure variable importance
                            )
bag_carseats
##
## Call:
## randomForest(formula = Sales ~ ., data = Carseats_train, mtry = ncol(Carseats_train) - 1, ntree = 500, importance = TRUE)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 10
##
## Mean of squared residuals: 2.916299
## % Var explained: 62.91
#Predict and compute MSE
yhat_bag = predict(bag_carseats, newdata=Carseats_test)
mse_bag = mean((Carseats_test$Sales - yhat_bag)^2)
mse_bag
## [1] 2.578176
importance(bag_carseats)
## %IncMSE IncNodePurity
## CompPrice 25.4243372 174.431337
## Income 5.3242813 89.887966
## Advertising 12.4692482 99.205448
## Population -2.9416460 58.023973
## Price 57.6347806 501.739169
## ShelveLoc 47.3143303 371.757111
## Age 15.1027960 153.706822
## Education 0.2109316 44.840535
## Urban -0.5196792 9.235927
## US 4.3931355 13.553818
-Bagging brings the test MSE down to 2.578176, roughly half that of the single regression tree (4.922039). The importance measures single out Price and ShelveLoc as by far the most important predictors; a plot of these measures follows.
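-The two importance measures can also be visualized directly (a sketch using randomForest's built-in dot chart):
# Dot chart of %IncMSE and IncNodePurity for the bagged model
varImpPlot(bag_carseats, main = "Variable importance: bagged Carseats model")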
rf_carseats = randomForest(Sales ~ .,
                           data=Carseats_train,
                           mtry=4,
                           ntree=500,
                           importance=TRUE)
rf_carseats
##
## Call:
## randomForest(formula = Sales ~ ., data = Carseats_train, mtry = 4, ntree = 500, importance = TRUE)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 4
##
## Mean of squared residuals: 3.06645
## % Var explained: 61
# Predict & compute Test MSE
yhat_rf = predict(rf_carseats, newdata=Carseats_test)
mse_rf = mean((Carseats_test$Sales - yhat_rf)^2)
mse_rf
## [1] 2.827557
importance(rf_carseats)
## %IncMSE IncNodePurity
## CompPrice 16.6624219 152.45377
## Income 2.0976260 117.76796
## Advertising 11.1456432 107.66154
## Population -2.9389194 92.42950
## Price 42.1123023 431.35652
## ShelveLoc 37.1446832 315.92397
## Age 13.4594288 177.88760
## Education 0.3914409 66.45556
## Urban -0.4473131 13.29531
## US 6.0509180 28.64644
-A small mtry reduces the correlation among the trees but may underuse the most relevant predictors, whereas a large mtry makes the trees more similar to one another (approaching bagging). In practice we pick the mtry that yields the lowest test (or out-of-bag) MSE, as in the sketch below.
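-A sketch of how that choice could be made empirically on this split, looping over a few candidate mtry values (the grid below is illustrative):
# Compare test MSE across candidate mtry values, holding ntree at 500
mtry_grid = c(2, 4, 6, 8, 10)
carseats_mse = sapply(mtry_grid, function(m) {
  fit = randomForest(Sales ~ ., data = Carseats_train, mtry = m, ntree = 500)
  mean((Carseats_test$Sales - predict(fit, newdata = Carseats_test))^2)
})
data.frame(mtry = mtry_grid, TestMSE = carseats_mse)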
library(dbarts)
set.seed(3)
bart_fit = bart2(Sales ~ ., data=Carseats_train,
                 test=Carseats_test,
                 verbose=FALSE)
yhat_test_bart = bart_fit$yhat.test.mean
mse_bart = mean((Carseats_test$Sales - yhat_test_bart)^2)
mse_bart
## [1] 1.441054
-BART gives the lowest test MSE of the approaches tried (about 1.44, versus 4.92 for the single tree, 2.58 for bagging, and 2.83 for the random forest), so it appears to be the most accurate method on this split; the summary below collects the numbers.
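-Collecting the test MSEs computed above makes the comparison explicit (a small summary, assuming the earlier objects are still in the workspace):
# Side-by-side Carseats test MSEs
data.frame(Method = c("Single tree", "Pruned tree", "Bagging", "Random forest", "BART"),
           TestMSE = c(mse_test, mse_pruned, mse_bag, mse_rf, mse_bart))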
library(ISLR)
caravan_train <- Caravan[1:1000, ]
caravan_test <- Caravan[1001:nrow(Caravan), ]
dim(caravan_train) # Should be 1000 x 86
## [1] 1000 86
dim(caravan_test)
## [1] 4822 86
caravan_train$Purchase <- ifelse(caravan_train$Purchase == "Yes", 1, 0)
caravan_test$Purchase <- ifelse(caravan_test$Purchase == "Yes", 1, 0)
library(gbm)
## Loaded gbm 2.2.2
## This version of gbm is no longer under development. Consider transitioning to gbm3, https://github.com/gbm-developers/gbm3
set.seed(1)
# Fit the boosting model
boost_caravan <- gbm(Purchase ~ .,
                     data = caravan_train,
                     distribution = "bernoulli",
                     n.trees = 1000,
                     interaction.depth = 4,
                     shrinkage = 0.01,
                     n.minobsinnode = 10,
                     verbose = FALSE)
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 50: PVRAAUT has no variation.
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 71: AVRAAUT has no variation.
summary(boost_caravan)
## var rel.inf
## PPERSAUT PPERSAUT 7.480819014
## MOPLHOOG MOPLHOOG 4.882054338
## MGODGE MGODGE 4.838869962
## MKOOPKLA MKOOPKLA 4.507280400
## MOSTYPE MOSTYPE 3.902338079
## MGODPR MGODPR 3.547892360
## PBRAND PBRAND 3.539487907
## MBERMIDD MBERMIDD 3.518082698
## MBERARBG MBERARBG 3.309004843
## MINK3045 MINK3045 3.175313873
## MSKC MSKC 3.123008472
## MSKA MSKA 2.685844523
## MAUT2 MAUT2 2.685548007
## MAUT1 MAUT1 2.322786246
## PWAPART PWAPART 2.316252267
## MSKB1 MSKB1 2.279820190
## MRELOV MRELOV 2.092410309
## MFWEKIND MFWEKIND 2.017651081
## MBERHOOG MBERHOOG 1.961378700
## MBERARBO MBERARBO 1.862074416
## MRELGE MRELGE 1.815276446
## MINK7512 MINK7512 1.812894054
## MINKM30 MINKM30 1.808781053
## MOPLMIDD MOPLMIDD 1.757784665
## MFGEKIND MFGEKIND 1.741172971
## MGODOV MGODOV 1.701539077
## MZFONDS MZFONDS 1.641658796
## MFALLEEN MFALLEEN 1.517763739
## MSKB2 MSKB2 1.480397941
## MINK4575 MINK4575 1.466410983
## MAUT0 MAUT0 1.403097259
## ABRAND ABRAND 1.375696683
## MHHUUR MHHUUR 1.287672857
## MINKGEM MINKGEM 1.216351643
## MHKOOP MHKOOP 1.154970948
## MGEMLEEF MGEMLEEF 1.068800262
## MGODRK MGODRK 1.056066524
## MRELSA MRELSA 1.025383382
## MZPART MZPART 0.999705745
## MSKD MSKD 0.917077921
## MGEMOMV MGEMOMV 0.893757812
## MBERZELF MBERZELF 0.788935429
## APERSAUT APERSAUT 0.784652995
## MOPLLAAG MOPLLAAG 0.732210597
## MOSHOOFD MOSHOOFD 0.618703929
## PMOTSCO PMOTSCO 0.481824116
## PLEVEN PLEVEN 0.410808274
## PBYSTAND PBYSTAND 0.326851643
## MBERBOER MBERBOER 0.311571820
## MINK123M MINK123M 0.169710044
## MAANTHUI MAANTHUI 0.122660387
## ALEVEN ALEVEN 0.051158218
## PAANHANG PAANHANG 0.006040057
## PFIETS PFIETS 0.004694048
## PWABEDR PWABEDR 0.000000000
## PWALAND PWALAND 0.000000000
## PBESAUT PBESAUT 0.000000000
## PVRAAUT PVRAAUT 0.000000000
## PTRACTOR PTRACTOR 0.000000000
## PWERKT PWERKT 0.000000000
## PBROM PBROM 0.000000000
## PPERSONG PPERSONG 0.000000000
## PGEZONG PGEZONG 0.000000000
## PWAOREG PWAOREG 0.000000000
## PZEILPL PZEILPL 0.000000000
## PPLEZIER PPLEZIER 0.000000000
## PINBOED PINBOED 0.000000000
## AWAPART AWAPART 0.000000000
## AWABEDR AWABEDR 0.000000000
## AWALAND AWALAND 0.000000000
## ABESAUT ABESAUT 0.000000000
## AMOTSCO AMOTSCO 0.000000000
## AVRAAUT AVRAAUT 0.000000000
## AAANHANG AAANHANG 0.000000000
## ATRACTOR ATRACTOR 0.000000000
## AWERKT AWERKT 0.000000000
## ABROM ABROM 0.000000000
## APERSONG APERSONG 0.000000000
## AGEZONG AGEZONG 0.000000000
## AWAOREG AWAOREG 0.000000000
## AZEILPL AZEILPL 0.000000000
## APLEZIER APLEZIER 0.000000000
## AFIETS AFIETS 0.000000000
## AINBOED AINBOED 0.000000000
## ABYSTAND ABYSTAND 0.000000000
-The variable PPERSAUT is by far the most influential, followed by MOPLHOOG, MGODGE, and MKOOPKLA, which all have very similar relative influence; a partial dependence sketch for PPERSAUT follows.
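-To see how the top-ranked predictor relates to the fitted purchase probability, a partial dependence plot could be drawn (a sketch using gbm's plot method):
# Partial dependence of the boosted model on PPERSAUT
plot(boost_caravan, i.var = "PPERSAUT", n.trees = 1000)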
yhat_boost <- predict(boost_caravan,
                      newdata = caravan_test,
                      n.trees = 1000,
                      type = "response")
pred_class <- ifelse(yhat_boost > 0.20, 1, 0)
conf_matrix <- table(Predicted = pred_class, Actual = caravan_test$Purchase)
print(conf_matrix)
## Actual
## Predicted 0 1
## 0 4336 258
## 1 197 31
accuracy <- mean(pred_class == caravan_test$Purchase)
misclassification_rate <- mean(pred_class != caravan_test$Purchase)
cat("Accuracy:", accuracy, "\n")
## Accuracy: 0.9056408
cat("Misclassification Rate:", misclassification_rate, "\n")
## Misclassification Rate: 0.09435919
-From the confusion matrix, of the 228 test individuals the model labeled "Yes" (predicted probability above 0.20), 31, or about 13.6% (31/(31+197)), actually made a purchase. That is well above the roughly 6% purchase rate in the test set (289/4822), so targeting the model's predicted purchasers is far more efficient than contacting people at random.
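-That fraction (the precision for the "Yes" class) can be computed directly from the confusion matrix:
# Predicted purchasers who actually purchased
conf_matrix["1", "1"] / sum(conf_matrix["1", ])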
-From the pictured results, logistic regression performs slightly worse than the boosting model, while KNN shows a noticeable improvement in accuracy over both. I am including a picture of the results rather than the code because my code produced a knitting error that I could not fix; a sketch of comparable code is given below for reference.
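-For reference, a comparison along those lines could be reproduced with code like the following (a sketch rather than the exact code behind the pictured results; k = 5 for KNN is an arbitrary choice, and zero-variance predictors such as PVRAAUT/AVRAAUT are dropped before standardizing):
# Logistic regression on the same split, classified with the 0.20 cutoff
glm_fit <- glm(Purchase ~ ., data = caravan_train, family = binomial)
glm_prob <- predict(glm_fit, newdata = caravan_test, type = "response")
table(Predicted = ifelse(glm_prob > 0.20, 1, 0), Actual = caravan_test$Purchase)
# KNN on standardized predictors (dropping constant columns before scaling)
library(class)
predictors <- setdiff(names(caravan_train), "Purchase")
keep <- predictors[sapply(caravan_train[predictors], sd) > 0]
X_train <- scale(caravan_train[keep])
X_test <- scale(caravan_test[keep],
                center = attr(X_train, "scaled:center"),
                scale = attr(X_train, "scaled:scale"))
knn_pred <- knn(X_train, X_test, cl = factor(caravan_train$Purchase), k = 5)
table(Predicted = knn_pred, Actual = caravan_test$Purchase)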