# Install/load packages if needed
# install.packages(c("MASS", "randomForest", "ggplot2", "reshape2"))
library(MASS) # for the Boston data
library(randomForest) # for random forests
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
##
## margin
library(reshape2)
set.seed(1)
# 1) Split Boston data into training and test sets
train_idx = sample(1:nrow(Boston), nrow(Boston)/2)
Boston_train = Boston[train_idx, ]
Boston_test = Boston[-train_idx, ]
# 2) We'll loop over multiple values of mtry and ntree
mtry_values = c(2, 4, 6, 8, 10, 12, 13) # some range up to p=13
ntree_values = c(25, 50, 100, 200, 300, 500)
# We'll store Test MSE in a matrix
test_mse = matrix(NA, nrow=length(mtry_values), ncol=length(ntree_values),
                  dimnames=list(mtry_values, ntree_values))
# 3) Train random forests for each (mtry, ntree) and record Test MSE
for(i in seq_along(mtry_values)) {
  for(j in seq_along(ntree_values)) {
    rf_fit = randomForest(medv ~ ., data=Boston_train,
                          mtry=mtry_values[i],
                          ntree=ntree_values[j])
    yhat_test = predict(rf_fit, newdata=Boston_test)
    test_mse[i,j] = mean((yhat_test - Boston_test$medv)^2)
  }
}
# 4) Reshape the matrix for ggplot
df = data.frame(
  expand.grid(mtry=mtry_values, ntree=ntree_values),
  TestMSE = as.vector(test_mse)
)
# 5) Plot (Test MSE) vs (ntree), grouping by mtry
ggplot(df, aes(x=ntree, y=TestMSE, color=factor(mtry))) +
  geom_line() + geom_point() +
  labs(x="Number of Trees (ntree)", y="Test MSE", color="mtry") +
  ggtitle("Random Forest on Boston: Test MSE vs. ntree and mtry")
-From the plot we can see that the test MSE falls and then levels off as ntree grows, confirming that adding trees reduces variance until a plateau is reached, so there is little benefit beyond a few hundred trees. A moderate mtry (rather than the smallest value or all 13 predictors) appears to give the lowest test MSE, and the plot lets us trade predictive accuracy against computation when choosing both tuning parameters; the quick check below pulls out the best combination.
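-As a quick check, the (mtry, ntree) combination with the lowest test MSE can be read off the test_mse matrix directly (a small sketch, reusing the objects defined above):
# Locate the minimizing (mtry, ntree) pair in the test_mse matrix
best_idx = which(test_mse == min(test_mse), arr.ind = TRUE)
c(mtry = mtry_values[best_idx[1, "row"]],
  ntree = ntree_values[best_idx[1, "col"]],
  TestMSE = min(test_mse))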
library(ISLR)
set.seed(1)
myCarseats = Carseats
#Split
train_idx = sample(1:nrow(myCarseats), nrow(myCarseats) / 2)
Carseats_train = myCarseats[train_idx, ]
Carseats_test = myCarseats[-train_idx, ]
dim(Carseats_train); dim(Carseats_test)
## [1] 200 11
## [1] 200 11
library(tree)
tree_carseats = tree(Sales ~ ., data=Carseats_train)
summary(tree_carseats)
##
## Regression tree:
## tree(formula = Sales ~ ., data = Carseats_train)
## Variables actually used in tree construction:
## [1] "ShelveLoc" "Price" "Age" "Advertising" "CompPrice"
## [6] "US"
## Number of terminal nodes: 18
## Residual mean deviance: 2.167 = 394.3 / 182
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -3.88200 -0.88200 -0.08712 0.00000 0.89590 4.09900
plot(tree_carseats)
text(tree_carseats, pretty=0)
yhat_test = predict(tree_carseats, newdata=Carseats_test)
mse_test = mean((Carseats_test$Sales - yhat_test)^2)
mse_test
## [1] 4.922039
set.seed(2)
cv_carseats = cv.tree(tree_carseats) # By default, dev = RSS for regression
cv_carseats
## $size
## [1] 18 17 16 15 14 13 12 11 10 8 7 6 5 4 3 2 1
##
## $dev
## [1] 951.8396 977.9143 1024.6533 990.3214 1004.0541 1010.9744 1028.3245
## [8] 1049.2791 1049.2791 1055.0354 1080.4359 1087.1440 1088.6387 1245.5574
## [15] 1260.6489 1309.6971 1606.3932
##
## $k
## [1] -Inf 16.99544 20.56322 25.01730 25.57104 28.01938 30.36962
## [8] 31.56747 31.80816 40.75445 44.44673 52.57126 76.21881 99.59459
## [15] 116.69889 159.79501 337.60153
##
## $method
## [1] "deviance"
##
## attr(,"class")
## [1] "prune" "tree.sequence"
# Plot cross-validation results
plot(cv_carseats$size, cv_carseats$dev, type="b",
     xlab="Tree Size (number of leaves)",
     ylab="Deviance (RSS)")
# Choose the tree size with the lowest CV deviance (here the full tree, size 18)
prune_size = cv_carseats$size[which.min(cv_carseats$dev)]
pruned_carseats = prune.tree(tree_carseats, best=prune_size)
# Visualize the pruned tree
plot(pruned_carseats)
text(pruned_carseats, pretty=0)
yhat_pruned = predict(pruned_carseats, newdata=Carseats_test)
mse_pruned = mean((Carseats_test$Sales - yhat_pruned)^2)
mse_pruned
## [1] 4.922039
-The test MSE is unchanged at 4.922039: cross-validation selected the full tree (18 terminal nodes), so the "pruned" tree is identical to the unpruned one and pruning does not improve test performance here.
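-If a smaller, more interpretable tree were preferred anyway, a specific size could be forced and its test MSE compared (a sketch; the choice of 5 terminal nodes here is arbitrary):
# Prune to a hypothetical smaller size and re-check test MSE
pruned_small = prune.tree(tree_carseats, best = 5)
yhat_small = predict(pruned_small, newdata = Carseats_test)
mean((Carseats_test$Sales - yhat_small)^2)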
library(randomForest)
#Fit a bagged model
bag_carseats = randomForest(Sales ~ .,
                            data=Carseats_train,
                            mtry = ncol(Carseats_train) - 1, # use all 10 predictors => bagging
                            ntree=500,
                            importance=TRUE # to measure variable importance
                            )
bag_carseats
##
## Call:
## randomForest(formula = Sales ~ ., data = Carseats_train, mtry = ncol(Carseats_train) - 1, ntree = 500, importance = TRUE)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 10
##
## Mean of squared residuals: 2.916299
## % Var explained: 62.91
#Predict and compute MSE
yhat_bag = predict(bag_carseats, newdata=Carseats_test)
mse_bag = mean((Carseats_test$Sales - yhat_bag)^2)
mse_bag
## [1] 2.578176
importance(bag_carseats)
## %IncMSE IncNodePurity
## CompPrice 25.4243372 174.431337
## Income 5.3242813 89.887966
## Advertising 12.4692482 99.205448
## Population -2.9416460 58.023973
## Price 57.6347806 501.739169
## ShelveLoc 47.3143303 371.757111
## Age 15.1027960 153.706822
## Education 0.2109316 44.840535
## Urban -0.5196792 9.235927
## US 4.3931355 13.553818
-Bagging brings the test MSE down to 2.578176, roughly half that of the single regression tree (4.922039). The importance measures single out Price and ShelveLoc as by far the most important predictors; a plot of these measures follows.
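-The two importance measures can also be visualized directly (a sketch using randomForest's built-in dot chart):
# Dot chart of %IncMSE and IncNodePurity for the bagged model
varImpPlot(bag_carseats, main = "Variable importance: bagged Carseats model")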
rf_carseats = randomForest(Sales ~ .,
                           data=Carseats_train,
                           mtry=4,
                           ntree=500,
                           importance=TRUE)
rf_carseats
##
## Call:
## randomForest(formula = Sales ~ ., data = Carseats_train, mtry = 4, ntree = 500, importance = TRUE)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 4
##
## Mean of squared residuals: 3.06645
## % Var explained: 61
# Predict & compute Test MSE
yhat_rf = predict(rf_carseats, newdata=Carseats_test)
mse_rf = mean((Carseats_test$Sales - yhat_rf)^2)
mse_rf
## [1] 2.827557
importance(rf_carseats)
## %IncMSE IncNodePurity
## CompPrice 16.6624219 152.45377
## Income 2.0976260 117.76796
## Advertising 11.1456432 107.66154
## Population -2.9389194 92.42950
## Price 42.1123023 431.35652
## ShelveLoc 37.1446832 315.92397
## Age 13.4594288 177.88760
## Education 0.3914409 66.45556
## Urban -0.4473131 13.29531
## US 6.0509180 28.64644
-A small mtry reduces the correlation among the trees but may underuse the most relevant predictors, whereas a large mtry makes the trees more similar to one another (approaching bagging). In practice we pick the mtry that yields the lowest test (or out-of-bag) MSE, as in the sketch below.
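-A sketch of how that choice could be made empirically on this split, looping over a few candidate mtry values (the grid below is illustrative):
# Compare test MSE across candidate mtry values, holding ntree at 500
mtry_grid = c(2, 4, 6, 8, 10)
carseats_mse = sapply(mtry_grid, function(m) {
  fit = randomForest(Sales ~ ., data = Carseats_train, mtry = m, ntree = 500)
  mean((Carseats_test$Sales - predict(fit, newdata = Carseats_test))^2)
})
data.frame(mtry = mtry_grid, TestMSE = carseats_mse)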
library(dbarts)
set.seed(3)
bart_fit = bart2(Sales ~ ., data=Carseats_train,
                 test=Carseats_test,
                 verbose=FALSE)
yhat_test_bart = bart_fit$yhat.test.mean
mse_bart = mean((Carseats_test$Sales - yhat_test_bart)^2)
mse_bart
## [1] 1.441054
-BART gives the lowest test MSE of the approaches tried (about 1.44, versus 4.92 for the single tree, 2.58 for bagging, and 2.83 for the random forest), so it appears to be the most accurate method on this split; the summary below collects the numbers.
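-Collecting the test MSEs computed above makes the comparison explicit (a small summary, assuming the earlier objects are still in the workspace):
# Side-by-side Carseats test MSEs
data.frame(Method = c("Single tree", "Pruned tree", "Bagging", "Random forest", "BART"),
           TestMSE = c(mse_test, mse_pruned, mse_bag, mse_rf, mse_bart))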
library(ISLR)
caravan_train <- Caravan[1:1000, ]
caravan_test <- Caravan[1001:nrow(Caravan), ]
dim(caravan_train) # Should be 1000 x 86
## [1] 1000 86
dim(caravan_test)
## [1] 4822 86
caravan_train$Purchase <- ifelse(caravan_train$Purchase == "Yes", 1, 0)
caravan_test$Purchase <- ifelse(caravan_test$Purchase == "Yes", 1, 0)
library(gbm)
## Loaded gbm 2.2.2
## This version of gbm is no longer under development. Consider transitioning to gbm3, https://github.com/gbm-developers/gbm3
set.seed(1)
# Fit the boosting model
boost_caravan <- gbm(Purchase ~ .,
                     data = caravan_train,
                     distribution = "bernoulli",
                     n.trees = 1000,
                     interaction.depth = 4,
                     shrinkage = 0.01,
                     n.minobsinnode = 10,
                     verbose = FALSE)
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 50: PVRAAUT has no variation.
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 71: AVRAAUT has no variation.
summary(boost_caravan)
## var rel.inf
## PPERSAUT PPERSAUT 7.480819014
## MOPLHOOG MOPLHOOG 4.882054338
## MGODGE MGODGE 4.838869962
## MKOOPKLA MKOOPKLA 4.507280400
## MOSTYPE MOSTYPE 3.902338079
## MGODPR MGODPR 3.547892360
## PBRAND PBRAND 3.539487907
## MBERMIDD MBERMIDD 3.518082698
## MBERARBG MBERARBG 3.309004843
## MINK3045 MINK3045 3.175313873
## MSKC MSKC 3.123008472
## MSKA MSKA 2.685844523
## MAUT2 MAUT2 2.685548007
## MAUT1 MAUT1 2.322786246
## PWAPART PWAPART 2.316252267
## MSKB1 MSKB1 2.279820190
## MRELOV MRELOV 2.092410309
## MFWEKIND MFWEKIND 2.017651081
## MBERHOOG MBERHOOG 1.961378700
## MBERARBO MBERARBO 1.862074416
## MRELGE MRELGE 1.815276446
## MINK7512 MINK7512 1.812894054
## MINKM30 MINKM30 1.808781053
## MOPLMIDD MOPLMIDD 1.757784665
## MFGEKIND MFGEKIND 1.741172971
## MGODOV MGODOV 1.701539077
## MZFONDS MZFONDS 1.641658796
## MFALLEEN MFALLEEN 1.517763739
## MSKB2 MSKB2 1.480397941
## MINK4575 MINK4575 1.466410983
## MAUT0 MAUT0 1.403097259
## ABRAND ABRAND 1.375696683
## MHHUUR MHHUUR 1.287672857
## MINKGEM MINKGEM 1.216351643
## MHKOOP MHKOOP 1.154970948
## MGEMLEEF MGEMLEEF 1.068800262
## MGODRK MGODRK 1.056066524
## MRELSA MRELSA 1.025383382
## MZPART MZPART 0.999705745
## MSKD MSKD 0.917077921
## MGEMOMV MGEMOMV 0.893757812
## MBERZELF MBERZELF 0.788935429
## APERSAUT APERSAUT 0.784652995
## MOPLLAAG MOPLLAAG 0.732210597
## MOSHOOFD MOSHOOFD 0.618703929
## PMOTSCO PMOTSCO 0.481824116
## PLEVEN PLEVEN 0.410808274
## PBYSTAND PBYSTAND 0.326851643
## MBERBOER MBERBOER 0.311571820
## MINK123M MINK123M 0.169710044
## MAANTHUI MAANTHUI 0.122660387
## ALEVEN ALEVEN 0.051158218
## PAANHANG PAANHANG 0.006040057
## PFIETS PFIETS 0.004694048
## PWABEDR PWABEDR 0.000000000
## PWALAND PWALAND 0.000000000
## PBESAUT PBESAUT 0.000000000
## PVRAAUT PVRAAUT 0.000000000
## PTRACTOR PTRACTOR 0.000000000
## PWERKT PWERKT 0.000000000
## PBROM PBROM 0.000000000
## PPERSONG PPERSONG 0.000000000
## PGEZONG PGEZONG 0.000000000
## PWAOREG PWAOREG 0.000000000
## PZEILPL PZEILPL 0.000000000
## PPLEZIER PPLEZIER 0.000000000
## PINBOED PINBOED 0.000000000
## AWAPART AWAPART 0.000000000
## AWABEDR AWABEDR 0.000000000
## AWALAND AWALAND 0.000000000
## ABESAUT ABESAUT 0.000000000
## AMOTSCO AMOTSCO 0.000000000
## AVRAAUT AVRAAUT 0.000000000
## AAANHANG AAANHANG 0.000000000
## ATRACTOR ATRACTOR 0.000000000
## AWERKT AWERKT 0.000000000
## ABROM ABROM 0.000000000
## APERSONG APERSONG 0.000000000
## AGEZONG AGEZONG 0.000000000
## AWAOREG AWAOREG 0.000000000
## AZEILPL AZEILPL 0.000000000
## APLEZIER APLEZIER 0.000000000
## AFIETS AFIETS 0.000000000
## AINBOED AINBOED 0.000000000
## ABYSTAND ABYSTAND 0.000000000
-The variable PPERSAUT is by far the most influential, followed by MOPLHOOG, MGODGE, and MKOOPKLA, which all have very similar relative influence; a partial dependence sketch for PPERSAUT follows.
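-To see how the top-ranked predictor relates to the fitted purchase probability, a partial dependence plot could be drawn (a sketch using gbm's plot method):
# Partial dependence of the boosted model on PPERSAUT
plot(boost_caravan, i.var = "PPERSAUT", n.trees = 1000)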
yhat_boost <- predict(boost_caravan,
                      newdata = caravan_test,
                      n.trees = 1000,
                      type = "response")
pred_class <- ifelse(yhat_boost > 0.20, 1, 0)
conf_matrix <- table(Predicted = pred_class, Actual = caravan_test$Purchase)
print(conf_matrix)
## Actual
## Predicted 0 1
## 0 4336 258
## 1 197 31
accuracy <- mean(pred_class == caravan_test$Purchase)
misclassification_rate <- mean(pred_class != caravan_test$Purchase)
cat("Accuracy:", accuracy, "\n")
## Accuracy: 0.9056408
cat("Misclassification Rate:", misclassification_rate, "\n")
## Misclassification Rate: 0.09435919
-From the confusion matrix, of the 228 test individuals the model labeled "Yes" (predicted probability above 0.20), 31, or about 13.6% (31/(31+197)), actually made a purchase. That is well above the roughly 6% purchase rate in the test set (289/4822), so targeting the model's predicted purchasers is far more efficient than contacting people at random.
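-That fraction (the precision for the "Yes" class) can be computed directly from the confusion matrix:
# Predicted purchasers who actually purchased
conf_matrix["1", "1"] / sum(conf_matrix["1", ])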
-From the pictured results, logistic regression performs slightly worse than the boosting model, while KNN shows a noticeable improvement in accuracy over both. I am including a picture of the results rather than the code because my code produced a knitting error that I could not fix; a sketch of comparable code is given below for reference.
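-For reference, a comparison along those lines could be reproduced with code like the following (a sketch rather than the exact code behind the pictured results; k = 5 for KNN is an arbitrary choice, and zero-variance predictors such as PVRAAUT/AVRAAUT are dropped before standardizing):
# Logistic regression on the same split, classified with the 0.20 cutoff
glm_fit <- glm(Purchase ~ ., data = caravan_train, family = binomial)
glm_prob <- predict(glm_fit, newdata = caravan_test, type = "response")
table(Predicted = ifelse(glm_prob > 0.20, 1, 0), Actual = caravan_test$Purchase)
# KNN on standardized predictors (dropping constant columns before scaling)
library(class)
predictors <- setdiff(names(caravan_train), "Purchase")
keep <- predictors[sapply(caravan_train[predictors], sd) > 0]
X_train <- scale(caravan_train[keep])
X_test <- scale(caravan_test[keep],
                center = attr(X_train, "scaled:center"),
                scale = attr(X_train, "scaled:scale"))
knn_pred <- knn(X_train, X_test, cl = factor(caravan_train$Purchase), k = 5)
table(Predicted = knn_pred, Actual = caravan_test$Purchase)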