Austin Weideman
This question relates to the below figure:
Partition figure
Tree
Diagram
In the lab, we applied random forests to the Boston data
using mtry = 6 and using ntree = 25 and
ntree = 500. Create a plot displaying the test error
resulting from random forests on this data set for a more comprehensive
range of values for mtry and ntree. You can model your plot after Figure
8.10 in the book. Describe the results obtained.
library(ISLR2)
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:ISLR2':
##
## Boston
set.seed(1)
train <- sample(1:nrow(Boston), nrow(Boston) / 2)
xtrain <- Boston[train, -14]
ytrain <- Boston[train, 14]
xtest <- Boston[-train, -14]
ytest <- Boston[-train, 14]
forest.p = randomForest(xtrain, ytrain, xtest=xtest, ytest=ytest, mtry=13, ntree=500)
forest.p_2 = randomForest(xtrain, ytrain, xtest=xtest, ytest=ytest, mtry=13/2, ntree=500)
forest.sqrt_p = randomForest(xtrain, ytrain, xtest=xtest, ytest=ytest, mtry=sqrt(13), ntree=500)
plot(1:500, forest.p$test$mse, col="orange", type="l", xlab="Number of Trees", ylab="Test Error", ylim=c(10, 34))
lines(1:500, forest.p_2$test$mse, col="blue", type="l")
lines(1:500, forest.sqrt_p$test$mse, col="green", type="l")
legend("topright", c("m = p", "m = p/2", "m = sqrt(p)"), col=c("orange", "blue", "green"), cex=1, lty=1)
- The test error initially decreases sharply as the number of trees
increases, until it reaches about 100 trees, then it stabilizes. - The
bagging (m=p) model has a much higher test error than models with
m<p.
We now use boosting to predict Salary in the
Hitters data set.
Hitters = Hitters[-which(is.na(Hitters$Salary)),]
Hitters$Salary = log(Hitters$Salary)
train = Hitters[1:200,]
test = Hitters[-(1:200),]
library(gbm)
## Loaded gbm 2.1.8.1
set.seed(1)
powers = seq(-10, -0.2, by=0.1)
lambdas = 10^powers
train.error = rep(NA, length(lambdas))
test.error = rep(NA, length(lambdas))
for(i in 1:length(lambdas)){
hitters.boost = gbm(Salary ~ ., data = train, distribution = "gaussian", n.trees = 1000, shrinkage = lambdas[i])
pred.train = predict(hitters.boost, train, n.trees=1000)
pred.test = predict(hitters.boost, test, n.trees=1000)
train.error[i] = mean((train$Salary - pred.train)^2)
test.error[i] = mean((test$Salary - pred.test)^2)
}
plot(lambdas, train.error, type="l", xlab="Shrinkage Value", ylab="Training MSE", col="blue")
plot(lambdas, test.error, type="l", xlab="Shrinkage Value", ylab="Test MSE", col="blue")
# boosting
min(test.error)
## [1] 0.2540265
# linear regression
lm.fit = lm(Salary~., data=train)
lm.pred = predict(lm.fit, test)
mean((test$Salary - lm.pred)^2)
## [1] 0.4917959
# random forests with m = sqrt(p)
forest.hitters = randomForest(Salary ~ ., data=train, ntree=500, mtry=sqrt(19))
forest.pred = predict(forest.hitters, test)
mean((test$Salary - forest.pred)^2)
## [1] 0.210951
boosted <- gbm(Salary ~ ., data = train, distribution = "gaussian", n.trees = 1000, shrinkage = lambdas[which.min(test.error)])
summary(boosted)
# random forests with m = p
forest.hitters = randomForest(Salary ~ ., data=train, ntree=500, mtry=19)
forest.pred = predict(forest.hitters, test)
mean((test$Salary - forest.pred)^2)
## [1] 0.2313281
Apply boosting, bagging, random forests, and BART to a data set of your choice. Be sure to fit the models on a training set and to evaluate their performance on a test set. How accurate are the results compared to simple methods like linear or logistic regression? Which of these approaches yields the best performance?
I will be looking at the Boston data from ISLR2 library, and the response variable will be medv (median value of owner occupied homes) modeled on the rest of the variables.
set.seed(1)
data <- (Boston)
p <- dim(Boston)[2]-1
train <- sample(1:nrow(Boston), nrow(Boston) / 2)
boston.test <- Boston[-train, "medv"]
# bagging
bag.boston <- randomForest(medv ~ ., data = Boston, subset = train, mtry = p, importance = TRUE)
yhat.bag <- predict(bag.boston, newdata = Boston[-train, ])
mean((yhat.bag - boston.test)^2)
## [1] 23.4579
# random forest m = sqrt(p)
set.seed(1)
rf.boston <- randomForest(medv ~ ., data = Boston,
subset = train, mtry = sqrt(p), importance = TRUE)
yhat.rf <- predict(rf.boston, newdata = Boston[-train, ])
mean((yhat.rf - boston.test)^2)
## [1] 18.11686
# random forest m = p/2
rf.boston <- randomForest(medv ~ ., data = Boston,
subset = train, mtry = p/2, importance = TRUE)
yhat.rf <- predict(rf.boston, newdata = Boston[-train, ])
mean((yhat.rf - boston.test)^2)
## [1] 19.27921
# boosting
set.seed(1)
boost.boston <- gbm(medv ~ ., data = Boston[train, ],
distribution = "gaussian", n.trees = 5000,
interaction.depth = 4, shrinkage = 0.2, verbose = F)
yhat.boost <- predict(boost.boston,
newdata = Boston[-train, ], n.trees = 5000)
mean((yhat.boost - boston.test)^2)
## [1] 20.37141
# shrinkage = 0.01
set.seed(1)
boost.boston <- gbm(medv ~ ., data = Boston[train, ],
distribution = "gaussian", n.trees = 5000,
interaction.depth = 4, shrinkage = 0.01, verbose = F)
yhat.boost <- predict(boost.boston,
newdata = Boston[-train, ], n.trees = 5000)
mean((yhat.boost - boston.test)^2)
## [1] 17.567
library(BART)
## Loading required package: nlme
## Loading required package: nnet
## Loading required package: survival
x <- Boston[, 1:12]
y <- Boston[, "medv"]
xtrain <- x[train, ]
ytrain <- y[train]
xtest <- x[-train, ]
ytest <- y[-train]
set.seed(1)
bartfit <- gbart(xtrain, ytrain, x.test = xtest)
## *****Calling gbart: type=1
## *****Data:
## data:n,p,np: 253, 12, 253
## y1,yn: 0.213439, -5.486561
## x1,x[n*p]: 0.109590, 210.970000
## xp1,xp[np*p]: 0.027310, 396.900000
## *****Number of Trees: 200
## *****Number of Cut Points: 100 ... 100
## *****burn,nd,thin: 100,1000,1
## *****Prior:beta,alpha,tau,nu,lambda,offset: 2,0.95,0.795495,3,4.42346,21.7866
## *****sigma: 4.765364
## *****w (weights): 1.000000 ... 1.000000
## *****Dirichlet:sparse,theta,omega,a,b,rho,augment: 0,0,1,0.5,1,12,0
## *****printevery: 100
##
## MCMC
## done 0 (out of 1100)
## done 100 (out of 1100)
## done 200 (out of 1100)
## done 300 (out of 1100)
## done 400 (out of 1100)
## done 500 (out of 1100)
## done 600 (out of 1100)
## done 700 (out of 1100)
## done 800 (out of 1100)
## done 900 (out of 1100)
## done 1000 (out of 1100)
## time: 3s
## trcnt,tecnt: 1000,1000
yhat.bart <- bartfit$yhat.test.mean
mean((ytest - yhat.bart)^2)
## [1] 21.73309
# linear regression
lm.fit <- lm(medv ~ ., data=Boston, subset = train)
lm.pred <- predict(lm.fit, Boston[-train,])
mean((lm.pred - boston.test)^2)
## [1] 26.86123
Here, boosting with shrinkage = 0.01 yields the lowest test error of all the methods. All methods from this chapter were more accurate than linear regression.
Suppose that we have four observations, for which we compute a dissimilarity matrix, given by
\[ \begin{bmatrix} & .6 & .3 & .9 \\ .6 & & .2 & .35 \\ .3 & .2 & & .5 \\ .9 & .35 & .5 & \end{bmatrix} \]
For instance, the dissimilarity between the first and second observations is 0.6, and the dissimilarity between the second and fourth observations is 0.35.
\[ \begin{bmatrix} & .6 & .9 \\ .6 & & .5 \\ .9 & .5 \end{bmatrix} \] \[ \begin{bmatrix} & .9 \\ .9 & \end{bmatrix} \]
x = as.dist(matrix(c(0, 0.6, 0.3, 0.9,
0.6, 0, 0.2, 0.35,
0.3, 0.2, 0, 0.5,
0.9, 0.35, 0.5, 0), nrow = 4))
plot(hclust(x, method = "complete"))
plot(hclust(x, method = "single"))
Observations 2, 3, and 4 would be in one cluster and 1 would be in the other cluster.
Observations 1, 2, and 3 would be in one cluster and 4 would be in the other cluster.
Answer: Same dendrogram as in (a) but with 2 and 3 swapped positions.
In this problem, you will perform \(K\)-means clustering manually, with
K = 2, on a small example with n = 6
observations and p = 2 features. The observations are as
follows.
| Obs | X_1 | X_2 |
|---|---|---|
| 1 | 2 | 5 |
| 2 | 0 | 3 |
| 3 | 1 | 6 |
| 4 | 4 | 2 |
| 5 | 1 | 4 |
| 6 | 5 | 2 |
x <- cbind(c(2, 0, 1, 4, 1, 5), c(5, 3, 6, 2, 4, 2))
plot(x[,1], x[,2])
sample() command in R to do this. Report the cluster labels
for each observation.set.seed(1)
cluster.labels <- sample(2, nrow(x), replace = T)
cluster.labels
## [1] 1 2 1 1 2 1
plot(x[,1], x[,2], col = cluster.labels+1, pch = 20, cex = 2)
red.centroid <- c(mean(x[cluster.labels == 1, 1]), mean(x[cluster.labels == 1, 2]))
green.centroid <- c(mean(x[cluster.labels == 2, 1]), mean(x[cluster.labels == 2, 2]))
plot(x[,1], x[,2], col=cluster.labels + 1, pch = 20, cex = 2)
points(red.centroid[1], red.centroid[2], col = 'red', pch = 3)
points(green.centroid[1], green.centroid[2], col = 'green', pch = 3)
cluster.labels <- c(1, 2, 2, 1, 2, 1)
plot(x[, 1], x[, 2], col = cluster.labels + 1, pch = 20, cex = 2)
points(red.centroid[1], red.centroid[2], col = 'red', pch = 3)
points(green.centroid[1], green.centroid[2], col = 'green', pch = 3)
red.centroid <- c(mean(x[cluster.labels == 1, 1]), mean(x[cluster.labels == 1, 2]))
green.centroid <- c(mean(x[cluster.labels == 2, 1]), mean(x[cluster.labels == 2, 2]))
plot(x[,1], x[,2], col=cluster.labels + 1, pch = 20, cex = 2)
points(red.centroid[1], red.centroid[2], col = 'red', pch = 3)
points(green.centroid[1], green.centroid[2], col = 'green', pch = 3)
cluster.labels <- c(2, 2, 2, 1, 2, 1)
plot(x[, 1], x[, 2], col = cluster.labels + 1, pch = 20, cex = 2)
points(red.centroid[1], red.centroid[2], col = 'red', pch = 3)
points(green.centroid[1], green.centroid[2], col = 'green', pch = 3)
red.centroid <- c(mean(x[cluster.labels == 1, 1]), mean(x[cluster.labels == 1, 2]))
green.centroid <- c(mean(x[cluster.labels == 2, 1]), mean(x[cluster.labels == 2, 2]))
plot(x[,1], x[,2], col=cluster.labels + 1, pch = 20, cex = 2)
points(red.centroid[1], red.centroid[2], col = 'red', pch = 3)
points(green.centroid[1], green.centroid[2], col = 'green', pch = 3)
cluster.labels <- c(2, 2, 2, 1, 2, 1)
plot(x[,1], x[,2], col = cluster.labels + 1, pch = 20, cex = 2)
points(red.centroid[1], red.centroid[2], col = 2, pch = 3)
points(green.centroid[1], green.centroid[2], col = 3, pch = 3)
- Here, we assigned everything to the color of the closest centroid and
nothing changed, so we have converged at the answer.
plot(x[,1], x[,2], col=cluster.labels + 1, pch = 20, cex = 2)
In Section 12.2.3, a formula for calculating PVE was given in
Equation 12.10. We also saw that the PVE can be obtained using the
sdev output of the prcomp() function. You will
simulate data, and calculate PVE in two ways.
To simulate data, create random normal data matrix of 50 observations
and 4 variables using rnorm(). For grading purposes, please
set the random seed to 1 (set.seed(1)).
set.seed(1)
data <- matrix(rnorm(4*50), 50, 4)
sdev output of the prcomp()
function, as was done in Section 12.2.3.pr.comp <- prcomp(data, scale = TRUE)
pr.vars <- pr.comp$sdev^2
pve <- pr.vars / sum(pr.vars)
pve
## [1] 0.3337764 0.2598064 0.2476689 0.1587483
prcomp() function to compute the principal component
loadings. Then, use those loadings in Equation 12.10 to obtain the PVE.
These two approaches should give the same results.loadings <- pr.comp$rotation
data.scaled <- scale(data)
sum.vars <- sum(apply(as.matrix(data.scaled)^2, 2, sum))
apply((as.matrix(data.scaled) %*% loadings)^2, 2, sum) / sum.vars
## PC1 PC2 PC3 PC4
## 0.3337764 0.2598064 0.2476689 0.1587483
Hint: You will only obtain the same results in (a) and (b) if the
same data is used in both cases. For instance, if in (a) you performed
prcomp() using centered and scaled variables, then you must
center and scale the variables before applying Equation 12.10 in
(b).
In this problem, you will generate simulated data, and then perform PCA and K-means clustering on the data.
Hint: There are a number of functions in R that you can use to generate data. One example is the rnorm() function; runif() is another option. Be sure to add a mean shift to the observations in each class so that there are three distinct classes.
set.seed(1)
dataset <- matrix(rnorm(100 * 50, mean = 0, sd = 0.001), 100, 50)
dataset[1:25, 1] <- 1
dataset[26:50, 2] <- 2
dataset[51:75, 1] <- 2
dataset[76:100, 2] <- 1
#view(dataset)
labels <- c(rep(1, 25), rep(2, 25), rep(3, 25), rep(4, 25))
pr.comp <- prcomp(dataset)
plot(pr.comp$x[, 1:2], col = 1:4, pch = 16)
K = 4. How well do the clusters that you obtained in
K-means clustering compare to the true class labels? Hint: You can use
the table() function in R to compare the true class labels
to the class labels obtained by clustering. Be careful how you interpret
the results: K-means clustering will arbitrarily number the clusters, so
you cannot simply check whether the true class labels and clustering
labels are the same.km <- kmeans(dataset, 4, nstart = 25)
table(labels, km$cluster)
##
## labels 1 2 3 4
## 1 25 0 0 0
## 2 0 0 0 25
## 3 0 25 0 0
## 4 0 0 25 0
The observations are clustered correctly, with one group in each row/col pair.
(≥d) Perform K-means clustering with K = 3. Describe
your results.
km <- kmeans(dataset, 3, nstart = 25)
table(labels, km$cluster)
##
## labels 1 2 3
## 1 25 0 0
## 2 0 25 0
## 3 25 0 0
## 4 0 0 25
One of the clusters combines with another one. Now there are two in one column.
K = 5, and describe
your results.km <- kmeans(dataset, 5, nstart = 25)
table(labels, km$cluster)
##
## labels 1 2 3 4 5
## 1 0 0 0 0 25
## 2 25 0 0 0 0
## 3 0 11 0 14 0
## 4 0 0 25 0 0
One of the clusters is split into two.
K = 4 on the first
two principal component score vectors, rather than on the raw data. That
is, perform K-means clustering on the 100x2 matrix of which the first
column is the first principal component score vector, and the second
column is the second principal component score vector. Comment on the
results.km <- kmeans(pr.comp$x[, 1:2], 4, nstart = 25)
table(labels, km$cluster)
##
## labels 1 2 3 4
## 1 0 0 0 25
## 2 0 25 0 0
## 3 0 0 25 0
## 4 25 0 0 0
All observations are clustered correctly.
scale() function, perform K-means clustering
with K = 4 on the data after scaling each variable to have
standard deviation one. How do these results compare to those obtained
in (b)? Explain.km <- kmeans(scale(dataset), 4, nstart = 25)
table(labels, km$cluster)
##
## labels 1 2 3 4
## 1 8 6 3 8
## 2 2 12 8 3
## 3 12 1 0 12
## 4 2 7 9 7
Scaling worsens the clustering accuracy since it results in different distances between data points.