set.seed(1)
x = matrix(rnorm(25 * 3 * 45, mean = 0), ncol = 45)
# shift mean
x[1:25, 1:45] = x[1:25, 1:45] + 3
x[26:50, 1:45] = x[26:50, 1:45] - 3
x[51:75, 1:45] = x[51:75, 1:45] + 7
# perform PCA
pca.fit = prcomp(x, scale = TRUE)
biplot(pca.fit, scale = 0)
# perform K-means
kmeans3 = kmeans(x, 3, nstart = 20)
plot(x, col = (kmeans3$cluster + 1), pch = 20, cex = 2)
table(kmeans3$cluster, c(rep(2, 25), rep(1, 25), rep(3, 25)))
##
##      1  2  3
##   1  0 25  0
##   2  0  0 25
##   3 25  0  0
Since K-means numbers the clusters arbitrarily, the cluster labels need not match the true group labels, but the table shows that the observations are clustered perfectly.
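Because of this arbitrary numbering, a simple label-invariant check is to match each cluster to the true class it overlaps most; a minimal sketch using the fit above:
# label-invariant agreement: match each cluster to its best-fitting true class
true.labels = c(rep(2, 25), rep(1, 25), rep(3, 25))
agreement = sum(apply(table(kmeans3$cluster, true.labels), 1, max)) / length(true.labels)
agreement  # 1 means a perfect clustering up to relabeling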
kmeans2 = kmeans(x, 2, nstart = 20)
plot(x, col = (kmeans2$cluster + 1), pch = 20, cex = 2)
table(kmeans2$cluster, c(rep(2, 25), rep(1, 25), rep(3, 25)))
##
##      1  2  3
##   1  0 25 25
##   2 25  0  0
As the plot and table show, K = 2 merges two of the true groups into a single cluster: one cluster contains all 50 observations from those two groups, and the remaining group forms the other cluster.
kmeans4 = kmeans(x, 4, nstart = 20)
plot(x, col = (kmeans4$cluster + 1), pch = 20, cex = 2)
table(kmeans4$cluster, c(rep(2, 25), rep(1, 25), rep(3, 25)))
##
##      1  2  3
##   1 25  0  0
##   2  0  0 14
##   3  0 25  0
##   4  0  0 11
With K = 4, one of the true groups (the one in the top right of the plot) is split across two clusters of 14 and 11 observations, while the other two groups are still recovered correctly.
kmeans.pca.fit = kmeans(pca.fit$x[1:75, 1:2], 3, nstart = 20)
plot(x, col = (kmeans.pca.fit$cluster + 1), pch = 20, cex = 2)
table(kmeans.pca.fit$cluster, c(rep(2, 25), rep(1, 25), rep(3, 25)))
##
##      1  2  3
##   1  0 25  0
##   2 25  0  0
##   3  0  0 25
All the observations are correctly clustered, which indicates that the first two principal components capture most of the structure separating the three groups.
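To back this up, we can look at the proportion of variance explained by each component of the prcomp fit above (a quick sketch):
# proportion of variance explained (PVE) by each principal component
pve = pca.fit$sdev^2 / sum(pca.fit$sdev^2)
pve[1:2]        # PVE of the first two components
cumsum(pve)[2]  # cumulative PVE of the first two components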
kmeansScale = kmeans(scale(x), 4, nstart = 20)
plot(x, col = (kmeansScale$cluster + 1), pch = 20, cex = 2)
table(kmeansScale$cluster, c(rep(2, 25), rep(1, 25), rep(3, 25)))
##
##      1  2  3
##   1  0  0 14
##   2  0 25  0
##   3  0  0 11
##   4 25  0  0
Some observations are no longer clustered correctly, because scaling changes the pairwise Euclidean distances that K-means works with. Since the variables here are already on a common scale, there is no need to scale them again.
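As a small illustration of how scaling changes the distances K-means uses, we can compare the distance between two observations from different groups before and after scaling (a minimal sketch on the simulated data above):
# Euclidean distance between observation 1 (group 1) and observation 26 (group 2),
# on the raw data and after centering and scaling each column
dist(x[c(1, 26), ])
dist(scale(x)[c(1, 26), ])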
# setup data
setwd("D:/One Drive/OneDrive/Document/Study/Stanford/Introduction to Statistical Learning/homework/hw4")
load("body.RData")
set.seed(1)
trainIndex = sample(seq_len(nrow(X)), size = 307)
bag.mse = rep(0, 50)
rf.mse = rep(0, 50)
index = 1:50
training = X[trainIndex, ]
trainRes = Y[trainIndex, ]
test = X[-trainIndex, ]
testRes = Y[-trainIndex, ]
# fit the model
library(randomForest)
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
# grow bagging and random forest models with ntree = 5, 10, ..., 250;
# bagging uses all predictors at each split (mtry = p), the random forest uses mtry = p / 3
for (i in index) {
  bag.fit = randomForest(trainRes$Weight ~ ., data = training, ntree = i * 5, mtry = ncol(training), importance = TRUE)
  rf.fit = randomForest(trainRes$Weight ~ ., data = training, ntree = i * 5, mtry = ncol(training) / 3, importance = TRUE)
  bag.pred = predict(bag.fit, newdata = test)
  rf.pred = predict(rf.fit, newdata = test)
  bag.mse[i] = mean((bag.pred - testRes$Weight)^2)
  rf.mse[i] = mean((rf.pred - testRes$Weight)^2)
}
# blue: random forest; black: bagging
plot(index * 5, rf.mse, xlab = "trees", ylab = "MSE", col = "blue", type = "l", ylim = range(c(rf.mse, bag.mse)))
lines(index * 5, bag.mse)
# refit both models with the default ntree = 500 to examine variable importance
bag.fit = randomForest(trainRes$Weight ~ ., data = training, mtry = ncol(training), importance = TRUE)
rf.fit = randomForest(trainRes$Weight ~ ., data = training, mtry = ncol(training) / 3, importance = TRUE)
importance(bag.fit)
## %IncMSE IncNodePurity
## Wrist.Diam 6.213179 114.1910
## Wrist.Girth 9.723407 379.4467
## Forearm.Girth 12.681253 2516.7483
## Elbow.Diam 11.525439 254.7523
## Bicep.Girth 8.369623 2642.5175
## Shoulder.Girth 18.262922 12025.7172
## Biacromial.Diam 11.526198 200.3680
## Chest.Depth 10.039241 222.6029
## Chest.Diam 9.353270 272.9933
## Chest.Girth 16.180858 7039.8064
## Navel.Girth 10.955272 435.7336
## Waist.Girth 26.585136 19675.2072
## Pelvic.Breadth 11.617856 194.3229
## Bitrochanteric.Diam 7.456843 145.4929
## Hip.Girth 28.822096 3708.7290
## Thigh.Girth 19.839332 590.1507
## Knee.Diam 14.643604 573.4334
## Knee.Girth 24.639385 1806.0981
## Calf.Girth 12.704152 433.7513
## Ankle.Diam 11.393109 304.3159
## Ankle.Girth 11.363095 368.3032
importance(rf.fit)
## %IncMSE IncNodePurity
## Wrist.Diam 7.201574 208.1603
## Wrist.Girth 7.818100 1110.1029
## Forearm.Girth 12.456342 4601.8990
## Elbow.Diam 7.166250 715.3144
## Bicep.Girth 11.155830 5283.4467
## Shoulder.Girth 15.591795 9125.2871
## Biacromial.Diam 8.289636 202.4015
## Chest.Depth 8.285154 867.3976
## Chest.Diam 8.263369 1161.6197
## Chest.Girth 16.310453 8456.0094
## Navel.Girth 12.133828 803.9854
## Waist.Girth 20.073316 12823.6658
## Pelvic.Breadth 10.513057 279.3287
## Bitrochanteric.Diam 9.606671 333.0635
## Hip.Girth 24.636410 2982.1815
## Thigh.Girth 17.786278 867.2484
## Knee.Diam 13.262410 636.4748
## Knee.Girth 20.754587 1833.2621
## Calf.Girth 13.824790 693.6542
## Ankle.Diam 10.897502 378.8536
## Ankle.Girth 9.305935 597.5383
For bagging, Hip.Girth has the largest %IncMSE and Waist.Girth the largest IncNodePurity, so these two are the most important variables. The random forest agrees: Hip.Girth again has the largest %IncMSE and Waist.Girth the largest IncNodePurity.
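The same ranking can be inspected graphically with the varImpPlot() function from the randomForest package:
# dot plots of %IncMSE and IncNodePurity for each fit
varImpPlot(bag.fit, main = "Bagging")
varImpPlot(rf.fit, main = "Random forest")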
rf.fit = randomForest(trainRes$Weight ~ ., data = training, mtry = ncol(training) / 3, importance = TRUE, ntree = 500)
rf.pred = predict(rf.fit, newdata = test)
# test MSE computed against Weight only (the correct calculation)
rf.mse = mean((rf.pred - testRes$Weight)^2)
# test MSE computed against the whole testRes data frame, matching the calculation used in homework 3(f)
rf.mse1 = mean((rf.pred - testRes)^2)
rf.mse
## [1] 9.65916
rf.mse1
## [1] 4269.325
In homework 3(f) I computed the MSE against the whole test data frame instead of against Weight only, so the absolute numbers are off. Still, applying the same calculation to the random forest lets us compare the methods on equal footing.
PCR: 4281.326, PLS: 4279.822, LASSO: 4280.238, RF: 4268.863. The random forest has the lowest MSE, so it performs best.
4
x1 = c(3, 2, 4, 1, 2, 4, 4)
x2 = c(4, 2, 4, 4, 1, 3, 1)
y = c("red", "red", "red", "red", "blue", "blue", "blue")
plot(x1, x2, col = y, type = "p")
plot(x1, x2, col = y, type = "p")
abline(a = -0.5, b = 1)
\(\beta_0 = -0.5,\ \beta_1 = 1,\ \beta_2 = -1\). Classify to blue if \(-0.5 + x_1 - x_2 > 0\), and classify to red otherwise.
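As a quick check, we can evaluate this rule at all seven observations (a short sketch using the vectors defined above):
# decision function for the hyperplane -0.5 + x1 - x2 = 0
f = -0.5 + x1 - x2
data.frame(x1, x2, y, f, predicted = ifelse(f > 0, "blue", "red"))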
plot(x1, x2, col = y, type = "p")
abline(a = -1, b = 1, col = "green", lty = 2)
abline(a = 0, b = 1, col = "green", lty = 2)
abline(a= -0.5, b = 1)
Then we compute the distance between the solid black line \(-0.5 + x_1 - x_2 = 0\) and either dotted green line, \(x_1 - x_2 = 0\) or \(-1 + x_1 - x_2 = 0\). By the distance formula for parallel lines, the margin is \(\sqrt{2}/4\).
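In detail, two parallel lines \(x_1 - x_2 + c_1 = 0\) and \(x_1 - x_2 + c_2 = 0\) are a distance
\[
d = \frac{|c_1 - c_2|}{\sqrt{1^2 + (-1)^2}} = \frac{|c_1 - c_2|}{\sqrt{2}}
\]
apart, so with \(c_1 = -0.5\) and \(c_2 = 0\) the margin is \(0.5/\sqrt{2} = \sqrt{2}/4 \approx 0.354\).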
The points \((2, 1), (2, 2), (4, 4), (4, 3)\) are the support vectors; they lie exactly on the dotted margin lines.
Slightly moving the seventh observation, \((4, 1)\), would not affect the maximal margin hyperplane. As the figure above shows, it lies far from the margin, so as long as it is not moved into the margin it has no effect on the hyperplane.
plot(x1, x2, col = y, type = "p")
abline(a= -0.5, b = 1)
abline(a= -0.6, b = 1, col = "yellow")
I drew a yellow line with a different intercept (\(-0.6\)); its equation is \(-0.6 + x_1 - x_2 = 0\). It still separates the two classes, but it is not the maximal margin hyperplane.
plot(x1, x2, col = y, type = "p")
points(3, 1.5, col = "red")
I added a red point at \((3, 1.5)\). With this point the two classes can no longer be separated by a hyperplane.
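As a quick check, the new red point lies on the blue side of the earlier maximal margin hyperplane (and in fact inside the triangle formed by the three blue points):
# evaluate the earlier decision function -0.5 + x1 - x2 at the new red point (3, 1.5)
-0.5 + 3 - 1.5  # positive, i.e. on the blue side, so the classes are no longer separable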