library(ISLR)
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-2
attach(College)
set.seed(11)
index = sample(1:nrow(College), round(nrow(College)*.7))
train = College[index,]
test = College[-index,]
lm.fit = lm(Apps ~ ., data = train)
# Predict on the test set and compute the test MSE
lm.pred = predict(lm.fit, test)
lm.error = mean((lm.pred - test$Apps)^2)
print(lm.error)
## [1] 682474.3
The test MSE for the least squares model is 682474.3.
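For context, a minimal sketch (not part of the original output) that puts the error back on the scale of the Apps variable by taking the square root of the test MSE:
# Root mean squared error: the typical size of a prediction error, in applications
lm.rmse = sqrt(lm.error)
lm.rmse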
# Design matrices for the training and test sets
train.mat <- model.matrix(Apps ~ ., data = train)
test.mat <- model.matrix(Apps ~ ., data = test)
grid <- 10 ^ seq(4, -2, length = 100)
fit.ridge <- glmnet(train.mat, train$Apps, alpha = 0, lambda = grid, thresh = 1e-12)
cv.ridge <- cv.glmnet(train.mat, train$Apps, alpha = 0, lambda = grid, thresh = 1e-12)
bestlam.ridge <- cv.ridge$lambda.min
pred.ridge <- predict(fit.ridge, s = bestlam.ridge, newx = test.mat)
mean((pred.ridge - test$Apps)^2)
## [1] 682427.7
The test MSE for ridge regression (682427.7) is essentially the same as for least squares, only marginally lower.
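As a quick sanity check (a sketch, not part of the original output), we can inspect the ridge coefficients at the CV-selected lambda; ridge shrinks the coefficients toward zero but keeps every predictor in the model.
# Ridge coefficients at the CV-selected lambda
predict(fit.ridge, type = "coefficients", s = bestlam.ridge)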
fit.lasso <- glmnet(train.mat, train$Apps, alpha = 1, lambda = grid, thresh = 1e-12)
cv.lasso <- cv.glmnet(train.mat, train$Apps, alpha = 1, lambda = grid, thresh = 1e-12)
bestlam.lasso <- cv.lasso$lambda.min
bestlam.lasso
## [1] 0.01
pred.lasso <- predict(fit.lasso, s = bestlam.lasso, newx = test.mat)
mean((pred.lasso - test$Apps)^2)
## [1] 682353.9
The lasso test MSE (682353.9) is actually slightly lower than that of least squares.
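Unlike ridge, the lasso can set coefficients exactly to zero. A hedged sketch (not part of the original output) of how to check whether any predictors were dropped at the selected lambda; at a lambda this small the lasso may drop few or none:
# Lasso coefficients at the CV-selected lambda; count how many are exactly zero
lasso.coef <- predict(fit.lasso, type = "coefficients", s = bestlam.lasso)
sum(as.vector(lasso.coef) == 0)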
library(pls)
##
## Attaching package: 'pls'
## The following object is masked from 'package:stats':
##
## loadings
fit.pcr <- pcr(Apps ~ ., data = train, scale = TRUE, validation = "CV")
validationplot(fit.pcr, val.type = "MSEP")
pred.pcr <- predict(fit.pcr, test, ncomp = 10)
mean((pred.pcr - test$Apps)^2)
## [1] 979550.1
The test MSE for PCR with 10 components (979550.1) is noticeably higher than that of least squares.
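The choice of 10 components is read off the validation plot. As a hedged sketch (not part of the original analysis), the cross-validated MSEP curve itself can be used to pick the number of components; MSEP() is from the pls package, and the subtraction of 1 accounts for the intercept-only entry at the start of the curve.
# Number of components minimizing the cross-validated MSEP
pcr.msep <- MSEP(fit.pcr, estimate = "CV")
which.min(pcr.msep$val["CV", 1, ]) - 1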
fit.pls <- plsr(Apps ~ ., data = train, scale = TRUE, validation = "CV")
validationplot(fit.pls, val.type = "MSEP")
pred.pls <- predict(fit.pls, test, ncomp = 10)
mean((pred.pls - test$Apps)^2)
## [1] 660366.4
The test MSE for PLS (660366.4) is actually lower than that of least squares.
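As a quick follow-up (a sketch, not part of the original output), summary() on the fitted PLS model reports the cross-validated error and the percentage of variance explained for each number of components, which helps justify stopping at 10 components:
# Cross-validated error and % variance explained per number of PLS components
summary(fit.pls)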
test.avg <- mean(test$Apps)
lm.r2 <- 1 - mean((lm.pred - test$Apps)^2) / mean((test.avg - test$Apps)^2)
ridge.r2 <- 1 - mean((pred.ridge - test$Apps)^2) / mean((test.avg - test$Apps)^2)
lasso.r2 <- 1 - mean((pred.lasso - test$Apps)^2) / mean((test.avg - test$Apps)^2)
pcr.r2 <- 1 - mean((pred.pcr - test$Apps)^2) / mean((test.avg - test$Apps)^2)
pls.r2 <- 1 - mean((pred.pls - test$Apps)^2) / mean((test.avg - test$Apps)^2)
lm.r2
## [1] 0.9318928
ridge.r2
## [1] 0.9318975
lasso.r2
## [1] 0.9319048
pcr.r2
## [1] 0.9022463
pls.r2
## [1] 0.9340991
It looks like we can predict the number of college applications fairly accurately. All five models achieve a test R-squared above 0.90, with only about a three-percentage-point gap between the highest (PLS, 0.934) and the lowest (PCR, 0.902).
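A minimal sketch (not part of the original output) that collects the test R-squared values in one place so the models can be compared at a glance:
# Gather the test R-squared values and sort them from best to worst
r2.all <- c(lm = lm.r2, ridge = ridge.r2, lasso = lasso.r2, pcr = pcr.r2, pls = pls.r2)
sort(r2.all, decreasing = TRUE)
barplot(r2.all, ylim = c(0, 1), ylab = "Test R-squared")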
library(leaps)
set.seed(1)
x <- matrix(rnorm(1000 * 20), 1000, 20)
b <- rnorm(20)
b[3] <- 0
b[4] <- 0
b[9] <- 0
b[19] <- 0
b[10] <- 0
eps <- rnorm(1000)
y <- x %*% b + eps
train <- sample(seq(1000), 100, replace = FALSE)
test <- -train
x.train <- x[train, ]
x.test <- x[test, ]
y.train <- y[train]
y.test <- y[test]
data.train <- data.frame(y = y.train, x = x.train)
regfit.full <- regsubsets(y ~ ., data = data.train, nvmax = 20)
train.mat <- model.matrix(y ~ ., data = data.train)
val.errors <- rep(NA, 20)
for (i in 1:20) {
  # Predictions from the best i-variable model on the training set
  coefi <- coef(regfit.full, id = i)
  pred <- train.mat[, names(coefi)] %*% coefi
  val.errors[i] <- mean((pred - y.train)^2)
}
plot(val.errors, xlab = "Number of predictors", ylab = "Training MSE", pch = 19, type = "b")
data.test <- data.frame(y = y.test, x = x.test)
test.mat <- model.matrix(y ~ ., data = data.test)
val.errors <- rep(NA, 20)
for (i in 1:20) {
  # Predictions from the best i-variable model on the held-out test set
  coefi <- coef(regfit.full, id = i)
  pred <- test.mat[, names(coefi)] %*% coefi
  val.errors[i] <- mean((pred - y.test)^2)
}
plot(val.errors, xlab = "Number of predictors", ylab = "Test MSE", pch = 19, type = "b")
It looks like the model with 15 predictors has the lowest test MSE, but we will use which.min() to make sure.
which.min(val.errors)
## [1] 15
The 15-predictor model has the smallest test MSE.
coef(regfit.full, which.min(val.errors))
## (Intercept) x.2 x.4 x.5 x.6 x.7
## -0.003933937 0.359127426 0.202707344 1.036265913 -0.253843053 -1.282753293
## x.8 x.11 x.12 x.13 x.14 x.15
## 0.691581077 0.895769881 0.526887865 -0.207638251 -0.507929833 -0.892604795
## x.16 x.17 x.18 x.20
## -0.343062241 0.184479252 1.646950451 -1.060191640
The best model drops four of the five predictors whose true coefficients were zeroed out (x.3, x.9, x.10 and x.19), although it still includes x.4, whose true coefficient is also zero, and omits x.1, whose true coefficient is not.
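A hedged sketch (not in the original) that makes this comparison explicit by checking which truly-zero predictors survive in the selected model:
# Compare the selected variables with the predictors whose true coefficients are zero
selected <- names(coef(regfit.full, which.min(val.errors)))
zeroed <- paste0("x.", which(b == 0))
zeroed %in% selected  # TRUE means a truly-zero predictor was still selected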
val.errors <- rep(NA, 20)
x_cols = colnames(x, do.NULL = FALSE, prefix = "x.")
for (i in 1:20) {
  coefi <- coef(regfit.full, id = i)
  # Euclidean distance between the true coefficient vector and the estimate;
  # predictors excluded from the model contribute their squared true coefficient.
  val.errors[i] <- sqrt(sum((b[x_cols %in% names(coefi)] - coefi[names(coefi) %in% x_cols])^2) +
                          sum(b[!(x_cols %in% names(coefi))]^2))
}
plot(val.errors, xlab = "Number of coefficients", ylab = "Error between estimated and true coefficients", pch = 19, type = "b")
The error between the estimated and true coefficients is not directly correlated with the test error: a model whose coefficient estimates are closer to the truth does not always have a lower test MSE.
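A minimal follow-up sketch (not in the original): since val.errors now holds the coefficient errors, which.min() identifies the model size that comes closest to the true coefficients, and it need not equal the size that minimized the test MSE above.
# Model size minimizing the coefficient estimation error
which.min(val.errors)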