The lasso, relative to least squares, is: iii. Less flexible and hence will give improved prediction accuracy when its increase in bias is less than its decrease in variance. As \(\lambda\) increases, the variance initially decreases faster than the squared bias increases, which produces the minimum of the U-shaped test MSE curve.
Repeat (a) for ridge regression relative to least squares. iii. Less flexible and hence will give improved prediction accuracy when its increase in bias is less than its decrease in variance. As with the lasso, increasing \(\lambda\) initially reduces variance faster than it inflates the squared bias, giving the minimum of the U-shaped test MSE curve.
Repeat (a) for non-linear methods relative to least squares. ii. More flexible and hence will give improved prediction accuracy when its increase in variance is less than its decrease in bias.
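The reasoning in all three parts rests on the bias-variance decomposition of the expected test error at a point \(x_0\):
\[E\bigl(y_0 - \hat f(x_0)\bigr)^2 = \mathrm{Var}\bigl(\hat f(x_0)\bigr) + \bigl[\mathrm{Bias}\bigl(\hat f(x_0)\bigr)\bigr]^2 + \mathrm{Var}(\varepsilon)\]
A method improves on least squares whenever its change in flexibility reduces one of the first two terms by more than it inflates the other; the irreducible error \(\mathrm{Var}(\varepsilon)\) is unaffected.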
\[\sum_{i=1}^n\Biggl(y_i - \beta_0 - \sum_{j=1}^p\beta_jx_{ij}\Biggr)^2\text{ subject to }\sum_{j=1}^p|\beta_j|\le s\]
As we increase \(s\) from 0, the training RSS will steadily decrease. At \(s = 0\) the model is maximally restricted (all coefficients are forced to zero); as \(s\) grows the constraint relaxes until we recover the least squares fit, and with this increasing flexibility the training RSS decreases monotonically.
Repeat (a) for the test RSS. The test RSS will initially decrease and then start increasing again once the model becomes flexible enough to overfit the training data, tracing a U shape.
Repeat (a) for variance. Variance increases steadily: as \(s\) grows, the model moves from the most restricted fit toward the least squares fit, and with this added flexibility the variance can only increase.
Repeat (a) for (squared) bias. Squared bias steadily decreases with increased flexibility.
Repeat (a) for the irreducible error. The irreducible error is a constant, independent of the model.
\[\sum_{i=1}^n\Biggl(y_i - \beta_0 - \sum_{j=1}^p\beta_jx_{ij}\Biggr)^2 + \lambda\sum_{j=1}^p\beta_j^2\]
The training error increases steadily as \(\lambda\) grows and the model becomes less flexible.
The test error decreases initially and then starts to increase again once the model becomes too inflexible, again tracing a U shape.
Variance decreases steadily as the model becomes less flexible.
Squared bias increases steadily as the model becomes less flexible.
The irreducible error is a constant, independent of the model.
lambda = 3
y = 4
betas = seq(-10, 10, 0.2)
# Ridge criterion for a single observation: (y - beta)^2 + lambda * beta^2
ridge = function(beta, y, lambda) (y - beta)^2 + lambda * beta^2
plot(betas, ridge(betas, y, lambda), pch = 20, xlab = "beta", ylab = "Ridge optimization")
# Closed-form minimizer: beta = y / (1 + lambda)
est.beta = y / (1 + lambda)
points(est.beta, ridge(est.beta, y, lambda), col = "red", pch = 20, lwd = 5)
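As a quick check, the minimum over the grid should agree with the closed-form ridge estimate \(\hat\beta = y/(1+\lambda) = 1\):
# Grid value minimizing the ridge criterion; should equal y / (1 + lambda) = 1
betas[which.min(ridge(betas, y, lambda))]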
lambda = 3
y = 4
betas = seq(-10, 10, 0.2)
# Lasso criterion for a single observation: (y - beta)^2 + lambda * |beta|
lasso = function(beta, y, lambda) (y - beta)^2 + lambda * abs(beta)
plot(betas, lasso(betas, y, lambda), xlab = "beta", main = "Lasso Regression Optimization", pch = 20)
# Closed-form minimizer when y > lambda / 2: beta = y - lambda / 2
est.beta = y - (lambda / 2)
points(est.beta, lasso(est.beta, y, lambda), col = "red", pch = 20, lwd = 5)
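More generally, the single-observation lasso solution is given by soft-thresholding. The sketch below (the helper name soft.threshold is only illustrative) reproduces \(\hat\beta = y - \lambda/2\) when \(y > \lambda/2\) and returns exactly zero when \(|y| \le \lambda/2\):
# Soft-thresholding: sign(y) * max(|y| - lambda/2, 0) minimizes (y - beta)^2 + lambda * |beta|
soft.threshold = function(y, lambda) sign(y) * pmax(abs(y) - lambda / 2, 0)
soft.threshold(4, 3)  # 2.5, matching est.beta above
soft.threshold(1, 3)  # 0: small responses are set exactly to zero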
set.seed(1)
X = rnorm(100)
eps = rnorm(100)
b0 = 4
b1 = 2
b2 = 3
b3 = 2
Y = b0 + b1 * X + b2 * X^2 + b3 * X^3 + eps
library(leaps)
data.full = data.frame(Y, X)
regfit.full = regsubsets(Y ~ poly(X, 10, raw = T), data = data.full, nvmax = 10)
summary = summary(regfit.full)
par(mfrow=c(2,2))
plot(summary$cp, xlab ="Number of variables", ylab="C_p", type="l")
points(which.min(summary$cp), summary$cp[which.min(summary$cp)], col = "red", cex = 2, pch = 20)
plot(summary$bic, xlab ="Number of variables", ylab="BIC", type="l")
points(which.min(summary$bic), summary$bic[which.min(summary$bic)], col = "red", cex = 2, pch = 20)
plot(summary$adjr2, xlab ="Number of variables", ylab="Adjusted R^2", type="l")
points(which.max(summary$adjr2), summary$adjr2[which.max(summary$adjr2)], col = "red", cex = 2, pch = 20)
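For reference, the model sizes chosen by each criterion can be printed directly (the exact values depend on the simulated data):
# Number of predictors selected by C_p, BIC and adjusted R^2
c(Cp = which.min(summary$cp), BIC = which.min(summary$bic), AdjR2 = which.max(summary$adjr2))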
library(leaps)
data.full = data.frame(Y, X)
regfit.fwd = regsubsets(Y ~ poly(X, 10, raw = T), data = data.full, nvmax = 10, method = "forward")
summary = summary(regfit.fwd)
par(mfrow=c(2,2))
plot(summary$cp, xlab ="Number of variables", ylab="C_p", type="l")
points(which.min(summary$cp), summary$cp[which.min(summary$cp)], col = "red", cex = 2, pch = 20)
plot(summary$bic, xlab ="Number of variables", ylab="BIC", type="l")
points(which.min(summary$bic), summary$bic[which.min(summary$bic)], col = "red", cex = 2, pch = 20)
plot(summary$adjr2, xlab ="Number of variables", ylab="Adjusted R^2", type="l")
points(which.max(summary$adjr2), summary$adjr2[which.max(summary$adjr2)], col = "red", cex = 2, pch = 20)
library(leaps)
data.full = data.frame(Y, X)
regfit.back = regsubsets(Y ~ poly(X, 10, raw = T), data = data.full, nvmax = 10, method = "backward")
summary = summary(regfit.back)
par(mfrow=c(2,2))
plot(summary$cp, xlab ="Number of variables", ylab="C_p", type="l")
points(which.min(summary$cp), summary$cp[which.min(summary$cp)], col = "red", cex = 2, pch = 20)
plot(summary$bic, xlab ="Number of variables", ylab="BIC", type="l")
points(which.min(summary$bic), summary$bic[which.min(summary$bic)], col = "red", cex = 2, pch = 20)
plot(summary$adjr2, xlab ="Number of variables", ylab="Adjusted R^2", type="l")
points(which.max(summary$adjr2), summary$adjr2[which.max(summary$adjr2)], col = "red", cex = 2, pch = 20)
coef(regfit.full, which.max(summary(regfit.full)$adjr2))
## (Intercept) poly(X, 10, raw = T)1 poly(X, 10, raw = T)2
## 4.07200775 2.38745596 2.84575641
## poly(X, 10, raw = T)3 poly(X, 10, raw = T)5
## 1.55797426 0.08072292
coef(regfit.fwd, which.max(summary(regfit.fwd)$adjr2))
## (Intercept) poly(X, 10, raw = T)1 poly(X, 10, raw = T)2
## 4.07200775 2.38745596 2.84575641
## poly(X, 10, raw = T)3 poly(X, 10, raw = T)5
## 1.55797426 0.08072292
coef(regfit.back, which.max(summary(regfit.back)$adjr2))
## (Intercept) poly(X, 10, raw = T)1 poly(X, 10, raw = T)2
## 4.079236362 2.231905828 2.833494180
## poly(X, 10, raw = T)3 poly(X, 10, raw = T)9
## 1.819555807 0.001290827
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 3.0-2
xmat = model.matrix(Y~poly(X,10,raw=T))[,-1]
lasso.mod = cv.glmnet(xmat, Y, alpha=1)
(lambda = lasso.mod$lambda.min)
## [1] 0.04924644
par(mfrow=c(1,1))
plot(lasso.mod)
predict(lasso.mod, s=lambda, type="coefficients")
## 11 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 4.160843693
## poly(X, 10, raw = T)1 2.203516696
## poly(X, 10, raw = T)2 2.650263654
## poly(X, 10, raw = T)3 1.765931598
## poly(X, 10, raw = T)4 0.040178155
## poly(X, 10, raw = T)5 0.021903164
## poly(X, 10, raw = T)6 .
## poly(X, 10, raw = T)7 0.003685793
## poly(X, 10, raw = T)8 .
## poly(X, 10, raw = T)9 .
## poly(X, 10, raw = T)10 .
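A sparser fit is available through the one-standard-error rule; as a sketch (the retained coefficients depend on the cross-validation folds):
# Coefficients at the largest lambda within one standard error of the minimum CV error
predict(lasso.mod, s = lasso.mod$lambda.1se, type = "coefficients")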
Y2 = 4 + 7*X^7 + eps
data.full2 = data.frame(y = Y2,x = X)
# Best subset selection on the new response y (= Y2) in data.full2
regfit.full2 = regsubsets(y ~ poly(x, 10, raw = T), data = data.full2, nvmax = 10)
summary2 = summary(regfit.full2)
par(mfrow=c(2,2))
plot(summary2$cp, xlab ="Number of variables", ylab="C_p", type="l")
points(which.min(summary2$cp), summary2$cp[which.min(summary2$cp)], col = "red", cex = 2, pch = 20)
plot(summary2$bic, xlab ="Number of variables", ylab="BIC", type="l")
points(which.min(summary2$bic), summary2$bic[which.min(summary2$bic)], col = "red", cex = 2, pch = 20)
plot(summary2$adjr2, xlab ="Number of variables", ylab="Adjusted R^2", type="l")
points(which.max(summary2$adjr2), summary2$adjr2[which.max(summary2$adjr2)], col = "red", cex = 2, pch = 20)
coef(regfit.full2, which.min(summary2$cp))
coef(regfit.full2, which.min(summary2$bic))
coef(regfit.full2, which.max(summary2$adjr2))
All three criteria select a model dominated by the \(X^7\) term, the only predictor used to generate \(Y_2\); BIC, with its heavier penalty on model size, favours the most parsimonious of the three fits.
xmat = model.matrix(Y2~poly(X,10,raw=T))[,-1]
lasso = cv.glmnet(xmat, Y2, alpha=1)
(lambda = lasso$lambda.min)
## [1] 12.36884
par(mfrow=c(1,1))
plot(lasso)
predict(lasso, s=lambda, type="coefficients")
## 11 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 4.820215
## poly(X, 10, raw = T)1 .
## poly(X, 10, raw = T)2 .
## poly(X, 10, raw = T)3 .
## poly(X, 10, raw = T)4 .
## poly(X, 10, raw = T)5 .
## poly(X, 10, raw = T)6 .
## poly(X, 10, raw = T)7 6.796694
## poly(X, 10, raw = T)8 .
## poly(X, 10, raw = T)9 .
## poly(X, 10, raw = T)10 .
The lasso likewise selects the one-variable model containing only \(X^7\); its coefficient estimate (about 6.8) is shrunk slightly below the true value of 7.
library(ISLR)
data(College)
set.seed(1)
trainid = sample(1:nrow(College), nrow(College)/2)
train = College[trainid,]
test = College[-trainid,]
lm.fit = lm(Apps~., data=train)
lm.pred = predict(lm.fit, test)
lm.err = mean((test$Apps - lm.pred)^2)
lm.err
## [1] 1135758
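For context, a useful baseline is the test MSE obtained by simply predicting the training-set mean number of applications for every college:
# Null-model benchmark
mean((test$Apps - mean(train$Apps))^2)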
library(glmnet)
train.X = model.matrix(Apps ~ ., data = train)
train.Y = train[, "Apps"]
test.X = model.matrix(Apps ~ ., data = test)
test.Y = test[, "Apps"]
grid = 10 ^ seq(4, -2, length=100)
ridge.mod = glmnet(train.X, train.Y, alpha = 0, lambda = grid, thresh = 1e-12)
# A plain glmnet fit has no lambda.min component; choose lambda by cross-validation
ridge.cv = cv.glmnet(train.X, train.Y, alpha = 0, lambda = grid, thresh = 1e-12)
lambda.best = ridge.cv$lambda.min
ridge.pred = predict(ridge.mod, newx= test.X, s=lambda.best)
(ridge.err = mean((test.Y - ridge.pred)^2))
## [1] 1164319
lasso.mod = glmnet(train.X, train.Y, alpha =1, lambda=grid, thresh = 1e-12)
lasso.cv = cv.glmnet(train.X, train.Y, alpha =1, lambda=grid, thresh = 1e-12)
lambda.best = lasso.cv$lambda.min
lasso.pred = predict(lasso.mod, newx= test.X, s=lambda.best)
(lasso.err = mean((test.Y - lasso.pred)^2))
## [1] 1135660
predict(lasso.mod, s = lambda.best, type="coefficients")
## 19 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) -7.900363e+02
## (Intercept) .
## PrivateYes -3.070103e+02
## Accept 1.779328e+00
## Enroll -1.469508e+00
## Top10perc 6.672214e+01
## Top25perc -2.230442e+01
## F.Undergrad 9.258974e-02
## P.Undergrad 9.408838e-03
## Outstate -1.083495e-01
## Room.Board 2.115147e-01
## Books 2.912105e-01
## Personal 6.120406e-03
## PhD -1.547200e+01
## Terminal 6.409503e+00
## S.F.Ratio 2.282638e+01
## perc.alumni 1.130498e+00
## Expend 4.856697e-02
## Grad.Rate 7.488081e+00
library(pls)
##
## Attaching package: 'pls'
## The following object is masked from 'package:stats':
##
## loadings
pcr.fit = pcr(Apps~., data=train, scale=TRUE, validation="CV")
validationplot(pcr.fit, val.type="MSEP")
summary(pcr.fit)
## Data: X dimension: 388 17
## Y dimension: 388 1
## Fit method: svdpc
## Number of components considered: 17
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 4288 4027 2351 2355 2046 1965 1906
## adjCV 4288 4031 2347 2353 2014 1955 1899
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 1910 1913 1871 1799 1799 1802 1800
## adjCV 1903 1908 1866 1790 1792 1795 1793
## 14 comps 15 comps 16 comps 17 comps
## CV 1832 1728 1310 1222
## adjCV 1829 1702 1296 1212
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps 8 comps
## X 32.20 57.78 65.31 70.99 76.37 81.27 84.8 87.85
## Apps 13.44 70.93 71.07 79.87 81.15 82.25 82.3 82.33
## 9 comps 10 comps 11 comps 12 comps 13 comps 14 comps 15 comps
## X 90.62 92.91 94.98 96.74 97.79 98.72 99.42
## Apps 83.38 84.76 84.80 84.84 85.11 85.14 90.55
## 16 comps 17 comps
## X 99.88 100.00
## Apps 93.42 93.89
pcr.pred = predict(pcr.fit, test, ncomp=10)
(pcr.err = mean((test$Apps - pcr.pred)^2))
## [1] 1723100
pls.fit = plsr(Apps~., data=train, scale=TRUE, validation="CV")
validationplot(pls.fit, val.type="MSEP")
pls.pred = predict(pls.fit, test, ncomp=10)
(pls.err = mean((test$Apps - pls.pred)^2))
## [1] 1131661
test.avg = mean(test$Apps)
lm.r2 =1 - lm.err/mean((test.avg - test$Apps)^2)
ridge.r2 =1 - ridge.err/mean((test.avg - test$Apps)^2)
lasso.r2 =1 - lasso.err/mean((test.avg - test$Apps)^2)
pcr.r2 =1 - pcr.err/mean((test.avg - test$Apps)^2)
pls.r2 =1 - pls.err/mean((test.avg - test$Apps)^2)
all.r2 = c(lm.r2, ridge.r2, lasso.r2, pcr.r2, pls.r2)
names(all.r2) = c("lm", "ridge", "lasso", "pcr", "pls")
barplot(all.r2 )
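To put the errors on the scale of the response, the test RMSEs can also be compared directly:
# Test root-mean-squared error, in units of applications received
sqrt(c(lm = lm.err, ridge = ridge.err, lasso = lasso.err, pcr = pcr.err, pls = pls.err))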
All models except PCR (with 10 components) predict the number of college applications with comparably high accuracy, as the test \(R^2\) values show.
set.seed(1)
x = matrix(rnorm(1000 * 20), 1000, 20)
b = rnorm(20)
# Set eight of the twenty true coefficients exactly to zero
b[c(3, 4, 7, 9, 11, 13, 14, 19)] = 0
eps = rnorm(1000)
y = x %*% b + eps
trainid = sample(1:nrow(x), nrow(x)/10)
X.train = x[-trainid,]
Y.train = y[-trainid,]
X.test = x[trainid,]
Y.test = y[trainid,]
data.train = data.frame(y = Y.train, x = X.train)
regfit.full = regsubsets(y ~ ., data = data.train, nvmax = 20)
train.mat = model.matrix(y ~ ., data = data.train, nvmax = 20)
val.errors = rep(NA, 20)
# Training MSE for the best model of each size
for (i in 1:20) {
  coefi = coef(regfit.full, id = i)
  pred = train.mat[, names(coefi)] %*% coefi
  val.errors[i] = mean((pred - Y.train)^2)
}
plot(val.errors, xlab = "Number of predictors", ylab = "Training MSE", pch = 19, type = "b")
data.test = data.frame(y = Y.test, x = X.test)
test.mat = model.matrix(y ~ ., data = data.test, nvmax = 20)
# Test MSE for the best model of each size
for (i in 1:20) {
  coefi = coef(regfit.full, id = i)
  pred = test.mat[, names(coefi)] %*% coefi
  val.errors[i] = mean((pred - Y.test)^2)
}
plot(val.errors, xlab = "Number of predictors", ylab = "Test MSE", pch = 19, type = "b")
which.min(val.errors)
## [1] 12
coef(regfit.full, which.min(val.errors))
## (Intercept) x.1 x.2 x.5 x.6 x.8
## 0.06067403 0.19513841 0.23488764 1.06526308 -0.26722541 0.73552035
## x.10 x.12 x.15 x.16 x.17 x.18
## 0.71666624 0.53548818 -0.72346236 -0.27004140 0.34920967 1.65652430
## x.20
## -1.01865533
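As a sanity check, the predictors in the selected model can be compared with the truly nonzero entries of b; with this seed the twelve selected predictors are exactly the twelve with nonzero true coefficients:
# True predictors missing from the selected model (empty here)
sel = names(coef(regfit.full, id = which.min(val.errors)))
setdiff(paste0("x.", which(b != 0)), sel)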
val.errors = rep(NA, 20)
x_cols = colnames(x, do.NULL = FALSE, prefix = "x.")
for (i in 1:20) {
  coefi = coef(regfit.full, id = i)
  # Distance sqrt(sum_j (b_j - bhat_j)^2) between true and estimated coefficients,
  # treating predictors excluded from the i-variable model as estimated to be zero
  val.errors[i] = sqrt(sum((b[x_cols %in% names(coefi)] - coefi[names(coefi) %in% x_cols])^2) + sum(b[!(x_cols %in% names(coefi))]^2))
}
plot(val.errors, xlab = "Number of coefficients", ylab = "Error between estimated and true coefficients", pch = 19, type = "b")
which.min(val.errors)
## [1] 12
library(MASS)
library(leaps)
library(glmnet)
data(Boston)
set.seed(1)
# Prediction method for regsubsets objects: rebuild the model matrix from the
# formula stored in the call and multiply by the coefficients of the id-th model
predict.regsubsets = function(object, newdata, id, ...) {
  form = as.formula(object$call[[2]])
  mat = model.matrix(form, newdata)
  coefi = coef(object, id = id)
  xvars = names(coefi)
  mat[, xvars] %*% coefi
}
k = 10
folds = sample(1:k, nrow(Boston), replace = TRUE)
cv.errors = matrix(NA, k, 13, dimnames = list(NULL, paste(1:13)))
# 10-fold cross-validation of best subset selection
for (j in 1:k) {
  best.fit = regsubsets(crim ~ ., data = Boston[folds != j, ], nvmax = 13)
  for (i in 1:13) {
    pred = predict(best.fit, Boston[folds == j, ], id = i)
    cv.errors[j, i] = mean((Boston$crim[folds == j] - pred)^2)
  }
}
mean.cv.errors = apply(cv.errors, 2, mean)
plot(mean.cv.errors, type = "b", xlab = "Number of variables", ylab = "CV error")
min(mean.cv.errors)
## [1] 42.46014
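The size used in the coef() call below should correspond to the minimiser of the cross-validated error:
# Model size with the smallest mean CV error
which.min(mean.cv.errors)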
regfit.best = regsubsets(crim~., data=Boston, nvmax=13)
coef(regfit.best, 12)
## (Intercept) zn indus chas nox
## 16.985713928 0.044673247 -0.063848469 -0.744367726 -10.202169211
## rm dis rad tax ptratio
## 0.439588002 -0.993556631 0.587660185 -0.003767546 -0.269948860
## black lstat medv
## -0.007518904 0.128120290 -0.198877768
x = model.matrix(crim ~ ., Boston)[, -1]
y = Boston$crim
cv.out = cv.glmnet(x, y, alpha = 1, type.measure = "mse")
plot(cv.out)
cv.out
##
## Call: cv.glmnet(x = x, y = y, type.measure = "mse", alpha = 1)
##
## Measure: Mean-Squared Error
##
## Lambda Measure SE Nonzero
## min 0.051 43.11 14.16 11
## 1se 3.376 56.89 17.29 1
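To see which predictors the lasso retains at the selected \(\lambda\) (a sketch; the retained set depends on the cross-validation folds):
# Nonzero coefficients at the lambda minimizing the CV error
predict(cv.out, s = "lambda.min", type = "coefficients")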
cv.out = cv.glmnet(x, y, alpha = 0, type.measure = "mse")
cv.out
##
## Call: cv.glmnet(x = x, y = y, type.measure = "mse", alpha = 0)
##
## Measure: Mean-Squared Error
##
## Lambda Measure SE Nonzero
## min 0.54 43.48 14.33 13
## 1se 74.44 57.69 16.76 13
pcr.fit = pcr(crim ~ ., data = Boston, scale = TRUE, validation = "CV")
validationplot(pcr.fit, val.type = "MSEP")