############################
# Homework 2
# Gabbi Allen
# Dr. Coberly
# MATH 5553-01
############################
############################
# Chapter 5, Problem 8
# We will now perform cross-validation on a simulated data set
# Part a: Generate a simulated data set as follows:
set.seed(1)
y <- rnorm(100)
x <- rnorm(100)
y <- x - 2*x^2 +rnorm(100)
# n = 100 and p = 1; x is the only predictor variable
# equation form of y: Y = X - 2X^2 + epsilon, where epsilon is the error term
# Part b: Create a scatterplot of X against Y. Comment on what you find
plot(x, y)

# The plot indicates a fairly strong quadratic relationship.
# Part c: Set a random seed, and then compute the LOOCV errors that result from
# fitting the following four models using least squares:
# i. Y = B0 + B1X + epsilon
# ii. Y = B0 + B1X + B2X^2 + epsilon
# iii. Y = B0 + B1X + B2X^2 + B3X^3 + epsilon
# iv. Y = B0 + B1X + B2X^2 + B3X^3 + B4X^4 + epsilon
library(boot) # load boot package
set.seed(96) # set random seed
Data <- data.frame(x, y)
# part i
glm.fit1 <- glm(y ~ x, data = Data)
cv.err1 <- cv.glm(Data, glm.fit1)
cv.err1$delta # these numbers are essentially identical
## [1] 5.890979 5.888812
# part ii
glm.fit2 <- glm(y ~ poly(x, 2), data = Data)
cv.err2 <- cv.glm(Data, glm.fit2)
cv.err2$delta # these numbers are essentially identical
## [1] 1.086596 1.086326
# part iii
glm.fit3 <- glm(y ~ poly(x, 3), data = Data)
cv.err3 <- cv.glm(Data, glm.fit3)
cv.err3$delta # these numbers are essentially identical
## [1] 1.102585 1.102227
# part iv
glm.fit4 <- glm(y ~ poly(x, 4), data = Data)
cv.err4 <- cv.glm(Data, glm.fit4)
cv.err4$delta # these numbers are essentially identical
## [1] 1.114772 1.114334
# part d: Repeat (c) using another random seed, and report your results. Are your
# results the same as what you got in (c)? Why?
set.seed(17) # set random seed
# part i
glm.fit1 <- glm(y ~ x, data = Data)
cv.err1 <- cv.glm(Data, glm.fit1)
cv.err1$delta # these numbers are essentially identical
## [1] 5.890979 5.888812
# part ii
glm.fit2 <- glm(y ~ poly(x, 2), data = Data)
cv.err2 <- cv.glm(Data, glm.fit2)
cv.err2$delta # these numbers are essentially identical
## [1] 1.086596 1.086326
# part iii
glm.fit3 <- glm(y ~ poly(x, 3), data = Data)
cv.err3 <- cv.glm(Data, glm.fit3)
cv.err3$delta # these numbers are essentially identical
## [1] 1.102585 1.102227
# part iv
glm.fit4 <- glm(y ~ poly(x, 4), data = Data)
cv.err4 <- cv.glm(Data, glm.fit4)
cv.err4$delta # these numbers are essentially identical
## [1] 1.114772 1.114334
# The numbers I got in part (d) are identical to the numbers in part (c). This is
# because LOOCV involves no randomness: each observation is left out exactly once,
# so the procedure is deterministic and the choice of seed has no effect.
# Part e: Which of the models in (c) had the smallest LOOCV error?
# Is this what you expected? Explain your answer
# The quadratic model (model ii) had the smallest LOOCV error. This is not
# surprising: the scatterplot in part (b) shows a clearly quadratic relationship
# between x and y.
# Part f: Comment on the statistical significance of the coefficient estimates
# that results from fitting each of the models in (c) using least squares. Do
# these results agree with the conclusions drawn based on the cross-validation
# results?
summary(glm.fit4)
##
## Call:
## glm(formula = y ~ poly(x, 4), data = Data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.8914 -0.5244 0.0749 0.5932 2.7796
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.8277 0.1041 -17.549 <2e-16 ***
## poly(x, 4)1 2.3164 1.0415 2.224 0.0285 *
## poly(x, 4)2 -21.0586 1.0415 -20.220 <2e-16 ***
## poly(x, 4)3 -0.3048 1.0415 -0.293 0.7704
## poly(x, 4)4 -0.4926 1.0415 -0.473 0.6373
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 1.084654)
##
## Null deviance: 552.21 on 99 degrees of freedom
## Residual deviance: 103.04 on 95 degrees of freedom
## AIC: 298.78
##
## Number of Fisher Scoring iterations: 2
# The intercept, linear, and quadratic terms are statistically significant (the
# linear term only at the 5% level), while the cubic and quartic terms are not.
# This agrees with the cross-validation results, which favored the quadratic model.
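# The problem asks about all four fits, so as a quick sketch (using the model
# objects still in memory) the coefficient tables for models i-iii can be
# inspected the same way:
lapply(list(glm.fit1, glm.fit2, glm.fit3), function(m) coef(summary(m)))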
rm(list = ls()) # clear all variables/models to start fresh for next problem
############################
# Chapter 6, Problem 8
# In this exercise, we will generate simulated data, and will then use this data
# to perform best subset selection.
# Part a: Use the rnorm() function to generate a predictor X of length n = 100,
# as well as a noise vector epsilon of length n = 100.
set.seed(3)
x <- rnorm(100)
epsilon <- rnorm(100)
# Part b: Generate a response vector Y of length n = 100 according to the model
# Y = B0 + B1X + B2X^2 + B3X^3 + epsilon
# where B0, B1, B2, and B3 are constants of your choice.
beta <- c(2, 4, 7, 3)
y <- beta[1] + beta[2]*x + beta[3]*x^2 + beta[4]*x^3 + epsilon
# Part c: Use the regsubsets() function to perform best subset selection in order
# to choose the best model containing the predictors x, x^2, ..., x^10. What is
# the best model obtained according to Cp, BIC, and adjusted R^2? Show some plots
# to provide evidence for your answer, and report the coefficients of the best
# model obtained. Note you will need to use the data.frame() function to create
# a single data set using both x and y.
library(leaps) # load package
plot(x,y) # plot data points
grid <- seq(min(x), max(x), length.out = 50)
lines(grid, predict(loess(y ~ x), data.frame(x = grid)))

x.mat <- poly(x, 10, raw = T)
d.frame <- data.frame(x,y) # create a single data set using both x and y
regfit.full <- regsubsets(y ~ x.mat, data = d.frame, nvmax = 10)
# best subset selection model
sum.full <- summary(regfit.full)
objects(sum.full)
## [1] "adjr2" "bic" "cp" "obj" "outmat" "rsq" "rss" "which"
sum.full$outmat # see which predictors are selected at each model size
## x.mat1 x.mat2 x.mat3 x.mat4 x.mat5 x.mat6 x.mat7 x.mat8 x.mat9
## 1 ( 1 ) "*" " " " " " " " " " " " " " " " "
## 2 ( 1 ) " " "*" "*" " " " " " " " " " " " "
## 3 ( 1 ) "*" "*" "*" " " " " " " " " " " " "
## 4 ( 1 ) "*" "*" "*" " " " " "*" " " " " " "
## 5 ( 1 ) "*" "*" " " " " "*" " " "*" " " "*"
## 6 ( 1 ) "*" "*" "*" " " "*" " " "*" " " "*"
## 7 ( 1 ) "*" "*" " " "*" "*" "*" "*" " " "*"
## 8 ( 1 ) "*" "*" " " " " "*" "*" "*" "*" "*"
## 9 ( 1 ) "*" "*" " " "*" "*" "*" "*" "*" "*"
## 10 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*"
## x.mat10
## 1 ( 1 ) " "
## 2 ( 1 ) " "
## 3 ( 1 ) " "
## 4 ( 1 ) " "
## 5 ( 1 ) " "
## 6 ( 1 ) " "
## 7 ( 1 ) " "
## 8 ( 1 ) "*"
## 9 ( 1 ) "*"
## 10 ( 1 ) "*"
sum.full$adjr2 # adjusted R^2
## [1] 0.6186582 0.9372859 0.9845940 0.9844369 0.9845636 0.9844058 0.9842723
## [8] 0.9844977 0.9846469 0.9849463
plot(sum.full$adjr2)
indx <- which.max(sum.full$adjr2)
points(indx, sum.full$adjr2[indx], pch = 19)

# The model with 10 variables has the highest adjusted R^2, but there does not
# appear to be a meaningful difference between the best 3-variable model and the
# best 10-variable model.
plot(sum.full$bic)
indx.2 <- which.min(sum.full$bic)
points(indx.2, sum.full$bic[indx.2], pch = 19)

# In the BIC graph, the model with the lowest BIC is the 3-variable model.
plot(sum.full$cp)
indx.3 <- which.min(sum.full$cp)
points(indx.3, sum.full$cp[indx.3], pch = 19)

# Again, in the Cp plot, the 3-variable model has the lowest Cp. Because the
# 3-variable model has the lowest BIC and Cp, and the larger models offer no
# meaningful gain in adjusted R^2, we will treat the 3-variable model as the best model.
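# As a compact summary (just a convenience sketch), the model size chosen by each
# criterion can be tabulated directly:
c(adjR2 = which.max(sum.full$adjr2),
  BIC   = which.min(sum.full$bic),
  Cp    = which.min(sum.full$cp))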
sum.full
## Subset selection object
## Call: regsubsets.formula(y ~ x.mat, data = d.frame, nvmax = 10)
## 10 Variables (and intercept)
## Forced in Forced out
## x.mat1 FALSE FALSE
## x.mat2 FALSE FALSE
## x.mat3 FALSE FALSE
## x.mat4 FALSE FALSE
## x.mat5 FALSE FALSE
## x.mat6 FALSE FALSE
## x.mat7 FALSE FALSE
## x.mat8 FALSE FALSE
## x.mat9 FALSE FALSE
## x.mat10 FALSE FALSE
## 1 subsets of each size up to 10
## Selection Algorithm: exhaustive
## x.mat1 x.mat2 x.mat3 x.mat4 x.mat5 x.mat6 x.mat7 x.mat8 x.mat9
## 1 ( 1 ) "*" " " " " " " " " " " " " " " " "
## 2 ( 1 ) " " "*" "*" " " " " " " " " " " " "
## 3 ( 1 ) "*" "*" "*" " " " " " " " " " " " "
## 4 ( 1 ) "*" "*" "*" " " " " "*" " " " " " "
## 5 ( 1 ) "*" "*" " " " " "*" " " "*" " " "*"
## 6 ( 1 ) "*" "*" "*" " " "*" " " "*" " " "*"
## 7 ( 1 ) "*" "*" " " "*" "*" "*" "*" " " "*"
## 8 ( 1 ) "*" "*" " " " " "*" "*" "*" "*" "*"
## 9 ( 1 ) "*" "*" " " "*" "*" "*" "*" "*" "*"
## 10 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*"
## x.mat10
## 1 ( 1 ) " "
## 2 ( 1 ) " "
## 3 ( 1 ) " "
## 4 ( 1 ) " "
## 5 ( 1 ) " "
## 6 ( 1 ) " "
## 7 ( 1 ) " "
## 8 ( 1 ) "*"
## 9 ( 1 ) "*"
## 10 ( 1 ) "*"
coef(regfit.full, 3)
## (Intercept) x.mat1 x.mat2 x.mat3
## 1.957565 3.888312 7.087611 3.017670
# So the best 3-variable model uses x, x^2, and x^3. The intercept estimate is
# 1.958, and the coefficients are 3.888 for x, 7.088 for x^2, and 3.018 for x^3,
# all close to the true values B0 = 2, B1 = 4, B2 = 7, B3 = 3 used to generate
# the data in part (b); see the check below.
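# Sanity check (sketch): line the selected coefficients up against the true values
# chosen in part (b).
rbind(true = c(2, 4, 7, 3), estimated = coef(regfit.full, 3))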
# Part d: Repeat (c), using forward stepwise selection and also using backwards
# stepwise selection. How does your answer compare to the results in (c)?
# Forward stepwise selection:
regfit.fwd <- regsubsets(y ~ x.mat, method="forward", data = d.frame, nvmax=10)
sum.fwd <- summary(regfit.fwd)
sum.fwd
## Subset selection object
## Call: regsubsets.formula(y ~ x.mat, method = "forward", data = d.frame,
## nvmax = 10)
## 10 Variables (and intercept)
## Forced in Forced out
## x.mat1 FALSE FALSE
## x.mat2 FALSE FALSE
## x.mat3 FALSE FALSE
## x.mat4 FALSE FALSE
## x.mat5 FALSE FALSE
## x.mat6 FALSE FALSE
## x.mat7 FALSE FALSE
## x.mat8 FALSE FALSE
## x.mat9 FALSE FALSE
## x.mat10 FALSE FALSE
## 1 subsets of each size up to 10
## Selection Algorithm: forward
## x.mat1 x.mat2 x.mat3 x.mat4 x.mat5 x.mat6 x.mat7 x.mat8 x.mat9
## 1 ( 1 ) "*" " " " " " " " " " " " " " " " "
## 2 ( 1 ) "*" "*" " " " " " " " " " " " " " "
## 3 ( 1 ) "*" "*" "*" " " " " " " " " " " " "
## 4 ( 1 ) "*" "*" "*" " " " " "*" " " " " " "
## 5 ( 1 ) "*" "*" "*" " " "*" "*" " " " " " "
## 6 ( 1 ) "*" "*" "*" "*" "*" "*" " " " " " "
## 7 ( 1 ) "*" "*" "*" "*" "*" "*" "*" " " " "
## 8 ( 1 ) "*" "*" "*" "*" "*" "*" "*" " " "*"
## 9 ( 1 ) "*" "*" "*" "*" "*" "*" "*" " " "*"
## 10 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*"
## x.mat10
## 1 ( 1 ) " "
## 2 ( 1 ) " "
## 3 ( 1 ) " "
## 4 ( 1 ) " "
## 5 ( 1 ) " "
## 6 ( 1 ) " "
## 7 ( 1 ) " "
## 8 ( 1 ) " "
## 9 ( 1 ) "*"
## 10 ( 1 ) "*"
objects(sum.fwd)
## [1] "adjr2" "bic" "cp" "obj" "outmat" "rsq" "rss" "which"
sum.fwd$adjr2
## [1] 0.6186582 0.8735752 0.9845940 0.9844369 0.9843192 0.9842197 0.9841484
## [8] 0.9841141 0.9840127 0.9849463
plot(sum.fwd$adjr2)
indx.fwd <- which.max(sum.fwd$adjr2)
points(indx.fwd, sum.fwd$adjr2[indx.fwd], pch = 19)

# So the highest adjusted R^2 belongs to the 10-variable model, but there doesn't
# appear to be much improvement between the 3-variable model and the 10-variable model.
sum.fwd$bic
## [1] -88.21081 -195.03613 -401.95623 -397.38380 -393.08360 -388.91501
## [7] -384.94025 -381.21184 -377.07508 -379.60464
plot(sum.fwd$bic)
indx.fwd2 <- which.min(sum.fwd$bic)
points(indx.fwd2, sum.fwd$bic[indx.fwd2], pch = 19)

# In terms of BIC, the 3-variable model returns the lowest BIC.
sum.fwd$cp
## [1] 2386.546245 720.630863 6.246770 8.214612 9.915538
## [6] 11.488996 12.876348 14.030682 15.581917 11.000000
plot(sum.fwd$cp)
indx.fwd3 <- which.min(sum.fwd$cp)
points(indx.fwd3, sum.fwd$cp[indx.fwd3], pch = 19)

sum.fwd
## Subset selection object
## Call: regsubsets.formula(y ~ x.mat, method = "forward", data = d.frame,
## nvmax = 10)
## 10 Variables (and intercept)
## Forced in Forced out
## x.mat1 FALSE FALSE
## x.mat2 FALSE FALSE
## x.mat3 FALSE FALSE
## x.mat4 FALSE FALSE
## x.mat5 FALSE FALSE
## x.mat6 FALSE FALSE
## x.mat7 FALSE FALSE
## x.mat8 FALSE FALSE
## x.mat9 FALSE FALSE
## x.mat10 FALSE FALSE
## 1 subsets of each size up to 10
## Selection Algorithm: forward
## x.mat1 x.mat2 x.mat3 x.mat4 x.mat5 x.mat6 x.mat7 x.mat8 x.mat9
## 1 ( 1 ) "*" " " " " " " " " " " " " " " " "
## 2 ( 1 ) "*" "*" " " " " " " " " " " " " " "
## 3 ( 1 ) "*" "*" "*" " " " " " " " " " " " "
## 4 ( 1 ) "*" "*" "*" " " " " "*" " " " " " "
## 5 ( 1 ) "*" "*" "*" " " "*" "*" " " " " " "
## 6 ( 1 ) "*" "*" "*" "*" "*" "*" " " " " " "
## 7 ( 1 ) "*" "*" "*" "*" "*" "*" "*" " " " "
## 8 ( 1 ) "*" "*" "*" "*" "*" "*" "*" " " "*"
## 9 ( 1 ) "*" "*" "*" "*" "*" "*" "*" " " "*"
## 10 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*"
## x.mat10
## 1 ( 1 ) " "
## 2 ( 1 ) " "
## 3 ( 1 ) " "
## 4 ( 1 ) " "
## 5 ( 1 ) " "
## 6 ( 1 ) " "
## 7 ( 1 ) " "
## 8 ( 1 ) " "
## 9 ( 1 ) "*"
## 10 ( 1 ) "*"
coef(regfit.fwd, 3)
## (Intercept) x.mat1 x.mat2 x.mat3
## 1.957565 3.888312 7.087611 3.017670
# Again, the 3-variable model has the lowest Cp, so the 3-variable model is also
# the best model selected by forward stepwise selection. The variables selected
# are x1, x2, and x3, with coefficients 3.888, 7.088, and 3.018, respectively.
# These coefficients are identical to the ones produced by best subset selection,
# since both methods settle on the same 3-variable model.
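# A quick check (not required by the problem) that forward stepwise and best
# subset selection land on exactly the same 3-variable fit:
all.equal(coef(regfit.full, 3), coef(regfit.fwd, 3))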
# Backwards stepwise selection:
regfit.bwd <- regsubsets(y ~ x.mat, method = "backward", data = d.frame,
nvmax = 10)
sum.bwd <- summary(regfit.bwd)
sum.bwd
## Subset selection object
## Call: regsubsets.formula(y ~ x.mat, method = "backward", data = d.frame,
## nvmax = 10)
## 10 Variables (and intercept)
## Forced in Forced out
## x.mat1 FALSE FALSE
## x.mat2 FALSE FALSE
## x.mat3 FALSE FALSE
## x.mat4 FALSE FALSE
## x.mat5 FALSE FALSE
## x.mat6 FALSE FALSE
## x.mat7 FALSE FALSE
## x.mat8 FALSE FALSE
## x.mat9 FALSE FALSE
## x.mat10 FALSE FALSE
## 1 subsets of each size up to 10
## Selection Algorithm: backward
## x.mat1 x.mat2 x.mat3 x.mat4 x.mat5 x.mat6 x.mat7 x.mat8 x.mat9
## 1 ( 1 ) "*" " " " " " " " " " " " " " " " "
## 2 ( 1 ) "*" "*" " " " " " " " " " " " " " "
## 3 ( 1 ) "*" "*" " " " " "*" " " " " " " " "
## 4 ( 1 ) "*" "*" " " " " "*" " " "*" " " " "
## 5 ( 1 ) "*" "*" " " " " "*" " " "*" " " "*"
## 6 ( 1 ) "*" "*" " " " " "*" " " "*" "*" "*"
## 7 ( 1 ) "*" "*" " " " " "*" "*" "*" "*" "*"
## 8 ( 1 ) "*" "*" " " " " "*" "*" "*" "*" "*"
## 9 ( 1 ) "*" "*" " " "*" "*" "*" "*" "*" "*"
## 10 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*"
## x.mat10
## 1 ( 1 ) " "
## 2 ( 1 ) " "
## 3 ( 1 ) " "
## 4 ( 1 ) " "
## 5 ( 1 ) " "
## 6 ( 1 ) " "
## 7 ( 1 ) " "
## 8 ( 1 ) "*"
## 9 ( 1 ) "*"
## 10 ( 1 ) "*"
objects(sum.bwd)
## [1] "adjr2" "bic" "cp" "obj" "outmat" "rsq" "rss" "which"
plot(sum.bwd$adjr2)
indx.bwd <- which.max(sum.bwd$adjr2)
points(indx.bwd, sum.bwd$adjr2[indx.bwd], pch = 19)

# Again, the 10-variable model has the highest adjusted R^2, but there isn't a
# substantial difference between the 3-variable model and the 10-variable
# model.
plot(sum.bwd$bic)
indx.bwd2 <- which.min(sum.bwd$bic)
points(indx.bwd2, sum.bwd$bic[indx.bwd2], pch = 19)

# We get a different result with backwards stepwise selection than we did before:
# this time, the 5-variable model has the smallest BIC. The difference in BIC
# between the 4-variable and 5-variable models is small, while the gap between the
# 3-variable and 5-variable models is somewhat larger.
plot(sum.bwd$cp)
indx.bwd3 <- which.min(sum.bwd$cp)
points(indx.bwd3, sum.bwd$cp[indx.bwd3], pch = 19)
# Unlike best subset and forward stepwise selection, the model with the smallest
# Cp here is the 5-variable model, although the 3-, 4-, and 5-variable models are
# not far apart. Because of this, I am selecting the 4-variable model as the best
# model under backwards stepwise selection, as it comes close to maximizing
# adjusted R^2 while keeping Cp and BIC near their minimums.
sum.bwd
## Subset selection object
## Call: regsubsets.formula(y ~ x.mat, method = "backward", data = d.frame,
## nvmax = 10)
## 10 Variables (and intercept)
## Forced in Forced out
## x.mat1 FALSE FALSE
## x.mat2 FALSE FALSE
## x.mat3 FALSE FALSE
## x.mat4 FALSE FALSE
## x.mat5 FALSE FALSE
## x.mat6 FALSE FALSE
## x.mat7 FALSE FALSE
## x.mat8 FALSE FALSE
## x.mat9 FALSE FALSE
## x.mat10 FALSE FALSE
## 1 subsets of each size up to 10
## Selection Algorithm: backward
## x.mat1 x.mat2 x.mat3 x.mat4 x.mat5 x.mat6 x.mat7 x.mat8 x.mat9
## 1 ( 1 ) "*" " " " " " " " " " " " " " " " "
## 2 ( 1 ) "*" "*" " " " " " " " " " " " " " "
## 3 ( 1 ) "*" "*" " " " " "*" " " " " " " " "
## 4 ( 1 ) "*" "*" " " " " "*" " " "*" " " " "
## 5 ( 1 ) "*" "*" " " " " "*" " " "*" " " "*"
## 6 ( 1 ) "*" "*" " " " " "*" " " "*" "*" "*"
## 7 ( 1 ) "*" "*" " " " " "*" "*" "*" "*" "*"
## 8 ( 1 ) "*" "*" " " " " "*" "*" "*" "*" "*"
## 9 ( 1 ) "*" "*" " " "*" "*" "*" "*" "*" "*"
## 10 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*"
## x.mat10
## 1 ( 1 ) " "
## 2 ( 1 ) " "
## 3 ( 1 ) " "
## 4 ( 1 ) " "
## 5 ( 1 ) " "
## 6 ( 1 ) " "
## 7 ( 1 ) " "
## 8 ( 1 ) "*"
## 9 ( 1 ) "*"
## 10 ( 1 ) "*"
coef(regfit.bwd, 4)
## (Intercept) x.mat1 x.mat2 x.mat5 x.mat7
## 1.9711524 5.4901289 7.0905984 1.3653743 -0.1641141
# The 4-variable model when selected using backwards stepwise selection uses the
# first variable, the second variable, the fifth variable, and the seventh
# variable. Coefficients for x1 = 5.490, x2 = 7.091, x5 = 1.365, x7 = -0.164.
# This is different from the forward stepwise selection model and the best
# subset selection model, both in coefficients and in selected variables.
# Even if we had selected the 3-variable model using backwards stepwise selection,
# the variables selected would have been x1, x2, and x5, rather than x1, x2, and
# x3.
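# As a quick check of that last claim (sketch), the coefficients of the 3-variable
# backwards stepwise model:
coef(regfit.bwd, 3)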
# Part e: Now fit a lasso model to the simulated data, again using x, x^2, ...,
# x^10 as predictors. Use cross-validation to select the optimal value of lambda.
# Create plots of the cross-validation error as a function of lambda. Report the
# resulting coefficient estimates, and discuss the results obtained.
set.seed(1)
xmat <- model.matrix(y ~ poly(x, 10, raw = T), data = d.frame)[,-1]
library(glmnet)
## Loading required package: Matrix
## Loading required package: foreach
## Loaded glmnet 2.0-13

grid <- 10^seq(10, -2, length = 100)
lasso.mod <- glmnet(xmat, y, alpha = 1, lambda = grid)
plot(lasso.mod)

set.seed(1)
cv.out <- cv.glmnet(xmat, y, alpha = 1)
plot(cv.out)

bestlam <- cv.out$lambda.min
lasso.pred <- predict(lasso.mod, s = bestlam, type = "coefficients")
lasso.pred
## 11 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 2.016542
## poly(x, 10, raw = T)1 3.897243
## poly(x, 10, raw = T)2 7.003016
## poly(x, 10, raw = T)3 2.975668
## poly(x, 10, raw = T)4 .
## poly(x, 10, raw = T)5 .
## poly(x, 10, raw = T)6 .
## poly(x, 10, raw = T)7 .
## poly(x, 10, raw = T)8 .
## poly(x, 10, raw = T)9 .
## poly(x, 10, raw = T)10 .
coef(regfit.bwd, 4)
## (Intercept) x.mat1 x.mat2 x.mat5 x.mat7
## 1.9711524 5.4901289 7.0905984 1.3653743 -0.1641141
coef(regfit.fwd, 3)
## (Intercept) x.mat1 x.mat2 x.mat3
## 1.957565 3.888312 7.087611 3.017670
coef(regfit.full, 3)
## (Intercept) x.mat1 x.mat2 x.mat3
## 1.957565 3.888312 7.087611 3.017670
# The lasso picks x1, x2, and x3, the same variables chosen by best subset
# selection and forward stepwise selection. The lasso coefficient estimates are
# 3.897 for x1, 7.003 for x2, and 2.976 for x3, very similar to the coefficients
# from best subset and forward stepwise selection; the x2 coefficient in
# particular is nearly identical across models.
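# For completeness (sketch), the selected lambda and its cross-validated MSE can
# be read directly off the cv.glmnet object:
bestlam
cv.out$cvm[which(cv.out$lambda == bestlam)]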
rm(list = ls()) # clear environment to get ready for next problem
###########################
# Chapter 6, Problem 9
# In this exercise, we will predict the number of applications received using
# the other variables in the College data set.
library(ISLR)
# Part a: Split the data into a training set and a test set.
college <- College
attach(college)
names(college)
## [1] "Private" "Apps" "Accept" "Enroll" "Top10perc"
## [6] "Top25perc" "F.Undergrad" "P.Undergrad" "Outstate" "Room.Board"
## [11] "Books" "Personal" "PhD" "Terminal" "S.F.Ratio"
## [16] "perc.alumni" "Expend" "Grad.Rate"
train <- sample(1:nrow(college), nrow(college)/2)
test <- -train
college.train <- college[train, ]
college.test <- college[test, ]
# Part b: Fit a linear model using least squares on the training set, and report
# the test error obtained.
college.lm <- lm(Apps ~ ., data = college.train)
college.pred <- predict(college.lm, college.test)
mean((college.pred - college.test$Apps)^2)
## [1] 1132563
# So the test MSE is about 1132563. (Because no seed was set before the split,
# this value changes each time the code is run.)
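# A small convenience helper (hypothetical; the explicit mean((pred - actual)^2)
# calls below are left unchanged) showing the test-MSE calculation used throughout:
test.mse <- function(pred, actual) mean((pred - actual)^2)
test.mse(college.pred, college.test$Apps)  # same value as above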
# Part c: Fit a ridge regression model on the training set, with lambda chosen
# by cross-validation. Report on the test error obtained.
grid <- 10^seq(5, -2, length = 100)
train.mat <- model.matrix(Apps ~ ., data = college.train)
test.mat <- model.matrix (Apps ~ ., data = college.test)
ridge.college <- glmnet(train.mat, college.train$Apps, alpha = 0,
lambda = grid, thresh = 1e-12)
ridge.cv.out <- cv.glmnet(train.mat, college.train$Apps, alpha = 0,
lambda = grid, thresh = 1e-12)
plot(ridge.cv.out)

bestlam.ridge <- ridge.cv.out$lambda.min
bestlam.ridge
## [1] 0.01
# So the best lambda value is 0.01, which is the smallest value in our grid, so
# the ridge fit involves very little shrinkage.
ridge.pred <- predict(ridge.college, s = bestlam.ridge, newx = test.mat)
mean((ridge.pred - college.test$Apps)^2)
## [1] 1132519
# The test MSE is about 1132519, essentially the same as least squares, which is
# expected given the tiny lambda.
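# A quick check (sketch): confirm that the selected lambda sits at the lower
# boundary of the grid, and look at the more conservative one-standard-error
# choice that cv.glmnet also reports:
bestlam.ridge == min(grid)  # TRUE: CV picks the smallest lambda we supplied
ridge.cv.out$lambda.1se     # the larger lambda chosen by the one-SE rule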
# Part d: Fit a lasso model on the training set, with lambda chosen by cross-
# validation. Report the test error obtained, along with the number of non-zero
# coefficient estimates.
lasso.college <- glmnet(train.mat, college.train$Apps, alpha = 1, lambda = grid)
lasso.cv.out <- cv.glmnet(train.mat, college.train$Apps, alpha = 1, lambda = grid)
plot(lasso.cv.out)

bestlam.lasso <- lasso.cv.out$lambda.min
bestlam.lasso
## [1] 0.01
# So the best lambda value is again 0.01, the smallest value in the grid.
lasso.pred <- predict(lasso.college, s = bestlam.lasso, newx = test.mat)
mean((lasso.pred - college.test$Apps)^2)
## [1] 1131629
# The test MSE is about 1131629, marginally smaller than both the ridge regression
# error and the least squares error.
lasso.pred.coef <- predict(lasso.college, s = bestlam.lasso, type = "coefficients")
lasso.pred.coef
## 19 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) -855.61504558
## (Intercept) .
## PrivateYes -228.72339945
## Accept 1.76228998
## Enroll -1.58326629
## Top10perc 67.69850987
## Top25perc -24.29711506
## F.Undergrad 0.12842608
## P.Undergrad 0.10192439
## Outstate -0.11576445
## Room.Board 0.13861694
## Books 0.45940028
## Personal -0.05639144
## PhD -7.65426623
## Terminal -10.54972812
## S.F.Ratio 44.56058688
## perc.alumni 5.68664205
## Expend 0.10686250
## Grad.Rate 10.97568606
# Because the selected lambda (0.01) is so small, the lasso applies almost no
# shrinkage here: all 17 predictors have non-zero coefficient estimates, so no
# variables are dropped from the model. (The second '(Intercept)' row is the
# redundant intercept column produced by model.matrix() and can be ignored.)
# Part e: Fit a PCR model on the training set, with M chosen by cross-validation.
# Report the test error obtained, along with the value of M selected by
# cross-validation.
library(pls)
##
## Attaching package: 'pls'
## The following object is masked from 'package:stats':
##
## loadings
set.seed(2)
college.pcr <- pcr(Apps ~ ., data = college.train, scale = TRUE,
validation = "CV")
summary(college.pcr)
## Data: X dimension: 388 17
## Y dimension: 388 1
## Fit method: svdpc
## Number of components considered: 17
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 4076 3962 2340 2362 2165 1933 1899
## adjCV 4076 3959 2334 2358 2178 1907 1889
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 1890 1902 1875 1884 1895 1893 1918
## adjCV 1876 1893 1865 1874 1885 1883 1908
## 14 comps 15 comps 16 comps 17 comps
## CV 1937 1779 1412 1307
## adjCV 1982 1739 1395 1294
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps
## X 31.100 55.92 63.61 69.43 74.88 79.92 83.7
## Apps 7.934 69.39 69.39 74.33 80.84 81.51 81.8
## 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps 14 comps
## X 87.35 90.34 92.78 94.90 96.70 97.85 98.66
## Apps 81.80 82.44 82.50 82.51 82.66 82.70 84.53
## 15 comps 16 comps 17 comps
## X 99.40 99.84 100.00
## Apps 90.56 92.06 92.74
validationplot(college.pcr, val.type = "MSEP")

# The cross-validated error is smallest at M = 17, which is just the full least
# squares fit; we use M = 16, the largest value that still gives some dimension
# reduction, since its error is the next lowest.
pcr.pred <- predict(college.pcr, college.test, ncomp = 16)
mean((pcr.pred - college.test$Apps)^2)
## [1] 1098421
# So the test MSE for PCR with M = 16 is about 1098421, slightly lower than least
# squares, ridge regression, and the lasso.
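# To double-check where the cross-validated error bottoms out, rather than reading
# it off the plot, the CV estimates can be pulled out with MSEP() (a sketch,
# assuming the usual array layout of its return value):
pcr.msep <- MSEP(college.pcr, estimate = "CV")$val[1, 1, ]  # CV MSEP for 0..17 components
which.min(pcr.msep) - 1  # subtract 1: the first entry is the intercept-only model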
# Part f: Fit a PLS model on the training set, with M chosen by cross-validation.
# Report the test error obtained, along with the value of M selected by cross-
# validation.
set.seed(1)
college.pls <- plsr(Apps ~ ., data = college.train, scale = TRUE,
validation = "CV")
summary(college.pls)
## Data: X dimension: 388 17
## Y dimension: 388 1
## Fit method: kernelpls
## Number of components considered: 17
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 4076 2139 1942 1774 1692 1545 1445
## adjCV 4076 2132 1943 1761 1649 1510 1421
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 1379 1347 1340 1340 1336 1328 1328
## adjCV 1362 1332 1325 1325 1322 1314 1314
## 14 comps 15 comps 16 comps 17 comps
## CV 1330 1329 1328 1328
## adjCV 1315 1314 1314 1314
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps
## X 25.70 46.36 61.26 63.07 66.74 71.25 76.44
## Apps 74.52 80.88 85.71 90.93 92.07 92.29 92.39
## 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps 14 comps
## X 79.76 82.60 85.62 88.93 90.20 92.12 93.32
## Apps 92.53 92.63 92.67 92.69 92.73 92.74 92.74
## 15 comps 16 comps 17 comps
## X 95.83 98.01 100.00
## Apps 92.74 92.74 92.74
validationplot(college.pls, val.type = "MSEP")

# The CV error drops sharply at first and then flattens; by M = 6 most of the
# reduction has been achieved and further components give only modest improvement,
# so we'll let M = 6.
pls.pred <- predict(college.pls, college.test, ncomp = 6)
mean((pls.pred - college.test$Apps)^2)
## [1] 1102855
# So the test MSE for PLS with M = 6 is about 1102855, slightly higher than PCR
# but lower than the lasso, ridge regression, and least squares.
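# As an alternative to eyeballing the validation plot, recent versions of the pls
# package provide selectNcomp() for a more formal choice of M (shown here as a
# sketch for reference only; it may pick a different M than the 6 used above):
selectNcomp(college.pls, method = "onesigma", plot = TRUE)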
# Part g: Comment on the results obtained. How accurately can we predict the
# number of college applications recieved? Is there much difference among the
# test errors resulting from these five approaches?
# Compute R^2 for all
test.avg <- mean(college.test$Apps)
lm.r2 <- 1 - mean((college.pred - college.test$Apps)^2) /
mean((test.avg - college.test$Apps)^2)
ridge.r2 <- 1 - mean((ridge.pred - college.test$Apps)^2)/
mean((test.avg - college.test$Apps)^2)
lasso.r2 <- 1 - mean((lasso.pred - college.test$Apps)^2) /
mean((test.avg - college.test$Apps)^2)
pcr.r2 <- 1 - mean((pcr.pred - college.test$Apps)^2) /
mean((test.avg - college.test$Apps)^2)
pls.r2 <- 1 - mean((pls.pred - college.test$Apps)^2) /
mean((test.avg - college.test$Apps)^2)
lm.r2
## [1] 0.9154386
ridge.r2
## [1] 0.9154419
lasso.r2
## [1] 0.9155084
pcr.r2
## [1] 0.9179878
pls.r2
## [1] 0.9176568
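# To make the comparison easier to see (a small convenience sketch), the five test
# R^2 values can be collected and plotted side by side:
r2.all <- c(lm = lm.r2, ridge = ridge.r2, lasso = lasso.r2, pcr = pcr.r2, pls = pls.r2)
round(r2.all, 4)
barplot(r2.all, ylab = "Test R^2", ylim = c(0, 1))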
# We can predict the number of college applications fairly accurately: all five
# models explain roughly 92% of the variance in the test data. There is not a huge
# difference in test R^2 among the models; the plain linear model has the lowest
# value, but only barely. Keep in mind these are ordinary R^2 values, with no
# penalty for the number of predictors or components used. The results may also
# change if the code is run again, since the models depend on the train/test
# split, which is re-randomized each run.