library(ggplot2)
library(glmnet)

Problem 6.8 pg.285 of ISLR

6.8a.)

set.seed(10)
x <- rnorm(n=100, mean=2, sd=4)
epsilon <- rnorm(n=100, mean=0, sd=1)

6.8b.)

b.0 <- 5
b.1 <- 2
b.2 <- 0.5
b.3 <- -1.2
y <- b.0 + b.1*x + b.2*x^2 + b.3*x^3 + epsilon
data <- data.frame(x = x, y = y)
p <- ggplot(data = data, aes(x=x, y=y)) + geom_point()
p

6.8c.) Use the regsubsets() function to perform best subset selection

library(leaps) 
power.df <- data.frame(x = x, x.2 = x^2, x.3 = x^3, x.4 = x^4, x.5 = x^5, x.6 = x^6, x.7 = x^7, x.8 = x^8, x.9 = x^9, x.10 = x^10, y = y)
regfit.full <- regsubsets(y ~ ., data = power.df, nvmax = 11)
reg.summary <- summary(regfit.full)

par(mfrow = c(2,2))
plot(reg.summary$cp, xlab = 'Number of Variables', ylab = "Cp", type = 'l')
plot(reg.summary$adjr2, xlab = 'Number of Variables', ylab = "Adjusted RSq", type = 'l')
plot(reg.summary$bic, xlab = 'Number of Variables', ylab = 'BIC', type = 'l')

All three criteria point to the three-variable model: in the plots above, \(C_p\) and BIC are minimized at three variables and adjusted \(R^2\) is maximized at three variables.
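
The same conclusion can be read off programmatically; a minimal sketch using the summary object above:

which.min(reg.summary$cp)      # model size minimizing Cp
which.max(reg.summary$adjr2)   # model size maximizing adjusted R-squared
which.min(reg.summary$bic)     # model size minimizing BIC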

reg.summary
## Subset selection object
## Call: regsubsets.formula(y ~ ., data = power.df, nvmax = 11)
## 10 Variables  (and intercept)
##      Forced in Forced out
## x        FALSE      FALSE
## x.2      FALSE      FALSE
## x.3      FALSE      FALSE
## x.4      FALSE      FALSE
## x.5      FALSE      FALSE
## x.6      FALSE      FALSE
## x.7      FALSE      FALSE
## x.8      FALSE      FALSE
## x.9      FALSE      FALSE
## x.10     FALSE      FALSE
## 1 subsets of each size up to 10
## Selection Algorithm: exhaustive
##           x   x.2 x.3 x.4 x.5 x.6 x.7 x.8 x.9 x.10
## 1  ( 1 )  " " " " "*" " " " " " " " " " " " " " " 
## 2  ( 1 )  " " "*" "*" " " " " " " " " " " " " " " 
## 3  ( 1 )  "*" "*" "*" " " " " " " " " " " " " " " 
## 4  ( 1 )  "*" "*" "*" " " "*" " " " " " " " " " " 
## 5  ( 1 )  "*" "*" "*" " " "*" " " " " "*" " " " " 
## 6  ( 1 )  "*" "*" "*" " " " " "*" " " "*" "*" " " 
## 7  ( 1 )  "*" "*" "*" "*" " " "*" " " "*" " " "*" 
## 8  ( 1 )  "*" "*" "*" "*" "*" "*" " " "*" "*" " " 
## 9  ( 1 )  "*" "*" "*" " " "*" "*" "*" "*" "*" "*" 
## 10  ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"

We can see from the summary of regfit.full that the best three-variable model includes \(X\), \(X^2\), and \(X^3\).
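
For completeness, the coefficient estimates of this three-variable model can be extracted with coef(), just as we do for the stepwise fits below:

coef(regfit.full, 3)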

6.8d.) Use forward stepwise selection and backward stepwise selection. Do the answers differ from part (c)?

We can also use the regsubsets() function to perform forward stepwise or backward stepwise selection using the argument method = “forward” or method = “backward”.

We will first run forward stepwise selection.

regfit.fwd <- regsubsets(y ~ ., data = power.df, nvmax = 11, method = 'forward')
regfit.fwd.summary <- summary(regfit.fwd)
par(mfrow = c(2,2))
plot(regfit.fwd.summary$cp, xlab = 'Number of Variables', ylab = "Cp", type = 'l')
plot(regfit.fwd.summary$adjr2, xlab = 'Number of Variables', ylab = "Adjusted RSq", type = 'l')
plot(regfit.fwd.summary$bic, xlab = 'Number of Variables', ylab = 'BIC', type = 'l')

Using forward stepwise selection, we again see that the best model uses three variables. Let's see which three.

coef(regfit.fwd, 3)
## (Intercept)           x         x.2         x.3 
##    4.975134    1.984106    0.495747   -1.199670

From the output we see that, once again, \(X\), \(X^2\), and \(X^3\) are the chosen predictors.

Now let's try backward stepwise selection.

regfit.bwd <- regsubsets(y ~ ., data = power.df, nvmax = 11, method = 'backward')
regfit.bwd.summary <- summary(regfit.bwd)
par(mfrow = c(2,2))
plot(regfit.bwd.summary$cp, xlab = 'Number of Variables', ylab = "Cp", type = 'l')
plot(regfit.bwd.summary$adjr2, xlab = 'Number of Variables', ylab = "Adjusted RSq", type = 'l')
plot(regfit.bwd.summary$bic, xlab = 'Number of Variables', ylab = 'BIC', type = 'l')

Again, unsurprisingly, the best model uses three variables.

coef(regfit.bwd, 3)
## (Intercept)           x         x.2         x.3 
##    4.975134    1.984106    0.495747   -1.199670

In summary, forward and backward stepwise selection choose the same three predictors, matching the best subset model from part (c).
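
As a quick sanity check (a minimal sketch using the fits above), we can verify that the forward and backward three-variable models are identical:

all.equal(coef(regfit.fwd, 3), coef(regfit.bwd, 3))   # should report TRUE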

6.8e.) Now fit a lasso model to the simulated data, using \(X, X^2, \ldots, X^{10}\) as predictors. Use cross-validation to select the optimal \(\lambda\).

We first need to split our data into test and training sets.

x <- model.matrix(y ~ ., data = power.df)[,-1]
set.seed(1)
n <- nrow(x)
ntest <- trunc(n/3)
testid <- sample(1:n, ntest)
y.test <- y[testid]
train <- (-testid)

We perform cross-validation using the cv.glmnet() function.

library(glmnet)
cv.out <- cv.glmnet(x[train,], y[train], alpha = 1, nfolds = 5)
plot(cv.out)

best.lam <- cv.out$lambda.min
best.lam
## [1] 3.680833

We can see that the cross-validation error is minimized for the above value of \(\lambda\).
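
Note that cv.glmnet() also reports lambda.1se, the largest \(\lambda\) whose cross-validation error is within one standard error of the minimum; a minimal sketch comparing the two:

best.lam.1se <- cv.out$lambda.1se             # a more heavily penalized alternative
c(lambda.min = best.lam, lambda.1se = best.lam.1se)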

Now we can see the coefficients of the best model.

best_model <- glmnet(x, y, alpha = 1, lambda = best.lam)
coef(best_model)
## 11 x 1 sparse Matrix of class "dgCMatrix"
##                        s0
## (Intercept)  9.6181119438
## x            .           
## x.2          .           
## x.3         -1.0795874534
## x.4          .           
## x.5         -0.0003651072
## x.6          .           
## x.7          .           
## x.8          .           
## x.9          .           
## x.10         .

It appears that the best model according to the lasso has intercept \(\approx 9.618\), an \(X^3\) coefficient \(\approx -1.080\), and a very small \(X^5\) coefficient \(\approx -0.0004\). This is in stark contrast to subset selection, which consistently included \(X\), \(X^2\), and \(X^3\).
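
Since best_model above is fit on all 100 observations, a fairer evaluation would refit on the training rows only and compute the error on the held-out rows; a minimal sketch using the objects already defined (lasso.train and lasso.test.pred are names introduced here):

lasso.train <- glmnet(x[train, ], y[train], alpha = 1, lambda = best.lam)
lasso.test.pred <- predict(lasso.train, newx = x[testid, ])
mean((lasso.test.pred - y.test)^2)            # held-out test MSE for the lasso fit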

6.8f.) Create a new response variable and use subset selection and lasso. Discuss the results obtained.

Now let’s create a response variable \(Y = \beta_0 + \beta_7 X^7 + \epsilon\).

#define our beta's
b.0 <- 5
b.7 <- -3.5

set.seed(10)
x <- rnorm(n=100, mean=2, sd=4)
epsilon <- rnorm(n=100, mean=0, sd=1)

y.7 <- b.0 + b.7 * (x^7) + epsilon

plot(x, y.7)

Let’s use subset selection first.

power.df <- data.frame(x = x, x.2 = x^2, x.3 = x^3, x.4 = x^4, x.5 = x^5, x.6 = x^6, x.7 = x^7, x.8 = x^8, x.9 = x^9, x.10 = x^10, y = y.7)
regfit.full <- regsubsets(y ~ ., data = power.df, nvmax = 11)
reg.summary <- summary(regfit.full)

par(mfrow = c(2,2))
plot(reg.summary$cp, xlab = 'Number of Variables', ylab = "Cp", type = 'l')
plot(reg.summary$adjr2, xlab = 'Number of Variables', ylab = "Adjusted RSq", type = 'l')
plot(reg.summary$bic, xlab = 'Number of Variables', ylab = 'BIC', type = 'l')

It appears that both \(C_p\) and BIC are minimized by the one-variable model and, according to the summary below, that variable is \(X^7\).
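
The same programmatic check as in part (c) can be used here:

which.min(reg.summary$cp)    # size of the Cp-optimal model
which.min(reg.summary$bic)   # size of the BIC-optimal model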

reg.summary
## Subset selection object
## Call: regsubsets.formula(y ~ ., data = power.df, nvmax = 11)
## 10 Variables  (and intercept)
##      Forced in Forced out
## x        FALSE      FALSE
## x.2      FALSE      FALSE
## x.3      FALSE      FALSE
## x.4      FALSE      FALSE
## x.5      FALSE      FALSE
## x.6      FALSE      FALSE
## x.7      FALSE      FALSE
## x.8      FALSE      FALSE
## x.9      FALSE      FALSE
## x.10     FALSE      FALSE
## 1 subsets of each size up to 10
## Selection Algorithm: exhaustive
##           x   x.2 x.3 x.4 x.5 x.6 x.7 x.8 x.9 x.10
## 1  ( 1 )  " " " " " " " " " " " " "*" " " " " " " 
## 2  ( 1 )  " " "*" " " " " " " " " "*" " " " " " " 
## 3  ( 1 )  " " " " "*" " " " " " " "*" "*" " " " " 
## 4  ( 1 )  "*" " " "*" " " " " " " "*" "*" " " " " 
## 5  ( 1 )  " " " " " " " " "*" " " "*" "*" "*" "*" 
## 6  ( 1 )  " " "*" " " "*" " " "*" "*" "*" "*" " " 
## 7  ( 1 )  " " " " "*" " " "*" "*" "*" "*" "*" "*" 
## 8  ( 1 )  "*" " " "*" " " "*" "*" "*" "*" "*" "*" 
## 9  ( 1 )  "*" " " "*" "*" "*" "*" "*" "*" "*" "*" 
## 10  ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"

Let's look at the coefficient estimates.

coef(regfit.full, 1)
## (Intercept)         x.7 
##    4.907956   -3.500000

No surprise here. The fit recovers the true coefficients \(\beta_0\) and \(\beta_7\) almost exactly.
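
As a quick check (a sketch using the objects above), the estimation error relative to the true \(\beta_0 = 5\) and \(\beta_7 = -3.5\) is:

coef(regfit.full, 1) - c(b.0, b.7)   # difference between estimates and true values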

Now we will apply lasso regression.

b.0 <- 5
b.7 <- -3.5

set.seed(10)
x <- rnorm(n=100, mean=2, sd=4)
epsilon <- rnorm(n=100, mean=0, sd=1)

y.7 <- b.0 + b.7 * (x^7) + epsilon

power.df <- data.frame(x = x, x.2 = x^2, x.3 = x^3, x.4 = x^4, x.5 = x^5, x.6 = x^6, x.7 = x^7, x.8 = x^8, x.9 = x^9, x.10 = x^10, y = y.7)

x <- model.matrix(y ~ ., data = power.df)[,-1]
set.seed(1)
n <- nrow(x)
ntest <- trunc(n/3)
testid <- sample(1:n, ntest)
y.test <- y.7[testid]
train <- (-testid)

cv.out <- cv.glmnet(x[train,], y.7[train], alpha = 1, nfolds = 5)
plot(cv.out)

best.lam <- cv.out$lambda.min

best_model <- glmnet(x, y.7, alpha = 1, lambda = best.lam)
coef(best_model)
## 11 x 1 sparse Matrix of class "dgCMatrix"
##                        s0
## (Intercept) -2.687768e+04
## x            .           
## x.2          .           
## x.3          .           
## x.4          .           
## x.5         -2.964627e+01
## x.6         -1.084804e+00
## x.7         -2.419113e+00
## x.8          .           
## x.9         -5.419082e-03
## x.10         .

Interestingly, the lasso retains four non-zero predictors (\(X^5\), \(X^6\), \(X^7\), and \(X^9\)) rather than selecting \(X^7\) alone, as seen in the output above.
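
If a sparser fit is preferred, one option (a sketch, not what was run above; sparse_model is a name introduced here) is the one-standard-error rule instead of lambda.min:

sparse_model <- glmnet(x, y.7, alpha = 1, lambda = cv.out$lambda.1se)
sum(coef(sparse_model)[-1, ] != 0)   # number of non-zero predictors, excluding the intercept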

Problem 6.9 pg.286 of ISLR (Part a, b, c, d)

6.9a.) Split the College data set into a training and test set

library(ISLR2)
set.seed(1)
train <- sample(c(TRUE, FALSE), nrow(College), replace = TRUE)
test <- (!train)

6.9b.) Fit a linear model using least squares on the training set and report the test error obtained

least.squares.mod <- lm(Apps ~ ., data = College[train, ])
least.squares.pred <- predict(least.squares.mod, newdata = College[test,])
least.squares.test.error <- mean((least.squares.pred - College$Apps[test])^2)
least.squares.test.error
## [1] 984743.1

6.9c.) Fit a ridge regression model on the training set, with \(\lambda\) chosen by cross-validation. Report the test error obtained

x <- model.matrix(Apps ~ ., data = College)[,-1]
y <- College$Apps
set.seed(1)
n <- nrow(x)
ntest <- trunc(n/3)
testid <- sample(1:n, ntest)
y.test <- y[testid]
train <- (-testid)
set.seed(1)
cv.out <- cv.glmnet(x[train, ], y[train], alpha = 0, nfolds = 5)
#plot(cv.out)
bestlam <- cv.out$lambda.min

ridge.mod <- glmnet(x[train, ], y[train], alpha = 0, lambda = bestlam)
ridge.pred <- predict(ridge.mod, s = bestlam, newx = x[testid,])
mean((ridge.pred - y.test)^2)
## [1] 2739396
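
For reference, the ridge coefficient estimates at the chosen \(\lambda\) can be inspected the same way as for the lasso below (a minimal sketch):

predict(ridge.mod, type = 'coefficients', s = bestlam)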

6.9d.) Fit a lasso model on the training set, with \(\lambda\) chosen by cross-validation. Report the test error obtained, along with the number of non-zero coefficient estimates.

x <- model.matrix(Apps ~ ., data = College)[,-1]
y <- College$Apps
set.seed(1)
n <- nrow(x)
ntest <- trunc(n/3)
testid <- sample(1:n, ntest)
y.test <- y[testid]
train <- (-testid)
set.seed(1)
cv.out <- cv.glmnet(x[train, ], y[train], alpha = 1, nfolds = 5)
#plot(cv.out)
bestlam <- cv.out$lambda.min

lasso.mod <- glmnet(x[train, ], y[train], alpha = 1, lambda = bestlam)
lasso.pred <- predict(lasso.mod, s = bestlam, newx = x[testid,])
mean((lasso.pred - y.test)^2)
## [1] 1684920

Let’s check the number of non-zero coefficient estimates.

lasso.coef <- predict(lasso.mod, type = 'coefficients', s = bestlam)
lasso.coef[lasso.coef != 0]
## <sparse>[ <logic> ] : .M.sub.i.logical() maybe inefficient
##  [1] -4.788408e+02 -8.032647e+02  1.191181e+00 -8.271075e-02  4.185450e+01
##  [6] -9.651726e+00  7.049242e-02  2.441507e-02 -4.718787e-02  1.828779e-01
## [11]  7.496437e-03  4.249123e-02 -1.429313e+00 -1.218140e+01 -1.052814e-01
## [16] -6.868998e+00  9.643843e-02  1.174702e+01
sum(lasso.coef != 0)
## [1] 18

From the output above, the lasso fit retains 18 non-zero coefficient estimates (including the intercept).
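
Finally, a rough side-by-side of the three test errors computed above (a sketch; note that the least squares error was computed on a different random split than the ridge and lasso errors, so the comparison is only approximate):

data.frame(model = c('least squares', 'ridge', 'lasso'),
           test.mse = c(least.squares.test.error,
                        mean((ridge.pred - y.test)^2),
                        mean((lasso.pred - y.test)^2)))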