Complete the following exercises from Introduction to Statistical Learning

Chapter 6: (8) and (9) ## a)

set.seed(1)
X = rnorm(100)
epsilon = rnorm(100)

b)

b0 = 2
b1 = 5
b2 = 1
b3 = 3
Y = b0 + b1 * X + b2 * X^2 + b3 * X^3 + epsilon

c)

library(leaps)
## Warning: package 'leaps' was built under R version 4.3.3
full_data_set = data.frame(Y, X)
regfit.full = regsubsets(Y~poly(X,10,raw=T), data=full_data_set, nvmax=10)
summary = summary(regfit.full)

par(mfrow=c(2,2))

plot(summary$cp, xlab="Number of variables", ylab="C_p", type="l")
points(which.min(summary$cp), summary$cp[which.min(summary$cp)], col="green", cex=2, pch=20)

plot(summary$bic, xlab="Number of variables", ylab="BIC", type="l")
points(which.min(summary$bic), summary$bic[which.min(summary$bic)], col="green", cex=2, pch=20)

plot(summary$adjr2, xlab="Number of variables", ylab="Adjusted R^2", type="l")
points(which.max(summary$adjr2), summary$adjr2[which.max(summary$adjr2)], col="green", cex=2, pch=20)

coef(regfit.full, 6)
##           (Intercept) poly(X, 10, raw = T)1 poly(X, 10, raw = T)2 
##           2.110439287           5.384203911           0.759905641 
## poly(X, 10, raw = T)3 poly(X, 10, raw = T)7 poly(X, 10, raw = T)8 
##           2.618914864           0.033082193           0.001295952 
## poly(X, 10, raw = T)9 
##          -0.003967445

d)

library(leaps)
full_data_set = data.frame(Y, X)
regfit.fwd = regsubsets(Y~poly(X,10,raw=T), data=full_data_set, nvmax=10, method="forward")
summary = summary(regfit.fwd)

par(mfrow=c(2,2))

plot(summary$cp, xlab="Number of variables", ylab="C_p", type="l")
points(which.min(summary$cp), summary$cp[which.min(summary$cp)], col="blue", cex=2, pch=20)

plot(summary$bic, xlab="Number of variables", ylab="BIC", type="l")
points(which.min(summary$bic), summary$bic[which.min(summary$bic)], col="blue", cex=2, pch=20)

plot(summary$adjr2, xlab="Number of variables", ylab="Adjusted R^2", type="l")
points(which.max(summary$adjr2), summary$adjr2[which.max(summary$adjr2)], col="black", cex=2, pch=20)

coef(regfit.fwd, 6)
##           (Intercept) poly(X, 10, raw = T)1 poly(X, 10, raw = T)2 
##          2.1150434322          5.4221681979          0.7414449221 
## poly(X, 10, raw = T)3 poly(X, 10, raw = T)5 poly(X, 10, raw = T)6 
##          2.4840805635          0.1137013368          0.0067082199 
## poly(X, 10, raw = T)9 
##         -0.0008836127
library(leaps)
# Assuming Y and X are defined somewhere in your script
regfit.back = regsubsets(Y~poly(X,10,raw=T), data=data.frame(Y,X), nvmax=10, method="backward")
summary = summary(regfit.back)

par(mfrow=c(2,2))

plot(summary$cp, xlab="Number of variables", ylab="C_p", type="l")
points(which.min(summary$cp), summary$cp[which.min(summary$cp)], col="red", cex=2, pch=20)

plot(summary$bic, xlab="Number of variables", ylab="BIC", type="l")
points(which.min(summary$bic), summary$bic[which.min(summary$bic)], col="green", cex=2, pch=20)

plot(summary$adjr2, xlab="Number of variables", ylab="Adjusted R^2", type="l")
points(which.max(summary$adjr2), summary$adjr2[which.max(summary$adjr2)], col="blue", cex=2, pch=20)

coef(regfit.back, 6)
##            (Intercept)  poly(X, 10, raw = T)1  poly(X, 10, raw = T)3 
##              2.1952117              5.0448889              2.9796905 
##  poly(X, 10, raw = T)4  poly(X, 10, raw = T)6  poly(X, 10, raw = T)8 
##              1.2870503             -0.7374018              0.1640190 
## poly(X, 10, raw = T)10 
##             -0.0120300
coef(regfit.full, 6)
##           (Intercept) poly(X, 10, raw = T)1 poly(X, 10, raw = T)2 
##           2.110439287           5.384203911           0.759905641 
## poly(X, 10, raw = T)3 poly(X, 10, raw = T)7 poly(X, 10, raw = T)8 
##           2.618914864           0.033082193           0.001295952 
## poly(X, 10, raw = T)9 
##          -0.003967445
coef(regfit.fwd, 6)
##           (Intercept) poly(X, 10, raw = T)1 poly(X, 10, raw = T)2 
##          2.1150434322          5.4221681979          0.7414449221 
## poly(X, 10, raw = T)3 poly(X, 10, raw = T)5 poly(X, 10, raw = T)6 
##          2.4840805635          0.1137013368          0.0067082199 
## poly(X, 10, raw = T)9 
##         -0.0008836127

The forward and standard models show a considerable resemblance in terms of their coefficients, whereas the backward model exhibits significant differences. This indicates that using a backward fitting approach results in a model with a distinct number of variables and markedly different coefficients. ## e)

library(glmnet)
## Warning: package 'glmnet' was built under R version 4.3.3
## Loading required package: Matrix
## Loaded glmnet 4.1-8
x_mat = model.matrix(Y~poly(X,10,raw=T))[,-1]
lasso.mod = cv.glmnet(x_mat, Y, alpha=1)
(lambda = lasso.mod$lambda.min)
## [1] 0.0536454
par(mfrow=c(1,1))
plot(lasso.mod)

predict(lasso.mod, s=lambda, type="coefficients")
## 11 x 1 sparse Matrix of class "dgCMatrix"
##                                 s1
## (Intercept)            2.166718686
## poly(X, 10, raw = T)1  5.185343238
## poly(X, 10, raw = T)2  0.639888787
## poly(X, 10, raw = T)3  2.770953283
## poly(X, 10, raw = T)4  0.042241167
## poly(X, 10, raw = T)5  0.025323050
## poly(X, 10, raw = T)6  .          
## poly(X, 10, raw = T)7  0.002886045
## poly(X, 10, raw = T)8  .          
## poly(X, 10, raw = T)9  .          
## poly(X, 10, raw = T)10 .

The selection seems to favor a model with six variables, whose coefficients closely align with those from the forward and full fitting models, yet diverge from those identified by the backward fitting approach.

f)

# and epsilon is some error term added to Y2.
Y2 = 4 + 7 * X^7 + epsilon
data_full_set2 = data.frame(y = Y2, x = X)

# Correcting the formula to use 'y' instead of 'Y'
regfit.full2 = regsubsets(y ~ poly(x, 10, raw = T), data = data_full_set2, nvmax = 10)
summary2 = summary(regfit.full2)

# Setting up the plotting layout
par(mfrow = c(2, 2))

# Plotting Cp
plot(summary2$cp, xlab = "Number of variables", ylab = "C_p", type = "l")
points(which.min(summary2$cp), summary2$cp[which.min(summary2$cp)], col = "red", cex = 2, pch = 20)

# Plotting BIC
plot(summary2$bic, xlab = "Number of variables", ylab = "BIC", type = "l")
points(which.min(summary2$bic), summary2$bic[which.min(summary2$bic)], col = "green", cex = 2, pch = 20)

# Plotting Adjusted R^2
plot(summary2$adjr2, xlab = "Number of variables", ylab = "Adjusted R^2", type = "l")
points(which.max(summary2$adjr2), summary2$adjr2[which.max(summary2$adjr2)], col = "blue", cex = 2, pch = 20)

coef(regfit.full2, which.min(summary2$cp))
##           (Intercept) poly(x, 10, raw = T)2 poly(x, 10, raw = T)7 
##             4.0704904            -0.1417084             7.0015552
coef(regfit.full2, which.min(summary2$bic))
##           (Intercept) poly(x, 10, raw = T)7 
##               3.95894               7.00077
coef(regfit.full2, which.min(summary2$adjr2))
##            (Intercept)  poly(x, 10, raw = T)1  poly(x, 10, raw = T)2 
##             4.17282867             0.51409233            -1.13146007 
##  poly(x, 10, raw = T)3  poly(x, 10, raw = T)4  poly(x, 10, raw = T)5 
##            -0.93113515             1.90382807             0.55109577 
##  poly(x, 10, raw = T)6  poly(x, 10, raw = T)7  poly(x, 10, raw = T)8 
##            -1.26499408             6.84430680             0.31986888 
##  poly(x, 10, raw = T)9 poly(x, 10, raw = T)10 
##             0.01627747            -0.02690171
xmat = model.matrix(Y2~poly(X,10,raw=T))[,-1]
lasso2 = cv.glmnet(xmat, Y2, alpha=1)
(lambda = lasso2$lambda.min)
## [1] 12.36884
par(mfrow=c(1,1))
plot(lasso2)

predict(lasso2, s=lambda, type="coefficients")
## 11 x 1 sparse Matrix of class "dgCMatrix"
##                              s1
## (Intercept)            4.820215
## poly(X, 10, raw = T)1  .       
## poly(X, 10, raw = T)2  .       
## poly(X, 10, raw = T)3  .       
## poly(X, 10, raw = T)4  .       
## poly(X, 10, raw = T)5  .       
## poly(X, 10, raw = T)6  .       
## poly(X, 10, raw = T)7  6.796694
## poly(X, 10, raw = T)8  .       
## poly(X, 10, raw = T)9  .       
## poly(X, 10, raw = T)10 .

The newly implemented lasso model opted for a model with a single variable, diverging significantly from other models. While it aligns with the choice of a one-variable model based on adjusted R^2 criteria, the coefficients it presents are markedly distinct.

9)

a)

library(ISLR2)
## Warning: package 'ISLR2' was built under R version 4.3.2
data(College)
grid <- 10^seq(10, -2, length = 100)
set.seed(1)
trainid = sample(1:nrow(College), nrow(College)/2)
train_college = College[trainid,]
test_college = College[-trainid,]
x = model.matrix(Apps ~ ., data = College)[, -1]
y = College$Apps

b)

lm.fit = lm(Apps~., data=train_college)
lm.pred = predict(lm.fit, test_college)
lm.err = mean((test_college$Apps - lm.pred)^2)
lm.err
## [1] 1135758

c)

train_college.X = model.matrix(Apps ~ ., data = train_college)
train_college.Y = train_college[, "Apps"]
test_college.X = model.matrix(Apps ~ ., data = test_college)
test_college.Y = test_college[, "Apps"]
set.seed(1)
cv.out <- cv.glmnet(train_college.X, train_college.Y, alpha = 0)
plot(cv.out)

bestlam <- cv.out$lambda.min
bestlam
## [1] 405.8404
cv.pred <- predict(cv.out, s = bestlam, newx = test_college.X)
mean((cv.pred - test_college.Y)^2)
## [1] 976261.5
dim(coef(cv.out))
## [1] 19  1
out <- glmnet(x, y, alpha = 0)
ridge_coef = predict(out, type = "coefficients", s = bestlam)[1:18, ]
ridge_coef[ridge_coef!=0]
##   (Intercept)    PrivateYes        Accept        Enroll     Top10perc 
## -1.521486e+03 -5.295317e+02  9.743995e-01  4.712708e-01  2.486088e+01 
##     Top25perc   F.Undergrad   P.Undergrad      Outstate    Room.Board 
##  1.124189e+00  7.721190e-02  2.453206e-02 -2.100391e-02  1.999142e-01 
##         Books      Personal           PhD      Terminal     S.F.Ratio 
##  1.362569e-01 -9.049362e-03 -3.734037e+00 -4.696652e+00  1.280442e+01 
##   perc.alumni        Expend     Grad.Rate 
## -8.869487e+00  7.518826e-02  1.138097e+01

d)

lasso.mod <- glmnet(train_college.X, train_college.Y, alpha = 1, lambda = grid)
plot(lasso.mod)
## Warning in regularize.values(x, y, ties, missing(ties), na.rm = na.rm):
## collapsing to unique 'x' values

set.seed(1)
cv.out <- cv.glmnet(train_college.X, train_college.Y, alpha = 1)
plot(cv.out)

bestlam_lasso <- cv.out$lambda.min
lasso.pred <- predict(lasso.mod, s = bestlam_lasso, newx = test_college.X)
mean((lasso.pred - test_college.Y)^2)
## [1] 1116252
out <- glmnet(x, y, alpha = 1, lambda = grid)
lasso.coef <- predict(out, type = "coefficients", s = bestlam_lasso)[1:18, ]
lasso.coef[lasso.coef != 0]
##   (Intercept)    PrivateYes        Accept        Enroll     Top10perc 
## -468.96036214 -491.47351867    1.57117027   -0.76858284   48.25732969 
##     Top25perc   F.Undergrad   P.Undergrad      Outstate    Room.Board 
##  -12.94716203    0.04293538    0.04406960   -0.08340724    0.14963683 
##         Books      Personal           PhD      Terminal     S.F.Ratio 
##    0.01549548    0.02915128   -8.42538012   -3.26671319   14.61167079 
##   perc.alumni        Expend     Grad.Rate 
##   -0.02612797    0.07718333    8.31579869
library(pls)
## Warning: package 'pls' was built under R version 4.3.3
## 
## Attaching package: 'pls'
## The following object is masked from 'package:stats':
## 
##     loadings
set.seed(1)
pcr.fit <- pcr(Apps~., data=train_college, scale=TRUE, validation="CV")
validationplot(pcr.fit, val.type = "MSEP")

test_college.X2 <- model.matrix(Apps ~ ., data = test_college)[, -1]
pcr.pred <- predict(pcr.fit, test_college.X2, ncomp=17)
mean((pcr.pred - test_college.Y)^2)
## [1] 1135758

M=17

f)

pls.fit <- plsr(Apps ~ ., data = train_college, scale= TRUE , validation = "CV")
validationplot(pls.fit, val.type = "MSEP")

pls.pred <- predict(pls.fit, test_college.X2, ncomp = 6)
mean((pls.pred - test_college.Y)^2)
## [1] 1066991

M = 6

g)

The differences in error rates are not extreme, but Partial Least Squares (PLS) appears to have the lowest error rate, indicating we can predict test scores with relatively high accuracy. However, Principal Component Regression (PCR) seems to exhibit a higher error rate compared to the other methods.