Chapter 6: (8) and (9)

## a)
set.seed(1)
# Simulate the predictor and the noise term
X = rnorm(100)
epsilon = rnorm(100)
# True cubic model: Y = b0 + b1*X + b2*X^2 + b3*X^3 + epsilon
b0 = 2
b1 = 5
b2 = 1
b3 = 3
Y = b0 + b1 * X + b2 * X^2 + b3 * X^3 + epsilon
library(leaps)
full_data_set = data.frame(Y, X)
regfit.full = regsubsets(Y~poly(X,10,raw=T), data=full_data_set, nvmax=10)
reg.summary = summary(regfit.full)
par(mfrow=c(2,2))
plot(reg.summary$cp, xlab="Number of variables", ylab="C_p", type="l")
points(which.min(reg.summary$cp), reg.summary$cp[which.min(reg.summary$cp)], col="green", cex=2, pch=20)
plot(reg.summary$bic, xlab="Number of variables", ylab="BIC", type="l")
points(which.min(reg.summary$bic), reg.summary$bic[which.min(reg.summary$bic)], col="green", cex=2, pch=20)
plot(reg.summary$adjr2, xlab="Number of variables", ylab="Adjusted R^2", type="l")
points(which.max(reg.summary$adjr2), reg.summary$adjr2[which.max(reg.summary$adjr2)], col="green", cex=2, pch=20)
coef(regfit.full, 6)
## (Intercept) poly(X, 10, raw = T)1 poly(X, 10, raw = T)2
## 2.110439287 5.384203911 0.759905641
## poly(X, 10, raw = T)3 poly(X, 10, raw = T)7 poly(X, 10, raw = T)8
## 2.618914864 0.033082193 0.001295952
## poly(X, 10, raw = T)9
## -0.003967445
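The model size each criterion favors can also be read off directly (a quick check corresponding to the highlighted points in the plots above):

# Sizes chosen by Cp, BIC, and adjusted R^2 for the full search
c(cp = which.min(reg.summary$cp),
  bic = which.min(reg.summary$bic),
  adjr2 = which.max(reg.summary$adjr2))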
regfit.fwd = regsubsets(Y~poly(X,10,raw=T), data=full_data_set, nvmax=10, method="forward")
fwd.summary = summary(regfit.fwd)
par(mfrow=c(2,2))
plot(fwd.summary$cp, xlab="Number of variables", ylab="C_p", type="l")
points(which.min(fwd.summary$cp), fwd.summary$cp[which.min(fwd.summary$cp)], col="blue", cex=2, pch=20)
plot(fwd.summary$bic, xlab="Number of variables", ylab="BIC", type="l")
points(which.min(fwd.summary$bic), fwd.summary$bic[which.min(fwd.summary$bic)], col="blue", cex=2, pch=20)
plot(fwd.summary$adjr2, xlab="Number of variables", ylab="Adjusted R^2", type="l")
points(which.max(fwd.summary$adjr2), fwd.summary$adjr2[which.max(fwd.summary$adjr2)], col="blue", cex=2, pch=20)
coef(regfit.fwd, 6)
## (Intercept) poly(X, 10, raw = T)1 poly(X, 10, raw = T)2
## 2.1150434322 5.4221681979 0.7414449221
## poly(X, 10, raw = T)3 poly(X, 10, raw = T)5 poly(X, 10, raw = T)6
## 2.4840805635 0.1137013368 0.0067082199
## poly(X, 10, raw = T)9
## -0.0008836127
regfit.back = regsubsets(Y~poly(X,10,raw=T), data=full_data_set, nvmax=10, method="backward")
bwd.summary = summary(regfit.back)
par(mfrow=c(2,2))
plot(bwd.summary$cp, xlab="Number of variables", ylab="C_p", type="l")
points(which.min(bwd.summary$cp), bwd.summary$cp[which.min(bwd.summary$cp)], col="red", cex=2, pch=20)
plot(bwd.summary$bic, xlab="Number of variables", ylab="BIC", type="l")
points(which.min(bwd.summary$bic), bwd.summary$bic[which.min(bwd.summary$bic)], col="red", cex=2, pch=20)
plot(bwd.summary$adjr2, xlab="Number of variables", ylab="Adjusted R^2", type="l")
points(which.max(bwd.summary$adjr2), bwd.summary$adjr2[which.max(bwd.summary$adjr2)], col="red", cex=2, pch=20)
coef(regfit.back, 6)
## (Intercept) poly(X, 10, raw = T)1 poly(X, 10, raw = T)3
## 2.1952117 5.0448889 2.9796905
## poly(X, 10, raw = T)4 poly(X, 10, raw = T)6 poly(X, 10, raw = T)8
## 1.2870503 -0.7374018 0.1640190
## poly(X, 10, raw = T)10
## -0.0120300
coef(regfit.full, 6)
## (Intercept) poly(X, 10, raw = T)1 poly(X, 10, raw = T)2
## 2.110439287 5.384203911 0.759905641
## poly(X, 10, raw = T)3 poly(X, 10, raw = T)7 poly(X, 10, raw = T)8
## 2.618914864 0.033082193 0.001295952
## poly(X, 10, raw = T)9
## -0.003967445
coef(regfit.fwd, 6)
## (Intercept) poly(X, 10, raw = T)1 poly(X, 10, raw = T)2
## 2.1150434322 5.4221681979 0.7414449221
## poly(X, 10, raw = T)3 poly(X, 10, raw = T)5 poly(X, 10, raw = T)6
## 2.4840805635 0.1137013368 0.0067082199
## poly(X, 10, raw = T)9
## -0.0008836127
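To make the comparison concrete, the three six-variable fits can be lined up side by side (a minimal sketch; an NA marks a term that a method did not select):

# Collect coef(fit, 6) from each search into one table
all_terms = c("(Intercept)", paste0("poly(X, 10, raw = T)", 1:10))
coef_table = sapply(list(full = regfit.full, forward = regfit.fwd, backward = regfit.back),
                    function(fit) coef(fit, 6)[all_terms])
rownames(coef_table) = all_terms
round(coef_table, 4)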
As the table makes clear, the full-subset and forward-stepwise fits are very similar: both estimate the X, X^2, and X^3 coefficients close to the true values (5, 1, 3) and add only tiny higher-order terms. The backward-stepwise fit differs markedly: it drops X^2 entirely and instead includes X^4, X^6, X^8, and X^10. Backward selection thus arrives at a distinctly different model with markedly different coefficients.

## e)
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-8
x_mat = model.matrix(Y~poly(X,10,raw=T))[,-1]
lasso.mod = cv.glmnet(x_mat, Y, alpha=1)
(lambda = lasso.mod$lambda.min)
## [1] 0.0536454
par(mfrow=c(1,1))
plot(lasso.mod)
predict(lasso.mod, s=lambda, type="coefficients")
## 11 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 2.166718686
## poly(X, 10, raw = T)1 5.185343238
## poly(X, 10, raw = T)2 0.639888787
## poly(X, 10, raw = T)3 2.770953283
## poly(X, 10, raw = T)4 0.042241167
## poly(X, 10, raw = T)5 0.025323050
## poly(X, 10, raw = T)6 .
## poly(X, 10, raw = T)7 0.002886045
## poly(X, 10, raw = T)8 .
## poly(X, 10, raw = T)9 .
## poly(X, 10, raw = T)10 .
At lambda.min the cross-validated lasso keeps six nonzero terms, and its leading coefficients (on X, X^2, and X^3) closely match those from the full and forward-stepwise fits, while diverging from the backward-stepwise solution.
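A quick count confirms that six slope coefficients are nonzero at lambda.min (a small sanity check on the fit above):

lasso_coefs = predict(lasso.mod, s = lambda, type = "coefficients")
sum(lasso_coefs[-1, ] != 0)  # number of nonzero slope terms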
## f)

# New response: Y2 = b0 + b7 * X^7 + epsilon, with b0 = 4 and b7 = 7,
# and epsilon is the same error term added to Y2.
Y2 = 4 + 7 * X^7 + epsilon
data_full_set2 = data.frame(y = Y2, x = X)
regfit.full2 = regsubsets(y ~ poly(x, 10, raw = T), data = data_full_set2, nvmax = 10)
summary2 = summary(regfit.full2)
# Setting up the plotting layout
par(mfrow = c(2, 2))
# Plotting Cp
plot(summary2$cp, xlab = "Number of variables", ylab = "C_p", type = "l")
points(which.min(summary2$cp), summary2$cp[which.min(summary2$cp)], col = "red", cex = 2, pch = 20)
# Plotting BIC
plot(summary2$bic, xlab = "Number of variables", ylab = "BIC", type = "l")
points(which.min(summary2$bic), summary2$bic[which.min(summary2$bic)], col = "green", cex = 2, pch = 20)
# Plotting Adjusted R^2
plot(summary2$adjr2, xlab = "Number of variables", ylab = "Adjusted R^2", type = "l")
points(which.max(summary2$adjr2), summary2$adjr2[which.max(summary2$adjr2)], col = "blue", cex = 2, pch = 20)
coef(regfit.full2, which.min(summary2$cp))
## (Intercept) poly(x, 10, raw = T)2 poly(x, 10, raw = T)7
## 4.0704904 -0.1417084 7.0015552
coef(regfit.full2, which.min(summary2$bic))
## (Intercept) poly(x, 10, raw = T)7
## 3.95894 7.00077
coef(regfit.full2, which.max(summary2$adjr2))
xmat = model.matrix(Y2~poly(X,10,raw=T))[,-1]
lasso2 = cv.glmnet(xmat, Y2, alpha=1)
(lambda = lasso2$lambda.min)
## [1] 12.36884
par(mfrow=c(1,1))
plot(lasso2)
predict(lasso2, s=lambda, type="coefficients")
## 11 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 4.820215
## poly(X, 10, raw = T)1 .
## poly(X, 10, raw = T)2 .
## poly(X, 10, raw = T)3 .
## poly(X, 10, raw = T)4 .
## poly(X, 10, raw = T)5 .
## poly(X, 10, raw = T)6 .
## poly(X, 10, raw = T)7 6.796694
## poly(X, 10, raw = T)8 .
## poly(X, 10, raw = T)9 .
## poly(X, 10, raw = T)10 .
The lasso again selects a one-variable model, retaining only X^7, which matches the model chosen by BIC above; its coefficient estimate (about 6.80) is shrunk below the least-squares value of roughly 7.00, and its intercept (4.82) sits slightly above the true value of 4.
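The active set can also be extracted programmatically (a short sketch using the lasso2 fit above):

# Names of the nonzero slope terms at lambda.min
f_coefs = predict(lasso2, s = lambda, type = "coefficients")
setdiff(rownames(f_coefs)[as.vector(f_coefs) != 0], "(Intercept)")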
## (9)

library(ISLR2)
data(College)
grid <- 10^seq(10, -2, length = 100)  # lambda grid, reused for the lasso fits below
set.seed(1)
trainid = sample(1:nrow(College), nrow(College)/2)
train_college = College[trainid,]
test_college = College[-trainid,]
x = model.matrix(Apps ~ ., data = College)[, -1]
y = College$Apps
lm.fit = lm(Apps~., data=train_college)
lm.pred = predict(lm.fit, test_college)
lm.err = mean((test_college$Apps - lm.pred)^2)
lm.err
## [1] 1135758
# Note: model.matrix() keeps the "(Intercept)" column here (no [, -1]), so these
# matrices carry 18 columns; glmnet effectively ignores the constant column
# (zero coefficient), which is why coef(cv.out) below has 19 rows.
train_college.X = model.matrix(Apps ~ ., data = train_college)
train_college.Y = train_college[, "Apps"]
test_college.X = model.matrix(Apps ~ ., data = test_college)
test_college.Y = test_college[, "Apps"]
set.seed(1)
cv.out <- cv.glmnet(train_college.X, train_college.Y, alpha = 0)
plot(cv.out)
bestlam <- cv.out$lambda.min
bestlam
## [1] 405.8404
cv.pred <- predict(cv.out, s = bestlam, newx = test_college.X)
mean((cv.pred - test_college.Y)^2)
## [1] 976261.5
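Relative to the least-squares fit, that is roughly a 14% reduction in test MSE (a quick check reusing the objects above):

1 - mean((cv.pred - test_college.Y)^2) / lm.err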
dim(coef(cv.out))
## [1] 19 1
# Refit ridge on the full data set and inspect coefficients at the CV-chosen lambda
out <- glmnet(x, y, alpha = 0)
ridge_coef = predict(out, type = "coefficients", s = bestlam)[1:18, ]
ridge_coef[ridge_coef!=0]
## (Intercept) PrivateYes Accept Enroll Top10perc
## -1.521486e+03 -5.295317e+02 9.743995e-01 4.712708e-01 2.486088e+01
## Top25perc F.Undergrad P.Undergrad Outstate Room.Board
## 1.124189e+00 7.721190e-02 2.453206e-02 -2.100391e-02 1.999142e-01
## Books Personal PhD Terminal S.F.Ratio
## 1.362569e-01 -9.049362e-03 -3.734037e+00 -4.696652e+00 1.280442e+01
## perc.alumni Expend Grad.Rate
## -8.869487e+00 7.518826e-02 1.138097e+01
lasso.mod <- glmnet(train_college.X, train_college.Y, alpha = 1, lambda = grid)
plot(lasso.mod)
## Warning in regularize.values(x, y, ties, missing(ties), na.rm = na.rm):
## collapsing to unique 'x' values
set.seed(1)
cv.out <- cv.glmnet(train_college.X, train_college.Y, alpha = 1)
plot(cv.out)
bestlam_lasso <- cv.out$lambda.min
lasso.pred <- predict(lasso.mod, s = bestlam_lasso, newx = test_college.X)
mean((lasso.pred - test_college.Y)^2)
## [1] 1116252
# Refit the lasso on the full data set to inspect coefficients at bestlam_lasso
out <- glmnet(x, y, alpha = 1, lambda = grid)
lasso.coef <- predict(out, type = "coefficients", s = bestlam_lasso)[1:18, ]
lasso.coef[lasso.coef != 0]
## (Intercept) PrivateYes Accept Enroll Top10perc
## -468.96036214 -491.47351867 1.57117027 -0.76858284 48.25732969
## Top25perc F.Undergrad P.Undergrad Outstate Room.Board
## -12.94716203 0.04293538 0.04406960 -0.08340724 0.14963683
## Books Personal PhD Terminal S.F.Ratio
## 0.01549548 0.02915128 -8.42538012 -3.26671319 14.61167079
## perc.alumni Expend Grad.Rate
## -0.02612797 0.07718333 8.31579869
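Unlike in the simulation, the lasso performs essentially no variable selection at this lambda; counting the nonzero slopes makes that explicit:

sum(lasso.coef[-1] != 0)  # all 17 predictors are retained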
library(pls)
##
## Attaching package: 'pls'
## The following object is masked from 'package:stats':
##
## loadings
set.seed(1)
pcr.fit <- pcr(Apps~., data=train_college, scale=TRUE, validation="CV")
validationplot(pcr.fit, val.type = "MSEP")
test_college.X2 <- model.matrix(Apps ~ ., data = test_college)[, -1]
pcr.pred <- predict(pcr.fit, test_college.X2, ncomp=17)
mean((pcr.pred - test_college.Y)^2)
## [1] 1135758
M = 17  # with all 17 components, PCR reproduces the least-squares fit, hence the identical test MSE
pls.fit <- plsr(Apps ~ ., data = train_college, scale= TRUE , validation = "CV")
validationplot(pls.fit, val.type = "MSEP")
pls.pred <- predict(pls.fit, test_college.X2, ncomp = 6)
mean((pls.pred - test_college.Y)^2)
## [1] 1066991
M = 6  # chosen from the validation plot, where the cross-validated MSEP has largely leveled off
The differences in test error are not extreme. Ridge regression attains the lowest test MSE (about 976,262), followed by PLS (about 1,066,991); the lasso (about 1,116,252) and least squares (about 1,135,758) are somewhat worse, and PCR with M = 17 simply reproduces the least-squares fit. Overall, all five approaches predict the number of applications received with comparable, reasonably high accuracy.
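As a compact recap, the five test MSEs computed above can be collected and sorted (a small sketch reusing the prediction objects already in the workspace):

# cv.pred still holds the ridge predictions computed earlier
test.errors = c(OLS = lm.err,
                Ridge = mean((cv.pred - test_college.Y)^2),
                Lasso = mean((lasso.pred - test_college.Y)^2),
                PCR = mean((pcr.pred - test_college.Y)^2),
                PLS = mean((pls.pred - test_college.Y)^2))
sort(test.errors)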