8.)

a.)

#Generate X and Noise
set.seed(1)

n <- 100
X <- rnorm(n)
eps <- rnorm(n)

b.)

#Generate Y in accordance with the model
beta0 <- 1.0
beta1 <- 2.0
beta2 <- -3.0
beta3 <- 0.5

Y <- beta0 + beta1*X + beta2*(X^2) + beta3*(X^3) + eps
#Form a data frame for model fitting
poly_data <- data.frame(
  Y   = Y,
  X   = X,
  X2  = X^2,
  X3  = X^3,
  X4  = X^4,
  X5  = X^5,
  X6  = X^6,
  X7  = X^7,
  X8  = X^8,
  X9  = X^9,
  X10 = X^10
)
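
The same columns could also be built more compactly with raw (non-orthogonal) polynomial terms; a minimal sketch using base R's poly(). The object poly_data_alt is a hypothetical alternative and is not used below.

#Equivalent construction with poly(); raw = TRUE keeps plain powers of X
poly_mat <- poly(X, 10, raw = TRUE)
colnames(poly_mat) <- c("X", paste0("X", 2:10))
poly_data_alt <- data.frame(Y = Y, poly_mat)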

c.)

#Fitting Different Subsets
library(leaps)
regfit <- regsubsets(
  Y ~ X + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 + X10,
  data = poly_data,
  nvmax = 10
)
#Comparing Models
reg_summary <- summary(regfit)

reg_summary$cp
##  [1] 917.497515  63.788757  -1.340110   0.354922   2.052791   3.853556
##  [7]   5.264784   7.169409   9.065416  11.000000
reg_summary$bic
##  [1]  -58.6585 -240.0439 -290.8529 -286.5847 -282.3145 -277.9308 -273.9830
##  [8] -269.4848 -264.9963 -260.4646
reg_summary$adjr2
##  [1] 0.4875417 0.9193942 0.9532043 0.9528708 0.9525287 0.9521244 0.9519211
##  [8] 0.9514447 0.9509625 0.9504479

Interpretation: The lowest Cp, the lowest BIC, and the highest adjusted R^2 all occur at three predictors, so all three criteria point to a 3-variable model. The plots below give visual confirmation, with the relevant minima and maxima highlighted in red.
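
The same conclusion can be read off numerically; a quick check, assuming the reg_summary object from the chunk above:

#Model size preferred by each criterion (all three give 3 here)
which.min(reg_summary$cp)
which.min(reg_summary$bic)
which.max(reg_summary$adjr2)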

#Plots
#Cp 
plot(reg_summary$cp, xlab = "Number of Predictors", ylab = "Cp", 
     pch = 19, type = "b")
best_cp <- which.min(reg_summary$cp)
points(best_cp, reg_summary$cp[best_cp], col = "red", cex = 2, pch = 20)
abline(h = min(reg_summary$cp), col = "red", lty = 2)

#BIC
plot(reg_summary$bic, xlab = "Number of Predictors", ylab = "BIC", 
     pch = 19, type = "b")
best_bic <- which.min(reg_summary$bic)
points(best_bic, reg_summary$bic[best_bic], col = "red", cex = 2, pch = 20)
abline(h = min(reg_summary$bic), col = "red", lty = 2)

#Adjusted R^2
plot(reg_summary$adjr2, xlab = "Number of Predictors", ylab = "Adjusted R^2",
     pch = 19, type = "b")
best_adjr2 <- which.max(reg_summary$adjr2)
points(best_adjr2, reg_summary$adjr2[best_adjr2], col = "red", cex = 2, pch = 20)
abline(h = max(reg_summary$adjr2), col = "red", lty = 2)

#Report coefficients for the 3-variable model
coef_3var <- coef(regfit, id = 3)
coef_3var
## (Intercept)           X          X2          X5 
##  1.07219472  2.44514720 -3.15676236  0.09022577

d.)

# Forward stepwise
regfit_fwd <- regsubsets(
  Y ~ X + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 + X10,
  data = poly_data,
  nvmax = 10,
  method = "forward"
)

# Backward stepwise
regfit_bwd <- regsubsets(
  Y ~ X + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 + X10,
  data = poly_data,
  nvmax = 10,
  method = "backward"
)

summary_fwd <- summary(regfit_fwd)
summary_bwd <- summary(regfit_bwd)

summary_fwd$cp
##  [1] 917.497515  63.788757  -1.340110   0.354922   2.124432   4.041147
##  [7]   6.009102   7.257998   9.065416  11.000000
summary_bwd$cp
##  [1] 917.497515  63.788757  -1.340110   0.417214   2.182401   3.853556
##  [7]   5.264784   7.257998   9.065416  11.000000
#Comparing Summaries

#Forward stepwise:
which.min(summary_fwd$cp)
## [1] 3
which.min(summary_fwd$bic)
## [1] 3
which.max(summary_fwd$adjr2)
## [1] 3
#Backward stepwise:
which.min(summary_bwd$cp)
## [1] 3
which.min(summary_bwd$bic)
## [1] 3
which.max(summary_bwd$adjr2)
## [1] 3
#Coefficients 

#Best model size is 3 for forward stepwise
coef_fwd_3 <- coef(regfit_fwd, id = 3)

#Best model size is 3 for backward stepwise
coef_bwd_3 <- coef(regfit_bwd, id = 3)

coef_fwd_3
## (Intercept)           X          X2          X5 
##  1.07219472  2.44514720 -3.15676236  0.09022577
coef_bwd_3
## (Intercept)           X          X2          X5 
##  1.07219472  2.44514720 -3.15676236  0.09022577

Interpretation: Both the forward and backward stepwise searches confirm the answer from part (c): every criterion again selects a 3-variable model, and in each case the chosen terms are X, X2, and X5 with exactly the same coefficient estimates as best subset selection.
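
As a sanity check that the two searches really agree on the same three terms, the logical selection matrix returned by summary() can be compared directly; a small sketch using the fitted objects above:

#Variables included in each 3-predictor model
summary_fwd$which[3, ]
summary_bwd$which[3, ]
#Coefficient estimates also match
all.equal(coef_fwd_3, coef_bwd_3)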

e.)

library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-8
X_matrix <- model.matrix(
  Y ~ X + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 + X10,
  data = poly_data
)[, -1]  # remove the intercept column

#10-fold cross-validation for Lasso 
cv_lasso <- cv.glmnet(X_matrix, poly_data$Y, alpha = 1)

best_lambda <- cv_lasso$lambda.min  #minimizes cross-validation error

lasso_coef <- coef(cv_lasso, s = best_lambda)
lasso_coef
## 11 x 1 sparse Matrix of class "dgCMatrix"
##                      s1
## (Intercept)  1.04330762
## X            2.28855463
## X2          -3.11000997
## X3           0.13706910
## X4           .         
## X5           0.06479831
## X6           .         
## X7           .         
## X8           .         
## X9           .         
## X10          .

Interpretation: The lasso broadly agrees with the previous parts: it keeps only the low-order terms X, X2, X3, and X5 and sets every remaining coefficient to exactly zero. One notable difference is that the retained coefficients are shrunk slightly toward zero relative to the best subset and stepwise estimates.
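
To make the shrinkage visible, the best-subset and lasso estimates can be lined up term by term; a rough sketch (terms that the 3-variable subset model omits show up as NA):

#Side-by-side comparison of the two sets of estimates
cmp <- data.frame(
  term        = rownames(lasso_coef),
  lasso       = as.numeric(as.matrix(lasso_coef)),
  best_subset = unname(coef_3var[rownames(lasso_coef)])
)
cmp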

f.)

#Generate X and noise under the new model
set.seed(123)        
n <- 100             # sample size
X <- rnorm(n)        # predictor
eps <- rnorm(n)      # random noise

beta0 <- 1.5
beta7 <- 2.0  

Y <- beta0 + beta7 * (X^7) + eps

poly_data <- data.frame(
  Y   = Y,
  X   = X,
  X2  = X^2,
  X3  = X^3,
  X4  = X^4,
  X5  = X^5,
  X6  = X^6,
  X7  = X^7,
  X8  = X^8,
  X9  = X^9,
  X10 = X^10
)

#Best Subset Selection
library(leaps)
regfit_full <- regsubsets(
  Y ~ X + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 + X10,
  data = poly_data,
  nvmax = 10
)
reg_summary <- summary(regfit_full)

which.min(reg_summary$cp)      
## [1] 1
which.min(reg_summary$bic)     
## [1] 1
which.max(reg_summary$adjr2)  
## [1] 6
coef(regfit_full, id = 1)  # two of the three criteria indicate a 1-variable model
## (Intercept)          X7 
##    1.393251    1.999704
#and the selected 1-variable model uses X7, the true signal term
#Stepwise Selections

# Forward
regfit_fwd <- regsubsets(
  Y ~ X + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 + X10, 
  data = poly_data, 
  nvmax = 10,
  method = "forward"
)
summary_fwd <- summary(regfit_fwd)

# Backward
regfit_bwd <- regsubsets(
  Y ~ X + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 + X10, 
  data = poly_data,
  nvmax = 10,
  method = "backward"
)
summary_bwd <- summary(regfit_bwd)

summary_fwd$cp
##  [1] -0.4892301  0.3551574  1.0565357  2.9239183  4.4007817  6.1236555
##  [7]  6.3437996  7.1581102  9.1290867 11.0000000
summary_fwd$bic
##  [1] -929.1265 -925.7386 -922.5193 -918.0568 -914.0162 -909.7114 -907.0571
##  [8] -903.7731 -899.2005 -894.7402
summary_fwd$adjr2
##  [1] 0.9999150 0.9999152 0.9999155 0.9999147 0.9999143 0.9999136 0.9999144
##  [8] 0.9999146 0.9999137 0.9999128
summary_bwd$cp
##  [1] -0.4892301  1.1294727  1.1051856  2.2565990  3.5665229  3.7694845
##  [7]  5.7157631  7.1623173  9.0222992 11.0000000
summary_bwd$bic
##  [1] -929.1265 -924.9213 -922.4671 -918.7775 -914.9231 -912.3000 -907.7547
##  [8] -903.7684 -899.3203 -894.7402
summary_bwd$adjr2
##  [1] 0.9999150 0.9999145 0.9999154 0.9999153 0.9999151 0.9999158 0.9999150
##  [8] 0.9999146 0.9999138 0.9999128
#Forward stepwise:
which.min(summary_fwd$cp)
## [1] 1
which.min(summary_fwd$bic)
## [1] 1
which.max(summary_fwd$adjr2)
## [1] 3
#Backward stepwise:
which.min(summary_bwd$cp)
## [1] 1
which.min(summary_bwd$bic)
## [1] 1
which.max(summary_bwd$adjr2)
## [1] 6

Interpretation:

It can be observed that adjusted R^2 tends to favor a larger model (6 predictors for best subset and backward stepwise, 3 for forward), while Cp and BIC both choose the 1-variable model. This is because adjusted R^2 penalizes added predictors more gently than Cp or BIC: it rises whenever a new predictor lowers RSS/(n - d - 1), so explaining even a small amount of extra variance is enough to nudge it upward. This is not out of the ordinary; adjusted R^2 is the most generous of the three criteria toward adding variables.
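
The different penalties can be made concrete; a small illustration of the formulas (not part of the original output), using n = 100:

#Adjusted R^2 = 1 - (1 - R^2) * (n - 1) / (n - d - 1); its penalty factor
#grows only slowly as the model size d increases
round((100 - 1) / (100 - (1:10) - 1), 3)
#BIC, by contrast, charges roughly log(n) per extra predictor (vs. 2 for Cp)
log(100)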

#Lasso
X_matrix <- model.matrix(
  Y ~ X + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 + X10,
  data = poly_data
)[, -1]  # drop the intercept column

cv_lasso <- cv.glmnet(X_matrix, poly_data$Y, alpha = 1)
best_lambda <- cv_lasso$lambda.min
lasso_coef <- coef(cv_lasso, s = best_lambda)
lasso_coef
## 11 x 1 sparse Matrix of class "dgCMatrix"
##                   s1
## (Intercept) 1.550655
## X           .       
## X2          .       
## X3          .       
## X4          .       
## X5          .       
## X6          .       
## X7          1.941412
## X8          .       
## X9          .       
## X10         .

Interpretation: It is now clear that the generated data truly hinge on X7. The lasso drives every other coefficient to exactly zero while keeping X7 close to its true value of 2, so the analysis as a whole identifies X7 as the single relevant predictor.
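
For completeness, the nonzero terms can be pulled out by name rather than read off the printout; a small sketch using the lasso_coef object above:

#Names of the nonzero lasso coefficients (here just the intercept and X7)
lasso_dense <- as.matrix(lasso_coef)
rownames(lasso_dense)[lasso_dense != 0]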

9.)

a.)

#Split into Test Set and Training Set
library(ISLR2) 

# For reproducibility
set.seed(1)

train_idx <- sample(1:nrow(College), nrow(College)*0.7)
train_data <- College[train_idx, ]
test_data  <- College[-train_idx, ]

b.)

#Model
lm_fit <- lm(Apps ~ ., data=train_data)
summary(lm_fit)
## 
## Call:
## lm(formula = Apps ~ ., data = train_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5816.1  -451.6    -1.0   327.2  7445.9 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -5.377e+02  5.076e+02  -1.059  0.28995    
## PrivateYes  -5.045e+02  1.648e+02  -3.061  0.00232 ** 
## Accept       1.722e+00  4.763e-02  36.159  < 2e-16 ***
## Enroll      -1.055e+00  2.437e-01  -4.329 1.80e-05 ***
## Top10perc    5.358e+01  6.440e+00   8.320 7.64e-16 ***
## Top25perc   -1.614e+01  5.092e+00  -3.170  0.00161 ** 
## F.Undergrad  2.970e-02  4.399e-02   0.675  0.49976    
## P.Undergrad  7.162e-02  3.649e-02   1.963  0.05019 .  
## Outstate    -8.841e-02  2.176e-02  -4.064 5.57e-05 ***
## Room.Board   1.630e-01  5.577e-02   2.923  0.00362 ** 
## Books        2.727e-01  2.723e-01   1.001  0.31715    
## Personal    -7.316e-03  7.283e-02  -0.100  0.92002    
## PhD         -9.676e+00  5.360e+00  -1.805  0.07161 .  
## Terminal    -3.781e-01  6.015e+00  -0.063  0.94990    
## S.F.Ratio    1.627e+01  1.608e+01   1.012  0.31214    
## perc.alumni  2.358e+00  4.853e+00   0.486  0.62722    
## Expend       5.986e-02  1.337e-02   4.476 9.34e-06 ***
## Grad.Rate    7.158e+00  3.520e+00   2.034  0.04248 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1022 on 525 degrees of freedom
## Multiple R-squared:  0.9332, Adjusted R-squared:  0.9311 
## F-statistic: 431.6 on 17 and 525 DF,  p-value: < 2.2e-16
#Test Error
pred_lm <- predict(lm_fit, test_data)
mse_lm <- mean((test_data$Apps - pred_lm)^2)
mse_lm
## [1] 1261630

c.)

# Build an x matrix and a y vector
x_train <- model.matrix(Apps ~ ., data=train_data)[, -1] 
y_train <- train_data$Apps

x_test <- model.matrix(Apps ~ ., data=test_data)[, -1]
y_test <- test_data$Apps

# 10-fold CV to pick lambda
cv_ridge <- cv.glmnet(x_train, y_train, alpha=0) 
best_lambda_ridge <- cv_ridge$lambda.min

# Final ridge model
ridge_fit <- glmnet(x_train, y_train, alpha=0, lambda=best_lambda_ridge)

# Test error
pred_ridge <- predict(ridge_fit, s=best_lambda_ridge, newx=x_test)
mse_ridge <- mean((y_test - pred_ridge)^2)
mse_ridge
## [1] 1121808

d.)

cv_lasso <- cv.glmnet(x_train, y_train, alpha=1)  # alpha=1 => Lasso
best_lambda_lasso <- cv_lasso$lambda.min

lasso_fit <- glmnet(x_train, y_train, alpha=1, lambda=best_lambda_lasso)

# Test error
pred_lasso <- predict(lasso_fit, s=best_lambda_lasso, newx=x_test)
mse_lasso <- mean((y_test - pred_lasso)^2)
mse_lasso
## [1] 1254248
# Number of non-zero coefficients (the count includes the intercept)
lasso_coef <- coef(lasso_fit, s=best_lambda_lasso)
nonzero_count <- sum(lasso_coef != 0)
nonzero_count
## [1] 18

e.) The cross-validated RMSEP is smallest at M = 17; the test error, computed with the M chosen in the code below, is reported at the bottom of this part.

library(pls)
## 
## Attaching package: 'pls'
## The following object is masked from 'package:stats':
## 
##     loadings
pcr_fit <- pcr(Apps ~ ., data=train_data, scale=TRUE, validation="CV")  
# With validation = "CV", pcr() cross-validates every model size from 0 up to the full number of components (17 here)

summary(pcr_fit)  
## Data:    X dimension: 543 17 
##  Y dimension: 543 1
## Fit method: svdpc
## Number of components considered: 17
## 
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
##        (Intercept)  1 comps  2 comps  3 comps  4 comps  5 comps  6 comps
## CV            3895     3793     2133     2152     1835     1705     1719
## adjCV         3895     3794     2129     2153     1808     1695     1712
##        7 comps  8 comps  9 comps  10 comps  11 comps  12 comps  13 comps
## CV        1719     1670     1646      1639      1642      1651      1646
## adjCV     1713     1658     1639      1632      1635      1643      1639
##        14 comps  15 comps  16 comps  17 comps
## CV         1647      1621      1216      1197
## adjCV      1640      1603      1205      1185
## 
## TRAINING: % variance explained
##       1 comps  2 comps  3 comps  4 comps  5 comps  6 comps  7 comps  8 comps
## X      32.051    57.00    64.42    70.27    75.65    80.65    84.26    87.61
## Apps    5.788    71.69    71.70    80.97    82.60    82.60    82.69    84.06
##       9 comps  10 comps  11 comps  12 comps  13 comps  14 comps  15 comps
## X       90.58     92.84     94.93     96.74     97.82     98.72     99.39
## Apps    84.55     84.82     84.86     84.86     85.01     85.05     89.81
##       16 comps  17 comps
## X        99.85    100.00
## Apps     93.03     93.32
# Choose M from the validation results (summary or validationplot())
M_best <- 10  # placeholder value; the CV RMSEP above is actually smallest at M = 17

# Refit PCR with the chosen number of components
pcr_fit_best <- pcr(Apps ~ ., data=train_data, scale=TRUE, ncomp=M_best)

# Test error
pred_pcr <- predict(pcr_fit_best, test_data, ncomp=M_best)
mse_pcr <- mean((test_data$Apps - pred_pcr)^2)
mse_pcr
## [1] 1837203
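
Rather than reading M off the printed summary, the CV-minimizing number of components could also be extracted programmatically; a sketch, assuming the RMSEP() accessor in pls stores the cross-validated values in its $val array:

#Cross-validated RMSEP for 0 through 17 components
cv_rmsep <- RMSEP(pcr_fit)$val["CV", 1, ]
#Subtract 1 so the intercept-only entry counts as 0 components; here this gives 17
which.min(cv_rmsep) - 1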

f.) The cross-validated RMSEP levels off after roughly 9-10 components and reaches its minimum (1155) between M = 15 and M = 17; the test error, computed with the M chosen below, is reported at the bottom of this part.

pls_fit <- plsr(Apps ~ ., data=train_data, scale=TRUE, validation="CV")
summary(pls_fit)
## Data:    X dimension: 543 17 
##  Y dimension: 543 1
## Fit method: kernelpls
## Number of components considered: 17
## 
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
##        (Intercept)  1 comps  2 comps  3 comps  4 comps  5 comps  6 comps
## CV            3895     1955     1746     1541     1471     1266     1200
## adjCV         3895     1949     1747     1534     1448     1242     1188
##        7 comps  8 comps  9 comps  10 comps  11 comps  12 comps  13 comps
## CV        1177     1164     1158      1161      1157      1157      1157
## adjCV     1167     1155     1149      1151      1147      1148      1148
##        14 comps  15 comps  16 comps  17 comps
## CV         1156      1155      1155      1155
## adjCV      1147      1146      1146      1146
## 
## TRAINING: % variance explained
##       1 comps  2 comps  3 comps  4 comps  5 comps  6 comps  7 comps  8 comps
## X       25.68    47.43    62.46    64.88    67.34    72.68    77.20    80.92
## Apps    76.62    82.39    86.93    90.76    92.82    93.05    93.13    93.20
##       9 comps  10 comps  11 comps  12 comps  13 comps  14 comps  15 comps
## X       82.69     85.16     87.35     90.73     92.49     95.10     97.09
## Apps    93.26     93.28     93.30     93.31     93.32     93.32     93.32
##       16 comps  17 comps
## X        98.40    100.00
## Apps     93.32     93.32
# Similar to PCR, we examine the cross-validation results to find the best M

M_best_pls <- 10  # placeholder; the CV RMSEP above is essentially flat from about 9-10 components on
pls_fit_best <- plsr(Apps ~ ., data=train_data, scale=TRUE, ncomp=M_best_pls)

pred_pls <- predict(pls_fit_best, test_data, ncomp=M_best_pls)
mse_pls <- mean((test_data$Apps - pred_pls)^2)
mse_pls
## [1] 1279353

g.)

mse_data <- data.frame(
  Model  = c("Linear", "Ridge", "Lasso", "PCR", "PLS"),
  TestMSE = c(mse_lm, mse_ridge, mse_lasso, mse_pcr, mse_pls)
)
mse_data
##    Model TestMSE
## 1 Linear 1261630
## 2  Ridge 1121808
## 3  Lasso 1254248
## 4    PCR 1837203
## 5    PLS 1279353

It can be clearly seen that ridge has the best (lowest) test error. PCR stands out with the largest test error by far, roughly 64% higher than ridge, though note it was refit with only M = 10 components while its CV error kept improving up to M = 17. The remaining three models cluster within about 12-14% of ridge. With respect to minimizing test MSE, ridge is the best choice here.
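
To quantify those gaps, each model's test MSE can be expressed relative to the best one; a small sketch using the mse_data table above:

#Test MSE of each model relative to ridge (the minimum), sorted from best to worst
mse_data$Relative <- round(mse_data$TestMSE / min(mse_data$TestMSE), 3)
mse_data[order(mse_data$TestMSE), ]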