library(leaps)
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-8
library(ISLR2)
library(pls)
## 
## Attaching package: 'pls'
## The following object is masked from 'package:stats':
## 
##     loadings

8. In this exercise, we will generate simulated data, and will then use this data to perform best subset selection.

(a) Use the rnorm() function to generate a predictor X of length n = 100, as well as a noise vector ϵ of length n = 100.

set.seed(96)
X <- rnorm(100, sd = 5)    ## let's use sd = 5
eps <- rnorm(100)          ## the noise vector ϵ (ASCII name for portability)

(b) Generate a response vector Y of length n = 100 according to the model \(Y = β_0 + β_1X + β_2X^2 + β_3X^3 + ϵ\), where \(β_0\), \(β_1\), \(β_2\), and \(β_3\) are constants of your choice.

  • Let's use \(Y = 1 + 2X + 3X^2 + 4X^3 + ϵ\)
Y <- 1 + 2*X + 3*X^2 + 4*X^3 + eps

(c) Use the regsubsets() function to perform best subset selection in order to choose the best model containing the predictors \(X, X^2, \ldots, X^{10}\). What is the best model obtained according to \(C_p\), BIC, and adjusted \(R^2\)? Show some plots to provide evidence for your answer, and report the coefficients of the best model obtained. Note you will need to use the data.frame() function to create a single data set containing both X and Y.

X2 <- X^2
X3 <- X^3
X4 <- X^4
X5 <- X^5
X6 <- X^6
X7 <- X^7
X8 <- X^8
X9 <- X^9
X10 <- X^10

data <- data.frame(Y,X,X2,X3,X4,X5,X6,X7,X8,X9,X10)
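# An equivalent, more compact construction (a sketch, left commented out so the
# column names above are kept; poly() with raw = TRUE returns the same power terms):
# data <- data.frame(Y, poly(X, 10, raw = TRUE))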
regfit.full <- regsubsets(Y ~ ., data = data, nvmax = 10)
reg.summary <- summary(regfit.full)
reg.summary
## Subset selection object
## Call: regsubsets.formula(Y ~ ., data = data, nvmax = 10)
## 10 Variables  (and intercept)
##     Forced in Forced out
## X       FALSE      FALSE
## X2      FALSE      FALSE
## X3      FALSE      FALSE
## X4      FALSE      FALSE
## X5      FALSE      FALSE
## X6      FALSE      FALSE
## X7      FALSE      FALSE
## X8      FALSE      FALSE
## X9      FALSE      FALSE
## X10     FALSE      FALSE
## 1 subsets of each size up to 10
## Selection Algorithm: exhaustive
##           X   X2  X3  X4  X5  X6  X7  X8  X9  X10
## 1  ( 1 )  " " " " "*" " " " " " " " " " " " " " "
## 2  ( 1 )  " " "*" "*" " " " " " " " " " " " " " "
## 3  ( 1 )  "*" "*" "*" " " " " " " " " " " " " " "
## 4  ( 1 )  "*" "*" "*" " " " " " " " " " " "*" " "
## 5  ( 1 )  "*" "*" "*" " " " " "*" " " "*" " " " "
## 6  ( 1 )  "*" "*" "*" " " " " "*" " " "*" " " "*"
## 7  ( 1 )  "*" "*" "*" "*" "*" " " "*" " " "*" " "
## 8  ( 1 )  "*" "*" "*" "*" "*" " " "*" "*" "*" " "
## 9  ( 1 )  "*" "*" "*" " " "*" "*" "*" "*" "*" "*"
## 10  ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"
par(mfrow = c(1, 3))
plot(reg.summary$cp, xlab = "Number of Predictors", ylab = "Cp", type = "l")
points(which.min(reg.summary$cp), reg.summary$cp[which.min(reg.summary$cp)], col = "red", cex = 2, pch = 20)
plot(reg.summary$bic, xlab = "Number of Predictors", ylab = "BIC", type = "l")
points(which.min(reg.summary$bic), reg.summary$bic[which.min(reg.summary$bic)], col = "red", cex = 2, pch = 20)
plot(reg.summary$adjr2, xlab = "Number of Predictors", ylab = "Adjusted R2", type = "l")
points(which.max(reg.summary$adjr2), reg.summary$adjr2[which.max(reg.summary$adjr2)], col = "red", cex = 2, pch = 20)

  • The model with 3 predictors is the best model according to all three criteria: \(C_p\), BIC, and adjusted \(R^2\).
  • The best 3-predictor subset consists of X, X2, and X3, which are indeed the true predictors of the model.
coef(regfit.full, which.min(reg.summary$cp))
## (Intercept)           X          X2          X3 
##   0.8296789   1.9036658   3.0029415   4.0009233
coef(regfit.full, which.min(reg.summary$bic))
## (Intercept)           X          X2          X3 
##   0.8296789   1.9036658   3.0029415   4.0009233
coef(regfit.full, which.max(reg.summary$adjr2))
## (Intercept)           X          X2          X3 
##   0.8296789   1.9036658   3.0029415   4.0009233
  • Clearly, the coefficients obtained from the model are also very close to the true coefficients.
  • Therefore, the fit is nearly perfect; the small differences are acceptable, as they reflect the irreducible error ϵ present in the true expression for Y.
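As an aside, leaps also provides a plot method for regsubsets objects that shows directly which variables enter the selected model at each criterion value, complementing the curves above (a sketch; scale may also be "Cp" or "adjr2"):

plot(regfit.full, scale = "bic")  ## shaded squares mark the variables included at each BIC value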

(d) Repeat (c), using forward stepwise selection and also using backwards stepwise selection. How does your answer compare to the results in (c)?

Forward selection using regsubsets method

regfit.fwd <- regsubsets(Y ~ ., data = data, nvmax = 10, method = "forward")
reg.summary.fwd <- summary(regfit.fwd)
reg.summary.fwd
## Subset selection object
## Call: regsubsets.formula(Y ~ ., data = data, nvmax = 10, method = "forward")
## 10 Variables  (and intercept)
##     Forced in Forced out
## X       FALSE      FALSE
## X2      FALSE      FALSE
## X3      FALSE      FALSE
## X4      FALSE      FALSE
## X5      FALSE      FALSE
## X6      FALSE      FALSE
## X7      FALSE      FALSE
## X8      FALSE      FALSE
## X9      FALSE      FALSE
## X10     FALSE      FALSE
## 1 subsets of each size up to 10
## Selection Algorithm: forward
##           X   X2  X3  X4  X5  X6  X7  X8  X9  X10
## 1  ( 1 )  " " " " "*" " " " " " " " " " " " " " "
## 2  ( 1 )  " " "*" "*" " " " " " " " " " " " " " "
## 3  ( 1 )  "*" "*" "*" " " " " " " " " " " " " " "
## 4  ( 1 )  "*" "*" "*" " " " " " " " " " " "*" " "
## 5  ( 1 )  "*" "*" "*" "*" " " " " " " " " "*" " "
## 6  ( 1 )  "*" "*" "*" "*" " " " " " " " " "*" "*"
## 7  ( 1 )  "*" "*" "*" "*" "*" " " " " " " "*" "*"
## 8  ( 1 )  "*" "*" "*" "*" "*" " " "*" " " "*" "*"
## 9  ( 1 )  "*" "*" "*" "*" "*" " " "*" "*" "*" "*"
## 10  ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"
par(mfrow = c(1, 3))
plot(reg.summary.fwd$cp, xlab = "Number of Predictors using forward selection", ylab = "Cp", type = "l")
points(which.min(reg.summary.fwd$cp), reg.summary.fwd$cp[which.min(reg.summary.fwd$cp)], col = "red", cex = 2, pch = 20)
plot(reg.summary.fwd$bic, xlab = "Number of Predictors using forward selection", ylab = "BIC", type = "l")
points(which.min(reg.summary.fwd$bic), reg.summary.fwd$bic[which.min(reg.summary.fwd$bic)], col = "red", cex = 2, pch = 20)
plot(reg.summary.fwd$adjr2, xlab = "Number of Predictors using forward selection", ylab = "Adjusted R2", type = "l")
points(which.max(reg.summary.fwd$adjr2), reg.summary.fwd$adjr2[which.max(reg.summary.fwd$adjr2)], col = "red", cex = 2, pch = 20)

  • Still the same result: the model with 3 predictors is the best, and these predictors are X, X2, and X3.
  • These predictors are the true predictors of Y.

coef(regfit.fwd, which.min(reg.summary.fwd$cp)) ## i.e., the model with 3 predictors
## (Intercept)           X          X2          X3 
##   0.8296789   1.9036658   3.0029415   4.0009233
  • The same coefficients are obtained (no difference from the exhaustive method).

Backward selection

regfit.bwd <- regsubsets(Y ~ ., data = data, nvmax = 10, method = "backward")
reg.summary.bwd <- summary(regfit.bwd)
reg.summary.bwd
## Subset selection object
## Call: regsubsets.formula(Y ~ ., data = data, nvmax = 10, method = "backward")
## 10 Variables  (and intercept)
##     Forced in Forced out
## X       FALSE      FALSE
## X2      FALSE      FALSE
## X3      FALSE      FALSE
## X4      FALSE      FALSE
## X5      FALSE      FALSE
## X6      FALSE      FALSE
## X7      FALSE      FALSE
## X8      FALSE      FALSE
## X9      FALSE      FALSE
## X10     FALSE      FALSE
## 1 subsets of each size up to 10
## Selection Algorithm: backward
##           X   X2  X3  X4  X5  X6  X7  X8  X9  X10
## 1  ( 1 )  " " " " "*" " " " " " " " " " " " " " "
## 2  ( 1 )  " " "*" "*" " " " " " " " " " " " " " "
## 3  ( 1 )  "*" "*" "*" " " " " " " " " " " " " " "
## 4  ( 1 )  "*" "*" "*" " " " " " " " " "*" " " " "
## 5  ( 1 )  "*" "*" "*" " " " " "*" " " "*" " " " "
## 6  ( 1 )  "*" "*" "*" " " "*" "*" " " "*" " " " "
## 7  ( 1 )  "*" "*" "*" " " "*" "*" "*" "*" " " " "
## 8  ( 1 )  "*" "*" "*" " " "*" "*" "*" "*" "*" " "
## 9  ( 1 )  "*" "*" "*" " " "*" "*" "*" "*" "*" "*"
## 10  ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"
par(mfrow = c(1, 3))
plot(reg.summary.bwd$cp, xlab = "Number of Predictors using backward selection", ylab = "Cp", type = "l")
points(which.min(reg.summary.bwd$cp), reg.summary.bwd$cp[which.min(reg.summary.bwd$cp)], col = "red", cex = 2, pch = 20)
plot(reg.summary.bwd$bic, xlab = "Number of Predictors using backward selection", ylab = "BIC", type = "l")
points(which.min(reg.summary.bwd$bic), reg.summary.bwd$bic[which.min(reg.summary.bwd$bic)], col = "red", cex = 2, pch = 20)
plot(reg.summary.bwd$adjr2, xlab = "Number of Predictors using backward selection", ylab = "Adjusted R2", type = "l")
points(which.max(reg.summary.bwd$adjr2), reg.summary.bwd$adjr2[which.max(reg.summary.bwd$adjr2)], col = "red", cex = 2, pch = 20)

  • The same result as the forward selection and exhaustive search methods.

coef(regfit.bwd, which.min(reg.summary.bwd$cp))
## (Intercept)           X          X2          X3 
##   0.8296789   1.9036658   3.0029415   4.0009233
  • This observation further confirms that the results are the same across all three methods.

(e) Now fit a lasso model to the simulated data, again using \(X, X^2, \ldots, X^{10}\) as predictors. Use cross-validation to select the optimal value of λ. Create plots of the cross-validation error as a function of λ. Report the resulting coefficient estimates, and discuss the results obtained.

x <- model.matrix(Y ~ .-1 , data = data)
y <- data$Y
cv.lasso <- cv.glmnet(x, y, alpha = 1)
plot(cv.lasso)

best.lambda <- cv.lasso$lambda.min
lasso.coef <- predict(cv.lasso, s = best.lambda, type = "coefficients")
lasso.coef
## 11 x 1 sparse Matrix of class "dgCMatrix"
##                     s1
## (Intercept) 39.9901000
## X            0.9139107
## X2           1.7133365
## X3           3.9153370
## X4           .        
## X5           .        
## X6           .        
## X7           .        
## X8           .        
## X9           .        
## X10          .
  • Even with the lasso, we arrive at the same conclusion: use 3 predictors.
  • And these 3 predictors are the true predictors.
  • However, the estimated coefficients are not as close to the true coefficients (except for X3), because the lasso penalty shrinks coefficients toward zero.
  • This suggests that the lasso recovers the right predictors, but its coefficient estimates are biased by shrinkage and may not be precise.
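For a more conservative model, we could also report coefficients at lambda.1se, the largest λ whose CV error is within one standard error of the minimum (a sketch; output not shown, typically an even sparser fit):

coef(cv.lasso, s = "lambda.1se")  ## coefficients at the one-standard-error λ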

(f) Now generate a response vector Y according to the model \(Y = β_0 + β_7X^7 + ϵ\), and perform best subset selection and the lasso. Discuss the results obtained.

Y_new <- 1 + 4*X^7 + eps  # new response: β0 = 1, β7 = 4
data_new <- data  # Keep the same predictors
data_new$Y <- Y_new  # Update response
best_subset_new <- regsubsets(Y ~ ., data = data_new, nvmax = 10)
best_subset_new.summary <- summary(best_subset_new)
best_subset_new.summary
## Subset selection object
## Call: regsubsets.formula(Y ~ ., data = data_new, nvmax = 10)
## 10 Variables  (and intercept)
##     Forced in Forced out
## X       FALSE      FALSE
## X2      FALSE      FALSE
## X3      FALSE      FALSE
## X4      FALSE      FALSE
## X5      FALSE      FALSE
## X6      FALSE      FALSE
## X7      FALSE      FALSE
## X8      FALSE      FALSE
## X9      FALSE      FALSE
## X10     FALSE      FALSE
## 1 subsets of each size up to 10
## Selection Algorithm: exhaustive
##           X   X2  X3  X4  X5  X6  X7  X8  X9  X10
## 1  ( 1 )  " " " " " " " " " " " " "*" " " " " " "
## 2  ( 1 )  "*" " " " " " " " " " " "*" " " " " " "
## 3  ( 1 )  "*" " " "*" " " " " " " "*" " " " " " "
## 4  ( 1 )  "*" " " "*" " " " " " " "*" " " "*" " "
## 5  ( 1 )  "*" " " "*" " " "*" " " "*" " " "*" " "
## 6  ( 1 )  "*" "*" "*" " " " " "*" "*" "*" " " " "
## 7  ( 1 )  "*" "*" "*" "*" "*" " " "*" " " "*" " "
## 8  ( 1 )  "*" "*" "*" "*" "*" " " "*" "*" "*" " "
## 9  ( 1 )  "*" " " "*" "*" "*" "*" "*" "*" "*" "*"
## 10  ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"
par(mfrow = c(1, 3))
plot(best_subset_new.summary$cp, xlab = "Number of Predictors", ylab = "Cp", type = "l")
points(which.min(best_subset_new.summary$cp), best_subset_new.summary$cp[which.min(best_subset_new.summary$cp)], col = "red", cex = 2, pch = 20)
plot(best_subset_new.summary$bic, xlab = "Number of Predictors", ylab = "BIC", type = "l")
points(which.min(best_subset_new.summary$bic), best_subset_new.summary$bic[which.min(best_subset_new.summary$bic)], col = "red", cex = 2, pch = 20)
plot(best_subset_new.summary$adjr2, xlab = "Number of Predictors", ylab = "Adjusted R2", type = "l")
points(which.max(best_subset_new.summary$adjr2), best_subset_new.summary$adjr2[which.max(best_subset_new.summary$adjr2)], col = "red", cex = 2, pch = 20)

coef(best_subset_new, which.min(best_subset_new.summary$cp))
##  (Intercept)            X           X3           X7 
##  0.891177761 -0.121411481  0.001658063  3.999999980
coef(best_subset_new, which.min(best_subset_new.summary$bic))
## (Intercept)          X7 
##   0.8722273   4.0000000
coef(best_subset_new, which.max(best_subset_new.summary$adjr2))  ## which.max: adjusted R2 is maximized, not minimized
## (Intercept)          X7 
##   0.8722273   4.0000000
  • There is a small discrepancy here: according to BIC the model with 1 predictor is the best, while according to \(C_p\) the model with 3 predictors is the best.

  • According to adjusted \(R^2\), the model with 1 predictor is the best.

  • Even in the 3-predictor model favored by \(C_p\), the estimated coefficients of X and X3 are very close to 0, while that of X7 is close to 4.

  • Thus the X7 coefficient is ≈4 in both cases.

  • This indicates that the model with X7 alone can be considered the best model.

lasso_cv_new <- cv.glmnet(x, data_new$Y, alpha = 1)
plot(lasso_cv_new)  

coef(lasso_cv_new, s = lasso_cv_new$lambda.min)  
## 11 x 1 sparse Matrix of class "dgCMatrix"
##                        s1
## (Intercept) -1.220110e+05
## X            .           
## X2           .           
## X3           .           
## X4           .           
## X5           .           
## X6           .           
## X7           3.883398e+00
## X8           .           
## X9           .           
## X10          .
  • Clearly, the model with 1 predictor is the best model.

  • The single predictor selected is X7.

  • This matches the true model, and the estimated coefficient is ≈3.88 (close to 4).

  • Therefore the variable selection methods recover the true relationship, with coefficient estimates close to the true values (the small deviations are acceptable, as they account for ϵ and the lasso's shrinkage).

  • Both the lasso and best subset selection lead to a single-predictor model in X7 with estimated coefficient ≈4.
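A side note on the large lasso intercept (≈ -1.22 × 10^5) in the output above: for a Gaussian fit, glmnet recovers the intercept as the mean of y minus the sum of each fitted coefficient times its predictor's mean, and X^7 has an enormous mean on this scale, so the slight shrinkage of the X7 coefficient (≈3.883 vs. 4) is offset by a large intercept. A quick sanity check (a sketch using objects defined above):

mean(data_new$Y) - 3.883398 * mean(X^7)  ## should come out close to the reported intercept, about -1.22e5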

9. In this exercise, we will predict the number of applications received using the other variables in the College data set.

(a) Split the data set into a training set and a test set.

data(College)
set.seed(96)
train <- sample(nrow(College), nrow(College) / 2)
test <- -train
train_data <- College[train, ]
test_data <- College[test, ]

(b) Fit a linear model using least squares on the training set, and report the test error obtained.

lm_fit <- lm(Apps ~ ., data = train_data)
lm_pred <- predict(lm_fit, newdata = test_data)
lm_test_error <- mean((lm_pred - test_data$Apps)^2)
print(paste("Linear Model Test Error:", lm_test_error))
## [1] "Linear Model Test Error: 1412660.07615934"

(c) Fit a ridge regression model on the training set, with λ chosen by cross-validation. Report the test error obtained.

x_train <- model.matrix(Apps ~ ., data = train_data)[, -1]
y_train <- train_data$Apps
x_test <- model.matrix(Apps ~ ., data = test_data)[, -1]
y_test <- test_data$Apps
cv_ridge <- cv.glmnet(x_train, y_train, alpha = 0) #ridge regression
best_lambda_ridge <- cv_ridge$lambda.min
ridge_pred <- predict(cv_ridge, s = best_lambda_ridge, newx = x_test)
ridge_test_error <- mean((ridge_pred - y_test)^2)
print(paste("Ridge Regression Test Error:", ridge_test_error))
## [1] "Ridge Regression Test Error: 2201690.18667754"

(d) Fit a lasso model on the training set, with λ chosen by cross-validation. Report the test error obtained, along with the number of non-zero coefficient estimates.

cv_lasso <- cv.glmnet(x_train, y_train, alpha = 1) # lasso
best_lambda_lasso <- cv_lasso$lambda.min
lasso_pred <- predict(cv_lasso, s = best_lambda_lasso, newx = x_test)
lasso_test_error <- mean((lasso_pred - y_test)^2)
lasso_coef <- predict(cv_lasso, s = best_lambda_lasso, type = "coefficients")
non_zero_coef <- sum(lasso_coef != 0) - 1 # Subtract 1 for intercept
print(paste("Lasso Regression Test Error:", lasso_test_error))
## [1] "Lasso Regression Test Error: 1416808.49996602"
print(paste("Number of Non-Zero Coefficients:", non_zero_coef))
## [1] "Number of Non-Zero Coefficients: 16"

(e) Fit a PCR model on the training set, with M chosen by cross-validation. Report the test error obtained, along with the value of M selected by cross-validation.

pcr_fit <- pcr(Apps ~ ., data = train_data, scale = TRUE, validation = "CV")
validationplot(pcr_fit, val.type = "MSEP")

best_m_pcr <- which.min(pcr_fit$validation$PRESS)  ## PRESS holds the CV prediction error for 1..17 components, so the index equals M
pcr_pred <- predict(pcr_fit, newdata = test_data, ncomp = best_m_pcr)
pcr_test_error <- mean((pcr_pred - y_test)^2)
print(paste("PCR Test Error:", pcr_test_error))
## [1] "PCR Test Error: 1412660.07615935"
print(paste("Optimal M (PCR):", best_m_pcr))
## [1] "Optimal M (PCR): 17"

(f) Fit a PLS model on the training set, with M chosen by cross-validation. Report the test error obtained, along with the value of M selected by cross-validation.

pls_fit <- plsr(Apps ~ ., data = train_data, scale = TRUE, validation = "CV")
validationplot(pls_fit, val.type = "MSEP")

msep_adjcv <- MSEP(pls_fit)$val["adjCV", 1, ]  ## adjusted CV MSEP for 0..17 components
best_m_pls <- which.min(msep_adjcv) - 1        ## subtract 1: the first entry is the 0-component model
pls_pred <- predict(pls_fit, newdata = test_data, ncomp = best_m_pls)


pls_test_error <- mean(((pls_pred - y_test))^2)
print(paste("PLS Test Error:", pls_test_error))
## [1] "PLS Test Error: 1414330.86739303"
print(paste("Optimal M (PLS):", best_m_pls))
## [1] "Optimal M (PLS): 13"

(g) Comment on the results obtained. How accurately can we predict the number of college applications received? Is there much difference among the test errors resulting from these five approaches?

cat("\nResults Summary:\n")
## 
## Results Summary:
cat(paste("Linear Model Test Error:", lm_test_error, "\n"))
## Linear Model Test Error: 1412660.07615934
cat(paste("Ridge Regression Test Error:", ridge_test_error, "\n"))
## Ridge Regression Test Error: 2201690.18667754
cat(paste("Lasso Regression Test Error:", lasso_test_error, "\n"))
## Lasso Regression Test Error: 1416808.49996602
cat(paste("PCR Test Error:", pcr_test_error, "\n"))
## PCR Test Error: 1412660.07615935
cat(paste("PLS Test Error:", pls_test_error, "\n"))
## PLS Test Error: 1414330.86739303
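To put these errors in context before commenting, we can convert the linear model's test MSE into a test \(R^2\) (a quick sketch using objects defined above):

1 - lm_test_error / mean((y_test - mean(y_test))^2)  ## fraction of test-set variance in Apps explained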

Discussion of the results summary:

Linear Model (Least Squares): - Test Error: 1,412,660.08. This serves as the baseline for comparison.

Lasso Regression: - Test Error: 1,416,808.50. Lasso’s performance was very similar to the linear model, indicating that the feature selection it performed did not substantially impact prediction accuracy.

PCR (Principal Components Regression): - Test Error: 1,412,660.08. With an optimal M of 17 (all components), PCR is equivalent to ordinary least squares, which is why its test error is identical to the linear model's.

Ridge Regression: - Test Error: 2,201,690.19. Ridge regression exhibited the highest test error, suggesting that L2 regularization was detrimental in this case.

PLS (Partial Least Squares): - Test Error: 1,414,330.87. PLS performed similarly to the linear model and the lasso, with a slightly higher error, indicating that the latent variables constructed by PLS capture the predictive signal well.

  • We can predict the number of college applications received with reasonable accuracy. The test errors, while large in absolute terms, need to be considered in the context of the scale of the “Apps” variable. The fact that the linear model, PCR, and Lasso all have very similar errors indicates a good level of predictive power.

  • The fact that PCR’s optimal M was 17 (using all predictors) explains its identical performance to the linear model. PCR, when allowed to use all components, essentially replicates a least squares regression.

  • Lasso also performs very similarly, showing that the model is not improved by feature selection.

  • PLS performs very closely to the other models, but with a slight increase in error.

  • Ridge regression’s noticeably higher error suggests that the cross-validated λ over-shrank the coefficients. One plausible cause is that cv.glmnet’s default λ grid for ridge does not reach small enough values, so a nearly unpenalized fit is never tried; see the sketch after this list.

  • There is remarkably little difference among the test errors of the linear model, Lasso, and PCR. This suggests that the data is well-suited to a linear model and that feature selection or dimensionality reduction did not provide significant benefits.

  • The results strongly suggest that a standard linear regression model is sufficient for predicting college applications in this dataset. The added complexity of Lasso, PCR (when using all components), and PLS did not lead to substantial improvements. The poor performance of Ridge regression warrants further investigation.
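A sketch of the custom-grid check suggested above (assumes x_train, y_train, x_test, and y_test from parts (c) and (d); results not shown):

grid <- 10^seq(4, -4, length = 100)  ## extend the grid down to very small λ values
cv_ridge2 <- cv.glmnet(x_train, y_train, alpha = 0, lambda = grid)
ridge_pred2 <- predict(cv_ridge2, s = cv_ridge2$lambda.min, newx = x_test)
mean((ridge_pred2 - y_test)^2)       ## compare with the earlier ridge test error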

# Load required libraries
library(ggplot2)

# Generate data
set.seed(123)  # for reproducibility
x1 <- rnorm(100)           # predictor that actually determines y
x2 <- rnorm(100, 2, 1)     # unrelated predictor
y <- ifelse(x1 > 0, 1, 0)  # binary outcome: y = 1 exactly when x1 > 0

# Scatter plot
plot(x1, y, main = "Scatter plot of x1 vs y",
     xlab = "x1", ylab = "y", pch = 19, col = "blue")

# Fit logistic regression model on x2; y was generated from x1, so x2 carries
# no real signal (hence the non-significant coefficients below)
model <- glm(y ~ x2, family = 'binomial')

# Display summary of the model
summary(model)
## 
## Call:
## glm(formula = y ~ x2, family = "binomial")
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)
## (Intercept)  0.21780    0.44255   0.492    0.623
## x2          -0.07275    0.20832  -0.349    0.727
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 138.47  on 99  degrees of freedom
## Residual deviance: 138.35  on 98  degrees of freedom
## AIC: 142.35
## 
## Number of Fisher Scoring iterations: 3
# Plot using ggplot2 with a logistic regression curve; note geom_smooth() fits
# y ~ x1 here, not the y ~ x2 model above
ggplot(data = data.frame(x1,x2, y), aes(x = x1, y = y)) +
  geom_point(color = "blue") +  # Plot the points
  geom_smooth(method = "glm", method.args = list(family = "binomial"), color = "red") +
  labs(title = "Logistic Regression Curve", x = "x1", y = "Probability of y") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
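The two warnings arise because geom_smooth() fits y ~ x1, and x1 separates y perfectly (y = 1 exactly when x1 > 0), so the maximum-likelihood estimates diverge. Fitting that model directly reproduces the warnings (a sketch; the resulting estimates are unstable and should not be interpreted):

model_x1 <- glm(y ~ x1, family = 'binomial')  ## expect the same separation warnings
summary(model_x1)$coefficients                ## huge estimates and standard errors from separation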

# Calculate the log-odds (logits) from the fitted model
logits <- predict(model, type = "link")
# Plot log-odds vs. the predictor x2; because the model is linear on the logit
# scale, the points fall exactly on a straight line
plot(x2, logits, main = "Log-Odds vs. Predictor",
     xlab = "Predictor (x2)", ylab = "Log-Odds",
     pch = 19, col = "blue")