set.seed(123) # Set seed for reproducibility
n <- 100
X <- rnorm(n)
epsilon <- rnorm(n)
beta_0 <- 1
beta_1 <- 2
beta_2 <- 3
beta_3 <- -1
beta_4 <- 5
Y <- beta_0 + beta_1*X + beta_2*X^2 + beta_3*X^3 + beta_4*X^4 + epsilon
data <- data.frame(Y, X)
for (i in 2:10) {
data[[paste0("X", i)]] <- X^i
}
# Install 'leaps' only if it is not already available
if (!requireNamespace("leaps", quietly = TRUE)) {
  chooseCRANmirror(graphics = FALSE, ind = 1) # Select a CRAN mirror
  install.packages("leaps")
}
library(leaps)
subset_fit <- regsubsets(Y ~ ., data = data, nvmax = 10)
summary_subset <- summary(subset_fit)
# Find best model using Cp, BIC, Adjusted R2
best_cp <- which.min(summary_subset$cp)
best_bic <- which.min(summary_subset$bic)
best_adj_r2 <- which.max(summary_subset$adjr2)
# Plot model selection criteria
par(mfrow = c(1, 3))
plot(summary_subset$cp, xlab = "Number of Predictors", ylab = "Cp", type = "b", pch = 19)
points(best_cp, summary_subset$cp[best_cp], col = "red", cex = 2, pch = 4)
plot(summary_subset$bic, xlab = "Number of Predictors", ylab = "BIC", type = "b", pch = 19)
points(best_bic, summary_subset$bic[best_bic], col = "red", cex = 2, pch = 4)
plot(summary_subset$adjr2, xlab = "Number of Predictors", ylab = "Adjusted R2", type = "b", pch = 19)
points(best_adj_r2, summary_subset$adjr2[best_adj_r2], col = "red", cex = 2, pch = 4)
# Coefficients of the best model according to Cp
coef(subset_fit, best_cp)
## (Intercept) X X2 X3 X4
## 1.0530423 1.9326115 2.6650344 -0.9804582 5.0599798
subset_fit_fwd <- regsubsets(Y ~ ., data = data, nvmax = 10, method = "forward")
subset_fit_bwd <- regsubsets(Y ~ ., data = data, nvmax = 10, method = "backward")
summary_fwd <- summary(subset_fit_fwd)
summary_bwd <- summary(subset_fit_bwd)
best_fwd <- which.min(summary_fwd$bic)
best_bwd <- which.min(summary_bwd$bic)
coef(subset_fit_fwd, best_fwd)
## (Intercept) X X2 X3 X4 X7
## 1.052149936 1.887208425 2.666372973 -0.939924992 5.058452776 -0.001487585
coef(subset_fit_bwd, best_bwd)
## (Intercept) X X2 X3 X6 X8
## 0.73644476 1.87444441 5.86918758 -0.93506911 2.76754265 -0.61368406
## X10
## 0.04737384
The stepwise methods include a broader set of predictors than best subset selection, which keeps only the terms that matter most. In this case:
Forward stepwise and best subset agree on the most relevant predictors (X, X2, X3, X4); forward stepwise adds only a very small X7 term.
Backward stepwise includes predictors such as X6, X8, and X10 that are not in the best subset model. Because backward elimination can only remove one variable at a time from the full model, its search path can settle on a different set of variables than the exhaustive search, and here one farther from the true data-generating model. The short comparison below tabulates which variables each method selects.
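A quick way to see this is to tabulate the selected variables side by side. This is a minimal sketch using the fitted regsubsets objects defined above (the model size for each method is taken from BIC; sel_table and the other names are illustrative):
# Sketch: which variables does each search select (by BIC)?
sel_best <- names(coef(subset_fit, which.min(summary_subset$bic)))[-1]
sel_fwd  <- names(coef(subset_fit_fwd, best_fwd))[-1]
sel_bwd  <- names(coef(subset_fit_bwd, best_bwd))[-1]
all_vars <- paste0("X", c("", 2:10))
sel_table <- sapply(list(best = sel_best, forward = sel_fwd, backward = sel_bwd),
                    function(v) all_vars %in% v)
rownames(sel_table) <- all_vars
sel_table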
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-8
x_matrix <- as.matrix(data[,-1])
y_vector <- data$Y
lasso_fit <- cv.glmnet(x_matrix, y_vector, alpha = 1)
plot(lasso_fit)
best_lambda <- lasso_fit$lambda.min
lasso_coef <- coef(lasso_fit, s = best_lambda)
print(lasso_coef)
## 11 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 1.071616798
## X 0.786777469
## X2 2.692168850
## X3 -0.001384835
## X4 5.001472320
## X5 -0.149594768
## X6 0.005658989
## X7 .
## X8 .
## X9 .
## X10 .
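The lambda.min fit keeps a few predictors with near-zero coefficients (X3, X5, X6). A common alternative is the one-standard-error rule, which typically yields a sparser model; a minimal sketch follows (the exact coefficients depend on the cross-validation folds):
# Sketch: coefficients at the one-standard-error lambda, often sparser than lambda.min
coef(lasso_fit, s = "lambda.1se")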
Y_alt <- beta_0 + 5*X^7 + epsilon # New response generated from the X^7 term only
data_alt <- data
data_alt$Y <- Y_alt
# Best subset selection for new model
subset_fit_alt <- regsubsets(Y ~ ., data = data_alt, nvmax = 10)
summary_alt <- summary(subset_fit_alt)
coef(subset_fit_alt, which.min(summary_alt$bic))
## (Intercept) X7
## 0.8932512 4.9997045
# Lasso for new model
y_alt_vector <- data_alt$Y
lasso_fit_alt <- cv.glmnet(x_matrix, y_alt_vector, alpha = 1)
plot(lasso_fit_alt)
best_lambda_alt <- lasso_fit_alt$lambda.min
lasso_coef_alt <- coef(lasso_fit_alt, s = best_lambda_alt)
print(lasso_coef_alt)
## 11 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 1.286795
## X .
## X2 .
## X3 .
## X4 .
## X5 .
## X6 .
## X7 4.853960
## X8 .
## X9 .
## X10 .
# Load the necessary libraries
library(ISLR) # for the College dataset
library(glmnet) # for Ridge and Lasso regression
library(pls) # for PCR and PLS
##
## Attaching package: 'pls'
## The following object is masked from 'package:stats':
##
## loadings
library(caret) # for train/test splitting and cross-validation
## Loading required package: ggplot2
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:pls':
##
## R2
# Load the dataset
data("College")
set.seed(11)
# Convert 'Private' factor variable to numeric (dummy coding)
College$Private <- ifelse(College$Private == "Yes", 1, 0)
# Split the data into training and test sets (70% training, 30% test)
train_index <- sample(1:nrow(College), size = 0.7 * nrow(College)) # 70% for training
train_data <- College[train_index, ]
test_data <- College[-train_index, ]
# Define the predictors (X) and response (y)
X_train <- train_data[, -2] # Drop column 2, the response 'Apps'
y_train <- train_data$Apps
X_test <- test_data[, -2]
y_test <- test_data$Apps
# Standardize the predictors (important for ridge and lasso regression)
X_train_scaled <- scale(X_train)
X_test_scaled <- scale(X_test)
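Note that scale() above standardizes the test set with its own means and standard deviations. A common alternative is to reuse the training-set centering and scaling so both sets are on exactly the same scale; a minimal sketch is below (X_test_scaled_alt is an illustrative name, and the results reported further down use X_test_scaled as defined above):
# Sketch: standardize the test predictors with the training-set statistics
X_test_scaled_alt <- scale(X_test,
                           center = attr(X_train_scaled, "scaled:center"),
                           scale  = attr(X_train_scaled, "scaled:scale"))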
lm_model <- lm(Apps ~ ., data = train_data)
lm_pred <- predict(lm_model, newdata = test_data)
lm_mse <- mean((lm_pred - y_test)^2)
print(paste("Test Error (MSE) for Linear Regression:", lm_mse))
## [1] "Test Error (MSE) for Linear Regression: 680349.549952722"
ridge_model <- cv.glmnet(X_train_scaled, y_train, alpha = 0) # alpha = 0 for Ridge
ridge_pred <- predict(ridge_model, X_test_scaled, s = "lambda.min")
ridge_mse <- mean((ridge_pred - y_test)^2)
print(paste("Test Error (MSE) for Ridge Regression:", ridge_mse))
## [1] "Test Error (MSE) for Ridge Regression: 893531.192687124"
lasso_model <- cv.glmnet(X_train_scaled, y_train, alpha = 1) # alpha = 1 for Lasso
lasso_pred <- predict(lasso_model, X_test_scaled, s = "lambda.min")
lasso_mse <- mean((lasso_pred - y_test)^2)
lasso_nonzero_coeff <- sum(lasso_model$glmnet.fit$beta[, which.min(lasso_model$cvm)] != 0)
print(paste("Test Error (MSE) for Lasso Regression:", lasso_mse))
## [1] "Test Error (MSE) for Lasso Regression: 1430749.53217372"
print(paste("Number of non-zero coefficients in Lasso model:", lasso_nonzero_coeff))
## [1] "Number of non-zero coefficients in Lasso model: 17"
pcr_model <- pcr(Apps ~ ., data = train_data, scale = TRUE, validation = "CV")
pcr_pred <- predict(pcr_model, test_data, ncomp = pcr_model$ncomp)
pcr_mse <- mean((pcr_pred - y_test)^2)
print(paste("Test Error (MSE) for PCR:", pcr_mse))
## [1] "Test Error (MSE) for PCR: 680349.549952724"
print(paste("Number of components selected in PCR:", pcr_model$ncomp))
## [1] "Number of components selected in PCR: 17"
pls_model <- plsr(Apps ~ ., data = train_data, scale = TRUE, validation = "CV")
pls_pred <- predict(pls_model, test_data, ncomp = pls_model$ncomp)
pls_mse <- mean((pls_pred - y_test)^2)
print(paste("Test Error (MSE) for PLS:", pls_mse))
## [1] "Test Error (MSE) for PLS: 680349.549952701"
print(paste("Number of components selected in PLS:", pls_model$ncomp))
## [1] "Number of components selected in PLS: 17"
results <- data.frame(
Model = c("Linear Regression", "Ridge Regression", "Lasso Regression", "PCR", "PLS"),
Test_MSE = c(lm_mse, ridge_mse, lasso_mse, pcr_mse, pls_mse),
Components = c(NA, NA, lasso_nonzero_coeff, pcr_model$ncomp, pls_model$ncomp)
)
print(results)
## Model Test_MSE Components
## 1 Linear Regression 680349.5 NA
## 2 Ridge Regression 893531.2 NA
## 3 Lasso Regression 1430749.5 17
## 4 PCR 680349.5 17
## 5 PLS 680349.5 17
Linear Regression, PCR, and PLS all achieve essentially the same test MSE (about 680349.5). This is expected here, because the PCR and PLS predictions above use all 17 components, which reproduces the least-squares fit.
Ridge Regression has a higher MSE (893531.2) than Linear Regression, so the shrinkage introduced by the λ penalty did not improve predictive performance on this split.
Lasso Regression has the highest MSE (1430749.5). The penalty may have shrunk the coefficients too aggressively at the chosen λ, and standardizing the test predictors separately from the training predictors may also contribute; despite performing feature selection by setting some coefficients to zero, it did not reduce the test error.
How Accurately Can We Predict the Number of College Applications? Based on the Linear Regression model, the number of applications can be predicted with reasonable accuracy, though the absolute error is still substantial. The models that regularize or reduce dimensionality (Ridge, Lasso, PCR, and PLS) did not provide a meaningful improvement over plain linear regression. A rough way to put the error on a more interpretable scale is the test R², sketched below.
Is There Much Difference Among the Test Errors from the Five Approaches? The differences are not drastic: Linear Regression, PCR, and PLS are effectively tied, while Ridge and Lasso performed worse on this split, so the simpler linear model (or PCR/PLS with all components) is the better choice in this case.
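As a rough gauge of accuracy, a minimal sketch of the test R² for the linear model, using the quantities already defined above:
# Sketch: test R-squared for the linear model (fraction of variance in Apps
# explained on the held-out data)
test_r2 <- 1 - sum((lm_pred - y_test)^2) / sum((y_test - mean(y_test))^2)
test_r2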