library(leaps)
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-8
library(ISLR2)
library(pls)
##
## Attaching package: 'pls'
## The following object is masked from 'package:stats':
##
## loadings
set.seed(96)
X <- rnorm(100, sd = 5) ## let's use sd = 5
eps <- rnorm(100)       ## noise term (epsilon)
Y <- 1 + 2*X + 3*X^2 + 4*X^3 + eps
X2 <- X^2
X3 <- X^3
X4 <- X^4
X5 <- X^5
X6 <- X^6
X7 <- X^7
X8 <- X^8
X9 <- X^9
X10 <- X^10
data <- data.frame(Y,X,X2,X3,X4,X5,X6,X7,X8,X9,X10)
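As an aside, the same ten polynomial columns can be built more compactly with poly() and raw = TRUE (raw rather than orthogonal polynomials). A sketch, with Xmat and data_poly as hypothetical names:
Xmat <- poly(X, 10, raw = TRUE)              # columns X, X^2, ..., X^10
colnames(Xmat) <- c("X", paste0("X", 2:10))
data_poly <- data.frame(Y = Y, Xmat)         # same contents as the data frame built above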
regfit.full <- regsubsets(Y ~ ., data = data, nvmax = 10)
reg.summary <- summary(regfit.full)
reg.summary
## Subset selection object
## Call: regsubsets.formula(Y ~ ., data = data, nvmax = 10)
## 10 Variables (and intercept)
## Forced in Forced out
## X FALSE FALSE
## X2 FALSE FALSE
## X3 FALSE FALSE
## X4 FALSE FALSE
## X5 FALSE FALSE
## X6 FALSE FALSE
## X7 FALSE FALSE
## X8 FALSE FALSE
## X9 FALSE FALSE
## X10 FALSE FALSE
## 1 subsets of each size up to 10
## Selection Algorithm: exhaustive
## X X2 X3 X4 X5 X6 X7 X8 X9 X10
## 1 ( 1 ) " " " " "*" " " " " " " " " " " " " " "
## 2 ( 1 ) " " "*" "*" " " " " " " " " " " " " " "
## 3 ( 1 ) "*" "*" "*" " " " " " " " " " " " " " "
## 4 ( 1 ) "*" "*" "*" " " " " " " " " " " "*" " "
## 5 ( 1 ) "*" "*" "*" " " " " "*" " " "*" " " " "
## 6 ( 1 ) "*" "*" "*" " " " " "*" " " "*" " " "*"
## 7 ( 1 ) "*" "*" "*" "*" "*" " " "*" " " "*" " "
## 8 ( 1 ) "*" "*" "*" "*" "*" " " "*" "*" "*" " "
## 9 ( 1 ) "*" "*" "*" " " "*" "*" "*" "*" "*" "*"
## 10 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"
par(mfrow = c(1, 3))
plot(reg.summary$cp, xlab = "Number of Predictors", ylab = "Cp", type = "l")
points(which.min(reg.summary$cp), reg.summary$cp[which.min(reg.summary$cp)], col = "red", cex = 2, pch = 20)
plot(reg.summary$bic, xlab = "Number of Predictors", ylab = "BIC", type = "l")
points(which.min(reg.summary$bic), reg.summary$bic[which.min(reg.summary$bic)], col = "red", cex = 2, pch = 20)
plot(reg.summary$adjr2, xlab = "Number of Predictors", ylab = "Adjusted R2", type = "l")
points(which.max(reg.summary$adjr2), reg.summary$adjr2[which.max(reg.summary$adjr2)], col = "red", cex = 2, pch = 20)
coef(regfit.full, which.min(reg.summary$cp))
## (Intercept) X X2 X3
## 0.8296789 1.9036658 3.0029415 4.0009233
coef(regfit.full, which.min(reg.summary$bic))
## (Intercept) X X2 X3
## 0.8296789 1.9036658 3.0029415 4.0009233
coef(regfit.full, which.max(reg.summary$adjr2))
## (Intercept) X X2 X3
## 0.8296789 1.9036658 3.0029415 4.0009233
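As a quick check, the selected coefficients can be set next to the true data-generating values (intercept 1, slopes 2, 3, 4 for X, X^2, X^3); a small sketch:
rbind(estimated = coef(regfit.full, 3),  # the 3-variable model chosen above
      true      = c(1, 2, 3, 4))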
regfit.fwd <- regsubsets(Y ~ ., data = data, nvmax = 10, method = "forward")
reg.summary.fwd <- summary(regfit.fwd)
reg.summary.fwd
## Subset selection object
## Call: regsubsets.formula(Y ~ ., data = data, nvmax = 10, method = "forward")
## 10 Variables (and intercept)
## Forced in Forced out
## X FALSE FALSE
## X2 FALSE FALSE
## X3 FALSE FALSE
## X4 FALSE FALSE
## X5 FALSE FALSE
## X6 FALSE FALSE
## X7 FALSE FALSE
## X8 FALSE FALSE
## X9 FALSE FALSE
## X10 FALSE FALSE
## 1 subsets of each size up to 10
## Selection Algorithm: forward
## X X2 X3 X4 X5 X6 X7 X8 X9 X10
## 1 ( 1 ) " " " " "*" " " " " " " " " " " " " " "
## 2 ( 1 ) " " "*" "*" " " " " " " " " " " " " " "
## 3 ( 1 ) "*" "*" "*" " " " " " " " " " " " " " "
## 4 ( 1 ) "*" "*" "*" " " " " " " " " " " "*" " "
## 5 ( 1 ) "*" "*" "*" "*" " " " " " " " " "*" " "
## 6 ( 1 ) "*" "*" "*" "*" " " " " " " " " "*" "*"
## 7 ( 1 ) "*" "*" "*" "*" "*" " " " " " " "*" "*"
## 8 ( 1 ) "*" "*" "*" "*" "*" " " "*" " " "*" "*"
## 9 ( 1 ) "*" "*" "*" "*" "*" " " "*" "*" "*" "*"
## 10 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"
par(mfrow = c(1, 3))
plot(reg.summary.fwd$cp, xlab = "Number of Predictors using forward selection", ylab = "Cp", type = "l")
points(which.min(reg.summary.fwd$cp), reg.summary.fwd$cp[which.min(reg.summary.fwd$cp)], col = "red", cex = 2, pch = 20)
plot(reg.summary.fwd$bic, xlab = "Number of Predictors using forward selection", ylab = "BIC", type = "l")
points(which.min(reg.summary.fwd$bic), reg.summary.fwd$bic[which.min(reg.summary.fwd$bic)], col = "red", cex = 2, pch = 20)
plot(reg.summary.fwd$adjr2, xlab = "Number of Predictors using forward selection", ylab = "Adjusted R2", type = "l")
points(which.max(reg.summary.fwd$adjr2), reg.summary.fwd$adjr2[which.max(reg.summary.fwd$adjr2)], col = "red", cex = 2, pch = 20)
* Still the same result: the 3-predictor model is best, and those predictors are X, X2, and X3.
* These are the true predictors of Y.
coef(regfit.fwd, which.min(reg.summary.fwd$cp)) ## i.e., the model with 3 predictors
## (Intercept) X X2 X3
## 0.8296789 1.9036658 3.0029415 4.0009233
regfit.bwd <- regsubsets(Y ~ ., data = data, nvmax = 10, method = "backward")
reg.summary.bwd <- summary(regfit.bwd)
reg.summary.bwd
## Subset selection object
## Call: regsubsets.formula(Y ~ ., data = data, nvmax = 10, method = "backward")
## 10 Variables (and intercept)
## Forced in Forced out
## X FALSE FALSE
## X2 FALSE FALSE
## X3 FALSE FALSE
## X4 FALSE FALSE
## X5 FALSE FALSE
## X6 FALSE FALSE
## X7 FALSE FALSE
## X8 FALSE FALSE
## X9 FALSE FALSE
## X10 FALSE FALSE
## 1 subsets of each size up to 10
## Selection Algorithm: backward
## X X2 X3 X4 X5 X6 X7 X8 X9 X10
## 1 ( 1 ) " " " " "*" " " " " " " " " " " " " " "
## 2 ( 1 ) " " "*" "*" " " " " " " " " " " " " " "
## 3 ( 1 ) "*" "*" "*" " " " " " " " " " " " " " "
## 4 ( 1 ) "*" "*" "*" " " " " " " " " "*" " " " "
## 5 ( 1 ) "*" "*" "*" " " " " "*" " " "*" " " " "
## 6 ( 1 ) "*" "*" "*" " " "*" "*" " " "*" " " " "
## 7 ( 1 ) "*" "*" "*" " " "*" "*" "*" "*" " " " "
## 8 ( 1 ) "*" "*" "*" " " "*" "*" "*" "*" "*" " "
## 9 ( 1 ) "*" "*" "*" " " "*" "*" "*" "*" "*" "*"
## 10 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"
par(mfrow = c(1, 3))
plot(reg.summary.bwd$cp, xlab = "Number of Predictors using backward selection", ylab = "Cp", type = "l")
points(which.min(reg.summary.bwd$cp), reg.summary.bwd$cp[which.min(reg.summary.bwd$cp)], col = "red", cex = 2, pch = 20)
plot(reg.summary.bwd$bic, xlab = "Number of Predictors using backward selection", ylab = "BIC", type = "l")
points(which.min(reg.summary.bwd$bic), reg.summary.bwd$bic[which.min(reg.summary.bwd$bic)], col = "red", cex = 2, pch = 20)
plot(reg.summary.bwd$adjr2, xlab = "Number of Predictors using backward selection", ylab = "Adjusted R2", type = "l")
points(which.max(reg.summary.bwd$adjr2), reg.summary.bwd$adjr2[which.max(reg.summary.bwd$adjr2)], col = "red", cex = 2, pch = 20)
* The same result as the forward selection and exhaustive search methods.
coef(regfit.bwd, which.min(reg.summary.bwd$cp))
## (Intercept) X X2 X3
## 0.8296789 1.9036658 3.0029415 4.0009233
x <- model.matrix(Y ~ .-1 , data = data)
y <- data$Y
cv.lasso <- cv.glmnet(x, y, alpha = 1)
plot(cv.lasso)
best.lambda <- cv.lasso$lambda.min
lasso.coef <- predict(cv.lasso, s = best.lambda, type = "coefficients")
lasso.coef
## 11 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 39.9901000
## X 0.9139107
## X2 1.7133365
## X3 3.9153370
## X4 .
## X5 .
## X6 .
## X7 .
## X8 .
## X9 .
## X10 .
Y_new <- 1 + 4*X^7 + eps # Define new response
data_new <- data # Keep the same predictors
data_new$Y <- Y_new # Update response
best_subset_new <- regsubsets(Y ~ ., data = data_new, nvmax = 10)
best_subset_new.summary <- summary(best_subset_new)
best_subset_new.summary
## Subset selection object
## Call: regsubsets.formula(Y ~ ., data = data_new, nvmax = 10)
## 10 Variables (and intercept)
## Forced in Forced out
## X FALSE FALSE
## X2 FALSE FALSE
## X3 FALSE FALSE
## X4 FALSE FALSE
## X5 FALSE FALSE
## X6 FALSE FALSE
## X7 FALSE FALSE
## X8 FALSE FALSE
## X9 FALSE FALSE
## X10 FALSE FALSE
## 1 subsets of each size up to 10
## Selection Algorithm: exhaustive
## X X2 X3 X4 X5 X6 X7 X8 X9 X10
## 1 ( 1 ) " " " " " " " " " " " " "*" " " " " " "
## 2 ( 1 ) "*" " " " " " " " " " " "*" " " " " " "
## 3 ( 1 ) "*" " " "*" " " " " " " "*" " " " " " "
## 4 ( 1 ) "*" " " "*" " " " " " " "*" " " "*" " "
## 5 ( 1 ) "*" " " "*" " " "*" " " "*" " " "*" " "
## 6 ( 1 ) "*" "*" "*" " " " " "*" "*" "*" " " " "
## 7 ( 1 ) "*" "*" "*" "*" "*" " " "*" " " "*" " "
## 8 ( 1 ) "*" "*" "*" "*" "*" " " "*" "*" "*" " "
## 9 ( 1 ) "*" " " "*" "*" "*" "*" "*" "*" "*" "*"
## 10 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"
par(mfrow = c(1, 3))
plot(best_subset_new.summary$cp, xlab = "Number of Predictors", ylab = "Cp", type = "l")
points(which.min(best_subset_new.summary$cp), best_subset_new.summary$cp[which.min(best_subset_new.summary$cp)], col = "red", cex = 2, pch = 20)
plot(best_subset_new.summary$bic, xlab = "Number of Predictors", ylab = "BIC", type = "l")
points(which.min(best_subset_new.summary$bic), best_subset_new.summary$bic[which.min(best_subset_new.summary$bic)], col = "red", cex = 2, pch = 20)
plot(best_subset_new.summary$adjr2, xlab = "Number of Predictors", ylab = "Adjusted R2", type = "l")
points(which.max(best_subset_new.summary$adjr2), best_subset_new.summary$adjr2[which.max(best_subset_new.summary$adjr2)], col = "red", cex = 2, pch = 20)
coef(best_subset_new, which.min(best_subset_new.summary$cp))
## (Intercept) X X3 X7
## 0.891177761 -0.121411481 0.001658063 3.999999980
coef(best_subset_new, which.min(best_subset_new.summary$bic))
## (Intercept) X7
## 0.8722273 4.0000000
coef(best_subset_new, which.max(best_subset_new.summary$adjr2))
## (Intercept) X7
## 0.8722273 4.0000000
There is a small discrepancy here: according to BIC the model with 1 predictor is best, according to \(C_p\) the model with 3 predictors is best,
and according to adjusted \(R^2\) the model with 1 predictor is best.
But even in the best \(C_p\) model with 3 predictors, the estimated coefficients of X and X3 are very close to 0, while the X7 coefficient is close to 4.
Thus the X7 coefficient is essentially 4 in both cases.
This indicates that the single-predictor model with X7 can be considered the best model.
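To see how close the criteria actually are for the smallest model sizes, the values can be read straight from the summary object computed above; a quick sketch:
data.frame(size  = 1:3,
           Cp    = best_subset_new.summary$cp[1:3],
           BIC   = best_subset_new.summary$bic[1:3],
           AdjR2 = best_subset_new.summary$adjr2[1:3])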
lasso_cv_new <- cv.glmnet(x, data_new$Y, alpha = 1)
plot(lasso_cv_new)
coef(lasso_cv_new, s = lasso_cv_new$lambda.min)
## 11 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) -1.220110e+05
## X .
## X2 .
## X3 .
## X4 .
## X5 .
## X6 .
## X7 3.883398e+00
## X8 .
## X9 .
## X10 .
Clearly, the model with 1 predictor is the best model,
and the best single-predictor model uses X7 as the predictor.
This matches the true model, and the estimated coefficient is ≈ 3.88 (close to 4).
Therefore the variable selection methods recover the true relationship and come close to the true coefficient values (which is acceptable, given the noise term ϵ).
Both the lasso and best subset selection lead to a single-predictor model in X7 with an estimated coefficient of ≈ 4.
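Since the lasso estimate (≈ 3.88) is slightly shrunken relative to the true value of 4, an optional follow-up is to refit ordinary least squares on the lasso-selected variables. A post-lasso sketch; cf, selected, and post_lasso are hypothetical names:
cf <- as.matrix(coef(lasso_cv_new, s = lasso_cv_new$lambda.min))
selected <- setdiff(rownames(cf)[cf[, 1] != 0], "(Intercept)")
post_lasso <- lm(reformulate(selected, response = "Y"), data = data_new)
coef(post_lasso)  # the X7 coefficient should now be essentially unshrunken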
data(College)
set.seed(96)
train <- sample(nrow(College), nrow(College) / 2)
test <- -train
train_data <- College[train, ]
test_data <- College[test, ]
lm_fit <- lm(Apps ~ ., data = train_data)
lm_pred <- predict(lm_fit, newdata = test_data)
lm_test_error <- mean((lm_pred - test_data$Apps)^2)
print(paste("Linear Model Test Error:", lm_test_error))
## [1] "Linear Model Test Error: 1412660.07615934"
x_train <- model.matrix(Apps ~ ., data = train_data)[, -1]
y_train <- train_data$Apps
x_test <- model.matrix(Apps ~ ., data = test_data)[, -1]
y_test <- test_data$Apps
cv_ridge <- cv.glmnet(x_train, y_train, alpha = 0) #ridge regression
best_lambda_ridge <- cv_ridge$lambda.min
ridge_pred <- predict(cv_ridge, s = best_lambda_ridge, newx = x_test)
ridge_test_error <- mean((ridge_pred - y_test)^2)
print(paste("Ridge Regression Test Error:", ridge_test_error))
## [1] "Ridge Regression Test Error: 2201690.18667754"
cv_lasso <- cv.glmnet(x_train, y_train, alpha = 1) # lasso
best_lambda_lasso <- cv_lasso$lambda.min
lasso_pred <- predict(cv_lasso, s = best_lambda_lasso, newx = x_test)
lasso_test_error <- mean((lasso_pred - y_test)^2)
lasso_coef <- predict(cv_lasso, s = best_lambda_lasso, type = "coefficients")
non_zero_coef <- sum(lasso_coef != 0) - 1 # Subtract 1 for intercept
print(paste("Lasso Regression Test Error:", lasso_test_error))
## [1] "Lasso Regression Test Error: 1416808.49996602"
print(paste("Number of Non-Zero Coefficients:", non_zero_coef))
## [1] "Number of Non-Zero Coefficients: 16"
pcr_fit <- pcr(Apps ~ ., data = train_data, scale = TRUE, validation = "CV")
validationplot(pcr_fit, val.type = "MSEP")
best_m_pcr <- which.min(pcr_fit$validation$PRESS)
pcr_pred <- predict(pcr_fit, newdata = test_data, ncomp = best_m_pcr)
pcr_test_error <- mean((pcr_pred - y_test)^2)
print(paste("PCR Test Error:", pcr_test_error))
## [1] "PCR Test Error: 1412660.07615935"
print(paste("Optimal M (PCR):", best_m_pcr))
## [1] "Optimal M (PCR): 17"
pls_fit <- plsr(Apps ~ ., data = train_data, scale = TRUE, validation = "CV")
validationplot(pls_fit, val.type = "MSEP")
best_m_pls <- which.min(MSEP(pls_fit)$val["adjCV", 1, ]) - 1 ## considering adjusted CV; subtract 1 for the 0-component entry
pls_pred <- predict(pls_fit, newdata = test_data, ncomp = best_m_pls)
pls_test_error <- mean((pls_pred - y_test)^2)
print(paste("PLS Test Error:", pls_test_error))
## [1] "PLS Test Error: 1414330.86739303"
print(paste("Optimal M (PLS):", best_m_pls))
## [1] "Optimal M (PLS): 13"
cat("\nResults Summary:\n")
##
## Results Summary:
cat(paste("Linear Model Test Error:", lm_test_error, "\n"))
## Linear Model Test Error: 1412660.07615934
cat(paste("Ridge Regression Test Error:", ridge_test_error, "\n"))
## Ridge Regression Test Error: 2201690.18667754
cat(paste("Lasso Regression Test Error:", lasso_test_error, "\n"))
## Lasso Regression Test Error: 1416808.49996602
cat(paste("PCR Test Error:", pcr_test_error, "\n"))
## PCR Test Error: 1412660.07615935
cat(paste("PLS Test Error:", pls_test_error, "\n"))
## PLS Test Error: 1414330.86739303
Discussion and comments on the results summary:
* Linear Model (Least Squares): test error 1,412,660.08. This serves as the baseline for comparison.
* Lasso Regression: test error 1,416,808.50. The lasso's performance was very similar to the linear model, indicating that the feature selection it performed did not substantially impact prediction accuracy.
* PCR (Principal Components Regression): test error 1,412,660.08. With an optimal M of 17 (all components), PCR reproduces the least squares fit and therefore achieves an identical test error to the linear model.
* Ridge Regression: test error 2,201,690.19. Ridge exhibited the highest test error, suggesting that the L2 penalty, at the cross-validated lambda, was detrimental in this case.
* PLS (Partial Least Squares): test error 1,414,330.87. PLS performed similarly to the linear model and the lasso, with a slightly higher error, indicating that the PLS components are also good predictors.
We can predict the number of college applications received with reasonable accuracy. The test errors, while large in absolute terms, need to be considered in the context of the scale of the “Apps” variable. The fact that the linear model, PCR, and Lasso all have very similar errors indicates a good level of predictive power.
The fact that PCR’s optimal M was 17 (using all predictors) explains its identical performance to the linear model. PCR, when allowed to use all components, essentially replicates a least squares regression.
Lasso also performs very similarly, showing that the model is not improved by feature selection.
PLS performs very closely to the other models, but with a slight increase in error.
Ridge regression's significantly higher error is the clear outlier here; it suggests that L2 regularization, at the lambda chosen by cross-validation, hurt prediction in this case.
There is remarkably little difference among the test errors of the linear model, Lasso, and PCR. This suggests that the data is well-suited to a linear model and that feature selection or dimensionality reduction did not provide significant benefits.
The results strongly suggest that a standard linear regression model is sufficient for predicting college applications in this dataset. The added complexity of Lasso, PCR (when using all components), and PLS did not lead to substantial improvements. The poor performance of Ridge regression warrants further investigation.
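To judge these errors relative to the scale of Apps (as noted above), one can compute a test-set \(R^2\) for each fitted model; a sketch using the prediction objects created earlier (r2 is a hypothetical helper function):
sst <- sum((y_test - mean(y_test))^2)                  # total sum of squares on the test set
r2 <- function(pred) 1 - sum((pred - y_test)^2) / sst  # test-set R^2
c(lm = r2(lm_pred), ridge = r2(ridge_pred), lasso = r2(lasso_pred),
  pcr = r2(pcr_pred), pls = r2(pls_pred))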
y <- runif(100) # note: y is overwritten below with a binary outcome
# Load required libraries
library(ggplot2)
# Generate data
set.seed(123) # for reproducibility
x1 <- rnorm(100)          # predictor that determines y
x2 <- rnorm(100, 2, 1)    # second predictor, generated independently of y
y <- ifelse(x1 > 0, 1, 0) # binary outcome variable
# Scatter plot
plot(x1, y, main = "Scatter plot of x1 vs y",
xlab = "x1", ylab = "y", pch = 19, col = "blue")
# Fit logistic regression model
model <- glm(y ~ x2, family = 'binomial')
# Display summary of the model
summary(model)
##
## Call:
## glm(formula = y ~ x2, family = "binomial")
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.21780 0.44255 0.492 0.623
## x2 -0.07275 0.20832 -0.349 0.727
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 138.47 on 99 degrees of freedom
## Residual deviance: 138.35 on 98 degrees of freedom
## AIC: 142.35
##
## Number of Fisher Scoring iterations: 3
# Plot using ggplot2 with logistic regression curve
ggplot(data = data.frame(x1,x2, y), aes(x = x1, y = y)) +
geom_point(color = "blue") + # Plot the points
geom_smooth(method = "glm", method.args = list(family = "binomial"), color = "red") +
labs(title = "Logistic Regression Curve", x = "x1", y = "Probability of y") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
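The warnings above most likely come from the geom_smooth() fit of y on x1: because y was defined as ifelse(x1 > 0, 1, 0), x1 separates the two classes perfectly. A minimal check (sep_check is a hypothetical name, fitting the same relationship directly):
sep_check <- glm(y ~ x1, family = binomial)  # expect the same convergence warnings
coef(sep_check)  # the x1 slope is pushed towards a huge value under perfect separation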
# Calculate the log-odds (logits) from the fitted model
logits <- predict(model, type = "link")
# Plot Log-Odds (logits) vs. Predictor x
plot(x2, logits, main = "Log-Odds vs. Predictor",
     xlab = "Predictor (x2)", ylab = "Log-Odds",
     pch = 19, col = "blue")
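Since predict(model, type = "link") returns the fitted linear predictor, the plot above should be an exact straight line in x2. A quick sanity check against the coefficients (a sketch using the model object fitted above):
manual_logits <- coef(model)[1] + coef(model)[2] * x2
all.equal(as.numeric(logits), as.numeric(manual_logits))  # should be TRUE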