#Generate X and Noise
set.seed(1)
n <- 100
X <- rnorm(n)
eps <- rnorm(n)
#Generate Y in accordance with the model
beta0 <- 1.0
beta1 <- 2.0
beta2 <- -3.0
beta3 <- 0.5
Y <- beta0 + beta1*X + beta2*(X^2) + beta3*(X^3) + eps
#Forming Data Frame for Model Fitting
poly_data <- data.frame(
Y = Y,
X = X,
X2 = X^2,
X3 = X^3,
X4 = X^4,
X5 = X^5,
X6 = X^6,
X7 = X^7,
X8 = X^8,
X9 = X^9,
X10 = X^10
)
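As an aside, the same powers could be built more compactly with poly(); a minimal sketch (raw = TRUE gives plain powers rather than orthogonal polynomials, and the resulting columns come out named X1 through X10, so the formulas below would need to match those names):
poly_data_alt <- data.frame(Y = Y, poly(X, 10, raw = TRUE))  # equivalent design matrix, columns X1..X10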
#Fitting Different Subsets
library(leaps)
regfit <- regsubsets(
Y ~ X + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 + X10,
data = poly_data,
nvmax = 10
)
#Comparing Models
reg_summary <- summary(regfit)
reg_summary$cp
## [1] 917.497515 63.788757 -1.340110 0.354922 2.052791 3.853556
## [7] 5.264784 7.169409 9.065416 11.000000
reg_summary$bic
## [1] -58.6585 -240.0439 -290.8529 -286.5847 -282.3145 -277.9308 -273.9830
## [8] -269.4848 -264.9963 -260.4646
reg_summary$adjr2
## [1] 0.4875417 0.9193942 0.9532043 0.9528708 0.9525287 0.9521244 0.9519211
## [8] 0.9514447 0.9509625 0.9504479
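Before plotting the criteria, it can help to see which predictors actually enter each model size; a quick sketch using the membership matrix stored in the summary:
reg_summary$which[1:3, ]  # variables included in the 1-, 2-, and 3-predictor models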
#Plots
#Cp
plot(reg_summary$cp, xlab = "Number of Predictors", ylab = "Cp",
pch = 19, type = "b")
best_cp <- which.min(reg_summary$cp)
points(best_cp, reg_summary$cp[best_cp], col = "red", cex = 2, pch = 20)
abline(h = min(reg_summary$cp), col = "red", lty = 2)
#BIC
plot(reg_summary$bic, xlab = "Number of Predictors", ylab = "BIC",
pch = 19, type = "b")
best_bic <- which.min(reg_summary$bic)
points(best_bic, reg_summary$bic[best_bic], col = "red", cex = 2, pch = 20)
abline(h = min(reg_summary$bic), col = "red", lty = 2)
#Adjusted R^2
plot(reg_summary$adjr2, xlab = "Number of Predictors", ylab = "Adjusted R^2",
pch = 19, type = "b")
best_adjr2 <- which.max(reg_summary$adjr2)
points(best_adjr2, reg_summary$adjr2[best_adjr2], col = "red", cex = 2, pch = 20)
abline(h = max(reg_summary$adjr2), col = "red", lty = 2)
#Reporting Coefficients for the 3-Variable Model
coef_3var <- coef(regfit, id = 3)
coef_3var
## (Intercept) X X2 X5
## 1.07219472 2.44514720 -3.15676236 0.09022577
d.)
# Forward stepwise
regfit_fwd <- regsubsets(
Y ~ X + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 + X10,
data = poly_data,
nvmax = 10,
method = "forward"
)
# Backward stepwise
regfit_bwd <- regsubsets(
Y ~ X + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 + X10,
data = poly_data,
nvmax = 10,
method = "backward"
)
summary_fwd <- summary(regfit_fwd)
summary_bwd <- summary(regfit_bwd)
summary_fwd$cp
## [1] 917.497515 63.788757 -1.340110 0.354922 2.124432 4.041147
## [7] 6.009102 7.257998 9.065416 11.000000
summary_bwd$cp
## [1] 917.497515 63.788757 -1.340110 0.417214 2.182401 3.853556
## [7] 5.264784 7.257998 9.065416 11.000000
#Comparing Summaries
#Forward stepwise:
which.min(summary_fwd$cp)
## [1] 3
which.min(summary_fwd$bic)
## [1] 3
which.max(summary_fwd$adjr2)
## [1] 3
#Backward stepwise:
which.min(summary_bwd$cp)
## [1] 3
which.min(summary_bwd$bic)
## [1] 3
which.max(summary_bwd$adjr2)
## [1] 3
#Coefficients
#Best model size is 3 for forward stepwise
coef_fwd_3 <- coef(regfit_fwd, id = 3)
#Best model size is 3 for backward stepwise
coef_bwd_3 <- coef(regfit_bwd, id = 3)
coef_fwd_3
## (Intercept) X X2 X5
## 1.07219472 2.44514720 -3.15676236 0.09022577
coef_bwd_3
## (Intercept) X X2 X5
## 1.07219472 2.44514720 -3.15676236 0.09022577
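As a quick check that forward and backward selection landed on the same 3-variable model (the printed coefficients suggest they did), the membership rows can be compared directly; a sketch:
identical(summary_fwd$which[3, ], summary_bwd$which[3, ])  # should be TRUE here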
e.)
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-8
X_matrix <- model.matrix(
Y ~ X + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 + X10,
data = poly_data
)[, -1] # remove the intercept column
#10-fold cross-validation for Lasso
cv_lasso <- cv.glmnet(X_matrix, poly_data$Y, alpha = 1)
best_lambda <- cv_lasso$lambda.min #minimizes cross-validation error
lasso_coef <- coef(cv_lasso, s = best_lambda)
lasso_coef
## 11 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 1.04330762
## X 2.28855463
## X2 -3.11000997
## X3 0.13706910
## X4 .
## X5 0.06479831
## X6 .
## X7 .
## X8 .
## X9 .
## X10 .
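The lambda minimizing the CV error still keeps small X3 and X5 terms; a common alternative, sketched below, is the one-standard-error rule built into cv.glmnet, which usually gives a sparser fit:
coef(cv_lasso, s = "lambda.1se")  # largest lambda within one SE of the CV minimum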
#1.) Generate X, noise, and Y for the new model
set.seed(123)
n <- 100 # sample size
X <- rnorm(n) # predictor
eps <- rnorm(n) # random noise
beta0 <- 1.5
beta7 <- 2.0
Y <- beta0 + beta7 * (X^7) + eps
poly_data <- data.frame(
Y = Y,
X = X,
X2 = X^2,
X3 = X^3,
X4 = X^4,
X5 = X^5,
X6 = X^6,
X7 = X^7,
X8 = X^8,
X9 = X^9,
X10 = X^10
)
#Best Subset Selection
library(leaps)
regfit_full <- regsubsets(
Y ~ X + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 + X10,
data = poly_data,
nvmax = 10
)
reg_summary <- summary(regfit_full)
which.min(reg_summary$cp)
## [1] 1
which.min(reg_summary$bic)
## [1] 1
which.max(reg_summary$adjr2)
## [1] 6
coef(regfit_full, id = 1) # Cp and BIC (two of the three criteria) indicate a 1-variable model
## (Intercept) X7
## 1.393251 1.999704
#And it returns X7 as the main contributor.
#Stepwise Selections
# Forward
regfit_fwd <- regsubsets(
Y ~ X + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 + X10,
data = poly_data,
nvmax = 10,
method = "forward"
)
summary_fwd <- summary(regfit_fwd)
# Backward
regfit_bwd <- regsubsets(
Y ~ X + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 + X10,
data = poly_data,
nvmax = 10,
method = "backward"
)
summary_bwd <- summary(regfit_bwd)
summary_fwd$cp
## [1] -0.4892301 0.3551574 1.0565357 2.9239183 4.4007817 6.1236555
## [7] 6.3437996 7.1581102 9.1290867 11.0000000
summary_fwd$bic
## [1] -929.1265 -925.7386 -922.5193 -918.0568 -914.0162 -909.7114 -907.0571
## [8] -903.7731 -899.2005 -894.7402
summary_fwd$adjr2
## [1] 0.9999150 0.9999152 0.9999155 0.9999147 0.9999143 0.9999136 0.9999144
## [8] 0.9999146 0.9999137 0.9999128
summary_bwd$cp
## [1] -0.4892301 1.1294727 1.1051856 2.2565990 3.5665229 3.7694845
## [7] 5.7157631 7.1623173 9.0222992 11.0000000
summary_bwd$bic
## [1] -929.1265 -924.9213 -922.4671 -918.7775 -914.9231 -912.3000 -907.7547
## [8] -903.7684 -899.3203 -894.7402
summary_bwd$adjr2
## [1] 0.9999150 0.9999145 0.9999154 0.9999153 0.9999151 0.9999158 0.9999150
## [8] 0.9999146 0.9999138 0.9999128
#Forward stepwise:
which.min(summary_fwd$cp)
## [1] 1
which.min(summary_fwd$bic)
## [1] 1
which.max(summary_fwd$adjr2)
## [1] 3
#Backward stepwise:
which.min(summary_bwd$cp)
## [1] 1
which.min(summary_bwd$bic)
## [1] 1
which.max(summary_bwd$adjr2)
## [1] 6
It can be observed that adjusted R^2 tends to favor a larger model than Cp or BIC. This is because adjusted R^2 penalizes added predictors more gently: it increases whenever a new predictor reduces the residual sum of squares enough to offset the small loss of a degree of freedom. This is not out of the ordinary; adjusted R^2 is the most generous of the three criteria with respect to adding variables.
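To make that penalty concrete, adjusted R^2 can be recomputed by hand from the residual sums of squares stored in the summary object; a minimal sketch (d is the number of predictors in each model, n = 100 as above):
d <- 1:10
tss <- sum((poly_data$Y - mean(poly_data$Y))^2)
adjr2_manual <- 1 - (summary_fwd$rss / (n - d - 1)) / (tss / (n - 1))
all.equal(as.numeric(adjr2_manual), as.numeric(summary_fwd$adjr2))  # should match the forward-stepwise values above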
#Lasso
X_matrix <- model.matrix(
Y ~ X + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 + X10,
data = poly_data
)[, -1] # drop the intercept column
cv_lasso <- cv.glmnet(X_matrix, poly_data$Y, alpha = 1)
best_lambda <- cv_lasso$lambda.min
lasso_coef <- coef(cv_lasso, s = best_lambda)
lasso_coef
## 11 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 1.550655
## X .
## X2 .
## X3 .
## X4 .
## X5 .
## X6 .
## X7 1.941412
## X8 .
## X9 .
## X10 .
a.)
#Split into Test Set and Training Set
library(ISLR2)
# For reproducibility
set.seed(1)
train_idx <- sample(1:nrow(College), floor(nrow(College) * 0.7))
train_data <- College[train_idx, ]
test_data <- College[-train_idx, ]
b.)
#Model
lm_fit <- lm(Apps ~ ., data=train_data)
summary(lm_fit)
##
## Call:
## lm(formula = Apps ~ ., data = train_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5816.1 -451.6 -1.0 327.2 7445.9
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.377e+02 5.076e+02 -1.059 0.28995
## PrivateYes -5.045e+02 1.648e+02 -3.061 0.00232 **
## Accept 1.722e+00 4.763e-02 36.159 < 2e-16 ***
## Enroll -1.055e+00 2.437e-01 -4.329 1.80e-05 ***
## Top10perc 5.358e+01 6.440e+00 8.320 7.64e-16 ***
## Top25perc -1.614e+01 5.092e+00 -3.170 0.00161 **
## F.Undergrad 2.970e-02 4.399e-02 0.675 0.49976
## P.Undergrad 7.162e-02 3.649e-02 1.963 0.05019 .
## Outstate -8.841e-02 2.176e-02 -4.064 5.57e-05 ***
## Room.Board 1.630e-01 5.577e-02 2.923 0.00362 **
## Books 2.727e-01 2.723e-01 1.001 0.31715
## Personal -7.316e-03 7.283e-02 -0.100 0.92002
## PhD -9.676e+00 5.360e+00 -1.805 0.07161 .
## Terminal -3.781e-01 6.015e+00 -0.063 0.94990
## S.F.Ratio 1.627e+01 1.608e+01 1.012 0.31214
## perc.alumni 2.358e+00 4.853e+00 0.486 0.62722
## Expend 5.986e-02 1.337e-02 4.476 9.34e-06 ***
## Grad.Rate 7.158e+00 3.520e+00 2.034 0.04248 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1022 on 525 degrees of freedom
## Multiple R-squared: 0.9332, Adjusted R-squared: 0.9311
## F-statistic: 431.6 on 17 and 525 DF, p-value: < 2.2e-16
#Test Error
pred_lm <- predict(lm_fit, test_data)
mse_lm <- mean((test_data$Apps - pred_lm)^2)
mse_lm
## [1] 1261630
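For a rough sense of scale, this MSE could be compared with the error of a constant prediction (the training-set mean of Apps); a sketch:
mse_null <- mean((test_data$Apps - mean(train_data$Apps))^2)
mse_null  # baseline test MSE for a constant prediction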
c.)
# Build an x matrix and a y vector
x_train <- model.matrix(Apps ~ ., data=train_data)[, -1]
y_train <- train_data$Apps
x_test <- model.matrix(Apps ~ ., data=test_data)[, -1]
y_test <- test_data$Apps
# 10-fold CV to pick lambda
cv_ridge <- cv.glmnet(x_train, y_train, alpha=0)
best_lambda_ridge <- cv_ridge$lambda.min
# Final ridge model
ridge_fit <- glmnet(x_train, y_train, alpha=0, lambda=best_lambda_ridge)
# Test error
pred_ridge <- predict(ridge_fit, s=best_lambda_ridge, newx=x_test)
mse_ridge <- mean((y_test - pred_ridge)^2)
mse_ridge
## [1] 1121808
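As a sanity check on the chosen penalty, the cross-validation curve can be plotted; a sketch using cv.glmnet's built-in plot method, which marks lambda.min and lambda.1se with vertical lines:
plot(cv_ridge)  # CV MSE versus log(lambda), with error bars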
d.)
cv_lasso <- cv.glmnet(x_train, y_train, alpha=1) # alpha=1 => Lasso
best_lambda_lasso <- cv_lasso$lambda.min
lasso_fit <- glmnet(x_train, y_train, alpha=1, lambda=best_lambda_lasso)
# Test error
pred_lasso <- predict(lasso_fit, s=best_lambda_lasso, newx=x_test)
mse_lasso <- mean((y_test - pred_lasso)^2)
mse_lasso
## [1] 1254248
# Number of non-zero coefficients (the count includes the intercept, so all 17 predictors are retained here)
lasso_coef <- coef(lasso_fit, s=best_lambda_lasso)
nonzero_count <- sum(lasso_coef != 0)
nonzero_count
## [1] 18
e.) The test error is reported at the bottom of this part; the cross-validated error is smallest at M = 17.
library(pls)
##
## Attaching package: 'pls'
## The following object is masked from 'package:stats':
##
## loadings
pcr_fit <- pcr(Apps ~ ., data=train_data, scale=TRUE, validation="CV")
# By default, pcr() cross-validates over M = 1 up to the number of predictors (17 here)
summary(pcr_fit)
## Data: X dimension: 543 17
## Y dimension: 543 1
## Fit method: svdpc
## Number of components considered: 17
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 3895 3793 2133 2152 1835 1705 1719
## adjCV 3895 3794 2129 2153 1808 1695 1712
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 1719 1670 1646 1639 1642 1651 1646
## adjCV 1713 1658 1639 1632 1635 1643 1639
## 14 comps 15 comps 16 comps 17 comps
## CV 1647 1621 1216 1197
## adjCV 1640 1603 1205 1185
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps 8 comps
## X 32.051 57.00 64.42 70.27 75.65 80.65 84.26 87.61
## Apps 5.788 71.69 71.70 80.97 82.60 82.60 82.69 84.06
## 9 comps 10 comps 11 comps 12 comps 13 comps 14 comps 15 comps
## X 90.58 92.84 94.93 96.74 97.82 98.72 99.39
## Apps 84.55 84.82 84.86 84.86 85.01 85.05 89.81
## 16 comps 17 comps
## X 99.85 100.00
## Apps 93.03 93.32
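The choice of M is usually read off the CV curve rather than the table; a minimal sketch using pls's validation plot:
validationplot(pcr_fit, val.type = "MSEP")  # CV MSE versus number of components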
# Choose M from the CV results above: the CV error is smallest at M = 17 (essentially
# full least squares), while M = 10 sits on the flat part of the curve and is used here
M_best <- 10
# Refit PCR with the chosen number of components
pcr_fit_best <- pcr(Apps ~ ., data=train_data, scale=TRUE, ncomp=M_best)
# Test error
pred_pcr <- predict(pcr_fit_best, test_data, ncomp=M_best)
mse_pcr <- mean((test_data$Apps - pred_pcr)^2)
mse_pcr
## [1] 1837203
f.) The test error is reported at the bottom of this part; the cross-validated error is minimized at M = 17, though it is essentially flat beyond M = 9.
pls_fit <- plsr(Apps ~ ., data=train_data, scale=TRUE, validation="CV")
summary(pls_fit)
## Data: X dimension: 543 17
## Y dimension: 543 1
## Fit method: kernelpls
## Number of components considered: 17
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 3895 1955 1746 1541 1471 1266 1200
## adjCV 3895 1949 1747 1534 1448 1242 1188
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 1177 1164 1158 1161 1157 1157 1157
## adjCV 1167 1155 1149 1151 1147 1148 1148
## 14 comps 15 comps 16 comps 17 comps
## CV 1156 1155 1155 1155
## adjCV 1147 1146 1146 1146
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps 8 comps
## X 25.68 47.43 62.46 64.88 67.34 72.68 77.20 80.92
## Apps 76.62 82.39 86.93 90.76 92.82 93.05 93.13 93.20
## 9 comps 10 comps 11 comps 12 comps 13 comps 14 comps 15 comps
## X 82.69 85.16 87.35 90.73 92.49 95.10 97.09
## Apps 93.26 93.28 93.30 93.31 93.32 93.32 93.32
## 16 comps 17 comps
## X 98.40 100.00
## Apps 93.32 93.32
# As with PCR, the number of components is chosen from the cross-validation results;
# the CV error is essentially flat beyond M = 9, so M = 10 is used here
M_best_pls <- 10
pls_fit_best <- plsr(Apps ~ ., data=train_data, scale=TRUE, ncomp=M_best_pls)
pred_pls <- predict(pls_fit_best, test_data, ncomp=M_best_pls)
mse_pls <- mean((test_data$Apps - pred_pls)^2)
mse_pls
## [1] 1279353
g.)
mse_data <- data.frame(
Model = c("Linear", "Ridge", "Lasso", "PCR", "PLS"),
TestMSE = c(mse_lm, mse_ridge, mse_lasso, mse_pcr, mse_pls)
)
mse_data
## Model TestMSE
## 1 Linear 1261630
## 2 Ridge 1121808
## 3 Lasso 1254248
## 4 PCR 1837203
## 5 PLS 1279353
It can be clearly seen that ridge has the best (lowest) test error, while PCR stands out with the largest test error by far. The other three models cluster together with very similar test errors. With regard to minimizing test MSE, ridge is the best choice here.
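For additional context, each test MSE could also be expressed as a test R^2 relative to predicting the mean of Apps in the test set; a sketch reusing the objects above:
test_tss <- mean((test_data$Apps - mean(test_data$Apps))^2)
data.frame(Model = mse_data$Model, TestR2 = 1 - mse_data$TestMSE / test_tss)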