# Clear the workspace
rm(list = ls()) # Clear the environment - remove all objects from the workspace
invisible(gc()) # Clear unused memory
cat("\f") # Clear the console
graphics.off() # Close all open graphics devices
# Load the required libraries
library(MASS)
library(stargazer)
##
## Please cite as:
## Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-8
# Set seed for reproducibility
set.seed(42)
# Define parameters
n <- 20 # Number of observations
p <- 100 # Number of predictors
# Simulate predictors (X) and response variable (y)
X <- matrix(rnorm(n * p), nrow = n, ncol = p)
y <- rnorm(n)
# Attempt to fit an OLS model
ols_model <- lm(y ~ X)
summary(ols_model)
##
## Call:
## lm(formula = y ~ X)
##
## Residuals:
## ALL 20 residuals are 0: no residual degrees of freedom!
##
## Coefficients: (81 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.5796240 NaN NaN NaN
## X1 -0.1633698 NaN NaN NaN
## X2 0.0539168 NaN NaN NaN
## X3 0.2902229 NaN NaN NaN
## X4 -0.7841325 NaN NaN NaN
## X5 -0.8992692 NaN NaN NaN
## X6 -0.1560911 NaN NaN NaN
## X7 -0.8007147 NaN NaN NaN
## X8 -0.7305252 NaN NaN NaN
## X9 0.0213804 NaN NaN NaN
## X10 -0.7078046 NaN NaN NaN
## X11 -0.2856775 NaN NaN NaN
## X12 -0.4160416 NaN NaN NaN
## X13 -0.0032922 NaN NaN NaN
## X14 -0.0008478 NaN NaN NaN
## X15 0.4290647 NaN NaN NaN
## X16 0.0103589 NaN NaN NaN
## X17 -1.6703966 NaN NaN NaN
## X18 0.4150547 NaN NaN NaN
## X19 0.5231972 NaN NaN NaN
## X20 NA NA NA NA
## X21 NA NA NA NA
## X22 NA NA NA NA
## X23 NA NA NA NA
## X24 NA NA NA NA
## X25 NA NA NA NA
## X26 NA NA NA NA
## X27 NA NA NA NA
## X28 NA NA NA NA
## X29 NA NA NA NA
## X30 NA NA NA NA
## X31 NA NA NA NA
## X32 NA NA NA NA
## X33 NA NA NA NA
## X34 NA NA NA NA
## X35 NA NA NA NA
## X36 NA NA NA NA
## X37 NA NA NA NA
## X38 NA NA NA NA
## X39 NA NA NA NA
## X40 NA NA NA NA
## X41 NA NA NA NA
## X42 NA NA NA NA
## X43 NA NA NA NA
## X44 NA NA NA NA
## X45 NA NA NA NA
## X46 NA NA NA NA
## X47 NA NA NA NA
## X48 NA NA NA NA
## X49 NA NA NA NA
## X50 NA NA NA NA
## X51 NA NA NA NA
## X52 NA NA NA NA
## X53 NA NA NA NA
## X54 NA NA NA NA
## X55 NA NA NA NA
## X56 NA NA NA NA
## X57 NA NA NA NA
## X58 NA NA NA NA
## X59 NA NA NA NA
## X60 NA NA NA NA
## X61 NA NA NA NA
## X62 NA NA NA NA
## X63 NA NA NA NA
## X64 NA NA NA NA
## X65 NA NA NA NA
## X66 NA NA NA NA
## X67 NA NA NA NA
## X68 NA NA NA NA
## X69 NA NA NA NA
## X70 NA NA NA NA
## X71 NA NA NA NA
## X72 NA NA NA NA
## X73 NA NA NA NA
## X74 NA NA NA NA
## X75 NA NA NA NA
## X76 NA NA NA NA
## X77 NA NA NA NA
## X78 NA NA NA NA
## X79 NA NA NA NA
## X80 NA NA NA NA
## X81 NA NA NA NA
## X82 NA NA NA NA
## X83 NA NA NA NA
## X84 NA NA NA NA
## X85 NA NA NA NA
## X86 NA NA NA NA
## X87 NA NA NA NA
## X88 NA NA NA NA
## X89 NA NA NA NA
## X90 NA NA NA NA
## X91 NA NA NA NA
## X92 NA NA NA NA
## X93 NA NA NA NA
## X94 NA NA NA NA
## X95 NA NA NA NA
## X96 NA NA NA NA
## X97 NA NA NA NA
## X98 NA NA NA NA
## X99 NA NA NA NA
## X100 NA NA NA NA
##
## Residual standard error: NaN on 0 degrees of freedom
## Multiple R-squared: 1, Adjusted R-squared: NaN
## F-statistic: NaN on 19 and 0 DF, p-value: NA
In this simulated scenario with p ≫ n (100 predictors and only 20 observations), OLS cannot produce a usable fit. Only the intercept and the first 19 predictors receive estimates; the remaining 81 coefficients are reported as NA ("not defined because of singularities"), and every standard error is NaN because no residual degrees of freedom remain. The underlying problem is rank deficiency: when p > n the design matrix X cannot have full column rank, its columns are perfectly collinear, and the least-squares coefficients are not uniquely identified.
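A quick way to confirm the rank deficiency directly is to inspect the numerical rank of the design matrix; the sketch below is illustrative only and assumes the X simulated above.
# Check the column rank of the design matrix (intercept plus 100 predictors)
qr(cbind(1, X))$rank # at most n = 20, far below the p + 1 = 101 columns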
# Scale the predictors to mean zero and unit variance
X_scaled <- scale(X)
# Perform cross-validation for Lasso
cv_lasso <- cv.glmnet(X_scaled, y, alpha = 1, nfolds = 5)
# Get the optimal lambda value for Lasso
optimal_lambda_lasso <- cv_lasso$lambda.min
lasso_coefficients <- coef(cv_lasso, s = "lambda.min")
# Optimal lambda value
cat("Optimal Lambda for Lasso: ", optimal_lambda_lasso, "\n")
## Optimal Lambda for Lasso: 0.5640283
# Plot the cross-validation results
plot(cv_lasso)
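cv.glmnet also reports lambda.1se, the largest lambda whose cross-validated error is within one standard error of the minimum; a minimal sketch of inspecting it (not used in the analysis below):
cat("lambda.1se for Lasso: ", cv_lasso$lambda.1se, "\n")
coef(cv_lasso, s = "lambda.1se") # typically an even sparser model than lambda.min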
# Perform cross-validation for Ridge
cv_ridge <- cv.glmnet(X_scaled, y, alpha = 0, nfolds = 5)
# Get the optimal lambda value for Ridge
optimal_lambda_ridge <- cv_ridge$lambda.min
ridge_coefficients <- coef(cv_ridge, s = "lambda.min")
# Output the optimal lambda value and coefficients
cat("Optimal Lambda for Ridge: ", optimal_lambda_ridge, "\n")
## Optimal Lambda for Ridge: 5.640283
plot(cv_ridge)
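For reference, fitted values at the chosen penalty can be obtained directly from the cv.glmnet object; a small illustrative sketch (in-sample, so the error estimate is optimistic):
ridge_fit <- predict(cv_ridge, newx = X_scaled, s = "lambda.min")
mean((y - ridge_fit)^2) # in-sample mean squared error at lambda.min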
# Summary of non-zero coefficients from both models
ridge_non_zero <- which(ridge_coefficients != 0)
lasso_non_zero <- which(lasso_coefficients != 0)
print("Non-zero Ridge Coefficients:")
## [1] "Non-zero Ridge Coefficients:"
print(ridge_coefficients[ridge_non_zero])
## [1] -0.1646529224 -0.0291058573 -0.0340641768 0.0063700972 0.0104727934
## [6] -0.0301403744 0.0109108482 0.0037876294 -0.0154466119 -0.0081071299
## [11] 0.0108057892 0.0050496517 0.0410907680 -0.0208989857 0.0245618743
## [16] 0.0071337225 -0.0204436682 -0.0497435522 0.0265368598 0.0003906498
## [21] -0.0029570052 -0.0232219225 -0.0213913811 0.0274462007 0.0182773639
## [26] 0.0201994680 0.0024239584 -0.0207178020 -0.0043419211 0.0249667224
## [31] 0.0312678204 0.0144799773 -0.0055915302 -0.0109840880 -0.0184609730
## [36] -0.0371145949 0.0350459851 0.0120687100 0.0381936685 -0.0102040871
## [41] -0.0064710388 0.0157622747 0.0008999130 0.0374362277 0.0019006785
## [46] 0.0185591588 -0.0189871492 0.0247680834 0.0094764796 0.0240215675
## [51] -0.0011946665 -0.0072310877 -0.0158727369 0.0459513456 -0.0124469993
## [56] 0.0465814740 0.0015868517 -0.0138635145 -0.0045387662 0.0044873306
## [61] -0.0011943441 -0.0105972635 0.0155693340 0.0210235450 0.0188281772
## [66] -0.0100965273 -0.0174690971 -0.0229925051 0.0132609092 -0.0184545517
## [71] -0.0131686014 0.0113219932 0.0484246040 -0.0253034570 0.0128428821
## [76] 0.0217184052 -0.0021332859 0.0040309954 0.0058467156 -0.0112787133
## [81] -0.0272319857 -0.0029281754 0.0154846301 -0.0154784457 -0.0142740965
## [86] 0.0259856332 0.0119605323 0.0007207626 -0.0166601496 -0.0323446496
## [91] -0.0257290933 0.0102783094 0.0095924415 0.0252797581 -0.0027808296
## [96] -0.0017761066 -0.0381302354 0.0246577162 -0.0049615642 0.0000132025
## [101] 0.0103930173
print("Non-zero Lasso Coefficients:")
## [1] "Non-zero Lasso Coefficients:"
print(lasso_coefficients[lasso_non_zero])
## [1] -0.1646529
Which method do you prefer, and why?
Do they give similar coefficients?
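One way to approach these questions numerically is to compare the two coefficient vectors side by side; a minimal sketch built on the ridge_coefficients and lasso_coefficients objects created above:
# Compare the penalized fits, excluding the intercept
ridge_vec <- as.matrix(ridge_coefficients)[-1, 1]
lasso_vec <- as.matrix(lasso_coefficients)[-1, 1]
c(ridge_nonzero = sum(ridge_vec != 0), lasso_nonzero = sum(lasso_vec != 0))
# Ridge shrinks all 100 coefficients toward zero but keeps them non-zero,
# whereas Lasso at lambda.min drops every predictor in this run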
# Transforming a variable
# Note: lasso_non_zero indexes the coefficient vector from coef(), whose first
# entry is the intercept; here it equals 1, so the division below rescales the
# first column of X rather than a selected predictor.
non_zero_index <- lasso_non_zero[1]
X_transformed <- X_scaled
X_transformed[, non_zero_index] <- X_transformed[, non_zero_index] / 1000
# Rerun Lasso without re-applying scale() (glmnet still standardizes internally by default)
cv_lasso_transformed <- cv.glmnet(X_transformed, y, alpha = 1, nfolds = 5)
lasso_coefficients_transformed <- coef(cv_lasso_transformed, s = "lambda.min")
# Output the coefficients after transformation
print(lasso_coefficients_transformed)
## 101 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) -0.16465292
## V1 .
## V2 .
## V3 .
## V4 .
## V5 .
## V6 .
## V7 .
## V8 .
## V9 .
## V10 .
## V11 .
## V12 0.06223462
## V13 .
## V14 .
## V15 .
## V16 .
## V17 -0.15970959
## V18 .
## V19 .
## V20 .
## V21 .
## V22 .
## V23 .
## V24 .
## V25 .
## V26 .
## V27 .
## V28 .
## V29 .
## V30 .
## V31 .
## V32 .
## V33 .
## V34 .
## V35 .
## V36 .
## V37 .
## V38 .
## V39 .
## V40 .
## V41 .
## V42 .
## V43 .
## V44 .
## V45 .
## V46 .
## V47 .
## V48 .
## V49 .
## V50 .
## V51 .
## V52 .
## V53 0.12920566
## V54 .
## V55 0.02784337
## V56 .
## V57 .
## V58 .
## V59 .
## V60 .
## V61 .
## V62 .
## V63 .
## V64 .
## V65 .
## V66 .
## V67 .
## V68 .
## V69 .
## V70 .
## V71 .
## V72 0.13175282
## V73 .
## V74 .
## V75 .
## V76 .
## V77 .
## V78 .
## V79 .
## V80 .
## V81 .
## V82 .
## V83 .
## V84 .
## V85 .
## V86 .
## V87 .
## V88 .
## V89 .
## V90 .
## V91 .
## V92 .
## V93 .
## V94 .
## V95 .
## V96 .
## V97 .
## V98 .
## V99 .
## V100 .
Impact of Transformation: Rescaling a predictor changes the size of the coefficient needed to express the same effect, and therefore how heavily the L1 penalty weighs on it, so the regularization path and the set of variables Lasso selects can change.
Regularization Sensitivity: Lasso penalizes the sum of the absolute values of the coefficients. If a predictor's values are divided by 1000 (e.g., converting dollars to thousands of dollars), its coefficient must be roughly 1000 times larger to capture the same effect, so the penalty on that coefficient grows and Lasso becomes more likely to set it to zero. Note, however, that glmnet standardizes the predictors internally by default (standardize = TRUE), which largely removes this sensitivity, and that two cv.glmnet runs can also differ simply because of the random fold assignment.
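To isolate the scale effect, one could refit with glmnet's internal standardization switched off; a minimal sketch, assuming the X_transformed and y objects above (standardize = TRUE is glmnet's default, and the set.seed call only fixes the cross-validation folds):
set.seed(42)
cv_lasso_raw <- cv.glmnet(X_transformed, y, alpha = 1, nfolds = 5, standardize = FALSE)
coef(cv_lasso_raw, s = "lambda.min") # without internal standardization, the rescaled column is effectively penalized out of the model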