graphics.off() # clear all graphs

# Prepare needed libraries
packages <- c("glmnet",     # used for regression
              "caret",      # used for modeling
              "xgboost",    # used for building XGBoost model
              "ISLR",
              "dplyr",      # used for data manipulation and joining
              "tidyselect",
              "stargazer",  # presentation of data
              "data.table", # used for reading and manipulation of data
              "ggplot2",    # used for plotting
              "cowplot",    # used for combining multiple plots
              "e1071",      # used for skewness
              "psych",
              "car"
)
suppressPackageStartupMessages({
  for (i in 1:length(packages)) {
    if (!packages[i] %in% rownames(installed.packages())) {
      install.packages(packages[i],
                       repos = "http://cran.rstudio.com/",
                       dependencies = TRUE
      )
    }
    library(packages[i], character.only = TRUE)
  }
})
rm(packages)
set.seed(7)
Weekly Discussion
Part 1
# Generate response variable 'y' from a Student's t-distribution with 10 observations and 7 degrees of freedom
y <- rt(n = 10, df = 7)

# Generate predictor variables from different distributions:
# Uniform distributions for x1, x2, and x3:
x1 <- runif(n = 10, min = 1, max = 7) # 10 random values between 1 and 7
x2 <- runif(n = 10, min = 2, max = 8)
x3 <- runif(n = 10, min = 3, max = 9)

# Binomial distributions for x4, x5, and x6:
x4 <- rbinom(n = 10, size = 14, prob = 0.7) # 10 random values, each the number of successes in 14 trials with probability 0.7
x5 <- rbinom(n = 10, size = 15, prob = 0.8)
x6 <- rbinom(n = 10, size = 16, prob = 0.9)

# Poisson distributions for x7 to x11:
x7  <- rpois(n = 10, lambda = 5) # 10 random values with an average rate of 5
x8  <- rpois(n = 10, lambda = 6)
x9  <- rpois(n = 10, lambda = 7)
x10 <- rpois(n = 10, lambda = 8)
x11 <- rpois(n = 10, lambda = 9)

reg <- lm(formula = y ~ x1 + x2 + x3 + x4 + x5 + x6 + x7 + x8 + x9 + x10 + x11)
stargazer(reg, type = "text")
========================================
Dependent variable:
---------------------------
y
----------------------------------------
x1 0.231
x2 0.980
x3 -0.453
x4 -0.316
x5 -0.117
x6 -1.296
x7 0.186
x8 0.888
x9 0.127
x10
x11
Constant 14.039
----------------------------------------
Observations 10
R2 1.000
========================================
Note: *p<0.1; **p<0.05; ***p<0.01
As the output shows, OLS cannot estimate all of the betas: the coefficients for x10 and x11 are missing, and no standard errors are reported.
When p > n, several problems prevent OLS from estimating all betas (see the quick check after this list):
- Mathematical impossibility. Estimating all coefficients uniquely requires more observations than parameters; here there are 12 parameters (the intercept plus 11 slopes) but only 10 observations, so the normal equations have infinitely many solutions.
- No degrees of freedom. The residual degrees of freedom are n - (p + 1). When this quantity is zero or negative, the residual variance and the standard errors cannot be estimated.
- Perfect multicollinearity. With more predictor columns than rows, some predictors can always be written as linear combinations of the others, so the design matrix is rank-deficient.
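The rank deficiency can be confirmed directly. The following quick check is my own sketch (not part of the original output; the helper name X_design is mine), inspecting the design matrix behind reg:

# Sketch: the design matrix has more columns than rows, so it cannot have full column rank
X_design <- model.matrix(reg)  # 10 x 12: intercept plus 11 predictors
qr(X_design)$rank              # at most 10, short of the 12 columns
sum(is.na(coef(reg)))          # number of coefficients lm() could not identify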
Part 2
Set up
N <- 10  # number of observations
p <- 100 # number of variables
X <- matrix(rnorm(N * p), ncol = p)
beta <- rnorm(p)
y <- X %*% beta + rnorm(n = N)
Normalization
apply(X = X,
      MARGIN = 2,
      FUN = sd # standard deviation of each column
)
[1] 0.9000842 1.0084974 0.9171923 0.8784224 0.9171138 0.8694956 0.8110299
[8] 0.8710902 0.5496935 1.2390827 1.0518419 1.0030131 0.9632123 1.0334780
[15] 0.8071488 1.0187991 0.9425848 1.0587825 1.2682515 1.0555811 1.0722629
[22] 0.9939775 1.0380717 1.3303358 1.1048965 0.6954717 1.1942691 0.6266085
[29] 1.3090863 1.0612981 1.3696867 0.6943468 0.9262161 1.0643878 1.0164233
[36] 0.8359361 1.1452806 1.0566926 0.6738331 0.9203036 0.9205429 1.0921600
[43] 1.0566481 1.2350392 1.3161283 0.6037398 0.9735241 0.8298963 1.1660314
[50] 1.1205335 0.6421805 1.3309486 0.5717622 0.6676411 1.0629769 0.6968737
[57] 1.0136342 1.0435973 0.8487475 0.6950089 0.8520837 1.0465104 0.9549793
[64] 1.2247120 0.6749819 0.5705312 1.0742498 1.2033368 0.9838953 1.1799823
[71] 0.9550470 0.9370296 0.8710326 0.8014529 0.6450255 0.7233523 0.9082225
[78] 1.2887190 1.0001078 1.2763862 0.6376239 0.9231904 1.2268764 0.9852911
[85] 0.6850452 0.7516824 0.6204208 1.2018508 0.9430598 0.8371741 1.0874075
[92] 0.8545697 1.1797265 1.0935859 0.3851652 0.9940430 0.9613160 0.8901537
[99] 1.0298063 0.7172392
# scale: mean = 0, sd = 1
scaled_X <- scale(x = X)
# after standardization
colMeans(x = scaled_X) # mean ~ 0
[1] -2.688821e-18 -5.551115e-18 -1.942890e-17 0.000000e+00 2.636780e-17
[6] -8.326673e-18 3.538836e-17 -2.012279e-17 7.771561e-17 5.551115e-18
[11] 1.665335e-17 2.498002e-17 -5.551115e-18 0.000000e+00 -5.967449e-17
[16] -2.046974e-17 2.498002e-17 5.551115e-18 7.632783e-18 -2.359224e-17
[21] 1.110223e-17 1.249001e-17 -1.283695e-17 -2.220446e-17 8.326673e-18
[26] 3.642919e-17 1.110223e-17 1.179612e-17 -2.775558e-17 -1.665335e-17
[31] -2.775558e-18 -8.326673e-18 -1.110223e-17 -8.326673e-18 1.387779e-18
[36] 0.000000e+00 -3.885781e-17 4.128642e-17 -6.383782e-17 1.829049e-17
[41] 2.220446e-17 8.326673e-18 1.786765e-17 7.632783e-18 3.885781e-17
[46] 2.203099e-17 -9.020562e-18 4.683753e-18 -2.081668e-17 5.551115e-18
[51] -4.718448e-17 -4.163336e-18 -9.714451e-18 2.775558e-18 2.636780e-17
[56] -5.551115e-18 -2.775558e-18 -5.204170e-18 -1.110223e-17 -4.440892e-17
[61] 6.245005e-18 2.498002e-17 -2.775558e-17 1.387779e-17 1.110223e-17
[66] 5.551115e-17 -6.357762e-17 3.122502e-17 -2.706169e-17 2.914335e-17
[71] 2.775558e-17 -1.942890e-17 6.106227e-17 -5.551115e-18 1.873501e-17
[76] 2.775558e-18 -9.714451e-18 2.775558e-17 2.289835e-17 2.775558e-18
[81] -4.579670e-17 -3.330669e-17 2.220446e-17 7.840950e-17 1.110223e-17
[86] -3.018419e-17 1.387779e-18 3.122502e-18 -1.942890e-17 1.734723e-17
[91] 1.387779e-18 -2.775558e-17 4.163336e-17 -4.163336e-18 -6.106227e-17
[96] 5.551115e-18 -3.712308e-17 -1.804112e-17 -4.579670e-17 3.989864e-17
apply(X = scaled_X,
MARGIN = 2,
      FUN = sd # standard deviation = 1
)
[1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
[38] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
[75] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
df_scaledX <- as.data.frame(scaled_X)
describe(df_scaledX)
vars n mean sd median trimmed mad min max range skew kurtosis se
V1 1 10 0 1 -0.25 -0.22 0.35 -0.92 2.65 3.57 1.74 2.07 0.32
V2 2 10 0 1 0.24 0.06 1.05 -1.94 1.45 3.39 -0.41 -0.95 0.32
V3 3 10 0 1 0.16 0.01 1.08 -1.47 1.36 2.83 -0.27 -1.57 0.32
V4 4 10 0 1 0.30 0.03 0.76 -1.72 1.45 3.17 -0.38 -1.26 0.32
V5 5 10 0 1 0.15 0.15 0.65 -2.32 1.09 3.41 -1.02 0.20 0.32
V6 6 10 0 1 0.25 0.10 0.91 -1.90 1.09 2.99 -0.58 -1.20 0.32
V7 7 10 0 1 0.26 0.06 0.94 -1.77 1.27 3.04 -0.40 -1.30 0.32
V8 8 10 0 1 -0.05 0.00 1.19 -1.56 1.53 3.09 0.01 -1.53 0.32
V9 9 10 0 1 -0.23 0.01 1.02 -1.42 1.38 2.81 0.18 -1.68 0.32
V10 10 10 0 1 0.43 0.09 0.36 -1.79 1.05 2.84 -0.73 -1.29 0.32
V11 11 10 0 1 0.03 -0.03 1.11 -1.27 1.52 2.78 0.17 -1.54 0.32
V12 12 10 0 1 0.00 -0.03 1.07 -1.45 1.69 3.13 0.05 -1.30 0.32
V13 13 10 0 1 0.00 -0.03 1.11 -1.52 1.80 3.32 0.19 -1.17 0.32
V14 14 10 0 1 -0.13 -0.04 0.92 -1.36 1.65 3.01 0.31 -1.32 0.32
V15 15 10 0 1 -0.04 -0.02 0.95 -1.48 1.63 3.11 0.15 -1.31 0.32
V16 16 10 0 1 -0.05 -0.02 0.59 -1.58 1.77 3.36 0.18 -0.96 0.32
V17 17 10 0 1 -0.29 -0.11 0.46 -1.13 1.99 3.11 0.93 -0.66 0.32
V18 18 10 0 1 0.24 0.05 0.88 -1.98 1.59 3.57 -0.39 -0.70 0.32
V19 19 10 0 1 -0.08 -0.04 0.97 -1.55 1.84 3.39 0.25 -1.02 0.32
V20 20 10 0 1 -0.15 -0.07 0.89 -1.40 1.97 3.36 0.38 -0.81 0.32
V21 21 10 0 1 0.03 0.02 1.22 -1.74 1.60 3.34 -0.12 -1.17 0.32
V22 22 10 0 1 0.01 0.00 0.83 -1.78 1.74 3.52 -0.01 -0.82 0.32
V23 23 10 0 1 -0.03 -0.04 0.70 -1.74 2.03 3.77 0.31 -0.29 0.32
V24 24 10 0 1 0.36 0.15 0.48 -2.28 1.08 3.35 -1.07 -0.03 0.32
V25 25 10 0 1 -0.25 -0.14 0.83 -1.14 2.24 3.37 0.92 -0.15 0.32
V26 26 10 0 1 -0.23 -0.06 1.06 -1.24 1.70 2.94 0.39 -1.41 0.32
V27 27 10 0 1 -0.18 0.01 1.57 -1.24 1.16 2.40 0.02 -1.94 0.32
V28 28 10 0 1 0.12 0.09 1.00 -2.08 1.35 3.43 -0.56 -0.61 0.32
V29 29 10 0 1 0.17 -0.04 0.74 -1.59 1.91 3.50 0.12 -0.74 0.32
V30 30 10 0 1 0.22 0.02 0.50 -1.83 1.65 3.47 -0.35 -0.84 0.32
V31 31 10 0 1 0.28 -0.01 0.65 -1.59 1.70 3.29 -0.11 -1.18 0.32
V32 32 10 0 1 0.17 -0.01 1.33 -1.39 1.49 2.87 -0.02 -1.69 0.32
V33 33 10 0 1 0.35 0.04 1.09 -1.52 1.19 2.71 -0.27 -1.78 0.32
V34 34 10 0 1 0.17 0.01 0.80 -1.74 1.67 3.41 -0.20 -1.01 0.32
V35 35 10 0 1 0.09 0.04 1.10 -1.82 1.53 3.36 -0.22 -1.07 0.32
V36 36 10 0 1 0.12 0.10 0.91 -2.07 1.27 3.34 -0.66 -0.68 0.32
V37 37 10 0 1 -0.30 -0.04 1.13 -1.20 1.56 2.76 0.36 -1.59 0.32
V38 38 10 0 1 -0.32 -0.15 0.52 -0.96 2.19 3.15 1.08 -0.26 0.32
V39 39 10 0 1 0.17 0.05 1.02 -1.89 1.47 3.36 -0.34 -0.98 0.32
V40 40 10 0 1 -0.25 -0.03 0.44 -1.70 1.92 3.62 0.38 -0.55 0.32
V41 41 10 0 1 -0.53 -0.13 0.54 -0.90 1.92 2.83 0.68 -1.22 0.32
V42 42 10 0 1 -0.04 -0.01 1.43 -1.27 1.34 2.61 0.12 -1.71 0.32
V43 43 10 0 1 -0.29 0.01 0.49 -1.68 1.63 3.31 0.18 -1.16 0.32
V44 44 10 0 1 -0.21 -0.07 0.80 -1.30 1.89 3.19 0.60 -1.04 0.32
V45 45 10 0 1 0.34 0.05 0.88 -1.46 1.08 2.54 -0.41 -1.70 0.32
V46 46 10 0 1 -0.02 -0.10 1.10 -1.17 1.95 3.12 0.49 -0.95 0.32
V47 47 10 0 1 -0.30 -0.04 0.63 -1.28 1.63 2.92 0.56 -1.27 0.32
V48 48 10 0 1 0.12 -0.11 0.62 -1.37 2.25 3.62 0.77 0.04 0.32
V49 49 10 0 1 -0.17 -0.03 0.98 -1.66 1.90 3.56 0.22 -0.79 0.32
V50 50 10 0 1 -0.46 -0.07 0.76 -0.99 1.52 2.51 0.51 -1.65 0.32
V51 51 10 0 1 0.13 -0.11 0.60 -1.21 2.11 3.33 0.53 -0.50 0.32
V52 52 10 0 1 -0.01 0.07 0.87 -2.00 1.42 3.43 -0.39 -0.71 0.32
V53 53 10 0 1 0.24 0.10 0.72 -2.03 1.26 3.29 -0.62 -0.69 0.32
V54 54 10 0 1 -0.11 -0.03 0.75 -1.70 1.92 3.62 0.20 -0.63 0.32
V55 55 10 0 1 0.34 0.07 0.66 -1.95 1.36 3.31 -0.66 -0.82 0.32
V56 56 10 0 1 -0.03 0.05 1.21 -1.64 1.24 2.87 -0.15 -1.42 0.32
V57 57 10 0 1 -0.32 -0.04 0.90 -1.22 1.51 2.73 0.36 -1.68 0.32
V58 58 10 0 1 -0.11 -0.07 1.07 -1.38 1.96 3.34 0.47 -0.93 0.32
V59 59 10 0 1 -0.27 -0.07 1.05 -1.29 1.88 3.17 0.43 -1.17 0.32
V60 60 10 0 1 0.39 0.08 0.78 -1.76 1.11 2.87 -0.67 -1.19 0.32
V61 61 10 0 1 -0.08 -0.03 1.08 -1.42 1.69 3.11 0.14 -1.32 0.32
V62 62 10 0 1 -0.04 0.06 1.06 -1.79 1.27 3.06 -0.18 -1.29 0.32
V63 63 10 0 1 0.31 0.10 1.08 -1.89 1.12 3.01 -0.54 -1.20 0.32
V64 64 10 0 1 -0.23 -0.05 0.88 -1.61 1.98 3.60 0.36 -0.66 0.32
V65 65 10 0 1 0.54 0.01 0.69 -1.33 1.24 2.58 -0.28 -1.90 0.32
V66 66 10 0 1 -0.12 -0.04 1.30 -1.11 1.41 2.53 0.15 -1.89 0.32
V67 67 10 0 1 -0.01 -0.08 0.83 -1.55 2.18 3.74 0.62 -0.06 0.32
V68 68 10 0 1 0.08 -0.04 0.87 -1.48 1.81 3.29 0.07 -1.07 0.32
V69 69 10 0 1 -0.06 -0.03 0.32 -1.43 1.67 3.10 0.30 -0.98 0.32
V70 70 10 0 1 0.02 -0.01 0.93 -1.52 1.63 3.14 -0.01 -1.28 0.32
V71 71 10 0 1 0.07 -0.02 1.00 -1.57 1.71 3.29 0.08 -1.27 0.32
V72 72 10 0 1 0.25 0.18 0.59 -2.55 1.13 3.68 -1.46 1.44 0.32
V73 73 10 0 1 -0.02 -0.06 1.15 -1.21 1.69 2.91 0.32 -1.45 0.32
V74 74 10 0 1 0.30 -0.01 0.55 -1.40 1.45 2.85 -0.36 -1.49 0.32
V75 75 10 0 1 -0.11 -0.04 0.66 -1.47 1.75 3.21 0.49 -0.96 0.32
V76 76 10 0 1 0.11 -0.01 0.77 -1.69 1.78 3.47 -0.01 -0.91 0.32
V77 77 10 0 1 0.01 -0.11 0.45 -1.44 2.33 3.76 0.88 0.45 0.32
V78 78 10 0 1 0.04 0.04 0.94 -1.94 1.60 3.54 -0.28 -0.80 0.32
V79 79 10 0 1 -0.29 -0.05 0.86 -1.17 1.59 2.76 0.39 -1.64 0.32
V80 80 10 0 1 0.36 0.03 1.01 -1.82 1.55 3.36 -0.29 -1.16 0.32
V81 81 10 0 1 0.05 0.05 1.31 -1.55 1.18 2.73 -0.24 -1.66 0.32
V82 82 10 0 1 0.25 0.04 0.71 -1.77 1.49 3.26 -0.31 -1.26 0.32
V83 83 10 0 1 0.09 -0.06 1.23 -1.29 1.74 3.03 0.43 -1.21 0.32
V84 84 10 0 1 -0.16 -0.02 0.96 -1.46 1.62 3.08 0.19 -1.52 0.32
V85 85 10 0 1 -0.04 -0.01 1.04 -1.61 1.71 3.32 0.10 -1.25 0.32
V86 86 10 0 1 -0.17 0.02 0.81 -1.65 1.51 3.17 0.07 -1.32 0.32
V87 87 10 0 1 0.16 -0.12 0.95 -1.10 2.07 3.17 0.53 -0.72 0.32
V88 88 10 0 1 -0.24 -0.14 0.50 -1.31 2.41 3.72 1.15 0.70 0.32
V89 89 10 0 1 -0.26 -0.04 0.97 -1.36 1.69 3.05 0.34 -1.21 0.32
V90 90 10 0 1 -0.07 -0.04 1.29 -1.19 1.51 2.71 0.23 -1.42 0.32
V91 91 10 0 1 -0.05 0.09 0.88 -1.95 1.26 3.21 -0.34 -0.95 0.32
V92 92 10 0 1 -0.29 0.03 0.88 -1.80 1.52 3.32 -0.05 -1.12 0.32
V93 93 10 0 1 -0.04 0.04 1.21 -1.59 1.29 2.87 -0.07 -1.62 0.32
V94 94 10 0 1 0.18 0.05 1.08 -1.64 1.21 2.85 -0.34 -1.57 0.32
V95 95 10 0 1 -0.05 0.03 0.57 -2.02 1.80 3.82 -0.19 -0.21 0.32
V96 96 10 0 1 -0.13 -0.01 1.08 -1.57 1.68 3.25 0.12 -1.36 0.32
V97 97 10 0 1 -0.09 -0.08 1.05 -1.43 2.09 3.53 0.60 -0.50 0.32
V98 98 10 0 1 0.08 -0.03 1.31 -1.33 1.55 2.88 0.24 -1.47 0.32
V99 99 10 0 1 -0.14 -0.09 1.30 -1.13 1.83 2.96 0.36 -1.33 0.32
V100 100 10 0 1 -0.01 0.06 0.41 -2.07 1.57 3.63 -0.30 -0.27 0.32
scaled_y <- scale(y)
summary(scaled_y)
V1
Min. :-1.51298
1st Qu.:-0.64817
Median : 0.07106
Mean : 0.00000
3rd Qu.: 0.90469
Max. : 1.13932
Optimal Lambda
cv_model <- cv.glmnet(x = scaled_X,
                      y = scaled_y,
                      alpha = 1
)
Warning: Option grouped=FALSE enforced in cv.glmnet, since < 3 observations per
fold
# find the optimal lambda value that minimizes the cross-validated (test) MSE
best_lambda <- cv_model$lambda.min
best_lambda
[1] 0.7808275
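As an aside not in the original run, cv.glmnet() also stores lambda.1se, the largest lambda whose cross-validated error lies within one standard error of the minimum; with only 10 observations this more conservative choice is often recommended:

cv_model$lambda.1se # a more heavily regularized alternative to lambda.min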
Lasso and Ridge
li.eq <- lm(y ~ X - 1)

# Lasso
la.eq <- glmnet(x = scaled_X,
                y = scaled_y,
                lambda = best_lambda,
                family = "gaussian",
                intercept = FALSE,
                alpha = 1
)
# Ridge
ri.eq <- glmnet(x = scaled_X, y = scaled_y, lambda = best_lambda,
                family = "gaussian",
                intercept = FALSE, alpha = 0)
#--------------------------------------
# Results (lambda = best_lambda)
#--------------------------------------
df_comp_high <- data.frame(
  beta = beta,
  Linear = li.eq$coefficients,
  Lasso = la.eq$beta[, 1],
  Ridge = ri.eq$beta[, 1]
)
df_comp_high
beta Linear Lasso Ridge
X1 -0.6620669664 -6.903451 0 0.0015800777
X2 -0.7352366267 10.612519 0 0.0278737178
X3 1.7508699993 21.693025 0 0.0591219811
X4 -0.8491820168 14.410192 0 -0.0301057370
X5 1.8035603967 4.587912 0 -0.0088794952
X6 -0.0784052362 -6.676640 0 0.0114134645
X7 -1.1161000953 8.685751 0 -0.0548412929
X8 -0.0372769939 1.848606 0 -0.0089381563
X9 -1.6815233675 -3.700296 0 -0.0370558007
X10 0.0004679096 -2.530038 0 0.0147744505
X11 -0.2623455328 NA 0 0.0527631873
X12 0.8483555433 NA 0 0.0050220307
X13 0.1799708761 NA 0 0.0187658344
X14 -0.0911368686 NA 0 -0.0224804741
X15 -0.1414202285 NA 0 -0.0094265447
X16 0.4850146355 NA 0 0.0310073012
X17 -0.2696655766 NA 0 -0.0441706015
X18 -0.9688645170 NA 0 -0.0490533502
X19 -1.1234983906 NA 0 -0.0229904096
X20 3.1393687290 NA 0 0.0195703428
X21 -0.9695458912 NA 0 0.0019177570
X22 -1.8375380305 NA 0 -0.0137567842
X23 -1.2795758391 NA 0 0.0403574692
X24 2.1641665929 NA 0 0.0604665451
X25 2.4209916351 NA 0 0.0272225964
X26 2.2109648619 NA 0 0.0276408405
X27 0.9116650148 NA 0 -0.0210689224
X28 0.8978615035 NA 0 -0.0471231424
X29 -0.7193884784 NA 0 -0.0053177871
X30 -0.7573287230 NA 0 -0.0104247761
X31 -1.8101523840 NA 0 -0.0074935131
X32 -0.6094810410 NA 0 -0.0112595301
X33 -0.2778865427 NA 0 -0.0079334868
X34 -0.4699389657 NA 0 -0.0297980368
X35 0.6211107045 NA 0 -0.0339351509
X36 1.5201589999 NA 0 0.0059792831
X37 0.9518111620 NA 0 0.0211468702
X38 -0.5122846090 NA 0 0.0078583296
X39 0.9611689855 NA 0 -0.0025846166
X40 -1.4967590754 NA 0 0.0161980088
X41 -0.2283412946 NA 0 0.0100064031
X42 -0.9104280443 NA 0 0.0427543063
X43 0.9192839074 NA 0 -0.0443426106
X44 0.8641317482 NA 0 0.0347277969
X45 -0.3643413836 NA 0 0.0085369963
X46 0.5662044955 NA 0 -0.0459373737
X47 -0.8509591636 NA 0 -0.0026210523
X48 0.2971417195 NA 0 0.0269057524
X49 0.2662160706 NA 0 0.0114516922
X50 -1.2885402011 NA 0 0.0172944036
X51 0.7481715256 NA 0 0.0172526707
X52 -0.4223271282 NA 0 0.0007400728
X53 -1.7500623726 NA 0 -0.0268112880
X54 0.9234397618 NA 0 -0.0023481429
X55 -0.8821037145 NA 0 -0.0104106106
X56 0.5202904403 NA 0 0.0001420476
X57 -0.3631135825 NA 0 -0.0241345231
X58 0.5266851293 NA 0 0.0183394323
X59 1.3277225888 NA 0 -0.0078230527
X60 1.1275525661 NA 0 -0.0001412315
X61 0.9812273299 NA 0 -0.0010788420
X62 -0.0723424468 NA 0 0.0502083290
X63 2.0384099963 NA 0 0.0057782520
X64 -0.2033425799 NA 0 0.0139379133
X65 -1.2717571839 NA 0 -0.0221676465
X66 -0.4015569492 NA 0 0.0066881307
X67 -0.3897622556 NA 0 -0.0457440935
X68 -0.4887119353 NA 0 0.0316421155
X69 -0.5410535717 NA 0 0.0501620297
X70 -0.9998464113 NA 0 -0.0122367836
X71 0.1825101668 NA 0 -0.0048255900
X72 0.4826074670 NA 0 0.0509179300
X73 0.9767061541 NA 0 -0.0379409536
X74 -0.6682645515 NA 0 0.0180568719
X75 -0.1878769088 NA 0 -0.0173881574
X76 1.6023772710 NA 0 0.0156902110
X77 0.8168614554 NA 0 0.0211399499
X78 -1.0859782982 NA 0 -0.0305714989
X79 -0.4224383235 NA 0 -0.0111994917
X80 -1.8763360056 NA 0 -0.0235776949
X81 0.4952753595 NA 0 0.0054535771
X82 -0.2802398017 NA 0 -0.0391171989
X83 1.7208843087 NA 0 -0.0241213310
X84 -0.2975690799 NA 0 0.0152762996
X85 1.1664894548 NA 0 -0.0406439321
X86 -0.2490940692 NA 0 0.0145892023
X87 1.0907235174 NA 0 0.0560210733
X88 -0.2259807699 NA 0 0.0207892304
X89 -0.3484056581 NA 0 -0.0015631053
X90 0.1673910986 NA 0 0.0082798813
X91 -1.2000012467 NA 0 -0.0214655420
X92 -0.2508087782 NA 0 0.0479789304
X93 0.1638443945 NA 0 -0.0083333872
X94 -0.1099517653 NA 0 -0.0065088203
X95 0.4663823285 NA 0 0.0110495714
X96 -0.6053877287 NA 0 0.0008098798
X97 -1.8733065010 NA 0 0.0139356408
X98 0.3379801804 NA 0 0.0223211714
X99 1.0589476891 NA 0 0.0148030704
X100 1.2738963015 NA 0 -0.0008877544
Based on the results, the Lasso and Ridge coefficients differ substantially, and I prefer Ridge here. At this lambda, Lasso shrinks every coefficient exactly to zero, meaning it selects no variables at all in this dataset. Ridge, in contrast, keeps all 100 variables: the estimates are small, but every variable retains a nonzero coefficient.
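This can be verified directly from the fitted objects above (a small check of my own, not in the original output):

sum(la.eq$beta[, 1] != 0) # Lasso: 0 nonzero coefficients at this lambda
sum(ri.eq$beta[, 1] != 0) # Ridge: all 100 coefficients are nonzero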
Transformation and Rerunning the Lasso/Ridge
X1 <- X
X1[, 8] <- X[, 8] / 1000

cv_model <- cv.glmnet(x = X1,
                      y = y,
                      alpha = 1
)
Warning: Option grouped=FALSE enforced in cv.glmnet, since < 3 observations per
fold
# Lasso
la.eq <- glmnet(x = X1, y = y, lambda = best_lambda,
                family = "gaussian",
                intercept = FALSE, alpha = 1)
# Ridge
ri.eq <- glmnet(x = X1, y = y, lambda = best_lambda,
                family = "gaussian",
                intercept = FALSE, alpha = 0)
df_comp_high <- data.frame(
  beta = beta,
  Linear = li.eq$coefficients,
  Lasso = la.eq$beta[, 1],
  Ridge = ri.eq$beta[, 1]
)
df_comp_high
beta Linear Lasso Ridge
X1 -0.6620669664 -6.903451 0.000000000 0.040234030
X2 -0.7352366267 10.612519 0.000000000 0.183792721
X3 1.7508699993 21.693025 3.452285142 0.502310248
X4 -0.8491820168 14.410192 0.000000000 -0.256232740
X5 1.8035603967 4.587912 0.000000000 -0.144210552
X6 -0.0784052362 -6.676640 0.000000000 0.060008586
X7 -1.1161000953 8.685751 -0.673284795 -0.544683643
X8 -0.0372769939 1.848606 0.000000000 -0.082455216
X9 -1.6815233675 -3.700296 0.000000000 -0.817065643
X10 0.0004679096 -2.530038 0.000000000 0.094890800
X11 -0.2623455328 NA 0.626989974 0.338909910
X12 0.8483555433 NA 0.000000000 0.104230306
X13 0.1799708761 NA 0.000000000 0.125392075
X14 -0.0911368686 NA 0.000000000 -0.130733355
X15 -0.1414202285 NA 0.000000000 -0.088132411
X16 0.4850146355 NA 0.000000000 0.276933423
X17 -0.2696655766 NA 0.000000000 -0.423003425
X18 -0.9688645170 NA 0.000000000 -0.339521937
X19 -1.1234983906 NA 0.000000000 -0.199192103
X20 3.1393687290 NA 0.000000000 0.111301504
X21 -0.9695458912 NA 0.000000000 0.018744648
X22 -1.8375380305 NA 0.000000000 -0.095135127
X23 -1.2795758391 NA 0.000000000 0.381784690
X24 2.1641665929 NA 1.342455580 0.372348984
X25 2.4209916351 NA 0.000000000 0.222784252
X26 2.2109648619 NA 0.000000000 0.458767143
X27 0.9116650148 NA 0.000000000 -0.172963547
X28 0.8978615035 NA -0.550285166 -0.628486626
X29 -0.7193884784 NA 0.000000000 -0.031648922
X30 -0.7573287230 NA 0.000000000 -0.126681442
X31 -1.8101523840 NA 0.000000000 -0.038996230
X32 -0.6094810410 NA 0.000000000 -0.144458090
X33 -0.2778865427 NA 0.000000000 -0.134530655
X34 -0.4699389657 NA 0.000000000 -0.240062754
X35 0.6211107045 NA 0.000000000 -0.317385718
X36 1.5201589999 NA 0.000000000 0.084584052
X37 0.9518111620 NA 0.000000000 0.187525353
X38 -0.5122846090 NA 0.000000000 0.103866990
X39 0.9611689855 NA 0.000000000 0.037306286
X40 -1.4967590754 NA 0.000000000 0.131441896
X41 -0.2283412946 NA 0.000000000 0.060725048
X42 -0.9104280443 NA 0.979594448 0.373464662
X43 0.9192839074 NA 0.000000000 -0.346227572
X44 0.8641317482 NA 0.000000000 0.274778352
X45 -0.3643413836 NA 0.000000000 0.076234581
X46 0.5662044955 NA 0.000000000 -0.702730178
X47 -0.8509591636 NA 0.000000000 -0.046145316
X48 0.2971417195 NA 0.000000000 0.329997499
X49 0.2662160706 NA 0.000000000 0.053867786
X50 -1.2885402011 NA 0.000000000 0.110015921
X51 0.7481715256 NA 0.000000000 0.182139720
X52 -0.4223271282 NA 0.000000000 -0.003272439
X53 -1.7500623726 NA 0.000000000 -0.364551880
X54 0.9234397618 NA 0.000000000 -0.080337306
X55 -0.8821037145 NA 0.000000000 -0.104901168
X56 0.5202904403 NA 0.000000000 -0.010193123
X57 -0.3631135825 NA 0.000000000 -0.209893511
X58 0.5266851293 NA 0.000000000 0.167366161
X59 1.3277225888 NA 0.000000000 -0.138346162
X60 1.1275525661 NA 0.000000000 0.036599514
X61 0.9812273299 NA 0.000000000 0.025366865
X62 -0.0723424468 NA 0.003305248 0.416022440
X63 2.0384099963 NA 0.000000000 0.018009780
X64 -0.2033425799 NA 0.000000000 0.108529999
X65 -1.2717571839 NA 0.000000000 -0.271378028
X66 -0.4015569492 NA 0.000000000 0.121353685
X67 -0.3897622556 NA 0.000000000 -0.422174493
X68 -0.4887119353 NA 0.000000000 0.226638728
X69 -0.5410535717 NA 0.000000000 0.409156036
X70 -0.9998464113 NA 0.000000000 -0.061724694
X71 0.1825101668 NA 0.000000000 -0.065030743
X72 0.4826074670 NA 0.000000000 0.476058856
X73 0.9767061541 NA 0.000000000 -0.305580283
X74 -0.6682645515 NA 0.000000000 0.138480847
X75 -0.1878769088 NA 0.000000000 -0.223163387
X76 1.6023772710 NA 0.000000000 0.186016338
X77 0.8168614554 NA 0.000000000 0.171731877
X78 -1.0859782982 NA 0.000000000 -0.180207477
X79 -0.4224383235 NA 0.000000000 -0.099277563
X80 -1.8763360056 NA 0.000000000 -0.150873966
X81 0.4952753595 NA 0.000000000 0.173148767
X82 -0.2802398017 NA 0.000000000 -0.279062399
X83 1.7208843087 NA 0.000000000 -0.141684021
X84 -0.2975690799 NA 0.000000000 0.163441260
X85 1.1664894548 NA -1.868493298 -0.550429735
X86 -0.2490940692 NA 0.000000000 0.152199515
X87 1.0907235174 NA 1.190722305 0.692668873
X88 -0.2259807699 NA 0.000000000 0.130880339
X89 -0.3484056581 NA 0.000000000 -0.077439675
X90 0.1673910986 NA 0.000000000 0.099061277
X91 -1.2000012467 NA 0.000000000 -0.115884637
X92 -0.2508087782 NA 0.000000000 0.454127058
X93 0.1638443945 NA 0.000000000 -0.019190471
X94 -0.1099517653 NA 0.000000000 -0.060487806
X95 0.4663823285 NA 0.000000000 -0.045628800
X96 -0.6053877287 NA 0.000000000 0.019517466
X97 -1.8733065010 NA 0.000000000 0.177629352
X98 0.3379801804 NA 0.000000000 0.192950965
X99 1.0589476891 NA 0.000000000 0.167227378
X100 1.2738963015 NA 0.000000000 0.043662748
Based on the tables, the rescaled variable X8 is still not selected by Lasso: its coefficient is exactly zero both before and after dividing the column by 1000. Its Ridge coefficient does change (from about -0.0089 to -0.0825), although the second run also switches from the standardized data to the raw X1 and y, so the two tables are not a clean one-factor comparison. The broader point stands: the lasso and ridge penalties act on the coefficients themselves, so they are not scale-invariant. Shrinking a variable's numerical range forces its coefficient to grow to express the same effect, the penalty then weighs on it more heavily, and Lasso becomes more likely to compress it to zero.
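To see why the scale change matters in principle, here is a minimal sketch of my own (the names x, z, and yy are illustrative only): shrinking a predictor's range forces its coefficient to grow by the same factor to express the same effect, so an L1 or L2 penalty applied to the raw coefficients hits the rescaled variable much harder. Note that glmnet standardizes predictors internally by default, which softens this effect in practice.

# Sketch: the same fit needs a 1000x larger coefficient after rescaling
x <- rnorm(100)
z <- x / 1000         # rescaled copy of the same predictor
yy <- 2 * x           # true effect: beta = 2 on the original scale
coef(lm(yy ~ x - 1))  # ~ 2
coef(lm(yy ~ z - 1))  # ~ 2000: identical fit, far more coefficient mass for the penalty to shrink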