Fit ridge and LASSO regressions, interpret the coefficients, and visualize how they vary across the range of \(\lambda\).
# load the required packages and data
library(glmnet)
library(caret)
library(plotmo)
data(mtcars)
names(mtcars) # As usual, check out what's inside the loaded dataframe
## [1] "mpg" "cyl" "disp" "hp" "drat" "wt" "qsec" "vs" "am" "gear"
## [11] "carb"
# You can apply complete.cases() to subset to observations without missing values if there are NA entries in the data
# complete.cases() returns a logical vector indicating which observations (rows) contain no NA values
# Because it works row-wise, a row is dropped if it has an NA in at least one of the columns (variables)
df <- mtcars[complete.cases(mtcars),] # Subset data and rename the dataframe to df
# We will first use the train() function from the caret package for demonstration
# set up control function
control = trainControl(method = "cv", number = 5) # 5-fold cross-validation, we will get to this in later weeks
# We first fit a LASSO regression
# Using the train() function from the caret package. Under the hood it calls the glmnet package, hence you see method = "glmnet"
set.seed(12345)
LASSO_caret.fit <- train(mpg ~ .-1, data = df, method = "glmnet", # .-1 means including all right-hand-side variables minus the constant term (1)
trControl = control, # "trControl" argument takes the trainControl() function you just specified
tuneGrid = expand.grid(alpha = 1, # Why do I set alpha to 1?
lambda = c(0, 0.2, 0.4, 0.6, 0.8, 1, 1.2, 1.4, 1.6, 1.8, 2))) # The penalty parameter: 0 <= λ <= 2
# See what's inside the train object
names(LASSO_caret.fit)
## [1] "method" "modelInfo" "modelType" "results" "pred"
## [6] "bestTune" "call" "dots" "metric" "control"
## [11] "finalModel" "preProcess" "trainingData" "ptype" "resample"
## [16] "resampledCM" "perfNames" "maximize" "yLimits" "times"
## [21] "levels" "terms" "coefnames" "xlevels"
# Obtain the coefficient estimates (note that it's the beta coef. that we really care about, not the s.e.)
coef(LASSO_caret.fit$finalModel, LASSO_caret.fit$bestTune$lambda) # Display the coef. estimated at the optimal penalty parameter (lambda): finalModel as the first argument, followed by the optimized lambda value
## 11 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 36.00243493
## cyl -0.88613435
## disp .
## hp -0.01168996
## drat .
## wt -2.70855013
## qsec .
## vs .
## am .
## gear .
## carb .
# Check out the optimized lambda value (that generated the above coef. estimates)
LASSO_caret.fit$bestTune$lambda
## [1] 0.8
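If you also want to see how the cross-validated error behaves across the whole \(\lambda\) grid rather than just the winner, you can inspect the results element of the train object. A minimal sketch (the column names assume caret's default regression metrics, RMSE and Rsquared):
# Cross-validated performance at each candidate lambda in the tuneGrid
LASSO_caret.fit$results[, c("lambda", "RMSE", "Rsquared")]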
# Now we will fit the same model with glmnet's own function, but first we need to convert the data into matrix format
# Data preparation: convert data into matrix format to be fed into glmnet()
# model.matrix() creates a design (or model) matrix for regression model. It creates a column for each term (variable) in your regression formula
# Note that the coefficients are represented in the sparse matrix format, because the solutions along the regularization path are often very sparse. It is more efficient in time and space to use the sparse format.
X = model.matrix(mpg ~ .-1, data = df) # minus the constant term (-1)
Y = df$mpg # our dependent variable
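To confirm what model.matrix() produced, it can help to peek at the design matrix before feeding it to glmnet(); a quick check:
dim(X)     # one column per right-hand-side variable (no intercept column because of the -1)
head(X, 3) # first few rows of the design matrix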
# Estimate a LASSO model by setting alpha to 1 (which is the default in glmnet(), why do I set alpha to 1?)
mtcars.LASSO = glmnet(X, Y, alpha = 1) # X argument should be stored in matrix format
# Check out the glmnet model output (note that we fit a LASSO model)
# Remember to use print() function instead of summary() (summary() will only give you a description of the "class" of model coef.)
print(mtcars.LASSO)
##
## Call: glmnet(x = X, y = Y, alpha = 1)
##
## Df %Dev Lambda
## 1 0 0.00 5.1470
## 2 2 12.90 4.6900
## 3 2 24.81 4.2730
## 4 2 34.69 3.8940
## 5 2 42.90 3.5480
## 6 2 49.71 3.2320
## 7 2 55.37 2.9450
## 8 2 60.06 2.6840
## 9 2 63.96 2.4450
## 10 3 67.26 2.2280
## 11 3 70.15 2.0300
## 12 3 72.56 1.8500
## 13 3 74.55 1.6850
## 14 3 76.21 1.5360
## 15 3 77.59 1.3990
## 16 3 78.73 1.2750
## 17 3 79.68 1.1620
## 18 3 80.46 1.0580
## 19 3 81.12 0.9645
## 20 3 81.66 0.8788
## 21 3 82.11 0.8007
## 22 3 82.49 0.7296
## 23 4 82.81 0.6648
## 24 5 83.20 0.6057
## 25 5 83.60 0.5519
## 26 6 83.96 0.5029
## 27 6 84.26 0.4582
## 28 6 84.51 0.4175
## 29 6 84.72 0.3804
## 30 8 84.89 0.3466
## 31 8 85.14 0.3158
## 32 8 85.35 0.2878
## 33 8 85.53 0.2622
## 34 8 85.68 0.2389
## 35 8 85.80 0.2177
## 36 8 85.90 0.1983
## 37 8 85.98 0.1807
## 38 9 86.06 0.1647
## 39 9 86.15 0.1500
## 40 9 86.22 0.1367
## 41 9 86.27 0.1246
## 42 9 86.32 0.1135
## 43 9 86.36 0.1034
## 44 9 86.39 0.0942
## 45 9 86.42 0.0859
## 46 9 86.44 0.0782
## 47 9 86.46 0.0713
## 48 9 86.48 0.0649
## 49 9 86.49 0.0592
## 50 9 86.50 0.0539
## 51 9 86.51 0.0491
## 52 9 86.52 0.0448
## 53 9 86.52 0.0408
## 54 10 86.54 0.0372
## 55 10 86.60 0.0339
## 56 10 86.65 0.0309
## 57 10 86.69 0.0281
## 58 10 86.73 0.0256
## 59 10 86.76 0.0233
## 60 10 86.78 0.0213
## 61 10 86.80 0.0194
## 62 10 86.82 0.0177
## 63 10 86.83 0.0161
## 64 10 86.84 0.0147
## 65 10 86.85 0.0134
## 66 10 86.86 0.0122
## 67 10 86.87 0.0111
## 68 10 86.87 0.0101
## 69 10 86.88 0.0092
## 70 10 86.88 0.0084
## 71 10 86.88 0.0076
## 72 10 86.89 0.0070
## 73 10 86.89 0.0063
## 74 10 86.89 0.0058
## 75 10 86.89 0.0053
## 76 10 86.89 0.0048
## 77 10 86.89 0.0044
## 78 10 86.90 0.0040
## 79 10 86.90 0.0036
# Display the estimated coef.
coef(mtcars.LASSO, s = cv.glmnet(X, Y)$lambda.1se) # The second argument s (glmnet's name for lambda) picks the lambda value from a cross-validated glmnet() fit; lambda.1se is the largest lambda whose cross-validated error is within one standard error of the minimum
## 11 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 33.471391885
## cyl -0.833427049
## disp .
## hp -0.005886408
## drat .
## wt -2.287815524
## qsec .
## vs .
## am .
## gear .
## carb .
# "s" is glmnet's lambda, it's because the authors want to preserve lambda for something else
# Only cyl, hp, and wt are significant at the optimized lambda value
Note that here you have two choices of \(\lambda\): \(\textsf{lambda.min}\), the \(\lambda\) that minimizes the mean cross-validated error, and \(\textsf{lambda.1se}\), the largest \(\lambda\) whose cross-validated error is within one standard error of that minimum.
That is, \(\textsf{lambda.1se}\) is the largest \(\lambda\) in the set satisfying
\[\begin{align*} \text{avg}(mcr) &\le \min(\text{avg}(mcr)) + \text{se}(mcr), \\ \mbox{where }\text{se}(mcr) &= \frac{\text{sd}(mcr)}{\sqrt{N_{fold}}}. \end{align*}\]
\(\textsf{mcr}\) is the abbreviation for “mis-classification error rate” computed at each \(\lambda\) (for a continuous outcome such as \(\textsf{mpg}\), \(\textsf{cv.glmnet()}\) uses the mean squared error instead), and \(\textsf{Nfold}\) is the number of folds. \(\textsf{avg(mcr)}\) and \(\textsf{se(mcr)}\) are the average and standard error of \(\textsf{mcr}\) at each \(\lambda\). You should get pretty similar penalized results using either \(\lambda\) value. But why do many online \(\textsf{glmnet()}\) examples use \(\textsf{lambda.1se}\) as their optimized \(\lambda\) value instead of \(\textsf{lambda.min}\)? According to Friedman, Hastie, and Tibshirani (2010), we should choose the \(\lambda\) that yields the most parsimonious set of features (variables), that is, the simplest model whose accuracy is comparable to that of the best model (the one at \(\textsf{lambda.min}\)).
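If you want to verify this rule by hand, the ingredients are stored in the cv.glmnet object: cvm holds the mean cross-validated error and cvsd its standard error at each \(\lambda\). A minimal sketch (the object name cv.fit is just illustrative):
set.seed(12345)
cv.fit <- cv.glmnet(X, Y) # cross-validated LASSO path (alpha defaults to 1)
idx.min <- which.min(cv.fit$cvm) # position of lambda.min along the path
# lambda.1se = largest lambda whose mean CV error is within one SE of the minimum
max(cv.fit$lambda[cv.fit$cvm <= cv.fit$cvm[idx.min] + cv.fit$cvsd[idx.min]])
cv.fit$lambda.1se # should match the value computed above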
Let’s check them out.
# Display the estimated coef. at the lambda value that produces the minimum mean cross-validated error
coef(mtcars.LASSO, s = cv.glmnet(X, Y)$lambda.min)
## 11 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 36.00001676
## cyl -0.88608541
## disp .
## hp -0.01168438
## drat .
## wt -2.70814703
## qsec .
## vs .
## am .
## gear .
## carb .
# Plot it out
# We first use the uncool plot :/
plot(mtcars.LASSO, label = TRUE)
# The graph shows the path of each coefficient against the L1-norm of the whole coefficient vector as lambda varies. The axis above the plot indicates the number of non-zero coefficients at the current lambda, which is also the effective degrees of freedom (df) for the LASSO.
# Make a prettier plot using functions from the "plotmo" package
par(mfrow=c(1,2)) # Set up a 1-by-2 plotting layout (we close the plotting device with dev.off() at the end)
plot_glmnet(mtcars.LASSO, xvar = "lambda",
label = 5, # "label" means the top-n variable you want the graph to show (we want it to display the top-5 variables)
xlab = expression(paste("log(", lambda, ")")), # Use expression() to enable Greek letters typesetting
ylab = expression(beta))
plot_glmnet(mtcars.LASSO, label = 5, xlab = expression(paste("log(", lambda, ")")), ylab = expression(beta))
dev.off() # Close the plotting environment
## null device
## 1
# The graph shows that as the magnitude of the penalty (lambda) increases, the estimated coefficients (beta) of most variables "shrink" in magnitude, so the number of variables that still have non-zero coefficients (look at the top x-axis) also decreases.
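To see the same pattern numerically, you can count how many coefficients remain non-zero at a few \(\lambda\) values along the path; a small sketch (the chosen \(\lambda\) values and the helper nonzero_at() are just illustrative):
# Count the non-zero coefficients (excluding the intercept) at increasing penalty values
nonzero_at <- function(l) sum(as.matrix(coef(mtcars.LASSO, s = l))[-1, ] != 0)
sapply(c(0.1, 0.5, 1, 2, 4), nonzero_at)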
# We then fit a Ridge regression using exactly the same specification
set.seed(12345)
Ridge_caret.fit <- train(mpg ~ .-1, data = df, method = "glmnet",
trControl = control, preProc = c("center","scale"),
# Center and scale the data. Unlike OLS regression, the ridge penalty is sensitive to the scale of the predictors. Therefore, it is better to standardize (i.e., center and scale) the predictors before fitting the Ridge regression, so that all predictors are penalized on the same scale (see the note after this chunk).
tuneGrid = expand.grid(alpha = 0, # Why do I set alpha to 0?
lambda = c(0, 0.2, 0.4, 0.6, 0.8, 1, 1.2, 1.4, 1.6, 1.8, 2)))
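As noted in the comment above, caret's preProc handles the centering and scaling here. When we call glmnet() directly below, keep in mind that glmnet() standardizes the predictors internally by default (standardize = TRUE) and reports coefficients on the original scale; if you prefer to standardize by hand, a minimal sketch (object names are illustrative):
X_std <- scale(X) # center and scale each column of the design matrix manually
Ridge_std <- glmnet(X_std, Y, alpha = 0, standardize = FALSE) # turn off glmnet's internal standardization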
# Display coefficient estimates at optimized penalty value (lambda)
coef(Ridge_caret.fit$finalModel, Ridge_caret.fit$bestTune$lambda)
## 11 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 20.0906250
## cyl -0.6400378
## disp -0.6130703
## hp -0.8171092
## drat 0.5591863
## wt -1.3097846
## qsec 0.3098637
## vs 0.3610331
## am 0.8637287
## gear 0.4088496
## carb -0.9500222
# Do a Ridge regression with the same data but using the glmnet() function
mtcars.Ridge = glmnet(X, Y, alpha = 0) # Why do I set alpha to 0?
# Display the result
print(mtcars.Ridge)
##
## Call: glmnet(x = X, y = Y, alpha = 0)
##
## Df %Dev Lambda
## 1 10 0.00 5147.0
## 2 10 1.19 4690.0
## 3 10 1.31 4273.0
## 4 10 1.44 3894.0
## 5 10 1.57 3548.0
## 6 10 1.72 3232.0
## 7 10 1.89 2945.0
## 8 10 2.07 2684.0
## 9 10 2.27 2445.0
## 10 10 2.49 2228.0
## 11 10 2.72 2030.0
## 12 10 2.98 1850.0
## 13 10 3.26 1685.0
## 14 10 3.57 1536.0
## 15 10 3.90 1399.0
## 16 10 4.27 1275.0
## 17 10 4.67 1162.0
## 18 10 5.10 1058.0
## 19 10 5.58 964.5
## 20 10 6.09 878.8
## 21 10 6.65 800.7
## 22 10 7.25 729.6
## 23 10 7.91 664.8
## 24 10 8.62 605.7
## 25 10 9.39 551.9
## 26 10 10.21 502.9
## 27 10 11.11 458.2
## 28 10 12.07 417.5
## 29 10 13.11 380.4
## 30 10 14.21 346.6
## 31 10 15.40 315.8
## 32 10 16.67 287.8
## 33 10 18.02 262.2
## 34 10 19.46 238.9
## 35 10 20.98 217.7
## 36 10 22.59 198.3
## 37 10 24.29 180.7
## 38 10 26.07 164.7
## 39 10 27.93 150.0
## 40 10 29.87 136.7
## 41 10 31.89 124.6
## 42 10 33.97 113.5
## 43 10 36.12 103.4
## 44 10 38.32 94.2
## 45 10 40.56 85.9
## 46 10 42.83 78.2
## 47 10 45.12 71.3
## 48 10 47.43 65.0
## 49 10 49.72 59.2
## 50 10 52.00 53.9
## 51 10 54.25 49.1
## 52 10 56.45 44.8
## 53 10 58.59 40.8
## 54 10 60.67 37.2
## 55 10 62.67 33.9
## 56 10 64.58 30.9
## 57 10 66.39 28.1
## 58 10 68.11 25.6
## 59 10 69.72 23.3
## 60 10 71.23 21.3
## 61 10 72.63 19.4
## 62 10 73.93 17.7
## 63 10 75.12 16.1
## 64 10 76.21 14.7
## 65 10 77.20 13.4
## 66 10 78.11 12.2
## 67 10 78.92 11.1
## 68 10 79.66 10.1
## 69 10 80.33 9.2
## 70 10 80.93 8.4
## 71 10 81.46 7.6
## 72 10 81.94 7.0
## 73 10 82.37 6.3
## 74 10 82.76 5.8
## 75 10 83.10 5.3
## 76 10 83.41 4.8
## 77 10 83.69 4.4
## 78 10 83.94 4.0
## 79 10 84.16 3.6
## 80 10 84.36 3.3
## 81 10 84.55 3.0
## 82 10 84.71 2.7
## 83 10 84.86 2.5
## 84 10 84.99 2.3
## 85 10 85.11 2.1
## 86 10 85.22 1.9
## 87 10 85.33 1.7
## 88 10 85.42 1.6
## 89 10 85.50 1.4
## 90 10 85.58 1.3
## 91 10 85.66 1.2
## 92 10 85.73 1.1
## 93 10 85.79 1.0
## 94 10 85.85 0.9
## 95 10 85.91 0.8
## 96 10 85.96 0.7
## 97 10 86.02 0.7
## 98 10 86.07 0.6
## 99 10 86.12 0.6
## 100 10 86.16 0.5
coef(mtcars.Ridge, s = cv.glmnet(X, Y)$lambda.1se)
## 11 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 21.198965430
## cyl -0.343460134
## disp -0.004584881
## hp -0.012125251
## drat 1.034448412
## wt -1.430452671
## qsec 0.188109721
## vs 0.667988913
## am 1.815495783
## gear 0.564910902
## carb -0.617060594
# Most variables remain after running the Ridge regression
# What might cause this difference (compared to the result obtained using LASSO)? Recall that previous studies have shown that the L2 penalty shrinks the coefficients of correlated predictors towards each other, while the L1 penalty (the one used by LASSO) tends to pick one of them and discard the rest.
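To see this pattern in these data, note that several predictors are highly correlated (e.g., cyl, disp, and wt), and compare how the two penalties treat them at a common penalty value. A rough illustration (the choice of s = 1 here is arbitrary):
cor(df[, c("cyl", "disp", "wt")]) # strongly correlated predictors
round(cbind(lasso = as.matrix(coef(mtcars.LASSO, s = 1))[, 1],
            ridge = as.matrix(coef(mtcars.Ridge, s = 1))[, 1]), 3) # LASSO zeroes some out; Ridge shrinks but keeps all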
# Is this selection result robust to the values of lambda (the penalty parameter)? Let's find out.
# See model coefficients estimated at a particular lambda value - the higher the value of lambda, the greater the penalty
coef(mtcars.Ridge, s = 0.1)
## 11 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 19.425799351
## cyl -0.249615804
## disp -0.001941782
## hp -0.013068281
## drat 0.981164133
## wt -1.892240792
## qsec 0.313425238
## vs 0.482628428
## am 2.111716247
## gear 0.632184273
## carb -0.661753198
coef(mtcars.Ridge, s = 10)
## 11 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 19.782187001
## cyl -0.357084237
## disp -0.005175146
## hp -0.009430710
## drat 0.970302597
## wt -0.845387844
## qsec 0.150382743
## vs 0.867656952
## am 1.147887483
## gear 0.488537961
## carb -0.357761223
# You can plot the Ridge regression using the same plotting method
# plot(mtcars.Ridge, label = TRUE)
An “elastic” combination of LASSO and Ridge that mixes both models’ penalty terms via the \(\alpha\) parameter.
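In glmnet's parameterization, \(\alpha = 1\) gives the LASSO, \(\alpha = 0\) gives Ridge, and values in between blend the two penalties:
\[ \min_{\beta_0,\,\beta} \; \frac{1}{2N}\sum_{i=1}^{N}\left(y_i - \beta_0 - x_i^{\top}\beta\right)^2 + \lambda\left[\frac{1-\alpha}{2}\,\lVert\beta\rVert_2^2 + \alpha\,\lVert\beta\rVert_1\right] \]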
## Using the train() function
enet_caret.fit <- train(mpg ~ .-1, data = df, method = "glmnet",
trControl = control, preProc = c("center","scale"),
tuneGrid = expand.grid(alpha = 0.5, # We temporarily set alpha to 0.5 for a 50-50 mix of LASSO and Ridge
lambda = c(0, 0.2, 0.4, 0.6, 0.8, 1, 1.2, 1.4, 1.6, 1.8, 2)))
# Display the estimated coef. at optimized lambda value
coef(enet_caret.fit$finalModel, enet_caret.fit$bestTune$lambda)
## 11 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 20.0906250
## cyl -1.1806783
## disp -0.2685256
## hp -0.9372092
## drat 0.2891931
## wt -1.9892664
## qsec .
## vs 0.0954949
## am 0.4879888
## gear .
## carb -0.4410428
## Using the glmnet() function
mtcars.enet = glmnet(X, Y, alpha = 0.5) # Why do I set alpha to 0.5?
# Display the result
print(mtcars.enet)
##
## Call: glmnet(x = X, y = Y, alpha = 0.5)
##
## Df %Dev Lambda
## 1 0 0.00 10.2900
## 2 3 9.38 9.3790
## 3 3 19.12 8.5460
## 4 3 27.70 7.7870
## 5 4 35.31 7.0950
## 6 4 42.26 6.4650
## 7 4 48.28 5.8910
## 8 4 53.46 5.3670
## 9 4 57.92 4.8900
## 10 4 61.75 4.4560
## 11 4 65.04 4.0600
## 12 4 67.85 3.6990
## 13 4 70.26 3.3710
## 14 5 72.36 3.0710
## 15 5 74.17 2.7990
## 16 6 75.72 2.5500
## 17 7 77.12 2.3230
## 18 7 78.43 2.1170
## 19 7 79.55 1.9290
## 20 7 80.51 1.7580
## 21 7 81.32 1.6010
## 22 8 82.00 1.4590
## 23 8 82.60 1.3300
## 24 8 83.11 1.2110
## 25 8 83.54 1.1040
## 26 8 83.91 1.0060
## 27 8 84.22 0.9164
## 28 8 84.49 0.8350
## 29 8 84.71 0.7608
## 30 8 84.91 0.6932
## 31 9 85.09 0.6316
## 32 9 85.29 0.5755
## 33 8 85.44 0.5244
## 34 8 85.58 0.4778
## 35 9 85.69 0.4354
## 36 9 85.81 0.3967
## 37 9 85.91 0.3614
## 38 9 86.00 0.3293
## 39 9 86.08 0.3001
## 40 9 86.15 0.2734
## 41 9 86.20 0.2491
## 42 9 86.25 0.2270
## 43 9 86.30 0.2068
## 44 9 86.33 0.1885
## 45 9 86.37 0.1717
## 46 9 86.39 0.1565
## 47 9 86.42 0.1426
## 48 9 86.44 0.1299
## 49 9 86.46 0.1184
## 50 9 86.47 0.1078
## 51 9 86.48 0.0983
## 52 9 86.49 0.0895
## 53 9 86.50 0.0816
## 54 9 86.51 0.0743
## 55 9 86.52 0.0677
## 56 10 86.56 0.0617
## 57 10 86.61 0.0562
## 58 10 86.65 0.0512
## 59 10 86.69 0.0467
## 60 10 86.72 0.0425
## 61 10 86.75 0.0388
## 62 10 86.77 0.0353
## 63 10 86.79 0.0322
## 64 10 86.81 0.0293
## 65 10 86.82 0.0267
## 66 10 86.83 0.0243
## 67 10 86.84 0.0222
## 68 10 86.85 0.0202
## 69 10 86.86 0.0184
## 70 10 86.87 0.0168
## 71 10 86.87 0.0153
## 72 10 86.88 0.0139
## 73 10 86.88 0.0127
## 74 10 86.88 0.0116
## 75 10 86.89 0.0105
## 76 10 86.89 0.0096
## 77 10 86.89 0.0087
## 78 10 86.89 0.0080
## 79 10 86.89 0.0073
## 80 10 86.89 0.0066
## 81 10 86.90 0.0060
## 82 10 86.90 0.0055
## 83 10 86.90 0.0050
coef(mtcars.enet, s = cv.glmnet(X, Y)$lambda.1se)
## 11 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 31.807458212
## cyl -0.676188412
## disp -0.003513132
## hp -0.012479650
## drat 0.400166363
## wt -1.898264925
## qsec .
## vs 0.001022619
## am 0.558290289
## gear .
## carb -0.160368257
# qsec and gear got kicked out
# Do a 5-fold cross-validated elastic net regression with varying degree of penalty
# Cross-validated elastic net regression
cv.enet = cv.glmnet(X, Y,
nfolds = 5, # 5-fold CV
alpha = 0.5, # you can change the mix of LASSO and Ridge
gamma = c(0, 0.25, 0.5, 0.75, 1)) # "gamma" is the relaxed-fit mixing parameter and is only used when relax = TRUE; the penalty (lambda) sequence itself is chosen automatically by cv.glmnet()
windows.options(width = 8, height = 5) # Set the ratio of the plotting frame (Windows only)
plot(cv.enet) # Identify the lambda value that produces the lowest MSE
# We can view the optimized lambda value and the corresponding coef. estimates. For instance,
cv.enet$lambda.min
## [1] 0.631628
# Variable selection based on the lambda (penalty) that gives minimum mean cross-validated (cv) error
coef(cv.enet, s = "lambda.min")
## 11 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 30.9986254994
## cyl -0.6321334606
## disp -0.0005092674
## hp -0.0149130843
## drat 0.6421995415
## wt -2.1651209076
## qsec 0.0122000939
## vs 0.3895792270
## am 1.3522128776
## gear .
## carb -0.3459988928
# Compare to the coefs. estimated at lambda.1se, the largest lambda whose cross-validated error is within 1 standard error of the error at lambda.min
coef(cv.enet, s = "lambda.1se")
## 11 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 31.369950069
## cyl -0.633706991
## disp -0.004657583
## hp -0.010255098
## drat 0.175768508
## wt -1.683744918
## qsec .
## vs .
## am 0.013547997
## gear .
## carb .
# You can do the same with LASSO and Ridge regressions
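For example, a minimal sketch of the same cross-validated selection for LASSO and Ridge (only \(\alpha\) changes; the object names are illustrative):
cv.lasso <- cv.glmnet(X, Y, nfolds = 5, alpha = 1) # cross-validated LASSO
cv.ridge <- cv.glmnet(X, Y, nfolds = 5, alpha = 0) # cross-validated Ridge
coef(cv.lasso, s = "lambda.min")
coef(cv.ridge, s = "lambda.min")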
Functional form specification check using adaptive LASSO via the \(\texttt{polywog}\) package.
# Load the Central Bank Independence (CBI) data again
CBI <- read.csv("https://www.dropbox.com/s/426fnyypna3h0bo/CBI.csv?dl=1")
# Subsetting needed variables (columns)
CBI_sub <- CBI[, c("cbi_cukierman", "polity", "inequality")]
names(CBI_sub)[1] <- "cbi" # rename the first variable
# install.packages("polywog")
library(polywog)
## Warning: package 'polywog' was built under R version 4.4.3
## Loading required package: miscTools
polywog.fit1 <- polywog(cbi ~ ., data = CBI_sub, degree = 2, thresh = 1e-4) # Set degree of polynomial to 2
summary(polywog.fit1) # Shows a list of multiplicative and polynomial terms tested in the model
##
## Call:
## polywog(formula = cbi ~ ., data = CBI_sub, degree = 2, thresh = 1e-04)
##
## Coefficients:
## Estimate Std. Error
## (Intercept) -0.225570 NA
## polity -0.017300 NA
## inequality 2.497447 NA
## polity^2 0.000222 NA
## polity.inequality 0.051525 NA
## inequality^2 -2.810045 NA
##
## Regularization method: Adaptive LASSO
## Adaptive weights: inverse linear model coefficients
## Number of observations: 2066
## Polynomial expansion degree: 2
## Model family: gaussian
## Bootstrap iterations: 0
## Penalization parameter (lambda): 0.0007481
# Plot the results
plot(polywog.fit1)
## Warning in predVals(x, xvars = xnames[i], interval = interval, level = level, :
## Option 'interval' not available for models without a 'boot.matrix' element
## Warning in predVals(x, xvars = xnames[i], interval = interval, level = level, :
## Option 'interval' not available for models without a 'boot.matrix' element
# Inequality does seem to have a non-linear effect on CBI
polywog.boot <- bootPolywog(polywog.fit1, nboot = 5) # Use 5 bootstrap iterations to get simulated s.e.; for LASSO-type models the s.e. is not estimated analytically, but you can obtain simulated upper and lower bounds via the bootstrap
summary(polywog.boot)
##
## Call:
## polywog(formula = cbi ~ ., data = CBI_sub, degree = 2, thresh = 1e-04)
##
## Coefficients:
## Estimate Std. Error 2.5% 97.5%
## (Intercept) -2.256e-01 1.931e-01 -4.634e-01 -0.001
## polity -1.730e-02 6.654e-03 -2.366e-02 -0.008
## inequality 2.497e+00 7.474e-01 1.612e+00 3.348
## polity^2 2.220e-04 1.414e-04 3.944e-05 0.000
## polity.inequality 5.152e-02 1.036e-02 3.858e-02 0.063
## inequality^2 -2.810e+00 6.949e-01 -3.596e+00 -1.968
##
## Regularization method: Adaptive LASSO
## Adaptive weights: inverse linear model coefficients
## Number of observations: 2066
## Polynomial expansion degree: 2
## Model family: gaussian
## Bootstrap iterations: 5
## Penalization parameter (lambda): 0.0007481
# Cross-validate polywog models of degree 1 through 3 (degrees.cv = 1:3) with 10-fold CV
cv.polywog.fit <- cv.polywog(cbi ~ ., data = CBI_sub, degrees.cv = 1:3, nfolds = 10, model = TRUE, thresh = 1e-4)
# "model" argument asks if one should include the model frame in the "polywog" object
# See what's inside the polywog fit stored in the cv.polywog object
names(cv.polywog.fit$polywog.fit)
## [1] "coefficients" "lambda" "lambda.cv" "fitted.values"
## [5] "lmcoef" "penwt" "formula" "degree"
## [9] "family" "weights" "method" "penwt.method"
## [13] "unpenalized" "thresh" "maxit" "terms"
## [17] "polyTerms" "nobs" "na.action" "xlevels"
## [21] "varNames" "call" "model" "X"
## [25] "y"
# "results" is a table of each degree tested, the optimal penalization factor (lambda) for that degree, and its cross-validation error. The smallest cv error occurred at the second degree
cv.polywog.fit$results
## degree lambda.min cv.err
## [1,] 1 0.0009362648 0.03804442
## [2,] 2 0.0007480813 0.03583858
## [3,] 3 0.0097229470 0.03589330
# "degree.min" shows the polynomial degree giving the lowest cross-validation (cv) error
cv.polywog.fit$degree.min
## [1] 2
# "polywog.fit" is just what you will get from the regular lm(), glm() objects, note that the estimated coef. do not have s.e.
summary(cv.polywog.fit$polywog.fit)
##
## Call:
## polywog(formula = cbi ~ ., data = CBI_sub, degree = 2L, boot = 0,
## thresh = 1e-04, model = TRUE, X = FALSE, y = FALSE)
##
## Coefficients:
## Estimate Std. Error
## (Intercept) -0.225570 NA
## polity -0.017300 NA
## inequality 2.497447 NA
## polity^2 0.000222 NA
## polity.inequality 0.051525 NA
## inequality^2 -2.810045 NA
##
## Regularization method: Adaptive LASSO
## Adaptive weights: inverse linear model coefficients
## Number of observations: 2066
## Polynomial expansion degree: 2
## Model family: gaussian
## Bootstrap iterations: 0
## Penalization parameter (lambda): 0.0007481
# Extract the best model and use bootstrap to get lower and upper bounds
cv.polywog.pred <- cv.polywog.fit$polywog.fit
cv.polywog.pred.boot <- bootPolywog(cv.polywog.pred, nboot = 5) # 5 bootstrap iterations
summary(cv.polywog.pred.boot)
##
## Call:
## polywog(formula = cbi ~ ., data = CBI_sub, degree = 2L, boot = 0,
## thresh = 1e-04, model = TRUE, X = FALSE, y = FALSE)
##
## Coefficients:
## Estimate Std. Error 2.5% 97.5%
## (Intercept) -2.256e-01 1.320e-01 -4.165e-01 -0.102
## polity -1.730e-02 4.879e-03 -2.413e-02 -0.013
## inequality 2.497e+00 5.481e-01 1.970e+00 3.297
## polity^2 2.220e-04 1.991e-04 2.011e-05 0.001
## polity.inequality 5.152e-02 6.985e-03 4.294e-02 0.059
## inequality^2 -2.810e+00 5.066e-01 -3.602e+00 -2.338
##
## Regularization method: Adaptive LASSO
## Adaptive weights: inverse linear model coefficients
## Number of observations: 2066
## Polynomial expansion degree: 2
## Model family: gaussian
## Bootstrap iterations: 5
## Penalization parameter (lambda): 0.0007481
# Plot the marginal effects of the predictors from cv.polywog.pred.boot across their ranges of values
plot(margEff(cv.polywog.pred.boot))
Obtain bootstrapped s.e. for Ridge and LASSO models
# If you really want s.e., you can only obtain them through bootstrap simulation
# install.packages("boot")
library(boot)
# Bootstrap the earlier caret LASSO fit on the mtcars data (df), reusing its selected tuning values
bootSamples <- boot(df, function(data, idx) {
  bootstrapData <- data[idx, ] # resample rows with replacement
  bootstrapMod <- train(mpg ~ .-1,
                        data = bootstrapData,
                        method = "glmnet",
                        trControl = control,
                        tuneGrid = LASSO_caret.fit$bestTune) # alpha and lambda chosen earlier
  as.vector(coef(bootstrapMod$finalModel, LASSO_caret.fit$bestTune$lambda)) # coef. vector for this replicate
}, 100L) # 100 bootstrap replicates
bootSamples
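Each row of bootSamples$t is one bootstrap replicate of the coefficient vector, so the column-wise standard deviations give the bootstrapped standard errors; a minimal sketch (boot_se is just an illustrative name):
boot_se <- apply(bootSamples$t, 2, sd) # bootstrap s.e. for each coefficient
names(boot_se) <- rownames(coef(LASSO_caret.fit$finalModel, LASSO_caret.fit$bestTune$lambda)) # label by variable
round(boot_se, 3)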