library(ISLR)
## Warning: package 'ISLR' was built under R version 4.3.3
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-8
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
## 
## Attaching package: 'tidyr'
## The following objects are masked from 'package:Matrix':
## 
##     expand, pack, unpack

Import data

setwd("C:/Thesis")
machine<- read.csv("Edited data.csv", sep=",")
dim(machine)
## [1] 96 42
class(machine)
## [1] "data.frame"
A<-apply(as.matrix.noquote(machine), 2, as.numeric) # coerce every column to numeric; non-numeric entries become NA
## Warning in apply(as.matrix.noquote(machine), 2, as.numeric): NAs
## introduced by coercion
head(A)
##      Headline._Inflation  woil BROAD_MO CLOTHING Electrical_Materials
## [1,]               -6.93 77.64 19399.40 279884.7             938298.5
## [2,]               -0.21 81.86 20678.60 296271.8             899228.3
## [3,]               -3.09 86.09 21957.80 312658.9             860158.1
## [4,]               -7.77 90.31 23237.00 329046.0             821087.9
## [5,]               -0.81 94.53 24516.20 345433.1             782017.6
## [6,]                6.83 87.94 24960.17 375852.4             809772.9
##      EXPIDITURE FERTILIZER FOOD_PRICE FOOD_INFL   GRAIN_ GD_Fixed_.Investment
## [1,]   17533.51   336378.8   766560.3       1.0 646316.2                24.70
## [2,]   17430.30   283999.2   735319.3       1.1 600071.0                25.10
## [3,]   17327.08   231619.5   704078.4       0.3 553825.8                25.50
## [4,]   17223.87   179239.8   672837.5       0.4 507580.5                25.90
## [5,]   17120.66   126860.1   641596.6      -3.3 461335.3                26.30
## [6,]   17672.95   235209.4   822592.7      -5.2 657531.3                27.08
##      GG_Cosumption GNational._Savings GrPrivate._Consumption INVESTMENT Lending
## [1,]         26.20           16195.52                  62.40    17044.9   12.00
## [2,]         25.00           16195.52                  62.72    17282.1   12.13
## [3,]         23.80           16195.52                  63.19    17637.9   12.31
## [4,]         22.60           16195.52                  63.67    17993.7   12.50
## [5,]         21.40           16195.52                  64.14    18349.5   12.69
## [6,]         21.47           16276.01                  64.85    18763.9   12.42
##      MACHINERY      med   NARROW Netdemand_depoisit Net_Foreign_Assets
## [1,]   1351512 246178.6 11378.90            6182.50            4770.60
## [2,]   1372992 254112.8 11773.38            6457.75            4775.50
## [3,]   1405212 266014.2 12365.11            6870.63            4782.85
## [4,]   1437433 277915.6 12956.83            7283.50            4790.20
## [5,]   1469653 289816.9 13548.56            7696.38            4797.55
## [6,]   1511623 304652.4 13813.59            7976.77            5303.75
##      NON_FOOD_ Others._imported Overall_Budget PETROLEU PRIVATE_C REAL_GDP_
## [1,]      1.40          1783302        -5961.6    30.91   43005.1   6244.40
## [2,]      1.20          1860840        -5464.9    25.76   43377.6   6298.60
## [3,]      0.67          1977147        -4719.9    18.03   43936.3   6379.90
## [4,]      0.37          2093455        -3974.8    10.30   44495.0   6461.20
## [5,]     -0.21          2209762        -3229.8     2.58   45053.7   6542.50
## [6,]      2.50          2300627        -3287.5     0.00   45450.2   6556.25
##      Reserve_Requi  REVENUE  RoadMV TDemanded    TSold TELECOMM Unemployment
## [1,]        924.10  9119.70 1548459  18322.80 13311.15 32655.53         3.65
## [2,]        950.38  9289.78 1533097  18303.17 12646.45 38282.71         3.65
## [3,]        989.81  9544.89 1510053  18273.73 11649.39 46723.48         3.65
## [4,]       1029.23  9800.00 1487009  18244.29 10652.34 55164.24         3.65
## [5,]       1068.66 10055.11 1463966  18214.84  9655.29 63605.01         3.51
## [6,]       1095.48 10180.30 1452778  19519.04 10511.31 72258.09         3.51
##      Forein_direct rainfall Global._Food Real._GDP Nominal._GDP Minimum_Deposit
## [1,]          0.91     5.60        57.65    6244.4        135.0               6
## [2,]          0.91    81.91        58.50    6244.4        135.0               6
## [3,]          0.91   127.12        59.25    6244.4        135.0               6
## [4,]          0.91    46.14        60.34    6244.4        135.0               6
## [5,]          1.63   106.11        63.09    6569.6        130.8               6
## [6,]          1.63   102.31        62.17    6569.6        130.8               6
##      Exchange_Rate Capital_.account
## [1,]          8.20            151.7
## [2,]          8.20            151.7
## [3,]          8.20            151.7
## [4,]          8.20            151.7
## [5,]          8.42            258.6
## [6,]          8.42            258.6
W<-as.data.frame(A)
C<-makeX(W, na.impute = TRUE) # glmnet helper: builds a numeric matrix, imputing NAs with column means
D<-as.data.frame(C)
L= model.matrix(Headline._Inflation~., D)[,-1] # trim off the first column,
# leaving only the predictors
M = D %>%
  select(Headline._Inflation) %>% # response vector: headline inflation
  unlist() %>%
  as.numeric()

Ridge Regression

The glmnet() function has an alpha argument that determines what type of model is fit. If alpha = 0 then a ridge regression model is fit, and if alpha = 1 then a lasso model is fit. We first fit a ridge regression model:

grid = 10^seq(10, -2, length = 100)
ridge_mod = glmnet(L, M, alpha = 0, lambda = grid)

By default the glmnet() function performs ridge regression for an automatically selected range of λ values. However, here we have chosen to implement the function over a grid of values ranging from \(λ=10^{10}\) to \(λ=10^{-2}\), essentially covering the full range of scenarios from the null model containing only the intercept to the least squares fit.
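As a quick sanity check, the grid defined above does contain 100 values spanning exactly these two extremes:

length(grid)   # 100 values, evenly spaced on the log scale
range(grid)    # from 1e-02 up to 1e+10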

As we will see, we can also compute model fits for a particular value of λ that is not one of the original grid values. Note that by default, the glmnet() function standardizes the variables so that they are on the same scale. To turn off this default setting, use the argument standardize = FALSE.
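For instance, a minimal sketch of how coefficients could be extracted at λ=50, a value not in the grid (without exact = TRUE, predict() interpolates over the λ values used in the fit):

predict(ridge_mod, s = 50, type = "coefficients") # coefficients at lambda = 50, obtained by interpolation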

Associated with each value of λ is a vector of ridge regression coefficients, stored in a matrix that can be accessed by coef(). In this case, it is a 42×100 matrix, with 42 rows (one for each of the 41 predictors, plus an intercept) and 100 columns (one for each value of λ).

dim(coef(ridge_mod))
## [1]  42 100
plot(ridge_mod)    # Draw plot of coefficients

ridge_mod$lambda[50] #Display 50th lambda value
## [1] 11497.57

We expect the coefficient estimates to be much smaller, in terms of l2 norm, when a large value of λ is used, as compared to when a small value of λ is used. These are the coefficients when λ=11498 , along with their l2 norm:

coef(ridge_mod)[,50] # Display coefficients associated with 50th lambda value
##            (Intercept)                   woil               BROAD_MO 
##           1.668285e+01          -3.214180e-04           1.214120e-08 
##               CLOTHING   Electrical_Materials             EXPIDITURE 
##           9.244630e-10           1.889418e-10           2.677619e-08 
##             FERTILIZER             FOOD_PRICE              FOOD_INFL 
##           9.504212e-10           3.079193e-10           1.765106e-03 
##                 GRAIN_   GD_Fixed_.Investment          GG_Cosumption 
##           1.154776e-09          -1.317489e-03          -1.816991e-03 
##     GNational._Savings GrPrivate._Consumption             INVESTMENT 
##           1.398979e-08           2.988503e-03           1.241711e-08 
##                Lending              MACHINERY                    med 
##           4.110979e-03           1.592233e-10           3.894898e-09 
##                 NARROW     Netdemand_depoisit     Net_Foreign_Assets 
##           3.741240e-08           5.233761e-08          -1.045134e-07 
##              NON_FOOD_       Others._imported         Overall_Budget 
##           1.593769e-03           8.743922e-11          -9.329020e-08 
##               PETROLEU              PRIVATE_C              REAL_GDP_ 
##           2.049658e-07           5.259627e-09           1.037885e-06 
##          Reserve_Requi                REVENUE                 RoadMV 
##           2.782935e-07           3.444163e-08          -1.898946e-10 
##              TDemanded                  TSold               TELECOMM 
##           2.773839e-09           1.833095e-09           5.399088e-11 
##           Unemployment          Forein_direct               rainfall 
##           3.664963e-03          -1.451937e-03          -3.916109e-05 
##           Global._Food              Real._GDP           Nominal._GDP 
##           6.312120e-04           9.507607e-07           1.659822e-05 
##        Minimum_Deposit          Exchange_Rate       Capital_.account 
##           2.790833e-03           5.478562e-04          -2.807869e-07
sqrt(sum(coef(ridge_mod)[-1,50]^2)) # Calculate l2 norm
## [1] 0.007788244

In contrast, here are the coefficients when λ=705 , along with their l2 norm. Note the much larger l2 norm of the coefficients associated with this smaller value of λ .

ridge_mod$lambda[60] #Display 60th lambda value
## [1] 705.4802
coef(ridge_mod)[,60] # Display coefficients associated with 60th lambda value
##            (Intercept)                   woil               BROAD_MO 
##           1.234165e+01          -4.019520e-03           1.228201e-07 
##               CLOTHING   Electrical_Materials             EXPIDITURE 
##           4.745414e-09           9.576558e-10           2.674183e-07 
##             FERTILIZER             FOOD_PRICE              FOOD_INFL 
##           1.010559e-08           3.397199e-09           2.618948e-02 
##                 GRAIN_   GD_Fixed_.Investment          GG_Cosumption 
##           1.361829e-08          -1.768916e-02          -2.211666e-02 
##     GNational._Savings GrPrivate._Consumption             INVESTMENT 
##           1.372256e-07           3.926579e-02           1.214915e-07 
##                Lending              MACHINERY                    med 
##           3.563427e-02           1.259065e-09           5.158627e-08 
##                 NARROW     Netdemand_depoisit     Net_Foreign_Assets 
##           3.775857e-07           5.282449e-07          -1.129317e-06 
##              NON_FOOD_       Others._imported         Overall_Budget 
##           2.166089e-02           9.185464e-10          -9.474864e-07 
##               PETROLEU              PRIVATE_C              REAL_GDP_ 
##           1.844446e-06           5.497991e-08           1.034021e-05 
##          Reserve_Requi                REVENUE                 RoadMV 
##           3.026420e-06           3.341564e-07          -3.722543e-09 
##              TDemanded                  TSold               TELECOMM 
##           2.294356e-08           1.480140e-08          -1.253157e-08 
##           Unemployment          Forein_direct               rainfall 
##           1.621743e-02          -2.564456e-02          -6.499424e-04 
##           Global._Food              Real._GDP           Nominal._GDP 
##           6.550260e-03           9.361512e-06           1.615837e-04 
##        Minimum_Deposit          Exchange_Rate       Capital_.account 
##           2.222842e-02           5.482289e-03          -1.104726e-05
sqrt(sum(coef(ridge_mod)[-1,60]^2)) # Calculate l2 norm
## [1] 0.07920264
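To see this shrinkage over the whole path rather than at two individual grid points, a short sketch (using the ridge_mod object fitted above) plots the l2 norm of the coefficient vector against log(λ):

betas <- as.matrix(coef(ridge_mod))[-1, ] # coefficient matrix without the intercept row
l2_norm <- apply(betas, 2, function(b) sqrt(sum(b^2))) # l2 norm at each lambda
plot(log(ridge_mod$lambda), l2_norm, type = "l",
     xlab = "log(lambda)", ylab = "l2 norm of coefficients")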

We now split the samples into a training set and a test set in order to estimate the test error of ridge regression and the lasso.

set.seed(1)
trainm = D %>%
  sample_frac(0.8)

testm = D %>%
  setdiff(trainm)

x_trainm = model.matrix(Headline._Inflation~., trainm)[,-1]
x_testm = model.matrix(Headline._Inflation~., testm)[,-1]

y_trainm = trainm %>%
  select(Headline._Inflation) %>%
  unlist() %>%
  as.numeric()

y_testm = testm %>%
  select(Headline._Inflation) %>%
  unlist() %>%
  as.numeric()
dim(x_trainm)
## [1] 77 41

Next we fit a ridge regression model on the training set, and evaluate its MSE on the test set, using λ=4. Note the use of the predict() function again: this time we get predictions for the test set by supplying the newx argument instead of type = "coefficients".

ridge_mod = glmnet(x_trainm, y_trainm, alpha=0, lambda = grid, thresh = 1e-12)
## Warning: from glmnet C++ code (error code -98); Convergence for 98th lambda
## value not reached after maxit=100000 iterations; solutions for larger lambdas
## returned
ridge_pred = predict(ridge_mod, s = 4, newx = x_testm)
mean((ridge_pred - y_testm)^2)
## [1] 32.66551

The test MSE is 32.67. Note that if we had instead simply fit a model with just an intercept, we would have predicted each test observation using the mean of the training observations. In that case, we could compute the test set MSE like this:

mean((mean(y_trainm) - y_testm)^2)
## [1] 240.9262

We could also get the same result by fitting a ridge regression model with a very large value of λ. Note that 1e10 means \(10^{10}\):

ridge_pred = predict(ridge_mod, s = 1e10, newx = x_testm)
mean((ridge_pred - y_testm)^2)
## [1] 240.9262

So fitting a ridge regression model with λ=4 leads to a much lower test MSE than fitting a model with just an intercept. We now check whether there is any benefit to performing ridge regression with λ=4 instead of just performing least squares regression. Recall that least squares is simply ridge regression with λ=0. In order for glmnet() to yield the exact least squares coefficients when λ=0, we use the argument exact = TRUE when calling the predict() function; otherwise, predict() interpolates over the grid of λ values used in fitting the glmnet() model, yielding approximate results.
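A minimal sketch of that comparison, using the training objects created above (with exact = TRUE the original data must be re-supplied via the x and y arguments):

ridge_pred0 = predict(ridge_mod, s = 0, newx = x_testm, exact = TRUE,
                      x = x_trainm, y = y_trainm) # lambda = 0, i.e. the least squares fit
mean((ridge_pred0 - y_testm)^2) # test MSE of least squares, to compare with the lambda = 4 value above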

Instead of arbitrarily choosing λ=4, it would be better to use cross-validation to choose the tuning parameter λ. We can do this using the built-in cross-validation function, cv.glmnet(). By default, the function performs 10-fold cross-validation, though this can be changed using the argument nfolds. Note that we set a random seed first so our results will be reproducible, since the choice of the cross-validation folds is random.

set.seed(1)
cv.out = cv.glmnet(x_trainm, y_trainm, alpha = 0) # Fit ridge regression model on training data
bestlam = cv.out$lambda.min  # Select lambda that minimizes the cross-validated MSE
bestlam
## [1] 1.60137

Therefore, we see that the value of λ that results in the smallest cross-validation error is 1.60137. We can also plot the MSE as a function of λ:

plot(cv.out) # Draw plot of cross-validated MSE as a function of lambda

What is the test MSE associated with this value of λ?

ridge_pred = predict(ridge_mod, s = bestlam, newx = x_testm) # Use best lambda to predict test data
mean((ridge_pred - y_testm)^2) # Calculate test MSE
## [1] 21.54921

Final model

out = glmnet(L, M, alpha = 0) # Fit ridge regression model on full dataset
predict(out, type = "coefficients", s = bestlam)[1:40,] # Display the first 40 of the 42 coefficients at the lambda chosen by CV
##            (Intercept)                   woil               BROAD_MO 
##          -2.264407e+01          -1.028162e-01          -7.102219e-07 
##               CLOTHING   Electrical_Materials             EXPIDITURE 
##          -2.856806e-07           1.910505e-08          -2.352161e-06 
##             FERTILIZER             FOOD_PRICE              FOOD_INFL 
##           6.077341e-08           1.635610e-08           1.181159e+00 
##                 GRAIN_   GD_Fixed_.Investment          GG_Cosumption 
##           2.256003e-08           1.660333e-01          -1.195127e-01 
##     GNational._Savings GrPrivate._Consumption             INVESTMENT 
##          -7.985189e-07           2.093985e-01          -6.339477e-07 
##                Lending              MACHINERY                    med 
##           7.057966e-02          -2.114805e-08          -1.111985e-07 
##                 NARROW     Netdemand_depoisit     Net_Foreign_Assets 
##          -2.883593e-06          -3.556549e-06          -5.624649e-07 
##              NON_FOOD_       Others._imported         Overall_Budget 
##           3.613149e-02          -3.900531e-09           1.172309e-05 
##               PETROLEU              PRIVATE_C              REAL_GDP_ 
##           4.361428e-06          -1.458747e-07          -1.250721e-04 
##          Reserve_Requi                REVENUE                 RoadMV 
##          -6.170820e-06          -1.804967e-06          -3.288400e-08 
##              TDemanded                  TSold               TELECOMM 
##           1.083291e-07           9.071405e-08           5.416657e-07 
##           Unemployment          Forein_direct               rainfall 
##           2.852438e+00           2.972515e-01          -9.246616e-03 
##           Global._Food              Real._GDP           Nominal._GDP 
##           6.254666e-02          -1.441004e-04          -8.874276e-04 
##        Minimum_Deposit 
##           9.594236e-01
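To display all 42 coefficients (the indexing above drops the last two, Exchange_Rate and Capital_.account), one could instead use:

coef(out, s = bestlam) # full coefficient vector at the CV-chosen lambda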

The Lasso

We saw that ridge regression with a wise choice of λ can outperform least squares as well as the null model on this data set. We now ask whether the lasso can yield either a more accurate or a more interpretable model than ridge regression. In order to fit a lasso model, we once again use the glmnet() function; however, this time we use the argument alpha = 1. Other than that change, we proceed just as we did in fitting a ridge model:

lasso_mod = glmnet(x_trainm, 
                   y_trainm, 
                   alpha = 1, 
                   lambda = grid) # Fit lasso model on training data

plot(lasso_mod)    # Draw plot of coefficients
## Warning in regularize.values(x, y, ties, missing(ties), na.rm = na.rm):
## collapsing to unique 'x' values

Notice in the coefficient plot that, depending on the choice of tuning parameter, some of the coefficients are exactly equal to zero. We now perform cross-validation and compute the associated test error:

set.seed(1)
cv.out = cv.glmnet(x_trainm, y_trainm, alpha = 1) # Fit lasso model on training data
plot(cv.out) # Draw plot of cross-validated MSE as a function of lambda

bestlam = cv.out$lambda.min # Select lambda that minimizes the cross-validated MSE
bestlam 
## [1] 0.01493443
lasso_pred = predict(lasso_mod, s = bestlam, newx = x_testm) # Use best lambda to predict test data
mean((lasso_pred - y_testm)^2) # Calculate test MSE
## [1] 26.33755

This is substantially lower than the test set MSE of the null model and of least squares, and very similar to the test MSE of ridge regression with λ chosen by cross-validation (26.34 versus 21.55).
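For reference, a short sketch that collects the three test MSEs computed above in one place (it relies on ridge_pred and lasso_pred still holding the predictions made with the CV-chosen λ values):

c(null  = mean((mean(y_trainm) - y_testm)^2), # 240.93: intercept-only model
  ridge = mean((ridge_pred - y_testm)^2),     # 21.55: ridge with CV-chosen lambda
  lasso = mean((lasso_pred - y_testm)^2))     # 26.34: lasso with CV-chosen lambda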

However, the lasso has a substantial advantage over ridge regression in that the resulting coefficient estimates are sparse. Here we see that 10 of the 41 coefficient estimates are exactly zero:

out = glmnet(L, M, alpha = 1, lambda = grid) # Fit lasso model on full dataset
lasso_coef = predict(out, type = "coefficients", s = bestlam)[1:42,] # Display coefficients using lambda chosen by CV
lasso_coef
##            (Intercept)                   woil               BROAD_MO 
##          -1.377887e+02          -3.593233e-02           0.000000e+00 
##               CLOTHING   Electrical_Materials             EXPIDITURE 
##          -2.600709e-07          -1.409352e-07          -2.339369e-07 
##             FERTILIZER             FOOD_PRICE              FOOD_INFL 
##          -7.425321e-07           2.124929e-07           1.598220e+00 
##                 GRAIN_   GD_Fixed_.Investment          GG_Cosumption 
##          -6.434675e-08           1.202268e+00          -7.021282e-01 
##     GNational._Savings GrPrivate._Consumption             INVESTMENT 
##           0.000000e+00           1.276687e+00           0.000000e+00 
##                Lending              MACHINERY                    med 
##           4.487659e-01          -7.990817e-09          -1.355381e-06 
##                 NARROW     Netdemand_depoisit     Net_Foreign_Assets 
##           0.000000e+00           0.000000e+00          -7.138657e-06 
##              NON_FOOD_       Others._imported         Overall_Budget 
##          -5.306092e-01          -2.839843e-09           3.699823e-05 
##               PETROLEU              PRIVATE_C              REAL_GDP_ 
##           3.230244e-05           0.000000e+00          -3.055080e-04 
##          Reserve_Requi                REVENUE                 RoadMV 
##           0.000000e+00           0.000000e+00          -1.097396e-07 
##              TDemanded                  TSold               TELECOMM 
##           0.000000e+00           0.000000e+00           1.096150e-06 
##           Unemployment          Forein_direct               rainfall 
##           5.814603e+00           4.236062e-01          -4.368686e-04 
##           Global._Food              Real._GDP           Nominal._GDP 
##           5.101704e-02           3.044873e-05          -7.178937e-03 
##        Minimum_Deposit          Exchange_Rate       Capital_.account 
##           3.646081e+00          -2.012662e-01           4.572987e-04

Selecting only the predictors with non-zero coefficients, we see that the lasso model with λ chosen by cross-validation contains only 31 variables:

lasso_coef[lasso_coef != 0] # Display only non-zero coefficients
##            (Intercept)                   woil               CLOTHING 
##          -1.377887e+02          -3.593233e-02          -2.600709e-07 
##   Electrical_Materials             EXPIDITURE             FERTILIZER 
##          -1.409352e-07          -2.339369e-07          -7.425321e-07 
##             FOOD_PRICE              FOOD_INFL                 GRAIN_ 
##           2.124929e-07           1.598220e+00          -6.434675e-08 
##   GD_Fixed_.Investment          GG_Cosumption GrPrivate._Consumption 
##           1.202268e+00          -7.021282e-01           1.276687e+00 
##                Lending              MACHINERY                    med 
##           4.487659e-01          -7.990817e-09          -1.355381e-06 
##     Net_Foreign_Assets              NON_FOOD_       Others._imported 
##          -7.138657e-06          -5.306092e-01          -2.839843e-09 
##         Overall_Budget               PETROLEU              REAL_GDP_ 
##           3.699823e-05           3.230244e-05          -3.055080e-04 
##                 RoadMV               TELECOMM           Unemployment 
##          -1.097396e-07           1.096150e-06           5.814603e+00 
##          Forein_direct               rainfall           Global._Food 
##           4.236062e-01          -4.368686e-04           5.101704e-02 
##              Real._GDP           Nominal._GDP        Minimum_Deposit 
##           3.044873e-05          -7.178937e-03           3.646081e+00 
##          Exchange_Rate       Capital_.account 
##          -2.012662e-01           4.572987e-04
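As a quick check on those counts, the zero and non-zero coefficients can be tallied directly from the lasso_coef vector above:

sum(lasso_coef[-1] == 0) # 10 predictors dropped by the lasso
sum(lasso_coef[-1] != 0) # 31 predictors retained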