The aims of this lab are:

To build ridge and lasso regression models using cross-validation

To observe how lambda, the regularization coefficient, affects the error

To observe the relationship between lambda and the attribute coefficients in each of the models

To compare the performance of the two models

We use the Boston Housing dataset to build a model that predicts house prices.

Libraries Needed
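The packages below are assumed; the list is inferred from the functions used later in this lab (glmnet/cv.glmnet, the BostonHousing data from mlbench, and the %>%/tidy()/ggplot() pipeline used for the coefficient plot).

library(mlbench)   # BostonHousing dataset
library(glmnet)    # ridge and lasso regression: glmnet(), cv.glmnet()
library(dplyr)     # %>% pipe
library(broom)     # tidy() on the coefficient matrix
library(ggplot2)   # coefficient plot
library(tibble)    # view()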

Data - load data from mlbench

data("BostonHousing")

The Boston Housing dataset has 506 observations and 14 variables.

data <- BostonHousing

The dataset is stored in the variable “data”.

str(data)
## 'data.frame':    506 obs. of  14 variables:
##  $ crim   : num  0.00632 0.02731 0.02729 0.03237 0.06905 ...
##  $ zn     : num  18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
##  $ indus  : num  2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
##  $ chas   : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ nox    : num  0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
##  $ rm     : num  6.58 6.42 7.18 7 7.15 ...
##  $ age    : num  65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
##  $ dis    : num  4.09 4.97 4.97 6.06 6.06 ...
##  $ rad    : num  1 2 2 3 3 3 5 5 5 5 ...
##  $ tax    : num  296 242 242 222 222 222 311 311 311 311 ...
##  $ ptratio: num  15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
##  $ b      : num  397 397 393 395 397 ...
##  $ lstat  : num  4.98 9.14 4.03 2.94 5.33 ...
##  $ medv   : num  24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...

All variables are numeric except “chas”, which is a factor.

Data Partition

set.seed(222)
view(data)
ind <- sample(2, nrow(data), replace = T, prob = c(0.7, 0.3)) # assign each row to group 1 (~70%) or group 2 (~30%)
train_data <- data[ind==1,]
test_data <- data[ind==2,]

train_data now has 353 observations and test_data has 153 observations; both retain all 14 variables.

view(test_data)
view(train_data)
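A quick non-interactive check of the partition sizes (a small sketch; the expected counts come from the note above):

dim(train_data)   # expected: 353 rows, 14 columns
dim(test_data)    # expected: 153 rows, 14 columns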

Converting the training data frame into a matrix, since glmnet expects a numeric matrix of predictors.

x <- data.matrix(train_data[,1:13])  # predictor matrix: all columns except medv
y <- train_data$medv                 # response: median home value (medv)

Ridge regression: the glmnet parameter alpha = 0 selects ridge regression.

For numeric prediction choose family = "gaussian"; for classification choose family = "binomial".

By default, glmnet chooses 100 lambda values that are data dependent.

l_ridge <- glmnet(x, y, family="gaussian", alpha=0)
plot(l_ridge, xvar = 'lambda', label=T)

From the plot, note that the coefficients shrink toward zero as lambda increases. However, all 13 attributes remain in the model; none of them are dropped.

To find the best value for lambda, we use the built-in cross-validation of cv.glmnet.

cv_out_ridge = cv.glmnet(x, y, alpha =0)
plot (cv_out_ridge)

names(cv_out_ridge)
##  [1] "lambda"     "cvm"        "cvsd"       "cvup"       "cvlo"      
##  [6] "nzero"      "call"       "name"       "glmnet.fit" "lambda.min"
## [11] "lambda.1se"

Two lambda values may be noted: ‘lambda.min’, the lambda giving the minimum cross-validated error, and ‘lambda.1se’, the largest lambda whose error is within one standard error of that minimum.

lambda_min <- cv_out_ridge$lambda.min
lambda_min
## [1] 0.6609873
lambda_1se<- cv_out_ridge$lambda.1se
lambda_1se
## [1] 4.663135

Plotting the ridge regression output once again, this time marking lambda.min (blue) and lambda.1se (red) with dashed vertical lines:

plot(l_ridge, xvar = 'lambda', label=T)
abline(v = log(cv_out_ridge$lambda.1se), col = "red", lty = "dashed")
abline(v = log(cv_out_ridge$lambda.min), col = "blue", lty = "dashed")

Set lambda to one of these values (here lambda.1se) and build the final ridge model.

l_ridge_final <- glmnet(x, y, family="gaussian", lambda = lambda_1se, alpha=0)
coef(l_ridge_final)
## 14 x 1 sparse Matrix of class "dgCMatrix"
##                       s0
## (Intercept) 16.932104599
## crim        -0.088883221
## zn           0.021173911
## indus       -0.076004871
## chas         1.597809301
## nox         -3.808180660
## rm           3.864271100
## age         -0.016786220
## dis         -0.413660680
## rad          0.003558169
## tax         -0.003711038
## ptratio     -0.658480768
## b            0.006195020
## lstat       -0.270391027
plot(coef(l_ridge_final))

An alternate plot of the coefficients:

coef(l_ridge_final) %>%
  tidy() %>%
  filter(row != "(Intercept)") %>%
  top_n(25, wt = abs(value)) %>%
  ggplot(aes(value, reorder(row, value))) +
  geom_point() +
  ggtitle("Top 25 influential variables") +
  xlab("Coefficient") +
  ylab(NULL)
## Warning: 'tidy.dgCMatrix' is deprecated.
## See help("Deprecated")
## Warning: 'tidy.dgTMatrix' is deprecated.
## See help("Deprecated")

Prediction on the training set

p1 <- predict(l_ridge_final, x) # remember to use the matrix x, not the data frame train_data
rmse_l_ridge_final <- sqrt(mean((train_data$medv-p1)^2))
rmse_l_ridge_final
## [1] 4.590626
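The test partition created earlier can be evaluated the same way; a minimal sketch, assuming the objects built above are still in the workspace (output not shown here):

x_test <- data.matrix(test_data[,1:13])   # test predictors as a matrix, same columns as x
p1_test <- predict(l_ridge_final, x_test) # ridge predictions on the test set
rmse_ridge_test <- sqrt(mean((test_data$medv - p1_test)^2))
rmse_ridge_test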

Repeat the same steps with lasso regression, setting alpha = 1.

cv_out_lasso = cv.glmnet(x, y, alpha = 1)
plot (cv_out_lasso)

names(cv_out_lasso)
##  [1] "lambda"     "cvm"        "cvsd"       "cvup"       "cvlo"      
##  [6] "nzero"      "call"       "name"       "glmnet.fit" "lambda.min"
## [11] "lambda.1se"
# two lambda values may be noted: 'lambda.min' and 'lambda.1se' (error within one standard error of the minimum)
lambda_min <- cv_out_lasso$lambda.min
lambda_min
## [1] 0.02267497
lambda_1se<- cv_out_lasso$lambda.1se
lambda_1se
## [1] 0.3367161

Plotting the lasso regression output once again, again marking lambda.min (blue) and lambda.1se (red) with dashed vertical lines:

l_lasso <- glmnet(x, y, family="gaussian", alpha=1)
plot(l_lasso, xvar = 'lambda', label=T)
abline(v = log(cv_out_lasso$lambda.1se), col = "red", lty = "dashed")
abline(v = log(cv_out_lasso$lambda.min), col = "blue", lty = "dashed")

Set lambda to one of these values and build the model

l_lasso_final <- glmnet(x, y, family="gaussian", lambda = lambda_1se, alpha=1)
coef(l_lasso_final)
## 14 x 1 sparse Matrix of class "dgCMatrix"
##                       s0
## (Intercept)  9.317372353
## crim        -0.083088453
## zn           .          
## indus        .          
## chas         0.734904190
## nox         -2.571132768
## rm           5.374982341
## age         -0.007170991
## dis         -0.232202852
## rad          .          
## tax         -0.002333689
## ptratio     -0.813096482
## b            0.005645356
## lstat       -0.356054885
plot(coef(l_lasso_final))
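Note from the coefficient output above that, unlike ridge, the lasso drives the coefficients of zn, indus, and rad exactly to zero (shown as “.”), so those attributes are dropped from the model. To finish the comparison of the two models called for in the aims, a sketch of the remaining evaluation steps, assuming the objects built above (output not shown here):

# Training-set RMSE for the final lasso model, mirroring the ridge steps
p2 <- predict(l_lasso_final, x)
rmse_l_lasso_final <- sqrt(mean((train_data$medv - p2)^2))
rmse_l_lasso_final

# Test-set RMSE for both final models, to compare their performance
x_test <- data.matrix(test_data[,1:13])   # as in the ridge test-set sketch above
rmse_ridge_test <- sqrt(mean((test_data$medv - predict(l_ridge_final, x_test))^2))
rmse_lasso_test <- sqrt(mean((test_data$medv - predict(l_lasso_final, x_test))^2))
c(ridge = rmse_ridge_test, lasso = rmse_lasso_test)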