\[RSS(\beta) + \lambda \sum_{j=1}^{p} \beta_j^2\]
\[RSS(\beta) + \lambda \sum_{j=1}^{p} |\beta_j|\]
# Clear the global environment so the analysis starts from a clean slate.
# `all.names` is spelled out in full instead of relying on partial matching
# of the abbreviated `all`.
rm(list = ls(all.names = TRUE))
# NOTE(review): machine-specific absolute path; the script only runs where
# this directory exists. Consider a project-relative path instead.
setwd("C://Users//brbhatta//Desktop//INSOFE//My_exercise//8_26May_27May//27May//20180527_Batch42_CSE7302c_Ridge_Lasso_ElasticNet_TuningHyperParams//ridge_lasso")
# Read the wage data. `stringsAsFactors = TRUE` is explicit because the
# default became FALSE in R 4.0.0, while the rest of the script expects
# `sex` and `language` to be factors.
labour_data <- read.csv("labour_income.csv", stringsAsFactors = TRUE)
# Inspect column types and a few sample values.
str(labour_data)
## 'data.frame': 3987 obs. of 5 variables:
## $ wages : num 10.6 11 17.8 14 8.2 ...
## $ education: num 15 13.2 14 16 15 13.5 12 14 18 11 ...
## $ age : int 40 19 46 50 31 30 61 46 43 17 ...
## $ sex : Factor w/ 2 levels "Female","Male": 2 2 2 1 2 1 1 1 2 2 ...
## $ language : Factor w/ 3 levels "English","French",..: 1 1 3 1 1 1 1 3 1 1 ...
# Per-column summary: quantiles for numeric columns, level counts for factors.
summary(labour_data)
## wages education age sex
## Min. : 2.30 Min. : 0.00 Min. :16.0 Female:2001
## 1st Qu.: 9.25 1st Qu.:12.00 1st Qu.:28.0 Male :1986
## Median :14.13 Median :13.00 Median :36.0
## Mean :15.54 Mean :13.34 Mean :37.1
## 3rd Qu.:19.72 3rd Qu.:15.10 3rd Qu.:46.0
## Max. :49.92 Max. :20.00 Max. :69.0
## language
## English:3244
## French : 259
## Other : 484
##
##
##
# Show the first rows of the raw data.
head(labour_data)
## wages education age sex language
## 1 10.56 15.0 40 Male English
## 2 11.00 13.2 19 Male English
## 3 17.76 14.0 46 Male Other
## 4 14.00 16.0 50 Female English
## 5 8.20 15.0 31 Male English
## 6 16.97 13.5 30 Female English
# Show the last rows of the raw data.
tail(labour_data)
## wages education age sex language
## 3982 16.66 8.0 61 Female English
## 3983 6.80 13.1 20 Male English
## 3984 30.49 16.0 52 Male Other
## 3985 22.00 15.0 41 Male Other
## 3986 11.85 11.0 47 Female English
## 3987 23.00 14.0 30 Male English
# Total number of missing cells across the whole data frame.
sum(is.na(labour_data))
## [1] 0
# Fix the RNG seed so the train/test split is reproducible.
set.seed(7)
# Draw 70% of the row indices for training. The size is floored explicitly
# (sample() would otherwise truncate the fractional value silently), and
# seq_len() stays well-defined even for a zero-row data frame.
train_rows <- sample(
  x = seq_len(nrow(labour_data)),
  size = floor(0.7 * nrow(labour_data))
)
train_data <- labour_data[train_rows, ]
# Select the held-out rows positively: negative indexing with an empty
# `train_rows` would return no rows instead of all of them.
test_data <- labour_data[setdiff(seq_len(nrow(labour_data)), train_rows), ]
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
# Learn centering and scaling parameters from the training predictors only;
# the target column "wages" is excluded from the fit.
std_obj <- preProcess(
  x = train_data[, setdiff(colnames(train_data), "wages")],
  method = c("center", "scale")
)
# Apply the training-set transformation to both partitions.
train_std_data <- predict(object = std_obj, newdata = train_data)
test_std_data <- predict(object = std_obj, newdata = test_data)
# Print a summary of the fitted pre-processing object.
std_obj
## Created from 2790 samples and 4 variables
##
## Pre-processing:
## - centered (2)
## - ignored (2)
## - scaled (2)
# Inspect the first rows of the standardised training data.
head(train_std_data)
## wages education age sex language
## 3943 12.38 -0.27202965 -0.41263640 Female English
## 1586 9.97 0.70387715 -0.82111481 Male English
## 462 13.73 1.84243509 0.32262474 Female English
## 278 23.04 0.05327262 1.05788588 Female English
## 971 23.96 0.21592375 -0.08585367 Male English
## 3154 6.58 -0.43468078 -1.47468026 Female English
# Inspect the first rows of the standardised test data.
head(test_std_data)
## wages education age sex language
## 1 10.56 0.5412260 0.2409291 Male English
## 3 17.76 0.2159238 0.7311031 Male Other
## 4 14.00 0.8665283 1.0578859 Female English
## 7 6.70 -0.4346808 1.9565384 Female English
## 11 16.00 -2.0611921 0.7311031 Male Other
## 12 23.00 0.2159238 -0.4126364 Male English
# Build the dummy-variable encoder from the standardised training data; the
# formula `~ .` covers every column.
dummy_obj <- dummyVars(formula = ~ ., data = train_std_data)
# Encode both partitions with the same encoder and convert the results to
# data frames.
train_dummy_data <- as.data.frame(
  predict(object = dummy_obj, newdata = train_std_data)
)
test_dummy_data <- as.data.frame(
  predict(object = dummy_obj, newdata = test_std_data)
)
# Inspect the first rows of the encoded training data.
head(train_dummy_data)
## wages education age sex.Female sex.Male language.English
## 3943 12.38 -0.27202965 -0.41263640 1 0 1
## 1586 9.97 0.70387715 -0.82111481 0 1 1
## 462 13.73 1.84243509 0.32262474 1 0 1
## 278 23.04 0.05327262 1.05788588 1 0 1
## 971 23.96 0.21592375 -0.08585367 0 1 1
## 3154 6.58 -0.43468078 -1.47468026 1 0 1
## language.French language.Other
## 3943 0 0
## 1586 0 0
## 462 0 0
## 278 0 0
## 971 0 0
## 3154 0 0
# Inspect the first rows of the encoded test data.
head(test_dummy_data)
## wages education age sex.Female sex.Male language.English
## 1 10.56 0.5412260 0.2409291 0 1 1
## 3 17.76 0.2159238 0.7311031 0 1 0
## 4 14.00 0.8665283 1.0578859 1 0 1
## 7 6.70 -0.4346808 1.9565384 1 0 1
## 11 16.00 -2.0611921 0.7311031 0 1 0
## 12 23.00 0.2159238 -0.4126364 0 1 1
## language.French language.Other
## 1 0 0
## 3 0 1
## 4 0 0
## 7 0 0
## 11 0 1
## 12 0 0
# The first column of each encoded frame is the target (wages); every other
# column is a predictor. Both parts are converted to matrices for glmnet.
X_train <- as.matrix(train_dummy_data[-1])
y_train <- as.matrix(train_dummy_data[[1]])
X_test <- as.matrix(test_dummy_data[-1])
y_test <- as.matrix(test_dummy_data[[1]])
library(glmnet)
## Loading required package: Matrix
## Loading required package: foreach
## Loaded glmnet 2.0-16
# Cross-validate the lasso (alpha = 1) with 4 folds, scoring each lambda by
# mean squared error.
cv_lasso <- cv.glmnet(
  x = X_train,
  y = y_train,
  alpha = 1,
  type.measure = "mse",
  nfolds = 4
)
# Plot the cross-validation curve.
plot(cv_lasso)
The object returned from the call to the cv.glmnet() function contains the lambda values of importance, such as lambda.min (the value with the lowest cross-validated error)
The coefficients are accessible by calling the coef() function on the cv_lasso object; by default coef() reports them at lambda.1se rather than lambda.min
# Coefficient paths of the underlying glmnet fit, drawn against lambda.
plot(cv_lasso[["glmnet.fit"]], xvar = "lambda", label = TRUE)
# Cross-validation curve again, for comparison with the paths above.
plot(cv_lasso)
# Lambda selected as lambda.min by the cross-validation.
print(cv_lasso[["lambda.min"]])
## [1] 0.07241715
# Coefficients at lambda.1se, which is what coef() on a cv.glmnet object
# returns by default; spelled out here so it is not mistaken for lambda.min.
coef(cv_lasso, s = "lambda.1se")
## 8 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 16.794336
## education 2.285889
## age 2.664529
## sex.Female -2.279586
## sex.Male .
## language.English .
## language.French .
## language.Other .
# Cross-validate ridge regression (alpha = 0) with 4 folds, scoring each
# lambda by mean squared error.
cv_ridge <- cv.glmnet(
  x = X_train,
  y = y_train,
  alpha = 0,
  type.measure = "mse",
  nfolds = 4
)
# Plot the cross-validation curve.
plot(cv_ridge)
# Coefficient paths of the underlying glmnet fit, drawn against lambda.
plot(cv_ridge[["glmnet.fit"]], xvar = "lambda", label = TRUE)
# Lambda selected as lambda.min by the cross-validation.
print(cv_ridge[["lambda.min"]])
## [1] 0.328403
# Coefficients at lambda.1se, which is what coef() on a cv.glmnet object
# returns by default; spelled out here so it is not mistaken for lambda.min.
coef(cv_ridge, s = "lambda.1se")
## 8 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 15.73379156
## education 1.87998809
## age 2.13132411
## sex.Female -1.34452432
## sex.Male 1.34171157
## language.English -0.12408093
## language.French 0.23497933
## language.Other 0.04254616
# Refit the lasso at the single lambda chosen by cross-validation.
# NOTE(review): the glmnet documentation advises against supplying a single
# lambda value and recommends predicting from the cross-validated object
# (s = "lambda.min") instead; consider switching.
lasso_model <- glmnet(
  x = X_train,
  y = y_train,
  alpha = 1,
  lambda = cv_lasso[["lambda.min"]]
)
# Coefficients of the refitted lasso model.
coef(lasso_model)
## 8 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) 17.21523733
## education 2.77341706
## age 3.13672030
## sex.Female -3.14026120
## sex.Male .
## language.English .
## language.French 0.08227626
## language.Other .
# Predict wages for the test matrix with the refitted lasso model.
preds_lasso <- predict(object = lasso_model, newx = X_test)
# Refit ridge regression at the single lambda chosen by cross-validation.
# NOTE(review): the glmnet documentation advises against supplying a single
# lambda value and recommends predicting from the cross-validated object
# (s = "lambda.min") instead; consider switching.
ridge_model <- glmnet(
  x = X_train,
  y = y_train,
  alpha = 0,
  lambda = cv_ridge[["lambda.min"]]
)
# Coefficients of the refitted ridge model.
coef(ridge_model)
## 8 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) 15.76844764
## education 2.73075409
## age 3.07192453
## sex.Female -1.65759407
## sex.Male 1.55801018
## language.English -0.10449380
## language.French 0.26629251
## language.Other -0.01122563
# Predict wages for the test matrix with the refitted ridge model.
preds_ridge <- predict(object = ridge_model, newx = X_test)
library(DMwR)
## Loading required package: grid
# Test-set error metrics for the lasso predictions.
regr.eval(trues = y_test, preds = preds_lasso)
## mae mse rmse mape
## 5.1939127 44.6079517 6.6789185 0.4327586
library(DMwR)
# Test-set error metrics for the ridge predictions.
regr.eval(trues = y_test, preds = preds_ridge)
## mae mse rmse mape
## 5.1943165 44.5931661 6.6778115 0.4330808