Load the necessary packages

library(ggplot2)
library(rsample)    # stratified sampling
library(reshape2)   # data wrangling
library(knitr)      # tables via kable
library(car)        # variance inflation factor
library(glmnet)     # regularized linear regression
library(glmnetUtils) # formula interface and alpha tuning for glmnet
library(gridExtra)  # arranging multiple plots
library(dplyr)      # data wrangling

Import the data and display statistical summaries of each variable.
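
The import chunk is not echoed here; a minimal sketch, assuming the semicolon-separated winequality-white.csv file from the UCI Machine Learning Repository:

# file name assumed; the UCI white wine file uses ";" as its separator
data=read.csv("winequality-white.csv",sep=";")
str(data)
summary(data)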

## 'data.frame':    4898 obs. of  12 variables:
##  $ fixed.acidity       : num  7 6.3 8.1 7.2 7.2 8.1 6.2 7 6.3 8.1 ...
##  $ volatile.acidity    : num  0.27 0.3 0.28 0.23 0.23 0.28 0.32 0.27 0.3 0.22 ...
##  $ citric.acid         : num  0.36 0.34 0.4 0.32 0.32 0.4 0.16 0.36 0.34 0.43 ...
##  $ residual.sugar      : num  20.7 1.6 6.9 8.5 8.5 6.9 7 20.7 1.6 1.5 ...
##  $ chlorides           : num  0.045 0.049 0.05 0.058 0.058 0.05 0.045 0.045 0.049 0.044 ...
##  $ free.sulfur.dioxide : num  45 14 30 47 47 30 30 45 14 28 ...
##  $ total.sulfur.dioxide: num  170 132 97 186 186 97 136 170 132 129 ...
##  $ density             : num  1.001 0.994 0.995 0.996 0.996 ...
##  $ pH                  : num  3 3.3 3.26 3.19 3.19 3.26 3.18 3 3.3 3.22 ...
##  $ sulphates           : num  0.45 0.49 0.44 0.4 0.4 0.44 0.47 0.45 0.49 0.45 ...
##  $ alcohol             : num  8.8 9.5 10.1 9.9 9.9 10.1 9.6 8.8 9.5 11 ...
##  $ quality             : int  6 6 6 6 6 6 6 6 6 6 ...
##  fixed.acidity    volatile.acidity  citric.acid     residual.sugar  
##  Min.   : 3.800   Min.   :0.0800   Min.   :0.0000   Min.   : 0.600  
##  1st Qu.: 6.300   1st Qu.:0.2100   1st Qu.:0.2700   1st Qu.: 1.700  
##  Median : 6.800   Median :0.2600   Median :0.3200   Median : 5.200  
##  Mean   : 6.855   Mean   :0.2782   Mean   :0.3342   Mean   : 6.391  
##  3rd Qu.: 7.300   3rd Qu.:0.3200   3rd Qu.:0.3900   3rd Qu.: 9.900  
##  Max.   :14.200   Max.   :1.1000   Max.   :1.6600   Max.   :65.800  
##    chlorides       free.sulfur.dioxide total.sulfur.dioxide    density      
##  Min.   :0.00900   Min.   :  2.00      Min.   :  9.0        Min.   :0.9871  
##  1st Qu.:0.03600   1st Qu.: 23.00      1st Qu.:108.0        1st Qu.:0.9917  
##  Median :0.04300   Median : 34.00      Median :134.0        Median :0.9937  
##  Mean   :0.04577   Mean   : 35.31      Mean   :138.4        Mean   :0.9940  
##  3rd Qu.:0.05000   3rd Qu.: 46.00      3rd Qu.:167.0        3rd Qu.:0.9961  
##  Max.   :0.34600   Max.   :289.00      Max.   :440.0        Max.   :1.0390  
##        pH          sulphates         alcohol         quality     
##  Min.   :2.720   Min.   :0.2200   Min.   : 8.00   Min.   :3.000  
##  1st Qu.:3.090   1st Qu.:0.4100   1st Qu.: 9.50   1st Qu.:5.000  
##  Median :3.180   Median :0.4700   Median :10.40   Median :6.000  
##  Mean   :3.188   Mean   :0.4898   Mean   :10.51   Mean   :5.878  
##  3rd Qu.:3.280   3rd Qu.:0.5500   3rd Qu.:11.40   3rd Qu.:6.000  
##  Max.   :3.820   Max.   :1.0800   Max.   :14.20   Max.   :9.000

Distribution of white wine quality

# distribution of the response variable, wine quality (integer scores 3-9)
tabulate(as.factor(data$quality))
## [1]   20  163 1457 2198  880  175    5
ggplot(data=data,aes(x=quality)) +
  geom_bar(fill="steelblue") + theme_classic() +
  ggtitle('distribution of response variable (quality of white wine)')

Split the data by stratified sampling

set.seed(2)
split=initial_split(data,prop=0.8,strata = "quality")
train=training(split)
test=testing(split)

ggplot(data=train,aes(x=quality)) +
  geom_bar(fill="steelblue") + theme_classic() +
  ggtitle('distribution of white wine quality in training dataset')
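
As a quick check that the stratification preserved the class balance, compare the quality proportions in the full data and in the training split (a sketch; assumes every quality level survives the split):

# the two rows should be nearly identical under stratified sampling
round(rbind(full=prop.table(table(data$quality)),
            train=prop.table(table(train$quality))),3)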

Correlation matrix of predictors

vars=setdiff(colnames(train),'quality')
corr=cor(train[,vars])
corr[upper.tri(corr)]=NA
melted_cormat=melt(corr,na.rm = TRUE)
# heatmap
ggplot(data=melted_cormat,aes(Var1,Var2,fill=value)) +
  geom_tile(color='white') +
  scale_fill_gradient2(low = "blue", high="red",mid="white",midpoint = 0,
                       limit=c(-1,1),space = "Lab",name="Correlation") +
  theme_minimal()+
  theme(axis.text.x = element_text(angle = 45,vjust = 1,size = 12,hjust = 1)) +
  coord_fixed() +
  geom_text(aes(Var1,Var2,label=round(value,2)),color="black",size=3) +
  xlab("") + ylab("")

Model training and evaluation

Linear regression

model_lm=lm(quality~.,data = train)
summary(model_lm)
## 
## Call:
## lm(formula = quality ~ ., data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.6916 -0.4868 -0.0338  0.4610  3.1230 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           1.470e+02  2.024e+01   7.263 4.56e-13 ***
## fixed.acidity         5.627e-02  2.278e-02   2.471 0.013527 *  
## volatile.acidity     -1.853e+00  1.273e-01 -14.560  < 2e-16 ***
## citric.acid           6.185e-02  1.081e-01   0.572 0.567123    
## residual.sugar        8.123e-02  8.218e-03   9.884  < 2e-16 ***
## chlorides            -1.349e-01  6.543e-01  -0.206 0.836687    
## free.sulfur.dioxide   3.249e-03  9.417e-04   3.450 0.000567 ***
## total.sulfur.dioxide -3.379e-04  4.218e-04  -0.801 0.423189    
## density              -1.469e+02  2.054e+01  -7.154 1.00e-12 ***
## pH                    6.477e-01  1.161e-01   5.577 2.61e-08 ***
## sulphates             6.724e-01  1.136e-01   5.921 3.47e-09 ***
## alcohol               1.948e-01  2.610e-02   7.462 1.05e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7534 on 3906 degrees of freedom
## Multiple R-squared:  0.2777, Adjusted R-squared:  0.2756 
## F-statistic: 136.5 on 11 and 3906 DF,  p-value: < 2.2e-16
# variance inflation factor
kable(data.frame(vif=vif(model_lm)))
                            vif
fixed.acidity          2.603146
volatile.acidity       1.133923
citric.acid            1.163265
residual.sugar        11.828014
chlorides              1.248815
free.sulfur.dioxide    1.781740
total.sulfur.dioxide   2.220937
density               26.063623
pH                     2.150540
sulphates              1.151245
alcohol                7.090508
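
To interpret these values: VIF_j = 1/(1 - R_j^2), where R_j^2 comes from regressing predictor j on the remaining predictors. A quick check for density (a sketch):

# regress density on the other predictors (quality excluded) and recover its VIF
r2_density=summary(lm(density~.,data=train[,vars]))$r.squared
1/(1-r2_density)   # roughly 26, matching the table above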

The variable ‘density’ has by far the largest VIF (about 26, well above the common rule-of-thumb threshold of 10), so drop it and refit.

drops='density'
model_lm=lm(quality~.,data=train[,!(colnames(train) %in% drops)])
summary(model_lm)
## 
## Call:
## lm(formula = quality ~ ., data = train[, !(colnames(train) %in% 
##     drops)])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.7646 -0.4875 -0.0339  0.4644  3.1861 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           2.2230063  0.3894104   5.709 1.22e-08 ***
## fixed.acidity        -0.0560073  0.0166107  -3.372 0.000754 ***
## volatile.acidity     -1.9587062  0.1272360 -15.394  < 2e-16 ***
## citric.acid           0.0076788  0.1084914   0.071 0.943578    
## residual.sugar        0.0260537  0.0028580   9.116  < 2e-16 ***
## chlorides            -0.9452015  0.6485185  -1.457 0.145065    
## free.sulfur.dioxide   0.0042711  0.0009368   4.559 5.29e-06 ***
## total.sulfur.dioxide -0.0009131  0.0004167  -2.191 0.028498 *  
## pH                    0.1378271  0.0922836   1.494 0.135383    
## sulphates             0.4525128  0.1100200   4.113 3.99e-05 ***
## alcohol               0.3583921  0.0126642  28.300  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7582 on 3907 degrees of freedom
## Multiple R-squared:  0.2682, Adjusted R-squared:  0.2663 
## F-statistic: 143.2 on 10 and 3907 DF,  p-value: < 2.2e-16
kable(data.frame(vif=vif(model_lm)))
                           vif
fixed.acidity         1.367079
volatile.acidity      1.118704
citric.acid           1.157553
residual.sugar        1.412293
chlorides             1.211388
free.sulfur.dioxide   1.740699
total.sulfur.dioxide  2.140232
pH                    1.340717
sulphates             1.066900
alcohol               1.647809

Ridge regression

# Train the ridge regression model.
# First scale the predictors so the penalty treats them comparably.
# Note: with center = FALSE, scale() divides each column by its root mean
# square rather than its standard deviation.
X_stand=scale(train[,vars],center = FALSE, scale = TRUE)
train_stand=data.frame(X_stand,quality=train$quality)
# reuse the training scale factors on the test set to avoid information leakage
X1_stand=scale(test[,vars],center = FALSE, scale = attributes(X_stand)$'scaled:scale')
test_stand=data.frame(X1_stand,quality=test$quality)
# Ridge regression model
model_ridge=cv.glmnet(quality~.,data=train_stand,alpha = 0)
# visualize coefficients
coef_ridge=coef(model_ridge)
df_coef_ridge=data.frame(coef_name=rownames(coef_ridge)[-1], coef=coef_ridge[-1,1])
ggplot(data = df_coef_ridge,aes(x=coef_name,y=coef)) +
  geom_bar(stat = 'identity', width=0.5) +
  coord_flip()
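
Note that coef() on a cv.glmnet object returns the coefficients at lambda.1se by default; the fit at the error-minimizing lambda can be inspected as well:

# coefficients at the lambda that minimizes the cross-validation error
coef(model_ridge,s="lambda.min")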

Visualize the cross-validation error with respect to lambda

roundoff=function(x) sprintf("%.3f",x)
df_cvm_ridge=data.frame(lambda=model_ridge$lambda,cvm=model_ridge$cvm,
                        cvm_up=model_ridge$cvup,cvm_down=model_ridge$cvlo)
ggplot(data=df_cvm_ridge,aes(x=lambda,y=cvm,ymin=cvm_down,ymax=cvm_up)) +
  geom_line() + geom_point() + geom_errorbar() + scale_x_continuous(trans = "log2",labels = roundoff) +
  geom_vline(xintercept = model_ridge$lambda.min, linetype=2, color="red") +
  geom_vline(xintercept = model_ridge$lambda.1se, linetype=3, color="blue") +
  ggtitle('ridge regression')
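
The red dashed line marks lambda.min (the lambda minimizing the CV error) and the blue dotted line marks lambda.1se, the largest lambda whose error is within one standard error of that minimum:

# the two reference values marked on the plot
c(lambda.min=model_ridge$lambda.min,lambda.1se=model_ridge$lambda.1se)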

Lasso

model_lasso=cv.glmnet(quality~.,data=train_stand,alpha = 1)
# visualize coefficients
coef_lasso=coef(model_lasso)
df_coef_lasso=data.frame(coef_name=rownames(coef_lasso)[-1], coef=coef_lasso[-1,1])
ggplot(data = df_coef_lasso,aes(x=coef_name,y=coef)) +
  geom_bar(stat = 'identity', width=0.5) +
  coord_flip()

Visualize the cross-validation error with respect to lambda, together with the number of nonzero coefficients

df_cvm_lasso=data.frame(lambda=model_lasso$lambda,cvm=model_lasso$cvm,
                        cvm_up=model_lasso$cvup,cvm_down=model_lasso$cvlo)
p1=ggplot(data=df_cvm_lasso,aes(x=lambda,y=cvm,ymin=cvm_down,ymax=cvm_up)) +
  geom_line() + geom_point() + geom_errorbar() + scale_x_continuous(trans = "log2",labels = roundoff) +
  geom_vline(xintercept = model_lasso$lambda.min, linetype=2, color="red") +
  geom_vline(xintercept = model_lasso$lambda.1se, linetype=3, color="blue") +
  ggtitle('Lasso')

df_nzero_lasso=data.frame(lambda=model_lasso$lambda,nzero=model_lasso$nzero)
p2=ggplot(data=df_nzero_lasso,aes(x=lambda,y=nzero)) +
  geom_line(color="steelblue") + scale_x_continuous(trans = "log2",labels = roundoff) + 
  ylab('Number of non-zero coefficients')

grid.arrange(p1,p2,ncol=2)
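
Unlike ridge, the lasso shrinks some coefficients exactly to zero; a quick check of which predictors are dropped at the default lambda.1se:

# predictors whose lasso coefficients are exactly zero at lambda.1se
rownames(coef_lasso)[-1][as.vector(coef_lasso[-1,1])==0]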

Elastic net

elastic_net=cva.glmnet(quality~.,train_stand)

# helper: cross-validation error at lambda.1se for one cv.glmnet fit;
# using the same lambda rule for every alpha keeps the comparison fair
get_cvm=function(model) {
  index <- match(model$lambda.1se, model$lambda)
  model$cvm[index]
}
# data frame that contains alpha and its corresponding cross-validation error
enet_performance=data.frame(alpha=elastic_net$alpha)
models=elastic_net$modlist
enet_performance$cvm=vapply(models,get_cvm,numeric(1))
# get the best alpha and train the model
best_alpha=enet_performance[which.min(enet_performance$cvm),'alpha']
model_enet=cv.glmnet(quality~.,train_stand,alpha = best_alpha)
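
To inspect the alpha grid that cva.glmnet searched and the value it selected (a sketch):

# CV error at lambda.1se for each candidate alpha, plus the chosen alpha
kable(enet_performance)
best_alpha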

# visualize coefficients
coef_enet=coef(model_enet)
df_coef_enet=data.frame(coef_name=rownames(coef_enet)[-1], coef=coef_enet[-1,1])
ggplot(data = df_coef_enet,aes(x=coef_name,y=coef)) +
  geom_bar(stat = 'identity', width=0.5) +
  coord_flip()

df_cvm_enet=data.frame(lambda=model_enet$lambda,cvm=model_enet$cvm,
                        cvm_up=model_enet$cvup,cvm_down=model_enet$cvlo)
ggplot(data=df_cvm_enet,aes(x=lambda,y=cvm,ymin=cvm_down,ymax=cvm_up)) +
  geom_line() + geom_point() + geom_errorbar() + scale_x_continuous(trans = "log2",labels = roundoff) +
  geom_vline(xintercept = model_enet$lambda.min, linetype=2, color="red") +
  geom_vline(xintercept = model_enet$lambda.1se, linetype=3, color="blue") +
  ggtitle('Elastic net')

Grouped bar plot of coefficient estimates for all the regularized linear regression models (ridge, lasso, and elastic net)

df=rbind(df_coef_ridge,df_coef_lasso,df_coef_enet)
L=nrow(df_coef_ridge)
id_vec=c(rep('ridge',L),rep('lasso',L),rep('elastic_net',L))
df=cbind(df,types=id_vec)

ggplot(data=df) +
  geom_bar(aes(x = coef_name, y = coef, fill = types),
           stat = "identity", position = "dodge") +
  scale_y_continuous("Coefficient estimates") +
  scale_x_discrete("Predictors") +
  # remove grey theme
  theme_classic(base_size = 15) +
  # rotate x-axis text and remove superfluous axis elements
  theme(axis.text.x = element_text(angle = 90,hjust = 1, vjust=0),
        axis.title.x=element_text(size=13),
        axis.title.y=element_text(size=15),
        axis.line = element_blank()) 

Performance evaluation

# function to evaluate models' performance
performance=function(truth,pred){
  # root mean squared error
  mse=mean((truth-pred)^2)
  rmse=sqrt(mse)
  # mean absolute error
  mae=mean(abs(truth-pred))
  # coefficient of determination R2
  denom=sum((truth-mean(truth))^2)
  numer=sum((truth-pred)^2)
  R_squared=1-numer/denom
  c(rmse,mae,R_squared)
}
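
A quick sanity check of the helper: a perfect prediction gives RMSE = MAE = 0 and R2 = 1, while predicting the mean of the truth gives R2 = 0:

performance(c(4,5,6),c(4,5,6))   # 0.000 0.000 1.000
performance(c(4,5,6),rep(5,3))   # RMSE ~0.816, MAE ~0.667, R2 = 0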

perf_lm=performance(test$quality,predict(model_lm,newdata = test))
perf_ridge=performance(test_stand$quality,predict(model_ridge,newdata = test_stand))
perf_lasso=performance(test_stand$quality,predict(model_lasso,newdata = test_stand))
perf_enet=performance(test_stand$quality,predict(model_enet,newdata = test_stand))
df_perf=data.frame(rbind(perf_lm,perf_ridge,perf_lasso,perf_enet))
colnames(df_perf)=c("RMSE","MAE","R2")
rownames(df_perf)=c("linear regression","ridge regression","lasso","elastic net")
kable(df_perf)
                        RMSE       MAE        R2
linear regression  0.7476650 0.5862890 0.2897569
ridge regression   0.7587427 0.5954561 0.2685544
lasso              0.7559080 0.5967849 0.2740096
elastic net        0.7565655 0.5968977 0.2727460

Repeated k-fold cross-validation (CV)

Repeated 5-fold CV (20 iterations)

set.seed(321)
k=5
data=mutate(data,folds=sample(1:k,size = nrow(data),replace=TRUE))

# function
kfold_cv=function(kfold,data,method="lm"){
  train=subset(data,folds!=kfold)
  validate=subset(data,folds==kfold)
  train_1=scale(train[,vars],center = FALSE, scale=TRUE)
  train=data.frame(train_1,quality=train$quality)
  validate_1=scale(validate[,vars],center = FALSE, scale = attributes(train_1)$'scaled:scale')
  validate=data.frame(validate_1,quality=validate$quality)
  if (method=="lm") {
    model=lm(quality~.,data = train[,!(colnames(train) %in% drops)])
  }
  if (method=="ridge"){
    model=cv.glmnet(quality~.,train,alpha=0)
  }
  if (method=="lasso"){
    model=cv.glmnet(quality~.,train,alpha=1)
  }
  if (method=="enet"){
    elastic_net=cva.glmnet(quality~.,train)
    # data frame that contains alpha and its corresponding cross-validation error
    enet_performance=data.frame(alpha=elastic_net$alpha)
    models=elastic_net$modlist
    enet_performance$cvm=vapply(models,get_cvm,numeric(1))
    # get the best alpha and train the model
    best_alpha=enet_performance[which.min(enet_performance$cvm),'alpha']
    model=cv.glmnet(quality~.,train,alpha = best_alpha)
  }
  if (!(method %in% c("lm","ridge","lasso","enet"))) {
    stop('Unknown method')
  }
  pred=predict(model,newdata=validate)
  performance(validate$quality,pred)
}

n_repeat=20
a=numeric(0)

for (i in 1:n_repeat){
  set.seed(i+100)
  cv.data=mutate(data,folds=sample(1:k,size = nrow(data),replace=TRUE))
  cv.perf=sapply(1:k,FUN = kfold_cv,data=cv.data,method="lm")
  a=cbind(a,cv.perf)
}
rownames(a)=c("RMSE","MAE","R2")
(mean_perf=apply(a,1,mean))
##      RMSE       MAE        R2 
## 0.7572172 0.5880354 0.2680121
# half-width of a 95% t-based confidence interval for each metric
tcrit=qt(0.025,df=ncol(a)-1,lower.tail = FALSE)
sd_perf=apply(a, 1, sd)
(ci=tcrit*sd_perf/sqrt(ncol(a)))
##        RMSE         MAE          R2 
## 0.004342151 0.003056763 0.005050711
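
The same protocol extends to the regularized models by changing the method argument; a sketch for ridge (much slower, since cv.glmnet is refit inside every fold):

# one repeat of 5-fold CV for ridge; "lasso" and "enet" work the same way
cv.perf_ridge=sapply(1:k,FUN=kfold_cv,data=cv.data,method="ridge")
apply(cv.perf_ridge,1,mean)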

Reference:

P. Cortez, A. Cerdeira, F. Almeida, T. Matos and J. Reis. Modeling wine preferences by data mining from physicochemical properties. Decision Support Systems, Elsevier, 47(4):547-553, 2009. ISSN: 0167-9236.