Load the necessary packages
library(ggplot2)
library(rsample) # stratified sampling
library(reshape2) # data wrangling
library(knitr)
library(car) # variance inflation factor
library(glmnet) # regularized linear regression
library(glmnetUtils)
library(gridExtra)
library(dplyr) # data wrangling
Import the data and display statistical summaries of each variable.
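(The import step is shown here as a minimal sketch; it assumes the semicolon-delimited winequality-white.csv file from the UCI Machine Learning Repository is in the working directory.)
# hypothetical path; the UCI file uses ';' as the field separator
data=read.csv("winequality-white.csv",sep=";")
str(data)
summary(data)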
## 'data.frame': 4898 obs. of 12 variables:
## $ fixed.acidity : num 7 6.3 8.1 7.2 7.2 8.1 6.2 7 6.3 8.1 ...
## $ volatile.acidity : num 0.27 0.3 0.28 0.23 0.23 0.28 0.32 0.27 0.3 0.22 ...
## $ citric.acid : num 0.36 0.34 0.4 0.32 0.32 0.4 0.16 0.36 0.34 0.43 ...
## $ residual.sugar : num 20.7 1.6 6.9 8.5 8.5 6.9 7 20.7 1.6 1.5 ...
## $ chlorides : num 0.045 0.049 0.05 0.058 0.058 0.05 0.045 0.045 0.049 0.044 ...
## $ free.sulfur.dioxide : num 45 14 30 47 47 30 30 45 14 28 ...
## $ total.sulfur.dioxide: num 170 132 97 186 186 97 136 170 132 129 ...
## $ density : num 1.001 0.994 0.995 0.996 0.996 ...
## $ pH : num 3 3.3 3.26 3.19 3.19 3.26 3.18 3 3.3 3.22 ...
## $ sulphates : num 0.45 0.49 0.44 0.4 0.4 0.44 0.47 0.45 0.49 0.45 ...
## $ alcohol : num 8.8 9.5 10.1 9.9 9.9 10.1 9.6 8.8 9.5 11 ...
## $ quality : int 6 6 6 6 6 6 6 6 6 6 ...
## fixed.acidity volatile.acidity citric.acid residual.sugar
## Min. : 3.800 Min. :0.0800 Min. :0.0000 Min. : 0.600
## 1st Qu.: 6.300 1st Qu.:0.2100 1st Qu.:0.2700 1st Qu.: 1.700
## Median : 6.800 Median :0.2600 Median :0.3200 Median : 5.200
## Mean : 6.855 Mean :0.2782 Mean :0.3342 Mean : 6.391
## 3rd Qu.: 7.300 3rd Qu.:0.3200 3rd Qu.:0.3900 3rd Qu.: 9.900
## Max. :14.200 Max. :1.1000 Max. :1.6600 Max. :65.800
## chlorides free.sulfur.dioxide total.sulfur.dioxide density
## Min. :0.00900 Min. : 2.00 Min. : 9.0 Min. :0.9871
## 1st Qu.:0.03600 1st Qu.: 23.00 1st Qu.:108.0 1st Qu.:0.9917
## Median :0.04300 Median : 34.00 Median :134.0 Median :0.9937
## Mean :0.04577 Mean : 35.31 Mean :138.4 Mean :0.9940
## 3rd Qu.:0.05000 3rd Qu.: 46.00 3rd Qu.:167.0 3rd Qu.:0.9961
## Max. :0.34600 Max. :289.00 Max. :440.0 Max. :1.0390
## pH sulphates alcohol quality
## Min. :2.720 Min. :0.2200 Min. : 8.00 Min. :3.000
## 1st Qu.:3.090 1st Qu.:0.4100 1st Qu.: 9.50 1st Qu.:5.000
## Median :3.180 Median :0.4700 Median :10.40 Median :6.000
## Mean :3.188 Mean :0.4898 Mean :10.51 Mean :5.878
## 3rd Qu.:3.280 3rd Qu.:0.5500 3rd Qu.:11.40 3rd Qu.:6.000
## Max. :3.820 Max. :1.0800 Max. :14.20 Max. :9.000
Distribution of white wine quality
# distribution of quality of wine (response variable)
# the unnamed counts correspond to the factor levels, i.e. quality scores 3 through 9
tabulate(as.factor(data$quality))
## [1] 20 163 1457 2198 880 175 5
ggplot(data=data,aes(x=quality)) +
geom_bar(fill="steelblue") + theme_classic() +
ggtitle('distribution of response variable (quality of white wine)')
Split the data by stratified sampling
set.seed(2)
split=initial_split(data,prop=0.8,strata = "quality")
train=training(split)
test=testing(split)
ggplot(data=train,aes(x=quality)) +
geom_bar(fill="steelblue") + theme_classic() +
ggtitle('distribution of white wine quality in training dataset')
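As a quick check that the stratified split preserved the class balance, compare the class proportions in both splits (a minimal sketch using base R):
# quality proportions should be similar in the training and test sets
prop.table(table(train$quality))
prop.table(table(test$quality))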
Correlation matrix of predictors
vars=setdiff(colnames(train),'quality')
corr=cor(train[,vars])
corr[upper.tri(corr)]=NA
melted_cormat=melt(corr,na.rm = TRUE)
# heatmap
ggplot(data=melted_cormat,aes(Var1,Var2,fill=value)) +
geom_tile(color='white') +
scale_fill_gradient2(low = "blue", high="red",mid="white",midpoint = 0,
limit=c(-1,1),space = "Lab",name="Correlation") +
theme_minimal()+
theme(axis.text.x = element_text(angle = 45,vjust = 1,size = 12,hjust = 1)) +
coord_fixed() +
geom_text(aes(Var1,Var2,label=round(value,2)),color="black",size=3) +
xlab("") + ylab("")
Fit a baseline multiple linear regression model on the training data
model_lm=lm(quality~.,data = train)
summary(model_lm)
##
## Call:
## lm(formula = quality ~ ., data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.6916 -0.4868 -0.0338 0.4610 3.1230
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.470e+02 2.024e+01 7.263 4.56e-13 ***
## fixed.acidity 5.627e-02 2.278e-02 2.471 0.013527 *
## volatile.acidity -1.853e+00 1.273e-01 -14.560 < 2e-16 ***
## citric.acid 6.185e-02 1.081e-01 0.572 0.567123
## residual.sugar 8.123e-02 8.218e-03 9.884 < 2e-16 ***
## chlorides -1.349e-01 6.543e-01 -0.206 0.836687
## free.sulfur.dioxide 3.249e-03 9.417e-04 3.450 0.000567 ***
## total.sulfur.dioxide -3.379e-04 4.218e-04 -0.801 0.423189
## density -1.469e+02 2.054e+01 -7.154 1.00e-12 ***
## pH 6.477e-01 1.161e-01 5.577 2.61e-08 ***
## sulphates 6.724e-01 1.136e-01 5.921 3.47e-09 ***
## alcohol 1.948e-01 2.610e-02 7.462 1.05e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7534 on 3906 degrees of freedom
## Multiple R-squared: 0.2777, Adjusted R-squared: 0.2756
## F-statistic: 136.5 on 11 and 3906 DF, p-value: < 2.2e-16
# variance inflation factor
kable(data.frame(vif=vif(model_lm)))
| | vif |
|---|---|
| fixed.acidity | 2.603146 |
| volatile.acidity | 1.133923 |
| citric.acid | 1.163265 |
| residual.sugar | 11.828014 |
| chlorides | 1.248815 |
| free.sulfur.dioxide | 1.781740 |
| total.sulfur.dioxide | 2.220937 |
| density | 26.063623 |
| pH | 2.150540 |
| sulphates | 1.151245 |
| alcohol | 7.090508 |
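A common rule of thumb flags multicollinearity when VIF exceeds 10; a quick sketch to isolate the offenders (here residual.sugar and density):
# predictors above the usual VIF threshold of 10
vif(model_lm)[vif(model_lm) > 10]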
Drop the variable ‘density’: it has by far the largest VIF (about 26), and refitting without it brings every remaining VIF below the usual threshold of 10.
drops='density'
model_lm=lm(quality~.,data=train[,!(colnames(train) %in% drops)])
summary(model_lm)
##
## Call:
## lm(formula = quality ~ ., data = train[, !(colnames(train) %in%
## drops)])
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.7646 -0.4875 -0.0339 0.4644 3.1861
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.2230063 0.3894104 5.709 1.22e-08 ***
## fixed.acidity -0.0560073 0.0166107 -3.372 0.000754 ***
## volatile.acidity -1.9587062 0.1272360 -15.394 < 2e-16 ***
## citric.acid 0.0076788 0.1084914 0.071 0.943578
## residual.sugar 0.0260537 0.0028580 9.116 < 2e-16 ***
## chlorides -0.9452015 0.6485185 -1.457 0.145065
## free.sulfur.dioxide 0.0042711 0.0009368 4.559 5.29e-06 ***
## total.sulfur.dioxide -0.0009131 0.0004167 -2.191 0.028498 *
## pH 0.1378271 0.0922836 1.494 0.135383
## sulphates 0.4525128 0.1100200 4.113 3.99e-05 ***
## alcohol 0.3583921 0.0126642 28.300 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7582 on 3907 degrees of freedom
## Multiple R-squared: 0.2682, Adjusted R-squared: 0.2663
## F-statistic: 143.2 on 10 and 3907 DF, p-value: < 2.2e-16
kable(data.frame(vif=vif(model_lm)))
| | vif |
|---|---|
| fixed.acidity | 1.367079 |
| volatile.acidity | 1.118704 |
| citric.acid | 1.157553 |
| residual.sugar | 1.412293 |
| chlorides | 1.211388 |
| free.sulfur.dioxide | 1.740699 |
| total.sulfur.dioxide | 2.140232 |
| pH | 1.340717 |
| sulphates | 1.066900 |
| alcohol | 1.647809 |
# Train ridge regression model
# First standardize the predictors (note: with center = FALSE, scale()
# divides each column by its root mean square, not its standard deviation)
X_stand=scale(train[,vars],center = FALSE, scale = TRUE)
train_stand=data.frame(X_stand,quality=train$quality)
# apply the training scale factors to the test set to avoid information leakage
X1_stand=scale(test[,vars],center = FALSE, scale = attributes(X_stand)$'scaled:scale')
test_stand=data.frame(X1_stand,quality=test$quality)
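# sanity check (a sketch): scale() stores the factors it used in the
# "scaled:scale" attribute, so both matrices should carry identical factors
stopifnot(all.equal(attr(X_stand,"scaled:scale"),attr(X1_stand,"scaled:scale")))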
# Ridge regression model
model_ridge=cv.glmnet(quality~.,data=train_stand,alpha = 0)
# visualize coefficients
coef_ridge=coef(model_ridge)
df_coef_ridge=data.frame(coef_name=rownames(coef_ridge)[-1], coef=coef_ridge[-1,1])
ggplot(data = df_coef_ridge,aes(x=coef_name,y=coef)) +
geom_bar(stat = 'identity', width=0.5) +
coord_flip()
Visualize the cross-validation error with respect to different values of lambda
roundoff=function(x) sprintf("%.3f",x)
df_cvm_ridge=data.frame(lambda=model_ridge$lambda,cvm=model_ridge$cvm,
cvm_up=model_ridge$cvup,cvm_down=model_ridge$cvlo)
ggplot(data=df_cvm_ridge,aes(x=lambda,y=cvm,ymin=cvm_down,ymax=cvm_up)) +
geom_line() + geom_point() + geom_errorbar() + scale_x_continuous(trans = "log2",labels = roundoff) +
geom_vline(xintercept = model_ridge$lambda.min, linetype=2, color="red") +
geom_vline(xintercept = model_ridge$lambda.1se, linetype=3, color="blue") +
ggtitle('ridge regression')
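The red dashed line marks lambda.min, the value that minimizes the mean cross-validation error; the blue dotted line marks lambda.1se, the largest lambda whose error is within one standard error of that minimum. Their values can be read off directly:
c(lambda.min=model_ridge$lambda.min,lambda.1se=model_ridge$lambda.1se)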
# Train lasso regression model
model_lasso=cv.glmnet(quality~.,data=train_stand,alpha = 1)
# visualize coefficients
coef_lasso=coef(model_lasso)
df_coef_lasso=data.frame(coef_name=rownames(coef_lasso)[-1], coef=coef_lasso[-1,1])
ggplot(data = df_coef_lasso,aes(x=coef_name,y=coef)) +
geom_bar(stat = 'identity', width=0.5) +
coord_flip()
Visualize the cross-validation error with respect to different values of lambda, together with the number of nonzero coefficients
df_cvm_lasso=data.frame(lambda=model_lasso$lambda,cvm=model_lasso$cvm,
cvm_up=model_lasso$cvup,cvm_down=model_lasso$cvlo)
p1=ggplot(data=df_cvm_lasso,aes(x=lambda,y=cvm,ymin=cvm_down,ymax=cvm_up)) +
geom_line() + geom_point() + geom_errorbar() + scale_x_continuous(trans = "log2",labels = roundoff) +
geom_vline(xintercept = model_lasso$lambda.min, linetype=2, color="red") +
geom_vline(xintercept = model_lasso$lambda.1se, linetype=3, color="blue") +
ggtitle('Lasso')
df_nzero_lasso=data.frame(lambda=model_lasso$lambda,nzero=model_lasso$nzero)
p2=ggplot(data=df_nzero_lasso,aes(x=lambda,y=nzero)) +
geom_line(color="steelblue") + scale_x_continuous(trans = "log2",labels = roundoff) +
ylab('Number of non-zero coefficients')
grid.arrange(p1,p2,ncol=2)
# Train elastic net model: cva.glmnet cross-validates over a grid of alpha values
elastic_net=cva.glmnet(quality~.,train_stand)
# function to get the cross-validation error (cvm) at lambda.1se for each alpha
get_cvm=function(model) {
index <- match(model$lambda.1se, model$lambda)
model$cvm[index]
}
# data frame that contains alpha and its corresponding cross-validation error
enet_performance=data.frame(alpha=elastic_net$alpha)
models=elastic_net$modlist
enet_performance$cvm=vapply(models,get_cvm,numeric(1))
# get the best alpha and train the model
best_alpha=enet_performance[which.min(enet_performance$cvm),'alpha']
model_enet=cv.glmnet(quality~.,train_stand,alpha = best_alpha)
# visualize coefficients
coef_enet=coef(model_enet)
df_coef_enet=data.frame(coef_name=rownames(coef_enet)[-1], coef=coef_enet[-1,1])
ggplot(data = df_coef_enet,aes(x=coef_name,y=coef)) +
geom_bar(stat = 'identity', width=0.5) +
coord_flip()
Visualize the cross-validation error with respect to lambda for the elastic net
df_cvm_enet=data.frame(lambda=model_enet$lambda,cvm=model_enet$cvm,
cvm_up=model_enet$cvup,cvm_down=model_enet$cvlo)
ggplot(data=df_cvm_enet,aes(x=lambda,y=cvm,ymin=cvm_down,ymax=cvm_up)) +
geom_line() + geom_point() + geom_errorbar() + scale_x_continuous(trans = "log2",labels = roundoff) +
geom_vline(xintercept = model_enet$lambda.min, linetype=2, color="red") +
geom_vline(xintercept = model_enet$lambda.1se, linetype=3, color="blue") +
ggtitle('Elastic net')
Grouped bar plot of the coefficient estimates from all the regularized linear regression models (i.e. ridge, lasso and elastic net)
df=rbind(df_coef_ridge,df_coef_lasso,df_coef_enet)
L=nrow(df_coef_ridge)
id_vec=c(rep('ridge',L),rep('lasso',L),rep('elastic_net',L))
df=cbind(df,types=id_vec)
ggplot(data=df) +
geom_bar(aes(x = coef_name, y = coef, fill = types),
stat = "identity", position = "dodge") +
scale_y_continuous("Coefficient estimates") +
scale_x_discrete("Predictors") +
# remove grey theme
theme_classic(base_size = 15) +
# rotate x-axis text and remove superfluous axis elements
theme(axis.text.x = element_text(angle = 90,hjust = 1, vjust=0),
axis.title.x=element_text(size=13),
axis.title.y=element_text(size=15),
axis.line = element_blank())
# function to evaluate models' performance
performance=function(truth,pred){
  # root mean squared error (mean, not sum, of the squared errors)
  mse=mean((truth-pred)^2)
  rmse=sqrt(mse)
  # mean absolute error
  mae=sum(abs(truth-pred))/length(truth)
  # coefficient of determination R2
  denom=sum((truth-mean(truth))^2)
  numer=sum((truth-pred)^2)
  R_squared=1-numer/denom
  c(rmse,mae,R_squared)
}
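A quick sanity check on a toy input (perfect predictions should return RMSE = MAE = 0 and R2 = 1):
performance(c(5,6,7),c(5,6,7))
## [1] 0 0 1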
perf_lm=performance(test$quality,predict(model_lm,newdata = test))
perf_ridge=performance(test_stand$quality,predict(model_ridge,newdata = test_stand))
perf_lasso=performance(test_stand$quality,predict(model_lasso,newdata = test_stand))
perf_enet=performance(test_stand$quality,predict(model_enet,newdata = test_stand))
df_perf=data.frame(rbind(perf_lm,perf_ridge,perf_lasso,perf_enet))
colnames(df_perf)=c("RMSE","MAE","R2")
rownames(df_perf)=c("linear regression","ridge regression","lasso","elastic net")
kable(df_perf)
| | RMSE | MAE | R2 |
|---|---|---|---|
| linear regression | 0.7476650 | 0.5862890 | 0.2897569 |
| ridge regression | 0.7587427 | 0.5954561 | 0.2685544 |
| lasso | 0.7559080 | 0.5967849 | 0.2740096 |
| elastic net | 0.7565655 | 0.5968977 | 0.2727460 |
Repeated 5-fold CV (20 iterations)
k=5
# fold labels are assigned inside the loop below, with a fresh seed per repeat
# function to run one fold of k-fold CV for a given method
kfold_cv=function(kfold,data,method="lm"){
  if (!(method %in% c("lm","ridge","lasso","enet"))) {
    stop('Method not found')
  }
  train=subset(data,folds!=kfold)
  validate=subset(data,folds==kfold)
  # standardize using the training scale factors only (no leakage)
  train_1=scale(train[,vars],center = FALSE, scale=TRUE)
  train=data.frame(train_1,quality=train$quality)
  validate_1=scale(validate[,vars],center = FALSE, scale = attributes(train_1)$'scaled:scale')
  validate=data.frame(validate_1,quality=validate$quality)
  if (method=="lm") {
    # 'drops' ('density') is taken from the global environment
    model=lm(quality~.,data = train[,!(colnames(train) %in% drops)])
  }
  if (method=="ridge"){
    model=cv.glmnet(quality~.,train,alpha=0)
  }
  if (method=="lasso"){
    model=cv.glmnet(quality~.,train,alpha=1)
  }
  if (method=="enet"){
    elastic_net=cva.glmnet(quality~.,train)
    # data frame that contains alpha and its corresponding cross-validation error
    enet_performance=data.frame(alpha=elastic_net$alpha)
    models=elastic_net$modlist
    enet_performance$cvm=vapply(models,get_cvm,numeric(1))
    # get the best alpha and train the model
    best_alpha=enet_performance[which.min(enet_performance$cvm),'alpha']
    model=cv.glmnet(quality~.,train,alpha = best_alpha)
  }
  pred=predict(model,newdata=validate)
  performance(validate$quality,pred)
}
n_repeat=20
# accumulate the per-fold performance of every repeat column-wise
a=numeric(0)
for (i in 1:n_repeat){
  set.seed(i+100)
  cv.data=mutate(data,folds=sample(1:k,size = nrow(data),replace=TRUE))
  cv.perf=sapply(1:k,FUN = kfold_cv,data=cv.data,method="lm")
  a=cbind(a,cv.perf)
}
rownames(a)=c("RMSE","MAE","R2")
(mean_perf=apply(a,1,mean))
## RMSE MAE R2
## 0.7572172 0.5880354 0.2680121
# half-width of the 95% t-based confidence interval across the 100 fold-level estimates
tcrit=qt(0.025,df=ncol(a)-1,lower.tail = FALSE)
sd_perf=apply(a, 1, sd)
(ci=tcrit*sd_perf/sqrt(ncol(a)))
## RMSE MAE R2
## 0.004342151 0.003056763 0.005050711
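The same loop can be rerun for the regularized models by changing the method argument (a sketch; the "enet" variant is slowest because it retunes alpha inside every fold):
# analogous calls with method="lasso" or method="enet"
cv.perf_ridge=sapply(1:k,FUN = kfold_cv,data=cv.data,method="ridge")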
Reference
P. Cortez, A. Cerdeira, F. Almeida, T. Matos and J. Reis. Modeling wine preferences by data mining from physicochemical properties. Decision Support Systems, Elsevier, 47(4):547-553, 2009. ISSN: 0167-9236.