Individual Project 1 : Diabetes Disease Progression
Problem Statement
In this individual project, we will use the diabetes data in Efron et al. (2003) to examine the effects of ten baseline predictor variables and six blood serum measurements on a quantitative measure of disease progression one year after baseline.
Data
Response Variable - Measure of disease progression.
Dependent variables :- Ten baseline predictor variables including:
There are 442 diabetes patients in this data set implying 442 observations/rows of data. The data is available in the R package “lars.”
Now lets apply some simple and some advanced regression techniques to measure the disease progression.
library(lars)
## Loaded lars 1.2
data("diabetes")
names(diabetes)
## [1] "x" "y" "x2"
# Load the diabetes data
data.all <- data.frame(cbind(diabetes$x, y = diabetes$y))
# Partition the patients into two groups: training (75%) and test (25%)
n<-dim(data.all)[1] # sample size = 442
set.seed(1306) # set random number generator seed to enable repeatability of results
test<-sample(n, round(n/4)) # randomly sample 25% test
data.train<-data.all[-test,]
data.test<-data.all[test,]
x<-model.matrix(y ~ ., data = data.all)[,-1] # define predictor matrix
#excl intercept col of 1s
x.train <- x[-test,] # define training predictor matrix
x.test <- x[test,] # define test predictor matrix
y <- data.all$y # define response variable
y.train <- y[-test] # define training response variable
y.test <- y[test] # define test response variable
n.train <- dim(data.train)[1] # training sample size = 332
n.test <- dim(data.test)[1]
Before we beging modelling, first lets visualize the daa first:
# Plot a Scatterplot Matrix of the TRAINING data set and TEST data set
pairs(data.train)
pairs(data.test)
# Plot Histograms to check for normality of predictor variables
par(mfrow = c(3, 3))
hist(data.train$age)
hist(data.train$bmi)
hist(data.train$map)
hist(data.train$tc)
hist(data.train$ldl)
hist(data.train$hdl)
hist(data.train$tch)
hist(data.train$ltg)
hist(data.train$glu)
Model 1 - Least squares regression using all ten predictors
# Fit the model using lm() on the training data
model1_lm <- lm(y ~ ., data = data.train)
summary(model1_lm)
##
## Call:
## lm(formula = y ~ ., data = data.train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -155.726 -36.065 -2.758 35.039 151.509
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 149.920 2.976 50.382 < 2e-16 ***
## age -66.758 68.946 -0.968 0.33364
## sex -304.651 69.847 -4.362 1.74e-05 ***
## bmi 518.663 76.573 6.773 6.01e-11 ***
## map 388.111 72.755 5.335 1.81e-07 ***
## tc -815.268 537.549 -1.517 0.13034
## ldl 387.604 439.162 0.883 0.37811
## hdl 162.903 269.117 0.605 0.54539
## tch 323.832 186.803 1.734 0.08396 .
## ltg 673.620 206.888 3.256 0.00125 **
## glu 94.219 79.590 1.184 0.23737
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 54.05 on 321 degrees of freedom
## Multiple R-squared: 0.5213, Adjusted R-squared: 0.5064
## F-statistic: 34.96 on 10 and 321 DF, p-value: < 2.2e-16
coef(model1_lm)
## (Intercept) age sex bmi map tc
## 149.92029 -66.75836 -304.65071 518.66346 388.11130 -815.26815
## ldl hdl tch ltg glu
## 387.60431 162.90253 323.83151 673.62035 94.21867
# Plot the model diagnostics
par(mfrow = c(2, 2))
plot(model1_lm)
# Prediction on the test data
# predict(model1_lm, data.test)
# Test MSE
mean((data.test$y - predict(model1_lm, data.test))^2)
## [1] 3111.265
# Standard Error
sd((data.test$y - predict(model1_lm, data.test))^2)/sqrt(n.test)
## [1] 361.0908
This model has a R2 of 52% with low p-value indicating model significance. But only 5 variales are significant in the model namely: sex,bmi,map,tch,ltg.
Model 2 - Linear Regression with Best subset selection using BIC
library(leaps)
## Warning: package 'leaps' was built under R version 3.4.3
model2.bic <- regsubsets(y ~ ., data = data.train, nvmax = 10)
summary(model2.bic)
## Subset selection object
## Call: regsubsets.formula(y ~ ., data = data.train, nvmax = 10)
## 10 Variables (and intercept)
## Forced in Forced out
## age FALSE FALSE
## sex FALSE FALSE
## bmi FALSE FALSE
## map FALSE FALSE
## tc FALSE FALSE
## ldl FALSE FALSE
## hdl FALSE FALSE
## tch FALSE FALSE
## ltg FALSE FALSE
## glu FALSE FALSE
## 1 subsets of each size up to 10
## Selection Algorithm: exhaustive
## age sex bmi map tc ldl hdl tch ltg glu
## 1 ( 1 ) " " " " "*" " " " " " " " " " " " " " "
## 2 ( 1 ) " " " " "*" " " " " " " " " " " "*" " "
## 3 ( 1 ) " " " " "*" "*" " " " " " " " " "*" " "
## 4 ( 1 ) " " " " "*" "*" "*" " " " " " " "*" " "
## 5 ( 1 ) " " "*" "*" "*" " " " " "*" " " "*" " "
## 6 ( 1 ) " " "*" "*" "*" "*" " " " " "*" "*" " "
## 7 ( 1 ) " " "*" "*" "*" "*" " " " " "*" "*" "*"
## 8 ( 1 ) "*" "*" "*" "*" "*" " " " " "*" "*" "*"
## 9 ( 1 ) "*" "*" "*" "*" "*" "*" " " "*" "*" "*"
## 10 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"
par(mfrow = c(1, 2))
plot(model2.bic, scale = "bic", main = "Predictor Variables vs. BIC")
summary(model2.bic)
## Subset selection object
## Call: regsubsets.formula(y ~ ., data = data.train, nvmax = 10)
## 10 Variables (and intercept)
## Forced in Forced out
## age FALSE FALSE
## sex FALSE FALSE
## bmi FALSE FALSE
## map FALSE FALSE
## tc FALSE FALSE
## ldl FALSE FALSE
## hdl FALSE FALSE
## tch FALSE FALSE
## ltg FALSE FALSE
## glu FALSE FALSE
## 1 subsets of each size up to 10
## Selection Algorithm: exhaustive
## age sex bmi map tc ldl hdl tch ltg glu
## 1 ( 1 ) " " " " "*" " " " " " " " " " " " " " "
## 2 ( 1 ) " " " " "*" " " " " " " " " " " "*" " "
## 3 ( 1 ) " " " " "*" "*" " " " " " " " " "*" " "
## 4 ( 1 ) " " " " "*" "*" "*" " " " " " " "*" " "
## 5 ( 1 ) " " "*" "*" "*" " " " " "*" " " "*" " "
## 6 ( 1 ) " " "*" "*" "*" "*" " " " " "*" "*" " "
## 7 ( 1 ) " " "*" "*" "*" "*" " " " " "*" "*" "*"
## 8 ( 1 ) "*" "*" "*" "*" "*" " " " " "*" "*" "*"
## 9 ( 1 ) "*" "*" "*" "*" "*" "*" " " "*" "*" "*"
## 10 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"
summary(model2.bic)$bic
## [1] -125.5340 -173.7155 -185.0856 -188.0941 -198.8718 -201.1269 -196.4423
## [8] -191.5214 -186.1681 -180.7417
summary(model2.bic)$bic[6]
## [1] -201.1269
plot(summary(model2.bic)$bic, xlab = "Number of Predictors", ylab = "BIC", type = "l",
main = "Best Subset Selection Using BIC")
which.min(summary(model2.bic)$bic)
## [1] 6
points(6, summary(model2.bic)$bic[6], col = "brown", cex = 2, pch = 20)
coef(model2.bic, 6)
## (Intercept) sex bmi map tc tch
## 150.1166 -306.0420 538.8274 389.0673 -379.0379 332.6735
## ltg
## 527.5658
# Predict "function" for regsubsets() ; will be re-used down below
predict.regsubsets <- function(object, newdata, id,...){
form <- as.formula(object$call[[2]])
mat <- model.matrix(form, newdata)
cof <- coef(object, id = id)
xvars <- names(cof)
mat[, xvars]%*%cof
}
# Mean Predictor Error
mean((data.test$y - predict(model2.bic, data.test, id = 6))^2)
## [1] 3095.483
# Standard Error
sd((data.test$y - predict(model2.bic, data.test, id = 6))^2)/sqrt(n.test)
## [1] 369.7526
lm.bic <- lm(y ~ sex + bmi + map + tc + tch + ltg, data = data.train)
summary(lm.bic) # Summary of the linear regression model
##
## Call:
## lm(formula = y ~ sex + bmi + map + tc + tch + ltg, data = data.train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -154.549 -34.821 -2.544 35.859 159.067
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 150.117 2.962 50.683 < 2e-16 ***
## sex -306.042 69.041 -4.433 1.27e-05 ***
## bmi 538.827 74.219 7.260 2.88e-12 ***
## map 389.067 69.916 5.565 5.49e-08 ***
## tc -379.038 82.271 -4.607 5.87e-06 ***
## tch 332.674 88.968 3.739 0.000218 ***
## ltg 527.566 87.067 6.059 3.78e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 53.94 on 325 degrees of freedom
## Multiple R-squared: 0.5172, Adjusted R-squared: 0.5083
## F-statistic: 58.03 on 6 and 325 DF, p-value: < 2.2e-16
coef(lm.bic) # Extract the estimated regression model coefficients
## (Intercept) sex bmi map tc tch
## 150.1166 -306.0420 538.8274 389.0673 -379.0379 332.6735
## ltg
## 527.5658
confint(lm.bic) # Obtain a 95% CI for the coefficient estimates
## 2.5 % 97.5 %
## (Intercept) 144.2898 155.9435
## sex -441.8652 -170.2188
## bmi 392.8167 684.8382
## map 251.5229 526.6118
## tc -540.8880 -217.1877
## tch 157.6473 507.6997
## ltg 356.2789 698.8527
# Plot the model diagnostics
par(mfrow = c(2, 2))
plot(lm.bic)
# Mean Predictor Error
mean((data.test$y - predict(lm.bic, data.test))^2)
## [1] 3095.483
# Standard Error
sd((data.test$y - predict(lm.bic, data.test))^2)/sqrt(n.test)
## [1] 369.7526
Model 3 - Linear Regression with best subset selection using 10-fold CV
k <- 10
set.seed(1306)
folds <- sample(1:k, nrow(data.train), replace = TRUE)
cv.errors <- matrix(NA, k, 10, dimnames = list(NULL, paste(1:10)))
for (i in 1:k) {
model3.cv <- regsubsets(y ~ ., data = data.train[folds != i, ], nvmax = 10)
for (j in 1:10) {
pred <- predict(model3.cv, data.train[folds == i, ], id = j)
cv.errors[i, j] = mean((data.train$y[folds == i] - pred)^2)
}
}
This gives us a 10x10 matrix(no. of predictors = 10), of which the (i, j)th element corresponds to the test MSE for the best i-variable model for the jth cross-validation fold
cv.errors
## 1 2 3 4 5 6 7
## [1,] 5060.151 4054.381 4042.152 4319.934 4195.403 4572.721 4596.817
## [2,] 3439.586 2697.570 2454.353 2585.358 2575.142 2194.526 2152.844
## [3,] 3436.239 3103.584 2931.994 3099.518 2866.454 2802.145 2786.955
## [4,] 4447.886 3321.043 2884.731 2836.289 2265.152 2153.625 2391.823
## [5,] 5073.264 4023.290 3671.215 3508.997 3253.881 3161.034 3129.718
## [6,] 3824.258 3599.092 3840.461 3856.610 3557.187 3549.081 3579.439
## [7,] 3601.586 3704.709 3554.070 3364.279 3109.488 2812.486 2897.396
## [8,] 4096.953 3427.270 3108.255 3068.888 2740.950 2665.427 2654.731
## [9,] 3742.206 3832.129 3463.103 3649.511 3562.864 3308.198 3381.443
## [10,] 3103.915 2446.412 2688.210 2775.552 2632.751 2569.827 2627.139
## 8 9 10
## [1,] 4545.516 4554.312 4508.588
## [2,] 2131.643 2113.199 2115.943
## [3,] 2823.623 2826.683 2826.121
## [4,] 2401.491 2383.259 2482.791
## [5,] 3089.509 3072.503 3077.937
## [6,] 3503.111 3636.919 3560.506
## [7,] 2873.095 2960.593 2946.175
## [8,] 2685.431 2713.651 2679.861
## [9,] 3372.835 3410.084 3398.884
## [10,] 2631.695 2658.567 2627.616
mean.cv.errors <- apply(cv.errors, 2, mean)
mean.cv.errors
## 1 2 3 4 5 6 7 8
## 3982.604 3420.948 3263.854 3306.494 3075.927 2978.907 3019.831 3005.795
## 9 10
## 3032.977 3022.442
which.min(mean.cv.errors)
## 6
## 6
mean.cv.errors[6]
## 6
## 2978.907
par(mfrow = c(1,2))
plot(mean.cv.errors, type = "b", xlab = "Number of Predictors", ylab = "Mean CV Errors",
main = "Best Subset Selection (10-fold CV)")
points(6, mean.cv.errors[6], col = "brown", cex = 2, pch = 20)
rmse.cv = sqrt(mean.cv.errors)
rmse.cv[6]
## 6
## 54.57937
plot(rmse.cv, pch = 19, type = "b", xlab = "Number of Predictors", ylab = "RMSE CV",
main = "Best Subset Selection (10-fold CV)")
points(6, rmse.cv[6], col = "blue", cex = 2, pch = 20)
The cross-validation suggests a 6-variable model, so we perform best subset selection on the training data set to get the best 6-variable model
reg.best <- regsubsets(y ~ ., data = data.train, nvmax = 10)
coef(reg.best, 6)
## (Intercept) sex bmi map tc tch
## 150.1166 -306.0420 538.8274 389.0673 -379.0379 332.6735
## ltg
## 527.5658
mean((data.test$y - predict(reg.best, data.test, id = 6))^2)
## [1] 3095.483
sd((data.test$y - predict(reg.best, data.test, id = 6))^2)/sqrt(n.test)
## [1] 369.7526
lm.cv.best <- lm(y ~ sex + bmi + map + tc + tch + ltg, data = data.train)
summary(lm.cv.best)
##
## Call:
## lm(formula = y ~ sex + bmi + map + tc + tch + ltg, data = data.train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -154.549 -34.821 -2.544 35.859 159.067
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 150.117 2.962 50.683 < 2e-16 ***
## sex -306.042 69.041 -4.433 1.27e-05 ***
## bmi 538.827 74.219 7.260 2.88e-12 ***
## map 389.067 69.916 5.565 5.49e-08 ***
## tc -379.038 82.271 -4.607 5.87e-06 ***
## tch 332.674 88.968 3.739 0.000218 ***
## ltg 527.566 87.067 6.059 3.78e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 53.94 on 325 degrees of freedom
## Multiple R-squared: 0.5172, Adjusted R-squared: 0.5083
## F-statistic: 58.03 on 6 and 325 DF, p-value: < 2.2e-16
# Extracting the estimated regression model coefficients
coef(lm.cv.best)
## (Intercept) sex bmi map tc tch
## 150.1166 -306.0420 538.8274 389.0673 -379.0379 332.6735
## ltg
## 527.5658
par(mfrow = c(2, 2)); plot(lm.cv.best) # Plot the model diagnostics
# Mean Predictor Error
mean((data.test$y - predict(lm.cv.best, data.test))^2)
## [1] 3095.483
# Standard Error
sd((data.test$y - predict(lm.cv.best, data.test))^2)/sqrt(n.test)
## [1] 369.7526
Model 4 - Ridge regression using 10-fold CV (with largest lambda within 1 SE of min.)
library(glmnet)
## Warning: package 'glmnet' was built under R version 3.4.2
## Loading required package: Matrix
## Loading required package: foreach
## Warning: package 'foreach' was built under R version 3.4.2
## Loaded glmnet 2.0-13
par(mfrow = c(1,2))
grid <- 10^seq(10, -2, length = 100)
model4.ridge <- glmnet(x.train, y.train, alpha = 0, lambda = grid, thresh = 1e-12)
plot(model4.ridge, xvar = "lambda", label = TRUE)
set.seed(1306)
cv.out <- cv.glmnet(x.train, y.train, alpha = 0)
plot(cv.out)
bestlam <- cv.out$lambda.min
bestlam
## [1] 4.904021
log(bestlam)
## [1] 1.590056
model4.ridge <- glmnet(x.train, y.train, alpha = 0, lambda = bestlam)
ridge.pred <- predict(model4.ridge, s = bestlam, newx = x.test)
mean((ridge.pred - y.test)^2)
## [1] 3074.378
sd((ridge.pred - y.test)^2)/sqrt(n.test)
## [1] 357.9628
largelam <- cv.out$lambda.1se
largelam
## [1] 41.67209
model4.ridge <- glmnet(x.train, y.train, alpha = 0, lambda = largelam)
ridge.pred <- predict(model4.ridge, s = largelam, newx = x.test)
mean((ridge.pred - y.test)^2)
## [1] 3070.87
sd((ridge.pred - y.test)^2)/sqrt(n.test)
## [1] 350.5467
# Here are the estimated coefficients
predict(model4.ridge, type = "coefficients", s = largelam)[1:11,]
## (Intercept) age sex bmi map tc
## 149.99068 -11.33162 -156.91053 374.44939 264.89998 -31.96990
## ldl hdl tch ltg glu
## -66.89724 -174.01202 123.97204 307.68646 134.48120
cv.out() out function spits out a lot of information about the model.
Model 5 - Lasso model using 10-fold CV (with largest lambda within 1 SE of min.)
par(mfrow = c(1,2))
grid <- 10^seq(10, -2, length = 100)
model5.lasso <- glmnet(x.train, y.train, alpha = 1, lambda = grid, thresh = 1e-12)
plot(model5.lasso, xvar = "lambda", label = TRUE)
set.seed(1306)
cv.out <- cv.glmnet(x.train, y.train, alpha = 1)
plot(cv.out)
bestlam <- cv.out$lambda.min
bestlam
## [1] 0.2026348
log(bestlam)
## [1] -1.59635
model5.lasso <- glmnet(x.train, y.train, alpha = 1, lambda = bestlam)
lasso.pred <- predict(model5.lasso, s = bestlam, newx = x.test)
mean((lasso.pred - y.test)^2)
## [1] 3103.65
sd((lasso.pred - y.test)^2)/sqrt(n.test)
## [1] 363.0016
largelam <- cv.out$lambda.1se
largelam
## [1] 4.791278
model5.lasso <- glmnet(x.train, y.train, alpha = 1, lambda = largelam)
lasso.pred <- predict(model5.lasso, s = largelam, newx = x.test)
mean((lasso.pred - y.test)^2)
## [1] 2920.041
sd((lasso.pred - y.test)^2)/sqrt(n.test)
## [1] 346.2248
# Here are the estimated coefficients
lasso.coef <- predict(model5.lasso, type = "coefficients", s = largelam)[1:11,]
lasso.coef[lasso.coef != 0]
## (Intercept) sex bmi map hdl ltg
## 149.95298 -119.62208 501.56473 270.92614 -180.29437 390.55001
## glu
## 16.58881
** Model Comparison**
results_df = data.frame(Model = c("1.Full Least Squares Model", "2.Best Subsets Model with BIC", "3.Best Subsets Model with 10-fold CV", "4.Ridge Regression Model with 10-fold CV", "5.Lasso Model with 10-fold CV"), "Test Error" = c(3111.26,3095.48,3095.48,3070.87,2920.04), "Std. Error" = c(361.09,369.75,369.75,350.55,346.22))
print(results_df, row.names=FALSE)
## Model Test.Error Std..Error
## 1.Full Least Squares Model 3111.26 361.09
## 2.Best Subsets Model with BIC 3095.48 369.75
## 3.Best Subsets Model with 10-fold CV 3095.48 369.75
## 4.Ridge Regression Model with 10-fold CV 3070.87 350.55
## 5.Lasso Model with 10-fold CV 2920.04 346.22
cat(paste0("The test MAE for the lasso model using CV is: ",round(mean((y.test - abs(lasso.pred))),2)))
## The test MAE for the lasso model using CV is: 8.76
cat(paste0("20th, 50th, and 80th quantiles of the errors:"))
## 20th, 50th, and 80th quantiles of the errors:
quantile(y.test-lasso.pred,probs=c(.20,.50,.80))
## 20% 50% 80%
## -39.755441 4.984314 57.448337
library(e1071)
## Warning: package 'e1071' was built under R version 3.4.3
ku = kurtosis(y.test-lasso.pred)
ku = round(ku,2)
cat(paste0("Kurtosis of the error distribution: ", ku))
## Kurtosis of the error distribution: -0.58