Individual Project 1 : Diabetes Disease Progression

Problem Statement
In this individual project, we will use the diabetes data from Efron et al. (2004) to examine the effects of ten baseline predictor variables (age, sex, body mass index, average blood pressure, and six blood serum measurements) on a quantitative measure of disease progression one year after baseline.

Data

Response variable: y, a quantitative measure of disease progression one year after baseline.
Predictor variables: ten baseline measurements, namely age, sex, bmi (body mass index), map (average blood pressure), and the six blood serum measurements tc, ldl, hdl, tch, ltg, and glu.

There are 442 diabetes patients in this data set, giving 442 observations (rows) of data. The data are available in the R package “lars.”

Now let's apply some simple and some more advanced regression techniques to model disease progression.

library(lars)
## Loaded lars 1.2
data("diabetes")
names(diabetes)
## [1] "x"  "y"  "x2"
# Load the diabetes data
data.all <- data.frame(cbind(diabetes$x, y = diabetes$y))

# Partition the patients into two groups: training (75%) and test (25%)
n<-dim(data.all)[1] # sample size = 442
set.seed(1306) # set random number generator seed to enable repeatability of results

test<-sample(n, round(n/4)) # randomly sample 25% test
data.train<-data.all[-test,]
data.test<-data.all[test,]
x <- model.matrix(y ~ ., data = data.all)[,-1] # define predictor matrix, excluding the intercept column of 1s

x.train <- x[-test,] # define training predictor matrix
x.test <- x[test,] # define test predictor matrix

y <- data.all$y # define response variable
y.train <- y[-test] # define training response variable
y.test <- y[test] # define test response variable
n.train <- dim(data.train)[1] # training sample size = 332
n.test <- dim(data.test)[1] # test sample size = 110

Before we begin modelling, let's first visualize the data:

# Plot a Scatterplot Matrix of the TRAINING data set and TEST data set
pairs(data.train)

pairs(data.test)

# Plot Histograms to check for normality of predictor variables
par(mfrow = c(3, 3))
hist(data.train$age)
hist(data.train$bmi)
hist(data.train$map)
hist(data.train$tc)
hist(data.train$ldl)
hist(data.train$hdl)
hist(data.train$tch)
hist(data.train$ltg)
hist(data.train$glu)
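
As a supplement to the scatterplot matrices and histograms, a quick correlation check (a small sketch, not part of the original output) can flag collinearity among the serum measurements such as tc, ldl and tch, which is one motivation for the shrinkage methods fitted later:

# Sketch: numeric check of collinearity among the training predictors
round(cor(data.train[, names(data.train) != "y"]), 2)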

Model 1 - Least squares regression using all ten predictors

# Fit the model using lm() on the training data
model1_lm <- lm(y ~ ., data = data.train) 

summary(model1_lm) 
## 
## Call:
## lm(formula = y ~ ., data = data.train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -155.726  -36.065   -2.758   35.039  151.509 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  149.920      2.976  50.382  < 2e-16 ***
## age          -66.758     68.946  -0.968  0.33364    
## sex         -304.651     69.847  -4.362 1.74e-05 ***
## bmi          518.663     76.573   6.773 6.01e-11 ***
## map          388.111     72.755   5.335 1.81e-07 ***
## tc          -815.268    537.549  -1.517  0.13034    
## ldl          387.604    439.162   0.883  0.37811    
## hdl          162.903    269.117   0.605  0.54539    
## tch          323.832    186.803   1.734  0.08396 .  
## ltg          673.620    206.888   3.256  0.00125 ** 
## glu           94.219     79.590   1.184  0.23737    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 54.05 on 321 degrees of freedom
## Multiple R-squared:  0.5213, Adjusted R-squared:  0.5064 
## F-statistic: 34.96 on 10 and 321 DF,  p-value: < 2.2e-16
coef(model1_lm) 
## (Intercept)         age         sex         bmi         map          tc 
##   149.92029   -66.75836  -304.65071   518.66346   388.11130  -815.26815 
##         ldl         hdl         tch         ltg         glu 
##   387.60431   162.90253   323.83151   673.62035    94.21867
# Plot the model diagnostics
par(mfrow = c(2, 2)) 
plot(model1_lm) 

# Prediction on the test data

# predict(model1_lm, data.test) 

# Test MSE
mean((data.test$y - predict(model1_lm, data.test))^2) 
## [1] 3111.265
# Standard Error
sd((data.test$y - predict(model1_lm, data.test))^2)/sqrt(n.test) 
## [1] 361.0908

This model has an R-squared of about 52% and a very small overall F-test p-value, indicating that the model is significant. However, only five predictors are individually significant (at the 10% level), namely sex, bmi, map, tch, and ltg.
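
As a quick programmatic check (a sketch, not part of the original output), the individually significant predictors can be pulled straight from the coefficient table of the fitted model:

# Sketch: list the predictors with p-values below 0.10 in the full least squares fit
coefs <- summary(model1_lm)$coefficients
rownames(coefs)[coefs[, "Pr(>|t|)"] < 0.10][-1]  # drop the intercept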

Model 2 - Linear regression with best subset selection using BIC

library(leaps)
## Warning: package 'leaps' was built under R version 3.4.3
model2.bic <- regsubsets(y ~ ., data = data.train, nvmax = 10)
summary(model2.bic)
## Subset selection object
## Call: regsubsets.formula(y ~ ., data = data.train, nvmax = 10)
## 10 Variables  (and intercept)
##     Forced in Forced out
## age     FALSE      FALSE
## sex     FALSE      FALSE
## bmi     FALSE      FALSE
## map     FALSE      FALSE
## tc      FALSE      FALSE
## ldl     FALSE      FALSE
## hdl     FALSE      FALSE
## tch     FALSE      FALSE
## ltg     FALSE      FALSE
## glu     FALSE      FALSE
## 1 subsets of each size up to 10
## Selection Algorithm: exhaustive
##           age sex bmi map tc  ldl hdl tch ltg glu
## 1  ( 1 )  " " " " "*" " " " " " " " " " " " " " "
## 2  ( 1 )  " " " " "*" " " " " " " " " " " "*" " "
## 3  ( 1 )  " " " " "*" "*" " " " " " " " " "*" " "
## 4  ( 1 )  " " " " "*" "*" "*" " " " " " " "*" " "
## 5  ( 1 )  " " "*" "*" "*" " " " " "*" " " "*" " "
## 6  ( 1 )  " " "*" "*" "*" "*" " " " " "*" "*" " "
## 7  ( 1 )  " " "*" "*" "*" "*" " " " " "*" "*" "*"
## 8  ( 1 )  "*" "*" "*" "*" "*" " " " " "*" "*" "*"
## 9  ( 1 )  "*" "*" "*" "*" "*" "*" " " "*" "*" "*"
## 10  ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"
par(mfrow = c(1, 2))
plot(model2.bic, scale = "bic", main = "Predictor Variables vs. BIC")
summary(model2.bic)$bic
##  [1] -125.5340 -173.7155 -185.0856 -188.0941 -198.8718 -201.1269 -196.4423
##  [8] -191.5214 -186.1681 -180.7417
summary(model2.bic)$bic[6]
## [1] -201.1269
plot(summary(model2.bic)$bic, xlab = "Number of Predictors", ylab = "BIC", type = "l",
     main = "Best Subset Selection Using BIC")
which.min(summary(model2.bic)$bic)
## [1] 6
points(6, summary(model2.bic)$bic[6], col = "brown", cex = 2, pch = 20)

coef(model2.bic, 6)
## (Intercept)         sex         bmi         map          tc         tch 
##    150.1166   -306.0420    538.8274    389.0673   -379.0379    332.6735 
##         ltg 
##    527.5658
# Predict "function" for regsubsets() ; will be re-used down below 
predict.regsubsets <- function(object, newdata, id,...){
  form <- as.formula(object$call[[2]])
  mat <- model.matrix(form, newdata)
  cof <- coef(object, id = id)
  xvars <- names(cof)
  mat[, xvars]%*%cof
}
# Test MSE
mean((data.test$y - predict(model2.bic, data.test, id = 6))^2) 
## [1] 3095.483
# Standard Error
sd((data.test$y - predict(model2.bic, data.test, id = 6))^2)/sqrt(n.test) 
## [1] 369.7526
lm.bic <- lm(y ~ sex + bmi + map + tc + tch + ltg, data = data.train)
summary(lm.bic) # Summary of the linear regression model
## 
## Call:
## lm(formula = y ~ sex + bmi + map + tc + tch + ltg, data = data.train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -154.549  -34.821   -2.544   35.859  159.067 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  150.117      2.962  50.683  < 2e-16 ***
## sex         -306.042     69.041  -4.433 1.27e-05 ***
## bmi          538.827     74.219   7.260 2.88e-12 ***
## map          389.067     69.916   5.565 5.49e-08 ***
## tc          -379.038     82.271  -4.607 5.87e-06 ***
## tch          332.674     88.968   3.739 0.000218 ***
## ltg          527.566     87.067   6.059 3.78e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 53.94 on 325 degrees of freedom
## Multiple R-squared:  0.5172, Adjusted R-squared:  0.5083 
## F-statistic: 58.03 on 6 and 325 DF,  p-value: < 2.2e-16
coef(lm.bic) # Extract the estimated regression model coefficients
## (Intercept)         sex         bmi         map          tc         tch 
##    150.1166   -306.0420    538.8274    389.0673   -379.0379    332.6735 
##         ltg 
##    527.5658
confint(lm.bic) # Obtain a 95% CI for the coefficient estimates
##                 2.5 %    97.5 %
## (Intercept)  144.2898  155.9435
## sex         -441.8652 -170.2188
## bmi          392.8167  684.8382
## map          251.5229  526.6118
## tc          -540.8880 -217.1877
## tch          157.6473  507.6997
## ltg          356.2789  698.8527
# Plot the model diagnostics
par(mfrow = c(2, 2))
plot(lm.bic) 

# Test MSE
mean((data.test$y - predict(lm.bic, data.test))^2) 
## [1] 3095.483
# Standard Error
sd((data.test$y - predict(lm.bic, data.test))^2)/sqrt(n.test) 
## [1] 369.7526

Model 3 - Linear regression with best subset selection using 10-fold CV

k <- 10
set.seed(1306)
folds <- sample(1:k, nrow(data.train), replace = TRUE)
cv.errors <- matrix(NA, k, 10, dimnames = list(NULL, paste(1:10)))

for (i in 1:k) {
  model3.cv <- regsubsets(y ~ ., data = data.train[folds != i, ], nvmax = 10)
  for (j in 1:10) {
    pred <- predict(model3.cv, data.train[folds == i, ], id = j)
    cv.errors[i, j] = mean((data.train$y[folds == i] - pred)^2)
  }
}

This gives us a 10x10 matrix (10 folds by 10 model sizes), in which the (i, j)th element is the validation MSE of the best j-variable model on the ith cross-validation fold.

cv.errors
##              1        2        3        4        5        6        7
##  [1,] 5060.151 4054.381 4042.152 4319.934 4195.403 4572.721 4596.817
##  [2,] 3439.586 2697.570 2454.353 2585.358 2575.142 2194.526 2152.844
##  [3,] 3436.239 3103.584 2931.994 3099.518 2866.454 2802.145 2786.955
##  [4,] 4447.886 3321.043 2884.731 2836.289 2265.152 2153.625 2391.823
##  [5,] 5073.264 4023.290 3671.215 3508.997 3253.881 3161.034 3129.718
##  [6,] 3824.258 3599.092 3840.461 3856.610 3557.187 3549.081 3579.439
##  [7,] 3601.586 3704.709 3554.070 3364.279 3109.488 2812.486 2897.396
##  [8,] 4096.953 3427.270 3108.255 3068.888 2740.950 2665.427 2654.731
##  [9,] 3742.206 3832.129 3463.103 3649.511 3562.864 3308.198 3381.443
## [10,] 3103.915 2446.412 2688.210 2775.552 2632.751 2569.827 2627.139
##              8        9       10
##  [1,] 4545.516 4554.312 4508.588
##  [2,] 2131.643 2113.199 2115.943
##  [3,] 2823.623 2826.683 2826.121
##  [4,] 2401.491 2383.259 2482.791
##  [5,] 3089.509 3072.503 3077.937
##  [6,] 3503.111 3636.919 3560.506
##  [7,] 2873.095 2960.593 2946.175
##  [8,] 2685.431 2713.651 2679.861
##  [9,] 3372.835 3410.084 3398.884
## [10,] 2631.695 2658.567 2627.616
mean.cv.errors <- apply(cv.errors, 2, mean)
mean.cv.errors
##        1        2        3        4        5        6        7        8 
## 3982.604 3420.948 3263.854 3306.494 3075.927 2978.907 3019.831 3005.795 
##        9       10 
## 3032.977 3022.442
which.min(mean.cv.errors)
## 6 
## 6
mean.cv.errors[6]
##        6 
## 2978.907
par(mfrow = c(1,2))
plot(mean.cv.errors, type = "b", xlab = "Number of Predictors", ylab = "Mean CV Errors",
     main = "Best Subset Selection (10-fold CV)")
points(6, mean.cv.errors[6], col = "brown", cex = 2, pch = 20)

rmse.cv = sqrt(mean.cv.errors)
rmse.cv[6]
##        6 
## 54.57937
plot(rmse.cv, pch = 19, type = "b", xlab = "Number of Predictors", ylab = "RMSE CV",
     main = "Best Subset Selection (10-fold CV)")
points(6, rmse.cv[6], col = "blue", cex = 2, pch = 20)

The cross-validation suggests a 6-variable model, so we perform best subset selection on the full training data set to obtain the best 6-variable model.

reg.best <- regsubsets(y ~ ., data = data.train, nvmax = 10)
coef(reg.best, 6)
## (Intercept)         sex         bmi         map          tc         tch 
##    150.1166   -306.0420    538.8274    389.0673   -379.0379    332.6735 
##         ltg 
##    527.5658
mean((data.test$y - predict(reg.best, data.test, id = 6))^2) 
## [1] 3095.483
sd((data.test$y - predict(reg.best, data.test, id = 6))^2)/sqrt(n.test) 
## [1] 369.7526
lm.cv.best <- lm(y ~ sex + bmi + map + tc + tch + ltg, data = data.train)
summary(lm.cv.best) 
## 
## Call:
## lm(formula = y ~ sex + bmi + map + tc + tch + ltg, data = data.train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -154.549  -34.821   -2.544   35.859  159.067 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  150.117      2.962  50.683  < 2e-16 ***
## sex         -306.042     69.041  -4.433 1.27e-05 ***
## bmi          538.827     74.219   7.260 2.88e-12 ***
## map          389.067     69.916   5.565 5.49e-08 ***
## tc          -379.038     82.271  -4.607 5.87e-06 ***
## tch          332.674     88.968   3.739 0.000218 ***
## ltg          527.566     87.067   6.059 3.78e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 53.94 on 325 degrees of freedom
## Multiple R-squared:  0.5172, Adjusted R-squared:  0.5083 
## F-statistic: 58.03 on 6 and 325 DF,  p-value: < 2.2e-16
# Extracting the estimated regression model coefficients
coef(lm.cv.best) 
## (Intercept)         sex         bmi         map          tc         tch 
##    150.1166   -306.0420    538.8274    389.0673   -379.0379    332.6735 
##         ltg 
##    527.5658
par(mfrow = c(2, 2)); plot(lm.cv.best) # Plot the model diagnostics

# Test MSE
mean((data.test$y - predict(lm.cv.best, data.test))^2) 
## [1] 3095.483
# Standard Error
sd((data.test$y - predict(lm.cv.best, data.test))^2)/sqrt(n.test) 
## [1] 369.7526

Model 4 - Ridge regression using 10-fold CV (with largest lambda within 1 SE of min.)

library(glmnet)
## Warning: package 'glmnet' was built under R version 3.4.2
## Loading required package: Matrix
## Loading required package: foreach
## Warning: package 'foreach' was built under R version 3.4.2
## Loaded glmnet 2.0-13
par(mfrow = c(1,2))
grid <- 10^seq(10, -2, length = 100)
model4.ridge <- glmnet(x.train, y.train, alpha = 0, lambda = grid, thresh = 1e-12)

plot(model4.ridge, xvar = "lambda", label = TRUE)
set.seed(1306)
cv.out <- cv.glmnet(x.train, y.train, alpha = 0)
plot(cv.out)

bestlam <- cv.out$lambda.min
bestlam 
## [1] 4.904021
log(bestlam)
## [1] 1.590056
model4.ridge <- glmnet(x.train, y.train, alpha = 0, lambda = bestlam)
ridge.pred <- predict(model4.ridge, s = bestlam, newx = x.test)

mean((ridge.pred - y.test)^2) 
## [1] 3074.378
sd((ridge.pred - y.test)^2)/sqrt(n.test) 
## [1] 357.9628
largelam <- cv.out$lambda.1se
largelam 
## [1] 41.67209
model4.ridge <- glmnet(x.train, y.train, alpha = 0, lambda = largelam)
ridge.pred <- predict(model4.ridge, s = largelam, newx = x.test)

mean((ridge.pred - y.test)^2) 
## [1] 3070.87
sd((ridge.pred - y.test)^2)/sqrt(n.test) 
## [1] 350.5467
# Here are the estimated coefficients
predict(model4.ridge, type = "coefficients", s = largelam)[1:11,]
## (Intercept)         age         sex         bmi         map          tc 
##   149.99068   -11.33162  -156.91053   374.44939   264.89998   -31.96990 
##         ldl         hdl         tch         ltg         glu 
##   -66.89724  -174.01202   123.97204   307.68646   134.48120

The cv.out object returned by cv.glmnet() contains a lot of information about the cross-validated fit.
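
For example, its stored components can be inspected directly, and the one-SE rule behind lambda.1se can be verified from them (a small sketch, not part of the original output):

# Sketch: components of the cv.glmnet object
names(cv.out)  # includes "lambda", "cvm", "cvsd", "lambda.min", "lambda.1se"

# lambda.1se is the largest lambda whose CV error lies within one standard error
# of the minimum CV error, which can be checked from the stored fields:
min(cv.out$cvm) + cv.out$cvsd[which.min(cv.out$cvm)]  # the one-SE threshold
cv.out$cvm[cv.out$lambda == cv.out$lambda.1se]        # CV error at lambda.1se (at or below the threshold)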

Model 5 - Lasso model using 10-fold CV (with largest lambda within 1 SE of min.)

par(mfrow = c(1,2))
grid <- 10^seq(10, -2, length = 100)

model5.lasso <- glmnet(x.train, y.train, alpha = 1, lambda = grid, thresh = 1e-12)
plot(model5.lasso, xvar = "lambda", label = TRUE)

set.seed(1306)
cv.out <- cv.glmnet(x.train, y.train, alpha = 1)
plot(cv.out)

bestlam <- cv.out$lambda.min
bestlam 
## [1] 0.2026348
log(bestlam)
## [1] -1.59635
model5.lasso <- glmnet(x.train, y.train, alpha = 1, lambda = bestlam)
lasso.pred <- predict(model5.lasso, s = bestlam, newx = x.test)

mean((lasso.pred - y.test)^2) 
## [1] 3103.65
sd((lasso.pred - y.test)^2)/sqrt(n.test)
## [1] 363.0016
largelam <- cv.out$lambda.1se
largelam 
## [1] 4.791278
model5.lasso <- glmnet(x.train, y.train, alpha = 1, lambda = largelam)
lasso.pred <- predict(model5.lasso, s = largelam, newx = x.test)

mean((lasso.pred - y.test)^2) 
## [1] 2920.041
sd((lasso.pred - y.test)^2)/sqrt(n.test) 
## [1] 346.2248
# Here are the estimated coefficients
lasso.coef <- predict(model5.lasso, type = "coefficients", s = largelam)[1:11,]
lasso.coef[lasso.coef != 0]
## (Intercept)         sex         bmi         map         hdl         ltg 
##   149.95298  -119.62208   501.56473   270.92614  -180.29437   390.55001 
##         glu 
##    16.58881
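
At lambda.1se the lasso has zeroed out four of the ten predictors (age, tc, ldl and tch), giving a sparser model than ridge, which shrinks all ten coefficients but drops none. A quick count (a sketch) confirms this:

# Sketch: number of predictors dropped by the lasso at lambda.1se
sum(lasso.coef[-1] == 0)  # expect 4: age, tc, ldl, tch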

Model Comparison

results_df <- data.frame(
  Model = c("1.Full Least Squares Model",
            "2.Best Subsets Model with BIC",
            "3.Best Subsets Model with 10-fold CV",
            "4.Ridge Regression Model with 10-fold CV",
            "5.Lasso Model with 10-fold CV"),
  "Test Error" = c(3111.26, 3095.48, 3095.48, 3070.87, 2920.04),
  "Std. Error" = c(361.09, 369.75, 369.75, 350.55, 346.22))
print(results_df, row.names=FALSE)
##                                     Model Test.Error Std..Error
##                1.Full Least Squares Model    3111.26     361.09
##             2.Best Subsets Model with BIC    3095.48     369.75
##      3.Best Subsets Model with 10-fold CV    3095.48     369.75
##  4.Ridge Regression Model with 10-fold CV    3070.87     350.55
##             5.Lasso Model with 10-fold CV    2920.04     346.22
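
The hard-coded comparison table above could also be assembled directly from the stored test-set predictions; here is a small sketch, assuming model1_lm, lm.bic, lm.cv.best, ridge.pred (at lambda.1se) and lasso.pred (at lambda.1se) are still in the workspace:

# Sketch: rebuild the comparison table from the stored test-set predictions
preds <- list("1.Full Least Squares Model"               = predict(model1_lm, data.test),
              "2.Best Subsets Model with BIC"            = predict(lm.bic, data.test),
              "3.Best Subsets Model with 10-fold CV"     = predict(lm.cv.best, data.test),
              "4.Ridge Regression Model with 10-fold CV" = as.vector(ridge.pred),
              "5.Lasso Model with 10-fold CV"            = as.vector(lasso.pred))
data.frame(Model      = names(preds),
           Test.Error = unname(sapply(preds, function(p) round(mean((y.test - p)^2), 2))),
           Std.Error  = unname(sapply(preds, function(p) round(sd((y.test - p)^2) / sqrt(n.test), 2))))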
cat(paste0("The test MAE for the lasso model using CV is: ",round(mean((y.test - abs(lasso.pred))),2)))
## The test MAE for the lasso model using CV is: 8.76
cat(paste0("20th, 50th, and 80th quantiles of the errors:"))
## 20th, 50th, and 80th quantiles of the errors:
quantile(y.test-lasso.pred,probs=c(.20,.50,.80))
##        20%        50%        80% 
## -39.755441   4.984314  57.448337
library(e1071)
## Warning: package 'e1071' was built under R version 3.4.3
ku = kurtosis(y.test-lasso.pred)
ku = round(ku,2)
cat(paste0("Kurtosis of the error distribution: ", ku))
## Kurtosis of the error distribution: -0.58
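
The value of -0.58 is an excess kurtosis (e1071::kurtosis returns excess kurtosis by default), so the lasso test residuals are a little lighter-tailed than a normal distribution. A couple of further checks (a sketch, using the e1071 package already loaded above) could round out the error diagnostics:

# Sketch: skewness and a histogram of the lasso test residuals
round(skewness(as.vector(y.test - lasso.pred)), 2)  # e1071::skewness
hist(as.vector(y.test - lasso.pred), main = "Lasso Test Residuals", xlab = "Residual")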