# Load the diabetes data
library(lars)
## Loaded lars 1.2
data("diabetes")
data.all <- data.frame(cbind(diabetes$x, y = diabetes$y))
# Partition the patients into two groups: training (75%) and test (25%)
n <- dim(data.all)[1] # sample size = 442
set.seed(1306) # set random number generator seed to enable
# repeatability of results
test <- sample(n, round(n/4)) # randomly sample 25% test
data.train <- data.all[-test,]
data.test <- data.all[test,]
x <- model.matrix(y ~ ., data = data.all)[,-1] # define predictor matrix
# excl intercept col of 1s
x.train <- x[-test,] # define training predictor matrix
x.test <- x[test,] # define test predictor matrix
y <- data.all$y # define response variable
y.train <- y[-test] # define training response variable
y.test <- y[test] # define test response variable
n.train <- dim(data.train)[1] # training sample size = 332
n.test <- dim(data.test)[1] # test sample size = 110
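# optional sanity check (not part of the original run): the training and
# test sets should be disjoint and together cover all n rows
stopifnot(n.train + n.test == n,
          length(intersect(rownames(data.train), rownames(data.test))) == 0)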
print("start")
## [1] "start"
### 1. Least squares regression model
lm_model <- lm(y ~ ., data = data.train)
summary(lm_model)
## 
## Call:
## lm(formula = y ~ ., data = data.train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -155.726  -36.065   -2.758   35.039  151.509 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  149.920      2.976  50.382  < 2e-16 ***
## age          -66.758     68.946  -0.968  0.33364    
## sex         -304.651     69.847  -4.362 1.74e-05 ***
## bmi          518.663     76.573   6.773 6.01e-11 ***
## map          388.111     72.755   5.335 1.81e-07 ***
## tc          -815.268    537.549  -1.517  0.13034    
## ldl          387.604    439.162   0.883  0.37811    
## hdl          162.903    269.117   0.605  0.54539    
## tch          323.832    186.803   1.734  0.08396 .  
## ltg          673.620    206.888   3.256  0.00125 ** 
## glu           94.219     79.590   1.184  0.23737    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 54.05 on 321 degrees of freedom
## Multiple R-squared:  0.5213, Adjusted R-squared:  0.5064 
## F-statistic: 34.96 on 10 and 321 DF,  p-value: < 2.2e-16
# Coefficient estimates
coef(lm_model)
## (Intercept)         age         sex         bmi         map          tc 
##   149.92029   -66.75836  -304.65071   518.66346   388.11130  -815.26815 
##         ldl         hdl         tch         ltg         glu 
##   387.60431   162.90253   323.83151   673.62035    94.21867
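# note: the predictors in lars::diabetes are standardized (zero mean, unit
# L2 norm per column), which is why the coefficient magnitudes are so large;
# a quick check of one column, assuming that standardization:
c(mean = mean(diabetes$x[, "age"]), norm = sum(diabetes$x[, "age"]^2))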
# predict the responses for the test set
lm_pred <- predict(lm_model, data.test)
head(lm_pred) # first few of the 110 test-set predictions
##       346        85       214        35       309       398 
## 111.33590  48.56528  95.70299  77.78492  90.76860 177.49358
plot(lm_pred)

# calculate the mean squared prediction error (test MSE)
mean((data.test$y - predict(lm_model, data.test))^2)
## [1] 3111.265
# calculate the standard error of the test MSE
sd((data.test$y - predict(lm_model, data.test))^2)/sqrt(n.test)
## [1] 361.0908
### 2. Best subset selection using BIC (regsubsets in package leaps)
library(leaps)
BIC_model <- regsubsets(y ~ ., data = data.train, nvmax = 10)
summary(BIC_model)
## Subset selection object
## Call: regsubsets.formula(y ~ ., data = data.train, nvmax = 10)
## 10 Variables  (and intercept)
##     Forced in Forced out
## age     FALSE      FALSE
## sex     FALSE      FALSE
## bmi     FALSE      FALSE
## map     FALSE      FALSE
## tc      FALSE      FALSE
## ldl     FALSE      FALSE
## hdl     FALSE      FALSE
## tch     FALSE      FALSE
## ltg     FALSE      FALSE
## glu     FALSE      FALSE
## 1 subsets of each size up to 10
## Selection Algorithm: exhaustive
##           age sex bmi map tc  ldl hdl tch ltg glu
## 1  ( 1 )  " " " " "*" " " " " " " " " " " " " " "
## 2  ( 1 )  " " " " "*" " " " " " " " " " " "*" " "
## 3  ( 1 )  " " " " "*" "*" " " " " " " " " "*" " "
## 4  ( 1 )  " " " " "*" "*" "*" " " " " " " "*" " "
## 5  ( 1 )  " " "*" "*" "*" " " " " "*" " " "*" " "
## 6  ( 1 )  " " "*" "*" "*" "*" " " " " "*" "*" " "
## 7  ( 1 )  " " "*" "*" "*" "*" " " " " "*" "*" "*"
## 8  ( 1 )  "*" "*" "*" "*" "*" " " " " "*" "*" "*"
## 9  ( 1 )  "*" "*" "*" "*" "*" "*" " " "*" "*" "*"
## 10  ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"
reg.summary <- summary(BIC_model)
plot(reg.summary$bic, xlab = "Number of Variables", ylab = "BIC", type = "l")

reg.summary$bic[6]
## [1] -201.1269
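# quick check (added): which.min() should return 6 here if the six-variable
# model indeed minimizes BIC, as the plot above suggests
which.min(reg.summary$bic)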
# fit the six-variable model selected by BIC
BIC_model6 <- lm(y ~ sex + bmi + map + tc + tch + ltg, data = data.train)
# Coefficient estimates
coef(BIC_model6)
## (Intercept)         sex         bmi         map          tc         tch 
##    150.1166   -306.0420    538.8274    389.0673   -379.0379    332.6735 
##         ltg 
##    527.5658
# predict the responses for the test set
BIC_pred <- predict(BIC_model6, data.test)
head(BIC_pred) # first few of the 110 test-set predictions
##       346        85       214        35       309       398 
## 115.08010  61.44411  92.26548  84.81964  98.38269 177.79810
plot(BIC_pred)

# calculate the mean squared prediction error (test MSE)
mean((data.test$y - predict(BIC_model6, data.test))^2)
## [1] 3095.483
# calculate the standard error of the test MSE
sd((data.test$y - predict(BIC_model6, data.test))^2)/sqrt(n.test)
## [1] 369.7526
### 3. Best subset selection using 10-fold cross-validation
set.seed(1306)
k <- 10
folds <- sample(1:k, nrow(data.train), replace = TRUE)
# regsubsets objects have no predict() method, so define one: rebuild the
# model matrix from the original formula and multiply by the coefficients
# of the id-variable model
predict.regsubsets <- function(object, newdata, id, ...) {
  form <- as.formula(object$call[[2]])
  mat <- model.matrix(form, newdata)
  coefi <- coef(object, id = id)
  xvars <- names(coefi)
  mat[, xvars] %*% coefi
}
cv.errors <- matrix(NA, k, 10, dimnames = list(NULL, paste(1:10)))
for (j in 1:k) {
  CV_model <- regsubsets(y ~ ., data = data.train[folds != j, ], nvmax = 10)
  for (i in 1:10) {
    pred <- predict(CV_model, data.train[folds == j, ], id = i)
    cv.errors[j, i] <- mean((data.train$y[folds == j] - pred)^2)
  }
}
mean.cv.errors <- apply(cv.errors, 2, mean)
mean.cv.errors
##        1        2        3        4        5        6        7        8 
## 3982.604 3420.948 3263.854 3306.494 3075.927 2978.907 3019.831 3005.795 
##        9       10 
## 3032.977 3022.442
plot(mean.cv.errors, type = "b")

mean.cv.errors[6]
##        6 
## 2978.907
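# quick check (added): confirm that six predictors give the smallest mean
# cross-validated error among the ten subset sizes
which.min(mean.cv.errors)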
reg.best <- regsubsets(y ~ ., data = data.train, nvmax = 10)
# Coefficient estimates: cross-validation also selects six variables, so
# this is the same model (and the same coefficients) as the BIC fit above
coef(reg.best, 6)
## (Intercept)         sex         bmi         map          tc         tch 
##    150.1166   -306.0420    538.8274    389.0673   -379.0379    332.6735 
##         ltg 
##    527.5658
# predict the responses for the test set
CV_pred <- predict(reg.best, data.test, id = 6)
head(CV_pred) # first few of the 110 test-set predictions
##          [,1]
## 346 115.08010
## 85   61.44411
## 214  92.26548
## 35   84.81964
## 309  98.38269
## 398 177.79810
plot(CV_pred)
# MSE
mean((data.test$y - predict(reg.best, id = 6, data.test))^2)
## [1] 3095.483
# standard error of the test MSE
sd((data.test$y - predict(reg.best, id = 6, data.test))^2)/sqrt(n.test)
## [1] 369.7526
### 4. Ridge regression modeling using 10-fold cross-validation
library(glmnet)
## Loading required package: Matrix
## Loading required package: foreach
## Loaded glmnet 2.0-18

set.seed(1306)
cv.out <- cv.glmnet(x.train, y.train, alpha = 0) # 10-fold CV by default
largestlambda <- cv.out$lambda.1se # largest lambda within 1 SE of the minimum CV error
largestlambda
## [1] 41.67209
ridge_model <- glmnet(x.train, y.train, alpha = 0, lambda = largestlambda)
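# optional diagnostic (added): plot(cv.out) shows the cross-validated MSE
# against log(lambda), with dashed vertical lines at lambda.min and lambda.1se
plot(cv.out)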
# Predict the response for the test set
ridge_pred <- predict(ridge_model, s = largestlambda, newx = x.test)
head(ridge_pred) # first few of the 110 test-set predictions
##             1
## 346 131.68015
## 85   65.86790
## 214 109.01779
## 35   89.37115
## 309  92.73246
## 398 165.60957
plot(ridge_pred)

# Coefficient estimates
coef(ridge_model)
## 11 x 1 sparse Matrix of class "dgCMatrix"
##                     s0
## (Intercept)  149.99068
## age          -11.33162
## sex         -156.91053
## bmi          374.44939
## map          264.89998
## tc           -31.96990
## ldl          -66.89724
## hdl         -174.01202
## tch          123.97204
## ltg          307.68646
## glu          134.48120
# MSE
mean((ridge_pred - y.test)^2)
## [1] 3070.87
# standard error of the test MSE
sd((ridge_pred - y.test)^2)/sqrt(n.test)
## [1] 350.5467
### 5. Lasso model using 10-fold cross-validation
set.seed(1306)
cv.out <- cv.glmnet(x.train, y.train, alpha = 1)
largestlambda <- cv.out$lambda.1se # 1-SE rule lambda, as for ridge
largestlambda
## [1] 4.791278
lasso_model <- glmnet(x.train, y.train, alpha = 1, lambda = largestlambda)
# Predict the response for the test set
lasso_pred <- predict(lasso_model, s = largestlambda, newx = x.test)
head(lasso_pred) # first few of the 110 test-set predictions
##             1
## 346 134.99628
## 85   71.13937
## 214 100.08529
## 35   82.18893
## 309  89.85838
## 398 161.87571
plot(lasso_pred)

# Coefficient estimates
coef(lasso_model)
## 11 x 1 sparse Matrix of class "dgCMatrix"
##                     s0
## (Intercept)  149.95298
## age            .      
## sex         -119.62208
## bmi          501.56473
## map          270.92614
## tc             .      
## ldl            .      
## hdl         -180.29437
## tch            .      
## ltg          390.55001
## glu           16.58881
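# optional sketch (added): list the predictors the lasso retains at
# lambda.1se; from the coefficients above these should be sex, bmi, map,
# hdl, ltg and glu
nz <- as.matrix(coef(lasso_model))
setdiff(rownames(nz)[nz != 0], "(Intercept)")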
# MSE
mean((lasso_pred - y.test)^2)
## [1] 2920.041
# standard error of the test MSE
sd((lasso_pred - y.test)^2)/sqrt(n.test)
## [1] 346.2248
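
### Summary: test-set performance collected from the results reported above
model.results <- data.frame(
  model = c("least squares", "best subset (BIC)", "best subset (10-fold CV)",
            "ridge (lambda.1se)", "lasso (lambda.1se)"),
  test.MSE = c(3111.265, 3095.483, 3095.483, 3070.87, 2920.041),
  std.error = c(361.0908, 369.7526, 369.7526, 350.5467, 346.2248)
)
model.results
# the lasso attains the lowest estimated test MSE, though all five estimates
# lie within roughly one standard error of one another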