# Load the diabetes data
library(lars)
## Loaded lars 1.2
data("diabetes")
data.all <- data.frame(cbind(diabetes$x, y = diabetes$y))
# Partition the patients into two groups: training (75%) and test (25%)
n <- dim(data.all)[1] # sample size = 442
set.seed(1306) # set random number generator seed to enable
# repeatability of results
test <- sample(n, round(n/4)) # randomly sample 25% test
data.train <- data.all[-test,]
data.test <- data.all[test,]
x <- model.matrix(y ~ ., data = data.all)[,-1] # define predictor matrix
# excl intercept col of 1s
x.train <- x[-test,] # define training predictor matrix
x.test <- x[test,] # define test predictor matrix
y <- data.all$y # define response variable
y.train <- y[-test] # define training response variable
y.test <- y[test] # define test response variable
n.train <- dim(data.train)[1] # training sample size = 332
n.test <- dim(data.test)[1] # test sample size = 110
print("start")
## [1] "start"
### 1. Least squares regression model
lm_model = lm(y ~ ., data = data.train)
summary(lm_model)
##
## Call:
## lm(formula = y ~ ., data = data.train)
##
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -155.726  -36.065   -2.758   35.039  151.509 
##
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  149.920      2.976  50.382  < 2e-16 ***
## age          -66.758     68.946  -0.968  0.33364    
## sex         -304.651     69.847  -4.362 1.74e-05 ***
## bmi          518.663     76.573   6.773 6.01e-11 ***
## map          388.111     72.755   5.335 1.81e-07 ***
## tc          -815.268    537.549  -1.517  0.13034    
## ldl          387.604    439.162   0.883  0.37811    
## hdl          162.903    269.117   0.605  0.54539    
## tch          323.832    186.803   1.734  0.08396 .  
## ltg          673.620    206.888   3.256  0.00125 ** 
## glu           94.219     79.590   1.184  0.23737    
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 54.05 on 321 degrees of freedom
## Multiple R-squared: 0.5213, Adjusted R-squared: 0.5064
## F-statistic: 34.96 on 10 and 321 DF, p-value: < 2.2e-16
# Coefficient estimates
coef(lm_model)
## (Intercept)         age         sex         bmi         map          tc 
##   149.92029   -66.75836  -304.65071   518.66346   388.11130  -815.26815 
##         ldl         hdl         tch         ltg         glu 
##   387.60431   162.90253   323.83151   673.62035    94.21867 
# predict the responses for the test set
lm_pred = predict(lm_model, data.test)
# show the first few predicted values (110 in total)
head(lm_pred)
##       346        85       214        35       309       398 
## 111.33590  48.56528  95.70299  77.78492  90.76860 177.49358 
plot(lm_pred)

# calculate the mean squared prediction error (test MSE)
mean((data.test$y - lm_pred)^2)
## [1] 3111.265
# calculate the standard error of the test MSE estimate
sd((data.test$y - lm_pred)^2)/sqrt(n.test)
## [1] 361.0908
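# Aside (a sketch, not part of the original analysis): the test MSE and its
# standard error recur for every model below, so a small helper would avoid
# repeating the arithmetic. mse_se is a hypothetical name introduced here.
mse_se <- function(pred, actual) {
  sq.err <- (actual - pred)^2
  c(MSE = mean(sq.err), SE = sd(sq.err) / sqrt(length(actual)))
}
# e.g. mse_se(lm_pred, y.test) reproduces the two values above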
### 2. Apply best subset selection using BIC to select the number of predictors (R function regsubsets in package leaps).
library(leaps)
BIC_model = regsubsets(y ~ ., data = data.train, nvmax = 10)
summary(BIC_model)
## Subset selection object
## Call: regsubsets.formula(y ~ ., data = data.train, nvmax = 10)
## 10 Variables (and intercept)
##     Forced in Forced out
## age     FALSE      FALSE
## sex     FALSE      FALSE
## bmi     FALSE      FALSE
## map     FALSE      FALSE
## tc      FALSE      FALSE
## ldl     FALSE      FALSE
## hdl     FALSE      FALSE
## tch     FALSE      FALSE
## ltg     FALSE      FALSE
## glu     FALSE      FALSE
## 1 subsets of each size up to 10
## Selection Algorithm: exhaustive
##           age sex bmi map tc  ldl hdl tch ltg glu
## 1  ( 1 )  " " " " "*" " " " " " " " " " " " " " "
## 2  ( 1 )  " " " " "*" " " " " " " " " " " "*" " "
## 3  ( 1 )  " " " " "*" "*" " " " " " " " " "*" " "
## 4  ( 1 )  " " " " "*" "*" "*" " " " " " " "*" " "
## 5  ( 1 )  " " "*" "*" "*" " " " " "*" " " "*" " "
## 6  ( 1 )  " " "*" "*" "*" "*" " " " " "*" "*" " "
## 7  ( 1 )  " " "*" "*" "*" "*" " " " " "*" "*" "*"
## 8  ( 1 )  "*" "*" "*" "*" "*" " " " " "*" "*" "*"
## 9  ( 1 )  "*" "*" "*" "*" "*" "*" " " "*" "*" "*"
## 10  ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"
reg.summary = summary(BIC_model)
plot(reg.summary$bic, xlab = "Number of Variables", ylab = "BIC", type = "l")

# the BIC curve above is minimized with 6 predictors
summary(BIC_model)$bic[6]
## [1] -201.1269
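# Aside (sketch): rather than reading the minimum off the plot, the subset
# size can be chosen programmatically from the reg.summary object above.
best.size <- which.min(reg.summary$bic) # 6 for this training split
coef(BIC_model, best.size)              # coefficients of the selected subset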
# regsubsets objects have no predict method, so refit the selected
# 6-variable model with lm() to obtain one that predict() understands
BIC_model6 = lm(y ~ sex + bmi + map + tc + tch + ltg, data = data.train)
# Coefficient estimates
coef(BIC_model6)
## (Intercept)         sex         bmi         map          tc         tch 
##    150.1166   -306.0420    538.8274    389.0673   -379.0379    332.6735 
##         ltg 
##    527.5658 
# predict the responses for the test set
BIC_pred = predict(BIC_model6, data.test)
# show the first few predicted values
head(BIC_pred)
##       346        85       214        35       309       398 
## 115.08010  61.44411  92.26548  84.81964  98.38269 177.79810 
plot(BIC_pred)

# calculate the mean squared prediction error (test MSE)
mean((data.test$y - BIC_pred)^2)
## [1] 3095.483
# calculate the standard error of the test MSE estimate
sd((data.test$y - BIC_pred)^2)/sqrt(n.test)
## [1] 369.7526
### 3. Apply best subset selection using 10-fold cross-validation to select the number of predictors
set.seed(1306)
k = 10
folds <- sample(1:k, nrow(data.train), replace = TRUE)
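# Aside (sketch, not used below): sampling fold labels with replacement gives
# folds of unequal size; this alternative assignment yields near-equal folds
# (33 or 34 of the 332 training rows each).
folds.balanced <- sample(rep(1:k, length.out = nrow(data.train)))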
# regsubsets has no predict method, so define one: recover the model formula
# from the call, build the model matrix for the new data, and multiply by the
# coefficients of the id-th (size-id) model
predict.regsubsets = function(object, newdata, id, ...) {
  form = as.formula(object$call[[2]])
  mat = model.matrix(form, newdata)
  coefi = coef(object, id = id)
  xvars = names(coefi)
  mat[, xvars] %*% coefi
}
cv.errors = matrix(NA, k, 10, dimnames = list(NULL, paste(1:10)))
for (j in 1:k) {
  # fit best subsets of each size on all folds except fold j
  CV_model = regsubsets(y ~ ., data = data.train[folds != j, ], nvmax = 10)
  for (i in 1:10) {
    # evaluate the size-i model on the held-out fold j
    pred = predict(CV_model, data.train[folds == j, ], id = i)
    cv.errors[j, i] = mean((data.train$y[folds == j] - pred)^2)
  }
}
mean.cv.errors = apply(cv.errors, 2, mean)
mean.cv.errors
##        1        2        3        4        5        6        7        8 
## 3982.604 3420.948 3263.854 3306.494 3075.927 2978.907 3019.831 3005.795 
##        9       10 
## 3032.977 3022.442 
plot(mean.cv.errors, type = "b")

# the cross-validation error is minimized with 6 predictors
mean.cv.errors[6]
## 6
## 2978.907
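# Aside (sketch): as in section 2, the size can be picked programmatically.
which.min(mean.cv.errors) # 6 for this seed and fold assignment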
# refit best subsets on the full training set and use the 6-variable model
reg.best = regsubsets(y ~ ., data = data.train, nvmax = 10)
# Coefficient estimates
coef(reg.best, 6)
## (Intercept)         sex         bmi         map          tc         tch 
##    150.1166   -306.0420    538.8274    389.0673   -379.0379    332.6735 
##         ltg 
##    527.5658 
# predict the responses for the test set
CV_pred = predict(reg.best, id = 6, data.test)
# show the first few predicted values
head(CV_pred)
##          [,1]
## 346 115.08010
## 85   61.44411
## 214  92.26548
## 35   84.81964
## 309  98.38269
## 398 177.79810
plot(CV_pred)
# MSE
mean((data.test$y - predict(reg.best, id = 6, data.test))^2)
## [1] 3095.483
# Standard Error
sd((data.test$y - predict(reg.best, id = 6, data.test))^2)/sqrt(n.test)
## [1] 369.7526
### 4. Ridge regression modeling using 10-fold cross-validation
library(glmnet)
## Loading required package: Matrix
## Loading required package: foreach
## Loaded glmnet 2.0-18

set.seed(1306)
cv.out <- cv.glmnet(x.train, y.train, alpha = 0)
# lambda.1se is the largest lambda whose CV error is within one standard
# error of the minimum (the one-standard-error rule)
largestlambda = cv.out$lambda.1se
largestlambda
## [1] 41.67209
# refit ridge regression on the training set at the selected lambda
ridge_model = glmnet(x.train, y.train, alpha = 0, lambda = largestlambda)
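# Aside (sketch, output not shown): cv.glmnet also records lambda.min, the
# value that minimizes the CV error outright; lambda.1se shrinks harder.
plot(cv.out)      # CV error curve; vertical lines mark lambda.min and lambda.1se
cv.out$lambda.min # the outright CV minimizer, smaller than lambda.1se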
# Predict the response for the test set
ridge_pred = predict(ridge_model, s = largestlambda, newx = x.test)
# show the first few predicted values
head(ridge_pred)
##             1
## 346 131.68015
## 85   65.86790
## 214 109.01779
## 35   89.37115
## 309  92.73246
## 398 165.60957
plot(ridge_pred)

# Coefficients
coef(ridge_model)
## 11 x 1 sparse Matrix of class "dgCMatrix"
##                     s0
## (Intercept)  149.99068
## age          -11.33162
## sex         -156.91053
## bmi          374.44939
## map          264.89998
## tc           -31.96990
## ldl          -66.89724
## hdl         -174.01202
## tch          123.97204
## ltg          307.68646
## glu          134.48120
# MSE
mean((ridge_pred - y.test)^2)
## [1] 3070.87
# Standard Error
sd((ridge_pred - y.test)^2)/sqrt(n.test)
## [1] 350.5467
### 5. Lasso model using 10-fold cross-validation
set.seed(1306)
cv.out <- cv.glmnet(x.train, y.train, alpha = 1)
largestlambda = cv.out$lambda.1se
largestlambda
## [1] 4.791278
lasso_model = glmnet(x.train, y.train, alpha = 1, lambda = largestlambda)
# Predict the response for the test set
lasso_pred = predict(lasso_model, s = largestlambda, newx = x.test)
# show the first few predicted values
head(lasso_pred)
##             1
## 346 134.99628
## 85   71.13937
## 214 100.08529
## 35   82.18893
## 309  89.85838
## 398 161.87571
plot(lasso_pred)

# Coefficients
coef(lasso_model)
## 11 x 1 sparse Matrix of class "dgCMatrix"
##                     s0
## (Intercept)  149.95298
## age            .      
## sex         -119.62208
## bmi          501.56473
## map          270.92614
## tc             .      
## ldl            .      
## hdl         -180.29437
## tch            .      
## ltg          390.55001
## glu           16.58881
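# Aside (sketch): because the lasso zeroes out coefficients exactly, the
# retained variables can be listed programmatically.
b <- as.matrix(coef(lasso_model)) # densify the sparse coefficient matrix
rownames(b)[b != 0]               # intercept plus the six retained predictors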
# MSE
mean((lasso_pred - y.test)^2)
## [1] 2920.041
# Standard Error
sd((lasso_pred - y.test)^2)/sqrt(n.test)
## [1] 346.2248
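# Aside (sketch): collecting the test-set results reported above into one
# table makes the comparison explicit (values copied from the outputs above;
# the object name `results` is introduced here).
results <- data.frame(
  model = c("least squares", "best subset (BIC)", "best subset (CV)",
            "ridge (lambda.1se)", "lasso (lambda.1se)"),
  test.MSE = c(3111.265, 3095.483, 3095.483, 3070.870, 2920.041),
  std.err = c(361.0908, 369.7526, 369.7526, 350.5467, 346.2248)
)
results # the lasso attains the lowest test MSE of the five models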