dat <- College
###
fwd <- regsubsets(Outstate ~ ., data=dat, nvmax=17, method="forward") # ivs maxium=17
fwd.sum <- summary(fwd) # all possiblity
which.max(fwd.sum$adjr2) # pick the model by adjr2
## [1] 15
coef(fwd, 15, scale="adjr2") # 看最佳模型的變項係數,按adjr2排
## (Intercept) PrivateYes Apps Accept Enroll
## -1.647456e+03 2.263196e+03 -2.994326e-01 8.022893e-01 -5.366481e-01
## Top10perc F.Undergrad Room.Board Books Personal
## 2.427876e+01 -9.509565e-02 8.836488e-01 -4.637800e-01 -2.267671e-01
## PhD Terminal S.F.Ratio perc.alumni Expend
## 1.134333e+01 2.417116e+01 -4.638875e+01 4.155348e+01 2.009265e-01
## Grad.Rate
## 2.364651e+01
coef(fwd, 17, scale="adjr2") # 全部變項丟入,按adjr2排
## (Intercept) PrivateYes Apps Accept Enroll
## -1.587267e+03 2.263757e+03 -3.034481e-01 8.123743e-01 -5.492393e-01
## Top10perc Top25perc F.Undergrad P.Undergrad Room.Board
## 2.834130e+01 -3.779314e+00 -9.566599e-02 1.166082e-02 8.816138e-01
## Books Personal PhD Terminal S.F.Ratio
## -4.592264e-01 -2.294487e-01 1.124167e+01 2.467266e+01 -4.643932e+01
## perc.alumni Expend Grad.Rate
## 4.179887e+01 1.989838e-01 2.400159e+01
#
mod_15 <- lm(Outstate ~ Private + Apps + Accept + Enroll + Top10perc
+ F.Undergrad + Room.Board + Books + Personal + PhD
+ Terminal + S.F.Ratio + perc.alumni + Expend + Grad.Rate, data = dat)
mod_17 <- lm(Outstate ~ Private + Apps + Accept + Enroll + Top10perc
+ Top25perc + F.Undergrad + P.Undergrad + Room.Board + Books + Personal + PhD
+ Terminal + S.F.Ratio + perc.alumni + Expend + Grad.Rate, data = dat)
#
summary(mod_15)
##
## Call:
## lm(formula = Outstate ~ Private + Apps + Accept + Enroll + Top10perc +
## F.Undergrad + Room.Board + Books + Personal + PhD + Terminal +
## S.F.Ratio + perc.alumni + Expend + Grad.Rate, data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6810.4 -1264.0 -34.9 1238.7 9929.3
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.647e+03 7.525e+02 -2.189 0.028886 *
## PrivateYes 2.263e+03 2.475e+02 9.145 < 2e-16 ***
## Apps -2.994e-01 6.675e-02 -4.486 8.38e-06 ***
## Accept 8.023e-01 1.274e-01 6.298 5.11e-10 ***
## Enroll -5.366e-01 3.512e-01 -1.528 0.126950
## Top10perc 2.428e+01 6.628e+00 3.663 0.000266 ***
## F.Undergrad -9.510e-02 5.930e-02 -1.604 0.109235
## Room.Board 8.837e-01 8.495e-02 10.402 < 2e-16 ***
## Books -4.638e-01 4.472e-01 -1.037 0.300015
## Personal -2.268e-01 1.174e-01 -1.931 0.053828 .
## PhD 1.134e+01 8.706e+00 1.303 0.193007
## Terminal 2.417e+01 9.457e+00 2.556 0.010788 *
## S.F.Ratio -4.639e+01 2.438e+01 -1.902 0.057488 .
## perc.alumni 4.155e+01 7.535e+00 5.515 4.79e-08 ***
## Expend 2.009e-01 2.227e-02 9.023 < 2e-16 ***
## Grad.Rate 2.365e+01 5.428e+00 4.357 1.50e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1955 on 761 degrees of freedom
## Multiple R-squared: 0.7683, Adjusted R-squared: 0.7638
## F-statistic: 168.3 on 15 and 761 DF, p-value: < 2.2e-16
summary(mod_17)
##
## Call:
## lm(formula = Outstate ~ Private + Apps + Accept + Enroll + Top10perc +
## Top25perc + F.Undergrad + P.Undergrad + Room.Board + Books +
## Personal + PhD + Terminal + S.F.Ratio + perc.alumni + Expend +
## Grad.Rate, data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6782.6 -1267.5 -40.9 1244.5 9953.3
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.587e+03 7.660e+02 -2.072 0.03860 *
## PrivateYes 2.264e+03 2.480e+02 9.128 < 2e-16 ***
## Apps -3.034e-01 6.734e-02 -4.506 7.64e-06 ***
## Accept 8.124e-01 1.293e-01 6.286 5.51e-10 ***
## Enroll -5.492e-01 3.541e-01 -1.551 0.12134
## Top10perc 2.834e+01 1.098e+01 2.582 0.01002 *
## Top25perc -3.779e+00 8.475e+00 -0.446 0.65576
## F.Undergrad -9.567e-02 6.152e-02 -1.555 0.12038
## P.Undergrad 1.166e-02 6.049e-02 0.193 0.84720
## Room.Board 8.816e-01 8.558e-02 10.302 < 2e-16 ***
## Books -4.592e-01 4.479e-01 -1.025 0.30551
## Personal -2.294e-01 1.183e-01 -1.940 0.05280 .
## PhD 1.124e+01 8.730e+00 1.288 0.19822
## Terminal 2.467e+01 9.538e+00 2.587 0.00988 **
## S.F.Ratio -4.644e+01 2.441e+01 -1.902 0.05753 .
## perc.alumni 4.180e+01 7.561e+00 5.528 4.45e-08 ***
## Expend 1.990e-01 2.269e-02 8.769 < 2e-16 ***
## Grad.Rate 2.400e+01 5.506e+00 4.359 1.49e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1958 on 759 degrees of freedom
## Multiple R-squared: 0.7684, Adjusted R-squared: 0.7632
## F-statistic: 148.1 on 17 and 759 DF, p-value: < 2.2e-16
anova(mod_15, mod_17)
## Analysis of Variance Table
##
## Model 1: Outstate ~ Private + Apps + Accept + Enroll + Top10perc + F.Undergrad +
## Room.Board + Books + Personal + PhD + Terminal + S.F.Ratio +
## perc.alumni + Expend + Grad.Rate
## Model 2: Outstate ~ Private + Apps + Accept + Enroll + Top10perc + Top25perc +
## F.Undergrad + P.Undergrad + Room.Board + Books + Personal +
## PhD + Terminal + S.F.Ratio + perc.alumni + Expend + Grad.Rate
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 761 2909430692
## 2 759 2908534423 2 896269 0.1169 0.8897
Ans: 1. The final model is included the predictors: Private + Apps + Accept + Enroll + Top10perc + F.Undergrad + Room.Board + Books + Personal + PhD + Terminal + S.F.Ratio + perc.alumni + Expend + Grad.Rate 15 ivs
model_15 adjusted R^2 is 0.7638 model_17 adjusted R^2 is 0.7632
According anova summary, there isn’t significant different in R^2 between these two models.
## Read in your data
College_train <- read.csv("C:/Users/Liz/Desktop/NTNU/bigdata/College_train.csv")
College_test <- read.csv("C:/Users/Liz/Desktop/NTNU/bigdata/College_test.csv")
# ridge models (5 folds CV)
YTrain <- College_train$Outstate
XTrain <- model.matrix( Outstate ~ ., data = College_train)
head(XTrain)
## (Intercept) PrivateYes Apps Accept Enroll Top10perc Top25perc F.Undergrad
## 1 1 0 9478 6312 2194 29 65 15646
## 2 1 1 878 816 200 16 41 706
## 3 1 1 1416 1015 417 10 44 1324
## 4 1 0 1373 1373 724 6 21 2754
## 5 1 1 7875 5062 1492 38 71 5471
## 6 1 1 12394 5232 2464 85 100 9205
## P.Undergrad Room.Board Books Personal PhD Terminal S.F.Ratio perc.alumni
## 1 1829 4180 470 1800 87 89 19.2 10
## 2 62 3363 550 1700 62 68 11.6 29
## 3 117 5800 585 1700 67 78 13.2 23
## 4 474 2660 540 1660 60 68 20.3 29
## 5 1470 6328 700 950 92 93 7.6 15
## 6 531 7270 500 1544 95 96 6.3 38
## Expend Grad.Rate
## 1 7850 59
## 2 7718 48
## 3 9006 75
## 4 4550 52
## 5 14745 72
## 6 25765 93
# 5 folds cv on train dataset to find the best lambda
cv.out.ridge <- cv.glmnet(XTrain, YTrain, alpha = 0, nfolds = 5)
plot(cv.out.ridge)
# the best lambda the glmnet select
cv.out.ridge$lambda.1se
## [1] 2393.902
bestLamdaForRidge <- cv.out.ridge$lambda.1se
# get ridge model based on train dataset
# alpha=0 Ridge
ridge.mod <- glmnet(XTrain, YTrain, alpha = 0)
# coefs of the "best" ridge model in train
predict(ridge.mod, s = bestLamdaForRidge, type = "coefficients")
## 19 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 854.47099755
## (Intercept) .
## PrivateYes 1374.03200439
## Apps 0.02948502
## Accept 0.08313705
## Enroll -0.09411606
## Top10perc 15.46787202
## Top25perc 6.35615393
## F.Undergrad -0.04307018
## P.Undergrad -0.06808726
## Room.Board 0.70410499
## Books -0.07943506
## Personal -0.25562441
## PhD 16.13654391
## Terminal 18.73749507
## S.F.Ratio -93.87835903
## perc.alumni 33.29328020
## Expend 0.13464158
## Grad.Rate 27.95623774
# use the x coefs in "train dataset" to generate predictive for test dataset
Y_hat <- predict(ridge.mod, s = bestLamdaForRidge, newx=model.matrix( Outstate ~ ., data=College_test ) )
# calculate diffference between ridge predictive app vs real apps in test dataset
testMSE_ridgeReg <- mean(( College_test$Outstate - Y_hat )^2 )
testMSE_ridgeReg
## [1] 5054367
# lasso (5 folds CV)
YTrain <- College_train$Outstate
XTrain <- model.matrix(Outstate ~ ., data = College_train)
# apply 5 folds cv on train dataset to find out the best lambda
# alpha=1 Lasso
cv.out.lasso <- cv.glmnet(XTrain, YTrain, alpha = 1, nfolds = 5)
plot(cv.out.lasso)
cv.out.lasso$lambda.1se
## [1] 275.2405
# the best lambda the glmnet select on train dataset with 5 folds cv
bestLamdaForLasso <- cv.out.lasso$lambda.1se
lasso.mod <- glmnet(XTrain, YTrain, alpha = 1)
predict(lasso.mod, s = bestLamdaForRidge, type = "coefficients")
## 19 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 9.088875e+03
## (Intercept) .
## PrivateYes .
## Apps .
## Accept .
## Enroll .
## Top10perc .
## Top25perc .
## F.Undergrad .
## P.Undergrad .
## Room.Board 1.787716e-01
## Books .
## Personal .
## PhD .
## Terminal .
## S.F.Ratio .
## perc.alumni .
## Expend 6.320378e-02
## Grad.Rate .
Y_hat <- predict( lasso.mod, s = bestLamdaForLasso, newx = model.matrix( Outstate ~ ., data = College_test ) )
# calculate diffference between lasso predictive app vs real apps in test dataset
testMSE_lassoReg <- mean(( College_test$Outstate - Y_hat )^2 )
testMSE_lassoReg
## [1] 5111281
## This is prediction function of provided by the ISLR.
## Please use it get predictive values for the stepwise regression model in question B2
# the best stepwise (best subset, nvmax=17)
predict.regsubsets <- function(object,newdata,id){
form=as.formula(object$call[[2]])
mat=model.matrix(form,newdata)
coefi=coef(object,id=id)
xvars=names(coefi)
mat[,xvars]%*%coefi
}
set.seed(1000)
k <- 5
# create
folderIndex <- sample(1:k, nrow(College_train), replace = TRUE)
folderIndex
## [1] 4 3 3 5 3 5 2 5 1 1 5 2 2 4 2 5 3 2 5 2 1 2 4 1 4 3 3 3 3 5 4 3 5 2 5 2 1
## [38] 1 3 5 3 3 4 4 2 2 3 2 4 1 5 3 5 3 2 2 1 3 1 1 1 1 2 1 3 1 2 1 3 5 4 4 1 1
## [75] 4 5 3 1 2 3 5 5 3 1 1 4 1 5 4 1 4 4 1 3 3 2 2 3 5 3 2 1 3 5 3 1 5 3 3 1 1
## [112] 2 4 2 5 5 1 3 1 5 5 4 1 4 2 3 2 2 2 4 5 1 1 4 1 3 3 4 1 5 5 3 4 4 3 2 3 2
## [149] 4 1 5 5 4 4 1 4 4 4 3 2 5 3 3 1 5 1 5 4 5 5 1 4 2 1 1 2 1 1 3 5 4 2 1 2 3
## [186] 2 2 3 3 5 3 5 4 2 4 1 3 4 1 1 5 1 5 1 4 4 2 2 4 2 2 5 3 5 2 3 3 5 4 1 3 4
## [223] 1 5 3 5 4 5 4 4 5 5 5 1 1 4 1 3 3 3 3 1 3 3 2 1 4 5 2 1 2 5 5 5 1 1 4 4 1
## [260] 2 5 3 2 1 2 3 5 5 5 1 4 5 4 3 3 4 5 5 4 3 5 5 2 5 2 5 2 4 3 5 3 5 4 1 3 4
## [297] 4 4 2 3 3 3 3 1 5 3 5 2 1 4 2 4 2 2 5 1 5 3 2 1 3 3 4 2 5 3 4 3 3 3 1 3 1
## [334] 3 3 5 2 1 2 3 4 5 3 1 2 5 4 3 5 3 2 1 5 5 1 3 3 4 3 5 1 4 4 1 2 4 4 3 3 3
## [371] 3 4 3 3 4 4 5 5 5 2 5 1 1 5 4 2 4 5
College_train[folderIndex == 1,] # participants belong to the first fold
## Private Apps Accept Enroll Top10perc Top25perc F.Undergrad P.Undergrad
## 9 Yes 1464 888 176 26 52 624 128
## 10 Yes 1110 930 332 18 36 1603 374
## 21 Yes 2652 1900 484 44 77 1707 44
## 24 No 5803 4441 1730 19 62 10099 3255
## 37 Yes 261 192 111 15 36 453 266
## 38 No 434 412 319 10 30 1376 237
## 50 No 11023 8298 3183 21 54 14861 1310
## 57 Yes 323 278 122 31 51 393 4
## 59 No 11220 7871 3320 43 79 19553 2748
## 60 No 5318 3515 1025 8 29 7626 2091
## 61 No 5165 3887 1561 20 60 8234 2619
## 62 Yes 920 745 347 35 66 1133 42
## 64 Yes 1576 1110 274 24 55 992 112
## 66 Yes 1310 1086 458 26 61 1365 144
## 68 Yes 427 385 143 18 38 581 533
## 73 Yes 1305 1100 334 42 64 1098 151
## 74 Yes 1769 1092 437 41 80 1757 81
## 78 Yes 353 340 103 17 45 416 230
## 84 Yes 372 362 181 15 32 1501 353
## 85 Yes 838 651 159 11 25 654 162
## 87 Yes 2936 2342 669 35 62 2502 66
## 90 Yes 1428 1097 336 22 50 1036 99
## 93 No 2405 2234 1061 10 22 4564 448
## 102 Yes 3713 1237 443 47 83 1971 107
## 106 Yes 1358 1006 274 35 63 1028 13
## 110 Yes 4019 2779 888 40 73 3891 128
## 111 Yes 3877 3156 713 71 93 3051 513
## 117 No 3073 2672 1547 9 29 9649 1792
## 119 Yes 632 620 222 14 24 702 501
## 123 Yes 443 330 151 5 36 453 42
## 132 No 4418 2737 2049 23 51 14047 5134
## 133 Yes 582 498 172 21 44 799 78
## 135 Yes 2895 1249 579 80 96 2195 156
## 139 Yes 344 264 97 11 42 500 331
## 150 No 1576 1326 913 13 50 3689 2200
## 155 Yes 7033 5125 1223 47 75 4941 1534
## 164 Yes 291 245 126 16 49 981 337
## 166 Yes 1006 837 317 33 65 1024 15
## 171 Yes 1083 880 291 13 34 915 80
## 174 Yes 860 811 366 22 56 1040 52
## 175 No 5244 3782 1930 12 37 11561 7443
## 177 Yes 4471 2942 910 29 60 3674 493
## 178 Yes 8256 3750 1522 37 70 5809 4379
## 183 Yes 672 596 278 29 60 1350 275
## 196 No 7693 5815 2328 30 66 12594 3661
## 199 Yes 2432 1730 563 20 63 2578 254
## 200 No 10706 7219 2397 12 37 14826 1979
## 202 Yes 1011 829 410 9 33 1059 2458
## 204 Yes 4095 3079 1195 33 64 5064 660
## 220 Yes 1800 1314 526 47 79 1891 40
## 223 Yes 2774 2092 482 35 64 1763 59
## 234 Yes 824 670 337 15 41 1160 653
## 235 Yes 465 361 176 19 39 879 156
## 237 Yes 1243 1020 414 33 60 2149 418
## 242 No 6502 3539 1372 11 51 7484 1904
## 246 No 2618 2288 1032 42 77 5524 414
## 250 Yes 571 461 174 10 26 645 283
## 255 Yes 2220 1796 467 65 99 1919 334
## 256 Yes 1040 845 286 48 77 967 24
## 259 Yes 1860 767 227 71 93 887 1
## 264 No 1800 1253 560 44 85 3876 3588
## 270 Yes 804 632 281 29 72 840 68
## 294 Yes 1190 978 324 12 30 1280 61
## 304 Yes 3586 2424 730 16 31 2748 1309
## 309 Yes 1151 813 248 40 64 850 80
## 316 Yes 787 562 363 21 55 925 605
## 320 Yes 440 396 221 26 51 910 166
## 331 No 1584 1456 891 6 18 3471 911
## 333 Yes 1650 1471 409 7 21 1803 1116
## 338 No 11651 8683 3023 50 90 18906 3242
## 344 Yes 212 197 91 28 56 471 148
## 352 Yes 2212 1538 408 44 75 1445 1
## 355 Yes 191 165 63 5 25 494 574
## 361 No 787 601 233 40 73 1017 411
## 364 Yes 1160 991 352 19 55 1357 737
## 382 Yes 880 520 224 16 42 1068 364
## 383 Yes 1861 998 359 45 77 1220 46
## Outstate Room.Board Books Personal PhD Terminal S.F.Ratio perc.alumni
## 9 11200 4208 500 1642 80 90 11.1 43
## 10 8180 4270 500 500 65 58 15.2 12
## 21 17080 4440 400 600 73 91 9.9 41
## 24 7248 3109 600 1900 79 91 16.8 7
## 37 9690 4300 500 500 57 77 9.7 35
## 38 4486 2146 600 2000 50 64 16.5 28
## 50 7629 4095 550 2300 79 87 20.4 13
## 57 16150 5450 275 800 63 72 7.2 26
## 59 5697 3600 525 1755 88 95 14.7 22
## 60 6550 4040 550 1230 71 78 18.7 12
## 61 6108 3800 500 1000 64 66 20.6 9
## 62 11090 4700 400 750 80 80 12.0 31
## 64 8000 3700 400 500 51 52 14.1 28
## 66 12250 3530 400 1150 85 87 16.7 35
## 68 12700 5800 450 700 81 85 10.3 37
## 73 16260 4005 300 500 91 91 12.1 40
## 74 10965 4000 450 1250 60 61 14.2 32
## 78 13290 5720 500 1500 90 93 11.5 26
## 84 8600 3550 385 665 48 48 15.4 9
## 85 8640 3700 400 1915 62 62 12.2 13
## 87 15990 4080 600 825 73 78 14.5 31
## 90 11250 3750 400 1165 53 66 12.9 30
## 93 4290 3500 598 1582 55 93 19.4 1
## 102 7000 5565 660 2400 73 80 12.5 18
## 106 15036 4056 600 600 90 94 10.6 46
## 110 13584 5928 630 1278 88 92 13.9 19
## 111 15700 4730 525 1460 95 95 2.9 29
## 117 4300 2643 450 1660 57 68 19.0 11
## 119 9550 3850 350 250 64 84 14.1 18
## 123 14080 6270 500 900 57 80 10.2 21
## 132 4104 3579 450 1700 86 94 22.6 6
## 133 10468 3380 660 1800 40 41 11.5 15
## 135 18345 5995 500 700 94 98 10.6 51
## 139 12600 5520 630 2250 77 80 10.4 7
## 150 3840 2852 200 400 52 54 20.3 9
## 155 19040 5950 350 800 98 98 9.1 21
## 164 8390 4100 350 1500 45 55 21.5 24
## 166 8200 3485 500 1200 84 84 10.6 26
## 171 9270 4100 600 1860 75 82 13.5 27
## 174 9900 3075 300 1800 68 68 14.9 34
## 175 8786 2964 570 1980 79 87 15.9 8
## 177 11584 5986 650 800 83 83 14.1 41
## 178 11000 5160 660 1115 90 95 13.8 10
## 183 11844 3696 450 1146 54 76 11.6 33
## 196 8074 3522 495 2941 84 88 16.9 18
## 199 12370 6800 500 1800 92 92 13.6 25
## 200 7799 3296 470 1750 73 78 17.3 11
## 202 9000 3100 450 1413 77 78 12.4 7
## 204 8490 3320 650 2400 81 93 14.8 23
## 220 19300 5700 750 750 79 91 9.0 51
## 223 15800 4790 450 950 97 98 12.3 21
## 234 9400 3400 500 1100 37 37 8.4 21
## 235 12580 4345 400 970 76 79 13.1 24
## 237 8678 3858 700 1736 82 83 16.2 7
## 242 7844 4108 400 2000 76 79 15.3 16
## 246 8127 3978 900 1200 82 82 17.0 25
## 250 10850 3670 400 1159 58 60 12.8 19
## 255 10320 4762 450 650 68 100 14.2 20
## 256 15747 4062 400 800 88 95 12.7 33
## 259 17000 6010 500 850 99 99 9.6 52
## 264 6634 4360 600 2604 82 85 17.8 14
## 270 10390 4040 525 1345 54 78 12.5 37
## 294 8840 3620 500 1200 57 58 16.2 26
## 304 13250 5420 700 3100 84 92 12.3 23
## 309 15588 6174 500 1200 78 90 9.2 34
## 316 5950 2890 600 1300 67 72 14.6 35
## 320 9420 3730 600 1230 51 56 9.9 46
## 331 5016 3798 540 2256 48 48 28.8 12
## 333 8994 5500 498 2065 74 97 15.4 15
## 338 6680 4060 600 1020 80 89 23.1 15
## 344 13500 5230 400 850 95 98 9.3 37
## 352 19240 3690 750 480 95 95 11.1 46
## 355 11550 4270 300 500 43 77 14.5 8
## 361 5376 3214 600 1100 99 100 13.7 11
## 364 12200 3880 480 930 74 81 17.8 25
## 382 9300 3260 600 900 81 81 11.1 24
## 383 16670 4900 750 800 80 83 10.5 51
## Expend Grad.Rate
## 9 8317 51
## 10 5664 29
## 21 11711 76
## 24 6227 62
## 37 9337 71
## 38 4525 46
## 50 8811 64
## 57 15622 64
## 59 7881 63
## 60 7511 42
## 61 5063 57
## 62 12639 79
## 64 5807 51
## 66 7215 81
## 68 11758 84
## 73 10162 86
## 74 8294 98
## 78 8861 63
## 84 10938 49
## 85 7634 48
## 87 9979 83
## 90 8735 54
## 93 5967 35
## 102 9988 65
## 106 14634 78
## 110 10872 100
## 111 19733 67
## 117 5801 68
## 119 5922 58
## 123 15387 46
## 132 5657 35
## 133 8991 52
## 135 21409 91
## 139 9773 43
## 150 4172 100
## 155 16920 74
## 164 4607 62
## 166 9248 64
## 171 8425 55
## 174 6357 68
## 175 8094 38
## 177 9131 92
## 178 10059 62
## 183 8996 72
## 196 8246 63
## 199 10062 79
## 200 6086 56
## 202 11178 42
## 204 9158 64
## 220 18359 84
## 223 12999 69
## 234 5352 59
## 235 10889 74
## 237 7651 72
## 242 6773 52
## 246 7473 65
## 250 7505 56
## 255 7788 65
## 256 13224 79
## 259 18443 87
## 264 6104 47
## 270 11751 60
## 294 6563 63
## 304 11299 70
## 309 16623 77
## 316 5177 53
## 320 10270 72
## 331 3871 59
## 333 8409 59
## 338 7250 58
## 344 16095 52
## 352 14067 88
## 355 9209 40
## 361 9241 34
## 364 7666 79
## 382 8129 63
## 383 13198 72
head(College_train[folderIndex != 1,]) #participants in other 9 folds
## Private Apps Accept Enroll Top10perc Top25perc F.Undergrad P.Undergrad
## 1 No 9478 6312 2194 29 65 15646 1829
## 2 Yes 878 816 200 16 41 706 62
## 3 Yes 1416 1015 417 10 44 1324 117
## 4 No 1373 1373 724 6 21 2754 474
## 5 Yes 7875 5062 1492 38 71 5471 1470
## 6 Yes 12394 5232 2464 85 100 9205 531
## Outstate Room.Board Books Personal PhD Terminal S.F.Ratio perc.alumni Expend
## 1 8412 4180 470 1800 87 89 19.2 10 7850
## 2 8740 3363 550 1700 62 68 11.6 29 7718
## 3 13500 5800 585 1700 67 78 13.2 23 9006
## 4 2700 2660 540 1660 60 68 20.3 29 4550
## 5 17450 6328 700 950 92 93 7.6 15 14745
## 6 17020 7270 500 1544 95 96 6.3 38 25765
## Grad.Rate
## 1 59
## 2 48
## 3 75
## 4 52
## 5 72
## 6 93
# 1 DV plus 17 IVs
cv.errors <- matrix(NA, k, 17)
cv.errors
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13] [,14]
## [1,] NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## [2,] NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## [3,] NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## [4,] NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## [5,] NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## [,15] [,16] [,17]
## [1,] NA NA NA
## [2,] NA NA NA
## [3,] NA NA NA
## [4,] NA NA NA
## [5,] NA NA NA
for(j in 1:k){
best.fit <- regsubsets(Outstate~., data = College_train[folderIndex != j,], nvmax=17)
for(i in 1:17){
pred <- predict.regsubsets(best.fit,College_train[folderIndex == j,], id = i) ## get the predictive Apps in the jth folder
cv.errors[j,i]=mean((College_train$Outstate[folderIndex == j]-pred)^2)
}
}
cv.errors
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9]
## [1,] 9697502 6342883 4753765 4292914 4091951 3800213 3834508 3533049 3693008
## [2,] 8107827 4857563 3823296 2768955 2801565 2692831 3366359 3008167 3380385
## [3,] 10088670 5507156 4983331 4635631 4312491 4133863 4469576 3979953 4149410
## [4,] 6816368 4876174 4455358 4096383 3699363 3381773 3278692 2991075 3065459
## [5,] 7441235 5347854 4621362 3997176 3815050 3678282 3850257 3691659 3557180
## [,10] [,11] [,12] [,13] [,14] [,15] [,16] [,17]
## [1,] 3591618 3475598 3455090 3460324 3448236 3433389 3405761 3380263
## [2,] 3451938 3331738 3506112 3619378 3572031 3478425 3495297 3515661
## [3,] 4345780 4260224 4348491 4241606 4216893 4194699 4182582 4189057
## [4,] 2987812 2950824 3008219 2970785 2956009 2909386 2946647 2945616
## [5,] 3420744 3639952 3633554 3528493 3561300 3470112 3465991 3450164
colMeans(cv.errors)
## [1] 8430320 5386326 4527422 3958212 3744084 3537392 3759878 3440780 3569088
## [10] 3559578 3531667 3590293 3564117 3550894 3497202 3499256 3496152
which.min(colMeans(cv.errors))
## [1] 8
# the stepwise regression model use for prediction
coef(best.fit, which.min(colMeans(cv.errors)))
## (Intercept) PrivateYes Accept Enroll Room.Board
## -3110.3361784 2202.7266033 0.7410086 -1.7305874 0.8725575
## PhD perc.alumni Expend Grad.Rate
## 29.8884039 46.9684580 0.2725747 32.7786934
# get the predictive Apps in the "test dataset"
pred <- predict.regsubsets(best.fit, College_test, id = which.min(colMeans(cv.errors)))
# calculate diffference between stepwise predictive app vs real apps
testMSE_stepWiseReg <- mean((College_test$Outstate - pred)^2)
testMSE_stepWiseReg
## [1] 5283737
# Get the coefs of the typical OLS regression with all other variables
typicalOLSRes <- lm(Outstate ~ ., data = College_train)
summary(typicalOLSRes)
##
## Call:
## lm(formula = Outstate ~ ., data = College_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4548 -1179 -93 1209 4806
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -949.21776 1007.55500 -0.942 0.346756
## PrivateYes 1647.99998 323.08084 5.101 5.41e-07 ***
## Apps -0.36630 0.10676 -3.431 0.000669 ***
## Accept 1.07559 0.18860 5.703 2.41e-08 ***
## Enroll -0.53925 0.51452 -1.048 0.295295
## Top10perc 35.89413 15.27295 2.350 0.019289 *
## Top25perc -10.45250 10.79711 -0.968 0.333635
## F.Undergrad -0.19098 0.08250 -2.315 0.021162 *
## P.Undergrad 0.10013 0.09014 1.111 0.267335
## Room.Board 0.91762 0.11568 7.933 2.57e-14 ***
## Books -0.33147 0.66730 -0.497 0.619674
## Personal -0.19750 0.15265 -1.294 0.196541
## PhD 16.12634 11.75443 1.372 0.170914
## Terminal 11.12688 12.98403 0.857 0.392018
## S.F.Ratio -59.47601 30.06077 -1.979 0.048611 *
## perc.alumni 38.21392 10.24325 3.731 0.000221 ***
## Expend 0.22741 0.03150 7.219 3.00e-12 ***
## Grad.Rate 30.26752 8.05945 3.756 0.000201 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1773 on 370 degrees of freedom
## Multiple R-squared: 0.805, Adjusted R-squared: 0.796
## F-statistic: 89.85 on 17 and 370 DF, p-value: < 2.2e-16
# all other variables coefs of the typical OLS regression in train
coef(typicalOLSRes)
## (Intercept) PrivateYes Apps Accept Enroll Top10perc
## -949.2177620 1647.9999839 -0.3662995 1.0755873 -0.5392495 35.8941306
## Top25perc F.Undergrad P.Undergrad Room.Board Books Personal
## -10.4525000 -0.1909845 0.1001341 0.9176215 -0.3314662 -0.1975002
## PhD Terminal S.F.Ratio perc.alumni Expend Grad.Rate
## 16.1263412 11.1268770 -59.4760093 38.2139187 0.2274092 30.2675194
Y_hat <- predict(typicalOLSRes, newdata = College_test)
# get test MSE for typical OLS regression
testMSE_typicalOLS <- mean((College_test$Outstate - Y_hat )^2 )
testMSE_typicalOLS
## [1] 4744862
testMSE_typicalOLS
## [1] 4744862
testMSE_ridgeReg
## [1] 5054367
testMSE_lassoReg
## [1] 5111281
testMSE_stepWiseReg
## [1] 5283737
Ans: 1. testMSE_typicalOLS 4744862 testMSE_ridgeReg 5104067 testMSE_lassoReg 4757973 testMSE_stepWiseReg 5283737
coef(typicalOLSRes)
## (Intercept) PrivateYes Apps Accept Enroll Top10perc
## -949.2177620 1647.9999839 -0.3662995 1.0755873 -0.5392495 35.8941306
## Top25perc F.Undergrad P.Undergrad Room.Board Books Personal
## -10.4525000 -0.1909845 0.1001341 0.9176215 -0.3314662 -0.1975002
## PhD Terminal S.F.Ratio perc.alumni Expend Grad.Rate
## 16.1263412 11.1268770 -59.4760093 38.2139187 0.2274092 30.2675194
plot(College$Expend, College$Outstate)
OLSRes <- lm(Outstate ~ Expend, data = College_train)
summary(OLSRes)
##
## Call:
## lm(formula = Outstate ~ Expend, data = College_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13667.1 -1749.8 71.2 1775.6 6624.2
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.102e+03 2.986e+02 17.09 <2e-16 ***
## Expend 5.601e-01 2.761e-02 20.29 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2736 on 386 degrees of freedom
## Multiple R-squared: 0.516, Adjusted R-squared: 0.5147
## F-statistic: 411.5 on 1 and 386 DF, p-value: < 2.2e-16
coef(typicalOLSRes)
## (Intercept) PrivateYes Apps Accept Enroll Top10perc
## -949.2177620 1647.9999839 -0.3662995 1.0755873 -0.5392495 35.8941306
## Top25perc F.Undergrad P.Undergrad Room.Board Books Personal
## -10.4525000 -0.1909845 0.1001341 0.9176215 -0.3314662 -0.1975002
## PhD Terminal S.F.Ratio perc.alumni Expend Grad.Rate
## 16.1263412 11.1268770 -59.4760093 38.2139187 0.2274092 30.2675194
y_hat <- predict(OLSRes, newdata = College_test)
# get test MSE for typical OLS regression
testMSE_OLS <- mean((College_test$Outstate - Y_hat )^2 )
testMSE_OLS
## [1] 4744862
# Polynomial model
# max degree to test is 5
maxDegree <- 5
# 5 folds cv
k <- 5
set.seed(1000)
foldIndex <- sample( 1:k, nrow(College_train), replace=TRUE)
foldIndex
## [1] 4 3 3 5 3 5 2 5 1 1 5 2 2 4 2 5 3 2 5 2 1 2 4 1 4 3 3 3 3 5 4 3 5 2 5 2 1
## [38] 1 3 5 3 3 4 4 2 2 3 2 4 1 5 3 5 3 2 2 1 3 1 1 1 1 2 1 3 1 2 1 3 5 4 4 1 1
## [75] 4 5 3 1 2 3 5 5 3 1 1 4 1 5 4 1 4 4 1 3 3 2 2 3 5 3 2 1 3 5 3 1 5 3 3 1 1
## [112] 2 4 2 5 5 1 3 1 5 5 4 1 4 2 3 2 2 2 4 5 1 1 4 1 3 3 4 1 5 5 3 4 4 3 2 3 2
## [149] 4 1 5 5 4 4 1 4 4 4 3 2 5 3 3 1 5 1 5 4 5 5 1 4 2 1 1 2 1 1 3 5 4 2 1 2 3
## [186] 2 2 3 3 5 3 5 4 2 4 1 3 4 1 1 5 1 5 1 4 4 2 2 4 2 2 5 3 5 2 3 3 5 4 1 3 4
## [223] 1 5 3 5 4 5 4 4 5 5 5 1 1 4 1 3 3 3 3 1 3 3 2 1 4 5 2 1 2 5 5 5 1 1 4 4 1
## [260] 2 5 3 2 1 2 3 5 5 5 1 4 5 4 3 3 4 5 5 4 3 5 5 2 5 2 5 2 4 3 5 3 5 4 1 3 4
## [297] 4 4 2 3 3 3 3 1 5 3 5 2 1 4 2 4 2 2 5 1 5 3 2 1 3 3 4 2 5 3 4 3 3 3 1 3 1
## [334] 3 3 5 2 1 2 3 4 5 3 1 2 5 4 3 5 3 2 1 5 5 1 3 3 4 3 5 1 4 4 1 2 4 4 3 3 3
## [371] 3 4 3 3 4 4 5 5 5 2 5 1 1 5 4 2 4 5
cv.mse.train.poly <- matrix(NA, k, maxDegree)
for( j in 1:maxDegree){
for( i in 1:k ){
fit <- lm( Outstate ~ poly(Expend,j), data = College_train[foldIndex != i,] )
y_hat = predict(fit, newdata=College_train[foldIndex == i,] )
cv.mse.train.poly[i,j] = mean((College_train[foldIndex == i,]$Outstate - y_hat )^2)
}
}
cv.mse.train.poly
## [,1] [,2] [,3] [,4] [,5]
## [1,] 6574163 4842551 4837942 4821256 4791332
## [2,] 8107827 7439765 7457389 7437216 7227487
## [3,] 10088670 4821453 4904684 5536038 5612567
## [4,] 6816368 5786950 5800577 5783494 5794133
## [5,] 7441235 6077416 6069416 6067771 5847808
colMeans(cv.mse.train.poly)
## [1] 7805652 5793627 5814002 5929155 5854665
which.min(colMeans(cv.mse.train.poly))
## [1] 2
# 2 is the best for poly degree 1 -5, according to the 5 folds CV
# get the predictive Apps in the "test dataset"
fitPoly <- lm(Outstate ~ poly(Expend,2), data=College_train)
y_hatPoly <- predict(fitPoly, newdata = College_test)
#pred <- predict(fit, College_test, id = which.min(colMeans(cv.mse.train)))
# MSE
testMSE_poly <- mean((College_test$Outstate - y_hatPoly)^2)
testMSE_poly
## [1] 7781220
# Cubic splines
# set the df ranges from 1:10 when tuning
df_choices = c(1:10)
length(df_choices)
## [1] 10
# 5 folds cv
k <- 5
set.seed(1000)
foldIndex <- sample( 1:k, nrow(College_train), replace=TRUE)
foldIndex
## [1] 4 3 3 5 3 5 2 5 1 1 5 2 2 4 2 5 3 2 5 2 1 2 4 1 4 3 3 3 3 5 4 3 5 2 5 2 1
## [38] 1 3 5 3 3 4 4 2 2 3 2 4 1 5 3 5 3 2 2 1 3 1 1 1 1 2 1 3 1 2 1 3 5 4 4 1 1
## [75] 4 5 3 1 2 3 5 5 3 1 1 4 1 5 4 1 4 4 1 3 3 2 2 3 5 3 2 1 3 5 3 1 5 3 3 1 1
## [112] 2 4 2 5 5 1 3 1 5 5 4 1 4 2 3 2 2 2 4 5 1 1 4 1 3 3 4 1 5 5 3 4 4 3 2 3 2
## [149] 4 1 5 5 4 4 1 4 4 4 3 2 5 3 3 1 5 1 5 4 5 5 1 4 2 1 1 2 1 1 3 5 4 2 1 2 3
## [186] 2 2 3 3 5 3 5 4 2 4 1 3 4 1 1 5 1 5 1 4 4 2 2 4 2 2 5 3 5 2 3 3 5 4 1 3 4
## [223] 1 5 3 5 4 5 4 4 5 5 5 1 1 4 1 3 3 3 3 1 3 3 2 1 4 5 2 1 2 5 5 5 1 1 4 4 1
## [260] 2 5 3 2 1 2 3 5 5 5 1 4 5 4 3 3 4 5 5 4 3 5 5 2 5 2 5 2 4 3 5 3 5 4 1 3 4
## [297] 4 4 2 3 3 3 3 1 5 3 5 2 1 4 2 4 2 2 5 1 5 3 2 1 3 3 4 2 5 3 4 3 3 3 1 3 1
## [334] 3 3 5 2 1 2 3 4 5 3 1 2 5 4 3 5 3 2 1 5 5 1 3 3 4 3 5 1 4 4 1 2 4 4 3 3 3
## [371] 3 4 3 3 4 4 5 5 5 2 5 1 1 5 4 2 4 5
cv.mse.train.cubic <- matrix(NA, k, length(df_choices))
cv.mse.train.cubic
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
## [1,] NA NA NA NA NA NA NA NA NA NA
## [2,] NA NA NA NA NA NA NA NA NA NA
## [3,] NA NA NA NA NA NA NA NA NA NA
## [4,] NA NA NA NA NA NA NA NA NA NA
## [5,] NA NA NA NA NA NA NA NA NA NA
for( j in 1:length(df_choices)){
for( i in 1:k){
fit <- lm(Outstate ~ bs(Expend,df=df_choices[j]), data=College_train[foldIndex != i,] )
y_hat <- predict(fit, newdata=College_train[foldIndex == i,])
cv.mse.train.cubic[i,j] <- mean(( College_train[foldIndex == i,]$Outstate - y_hat )^2)
}
}
## Warning in bs(Expend, df = df_choices[j]): 'df' 太小了;用過的值為 3
## Warning in bs(Expend, df = df_choices[j]): 'df' 太小了;用過的值為 3
## Warning in bs(Expend, degree = 3L, knots = numeric(0), Boundary.knots =
## c(3605L, : 一些在結值界外的 'x' 資料有可能會引起病態底數
## Warning in bs(Expend, df = df_choices[j]): 'df' 太小了;用過的值為 3
## Warning in bs(Expend, degree = 3L, knots = numeric(0), Boundary.knots =
## c(3186L, : 一些在結值界外的 'x' 資料有可能會引起病態底數
## Warning in bs(Expend, df = df_choices[j]): 'df' 太小了;用過的值為 3
## Warning in bs(Expend, df = df_choices[j]): 'df' 太小了;用過的值為 3
## Warning in bs(Expend, df = df_choices[j]): 'df' 太小了;用過的值為 3
## Warning in bs(Expend, df = df_choices[j]): 'df' 太小了;用過的值為 3
## Warning in bs(Expend, degree = 3L, knots = numeric(0), Boundary.knots =
## c(3605L, : 一些在結值界外的 'x' 資料有可能會引起病態底數
## Warning in bs(Expend, df = df_choices[j]): 'df' 太小了;用過的值為 3
## Warning in bs(Expend, degree = 3L, knots = numeric(0), Boundary.knots =
## c(3186L, : 一些在結值界外的 'x' 資料有可能會引起病態底數
## Warning in bs(Expend, df = df_choices[j]): 'df' 太小了;用過的值為 3
## Warning in bs(Expend, df = df_choices[j]): 'df' 太小了;用過的值為 3
## Warning in bs(Expend, degree = 3L, knots = numeric(0), Boundary.knots =
## c(3605L, : 一些在結值界外的 'x' 資料有可能會引起病態底數
## Warning in bs(Expend, degree = 3L, knots = numeric(0), Boundary.knots =
## c(3186L, : 一些在結值界外的 'x' 資料有可能會引起病態底數
## Warning in bs(Expend, degree = 3L, knots = 8366, Boundary.knots = c(3605L, :
## 一些在結值界外的 'x' 資料有可能會引起病態底數
## Warning in bs(Expend, degree = 3L, knots = 8270, Boundary.knots = c(3186L, :
## 一些在結值界外的 'x' 資料有可能會引起病態底數
## Warning in bs(Expend, degree = 3L, knots = c(7500.33333333333,
## 9572.66666666667: 一些在結值界外的 'x' 資料有可能會引起病態底數
## Warning in bs(Expend, degree = 3L, knots = c(7347.66666666667,
## 9579.33333333333: 一些在結值界外的 'x' 資料有可能會引起病態底數
## Warning in bs(Expend, degree = 3L, knots = c(6876, 8366, 10876.25),
## Boundary.knots = c(3605L, : 一些在結值界外的 'x' 資料有可能會引起病態底數
## Warning in bs(Expend, degree = 3L, knots = c(6720.5, 8270, 10550.5),
## Boundary.knots = c(3186L, : 一些在結值界外的 'x' 資料有可能會引起病態底數
## Warning in bs(Expend, degree = 3L, knots = c(6347.4, 7837.6, 9105.8, 11753.8:
## 一些在結值界外的 'x' 資料有可能會引起病態底數
## Warning in bs(Expend, degree = 3L, knots = c(6122, 7703, 8960, 11525),
## Boundary.knots = c(3186L, : 一些在結值界外的 'x' 資料有可能會引起病態底數
## Warning in bs(Expend, degree = 3L, knots = c(6101, 7500.33333333333, 8366, :
## 一些在結值界外的 'x' 資料有可能會引起病態底數
## Warning in bs(Expend, degree = 3L, knots = c(5816, 7347.66666666667, 8270, :
## 一些在結值界外的 'x' 資料有可能會引起病態底數
## Warning in bs(Expend, degree = 3L, knots = c(5888.57142857143,
## 7148.57142857143, : 一些在結值界外的 'x' 資料有可能會引起病態底數
## Warning in bs(Expend, degree = 3L, knots = c(5669.71428571429,
## 7012.42857142857, : 一些在結值界外的 'x' 資料有可能會引起病態底數
## Warning in bs(Expend, degree = 3L, knots = c(5713.5, 6876, 7685.25, 8366, :
## 一些在結值界外的 'x' 資料有可能會引起病態底數
## Warning in bs(Expend, degree = 3L, knots = c(5530.5, 6720.5, 7573.5, 8270, :
## 一些在結值界外的 'x' 資料有可能會引起病態底數
cv.mse.train.cubic
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9]
## [1,] 4837942 4837942 4837942 4829896 4909695 4924147 4918481 4921615 4963863
## [2,] 7457389 7457389 7457389 7175432 7124975 7254694 7276626 7281805 7339847
## [3,] 4904684 4904684 4904684 5127335 5033336 5087205 5211154 5258377 5236154
## [4,] 5800577 5800577 5800577 5711656 5632816 5628598 5692274 5717126 5726293
## [5,] 6069416 6069416 6069416 5933831 5862896 5860455 5930381 5916367 5954576
## [,10]
## [1,] 4955096
## [2,] 7425916
## [3,] 5220681
## [4,] 5830352
## [5,] 6076524
colMeans(cv.mse.train.cubic)
## [1] 5814002 5814002 5814002 5755630 5712744 5751020 5805783 5819058 5844146
## [10] 5901714
which.min(colMeans(cv.mse.train.cubic))
## [1] 5
# The 5th column has the samllest average MSE
# that best df for spline is 5
# get the predictive Apps in the "test dataset"
fitCubic <- lm(Outstate ~ bs(Expend,5), data=College_train)
y_hatCubic <- predict(fitCubic, newdata = College_test)
## Warning in bs(Expend, degree = 3L, knots = c(7355, 9552), Boundary.knots =
## c(3186L, : 一些在結值界外的 'x' 資料有可能會引起病態底數
#pred <- predict(fit, College_test, id = which.min(colMeans(cv.mse.train)))
# MSE
testMSE_cubic <- mean((College_test$Outstate - y_hatCubic)^2)
testMSE_cubic
## [1] 7113121
# Natural splines
# df ranges from 1:10
df_choices = c(1:10)
length(df_choices)
## [1] 10
# 5 folds cv
k <- 5
set.seed(1000)
foldIndex <- sample( 1:k, nrow(College_train), replace=TRUE)
foldIndex
## [1] 4 3 3 5 3 5 2 5 1 1 5 2 2 4 2 5 3 2 5 2 1 2 4 1 4 3 3 3 3 5 4 3 5 2 5 2 1
## [38] 1 3 5 3 3 4 4 2 2 3 2 4 1 5 3 5 3 2 2 1 3 1 1 1 1 2 1 3 1 2 1 3 5 4 4 1 1
## [75] 4 5 3 1 2 3 5 5 3 1 1 4 1 5 4 1 4 4 1 3 3 2 2 3 5 3 2 1 3 5 3 1 5 3 3 1 1
## [112] 2 4 2 5 5 1 3 1 5 5 4 1 4 2 3 2 2 2 4 5 1 1 4 1 3 3 4 1 5 5 3 4 4 3 2 3 2
## [149] 4 1 5 5 4 4 1 4 4 4 3 2 5 3 3 1 5 1 5 4 5 5 1 4 2 1 1 2 1 1 3 5 4 2 1 2 3
## [186] 2 2 3 3 5 3 5 4 2 4 1 3 4 1 1 5 1 5 1 4 4 2 2 4 2 2 5 3 5 2 3 3 5 4 1 3 4
## [223] 1 5 3 5 4 5 4 4 5 5 5 1 1 4 1 3 3 3 3 1 3 3 2 1 4 5 2 1 2 5 5 5 1 1 4 4 1
## [260] 2 5 3 2 1 2 3 5 5 5 1 4 5 4 3 3 4 5 5 4 3 5 5 2 5 2 5 2 4 3 5 3 5 4 1 3 4
## [297] 4 4 2 3 3 3 3 1 5 3 5 2 1 4 2 4 2 2 5 1 5 3 2 1 3 3 4 2 5 3 4 3 3 3 1 3 1
## [334] 3 3 5 2 1 2 3 4 5 3 1 2 5 4 3 5 3 2 1 5 5 1 3 3 4 3 5 1 4 4 1 2 4 4 3 3 3
## [371] 3 4 3 3 4 4 5 5 5 2 5 1 1 5 4 2 4 5
cv.mse.train.natur <- matrix(NA, k, length(df_choices))
cv.mse.train.natur
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
## [1,] NA NA NA NA NA NA NA NA NA NA
## [2,] NA NA NA NA NA NA NA NA NA NA
## [3,] NA NA NA NA NA NA NA NA NA NA
## [4,] NA NA NA NA NA NA NA NA NA NA
## [5,] NA NA NA NA NA NA NA NA NA NA
for( j in 1:length(df_choices)){
for( i in 1:k){
fit <- lm(Outstate ~ ns(Expend,df=df_choices[j]), data=College_train[foldIndex != i,] )
y_hat <- predict(fit, newdata=College_train[foldIndex == i,])
cv.mse.train.natur[i,j] <- mean(( College_train[foldIndex == i,]$Outstate - y_hat )^2)
}
}
cv.mse.train.natur
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9]
## [1,] 6574163 4836955 4803385 4945813 4932252 4926295 4981080 4975324 4937407
## [2,] 8107827 7418312 7325599 7194780 7229803 7247597 7344969 7425528 7225903
## [3,] 10088670 4919883 4828330 4888769 4942423 4922950 4880645 4990422 5033642
## [4,] 6816368 5803239 5772372 5714328 5696174 5692181 5639416 5680827 5783956
## [5,] 7441235 5963822 6007444 5916316 5883535 5862531 5922803 6009312 5823795
## [,10]
## [1,] 5101243
## [2,] 7163741
## [3,] 4990174
## [4,] 5748198
## [5,] 5921639
colMeans(cv.mse.train.natur)
## [1] 7805652 5788442 5747426 5732001 5736837 5730311 5753783 5816283 5760941
## [10] 5784999
which.min(colMeans(cv.mse.train.natur))
## [1] 6
# The 6th column has the samllest average MSE
# that best df for spline is 6
# get the predictive Apps in the "test dataset"
fitNatur <- lm(Outstate ~ ns(Expend,6), data=College_train)
y_hatNatur <- predict(fitNatur, newdata = College_test)
#pred <- predict(fit, College_test, id = which.min(colMeans(cv.mse.train)))
# MSE
testMSE_natur <- mean((College_test$Outstate - y_hatNatur)^2)
testMSE_natur
## [1] 7110048
testMSE_OLS
## [1] 4744862
testMSE_poly
## [1] 7781220
testMSE_cubic
## [1] 7113121
testMSE_natur
## [1] 7110048
summary(OLSRes)
##
## Call:
## lm(formula = Outstate ~ Expend, data = College_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13667.1 -1749.8 71.2 1775.6 6624.2
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.102e+03 2.986e+02 17.09 <2e-16 ***
## Expend 5.601e-01 2.761e-02 20.29 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2736 on 386 degrees of freedom
## Multiple R-squared: 0.516, Adjusted R-squared: 0.5147
## F-statistic: 411.5 on 1 and 386 DF, p-value: < 2.2e-16
summary(fitNatur)
##
## Call:
## lm(formula = Outstate ~ ns(Expend, 6), data = College_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9355.8 -1423.6 219.2 1628.3 5845.9
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6530.5 944.6 6.913 2.00e-11 ***
## ns(Expend, 6)1 2388.0 923.3 2.586 0.01007 *
## ns(Expend, 6)2 3516.8 1098.9 3.200 0.00149 **
## ns(Expend, 6)3 5218.4 1010.1 5.166 3.86e-07 ***
## ns(Expend, 6)4 14957.1 1104.1 13.547 < 2e-16 ***
## ns(Expend, 6)5 12209.8 2323.3 5.255 2.47e-07 ***
## ns(Expend, 6)6 11155.6 1511.7 7.380 1.00e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2377 on 381 degrees of freedom
## Multiple R-squared: 0.6393, Adjusted R-squared: 0.6336
## F-statistic: 112.6 on 6 and 381 DF, p-value: < 2.2e-16
anova(OLSRes, fitNatur)
## Analysis of Variance Table
##
## Model 1: Outstate ~ Expend
## Model 2: Outstate ~ ns(Expend, 6)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 386 2888442532
## 2 381 2152487660 5 735954872 26.053 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Natural splines the smallest mse
Ans: testMSE_OLS 10352013 testMSE_poly 7781220 testMSE_cubic 7113121 testMSE_natur 7110048 1.nature spline mse is the smallest. nature spline Adjusted R-squared: 0.6336 OLS regression Adjusted R-squared: 0.5147 According ANOVA table, there is significant different in R^2 between these two models. We can also read the scatter plot. 2.Yes, I think the relation between Outstate and Expend is non-linear. 3.The nature spline model.
gam1 <- lm(Outstate ~ ns(Expend, 6)
+ Private + Apps + Accept + Enroll + Top10perc + Top25perc
+ F.Undergrad + P.Undergrad + Room.Board + Books + Personal + PhD
+ Terminal + S.F.Ratio + perc.alumni + Grad.Rate, data = College)
# the typical OLS regression in College
typicalOLSRes <- lm(Outstate ~ ., data = College)
summary(gam1)
##
## Call:
## lm(formula = Outstate ~ ns(Expend, 6) + Private + Apps + Accept +
## Enroll + Top10perc + Top25perc + F.Undergrad + P.Undergrad +
## Room.Board + Books + Personal + PhD + Terminal + S.F.Ratio +
## perc.alumni + Grad.Rate, data = College)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6350.2 -1081.0 79.8 1257.7 8073.8
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.824e+02 9.488e+02 -0.298 0.766087
## ns(Expend, 6)1 1.112e+03 5.780e+02 1.923 0.054802 .
## ns(Expend, 6)2 1.145e+03 7.025e+02 1.631 0.103391
## ns(Expend, 6)3 2.895e+03 6.639e+02 4.360 1.48e-05 ***
## ns(Expend, 6)4 9.869e+03 1.010e+03 9.768 < 2e-16 ***
## ns(Expend, 6)5 7.176e+03 1.614e+03 4.445 1.01e-05 ***
## ns(Expend, 6)6 4.721e+03 1.365e+03 3.459 0.000572 ***
## PrivateYes 2.126e+03 2.427e+02 8.758 < 2e-16 ***
## Apps -2.386e-01 6.424e-02 -3.714 0.000219 ***
## Accept 6.492e-01 1.241e-01 5.233 2.16e-07 ***
## Enroll -3.741e-01 3.345e-01 -1.118 0.263723
## Top10perc 1.546e+01 1.057e+01 1.463 0.143831
## Top25perc -3.014e+00 8.032e+00 -0.375 0.707617
## F.Undergrad -1.099e-01 5.822e-02 -1.888 0.059379 .
## P.Undergrad 7.745e-03 5.691e-02 0.136 0.891782
## Room.Board 7.022e-01 8.258e-02 8.503 < 2e-16 ***
## Books -9.213e-01 4.234e-01 -2.176 0.029872 *
## Personal -1.743e-01 1.112e-01 -1.567 0.117463
## PhD 5.103e+00 8.237e+00 0.619 0.535810
## Terminal 1.619e+01 9.027e+00 1.794 0.073283 .
## S.F.Ratio 2.708e+01 2.479e+01 1.092 0.275043
## perc.alumni 3.448e+01 7.133e+00 4.833 1.63e-06 ***
## Grad.Rate 2.839e+01 5.194e+00 5.466 6.28e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1837 on 754 degrees of freedom
## Multiple R-squared: 0.7975, Adjusted R-squared: 0.7916
## F-statistic: 134.9 on 22 and 754 DF, p-value: < 2.2e-16
summary(typicalOLSRes)
##
## Call:
## lm(formula = Outstate ~ ., data = College)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6782.6 -1267.5 -40.9 1244.5 9953.3
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.587e+03 7.660e+02 -2.072 0.03860 *
## PrivateYes 2.264e+03 2.480e+02 9.128 < 2e-16 ***
## Apps -3.034e-01 6.734e-02 -4.506 7.64e-06 ***
## Accept 8.124e-01 1.293e-01 6.286 5.51e-10 ***
## Enroll -5.492e-01 3.541e-01 -1.551 0.12134
## Top10perc 2.834e+01 1.098e+01 2.582 0.01002 *
## Top25perc -3.779e+00 8.475e+00 -0.446 0.65576
## F.Undergrad -9.567e-02 6.152e-02 -1.555 0.12038
## P.Undergrad 1.166e-02 6.049e-02 0.193 0.84720
## Room.Board 8.816e-01 8.558e-02 10.302 < 2e-16 ***
## Books -4.592e-01 4.479e-01 -1.025 0.30551
## Personal -2.294e-01 1.183e-01 -1.940 0.05280 .
## PhD 1.124e+01 8.730e+00 1.288 0.19822
## Terminal 2.467e+01 9.538e+00 2.587 0.00988 **
## S.F.Ratio -4.644e+01 2.441e+01 -1.902 0.05753 .
## perc.alumni 4.180e+01 7.561e+00 5.528 4.45e-08 ***
## Expend 1.990e-01 2.269e-02 8.769 < 2e-16 ***
## Grad.Rate 2.400e+01 5.506e+00 4.359 1.49e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1958 on 759 degrees of freedom
## Multiple R-squared: 0.7684, Adjusted R-squared: 0.7632
## F-statistic: 148.1 on 17 and 759 DF, p-value: < 2.2e-16
anova(gam1, typicalOLSRes)
## Analysis of Variance Table
##
## Model 1: Outstate ~ ns(Expend, 6) + Private + Apps + Accept + Enroll +
## Top10perc + Top25perc + F.Undergrad + P.Undergrad + Room.Board +
## Books + Personal + PhD + Terminal + S.F.Ratio + perc.alumni +
## Grad.Rate
## Model 2: Outstate ~ Private + Apps + Accept + Enroll + Top10perc + Top25perc +
## F.Undergrad + P.Undergrad + Room.Board + Books + Personal +
## PhD + Terminal + S.F.Ratio + perc.alumni + Expend + Grad.Rate
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 754 2543713378
## 2 759 2908534423 -5 -364821045 21.628 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# gam model djusted R-squared: 0.7916
# typical OLS regression djusted R-squared: 0.7632
Ans: I use the ns(Expend, 6) gam model djusted R-squared: 0.7916 typical OLS regression djusted R-squared: 0.7632 According ANOVA table, there is significant different in R^2 between these two models.