library(ISLR)
## Warning: package 'ISLR' was built under R version 3.5.2
library(glmnet)
## Loading required package: Matrix
## Loading required package: foreach
## Loaded glmnet 2.0-16
set.seed(1)
names(College)
## [1] "Private" "Apps" "Accept" "Enroll" "Top10perc"
## [6] "Top25perc" "F.Undergrad" "P.Undergrad" "Outstate" "Room.Board"
## [11] "Books" "Personal" "PhD" "Terminal" "S.F.Ratio"
## [16] "perc.alumni" "Expend" "Grad.Rate"
head(College)
## Private Apps Accept Enroll Top10perc
## Abilene Christian University Yes 1660 1232 721 23
## Adelphi University Yes 2186 1924 512 16
## Adrian College Yes 1428 1097 336 22
## Agnes Scott College Yes 417 349 137 60
## Alaska Pacific University Yes 193 146 55 16
## Albertson College Yes 587 479 158 38
## Top25perc F.Undergrad P.Undergrad Outstate
## Abilene Christian University 52 2885 537 7440
## Adelphi University 29 2683 1227 12280
## Adrian College 50 1036 99 11250
## Agnes Scott College 89 510 63 12960
## Alaska Pacific University 44 249 869 7560
## Albertson College 62 678 41 13500
## Room.Board Books Personal PhD Terminal
## Abilene Christian University 3300 450 2200 70 78
## Adelphi University 6450 750 1500 29 30
## Adrian College 3750 400 1165 53 66
## Agnes Scott College 5450 450 875 92 97
## Alaska Pacific University 4120 800 1500 76 72
## Albertson College 3335 500 675 67 73
## S.F.Ratio perc.alumni Expend Grad.Rate
## Abilene Christian University 18.1 12 7041 60
## Adelphi University 12.2 16 10527 56
## Adrian College 12.9 30 8735 54
## Agnes Scott College 7.7 37 19016 59
## Alaska Pacific University 11.9 2 10922 15
## Albertson College 9.4 11 9727 55
College=na.omit(College) # Remove records with omitted data
x=model.matrix(Apps~.,data=College)[,-1] # Predictor matrix
y=College$Apps # Outcome variable vector
head(x)
## PrivateYes Accept Enroll Top10perc Top25perc
## Abilene Christian University 1 1232 721 23 52
## Adelphi University 1 1924 512 16 29
## Adrian College 1 1097 336 22 50
## Agnes Scott College 1 349 137 60 89
## Alaska Pacific University 1 146 55 16 44
## Albertson College 1 479 158 38 62
## F.Undergrad P.Undergrad Outstate Room.Board
## Abilene Christian University 2885 537 7440 3300
## Adelphi University 2683 1227 12280 6450
## Adrian College 1036 99 11250 3750
## Agnes Scott College 510 63 12960 5450
## Alaska Pacific University 249 869 7560 4120
## Albertson College 678 41 13500 3335
## Books Personal PhD Terminal S.F.Ratio
## Abilene Christian University 450 2200 70 78 18.1
## Adelphi University 750 1500 29 30 12.2
## Adrian College 400 1165 53 66 12.9
## Agnes Scott College 450 875 92 97 7.7
## Alaska Pacific University 800 1500 76 72 11.9
## Albertson College 500 675 67 73 9.4
## perc.alumni Expend Grad.Rate
## Abilene Christian University 12 7041 60
## Adelphi University 16 10527 56
## Adrian College 30 8735 54
## Agnes Scott College 37 19016 59
## Alaska Pacific University 2 10922 15
## Albertson College 11 9727 55
library(perturb)
lm.fit.all <- lm(Apps~Accept+Enroll+Top10perc+Top25perc+F.Undergrad+P.Undergrad+Outstate+Room.Board+Books+Personal+PhD+Terminal+S.F.Ratio+perc.alumni+Expend+Grad.Rate, data=College)
collin.diag = colldiag(mod=lm.fit.all, scale=FALSE, center=FALSE, add.intercept=TRUE)
collin.diag
## Condition
## Index Variance Decomposition Proportions
## intercept Accept Enroll Top10perc Top25perc F.Undergrad
## 1 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 2 2.942 0.000 0.004 0.000 0.000 0.000 0.030
## 3 5.776 0.000 0.000 0.000 0.000 0.000 0.000
## 4 11.899 0.000 0.076 0.000 0.000 0.000 0.000
## 5 16.068 0.000 0.008 0.000 0.000 0.000 0.007
## 6 16.777 0.000 0.528 0.000 0.000 0.000 0.255
## 7 24.945 0.000 0.056 0.000 0.000 0.000 0.040
## 8 81.936 0.000 0.216 0.928 0.000 0.000 0.620
## 9 96.126 0.000 0.020 0.033 0.000 0.000 0.009
## 10 666.209 0.000 0.021 0.009 0.005 0.027 0.011
## 11 1028.079 0.000 0.001 0.000 0.049 0.085 0.000
## 12 1225.757 0.000 0.000 0.005 0.011 0.018 0.014
## 13 1844.757 0.000 0.029 0.007 0.002 0.002 0.002
## 14 2600.748 0.000 0.000 0.004 0.264 0.349 0.002
## 15 3068.396 0.000 0.005 0.002 0.500 0.434 0.000
## 16 4639.750 0.000 0.010 0.003 0.094 0.047 0.001
## 17 172742.443 1.000 0.026 0.009 0.074 0.039 0.008
## P.Undergrad Outstate Room.Board Books Personal PhD Terminal S.F.Ratio
## 1 0.000 0.007 0.000 0.000 0.000 0.000 0.000 0.000
## 2 0.001 0.009 0.000 0.000 0.000 0.000 0.000 0.000
## 3 0.000 0.193 0.005 0.000 0.000 0.000 0.000 0.000
## 4 0.377 0.068 0.047 0.000 0.017 0.000 0.000 0.000
## 5 0.464 0.299 0.249 0.000 0.031 0.000 0.000 0.000
## 6 0.079 0.126 0.068 0.000 0.000 0.000 0.000 0.000
## 7 0.007 0.042 0.216 0.000 0.623 0.000 0.000 0.000
## 8 0.011 0.003 0.000 0.014 0.013 0.000 0.000 0.000
## 9 0.000 0.001 0.130 0.787 0.136 0.000 0.000 0.000
## 10 0.009 0.145 0.101 0.059 0.027 0.029 0.021 0.000
## 11 0.031 0.005 0.062 0.004 0.025 0.044 0.043 0.000
## 12 0.016 0.028 0.023 0.002 0.000 0.019 0.009 0.000
## 13 0.000 0.046 0.025 0.001 0.008 0.004 0.001 0.000
## 14 0.001 0.002 0.021 0.047 0.007 0.508 0.286 0.004
## 15 0.003 0.004 0.000 0.003 0.005 0.387 0.564 0.003
## 16 0.000 0.021 0.012 0.031 0.015 0.009 0.025 0.630
## 17 0.000 0.002 0.040 0.052 0.094 0.000 0.052 0.362
## perc.alumni Expend Grad.Rate
## 1 0.000 0.014 0.000
## 2 0.000 0.006 0.000
## 3 0.000 0.627 0.000
## 4 0.000 0.002 0.000
## 5 0.000 0.000 0.000
## 6 0.000 0.003 0.000
## 7 0.000 0.000 0.000
## 8 0.000 0.002 0.000
## 9 0.000 0.000 0.000
## 10 0.003 0.012 0.037
## 11 0.001 0.050 0.002
## 12 0.018 0.038 0.687
## 13 0.954 0.002 0.126
## 14 0.011 0.064 0.010
## 15 0.003 0.074 0.011
## 16 0.004 0.064 0.051
## 17 0.006 0.040 0.076
We know that CI >50 is an indicator of multicollinearity. In our case, The CI for the majority of the variables (8 to 17) are over 50. Thus, we can conclude that our model exhibits high level of multicollinearity.
collin.diag = colldiag(mod=lm.fit.all, scale=FALSE, center=TRUE, add.intercept=FALSE)
collin.diag
## Condition
## Index Variance Decomposition Proportions
## Accept Enroll Top10perc Top25perc F.Undergrad P.Undergrad
## 1 1.000 0.000 0.000 0.000 0.000 0.001 0.000
## 2 1.117 0.005 0.000 0.000 0.000 0.031 0.001
## 3 2.495 0.004 0.000 0.000 0.000 0.002 0.000
## 4 4.887 0.055 0.000 0.000 0.000 0.000 0.741
## 5 6.211 0.536 0.000 0.000 0.000 0.260 0.097
## 6 7.853 0.084 0.000 0.000 0.000 0.043 0.059
## 7 10.142 0.006 0.000 0.000 0.000 0.006 0.032
## 8 30.689 0.251 0.974 0.000 0.000 0.618 0.009
## 9 38.650 0.000 0.000 0.000 0.000 0.000 0.000
## 10 289.187 0.005 0.002 0.022 0.063 0.015 0.017
## 11 414.511 0.002 0.004 0.013 0.027 0.006 0.035
## 12 489.918 0.004 0.001 0.022 0.059 0.005 0.004
## 13 683.191 0.030 0.007 0.002 0.001 0.002 0.000
## 14 1018.983 0.004 0.006 0.192 0.217 0.004 0.000
## 15 1155.067 0.014 0.006 0.748 0.633 0.000 0.004
## 16 2113.142 0.000 0.000 0.000 0.000 0.006 0.000
## Outstate Room.Board Books Personal PhD Terminal S.F.Ratio perc.alumni
## 1 0.038 0.000 0.000 0.000 0.000 0.000 0.000 0.000
## 2 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000
## 3 0.443 0.002 0.000 0.000 0.000 0.000 0.000 0.000
## 4 0.024 0.004 0.000 0.002 0.000 0.000 0.000 0.000
## 5 0.107 0.054 0.000 0.001 0.000 0.000 0.000 0.000
## 6 0.151 0.852 0.000 0.002 0.000 0.000 0.000 0.000
## 7 0.013 0.009 0.000 0.933 0.000 0.000 0.000 0.000
## 8 0.001 0.016 0.000 0.000 0.000 0.000 0.000 0.000
## 9 0.001 0.016 0.969 0.033 0.000 0.000 0.000 0.000
## 10 0.112 0.000 0.001 0.004 0.024 0.013 0.000 0.004
## 11 0.005 0.005 0.003 0.002 0.110 0.079 0.000 0.004
## 12 0.040 0.012 0.006 0.009 0.010 0.006 0.000 0.023
## 13 0.044 0.024 0.000 0.007 0.005 0.001 0.000 0.949
## 14 0.000 0.005 0.014 0.001 0.596 0.588 0.000 0.012
## 15 0.002 0.001 0.007 0.000 0.246 0.313 0.000 0.001
## 16 0.020 0.000 0.000 0.006 0.009 0.000 1.000 0.008
## Expend Grad.Rate
## 1 0.150 0.000
## 2 0.021 0.000
## 3 0.479 0.000
## 4 0.001 0.000
## 5 0.003 0.000
## 6 0.003 0.000
## 7 0.007 0.000
## 8 0.004 0.000
## 9 0.003 0.000
## 10 0.071 0.026
## 11 0.000 0.112
## 12 0.020 0.719
## 13 0.002 0.136
## 14 0.024 0.006
## 15 0.082 0.000
## 16 0.128 0.000
When we use scale=F,center=T and add.intercept=F, a little less than half of the variables are under CI=30 and almost (roughly) half of the variables have CI value much higher than 50. Therefore, I would say that there is still severe multicollinearity in this model.
let us do some analysis on VIF to see multi-coliniarity before we rush in to modeling part.
library(car)
## Loading required package: carData
vif(lm.fit.all)
## Accept Enroll Top10perc Top25perc F.Undergrad P.Undergrad
## 7.126046 21.360983 6.928113 5.630782 17.697719 1.709601
## Outstate Room.Board Books Personal PhD Terminal
## 3.711710 1.990349 1.107372 1.305168 4.051287 3.979967
## S.F.Ratio perc.alumni Expend Grad.Rate
## 1.845581 1.833717 2.950354 1.829794
summary(lm.fit.all)
##
## Call:
## lm(formula = Apps ~ Accept + Enroll + Top10perc + Top25perc +
## F.Undergrad + P.Undergrad + Outstate + Room.Board + Books +
## Personal + PhD + Terminal + S.F.Ratio + perc.alumni + Expend +
## Grad.Rate, data = College)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5060.6 -429.2 -18.0 309.7 7661.8
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.948e+02 3.916e+02 -2.285 0.02259 *
## Accept 1.591e+00 4.103e-02 38.784 < 2e-16 ***
## Enroll -8.903e-01 1.874e-01 -4.751 2.42e-06 ***
## Top10perc 4.968e+01 5.621e+00 8.838 < 2e-16 ***
## Top25perc -1.438e+01 4.514e+00 -3.185 0.00151 **
## F.Undergrad 7.303e-02 3.267e-02 2.235 0.02570 *
## P.Undergrad 4.990e-02 3.236e-02 1.542 0.12346
## Outstate -1.093e-01 1.804e-02 -6.058 2.17e-09 ***
## Room.Board 1.351e-01 4.846e-02 2.788 0.00544 **
## Books -9.177e-03 2.401e-01 -0.038 0.96952
## Personal 3.147e-02 6.357e-02 0.495 0.62068
## PhD -6.789e+00 4.644e+00 -1.462 0.14417
## Terminal -1.371e+00 5.105e+00 -0.269 0.78834
## S.F.Ratio 2.305e+01 1.293e+01 1.783 0.07506 .
## perc.alumni -1.170e+00 4.117e+00 -0.284 0.77640
## Expend 8.196e-02 1.239e-02 6.614 7.06e-11 ***
## Grad.Rate 8.041e+00 2.967e+00 2.710 0.00687 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1049 on 760 degrees of freedom
## Multiple R-squared: 0.928, Adjusted R-squared: 0.9265
## F-statistic: 612.1 on 16 and 760 DF, p-value: < 2.2e-16
VIF above 10 is not tolerable. Thus, Milticolliniarity exists.
1. Variable Selection: Subset Comparison
Fit a full* model with all variables that make sense from a business standpoint, that is: Enroll, Top10perc, Outstate, Room.Board, PhD, S.F.Ratio, Expend and Grad.Rate. Name this model lm.fit.full. let us display the model summary results first.*
lm.fit.full <-lm(Apps~.,data=College)
summary(lm.fit.full)
##
## Call:
## lm(formula = Apps ~ ., data = College)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4908.8 -430.2 -29.5 322.3 7852.5
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -445.08413 408.32855 -1.090 0.276053
## PrivateYes -494.14897 137.81191 -3.586 0.000358 ***
## Accept 1.58581 0.04074 38.924 < 2e-16 ***
## Enroll -0.88069 0.18596 -4.736 2.60e-06 ***
## Top10perc 49.92628 5.57824 8.950 < 2e-16 ***
## Top25perc -14.23448 4.47914 -3.178 0.001543 **
## F.Undergrad 0.05739 0.03271 1.754 0.079785 .
## P.Undergrad 0.04445 0.03214 1.383 0.167114
## Outstate -0.08587 0.01906 -4.506 7.64e-06 ***
## Room.Board 0.15103 0.04829 3.127 0.001832 **
## Books 0.02090 0.23841 0.088 0.930175
## Personal 0.03110 0.06308 0.493 0.622060
## PhD -8.67850 4.63814 -1.871 0.061714 .
## Terminal -3.33066 5.09494 -0.654 0.513492
## S.F.Ratio 15.38961 13.00622 1.183 0.237081
## perc.alumni 0.17867 4.10230 0.044 0.965273
## Expend 0.07790 0.01235 6.308 4.79e-10 ***
## Grad.Rate 8.66763 2.94893 2.939 0.003390 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1041 on 759 degrees of freedom
## Multiple R-squared: 0.9292, Adjusted R-squared: 0.9276
## F-statistic: 585.9 on 17 and 759 DF, p-value: < 2.2e-16
2. best subset modelling
library(leaps)
lm.fit.subsets=regsubsets(Apps~., data=College)
subset.sum<-summary(lm.fit.subsets)
subset.sum
## Subset selection object
## Call: regsubsets.formula(Apps ~ ., data = College)
## 17 Variables (and intercept)
## Forced in Forced out
## PrivateYes FALSE FALSE
## Accept FALSE FALSE
## Enroll FALSE FALSE
## Top10perc FALSE FALSE
## Top25perc FALSE FALSE
## F.Undergrad FALSE FALSE
## P.Undergrad FALSE FALSE
## Outstate FALSE FALSE
## Room.Board FALSE FALSE
## Books FALSE FALSE
## Personal FALSE FALSE
## PhD FALSE FALSE
## Terminal FALSE FALSE
## S.F.Ratio FALSE FALSE
## perc.alumni FALSE FALSE
## Expend FALSE FALSE
## Grad.Rate FALSE FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: exhaustive
## PrivateYes Accept Enroll Top10perc Top25perc F.Undergrad
## 1 ( 1 ) " " "*" " " " " " " " "
## 2 ( 1 ) " " "*" " " "*" " " " "
## 3 ( 1 ) " " "*" " " "*" " " " "
## 4 ( 1 ) " " "*" " " "*" " " " "
## 5 ( 1 ) " " "*" "*" "*" " " " "
## 6 ( 1 ) " " "*" "*" "*" " " " "
## 7 ( 1 ) " " "*" "*" "*" "*" " "
## 8 ( 1 ) "*" "*" "*" "*" " " " "
## P.Undergrad Outstate Room.Board Books Personal PhD Terminal
## 1 ( 1 ) " " " " " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " " " " "
## 3 ( 1 ) " " " " " " " " " " " " " "
## 4 ( 1 ) " " "*" " " " " " " " " " "
## 5 ( 1 ) " " "*" " " " " " " " " " "
## 6 ( 1 ) " " "*" "*" " " " " " " " "
## 7 ( 1 ) " " "*" "*" " " " " " " " "
## 8 ( 1 ) " " "*" "*" " " " " "*" " "
## S.F.Ratio perc.alumni Expend Grad.Rate
## 1 ( 1 ) " " " " " " " "
## 2 ( 1 ) " " " " " " " "
## 3 ( 1 ) " " " " "*" " "
## 4 ( 1 ) " " " " "*" " "
## 5 ( 1 ) " " " " "*" " "
## 6 ( 1 ) " " " " "*" " "
## 7 ( 1 ) " " " " "*" " "
## 8 ( 1 ) " " " " "*" " "
RSS<-subset.sum$rss
AdjR2<-subset.sum$adjr2
cbind(RSS,AdjR2)
## RSS AdjR2
## [1,] 1277410811 0.8899572
## [2,] 978867162 0.9155663
## [3,] 949208869 0.9180186
## [4,] 915171254 0.9208560
## [5,] 886160591 0.9232655
## [6,] 874694084 0.9241600
## [7,] 862855633 0.9250892
## [8,] 849981358 0.9261108
par(mfrow=c(1,2))
plot(RSS,xlab="Number of Variables",ylab="RSS",type="l")
plot(AdjR2,xlab="Number of Variables",ylab="Adjusted RSq",type="l")
3. Step wise selection method
a) forward elimination method
lm.fit.null <-lm(Apps~1,data=College)
lm.fit.full <-lm(Apps~.,data=College)
lm.step.forward <- step(lm.fit.null, scope = list(lower=lm.fit.null, upper=lm.fit.full), direction="forward", test = "F")
## Start: AIC=12838.69
## Apps ~ 1
##
## Df Sum of Sq RSS AIC F value Pr(>F)
## + Accept 1 1.0346e+10 1.2774e+09 11125 6276.8000 < 2.2e-16 ***
## + Enroll 1 8.3351e+09 3.2881e+09 11860 1964.5574 < 2.2e-16 ***
## + F.Undergrad 1 7.7108e+09 3.9125e+09 11995 1527.4013 < 2.2e-16 ***
## + Private 1 2.1701e+09 9.4531e+09 12680 177.9147 < 2.2e-16 ***
## + P.Undergrad 1 1.8436e+09 9.7797e+09 12706 146.0997 < 2.2e-16 ***
## + PhD 1 1.7742e+09 9.8491e+09 12712 139.6101 < 2.2e-16 ***
## + Terminal 1 1.5869e+09 1.0036e+10 12727 122.5350 < 2.2e-16 ***
## + Top25perc 1 1.4372e+09 1.0186e+10 12738 109.3505 < 2.2e-16 ***
## + Top10perc 1 1.3344e+09 1.0289e+10 12746 100.5165 < 2.2e-16 ***
## + Expend 1 7.8327e+08 1.0840e+10 12786 55.9994 1.971e-13 ***
## + Personal 1 3.7130e+08 1.1252e+10 12816 25.5741 5.318e-07 ***
## + Room.Board 1 3.1621e+08 1.1307e+10 12819 21.6734 3.802e-06 ***
## + Grad.Rate 1 2.5033e+08 1.1373e+10 12824 17.0585 4.019e-05 ***
## + Books 1 2.0424e+08 1.1419e+10 12827 13.8617 0.000211 ***
## + S.F.Ratio 1 1.0630e+08 1.1517e+10 12834 7.1533 0.007640 **
## + perc.alumni 1 9.4622e+07 1.1529e+10 12834 6.3608 0.011866 *
## <none> 1.1623e+10 12839
## + Outstate 1 2.9243e+07 1.1594e+10 12839 1.9548 0.162475
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: AIC=11124.94
## Apps ~ Accept
##
## Df Sum of Sq RSS AIC F value Pr(>F)
## + Top10perc 1 298543648 978867162 10920 236.0614 < 2.2e-16 ***
## + Expend 1 237832439 1039578371 10967 177.0740 < 2.2e-16 ***
## + Top25perc 1 172865433 1104545378 11014 121.1339 < 2.2e-16 ***
## + Grad.Rate 1 80919712 1196491098 11076 52.3463 1.121e-12 ***
## + Room.Board 1 73480089 1203930722 11081 47.2399 1.293e-11 ***
## + Outstate 1 64480760 1212930051 11087 41.1467 2.457e-10 ***
## + S.F.Ratio 1 59842954 1217567857 11090 38.0418 1.115e-09 ***
## + perc.alumni 1 43974982 1233435829 11100 27.5950 1.935e-07 ***
## + PhD 1 40339305 1237071506 11102 25.2391 6.293e-07 ***
## + Terminal 1 34118294 1243292517 11106 21.2400 4.738e-06 ***
## + Enroll 1 12102492 1265308319 11120 7.4032 0.006657 **
## + Books 1 7628546 1269782265 11122 4.6500 0.031361 *
## + F.Undergrad 1 5226771 1272184040 11124 3.1800 0.074937 .
## + P.Undergrad 1 4704068 1272706743 11124 2.8608 0.091165 .
## + Private 1 3980419 1273430392 11124 2.4193 0.120255
## <none> 1277410811 11125
## + Personal 1 1436992 1275973818 11126 0.8717 0.350784
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: AIC=10920.1
## Apps ~ Accept + Top10perc
##
## Df Sum of Sq RSS AIC F value Pr(>F)
## + Expend 1 29658293 949208869 10898 24.1526 1.087e-06 ***
## + Top25perc 1 22836110 956031052 10904 18.4642 1.952e-05 ***
## + Enroll 1 13912570 964954593 10911 11.1450 0.0008829 ***
## + Private 1 10668209 968198953 10914 8.5174 0.0036198 **
## + PhD 1 7596359 971270804 10916 6.0457 0.0141586 *
## + Room.Board 1 6159533 972707629 10917 4.8949 0.0272275 *
## + Outstate 1 5782676 973084487 10918 4.5936 0.0324021 *
## + Terminal 1 5767400 973099762 10918 4.5814 0.0326323 *
## + perc.alumni 1 5577199 973289963 10918 4.4295 0.0356454 *
## + P.Undergrad 1 2567814 976299348 10920 2.0331 0.1543095
## <none> 978867162 10920
## + F.Undergrad 1 1718371 977148792 10921 1.3594 0.2440074
## + Personal 1 1404273 977462889 10921 1.1105 0.2922958
## + Books 1 1098190 977768973 10921 0.8682 0.3517446
## + Grad.Rate 1 315353 978551809 10922 0.2491 0.6178437
## + S.F.Ratio 1 73757 978793406 10922 0.0582 0.8093490
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: AIC=10898.2
## Apps ~ Accept + Top10perc + Expend
##
## Df Sum of Sq RSS AIC F value Pr(>F)
## + Outstate 1 34037615 915171254 10872 28.7127 1.109e-07 ***
## + Private 1 21921323 927287546 10882 18.2503 2.179e-05 ***
## + Top25perc 1 14720207 934488662 10888 12.1607 0.0005157 ***
## + PhD 1 12430477 936778392 10890 10.2440 0.0014274 **
## + Terminal 1 11956757 937252112 10890 9.8486 0.0017642 **
## + perc.alumni 1 11589036 937619833 10891 9.5420 0.0020803 **
## + Enroll 1 8007593 941201276 10894 6.5681 0.0105710 *
## + S.F.Ratio 1 7737137 941471733 10894 6.3444 0.0119763 *
## + P.Undergrad 1 2918539 946290330 10898 2.3810 0.1232291
## <none> 949208869 10898
## + Personal 1 2112443 947096426 10898 1.7219 0.1898386
## + Books 1 637723 948571146 10900 0.5190 0.4714806
## + Room.Board 1 264603 948944266 10900 0.2153 0.6428033
## + F.Undergrad 1 52599 949156270 10900 0.0428 0.8361921
## + Grad.Rate 1 1172 949207697 10900 0.0010 0.9753817
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: AIC=10871.82
## Apps ~ Accept + Top10perc + Expend + Outstate
##
## Df Sum of Sq RSS AIC F value Pr(>F)
## + Enroll 1 29010663 886160591 10849 25.2406 6.294e-07 ***
## + Room.Board 1 16274538 898896716 10860 13.9590 0.0002006 ***
## + Top25perc 1 10654640 904516614 10865 9.0819 0.0026661 **
## + F.Undergrad 1 8296040 906875214 10867 7.0531 0.0080764 **
## + PhD 1 7700769 907470485 10867 6.5427 0.0107218 *
## + Grad.Rate 1 7321591 907849663 10868 6.2179 0.0128548 *
## + Terminal 1 6021988 909149266 10869 5.1069 0.0241089 *
## + Private 1 3410109 911761145 10871 2.8836 0.0898868 .
## <none> 915171254 10872
## + S.F.Ratio 1 2129211 913042043 10872 1.7980 0.1803530
## + perc.alumni 1 1979846 913191408 10872 1.6716 0.1964360
## + P.Undergrad 1 318048 914853206 10874 0.2680 0.6047997
## + Books 1 270308 914900946 10874 0.2278 0.6333007
## + Personal 1 51356 915119898 10874 0.0433 0.8352764
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: AIC=10848.79
## Apps ~ Accept + Top10perc + Expend + Outstate + Enroll
##
## Df Sum of Sq RSS AIC F value Pr(>F)
## + Room.Board 1 11466506 874694084 10841 10.0941 0.001547 **
## + Top25perc 1 11359437 874801154 10841 9.9986 0.001628 **
## + Private 1 9551354 876609237 10842 8.3898 0.003880 **
## + F.Undergrad 1 6287657 879872934 10845 5.5025 0.019242 *
## + PhD 1 5184017 880976573 10846 4.5310 0.033603 *
## + Grad.Rate 1 4977276 881183315 10846 4.3493 0.037354 *
## + P.Undergrad 1 4196016 881964575 10847 3.6633 0.055993 .
## + Terminal 1 3667705 882492885 10848 3.2002 0.074023 .
## + S.F.Ratio 1 3471226 882689365 10848 3.0281 0.082235 .
## <none> 886160591 10849
## + perc.alumni 1 1150617 885009974 10850 1.0011 0.317361
## + Personal 1 364916 885795675 10850 0.3172 0.573452
## + Books 1 346795 885813795 10850 0.3015 0.583131
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: AIC=10840.67
## Apps ~ Accept + Top10perc + Expend + Outstate + Enroll + Room.Board
##
## Df Sum of Sq RSS AIC F value Pr(>F)
## + Top25perc 1 11838452 862855633 10832 10.5507 0.001212 **
## + Private 1 10278235 864415849 10834 9.1437 0.002579 **
## + PhD 1 6633253 868060832 10837 5.8763 0.015576 *
## + Terminal 1 5679065 869015019 10838 5.0255 0.025262 *
## + F.Undergrad 1 5218200 869475884 10838 4.6152 0.032002 *
## + Grad.Rate 1 3809312 870884773 10839 3.3637 0.067036 .
## + S.F.Ratio 1 3268726 871425358 10840 2.8845 0.089839 .
## + P.Undergrad 1 2771544 871922540 10840 2.4444 0.118357
## <none> 874694084 10841
## + Personal 1 483131 874210954 10842 0.4250 0.514654
## + perc.alumni 1 430235 874263849 10842 0.3784 0.538625
## + Books 1 32071 874662014 10843 0.0282 0.866692
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: AIC=10832.09
## Apps ~ Accept + Top10perc + Expend + Outstate + Enroll + Room.Board +
## Top25perc
##
## Df Sum of Sq RSS AIC F value Pr(>F)
## + Private 1 11486640 851368993 10824 10.3618 0.001341 **
## + F.Undergrad 1 6829025 856026608 10828 6.1268 0.013530 *
## + Grad.Rate 1 5022152 857833480 10830 4.4962 0.034290 *
## + PhD 1 4164868 858690765 10830 3.7250 0.053972 .
## + S.F.Ratio 1 3313460 859542173 10831 2.9606 0.085720 .
## + P.Undergrad 1 3150967 859704666 10831 2.8149 0.093802 .
## + Terminal 1 2866270 859989362 10832 2.5597 0.110032
## <none> 862855633 10832
## + Personal 1 424835 862430798 10834 0.3783 0.538687
## + perc.alumni 1 111184 862744449 10834 0.0990 0.753150
## + Books 1 56442 862799190 10834 0.0502 0.822704
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: AIC=10823.67
## Apps ~ Accept + Top10perc + Expend + Outstate + Enroll + Room.Board +
## Top25perc + Private
##
## Df Sum of Sq RSS AIC F value Pr(>F)
## + PhD 1 10749717 840619277 10816 9.8083 0.001803 **
## + Terminal 1 8264997 843103997 10818 7.5189 0.006248 **
## + Grad.Rate 1 6882166 844486827 10819 6.2507 0.012622 *
## + F.Undergrad 1 3793524 847575469 10822 3.4329 0.064294 .
## <none> 851368993 10824
## + P.Undergrad 1 1682974 849686020 10824 1.5192 0.218119
## + S.F.Ratio 1 1313916 850055078 10824 1.1855 0.276573
## + Personal 1 283343 851085650 10825 0.2553 0.613479
## + Books 1 119956 851249037 10826 0.1081 0.742426
## + perc.alumni 1 3141 851365852 10826 0.0028 0.957588
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: AIC=10815.8
## Apps ~ Accept + Top10perc + Expend + Outstate + Enroll + Room.Board +
## Top25perc + Private + PhD
##
## Df Sum of Sq RSS AIC F value Pr(>F)
## + Grad.Rate 1 7349094 833270183 10811 6.7558 0.009524 **
## + F.Undergrad 1 4405096 836214181 10814 4.0352 0.044910 *
## + P.Undergrad 1 2451457 838167820 10816 2.2404 0.134860
## <none> 840619277 10816
## + S.F.Ratio 1 1808849 838810427 10816 1.6518 0.199097
## + Terminal 1 487207 840132070 10817 0.4442 0.505295
## + Personal 1 284996 840334280 10818 0.2598 0.610414
## + perc.alumni 1 78065 840541211 10818 0.0711 0.789753
## + Books 1 3237 840616040 10818 0.0029 0.956701
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: AIC=10810.98
## Apps ~ Accept + Top10perc + Expend + Outstate + Enroll + Room.Board +
## Top25perc + Private + PhD + Grad.Rate
##
## Df Sum of Sq RSS AIC F value Pr(>F)
## + F.Undergrad 1 5704713 827565469 10808 5.2734 0.02192 *
## + P.Undergrad 1 4323227 828946955 10809 3.9897 0.04613 *
## <none> 833270183 10811
## + S.F.Ratio 1 1711005 831559177 10811 1.5741 0.21000
## + Personal 1 841830 832428353 10812 0.7736 0.37937
## + Terminal 1 352500 832917683 10813 0.3238 0.56953
## + perc.alumni 1 117245 833152938 10813 0.1077 0.74292
## + Books 1 58008 833212174 10813 0.0533 0.81755
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: AIC=10807.64
## Apps ~ Accept + Top10perc + Expend + Outstate + Enroll + Room.Board +
## Top25perc + Private + PhD + Grad.Rate + F.Undergrad
##
## Df Sum of Sq RSS AIC F value Pr(>F)
## + P.Undergrad 1 2248227 825317242 10808 2.0812 0.1495
## <none> 827565469 10808
## + S.F.Ratio 1 1437928 826127541 10808 1.3298 0.2492
## + Terminal 1 515424 827050045 10809 0.4761 0.4904
## + Personal 1 426703 827138766 10809 0.3941 0.5303
## + perc.alumni 1 39675 827525794 10810 0.0366 0.8483
## + Books 1 25040 827540429 10810 0.0231 0.8792
##
## Step: AIC=10807.53
## Apps ~ Accept + Top10perc + Expend + Outstate + Enroll + Room.Board +
## Top25perc + Private + PhD + Grad.Rate + F.Undergrad + P.Undergrad
##
## Df Sum of Sq RSS AIC F value Pr(>F)
## <none> 825317242 10808
## + S.F.Ratio 1 1485954 823831288 10808 1.3762 0.2411
## + Terminal 1 513811 824803431 10809 0.4753 0.4908
## + Personal 1 227696 825089547 10809 0.2106 0.6465
## + perc.alumni 1 20671 825296571 10810 0.0191 0.8901
## + Books 1 13607 825303635 10810 0.0126 0.9107
summary(lm.step.forward)
##
## Call:
## lm(formula = Apps ~ Accept + Top10perc + Expend + Outstate +
## Enroll + Room.Board + Top25perc + Private + PhD + Grad.Rate +
## F.Undergrad + P.Undergrad, data = College)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4931.0 -428.4 -28.0 325.6 7870.6
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -157.28686 265.43114 -0.593 0.553643
## Accept 1.58691 0.04004 39.633 < 2e-16 ***
## Top10perc 50.41132 5.52514 9.124 < 2e-16 ***
## Expend 0.07247 0.01139 6.363 3.41e-10 ***
## Outstate -0.09018 0.01831 -4.925 1.03e-06 ***
## Enroll -0.88265 0.18468 -4.779 2.11e-06 ***
## Room.Board 0.14777 0.04695 3.147 0.001713 **
## Top25perc -14.74735 4.41778 -3.338 0.000884 ***
## PrivateYes -511.78760 134.28488 -3.811 0.000149 ***
## PhD -10.70503 3.12107 -3.430 0.000636 ***
## Grad.Rate 8.63961 2.86106 3.020 0.002614 **
## F.Undergrad 0.05945 0.03244 1.833 0.067185 .
## P.Undergrad 0.04593 0.03184 1.443 0.149533
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1039 on 764 degrees of freedom
## Multiple R-squared: 0.929, Adjusted R-squared: 0.9279
## F-statistic: 833 on 12 and 764 DF, p-value: < 2.2e-16
b) Backward elimination method
lm.fit.null <-lm(Apps~1,data=College)
lm.fit.full <-lm(Apps~.,data=College)
lm.step.backward <- step(lm.fit.full, scope = list(lower=lm.fit.null, upper=lm.fit.full), direction="backward", test = "F")
## Start: AIC=10815.4
## Apps ~ Private + Accept + Enroll + Top10perc + Top25perc + F.Undergrad +
## P.Undergrad + Outstate + Room.Board + Books + Personal +
## PhD + Terminal + S.F.Ratio + perc.alumni + Expend + Grad.Rate
##
## Df Sum of Sq RSS AIC F value Pr(>F)
## - perc.alumni 1 2057 823062005 10813 0.0019 0.9652727
## - Books 1 8332 823068280 10813 0.0077 0.9301754
## - Personal 1 263707 823323655 10814 0.2432 0.6220599
## - Terminal 1 463415 823523363 10814 0.4273 0.5134916
## - S.F.Ratio 1 1518247 824578195 10815 1.4001 0.2370808
## - P.Undergrad 1 2073704 825133652 10815 1.9123 0.1671142
## <none> 823059948 10815
## - F.Undergrad 1 3337258 826397207 10816 3.0775 0.0797853 .
## - PhD 1 3796560 826856508 10817 3.5011 0.0617138 .
## - Grad.Rate 1 9368302 832428250 10822 8.6392 0.0033900 **
## - Room.Board 1 10605426 833665374 10823 9.7800 0.0018316 **
## - Top25perc 1 10951733 834011681 10824 10.0993 0.0015433 **
## - Private 1 13942221 837002170 10826 12.8571 0.0003577 ***
## - Outstate 1 22020341 845080289 10834 20.3065 7.638e-06 ***
## - Enroll 1 24321652 847381600 10836 22.4287 2.603e-06 ***
## - Expend 1 43151679 866211628 10853 39.7931 4.793e-10 ***
## - Top10perc 1 86866642 909926590 10891 80.1057 < 2.2e-16 ***
## - Accept 1 1642984489 2466044437 11666 1515.1086 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: AIC=10813.4
## Apps ~ Private + Accept + Enroll + Top10perc + Top25perc + F.Undergrad +
## P.Undergrad + Outstate + Room.Board + Books + Personal +
## PhD + Terminal + S.F.Ratio + Expend + Grad.Rate
##
## Df Sum of Sq RSS AIC F value Pr(>F)
## - Books 1 7989 823069994 10811 0.0074 0.9315780
## - Personal 1 261699 823323704 10812 0.2416 0.6231601
## - Terminal 1 461438 823523443 10812 0.4261 0.5141145
## - S.F.Ratio 1 1517299 824579304 10813 1.4010 0.2369187
## - P.Undergrad 1 2071755 825133760 10813 1.9130 0.1670344
## <none> 823062005 10813
## - F.Undergrad 1 3335997 826398002 10814 3.0804 0.0796441 .
## - PhD 1 3799154 826861159 10815 3.5081 0.0614543 .
## - Grad.Rate 1 9830950 832892955 10821 9.0777 0.0026733 **
## - Room.Board 1 10817132 833879137 10822 9.9883 0.0016378 **
## - Top25perc 1 10979163 834041168 10822 10.1380 0.0015117 **
## - Private 1 14029065 837091070 10824 12.9542 0.0003399 ***
## - Outstate 1 22841086 845903092 10833 21.0910 5.125e-06 ***
## - Enroll 1 24505771 847567776 10834 22.6282 2.353e-06 ***
## - Expend 1 43192465 866254470 10851 39.8831 4.585e-10 ***
## - Top10perc 1 86934199 909996204 10889 80.2734 < 2.2e-16 ***
## - Accept 1 1686807594 2509869599 11678 1557.5665 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: AIC=10811.41
## Apps ~ Private + Accept + Enroll + Top10perc + Top25perc + F.Undergrad +
## P.Undergrad + Outstate + Room.Board + Personal + PhD + Terminal +
## S.F.Ratio + Expend + Grad.Rate
##
## Df Sum of Sq RSS AIC F value Pr(>F)
## - Personal 1 286484 823356478 10810 0.2649 0.6069372
## - Terminal 1 453536 823523530 10810 0.4193 0.5174650
## - S.F.Ratio 1 1523776 824593770 10811 1.4089 0.2356154
## - P.Undergrad 1 2073781 825143774 10811 1.9174 0.1665504
## <none> 823069994 10811
## - F.Undergrad 1 3337954 826407948 10813 3.0862 0.0793597 .
## - PhD 1 3936953 827006947 10813 3.6401 0.0567811 .
## - Grad.Rate 1 9823693 832893687 10819 9.0829 0.0026658 **
## - Top25perc 1 10971468 834041462 10820 10.1441 0.0015067 **
## - Room.Board 1 11052994 834122988 10820 10.2195 0.0014471 **
## - Private 1 14021861 837091855 10822 12.9644 0.0003381 ***
## - Outstate 1 22934693 846004687 10831 21.2051 4.836e-06 ***
## - Enroll 1 24507814 847577808 10832 22.6596 2.315e-06 ***
## - Expend 1 43299794 866369788 10849 40.0344 4.257e-10 ***
## - Top10perc 1 87145523 910215517 10888 80.5736 < 2.2e-16 ***
## - Accept 1 1687626727 2510696721 11676 1560.3581 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: AIC=10809.68
## Apps ~ Private + Accept + Enroll + Top10perc + Top25perc + F.Undergrad +
## P.Undergrad + Outstate + Room.Board + PhD + Terminal + S.F.Ratio +
## Expend + Grad.Rate
##
## Df Sum of Sq RSS AIC F value Pr(>F)
## - Terminal 1 474810 823831288 10808 0.4394 0.507599
## - S.F.Ratio 1 1446954 824803431 10809 1.3391 0.247551
## <none> 823356478 10810
## - P.Undergrad 1 2294055 825650532 10810 2.1231 0.145504
## - F.Undergrad 1 3524082 826880559 10811 3.2615 0.071320 .
## - PhD 1 3903516 827259993 10811 3.6126 0.057720 .
## - Grad.Rate 1 9576067 832932544 10817 8.8625 0.003003 **
## - Room.Board 1 10948047 834304524 10818 10.1322 0.001516 **
## - Top25perc 1 11009477 834365954 10818 10.1891 0.001471 **
## - Private 1 14045232 837401710 10821 12.9986 0.000332 ***
## - Outstate 1 23757582 847114060 10830 21.9872 3.252e-06 ***
## - Enroll 1 24529642 847886120 10830 22.7017 2.266e-06 ***
## - Expend 1 43741282 867097760 10848 40.4817 3.423e-10 ***
## - Top10perc 1 87332619 910689096 10886 80.8246 < 2.2e-16 ***
## - Accept 1 1688998295 2512354773 11674 1563.1342 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: AIC=10808.13
## Apps ~ Private + Accept + Enroll + Top10perc + Top25perc + F.Undergrad +
## P.Undergrad + Outstate + Room.Board + PhD + S.F.Ratio + Expend +
## Grad.Rate
##
## Df Sum of Sq RSS AIC F value Pr(>F)
## - S.F.Ratio 1 1485954 825317242 10808 1.3762 0.2411101
## <none> 823831288 10808
## - P.Undergrad 1 2296253 826127541 10808 2.1267 0.1451637
## - F.Undergrad 1 3402201 827233489 10809 3.1510 0.0762798 .
## - Grad.Rate 1 9725730 833557018 10815 9.0076 0.0027763 **
## - Room.Board 1 10580678 834411966 10816 9.7994 0.0018122 **
## - Top25perc 1 11852393 835683681 10817 10.9772 0.0009659 ***
## - PhD 1 13174760 837006048 10818 12.2019 0.0005049 ***
## - Private 1 13675900 837507188 10819 12.6661 0.0003954 ***
## - Enroll 1 24420572 848251860 10829 22.6174 2.364e-06 ***
## - Outstate 1 24881049 848712337 10829 23.0438 1.906e-06 ***
## - Expend 1 43404484 867235772 10846 40.1995 3.923e-10 ***
## - Top10perc 1 89940095 913771383 10887 83.2990 < 2.2e-16 ***
## - Accept 1 1691950930 2515782218 11674 1567.0181 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: AIC=10807.53
## Apps ~ Private + Accept + Enroll + Top10perc + Top25perc + F.Undergrad +
## P.Undergrad + Outstate + Room.Board + PhD + Expend + Grad.Rate
##
## Df Sum of Sq RSS AIC F value Pr(>F)
## <none> 825317242 10808
## - P.Undergrad 1 2248227 827565469 10808 2.0812 0.1495333
## - F.Undergrad 1 3629713 828946955 10809 3.3600 0.0671853 .
## - Grad.Rate 1 9850583 835167825 10815 9.1187 0.0026143 **
## - Room.Board 1 10699017 836016260 10816 9.9041 0.0017131 **
## - Top25perc 1 12037817 837355059 10817 11.1435 0.0008841 ***
## - PhD 1 12708568 838025810 10817 11.7644 0.0006362 ***
## - Private 1 15691081 841008323 10820 14.5253 0.0001494 ***
## - Enroll 1 24676722 849993965 10828 22.8434 2.108e-06 ***
## - Outstate 1 26201946 851519188 10830 24.2553 1.035e-06 ***
## - Expend 1 43734225 869051468 10846 40.4850 3.412e-10 ***
## - Top10perc 1 89928332 915245574 10886 83.2471 < 2.2e-16 ***
## - Accept 1 1696846612 2522163854 11674 1570.7788 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(lm.step.backward)
##
## Call:
## lm(formula = Apps ~ Private + Accept + Enroll + Top10perc + Top25perc +
## F.Undergrad + P.Undergrad + Outstate + Room.Board + PhD +
## Expend + Grad.Rate, data = College)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4931.0 -428.4 -28.0 325.6 7870.6
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -157.28686 265.43114 -0.593 0.553643
## PrivateYes -511.78760 134.28488 -3.811 0.000149 ***
## Accept 1.58691 0.04004 39.633 < 2e-16 ***
## Enroll -0.88265 0.18468 -4.779 2.11e-06 ***
## Top10perc 50.41132 5.52514 9.124 < 2e-16 ***
## Top25perc -14.74735 4.41778 -3.338 0.000884 ***
## F.Undergrad 0.05945 0.03244 1.833 0.067185 .
## P.Undergrad 0.04593 0.03184 1.443 0.149533
## Outstate -0.09018 0.01831 -4.925 1.03e-06 ***
## Room.Board 0.14777 0.04695 3.147 0.001713 **
## PhD -10.70503 3.12107 -3.430 0.000636 ***
## Expend 0.07247 0.01139 6.363 3.41e-10 ***
## Grad.Rate 8.63961 2.86106 3.020 0.002614 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1039 on 764 degrees of freedom
## Multiple R-squared: 0.929, Adjusted R-squared: 0.9279
## F-statistic: 833 on 12 and 764 DF, p-value: < 2.2e-16
both stepwise models produce the same results on the p-value of the predictors and on the r squared. regsubsets() will test all of the first 8 (one-variable, second-variable, third-variable, etc.) models. and these models doesn’t seem to be a great models. The stepwise regression method selects a model that contains only the significatn variables. becase since some of the predictors are correlated, the OLS methods will not be best methods in this case
library(glmnet) # Contains functions for Ridge and LASSO
library(ISLR) # Contains the College data set College=na.omit(College)
x=model.matrix(Apps~., data=College)[,-1]
y=College$Apps
ridge.mod=glmnet(x,y,alpha=0)
plot(ridge.mod)
ridge.cv=cv.glmnet(x,y,alpha=0) # Remove the below if you want to see all results
# ridge.cv
plot(ridge.cv)
bestlam=ridge.cv$lambda.min
min.mse=min(ridge.cv$cvm)
options(scipen=4)
cbind("Best Ridge Lambda"=bestlam, "Best Ridge MSE"=min.mse)
## Best Ridge Lambda Best Ridge MSE
## [1,] 400.4766 1659189
predict(ridge.mod, type="coefficients", s=bestlam)
## 18 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) -1514.926536392
## PrivateYes -529.332545725
## Accept 0.978075133
## Enroll 0.466691689
## Top10perc 24.973135731
## Top25perc 1.056472786
## F.Undergrad 0.076628595
## P.Undergrad 0.024459386
## Outstate -0.021365424
## Room.Board 0.199797958
## Books 0.135279907
## Personal -0.008966624
## PhD -3.771159409
## Terminal -4.713593090
## S.F.Ratio 12.828367668
## perc.alumni -8.831660770
## Expend 0.075275975
## Grad.Rate 11.366625796
** These are the ridge model cofficients obtained using the best Lambda chosen by cross validation**
# Just change alpha from 0 to 1
lasso.mod=glmnet(x,y,alpha=1)
# lasso.mod
lasso.cv=cv.glmnet(x,y,alpha=1) # Remove the # below if you want to see all results
# ridge.cv
plot(lasso.cv)
bestlam=lasso.cv$lambda.min
min.mse = min(lasso.cv$cvm)
cbind("Best LASSO Lambda"=bestlam, "Best LASSO MSE"=min.mse)
## Best LASSO Lambda Best LASSO MSE
## [1,] 12.51776 1314657
predict(lasso.mod, type="coefficients", s=bestlam)
## 18 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) -545.91928048
## PrivateYes -466.37942894
## Accept 1.50970702
## Enroll -0.37570084
## Top10perc 40.35507151
## Top25perc -7.03944778
## F.Undergrad .
## P.Undergrad 0.03677940
## Outstate -0.07046711
## Room.Board 0.13714936
## Books .
## Personal 0.01508622
## PhD -6.97610541
## Terminal -3.11527371
## S.F.Ratio 9.55099994
## perc.alumni -0.65186829
## Expend 0.07310876
## Grad.Rate 6.54067105
** These are the ridge model cofficients obtained using the best Lambda chosen by cross validation**
Smallest Deviance with LASSO model. LASSO yields a smaller compared to Ridge’s MSE. LASSO is slightly better in this case.The relative importance of the coefficients is similar between the two models, but the LASSO coefficients are slightly smaller because the optimal Lambda is smaller and, therefore, there was less shrinkage in the resulting coefficients.
For Principal Components Regression (PCR), we will use the pcr(){pls} function and the College data set. let us load first the {pls} library and fit a PCR model to predict Apps using all the remaining variables as predictors and the full data set. we will use the LOOCV validation this time scale=T, validation=“LOO” and Store the results in an object named pcr.fit.
library(pls)
##
## Attaching package: 'pls'
## The following object is masked from 'package:stats':
##
## loadings
attach(College)
pcr.fit=pcr(Apps~., data = College[,-1], scale=TRUE, validation="LOO")
summary(pcr.fit)
## Data: X dimension: 777 16
## Y dimension: 777 1
## Fit method: svdpc
## Number of components considered: 16
##
## VALIDATION: RMSEP
## Cross-validated using 777 leave-one-out segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 3873 3667 1988 1988 2024 1591 1587
## adjCV 3873 3667 1988 1988 2026 1590 1587
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 1588 1551 1503 1501 1502 1506 1506
## adjCV 1588 1551 1503 1501 1502 1506 1506
## 14 comps 15 comps 16 comps
## CV 1439 1172 1134
## adjCV 1439 1172 1134
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps
## X 32.77 57.08 64.39 70.22 75.96 81.25 85.03
## Apps 10.83 74.28 74.53 74.62 83.94 84.06 84.13
## 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps 14 comps
## X 88.65 91.93 94.44 96.39 97.77 98.67 99.32
## Apps 85.00 85.80 85.93 85.98 85.99 86.02 90.20
## 15 comps 16 comps
## X 99.83 100.0
## Apps 92.36 92.8
Displaying a scree plot for the Root Square of the MSE (val.type=“RMSEP”) and summary results of pcr.fit using the validationplot(){pls} function.
validationplot(pcr.fit, val.type="RMSEP")
summary(pcr.fit)
## Data: X dimension: 777 16
## Y dimension: 777 1
## Fit method: svdpc
## Number of components considered: 16
##
## VALIDATION: RMSEP
## Cross-validated using 777 leave-one-out segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 3873 3667 1988 1988 2024 1591 1587
## adjCV 3873 3667 1988 1988 2026 1590 1587
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 1588 1551 1503 1501 1502 1506 1506
## adjCV 1588 1551 1503 1501 1502 1506 1506
## 14 comps 15 comps 16 comps
## CV 1439 1172 1134
## adjCV 1439 1172 1134
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps
## X 32.77 57.08 64.39 70.22 75.96 81.25 85.03
## Apps 10.83 74.28 74.53 74.62 83.94 84.06 84.13
## 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps 14 comps
## X 88.65 91.93 94.44 96.39 97.77 98.67 99.32
## Apps 85.00 85.80 85.93 85.98 85.99 86.02 90.20
## 15 comps 16 comps
## X 99.83 100.0
## Apps 92.36 92.8
Based on the 2 outputs above, the number of components that will be appropriate for this model will be discussed here refering to “elbows” in the scree plot, the cross-validation (CV values of the RMSEP), and the % explained variance of the various components.
The scree plot “elbows” at around 2, 4 5, and 14 components, so these are good candidates. The RMSEP CV value declines sharply as we add more components, with 14 components showing the lowest RMSEP. But 4 components explain 65% of the variance in the predictors. Based on this analysis, I would select 4 components. 14 components are way too many to achieve any meaningful dimension reduction, but it would still be a viable model because all 14 principal components are “orthogonal” (i.e., uncorrelated). 4 components would be OK, but I generally like to have at least 70% of the predictors variance explained.
reg = lm(Apps~., data = College[,-1], scale=TRUE)
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'scale' will be disregarded
summary(reg)
##
## Call:
## lm(formula = Apps ~ ., data = College[, -1], scale = TRUE)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5060.6 -429.2 -18.0 309.7 7661.8
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -894.795010 391.609762 -2.285 0.02259 *
## Accept 1.591269 0.041029 38.784 < 2e-16 ***
## Enroll -0.890291 0.187386 -4.751 2.42e-06 ***
## Top10perc 49.677562 5.621155 8.838 < 2e-16 ***
## Top25perc -14.375035 4.513776 -3.185 0.00151 **
## F.Undergrad 0.073030 0.032674 2.235 0.02570 *
## P.Undergrad 0.049895 0.032355 1.542 0.12346
## Outstate -0.109284 0.018041 -6.058 2.17e-09 ***
## Room.Board 0.135106 0.048462 2.788 0.00544 **
## Books -0.009177 0.240111 -0.038 0.96952
## Personal 0.031471 0.063566 0.495 0.62068
## PhD -6.789119 4.643923 -1.462 0.14417
## Terminal -1.370997 5.104909 -0.269 0.78834
## S.F.Ratio 23.047061 12.929393 1.783 0.07506 .
## perc.alumni -1.169644 4.116777 -0.284 0.77640
## Expend 0.081959 0.012392 6.614 7.06e-11 ***
## Grad.Rate 8.040662 2.966617 2.710 0.00687 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1049 on 760 degrees of freedom
## Multiple R-squared: 0.928, Adjusted R-squared: 0.9265
## F-statistic: 612.1 on 16 and 760 DF, p-value: < 2.2e-16
x = model.matrix(reg)
head(x)
## (Intercept) Accept Enroll Top10perc Top25perc
## Abilene Christian University 1 1232 721 23 52
## Adelphi University 1 1924 512 16 29
## Adrian College 1 1097 336 22 50
## Agnes Scott College 1 349 137 60 89
## Alaska Pacific University 1 146 55 16 44
## Albertson College 1 479 158 38 62
## F.Undergrad P.Undergrad Outstate Room.Board
## Abilene Christian University 2885 537 7440 3300
## Adelphi University 2683 1227 12280 6450
## Adrian College 1036 99 11250 3750
## Agnes Scott College 510 63 12960 5450
## Alaska Pacific University 249 869 7560 4120
## Albertson College 678 41 13500 3335
## Books Personal PhD Terminal S.F.Ratio
## Abilene Christian University 450 2200 70 78 18.1
## Adelphi University 750 1500 29 30 12.2
## Adrian College 400 1165 53 66 12.9
## Agnes Scott College 450 875 92 97 7.7
## Alaska Pacific University 800 1500 76 72 11.9
## Albertson College 500 675 67 73 9.4
## perc.alumni Expend Grad.Rate
## Abilene Christian University 12 7041 60
## Adelphi University 16 10527 56
## Adrian College 30 8735 54
## Agnes Scott College 37 19016 59
## Alaska Pacific University 2 10922 15
## Albertson College 11 9727 55
princomp(x)
## Call:
## princomp(x = x)
##
## Standard deviations:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6
## 6147.789307 5502.758528 2464.281335 1257.955785 989.756759 782.815423
## Comp.7 Comp.8 Comp.9 Comp.10 Comp.11 Comp.12
## 606.198957 200.326557 159.062212 21.258838 14.831438 12.548620
## Comp.13 Comp.14 Comp.15 Comp.16 Comp.17
## 8.998635 6.033261 5.322451 2.909312 0.000000
##
## 17 variables and 777 observations.
pc = princomp(x)
summary(pc)
## Importance of components:
## Comp.1 Comp.2 Comp.3
## Standard deviation 6147.7893073 5502.7585281 2464.2813348
## Proportion of Variance 0.4860689 0.3894223 0.0780981
## Cumulative Proportion 0.4860689 0.8754912 0.9535893
## Comp.4 Comp.5 Comp.6
## Standard deviation 1257.95578516 989.75675903 782.815422961
## Proportion of Variance 0.02035123 0.01259844 0.007880951
## Cumulative Proportion 0.97394051 0.98653895 0.994419903
## Comp.7 Comp.8 Comp.9
## Standard deviation 606.198957144 200.3265568670 159.0622117122
## Proportion of Variance 0.004725962 0.0005161037 0.0003253823
## Cumulative Proportion 0.999145865 0.9996619688 0.9999873510
## Comp.10 Comp.11 Comp.12
## Standard deviation 21.258838393006 14.831437967099 12.548620325534
## Proportion of Variance 0.000005812179 0.000002828957 0.000002025125
## Cumulative Proportion 0.999993163227 0.999995992185 0.999998017310
## Comp.13 Comp.14 Comp.15
## Standard deviation 8.998635090312 6.0332609293813 5.3224509652541
## Proportion of Variance 0.000001041389 0.0000004681274 0.0000003643202
## Cumulative Proportion 0.999999058699 0.9999995268268 0.9999998911470
## Comp.16 Comp.17
## Standard deviation 2.909312274850 0
## Proportion of Variance 0.000000108853 0
## Cumulative Proportion 1.000000000000 1
screeplot(pc)
thus, the first 3 Principal components could explain 95.42 % all the variation in data.
library(pls)
pcreg=pcr(Apps~., data = College[,-1], scale=TRUE, validation="CV")
summary(pcreg)
## Data: X dimension: 777 16
## Y dimension: 777 1
## Fit method: svdpc
## Number of components considered: 16
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 3873 3664 1988 1998 1926 1594 1587
## adjCV 3873 3664 1985 1999 2004 1581 1585
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 1582 1545 1496 1494 1497 1503 1503
## adjCV 1585 1540 1493 1492 1494 1500 1500
## 14 comps 15 comps 16 comps
## CV 1421 1157 1123
## adjCV 1405 1151 1119
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps
## X 32.77 57.08 64.39 70.22 75.96 81.25 85.03
## Apps 10.83 74.28 74.53 74.62 83.94 84.06 84.13
## 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps 14 comps
## X 88.65 91.93 94.44 96.39 97.77 98.67 99.32
## Apps 85.00 85.80 85.93 85.98 85.99 86.02 90.20
## 15 comps 16 comps
## X 99.83 100.0
## Apps 92.36 92.8
1 comps expain unsupervised 32.77% of total variance explain in X, with 10.83 % supervised variation in y (Apps): var(y) and etc..
#pcreg$loadings # The linear weight of each variable on each
#head(pcreg$loadings)
#pcreg$scores # # Resulting from applying loadings to each data
#head(pcreg$scores)
set.seed(1) # You can use any seed number
test=sample(1:nrow(College), 0.10*nrow(College))
pcr.pred.4=predict(pcreg, College[test,], ncomp=4)
mse.4=mean((College$Apps[test]-pcr.pred.4)^2) # Compute the MSE
rmse.4=sqrt(mse.4)
pcr.pred.5=predict(pcreg, College[test,], ncomp=5)
mse.5=mean((College$Apps[test]-pcr.pred.5)^2) # Compute the MSE
rmse.5=sqrt(mse.5)
pcr.pred.10=predict(pcreg, College[test,], ncomp=10)
mse.10=mean((College$Apps[test]-pcr.pred.10)^2) # Compute the MSE
rmse.10=sqrt(mse.10)
mse.test=data.frame(c(mse.4, mse.5, mse.10), c(rmse.4, rmse.5, rmse.10))
colnames(mse.test)=c("Test MSE", "Test RMSE")
rownames(mse.test)=c("4 Comp", "5 Comp", "10 Comp")
mse.test
## Test MSE Test RMSE
## 4 Comp 15152636 3892.639
## 5 Comp 9730377 3119.355
## 10 Comp 8820289 2969.897
library(pls)
plsreg=plsr(Apps~., data = College[,-1], scale=TRUE, validation="CV")
summary(plsreg)
## Data: X dimension: 777 16
## Y dimension: 777 1
## Fit method: kernelpls
## Number of components considered: 16
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 3873 1845 1660 1441 1355 1243 1179
## adjCV 3873 1843 1657 1437 1342 1236 1172
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 1174 1167 1162 1161 1160 1160 1159
## adjCV 1166 1159 1155 1154 1153 1153 1153
## 14 comps 15 comps 16 comps
## CV 1159 1159 1159
## adjCV 1152 1152 1152
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps
## X 26.16 52.39 62.56 65.36 69.34 74.09 76.80
## Apps 77.97 82.66 87.65 90.56 92.12 92.57 92.64
## 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps 14 comps
## X 79.73 82.60 85.35 87.98 91.73 94.0 96.4
## Apps 92.70 92.75 92.77 92.79 92.79 92.8 92.8
## 15 comps 16 comps
## X 98.76 100.0
## Apps 92.80 92.8
validationplot(plsreg, val.type="RMSEP")
summary(plsreg)
## Data: X dimension: 777 16
## Y dimension: 777 1
## Fit method: kernelpls
## Number of components considered: 16
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 3873 1845 1660 1441 1355 1243 1179
## adjCV 3873 1843 1657 1437 1342 1236 1172
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 1174 1167 1162 1161 1160 1160 1159
## adjCV 1166 1159 1155 1154 1153 1153 1153
## 14 comps 15 comps 16 comps
## CV 1159 1159 1159
## adjCV 1152 1152 1152
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps
## X 26.16 52.39 62.56 65.36 69.34 74.09 76.80
## Apps 77.97 82.66 87.65 90.56 92.12 92.57 92.64
## 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps 14 comps
## X 79.73 82.60 85.35 87.98 91.73 94.0 96.4
## Apps 92.70 92.75 92.77 92.79 92.79 92.8 92.8
## 15 comps 16 comps
## X 98.76 100.0
## Apps 92.80 92.8
#plsreg$loadings
#head(plsreg$loadings)
pls.pred.4=predict(plsreg, College[test,], ncomp=4)
mse.4=mean((College$Apps[test]-pls.pred.4)^2) # Compute the MSE rmse.4=sqrt(mse.4)
pls.pred.5=predict(plsreg, College[test,], ncomp=5)
mse.5=mean((College$Apps[test]-pls.pred.5)^2) # Compute the MSE rmse.5=sqrt(mse.5)
pls.pred.10=predict(plsreg, College[test,], ncomp=10)
mse.10=mean((College$Apps[test]-pls.pred.10)^2) # Compute the MSE rmse.10=sqrt(mse.10)
mse.test=data.frame(c(mse.4, mse.5, mse.10), c(rmse.4, rmse.5, rmse.10))
colnames(mse.test)=c("Test MSE", "Test RMSE")
rownames(mse.test)=c("4 Comp", "5 Comp", "10 Comp")
mse.test
## Test MSE Test RMSE
## 4 Comp 4358302 3892.639
## 5 Comp 2979685 3119.355
## 10 Comp 2501719 2969.897
** Without question, the 10 component model yields the lowest MSE and RMSE, so it is the best. But if our goal is dimension reduction, we could choose a model with fewer components.**
require(ISLR)
data(College)
set.seed(1)
trainid <- sample(1:nrow(College), nrow(College)/2)
train <- College[trainid,]
test <- College[-trainid,]
str(College)
## 'data.frame': 777 obs. of 18 variables:
## $ Private : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 2 ...
## $ Apps : num 1660 2186 1428 417 193 ...
## $ Accept : num 1232 1924 1097 349 146 ...
## $ Enroll : num 721 512 336 137 55 158 103 489 227 172 ...
## $ Top10perc : num 23 16 22 60 16 38 17 37 30 21 ...
## $ Top25perc : num 52 29 50 89 44 62 45 68 63 44 ...
## $ F.Undergrad: num 2885 2683 1036 510 249 ...
## $ P.Undergrad: num 537 1227 99 63 869 ...
## $ Outstate : num 7440 12280 11250 12960 7560 ...
## $ Room.Board : num 3300 6450 3750 5450 4120 ...
## $ Books : num 450 750 400 450 800 500 500 450 300 660 ...
## $ Personal : num 2200 1500 1165 875 1500 ...
## $ PhD : num 70 29 53 92 76 67 90 89 79 40 ...
## $ Terminal : num 78 30 66 97 72 73 93 100 84 41 ...
## $ S.F.Ratio : num 18.1 12.2 12.9 7.7 11.9 9.4 11.5 13.7 11.3 11.5 ...
## $ perc.alumni: num 12 16 30 37 2 11 26 37 23 15 ...
## $ Expend : num 7041 10527 8735 19016 10922 ...
## $ Grad.Rate : num 60 56 54 59 15 55 63 73 80 52 ...
fit.lm <- lm(Apps~., data=train)
pred.lm <- predict(fit.lm, test)
(err.lm <- mean((test$Apps - pred.lm)^2)) # test error
## [1] 1108531
require(glmnet)
xmat.train <- model.matrix(Apps~., data=train)[,-1]
xmat.test <- model.matrix(Apps~., data=test)[,-1]
fit.ridge <- cv.glmnet(xmat.train, train$Apps, alpha=0)
(lambda <- fit.ridge$lambda.min) # optimal lambda
## [1] 450.7435
pred.ridge <- predict(fit.ridge, s=lambda, newx=xmat.test)
(err.ridge <- mean((test$Apps - pred.ridge)^2)) # test error
## [1] 1037616
require(glmnet)
xmat.train <- model.matrix(Apps~., data=train)[,-1]
xmat.test <- model.matrix(Apps~., data=test)[,-1]
fit.lasso <- cv.glmnet(xmat.train, train$Apps, alpha=1)
(lambda <- fit.lasso$lambda.min) # optimal lambda
## [1] 29.65591
pred.lasso <- predict(fit.lasso, s=lambda, newx=xmat.test)
(err.lasso <- mean((test$Apps - pred.lasso)^2)) # test error
## [1] 1025248
coef.lasso <- predict(fit.lasso, type="coefficients", s=lambda)[1:ncol(College),]
coef.lasso[coef.lasso != 0]
## (Intercept) PrivateYes Accept Enroll Top10perc
## -451.422290444 -481.435163973 1.535012486 -0.403545479 46.681699352
## Top25perc F.Undergrad Outstate Room.Board PhD
## -7.091364043 -0.003426988 -0.050200640 0.185160995 -3.849884689
## Terminal perc.alumni Expend Grad.Rate
## -3.443746687 -2.117869862 0.031928455 2.695729825
length(coef.lasso[coef.lasso != 0])
## [1] 14
require(pls)
set.seed(1)
fit.pcr <- pcr(Apps~., data=train, scale=TRUE, validation="CV")
validationplot(fit.pcr, val.type="MSEP")
summary(fit.pcr)
## Data: X dimension: 388 17
## Y dimension: 388 1
## Fit method: svdpc
## Number of components considered: 17
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 4335 4179 2364 2374 1996 1844 1845
## adjCV 4335 4182 2360 2374 1788 1831 1838
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 1850 1863 1809 1809 1812 1815 1825
## adjCV 1844 1857 1801 1800 1804 1808 1817
## 14 comps 15 comps 16 comps 17 comps
## CV 1810 1823 1273 1281
## adjCV 1806 1789 1260 1268
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps
## X 31.216 57.68 64.73 70.55 76.33 81.30 85.01
## Apps 6.976 71.47 71.58 83.32 83.44 83.45 83.46
## 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps 14 comps
## X 88.40 91.16 93.36 95.38 96.94 97.96 98.76
## Apps 83.47 84.53 84.86 84.98 84.98 84.99 85.24
## 15 comps 16 comps 17 comps
## X 99.40 99.87 100.00
## Apps 90.87 93.93 93.97
pred.pcr <- predict(fit.pcr, test, ncomp=16) # min Cv at M=16
(err.pcr <- mean((test$Apps - pred.pcr)^2)) # test error
## [1] 1166897
require(pls)
set.seed(1)
fit.pls <- plsr(Apps~., data=train, scale=TRUE, validation="CV")
validationplot(fit.pls, val.type="MSEP")
summary(fit.pls)
## Data: X dimension: 388 17
## Y dimension: 388 1
## Fit method: kernelpls
## Number of components considered: 17
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 4335 2176 1893 1725 1613 1406 1312
## adjCV 4335 2171 1884 1715 1578 1375 1295
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 1297 1285 1280 1278 1279 1282 1281
## adjCV 1281 1271 1267 1265 1266 1269 1268
## 14 comps 15 comps 16 comps 17 comps
## CV 1281 1281 1281 1281
## adjCV 1267 1267 1268 1268
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps
## X 26.91 43.08 63.26 65.16 68.50 73.75 76.10
## Apps 76.64 83.93 87.14 91.90 93.49 93.85 93.91
## 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps 14 comps
## X 79.03 81.76 85.41 89.03 91.38 93.31 95.43
## Apps 93.94 93.96 93.96 93.96 93.97 93.97 93.97
## 15 comps 16 comps 17 comps
## X 97.41 98.78 100.00
## Apps 93.97 93.97 93.97
pred.pls <- predict(fit.pls, test, ncomp=10) # min Cv at M=10
(err.pls <- mean((test$Apps - pred.pls)^2)) # test error
## [1] 1134531
err.pls
## [1] 1134531
err.all <- c(err.lm, err.ridge, err.lasso, err.pcr, err.pls)
names(err.all) <- c("lm", "ridge", "lasso", "pcr", "pls")
barplot(err.all )
The test errors aren’t much different. The ridge and lasso seem to perform slightly better while the PCR/PLS don’t show any improvement from the full linear regression model.