library(ISLR)

## Warning: package 'ISLR' was built under R version 3.5.2

library(glmnet)

## Loading required package: Matrix

## Loading required package: foreach

## Loaded glmnet 2.0-16

set.seed(1)
names(College)

##  [1] "Private"     "Apps"        "Accept"      "Enroll"      "Top10perc"  
##  [6] "Top25perc"   "F.Undergrad" "P.Undergrad" "Outstate"    "Room.Board" 
## [11] "Books"       "Personal"    "PhD"         "Terminal"    "S.F.Ratio"  
## [16] "perc.alumni" "Expend"      "Grad.Rate"

head(College)

##                              Private Apps Accept Enroll Top10perc
## Abilene Christian University     Yes 1660   1232    721        23
## Adelphi University               Yes 2186   1924    512        16
## Adrian College                   Yes 1428   1097    336        22
## Agnes Scott College              Yes  417    349    137        60
## Alaska Pacific University        Yes  193    146     55        16
## Albertson College                Yes  587    479    158        38
##                              Top25perc F.Undergrad P.Undergrad Outstate
## Abilene Christian University        52        2885         537     7440
## Adelphi University                  29        2683        1227    12280
## Adrian College                      50        1036          99    11250
## Agnes Scott College                 89         510          63    12960
## Alaska Pacific University           44         249         869     7560
## Albertson College                   62         678          41    13500
##                              Room.Board Books Personal PhD Terminal
## Abilene Christian University       3300   450     2200  70       78
## Adelphi University                 6450   750     1500  29       30
## Adrian College                     3750   400     1165  53       66
## Agnes Scott College                5450   450      875  92       97
## Alaska Pacific University          4120   800     1500  76       72
## Albertson College                  3335   500      675  67       73
##                              S.F.Ratio perc.alumni Expend Grad.Rate
## Abilene Christian University      18.1          12   7041        60
## Adelphi University                12.2          16  10527        56
## Adrian College                    12.9          30   8735        54
## Agnes Scott College                7.7          37  19016        59
## Alaska Pacific University         11.9           2  10922        15
## Albertson College                  9.4          11   9727        55

College=na.omit(College) # Remove records with omitted data
x=model.matrix(Apps~.,data=College)[,-1] # Predictor matrix
y=College$Apps # Outcome variable vector
head(x)

##                              PrivateYes Accept Enroll Top10perc Top25perc
## Abilene Christian University          1   1232    721        23        52
## Adelphi University                    1   1924    512        16        29
## Adrian College                        1   1097    336        22        50
## Agnes Scott College                   1    349    137        60        89
## Alaska Pacific University             1    146     55        16        44
## Albertson College                     1    479    158        38        62
##                              F.Undergrad P.Undergrad Outstate Room.Board
## Abilene Christian University        2885         537     7440       3300
## Adelphi University                  2683        1227    12280       6450
## Adrian College                      1036          99    11250       3750
## Agnes Scott College                  510          63    12960       5450
## Alaska Pacific University            249         869     7560       4120
## Albertson College                    678          41    13500       3335
##                              Books Personal PhD Terminal S.F.Ratio
## Abilene Christian University   450     2200  70       78      18.1
## Adelphi University             750     1500  29       30      12.2
## Adrian College                 400     1165  53       66      12.9
## Agnes Scott College            450      875  92       97       7.7
## Alaska Pacific University      800     1500  76       72      11.9
## Albertson College              500      675  67       73       9.4
##                              perc.alumni Expend Grad.Rate
## Abilene Christian University          12   7041        60
## Adelphi University                    16  10527        56
## Adrian College                        30   8735        54
## Agnes Scott College                   37  19016        59
## Alaska Pacific University              2  10922        15
## Albertson College                     11   9727        55

Dimensionality: Multicollinearity Analysis

library(perturb)
lm.fit.all <- lm(Apps~Accept+Enroll+Top10perc+Top25perc+F.Undergrad+P.Undergrad+Outstate+Room.Board+Books+Personal+PhD+Terminal+S.F.Ratio+perc.alumni+Expend+Grad.Rate, data=College)
collin.diag = colldiag(mod=lm.fit.all, scale=FALSE, center=FALSE, add.intercept=TRUE)
collin.diag

## Condition
## Index    Variance Decomposition Proportions
##                intercept Accept Enroll Top10perc Top25perc F.Undergrad
## 1        1.000 0.000     0.000  0.000  0.000     0.000     0.000      
## 2        2.942 0.000     0.004  0.000  0.000     0.000     0.030      
## 3        5.776 0.000     0.000  0.000  0.000     0.000     0.000      
## 4       11.899 0.000     0.076  0.000  0.000     0.000     0.000      
## 5       16.068 0.000     0.008  0.000  0.000     0.000     0.007      
## 6       16.777 0.000     0.528  0.000  0.000     0.000     0.255      
## 7       24.945 0.000     0.056  0.000  0.000     0.000     0.040      
## 8       81.936 0.000     0.216  0.928  0.000     0.000     0.620      
## 9       96.126 0.000     0.020  0.033  0.000     0.000     0.009      
## 10     666.209 0.000     0.021  0.009  0.005     0.027     0.011      
## 11    1028.079 0.000     0.001  0.000  0.049     0.085     0.000      
## 12    1225.757 0.000     0.000  0.005  0.011     0.018     0.014      
## 13    1844.757 0.000     0.029  0.007  0.002     0.002     0.002      
## 14    2600.748 0.000     0.000  0.004  0.264     0.349     0.002      
## 15    3068.396 0.000     0.005  0.002  0.500     0.434     0.000      
## 16    4639.750 0.000     0.010  0.003  0.094     0.047     0.001      
## 17  172742.443 1.000     0.026  0.009  0.074     0.039     0.008      
##    P.Undergrad Outstate Room.Board Books Personal PhD   Terminal S.F.Ratio
## 1  0.000       0.007    0.000      0.000 0.000    0.000 0.000    0.000    
## 2  0.001       0.009    0.000      0.000 0.000    0.000 0.000    0.000    
## 3  0.000       0.193    0.005      0.000 0.000    0.000 0.000    0.000    
## 4  0.377       0.068    0.047      0.000 0.017    0.000 0.000    0.000    
## 5  0.464       0.299    0.249      0.000 0.031    0.000 0.000    0.000    
## 6  0.079       0.126    0.068      0.000 0.000    0.000 0.000    0.000    
## 7  0.007       0.042    0.216      0.000 0.623    0.000 0.000    0.000    
## 8  0.011       0.003    0.000      0.014 0.013    0.000 0.000    0.000    
## 9  0.000       0.001    0.130      0.787 0.136    0.000 0.000    0.000    
## 10 0.009       0.145    0.101      0.059 0.027    0.029 0.021    0.000    
## 11 0.031       0.005    0.062      0.004 0.025    0.044 0.043    0.000    
## 12 0.016       0.028    0.023      0.002 0.000    0.019 0.009    0.000    
## 13 0.000       0.046    0.025      0.001 0.008    0.004 0.001    0.000    
## 14 0.001       0.002    0.021      0.047 0.007    0.508 0.286    0.004    
## 15 0.003       0.004    0.000      0.003 0.005    0.387 0.564    0.003    
## 16 0.000       0.021    0.012      0.031 0.015    0.009 0.025    0.630    
## 17 0.000       0.002    0.040      0.052 0.094    0.000 0.052    0.362    
##    perc.alumni Expend Grad.Rate
## 1  0.000       0.014  0.000    
## 2  0.000       0.006  0.000    
## 3  0.000       0.627  0.000    
## 4  0.000       0.002  0.000    
## 5  0.000       0.000  0.000    
## 6  0.000       0.003  0.000    
## 7  0.000       0.000  0.000    
## 8  0.000       0.002  0.000    
## 9  0.000       0.000  0.000    
## 10 0.003       0.012  0.037    
## 11 0.001       0.050  0.002    
## 12 0.018       0.038  0.687    
## 13 0.954       0.002  0.126    
## 14 0.011       0.064  0.010    
## 15 0.003       0.074  0.011    
## 16 0.004       0.064  0.051    
## 17 0.006       0.040  0.076

We know that CI >50 is an indicator of multicollinearity. In our case, The CI for the majority of the variables (8 to 17) are over 50. Thus, we can conclude that our model exhibits high level of multicollinearity.

collin.diag = colldiag(mod=lm.fit.all, scale=FALSE, center=TRUE, add.intercept=FALSE)
collin.diag

## Condition
## Index    Variance Decomposition Proportions
##              Accept Enroll Top10perc Top25perc F.Undergrad P.Undergrad
## 1      1.000 0.000  0.000  0.000     0.000     0.001       0.000      
## 2      1.117 0.005  0.000  0.000     0.000     0.031       0.001      
## 3      2.495 0.004  0.000  0.000     0.000     0.002       0.000      
## 4      4.887 0.055  0.000  0.000     0.000     0.000       0.741      
## 5      6.211 0.536  0.000  0.000     0.000     0.260       0.097      
## 6      7.853 0.084  0.000  0.000     0.000     0.043       0.059      
## 7     10.142 0.006  0.000  0.000     0.000     0.006       0.032      
## 8     30.689 0.251  0.974  0.000     0.000     0.618       0.009      
## 9     38.650 0.000  0.000  0.000     0.000     0.000       0.000      
## 10   289.187 0.005  0.002  0.022     0.063     0.015       0.017      
## 11   414.511 0.002  0.004  0.013     0.027     0.006       0.035      
## 12   489.918 0.004  0.001  0.022     0.059     0.005       0.004      
## 13   683.191 0.030  0.007  0.002     0.001     0.002       0.000      
## 14  1018.983 0.004  0.006  0.192     0.217     0.004       0.000      
## 15  1155.067 0.014  0.006  0.748     0.633     0.000       0.004      
## 16  2113.142 0.000  0.000  0.000     0.000     0.006       0.000      
##    Outstate Room.Board Books Personal PhD   Terminal S.F.Ratio perc.alumni
## 1  0.038    0.000      0.000 0.000    0.000 0.000    0.000     0.000      
## 2  0.000    0.000      0.000 0.000    0.000 0.000    0.000     0.000      
## 3  0.443    0.002      0.000 0.000    0.000 0.000    0.000     0.000      
## 4  0.024    0.004      0.000 0.002    0.000 0.000    0.000     0.000      
## 5  0.107    0.054      0.000 0.001    0.000 0.000    0.000     0.000      
## 6  0.151    0.852      0.000 0.002    0.000 0.000    0.000     0.000      
## 7  0.013    0.009      0.000 0.933    0.000 0.000    0.000     0.000      
## 8  0.001    0.016      0.000 0.000    0.000 0.000    0.000     0.000      
## 9  0.001    0.016      0.969 0.033    0.000 0.000    0.000     0.000      
## 10 0.112    0.000      0.001 0.004    0.024 0.013    0.000     0.004      
## 11 0.005    0.005      0.003 0.002    0.110 0.079    0.000     0.004      
## 12 0.040    0.012      0.006 0.009    0.010 0.006    0.000     0.023      
## 13 0.044    0.024      0.000 0.007    0.005 0.001    0.000     0.949      
## 14 0.000    0.005      0.014 0.001    0.596 0.588    0.000     0.012      
## 15 0.002    0.001      0.007 0.000    0.246 0.313    0.000     0.001      
## 16 0.020    0.000      0.000 0.006    0.009 0.000    1.000     0.008      
##    Expend Grad.Rate
## 1  0.150  0.000    
## 2  0.021  0.000    
## 3  0.479  0.000    
## 4  0.001  0.000    
## 5  0.003  0.000    
## 6  0.003  0.000    
## 7  0.007  0.000    
## 8  0.004  0.000    
## 9  0.003  0.000    
## 10 0.071  0.026    
## 11 0.000  0.112    
## 12 0.020  0.719    
## 13 0.002  0.136    
## 14 0.024  0.006    
## 15 0.082  0.000    
## 16 0.128  0.000

When we use scale=F,center=T and add.intercept=F, a little less than half of the variables are under CI=30 and almost (roughly) half of the variables have CI value much higher than 50. Therefore, I would say that there is still severe multicollinearity in this model.

another way to check multicolliniarity is Variance inflation factor (VIF)

let us do some analysis on VIF to see multi-coliniarity before we rush in to modeling part.

library(car)

## Loading required package: carData

vif(lm.fit.all)

##      Accept      Enroll   Top10perc   Top25perc F.Undergrad P.Undergrad 
##    7.126046   21.360983    6.928113    5.630782   17.697719    1.709601 
##    Outstate  Room.Board       Books    Personal         PhD    Terminal 
##    3.711710    1.990349    1.107372    1.305168    4.051287    3.979967 
##   S.F.Ratio perc.alumni      Expend   Grad.Rate 
##    1.845581    1.833717    2.950354    1.829794

summary(lm.fit.all)

## 
## Call:
## lm(formula = Apps ~ Accept + Enroll + Top10perc + Top25perc + 
##     F.Undergrad + P.Undergrad + Outstate + Room.Board + Books + 
##     Personal + PhD + Terminal + S.F.Ratio + perc.alumni + Expend + 
##     Grad.Rate, data = College)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5060.6  -429.2   -18.0   309.7  7661.8 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -8.948e+02  3.916e+02  -2.285  0.02259 *  
## Accept       1.591e+00  4.103e-02  38.784  < 2e-16 ***
## Enroll      -8.903e-01  1.874e-01  -4.751 2.42e-06 ***
## Top10perc    4.968e+01  5.621e+00   8.838  < 2e-16 ***
## Top25perc   -1.438e+01  4.514e+00  -3.185  0.00151 ** 
## F.Undergrad  7.303e-02  3.267e-02   2.235  0.02570 *  
## P.Undergrad  4.990e-02  3.236e-02   1.542  0.12346    
## Outstate    -1.093e-01  1.804e-02  -6.058 2.17e-09 ***
## Room.Board   1.351e-01  4.846e-02   2.788  0.00544 ** 
## Books       -9.177e-03  2.401e-01  -0.038  0.96952    
## Personal     3.147e-02  6.357e-02   0.495  0.62068    
## PhD         -6.789e+00  4.644e+00  -1.462  0.14417    
## Terminal    -1.371e+00  5.105e+00  -0.269  0.78834    
## S.F.Ratio    2.305e+01  1.293e+01   1.783  0.07506 .  
## perc.alumni -1.170e+00  4.117e+00  -0.284  0.77640    
## Expend       8.196e-02  1.239e-02   6.614 7.06e-11 ***
## Grad.Rate    8.041e+00  2.967e+00   2.710  0.00687 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1049 on 760 degrees of freedom
## Multiple R-squared:  0.928,  Adjusted R-squared:  0.9265 
## F-statistic: 612.1 on 16 and 760 DF,  p-value: < 2.2e-16

VIF above 10 is not tolerable. Thus, Milticolliniarity exists.

part a) fit least squares regression, selecting the best model

1. Variable Selection: Subset Comparison

Fit a full* model with all variables that make sense from a business standpoint, that is: Enroll, Top10perc, Outstate, Room.Board, PhD, S.F.Ratio, Expend and Grad.Rate. Name this model lm.fit.full. let us display the model summary results first.*

lm.fit.full <-lm(Apps~.,data=College)
summary(lm.fit.full)

## 
## Call:
## lm(formula = Apps ~ ., data = College)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4908.8  -430.2   -29.5   322.3  7852.5 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -445.08413  408.32855  -1.090 0.276053    
## PrivateYes  -494.14897  137.81191  -3.586 0.000358 ***
## Accept         1.58581    0.04074  38.924  < 2e-16 ***
## Enroll        -0.88069    0.18596  -4.736 2.60e-06 ***
## Top10perc     49.92628    5.57824   8.950  < 2e-16 ***
## Top25perc    -14.23448    4.47914  -3.178 0.001543 ** 
## F.Undergrad    0.05739    0.03271   1.754 0.079785 .  
## P.Undergrad    0.04445    0.03214   1.383 0.167114    
## Outstate      -0.08587    0.01906  -4.506 7.64e-06 ***
## Room.Board     0.15103    0.04829   3.127 0.001832 ** 
## Books          0.02090    0.23841   0.088 0.930175    
## Personal       0.03110    0.06308   0.493 0.622060    
## PhD           -8.67850    4.63814  -1.871 0.061714 .  
## Terminal      -3.33066    5.09494  -0.654 0.513492    
## S.F.Ratio     15.38961   13.00622   1.183 0.237081    
## perc.alumni    0.17867    4.10230   0.044 0.965273    
## Expend         0.07790    0.01235   6.308 4.79e-10 ***
## Grad.Rate      8.66763    2.94893   2.939 0.003390 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1041 on 759 degrees of freedom
## Multiple R-squared:  0.9292, Adjusted R-squared:  0.9276 
## F-statistic: 585.9 on 17 and 759 DF,  p-value: < 2.2e-16

2. best subset modelling

library(leaps)
lm.fit.subsets=regsubsets(Apps~., data=College)
subset.sum<-summary(lm.fit.subsets)
subset.sum

## Subset selection object
## Call: regsubsets.formula(Apps ~ ., data = College)
## 17 Variables  (and intercept)
##             Forced in Forced out
## PrivateYes      FALSE      FALSE
## Accept          FALSE      FALSE
## Enroll          FALSE      FALSE
## Top10perc       FALSE      FALSE
## Top25perc       FALSE      FALSE
## F.Undergrad     FALSE      FALSE
## P.Undergrad     FALSE      FALSE
## Outstate        FALSE      FALSE
## Room.Board      FALSE      FALSE
## Books           FALSE      FALSE
## Personal        FALSE      FALSE
## PhD             FALSE      FALSE
## Terminal        FALSE      FALSE
## S.F.Ratio       FALSE      FALSE
## perc.alumni     FALSE      FALSE
## Expend          FALSE      FALSE
## Grad.Rate       FALSE      FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: exhaustive
##          PrivateYes Accept Enroll Top10perc Top25perc F.Undergrad
## 1  ( 1 ) " "        "*"    " "    " "       " "       " "        
## 2  ( 1 ) " "        "*"    " "    "*"       " "       " "        
## 3  ( 1 ) " "        "*"    " "    "*"       " "       " "        
## 4  ( 1 ) " "        "*"    " "    "*"       " "       " "        
## 5  ( 1 ) " "        "*"    "*"    "*"       " "       " "        
## 6  ( 1 ) " "        "*"    "*"    "*"       " "       " "        
## 7  ( 1 ) " "        "*"    "*"    "*"       "*"       " "        
## 8  ( 1 ) "*"        "*"    "*"    "*"       " "       " "        
##          P.Undergrad Outstate Room.Board Books Personal PhD Terminal
## 1  ( 1 ) " "         " "      " "        " "   " "      " " " "     
## 2  ( 1 ) " "         " "      " "        " "   " "      " " " "     
## 3  ( 1 ) " "         " "      " "        " "   " "      " " " "     
## 4  ( 1 ) " "         "*"      " "        " "   " "      " " " "     
## 5  ( 1 ) " "         "*"      " "        " "   " "      " " " "     
## 6  ( 1 ) " "         "*"      "*"        " "   " "      " " " "     
## 7  ( 1 ) " "         "*"      "*"        " "   " "      " " " "     
## 8  ( 1 ) " "         "*"      "*"        " "   " "      "*" " "     
##          S.F.Ratio perc.alumni Expend Grad.Rate
## 1  ( 1 ) " "       " "         " "    " "      
## 2  ( 1 ) " "       " "         " "    " "      
## 3  ( 1 ) " "       " "         "*"    " "      
## 4  ( 1 ) " "       " "         "*"    " "      
## 5  ( 1 ) " "       " "         "*"    " "      
## 6  ( 1 ) " "       " "         "*"    " "      
## 7  ( 1 ) " "       " "         "*"    " "      
## 8  ( 1 ) " "       " "         "*"    " "

RSS<-subset.sum$rss
AdjR2<-subset.sum$adjr2
cbind(RSS,AdjR2)

##             RSS     AdjR2
## [1,] 1277410811 0.8899572
## [2,]  978867162 0.9155663
## [3,]  949208869 0.9180186
## [4,]  915171254 0.9208560
## [5,]  886160591 0.9232655
## [6,]  874694084 0.9241600
## [7,]  862855633 0.9250892
## [8,]  849981358 0.9261108

par(mfrow=c(1,2))
plot(RSS,xlab="Number of Variables",ylab="RSS",type="l")
plot(AdjR2,xlab="Number of Variables",ylab="Adjusted RSq",type="l")

3. Step wise selection method

a) forward elimination method

lm.fit.null <-lm(Apps~1,data=College) 
lm.fit.full <-lm(Apps~.,data=College)
lm.step.forward <- step(lm.fit.null, scope = list(lower=lm.fit.null, upper=lm.fit.full), direction="forward", test = "F")

## Start:  AIC=12838.69
## Apps ~ 1
## 
##               Df  Sum of Sq        RSS   AIC   F value    Pr(>F)    
## + Accept       1 1.0346e+10 1.2774e+09 11125 6276.8000 < 2.2e-16 ***
## + Enroll       1 8.3351e+09 3.2881e+09 11860 1964.5574 < 2.2e-16 ***
## + F.Undergrad  1 7.7108e+09 3.9125e+09 11995 1527.4013 < 2.2e-16 ***
## + Private      1 2.1701e+09 9.4531e+09 12680  177.9147 < 2.2e-16 ***
## + P.Undergrad  1 1.8436e+09 9.7797e+09 12706  146.0997 < 2.2e-16 ***
## + PhD          1 1.7742e+09 9.8491e+09 12712  139.6101 < 2.2e-16 ***
## + Terminal     1 1.5869e+09 1.0036e+10 12727  122.5350 < 2.2e-16 ***
## + Top25perc    1 1.4372e+09 1.0186e+10 12738  109.3505 < 2.2e-16 ***
## + Top10perc    1 1.3344e+09 1.0289e+10 12746  100.5165 < 2.2e-16 ***
## + Expend       1 7.8327e+08 1.0840e+10 12786   55.9994 1.971e-13 ***
## + Personal     1 3.7130e+08 1.1252e+10 12816   25.5741 5.318e-07 ***
## + Room.Board   1 3.1621e+08 1.1307e+10 12819   21.6734 3.802e-06 ***
## + Grad.Rate    1 2.5033e+08 1.1373e+10 12824   17.0585 4.019e-05 ***
## + Books        1 2.0424e+08 1.1419e+10 12827   13.8617  0.000211 ***
## + S.F.Ratio    1 1.0630e+08 1.1517e+10 12834    7.1533  0.007640 ** 
## + perc.alumni  1 9.4622e+07 1.1529e+10 12834    6.3608  0.011866 *  
## <none>                      1.1623e+10 12839                        
## + Outstate     1 2.9243e+07 1.1594e+10 12839    1.9548  0.162475    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step:  AIC=11124.94
## Apps ~ Accept
## 
##               Df Sum of Sq        RSS   AIC  F value    Pr(>F)    
## + Top10perc    1 298543648  978867162 10920 236.0614 < 2.2e-16 ***
## + Expend       1 237832439 1039578371 10967 177.0740 < 2.2e-16 ***
## + Top25perc    1 172865433 1104545378 11014 121.1339 < 2.2e-16 ***
## + Grad.Rate    1  80919712 1196491098 11076  52.3463 1.121e-12 ***
## + Room.Board   1  73480089 1203930722 11081  47.2399 1.293e-11 ***
## + Outstate     1  64480760 1212930051 11087  41.1467 2.457e-10 ***
## + S.F.Ratio    1  59842954 1217567857 11090  38.0418 1.115e-09 ***
## + perc.alumni  1  43974982 1233435829 11100  27.5950 1.935e-07 ***
## + PhD          1  40339305 1237071506 11102  25.2391 6.293e-07 ***
## + Terminal     1  34118294 1243292517 11106  21.2400 4.738e-06 ***
## + Enroll       1  12102492 1265308319 11120   7.4032  0.006657 ** 
## + Books        1   7628546 1269782265 11122   4.6500  0.031361 *  
## + F.Undergrad  1   5226771 1272184040 11124   3.1800  0.074937 .  
## + P.Undergrad  1   4704068 1272706743 11124   2.8608  0.091165 .  
## + Private      1   3980419 1273430392 11124   2.4193  0.120255    
## <none>                     1277410811 11125                       
## + Personal     1   1436992 1275973818 11126   0.8717  0.350784    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step:  AIC=10920.1
## Apps ~ Accept + Top10perc
## 
##               Df Sum of Sq       RSS   AIC F value    Pr(>F)    
## + Expend       1  29658293 949208869 10898 24.1526 1.087e-06 ***
## + Top25perc    1  22836110 956031052 10904 18.4642 1.952e-05 ***
## + Enroll       1  13912570 964954593 10911 11.1450 0.0008829 ***
## + Private      1  10668209 968198953 10914  8.5174 0.0036198 ** 
## + PhD          1   7596359 971270804 10916  6.0457 0.0141586 *  
## + Room.Board   1   6159533 972707629 10917  4.8949 0.0272275 *  
## + Outstate     1   5782676 973084487 10918  4.5936 0.0324021 *  
## + Terminal     1   5767400 973099762 10918  4.5814 0.0326323 *  
## + perc.alumni  1   5577199 973289963 10918  4.4295 0.0356454 *  
## + P.Undergrad  1   2567814 976299348 10920  2.0331 0.1543095    
## <none>                     978867162 10920                      
## + F.Undergrad  1   1718371 977148792 10921  1.3594 0.2440074    
## + Personal     1   1404273 977462889 10921  1.1105 0.2922958    
## + Books        1   1098190 977768973 10921  0.8682 0.3517446    
## + Grad.Rate    1    315353 978551809 10922  0.2491 0.6178437    
## + S.F.Ratio    1     73757 978793406 10922  0.0582 0.8093490    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step:  AIC=10898.2
## Apps ~ Accept + Top10perc + Expend
## 
##               Df Sum of Sq       RSS   AIC F value    Pr(>F)    
## + Outstate     1  34037615 915171254 10872 28.7127 1.109e-07 ***
## + Private      1  21921323 927287546 10882 18.2503 2.179e-05 ***
## + Top25perc    1  14720207 934488662 10888 12.1607 0.0005157 ***
## + PhD          1  12430477 936778392 10890 10.2440 0.0014274 ** 
## + Terminal     1  11956757 937252112 10890  9.8486 0.0017642 ** 
## + perc.alumni  1  11589036 937619833 10891  9.5420 0.0020803 ** 
## + Enroll       1   8007593 941201276 10894  6.5681 0.0105710 *  
## + S.F.Ratio    1   7737137 941471733 10894  6.3444 0.0119763 *  
## + P.Undergrad  1   2918539 946290330 10898  2.3810 0.1232291    
## <none>                     949208869 10898                      
## + Personal     1   2112443 947096426 10898  1.7219 0.1898386    
## + Books        1    637723 948571146 10900  0.5190 0.4714806    
## + Room.Board   1    264603 948944266 10900  0.2153 0.6428033    
## + F.Undergrad  1     52599 949156270 10900  0.0428 0.8361921    
## + Grad.Rate    1      1172 949207697 10900  0.0010 0.9753817    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step:  AIC=10871.82
## Apps ~ Accept + Top10perc + Expend + Outstate
## 
##               Df Sum of Sq       RSS   AIC F value    Pr(>F)    
## + Enroll       1  29010663 886160591 10849 25.2406 6.294e-07 ***
## + Room.Board   1  16274538 898896716 10860 13.9590 0.0002006 ***
## + Top25perc    1  10654640 904516614 10865  9.0819 0.0026661 ** 
## + F.Undergrad  1   8296040 906875214 10867  7.0531 0.0080764 ** 
## + PhD          1   7700769 907470485 10867  6.5427 0.0107218 *  
## + Grad.Rate    1   7321591 907849663 10868  6.2179 0.0128548 *  
## + Terminal     1   6021988 909149266 10869  5.1069 0.0241089 *  
## + Private      1   3410109 911761145 10871  2.8836 0.0898868 .  
## <none>                     915171254 10872                      
## + S.F.Ratio    1   2129211 913042043 10872  1.7980 0.1803530    
## + perc.alumni  1   1979846 913191408 10872  1.6716 0.1964360    
## + P.Undergrad  1    318048 914853206 10874  0.2680 0.6047997    
## + Books        1    270308 914900946 10874  0.2278 0.6333007    
## + Personal     1     51356 915119898 10874  0.0433 0.8352764    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step:  AIC=10848.79
## Apps ~ Accept + Top10perc + Expend + Outstate + Enroll
## 
##               Df Sum of Sq       RSS   AIC F value   Pr(>F)   
## + Room.Board   1  11466506 874694084 10841 10.0941 0.001547 **
## + Top25perc    1  11359437 874801154 10841  9.9986 0.001628 **
## + Private      1   9551354 876609237 10842  8.3898 0.003880 **
## + F.Undergrad  1   6287657 879872934 10845  5.5025 0.019242 * 
## + PhD          1   5184017 880976573 10846  4.5310 0.033603 * 
## + Grad.Rate    1   4977276 881183315 10846  4.3493 0.037354 * 
## + P.Undergrad  1   4196016 881964575 10847  3.6633 0.055993 . 
## + Terminal     1   3667705 882492885 10848  3.2002 0.074023 . 
## + S.F.Ratio    1   3471226 882689365 10848  3.0281 0.082235 . 
## <none>                     886160591 10849                    
## + perc.alumni  1   1150617 885009974 10850  1.0011 0.317361   
## + Personal     1    364916 885795675 10850  0.3172 0.573452   
## + Books        1    346795 885813795 10850  0.3015 0.583131   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step:  AIC=10840.67
## Apps ~ Accept + Top10perc + Expend + Outstate + Enroll + Room.Board
## 
##               Df Sum of Sq       RSS   AIC F value   Pr(>F)   
## + Top25perc    1  11838452 862855633 10832 10.5507 0.001212 **
## + Private      1  10278235 864415849 10834  9.1437 0.002579 **
## + PhD          1   6633253 868060832 10837  5.8763 0.015576 * 
## + Terminal     1   5679065 869015019 10838  5.0255 0.025262 * 
## + F.Undergrad  1   5218200 869475884 10838  4.6152 0.032002 * 
## + Grad.Rate    1   3809312 870884773 10839  3.3637 0.067036 . 
## + S.F.Ratio    1   3268726 871425358 10840  2.8845 0.089839 . 
## + P.Undergrad  1   2771544 871922540 10840  2.4444 0.118357   
## <none>                     874694084 10841                    
## + Personal     1    483131 874210954 10842  0.4250 0.514654   
## + perc.alumni  1    430235 874263849 10842  0.3784 0.538625   
## + Books        1     32071 874662014 10843  0.0282 0.866692   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step:  AIC=10832.09
## Apps ~ Accept + Top10perc + Expend + Outstate + Enroll + Room.Board + 
##     Top25perc
## 
##               Df Sum of Sq       RSS   AIC F value   Pr(>F)   
## + Private      1  11486640 851368993 10824 10.3618 0.001341 **
## + F.Undergrad  1   6829025 856026608 10828  6.1268 0.013530 * 
## + Grad.Rate    1   5022152 857833480 10830  4.4962 0.034290 * 
## + PhD          1   4164868 858690765 10830  3.7250 0.053972 . 
## + S.F.Ratio    1   3313460 859542173 10831  2.9606 0.085720 . 
## + P.Undergrad  1   3150967 859704666 10831  2.8149 0.093802 . 
## + Terminal     1   2866270 859989362 10832  2.5597 0.110032   
## <none>                     862855633 10832                    
## + Personal     1    424835 862430798 10834  0.3783 0.538687   
## + perc.alumni  1    111184 862744449 10834  0.0990 0.753150   
## + Books        1     56442 862799190 10834  0.0502 0.822704   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step:  AIC=10823.67
## Apps ~ Accept + Top10perc + Expend + Outstate + Enroll + Room.Board + 
##     Top25perc + Private
## 
##               Df Sum of Sq       RSS   AIC F value   Pr(>F)   
## + PhD          1  10749717 840619277 10816  9.8083 0.001803 **
## + Terminal     1   8264997 843103997 10818  7.5189 0.006248 **
## + Grad.Rate    1   6882166 844486827 10819  6.2507 0.012622 * 
## + F.Undergrad  1   3793524 847575469 10822  3.4329 0.064294 . 
## <none>                     851368993 10824                    
## + P.Undergrad  1   1682974 849686020 10824  1.5192 0.218119   
## + S.F.Ratio    1   1313916 850055078 10824  1.1855 0.276573   
## + Personal     1    283343 851085650 10825  0.2553 0.613479   
## + Books        1    119956 851249037 10826  0.1081 0.742426   
## + perc.alumni  1      3141 851365852 10826  0.0028 0.957588   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step:  AIC=10815.8
## Apps ~ Accept + Top10perc + Expend + Outstate + Enroll + Room.Board + 
##     Top25perc + Private + PhD
## 
##               Df Sum of Sq       RSS   AIC F value   Pr(>F)   
## + Grad.Rate    1   7349094 833270183 10811  6.7558 0.009524 **
## + F.Undergrad  1   4405096 836214181 10814  4.0352 0.044910 * 
## + P.Undergrad  1   2451457 838167820 10816  2.2404 0.134860   
## <none>                     840619277 10816                    
## + S.F.Ratio    1   1808849 838810427 10816  1.6518 0.199097   
## + Terminal     1    487207 840132070 10817  0.4442 0.505295   
## + Personal     1    284996 840334280 10818  0.2598 0.610414   
## + perc.alumni  1     78065 840541211 10818  0.0711 0.789753   
## + Books        1      3237 840616040 10818  0.0029 0.956701   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step:  AIC=10810.98
## Apps ~ Accept + Top10perc + Expend + Outstate + Enroll + Room.Board + 
##     Top25perc + Private + PhD + Grad.Rate
## 
##               Df Sum of Sq       RSS   AIC F value  Pr(>F)  
## + F.Undergrad  1   5704713 827565469 10808  5.2734 0.02192 *
## + P.Undergrad  1   4323227 828946955 10809  3.9897 0.04613 *
## <none>                     833270183 10811                  
## + S.F.Ratio    1   1711005 831559177 10811  1.5741 0.21000  
## + Personal     1    841830 832428353 10812  0.7736 0.37937  
## + Terminal     1    352500 832917683 10813  0.3238 0.56953  
## + perc.alumni  1    117245 833152938 10813  0.1077 0.74292  
## + Books        1     58008 833212174 10813  0.0533 0.81755  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step:  AIC=10807.64
## Apps ~ Accept + Top10perc + Expend + Outstate + Enroll + Room.Board + 
##     Top25perc + Private + PhD + Grad.Rate + F.Undergrad
## 
##               Df Sum of Sq       RSS   AIC F value Pr(>F)
## + P.Undergrad  1   2248227 825317242 10808  2.0812 0.1495
## <none>                     827565469 10808               
## + S.F.Ratio    1   1437928 826127541 10808  1.3298 0.2492
## + Terminal     1    515424 827050045 10809  0.4761 0.4904
## + Personal     1    426703 827138766 10809  0.3941 0.5303
## + perc.alumni  1     39675 827525794 10810  0.0366 0.8483
## + Books        1     25040 827540429 10810  0.0231 0.8792
## 
## Step:  AIC=10807.53
## Apps ~ Accept + Top10perc + Expend + Outstate + Enroll + Room.Board + 
##     Top25perc + Private + PhD + Grad.Rate + F.Undergrad + P.Undergrad
## 
##               Df Sum of Sq       RSS   AIC F value Pr(>F)
## <none>                     825317242 10808               
## + S.F.Ratio    1   1485954 823831288 10808  1.3762 0.2411
## + Terminal     1    513811 824803431 10809  0.4753 0.4908
## + Personal     1    227696 825089547 10809  0.2106 0.6465
## + perc.alumni  1     20671 825296571 10810  0.0191 0.8901
## + Books        1     13607 825303635 10810  0.0126 0.9107

summary(lm.step.forward)

## 
## Call:
## lm(formula = Apps ~ Accept + Top10perc + Expend + Outstate + 
##     Enroll + Room.Board + Top25perc + Private + PhD + Grad.Rate + 
##     F.Undergrad + P.Undergrad, data = College)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4931.0  -428.4   -28.0   325.6  7870.6 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -157.28686  265.43114  -0.593 0.553643    
## Accept         1.58691    0.04004  39.633  < 2e-16 ***
## Top10perc     50.41132    5.52514   9.124  < 2e-16 ***
## Expend         0.07247    0.01139   6.363 3.41e-10 ***
## Outstate      -0.09018    0.01831  -4.925 1.03e-06 ***
## Enroll        -0.88265    0.18468  -4.779 2.11e-06 ***
## Room.Board     0.14777    0.04695   3.147 0.001713 ** 
## Top25perc    -14.74735    4.41778  -3.338 0.000884 ***
## PrivateYes  -511.78760  134.28488  -3.811 0.000149 ***
## PhD          -10.70503    3.12107  -3.430 0.000636 ***
## Grad.Rate      8.63961    2.86106   3.020 0.002614 ** 
## F.Undergrad    0.05945    0.03244   1.833 0.067185 .  
## P.Undergrad    0.04593    0.03184   1.443 0.149533    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1039 on 764 degrees of freedom
## Multiple R-squared:  0.929,  Adjusted R-squared:  0.9279 
## F-statistic:   833 on 12 and 764 DF,  p-value: < 2.2e-16

b) Backward elimination method

lm.fit.null <-lm(Apps~1,data=College) 
lm.fit.full <-lm(Apps~.,data=College)
lm.step.backward <- step(lm.fit.full, scope = list(lower=lm.fit.null, upper=lm.fit.full), direction="backward", test = "F")

## Start:  AIC=10815.4
## Apps ~ Private + Accept + Enroll + Top10perc + Top25perc + F.Undergrad + 
##     P.Undergrad + Outstate + Room.Board + Books + Personal + 
##     PhD + Terminal + S.F.Ratio + perc.alumni + Expend + Grad.Rate
## 
##               Df  Sum of Sq        RSS   AIC   F value    Pr(>F)    
## - perc.alumni  1       2057  823062005 10813    0.0019 0.9652727    
## - Books        1       8332  823068280 10813    0.0077 0.9301754    
## - Personal     1     263707  823323655 10814    0.2432 0.6220599    
## - Terminal     1     463415  823523363 10814    0.4273 0.5134916    
## - S.F.Ratio    1    1518247  824578195 10815    1.4001 0.2370808    
## - P.Undergrad  1    2073704  825133652 10815    1.9123 0.1671142    
## <none>                       823059948 10815                        
## - F.Undergrad  1    3337258  826397207 10816    3.0775 0.0797853 .  
## - PhD          1    3796560  826856508 10817    3.5011 0.0617138 .  
## - Grad.Rate    1    9368302  832428250 10822    8.6392 0.0033900 ** 
## - Room.Board   1   10605426  833665374 10823    9.7800 0.0018316 ** 
## - Top25perc    1   10951733  834011681 10824   10.0993 0.0015433 ** 
## - Private      1   13942221  837002170 10826   12.8571 0.0003577 ***
## - Outstate     1   22020341  845080289 10834   20.3065 7.638e-06 ***
## - Enroll       1   24321652  847381600 10836   22.4287 2.603e-06 ***
## - Expend       1   43151679  866211628 10853   39.7931 4.793e-10 ***
## - Top10perc    1   86866642  909926590 10891   80.1057 < 2.2e-16 ***
## - Accept       1 1642984489 2466044437 11666 1515.1086 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step:  AIC=10813.4
## Apps ~ Private + Accept + Enroll + Top10perc + Top25perc + F.Undergrad + 
##     P.Undergrad + Outstate + Room.Board + Books + Personal + 
##     PhD + Terminal + S.F.Ratio + Expend + Grad.Rate
## 
##               Df  Sum of Sq        RSS   AIC   F value    Pr(>F)    
## - Books        1       7989  823069994 10811    0.0074 0.9315780    
## - Personal     1     261699  823323704 10812    0.2416 0.6231601    
## - Terminal     1     461438  823523443 10812    0.4261 0.5141145    
## - S.F.Ratio    1    1517299  824579304 10813    1.4010 0.2369187    
## - P.Undergrad  1    2071755  825133760 10813    1.9130 0.1670344    
## <none>                       823062005 10813                        
## - F.Undergrad  1    3335997  826398002 10814    3.0804 0.0796441 .  
## - PhD          1    3799154  826861159 10815    3.5081 0.0614543 .  
## - Grad.Rate    1    9830950  832892955 10821    9.0777 0.0026733 ** 
## - Room.Board   1   10817132  833879137 10822    9.9883 0.0016378 ** 
## - Top25perc    1   10979163  834041168 10822   10.1380 0.0015117 ** 
## - Private      1   14029065  837091070 10824   12.9542 0.0003399 ***
## - Outstate     1   22841086  845903092 10833   21.0910 5.125e-06 ***
## - Enroll       1   24505771  847567776 10834   22.6282 2.353e-06 ***
## - Expend       1   43192465  866254470 10851   39.8831 4.585e-10 ***
## - Top10perc    1   86934199  909996204 10889   80.2734 < 2.2e-16 ***
## - Accept       1 1686807594 2509869599 11678 1557.5665 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step:  AIC=10811.41
## Apps ~ Private + Accept + Enroll + Top10perc + Top25perc + F.Undergrad + 
##     P.Undergrad + Outstate + Room.Board + Personal + PhD + Terminal + 
##     S.F.Ratio + Expend + Grad.Rate
## 
##               Df  Sum of Sq        RSS   AIC   F value    Pr(>F)    
## - Personal     1     286484  823356478 10810    0.2649 0.6069372    
## - Terminal     1     453536  823523530 10810    0.4193 0.5174650    
## - S.F.Ratio    1    1523776  824593770 10811    1.4089 0.2356154    
## - P.Undergrad  1    2073781  825143774 10811    1.9174 0.1665504    
## <none>                       823069994 10811                        
## - F.Undergrad  1    3337954  826407948 10813    3.0862 0.0793597 .  
## - PhD          1    3936953  827006947 10813    3.6401 0.0567811 .  
## - Grad.Rate    1    9823693  832893687 10819    9.0829 0.0026658 ** 
## - Top25perc    1   10971468  834041462 10820   10.1441 0.0015067 ** 
## - Room.Board   1   11052994  834122988 10820   10.2195 0.0014471 ** 
## - Private      1   14021861  837091855 10822   12.9644 0.0003381 ***
## - Outstate     1   22934693  846004687 10831   21.2051 4.836e-06 ***
## - Enroll       1   24507814  847577808 10832   22.6596 2.315e-06 ***
## - Expend       1   43299794  866369788 10849   40.0344 4.257e-10 ***
## - Top10perc    1   87145523  910215517 10888   80.5736 < 2.2e-16 ***
## - Accept       1 1687626727 2510696721 11676 1560.3581 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step:  AIC=10809.68
## Apps ~ Private + Accept + Enroll + Top10perc + Top25perc + F.Undergrad + 
##     P.Undergrad + Outstate + Room.Board + PhD + Terminal + S.F.Ratio + 
##     Expend + Grad.Rate
## 
##               Df  Sum of Sq        RSS   AIC   F value    Pr(>F)    
## - Terminal     1     474810  823831288 10808    0.4394  0.507599    
## - S.F.Ratio    1    1446954  824803431 10809    1.3391  0.247551    
## <none>                       823356478 10810                        
## - P.Undergrad  1    2294055  825650532 10810    2.1231  0.145504    
## - F.Undergrad  1    3524082  826880559 10811    3.2615  0.071320 .  
## - PhD          1    3903516  827259993 10811    3.6126  0.057720 .  
## - Grad.Rate    1    9576067  832932544 10817    8.8625  0.003003 ** 
## - Room.Board   1   10948047  834304524 10818   10.1322  0.001516 ** 
## - Top25perc    1   11009477  834365954 10818   10.1891  0.001471 ** 
## - Private      1   14045232  837401710 10821   12.9986  0.000332 ***
## - Outstate     1   23757582  847114060 10830   21.9872 3.252e-06 ***
## - Enroll       1   24529642  847886120 10830   22.7017 2.266e-06 ***
## - Expend       1   43741282  867097760 10848   40.4817 3.423e-10 ***
## - Top10perc    1   87332619  910689096 10886   80.8246 < 2.2e-16 ***
## - Accept       1 1688998295 2512354773 11674 1563.1342 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step:  AIC=10808.13
## Apps ~ Private + Accept + Enroll + Top10perc + Top25perc + F.Undergrad + 
##     P.Undergrad + Outstate + Room.Board + PhD + S.F.Ratio + Expend + 
##     Grad.Rate
## 
##               Df  Sum of Sq        RSS   AIC   F value    Pr(>F)    
## - S.F.Ratio    1    1485954  825317242 10808    1.3762 0.2411101    
## <none>                       823831288 10808                        
## - P.Undergrad  1    2296253  826127541 10808    2.1267 0.1451637    
## - F.Undergrad  1    3402201  827233489 10809    3.1510 0.0762798 .  
## - Grad.Rate    1    9725730  833557018 10815    9.0076 0.0027763 ** 
## - Room.Board   1   10580678  834411966 10816    9.7994 0.0018122 ** 
## - Top25perc    1   11852393  835683681 10817   10.9772 0.0009659 ***
## - PhD          1   13174760  837006048 10818   12.2019 0.0005049 ***
## - Private      1   13675900  837507188 10819   12.6661 0.0003954 ***
## - Enroll       1   24420572  848251860 10829   22.6174 2.364e-06 ***
## - Outstate     1   24881049  848712337 10829   23.0438 1.906e-06 ***
## - Expend       1   43404484  867235772 10846   40.1995 3.923e-10 ***
## - Top10perc    1   89940095  913771383 10887   83.2990 < 2.2e-16 ***
## - Accept       1 1691950930 2515782218 11674 1567.0181 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step:  AIC=10807.53
## Apps ~ Private + Accept + Enroll + Top10perc + Top25perc + F.Undergrad + 
##     P.Undergrad + Outstate + Room.Board + PhD + Expend + Grad.Rate
## 
##               Df  Sum of Sq        RSS   AIC   F value    Pr(>F)    
## <none>                       825317242 10808                        
## - P.Undergrad  1    2248227  827565469 10808    2.0812 0.1495333    
## - F.Undergrad  1    3629713  828946955 10809    3.3600 0.0671853 .  
## - Grad.Rate    1    9850583  835167825 10815    9.1187 0.0026143 ** 
## - Room.Board   1   10699017  836016260 10816    9.9041 0.0017131 ** 
## - Top25perc    1   12037817  837355059 10817   11.1435 0.0008841 ***
## - PhD          1   12708568  838025810 10817   11.7644 0.0006362 ***
## - Private      1   15691081  841008323 10820   14.5253 0.0001494 ***
## - Enroll       1   24676722  849993965 10828   22.8434 2.108e-06 ***
## - Outstate     1   26201946  851519188 10830   24.2553 1.035e-06 ***
## - Expend       1   43734225  869051468 10846   40.4850 3.412e-10 ***
## - Top10perc    1   89928332  915245574 10886   83.2471 < 2.2e-16 ***
## - Accept       1 1696846612 2522163854 11674 1570.7788 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

summary(lm.step.backward)

## 
## Call:
## lm(formula = Apps ~ Private + Accept + Enroll + Top10perc + Top25perc + 
##     F.Undergrad + P.Undergrad + Outstate + Room.Board + PhD + 
##     Expend + Grad.Rate, data = College)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4931.0  -428.4   -28.0   325.6  7870.6 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -157.28686  265.43114  -0.593 0.553643    
## PrivateYes  -511.78760  134.28488  -3.811 0.000149 ***
## Accept         1.58691    0.04004  39.633  < 2e-16 ***
## Enroll        -0.88265    0.18468  -4.779 2.11e-06 ***
## Top10perc     50.41132    5.52514   9.124  < 2e-16 ***
## Top25perc    -14.74735    4.41778  -3.338 0.000884 ***
## F.Undergrad    0.05945    0.03244   1.833 0.067185 .  
## P.Undergrad    0.04593    0.03184   1.443 0.149533    
## Outstate      -0.09018    0.01831  -4.925 1.03e-06 ***
## Room.Board     0.14777    0.04695   3.147 0.001713 ** 
## PhD          -10.70503    3.12107  -3.430 0.000636 ***
## Expend         0.07247    0.01139   6.363 3.41e-10 ***
## Grad.Rate      8.63961    2.86106   3.020 0.002614 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1039 on 764 degrees of freedom
## Multiple R-squared:  0.929,  Adjusted R-squared:  0.9279 
## F-statistic:   833 on 12 and 764 DF,  p-value: < 2.2e-16

both stepwise models produce the same results on the p-value of the predictors and on the r squared. regsubsets() will test all of the first 8 (one-variable, second-variable, third-variable, etc.) models. and these models doesn’t seem to be a great models. The stepwise regression method selects a model that contains only the significatn variables. becase since some of the predictors are correlated, the OLS methods will not be best methods in this case

part b) fitting Ridge regression with lambda choosen by cross-validation

library(glmnet) # Contains functions for Ridge and LASSO 
library(ISLR) # Contains the College data set College=na.omit(College)
x=model.matrix(Apps~., data=College)[,-1] 
y=College$Apps
ridge.mod=glmnet(x,y,alpha=0)
plot(ridge.mod)

ridge.cv=cv.glmnet(x,y,alpha=0) # Remove the  below if you want to see all results 
# ridge.cv 
plot(ridge.cv)

bestlam=ridge.cv$lambda.min 
min.mse=min(ridge.cv$cvm)
options(scipen=4) 
cbind("Best Ridge Lambda"=bestlam, "Best Ridge MSE"=min.mse)

##      Best Ridge Lambda Best Ridge MSE
## [1,]          400.4766        1659189

predict(ridge.mod, type="coefficients", s=bestlam)

## 18 x 1 sparse Matrix of class "dgCMatrix"
##                           1
## (Intercept) -1514.926536392
## PrivateYes   -529.332545725
## Accept          0.978075133
## Enroll          0.466691689
## Top10perc      24.973135731
## Top25perc       1.056472786
## F.Undergrad     0.076628595
## P.Undergrad     0.024459386
## Outstate       -0.021365424
## Room.Board      0.199797958
## Books           0.135279907
## Personal       -0.008966624
## PhD            -3.771159409
## Terminal       -4.713593090
## S.F.Ratio      12.828367668
## perc.alumni    -8.831660770
## Expend          0.075275975
## Grad.Rate      11.366625796

** These are the ridge model cofficients obtained using the best Lambda chosen by cross validation**

part C) fitting Lassso, with lambda chosen by cross-validation

LASSO Illustration

# Just change alpha from 0 to 1 
lasso.mod=glmnet(x,y,alpha=1)
# lasso.mod 
lasso.cv=cv.glmnet(x,y,alpha=1) # Remove the # below if you want to see all results 
# ridge.cv 
plot(lasso.cv)

bestlam=lasso.cv$lambda.min 
min.mse = min(lasso.cv$cvm) 
cbind("Best LASSO Lambda"=bestlam, "Best LASSO MSE"=min.mse)

##      Best LASSO Lambda Best LASSO MSE
## [1,]          12.51776        1314657

predict(lasso.mod, type="coefficients", s=bestlam)

## 18 x 1 sparse Matrix of class "dgCMatrix"
##                         1
## (Intercept) -545.91928048
## PrivateYes  -466.37942894
## Accept         1.50970702
## Enroll        -0.37570084
## Top10perc     40.35507151
## Top25perc     -7.03944778
## F.Undergrad    .         
## P.Undergrad    0.03677940
## Outstate      -0.07046711
## Room.Board     0.13714936
## Books          .         
## Personal       0.01508622
## PhD           -6.97610541
## Terminal      -3.11527371
## S.F.Ratio      9.55099994
## perc.alumni   -0.65186829
## Expend         0.07310876
## Grad.Rate      6.54067105

** These are the ridge model cofficients obtained using the best Lambda chosen by cross validation**

Smallest Deviance with LASSO model. LASSO yields a smaller compared to Ridge’s MSE. LASSO is slightly better in this case.The relative importance of the coefficients is similar between the two models, but the LASSO coefficients are slightly smaller because the optimal Lambda is smaller and, therefore, there was less shrinkage in the resulting coefficients.

part d) Fitting PCR model, with the M, the number of principal components, chosen by cross-Validation

For Principal Components Regression (PCR), we will use the pcr(){pls} function and the College data set. let us load first the {pls} library and fit a PCR model to predict Apps using all the remaining variables as predictors and the full data set. we will use the LOOCV validation this time scale=T, validation=“LOO” and Store the results in an object named pcr.fit.

library(pls)

## 
## Attaching package: 'pls'

## The following object is masked from 'package:stats':
## 
##     loadings

attach(College)

pcr.fit=pcr(Apps~., data = College[,-1],  scale=TRUE, validation="LOO")
summary(pcr.fit)

## Data:    X dimension: 777 16 
##  Y dimension: 777 1
## Fit method: svdpc
## Number of components considered: 16
## 
## VALIDATION: RMSEP
## Cross-validated using 777 leave-one-out segments.
##        (Intercept)  1 comps  2 comps  3 comps  4 comps  5 comps  6 comps
## CV            3873     3667     1988     1988     2024     1591     1587
## adjCV         3873     3667     1988     1988     2026     1590     1587
##        7 comps  8 comps  9 comps  10 comps  11 comps  12 comps  13 comps
## CV        1588     1551     1503      1501      1502      1506      1506
## adjCV     1588     1551     1503      1501      1502      1506      1506
##        14 comps  15 comps  16 comps
## CV         1439      1172      1134
## adjCV      1439      1172      1134
## 
## TRAINING: % variance explained
##       1 comps  2 comps  3 comps  4 comps  5 comps  6 comps  7 comps
## X       32.77    57.08    64.39    70.22    75.96    81.25    85.03
## Apps    10.83    74.28    74.53    74.62    83.94    84.06    84.13
##       8 comps  9 comps  10 comps  11 comps  12 comps  13 comps  14 comps
## X       88.65    91.93     94.44     96.39     97.77     98.67     99.32
## Apps    85.00    85.80     85.93     85.98     85.99     86.02     90.20
##       15 comps  16 comps
## X        99.83     100.0
## Apps     92.36      92.8

Displaying a scree plot for the Root Square of the MSE (val.type=“RMSEP”) and summary results of pcr.fit using the validationplot(){pls} function.

validationplot(pcr.fit, val.type="RMSEP")

summary(pcr.fit)

## Data:    X dimension: 777 16 
##  Y dimension: 777 1
## Fit method: svdpc
## Number of components considered: 16
## 
## VALIDATION: RMSEP
## Cross-validated using 777 leave-one-out segments.
##        (Intercept)  1 comps  2 comps  3 comps  4 comps  5 comps  6 comps
## CV            3873     3667     1988     1988     2024     1591     1587
## adjCV         3873     3667     1988     1988     2026     1590     1587
##        7 comps  8 comps  9 comps  10 comps  11 comps  12 comps  13 comps
## CV        1588     1551     1503      1501      1502      1506      1506
## adjCV     1588     1551     1503      1501      1502      1506      1506
##        14 comps  15 comps  16 comps
## CV         1439      1172      1134
## adjCV      1439      1172      1134
## 
## TRAINING: % variance explained
##       1 comps  2 comps  3 comps  4 comps  5 comps  6 comps  7 comps
## X       32.77    57.08    64.39    70.22    75.96    81.25    85.03
## Apps    10.83    74.28    74.53    74.62    83.94    84.06    84.13
##       8 comps  9 comps  10 comps  11 comps  12 comps  13 comps  14 comps
## X       88.65    91.93     94.44     96.39     97.77     98.67     99.32
## Apps    85.00    85.80     85.93     85.98     85.99     86.02     90.20
##       15 comps  16 comps
## X        99.83     100.0
## Apps     92.36      92.8

Based on the 2 outputs above, the number of components that will be appropriate for this model will be discussed here refering to “elbows” in the scree plot, the cross-validation (CV values of the RMSEP), and the % explained variance of the various components.

The scree plot “elbows” at around 2, 4 5, and 14 components, so these are good candidates. The RMSEP CV value declines sharply as we add more components, with 14 components showing the lowest RMSEP. But 4 components explain 65% of the variance in the predictors. Based on this analysis, I would select 4 components. 14 components are way too many to achieve any meaningful dimension reduction, but it would still be a viable model because all 14 principal components are “orthogonal” (i.e., uncorrelated). 4 components would be OK, but I generally like to have at least 70% of the predictors variance explained.

or : more on PCs

reg = lm(Apps~., data = College[,-1],  scale=TRUE)

## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
##  extra argument 'scale' will be disregarded

summary(reg)

## 
## Call:
## lm(formula = Apps ~ ., data = College[, -1], scale = TRUE)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5060.6  -429.2   -18.0   309.7  7661.8 
## 
## Coefficients:
##                Estimate  Std. Error t value Pr(>|t|)    
## (Intercept) -894.795010  391.609762  -2.285  0.02259 *  
## Accept         1.591269    0.041029  38.784  < 2e-16 ***
## Enroll        -0.890291    0.187386  -4.751 2.42e-06 ***
## Top10perc     49.677562    5.621155   8.838  < 2e-16 ***
## Top25perc    -14.375035    4.513776  -3.185  0.00151 ** 
## F.Undergrad    0.073030    0.032674   2.235  0.02570 *  
## P.Undergrad    0.049895    0.032355   1.542  0.12346    
## Outstate      -0.109284    0.018041  -6.058 2.17e-09 ***
## Room.Board     0.135106    0.048462   2.788  0.00544 ** 
## Books         -0.009177    0.240111  -0.038  0.96952    
## Personal       0.031471    0.063566   0.495  0.62068    
## PhD           -6.789119    4.643923  -1.462  0.14417    
## Terminal      -1.370997    5.104909  -0.269  0.78834    
## S.F.Ratio     23.047061   12.929393   1.783  0.07506 .  
## perc.alumni   -1.169644    4.116777  -0.284  0.77640    
## Expend         0.081959    0.012392   6.614 7.06e-11 ***
## Grad.Rate      8.040662    2.966617   2.710  0.00687 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1049 on 760 degrees of freedom
## Multiple R-squared:  0.928,  Adjusted R-squared:  0.9265 
## F-statistic: 612.1 on 16 and 760 DF,  p-value: < 2.2e-16

x = model.matrix(reg)
head(x)

##                              (Intercept) Accept Enroll Top10perc Top25perc
## Abilene Christian University           1   1232    721        23        52
## Adelphi University                     1   1924    512        16        29
## Adrian College                         1   1097    336        22        50
## Agnes Scott College                    1    349    137        60        89
## Alaska Pacific University              1    146     55        16        44
## Albertson College                      1    479    158        38        62
##                              F.Undergrad P.Undergrad Outstate Room.Board
## Abilene Christian University        2885         537     7440       3300
## Adelphi University                  2683        1227    12280       6450
## Adrian College                      1036          99    11250       3750
## Agnes Scott College                  510          63    12960       5450
## Alaska Pacific University            249         869     7560       4120
## Albertson College                    678          41    13500       3335
##                              Books Personal PhD Terminal S.F.Ratio
## Abilene Christian University   450     2200  70       78      18.1
## Adelphi University             750     1500  29       30      12.2
## Adrian College                 400     1165  53       66      12.9
## Agnes Scott College            450      875  92       97       7.7
## Alaska Pacific University      800     1500  76       72      11.9
## Albertson College              500      675  67       73       9.4
##                              perc.alumni Expend Grad.Rate
## Abilene Christian University          12   7041        60
## Adelphi University                    16  10527        56
## Adrian College                        30   8735        54
## Agnes Scott College                   37  19016        59
## Alaska Pacific University              2  10922        15
## Albertson College                     11   9727        55

princomp(x)

## Call:
## princomp(x = x)
## 
## Standard deviations:
##      Comp.1      Comp.2      Comp.3      Comp.4      Comp.5      Comp.6 
## 6147.789307 5502.758528 2464.281335 1257.955785  989.756759  782.815423 
##      Comp.7      Comp.8      Comp.9     Comp.10     Comp.11     Comp.12 
##  606.198957  200.326557  159.062212   21.258838   14.831438   12.548620 
##     Comp.13     Comp.14     Comp.15     Comp.16     Comp.17 
##    8.998635    6.033261    5.322451    2.909312    0.000000 
## 
##  17  variables and  777 observations.

pc = princomp(x)
summary(pc)

## Importance of components:
##                              Comp.1       Comp.2       Comp.3
## Standard deviation     6147.7893073 5502.7585281 2464.2813348
## Proportion of Variance    0.4860689    0.3894223    0.0780981
## Cumulative Proportion     0.4860689    0.8754912    0.9535893
##                               Comp.4       Comp.5        Comp.6
## Standard deviation     1257.95578516 989.75675903 782.815422961
## Proportion of Variance    0.02035123   0.01259844   0.007880951
## Cumulative Proportion     0.97394051   0.98653895   0.994419903
##                               Comp.7         Comp.8         Comp.9
## Standard deviation     606.198957144 200.3265568670 159.0622117122
## Proportion of Variance   0.004725962   0.0005161037   0.0003253823
## Cumulative Proportion    0.999145865   0.9996619688   0.9999873510
##                                Comp.10         Comp.11         Comp.12
## Standard deviation     21.258838393006 14.831437967099 12.548620325534
## Proportion of Variance  0.000005812179  0.000002828957  0.000002025125
## Cumulative Proportion   0.999993163227  0.999995992185  0.999998017310
##                               Comp.13         Comp.14         Comp.15
## Standard deviation     8.998635090312 6.0332609293813 5.3224509652541
## Proportion of Variance 0.000001041389 0.0000004681274 0.0000003643202
## Cumulative Proportion  0.999999058699 0.9999995268268 0.9999998911470
##                               Comp.16 Comp.17
## Standard deviation     2.909312274850       0
## Proportion of Variance 0.000000108853       0
## Cumulative Proportion  1.000000000000       1

screeplot(pc)

thus, the first 3 Principal components could explain 95.42 % all the variation in data.

library(pls) 

pcreg=pcr(Apps~., data = College[,-1],  scale=TRUE, validation="CV")
summary(pcreg)

## Data:    X dimension: 777 16 
##  Y dimension: 777 1
## Fit method: svdpc
## Number of components considered: 16
## 
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
##        (Intercept)  1 comps  2 comps  3 comps  4 comps  5 comps  6 comps
## CV            3873     3664     1988     1998     1926     1594     1587
## adjCV         3873     3664     1985     1999     2004     1581     1585
##        7 comps  8 comps  9 comps  10 comps  11 comps  12 comps  13 comps
## CV        1582     1545     1496      1494      1497      1503      1503
## adjCV     1585     1540     1493      1492      1494      1500      1500
##        14 comps  15 comps  16 comps
## CV         1421      1157      1123
## adjCV      1405      1151      1119
## 
## TRAINING: % variance explained
##       1 comps  2 comps  3 comps  4 comps  5 comps  6 comps  7 comps
## X       32.77    57.08    64.39    70.22    75.96    81.25    85.03
## Apps    10.83    74.28    74.53    74.62    83.94    84.06    84.13
##       8 comps  9 comps  10 comps  11 comps  12 comps  13 comps  14 comps
## X       88.65    91.93     94.44     96.39     97.77     98.67     99.32
## Apps    85.00    85.80     85.93     85.98     85.99     86.02     90.20
##       15 comps  16 comps
## X        99.83     100.0
## Apps     92.36      92.8

1 comps expain unsupervised 32.77% of total variance explain in X, with 10.83 % supervised variation in y (Apps): var(y) and etc..

#pcreg$loadings # The linear weight of each variable on each 
#head(pcreg$loadings)

#pcreg$scores # # Resulting from applying loadings to each data 
#head(pcreg$scores)

Predictions and testing some of the Principal component models.

set.seed(1) # You can use any seed number 

test=sample(1:nrow(College), 0.10*nrow(College))

pcr.pred.4=predict(pcreg, College[test,], ncomp=4) 

mse.4=mean((College$Apps[test]-pcr.pred.4)^2) # Compute the MSE

rmse.4=sqrt(mse.4) 
pcr.pred.5=predict(pcreg, College[test,], ncomp=5) 

mse.5=mean((College$Apps[test]-pcr.pred.5)^2) # Compute the MSE 
rmse.5=sqrt(mse.5) 

pcr.pred.10=predict(pcreg, College[test,], ncomp=10) 

mse.10=mean((College$Apps[test]-pcr.pred.10)^2) # Compute the MSE 

rmse.10=sqrt(mse.10)

mse.test=data.frame(c(mse.4, mse.5, mse.10), c(rmse.4, rmse.5, rmse.10)) 
colnames(mse.test)=c("Test MSE", "Test RMSE") 
rownames(mse.test)=c("4 Comp", "5 Comp", "10 Comp") 
mse.test

##         Test MSE Test RMSE
## 4 Comp  15152636  3892.639
## 5 Comp   9730377  3119.355
## 10 Comp  8820289  2969.897

part e) fitting PLS model, with M chosen by cross validation

library(pls) 

plsreg=plsr(Apps~., data = College[,-1],  scale=TRUE, validation="CV")
summary(plsreg)

## Data:    X dimension: 777 16 
##  Y dimension: 777 1
## Fit method: kernelpls
## Number of components considered: 16
## 
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
##        (Intercept)  1 comps  2 comps  3 comps  4 comps  5 comps  6 comps
## CV            3873     1845     1660     1441     1355     1243     1179
## adjCV         3873     1843     1657     1437     1342     1236     1172
##        7 comps  8 comps  9 comps  10 comps  11 comps  12 comps  13 comps
## CV        1174     1167     1162      1161      1160      1160      1159
## adjCV     1166     1159     1155      1154      1153      1153      1153
##        14 comps  15 comps  16 comps
## CV         1159      1159      1159
## adjCV      1152      1152      1152
## 
## TRAINING: % variance explained
##       1 comps  2 comps  3 comps  4 comps  5 comps  6 comps  7 comps
## X       26.16    52.39    62.56    65.36    69.34    74.09    76.80
## Apps    77.97    82.66    87.65    90.56    92.12    92.57    92.64
##       8 comps  9 comps  10 comps  11 comps  12 comps  13 comps  14 comps
## X       79.73    82.60     85.35     87.98     91.73      94.0      96.4
## Apps    92.70    92.75     92.77     92.79     92.79      92.8      92.8
##       15 comps  16 comps
## X        98.76     100.0
## Apps     92.80      92.8

validationplot(plsreg, val.type="RMSEP")

summary(plsreg)

## Data:    X dimension: 777 16 
##  Y dimension: 777 1
## Fit method: kernelpls
## Number of components considered: 16
## 
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
##        (Intercept)  1 comps  2 comps  3 comps  4 comps  5 comps  6 comps
## CV            3873     1845     1660     1441     1355     1243     1179
## adjCV         3873     1843     1657     1437     1342     1236     1172
##        7 comps  8 comps  9 comps  10 comps  11 comps  12 comps  13 comps
## CV        1174     1167     1162      1161      1160      1160      1159
## adjCV     1166     1159     1155      1154      1153      1153      1153
##        14 comps  15 comps  16 comps
## CV         1159      1159      1159
## adjCV      1152      1152      1152
## 
## TRAINING: % variance explained
##       1 comps  2 comps  3 comps  4 comps  5 comps  6 comps  7 comps
## X       26.16    52.39    62.56    65.36    69.34    74.09    76.80
## Apps    77.97    82.66    87.65    90.56    92.12    92.57    92.64
##       8 comps  9 comps  10 comps  11 comps  12 comps  13 comps  14 comps
## X       79.73    82.60     85.35     87.98     91.73      94.0      96.4
## Apps    92.70    92.75     92.77     92.79     92.79      92.8      92.8
##       15 comps  16 comps
## X        98.76     100.0
## Apps     92.80      92.8

#plsreg$loadings

#head(plsreg$loadings)

Predictions and testing PLS model.

pls.pred.4=predict(plsreg, College[test,], ncomp=4) 
mse.4=mean((College$Apps[test]-pls.pred.4)^2) # Compute the MSE rmse.4=sqrt(mse.4)
pls.pred.5=predict(plsreg, College[test,], ncomp=5) 
mse.5=mean((College$Apps[test]-pls.pred.5)^2) # Compute the MSE rmse.5=sqrt(mse.5)
pls.pred.10=predict(plsreg, College[test,], ncomp=10) 
mse.10=mean((College$Apps[test]-pls.pred.10)^2) # Compute the MSE rmse.10=sqrt(mse.10)
mse.test=data.frame(c(mse.4, mse.5, mse.10), c(rmse.4, rmse.5, rmse.10)) 
colnames(mse.test)=c("Test MSE", "Test RMSE") 
rownames(mse.test)=c("4 Comp", "5 Comp", "10 Comp") 
mse.test

##         Test MSE Test RMSE
## 4 Comp   4358302  3892.639
## 5 Comp   2979685  3119.355
## 10 Comp  2501719  2969.897

** Without question, the 10 component model yields the lowest MSE and RMSE, so it is the best. But if our goal is dimension reduction, we could choose a model with fewer components.**

summary of prediction for all modell findings - using Training and testing Data partitioning

Part a) partitioning of data

require(ISLR)
data(College)
set.seed(1)
trainid <- sample(1:nrow(College), nrow(College)/2)
train <- College[trainid,]
test <- College[-trainid,]
str(College)

## 'data.frame':    777 obs. of  18 variables:
##  $ Private    : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 2 ...
##  $ Apps       : num  1660 2186 1428 417 193 ...
##  $ Accept     : num  1232 1924 1097 349 146 ...
##  $ Enroll     : num  721 512 336 137 55 158 103 489 227 172 ...
##  $ Top10perc  : num  23 16 22 60 16 38 17 37 30 21 ...
##  $ Top25perc  : num  52 29 50 89 44 62 45 68 63 44 ...
##  $ F.Undergrad: num  2885 2683 1036 510 249 ...
##  $ P.Undergrad: num  537 1227 99 63 869 ...
##  $ Outstate   : num  7440 12280 11250 12960 7560 ...
##  $ Room.Board : num  3300 6450 3750 5450 4120 ...
##  $ Books      : num  450 750 400 450 800 500 500 450 300 660 ...
##  $ Personal   : num  2200 1500 1165 875 1500 ...
##  $ PhD        : num  70 29 53 92 76 67 90 89 79 40 ...
##  $ Terminal   : num  78 30 66 97 72 73 93 100 84 41 ...
##  $ S.F.Ratio  : num  18.1 12.2 12.9 7.7 11.9 9.4 11.5 13.7 11.3 11.5 ...
##  $ perc.alumni: num  12 16 30 37 2 11 26 37 23 15 ...
##  $ Expend     : num  7041 10527 8735 19016 10922 ...
##  $ Grad.Rate  : num  60 56 54 59 15 55 63 73 80 52 ...

Part b) lm method test error prediction

fit.lm <- lm(Apps~., data=train)
pred.lm <- predict(fit.lm, test)
(err.lm <- mean((test$Apps - pred.lm)^2))  # test error

## [1] 1108531

Part c) Ridge method test error prediction

require(glmnet)
xmat.train <- model.matrix(Apps~., data=train)[,-1]
xmat.test <- model.matrix(Apps~., data=test)[,-1]
fit.ridge <- cv.glmnet(xmat.train, train$Apps, alpha=0)
(lambda <- fit.ridge$lambda.min)  # optimal lambda

## [1] 450.7435

pred.ridge <- predict(fit.ridge, s=lambda, newx=xmat.test)
(err.ridge <- mean((test$Apps - pred.ridge)^2))  # test error

## [1] 1037616

Part d) Lasso method test error prediction

require(glmnet)
xmat.train <- model.matrix(Apps~., data=train)[,-1]
xmat.test <- model.matrix(Apps~., data=test)[,-1]
fit.lasso <- cv.glmnet(xmat.train, train$Apps, alpha=1)
(lambda <- fit.lasso$lambda.min)  # optimal lambda

## [1] 29.65591

pred.lasso <- predict(fit.lasso, s=lambda, newx=xmat.test)
(err.lasso <- mean((test$Apps - pred.lasso)^2))  # test error

## [1] 1025248

coef.lasso <- predict(fit.lasso, type="coefficients", s=lambda)[1:ncol(College),]
coef.lasso[coef.lasso != 0]

##    (Intercept)     PrivateYes         Accept         Enroll      Top10perc 
## -451.422290444 -481.435163973    1.535012486   -0.403545479   46.681699352 
##      Top25perc    F.Undergrad       Outstate     Room.Board            PhD 
##   -7.091364043   -0.003426988   -0.050200640    0.185160995   -3.849884689 
##       Terminal    perc.alumni         Expend      Grad.Rate 
##   -3.443746687   -2.117869862    0.031928455    2.695729825

length(coef.lasso[coef.lasso != 0])

## [1] 14

Part e) Principal component Test error prediction

require(pls)
set.seed(1)
fit.pcr <- pcr(Apps~., data=train, scale=TRUE, validation="CV")
validationplot(fit.pcr, val.type="MSEP")

summary(fit.pcr)

## Data:    X dimension: 388 17 
##  Y dimension: 388 1
## Fit method: svdpc
## Number of components considered: 17
## 
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
##        (Intercept)  1 comps  2 comps  3 comps  4 comps  5 comps  6 comps
## CV            4335     4179     2364     2374     1996     1844     1845
## adjCV         4335     4182     2360     2374     1788     1831     1838
##        7 comps  8 comps  9 comps  10 comps  11 comps  12 comps  13 comps
## CV        1850     1863     1809      1809      1812      1815      1825
## adjCV     1844     1857     1801      1800      1804      1808      1817
##        14 comps  15 comps  16 comps  17 comps
## CV         1810      1823      1273      1281
## adjCV      1806      1789      1260      1268
## 
## TRAINING: % variance explained
##       1 comps  2 comps  3 comps  4 comps  5 comps  6 comps  7 comps
## X      31.216    57.68    64.73    70.55    76.33    81.30    85.01
## Apps    6.976    71.47    71.58    83.32    83.44    83.45    83.46
##       8 comps  9 comps  10 comps  11 comps  12 comps  13 comps  14 comps
## X       88.40    91.16     93.36     95.38     96.94     97.96     98.76
## Apps    83.47    84.53     84.86     84.98     84.98     84.99     85.24
##       15 comps  16 comps  17 comps
## X        99.40     99.87    100.00
## Apps     90.87     93.93     93.97

pred.pcr <- predict(fit.pcr, test, ncomp=16)  # min Cv at M=16
(err.pcr <- mean((test$Apps - pred.pcr)^2))  # test error

## [1] 1166897

Part f) PLS test error prediction

require(pls)
set.seed(1)
fit.pls <- plsr(Apps~., data=train, scale=TRUE, validation="CV")
validationplot(fit.pls, val.type="MSEP")

summary(fit.pls)

## Data:    X dimension: 388 17 
##  Y dimension: 388 1
## Fit method: kernelpls
## Number of components considered: 17
## 
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
##        (Intercept)  1 comps  2 comps  3 comps  4 comps  5 comps  6 comps
## CV            4335     2176     1893     1725     1613     1406     1312
## adjCV         4335     2171     1884     1715     1578     1375     1295
##        7 comps  8 comps  9 comps  10 comps  11 comps  12 comps  13 comps
## CV        1297     1285     1280      1278      1279      1282      1281
## adjCV     1281     1271     1267      1265      1266      1269      1268
##        14 comps  15 comps  16 comps  17 comps
## CV         1281      1281      1281      1281
## adjCV      1267      1267      1268      1268
## 
## TRAINING: % variance explained
##       1 comps  2 comps  3 comps  4 comps  5 comps  6 comps  7 comps
## X       26.91    43.08    63.26    65.16    68.50    73.75    76.10
## Apps    76.64    83.93    87.14    91.90    93.49    93.85    93.91
##       8 comps  9 comps  10 comps  11 comps  12 comps  13 comps  14 comps
## X       79.03    81.76     85.41     89.03     91.38     93.31     95.43
## Apps    93.94    93.96     93.96     93.96     93.97     93.97     93.97
##       15 comps  16 comps  17 comps
## X        97.41     98.78    100.00
## Apps     93.97     93.97     93.97

pred.pls <- predict(fit.pls, test, ncomp=10)  # min Cv at M=10
(err.pls <- mean((test$Apps - pred.pls)^2))  # test error

## [1] 1134531

err.pls

## [1] 1134531

Part g) summary of all test error predictions using graph

err.all <- c(err.lm, err.ridge, err.lasso, err.pcr, err.pls)
names(err.all) <- c("lm", "ridge", "lasso", "pcr", "pls")
barplot(err.all )

The test errors aren’t much different. The ridge and lasso seem to perform slightly better while the PCR/PLS don’t show any improvement from the full linear regression model.

Lasso appears to to be the most accurate model to predict the number of college applications, followed by Ridge regression. This is the final conclusion that I can take here.

Predicting the number of college applications

Alema Fissuh

March 29, 2019