Question 8

Generate test data:

library(ISLR)    # College data set (Question 9)
library(leaps)   # regsubsets()
library(glmnet)  # glmnet(), cv.glmnet()
set.seed(1)
x <- rnorm(100)
eps <- rnorm(100)
beta0 <- 2.731
beta1 <- -3.296
beta2 <- 5.117
beta3 <- -1.793
y <- beta0 + beta1 * x + beta2 * x^2 + beta3 * x^3 + eps
plot(x, y)

inp <- data.frame(x, y)

(c) Fit with best subsets selection

bss <- regsubsets(y ~ x + I(x^2) + I(x^3) + I(x^4) + I(x^5) + I(x^6) + I(x^7) + I(x^8) + I(x^9) + I(x^10), data = inp, nvmax = 10)
bss.summary <- summary(bss)
bss.summary
Subset selection object
Call: regsubsets.formula(y ~ x + I(x^2) + I(x^3) + I(x^4) + I(x^5) + 
    I(x^6) + I(x^7) + I(x^8) + I(x^9) + I(x^10), data = inp, 
    nvmax = 10)
10 Variables  (and intercept)
        Forced in Forced out
x           FALSE      FALSE
I(x^2)      FALSE      FALSE
I(x^3)      FALSE      FALSE
I(x^4)      FALSE      FALSE
I(x^5)      FALSE      FALSE
I(x^6)      FALSE      FALSE
I(x^7)      FALSE      FALSE
I(x^8)      FALSE      FALSE
I(x^9)      FALSE      FALSE
I(x^10)     FALSE      FALSE
1 subsets of each size up to 10
Selection Algorithm: exhaustive
          x   I(x^2) I(x^3) I(x^4) I(x^5) I(x^6) I(x^7) I(x^8) I(x^9) I(x^10)
1  ( 1 )  "*" " "    " "    " "    " "    " "    " "    " "    " "    " "    
2  ( 1 )  " " "*"    "*"    " "    " "    " "    " "    " "    " "    " "    
3  ( 1 )  "*" "*"    "*"    " "    " "    " "    " "    " "    " "    " "    
4  ( 1 )  "*" "*"    "*"    " "    "*"    " "    " "    " "    " "    " "    
5  ( 1 )  "*" "*"    "*"    " "    "*"    "*"    " "    " "    " "    " "    
6  ( 1 )  "*" "*"    "*"    " "    " "    " "    "*"    "*"    "*"    " "    
7  ( 1 )  "*" "*"    "*"    " "    "*"    "*"    " "    "*"    " "    "*"    
8  ( 1 )  "*" "*"    "*"    "*"    " "    "*"    " "    "*"    "*"    "*"    
9  ( 1 )  "*" "*"    "*"    "*"    "*"    "*"    " "    "*"    "*"    "*"    
10  ( 1 ) "*" "*"    "*"    "*"    "*"    "*"    "*"    "*"    "*"    "*"    
par(mfrow=c(1,3))
plot(bss.summary$adjr2, xlab = 'Number of variables', ylab = 'Adjusted R2', type = 'l')
rsq.max.idx <- which.max(bss.summary$adjr2)
points(rsq.max.idx, bss.summary$adjr2[rsq.max.idx], col = 'red', cex = 2, pch = 20)
cp.min.idx <- which.min(bss.summary$cp)
plot(bss.summary$cp, xlab = 'Number of variables', ylab = 'Cp', type = 'l')
points(cp.min.idx, bss.summary$cp[cp.min.idx], col = 'red', cex = 2, pch = 20)
bic.min.idx <- which.min(bss.summary$bic)
plot(bss.summary$bic, xlab = 'Number of variables', ylab = 'BIC', type = 'l')
points(bic.min.idx, bss.summary$bic[bic.min.idx], col = 'red', cex = 2, pch = 20)
par(mfrow=c(1,1))

plot(bss, scale = 'adjr2')

plot(bss, scale = 'Cp')

plot(bss, scale = 'bic')

According to adjusted \(R^2\) and \(C_p\), the best model contains 4 predictors: \(X\), \(X^2\), \(X^3\) and \(X^5\). According to BIC, the best model contains 3 predictors: \(X\), \(X^2\) and \(X^3\).

Get the coefficient estimates:

coef(bss, 3)
(Intercept)           x      I(x^2)      I(x^3) 
   2.792507   -3.320720    4.993209   -1.775361 

The coefficient estimates vs the original coefficients: \[ \beta_0 = 2.731 \quad \beta_1 = -3.296 \quad \beta_2 = 5.117 \quad \beta_3 = -1.793\\ \hat\beta_0 = 2.793 \quad \hat\beta_1 = -3.321 \quad \hat\beta_2 = 4.993\quad \hat\beta_3 = -1.775 \]

(d)

Fit with forward stepwise selection method:

fss <- regsubsets(y ~ x + I(x^2) + I(x^3) + I(x^4) + I(x^5) + I(x^6) + I(x^7) + I(x^8) + I(x^9) + I(x^10), data = inp, method = 'forward', nvmax = 10)
fss.summary <- summary(fss)
fss.summary
Subset selection object
Call: regsubsets.formula(y ~ x + I(x^2) + I(x^3) + I(x^4) + I(x^5) + 
    I(x^6) + I(x^7) + I(x^8) + I(x^9) + I(x^10), data = inp, 
    method = "forward", nvmax = 10)
10 Variables  (and intercept)
        Forced in Forced out
x           FALSE      FALSE
I(x^2)      FALSE      FALSE
I(x^3)      FALSE      FALSE
I(x^4)      FALSE      FALSE
I(x^5)      FALSE      FALSE
I(x^6)      FALSE      FALSE
I(x^7)      FALSE      FALSE
I(x^8)      FALSE      FALSE
I(x^9)      FALSE      FALSE
I(x^10)     FALSE      FALSE
1 subsets of each size up to 10
Selection Algorithm: forward
          x   I(x^2) I(x^3) I(x^4) I(x^5) I(x^6) I(x^7) I(x^8) I(x^9) I(x^10)
1  ( 1 )  "*" " "    " "    " "    " "    " "    " "    " "    " "    " "    
2  ( 1 )  "*" "*"    " "    " "    " "    " "    " "    " "    " "    " "    
3  ( 1 )  "*" "*"    "*"    " "    " "    " "    " "    " "    " "    " "    
4  ( 1 )  "*" "*"    "*"    " "    "*"    " "    " "    " "    " "    " "    
5  ( 1 )  "*" "*"    "*"    " "    "*"    "*"    " "    " "    " "    " "    
6  ( 1 )  "*" "*"    "*"    " "    "*"    "*"    " "    " "    "*"    " "    
7  ( 1 )  "*" "*"    "*"    " "    "*"    "*"    "*"    " "    "*"    " "    
8  ( 1 )  "*" "*"    "*"    " "    "*"    "*"    "*"    "*"    "*"    " "    
9  ( 1 )  "*" "*"    "*"    " "    "*"    "*"    "*"    "*"    "*"    "*"    
10  ( 1 ) "*" "*"    "*"    "*"    "*"    "*"    "*"    "*"    "*"    "*"    
par(mfrow=c(1,3))
plot(fss.summary$adjr2, xlab = 'Number of variables', ylab = 'Adjusted R2', type = 'l')
rsq.max.idx <- which.max(fss.summary$adjr2)
points(rsq.max.idx, fss.summary$adjr2[rsq.max.idx], col = 'red', cex = 2, pch = 20)
cp.min.idx <- which.min(fss.summary$cp)
plot(fss.summary$cp, xlab = 'Number of variables', ylab = 'Cp', type = 'l')
points(cp.min.idx, fss.summary$cp[cp.min.idx], col = 'red', cex = 2, pch = 20)
bic.min.idx <- which.min(fss.summary$bic)
plot(fss.summary$bic, xlab = 'Number of variables', ylab = 'BIC', type = 'l')
points(bic.min.idx, fss.summary$bic[bic.min.idx], col = 'red', cex = 2, pch = 20)

According to adjusted \(R^2\), the best model contains 5 predictors: \(X\), \(X^2\), \(X^3\), \(X^5\) and \(X^6\). According to \(C_p\), the best model contains 4 predictors: \(X\), \(X^2\), \(X^3\) and \(X^5\). According to BIC, the best model contains 3 predictors: \(X\), \(X^2\) and \(X^3\).

Fit with backward stepwise selection method:

bks <- regsubsets(y ~ x + I(x^2) + I(x^3) + I(x^4) + I(x^5) + I(x^6) + I(x^7) + I(x^8) + I(x^9) + I(x^10), data = inp, method = 'backward', nvmax = 10)
bks.summary <- summary(bks)
bks.summary
Subset selection object
Call: regsubsets.formula(y ~ x + I(x^2) + I(x^3) + I(x^4) + I(x^5) + 
    I(x^6) + I(x^7) + I(x^8) + I(x^9) + I(x^10), data = inp, 
    method = "backward", nvmax = 10)
10 Variables  (and intercept)
        Forced in Forced out
x           FALSE      FALSE
I(x^2)      FALSE      FALSE
I(x^3)      FALSE      FALSE
I(x^4)      FALSE      FALSE
I(x^5)      FALSE      FALSE
I(x^6)      FALSE      FALSE
I(x^7)      FALSE      FALSE
I(x^8)      FALSE      FALSE
I(x^9)      FALSE      FALSE
I(x^10)     FALSE      FALSE
1 subsets of each size up to 10
Selection Algorithm: backward
          x   I(x^2) I(x^3) I(x^4) I(x^5) I(x^6) I(x^7) I(x^8) I(x^9) I(x^10)
1  ( 1 )  " " " "    "*"    " "    " "    " "    " "    " "    " "    " "    
2  ( 1 )  " " "*"    "*"    " "    " "    " "    " "    " "    " "    " "    
3  ( 1 )  "*" "*"    "*"    " "    " "    " "    " "    " "    " "    " "    
4  ( 1 )  "*" "*"    "*"    " "    " "    " "    " "    " "    "*"    " "    
5  ( 1 )  "*" "*"    "*"    " "    " "    " "    " "    "*"    "*"    " "    
6  ( 1 )  "*" "*"    "*"    " "    " "    " "    " "    "*"    "*"    "*"    
7  ( 1 )  "*" "*"    "*"    " "    " "    "*"    " "    "*"    "*"    "*"    
8  ( 1 )  "*" "*"    "*"    "*"    " "    "*"    " "    "*"    "*"    "*"    
9  ( 1 )  "*" "*"    "*"    "*"    "*"    "*"    " "    "*"    "*"    "*"    
10  ( 1 ) "*" "*"    "*"    "*"    "*"    "*"    "*"    "*"    "*"    "*"    
which.max(bks.summary$adjr2)
[1] 4
which.min(bks.summary$cp)
[1] 4
bks.summary$bic
 [1]  -57.68976 -279.28567 -419.95753 -418.81452 -414.48436 -410.22467 -406.16631
 [8] -402.40497 -397.81758 -393.38480
which.min(bks.summary$bic)
[1] 3
par(mfrow=c(1,3))
plot(bks.summary$adjr2, xlab = 'Number of variables', ylab = 'Adjusted R2', type = 'l')
rsq.max.idx <- which.max(bks.summary$adjr2)
points(rsq.max.idx, bks.summary$adjr2[rsq.max.idx], col = 'red', cex = 2, pch = 20)
cp.min.idx <- which.min(bks.summary$cp)
plot(bks.summary$cp, xlab = 'Number of variables', ylab = 'Cp', type = 'l')
points(cp.min.idx, bks.summary$cp[cp.min.idx], col = 'red', cex = 2, pch = 20)
bic.min.idx <- which.min(bks.summary$bic)
plot(bks.summary$bic, xlab = 'Number of variables', ylab = 'BIC', type = 'l')
points(bic.min.idx, bks.summary$bic[bic.min.idx], col = 'red', cex = 2, pch = 20)

The backward stepwise results largely agree with the forward stepwise results, and both are essentially the same as the best subsets results: all three methods choose the same 3-variable model (\(X\), \(X^2\), \(X^3\)) under BIC.

Get the coefficient estimates (which are the same as the best subsets results in part (c)):

coef(fss, 3)
(Intercept)           x      I(x^2)      I(x^3) 
   2.792507   -3.320720    4.993209   -1.775361 
coef(bks, 3)
(Intercept)           x      I(x^2)      I(x^3) 
   2.792507   -3.320720    4.993209   -1.775361 

Use poly(x, 10) instead of I()

fss.poly <- regsubsets(y ~ poly(x, 10), data = inp, method = 'forward', nvmax = 10)
coef(fss.poly, 3)
 (Intercept) poly(x, 10)1 poly(x, 10)2 poly(x, 10)3 
    6.104218   -61.183673    50.894057   -26.582822 

The coefficient estimates look completely different because poly(x, 10) uses orthogonal polynomials by default, so its coefficients are not on the same scale as the raw polynomial coefficients; to get comparable estimates, use poly(x, 10, raw = TRUE) or the I() terms above.

(e)

Fit with lasso model:

X <- data.frame(x, x^2, x^3, x^4, x^5, x^6, x^7, x^8, x^9, x^10)
lasso.cv <- cv.glmnet(as.matrix(X), y, alpha = 1)
plot(lasso.cv)

lbl <- lasso.cv$lambda.min
lasso.res <- glmnet(as.matrix(X), y, alpha = 1, lambda = lbl)
lasso.res$beta
10 x 1 sparse Matrix of class "dgCMatrix"
                s0
x    -3.250153e+00
x.2   4.719326e+00
x.3  -1.785540e+00
x.4   4.548949e-02
x.5   .           
x.6   6.696648e-04
x.7   .           
x.8   2.418415e-04
x.9   .           
x.10  1.191105e-05
lasso.res$a0
      s0 
2.906318 

The coefficient estimates vs the original coefficients: \[ \beta_0 = 2.731 \quad \beta_1 = -3.296 \quad \beta_2 = 5.117 \quad \beta_3 = -1.793\\ \hat\beta_0 = 2.906 \quad \hat\beta_1 = -3.250 \quad \hat\beta_2 = 4.719 \quad \hat\beta_3 = -1.786 \] The lasso also keeps small nonzero coefficients on \(X^4\), \(X^6\), \(X^8\) and \(X^{10}\), while \(X^5\), \(X^7\) and \(X^9\) are set exactly to zero.
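As a further check (a sketch, not run here), cv.glmnet also records a larger penalty, lambda.1se, whose cross-validated error is within one standard error of the minimum; refitting at that value typically gives a sparser model, and its coefficients can be pulled straight from the cross-validation object:

coef(lasso.cv, s = 'lambda.1se')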

(f)

Fit \(Y = \beta_0 + \beta_7 X^7 + \epsilon\) with best subsets method:

beta7 <- 7.697
yf <- beta0 + beta7 * x^7 + eps
bsf <- regsubsets(yf ~ x + I(x^2) + I(x^3) + I(x^4) + I(x^5) + I(x^6) + I(x^7) + I(x^8) + I(x^9) + I(x^10), data = inp, nvmax = 10)
bsf.summary <- summary(bsf)
bsf.summary
Subset selection object
Call: regsubsets.formula(yf ~ x + I(x^2) + I(x^3) + I(x^4) + I(x^5) + 
    I(x^6) + I(x^7) + I(x^8) + I(x^9) + I(x^10), data = inp, 
    nvmax = 10)
10 Variables  (and intercept)
        Forced in Forced out
x           FALSE      FALSE
I(x^2)      FALSE      FALSE
I(x^3)      FALSE      FALSE
I(x^4)      FALSE      FALSE
I(x^5)      FALSE      FALSE
I(x^6)      FALSE      FALSE
I(x^7)      FALSE      FALSE
I(x^8)      FALSE      FALSE
I(x^9)      FALSE      FALSE
I(x^10)     FALSE      FALSE
1 subsets of each size up to 10
Selection Algorithm: exhaustive
          x   I(x^2) I(x^3) I(x^4) I(x^5) I(x^6) I(x^7) I(x^8) I(x^9) I(x^10)
1  ( 1 )  " " " "    " "    " "    " "    " "    "*"    " "    " "    " "    
2  ( 1 )  " " "*"    " "    " "    " "    " "    "*"    " "    " "    " "    
3  ( 1 )  " " "*"    " "    " "    "*"    " "    "*"    " "    " "    " "    
4  ( 1 )  "*" "*"    "*"    " "    " "    " "    "*"    " "    " "    " "    
5  ( 1 )  "*" "*"    "*"    "*"    " "    " "    "*"    " "    " "    " "    
6  ( 1 )  "*" " "    "*"    " "    " "    "*"    "*"    "*"    " "    "*"    
7  ( 1 )  "*" " "    "*"    " "    "*"    "*"    "*"    "*"    " "    "*"    
8  ( 1 )  "*" "*"    "*"    "*"    " "    "*"    "*"    "*"    " "    "*"    
9  ( 1 )  "*" "*"    "*"    "*"    " "    "*"    "*"    "*"    "*"    "*"    
10  ( 1 ) "*" "*"    "*"    "*"    "*"    "*"    "*"    "*"    "*"    "*"    
par(mfrow=c(1,3))
plot(bsf.summary$adjr2, xlab = 'Number of variables', ylab = 'Adjusted R2', type = 'l')
rsq.max.idx <- which.max(bsf.summary$adjr2)
points(rsq.max.idx, bsf.summary$adjr2[rsq.max.idx], col = 'red', cex = 2, pch = 20)
cp.min.idx <- which.min(bsf.summary$cp)
plot(bsf.summary$cp, xlab = 'Number of variables', ylab = 'Cp', type = 'l')
points(cp.min.idx, bsf.summary$cp[cp.min.idx], col = 'red', cex = 2, pch = 20)
bic.min.idx <- which.min(bsf.summary$bic)
plot(bsf.summary$bic, xlab = 'Number of variables', ylab = 'BIC', type = 'l')
points(bic.min.idx, bsf.summary$bic[bic.min.idx], col = 'red', cex = 2, pch = 20)

Compare the coefficients:

coef(bsf, 1)
(Intercept)      I(x^7) 
    2.68994     7.69777 

The coefficient estimates vs the original coefficients: \[ \beta_0 = 2.731 \quad \beta_7 = 7.697 \\ \hat\beta_0 = 2.690 \quad \hat\beta_7 = 7.698 \]

Fit with lasso:

X <- data.frame(x, x^2, x^3, x^4, x^5, x^6, x^7, x^8, x^9, x^10)
lasso.cv <- cv.glmnet(as.matrix(X), yf, alpha = 1)
plot(lasso.cv)

lbl <- lasso.cv$lambda.min
lasso.res <- glmnet(as.matrix(X), yf, alpha = 1, lambda = lbl)
lasso.res$beta
10 x 1 sparse Matrix of class "dgCMatrix"
              s0
x    .          
x.2  .          
x.3  .          
x.4  .          
x.5  .          
x.6  .          
x.7  7.436029866
x.8  .          
x.9  0.002537033
x.10 .          
lasso.res$a0
      s0 
3.734293 

The coefficient estimates vs the original coefficients: \[ \beta_0 = 2.731 \quad \beta_7 = 7.697 \\ \hat\beta_0 = 3.734 \quad \hat\beta_7 = 7.436 \] So for this model, the lasso gives less accurate coefficient estimates than best subsets selection: the intercept is off by about 1 and the \(X^7\) coefficient is shrunk toward zero.

Question 9

(a) and (b): generate the data set, fit with linear regression and calculate the test error:
set.seed(1)
train <- sample(1: nrow(College), nrow(College)/2)
test <- -train
lr.fit <- lm(Apps ~ ., data = College[train, ])  # fit on the training rows only
lr.pred <- predict(lr.fit, College[test, ])
mean((lr.pred - College[test, ]$Apps) ^ 2)
[1] 940062.9
summary(lr.fit)

Call:
lm(formula = Apps ~ ., data = College)

Residuals:
    Min      1Q  Median      3Q     Max 
-4908.8  -430.2   -29.5   322.3  7852.5 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept) -445.08413  408.32855  -1.090 0.276053    
PrivateYes  -494.14897  137.81191  -3.586 0.000358 ***
Accept         1.58581    0.04074  38.924  < 2e-16 ***
Enroll        -0.88069    0.18596  -4.736 2.60e-06 ***
Top10perc     49.92628    5.57824   8.950  < 2e-16 ***
Top25perc    -14.23448    4.47914  -3.178 0.001543 ** 
F.Undergrad    0.05739    0.03271   1.754 0.079785 .  
P.Undergrad    0.04445    0.03214   1.383 0.167114    
Outstate      -0.08587    0.01906  -4.506 7.64e-06 ***
Room.Board     0.15103    0.04829   3.127 0.001832 ** 
Books          0.02090    0.23841   0.088 0.930175    
Personal       0.03110    0.06308   0.493 0.622060    
PhD           -8.67850    4.63814  -1.871 0.061714 .  
Terminal      -3.33066    5.09494  -0.654 0.513492    
S.F.Ratio     15.38961   13.00622   1.183 0.237081    
perc.alumni    0.17867    4.10230   0.044 0.965273    
Expend         0.07790    0.01235   6.308 4.79e-10 ***
Grad.Rate      8.66763    2.94893   2.939 0.003390 ** 
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 1041 on 759 degrees of freedom
Multiple R-squared:  0.9292,    Adjusted R-squared:  0.9276 
F-statistic: 585.9 on 17 and 759 DF,  p-value: < 2.2e-16

(c): fit with ridge regression and calculate test error:

set.seed(1)
inpx <- model.matrix(Apps ~ ., data = College)[, -1]
inpy <- College$Apps
cv.out <- cv.glmnet(inpx[train, ], inpy[train], alpha = 0)
rr.fit <- glmnet(inpx[train, ], inpy[train], alpha = 0, lambda = cv.out$lambda.min)
rr.pred <- predict(rr.fit, inpx[test, ], s = cv.out$lambda.min)
mean((rr.pred - inpy[test]) ^ 2)
[1] 1038427

(d): fit with lasso, calculate the test error and show coefficient estimates:

set.seed(1)
cv.out <- cv.glmnet(inpx[train, ], inpy[train], alpha = 1)
lasso.fit <- glmnet(inpx[train, ], inpy[train], alpha = 1, lambda = cv.out$lambda.min)
lasso.pred <- predict(lasso.fit, inpx[test, ], s = cv.out$lambda.min)
mean((lasso.pred - inpy[test]) ^ 2)
[1] 1034786
coef(lasso.fit)
18 x 1 sparse Matrix of class "dgCMatrix"
                        1
(Intercept) -3.674800e+02
PrivateYes  -5.305394e+02
Accept       1.556370e+00
Enroll      -4.041862e-01
Top10perc    5.032803e+01
Top25perc   -9.825451e+00
F.Undergrad -1.798594e-02
P.Undergrad  .           
Outstate    -5.752475e-02
Room.Board   1.963418e-01
Books        1.927387e-02
Personal     3.556498e-03
PhD         -4.740559e+00
Terminal    -2.924054e+00
S.F.Ratio    .           
perc.alumni -2.264210e+00
Expend       3.257759e-02
Grad.Rate    3.412244e+00
# predict(lasso.fit, type = 'coefficients', s = cv.out$lambda.min) produces the same results as the coef() call above

2 of the 17 predictors (P.Undergrad and S.F.Ratio) have coefficient estimates of exactly zero.
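To count the zeroed-out coefficients programmatically rather than by eye, a small sketch (not run here):

lasso.coef.mat <- as.matrix(coef(lasso.fit))   # dense 18 x 1 matrix of estimates
sum(lasso.coef.mat[-1, 1] == 0)                # number of predictors dropped by the lasso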

(e): fit with PCR and calculate the test error:

library(pls)
set.seed(1)
pcr.fit <- pcr(Apps ~ ., data = College, subset = train, scale = TRUE, validation = 'CV')
summary(pcr.fit)
Data:   X dimension: 388 17 
    Y dimension: 388 1
Fit method: svdpc
Number of components considered: 17

VALIDATION: RMSEP
Cross-validated using 10 random segments.
       (Intercept)  1 comps  2 comps  3 comps  4 comps  5 comps  6 comps  7 comps
CV            4335     4179     2364     2374     1996     1844     1845     1850
adjCV         4335     4182     2360     2374     1788     1831     1838     1844
       8 comps  9 comps  10 comps  11 comps  12 comps  13 comps  14 comps  15 comps
CV        1863     1809      1809      1812      1815      1825      1810      1823
adjCV     1857     1801      1800      1804      1808      1817      1806      1789
       16 comps  17 comps
CV         1273      1281
adjCV      1260      1268

TRAINING: % variance explained
      1 comps  2 comps  3 comps  4 comps  5 comps  6 comps  7 comps  8 comps  9 comps
X      31.216    57.68    64.73    70.55    76.33    81.30    85.01    88.40    91.16
Apps    6.976    71.47    71.58    83.32    83.44    83.45    83.46    83.47    84.53
      10 comps  11 comps  12 comps  13 comps  14 comps  15 comps  16 comps  17 comps
X        93.36     95.38     96.94     97.96     98.76     99.40     99.87    100.00
Apps     84.86     84.98     84.98     84.99     85.24     90.87     93.93     93.97
validationplot(pcr.fit, val.type = 'MSEP')

pcr.pred <- predict(pcr.fit, inpx[test, ], ncomp = 5)
mean((pcr.pred - inpy[test]) ^ 2)
[1] 1907827

When \(M=5\), the test MSE is 1907827.

(f): fit with PLS and calculate the test error:

library(pls)
set.seed(1)
pls.fit <- plsr(Apps ~ ., data = College, subset = train, scale = TRUE, validation = 'CV')
validationplot(pls.fit, val.type = 'MSEP')

pls.pred <- predict(pls.fit, inpx[test, ], ncomp = 6)
mean((pls.pred - inpy[test]) ^ 2)
[1] 1112189

When \(M=6\), the test MSE is 1112189.

(g): Compare the prediction accuracy of the above models using test \(R^2\):

test.avg.apps <- mean(inpy[test])
lr.r2 <- 1 - mean((lr.pred - inpy[test])^2) / mean((test.avg.apps - inpy[test])^2)
rr.r2 <- 1 - mean((rr.pred - inpy[test])^2) / mean((test.avg.apps - inpy[test])^2)
lasso.r2 <- 1 - mean((lasso.pred - inpy[test])^2) / mean((test.avg.apps - inpy[test])^2)
pcr.r2 <- 1 - mean((pcr.pred - inpy[test])^2) / mean((test.avg.apps - inpy[test])^2)
pls.r2 <- 1 - mean((pls.pred - inpy[test])^2) / mean((test.avg.apps - inpy[test])^2)
barplot(c(lr.r2, rr.r2, lasso.r2, pcr.r2, pls.r2), col="gray", names.arg=c("LR", "Ridge", "Lasso", "PCR", "PLS"), main="Test R-squared")

For this data set, linear regression gives the highest test \(R^2\), while PCR gives the worst prediction accuracy.

---
title: "Applied Exercises of Chapter 6"
output: html_notebook
---

# Question 8

Generate test data:
```{r}
library(ISLR)    # College data set (Question 9)
library(leaps)   # regsubsets()
library(glmnet)  # glmnet(), cv.glmnet()
set.seed(1)
x <- rnorm(100)
eps <- rnorm(100)
beta0 <- 2.731
beta1 <- -3.296
beta2 <- 5.117
beta3 <- -1.793
y <- beta0 + beta1 * x + beta2 * x^2 + beta3 * x^3 + eps
plot(x, y)
inp <- data.frame(x, y)
```

## (c) Fit with best subsets selection
```{r}
bss <- regsubsets(y ~ x + I(x^2) + I(x^3) + I(x^4) + I(x^5) + I(x^6) + I(x^7) + I(x^8) + I(x^9) + I(x^10), data = inp, nvmax = 10)
bss.summary <- summary(bss)
bss.summary

par(mfrow=c(1,3))
plot(bss.summary$adjr2, xlab = 'Number of variables', ylab = 'Adjusted R2', type = 'l')
rsq.max.idx <- which.max(bss.summary$adjr2)
points(rsq.max.idx, bss.summary$adjr2[rsq.max.idx], col = 'red', cex = 2, pch = 20)

cp.min.idx <- which.min(bss.summary$cp)
plot(bss.summary$cp, xlab = 'Number of variables', ylab = 'Cp', type = 'l')
points(cp.min.idx, bss.summary$cp[cp.min.idx], col = 'red', cex = 2, pch = 20)

bic.min.idx <- which.min(bss.summary$bic)
plot(bss.summary$bic, xlab = 'Number of variables', ylab = 'BIC', type = 'l')
points(bic.min.idx, bss.summary$bic[bic.min.idx], col = 'red', cex = 2, pch = 20)

par(mfrow=c(1,1))
plot(bss, scale = 'adjr2')
plot(bss, scale = 'Cp')
plot(bss, scale = 'bic')
```
According to adjusted $R^2$ and $C_p$, the best model contains 4 predictors: $X$, $X^2$, $X^3$ and $X^5$.
According to BIC, the best model contains 3 predictors: $X$, $X^2$ and $X^3$.

Get the coefficient estimates:
```{r}
coef(bss, 3)
```
The coefficient estimates vs the original coefficients:
$$
\beta_0 = 2.731 \quad \beta_1 = -3.296 \quad \beta_2 = 5.117 \quad \beta_3 = -1.793\\
\hat\beta_0 = 2.793 \quad \hat\beta_1 = -3.321 \quad \hat\beta_2 = 4.993\quad \hat\beta_3 = -1.775
$$
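
For comparison, the 4-variable model preferred by adjusted $R^2$ and $C_p$ (which adds the $X^5$ term) could be inspected the same way; a sketch, not evaluated here:
```{r, eval=FALSE}
coef(bss, 4)
```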

## (d)
Fit with forward stepwise selection method:
```{r}
fss <- regsubsets(y ~ x + I(x^2) + I(x^3) + I(x^4) + I(x^5) + I(x^6) + I(x^7) + I(x^8) + I(x^9) + I(x^10), data = inp, method = 'forward', nvmax = 10)
fss.summary <- summary(fss)
fss.summary

par(mfrow=c(1,3))
plot(fss.summary$adjr2, xlab = 'Number of variables', ylab = 'Adjusted R2', type = 'l')
rsq.max.idx <- which.max(fss.summary$adjr2)
points(rsq.max.idx, fss.summary$adjr2[rsq.max.idx], col = 'red', cex = 2, pch = 20)

cp.min.idx <- which.min(fss.summary$cp)
plot(fss.summary$cp, xlab = 'Number of variables', ylab = 'Cp', type = 'l')
points(cp.min.idx, fss.summary$cp[cp.min.idx], col = 'red', cex = 2, pch = 20)

bic.min.idx <- which.min(fss.summary$bic)
plot(fss.summary$bic, xlab = 'Number of variables', ylab = 'BIC', type = 'l')
points(bic.min.idx, fss.summary$bic[bic.min.idx], col = 'red', cex = 2, pch = 20)
```
According to adjusted $R^2$, the best model contains 5 predictors: $X$, $X^2$, $X^3$, $X^5$ and $X^6$.
According to $C_p$, the best model contains 4 predictors: $X$, $X^2$, $X^3$ and $X^5$.
According to BIC, the best model contains 3 predictors: $X$, $X^2$ and $X^3$.

Fit with backward stepwise selection method:
```{r}
bks <- regsubsets(y ~ x + I(x^2) + I(x^3) + I(x^4) + I(x^5) + I(x^6) + I(x^7) + I(x^8) + I(x^9) + I(x^10), data = inp, method = 'backward', nvmax = 10)
bks.summary <- summary(bks)
bks.summary

which.max(bks.summary$adjr2)
which.min(bks.summary$cp)
bks.summary$bic
which.min(bks.summary$bic)

par(mfrow=c(1,3))
plot(bks.summary$adjr2, xlab = 'Number of variables', ylab = 'Adjusted R2', type = 'l')
rsq.max.idx <- which.max(bks.summary$adjr2)
points(rsq.max.idx, bks.summary$adjr2[rsq.max.idx], col = 'red', cex = 2, pch = 20)

cp.min.idx <- which.min(bks.summary$cp)
plot(bks.summary$cp, xlab = 'Number of variables', ylab = 'Cp', type = 'l')
points(cp.min.idx, bks.summary$cp[cp.min.idx], col = 'red', cex = 2, pch = 20)

bic.min.idx <- which.min(bks.summary$bic)
plot(bks.summary$bic, xlab = 'Number of variables', ylab = 'BIC', type = 'l')
points(bic.min.idx, bks.summary$bic[bic.min.idx], col = 'red', cex = 2, pch = 20)
```
The backward stepwise results largely agree with the forward stepwise results, and both are essentially the same as the best subsets results: all three methods choose the same 3-variable model ($X$, $X^2$, $X^3$) under BIC.
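
To verify the agreement programmatically rather than by reading the selection tables, the `which` matrices returned by `summary()` can be compared directly (a sketch, not evaluated here):
```{r, eval=FALSE}
# Compare the 3-variable models chosen by the three methods; each row is a logical
# vector indicating which terms (plus the intercept) are included
rbind(best     = bss.summary$which[3, ],
      forward  = fss.summary$which[3, ],
      backward = bks.summary$which[3, ])
```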

Get the coefficient estimates (which are the same as the best subsets results in part (c)):
```{r}
coef(fss, 3)
coef(bks, 3)
```

### Use `poly()` instead of `I()`
```{r}
fss.poly <- regsubsets(y ~ poly(x, 10), data = inp, method = 'forward', nvmax = 10)
coef(fss.poly, 3)
```
The coefficient estimates look completely different because `poly(x, 10)` uses orthogonal polynomials by default, so its coefficients are not on the same scale as the raw polynomial coefficients; to get comparable estimates, use `poly(x, 10, raw = TRUE)` or the `I()` terms above.
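
If orthogonal polynomials are not wanted, `poly()` can be told to use the raw basis, which should reproduce estimates comparable to the `I()` fit above (a sketch, not evaluated here):
```{r, eval=FALSE}
# raw = TRUE keeps the plain x, x^2, ..., x^10 basis instead of an orthogonal one
fss.raw <- regsubsets(y ~ poly(x, 10, raw = TRUE), data = inp, method = 'forward', nvmax = 10)
coef(fss.raw, 3)
```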

## (e)
Fit with lasso model:
```{r}
X <- data.frame(x, x^2, x^3, x^4, x^5, x^6, x^7, x^8, x^9, x^10)
lasso.cv <- cv.glmnet(as.matrix(X), y, alpha = 1)
plot(lasso.cv)
lbl <- lasso.cv$lambda.min
lasso.res <- glmnet(as.matrix(X), y, alpha = 1, lambda = lbl)
lasso.res$beta
lasso.res$a0
```
The coefficient estimates vs the original coefficients:
$$
\beta_0 = 2.731 \quad \beta_1 = -3.296 \quad \beta_2 = 5.117 \quad \beta_3 = -1.793\\
\hat\beta_0 = 2.906 \quad \hat\beta_1 = -3.250 \quad \hat\beta_2 = 4.719 \quad \hat\beta_3 = -1.786
$$

## (f)

Fit $Y = \beta_0 + \beta_7 X^7 + \epsilon$ with best subsets method:
```{r}
beta7 <- 7.697
yf <- beta0 + beta7 * x^7 + eps

bsf <- regsubsets(yf ~ x + I(x^2) + I(x^3) + I(x^4) + I(x^5) + I(x^6) + I(x^7) + I(x^8) + I(x^9) + I(x^10), data = inp, nvmax = 10)
bsf.summary <- summary(bsf)
bsf.summary

par(mfrow=c(1,3))
plot(bsf.summary$adjr2, xlab = 'Number of variables', ylab = 'Adjusted R2', type = 'l')
rsq.max.idx <- which.max(bsf.summary$adjr2)
points(rsq.max.idx, bsf.summary$adjr2[rsq.max.idx], col = 'red', cex = 2, pch = 20)

cp.min.idx <- which.min(bsf.summary$cp)
plot(bsf.summary$cp, xlab = 'Number of variables', ylab = 'Cp', type = 'l')
points(cp.min.idx, bsf.summary$cp[cp.min.idx], col = 'red', cex = 2, pch = 20)

bic.min.idx <- which.min(bsf.summary$bic)
plot(bsf.summary$bic, xlab = 'Number of variables', ylab = 'BIC', type = 'l')
points(bic.min.idx, bsf.summary$bic[bic.min.idx], col = 'red', cex = 2, pch = 20)
```

Compare the coefficients:
```{r}
coef(bsf, 1)
```
The coefficient estimates vs the original coefficients:
$$
\beta_0 = 2.731 \quad \beta_7 = 7.697 \\
\hat\beta_0 = 2.690 \quad \hat\beta_7 = 7.698
$$

Fit with lasso:
```{r}
X <- data.frame(x, x^2, x^3, x^4, x^5, x^6, x^7, x^8, x^9, x^10)
lasso.cv <- cv.glmnet(as.matrix(X), yf, alpha = 1)
plot(lasso.cv)
lbl <- lasso.cv$lambda.min
lasso.res <- glmnet(as.matrix(X), yf, alpha = 1, lambda = lbl)
lasso.res$beta
lasso.res$a0
```
The coefficient estimates vs the original coefficients:
$$
\beta_0 = 2.731 \quad \beta_7 = 7.697 \\
\hat\beta_0 = 3.734 \quad \hat\beta_7 = 7.436
$$
So for this model, the lasso gives less accurate coefficient estimates than best subsets selection: the intercept is off by about 1 and the $X^7$ coefficient is shrunk toward zero.

# Question 9

(a) and (b): generate the data set, fit with linear regression and calculate the test error:
```{r}
set.seed(1)
train <- sample(1: nrow(College), nrow(College)/2)
test <- -train
lr.fit <- lm(Apps ~ ., data = College[train, ])  # fit on the training rows only
lr.pred <- predict(lr.fit, College[test, ])
mean((lr.pred - College[test, ]$Apps) ^ 2)
summary(lr.fit)
```

(c): fit with ridge regression and calculate test error:
```{r}
set.seed(1)
inpx <- model.matrix(Apps ~ ., data = College)[, -1]
inpy <- College$Apps
cv.out <- cv.glmnet(inpx[train, ], inpy[train], alpha = 0)
rr.fit <- glmnet(inpx[train, ], inpy[train], alpha = 0, lambda = cv.out$lambda.min)
rr.pred <- predict(rr.fit, inpx[test, ], s = cv.out$lambda.min)
mean((rr.pred - inpy[test]) ^ 2)
```
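
Unlike the lasso below, ridge regression keeps all 17 predictors; their shrunken estimates could be inspected directly from the fitted object (a sketch, not evaluated here):
```{r, eval=FALSE}
coef(rr.fit)   # 18 x 1 matrix: intercept plus all 17 shrunken (but nonzero) coefficients
```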

(d): fit with lasso, calculate the test error and show coefficient estimates:
```{r}
set.seed(1)
cv.out <- cv.glmnet(inpx[train, ], inpy[train], alpha = 1)
lasso.fit <- glmnet(inpx[train, ], inpy[train], alpha = 1, lambda = cv.out$lambda.min)
lasso.pred <- predict(lasso.fit, inpx[test, ], s = cv.out$lambda.min)
mean((lasso.pred - inpy[test]) ^ 2)
coef(lasso.fit)
# predict(lasso.fit, type = 'coefficients', s = cv.out$lambda.min) produces the same results as the coef() call above
```

2 of the 17 predictors (*P.Undergrad* and *S.F.Ratio*) have coefficient estimates of exactly zero.

(e): fit with PCR and calculate the test error:
```{r}
library(pls)
set.seed(1)
pcr.fit <- pcr(Apps ~ ., data = College, subset = train, scale = TRUE, validation = 'CV')
summary(pcr.fit)
validationplot(pcr.fit, val.type = 'MSEP')
pcr.pred <- predict(pcr.fit, inpx[test, ], ncomp = 5)
mean((pcr.pred - inpy[test]) ^ 2)
```
When $M=5$, the test MSE is 1907827.
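
Instead of reading $M$ off the validation plot, the component count with the lowest cross-validated RMSEP could be extracted programmatically (a sketch, not evaluated here, assuming `pls`'s `RMSEP()` helper):
```{r, eval=FALSE}
cv.rmsep <- RMSEP(pcr.fit)$val["CV", 1, ]  # CV curve; the first entry is the intercept-only model
which.min(cv.rmsep) - 1                    # subtract 1 to convert the index to a component count
```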

(f): fit with PLS and calculate the test error:
```{r}
library(pls)
set.seed(1)
pls.fit <- plsr(Apps ~ ., data = College, subset = train, scale = TRUE, validation = 'CV')
validationplot(pls.fit, val.type = 'MSEP')
pls.pred <- predict(pls.fit, inpx[test, ], ncomp = 6)
mean((pls.pred - inpy[test]) ^ 2)
```
When $M=6$, the test MSE is 1112189.

(g): Compare the prediction accuracy of the above models using test $R^2$:
```{r}
test.avg.apps <- mean(inpy[test])

lr.r2 <- 1 - mean((lr.pred - inpy[test])^2) / mean((test.avg.apps - inpy[test])^2)

rr.r2 <- 1 - mean((rr.pred - inpy[test])^2) / mean((test.avg.apps - inpy[test])^2)

lasso.r2 <- 1 - mean((lasso.pred - inpy[test])^2) / mean((test.avg.apps - inpy[test])^2)

pcr.r2 <- 1 - mean((pcr.pred - inpy[test])^2) / mean((test.avg.apps - inpy[test])^2)
pls.r2 <- 1 - mean((pls.pred - inpy[test])^2) / mean((test.avg.apps - inpy[test])^2)

barplot(c(lr.r2, rr.r2, lasso.r2, pcr.r2, pls.r2), col="gray", names.arg=c("LR", "Ridge", "Lasso", "PCR", "PLS"), main="Test R-squared")
```
For this data set, linear regression gives the highest test $R^2$, while PCR gives the worst prediction accuracy.
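
To read off the exact values rather than eyeballing the bars, the test $R^2$ values could also be printed as a named vector (a sketch, not evaluated here):
```{r, eval=FALSE}
round(c(LR = lr.r2, Ridge = rr.r2, Lasso = lasso.r2, PCR = pcr.r2, PLS = pls.r2), 3)
```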
