Question 2

2. (a)

  iii. Less flexible, and it will give improved prediction accuracy when its increase in bias is smaller than its decrease in variance. As lambda increases, the flexibility of the fit decreases: the estimated coefficients shrink, with some becoming exactly zero. This can yield a substantial decrease in the variance of the predictions in exchange for a small increase in bias (illustrated in the sketch after (b)).

(b)

  iii. Same as (a), except that every variable keeps a non-zero coefficient: ridge shrinks coefficients toward zero but never sets them exactly to zero.
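
A minimal sketch (not part of the original solution) of the behaviour described in (a) and (b): on simulated data with only two relevant predictors, the lasso zeroes out coefficients as lambda grows, while ridge keeps all of them non-zero. Only the glmnet package is assumed.

library(glmnet)

set.seed(1)
n <- 100; p <- 10
x_sim <- matrix(rnorm(n * p), n, p)
y_sim <- 3 * x_sim[, 1] - 2 * x_sim[, 2] + rnorm(n)  # only 2 relevant predictors

lambdas <- c(1, 0.1, 0.01)  # decreasing, as glmnet expects

# Lasso: the count of non-zero coefficients drops as lambda increases
fit_lasso <- glmnet(x_sim, y_sim, alpha = 1, lambda = lambdas)
colSums(as.matrix(coef(fit_lasso)) != 0)

# Ridge: every coefficient stays non-zero, only smaller in magnitude
fit_ridge <- glmnet(x_sim, y_sim, alpha = 0, lambda = lambdas)
colSums(as.matrix(coef(fit_ridge)) != 0)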

(c)

  ii. Non-linear methods are generally more flexible than least squares, so their predictions tend to have lower bias and higher variance. Prediction accuracy therefore improves when the increase in variance is smaller than the decrease in bias (the bias-variance trade-off); a sketch follows.
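
A minimal sketch of (c), again an assumed illustration rather than part of the original answer: when the true relationship is non-linear, a flexible fit (here a degree-5 polynomial) typically beats a straight line out of sample, because its reduction in bias outweighs its added variance.

set.seed(2)
sim <- data.frame(x = runif(200, -2, 2))
sim$y <- sin(2 * sim$x) + rnorm(200, sd = 0.3)  # non-linear truth
idx <- sample(200, 140)

fit_linear <- lm(y ~ x, data = sim[idx, ])           # rigid: high bias, low variance
fit_poly   <- lm(y ~ poly(x, 5), data = sim[idx, ])  # flexible: lower bias, more variance

# Test MSE: the flexible fit should typically come out lower here
mean((predict(fit_linear, sim[-idx, ]) - sim$y[-idx])^2)
mean((predict(fit_poly, sim[-idx, ]) - sim$y[-idx])^2)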

Question 9
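
The code in Questions 9 and 11 assumes the following packages are attached, presumably in a setup chunk earlier in the document (reshape2 and scales are called via `::` and only need to be installed):

library(ISLR)     # College data
library(MASS)     # Boston data (Question 11)
library(leaps)    # regsubsets()
library(glmnet)   # ridge regression and the lasso
library(pls)      # pcr() and plsr()
library(caret)    # dummyVars()
library(dplyr)    # %>% and mutate()
library(ggplot2)  # plots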

# 70/30 train/test split of the College data
set.seed(12)
train_index <- sample(1:nrow(College), round(nrow(College) * 0.7))

train <- College[train_index,]
nrow(train)/nrow(College)
## [1] 0.7001287
test <- College[-train_index,]
nrow(test)/nrow(College)
## [1] 0.2998713
model_1 <- lm(Apps ~ ., data = train)
summary(model_1)
## 
## Call:
## lm(formula = Apps ~ ., data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2856.6  -428.5   -31.1   341.3  6072.2 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  109.57809  458.85840   0.239  0.81135    
## PrivateYes  -669.57143  156.71415  -4.273 2.29e-05 ***
## Accept         1.43962    0.05963  24.143  < 2e-16 ***
## Enroll        -0.43771    0.21037  -2.081  0.03795 *  
## Top10perc     45.96929    6.17104   7.449 3.88e-13 ***
## Top25perc    -12.27879    4.81219  -2.552  0.01100 *  
## F.Undergrad    0.04197    0.03687   1.138  0.25551    
## P.Undergrad    0.04533    0.03371   1.345  0.17929    
## Outstate      -0.06959    0.02147  -3.241  0.00127 ** 
## Room.Board     0.09682    0.05194   1.864  0.06286 .  
## Books         -0.01277    0.28426  -0.045  0.96420    
## Personal      -0.08698    0.06869  -1.266  0.20598    
## PhD           -9.38474    5.20732  -1.802  0.07208 .  
## Terminal      -4.64721    5.55680  -0.836  0.40336    
## S.F.Ratio      5.63601   14.28717   0.394  0.69339    
## perc.alumni   -6.63544    4.58468  -1.447  0.14841    
## Expend         0.10262    0.01789   5.734 1.65e-08 ***
## Grad.Rate      9.68517    3.20262   3.024  0.00262 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 945.6 on 526 degrees of freedom
## Multiple R-squared:  0.929,  Adjusted R-squared:  0.9267 
## F-statistic: 404.9 on 17 and 526 DF,  p-value: < 2.2e-16
ols_pred <- predict(model_1, test)

(ols_mse <- mean((ols_pred - test$Apps)^2))
## [1] 1642531
# Build numeric model matrices for glmnet; fullRank = FALSE keeps both levels
# of Private (hence the Private.No / Private.Yes columns in the lasso output)
train_mat <- dummyVars(Apps ~ ., data = train, fullRank = FALSE) %>%
  predict(newdata = train) %>%
  as.matrix()

test_mat <- dummyVars(Apps ~ ., data = test, fullRank = FALSE) %>%
  predict(newdata = test) %>%
  as.matrix()
set.seed(3)

model_ridge <- cv.glmnet(y = train$Apps, 
                         x = train_mat, 
                         alpha = 0, 
                         lambda = 10^seq(2,-2, length = 100), 
                         standardize = TRUE, 
                         nfolds = 5)

data.frame(lambda = model_ridge$lambda, 
           cv_mse = model_ridge$cvm) %>%
  ggplot(aes(x = lambda, y = cv_mse)) + 
  geom_point() + 
  geom_line() + 
  geom_vline(xintercept = model_ridge$lambda.min, col = "deepskyblue3") +
  geom_hline(yintercept = min(model_ridge$cvm), col = "deepskyblue3") +
  scale_x_continuous(trans = 'log10', breaks = c(0.01, 0.1, 1, 10, 100), labels = c(0.01, 0.1, 1, 10, 100)) + 
  scale_y_continuous(labels = scales::comma_format()) + 
  labs(x = "Lambda", 
       y = "Cross-Validation MSE", 
       title = "Ridge Regression - Lambda Selection (Using 5-Fold Cross-Validation)")

# Refit on the full training set over the same lambda grid
model_ridge_best <- glmnet(y = train$Apps,
                           x = train_mat,
                           alpha = 0, 
                           lambda = 10^seq(2,-2, length = 100))

ridge_pred <- predict(model_ridge_best, s = model_ridge$lambda.min, newx = test_mat)
(ridge_mse <- mean((ridge_pred - test$Apps)^2))
## [1] 1753325
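
For comparison with the lasso coefficients shown later, the ridge coefficients at the selected lambda could be inspected the same way (output not shown here):

predict(model_ridge_best, type = "coefficients", s = model_ridge$lambda.min)
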
set.seed(4)

model_lasso <- cv.glmnet(y = train$Apps, 
                         x = train_mat, 
                         alpha = 1, 
                         lambda = 10^seq(2, -2, length = 100), 
                         standardize = TRUE, 
                         nfolds = 5, 
                         thresh = 1e-12)

data.frame(lambda = model_lasso$lambda, 
           cv_mse = model_lasso$cvm, 
           nonzero_coeff = model_lasso$nzero) %>%
  ggplot(aes(x = lambda, y = cv_mse, col = nonzero_coeff)) + 
  geom_point() + 
  geom_line() + 
  geom_vline(xintercept = model_lasso$lambda.min, col = "deepskyblue3") +
  geom_hline(yintercept = min(model_lasso$cvm), col = "deepskyblue3") +
  scale_x_continuous(trans = 'log10', breaks = c(0.01, 0.1, 1, 10, 100), labels = c(0.01, 0.1, 1, 10, 100)) + 
  scale_y_continuous(labels = scales::comma_format()) + 
  theme(legend.position = "bottom") + 
  scale_color_gradient(low = "red", high = "green") +
  labs(x = "Lambda", 
       y = "Cross-Validation MSE", 
       col = "Non-Zero Coefficients:", 
       title = "Lasso - Lambda Selection (Using 5-Fold Cross-Validation)")

# Refit on the full training set (over a wider lambda grid); predictions use the CV-chosen lambda.min
model_lasso_best <- glmnet(y = train$Apps,
                           x = train_mat,
                           alpha = 1, 
                           lambda = 10^seq(2,-5, length = 100))

lasso_pred <- predict(model_lasso_best, s = model_lasso$lambda.min, newx = test_mat)
(lasso_mse <- mean((lasso_pred - test$Apps)^2))
## [1] 1720941
lasso_coef <- predict(model_lasso_best, type = "coefficients", s = model_lasso$lambda.min)

round(lasso_coef, 3)
## 19 x 1 sparse Matrix of class "dgCMatrix"
##                   s1
## (Intercept) -764.406
## Private.No   645.813
## Private.Yes    .    
## Accept         1.351
## Enroll         .    
## Top10perc     36.721
## Top25perc     -5.248
## F.Undergrad    .    
## P.Undergrad    0.029
## Outstate      -0.048
## Room.Board     0.084
## Books          .    
## Personal      -0.061
## PhD           -6.771
## Terminal      -4.525
## S.F.Ratio      .    
## perc.alumni   -7.335
## Expend         0.092
## Grad.Rate      7.555
set.seed(5)

# PCR on the training set; validation = "CV" uses 10-fold cross-validation by default
model_pcr <- pcr(Apps ~ .,
                 data = train, 
                 scale = TRUE, 
                 validation = "CV")

model_pcr_mse <- MSEP(model_pcr, estimate = "CV")$val %>%
                        reshape2::melt() %>%
                        mutate(M = 0:(nrow(.)-1))
                        

model_pcr_mse
##    estimate response       model      value  M
## 1        CV     Apps (Intercept) 12222541.6  0
## 2        CV     Apps     1 comps 11634225.5  1
## 3        CV     Apps     2 comps  2961400.5  2
## 4        CV     Apps     3 comps  2813186.4  3
## 5        CV     Apps     4 comps  1847789.7  4
## 6        CV     Apps     5 comps  1726480.3  5
## 7        CV     Apps     6 comps  1608343.7  6
## 8        CV     Apps     7 comps  1520679.6  7
## 9        CV     Apps     8 comps  1520036.3  8
## 10       CV     Apps     9 comps  1396939.2  9
## 11       CV     Apps    10 comps  1392357.1 10
## 12       CV     Apps    11 comps  1405071.3 11
## 13       CV     Apps    12 comps  1391098.8 12
## 14       CV     Apps    13 comps  1393771.6 13
## 15       CV     Apps    14 comps  1402508.0 14
## 16       CV     Apps    15 comps  1392142.3 15
## 17       CV     Apps    16 comps   998691.7 16
## 18       CV     Apps    17 comps   978147.2 17
model_pcr_mse <- model_pcr_mse[c("M", "value")]
model_pcr_mse %>%
  mutate(min_CV_MSE = as.numeric(min(value) == value)) %>%
  ggplot(aes(x = M, y = value)) + 
  geom_line(col = "grey55") + 
  geom_point(size = 2, aes(col = factor(min_CV_MSE))) + 
  scale_y_continuous(labels = scales::comma_format()) + 
  scale_color_manual(values = c("deepskyblue3", "green")) + 
  theme(legend.position = "none") + 
  labs(x = "M", 
       y = "Cross-Validation MSE", 
       title = "PCR - M Selection (Using 10-Fold Cross-Validation)")

# The CV error is minimised at M = 17; using all 17 components makes PCR identical to OLS
pcr_pred <- predict(model_pcr, test, ncomp = 17)
(pcr_mse <- mean((pcr_pred - test$Apps)^2))
## [1] 1642531
set.seed(6)

# PLS on the training set, again with 10-fold cross-validation
model_pls <- plsr(Apps ~ .,
                  data = train, 
                  scale = TRUE, 
                  validation = "CV")

model_pls_mse <- MSEP(model_pls, estimate = "CV")$val %>%
  reshape2::melt() %>%
  mutate(M = 0:(nrow(.)-1)) 

model_pls_mse
##    estimate response       model    value  M
## 1        CV     Apps (Intercept) 12222542  0
## 2        CV     Apps     1 comps  2412191  1
## 3        CV     Apps     2 comps  1677574  2
## 4        CV     Apps     3 comps  1352068  3
## 5        CV     Apps     4 comps  1270299  4
## 6        CV     Apps     5 comps  1187455  5
## 7        CV     Apps     6 comps  1054465  6
## 8        CV     Apps     7 comps  1024252  7
## 9        CV     Apps     8 comps  1024301  8
## 10       CV     Apps     9 comps  1023928  9
## 11       CV     Apps    10 comps  1022383 10
## 12       CV     Apps    11 comps  1020038 11
## 13       CV     Apps    12 comps  1021349 12
## 14       CV     Apps    13 comps  1023531 13
## 15       CV     Apps    14 comps  1024888 14
## 16       CV     Apps    15 comps  1023937 15
## 17       CV     Apps    16 comps  1023689 16
## 18       CV     Apps    17 comps  1023647 17
model_pls_mse <- model_pls_mse[c("M", "value")]
model_pls_mse %>%
  mutate(min_CV_MSE = as.numeric(min(value) == value)) %>%
  ggplot(aes(x = M, y = value)) + 
  geom_line(col = "grey55") + 
  geom_point(size = 2, aes(col = factor(min_CV_MSE))) + 
  scale_y_continuous(labels = scales::comma_format()) + 
  scale_color_manual(values = c("deepskyblue3", "green")) + 
  theme(legend.position = "none") + 
  labs(x = "M", 
       y = "Cross-Validation MSE", 
       title = "PLS - M Selection (Using 10-Fold Cross-Validation)")

# The CV error flattens out beyond M = 7 or so (minimum at M = 11); M = 12 performs comparably
pls_pred <- predict(model_pls, test, ncomp = 12)
(pls_mse <- mean((pls_pred - test$Apps)^2))
## [1] 1652446
# Total sum of squares on the test set, for computing test R-squared
SS_tot <- sum((test$Apps - mean(test$Apps))^2)

data.frame(method = c("OLS", "Ridge", "Lasso", "PCR", "PLS"), 
           test_MSE = c(ols_mse, ridge_mse, lasso_mse, pcr_mse, pls_mse), 
           test_R2 = c(1 - sum((test$Apps - ols_pred)^2) / SS_tot,
                       1 - sum((test$Apps - ridge_pred)^2) / SS_tot, 
                       1 - sum((test$Apps - lasso_pred)^2) / SS_tot, 
                       1 - sum((test$Apps - pcr_pred)^2) / SS_tot, 
                       1 - sum((test$Apps - pls_pred)^2) / SS_tot)) %>%
  arrange(test_MSE)
##   method test_MSE   test_R2
## 1    PCR  1642531 0.9233748
## 2    OLS  1642531 0.9233748
## 3    PLS  1652446 0.9229122
## 4  Lasso  1720941 0.9197169
## 5  Ridge  1753325 0.9182061

All five methods predict Apps with similar accuracy (test R-squared is roughly 0.92 in every case). PCR with M = 17 simply reproduces the OLS fit, and neither ridge nor the lasso improves on least squares for this split.

Question 11

# regsubsets objects have no predict() method, so define one: rebuild the model
# matrix from the original formula and multiply by the chosen model's coefficients
predict.regsubsets <- function(object, newdata, id, ...) {
    form <- as.formula(object$call[[2]])
    mat <- model.matrix(form, newdata)
    coefi <- coef(object, id = id)   # coefficients of the model with id variables
    xvars <- names(coefi)
    mat[, xvars] %*% coefi
}
# 10-fold CV over best subset selection: fit on nine folds, record the held-out
# MSE for every model size from 1 to 13
set.seed(121)
k <- 10
folds <- sample(1:k, nrow(Boston), replace = TRUE)
cv.errors <- matrix(NA, k, 13, dimnames = list(NULL, paste(1:13)))
for (j in 1:k) {
    best.fit <- regsubsets(crim ~ ., data = Boston[folds != j, ], nvmax = 13)
    for (i in 1:13) {
        pred <- predict(best.fit, Boston[folds == j, ], id = i)
        cv.errors[j, i] <- mean((Boston$crim[folds == j] - pred)^2)
    }
}
mean.cv.errors <- apply(cv.errors, 2, mean)
plot(1:13, mean.cv.errors, xlab = "Number of variables", ylab = "CV error",
     main = "Best subset selection", pch = 1, type = "b")
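
As a quick check (not in the original code), the model size that minimises the CV error can be read off directly; the value depends on the seed set above.

which.min(mean.cv.errors)
min(mean.cv.errors)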

# Using the lasso to create a sparse model. Note that train and test are
# reassigned here as index vectors for the Boston data.
set.seed(121)
x <- model.matrix(crim ~ ., Boston)[, -1]
y <- Boston$crim
grid <- 10^seq(10, -2, length = 100)
train <- sample(1:nrow(x), round(nrow(x) / 1.3))   # roughly 77/23 train/test split
test <- (-train)
y.test <- y[test]
set.seed(121)
cv.out <- cv.glmnet(x[train, ], y[train], alpha = 1)   # CV to choose lambda
bestlam <- cv.out$lambda.min
lasso.mod <- glmnet(x[train, ], y[train], alpha = 1, lambda = grid)
lasso.pred <- predict(lasso.mod, s = bestlam, newx = x[test, ])
mean((lasso.pred - y.test)^2)
## [1] 31.61342
lasso.coef <- predict(lasso.mod, type = "coefficients", s = bestlam)[1:14, ]   # intercept plus all 13 predictors (1:13 would drop medv)
lasso.coef
##  (Intercept)           zn        indus         chas          nox           rm 
## 11.863977269  0.035551345 -0.069356583 -0.341147083 -7.159328191  0.258906989 
##          age          dis          rad          tax      ptratio        black 
##  0.000000000 -0.756621224  0.510388304  0.000000000 -0.159640151 -0.009241809 
##        lstat 
##  0.145509242
# Using ridge regression.
cv.out <- cv.glmnet(x[train, ], y[train], alpha = 0)
bestlam <- cv.out$lambda.min
ridge.mod <- glmnet(x[train, ], y[train], alpha = 0, lambda = grid, thresh = 1e-12)
ridge.pred <- predict(ridge.mod, s = bestlam, newx = x[test, ])
mean((ridge.pred - y.test)^2)
## [1] 31.87334
ridge.coef <- predict(ridge.mod, type = "coefficients", s = bestlam)[1:14, ]   # intercept plus all 13 predictors
ridge.coef
##  (Intercept)           zn        indus         chas          nox           rm 
##  9.139463700  0.033824862 -0.087751836 -0.530186066 -6.237996789  0.383596934 
##          age          dis          rad          tax      ptratio        black 
##  0.003905644 -0.708321504  0.425606232  0.003344803 -0.125766385 -0.010097779 
##        lstat 
##  0.155705890
# Using PCR.
pcr.fit <- pcr(crim ~ ., data = Boston, subset = train, scale = TRUE, validation = "CV")
validationplot(pcr.fit, val.type = "MSEP")

# M = 8 components chosen from the validation plot above
pcr.pred <- predict(pcr.fit, x[test, ], ncomp = 8)
mean((pcr.pred - y.test)^2)
## [1] 33.74218

(b)

  Comparing test MSEs, the lasso (31.61) does slightly better than ridge regression (31.87) and PCR (33.74), and it also yields a sparser model, so the lasso model is proposed.

(c)

  No. The lasso sets the age and tax coefficients to zero, so the chosen model does not involve all of the features.