Question 2

2. (a)

  iii. Less flexible, and it will give improved prediction accuracy when its increase in bias is smaller than its decrease in variance. As lambda increases, the flexibility of the fit decreases: the estimated coefficients shrink, with some becoming exactly zero. This can yield a substantial decrease in the variance of the predictions in exchange for a small increase in bias (illustrated in the sketch after (b)).

(b)

  iii. Same as (a), except that every variable keeps a non-zero coefficient: ridge shrinks coefficients toward zero but never sets them exactly to zero.
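
A minimal sketch (not part of the original solution) of the behaviour described in (a) and (b): on simulated data with only two relevant predictors, the lasso zeroes out coefficients as lambda grows, while ridge keeps all of them non-zero. Only the glmnet package is assumed.

library(glmnet)

set.seed(1)
n <- 100; p <- 10
x_sim <- matrix(rnorm(n * p), n, p)
y_sim <- 3 * x_sim[, 1] - 2 * x_sim[, 2] + rnorm(n)  # only 2 relevant predictors

lambdas <- c(1, 0.1, 0.01)  # decreasing, as glmnet expects

# Lasso: the count of non-zero coefficients drops as lambda increases
fit_lasso <- glmnet(x_sim, y_sim, alpha = 1, lambda = lambdas)
colSums(as.matrix(coef(fit_lasso)) != 0)

# Ridge: every coefficient stays non-zero, only smaller in magnitude
fit_ridge <- glmnet(x_sim, y_sim, alpha = 0, lambda = lambdas)
colSums(as.matrix(coef(fit_ridge)) != 0)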

(c)

  ii. Non-linear methods are generally more flexible than least squares, so their predictions tend to have lower bias and higher variance. Prediction accuracy therefore improves when the increase in variance is smaller than the decrease in bias (the bias-variance trade-off); a sketch follows.
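
A minimal sketch of (c), again an assumed illustration rather than part of the original answer: when the true relationship is non-linear, a flexible fit (here a degree-5 polynomial) typically beats a straight line out of sample, because its reduction in bias outweighs its added variance.

set.seed(2)
sim <- data.frame(x = runif(200, -2, 2))
sim$y <- sin(2 * sim$x) + rnorm(200, sd = 0.3)  # non-linear truth
idx <- sample(200, 140)

fit_linear <- lm(y ~ x, data = sim[idx, ])           # rigid: high bias, low variance
fit_poly   <- lm(y ~ poly(x, 5), data = sim[idx, ])  # flexible: lower bias, more variance

# Test MSE: the flexible fit should typically come out lower here
mean((predict(fit_linear, sim[-idx, ]) - sim$y[-idx])^2)
mean((predict(fit_poly, sim[-idx, ]) - sim$y[-idx])^2)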

Question 9
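
The code in Questions 9 and 11 assumes the following packages are attached, presumably in a setup chunk earlier in the document (reshape2 and scales are called via `::` and only need to be installed):

library(ISLR)     # College data
library(MASS)     # Boston data (Question 11)
library(leaps)    # regsubsets()
library(glmnet)   # ridge regression and the lasso
library(pls)      # pcr() and plsr()
library(caret)    # dummyVars()
library(dplyr)    # %>% and mutate()
library(ggplot2)  # plots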

# 70/30 train/test split of the College data
set.seed(12)
train_index <- sample(1:nrow(College), round(nrow(College) * 0.7))

train <- College[train_index,]
nrow(train)/nrow(College)
## [1] 0.7001287
test <- College[-train_index,]
nrow(test)/nrow(College)
## [1] 0.2998713
model_1 <- lm(Apps ~ ., data = train)
summary(model_1)
## 
## Call:
## lm(formula = Apps ~ ., data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2856.6  -428.5   -31.1   341.3  6072.2 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  109.57809  458.85840   0.239  0.81135    
## PrivateYes  -669.57143  156.71415  -4.273 2.29e-05 ***
## Accept         1.43962    0.05963  24.143  < 2e-16 ***
## Enroll        -0.43771    0.21037  -2.081  0.03795 *  
## Top10perc     45.96929    6.17104   7.449 3.88e-13 ***
## Top25perc    -12.27879    4.81219  -2.552  0.01100 *  
## F.Undergrad    0.04197    0.03687   1.138  0.25551    
## P.Undergrad    0.04533    0.03371   1.345  0.17929    
## Outstate      -0.06959    0.02147  -3.241  0.00127 ** 
## Room.Board     0.09682    0.05194   1.864  0.06286 .  
## Books         -0.01277    0.28426  -0.045  0.96420    
## Personal      -0.08698    0.06869  -1.266  0.20598    
## PhD           -9.38474    5.20732  -1.802  0.07208 .  
## Terminal      -4.64721    5.55680  -0.836  0.40336    
## S.F.Ratio      5.63601   14.28717   0.394  0.69339    
## perc.alumni   -6.63544    4.58468  -1.447  0.14841    
## Expend         0.10262    0.01789   5.734 1.65e-08 ***
## Grad.Rate      9.68517    3.20262   3.024  0.00262 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 945.6 on 526 degrees of freedom
## Multiple R-squared:  0.929,  Adjusted R-squared:  0.9267 
## F-statistic: 404.9 on 17 and 526 DF,  p-value: < 2.2e-16
ols_pred <- predict(model_1, test)

(ols_mse <- mean((ols_pred - test$Apps)^2))
## [1] 1642531
# Build numeric model matrices for glmnet; fullRank = FALSE keeps both levels
# of Private (hence the Private.No / Private.Yes columns in the lasso output)
train_mat <- dummyVars(Apps ~ ., data = train, fullRank = FALSE) %>%
  predict(newdata = train) %>%
  as.matrix()

test_mat <- dummyVars(Apps ~ ., data = test, fullRank = FALSE) %>%
  predict(newdata = test) %>%
  as.matrix()
set.seed(3)

model_ridge <- cv.glmnet(y = train$Apps, 
                         x = train_mat, 
                         alpha = 0, 
                         lambda = 10^seq(2,-2, length = 100), 
                         standardize = TRUE, 
                         nfolds = 5)

data.frame(lambda = model_ridge$lambda, 
           cv_mse = model_ridge$cvm) %>%
  ggplot(aes(x = lambda, y = cv_mse)) + 
  geom_point() + 
  geom_line() + 
  geom_vline(xintercept = model_ridge$lambda.min, col = "deepskyblue3") +
  geom_hline(yintercept = min(model_ridge$cvm), col = "deepskyblue3") +
  scale_x_continuous(trans = 'log10', breaks = c(0.01, 0.1, 1, 10, 100), labels = c(0.01, 0.1, 1, 10, 100)) + 
  scale_y_continuous(labels = scales::comma_format()) + 
  labs(x = "Lambda", 
       y = "Cross-Validation MSE", 
       title = "Ridge Regression - Lambda Selection (Using 5-Fold Cross-Validation)")

# Refit on the full training set over the same lambda grid
model_ridge_best <- glmnet(y = train$Apps,
                           x = train_mat,
                           alpha = 0, 
                           lambda = 10^seq(2,-2, length = 100))

ridge_pred <- predict(model_ridge_best, s = model_ridge$lambda.min, newx = test_mat)
(ridge_mse <- mean((ridge_pred - test$Apps)^2))
## [1] 1753325
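
For comparison with the lasso coefficients shown later, the ridge coefficients at the selected lambda could be inspected the same way (output not shown here):

predict(model_ridge_best, type = "coefficients", s = model_ridge$lambda.min)
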
set.seed(4)

model_lasso <- cv.glmnet(y = train$Apps, 
                         x = train_mat, 
                         alpha = 1, 
                         lambda = 10^seq(2, -2, length = 100), 
                         standardize = TRUE, 
                         nfolds = 5, 
                         thresh = 1e-12)

data.frame(lambda = model_lasso$lambda, 
           cv_mse = model_lasso$cvm, 
           nonzero_coeff = model_lasso$nzero) %>%
  ggplot(aes(x = lambda, y = cv_mse, col = nonzero_coeff)) + 
  geom_point() + 
  geom_line() + 
  geom_vline(xintercept = model_lasso$lambda.min, col = "deepskyblue3") +
  geom_hline(yintercept = min(model_lasso$cvm), col = "deepskyblue3") +
  scale_x_continuous(trans = 'log10', breaks = c(0.01, 0.1, 1, 10, 100), labels = c(0.01, 0.1, 1, 10, 100)) + 
  scale_y_continuous(labels = scales::comma_format()) + 
  theme(legend.position = "bottom") + 
  scale_color_gradient(low = "red", high = "green") +
  labs(x = "Lambda", 
       y = "Cross-Validation MSE", 
       col = "Non-Zero Coefficients:", 
       title = "Lasso - Lambda Selection (Using 5-Fold Cross-Validation)")

# Refit on the full training set (over a wider lambda grid); predictions use the CV-chosen lambda.min
model_lasso_best <- glmnet(y = train$Apps,
                           x = train_mat,
                           alpha = 1, 
                           lambda = 10^seq(2,-5, length = 100))

lasso_pred <- predict(model_lasso_best, s = model_lasso$lambda.min, newx = test_mat)
(lasso_mse <- mean((lasso_pred - test$Apps)^2))
## [1] 1720941
lasso_coef <- predict(model_lasso_best, type = "coefficients", s = model_lasso$lambda.min)

round(lasso_coef, 3)
## 19 x 1 sparse Matrix of class "dgCMatrix"
##                   s1
## (Intercept) -764.406
## Private.No   645.813
## Private.Yes    .    
## Accept         1.351
## Enroll         .    
## Top10perc     36.721
## Top25perc     -5.248
## F.Undergrad    .    
## P.Undergrad    0.029
## Outstate      -0.048
## Room.Board     0.084
## Books          .    
## Personal      -0.061
## PhD           -6.771
## Terminal      -4.525
## S.F.Ratio      .    
## perc.alumni   -7.335
## Expend         0.092
## Grad.Rate      7.555
set.seed(5)

# PCR on the training set; validation = "CV" uses 10-fold cross-validation by default
model_pcr <- pcr(Apps ~ .,
                 data = train, 
                 scale = TRUE, 
                 validation = "CV")

model_pcr_mse <- MSEP(model_pcr, estimate = "CV")$val %>%
                        reshape2::melt() %>%
                        mutate(M = 0:(nrow(.)-1))
                        

model_pcr_mse
##    estimate response       model      value  M
## 1        CV     Apps (Intercept) 12222541.6  0
## 2        CV     Apps     1 comps 11634225.5  1
## 3        CV     Apps     2 comps  2961400.5  2
## 4        CV     Apps     3 comps  2813186.4  3
## 5        CV     Apps     4 comps  1847789.7  4
## 6        CV     Apps     5 comps  1726480.3  5
## 7        CV     Apps     6 comps  1608343.7  6
## 8        CV     Apps     7 comps  1520679.6  7
## 9        CV     Apps     8 comps  1520036.3  8
## 10       CV     Apps     9 comps  1396939.2  9
## 11       CV     Apps    10 comps  1392357.1 10
## 12       CV     Apps    11 comps  1405071.3 11
## 13       CV     Apps    12 comps  1391098.8 12
## 14       CV     Apps    13 comps  1393771.6 13
## 15       CV     Apps    14 comps  1402508.0 14
## 16       CV     Apps    15 comps  1392142.3 15
## 17       CV     Apps    16 comps   998691.7 16
## 18       CV     Apps    17 comps   978147.2 17
model_pcr_mse <- model_pcr_mse[c("M", "value")]
model_pcr_mse %>%
  mutate(min_CV_MSE = as.numeric(min(value) == value)) %>%
  ggplot(aes(x = M, y = value)) + 
  geom_line(col = "grey55") + 
  geom_point(size = 2, aes(col = factor(min_CV_MSE))) + 
  scale_y_continuous(labels = scales::comma_format()) + 
  scale_color_manual(values = c("deepskyblue3", "green")) + 
  theme(legend.position = "none") + 
  labs(x = "M", 
       y = "Cross-Validation MSE", 
       title = "PCR - M Selection (Using 10-Fold Cross-Validation)")

# The CV error is minimised at M = 17; using all 17 components makes PCR identical to OLS
pcr_pred <- predict(model_pcr, test, ncomp = 17)
(pcr_mse <- mean((pcr_pred - test$Apps)^2))
## [1] 1642531
set.seed(6)

# PLS on the training set, again with 10-fold cross-validation
model_pls <- plsr(Apps ~ .,
                  data = train, 
                  scale = TRUE, 
                  validation = "CV")

model_pls_mse <- MSEP(model_pls, estimate = "CV")$val %>%
  reshape2::melt() %>%
  mutate(M = 0:(nrow(.)-1)) 

model_pls_mse
##    estimate response       model    value  M
## 1        CV     Apps (Intercept) 12222542  0
## 2        CV     Apps     1 comps  2412191  1
## 3        CV     Apps     2 comps  1677574  2
## 4        CV     Apps     3 comps  1352068  3
## 5        CV     Apps     4 comps  1270299  4
## 6        CV     Apps     5 comps  1187455  5
## 7        CV     Apps     6 comps  1054465  6
## 8        CV     Apps     7 comps  1024252  7
## 9        CV     Apps     8 comps  1024301  8
## 10       CV     Apps     9 comps  1023928  9
## 11       CV     Apps    10 comps  1022383 10
## 12       CV     Apps    11 comps  1020038 11
## 13       CV     Apps    12 comps  1021349 12
## 14       CV     Apps    13 comps  1023531 13
## 15       CV     Apps    14 comps  1024888 14
## 16       CV     Apps    15 comps  1023937 15
## 17       CV     Apps    16 comps  1023689 16
## 18       CV     Apps    17 comps  1023647 17
model_pls_mse <- model_pls_mse[c("M", "value")]
model_pls_mse %>%
  mutate(min_CV_MSE = as.numeric(min(value) == value)) %>%
  ggplot(aes(x = M, y = value)) + 
  geom_line(col = "grey55") + 
  geom_point(size = 2, aes(col = factor(min_CV_MSE))) + 
  scale_y_continuous(labels = scales::comma_format()) + 
  scale_color_manual(values = c("deepskyblue3", "green")) + 
  theme(legend.position = "none") + 
  labs(x = "M", 
       y = "Cross-Validation MSE", 
       title = "PLS - M Selection (Using 10-Fold Cross-Validation)")

# The CV error flattens out beyond M = 7 or so (minimum at M = 11); M = 12 performs comparably
pls_pred <- predict(model_pls, test, ncomp = 12)
(pls_mse <- mean((pls_pred - test$Apps)^2))
## [1] 1652446
# Total sum of squares on the test set, for computing test R-squared
SS_tot <- sum((test$Apps - mean(test$Apps))^2)

data.frame(method = c("OLS", "Ridge", "Lasso", "PCR", "PLS"), 
           test_MSE = c(ols_mse, ridge_mse, lasso_mse, pcr_mse, pls_mse), 
           test_R2 = c(1 - sum((test$Apps - ols_pred)^2) / SS_tot,
                       1 - sum((test$Apps - ridge_pred)^2) / SS_tot, 
                       1 - sum((test$Apps - lasso_pred)^2) / SS_tot, 
                       1 - sum((test$Apps - pcr_pred)^2) / SS_tot, 
                       1 - sum((test$Apps - pls_pred)^2) / SS_tot)) %>%
  arrange(test_MSE)
##   method test_MSE   test_R2
## 1    PCR  1642531 0.9233748
## 2    OLS  1642531 0.9233748
## 3    PLS  1652446 0.9229122
## 4  Lasso  1720941 0.9197169
## 5  Ridge  1753325 0.9182061

All five methods predict Apps with similar accuracy (test R-squared is roughly 0.92 in every case). PCR with M = 17 simply reproduces the OLS fit, and neither ridge nor the lasso improves on least squares for this split.

Question 11

# regsubsets objects have no predict() method, so define one: rebuild the model
# matrix from the original formula and multiply by the chosen model's coefficients
predict.regsubsets <- function(object, newdata, id, ...) {
    form <- as.formula(object$call[[2]])
    mat <- model.matrix(form, newdata)
    coefi <- coef(object, id = id)   # coefficients of the model with id variables
    xvars <- names(coefi)
    mat[, xvars] %*% coefi
}
# 10-fold CV over best subset selection: fit on nine folds, record the held-out
# MSE for every model size from 1 to 13
set.seed(121)
k <- 10
folds <- sample(1:k, nrow(Boston), replace = TRUE)
cv.errors <- matrix(NA, k, 13, dimnames = list(NULL, paste(1:13)))
for (j in 1:k) {
    best.fit <- regsubsets(crim ~ ., data = Boston[folds != j, ], nvmax = 13)
    for (i in 1:13) {
        pred <- predict(best.fit, Boston[folds == j, ], id = i)
        cv.errors[j, i] <- mean((Boston$crim[folds == j] - pred)^2)
    }
}
mean.cv.errors <- apply(cv.errors, 2, mean)
plot(1:13, mean.cv.errors, xlab = "Number of variables", ylab = "CV error",
     main = "Best subset selection", pch = 1, type = "b")
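
As a quick check (not in the original code), the model size that minimises the CV error can be read off directly; the value depends on the seed set above.

which.min(mean.cv.errors)
min(mean.cv.errors)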

# Using the lasso to create a sparse model. Note that train and test are
# reassigned here as index vectors for the Boston data.
set.seed(121)
x <- model.matrix(crim ~ ., Boston)[, -1]
y <- Boston$crim
grid <- 10^seq(10, -2, length = 100)
train <- sample(1:nrow(x), round(nrow(x) / 1.3))   # roughly 77/23 train/test split
test <- (-train)
y.test <- y[test]
set.seed(121)
cv.out <- cv.glmnet(x[train, ], y[train], alpha = 1)   # CV to choose lambda
bestlam <- cv.out$lambda.min
lasso.mod <- glmnet(x[train, ], y[train], alpha = 1, lambda = grid)
lasso.pred <- predict(lasso.mod, s = bestlam, newx = x[test, ])
mean((lasso.pred - y.test)^2)
## [1] 31.61342
lasso.coef <- predict(lasso.mod, type = "coefficients", s = bestlam)[1:14, ]   # intercept plus all 13 predictors (1:13 would drop medv)
lasso.coef
##  (Intercept)           zn        indus         chas          nox           rm 
## 11.863977269  0.035551345 -0.069356583 -0.341147083 -7.159328191  0.258906989 
##          age          dis          rad          tax      ptratio        black 
##  0.000000000 -0.756621224  0.510388304  0.000000000 -0.159640151 -0.009241809 
##        lstat 
##  0.145509242
# Using ridge regression.
cv.out <- cv.glmnet(x[train, ], y[train], alpha = 0)
bestlam <- cv.out$lambda.min
ridge.mod <- glmnet(x[train, ], y[train], alpha = 0, lambda = grid, thresh = 1e-12)
ridge.pred <- predict(ridge.mod, s = bestlam, newx = x[test, ])
mean((ridge.pred - y.test)^2)
## [1] 31.87334
ridge.coef <- predict(ridge.mod, type = "coefficients", s = bestlam)[1:14, ]   # intercept plus all 13 predictors
ridge.coef
##  (Intercept)           zn        indus         chas          nox           rm 
##  9.139463700  0.033824862 -0.087751836 -0.530186066 -6.237996789  0.383596934 
##          age          dis          rad          tax      ptratio        black 
##  0.003905644 -0.708321504  0.425606232  0.003344803 -0.125766385 -0.010097779 
##        lstat 
##  0.155705890
# Using PCR.
pcr.fit <- pcr(crim ~ ., data = Boston, subset = train, scale = TRUE, validation = "CV")
validationplot(pcr.fit, val.type = "MSEP")

# M = 8 components chosen from the validation plot above
pcr.pred <- predict(pcr.fit, x[test, ], ncomp = 8)
mean((pcr.pred - y.test)^2)
## [1] 33.74218

(b)

  Comparing test MSEs, the lasso (31.61) does slightly better than ridge regression (31.87) and PCR (33.74), and it also yields a sparser model, so the lasso model is proposed.

(c)

  No. The lasso sets the age and tax coefficients to zero, so the chosen model does not involve all of the features.