2. (a)
(b)
(c)
set.seed(12)
train_index <- sample(1:nrow(College), round(nrow(College)* 0.7))
train <- College[train_index,]
nrow(train)/nrow(College)
## [1] 0.7001287
test <- College[-train_index,]
nrow(test)/nrow(College)
## [1] 0.2998713
model_1 <- lm(Apps ~ . , data = train)
summary(model_1)
##
## Call:
## lm(formula = Apps ~ ., data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2856.6 -428.5 -31.1 341.3 6072.2
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 109.57809 458.85840 0.239 0.81135
## PrivateYes -669.57143 156.71415 -4.273 2.29e-05 ***
## Accept 1.43962 0.05963 24.143 < 2e-16 ***
## Enroll -0.43771 0.21037 -2.081 0.03795 *
## Top10perc 45.96929 6.17104 7.449 3.88e-13 ***
## Top25perc -12.27879 4.81219 -2.552 0.01100 *
## F.Undergrad 0.04197 0.03687 1.138 0.25551
## P.Undergrad 0.04533 0.03371 1.345 0.17929
## Outstate -0.06959 0.02147 -3.241 0.00127 **
## Room.Board 0.09682 0.05194 1.864 0.06286 .
## Books -0.01277 0.28426 -0.045 0.96420
## Personal -0.08698 0.06869 -1.266 0.20598
## PhD -9.38474 5.20732 -1.802 0.07208 .
## Terminal -4.64721 5.55680 -0.836 0.40336
## S.F.Ratio 5.63601 14.28717 0.394 0.69339
## perc.alumni -6.63544 4.58468 -1.447 0.14841
## Expend 0.10262 0.01789 5.734 1.65e-08 ***
## Grad.Rate 9.68517 3.20262 3.024 0.00262 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 945.6 on 526 degrees of freedom
## Multiple R-squared: 0.929, Adjusted R-squared: 0.9267
## F-statistic: 404.9 on 17 and 526 DF, p-value: < 2.2e-16
ols_pred <- predict(model_1,test)
(ols_mse <- mean((ols_pred - test$Apps)^2))
## [1] 1642531
train_mat <- dummyVars(Apps ~ ., data = train, fullRank = F) %>%
predict(newdata = train) %>%
as.matrix()
test_mat <- dummyVars(Apps ~ ., data = test, fullRank = F) %>%
predict(newdata = test) %>%
as.matrix()
set.seed(3)
model_ridge <- cv.glmnet(y = train$Apps,
x = train_mat,
alpha = 0,
lambda = 10^seq(2,-2, length = 100),
standardize = TRUE,
nfolds = 5)
data.frame(lambda = model_ridge$lambda,
cv_mse = model_ridge$cvm) %>%
ggplot(aes(x = lambda, y = cv_mse)) +
geom_point() +
geom_line() +
geom_vline(xintercept = model_ridge$lambda.min, col = "deepskyblue3") +
geom_hline(yintercept = min(model_ridge$cvm), col = "deepskyblue3") +
scale_x_continuous(trans = 'log10', breaks = c(0.01, 0.1, 1, 10, 100), labels = c(0.01, 0.1, 1, 10, 100)) +
scale_y_continuous(labels = scales::comma_format()) +
theme(legend.position = "bottom") +
labs(x = "Lambda",
y = "Cross-Validation MSE",
col = "Non-Zero Coefficients:",
title = "Ridge Regression - Lambda Selection (Using 5-Fold Cross-Validation)")
model_ridge_best <- glmnet(y = train$Apps,
x = train_mat,
alpha = 0,
lambda = 10^seq(2,-2, length = 100))
ridge_pred <- predict(model_ridge_best, s = model_ridge$lambda.min, newx = test_mat)
(ridge_mse <- mean((ridge_pred - test$Apps)^2))
## [1] 1753325
set.seed(4)
model_lasso <- cv.glmnet(y = train$Apps,
x = train_mat,
alpha = 1,
lambda = 10^seq(2, -2, length = 100),
standardize = TRUE,
nfolds = 5,
thresh = 1e-12)
data.frame(lambda = model_lasso$lambda,
cv_mse = model_lasso$cvm,
nonzero_coeff = model_lasso$nzero) %>%
ggplot(aes(x = lambda, y = cv_mse, col = nonzero_coeff)) +
geom_point() +
geom_line() +
geom_vline(xintercept = model_lasso$lambda.min, col = "deepskyblue3") +
geom_hline(yintercept = min(model_lasso$cvm), col = "deepskyblue3") +
scale_x_continuous(trans = 'log10', breaks = c(0.01, 0.1, 1, 10, 100), labels = c(0.01, 0.1, 1, 10, 100)) +
scale_y_continuous(labels = scales::comma_format()) +
theme(legend.position = "bottom") +
scale_color_gradient(low = "red", high = "green") +
labs(x = "Lambda",
y = "Cross-Validation MSE",
col = "Non-Zero Coefficients:",
title = "Lasso - Lambda Selection (Using 5-Fold Cross-Validation)")
model_lasso_best <- glmnet(y = train$Apps,
x = train_mat,
alpha = 1,
lambda = 10^seq(2,-5, length = 100))
lasso_pred <- predict(model_lasso_best, s = model_lasso$lambda.min, newx = test_mat)
(lasso_mse <- mean((lasso_pred - test$Apps)^2))
## [1] 1720941
lasso_coef <- predict(model_lasso_best, type = "coefficients", s = model_lasso$lambda.min)
round(lasso_coef, 3)
## 19 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) -764.406
## Private.No 645.813
## Private.Yes .
## Accept 1.351
## Enroll .
## Top10perc 36.721
## Top25perc -5.248
## F.Undergrad .
## P.Undergrad 0.029
## Outstate -0.048
## Room.Board 0.084
## Books .
## Personal -0.061
## PhD -6.771
## Terminal -4.525
## S.F.Ratio .
## perc.alumni -7.335
## Expend 0.092
## Grad.Rate 7.555
set.seed(5)
model_pcr <- pcr(Apps ~ .,
data = train,
scale = T,
validation = "CV")
model_pcr_mse <- MSEP(model_pcr, estimate = "CV")$val %>%
reshape2::melt() %>%
mutate(M = 0:(nrow(.)-1))
model_pcr_mse
## estimate response model value M
## 1 CV Apps (Intercept) 12222541.6 0
## 2 CV Apps 1 comps 11634225.5 1
## 3 CV Apps 2 comps 2961400.5 2
## 4 CV Apps 3 comps 2813186.4 3
## 5 CV Apps 4 comps 1847789.7 4
## 6 CV Apps 5 comps 1726480.3 5
## 7 CV Apps 6 comps 1608343.7 6
## 8 CV Apps 7 comps 1520679.6 7
## 9 CV Apps 8 comps 1520036.3 8
## 10 CV Apps 9 comps 1396939.2 9
## 11 CV Apps 10 comps 1392357.1 10
## 12 CV Apps 11 comps 1405071.3 11
## 13 CV Apps 12 comps 1391098.8 12
## 14 CV Apps 13 comps 1393771.6 13
## 15 CV Apps 14 comps 1402508.0 14
## 16 CV Apps 15 comps 1392142.3 15
## 17 CV Apps 16 comps 998691.7 16
## 18 CV Apps 17 comps 978147.2 17
model_pcr_mse = model_pcr_mse[c('M','value')]
model_pcr_mse %>%
mutate(min_CV_MSE = as.numeric(min(value) == value)) %>%
ggplot(aes(x = M, y = value)) +
geom_line(col = "grey55") +
geom_point(size = 2, aes(col = factor(min_CV_MSE))) +
scale_y_continuous(labels = scales::comma_format()) +
scale_color_manual(values = c("deepskyblue3", "green")) +
theme(legend.position = "none") +
labs(x = "M",
y = "Cross-Validation MSE",
col = "Non-Zero Coefficients:",
title = "PCR - M Selection (Using 10-Fold Cross-Validation)")
pcr_pred <- predict(model_pcr, test, ncomp = 17)
(pcr_mse <- mean((pcr_pred - test$Apps)^2))
## [1] 1642531
set.seed(6)
model_pls <- plsr(Apps ~ .,
data = train,
scale = T,
validation = "CV")
model_pls_mse <- MSEP(model_pls, estimate = "CV")$val %>%
reshape2::melt() %>%
mutate(M = 0:(nrow(.)-1))
model_pls_mse
## estimate response model value M
## 1 CV Apps (Intercept) 12222542 0
## 2 CV Apps 1 comps 2412191 1
## 3 CV Apps 2 comps 1677574 2
## 4 CV Apps 3 comps 1352068 3
## 5 CV Apps 4 comps 1270299 4
## 6 CV Apps 5 comps 1187455 5
## 7 CV Apps 6 comps 1054465 6
## 8 CV Apps 7 comps 1024252 7
## 9 CV Apps 8 comps 1024301 8
## 10 CV Apps 9 comps 1023928 9
## 11 CV Apps 10 comps 1022383 10
## 12 CV Apps 11 comps 1020038 11
## 13 CV Apps 12 comps 1021349 12
## 14 CV Apps 13 comps 1023531 13
## 15 CV Apps 14 comps 1024888 14
## 16 CV Apps 15 comps 1023937 15
## 17 CV Apps 16 comps 1023689 16
## 18 CV Apps 17 comps 1023647 17
model_pls_mse = model_pls_mse[c('M','value')]
model_pls_mse %>%
mutate(min_CV_MSE = as.numeric(min(value) == value)) %>%
ggplot(aes(x = M, y = value)) +
geom_line(col = "grey55") +
geom_point(size = 2, aes(col = factor(min_CV_MSE))) +
scale_y_continuous(labels = scales::comma_format()) +
scale_color_manual(values = c("deepskyblue3", "green")) +
theme(legend.position = "none") +
labs(x = "M",
y = "Cross-Validation MSE",
title = "PLS - M Selection (Using 10-Fold Cross-Validation)")
pls_pred <- predict(model_pls, test, ncomp = 12)
(pls_mse <- mean((pls_pred - test$Apps)^2))
## [1] 1652446
SS_tot <- sum((test$Apps - mean(test$Apps))^2)
data.frame(method = c("OLS", "Ridge", "Lasso", "PCR", "PLS"),
test_MSE = c(ols_mse, ridge_mse, lasso_mse, pcr_mse, pls_mse),
test_R2 = c(1 - sum((test$Apps - ols_pred)^2) / SS_tot,
1 - sum((test$Apps - ridge_pred)^2) / SS_tot,
1 - sum((test$Apps - lasso_pred)^2) / SS_tot,
1 - sum((test$Apps - pcr_pred)^2) / SS_tot,
1 - sum((test$Apps - pls_pred)^2) / SS_tot)) %>%
arrange(test_MSE)
## method test_MSE test_R2
## 1 PCR 1642531 0.9233748
## 2 OLS 1642531 0.9233748
## 3 PLS 1652446 0.9229122
## 4 Lasso 1720941 0.9197169
## 5 Ridge 1753325 0.9182061
##Question 11
predict.regsubsets <- function(object, newdata, id, ...) {
form = as.formula(object$call[[2]])
mat = model.matrix(form, newdata)
coefi = coef(object, id = id)
xvars = names(coefi)
mat[, xvars] %*% coefi
}
set.seed(121)
k = 10
folds = sample(1:k, nrow(Boston), replace=TRUE)
cv.errors = matrix(NA,k,13, dimnames = list(NULL, paste(1:13)))
for (j in 1:k) {
best.fit = regsubsets(crim ~ ., data = Boston[folds != j, ], nvmax = 13)
for (i in 1:13) {
pred = predict(best.fit, Boston[folds == j, ], id = i)
cv.errors[j, i] = mean((Boston$crim[folds == j] - pred)^2)
}
}
mean.cv.errors = apply(cv.errors, 2, mean)
plot(1:13, mean.cv.errors, xlab = "Number of variables", ylab = "CV error", main= "Best subset selection", pch = 1, type = "b")
# Using Lasso to create a sparse model.
set.seed(121)
x = model.matrix(crim~.,Boston)[,-1]
y = Boston$crim
grid = 10^seq(10,-2,length=100)
train = sample(1:nrow(x), nrow(x)/1.3)
test = (-train)
y.test = y[test]
set.seed(121)
cv.out = cv.glmnet(x[train,], y[train], alpha=1)
bestlam = cv.out$lambda.min
lasso.mod = glmnet(x[train,],y[train],alpha=1,lambda=grid)
lasso.pred = predict(lasso.mod, s=bestlam, newx=x[test,])
mean((lasso.pred-y.test)^2)
## [1] 31.61342
lasso.coef = predict(lasso.mod, type="coefficients", s=bestlam)[1:13,]
lasso.coef
## (Intercept) zn indus chas nox rm
## 11.863977269 0.035551345 -0.069356583 -0.341147083 -7.159328191 0.258906989
## age dis rad tax ptratio black
## 0.000000000 -0.756621224 0.510388304 0.000000000 -0.159640151 -0.009241809
## lstat
## 0.145509242
# Using Ridge Regression.
cv.out = cv.glmnet(x[train,], y[train], alpha=0)
bestlam = cv.out$lambda.min
glm.mod = glmnet(x[train,],y[train],alpha=0,lambda=grid, thresh=1e-12)
glm.pred = predict(glm.mod, s=bestlam, newx=x[test,])
mean((glm.pred-y.test)^2)
## [1] 31.87334
glm.coef = predict(glm.mod, type="coefficients", s=bestlam)[1:13,]
glm.coef
## (Intercept) zn indus chas nox rm
## 9.139463700 0.033824862 -0.087751836 -0.530186066 -6.237996789 0.383596934
## age dis rad tax ptratio black
## 0.003905644 -0.708321504 0.425606232 0.003344803 -0.125766385 -0.010097779
## lstat
## 0.155705890
# Using PCR
pcr.fit = pcr(crim~., data=Boston, subset=train, scale=T, validation="CV")
validationplot(pcr.fit, val.type="MSEP")
pcr.pred = predict(pcr.fit, x[test,], ncomp=8)
mean((pcr.pred-y.test)^2)
## [1] 33.74218
(b) (c)