# i. More flexible and hence will give improved prediction accuracy when its increase in bias is less than its decrease in variance.
# ii. More flexible and hence will give improved prediction accuracy when its increase in variance is less than its decrease in bias.
# iii. Less flexible and hence will give improved prediction accuracy when its increase in bias is less than its decrease in variance.
# iv. Less flexible and hence will give improved prediction accuracy when its increase in variance is less than its decrease in bias.
Option iii is correct.Lasso regression relies on ‘lambda’ parameter to control the factor of shrinkage. So Lasso restricted the size of the regression coefficient which leads to decrease in variance but increase in bias.
Option iii is correct. Like Lasso, Ridge can decrease variance with higher bias as coefficient tends toward 0.
Option ii is correct. All non-linear methods are more flexible. As flexibility increases, variance generally tends to increase and bias generally decreases.
library(ISLR)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0 ✔ purrr 1.0.1
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.5.0
## ✔ readr 2.1.3 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
set.seed(1)
tr_ind = sample(nrow(College), 0.7*nrow(College), replace = F)
College.train = College[tr_ind,]
College.test = College[-tr_ind,]
lm.fit = lm(Apps~., data=College.train)
lm.pred = predict(lm.fit, College.test, type="response")
lm.mean = mean((lm.pred-College.test$Apps)^2)
lm.mean
## [1] 1261630
Test error for linear model is 1,261,630
library(glmnet)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## Loaded glmnet 4.1-6
#Set up matrices needed for ridge + Lasso regression
train.mat = model.matrix(Apps~., data = College.train)
test.mat = model.matrix(Apps~., data = College.test)
#Set x and y value:
x=model.matrix(Apps~.,College)[,-1]
y=College$Apps
grid=10^seq(10,-2,length=100)
#Choose best lambda
set.seed(1)
cv.out = cv.glmnet(train.mat,College.train$Apps,alpha=0)
bestlam=cv.out$lambda.min
bestlam
## [1] 367.5286
#Fit a ridge regression
ridge.mod = glmnet(train.mat,College.train$Apps,alpha = 0, lambda = grid)
#Make predictions
ridge.pred = predict(ridge.mod,s=bestlam,newx = test.mat)
#Calculate test error
ridge.mean = mean((ridge.pred - College.test$Apps)^2)
ridge.mean
## [1] 1120171
Test error is 1,120,171. This is lower than linear model
#Choose best lambda
set.seed(1)
cv.out2 = cv.glmnet(train.mat,College.train$Apps,alpha=1, lambda = grid)
bestlam2=cv.out2$lambda.min
bestlam2
## [1] 0.01
#Fit a Lasso model
lasso.mod = glmnet(train.mat, College.train$Apps, alpha = 1)
lasso.pred = predict(lasso.mod, s=bestlam2, newx = test.mat)
lasso.mean = mean((lasso.pred-College.test$Apps)^2)
lasso.mean
## [1] 1254408
#Number of coefficient
out=glmnet(x, y, alpha=1, lambda = grid)
lasso.coef=predict(out,type="coefficients",s=bestlam)[1:18,]
lasso.coef[lasso.coef!=0]
## (Intercept) Accept Top10perc Expend
## -1.880992e+02 1.314216e+00 1.695380e+01 7.181022e-03
Test error obtained is 1,254,408. This is higher than both linear and ridge regression.
library(pls)
##
## Attaching package: 'pls'
## The following object is masked from 'package:stats':
##
## loadings
set.seed(1)
#Fit and determine M based on CV results
pcr.fit = pcr(Apps~., data = College.train, scale = TRUE, validation = "CV")
summary(pcr.fit)
## Data: X dimension: 543 17
## Y dimension: 543 1
## Fit method: svdpc
## Number of components considered: 17
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 3895 3813 2129 2139 1798 1715 1709
## adjCV 3895 3814 2125 2136 1785 1703 1703
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 1711 1656 1620 1615 1621 1621 1618
## adjCV 1705 1645 1614 1608 1615 1615 1612
## 14 comps 15 comps 16 comps 17 comps
## CV 1618 1546 1177 1138
## adjCV 1612 1529 1168 1130
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps 8 comps
## X 32.051 57.00 64.42 70.27 75.65 80.65 84.26 87.61
## Apps 5.788 71.69 71.70 80.97 82.60 82.60 82.69 84.06
## 9 comps 10 comps 11 comps 12 comps 13 comps 14 comps 15 comps
## X 90.58 92.84 94.93 96.74 97.82 98.72 99.39
## Apps 84.55 84.82 84.86 84.86 85.01 85.05 89.81
## 16 comps 17 comps
## X 99.85 100.00
## Apps 93.03 93.32
validationplot(pcr.fit,val.type = "MSEP")
Lowest MSEP occur when M = 10. The test error obtained below, which is 1,837,203. The PCR model has the highest test error out of all so far and proves to be underperformed
#Make predictions using chosen M
pcr.pred=predict(pcr.fit, College.test, ncomp=10)
pcr.mean=mean((pcr.pred-College.test$Apps)^2)
pcr.mean
## [1] 1837203
#Fit and determine M based on CV results
set.seed(1)
pls.fit = plsr(Apps~., data = College.train, scale = TRUE, validation = "CV")
summary(pls.fit)
## Data: X dimension: 543 17
## Y dimension: 543 1
## Fit method: kernelpls
## Number of components considered: 17
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 3895 1952 1743 1540 1451 1258 1171
## adjCV 3895 1947 1737 1532 1430 1231 1161
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 1153 1147 1144 1141 1142 1141 1139
## adjCV 1145 1139 1136 1133 1134 1133 1131
## 14 comps 15 comps 16 comps 17 comps
## CV 1138 1138 1138 1138
## adjCV 1130 1130 1130 1130
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps 8 comps
## X 25.68 47.43 62.46 64.88 67.34 72.68 77.20 80.92
## Apps 76.62 82.39 86.93 90.76 92.82 93.05 93.13 93.20
## 9 comps 10 comps 11 comps 12 comps 13 comps 14 comps 15 comps
## X 82.69 85.16 87.35 90.73 92.49 95.10 97.09
## Apps 93.26 93.28 93.30 93.31 93.32 93.32 93.32
## 16 comps 17 comps
## X 98.40 100.00
## Apps 93.32 93.32
validationplot(pls.fit,val.type = "MSEP")
M=11 appears to be the best since it has low adjCV (1132) while quite high variance explained (87.35).
#Make prediction using M = 10
pls.pred=predict(pls.fit, College.test, ncomp=11)
pls.mean=mean((pls.pred-College.test$Apps)^2)
pls.mean
## [1] 1256121
Test error is 1,256,121 (which is almost similar to ridge and lasso regression)
test.error <- c(lm.mean, ridge.mean, lasso.mean, pcr.mean, pls.mean)
barplot(test.error,
names.arg = c("linear","ridge","lasso","PCR","PLS"),
xlab = "Models", ylab = "Test Error")
We can see that ridge regression performs the best based on its test error. There’s not much difference between linear, lasso, and PLS’ test error. PCR performs significantly worse than all other models. To say how accurately we can predict number of applications received, we can compute R-square value based on ridge regression. Based on the result below, we can explain 92.31% of the variance of ‘Apps’ using ridge regression. This is high R-square
TotalSumOfSquares = sum((mean(College.test$Apps)- College.test$Apps)^2)
TotalSumOfResiduals = sum((ridge.pred-College.test$Apps)^2)
1 - ((TotalSumOfResiduals)/(TotalSumOfSquares))
## [1] 0.9231507
# Split the data into training and testing
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
set.seed(1)
tr_ind = sample(nrow(Boston), 0.8*nrow(Boston), replace = F)
Boston.train = Boston[tr_ind,]
Boston.test = Boston[tr_ind,]
Best Subset Selection
library(leaps) #for best subset selection (regsubsets)
#Perform Best Subset Selection for all variables
regfit.full=regsubsets(crim~.,data=Boston,nvmax=14)
reg.summary=summary(regfit.full)
summary(regfit.full)
## Subset selection object
## Call: regsubsets.formula(crim ~ ., data = Boston, nvmax = 14)
## 13 Variables (and intercept)
## Forced in Forced out
## zn FALSE FALSE
## indus FALSE FALSE
## chas FALSE FALSE
## nox FALSE FALSE
## rm FALSE FALSE
## age FALSE FALSE
## dis FALSE FALSE
## rad FALSE FALSE
## tax FALSE FALSE
## ptratio FALSE FALSE
## black FALSE FALSE
## lstat FALSE FALSE
## medv FALSE FALSE
## 1 subsets of each size up to 13
## Selection Algorithm: exhaustive
## zn indus chas nox rm age dis rad tax ptratio black lstat medv
## 1 ( 1 ) " " " " " " " " " " " " " " "*" " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " " " " " "*" " " " " " " "*" " "
## 3 ( 1 ) " " " " " " " " " " " " " " "*" " " " " "*" "*" " "
## 4 ( 1 ) "*" " " " " " " " " " " "*" "*" " " " " " " " " "*"
## 5 ( 1 ) "*" " " " " " " " " " " "*" "*" " " " " "*" " " "*"
## 6 ( 1 ) "*" " " " " "*" " " " " "*" "*" " " " " "*" " " "*"
## 7 ( 1 ) "*" " " " " "*" " " " " "*" "*" " " "*" "*" " " "*"
## 8 ( 1 ) "*" " " " " "*" " " " " "*" "*" " " "*" "*" "*" "*"
## 9 ( 1 ) "*" "*" " " "*" " " " " "*" "*" " " "*" "*" "*" "*"
## 10 ( 1 ) "*" "*" " " "*" "*" " " "*" "*" " " "*" "*" "*" "*"
## 11 ( 1 ) "*" "*" " " "*" "*" " " "*" "*" "*" "*" "*" "*" "*"
## 12 ( 1 ) "*" "*" "*" "*" "*" " " "*" "*" "*" "*" "*" "*" "*"
## 13 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"
#Make plots of performance statistics to determine best subset
par(mfrow=c(2,2))
plot(reg.summary$rss,xlab="Number of Variables",ylab="RSS",type="l")
plot(reg.summary$adjr2,xlab="Number of Variables",ylab="Adjusted RSq",type="l")
p = which.max(reg.summary$adjr2)
points(p,reg.summary$adjr2[p], col="red",cex=2,pch=20)
plot(reg.summary$cp,xlab="Number of Variables",ylab="Cp",type='l')
p = which.min(reg.summary$cp)
points(p,reg.summary$cp[p],col="red",cex=2,pch=20)
plot(reg.summary$bic,xlab="Number of Variables",ylab="BIC",type='l')
p = which.min(reg.summary$bic)
points(p,reg.summary$bic[p],col="red",cex=2,pch=20)
p=3, 8, and 9 appears to be the best.
#Regular model test error
mean((mean(Boston.test$crim)-Boston.test$crim)^2)
## [1] 64.67552
#Model with subset 3
lm.fit = lm(crim~rad+black+lstat, data=Boston.train)
lm.pred = predict(lm.fit, Boston.test, type="response")
mean((lm.pred-Boston.test$crim)^2)
## [1] 34.97689
#Model with subset 8
lm.fit = lm(crim~ zn+nox+dis+rad+black+lstat+medv, data=Boston.train)
lm.pred = predict(lm.fit, Boston.test, type="response")
mean((lm.pred-Boston.test$crim)^2)
## [1] 34.07552
#Model with subset 9
lm.fit = lm(crim~zn+indus+nox+dis+rad+black+lstat+medv, data=Boston.train)
lm.pred = predict(lm.fit, Boston.test, type="response")
mean((lm.pred-Boston.test$crim)^2)
## [1] 33.89677
The MSE of model 9 is the best. There’s a huge decrease of test error when comparing between null model and model 3. However, it only decrease slightly with 0.9 from model 3 to 8, and 0.2 from model 8 to 9 even after adding 5 extra variables. Therefore, for simplicity and ease of interpretation, I would choose model 3 with test error of 34.98
Lasso
#Set up matrices needed for the glmnet functions
train.mat = model.matrix(crim~., data = Boston.train)
test.mat = model.matrix(crim~., data = Boston.test)
grid=10^seq(10,-2,length=100)
#Choose best lambda
set.seed(1)
cv.out = cv.glmnet(train.mat,Boston.train$crim, alpha=1, lambda = grid)
bestlam=cv.out$lambda.min
bestlam
## [1] 0.05336699
#Fit a Lasso model
lasso.mod = glmnet(train.mat, Boston.train$crim, alpha = 1)
lasso.pred = predict(lasso.mod, s=bestlam2, newx = test.mat)
lasso.mean = mean((lasso.pred-Boston.train$crim)^2)
lasso.mean
## [1] 33.66338
#Variables in Lasso model
lasso.coef=predict(lasso.mod,type="coefficients",s=bestlam)[1:15,]
lasso.coef[lasso.coef!=0]
## (Intercept) zn indus chas nox
## 14.0132745484 0.0267085606 -0.0663932119 -0.3798334367 -5.6023355247
## age dis rad ptratio black
## 0.0007350302 -0.5984041093 0.4621042366 -0.1852877996 -0.0137727996
## lstat medv
## 0.1259036754 -0.1086474749
Lasso test error is 33.66, which is smaller than both null model and best subset selection
Ridge Regression
#Choose lambda using cross-validation
set.seed(1)
cv.out = cv.glmnet(train.mat,Boston.train$crim,alpha=0, labmda=grid)
bestlam = cv.out$lambda.min
bestlam
## [1] 0.5094548
#Fit a ridge regression
ridge.mod = glmnet(train.mat,Boston.train$crim,alpha = 0)
ridge.pred = predict(ridge.mod,s=bestlam,newx = test.mat)
mean((ridge.pred - Boston.test$crim)^2)
## [1] 33.97892
Test error of ridge regression is 33.98, only slightly higher than Lasso.
From 3 models from part (a), I would choose linear model with Best Subset of 3 predictors. Both the Lasso and Ridge Regression has a lower test error than Best Subset. However, they contain more variables (12 to 13 predictors) when compared to 3 predictors. This does not justify the addition of so many additional predictors when test error is only slightly reduce
No it does not. Best Subset only have 3 variables out of 14. However, because of this, we can reduce variance while still have good accuracy and low test error