iii: as \(\lambda\) increases, variance decreases and bias increases, resulting in a U-shaped test MSE curve
iii: similar to the lasso, as \(\lambda\) increases, variance decreases and bias increases, resulting in a U-shaped test MSE curve (illustrated in the sketch below)
ii: non-linear methods are always more flexible, since they are not constrained to a prescribed functional form.
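To visualize the U-shape in the answers above, here is a minimal sketch on simulated data (not the assignment data; the dimensions and coefficients are arbitrary) showing how cross-validated ridge error first falls and then rises as \(\lambda\) grows:
library(glmnet)
set.seed(1)
n <- 100; p <- 50
X <- matrix(rnorm(n * p), n, p)                 # arbitrary simulated predictors
beta <- c(rep(1, 10), rep(0, p - 10))           # only 10 truly non-zero coefficients
yy <- drop(X %*% beta + rnorm(n))
cv.ridge <- cv.glmnet(X, yy, alpha = 0)         # 10-fold CV over glmnet's lambda grid
plot(cv.ridge)                                  # CV MSE vs log(lambda): variance dominates at small lambda, bias at large lambda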
College data set
library(ISLR)
str(College)
## 'data.frame': 777 obs. of 18 variables:
## $ Private : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 2 ...
## $ Apps : num 1660 2186 1428 417 193 ...
## $ Accept : num 1232 1924 1097 349 146 ...
## $ Enroll : num 721 512 336 137 55 158 103 489 227 172 ...
## $ Top10perc : num 23 16 22 60 16 38 17 37 30 21 ...
## $ Top25perc : num 52 29 50 89 44 62 45 68 63 44 ...
## $ F.Undergrad: num 2885 2683 1036 510 249 ...
## $ P.Undergrad: num 537 1227 99 63 869 ...
## $ Outstate : num 7440 12280 11250 12960 7560 ...
## $ Room.Board : num 3300 6450 3750 5450 4120 ...
## $ Books : num 450 750 400 450 800 500 500 450 300 660 ...
## $ Personal : num 2200 1500 1165 875 1500 ...
## $ PhD : num 70 29 53 92 76 67 90 89 79 40 ...
## $ Terminal : num 78 30 66 97 72 73 93 100 84 41 ...
## $ S.F.Ratio : num 18.1 12.2 12.9 7.7 11.9 9.4 11.5 13.7 11.3 11.5 ...
## $ perc.alumni: num 12 16 30 37 2 11 26 37 23 15 ...
## $ Expend : num 7041 10527 8735 19016 10922 ...
## $ Grad.Rate : num 60 56 54 59 15 55 63 73 80 52 ...
attach(College)
set.seed(1)
train <- sample(c(TRUE, FALSE), nrow(College), replace = TRUE)
table(train)
## train
## FALSE TRUE
## 384 393
test <- (!train)
apps.fit <- lm(Apps~.,data=College[train,])
summary(apps.fit)
##
## Call:
## lm(formula = Apps ~ ., data = College[train, ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -3898.0 -467.2 -8.1 377.0 6742.1
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -135.78186 601.32767 -0.226 0.82148
## PrivateYes -664.07860 224.58778 -2.957 0.00330 **
## Accept 1.68180 0.05371 31.313 < 2e-16 ***
## Enroll -0.86870 0.27062 -3.210 0.00144 **
## Top10perc 59.35451 8.85627 6.702 7.56e-11 ***
## Top25perc -22.24798 6.97394 -3.190 0.00154 **
## F.Undergrad 0.03452 0.05100 0.677 0.49893
## P.Undergrad 0.03911 0.04566 0.857 0.39225
## Outstate -0.08693 0.02889 -3.009 0.00280 **
## Room.Board 0.11208 0.07614 1.472 0.14185
## Books 0.07767 0.38938 0.199 0.84199
## Personal -0.02680 0.09364 -0.286 0.77488
## PhD -9.41831 6.87963 -1.369 0.17181
## Terminal -6.46298 7.51936 -0.860 0.39061
## S.F.Ratio 21.14715 18.89609 1.119 0.26380
## perc.alumni 9.76448 6.20070 1.575 0.11616
## Expend 0.08358 0.01780 4.695 3.74e-06 ***
## Grad.Rate 9.91812 4.27608 2.319 0.02091 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1130 on 375 degrees of freedom
## Multiple R-squared: 0.9302, Adjusted R-squared: 0.927
## F-statistic: 293.9 on 17 and 375 DF, p-value: < 2.2e-16
apps.pred <- predict(apps.fit, newdata = College[test,])
mean((apps.pred-College[test,]$Apps)^2)
## [1] 984743.1
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-6
x <- model.matrix(Apps~.,College[,-2])
y <- College$Apps
ridge.mod <- cv.glmnet(x[train,], y[train], alpha = 0)
ridge.bestlam <- ridge.mod$lambda.min
y.test <- y[test]
ridge.pred <- predict(ridge.mod, s=ridge.bestlam, newx = x[test,])
mean((ridge.pred-y.test)^2)
## [1] 940970.9
lasso.mod <- cv.glmnet(x[train,], y[train], alpha = 1)
lasso.bestlam <- lasso.mod$lambda.min
y.test <- y[test]
lasso.pred <- predict(lasso.mod, s=lasso.bestlam, newx = x[test,])
mean((lasso.pred-y.test)^2)
## [1] 979071.2
grid <- 10^seq(10,-2,length=100)
out <- glmnet(x,y,alpha=1, lambda = grid)
lasso.coef <- predict(out, type='coefficients', s=lasso.bestlam)[1:19,]
lasso.coef
## (Intercept) (Intercept) PrivateYes Accept Enroll
## -471.62236610 0.00000000 -491.05286308 1.56921353 -0.75280819
## Top10perc Top25perc F.Undergrad P.Undergrad Outstate
## 48.00777736 -12.75431948 0.04085270 0.04400650 -0.08302818
## Room.Board Books Personal PhD Terminal
## 0.14928961 0.01464639 0.02882039 -8.38320965 -3.25388115
## S.F.Ratio perc.alumni Expend Grad.Rate
## 14.46488381 -0.04955865 0.07705012 8.25692656
sum(lasso.coef!=0)
## [1] 18
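One quirk worth flagging: (Intercept) appears twice above, and sum(lasso.coef != 0) counts the fitted intercept, because model.matrix() keeps its own intercept column while glmnet() adds another. A sketch of the more common idiom (dropping model.matrix()'s intercept column and counting only predictor coefficients); this was not used for the results above:
x2 <- model.matrix(Apps ~ ., College)[, -1]      # drop the intercept column
lasso.mod2 <- cv.glmnet(x2[train, ], y[train], alpha = 1)
lasso.coef2 <- predict(lasso.mod2, type = "coefficients", s = lasso.mod2$lambda.min)
sum(lasso.coef2[-1, ] != 0)                      # non-zero predictor coefficients only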
library(pls)
##
## Attaching package: 'pls'
## The following object is masked from 'package:stats':
##
## loadings
train <- sample(1:nrow(x), nrow(x)/2)
test <- (-train)
y.test <- y[test]
pcr.fit <- pcr(Apps~., data=College, subset=train, scale=TRUE, validation='CV')
validationplot(pcr.fit,val.type="MSEP")
pcr.pred <- predict(pcr.fit, College[test,], ncomp=5)
mean((pcr.pred - College$Apps[test])^2)
## [1] 3198543
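Fixing ncomp = 5 is an arbitrary cut-off; the cross-validation results already stored in pcr.fit can pick it instead. A sketch, assuming the pcr.fit object above (MSEP() is the pls accessor for the CV error curve):
cv.msep <- MSEP(pcr.fit, estimate = "CV")
best.ncomp <- which.min(drop(cv.msep$val)) - 1   # first entry is the 0-component (intercept-only) model
best.ncomp
pcr.pred2 <- predict(pcr.fit, College[test, ], ncomp = best.ncomp)
mean((pcr.pred2 - College$Apps[test])^2)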
pls.fit <- plsr(Apps~., data=College, subset=train, scale=TRUE, validation='CV')
summary(pls.fit)
## Data: X dimension: 388 17
## Y dimension: 388 1
## Fit method: kernelpls
## Number of components considered: 17
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 3299 1439 1278 1083 1065 993.5 981.1
## adjCV 3299 1437 1277 1080 1052 982.7 975.0
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 967.0 966.0 966.0 965.9 968.8 970.1 970.8
## adjCV 962.3 961.5 961.6 961.6 964.3 965.4 966.0
## 14 comps 15 comps 16 comps 17 comps
## CV 970.6 970.5 970.6 970.6
## adjCV 965.9 965.8 965.8 965.8
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps 8 comps
## X 26.38 52.08 63.49 65.99 68.78 74.27 77.21 80.78
## Apps 81.63 85.94 90.13 91.60 92.59 92.74 92.79 92.79
## 9 comps 10 comps 11 comps 12 comps 13 comps 14 comps 15 comps
## X 83.85 86.41 88.46 91.14 93.63 96.48 98.11
## Apps 92.79 92.79 92.80 92.80 92.80 92.80 92.80
## 16 comps 17 comps
## X 99.04 100.0
## Apps 92.80 92.8
pls.pred <- predict(pls.fit, College[test,], ncomp=5)
mean((pls.pred - College$Apps[test])^2)
## [1] 1755071
Ranked from lowest to highest test mean squared error, the models are: 1. ridge - 940970.9; 2. lasso - 979071.2; 3. least squares - 984743.1; 4. PLS - 1755071; 5. PCR - 3198543. Ridge, lasso, and least squares perform very similarly, while PLS and PCR show much larger errors; note, though, that PCR and PLS were fit and evaluated on a different random train/test split and with only 5 components, so their errors are not directly comparable to the first three.
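As a small bookkeeping sketch, the errors reported above can be collected into one named vector (ideally each MSE would be stored as it is computed rather than re-typed here):
mse <- c(ridge = 940970.9, lasso = 979071.2, least.squares = 984743.1,
         pls = 1755071, pcr = 3198543)
sort(mse)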
detach(College)
Boston data set
library(ISLR2)
##
## Attaching package: 'ISLR2'
## The following objects are masked from 'package:ISLR':
##
## Auto, Credit
str(Boston)
## 'data.frame': 506 obs. of 13 variables:
## $ crim : num 0.00632 0.02731 0.02729 0.03237 0.06905 ...
## $ zn : num 18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
## $ indus : num 2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
## $ chas : int 0 0 0 0 0 0 0 0 0 0 ...
## $ nox : num 0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
## $ rm : num 6.58 6.42 7.18 7 7.15 ...
## $ age : num 65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
## $ dis : num 4.09 4.97 4.97 6.06 6.06 ...
## $ rad : int 1 2 2 3 3 3 5 5 5 5 ...
## $ tax : num 296 242 242 222 222 222 311 311 311 311 ...
## $ ptratio: num 15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
## $ lstat : num 4.98 9.14 4.03 2.94 5.33 ...
## $ medv : num 24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
attach(Boston)
library(leaps)
regfit.full <- regsubsets(crim~.,data=Boston, nvmax = 13)
summary(regfit.full)
## Subset selection object
## Call: regsubsets.formula(crim ~ ., data = Boston, nvmax = 13)
## 12 Variables (and intercept)
## Forced in Forced out
## zn FALSE FALSE
## indus FALSE FALSE
## chas FALSE FALSE
## nox FALSE FALSE
## rm FALSE FALSE
## age FALSE FALSE
## dis FALSE FALSE
## rad FALSE FALSE
## tax FALSE FALSE
## ptratio FALSE FALSE
## lstat FALSE FALSE
## medv FALSE FALSE
## 1 subsets of each size up to 12
## Selection Algorithm: exhaustive
## zn indus chas nox rm age dis rad tax ptratio lstat medv
## 1 ( 1 ) " " " " " " " " " " " " " " "*" " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " " " " " "*" " " " " "*" " "
## 3 ( 1 ) " " " " " " " " " " " " " " "*" " " " " "*" "*"
## 4 ( 1 ) "*" " " " " " " " " " " "*" "*" " " " " " " "*"
## 5 ( 1 ) "*" "*" " " " " " " " " "*" "*" " " " " " " "*"
## 6 ( 1 ) "*" " " " " "*" " " " " "*" "*" " " "*" " " "*"
## 7 ( 1 ) "*" " " " " "*" " " " " "*" "*" " " "*" "*" "*"
## 8 ( 1 ) "*" "*" " " "*" " " " " "*" "*" " " "*" "*" "*"
## 9 ( 1 ) "*" "*" " " "*" "*" " " "*" "*" " " "*" "*" "*"
## 10 ( 1 ) "*" " " "*" "*" "*" " " "*" "*" "*" "*" "*" "*"
## 11 ( 1 ) "*" "*" "*" "*" "*" " " "*" "*" "*" "*" "*" "*"
## 12 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"
reg.summary <- summary(regfit.full)
par(mfrow=c(2,2))
plot(reg.summary$rss, xlab="Number of Variables", ylab="RSS", type="l")
plot(reg.summary$adjr2, xlab="Number of Variables", ylab="Adjusted RSq", type="l")
which.max(reg.summary$adjr2)
## [1] 9
points(9, reg.summary$adjr2[9], col="red", cex=2, pch=20)
plot(reg.summary$cp, xlab="Number of Variables", ylab="Cp", type="l")
which.min(reg.summary$cp)
## [1] 7
points(7, reg.summary$cp[7], col="red", cex=2, pch=20)
which.min(reg.summary$bic)
## [1] 2
plot(reg.summary$bic, xlab="Number of Variables", ylab="BIC", type="l")
points(2, reg.summary$bic[2], col="red", cex=2, pch=20)
coef(regfit.full,2)
## (Intercept) rad lstat
## -4.3814053 0.5228128 0.2372846
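The Cp, adjusted R-squared, and BIC criteria above point to 7, 9, and 2 variables respectively. One way to arbitrate is to estimate test error directly for each subset size. A sketch of the usual validation-set approach (leaps provides no predict() method for regsubsets, so a small helper is written by hand):
predict.regsubsets <- function(object, newdata, id, ...) {
  form <- as.formula(object$call[[2]])           # formula used in the regsubsets() call
  mat <- model.matrix(form, newdata)
  coefi <- coef(object, id = id)
  mat[, names(coefi)] %*% coefi
}
set.seed(1)
train.bs <- sample(c(TRUE, FALSE), nrow(Boston), replace = TRUE)
regfit.train <- regsubsets(crim ~ ., data = Boston[train.bs, ], nvmax = 12)
val.errors <- sapply(1:12, function(i) {
  pred <- predict(regfit.train, Boston[!train.bs, ], id = i)
  mean((Boston$crim[!train.bs] - pred)^2)
})
which.min(val.errors)                            # subset size with the lowest validation MSE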
set.seed(1)
train <- sample(c(TRUE, FALSE), nrow(Boston), replace = TRUE)
test <- (!train)
library(glmnet)
x <- model.matrix(crim~.,Boston[,-1])
y <- Boston$crim
ridge.mod <- cv.glmnet(x[train,], y[train], alpha = 0)
ridge.bestlam <- ridge.mod$lambda.min
y.test <- y[test]
ridge.pred <- predict(ridge.mod, s=ridge.bestlam, newx = x[test,])
mean((ridge.pred-y.test)^2)
## [1] 49.91218
lasso.mod <- cv.glmnet(x[train,], y[train], alpha = 1)
lasso.bestlam <- lasso.mod$lambda.min
y.test <- y[test]
lasso.pred <- predict(lasso.mod, s=lasso.bestlam, newx = x[test,])
mean((lasso.pred-y.test)^2)
## [1] 49.33339
library(pls)
train <- sample(1:nrow(x), nrow(x)/2)
test <- (-train)
y.test <- y[test]
pcr.fit <- pcr(crim~., data=Boston, subset=train, scale=TRUE, validation='CV')
validationplot(pcr.fit,val.type="MSEP")
pcr.pred <- predict(pcr.fit, Boston[test,], ncomp=5)
mean((pcr.pred - y.test)^2)
## [1] 23.70325
The test MSEs for these models are: ridge = 49.91218, lasso = 49.33339, and PCR = 23.70325. Ridge and lasso were evaluated on the same split and give very similar errors; the PCR estimate comes from a different random split (and uses only 5 components), so it is not directly comparable. Since the lasso had the lower test MSE of the two comparable models, I chose it and looked at its coefficients.
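Because the PCR error above comes from a different random split, a fairer check would evaluate ridge, lasso, and PCR on one common split. A sketch (the seed and split here are arbitrary, chosen only for illustration):
set.seed(2)                                      # arbitrary seed for this sketch
tr <- sample(1:nrow(Boston), nrow(Boston) / 2)
xb <- model.matrix(crim ~ ., Boston)[, -1]
yb <- Boston$crim
ridge.cv <- cv.glmnet(xb[tr, ], yb[tr], alpha = 0)
lasso.cv <- cv.glmnet(xb[tr, ], yb[tr], alpha = 1)
pcr.cv <- pcr(crim ~ ., data = Boston[tr, ], scale = TRUE, validation = "CV")
c(ridge = mean((predict(ridge.cv, s = "lambda.min", newx = xb[-tr, ]) - yb[-tr])^2),
  lasso = mean((predict(lasso.cv, s = "lambda.min", newx = xb[-tr, ]) - yb[-tr])^2),
  pcr   = mean((predict(pcr.cv, Boston[-tr, ], ncomp = 5) - yb[-tr])^2))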
grid <- 10^seq(10,-2,length=100)
out <- glmnet(x,y,alpha=1, lambda = grid)
lasso.coef <- predict(out, type='coefficients', s=lasso.bestlam)[1:14,]
lasso.coef
## (Intercept) (Intercept) zn indus chas
## 10.8027449546 0.0000000000 0.0396497555 -0.0664644843 -0.7084897170
## nox rm age dis rad
## -7.9085628271 0.4851336975 0.0000000000 -0.8700775394 0.5556591795
## tax ptratio lstat medv
## -0.0006894823 -0.2509050533 0.1360443929 -0.1919947820
With these coefficients, the fitted model is:
crim = 10.8027 + 0.0396 zn - 0.0665 indus - 0.7085 chas - 7.9086 nox + 0.4851 rm - 0.8701 dis + 0.5557 rad - 0.0007 tax - 0.2509 ptratio + 0.1360 lstat - 0.1920 medv
The age variable has a coefficient of exactly 0 at the chosen \(\lambda\), so it drops out of the model.
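The equation above can also be read off programmatically from the lasso.coef vector already computed, e.g.:
round(lasso.coef[lasso.coef != 0], 4)            # non-zero coefficients only, rounded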
detach(Boston)