library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.1.3
## -- Attaching packages --------------------------------------- tidyverse 1.3.2 --
## v ggplot2 3.3.6 v purrr 0.3.4
## v tibble 3.1.8 v dplyr 1.0.7
## v tidyr 1.1.4 v stringr 1.4.0
## v readr 2.1.2 v forcats 0.5.1
## Warning: package 'ggplot2' was built under R version 4.1.3
## Warning: package 'tibble' was built under R version 4.1.3
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(ISLR2)
## Warning: package 'ISLR2' was built under R version 4.1.3
library(caTools)
## Warning: package 'caTools' was built under R version 4.1.3
library(caret)
## Warning: package 'caret' was built under R version 4.1.3
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
College <- College
attach(College)
install.packages("pls", repos = "http://cran.us.r-project.org")
## Installing package into 'C:/Users/Winni/Documents/R/win-library/4.1'
## (as 'lib' is unspecified)
## package 'pls' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\Winni\AppData\Local\Temp\Rtmpc3WXEw\downloaded_packages
9.In this exercise, we will predict the number of applications received using the other variables in the College data set. (a) Split the data set into a training set and a test set.
set.seed(1)
split <- sample.split(College, SplitRatio = 0.7)
Train <- subset(College, split==TRUE)
Test<- subset(College, split==FALSE)
linear.model <- lm(Apps~.,data=Train)
summary(linear.model)
##
## Call:
## lm(formula = Apps ~ ., data = Train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5378.4 -456.3 -29.2 340.4 7420.6
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -241.60620 512.88001 -0.471 0.637790
## PrivateYes -442.83942 165.07608 -2.683 0.007546 **
## Accept 1.66448 0.04603 36.161 < 2e-16 ***
## Enroll -1.17188 0.23091 -5.075 5.47e-07 ***
## Top10perc 53.32115 6.95449 7.667 9.21e-14 ***
## Top25perc -13.99816 5.55282 -2.521 0.012015 *
## F.Undergrad 0.08075 0.03808 2.120 0.034457 *
## P.Undergrad 0.08245 0.03621 2.277 0.023209 *
## Outstate -0.08299 0.02266 -3.663 0.000276 ***
## Room.Board 0.11950 0.05872 2.035 0.042346 *
## Books 0.04427 0.29187 0.152 0.879502
## Personal -0.00386 0.07729 -0.050 0.960190
## PhD -9.11216 5.58831 -1.631 0.103609
## Terminal -3.89153 6.14692 -0.633 0.526967
## S.F.Ratio 15.28781 17.52458 0.872 0.383428
## perc.alumni 3.50630 5.18340 0.676 0.499069
## Expend 0.05822 0.01508 3.860 0.000128 ***
## Grad.Rate 7.94150 3.48452 2.279 0.023082 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1039 on 501 degrees of freedom
## Multiple R-squared: 0.9347, Adjusted R-squared: 0.9325
## F-statistic: 421.7 on 17 and 501 DF, p-value: < 2.2e-16
pred.lm <- predict(linear.model, Test)
mean((pred.lm-Test$Apps)^2)
## [1] 1138477
The test error is 1,138,477.
#SOLUTION
library(glmnet)
## Warning: package 'glmnet' was built under R version 4.1.3
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## Loaded glmnet 4.1-4
train.mat <- model.matrix(Apps~., data=Train)
test.mat <- model.matrix(Apps~., data=Test)
grid <- 10^seq(10, -2, length=100)
ridge.mod <- glmnet(train.mat, Train$Apps, alpha=0, lambda=grid, thresh=1e-12)
set.seed(1)
cv.out <- cv.glmnet(train.mat, Train$Apps, alpha=0, lambda=grid, thresh=1e-12)
bestlam <- cv.out$lambda.min
bestlam
## [1] 0.01
ridge.pred <- predict(ridge.mod, s=bestlam, newx=test.mat)
mean((ridge.pred-Test$Apps)^2)
## [1] 1138456
The MSE is 1,138,456
lasso.mod <-glmnet(train.mat, Train$Apps, alpha=1, lambda=grid)
cv.outL <- cv.glmnet(train.mat, Train$Apps, alpha=1, lambda=grid)
bestlamL <- cv.outL$lambda.min
lasso.pred <- predict(lasso.mod, s=bestlamL, newx=test.mat)
mean((lasso.pred-Test$Apps)^2)
## [1] 1137717
predict(lasso.mod, s = bestlamL, type = "coefficients")
## 19 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) -2.444694e+02
## (Intercept) .
## PrivateYes -4.425095e+02
## Accept 1.663032e+00
## Enroll -1.162670e+00
## Top10perc 5.324079e+01
## Top25perc -1.394415e+01
## F.Undergrad 7.975250e-02
## P.Undergrad 8.238586e-02
## Outstate -8.291378e-02
## Room.Board 1.197514e-01
## Books 4.425446e-02
## Personal -3.986567e-03
## PhD -9.099208e+00
## Terminal -3.903091e+00
## S.F.Ratio 1.528425e+01
## perc.alumni 3.476229e+00
## Expend 5.823394e-02
## Grad.Rate 7.943471e+00
The test error is 1,469,400.
library(pls)
## Warning: package 'pls' was built under R version 4.1.3
##
## Attaching package: 'pls'
## The following object is masked from 'package:caret':
##
## R2
## The following object is masked from 'package:stats':
##
## loadings
set.seed(2)
pcr.fit <-pcr(Apps~., data=Train, scale=TRUE, validation="CV")
summary(pcr.fit)
## Data: X dimension: 519 17
## Y dimension: 519 1
## Fit method: svdpc
## Number of components considered: 17
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 4002 3952 2187 2187 1803 1745 1737
## adjCV 4002 3954 2183 2185 1728 1732 1731
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 1727 1718 1637 1642 1653 1651 1669
## adjCV 1714 1713 1632 1637 1648 1645 1665
## 14 comps 15 comps 16 comps 17 comps
## CV 1679 1438 1232 1167
## adjCV 1676 1410 1222 1159
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps 8 comps
## X 30.811 56.36 63.69 69.37 74.87 79.78 83.52 87.12
## Apps 3.431 70.97 71.04 82.09 83.05 83.06 83.56 83.58
## 9 comps 10 comps 11 comps 12 comps 13 comps 14 comps 15 comps
## X 90.29 92.67 94.69 96.60 97.77 98.64 99.34
## Apps 84.90 84.97 85.01 85.07 85.07 85.17 91.77
## 16 comps 17 comps
## X 99.83 100.00
## Apps 92.83 93.47
validationplot(pcr.fit, val.type = "MSEP")
#M=9
pcr.pred <- predict(pcr.fit, Test, ncomp = 9)
mean((pcr.pred-Test$Apps)^2)
## [1] 1604067
The test error is 1,604,067 and M=9.
##SOLUTION
set.seed(1)
pls.fit <- plsr(Apps~., data=Train, scale=TRUE, validation="CV")
summary(pls.fit)
## Data: X dimension: 519 17
## Y dimension: 519 1
## Fit method: kernelpls
## Number of components considered: 17
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 4002 1996 1703 1566 1459 1321 1253
## adjCV 4002 1991 1699 1558 1432 1300 1240
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 1241 1243 1241 1238 1235 1233 1232
## adjCV 1229 1230 1228 1225 1222 1220 1219
## 14 comps 15 comps 16 comps 17 comps
## CV 1232 1232 1232 1232
## adjCV 1220 1219 1219 1219
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps 8 comps
## X 25.74 39.66 61.71 63.99 67.26 73.15 76.10 79.00
## Apps 76.52 84.63 87.38 91.35 92.90 93.14 93.23 93.33
## 9 comps 10 comps 11 comps 12 comps 13 comps 14 comps 15 comps
## X 82.04 84.73 86.42 89.87 92.34 93.88 96.80
## Apps 93.40 93.44 93.46 93.47 93.47 93.47 93.47
## 16 comps 17 comps
## X 98.73 100.00
## Apps 93.47 93.47
validationplot(pls.fit, val.type = "MSEP")
#M=10
pls.pred <- predict(pls.fit, Test, ncomp = 10)
mean((pls.pred-Test$Apps)^2)
## [1] 1135147
The test error is 1,135,147.
We see the PLS approach resulted in the lowest test error rate and the ridge regression resulted in the largest.There isn’t a big difference in the test error for the least squares, ridge and lasso but we see that the lasso had the lowest test error among these three models.
attach(Boston)
library(leaps)
## Warning: package 'leaps' was built under R version 4.1.3
set.seed(1)
Best.split <- sample.split(Boston, SplitRatio = 0.7)
Best.Train <- subset(Boston, Best.split==TRUE)
Best.Test<- subset(Boston, Best.split==FALSE)
regfit.best <- regsubsets(crim~., data=Best.Train, nvmax = 12)
reg.summary <- summary(regfit.best)
reg.summary
## Subset selection object
## Call: regsubsets.formula(crim ~ ., data = Best.Train, nvmax = 12)
## 12 Variables (and intercept)
## Forced in Forced out
## zn FALSE FALSE
## indus FALSE FALSE
## chas FALSE FALSE
## nox FALSE FALSE
## rm FALSE FALSE
## age FALSE FALSE
## dis FALSE FALSE
## rad FALSE FALSE
## tax FALSE FALSE
## ptratio FALSE FALSE
## lstat FALSE FALSE
## medv FALSE FALSE
## 1 subsets of each size up to 12
## Selection Algorithm: exhaustive
## zn indus chas nox rm age dis rad tax ptratio lstat medv
## 1 ( 1 ) " " " " " " " " " " " " " " "*" " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " " " " " "*" " " " " "*" " "
## 3 ( 1 ) "*" " " " " " " " " " " " " "*" " " " " "*" " "
## 4 ( 1 ) "*" " " " " " " " " " " "*" "*" " " " " " " "*"
## 5 ( 1 ) "*" "*" " " " " " " " " "*" "*" " " " " " " "*"
## 6 ( 1 ) "*" " " " " "*" " " " " "*" "*" " " "*" " " "*"
## 7 ( 1 ) "*" " " " " "*" " " " " "*" "*" " " "*" "*" "*"
## 8 ( 1 ) "*" "*" " " "*" " " " " "*" "*" " " "*" "*" "*"
## 9 ( 1 ) "*" "*" " " "*" " " " " "*" "*" "*" "*" "*" "*"
## 10 ( 1 ) "*" "*" "*" "*" " " " " "*" "*" "*" "*" "*" "*"
## 11 ( 1 ) "*" "*" "*" "*" "*" " " "*" "*" "*" "*" "*" "*"
## 12 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"
##Finding best model from RSS and R-squared
par (mfrow = c(2, 2))
plot (reg.summary$rss , xlab = " Number of Variables ",
ylab = " RSS ", type = "l")
plot (reg.summary$rsq , xlab = " Number of Variables ",
ylab = " RSq ", type = "l")
which.max(reg.summary$rsq)
## [1] 12
which.min(reg.summary$rss)
## [1] 12
From the result, we see that the twelve variable model has the smallest
RSS value and largest R-squared value. This is biased because it is
gotten from the training error. Let’s find the Cp, Adjusted R-squared
and BIC values.
par (mfrow = c(2, 2))
plot (reg.summary$cp , xlab = " Number of Variables ",
ylab = " Cp ", type = "l")
plot (reg.summary$bic , xlab = " Number of Variables ",
ylab = "bic", type = "l")
plot (reg.summary$adjr2 , xlab = " Number of Variables ",
ylab = " AdjRsq ", type = "l")
which.max(reg.summary$adjr2)
## [1] 8
which.min(reg.summary$cp)
## [1] 8
which.min(reg.summary$bic)
## [1] 2
We see that the eight-variables model has the largest adjusted R-squared
and smallest Cp value, and the two-variables has the smallest BIC
value.
#Creating a least squares regression using the eight-variable model
var.lm <- lm(crim~zn+indus+nox+dis+rad+ptratio+lstat+medv, data=Best.Train)
summary(var.lm)
##
## Call:
## lm(formula = crim ~ zn + indus + nox + dis + rad + ptratio +
## lstat + medv, data = Best.Train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.530 -2.424 -0.310 1.278 57.798
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 22.18806 7.22697 3.070 0.002311 **
## zn 0.05519 0.02110 2.616 0.009298 **
## indus -0.14000 0.08626 -1.623 0.105527
## nox -13.42524 6.01635 -2.231 0.026298 *
## dis -1.30596 0.32397 -4.031 6.85e-05 ***
## rad 0.61825 0.05656 10.930 < 2e-16 ***
## ptratio -0.40242 0.21527 -1.869 0.062424 .
## lstat 0.14042 0.07759 1.810 0.071220 .
## medv -0.23329 0.06429 -3.629 0.000328 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.273 on 342 degrees of freedom
## Multiple R-squared: 0.5051, Adjusted R-squared: 0.4935
## F-statistic: 43.62 on 8 and 342 DF, p-value: < 2.2e-16
pred.var <- predict(var.lm, Best.Test)
mean((pred.var-Best.Test$crim)^2)
## [1] 47.60297
The test error rate is 47.60
#Creating a least squares regression using the two-variable model
two.lm <- lm(crim~rad+lstat, data=Best.Train)
summary(two.lm)
##
## Call:
## lm(formula = crim ~ rad + lstat, data = Best.Train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.297 -1.984 -0.208 1.203 59.565
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.77303 0.69187 -6.899 2.49e-11 ***
## rad 0.55878 0.04584 12.190 < 2e-16 ***
## lstat 0.25858 0.05353 4.831 2.04e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.435 on 348 degrees of freedom
## Multiple R-squared: 0.4701, Adjusted R-squared: 0.4671
## F-statistic: 154.4 on 2 and 348 DF, p-value: < 2.2e-16
pred.two <- predict(two.lm, Best.Test)
mean((pred.two-Best.Test$crim)^2)
## [1] 47.36111
The test error is 47.36.
#Finding best model using validation set approach
test.mat <- model.matrix(crim~., data=Best.Test)
val.errors <- rep(NA,12)
for(i in 1:12){
coefi <- coef(regfit.best, id=i)
pred <- test.mat[, names(coefi)] %*%coefi
val.errors[i] <- mean((Best.Test$crim-pred)^2)
}
which.min(val.errors)
## [1] 4
coef(regfit.best, 4)
## (Intercept) zn dis rad medv
## 5.89771148 0.06651123 -0.83293557 0.53469778 -0.21056061
The best model is one with four variables (zn, dis, rad, medv)
#Least squares regression using four variables model
Best.lm <- lm(crim~zn+dis+rad+medv, data=Best.Train)
summary(Best.lm)
##
## Call:
## lm(formula = crim ~ zn + dis + rad + medv, data = Best.Train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.238 -2.133 -0.290 0.990 58.158
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.89771 1.60884 3.666 0.000285 ***
## zn 0.06651 0.02067 3.218 0.001412 **
## dis -0.83294 0.23931 -3.481 0.000564 ***
## rad 0.53470 0.04824 11.083 < 2e-16 ***
## medv -0.21056 0.04173 -5.046 7.31e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.374 on 346 degrees of freedom
## Multiple R-squared: 0.483, Adjusted R-squared: 0.4771
## F-statistic: 80.82 on 4 and 346 DF, p-value: < 2.2e-16
pred.Best <- predict(Best.lm, Best.Test)
mean((pred.Best-Best.Test$crim)^2)
## [1] 46.73149
The test error is 46.73
2.Ridge Regression
library(glmnet)
train.crime <- model.matrix(crim~., data=Best.Train)
test.crime <- model.matrix(crim~., data=Best.Test)
grid <- 10^seq(10, -2, length=100)
crime.model<- glmnet(train.crime, Best.Train$crim, alpha=0, lambda=grid, thresh=1e-12)
set.seed(1)
cv.outC <- cv.glmnet(train.crime, Best.Train$crim, alpha=0, lambda=grid, thresh=1e-12)
bestlamC <- cv.outC$lambda.min
bestlamC
## [1] 0.1232847
crime.pred <- predict(crime.model, s=bestlamC, newx=test.crime)
mean((crime.pred-Best.Test$crim)^2)
## [1] 47.18648
The test error is 47.186
2.Lasso Regression ## SOLUTION
set.seed(1)
cv.outL <- cv.glmnet(train.crime, Best.Train$crim, alpha=1, lambda=grid)
bestlamL <- cv.outL$lambda.min
lass.mod <- glmnet(train.crime, Best.Train$crim, alpha=1, lambda=grid)
lass.pred <- predict(lass.mod, s=bestlamL, newx=test.crime)
mean((lass.pred-Best.Test$crim)^2)
## [1] 46.93537
predict(lass.mod, s = bestlamL, type = "coefficients")
## 14 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 18.584068479
## (Intercept) .
## zn 0.051023801
## indus -0.104581899
## chas -0.631702517
## nox -10.611162133
## rm -0.045026871
## age .
## dis -1.130420065
## rad 0.618617581
## tax -0.001438741
## ptratio -0.332649124
## lstat 0.141351786
## medv -0.204918346
The test error is 46.935
3.PCR
library(pls)
set.seed(2)
pcr.fits <-pcr(crim~., data=Best.Train, scale=TRUE, validation="CV")
summary(pcr.fits)
## Data: X dimension: 351 12
## Y dimension: 351 1
## Fit method: svdpc
## Number of components considered: 12
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 8.827 7.345 7.328 6.925 6.822 6.807 6.771
## adjCV 8.827 7.339 7.323 6.916 6.814 6.800 6.763
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps
## CV 6.570 6.575 6.579 6.560 6.494 6.378
## adjCV 6.562 6.568 6.572 6.551 6.483 6.368
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps 8 comps
## X 50.27 63.72 72.93 80.29 86.75 90.28 92.88 95.05
## crim 32.32 32.69 40.34 42.19 42.54 43.14 46.30 46.37
## 9 comps 10 comps 11 comps 12 comps
## X 96.82 98.41 99.46 100.00
## crim 46.83 47.22 48.98 50.65
validationplot(pcr.fits, val.type = "MSEP")
#M=7
pcr.preds <- predict(pcr.fits, Best.Test, ncomp = 7)
mean((pcr.preds-Best.Test$crim)^2)
## [1] 47.11836
The test error is 47.11
#ANSWER 1. For this data, I would choose the ridge regression model, the lasso regression model, the four-variables model derived using the best subset selection method (validation set approach), and the pcr model. The reason is because aprroximately, they all had a test error rate of 47.0 indicating the differences between the model performance is small.