2.) For parts (a) through (c), indicate which of i. through iv. is correct. Justify your answer.
iii. Less flexible and hence will give improved prediction accuracy when its increase in bias is less than its decrease in variance.
iii. Less flexible and hence will give improved prediction accuracy when its increase in bias is less than its decrease in variance.
ii. More flexible and hence will give improved prediction accuracy when its increase in variance is less than its decrease in bias.
9.) In this exercise, we will predict the number of applications received using the other variables in the College data set.
library(ISLR)
## Warning: package 'ISLR' was built under R version 4.1.3
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v tibble 3.1.6 v dplyr 1.0.7
## v tidyr 1.1.4 v stringr 1.4.0
## v readr 2.1.1 v forcats 0.5.1
## v purrr 0.3.4
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## x purrr::lift() masks caret::lift()
library(glmnet)
## Warning: package 'glmnet' was built under R version 4.1.3
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## Loaded glmnet 4.1-4
data('College')
set.seed(74)
samp = sample(1:nrow(College), (nrow(College)*.75))
college.train = College[samp,]
college.test = College[-(samp),]
preObj = preProcess(college.train, method = c('center', 'scale'))
college.train = predict(preObj, college.train)
college.test = predict(preObj, college.test)
y_train = college.train$Apps
y_test = college.test$Apps
one_hot_encoding = dummyVars(Apps ~ ., data = college.train)
x_train = predict(one_hot_encoding, college.train)
x_test = predict(one_hot_encoding, college.test)
college.lm = lm(Apps ~ ., data = college.train)
pred = predict(college.lm, college.test)
lm.info = postResample(pred, college.test$Apps)
lm.info
## RMSE Rsquared MAE
## 0.3529980 0.9422620 0.1797521
ridge.college = train(x = x_train, y = y_train,
method = 'glmnet',
trControl = trainControl(method = 'cv', number = 10),
tuneGrid = expand.grid(alpha = 0,
lambda = seq(0, 10e2, length.out = 20)))
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.
ridge.info = postResample(predict(ridge.college, x_test), y_test)
ridge.info
## RMSE Rsquared MAE
## 0.4748245 0.9002998 0.1962587
coef(ridge.college$finalModel, ridge.college$bestTune$lambda)
## 19 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 0.044009800
## Private.No 0.096149793
## Private.Yes -0.097150878
## Accept 0.527734468
## Enroll 0.181172897
## Top10perc 0.143139390
## Top25perc 0.000573791
## F.Undergrad 0.102005350
## P.Undergrad 0.014906701
## Outstate -0.007493953
## Room.Board 0.068549037
## Books 0.001250405
## Personal -0.013042167
## PhD -0.027196979
## Terminal -0.020247007
## S.F.Ratio 0.012316140
## perc.alumni -0.048065871
## Expend 0.115544479
## Grad.Rate 0.066587741
plot(ridge.college)
plot(varImp(ridge.college))
lasso.college = train(x = x_train, y = y_train,
method = 'glmnet',
trControl = trainControl(method = 'cv', number = 10),
tuneGrid = expand.grid(alpha = 1,
lambda = seq(0.0001, 1, length.out = 50)))
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.
lasso.info = postResample(predict(lasso.college, x_test), y_test)
lasso.info
## RMSE Rsquared MAE
## 0.3529499 0.9425167 0.1782946
coef(lasso.college$finalModel, lasso.college$bestTune$lambda)
## 19 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) -0.045040750
## Private.No 0.163835729
## Private.Yes .
## Accept 0.800018445
## Enroll 0.030955345
## Top10perc 0.238048038
## Top25perc -0.064607643
## F.Undergrad 0.010136106
## P.Undergrad 0.022351901
## Outstate -0.071295530
## Room.Board 0.055114154
## Books -0.001731007
## Personal -0.007605054
## PhD -0.049579006
## Terminal -0.006843202
## S.F.Ratio 0.015389014
## perc.alumni -0.027706535
## Expend 0.123299470
## Grad.Rate 0.056473648
plot(lasso.college)
plot(varImp(lasso.college))
pcr.college = train(x = x_train, y = y_train,
method = 'pcr',
trControl = trainControl(method = 'cv', number = 10),
tuneGrid = expand.grid(ncomp = 1:10))
pcr.info = postResample(predict(pcr.college, x_test), y_test)
pcr.info
## RMSE Rsquared MAE
## 0.5686043 0.8427682 0.2310103
coef(pcr.college$finalModel)
## , , 9 comps
##
## .outcome
## Private.No 0.034380407
## Private.Yes -0.034380407
## Accept 0.332501459
## Enroll 0.299976793
## Top10perc 0.066337967
## Top25perc 0.048876585
## F.Undergrad 0.269432526
## P.Undergrad -0.021492923
## Outstate 0.039148474
## Room.Board 0.071944509
## Books 0.001494507
## Personal -0.032215377
## PhD -0.016799754
## Terminal -0.028241643
## S.F.Ratio -0.025156253
## perc.alumni -0.097072489
## Expend 0.083240217
## Grad.Rate 0.090511210
plot(pcr.college)
plot(varImp(pcr.college))
pls.college = train(x = x_train, y = y_train,
method = 'pls',
trControl = trainControl(method = 'cv', number = 10),
tuneGrid = expand.grid(ncomp = 1:10))
pls.info = postResample(predict(pls.college, x_test), y_test)
pls.info
## RMSE Rsquared MAE
## 0.3580636 0.9404572 0.1803492
coef(pls.college$finalModel)
## , , 9 comps
##
## .outcome
## Private.No 0.083793414
## Private.Yes -0.083793414
## Accept 0.793118084
## Enroll 0.083416049
## Top10perc 0.237250790
## Top25perc -0.064726170
## F.Undergrad -0.036489231
## P.Undergrad 0.023992670
## Outstate -0.073581295
## Room.Board 0.057707856
## Books -0.001588054
## Personal -0.007440295
## PhD -0.050111345
## Terminal -0.006811487
## S.F.Ratio 0.015620917
## perc.alumni -0.027241027
## Expend 0.121876436
## Grad.Rate 0.057980932
plot(pls.college)
plot(varImp(pls.college))
## Warning: package 'pls' was built under R version 4.1.3
##
## Attaching package: 'pls'
## The following object is masked from 'package:caret':
##
## R2
## The following object is masked from 'package:stats':
##
## loadings
as_tibble(rbind(lm.info,
ridge.info,
lasso.info,
pcr.info,
pls.info)) %>%
mutate(model = c('Linear', 'Ridge', 'Lasso', 'PCR', 'PLS')) %>%
dplyr::select(model, RMSE, Rsquared)
## # A tibble: 5 x 3
## model RMSE Rsquared
## <chr> <dbl> <dbl>
## 1 Linear 0.353 0.942
## 2 Ridge 0.475 0.900
## 3 Lasso 0.353 0.943
## 4 PCR 0.569 0.843
## 5 PLS 0.358 0.940
college.test %>%
summarize(sd = sd(Apps))
## sd
## 1 1.404956
The Linear, Lasso, and PLS models all perform very similarly. The R^2 is at or above 0.94 for these models, and the RMSE is between 0.35 and 0.36. These models appear to be very accurate.
11.) We will now try to predict per capita crime rate in the Boston data set.
library(caret)
library(tidyverse)
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library(leaps)
set.seed(21)
boston = Boston
inTrain = createDataPartition(boston$crim, p = 0.7, list = FALSE)
x_train = Boston[inTrain, -1]
y_train = Boston[inTrain, 1]
x_test = Boston[-inTrain, -1]
y_test = Boston[-inTrain, 1]
library(ggplot2)
library(ggthemes)
#Best Subset
best.subs = regsubsets(x = x_train, y = y_train, nvmax = 13)
fit.summary = summary(best.subs)
tibble(MSE = fit.summary$rss/nrow(x_train),
Cp = fit.summary$cp,
BIC = fit.summary$bic,
AdjustedR2 = fit.summary$adjr2) %>%
mutate(id = row_number()) %>%
gather(Metric, value, -id) %>%
ggplot(aes(id, value, col = Metric)) +
geom_line() + geom_point() + ylab('') +
xlab('Number of Variables Used') +
facet_wrap(~ Metric, scales = 'free') +
theme_tufte() + scale_x_continuous(breaks = 1:13)
scales = c('r2', 'adjr2', 'bic', 'Cp')
par(mfrow = c(2,2))
for (scale in scales) {
plot(best.subs, scale = scale)
}
par(mfrow = c(1,1))
test.errors = rep(NA,13)
test.mat = model.matrix(crim ~ ., data = boston[-inTrain,])
for (i in 1:13){
coefs = coef(best.subs, id=i)
pred = test.mat[,names(coefs)]%*%coefs
test.errors[i] = sqrt(mean((y_test - pred)^2))
}
data_frame(RMSE = test.errors) %>%
mutate(id = row_number()) %>%
ggplot(aes(id, RMSE)) +
geom_line() + geom_point() +
xlab('Number of Variables Used') +
ggtitle('MSE on testing set') +
theme_tufte() +
scale_x_continuous(breaks = 1:13)
## Warning: `data_frame()` was deprecated in tibble 1.1.0.
## Please use `tibble()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
(regsubset.info = min(test.errors))
## [1] 7.899537
coef(best.subs, id = 1:5)
## [[1]]
## (Intercept) rad
## -2.3591638 0.6370914
##
## [[2]]
## (Intercept) rad lstat
## -4.9101729 0.5129819 0.2892960
##
## [[3]]
## (Intercept) zn rad lstat
## -5.65447286 0.02720559 0.52285327 0.31733752
##
## [[4]]
## (Intercept) zn rad black lstat
## -2.803628195 0.026575745 0.501748247 -0.006701025 0.297920434
##
## [[5]]
## (Intercept) zn indus dis rad lstat
## -2.76212352 0.03854517 -0.13614895 -0.48800202 0.52946533 0.33718413
#Lasso
lasso.fit = train(x = x_train, y = y_train,
method = 'glmnet',
trControl = trainControl(method = 'cv', number = 10),
tuneGrid = expand.grid(alpha = 1,
lambda = seq(0.001, 1, length.out = 100)),
preProcess = c('center', 'scale'))
(lasso.info = postResample(predict(lasso.fit, x_test), y_test))
## RMSE Rsquared MAE
## 7.9399365 0.3060687 3.2940688
coef(lasso.fit$finalModel, lasso.fit$bestTune$lambda)
## 14 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 3.62162138
## zn 0.81430968
## indus -0.60437975
## chas -0.23399892
## nox -0.71605996
## rm -0.30001398
## age -0.09444168
## dis -1.39323234
## rad 4.62619826
## tax .
## ptratio -0.42814888
## black -0.54190606
## lstat 1.65028292
## medv -0.71265307
lasso.fit$bestTune
## alpha lambda
## 7 1 0.06154545
plot(lasso.fit)
plot(varImp(lasso.fit))
#Ridge Regression
ridge.fit = train(x_train, y_train,
method = 'glmnet',
trControl = trainControl(method = 'cv', number = 10),
tuneGrid = expand.grid(alpha = 0,
lambda = seq(0, 1e2, length.out = 50)),
preProcess = c('center', 'scale'))
(ridge.info = postResample(predict(ridge.fit, x_test), y_test))
## RMSE Rsquared MAE
## 7.9345380 0.3053235 3.2246580
coef(ridge.fit$finalModel, ridge.fit$bestTune$lambda)
## 14 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 3.6216214
## zn 0.7921889
## indus -0.6815298
## chas -0.2535164
## nox -0.5527679
## rm -0.3445693
## age -0.1546682
## dis -1.3956797
## rad 3.8321407
## tax 0.5657165
## ptratio -0.3419531
## black -0.6429606
## lstat 1.5855953
## medv -0.6809526
ridge.fit$bestTune
## alpha lambda
## 1 0 0
plot(ridge.fit)
plot(varImp(ridge.fit))
#Combined Lasso and Ridge with glmnet
glmnet.fit = train(x_train, y_train,
method = 'glmnet',
trControl = trainControl(method = 'cv', number = 10),
tuneGrid = expand.grid(alpha = seq(0, 1, length.out = 6),
lambda = seq(0, 1e2, length.out = 20)),
preProcess = c('center', 'scale'))
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.
(glmnet.info = postResample(predict(glmnet.fit, x_test), y_test))
## RMSE Rsquared MAE
## 7.9345380 0.3053235 3.2246580
coef(object = glmnet.fit$finalModel, s = glmnet.fit$bestTune$lambda)
## 14 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 3.6216214
## zn 0.7921889
## indus -0.6815298
## chas -0.2535164
## nox -0.5527679
## rm -0.3445693
## age -0.1546682
## dis -1.3956797
## rad 3.8321407
## tax 0.5657165
## ptratio -0.3419531
## black -0.6429606
## lstat 1.5855953
## medv -0.6809526
glmnet.fit$bestTune
## alpha lambda
## 1 0 0
plot(glmnet.fit)
plot(varImp(glmnet.fit))
#Principal Components Regression
pcr.fit = train(x_train, y_train,
method = 'pcr',
trControl = trainControl(method = 'cv', number = 10),
tuneGrid = expand.grid(ncomp = 1:13),
preProcess = c('center', 'scale'))
(pcr.info = postResample(predict(pcr.fit, x_test), y_test))
## RMSE Rsquared MAE
## 7.9662171 0.3048383 3.5098732
pcr.fit
## Principal Component Analysis
##
## 356 samples
## 13 predictor
##
## Pre-processing: centered (13), scaled (13)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 321, 320, 320, 320, 320, 322, ...
## Resampling results across tuning parameters:
##
## ncomp RMSE Rsquared MAE
## 1 6.079159 0.4396882 3.591496
## 2 6.079279 0.4377816 3.648797
## 3 5.582968 0.5427569 2.928616
## 4 5.578876 0.5440950 2.924665
## 5 5.594527 0.5413528 2.930437
## 6 5.575019 0.5452919 2.972506
## 7 5.573984 0.5457591 3.012596
## 8 5.297795 0.5937781 2.861465
## 9 5.322414 0.5863766 2.832778
## 10 5.264764 0.5963546 2.790431
## 11 5.254570 0.5975658 2.757461
## 12 5.261138 0.5995739 2.829231
## 13 5.177791 0.6156058 2.755884
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 13.
plot(pcr.fit)
plot(varImp(pcr.fit))
#Partial Least Squares
pls.fit = train(x_train, y_train,
method = 'pls',
trControl = trainControl(method = 'cv', number = 10),
tuneGrid = expand.grid(ncomp = 1:13),
preProcess = c('center', 'scale'))
(pls.info = postResample(predict(pls.fit, x_test), y_test))
## RMSE Rsquared MAE
## 7.9661953 0.3048414 3.5098438
pls.fit
## Partial Least Squares
##
## 356 samples
## 13 predictor
##
## Pre-processing: centered (13), scaled (13)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 320, 320, 321, 320, 321, 321, ...
## Resampling results across tuning parameters:
##
## ncomp RMSE Rsquared MAE
## 1 5.870652 0.5032589 3.458601
## 2 5.492417 0.5943935 2.883473
## 3 5.336743 0.6145624 2.793708
## 4 5.293443 0.6157901 2.835456
## 5 5.274564 0.6223494 2.802064
## 6 5.288363 0.6202693 2.817464
## 7 5.259942 0.6239670 2.805121
## 8 5.262846 0.6238793 2.808915
## 9 5.258298 0.6254873 2.792707
## 10 5.258121 0.6258084 2.791658
## 11 5.256727 0.6259474 2.789085
## 12 5.256697 0.6259312 2.789116
## 13 5.256720 0.6259278 2.789135
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 12.
plot(pls.fit)
rbind(c(regsubset.info, NA, NA),
lasso.info,
ridge.info,
glmnet.info,
pcr.info,
pls.info)
## RMSE Rsquared MAE
## 7.899537 NA NA
## lasso.info 7.939937 0.3060687 3.294069
## ridge.info 7.934538 0.3053235 3.224658
## glmnet.info 7.934538 0.3053235 3.224658
## pcr.info 7.966217 0.3048383 3.509873
## pls.info 7.966195 0.3048414 3.509844
All models appear to perform similarly, but the best appear to be “ridge regression” and “combined lasso and ridge with glmnet”.