library(ISLR)
library(MASS)
library(leaps)
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-1
library(pls)
##
## Attaching package: 'pls'
## The following object is masked from 'package:stats':
##
## loadings
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
##
## Attaching package: 'caret'
## The following object is masked from 'package:pls':
##
## R2
##Question 2
2A. The lasso, relative to least squares, is less flexible and hence will give improved prediction accuracy when its increase in bias is less than its decrease in variance.
2B. Ridge regression, relative to least squares, is less flexible and hence will give improved prediction accuracy when its increase in bias is less than its decrease in variance.
2C. Non-linear methods, relative to least squares, are more flexible and hence will give improved prediction accuracy when their increase in variance is less than their decrease in bias.
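A minimal sketch on simulated data (not part of the assignment data; the object names are illustrative) of the tradeoff behind 2A and 2B: as lambda grows, the lasso becomes less flexible, and its test MSE first falls while the drop in variance outweighs the added bias, then rises once the bias dominates.
set.seed(1)
n = 100; p = 20
x = matrix(rnorm(n * p), n, p)
beta = c(rep(2, 5), rep(0, p - 5))              # only the first 5 predictors matter
y = as.vector(x %*% beta + rnorm(n, sd = 3))
x.new = matrix(rnorm(n * p), n, p)              # independent test set
y.new = as.vector(x.new %*% beta + rnorm(n, sd = 3))
lambda.grid = 10^seq(1, -3, length = 50)        # from heavy shrinkage to almost none
lasso.path = glmnet(x, y, alpha = 1, lambda = lambda.grid)
test.mse = colMeans((y.new - predict(lasso.path, newx = x.new))^2)
plot(log10(lambda.grid), test.mse, type = "b",
     xlab = "log10(lambda)", ylab = "Test MSE")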
##Question 9
data("College")
9A. Split data into train and test
set.seed(8)
train=sample(1:nrow(College), nrow(College)/2)
test = (-train)
College.train = College[train,]
College.test = College[test,]
#set.seed(8)
#inTrain = createDataPartition(y=College$Apps, p=.8, list = FALSE)
#College.train = College[inTrain,]
#College.test = College[-inTrain,]
Set up 10-fold cross-validation
train_control = trainControl(method = "cv", number = 10, savePredictions = "all")
9B. Linear Regression
set.seed(8)
lm.fit = train(Apps ~., data = College.train, trControl = train_control, method = "lm")
summary(lm.fit)
##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3182.1 -428.9 -20.4 297.3 6907.2
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.716e+02 5.242e+02 -0.709 0.47885
## PrivateYes -8.712e+02 1.864e+02 -4.674 4.15e-06 ***
## Accept 1.289e+00 6.700e-02 19.230 < 2e-16 ***
## Enroll -2.028e-01 2.515e-01 -0.806 0.42057
## Top10perc 3.815e+01 7.568e+00 5.041 7.27e-07 ***
## Top25perc -1.095e+01 6.102e+00 -1.794 0.07363 .
## F.Undergrad 2.368e-02 4.188e-02 0.565 0.57217
## P.Undergrad 1.452e-03 4.922e-02 0.030 0.97648
## Outstate -8.025e-02 2.613e-02 -3.071 0.00229 **
## Room.Board 2.104e-01 6.632e-02 3.173 0.00164 **
## Books 1.259e-01 2.795e-01 0.450 0.65265
## Personal 2.873e-02 8.217e-02 0.350 0.72682
## PhD -3.195e+00 6.090e+00 -0.525 0.60017
## Terminal -9.109e+00 7.208e+00 -1.264 0.20714
## S.F.Ratio 1.306e+01 1.851e+01 0.705 0.48108
## perc.alumni 4.301e+00 5.877e+00 0.732 0.46468
## Expend 1.084e-01 1.823e-02 5.948 6.29e-09 ***
## Grad.Rate 6.713e+00 3.792e+00 1.770 0.07749 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 980.2 on 370 degrees of freedom
## Multiple R-squared: 0.9286, Adjusted R-squared: 0.9254
## F-statistic: 283.3 on 17 and 370 DF, p-value: < 2.2e-16
lm.fit
## Linear Regression
##
## 388 samples
## 17 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 349, 348, 350, 348, 349, 351, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 992.857 0.9212021 608.8517
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
lin.preds = predict(lm.fit, newdata=College.test)
linear.mean.err = mean((College.test$Apps - lin.preds)^2)
linear.mean.err
## [1] 1469722
summary(lm.fit)$r.squared
## [1] 0.9286493
lin.RMSE = RMSE(lin.preds, College.test$Apps)
lin.RMSE
## [1] 1212.321
lin.test.r2 = R2(lin.preds, College.test$Apps)
lin.test.r2
## [1] 0.9260995
9C. Ridge Regression
x_train = model.matrix(Apps~., College.train)[,-1]
y_train = College.train$Apps
x_test = model.matrix(Apps~., College.test)[,-1]
y_test = College.test$Apps
grid=10^seq(4,-2,length=100)
set.seed(8)
cv_ridge = cv.glmnet(x_train, y_train, alpha = 0, lambda = grid , thresh = 1e-12)
optimal_lambda = cv_ridge$lambda.min
ridge.fit = glmnet(x_train, y_train, alpha = 0, lambda = optimal_lambda)
coef(ridge.fit)
## 18 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) -4.539741e+02
## PrivateYes -8.939414e+02
## Accept 1.182170e+00
## Enroll 5.656325e-02
## Top10perc 3.524110e+01
## Top25perc -8.931204e+00
## F.Undergrad 1.997640e-02
## P.Undergrad 1.024211e-03
## Outstate -6.842933e-02
## Room.Board 2.117602e-01
## Books 1.463373e-01
## Personal 1.892829e-02
## PhD -2.829440e+00
## Terminal -8.907760e+00
## S.F.Ratio 1.016648e+01
## perc.alumni 2.019977e+00
## Expend 1.057877e-01
## Grad.Rate 7.280783e+00
ridge.preds = predict(ridge.fit, s = optimal_lambda, newx = x_test)
ridge.mean.error = mean((College.test$Apps - ridge.preds)^2)
ridge.mean.error
## [1] 1596463
ridge.RMSE = RMSE(ridge.preds, College.test$Apps)
ridge.RMSE
## [1] 1263.512
ridge.test.rsquared = R2(ridge.preds, College.test$Apps)
ridge.test.rsquared
## 1
## [1,] 0.9223008
9D. Lasso Regression
set.seed(8)
cv_lasso = cv.glmnet(x_train, y_train, alpha = 1, lambda = grid , thresh = 1e-12)
optimal_lambda_lasso = cv_lasso$lambda.min
lasso.fit = glmnet(x_train, y_train, alpha = 1, lambda = optimal_lambda_lasso) #alpha = 1 fits the lasso
coef(lasso.fit)
## 18 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) -4.446305e+02
## PrivateYes -8.916340e+02
## Accept 1.193812e+00
## Enroll 3.124709e-02
## Top10perc 3.556814e+01
## Top25perc -9.160756e+00
## F.Undergrad 1.976540e-02
## P.Undergrad 1.223561e-03
## Outstate -6.979570e-02
## Room.Board 2.117188e-01
## Books 1.440272e-01
## Personal 1.993674e-02
## PhD -2.867270e+00
## Terminal -8.934412e+00
## S.F.Ratio 1.049127e+01
## perc.alumni 2.270297e+00
## Expend 1.061105e-01
## Grad.Rate 7.217923e+00
lasso.preds = predict(lasso.fit, s = optimal_lambda_lasso, newx = x_test)
lasso.mean.error = mean((College.test$Apps - lasso.preds)^2)
lasso.mean.error
## [1] 1581287
lasso.RMSE = RMSE(lasso.preds, College.test$Apps)
lasso.RMSE
## [1] 1257.492
lasso.test.rsquared = R2(lasso.preds, College.test$Apps)
lasso.test.rsquared
## 1
## [1,] 0.9227781
9E. PCR
set.seed(8)
pcr.fit = train(Apps ~., data=College.train, scale = TRUE, trControl = train_control, tuneLength=ncol(College.train), method = 'pcr')
plot(pcr.fit)
pcr.fit$bestTune
## ncomp
## 16 16
print(pcr.fit)
## Principal Component Analysis
##
## 388 samples
## 17 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 349, 348, 350, 348, 349, 351, ...
## Resampling results across tuning parameters:
##
## ncomp RMSE Rsquared MAE
## 1 3491.496 0.02142928 2508.1309
## 2 1702.096 0.78319854 1243.8442
## 3 1712.234 0.78177302 1249.3233
## 4 1401.340 0.84817174 980.3353
## 5 1285.233 0.86588277 869.0904
## 6 1272.717 0.86980666 867.7251
## 7 1279.593 0.86916256 871.0263
## 8 1267.624 0.87124919 854.5579
## 9 1195.609 0.88696663 786.2901
## 10 1180.477 0.89018765 761.6441
## 11 1183.515 0.89032063 769.8830
## 12 1192.774 0.88900073 776.8107
## 13 1203.892 0.88712197 785.0383
## 14 1208.306 0.88636749 792.8389
## 15 1213.532 0.88508601 795.0885
## 16 1005.375 0.92093882 627.9100
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 16.
summary(pcr.fit$finalModel)
## Data: X dimension: 388 17
## Y dimension: 388 1
## Fit method: svdpc
## Number of components considered: 16
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps
## X 31.147 57.16 64.05 70.31 75.95 81.19 84.85
## .outcome 0.121 77.52 77.57 84.78 87.29 87.30 87.31
## 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps 14 comps
## X 88.14 91.11 93.47 95.39 97.10 98.15 98.96
## .outcome 87.47 89.00 89.18 89.36 89.39 89.40 89.44
## 15 comps 16 comps
## X 99.44 99.86
## .outcome 89.48 92.65
pcr.pred=predict(pcr.fit,College.test,ncomp=16)
pcr.mean.err = mean((College.test$Apps - pcr.pred)^2)
pcr.mean.err
## [1] 1647232
pcr.RMSE = RMSE(pcr.pred, College.test$Apps)
pcr.RMSE
## [1] 1283.445
pcr.test.rsquared = R2(pcr.pred, College.test$Apps)
pcr.test.rsquared
## [1] 0.918927
9F. PLS
set.seed(8)
pls.fit = train(Apps ~., data=College.train, scale = TRUE, trControl = train_control, tuneLength=ncol(College.train), method = 'pls')
plot(pls.fit)
pls.fit$bestTune
## ncomp
## 10 10
summary(pls.fit$finalModel)
## Data: X dimension: 388 17
## Y dimension: 388 1
## Fit method: oscorespls
## Number of components considered: 10
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps
## X 25.88 32.45 61.51 66.05 69.11 72.54 76.17
## .outcome 82.16 89.39 89.73 90.98 92.10 92.71 92.78
## 8 comps 9 comps 10 comps
## X 81.21 83.34 85.64
## .outcome 92.80 92.82 92.84
pls.fit
## Partial Least Squares
##
## 388 samples
## 17 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 349, 348, 350, 348, 349, 351, ...
## Resampling results across tuning parameters:
##
## ncomp RMSE Rsquared MAE
## 1 1518.6058 0.8254592 1103.6556
## 2 1189.7147 0.8883682 771.5916
## 3 1164.2923 0.8930979 758.1285
## 4 1128.5224 0.8996012 726.8939
## 5 1085.3298 0.9083062 698.7793
## 6 1023.9233 0.9173131 646.3927
## 7 1013.6656 0.9194031 631.5614
## 8 1000.8815 0.9210369 621.1063
## 9 989.5432 0.9225253 611.3539
## 10 988.6935 0.9223078 609.8801
## 11 992.6419 0.9214769 608.5388
## 12 993.3988 0.9211094 610.1710
## 13 992.3213 0.9213370 608.3659
## 14 992.4023 0.9212801 608.5394
## 15 992.4649 0.9212577 608.8236
## 16 992.8481 0.9212052 608.8398
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 10.
set.seed(8)
pls.pred=predict(pls.fit,College.test,ncomp=10)
pls.mean.err =mean((College.test$Apps - pls.pred)^2)
pls.mean.err
## [1] 1495875
pls.RMSE = RMSE(pls.pred, College.test$Apps)
pls.RMSE
## [1] 1223.06
pls.test.rsquared = R2(pls.pred, College.test$Apps)
pls.test.rsquared
## [1] 0.9251608
9G.
#Creating the dataframe for comparison
data.frame(method = c("OLS", "Ridge", "Lasso", "PCR", "PLS"),
MSE = c(linear.mean.err, ridge.mean.error, lasso.mean.error, pcr.mean.err, pls.mean.err),
RMSE = c(lin.RMSE, ridge.RMSE, lasso.RMSE, pcr.RMSE, pls.RSME),
test.rsquared = c(lin.test.r2, ridge.test.rsquared, lasso.test.rsquared, pcr.test.rsquared, pls.test.rsquared))
## method MSE RMSE test.rsquared
## 1 OLS 1469722 1212.321 0.9260995
## 2 Ridge 1596463 1263.512 0.9223008
## 3 Lasso 1581287 1257.492 0.9227781
## 4 PCR 1647232 1283.445 0.9189270
## 5 PLS 1495875 1223.060 0.9251608
The methods produce slightly different mean squared error, root mean squared error, and R-squared values, but the differences are very small. OLS and PLS have the highest test R-squared and the lowest MSE and RMSE, while PCR has the highest MSE and RMSE and the lowest test R-squared. All of the models appear to have good predictive power.
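As an optional visual check (not part of the original question; the object name test.rmse is just illustrative), a simple barplot of the test RMSE values computed above makes the small gap between methods easy to see.
test.rmse = c(OLS = lin.RMSE, Ridge = ridge.RMSE, Lasso = lasso.RMSE,
              PCR = pcr.RMSE, PLS = pls.RMSE)
barplot(test.rmse, ylab = "Test RMSE", main = "College applications: test RMSE by method")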
##Question 11
data("Boston")
set.seed(8)
train=sample(1:nrow(Boston), nrow(Boston)/2)
test = (-train)
Boston.train = Boston[train,]
Boston.test = Boston[test,]
11A.
train_control_b = trainControl(method = "cv", number = 10, savePredictions = "all")
Linear Regression
set.seed(8)
lm.fit.bost = train(crim ~., data = Boston.train, trControl = train_control_b, method = "lm")
summary(lm.fit.bost)
##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.762 -2.149 -0.289 1.345 52.846
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.198926 8.222216 0.632 0.5278
## zn 0.039713 0.021802 1.822 0.0698 .
## indus -0.072151 0.102902 -0.701 0.4839
## chas -1.259336 1.356609 -0.928 0.3542
## nox -5.830684 5.953000 -0.979 0.3283
## rm -0.350889 0.720973 -0.487 0.6269
## age -0.016732 0.021824 -0.767 0.4440
## dis -0.712840 0.308824 -2.308 0.0218 *
## rad 0.541735 0.107040 5.061 8.32e-07 ***
## tax -0.005171 0.006233 -0.830 0.4076
## ptratio -0.175709 0.207705 -0.846 0.3984
## black 0.006684 0.004204 1.590 0.1131
## lstat 0.496459 0.099360 4.997 1.13e-06 ***
## medv -0.016963 0.065362 -0.260 0.7955
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.047 on 239 degrees of freedom
## Multiple R-squared: 0.5736, Adjusted R-squared: 0.5504
## F-statistic: 24.74 on 13 and 239 DF, p-value: < 2.2e-16
lm.fit.bost
## Linear Regression
##
## 253 samples
## 13 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 228, 228, 229, 228, 227, 229, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 4.438062 0.6975947 2.763431
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
lin.cv.error = 4.44
lin.preds.bost = predict(lm.fit.bost, newdata=Boston.test)
linear.mean.err.bost = mean((Boston.test$crim - lin.preds.bost)^2)
linear.mean.err.bost
## [1] 65.15022
lin.RMSE.bost = RMSE(lin.preds.bost, Boston.test$crim)
lin.RMSE.bost
## [1] 8.071568
lin.test.r2.bost = R2(lin.preds.bost, Boston.test$crim)
lin.test.r2.bost
## [1] 0.2888541
Best Subset Selection
set.seed(8)
#Prediction helper for regsubsets objects
predict.regsubsets = function(object, newdata, id, ...) {
  form = as.formula(object$call[[2]])   # recover the formula used in the regsubsets call
  mat = model.matrix(form, newdata)     # build the design matrix for the new data
  coefi = coef(object, id = id)         # coefficients of the id-variable model
  mat[, names(coefi)] %*% coefi         # predictions
}
k = 10
p = ncol(Boston) - 1
folds = sample(rep(1:k, length = nrow(Boston)))
cv.errors = matrix(NA, k, p)
#10-fold cross-validation over subset sizes 1..p
for (i in 1:k) {
  best.fit = regsubsets(crim ~ ., data = Boston[folds != i, ], nvmax = p)
  for (j in 1:p) {
    pred = predict(best.fit, Boston[folds == i, ], id = j)
    cv.errors[i, j] = mean((Boston$crim[folds == i] - pred)^2)
  }
}
rmse.cv = sqrt(apply(cv.errors, 2, mean))
plot(rmse.cv, pch = 19, type = "b")
which.min(rmse.cv)
## [1] 11
reg.bestsub =regsubsets(crim ~., data = Boston.train, nvmax=11)
summary(reg.bestsub)
## Subset selection object
## Call: regsubsets.formula(crim ~ ., data = Boston.train, nvmax = 11)
## 13 Variables (and intercept)
## Forced in Forced out
## zn FALSE FALSE
## indus FALSE FALSE
## chas FALSE FALSE
## nox FALSE FALSE
## rm FALSE FALSE
## age FALSE FALSE
## dis FALSE FALSE
## rad FALSE FALSE
## tax FALSE FALSE
## ptratio FALSE FALSE
## black FALSE FALSE
## lstat FALSE FALSE
## medv FALSE FALSE
## 1 subsets of each size up to 11
## Selection Algorithm: exhaustive
## zn indus chas nox rm age dis rad tax ptratio black lstat medv
## 1 ( 1 ) " " " " " " " " " " " " " " "*" " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " " " " " "*" " " " " " " "*" " "
## 3 ( 1 ) " " "*" " " " " " " " " " " "*" " " " " " " "*" " "
## 4 ( 1 ) "*" " " " " " " " " " " " " "*" " " " " "*" "*" " "
## 5 ( 1 ) "*" "*" " " " " " " " " "*" "*" " " " " " " "*" " "
## 6 ( 1 ) "*" "*" " " " " " " " " "*" "*" " " " " "*" "*" " "
## 7 ( 1 ) "*" "*" " " " " " " "*" "*" "*" " " " " "*" "*" " "
## 8 ( 1 ) "*" "*" "*" " " " " "*" "*" "*" " " " " "*" "*" " "
## 9 ( 1 ) "*" "*" "*" " " " " "*" "*" "*" "*" " " "*" "*" " "
## 10 ( 1 ) "*" " " "*" "*" " " "*" "*" "*" "*" "*" "*" "*" " "
## 11 ( 1 ) "*" "*" "*" "*" " " "*" "*" "*" "*" "*" "*" "*" " "
final.bestsub.mod = train(crim ~ zn + indus + chas + nox + age + dis + rad + tax
                          + ptratio + black + lstat + medv,
                          data = Boston.train, trControl = train_control_b, method = "lm")
summary(final.bestsub.mod)
##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.732 -2.166 -0.213 1.323 52.759
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.931473 6.764105 0.433 0.6651
## zn 0.038816 0.021690 1.790 0.0748 .
## indus -0.067485 0.102291 -0.660 0.5101
## chas -1.197279 1.348455 -0.888 0.3755
## nox -5.843532 5.943470 -0.983 0.3265
## age -0.018596 0.021451 -0.867 0.3869
## dis -0.710742 0.308303 -2.305 0.0220 *
## rad 0.536906 0.106410 5.046 8.93e-07 ***
## tax -0.005042 0.006217 -0.811 0.4182
## ptratio -0.169050 0.206924 -0.817 0.4148
## black 0.007009 0.004143 1.692 0.0920 .
## lstat 0.509363 0.095605 5.328 2.29e-07 ***
## medv -0.028481 0.060830 -0.468 0.6401
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.039 on 240 degrees of freedom
## Multiple R-squared: 0.5732, Adjusted R-squared: 0.5519
## F-statistic: 26.86 on 12 and 240 DF, p-value: < 2.2e-16
final.bestsub.mod
## Linear Regression
##
## 253 samples
## 12 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 227, 229, 228, 228, 228, 228, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 4.321811 0.6867314 2.745647
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
bestsub.cv.error = 4.32
bestsub.preds = predict(final.bestsub.mod, newdata=Boston.test)
bestsub.mean.err.bost = mean((Boston.test$crim - bestsub.preds)^2)
bestsub.mean.err.bost
## [1] 64.93419
bestsub.RMSE.bost = RMSE(bestsub.preds, Boston.test$crim)
bestsub.RMSE.bost
## [1] 8.058175
bestsub.test.r2.bost = R2(bestsub.preds, Boston.test$crim)
bestsub.test.r2.bost
## [1] 0.2909503
Lasso Regression
x_train_bost = model.matrix(crim~., Boston.train)[,-1]
y_train_bost = Boston.train$crim
x_test_bost = model.matrix(crim~., Boston.test)[,-1]
y_test_bost = Boston.test$crim
lambda_vector_bost =10^seq(4,-2,length=100)
set.seed(8)
lasso.fit.bost = train(y = y_train_bost, x = x_train_bost, method = 'glmnet',
                       trControl = train_control_b,
                       tuneGrid = expand.grid(alpha = 1, lambda = lambda_vector_bost))
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.
lasso.fit.bost$bestTune
## alpha lambda
## 31 1 0.6579332
lasso.fit.bost
## glmnet
##
## 253 samples
## 13 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 228, 228, 229, 228, 227, 229, ...
## Resampling results across tuning parameters:
##
## lambda RMSE Rsquared MAE
## 1.000000e-02 4.422344 0.6985197 2.742232
## 1.149757e-02 4.420325 0.6986081 2.739580
## 1.321941e-02 4.417816 0.6987371 2.736668
## 1.519911e-02 4.415230 0.6988474 2.733394
## 1.747528e-02 4.412147 0.6989833 2.729677
## 2.009233e-02 4.408714 0.6991221 2.725520
## 2.310130e-02 4.404949 0.6992440 2.720603
## 2.656088e-02 4.400487 0.6994292 2.714125
## 3.053856e-02 4.394812 0.6997575 2.706932
## 3.511192e-02 4.388468 0.7001356 2.699243
## 4.037017e-02 4.381721 0.7004868 2.690598
## 4.641589e-02 4.374839 0.7008001 2.681677
## 5.336699e-02 4.367866 0.7011352 2.672443
## 6.135907e-02 4.360588 0.7015730 2.661462
## 7.054802e-02 4.354133 0.7018470 2.649796
## 8.111308e-02 4.348339 0.7019372 2.638154
## 9.326033e-02 4.343043 0.7018547 2.626907
## 1.072267e-01 4.339378 0.7015502 2.617123
## 1.232847e-01 4.336476 0.7011171 2.608642
## 1.417474e-01 4.330140 0.7009157 2.597100
## 1.629751e-01 4.324721 0.7007884 2.585064
## 1.873817e-01 4.323015 0.7000241 2.576049
## 2.154435e-01 4.320640 0.6992298 2.564479
## 2.477076e-01 4.317059 0.6984895 2.553826
## 2.848036e-01 4.309811 0.6980113 2.543964
## 3.274549e-01 4.301612 0.6974706 2.533287
## 3.764936e-01 4.298648 0.6967141 2.523524
## 4.328761e-01 4.293943 0.6964743 2.506174
## 4.977024e-01 4.289551 0.6964415 2.485400
## 5.722368e-01 4.286248 0.6964021 2.461835
## 6.579332e-01 4.284753 0.6963539 2.436510
## 7.564633e-01 4.286129 0.6962900 2.409908
## 8.697490e-01 4.291856 0.6962075 2.382722
## 1.000000e+00 4.303982 0.6960981 2.355621
## 1.149757e+00 4.325332 0.6959543 2.330952
## 1.321941e+00 4.359722 0.6957562 2.315282
## 1.519911e+00 4.412213 0.6954744 2.321582
## 1.747528e+00 4.489335 0.6950589 2.359593
## 2.009233e+00 4.599199 0.6944083 2.440036
## 2.310130e+00 4.751443 0.6933171 2.579428
## 2.656088e+00 4.956788 0.6913222 2.784784
## 3.053856e+00 5.226624 0.6871211 3.046185
## 3.511192e+00 5.571396 0.6768657 3.364104
## 4.037017e+00 5.985514 0.6662700 3.747078
## 4.641589e+00 6.479141 0.6355898 4.222106
## 5.336699e+00 6.810710 NaN 4.562012
## 6.135907e+00 6.810710 NaN 4.562012
## 7.054802e+00 6.810710 NaN 4.562012
## 8.111308e+00 6.810710 NaN 4.562012
## 9.326033e+00 6.810710 NaN 4.562012
## 1.072267e+01 6.810710 NaN 4.562012
## 1.232847e+01 6.810710 NaN 4.562012
## 1.417474e+01 6.810710 NaN 4.562012
## 1.629751e+01 6.810710 NaN 4.562012
## 1.873817e+01 6.810710 NaN 4.562012
## 2.154435e+01 6.810710 NaN 4.562012
## 2.477076e+01 6.810710 NaN 4.562012
## 2.848036e+01 6.810710 NaN 4.562012
## 3.274549e+01 6.810710 NaN 4.562012
## 3.764936e+01 6.810710 NaN 4.562012
## 4.328761e+01 6.810710 NaN 4.562012
## 4.977024e+01 6.810710 NaN 4.562012
## 5.722368e+01 6.810710 NaN 4.562012
## 6.579332e+01 6.810710 NaN 4.562012
## 7.564633e+01 6.810710 NaN 4.562012
## 8.697490e+01 6.810710 NaN 4.562012
## 1.000000e+02 6.810710 NaN 4.562012
## 1.149757e+02 6.810710 NaN 4.562012
## 1.321941e+02 6.810710 NaN 4.562012
## 1.519911e+02 6.810710 NaN 4.562012
## 1.747528e+02 6.810710 NaN 4.562012
## 2.009233e+02 6.810710 NaN 4.562012
## 2.310130e+02 6.810710 NaN 4.562012
## 2.656088e+02 6.810710 NaN 4.562012
## 3.053856e+02 6.810710 NaN 4.562012
## 3.511192e+02 6.810710 NaN 4.562012
## 4.037017e+02 6.810710 NaN 4.562012
## 4.641589e+02 6.810710 NaN 4.562012
## 5.336699e+02 6.810710 NaN 4.562012
## 6.135907e+02 6.810710 NaN 4.562012
## 7.054802e+02 6.810710 NaN 4.562012
## 8.111308e+02 6.810710 NaN 4.562012
## 9.326033e+02 6.810710 NaN 4.562012
## 1.072267e+03 6.810710 NaN 4.562012
## 1.232847e+03 6.810710 NaN 4.562012
## 1.417474e+03 6.810710 NaN 4.562012
## 1.629751e+03 6.810710 NaN 4.562012
## 1.873817e+03 6.810710 NaN 4.562012
## 2.154435e+03 6.810710 NaN 4.562012
## 2.477076e+03 6.810710 NaN 4.562012
## 2.848036e+03 6.810710 NaN 4.562012
## 3.274549e+03 6.810710 NaN 4.562012
## 3.764936e+03 6.810710 NaN 4.562012
## 4.328761e+03 6.810710 NaN 4.562012
## 4.977024e+03 6.810710 NaN 4.562012
## 5.722368e+03 6.810710 NaN 4.562012
## 6.579332e+03 6.810710 NaN 4.562012
## 7.564633e+03 6.810710 NaN 4.562012
## 8.697490e+03 6.810710 NaN 4.562012
## 1.000000e+04 6.810710 NaN 4.562012
##
## Tuning parameter 'alpha' was held constant at a value of 1
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were alpha = 1 and lambda = 0.6579332.
lasso.cv.error = 4.28
coef(lasso.fit.bost$finalModel, lasso.fit.bost$bestTune$lambda)
## 14 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) -4.0639552
## zn .
## indus .
## chas .
## nox .
## rm .
## age .
## dis .
## rad 0.3364142
## tax .
## ptratio .
## black .
## lstat 0.3536487
## medv .
final.lasso.bost = glmnet(x_train_bost, y_train_bost, alpha = 1, lambda = lasso.fit.bost$bestTune$lambda)
lasso.preds.bost = predict(final.lasso.bost, newx = x_test_bost)
lasso.mean.error.bost = mean((Boston.test$crim - lasso.preds.bost)^2)
lasso.mean.error.bost
## [1] 63.23433
lasso.RMSE.bost = RMSE(lasso.preds.bost, Boston.test$crim)
lasso.RMSE.bost
## [1] 7.952001
lasso.test.r2.bost = R2(lasso.preds.bost, Boston.test$crim)
lasso.test.r2.bost
## s0
## [1,] 0.3127097
Ridge Regression
set.seed(8)
ridge.fit.bost = train(y = y_train_bost, x = x_train_bost, method = 'glmnet',
                       trControl = train_control_b,
                       tuneGrid = expand.grid(alpha = 0, lambda = lambda_vector_bost))
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.
ridge.fit.bost$bestTune
## alpha lambda
## 34 0 1
ridge.fit.bost
## glmnet
##
## 253 samples
## 13 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 228, 228, 229, 228, 227, 229, ...
## Resampling results across tuning parameters:
##
## lambda RMSE Rsquared MAE
## 1.000000e-02 4.344479 0.6975261 2.632751
## 1.149757e-02 4.344479 0.6975261 2.632751
## 1.321941e-02 4.344479 0.6975261 2.632751
## 1.519911e-02 4.344479 0.6975261 2.632751
## 1.747528e-02 4.344479 0.6975261 2.632751
## 2.009233e-02 4.344479 0.6975261 2.632751
## 2.310130e-02 4.344479 0.6975261 2.632751
## 2.656088e-02 4.344479 0.6975261 2.632751
## 3.053856e-02 4.344479 0.6975261 2.632751
## 3.511192e-02 4.344479 0.6975261 2.632751
## 4.037017e-02 4.344479 0.6975261 2.632751
## 4.641589e-02 4.344479 0.6975261 2.632751
## 5.336699e-02 4.344479 0.6975261 2.632751
## 6.135907e-02 4.344479 0.6975261 2.632751
## 7.054802e-02 4.344479 0.6975261 2.632751
## 8.111308e-02 4.344479 0.6975261 2.632751
## 9.326033e-02 4.344479 0.6975261 2.632751
## 1.072267e-01 4.344479 0.6975261 2.632751
## 1.232847e-01 4.344479 0.6975261 2.632751
## 1.417474e-01 4.344479 0.6975261 2.632751
## 1.629751e-01 4.344479 0.6975261 2.632751
## 1.873817e-01 4.344479 0.6975261 2.632751
## 2.154435e-01 4.344479 0.6975261 2.632751
## 2.477076e-01 4.344479 0.6975261 2.632751
## 2.848036e-01 4.344479 0.6975261 2.632751
## 3.274549e-01 4.344479 0.6975261 2.632751
## 3.764936e-01 4.344479 0.6975261 2.632751
## 4.328761e-01 4.344479 0.6975261 2.632751
## 4.977024e-01 4.345708 0.6973622 2.633528
## 5.722368e-01 4.343774 0.6967349 2.627500
## 6.579332e-01 4.339961 0.6960420 2.618965
## 7.564633e-01 4.336972 0.6951816 2.611403
## 8.697490e-01 4.335010 0.6941374 2.605373
## 1.000000e+00 4.334182 0.6928902 2.598499
## 1.149757e+00 4.334907 0.6913986 2.591019
## 1.321941e+00 4.337189 0.6896537 2.583344
## 1.519911e+00 4.341268 0.6876312 2.576850
## 1.747528e+00 4.347339 0.6853212 2.571034
## 2.009233e+00 4.355508 0.6827339 2.566545
## 2.310130e+00 4.365853 0.6798327 2.562779
## 2.656088e+00 4.378385 0.6766537 2.558917
## 3.053856e+00 4.393137 0.6731942 2.558330
## 3.511192e+00 4.409984 0.6694909 2.559274
## 4.037017e+00 4.429102 0.6655266 2.560010
## 4.641589e+00 4.450329 0.6613766 2.562738
## 5.336699e+00 4.473735 0.6570282 2.565458
## 6.135907e+00 4.499253 0.6525609 2.568827
## 7.054802e+00 4.527103 0.6479668 2.574421
## 8.111308e+00 4.557164 0.6433356 2.582714
## 9.326033e+00 4.589845 0.6386601 2.593778
## 1.072267e+01 4.625164 0.6340375 2.607664
## 1.232847e+01 4.663726 0.6294335 2.624797
## 1.417474e+01 4.705482 0.6249768 2.642750
## 1.629751e+01 4.751262 0.6206181 2.662064
## 1.873817e+01 4.801203 0.6164494 2.685009
## 2.154435e+01 4.855941 0.6124407 2.717922
## 2.477076e+01 4.915464 0.6086861 2.757839
## 2.848036e+01 4.980441 0.6051213 2.802821
## 3.274549e+01 5.050474 0.6018275 2.852876
## 3.764936e+01 5.125905 0.5987394 2.914806
## 4.328761e+01 5.205817 0.5959242 2.986263
## 4.977024e+01 5.290216 0.5933078 3.066058
## 5.722368e+01 5.377614 0.5909603 3.148946
## 6.579332e+01 5.467783 0.5888058 3.234788
## 7.564633e+01 5.558858 0.5868856 3.325839
## 8.697490e+01 5.650548 0.5851363 3.420268
## 1.000000e+02 5.740851 0.5835907 3.512782
## 1.149757e+02 5.829651 0.5821918 3.602386
## 1.321941e+02 5.915072 0.5809645 3.687809
## 1.519911e+02 5.997305 0.5798593 3.769146
## 1.747528e+02 6.074772 0.5788934 3.844183
## 2.009233e+02 6.147983 0.5780285 3.914655
## 2.310130e+02 6.215730 0.5772785 3.979519
## 2.656088e+02 6.278770 0.5766078 4.039530
## 3.053856e+02 6.336244 0.5760276 4.094808
## 3.511192e+02 6.389046 0.5755099 4.146337
## 4.037017e+02 6.436605 0.5750634 4.193217
## 4.641589e+02 6.479851 0.5746658 4.235766
## 5.336699e+02 6.518427 0.5743235 4.273727
## 6.135907e+02 6.553218 0.5740192 4.308083
## 7.054802e+02 6.584014 0.5737577 4.338394
## 8.111308e+02 6.611611 0.5735254 4.365433
## 9.326033e+02 6.635892 0.5733261 4.389401
## 1.072267e+03 6.657542 0.5731492 4.410706
## 1.232847e+03 6.676502 0.5729977 4.429306
## 1.417474e+03 6.693342 0.5728648 4.445814
## 1.629751e+03 6.708036 0.5727492 4.460387
## 1.873817e+03 6.721048 0.5726467 4.473265
## 2.154435e+03 6.732370 0.5725590 4.484453
## 2.477076e+03 6.742374 0.5724813 4.494321
## 2.848036e+03 6.751060 0.5724148 4.502941
## 3.274549e+03 6.758721 0.5723560 4.510554
## 3.764936e+03 6.765362 0.5723056 4.517148
## 4.328761e+03 6.771343 0.5722611 4.523110
## 4.977024e+03 6.800911 0.6067236 4.552676
## 5.722368e+03 6.810710 NaN 4.562012
## 6.579332e+03 6.810710 NaN 4.562012
## 7.564633e+03 6.810710 NaN 4.562012
## 8.697490e+03 6.810710 NaN 4.562012
## 1.000000e+04 6.810710 NaN 4.562012
##
## Tuning parameter 'alpha' was held constant at a value of 0
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were alpha = 0 and lambda = 1.
ridge.cv.error = 4.34
coef(ridge.fit.bost$finalModel, ridge.fit.bost$bestTune$lambda)
## 14 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 0.267894814
## zn 0.028178313
## indus -0.073328379
## chas -1.093209480
## nox -1.121283654
## rm -0.431817343
## age -0.002141736
## dis -0.435124734
## rad 0.306261724
## tax 0.004623799
## ptratio -0.039136060
## black 0.002839402
## lstat 0.356944500
## medv -0.026296716
final.ridge.bost = glmnet(x_train_bost, y_train_bost, alpha = 0, lambda = ridge.fit.bost$bestTune$lambda) #alpha = 0 fits ridge regression
ridge.preds.bost = predict(final.ridge.bost, newx = x_test_bost)
ridge.mean.error.bost = mean((Boston.test$crim - ridge.preds.bost)^2)
ridge.mean.error.bost
## [1] 63.79812
ridge.RMSE.bost = RMSE(ridge.preds.bost, Boston.test$crim)
ridge.RMSE.bost
## [1] 7.987373
ridge.test.r2.bost = R2(ridge.preds.bost, Boston.test$crim)
ridge.test.r2.bost
## s0
## [1,] 0.3143458
PCR
set.seed(8)
pcr.fit.bost = train(crim ~., data=Boston.train, scale = TRUE, trControl = train_control_b, tuneLength=14, method = 'pcr')
plot(pcr.fit.bost)
pcr.fit.bost$bestTune
## ncomp
## 9 9
print(pcr.fit.bost)
## Principal Component Analysis
##
## 253 samples
## 13 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 228, 228, 229, 228, 227, 229, ...
## Resampling results across tuning parameters:
##
## ncomp RMSE Rsquared MAE
## 1 5.059584 0.5387187 3.215446
## 2 5.046589 0.5463093 3.281501
## 3 4.821077 0.5987237 2.866363
## 4 4.771006 0.6079421 2.811514
## 5 4.776528 0.6066792 2.811613
## 6 4.658189 0.6255412 2.865325
## 7 4.674648 0.6314719 2.975392
## 8 4.592278 0.6461243 2.850103
## 9 4.486453 0.6722929 2.847627
## 10 4.490941 0.6823706 2.864746
## 11 4.488325 0.6850498 2.857102
## 12 4.487390 0.6844183 2.813741
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 9.
pcr.cv.error = 4.48
summary(pcr.fit.bost$finalModel)
## Data: X dimension: 253 13
## Y dimension: 253 1
## Fit method: svdpc
## Number of components considered: 9
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps
## X 49.66 62.49 71.12 78.46 84.72 89.43 92.32
## .outcome 40.11 41.35 44.97 45.44 45.45 48.47 49.47
## 8 comps 9 comps
## X 94.26 96.04
## .outcome 51.14 54.19
coef(pcr.fit.bost$finalModel, pcr.fit.bost$bestTune$ncomp)
## , , 9 comps
##
## .outcome
## zn 0.5469047
## indus -2.0672558
## chas -0.2685570
## nox 0.2376067
## rm -0.8034521
## age 0.6829360
## dis -0.6410346
## rad 2.9174962
## tax 1.7255229
## ptratio -0.2899265
## black 0.2073022
## lstat 1.9114809
## medv -0.2821096
pcr.pred.bost = predict(pcr.fit.bost, Boston.test, ncomp=9)
pcr.mean.err.bost = mean((Boston.test$crim - pcr.pred.bost)^2)
pcr.mean.err.bost
## [1] 62.816
pcr.RMSE.bost = RMSE(pcr.pred.bost, Boston.test$crim)
pcr.RMSE.bost
## [1] 7.925655
pcr.test.rsquared.bost = R2(pcr.pred.bost, Boston.test$crim)
pcr.test.rsquared.bost
## [1] 0.3121689
11B.
data.frame(method = c("OLS", "Best Subset", "Lasso", "Ridge", "PCR"),
CV_RMSE = c(lin.cv.error, bestsub.cv.error, lasso.cv.error, ridge.cv.error, pcr.cv.error),
MSE = c(linear.mean.err.bost, bestsub.mean.err.bost, lasso.mean.error.bost,
ridge.mean.error.bost, pcr.mean.err.bost),
RMSE = c(lin.RMSE.bost, bestsub.RMSE.bost, lasso.RMSE.bost, ridge.RMSE.bost, pcr.RMSE.bost),
test.rsquared = c(lin.test.r2.bost, bestsub.test.r2.bost, lasso.test.r2.bost,
ridge.test.r2.bost, pcr.test.rsquared.bost))
## method CV_RMSE MSE RMSE test.rsquared
## 1 OLS 4.44 65.15022 8.071568 0.2888541
## 2 Best Subset 4.32 64.93419 8.058175 0.2909503
## 3 Lasso 4.28 63.23433 7.952001 0.3127097
## 4 Ridge 4.34 63.79812 7.987373 0.3143458
## 5 PCR 4.48 62.81600 7.925655 0.3121689
All of the models perform similarly, and there is no substantial difference in their performance. Looking at the cross-validation RMSE, the lasso model had the lowest error and PCR had the highest. Ridge regression has the highest test r-squared (0.3143458) and the OLS regression has the lowest test r-squared (0.2888541).
11C.
My chosen model is the ridge regression because it has the highest test r-squared. The ridge regression kept all of the variables: the ridge penalty shrinks coefficients toward zero but never forces any of them to be exactly zero, so the final ridge model involves all of the features.
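A quick check of this claim, using the Boston training matrices defined earlier (the object names ridge.check and lasso.check are just illustrative): at the same penalty value, ridge keeps every coefficient non-zero, while the lasso zeroes most of them out.
ridge.check = glmnet(x_train_bost, y_train_bost, alpha = 0, lambda = 1)
lasso.check = glmnet(x_train_bost, y_train_bost, alpha = 1, lambda = 1)
sum(coef(ridge.check)[-1] != 0)   # all 13 predictors are retained by ridge
sum(coef(lasso.check)[-1] != 0)   # only a few predictors are retained by the lasso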