knitr::opts_chunk$set(echo = TRUE)
##A) Less flexible, and hence it will give improved prediction accuracy when its increase in bias is less than its decrease in variance. The lasso is a more restrictive model, so it can reduce overfitting and the variance of the predictions. As long as its added constraint does not introduce too much bias, it will outperform least squares, which may otherwise fit coefficients to spurious predictors.
##B) Less flexible, and hence it will give improved prediction accuracy when its increase in bias is less than its decrease in variance. Ridge regression is not as restrictive as the lasso (it shrinks coefficients toward zero but never sets them exactly to zero), yet it is still more restrictive than least squares, so the same reasoning as in (a) applies.
##C) More flexible, and hence it will give improved prediction accuracy when its increase in variance is less than its decrease in bias. Non-linear methods are generally more flexible than least squares and perform better when the linearity assumption is strongly violated. Because their fits are more sensitive to the underlying data, they carry more variance, so they need a substantial drop in bias to perform well.
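##The following is a small simulated example (not part of the exercise; the data and coefficients are made up) illustrating the point in (a): when most predictors are irrelevant, the lasso's added bias is outweighed by its reduction in variance, so its test MSE beats least squares.
library(glmnet)
set.seed(1)
n <- 100; p <- 50
x <- matrix(rnorm(n * p), n, p)
y <- 2 * x[, 1] - 1.5 * x[, 2] + rnorm(n)   # only 2 of the 50 predictors matter
x.new <- matrix(rnorm(n * p), n, p)         # independent test set
y.new <- 2 * x.new[, 1] - 1.5 * x.new[, 2] + rnorm(n)
ols <- lm(y ~ x)
ols.pred <- cbind(1, x.new) %*% coef(ols)   # least squares test predictions
lasso <- cv.glmnet(x, y, alpha = 1)         # lambda chosen by cross-validation
lasso.pred <- predict(lasso, newx = x.new, s = "lambda.min")
c(OLS = mean((y.new - ols.pred)^2), Lasso = mean((y.new - lasso.pred)^2))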
library(ISLR2)
##(a) Split the data set into a training set and a test set.
train=sample(c(TRUE,FALSE),nrow(College),replace=TRUE) # note: no seed is set, so this split (and all results below) will vary between runs
test=(!train)
College.train=College[train,,drop=F]
College.test=College[test,,drop=F]
##(b) Fit a linear model using least squares on the training set, and report the test error obtained.
lm.fit=lm(Apps~.,data=College.train)
summary(lm.fit)
##
## Call:
## lm(formula = Apps ~ ., data = College.train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3649.9 -374.8 4.2 303.2 7814.9
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -581.71789 549.84960 -1.058 0.29078
## PrivateYes -407.58384 191.08269 -2.133 0.03359 *
## Accept 1.75321 0.04888 35.866 < 2e-16 ***
## Enroll -1.45165 0.25488 -5.695 2.55e-08 ***
## Top10perc 35.62531 8.16562 4.363 1.68e-05 ***
## Top25perc -6.32476 6.25467 -1.011 0.31259
## F.Undergrad 0.10884 0.04442 2.450 0.01475 *
## P.Undergrad 0.04614 0.03953 1.167 0.24399
## Outstate -0.09430 0.02689 -3.507 0.00051 ***
## Room.Board 0.16669 0.06663 2.502 0.01280 *
## Books -0.02771 0.34164 -0.081 0.93540
## Personal 0.03516 0.08060 0.436 0.66291
## PhD -6.11693 6.30790 -0.970 0.33283
## Terminal -6.62848 6.97225 -0.951 0.34239
## S.F.Ratio 23.33220 16.88668 1.382 0.16792
## perc.alumni 7.19484 5.89253 1.221 0.22287
## Expend 0.07974 0.01786 4.464 1.07e-05 ***
## Grad.Rate 5.23686 4.01360 1.305 0.19280
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1007 on 363 degrees of freedom
## Multiple R-squared: 0.9431, Adjusted R-squared: 0.9405
## F-statistic: 354.1 on 17 and 363 DF, p-value: < 2.2e-16
pred=predict(lm.fit,College.test)
rss=sum((pred-College.test$Apps)^2)
tss=sum((College.test$Apps-mean(College.test$Apps))^2)
test.rsq=1-(rss/tss)
test.rsq
## [1] 0.9033761
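##The exercise asks for the test error; as a small addition, the same predictions can also be summarized as a test MSE/RMSE rather than a test R^2 (a sketch using the objects already created above):
test.mse <- mean((pred - College.test$Apps)^2)   # mean squared prediction error on the test set
sqrt(test.mse)                                   # RMSE, in units of applications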
##(c) Fit a ridge regression model on the training set, with λ chosen by cross-validation. Report the test error obtained.
library(glmnet)
## Warning: package 'glmnet' was built under R version 4.2.3
## Loading required package: Matrix
## Loaded glmnet 4.1-7
College.train.X=scale(model.matrix(Apps~.,data=College.train)[,-1],scale=TRUE,center=TRUE)
College.train.Y=College.train$Apps
College.test.X=scale(model.matrix(Apps~.,data=College.test)[,-1],
attr(College.train.X,"scaled:center"),
attr(College.train.X,"scaled:scale"))
College.test.Y=College.test$Apps
cv.out=cv.glmnet(College.train.X,College.train.Y,alpha=0)
bestlam=cv.out$lambda.min
bestlam
## [1] 394.015
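##A quick sketch of how the cross-validation curve behind lambda.min can be inspected (uses the cv.out object fitted above):
plot(cv.out)   # CV MSE as a function of log(lambda) for the ridge fit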
ridge.mod=glmnet(College.train.X,College.train.Y,alpha=0,lambda=bestlam) # ridge regression: alpha = 0
pred=predict(ridge.mod,College.test.X,s=bestlam)
rss=sum((pred-College.test$Apps)^2)
tss=sum((College.test$Apps-mean(College.test$Apps))^2)
test.rsq=1-(rss/tss)
test.rsq
## [1] 0.9093068
##(d) Fit a lasso model on the training set, with λ chosen by cross-validation. Report the test error obtained, along with the number of non-zero coefficient estimates.
cv.out=cv.glmnet(College.train.X,College.train.Y,alpha=1)
bestlam=cv.out$lambda.min
bestlam
## [1] 2.10274
lasso.mod=glmnet(College.train.X,College.train.Y,alpha=1,lambda=bestlam)
pred=predict(lasso.mod,College.test.X,s=bestlam)
rss=sum((pred-College.test$Apps)^2)
tss=sum((College.test$Apps-mean(College.test$Apps))^2)
test.rsq=1-(rss/tss)
test.rsq
## [1] 0.9038741
#Number of coefficients equal to 0
sum(coef(lasso.mod)[,1]==0)
## [1] 0
names(coef(lasso.mod)[, 1][coef(lasso.mod)[, 1] == 0])
## character(0)
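##No coefficients are exactly zero at lambda.min here. As a sketch, sparsity along the whole lasso path, or at the more conservative 1-standard-error rule, could be inspected as follows (assumes cv.out above still holds the lasso cross-validation fit):
lasso.path <- glmnet(College.train.X, College.train.Y, alpha = 1)
plot(lasso.path, xvar = "lambda", label = TRUE)   # coefficient paths versus log(lambda)
coef.1se <- coef(cv.out, s = "lambda.1se")        # sparser fit at lambda.1se
sum(coef.1se[-1, ] == 0)                          # number of predictors dropped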
##(e) Fit a PCR model on the training set, with M chosen by cross-validation. Report the test error obtained, along with the value of M selected by cross-validation.
library(pls)
## Warning: package 'pls' was built under R version 4.2.3
##
## Attaching package: 'pls'
## The following object is masked from 'package:stats':
##
## loadings
set.seed(1)
pcr.fit=pcr(Apps~.,data=College.train, scale=TRUE, validation="CV")
summary(pcr.fit) # CV error is lowest at M = 17, i.e. when all components are used
## Data: X dimension: 381 17
## Y dimension: 381 1
## Fit method: svdpc
## Number of components considered: 17
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 4132 4155 2375 2382 2211 1935 1943
## adjCV 4132 4159 2368 2379 2167 1923 1933
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 1917 1809 1793 1791 1785 1787 1805
## adjCV 1920 1790 1781 1782 1775 1778 1798
## 14 comps 15 comps 16 comps 17 comps
## CV 1804 1587 1256 1150
## adjCV 1819 1552 1241 1140
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps 8 comps
## X 30.9351 55.65 63.48 69.41 75.02 79.58 83.49 87.11
## Apps 0.3736 69.05 69.44 77.05 81.26 81.30 81.58 84.01
## 9 comps 10 comps 11 comps 12 comps 13 comps 14 comps 15 comps
## X 90.32 92.93 94.99 96.78 97.83 98.64 99.36
## Apps 84.02 84.52 84.57 84.57 84.58 85.24 92.55
## 16 comps 17 comps
## X 99.83 100.00
## Apps 93.65 94.31
pred=predict(pcr.fit,College.test,ncomp=17)
rss=sum((pred-College.test$Apps)^2)
tss=sum((College.test$Apps-mean(College.test$Apps))^2)
test.rsq=1-(rss/tss)
test.rsq
## [1] 0.9033761
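##A sketch of how the cross-validated choice of M for PCR can be visualized (uses the pcr.fit object fitted above):
validationplot(pcr.fit, val.type = "MSEP")   # CV MSEP as a function of the number of components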
##(f) Fit a PLS model on the training set, with M chosen by cross-validation. Report the test error obtained, along with the value of M selected by cross-validation.
library(pls)
set.seed(1)
pls.fit=plsr(Apps~.,data=College.train, scale=TRUE, validation="CV")
summary(pls.fit) # CV error is essentially flat from about 9 components onward, so M = 9 is used
## Data: X dimension: 381 17
## Y dimension: 381 1
## Fit method: kernelpls
## Number of components considered: 17
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 4132 2160 1729 1691 1564 1359 1214
## adjCV 4132 2150 1699 1696 1536 1338 1201
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 1185 1171 1159 1156 1152 1149 1151
## adjCV 1175 1162 1149 1146 1142 1139 1141
## 14 comps 15 comps 16 comps 17 comps
## CV 1150 1150 1150 1150
## adjCV 1140 1140 1140 1140
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps 8 comps
## X 24.56 32.26 60.59 64.34 68.95 73.52 77.26 80.86
## Apps 75.63 86.27 87.39 91.30 93.25 93.91 94.01 94.10
## 9 comps 10 comps 11 comps 12 comps 13 comps 14 comps 15 comps
## X 82.48 85.75 87.78 89.92 93.06 94.23 97.13
## Apps 94.24 94.27 94.29 94.31 94.31 94.31 94.31
## 16 comps 17 comps
## X 98.40 100.00
## Apps 94.31 94.31
pred=predict(pls.fit,College.test,ncomp=9)
rss=sum((pred-College.test$Apps)^2)
tss=sum((College.test$Apps-mean(College.test$Apps))^2)
test.rsq=1-(rss/tss)
test.rsq
## [1] 0.9060727
##(g) Comment on the results obtained. How accurately can we predict the number of college applications received? Is there much difference among the test errors resulting from these five approaches?
##All five approaches predict the number of applications received quite accurately, with test R^2 values of roughly 0.90: least squares 0.903, ridge 0.909, lasso 0.904, PCR 0.903, and PLS 0.906. There is therefore little difference among the test errors. Ridge regression performed marginally best on this split. PCR required all M = 17 components, which makes it equivalent to least squares and offers no dimension reduction here, whereas PLS achieved essentially the same accuracy with only 9 components. At the cross-validated value of λ, the lasso shrank the coefficients but did not set any of them exactly to zero.
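##For a side-by-side view, the five test R^2 values reported above can be collected into a single vector (values copied from the outputs shown earlier; they will change with a different random split):
results <- c(OLS = 0.9034, Ridge = 0.9093, Lasso = 0.9039, PCR = 0.9034, PLS = 0.9061)
sort(results, decreasing = TRUE)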
##(a) Try out some of the regression methods explored in this chapter, such as best subset selection, the lasso, ridge regression, and PCR. Present and discuss results for the approaches that you consider.
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:ISLR2':
##
## Boston
data("Boston")
train <- sample(1:nrow(Boston), nrow(Boston)*0.70) # 70/30 split; again no seed is set, so results will vary between runs
test <- -train
y.test <- Boston$crim[test]
##Least Squares
lm.fit <- lm(crim~., data=Boston, subset=train)
summary(lm.fit)
##
## Call:
## lm(formula = crim ~ ., data = Boston, subset = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.464 -2.385 -0.483 0.974 74.451
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 20.572687 9.296673 2.213 0.027568 *
## zn 0.050093 0.023354 2.145 0.032665 *
## indus -0.047622 0.108818 -0.438 0.661932
## chas -0.652652 1.594414 -0.409 0.682550
## nox -13.622079 7.071091 -1.926 0.054882 .
## rm 0.470542 0.752745 0.625 0.532324
## age 0.003977 0.022766 0.175 0.861444
## dis -1.218832 0.365848 -3.332 0.000959 ***
## rad 0.610610 0.109344 5.584 4.81e-08 ***
## tax -0.005257 0.006584 -0.798 0.425204
## ptratio -0.319125 0.237077 -1.346 0.179173
## black -0.004493 0.004680 -0.960 0.337716
## lstat 0.096664 0.096140 1.005 0.315395
## medv -0.245278 0.078829 -3.112 0.002019 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.795 on 340 degrees of freedom
## Multiple R-squared: 0.4378, Adjusted R-squared: 0.4163
## F-statistic: 20.37 on 13 and 340 DF, p-value: < 2.2e-16
##Best Subset Selection
library(leaps)
## Warning: package 'leaps' was built under R version 4.2.3
best_subset <- regsubsets(crim ~ ., data=Boston, subset=train, nvmax=13)
best_subset_summary <- summary(best_subset)
# Pick the best model size using the validation-set approach
test_matrix <- model.matrix(crim~., data=Boston[test,])
val.errors <- rep(NA,13)
for(i in 1:13){
  coefi <- coef(best_subset, id=i)
  pred <- test_matrix[, names(coefi)] %*% coefi
  val.errors[i] <- mean((Boston$crim[test] - pred)^2)
}
which.min(val.errors)
## [1] 12
coef(best_subset, which.min(val.errors))
## (Intercept) zn indus chas nox
## 20.472011282 0.049521661 -0.046296575 -0.639677973 -13.343568597
## rm dis rad tax ptratio
## 0.500239621 -1.233831693 0.609954300 -0.005259068 -0.317914057
## black lstat medv
## -0.004444226 0.101162410 -0.245665204
# Refit best subset selection on the entire data set to obtain more accurate coefficient estimates
# (coefficients shown here for the 6-variable model)
best_subset_full <- regsubsets(crim ~ ., data=Boston, nvmax = 13)
coef(best_subset_full, 6)
## (Intercept) zn nox dis rad black
## 14.642639407 0.053963088 -9.238768232 -0.992810697 0.499838443 -0.008710565
## medv
## -0.195989936
# Choose the model using cross-validation and the best subset method
# Prediction method for regsubsets objects (leaps does not provide one)
predict.regsubsets <- function(object, newdata, id, ...){
  form <- as.formula(object$call[[2]])   # recover the model formula
  mat <- model.matrix(form, newdata)     # design matrix for the new data
  coefi <- coef(object, id=id)           # coefficients of the id-variable model
  xvars <- names(coefi)
  mat[, xvars] %*% coefi                 # predictions
}
k <- 10
set.seed(2)
folds <- sample(1:k,nrow(Boston), replace=TRUE)
cv.errors <- matrix(NA,k,13, dimnames=list(NULL, paste(1:13)))
for(j in 1:k){
  best.fit=regsubsets(crim~., data=Boston[folds!=j, ], nvmax=13)
  for(i in 1:13) {
    pred=predict(best.fit, Boston[folds==j, ], id=i)
    cv.errors[j,i]=mean((Boston$crim[folds==j]-pred)^2)
  }
}
mean.cv.errors <- apply(cv.errors,2,mean)
mean.cv.errors
## 1 2 3 4 5 6 7 8
## 44.45496 42.50923 44.07822 44.55039 44.07080 43.85521 43.25223 43.11403
## 9 10 11 12 13
## 42.69129 42.56528 42.83967 42.88256 42.92730
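##A sketch of how these cross-validation errors can be visualized to judge the choice of model size:
plot(mean.cv.errors, type = "b", xlab = "Number of predictors", ylab = "Mean CV MSE")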
best_subset_cv <- regsubsets(crim~., data=Boston, nvmax=13) # refit on the full data set
coef(best_subset_cv, 12) # coefficients of the 12-variable model, matching the validation-set choice above
## (Intercept) zn indus chas nox
## 16.985713928 0.044673247 -0.063848469 -0.744367726 -10.202169211
## rm dis rad tax ptratio
## 0.439588002 -0.993556631 0.587660185 -0.003767546 -0.269948860
## black lstat medv
## -0.007518904 0.128120290 -0.198877768
##Backward Stepwise
bwd <- regsubsets(crim~., data=Boston, subset=train, nvmax=13, method="backward")
test_matrix <- model.matrix(crim~., data=Boston[test,])
val.errors <- rep(NA,13)
for(i in 1:13){
  coefi <- coef(bwd, id=i)
  pred <- test_matrix[, names(coefi)] %*% coefi
  val.errors[i] <- mean((Boston$crim[test] - pred)^2)
}
which.min(val.errors)
## [1] 12
bwd_mse <- val.errors[8] # validation-set MSE of the 8-variable model
bwd_mse
## [1] 32.0661
bwd_full <- regsubsets(crim ~ ., data=Boston, nvmax = 13) # refit on the full data set
coef(bwd_full, 8) # coefficients of the 8-variable model
## (Intercept) zn nox dis rad
## 19.683127801 0.043293393 -12.753707757 -0.918318253 0.532616533
## ptratio black lstat medv
## -0.310540942 -0.007922426 0.110173124 -0.174207166
##Lasso Regression
x <- model.matrix(crim~., Boston)[, -1]
y <- Boston$crim
lasso.fit <- glmnet(x[train, ], y[train], alpha=1)
cv.lasso.fit <- cv.glmnet(x[train, ], y[train], alpha=1)
plot(cv.lasso.fit)
bestlam.lasso <- cv.lasso.fit$lambda.min
bestlam.lasso
## [1] 0.02254785
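##The lasso test error is not computed above; a minimal sketch, analogous to the ridge calculation below and using the same validation split:
lasso.pred <- predict(lasso.fit, s = bestlam.lasso, newx = x[test, ])
lasso.error <- mean((lasso.pred - y[test])^2)
lasso.error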
##Ridge Regression
ridge.fit <- glmnet(x[train, ], y[train], alpha=0)
cv.ridge.fit <- cv.glmnet(x[train, ], y[train], alpha=0)
plot(cv.ridge.fit)
bestlam.ridge <- cv.ridge.fit$lambda.min
bestlam.ridge
## [1] 0.5456868
ridge.pred <- predict(ridge.fit, s=bestlam.ridge, newx=x[test,])
ridge.error <- mean((ridge.pred-y[test])^2)
ridge.error
## [1] 31.01025
##(b) Propose a model (or set of models) that seem to perform well on this data set, and justify your answer. Make sure that you are evaluating model performance using validation set error, cross-validation, or some other reasonable alternative, as opposed to using training error.
##Based on the validation-set errors computed above, ridge regression performs well (test MSE of roughly 31.0), and the 8-variable model from backward stepwise selection is close behind (test MSE of roughly 32.1) while being much simpler. The validation-set approach places its minimum at 12 variables, and the 10-fold cross-validation errors for best subset selection are relatively flat across model sizes, so a moderately sized subset model such as the 8-variable one is a reasonable choice.
##(c) Does your chosen model involve all of the features in the data set? Why or why not?
##No. The 8-variable model uses only zn, nox, dis, rad, ptratio, black, lstat, and medv. Predictors such as indus, chas, age, rm, and tax contribute little once the others are included (their coefficients are far from significant in the least squares fit above), so dropping them reduces variance without a meaningful increase in bias.
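##As a sketch, the 8-predictor model identified above could be refit with lm() and its validation-set MSE checked directly (predictor names taken from the coef() output above):
fit8 <- lm(crim ~ zn + nox + dis + rad + ptratio + black + lstat + medv,
           data = Boston, subset = train)
mean((predict(fit8, Boston[test, ]) - Boston$crim[test])^2)   # validation-set MSE of the 8-variable model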