library(ISLR2)
college <- College
##(a) Split the data set into a training set and a test set.
set.seed(1)
test.idx <- sample(1:nrow(college), size = 0.3*nrow(college))  # hold out 30% of rows as the test set
train <- college[-test.idx, ]
test <- college[test.idx, ]
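# A quick sanity check on the split: roughly 70% of the 777 colleges should land in the
# training set and 30% in the test set.
dim(train)
dim(test)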
##(b) Fit a linear model using least squares on the training set, and report the test error obtained.
lmfit <- lm(Apps ~ ., data = train)
lmpred <- predict(lmfit, test)
mean((lmpred - test$Apps)^2)
## [1] 1740793
# The test MSE for the least squares fit is 1740793.
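# The MSE is easier to interpret on the scale of the response: its square root is the
# root-mean-squared test error, roughly 1319 applications here.
sqrt(mean((lmpred - test$Apps)^2))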
##(c) Fit a ridge regression model on the training set, with λ chosen by cross-validation. Report the test error obtained.
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-4
x.tr <- model.matrix(Apps ~ ., train)    # training design matrix
x.test <- model.matrix(Apps ~ ., test)   # test design matrix
cv.out <- cv.glmnet(x.tr, train$Apps, alpha = 0)  # alpha = 0 fits ridge regression
bestlam <- cv.out$lambda.min
bestlam
## [1] 327.157
ridge <- glmnet(x.tr, train$Apps, alpha = 0)
ridgepred <- predict(ridge, s = bestlam, newx = x.test)
mean((ridgepred - test$Apps)^2)
## [1] 3041114
# The ridge test MSE is 3041114, noticeably worse than least squares on this split.
# (cv.glmnet was run without a seed here, so the selected lambda, and hence this MSE, can vary between runs.)
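# glmnet also reports lambda.1se, the largest lambda whose CV error is within one standard
# error of the minimum; a sketch of the same prediction step with that more heavily
# regularized choice (the resulting MSE is not reported here):
ridgepred.1se <- predict(ridge, s = cv.out$lambda.1se, newx = x.test)
mean((ridgepred.1se - test$Apps)^2)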
##(d) Fit a lasso model on the training set, with λ chosen by cross-validation. Report the test error obtained, along with the number of non-zero coefficient estimates.
x.tr <- model.matrix(Apps ~., train)
y.tr <- train$Apps
x.test <- model.matrix(Apps ~., test)
y.test <- test$Apps
set.seed(1)
cv.out <- cv.glmnet(x.tr, y.tr, alpha=1)
bestlam <- cv.out$lambda.min
bestlam
## [1] 25.92663
lasso <- glmnet(x.tr, y.tr, alpha = 1)   # alpha = 1 fits the lasso
lassopred <- predict(lasso, s = bestlam, newx = x.test)
mean((lassopred - test$Apps)^2)
## [1] 1846253
# The lasso test MSE is 1846253; the number of non-zero coefficients is counted below.
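# Part (d) also asks for the number of non-zero coefficient estimates. A minimal sketch,
# reusing cv.out and bestlam from above (the exact count depends on the lambda selected
# by cross-validation):
lasso.coef <- predict(cv.out, type = "coefficients", s = bestlam)[, 1]
sum(lasso.coef != 0)                       # number of non-zero estimates (intercept included)
names(lasso.coef)[lasso.coef != 0]         # which terms survive the shrinkage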
##(e) Fit a PCR model on the training set, with M chosen by cross-validation. Report the test error obtained, along with the value of M selected by cross-validation.
library(pls)
##
## Attaching package: 'pls'
## The following object is masked from 'package:stats':
##
## loadings
set.seed(1)
pcrfit <- pcr(Apps ~ ., data = train, scale = TRUE, validation = "CV")
summary(pcrfit)
## Data: X dimension: 544 17
## Y dimension: 544 1
## Fit method: svdpc
## Number of components considered: 17
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 3514 3535 1725 1716 1415 1285 1292
## adjCV 3514 3535 1722 1723 1409 1266 1287
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 1237 1214 1202 1205 1197 1209 1213
## adjCV 1232 1210 1200 1203 1193 1205 1209
## 14 comps 15 comps 16 comps 17 comps
## CV 1215 1224 1084 1091
## adjCV 1211 1223 1080 1085
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps 8 comps
## X 31.8106 57.40 64.08 69.93 75.19 80.22 84.02 87.55
## Apps 0.5794 76.64 77.24 85.07 87.46 87.56 88.33 88.85
## 9 comps 10 comps 11 comps 12 comps 13 comps 14 comps 15 comps
## X 90.67 93.11 95.14 96.95 98.00 98.86 99.39
## Apps 89.18 89.22 89.41 89.46 89.47 89.51 89.54
## 16 comps 17 comps
## X 99.83 100.00
## Apps 91.82 92.16
validationplot(pcrfit , val.type = "MSEP")
pcrpred <- predict(pcrfit, test, ncomp = 9)   # M = 9, where the CV error levels off
mean((pcrpred - y.test)^2)
## [1] 4003530
# The test MSE is 4003530 with M = 9, chosen where the CV error flattens out (the strict
# CV minimum is at 16 components, which is nearly the full least squares fit).
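# For comparison, a sketch of the same prediction step at the CV-minimizing choice of
# 16 components (with M = 16 of 17, PCR is essentially back to least squares):
pcrpred16 <- predict(pcrfit, test, ncomp = 16)
mean((pcrpred16 - y.test)^2)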
##(f) Fit a PLS model on the training set, with M chosen by cross-validation. Report the test error obtained, along with the value of M selected by cross-validation.
library(pls)
set.seed(1)
plsfit <- plsr(Apps ~ ., data = train, scale = TRUE, validation = "CV")
summary(plsfit)
## Data: X dimension: 544 17
## Y dimension: 544 1
## Fit method: kernelpls
## Number of components considered: 17
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 3514 1552 1163 1173 1158 1133 1102
## adjCV 3514 1548 1157 1171 1152 1124 1095
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 1092 1091 1088 1088 1090 1091 1091
## adjCV 1086 1085 1083 1082 1085 1086 1085
## 14 comps 15 comps 16 comps 17 comps
## CV 1091 1091 1091 1091
## adjCV 1085 1085 1085 1085
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps 8 comps
## X 25.51 34.63 62.83 66.57 69.80 73.56 77.21 81.02
## Apps 81.56 88.94 89.77 90.64 91.51 91.96 92.06 92.08
## 9 comps 10 comps 11 comps 12 comps 13 comps 14 comps 15 comps
## X 83.27 85.14 87.90 90.76 93.56 96.06 97.63
## Apps 92.11 92.14 92.15 92.16 92.16 92.16 92.16
## 16 comps 17 comps
## X 99.18 100.00
## Apps 92.16 92.16
validationplot(plsfit , val.type = "MSEP")
plspred <- predict(plsfit, test, ncomp = 10)   # CV error is essentially flat from about 9-10 components
mean((plspred - y.test)^2)
## [1] 1750013
# The test MSE is 1750013 with M = 10.
##(g) Comment on the results obtained. How accurately can we predict the number of college applications received? Is there much difference among the test errors resulting from these five approaches?
# Ranked by test MSE, best to worst:
#   1. least squares (1740793)
#   2. PLS           (1750013)
#   3. lasso         (1846253)
#   4. ridge         (3041114)
#   5. PCR, M = 9    (4003530)
# Least squares, PLS, and the lasso are close together, so the best models predict the
# number of applications with a root-mean-squared error of roughly 1300 applications.
# Ridge and especially PCR do noticeably worse on this particular split.
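# A compact way to see the comparison, collecting the five test MSEs computed above into
# one named vector (a sketch; it just recomputes each MSE from the stored predictions):
test.mse <- c(
  lm    = mean((lmpred    - test$Apps)^2),
  ridge = mean((ridgepred - test$Apps)^2),
  lasso = mean((lassopred - test$Apps)^2),
  pcr   = mean((pcrpred   - test$Apps)^2),
  pls   = mean((plspred   - test$Apps)^2)
)
sort(test.mse)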
##(a) Try out some of the regression methods explored in this chapter, such as best subset selection, the lasso, ridge regression, and PCR, to predict per capita crime rate (crim) in the Boston data set.
boston <- Boston
set.seed(1)
test.idx <- sample(1:nrow(boston), size = 0.3*nrow(boston))  # same 70/30 split as before
train <- boston[-test.idx, ]
test <- boston[test.idx, ]
library(glmnet)
x.tr <- model.matrix(crim ~ ., train)    # training design matrix
x.test <- model.matrix(crim ~ ., test)   # test design matrix
cv.out <- cv.glmnet(x.tr, train$crim, alpha = 0)   # ridge regression
bestlam <- cv.out$lambda.min
ridge <- glmnet(x.tr, train$crim, alpha = 0)
ridgepred <- predict(ridge, s = bestlam, newx = x.test)
MSEridge <- mean((ridgepred - test$crim)^2)
MSEridge
## [1] 55.46288
rootMSE <- sqrt(MSEridge)
rootMSE
## [1] 7.447341
plot(cv.out)
x.tr <- model.matrix(crim ~., train)
y.tr <- train$crim
x.test <- model.matrix(crim ~., test)
y.test <- test$crim
set.seed(1)
grid=10^seq(10,-2,length=100)
cv.out <- cv.glmnet(x.tr, y.tr, alpha=1)
bestlam <- cv.out$lambda.min
lasso <- glmnet(x.tr, y.tr, alpha = 1)
lassopred <- predict(lasso,s=bestlam,newx = x.test)
MSElasso <- mean((lassopred - test$crim)^2)
MSElasso
## [1] 54.74424
rootMSE <- sqrt(MSElasso)
rootMSE
## [1] 7.398935
plot(cv.out)
library(pls)
set.seed(1)
pcrfit <- pcr(crim ~ ., data = train, scale = TRUE, validation = "CV")
summary(pcrfit)
## Data: X dimension: 355 12
## Y dimension: 355 1
## Fit method: svdpc
## Number of components considered: 12
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 8.08 6.796 6.789 6.413 6.414 6.362 6.342
## adjCV 8.08 6.793 6.786 6.405 6.409 6.356 6.335
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps
## CV 6.211 6.262 6.236 6.219 6.208 6.163
## adjCV 6.203 6.253 6.226 6.210 6.196 6.151
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps 8 comps
## X 49.93 63.55 73.09 80.42 87.06 90.28 92.85 95.07
## crim 30.14 30.66 38.60 38.60 39.91 40.58 42.88 42.90
## 9 comps 10 comps 11 comps 12 comps
## X 96.87 98.33 99.51 100.00
## crim 43.48 43.89 44.76 45.75
validationplot(pcrfit , val.type = "MSEP")
pcrpred <- predict(pcrfit, test, ncomp = 7)   # M = 7: close to the lowest CV error with far fewer than all 12 components
MSEpcr <- mean((pcrpred - y.test)^2)
MSEpcr
## [1] 57.31212
rootMSE <- sqrt(MSEpcr)
rootMSE
## [1] 7.570477
##(b) Propose a model (or set of models) that seem to perform well on this data set, and justify your answer. Make sure that you are evaluating model performance using validation set error, cross-validation, or some other reasonable alternative, as opposed to using training error.
# Coefficients of the lasso at the lambda selected by cross-validation above
predict(cv.out, type = "coefficients", s = bestlam)
## 14 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 6.54479074
## (Intercept) .
## zn 0.03536269
## indus -0.05554954
## chas -0.88096081
## nox -6.24186702
## rm 0.21909432
## age .
## dis -0.72136600
## rad 0.50486034
## tax .
## ptratio -0.10572140
## lstat 0.15571559
## medv -0.12416743
# The lasso model does not use all of the features: only 10 of the 12 predictors receive
# non-zero coefficients, with age and tax shrunk exactly to zero (the dots above), because
# at the cross-validated lambda the penalty drives them out of the model. Since the lasso
# also has the lowest validation-set MSE of the three fits (54.7, versus 55.5 for ridge and
# 57.3 for PCR), it is a reasonable model to propose: it predicts about as well as the
# alternatives while using fewer variables.
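# To back up the comparison, put the three validation-set MSEs side by side and count the
# non-zero lasso coefficients (a sketch reusing objects defined above):
c(ridge = MSEridge, lasso = MSElasso, pcr = MSEpcr)
lasso.coef <- predict(cv.out, type = "coefficients", s = bestlam)[, 1]
sum(lasso.coef[names(lasso.coef) != "(Intercept)"] != 0)   # non-zero predictor coefficients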