library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.1 ✔ purrr 1.0.1
## ✔ tibble 3.1.8 ✔ dplyr 1.1.0
## ✔ tidyr 1.3.0 ✔ stringr 1.5.0
## ✔ readr 2.1.4 ✔ forcats 1.0.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(ISLR)
library(glmnet)
## Warning: package 'glmnet' was built under R version 4.2.3
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
## Loaded glmnet 4.1-6
library(pls)
## Warning: package 'pls' was built under R version 4.2.3
##
## Attaching package: 'pls'
##
## The following object is masked from 'package:stats':
##
## loadings
library(MASS)
##
## Attaching package: 'MASS'
##
## The following object is masked from 'package:dplyr':
##
## select
library(leaps)
## Warning: package 'leaps' was built under R version 4.2.3
The lasso is less flexible than least squares and hence will give improved prediction accuracy when its increase in bias is less than its decrease in variance.
Ridge regression is likewise less flexible, and will give improved prediction accuracy when its increase in bias is less than its decrease in variance.
Non-linear methods are more flexible, and will give improved prediction accuracy when their increase in variance is less than their decrease in bias.
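As a quick illustration of this tradeoff (a hypothetical simulation, not part of the exercise; x.sim and y.sim are made-up data), fitting ridge over its lambda path shows the coefficients shrinking as the model becomes less flexible:
# Hypothetical demo: heavier shrinkage (larger lambda) trades variance for bias
set.seed(42)
x.sim <- matrix(rnorm(100 * 20), 100, 20)
y.sim <- drop(x.sim %*% rnorm(20)) + rnorm(100, sd = 2)
ridge.sim <- glmnet(x.sim, y.sim, alpha = 0)
plot(ridge.sim, xvar = "lambda") # coefficients shrink toward 0 as lambda grows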
set.seed(1)
split <- sample(nrow(College), nrow(College) * 0.5)
train <- College[split, ]
test <- College[-split, ]
lm.college <- lm(Apps ~ .,data = train)
summary(lm.college)
##
## Call:
## lm(formula = Apps ~ ., data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5741.2 -479.5 15.3 359.6 7258.0
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7.902e+02 6.381e+02 -1.238 0.216410
## PrivateYes -3.070e+02 2.006e+02 -1.531 0.126736
## Accept 1.779e+00 5.420e-02 32.830 < 2e-16 ***
## Enroll -1.470e+00 3.115e-01 -4.720 3.35e-06 ***
## Top10perc 6.673e+01 8.310e+00 8.030 1.31e-14 ***
## Top25perc -2.231e+01 6.533e+00 -3.415 0.000708 ***
## F.Undergrad 9.269e-02 5.529e-02 1.676 0.094538 .
## P.Undergrad 9.397e-03 5.493e-02 0.171 0.864275
## Outstate -1.084e-01 2.700e-02 -4.014 7.22e-05 ***
## Room.Board 2.115e-01 7.224e-02 2.928 0.003622 **
## Books 2.912e-01 3.985e-01 0.731 0.465399
## Personal 6.133e-03 8.803e-02 0.070 0.944497
## PhD -1.548e+01 6.681e+00 -2.316 0.021082 *
## Terminal 6.415e+00 7.290e+00 0.880 0.379470
## S.F.Ratio 2.283e+01 2.047e+01 1.115 0.265526
## perc.alumni 1.134e+00 6.083e+00 0.186 0.852274
## Expend 4.857e-02 1.619e-02 2.999 0.002890 **
## Grad.Rate 7.490e+00 4.397e+00 1.703 0.089324 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1083 on 370 degrees of freedom
## Multiple R-squared: 0.9389, Adjusted R-squared: 0.9361
## F-statistic: 334.3 on 17 and 370 DF, p-value: < 2.2e-16
testpredict <- predict(lm.college,test)
error <- mean((test$Apps-testpredict)^2)
error
## [1] 1135758
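Since MSE is in squared units of the response, the RMSE is easier to read on the scale of application counts (a small follow-up, not in the original output):
sqrt(error) # about 1,066 applications of typical error, from the MSE above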
trainmatrix <- model.matrix(Apps ~ ., data = train)
testmatrix <- model.matrix(Apps ~ ., data = test)
grid <- 10^seq(4, -2, length = 50)
ridgeregression <- glmnet(trainmatrix, train$Apps, alpha = 0, lambda = grid)
cvridge <- cv.glmnet(trainmatrix, train$Apps, alpha = 0, lambda = grid)
bestlambda <- cvridge$lambda.min
bestlambda
## [1] 0.01
testridge <- predict(ridgeregression, s = bestlambda, newx = testmatrix)
mean((test$Apps-testridge)^2)
## [1] 1134666
Our ridge regression's test MSE is only about 1,100 lower than the linear model's. That is essentially the same fit, which makes sense given how small the selected lambda is.
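As a sanity check on the chosen lambda (an added plot, not in the original), cv.glmnet objects have a plot method showing CV error across the grid:
# CV error vs. log(lambda); note lambda.min (0.01) sits at the lower edge of our grid,
# so the ridge penalty is doing almost nothing here
plot(cvridge)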
lasso <- glmnet(trainmatrix, train$Apps, alpha = 1, lambda = grid)
cvlasso <- cv.glmnet(trainmatrix, train$Apps, alpha = 1, lambda = grid)
bestlambda.lasso <- cvlasso$lambda.min
bestlambda.lasso
## [1] 0.01
testlasso <- predict(lasso, s = bestlambda.lasso, newx = testmatrix)
mean((test$Apps-testlasso)^2)
## [1] 1115901
The lasso's test MSE is smaller than our previous two models' by roughly 19,000 to 20,000.
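A natural follow-up (not in the original output) is to check whether the lasso actually zeroed out any coefficients at this lambda:
# Zero entries indicate predictors the lasso dropped
predict(lasso, s = bestlambda.lasso, type = "coefficients")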
pcr.college <- pcr(Apps~., data = train, scale = TRUE, validation ="CV")
validationplot(pcr.college,val.type = "MSEP")
testpcr <- predict(pcr.college,test,ncomp = 17)
mean((test$Apps-testpcr)^2)
## [1] 1135758
With ncomp = 17 we are keeping every component, so PCR reduces to ordinary least squares; that is why its test MSE exactly matches the linear model's.
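If we wanted PCR to genuinely differ from least squares, we would pick fewer components off the validation plot; pls::selectNcomp() can automate that choice (a sketch using the one-sigma rule, one of its available methods):
# Smallest ncomp whose CV error is within one standard error of the minimum
selectNcomp(pcr.college, method = "onesigma", plot = TRUE)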
pls.college <- plsr(Apps~.,data= train, scale = TRUE, validation = "CV")
validationplot(pls.college,val.type = "MSEP")
testpls <- predict(pls.college, test, ncomp = 8)
mean((test$Apps - testpls)^2)
## [1] 1090290
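The choice of ncomp = 8 comes from eyeballing the validation plot; the cross-validated error estimates can also be read numerically (an optional check, not in the original):
# CV RMSEP for each number of PLS components
RMSEP(pls.college)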
testavg <- mean(test$Apps)
lm.r2 <- 1 - mean((testpredict - test$Apps)^2) / mean((testavg - test$Apps)^2)
ridge.r2 <- 1 - mean((testridge - test$Apps)^2) / mean((testavg - test$Apps)^2)
lasso.r2 <- 1 - mean((testlasso - test$Apps)^2) / mean((testavg - test$Apps)^2)
pcr.r2 <- 1 - mean((testpcr - test$Apps)^2) / mean((testavg - test$Apps)^2)
pls.r2 <- 1 - mean((testpls - test$Apps)^2) / mean((testavg - test$Apps)^2)
lm.r2
## [1] 0.9015413
ridge.r2
## [1] 0.901636
lasso.r2
## [1] 0.9032628
pcr.r2
## [1] 0.9015413
pls.r2
## [1] 0.9054829
PLS has the largest test R^2, which is consistent with its having the lowest test MSE. The other four models give essentially the same fit.
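Since the tidyverse is already loaded, the test metrics can be gathered into one comparison table (a convenience summary, not part of the original output):
# Collect test MSE and R^2 for all five College models
tibble(
  model = c("lm", "ridge", "lasso", "pcr", "pls"),
  test.mse = c(mean((test$Apps - testpredict)^2),
               mean((test$Apps - testridge)^2),
               mean((test$Apps - testlasso)^2),
               mean((test$Apps - testpcr)^2),
               mean((test$Apps - testpls)^2)),
  test.r2 = c(lm.r2, ridge.r2, lasso.r2, pcr.r2, pls.r2)
) %>%
  arrange(test.mse)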
#Best Subset
train <- sample(c(TRUE, FALSE), nrow(Boston), replace = TRUE)
test <- (!train)
reg.Boston <- regsubsets(crim~.,data=Boston[train,],nvmax = 13)
testing.matrix <- model.matrix(crim~.,data=Boston[test,])
# I was having trouble with best-subset predictions, so I adapted this loop from code I found online
error <- rep(NA, 13)
# regsubsets has no predict method, so build predictions by hand for each subset size
for (i in 1:13) {
  coefi <- coef(reg.Boston, id = i) # coefficients of the best i-variable model
  pred <- testing.matrix[, names(coefi)] %*% coefi # manual X %*% beta
  error[i] <- mean((Boston$crim[test] - pred)^2) # test MSE at this subset size
}
error
## [1] 45.23385 48.49510 47.00658 46.53321 46.46462 45.54211 45.35982 45.53008
## [9] 45.48939 45.27065 45.21770 45.14207 45.06880
min(error)
## [1] 45.0688
coef(reg.Boston,11)
## (Intercept) zn indus chas nox rm
## 34.51537941 0.04167207 -0.08653546 -0.83293663 -15.80299087 -0.69519516
## age dis rad ptratio black medv
## 0.01903755 -0.93970090 0.44418205 -0.16215699 -0.03464723 -0.16566162
The lowest test MSE for best subset, about 45.07, comes from the full 13-predictor model, though the 11-variable model shown above is nearly as good at about 45.22.
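The manual loop above is the usual workaround because regsubsets has no built-in predict method; it can be wrapped once as an S3 method (a common ISLR-style helper, sketched here assuming the formula is the second element of the stored call):
# Reusable predict method for regsubsets objects
predict.regsubsets <- function(object, newdata, id, ...) {
  form <- as.formula(object$call[[2]]) # recover the model formula
  mat <- model.matrix(form, newdata)
  coefi <- coef(object, id = id)
  mat[, names(coefi)] %*% coefi
}
which.min(error) # programmatic pick of the best subset size
# e.g. pred11 <- predict(reg.Boston, Boston[test, ], id = 11)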
#Ridge
Boston.split <- sample(nrow(Boston), nrow(Boston) * 0.5)
B.train <- Boston[Boston.split, ]
B.test <- Boston[-Boston.split, ]
B.train.matrix <- model.matrix(crim ~ ., data = B.train)
B.test.matrix <- model.matrix(crim ~ ., data = B.test)
grid <- 10^seq(4, -2, length = 50)
B.ridge <- glmnet(B.train.matrix, B.train$crim, alpha = 0, lambda = grid)
cv.B.ridge <- cv.glmnet(B.train.matrix, B.train$crim, alpha = 0, lambda = grid)
B.bestlambda <- cv.B.ridge$lambda.min
B.bestlambda
## [1] 0.390694
B.testridge <- predict(B.ridge, s = B.bestlambda, newx = B.test.matrix)
mean((B.test$crim-B.testridge)^2)
## [1] 26.1705
Ridge has a test MSE of about 26.17, much lower than our best-subset model (keeping in mind the two were evaluated on different train/test splits).
#Lasso
B.lasso <- glmnet(B.train.matrix, B.train$crim, alpha = 1, lambda = grid)
cv.B.lasso <- cv.glmnet(B.train.matrix, B.train$crim, alpha = 1, lambda = grid)
B.bestlambda <- cv.B.lasso$lambda.min
B.bestlambda
## [1] 0.5179475
B.testlasso <- predict(B.lasso, s = B.bestlambda, newx = B.test.matrix)
mean((B.test$crim-B.testlasso)^2)
## [1] 27.19339
Our lasso has a test MSE of about 27.19, slightly higher than our ridge model's.
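It is worth checking how sparse the lasso solution actually is at this lambda (an optional check, not in the original output):
# Count the non-zero lasso coefficients (including the intercept)
lasso.coefs <- predict(B.lasso, s = B.bestlambda, type = "coefficients")
sum(as.vector(lasso.coefs) != 0)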
#PCR
B.pcr <- pcr(crim~., data = B.train, scale = TRUE, validation = "CV")
validationplot(B.pcr,val.type = "MSEP")
B.test.pcr <- predict(B.pcr, B.test, ncomp=3)
mean((B.test$crim-B.test.pcr)^2)
## [1] 28.65787
Our principal components model has a test MSE of about 28.66, the highest of the three regularized fits.
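With only 3 components it is worth checking how much variance they capture; summary() on a pcr fit reports both the CV error and the cumulative % variance explained (an optional look, not in the original output):
summary(B.pcr)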
I would end up using the ridge regression model. Even though some of its coefficients are near zero, it still gives a lower test MSE than any of the other models I fit.
My ridge regression model does include all 13 features in the Boston data set. If I went with the best-subset model instead, I would keep only 11 of the 13 variables, but the ridge model ends up performing better.
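Pulling the Boston test MSEs together makes the ranking explicit (a convenience summary; best subset was evaluated on a different split, so it is omitted here):
# Test MSEs for the three Boston models fit on the same split
c(ridge = mean((B.test$crim - B.testridge)^2),
  lasso = mean((B.test$crim - B.testlasso)^2),
  pcr = mean((B.test$crim - B.test.pcr)^2))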