#install.packages("dplyr")
library(ISLR)
## Warning: package 'ISLR' was built under R version 4.0.5
#library(dplyr)
#library(tidyr)
library(pls)
## Warning: package 'pls' was built under R version 4.0.5
##
## Attaching package: 'pls'
## The following object is masked from 'package:stats':
##
## loadings
#attach(College)
set.seed(1)
index = sample(1:nrow(College), 0.8 * nrow(College))
train = College[index,]
test = College[-index,]
lm.fit = lm(Apps ~ ., data = train)
summary(lm.fit)
##
## Call:
## lm(formula = Apps ~ ., data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5555.2 -404.6 19.9 310.3 7577.7
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -630.58238 435.56266 -1.448 0.148209
## PrivateYes -388.97393 148.87623 -2.613 0.009206 **
## Accept 1.69123 0.04433 38.153 < 2e-16 ***
## Enroll -1.21543 0.20873 -5.823 9.41e-09 ***
## Top10perc 50.45622 5.88174 8.578 < 2e-16 ***
## Top25perc -13.62655 4.67321 -2.916 0.003679 **
## F.Undergrad 0.08271 0.03632 2.277 0.023111 *
## P.Undergrad 0.06555 0.03367 1.947 0.052008 .
## Outstate -0.07562 0.01987 -3.805 0.000156 ***
## Room.Board 0.14161 0.05130 2.760 0.005947 **
## Books 0.21161 0.25184 0.840 0.401102
## Personal 0.01873 0.06604 0.284 0.776803
## PhD -9.72551 4.91228 -1.980 0.048176 *
## Terminal -0.48690 5.43302 -0.090 0.928620
## S.F.Ratio 18.26146 13.83984 1.319 0.187508
## perc.alumni 1.39008 4.39572 0.316 0.751934
## Expend 0.05764 0.01254 4.595 5.26e-06 ***
## Grad.Rate 5.89480 3.11185 1.894 0.058662 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 993.8 on 603 degrees of freedom
## Multiple R-squared: 0.9347, Adjusted R-squared: 0.9328
## F-statistic: 507.5 on 17 and 603 DF, p-value: < 2.2e-16
lm.pred = predict(lm.fit, newdata = test, type = "response")
lm.mse = mean((lm.pred - test$Apps)^2)
lm.mse
## [1] 1567324
library(glmnet)
## Warning: package 'glmnet' was built under R version 4.0.5
## Loading required package: Matrix
## Loaded glmnet 4.1-2
x=model.matrix(Apps~.,train)
y=model.matrix(Apps~.,test)
grid=10^seq(10,-2,length=100)
set.seed(1)
ridge.mod = glmnet(x,train$Apps,alpha=0, lambda = grid, thresh=1e-12)
cv.out=cv.glmnet(x,train$Apps,alpha=0, lambda = grid, thresh=1e-12)
plot(cv.out)
bestlam=cv.out$lambda.min
bestlam
## [1] 0.01
ridge.pred=predict(ridge.mod,s=bestlam,newx=y)
ridge.mse = mean((ridge.pred-test$Apps)^2)
ridge.mse
## [1] 1567278
x=model.matrix(Apps~.,train)
y=model.matrix(Apps~.,test)
grid=10^seq(10,-2,length=100)
set.seed(1)
lasso.mod = glmnet(x,train$Apps,alpha=1, lambda = grid, thresh=1e-12)
cv.out=cv.glmnet(x,train$Apps,alpha=1, lambda = grid, thresh=1e-12)
bestlam=cv.out$lambda.min
bestlam
## [1] 0.01
lasso.coef=predict(lasso.mod,s=bestlam,newx=y)
lasso.mse = mean((lasso.coef-test$Apps)^2)
lasso.mse
## [1] 1567261
predict(lasso.mod, s=bestlam, type = "coefficients")
## 19 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) -630.61389530
## (Intercept) .
## PrivateYes -388.94711918
## Accept 1.69115587
## Enroll -1.21488255
## Top10perc 50.44933775
## Top25perc -13.62126672
## F.Undergrad 0.08264172
## P.Undergrad 0.06554432
## Outstate -0.07560901
## Room.Board 0.14159590
## Books 0.21154724
## Personal 0.01871734
## PhD -9.72479696
## Terminal -0.48590436
## S.F.Ratio 18.25562704
## perc.alumni 1.38634947
## Expend 0.05763637
## Grad.Rate 5.89356487
library(ISLR)
#library(pls)
set.seed(1)
pcr.fit=pcr(Apps~., data=train,scale=TRUE,validation="CV")
validationplot(pcr.fit,val.type="MSEP")
pcr.fit$ncomp
## [1] 17
pcr.pred=predict(pcr.fit,test,ncomp=17)
pcr.mse = mean((pcr.pred-test$Apps)^2)
pcr.mse
## [1] 1567324
set.seed(1)
pls.fit=plsr(Apps~., data=train,scale=TRUE, validation="CV")
validationplot(pls.fit,val.type="MSEP")
pls.fit$ncomp
## [1] 17
pls.pred=predict(pls.fit,test,ncomp=17)
pls.mse = mean((pls.pred-test$Apps)^2)
pls.mse
## [1] 1567324
model.names=c("LM", "Ridge", "Lasso", "PCR", "PLS")
all.errors = c(lm.mse, ridge.mse, lasso.mse, pcr.mse, pls.mse)
data.frame(model.names, all.errors)
## model.names all.errors
## 1 LM 1567324
## 2 Ridge 1567278
## 3 Lasso 1567261
## 4 PCR 1567324
## 5 PLS 1567324
Best subset selection
predict.regsubsets=function(object,newdata,id,...){
form=as.formula(object$call[[2]])
mat=model.matrix(form,newdata)
coefi=coef(object,id=id)
xvars=names(coefi)
mat[,xvars]%*%coefi
}
#install.packages("leaps","MASS") ## if needed
library(leaps)
## Warning: package 'leaps' was built under R version 4.0.5
library(MASS)
data("Boston")
#Boston = Boston[, -15] # removing crim01
regfit.full = regsubsets(crim ~ . , data = Boston, nvmax = 13)
reg.summary = summary(regfit.full)
reg.summary
## Subset selection object
## Call: regsubsets.formula(crim ~ ., data = Boston, nvmax = 13)
## 13 Variables (and intercept)
## Forced in Forced out
## zn FALSE FALSE
## indus FALSE FALSE
## chas FALSE FALSE
## nox FALSE FALSE
## rm FALSE FALSE
## age FALSE FALSE
## dis FALSE FALSE
## rad FALSE FALSE
## tax FALSE FALSE
## ptratio FALSE FALSE
## black FALSE FALSE
## lstat FALSE FALSE
## medv FALSE FALSE
## 1 subsets of each size up to 13
## Selection Algorithm: exhaustive
## zn indus chas nox rm age dis rad tax ptratio black lstat medv
## 1 ( 1 ) " " " " " " " " " " " " " " "*" " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " " " " " "*" " " " " " " "*" " "
## 3 ( 1 ) " " " " " " " " " " " " " " "*" " " " " "*" "*" " "
## 4 ( 1 ) "*" " " " " " " " " " " "*" "*" " " " " " " " " "*"
## 5 ( 1 ) "*" " " " " " " " " " " "*" "*" " " " " "*" " " "*"
## 6 ( 1 ) "*" " " " " "*" " " " " "*" "*" " " " " "*" " " "*"
## 7 ( 1 ) "*" " " " " "*" " " " " "*" "*" " " "*" "*" " " "*"
## 8 ( 1 ) "*" " " " " "*" " " " " "*" "*" " " "*" "*" "*" "*"
## 9 ( 1 ) "*" "*" " " "*" " " " " "*" "*" " " "*" "*" "*" "*"
## 10 ( 1 ) "*" "*" " " "*" "*" " " "*" "*" " " "*" "*" "*" "*"
## 11 ( 1 ) "*" "*" " " "*" "*" " " "*" "*" "*" "*" "*" "*" "*"
## 12 ( 1 ) "*" "*" "*" "*" "*" " " "*" "*" "*" "*" "*" "*" "*"
## 13 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"
k=10
set.seed(1)
folds=sample(1:k,nrow(Boston),replace=TRUE)
cv.errors=matrix(NA,k,13, dimnames=list(NULL, paste(1:13)))
for(j in 1:k){
best.fit=regsubsets(crim~.,data=Boston[folds!=j,],nvmax=13)
for(i in 1:13){
pred=predict(best.fit,Boston[folds==j,],id=i)
cv.errors[j,i]=mean( (Boston$crim[folds==j]-pred)^2)
}
}
mean.cv.errors=apply(cv.errors,2,mean)
mean.cv.errors
## 1 2 3 4 5 6 7 8
## 45.44573 43.87260 43.94979 44.02424 43.96415 43.96199 42.96268 42.66948
## 9 10 11 12 13
## 42.53822 42.73416 42.52367 42.46014 42.50125
which.min(mean.cv.errors)
## 12
## 12
par(mfrow=c(1,1))
plot(mean.cv.errors,type='b')
bss.error = min(mean.cv.errors)
bss.error
## [1] 42.46014
x=model.matrix(crim~.,Boston)
y=Boston$crim
grid=10^seq(10,-2,length=100)
set.seed(1)
lasso.mod = glmnet(x,y,alpha=1, lambda = grid, thresh=1e-12)
cv.out=cv.glmnet(x,y,alpha=1, lambda = grid, thresh=1e-12)
bestlam=cv.out$lambda.min
bestlam
## [1] 0.05336699
plot(cv.out)
lasso.coef=predict(lasso.mod,s=bestlam,newx=x)
lasso.mse = mean((lasso.coef-y)^2)
lasso.mse
## [1] 40.47009
x=model.matrix(crim~.,Boston)
y=Boston$crim
grid=10^seq(10,-2,length=100)
set.seed(1)
ridge.mod = glmnet(x,y,alpha=0, lambda = grid, thresh=1e-12)
cv.out=cv.glmnet(x,y,alpha=0, lambda = grid, thresh=1e-12)
bestlam=cv.out$lambda.min
bestlam
## [1] 0.1232847
plot(cv.out)
ridge.pred=predict(ridge.mod,s=bestlam,newx=x)
ridge.mse = mean((ridge.pred-y)^2)
ridge.mse
## [1] 40.36313
set.seed(1)
pcr.fit=pcr(crim~., data=Boston,scale=TRUE,validation="CV")
validationplot(pcr.fit,val.type="MSEP")
pcr.fit$ncomp
## [1] 13
pcr.pred=predict(pcr.fit,Boston,ncomp=13)
pcr.mse = mean((pcr.pred-y)^2)
pcr.mse
## [1] 40.31607
model.names = c("Best subset selection", "Lasso", "Ridge regression", "PCR")
error.matrix = c(bss.error, lasso.mse, ridge.mse, pcr.mse)
data.frame(model.names, error.matrix)
## model.names error.matrix
## 1 Best subset selection 42.46014
## 2 Lasso 40.47009
## 3 Ridge regression 40.36313
## 4 PCR 40.31607