iii is correct. The lasso reduces the flexibility, decreasing the variance and increasing the bias
iii is correct. Same as part (a)
ii is correct. Non-linear models are more flexible and have less bias than least squares.
library(ISLR2)
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:ISLR2':
##
## Boston
library(glmnet)
## Warning: package 'glmnet' was built under R version 4.2.3
## Loading required package: Matrix
## Loaded glmnet 4.1-7
library(leaps)
## Warning: package 'leaps' was built under R version 4.2.3
library(pls)
## Warning: package 'pls' was built under R version 4.2.3
##
## Attaching package: 'pls'
## The following object is masked from 'package:stats':
##
## loadings
str(College)
## 'data.frame': 777 obs. of 18 variables:
## $ Private : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 2 ...
## $ Apps : num 1660 2186 1428 417 193 ...
## $ Accept : num 1232 1924 1097 349 146 ...
## $ Enroll : num 721 512 336 137 55 158 103 489 227 172 ...
## $ Top10perc : num 23 16 22 60 16 38 17 37 30 21 ...
## $ Top25perc : num 52 29 50 89 44 62 45 68 63 44 ...
## $ F.Undergrad: num 2885 2683 1036 510 249 ...
## $ P.Undergrad: num 537 1227 99 63 869 ...
## $ Outstate : num 7440 12280 11250 12960 7560 ...
## $ Room.Board : num 3300 6450 3750 5450 4120 ...
## $ Books : num 450 750 400 450 800 500 500 450 300 660 ...
## $ Personal : num 2200 1500 1165 875 1500 ...
## $ PhD : num 70 29 53 92 76 67 90 89 79 40 ...
## $ Terminal : num 78 30 66 97 72 73 93 100 84 41 ...
## $ S.F.Ratio : num 18.1 12.2 12.9 7.7 11.9 9.4 11.5 13.7 11.3 11.5 ...
## $ perc.alumni: num 12 16 30 37 2 11 26 37 23 15 ...
## $ Expend : num 7041 10527 8735 19016 10922 ...
## $ Grad.Rate : num 60 56 54 59 15 55 63 73 80 52 ...
dim(College)
## [1] 777 18
names(College)
## [1] "Private" "Apps" "Accept" "Enroll" "Top10perc"
## [6] "Top25perc" "F.Undergrad" "P.Undergrad" "Outstate" "Room.Board"
## [11] "Books" "Personal" "PhD" "Terminal" "S.F.Ratio"
## [16] "perc.alumni" "Expend" "Grad.Rate"
sum(is.na(College))
## [1] 0
set.seed(1)
x=model.matrix(Apps~.,College)[,-1]
y=College$Apps
set.seed(1)
train=sample(1:nrow(x),nrow(x)/2)
test=(-train)
y.test=y[test]
lm.fit = lm(Apps~., data=College[train,])
lm.pred = predict(lm.fit, College[test,], type="response")
mean((lm.pred-College[test,]$Apps)^2)
## [1] 1135758
grid=10^seq(10,-2,length=100)
ridge.mod=glmnet(x[train,],y[train],alpha=0,lambda=grid)
set.seed(1)
cv.out=cv.glmnet(x[train,],y[train],alpha=0)
bestlam=cv.out$lambda.min
ridge.pred=predict(ridge.mod,s=bestlam,newx=x[test,])
mean((ridge.pred -y.test)^2)
## [1] 976268.9
lasso.mod=glmnet(x[train,],y[train],alpha=1,lambda=grid)
set.seed(2)
cv.out2=cv.glmnet(x[train,],y[train],alpha=1)
bestlam =cv.out2$lambda.min
lasso.pred=predict(lasso.mod ,s=bestlam ,newx=x[test ,])
mean((lasso.pred - y.test)^2)
## [1] 1116252
#non-zero coefficient estimates
out=glmnet(x,y,alpha=1,lambda=grid)
lasso.coef=predict(out,type="coefficients",s= bestlam) [1:18,]
lasso.coef
## (Intercept) PrivateYes Accept Enroll Top10perc
## -468.96036214 -491.47351867 1.57117027 -0.76858284 48.25732969
## Top25perc F.Undergrad P.Undergrad Outstate Room.Board
## -12.94716203 0.04293538 0.04406960 -0.08340724 0.14963683
## Books Personal PhD Terminal S.F.Ratio
## 0.01549548 0.02915128 -8.42538012 -3.26671319 14.61167079
## perc.alumni Expend Grad.Rate
## -0.02612797 0.07718333 8.31579869
lasso.coef[lasso.coef!=0]
## (Intercept) PrivateYes Accept Enroll Top10perc
## -468.96036214 -491.47351867 1.57117027 -0.76858284 48.25732969
## Top25perc F.Undergrad P.Undergrad Outstate Room.Board
## -12.94716203 0.04293538 0.04406960 -0.08340724 0.14963683
## Books Personal PhD Terminal S.F.Ratio
## 0.01549548 0.02915128 -8.42538012 -3.26671319 14.61167079
## perc.alumni Expend Grad.Rate
## -0.02612797 0.07718333 8.31579869
set.seed(1)
pcr.fit=pcr(Apps~., data=College, subset=train, scale=TRUE,validation="CV")
summary(pcr.fit)
## Data: X dimension: 388 17
## Y dimension: 388 1
## Fit method: svdpc
## Number of components considered: 17
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 4288 4006 2373 2372 2069 1961 1919
## adjCV 4288 4007 2368 2369 1999 1948 1911
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 1919 1921 1876 1832 1832 1836 1837
## adjCV 1912 1915 1868 1821 1823 1827 1827
## 14 comps 15 comps 16 comps 17 comps
## CV 1853 1759 1341 1270
## adjCV 1850 1733 1326 1257
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps 8 comps
## X 32.20 57.78 65.31 70.99 76.37 81.27 84.8 87.85
## Apps 13.44 70.93 71.07 79.87 81.15 82.25 82.3 82.33
## 9 comps 10 comps 11 comps 12 comps 13 comps 14 comps 15 comps
## X 90.62 92.91 94.98 96.74 97.79 98.72 99.42
## Apps 83.38 84.76 84.80 84.84 85.11 85.14 90.55
## 16 comps 17 comps
## X 99.88 100.00
## Apps 93.42 93.89
validationplot(pcr.fit, val.type="MSEP")
pcr.pred=predict(pcr.fit,x[test ,],ncomp = 10)
mean((pcr.pred-y.test)^2)
## [1] 1723100
set.seed(1)
pls.fit=plsr(Apps~., data=College, subset=train, scale=TRUE,validation="CV")
summary (pls.fit)
## Data: X dimension: 388 17
## Y dimension: 388 1
## Fit method: kernelpls
## Number of components considered: 17
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 4288 2217 2019 1761 1630 1533 1347
## adjCV 4288 2211 2012 1749 1605 1510 1331
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 1309 1303 1286 1283 1283 1277 1271
## adjCV 1296 1289 1273 1270 1270 1264 1258
## 14 comps 15 comps 16 comps 17 comps
## CV 1270 1270 1270 1270
## adjCV 1258 1257 1257 1257
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps 8 comps
## X 27.21 50.73 63.06 65.52 70.20 74.20 78.62 80.81
## Apps 75.39 81.24 86.97 91.14 92.62 93.43 93.56 93.68
## 9 comps 10 comps 11 comps 12 comps 13 comps 14 comps 15 comps
## X 83.29 87.17 89.15 91.37 92.58 94.42 96.98
## Apps 93.76 93.79 93.83 93.86 93.88 93.89 93.89
## 16 comps 17 comps
## X 98.78 100.00
## Apps 93.89 93.89
validationplot(pls.fit, val.type="MSEP")
pls.pred=predict(pls.fit,x[test,],ncomp =9)
mean((pls.pred -y.test)^2)
## [1] 1109578
Comparing the test errors for each model , show that
ridge regression model has the lowest test error, so this method may
result in the best model.
set.seed(1)
predict.regsubsets=function(object , newdata ,id ,...){
form=as.formula (object$call [[2]])
mat=model.matrix(form ,newdata )
coefi=coef(object ,id=id)
xvars=names(coefi)
mat[,xvars]%*%coefi
}
k=10
p = ncol(Boston) - 1
set.seed(1)
folds=sample (1:k,nrow(Boston),replace=TRUE)
cv.errors=matrix(NA,k,13, dimnames=list(NULL,paste(1:13)))
for(j in 1:k){
best.fit=regsubsets(crim~.,data=Boston[folds!=j,],nvmax=13)
for(i in 1:13){
pred=predict(best.fit ,Boston[folds == j,],id=i)
cv.errors[j,i]= mean((Boston$crim[folds == j]-pred)^2)
}
}
mean.cv.errors=apply(cv.errors ,2, mean)
mean.cv.errors
## 1 2 3 4 5 6 7 8
## 45.44573 43.87260 43.94979 44.02424 43.96415 43.96199 42.96268 42.66948
## 9 10 11 12 13
## 42.53822 42.73416 42.52367 42.46014 42.50125
plot(mean.cv.errors ,type='b')
which.min(mean.cv.errors)
## 12
## 12
mean.cv.errors[which.min(mean.cv.errors)]
## 12
## 42.46014
set.seed(1)
x = model.matrix(crim ~ ., data = Boston)[,-1]
y = Boston$crim
lasso.mod=glmnet(x,y,alpha=1,lambda=grid)
cv.out3 = cv.glmnet(x, y, alpha=1)
plot(cv.out3)
set.seed(1)
x = model.matrix(crim ~ ., data = Boston)[,-1]
y = Boston$crim
ridge.mod=glmnet(x,y,alpha=0,lambda=grid)
cv.out4 = cv.glmnet(x, y, alpha=0)
plot(cv.out4)
set.seed(1)
pcr.fit=pcr(crim~., data=Boston, scale=TRUE,validation="CV")
summary(pcr.fit)
## Data: X dimension: 506 13
## Y dimension: 506 1
## Fit method: svdpc
## Number of components considered: 13
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 8.61 7.250 7.253 6.833 6.815 6.826 6.847
## adjCV 8.61 7.245 7.247 6.825 6.803 6.818 6.838
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 6.837 6.710 6.735 6.723 6.714 6.696 6.624
## adjCV 6.827 6.698 6.724 6.710 6.702 6.682 6.609
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps 8 comps
## X 47.70 60.36 69.67 76.45 82.99 88.00 91.14 93.45
## crim 30.69 30.87 39.27 39.61 39.61 39.86 40.14 42.47
## 9 comps 10 comps 11 comps 12 comps 13 comps
## X 95.40 97.04 98.46 99.52 100.0
## crim 42.55 42.78 43.04 44.13 45.4
The best subset, the lasso, ridge regression has MSE approximately 42
I would choose best subset selection is the best model because it has less predictors than other models