Exercise 2 For parts (a) through (c), indicate which of i. through iv. is correct. Justify your answer. - i. More flexible and hence will give improved prediction accuracy when its increase in bias is less than its decrease in variance. - ii. More flexible and hence will give improved prediction accuracy when its increase in variance is less than its decrease in bias. - iii. Less flexible and hence will give improved prediction accuracy when its increase in bias is less than its decrease in variance. - iv. Less flexible and hence will give improved prediction accuracy when its increase in variance is less than its decrease in bias
Less flexible and hence will give improved prediction accuracy when its increase in bias is less than its decrease in variance. Similarly to the lasso model, the flexibility of the model decreases as λ increases. This results in decreased variance, but increased bias.
The non-linear methods relative to least squares, is:
More flexible and hence will give improved prediction accuracy when its increase in bias is less than its decrease in variance. Non-linear methods give you more flexibility when fitting a model. This results in a model with decreased variance, but increase biasness.
Exercise 9 In this exercise, we will predict the number of applications received using the other variables in the College data set. (a) Split the data set into a training set and a test set.
attach(College)
set.seed(42)
sum(is.na(College))
## [1] 0
train.size <- dim(College)[1] / 2
set.seed(42)
train <- sample(1:dim(College)[1], train.size)
test <- -train
C_train <- College[train, ]
C_test <- College[test, ]
fit_lm <- lm(Apps~., data=C_train)
pred_lm <- predict(fit_lm, C_test)
test_error <- mean((C_test[, "Apps"] - pred_lm)^2)
test_error
## [1] 1341776
- The test error is Test RSS is 1341776.
matrix_train <- model.matrix(Apps~., data=C_train)
matrix_test <- model.matrix(Apps~., data=C_test)
grid <- 10 ^ seq(4, -2, length=100)
ridge_re <- cv.glmnet(matrix_train, C_train[, "Apps"], alpha=0, lambda=grid, thresh=1e-12)
best_lmba <- ridge_re$lambda.min
best_lmba
## [1] 43.28761
pred_ridge_re <- predict(ridge_re, newx=matrix_test, s=best_lmba)
mean((C_test[, "Apps"] - pred_ridge_re)^2)
## [1] 1462449
lasso_1 <- cv.glmnet(matrix_train, C_train[, "Apps"], alpha=1, lambda=grid, thresh=1e-12)
best_lmba <- lasso_1$lambda.min
best_lmba
## [1] 21.54435
pred_lasso <- predict(lasso_1, newx=matrix_test, s=best_lmba)
mean((C_test[, "Apps"] - pred_lasso)^2)
## [1] 1410193
lasso_2 <- glmnet(model.matrix(Apps~., data=College), College[, "Apps"], alpha=1)
predict(lasso_2, s=best_lmba, type="coefficients")
## 19 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) -6.038452e+02
## (Intercept) .
## PrivateYes -4.235413e+02
## Accept 1.455236e+00
## Enroll -2.003696e-01
## Top10perc 3.367640e+01
## Top25perc -2.403036e+00
## F.Undergrad .
## P.Undergrad 2.086035e-02
## Outstate -5.781855e-02
## Room.Board 1.246462e-01
## Books .
## Personal 1.832910e-05
## PhD -5.601313e+00
## Terminal -3.313824e+00
## S.F.Ratio 4.478684e+00
## perc.alumni -9.796600e-01
## Expend 6.967693e-02
## Grad.Rate 5.159652e+00
prin_comp_re <- pcr(Apps~., data=C_train, scale=TRUE, validation="CV")
validationplot(prin_comp_re, val.type="MSEP")
pred_prin_comp_re <- predict(prin_comp_re, C_test, ncomp=10)
mean((C_test[, "Apps"] - (pred_prin_comp_re))^2)
## [1] 2802274
fit_plsr <- plsr(Apps~., data=C_train, scale=TRUE, validation="CV")
validationplot(fit_plsr, val.type="MSEP")
pred_plsr<- predict(fit_plsr, C_test, ncomp=9)
mean((C_test[, "Apps"] - (pred_plsr))^2)
## [1] 1345885
avg_C_test <- mean(C_test[, "Apps"])
test_lm <- 1 - mean((C_test[, "Apps"] - pred_lm)^2) /mean((C_test[, "Apps"] - avg_C_test)^2)
test_ridge_re <- 1 - mean((C_test[, "Apps"] - pred_ridge_re)^2) /mean((C_test[, "Apps"] - avg_C_test)^2)
test_lasso_2 <- 1 - mean((C_test[, "Apps"] - pred_lasso)^2) /mean((C_test[, "Apps"] - avg_C_test)^2)
test_prin_comp_re <- 1 - mean((C_test[, "Apps"] - (pred_prin_comp_re))^2) /mean((C_test[, "Apps"] - avg_C_test)^2)
test_pred_plsr<- 1 - mean((C_test[, "Apps"] - (pred_plsr))^2) /mean((C_test[, "Apps"] - avg_C_test)^2)
test_lm
## [1] 0.9193214
test_ridge_re
## [1] 0.9120655
test_lasso_2
## [1] 0.9152076
test_prin_comp_re
## [1] 0.8315042
test_pred_plsr
## [1] 0.9190743
Exercise 11 (a) We will now try to predict per capita crime rate in the Boston data set. (a) Try out some of the regression methods explored in this chapter, such as best subset selection, the lasso, ridge regression, and PCR. Present and discuss results for the approaches that you consider.
attach(Boston)
set.seed(1)
train=sample(1:nrow(Boston), nrow(Boston)/2)
train.boston=Boston[train,]
test=(-train)
test.boston=Boston[test,]
crim.test=Boston$crim[test]
x.mat.train=model.matrix(crim~., data=train.boston)[,-crim]
x.mat.train=model.matrix(crim~., data=test.boston)[,-crim]
lm.full=lm(crim~.,data=train.boston)
lm.predict=predict(lm.full,test.boston)
lm.MSE=mean((lm.predict-crim.test)^2)
lm.MSE
## [1] 41.54639
library(leaps)
regfit.best=regsubsets(crim~.,data=Boston[train,],nvmax=13)
test.mat=model.matrix(crim~.,data=Boston[test,])
val.errors=rep(NA,13)
for(i in 1:13){
coefi=coef(regfit.best,id=i)
pred=test.mat[,names(coefi)]%*%coefi
val.errors[i]=mean((Boston$crim[test]-pred)^2)
}
val.errors
## [1] 40.14557 41.87706 42.40901 42.45745 43.03836 42.13258 41.25016 41.28957
## [9] 41.49271 41.50577 41.48839 41.46692 41.54639
which.min(val.errors)
## [1] 1
sub.mse=val.errors[1]
library(glmnet)
set.seed(1)
x=model.matrix(crim~.,Boston)[,-1]
y=Boston$crim
y.test=y[test]
train=sample(1:nrow(x), nrow(x)/2)
test=(-train)
grid=10^seq(10,-2,length=100)
ridge.mod = glmnet(x,y,lambda=grid, alpha=0)
cv.out=cv.glmnet(x[train,],y[train],alpha=0)
plot(cv.out)
bestlam=cv.out$lambda.min
ridge.pred=predict(ridge.mod,s=bestlam,newx=x[test,])
ridge.mse = mean((ridge.pred-y.test)^2)
ridge.mse
## [1] 38.01174
train=sample(1:nrow(x), nrow(x)/2)
test=(-train)
grid=10^seq(10,-2,length=100)
lasso.mod = glmnet(x,y,lambda=grid, alpha=1)
cv.out=cv.glmnet(x[train,],y[train],alpha=1)
plot(cv.out)
bestlam=cv.out$lambda.min
lasso.pred=predict(lasso.mod,s=bestlam,newx=x[test,])
lasso.mse = mean((lasso.pred-y.test)^2)
lasso.mse
## [1] 39.67203
set.seed(1)
pcr.fit = pcr(crim~., data=train.boston, Scale=TRUE, validation='CV')
summary(pcr.fit)
## Data: X dimension: 253 13
## Y dimension: 253 1
## Fit method: svdpc
## Number of components considered: 13
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 9.275 7.305 7.267 7.254 7.234 7.130 7.115
## adjCV 9.275 7.300 7.259 7.246 7.226 7.121 7.107
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 6.795 6.763 6.743 6.694 6.694 6.696 6.700
## adjCV 6.782 6.753 6.732 6.682 6.682 6.684 6.686
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps 8 comps
## X 81.27 97.18 99.08 99.72 99.89 99.93 99.97 99.99
## crim 39.26 40.61 40.87 41.25 43.24 43.95 48.88 49.34
## 9 comps 10 comps 11 comps 12 comps 13 comps
## X 100.00 100.00 100.00 100.00 100.00
## crim 49.94 50.86 50.93 50.98 51.37
validationplot(pcr.fit,val.type="MSEP")
pcr.pred=predict(pcr.fit,test.boston,ncomp =7)
pcr.mse=mean((pcr.pred-y.test)^2)
pcr.mse
## [1] 43.48068
out=glmnet(x,y,alpha=1,lambda =grid)
lasso.coef=predict(out,type='coefficient',s=bestlam)[1:14,]
lasso.coef
## (Intercept) zn indus chas nox rm
## 14.476496146 0.039476650 -0.070676946 -0.643189309 -8.433633678 0.323327442
## age dis rad tax ptratio black
## 0.000000000 -0.877313853 0.539447859 -0.001159816 -0.224731922 -0.007537653
## lstat medv
## 0.126684807 -0.175267556