Assignment 5

Question 2:

The lasso, relative to least squares is (iii), as the lasso is less flexible and has better predictions when there is less variance and more bias.
The ridge regression is also (iii), in which its less flexible and has improved prediction accuracy when its increased in bias is less tan its decrease in varaince.
the non linear method is (ii), as they are more flexible and will give better prediction accuracy when there is an increase in variance that is less than the decrease in bias.

library(tidyverse)
library(ggplot2)
library(ISLR2)
library(glmnet)

## Warning: package 'glmnet' was built under R version 4.2.3

library(caTools)

## Warning: package 'caTools' was built under R version 4.2.3

library(pls)

## Warning: package 'pls' was built under R version 4.2.3

library(leaps)

## Warning: package 'leaps' was built under R version 4.2.3

Question 9

data(College)

# split data
set.seed(11)
train = sample(1:dim(College)[1], dim(College)[1]/2)
test <- -train
train.college <-  College[train, ]
test.college <- College[test, ]

#fit linear model 

fit.lm <- lm(Apps ~ ., data = train.college)
predict.lm <- predict(fit.lm, test.college)
mean((predict.lm - test.college$Apps)^2)

## [1] 1026096

#fit ridge regression model
train.matrix <- model.matrix(Apps ~ ., data = train.college)
test.matrix <-  model.matrix(Apps ~ ., data = test.college)
grid <- 10^ seq(4, -2, length = 100)
fit.ridge <- glmnet(train.matrix, train.college$Apps, alpha = 0, lambda = grid, thresh = 1e-12)
cross.ridge <- cv.glmnet(train.matrix, train.college$Apps, alpha = 0, lambda = grid, thresh = 1e-12)
lam.thingy <- cross.ridge$lambda.min
lam.thingy

## [1] 0.01

predict.ridge <- predict(fit.ridge, s = lam.thingy, newx = test.matrix)
mean((predict.ridge - test.college$Apps)^2)

## [1] 1026069

# Fit a lasso on training set

lasso.model <- glmnet(train.matrix, train.college$Apps, alpha = 1, lambda = grid, thresh = 1e-12)
cross.lasso <- cv.glmnet(train.matrix, train.college$Apps, alpha = 1, lambda = grid, thresh = 1e-12)
lammy.lasso <- cross.lasso$lambda.min
lammy.lasso

## [1] 0.01

predict.lasso <- predict(lasso.model, s = lammy.lasso, newx = test.matrix)
mean((predict.lasso - test.college$Apps)^2)

## [1] 1026036

predict(lasso.model, s = lammy.lasso, type = "coefficients")

## 19 x 1 sparse Matrix of class "dgCMatrix"
##                        s1
## (Intercept)   37.86520037
## (Intercept)    .         
## PrivateYes  -551.14946609
## Accept         1.74980812
## Enroll        -1.36005786
## Top10perc     65.55655577
## Top25perc    -22.52640339
## F.Undergrad    0.10181853
## P.Undergrad    0.01789131
## Outstate      -0.08706371
## Room.Board     0.15384585
## Books         -0.12227313
## Personal       0.16194591
## PhD          -14.29638634
## Terminal      -1.03118224
## S.F.Ratio      4.47956819
## perc.alumni   -0.45456280
## Expend         0.05618050
## Grad.Rate      9.07242834

# Fit PCR model on training set

Question 11

# a
predict.regsubsets <- function(object, newdata, id, ...) {
    form <-  as.formula(object$call[[2]])
    mat <-  model.matrix(form, newdata)
    coefi <-  coef(object, id = id)
    xvars <-  names(coefi)
    mat[, xvars] %*% coefi
}

k <- 10
n <- nrow(Boston)
set.seed(1)
folds <- sample(rep(1:k, length = n))
cv.errors <- matrix(NA, k, 12, 
                    dimnames = list(NULL, paste(1:12)))

for (j in 1:k) {
  best.fit <- regsubsets(crim ~ ., 
                         data = Boston[folds != j,],
                         nvmax = 12)
  for (i in 1:12) {
    pred <- predict(best.fit, Boston[folds == j, ], id = i)
    cv.errors[j, i] <- 
      mean((Boston$crim[folds == j] - pred)^2)
  }
}
mean.cv.errors <- apply(cv.errors, 2, mean)
mean.cv.errors

##        1        2        3        4        5        6        7        8 
## 46.00617 44.22854 44.41638 43.05242 43.47254 43.27042 43.01362 42.91980 
##        9       10       11       12 
## 42.75855 42.75734 42.67672 42.71781

# cross-validation selects a 9-variable model. 
plot (mean.cv.errors , type = "b")

(Q11) Lasso

set.seed(1)
x <- model.matrix(crim ~ ., Boston)[,-1]
y <- Boston$crim
grid <-  10^seq(10,-2,length=100)

train <- sample(1:nrow(x), nrow(x)/1.3)
test <- (-train)
y.test <- y[test]

lasso.mod <- glmnet(x[train, ], y[train], alpha = 1,
                    lambda = grid)
plot(lasso.mod)

cv.out <- cv.glmnet(x[train, ], y[train], alpha = 1)
plot(cv.out)

bestlam <- cv.out$lambda.min
lasso.pred <- predict(lasso.mod, s = bestlam,
                      newx = x[test, ])
mean((lasso.pred - y.test)^2)

## [1] 57.5101

out <- glmnet(x, y, alpha = 1, lambda = grid)
lasso.coef <- predict(out, type = "coefficients", s = bestlam)[1:13, ]
lasso.coef

##   (Intercept)            zn         indus          chas           nox 
##  9.9666892236  0.0381267348 -0.0666376190 -0.6802313678 -7.2456707073 
##            rm           age           dis           rad           tax 
##  0.4427557169  0.0000000000 -0.8290928402  0.5434410500 -0.0001081031 
##       ptratio         lstat          medv 
## -0.2342526657  0.1352607569 -0.1840797990

# MSE is 57.5101 and age is the only coefficient estimate that is zero. The lasso model with lambda chose by cross-validation contains only 12 variables.

Ridge Regression

cv.outridge <- cv.glmnet(x[train, ], y[train], alpha = 0)
bestlam.ridge <- cv.outridge$lambda.min

glm.mod <- glmnet(x[train,], y[train], alpha = 0, lambda = grid, thresh = 1e-12)
glm.pred <- predict(glm.mod, s = bestlam.ridge, newx = x[test,])

mean((glm.pred-y.test)^2)

## [1] 58.17261

glm.coef <- predict(glm.mod, type = "coefficients", s = bestlam.ridge)[1:13,]
glm.coef

##  (Intercept)           zn        indus         chas          nox           rm 
##  6.472904914  0.030196598 -0.069893845 -0.657060130 -4.722737407  0.243883933 
##          age          dis          rad          tax      ptratio        lstat 
##  0.003217559 -0.609518731  0.430397022  0.003789322 -0.220799827  0.164491934 
##         medv 
## -0.129883442

# MSE is 58.17261

PCR

pcr.fit <- pcr(crim ~ ., data = Boston, subset = train, scale = TRUE, validation = "CV")
validationplot(pcr.fit, val.type = "MSEP")

set.seed(1)
pcr.pred <- predict(pcr.fit, x[test,], ncomp = 9)
mean((pcr.pred-y.test)^2)

## [1] 60.23479

summary(pcr.fit)

## Data:    X dimension: 389 12 
##  Y dimension: 389 1
## Fit method: svdpc
## Number of components considered: 12
## 
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
##        (Intercept)  1 comps  2 comps  3 comps  4 comps  5 comps  6 comps
## CV           8.174    6.880    6.876    6.487    6.402    6.376    6.363
## adjCV        8.174    6.878    6.874    6.484    6.398    6.373    6.360
##        7 comps  8 comps  9 comps  10 comps  11 comps  12 comps
## CV       6.234    6.220    6.221     6.233     6.206     6.131
## adjCV    6.225    6.216    6.216     6.228     6.199     6.123
## 
## TRAINING: % variance explained
##       1 comps  2 comps  3 comps  4 comps  5 comps  6 comps  7 comps  8 comps
## X       50.53    64.10    73.32    80.42    86.70    90.27    92.78    94.95
## crim    29.72    29.88    37.90    39.29    39.92    40.45    43.02    43.25
##       9 comps  10 comps  11 comps  12 comps
## X       96.80     98.29     99.47    100.00
## crim    43.61     43.63     44.55     45.99

# MSE is 60.23479

(Q11) B and C

The best subset selection model selects a 9-variable models. I would possibly chose this model or the lasso, as the best subset has the least amount of variables needed for the model but the lasso model has the lowest MSE and is known to be a much easier model to interpret. The best subset model kept only 9 variables and the lasso model has 12.