knitr::include_graphics("/Users/arielleking/Documents/Stat 710/math782/Screen Shot 2022-11-27 at 4.43.46 PM 2.png")
knitr::include_graphics("/Users/arielleking/Documents/Stat 710/math782/Screen Shot 2022-11-27 at 4.27.45 PM 2.png")
#
#
A <- matrix(c(3,2,4, 1,2,3, 4,3,7), nrow=3) #entered column by column
A
## [,1] [,2] [,3]
## [1,] 3 1 4
## [2,] 2 2 3
## [3,] 4 3 7
e <- eigen(A) #eigendecomposition of A
e$values
## [1] 10.7397847+0.000000i 0.6301077+0.261769i 0.6301077-0.261769i
e$vectors
## [,1] [,2] [,3]
## [1,] -0.4628927+0i -0.8115435+0.0000000i -0.8115435+0.0000000i
## [2,] -0.3807044+0i 0.1551859+0.3225569i 0.1551859-0.3225569i
## [3,] -0.8004964+0i 0.4420212-0.1337485i 0.4420212+0.1337485i
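As a quick sanity check (added here, not part of the original output), each eigenpair should satisfy A v = λ v, so the difference below should be numerically zero:
A %*% e$vectors[,1] - e$values[1] * e$vectors[,1] #approximately the zero vector (complex arithmetic)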
set.seed(16); n <- 100
x <- rnorm(n) #standard normal predictor (default mean 0, sd 1)
error <- rnorm(n) #noise term, also standard normal
Generate a response vector Y of length n = 100 according to the model Y = β0 + β1X + β2X^2 + β3X^3 + ε, where β0, β1, β2, and β3 are constants of your choice.
b0 <- 17; b1 <- 3; b2 <- .8; b3 <- -2 #set coefficients
y <- b0 + b1*x + b2*x^2 + b3*x^3 + error #true model: a cubic in x plus noise
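A quick scatterplot (an optional check, not required by the exercise) confirms the simulated cubic trend:
plot(x, y, main="Simulated data", xlab="x", ylab="y") #should show a cubic shape dominated by the -2x^3 term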
Use the regsubsets() function to perform best subset selection in order to choose the best model containing the predictors X, X^2, ..., X^10. What is the best model obtained according to Cp, BIC, and adjusted R^2? Show some plots to provide evidence for your answer, and report the coefficients of the best model obtained. Note you will need to use the data.frame() function to create a single data set containing both X and Y.
library(leaps)
#Build the predictor matrix with columns x, x^2, ..., x^10
x.new <- x
for(i in 2:10){
x.new <- cbind(x.new,x^i)
}
colnames(x.new) <- paste("x", 1:ncol(x.new), sep="")
data.8 <- data.frame(cbind(y, x.new))
regfit.8 <- regsubsets(y~ ., data=data.8, nvmax=10)
regsum.8 <- summary(regfit.8)
par(mfrow=c(2,2))
plot(regsum.8$cp, type="l", col=4, xlab = "# Variables", ylab = "Mallows Cp")
points(which.min(regsum.8$cp),regsum.8$cp[which.min(regsum.8$cp)], col=4, pch = 15, cex=2)
plot(regsum.8$bic, type="l", col=6, xlab = "# Variables", ylab = "Bayes Information Criterion")
points(which.min(regsum.8$bic),regsum.8$bic[which.min(regsum.8$bic)], col=6, pch = 16, cex=2)
plot(regsum.8$adjr2, type="l", col=3, xlab = "# Variables", ylab = "Adjusted R Squared")
points(which.max(regsum.8$adjr2),regsum.8$adjr2[which.max(regsum.8$adjr2)], col=3, pch = 17, cex=2)
coef(regfit.8,which.min(regsum.8$bic))
## (Intercept) x1 x2 x3
## 16.9733268 3.0071671 0.8415477 -1.9856216
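As an illustrative check (added here, not in the original output), the selected estimates can be placed next to the true coefficients set above; they should be close:
rbind(true = c(b0, b1, b2, b3), estimated = coef(regfit.8, which.min(regsum.8$bic))) #estimates track (17, 3, 0.8, -2)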
Repeat (c), using forward stepwise selection and also using backward stepwise selection. How does your answer compare to the results in (c)?
regfit.8.fwd <- regsubsets(y~ ., data=data.8, nvmax=10, method="forward")
regsum.8.fwd <- summary(regfit.8.fwd)
par(mfrow=c(2,2))
plot(regsum.8.fwd$cp, type="l", col=4, main="Forward Stepwise Selection", xlab = "# Variables", ylab = "Mallows Cp")
points(which.min(regsum.8.fwd$cp),regsum.8.fwd$cp[which.min(regsum.8.fwd$cp)], col=4, pch = 15, cex=2)
plot(regsum.8.fwd$bic, type="l", col=6, main="Forward Stepwise Selection", xlab = "# Variables", ylab = "Bayes Information Criterion")
points(which.min(regsum.8.fwd$bic),regsum.8.fwd$bic[which.min(regsum.8.fwd$bic)], col=6, pch = 16, cex=2)
plot(regsum.8.fwd$adjr2, type="l", col=3, main="Forward Stepwise Selection", xlab = "# Variables", ylab = "Adjusted R Squared")
points(which.max(regsum.8.fwd$adjr2),regsum.8.fwd$adjr2[which.max(regsum.8.fwd$adjr2)], col=3, pch = 17, cex=2)
coef(regfit.8.fwd,which.min(regsum.8.fwd$bic))
## (Intercept) x1 x2 x3
## 16.9733268 3.0071671 0.8415477 -1.9856216
regfit.8.bkwd <- regsubsets(y~ ., data=data.8, nvmax=10, method="backward")
regsum.8.bkwd <- summary(regfit.8.bkwd)
par(mfrow=c(2,2))
plot(regsum.8.bkwd$cp, type="l", col=4, main="Backward Stepwise Selection", xlab = "# Variables", ylab = "Mallows Cp")
points(which.min(regsum.8.bkwd$cp),regsum.8.bkwd$cp[which.min(regsum.8.bkwd$cp)], col=4, pch = 15, cex=2)
plot(regsum.8.bkwd$bic, type="l", col=6, main="Backward Stepwise Selection", xlab = "# Variables", ylab = "Bayes Information Criterion")
points(which.min(regsum.8.bkwd$bic),regsum.8.bkwd$bic[which.min(regsum.8.bkwd$bic)], col=6, pch = 16, cex=2)
plot(regsum.8.bkwd$adjr2, type="l", col=3, main="Backward Stepwise Selection", xlab = "# Variables", ylab = "Adjusted R Squared")
points(which.max(regsum.8.bkwd$adjr2),regsum.8.bkwd$adjr2[which.max(regsum.8.bkwd$adjr2)], col=3, pch = 17, cex=2)
coef(regfit.8.bkwd,which.min(regsum.8.bkwd$bic))
## (Intercept) x1 x2 x3
## 16.9733268 3.0071671 0.8415477 -1.9856216
Forward and backward stepwise selection both arrive at the same three-variable model chosen by best subset selection in (c), with identical coefficient estimates; all three methods agree here.
Now fit a lasso model to the simulated data, again using X, X^2, ..., X^10 as predictors. Use cross-validation to select the optimal value of λ. Create plots of the cross-validation error as a function of λ. Report the resulting coefficient estimates, and discuss the results obtained.
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-4
#cv.glmnet() performs 10-fold cross-validation by default
set.seed(32)
train.8 <- sample(1:nrow(data.8), nrow(data.8)/2) #training indices
test.8 <- (-train.8)
y.test.8 <- y[test.8]
x.8.e <- model.matrix(y~.,data.8)[,-1] #predictor matrix (intercept column dropped)
y.8.e <- data.8[,1] #response vector
cv.8.out <- cv.glmnet(x.8.e[train.8,], y.8.e[train.8], alpha=1) #alpha=1 gives the lasso
plot(cv.8.out) #CV error as a function of log(lambda)
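The prompt also asks for the coefficient estimates at the selected λ. A minimal continuation reusing the objects above (the names bestlam.8 and lasso.8 are introduced here for illustration; output not reproduced):
bestlam.8 <- cv.8.out$lambda.min #lambda with the smallest CV error
lasso.8 <- glmnet(x.8.e[train.8,], y.8.e[train.8], alpha=1)
predict(lasso.8, s=bestlam.8, type="coefficients") #the higher-order terms x^4, ..., x^10 should shrink to (or near) zero, consistent with the true cubic model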
In this exercise, we will predict the number of applications received using the other variables in the College data set. Split the data set into a training set and a test set. Use set.seed(1).
library(ISLR)
attach(College)
set.seed(1)
train=sample(c(TRUE,FALSE),nrow(College),replace=TRUE)
test=(!train)
College.train = College[train,]
College.test = College[test,]
Fit a linear model using least squares on the training set, and report the test error obtained.
lm.fit = lm(Apps~., data=College.train)
lm.pred = predict(lm.fit, College.test, type="response")
mean((lm.pred-College.test$Apps)^2) #test MSE
## [1] 984743.1
Fit a ridge regression model on the training set, with λ chosen by cross-validation. Report the test error obtained.
library(glmnet)
set.seed(1)
#Set up matrices needed for the glmnet functions
train.mat = model.matrix(Apps~., data = College.train)
test.mat = model.matrix(Apps~., data = College.test)
#Choose lambda using cross-validation
cv.out = cv.glmnet(train.mat,College.train$Apps,alpha=0)
bestlam = cv.out$lambda.min
bestlam
## [1] 394.2365
#Fit a ridge regression
ridge.mod = glmnet(train.mat,College.train$Apps,alpha = 0)
#Make predictions
ridge.pred = predict(ridge.mod,s=bestlam,newx = test.mat)
#Calculate test error
mean((ridge.pred - College.test$Apps)^2)
## [1] 940970.9
Fit a lasso model on the training set, with λ chosen by cross-validation. Report the test error obtained, along with the number of non-zero coefficient estimates.
#Choose lambda using cross-validation
set.seed(1)
cv.out2 = cv.glmnet(train.mat,College.train$Apps,alpha=1)
bestlam2 = cv.out2$lambda.min
bestlam2
## [1] 59.92044
#Fit lasso model
lasso.mod = glmnet(train.mat,College.train$Apps,alpha=1)
#Make predictions
lasso.pred = predict(lasso.mod,s=bestlam2,newx=test.mat)
mean((lasso.pred - College.test$Apps)^2)
## [1] 993741.7
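The prompt also asks for the number of non-zero coefficient estimates. One way to count them from the fitted objects above (lasso.coef is a name introduced here; output not shown):
lasso.coef = predict(lasso.mod, s=bestlam2, type="coefficients")
sum(lasso.coef != 0) #count of non-zero entries, including the intercept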
Fit a PCR model on the training set, with M chosen by cross-validation. Report the test error obtained, along with the value of M selected by cross-validation.
library(pls)
##
## Attaching package: 'pls'
## The following object is masked from 'package:stats':
##
## loadings
set.seed(1)
#Fit and determine M based on CV results
pcr.fit=pcr(Apps~., data=College.train, scale=TRUE, validation="CV")
validationplot(pcr.fit,val.type = "MSEP")
summary(pcr.fit)
## Data: X dimension: 393 17
## Y dimension: 393 1
## Fit method: svdpc
## Number of components considered: 17
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 4189 4094 2321 2336 2168 2099 1961
## adjCV 4189 4094 2315 2332 2156 2088 1949
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 1949 1937 1861 1792 1808 1830 1839
## adjCV 1940 1922 1845 1783 1800 1821 1831
## 14 comps 15 comps 16 comps 17 comps
## CV 1848 1612 1348 1333
## adjCV 1845 1584 1334 1320
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps 8 comps
## X 31.858 57.44 64.20 69.91 75.10 80.17 83.82 87.30
## Apps 4.353 70.99 71.18 76.84 78.34 81.03 81.59 82.21
## 9 comps 10 comps 11 comps 12 comps 13 comps 14 comps 15 comps
## X 90.26 92.74 94.79 96.70 97.76 98.67 99.37
## Apps 83.31 83.97 83.97 84.34 84.58 84.70 91.28
## 16 comps 17 comps
## X 99.82 100.00
## Apps 92.83 93.02
#Make predictions using M = 10, where the CV error reaches a local minimum (1792); beyond that it rises before falling again only as the model approaches the full least-squares fit
pcr.pred=predict(pcr.fit,College.test,ncomp=10)
#Compute test error
mean((pcr.pred-College.test$Apps)^2)
## [1] 1682909
Fit a PLS model on the training set, with M chosen by cross-validation. Report the test error obtained, along with the value of M selected by cross-validation.
set.seed(1)
#Fit and determine M based on CV results
pls.fit = plsr(Apps~., data=College.train, scale=TRUE,validation="CV")
validationplot(pls.fit,val.type = "MSEP")
summary(pls.fit)
## Data: X dimension: 393 17
## Y dimension: 393 1
## Fit method: kernelpls
## Number of components considered: 17
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 4189 2172 1932 1720 1669 1489 1382
## adjCV 4189 2163 1930 1709 1640 1463 1365
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 1333 1328 1323 1329 1332 1334 1334
## adjCV 1321 1316 1310 1316 1319 1320 1321
## 14 comps 15 comps 16 comps 17 comps
## CV 1335 1333 1333 1333
## adjCV 1321 1320 1320 1320
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps 8 comps
## X 26.01 44.96 62.49 65.22 68.52 72.89 77.13 80.46
## Apps 75.74 82.40 86.74 90.58 92.34 92.79 92.88 92.93
## 9 comps 10 comps 11 comps 12 comps 13 comps 14 comps 15 comps
## X 82.45 84.76 88.08 90.76 92.80 94.45 97.02
## Apps 92.98 93.00 93.01 93.01 93.02 93.02 93.02
## 16 comps 17 comps
## X 98.03 100.00
## Apps 93.02 93.02
Comment on the results obtained. How accurately can we predict the number of college applications received? Is there much difference among the test errors resulting from these five approaches?
The CV RMSEP flattens out around M = 8 and reaches its minimum at M = 9, so we compare test errors for both candidates.
pls.pred = predict(pls.fit, College.test, ncomp = 8)
#Compute test error
mean((pls.pred - College.test$Apps)^2)
## [1] 978534.3
pls.pred = predict(pls.fit, College.test, ncomp = 9)
#Compute test error
mean((pls.pred - College.test$Apps)^2)
## [1] 1007163
So the better-performing PLS model uses M = 8, with a test error of 978534.3. Collecting the five test errors: least squares 984743.1, ridge 940970.9, lasso 993741.7, PCR (M = 10) 1682909, and PLS (M = 8) 978534.3. Ridge regression performs best and PLS second best, but least squares, ridge, lasso, and PLS all land within about 6% of one another, so regularization and dimension reduction offer only a modest improvement here; PCR with M = 10 is clearly worse. A test MSE near 9.4 x 10^5 corresponds to a root-mean-squared prediction error of roughly 970 applications, which is reasonably accurate given that application counts in this data set run into the tens of thousands.