knitr::include_graphics("/Users/arielleking/Documents/Stat 710/math782/Screen Shot 2022-11-27 at 4.43.46 PM 2.png")
knitr::include_graphics("/Users/arielleking/Documents/Stat 710/math782/Screen Shot 2022-11-27 at 4.27.45 PM 2.png")
#
#
A <- matrix(c(3,2,4, 1,2,3, 4,3,7), nrow=3) #entered column by column
A
## [,1] [,2] [,3]
## [1,] 3 1 4
## [2,] 2 2 3
## [3,] 4 3 7
e <- eigen(A) #eigendecomposition of A
e$values
## [1] 10.7397847+0.000000i 0.6301077+0.261769i 0.6301077-0.261769i
e$vectors
## [,1] [,2] [,3]
## [1,] -0.4628927+0i -0.8115435+0.0000000i -0.8115435+0.0000000i
## [2,] -0.3807044+0i 0.1551859+0.3225569i 0.1551859-0.3225569i
## [3,] -0.8004964+0i 0.4420212-0.1337485i 0.4420212+0.1337485i
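As a quick sanity check (added here, not part of the original output), each eigenpair should satisfy A v = λ v, so the difference below should be numerically zero:
A %*% e$vectors[,1] - e$values[1] * e$vectors[,1] #approximately the zero vector (complex arithmetic)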
set.seed(16); n <- 100
x <- rnorm(n) #standard normal predictor (default mean 0, sd 1)
error <- rnorm(n) #noise term, also standard normal
Generate a response vector Y of length n = 100 according to the model Y = β0 + β1X + β2X^2 + β3X^3 + ε, where β0, β1, β2, and β3 are constants of your choice.
b0 <- 17; b1 <- 3; b2 <- .8; b3 <- -2 #set coefficients
y <- b0 + b1*x + b2*x^2 + b3*x^3 + error #true model: a cubic in x plus noise
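A quick scatterplot (an optional check, not required by the exercise) confirms the simulated cubic trend:
plot(x, y, main="Simulated data", xlab="x", ylab="y") #should show a cubic shape dominated by the -2x^3 term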
Use the regsubsets() function to perform best subset selection in order to choose the best model containing the predictors X, X^2, ..., X^10. What is the best model obtained according to Cp, BIC, and adjusted R^2? Show some plots to provide evidence for your answer, and report the coefficients of the best model obtained. Note you will need to use the data.frame() function to create a single data set containing both X and Y.
library(leaps)
#Build the predictor matrix with columns x, x^2, ..., x^10
x.new <- x
for(i in 2:10){
x.new <- cbind(x.new,x^i)
}
colnames(x.new) <- paste("x", 1:ncol(x.new), sep="")
data.8 <- data.frame(cbind(y, x.new))
regfit.8 <- regsubsets(y~ ., data=data.8, nvmax=10)
regsum.8 <- summary(regfit.8)
par(mfrow=c(2,2))
plot(regsum.8$cp, type="l", col=4, xlab = "# Variables", ylab = "Mallows Cp")
points(which.min(regsum.8$cp),regsum.8$cp[which.min(regsum.8$cp)], col=4, pch = 15, cex=2)
plot(regsum.8$bic, type="l", col=6, xlab = "# Variables", ylab = "Bayes Information Criterion")
points(which.min(regsum.8$bic),regsum.8$bic[which.min(regsum.8$bic)], col=6, pch = 16, cex=2)
plot(regsum.8$adjr2, type="l", col=3, xlab = "# Variables", ylab = "Adjusted R Squared")
points(which.max(regsum.8$adjr2),regsum.8$adjr2[which.max(regsum.8$adjr2)], col=3, pch = 17, cex=2)
coef(regfit.8,which.min(regsum.8$bic))
## (Intercept) x1 x2 x3
## 16.9733268 3.0071671 0.8415477 -1.9856216
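As an illustrative check (added here, not in the original output), the selected estimates can be placed next to the true coefficients set above; they should be close:
rbind(true = c(b0, b1, b2, b3), estimated = coef(regfit.8, which.min(regsum.8$bic))) #estimates track (17, 3, 0.8, -2)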
Repeat (c), using forward stepwise selection and also using backward stepwise selection. How does your answer compare to the results in (c)?
regfit.8.fwd <- regsubsets(y~ ., data=data.8, nvmax=10, method="forward")
regsum.8.fwd <- summary(regfit.8.fwd)
par(mfrow=c(2,2))
plot(regsum.8.fwd$cp, type="l", col=4, main="Forward Stepwise Selection", xlab = "# Variables", ylab = "Mallows Cp")
points(which.min(regsum.8.fwd$cp),regsum.8.fwd$cp[which.min(regsum.8.fwd$cp)], col=4, pch = 15, cex=2)
plot(regsum.8.fwd$bic, type="l", col=6, main="Forward Stepwise Selection", xlab = "# Variables", ylab = "Bayes Information Criterion")
points(which.min(regsum.8.fwd$bic),regsum.8.fwd$bic[which.min(regsum.8.fwd$bic)], col=6, pch = 16, cex=2)
plot(regsum.8.fwd$adjr2, type="l", col=3, main="Forward Stepwise Selection", xlab = "# Variables", ylab = "Adjusted R Squared")
points(which.max(regsum.8.fwd$adjr2),regsum.8.fwd$adjr2[which.max(regsum.8.fwd$adjr2)], col=3, pch = 17, cex=2)
coef(regfit.8.fwd,which.min(regsum.8.fwd$bic))
## (Intercept) x1 x2 x3
## 16.9733268 3.0071671 0.8415477 -1.9856216
regfit.8.bkwd <- regsubsets(y~ ., data=data.8, nvmax=10, method="backward")
regsum.8.bkwd <- summary(regfit.8.bkwd)
par(mfrow=c(2,2))
plot(regsum.8.bkwd$cp, type="l", col=4, main="Backward Stepwise Selection", xlab = "# Variables", ylab = "Mallows Cp")
points(which.min(regsum.8.bkwd$cp),regsum.8.bkwd$cp[which.min(regsum.8.bkwd$cp)], col=4, pch = 15, cex=2)
plot(regsum.8.bkwd$bic, type="l", col=6, main="Backward Stepwise Selection", xlab = "# Variables", ylab = "Bayes Information Criterion")
points(which.min(regsum.8.bkwd$bic),regsum.8.bkwd$bic[which.min(regsum.8.bkwd$bic)], col=6, pch = 16, cex=2)
plot(regsum.8.bkwd$adjr2, type="l", col=3, main="Backward Stepwise Selection", xlab = "# Variables", ylab = "Adjusted R Squared")
points(which.max(regsum.8.bkwd$adjr2),regsum.8.bkwd$adjr2[which.max(regsum.8.bkwd$adjr2)], col=3, pch = 17, cex=2)
coef(regfit.8.bkwd,which.min(regsum.8.bkwd$bic))
## (Intercept) x1 x2 x3
## 16.9733268 3.0071671 0.8415477 -1.9856216
Forward and backward stepwise selection both arrive at the same three-variable model chosen by best subset selection in (c), with identical coefficient estimates; all three methods agree here.
Now fit a lasso model to the simulated data, again using X, X^2, ..., X^10 as predictors. Use cross-validation to select the optimal value of λ. Create plots of the cross-validation error as a function of λ. Report the resulting coefficient estimates, and discuss the results obtained.
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-4
#cv.glmnet() performs 10-fold cross-validation by default
set.seed(32)
train.8 <- sample(1:nrow(data.8), nrow(data.8)/2) #training indices
test.8 <- (-train.8)
y.test.8 <- y[test.8]
x.8.e <- model.matrix(y~.,data.8)[,-1] #predictor matrix (intercept column dropped)
y.8.e <- data.8[,1] #response vector
cv.8.out <- cv.glmnet(x.8.e[train.8,], y.8.e[train.8], alpha=1) #alpha=1 gives the lasso
plot(cv.8.out) #CV error as a function of log(lambda)
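The prompt also asks for the coefficient estimates at the selected λ. A minimal continuation reusing the objects above (the names bestlam.8 and lasso.8 are introduced here for illustration; output not reproduced):
bestlam.8 <- cv.8.out$lambda.min #lambda with the smallest CV error
lasso.8 <- glmnet(x.8.e[train.8,], y.8.e[train.8], alpha=1)
predict(lasso.8, s=bestlam.8, type="coefficients") #the higher-order terms x^4, ..., x^10 should shrink to (or near) zero, consistent with the true cubic model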
In this exercise, we will predict the number of applications received using the other variables in the College data set. Split the data set into a training set and a test set. Use set.seed(1).
library(ISLR)
attach(College)
set.seed(1)
train=sample(c(TRUE,FALSE),nrow(College),replace=TRUE)
test=(!train)
College.train = College[train,]
College.test = College[test,]
Fit a linear model using least squares on the training set, and report the test error obtained.
lm.fit = lm(Apps~., data=College.train)
lm.pred = predict(lm.fit, College.test, type="response")
mean((lm.pred-College.test$Apps)^2) #test MSE
## [1] 984743.1
Fit a ridge regression model on the training set, with λ chosen by cross-validation. Report the test error obtained.
library(glmnet)
set.seed(1)
#Set up matrices needed for the glmnet functions
train.mat = model.matrix(Apps~., data = College.train)
test.mat = model.matrix(Apps~., data = College.test)
#Choose lambda using cross-validation
cv.out = cv.glmnet(train.mat,College.train$Apps,alpha=0)
bestlam = cv.out$lambda.min
bestlam
## [1] 394.2365
#Fit a ridge regression
ridge.mod = glmnet(train.mat,College.train$Apps,alpha = 0)
#Make predictions
ridge.pred = predict(ridge.mod,s=bestlam,newx = test.mat)
#Calculate test error
mean((ridge.pred - College.test$Apps)^2)
## [1] 940970.9
Fit a lasso model on the training set, with λ chosen by cross-validation. Report the test error obtained, along with the number of non-zero coefficient estimates.
#Choose lambda using cross-validation
set.seed(1)
cv.out2 = cv.glmnet(train.mat,College.train$Apps,alpha=1)
bestlam2 = cv.out2$lambda.min
bestlam2
## [1] 59.92044
#Fit lasso model
lasso.mod = glmnet(train.mat,College.train$Apps,alpha=1)
#Make predictions
lasso.pred = predict(lasso.mod,s=bestlam2,newx=test.mat)
mean((lasso.pred - College.test$Apps)^2)
## [1] 993741.7
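The prompt also asks for the number of non-zero coefficient estimates. One way to count them from the fitted objects above (lasso.coef is a name introduced here; output not shown):
lasso.coef = predict(lasso.mod, s=bestlam2, type="coefficients")
sum(lasso.coef != 0) #count of non-zero entries, including the intercept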
Fit a PCR model on the training set, with M chosen by cross-validation. Report the test error obtained, along with the value of M selected by cross-validation.
library(pls)
##
## Attaching package: 'pls'
## The following object is masked from 'package:stats':
##
## loadings
set.seed(1)
#Fit and determine M based on CV results
pcr.fit=pcr(Apps~., data=College.train, scale=TRUE, validation="CV")
validationplot(pcr.fit,val.type = "MSEP")
summary(pcr.fit)
## Data: X dimension: 393 17
## Y dimension: 393 1
## Fit method: svdpc
## Number of components considered: 17
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 4189 4094 2321 2336 2168 2099 1961
## adjCV 4189 4094 2315 2332 2156 2088 1949
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 1949 1937 1861 1792 1808 1830 1839
## adjCV 1940 1922 1845 1783 1800 1821 1831
## 14 comps 15 comps 16 comps 17 comps
## CV 1848 1612 1348 1333
## adjCV 1845 1584 1334 1320
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps 8 comps
## X 31.858 57.44 64.20 69.91 75.10 80.17 83.82 87.30
## Apps 4.353 70.99 71.18 76.84 78.34 81.03 81.59 82.21
## 9 comps 10 comps 11 comps 12 comps 13 comps 14 comps 15 comps
## X 90.26 92.74 94.79 96.70 97.76 98.67 99.37
## Apps 83.31 83.97 83.97 84.34 84.58 84.70 91.28
## 16 comps 17 comps
## X 99.82 100.00
## Apps 92.83 93.02
#Make predictions using M = 10, where the CV error reaches a local minimum (1792); beyond that it rises before falling again only as the model approaches the full least-squares fit
pcr.pred=predict(pcr.fit,College.test,ncomp=10)
#Compute test error
mean((pcr.pred-College.test$Apps)^2)
## [1] 1682909
Fit a PLS model on the training set, with M chosen by cross-validation. Report the test error obtained, along with the value of M selected by cross-validation.
set.seed(1)
#Fit and determine M based on CV results
pls.fit = plsr(Apps~., data=College.train, scale=TRUE,validation="CV")
validationplot(pls.fit,val.type = "MSEP")
summary(pls.fit)
## Data: X dimension: 393 17
## Y dimension: 393 1
## Fit method: kernelpls
## Number of components considered: 17
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 4189 2172 1932 1720 1669 1489 1382
## adjCV 4189 2163 1930 1709 1640 1463 1365
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 1333 1328 1323 1329 1332 1334 1334
## adjCV 1321 1316 1310 1316 1319 1320 1321
## 14 comps 15 comps 16 comps 17 comps
## CV 1335 1333 1333 1333
## adjCV 1321 1320 1320 1320
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps 8 comps
## X 26.01 44.96 62.49 65.22 68.52 72.89 77.13 80.46
## Apps 75.74 82.40 86.74 90.58 92.34 92.79 92.88 92.93
## 9 comps 10 comps 11 comps 12 comps 13 comps 14 comps 15 comps
## X 82.45 84.76 88.08 90.76 92.80 94.45 97.02
## Apps 92.98 93.00 93.01 93.01 93.02 93.02 93.02
## 16 comps 17 comps
## X 98.03 100.00
## Apps 93.02 93.02
Comment on the results obtained. How accurately can we predict the number of college applications received? Is there much difference among the test errors resulting from these five approaches?
The CV RMSEP flattens out around M = 8 and reaches its minimum at M = 9, so we compare test errors for both candidates.
pls.pred = predict(pls.fit, College.test, ncomp = 8)
#Compute test error
mean((pls.pred - College.test$Apps)^2)
## [1] 978534.3
pls.pred = predict(pls.fit, College.test, ncomp = 9)
#Compute test error
mean((pls.pred - College.test$Apps)^2)
## [1] 1007163
So the better-performing PLS model uses M = 8, with a test error of 978534.3. Collecting the five test errors: least squares 984743.1, ridge 940970.9, lasso 993741.7, PCR (M = 10) 1682909, and PLS (M = 8) 978534.3. Ridge regression performs best and PLS second best, but least squares, ridge, lasso, and PLS all land within about 6% of one another, so regularization and dimension reduction offer only a modest improvement here; PCR with M = 10 is clearly worse. A test MSE near 9.4 x 10^5 corresponds to a root-mean-squared prediction error of roughly 970 applications, which is reasonably accurate given that application counts in this data set run into the tens of thousands.