############################
# Homework 2
# Gabbi Allen
# Dr. Coberly
# MATH 5553-01
############################

############################
# Chapter 5, Problem 8
# We will now perform cross-validation on a simulated data set

# Part a: Generate a simulated data set as follows:

set.seed(1)
y <- rnorm(100)
x <- rnorm(100)
y <- x - 2*x^2 +rnorm(100)
# n = 100 and p = 1; x is the only predictor variable
# equation form of y: Y = X - 2X^2 + epsilon, where epsilon is the error term

# Part b: Create a scatterplot of X against Y. Comment on what you find

plot(x, y)

# The plot indicates a fairly strong quadratic relationship.
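
# Optional check (not part of the assigned problem): overlay the true mean
# function f(x) = x - 2x^2 on the scatterplot to confirm the quadratic shape.
ord <- order(x)                             # sort x so lines() draws a curve
lines(x[ord], x[ord] - 2*x[ord]^2, lwd = 2)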

# Part c: Set a random seed, and then compute the LOOCV errors that result from
# fitting the following four models using least squares:
  # i. Y = B0 + B1X + epsilon
  # ii. Y = B0 + B1X + B2X^2 + epsilon
  # iii. Y = B0 + B1X + B2X^2 + B3X^3 + epsilon
  # iv. Y = B0 + B1X + B2X^2 + B3X^3 + B4X^4 + epsilon

library(boot) # load boot package
set.seed(96) # set random seed
Data <- data.frame(x, y)

    # part i
    glm.fit1 <- glm(y ~ x, data = Data)
    cv.err1 <- cv.glm(Data, glm.fit1)
    cv.err1$delta # these numbers are essentially identical
## [1] 5.890979 5.888812
    # part ii
    glm.fit2 <- glm(y ~ poly(x, 2), data = Data)
    cv.err2 <- cv.glm(Data, glm.fit2)
    cv.err2$delta # these numbers are essentially identical
## [1] 1.086596 1.086326
    # part iii
    glm.fit3 <- glm(y ~ poly(x, 3), data = Data)
    cv.err3 <- cv.glm(Data, glm.fit3)
    cv.err3$delta # these numbers are essentially identical
## [1] 1.102585 1.102227
    # part iv
    glm.fit4 <- glm(y ~ poly(x, 4), data = Data)
    cv.err4 <- cv.glm(Data, glm.fit4)
    cv.err4$delta # these numbers are essentially identical
## [1] 1.114772 1.114334
# part d: Repeat (c) using another random seed, and report your results. Are your 
# results the same as what you got in (c)? Why?

set.seed(17) # set random seed

    # part i
    glm.fit1 <- glm(y ~ x, data = Data)
    cv.err1 <- cv.glm(Data, glm.fit1)
    cv.err1$delta # these numbers are essentially identical
## [1] 5.890979 5.888812
    # part ii
    glm.fit2 <- glm(y ~ poly(x, 2), data = Data)
    cv.err2 <- cv.glm(Data, glm.fit2)
    cv.err2$delta # these numbers are essentially identical
## [1] 1.086596 1.086326
    # part iii
    glm.fit3 <- glm(y ~ poly(x, 3), data = Data)
    cv.err3 <- cv.glm(Data, glm.fit3)
    cv.err3$delta # these numbers are essentially identical
## [1] 1.102585 1.102227
    # part iv
    glm.fit4 <- glm(y ~ poly(x, 4), data = Data)
    cv.err4 <- cv.glm(Data, glm.fit4)
    cv.err4$delta # these numbers are essentially identical
## [1] 1.114772 1.114334
# The numbers in part d are identical to the numbers in part c. LOOCV involves
# no randomness: every observation is left out exactly once, so the random seed
# has no effect on the result.
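
# Optional illustration (a sketch; assumes Data, x, and y are still in the
# workspace): LOOCV computed by an explicit loop, with no random numbers
# involved, which is why the seed has no effect. Quadratic model only.
loocv.manual <- rep(NA, nrow(Data))
for (i in seq_len(nrow(Data))) {
  fit.i <- glm(y ~ poly(x, 2), data = Data[-i, ])                 # fit without observation i
  loocv.manual[i] <- (Data$y[i] - predict(fit.i, newdata = Data[i, ]))^2
}
mean(loocv.manual) # should reproduce cv.err2$delta[1]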
    
# Part e: Which of the models in (c) had the smallest LOOCV error?
# Is this what you expected? Explain your answer
    
# The quadratic model (model ii) had the smallest LOOCV error. This is not
# surprising: the scatterplot in part (b) shows a clearly quadratic
# relationship between x and y, and the data were generated from a quadratic model.
    
# Part f: Comment on the statistical significance of the coefficient estimates
# that results from fitting each of the models in (c) using least squares. Do 
# these results agree with the conclusions drawn based on the cross-validation 
# results?
    
summary(glm.fit4)
## 
## Call:
## glm(formula = y ~ poly(x, 4), data = Data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.8914  -0.5244   0.0749   0.5932   2.7796  
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -1.8277     0.1041 -17.549   <2e-16 ***
## poly(x, 4)1   2.3164     1.0415   2.224   0.0285 *  
## poly(x, 4)2 -21.0586     1.0415 -20.220   <2e-16 ***
## poly(x, 4)3  -0.3048     1.0415  -0.293   0.7704    
## poly(x, 4)4  -0.4926     1.0415  -0.473   0.6373    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 1.084654)
## 
##     Null deviance: 552.21  on 99  degrees of freedom
## Residual deviance: 103.04  on 95  degrees of freedom
## AIC: 298.78
## 
## Number of Fisher Scoring iterations: 2
# The intercept, linear, and quadratic terms are statistically significant,
# while the cubic and quartic terms are not. This agrees with the
# cross-validation results, since the quadratic model had the smallest LOOCV error.
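
# Optional cross-check (a sketch, not required by the problem): nested-model
# F-tests on the four fits above tell the same story as the coefficient table.
anova(glm.fit1, glm.fit2, glm.fit3, glm.fit4, test = "F")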

rm(list = ls()) # clear all variables/models to start fresh for next problem


############################
# Chapter 6, Problem 8

# In this exercise, we will generate simulated data, and will then use this data
# to perform best subset selection.

# Part a: Use the rnorm() function to generate a predictor X of length n = 100, 
# as well as a noise vector epsilon of length n = 100.

set.seed(3)
x <- rnorm(100)
epsilon <- rnorm(100)

# Part b: Generate a response vector Y of length n = 100 according to the model
# Y = B0 + B1X + B2X^2 + B3X^3 + epsilon
# where B0, B1, B2, and B3 are constants of your choice.

beta <- c(2, 4, 7, 3)

y <- beta[1] + beta[2]*x + beta[3]*x^2 + beta[4]*x^3 + epsilon

# Part c: Use the regsubsets() function to perform best subset selection in order
# to choose the best model containing the predictors x, x^2, ..., x^10. What is
# the best model obtained according to Cp, BIC, and adjusted R^2? Show some plots
# to provide evidence for your answer, and report the coefficients of the best 
# model obtained. Note you will need to use the data.frame() function to create
# a single data set using both x and y.

library(leaps) # load package
plot(x,y) # plot data points
grid <- seq(min(x), max(x), length.out = 50)
lines(grid, predict(loess(y ~ x), data.frame(x = grid)))

x.mat <- poly(x, 10, raw = T)

d.frame <- data.frame(x,y) # create a single data set using both x and y

regfit.full <- regsubsets(y ~ x.mat, data = d.frame, nvmax = 10) 
# best subset selection model
sum.full <- summary(regfit.full)
objects(sum.full)
## [1] "adjr2"  "bic"    "cp"     "obj"    "outmat" "rsq"    "rss"    "which"
sum.full$outmat # see which predictors are chosen depending on how many are chosen
##           x.mat1 x.mat2 x.mat3 x.mat4 x.mat5 x.mat6 x.mat7 x.mat8 x.mat9
## 1  ( 1 )  "*"    " "    " "    " "    " "    " "    " "    " "    " "   
## 2  ( 1 )  " "    "*"    "*"    " "    " "    " "    " "    " "    " "   
## 3  ( 1 )  "*"    "*"    "*"    " "    " "    " "    " "    " "    " "   
## 4  ( 1 )  "*"    "*"    "*"    " "    " "    "*"    " "    " "    " "   
## 5  ( 1 )  "*"    "*"    " "    " "    "*"    " "    "*"    " "    "*"   
## 6  ( 1 )  "*"    "*"    "*"    " "    "*"    " "    "*"    " "    "*"   
## 7  ( 1 )  "*"    "*"    " "    "*"    "*"    "*"    "*"    " "    "*"   
## 8  ( 1 )  "*"    "*"    " "    " "    "*"    "*"    "*"    "*"    "*"   
## 9  ( 1 )  "*"    "*"    " "    "*"    "*"    "*"    "*"    "*"    "*"   
## 10  ( 1 ) "*"    "*"    "*"    "*"    "*"    "*"    "*"    "*"    "*"   
##           x.mat10
## 1  ( 1 )  " "    
## 2  ( 1 )  " "    
## 3  ( 1 )  " "    
## 4  ( 1 )  " "    
## 5  ( 1 )  " "    
## 6  ( 1 )  " "    
## 7  ( 1 )  " "    
## 8  ( 1 )  "*"    
## 9  ( 1 )  "*"    
## 10  ( 1 ) "*"
sum.full$adjr2 # adjusted R^2
##  [1] 0.6186582 0.9372859 0.9845940 0.9844369 0.9845636 0.9844058 0.9842723
##  [8] 0.9844977 0.9846469 0.9849463
plot(sum.full$adjr2)
indx <- which.max(sum.full$adjr2)
points(indx, sum.full$adjr2[indx], pch = 19)

# The model with 10 variables has the highest adjusted R^2, but there does not 
# appear to be a particularly meaningful difference between the best 3-variable
# model and the best 10-variable model.

plot(sum.full$bic)
indx.2 <- which.min(sum.full$bic)
points(indx.2, sum.full$bic[indx.2], pch = 19)

# In the BIC graph, the model with the lowest BIC is the 3-variable model.

plot(sum.full$cp)
indx.3 <- which.min(sum.full$cp)
points(indx.3, sum.full$cp[indx.3], pch = 19)

# Again, in the Cp graph, the 3-variable model has the lowest Cp. Because the 
# 3-variable model has the lowest BIC and Cp, and the larger models offer no
# meaningful gain in adjusted R^2, we will treat the 3-variable model as the best model.
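
# Optional visual check (a sketch using the built-in plot method for
# regsubsets objects): the top row shows the variable set with the lowest BIC.
plot(regfit.full, scale = "bic")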

sum.full
## Subset selection object
## Call: regsubsets.formula(y ~ x.mat, data = d.frame, nvmax = 10)
## 10 Variables  (and intercept)
##         Forced in Forced out
## x.mat1      FALSE      FALSE
## x.mat2      FALSE      FALSE
## x.mat3      FALSE      FALSE
## x.mat4      FALSE      FALSE
## x.mat5      FALSE      FALSE
## x.mat6      FALSE      FALSE
## x.mat7      FALSE      FALSE
## x.mat8      FALSE      FALSE
## x.mat9      FALSE      FALSE
## x.mat10     FALSE      FALSE
## 1 subsets of each size up to 10
## Selection Algorithm: exhaustive
##           x.mat1 x.mat2 x.mat3 x.mat4 x.mat5 x.mat6 x.mat7 x.mat8 x.mat9
## 1  ( 1 )  "*"    " "    " "    " "    " "    " "    " "    " "    " "   
## 2  ( 1 )  " "    "*"    "*"    " "    " "    " "    " "    " "    " "   
## 3  ( 1 )  "*"    "*"    "*"    " "    " "    " "    " "    " "    " "   
## 4  ( 1 )  "*"    "*"    "*"    " "    " "    "*"    " "    " "    " "   
## 5  ( 1 )  "*"    "*"    " "    " "    "*"    " "    "*"    " "    "*"   
## 6  ( 1 )  "*"    "*"    "*"    " "    "*"    " "    "*"    " "    "*"   
## 7  ( 1 )  "*"    "*"    " "    "*"    "*"    "*"    "*"    " "    "*"   
## 8  ( 1 )  "*"    "*"    " "    " "    "*"    "*"    "*"    "*"    "*"   
## 9  ( 1 )  "*"    "*"    " "    "*"    "*"    "*"    "*"    "*"    "*"   
## 10  ( 1 ) "*"    "*"    "*"    "*"    "*"    "*"    "*"    "*"    "*"   
##           x.mat10
## 1  ( 1 )  " "    
## 2  ( 1 )  " "    
## 3  ( 1 )  " "    
## 4  ( 1 )  " "    
## 5  ( 1 )  " "    
## 6  ( 1 )  " "    
## 7  ( 1 )  " "    
## 8  ( 1 )  "*"    
## 9  ( 1 )  "*"    
## 10  ( 1 ) "*"
coef(regfit.full, 3)
## (Intercept)      x.mat1      x.mat2      x.mat3 
##    1.957565    3.888312    7.087611    3.017670
# So the best 3-variable model uses x, x^2, and x^3. The estimated coefficients
# are about 1.958 for the intercept, 3.888 for x, 7.088 for x^2, and 3.018 for
# x^3, which are close to the true values B0 = 2, B1 = 4, B2 = 7, and B3 = 3.
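
# Optional comparison (a sketch; assumes beta from part (b) is still in the
# workspace): line up the estimated coefficients of the 3-variable model with
# the true values B0 = 2, B1 = 4, B2 = 7, B3 = 3.
round(rbind(true = beta, estimated = coef(regfit.full, 3)), 3)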

# Part d: Repeat (c), using forward stepwise selection and also using backwards
# stepwise selection. How does your answer compare to the results in (c)?

# Forward stepwise selection:

regfit.fwd <- regsubsets(y ~ x.mat, method="forward", data = d.frame, nvmax=10)
sum.fwd <- summary(regfit.fwd)
sum.fwd
## Subset selection object
## Call: regsubsets.formula(y ~ x.mat, method = "forward", data = d.frame, 
##     nvmax = 10)
## 10 Variables  (and intercept)
##         Forced in Forced out
## x.mat1      FALSE      FALSE
## x.mat2      FALSE      FALSE
## x.mat3      FALSE      FALSE
## x.mat4      FALSE      FALSE
## x.mat5      FALSE      FALSE
## x.mat6      FALSE      FALSE
## x.mat7      FALSE      FALSE
## x.mat8      FALSE      FALSE
## x.mat9      FALSE      FALSE
## x.mat10     FALSE      FALSE
## 1 subsets of each size up to 10
## Selection Algorithm: forward
##           x.mat1 x.mat2 x.mat3 x.mat4 x.mat5 x.mat6 x.mat7 x.mat8 x.mat9
## 1  ( 1 )  "*"    " "    " "    " "    " "    " "    " "    " "    " "   
## 2  ( 1 )  "*"    "*"    " "    " "    " "    " "    " "    " "    " "   
## 3  ( 1 )  "*"    "*"    "*"    " "    " "    " "    " "    " "    " "   
## 4  ( 1 )  "*"    "*"    "*"    " "    " "    "*"    " "    " "    " "   
## 5  ( 1 )  "*"    "*"    "*"    " "    "*"    "*"    " "    " "    " "   
## 6  ( 1 )  "*"    "*"    "*"    "*"    "*"    "*"    " "    " "    " "   
## 7  ( 1 )  "*"    "*"    "*"    "*"    "*"    "*"    "*"    " "    " "   
## 8  ( 1 )  "*"    "*"    "*"    "*"    "*"    "*"    "*"    " "    "*"   
## 9  ( 1 )  "*"    "*"    "*"    "*"    "*"    "*"    "*"    " "    "*"   
## 10  ( 1 ) "*"    "*"    "*"    "*"    "*"    "*"    "*"    "*"    "*"   
##           x.mat10
## 1  ( 1 )  " "    
## 2  ( 1 )  " "    
## 3  ( 1 )  " "    
## 4  ( 1 )  " "    
## 5  ( 1 )  " "    
## 6  ( 1 )  " "    
## 7  ( 1 )  " "    
## 8  ( 1 )  " "    
## 9  ( 1 )  "*"    
## 10  ( 1 ) "*"
objects(sum.fwd)
## [1] "adjr2"  "bic"    "cp"     "obj"    "outmat" "rsq"    "rss"    "which"
sum.fwd$adjr2
##  [1] 0.6186582 0.8735752 0.9845940 0.9844369 0.9843192 0.9842197 0.9841484
##  [8] 0.9841141 0.9840127 0.9849463
plot(sum.fwd$adjr2)
indx.fwd <- which.max(sum.fwd$adjr2)
points(indx.fwd, sum.fwd$adjr2[indx.fwd], pch = 19)

# So the highest adjusted R^2 belongs to the 10-variable model, but there doesn't
# appear to be much improvement between the 3-variable model and the 10-variable model.

sum.fwd$bic
##  [1]  -88.21081 -195.03613 -401.95623 -397.38380 -393.08360 -388.91501
##  [7] -384.94025 -381.21184 -377.07508 -379.60464
plot(sum.fwd$bic)
indx.fwd2 <- which.min(sum.fwd$bic)
points(indx.fwd2, sum.fwd$bic[indx.fwd2], pch = 19)

# In terms of BIC, the 3-variable model returns the lowest BIC.

sum.fwd$cp
##  [1] 2386.546245  720.630863    6.246770    8.214612    9.915538
##  [6]   11.488996   12.876348   14.030682   15.581917   11.000000
plot(sum.fwd$cp)
indx.fwd3 <- which.min(sum.fwd$cp)
points(indx.fwd3, sum.fwd$cp[indx.fwd3], pch = 19)

sum.fwd
## Subset selection object
## Call: regsubsets.formula(y ~ x.mat, method = "forward", data = d.frame, 
##     nvmax = 10)
## 10 Variables  (and intercept)
##         Forced in Forced out
## x.mat1      FALSE      FALSE
## x.mat2      FALSE      FALSE
## x.mat3      FALSE      FALSE
## x.mat4      FALSE      FALSE
## x.mat5      FALSE      FALSE
## x.mat6      FALSE      FALSE
## x.mat7      FALSE      FALSE
## x.mat8      FALSE      FALSE
## x.mat9      FALSE      FALSE
## x.mat10     FALSE      FALSE
## 1 subsets of each size up to 10
## Selection Algorithm: forward
##           x.mat1 x.mat2 x.mat3 x.mat4 x.mat5 x.mat6 x.mat7 x.mat8 x.mat9
## 1  ( 1 )  "*"    " "    " "    " "    " "    " "    " "    " "    " "   
## 2  ( 1 )  "*"    "*"    " "    " "    " "    " "    " "    " "    " "   
## 3  ( 1 )  "*"    "*"    "*"    " "    " "    " "    " "    " "    " "   
## 4  ( 1 )  "*"    "*"    "*"    " "    " "    "*"    " "    " "    " "   
## 5  ( 1 )  "*"    "*"    "*"    " "    "*"    "*"    " "    " "    " "   
## 6  ( 1 )  "*"    "*"    "*"    "*"    "*"    "*"    " "    " "    " "   
## 7  ( 1 )  "*"    "*"    "*"    "*"    "*"    "*"    "*"    " "    " "   
## 8  ( 1 )  "*"    "*"    "*"    "*"    "*"    "*"    "*"    " "    "*"   
## 9  ( 1 )  "*"    "*"    "*"    "*"    "*"    "*"    "*"    " "    "*"   
## 10  ( 1 ) "*"    "*"    "*"    "*"    "*"    "*"    "*"    "*"    "*"   
##           x.mat10
## 1  ( 1 )  " "    
## 2  ( 1 )  " "    
## 3  ( 1 )  " "    
## 4  ( 1 )  " "    
## 5  ( 1 )  " "    
## 6  ( 1 )  " "    
## 7  ( 1 )  " "    
## 8  ( 1 )  " "    
## 9  ( 1 )  "*"    
## 10  ( 1 ) "*"
coef(regfit.fwd, 3)
## (Intercept)      x.mat1      x.mat2      x.mat3 
##    1.957565    3.888312    7.087611    3.017670
# Again, the 3-variable model has the lowest Cp, so the 3-variable model is also
# the best model selected by forward stepwise selection. The variables selected
# are x, x^2, and x^3, with coefficients of 3.888 for x, 7.088 for x^2, and
# 3.018 for x^3. Forward stepwise selection chooses exactly the same 3-variable
# model as best subset selection, so the coefficients are identical as well.

# Backwards stepwise selection:

regfit.bwd <- regsubsets(y ~ x.mat, method = "backward", data = d.frame, 
                         nvmax = 10)
sum.bwd <- summary(regfit.bwd)
sum.bwd
## Subset selection object
## Call: regsubsets.formula(y ~ x.mat, method = "backward", data = d.frame, 
##     nvmax = 10)
## 10 Variables  (and intercept)
##         Forced in Forced out
## x.mat1      FALSE      FALSE
## x.mat2      FALSE      FALSE
## x.mat3      FALSE      FALSE
## x.mat4      FALSE      FALSE
## x.mat5      FALSE      FALSE
## x.mat6      FALSE      FALSE
## x.mat7      FALSE      FALSE
## x.mat8      FALSE      FALSE
## x.mat9      FALSE      FALSE
## x.mat10     FALSE      FALSE
## 1 subsets of each size up to 10
## Selection Algorithm: backward
##           x.mat1 x.mat2 x.mat3 x.mat4 x.mat5 x.mat6 x.mat7 x.mat8 x.mat9
## 1  ( 1 )  "*"    " "    " "    " "    " "    " "    " "    " "    " "   
## 2  ( 1 )  "*"    "*"    " "    " "    " "    " "    " "    " "    " "   
## 3  ( 1 )  "*"    "*"    " "    " "    "*"    " "    " "    " "    " "   
## 4  ( 1 )  "*"    "*"    " "    " "    "*"    " "    "*"    " "    " "   
## 5  ( 1 )  "*"    "*"    " "    " "    "*"    " "    "*"    " "    "*"   
## 6  ( 1 )  "*"    "*"    " "    " "    "*"    " "    "*"    "*"    "*"   
## 7  ( 1 )  "*"    "*"    " "    " "    "*"    "*"    "*"    "*"    "*"   
## 8  ( 1 )  "*"    "*"    " "    " "    "*"    "*"    "*"    "*"    "*"   
## 9  ( 1 )  "*"    "*"    " "    "*"    "*"    "*"    "*"    "*"    "*"   
## 10  ( 1 ) "*"    "*"    "*"    "*"    "*"    "*"    "*"    "*"    "*"   
##           x.mat10
## 1  ( 1 )  " "    
## 2  ( 1 )  " "    
## 3  ( 1 )  " "    
## 4  ( 1 )  " "    
## 5  ( 1 )  " "    
## 6  ( 1 )  " "    
## 7  ( 1 )  " "    
## 8  ( 1 )  "*"    
## 9  ( 1 )  "*"    
## 10  ( 1 ) "*"
objects(sum.bwd)
## [1] "adjr2"  "bic"    "cp"     "obj"    "outmat" "rsq"    "rss"    "which"
plot(sum.bwd$adjr2)
indx.bwd <- which.max(sum.bwd$adjr2)
points(indx.bwd, sum.bwd$adjr2[indx.bwd], pch = 19)

# Again, the 10-variable model has the highest adjusted R^2, but there isn't a
# substantial difference between the 3-variable model and the 10-variable
# model. 

plot(sum.bwd$bic)
indx.bwd2 <- which.min(sum.bwd$bic)
points(indx.bwd2, sum.bwd$bic[indx.bwd2], pch = 19)

# We get a different result with backward stepwise selection than we did 
# before. This time, the 5-variable model has the smallest BIC. There doesn't 
# seem to be a meaningful difference in BIC between the 4-variable and 
# 5-variable models, but there does appear to be a somewhat large difference
# between the 3-variable and 5-variable models.

plot(sum.bwd$cp)
indx.bwd3 <- which.min(sum.bwd$cp)
points(indx.bwd3, sum.bwd$cp[indx.bwd3], pch = 19)

# Unlike best subset and forward stepwise selection, the model with the smallest
# Cp here is the 5-variable model, although the difference between the 5-variable
# and 3-variable models is not substantial. Balancing these criteria, I am
# selecting the 4-variable model as the best model under backward stepwise
# selection, as it roughly maximizes adjusted R^2 while keeping Cp and BIC low.

sum.bwd
## Subset selection object
## Call: regsubsets.formula(y ~ x.mat, method = "backward", data = d.frame, 
##     nvmax = 10)
## 10 Variables  (and intercept)
##         Forced in Forced out
## x.mat1      FALSE      FALSE
## x.mat2      FALSE      FALSE
## x.mat3      FALSE      FALSE
## x.mat4      FALSE      FALSE
## x.mat5      FALSE      FALSE
## x.mat6      FALSE      FALSE
## x.mat7      FALSE      FALSE
## x.mat8      FALSE      FALSE
## x.mat9      FALSE      FALSE
## x.mat10     FALSE      FALSE
## 1 subsets of each size up to 10
## Selection Algorithm: backward
##           x.mat1 x.mat2 x.mat3 x.mat4 x.mat5 x.mat6 x.mat7 x.mat8 x.mat9
## 1  ( 1 )  "*"    " "    " "    " "    " "    " "    " "    " "    " "   
## 2  ( 1 )  "*"    "*"    " "    " "    " "    " "    " "    " "    " "   
## 3  ( 1 )  "*"    "*"    " "    " "    "*"    " "    " "    " "    " "   
## 4  ( 1 )  "*"    "*"    " "    " "    "*"    " "    "*"    " "    " "   
## 5  ( 1 )  "*"    "*"    " "    " "    "*"    " "    "*"    " "    "*"   
## 6  ( 1 )  "*"    "*"    " "    " "    "*"    " "    "*"    "*"    "*"   
## 7  ( 1 )  "*"    "*"    " "    " "    "*"    "*"    "*"    "*"    "*"   
## 8  ( 1 )  "*"    "*"    " "    " "    "*"    "*"    "*"    "*"    "*"   
## 9  ( 1 )  "*"    "*"    " "    "*"    "*"    "*"    "*"    "*"    "*"   
## 10  ( 1 ) "*"    "*"    "*"    "*"    "*"    "*"    "*"    "*"    "*"   
##           x.mat10
## 1  ( 1 )  " "    
## 2  ( 1 )  " "    
## 3  ( 1 )  " "    
## 4  ( 1 )  " "    
## 5  ( 1 )  " "    
## 6  ( 1 )  " "    
## 7  ( 1 )  " "    
## 8  ( 1 )  "*"    
## 9  ( 1 )  "*"    
## 10  ( 1 ) "*"
coef(regfit.bwd, 4)
## (Intercept)      x.mat1      x.mat2      x.mat5      x.mat7 
##   1.9711524   5.4901289   7.0905984   1.3653743  -0.1641141
# The 4-variable model when selected using backwards stepwise selection uses the
# first variable, the second variable, the fifth variable, and the seventh 
# variable. Coefficients for x1 = 5.490, x2 = 7.091, x5 = 1.365, x7 = -0.164.
# This is different from the forward stepwise selection model and the best
# subset selection model, both in coefficients and in selected variables. 
# Even if we had selected the 3-variable model using backwards stepwise selection,
# the variables selected would have been x1, x2, and x5, rather than x1, x2, and
# x3. 
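
# Optional side-by-side check (a sketch): which variables each method keeps in
# its best 3-variable model, read directly off the $which matrices above.
rbind(best.subset = sum.full$which[3, ],
      forward     = sum.fwd$which[3, ],
      backward    = sum.bwd$which[3, ])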


# Part e: Now fit a lasso model to the simulated data, again using x, x^2, ..., 
# x^10 as predictors. Use cross-validation to select the optimal value of lambda.
# Create plots of the cross-validation error as a function of lambda. Report the 
# resulting coefficient estimates, and discuss the results obtained.

set.seed(1)
xmat <- model.matrix(y ~ poly(x, 10, raw = T), data = d.frame)[,-1]

library(glmnet)
## Loading required package: Matrix
## Loading required package: foreach
## Loaded glmnet 2.0-13

grid <- 10^seq(10, -2, length = 100)
lasso.mod <- glmnet(xmat, y, alpha = 1, lambda = grid)
plot(lasso.mod)

set.seed(1)
cv.out <- cv.glmnet(xmat, y, alpha = 1)
plot(cv.out)

bestlam <- cv.out$lambda.min
lasso.pred <- predict(lasso.mod, s = bestlam, type = "coefficients")

lasso.pred
## 11 x 1 sparse Matrix of class "dgCMatrix"
##                               1
## (Intercept)            2.016542
## poly(x, 10, raw = T)1  3.897243
## poly(x, 10, raw = T)2  7.003016
## poly(x, 10, raw = T)3  2.975668
## poly(x, 10, raw = T)4  .       
## poly(x, 10, raw = T)5  .       
## poly(x, 10, raw = T)6  .       
## poly(x, 10, raw = T)7  .       
## poly(x, 10, raw = T)8  .       
## poly(x, 10, raw = T)9  .       
## poly(x, 10, raw = T)10 .
coef(regfit.bwd, 4)
## (Intercept)      x.mat1      x.mat2      x.mat5      x.mat7 
##   1.9711524   5.4901289   7.0905984   1.3653743  -0.1641141
coef(regfit.fwd, 3)
## (Intercept)      x.mat1      x.mat2      x.mat3 
##    1.957565    3.888312    7.087611    3.017670
coef(regfit.full, 3)
## (Intercept)      x.mat1      x.mat2      x.mat3 
##    1.957565    3.888312    7.087611    3.017670
# The lasso picks x, x^2, and x^3 and shrinks the remaining terms to exactly
# zero, matching the variables chosen by best subset and forward stepwise
# selection. The lasso coefficients are 3.897 for x, 7.003 for x^2, and 2.976
# for x^3, very close to the estimates from best subset and forward stepwise
# selection (and to the true coefficients); the coefficient on x^2 in
# particular is nearly identical across models.
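
# Optional robustness check (a sketch): coefficients at the more conservative
# lambda.1se from the same cv.glmnet fit. The exact values depend on the CV
# folds, so the output is illustrative rather than definitive.
coef(cv.out, s = "lambda.1se")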

rm(list = ls()) # clear environment to get ready for next problem

###########################
# Chapter 6, Problem 9

# In this exercise, we will predict the number of applications received using
# the other variables in the College data set.

library(ISLR)

# Part a: Split the data into a training set and a test set.

college <- College
attach(college)
names(college)
##  [1] "Private"     "Apps"        "Accept"      "Enroll"      "Top10perc"  
##  [6] "Top25perc"   "F.Undergrad" "P.Undergrad" "Outstate"    "Room.Board" 
## [11] "Books"       "Personal"    "PhD"         "Terminal"    "S.F.Ratio"  
## [16] "perc.alumni" "Expend"      "Grad.Rate"
train <- sample(1:nrow(college), nrow(college)/2)
test <- -train
college.train <- college[train, ]
college.test <- college[test, ]
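
# Note: no seed is set before sample(), so this split - and every test error
# reported below - will change each time the script is run. A reproducible
# version would look like the following (the seed value 10 is arbitrary):
#   set.seed(10)
#   train <- sample(1:nrow(college), nrow(college)/2)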

# Part b: Fit a linear model using least squares on the training set, and report
# the test error obtained.

college.lm <- lm(Apps ~ ., data = college.train)
college.pred <- predict(college.lm, college.test)
mean((college.pred - college.test$Apps)^2)
## [1] 1132563
# So the test MSE for least squares is 1132563 for this train/test split.

# Part c: Fit a ridge regression model on the training set, with lambda chosen
# by cross-validation. Report on the test error obtained.

grid <- 10^seq(5, -2, length = 100)
train.mat <- model.matrix(Apps ~ ., data = college.train)
test.mat <- model.matrix (Apps ~ ., data = college.test)
ridge.college <- glmnet(train.mat, college.train$Apps, alpha = 0, 
                        lambda = grid, thresh = 1e-12)
ridge.cv.out <- cv.glmnet(train.mat, college.train$Apps, alpha = 0,
                          lambda = grid, thresh = 1e-12)
plot(ridge.cv.out)

bestlam.ridge <- ridge.cv.out$lambda.min
bestlam.ridge
## [1] 0.01
# So the best lambda value chosen by cross-validation is 0.01, the smallest
# value in the grid.

ridge.pred <- predict(ridge.college, s = bestlam.ridge, newx = test.mat)
mean((ridge.pred - college.test$Apps)^2)
## [1] 1132519
# The test MSE is 1132519, essentially the same as (very slightly lower than)
# the least squares error.

# Part d: Fit a lasso model on the training set, with lambda chosen by cross-
# validation. Report the test error obtained, along with the number of non-zero
# coefficient estimates.

lasso.college <- glmnet(train.mat, college.train$Apps, alpha = 1, lambda = grid)
lasso.cv.out <- cv.glmnet(train.mat, college.train$Apps, alpha = 1, lambda = grid)
plot(lasso.cv.out)

bestlam.lasso <- lasso.cv.out$lambda.min
bestlam.lasso
## [1] 0.01
# So the best lambda value is again 0.01, the smallest value in the grid.

lasso.pred <- predict(lasso.college, s = bestlam.lasso, newx = test.mat)
mean((lasso.pred - college.test$Apps)^2)
## [1] 1131629
# The test MSE is 1131629, slightly smaller than both the ridge regression 
# error and the least squares error.

lasso.pred.coef <- predict(lasso.college, s = bestlam.lasso, type = "coefficients")
lasso.pred.coef
## 19 x 1 sparse Matrix of class "dgCMatrix"
##                         1
## (Intercept) -855.61504558
## (Intercept)    .         
## PrivateYes  -228.72339945
## Accept         1.76228998
## Enroll        -1.58326629
## Top10perc     67.69850987
## Top25perc    -24.29711506
## F.Undergrad    0.12842608
## P.Undergrad    0.10192439
## Outstate      -0.11576445
## Room.Board     0.13861694
## Books          0.45940028
## Personal      -0.05639144
## PhD           -7.65426623
## Terminal     -10.54972812
## S.F.Ratio     44.56058688
## perc.alumni    5.68664205
## Expend         0.10686250
## Grad.Rate     10.97568606
# Because the selected lambda is so small, the lasso performs essentially no
# variable selection here: all 17 predictors have non-zero coefficients. These
# include whether the school is private, applications accepted, new students
# enrolled, percent of new students from the top 10% and top 25% of their HS
# class, full-time and part-time undergraduates, out-of-state tuition, room and
# board, books, personal spending, percent of faculty with Ph.D.'s and with
# terminal degrees, student/faculty ratio, percent of alumni who donate,
# instructional expenditure per student, and graduation rate.
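
# Optional programmatic check (a sketch): count the predictors with non-zero
# lasso coefficients, skipping the two intercept rows of the sparse matrix.
sum(lasso.pred.coef[-(1:2), 1] != 0)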

# Part e: Fit a PCR model on the training set, with M chosen by cross-validation.
# Report the test error obtained, along with the value of M selected by 
# cross-validation.

library(pls)
## 
## Attaching package: 'pls'
## The following object is masked from 'package:stats':
## 
##     loadings
set.seed(2)
college.pcr <- pcr(Apps ~ ., data = college.train, scale = TRUE, 
                   validation = "CV")
summary(college.pcr)
## Data:    X dimension: 388 17 
##  Y dimension: 388 1
## Fit method: svdpc
## Number of components considered: 17
## 
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
##        (Intercept)  1 comps  2 comps  3 comps  4 comps  5 comps  6 comps
## CV            4076     3962     2340     2362     2165     1933     1899
## adjCV         4076     3959     2334     2358     2178     1907     1889
##        7 comps  8 comps  9 comps  10 comps  11 comps  12 comps  13 comps
## CV        1890     1902     1875      1884      1895      1893      1918
## adjCV     1876     1893     1865      1874      1885      1883      1908
##        14 comps  15 comps  16 comps  17 comps
## CV         1937      1779      1412      1307
## adjCV      1982      1739      1395      1294
## 
## TRAINING: % variance explained
##       1 comps  2 comps  3 comps  4 comps  5 comps  6 comps  7 comps
## X      31.100    55.92    63.61    69.43    74.88    79.92     83.7
## Apps    7.934    69.39    69.39    74.33    80.84    81.51     81.8
##       8 comps  9 comps  10 comps  11 comps  12 comps  13 comps  14 comps
## X       87.35    90.34     92.78     94.90     96.70     97.85     98.66
## Apps    81.80    82.44     82.50     82.51     82.66     82.70     84.53
##       15 comps  16 comps  17 comps
## X        99.40     99.84    100.00
## Apps     90.56     92.06     92.74
validationplot(college.pcr, val.type = "MSEP")

# The validation plot shows the MSEP keeps decreasing as M grows and is smallest
# near the full model (M = 17, which is just least squares); to keep some 
# dimension reduction while staying close to the minimum, we use M = 16.

pcr.pred <- predict(college.pcr, college.test, ncomp = 16)
mean((pcr.pred - college.test$Apps)^2)
## [1] 1098421
# So the test MSE for PCR is 1098421, slightly lower than least squares, ridge
# regression, and the lasso.

# Part f: Fit a PLS model on the training set, with M chosen by cross-validation.
# Report the test error obtained, along with the value of M selected by cross-
# validation.

set.seed(1)
college.pls <- plsr(Apps ~ ., data = college.train, scale = TRUE, 
                    validation = "CV")
summary(college.pls)
## Data:    X dimension: 388 17 
##  Y dimension: 388 1
## Fit method: kernelpls
## Number of components considered: 17
## 
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
##        (Intercept)  1 comps  2 comps  3 comps  4 comps  5 comps  6 comps
## CV            4076     2139     1942     1774     1692     1545     1445
## adjCV         4076     2132     1943     1761     1649     1510     1421
##        7 comps  8 comps  9 comps  10 comps  11 comps  12 comps  13 comps
## CV        1379     1347     1340      1340      1336      1328      1328
## adjCV     1362     1332     1325      1325      1322      1314      1314
##        14 comps  15 comps  16 comps  17 comps
## CV         1330      1329      1328      1328
## adjCV      1315      1314      1314      1314
## 
## TRAINING: % variance explained
##       1 comps  2 comps  3 comps  4 comps  5 comps  6 comps  7 comps
## X       25.70    46.36    61.26    63.07    66.74    71.25    76.44
## Apps    74.52    80.88    85.71    90.93    92.07    92.29    92.39
##       8 comps  9 comps  10 comps  11 comps  12 comps  13 comps  14 comps
## X       79.76    82.60     85.62     88.93     90.20     92.12     93.32
## Apps    92.53    92.63     92.67     92.69     92.73     92.74     92.74
##       15 comps  16 comps  17 comps
## X        95.83     98.01    100.00
## Apps     92.74     92.74     92.74
validationplot(college.pls, val.type = "MSEP")

# By about M = 6 the CV error has mostly leveled off, with only small further
# decreases, so we'll let M = 6.

pls.pred <- predict(college.pls, college.test, ncomp = 6)
mean((pls.pred - college.test$Apps)^2)
## [1] 1102855
# So the test MSE for PLS is 1102855: slightly higher than PCR, but lower than
# the lasso, ridge regression, and least squares.

# Part g: Comment on the results obtained. How accurately can we predict the 
# number of college applications received? Is there much difference among the 
# test errors resulting from these five approaches?

# Compute R^2 for all 

test.avg <- mean(college.test$Apps)
lm.r2 <- 1 - mean((college.pred - college.test$Apps)^2) / 
  mean((test.avg - college.test$Apps)^2)
ridge.r2 <- 1 - mean((ridge.pred - college.test$Apps)^2)/
                       mean((test.avg - college.test$Apps)^2)
lasso.r2 <- 1 - mean((lasso.pred - college.test$Apps)^2) / 
  mean((test.avg - college.test$Apps)^2)
pcr.r2 <- 1 - mean((pcr.pred - college.test$Apps)^2) / 
  mean((test.avg - college.test$Apps)^2)
pls.r2 <- 1 - mean((pls.pred - college.test$Apps)^2) / 
  mean((test.avg - college.test$Apps)^2)
lm.r2
## [1] 0.9154386
ridge.r2
## [1] 0.9154419
lasso.r2
## [1] 0.9155084
pcr.r2
## [1] 0.9179878
pls.r2
## [1] 0.9176568
# We can predict the number of college applications fairly accurately - all of
# our models explain roughly 91 - 92% of the variance in the test data. There 
# is not a huge difference in R^2 among the models - the linear model has the 
# lowest R^2 and PCR the highest, but only barely. Note that this is the 
# ordinary R^2, which carries no penalty for additional variables, so adjusted 
# R^2 could rank the models differently. These results may also change if the 
# code is run again, since no seed was set and the train/test split is 
# randomized on each run.
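
# Optional summary (a sketch reusing the predictions computed above): the five
# test MSEs side by side, to make the comparison in part (g) easier to read.
round(c(lm    = mean((college.pred - college.test$Apps)^2),
        ridge = mean((ridge.pred - college.test$Apps)^2),
        lasso = mean((lasso.pred - college.test$Apps)^2),
        pcr   = mean((pcr.pred - college.test$Apps)^2),
        pls   = mean((pls.pred - college.test$Apps)^2)))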