Number 6

(a) Perform polynomial regression to predict wage using age. Use cross-validation to select the optimal degree d for the polynomial. What degree was chosen, and how does this compare to the results of hypothesis testing using ANOVA? Make a plot of the resulting polynomial fit to the data.

library(ISLR2)
library(boot)
library(tidyverse)
wage <- Wage

# 10-fold CV error for polynomial degrees 1 through 10
set.seed(1)  # fix the folds so the CV results are reproducible
cv <- rep(NA, 10)
for (i in 1:10) {
    glmfit <- glm(wage ~ poly(age, i), data = wage)
    cv[i] <- cv.glm(wage, glmfit, K = 10)$delta[1]
}

plot(1:10, cv, xlab = "Degree", ylab = "CV Test MSE", type = "b")
d.min <- which.min(cv)
points(d.min, cv[d.min], col = "red", pch = 19)

# The minimum CV error here happens to fall at degree 7, but the error curve is
# essentially flat beyond degree 3, so the simpler cubic is chosen (see the
# stability sketch below).
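# A minimal sketch (illustrative, not part of the answer): rerunning the
# 10-fold CV over several seeds shows the minimizing degree jumps around,
# while the average error barely changes past degree 3.
cv.mat <- sapply(1:5, function(s) {
    set.seed(s)
    sapply(1:10, function(d) {
        fit <- glm(wage ~ poly(age, d), data = wage)
        cv.glm(wage, fit, K = 10)$delta[1]
    })
})
round(rowMeans(cv.mat))  # average CV error by degree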
glmfit <- glm(wage ~ poly(age, 3), data = wage)
agelims <- range(wage$age)
agegrid <- seq(from = agelims[1], to = agelims[2])
pred <- predict(glmfit, newdata = list(age = agegrid))
plot(wage$age, wage$wage, col = "darkgrey", xlab = "Age", ylab = "Wage")
lines(agegrid, pred, lwd = 2, col = "red")

fit1 <- lm(wage ~ age, data = wage)
fit2 <- lm(wage ~ poly(age, 2), data = wage)
fit3 <- lm(wage ~ poly(age, 3), data = wage)
fit4 <- lm(wage ~ poly(age, 4), data = wage)
fit5 <- lm(wage ~ poly(age, 5), data = wage)
anova(fit1, fit2, fit3, fit4, fit5)
## Analysis of Variance Table
## 
## Model 1: wage ~ age
## Model 2: wage ~ poly(age, 2)
## Model 3: wage ~ poly(age, 3)
## Model 4: wage ~ poly(age, 4)
## Model 5: wage ~ poly(age, 5)
##   Res.Df     RSS Df Sum of Sq        F    Pr(>F)    
## 1   2998 5022216                                    
## 2   2997 4793430  1    228786 143.5931 < 2.2e-16 ***
## 3   2996 4777674  1     15756   9.8888  0.001679 ** 
## 4   2995 4771604  1      6070   3.8098  0.051046 .  
## 5   2994 4770322  1      1283   0.8050  0.369682    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
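# The ANOVA agrees with cross-validation: the quadratic and cubic terms are
# highly significant (p < 2e-16 and p = 0.0017), the quartic is only marginal
# (p = 0.051), and the quintic adds nothing (p = 0.37), so a degree-3 (or at
# most degree-4) polynomial is justified.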

(b) Fit a step function to predict wage using age, and perform cross-validation to choose the optimal number of cuts. Make a plot of the fit obtained.
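# As a quick illustration (not part of the answer): cut() bins age into
# equal-width intervals, and a regression on the resulting factor fits a
# constant within each bin. Four bins are used here just for brevity.
table(cut(wage$age, 4))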

set.seed(1)
cv <- rep(NA, 10)  # cv[1] stays NA; a single interval has no cuts
for (i in 2:10) {
    wage$age.cut <- cut(wage$age, i)
    fit <- glm(wage ~ age.cut, data = wage)
    cv[i] <- cv.glm(wage, fit, K = 10)$delta[1]
}

plot(2:10, cv[-1], xlab = 'Cuts', ylab = 'Test MSE')

# The CV error is minimized at 8 cuts, so an 8-interval step function is fit below.

plot(wage ~ age, data = wage, col = "darkgrey")
# Note: cut() recomputes its breaks on new data, but agegrid spans the same
# range as the training ages, so the bins line up with the fitted model.
glmfit <- glm(wage ~ cut(age, 8), data = wage)
pred <- predict(glmfit, newdata = list(age = agegrid))
lines(agegrid, pred, col = "red", lwd = 2)

Number 10

(a) Split the data into a training set and a test set. Using out-of-state tuition as the response and the other variables as the predictors, perform forward stepwise selection on the training set in order to identify a satisfactory model that uses just a subset of the predictors.

library(caTools)
set.seed(1)
college <- College
sample <- sample.split(college$Apps, SplitRatio = 0.5)
train <- subset(college, sample == TRUE)
test  <- subset(college, sample == FALSE)
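# Quick check (illustrative): the split should be roughly 50/50.
c(train = nrow(train), test = nrow(test))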
library(leaps)
library(gam)
# method = "forward" gives forward stepwise selection, as the question asks
# (the regsubsets default is exhaustive best-subset search).
regfit <- regsubsets(Outstate ~ ., data = train, nvmax = 17, method = "forward")
reg.summary <- summary(regfit)

par(mfrow = c(2, 2))
plot(reg.summary$rss, xlab = "Number of Variables", ylab = "RSS", type = "l")

plot(reg.summary$adjr2, xlab = "Number of Variables",
     ylab = "Adjusted RSq", type = "l")
which.max(reg.summary$adjr2)
## [1] 11
points(11, reg.summary$adjr2[11], col = "red", cex = 2, pch = 20)

plot(reg.summary$cp, xlab = "Number of Variables", ylab = "Cp", type = "l")
which.min(reg.summary$cp)
## [1] 10
points(10, reg.summary$cp[10], col = "red", cex = 2, pch = 20)

plot(reg.summary$bic, xlab = "Number of Variables", ylab = "BIC", type = "l")
which.min(reg.summary$bic)
## [1] 6
points(6, reg.summary$bic[6], col = "red", cex = 2, pch = 20)
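# Convenience sketch: collect the model size chosen by each criterion.
data.frame(adjr2 = which.max(reg.summary$adjr2),
           cp    = which.min(reg.summary$cp),
           bic   = which.min(reg.summary$bic))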

# BIC, the most parsimonious criterion here, is minimized at 6 variables, so the 6-variable model is used.
par(mfrow = c(1,2))
plot(regfit, scale = "r2")
plot(regfit, scale = "adjr2")

plot(regfit, scale = "Cp")
plot(regfit, scale = "bic")

coef(regfit, id = 6)
##   (Intercept)    PrivateYes    Room.Board      Terminal   perc.alumni 
## -3579.4690422  2484.7981053     1.1390901    33.2412341    53.4580937 
##        Expend     Grad.Rate 
##     0.1857791    25.8777174
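# Convenience step: extract the six selected predictors for the GAM in (b).
# (PrivateYes is the dummy coding for the Private factor.)
names(coef(regfit, id = 6))[-1]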

(b) Fit a GAM on the training data, using out-of-state tuition as the response and the features selected in the previous step as the predictors. Plot the results, and explain your findings.

# gam1 uses natural splines (ns) within lm(); gam2 fits smoothing splines (s)
# via the gam package. The two give similar, though not identical, fits.
gam1 <- lm(Outstate ~ ns(Room.Board, 2) + ns(Terminal, 2) + ns(perc.alumni, 2) +
    ns(Expend, 5) + ns(Grad.Rate, 2) + Private, data = train)
gam2 <- gam(Outstate ~ s(Room.Board, df = 2) + s(Terminal, df = 2) +
    s(perc.alumni, df = 2) + s(Expend, df = 5) + s(Grad.Rate, df = 2) + Private,
    data = train)
par(mfrow = c(2,3))
plot(gam2, se=TRUE,col ="#1c9099")

summary(gam2)
## 
## Call: gam(formula = Outstate ~ s(Room.Board, df = 2) + s(Terminal, 
##     df = 2) + s(perc.alumni, df = 2) + s(Expend, df = 5) + s(Grad.Rate, 
##     df = 2) + Private, data = train)
## Deviance Residuals:
##      Min       1Q   Median       3Q      Max 
## -5625.99 -1020.40    97.92  1228.22  7659.61 
## 
## (Dispersion Parameter for gaussian family taken to be 3277325)
## 
##     Null Deviance: 6217170124 on 387 degrees of freedom
## Residual Deviance: 1222443287 on 373.0003 degrees of freedom
## AIC: 6938.783 
## 
## Number of Local Scoring Iterations: NA 
## 
## Anova for Parametric Effects
##                         Df     Sum Sq    Mean Sq F value    Pr(>F)    
## s(Room.Board, df = 2)    1 2369817502 2369817502 723.095 < 2.2e-16 ***
## s(Terminal, df = 2)      1   81469518   81469518  24.858 9.469e-07 ***
## s(perc.alumni, df = 2)   1  720940487  720940487 219.978 < 2.2e-16 ***
## s(Expend, df = 5)        1  570748780  570748780 174.151 < 2.2e-16 ***
## s(Grad.Rate, df = 2)     1  101439820  101439820  30.952 5.056e-08 ***
## Private                  1  233522088  233522088  71.254 7.044e-16 ***
## Residuals              373 1222443287    3277325                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Anova for Nonparametric Effects
##                        Npar Df  Npar F     Pr(F)    
## (Intercept)                                         
## s(Room.Board, df = 2)        1  1.1340    0.2876    
## s(Terminal, df = 2)          1  2.1386    0.1445    
## s(perc.alumni, df = 2)       1  1.7096    0.1918    
## s(Expend, df = 5)            4 20.5681 2.442e-15 ***
## s(Grad.Rate, df = 2)         1  0.5476    0.4597    
## Private                                             
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
par(mfrow=c(2,3))
plot.Gam(gam1, se=TRUE, col="#f03b20")

# Most of the fitted functions look roughly linear, especially Grad.Rate,
# Room.Board, perc.alumni, and Terminal: out-of-state tuition rises steadily
# with each of them. Expend is the clear exception (see part (d)). The Private
# panel shows that tuition is substantially higher at private schools.

(c) Evaluate the model obtained on the test set, and explain the results obtained.

gampred <- predict(gam1, test)
gampred2 <- predict(gam2, test)
mean((test$Outstate - gampred)^2)
## [1] 3874156
mean((test$Outstate - gampred2)^2)
## [1] 3791404
# The test MSEs are shown above; the smoothing-spline GAM (gam2) achieves a
# slightly lower test error than the natural-spline fit (gam1).
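# Rough sketch of held-out fit quality: a test-set R^2 for gam2
# (computed from the objects defined above).
tss <- mean((test$Outstate - mean(test$Outstate))^2)
1 - mean((test$Outstate - gampred2)^2) / tss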

(d) For which variables, if any, is there evidence of a non-linear relationship with the response?

# Expend is the only variable with clear evidence of a non-linear relationship
# with the response: out-of-state tuition rises roughly linearly at lower
# spending levels and then flattens out as Expend grows. This matches the
# nonparametric ANOVA in summary(gam2), where only s(Expend, df = 5) is
# significant (p = 2.4e-15); the other smooth terms show no significant
# non-linearity.