Questioin 6. Further analyze the Wage data set.

a) Perform polynomial regression to predict wage using age. Use cross validation to select the optimal degree (d) for the polynomial. What degree was chosen, and how does this compare to the results of hypothesis testing using ANOVA? Make a plot of the resulting polynomial fit to the data.

The optimal degree found by cross-validation is 4. The ANOVA shows that a cubic or quartic polynomial will provide a resonable fit.

library(ISLR2)
library(boot)
attach(Wage)
set.seed(5)
cv.error <- rep(0,5)
for (i in 1:5){
glm.fit <- glm(wage ~ poly(age,i),data=Wage)
cv.error[i]<- cv.glm(Wage,glm.fit,K=10)$delta[1]
}
cv.error
## [1] 1675.881 1600.625 1595.877 1594.165 1594.823
plot(x=1:5, y=cv.error, xlab= "age", ylab = "CV Error", type = "b", pch = 20, lwd = 1, lty = 3)
degree.min <- which.min(cv.error)
points(degree.min, cv.error[degree.min], col = "orange", cex = 2, pch = 20)

fit.1<-lm(wage~age, data=Wage)
fit.2<-lm(wage~poly(age,2), data=Wage)
fit.3<-lm(wage~poly(age,3), data=Wage)
fit.4<-lm(wage~poly(age,4), data=Wage)
fit.5<-lm(wage~poly(age,5), data=Wage)
anova(fit.1, fit.2, fit.3, fit.4, fit.5)
## Analysis of Variance Table
## 
## Model 1: wage ~ age
## Model 2: wage ~ poly(age, 2)
## Model 3: wage ~ poly(age, 3)
## Model 4: wage ~ poly(age, 4)
## Model 5: wage ~ poly(age, 5)
##   Res.Df     RSS Df Sum of Sq        F    Pr(>F)    
## 1   2998 5022216                                    
## 2   2997 4793430  1    228786 143.5931 < 2.2e-16 ***
## 3   2996 4777674  1     15756   9.8888  0.001679 ** 
## 4   2995 4771604  1      6070   3.8098  0.051046 .  
## 5   2994 4770322  1      1283   0.8050  0.369682    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
age.limit<- range(age)
age.grid<- seq(from=age.limit[1], to=age.limit[2])
pred <- predict(fit.4, newdata = list(age=age.grid), se=T)
se.bands<- cbind(pred$fit+2*pred$se.fit, pred$fit -2*pred$se.fit)
plot(age, wage, xlim=age.limit, cex=.5, col ="darkgreen")
title<- ("Degree-4 polynomial")
lines(age.grid, pred$fit, lwd = 2, col="blue")
matlines (age.grid, se.bands, lwd =1, col="blue", lty = 3)

#### b) Fit a step function to predict wage using age, and perform cross-validation to choose the optimal number of cuts. Make a plot of the fit obtained. #### Cross-validation shows that 8 cuts will suffice.

set.seed(10)
cv.err<- rep(NA, 10)
for(i in 2:10){
  Wage$age.cut <- cut(Wage$age,i)
  glm.fit <- glm(wage ~ age.cut, data=Wage)
  cv.err[i] <- cv.glm(Wage, glm.fit, K=10)$delta[1]
}
cv.err
##  [1]       NA 1733.954 1681.195 1637.324 1632.526 1624.776 1610.919 1600.948
##  [9] 1610.202 1603.532
plot(x=1:10, y=cv.err, xlab= "age", ylab = "CV Error", type = "b", pch = 20, lwd = 1, lty = 3)
degree.min <- which.min(cv.err)
points(degree.min, cv.err[degree.min], col = "blue", cex = 2, pch = 20)

step.fit <- glm(wage~cut(age,8), data=Wage)
preds<- predict(step.fit, data.frame(age=age.grid))
plot(age, wage, col = "grey")
lines(age.grid, preds, col = "purple", lwd = 2)

### Question 10. This question relates to the College data set.

a) Split the data into a training set and a test set. Using out-of-state tuition as the response and the other variables as the predictors, perform forward stepwise selection on the training set in order to identify a satisfactory model that uses just a subset of the predictors.

A six variable model is appropriate.

attach(College)
library(leaps)
set.seed(15)
test.set <- sample(1:nrow(College), nrow(College)/2)
train <- College[-test.set,]
test<- College[test.set,]
#Forward step-wise selection
fwd.fit<- regsubsets(Outstate ~., data = College, nvmax = 17, method = "forward")
fit.summary <- summary(fwd.fit)
fit.summary
## Subset selection object
## Call: regsubsets.formula(Outstate ~ ., data = College, nvmax = 17, 
##     method = "forward")
## 17 Variables  (and intercept)
##             Forced in Forced out
## PrivateYes      FALSE      FALSE
## Apps            FALSE      FALSE
## Accept          FALSE      FALSE
## Enroll          FALSE      FALSE
## Top10perc       FALSE      FALSE
## Top25perc       FALSE      FALSE
## F.Undergrad     FALSE      FALSE
## P.Undergrad     FALSE      FALSE
## Room.Board      FALSE      FALSE
## Books           FALSE      FALSE
## Personal        FALSE      FALSE
## PhD             FALSE      FALSE
## Terminal        FALSE      FALSE
## S.F.Ratio       FALSE      FALSE
## perc.alumni     FALSE      FALSE
## Expend          FALSE      FALSE
## Grad.Rate       FALSE      FALSE
## 1 subsets of each size up to 17
## Selection Algorithm: forward
##           PrivateYes Apps Accept Enroll Top10perc Top25perc F.Undergrad
## 1  ( 1 )  " "        " "  " "    " "    " "       " "       " "        
## 2  ( 1 )  "*"        " "  " "    " "    " "       " "       " "        
## 3  ( 1 )  "*"        " "  " "    " "    " "       " "       " "        
## 4  ( 1 )  "*"        " "  " "    " "    " "       " "       " "        
## 5  ( 1 )  "*"        " "  " "    " "    " "       " "       " "        
## 6  ( 1 )  "*"        " "  " "    " "    " "       " "       " "        
## 7  ( 1 )  "*"        " "  " "    " "    " "       " "       " "        
## 8  ( 1 )  "*"        " "  " "    " "    " "       " "       " "        
## 9  ( 1 )  "*"        " "  " "    " "    " "       " "       " "        
## 10  ( 1 ) "*"        " "  "*"    " "    " "       " "       " "        
## 11  ( 1 ) "*"        " "  "*"    " "    " "       " "       "*"        
## 12  ( 1 ) "*"        "*"  "*"    " "    " "       " "       "*"        
## 13  ( 1 ) "*"        "*"  "*"    " "    "*"       " "       "*"        
## 14  ( 1 ) "*"        "*"  "*"    "*"    "*"       " "       "*"        
## 15  ( 1 ) "*"        "*"  "*"    "*"    "*"       " "       "*"        
## 16  ( 1 ) "*"        "*"  "*"    "*"    "*"       "*"       "*"        
## 17  ( 1 ) "*"        "*"  "*"    "*"    "*"       "*"       "*"        
##           P.Undergrad Room.Board Books Personal PhD Terminal S.F.Ratio
## 1  ( 1 )  " "         " "        " "   " "      " " " "      " "      
## 2  ( 1 )  " "         " "        " "   " "      " " " "      " "      
## 3  ( 1 )  " "         "*"        " "   " "      " " " "      " "      
## 4  ( 1 )  " "         "*"        " "   " "      " " " "      " "      
## 5  ( 1 )  " "         "*"        " "   " "      "*" " "      " "      
## 6  ( 1 )  " "         "*"        " "   " "      "*" " "      " "      
## 7  ( 1 )  " "         "*"        " "   "*"      "*" " "      " "      
## 8  ( 1 )  " "         "*"        " "   "*"      "*" "*"      " "      
## 9  ( 1 )  " "         "*"        " "   "*"      "*" "*"      "*"      
## 10  ( 1 ) " "         "*"        " "   "*"      "*" "*"      "*"      
## 11  ( 1 ) " "         "*"        " "   "*"      "*" "*"      "*"      
## 12  ( 1 ) " "         "*"        " "   "*"      "*" "*"      "*"      
## 13  ( 1 ) " "         "*"        " "   "*"      "*" "*"      "*"      
## 14  ( 1 ) " "         "*"        " "   "*"      "*" "*"      "*"      
## 15  ( 1 ) " "         "*"        "*"   "*"      "*" "*"      "*"      
## 16  ( 1 ) " "         "*"        "*"   "*"      "*" "*"      "*"      
## 17  ( 1 ) "*"         "*"        "*"   "*"      "*" "*"      "*"      
##           perc.alumni Expend Grad.Rate
## 1  ( 1 )  " "         "*"    " "      
## 2  ( 1 )  " "         "*"    " "      
## 3  ( 1 )  " "         "*"    " "      
## 4  ( 1 )  "*"         "*"    " "      
## 5  ( 1 )  "*"         "*"    " "      
## 6  ( 1 )  "*"         "*"    "*"      
## 7  ( 1 )  "*"         "*"    "*"      
## 8  ( 1 )  "*"         "*"    "*"      
## 9  ( 1 )  "*"         "*"    "*"      
## 10  ( 1 ) "*"         "*"    "*"      
## 11  ( 1 ) "*"         "*"    "*"      
## 12  ( 1 ) "*"         "*"    "*"      
## 13  ( 1 ) "*"         "*"    "*"      
## 14  ( 1 ) "*"         "*"    "*"      
## 15  ( 1 ) "*"         "*"    "*"      
## 16  ( 1 ) "*"         "*"    "*"      
## 17  ( 1 ) "*"         "*"    "*"
coef(fwd.fit, 6)
##   (Intercept)    PrivateYes    Room.Board           PhD   perc.alumni 
## -3553.2345268  2768.6347025     0.9679086    35.5283359    48.4221031 
##        Expend     Grad.Rate 
##     0.2210255    29.7119093

b) Fit a GAM on the training data, using out-of-state tuition as the response and the features selected in the previous step as the predictors. Plot the results, and explain your findings.

library(gam)
## Loading required package: splines
## Loading required package: foreach
## Loaded gam 1.20.2
library(splines)
gam_fit<- gam(Outstate~ Private+ s(Room.Board,df=2) + s(PhD, df=2) + s(perc.alumni,df=2) + s(Expend, df=2) + s(Grad.Rate, df=2), data = train)
par(mfrow=c(2,3))
plot(gam_fit, se=T, col="purple")

#### c) Evaluate the model obtained on the test set, and explaing the results obtained. #### The model obtains an r-squared of ~0.78.

pred <- predict(gam_fit, newdata = test)
error <- mean((test$Outstate-pred)^2)
error
## [1] 3750164
TotSS= mean((test$Outstate -mean(test$Outstate))^2)
Rsq = 1- error/TotSS
Rsq
## [1] 0.7780796

d) For which variables, if any, is there evidence of a non-linear relationship with the response?

Expend and PhD have a non-linear relationships, the remaining 4 variables are adequate as linear functions.

summary(gam_fit)
## 
## Call: gam(formula = Outstate ~ Private + s(Room.Board, df = 2) + s(PhD, 
##     df = 2) + s(perc.alumni, df = 2) + s(Expend, df = 2) + s(Grad.Rate, 
##     df = 2), data = train)
## Deviance Residuals:
##      Min       1Q   Median       3Q      Max 
## -7444.47 -1201.52   -45.45  1229.99  5056.03 
## 
## (Dispersion Parameter for gaussian family taken to be 3540380)
## 
##     Null Deviance: 6001737341 on 388 degrees of freedom
## Residual Deviance: 1334722893 on 376.9999 degrees of freedom
## AIC: 6983.766 
## 
## Number of Local Scoring Iterations: NA 
## 
## Anova for Parametric Effects
##                         Df     Sum Sq    Mean Sq F value    Pr(>F)    
## Private                  1 1557062605 1557062605 439.801 < 2.2e-16 ***
## s(Room.Board, df = 2)    1 1421421366 1421421366 401.488 < 2.2e-16 ***
## s(PhD, df = 2)           1  484793080  484793080 136.933 < 2.2e-16 ***
## s(perc.alumni, df = 2)   1  266302517  266302517  75.219 < 2.2e-16 ***
## s(Expend, df = 2)        1  405736370  405736370 114.603 < 2.2e-16 ***
## s(Grad.Rate, df = 2)     1   72573436   72573436  20.499 8.014e-06 ***
## Residuals              377 1334722893    3540380                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Anova for Nonparametric Effects
##                        Npar Df Npar F     Pr(F)    
## (Intercept)                                        
## Private                                            
## s(Room.Board, df = 2)        1  0.583   0.44544    
## s(PhD, df = 2)               1  3.804   0.05185 .  
## s(perc.alumni, df = 2)       1  0.522   0.47063    
## s(Expend, df = 2)            1 37.796 2.005e-09 ***
## s(Grad.Rate, df = 2)         1  1.638   0.20138    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1