6. In this exercise, you will further analyze the Wage data set considered throughout this chapter.

library(ISLR)
library(boot)
attach(Wage)
summary(Wage)
##       year           age                     maritl           race     
##  Min.   :2003   Min.   :18.00   1. Never Married: 648   1. White:2480  
##  1st Qu.:2004   1st Qu.:33.75   2. Married      :2074   2. Black: 293  
##  Median :2006   Median :42.00   3. Widowed      :  19   3. Asian: 190  
##  Mean   :2006   Mean   :42.41   4. Divorced     : 204   4. Other:  37  
##  3rd Qu.:2008   3rd Qu.:51.00   5. Separated    :  55                  
##  Max.   :2009   Max.   :80.00                                          
##                                                                        
##               education                     region               jobclass   
##  1. < HS Grad      :268   2. Middle Atlantic   :3000   1. Industrial :1544  
##  2. HS Grad        :971   1. New England       :   0   2. Information:1456  
##  3. Some College   :650   3. East North Central:   0                        
##  4. College Grad   :685   4. West North Central:   0                        
##  5. Advanced Degree:426   5. South Atlantic    :   0                        
##                           6. East South Central:   0                        
##                           (Other)              :   0                        
##             health      health_ins      logwage           wage       
##  1. <=Good     : 858   1. Yes:2083   Min.   :3.000   Min.   : 20.09  
##  2. >=Very Good:2142   2. No : 917   1st Qu.:4.447   1st Qu.: 85.38  
##                                      Median :4.653   Median :104.92  
##                                      Mean   :4.654   Mean   :111.70  
##                                      3rd Qu.:4.857   3rd Qu.:128.68  
##                                      Max.   :5.763   Max.   :318.34  
## 

(a) Perform polynomial regression to predict wage using age. Use cross-validation to select the optimal degree d for the polynomial. What degree was chosen, and how does this compare to the results of hypothesis testing using ANOVA? Make a plot of the resulting polynomial fit to the data.

set.seed(321)  # fix the CV folds so the chosen degree is reproducible
poly.mse=c()
for(degree in 1:8){
  glm.fit=glm(wage~poly(age, degree, raw=TRUE), data=Wage)
  cv.error=cv.glm(Wage, glm.fit, K=10)$delta[1]  # 10-fold CV estimate of test MSE
  poly.mse=c(poly.mse, cv.error)
}
plot(poly.mse, xlab='Degree', ylab='CV Error', type='l')
degree.min=which.min(poly.mse)
points(degree.min, poly.mse[degree.min], pch=18, cex=2, col='darkgreen')

fit=lm(wage~poly(age,5),data=Wage)
agelims=range(age)
age.grid=seq(from=agelims[1],to=agelims[2])
preds=predict(fit,newdata=list(age=age.grid),se=TRUE)
se.bands=cbind(preds$fit+2*preds$se.fit,preds$fit-2*preds$se.fit)  # ~95% pointwise band

par(mfrow=c(1,1),mar=c(4.5,4.5,1,1),oma=c(0,0,2,0))
plot(age, wage, xlim=agelims, cex=.5, col="darkgrey")
title("Degree-5 Polynomial", outer=TRUE)

lines(age.grid,preds$fit,lwd=2,col="darkblue")
matlines(age.grid,se.bands,lwd=1,col="lightblue",lty=3)

The ANOVA results are as follows:

fit.1=lm(wage~age,data=Wage)
fit.2=lm(wage~poly(age,2),data=Wage)
fit.3=lm(wage~poly(age,3),data=Wage)
fit.4=lm(wage~poly(age,4),data=Wage)
fit.5=lm(wage~poly(age,5),data=Wage)
anova(fit.1,fit.2,fit.3,fit.4,fit.5)
## Analysis of Variance Table
## 
## Model 1: wage ~ age
## Model 2: wage ~ poly(age, 2)
## Model 3: wage ~ poly(age, 3)
## Model 4: wage ~ poly(age, 4)
## Model 5: wage ~ poly(age, 5)
##   Res.Df     RSS Df Sum of Sq        F    Pr(>F)    
## 1   2998 5022216                                    
## 2   2997 4793430  1    228786 143.5931 < 2.2e-16 ***
## 3   2996 4777674  1     15756   9.8888  0.001679 ** 
## 4   2995 4771604  1      6070   3.8098  0.051046 .  
## 5   2994 4770322  1      1283   0.8050  0.369682    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

These results suggest that a degree-4 polynomial is sufficient: the degree-5 term is not significant (p = 0.37), so the degree-5 fit shown above is unnecessarily flexible.
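Because poly() generates orthogonal polynomials by default, the same conclusion can be read directly off the coefficient table of the largest model: the square of each t-statistic equals the corresponding F-statistic in the ANOVA table above. A quick check (output not shown):

coef(summary(fit.5))  # squared t-statistics reproduce the ANOVA F-statistics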

fit=lm(wage~poly(age,4),data=Wage)

agelims=range(age)
age.grid=seq(from=agelims[1],to=agelims[2])
preds=predict(fit,newdata=list(age=age.grid),se=TRUE)
se.bands=cbind(preds$fit+2*preds$se.fit,preds$fit-2*preds$se.fit)

par(mfrow=c(1,1),mar=c(4.5,4.5,1,1),oma=c(0,0,2,0))
plot(age, wage, xlim=agelims, cex=.5, col="darkgrey")
title("Degree-4 Polynomial", outer=TRUE)

lines(age.grid,preds$fit,lwd=2,col="darkblue")
matlines(age.grid,se.bands,lwd=1,col="lightblue",lty=3)

(b) Fit a step function to predict wage using age, and perform cross-validation to choose the optimal number of cuts. Make a plot of the fit obtained.

set.seed(321)
step.mse=c()
for(i in 2:10){
  # Pre-compute the binned predictor so cut() uses the same breakpoints in every fold
  Wage.model=model.frame(wage~cut(age,i), data=Wage)
  names(Wage.model)=c('wage','age')
  
  step.fit=glm(wage~age, data=Wage.model)  # 'age' here is a factor with i levels
  cv.err=cv.glm(Wage.model, step.fit, K=10)$delta[1]
  step.mse=c(step.mse, cv.err)
}

plot(2:10, step.mse, xlab='Number of Cuts', ylab='CV Error', type='l')
cut.min=which.min(step.mse)+1  # step.mse[1] corresponds to 2 cuts
points(cut.min, step.mse[cut.min-1], pch=18, cex=2, col='darkgreen')

The CV error is minimized at seven cuts, so we refit the step function on the full data:

fit=lm(wage~cut(age,7),data=Wage)
coef(summary(fit))
##                        Estimate Std. Error   t value      Pr(>|t|)
## (Intercept)            80.06402   2.405222 33.287586 5.517532e-207
## cut(age, 7)(26.9,35.7] 24.51568   2.892501  8.475600  3.617008e-17
## cut(age, 7)(35.7,44.6] 38.59253   2.792477 13.820180  3.695152e-42
## cut(age, 7)(44.6,53.4] 38.05482   2.812402 13.531077  1.556835e-40
## cut(age, 7)(53.4,62.3] 39.03969   3.082094 12.666611  7.411125e-36
## cut(age, 7)(62.3,71.1] 31.03930   4.884196  6.355048  2.400708e-10
## cut(age, 7)(71.1,80.1] 15.99445   9.075719  1.762334  7.811488e-02

agelims=range(age)
age.grid=seq(from=agelims[1],to=agelims[2])
preds=predict(fit,newdata=list(age=age.grid),se=TRUE)
se.bands=cbind(preds$fit+2*preds$se.fit,preds$fit-2*preds$se.fit)

par(mfrow=c(1,1),mar=c(4.5,4.5,1,1),oma=c(0,0,2,0))
plot(age, wage, xlim=agelims, cex=.5, col="darkgrey")
title("Step Function Fit", outer=TRUE)

lines(age.grid,preds$fit,lwd=2,col="darkblue")
matlines(age.grid,se.bands,lwd=1,col="lightblue",lty=3)

detach(Wage)

10. This question relates to the College data set.

attach(College)

(a) Split the data into a training set and a test set. Using out-of-state tuition as the response and the other variables as the predictors, perform forward stepwise selection on the training set in order to identify a satisfactory model that uses just a subset of the predictors.

set.seed(1)
train=sample(dim(College)[1], dim(College)[1]/2)
test=(-train)
College.train = College[train, ]
College.test = College[test, ]
library(leaps)
regfit.fwd=regsubsets(Outstate~., data=College.train, nvmax=17, method="forward")
reg.summary=summary(regfit.fwd)
par(mfrow=c(2,2))
plot(reg.summary$rss, xlab="Number of Variables", ylab="RSS", type="l")
plot(reg.summary$adjr2, xlab="Number of Variables", ylab="Adjusted RSq", type="l")
which.max(reg.summary$adjr2)
## [1] 14
points(14, reg.summary$adjr2[14], col="red", cex=2, pch=20)
plot(reg.summary$cp, xlab="Number of Variables", ylab="Cp", type="l")
which.min(reg.summary$cp)
## [1] 14
points(14, reg.summary$cp[14], col="red", cex=2, pch=20)

plot(reg.summary$bic, xlab="Number of Variables", ylab="BIC", type="l")
which.min(reg.summary$bic)
## [1] 6
points(6, reg.summary$bic[6], col="red", cex=2, pch=20)

BIC favors a six-variable model, so we extract those predictors from the forward-stepwise fit on the training data:

coeffs=coef(regfit.fwd, id=6)
names(coeffs)
## [1] "(Intercept)" "PrivateYes"  "Room.Board"  "PhD"         "perc.alumni"
## [6] "Expend"      "Grad.Rate"

(b) Fit a GAM on the training data, using out-of-state tuition as the response and the features selected in the previous step as the predictors. Plot the results and explain your findings.

library(gam)
## Loading required package: splines
## Loading required package: foreach
## Loaded gam 1.20.1
gam1=gam(Outstate ~ Private + s(Room.Board, 2) + s(PhD, 2) + s(perc.alumni, 2) + s(Expend, 5) + s(Grad.Rate, 2), data=College.train)
par(mfrow = c(2, 3))
plot(gam1, se=TRUE, col ="#1c9099")

The fitted smooth for Expend is clearly non-linear, while the other terms look roughly linear; an informal model comparison supporting this is sketched below.
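As a sanity check (a sketch: gam.lin is a name introduced here, and its output is not shown), we can refit the GAM with Expend entering linearly and compare the two models with an F-test; a small p-value favors the smooth term.

gam.lin=gam(Outstate ~ Private + s(Room.Board, 2) + s(PhD, 2) + s(perc.alumni, 2) + Expend + s(Grad.Rate, 2), data=College.train)
anova(gam.lin, gam1, test="F")  # compares linear vs. smooth Expend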

(c) Evaluate the model obtained on the test set, and explain the results obtained.

preds=predict(gam1, College.test)
test.mse=mean((College.test$Outstate - preds)^2)  # test-set MSE
test.mse
## [1] 3349290
tss=mean((College.test$Outstate - mean(College.test$Outstate))^2)
test.r2=1-test.mse/tss  # test-set R-squared
test.r2
## [1] 0.7660016

The six-variable GAM achieves a test R-squared of 0.766, i.e., it explains about 77% of the variance in out-of-state tuition on the held-out data.
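For context (a sketch: lm.fit and lm.preds are names introduced here, and the resulting value is not part of the output above), an ordinary linear model on the same six predictors can be scored the same way to gauge how much the smooth terms help:

lm.fit=lm(Outstate ~ Private + Room.Board + PhD + perc.alumni + Expend + Grad.Rate, data=College.train)
lm.preds=predict(lm.fit, College.test)
1 - mean((College.test$Outstate - lm.preds)^2)/tss  # linear-model test R-squared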

(d) For which variables, if any, is there evidence of a non-linear relationship with the response?

summary(gam1)
## 
## Call: gam(formula = Outstate ~ Private + s(Room.Board, 2) + s(PhD, 
##     2) + s(perc.alumni, 2) + s(Expend, 5) + s(Grad.Rate, 2), 
##     data = College.train)
## Deviance Residuals:
##      Min       1Q   Median       3Q      Max 
## -7402.89 -1114.45   -12.67  1282.69  7470.60 
## 
## (Dispersion Parameter for gaussian family taken to be 3711182)
## 
##     Null Deviance: 6989966760 on 387 degrees of freedom
## Residual Deviance: 1384271126 on 373 degrees of freedom
## AIC: 6987.021 
## 
## Number of Local Scoring Iterations: NA 
## 
## Anova for Parametric Effects
##                    Df     Sum Sq    Mean Sq F value    Pr(>F)    
## Private             1 1778718277 1778718277 479.286 < 2.2e-16 ***
## s(Room.Board, 2)    1 1577115244 1577115244 424.963 < 2.2e-16 ***
## s(PhD, 2)           1  322431195  322431195  86.881 < 2.2e-16 ***
## s(perc.alumni, 2)   1  336869281  336869281  90.771 < 2.2e-16 ***
## s(Expend, 5)        1  530538753  530538753 142.957 < 2.2e-16 ***
## s(Grad.Rate, 2)     1   86504998   86504998  23.309 2.016e-06 ***
## Residuals         373 1384271126    3711182                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Anova for Nonparametric Effects
##                   Npar Df  Npar F     Pr(F)    
## (Intercept)                                    
## Private                                        
## s(Room.Board, 2)        1  1.9157    0.1672    
## s(PhD, 2)               1  0.9699    0.3253    
## s(perc.alumni, 2)       1  0.1859    0.6666    
## s(Expend, 5)            4 20.5075 2.665e-15 ***
## s(Grad.Rate, 2)         1  0.5702    0.4506    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

In the ANOVA for nonparametric effects, Expend is the only variable with significant evidence of a non-linear relationship with the response (p = 2.7e-15); this matches the pronounced curvature of its smooth in the plot from 10(b).
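Finally, mirroring the Wage analysis above, we detach the data set:

detach(College)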