Question 6

In this exercise, you will further analyze the Wage data set considered throughout this chapter. (a)Perform polynomial regression to predict wage using age. Use cross-validation to select the optimal degree d for the polynomial. What degree was chosen, and how does this compare to the results of hypothesis testing using ANOVA? Make a plot of the resulting polynomial fit to the data.

SOLUTION

#1. Choosing optimal degree d

set.seed(1)
cv.error <- rep(0,10)
for (i in 1:10){
  glm.fit <- glm(wage~poly(age, i, raw=T), data=Wage)
  cv.error[i] <- cv.glm(Wage, glm.fit, K=10)$delta[1]
}
plot(1:10, cv.error)

d=5 is sufficient.

#2.Using ANOVA

fit.1 <- lm(wage ∼ age , data = Wage)
fit.2 <- lm(wage ∼ poly(age , 2), data = Wage)
fit.3 <- lm(wage ∼ poly(age , 3), data = Wage)
fit.4 <- lm(wage ∼ poly(age , 4), data = Wage)
fit.5 <- lm(wage ∼ poly(age , 5), data = Wage)
 anova (fit.1, fit.2, fit.3, fit.4, fit.5)
## Analysis of Variance Table
## 
## Model 1: wage ~ age
## Model 2: wage ~ poly(age, 2)
## Model 3: wage ~ poly(age, 3)
## Model 4: wage ~ poly(age, 4)
## Model 5: wage ~ poly(age, 5)
##   Res.Df     RSS Df Sum of Sq        F    Pr(>F)    
## 1   2998 5022216                                    
## 2   2997 4793430  1    228786 143.5931 < 2.2e-16 ***
## 3   2996 4777674  1     15756   9.8888  0.001679 ** 
## 4   2995 4771604  1      6070   3.8098  0.051046 .  
## 5   2994 4770322  1      1283   0.8050  0.369682    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

From the results, we see that the linear and quadratic fit are insufficient as the p-value comparing the Model 1 throough model 3 is very small. Hence either the model with cubic and fourth degree polynomial is sufficient.

POLYNOMIAL FIT

#degree 5
poly.fit <- lm(wage~poly(age, 5, raw=T), data=Wage)
agelims <- range(age)
age.grid <- seq(from=agelims[1], to=agelims[2])
preds <- predict(poly.fit, newdata=list(age=age.grid), se=TRUE)
se.bands <- cbind(preds$fit + 2*preds$se.fit, preds$fit-2*preds$se.fit)

par(mfrow=c(1,2), mar=c(4.5, 4.5, 1, 1), oma=c(0,0,4,0))
plot(age,wage, xlim=agelims, col="darkgrey")
title("Degree-5 Polynomial", outer=T)
lines(age.grid,preds$fit, lwd=2, col="blue")
matlines(age.grid, se.bands, lwd=1, col="blue", lty=3)

#degree 4
poly.fit4 <- lm(wage~poly(age, 4), data=Wage)
agelims <- range(age)
age.grid <- seq(from=agelims[1], to=agelims[2])
preds <- predict(poly.fit, newdata=list(age=age.grid), se=TRUE)
se.bands <- cbind(preds$fit + 2*preds$se.fit, preds$fit-2*preds$se.fit)

par(mfrow=c(1,2), mar=c(4.5, 4.5, 1, 1), oma=c(0,0,4,0))

plot(age,wage, xlim=agelims, col="darkgrey")
title("Degree-4 Polynomial", outer=T)
lines(age.grid,preds$fit, lwd=2, col="red")
matlines(age.grid, se.bands, lwd=1, col="red", lty=3)

  1. Fit a step function to predict wage using age, and perform crossvalidation to choose the optimal number of cuts. Make a plot of the fit obtained. ## SOLUTION
set.seed(1)
cv.error.1 <- rep(NA,10)
for (i in 2:10) {
    Wage$age.cut <- cut(age, i)
    fits <- glm(wage ~ age.cut, data = Wage)
    cv.error.1[i] <- cv.glm(Wage, fits, K = 10)$delta[1]
}
cv.error.1
##  [1]       NA 1734.489 1684.271 1635.552 1632.080 1623.415 1614.996 1601.318
##  [9] 1613.954 1606.331
plot(cv.error.1)

cuts=8

step.fit <- lm(wage~cut(age,8), data=Wage)
summary(step.fit)
## 
## Call:
## lm(formula = wage ~ cut(age, 8), data = Wage)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -99.697 -24.552  -5.307  15.417 198.560 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              76.282      2.630  29.007  < 2e-16 ***
## cut(age, 8)(25.8,33.5]   25.833      3.161   8.172 4.44e-16 ***
## cut(age, 8)(33.5,41.2]   40.226      3.049  13.193  < 2e-16 ***
## cut(age, 8)(41.2,49]     43.501      3.018  14.412  < 2e-16 ***
## cut(age, 8)(49,56.8]     40.136      3.177  12.634  < 2e-16 ***
## cut(age, 8)(56.8,64.5]   44.102      3.564  12.373  < 2e-16 ***
## cut(age, 8)(64.5,72.2]   28.948      6.042   4.792 1.74e-06 ***
## cut(age, 8)(72.2,80.1]   15.224      9.781   1.556     0.12    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 39.97 on 2992 degrees of freedom
## Multiple R-squared:  0.08467,    Adjusted R-squared:  0.08253 
## F-statistic: 39.54 on 7 and 2992 DF,  p-value: < 2.2e-16
agelim<-range(age)
age.grid <- seq(from=agelim[1], to=agelim[2])
age.pred <- predict(step.fit, newdata=list(age=age.grid), se=T)
se.bands <- cbind(age.pred$fit + 2*age.pred$se.fit, age.pred$fit-2*age.pred$se.fit)

par(mfrow=c(1,2), mar=c(4.5, 4.5, 1, 1), oma=c(0,0,4,0))
plot(Wage$age, wage, xlim=agelim, cex=0.5, col="darkgrey")
lines(age.grid, age.pred$fit, lwd=2, col="red")
matlines(age.grid, se.bands, lwd=1, col="yellow", lty=3)

10.This question relates to the College data set. (a) Split the data into a training set and a test set. Using out-of-state tuition as the response and the other variables as the predictors, perform forward stepwise selection on the training set in order to identify a satisfactory model that uses just a subset of the predictors.

attach(College)
library(caTools)
## Warning: package 'caTools' was built under R version 4.1.3
library(leaps)
## Warning: package 'leaps' was built under R version 4.1.3
set.seed(4)
split <- sample.split(College, SplitRatio = 0.7)
Train<- subset(College, split==TRUE)
Test<- subset(College, split==FALSE)

outstae.fwd <- regsubsets(Outstate~.,data=Train, method = "forward", nvmax = 17)
reg.summary <- summary(outstae.fwd)

## CONDITIONS
par (mfrow = c(2, 2))
plot (reg.summary$rss , xlab = " Number of Variables ",
ylab = " RSS ", type = "l")
plot (reg.summary$adjr2 , xlab = " Number of Variables ",
ylab = " Adjusted RSq ", type = "l")
which.max (reg.summary$adjr2)
## [1] 13
points (13, reg.summary$adjr2[13], col = " red ", cex = 2,
pch = 20)
plot(reg.summary$cp, xlab = " Number of Variables ",
ylab = "Cp", type = "l")
which.min (reg.summary$cp)
## [1] 13
points (13, reg.summary$cp[13], col = " red ", cex = 2,
pch = 20)
which.min (reg.summary$bic)
## [1] 12
plot(reg.summary$bic , xlab = " Number of Variables ",
ylab = " BIC ", type = "l")

coef(outstae.fwd, 12)
##  (Intercept)   PrivateYes         Apps       Accept    Top10perc  F.Undergrad 
## -498.8406116 1998.8934761   -0.3535329    0.8428989   27.6730748   -0.1843719 
##   Room.Board     Personal          PhD    S.F.Ratio  perc.alumni       Expend 
##    0.9662785   -0.2089996   22.8819556  -69.4121802   44.5025373    0.1935047 
##    Grad.Rate 
##   18.8065512

I would choose a twelve-variable model as it has the smallest bic value.

  1. Fit a GAM on the training data, using out-of-state tuition as the response and the features selected in the previous step as the predictors. Plot the results, and explain your findings.
library(gam)
## Warning: package 'gam' was built under R version 4.1.3
## Loading required package: splines
## Loading required package: foreach
## Warning: package 'foreach' was built under R version 4.1.3
## Loaded gam 1.22-2
gam1 <- lm(Outstate~ns(Top10perc,4)+Private+ns(Apps,4)+ns(Accept,4)+ns(F.Undergrad, 4)+ns(Room.Board,4)+ns(Personal,4)+ns(PhD,4)+ns(S.F.Ratio,4)+ns(perc.alumni,4)+ns(Expend,4)+ns(Grad.Rate,4), data=Train)

par(mfrow=c(1,3))
gam.m1 <- gam(Outstate~s(Top10perc,4)+Private+s(Apps,4)+s(Accept,4)+s(F.Undergrad, 4)+s(Room.Board,4)+s(Personal,4)+s(PhD,4)+s(S.F.Ratio,4)+s(perc.alumni,4)+s(Expend,4)+s(Grad.Rate,4), data=Train)

plot(gam.m1, se=T, col="red")

  1. Evaluate the model obtained on the test set, and explain the results obtained.
preds <- predict(gam.m1, newdata = Test)
test.MSE <- mean((preds-Test$Outstate)^2)
test.MSE
## [1] 3137820

T (d) For which variables, if any, is there evidence of a non-linear relationship with the response?

summary(gam.m1)
## 
## Call: gam(formula = Outstate ~ s(Top10perc, 4) + Private + s(Apps, 
##     4) + s(Accept, 4) + s(F.Undergrad, 4) + s(Room.Board, 4) + 
##     s(Personal, 4) + s(PhD, 4) + s(S.F.Ratio, 4) + s(perc.alumni, 
##     4) + s(Expend, 4) + s(Grad.Rate, 4), data = Train)
## Deviance Residuals:
##      Min       1Q   Median       3Q      Max 
## -5691.06 -1058.50    85.85  1135.80  7566.55 
## 
## (Dispersion Parameter for gaussian family taken to be 3220605)
## 
##     Null Deviance: 8612051022 on 518 degrees of freedom
## Residual Deviance: 1523344492 on 472.9995 degrees of freedom
## AIC: 9295.947 
## 
## Number of Local Scoring Iterations: NA 
## 
## Anova for Parametric Effects
##                    Df     Sum Sq    Mean Sq  F value    Pr(>F)    
## s(Top10perc, 4)     1 2175704900 2175704900 675.5579 < 2.2e-16 ***
## Private             1 1749570776 1749570776 543.2429 < 2.2e-16 ***
## s(Apps, 4)          1  126603875  126603875  39.3106 8.155e-10 ***
## s(Accept, 4)        1     444627     444627   0.1381   0.71039    
## s(F.Undergrad, 4)   1  329233234  329233234 102.2271 < 2.2e-16 ***
## s(Room.Board, 4)    1  593311562  593311562 184.2236 < 2.2e-16 ***
## s(Personal, 4)      1   18725265   18725265   5.8142   0.01628 *  
## s(PhD, 4)           1   60108701   60108701  18.6638 1.901e-05 ***
## s(S.F.Ratio, 4)     1  137150511  137150511  42.5853 1.745e-10 ***
## s(perc.alumni, 4)   1  127348161  127348161  39.5417 7.311e-10 ***
## s(Expend, 4)        1  440731420  440731420 136.8474 < 2.2e-16 ***
## s(Grad.Rate, 4)     1   32300890   32300890  10.0294   0.00164 ** 
## Residuals         473 1523344492    3220605                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Anova for Nonparametric Effects
##                   Npar Df  Npar F     Pr(F)    
## (Intercept)                                    
## s(Top10perc, 4)         3  0.9085  0.436766    
## Private                                        
## s(Apps, 4)              3  2.4895  0.059700 .  
## s(Accept, 4)            3  8.6563 1.331e-05 ***
## s(F.Undergrad, 4)       3  4.2898  0.005305 ** 
## s(Room.Board, 4)        3  0.6059  0.611402    
## s(Personal, 4)          3  3.6482  0.012674 *  
## s(PhD, 4)               3  1.1009  0.348385    
## s(S.F.Ratio, 4)         3  3.5481  0.014508 *  
## s(perc.alumni, 4)       3  0.9702  0.406547    
## s(Expend, 4)            3 20.4074 1.881e-12 ***
## s(Grad.Rate, 4)         3  1.7415  0.157646    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1