Data

data("Wage")
data("College")

Question 6

In this exercise, you will further analyze the Wage data set considered throughout this chapter.

  1. Perform polynomial regression to predict wage using age. Use cross-validation to select the optimal degree d for the polynomial. What degree was chosen, and how does this compare to the results of hypothesis testing using ANOVA? Make a plot of the resulting polynomial fit to the data.
# Fit polynomials of degree 1-10; only the last fit (degree 10) survives,
# so summary(fit) shows the degree-10 model, whose orthogonal-polynomial
# t-tests act as sequential significance tests for each added degree.
for(i in 1:10){
  fit=glm(wage~poly(age, i), data=Wage)
}
summary(fit)
## 
## Call:
## glm(formula = wage ~ poly(age, i), data = Wage)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -100.38   -24.45    -4.97    15.49   199.61  
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     111.7036     0.7283 153.369  < 2e-16 ***
## poly(age, i)1   447.0679    39.8924  11.207  < 2e-16 ***
## poly(age, i)2  -478.3158    39.8924 -11.990  < 2e-16 ***
## poly(age, i)3   125.5217    39.8924   3.147  0.00167 ** 
## poly(age, i)4   -77.9112    39.8924  -1.953  0.05091 .  
## poly(age, i)5   -35.8129    39.8924  -0.898  0.36940    
## poly(age, i)6    62.7077    39.8924   1.572  0.11607    
## poly(age, i)7    50.5498    39.8924   1.267  0.20520    
## poly(age, i)8   -11.2547    39.8924  -0.282  0.77787    
## poly(age, i)9   -83.6918    39.8924  -2.098  0.03599 *  
## poly(age, i)10    1.6240    39.8924   0.041  0.96753    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 1591.402)
## 
##     Null deviance: 5222086  on 2999  degrees of freedom
## Residual deviance: 4756701  on 2989  degrees of freedom
## AIC: 30644
## 
## Number of Fisher Scoring iterations: 2
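The loop above refits the model ten times but never cross-validates. Below is a minimal sketch of the 10-fold cross-validation the exercise asks for, using cv.glm() from the boot package; the exact errors depend on the seed, and the degree reported in the conclusion below is consistent with runs of this kind.

set.seed(1)
cv.error=rep(NA, 10)
for(i in 1:10){
  fit=glm(wage~poly(age, i), data=Wage)
  cv.error[i]=cv.glm(Wage, fit, K=10)$delta[1]  # raw 10-fold CV estimate
}
which.min(cv.error)                              # degree with the lowest CV error
plot(1:10, cv.error, type="b", xlab="Degree", ylab="10-fold CV error")
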
# Nested polynomial fits for a sequential ANOVA comparison
fit1=lm(wage~age, data=Wage)
fit2=lm(wage~poly(age, 2), data=Wage)
fit3=lm(wage~poly(age, 3), data=Wage)
fit4=lm(wage~poly(age, 4), data=Wage)
fit5=lm(wage~poly(age, 5), data=Wage)
anova(fit1, fit2, fit3, fit4, fit5)
## Analysis of Variance Table
## 
## Model 1: wage ~ age
## Model 2: wage ~ poly(age, 2)
## Model 3: wage ~ poly(age, 3)
## Model 4: wage ~ poly(age, 4)
## Model 5: wage ~ poly(age, 5)
##   Res.Df     RSS Df Sum of Sq        F    Pr(>F)    
## 1   2998 5022216                                    
## 2   2997 4793430  1    228786 143.5931 < 2.2e-16 ***
## 3   2996 4777674  1     15756   9.8888  0.001679 ** 
## 4   2995 4771604  1      6070   3.8098  0.051046 .  
## 5   2994 4770322  1      1283   0.8050  0.369682    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Cross-validation selected degree 4, and the ANOVA tells a consistent story: the cubic term is clearly significant (p = 0.0017), the quartic is borderline (p = 0.051), and the quintic adds nothing, so a degree-3 or degree-4 polynomial is a reasonable fit.

plot(wage~age, data=Wage, col="grey")
title("Polynomial regression with degree 4")
agel=range(Wage$age)                        # range of observed ages
ageg=seq(from=agel[1], to=agel[2])          # grid of ages for the fitted curve
fit=glm(wage~poly(age, 4), data=Wage)
pred=predict(fit, newdata=list(age=ageg))
lines(ageg, pred, col="blue", lwd=2)

  2. Fit a step function to predict wage using age, and perform cross-validation to choose the optimal number of cuts. Make a plot of the fit obtained.
# Fit step functions with 2-10 cuts; only the last fit (10 intervals)
# survives the loop, so summary(fit) shows the 10-cut model.
for(i in 2:10){
  fit=glm(wage~cut(age, i), data=Wage)
}
summary(fit)
## 
## Call:
## glm(formula = wage ~ cut(age, i), data = Wage)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -99.966  -24.791   -4.883   16.025  201.787  
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              73.344      3.024  24.253  < 2e-16 ***
## cut(age, i)(24.2,30.4]   24.359      3.709   6.567 6.02e-11 ***
## cut(age, i)(30.4,36.6]   35.441      3.570   9.929  < 2e-16 ***
## cut(age, i)(36.6,42.8]   44.767      3.478  12.871  < 2e-16 ***
## cut(age, i)(42.8,49]     46.707      3.413  13.687  < 2e-16 ***
## cut(age, i)(49,55.2]     43.145      3.574  12.072  < 2e-16 ***
## cut(age, i)(55.2,61.4]   46.389      3.882  11.948  < 2e-16 ***
## cut(age, i)(61.4,67.6]   43.211      5.081   8.504  < 2e-16 ***
## cut(age, i)(67.6,73.8]   23.271      7.796   2.985  0.00286 ** 
## cut(age, i)(73.8,80.1]   21.217     11.500   1.845  0.06515 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 1600.43)
## 
##     Null deviance: 5222086  on 2999  degrees of freedom
## Residual deviance: 4785285  on 2990  degrees of freedom
## AIC: 30660
## 
## Number of Fisher Scoring iterations: 2
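As before, the loop only refits models. Here is a sketch of the cross-validation step for choosing the number of cuts, again with cv.glm(); the cut variable is precomputed so every fold shares the same break points.

set.seed(1)
cv.error=rep(NA, 10)
for(i in 2:10){
  Wage$age.cut=cut(Wage$age, i)        # fixed break points across folds
  fit=glm(wage~age.cut, data=Wage)
  cv.error[i]=cv.glm(Wage, fit, K=10)$delta[1]
}
which.min(cv.error)                     # number of cuts with the lowest CV error
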
plot(wage~age, data=Wage, col="grey")
title("Step function with 10 cuts")
agel=range(Wage$age)
ageg=seq(from=agel[1], to=agel[2])
fit=glm(wage~cut(age, 10), data=Wage)
pred=predict(fit, newdata=list(age=ageg))
lines(ageg, pred, col="blue", lwd=2)

Question 10

This question relates to the College data set.

  1. Split the data into a training set and a test set. Using out-of-state tuition as the response and the other variables as the predictors, perform forward stepwise selection on the training set in order to identify a satisfactory model that uses just a subset of the predictors.
set.seed(1)
i=sample(1:nrow(College), nrow(College)*0.8)   # 80/20 train/test split
train=College[i,]
test=College[-i,]


stepwise=regsubsets(Outstate~., data=train, method="forward", nvmax=18)
stepwises=summary(stepwise)

plot(stepwises$cp, xlab="Number of variables", ylab="Cp")
points(which.min(stepwises$cp), stepwises$cp[which.min(stepwises$cp)], col="blue", cex=2, pch=20)

plot(stepwises$bic, xlab="Number of variables", ylab="BIC")
points(which.min(stepwises$bic), stepwises$bic[which.min(stepwises$bic)], col="blue", cex=2, pch=20)

Both Cp and BIC reach their minima at around 11 to 13 predictors and are essentially flat past that point, so an 11-variable model is a reasonable parsimonious choice.
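regsubsets() has no predict() method, so before settling on 11 variables it is worth checking test error across model sizes. A sketch, using the standard helper from the ISLR labs (the helper and loop are ours, not part of the original analysis):

predict.regsubsets=function(object, newdata, id, ...){
  form=as.formula(object$call[[2]])     # recover the formula from the call
  mat=model.matrix(form, newdata)
  coefi=coef(object, id=id)
  mat[, names(coefi)] %*% coefi
}

test.err=rep(NA, 17)
for(j in 1:17){
  pred=predict(stepwise, test, id=j)
  test.err[j]=mean((test$Outstate-pred)^2)
}
plot(test.err, type="b", xlab="Number of variables", ylab="Test MSE")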

  2. Fit a GAM on the training data, using out-of-state tuition as the response and the features selected in the previous step as the predictors. Plot the results, and explain your findings.
coef(stepwise, id=11)
##   (Intercept)    PrivateYes          Apps        Accept        Enroll 
## -2257.2968624  2528.5841801    -0.3256427     0.8004141    -0.7645416 
##     Top10perc    Room.Board      Personal           PhD   perc.alumni 
##    27.1711538     0.9342392    -0.3472783    26.7456972    51.6472233 
##        Expend     Grad.Rate 
##     0.2031258    21.0112073
# GAM (mgcv) with a smooth term for each selected numeric predictor
gam1=gam(Outstate~Private+s(Apps)+s(Accept)+s(Enroll)+s(Top10perc)+s(Room.Board)+s(Personal)+s(PhD)+s(perc.alumni)+s(Expend)+s(Grad.Rate), data=train)
plot(gam1, se=TRUE, col="blue")

The fitted effects of some variables, such as PhD and perc.alumni, are essentially linear, while others, such as Accept, Enroll, and Expend, show clear non-linearity.

  3. Evaluate the model obtained on the test set, and explain the results obtained.
pred=predict(gam1, test)
error=mean((test$Outstate-pred)^2)   # test MSE for the GAM
error
## [1] 2768897
t=mean((test$Outstate-mean(test$Outstate))^2)   # baseline: variance around the test mean
r2=1-error/t                                    # test R-squared
r2
## [1] 0.8026239
# Linear model with the same 11 predictors, for comparison
fitlm=lm(Outstate~Private+Apps+Accept+Enroll+Top10perc+Room.Board+Personal+PhD+perc.alumni+Expend+Grad.Rate, data=train)

predl=predict(fitlm, test)
errorl=mean((test$Outstate-predl)^2)
errorl
## [1] 3226502
tl=mean((test$Outstate-mean(test$Outstate))^2)
r2l=1-errorl/tl
r2l
## [1] 0.7700043

The GAM outperforms the linear model on the test set: test MSE of roughly 2.77 million versus 3.23 million, and test R-squared of 0.80 versus 0.77, so allowing non-linear effects improves out-of-sample prediction.
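A quick complementary check on the training side (a sketch; both fits are Gaussian, so their AICs are directly comparable):

AIC(gam1, fitlm)   # a lower AIC for the GAM would corroborate the test-set result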

  4. For which variables, if any, is there evidence of a non-linear relationship with the response?
summary(gam1)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## Outstate ~ Private + s(Apps) + s(Accept) + s(Enroll) + s(Top10perc) + 
##     s(Room.Board) + s(Personal) + s(PhD) + s(perc.alumni) + s(Expend) + 
##     s(Grad.Rate)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   8740.8      209.8  41.664   <2e-16 ***
## PrivateYes    2398.1      267.1   8.978   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                  edf Ref.df      F  p-value    
## s(Apps)        1.000  1.000  2.110  0.14688    
## s(Accept)      5.434  6.673  5.720 4.80e-06 ***
## s(Enroll)      8.423  8.874  4.677 8.43e-06 ***
## s(Top10perc)   1.370  1.659  2.651  0.10622    
## s(Room.Board)  5.860  7.064  8.144  < 2e-16 ***
## s(Personal)    3.386  4.245  3.441  0.00746 ** 
## s(PhD)         1.000  1.000  2.225  0.13636    
## s(perc.alumni) 1.000  1.000 27.443 3.93e-07 ***
## s(Expend)      6.902  7.970 23.204  < 2e-16 ***
## s(Grad.Rate)   3.267  4.142  3.985  0.00326 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.813   Deviance explained = 82.4%
## GCV = 3.3422e+06  Scale est. = 3.1289e+06  n = 621

The strongest evidence of non-linearity is for Accept, Enroll, Room.Board, and Expend, whose smooth terms are highly significant with effective degrees of freedom well above 1; Personal and Grad.Rate show milder non-linear effects. Apps, Top10perc, PhD, and perc.alumni all have edf of about 1, so their estimated effects are essentially linear (perc.alumni is strongly significant, but as a linear term).
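A formal way to check this (a sketch; gam2 is a hypothetical refit, not part of the analysis above) is to make the edf-near-1 terms parametric and compare the nested models with an approximate F test:

gam2=gam(Outstate~Private+Apps+Top10perc+PhD+perc.alumni+s(Accept)+s(Enroll)+s(Room.Board)+s(Personal)+s(Expend)+s(Grad.Rate), data=train)
anova(gam2, gam1, test="F")   # tests the extra non-linear flexibility in gam1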