library(ISLR)
library(boot)
library(splines)
library(MASS)
library(leaps)
library(gam)



Exercise 6

In this exercise, you will further analyze the Wage data set considered throughout this chapter.

Part a)

Perform polynomial regression to predict wage using age. Use cross-validation to select the optimal degree d for the polynomial. What degree was chosen, and how does this compare to the results of hypothesis testing using ANOVA? Make a plot of the resulting polynomial fit to the data.

attach(Wage)
set.seed(123)

cv.error.10 = rep(0, 10)
for (i in 1:10) {
  glm.fit = glm(wage ~ poly(age, i), data = Wage)
  cv.error.10[i] = cv.glm(Wage, glm.fit, K = 10)$delta[1]  # 10-fold CV estimate for degree i
}
plot(cv.error.10, type="l", xlab="Degree", ylab="CV Error")
deg.min <- which.min(cv.error.10)
points(deg.min, cv.error.10[deg.min], col = 'red', cex = 2, pch = 19)

The CV error reaches its minimum at degree 10, but the curve is essentially flat from degree 4 onward, so the gain from higher-degree terms is marginal. We now fit the degree-4 model and use ANOVA to test whether the additional terms are justified.
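
To back the visual impression with numbers, one can print the CV errors directly (a quick check; the exact values depend on the random seed):

round(cv.error.10, 2)                   # CV error for degrees 1 to 10
round(cv.error.10 / cv.error.10[4], 3)  # ratio to the degree-4 error; values near 1 mean little gain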

lm.fit = glm(wage ~ poly(age,4),data=Wage)
summary(lm.fit)
## 
## Call:
## glm(formula = wage ~ poly(age, 4), data = Wage)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -98.707  -24.626   -4.993   15.217  203.693  
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    111.7036     0.7287 153.283  < 2e-16 ***
## poly(age, 4)1  447.0679    39.9148  11.201  < 2e-16 ***
## poly(age, 4)2 -478.3158    39.9148 -11.983  < 2e-16 ***
## poly(age, 4)3  125.5217    39.9148   3.145  0.00168 ** 
## poly(age, 4)4  -77.9112    39.9148  -1.952  0.05104 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 1593.19)
## 
##     Null deviance: 5222086  on 2999  degrees of freedom
## Residual deviance: 4771604  on 2995  degrees of freedom
## AIC: 30641
## 
## Number of Fisher Scoring iterations: 2
#Using ANOVA to compare degree 4 model with others
fit.1 = lm(wage ~ age ,data=Wage)
fit.2 = lm(wage ~ poly(age ,2), data=Wage)
fit.3 = lm(wage ~ poly(age ,3), data=Wage)
fit.4 = lm(wage ~ poly(age ,4), data=Wage)
fit.5 = lm(wage ~ poly(age ,5), data=Wage)
anova(fit.1,fit.2,fit.3,fit.4,fit.5)
## Analysis of Variance Table
## 
## Model 1: wage ~ age
## Model 2: wage ~ poly(age, 2)
## Model 3: wage ~ poly(age, 3)
## Model 4: wage ~ poly(age, 4)
## Model 5: wage ~ poly(age, 5)
##   Res.Df     RSS Df Sum of Sq        F    Pr(>F)    
## 1   2998 5022216                                    
## 2   2997 4793430  1    228786 143.5931 < 2.2e-16 ***
## 3   2996 4777674  1     15756   9.8888  0.001679 ** 
## 4   2995 4771604  1      6070   3.8098  0.051046 .  
## 5   2994 4770322  1      1283   0.8050  0.369682    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Comments: The ANOVA shows that the model loses significance beyond degree 4: the degree-5 term has a p-value of 0.37. The cubic term is clearly significant (p-value 0.0017) and the quartic term is borderline (p-value 0.051, just above 0.05), so we keep degree 4. This agrees with the cross-validation results, where the error drops sharply up to degree 4 and any further decrease is marginal.
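
As a cross-check, because poly() generates orthogonal polynomials, the squared t-statistics from the degree-5 fit reproduce the F-statistics in this table (for example, (-1.952)^2 is roughly 3.81, matching the quartic row); this equivalence is noted in ISLR Section 7.8.1. A quick way to verify it (output not shown):

coef(summary(fit.5))[, "t value"]^2  # compare poly terms 2-5 with the F column above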

agelims = range(age)
age.grid = seq(from=agelims[1], to=agelims[2])

preds = predict(lm.fit, newdata = list(age = age.grid), se = TRUE)
se.bands = cbind(preds$fit + 2 * preds$se.fit, preds$fit - 2 * preds$se.fit)

plot(age, wage, xlim = agelims, cex = .5, col = "darkgrey")
title("Polynomial fit - degree 4")
lines(age.grid, preds$fit, lwd = 2, col = "red")
matlines(age.grid, se.bands, lwd = 1, col = "red", lty = 3)



Part b)

Fit a step function to predict wage using age, and perform cross-validation to choose the optimal number of cuts. Make a plot of the fit obtained.

set.seed(123)
cv.error.20 = rep(NA, 19)
for (i in 2:20) {
  Wage$age.cut = cut(Wage$age, i)              # split age into i intervals
  step.fit = glm(wage ~ age.cut, data = Wage)
  cv.error.20[i - 1] = cv.glm(Wage, step.fit, K = 10)$delta[1]  # element i - 1 holds the error for i cuts
}
cv.error.20
##  [1] 1734.784 1683.196 1636.622 1631.955 1621.842 1612.172 1603.495 1607.430
##  [9] 1605.066 1599.293 1605.722 1605.079 1605.642 1606.663 1599.978 1602.727
## [17] 1605.128 1603.929 1603.177
plot(2:20, cv.error.20, type = "l", xlab = "Number of cuts", ylab = "CV Error")

cuts.min <- which.min(cv.error.20) + 1   # element j of cv.error.20 corresponds to j + 1 cuts
points(cuts.min, cv.error.20[cuts.min - 1], col = "red", cex = 2, pch = 19)

Comments: The minimum CV error occurs at 11 cuts, but the curve is essentially flat from about 8 cuts onward and the error at 10 cuts is within a few units of the minimum. We therefore fit a step function with 10 cuts.

step.fit = glm(wage ~ cut(age, 10), data = Wage)

preds2 = predict(step.fit, newdata = list(age = age.grid), se = TRUE)
se.bands2 = cbind(preds2$fit + 2 * preds2$se.fit, preds2$fit - 2 * preds2$se.fit)

plot(age, wage, xlim = agelims, cex = .5, col = "darkgrey")
title("Step function - 10 cuts")
lines(age.grid, preds2$fit, lwd = 2, col = "red")
matlines(age.grid, se.bands2, lwd = 1, col = "red", lty = 3)
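
For reference, passing a single integer to cut() splits age into that many equal-width intervals; tabulating them shows where the steps fall and how many observations land in each bin (output not shown):

table(cut(age, 10))  # the ten equal-width age intervals and their counts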



Exercise 10

This question relates to the College data set.

Part a)

Split the data into a training set and a test set. Using out-of-state tuition as the response and the other variables as the predictors, perform forward stepwise selection on the training set in order to identify a satisfactory model that uses just a subset of the predictors.

detach(Wage)
attach(College)
set.seed(123)
idx.train = sample(1:nrow(College), size = 0.8 * nrow(College))
train.df = College[idx.train,]
test.df = College[-idx.train,]
fit.fwd = regsubsets(Outstate ~ . , data = train.df, nvmax=17, method = "forward")
fit.summary = summary(fit.fwd)

Choosing the best subset

which.min(fit.summary$cp)    # model size with the smallest Cp (a proxy for test error)
## [1] 13
which.min(fit.summary$bic)   # model size with the smallest BIC
## [1] 11
which.max(fit.summary$adjr2) # model size with the largest adjusted R2 (most variance explained)
## [1] 14
# Plot the three criteria against the number of predictors

par(mfrow=c(2,2))
plot(1:17, fit.summary$cp,xlab="Variables",ylab="Cp",main="Cp", type='b')
plot(1:17, fit.summary$bic,xlab="Variables",ylab="BIC",main="BIC", type='b')
plot(1:17, fit.summary$adjr2,xlab="Variables",ylab="Adjusted R2",main="Adjusted R2", type='b')

Comments: All three criteria level off at around 5-6 predictors: Cp and BIC stop decreasing appreciably and the adjusted R2 stops increasing appreciably beyond that point. We therefore select the 6-predictor model and inspect its coefficients.

coef(fit.fwd,6)
##   (Intercept)    PrivateYes    Room.Board           PhD   perc.alumni 
## -3458.3659400  2848.0400657     0.9185917    37.4327931    47.9043047 
##        Expend     Grad.Rate 
##     0.2177383    29.1813781
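
As a sanity check on the choice of 6 predictors, we can compute the validation-set error for every model size along the forward-selection path. regsubsets() has no built-in predict method, so the helper below (named predict.regsubsets by convention, as in the ISLR labs) is a small sketch:

predict.regsubsets = function(object, newdata, id, ...) {
  form = as.formula(object$call[[2]])  # the formula used in the regsubsets() call
  mat = model.matrix(form, newdata)    # design matrix for the new observations
  coefi = coef(object, id = id)        # coefficients of the model with id predictors
  mat[, names(coefi)] %*% coefi        # predictions
}

val.errors = sapply(1:17, function(i)
  mean((test.df$Outstate - predict(fit.fwd, test.df, id = i))^2))
which.min(val.errors)  # compare with the 6-predictor choice above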



Part b)

Fit a GAM on the training data, using out-of-state tuition as the response and the features selected in the previous step as the predictors. Plot the results, and explain your findings.

gam.m1 = gam(Outstate ~ Private +
               s(Room.Board,4) +
               s(PhD,4) +
               s(perc.alumni,2) +
               s(Expend,4) +
               s(Grad.Rate,5), data=train.df)
par(mfrow=c(2,3))
plot(gam.m1, col="blue", se=T)

Comments: I started with 4 degrees of freedom for every smooth term, assuming the variables behave similarly, then lowered perc.alumni to 2 (its effect looks close to linear) and raised Grad.Rate to 5 (its effect fluctuates more).
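
These degrees of freedom were picked by eye. As a rough cross-check (a sketch, not the method used for the fit above), smooth.spline() can choose an effective df for each predictor by leave-one-out cross-validation:

# Effective df chosen by cross-validation for each smooth term (may warn about
# tied x values; the results are only a rough guide).
sapply(c("Room.Board", "PhD", "perc.alumni", "Expend", "Grad.Rate"),
       function(v) smooth.spline(train.df[[v]], train.df$Outstate, cv = TRUE)$df)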

Interpretation: Holding the other variables fixed, out-of-state tuition increases with room and board costs, and it also increases with the percentage of alumni who donate (perc.alumni). For the remaining variables (PhD, Expend, Grad.Rate) the relationship is non-linear: tuition rises up to a certain point and then flattens or changes slope.



Part c)

Evaluate the model obtained on the test set, and explain the results obtained.

preds = predict(gam.m1,newdata = test.df)
mse = mean((test.df$Outstate - preds)^2)
mse
## [1] 3085088
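
Comments: The test MSE of about 3.09 million corresponds to a root mean squared error of roughly $1,756, which is fairly small relative to the scale of out-of-state tuition (tens of thousands of dollars). To make the number easier to interpret, we can also compute the fraction of variance explained on the test set (a quick sketch using the objects above):

rmse = sqrt(mse)                                           # error on the dollar scale
tss = mean((test.df$Outstate - mean(test.df$Outstate))^2)  # baseline variance of the response
c(RMSE = rmse, test.R2 = 1 - mse / tss)                    # test-set R-squared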



Part d)

For which variables, if any, is there evidence of a non-linear relationship with the response?

Response: As we saw in the plots above, Grad.Rate appears to be the least linear of the predictors. To test this formally, we compare models that omit Grad.Rate, include it linearly, and include it as a smoothing spline with 4 and 5 degrees of freedom.

gam.m2 = gam(Outstate~Private+s(Room.Board,4)+s(PhD,4)+s(perc.alumni,2)+s(Expend,4), data=train.df) # No Grad.Rate
gam.m3 = gam(Outstate~Private+s(Room.Board,4)+s(PhD,4)+s(perc.alumni,2)+s(Expend,4)+Grad.Rate, data=train.df) # Linear Grad. rate
gam.m4 = gam(Outstate~Private+s(Room.Board,4)+s(PhD,4)+s(perc.alumni,2)+s(Expend,4)+s(Grad.Rate,4), data=train.df) #Grad. rate with 4 deg. freedom
gam.m5 = gam(Outstate~Private+s(Room.Board,4)+s(PhD,4)+s(perc.alumni,2)+s(Expend,4)+s(Grad.Rate,5), data=train.df) #Grad. rate with 5 deg. freedom
anova(gam.m1,gam.m2,gam.m3,gam.m4,gam.m5, test="F")
## Analysis of Deviance Table
## 
## Model 1: Outstate ~ Private + s(Room.Board, 4) + s(PhD, 4) + s(perc.alumni, 
##     2) + s(Expend, 4) + s(Grad.Rate, 5)
## Model 2: Outstate ~ Private + s(Room.Board, 4) + s(PhD, 4) + s(perc.alumni, 
##     2) + s(Expend, 4)
## Model 3: Outstate ~ Private + s(Room.Board, 4) + s(PhD, 4) + s(perc.alumni, 
##     2) + s(Expend, 4) + Grad.Rate
## Model 4: Outstate ~ Private + s(Room.Board, 4) + s(PhD, 4) + s(perc.alumni, 
##     2) + s(Expend, 4) + s(Grad.Rate, 4)
## Model 5: Outstate ~ Private + s(Room.Board, 4) + s(PhD, 4) + s(perc.alumni, 
##     2) + s(Expend, 4) + s(Grad.Rate, 5)
##   Resid. Df Resid. Dev       Df   Deviance       F    Pr(>F)    
## 1       600 2134393081                                          
## 2       605 2276654446 -4.99970 -142261365  7.9987 2.591e-07 ***
## 3       604 2171794788  1.00000  104859658 29.4772 8.226e-08 ***
## 4       601 2140992223  2.99986   30802565  2.8864   0.03504 *  
## 5       600 2134393081  0.99985    6599142  1.8554   0.17367    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Comments: The relevant comparisons are model 3 (Grad.Rate entered linearly) versus model 4 (spline with 4 df), which gives a p-value of about 0.035, and model 4 versus model 5 (spline with 5 df), which gives about 0.17. There is therefore evidence that Grad.Rate has a non-linear relationship with Outstate, while more than 4 degrees of freedom does not appear necessary. (Model 5 is identical to model 1, which is why their residual deviances match.)
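
A complementary check, used in the ISLR GAM lab, is the "Anova for Nonparametric Effects" table printed by summary() on a gam fit: a small p-value for a smooth term indicates that its non-linear component is needed (output not shown):

summary(gam.m1)  # the nonparametric-effects ANOVA flags smooth terms with non-linear structure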