library(tidyverse)
library(ISLR)
library(boot)
library(leaps)
library(gam)
6. In this exercise, you will further analyze the Wage data set considered throughout this chapter.
set.seed(1)
df <- Wage
degrees <- seq(1, 5)                          # Polynomial degrees to try
Poly.fits <- vector("list", length(degrees))  # Container to hold the fitted models
Poly.error <- rep(NA, length(degrees))        # 10-fold CV error for each degree
for (d in degrees) {
  Poly.fits[[d]] <- glm(wage ~ poly(age, d), data = df)
  Poly.error[d] <- cv.glm(df, Poly.fits[[d]], K = 10)$delta[1]
}
do.call("anova", c(Poly.fits, test = "F"))  # F-tests comparing the successive nested fits
## Analysis of Deviance Table
##
## Model 1: wage ~ poly(age, d)
## Model 2: wage ~ poly(age, d)
## Model 3: wage ~ poly(age, d)
## Model 4: wage ~ poly(age, d)
## Model 5: wage ~ poly(age, d)
## Resid. Df Resid. Dev Df Deviance F Pr(>F)
## 1 2998 5022216
## 2 2997 4793430 1 228786 143.5931 < 2.2e-16 ***
## 3 2996 4777674 1 15756 9.8888 0.001679 **
## 4 2995 4771604 1 6070 3.8098 0.051046 .
## 5 2994 4770322 1 1283 0.8050 0.369682
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot(degrees, Poly.error, type = "l", xlab = "Degree", ylab = "10-fold CV error")
Based on the ANOVA, degree 4 is the cutoff: the quartic term is only marginally significant (p ≈ 0.051) and the quintic term is not significant, so we should not include any polynomial terms beyond degree 4. The CV plot tells a similar story: the estimated test MSE stops decreasing meaningfully after degree 3, so a cubic fit is a defensible choice; a quick sketch of that fit follows.
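As a visual check (an addition to the original write-up, assuming the cubic chosen above), we can refit the degree-3 polynomial with a literal degree and overlay it on the raw data:
poly.best <- glm(wage ~ poly(age, 3), data = df)  # Literal degree so predict() resolves cleanly
age.grid <- seq(min(df$age), max(df$age))
poly.pred <- predict(poly.best, newdata = data.frame(age = age.grid))
plot(df$age, df$wage, col = "darkgrey", xlab = "age", ylab = "wage")
lines(age.grid, poly.pred, col = "blue", lwd = 2)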
cuts <- seq(2, 10)                         # Numbers of intervals to try
Step.fits <- vector("list", length(cuts))  # Container to hold the fitted models
Step.error <- rep(NA, length(cuts))        # 10-fold CV error for each number of intervals
for (k in cuts) {
  df$age.cut <- cut(df$age, k)
  Step.fits[[k - 1]] <- glm(wage ~ age.cut, data = df)
  Step.error[k - 1] <- cv.glm(df, Step.fits[[k - 1]], K = 10)$delta[1]
}
plot(cuts, Step.error, type = "l", xlab = "Number of intervals", ylab = "10-fold CV error")
The cross-validated MSE is smallest when the age range is split into 8 intervals; a sketch of that step-function fit follows.
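As another visual check (again an addition, assuming the 8-interval choice above), refit the step function and overlay it on the data:
df$age.cut <- cut(df$age, 8)  # Assumed best number of intervals from the CV plot
step.best <- glm(wage ~ age.cut, data = df)
plot(df$age, df$wage, col = "darkgrey", xlab = "age", ylab = "wage")
o <- order(df$age)
lines(df$age[o], fitted(step.best)[o], col = "red", lwd = 2)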
10. This question relates to the College data set.
df10 <- College
train <- sample(1:nrow(df10), floor(nrow(df10) * 0.80))  # 80% of rows for training
test <- -train                                           # Negative indices give the held-out rows
Forward.fit <- regsubsets(Outstate ~ ., data = df10, subset = train, method = "forward")
Forward.summary <- summary(Forward.fit)
Forward.summary
## Subset selection object
## Call: regsubsets.formula(Outstate ~ ., data = df10, subset = train,
## method = "forward")
## 17 Variables (and intercept)
## Forced in Forced out
## PrivateYes FALSE FALSE
## Apps FALSE FALSE
## Accept FALSE FALSE
## Enroll FALSE FALSE
## Top10perc FALSE FALSE
## Top25perc FALSE FALSE
## F.Undergrad FALSE FALSE
## P.Undergrad FALSE FALSE
## Room.Board FALSE FALSE
## Books FALSE FALSE
## Personal FALSE FALSE
## PhD FALSE FALSE
## Terminal FALSE FALSE
## S.F.Ratio FALSE FALSE
## perc.alumni FALSE FALSE
## Expend FALSE FALSE
## Grad.Rate FALSE FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: forward
## PrivateYes Apps Accept Enroll Top10perc Top25perc F.Undergrad
## 1 ( 1 ) " " " " " " " " " " " " " "
## 2 ( 1 ) "*" " " " " " " " " " " " "
## 3 ( 1 ) "*" " " " " " " " " " " " "
## 4 ( 1 ) "*" " " " " " " " " " " " "
## 5 ( 1 ) "*" " " " " " " " " " " " "
## 6 ( 1 ) "*" " " " " " " " " " " " "
## 7 ( 1 ) "*" " " " " " " " " " " " "
## 8 ( 1 ) "*" " " "*" " " " " " " " "
## P.Undergrad Room.Board Books Personal PhD Terminal S.F.Ratio
## 1 ( 1 ) " " " " " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " " " " "
## 3 ( 1 ) " " "*" " " " " " " " " " "
## 4 ( 1 ) " " "*" " " " " " " " " " "
## 5 ( 1 ) " " "*" " " " " " " "*" " "
## 6 ( 1 ) " " "*" " " " " " " "*" " "
## 7 ( 1 ) " " "*" " " "*" " " "*" " "
## 8 ( 1 ) " " "*" " " "*" " " "*" " "
## perc.alumni Expend Grad.Rate
## 1 ( 1 ) " " "*" " "
## 2 ( 1 ) " " "*" " "
## 3 ( 1 ) " " "*" " "
## 4 ( 1 ) "*" "*" " "
## 5 ( 1 ) "*" "*" " "
## 6 ( 1 ) "*" "*" "*"
## 7 ( 1 ) "*" "*" "*"
## 8 ( 1 ) "*" "*" "*"
coef(Forward.fit, id = which.min(Forward.summary$bic)) # The model with the smallest BIC
## (Intercept) PrivateYes Room.Board Personal Terminal
## -3362.0864196 2689.0864973 0.9005720 -0.4939185 45.8881925
## perc.alumni Expend Grad.Rate
## 49.8460815 0.2062247 27.1159646
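To see why this model was selected, one can plot BIC against model size and mark the minimum (an added diagnostic, not part of the original output):
plot(Forward.summary$bic, type = "b", xlab = "Number of variables", ylab = "BIC")
points(which.min(Forward.summary$bic), min(Forward.summary$bic), col = "red", pch = 19)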
Gam.fit <- gam(Outstate ~ Private + s(Top10perc, 5) + s(Room.Board, 5) + s(PhD, 5) +
                 s(perc.alumni, 5) + s(Expend, 5) + s(Grad.Rate, 5),
               data = df10, subset = train)
par(mfrow = c(2, 3))
plot(Gam.fit, se = TRUE, col = "blue")
From the plots we can see that Room.Board and perc.alumni look roughly linear in Outstate, while Expend and PhD are clearly nonlinear; the nested-model comparison sketched below checks this impression.
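One way to test that impression (an added step; it assumes the gam package's anova method for comparing nested fits): refit with plain linear terms for the apparently linear predictors and compare against the full smooth model.
Gam.lin <- gam(Outstate ~ Private + s(Top10perc, 5) + Room.Board + s(PhD, 5) +
                 perc.alumni + s(Expend, 5) + s(Grad.Rate, 5),
               data = df10, subset = train)
anova(Gam.lin, Gam.fit, test = "F")  # A small p-value would favor keeping the smooths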
Gam.pred <- predict(Gam.fit, df10[test, ])
Gam.RSS <- sum((df10[test, ]$Outstate - Gam.pred)^2)                      # Residual sum of squares
df10.TSS <- sum((df10[test, ]$Outstate - mean(df10[test, ]$Outstate))^2)  # Total sum of squares
1 - (Gam.RSS / df10.TSS)                                                  # Test-set R-squared
## [1] 0.7662809
mean((df10[test, ]$Outstate - Gam.pred) ^ 2)
## [1] 3412040
The test-set R squared is about 0.77 and the test MSE is 3,412,040; explaining roughly 77% of the variation in out-of-state tuition on held-out data is a reasonably good fit.
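For a more interpretable error measure (an addition to the original write-up), the RMSE puts the error back on the dollar scale of Outstate; from the MSE above it is about $1,847.
sqrt(mean((df10[test, ]$Outstate - Gam.pred)^2))  # Test RMSE, in dollars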
Forward.Best <- lm(Outstate ~ Private + Top10perc + Room.Board + PhD + perc.alumni +
                     Expend + Grad.Rate, data = df10, subset = train)
Forward.pred <- predict(Forward.Best, df10[test, ])
Forward.RSS <- sum((df10[test, ]$Outstate - Forward.pred)^2)
1 - (Forward.RSS / df10.TSS)
## [1] 0.7471749
mean((df10[test, ]$Outstate - Forward.pred) ^ 2)
## [1] 3690967
Compared with the linear model fit to the same predictors, the GAM has a higher test R squared (0.766 vs. 0.747) and a lower test MSE (3,412,040 vs. 3,690,967); the short summary below collects the two sets of metrics.
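A compact side-by-side view of the test-set metrics (an added convenience, reusing the quantities computed above):
data.frame(model = c("GAM", "Linear"),
           test.R2 = c(1 - Gam.RSS / df10.TSS, 1 - Forward.RSS / df10.TSS),
           test.MSE = c(mean((df10[test, ]$Outstate - Gam.pred)^2),
                        mean((df10[test, ]$Outstate - Forward.pred)^2)))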
summary(Gam.fit)
##
## Call: gam(formula = Outstate ~ Private + s(Top10perc, 5) + s(Room.Board,
## 5) + s(PhD, 5) + s(perc.alumni, 5) + s(Expend, 5) + s(Grad.Rate,
## 5), data = df10, subset = train)
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -7874.98 -1096.48 94.84 1219.56 7327.69
##
## (Dispersion Parameter for gaussian family taken to be 3453863)
##
## Null Deviance: 10279994217 on 620 degrees of freedom
## Residual Deviance: 2034322618 on 588.9992 degrees of freedom
## AIC: 11144.63
##
## Number of Local Scoring Iterations: NA
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## Private 1 2732425223 2732425223 791.121 < 2.2e-16 ***
## s(Top10perc, 5) 1 1960119152 1960119152 567.515 < 2.2e-16 ***
## s(Room.Board, 5) 1 927816409 927816409 268.632 < 2.2e-16 ***
## s(PhD, 5) 1 178077742 178077742 51.559 2.104e-12 ***
## s(perc.alumni, 5) 1 214907305 214907305 62.222 1.498e-14 ***
## s(Expend, 5) 1 622493904 622493904 180.231 < 2.2e-16 ***
## s(Grad.Rate, 5) 1 89693567 89693567 25.969 4.680e-07 ***
## Residuals 589 2034322618 3453863
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Anova for Nonparametric Effects
## Npar Df Npar F Pr(F)
## (Intercept)
## Private
## s(Top10perc, 5) 4 1.2943 0.27093
## s(Room.Board, 5) 4 2.7493 0.02754 *
## s(PhD, 5) 4 2.4862 0.04252 *
## s(perc.alumni, 5) 4 1.0368 0.38742
## s(Expend, 5) 4 26.9943 < 2e-16 ***
## s(Grad.Rate, 5) 4 1.9501 0.10068
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The ANOVA for nonparametric effects shows that Expend has a highly significant nonlinear effect, and Room.Board and PhD are significant at the 5% level; there is no evidence of a nonlinear effect for Top10perc, perc.alumni, or Grad.Rate.