In this exercise, you will further analyze the Wage data set considered throughout this chapter.
Load the ISLR and boot packages and the Wage data set.
require(ISLR)
## Loading required package: ISLR
require(boot)
## Loading required package: boot
data(Wage)
set.seed(1)
Using 10-fold cross-validation to select the optimal degree d for the polynomial regression of wage on age.
error_cross <- rep(0, 10)
for (i in 1:10) {
  fit.glm <- glm(wage ~ poly(age, i), data = Wage)            # degree-i polynomial fit
  error_cross[i] <- cv.glm(Wage, fit.glm, K = 10)$delta[1]    # 10-fold CV error
}
Plotting the cross-validation errors.
error_cross
## [1] 1675.837 1601.012 1598.801 1594.217 1594.625 1594.888 1595.500
## [8] 1595.436 1596.335 1595.835
plot(error_cross,type = "b")
From the plot, d = 4 looks like a good choice: the CV error drops sharply through degree 3 and is essentially flat afterward.
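A quick numeric check of that visual choice, marking the minimum on the plot just drawn:
(d.min <- which.min(error_cross))    # degree with the smallest CV error; 4 here
points(d.min, error_cross[d.min], col = "red", pch = 19)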
Using ANOVA to compare the nested polynomial fits. I test degrees 1 through 10 so the comparison covers the same range as the cross-validation above.
fit_1 <- lm(wage~age, data=Wage)
fit_2 <- lm(wage~poly(age,2), data=Wage)
fit_3 <- lm(wage~poly(age,3), data=Wage)
fit_4 <- lm(wage~poly(age,4), data=Wage)
fit_5 <- lm(wage~poly(age,5), data=Wage)
fit_6 <- lm(wage~poly(age,6), data=Wage)
fit_7 <- lm(wage~poly(age,7), data=Wage)
fit_8 <- lm(wage~poly(age,8), data=Wage)
fit_9 <- lm(wage~poly(age,9), data=Wage)
fit_10 <- lm(wage~poly(age,10), data=Wage)
anova(fit_1,fit_2,fit_3,fit_4,fit_5,fit_6,fit_7,fit_8,fit_9,fit_10)
## Analysis of Variance Table
##
## Model 1: wage ~ age
## Model 2: wage ~ poly(age, 2)
## Model 3: wage ~ poly(age, 3)
## Model 4: wage ~ poly(age, 4)
## Model 5: wage ~ poly(age, 5)
## Model 6: wage ~ poly(age, 6)
## Model 7: wage ~ poly(age, 7)
## Model 8: wage ~ poly(age, 8)
## Model 9: wage ~ poly(age, 9)
## Model 10: wage ~ poly(age, 10)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 2998 5022216
## 2 2997 4793430 1 228786 143.7638 < 2.2e-16 ***
## 3 2996 4777674 1 15756 9.9005 0.001669 **
## 4 2995 4771604 1 6070 3.8143 0.050909 .
## 5 2994 4770322 1 1283 0.8059 0.369398
## 6 2993 4766389 1 3932 2.4709 0.116074
## 7 2992 4763834 1 2555 1.6057 0.205199
## 8 2991 4763707 1 127 0.0796 0.777865
## 9 2990 4756703 1 7004 4.4014 0.035994 *
## 10 2989 4756701 1 3 0.0017 0.967529
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
d = 3 and d = 4 look the best; I chose d = 4, consistent with the cross-validation result.
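As an aside, the ten separate fits above can be built more compactly; a sketch (the models are equivalent, though the ANOVA table then labels each one with the generic formula rather than the expanded degree):
fits <- lapply(1:10, function(d) lm(wage ~ poly(age, d), data = Wage))
do.call(anova, fits)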
agel <- range(Wage$age)
grid <- seq(agel[1], agel[2])
pred <- predict(fit_4, newdata = list(age = grid), se = TRUE)
Making a plot of the resulting degree-4 polynomial fit with ±2 standard-error bands.
se.bands <- pred$fit + cbind(2*pred$se.fit, -2*pred$se.fit)
par(mfrow=c(1,1), mar=c(4.5,4.5,1,1), oma=c(0,0,4,0))
plot(Wage$age, Wage$wage, xlim=agel, cex=0.5, col="grey")
title("Polynomial Fit,d=4", outer=TRUE)
lines(grid, pred$fit, lwd=2, col="blue")
matlines(grid, se.bands, lwd=1, col="blue", lty=3)
Using 10-fold cross-validation to choose the number of cuts for a step function of age.
error_cv <- rep(0, 9)
for (i in 2:10) {
  Wage$age.cut <- cut(Wage$age, i)               # step function with i intervals
  glm.fit <- glm(wage ~ age.cut, data = Wage)
  error_cv[i - 1] <- cv.glm(Wage, glm.fit, K = 10)$delta[1]
}
Plotting the cross-validation errors against the number of cuts.
error_cv
## [1] 1733.880 1683.337 1635.042 1633.382 1622.835 1610.996 1605.042 1609.491
## [9] 1606.329
plot(2:10, error_cv, type="b")
From the plot, eight cuts look ideal.
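The loop index is offset by one (entry i of error_cv holds the error for i + 1 cuts), so the choice can be confirmed numerically:
which.min(error_cv) + 1    # number of cuts with the smallest CV error; 8 here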
fit_08 <- glm(wage ~ cut(age, 8), data = Wage)
pred2 <- predict(fit_08, newdata = list(age = grid), se = TRUE)
Plotting the step-function fit.
se.bands2 <- pred2$fit + cbind(2*pred2$se.fit, -2*pred2$se.fit)
plot(Wage$age, Wage$wage, xlim=agel, cex=0.5, col="darkgrey")
title("Fit_08")
lines(grid, pred2$fit, lwd=2, col="blue")
matlines(grid, se.bands2, lwd=1, col="blue", lty=3)
This question relates to the College data set.
require(leaps)
## Loading required package: leaps
data("College")
set.seed(1)
Splitting the data into training and test sets.
trainid <- sample(1:nrow(College), nrow(College) / 2)
data.train <- College[trainid, ]
data.test <- College[-trainid, ]
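A quick sanity check on the split (College has 777 rows, so the halves are 388 and 389):
dim(data.train)    # 388 rows
dim(data.test)     # 389 rows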
A predict method for regsubsets objects (the leaps package does not provide one); it is used below to compute test-set predictions.
predict.regsubsets <- function(object, newdata, id, ...) {
  form <- as.formula(object$call[[2]])    # recover the formula from the regsubsets call
  mat <- model.matrix(form, newdata)      # design matrix for the new data
  coefi <- coef(object, id = id)          # coefficients of the id-variable model
  xvars <- names(coefi)
  mat[, xvars] %*% coefi                  # predict using only the selected columns
}
Subset selection. Note that without method = "forward", regsubsets runs an exhaustive best-subset search; the "Selection Algorithm: exhaustive" line in the output below confirms this.
frwd.fit <- regsubsets(Outstate ~ ., data = data.train, nvmax = ncol(College) - 1)
(summary.frwd <- summary(frwd.fit))
## Subset selection object
## Call: regsubsets.formula(Outstate ~ ., data = data.train, nvmax = ncol(College) -
## 1)
## 17 Variables (and intercept)
## Forced in Forced out
## PrivateYes FALSE FALSE
## Apps FALSE FALSE
## Accept FALSE FALSE
## Enroll FALSE FALSE
## Top10perc FALSE FALSE
## Top25perc FALSE FALSE
## F.Undergrad FALSE FALSE
## P.Undergrad FALSE FALSE
## Room.Board FALSE FALSE
## Books FALSE FALSE
## Personal FALSE FALSE
## PhD FALSE FALSE
## Terminal FALSE FALSE
## S.F.Ratio FALSE FALSE
## perc.alumni FALSE FALSE
## Expend FALSE FALSE
## Grad.Rate FALSE FALSE
## 1 subsets of each size up to 17
## Selection Algorithm: exhaustive
## PrivateYes Apps Accept Enroll Top10perc Top25perc F.Undergrad
## 1 ( 1 ) " " " " " " " " " " " " " "
## 2 ( 1 ) "*" " " " " " " " " " " " "
## 3 ( 1 ) "*" " " " " " " " " " " " "
## 4 ( 1 ) "*" " " " " " " " " " " " "
## 5 ( 1 ) "*" " " " " " " " " " " " "
## 6 ( 1 ) "*" " " " " " " " " " " " "
## 7 ( 1 ) "*" " " " " " " " " " " " "
## 8 ( 1 ) "*" " " " " " " " " " " " "
## 9 ( 1 ) "*" "*" "*" " " " " " " "*"
## 10 ( 1 ) "*" "*" "*" " " " " " " "*"
## 11 ( 1 ) "*" "*" "*" " " " " " " "*"
## 12 ( 1 ) "*" "*" "*" " " " " " " "*"
## 13 ( 1 ) "*" "*" "*" " " "*" " " "*"
## 14 ( 1 ) "*" "*" "*" " " "*" " " "*"
## 15 ( 1 ) "*" "*" "*" " " "*" "*" "*"
## 16 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
## 17 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
## P.Undergrad Room.Board Books Personal PhD Terminal S.F.Ratio
## 1 ( 1 ) " " " " " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " " " " "
## 3 ( 1 ) " " "*" " " " " " " " " " "
## 4 ( 1 ) " " "*" " " " " " " " " " "
## 5 ( 1 ) " " "*" " " " " " " "*" " "
## 6 ( 1 ) " " "*" " " " " " " "*" " "
## 7 ( 1 ) " " "*" " " "*" " " "*" " "
## 8 ( 1 ) " " "*" " " "*" " " "*" "*"
## 9 ( 1 ) " " "*" " " " " " " "*" " "
## 10 ( 1 ) " " "*" " " " " " " "*" "*"
## 11 ( 1 ) " " "*" " " "*" " " "*" "*"
## 12 ( 1 ) "*" "*" " " "*" " " "*" "*"
## 13 ( 1 ) "*" "*" " " "*" " " "*" "*"
## 14 ( 1 ) "*" "*" " " "*" "*" "*" "*"
## 15 ( 1 ) "*" "*" " " "*" "*" "*" "*"
## 16 ( 1 ) "*" "*" " " "*" "*" "*" "*"
## 17 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
## perc.alumni Expend Grad.Rate
## 1 ( 1 ) " " "*" " "
## 2 ( 1 ) " " "*" " "
## 3 ( 1 ) " " "*" " "
## 4 ( 1 ) "*" "*" " "
## 5 ( 1 ) "*" "*" " "
## 6 ( 1 ) "*" "*" "*"
## 7 ( 1 ) "*" "*" "*"
## 8 ( 1 ) "*" "*" "*"
## 9 ( 1 ) "*" "*" "*"
## 10 ( 1 ) "*" "*" "*"
## 11 ( 1 ) "*" "*" "*"
## 12 ( 1 ) "*" "*" "*"
## 13 ( 1 ) "*" "*" "*"
## 14 ( 1 ) "*" "*" "*"
## 15 ( 1 ) "*" "*" "*"
## 16 ( 1 ) "*" "*" "*"
## 17 ( 1 ) "*" "*" "*"
err_frwd <- rep(NA, ncol(College) - 1)
for (i in 1:(ncol(College) - 1)) {
  frwd.pred <- predict(frwd.fit, data.test, id = i)          # dispatches to predict.regsubsets
  err_frwd[i] <- mean((data.test$Outstate - frwd.pred)^2)    # test MSE of the i-variable model
}
Plotting test MSE, adjusted R^2, Cp, and BIC to identify a satisfactory model that uses only a subset of the predictors.
par(mfrow=c(2,2))
plot(err_frwd, type="b", main="Test MSE", xlab="Number of Predictors")
min.mse <- which.min(err_frwd)
points(min.mse, err_frwd[min.mse], col="red", pch=9, lwd=5)
plot(summary.frwd$adjr2, type="b", main="Adjusted R^2", xlab="Number of Predictors")
max.adjr2 <- which.max(summary.frwd$adjr2)
points(max.adjr2, summary.frwd$adjr2[max.adjr2], col="red", pch=9, lwd=5)
plot(summary.frwd$cp, type="b", main="Cp", xlab="Number of Predictors")
min.cp <- which.min(summary.frwd$cp)
points(min.cp, summary.frwd$cp[min.cp], col="red", pch=9, lwd=5)
plot(summary.frwd$bic, type="b", main="BIC", xlab="Number of Predictors")
min.bic <- which.min(summary.frwd$bic)
points(min.bic, summary.frwd$bic[min.bic], col="red", pch=9, lwd=5)
Test MSE and the other criteria improve little beyond six predictors, so I examine the six-variable model.
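A quick check of how far the six-variable test MSE sits above the best achieved at any size:
(err_frwd[6] - min(err_frwd)) / min(err_frwd)    # relative gap; small if six variables suffice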
coef(frwd.fit,6)
## (Intercept) PrivateYes Room.Board Terminal perc.alumni
## -4241.4402916 2790.4303173 0.9629335 37.8412517 60.6406044
## Expend Grad.Rate
## 0.2149396 30.3831268
The six variables are Private, Room.Board, Terminal, perc.alumni, Expend, and Grad.Rate.
Fitting a GAM for Outstate on the six selected predictors, with smoothing splines for the quantitative variables.
require(gam)
## Loading required package: gam
## Loading required package: splines
## Loading required package: foreach
## Loaded gam 1.16
fit.gam <- gam(Outstate ~ Private + s(Room.Board, df = 2) + s(Terminal, df = 2) +
                 s(perc.alumni, df = 2) + s(Expend, df = 5) + s(Grad.Rate, df = 2),
               data = data.train)
par(mfrow = c(2, 3))    # one panel per term
plot(fit.gam, se = TRUE, col = "blue")
pred3 <- predict(fit.gam, data.test)
(mse.error <- mean((data.test$Outstate - pred3)^2))
## [1] 3721642
err_frwd[6]
## [1] 4357411
The GAM's test MSE (3,721,642) is substantially lower than the six-variable linear model's (4,357,411).
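In relative terms, using the two numbers just printed:
1 - mse.error / err_frwd[6]    # about 0.15: the GAM's test MSE is roughly 15% lower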
summary(fit.gam)
##
## Call: gam(formula = Outstate ~ Private + s(Room.Board, df = 2) + s(Terminal,
## df = 2) + s(perc.alumni, df = 2) + s(Expend, df = 5) + s(Grad.Rate,
## df = 2), data = data.train)
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -5010.54 -1143.18 46.42 1230.10 7702.80
##
## (Dispersion Parameter for gaussian family taken to be 3310745)
##
## Null Deviance: 6221998532 on 387 degrees of freedom
## Residual Deviance: 1234907641 on 372.9999 degrees of freedom
## AIC: 6942.72
##
## Number of Local Scoring Iterations: 2
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## Private 1 1795888516 1795888516 542.442 < 2.2e-16 ***
## s(Room.Board, df = 2) 1 1243271116 1243271116 375.526 < 2.2e-16 ***
## s(Terminal, df = 2) 1 388884475 388884475 117.461 < 2.2e-16 ***
## s(perc.alumni, df = 2) 1 308209205 308209205 93.094 < 2.2e-16 ***
## s(Expend, df = 5) 1 456342500 456342500 137.837 < 2.2e-16 ***
## s(Grad.Rate, df = 2) 1 63658700 63658700 19.228 1.511e-05 ***
## Residuals 373 1234907641 3310745
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Anova for Nonparametric Effects
## Npar Df Npar F Pr(F)
## (Intercept)
## Private
## s(Room.Board, df = 2) 1 3.6382 0.05723 .
## s(Terminal, df = 2) 1 1.4197 0.23421
## s(perc.alumni, df = 2) 1 1.4844 0.22386
## s(Expend, df = 5) 4 16.9959 8.171e-13 ***
## s(Grad.Rate, df = 2) 1 4.1068 0.04343 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
There is strong evidence of a non-linear relationship between Outstate and ‘Expend’, weaker evidence for ‘Room.Board’ and ‘Grad.Rate’, and none for ‘perc.alumni’ and ‘Terminal’.
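One way to probe the Expend result further (a sketch, not part of the original analysis): refit the GAM with Expend entered linearly and compare the two models with an F-test.
# Same model, except Expend enters linearly rather than as a smoothing spline
fit.gam.lin <- gam(Outstate ~ Private + s(Room.Board, df = 2) + s(Terminal, df = 2) +
                     s(perc.alumni, df = 2) + Expend + s(Grad.Rate, df = 2),
                   data = data.train)
anova(fit.gam.lin, fit.gam, test = "F")    # a small p-value would support the non-linear term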