library(ISLR2)
library(boot)
set.seed(1)
cv.error <- rep(0,5)
for (i in 1:5) {
fit <- glm(wage ~ poly(age, i), data = Wage)
cv.error[i] <- cv.glm(Wage, fit, K = 10)$delta[1]
}
cv.error
## [1] 1676.826 1600.763 1598.399 1595.651 1594.977
glm.fit1 <- lm(wage ~ poly(age,1), data = Wage)
glm.fit2 <- lm(wage ~ poly(age,2), data = Wage)
glm.fit3 <- lm(wage ~ poly(age,3), data = Wage)
glm.fit4 <- lm(wage ~ poly(age,4), data = Wage)
glm.fit5 <- lm(wage ~ poly(age,5), data = Wage)
anova(glm.fit1, glm.fit2, glm.fit3, glm.fit4, glm.fit5)
## Analysis of Variance Table
##
## Model 1: wage ~ poly(age, 1)
## Model 2: wage ~ poly(age, 2)
## Model 3: wage ~ poly(age, 3)
## Model 4: wage ~ poly(age, 4)
## Model 5: wage ~ poly(age, 5)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 2998 5022216
## 2 2997 4793430 1 228786 143.5931 < 2.2e-16 ***
## 3 2996 4777674 1 15756 9.8888 0.001679 **
## 4 2995 4771604 1 6070 3.8098 0.051046 .
## 5 2994 4770322 1 1283 0.8050 0.369682
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
attach(Wage)
agelims <- range(age)
age.grid <- seq(from = agelims[1], to = agelims[2])
preds <- predict(glm.fit3, newdata = list(age=age.grid), se=TRUE)
se.bands <- cbind(preds$fit + 2*preds$se.fit, preds$fit - 2*preds$se.fit)
plot(age, wage, xlim=agelims, cex=.5, col = "darkgrey")
title("Degree-3 Polynomial", outer=T)
lines(age.grid, preds$fit, lwd=2, col="blue")
matlines(age.grid, se.bands, lwd=1, col="blue", lty=3)
cv.error <- rep(0,9)
for (i in 2:10) {
Wage$age.cut <- cut(Wage$age, i)
glm.fit <- glm(wage ~ age.cut, data = Wage)
cv.error[i-1] <- cv.glm(Wage, glm.fit, K = 10)$delta[1]
}
cv.error
## [1] 1734.999 1682.289 1636.768 1632.149 1624.180 1611.908 1601.480 1611.079
## [9] 1604.125
step.fit <- glm(wage~cut(age, 8), data=Wage)
agelims <- range(Wage$age)
age.grid <- seq(from=agelims[1], to=agelims[2])
step.pred <- predict(step.fit, data.frame(age=age.grid))
plot(wage~age, data=Wage, col="darkgrey")
lines(age.grid, step.pred, col="blue", lwd=2)
set.seed(1)
library(leaps)
train <- sample(1:nrow(College), nrow(College)/2)
test <- (-train)
y.test <- College$Outstate[test]
reg.forward <- regsubsets(Outstate ~ ., data = College, nvmax = 19, method = "forward", subset = train)
summary(reg.forward)
## Subset selection object
## Call: regsubsets.formula(Outstate ~ ., data = College, nvmax = 19,
## method = "forward", subset = train)
## 17 Variables (and intercept)
## Forced in Forced out
## PrivateYes FALSE FALSE
## Apps FALSE FALSE
## Accept FALSE FALSE
## Enroll FALSE FALSE
## Top10perc FALSE FALSE
## Top25perc FALSE FALSE
## F.Undergrad FALSE FALSE
## P.Undergrad FALSE FALSE
## Room.Board FALSE FALSE
## Books FALSE FALSE
## Personal FALSE FALSE
## PhD FALSE FALSE
## Terminal FALSE FALSE
## S.F.Ratio FALSE FALSE
## perc.alumni FALSE FALSE
## Expend FALSE FALSE
## Grad.Rate FALSE FALSE
## 1 subsets of each size up to 17
## Selection Algorithm: forward
## PrivateYes Apps Accept Enroll Top10perc Top25perc F.Undergrad
## 1 ( 1 ) " " " " " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " " " " "
## 3 ( 1 ) " " " " " " " " " " " " " "
## 4 ( 1 ) "*" " " " " " " " " " " " "
## 5 ( 1 ) "*" " " " " " " " " " " " "
## 6 ( 1 ) "*" " " " " " " " " " " " "
## 7 ( 1 ) "*" " " " " " " " " " " " "
## 8 ( 1 ) "*" " " " " " " "*" " " " "
## 9 ( 1 ) "*" " " " " " " "*" " " " "
## 10 ( 1 ) "*" " " " " " " "*" " " " "
## 11 ( 1 ) "*" " " "*" " " "*" " " " "
## 12 ( 1 ) "*" "*" "*" " " "*" " " " "
## 13 ( 1 ) "*" "*" "*" "*" "*" " " " "
## 14 ( 1 ) "*" "*" "*" "*" "*" "*" " "
## 15 ( 1 ) "*" "*" "*" "*" "*" "*" " "
## 16 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
## 17 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
## P.Undergrad Room.Board Books Personal PhD Terminal S.F.Ratio
## 1 ( 1 ) " " "*" " " " " " " " " " "
## 2 ( 1 ) " " "*" " " " " " " " " " "
## 3 ( 1 ) " " "*" " " " " " " " " " "
## 4 ( 1 ) " " "*" " " " " " " " " " "
## 5 ( 1 ) " " "*" " " " " " " " " " "
## 6 ( 1 ) " " "*" " " " " " " "*" " "
## 7 ( 1 ) " " "*" " " "*" " " "*" " "
## 8 ( 1 ) " " "*" " " "*" " " "*" " "
## 9 ( 1 ) " " "*" " " "*" " " "*" "*"
## 10 ( 1 ) " " "*" "*" "*" " " "*" "*"
## 11 ( 1 ) " " "*" "*" "*" " " "*" "*"
## 12 ( 1 ) " " "*" "*" "*" " " "*" "*"
## 13 ( 1 ) " " "*" "*" "*" " " "*" "*"
## 14 ( 1 ) " " "*" "*" "*" " " "*" "*"
## 15 ( 1 ) "*" "*" "*" "*" " " "*" "*"
## 16 ( 1 ) "*" "*" "*" "*" " " "*" "*"
## 17 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
## perc.alumni Expend Grad.Rate
## 1 ( 1 ) " " " " " "
## 2 ( 1 ) "*" " " " "
## 3 ( 1 ) "*" "*" " "
## 4 ( 1 ) "*" "*" " "
## 5 ( 1 ) "*" "*" "*"
## 6 ( 1 ) "*" "*" "*"
## 7 ( 1 ) "*" "*" "*"
## 8 ( 1 ) "*" "*" "*"
## 9 ( 1 ) "*" "*" "*"
## 10 ( 1 ) "*" "*" "*"
## 11 ( 1 ) "*" "*" "*"
## 12 ( 1 ) "*" "*" "*"
## 13 ( 1 ) "*" "*" "*"
## 14 ( 1 ) "*" "*" "*"
## 15 ( 1 ) "*" "*" "*"
## 16 ( 1 ) "*" "*" "*"
## 17 ( 1 ) "*" "*" "*"
par(mfrow = c(1,3))
plot(summary(reg.forward)$cp)
x = which.min(summary(reg.forward)$cp)
y = summary(reg.forward)$cp[x]
points(x,y, col='red', cex = 2, pch = 20)
x
## [1] 14
coef(reg.forward, 9)
## (Intercept) PrivateYes Top10perc Room.Board Personal
## -1953.5313547 2527.8835558 18.5938114 1.0726124 -0.4021046
## Terminal S.F.Ratio perc.alumni Expend Grad.Rate
## 33.5768041 -75.8939438 49.5421868 0.1474168 26.5570861
library(gam)
attach(College)
gam.fit <- glm(Outstate ~ Private + Top10perc + Room.Board + Personal + Terminal + S.F.Ratio + perc.alumni + Expend + Grad.Rate, data = College, subset = train)
summary(gam.fit)
##
## Call:
## glm(formula = Outstate ~ Private + Top10perc + Room.Board + Personal +
## Terminal + S.F.Ratio + perc.alumni + Expend + Grad.Rate,
## data = College, subset = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -6803.9 -1429.0 -90.1 1320.4 9741.8
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.954e+03 1.144e+03 -1.707 0.088633 .
## PrivateYes 2.528e+03 3.145e+02 8.037 1.19e-14 ***
## Top10perc 1.859e+01 9.260e+00 2.008 0.045353 *
## Room.Board 1.073e+00 1.221e-01 8.788 < 2e-16 ***
## Personal -4.021e-01 1.621e-01 -2.481 0.013539 *
## Terminal 3.358e+01 9.372e+00 3.583 0.000384 ***
## S.F.Ratio -7.589e+01 3.893e+01 -1.949 0.051982 .
## perc.alumni 4.954e+01 1.121e+01 4.418 1.30e-05 ***
## Expend 1.474e-01 2.957e-02 4.986 9.40e-07 ***
## Grad.Rate 2.656e+01 8.102e+00 3.278 0.001143 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 4376026)
##
## Null deviance: 6989966760 on 387 degrees of freedom
## Residual deviance: 1654137740 on 378 degrees of freedom
## AIC: 7046.1
##
## Number of Fisher Scoring iterations: 2
summary(gam.fit)
##
## Call:
## glm(formula = Outstate ~ Private + Top10perc + Room.Board + Personal +
## Terminal + S.F.Ratio + perc.alumni + Expend + Grad.Rate,
## data = College, subset = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -6803.9 -1429.0 -90.1 1320.4 9741.8
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.954e+03 1.144e+03 -1.707 0.088633 .
## PrivateYes 2.528e+03 3.145e+02 8.037 1.19e-14 ***
## Top10perc 1.859e+01 9.260e+00 2.008 0.045353 *
## Room.Board 1.073e+00 1.221e-01 8.788 < 2e-16 ***
## Personal -4.021e-01 1.621e-01 -2.481 0.013539 *
## Terminal 3.358e+01 9.372e+00 3.583 0.000384 ***
## S.F.Ratio -7.589e+01 3.893e+01 -1.949 0.051982 .
## perc.alumni 4.954e+01 1.121e+01 4.418 1.30e-05 ***
## Expend 1.474e-01 2.957e-02 4.986 9.40e-07 ***
## Grad.Rate 2.656e+01 8.102e+00 3.278 0.001143 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 4376026)
##
## Null deviance: 6989966760 on 387 degrees of freedom
## Residual deviance: 1654137740 on 378 degrees of freedom
## AIC: 7046.1
##
## Number of Fisher Scoring iterations: 2
par(mfrow = c(1,3))
plot(gam.fit, se = TRUE, col = "blue")
gam.pred <- predict(gam.fit, College)[test]
mean((y.test - gam.pred)^2)
## [1] 3838132
plot(Outstate ~ Enroll)
plot(Outstate ~ Top10perc)
plot(Outstate ~ Grad.Rate)