data("Wage")set.seed(1)
cv.delta2 = rep(NA, 10) # delta[2] is the bias-corrected K-fold CV error estimate
for (i in 1:10) {
  wage.poly = glm(wage~poly(age, i), data=Wage)
  cv.delta2[i] = cv.glm(Wage, wage.poly, K=10)$delta[2]
}
plot(1:10, cv.delta2, xlab="Degree", ylab="CV error", type="l", pch=20, lwd=2)
min.error.deg = which.min(cv.delta2)
points(min.error.deg, cv.delta2[min.error.deg], col = 'blue', cex = 2, pch = 18)

wage.poly.1 = lm(wage~poly(age, 1), data=Wage)
wage.poly.2 = lm(wage~poly(age, 2), data=Wage)
wage.poly.3 = lm(wage~poly(age, 3), data=Wage)
wage.poly.4 = lm(wage~poly(age, 4), data=Wage)
wage.poly.5 = lm(wage~poly(age, 5), data=Wage)
wage.poly.6 = lm(wage~poly(age, 6), data=Wage)
wage.poly.7 = lm(wage~poly(age, 7), data=Wage)
wage.poly.8 = lm(wage~poly(age, 8), data=Wage)
wage.poly.9 = lm(wage~poly(age, 9), data=Wage)
wage.poly.10 = lm(wage~poly(age, 10), data=Wage)
anova(wage.poly.1, wage.poly.2, wage.poly.3, wage.poly.4, wage.poly.5, wage.poly.6, wage.poly.7, wage.poly.8, wage.poly.9, wage.poly.10)

wage.poly = lm(wage ~ poly(age, 3), data = Wage)
agelims <- range(Wage$age)
age.grid <- seq(from = agelims[1], to = agelims[2])
preds <- predict(wage.poly, newdata = list(age = age.grid), se = TRUE)
se.bands <- cbind(preds$fit + 2 * preds$se.fit, preds$fit - 2 * preds$se.fit)

par(mfrow = c(1, 1), mar = c(4.5, 4.5, 1, 1), oma = c(0, 0, 4, 0))
plot(Wage$age, Wage$wage, xlim = agelims, cex = .5, col = "darkgrey")
title("Degree-3 Polynomial", outer = T)
lines(age.grid, preds$fit, lwd = 2, col = "blue")
matlines(age.grid, se.bands, lwd = 1, col = "blue", lty = 3)

cv.delta2.step = rep(NA, 10)
for (i in 2:10) {
  Wage$age_cut = cut(Wage$age, i) # bin age into i equal-width intervals
  Wage.step = glm(wage~age_cut, data=Wage)
  cv.delta2.step[i] = cv.glm(Wage, Wage.step, K=10)$delta[2]
}
plot(2:10, cv.delta2.step[-1], xlab="Number of cuts", ylab="CV error", type="l", pch=20, lwd=2)
min.err.deg = which.min(cv.delta2.step)
points(min.err.deg, cv.delta2.step[min.err.deg], col = 'blue', cex = 2, pch = 18)

Wage.steps = glm(wage~cut(age, 8), data=Wage)
agelims = range(Wage$age)
age.grid = seq(from=agelims[1], to=agelims[2])
step.pred = predict(Wage.steps, data.frame(age=age.grid))
plot(wage~age, data=Wage, col="darkgrey")
lines(age.grid, step.pred, lwd=2)
str(College)

## 'data.frame': 777 obs. of 18 variables:
## $ Private : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 2 ...
## $ Apps : num 1660 2186 1428 417 193 ...
## $ Accept : num 1232 1924 1097 349 146 ...
## $ Enroll : num 721 512 336 137 55 158 103 489 227 172 ...
## $ Top10perc : num 23 16 22 60 16 38 17 37 30 21 ...
## $ Top25perc : num 52 29 50 89 44 62 45 68 63 44 ...
## $ F.Undergrad: num 2885 2683 1036 510 249 ...
## $ P.Undergrad: num 537 1227 99 63 869 ...
## $ Outstate : num 7440 12280 11250 12960 7560 ...
## $ Room.Board : num 3300 6450 3750 5450 4120 ...
## $ Books : num 450 750 400 450 800 500 500 450 300 660 ...
## $ Personal : num 2200 1500 1165 875 1500 ...
## $ PhD : num 70 29 53 92 76 67 90 89 79 40 ...
## $ Terminal : num 78 30 66 97 72 73 93 100 84 41 ...
## $ S.F.Ratio : num 18.1 12.2 12.9 7.7 11.9 9.4 11.5 13.7 11.3 11.5 ...
## $ perc.alumni: num 12 16 30 37 2 11 26 37 23 15 ...
## $ Expend : num 7041 10527 8735 19016 10922 ...
## $ Grad.Rate : num 60 56 54 59 15 55 63 73 80 52 ...
set.seed(1)
train = sample(1:nrow(College), size = floor(0.8 * nrow(College)))
college.train = College[train,]
college.test = College[-train,]

library(leaps) # regsubsets() for forward stepwise selection
college.step = regsubsets(Outstate ~ ., data = college.train, nvmax = 17, method = "forward")
step.summary = summary(college.step)

par(mfrow = c(1, 3))
## Plot CP values
plot(step.summary$cp, xlab = "Number of Variables", ylab = "Cp", type = "l") # CP value
grid(nx = NULL, ny = NULL, lty = 3, col = "gray", lwd = 2)
min.cp = min(step.summary$cp)
std.cp = sd(step.summary$cp)
# Bands at 0.2 sd around the optimum: models inside the band are treated as
# comparable, so the smallest such model is preferred (same idea for BIC and
# adjusted R2 below).
abline(h = min.cp + 0.2 * std.cp, col = "red", lty = 2)
abline(h = min.cp - 0.2 * std.cp, col = "red", lty = 2)
## Plot BIC values
plot(step.summary$bic, xlab = "Number of Variables", ylab = "BIC", type = "l")
min.bic = min(step.summary$bic)
std.bic = sd(step.summary$bic)
abline(h = min.bic + 0.2 * std.bic, col = "red", lty = 2)
abline(h = min.bic - 0.2 * std.bic, col = "red", lty = 2)
## Plot Adjusted R2 values
plot(step.summary$adjr2, xlab = "Number of Variables", ylab = "Adjusted R2", type = "l", ylim = c(0.4, 0.8))
max.adjr2 = max(step.summary$adjr2)
std.adjr2 = sd(step.summary$adjr2)
abline(h = max.adjr2 + 0.2 * std.adjr2, col = "red", lty = 2)
abline(h = max.adjr2 - 0.2 * std.adjr2, col = "red", lty = 2)

model.step = regsubsets(Outstate ~ ., data = College, method = "forward")
names(coef(model.step, id = 6))

## [1] "(Intercept)" "PrivateYes" "Room.Board" "PhD" "perc.alumni"
## [6] "Expend" "Grad.Rate"
library(gam) # gam() and s() for smoothing-spline terms
college.gam = gam(Outstate ~ Private + s(Room.Board) + s(PhD) +
  s(perc.alumni) + s(Expend) + s(Grad.Rate), data = college.train)

par(mfrow = c(2, 3))
plot(college.gam, se = T, col = "blue")

college.pred = predict(college.gam, college.test)
gam.error = mean((college.test$Outstate - college.pred)^2)
gam.totss = mean((college.test$Outstate - mean(college.test$Outstate))^2)
test.r2.gam = 1 - gam.error/gam.totss # this is a test R-squared, not an RSS
test.r2.gam

## [1] 0.7895001
model.step = lm(Outstate ~ Private + Room.Board + PhD + perc.alumni + Expend + Grad.Rate, college.train)
college.pred1 = predict(model.step, college.test)
lm.error = mean((college.test$Outstate - college.pred1)^2)
lm.tss = mean((college.test$Outstate - mean(college.test$Outstate))^2)
test.r2.lm = 1 - lm.error/lm.tss
test.r2.lm

## [1] 0.7511881
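Since the same R2 computation appears twice, a small helper could replace it; a minimal sketch (test_r2 is a name introduced here, not part of the original analysis):

test_r2 = function(model, newdata, y) {
  # test R-squared: 1 - test MSE / variance of y around its mean
  pred = predict(model, newdata)
  1 - mean((y - pred)^2) / mean((y - mean(y))^2)
}
test_r2(college.gam, college.test, college.test$Outstate) # GAM
test_r2(model.step, college.test, college.test$Outstate)  # linear model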
- The test R2 of the GAM (0.79) is higher than that of the linear model (0.75), so the GAM explains more of the out-of-sample variation in Outstate.

To check whether there is evidence of a non-linear relationship between the response and any of the predictors, I examine the summary of the GAM model: the "Anova for Nonparametric Effects" table tests each smooth term against a linear alternative (see also the sketch after the output below).
summary(college.gam)

##
## Call: gam(formula = Outstate ~ Private + s(Room.Board) + s(PhD) + s(perc.alumni) +
## s(Expend) + s(Grad.Rate), data = college.train)
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -7583.94 -1119.15 67.43 1315.36 7811.09
##
## (Dispersion Parameter for gaussian family taken to be 3600382)
##
## Null Deviance: 10354544612 on 620 degrees of freedom
## Residual Deviance: 2156628525 on 599 degrees of freedom
## AIC: 11160.88
##
## Number of Local Scoring Iterations: NA
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## Private 1 2655615113 2655615113 737.593 < 2.2e-16 ***
## s(Room.Board) 1 2017158217 2017158217 560.262 < 2.2e-16 ***
## s(PhD) 1 686955251 686955251 190.801 < 2.2e-16 ***
## s(perc.alumni) 1 486132617 486132617 135.023 < 2.2e-16 ***
## s(Expend) 1 816091757 816091757 226.668 < 2.2e-16 ***
## s(Grad.Rate) 1 108100341 108100341 30.025 6.29e-08 ***
## Residuals 599 2156628525 3600382
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Anova for Nonparametric Effects
## Npar Df Npar F Pr(F)
## (Intercept)
## Private
## s(Room.Board) 3 2.605 0.05101 .
## s(PhD) 3 1.635 0.17999
## s(perc.alumni) 3 0.948 0.41718
## s(Expend) 3 34.728 < 2e-16 ***
## s(Grad.Rate) 3 2.149 0.09298 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The non-parametric Anova shows strong evidence of a non-linear relationship between Outstate and Expend (p < 2e-16), marginal evidence for Room.Board and Grad.Rate, and no evidence for PhD or perc.alumni, for which linear fits appear adequate.
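As a cross-check (a sketch, not part of the original analysis; gam.expend.lin is a name introduced here), a nested GAM that enters Expend linearly can be compared to the full model with an F-test, where a significant result confirms the non-linear Expend effect:

# Nested-model check (sketch): Expend linear vs. smooth, other terms unchanged
gam.expend.lin = gam(Outstate ~ Private + s(Room.Board) + s(PhD) +
  s(perc.alumni) + Expend + s(Grad.Rate), data = college.train)
anova(gam.expend.lin, college.gam, test = "F")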