attach(Wage)
set.seed(42)
<- rep(NA, 5)
cv.errors for (i in 1:5){
<- glm(wage ~ poly(age, i))
fit <- cv.glm(Wage, fit)$delta[1]
cv.errors[i] }
which.min(cv.errors)
## [1] 4
plot(1:5, cv.errors, xlab = "Degree", ylab = "MSE",type = "l")
points(which.min(cv.errors), cv.errors[which.min(cv.errors)], col = "red", pch = 20, cex = 2)
par(mfrow = c(1, 1), mar = c(4.5, 4.5, 1, 1), oma = c(0, 0, 2, 0))
<- lm(wage ~ poly(age, 4))
fit
<- range(age)
agelims <- seq(from = agelims[1], to = agelims[2])
age.grid
<- predict(fit, newdata = list(age = age.grid), se = TRUE)
preds <- cbind(preds$fit + 2 * preds$se.fit, preds$fit - 2 * preds$se.fit)
se.bands
plot(age, wage, xlim = agelims, cex = 0.5, col = "darkgrey")
title("Degree 4 Polynomial", outer = T)
lines(age.grid, preds$fit, lwd = 2, col = "red")
matlines(age.grid, se.bands, lwd = 1, col = "pink", lty = 3)
<- rep(NA, 10)
cv.errors
for(i in 2:10){
$age.cut <- cut(Wage$age, i)
Wage<- glm(wage ~ age.cut, data = Wage)
fit <- cv.glm(Wage, fit, K = 10)$delta[1]
cv.errors[i] }
plot(2:10, cv.errors[-1], type = "l", xlab = "# of Cuts", ylab = "Test MSE")
points(which.min(cv.errors), cv.errors[which.min(cv.errors)], col = "red", pch = 20, cex = 2)
plot(wage ~ age, col = "darkgrey")
<- glm(wage ~ cut(age, 8))
fit <- predict(fit, list(age = age.grid))
preds lines(age.grid, preds, col = "red", lwd = 2)
detach(Wage)
set.seed(42)
<- sample(1:nrow(College), nrow(College)/5)
test.index <- College[test.index,]
test <- College[-test.index,]
train
<- regsubsets(Outstate ~ ., data = train, nvmax = 17, method = "forward")
fit <- summary(fit)
fit.summary fit.summary
## Subset selection object
## Call: regsubsets.formula(Outstate ~ ., data = train, nvmax = 17, method = "forward")
## 17 Variables (and intercept)
## Forced in Forced out
## PrivateYes FALSE FALSE
## Apps FALSE FALSE
## Accept FALSE FALSE
## Enroll FALSE FALSE
## Top10perc FALSE FALSE
## Top25perc FALSE FALSE
## F.Undergrad FALSE FALSE
## P.Undergrad FALSE FALSE
## Room.Board FALSE FALSE
## Books FALSE FALSE
## Personal FALSE FALSE
## PhD FALSE FALSE
## Terminal FALSE FALSE
## S.F.Ratio FALSE FALSE
## perc.alumni FALSE FALSE
## Expend FALSE FALSE
## Grad.Rate FALSE FALSE
## 1 subsets of each size up to 17
## Selection Algorithm: forward
## PrivateYes Apps Accept Enroll Top10perc Top25perc F.Undergrad
## 1 ( 1 ) " " " " " " " " " " " " " "
## 2 ( 1 ) "*" " " " " " " " " " " " "
## 3 ( 1 ) "*" " " " " " " " " " " " "
## 4 ( 1 ) "*" " " " " " " " " " " " "
## 5 ( 1 ) "*" " " " " " " " " " " " "
## 6 ( 1 ) "*" " " " " " " " " " " " "
## 7 ( 1 ) "*" " " " " " " " " " " " "
## 8 ( 1 ) "*" " " " " " " " " " " " "
## 9 ( 1 ) "*" " " " " " " " " " " " "
## 10 ( 1 ) "*" " " "*" " " " " " " " "
## 11 ( 1 ) "*" " " "*" " " " " " " "*"
## 12 ( 1 ) "*" "*" "*" " " " " " " "*"
## 13 ( 1 ) "*" "*" "*" " " "*" " " "*"
## 14 ( 1 ) "*" "*" "*" "*" "*" " " "*"
## 15 ( 1 ) "*" "*" "*" "*" "*" " " "*"
## 16 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
## 17 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
## P.Undergrad Room.Board Books Personal PhD Terminal S.F.Ratio
## 1 ( 1 ) " " " " " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " " " " "
## 3 ( 1 ) " " "*" " " " " " " " " " "
## 4 ( 1 ) " " "*" " " " " " " " " " "
## 5 ( 1 ) " " "*" " " " " "*" " " " "
## 6 ( 1 ) " " "*" " " " " "*" " " " "
## 7 ( 1 ) " " "*" " " "*" "*" " " " "
## 8 ( 1 ) " " "*" " " "*" "*" "*" " "
## 9 ( 1 ) " " "*" " " "*" "*" "*" "*"
## 10 ( 1 ) " " "*" " " "*" "*" "*" "*"
## 11 ( 1 ) " " "*" " " "*" "*" "*" "*"
## 12 ( 1 ) " " "*" " " "*" "*" "*" "*"
## 13 ( 1 ) " " "*" " " "*" "*" "*" "*"
## 14 ( 1 ) " " "*" " " "*" "*" "*" "*"
## 15 ( 1 ) " " "*" "*" "*" "*" "*" "*"
## 16 ( 1 ) " " "*" "*" "*" "*" "*" "*"
## 17 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
## perc.alumni Expend Grad.Rate
## 1 ( 1 ) " " "*" " "
## 2 ( 1 ) " " "*" " "
## 3 ( 1 ) " " "*" " "
## 4 ( 1 ) "*" "*" " "
## 5 ( 1 ) "*" "*" " "
## 6 ( 1 ) "*" "*" "*"
## 7 ( 1 ) "*" "*" "*"
## 8 ( 1 ) "*" "*" "*"
## 9 ( 1 ) "*" "*" "*"
## 10 ( 1 ) "*" "*" "*"
## 11 ( 1 ) "*" "*" "*"
## 12 ( 1 ) "*" "*" "*"
## 13 ( 1 ) "*" "*" "*"
## 14 ( 1 ) "*" "*" "*"
## 15 ( 1 ) "*" "*" "*"
## 16 ( 1 ) "*" "*" "*"
## 17 ( 1 ) "*" "*" "*"
which.min(fit.summary$cp)
## [1] 13
which.min(fit.summary$bic)
## [1] 12
which.max(fit.summary$adjr2)
## [1] 14
par(mfrow=c(2,2))
plot(fit.summary$cp ,xlab="Number of Variables ", ylab="Cp",
type="b")
points(which.min(fit.summary$cp), fit.summary$cp[which.min(fit.summary$cp)], col="red", cex=2, pch=20)
plot(fit.summary$bic ,xlab="Number of Variables ",
ylab="BIC",type="b")
points(which.min(fit.summary$bic), fit.summary$bic[which.min(fit.summary$bic)], col="red", cex=2, pch=20)
plot(fit.summary$adjr2 ,xlab="Number of Variables ",
ylab="Adjusted R^2^",type="b")
points(which.max(fit.summary$adjr2), fit.summary$adjr2[which.max(fit.summary$adjr2)], col="red", cex=2, pch=20)
<- model.matrix(Outstate~., data=test)
test.matrix
<- rep(NA,17)
val.errors for(i in 1:17){
<- coef(fit,id=i)
coefi <- test.matrix[,names(coefi)]%*%coefi
pred <- mean((test$Outstate-pred)^2)
val.errors[i]
}
which.min(val.errors)
## [1] 15
plot(val.errors, type='b')
points(which.min(val.errors), val.errors[15], col='red', pch=20, cex=2)
<- regsubsets(Outstate ~ ., data = College, nvmax = 17, method = "forward")
fit.full coef(fit.full, 6)
## (Intercept) PrivateYes Room.Board PhD perc.alumni
## -3553.2345268 2768.6347025 0.9679086 35.5283359 48.4221031
## Expend Grad.Rate
## 0.2210255 29.7119093
Expend
and Grad.Rate
may have non-linear relationships with Outstate
.<- gam(Outstate ~ Private + s(Room.Board, 5) + s(PhD, 5) + s(perc.alumni, 5) + s(Expend, 5) + s(Grad.Rate, 5), data = train)
gam.fit
par(mfrow = c(2, 3))
plot(gam.fit, se = TRUE, col = "red")
<- predict(gam.fit, test)
preds <- mean((test$Outstate-preds)^2)
error 6] - error val.errors[
## [1] 819791.5
Expend
has a very strong non-linear relationship with Outstate
, and Room.Board
, PhD
, and Grad.Rate
all have moderate non-linear relationships with Outstate
.summary(gam.fit)
##
## Call: gam(formula = Outstate ~ Private + s(Room.Board, 5) + s(PhD,
## 5) + s(perc.alumni, 5) + s(Expend, 5) + s(Grad.Rate, 5),
## data = train)
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -7425.28 -1034.03 75.44 1221.06 7050.00
##
## (Dispersion Parameter for gaussian family taken to be 3339620)
##
## Null Deviance: 9930785374 on 621 degrees of freedom
## Residual Deviance: 1987073638 on 594.9999 degrees of freedom
## AIC: 11136.85
##
## Number of Local Scoring Iterations: NA
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## Private 1 2676168036 2676168036 801.339 < 2.2e-16 ***
## s(Room.Board, 5) 1 1860022552 1860022552 556.956 < 2.2e-16 ***
## s(PhD, 5) 1 731910548 731910548 219.160 < 2.2e-16 ***
## s(perc.alumni, 5) 1 468016569 468016569 140.141 < 2.2e-16 ***
## s(Expend, 5) 1 908149151 908149151 271.932 < 2.2e-16 ***
## s(Grad.Rate, 5) 1 110013934 110013934 32.942 1.514e-08 ***
## Residuals 595 1987073638 3339620
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Anova for Nonparametric Effects
## Npar Df Npar F Pr(F)
## (Intercept)
## Private
## s(Room.Board, 5) 4 2.3977 0.04913 *
## s(PhD, 5) 4 2.4659 0.04395 *
## s(perc.alumni, 5) 4 1.0603 0.37535
## s(Expend, 5) 4 25.6399 < 2e-16 ***
## s(Grad.Rate, 5) 4 2.7719 0.02652 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1