library(ISLR)
## Warning: package 'ISLR' was built under R version 3.6.3
library(boot)
data("Wage")
delta_z = rep(NA, 10)
for (i in 1:10) {LMfit1 = glm(wage~poly(age, i), data=Wage)
delta_z[i] = cv.glm(Wage, LMfit1, K=10)$delta[1]}
delta_z
## [1] 1675.557 1601.395 1595.659 1595.339 1596.171 1595.740 1594.250 1596.182
## [9] 1591.607 1593.251
plot(1:10, delta_z, xlab = 'Degree', ylab = 'Test Error', type = 'l')
degree.min <- min(delta_z)
points(degree.min, delta_z[degree.min], col = 'blue', lwd = 2, pch = 20)
fit1= lm(wage∼age ,data=Wage)
fit2= lm(wage∼poly(age ,2) ,data=Wage)
fit3= lm(wage∼poly(age ,3) ,data=Wage)
fit4= lm(wage∼poly(age ,4) ,data=Wage)
fit5= lm(wage∼poly(age ,5) ,data=Wage)
anova(fit1, fit2, fit3, fit4, fit5)
## Analysis of Variance Table
##
## Model 1: wage ~ age
## Model 2: wage ~ poly(age, 2)
## Model 3: wage ~ poly(age, 3)
## Model 4: wage ~ poly(age, 4)
## Model 5: wage ~ poly(age, 5)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 2998 5022216
## 2 2997 4793430 1 228786 143.5931 < 2.2e-16 ***
## 3 2996 4777674 1 15756 9.8888 0.001679 **
## 4 2995 4771604 1 6070 3.8098 0.051046 .
## 5 2994 4770322 1 1283 0.8050 0.369682
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot(wage~age, data = Wage, col = "darkgrey")
agelims = range(Wage$age)
age.grid = seq (from=agelims [1], to=agelims [2])
LMfit3 <- lm(wage ~ poly(age, 3), data = Wage)
preds=predict (LMfit3 ,newdata =list(age=age.grid))
lines(age.grid, preds, col = "red", lwd = 2)
8 Cuts are optimal.
cv_errors = rep(NA, 10)
for (i in 2:10) {
Wage$age.cut = cut(Wage$age, i)
LMfit4 = glm(wage~age.cut, data=Wage)
cv_errors[i] = cv.glm(Wage, LMfit4, K=10)$delta[2]
}
plot(2:10, cv_errors[-1], xlab="Number of cuts", ylab="Test error", type="l", pch=20, lwd=2)
degree.min2 = min(cv_errors)
points(degree.min2, cv_errors[degree.min2], col = 'blue', lwd = 2, pch = 20)
LMfit5 <- glm(wage ~ cut(age, 8), data = Wage)
plot(wage~age, data = Wage, col = "darkgrey")
agelims = range(Wage$age)
age.grid = seq (from=agelims [1], to=agelims [2])
preds2=predict (LMfit5 ,newdata =list(age=age.grid))
lines(age.grid, preds2, col = "red", lwd = 2)
data('College')
library(leaps)
## Warning: package 'leaps' was built under R version 3.6.3
train = sample(1: nrow(College), nrow(College)/2)
test <- -train
College.train = College[train, ]
College.test = College[test, ]
LMfit6 <- regsubsets(Outstate ~ ., data = College.train, method = 'forward')
fit.summary <- summary(LMfit6)
par(mfrow=c(2,2))
plot(fit.summary$cp ,xlab="Number of Variables ", ylab="Cp",
type="b")
points(which.min(fit.summary$cp), fit.summary$cp[which.min(fit.summary$cp)], col="red", cex=2, pch=20)
plot(fit.summary$bic ,xlab="Number of Variables ", ylab="BIC",
type="b")
points(which.min(fit.summary$bic), fit.summary$bic[which.min(fit.summary$bic)], col="red", cex=2, pch=20)
coef(LMfit6, id = 6)
## (Intercept) PrivateYes Room.Board Terminal perc.alumni
## -4429.2119297 2718.5469105 0.9975076 34.0583542 37.5178405
## Expend Grad.Rate
## 0.2043631 46.0407806
library(gam)
## Warning: package 'gam' was built under R version 3.6.3
## Loading required package: splines
## Loading required package: foreach
## Warning: package 'foreach' was built under R version 3.6.3
## Loaded gam 1.16.1
GAMmod = gam(Outstate ~ Private + s(Room.Board, 5) + s(Terminal, 5) + s(perc.alumni, 5) + s(Expend, 5) + s(Grad.Rate, 5), data = College.train)
## Warning in model.matrix.default(mt, mf, contrasts): non-list contrasts argument
## ignored
par(mfrow = c(2,3))
plot(GAMmod, se = T, col = 'blue')
R square is 0.7792718
predgam = predict(GAMmod, College.test)
gam.err = mean((College.test$Outstate - predgam)^2)
gam.err
## [1] 3555382
gam.tss = mean((College.test$Outstate - mean(College.test$Outstate))^2)
gam.tss
## [1] 16053612
gam.rss = 1 - gam.err/gam.tss
gam.rss
## [1] 0.7785307
The model, Anova for Parametric Effects suggests a strong non-linear relationship between “Outstate” and “Expend”
summary(GAMmod)
##
## Call: gam(formula = Outstate ~ Private + s(Room.Board, 5) + s(Terminal,
## 5) + s(perc.alumni, 5) + s(Expend, 5) + s(Grad.Rate, 5),
## data = College.train)
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -6825.03 -1092.17 -50.86 1153.66 7805.79
##
## (Dispersion Parameter for gaussian family taken to be 3588585)
##
## Null Deviance: 6309716682 on 387 degrees of freedom
## Residual Deviance: 1295477607 on 360.9995 degrees of freedom
## AIC: 6985.3
##
## Number of Local Scoring Iterations: 2
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## Private 1 1632490381 1632490381 454.912 < 2.2e-16 ***
## s(Room.Board, 5) 1 1207287961 1207287961 336.425 < 2.2e-16 ***
## s(Terminal, 5) 1 294916787 294916787 82.182 < 2.2e-16 ***
## s(perc.alumni, 5) 1 220220352 220220352 61.367 5.346e-14 ***
## s(Expend, 5) 1 481145035 481145035 134.077 < 2.2e-16 ***
## s(Grad.Rate, 5) 1 176809984 176809984 49.270 1.111e-11 ***
## Residuals 361 1295477607 3588585
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Anova for Nonparametric Effects
## Npar Df Npar F Pr(F)
## (Intercept)
## Private
## s(Room.Board, 5) 4 1.9899 0.09550 .
## s(Terminal, 5) 4 3.2012 0.01330 *
## s(perc.alumni, 5) 4 0.9403 0.44059
## s(Expend, 5) 4 17.1480 6.877e-13 ***
## s(Grad.Rate, 5) 4 2.8091 0.02551 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1