In this exercise, you will further analyze the Wage
data set considered throughout this chapter.
library(ISLR)
library(boot)
data(Wage)
set.seed(100)
cv.error <- rep(0,5)
for (i in 1:5){
glm.fit <- glm(wage ~ poly(age,i),data=Wage)
cv.error[i]<- cv.glm(Wage,glm.fit,K=10)$delta[1]
}
cv.error
## [1] 1677.456 1600.961 1596.398 1594.752 1597.006
plot(cv.error, type="b", xlab="Degree", ylab="Test MSE")
points(which.min(cv.error), cv.error[4], col="green", pch=20, cex=2)
By looking at the plot, it looks like the optimal number of clusters is 4 according to cross-validation.
model_1 <- lm(wage ~ age, data=Wage)
model_2 <- lm(wage ~ poly(age, 2), data=Wage)
model_3 <- lm(wage ~ poly(age, 3), data=Wage)
model_4 <- lm(wage ~ poly(age, 4), data=Wage)
model_5 <- lm(wage ~ poly(age, 5), data=Wage)
anova(model_1, model_2, model_3, model_4, model_5)
## Analysis of Variance Table
##
## Model 1: wage ~ age
## Model 2: wage ~ poly(age, 2)
## Model 3: wage ~ poly(age, 3)
## Model 4: wage ~ poly(age, 4)
## Model 5: wage ~ poly(age, 5)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 2998 5022216
## 2 2997 4793430 1 228786 143.5931 < 2.2e-16 ***
## 3 2996 4777674 1 15756 9.8888 0.001679 **
## 4 2995 4771604 1 6070 3.8098 0.051046 .
## 5 2994 4770322 1 1283 0.8050 0.369682
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
It looks like Model 3 and Model 4 are the best models according to their p-values being less than 0.05.
plot(wage ~ age, data = Wage, col = "pink")
age_range <- range(Wage$age)
age_grid <- seq(from = age_range[1], to = age_range[2])
poly_model <- lm(wage ~ poly(age, 3), data = Wage)
preds <- predict(poly_model, newdata = list(age = age_grid))
lines(age_grid, preds, col = "black", lwd = 3)
wage
using age
, and perform cross-validation to choose the optimal number of cuts. Make a plot of the fit obtained.cv_errors_partb <- rep(NA, 10)
for(i in 2:10){
Wage$age.cut <- cut(Wage$age,i)
glm.fit <- glm(wage ~ age.cut, data=Wage)
cv_errors_partb[i] <- cv.glm(Wage, glm.fit, K=10)$delta[1]
}
cv_errors_partb
## [1] NA 1733.379 1682.599 1635.304 1631.746 1622.379 1609.777 1601.282
## [9] 1607.650 1604.789
plot(2:10, cv_errors_partb[-1], type="b", xlab="Number of cuts", ylab="Test MSE")
points(which.min(cv_errors_partb), cv_errors_partb[which.min(cv_errors_partb)], col="green", pch=20, cex=2)
By looking at this MSE project the optimal number of clusters is 8.
plot(wage ~ age, data = Wage, col = "pink")
fit <- glm(wage ~ cut(age, 8), data = Wage)
preds <- predict(fit, list(age = age_grid))
lines(age_grid, preds, col = "black", lwd = 3)
This question relates to the College data set.
## Subset selection object
## Call: regsubsets.formula(Outstate ~ ., data = College, subset = train,
## method = "forward")
## 17 Variables (and intercept)
## Forced in Forced out
## PrivateYes FALSE FALSE
## Apps FALSE FALSE
## Accept FALSE FALSE
## Enroll FALSE FALSE
## Top10perc FALSE FALSE
## Top25perc FALSE FALSE
## F.Undergrad FALSE FALSE
## P.Undergrad FALSE FALSE
## Room.Board FALSE FALSE
## Books FALSE FALSE
## Personal FALSE FALSE
## PhD FALSE FALSE
## Terminal FALSE FALSE
## S.F.Ratio FALSE FALSE
## perc.alumni FALSE FALSE
## Expend FALSE FALSE
## Grad.Rate FALSE FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: forward
## PrivateYes Apps Accept Enroll Top10perc Top25perc F.Undergrad
## 1 ( 1 ) " " " " " " " " " " " " " "
## 2 ( 1 ) "*" " " " " " " " " " " " "
## 3 ( 1 ) "*" " " " " " " " " " " " "
## 4 ( 1 ) "*" " " " " " " " " " " " "
## 5 ( 1 ) "*" " " " " " " " " " " " "
## 6 ( 1 ) "*" " " " " " " " " " " " "
## 7 ( 1 ) "*" " " " " " " " " " " " "
## 8 ( 1 ) "*" " " " " " " " " " " " "
## P.Undergrad Room.Board Books Personal PhD Terminal S.F.Ratio
## 1 ( 1 ) " " " " " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " " " " "
## 3 ( 1 ) " " "*" " " " " " " " " " "
## 4 ( 1 ) " " "*" " " " " " " " " " "
## 5 ( 1 ) " " "*" " " " " "*" " " " "
## 6 ( 1 ) " " "*" " " " " "*" " " " "
## 7 ( 1 ) " " "*" "*" " " "*" " " " "
## 8 ( 1 ) " " "*" "*" " " "*" "*" " "
## perc.alumni Expend Grad.Rate
## 1 ( 1 ) " " "*" " "
## 2 ( 1 ) " " "*" " "
## 3 ( 1 ) " " "*" " "
## 4 ( 1 ) "*" "*" " "
## 5 ( 1 ) "*" "*" " "
## 6 ( 1 ) "*" "*" "*"
## 7 ( 1 ) "*" "*" "*"
## 8 ( 1 ) "*" "*" "*"
library(gam)
## Loading required package: splines
## Loading required package: foreach
## Loaded gam 1.20
gam_model <- gam(Outstate ~ Private + s(Room.Board, 5) + s(Terminal, 5) + s(perc.alumni, 5) + s(Expend, 5) + s(Grad.Rate, 5), data = College, subset = train)
par(mfrow = c(2,3))
plot(gam_model, se = TRUE, col = 'pink')
From the plots it looks like Expend
and Grad.Rate
are non-linear with Outstate
.
preds_q10 <- predict(gam_model, College[test, ])
RSS <- sum((College[test, ]$Outstate - preds_q10)^2)
TSS <- sum((College[test, ]$Outstate - mean(College[test, ]$Outstate)) ^ 2)
1 - (RSS / TSS)
## [1] 0.7682499
From this calculation it looks like the R squared statistic is 0.77. This is the variance explained by the model.
summary(gam_model)
##
## Call: gam(formula = Outstate ~ Private + s(Room.Board, 5) + s(Terminal,
## 5) + s(perc.alumni, 5) + s(Expend, 5) + s(Grad.Rate, 5),
## data = College, subset = train)
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -6567.11 -1131.80 30.67 1232.04 4546.73
##
## (Dispersion Parameter for gaussian family taken to be 3277148)
##
## Null Deviance: 6379056571 on 387 degrees of freedom
## Residual Deviance: 1183050282 on 361 degrees of freedom
## AIC: 6950.075
##
## Number of Local Scoring Iterations: NA
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## Private 1 1683336087 1683336087 513.659 < 2.2e-16 ***
## s(Room.Board, 5) 1 1405283832 1405283832 428.813 < 2.2e-16 ***
## s(Terminal, 5) 1 364996890 364996890 111.376 < 2.2e-16 ***
## s(perc.alumni, 5) 1 270891372 270891372 82.661 < 2.2e-16 ***
## s(Expend, 5) 1 587433739 587433739 179.251 < 2.2e-16 ***
## s(Grad.Rate, 5) 1 96795747 96795747 29.537 1.012e-07 ***
## Residuals 361 1183050282 3277148
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Anova for Nonparametric Effects
## Npar Df Npar F Pr(F)
## (Intercept)
## Private
## s(Room.Board, 5) 4 2.4695 0.044444 *
## s(Terminal, 5) 4 1.6104 0.171044
## s(perc.alumni, 5) 4 1.8913 0.111372
## s(Expend, 5) 4 15.1845 1.743e-11 ***
## s(Grad.Rate, 5) 4 4.3415 0.001926 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Looking at the p-values, Expend
has the strongest linear relationship with Outstate
. Other variables that have a linear relationship with Outstate
are Room.Board and Grad.Rate
.