library(ISLR)
attach(Wage)
library(boot)
set.seed(1)
cv.error <- rep(0,10)
for(i in 1:10){
glm.fit <- glm(wage~poly(age,i), data=Wage)
cv.error[i] <- cv.glm(Wage,glm.fit,K=10)$delta[1]
}
cv.error
## [1] 1676.826 1600.763 1598.399 1595.651 1594.977 1596.061 1594.298 1598.134
## [9] 1593.913 1595.950
which.min(cv.error)
## [1] 9
cv.error[9]
## [1] 1593.913
The lowest d MSE was the 9th degree, with a result of
1593.913; however, the change of MSE after d=4 is
negligible (1595.651), so that will be used going forward. Utilizing the
ANOVA below, we see that after the 4th polynomial, it stops being
significant, which is in agreement with that above.
fit.1 <- lm(wage~age, data=Wage)
fit.2 <- lm(wage~poly (age, 2), data=Wage)
fit.3 <- lm(wage~poly (age, 3), data=Wage)
fit.4 <- lm(wage~poly (age, 4), data=Wage)
fit.5 <- lm(wage~poly (age, 5), data=Wage)
anova ( fit.1, fit.2, fit.3, fit.4, fit.5)
## Analysis of Variance Table
##
## Model 1: wage ~ age
## Model 2: wage ~ poly(age, 2)
## Model 3: wage ~ poly(age, 3)
## Model 4: wage ~ poly(age, 4)
## Model 5: wage ~ poly(age, 5)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 2998 5022216
## 2 2997 4793430 1 228786 143.5931 < 2.2e-16 ***
## 3 2996 4777674 1 15756 9.8888 0.001679 **
## 4 2995 4771604 1 6070 3.8098 0.051046 .
## 5 2994 4770322 1 1283 0.8050 0.369682
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot(wage ~ age, data=Wage, col="darkgrey")
agelims <- range(age)
age.grid <- seq(from=agelims[1], to=agelims[2])
fit <- lm(wage~poly(age,4), data=Wage)
preds <- predict(fit,newdata = list(age=age.grid),se=TRUE)
se.bands <- cbind(preds$fit+2*preds$se.fit,preds$fit-2*preds$se.fit)
title("Degree-4 Polynomial",outer=T)
lines(age.grid,preds$fit,lwd=2,col="blue")
matlines(age.grid,se.bands,lwd=1,col="blue",lty=3)
cv.errors <- rep(NA, 10)
for(i in 2:10){
Wage$age.cut <- cut(Wage$age,i)
glm.fit <- glm(wage ~ age.cut, data=Wage)
cv.errors[i] <- cv.glm(Wage, glm.fit, K=10)$delta[1]
}
cv.errors
## [1] NA 1735.201 1682.307 1636.967 1631.602 1625.438 1613.727 1600.229
## [9] 1612.512 1607.027
which.min(cv.errors)
## [1] 8
This shows that we should be using 8 cuts
table(cut(age,8))
##
## (17.9,25.8] (25.8,33.5] (33.5,41.2] (41.2,49] (49,56.8] (56.8,64.5]
## 231 519 671 728 503 276
## (64.5,72.2] (72.2,80.1]
## 54 18
fit=lm(wage~cut(age,8),data=Wage)
coef(summary(fit))
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 76.28175 2.629812 29.006542 3.110596e-163
## cut(age, 8)(25.8,33.5] 25.83329 3.161343 8.171618 4.440913e-16
## cut(age, 8)(33.5,41.2] 40.22568 3.049065 13.192791 1.136044e-38
## cut(age, 8)(41.2,49] 43.50112 3.018341 14.412262 1.406253e-45
## cut(age, 8)(49,56.8] 40.13583 3.176792 12.634076 1.098741e-35
## cut(age, 8)(56.8,64.5] 44.10243 3.564299 12.373380 2.481643e-34
## cut(age, 8)(64.5,72.2] 28.94825 6.041576 4.791505 1.736008e-06
## cut(age, 8)(72.2,80.1] 15.22418 9.781110 1.556488 1.196978e-01
preds <- predict(fit, list(age=age.grid))
plot(wage~age, data=Wage, col='darkgrey')
lines(age.grid, preds, col='blue', lwd=2)
detach(Wage)
attach(College)
library(leaps)
dim(College)
## [1] 777 18
regfit.fwd <- regsubsets(Outstate~.,College, nvmax=18, method="forward")
summary(regfit.fwd)
## Subset selection object
## Call: regsubsets.formula(Outstate ~ ., College, nvmax = 18, method = "forward")
## 17 Variables (and intercept)
## Forced in Forced out
## PrivateYes FALSE FALSE
## Apps FALSE FALSE
## Accept FALSE FALSE
## Enroll FALSE FALSE
## Top10perc FALSE FALSE
## Top25perc FALSE FALSE
## F.Undergrad FALSE FALSE
## P.Undergrad FALSE FALSE
## Room.Board FALSE FALSE
## Books FALSE FALSE
## Personal FALSE FALSE
## PhD FALSE FALSE
## Terminal FALSE FALSE
## S.F.Ratio FALSE FALSE
## perc.alumni FALSE FALSE
## Expend FALSE FALSE
## Grad.Rate FALSE FALSE
## 1 subsets of each size up to 17
## Selection Algorithm: forward
## PrivateYes Apps Accept Enroll Top10perc Top25perc F.Undergrad
## 1 ( 1 ) " " " " " " " " " " " " " "
## 2 ( 1 ) "*" " " " " " " " " " " " "
## 3 ( 1 ) "*" " " " " " " " " " " " "
## 4 ( 1 ) "*" " " " " " " " " " " " "
## 5 ( 1 ) "*" " " " " " " " " " " " "
## 6 ( 1 ) "*" " " " " " " " " " " " "
## 7 ( 1 ) "*" " " " " " " " " " " " "
## 8 ( 1 ) "*" " " " " " " " " " " " "
## 9 ( 1 ) "*" " " " " " " " " " " " "
## 10 ( 1 ) "*" " " "*" " " " " " " " "
## 11 ( 1 ) "*" " " "*" " " " " " " "*"
## 12 ( 1 ) "*" "*" "*" " " " " " " "*"
## 13 ( 1 ) "*" "*" "*" " " "*" " " "*"
## 14 ( 1 ) "*" "*" "*" "*" "*" " " "*"
## 15 ( 1 ) "*" "*" "*" "*" "*" " " "*"
## 16 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
## 17 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
## P.Undergrad Room.Board Books Personal PhD Terminal S.F.Ratio
## 1 ( 1 ) " " " " " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " " " " "
## 3 ( 1 ) " " "*" " " " " " " " " " "
## 4 ( 1 ) " " "*" " " " " " " " " " "
## 5 ( 1 ) " " "*" " " " " "*" " " " "
## 6 ( 1 ) " " "*" " " " " "*" " " " "
## 7 ( 1 ) " " "*" " " "*" "*" " " " "
## 8 ( 1 ) " " "*" " " "*" "*" "*" " "
## 9 ( 1 ) " " "*" " " "*" "*" "*" "*"
## 10 ( 1 ) " " "*" " " "*" "*" "*" "*"
## 11 ( 1 ) " " "*" " " "*" "*" "*" "*"
## 12 ( 1 ) " " "*" " " "*" "*" "*" "*"
## 13 ( 1 ) " " "*" " " "*" "*" "*" "*"
## 14 ( 1 ) " " "*" " " "*" "*" "*" "*"
## 15 ( 1 ) " " "*" "*" "*" "*" "*" "*"
## 16 ( 1 ) " " "*" "*" "*" "*" "*" "*"
## 17 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
## perc.alumni Expend Grad.Rate
## 1 ( 1 ) " " "*" " "
## 2 ( 1 ) " " "*" " "
## 3 ( 1 ) " " "*" " "
## 4 ( 1 ) "*" "*" " "
## 5 ( 1 ) "*" "*" " "
## 6 ( 1 ) "*" "*" "*"
## 7 ( 1 ) "*" "*" "*"
## 8 ( 1 ) "*" "*" "*"
## 9 ( 1 ) "*" "*" "*"
## 10 ( 1 ) "*" "*" "*"
## 11 ( 1 ) "*" "*" "*"
## 12 ( 1 ) "*" "*" "*"
## 13 ( 1 ) "*" "*" "*"
## 14 ( 1 ) "*" "*" "*"
## 15 ( 1 ) "*" "*" "*"
## 16 ( 1 ) "*" "*" "*"
## 17 ( 1 ) "*" "*" "*"
set.seed(1)
train <- sample(c(TRUE, FALSE), nrow(College), rep=TRUE)
head(train)
## [1] TRUE FALSE TRUE TRUE FALSE TRUE
test <- (!train)
head(test)
## [1] FALSE TRUE FALSE FALSE TRUE FALSE
regfit.full <- regsubsets(Outstate~.,data=College[train,], nvmax=17)
test.mat <- model.matrix(Outstate~.,data=College[test,])
val.errors <- rep(NA,17)
for(i in 1:17){
coefi <- coef(regfit.full, id=i)
pred <- test.mat[,names(coefi)]%*%coefi
val.errors[i] <- mean((College$Outstate[test]-pred)^2)
}
val.errors
## [1] 8743411 6271243 5188723 4548428 4269584 4087184 4067437 4119695 4031878
## [10] 3809441 3833116 3827900 3846028 3836026 3822375 3826807 3828915
#how many variables is best for the model?
which.min(val.errors)
## [1] 10
coef(regfit.full,10)
## (Intercept) PrivateYes Apps Accept Top10perc
## -3189.1762784 2340.3784123 -0.2845174 0.7052637 29.7207779
## F.Undergrad Room.Board Terminal perc.alumni Expend
## -0.1492326 0.9832653 27.5107535 51.2155999 0.1966401
## Grad.Rate
## 22.4494743
The best model uses 10 variables: Private,
Apps, Accept, Top10perc,
F.Undergrad, Room.Board,
Terminal, perc.alumni, Expend,
and Grad.Rate.
library(gam)
## Loading required package: splines
## Loading required package: foreach
## Loaded gam 1.22-1
gam1 <- gam(Outstate~Private+s(Apps,4)+s(Accept,4)+s(Top10perc, 4)+s(F.Undergrad,4)+s(Room.Board,4)+s(Terminal,4)+s(perc.alumni,4)+s(Expend,4)+s(Grad.Rate,4), data=College[train,])
par(mfrow=c(1,3))
plot(gam1, se=TRUE,col="blue")
preds <- predict(gam1, newdata = College[test,])
mean((College[test,]$Outstate-preds)^2)
## [1] 3699419
The MSE of this model on the test data is 3699419,
From the plots in Q10b, it appears that the following variables are
non-linear: top10perc, Terminal,
Expend, and Grad.Rate