library(ISLR)
library(boot)
library(leaps)
library(gam)
attach(Wage)
set.seed(1)
# 10-fold cross-validation error for polynomial fits of degree 1 through 6
crossValError = rep(NA, 6)
for (i in 1:6) {
  poly.fit = glm(wage ~ poly(age, i), data = Wage)
  crossValError[i] = cv.glm(Wage, poly.fit, K = 10)$delta[1]
}
crossValError
## [1] 1676.826 1600.763 1598.399 1595.651 1594.977 1596.061
plot(crossValError, type = 'o', col = 'blue', xlab = 'Degree of Polynomial', ylab = 'Cross-Validation Error')
which.min(crossValError)
## [1] 5
The degree-5 fit has the lowest cross-validation MSE, but the plot above shows that the difference between degrees 4 and 5 is minimal. As such, degree 4 will be used for the polynomial.
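To make that judgement concrete (a small sketch using the crossValError vector computed above), the relative improvement in CV error from degree 4 to degree 5 can be checked directly:
# Relative change in CV error going from degree 4 to degree 5; a value
# close to zero supports keeping the simpler degree-4 model
(crossValError[4] - crossValError[5]) / crossValError[4]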
fit.4=lm(wage~poly(age,4),data=Wage)
coef(summary(fit.4))
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 111.70361 0.7287409 153.283015 0.000000e+00
## poly(age, 4)1 447.06785 39.9147851 11.200558 1.484604e-28
## poly(age, 4)2 -478.31581 39.9147851 -11.983424 2.355831e-32
## poly(age, 4)3 125.52169 39.9147851 3.144742 1.678622e-03
## poly(age, 4)4 -77.91118 39.9147851 -1.951938 5.103865e-02
fit.1=lm(wage~age,data=Wage)
fit.2=lm(wage~poly(age,2),data=Wage)
fit.3=lm(wage~poly(age,3),data=Wage)
fit.4=lm(wage~poly(age,4),data=Wage)
fit.5=lm(wage~poly(age,5),data=Wage)
fit.6=lm(wage~poly(age,6),data=Wage)
anova(fit.1,fit.2,fit.3,fit.4,fit.5,fit.6)
## Analysis of Variance Table
##
## Model 1: wage ~ age
## Model 2: wage ~ poly(age, 2)
## Model 3: wage ~ poly(age, 3)
## Model 4: wage ~ poly(age, 4)
## Model 5: wage ~ poly(age, 5)
## Model 6: wage ~ poly(age, 6)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 2998 5022216
## 2 2997 4793430 1 228786 143.6636 < 2.2e-16 ***
## 3 2996 4777674 1 15756 9.8936 0.001675 **
## 4 2995 4771604 1 6070 3.8117 0.050989 .
## 5 2994 4770322 1 1283 0.8054 0.369565
## 6 2993 4766389 1 3932 2.4692 0.116201
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Based on the above ANOVA table, a cubic or quartic polynomial appears sufficient: the linear and quadratic fits are inadequate (the quadratic and cubic terms are highly significant), the quartic term is borderline (p ≈ 0.051), and degrees above 4 are not justified. This broadly agrees with the cross-validation results.
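As a quick consistency check (a sketch using the fits defined above), because poly() produces orthogonal polynomials, the squared t-statistics from the degree-5 fit should equal the sequential F-statistics from an ANOVA of the first five models:
# With orthogonal polynomials, squaring the t-statistics of the largest model
# reproduces the sequential F-statistics from the nested ANOVA
anova(fit.1, fit.2, fit.3, fit.4, fit.5)
coef(summary(fit.5))[, "t value"]^2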
agelims = range(Wage$age)
age.grid = seq(from=agelims[1], to=agelims[2])
lm.fit = lm(wage ~ poly(age, 4), data = Wage)   # degree-4 fit chosen above
lm.pred = predict(lm.fit, data.frame(age = age.grid))
plot(Wage$age, Wage$wage, xlim = agelims, cex = .5, col = "darkgrey")
lines(age.grid, lm.pred, col = "blue", lwd = 2)
# 10-fold cross-validation for step functions of age with 2 to 10 intervals
step.mse = c()
for (i in 2:10) {
  fit.step = model.frame(wage ~ cut(age, i), data = Wage)
  names(fit.step) = c('wage', 'age')            # 'age' is now a factor with i levels
  step.fit = glm(wage ~ age, data = fit.step)
  mse = cv.glm(fit.step, step.fit, K = 10)$delta[1]
  step.mse = c(step.mse, mse)
}
plot(2:10, step.mse, xlab = 'Number of Intervals', ylab = 'Cross-Validation Error', type = 'l', col = 'blue')
which.min(step.mse)
## [1] 7
The minimum is at index 7 of step.mse, which corresponds to cutting age into 8 intervals, since the loop starts at 2 cuts.
agelims = range(Wage$age)
age.grid = seq(from=agelims[1], to=agelims[2])
fit.step = glm(wage ~ cut(age, 8), data = Wage)   # 8 intervals chosen by cross-validation
pred.step = predict(fit.step, newdata=list(age=age.grid), se=TRUE)
se.bands = pred.step$fit + cbind(2*pred.step$se.fit, -2*pred.step$se.fit)
plot(Wage$age,Wage$wage, xlim=agelims, cex=0.5, col="darkgrey")
lines(age.grid, pred.step$fit, lwd=2, col="blue")
matlines(age.grid, se.bands, lwd=1, col="blue", lty=3)
attach(College)
set.seed(1)
train=sample(1:nrow(College), nrow(College)*.75)
train.college=College[train,]
test=(-train)
test.college=College[test,]
outstate.test=College$Outstate[test]
regfit.best=regsubsets(Outstate~.,data=College[train,],nvmax=17)
test.mat=model.matrix(Outstate~.,data=College[test,])
# Validation-set MSE for the best model of each size
val.errors = rep(NA, 17)
for (i in 1:17) {
  coefi = coef(regfit.best, id = i)
  pred = test.mat[, names(coefi)] %*% coefi
  val.errors[i] = mean((outstate.test - pred)^2)
}
val.errors
## [1] 7369252 6221067 4950665 3993584 3757305 3529917 3572471 3655013 3419373
## [10] 3257888 3356971 3322114 3301218 3276352 3296237 3313388 3303868
which.min(val.errors)
## [1] 10
summary(regfit.best)
## Subset selection object
## Call: regsubsets.formula(Outstate ~ ., data = College[train, ], nvmax = 17)
## 17 Variables (and intercept)
## Forced in Forced out
## PrivateYes FALSE FALSE
## Apps FALSE FALSE
## Accept FALSE FALSE
## Enroll FALSE FALSE
## Top10perc FALSE FALSE
## Top25perc FALSE FALSE
## F.Undergrad FALSE FALSE
## P.Undergrad FALSE FALSE
## Room.Board FALSE FALSE
## Books FALSE FALSE
## Personal FALSE FALSE
## PhD FALSE FALSE
## Terminal FALSE FALSE
## S.F.Ratio FALSE FALSE
## perc.alumni FALSE FALSE
## Expend FALSE FALSE
## Grad.Rate FALSE FALSE
## 1 subsets of each size up to 17
## Selection Algorithm: exhaustive
## PrivateYes Apps Accept Enroll Top10perc Top25perc F.Undergrad
## 1 ( 1 ) " " " " " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " " " " "
## 3 ( 1 ) " " " " " " " " " " " " " "
## 4 ( 1 ) "*" " " " " " " " " " " " "
## 5 ( 1 ) "*" " " " " " " " " " " " "
## 6 ( 1 ) "*" " " " " " " " " " " " "
## 7 ( 1 ) "*" " " " " " " " " " " " "
## 8 ( 1 ) "*" " " " " " " "*" " " " "
## 9 ( 1 ) "*" "*" "*" " " "*" " " "*"
## 10 ( 1 ) "*" "*" "*" " " "*" " " "*"
## 11 ( 1 ) "*" "*" "*" " " "*" " " "*"
## 12 ( 1 ) "*" "*" "*" " " "*" " " "*"
## 13 ( 1 ) "*" "*" "*" " " "*" " " "*"
## 14 ( 1 ) "*" "*" "*" "*" "*" " " "*"
## 15 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
## 16 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
## 17 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
## P.Undergrad Room.Board Books Personal PhD Terminal S.F.Ratio
## 1 ( 1 ) " " " " " " " " " " " " " "
## 2 ( 1 ) " " "*" " " " " " " " " " "
## 3 ( 1 ) " " "*" " " " " " " " " " "
## 4 ( 1 ) " " "*" " " " " " " " " " "
## 5 ( 1 ) " " "*" " " " " "*" " " " "
## 6 ( 1 ) " " "*" " " " " "*" " " " "
## 7 ( 1 ) " " "*" " " "*" "*" " " " "
## 8 ( 1 ) " " "*" " " "*" " " "*" " "
## 9 ( 1 ) " " "*" " " " " " " "*" " "
## 10 ( 1 ) " " "*" " " " " " " "*" " "
## 11 ( 1 ) " " "*" " " "*" " " "*" " "
## 12 ( 1 ) " " "*" " " "*" " " "*" "*"
## 13 ( 1 ) " " "*" " " "*" "*" "*" "*"
## 14 ( 1 ) " " "*" " " "*" "*" "*" "*"
## 15 ( 1 ) " " "*" " " "*" "*" "*" "*"
## 16 ( 1 ) "*" "*" " " "*" "*" "*" "*"
## 17 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
## perc.alumni Expend Grad.Rate
## 1 ( 1 ) " " "*" " "
## 2 ( 1 ) "*" " " " "
## 3 ( 1 ) "*" "*" " "
## 4 ( 1 ) "*" "*" " "
## 5 ( 1 ) "*" "*" " "
## 6 ( 1 ) "*" "*" "*"
## 7 ( 1 ) "*" "*" "*"
## 8 ( 1 ) "*" "*" "*"
## 9 ( 1 ) "*" "*" " "
## 10 ( 1 ) "*" "*" "*"
## 11 ( 1 ) "*" "*" "*"
## 12 ( 1 ) "*" "*" "*"
## 13 ( 1 ) "*" "*" "*"
## 14 ( 1 ) "*" "*" "*"
## 15 ( 1 ) "*" "*" "*"
## 16 ( 1 ) "*" "*" "*"
## 17 ( 1 ) "*" "*" "*"
The model with 10 predictors has the lowest validation-set MSE. This model predicts Outstate using Private, Apps, Accept, Top10perc, F.Undergrad, Room.Board, Terminal, perc.alumni, Expend, and Grad.Rate.
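For reference (a quick sketch using the regsubsets object above), the coefficient estimates of that 10-variable model can be extracted directly:
# Coefficient estimates for the best 10-variable subset on the training data
coef(regfit.best, id = 10)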
gam.fit = gam(Outstate ~ Private + s(Apps, 4) + s(Accept, 4) + s(Top10perc, 4) +
                s(F.Undergrad, 4) + s(Room.Board, 4) + s(Terminal, 4) +
                s(perc.alumni, 4) + s(Expend, 4) + s(Grad.Rate, 4),
              data = College[train, ])
plot(gam.fit, se = TRUE, col = "red")
Most of the predictor variables appear to have a fairly linear relationship with Outstate; Expend is the most non-linear. However, there is some curvature in the plots for several of the other variables, so a GAM may provide a better fit than a linear model.
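To back up these visual impressions (a sketch relying on the gam package's summary method), summary(gam.fit) includes an "Anova for Nonparametric Effects" table, which tests each smooth term against a purely linear alternative:
# Small p-values in the "Anova for Nonparametric Effects" table indicate
# predictors whose relationship with Outstate is genuinely non-linear
summary(gam.fit)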
pred.gam = predict(gam.fit, test.college)
MSE.gam = mean((outstate.test - pred.gam)^2)
MSE.gam
## [1] 2984282
val.errors[10]
## [1] 3257888
The GAM yields a test MSE of 2984282, compared with 3257888 for the best linear model found above. Based on this, the GAM appears to be the better fit.
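To put the two errors on a more interpretable scale (a small sketch; mse.null is just a helper name introduced here), they can be converted to test-set R-squared values:
# Baseline: MSE of simply predicting the test-set mean of Outstate
mse.null = mean((outstate.test - mean(outstate.test))^2)
1 - MSE.gam / mse.null          # test R-squared for the GAM
1 - val.errors[10] / mse.null   # test R-squared for the 10-variable linear model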
Expend appears to be the most non-linear. Additionally, both Top10perc and Room.Board may have a slightly non-linear relationship with Outstate.
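To probe the Expend effect specifically (a sketch under the same gam() setup as above; the object name gam.lin.expend is only illustrative), the fitted GAM can be compared with an otherwise identical model in which Expend enters linearly:
# Same specification as gam.fit, except Expend enters as a linear term
gam.lin.expend = gam(Outstate ~ Private + s(Apps, 4) + s(Accept, 4) + s(Top10perc, 4) +
                       s(F.Undergrad, 4) + s(Room.Board, 4) + s(Terminal, 4) +
                       s(perc.alumni, 4) + Expend + s(Grad.Rate, 4),
                     data = College[train, ])
anova(gam.lin.expend, gam.fit, test = "F")   # a small p-value favours the smooth in Expend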