library(ISLR2)
attach(Wage)
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:ISLR2':
##
## Boston
require(boot)
## Loading required package: boot
set.seed(1)
cv.error = rep(0, 5)
# 10-fold CV estimate of test MSE for polynomial degrees 1 to 5
for (i in 1:5) {
  glm.fit = glm(wage ~ poly(age, i), data = Wage)
  cv.error[i] = cv.glm(Wage, glm.fit, K = 10)$delta[1]
}
cv.error
## [1] 1676.826 1600.763 1598.399 1595.651 1594.977
plot(cv.error, type = "b", xlab = "Degree", ylab = "Test MSE")
points(which.min(cv.error), cv.error[which.min(cv.error)],
       col = "yellow", pch = 20, cex = 2)
fit.1=lm(wage~age,data=Wage)
fit.2=lm(wage~poly(age,2),data=Wage)
fit.3=lm(wage~poly(age,3),data=Wage)
fit.4=lm(wage~poly(age,4),data=Wage)
fit.5=lm(wage~poly(age,5),data=Wage)
anova(fit.1,fit.2,fit.3,fit.4,fit.5)
## Analysis of Variance Table
##
## Model 1: wage ~ age
## Model 2: wage ~ poly(age, 2)
## Model 3: wage ~ poly(age, 3)
## Model 4: wage ~ poly(age, 4)
## Model 5: wage ~ poly(age, 5)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 2998 5022216
## 2 2997 4793430 1 228786 143.5931 < 2.2e-16 ***
## 3 2996 4777674 1 15756 9.8888 0.001679 **
## 4 2995 4771604 1 6070 3.8098 0.051046 .
## 5 2994 4770322 1 1283 0.8050 0.369682
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
We fit the model five times, with polynomials of degree 1 through 5, and compare the nested fits. The p-value comparing Model 1 (linear) to Model 2 (quadratic) is essentially zero, indicating that a linear fit is not sufficient. The p-value comparing Model 2 to Model 3 is also low (0.0017), so the quadratic fit is insufficient as well. The p-value comparing Model 3 to Model 4 is about 5%, while the degree-5 polynomial appears unnecessary (p = 0.37). As a result, a cubic or quartic polynomial is the best choice.
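Because poly() produces orthogonal polynomials, these p-values can be double-checked from the degree-5 fit alone: squaring each t-statistic in its coefficient table reproduces the corresponding F-statistic in the ANOVA table above.
# Squared t-statistics for the orthogonal polynomial terms equal
# the F-statistics reported by anova() above
coef(summary(fit.5))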
plot(wage ~ age,
data = Wage,
col = "darkgrey")
age.range = range(Wage$age)
age.grid = seq(from = age.range[1],
to = age.range[2])
fit = lm(wage ~ poly(age, 3),
data = Wage)
preds = predict(fit,
newdata = list(age = age.grid))
lines(age.grid,
preds,
col = "yellow",
lwd = 2)
cv.errors = rep(NA, 10)
# 10-fold CV estimate of test MSE for step functions with 2 to 10 cuts
for (i in 2:10) {
  Wage$age.cut = cut(Wage$age, i)
  glm.fit = glm(wage ~ age.cut, data = Wage)
  cv.errors[i] = cv.glm(Wage, glm.fit, K = 10)$delta[1]
}
cv.errors
## [1] NA 1734.999 1682.289 1636.768 1632.149 1624.180 1611.908 1601.480
## [9] 1611.079 1604.125
plot(2:10,cv.errors[-1],type="b",xlab="Number of cuts",ylab="Test MSE")
points(which.min(cv.errors),cv.errors[which.min(cv.errors)],col="yellow",pch=20,cex=2)
plot(wage ~ age,
data = Wage,
col = "darkgrey")
fit = glm(wage ~ cut(age, 8),
data = Wage)
preds = predict(fit, list(age = age.grid))
lines(age.grid, preds,
col = "yellow",
lwd = 2)
set.seed(1)
training.data=sample(1:nrow(College),nrow(College)/4)
train=College[training.data, ]
test=College[-training.data, ]
require(leaps)
## Loading required package: leaps
fwd=regsubsets(Outstate~.,data=train,nvmax=17,method='forward')
fwd_sum=summary(fwd)
par(mfrow=c(2,2))
plot(fwd_sum$cp,xlab="Number of Variables",ylab="Cp",type="b")
points(which.min(fwd_sum$cp),fwd_sum$cp[which.min(fwd_sum$cp)],col="yellow",cex=2,pch=20)
plot(fwd_sum$bic,xlab="Number of Variables",ylab="BIC",type="b")
points(which.min(fwd_sum$bic),fwd_sum$bic[which.min(fwd_sum$bic)],col="yellow",cex=2,pch=20)
plot(fwd_sum$adjr2,xlab="Number of Variables",ylab="Adjusted R^2",type="b")
points(which.max(fwd_sum$adjr2),fwd_sum$adjr2[which.max(fwd_sum$adjr2)],col="yellow",cex=2,pch=20)
test.matrix = model.matrix(Outstate ~ ., data = test)
# Validation-set MSE for each model size chosen by forward selection
val.errors = rep(NA, 17)
for (i in 1:17) {
  coefi = coef(fwd, id = i)
  pred = test.matrix[, names(coefi)] %*% coefi
  val.errors[i] = mean((test$Outstate - pred)^2)
}
which.min(val.errors)
## [1] 6
Cp and BIC favor a forward stepwise model with roughly 12 variables, but the validation-set MSE is minimized at 6 variables. Since the test MSE does not meaningfully decrease from 6 to 12 variables, the simpler 6-variable model is sufficient.
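To see which six predictors the validation approach selects, we can extract that model's coefficients from the regsubsets object (the exact variables depend on the random split):
# Coefficients of the 6-variable forward stepwise model
coef(fwd, id = 6)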
library(gam)
## Loading required package: splines
## Loading required package: foreach
## Loaded gam 1.20.2
gam.fit = gam(Outstate ~ Private + s(Room.Board, 3) + s(Terminal, 3) +
                s(perc.alumni, 3) + s(Expend, 3) + s(Grad.Rate, 3),
              data = train)
par(mfrow=c(2,3))
plot(gam.fit,se=TRUE,col="yellow")
preds=predict(gam.fit,newdata=test)
error=mean((test$Outstate-preds)^2)
val.errors[6]-error
## [1] 423091.4
The GAM outperforms the six-variable linear model: its test MSE is lower by about 423,091, the difference val.errors[6] - error shown above.
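For scale, the GAM's test performance can also be expressed as an RMSE and a test-set R^2; a quick sketch using the objects already defined (values will vary with the seed and split):
# RMSE in dollars, and proportion of test-set variance explained
sqrt(error)
1 - error / mean((test$Outstate - mean(test$Outstate))^2)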
summary(gam.fit)
##
## Call: gam(formula = Outstate ~ Private + s(Room.Board, 3) + s(Terminal,
## 3) + s(perc.alumni, 3) + s(Expend, 3) + s(Grad.Rate, 3),
## data = train)
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -6743.42 -1148.64 -68.99 1082.48 7675.24
##
## (Dispersion Parameter for gaussian family taken to be 4106616)
##
## Null Deviance: 3581544240 on 193 degrees of freedom
## Residual Deviance: 726869124 on 176.9995 degrees of freedom
## AIC: 3523.01
##
## Number of Local Scoring Iterations: NA
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## Private 1 903194457 903194457 219.936 < 2.2e-16 ***
## s(Room.Board, 3) 1 687781046 687781046 167.481 < 2.2e-16 ***
## s(Terminal, 3) 1 132474605 132474605 32.259 5.446e-08 ***
## s(perc.alumni, 3) 1 176550785 176550785 42.992 5.822e-10 ***
## s(Expend, 3) 1 367552516 367552516 89.502 < 2.2e-16 ***
## s(Grad.Rate, 3) 1 58312745 58312745 14.200 0.0002237 ***
## Residuals 177 726869124 4106616
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Anova for Nonparametric Effects
## Npar Df Npar F Pr(F)
## (Intercept)
## Private
## s(Room.Board, 3) 2 1.0805 0.3416
## s(Terminal, 3) 2 0.9042 0.4067
## s(perc.alumni, 3) 2 0.9588 0.3853
## s(Expend, 3) 2 21.0946 6.068e-09 ***
## s(Grad.Rate, 3) 2 1.1730 0.3118
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
According to the "Anova for Nonparametric Effects" table, there is a highly significant non-linear relationship between Outstate and Expend (p = 6.1e-09). None of the other smooth terms shows significant evidence of non-linearity (all p > 0.3), so linear terms would likely suffice for Room.Board, Terminal, perc.alumni, and Grad.Rate.
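A natural follow-up, sketched below (not part of the original analysis): refit the GAM keeping a smooth term only for Expend, with linear terms for the predictors whose non-parametric effects were not significant, and compare the two fits with an F-test.
# Smooth term only for Expend; linear terms elsewhere
gam.lin = gam(Outstate ~ Private + Room.Board + Terminal + perc.alumni +
                s(Expend, 3) + Grad.Rate, data = train)
anova(gam.lin, gam.fit, test = "F")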