library(ISLR)
## Warning: package 'ISLR' was built under R version 4.1.2
library(boot)
attach(Wage)
summary(Wage)
## year age maritl race
## Min. :2003 Min. :18.00 1. Never Married: 648 1. White:2480
## 1st Qu.:2004 1st Qu.:33.75 2. Married :2074 2. Black: 293
## Median :2006 Median :42.00 3. Widowed : 19 3. Asian: 190
## Mean :2006 Mean :42.41 4. Divorced : 204 4. Other: 37
## 3rd Qu.:2008 3rd Qu.:51.00 5. Separated : 55
## Max. :2009 Max. :80.00
##
## education region jobclass
## 1. < HS Grad :268 2. Middle Atlantic :3000 1. Industrial :1544
## 2. HS Grad :971 1. New England : 0 2. Information:1456
## 3. Some College :650 3. East North Central: 0
## 4. College Grad :685 4. West North Central: 0
## 5. Advanced Degree:426 5. South Atlantic : 0
## 6. East South Central: 0
## (Other) : 0
## health health_ins logwage wage
## 1. <=Good : 858 1. Yes:2083 Min. :3.000 Min. : 20.09
## 2. >=Very Good:2142 2. No : 917 1st Qu.:4.447 1st Qu.: 85.38
## Median :4.653 Median :104.92
## Mean :4.654 Mean :111.70
## 3rd Qu.:4.857 3rd Qu.:128.68
## Max. :5.763 Max. :318.34
##
poly.mse=c()
for(degree in 1:8){
glm.fit=glm(wage~poly(age, degree, raw=T), data=Wage)
cv.error=cv.glm(glm.fit, data = Wage, K=10)$delta[1]
poly.mse=c(poly.mse, cv.error)
}
plot(poly.mse, xlab='Degree', ylab='CV Error', type='l')
degree.min=which.min(poly.mse)
points(degree.min, poly.mse[degree.min], pch=18, cex=2, col='darkgreen')
fit=lm(wage~poly(age,5),data=Wage)
agelims=range(age)
age.grid=seq(from=agelims[1],to=agelims[2])
preds=predict(fit,newdata=list(age=age.grid),se=TRUE)
se.bands=cbind(preds$fit+2*preds$se.fit,preds$fit-2*preds$se.fit)
par(mfrow=c(1,1),mar=c(4.5,4.5,1,1),oma=c(0,0,2,0))
plot(age,wage,xlim=agelims,cex =.5,col="darkgrey")
title("Degree-5 Polynomial",outer=T)
lines(age.grid,preds$fit,lwd=2,col="darkblue")
matlines(age.grid,se.bands,lwd=1,col="lightblue",lty=3)
The ANOVA results are as follows:
fit.1=lm(wage~age,data=Wage)
fit.2=lm(wage~poly(age,2),data=Wage)
fit.3=lm(wage~poly(age,3),data=Wage)
fit.4=lm(wage~poly(age,4),data=Wage)
fit.5=lm(wage~poly(age,5),data=Wage)
anova(fit.1,fit.2,fit.3,fit.4,fit.5)
## Analysis of Variance Table
##
## Model 1: wage ~ age
## Model 2: wage ~ poly(age, 2)
## Model 3: wage ~ poly(age, 3)
## Model 4: wage ~ poly(age, 4)
## Model 5: wage ~ poly(age, 5)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 2998 5022216
## 2 2997 4793430 1 228786 143.5931 < 2.2e-16 ***
## 3 2996 4777674 1 15756 9.8888 0.001679 **
## 4 2995 4771604 1 6070 3.8098 0.051046 .
## 5 2994 4770322 1 1283 0.8050 0.369682
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
These results suggest using a degree of 4 is optimal while the degree of 5 as shown above is unneccesary.
fit=lm(wage~poly(age,4),data=Wage)
agelims=range(age)
age.grid=seq(from=agelims[1],to=agelims[2])
preds=predict(fit,newdata=list(age=age.grid),se=TRUE)
se.bands=cbind(preds$fit+2*preds$se.fit,preds$fit-2*preds$se.fit)
par(mfrow=c(1,1),mar=c(4.5,4.5,1,1),oma=c(0,0,2,0))
plot(age,wage,xlim=agelims,cex =.5,col="darkgrey")
title("Degree-4 Polynomial",outer=T)
lines(age.grid,preds$fit,lwd=2,col="darkblue")
matlines(age.grid,se.bands,lwd=1,col="lightblue",lty=3)
set.seed(321)
step.mse=c()
for(i in 2:10){
Wage.model=model.frame(wage~cut(age,i), data=Wage)
names(Wage.model)=c('wage','age')
step.fit=glm(wage~age,data=Wage.model)
cv.err=cv.glm(step.fit, data = Wage.model, K=10)$delta[1]
step.mse=c(step.mse, cv.err)
}
plot(step.mse,xlab='Cuts',ylab='CV Error',type='l')
cut.min=which.min(step.mse)
points(cut.min,step.mse[cut.min],pch=18,cex=2,col='darkgreen')
fit=lm(wage~cut(age,7),data=Wage)
coef(summary(fit))
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 80.06402 2.405222 33.287586 5.517532e-207
## cut(age, 7)(26.9,35.7] 24.51568 2.892501 8.475600 3.617008e-17
## cut(age, 7)(35.7,44.6] 38.59253 2.792477 13.820180 3.695152e-42
## cut(age, 7)(44.6,53.4] 38.05482 2.812402 13.531077 1.556835e-40
## cut(age, 7)(53.4,62.3] 39.03969 3.082094 12.666611 7.411125e-36
## cut(age, 7)(62.3,71.1] 31.03930 4.884196 6.355048 2.400708e-10
## cut(age, 7)(71.1,80.1] 15.99445 9.075719 1.762334 7.811488e-02
fit=lm(wage~cut(age,7),data=Wage)
agelims=range(age)
age.grid=seq(from=agelims[1],to=agelims[2])
preds=predict(fit,newdata=list(age=age.grid),se=TRUE)
se.bands=cbind(preds$fit+2*preds$se.fit,preds$fit-2*preds$se.fit)
par(mfrow=c(1,1),mar=c(4.5,4.5,1,1),oma=c(0,0,2,0))
plot(age,wage,xlim=agelims,cex =.5,col="darkgrey")
title("Step Function Fit",outer=T)
lines(age.grid,preds$fit,lwd=2,col="darkblue")
matlines(age.grid,se.bands,lwd=1,col="lightblue",lty=3)
detach(Wage)
attach(College)
set.seed(1)
train=sample(dim(College)[1], dim(College)[1]/2)
test=(-train)
College.train = College[train, ]
College.test = College[test, ]
library(leaps)
## Warning: package 'leaps' was built under R version 4.1.3
regfit.fwd=regsubsets(Outstate~., data=College.train, nvmax=17, method ="forward")
reg.summary=summary(regfit.fwd)
par(mfrow=c(2,2))
plot(reg.summary$rss, xlab="Number of Variables ", ylab="RSS", type="l")
plot(reg.summary$adjr2, xlab="Number of Variables ", ylab="Adjusted RSq",type="l")
which.max(reg.summary$adjr2)
## [1] 14
points(14,reg.summary$adjr2[14], col="red", cex=2, pch =20)
plot(reg.summary$cp,xlab="Number of Variables ",ylab="Cp",type='l')
which.min(reg.summary$cp)
## [1] 14
points(14,reg.summary$cp[14], col ="red", cex=2, pch =20)
plot(reg.summary$bic ,xlab="Number of Variables ", ylab="BIC", type='l')
which.min(reg.summary$bic)
## [1] 6
points (6,reg.summary$bic [6],col="red",cex=2,pch =20)
regfit=regsubsets(Outstate~., data = College, method = "forward")
coeffs=coef(regfit, id = 6)
names(coeffs)
## [1] "(Intercept)" "PrivateYes" "Room.Board" "PhD" "perc.alumni"
## [6] "Expend" "Grad.Rate"
library(gam)
## Warning: package 'gam' was built under R version 4.1.3
## Loading required package: splines
## Loading required package: foreach
## Warning: package 'foreach' was built under R version 4.1.2
## Loaded gam 1.20.1
gam1=gam(Outstate ~ Private + s(Room.Board, 2) + s(PhD, 2) + s(perc.alumni, 2) + s(Expend, 5) + s(Grad.Rate, 2), data=College.train)
par(mfrow = c(2, 3))
plot(gam1, se=TRUE, col ="#1c9099")
Expend looks to be non-linear, while the other variables appear fairly
linear.
preds=predict(gam1, College.test)
error=mean((College.test$Outstate - preds)^2)
error
## [1] 3349290
tss=mean((College.test$Outstate - mean(College.test$Outstate))^2)
rss=1-error/tss
rss
## [1] 0.7660016
The R-squared is 0.766 using a 6 variable model.
summary(gam1)
##
## Call: gam(formula = Outstate ~ Private + s(Room.Board, 2) + s(PhD,
## 2) + s(perc.alumni, 2) + s(Expend, 5) + s(Grad.Rate, 2),
## data = College.train)
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -7402.89 -1114.45 -12.67 1282.69 7470.60
##
## (Dispersion Parameter for gaussian family taken to be 3711182)
##
## Null Deviance: 6989966760 on 387 degrees of freedom
## Residual Deviance: 1384271126 on 373 degrees of freedom
## AIC: 6987.021
##
## Number of Local Scoring Iterations: NA
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## Private 1 1778718277 1778718277 479.286 < 2.2e-16 ***
## s(Room.Board, 2) 1 1577115244 1577115244 424.963 < 2.2e-16 ***
## s(PhD, 2) 1 322431195 322431195 86.881 < 2.2e-16 ***
## s(perc.alumni, 2) 1 336869281 336869281 90.771 < 2.2e-16 ***
## s(Expend, 5) 1 530538753 530538753 142.957 < 2.2e-16 ***
## s(Grad.Rate, 2) 1 86504998 86504998 23.309 2.016e-06 ***
## Residuals 373 1384271126 3711182
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Anova for Nonparametric Effects
## Npar Df Npar F Pr(F)
## (Intercept)
## Private
## s(Room.Board, 2) 1 1.9157 0.1672
## s(PhD, 2) 1 0.9699 0.3253
## s(perc.alumni, 2) 1 0.1859 0.6666
## s(Expend, 5) 4 20.5075 2.665e-15 ***
## s(Grad.Rate, 2) 1 0.5702 0.4506
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The Expend variable is the only one that shows significance of a non-linear relationship and we can also see that in the plot of the GAM fit in 10(b).