We will be using the Wage dataset from the ISLR package. We will also need the gam package.
#install.packages("ISLR")
library(ISLR)
attach(Wage)
#install.packages("gam")
library(gam)
# model with natural splines
gam1=lm(wage~ns(year ,4)+ns(age ,5)+education ,data=Wage)
summary(gam1)
##
## Call:
## lm(formula = wage ~ ns(year, 4) + ns(age, 5) + education, data = Wage)
##
## Residuals:
## Min 1Q Median 3Q Max
## -120.513 -19.608 -3.583 14.112 214.535
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 46.949 4.704 9.980 < 2e-16 ***
## ns(year, 4)1 8.625 3.466 2.488 0.01289 *
## ns(year, 4)2 3.762 2.959 1.271 0.20369
## ns(year, 4)3 8.127 4.211 1.930 0.05375 .
## ns(year, 4)4 6.806 2.397 2.840 0.00455 **
## ns(age, 5)1 45.170 4.193 10.771 < 2e-16 ***
## ns(age, 5)2 38.450 5.076 7.575 4.78e-14 ***
## ns(age, 5)3 34.239 4.383 7.813 7.69e-15 ***
## ns(age, 5)4 48.678 10.572 4.605 4.31e-06 ***
## ns(age, 5)5 6.557 8.367 0.784 0.43328
## education2. HS Grad 10.983 2.430 4.520 6.43e-06 ***
## education3. Some College 23.473 2.562 9.163 < 2e-16 ***
## education4. College Grad 38.314 2.547 15.042 < 2e-16 ***
## education5. Advanced Degree 62.554 2.761 22.654 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 35.16 on 2986 degrees of freedom
## Multiple R-squared: 0.293, Adjusted R-squared: 0.2899
## F-statistic: 95.2 on 13 and 2986 DF, p-value: < 2.2e-16
# model with smoothing spline
gam.m3=gam(wage~s(year ,4)+s(age ,5)+education ,data=Wage)
## Warning in model.matrix.default(mt, mf, contrasts): non-list contrasts argument
## ignored
summary(gam.m3)
##
## Call: gam(formula = wage ~ s(year, 4) + s(age, 5) + education, data = Wage)
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -119.43 -19.70 -3.33 14.17 213.48
##
## (Dispersion Parameter for gaussian family taken to be 1235.69)
##
## Null Deviance: 5222086 on 2999 degrees of freedom
## Residual Deviance: 3689770 on 2986 degrees of freedom
## AIC: 29887.75
##
## Number of Local Scoring Iterations: 2
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## s(year, 4) 1 27162 27162 21.981 2.877e-06 ***
## s(age, 5) 1 195338 195338 158.081 < 2.2e-16 ***
## education 4 1069726 267432 216.423 < 2.2e-16 ***
## Residuals 2986 3689770 1236
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Anova for Nonparametric Effects
## Npar Df Npar F Pr(F)
## (Intercept)
## s(year, 4) 3 1.086 0.3537
## s(age, 5) 4 32.380 <2e-16 ***
## education
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# plot recognizes gam object
par(mfrow=c(1,3))
plot(gam.m3, se=TRUE ,col ="blue")
# plot.Gam (note typo in book)
plot.Gam(gam1, se=TRUE ,col ="red")
# compare gam models
gam.m1=gam(wage~s(age ,5)+education ,data=Wage)
## Warning in model.matrix.default(mt, mf, contrasts): non-list contrasts argument
## ignored
gam.m2=gam(wage~year+s(age ,5)+education ,data=Wage)
## Warning in model.matrix.default(mt, mf, contrasts): non-list contrasts argument
## ignored
anova(gam.m1,gam.m2,gam.m3,test="F")
## Analysis of Deviance Table
##
## Model 1: wage ~ s(age, 5) + education
## Model 2: wage ~ year + s(age, 5) + education
## Model 3: wage ~ s(year, 4) + s(age, 5) + education
## Resid. Df Resid. Dev Df Deviance F Pr(>F)
## 1 2990 3711731
## 2 2989 3693842 1 17889.2 14.4771 0.0001447 ***
## 3 2986 3689770 3 4071.1 1.0982 0.3485661
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# summarise model
summary(gam.m3)
##
## Call: gam(formula = wage ~ s(year, 4) + s(age, 5) + education, data = Wage)
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -119.43 -19.70 -3.33 14.17 213.48
##
## (Dispersion Parameter for gaussian family taken to be 1235.69)
##
## Null Deviance: 5222086 on 2999 degrees of freedom
## Residual Deviance: 3689770 on 2986 degrees of freedom
## AIC: 29887.75
##
## Number of Local Scoring Iterations: 2
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## s(year, 4) 1 27162 27162 21.981 2.877e-06 ***
## s(age, 5) 1 195338 195338 158.081 < 2.2e-16 ***
## education 4 1069726 267432 216.423 < 2.2e-16 ***
## Residuals 2986 3689770 1236
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Anova for Nonparametric Effects
## Npar Df Npar F Pr(F)
## (Intercept)
## s(year, 4) 3 1.086 0.3537
## s(age, 5) 4 32.380 <2e-16 ***
## education
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# predictions
preds=predict(gam.m2, newdata=Wage)
head(preds)
## 231655 86582 161300 155159 11443 376662
## 49.98235 99.55449 112.76257 127.50172 101.15355 131.30566
# model with loess with span 0.7
gam.lo=gam(wage~s(year ,df=4)+lo(age ,span =0.7)+education ,
data=Wage)
## Warning in model.matrix.default(mt, mf, contrasts): non-list contrasts argument
## ignored
# again note typo in book
plot.Gam(gam.lo, se=TRUE , col =" green")
# model with loess with span 0.5
# with interaction between year and age
gam.lo.i=gam(wage~lo(year,age,span=0.5)+education,data=Wage)
## Warning in model.matrix.default(mt, mf, contrasts): non-list contrasts argument
## ignored
## Warning in lo.wam(x, z, wz, fit$smooth, which, fit$smooth.frame, bf.maxit, : liv
## too small. (Discovered by lowesd)
## Warning in lo.wam(x, z, wz, fit$smooth, which, fit$smooth.frame, bf.maxit, : lv
## too small. (Discovered by lowesd)
## Warning in lo.wam(x, z, wz, fit$smooth, which, fit$smooth.frame, bf.maxit, : liv
## too small. (Discovered by lowesd)
## Warning in lo.wam(x, z, wz, fit$smooth, which, fit$smooth.frame, bf.maxit, : lv
## too small. (Discovered by lowesd)
# plot surface
#install.packages("akima")
library(akima)
par(mfrow=c(1,1))
plot(gam.lo.i)
# logistic regression for gam
gam.lr=gam(I(wage >250)~year+s(age ,df=5)+education ,
family=binomial ,data=Wage)
## Warning in model.matrix.default(mt, mf, contrasts): non-list contrasts argument
## ignored
par(mfrow=c(1,3))
plot(gam.lr,se=T,col="green ")
table(education ,I(wage >250))
##
## education FALSE TRUE
## 1. < HS Grad 268 0
## 2. HS Grad 966 5
## 3. Some College 643 7
## 4. College Grad 663 22
## 5. Advanced Degree 381 45
# take out less than hs
gam.lr.s=gam(I(wage >250)~year+s(age ,df=5)+education ,family=
binomial ,data=Wage , subset =(education !="1. < HS Grad"))
## Warning in model.matrix.default(mt, mf, contrasts): non-list contrasts argument
## ignored
plot(gam.lr.s,se=T,col="green")