cat("\014")
library(ISLR)
attach(Wage)
head(Wage)
## year age sex maritl race education
## 231655 2006 18 1. Male 1. Never Married 1. White 1. < HS Grad
## 86582 2004 24 1. Male 1. Never Married 1. White 4. College Grad
## 161300 2003 45 1. Male 2. Married 1. White 3. Some College
## 155159 2003 43 1. Male 2. Married 3. Asian 4. College Grad
## 11443 2005 50 1. Male 4. Divorced 1. White 2. HS Grad
## 376662 2008 54 1. Male 2. Married 1. White 4. College Grad
## region jobclass health health_ins
## 231655 2. Middle Atlantic 1. Industrial 1. <=Good 2. No
## 86582 2. Middle Atlantic 2. Information 2. >=Very Good 2. No
## 161300 2. Middle Atlantic 1. Industrial 1. <=Good 1. Yes
## 155159 2. Middle Atlantic 2. Information 2. >=Very Good 1. Yes
## 11443 2. Middle Atlantic 2. Information 1. <=Good 1. Yes
## 376662 2. Middle Atlantic 2. Information 2. >=Very Good 1. Yes
## logwage wage
## 231655 4.318063 75.04315
## 86582 4.255273 70.47602
## 161300 4.875061 130.98218
## 155159 5.041393 154.68529
## 11443 4.318063 75.04315
## 376662 4.845098 127.11574
?Wage #information about data
## starting httpd help server ... done
Making model
fit=lm(wage~poly(age,4),data=4)
summary(fit)
##
## Call:
## lm(formula = wage ~ poly(age, 4), data = 4)
##
## Residuals:
## Min 1Q Median 3Q Max
## -98.707 -24.626 -4.993 15.217 203.693
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 111.7036 0.7287 153.283 < 2e-16 ***
## poly(age, 4)1 447.0679 39.9148 11.201 < 2e-16 ***
## poly(age, 4)2 -478.3158 39.9148 -11.983 < 2e-16 ***
## poly(age, 4)3 125.5217 39.9148 3.145 0.00168 **
## poly(age, 4)4 -77.9112 39.9148 -1.952 0.05104 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 39.91 on 2995 degrees of freedom
## Multiple R-squared: 0.08626, Adjusted R-squared: 0.08504
## F-statistic: 70.69 on 4 and 2995 DF, p-value: < 2.2e-16
Plotting the fit with statndard deviation
agelimits=range(age)
age.grid = seq(from=agelimits[1],to=agelimits[2])
preds = predict(fit,list(age=age.grid),se=T)
se.bands=cbind(preds$fit+2*preds$se,preds$fit-2*preds$se)
plot(age,wage,xlab="Age",ylab="Wage",col="darkgrey")
lines(age.grid,preds$fit,lwd=2,col="red")
matlines(age.grid,se.bands,col="blue",lty=2)
Alternative way of making polynomial model but it is not orthonormal there may exist interactions between the preictors
fita=lm(wage~age+I(age^2)+I(age^3)+I(age^4),data=Wage)
summary(fita)
##
## Call:
## lm(formula = wage ~ age + I(age^2) + I(age^3) + I(age^4), data = Wage)
##
## Residuals:
## Min 1Q Median 3Q Max
## -98.707 -24.626 -4.993 15.217 203.693
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.842e+02 6.004e+01 -3.067 0.002180 **
## age 2.125e+01 5.887e+00 3.609 0.000312 ***
## I(age^2) -5.639e-01 2.061e-01 -2.736 0.006261 **
## I(age^3) 6.811e-03 3.066e-03 2.221 0.026398 *
## I(age^4) -3.204e-05 1.641e-05 -1.952 0.051039 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 39.91 on 2995 degrees of freedom
## Multiple R-squared: 0.08626, Adjusted R-squared: 0.08504
## F-statistic: 70.69 on 4 and 2995 DF, p-value: < 2.2e-16
Altough there may exist interactions in the second case but the fit is same for both the first one and the second one
plot(fitted(fit),fitted(fita),xlab="Using poly",ylab="Using I")
Now testing the different polynomial fits using the anova function
attach(Wage)
## The following objects are masked from Wage (pos = 3):
##
## age, education, health, health_ins, jobclass, logwage, maritl,
## race, region, sex, wage, year
fita=lm(wage~education)
fitb=lm(wage~education+age)
fitc=lm(wage~education+poly(age,2))
fitd=lm(wage~education+poly(age,3))
anova(fita,fitb,fitc,fitd)
## Analysis of Variance Table
##
## Model 1: wage ~ education
## Model 2: wage ~ education + age
## Model 3: wage ~ education + poly(age, 2)
## Model 4: wage ~ education + poly(age, 3)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 2995 3995721
## 2 2994 3867992 1 127729 102.7378 <2e-16 ***
## 3 2993 3725395 1 142597 114.6969 <2e-16 ***
## 4 2992 3719809 1 5587 4.4936 0.0341 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
fit1=glm(I(wage>250)~poly(age,3),data=Wage,family="binomial")
summary(fit1)
##
## Call:
## glm(formula = I(wage > 250) ~ poly(age, 3), family = "binomial",
## data = Wage)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.2808 -0.2736 -0.2487 -0.1758 3.2868
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.8486 0.1597 -24.100 < 2e-16 ***
## poly(age, 3)1 37.8846 11.4818 3.300 0.000968 ***
## poly(age, 3)2 -29.5129 10.5626 -2.794 0.005205 **
## poly(age, 3)3 9.7966 8.9990 1.089 0.276317
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 730.53 on 2999 degrees of freedom
## Residual deviance: 707.92 on 2996 degrees of freedom
## AIC: 715.92
##
## Number of Fisher Scoring iterations: 8
preds=predict(fit1,list(age=age.grid),se=T)
se.bands=cbind(preds$fit+2*preds$se,preds$fit-2*preds$se)
Stanadard errrors are calculated on the logit sacle so we have to convert it to the probability scale
prob.bands=exp(se.bands)/(1+exp(se.bands))
(prob.bands)
## [,1] [,2]
## 1 0.01025495 2.123527e-05
## 2 0.01053092 4.080363e-05
## 3 0.01087189 7.538541e-05
## 4 0.01128089 1.340508e-04
## 5 0.01176205 2.296578e-04
## 6 0.01232072 3.794433e-04
## 7 0.01296358 6.051691e-04
## 8 0.01369892 9.325343e-04
## 9 0.01453688 1.389597e-03
## 10 0.01548977 2.004061e-03
## 11 0.01657237 2.799516e-03
## 12 0.01780191 3.791045e-03
## 13 0.01919759 4.981028e-03
## 14 0.02077887 6.356352e-03
## 15 0.02256200 7.888430e-03
## 16 0.02455446 9.536941e-03
## 17 0.02674822 1.125685e-02
## 18 0.02911418 1.300640e-02
## 19 0.03160081 1.475274e-02
## 20 0.03413839 1.647341e-02
## 21 0.03664753 1.815394e-02
## 22 0.03904921 1.978390e-02
## 23 0.04127375 2.135309e-02
## 24 0.04326739 2.284874e-02
## 25 0.04499620 2.425397e-02
## 26 0.04644745 2.554708e-02
## 27 0.04762882 2.670185e-02
## 28 0.04856550 2.768871e-02
## 29 0.04929546 2.847717e-02
## 30 0.04986301 2.903957e-02
## 31 0.05031144 2.935578e-02
## 32 0.05067603 2.941775e-02
## 33 0.05097941 2.923217e-02
## 34 0.05123050 2.882008e-02
## 35 0.05142720 2.821339e-02
## 36 0.05156117 2.744974e-02
## 37 0.05162331 2.656740e-02
## 38 0.05160841 2.560129e-02
## 39 0.05151870 2.458043e-02
## 40 0.05136645 2.352673e-02
## 41 0.05117575 2.245466e-02
## 42 0.05098377 2.137157e-02
## 43 0.05084164 2.027851e-02
## 44 0.05081493 1.917160e-02
## 45 0.05098387 1.804387e-02
## 46 0.05144340 1.688767e-02
## 47 0.05230376 1.569729e-02
## 48 0.05369223 1.447147e-02
## 49 0.05575745 1.321509e-02
## 50 0.05867738 1.193960e-02
## 51 0.06267233 1.066211e-02
## 52 0.06802392 9.403405e-03
## 53 0.07510143 8.185544e-03
## 54 0.08439700 7.029476e-03
## 55 0.09657131 5.953179e-03
## 56 0.11251039 4.970419e-03
## 57 0.13339031 4.090170e-03
## 58 0.16073644 3.316579e-03
## 59 0.19644305 2.649358e-03
## 60 0.24268164 2.084462e-03
## 61 0.30157692 1.614919e-03
## 62 0.37450565 1.231711e-03
## 63 0.46098178 9.246186e-04
matplot(age.grid,prob.bands,col="blue",lwd=c(2,1,1),lty=c(1,2,2),type="l",ylim=c(0,0.1),xlab="Age",ylab="Probability")
points(jitter(age),I(wage>250)/10,pch="l",cex=0.5)