Non linear models

Author : Arvind Saini

Attaching libraries and data

cat("\014")

library(ISLR)
attach(Wage)
head(Wage)

##        year age     sex           maritl     race       education
## 231655 2006  18 1. Male 1. Never Married 1. White    1. < HS Grad
## 86582  2004  24 1. Male 1. Never Married 1. White 4. College Grad
## 161300 2003  45 1. Male       2. Married 1. White 3. Some College
## 155159 2003  43 1. Male       2. Married 3. Asian 4. College Grad
## 11443  2005  50 1. Male      4. Divorced 1. White      2. HS Grad
## 376662 2008  54 1. Male       2. Married 1. White 4. College Grad
##                    region       jobclass         health health_ins
## 231655 2. Middle Atlantic  1. Industrial      1. <=Good      2. No
## 86582  2. Middle Atlantic 2. Information 2. >=Very Good      2. No
## 161300 2. Middle Atlantic  1. Industrial      1. <=Good     1. Yes
## 155159 2. Middle Atlantic 2. Information 2. >=Very Good     1. Yes
## 11443  2. Middle Atlantic 2. Information      1. <=Good     1. Yes
## 376662 2. Middle Atlantic 2. Information 2. >=Very Good     1. Yes
##         logwage      wage
## 231655 4.318063  75.04315
## 86582  4.255273  70.47602
## 161300 4.875061 130.98218
## 155159 5.041393 154.68529
## 11443  4.318063  75.04315
## 376662 4.845098 127.11574

?Wage  #information about data

## starting httpd help server ... done

Polynomial Models

Making model

fit=lm(wage~poly(age,4),data=4)
summary(fit)

## 
## Call:
## lm(formula = wage ~ poly(age, 4), data = 4)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -98.707 -24.626  -4.993  15.217 203.693 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    111.7036     0.7287 153.283  < 2e-16 ***
## poly(age, 4)1  447.0679    39.9148  11.201  < 2e-16 ***
## poly(age, 4)2 -478.3158    39.9148 -11.983  < 2e-16 ***
## poly(age, 4)3  125.5217    39.9148   3.145  0.00168 ** 
## poly(age, 4)4  -77.9112    39.9148  -1.952  0.05104 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 39.91 on 2995 degrees of freedom
## Multiple R-squared:  0.08626,    Adjusted R-squared:  0.08504 
## F-statistic: 70.69 on 4 and 2995 DF,  p-value: < 2.2e-16

Plotting the fit with statndard deviation

agelimits=range(age)
age.grid = seq(from=agelimits[1],to=agelimits[2])
preds = predict(fit,list(age=age.grid),se=T)
se.bands=cbind(preds$fit+2*preds$se,preds$fit-2*preds$se)
plot(age,wage,xlab="Age",ylab="Wage",col="darkgrey")
lines(age.grid,preds$fit,lwd=2,col="red")
matlines(age.grid,se.bands,col="blue",lty=2)

Alternative way of making polynomial model but it is not orthonormal there may exist interactions between the preictors

fita=lm(wage~age+I(age^2)+I(age^3)+I(age^4),data=Wage)
summary(fita)

## 
## Call:
## lm(formula = wage ~ age + I(age^2) + I(age^3) + I(age^4), data = Wage)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -98.707 -24.626  -4.993  15.217 203.693 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.842e+02  6.004e+01  -3.067 0.002180 ** 
## age          2.125e+01  5.887e+00   3.609 0.000312 ***
## I(age^2)    -5.639e-01  2.061e-01  -2.736 0.006261 ** 
## I(age^3)     6.811e-03  3.066e-03   2.221 0.026398 *  
## I(age^4)    -3.204e-05  1.641e-05  -1.952 0.051039 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 39.91 on 2995 degrees of freedom
## Multiple R-squared:  0.08626,    Adjusted R-squared:  0.08504 
## F-statistic: 70.69 on 4 and 2995 DF,  p-value: < 2.2e-16

Altough there may exist interactions in the second case but the fit is same for both the first one and the second one

plot(fitted(fit),fitted(fita),xlab="Using poly",ylab="Using I")

Now testing the different polynomial fits using the anova function

attach(Wage)

## The following objects are masked from Wage (pos = 3):
## 
##     age, education, health, health_ins, jobclass, logwage, maritl,
##     race, region, sex, wage, year

fita=lm(wage~education)
fitb=lm(wage~education+age)
fitc=lm(wage~education+poly(age,2))
fitd=lm(wage~education+poly(age,3))
anova(fita,fitb,fitc,fitd)

## Analysis of Variance Table
## 
## Model 1: wage ~ education
## Model 2: wage ~ education + age
## Model 3: wage ~ education + poly(age, 2)
## Model 4: wage ~ education + poly(age, 3)
##   Res.Df     RSS Df Sum of Sq        F Pr(>F)    
## 1   2995 3995721                                 
## 2   2994 3867992  1    127729 102.7378 <2e-16 ***
## 3   2993 3725395  1    142597 114.6969 <2e-16 ***
## 4   2992 3719809  1      5587   4.4936 0.0341 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Polynomial Logistic Regression

fit1=glm(I(wage>250)~poly(age,3),data=Wage,family="binomial")
summary(fit1)

## 
## Call:
## glm(formula = I(wage > 250) ~ poly(age, 3), family = "binomial", 
##     data = Wage)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.2808  -0.2736  -0.2487  -0.1758   3.2868  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    -3.8486     0.1597 -24.100  < 2e-16 ***
## poly(age, 3)1  37.8846    11.4818   3.300 0.000968 ***
## poly(age, 3)2 -29.5129    10.5626  -2.794 0.005205 ** 
## poly(age, 3)3   9.7966     8.9990   1.089 0.276317    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 730.53  on 2999  degrees of freedom
## Residual deviance: 707.92  on 2996  degrees of freedom
## AIC: 715.92
## 
## Number of Fisher Scoring iterations: 8

preds=predict(fit1,list(age=age.grid),se=T)
se.bands=cbind(preds$fit+2*preds$se,preds$fit-2*preds$se)

Stanadard errrors are calculated on the logit sacle so we have to convert it to the probability scale

prob.bands=exp(se.bands)/(1+exp(se.bands))
(prob.bands)

##          [,1]         [,2]
## 1  0.01025495 2.123527e-05
## 2  0.01053092 4.080363e-05
## 3  0.01087189 7.538541e-05
## 4  0.01128089 1.340508e-04
## 5  0.01176205 2.296578e-04
## 6  0.01232072 3.794433e-04
## 7  0.01296358 6.051691e-04
## 8  0.01369892 9.325343e-04
## 9  0.01453688 1.389597e-03
## 10 0.01548977 2.004061e-03
## 11 0.01657237 2.799516e-03
## 12 0.01780191 3.791045e-03
## 13 0.01919759 4.981028e-03
## 14 0.02077887 6.356352e-03
## 15 0.02256200 7.888430e-03
## 16 0.02455446 9.536941e-03
## 17 0.02674822 1.125685e-02
## 18 0.02911418 1.300640e-02
## 19 0.03160081 1.475274e-02
## 20 0.03413839 1.647341e-02
## 21 0.03664753 1.815394e-02
## 22 0.03904921 1.978390e-02
## 23 0.04127375 2.135309e-02
## 24 0.04326739 2.284874e-02
## 25 0.04499620 2.425397e-02
## 26 0.04644745 2.554708e-02
## 27 0.04762882 2.670185e-02
## 28 0.04856550 2.768871e-02
## 29 0.04929546 2.847717e-02
## 30 0.04986301 2.903957e-02
## 31 0.05031144 2.935578e-02
## 32 0.05067603 2.941775e-02
## 33 0.05097941 2.923217e-02
## 34 0.05123050 2.882008e-02
## 35 0.05142720 2.821339e-02
## 36 0.05156117 2.744974e-02
## 37 0.05162331 2.656740e-02
## 38 0.05160841 2.560129e-02
## 39 0.05151870 2.458043e-02
## 40 0.05136645 2.352673e-02
## 41 0.05117575 2.245466e-02
## 42 0.05098377 2.137157e-02
## 43 0.05084164 2.027851e-02
## 44 0.05081493 1.917160e-02
## 45 0.05098387 1.804387e-02
## 46 0.05144340 1.688767e-02
## 47 0.05230376 1.569729e-02
## 48 0.05369223 1.447147e-02
## 49 0.05575745 1.321509e-02
## 50 0.05867738 1.193960e-02
## 51 0.06267233 1.066211e-02
## 52 0.06802392 9.403405e-03
## 53 0.07510143 8.185544e-03
## 54 0.08439700 7.029476e-03
## 55 0.09657131 5.953179e-03
## 56 0.11251039 4.970419e-03
## 57 0.13339031 4.090170e-03
## 58 0.16073644 3.316579e-03
## 59 0.19644305 2.649358e-03
## 60 0.24268164 2.084462e-03
## 61 0.30157692 1.614919e-03
## 62 0.37450565 1.231711e-03
## 63 0.46098178 9.246186e-04

matplot(age.grid,prob.bands,col="blue",lwd=c(2,1,1),lty=c(1,2,2),type="l",ylim=c(0,0.1),xlab="Age",ylab="Probability")
points(jitter(age),I(wage>250)/10,pch="l",cex=0.5)