require(faraway)
## Loading required package: faraway
## Warning: package 'faraway' was built under R version 3.3.3
head(airpass)
##   pass     year
## 1  112 49.08333
## 2  118 49.16667
## 3  132 49.25000
## 4  129 49.33333
## 5  121 49.41667
## 6  135 49.50000
plot(pass~year, airpass, type = "l")

mod <- lm(pass~year, airpass)
summary(mod)
## 
## Call:
## lm(formula = pass ~ year, data = airpass)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -93.858 -30.727  -5.757  24.489 164.999 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1474.771     61.106  -24.14   <2e-16 ***
## year           31.886      1.108   28.78   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 46.06 on 142 degrees of freedom
## Multiple R-squared:  0.8536, Adjusted R-squared:  0.8526 
## F-statistic: 828.2 on 1 and 142 DF,  p-value: < 2.2e-16
plot(mod)

quadMod <- lm(pass~year + I(year^2), airpass)
summary(quadMod) #if a higher order polynomial is significant, KEEP the lower order polynomials, EVEN if they aren't significant
## 
## Call:
## lm(formula = pass ~ year + I(year^2), data = airpass)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -100.353  -27.339   -7.442   21.603  146.116 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)   
## (Intercept) 1570.5174  1053.9017   1.490  0.13841   
## year         -79.2078    38.4007  -2.063  0.04098 * 
## I(year^2)      1.0092     0.3487   2.894  0.00441 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 44.91 on 141 degrees of freedom
## Multiple R-squared:  0.8618, Adjusted R-squared:  0.8599 
## F-statistic: 439.8 on 2 and 141 DF,  p-value: < 2.2e-16
plot(quadMod)

Temperature in Ann Arbor, MI

data(aatemp)
plot(temp~year, aatemp) #honesty looks like no trend

plot(temp~year, aatemp, type = "l") #if anything, maybe a 4th order model?? because it kind of goes down, up, down, up???

tempMod <- lm(temp~year, aatemp)
summary(tempMod) #this is significant
## 
## Call:
## lm(formula = temp ~ year, data = aatemp)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.9843 -0.9113 -0.0820  0.9946  3.5343 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)   
## (Intercept) 24.005510   7.310781   3.284  0.00136 **
## year         0.012237   0.003768   3.247  0.00153 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.466 on 113 degrees of freedom
## Multiple R-squared:  0.08536,    Adjusted R-squared:  0.07727 
## F-statistic: 10.55 on 1 and 113 DF,  p-value: 0.001533
plot(tempMod) #not good plots, though

quadTempMod <- lm(temp~year + I(year^2), aatemp)
summary(quadTempMod) #nope
## 
## Call:
## lm(formula = temp ~ year + I(year^2), data = aatemp)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.0412 -0.9538 -0.0624  0.9959  3.5820 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.127e+02  3.837e+02  -0.554    0.580
## year         2.567e-01  3.962e-01   0.648    0.518
## I(year^2)   -6.307e-05  1.022e-04  -0.617    0.539
## 
## Residual standard error: 1.47 on 112 degrees of freedom
## Multiple R-squared:  0.08846,    Adjusted R-squared:  0.07218 
## F-statistic: 5.434 on 2 and 112 DF,  p-value: 0.005591
cubeTempMod <- lm(temp~year + I(year^2) + I(year^3), aatemp)
summary(cubeTempMod) #YES
## 
## Call:
## lm(formula = temp ~ year + I(year^2) + I(year^3), data = aatemp)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.8557 -0.9646 -0.1552  1.0485  4.1538 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  3.959e+04  1.734e+04   2.283   0.0243 *
## year        -6.159e+01  2.694e+01  -2.286   0.0241 *
## I(year^2)    3.197e-02  1.395e-02   2.291   0.0238 *
## I(year^3)   -5.527e-06  2.407e-06  -2.296   0.0236 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.443 on 111 degrees of freedom
## Multiple R-squared:  0.1298, Adjusted R-squared:  0.1063 
## F-statistic: 5.518 on 3 and 111 DF,  p-value: 0.001436
plot(cubeTempMod) #and the plots are sound

fourthTempMod <- lm(temp~year + I(year^2) + I(year^3) + I(year^4), aatemp)
summary(fourthTempMod) #nope
## 
## Call:
## lm(formula = temp ~ year + I(year^2) + I(year^3) + I(year^4), 
##     data = aatemp)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.0085 -0.9618 -0.0913  0.9926  3.7370 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  1.497e+06  8.553e+05   1.750   0.0829 .
## year        -3.086e+03  1.775e+03  -1.739   0.0849 .
## I(year^2)    2.385e+00  1.381e+00   1.727   0.0869 .
## I(year^3)   -8.189e-04  4.773e-04  -1.716   0.0890 .
## I(year^4)    1.054e-07  6.186e-08   1.704   0.0912 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.431 on 110 degrees of freedom
## Multiple R-squared:  0.1522, Adjusted R-squared:  0.1213 
## F-statistic: 4.936 on 4 and 110 DF,  p-value: 0.001068
coef(cubeTempMod) %*% c(1, 2002, 2002^2, 2002^3) #47.5 for 2002
##          [,1]
## [1,] 47.50484
plot(temp~year, aatemp, type = "l")
lines(cubeTempMod$fitted.values ~ year, aatemp, col = "red")

Which model is the best for using year to predict births in the United States?

data("divusa")
head(divusa)
##   year divorce unemployed femlab marriage birth military
## 1 1920     8.0        5.2  22.70     92.0 117.9   3.2247
## 2 1921     7.2       11.7  22.79     83.0 119.8   3.5614
## 3 1922     6.6        6.7  22.88     79.7 111.2   2.4553
## 4 1923     7.1        2.4  22.97     85.2 110.5   2.2065
## 5 1924     7.2        5.0  23.06     80.3 110.9   2.2889
## 6 1925     7.2        3.2  23.15     79.2 106.6   2.1735
plot(birth~year, divusa, type = "l")

birthMod <- lm(birth~year, divusa)
summary(birthMod)
## 
## Call:
## lm(formula = birth ~ year, data = divusa)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -25.159 -11.129  -4.086  11.232  33.509 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1073.45274  161.22254   6.658 4.03e-09 ***
## year          -0.50284    0.08234  -6.107 4.14e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 16.06 on 75 degrees of freedom
## Multiple R-squared:  0.3321, Adjusted R-squared:  0.3232 
## F-statistic:  37.3 on 1 and 75 DF,  p-value: 4.143e-08
plot(birthMod) #significant model but really obviously is not linear data

quadBirthMod <- lm(birth~year + I(year^2), divusa)
summary(quadBirthMod)
## 
## Call:
## lm(formula = birth ~ year + I(year^2), data = divusa)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -24.2952 -11.8939  -0.7406  12.5154  26.3901 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -5.428e+04  1.464e+04  -3.709 0.000400 ***
## year         5.604e+01  1.495e+01   3.748 0.000351 ***
## I(year^2)   -1.444e-02  3.818e-03  -3.782 0.000313 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14.8 on 74 degrees of freedom
## Multiple R-squared:  0.4403, Adjusted R-squared:  0.4252 
## F-statistic: 29.11 on 2 and 74 DF,  p-value: 4.718e-10
plot(quadBirthMod) #still not meeting assumptions of polynomial regression?

cubeBirthMod <- lm(birth~year + I(year^2) + I(year^3), divusa)
summary(cubeBirthMod) #not significant
## 
## Call:
## lm(formula = birth ~ year + I(year^2) + I(year^3), data = divusa)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -22.382 -12.208  -1.928  12.042  26.581 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)
## (Intercept)  1.557e+06  1.467e+06   1.062    0.292
## year        -2.414e+03  2.248e+03  -1.074    0.286
## I(year^2)    1.247e+00  1.148e+00   1.086    0.281
## I(year^3)   -2.148e-04  1.954e-04  -1.099    0.275
## 
## Residual standard error: 14.78 on 73 degrees of freedom
## Multiple R-squared:  0.4494, Adjusted R-squared:  0.4268 
## F-statistic: 19.86 on 3 and 73 DF,  p-value: 1.625e-09
fourthBirthMod <- lm(birth~poly(year, 4), divusa)
summary(fourthBirthMod) #significant
## 
## Call:
## lm(formula = birth ~ poly(year, 4), data = divusa)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -17.505  -7.543   1.208   7.846  13.306 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     88.8883     0.9585  92.738  < 2e-16 ***
## poly(year, 4)1 -98.0709     8.4107 -11.660  < 2e-16 ***
## poly(year, 4)2 -55.9709     8.4107  -6.655 4.76e-09 ***
## poly(year, 4)3 -16.2403     8.4107  -1.931   0.0574 .  
## poly(year, 4)4 104.1628     8.4107  12.385  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.411 on 72 degrees of freedom
## Multiple R-squared:  0.8241, Adjusted R-squared:  0.8143 
## F-statistic: 84.34 on 4 and 72 DF,  p-value: < 2.2e-16
plot(fourthBirthMod) #bad plots, though

##skipping ahead to the truth

eighthMod <- lm(birth~poly(year, 8), divusa)
plot(eighthMod)

plot(birth~year, divusa, type = "l")
lines(eighthMod$fitted.values ~ year, divusa, col = "red")

## how to predict

newdat <- data.frame(year = 2000)
predict(eighthMod, newdata = newdat) #point
##        1 
## 89.05121
predict(eighthMod, newdata = newdat, interval = "confidence")
##        fit     lwr      upr
## 1 89.05121 57.1107 120.9917
predict(eighthMod, newdata = newdat, interval = "prediction")
##        fit      lwr      upr
## 1 89.05121 56.49719 121.6052

How to check for autocorrelation

library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
dwtest(eighthMod, alternative = "two.sided")
## 
##  Durbin-Watson test
## 
## data:  eighthMod
## DW = 1.1621, p-value = 8.211e-07
## alternative hypothesis: true autocorrelation is not 0
dwtest(cubeTempMod) #sig p
## 
##  Durbin-Watson test
## 
## data:  cubeTempMod
## DW = 1.7171, p-value = 0.03464
## alternative hypothesis: true autocorrelation is greater than 0
dwtest(cubeTempMod, alternative = "two.sided")
## 
##  Durbin-Watson test
## 
## data:  cubeTempMod
## DW = 1.7171, p-value = 0.06928
## alternative hypothesis: true autocorrelation is not 0