Learning Log 15

require(faraway)

## Loading required package: faraway

## Warning: package 'faraway' was built under R version 3.3.3

head(airpass)

##   pass     year
## 1  112 49.08333
## 2  118 49.16667
## 3  132 49.25000
## 4  129 49.33333
## 5  121 49.41667
## 6  135 49.50000

plot(pass~year, airpass, type = "l")

mod <- lm(pass~year, airpass)
summary(mod)

## 
## Call:
## lm(formula = pass ~ year, data = airpass)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -93.858 -30.727  -5.757  24.489 164.999 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1474.771     61.106  -24.14   <2e-16 ***
## year           31.886      1.108   28.78   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 46.06 on 142 degrees of freedom
## Multiple R-squared:  0.8536, Adjusted R-squared:  0.8526 
## F-statistic: 828.2 on 1 and 142 DF,  p-value: < 2.2e-16

plot(mod)

quadMod <- lm(pass~year + I(year^2), airpass)
summary(quadMod) #if a higher order polynomial is significant, KEEP the lower order polynomials, EVEN if they aren't significant

## 
## Call:
## lm(formula = pass ~ year + I(year^2), data = airpass)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -100.353  -27.339   -7.442   21.603  146.116 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)   
## (Intercept) 1570.5174  1053.9017   1.490  0.13841   
## year         -79.2078    38.4007  -2.063  0.04098 * 
## I(year^2)      1.0092     0.3487   2.894  0.00441 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 44.91 on 141 degrees of freedom
## Multiple R-squared:  0.8618, Adjusted R-squared:  0.8599 
## F-statistic: 439.8 on 2 and 141 DF,  p-value: < 2.2e-16

plot(quadMod)

Temperature in Ann Arbor, MI

data(aatemp)
plot(temp~year, aatemp) #honesty looks like no trend

plot(temp~year, aatemp, type = "l") #if anything, maybe a 4th order model?? because it kind of goes down, up, down, up???

tempMod <- lm(temp~year, aatemp)
summary(tempMod) #this is significant

## 
## Call:
## lm(formula = temp ~ year, data = aatemp)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.9843 -0.9113 -0.0820  0.9946  3.5343 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)   
## (Intercept) 24.005510   7.310781   3.284  0.00136 **
## year         0.012237   0.003768   3.247  0.00153 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.466 on 113 degrees of freedom
## Multiple R-squared:  0.08536,    Adjusted R-squared:  0.07727 
## F-statistic: 10.55 on 1 and 113 DF,  p-value: 0.001533

plot(tempMod) #not good plots, though

quadTempMod <- lm(temp~year + I(year^2), aatemp)
summary(quadTempMod) #nope

## 
## Call:
## lm(formula = temp ~ year + I(year^2), data = aatemp)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.0412 -0.9538 -0.0624  0.9959  3.5820 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.127e+02  3.837e+02  -0.554    0.580
## year         2.567e-01  3.962e-01   0.648    0.518
## I(year^2)   -6.307e-05  1.022e-04  -0.617    0.539
## 
## Residual standard error: 1.47 on 112 degrees of freedom
## Multiple R-squared:  0.08846,    Adjusted R-squared:  0.07218 
## F-statistic: 5.434 on 2 and 112 DF,  p-value: 0.005591

cubeTempMod <- lm(temp~year + I(year^2) + I(year^3), aatemp)
summary(cubeTempMod) #YES

## 
## Call:
## lm(formula = temp ~ year + I(year^2) + I(year^3), data = aatemp)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.8557 -0.9646 -0.1552  1.0485  4.1538 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  3.959e+04  1.734e+04   2.283   0.0243 *
## year        -6.159e+01  2.694e+01  -2.286   0.0241 *
## I(year^2)    3.197e-02  1.395e-02   2.291   0.0238 *
## I(year^3)   -5.527e-06  2.407e-06  -2.296   0.0236 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.443 on 111 degrees of freedom
## Multiple R-squared:  0.1298, Adjusted R-squared:  0.1063 
## F-statistic: 5.518 on 3 and 111 DF,  p-value: 0.001436

plot(cubeTempMod) #and the plots are sound

fourthTempMod <- lm(temp~year + I(year^2) + I(year^3) + I(year^4), aatemp)
summary(fourthTempMod) #nope

## 
## Call:
## lm(formula = temp ~ year + I(year^2) + I(year^3) + I(year^4), 
##     data = aatemp)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.0085 -0.9618 -0.0913  0.9926  3.7370 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  1.497e+06  8.553e+05   1.750   0.0829 .
## year        -3.086e+03  1.775e+03  -1.739   0.0849 .
## I(year^2)    2.385e+00  1.381e+00   1.727   0.0869 .
## I(year^3)   -8.189e-04  4.773e-04  -1.716   0.0890 .
## I(year^4)    1.054e-07  6.186e-08   1.704   0.0912 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.431 on 110 degrees of freedom
## Multiple R-squared:  0.1522, Adjusted R-squared:  0.1213 
## F-statistic: 4.936 on 4 and 110 DF,  p-value: 0.001068

coef(cubeTempMod) %*% c(1, 2002, 2002^2, 2002^3) #47.5 for 2002

##          [,1]
## [1,] 47.50484

plot(temp~year, aatemp, type = "l")
lines(cubeTempMod$fitted.values ~ year, aatemp, col = "red")

Which model is the best for using year to predict births in the United States?

data("divusa")
head(divusa)

##   year divorce unemployed femlab marriage birth military
## 1 1920     8.0        5.2  22.70     92.0 117.9   3.2247
## 2 1921     7.2       11.7  22.79     83.0 119.8   3.5614
## 3 1922     6.6        6.7  22.88     79.7 111.2   2.4553
## 4 1923     7.1        2.4  22.97     85.2 110.5   2.2065
## 5 1924     7.2        5.0  23.06     80.3 110.9   2.2889
## 6 1925     7.2        3.2  23.15     79.2 106.6   2.1735

plot(birth~year, divusa, type = "l")

birthMod <- lm(birth~year, divusa)
summary(birthMod)

## 
## Call:
## lm(formula = birth ~ year, data = divusa)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -25.159 -11.129  -4.086  11.232  33.509 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1073.45274  161.22254   6.658 4.03e-09 ***
## year          -0.50284    0.08234  -6.107 4.14e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 16.06 on 75 degrees of freedom
## Multiple R-squared:  0.3321, Adjusted R-squared:  0.3232 
## F-statistic:  37.3 on 1 and 75 DF,  p-value: 4.143e-08

plot(birthMod) #significant model but really obviously is not linear data

quadBirthMod <- lm(birth~year + I(year^2), divusa)
summary(quadBirthMod)

## 
## Call:
## lm(formula = birth ~ year + I(year^2), data = divusa)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -24.2952 -11.8939  -0.7406  12.5154  26.3901 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -5.428e+04  1.464e+04  -3.709 0.000400 ***
## year         5.604e+01  1.495e+01   3.748 0.000351 ***
## I(year^2)   -1.444e-02  3.818e-03  -3.782 0.000313 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14.8 on 74 degrees of freedom
## Multiple R-squared:  0.4403, Adjusted R-squared:  0.4252 
## F-statistic: 29.11 on 2 and 74 DF,  p-value: 4.718e-10

plot(quadBirthMod) #still not meeting assumptions of polynomial regression?

cubeBirthMod <- lm(birth~year + I(year^2) + I(year^3), divusa)
summary(cubeBirthMod) #not significant

## 
## Call:
## lm(formula = birth ~ year + I(year^2) + I(year^3), data = divusa)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -22.382 -12.208  -1.928  12.042  26.581 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)
## (Intercept)  1.557e+06  1.467e+06   1.062    0.292
## year        -2.414e+03  2.248e+03  -1.074    0.286
## I(year^2)    1.247e+00  1.148e+00   1.086    0.281
## I(year^3)   -2.148e-04  1.954e-04  -1.099    0.275
## 
## Residual standard error: 14.78 on 73 degrees of freedom
## Multiple R-squared:  0.4494, Adjusted R-squared:  0.4268 
## F-statistic: 19.86 on 3 and 73 DF,  p-value: 1.625e-09

fourthBirthMod <- lm(birth~poly(year, 4), divusa)
summary(fourthBirthMod) #significant

## 
## Call:
## lm(formula = birth ~ poly(year, 4), data = divusa)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -17.505  -7.543   1.208   7.846  13.306 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     88.8883     0.9585  92.738  < 2e-16 ***
## poly(year, 4)1 -98.0709     8.4107 -11.660  < 2e-16 ***
## poly(year, 4)2 -55.9709     8.4107  -6.655 4.76e-09 ***
## poly(year, 4)3 -16.2403     8.4107  -1.931   0.0574 .  
## poly(year, 4)4 104.1628     8.4107  12.385  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.411 on 72 degrees of freedom
## Multiple R-squared:  0.8241, Adjusted R-squared:  0.8143 
## F-statistic: 84.34 on 4 and 72 DF,  p-value: < 2.2e-16

plot(fourthBirthMod) #bad plots, though

##skipping ahead to the truth

eighthMod <- lm(birth~poly(year, 8), divusa)
plot(eighthMod)

plot(birth~year, divusa, type = "l")
lines(eighthMod$fitted.values ~ year, divusa, col = "red")

## how to predict

newdat <- data.frame(year = 2000)
predict(eighthMod, newdata = newdat) #point

##        1 
## 89.05121

predict(eighthMod, newdata = newdat, interval = "confidence")

##        fit     lwr      upr
## 1 89.05121 57.1107 120.9917

predict(eighthMod, newdata = newdat, interval = "prediction")

##        fit      lwr      upr
## 1 89.05121 56.49719 121.6052

How to check for autocorrelation

library(lmtest)

## Loading required package: zoo

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

dwtest(eighthMod, alternative = "two.sided")

## 
##  Durbin-Watson test
## 
## data:  eighthMod
## DW = 1.1621, p-value = 8.211e-07
## alternative hypothesis: true autocorrelation is not 0

dwtest(cubeTempMod) #sig p

## 
##  Durbin-Watson test
## 
## data:  cubeTempMod
## DW = 1.7171, p-value = 0.03464
## alternative hypothesis: true autocorrelation is greater than 0

dwtest(cubeTempMod, alternative = "two.sided")

## 
##  Durbin-Watson test
## 
## data:  cubeTempMod
## DW = 1.7171, p-value = 0.06928
## alternative hypothesis: true autocorrelation is not 0

Learning Log 15

Jen Kochaver

April 3, 2018

Temperature in Ann Arbor, MI

Which model is the best for using year to predict births in the United States?

How to check for autocorrelation