require(faraway)
## Loading required package: faraway
## Warning: package 'faraway' was built under R version 3.3.3
head(airpass)
## pass year
## 1 112 49.08333
## 2 118 49.16667
## 3 132 49.25000
## 4 129 49.33333
## 5 121 49.41667
## 6 135 49.50000
plot(pass~year, airpass, type = "l")

mod <- lm(pass~year, airpass)
summary(mod)
##
## Call:
## lm(formula = pass ~ year, data = airpass)
##
## Residuals:
## Min 1Q Median 3Q Max
## -93.858 -30.727 -5.757 24.489 164.999
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1474.771 61.106 -24.14 <2e-16 ***
## year 31.886 1.108 28.78 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 46.06 on 142 degrees of freedom
## Multiple R-squared: 0.8536, Adjusted R-squared: 0.8526
## F-statistic: 828.2 on 1 and 142 DF, p-value: < 2.2e-16
plot(mod)




quadMod <- lm(pass~year + I(year^2), airpass)
summary(quadMod) #if a higher order polynomial is significant, KEEP the lower order polynomials, EVEN if they aren't significant
##
## Call:
## lm(formula = pass ~ year + I(year^2), data = airpass)
##
## Residuals:
## Min 1Q Median 3Q Max
## -100.353 -27.339 -7.442 21.603 146.116
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1570.5174 1053.9017 1.490 0.13841
## year -79.2078 38.4007 -2.063 0.04098 *
## I(year^2) 1.0092 0.3487 2.894 0.00441 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 44.91 on 141 degrees of freedom
## Multiple R-squared: 0.8618, Adjusted R-squared: 0.8599
## F-statistic: 439.8 on 2 and 141 DF, p-value: < 2.2e-16
plot(quadMod)




Temperature in Ann Arbor, MI
data(aatemp)
plot(temp~year, aatemp) #honesty looks like no trend

plot(temp~year, aatemp, type = "l") #if anything, maybe a 4th order model?? because it kind of goes down, up, down, up???

tempMod <- lm(temp~year, aatemp)
summary(tempMod) #this is significant
##
## Call:
## lm(formula = temp ~ year, data = aatemp)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.9843 -0.9113 -0.0820 0.9946 3.5343
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 24.005510 7.310781 3.284 0.00136 **
## year 0.012237 0.003768 3.247 0.00153 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.466 on 113 degrees of freedom
## Multiple R-squared: 0.08536, Adjusted R-squared: 0.07727
## F-statistic: 10.55 on 1 and 113 DF, p-value: 0.001533
plot(tempMod) #not good plots, though




quadTempMod <- lm(temp~year + I(year^2), aatemp)
summary(quadTempMod) #nope
##
## Call:
## lm(formula = temp ~ year + I(year^2), data = aatemp)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.0412 -0.9538 -0.0624 0.9959 3.5820
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.127e+02 3.837e+02 -0.554 0.580
## year 2.567e-01 3.962e-01 0.648 0.518
## I(year^2) -6.307e-05 1.022e-04 -0.617 0.539
##
## Residual standard error: 1.47 on 112 degrees of freedom
## Multiple R-squared: 0.08846, Adjusted R-squared: 0.07218
## F-statistic: 5.434 on 2 and 112 DF, p-value: 0.005591
cubeTempMod <- lm(temp~year + I(year^2) + I(year^3), aatemp)
summary(cubeTempMod) #YES
##
## Call:
## lm(formula = temp ~ year + I(year^2) + I(year^3), data = aatemp)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.8557 -0.9646 -0.1552 1.0485 4.1538
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.959e+04 1.734e+04 2.283 0.0243 *
## year -6.159e+01 2.694e+01 -2.286 0.0241 *
## I(year^2) 3.197e-02 1.395e-02 2.291 0.0238 *
## I(year^3) -5.527e-06 2.407e-06 -2.296 0.0236 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.443 on 111 degrees of freedom
## Multiple R-squared: 0.1298, Adjusted R-squared: 0.1063
## F-statistic: 5.518 on 3 and 111 DF, p-value: 0.001436
plot(cubeTempMod) #and the plots are sound




fourthTempMod <- lm(temp~year + I(year^2) + I(year^3) + I(year^4), aatemp)
summary(fourthTempMod) #nope
##
## Call:
## lm(formula = temp ~ year + I(year^2) + I(year^3) + I(year^4),
## data = aatemp)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.0085 -0.9618 -0.0913 0.9926 3.7370
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.497e+06 8.553e+05 1.750 0.0829 .
## year -3.086e+03 1.775e+03 -1.739 0.0849 .
## I(year^2) 2.385e+00 1.381e+00 1.727 0.0869 .
## I(year^3) -8.189e-04 4.773e-04 -1.716 0.0890 .
## I(year^4) 1.054e-07 6.186e-08 1.704 0.0912 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.431 on 110 degrees of freedom
## Multiple R-squared: 0.1522, Adjusted R-squared: 0.1213
## F-statistic: 4.936 on 4 and 110 DF, p-value: 0.001068
coef(cubeTempMod) %*% c(1, 2002, 2002^2, 2002^3) #47.5 for 2002
## [,1]
## [1,] 47.50484
plot(temp~year, aatemp, type = "l")
lines(cubeTempMod$fitted.values ~ year, aatemp, col = "red")

Which model is the best for using year to predict births in the United States?
data("divusa")
head(divusa)
## year divorce unemployed femlab marriage birth military
## 1 1920 8.0 5.2 22.70 92.0 117.9 3.2247
## 2 1921 7.2 11.7 22.79 83.0 119.8 3.5614
## 3 1922 6.6 6.7 22.88 79.7 111.2 2.4553
## 4 1923 7.1 2.4 22.97 85.2 110.5 2.2065
## 5 1924 7.2 5.0 23.06 80.3 110.9 2.2889
## 6 1925 7.2 3.2 23.15 79.2 106.6 2.1735
plot(birth~year, divusa, type = "l")

birthMod <- lm(birth~year, divusa)
summary(birthMod)
##
## Call:
## lm(formula = birth ~ year, data = divusa)
##
## Residuals:
## Min 1Q Median 3Q Max
## -25.159 -11.129 -4.086 11.232 33.509
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1073.45274 161.22254 6.658 4.03e-09 ***
## year -0.50284 0.08234 -6.107 4.14e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 16.06 on 75 degrees of freedom
## Multiple R-squared: 0.3321, Adjusted R-squared: 0.3232
## F-statistic: 37.3 on 1 and 75 DF, p-value: 4.143e-08
plot(birthMod) #significant model but really obviously is not linear data




quadBirthMod <- lm(birth~year + I(year^2), divusa)
summary(quadBirthMod)
##
## Call:
## lm(formula = birth ~ year + I(year^2), data = divusa)
##
## Residuals:
## Min 1Q Median 3Q Max
## -24.2952 -11.8939 -0.7406 12.5154 26.3901
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.428e+04 1.464e+04 -3.709 0.000400 ***
## year 5.604e+01 1.495e+01 3.748 0.000351 ***
## I(year^2) -1.444e-02 3.818e-03 -3.782 0.000313 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14.8 on 74 degrees of freedom
## Multiple R-squared: 0.4403, Adjusted R-squared: 0.4252
## F-statistic: 29.11 on 2 and 74 DF, p-value: 4.718e-10
plot(quadBirthMod) #still not meeting assumptions of polynomial regression?




cubeBirthMod <- lm(birth~year + I(year^2) + I(year^3), divusa)
summary(cubeBirthMod) #not significant
##
## Call:
## lm(formula = birth ~ year + I(year^2) + I(year^3), data = divusa)
##
## Residuals:
## Min 1Q Median 3Q Max
## -22.382 -12.208 -1.928 12.042 26.581
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.557e+06 1.467e+06 1.062 0.292
## year -2.414e+03 2.248e+03 -1.074 0.286
## I(year^2) 1.247e+00 1.148e+00 1.086 0.281
## I(year^3) -2.148e-04 1.954e-04 -1.099 0.275
##
## Residual standard error: 14.78 on 73 degrees of freedom
## Multiple R-squared: 0.4494, Adjusted R-squared: 0.4268
## F-statistic: 19.86 on 3 and 73 DF, p-value: 1.625e-09
fourthBirthMod <- lm(birth~poly(year, 4), divusa)
summary(fourthBirthMod) #significant
##
## Call:
## lm(formula = birth ~ poly(year, 4), data = divusa)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.505 -7.543 1.208 7.846 13.306
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 88.8883 0.9585 92.738 < 2e-16 ***
## poly(year, 4)1 -98.0709 8.4107 -11.660 < 2e-16 ***
## poly(year, 4)2 -55.9709 8.4107 -6.655 4.76e-09 ***
## poly(year, 4)3 -16.2403 8.4107 -1.931 0.0574 .
## poly(year, 4)4 104.1628 8.4107 12.385 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.411 on 72 degrees of freedom
## Multiple R-squared: 0.8241, Adjusted R-squared: 0.8143
## F-statistic: 84.34 on 4 and 72 DF, p-value: < 2.2e-16
plot(fourthBirthMod) #bad plots, though




##skipping ahead to the truth
eighthMod <- lm(birth~poly(year, 8), divusa)
plot(eighthMod)




plot(birth~year, divusa, type = "l")
lines(eighthMod$fitted.values ~ year, divusa, col = "red")

## how to predict
newdat <- data.frame(year = 2000)
predict(eighthMod, newdata = newdat) #point
## 1
## 89.05121
predict(eighthMod, newdata = newdat, interval = "confidence")
## fit lwr upr
## 1 89.05121 57.1107 120.9917
predict(eighthMod, newdata = newdat, interval = "prediction")
## fit lwr upr
## 1 89.05121 56.49719 121.6052
How to check for autocorrelation
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
dwtest(eighthMod, alternative = "two.sided")
##
## Durbin-Watson test
##
## data: eighthMod
## DW = 1.1621, p-value = 8.211e-07
## alternative hypothesis: true autocorrelation is not 0
dwtest(cubeTempMod) #sig p
##
## Durbin-Watson test
##
## data: cubeTempMod
## DW = 1.7171, p-value = 0.03464
## alternative hypothesis: true autocorrelation is greater than 0
dwtest(cubeTempMod, alternative = "two.sided")
##
## Durbin-Watson test
##
## data: cubeTempMod
## DW = 1.7171, p-value = 0.06928
## alternative hypothesis: true autocorrelation is not 0