library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.6.3
data <- read.csv("http://www.cknudson.com/data/MacNaturalGas.csv")
mod1 <- lm(therms ~ month, data)
ggplot(data, aes(month, therms)) + geom_point()
## Warning: Removed 12 rows containing missing values (geom_point).
data$monthsquared <- (data$month)^2
mod2 <- lm(therms ~ month + monthsquared, data)
summary(mod2)
##
## Call:
## lm(formula = therms ~ month + monthsquared, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -224.58 -32.32 -14.41 36.10 184.27
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 549.8271 23.7119 23.19 <2e-16 ***
## month -147.6907 8.3893 -17.61 <2e-16 ***
## monthsquared 10.4416 0.6314 16.54 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 68.02 on 96 degrees of freedom
## (12 observations deleted due to missingness)
## Multiple R-squared: 0.7672, Adjusted R-squared: 0.7623
## F-statistic: 158.1 on 2 and 96 DF, p-value: < 2.2e-16
anova(mod2, mod1)
## Analysis of Variance Table
##
## Model 1: therms ~ month + monthsquared
## Model 2: therms ~ month
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 96 444221
## 2 97 1709752 -1 -1265531 273.49 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
AIC(mod1)
## [1] 1252.867
AIC(mod2)
## [1] 1121.437
BIC(mod1)
## [1] 1260.652
BIC(mod2)
## [1] 1131.817
data$monthsqrt <- (data$month)^.5
mod3 <- lm(therms ~ month + monthsqrt, data)
summary(mod3)
##
## Call:
## lm(formula = therms ~ month + monthsqrt, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -242.84 -61.90 -28.56 54.94 228.58
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1089.40 84.11 12.952 < 2e-16 ***
## month 164.76 17.34 9.502 1.75e-15 ***
## monthsqrt -823.32 79.50 -10.356 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 91.72 on 96 degrees of freedom
## (12 observations deleted due to missingness)
## Multiple R-squared: 0.5767, Adjusted R-squared: 0.5679
## F-statistic: 65.4 on 2 and 96 DF, p-value: < 2.2e-16
AIC(mod3)
## [1] 1180.611
BIC(mod3)
## [1] 1190.991
Mod1 is nested in mod2 and mod3 since both of those models also use the variable month. Anova and Likelihood ratio test are used for nested models. If not nested, we MUST use AIC or BIC.
logLik(mod1)
## 'log Lik.' -623.4335 (df=3)
logLik(mod2)
## 'log Lik.' -556.7183 (df=4)
#find 134 by doing 2 times the difference of bigger model minus smaller model
#small pvalue means bigger model is better (null Ho is small model is good, alternative is bigger model is better)
pchisq(134, lower.tail = FALSE, df = 1)
## [1] 5.463547e-31
xvar <- 1:12
coef(mod2)
## (Intercept) month monthsquared
## 549.8271 -147.6907 10.4416
ypreds <- 549.8271+ - 147.607*xvar + 10.4416*xvar*xvar
with(data, (plot(month, therms)))
## NULL
plot(mod1)
plot(mod2)
lifedata <- read.csv("http://www.cknudson.com/data/LifeExp.csv")
summary(lifedata)
## Birth Death InfantDeath MaleLife
## Min. : 9.70 Min. : 2.20 Min. : 4.5 Min. :38.10
## 1st Qu.:14.50 1st Qu.: 7.80 1st Qu.: 13.1 1st Qu.:55.80
## Median :29.00 Median : 9.50 Median : 43.0 Median :63.70
## Mean :29.23 Mean :10.84 Mean : 54.9 Mean :61.49
## 3rd Qu.:42.20 3rd Qu.:12.50 3rd Qu.: 83.0 3rd Qu.:68.60
## Max. :52.20 Max. :25.00 Max. :181.6 Max. :75.90
##
## FemaleLife GNP Region Country
## Min. :41.20 Min. : 80 Africa :27 Afghanistan: 1
## 1st Qu.:57.50 1st Qu.: 475 Asia :16 Albania : 1
## Median :67.80 Median : 1690 East.Eur :11 Algeria : 1
## Mean :66.15 Mean : 5741 LatinAmer :12 Angola : 1
## 3rd Qu.:75.40 3rd Qu.: 7325 MiddleEast:12 Argentina : 1
## Max. :81.80 Max. :34064 Other :19 Austria : 1
## NA's :6 (Other) :91
malemod1 <- lm(MaleLife ~ GNP, lifedata)
summary(malemod1)
##
## Call:
## lm(formula = MaleLife ~ GNP, data = lifedata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.999 -5.388 1.222 5.840 12.553
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.694e+01 9.647e-01 59.03 < 2e-16 ***
## GNP 7.728e-04 9.758e-05 7.92 6.35e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.492 on 89 degrees of freedom
## (6 observations deleted due to missingness)
## Multiple R-squared: 0.4134, Adjusted R-squared: 0.4068
## F-statistic: 62.72 on 1 and 89 DF, p-value: 6.345e-12
with(lifedata, plot(GNP, MaleLife))
lifedata$logGNP <- log(lifedata$GNP)
malemod2 <- lm(MaleLife ~ GNP + logGNP, lifedata)
summary(malemod2)
##
## Call:
## lm(formula = MaleLife ~ GNP + logGNP, data = lifedata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.8862 -3.1499 0.3632 3.5823 14.3717
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 20.4063459 4.6241607 4.413 2.88e-05 ***
## GNP -0.0001969 0.0001423 -1.384 0.17
## logGNP 5.6053605 0.7003013 8.004 4.54e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.732 on 88 degrees of freedom
## (6 observations deleted due to missingness)
## Multiple R-squared: 0.6605, Adjusted R-squared: 0.6528
## F-statistic: 85.62 on 2 and 88 DF, p-value: < 2.2e-16
lifedata$GNPsqrt <- (lifedata$GNP)^.5
malemod3 <- lm(MaleLife ~ logGNP, lifedata)
summary(malemod3)
##
## Call:
## lm(formula = MaleLife ~ logGNP, data = lifedata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.5872 -3.5064 0.0095 3.7562 14.1309
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 25.4726 2.8387 8.973 4.27e-14 ***
## logGNP 4.7804 0.3693 12.946 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.761 on 89 degrees of freedom
## (6 observations deleted due to missingness)
## Multiple R-squared: 0.6532, Adjusted R-squared: 0.6493
## F-statistic: 167.6 on 1 and 89 DF, p-value: < 2.2e-16
pred <- 25.4726 + 4.7804*log(5000)
pred
## [1] 66.18819
with(lifedata, plot(GNP, MaleLife))
xvar <- 1:35000
ypreds <- 25.4726 + 4.7804*log(xvar)
lines(xvar, ypreds)
AIC(malemod1)
## [1] 628.7462
AIC(malemod3)
## [1] 580.9289
anova(malemod2, malemod1)
## Analysis of Variance Table
##
## Model 1: MaleLife ~ GNP + logGNP
## Model 2: MaleLife ~ GNP
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 88 2891.0
## 2 89 4995.8 -1 -2104.8 64.067 4.544e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot(malemod3)
malemod5 <- lm(MaleLife ~ log(log(log(GNP))) + log(FemaleLife), lifedata)
plot(malemod5)
summary(malemod5)
##
## Call:
## lm(formula = MaleLife ~ log(log(log(GNP))) + log(FemaleLife),
## data = lifedata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.7238 -0.8300 -0.0058 1.0647 5.4884
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -156.181 6.301 -24.787 <2e-16 ***
## log(log(log(GNP))) 1.859 2.926 0.635 0.527
## log(FemaleLife) 51.811 1.875 27.638 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.851 on 88 degrees of freedom
## (6 observations deleted due to missingness)
## Multiple R-squared: 0.9646, Adjusted R-squared: 0.9638
## F-statistic: 1199 on 2 and 88 DF, p-value: < 2.2e-16
with(lifedata, plot(FemaleLife, MaleLife))