library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.6.3
data <- read.csv("http://www.cknudson.com/data/MacNaturalGas.csv")
mod1 <- lm(therms ~ month, data)
ggplot(data, aes(month, therms)) + geom_point()
## Warning: Removed 12 rows containing missing values (geom_point).

data$monthsquared <- (data$month)^2
mod2 <- lm(therms ~ month + monthsquared, data)
summary(mod2)
## 
## Call:
## lm(formula = therms ~ month + monthsquared, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -224.58  -32.32  -14.41   36.10  184.27 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   549.8271    23.7119   23.19   <2e-16 ***
## month        -147.6907     8.3893  -17.61   <2e-16 ***
## monthsquared   10.4416     0.6314   16.54   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 68.02 on 96 degrees of freedom
##   (12 observations deleted due to missingness)
## Multiple R-squared:  0.7672, Adjusted R-squared:  0.7623 
## F-statistic: 158.1 on 2 and 96 DF,  p-value: < 2.2e-16
anova(mod2, mod1)
## Analysis of Variance Table
## 
## Model 1: therms ~ month + monthsquared
## Model 2: therms ~ month
##   Res.Df     RSS Df Sum of Sq      F    Pr(>F)    
## 1     96  444221                                  
## 2     97 1709752 -1  -1265531 273.49 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
AIC(mod1)
## [1] 1252.867
AIC(mod2)
## [1] 1121.437
BIC(mod1)
## [1] 1260.652
BIC(mod2)
## [1] 1131.817
data$monthsqrt <- (data$month)^.5
mod3 <- lm(therms ~ month + monthsqrt, data)
summary(mod3)
## 
## Call:
## lm(formula = therms ~ month + monthsqrt, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -242.84  -61.90  -28.56   54.94  228.58 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1089.40      84.11  12.952  < 2e-16 ***
## month         164.76      17.34   9.502 1.75e-15 ***
## monthsqrt    -823.32      79.50 -10.356  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 91.72 on 96 degrees of freedom
##   (12 observations deleted due to missingness)
## Multiple R-squared:  0.5767, Adjusted R-squared:  0.5679 
## F-statistic:  65.4 on 2 and 96 DF,  p-value: < 2.2e-16
AIC(mod3)
## [1] 1180.611
BIC(mod3)
## [1] 1190.991

Mod1 is nested in mod2 and mod3 since both of those models also use the variable month. Anova and Likelihood ratio test are used for nested models. If not nested, we MUST use AIC or BIC.

logLik(mod1)
## 'log Lik.' -623.4335 (df=3)
logLik(mod2)
## 'log Lik.' -556.7183 (df=4)
#find 134 by doing 2 times the difference of bigger model minus smaller model
#small pvalue means bigger model is better (null Ho is small model is good, alternative is bigger model is better)
pchisq(134, lower.tail = FALSE, df = 1)
## [1] 5.463547e-31
xvar <- 1:12
coef(mod2)
##  (Intercept)        month monthsquared 
##     549.8271    -147.6907      10.4416
ypreds <- 549.8271+ - 147.607*xvar + 10.4416*xvar*xvar
with(data, (plot(month, therms)))

## NULL
plot(mod1)

plot(mod2)

lifedata <- read.csv("http://www.cknudson.com/data/LifeExp.csv")
summary(lifedata)
##      Birth           Death        InfantDeath       MaleLife    
##  Min.   : 9.70   Min.   : 2.20   Min.   :  4.5   Min.   :38.10  
##  1st Qu.:14.50   1st Qu.: 7.80   1st Qu.: 13.1   1st Qu.:55.80  
##  Median :29.00   Median : 9.50   Median : 43.0   Median :63.70  
##  Mean   :29.23   Mean   :10.84   Mean   : 54.9   Mean   :61.49  
##  3rd Qu.:42.20   3rd Qu.:12.50   3rd Qu.: 83.0   3rd Qu.:68.60  
##  Max.   :52.20   Max.   :25.00   Max.   :181.6   Max.   :75.90  
##                                                                 
##    FemaleLife         GNP               Region          Country  
##  Min.   :41.20   Min.   :   80   Africa    :27   Afghanistan: 1  
##  1st Qu.:57.50   1st Qu.:  475   Asia      :16   Albania    : 1  
##  Median :67.80   Median : 1690   East.Eur  :11   Algeria    : 1  
##  Mean   :66.15   Mean   : 5741   LatinAmer :12   Angola     : 1  
##  3rd Qu.:75.40   3rd Qu.: 7325   MiddleEast:12   Argentina  : 1  
##  Max.   :81.80   Max.   :34064   Other     :19   Austria    : 1  
##                  NA's   :6                       (Other)    :91
malemod1 <- lm(MaleLife ~ GNP, lifedata)
summary(malemod1)
## 
## Call:
## lm(formula = MaleLife ~ GNP, data = lifedata)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -18.999  -5.388   1.222   5.840  12.553 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 5.694e+01  9.647e-01   59.03  < 2e-16 ***
## GNP         7.728e-04  9.758e-05    7.92 6.35e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.492 on 89 degrees of freedom
##   (6 observations deleted due to missingness)
## Multiple R-squared:  0.4134, Adjusted R-squared:  0.4068 
## F-statistic: 62.72 on 1 and 89 DF,  p-value: 6.345e-12
with(lifedata, plot(GNP, MaleLife))

lifedata$logGNP <- log(lifedata$GNP)
malemod2 <- lm(MaleLife ~ GNP + logGNP, lifedata)
summary(malemod2)
## 
## Call:
## lm(formula = MaleLife ~ GNP + logGNP, data = lifedata)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14.8862  -3.1499   0.3632   3.5823  14.3717 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 20.4063459  4.6241607   4.413 2.88e-05 ***
## GNP         -0.0001969  0.0001423  -1.384     0.17    
## logGNP       5.6053605  0.7003013   8.004 4.54e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.732 on 88 degrees of freedom
##   (6 observations deleted due to missingness)
## Multiple R-squared:  0.6605, Adjusted R-squared:  0.6528 
## F-statistic: 85.62 on 2 and 88 DF,  p-value: < 2.2e-16
lifedata$GNPsqrt <- (lifedata$GNP)^.5
malemod3 <- lm(MaleLife ~ logGNP, lifedata)
summary(malemod3)
## 
## Call:
## lm(formula = MaleLife ~ logGNP, data = lifedata)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14.5872  -3.5064   0.0095   3.7562  14.1309 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  25.4726     2.8387   8.973 4.27e-14 ***
## logGNP        4.7804     0.3693  12.946  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.761 on 89 degrees of freedom
##   (6 observations deleted due to missingness)
## Multiple R-squared:  0.6532, Adjusted R-squared:  0.6493 
## F-statistic: 167.6 on 1 and 89 DF,  p-value: < 2.2e-16
pred <- 25.4726 + 4.7804*log(5000)
pred
## [1] 66.18819
with(lifedata, plot(GNP, MaleLife))
xvar <- 1:35000
ypreds <- 25.4726 + 4.7804*log(xvar)
lines(xvar, ypreds)

AIC(malemod1)
## [1] 628.7462
AIC(malemod3)
## [1] 580.9289
anova(malemod2, malemod1)
## Analysis of Variance Table
## 
## Model 1: MaleLife ~ GNP + logGNP
## Model 2: MaleLife ~ GNP
##   Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
## 1     88 2891.0                                  
## 2     89 4995.8 -1   -2104.8 64.067 4.544e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot(malemod3)

malemod5 <- lm(MaleLife ~ log(log(log(GNP))) + log(FemaleLife), lifedata)
plot(malemod5)

summary(malemod5)
## 
## Call:
## lm(formula = MaleLife ~ log(log(log(GNP))) + log(FemaleLife), 
##     data = lifedata)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.7238 -0.8300 -0.0058  1.0647  5.4884 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        -156.181      6.301 -24.787   <2e-16 ***
## log(log(log(GNP)))    1.859      2.926   0.635    0.527    
## log(FemaleLife)      51.811      1.875  27.638   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.851 on 88 degrees of freedom
##   (6 observations deleted due to missingness)
## Multiple R-squared:  0.9646, Adjusted R-squared:  0.9638 
## F-statistic:  1199 on 2 and 88 DF,  p-value: < 2.2e-16
with(lifedata, plot(FemaleLife, MaleLife))