dat <- read.csv("http://www.cknudson.com/data/MacNaturalGas.csv")

# Worksheet from chat
# https://www.dropbox.com/s/wdvpuxleqo5djd8/Transformation.pdf?dl=0

mod1<- lm(therms~ month, data=dat)
#plot data
with(dat, plot(therms~month))
abline(mod1)
#adding a variable to the dataset
dat$monthsquare<- (dat$month)^2

#polynomial regression
mod2<-lm(therms~ month + monthsquare, data=dat)
summary(mod2)
## 
## Call:
## lm(formula = therms ~ month + monthsquare, data = dat)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -224.58  -32.32  -14.41   36.10  184.27 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  549.8271    23.7119   23.19   <2e-16 ***
## month       -147.6907     8.3893  -17.61   <2e-16 ***
## monthsquare   10.4416     0.6314   16.54   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 68.02 on 96 degrees of freedom
##   (12 observations deleted due to missingness)
## Multiple R-squared:  0.7672, Adjusted R-squared:  0.7623 
## F-statistic: 158.1 on 2 and 96 DF,  p-value: < 2.2e-16
#comparing models
anova(mod1,mod2)
## Analysis of Variance Table
## 
## Model 1: therms ~ month
## Model 2: therms ~ month + monthsquare
##   Res.Df     RSS Df Sum of Sq      F    Pr(>F)    
## 1     97 1709752                                  
## 2     96  444221  1   1265531 273.49 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
AIC(mod1)
## [1] 1252.867
AIC(mod2) #smaller is better!
## [1] 1121.437
#AIC balances model fit and model complexity

BIC(mod1)
## [1] 1260.652
BIC(mod2) #again, smaller is better
## [1] 1131.817
#can compare any two models with the same response variable (y). 
#Cannot necessarily do this with other more formal tests.

#Two models are nested if the all the variables in one model are included in the variables of another model
#ie. mod1 is nested in mod2
#anova and LRT are for NESTED models!
#for not nested models, MUST use AIC or BIC

logLik(mod1) # gives log likelihood for model
## 'log Lik.' -623.4335 (df=3)
logLik(mod2)
## 'log Lik.' -556.7183 (df=4)
# LRT (test stat) = 2*(logLike(bigMod)-logLike(smallMod))
#bigMod= more complex model
#distribution for test stat is Chi-Squared with df = #params in bigMod -# params in smallMod
 #ie test stat for mod 1 & mod 2 = 134 w/ 1 df

pchisq(134, lower.tail= FALSE, df=1)
## [1] 5.463547e-31
# a signifant result (small p value) in the LRT means that the bigger model is better
# H0: smaller model is fine, no need for added complexity
# HA: bigger model is better!

#plot our mod2
xvar <- 1:12
ypreds<- 549.8271 + -147.6907 * xvar + 10.4416 * xvar*xvar
ypreds
##  [1] 412.5780 296.2121 200.7294 126.1299  72.4136  39.5805  27.6306  36.5639
##  [9]  66.3804 117.0801 188.6630 281.1291
#add parabola to plot from before
lines(xvar, ypreds)

plot(mod1)

#first plot, Resids v fitted vals --> want randomly scattered above & below 0, with trend line being horizontal on 0 (ideal)
# this one needs to be good before moving on to the next plots
plot(mod2)

#second QQ Norm plot --> if residuals are perfectly normal, follow the dotted line perfectly
#third & fourth plots --> not important for this class

#---------------Practice--------------------------
lifedata <- read.csv("http://www.cknudson.com/data/LifeExp.csv")

#1a.
with(lifedata, plot(MaleLife~GNP))
#1b.
lifemodel1<-lm(MaleLife~GNP, data=lifedata)
abline(lifemodel1)

#1c.
summary(lifemodel1)
## 
## Call:
## lm(formula = MaleLife ~ GNP, data = lifedata)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -18.999  -5.388   1.222   5.840  12.553 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 5.694e+01  9.647e-01   59.03  < 2e-16 ***
## GNP         7.728e-04  9.758e-05    7.92 6.35e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.492 on 89 degrees of freedom
##   (6 observations deleted due to missingness)
## Multiple R-squared:  0.4134, Adjusted R-squared:  0.4068 
## F-statistic: 62.72 on 1 and 89 DF,  p-value: 6.345e-12
# the R^2 of this model is 0.4134 this means that the model accounts for about 41% of the observed variation from the mean in male life expectancy

#2.
lifedata$logGNP<- log(lifedata$GNP)
lifemodel2<- lm(MaleLife~logGNP, data= lifedata)
coef(lifemodel2)
## (Intercept)      logGNP 
##   25.472622    4.780417
est5000<- coef(lifemodel2)[1] + coef(lifemodel2)[2]*(log(5000))
est5000
## (Intercept) 
##    66.18836
summary(lifemodel2)
## 
## Call:
## lm(formula = MaleLife ~ logGNP, data = lifedata)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14.5872  -3.5064   0.0095   3.7562  14.1309 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  25.4726     2.8387   8.973 4.27e-14 ***
## logGNP        4.7804     0.3693  12.946  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.761 on 89 degrees of freedom
##   (6 observations deleted due to missingness)
## Multiple R-squared:  0.6532, Adjusted R-squared:  0.6493 
## F-statistic: 167.6 on 1 and 89 DF,  p-value: < 2.2e-16
#the R^2 of this model is 0.6532, which means that it accounts for more of the variability from the mean than the first model

plot(lifemodel1)

plot(lifemodel2)

#plot with logGNP on x-axis
with(lifedata, plot(MaleLife~logGNP))
abline(lifemodel2)

#plot with original GNP on xaxis
xvars<- c(0:35000)
ys<-coef(lifemodel2)[1] + coef(lifemodel2)[2]*(log(xvars))
lines(xvars,ys)