Week1

beer <- read.csv("http://www.cknudson.com/data/MNbeer.csv")

mod = lm(Rating ~ IBU, data = beer)
mod2 = lm(Rating ~ IBU + Brewery, data = beer)

summary(mod)

## 
## Call:
## lm(formula = Rating ~ IBU, data = beer)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.3822 -2.4669  0.3309  2.0854  9.8856 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 83.76336    1.36811  61.225   <2e-16 ***
## IBU          0.06694    0.02474   2.706   0.0098 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.484 on 42 degrees of freedom
## Multiple R-squared:  0.1485, Adjusted R-squared:  0.1282 
## F-statistic: 7.322 on 1 and 42 DF,  p-value: 0.0098

summary(mod2)

## 
## Call:
## lm(formula = Rating ~ IBU + Brewery, data = beer)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.7732 -1.7126  0.0889  1.2031  5.9521 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          83.98984    1.65525  50.741  < 2e-16 ***
## IBU                   0.05493    0.01969   2.790  0.00848 ** 
## BreweryBent Paddle    1.19330    1.78467   0.669  0.50811    
## BreweryFulton        -2.00670    1.78467  -1.124  0.26849    
## BreweryIndeed         0.46958    1.67917   0.280  0.78139    
## BrewerySteel Toe      1.16761    1.87697   0.622  0.53793    
## BrewerySummit        -1.47248    1.66443  -0.885  0.38237    
## BrewerySurly          5.20257    1.66495   3.125  0.00357 ** 
## BreweryUrban Growler -2.59571    1.78495  -1.454  0.15479    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.654 on 35 degrees of freedom
## Multiple R-squared:  0.5881, Adjusted R-squared:  0.4939 
## F-statistic: 6.246 on 8 and 35 DF,  p-value: 5.108e-05

dat <- read.csv("http://www.cknudson.com/data/MacNaturalGas.csv")
mod1 = lm(therms ~ month, data = dat)
dat$monthsquared = (dat$month)^2 #data transformation
mod2 = lm(therms ~month + monthsquared, data = dat)
summary(mod2)

## 
## Call:
## lm(formula = therms ~ month + monthsquared, data = dat)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -224.58  -32.32  -14.41   36.10  184.27 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   549.8271    23.7119   23.19   <2e-16 ***
## month        -147.6907     8.3893  -17.61   <2e-16 ***
## monthsquared   10.4416     0.6314   16.54   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 68.02 on 96 degrees of freedom
##   (12 observations deleted due to missingness)
## Multiple R-squared:  0.7672, Adjusted R-squared:  0.7623 
## F-statistic: 158.1 on 2 and 96 DF,  p-value: < 2.2e-16

AIC(mod1)

## [1] 1252.867

AIC(mod2) #smaller is better

## [1] 1121.437

#AIC balances model fit and model complexity
BIC(mod1)

## [1] 1260.652

BIC(mod2) #smaller is better

## [1] 1131.817

#mod1 is nested in mod2 and mod3
  #all the variables in mod1 (month) are also in mod2 and mod3
#mod2 and mod3 are not nested models
#can only do anova and LRT for NESTED models
  #when models aren't nested, you have to use AIC or BIC

logLik(mod1) #gives log likelihood

## 'log Lik.' -623.4335 (df=3)

logLik(mod2)

## 'log Lik.' -556.7183 (df=4)

#LRT
  #test stat: 2(loglike(bigmod)-loglike(smallmod))
  #chi-squared distribution
  #df = # parameters in bigger model - # parameters in smaller model
  #if results are significant (small p-value), bigger model is better
    #H0: smaller model is fine
    #HA: bigger model is better

#plot our mod2
xvar = 1:12
coef(mod2)

##  (Intercept)        month monthsquared 
##     549.8271    -147.6907      10.4416

ypreds = 549.8271 + -147.6907 * xvar + 10.4416*xvar*xvar
#lines(xvar, ypreds)

plot(mod1)

#residuals vs predicted plot - ideally there would be no patter with red line straight across at 0

#--------------------------------------
lifedata <- read.csv("http://www.cknudson.com/data/LifeExp.csv")
plot(MaleLife ~ GNP, data = lifedata) #weak linear relationship?
#alternative:
with(lifedata, plot(MaleLife~GNP))

lifemod1= lm(MaleLife ~ GNP, data = lifedata)
abline(lifemod1)

summary(lifemod1) #r-squared = 0.4134

## 
## Call:
## lm(formula = MaleLife ~ GNP, data = lifedata)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -18.999  -5.388   1.222   5.840  12.553 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 5.694e+01  9.647e-01   59.03  < 2e-16 ***
## GNP         7.728e-04  9.758e-05    7.92 6.35e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.492 on 89 degrees of freedom
##   (6 observations deleted due to missingness)
## Multiple R-squared:  0.4134, Adjusted R-squared:  0.4068 
## F-statistic: 62.72 on 1 and 89 DF,  p-value: 6.345e-12

#41.34% of the variation in MaleLife can be explained by GNP

life <- transform(lifedata, lnGNP = log(GNP))
lifemodel2 <- lm(MaleLife ~ lnGNP, data=life)
summary(lifemodel2) #r-squared = 0.6532

## 
## Call:
## lm(formula = MaleLife ~ lnGNP, data = life)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14.5872  -3.5064   0.0095   3.7562  14.1309 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  25.4726     2.8387   8.973 4.27e-14 ***
## lnGNP         4.7804     0.3693  12.946  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.761 on 89 degrees of freedom
##   (6 observations deleted due to missingness)
## Multiple R-squared:  0.6532, Adjusted R-squared:  0.6493 
## F-statistic: 167.6 on 1 and 89 DF,  p-value: < 2.2e-16

4.7804*log(5000) + 25.4726 #predict male life expectancy for a nation with GNP of $5000

## [1] 66.18819

plot(MaleLife ~ log(GNP), data = lifedata) #plot new model
abline(lifemodel2)

with(lifedata, plot(MaleLife~log(GNP)))
abline(lifemodel2)

xvars<- c(0:35000)
ys<-coef(lifemodel2)[1] + coef(lifemodel2)[2]*(log(xvars))
lines(xvars,ys)