beer <- read.csv("http://www.cknudson.com/data/MNbeer.csv")
mod = lm(Rating ~ IBU, data = beer)
mod2 = lm(Rating ~ IBU + Brewery, data = beer)
summary(mod)
##
## Call:
## lm(formula = Rating ~ IBU, data = beer)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.3822 -2.4669 0.3309 2.0854 9.8856
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 83.76336 1.36811 61.225 <2e-16 ***
## IBU 0.06694 0.02474 2.706 0.0098 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.484 on 42 degrees of freedom
## Multiple R-squared: 0.1485, Adjusted R-squared: 0.1282
## F-statistic: 7.322 on 1 and 42 DF, p-value: 0.0098
summary(mod2)
##
## Call:
## lm(formula = Rating ~ IBU + Brewery, data = beer)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.7732 -1.7126 0.0889 1.2031 5.9521
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 83.98984 1.65525 50.741 < 2e-16 ***
## IBU 0.05493 0.01969 2.790 0.00848 **
## BreweryBent Paddle 1.19330 1.78467 0.669 0.50811
## BreweryFulton -2.00670 1.78467 -1.124 0.26849
## BreweryIndeed 0.46958 1.67917 0.280 0.78139
## BrewerySteel Toe 1.16761 1.87697 0.622 0.53793
## BrewerySummit -1.47248 1.66443 -0.885 0.38237
## BrewerySurly 5.20257 1.66495 3.125 0.00357 **
## BreweryUrban Growler -2.59571 1.78495 -1.454 0.15479
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.654 on 35 degrees of freedom
## Multiple R-squared: 0.5881, Adjusted R-squared: 0.4939
## F-statistic: 6.246 on 8 and 35 DF, p-value: 5.108e-05
dat <- read.csv("http://www.cknudson.com/data/MacNaturalGas.csv")
mod1 = lm(therms ~ month, data = dat)
dat$monthsquared = (dat$month)^2 #data transformation
mod2 = lm(therms ~month + monthsquared, data = dat)
summary(mod2)
##
## Call:
## lm(formula = therms ~ month + monthsquared, data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -224.58 -32.32 -14.41 36.10 184.27
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 549.8271 23.7119 23.19 <2e-16 ***
## month -147.6907 8.3893 -17.61 <2e-16 ***
## monthsquared 10.4416 0.6314 16.54 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 68.02 on 96 degrees of freedom
## (12 observations deleted due to missingness)
## Multiple R-squared: 0.7672, Adjusted R-squared: 0.7623
## F-statistic: 158.1 on 2 and 96 DF, p-value: < 2.2e-16
AIC(mod1)
## [1] 1252.867
AIC(mod2) #smaller is better
## [1] 1121.437
#AIC balances model fit and model complexity
BIC(mod1)
## [1] 1260.652
BIC(mod2) #smaller is better
## [1] 1131.817
#mod1 is nested in mod2 and mod3
#all the variables in mod1 (month) are also in mod2 and mod3
#mod2 and mod3 are not nested models
#can only do anova and LRT for NESTED models
#when models aren't nested, you have to use AIC or BIC
logLik(mod1) #gives log likelihood
## 'log Lik.' -623.4335 (df=3)
logLik(mod2)
## 'log Lik.' -556.7183 (df=4)
#LRT
#test stat: 2(loglike(bigmod)-loglike(smallmod))
#chi-squared distribution
#df = # parameters in bigger model - # parameters in smaller model
#if results are significant (small p-value), bigger model is better
#H0: smaller model is fine
#HA: bigger model is better
#plot our mod2
xvar = 1:12
coef(mod2)
## (Intercept) month monthsquared
## 549.8271 -147.6907 10.4416
ypreds = 549.8271 + -147.6907 * xvar + 10.4416*xvar*xvar
#lines(xvar, ypreds)
plot(mod1)




#residuals vs predicted plot - ideally there would be no patter with red line straight across at 0
#--------------------------------------
lifedata <- read.csv("http://www.cknudson.com/data/LifeExp.csv")
plot(MaleLife ~ GNP, data = lifedata) #weak linear relationship?
#alternative:
with(lifedata, plot(MaleLife~GNP))
lifemod1= lm(MaleLife ~ GNP, data = lifedata)
abline(lifemod1)

summary(lifemod1) #r-squared = 0.4134
##
## Call:
## lm(formula = MaleLife ~ GNP, data = lifedata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.999 -5.388 1.222 5.840 12.553
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.694e+01 9.647e-01 59.03 < 2e-16 ***
## GNP 7.728e-04 9.758e-05 7.92 6.35e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.492 on 89 degrees of freedom
## (6 observations deleted due to missingness)
## Multiple R-squared: 0.4134, Adjusted R-squared: 0.4068
## F-statistic: 62.72 on 1 and 89 DF, p-value: 6.345e-12
#41.34% of the variation in MaleLife can be explained by GNP
life <- transform(lifedata, lnGNP = log(GNP))
lifemodel2 <- lm(MaleLife ~ lnGNP, data=life)
summary(lifemodel2) #r-squared = 0.6532
##
## Call:
## lm(formula = MaleLife ~ lnGNP, data = life)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.5872 -3.5064 0.0095 3.7562 14.1309
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 25.4726 2.8387 8.973 4.27e-14 ***
## lnGNP 4.7804 0.3693 12.946 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.761 on 89 degrees of freedom
## (6 observations deleted due to missingness)
## Multiple R-squared: 0.6532, Adjusted R-squared: 0.6493
## F-statistic: 167.6 on 1 and 89 DF, p-value: < 2.2e-16
4.7804*log(5000) + 25.4726 #predict male life expectancy for a nation with GNP of $5000
## [1] 66.18819
plot(MaleLife ~ log(GNP), data = lifedata) #plot new model
abline(lifemodel2)

with(lifedata, plot(MaleLife~log(GNP)))
abline(lifemodel2)
xvars<- c(0:35000)
ys<-coef(lifemodel2)[1] + coef(lifemodel2)[2]*(log(xvars))
lines(xvars,ys)
