dat <- read.csv("http://www.cknudson.com/data/MacNaturalGas.csv")
# Worksheet from chat
# https://www.dropbox.com/s/wdvpuxleqo5djd8/Transformation.pdf?dl=0
mod1<- lm(therms~ month, data=dat)
#plot data
with(dat, plot(therms~month))
abline(mod1)
#adding a variable to the dataset
dat$monthsquare<- (dat$month)^2
#polynomial regression
mod2<-lm(therms~ month + monthsquare, data=dat)
summary(mod2)
##
## Call:
## lm(formula = therms ~ month + monthsquare, data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -224.58 -32.32 -14.41 36.10 184.27
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 549.8271 23.7119 23.19 <2e-16 ***
## month -147.6907 8.3893 -17.61 <2e-16 ***
## monthsquare 10.4416 0.6314 16.54 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 68.02 on 96 degrees of freedom
## (12 observations deleted due to missingness)
## Multiple R-squared: 0.7672, Adjusted R-squared: 0.7623
## F-statistic: 158.1 on 2 and 96 DF, p-value: < 2.2e-16
#comparing models
anova(mod1,mod2)
## Analysis of Variance Table
##
## Model 1: therms ~ month
## Model 2: therms ~ month + monthsquare
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 97 1709752
## 2 96 444221 1 1265531 273.49 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
AIC(mod1)
## [1] 1252.867
AIC(mod2) #smaller is better!
## [1] 1121.437
#AIC balances model fit and model complexity
BIC(mod1)
## [1] 1260.652
BIC(mod2) #again, smaller is better
## [1] 1131.817
#can compare any two models with the same response variable (y).
#Cannot necessarily do this with other more formal tests.
#Two models are nested if the all the variables in one model are included in the variables of another model
#ie. mod1 is nested in mod2
#anova and LRT are for NESTED models!
#for not nested models, MUST use AIC or BIC
logLik(mod1) # gives log likelihood for model
## 'log Lik.' -623.4335 (df=3)
logLik(mod2)
## 'log Lik.' -556.7183 (df=4)
# LRT (test stat) = 2*(logLike(bigMod)-logLike(smallMod))
#bigMod= more complex model
#distribution for test stat is Chi-Squared with df = #params in bigMod -# params in smallMod
#ie test stat for mod 1 & mod 2 = 134 w/ 1 df
pchisq(134, lower.tail= FALSE, df=1)
## [1] 5.463547e-31
# a signifant result (small p value) in the LRT means that the bigger model is better
# H0: smaller model is fine, no need for added complexity
# HA: bigger model is better!
#plot our mod2
xvar <- 1:12
ypreds<- 549.8271 + -147.6907 * xvar + 10.4416 * xvar*xvar
ypreds
## [1] 412.5780 296.2121 200.7294 126.1299 72.4136 39.5805 27.6306 36.5639
## [9] 66.3804 117.0801 188.6630 281.1291
#add parabola to plot from before
lines(xvar, ypreds)

plot(mod1)




#first plot, Resids v fitted vals --> want randomly scattered above & below 0, with trend line being horizontal on 0 (ideal)
# this one needs to be good before moving on to the next plots
plot(mod2)




#second QQ Norm plot --> if residuals are perfectly normal, follow the dotted line perfectly
#third & fourth plots --> not important for this class
#---------------Practice--------------------------
lifedata <- read.csv("http://www.cknudson.com/data/LifeExp.csv")
#1a.
with(lifedata, plot(MaleLife~GNP))
#1b.
lifemodel1<-lm(MaleLife~GNP, data=lifedata)
abline(lifemodel1)

#1c.
summary(lifemodel1)
##
## Call:
## lm(formula = MaleLife ~ GNP, data = lifedata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.999 -5.388 1.222 5.840 12.553
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.694e+01 9.647e-01 59.03 < 2e-16 ***
## GNP 7.728e-04 9.758e-05 7.92 6.35e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.492 on 89 degrees of freedom
## (6 observations deleted due to missingness)
## Multiple R-squared: 0.4134, Adjusted R-squared: 0.4068
## F-statistic: 62.72 on 1 and 89 DF, p-value: 6.345e-12
# the R^2 of this model is 0.4134 this means that the model accounts for about 41% of the observed variation from the mean in male life expectancy
#2.
lifedata$logGNP<- log(lifedata$GNP)
lifemodel2<- lm(MaleLife~logGNP, data= lifedata)
coef(lifemodel2)
## (Intercept) logGNP
## 25.472622 4.780417
est5000<- coef(lifemodel2)[1] + coef(lifemodel2)[2]*(log(5000))
est5000
## (Intercept)
## 66.18836
summary(lifemodel2)
##
## Call:
## lm(formula = MaleLife ~ logGNP, data = lifedata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.5872 -3.5064 0.0095 3.7562 14.1309
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 25.4726 2.8387 8.973 4.27e-14 ***
## logGNP 4.7804 0.3693 12.946 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.761 on 89 degrees of freedom
## (6 observations deleted due to missingness)
## Multiple R-squared: 0.6532, Adjusted R-squared: 0.6493
## F-statistic: 167.6 on 1 and 89 DF, p-value: < 2.2e-16
#the R^2 of this model is 0.6532, which means that it accounts for more of the variability from the mean than the first model
plot(lifemodel1)




plot(lifemodel2)




#plot with logGNP on x-axis
with(lifedata, plot(MaleLife~logGNP))
abline(lifemodel2)
#plot with original GNP on xaxis
xvars<- c(0:35000)
ys<-coef(lifemodel2)[1] + coef(lifemodel2)[2]*(log(xvars))
lines(xvars,ys)
