#Read in CSV file
beer <- read.csv("http://www.cknudson.com/data/MNbeer.csv")
#View the dataset
names(beer)
## [1] "Brewery" "Beer" "Description" "Style" "ABV"
## [6] "IBU" "Rating" "Good"
str(beer)
## 'data.frame': 44 obs. of 8 variables:
## $ Brewery : chr "Bauhaus" "Bauhaus" "Bauhaus" "Bauhaus" ...
## $ Beer : chr "Wonderstuff" "Stargazer" "Wagon Party" "Sky-Five!" ...
## $ Description: chr "New Bohemian Pilsner" "German Style Schwarzbier" "West Cost Style Lager" "Midwest Coast IPA" ...
## $ Style : chr "Lager" "Lager" "Lager" "IPA" ...
## $ ABV : num 5.4 5 5.4 6.7 4.8 5 6.2 5.6 6 5.4 ...
## $ IBU : int 48 28 55 70 48 38 68 32 34 45 ...
## $ Rating : int 88 87 86 86 85 87 89 88 89 90 ...
## $ Good : int 0 0 0 0 0 0 0 0 0 1 ...
summary(beer)
## Brewery Beer Description Style
## Length:44 Length:44 Length:44 Length:44
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## ABV IBU Rating Good
## Min. :4.200 Min. :15.00 Min. :79.00 Min. :0.0000
## 1st Qu.:5.200 1st Qu.:33.00 1st Qu.:85.00 1st Qu.:0.0000
## Median :5.600 Median :48.50 Median :87.00 Median :0.0000
## Mean :5.818 Mean :51.07 Mean :87.18 Mean :0.2727
## 3rd Qu.:6.500 3rd Qu.:68.25 3rd Qu.:90.00 3rd Qu.:1.0000
## Max. :7.500 Max. :99.00 Max. :98.00 Max. :1.0000
#plot data to check for normality
with(beer,plot(IBU~Rating))

#Simple linear regression
mod1 <- lm(Rating~IBU, data=beer)
qqnorm(mod1$residuals) #easiest way to check for normality is by checking residuals
qqline(mod1$residuals) #If residuals fall along the line can assume residuals are approximately normal

#Multi Linear Regression and Anova
mod1 <- lm(Rating~Brewery, data=beer)
coef(mod1)
## (Intercept) BreweryBent Paddle BreweryFulton
## 86.750000 0.850000 -2.350000
## BreweryIndeed BrewerySteel Toe BrewerySummit
## 1.107143 1.250000 -1.321429
## BrewerySurly BreweryUrban Growler
## 5.392857 -2.950000
mod0 <- lm(Rating~1, data=beer)
coef(mod0)
## (Intercept)
## 87.18182
anova(mod1,mod0) #A significant Pr(>F) value suggests that model 1 is significantly better than mod0
## Analysis of Variance Table
##
## Model 1: Rating ~ Brewery
## Model 2: Rating ~ 1
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 36 301.38
## 2 43 598.55 -7 -297.17 5.071 0.0004478 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod2<-lm(Rating~Brewery+IBU, data=beer) #add predictors with +
summary(mod2) #check for significant predictors, R^2 value, and p value
##
## Call:
## lm(formula = Rating ~ Brewery + IBU, data = beer)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.7732 -1.7126 0.0889 1.2031 5.9521
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 83.98984 1.65525 50.741 < 2e-16 ***
## BreweryBent Paddle 1.19330 1.78467 0.669 0.50811
## BreweryFulton -2.00670 1.78467 -1.124 0.26849
## BreweryIndeed 0.46958 1.67917 0.280 0.78139
## BrewerySteel Toe 1.16761 1.87697 0.622 0.53793
## BrewerySummit -1.47248 1.66443 -0.885 0.38237
## BrewerySurly 5.20257 1.66495 3.125 0.00357 **
## BreweryUrban Growler -2.59571 1.78495 -1.454 0.15479
## IBU 0.05493 0.01969 2.790 0.00848 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.654 on 35 degrees of freedom
## Multiple R-squared: 0.5881, Adjusted R-squared: 0.4939
## F-statistic: 6.246 on 8 and 35 DF, p-value: 5.108e-05
#Data Transformation
dat <- read.csv("http://www.cknudson.com/data/MacNaturalGas.csv")
#Model the amount of therms used monthly
model1 <- lm(therms ~ month, data = dat)
with(dat, plot(therms~month)) #data is parabolic
abline(mod1) #use a trasnformation to make data normal
## Warning in abline(mod1): only using the first two of 8 regression coefficients

monthsq <- (dat$month)^2
model2<- lm(therms~month + monthsq,data=dat)
summary(model2)
##
## Call:
## lm(formula = therms ~ month + monthsq, data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -224.58 -32.32 -14.41 36.10 184.27
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 549.8271 23.7119 23.19 <2e-16 ***
## month -147.6907 8.3893 -17.61 <2e-16 ***
## monthsq 10.4416 0.6314 16.54 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 68.02 on 96 degrees of freedom
## (12 observations deleted due to missingness)
## Multiple R-squared: 0.7672, Adjusted R-squared: 0.7623
## F-statistic: 158.1 on 2 and 96 DF, p-value: < 2.2e-16
anova(model2,model1)
## Analysis of Variance Table
##
## Model 1: therms ~ month + monthsq
## Model 2: therms ~ month
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 96 444221
## 2 97 1709752 -1 -1265531 273.49 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Comparing Models
#Nested means they share predictors
#Anova and LRT for NESTED models
#For not nested models, MUST use AIC or BIC
AIC(model1)
## [1] 1252.867
AIC(model2) #smaller AIC is better
## [1] 1121.437
BIC(model1)
## [1] 1260.652
BIC(model2)#smaller BIC is better
## [1] 1131.817
logLik(model1)
## 'log Lik.' -623.4335 (df=3)
logLik(model2)
## 'log Lik.' -556.7183 (df=4)