RMD-1.R

#Read in CSV file
beer <- read.csv("http://www.cknudson.com/data/MNbeer.csv")

#View the dataset
names(beer)

## [1] "Brewery"     "Beer"        "Description" "Style"       "ABV"        
## [6] "IBU"         "Rating"      "Good"

str(beer)

## 'data.frame':    44 obs. of  8 variables:
##  $ Brewery    : chr  "Bauhaus" "Bauhaus" "Bauhaus" "Bauhaus" ...
##  $ Beer       : chr  "Wonderstuff" "Stargazer" "Wagon Party" "Sky-Five!" ...
##  $ Description: chr  "New Bohemian Pilsner" "German Style Schwarzbier" "West Cost Style Lager" "Midwest Coast IPA" ...
##  $ Style      : chr  "Lager" "Lager" "Lager" "IPA" ...
##  $ ABV        : num  5.4 5 5.4 6.7 4.8 5 6.2 5.6 6 5.4 ...
##  $ IBU        : int  48 28 55 70 48 38 68 32 34 45 ...
##  $ Rating     : int  88 87 86 86 85 87 89 88 89 90 ...
##  $ Good       : int  0 0 0 0 0 0 0 0 0 1 ...

summary(beer)

##    Brewery              Beer           Description           Style          
##  Length:44          Length:44          Length:44          Length:44         
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##       ABV             IBU            Rating           Good       
##  Min.   :4.200   Min.   :15.00   Min.   :79.00   Min.   :0.0000  
##  1st Qu.:5.200   1st Qu.:33.00   1st Qu.:85.00   1st Qu.:0.0000  
##  Median :5.600   Median :48.50   Median :87.00   Median :0.0000  
##  Mean   :5.818   Mean   :51.07   Mean   :87.18   Mean   :0.2727  
##  3rd Qu.:6.500   3rd Qu.:68.25   3rd Qu.:90.00   3rd Qu.:1.0000  
##  Max.   :7.500   Max.   :99.00   Max.   :98.00   Max.   :1.0000

#plot data to check for normality
with(beer,plot(IBU~Rating))

#Simple linear regression
mod1 <- lm(Rating~IBU, data=beer)
qqnorm(mod1$residuals) #easiest way to check for normality is by checking residuals
qqline(mod1$residuals) #If residuals fall along the line can assume residuals are approximately normal

#Multi Linear Regression and Anova
mod1 <- lm(Rating~Brewery, data=beer)
coef(mod1)

##          (Intercept)   BreweryBent Paddle        BreweryFulton 
##            86.750000             0.850000            -2.350000 
##        BreweryIndeed     BrewerySteel Toe        BrewerySummit 
##             1.107143             1.250000            -1.321429 
##         BrewerySurly BreweryUrban Growler 
##             5.392857            -2.950000

mod0 <- lm(Rating~1, data=beer)
coef(mod0)

## (Intercept) 
##    87.18182

anova(mod1,mod0) #A significant Pr(>F) value suggests that model 1 is significantly better than mod0

## Analysis of Variance Table
## 
## Model 1: Rating ~ Brewery
## Model 2: Rating ~ 1
##   Res.Df    RSS Df Sum of Sq     F    Pr(>F)    
## 1     36 301.38                                 
## 2     43 598.55 -7   -297.17 5.071 0.0004478 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

mod2<-lm(Rating~Brewery+IBU, data=beer) #add predictors with +
summary(mod2) #check for significant predictors, R^2 value, and p value

## 
## Call:
## lm(formula = Rating ~ Brewery + IBU, data = beer)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.7732 -1.7126  0.0889  1.2031  5.9521 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          83.98984    1.65525  50.741  < 2e-16 ***
## BreweryBent Paddle    1.19330    1.78467   0.669  0.50811    
## BreweryFulton        -2.00670    1.78467  -1.124  0.26849    
## BreweryIndeed         0.46958    1.67917   0.280  0.78139    
## BrewerySteel Toe      1.16761    1.87697   0.622  0.53793    
## BrewerySummit        -1.47248    1.66443  -0.885  0.38237    
## BrewerySurly          5.20257    1.66495   3.125  0.00357 ** 
## BreweryUrban Growler -2.59571    1.78495  -1.454  0.15479    
## IBU                   0.05493    0.01969   2.790  0.00848 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.654 on 35 degrees of freedom
## Multiple R-squared:  0.5881, Adjusted R-squared:  0.4939 
## F-statistic: 6.246 on 8 and 35 DF,  p-value: 5.108e-05

#Data Transformation
dat <- read.csv("http://www.cknudson.com/data/MacNaturalGas.csv")

#Model the amount of therms used monthly
model1 <- lm(therms ~ month, data = dat)
with(dat, plot(therms~month)) #data is parabolic
abline(mod1) #use a trasnformation to make data normal

## Warning in abline(mod1): only using the first two of 8 regression coefficients

monthsq <- (dat$month)^2
model2<- lm(therms~month + monthsq,data=dat)
summary(model2)

## 
## Call:
## lm(formula = therms ~ month + monthsq, data = dat)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -224.58  -32.32  -14.41   36.10  184.27 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  549.8271    23.7119   23.19   <2e-16 ***
## month       -147.6907     8.3893  -17.61   <2e-16 ***
## monthsq       10.4416     0.6314   16.54   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 68.02 on 96 degrees of freedom
##   (12 observations deleted due to missingness)
## Multiple R-squared:  0.7672, Adjusted R-squared:  0.7623 
## F-statistic: 158.1 on 2 and 96 DF,  p-value: < 2.2e-16

anova(model2,model1)

## Analysis of Variance Table
## 
## Model 1: therms ~ month + monthsq
## Model 2: therms ~ month
##   Res.Df     RSS Df Sum of Sq      F    Pr(>F)    
## 1     96  444221                                  
## 2     97 1709752 -1  -1265531 273.49 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

#Comparing Models
#Nested means they share predictors
#Anova and LRT for NESTED models
#For not nested models, MUST use AIC or BIC

AIC(model1)

## [1] 1252.867

AIC(model2) #smaller AIC is better

## [1] 1121.437

BIC(model1)

## [1] 1260.652

BIC(model2)#smaller BIC is better

## [1] 1131.817

logLik(model1)

## 'log Lik.' -623.4335 (df=3)

logLik(model2)

## 'log Lik.' -556.7183 (df=4)

RMD-1.R

nicholastourtillott

2021-02-14