Motivating Example: Breakfast Cereal

The data are a sample of 30 breakfast cereals.

# Load the cereal data set from the Lock5 site; read.delim() expects a
# tab-delimited file, and header = TRUE takes column names from the first row.
cereal <- read.delim(
  file = "https://www.lock5stat.com/datasets/Cereal.txt",
  header = TRUE
)
# Preview the first six rows to confirm the columns were parsed as expected.
head(cereal, n = 6L)
##                    Name Company Serving Calories Fat Sodium Carbs Fiber Sugars
## 1            AppleJacks       K    1.00      117 0.6    143    27   0.5   15.0
## 2             Boo Berry       G    1.00      118 0.8    211    27   0.1   14.0
## 3          Cap'n Crunch       Q    0.75      144 2.1    269    31   1.1   16.0
## 4 Cinnamon Toast Crunch       G    0.75      169 4.4    408    32   1.7   13.3
## 5          Cocoa Blasts       Q    1.00      130 1.2    135    29   0.8   16.0
## 6           Cocoa Puffs       G    1.00      117 1.0    171    26   0.8   14.0
##   Protein
## 1     1.0
## 2     1.0
## 3     1.3
## 4     2.7
## 5     1.0
## 6     1.0

Look at the data: Scatterplot

library(tidyverse)

# Scatterplot of calories (y) against sugars (x), one point per cereal.
ggplot(data = cereal, mapping = aes(x = Sugars, y = Calories)) +
  geom_point() +
  theme_bw()

Fit a Simple Linear Regression Model

# Same scatterplot with the least-squares line overlaid; se = FALSE hides
# the confidence band around the fitted line.
ggplot(data = cereal, mapping = aes(x = Sugars, y = Calories)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  theme_bw()
## `geom_smooth()` using formula 'y ~ x'

# Simple linear regression of Calories on Sugars.
mod <- lm(formula = Calories ~ Sugars, data = cereal)
# Coefficient table, residual standard error, R-squared and overall F test.
summary(object = mod)
## 
## Call:
## lm(formula = Calories ~ Sugars, data = cereal)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -36.574 -25.282  -2.549  17.796  51.805 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  88.9204    10.8120   8.224 5.96e-09 ***
## Sugars        4.3103     0.9269   4.650 7.22e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 26.61 on 28 degrees of freedom
## Multiple R-squared:  0.4357, Adjusted R-squared:  0.4156 
## F-statistic: 21.62 on 1 and 28 DF,  p-value: 7.217e-05
# ANOVA table: splits the variation in Calories into the part explained by
# Sugars and the residual part.
anova(object = mod)
## Analysis of Variance Table
## 
## Response: Calories
##           Df Sum Sq Mean Sq F value    Pr(>F)    
## Sugars     1  15316 15316.5  21.623 7.217e-05 ***
## Residuals 28  19834   708.3                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Check the model assumptions

# Normal Q-Q plot of the residuals with a reference line; points falling
# close to the line support the normal-errors assumption.
qqnorm(y = mod$residuals)
qqline(y = mod$residuals)

# Residuals against the predictor: look for curvature or changing spread
# around the dashed zero line.
# Use the bare column name `Sugars` inside aes(): `cereal$Sugar` relies on
# `$` partial matching and makes ggplot2 warn. resid(mod) supplies the
# residuals, and labs() gives the axes readable titles.
ggplot(data = cereal, mapping = aes(x = Sugars, y = resid(mod))) +
  geom_point() +
  ggtitle("Residual Plot") +
  labs(x = "Sugars", y = "Residual") +
  theme_bw() +
  geom_hline(yintercept = 0,
             color = "blue", lty = 2, lwd = 1)

Inference for Regression

Hypothesis Tests

# Re-display the fit: in the coefficient table, the "t value" and "Pr(>|t|)"
# columns test the null hypothesis that each coefficient equals zero.
summary(object = mod)
## 
## Call:
## lm(formula = Calories ~ Sugars, data = cereal)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -36.574 -25.282  -2.549  17.796  51.805 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  88.9204    10.8120   8.224 5.96e-09 ***
## Sugars        4.3103     0.9269   4.650 7.22e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 26.61 on 28 degrees of freedom
## Multiple R-squared:  0.4357, Adjusted R-squared:  0.4156 
## F-statistic: 21.62 on 1 and 28 DF,  p-value: 7.217e-05

Confidence Intervals

# 95% confidence interval for the Sugars slope, computed by hand as
#   estimate +/- t* x (standard error),  with t* = qt(0.975, residual df).
# The estimate, standard error and degrees of freedom are read from the
# fitted model instead of being retyped from the printed summary, so no
# rounding error creeps in and the result agrees with confint() below.
coef(summary(mod))["Sugars", "Estimate"] +
  c(-1, 1) * qt(0.975, df = df.residual(mod)) *
    coef(summary(mod))["Sugars", "Std. Error"]
## [1] 2.411535 6.208987
# Built-in equivalent: 95% intervals for every coefficient in the model.
confint(mod)
##                 2.5 %     97.5 %
## (Intercept) 66.772987 111.067837
## Sugars       2.411535   6.208987

Multiple Linear Regression

# Scatterplot matrix of the response and four candidate predictors, to
# eyeball pairwise relationships before fitting a multiple regression.
pairs(Calories ~ Sugars + Fat + Protein + Carbs, data = cereal)

# Multiple linear regression of Calories on Sugars and Carbs.
mod2 <- lm(formula = Calories ~ Sugars + Carbs, data = cereal)
# Each coefficient's t test is conditional on the other predictor being in
# the model.
summary(object = mod2)
## 
## Call:
## lm(formula = Calories ~ Sugars + Carbs, data = cereal)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -17.763  -6.711  -1.976   2.273  28.015 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   4.4568     9.1821   0.485    0.631    
## Sugars        0.6528     0.5323   1.226    0.231    
## Carbs         4.1317     0.3834  10.776 2.79e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.77 on 27 degrees of freedom
## Multiple R-squared:  0.8935, Adjusted R-squared:  0.8857 
## F-statistic: 113.3 on 2 and 27 DF,  p-value: 7.355e-14
# install.packages("scatterplot3d")  # run once if the package is missing
library(scatterplot3d)
# 3-D scatterplot with the fitted regression plane from mod2.
# Columns are selected by name rather than by position, in the order
# x = Sugars, y = Carbs, z = Calories. This order must match the predictor
# order in mod2 (Calories ~ Sugars + Carbs), because plane3d() reads the
# model coefficients as intercept, x coefficient, y coefficient.
s3d <- scatterplot3d(cereal[, c("Sugars", "Carbs", "Calories")])
s3d$plane3d(mod2)