UPDATED: FALL 2021

Motivating Example: Breakfast Cereal

The data are a sample of 77 breakfast cereals.

# Load the cereal data from the course GitHub repository.
# stringsAsFactors = TRUE is spelled out because R >= 4.0 reads text columns
# as character by default; the structure printed below (Shelf, Name,
# Manufacturer and Type as factors) relies on them being factors.
cereal <- read.csv(
  "https://raw.githubusercontent.com/kitadasmalley/MATH138/main/HAWKES/Data/cerealDat.csv",
  header = TRUE,
  stringsAsFactors = TRUE
)
# Inspect the column names, types and first few values.
str(cereal)
## 'data.frame':    77 obs. of  15 variables:
##  $ Shelf                      : Factor w/ 3 levels "Bottom","Middle",..: 3 3 3 3 3 1 2 3 1 3 ...
##  $ Name                       : Factor w/ 77 levels "100%_Bran","100%_Natural_Bran",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ Manufacturer               : Factor w/ 7 levels "A","G","K","N",..: 4 6 3 3 7 2 3 2 7 5 ...
##  $ Type                       : Factor w/ 2 levels "C","H": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Calories                   : int  70 120 70 50 110 110 110 130 90 90 ...
##  $ Protein                    : int  4 3 4 4 2 2 2 3 2 3 ...
##  $ Fat                        : int  1 5 1 0 2 2 0 2 1 0 ...
##  $ Sodium                     : int  130 15 260 140 200 180 125 210 200 210 ...
##  $ Fiber                      : num  10 2 9 14 1 1.5 1 2 4 5 ...
##  $ Carbohydrates              : num  5 8 7 8 14 10.5 11 18 15 13 ...
##  $ Sugars                     : int  6 8 5 0 8 10 14 8 6 5 ...
##  $ Potassium                  : int  280 135 320 330 NA 70 30 100 125 190 ...
##  $ Vitamins                   : int  25 0 25 25 25 25 25 25 25 25 ...
##  $ Weight..of.One.Serving.Cup.: num  1 1 1 1 1 1 1 1.33 1 1 ...
##  $ Cups.in.Serving            : num  0.33 1 0.33 0.5 0.75 0.75 1 0.75 0.67 0.67 ...

Comparing Multiple Means

It is commonly thought that cereals are displayed in certain shelf locations at a market to draw the attention of children. Make a hypothesis about shelf location (Shelf) and the sugar content in a serving of cereal.

library(ggplot2)
# Side-by-side boxplots of sugar content, one box (and fill colour) per shelf.
ggplot(data = cereal, mapping = aes(x = Shelf, y = Sugars)) +
  geom_boxplot(mapping = aes(fill = Shelf))

One-Way ANOVA

Testing Multiple Means

# One-way ANOVA: fit sugar content on shelf position, then print the ANOVA
# table with the F test for the Shelf term.
mod <- lm(formula = Sugars ~ Shelf, data = cereal)
anova(mod)
## Analysis of Variance Table
## 
## Response: Sugars
##           Df  Sum Sq Mean Sq F value   Pr(>F)   
## Shelf      2  220.23 110.117  6.6013 0.002316 **
## Residuals 73 1217.71  16.681                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Regression Modeling

Look at the data: Scatterplot

library(tidyverse)

# Scatterplot of calories against sugar content, one point per cereal.
ggplot(data = cereal, mapping = aes(x = Sugars, y = Calories)) +
  theme_bw() +
  geom_point()

Fit a Simple Linear Regression Model

# Same scatterplot with the least-squares line overlaid; se = FALSE turns off
# the confidence band around the line.
ggplot(data = cereal, mapping = aes(x = Sugars, y = Calories)) +
  theme_bw() +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).

# Simple linear regression of calories on sugars. This reuses the name `mod`,
# replacing the earlier Sugars ~ Shelf fit.
mod <- lm(formula = Calories ~ Sugars, data = cereal)
summary(mod)
## 
## Call:
## lm(formula = Calories ~ Sugars, data = cereal)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -39.158  -9.585   0.486  11.441  37.879 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  89.1578     3.5429  25.165  < 2e-16 ***
## Sugars        2.5356     0.4287   5.914 9.58e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 16.26 on 74 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.321,  Adjusted R-squared:  0.3118 
## F-statistic: 34.98 on 1 and 74 DF,  p-value: 9.581e-08
# ANOVA table for the Calories ~ Sugars fit: sums of squares for the Sugars
# term and the residuals, with the F test for the term.
anova(mod)
## Analysis of Variance Table
## 
## Response: Calories
##           Df  Sum Sq Mean Sq F value    Pr(>F)    
## Sugars     1  9244.9  9244.9  34.977 9.581e-08 ***
## Residuals 74 19559.0   264.3                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Check the model assumptions

# Draw the standard diagnostic plots for the fitted lm object, used here to
# check the regression assumptions.
plot(mod)

Inference for Regression

Hypothesis Tests

# Print the coefficient table again; the "t value" and "Pr(>|t|)" columns give
# the hypothesis test for each coefficient.
summary(mod)
## 
## Call:
## lm(formula = Calories ~ Sugars, data = cereal)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -39.158  -9.585   0.486  11.441  37.879 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  89.1578     3.5429  25.165  < 2e-16 ***
## Sugars        2.5356     0.4287   5.914 9.58e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 16.26 on 74 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.321,  Adjusted R-squared:  0.3118 
## F-statistic: 34.98 on 1 and 74 DF,  p-value: 9.581e-08

Confidence Interval for Slope

# 95% confidence interval for the slope by hand: estimate -/+ t* x SE, using
# the rounded Sugars estimate and standard error from the summary table and
# the residual degrees of freedom (74).
slope_est <- 2.5356
slope_se <- 0.4287
margin <- qt(0.975, df = 74) * slope_se
c(slope_est - margin, slope_est + margin)
## [1] 1.681397 3.389803
# Same intervals computed directly from the fitted model (unrounded inputs),
# with the default 95% level written out.
confint(mod, level = 0.95)
##                 2.5 %    97.5 %
## (Intercept) 82.098338 96.217243
## Sugars       1.681327  3.389863

Multiple Linear Regression

Pairs plots

In Base R
# Scatterplot matrix of the response (Calories) and four candidate predictors.
pairs(
  Calories ~ Sugars + Fat + Protein + Carbohydrates,
  data = cereal
)

In GGally
library(GGally)

# GGally version of the pairs plot; the columns are picked by name rather than
# position, in the same order as before.
pairs_vars <- c("Calories", "Sugars", "Fat", "Protein", "Carbohydrates")
ggpairs(cereal[, pairs_vars])

Model the Data

# Multiple regression: calories modelled on sugars and carbohydrates together.
mod2 <- lm(formula = Calories ~ Sugars + Carbohydrates, data = cereal)
summary(mod2)
## 
## Call:
## lm(formula = Calories ~ Sugars + Carbohydrates, data = cereal)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -25.670  -7.790  -1.998   5.136  32.178 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    29.1064     7.1786   4.055 0.000124 ***
## Sugars          3.9575     0.3387  11.683  < 2e-16 ***
## Carbohydrates   3.3819     0.3796   8.909 2.76e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.33 on 73 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.6747, Adjusted R-squared:  0.6658 
## F-statistic: 75.69 on 2 and 73 DF,  p-value: < 2.2e-16
3D Scatterplot
#install.packages("scatterplot3d")
library(scatterplot3d)
# 3D scatterplot with the two predictors on the x- and y-axes and the response
# on the z-axis. plane3d() reads an lm fit as z = b0 + b1 * x + b2 * y, so the
# column order must follow mod2 (Calories ~ Sugars + Carbohydrates):
# x = Sugars, y = Carbohydrates, z = Calories. Passing Calories first would
# put the response on the x-axis and the plane would not match the points.
s3d <- scatterplot3d(cereal[, c("Sugars", "Carbohydrates", "Calories")])
# Overlay the fitted regression plane from mod2.
s3d$plane3d(mod2)