library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.8 ✔ dplyr 1.0.9
## ✔ tidyr 1.2.0 ✔ stringr 1.4.1
## ✔ readr 2.1.2 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(openintro)
## Loading required package: airports
## Loading required package: cherryblossom
## Loading required package: usdata
# Load the `fastfood` data set from the openintro package and preview it.
data(list = "fastfood", package = "openintro")
fastfood %>%
  head()
## # A tibble: 6 × 17
## restaur…¹ item calor…² cal_fat total…³ sat_fat trans…⁴ chole…⁵ sodium total…⁶
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Mcdonalds Arti… 380 60 7 2 0 95 1110 44
## 2 Mcdonalds Sing… 840 410 45 17 1.5 130 1580 62
## 3 Mcdonalds Doub… 1130 600 67 27 3 220 1920 63
## 4 Mcdonalds Gril… 750 280 31 10 0.5 155 1940 62
## 5 Mcdonalds Cris… 920 410 45 12 0.5 120 1980 81
## 6 Mcdonalds Big … 540 250 28 10 1 80 950 46
## # … with 7 more variables: fiber <dbl>, sugar <dbl>, protein <dbl>,
## # vit_a <dbl>, vit_c <dbl>, calcium <dbl>, salad <chr>, and abbreviated
## # variable names ¹restaurant, ²calories, ³total_fat, ⁴trans_fat,
## # ⁵cholesterol, ⁶total_carb
# One subset per restaurant; both are compared on `cal_fat` below.
mcdonalds <- filter(fastfood, restaurant == "Mcdonalds")
dairy_queen <- filter(fastfood, restaurant == "Dairy Queen")
# Numeric summary of `cal_fat` for the McDonald's items.
mcdonalds %>%
  pull(cal_fat) %>%
  summary()
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 50.0 160.0 240.0 285.6 320.0 1270.0
# hist() builds its title and x-axis label from the argument expression, so
# the call stays in `data$column` form to keep those labels unchanged.
hist(mcdonalds$cal_fat)
# Same summary and histogram for the Dairy Queen items.
dairy_queen %>%
  pull(cal_fat) %>%
  summary()
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 160.0 220.0 260.5 310.0 670.0
hist(dairy_queen$cal_fat)
### The Normal Distribution
# Sample mean and standard deviation of Dairy Queen `cal_fat`; they
# parameterise the normal curve drawn over the histogram below.
dqmean <- mean(dairy_queen$cal_fat)
dqsd <- sd(dairy_queen$cal_fat)

# Density-scaled histogram with the matching normal density overlaid.
# after_stat(density) is the supported way to map a computed statistic
# (the `..density..` dot-dot notation is deprecated), and `args` is given
# as a list, which is what stat_function() documents.
ggplot(data = dairy_queen, aes(x = cal_fat)) +
  geom_blank() +
  geom_histogram(aes(y = after_stat(density))) +
  stat_function(
    fun = dnorm,
    args = list(mean = dqmean, sd = dqsd),
    colour = "tomato"
  )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
### Exercise 2
# Normal Q-Q plot of the observed Dairy Queen `cal_fat` values.
ggplot(data = dairy_queen, aes(sample = cal_fat)) +
  geom_line(stat = "qq")

# Draw a normal sample with the same size, mean and sd as the observed data.
# Seeding the generator keeps the simulated plots identical between runs.
set.seed(2022)
sim_norm <- rnorm(n = nrow(dairy_queen), mean = dqmean, sd = dqsd)

# The simulated values are not a column of `dairy_queen`, so plot them from
# their own data frame instead of relying on a same-length global vector.
ggplot(data = data.frame(sim_norm = sim_norm), aes(sample = sim_norm)) +
  geom_line(stat = "qq")

# Observed Q-Q plot alongside several simulated normal Q-Q plots.
qqnormsim(sample = cal_fat, data = dairy_queen)
#### Exercise 4
# Theoretical normal distribution: probability that a Taco Bell item exceeds
# 500 calories under a normal model with the sample mean and sd.
Taco_Bell <- fastfood %>%
  filter(restaurant == "Taco Bell")
cal_mean <- mean(Taco_Bell$calories)
cal_sd <- sd(Taco_Bell$calories)
# lower.tail = FALSE returns P(X > q) directly rather than 1 - P(X <= q).
pnorm(q = 500, mean = cal_mean, sd = cal_sd, lower.tail = FALSE)
## [1] 0.3799298
# Empirical probability: share of Taco Bell items with more than 500 calories.
Taco_Bell %>%
  summarise(percent = sum(calories > 500, na.rm = TRUE) / n())
## # A tibble: 1 × 1
## percent
## <dbl>
## 1 0.374
# Theoretical normal distribution: probability that a Sonic item exceeds
# 500 calories under a normal model with the sample mean and sd.
Sonic <- fastfood %>%
  filter(restaurant == "Sonic")
# NOTE: `cal_mean` and `cal_sd` are reused here and now hold the Sonic values.
cal_mean <- mean(Sonic$calories)
cal_sd <- sd(Sonic$calories)
# lower.tail = FALSE returns P(X > q) directly rather than 1 - P(X <= q).
pnorm(q = 500, mean = cal_mean, sd = cal_sd, lower.tail = FALSE)
## [1] 0.6692008
# Empirical probability: share of Sonic items with more than 500 calories.
Sonic %>%
  summarise(percent = sum(calories > 500, na.rm = TRUE) / n())
## # A tibble: 1 × 1
## percent
## <dbl>
## 1 0.547
# Normal Q-Q plot of `sodium`, one panel per restaurant.
# facet_wrap() performs the per-restaurant split, so no group_by() step is
# needed before handing the data to ggplot().
fastfood %>%
  ggplot(aes(sample = sodium)) +
  geom_line(stat = "qq") +
  facet_wrap(~restaurant)
#### While searching through the internet for a way to do a group-by
of all 8 restaurants, I found the Shapiro-Wilk normality test, which I
use below. The Shapiro-Wilk test is generally regarded as one of the most
powerful tests when testing for a normal distribution. The function
produces a test statistic W along with a corresponding p-value. If the
p-value is less than α = .05, there is sufficient evidence to say that
the sample does not come from a population that is normally distributed.
If the p-value is greater than .05, we fail to reject normality: the data
are consistent with a normal distribution, although this does not prove it.
# Per-restaurant subsets for the remaining chains.
arbys <- fastfood %>%
  filter(restaurant == "Arbys")
burger_king <- fastfood %>%
  filter(restaurant == "Burger King")
# The data spells the chain "Chick Fil-A"; any other spelling matches no rows
# and leaves this subset empty.
chick_fill_a <- fastfood %>%
  filter(restaurant == "Chick Fil-A")
subway <- fastfood %>%
  filter(restaurant == "Subway")
# install.packages("onewaytests")
# onewaytests provides nor.test(), used for the per-restaurant check below.
library(onewaytests)
# Shapiro-Wilk normality test (alpha = 0.05) of `sodium` within each restaurant.
onewaytests::nor.test(formula = sodium ~ restaurant, data = fastfood)
##
## Shapiro-Wilk Normality Test (alpha = 0.05)
## --------------------------------------------------
## data : sodium and restaurant
##
## Level Statistic p.value Normality
## 1 Arbys 0.9707314 1.985112e-01 Not reject
## 2 Burger King 0.9729123 1.331376e-01 Not reject
## 3 Chick Fil-A 0.8666284 2.502900e-03 Reject
## 4 Dairy Queen 0.8450386 4.714713e-05 Reject
## 5 Mcdonalds 0.7692179 4.458093e-08 Reject
## 6 Sonic 0.8228571 1.783838e-06 Reject
## 7 Subway 0.9217523 2.514761e-05 Reject
## 8 Taco Bell 0.9550146 6.989912e-04 Reject
## --------------------------------------------------
# Base-R cross-check: one shapiro.test() on `sodium` per restaurant.
with(fastfood, tapply(X = sodium, INDEX = restaurant, FUN = shapiro.test))
## $Arbys
##
## Shapiro-Wilk normality test
##
## data: X[[i]]
## W = 0.97073, p-value = 0.1985
##
##
## $`Burger King`
##
## Shapiro-Wilk normality test
##
## data: X[[i]]
## W = 0.97291, p-value = 0.1331
##
##
## $`Chick Fil-A`
##
## Shapiro-Wilk normality test
##
## data: X[[i]]
## W = 0.86663, p-value = 0.002503
##
##
## $`Dairy Queen`
##
## Shapiro-Wilk normality test
##
## data: X[[i]]
## W = 0.84504, p-value = 4.715e-05
##
##
## $Mcdonalds
##
## Shapiro-Wilk normality test
##
## data: X[[i]]
## W = 0.76922, p-value = 4.458e-08
##
##
## $Sonic
##
## Shapiro-Wilk normality test
##
## data: X[[i]]
## W = 0.82286, p-value = 1.784e-06
##
##
## $Subway
##
## Shapiro-Wilk normality test
##
## data: X[[i]]
## W = 0.92175, p-value = 2.515e-05
##
##
## $`Taco Bell`
##
## Shapiro-Wilk normality test
##
## data: X[[i]]
## W = 0.95501, p-value = 0.000699
# Normal Q-Q plot of `total_carb` for the Taco Bell items.
Taco_Bell %>%
  ggplot(mapping = aes(sample = total_carb)) +
  geom_line(stat = "qq")
# Sample mean and standard deviation of Taco Bell `total_carb`; they
# parameterise the normal curve drawn over the histogram below.
tbmean <- mean(Taco_Bell$total_carb)
tbsd <- sd(Taco_Bell$total_carb)

# Density-scaled histogram with the matching normal density overlaid.
# after_stat(density) is the supported way to map a computed statistic
# (the `..density..` dot-dot notation is deprecated), and `args` is given
# as a list, which is what stat_function() documents.
ggplot(data = Taco_Bell, aes(x = total_carb)) +
  geom_blank() +
  geom_histogram(aes(y = after_stat(density))) +
  stat_function(
    fun = dnorm,
    args = list(mean = tbmean, sd = tbsd),
    colour = "tomato"
  )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.