library(tidyverse)
library(openintro)
data(fastfood)
# Preview the structure of the fastfood data: one line per column with its type
# and leading values. The lines starting with `##` are the recorded output.
fastfood %>%
  glimpse()
## Rows: 515
## Columns: 17
## $ restaurant <chr> "Mcdonalds", "Mcdonalds", "Mcdonalds", "Mcdonalds", "Mcdon…
## $ item <chr> "Artisan Grilled Chicken Sandwich", "Single Bacon Smokehou…
## $ calories <dbl> 380, 840, 1130, 750, 920, 540, 300, 510, 430, 770, 380, 62…
## $ cal_fat <dbl> 60, 410, 600, 280, 410, 250, 100, 210, 190, 400, 170, 300,…
## $ total_fat <dbl> 7, 45, 67, 31, 45, 28, 12, 24, 21, 45, 18, 34, 20, 34, 8, …
## $ sat_fat <dbl> 2.0, 17.0, 27.0, 10.0, 12.0, 10.0, 5.0, 4.0, 11.0, 21.0, 4…
## $ trans_fat <dbl> 0.0, 1.5, 3.0, 0.5, 0.5, 1.0, 0.5, 0.0, 1.0, 2.5, 0.0, 1.5…
## $ cholesterol <dbl> 95, 130, 220, 155, 120, 80, 40, 65, 85, 175, 40, 95, 125, …
## $ sodium <dbl> 1110, 1580, 1920, 1940, 1980, 950, 680, 1040, 1040, 1290, …
## $ total_carb <dbl> 44, 62, 63, 62, 81, 46, 33, 49, 35, 42, 38, 48, 48, 67, 31…
## $ fiber <dbl> 3, 2, 3, 2, 4, 3, 2, 3, 2, 3, 2, 3, 3, 5, 2, 2, 3, 3, 5, 2…
## $ sugar <dbl> 11, 18, 18, 18, 18, 9, 7, 6, 7, 10, 5, 11, 11, 11, 6, 3, 1…
## $ protein <dbl> 37, 46, 70, 55, 46, 25, 15, 25, 25, 51, 15, 32, 42, 33, 13…
## $ vit_a <dbl> 4, 6, 10, 6, 6, 10, 10, 0, 20, 20, 2, 10, 10, 10, 2, 4, 6,…
## $ vit_c <dbl> 20, 20, 20, 25, 20, 2, 2, 4, 4, 6, 0, 10, 20, 15, 2, 6, 15…
## $ calcium <dbl> 20, 20, 50, 20, 20, 15, 10, 2, 15, 20, 15, 35, 35, 35, 4, …
## $ salad <chr> "Other", "Other", "Other", "Other", "Other", "Other", "Oth…
# Per-restaurant subsets of the fastfood data used in the comparisons that
# follow.
mcdonalds <- fastfood %>%
  filter(restaurant == "Mcdonalds")
dairy_queen <- fastfood %>%
  filter(restaurant == "Dairy Queen")

# Based on the two histograms, the McDonald's data are more strongly
# right-skewed, with the bulk of the data on the left. The Dairy Queen data are
# somewhat more centrally located, with only a minor right skew. That said, the
# McDonald's data span 1400 calories, whereas the Dairy Queen data span only
# 700, so the bulk of the data actually lies in a similar place for both data
# sets: between 100 and 500 calories.
# Base-graphics histograms of calories from fat for each restaurant.
hist(mcdonalds$cal_fat)
hist(dairy_queen$cal_fat)

# Sample mean and standard deviation of Dairy Queen's calories from fat; these
# parameterise the normal curve overlaid on the histogram below.
dqmean <- mean(dairy_queen$cal_fat)
dqsd <- sd(dairy_queen$cal_fat)

# Density-scaled histogram with the matching normal density drawn on top.
# - after_stat(density) replaces the dot-dot notation `..density..`, which was
#   deprecated in ggplot2 3.4.0.
# - bins = 30 spells out the value stat_bin() was already using, which silences
#   the "Pick better value with `binwidth`" message without changing the plot.
# - args is passed as a list, the documented form for stat_function().
ggplot(data = dairy_queen, aes(x = cal_fat)) +
  geom_blank() +
  geom_histogram(aes(y = after_stat(density)), bins = 30) +
  stat_function(
    fun = dnorm,
    args = list(mean = dqmean, sd = dqsd),
    col = "tomato"
  )
# Based on this plot, the data do not follow a nearly normal distribution: several histogram bars rise well above the normal curve, there are significant gaps between bars, and the right tail extends beyond the curve.
# Normal Q-Q plot of the observed Dairy Queen calories from fat.
ggplot(data = dairy_queen, aes(sample = cal_fat)) +
  geom_line(stat = "qq")

library(ggplot2)

# Seed the random number generator so the simulated sample and the qqnormsim()
# panels are the same on every run; the commentary below refers to individual
# panels, which is only meaningful if they are reproducible.
# NOTE(review): the seed value is arbitrary, and the panel numbers cited in the
# commentary come from the original unseeded run. Re-check them after
# re-rendering.
set.seed(1234)

# One simulated normal sample with the same size, mean and standard deviation
# as the observed data.
sim_norm <- rnorm(n = nrow(dairy_queen), mean = dqmean, sd = dqsd)

# sim_norm is not a column of dairy_queen; aes() picks it up from the
# surrounding environment, which works because it has one value per row.
ggplot(data = dairy_queen, aes(sample = sim_norm)) +
  geom_line(stat = "qq")

# The plots are similar, though the center portion of the simulated data is
# closer to the line than that of the real data. The simulated data form more
# of a “straight” line, whereas the real data show several curves and
# departures.

# Q-Q plot of the real data alongside Q-Q plots of simulated normal samples.
qqnormsim(sample = cal_fat, data = dairy_queen)

# The plot for the real data does look similar to the plots for the simulated
# data. None of the plots is an exact match for another, but the real data are
# quite similar to sim 7 and sim 8 in particular, especially in the lower and
# upper tails. I would say the plots provide evidence that the calories from
# fat are nearly normal.
# Theoretical tail probability: P(cal_fat > 600) under a normal model using the
# Dairy Queen mean and standard deviation.
1 - pnorm(600, mean = dqmean, sd = dqsd)
## [1] 0.01501523

# Empirical counterpart: the share of Dairy Queen items whose calories from fat
# exceed 600 (rows with a missing comparison are not counted, as with filter()).
dairy_queen %>%
  summarise(percent = sum(cal_fat > 600, na.rm = TRUE) / n())
## # A tibble: 1 × 1
## percent
## <dbl>
## 1 0.0476
# Subsets for the two additional restaurants being compared.
Chick_fil_a <- fastfood %>%
  filter(restaurant == "Chick Fil-A")
arbys <- fastfood %>%
  filter(restaurant == "Arbys")

# Sample mean and standard deviation of calories from fat for each restaurant;
# these parameterise the normal models used in the probabilities below.
cfmean <- mean(Chick_fil_a$cal_fat)
cfsd <- sd(Chick_fil_a$cal_fat)
armean <- mean(arbys$cal_fat)
arsd <- sd(arbys$cal_fat)

# Normal Q-Q plot of the observed Chick Fil-A calories from fat.
ggplot(data = Chick_fil_a, aes(sample = cal_fat)) +
  geom_line(stat = "qq")

# Q-Q plot of the real data alongside Q-Q plots of simulated normal samples.
qqnormsim(sample = cal_fat, data = Chick_fil_a)

# Theoretical P(cal_fat > 300) under the normal model.
1 - pnorm(q = 300, mean = cfmean, sd = cfsd)
## [1] 0.06543432

# Empirical proportion of Chick Fil-A items with more than 300 calories from
# fat.
Chick_fil_a %>%
  filter(cal_fat > 300) %>%
  summarise(percent = n() / nrow(Chick_fil_a))
## # A tibble: 1 × 1
## percent
## <dbl>
## 1 0.0741
# Normal Q-Q plot of the observed Arby's calories from fat.
ggplot(data = arbys, aes(sample = cal_fat)) +
  geom_line(stat = "qq")

# Q-Q plot of the real data alongside Q-Q plots of simulated normal samples.
qqnormsim(sample = cal_fat, data = arbys)

# Theoretical P(cal_fat > 800) under a normal model with the Arby's mean and
# standard deviation.
1 - pnorm(q = 800, mean = armean, sd = arsd)
## [1] 3.39207e-07

# Empirical proportion of Arby's items with more than 800 calories from fat.
arbys %>%
  filter(cal_fat > 800) %>%
  summarise(percent = n() / nrow(arbys))
## # A tibble: 1 × 1
## percent
## <dbl>
## 1 0