library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.8 ✔ dplyr 1.0.9
## ✔ tidyr 1.2.0 ✔ stringr 1.4.1
## ✔ readr 2.1.2 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(openintro)
## Loading required package: airports
## Loading required package: cherryblossom
## Loading required package: usdata
# Load the fastfood data set from the openintro package and preview it.
data("fastfood", package = "openintro")
head(fastfood)

# Build one data frame per restaurant used in the comparisons below.
mcdonalds <- filter(fastfood, restaurant == "Mcdonalds")
dairy_queen <- filter(fastfood, restaurant == "Dairy Queen")
# Histograms of cal_fat for each restaurant, both drawn with binwidth = 15
# so the two plots can be compared directly.
dairy_queen %>%
  ggplot(aes(x = cal_fat)) +
  geom_histogram(binwidth = 15)

mcdonalds %>%
  ggplot(aes(x = cal_fat)) +
  geom_histogram(binwidth = 15)
# Count how many menu items share each cal_fat value, per restaurant.
dairy_queen_counts <- count(
  filter(fastfood, restaurant == "Dairy Queen"),
  cal_fat
)
mcdonalds_counts <- count(
  filter(fastfood, restaurant == "Mcdonalds"),
  cal_fat
)

# Overlay both sets of counts: Dairy Queen in blue, McDonald's in red.
# The shared mapping is declared once and inherited by both layers.
ggplot(mapping = aes(x = cal_fat, y = n)) +
  geom_point(data = dairy_queen_counts, color = "blue") +
  geom_point(data = mcdonalds_counts, color = "red")
# Counts of items per (restaurant, cal_fat) pair for the two restaurants,
# kept in a single data frame so the plot can color points by restaurant.
fastFoodSubset <- fastfood %>%
  filter(restaurant %in% c("Mcdonalds", "Dairy Queen")) %>%
  count(restaurant, cal_fat)
fastFoodSubset

ggplot(data = fastFoodSubset) +
  geom_point(aes(x = cal_fat, y = n, color = restaurant))
When comparing and contrasting the data, McDonald's has a wider spread as well as significantly more options. This may be due to the menu composition, as DQ appears to target ice cream whereas McDonald's offers a mix of both food and ice cream. If we remove the extreme McDonald's values, their centers should be approximately the same, and their shapes are remarkably close to each other.
# Sample mean and standard deviation of Dairy Queen cal_fat; they
# parameterize the normal curve overlaid on the histogram below.
dqmean <- mean(dairy_queen$cal_fat)
dqsd <- sd(dairy_queen$cal_fat)

# Histogram on the density scale with the matching normal density on top.
# after_stat(density) replaces the deprecated ..density.. notation, and
# stat_function() receives its extra arguments as a list.
ggplot(data = dairy_queen, aes(x = cal_fat)) +
  geom_blank() +
  geom_histogram(aes(y = after_stat(density))) +
  stat_function(
    fun = dnorm,
    args = list(mean = dqmean, sd = dqsd),
    col = "tomato"
  )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Looking at the plot, the cal_fat for DQ appears to follow a nearly normal distribution until the upper end of the cal_fat values, where it deviates.
# Draw one simulated normal value per Dairy Queen item, using the sample
# mean and standard deviation computed above, then display the draws.
sim_norm <- rnorm(nrow(dairy_queen), mean = dqmean, sd = dqsd)
sim_norm
## [1] 337.11667 169.67562 265.84329 295.99060 51.63293 417.68812
## [7] 251.48065 322.63522 132.80237 404.82098 269.31568 354.22833
## [13] -19.97419 231.42225 107.55406 270.40700 301.45740 375.95839
## [19] 288.41480 117.56646 524.71657 149.81713 202.26686 212.41018
## [25] 175.59007 614.80777 219.05603 467.69258 331.89379 747.97536
## [31] 122.77859 369.07107 245.80837 322.12927 357.01169 363.10974
## [37] 282.25802 301.06208 247.38547 -155.01396 117.65219 286.96962
# Normal Q-Q plot of the simulated values, with the reference line added.
qqnorm(y = sim_norm)
qqline(y = sim_norm)
Not all the points fall on the line, since even data simulated from a normal distribution shows some random variation! However, this is comparable to the data in the DQ set!
# Compare the Q-Q plot of Dairy Queen cal_fat with simulated normal samples.
qqnormsim(data = dairy_queen, sample = cal_fat)
Yes, the plots demonstrate a remarkable similarity to the data, indicating that calories from fat are nearly normal.
# Sample mean and standard deviation of McDonald's cal_fat; they
# parameterize the normal curve overlaid on the histogram below.
mdmean_md <- mean(mcdonalds$cal_fat)
mdsd_md <- sd(mcdonalds$cal_fat)

# Histogram on the density scale with the matching normal density on top.
# after_stat(density) replaces the deprecated ..density.. notation, and
# stat_function() receives its extra arguments as a list.
ggplot(data = mcdonalds, aes(x = cal_fat)) +
  geom_blank() +
  geom_histogram(aes(y = after_stat(density))) +
  stat_function(
    fun = dnorm,
    args = list(mean = mdmean_md, sd = mdsd_md),
    col = "tomato"
  )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Compare the Q-Q plot of McDonald's cal_fat with simulated normal samples.
qqnormsim(data = mcdonalds, sample = cal_fat)
Just like the DQ example, the McDonald’s menu appears to follow the normal distribution, as it not only hugs the line above tightly but also appears to match some of the simulated examples.
Question 1: How many items on the McDonald’s menu have less than 500 mg of sodium?
# Theoretical probability of sodium below 500 under a normal model fitted to
# the McDonald's sodium values. Note: mdmean_md and mdsd_md are reassigned
# here and now hold sodium statistics.
mdmean_md <- mean(mcdonalds$sodium)
mdsd_md <- sd(mcdonalds$sodium)
pnorm(500, mean = mdmean_md, sd = mdsd_md)
## [1] 0.1826921
# Empirical proportion of McDonald's items with sodium below 500: matching
# rows divided by the total number of McDonald's rows.
mcdonalds %>%
  summarise(percent = sum(sodium < 500, na.rm = TRUE) / n())
In terms of item selection, McDonald's has fewer items than expected with less than 500 mg of sodium!
Question 2: How many items on the Dairy Queen Menu have more than 20g of sugar?
# Theoretical probability of sugar above 20 under a normal model fitted to
# the Dairy Queen sugar values (upper tail = 1 - lower tail).
dqmean_md <- mean(dairy_queen$sugar)
dqsd_md <- sd(dairy_queen$sugar)
1 - pnorm(20, mean = dqmean_md, sd = dqsd_md)
## [1] 0.003318644
# Empirical proportion of Dairy Queen items with sugar above 20.
# Both the numerator and the denominator must come from dairy_queen; the
# original filtered mcdonalds while dividing by the Dairy Queen row count.
dairy_queen %>%
  filter(sugar > 20) %>%
  summarise(percent = n() / nrow(dairy_queen))
In terms of item selection, Dairy Queen has more items than expected with more than 20g of sugar!
# Draw a normal Q-Q plot of sodium, with its reference line, for every
# restaurant in fastfood. The sodium values are extracted once per restaurant
# instead of re-subsetting the already filtered data for each plotting call.
for (restaurant_name in unique(fastfood$restaurant)) {
  sodium_values <- fastfood %>%
    filter(restaurant == restaurant_name) %>%
    pull(sodium)
  qqnorm(sodium_values, main = restaurant_name)
  qqline(sodium_values)
}
Looking at the chart, Burger King is closest to normal.
Practically, the stepwise pattern is a result of recording discrete measurements rather than continuous ones. For example, if all measurements are rounded to the nearest whole number, the plot will show a stepwise pattern.
# Normal Q-Q plot, with reference line, of total carbohydrates for
# Dairy Queen items.
with(dairy_queen, {
  qqnorm(total_carb, main = "Dairy Queen Carbs")
  qqline(total_carb)
})
# Sample mean and standard deviation of Dairy Queen total_carb. Note: dqmean
# and dqsd are reassigned here and now hold total_carb statistics.
dqmean <- mean(dairy_queen$total_carb)
dqsd <- sd(dairy_queen$total_carb)

# Histogram on the density scale with the matching normal density on top.
# after_stat(density) replaces the deprecated ..density.. notation, and
# stat_function() receives its extra arguments as a list.
ggplot(data = dairy_queen, aes(x = total_carb)) +
  geom_blank() +
  geom_histogram(aes(y = after_stat(density))) +
  stat_function(
    fun = dnorm,
    args = list(mean = dqmean, sd = dqsd),
    col = "tomato"
  )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Based on the diagram, the total_carb distribution is right-shifted and fairly normal.