Load packages
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.0.5 v dplyr 1.0.3
## v tidyr 1.1.2 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(openintro)
## Loading required package: airports
## Loading required package: cherryblossom
## Loading required package: usdata
head(fastfood)
## # A tibble: 6 x 17
## restaurant item calories cal_fat total_fat sat_fat trans_fat cholesterol
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Mcdonalds Arti~ 380 60 7 2 0 95
## 2 Mcdonalds Sing~ 840 410 45 17 1.5 130
## 3 Mcdonalds Doub~ 1130 600 67 27 3 220
## 4 Mcdonalds Gril~ 750 280 31 10 0.5 155
## 5 Mcdonalds Cris~ 920 410 45 12 0.5 120
## 6 Mcdonalds Big ~ 540 250 28 10 1 80
## # ... with 9 more variables: sodium <dbl>, total_carb <dbl>, fiber <dbl>,
## # sugar <dbl>, protein <dbl>, vit_a <dbl>, vit_c <dbl>, calcium <dbl>,
## # salad <chr>
glimpse(fastfood)
## Rows: 515
## Columns: 17
## $ restaurant <chr> "Mcdonalds", "Mcdonalds", "Mcdonalds", "Mcdonalds", "Mc...
## $ item <chr> "Artisan Grilled Chicken Sandwich", "Single Bacon Smoke...
## $ calories <dbl> 380, 840, 1130, 750, 920, 540, 300, 510, 430, 770, 380,...
## $ cal_fat <dbl> 60, 410, 600, 280, 410, 250, 100, 210, 190, 400, 170, 3...
## $ total_fat <dbl> 7, 45, 67, 31, 45, 28, 12, 24, 21, 45, 18, 34, 20, 34, ...
## $ sat_fat <dbl> 2.0, 17.0, 27.0, 10.0, 12.0, 10.0, 5.0, 4.0, 11.0, 21.0...
## $ trans_fat <dbl> 0.0, 1.5, 3.0, 0.5, 0.5, 1.0, 0.5, 0.0, 1.0, 2.5, 0.0, ...
## $ cholesterol <dbl> 95, 130, 220, 155, 120, 80, 40, 65, 85, 175, 40, 95, 12...
## $ sodium <dbl> 1110, 1580, 1920, 1940, 1980, 950, 680, 1040, 1040, 129...
## $ total_carb <dbl> 44, 62, 63, 62, 81, 46, 33, 49, 35, 42, 38, 48, 48, 67,...
## $ fiber <dbl> 3, 2, 3, 2, 4, 3, 2, 3, 2, 3, 2, 3, 3, 5, 2, 2, 3, 3, 5...
## $ sugar <dbl> 11, 18, 18, 18, 18, 9, 7, 6, 7, 10, 5, 11, 11, 11, 6, 3...
## $ protein <dbl> 37, 46, 70, 55, 46, 25, 15, 25, 25, 51, 15, 32, 42, 33,...
## $ vit_a <dbl> 4, 6, 10, 6, 6, 10, 10, 0, 20, 20, 2, 10, 10, 10, 2, 4,...
## $ vit_c <dbl> 20, 20, 20, 25, 20, 2, 2, 4, 4, 6, 0, 10, 20, 15, 2, 6,...
## $ calcium <dbl> 20, 20, 50, 20, 20, 15, 10, 2, 15, 20, 15, 35, 35, 35, ...
## $ salad <chr> "Other", "Other", "Other", "Other", "Other", "Other", "...
fastfood <- fastfood
Focus on restaurant, calories, and calories from fat
mcdonalds <- fastfood %>%
filter(restaurant == "Mcdonalds")
dairy_queen <- fastfood %>%
filter(restaurant == "Dairy Queen")
ggplot(mcdonalds, aes(x=cal_fat)) +
geom_histogram(binwidth = 100)
ggplot(dairy_queen, aes(x=cal_fat)) +
geom_histogram(binwidth = 100)
Plotting the normal distribution curve on top of the histogram:
dqmean <- mean(dairy_queen$cal_fat)
dqsd <- sd(dairy_queen$cal_fat)
ggplot(dairy_queen, aes(x=cal_fat)) +
geom_blank() +
geom_histogram(aes(y=..density..), binwidth = 100) +
stat_function(fun = dnorm, args = c(mean = dqmean, sd = dqsd), col = "tomato")
Constructing a normal probability plot AKA a normal Q-Q plot for “quantile-quantile”
ggplot(dairy_queen, aes(sample = cal_fat)) +
geom_line(stat = "qq")
Note: instead of “x =” we use “sample =”
How close is close enough?
sim_norm <- rnorm(n = nrow(dairy_queen), mean = dqmean, sd = dqsd)
No, all the points are not perfectly along a straight line. It looks pretty similar to the actual data. So I guess it would be considered a normal distribution… this is probably why they say in the textbook that it is hard to eyeball a histogram and decide whether it is normal or not.
ggplot(data = NULL, aes(sample = sim_norm)) +
geom_line(stat = "qq")
qqnormsim(sample = cal_fat, data = dairy_queen)
qqnormsim(sample = cal_fat, data = mcdonalds)
If the plot is normal, we can use tools such as qnorm and pnorm
For example, to find the probability that a randomly chosen item on the Dairy Queen menu has more than 600 calories from fat:
1 - pnorm(q = 600, mean = dqmean, sd = dqsd)
## [1] 0.01501523
1 - pnorm(600, dqmean, dqsd)
## [1] 0.01501523
dairy_queen %>%
filter(cal_fat > 600) %>%
summarise(percent = n() / nrow(dairy_queen))
## # A tibble: 1 x 1
## percent
## <dbl>
## 1 0.0476
pnorm(200, dqmean, dqsd)
## [1] 0.3495757
dairy_queen %>%
filter(cal_fat < 200) %>%
summarise(percent = n() / nrow(dairy_queen))
## # A tibble: 1 x 1
## percent
## <dbl>
## 1 0.429
pnorm(600, dqmean, dqsd) - pnorm(200, dqmean, dqsd)
## [1] 0.6354091
dairy_queen %>%
filter(cal_fat < 600 & cal_fat > 200) %>%
summarise(percent = n() / nrow(dairy_queen))
## # A tibble: 1 x 1
## percent
## <dbl>
## 1 0.5
unique(fastfood$restaurant)
## [1] "Mcdonalds" "Chick Fil-A" "Sonic" "Arbys" "Burger King"
## [6] "Dairy Queen" "Subway" "Taco Bell"
chick_fil_a <- fastfood %>%
filter(restaurant == "Chick Fil-A")
arbys <- fastfood %>%
filter(restaurant == "Arbys")
sonic <- fastfood %>%
filter(restaurant == "Sonic")
burger_king <- fastfood %>%
filter(restaurant == "Burger King")
subway <- fastfood %>%
filter(restaurant == "Subway")
taco_bell <- fastfood %>%
filter(restaurant == "Taco Bell")
qqnormsim(sample = sodium, data = mcdonalds)
qqnormsim(sample = sodium, data = chick_fil_a)
qqnormsim(sample = sodium, data = sonic)
qqnormsim(sample = sodium, data = arbys)
qqnormsim(sample = sodium, data = burger_king)
qqnormsim(sample = sodium, data = dairy_queen)
qqnormsim(sample = sodium, data = subway)
qqnormsim(sample = sodium, data = taco_bell)
ggplot(chick_fil_a, aes(sample = total_carb)) +
geom_line(stat = "qq")
qqnormsim(sample = total_carb, data = chick_fil_a)
ggplot(chick_fil_a, aes(x = total_carb)) +
geom_histogram(binwidth = 5)
chickmean <- mean(chick_fil_a$total_carb)
chicksd <- sd(chick_fil_a$total_carb)
ggplot(chick_fil_a, aes(x=total_carb)) +
geom_blank() +
geom_histogram(aes(y=..density..), binwidth = 5) +
stat_function(fun = dnorm, args = c(mean = chickmean, sd = chicksd), col = "tomato")
ggplot(dairy_queen, aes(sample = cal_fat))+
stat_qq()+stat_qq_line()
ggplot(data = NULL, aes(sample = sim_norm)) +
stat_qq()+
stat_qq_line()
ggplot(chick_fil_a, aes(sample = total_carb)) +
stat_qq()+
stat_qq_line()