library(tidyverse)## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.0.6 v dplyr 1.0.3
## v tidyr 1.1.2 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(openintro)## Loading required package: airports
## Loading required package: cherryblossom
## Loading required package: usdata
# Preview the first six rows of the fastfood data.
head(fastfood)## # A tibble: 6 x 17
## restaurant item calories cal_fat total_fat sat_fat trans_fat cholesterol
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Mcdonalds Arti~ 380 60 7 2 0 95
## 2 Mcdonalds Sing~ 840 410 45 17 1.5 130
## 3 Mcdonalds Doub~ 1130 600 67 27 3 220
## 4 Mcdonalds Gril~ 750 280 31 10 0.5 155
## 5 Mcdonalds Cris~ 920 410 45 12 0.5 120
## 6 Mcdonalds Big ~ 540 250 28 10 1 80
## # ... with 9 more variables: sodium <dbl>, total_carb <dbl>, fiber <dbl>,
## # sugar <dbl>, protein <dbl>, vit_a <dbl>, vit_c <dbl>, calcium <dbl>,
## # salad <chr>
# Split the menu data by restaurant so the two chains can be compared.
mcdonalds <- fastfood %>%
  filter(restaurant == "Mcdonalds")
dairy_queen <- fastfood %>%
  filter(restaurant == "Dairy Queen")

# Column-by-column summary of the McDonald's items.
summary(mcdonalds)
## restaurant item calories cal_fat
## Length:57 Length:57 Min. : 140.0 Min. : 50.0
## Class :character Class :character 1st Qu.: 380.0 1st Qu.: 160.0
## Mode :character Mode :character Median : 540.0 Median : 240.0
## Mean : 640.4 Mean : 285.6
## 3rd Qu.: 740.0 3rd Qu.: 320.0
## Max. :2430.0 Max. :1270.0
## total_fat sat_fat trans_fat cholesterol
## Min. : 5.00 Min. : 0.500 Min. :0.0000 Min. : 0.0
## 1st Qu.: 18.00 1st Qu.: 4.500 1st Qu.:0.0000 1st Qu.: 70.0
## Median : 27.00 Median : 7.000 Median :0.0000 Median : 95.0
## Mean : 31.81 Mean : 8.289 Mean :0.4649 Mean :109.7
## 3rd Qu.: 36.00 3rd Qu.:11.000 3rd Qu.:1.0000 3rd Qu.:125.0
## Max. :141.00 Max. :27.000 Max. :3.0000 Max. :475.0
## sodium total_carb fiber sugar
## Min. : 20 Min. : 9.00 Min. :0.000 Min. : 0.00
## 1st Qu.: 870 1st Qu.: 32.00 1st Qu.:2.000 1st Qu.: 4.00
## Median :1120 Median : 46.00 Median :3.000 Median : 9.00
## Mean :1438 Mean : 48.79 Mean :3.228 Mean :11.07
## 3rd Qu.:1780 3rd Qu.: 62.00 3rd Qu.:4.000 3rd Qu.:13.00
## Max. :6080 Max. :156.00 Max. :8.000 Max. :87.00
## protein vit_a vit_c calcium
## Min. : 7.0 Min. : 0.00 Min. : 0.0 Min. : 0.0
## 1st Qu.: 25.0 1st Qu.: 2.00 1st Qu.: 2.0 1st Qu.: 6.0
## Median : 33.0 Median : 6.00 Median :15.0 Median : 15.0
## Mean : 40.3 Mean : 33.72 Mean :18.3 Mean : 20.6
## 3rd Qu.: 46.0 3rd Qu.: 20.00 3rd Qu.:25.0 3rd Qu.: 20.0
## Max. :186.0 Max. :180.00 Max. :70.0 Max. :290.0
## salad
## Length:57
## Class :character
## Mode :character
##
##
##
# Histogram of McDonald's calories from fat, split into 30 bins.
ggplot(data = mcdonalds, aes(x = cal_fat)) +
  geom_histogram(bins = 30, fill = "blue") +
  ggtitle("Distribution Calorie Fat from Mc Donalds")

# Column-by-column summary of the Dairy Queen items.
summary(dairy_queen)
## restaurant item calories cal_fat
## Length:42 Length:42 Min. : 20.0 Min. : 0.0
## Class :character Class :character 1st Qu.: 350.0 1st Qu.:160.0
## Mode :character Mode :character Median : 485.0 Median :220.0
## Mean : 520.2 Mean :260.5
## 3rd Qu.: 630.0 3rd Qu.:310.0
## Max. :1260.0 Max. :670.0
##
## total_fat sat_fat trans_fat cholesterol
## Min. : 0.00 Min. : 0.00 Min. :0.0000 Min. : 0.00
## 1st Qu.:18.00 1st Qu.: 5.00 1st Qu.:0.0000 1st Qu.: 41.25
## Median :24.50 Median : 9.00 Median :1.0000 Median : 60.00
## Mean :28.86 Mean :10.44 Mean :0.6786 Mean : 71.55
## 3rd Qu.:34.75 3rd Qu.:12.50 3rd Qu.:1.0000 3rd Qu.:100.00
## Max. :75.00 Max. :43.00 Max. :2.0000 Max. :180.00
##
## sodium total_carb fiber sugar
## Min. : 15.0 Min. : 0.00 Min. : 0.000 Min. : 0.000
## 1st Qu.: 847.5 1st Qu.: 25.25 1st Qu.: 1.000 1st Qu.: 3.000
## Median :1030.0 Median : 34.00 Median : 2.000 Median : 6.000
## Mean :1181.8 Mean : 38.69 Mean : 2.833 Mean : 6.357
## 3rd Qu.:1362.5 3rd Qu.: 44.75 3rd Qu.: 3.000 3rd Qu.: 8.750
## Max. :3500.0 Max. :121.00 Max. :12.000 Max. :30.000
##
## protein vit_a vit_c calcium
## Min. : 1.00 Min. : 0 Min. : 0.00 Min. : 0.00
## 1st Qu.:17.00 1st Qu.: 9 1st Qu.: 0.00 1st Qu.: 6.00
## Median :23.00 Median :10 Median : 4.00 Median : 10.00
## Mean :24.83 Mean :14 Mean : 4.37 Mean : 16.41
## 3rd Qu.:34.00 3rd Qu.:20 3rd Qu.: 6.00 3rd Qu.: 20.00
## Max. :49.00 Max. :50 Max. :30.00 Max. :100.00
## NA's :15 NA's :15 NA's :15
## salad
## Length:42
## Class :character
## Mode :character
##
##
##
##
# Histogram of Dairy Queen calories from fat, using bins 30 calories wide.
ggplot(data = dairy_queen, aes(x = cal_fat)) +
  geom_histogram(fill = "red", binwidth = 30) +
  ggtitle("Distribution Calorie Fat from Dairy Queen")
Answer: The distribution of the calories from fat of Dairy Queen’s items and Mc Donalds’ are close to normal. Each set is right skewed (the tail runs to the right). With that said, the frequency histogram(s) highlight a number of unique differences between the distributions. McDonald’s has a higher minimum, maximum (approximately 1250) and center (around 280) for fat calories. Whereas Dairy Queen’s curve is far less skewed, with a small skew to the right, the center of Dairy Queen’s curve is around 250 calories from fat and a max value around 675 calories from fat. Also, the McDonald’s x axis increases in increments of 200 cals while the Dairy Queen one increases in increments of 100 cals.
# Sample mean and standard deviation of Dairy Queen calories from fat; they
# parameterise the normal curve drawn over the histogram below.
dqmean <- mean(dairy_queen$cal_fat)
dqsd <- sd(dairy_queen$cal_fat)

# Density-scaled histogram with the fitted normal density on top.
# `after_stat(density)` is the current spelling of the legacy `..density..`.
ggplot(data = dairy_queen, aes(x = cal_fat)) +
  geom_blank() +
  geom_histogram(aes(y = after_stat(density))) +
  stat_function(
    fun = dnorm,
    args = list(mean = dqmean, sd = dqsd),
    col = "tomato"
  )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Answer: Yes, Dairy Queen’s calories from fat curve follows a nearly normal distribution.
# Normal probability (Q-Q) plot of the observed Dairy Queen calories from fat.
ggplot(dairy_queen, aes(sample = cal_fat)) +
  stat_qq() +
  stat_qq_line()

# Draw a normal sample with the same size, mean and sd as the observed data.
sim_norm <- rnorm(n = nrow(dairy_queen), mean = dqmean, sd = dqsd)

# Q-Q plot of the simulated sample, for comparison with the observed one.
ggplot(data = NULL, aes(sample = sim_norm)) +
  stat_qq() +
  stat_qq_line()
Answer: No, all the points on the line do not collapse. The probability graphs are identical but not the same for the actual data of the simulated. There is a smaller slope from x= -2 to x= -1 and a larger slope from x= 1 to x= 2.3 for the actual results. The plots are quite close, rather than the nuance.
# Compare the observed Dairy Queen Q-Q plot with simulated normal Q-Q plots.
qqnormsim(sample = cal_fat, data = dairy_queen)
Answer: Yes, the Dairy Queen “cal_fat” normal probability plot is pretty closely aligned with all our simulated data probability plots, although it curves slightly below the qqline (diagonal line) while the simulations did not.
# Compare the observed McDonald's Q-Q plot with simulated normal Q-Q plots.
qqnormsim(sample = cal_fat, data = mcdonalds)
Answer: The Mc Donalds “cal_fat” data is nearly normal. Although the slope is rather small near the beginning and rather larger later on, it does form a diagonal line near the qqline (diagonal line) and closely mimics a couple of the simulated plots up until these higher values.
# Normal-model estimate of P(cal_fat > 600) for Dairy Queen.
1 - pnorm(q = 600, mean = dqmean, sd = dqsd)
## [1] 0.01501523

# Observed share of Dairy Queen items above 600 calories from fat.
dairy_queen %>%
  summarise(percent = sum(cal_fat > 600, na.rm = TRUE) / n())
## # A tibble: 1 x 1
## percent
## <dbl>
## 1 0.0476
Question 1: What is the probability that a randomly chosen Chick Fil-A product has more than 400 calories from fat?
# Chick Fil-A items, plus the mean and sd of their calories from fat.
chick_fil_a <- filter(fastfood, restaurant == "Chick Fil-A")
a_mean <- mean(chick_fil_a[["cal_fat"]])
a_sd <- sd(chick_fil_a[["cal_fat"]])

# Normal-model estimate of P(cal_fat > 400).
1 - pnorm(q = 400, mean = a_mean, sd = a_sd)
## [1] 0.006429412

# Observed share of Chick Fil-A items above 400 calories from fat.
chick_fil_a %>%
  summarise(percent = sum(cal_fat > 400, na.rm = TRUE) / n())
## # A tibble: 1 x 1
## percent
## <dbl>
## 1 0.0741
Answer: Under the normal model there is about a 0.6% chance (0.0064) of randomly selecting a Chick Fil-A item above 400 calories from fat, while empirically about 7.4% (0.0741) of the items exceed that level. Quite healthy! (sort of)
Question 2: What is the probability that a randomly chosen product from any of these fast food restaurants has fewer than 400 calories?
# Mean and sd of calories across every item in the data set.
ff_mean <- mean(fastfood[["calories"]])
ff_sd <- sd(fastfood[["calories"]])

# Normal-model estimate of P(calories < 400).
pnorm(q = 400, mean = ff_mean, sd = ff_sd)
## [1] 0.3214986

# Observed share of items below 400 calories.
fastfood %>%
  summarise(percent = sum(calories < 400, na.rm = TRUE) / n())
## # A tibble: 1 x 1
## percent
## <dbl>
## 1 0.359
Answer: There is approximately a 36% chance (0.359 empirically; about 32% under the normal model) of randomly selecting a product from any of these fast food restaurants that has fewer than 400 calories.
Answer: Although the calculated probabilities varied slightly for both calculations, those for probability that a randomly chosen product from any of these fast food restaurants was less than 400 calories were in closer agreement.
# List the distinct restaurant names present in the data.
unique(fastfood[["restaurant"]])
## [1] "Mcdonalds" "Chick Fil-A" "Sonic" "Arbys" "Burger King"
## [6] "Dairy Queen" "Subway" "Taco Bell"
Arby’s Restaurant
# Keep only the Arbys rows and draw a normal probability plot of sodium.
arbys <- fastfood %>%
  filter(restaurant == "Arbys")
ggplot(arbys, aes(sample = sodium)) +
  stat_qq() +
  stat_qq_line()
Burger King Restaurant
# Keep only the Burger King rows and draw a normal probability plot of sodium.
burgerking <- fastfood %>%
  filter(restaurant == "Burger King")
ggplot(burgerking, aes(sample = sodium)) +
  stat_qq() +
  stat_qq_line()
Chick Fil-A Restaurant
# Keep only the Chick Fil-A rows and draw a normal probability plot of sodium.
chick_fil_a <- fastfood %>%
  filter(restaurant == "Chick Fil-A")
ggplot(chick_fil_a, aes(sample = sodium)) +
  stat_qq() +
  stat_qq_line()
Dairy Queen Restaurant
# Keep only the Dairy Queen rows and draw a normal probability plot of sodium.
dairy_queen <- fastfood %>%
  filter(restaurant == "Dairy Queen")
ggplot(dairy_queen, aes(sample = sodium)) +
  stat_qq() +
  stat_qq_line()
Mc Donald’s Restaurant
# Keep only the Mcdonalds rows and draw a normal probability plot of sodium.
mcdonalds <- fastfood %>%
  filter(restaurant == "Mcdonalds")
ggplot(mcdonalds, aes(sample = sodium)) +
  stat_qq() +
  stat_qq_line()
Sonic Restaurant
# Keep only the Sonic rows and draw a normal probability plot of sodium.
sonic <- fastfood %>%
  filter(restaurant == "Sonic")
ggplot(sonic, aes(sample = sodium)) +
  stat_qq() +
  stat_qq_line()
Subway Restaurant
# Keep only the Subway rows and draw a normal probability plot of sodium.
subway <- fastfood %>%
  filter(restaurant == "Subway")
ggplot(subway, aes(sample = sodium)) +
  stat_qq() +
  stat_qq_line()
Taco Bell Restaurant
# Keep only the Taco Bell rows and draw a normal probability plot of sodium.
tacobell <- fastfood %>%
  filter(restaurant == "Taco Bell")
ggplot(tacobell, aes(sample = sodium)) +
  stat_qq() +
  stat_qq_line()
Answer: Burger King appear to have the closest to normal distributions for their sodium data.
Answer: Some of the normal probability plots for the sodium distributions show a stepwise pattern. This is probably because the recorded sodium values were rounded, so many items share the same value.
# Normal probability plot of Dairy Queen total carbohydrates.
ggplot(dairy_queen, aes(sample = total_carb)) +
  stat_qq() +
  stat_qq_line()

# Density-scaled histogram (7 bins) with a normal curve that uses the sample
# mean and sd of total_carb. `after_stat(density)` is the current spelling of
# the legacy `..density..`.
ggplot(data = dairy_queen, aes(x = total_carb)) +
  geom_blank() +
  geom_histogram(aes(y = after_stat(density)), bins = 7) +
  stat_function(
    fun = dnorm,
    args = list(
      mean = mean(dairy_queen$total_carb),
      sd = sd(dairy_queen$total_carb)
    ),
    col = "red"
  )
Answer: Based on the normal probability plot, this variable (total carbohydrates) is right skewed. Also, the histogram confirms this with data being concentrated on the left with a tail running to the right.