## Warning: package 'tidyverse' was built under R version 4.0.3
## Warning: package 'openintro' was built under R version 4.0.3
## Warning: package 'airports' was built under R version 4.0.3
## Warning: package 'cherryblossom' was built under R version 4.0.3
## Warning: package 'usdata' was built under R version 4.0.3
# Load the fast-food nutrition data set that ships with the openintro
# package, then open it in the data viewer for a quick look.
data(list = "fastfood", package = "openintro")
view(fastfood)
# Keep only the McDonald's rows by matching on the restaurant name
mcdonalds <- filter(fastfood, restaurant == "Mcdonalds")
mcdonalds
## # A tibble: 57 x 17
## restaurant item calories cal_fat total_fat sat_fat trans_fat cholesterol
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Mcdonalds Arti~ 380 60 7 2 0 95
## 2 Mcdonalds Sing~ 840 410 45 17 1.5 130
## 3 Mcdonalds Doub~ 1130 600 67 27 3 220
## 4 Mcdonalds Gril~ 750 280 31 10 0.5 155
## 5 Mcdonalds Cris~ 920 410 45 12 0.5 120
## 6 Mcdonalds Big ~ 540 250 28 10 1 80
## 7 Mcdonalds Chee~ 300 100 12 5 0.5 40
## 8 Mcdonalds Clas~ 510 210 24 4 0 65
## 9 Mcdonalds Doub~ 430 190 21 11 1 85
## 10 Mcdonalds Doub~ 770 400 45 21 2.5 175
## # ... with 47 more rows, and 9 more variables: sodium <dbl>, total_carb <dbl>,
## # fiber <dbl>, sugar <dbl>, protein <dbl>, vit_a <dbl>, vit_c <dbl>,
## # calcium <dbl>, salad <chr>
# Keep only the Dairy Queen rows by matching on the restaurant name
dairy_queen <- filter(fastfood, restaurant == "Dairy Queen")
dairy_queen
## # A tibble: 42 x 17
## restaurant item calories cal_fat total_fat sat_fat trans_fat cholesterol
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Dairy Que~ 1/2 ~ 1000 660 74 26 2 170
## 2 Dairy Que~ 1/2 ~ 800 460 51 20 2 135
## 3 Dairy Que~ 1/4 ~ 630 330 37 13 1 95
## 4 Dairy Que~ 1/4 ~ 540 270 30 11 1 70
## 5 Dairy Que~ 1/4 ~ 570 310 35 11 1 75
## 6 Dairy Que~ Orig~ 400 160 18 9 1 65
## 7 Dairy Que~ Orig~ 630 310 34 18 2 125
## 8 Dairy Que~ 4 Pi~ 1030 480 53 9 1 80
## 9 Dairy Que~ 6 Pi~ 1260 590 66 11 1 120
## 10 Dairy Que~ Baco~ 420 240 26 11 1 60
## # ... with 32 more rows, and 9 more variables: sodium <dbl>, total_carb <dbl>,
## # fiber <dbl>, sugar <dbl>, protein <dbl>, vit_a <dbl>, vit_c <dbl>,
## # calcium <dbl>, salad <chr>
Exercise 1
Make a plot (or plots) to visualize the distributions of the amount of calories from fat of the options from these two restaurants. How do their centers, shapes, and spreads compare?
# Calories-from-fat column of each restaurant, extracted as a plain vector
x1 <- mcdonalds[["cal_fat"]]
x2 <- dairy_queen[["cal_fat"]]
# Numeric summary of the McDonald's values
summary(x1)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 50.0 160.0 240.0 285.6 320.0 1270.0
‘Graphing citation: https://www.r-bloggers.com/2012/10/adding-measures-of-central-tendency-to-histograms-in-r/’
# Density-scaled histogram of McDonald's calories from fat
hist(
  x1,
  freq = FALSE, # plot densities rather than counts
  main = "McDonalds' Fat Calories",
  xlab = "calories",
  col = "peachpuff",
  border = "black",
  lwd = 2
)

# Density-scaled histogram of Dairy Queen calories from fat
hist(
  x2,
  freq = FALSE, # plot densities rather than counts
  main = "Dairy Queen Fat Calories",
  xlab = "calories",
  col = "peachpuff",
  border = "black",
  lwd = 2
)

Answer to question 1:
These data have different distributions. While both are skewed to the right, the McDonald’s distribution is skewed far more extremely, whereas the DQ distribution is more concentrated around its center.
# Sample mean and standard deviation of calories from fat for each
# restaurant; these parameterise the normal curves and probabilities that
# the rest of the analysis computes.
dqmean <- mean(dairy_queen$cal_fat)
dqstd <- sd(dairy_queen$cal_fat)
mcd_mean <- mean(mcdonalds$cal_fat)
# The spread has to come from sd(); assigning mean() here would make every
# McDonald's normal-model probability use a wrong standard deviation.
mcd_std <- sd(mcdonalds$cal_fat)
library(ggplot2)
# Density histogram of Dairy Queen calories from fat, overlaid with the
# normal density that has the same mean and standard deviation.
ggplot(data = dairy_queen, aes(x = cal_fat)) +
  geom_blank() +
  geom_histogram(aes(y = ..density..)) +
  stat_function(
    fun = dnorm,
    args = list(mean = dqmean, sd = dqstd), # args is documented as a list
    col = "green"
  )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Exercise 2
Based on this plot, does it appear that the data follow a nearly normal distribution? > Yes, it appears that the data roughly follow a normal distribution.
Evaluating the normal distribution
Below is a QQ plot
# Normal Q-Q plot of Dairy Queen calories from fat, drawn as a line
ggplot(dairy_queen, aes(sample = cal_fat)) +
  geom_line(stat = "qq")

# Draw one normal value per Dairy Queen item, using the sample mean and
# standard deviation of calories from fat, and print the draws
sim_norm <- rnorm(nrow(dairy_queen), mean = dqmean, sd = dqstd)
sim_norm
## [1] 114.66729 310.09268 442.56909 315.06549 410.65596 93.80603
## [7] 440.62706 158.72589 402.10043 119.03274 133.98377 366.53149
## [13] -232.67158 357.12501 469.09034 518.72800 88.60428 398.60715
## [19] 461.07955 132.71959 323.28978 230.70975 395.67408 100.60892
## [25] 107.51827 176.23780 210.17885 77.04213 377.37179 64.41452
## [31] 34.55038 382.11210 317.46534 240.50662 541.01741 226.84458
## [37] 349.16764 127.70771 386.70136 254.74305 405.38533 432.58581
Exercise 3
Make a normal probability plot of sim_norm. Do all of the points fall on the line? How does this plot compare to the probability plot for the real data? (Since sim_norm is not a data frame, it can be put directly into the sample argument and the data argument can be dropped.) > Not all the points of this normal plot fall onto the line, although most do. It has a similar shape to the plot of the real data, but it is not exact.
# Normal Q-Q plot of the simulated values, with a reference line added.
# The title says "Simulated" because sim_norm holds random normal draws,
# not the menu data. The call has no trailing comma, so no empty argument
# is forwarded to the plotting code.
qqnorm(
  sim_norm,
  main = "Simulated DQ Calories from fat",
  xlab = "Theoretical",
  ylab = "Sample"
)
qqline(sim_norm)
8 plotted simulations of the normal data
# Q-Q plots for Dairy Queen calories from fat next to simulated normal
# samples, via the qqnormsim helper
qqnormsim(data = dairy_queen, sample = cal_fat)

Exercise 4
Does the normal probability plot for the calories from fat look similar to the plots created for the simulated data? That is, do the plots provide evidence that the calories from fat are nearly normal? > Yes, the normal plot for the fat calories does look similar to the simulated plots. Because we can observe a similar cluster of data in the middle across all 9 plots, it is reasonable to say the data are nearly normally distributed.
Exercise 5
Using the same technique, determine whether or not the calories from McDonald’s menu appear to come from a normal distribution. > Yes, based on the simulation data, we can arrive at the same conclusions mentioned in the previous question regarding Mcdonalds’ calories from fat.
# Simulate normal values that match the McDonald's sample: one draw per
# menu item, using the mean and standard deviation of McDonald's calories
# from fat (the Dairy Queen parameters do not describe this restaurant).
sim_norm2 <- rnorm(
  n = nrow(mcdonalds),
  mean = mean(mcdonalds$cal_fat),
  sd = sd(mcdonalds$cal_fat)
)
# Q-Q plot of the simulated draws; titled "Simulated" because these are
# random normal values rather than the menu data. No trailing comma, so no
# empty argument is forwarded to the plotting code.
qqnorm(
  sim_norm2,
  main = "Simulated Mcdonald's Calories from fat",
  xlab = "Theoretical",
  ylab = "Sample"
)
qqline(sim_norm2)

# Q-Q plots for the McDonald's data next to simulated normal samples
qqnormsim(sample = cal_fat, data = mcdonalds)

# Complement rule under the normal model: P(calories from fat > 600) for
# Dairy Queen. lower.tail = FALSE returns the upper-tail area directly
# instead of subtracting the lower tail from 1.
compliment <- pnorm(q = 600, mean = dqmean, sd = dqstd, lower.tail = FALSE)
compliment
## [1] 0.01501523
# Empirical probability: the observed share of Dairy Queen items with more
# than 600 calories from fat. The mean of a logical vector is the
# proportion of TRUE values.
dairy_queen %>%
  summarise(percent = mean(cal_fat > 600))
## # A tibble: 1 x 1
## percent
## <dbl>
## 1 0.0476
Exercise 6
Write out two probability questions that you would like to answer about any of the restaurants in this dataset. Calculate those probabilities using both the theoretical normal distribution as well as the empirical distribution (four probabilities in all). Which one had a closer agreement between the two methods? > What is the probability that the calories from fat of an item on McDonald’s menu are greater than 350? What about being greater than 900 for an item on DQ’s menu? > Findings: After performing the calculations below, we can see there is considerably closer agreement for DQ menu items being greater than 900.
# Theoretical probability (normal model): P(calories from fat > 350) for
# McDonald's. sd() is evaluated on the column itself so the spread is a
# genuine standard deviation regardless of how mcd_std was defined;
# lower.tail = FALSE returns the upper-tail area directly.
E1 <- pnorm(
  q = 350,
  mean = mcd_mean,
  sd = sd(mcdonalds$cal_fat),
  lower.tail = FALSE
)
E1
## [1] 0.4108224
# Empirical probability: the observed share of McDonald's items with more
# than 350 calories from fat, kept as a one-row tibble with column
# `percent`.
T1 <- mcdonalds %>%
  summarise(percent = mean(cal_fat > 350))
T1
## # A tibble: 1 x 1
## percent
## <dbl>
## 1 0.211
# Absolute gap between the two probability estimates for question one
diff1 <- abs(T1 - E1)
diff1
## percent
## 1 0.2002961
Question II: > What is the probability that the calories from fat of an item on Dairy Queen’s menu are greater than 900?
# Theoretical probability (normal model): P(calories from fat > 900) for
# Dairy Queen. This tail is tiny, so lower.tail = FALSE is used to get the
# upper-tail area directly rather than losing digits in 1 - pnorm().
E2 <- pnorm(q = 900, mean = dqmean, sd = dqstd, lower.tail = FALSE)
E2
## [1] 2.186778e-05
# Empirical probability: the observed share of Dairy Queen items with more
# than 900 calories from fat, kept as a one-row tibble with column
# `percent`.
T2 <- dairy_queen %>%
  summarise(percent = mean(cal_fat > 900))
T2
## # A tibble: 1 x 1
## percent
## <dbl>
## 1 0
# Absolute gap between the two probability estimates for question two
diff2 <- abs(T2 - E2)
diff2
## percent
## 1 2.186778e-05
# Two-column lookup table for Dairy Queen: item name and calories from fat,
# ordered by item name, then opened in the data viewer
dq <- data.frame(
  item = dairy_queen$item,
  fat_calories = dairy_queen$cal_fat
)
dq <- dq[order(dq$item), ]
view(dq)

# The same table for McDonald's
mcd <- data.frame(
  item = mcdonalds$item,
  fat_calories = mcdonalds$cal_fat
)
mcd <- mcd[order(mcd$item), ]
view(mcd)
Exercise 7:
Now let’s consider some of the other variables in the dataset. Out of all the different restaurants, which ones’ distribution is the closest to normal for sodium? > Arby’s data is closest to the normal distribution
# Chick Fil-A sodium: subset the data, then draw the normal Q-Q line plot
chick_fil_a <- filter(fastfood, restaurant == "Chick Fil-A")
view(chick_fil_a)
Chick_fil_sodi <- chick_fil_a[["sodium"]]
Chick_sodi_mean <- mean(chick_fil_a[["sodium"]])
Chick_sodi_std <- sd(chick_fil_a[["sodium"]])
ggplot(chick_fil_a, aes(sample = Chick_fil_sodi)) +
  geom_line(stat = "qq")

# Chick Fil-A sodium: density histogram with the matching normal curve
ggplot(chick_fil_a, aes(x = Chick_fil_sodi)) +
  geom_blank() +
  geom_histogram(aes(y = ..density..)) +
  stat_function(
    fun = dnorm,
    args = list(mean = Chick_sodi_mean, sd = Chick_sodi_std),
    colour = "red"
  )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Sonic sodium: subset the data, then draw the normal Q-Q line plot
sonic <- filter(fastfood, restaurant == "Sonic")
Sonic_sodi <- sonic[["sodium"]]
Sonic_sodi_mean <- mean(sonic[["sodium"]])
Sonic_sodi_std <- sd(sonic[["sodium"]])
ggplot(sonic, aes(sample = Sonic_sodi)) +
  geom_line(stat = "qq")

# Sonic sodium: density histogram with the matching normal curve
ggplot(sonic, aes(x = Sonic_sodi)) +
  geom_blank() +
  geom_histogram(aes(y = ..density..)) +
  stat_function(
    fun = dnorm,
    args = list(mean = Sonic_sodi_mean, sd = Sonic_sodi_std),
    colour = "blue"
  )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Arbys sodium: subset the data, then draw the normal Q-Q line plot
arbys <- filter(fastfood, restaurant == "Arbys")
arbys_sodi <- arbys[["sodium"]]
arbys_sodi_mean <- mean(arbys[["sodium"]])
arbys_sodi_std <- sd(arbys[["sodium"]])
ggplot(arbys, aes(sample = arbys_sodi)) +
  geom_line(stat = "qq")

# Arbys sodium: density histogram with the matching normal curve
ggplot(arbys, aes(x = arbys_sodi)) +
  geom_blank() +
  geom_histogram(aes(y = ..density..)) +
  stat_function(
    fun = dnorm,
    args = list(mean = arbys_sodi_mean, sd = arbys_sodi_std),
    colour = "green"
  )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Normal plot sodium BK
# The name is compared after trimming whitespace: a literal with a trailing
# space ("Burger King ") selects no rows unless the data carries the same
# stray space, and an empty subset leaves the mean, sd and plots undefined.
burger_king <- fastfood %>%
  filter(trimws(restaurant) == "Burger King")
burger_king_sodi <- burger_king$sodium
burger_king_sodi_mean <- mean(burger_king$sodium)
burger_king_sodi_std <- sd(burger_king$sodium)
ggplot(data = burger_king, aes(sample = burger_king_sodi)) +
  geom_line(stat = "qq")

# Normal plot sodium histogram BK
# Restored now that the subset is non-empty, so Burger King gets the same
# histogram-plus-normal-curve view as the other restaurants.
ggplot(data = burger_king, aes(x = burger_king_sodi)) +
  geom_blank() +
  geom_histogram(aes(y = ..density..)) +
  stat_function(
    fun = dnorm,
    args = list(mean = burger_king_sodi_mean, sd = burger_king_sodi_std),
    col = "white"
  )
# Subway sodium: subset the data, then draw the normal Q-Q line plot
subway <- filter(fastfood, restaurant == "Subway")
subway_sodi <- subway[["sodium"]]
subway_sodi_mean <- mean(subway[["sodium"]])
subway_sodi_std <- sd(subway_sodi)
ggplot(subway, aes(sample = subway_sodi)) +
  geom_line(stat = "qq")

# Subway sodium: density histogram with the matching normal curve
ggplot(subway, aes(x = subway_sodi)) +
  geom_blank() +
  geom_histogram(aes(y = ..density..)) +
  stat_function(
    fun = dnorm,
    args = list(mean = subway_sodi_mean, sd = subway_sodi_std),
    colour = "purple"
  )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Taco Bell sodium: subset the data, then draw the normal Q-Q line plot
tbell <- filter(fastfood, restaurant == "Taco Bell")
tbell_sodi <- tbell[["sodium"]]
tbell_sodi_mean <- mean(tbell[["sodium"]])
tbell_sodi_std <- sd(tbell_sodi)
ggplot(tbell, aes(sample = tbell_sodi)) +
  geom_line(stat = "qq")

# Taco Bell sodium: density histogram with the matching normal curve
ggplot(tbell, aes(x = tbell_sodi)) +
  geom_blank() +
  geom_histogram(aes(y = ..density..)) +
  stat_function(
    fun = dnorm,
    args = list(mean = tbell_sodi_mean, sd = tbell_sodi_std),
    colour = "blue"
  )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Dairy Queen sodium: normal Q-Q line plot
dq_sodi <- dairy_queen[["sodium"]]
dq_sodi_mean <- mean(dairy_queen[["sodium"]])
dq_sodi_std <- sd(dairy_queen[["sodium"]])
ggplot(dairy_queen, aes(sample = dq_sodi)) +
  geom_line(stat = "qq")

# Dairy Queen sodium: density histogram with the matching normal curve
ggplot(dairy_queen, aes(x = dq_sodi)) +
  geom_blank() +
  geom_histogram(aes(y = ..density..)) +
  stat_function(
    fun = dnorm,
    args = list(mean = dq_sodi_mean, sd = dq_sodi_std),
    colour = "tomato"
  )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# McDonald's sodium: normal Q-Q line plot
Mcdon_sodi <- mcdonalds[["sodium"]]
Mcdon_sodi_mean <- mean(mcdonalds[["sodium"]])
Mcdon_sodi_std <- sd(mcdonalds[["sodium"]])
ggplot(mcdonalds, aes(sample = Mcdon_sodi)) +
  geom_line(stat = "qq")

# McDonald's sodium: density histogram with the matching normal curve
ggplot(mcdonalds, aes(x = Mcdon_sodi)) +
  geom_blank() +
  geom_histogram(aes(y = ..density..)) +
  stat_function(
    fun = dnorm,
    args = list(mean = Mcdon_sodi_mean, sd = Mcdon_sodi_std),
    colour = "green"
  )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Exercise 8
Note that some of the normal probability plots for sodium distributions seem to have a stepwise pattern. Why do you think this might be the case? > I think this is because variations of items may have similar amounts of sodium. For instance, it is likely that the many variations of chicken sandwiches that Chick Fil-A offers have similar amounts of sodium to each other.
Exercise 9
As you can see, normal probability plots can be used both to assess normality and visualize skewness. Make a normal probability plot for the total carbohydrates from a restaurant of your choice. Based on this normal probability plot, is this variable left skewed, symmetric, or right skewed? Use a histogram to confirm your findings. > From the plots below, we can conclude this variable is right skewed.
# McDonald's total carbohydrates: normal Q-Q line plot
Mcdon_carbo <- mcdonalds[["total_carb"]]
Mcdon_carbo_mean <- mean(mcdonalds[["total_carb"]])
Mcdon_carbo_std <- sd(mcdonalds[["total_carb"]])
ggplot(mcdonalds, aes(sample = Mcdon_carbo)) +
  geom_line(stat = "qq")

# McDonald's total carbohydrates: density histogram with the matching
# normal curve
ggplot(mcdonalds, aes(x = Mcdon_carbo)) +
  geom_blank() +
  geom_histogram(aes(y = ..density..)) +
  stat_function(
    fun = dnorm,
    args = list(mean = Mcdon_carbo_mean, sd = Mcdon_carbo_std),
    colour = "hot pink"
  )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

