library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.0.3
library(openintro)
## Warning: package 'openintro' was built under R version 4.0.3
## Warning: package 'airports' was built under R version 4.0.3
## Warning: package 'cherryblossom' was built under R version 4.0.3
## Warning: package 'usdata' was built under R version 4.0.3
library(ggplot2)
# Load the fast-food nutrition data that ships with openintro and inspect it.
data("fastfood", package = "openintro")
view(fastfood)
# Keep only the menu items whose restaurant name is McDonald's.
mcdonalds <- filter(fastfood, restaurant == "Mcdonalds")
mcdonalds
## # A tibble: 57 x 17
##    restaurant item  calories cal_fat total_fat sat_fat trans_fat cholesterol
##    <chr>      <chr>    <dbl>   <dbl>     <dbl>   <dbl>     <dbl>       <dbl>
##  1 Mcdonalds  Arti~      380      60         7       2       0            95
##  2 Mcdonalds  Sing~      840     410        45      17       1.5         130
##  3 Mcdonalds  Doub~     1130     600        67      27       3           220
##  4 Mcdonalds  Gril~      750     280        31      10       0.5         155
##  5 Mcdonalds  Cris~      920     410        45      12       0.5         120
##  6 Mcdonalds  Big ~      540     250        28      10       1            80
##  7 Mcdonalds  Chee~      300     100        12       5       0.5          40
##  8 Mcdonalds  Clas~      510     210        24       4       0            65
##  9 Mcdonalds  Doub~      430     190        21      11       1            85
## 10 Mcdonalds  Doub~      770     400        45      21       2.5         175
## # ... with 47 more rows, and 9 more variables: sodium <dbl>, total_carb <dbl>,
## #   fiber <dbl>, sugar <dbl>, protein <dbl>, vit_a <dbl>, vit_c <dbl>,
## #   calcium <dbl>, salad <chr>
# Keep only the menu items whose restaurant name is Dairy Queen.
dairy_queen <- filter(fastfood, restaurant == "Dairy Queen")
dairy_queen
## # A tibble: 42 x 17
##    restaurant item  calories cal_fat total_fat sat_fat trans_fat cholesterol
##    <chr>      <chr>    <dbl>   <dbl>     <dbl>   <dbl>     <dbl>       <dbl>
##  1 Dairy Que~ 1/2 ~     1000     660        74      26         2         170
##  2 Dairy Que~ 1/2 ~      800     460        51      20         2         135
##  3 Dairy Que~ 1/4 ~      630     330        37      13         1          95
##  4 Dairy Que~ 1/4 ~      540     270        30      11         1          70
##  5 Dairy Que~ 1/4 ~      570     310        35      11         1          75
##  6 Dairy Que~ Orig~      400     160        18       9         1          65
##  7 Dairy Que~ Orig~      630     310        34      18         2         125
##  8 Dairy Que~ 4 Pi~     1030     480        53       9         1          80
##  9 Dairy Que~ 6 Pi~     1260     590        66      11         1         120
## 10 Dairy Que~ Baco~      420     240        26      11         1          60
## # ... with 32 more rows, and 9 more variables: sodium <dbl>, total_carb <dbl>,
## #   fiber <dbl>, sugar <dbl>, protein <dbl>, vit_a <dbl>, vit_c <dbl>,
## #   calcium <dbl>, salad <chr>

Exercise 1

Make a plot (or plots) to visualize the distributions of the amount of calories from fat of the options from these two restaurants. How do their centers, shapes, and spreads compare?

# Calories-from-fat vectors, one per restaurant.
x1 <- mcdonalds[["cal_fat"]]
x2 <- dairy_queen[["cal_fat"]]

# Numeric summary of the McDonald's values.
summary(x1)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    50.0   160.0   240.0   285.6   320.0  1270.0

‘Graphing citation: https://www.r-bloggers.com/2012/10/adding-measures-of-central-tendency-to-histograms-in-r/’

# Density-scaled histograms of calories from fat, one per restaurant.
# `probability` is spelled out in full; the earlier `prob` only worked through
# partial argument matching.

# McDonald's fat calories
hist(x1,
     col = "peachpuff",
     border = "black",
     probability = TRUE, # show densities instead of frequencies
     xlab = "calories",
     main = "McDonalds' Fat Calories",
     lwd = 2) # line thickness

# Dairy Queen fat calories
hist(x2,
     col = "peachpuff",
     border = "black",
     probability = TRUE, # show densities instead of frequencies
     xlab = "calories",
     main = "Dairy Queen Fat Calories",
     lwd = 2) # line thickness

Answer to question 1:

These data have different distributions. While both are skewed to the right, the McDonald’s distribution is skewed more strongly, whereas the DQ values are concentrated more toward the center.

# Sample mean and standard deviation of calories from fat for each restaurant.
# These parameterise the normal curves and normal probabilities computed from
# them elsewhere in this script.
dqmean <- mean(dairy_queen$cal_fat)
dqstd <- sd(dairy_queen$cal_fat)
mcd_mean <- mean(mcdonalds$cal_fat)
# Fixed: this used mean(), which made the McDonald's "standard deviation"
# equal to its mean. Any recorded output that depends on mcd_std was produced
# with the wrong value and has to be regenerated.
mcd_std <- sd(mcdonalds$cal_fat)
library(ggplot2)
# Density histogram of Dairy Queen calories from fat with the fitted normal
# curve overlaid. after_stat(density) replaces the deprecated ..density..
# notation, and `args` is passed as a list as stat_function() documents.
ggplot(data = dairy_queen, aes(x = cal_fat)) +
  geom_blank() +
  geom_histogram(aes(y = after_stat(density))) +
  stat_function(fun = dnorm,
                args = list(mean = dqmean, sd = dqstd),
                col = "green")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Exercise 2

Based on this plot, does it appear that the data follow a nearly normal distribution? > Yes, it appears that these data roughly follow a normal distribution.

Evaluating the normal distribution

Below is a QQ plot

# Normal Q-Q plot of Dairy Queen calories from fat.
ggplot(dairy_queen, aes(sample = cal_fat)) +
  geom_line(stat = "qq")

# Draw one simulated normal sample with the same size, mean and standard
# deviation as the Dairy Queen data, then print it.
sim_norm <- rnorm(nrow(dairy_queen), mean = dqmean, sd = dqstd)
sim_norm
##  [1]  114.66729  310.09268  442.56909  315.06549  410.65596   93.80603
##  [7]  440.62706  158.72589  402.10043  119.03274  133.98377  366.53149
## [13] -232.67158  357.12501  469.09034  518.72800   88.60428  398.60715
## [19]  461.07955  132.71959  323.28978  230.70975  395.67408  100.60892
## [25]  107.51827  176.23780  210.17885   77.04213  377.37179   64.41452
## [31]   34.55038  382.11210  317.46534  240.50662  541.01741  226.84458
## [37]  349.16764  127.70771  386.70136  254.74305  405.38533  432.58581

Exercise 3

Make a normal probability plot of sim_norm. Do all of the points fall on the line? How does this plot compare to the probability plot for the real data? (Since sim_norm is not a data frame, it can be put directly into the sample argument and the data argument can be dropped.) > Not all the points of this normal plot fall onto the line, although most do. It has a similar shape to the plot of the real data, but it is not exact.

# Normal probability plot of the simulated sample with a reference line.
# The stray trailing comma after `main` (an empty extra argument) is removed.
qqnorm(sim_norm,
       ylab = "Sample",
       xlab = "Theoretical",
       main = "DQ Calories from fat")
qqline(sim_norm)

8 plotted simulations of the normal data

# Grid of Q-Q plots: the Dairy Queen data alongside simulated normal samples.
qqnormsim(data = dairy_queen, sample = cal_fat)

Exercise 4

Does the normal probability plot for the calories from fat look similar to the plots created for the simulated data? That is, do the plots provide evidence that the calories from fat are nearly normal? > Yes, the normal plot for the fat calories does look similar to the simulated plots. Because we can observe a similar cluster of data in the middle among all 9 plots, it is safe to say the data are nearly normally distributed.

Exercise 5

Using the same technique, determine whether or not the calories from McDonald’s menu appear to come from a normal distribution. > Yes, based on the simulation data, we can arrive at the same conclusions mentioned in the previous question regarding Mcdonalds’ calories from fat.

# Simulate a normal sample the size of the McDonald's menu.
# Fixed: the simulation used Dairy Queen's mean and sd (dqmean, dqstd) even
# though the plot is about McDonald's. The parameters are now computed from
# the McDonald's calories-from-fat column directly.
sim_norm2 <- rnorm(n = nrow(mcdonalds),
                   mean = mean(mcdonalds$cal_fat),
                   sd = sd(mcdonalds$cal_fat))

# Normal probability plot of the simulated sample with a reference line.
# The stray trailing comma after `main` is removed.
qqnorm(sim_norm2,
       ylab = "Sample",
       xlab = "Theoretical",
       main = "Mcdonald's Calories from fat")
qqline(sim_norm2)

# Grid of Q-Q plots: the McDonald's data alongside simulated normal samples.
qqnormsim(sample = cal_fat, data = mcdonalds)

# Complement (upper tail) -- theoretical probability under the fitted normal
# model that a Dairy Queen item has more than 600 calories from fat.
# lower.tail = FALSE asks pnorm() for the upper tail directly instead of
# subtracting from 1.
compliment <- pnorm(q = 600, mean = dqmean, sd = dqstd, lower.tail = FALSE)
compliment
## [1] 0.01501523
# Empirical probability: observed share of Dairy Queen items above 600
# (this was previously mislabelled as the theoretical probability).
dairy_queen %>%
  filter(cal_fat > 600) %>%
  summarise(percent = n() / nrow(dairy_queen))
## # A tibble: 1 x 1
##   percent
##     <dbl>
## 1  0.0476

Exercise 6

Write out two probability questions that you would like to answer about any of the restaurants in this dataset. Calculate those probabilities using both the theoretical normal distribution as well as the empirical distribution (four probabilities in all). Which one had a closer agreement between the two methods? > What is the probability that an item on McDonald’s menu has more than 350 calories from fat? What about more than 900 calories from fat on DQ’s menu? > Findings: After performing the calculations below, we can see there is a considerably closer agreement for DQ menu items being greater than 900.

# Theoretical: upper-tail probability, under the normal model built from
# mcd_mean and mcd_std, of more than 350 calories from fat.
# NOTE: the labels were swapped before. The variable keeps its original name
# E1 even though it holds the theoretical value.
E1 <- pnorm(q = 350, mean = mcd_mean, sd = mcd_std, lower.tail = FALSE)
E1
## [1] 0.4108224
# Empirical: observed share of McDonald's items above 350 (kept in T1, a
# one-row tibble with a `percent` column).
T1 <- mcdonalds %>%
  filter(cal_fat > 350) %>%
  summarise(percent = n() / nrow(mcdonalds))
T1
## # A tibble: 1 x 1
##   percent
##     <dbl>
## 1   0.211
# Absolute difference between the theoretical and empirical values
diff1 <- abs(E1 - T1)
diff1
##     percent
## 1 0.2002961

Question II: > What is the probability that an item on Dairy Queen’s menu has more than 900 calories from fat?

# Theoretical: upper-tail probability, under the normal model built from
# dqmean and dqstd, of more than 900 calories from fat.
# NOTE: the labels were swapped before. The variable keeps its original name
# E2 even though it holds the theoretical value. lower.tail = FALSE computes
# this small tail directly rather than as 1 - pnorm().
E2 <- pnorm(q = 900, mean = dqmean, sd = dqstd, lower.tail = FALSE)
E2
## [1] 2.186778e-05
# Empirical: observed share of Dairy Queen items above 900 (kept in T2, a
# one-row tibble with a `percent` column).

T2 <- dairy_queen %>%
  filter(cal_fat > 900) %>%
  summarise(percent = n() / nrow(dairy_queen))
T2
## # A tibble: 1 x 1
##   percent
##     <dbl>
## 1       0
# Absolute difference between the theoretical and empirical values
diff2 <- abs(E2 - T2)
diff2
##        percent
## 1 2.186778e-05
# Item name and calories from fat for Dairy Queen, sorted by item name.
dq <- data.frame(
  item = dairy_queen$item,
  fat_calories = dairy_queen$cal_fat
)
dq <- dq[order(dq$item), ]
view(dq)

# The same two-column table for McDonald's.
mcd <- data.frame(
  item = mcdonalds$item,
  fat_calories = mcdonalds$cal_fat
)
mcd <- mcd[order(mcd$item), ]
view(mcd)

Exercise 7:

Now let’s consider some of the other variables in the dataset. Out of all the different restaurants, which ones’ distribution is the closest to normal for sodium? > Arby’s data is closest to the normal distribution

# Chick Fil-A sodium: subset, summary statistics and normal Q-Q plot
chick_fil_a <- fastfood %>%
  filter(restaurant == "Chick Fil-A")
view(chick_fil_a)
Chick_fil_sodi <- chick_fil_a$sodium
Chick_sodi_mean <- mean(chick_fil_a$sodium)
Chick_sodi_std <- sd(chick_fil_a$sodium)

# aes() now maps the `sodium` column of `data` instead of a free-standing
# vector, so the plot can never drift out of step with the data frame.
ggplot(data = chick_fil_a, aes(sample = sodium)) +
  geom_line(stat = "qq")

# Chick Fil-A sodium: density histogram with the fitted normal curve.
# after_stat(density) replaces the deprecated ..density.. notation.
ggplot(data = chick_fil_a, aes(x = sodium)) +
  geom_blank() +
  geom_histogram(aes(y = after_stat(density))) +
  stat_function(fun = dnorm,
                args = list(mean = Chick_sodi_mean, sd = Chick_sodi_std),
                col = "red")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Sonic sodium: subset, summary statistics and normal Q-Q plot
sonic <- fastfood %>%
  filter(restaurant == "Sonic")
Sonic_sodi <- sonic$sodium
Sonic_sodi_mean <- mean(sonic$sodium)
Sonic_sodi_std <- sd(sonic$sodium)

# aes() maps the `sodium` column of `data` instead of a free-standing vector.
ggplot(data = sonic, aes(sample = sodium)) +
  geom_line(stat = "qq")

# Sonic sodium: density histogram with the fitted normal curve.
# after_stat(density) replaces the deprecated ..density.. notation.
ggplot(data = sonic, aes(x = sodium)) +
  geom_blank() +
  geom_histogram(aes(y = after_stat(density))) +
  stat_function(fun = dnorm,
                args = list(mean = Sonic_sodi_mean, sd = Sonic_sodi_std),
                col = "blue")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Arbys sodium: subset, summary statistics and normal Q-Q plot
arbys <- fastfood %>%
  filter(restaurant == "Arbys")
arbys_sodi <- arbys$sodium
arbys_sodi_mean <- mean(arbys$sodium)
arbys_sodi_std <- sd(arbys$sodium)

# aes() maps the `sodium` column of `data` instead of a free-standing vector.
ggplot(data = arbys, aes(sample = sodium)) +
  geom_line(stat = "qq")

# Arbys sodium: density histogram with the fitted normal curve.
# after_stat(density) replaces the deprecated ..density.. notation.
ggplot(data = arbys, aes(x = sodium)) +
  geom_blank() +
  geom_histogram(aes(y = after_stat(density))) +
  stat_function(fun = dnorm,
                args = list(mean = arbys_sodi_mean, sd = arbys_sodi_std),
                col = "green")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Burger King sodium: subset, summary statistics and normal Q-Q plot.
# Fixed: the filter compared against "Burger King " with a trailing space,
# which matches no rows and left every statistic below computed on empty data.
burger_king <- fastfood %>%
  filter(restaurant == "Burger King")
burger_king_sodi <- burger_king$sodium
burger_king_sodi_mean <- mean(burger_king$sodium)
burger_king_sodi_std <- sd(burger_king$sodium)

# aes() maps the `sodium` column of `data` instead of a free-standing vector.
ggplot(data = burger_king, aes(sample = sodium)) +
  geom_line(stat = "qq")

# Burger King sodium: density histogram with the fitted normal curve.
# NOTE(review): this plot was commented out, presumably because the empty
# subset made it fail; it is restored now that the filter matches. Confirm.
ggplot(data = burger_king, aes(x = sodium)) +
  geom_blank() +
  geom_histogram(aes(y = after_stat(density))) +
  stat_function(fun = dnorm,
                args = list(mean = burger_king_sodi_mean,
                            sd = burger_king_sodi_std),
                col = "white")
# Subway sodium: subset, summary statistics and normal Q-Q plot
subway <- fastfood %>%
  filter(restaurant == "Subway")
subway_sodi <- subway$sodium
subway_sodi_mean <- mean(subway$sodium)
subway_sodi_std <- sd(subway_sodi)

# aes() maps the `sodium` column of `data` instead of a free-standing vector.
ggplot(data = subway, aes(sample = sodium)) +
  geom_line(stat = "qq")

# Subway sodium: density histogram with the fitted normal curve.
# after_stat(density) replaces the deprecated ..density.. notation.
ggplot(data = subway, aes(x = sodium)) +
  geom_blank() +
  geom_histogram(aes(y = after_stat(density))) +
  stat_function(fun = dnorm,
                args = list(mean = subway_sodi_mean, sd = subway_sodi_std),
                col = "purple")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Taco Bell sodium: subset, summary statistics and normal Q-Q plot
tbell <- fastfood %>%
  filter(restaurant == "Taco Bell")
tbell_sodi <- tbell$sodium
tbell_sodi_mean <- mean(tbell$sodium)
tbell_sodi_std <- sd(tbell_sodi)

# aes() maps the `sodium` column of `data` instead of a free-standing vector.
ggplot(data = tbell, aes(sample = sodium)) +
  geom_line(stat = "qq")

# Taco Bell sodium: density histogram with the fitted normal curve.
# after_stat(density) replaces the deprecated ..density.. notation.
ggplot(data = tbell, aes(x = sodium)) +
  geom_blank() +
  geom_histogram(aes(y = after_stat(density))) +
  stat_function(fun = dnorm,
                args = list(mean = tbell_sodi_mean, sd = tbell_sodi_std),
                col = "blue")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Dairy Queen sodium: summary statistics and normal Q-Q plot
dq_sodi <- dairy_queen$sodium
dq_sodi_mean <- mean(dairy_queen$sodium)
dq_sodi_std <- sd(dairy_queen$sodium)

# aes() maps the `sodium` column of `data` instead of a free-standing vector.
ggplot(data = dairy_queen, aes(sample = sodium)) +
  geom_line(stat = "qq")

# Dairy Queen sodium: density histogram with the fitted normal curve.
# after_stat(density) replaces the deprecated ..density.. notation.
ggplot(data = dairy_queen, aes(x = sodium)) +
  geom_blank() +
  geom_histogram(aes(y = after_stat(density))) +
  stat_function(fun = dnorm,
                args = list(mean = dq_sodi_mean, sd = dq_sodi_std),
                col = "tomato")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# McDonald's sodium: summary statistics and normal Q-Q plot
Mcdon_sodi <- mcdonalds$sodium
Mcdon_sodi_mean <- mean(mcdonalds$sodium)
Mcdon_sodi_std <- sd(mcdonalds$sodium)

# aes() maps the `sodium` column of `data` instead of a free-standing vector.
ggplot(data = mcdonalds, aes(sample = sodium)) +
  geom_line(stat = "qq")

# McDonald's sodium: density histogram with the fitted normal curve.
# after_stat(density) replaces the deprecated ..density.. notation.
ggplot(data = mcdonalds, aes(x = sodium)) +
  geom_blank() +
  geom_histogram(aes(y = after_stat(density))) +
  stat_function(fun = dnorm,
                args = list(mean = Mcdon_sodi_mean, sd = Mcdon_sodi_std),
                col = "green")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Exercise 8

Note that some of the normal probability plots for sodium distributions seem to have a stepwise pattern. Why do you think this might be the case? > I think this is because variations of items may have similar amounts of sodium. For instance, it is likely that the many variations of chicken sandwiches that Chick Fil-A offers have similar amounts of sodium to each other.

Exercise 9

As you can see, normal probability plots can be used both to assess normality and visualize skewness. Make a normal probability plot for the total carbohydrates from a restaurant of your choice. Based on this normal probability plot, is this variable left skewed, symmetric, or right skewed? Use a histogram to confirm your findings. > From the plots below, we can conclude this variable is right skewed.

# McDonald's total carbohydrates: summary statistics and normal Q-Q plot
Mcdon_carbo <- mcdonalds$total_carb
Mcdon_carbo_mean <- mean(mcdonalds$total_carb)
Mcdon_carbo_std <- sd(mcdonalds$total_carb)

# aes() maps the `total_carb` column of `data` instead of a free-standing
# vector.
ggplot(data = mcdonalds, aes(sample = total_carb)) +
  geom_line(stat = "qq")

# McDonald's total carbohydrates: density histogram with the fitted normal
# curve. after_stat(density) replaces the deprecated ..density.. notation.
ggplot(data = mcdonalds, aes(x = total_carb)) +
  geom_blank() +
  geom_histogram(aes(y = after_stat(density))) +
  stat_function(fun = dnorm,
                args = list(mean = Mcdon_carbo_mean, sd = Mcdon_carbo_std),
                col = "hot pink")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

