library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.0.2
## Warning: package 'tibble' was built under R version 4.0.2
## Warning: package 'tidyr' was built under R version 4.0.2
## Warning: package 'dplyr' was built under R version 4.0.2
library(openintro)
## Warning: package 'openintro' was built under R version 4.0.2
## Warning: package 'airports' was built under R version 4.0.2
## Warning: package 'cherryblossom' was built under R version 4.0.2
## Warning: package 'usdata' was built under R version 4.0.2
# Save a copy of the fastfood data as a CSV file in the working directory.
write.csv(fastfood, file = "fastfood.csv")
# Show the working directory, i.e. where the relative path above resolves.
getwd()
## [1] "C:/Users/Jerome/Documents/Math_217_Week_4_RLab"
# Print the data set itself.
fastfood
## # A tibble: 515 x 17
##    restaurant item  calories cal_fat total_fat sat_fat trans_fat cholesterol
##    <chr>      <chr>    <dbl>   <dbl>     <dbl>   <dbl>     <dbl>       <dbl>
##  1 Mcdonalds  Arti~      380      60         7       2       0            95
##  2 Mcdonalds  Sing~      840     410        45      17       1.5         130
##  3 Mcdonalds  Doub~     1130     600        67      27       3           220
##  4 Mcdonalds  Gril~      750     280        31      10       0.5         155
##  5 Mcdonalds  Cris~      920     410        45      12       0.5         120
##  6 Mcdonalds  Big ~      540     250        28      10       1            80
##  7 Mcdonalds  Chee~      300     100        12       5       0.5          40
##  8 Mcdonalds  Clas~      510     210        24       4       0            65
##  9 Mcdonalds  Doub~      430     190        21      11       1            85
## 10 Mcdonalds  Doub~      770     400        45      21       2.5         175
## # ... with 505 more rows, and 9 more variables: sodium <dbl>, total_carb <dbl>,
## #   fiber <dbl>, sugar <dbl>, protein <dbl>, vit_a <dbl>, vit_c <dbl>,
## #   calcium <dbl>, salad <chr>

Exercise 0

Insert any text here.

# Preview the first six rows of the data.
head(fastfood, n = 6L)
## # A tibble: 6 x 17
##   restaurant item  calories cal_fat total_fat sat_fat trans_fat cholesterol
##   <chr>      <chr>    <dbl>   <dbl>     <dbl>   <dbl>     <dbl>       <dbl>
## 1 Mcdonalds  Arti~      380      60         7       2       0            95
## 2 Mcdonalds  Sing~      840     410        45      17       1.5         130
## 3 Mcdonalds  Doub~     1130     600        67      27       3           220
## 4 Mcdonalds  Gril~      750     280        31      10       0.5         155
## 5 Mcdonalds  Cris~      920     410        45      12       0.5         120
## 6 Mcdonalds  Big ~      540     250        28      10       1            80
## # ... with 9 more variables: sodium <dbl>, total_carb <dbl>, fiber <dbl>,
## #   sugar <dbl>, protein <dbl>, vit_a <dbl>, vit_c <dbl>, calcium <dbl>,
## #   salad <chr>
# Column-by-column overview: name, type and leading values of each variable.
glimpse(x = fastfood)
## Rows: 515
## Columns: 17
## $ restaurant  <chr> "Mcdonalds", "Mcdonalds", "Mcdonalds", "Mcdonalds", "Mc...
## $ item        <chr> "Artisan Grilled Chicken Sandwich", "Single Bacon Smoke...
## $ calories    <dbl> 380, 840, 1130, 750, 920, 540, 300, 510, 430, 770, 380,...
## $ cal_fat     <dbl> 60, 410, 600, 280, 410, 250, 100, 210, 190, 400, 170, 3...
## $ total_fat   <dbl> 7, 45, 67, 31, 45, 28, 12, 24, 21, 45, 18, 34, 20, 34, ...
## $ sat_fat     <dbl> 2.0, 17.0, 27.0, 10.0, 12.0, 10.0, 5.0, 4.0, 11.0, 21.0...
## $ trans_fat   <dbl> 0.0, 1.5, 3.0, 0.5, 0.5, 1.0, 0.5, 0.0, 1.0, 2.5, 0.0, ...
## $ cholesterol <dbl> 95, 130, 220, 155, 120, 80, 40, 65, 85, 175, 40, 95, 12...
## $ sodium      <dbl> 1110, 1580, 1920, 1940, 1980, 950, 680, 1040, 1040, 129...
## $ total_carb  <dbl> 44, 62, 63, 62, 81, 46, 33, 49, 35, 42, 38, 48, 48, 67,...
## $ fiber       <dbl> 3, 2, 3, 2, 4, 3, 2, 3, 2, 3, 2, 3, 3, 5, 2, 2, 3, 3, 5...
## $ sugar       <dbl> 11, 18, 18, 18, 18, 9, 7, 6, 7, 10, 5, 11, 11, 11, 6, 3...
## $ protein     <dbl> 37, 46, 70, 55, 46, 25, 15, 25, 25, 51, 15, 32, 42, 33,...
## $ vit_a       <dbl> 4, 6, 10, 6, 6, 10, 10, 0, 20, 20, 2, 10, 10, 10, 2, 4,...
## $ vit_c       <dbl> 20, 20, 20, 25, 20, 2, 2, 4, 4, 6, 0, 10, 20, 15, 2, 6,...
## $ calcium     <dbl> 20, 20, 50, 20, 20, 15, 10, 2, 15, 20, 15, 35, 35, 35, ...
## $ salad       <chr> "Other", "Other", "Other", "Other", "Other", "Other", "...
# Per-restaurant subsets used by the exercises below.
mcdonalds <- filter(fastfood, restaurant == "Mcdonalds")
dairy_queen <- filter(fastfood, restaurant == "Dairy Queen")

Exercise 1

# Sample mean and standard deviation of calories from fat at Dairy Queen.
# They parameterise the normal curve drawn over the histogram and are reused
# for the normal-probability calculations later in the lab.
dqmean <- mean(dairy_queen$cal_fat)
dqsd <- sd(dairy_queen$cal_fat)

# Density-scaled histogram of cal_fat with the fitted normal density overlaid.
# after_stat(density) replaces the deprecated ..density.. notation, and `args`
# is supplied as a list, which is the form stat_function() documents.
ggplot(data = dairy_queen, aes(x = cal_fat)) +
  geom_blank() +
  geom_histogram(aes(y = after_stat(density))) +
  stat_function(
    fun = dnorm,
    args = list(mean = dqmean, sd = dqsd),
    col = "tomato"
  )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Exercise 2

# Normal probability (Q-Q) plot of Dairy Queen calories from fat.
ggplot(dairy_queen, aes(sample = cal_fat)) +
  stat_qq(geom = "line")

### Exercise 3

# One simulated normal sample with the Dairy Queen sample size, mean and sd.
sim_norm <- rnorm(nrow(dairy_queen), mean = dqmean, sd = dqsd)
# Q-Q plot of the observed cal_fat data alongside simulated normal samples.
qqnormsim(data = dairy_queen, sample = cal_fat)

### Exercise 4

The plot using the actual DQ data looks much different from all 8 simulated plots. In the DQ data, there is a marked, sharp rise at x = 1.45; none of the simulated plots show that pattern. Sim 6 comes close, but it doesn’t have the steep rise the DQ data have.

Exercise 5

# Sample mean and standard deviation of calories from fat at McDonalds.
mcdmean <- mean(mcdonalds$cal_fat)
mcdsd <- sd(mcdonalds$cal_fat)
mcdmean
## [1] 285.614

# Density-scaled histogram of cal_fat with the fitted normal density overlaid.
# after_stat(density) replaces the deprecated ..density.. notation, and `args`
# is supplied as a list, which is the form stat_function() documents.
ggplot(data = mcdonalds, aes(x = cal_fat)) +
  geom_blank() +
  geom_histogram(aes(y = after_stat(density))) +
  stat_function(
    fun = dnorm,
    args = list(mean = mcdmean, sd = mcdsd),
    col = "tomato"
  )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Normal probability (Q-Q) plot of McDonalds calories from fat.
ggplot(data = mcdonalds, aes(sample = cal_fat)) +
  geom_line(stat = "qq")

Normal Probabilities

# Theoretical P(cal_fat < 600) under a normal model with the DQ mean and sd.
pnorm(600, mean = dqmean, sd = dqsd)
## [1] 0.9849848
# Complement: theoretical P(cal_fat > 600).
1 - pnorm(600, mean = dqmean, sd = dqsd)
## [1] 0.01501523

Empirical probability of a Dairy Queen item having more than 600 calories from fat:

# Empirical share of Dairy Queen items with more than 600 calories from fat:
# number of rows passing the filter divided by the total number of DQ rows.
summarize(
  filter(dairy_queen, cal_fat > 600),
  percent = n() / nrow(dairy_queen)
)
## # A tibble: 1 x 1
##   percent
##     <dbl>
## 1  0.0476
# Show the current working directory.
getwd()
## [1] "C:/Users/Jerome/Documents/Math_217_Week_4_RLab"

Exercise 6

Two questions I would like to answer about any of the restaurants.

  1. What is the probability of getting an item w/ a cholesterol level < 50 at Subway?
  2. What is the probability of getting an item w/ a cal_fat level > 150 at Subway?

Step 1 - Filter on Subway, calculate means and standard deviations of cholesterol and cal_fat.

# Subset of items sold at Subway.
subway <- fastfood %>%
  filter(restaurant == "Subway")

# Sample mean and standard deviation of calories from fat at Subway.
subwaymean <- mean(subway$cal_fat)
subwaysd <- sd(subway$cal_fat)

# Density-scaled histogram of cal_fat with the fitted normal density overlaid.
# after_stat(density) replaces the deprecated ..density.. notation, and `args`
# is supplied as a list, which is the form stat_function() documents.
ggplot(data = subway, aes(x = cal_fat)) +
  geom_blank() +
  geom_histogram(aes(y = after_stat(density))) +
  stat_function(
    fun = dnorm,
    args = list(mean = subwaymean, sd = subwaysd),
    col = "tomato"
  )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Sample mean and standard deviation of cholesterol at Subway.
subwaymean2 <- mean(subway$cholesterol)
subwaysd2 <- sd(subway$cholesterol)

# Same histogram-plus-normal-curve display, now for cholesterol.
ggplot(data = subway, aes(x = cholesterol)) +
  geom_blank() +
  geom_histogram(aes(y = after_stat(density))) +
  stat_function(
    fun = dnorm,
    args = list(mean = subwaymean2, sd = subwaysd2),
    col = "tomato"
  )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Step 2 - Calculate the theoretical probability and the empirical probability of getting an item w/ > 150 calories of fat at Subway.

# Theoretical P(cal_fat < 150) under a normal model with the Subway mean/sd.
pnorm(150, mean = subwaymean, sd = subwaysd)
## [1] 0.455404
# Complement: theoretical P(cal_fat > 150).
1 - pnorm(150, mean = subwaymean, sd = subwaysd)
## [1] 0.544596
# Empirical share of Subway items with more than 150 calories from fat.
summarise(
  filter(subway, cal_fat > 150),
  percent = n() / nrow(subway)
)
## # A tibble: 1 x 1
##   percent
##     <dbl>
## 1   0.427

Therefore, the probability of getting an item at Subway w/ > 150 calories of fat, Pr(Y > 150), is somewhere between 0.43 and 0.54. The theoretical probability is 0.54; the empirical probability is 0.43.

Step 3 - Calculate the theoretical probability and empirical probability of getting an item w/ < 50 (mg?) of cholesterol at Subway.

# Theoretical P(cholesterol < 50) under a normal model with the Subway
# cholesterol mean and sd.
pnorm(50, mean = subwaymean2, sd = subwaysd2)
## [1] 0.3912274
# Empirical share of Subway items with cholesterol below 50.
summarise(
  filter(subway, cholesterol < 50),
  percent = n() / nrow(subway)
)
## # A tibble: 1 x 1
##   percent
##     <dbl>
## 1   0.438

Therefore, the probability of getting an item at Subway w/ < 50 (mg?) of cholesterol is somewhere between 0.39 and 0.44. The theoretical probability is 0.39; the empirical probability is 0.44.

Exercise 7 - Which restaurant is closest to normal for sodium?

1st - calculate what’s normal for sodium

# Mean and standard deviation of sodium across every item in the data set.
allmean <- mean(fastfood$sodium)
allsd <- sd(fastfood$sodium)

# Density-scaled histogram of sodium with the fitted normal density overlaid.
# after_stat(density) replaces the deprecated ..density.. notation, and `args`
# is supplied as a list, which is the form stat_function() documents.
ggplot(data = fastfood, aes(x = sodium)) +
  geom_blank() +
  geom_histogram(aes(y = after_stat(density))) +
  stat_function(
    fun = dnorm,
    args = list(mean = allmean, sd = allsd),
    col = "tomato"
  )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# NOTE(review): this evaluates the normal CDF at q = nrow(fastfood), i.e. the
# number of rows in the data set, not at a sodium value. The result is
# P(sodium < row count) and has no obvious interpretation -- confirm the
# intended sodium cutoff and replace the first argument with it.
pnorm(nrow(fastfood), allmean, allsd)
## [1] 0.144445
# One simulated normal sample with the full data set's size, mean and sd.
sim_norm <- rnorm(n = nrow(fastfood), mean = allmean, sd = allsd)
# Q-Q plot of the observed sodium data alongside simulated normal samples.
qqnormsim(sample = sodium, data = fastfood)

Calculate Normal for each of the individual restaurants. Begin by creating subsets for each restaurant.

# One subset per remaining restaurant, keyed on the exact `restaurant` label.
arbys <- filter(fastfood, restaurant == "Arbys")
bk <- filter(fastfood, restaurant == "Burger King")
chick <- filter(fastfood, restaurant == "Chick Fil-A")
sonic <- filter(fastfood, restaurant == "Sonic")
tb <- filter(fastfood, restaurant == "Taco Bell")

Calculate the Shapiro-Wilk test statistic (W) and p-value for each restaurant.

# Shapiro-Wilk normality test on sodium, run once per restaurant subset and
# once on the full data set. The null hypothesis is that the sample comes from
# a normal distribution, so a small p-value is evidence AGAINST normality and
# a W statistic closer to 1 indicates a closer fit to normal.
# The "data:" label in each printed result is taken from the call expression.

# Subway
shapiro.test(subway$sodium)
## 
##  Shapiro-Wilk normality test
## 
## data:  subway$sodium
## W = 0.92175, p-value = 2.515e-05
# Dairy Queen
shapiro.test (dairy_queen$sodium)
## 
##  Shapiro-Wilk normality test
## 
## data:  dairy_queen$sodium
## W = 0.84504, p-value = 4.715e-05
# All restaurants combined
shapiro.test(fastfood$sodium)
## 
##  Shapiro-Wilk normality test
## 
## data:  fastfood$sodium
## W = 0.88668, p-value < 2.2e-16
# Burger King
shapiro.test(bk$sodium)
## 
##  Shapiro-Wilk normality test
## 
## data:  bk$sodium
## W = 0.97291, p-value = 0.1331
# Arbys
shapiro.test(arbys$sodium)
## 
##  Shapiro-Wilk normality test
## 
## data:  arbys$sodium
## W = 0.97073, p-value = 0.1985
# McDonalds
shapiro.test(mcdonalds$sodium)
## 
##  Shapiro-Wilk normality test
## 
## data:  mcdonalds$sodium
## W = 0.76922, p-value = 4.458e-08
# Taco Bell
shapiro.test(tb$sodium)
## 
##  Shapiro-Wilk normality test
## 
## data:  tb$sodium
## W = 0.95501, p-value = 0.000699
# Sonic
shapiro.test(sonic$sodium)
## 
##  Shapiro-Wilk normality test
## 
## data:  sonic$sodium
## W = 0.82286, p-value = 1.784e-06
# Chick Fil-A
shapiro.test(chick$sodium)
## 
##  Shapiro-Wilk normality test
## 
## data:  chick$sodium
## W = 0.86663, p-value = 0.002503

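Based on the Shapiro-Wilk tests, Arbys (W = 0.971, p = 0.20) and Burger King (W = 0.973, p = 0.13) are the only restaurants whose sodium levels are consistent with a normal distribution: their p-values are above 0.05, so we fail to reject the null hypothesis of normality. For the entire dataset and for the other 6 restaurants, the p-values are well below 0.05, so normality is rejected and their sodium levels are not normally distributed. Arbys and Burger King are therefore the closest to normal for sodium.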

Exercise 8 - Stepwise patterns

Calculate the means and standard deviations for each restaurant.

# Mean and standard deviation of sodium for each restaurant subset and for the
# full data set. Each pair is computed from the data frame its name refers to:
# previously bkmean/bksd were computed from `fastfood` and allmean/allsd from
# `bk`, i.e. the two pairs were swapped.
arbysmean <- mean(arbys$sodium)
arbyssd <- sd(arbys$sodium)

# Burger King
bkmean <- mean(bk$sodium)
bksd <- sd(bk$sodium)

# All restaurants combined
allmean <- mean(fastfood$sodium)
allsd <- sd(fastfood$sodium)

# Chick Fil-A
chickmean <- mean(chick$sodium)
chicksd <- sd(chick$sodium)

# Sonic
sonicmean <- mean(sonic$sodium)
sonicsd <- sd(sonic$sodium)

# Taco Bell
tbmean <- mean(tb$sodium)
tbsd <- sd(tb$sodium)

Now produce the QQ plots

# Normal probability (Q-Q) plots of sodium: one per restaurant, then one for
# the full data set. Each plot is titled with the data it shows.
ggplot(arbys, aes(sample = sodium)) +
  stat_qq(geom = "line") +
  ggtitle("Arbys")

ggplot(bk, aes(sample = sodium)) +
  stat_qq(geom = "line") +
  ggtitle("Burger King")

ggplot(chick, aes(sample = sodium)) +
  stat_qq(geom = "line") +
  ggtitle("Chick Fil-A")

ggplot(dairy_queen, aes(sample = sodium)) +
  stat_qq(geom = "line") +
  ggtitle("Dairy Queen")

ggplot(mcdonalds, aes(sample = sodium)) +
  stat_qq(geom = "line") +
  ggtitle("McDonalds")

ggplot(sonic, aes(sample = sodium)) +
  stat_qq(geom = "line") +
  ggtitle("Sonic")

ggplot(subway, aes(sample = sodium)) +
  stat_qq(geom = "line") +
  ggtitle("Subway")

ggplot(tb, aes(sample = sodium)) +
  stat_qq(geom = "line") +
  ggtitle("Taco Bell")

ggplot(fastfood, aes(sample = sodium)) +
  stat_qq(geom = "line") +
  ggtitle("All")

The stepwise pattern may be caused by the sodium levels in the different items. If an item has a much greater sodium level than the previous item, that will cause the line to “jump.”

Exercise 9 - Total Carbs, Normal Probability Plot and Histogram

# Normal probability (Q-Q) plot of total carbohydrates for Subway items.
ggplot(subway, aes(sample = total_carb)) +
  stat_qq(geom = "line") +
  ggtitle("Subway")

This appears to be skewed to the left; the tail will be long to the origin, with most of the bars to the right of the histogram

# Base-graphics histogram of total carbohydrates for Subway items.
hist(
  subway$total_carb,
  xlab = "Total Carbs",
  main = "Total Carbs in Selected Subway Items"
)

I guess I missed my guess; the distribution is somewhat bi-modal, with tails (sort of) at either end.

