loading data

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(openintro)

## Loading required package: airports
## Loading required package: cherryblossom
## Loading required package: usdata

the data

# Load the `fastfood` dataset shipped with the openintro package and show a
# compact, column-wise overview of its variables.
data("fastfood", package = "openintro")
fastfood |>
  glimpse()

## Rows: 515
## Columns: 17
## $ restaurant  <chr> "Mcdonalds", "Mcdonalds", "Mcdonalds", "Mcdonalds", "Mcdon…
## $ item        <chr> "Artisan Grilled Chicken Sandwich", "Single Bacon Smokehou…
## $ calories    <dbl> 380, 840, 1130, 750, 920, 540, 300, 510, 430, 770, 380, 62…
## $ cal_fat     <dbl> 60, 410, 600, 280, 410, 250, 100, 210, 190, 400, 170, 300,…
## $ total_fat   <dbl> 7, 45, 67, 31, 45, 28, 12, 24, 21, 45, 18, 34, 20, 34, 8, …
## $ sat_fat     <dbl> 2.0, 17.0, 27.0, 10.0, 12.0, 10.0, 5.0, 4.0, 11.0, 21.0, 4…
## $ trans_fat   <dbl> 0.0, 1.5, 3.0, 0.5, 0.5, 1.0, 0.5, 0.0, 1.0, 2.5, 0.0, 1.5…
## $ cholesterol <dbl> 95, 130, 220, 155, 120, 80, 40, 65, 85, 175, 40, 95, 125, …
## $ sodium      <dbl> 1110, 1580, 1920, 1940, 1980, 950, 680, 1040, 1040, 1290, …
## $ total_carb  <dbl> 44, 62, 63, 62, 81, 46, 33, 49, 35, 42, 38, 48, 48, 67, 31…
## $ fiber       <dbl> 3, 2, 3, 2, 4, 3, 2, 3, 2, 3, 2, 3, 3, 5, 2, 2, 3, 3, 5, 2…
## $ sugar       <dbl> 11, 18, 18, 18, 18, 9, 7, 6, 7, 10, 5, 11, 11, 11, 6, 3, 1…
## $ protein     <dbl> 37, 46, 70, 55, 46, 25, 15, 25, 25, 51, 15, 32, 42, 33, 13…
## $ vit_a       <dbl> 4, 6, 10, 6, 6, 10, 10, 0, 20, 20, 2, 10, 10, 10, 2, 4, 6,…
## $ vit_c       <dbl> 20, 20, 20, 25, 20, 2, 2, 4, 4, 6, 0, 10, 20, 15, 2, 6, 15…
## $ calcium     <dbl> 20, 20, 50, 20, 20, 15, 10, 2, 15, 20, 15, 35, 35, 35, 4, …
## $ salad       <chr> "Other", "Other", "Other", "Other", "Other", "Other", "Oth…

# Show the first rows of the dataset.
fastfood |>
  head()

## # A tibble: 6 × 17
##   restaurant item       calories cal_fat total_fat sat_fat trans_fat cholesterol
##   <chr>      <chr>         <dbl>   <dbl>     <dbl>   <dbl>     <dbl>       <dbl>
## 1 Mcdonalds  Artisan G…      380      60         7       2       0            95
## 2 Mcdonalds  Single Ba…      840     410        45      17       1.5         130
## 3 Mcdonalds  Double Ba…     1130     600        67      27       3           220
## 4 Mcdonalds  Grilled B…      750     280        31      10       0.5         155
## 5 Mcdonalds  Crispy Ba…      920     410        45      12       0.5         120
## 6 Mcdonalds  Big Mac         540     250        28      10       1            80
## # ℹ 9 more variables: sodium <dbl>, total_carb <dbl>, fiber <dbl>, sugar <dbl>,
## #   protein <dbl>, vit_a <dbl>, vit_c <dbl>, calcium <dbl>, salad <chr>

screening McDonalds and Dairy Queen

# One data frame per restaurant of interest; the restaurant names are matched
# exactly as they are spelled in the `restaurant` column.
mcdonalds <- filter(fastfood, restaurant == "Mcdonalds")
dairy_queen <- filter(fastfood, restaurant == "Dairy Queen")

Plotting Data

Make a plot (or plots) to visualize the distributions of the amount of calories from fat of the options from these two restaurants. How do their centers, shapes, and spreads compare?

The comparison gives a visual of the distribution of calories from fat, which is the starting point for assessing normality. In the histograms below, the shape of each distribution is assessed for symmetry and modality; the graph roughly resembles a bell curve. For the data to be considered normally distributed, the histogram must follow the normal curve closely.

# Sample mean and standard deviation of Dairy Queen's calories from fat;
# these parameterise the normal model used in later calculations.
dqmean <- dairy_queen |>
  pull(cal_fat) |>
  mean()
dqsd <- dairy_queen |>
  pull(cal_fat) |>
  sd()

# Display the Dairy Queen subset.
dairy_queen |>
  print()

## # A tibble: 42 × 17
##    restaurant  item     calories cal_fat total_fat sat_fat trans_fat cholesterol
##    <chr>       <chr>       <dbl>   <dbl>     <dbl>   <dbl>     <dbl>       <dbl>
##  1 Dairy Queen 1/2 lb.…     1000     660        74      26         2         170
##  2 Dairy Queen 1/2 lb.…      800     460        51      20         2         135
##  3 Dairy Queen 1/4 lb.…      630     330        37      13         1          95
##  4 Dairy Queen 1/4 lb.…      540     270        30      11         1          70
##  5 Dairy Queen 1/4 lb.…      570     310        35      11         1          75
##  6 Dairy Queen Origina…      400     160        18       9         1          65
##  7 Dairy Queen Origina…      630     310        34      18         2         125
##  8 Dairy Queen 4 Piece…     1030     480        53       9         1          80
##  9 Dairy Queen 6 Piece…     1260     590        66      11         1         120
## 10 Dairy Queen Bacon C…      420     240        26      11         1          60
## # ℹ 32 more rows
## # ℹ 9 more variables: sodium <dbl>, total_carb <dbl>, fiber <dbl>, sugar <dbl>,
## #   protein <dbl>, vit_a <dbl>, vit_c <dbl>, calcium <dbl>, salad <chr>

library(ggplot2)

# Overlay density-scaled histograms of calories from fat for the two
# restaurants. `fill` is mapped inside aes() (rather than set as a constant)
# so that ggplot2 draws a legend identifying each restaurant; the manual
# scale keeps the original red/green colours. McDonald's is drawn first and
# Dairy Queen on top, both semi-transparent so the overlap stays visible.
ggplot(mapping = aes(x = cal_fat, y = after_stat(density))) +
  geom_histogram(
    data = mcdonalds,
    aes(fill = "McDonald's"),
    alpha = 0.5,
    binwidth = 50
  ) +
  geom_histogram(
    data = dairy_queen,
    aes(fill = "Dairy Queen"),
    alpha = 0.5,
    binwidth = 50
  ) +
  scale_fill_manual(
    name = "Restaurant",
    values = c("McDonald's" = "red", "Dairy Queen" = "green")
  ) +
  labs(x = "Calories from fat", y = "Density")

# Density-scaled histogram of Dairy Queen's calories from fat with the normal
# curve N(dqmean, dqsd) drawn on top for comparison.
# Uses after_stat(density) and `linewidth`, the replacements for the
# `..density..` notation and the `size` line aesthetic deprecated in
# ggplot2 3.4.0.
ggplot(data = dairy_queen, aes(x = cal_fat)) +
  geom_blank() +
  geom_histogram(aes(y = after_stat(density)), binwidth = 30) +
  stat_function(
    fun = dnorm,
    args = list(mean = dqmean, sd = dqsd),
    colour = "tomato",
    linewidth = 1
  )

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## Evaluating Normal Distribution

Based on this plot, does it appear that the data follow a nearly normal distribution?

Based on the plot, points that lie along the diagonal indicate normality.

Deviations in the tails imply skewness.

# Normal Q-Q values for Dairy Queen's calories from fat, connected as a line.
dairy_queen |>
  ggplot(aes(sample = cal_fat)) +
  geom_line(stat = "qq")

# Normal Q-Q plot for Dairy Queen's calories from fat, with a reference line.
dairy_queen |>
  ggplot(aes(sample = cal_fat)) +
  geom_qq() +
  geom_qq_line()

The majority of the points lie on the line, indicating approaching normalcy.

A few deviations in the tails suggest slight skewness.

McDonald Data Normality

# Normal Q-Q plot for McDonald's calories from fat, with a reference line.
mcdonalds |>
  ggplot(aes(sample = cal_fat)) +
  geom_qq() +
  geom_qq_line()

Like Dairy Queen, McDonald’s data does not appear to follow a perfectly normal distribution. While most points in the Q-Q plot are aligned along the diagonal, deviations in the tails indicate some skewness or kurtosis. Dairy Queen’s data has modest deviations, but it follows the normal curve more closely than McDonald’s.

Simulating data from a normal distribution using “rnorm”

# Fix the random-number seed so the simulated sample, and every plot or
# number derived from it, is the same each time the report is rendered.
set.seed(20250301)

# Simulate one value per Dairy Queen item from a normal distribution with the
# same mean and standard deviation as the observed calories from fat.
sim_norm <- rnorm(n = nrow(dairy_queen), mean = dqmean, sd = dqsd)

print(sim_norm)

##  [1] 413.83344 328.11901 318.96006 330.63449 556.29609 234.73603 309.60736
##  [8] 396.41828  41.89200 426.62690 148.34437 -18.03918 -30.93778  47.90816
## [15] 239.29231 413.23426 533.48148 704.46830 327.20686 -28.17674 437.57076
## [22] 347.27707 321.30481 106.63157  39.24285 482.72816 204.11233 140.09860
## [29] 241.73893  50.55494 138.04057 170.65206 147.86359 -43.75166 248.42041
## [36] 148.00487 535.63289 552.15499 275.78102 306.27509 195.13412 193.73653

Exercise 3: Generating Normal distribution

Not all of the points fall on the line.

# Draw a fresh simulated normal sample (same size, mean and sd as the Dairy
# Queen data) and show its normal Q-Q plot with a reference line.
sim_norm <- rnorm(nrow(dairy_queen), mean = dqmean, sd = dqsd)
ggplot(mapping = aes(sample = sim_norm)) +
  geom_qq() +
  geom_qq_line()

Since the Q-Q plot resembles the simulated plot, the normalcy assumption is reasonable.

Not every point is on the line. The probability plot for the simulated data more closely resembles the diagonal line than the genuine data, indicating that the real data deviate from a normal distribution, particularly in the tails.

# Compare the Q-Q plot of the observed calories from fat with Q-Q plots of
# simulated normal samples (openintro helper).
qqnormsim(
  data = dairy_queen,
  sample = cal_fat
)

## Normal Probability

# Theoretical probability that a Dairy Queen item has more than 600 calories
# from fat under the normal model N(dqmean, dqsd). `lower.tail = FALSE`
# requests the upper tail directly instead of computing 1 - P(X <= 600),
# which avoids cancellation error when the tail probability is small.
pnorm(q = 600, mean = dqmean, sd = dqsd, lower.tail = FALSE)

## [1] 0.01501523

# Empirical counterpart: the share of Dairy Queen items whose calories from
# fat exceed 600. Missing values never count towards the numerator but do
# count in the denominator.
dairy_queen |>
  summarise(percent = sum(cal_fat > 600, na.rm = TRUE) / n())

## # A tibble: 1 × 1
##   percent
##     <dbl>
## 1  0.0476

What is the probability that a McDonald’s item has more than 900 calories from fat?

What are the chances that a Dairy Queen dish contains fewer than 450 calories from fat? Determine the solution by using both theoretical and empirical probability calculations.

Theoretical calculations McDonalds

# Theoretical probability that a McDonald's item has more than 900 calories
# from fat, using a normal model with the sample mean and sd of the
# McDonald's data. `lower.tail = FALSE` returns the upper tail directly,
# which is more accurate than 1 - pnorm() for small tail probabilities.
pnorm(
  q = 900,
  mean = mean(mcdonalds$cal_fat),
  sd = sd(mcdonalds$cal_fat),
  lower.tail = FALSE
)

## [1] 0.002707129

# Theoretical probability that a Dairy Queen item has fewer than 450 calories
# from fat under the normal model N(dqmean, dqsd).
pnorm(450, mean = dqmean, sd = dqsd)

## [1] 0.8870773

Theoretical calculations Dairy_Queen

# Theoretical probability that a Dairy Queen item has more than 900 calories
# from fat, using a normal model with the sample mean and sd of the Dairy
# Queen data. The tail is tiny, so the upper tail is requested directly via
# `lower.tail = FALSE` rather than as 1 - pnorm(), which loses precision.
pnorm(
  q = 900,
  mean = mean(dairy_queen$cal_fat),
  sd = sd(dairy_queen$cal_fat),
  lower.tail = FALSE
)

## [1] 2.186778e-05

# P(calories from fat < 450) for Dairy Queen under the normal model
# N(dqmean, dqsd).
pnorm(450, mean = dqmean, sd = dqsd)

## [1] 0.8870773

Empirical calculations McDonalds and Dairy_Queen

# Empirical probability that a McDonald's item has more than 900 calories
# from fat: the share of McDonald's items above that threshold.
# The cutoff is 900 (not 90) so that it answers the same question as the
# theoretical calculation `pnorm(q = 900, ...)` for McDonald's.
mcdonalds |>
  filter(cal_fat > 900) |>
  summarise(percent = n() / nrow(mcdonalds))

## # A tibble: 1 × 1
##   percent
##     <dbl>
## 1   0.895

# Empirical probability that a Dairy Queen item has fewer than 450 calories
# from fat: matching items divided by all Dairy Queen items. Missing values
# never count towards the numerator but do count in the denominator.
dairy_queen |>
  summarise(percent = sum(cal_fat < 450, na.rm = TRUE) / n())

## # A tibble: 1 × 1
##   percent
##     <dbl>
## 1   0.881

Dairy Queen’s empirical probability was close to its theoretical value.

The McDonald’s calculation indicated larger variation, perhaps owing to skewness.

Analyzing the normality of sodium

# Shapiro-Wilk normality test on sodium content, run separately for each
# restaurant; only the p-value of each test is kept.
fastfood |>
  group_by(restaurant) |>
  summarise(normality = shapiro.test(sodium)[["p.value"]])

## # A tibble: 8 × 2
##   restaurant     normality
##   <chr>              <dbl>
## 1 Arbys       0.199       
## 2 Burger King 0.133       
## 3 Chick Fil-A 0.00250     
## 4 Dairy Queen 0.0000471   
## 5 Mcdonalds   0.0000000446
## 6 Sonic       0.00000178  
## 7 Subway      0.0000251   
## 8 Taco Bell   0.000699

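A p-value above 0.05 means the Shapiro–Wilk test does not reject normality, i.e. the sodium content distribution does not differ significantly from a normal distribution. In the table above, only Arbys (0.199) and Burger King (0.133) meet that threshold.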

Discrete data grouping can generate step wise Q-Q plot patterns.

Sodium values may be rounded at restaurant levels.

# Normal Q-Q plot of total carbohydrates for Chick Fil-A items, with a
# reference line.
fastfood |>
  filter(restaurant == "Chick Fil-A") |>
  ggplot(aes(sample = total_carb)) +
  geom_qq() +
  geom_qq_line()

## Total Carbohydrates normal probability plot

# Normal probability (Q-Q) plot of total carbohydrates, restricted to
# Chick Fil-A, with the reference line overlaid.
fastfood |>
  filter(restaurant == "Chick Fil-A") |>
  ggplot(aes(sample = total_carb)) +
  geom_qq() +
  geom_qq_line()

# confirming Skewness by using a histogram

# Histogram of total carbohydrates for Chick Fil-A items (bins of width 5),
# used to check the skewness suggested by the Q-Q plot.
fastfood |>
  filter(restaurant == "Chick Fil-A") |>
  ggplot(aes(x = total_carb)) +
  geom_histogram(binwidth = 5, fill = "gold", color = "blue")

This analysis explains how to assess normalcy in fast food nutritional data. The tools utilized (histograms, Q-Q plots, and probability calculations) aid in deciding if parametric statistical approaches can be used.

Lab: The normal distribution

W. Durosier

2025-03-01