library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(openintro)
## Loading required package: airports
## Loading required package: cherryblossom
## Loading required package: usdata
# Load the fast-food nutrition data set from openintro, then list its columns
# and their types.
data("fastfood", package = "openintro")
fastfood |>
  glimpse()
## Rows: 515
## Columns: 17
## $ restaurant <chr> "Mcdonalds", "Mcdonalds", "Mcdonalds", "Mcdonalds", "Mcdon…
## $ item <chr> "Artisan Grilled Chicken Sandwich", "Single Bacon Smokehou…
## $ calories <dbl> 380, 840, 1130, 750, 920, 540, 300, 510, 430, 770, 380, 62…
## $ cal_fat <dbl> 60, 410, 600, 280, 410, 250, 100, 210, 190, 400, 170, 300,…
## $ total_fat <dbl> 7, 45, 67, 31, 45, 28, 12, 24, 21, 45, 18, 34, 20, 34, 8, …
## $ sat_fat <dbl> 2.0, 17.0, 27.0, 10.0, 12.0, 10.0, 5.0, 4.0, 11.0, 21.0, 4…
## $ trans_fat <dbl> 0.0, 1.5, 3.0, 0.5, 0.5, 1.0, 0.5, 0.0, 1.0, 2.5, 0.0, 1.5…
## $ cholesterol <dbl> 95, 130, 220, 155, 120, 80, 40, 65, 85, 175, 40, 95, 125, …
## $ sodium <dbl> 1110, 1580, 1920, 1940, 1980, 950, 680, 1040, 1040, 1290, …
## $ total_carb <dbl> 44, 62, 63, 62, 81, 46, 33, 49, 35, 42, 38, 48, 48, 67, 31…
## $ fiber <dbl> 3, 2, 3, 2, 4, 3, 2, 3, 2, 3, 2, 3, 3, 5, 2, 2, 3, 3, 5, 2…
## $ sugar <dbl> 11, 18, 18, 18, 18, 9, 7, 6, 7, 10, 5, 11, 11, 11, 6, 3, 1…
## $ protein <dbl> 37, 46, 70, 55, 46, 25, 15, 25, 25, 51, 15, 32, 42, 33, 13…
## $ vit_a <dbl> 4, 6, 10, 6, 6, 10, 10, 0, 20, 20, 2, 10, 10, 10, 2, 4, 6,…
## $ vit_c <dbl> 20, 20, 20, 25, 20, 2, 2, 4, 4, 6, 0, 10, 20, 15, 2, 6, 15…
## $ calcium <dbl> 20, 20, 50, 20, 20, 15, 10, 2, 15, 20, 15, 35, 35, 35, 4, …
## $ salad <chr> "Other", "Other", "Other", "Other", "Other", "Other", "Oth…
# Preview the first six rows of the data set.
fastfood |>
  head(n = 6L)
## # A tibble: 6 × 17
## restaurant item calories cal_fat total_fat sat_fat trans_fat cholesterol
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Mcdonalds Artisan G… 380 60 7 2 0 95
## 2 Mcdonalds Single Ba… 840 410 45 17 1.5 130
## 3 Mcdonalds Double Ba… 1130 600 67 27 3 220
## 4 Mcdonalds Grilled B… 750 280 31 10 0.5 155
## 5 Mcdonalds Crispy Ba… 920 410 45 12 0.5 120
## 6 Mcdonalds Big Mac 540 250 28 10 1 80
## # ℹ 9 more variables: sodium <dbl>, total_carb <dbl>, fiber <dbl>, sugar <dbl>,
## # protein <dbl>, vit_a <dbl>, vit_c <dbl>, calcium <dbl>, salad <chr>
# Split the menu items by restaurant so each chain can be examined on its own.
mcdonalds <- filter(fastfood, restaurant == "Mcdonalds")
dairy_queen <- filter(fastfood, restaurant == "Dairy Queen")
Make a plot (or plots) to visualize the distributions of the amount of calories from fat of the options from these two restaurants. How do their centers, shapes, and spreads compare?
Plotting the two distributions together gives a visual comparison of the calories from fat, which is the first step in assessing normality. In the histograms below, the shape of each distribution is assessed for symmetry and modality; the graph below roughly resembles a bell curve. For data to be considered normally distributed, the histogram must follow the normal curve closely.
# Sample mean and standard deviation of Dairy Queen's calories from fat; they
# parameterise the normal curve and the normal-model probabilities used below.
dqmean <- mean(dairy_queen[["cal_fat"]])
dqsd <- sd(dairy_queen[["cal_fat"]])

# Show the Dairy Queen subset.
print(dairy_queen)
## # A tibble: 42 × 17
## restaurant item calories cal_fat total_fat sat_fat trans_fat cholesterol
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Dairy Queen 1/2 lb.… 1000 660 74 26 2 170
## 2 Dairy Queen 1/2 lb.… 800 460 51 20 2 135
## 3 Dairy Queen 1/4 lb.… 630 330 37 13 1 95
## 4 Dairy Queen 1/4 lb.… 540 270 30 11 1 70
## 5 Dairy Queen 1/4 lb.… 570 310 35 11 1 75
## 6 Dairy Queen Origina… 400 160 18 9 1 65
## 7 Dairy Queen Origina… 630 310 34 18 2 125
## 8 Dairy Queen 4 Piece… 1030 480 53 9 1 80
## 9 Dairy Queen 6 Piece… 1260 590 66 11 1 120
## 10 Dairy Queen Bacon C… 420 240 26 11 1 60
## # ℹ 32 more rows
## # ℹ 9 more variables: sodium <dbl>, total_carb <dbl>, fiber <dbl>, sugar <dbl>,
## # protein <dbl>, vit_a <dbl>, vit_c <dbl>, calcium <dbl>, salad <chr>
library(ggplot2)
# Overlaid, density-scaled histograms of calories from fat: McDonald's in red
# and Dairy Queen in green, so both restaurants share one y scale.
ggplot() +
  geom_histogram(
    data = mcdonalds,
    aes(x = cal_fat, y = after_stat(density)),
    alpha = 0.5, fill = "red", binwidth = 50
  ) +
  geom_histogram(
    data = dairy_queen,
    aes(x = cal_fat, y = after_stat(density)),
    alpha = 0.5, fill = "green", binwidth = 50
  )

# Dairy Queen histogram with a normal density curve that uses the sample mean
# and standard deviation. `after_stat(density)` and `linewidth` replace the
# deprecated `..density..` and `size`, so no deprecation warnings are emitted.
ggplot(data = dairy_queen, aes(x = cal_fat)) +
  geom_blank() +
  geom_histogram(aes(y = after_stat(density)), binwidth = 30) +
  stat_function(
    fun = dnorm, args = list(mean = dqmean, sd = dqsd),
    colour = "tomato", linewidth = 1
  )
## Evaluating the Normal Distribution
Based on the plot, points that fall along the diagonal indicate normality.
Deviations in the tails imply skewness.
# Normal Q-Q view of Dairy Queen's calories from fat, drawn first as a
# connected line ...
dairy_queen |>
  ggplot(aes(sample = cal_fat)) +
  geom_line(stat = "qq")

# ... and then as points with a reference line.
dairy_queen |>
  ggplot(aes(sample = cal_fat)) +
  geom_qq() +
  geom_qq_line()
The majority of the points lie on the line, indicating approximate normality.
A few deviations in the tails suggest slight skewness.
# Normal Q-Q plot of McDonald's calories from fat, with a reference line.
mcdonalds |>
  ggplot(aes(sample = cal_fat)) +
  geom_qq() +
  geom_qq_line()
Like Dairy Queen, McDonald’s data does not appear to follow a perfectly
normal distribution. While most points in the Q-Q plot are aligned along
the diagonal, deviations in the tails indicate some skewness or
kurtosis. Dairy Queen’s data has modest deviations, but it follows the
normal curve more closely than McDonald’s.
# Simulate one normal draw per Dairy Queen item, using the sample mean and
# standard deviation, and print the simulated values.
sim_norm <- dairy_queen |>
  nrow() |>
  rnorm(mean = dqmean, sd = dqsd)
print(sim_norm)
## [1] 413.83344 328.11901 318.96006 330.63449 556.29609 234.73603 309.60736
## [8] 396.41828 41.89200 426.62690 148.34437 -18.03918 -30.93778 47.90816
## [15] 239.29231 413.23426 533.48148 704.46830 327.20686 -28.17674 437.57076
## [22] 347.27707 321.30481 106.63157 39.24285 482.72816 204.11233 140.09860
## [29] 241.73893 50.55494 138.04057 170.65206 147.86359 -43.75166 248.42041
## [36] 148.00487 535.63289 552.15499 275.78102 306.27509 195.13412 193.73653
Not all of the points fall on the line.
# Draw a fresh simulated normal sample (this overwrites the earlier `sim_norm`)
# and show its normal Q-Q plot with a reference line.
sim_norm <- rnorm(nrow(dairy_queen), dqmean, dqsd)
ggplot() +
  geom_qq(
    aes(sample = sim_norm)
  ) +
  geom_qq_line(
    aes(sample = sim_norm)
  )
Since the Q-Q plot resembles the simulated plot, the normalcy assumption
is reasonable.
Not every point is on the line. The probability plot for the simulated data more closely resembles the diagonal line than the genuine data, indicating that the real data deviate from a normal distribution, particularly in the tails.
# Compare the Dairy Queen Q-Q plot with Q-Q plots of simulated normal data.
qqnormsim(data = dairy_queen, sample = cal_fat)
## Normal Probability
# Theoretical P(cal_fat > 600) for Dairy Queen under the fitted normal model.
1 - pnorm(600, mean = dqmean, sd = dqsd)
## [1] 0.01501523
# Empirical proportion of Dairy Queen items with more than 600 calories from
# fat: the count of qualifying rows divided by the total row count.
dairy_queen |>
  summarise(percent = sum(cal_fat > 600, na.rm = TRUE) / n())
## # A tibble: 1 × 1
## percent
## <dbl>
## 1 0.0476
What is the probability that a McDonald’s item has more than 900 calories from fat?
What are the chances that a Dairy Queen dish contains fewer than 450 calories from fat? Determine the solution by using both theoretical and empirical probability calculations.
# Theoretical P(cal_fat > 900) for McDonald's under a fitted normal model.
1 - pnorm(900, mean = mean(mcdonalds$cal_fat), sd = sd(mcdonalds$cal_fat))
## [1] 0.002707129
# Theoretical P(cal_fat < 450) for Dairy Queen.
pnorm(450, mean = dqmean, sd = dqsd)
## [1] 0.8870773
# Theoretical P(cal_fat > 900) for Dairy Queen.
1 - pnorm(900, mean = dqmean, sd = dqsd)
## [1] 2.186778e-05
# Theoretical P(cal_fat < 450) for Dairy Queen (repeated).
pnorm(450, mean = dqmean, sd = dqsd)
## [1] 0.8870773
# Empirical proportion of McDonald's items with more than 900 calories from
# fat, for comparison with the normal-model probability computed for the same
# cut-off. The cut-off is 900 (not 90) so that it matches the question; the
# printed result that follows this chunk must be regenerated by re-running it.
mcdonalds |>
  filter(cal_fat > 900) |>
  summarise(percent = n() / nrow(mcdonalds))
## # A tibble: 1 × 1
## percent
## <dbl>
## 1 0.895
# Empirical proportion of Dairy Queen items with fewer than 450 calories from
# fat: the count of qualifying rows divided by the total row count.
dairy_queen |>
  summarise(percent = sum(cal_fat < 450, na.rm = TRUE) / n())
## # A tibble: 1 × 1
## percent
## <dbl>
## 1 0.881
Dairy Queen’s empirical probability was close to its theoretical value.
The McDonald’s calculation indicated larger variation, perhaps owing to skewness.
# Shapiro-Wilk normality test on sodium, run separately for each restaurant;
# only the p-value of each test is kept.
fastfood %>%
  group_by(restaurant) %>%
  summarise(normality = shapiro.test(sodium)[["p.value"]])
## # A tibble: 8 × 2
## restaurant normality
## <chr> <dbl>
## 1 Arbys 0.199
## 2 Burger King 0.133
## 3 Chick Fil-A 0.00250
## 4 Dairy Queen 0.0000471
## 5 Mcdonalds 0.0000000446
## 6 Sonic 0.00000178
## 7 Subway 0.0000251
## 8 Taco Bell 0.000699
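A p-value above 0.05 means the Shapiro–Wilk test finds no significant departure from a normal distribution; here that holds only for Arbys (0.199) and Burger King (0.133), while the sodium distributions of the other six restaurants deviate significantly from normality.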
Grouping of discrete values can generate stepwise patterns in a Q-Q plot.
Sodium values may be rounded by each restaurant, which would produce such steps.
# Normal Q-Q plot of total carbohydrates for Chick Fil-A items.
fastfood |>
  filter(restaurant == "Chick Fil-A") |>
  ggplot(aes(sample = total_carb)) +
  geom_qq() +
  geom_qq_line()
## Total Carbohydrates normal probability plot
# Normal Q-Q plot of total carbohydrates for Chick Fil-A items.
fastfood |>
  filter(restaurant == "Chick Fil-A") |>
  ggplot(aes(sample = total_carb)) +
  geom_qq() +
  geom_qq_line()

# Confirm the skewness with a histogram of the same variable.
fastfood |>
  filter(restaurant == "Chick Fil-A") |>
  ggplot(aes(x = total_carb)) +
  geom_histogram(binwidth = 5, fill = "gold", colour = "blue")
This analysis explains how to assess normalcy in fast food nutritional
data. The tools utilized (histograms, Q-Q plots, and probability
calculations) aid in deciding if parametric statistical approaches can
be used.