The Project Description

The goal is to explore the nutrition of entree items and the sales of fast food restaurants in 2018. I will draw a set of plots to illustrate the nutritional value of different meals offered in these restaurants.

Loading Libraries and data

library(tidyverse)
library(ggrepel)
library(knitr)

# Read the two input tables from CSV files (paths are relative to the
# working directory).
calories_df <- read_csv(file = "data_fastfood_calories.csv")
sales_df <- read_csv(file = "data_fastfood_sales.csv")

First glance at the data

# Render the first three rows of the nutrition table.
kable(head(calories_df, n = 3))
restaurant item calories cal_fat total_fat sat_fat trans_fat cholesterol sodium total_carb fiber sugar protein vit_a vit_c calcium
Mcdonalds Artisan Grilled Chicken Sandwich 380 60 7 2 0.0 95 1110 44 3 11 37 4 20 20
Mcdonalds Single Bacon Smokehouse Burger 840 410 45 17 1.5 130 1580 62 2 18 46 6 20 20
Mcdonalds Double Bacon Smokehouse Burger 1130 600 67 27 3.0 220 1920 63 3 18 70 10 20 50
# Render the first rows of the sales table (head() default row count).
kable(head(sales_df))
restaurant average_sales us_sales num_company_stores num_franchised_stores unit_count
Subway 416.86 10800.00 0 25908 25908
Mcdonalds 2670.32 37480.67 842 13194 14036
Starbucks 945.27 13167.61 8222 5708 13930
Dunkin Donuts 733.13 9192.00 0 12538 12538
Pizza Hut 900.00 5510.84 96 7426 7522
Burger King 1387.81 10028.32 50 7196 7266

Part 1 - Which restaurants offer healthy meals?

To answer this question, I use the sodium, sugar and total_fat variables as proxies for how healthy a meal is.

# One boxplot per restaurant for each of the three nutrients, with every
# observation labelled by its numeric value.
calories_df %>%
  select(restaurant, total_fat, sodium, sugar) %>%
  # Long format: one row per (restaurant, nutrient) measurement
  pivot_longer(
    cols = c(total_fat, sodium, sugar),
    names_to = "ingredient",
    values_to = "value"
  ) %>%
  ggplot(aes(restaurant, value, fill = ingredient, label = value)) +
  geom_boxplot(varwidth = TRUE) +
  geom_text_repel(
    size = 2,
    max.overlaps = getOption("ggrepel.max.overlaps", default = 20)
  ) +
  # One panel per nutrient; the value axis is freed per panel
  facet_wrap(~ingredient, scales = "free_x") +
  coord_flip() +
  theme(legend.position = "none")

Labeling based on the meal item

It is a good idea to label the boxplot based on meal items to get a sense of the calories data.

# Flag values lying outside the 1.5 * IQR fences (the usual boxplot rule).
#
# x      A numeric vector.
# na.rm  Should missing values be ignored when computing the quartiles?
#        Defaults to TRUE so that a single NA no longer makes quantile() fail.
#
# Returns a logical vector the same length as `x`: TRUE where the value is
# below Q1 - 1.5 * IQR or above Q3 + 1.5 * IQR. Missing values in `x` yield NA.
is_outlier <- function(x, na.rm = TRUE) {
  # Compute both quartiles once; their difference is the IQR
  quartiles <- quantile(x, probs = c(0.25, 0.75), na.rm = na.rm, names = FALSE)
  fence <- 1.5 * (quartiles[2] - quartiles[1])

  x < quartiles[1] - fence | x > quartiles[2] + fence
}

# Creating a tidy dataset for visualisation: one row per (item, nutrient)
calories_df %>%
  select(restaurant, item, total_fat, sodium, sugar) %>%
  pivot_longer(cols      = total_fat:sugar,
               names_to  = "ingredient",
               values_to = "value") %>%
  # Outliers are flagged per ingredient across all restaurants, so a label
  # does not necessarily coincide with a per-restaurant boxplot outlier point.
  group_by(ingredient) %>%
  # NA_character_ keeps the label column character even when nothing is flagged
  mutate(outlier = ifelse(is_outlier(value), item, NA_character_)) %>%
  ungroup() %>%

  ggplot(aes(x = restaurant, y = value,
             fill = ingredient, label = outlier)) +
  geom_boxplot(width = 1, varwidth = TRUE) +
  geom_text_repel(size = 2,
                  nudge_y = 3,
                  max.overlaps = getOption("ggrepel.max.overlaps", default = 20)) +
  theme(legend.position = "none") +
  coord_flip() +

  # each variable has a different scale (scales = "free_x")
  facet_wrap(~ingredient, nrow = 3, scales = "free_x")

Part 2 - The number of pieces

One problem with the previous plot is the number of pieces per item. Bigger portions of a meal (such as “20 piece Buttermilk Crispy Chicken Tenders”) naturally contain more sodium, sugar and total fat than smaller ones (such as “3 piece Buttermilk Crispy Chicken Tenders”), so I keep only the smallest portion of each meal in the dataset and remove the bigger portions. In this way, I can compare different meals as eaten by one person.

An Example of different portions of the same meal:

# Show the first 11 meal items whose name contains " piece " (matched
# case-insensitively), restricted to the three nutrients of interest.
calories_df %>%
  select(restaurant, item, sodium, sugar, total_fat) %>%
  filter(str_detect(item, regex(" [pP]iece ", ignore_case = TRUE))) %>%
  head(n = 11) %>%
  kable()
restaurant item sodium sugar total_fat
Mcdonalds 3 piece Buttermilk Crispy Chicken Tenders 910 0 21
Mcdonalds 4 piece Buttermilk Crispy Chicken Tenders 1290 1 28
Mcdonalds 6 piece Buttermilk Crispy Chicken Tenders 1890 1 44
Mcdonalds 10 piece Buttermilk Crispy Chicken Tenders 3230 4 70
Mcdonalds 12 piece Buttermilk Crispy Chicken Tenders 3770 2 88
Mcdonalds 20 piece Buttermilk Crispy Chicken Tenders 6080 3 141
Mcdonalds 4 Piece Chicken McNuggets 340 0 11
Mcdonalds 6 Piece Chicken McNuggets 510 0 16
Mcdonalds 10 Piece Chicken McNuggets 840 0 27
Mcdonalds 20 Piece Chicken McNuggets 1680 0 53
Mcdonalds 40 piece Chicken McNuggets 3370 1 107

Removing bigger portions from the dataset:

# For every "<n> piece <item body>" meal, keep only the row with the smallest
# piece count. Items are grouped per restaurant so that identically named
# items sold by different chains are not collapsed into a single row.
piece_df <-
  calories_df %>%
  # Only items that actually contain " piece " can be split into count + body
  filter(str_detect(item, regex(" piece ", ignore_case = TRUE))) %>%
  mutate(
    # Digits immediately preceding " piece " give the portion size
    no_piece = as.integer(
      str_extract(item, regex("\\d+(?= piece )", ignore_case = TRUE))
    ),
    # Everything after the first " piece " identifies the meal itself
    item_body = str_replace(item, regex("^.*? piece ", ignore_case = TRUE), "")
  ) %>%
  group_by(restaurant, item_body) %>%

  # Keeping the row with the minimum number of pieces per group
  # (with_ties = FALSE keeps exactly one row, the first on ties)
  slice_min(no_piece, n = 1, with_ties = FALSE) %>%

  ungroup() %>%

  # Removing temporary columns
  select(-c(no_piece, item_body))


# Rebuild the dataset: all items without " piece " in their name, followed by
# the rows retained in piece_df.
revised_calories_df <- bind_rows(
  filter(
    calories_df,
    !str_detect(item, regex(" [pP]iece ", ignore_case = TRUE))
  ),
  piece_df
)

Examples from the revised dataset:

# Show the first 11 " piece " items that remain in the revised dataset.
revised_calories_df %>%
  select(restaurant, item, sodium, sugar, total_fat) %>%
  filter(str_detect(item, regex(" [pP]iece ", ignore_case = TRUE))) %>%
  head(n = 11) %>%
  kable()
restaurant item sodium sugar total_fat
Mcdonalds 3 piece Buttermilk Crispy Chicken Tenders 910 0 21
Chick Fil-A 1 Piece Chick-n-Strips 320 1 6
Mcdonalds 4 Piece Chicken McNuggets 340 0 11
Chick Fil-A 4 piece Chicken Nuggets 490 0 6
Dairy Queen 4 Piece Chicken Strip Basket w/ Country Gravy 2780 4 53
Sonic 3 Piece Crispy Chicken Tender Dinner 800 0 14
Chick Fil-A 4 Piece Grilled Chicken Nuggets 220 0 2
Arbys 2 piece Prime-Cut Chicken Tenders 640 0 11
Burger King 4 Piece Spicy Chicken Nuggets 570 0 15
Sonic 3 Piece Super Crunch Chicken Strip Dinner 2160 9 46
Sonic 3 Piece Super Crunch Chicken Strips 670 0 16

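In the same way, I remove large portions from the revised dataset. First I list the items whose names mention a size (Large, Small or Regular); then I drop every item whose name contains Large, Footlong, Double or Ultimate: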

# Item names that mention an explicit portion size
size_pattern <- "[Ll]arge|[Ss]mall|[Rr]egular"

# List every item in the revised dataset whose name matches size_pattern
revised_calories_df %>%
  select(restaurant, item, sodium, sugar, total_fat) %>%
  filter(str_detect(item, regex(size_pattern, ignore_case = TRUE))) %>%
  kable()
restaurant item sodium sugar total_fat
Chick Fil-A Regular Grilled Chicken Sub Sandwich 1000 10 13
Sonic Small Jumbo Popcorn Chicken 1250 1 22
Sonic Large Jumbo Popcorn Chicken 1890 2 32
Sonic Small Spicy Jumbo Popcorn Chicken 860 0 17
Sonic Large Spicy Jumbo Popcorn Chicken 1500 0 30
Sonic All Beef Regular Hot Dog – 6" 870 3 18
Dairy Queen Regular Cheese Curds 900 0 45
Dairy Queen Large Cheese Curds 2210 30 75
# Keywords treated as markers of an oversized portion
large_pattern <- "[Ll]arge|[Ff]ootlong|[Dd]ouble|[Uu]ltimate"

# Drop every item whose name matches large_pattern (case-insensitively)
revised_calories_df <- filter(
  revised_calories_df,
  !str_detect(item, regex(large_pattern, ignore_case = TRUE))
)

The new boxplot based on the revised dataset:

# Same outlier-labelled boxplot as before, drawn from the revised dataset
revised_calories_df %>%
  select(restaurant, item, total_fat, sodium, sugar) %>%
  pivot_longer(cols      = total_fat:sugar,
               names_to  = "ingredient",
               values_to = "value") %>%
  # Outliers are flagged per ingredient across all restaurants
  group_by(ingredient) %>%
  # NA_character_ keeps the label column character even when nothing is flagged
  mutate(outlier = ifelse(is_outlier(value), item, NA_character_)) %>%
  ungroup() %>%

  ggplot(aes(x = restaurant, y = value,
             fill = ingredient, label = outlier)) +
  geom_boxplot(varwidth = TRUE, position = position_dodge(10)) +
  geom_text_repel(size = 2) +
  theme(legend.position = "none") +
  coord_flip() +

  # each variable has a different scale (scales = "free_x")
  facet_wrap(~ingredient, nrow = 3, scales = "free_x")

As we see in this plot compared to the plot for Part 1, some outliers have been removed and the plot is more understandable.

Part 3 - Vegetarian versus regular meals

As we see in the last plot, many of the high-sodium, high-sugar and high-fat items contain meat, so it makes sense to compare these health factors between vegetarian/vegan meals and regular meals.

# Classify each item as "Vegetarian" or "Regular".
# Working assumption: an item is vegetarian when its name contains "veg"
# (with a lower- or upper-case "v").
revised_calories_df <- mutate(
  revised_calories_df,
  item_status = ifelse(grepl("[Vv]eg", item), "Vegetarian", "Regular")
)


# Boxplots of the three nutrients per restaurant, split by item_status
revised_calories_df %>%
  select(restaurant, item, item_status, total_fat, sodium, sugar) %>%
  pivot_longer(cols      = total_fat:sugar,
               names_to  = "ingredient",
               values_to = "value") %>%

  ggplot(aes(x = restaurant, y = value,
             fill = item_status, label = value)) +
  geom_boxplot(varwidth = TRUE) +
  geom_text_repel(size = 2) +
  # The legend is kept here because fill encodes item_status
  coord_flip() +
  # Argument spelled out in full (`scales`) instead of relying on partial
  # matching of `scale`
  facet_wrap(~ingredient, scales = "free_x")

Looking at this plot, there is not a clear pattern in sodium, sugar and total fat between vegetarian and regular items, so we cannot conclude that in these restaurants, vegetarian items are healthier. We need to dive deeper into this problem.

Part 4 - Investigating Sales

# Scatter plot of US sales against unit count, one labelled point per
# restaurant.
ggplot(sales_df, aes(unit_count, us_sales, label = restaurant)) +
  geom_point() +
  geom_text_repel(size = 4)

In order to make the plot more interesting, I separated giant companies from smaller companies:

sales_df %>%
  # separating giant companies from smaller companies: a chain is "giant"
  # when either its us_sales or its unit_count exceeds 5000
  mutate(
    status = ifelse(us_sales > 5000 | unit_count > 5000, "giant", "ordinary"),
    # Fix the level order so the "ordinary" panel is drawn first
    status = factor(status, levels = c("ordinary", "giant"))
  ) %>%

  ggplot(aes(x = unit_count, y = us_sales,
             label = restaurant)) +
  geom_point() +
  # No trailing comma: it would pass an empty argument to geom_text_repel()
  geom_text_repel(
    size = 3,
    max.overlaps = getOption("ggrepel.max.overlaps", default = 20)
  ) +
  # Both axes are freed because the two groups differ in magnitude
  facet_wrap(~status, scales = "free")

Part 5 - Bubble Plot

I added one variable to the previous plot: average_sales, shown as the size of each point.

sales_df %>%
  # segregating giant companies from smaller companies: a chain is "giant"
  # when either its us_sales or its unit_count exceeds 5000
  mutate(
    status = ifelse(us_sales > 5000 | unit_count > 5000, "giant", "ordinary"),
    # Fix the level order so the "ordinary" panel is drawn first
    status = factor(status, levels = c("ordinary", "giant"))
  ) %>%

  # average_sales is mapped to point size, turning the scatter into a
  # bubble plot
  ggplot(aes(x = unit_count, y = us_sales,
             size = average_sales, label = restaurant)) +
  geom_point(col = "purple", alpha = 0.5) +
  # No trailing comma: it would pass an empty argument to geom_text_repel().
  # size = 3 fixes the label size, overriding the mapped size for the text.
  geom_text_repel(
    size = 3,
    max.overlaps = getOption("ggrepel.max.overlaps", default = 20)
  ) +
  facet_wrap(~status, scales = "free")