Healthy fast-food restaurants

The Project Description

The goal is to explore the nutrition of entree items and the sales of fast food restaurants in 2018. I will draw a set of plots to illustrate the nutritional value of different meals offered in these restaurants.

Loading Libraries and data

library(tidyverse)
library(ggrepel)
library(knitr)

# Load the per-item nutrition table and the per-restaurant sales table
calories_df <- read_csv(file = "data_fastfood_calories.csv")
sales_df <- read_csv(file = "data_fastfood_sales.csv")

First glance at the data

# Render the first three rows of the nutrition data as a table
kable(head(calories_df, n = 3))

restaurant	item	calories	cal_fat	total_fat	sat_fat	trans_fat	cholesterol	sodium	total_carb	fiber	sugar	protein	vit_a	vit_c	calcium
Mcdonalds	Artisan Grilled Chicken Sandwich	380	60	7	2	0.0	95	1110	44	3	11	37	4	20	20
Mcdonalds	Single Bacon Smokehouse Burger	840	410	45	17	1.5	130	1580	62	2	18	46	6	20	20
Mcdonalds	Double Bacon Smokehouse Burger	1130	600	67	27	3.0	220	1920	63	3	18	70	10	20	50

# Render the first rows of the sales data as a table (head() default length)
kable(head(sales_df))

restaurant	average_sales	us_sales	num_company_stores	num_franchised_stores	unit_count
Subway	416.86	10800.00	0	25908	25908
Mcdonalds	2670.32	37480.67	842	13194	14036
Starbucks	945.27	13167.61	8222	5708	13930
Dunkin Donuts	733.13	9192.00	0	12538	12538
Pizza Hut	900.00	5510.84	96	7426	7522
Burger King	1387.81	10028.32	50	7196	7266

Part 1 - Which restaurants offer healthy meals?

To answer this question, I used the sodium, sugar and total_fat variables as proxies for how healthy a meal is.

# Reshape the three proxy nutrients to long form (one row per item and
# nutrient), then draw one panel of flipped boxplots per nutrient.
calories_df %>%
  select(restaurant, total_fat, sodium, sugar) %>%
  pivot_longer(
    cols = c(total_fat, sodium, sugar),
    names_to = "ingredient",
    values_to = "value"
  ) %>%
  ggplot(
    mapping = aes(x = restaurant, y = value, fill = ingredient, label = value)
  ) +
  # Box width varies with the number of items behind each box
  geom_boxplot(varwidth = TRUE) +
  # Every observation is labelled with its numeric value
  geom_text_repel(
    size = 2,
    max.overlaps = getOption("ggrepel.max.overlaps", default = 20)
  ) +
  coord_flip() +
  # Each nutrient gets its own value scale
  facet_wrap(vars(ingredient), scales = "free_x") +
  theme(legend.position = "none")

Labeling based on the meal item

It is a good idea to label the boxplot based on meal items to get a sense of the calories data.

# Flag values lying outside the Tukey fences, i.e. more than 1.5 * IQR below
# the first quartile or above the third quartile.
#
# x: numeric vector.
# Returns a logical vector the same length as x. Missing values in x are
# ignored when computing the fences and yield NA in the result.
is_outlier <- function(x) {
  # Compute both quartiles once; na.rm = TRUE avoids the error quantile()
  # raises when x contains NA
  quartiles <- quantile(x, probs = c(0.25, 0.75), na.rm = TRUE, names = FALSE)
  fence <- 1.5 * (quartiles[2] - quartiles[1])
  x < quartiles[1] - fence | x > quartiles[2] + fence
}

# Creating a tidy dataset for visualisation
calories_df %>%
  select(restaurant, item, total_fat, sodium, sugar) %>%
  pivot_longer(cols      = total_fat:sugar,
               names_to  = "ingredient",
               values_to = "value") %>%
  # Outliers are flagged within each nutrient, across all restaurants
  group_by(ingredient) %>%
  # Only outlying items get a label. NA_character_ keeps the column of type
  # character even for a nutrient without any outlier; a numeric NA would
  # make the per-group results incompatible.
  mutate(outlier = if_else(is_outlier(value), item, NA_character_)) %>%
  ungroup() %>%

  ggplot(aes(x = restaurant, y = value,
             fill = ingredient, label = outlier)) +
    geom_boxplot(width = 1, varwidth = TRUE) +
    # Rows whose label is NA are not drawn as text
    geom_text_repel(size = 2,
                    nudge_y = 3,
                    max.overlaps = getOption("ggrepel.max.overlaps", default = 20)) +
    theme(legend.position = "none") +
    coord_flip() +

  # each variable has a different scale (scales = "free_x")
    facet_wrap(~ingredient, nrow = 3, scales = "free_x")

Part 2- The number of pieces

One problem in the previous plot is the number of pieces. Bigger portions of a meal (such as “20 piece Buttermilk Crispy Chicken Tenders”) naturally have more sodium, sugar and total fat than smaller ones (such as “3 piece Buttermilk Crispy Chicken Tenders”), so I keep only the smallest portion of each item in the dataset and remove the bigger portions. In this way, I can compare different meals as eaten by one person.

An Example of different portions of the same meal:

# Show the first eleven menu items whose name contains " piece "
calories_df %>%
  select(restaurant, item, sodium, sugar, total_fat) %>%
  filter(str_detect(item, regex(" [pP]iece ", ignore_case = TRUE))) %>%
  head(n = 11) %>%
  kable()

restaurant	item	sodium	sugar	total_fat
Mcdonalds	3 piece Buttermilk Crispy Chicken Tenders	910	0	21
Mcdonalds	4 piece Buttermilk Crispy Chicken Tenders	1290	1	28
Mcdonalds	6 piece Buttermilk Crispy Chicken Tenders	1890	1	44
Mcdonalds	10 piece Buttermilk Crispy Chicken Tenders	3230	4	70
Mcdonalds	12 piece Buttermilk Crispy Chicken Tenders	3770	2	88
Mcdonalds	20 piece Buttermilk Crispy Chicken Tenders	6080	3	141
Mcdonalds	4 Piece Chicken McNuggets	340	0	11
Mcdonalds	6 Piece Chicken McNuggets	510	0	16
Mcdonalds	10 Piece Chicken McNuggets	840	0	27
Mcdonalds	20 Piece Chicken McNuggets	1680	0	53
Mcdonalds	40 piece Chicken McNuggets	3370	1	107

Removing bigger portions from the dataset:

# Smallest portion of every item that is sold by piece count
piece_df <-
  calories_df %>%
  # Only by-the-piece items take part; other rows would have no count
  filter(str_detect(item, regex(" piece ", ignore_case = TRUE))) %>%
  mutate(
    # Piece count as a number, so that 10 sorts after 3
    # (NA when no digits precede " piece ")
    no_piece = as.numeric(
      str_extract(item, regex("\\d+(?= piece )", ignore_case = TRUE))
    ),
    # Item name with the "<count> piece " part removed
    item_body = str_replace(item, regex("\\d+ piece ", ignore_case = TRUE), "")
  ) %>%
  # The same item body can occur at several restaurants, so the minimum is
  # taken per restaurant and item body, not per item body alone
  group_by(restaurant, item_body) %>%
  # Keeping the row with the minimum number of pieces per group
  # (arrange() sorts NA counts last)
  arrange(no_piece, .by_group = TRUE) %>%
  slice(1) %>%
  ungroup() %>%
  # Removing temporary columns
  select(-c(no_piece, item_body))


revised_calories_df <-
  calories_df %>%
  # Removing all the rows with pieces
  filter(!str_detect(item, regex(" piece ", ignore_case = TRUE))) %>%
  # Adding back only the smallest portion of each item (piece_df)
  bind_rows(piece_df)

Examples from the revised dataset:

# Show the by-the-piece items that remain in the revised dataset
revised_calories_df %>%
  select(restaurant, item, sodium, sugar, total_fat) %>%
  filter(str_detect(item, regex(" [pP]iece ", ignore_case = TRUE))) %>%
  head(n = 11) %>%
  kable()

restaurant	item	sodium	sugar	total_fat
Mcdonalds	3 piece Buttermilk Crispy Chicken Tenders	910	0	21
Chick Fil-A	1 Piece Chick-n-Strips	320	1	6
Mcdonalds	4 Piece Chicken McNuggets	340	0	11
Chick Fil-A	4 piece Chicken Nuggets	490	0	6
Dairy Queen	4 Piece Chicken Strip Basket w/ Country Gravy	2780	4	53
Sonic	3 Piece Crispy Chicken Tender Dinner	800	0	14
Chick Fil-A	4 Piece Grilled Chicken Nuggets	220	0	2
Arbys	2 piece Prime-Cut Chicken Tenders	640	0	11
Burger King	4 Piece Spicy Chicken Nuggets	570	0	15
Sonic	3 Piece Super Crunch Chicken Strip Dinner	2160	9	46
Sonic	3 Piece Super Crunch Chicken Strips	670	0	16

In the same way, I remove large portions (items whose names contain Large, Footlong, Double or Ultimate) from the revised dataset:

# Size words that can appear in an item name
size_pattern <- "[Ll]arge|[Ss]mall|[Rr]egular"

# List every item of the revised dataset whose name mentions a size
revised_calories_df %>%
  select(restaurant, item, sodium, sugar, total_fat) %>%
  filter(str_detect(item, regex(size_pattern, ignore_case = TRUE))) %>%
  kable()

restaurant	item	sodium	sugar	total_fat
Chick Fil-A	Regular Grilled Chicken Sub Sandwich	1000	10	13
Sonic	Small Jumbo Popcorn Chicken	1250	1	22
Sonic	Large Jumbo Popcorn Chicken	1890	2	32
Sonic	Small Spicy Jumbo Popcorn Chicken	860	0	17
Sonic	Large Spicy Jumbo Popcorn Chicken	1500	0	30
Sonic	All Beef Regular Hot Dog – 6"	870	3	18
Dairy Queen	Regular Cheese Curds	900	0	45
Dairy Queen	Large Cheese Curds	2210	30	75

# Name fragments treated as markers of an oversized portion
large_pattern <- "[Ll]arge|[Ff]ootlong|[Dd]ouble|[Uu]ltimate"

# Keep only the items whose name matches none of those fragments
revised_calories_df <-
  filter(
    revised_calories_df,
    str_detect(item, regex(large_pattern, ignore_case = TRUE), negate = TRUE)
  )

The new boxplot based on the revised dataset:

# Same labelled boxplot as before, drawn from the revised dataset
revised_calories_df %>%
  select(restaurant, item, total_fat, sodium, sugar) %>%
  pivot_longer(cols      = total_fat:sugar,
               names_to  = "ingredient",
               values_to = "value") %>%
  # Outliers are flagged within each nutrient, across all restaurants
  group_by(ingredient) %>%
  # Only outlying items get a label. NA_character_ keeps the column of type
  # character even for a nutrient without any outlier; a numeric NA would
  # make the per-group results incompatible.
  mutate(outlier = if_else(is_outlier(value), item, NA_character_)) %>%
  ungroup() %>%

  ggplot(aes(x = restaurant, y = value,
             fill = ingredient, label = outlier)) +
  geom_boxplot(varwidth = TRUE, position = position_dodge(10)) +
  geom_text_repel(size = 2) +
  theme(legend.position = "none") +
  coord_flip() +

  # each variable has a different scale (scales = "free_x")
  facet_wrap(~ingredient, nrow = 3, scales = "free_x")

As we see in this plot compared to the plot for Part 1, some outliers have been removed and the plot is more understandable.

Part 3- Vegetarian versus regular meals

As we see in the last plot, many of the high-sodium, high-sugar and high-fat items have meat, so it makes sense to investigate these health factors in vegetarian/vegan versus regular meals.

# Determining Vegetarian items.
# An item is treated as vegetarian when its name contains "veg" or "Veg";
# every other item is labelled "Regular".

revised_calories_df <-
  mutate(
    revised_calories_df,
    item_status = ifelse(grepl("[Vv]eg", item), "Vegetarian", "Regular")
  )


# Boxplots of the three proxy nutrients per restaurant, split into
# vegetarian and regular items by fill colour.
revised_calories_df %>%
  select(restaurant, item, item_status, total_fat, sodium, sugar) %>%
  pivot_longer(cols      = total_fat:sugar,
               names_to  = "ingredient",
               values_to = "value") %>%

  ggplot(aes(x = restaurant, y = value,
             fill = item_status, label = value)) +
  geom_boxplot(varwidth = TRUE) +
  geom_text_repel(size = 2) +
  # The legend is kept here because the fill colour encodes item_status
  coord_flip() +
  # Argument name spelled out in full instead of relying on partial matching
  facet_wrap(~ingredient, scales = "free_x")

Looking at this plot, there is not a clear pattern in sodium, sugar and total fat between vegetarian and regular items, so we cannot conclude that in these restaurants, vegetarian items are healthier. We need to dive deeper into this problem.

Part 4 - Investigating Sales

# Scatter plot of us_sales against unit_count, one labelled point per restaurant
ggplot(
  data = sales_df,
  mapping = aes(x = unit_count, y = us_sales, label = restaurant)
) +
  geom_point() +
  geom_text_repel(size = 4)

In order to make the plot more interesting, I separated giant companies from smaller companies:

sales_df %>%
  # separating giant companies from smaller companies: a restaurant is
  # "giant" when either us_sales or unit_count exceeds 5000
  mutate(
    status = ifelse(us_sales > 5000 | unit_count > 5000, "giant", "ordinary"),
    # explicit level order puts the "ordinary" panel first
    status = factor(status, levels = c("ordinary", "giant"))
  ) %>%

  ggplot(aes(x = unit_count, y = us_sales,
             label = restaurant)) +
  geom_point() +
  # no trailing comma: it would pass an empty argument to geom_text_repel()
  geom_text_repel(
    size = 3,
    max.overlaps = getOption("ggrepel.max.overlaps", default = 20)
  ) +
  # both axes are free so each panel uses its own range
  facet_wrap(~status, scales = "free")

Part 5 - Bubble Plot

I added one variable to the previous plot: average_sales, shown as the size of each point.

sales_df %>%
  # segregating giant companies from smaller companies: a restaurant is
  # "giant" when either us_sales or unit_count exceeds 5000
  mutate(
    status = ifelse(us_sales > 5000 | unit_count > 5000, "giant", "ordinary"),
    # explicit level order puts the "ordinary" panel first
    status = factor(status, levels = c("ordinary", "giant"))
  ) %>%

  # point size encodes average_sales
  ggplot(aes(x = unit_count, y = us_sales,
             size = average_sales, label = restaurant)) +
  geom_point(col = "purple", alpha = 0.5) +
  # no trailing comma: it would pass an empty argument to geom_text_repel()
  geom_text_repel(
    size = 3,
    max.overlaps = getOption("ggrepel.max.overlaps", default = 20)
  ) +
  # both axes are free so each panel uses its own range
  facet_wrap(~status, scales = "free")