Instructions:

In this course, we have learned the importance of data visualization and how to leverage packages in the Tidyverse for data visualization in R. This project will give you the opportunity to practice those skills in greater depth.

This assignment uses two data sets about fast food restaurants. Specifically, we will create data visualizations using:

  1. Data set: annual sales information and number of stores per fast food restaurant in 2018 (data_fastfood_sales.csv). Source: Data originally from Hubworks.

  2. Data set: nutrition information about entrees from fast food restaurants (data_fastfood_calories.csv). Source: Data originally from the GitHub repository from the Tidy Tuesday project in R4DS online learning community.

Load Libraries and Read Data

library(pacman)
p_load(tidyverse, skimr, ggrepel)

Annual sales information and number of stores per fast food restaurant in 2018

# Read the restaurant-level sales and store-count table from its CSV file.
sales <- read_csv('../data/raw_data/data_fastfood_sales.csv')
## Rows: 19 Columns: 6
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (1): restaurant
## dbl (5): average_sales, us_sales, num_company_stores, num_franchised_stores,...
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

Nutrition information about entrees from fast food restaurants

# Read the per-item nutrition table from its CSV file.
calories <- read_csv('../data/raw_data/data_fastfood_calories.csv')
## Rows: 515 Columns: 16
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr  (2): restaurant, item
## dbl (14): calories, cal_fat, total_fat, sat_fat, trans_fat, cholesterol, sod...
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

Explore the Data

# Per-column summary of the sales table (missing counts and distribution).
skim(sales)
Data summary
Name sales
Number of rows 19
Number of columns 6
_______________________
Column type frequency:
character 1
numeric 5
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
restaurant 0 1 3 15 0 19 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
average_sales 0 1 1189.88 541.52 360.72 857.50 1130.00 1470.10 2670.32 ▆▇▆▁▁
us_sales 0 1 7592.69 8007.32 606.00 3499.88 4476.41 9539.12 37480.67 ▇▃▁▁▁
num_company_stores 0 1 839.00 1875.80 0.00 53.50 276.00 677.50 8222.00 ▇▁▁▁▁
num_franchised_stores 0 1 5998.53 5894.51 0.00 2583.00 4055.00 6497.50 25908.00 ▇▅▂▁▁
unit_count 0 1 6838.05 5997.13 2231.00 3034.50 4332.00 7394.00 25908.00 ▇▁▂▁▁
# Per-column summary of the nutrition table (missing counts and distribution).
skim(calories)
Data summary
Name calories
Number of rows 515
Number of columns 16
_______________________
Column type frequency:
character 2
numeric 14
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
restaurant 0 1 5 11 0 8 0
item 0 1 5 63 0 505 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
calories 0 1.00 530.91 282.44 20 330.0 490.0 690 2430 ▇▆▁▁▁
cal_fat 0 1.00 238.81 166.41 0 120.0 210.0 310 1270 ▇▃▁▁▁
total_fat 0 1.00 26.59 18.41 0 14.0 23.0 35 141 ▇▃▁▁▁
sat_fat 0 1.00 8.15 6.42 0 4.0 7.0 11 47 ▇▃▁▁▁
trans_fat 0 1.00 0.47 0.84 0 0.0 0.0 1 8 ▇▁▁▁▁
cholesterol 0 1.00 72.46 63.16 0 35.0 60.0 95 805 ▇▁▁▁▁
sodium 0 1.00 1246.74 689.95 15 800.0 1110.0 1550 6080 ▇▆▁▁▁
total_carb 0 1.00 45.66 24.88 0 28.5 44.0 57 156 ▅▇▂▁▁
fiber 12 0.98 4.14 3.04 0 2.0 3.0 5 17 ▇▅▂▁▁
sugar 0 1.00 7.26 6.76 0 3.0 6.0 9 87 ▇▁▁▁▁
protein 1 1.00 27.89 17.68 1 16.0 24.5 36 186 ▇▂▁▁▁
vit_a 214 0.58 18.86 31.38 0 4.0 10.0 20 180 ▇▁▁▁▁
vit_c 210 0.59 20.17 30.59 0 4.0 10.0 30 400 ▇▁▁▁▁
calcium 210 0.59 24.85 25.52 0 8.0 20.0 30 290 ▇▁▁▁▁
# Column names, types and the first few values of the sales table.
glimpse(sales)
## Rows: 19
## Columns: 6
## $ restaurant            <chr> "Subway", "Mcdonalds", "Starbucks", "Dunkin Donu~
## $ average_sales         <dbl> 416.86, 2670.32, 945.27, 733.13, 900.00, 1387.81~
## $ us_sales              <dbl> 10800.00, 37480.67, 13167.61, 9192.00, 5510.84, ~
## $ num_company_stores    <dbl> 0, 842, 8222, 0, 96, 50, 647, 337, 392, 535, 54,~
## $ num_franchised_stores <dbl> 25908, 13194, 5708, 12538, 7426, 7196, 5799, 543~
## $ unit_count            <dbl> 25908, 14036, 13930, 12538, 7522, 7266, 6446, 57~
# Column names, types and the first few values of the nutrition table.
glimpse(calories)
## Rows: 515
## Columns: 16
## $ restaurant  <chr> "Mcdonalds", "Mcdonalds", "Mcdonalds", "Mcdonalds", "Mcdon~
## $ item        <chr> "Artisan Grilled Chicken Sandwich", "Single Bacon Smokehou~
## $ calories    <dbl> 380, 840, 1130, 750, 920, 540, 300, 510, 430, 770, 380, 62~
## $ cal_fat     <dbl> 60, 410, 600, 280, 410, 250, 100, 210, 190, 400, 170, 300,~
## $ total_fat   <dbl> 7, 45, 67, 31, 45, 28, 12, 24, 21, 45, 18, 34, 20, 34, 8, ~
## $ sat_fat     <dbl> 2.0, 17.0, 27.0, 10.0, 12.0, 10.0, 5.0, 4.0, 11.0, 21.0, 4~
## $ trans_fat   <dbl> 0.0, 1.5, 3.0, 0.5, 0.5, 1.0, 0.5, 0.0, 1.0, 2.5, 0.0, 1.5~
## $ cholesterol <dbl> 95, 130, 220, 155, 120, 80, 40, 65, 85, 175, 40, 95, 125, ~
## $ sodium      <dbl> 1110, 1580, 1920, 1940, 1980, 950, 680, 1040, 1040, 1290, ~
## $ total_carb  <dbl> 44, 62, 63, 62, 81, 46, 33, 49, 35, 42, 38, 48, 48, 67, 31~
## $ fiber       <dbl> 3, 2, 3, 2, 4, 3, 2, 3, 2, 3, 2, 3, 3, 5, 2, 2, 3, 3, 5, 2~
## $ sugar       <dbl> 11, 18, 18, 18, 18, 9, 7, 6, 7, 10, 5, 11, 11, 11, 6, 3, 1~
## $ protein     <dbl> 37, 46, 70, 55, 46, 25, 15, 25, 25, 51, 15, 32, 42, 33, 13~
## $ vit_a       <dbl> 4, 6, 10, 6, 6, 10, 10, 0, 20, 20, 2, 10, 10, 10, 2, 4, 6,~
## $ vit_c       <dbl> 20, 20, 20, 25, 20, 2, 2, 4, 4, 6, 0, 10, 20, 15, 2, 6, 15~
## $ calcium     <dbl> 20, 20, 50, 20, 20, 15, 10, 2, 15, 20, 15, 35, 35, 35, 4, ~
# Distinct restaurant names in the sales table.
unique(sales$restaurant)     
##  [1] "Subway"          "Mcdonalds"       "Starbucks"       "Dunkin Donuts"  
##  [5] "Pizza Hut"       "Burger King"     "Taco Bell"       "Wendys"         
##  [9] "Dominos"         "Little Caesars"  "KFC"             "Sonic"          
## [13] "Arbys"           "Papa Johns"      "Jimmy Johns"     "Baskin-Robbins" 
## [17] "Chipotle"        "Jack in the Box" "Popeyes"
# Distinct restaurant names in the nutrition table.
unique(calories$restaurant)
## [1] "Mcdonalds"   "Chick Fil-A" "Sonic"       "Arbys"       "Burger King"
## [6] "Dairy Queen" "Subway"      "Taco Bell"

Problem 1: Sales vs. Num. Stores

## data
# One row per restaurant with its U.S. sales, total store count and the share
# of stores that are franchised (the colour scale of the plot below).
# `sales` already holds exactly one row per restaurant, so the rows are used
# as they are: no grouping or summarising step is required.
p1 <- sales %>%
  select(restaurant, us_sales, num_franchised_stores, unit_count) %>%
  mutate(prop_franchised_stores = num_franchised_stores / unit_count) %>%
  # most heavily franchised restaurants first
  arrange(desc(prop_franchised_stores))
## visualization
# Scatter plot of U.S. sales against total store count, both on log10 axes.
# Point colour encodes the proportion of franchised stores, and every point is
# labelled with its restaurant name.
p1 %>%
  ggplot(aes(
    x = us_sales,
    y = unit_count,
    color = prop_franchised_stores
  )) +
  geom_point(size = 3) +
  # repelled labels so that restaurant names do not sit on top of each other
  geom_text_repel(aes(label = restaurant), color = "black") +
  scale_x_log10() +
  scale_y_log10() +
  labs(
    title = "Problem 1: US Sales vs. Number of Stores",
    subtitle = "",
    x = "U.S. sales in millions (log10 scale)",
    y = "Total number of stores (log10 scale)",
    caption = paste0(
      "Data: Hubworks & Tidy Tuesday project • ",
      "Visualization: Steven Ponce (@sponce1)"
    ),
    color = "Proportion of stores\nfranchised" # legend title
  ) +
  theme_bw() +
  # text sizes, colours and margins layered on top of theme_bw()
  theme(
    plot.margin = margin(t = 10, r = 5, b = 10, l = 5),
    legend.position = "right",
    axis.title = element_text(size = 12),
    axis.text = element_text(size = 10),
    plot.title = element_text(
      color = "#2C3E50",
      face = "bold",
      size = 14,
      margin = margin(t = 10)
    ),
    plot.subtitle = element_text(
      color = "#2C3E50",
      size = 14,
      margin = margin(b = 5)
    ),
    plot.caption = element_text(
      color = "grey60",
      size = 10,
      hjust = .5,
      margin = margin(t = 15, b = 15)
    )
  )

Problem 2: Average sales per unit store

## data
# Average sales per store for each restaurant, plus a "$<rounded value>" text
# label for each bar. `sales` has one row per restaurant, so the values are
# used directly without any aggregation.
p2 <- sales %>%
  select(restaurant, average_sales) %>%
  arrange(desc(average_sales)) %>%
  mutate(
    # bar labels: value rounded to a whole number with a leading dollar sign
    bar_label = paste0("$", round(average_sales, digits = 0)),
    # factor levels follow increasing sales, which puts the largest bar at the
    # top of the horizontal bar chart
    restaurant = fct_reorder(restaurant, average_sales)
  )


## visualization
# Horizontal bar chart of average sales per store, one bar per restaurant,
# with the rounded dollar value printed just past the end of each bar.
p2 %>%
  ggplot(aes(x = average_sales, y = restaurant)) +
  geom_col() +
  # hjust is a fixed setting rather than a data mapping, so it is set outside
  # aes(); the negative value pushes each label to the right of its bar end
  geom_text(aes(label = bar_label), hjust = -0.25) +
  scale_x_continuous(labels = scales::dollar_format()) +
  scale_y_discrete() +
  # keep labels that run past the panel edge visible
  coord_cartesian(clip = "off") +
  labs(
    title = "Problem 2: Average sales per unit store",
    subtitle = "",
    x = "Average sales per unit store (in thousands)",
    y = "Restaurant",
    caption = paste0(
      "Data: Hubworks & Tidy Tuesday project • ",
      "Visualization: Steven Ponce (@sponce1)"
    )
  ) +
  theme_classic() +
  # text sizes, colours and margins layered on top of theme_classic()
  theme(
    plot.margin = margin(t = 10, r = 25, b = 10, l = 20),
    legend.position = "right",
    axis.title = element_text(size = 12),
    axis.text = element_text(size = 10),
    plot.title = element_text(
      color = "#2C3E50",
      face = "bold",
      size = 14,
      margin = margin(t = 10)
    ),
    plot.subtitle = element_text(
      color = "#2C3E50",
      size = 14,
      margin = margin(b = 5)
    ),
    plot.caption = element_text(
      color = "grey60",
      size = 10,
      hjust = .5,
      margin = margin(t = 15, b = 15)
    )
  )

Problem 3: Sodium Levels

## data
# Keep only the columns the sodium plot needs: restaurant, item name,
# calories and sodium.
p3 <- select(calories, restaurant, item, calories, sodium)

## visualization
# Sodium against calories for every item, one panel per restaurant. Items above
# the 2300 mg reference line are labelled with their names.
p3 %>%
  ggplot(aes(x = calories, y = sodium)) +
  facet_wrap(~restaurant) +
  geom_point(size = 2, alpha = 0.5) +
  # reference line at 2300 mg of sodium
  # NOTE(review): presumably a recommended daily sodium limit -- confirm
  geom_hline(yintercept = 2300) +
  # label only the items above the reference line; `direction` is a layer
  # parameter of geom_text_repel(), not an aesthetic, so it has to sit outside
  # aes() for the labels to be repelled along the vertical axis only
  geom_text_repel(
    data = filter(p3, sodium > 2300),
    aes(label = item),
    direction = "y",
    nudge_x = 800,
    nudge_y = 1000
  ) +
  labs(
    title = "Problem 3: Sodium Levels",
    subtitle = "",
    x = "Calories",
    y = "Sodium (mg)",
    caption = paste0(
      "Data: Hubworks & Tidy Tuesday project • ",
      "Visualization: Steven Ponce (@sponce1)"
    )
  ) +
  theme_bw() +
  # text sizes, colours and margins layered on top of theme_bw()
  theme(
    plot.margin = margin(t = 10, r = 25, b = 10, l = 20),
    legend.position = "right",
    axis.title = element_text(size = 12),
    axis.text = element_text(size = 10),
    plot.title = element_text(
      color = "#2C3E50",
      face = "bold",
      size = 14,
      margin = margin(t = 10)
    ),
    plot.subtitle = element_text(
      color = "#2C3E50",
      size = 14,
      margin = margin(b = 5)
    ),
    plot.caption = element_text(
      color = "grey60",
      size = 10,
      hjust = .5,
      margin = margin(t = 15, b = 15)
    )
  )

Problem 4: Any Salad?

## data
# Calories per item, with a flag for salads and the restaurants ordered by
# their calorie distribution.
p4 <- calories %>%
  select(restaurant, item, calories) %>%
  mutate(
    # TRUE when the item name contains "salad", ignoring case
    is_salad = str_detect(str_to_lower(item), "salad"),
    # factor levels follow fct_reorder()'s default summary of calories
    restaurant = fct_reorder(restaurant, calories)
  )


## visualization
# Box plot of calories (log10 axis) per restaurant with every item overlaid as
# a jittered point, coloured by whether the item is a salad.
ggplot(data = p4, mapping = aes(x = calories, y = restaurant)) +
  # outliers are hidden in the box layer because every item is drawn as a
  # jittered point in the next layer
  geom_boxplot(outlier.shape = NA) +
  geom_jitter(aes(color = is_salad)) +
  scale_x_log10() +
  scale_y_discrete() +
  # legend text for the FALSE / TRUE values of is_salad, in that order
  scale_color_discrete(labels = c("Not a salad", "Salad")) +
  theme_bw() +
  # text sizes, colours and margins layered on top of theme_bw()
  theme(
    plot.margin = margin(t = 10, r = 25, b = 10, l = 20),
    legend.position = "right",
    axis.title = element_text(size = 12),
    axis.text = element_text(size = 10),
    plot.title = element_text(
      color = "#2C3E50",
      face = "bold",
      size = 14,
      margin = margin(t = 10)
    ),
    plot.subtitle = element_text(
      color = "#2C3E50",
      size = 14,
      margin = margin(b = 5)
    ),
    plot.caption = element_text(
      color = "grey60",
      size = 10,
      hjust = .5,
      margin = margin(t = 15, b = 15)
    )
  ) +
  labs(
    title = "Problem 4: Any salad? ",
    subtitle = "",
    x = "Calories (log10 scale)",
    y = "Restaurant",
    color = "Is the entree\n a salad?",
    caption = paste0(
      "Data: Hubworks & Tidy Tuesday project • ",
      "Visualization: Steven Ponce (@sponce1)"
    )
  )

Problem 5: Sugar Levels

## data
# Median sugar per restaurant alongside its U.S. sales. Only restaurants that
# appear in both tables survive the inner join; Taco Bell is then excluded.
# NOTE(review): the reason for excluding Taco Bell is not stated here -- confirm.
p5 <- inner_join(sales, calories, by = "restaurant") %>%
  filter(restaurant != "Taco Bell") %>%
  select(restaurant, item, calories, sugar, us_sales) %>%
  # us_sales is constant within a restaurant; grouping by it carries the value
  # through the summary
  group_by(restaurant, us_sales) %>%
  summarise(median_sugar = median(sugar)) %>%
  ungroup() %>%
  arrange(desc(median_sugar)) %>%
  # factor levels (and so the bar order) follow increasing median sugar
  mutate(restaurant = fct_reorder(restaurant, median_sugar))
## `summarise()` has grouped output by 'restaurant'. You can override using the
## `.groups` argument.
## visualization
# Bar chart of U.S. sales per restaurant, with each bar filled according to the
# median sugar content of that restaurant's entrees (viridis colour scale).
ggplot(data = p5, mapping = aes(x = restaurant, y = us_sales)) +
  geom_col(aes(fill = median_sugar)) +
  scale_x_discrete() +
  scale_y_continuous() +
  scale_fill_viridis_c() +
  theme_classic() +
  # text sizes, colours and margins layered on top of theme_classic()
  theme(
    plot.margin = margin(t = 10, r = 25, b = 10, l = 20),
    legend.position = "right",
    axis.title = element_text(size = 12),
    axis.text = element_text(size = 10),
    plot.title = element_text(
      color = "#2C3E50",
      face = "bold",
      size = 14,
      margin = margin(t = 10)
    ),
    plot.subtitle = element_text(
      color = "#2C3E50",
      size = 14,
      margin = margin(b = 5)
    ),
    plot.caption = element_text(
      color = "grey60",
      size = 10,
      hjust = .5,
      margin = margin(t = 15, b = 15)
    )
  ) +
  labs(
    title = "Problem 5: Sugar Levels ",
    subtitle = "",
    x = "Restaurant",
    y = "U.S. sales (in millions)",
    fill = "Median sugar (grams)\nin fast food entrees",
    caption = paste0(
      "Data: Hubworks & Tidy Tuesday project • ",
      "Visualization: Steven Ponce (@sponce1)"
    )
  )