Instructions:

In this course, we have learned the importance of data visualization and how to leverage packages in the Tidyverse for data visualization in R. This project will give you the opportunity to practice those skills in greater depth.

This assignment uses two data sets about fast food restaurants. Specifically, we will create data visualizations using:

Data set: annual sales information and number of stores per fast food restaurant in 2018 (data_fastfood_sales.csv). Source: Data originally from Hubworks.

Data set: nutrition information about entrees from fast food restaurants (data_fastfood_calories.csv). Source: Data originally from the GitHub repository from the Tidy Tuesday project in R4DS online learning community.

Load Libraries and Read Data

library(pacman)
p_load(tidyverse, skimr, ggrepel)

Annual sales information and number of stores per fast food restaurant in 2018

# Read the fast food sales / store-count CSV into `sales`.
sales <- read_csv(file = "../data/raw_data/data_fastfood_sales.csv")

## Rows: 19 Columns: 6
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (1): restaurant
## dbl (5): average_sales, us_sales, num_company_stores, num_franchised_stores,...
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

Nutrition information about entrees from fast food restaurants

# Read the fast food entree nutrition CSV into `calories`.
calories <- read_csv(file = "../data/raw_data/data_fastfood_calories.csv")

## Rows: 515 Columns: 16
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr  (2): restaurant, item
## dbl (14): calories, cal_fat, total_fat, sat_fat, trans_fat, cholesterol, sod...
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

Explore the Data

# Column-by-column summary of the sales table.
skimr::skim(data = sales)

Data summary
Name	sales
Number of rows	19
Number of columns	6
_______________________
Column type frequency:
character	1
numeric	5
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	empty	n_unique	whitespace
restaurant	0	1	3	15	0	19	0

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
average_sales	1	1189.88	541.52	360.72	857.50	1130.00	1470.10	2670.32	▆▇▆▁▁
us_sales	1	7592.69	8007.32	606.00	3499.88	4476.41	9539.12	37480.67	▇▃▁▁▁
num_company_stores	1	839.00	1875.80	0.00	53.50	276.00	677.50	8222.00	▇▁▁▁▁
num_franchised_stores	1	5998.53	5894.51	0.00	2583.00	4055.00	6497.50	25908.00	▇▅▂▁▁
unit_count	1	6838.05	5997.13	2231.00	3034.50	4332.00	7394.00	25908.00	▇▁▂▁▁

# Column-by-column summary of the nutrition table.
skimr::skim(data = calories)

Data summary
Name	calories
Number of rows	515
Number of columns	16
_______________________
Column type frequency:
character	2
numeric	14
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	empty	n_unique	whitespace
restaurant	0	1	5	11	0	8	0
item	0	1	5	63	0	505	0

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
calories	0	1.00	530.91	282.44	20	330.0	490.0	690	2430	▇▆▁▁▁
cal_fat	0	1.00	238.81	166.41	0	120.0	210.0	310	1270	▇▃▁▁▁
total_fat	0	1.00	26.59	18.41	0	14.0	23.0	35	141	▇▃▁▁▁
sat_fat	0	1.00	8.15	6.42	0	4.0	7.0	11	47	▇▃▁▁▁
trans_fat	0	1.00	0.47	0.84	0	0.0	0.0	1	8	▇▁▁▁▁
cholesterol	0	1.00	72.46	63.16	0	35.0	60.0	95	805	▇▁▁▁▁
sodium	0	1.00	1246.74	689.95	15	800.0	1110.0	1550	6080	▇▆▁▁▁
total_carb	0	1.00	45.66	24.88	0	28.5	44.0	57	156	▅▇▂▁▁
fiber	12	0.98	4.14	3.04	0	2.0	3.0	5	17	▇▅▂▁▁
sugar	0	1.00	7.26	6.76	0	3.0	6.0	9	87	▇▁▁▁▁
protein	1	1.00	27.89	17.68	1	16.0	24.5	36	186	▇▂▁▁▁
vit_a	214	0.58	18.86	31.38	0	4.0	10.0	20	180	▇▁▁▁▁
vit_c	210	0.59	20.17	30.59	0	4.0	10.0	30	400	▇▁▁▁▁
calcium	210	0.59	24.85	25.52	0	8.0	20.0	30	290	▇▁▁▁▁

# Compact overview of column types and leading values in the sales table.
sales %>% glimpse()

## Rows: 19
## Columns: 6
## $ restaurant            <chr> "Subway", "Mcdonalds", "Starbucks", "Dunkin Donu~
## $ average_sales         <dbl> 416.86, 2670.32, 945.27, 733.13, 900.00, 1387.81~
## $ us_sales              <dbl> 10800.00, 37480.67, 13167.61, 9192.00, 5510.84, ~
## $ num_company_stores    <dbl> 0, 842, 8222, 0, 96, 50, 647, 337, 392, 535, 54,~
## $ num_franchised_stores <dbl> 25908, 13194, 5708, 12538, 7426, 7196, 5799, 543~
## $ unit_count            <dbl> 25908, 14036, 13930, 12538, 7522, 7266, 6446, 57~

# Compact overview of column types and leading values in the nutrition table.
calories %>% glimpse()

## Rows: 515
## Columns: 16
## $ restaurant  <chr> "Mcdonalds", "Mcdonalds", "Mcdonalds", "Mcdonalds", "Mcdon~
## $ item        <chr> "Artisan Grilled Chicken Sandwich", "Single Bacon Smokehou~
## $ calories    <dbl> 380, 840, 1130, 750, 920, 540, 300, 510, 430, 770, 380, 62~
## $ cal_fat     <dbl> 60, 410, 600, 280, 410, 250, 100, 210, 190, 400, 170, 300,~
## $ total_fat   <dbl> 7, 45, 67, 31, 45, 28, 12, 24, 21, 45, 18, 34, 20, 34, 8, ~
## $ sat_fat     <dbl> 2.0, 17.0, 27.0, 10.0, 12.0, 10.0, 5.0, 4.0, 11.0, 21.0, 4~
## $ trans_fat   <dbl> 0.0, 1.5, 3.0, 0.5, 0.5, 1.0, 0.5, 0.0, 1.0, 2.5, 0.0, 1.5~
## $ cholesterol <dbl> 95, 130, 220, 155, 120, 80, 40, 65, 85, 175, 40, 95, 125, ~
## $ sodium      <dbl> 1110, 1580, 1920, 1940, 1980, 950, 680, 1040, 1040, 1290, ~
## $ total_carb  <dbl> 44, 62, 63, 62, 81, 46, 33, 49, 35, 42, 38, 48, 48, 67, 31~
## $ fiber       <dbl> 3, 2, 3, 2, 4, 3, 2, 3, 2, 3, 2, 3, 3, 5, 2, 2, 3, 3, 5, 2~
## $ sugar       <dbl> 11, 18, 18, 18, 18, 9, 7, 6, 7, 10, 5, 11, 11, 11, 6, 3, 1~
## $ protein     <dbl> 37, 46, 70, 55, 46, 25, 15, 25, 25, 51, 15, 32, 42, 33, 13~
## $ vit_a       <dbl> 4, 6, 10, 6, 6, 10, 10, 0, 20, 20, 2, 10, 10, 10, 2, 4, 6,~
## $ vit_c       <dbl> 20, 20, 20, 25, 20, 2, 2, 4, 4, 6, 0, 10, 20, 15, 2, 6, 15~
## $ calcium     <dbl> 20, 20, 50, 20, 20, 15, 10, 2, 15, 20, 15, 35, 35, 35, 4, ~

# Distinct restaurant names present in the sales table.
sales %>%
  pull(restaurant) %>%
  unique()

##  [1] "Subway"          "Mcdonalds"       "Starbucks"       "Dunkin Donuts"  
##  [5] "Pizza Hut"       "Burger King"     "Taco Bell"       "Wendys"         
##  [9] "Dominos"         "Little Caesars"  "KFC"             "Sonic"          
## [13] "Arbys"           "Papa Johns"      "Jimmy Johns"     "Baskin-Robbins" 
## [17] "Chipotle"        "Jack in the Box" "Popeyes"

# Distinct restaurant names present in the nutrition table.
calories %>%
  pull(restaurant) %>%
  unique()

## [1] "Mcdonalds"   "Chick Fil-A" "Sonic"       "Arbys"       "Burger King"
## [6] "Dairy Queen" "Subway"      "Taco Bell"

Problem 1: Sales vs. Num. Stores

## data
# Plot data for Problem 1: sales, store counts and the share of stores that
# are franchised, sorted from the most to the least franchised chain.
#
# `sales` holds a single row per restaurant (19 rows, 19 unique names), so no
# grouping or aggregation is needed; the previous group_by()/summarise() step
# only re-created `us_sales` on top of a grouping column and printed a
# grouping message.
p1 <- sales %>%
  select(restaurant, us_sales, num_franchised_stores, unit_count) %>%
  mutate(prop_franchised_stores = num_franchised_stores / unit_count) %>%
  # restaurant is the tie-breaker so equally franchised chains keep a
  # deterministic (alphabetical) order
  arrange(desc(prop_franchised_stores), restaurant)

## visualization
# Scatter plot of U.S. sales against total store count, both on log10 axes.
# Points are coloured by the proportion of franchised stores and each point is
# labelled with its restaurant name.
p1 %>%
  ggplot(aes(x = us_sales,
             y = unit_count,
             color = prop_franchised_stores)) +

  geom_point(size = 3) +
  scale_x_log10() +
  scale_y_log10() +

  labs(
    title = "Problem 1: US Sales vs. Number of Stores",
    subtitle = '',
    x = 'U.S. sales in millions (log10 scale)',
    y = 'Total number of stores (log10 scale)',
    caption = "Data: Hubworks & Tidy Tuesday project • Visualization: Steven Ponce (@sponce1)",
    color = "Proportion of stores\nfranchised") +   # legend title

  theme_bw() +

  # theme
  theme(
    plot.margin = margin(t = 10, r = 5, b = 10, l = 5),
    legend.position = 'right',
    axis.title = element_text(size = 12),
    axis.text  = element_text(size = 10),

    plot.title = element_text(
      color = "#2C3E50",
      face = "bold",
      size = 14,
      margin = margin(t = 10)),

    plot.subtitle = element_text(
      color = "#2C3E50",
      size = 14,
      margin = margin(b = 5)),

    plot.caption = element_text(
      color = "grey60",
      size = 10,
      hjust = .5,
      margin = margin(t = 15, b = 15))
  ) +

  # datapoints labels (black text regardless of the point colour scale)
  geom_text_repel(aes(label = restaurant), color = 'black')

Problem 2: Average sales per unit store

## data
# Plot data for Problem 2: average sales per restaurant together with a
# "$<rounded value>" text label for each bar. `restaurant` becomes a factor
# whose levels follow average_sales, which sets the bar order on the plot.
p2 <- sales %>%
  select(restaurant, average_sales) %>%
  group_by(restaurant) %>%
  summarise(average_sales = sum(average_sales)) %>%
  arrange(desc(average_sales)) %>%
  mutate(
    # bar labels: whole-number value prefixed with a dollar sign
    bar_label  = paste0('$', round(average_sales, digits = 0)),
    # factor levels ordered by sales
    restaurant = fct_reorder(restaurant, average_sales)
  ) %>%
  ungroup()


## visualization
# Horizontal bar chart of average sales per store, one bar per restaurant,
# with the pre-formatted dollar label printed just past the end of each bar.
p2 %>%
  ggplot(aes(x = average_sales, y = restaurant)) +
  geom_col() +

  scale_x_continuous(labels = scales::dollar_format()) +
  scale_y_discrete() +
  # clipping is turned off so labels that run past the panel edge stay visible
  coord_cartesian(clip = 'off') +

  labs(
    title = "Problem 2: Average sales per unit store",
    subtitle = '',
    x = 'Average sales per unit store (in thousands)',
    y = 'Restaurant',
    caption = "Data: Hubworks & Tidy Tuesday project • Visualization: Steven Ponce (@sponce1)") +

  theme_classic() +

  # theme
  theme(
    plot.margin = margin(t = 10, r = 25, b = 10, l = 20),
    legend.position = 'right',
    axis.title = element_text(size = 12),
    axis.text  = element_text(size = 10),

    plot.title = element_text(
      color = "#2C3E50",
      face = "bold",
      size = 14,
      margin = margin(t = 10)),

    plot.subtitle = element_text(
      color = "#2C3E50",
      size = 14,
      margin = margin(b = 5)),

    plot.caption = element_text(
      color = "grey60",
      size = 10,
      hjust = .5,
      margin = margin(t = 15, b = 15))
  ) +

  # col labels: hjust is a fixed setting, not a data mapping, so it is passed
  # outside aes()
  geom_text(aes(label = bar_label), hjust = -0.25)

Problem 3: Sodium Levels

## data
# Plot data for Problem 3: keep only the columns the sodium plot uses.
p3 <- select(calories, restaurant, item, calories, sodium)

## visualization
# Calories vs. sodium, one panel per restaurant. A horizontal reference line
# marks 2300 mg of sodium and every item above that line is labelled by name.
p3 %>%
  ggplot(aes(x = calories, y = sodium)) +

  facet_wrap(~restaurant) +
  geom_point(size = 2, alpha = 0.5) +
  geom_hline(yintercept = 2300) +

  # `direction` is a geom_text_repel() parameter, not an aesthetic: inside
  # aes() it is ignored, so it is passed here to repel labels along y only.
  geom_text_repel(data = p3 %>% filter(sodium > 2300),
                  aes(label = item),
                  direction = 'y',
                  nudge_x = 800, nudge_y = 1000) +

  #labs
  labs(
    title = "Problem 3: Sodium Levels",
    subtitle = '',
    x = 'Calories',
    y = 'Sodium (mg)',
    caption = "Data: Hubworks & Tidy Tuesday project • Visualization: Steven Ponce (@sponce1)"
    ) +

  # theme
  theme_bw() +

  theme(
    plot.margin = margin(t = 10, r = 25, b = 10, l = 20),
    legend.position = 'right',
    axis.title = element_text(size = 12),
    axis.text  = element_text(size = 10),

    plot.title = element_text(
      color = "#2C3E50",
      face = "bold",
      size = 14,
      margin = margin(t = 10)),

    plot.subtitle = element_text(
      color = "#2C3E50",
      size = 14,
      margin = margin(b = 5)),

    plot.caption = element_text(
      color = "grey60",
      size = 10,
      hjust = .5,
      margin = margin(t = 15, b = 15)))

Problem 4: Any Salad?

## data
# Plot data for Problem 4: each entree is flagged as a salad when its
# lower-cased name contains "salad", and `restaurant` becomes a factor whose
# levels are ordered by fct_reorder() on calories.
p4 <- calories %>%
  select(restaurant, item, calories) %>%
  mutate(
    is_salad   = item %>% str_to_lower() %>% str_detect('salad'),
    restaurant = fct_reorder(restaurant, calories)
  )


## visualization
# Calorie distribution per restaurant on a log10 x axis: a box plot (with its
# own outlier markers hidden) overlaid with jittered points for every entree,
# coloured by whether the entree is a salad.
ggplot(p4, aes(x = calories, y = restaurant)) +
  geom_boxplot(outlier.shape = NA) +
  geom_jitter(aes(color = is_salad)) +
  scale_x_log10() +
  scale_y_discrete() +
  scale_color_discrete(labels = c("Not a salad", "Salad")) +
  # titles, axis labels, legend title and caption
  labs(
    title    = "Problem 4: Any salad? ",
    subtitle = "",
    x        = "Calories (log10 scale)",
    y        = "Restaurant",
    color    = "Is the entree\n a salad?",
    caption  = "Data: Hubworks & Tidy Tuesday project • Visualization: Steven Ponce (@sponce1)"
  ) +
  # base theme, then per-element overrides
  theme_bw() +
  theme(
    plot.title      = element_text(color = "#2C3E50", face = "bold", size = 14,
                                   margin = margin(t = 10)),
    plot.subtitle   = element_text(color = "#2C3E50", size = 14,
                                   margin = margin(b = 5)),
    plot.caption    = element_text(color = "grey60", size = 10, hjust = .5,
                                   margin = margin(t = 15, b = 15)),
    axis.title      = element_text(size = 12),
    axis.text       = element_text(size = 10),
    legend.position = "right",
    plot.margin     = margin(t = 10, r = 25, b = 10, l = 20)
  )

Problem 5: Sugar Levels

## data
# Plot data for Problem 5: sales joined to the nutrition table on restaurant,
# with Taco Bell removed, reduced to one row per restaurant carrying its U.S.
# sales and the median sugar content of its entrees.
p5 <- inner_join(sales, calories, by = "restaurant") %>%
  select(restaurant, item, calories, sugar, us_sales) %>%
  filter(restaurant != "Taco Bell") %>%
  group_by(restaurant, us_sales) %>%
  summarise(median_sugar = median(sugar)) %>%
  ungroup() %>%
  arrange(desc(median_sugar)) %>%
  # restaurant factor levels follow median sugar
  mutate(restaurant = fct_reorder(restaurant, median_sugar))

## `summarise()` has grouped output by 'restaurant'. You can override using the
## `.groups` argument.

## visualization
# Bar chart of U.S. sales per restaurant, with each bar filled on a viridis
# colour scale according to the median sugar content of its entrees.
ggplot(p5, aes(x = restaurant, y = us_sales)) +
  geom_col(aes(fill = median_sugar)) +
  scale_x_discrete() +
  scale_y_continuous() +
  scale_fill_viridis_c() +
  # titles, axis labels, legend title and caption
  labs(
    title    = "Problem 5: Sugar Levels ",
    subtitle = "",
    x        = "Restaurant",
    y        = "U.S. sales (in millions)",
    fill     = "Median sugar (grams)\nin fast food entrees",
    caption  = "Data: Hubworks & Tidy Tuesday project • Visualization: Steven Ponce (@sponce1)"
  ) +
  # base theme, then per-element overrides
  theme_classic() +
  theme(
    plot.title      = element_text(color = "#2C3E50", face = "bold", size = 14,
                                   margin = margin(t = 10)),
    plot.subtitle   = element_text(color = "#2C3E50", size = 14,
                                   margin = margin(b = 5)),
    plot.caption    = element_text(color = "grey60", size = 10, hjust = .5,
                                   margin = margin(t = 15, b = 15)),
    axis.title      = element_text(size = 12),
    axis.text       = element_text(size = 10),
    legend.position = "right",
    plot.margin     = margin(t = 10, r = 25, b = 10, l = 20)
  )

Course 04: Final Project

Steven Ponce

2022-03-06