Analysis of Beef Steak Items in the NYPL Restaurant Menu Dataset

This report provides reproducible code to analyze the proportion of NYPL Restaurant Menu items which contain references to beef steak products. We plot the proportion of menu items that involve beef steak over time, grouping by the decade, by the day of the week and by meal (breakfast versus lunch versus dinner). Each of these is a separate element of our data cleaning workflow.

library(tidyverse)
library(lubridate)
library(ngram)

menu_items <- read_csv("/Users/ericthompson/Downloads/NYPL-Dataset/MenuItem.csv")
dishes <- read_csv("/Users/ericthompson/Downloads/NYPL-Dataset/Dish.csv")
menu_pages <- read_csv("/Users/ericthompson/Downloads/NYPL-Dataset/MenuPage.csv")
menu <- read_csv("/Users/ericthompson/Downloads/NYPL-Dataset/Menu.csv")
menu_cleaned <- read_csv("/Users/ericthompson/Downloads/NYPL-Dataset/Menu-cleaned.csv")

steak_dishes_unfiltered <- dishes[
  grep("Steak|Porterhouse|Sirloin|Filet Mignon|T-bone|tri-tip|skirt|flank|
       beef|hanger|flat iron|rump|entrecote|brisket", 
  dishes$name, ignore.case = TRUE), ]

# some non-beef products remain in steak_dishes_unfiltered, such as halibut or lamb
words.freq <- table(unlist(str_split(str_c(steak_dishes_unfiltered$name, sep=" ",  collapse = " "), " ")))

# top 50 most common terms in our hes_unfiltered
words.freq %>% sort(decreasing = TRUE) %>% head(50)
## 
##       Steak        with     Sirloin      Steak,         and          of 
##        3950        3675        2592        2130        1981        1559 
##       steak     Broiled           a       Fried    Potatoes      French 
##        1391        1386        1111        1111         974         941 
##          or       Filet          la       Sauce         mit  Tenderloin 
##         924         904         820         807         761         713 
##           -      steak,        Beef       STEAK     sirloin         for 
##         649         644         638         625         600         599 
##       Small Porterhouse      Mignon      Onions       Extra     Grilled 
##         593         478         473         464         457         435 
##         und      Butter       Fresh   Mushrooms       sauce   Beefsteak 
##         429         418         414         409         400         399 
##     SIRLOIN               Potatoes,       Prime      STEAK,    potatoes 
##         393         373         365         358         358         352 
##    Mushroom       Beef,   Hamburger     Chopped      Potato   mushrooms 
##         346         345         345         343         343         341 
##       Salad          in 
##         341         327
# We identify key terms which are likely misclassified as beef steak-related
words.freq %>% as.data.frame() %>% filter(Var1 == "Halibut")
##      Var1 Freq
## 1 Halibut  223
words.freq %>% as.data.frame() %>% filter(Var1 == "Lamb")
##   Var1 Freq
## 1 Lamb  122
# find incorrectly classified steak dishes (containing terms like "halibut steak" or "lamb steak")
incorrect_steak_dishes <- steak_dishes_unfiltered[
  grep("Halibut|Lamb|Swordfish|Codfish|chicken|ham|veal", 
  steak_dishes_unfiltered$name, ignore.case = TRUE), ]

# filter out non-beef products
steak_dishes <- filter(steak_dishes_unfiltered, name %in% incorrect_steak_dishes$name == FALSE)


write_csv(steak_dishes, "/Users/ericthompson/steak_dishes.csv")

# join all datasets - this can dictate a schema (joining by keys)
df_all <-
  menu_items %>% 
  left_join(dishes, by=c("dish_id"="id")) %>%
  left_join(menu_pages, by=c("menu_page_id"="id")) %>%
  left_join(menu_cleaned, by=c("menu_id"="id")) %>%
  mutate(decade = floor(year(date)/10)*10) %>%
  #ensure valid decade
  filter(decade %in% seq(1850, 2010, by=10))

# NAs should be "UNKNOWN"
df_all$meal[is.na(df_all$meal)] = "UNKNOWN"

We see that around 1900 the frequency of steak items was much higher than in the following decades. Could some of this have been caused by wartime rationing of meats, such as the regulations imposed by the Office of Price Administration in 1942?

We also note that in recent decades (e.g. 1970 through 2010), the steak frequency has been highly volatile. This is likely caused by the lower number of observations (number of menus submitted) during these decades. There is a much larger sample size in the early parts of the 1900’s.

This plot also reveals that steak as a breakfast item reduced significantly beginning in the early parts of the 20th century.

# find steak proportion by decade
df_all_summary_by_decade <- 
  df_all %>% 
  group_by(decade) %>% 
  summarize(count=n())

df_steak <-
  df_all %>% 
  filter(df_all$dish_id %in% steak_dishes$id)

# group by decade
df_steak_summary_by_decade <- 
  df_steak %>% 
  group_by(decade) %>% 
  summarize(count=n())

# find proportion steak
summary_by_decade <- df_all_summary_by_decade %>%
  left_join(df_steak_summary_by_decade, by=c("decade"="decade")) %>%
  mutate(proportion_steak = count.y/count.x) %>%
  rename(menu_item_count = count.x) %>%
  rename(steak_item_count = count.y) %>%
  select(decade, menu_item_count, steak_item_count, proportion_steak) %>%
  arrange(decade)

summary_by_decade[is.na(summary_by_decade)] = 0

# plot steak proportion by decade
summary_by_decade %>%
  ggplot(aes(x=decade, y=proportion_steak)) +
  geom_line()

In addition to decade, we can group and plot steak frequency by meal. This plot does not show noticeable trends between days of the week. Instead, we see that the overall trends demonstrated in the previous two plots are fairly similar when grouped by day of week.

# find steak proportion by decade and meal
df_all_summary_by_meal <- 
  df_all %>% 
  group_by(decade, meal) %>% 
  summarize(count=n())

df_steak <-
  df_all %>% 
  filter(df_all$dish_id %in% steak_dishes$id)

# group by decade and meal
df_steak_summary_by_meal <- 
  df_steak %>% 
  group_by(decade, meal) %>% 
  summarize(count=n())

# find proportion steak
summary_by_meal <- df_all_summary_by_meal %>%
  left_join(df_steak_summary_by_meal, by=c("decade"="decade", "meal"="meal")) %>%
  mutate(proportion_steak = count.y/count.x) %>%
  rename(menu_item_count = count.x) %>%
  rename(steak_item_count = count.y) %>%
  select(decade, meal, menu_item_count, steak_item_count, proportion_steak) %>%
  arrange(decade, meal)

summary_by_meal[is.na(summary_by_meal)] = 0

# plot steak proportion by meal
summary_by_meal %>%
  ggplot(aes(x=decade, y=proportion_steak, group=meal, color=meal)) +
  geom_line()

# find steak proportion by decade and weekday
df_all_summary_by_weekday <- 
  df_all %>% 
  group_by(decade, weekday) %>% 
  summarize(count=n())

df_steak <-
  df_all %>% 
  filter(df_all$dish_id %in% steak_dishes$id)

df_steak_summary_by_weekday <- 
  df_steak %>% 
  group_by(decade, weekday) %>% 
  summarize(count=n())

summary_by_weekday <- df_all_summary_by_weekday %>%
  left_join(df_steak_summary_by_weekday, by=c("decade"="decade", "weekday"="weekday")) %>%
  mutate(proportion_steak = count.y/count.x) %>%
  rename(menu_item_count = count.x) %>%
  rename(steak_item_count = count.y) %>%
  select(decade, weekday, menu_item_count, steak_item_count, proportion_steak) %>%
  arrange(decade, weekday)

summary_by_weekday[is.na(summary_by_weekday)] = 0

# plot steak proportion by weekday
summary_by_weekday %>%
  ggplot(aes(x=decade, y=proportion_steak, group=weekday, color=weekday)) +
  geom_line()