This report provides reproducible code to analyze the proportion of NYPL Restaurant Menu items which contain references to beef steak products. We plot the proportion of menu items that involve beef steak over time, grouping by the decade, by the day of the week and by meal (breakfast versus lunch versus dinner). Each of these is a separate element of our data cleaning workflow.
library(tidyverse)
library(lubridate)
library(ngram)
menu_items <- read_csv("/Users/ericthompson/Downloads/NYPL-Dataset/MenuItem.csv")
dishes <- read_csv("/Users/ericthompson/Downloads/NYPL-Dataset/Dish.csv")
menu_pages <- read_csv("/Users/ericthompson/Downloads/NYPL-Dataset/MenuPage.csv")
menu <- read_csv("/Users/ericthompson/Downloads/NYPL-Dataset/Menu.csv")
menu_cleaned <- read_csv("/Users/ericthompson/Downloads/NYPL-Dataset/Menu-cleaned.csv")
steak_dishes_unfiltered <- dishes[
grep("Steak|Porterhouse|Sirloin|Filet Mignon|T-bone|tri-tip|skirt|flank|
beef|hanger|flat iron|rump|entrecote|brisket",
dishes$name, ignore.case = TRUE), ]
# some non-beef products remain in steak_dishes_unfiltered, such as halibut or lamb
words.freq <- table(unlist(str_split(str_c(steak_dishes_unfiltered$name, sep=" ", collapse = " "), " ")))
# top 50 most common terms in our hes_unfiltered
words.freq %>% sort(decreasing = TRUE) %>% head(50)
##
## Steak with Sirloin Steak, and of
## 3950 3675 2592 2130 1981 1559
## steak Broiled a Fried Potatoes French
## 1391 1386 1111 1111 974 941
## or Filet la Sauce mit Tenderloin
## 924 904 820 807 761 713
## - steak, Beef STEAK sirloin for
## 649 644 638 625 600 599
## Small Porterhouse Mignon Onions Extra Grilled
## 593 478 473 464 457 435
## und Butter Fresh Mushrooms sauce Beefsteak
## 429 418 414 409 400 399
## SIRLOIN Potatoes, Prime STEAK, potatoes
## 393 373 365 358 358 352
## Mushroom Beef, Hamburger Chopped Potato mushrooms
## 346 345 345 343 343 341
## Salad in
## 341 327
# We identify key terms which are likely misclassified as beef steak-related
words.freq %>% as.data.frame() %>% filter(Var1 == "Halibut")
## Var1 Freq
## 1 Halibut 223
words.freq %>% as.data.frame() %>% filter(Var1 == "Lamb")
## Var1 Freq
## 1 Lamb 122
# find incorrectly classified steak dishes (containing terms like "halibut steak" or "lamb steak")
incorrect_steak_dishes <- steak_dishes_unfiltered[
grep("Halibut|Lamb|Swordfish|Codfish|chicken|ham|veal",
steak_dishes_unfiltered$name, ignore.case = TRUE), ]
# filter out non-beef products
steak_dishes <- filter(steak_dishes_unfiltered, name %in% incorrect_steak_dishes$name == FALSE)
write_csv(steak_dishes, "/Users/ericthompson/steak_dishes.csv")
# join all datasets - this can dictate a schema (joining by keys)
df_all <-
menu_items %>%
left_join(dishes, by=c("dish_id"="id")) %>%
left_join(menu_pages, by=c("menu_page_id"="id")) %>%
left_join(menu_cleaned, by=c("menu_id"="id")) %>%
mutate(decade = floor(year(date)/10)*10) %>%
#ensure valid decade
filter(decade %in% seq(1850, 2010, by=10))
# NAs should be "UNKNOWN"
df_all$meal[is.na(df_all$meal)] = "UNKNOWN"
We see that around 1900 the frequency of steak items was much higher than in the following decades. Could some of this have been caused by wartime rationing of meats, such as the regulations imposed by the Office of Price Administration in 1942?
We also note that in recent decades (e.g. 1970 through 2010), the steak frequency has been highly volatile. This is likely caused by the lower number of observations (number of menus submitted) during these decades. There is a much larger sample size in the early parts of the 1900’s.
This plot also reveals that steak as a breakfast item reduced significantly beginning in the early parts of the 20th century.
# find steak proportion by decade
df_all_summary_by_decade <-
df_all %>%
group_by(decade) %>%
summarize(count=n())
df_steak <-
df_all %>%
filter(df_all$dish_id %in% steak_dishes$id)
# group by decade
df_steak_summary_by_decade <-
df_steak %>%
group_by(decade) %>%
summarize(count=n())
# find proportion steak
summary_by_decade <- df_all_summary_by_decade %>%
left_join(df_steak_summary_by_decade, by=c("decade"="decade")) %>%
mutate(proportion_steak = count.y/count.x) %>%
rename(menu_item_count = count.x) %>%
rename(steak_item_count = count.y) %>%
select(decade, menu_item_count, steak_item_count, proportion_steak) %>%
arrange(decade)
summary_by_decade[is.na(summary_by_decade)] = 0
# plot steak proportion by decade
summary_by_decade %>%
ggplot(aes(x=decade, y=proportion_steak)) +
geom_line()
In addition to decade, we can group and plot steak frequency by meal. This plot does not show noticeable trends between days of the week. Instead, we see that the overall trends demonstrated in the previous two plots are fairly similar when grouped by day of week.
# find steak proportion by decade and meal
df_all_summary_by_meal <-
df_all %>%
group_by(decade, meal) %>%
summarize(count=n())
df_steak <-
df_all %>%
filter(df_all$dish_id %in% steak_dishes$id)
# group by decade and meal
df_steak_summary_by_meal <-
df_steak %>%
group_by(decade, meal) %>%
summarize(count=n())
# find proportion steak
summary_by_meal <- df_all_summary_by_meal %>%
left_join(df_steak_summary_by_meal, by=c("decade"="decade", "meal"="meal")) %>%
mutate(proportion_steak = count.y/count.x) %>%
rename(menu_item_count = count.x) %>%
rename(steak_item_count = count.y) %>%
select(decade, meal, menu_item_count, steak_item_count, proportion_steak) %>%
arrange(decade, meal)
summary_by_meal[is.na(summary_by_meal)] = 0
# plot steak proportion by meal
summary_by_meal %>%
ggplot(aes(x=decade, y=proportion_steak, group=meal, color=meal)) +
geom_line()
# find steak proportion by decade and weekday
df_all_summary_by_weekday <-
df_all %>%
group_by(decade, weekday) %>%
summarize(count=n())
df_steak <-
df_all %>%
filter(df_all$dish_id %in% steak_dishes$id)
df_steak_summary_by_weekday <-
df_steak %>%
group_by(decade, weekday) %>%
summarize(count=n())
summary_by_weekday <- df_all_summary_by_weekday %>%
left_join(df_steak_summary_by_weekday, by=c("decade"="decade", "weekday"="weekday")) %>%
mutate(proportion_steak = count.y/count.x) %>%
rename(menu_item_count = count.x) %>%
rename(steak_item_count = count.y) %>%
select(decade, weekday, menu_item_count, steak_item_count, proportion_steak) %>%
arrange(decade, weekday)
summary_by_weekday[is.na(summary_by_weekday)] = 0
# plot steak proportion by weekday
summary_by_weekday %>%
ggplot(aes(x=decade, y=proportion_steak, group=weekday, color=weekday)) +
geom_line()