library(tidyverse)
library(knitr)
library(dplyr)
library(ggplot2)
In the following code block, I load the csv of school meals in NYC and glimpse the data frame. I am interested in comparing student participation in school meals by borough and by type of meal.
school_food_url <- "https://raw.githubusercontent.com/mollysiebecker/DATA-607/main/2019_School_Food_Report_LL215.csv"
school_food <- read.csv(url(school_food_url))
head(school_food)
## Borough District Building.Code DBN School.Name
## 1 M 1 M015 01M015 P.S. 015 Roberto Clemente
## 2 M 1 M019 01M019 P.S. 019 Asher Levy
## 3 M 1 M020 01M020 P.S. 020 Anna Silver
## 4 M 1 M025 01M509 Marta Valle High School
## 5 M 1 M034 01M034 P.S. 034 Franklin D. Roosevelt
## 6 M 1 M060 01M450 East Side Community School
## SchoolFood.Code Breakfast.Before.Bell.ADP.............FY18
## 1 1015 92
## 2 1019 84
## 3 1020 198
## 4 1025 15
## 5 1034 111
## 6 1060 38
## Breakfast.Before.Bell.ADP.............FY19
## 1 75
## 2 77
## 3 183
## 4 13
## 5 93
## 6 42
## Breakfast.In.Classroom.ADP..where.available...see..Key..tab............FY18
## 1 NA
## 2 NA
## 3 NA
## 4 NA
## 5 NA
## 6 NA
## Breakfast.In.Classroom.ADP..where.available...see..Key..tab............FY19
## 1 NA
## 2 NA
## 3 NA
## 4 NA
## 5 NA
## 6 NA
## Breakfast.Before.Bell..Y...breakfast.before.bell.in.Any.Part.of.DBN...........FY18
## 1 true
## 2 true
## 3 true
## 4 true
## 5 true
## 6 true
## Breakfast.Before.Bell..Y...breakfast.before.bell.in.Any.Part.of.DBN............FY19
## 1 true
## 2 true
## 3 true
## 4 true
## 5 true
## 6 true
## Breakfast.In.Classroom.Offered..Y...BIC.in.Any.Part.of.DBN.......................FY18
## 1
## 2
## 3 true
## 4
## 5
## 6
## Breakfast.In.Classroom.Offered..Y...BIC.in.Any.Part.of.DBN.......................FY19
## 1
## 2 true
## 3 true
## 4
## 5
## 6
## Grab...Go.Offered..Y...G.G.in.Any.Part.of.DBN........................FY18
## 1
## 2
## 3 true
## 4
## 5
## 6
## Grab...Go.Offered..Y...G.G.in.Any.Part.of.DBN........................FY19
## 1
## 2 true
## 3 true
## 4
## 5
## 6
## Lunches.ADP............FY18 Lunches.ADP............FY19
## 1 145 133
## 2 179 175
## 3 314 307
## 4 43 34
## 5 226 206
## 6 90 98
## Salad.bar...Y...Salad.Bar.offered.in.building..................FY18
## 1 true
## 2 true
## 3 true
## 4 true
## 5 true
## 6 true
## Salad.bar...Y...Salad.Bar.offered.in.building..................FY19
## 1 true
## 2 true
## 3 true
## 4 true
## 5 true
## 6 true
## Snacks.ADP............FY18 Snacks.ADP............FY19
## 1 NA 60
## 2 NA NA
## 3 155 140
## 4 71 71
## 5 NA NA
## 6 124 127
## Suppers.ADP............FY18 Suppers.ADP............FY19
## 1 35 45
## 2 NA NA
## 3 179 139
## 4 67 58
## 5 66 26
## 6 37 23
In the following code block, I drop columns that only indicate whether a given type of meal is offered and rename the columns.
school_food <- school_food %>%
select(-c(2:6, 11:16, 19:20)) %>%
rename("borough" = "Borough", "breakfast_bb_fy18" = "Breakfast.Before.Bell.ADP.............FY18", "breakfast_bb_fy19" = "Breakfast.Before.Bell.ADP.............FY19", "breakfast_class_fy18" = "Breakfast.In.Classroom.ADP..where.available...see..Key..tab............FY18", "breakfast_class_fy19" = "Breakfast.In.Classroom.ADP..where.available...see..Key..tab............FY19")
I’m not interested in the difference between breakfast before the bell and breakfast in the classroom, so I create a new variable that sums the two types of breakfast.
school_food$breakfast_bb_fy18[is.na(school_food$breakfast_bb_fy18)] <- 0
school_food$breakfast_bb_fy19[is.na(school_food$breakfast_bb_fy19)] <- 0
school_food$breakfast_class_fy18[is.na(school_food$breakfast_class_fy18)] <- 0
school_food$breakfast_class_fy19[is.na(school_food$breakfast_class_fy19)] <- 0
school_food <- school_food %>%
mutate(breakfast_fy18 = breakfast_bb_fy18+breakfast_class_fy18,
breakfast_fy19 = breakfast_bb_fy19+breakfast_class_fy19,
`breakfast_bb_fy18` = NULL,
`breakfast_class_fy18` = NULL,
`breakfast_bb_fy19` = NULL,
`breakfast_class_fy19` = NULL)
Finally, I pivot longer, and extract strings for the fiscal year and type of meal, then replace values for greater clarity. I display the first 40 rows of the resulting tidy data set using kable.
school_food <- school_food %>%
pivot_longer(cols = 2:9, names_to = "meal", values_to = "avg_daily_pop")
school_food <- school_food %>%
mutate(
fiscal_year = str_extract(school_food$meal, "\\d{2}"),
meal = str_extract(school_food$meal, "[:alpha:]+(?=\\.)|[:alpha:]+(?=\\_)")) %>%
select(borough, meal, fiscal_year, avg_daily_pop)
school_food$meal[school_food$meal == "Lunches"] <- "lunch"
school_food$meal[school_food$meal == "Snacks"] <- "snack"
school_food$meal[school_food$meal == "Suppers"] <- "supper"
school_food$fiscal_year[school_food$fiscal_year == "18"] <- "2018"
school_food$fiscal_year[school_food$fiscal_year == "19"] <- "2019"
school_food$borough[school_food$borough == "M"] <- "Manhattan"
school_food$borough[school_food$borough == "Q"] <- "Queens"
school_food$borough[school_food$borough == "X"] <- "Bronx"
school_food$borough[school_food$borough == "K"] <- "Brooklyn"
school_food$borough[school_food$borough == "R"] <- "Staten Island"
school_food_subset <- school_food[1:40, ]
kable(school_food_subset, format = "pipe", col.names = c("Borough", "Meal", "Fiscal Year", "Average Daily Population"), caption = "Student Meal Participation by Borough, Meal, and Year", align = "c")
Borough | Meal | Fiscal Year | Average Daily Population |
---|---|---|---|
Manhattan | lunch | 2018 | 145 |
Manhattan | lunch | 2019 | 133 |
Manhattan | snack | 2018 | NA |
Manhattan | snack | 2019 | 60 |
Manhattan | supper | 2018 | 35 |
Manhattan | supper | 2019 | 45 |
Manhattan | breakfast | 2018 | 92 |
Manhattan | breakfast | 2019 | 75 |
Manhattan | lunch | 2018 | 179 |
Manhattan | lunch | 2019 | 175 |
Manhattan | snack | 2018 | NA |
Manhattan | snack | 2019 | NA |
Manhattan | supper | 2018 | NA |
Manhattan | supper | 2019 | NA |
Manhattan | breakfast | 2018 | 84 |
Manhattan | breakfast | 2019 | 77 |
Manhattan | lunch | 2018 | 314 |
Manhattan | lunch | 2019 | 307 |
Manhattan | snack | 2018 | 155 |
Manhattan | snack | 2019 | 140 |
Manhattan | supper | 2018 | 179 |
Manhattan | supper | 2019 | 139 |
Manhattan | breakfast | 2018 | 198 |
Manhattan | breakfast | 2019 | 183 |
Manhattan | lunch | 2018 | 43 |
Manhattan | lunch | 2019 | 34 |
Manhattan | snack | 2018 | 71 |
Manhattan | snack | 2019 | 71 |
Manhattan | supper | 2018 | 67 |
Manhattan | supper | 2019 | 58 |
Manhattan | breakfast | 2018 | 15 |
Manhattan | breakfast | 2019 | 13 |
Manhattan | lunch | 2018 | 226 |
Manhattan | lunch | 2019 | 206 |
Manhattan | snack | 2018 | NA |
Manhattan | snack | 2019 | NA |
Manhattan | supper | 2018 | 66 |
Manhattan | supper | 2019 | 26 |
Manhattan | breakfast | 2018 | 111 |
Manhattan | breakfast | 2019 | 93 |
Below, I create two new data frames, one that averages the average daily population in a given school (in schools where the meal is offered) for each meal by borough (including both 2018 and 2019,) and another that calculates the total average daily population for each borough. Then I specify the order of the meals for ease in visualization and display the results of each data frame in a bar graph.
school_food_summary <- school_food %>%
drop_na() %>%
group_by(borough, meal) %>%
summarize(borough_wide_adp = mean(avg_daily_pop), .groups = "drop")
school_food_borough_total <- school_food_summary %>%
drop_na() %>%
group_by(borough) %>%
summarize(borough_wide_adp = sum(borough_wide_adp), .groups = "drop")
ggplot(school_food_borough_total, aes(x = borough, y = borough_wide_adp)) +
geom_bar(stat = "identity", position = "dodge") +
labs(title = "School Meals: Average Daily Population by Borough",
x = "Borough", y = "Average Daily Population") +
scale_fill_brewer(palette = "Set3")
meal_order <- c("breakfast", "lunch", "snack", "supper")
school_food_summary$meal <- factor(school_food_summary$meal, levels = meal_order)
ggplot(school_food_summary, aes(x = borough, y = borough_wide_adp, fill = meal)) +
geom_bar(stat = "identity", position = "dodge") +
labs(title = "School Meals: Average Daily Population by Borough and Meal",
x = "Borough", y = "Average Daily Population") +
scale_fill_brewer(palette = "Set3")
Below, I create a similar data frame and bar graph, grouping by year, to check to see if there was a significant difference in student meal participation in 2018 and 2019.
school_food_summary_year <- school_food %>%
drop_na() %>%
group_by(fiscal_year, meal) %>%
summarize(yearly_adp = mean(avg_daily_pop), .groups = "drop")
ggplot(school_food_summary_year, aes(x = fiscal_year, y = yearly_adp, fill = meal)) +
geom_bar(stat = "identity", position = "dodge") +
labs(title = "School Meals: Average Daily Population by Year",
x = "Fiscal Year", y = "Average Daily Population") +
scale_fill_brewer(palette = "Set3")
Student participation in school meals, as measured by average daily population, was similar across meal types from 2018 to 2019, with only a slight decrease. However, it is sufficiently similar to justify grouping both years together when comparing student participation by borough. In each borough, lunch is the meal with the greatest student participation, followed by breakfast in all boroughs except Manhattan, in which snack slightly exceeds breakfast. Queens has the greatest student participation, as measured by average number of students, for all meals overall, as well as for the two most popular meals. What this data cannot measure is how many students participate relative to the enrollment at each school, which would provide a greater basis for comparison across boroughs. Further analysis should join this data with data that shows the enrollment at each school in order to calculate this proportion.