library(tidyverse)
library(janitor)
library(here)
library(dplyr)

# Load the raw King County restaurant inspection export; the path is resolved
# by here().
rawrestinspection <- read.csv(here("restuarant_inspections.csv"))

# Build the working table used by every restaurant chart below:
#   * yearinspection: the first four characters of DATE_INSPECTION, kept as a
#     character string (assumes the date text starts with the year - TODO
#     confirm against the export format).
#   * all column names lower-cased.
# rename_with() replaces the superseded select_all(tolower); both rename every
# column with the supplied function.
restinspect <- rawrestinspection %>%
  mutate(yearinspection = substr(DATE_INSPECTION, 1, 4)) %>%
  rename_with(tolower)

Inspecting King County’s Restaurant Inspection Data

I explored King County’s Restaurant Inspection dataset. King County is the most densely populated county in Washington state. These inspections are made to ensure that food is handled properly from preparation through serving. This dataset contains inspections from 2006 through 2023. I started with general curiosity about exploring this dataset. Because of its size, I then set out to find the biggest culprits of unsatisfactory inspection scores in Seattle in 2023.

To learn more about this dataset please visit.

First Inspection

I wanted to get a feel for this dataset by looking at the range of all inspection scores, across all years. Please note that documentation on this dataset is not the greatest, and upon completion of this graph I could not find out entirely what the score means. It appears that 0 is best and any number above 0 is less than satisfactory, but the score varies with the number and type of violations, which makes it relatively meaningless beyond "higher is worse". The score that's really valuable to compare is held within result inspection, which I chose to examine next. Also, this data was not the best for point graphs, but I wanted to practice using this type.

This graph told me that the higher scores appeared to fall in 2023 so I chose to hone in on that year going forward.

# Fig. 1: every inspection score plotted against inspection year, coloured by
# risk category. Exploratory only - mainly an exercise in geom_point().
ggplot(
  data = restinspect,
  mapping = aes(x = yearinspection, y = score_inspection, color = risk)
) +
  geom_point() +
  # Flip so the years run down the vertical axis.
  coord_flip() +
  labs(
    x = "Year of Inspection",
    y = "Inspection Score",
    title = "All King County's Restaurant's by Inspection Score, 2006-2023",
    caption = "Fig.1.2023 has the worst scores. This was just to use geom_point() and initial data exploration. \nAny score above 0 is less than satisfactory.\n Blank : No Risk Information\nRisk I:    Low Risk - Cold holding - Limited food preparation.\nRisk II:   Medium Risk - No Cook Step Food Preparation.\nRisk III: High Risk - Same day service or complex food preparation."
  )

Honing in on 2023

Next, I chose to examine only 2023 inspections, this time taking a look at the result inspection field. This field had a much more meaningful qualitative rating. It includes many values that were a little perplexing, but the most common and helpful values were: complete, in compliance, incomplete, unsatisfactory, and satisfactory.

Here I see that Seattle by far has the most unsatisfactory scores. This is not surprising however, because Seattle is also very urban and has the highest population in the county. Regardless, I found the huge discrepancy between Seattle and all other cities worth investigating.

# Fig. 2: number of 'Unsatisfactory' inspection results per city in 2023,
# used to spot the outlier city.
restinspect %>%
  filter(
    yearinspection == 2023,
    result_inspection == "Unsatisfactory"
  ) %>%
  ggplot(mapping = aes(x = city)) +
  # geom_bar() counts the rows for each city.
  geom_bar() +
  coord_flip() +
  labs(
    x = "City",
    y = "Number of Unsatisfactory Scores",
    title = "Worst Restaurants in 2023 by King County Cities",
    caption = "Fig. 2. Seattle has the most unsatisfactory scores in 2023."
  )

Taking a closer look at Seattle

Because of this huge discrepancy, I decided to take a look at the top restaurants in Seattle that received the most unsatisfactory scores across the span of the dataset. I had to limit the number of unsatisfactory scores to be greater than 30 because there were too many restaurants to visualize. T-Mobile Park’s scores appear to be consistently worse across the date range of this dataset. Note, this “restaurant” includes the food vendors within T-Mobile Park/Stadium.

# Fig. 3: Seattle's Worst Restaurants Through the Years.
# Counts 'Unsatisfactory' results per restaurant and year in Seattle and keeps
# only restaurant-years with more than 30 of them, so the chart stays readable.
restinspect %>%
  # Filter before grouping so the row filter is evaluated once on the whole
  # table rather than on grouped data.
  filter(city == "Seattle", result_inspection == "Unsatisfactory") %>%
  group_by(name, yearinspection) %>%
  # .groups = "drop" returns an ungrouped table and silences the regrouping
  # message that summarise() emits for multi-column groupings.
  summarise(num_observations = n(), .groups = "drop") %>%
  filter(num_observations > 30) %>%
  ggplot(aes(x = yearinspection, y = num_observations, color = name)) +
  geom_point() +
  labs(
    title = "Seattle's Worst Restaurants Through the Years",
    subtitle = "Greater than 30 'unsatisfactory' scores",
    caption = "Fig. 3. T-Mobile Park's scores appear to be consistently worse \nacross the date range of this dataset.",
    x = "Year Inspected",
    y = "Number of Unsatisfactory Scores"
  )

Narrowing the scope

Next, I chose to do a similar exploration as above, but just within 2023, to see if our unsatisfactory score offenders persisted in the “worst” scored year, 2023, from our first graph. Two things stand out: again T-Mobile Park has the most unsatisfactory scores, and 2023 is the worst scored year for those establishments.

# Embed the tmobile.jpeg image in the rendered report; the file path is
# resolved by here::here().
knitr::include_graphics(here::here("tmobile.jpeg"))

Fig. 4.

# Fig. 4: Seattle's worst restaurants in 2023.
# Counts 'Unsatisfactory' results per restaurant for Seattle inspections in
# 2023 and keeps restaurants with more than 20 of them.
restinspect %>%
  # Filter first, then group: same rows, but evaluated on the ungrouped table.
  # yearinspection is a character column (built with substr()), so compare it
  # with a string.
  filter(
    city == "Seattle",
    yearinspection == "2023",
    result_inspection == "Unsatisfactory"
  ) %>%
  group_by(name) %>%
  summarise(num_observations = n(), .groups = "drop") %>%
  filter(num_observations > 20) %>%
  ggplot(aes(x = name, y = num_observations, fill = num_observations)) +
  geom_col() +
  coord_flip() +
  theme_classic() +
  labs(
    title = "Seattle's Worst Restaurants in 2023",
    subtitle = "Only greater than 20 'unsatisfactory' scores",
    caption = "Fig. 4",
    # x is mapped to the restaurant name, not a year; the previous label
    # ("Year Inspected") was carried over from another chart.
    x = "Restaurant",
    y = "Number of Unsatisfactory Scores"
  )

# Fig. 5: yearly count of 'Unsatisfactory' inspection results recorded for
# T-MOBILE PARK in Seattle.
restinspect %>%
  filter(
    city == "Seattle",
    name == "T-MOBILE PARK",
    result_inspection == "Unsatisfactory"
  ) %>%
  ggplot(mapping = aes(x = yearinspection)) +
  # One bar per year; bar height is the number of matching rows.
  geom_bar() +
  theme_classic() +
  labs(
    x = "Year Inspected",
    y = "Number of Unsatisfactory Scores",
    title = "T-Mobile Park's Unsatisfactory Inspection Score Through the Years",
    caption = "Fig. 5. King County's Lowest Scored Establishment, T-Mobile Park, with the lowest scored year in 2023."
  )

As stated in my reflection, this week was particularly frustrating for me in finding the right dataset to be able to explore all the items/graph functionality covered in this lesson. I spent most of the lesson on the dataset below but pivoted to the restaurant inspection dataset, as there were more variables to explore and I felt it fit the objectives of this lesson a bit more. I realize I did not meet the requirement that I explore two variables in one of the plots, but I just did not find the right data in time for this lesson to be able to do this. In an attempt to get as many points as possible, please see the graphs I created below for my first dataset. For the sake of my time management I will not write up and compile a detailed story here.

Exploring the Food Research Atlas

This section briefly examines data and some estimates derived from the Economic Research Service U.S Department of Agriculture’s Food Access Research Atlas 2019. Limited access to supermarkets, supercenters, grocery stores, or other sources of healthy and affordable food may make it harder for some people to eat a healthy diet in this country. There are many ways to measure food store access for individuals and for neighborhoods, and many ways to define which areas are low-income and low access—neighborhoods that lack healthy food sources. Most measures and definitions consider at least some of the following indicators of access: - Accessibility to sources of healthy food, as measured by distance to a store or by the number of stores in an area; - Individual-level resources that may affect accessibility, such as family income or vehicle availability; - Neighborhood-level indicators of resources, such as the average income of the neighborhood and the availability of public transportation. For more information about the data and methods for this atlas, please visit. Estimates in the Atlas for 2019 are based on a 2019 list of supermarkets, the 2010 Decennial Census, and the 2014-18 American Community Survey (ACS).

# Load the 2019 Food Access Research Atlas CSV.
# NOTE(review): this is an absolute, machine-specific path, so the script will
# not run elsewhere as-is. The restaurant data is read through here(); consider
# placing this file in the project and doing the same.
rawatlas <- read.csv("C://Users//DMesler//OneDrive - The Pennsylvania State University//spring1_486//lesson9//FoodAccessResearchAtlasData2019.csv")

## paring down the raw atlas
# Drop rows containing NA, lower-case every column name, and keep only the
# tract identifiers, population and low-access / low-income columns used by the
# charts. na.omit() runs before select(), so a row is dropped if ANY column of
# the raw file is NA, not just the columns kept here.
# rename_with() replaces the superseded select_all(tolower).
cleanatlas <- rawatlas %>%
  na.omit() %>%
  rename_with(tolower) %>%
  select(
    censustract, state, county, urban, pop2010,
    lapop1_10, lapop05_10, lapop1_20,
    lalowi1_10, lalowi05_10, lalowi1_20,
    lilatracts_1and10, lilatracts_halfand10, lilatracts_1and20,
    lilatracts_vehicle
  )
  
##normalized data
# Aggregate the tract-level atlas to one row per state: the sum of each tract
# flag column, the summed 2010 population, and the number of tracts in the
# state (used as the denominator when normalizing).
summarized_data <- rawatlas %>%
  group_by(State) %>%
  summarise(
    sum_urban = sum(Urban),
    sum_LILA_1and10 = sum(LILATracts_1And10),
    sum_LILA_1and20 = sum(LILATracts_1And20),
    sum_LA1and10 = sum(LA1and10),
    sum_LA1and20 = sum(LA1and20),
    sum_LA_vehicle_20 = sum(LATractsVehicle_20),
    sum_HUNVflag = sum(HUNVFlag),
    sum_state_POP2010 = sum(Pop2010),
    # Row count per state. The original wrapped this in a stray assignment,
    # `(CensusTract = n())`, which evaluated to the same n().
    numoftractsperstate = n()
  )
# Use the state summaries to create normalized state estimates: each flag total
# divided by the number of tracts in the state.
# The columns are named percent_* and are plotted on axes labelled "Percent",
# so the ratios are scaled by 100 to be true percentages (0-100) rather than
# fractions (0-1).
normalized_atlas <- summarized_data %>%
  mutate(
    percent_tracts_urban = 100 * sum_urban / numoftractsperstate,
    percent_tracts_LILA_1and10 = 100 * sum_LILA_1and10 / numoftractsperstate,
    percent_tracts_LILA_1and20 = 100 * sum_LILA_1and20 / numoftractsperstate,
    percent_tracts_LA1and10 = 100 * sum_LA1and10 / numoftractsperstate,
    percent_tracts_LA1and20 = 100 * sum_LA1and20 / numoftractsperstate,
    percent_tracts_LA_vehicle_20 = 100 * sum_LA_vehicle_20 / numoftractsperstate,
    percent_tracts_HUNVFLAG = 100 * sum_HUNVflag / numoftractsperstate
  )

Note: The atlas provides food access data for populations within census tracts, though for this examination, the variables presented are estimates. The variables presented in this examination are tract counts (flags) that are normalized by the total number of census tracts for each state (percent of the state’s tracts that are x).

In the Food Access Research Atlas, low access to healthy food is defined as being far from a supermarket, supercenter, or large grocery store. A census tract is considered to have low access if a significant number or share of individuals in the tract is far from a supermarket. “low access” or “LA” displays this. Low-income census tracts where a significant number (at least 500 people) or share (at least 33 percent) of the population is greater than 1 mile from the nearest supermarket, supercenter, or large grocery store for an urban area or greater than 10 miles for a rural area. This measure shows that an estimated 18.8 million people, or 6.1 percent of the U.S. population, live in low-income and low access tracts and are more than 1 mile or 10 miles from a supermarket. “low access and low income” or “LILA” displays this.

# Atlas Fig. 1: column chart of percent_tracts_LILA_1and10 per state
# (low income and low access tracts, 1 mile urban / 10 miles rural).
ggplot(
  data = normalized_atlas,
  mapping = aes(x = State, y = percent_tracts_LILA_1and10)
) +
  geom_col() +
  # Flip so the state names run down the vertical axis.
  coord_flip() +
  labs(
    x = "State",
    y = "Percent of tracts LILA 1 and 10",
    title = "Percent of Tracts Considered Low income and Low access (LILA) \nat 1 mile for urban tracts and 10 miles for rural tracts (1 and 10) ",
    subtitle = "Percent of tracts generated based on number of tracts in each state",
    caption = "Fig. 1."
  )

I chose to compare the percentage of tracts for each state between Low Access (LA) at 1 and 10 vs low income + low access (LILA) at 1 and 10.

# Atlas Fig. 2: per-state LA (blue) and LILA (green) tract shares at 1 and 10
# miles, drawn as two column layers on the same axis.
# NOTE(review): position = "dodge" is kept as written, but it does not appear
# to place the two layers side by side; the green layer is drawn over the blue
# one - confirm this overlay is the intended look.
ggplot(data = normalized_atlas, mapping = aes(x = State)) +
  geom_col(
    mapping = aes(y = percent_tracts_LA1and10),
    position = "dodge",
    fill = "blue"
  ) +
  geom_col(
    mapping = aes(y = percent_tracts_LILA_1and10),
    position = "dodge",
    fill = "green"
  ) +
  coord_flip() +
  labs(
    x = "State",
    y = "Percent of tracts LILA 1 and 10, green \n Percent of tracts LA 1 and 10, blue",
    title = "Comparison of Percentage of LA and LILA Tracts Per State",
    subtitle = "Both at 1 mile for urban tracts and 10 miles for rural tracts (1 and 10). \nPercent of tracts generated based on number of tracts in each state",
    caption = "Fig. 2."
  )

# Atlas Fig. 3: low-access population (purple) drawn over total 2010
# population (gray) for each state.
# cleanatlas has one row per census tract. Plotting those rows directly with
# position = "dodge" draws every tract's bar on top of the others within a
# state, so the visible bar is only the largest tract. Summing to one row per
# state first makes the bars show the state totals the labels describe.
cleanatlas %>%
  group_by(state) %>%
  summarise(
    # as.numeric() avoids integer overflow when summing, and converts
    # lapop1_10 in case it was read as text.
    pop2010 = sum(as.numeric(pop2010), na.rm = TRUE),
    lapop1_10 = sum(as.numeric(lapop1_10), na.rm = TRUE),
    .groups = "drop"
  ) %>%
  ggplot(aes(x = state)) +
  # Total population first, low-access population drawn on top of it.
  geom_col(aes(y = pop2010), fill = "gray") +
  geom_col(aes(y = lapop1_10), fill = "purple") +
  coord_flip() +
  labs(
    title = "Low Access Populations Compared with State Population",
    subtitle = "At 1 mile for urban tracts and 10 miles for rural tracts (1 and 10)",
    caption = "Fig. 3.",
    x = "State",
    y = "Low Access Population, purple \n State Population, gray"
  )

# Atlas Fig. 4: closer look at Washington - low-access population (green)
# drawn over total 2010 population (gray) for each county.
# cleanatlas is tract-level, so the rows are summed per county before
# plotting. The original group_by() fed straight into ggplot() had no effect,
# and the tract bars were overplotted within each county.
cleanatlas %>%
  filter(state == "Washington") %>%
  group_by(county) %>%
  summarise(
    pop2010 = sum(as.numeric(pop2010), na.rm = TRUE),
    lapop1_10 = sum(as.numeric(lapop1_10), na.rm = TRUE),
    .groups = "drop"
  ) %>%
  ggplot(aes(y = county)) +
  # Total population first, low-access population drawn on top of it.
  geom_col(aes(x = pop2010), fill = "gray") +
  geom_col(aes(x = lapop1_10), fill = "green") +
  theme_classic() +
  labs(
    title = "Washington State County Populations Measured as Low Access \nat 1 and 10 miles",
    # The previous subtitle ("Only greater than 30 'unsatisfactory' scores")
    # was copied from the restaurant charts and did not apply here.
    subtitle = "At 1 mile for urban tracts and 10 miles for rural tracts (1 and 10)",
    caption = "Fig. 4.",
    x = "County Population considered Low Access at 1 and 10, green \n County Total Population, gray",
    y = "Washington State County"
  )

Provided by the Economic Research Service (ERS), U.S Department of Agriculture (USDA).

Lesson 3

D.Mesler

2024-03-12