Data Analytics UT435

#install.packages("arrow")
#install.packages("tidyverse")
#install.packages("jsonlite")
#install.packages("leaflet")
#install.packages("gt")
#install.packages("tigris")

library(arrow)

## 
## Attaching package: 'arrow'

## The following object is masked from 'package:utils':
## 
##     timestamp

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidyr)
library(jsonlite)
library(purrr)

## 
## Attaching package: 'purrr'

## The following object is masked from 'package:jsonlite':
## 
##     flatten

# Generates a table using the tags of POIs in DC, displaying quantities of each type of POI. Many POIs have multiple tags. 

pois <- open_dataset("DCData")
tag_frequency <- pois %>%
  select(CATEGORY_TAGS) %>%
  collect() %>%
  filter(!is.na(CATEGORY_TAGS), CATEGORY_TAGS != "[]", CATEGORY_TAGS != "") %>% #filter empty tags
  mutate(tags = map(CATEGORY_TAGS, ~fromJSON(.x))) %>%
  unnest(tags) %>%
  count(tags, name = "poi count dc", sort = TRUE) #show number of pois in order of most to least
print(tag_frequency)

## # A tibble: 881 × 2
##    tags              `poi count dc`
##    <chr>                      <int>
##  1 Buses                       3777
##  2 Bus Station                 3775
##  3 Bar or Pub                  1783
##  4 Bakery                      1549
##  5 Cocktail Lounge             1416
##  6 General Dentistry           1367
##  7 Hygiene                     1341
##  8 Cafe                        1338
##  9 American Food               1313
## 10 Deli                        1288
## # ℹ 871 more rows

# Focused only on the grocery tags. Before making a comparison with another POI, I wanted to generate a map of just grociery stores to test the ability of leaflet. 
library(leaflet)

dc_groceries <- pois %>%
  filter(
    grepl("grocery", CATEGORY_TAGS, ignore.case = TRUE) |
    grepl("grocery", SUB_CATEGORY, ignore.case = TRUE)
  ) %>%
  select(LOCATION_NAME, LATITUDE, LONGITUDE, SUB_CATEGORY, STREET_ADDRESS) %>%
  collect()

leaflet(dc_groceries) %>%
  addProviderTiles(providers$CartoDB.Positron) %>% 
  addCircleMarkers(
    lng = ~LONGITUDE, 
    lat = ~LATITUDE, 
    popup = ~paste0("<b>", LOCATION_NAME, "</b><br>", STREET_ADDRESS, "<br>", SUB_CATEGORY),
    radius = 5,
    color = "forestgreen",
    stroke = FALSE, 
    fillOpacity = 0.7,
    clusterOptions = markerClusterOptions() #group markers when zoomed out
  )

# Next, I wanted to make a comparison between grocery stores and metros. I extracted POI metro locations from the same dataset. 
metro_data <- pois %>%
  filter(grepl("Metro Station", CATEGORY_TAGS, ignore.case = TRUE)) %>%
  collect() %>%
  mutate(tags = map(CATEGORY_TAGS, ~fromJSON(.x))) %>%
  unnest(tags) %>%
  filter(tags == "Metro Station") %>%
  mutate(LATITUDE = as.numeric(LATITUDE), 
         LONGITUDE = as.numeric(LONGITUDE)) %>%
  filter(!is.na(LATITUDE))

print(paste("Found", nrow(metro_data), "Metro Stations."))

## [1] "Found 70 Metro Stations."

# Many metro stations needed to be filtered out of this project, including the DC streetcar, which no longer exists.  
library(sf)

## Linking to GEOS 3.12.1, GDAL 3.8.4, PROJ 9.4.0; sf_use_s2() is TRUE

metro_sf <- metro_data %>%
  # exclude anything containing "Streetcar"
  filter(!grepl("Streetcar", LOCATION_NAME, ignore.case = TRUE)) %>%
  st_as_sf(coords = c("LONGITUDE", "LATITUDE"), crs = 4326) %>%
  st_transform(3857) 

# lots of duplicates showed up when using category tags, so this filted out duplicates
clusters <- st_is_within_distance(metro_sf, dist = 100)
unique_clusters <- unique(clusters)
metro_deduped_sf <- metro_sf[sapply(unique_clusters, function(x) x[1]), ]

metro_final <- metro_deduped_sf %>%
  st_transform(4326) %>%
  mutate(
    LONGITUDE = st_coordinates(.)[,1],
    LATITUDE = st_coordinates(.)[,2]
  ) %>%
  st_drop_geometry()

print(paste("Original rows (including streetcars and duplicates):", nrow(metro_data)))

## [1] "Original rows (including streetcars and duplicates): 70"

print(paste("True Metro Count", nrow(metro_final)))

## [1] "True Metro Count 47"

# Map of both dc metro stations and grocery stores
leaflet() %>%
  addProviderTiles(providers$CartoDB.Positron) %>%
  addCircleMarkers(
    data = metro_final,
    lng = ~LONGITUDE, lat = ~LATITUDE,
    radius = 6,           # larger than groceries
    color = "black",      
    weight = 2,
    fillColor = "#005596", # classic metro blue
    fillOpacity = 0.9,
    group = "Metro Stations",
    popup = ~paste0("<b>Metro Hub: </b>", LOCATION_NAME)
  ) %>%
  
  addCircleMarkers(
    data = dc_groceries,
    lng = ~LONGITUDE, lat = ~LATITUDE,
    radius = 3,
    color = "forestgreen",
    stroke = FALSE,
    fillOpacity = 0.8,
    group = "Groceries",
    popup = ~paste0("<b>Grocery: </b>", LOCATION_NAME)
  ) %>%
  
  addLayersControl(
    overlayGroups = c("Metro Stations", "Groceries"),
    options = layersControlOptions(collapsed = FALSE)
  )

# Spatial map, connecting each grocery store to their closest metro station. Sorted in two colored categories: walkable, and transit isolated

grocery_sf <- st_as_sf(dc_groceries, coords = c("LONGITUDE", "LATITUDE"), crs = 4326)
metro_sf <- st_as_sf(metro_final, coords = c("LONGITUDE", "LATITUDE"), crs = 4326)
nearest_metro_idx <- st_nearest_feature(grocery_sf, metro_sf)
spider_lines <- lapply(1:nrow(grocery_sf), function(i) {
  st_linestring(rbind(
    st_coordinates(grocery_sf[i, ]),
    st_coordinates(metro_sf[nearest_metro_idx[i], ])
  ))
})
spider_sf <- st_sf(
  geometry = st_sfc(spider_lines, crs = 4326),
  store_name = grocery_sf$LOCATION_NAME,
  dist_m = as.numeric(st_distance(grocery_sf, metro_sf[nearest_metro_idx,], by_element = TRUE))
)

#map
leaflet() %>%
  addProviderTiles(providers$CartoDB.Positron) %>%
  addPolylines(
    data = spider_sf,
    # blue if < 800m, red if > 800m
    color = ~ifelse(dist_m > 800, "red", "orange"), 
    
# As a colorblind person, I tried to use the most accessible color combination for me. This map can get a little hard to read because of clustering of lines.
    weight = ~ifelse(dist_m > 800, 2, 2),
    opacity = 0.7,
    label = ~paste0(store_name, ": ", round(dist_m), "m to Metro")
  ) %>%
  addCircleMarkers(
    data = metro_sf,
    color = "black",
    fillColor = "#005596", # metro blue
    fillOpacity = 5,
    radius = 4,
    label = ~LOCATION_NAME 
  ) %>%
  
  # legend
  addLegend(
    position = "bottomright",
    colors = c("orange", "red"),
    labels = c("Walkable (< 800m)", "Transit Isolated (> 800m)"),
    title = "Grocery to Metro Distance"
  )

# Horizonal bar graph displaying the top 10 metro stations in DC with 5-minute access to grocery stores
library(stringr)
library(ggplot2)

top_10_stations <- metro_final %>%
  st_as_sf(coords = c("LONGITUDE", "LATITUDE"), crs = 4326) %>%
  mutate(grocery_count_5min = st_is_within_distance(., grocery_sf, dist = 400) %>% 
           lengths()) %>%
  arrange(desc(grocery_count_5min)) %>%
  head(10)

top_10_plot_data <- top_10_stations %>%
  # removes the "METRO STATION" suffix
  mutate(
    NAME_SHORT = gsub("Washington Metropolitan Area Transit Authority-", "", LOCATION_NAME),
    NAME_SHORT = gsub("(?i) METRO STATION", "", NAME_SHORT), 
    
    # fixes capitalization
    NAME_SHORT = str_to_title(NAME_SHORT),
    NAME_SHORT = gsub("Noma", "NoMa", NAME_SHORT) 
  )

# creates bar graph
ggplot(top_10_plot_data, aes(x = reorder(NAME_SHORT, grocery_count_5min), y = grocery_count_5min)) +
  geom_bar(stat = "identity", fill = "#005596", width = 0.7) +  #metro blue
  coord_flip() + 
  labs(
    title = "Top 10 DC Metro Stations by Food Access",
    subtitle = "Number of grocery stores within a 5-minute walk (400m)",
    x = NULL, 
    y = "Count of Grocery Stores",
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 14, color = "#222222"),
    plot.subtitle = element_text(size = 11, color = "#555555"),
    axis.text.y = element_text(face = "bold", size = 10),
    panel.grid.major.y = element_blank() 
  )

# Table to represent the percentage of grocery stores in DC accessible by DC Metro. Three groups indicating a difference in walkability.
library(gt)

access_summary <- data.frame(
  Distance = c("400m", "800m", "1600m"),
  Walk_Time = c("5 min or less", "10 min or less", "20 min or less")
) %>%
  mutate(
    Grocery_Count = purrr::map_dbl(c(400, 800, 1600), function(d) {
      sum(st_is_within_distance(grocery_sf, metro_sf, dist = d) %>% lengths() > 0)
    }),
    Percentage = (Grocery_Count / nrow(grocery_sf)) * 100
  )

table_data <- access_summary %>%
  mutate(
    Distance = case_when(
      Distance == "400m"  ~ "Primary Zone (400m)",
      Distance == "800m"  ~ "Secondary Zone (800m)",
      Distance == "1600m" ~ "Extended Zone (1600m)"
    )
  )

# create table
table_data %>%
  gt() %>%
  tab_header(
    title = md("Percentage of Grocery Stores Within Walking Distance of DC Metro"),
  ) %>%
  cols_label(
    Distance = "Distance to Metro",
    Walk_Time = "Estimated Walk Time",
    Grocery_Count = "# of Stores",
    Percentage = "Coverage (%)"
  ) %>%
  fmt_number(
    columns = Percentage,
    decimals = 1
  ) %>%
  tab_source_note(
    source_note = paste0("Note: Analysis based on ", nrow(grocery_sf), " grocery stores and ", nrow(metro_sf), " Metro stations.")
  ) %>%
  tab_style(
    style = cell_text(weight = "bold"),
    locations = cells_body(columns = Percentage)
  ) %>%
  opt_table_font(
    font = google_font(name = "Times New Roman")
  )

Distance to Metro	Estimated Walk Time	# of Stores	Coverage (%)
Percentage of Grocery Stores Within Walking Distance of DC Metro
Primary Zone (400m)	5 min or less	111	29.3
Secondary Zone (800m)	10 min or less	226	59.6
Extended Zone (1600m)	20 min or less	322	85.0
Note: Analysis based on 379 grocery stores and 47 Metro stations.

Data Analytics UT435

Luther Hoy

2026-03-07