#install.packages("arrow")
#install.packages("tidyverse")
#install.packages("jsonlite")
#install.packages("leaflet")
#install.packages("gt")
#install.packages("tigris")
library(arrow)
##
## Attaching package: 'arrow'
## The following object is masked from 'package:utils':
##
## timestamp
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(jsonlite)
library(purrr)
##
## Attaching package: 'purrr'
## The following object is masked from 'package:jsonlite':
##
## flatten
# Generates a table using the tags of POIs in DC, displaying quantities of each type of POI. Many POIs have multiple tags.
pois <- open_dataset("DCData")
tag_frequency <- pois %>%
select(CATEGORY_TAGS) %>%
collect() %>%
filter(!is.na(CATEGORY_TAGS), CATEGORY_TAGS != "[]", CATEGORY_TAGS != "") %>% #filter empty tags
mutate(tags = map(CATEGORY_TAGS, ~fromJSON(.x))) %>%
unnest(tags) %>%
count(tags, name = "poi count dc", sort = TRUE) #show number of pois in order of most to least
print(tag_frequency)
## # A tibble: 881 × 2
## tags `poi count dc`
## <chr> <int>
## 1 Buses 3777
## 2 Bus Station 3775
## 3 Bar or Pub 1783
## 4 Bakery 1549
## 5 Cocktail Lounge 1416
## 6 General Dentistry 1367
## 7 Hygiene 1341
## 8 Cafe 1338
## 9 American Food 1313
## 10 Deli 1288
## # ℹ 871 more rows
# Focused only on the grocery tags. Before making a comparison with another POI, I wanted to generate a map of just grociery stores to test the ability of leaflet.
library(leaflet)
dc_groceries <- pois %>%
filter(
grepl("grocery", CATEGORY_TAGS, ignore.case = TRUE) |
grepl("grocery", SUB_CATEGORY, ignore.case = TRUE)
) %>%
select(LOCATION_NAME, LATITUDE, LONGITUDE, SUB_CATEGORY, STREET_ADDRESS) %>%
collect()
leaflet(dc_groceries) %>%
addProviderTiles(providers$CartoDB.Positron) %>%
addCircleMarkers(
lng = ~LONGITUDE,
lat = ~LATITUDE,
popup = ~paste0("<b>", LOCATION_NAME, "</b><br>", STREET_ADDRESS, "<br>", SUB_CATEGORY),
radius = 5,
color = "forestgreen",
stroke = FALSE,
fillOpacity = 0.7,
clusterOptions = markerClusterOptions() #group markers when zoomed out
)
# Next, I wanted to make a comparison between grocery stores and metros. I extracted POI metro locations from the same dataset.
metro_data <- pois %>%
filter(grepl("Metro Station", CATEGORY_TAGS, ignore.case = TRUE)) %>%
collect() %>%
mutate(tags = map(CATEGORY_TAGS, ~fromJSON(.x))) %>%
unnest(tags) %>%
filter(tags == "Metro Station") %>%
mutate(LATITUDE = as.numeric(LATITUDE),
LONGITUDE = as.numeric(LONGITUDE)) %>%
filter(!is.na(LATITUDE))
print(paste("Found", nrow(metro_data), "Metro Stations."))
## [1] "Found 70 Metro Stations."
# Many metro stations needed to be filtered out of this project, including the DC streetcar, which no longer exists.
library(sf)
## Linking to GEOS 3.12.1, GDAL 3.8.4, PROJ 9.4.0; sf_use_s2() is TRUE
metro_sf <- metro_data %>%
# exclude anything containing "Streetcar"
filter(!grepl("Streetcar", LOCATION_NAME, ignore.case = TRUE)) %>%
st_as_sf(coords = c("LONGITUDE", "LATITUDE"), crs = 4326) %>%
st_transform(3857)
# lots of duplicates showed up when using category tags, so this filted out duplicates
clusters <- st_is_within_distance(metro_sf, dist = 100)
unique_clusters <- unique(clusters)
metro_deduped_sf <- metro_sf[sapply(unique_clusters, function(x) x[1]), ]
metro_final <- metro_deduped_sf %>%
st_transform(4326) %>%
mutate(
LONGITUDE = st_coordinates(.)[,1],
LATITUDE = st_coordinates(.)[,2]
) %>%
st_drop_geometry()
print(paste("Original rows (including streetcars and duplicates):", nrow(metro_data)))
## [1] "Original rows (including streetcars and duplicates): 70"
print(paste("True Metro Count", nrow(metro_final)))
## [1] "True Metro Count 47"
# Map of both dc metro stations and grocery stores
leaflet() %>%
addProviderTiles(providers$CartoDB.Positron) %>%
addCircleMarkers(
data = metro_final,
lng = ~LONGITUDE, lat = ~LATITUDE,
radius = 6, # larger than groceries
color = "black",
weight = 2,
fillColor = "#005596", # classic metro blue
fillOpacity = 0.9,
group = "Metro Stations",
popup = ~paste0("<b>Metro Hub: </b>", LOCATION_NAME)
) %>%
addCircleMarkers(
data = dc_groceries,
lng = ~LONGITUDE, lat = ~LATITUDE,
radius = 3,
color = "forestgreen",
stroke = FALSE,
fillOpacity = 0.8,
group = "Groceries",
popup = ~paste0("<b>Grocery: </b>", LOCATION_NAME)
) %>%
addLayersControl(
overlayGroups = c("Metro Stations", "Groceries"),
options = layersControlOptions(collapsed = FALSE)
)
# Spatial map, connecting each grocery store to their closest metro station. Sorted in two colored categories: walkable, and transit isolated
grocery_sf <- st_as_sf(dc_groceries, coords = c("LONGITUDE", "LATITUDE"), crs = 4326)
metro_sf <- st_as_sf(metro_final, coords = c("LONGITUDE", "LATITUDE"), crs = 4326)
nearest_metro_idx <- st_nearest_feature(grocery_sf, metro_sf)
spider_lines <- lapply(1:nrow(grocery_sf), function(i) {
st_linestring(rbind(
st_coordinates(grocery_sf[i, ]),
st_coordinates(metro_sf[nearest_metro_idx[i], ])
))
})
spider_sf <- st_sf(
geometry = st_sfc(spider_lines, crs = 4326),
store_name = grocery_sf$LOCATION_NAME,
dist_m = as.numeric(st_distance(grocery_sf, metro_sf[nearest_metro_idx,], by_element = TRUE))
)
#map
leaflet() %>%
addProviderTiles(providers$CartoDB.Positron) %>%
addPolylines(
data = spider_sf,
# blue if < 800m, red if > 800m
color = ~ifelse(dist_m > 800, "red", "orange"),
# As a colorblind person, I tried to use the most accessible color combination for me. This map can get a little hard to read because of clustering of lines.
weight = ~ifelse(dist_m > 800, 2, 2),
opacity = 0.7,
label = ~paste0(store_name, ": ", round(dist_m), "m to Metro")
) %>%
addCircleMarkers(
data = metro_sf,
color = "black",
fillColor = "#005596", # metro blue
fillOpacity = 5,
radius = 4,
label = ~LOCATION_NAME
) %>%
# legend
addLegend(
position = "bottomright",
colors = c("orange", "red"),
labels = c("Walkable (< 800m)", "Transit Isolated (> 800m)"),
title = "Grocery to Metro Distance"
)
# Horizonal bar graph displaying the top 10 metro stations in DC with 5-minute access to grocery stores
library(stringr)
library(ggplot2)
top_10_stations <- metro_final %>%
st_as_sf(coords = c("LONGITUDE", "LATITUDE"), crs = 4326) %>%
mutate(grocery_count_5min = st_is_within_distance(., grocery_sf, dist = 400) %>%
lengths()) %>%
arrange(desc(grocery_count_5min)) %>%
head(10)
top_10_plot_data <- top_10_stations %>%
# removes the "METRO STATION" suffix
mutate(
NAME_SHORT = gsub("Washington Metropolitan Area Transit Authority-", "", LOCATION_NAME),
NAME_SHORT = gsub("(?i) METRO STATION", "", NAME_SHORT),
# fixes capitalization
NAME_SHORT = str_to_title(NAME_SHORT),
NAME_SHORT = gsub("Noma", "NoMa", NAME_SHORT)
)
# creates bar graph
ggplot(top_10_plot_data, aes(x = reorder(NAME_SHORT, grocery_count_5min), y = grocery_count_5min)) +
geom_bar(stat = "identity", fill = "#005596", width = 0.7) + #metro blue
coord_flip() +
labs(
title = "Top 10 DC Metro Stations by Food Access",
subtitle = "Number of grocery stores within a 5-minute walk (400m)",
x = NULL,
y = "Count of Grocery Stores",
) +
theme_minimal() +
theme(
plot.title = element_text(face = "bold", size = 14, color = "#222222"),
plot.subtitle = element_text(size = 11, color = "#555555"),
axis.text.y = element_text(face = "bold", size = 10),
panel.grid.major.y = element_blank()
)

# Table to represent the percentage of grocery stores in DC accessible by DC Metro. Three groups indicating a difference in walkability.
library(gt)
access_summary <- data.frame(
Distance = c("400m", "800m", "1600m"),
Walk_Time = c("5 min or less", "10 min or less", "20 min or less")
) %>%
mutate(
Grocery_Count = purrr::map_dbl(c(400, 800, 1600), function(d) {
sum(st_is_within_distance(grocery_sf, metro_sf, dist = d) %>% lengths() > 0)
}),
Percentage = (Grocery_Count / nrow(grocery_sf)) * 100
)
table_data <- access_summary %>%
mutate(
Distance = case_when(
Distance == "400m" ~ "Primary Zone (400m)",
Distance == "800m" ~ "Secondary Zone (800m)",
Distance == "1600m" ~ "Extended Zone (1600m)"
)
)
# create table
table_data %>%
gt() %>%
tab_header(
title = md("Percentage of Grocery Stores Within Walking Distance of DC Metro"),
) %>%
cols_label(
Distance = "Distance to Metro",
Walk_Time = "Estimated Walk Time",
Grocery_Count = "# of Stores",
Percentage = "Coverage (%)"
) %>%
fmt_number(
columns = Percentage,
decimals = 1
) %>%
tab_source_note(
source_note = paste0("Note: Analysis based on ", nrow(grocery_sf), " grocery stores and ", nrow(metro_sf), " Metro stations.")
) %>%
tab_style(
style = cell_text(weight = "bold"),
locations = cells_body(columns = Percentage)
) %>%
opt_table_font(
font = google_font(name = "Times New Roman")
)
| Percentage of Grocery Stores Within Walking Distance of DC Metro |
| Distance to Metro |
Estimated Walk Time |
# of Stores |
Coverage (%) |
| Primary Zone (400m) |
5 min or less |
111 |
29.3 |
| Secondary Zone (800m) |
10 min or less |
226 |
59.6 |
| Extended Zone (1600m) |
20 min or less |
322 |
85.0 |
| Note: Analysis based on 379 grocery stores and 47 Metro stations. |