library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.0     ✔ readr     2.2.0
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.2     ✔ tibble    3.3.1
## ✔ lubridate 1.9.5     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(sf)
## Linking to GEOS 3.13.0, GDAL 3.8.5, PROJ 9.5.1; sf_use_s2() is TRUE
library(magrittr)
## 
## Attaching package: 'magrittr'
## 
## The following object is masked from 'package:purrr':
## 
##     set_names
## 
## The following object is masked from 'package:tidyr':
## 
##     extract

Introduction

This lab focuses on combining multiple datasets into a single spatial dataset at the ZIP code level. The goal is to better understand how COVID-19 outcomes relate to local infrastructure and demographic characteristics across New York City. The datasets used in this analysis include COVID-19 testing data, food retail store locations, health facilities, and Census ACS demographic data. Because these datasets come in different formats and geographic units, both attribute joins and spatial joins are required to bring them together.

Preparation and Data Setup

Before beginning the main tasks, several preparation steps were necessary. First, the spatial objects created in the previous lab were loaded. These include ZIP code polygons, food retail store locations, and health facility locations. Additionally, data cleaning was required to ensure that join fields matched correctly. In particular, ZIP code values in the COVID dataset were converted to character format to match the ZIP code field in the spatial data. This step is important because mismatched data types would prevent a successful join.

zip_path <- "/Users/amallali/Desktop/GTECH 78520/Section_07/spatial_lab1_objects.RData"

covid_path <- "/Users/amallali/Desktop/GTECH 78520/Section_07/Section_08/R-Spatial_II_Lab/tests-by-zcta_2020_04_12.csv"

shp_path <- "/Users/amallali/Desktop/GTECH 78520/Section_07/Section_08/R-Spatial_II_Lab/geo_export_1dc7b645-647b-4806-b9a0-7b79660f120a.shp"

acs_path <- "/Users/amallali/Desktop/GTECH 78520/Section_07/Section_08/R-Spatial_II_Lab/ACSDP5Y2018.DP05_data_with_overlays_2020-04-22T132935.csv"

# LOAD DATA
load(zip_path)

covid_df <- read.csv(covid_path)

covid_df <- covid_df %>%
  mutate(MODZCTA = as.character(MODZCTA)) %>%
  filter(!is.na(MODZCTA))

Task 1: Joining COVID-19 Data to ZIP Code Areas Using Attribute Matching

This step links COVID-19 testing and case data to the ZIP code polygons. Since both datasets contain ZIP code identifiers, an attribute join is used. The left_join() function is used to preserve all ZIP code areas while attaching the COVID data.

zpNYC <- zip_sf %>%
  left_join(covid_df, by = c("ZIPCODE" = "MODZCTA"))

Task 2: Aggregating Food Retail Stores to ZIP Code Areas Using Spatial Join

In this step, food retail store locations (points) are aggregated to ZIP code polygons. Because these datasets do not share a common ID, a spatial join is required. Only specific types of food establishments are included based on the classification codes in the dataset. The result is a count of food stores within each ZIP code. Note: Because establishment types are stored as combined classification codes (e.g., JAC, JAD), pattern matching was used to identify relevant categories.

food_filtered <- food_sf %>%
  filter(stringr::str_detect(`Establishment.Type`, "[AJD]"))

food_counts <- st_join(zpNYC, food_filtered, join = st_contains) %>%
  group_by(ZIPCODE) %>%
  summarise(FoodStoreNum = n(), .groups = "drop")

zpNYC <- zpNYC %>%
  left_join(st_drop_geometry(food_counts), by = "ZIPCODE")

Task 3: Aggregating Health Facilities (Nursing Homes) to ZIP Code Areas

This step follows a similar process to Task 2, but focuses specifically on nursing homes. These facilities are selected from the broader health dataset and then spatially aggregated to ZIP code areas. This allows us to understand how many nursing homes are located within each ZIP code.

health_filtered <- health_sf %>%
  filter(`Short Description` == "NH")

health_counts <- st_join(zpNYC, health_filtered, join = st_contains) %>%
  group_by(ZIPCODE) %>%
  summarise(NursingHomeNum = n(), .groups = "drop")

zpNYC <- zpNYC %>%
  left_join(st_drop_geometry(health_counts), by = "ZIPCODE")

# Clean duplicate NursingHomeNum columns if they exist
if("NursingHomeNum.x" %in% names(zpNYC)){
  zpNYC <- zpNYC %>%
    mutate(NursingHomeNum = NursingHomeNum.x) %>%
    select(-NursingHomeNum.x, -NursingHomeNum.y)
}

Task 4: Joining ACS Demographic Data to Census Tracts

The ACS demographic data is provided at the census tract level, so it must first be joined to the NYC Planning census tract dataset. Because these datasets use different formats for identifying tracts, a new matching column is created using substring operations. This allows for a proper attribute join between the two datasets.

nycCensus <- st_read(shp_path)
## Reading layer `geo_export_1dc7b645-647b-4806-b9a0-7b79660f120a' from data source `/Users/amallali/Desktop/GTECH 78520/Section_07/Section_08/R-Spatial_II_Lab/geo_export_1dc7b645-647b-4806-b9a0-7b79660f120a.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 2165 features and 11 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: -74.25559 ymin: 40.49612 xmax: -73.70001 ymax: 40.91553
## Geodetic CRS:  WGS84(DD)
acs_df <- readLines(acs_path) %>%
  magrittr::extract(-2) %>%
  textConnection() %>%
  read.csv(header=TRUE, quote="\"") %>%
  select(
    GEO_ID,
    totPop = DP05_0001E,
    elderlyPop = DP05_0024E,
    whitePop = DP05_0037E,
    blackPop = DP05_0038E,
    asianPop = DP05_0067E,
    hispanicPop = DP05_0071E
  ) %>%
  mutate(tract = stringr::str_sub(GEO_ID, -6))

nycCensus <- nycCensus %>%
  mutate(tract = as.character(ct2010))

Task 5: Aggregating Census Data to ZIP Code Level Using Spatial Join

In the final step, the census tract data is aggregated to the ZIP code level. Since census tracts and ZIP codes are different geographic units, a spatial join is used. The tract polygons are converted to centroids, and then assigned to ZIP codes. The demographic variables are then summed within each ZIP code area.

popData <- left_join(nycCensus, acs_df, by = "tract", relationship = "many-to-many")

popData <- st_transform(popData, st_crs(zpNYC))

final_data <- st_join(
  zpNYC,
  st_centroid(popData),
  join = st_contains
) %>%
  group_by(
    ZIPCODE,
    PO_NAME,
    COUNTY,
    Positive,
    Total,
    zcta_cum.perc_pos,
    FoodStoreNum,
    NursingHomeNum
  ) %>%
  summarise(
    totPop = sum(totPop, na.rm = TRUE),
    elderlyPop = sum(elderlyPop, na.rm = TRUE),
    asianPop = sum(asianPop, na.rm = TRUE),
    blackPop = sum(blackPop, na.rm = TRUE),
    hispanicPop = sum(hispanicPop, na.rm = TRUE),
    whitePop = sum(whitePop, na.rm = TRUE),
    .groups = "drop"
  )
## Warning: st_centroid assumes attributes are constant over geometries

Final Output Verification

names(final_data)
##  [1] "ZIPCODE"           "PO_NAME"           "COUNTY"           
##  [4] "Positive"          "Total"             "zcta_cum.perc_pos"
##  [7] "FoodStoreNum"      "NursingHomeNum"    "totPop"           
## [10] "elderlyPop"        "asianPop"          "blackPop"         
## [13] "hispanicPop"       "whitePop"          "geometry"
sum(final_data$totPop, na.rm = TRUE)
## [1] 18797507
summary(final_data$elderlyPop)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0       0    7496   10306   17574   48585

Visualizing the Data

Total Population by ZIP Code

This map shows how the total population is distributed across ZIP code areas in New York City. By mapping population, we can quickly see which areas are more densely populated and may require more resources or attention when analyzing public health patterns. Although mapping was not a required component of the assignment, I wanted to visualize some of the data.

library(ggplot2)

ggplot(final_data) +
  geom_sf(aes(fill = totPop)) +
  scale_fill_viridis_c(option = "plasma") +
  labs(
    title = "Total Population by ZIP Code in New York City",
    fill = "Population"
  ) +
  theme_minimal()

COVID-19 Positivity Rate by ZIP Code

This map displays the COVID-19 positivity rate across ZIP code areas. It helps highlight how the impact of COVID-19 varies across different neighborhoods and provides useful context when compared with population and infrastructure patterns.

ggplot(zpNYC) +
  geom_sf(aes(fill = zcta_cum.perc_pos)) +
  scale_fill_viridis_c(option = "magma") +
  labs(
    title = "COVID-19 Positivity Rate by ZIP Code",
    fill = "Positivity (%)"
  ) +
  theme_minimal()

Conclusion

This lab demonstrates how multiple datasets with different structures and geographic units can be successfully combined into a single spatial dataset. Attribute joins were used when datasets shared a common identifier, such as ZIP codes, while spatial joins were necessary when working across different geographic levels, such as census tracts and ZIP codes. The final dataset provides a comprehensive view of COVID-19 outcomes, food access, healthcare infrastructure, and demographic patterns at the ZIP code level.