Introduction

This analysis combines multiple spatial and demographic datasets to study potential relationships between COVID-19 cases, food access, healthcare facilities, and demographic characteristics across New York City zip codes. The integration of these datasets provides a foundation for examining social determinants of health in the context of the pandemic.

# Load required libraries
library(dplyr)
library(sf)
library(ggplot2)
library(magrittr)
library(stringr)
library(kableExtra)

Workflow Overview

This analysis follows these key steps:

  1. Join COVID-19 testing data to NYC zip code areas
  2. Aggregate food retail store data to zip codes
  3. Aggregate health facilities (especially nursing homes) to zip codes
  4. Join ACS population demographics to NYC census tracts
  5. Aggregate census data to zip code areas

Step 1: Joining COVID-19 Data to Zip Code Areas

# Load zip code boundaries
zpNYC <- sf::st_read("NYC_zip_code_shapefile.shp") 

# Load COVID-19 test data
covidData <- read.csv("tests-by-zcta_2021_04_23.csv") 

# Join COVID data to zip code polygons
zpNYC <- merge(zpNYC, covidData, by.x = "ZIPCODE", by.y = "ZCTA")

# Examine the first few records
head(zpNYC) %>%
  kable() %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = F)

Step 2: Aggregating Food Retail Stores to Zip Codes

# Filter food stores (types A, J, or D) and aggregate to zip codes
nycFoodStoresByZip <- nycFoodStoreSF %>% 
  dplyr::filter(stringr::str_detect(Estbl_T, '[AJD]')) %>%
  sf::st_join(zpNYC, ., join = st_contains) %>%
  group_by(ZIPCODE) %>%
  summarise(FoodStoreNum = n())

# Map food store counts by zip code
ggplot(nycFoodStoresByZip) +
  geom_sf(aes(fill = FoodStoreNum)) +
  scale_fill_viridis_c(option = "plasma") +
  theme_minimal() +
  labs(title = "Number of Food Stores by NYC Zip Code",
       fill = "Food Store Count") +
  theme(legend.position = "bottom")

Step 3: Aggregating Health Facilities to Zip Codes

We filter for nursing homes and then count them by zip code.

# Filter for nursing homes
nycNursingHome <- nycHealthFacilities %>% 
  dplyr::filter(Short.Description == 'NH')

# Aggregate nursing homes to zip codes
nursingHomesByZip <- zpNYC %>%
  sf::st_join(nycNursingHome, join = st_contains) %>%
  group_by(ZIPCODE) %>%
  summarise(NursingHomeNum = n())

# Map nursing home counts
ggplot(nursingHomesByZip) +
  geom_sf(aes(fill = NursingHomeNum)) +
  scale_fill_viridis_c(option = "magma") +
  theme_minimal() +
  labs(title = "Number of Nursing Homes by NYC Zip Code",
       fill = "Nursing Home Count") +
  theme(legend.position = "bottom")

Step 4: Joining Census ACS Data to Census Tracts

# Load NYC census tract boundaries
nycCensus <- sf::st_read('./R-Spatial_II_Lab/geo_export_1dc7b645-647b-4806-b9a0-7b79660f120a.shp', 
                         stringsAsFactors = FALSE)

# Add county FIPS codes based on borough names
nycCensus %<>% dplyr::mutate(cntyFIPS = case_when(
  boro_name == 'Bronx' ~ '005',
  boro_name == 'Brooklyn' ~ '047',
  boro_name == 'Manhattan' ~ '061',
  boro_name == 'Queens' ~ '081',
  boro_name == 'Staten Island' ~ '085'),
  tractFIPS = paste(cntyFIPS, ct2010, sep='')
)

# Load ACS demographic data, bypassing problematic second line
acsData <- readLines("./R-Spatial_II_Lab/ACSDP5Y2018.DP05_data_with_overlays_2020-04-22T132935.csv") %>%
  magrittr::extract(-2) %>% 
  textConnection() %>%
  read.csv(header = TRUE, quote = "\"") %>%
  dplyr::select(GEO_ID, 
                totPop = DP05_0001E, 
                elderlyPop = DP05_0024E, # >= 65
                malePop = DP05_0002E, 
                femalePop = DP05_0003E,  
                whitePop = DP05_0037E, 
                blackPop = DP05_0038E,
                asianPop = DP05_0067E, 
                hispanicPop = DP05_0071E,
                adultPop = DP05_0021E, 
                citizenAdult = DP05_0087E) %>%
  dplyr::mutate(censusCode = stringr::str_sub(GEO_ID, -9, -1))

# Merge ACS data with census tract boundaries
popData <- merge(nycCensus, acsData, by.x = 'tractFIPS', by.y = 'censusCode')

# Verify total population matches expectations
sum(popData$totPop)

# Transform to match coordinate system of zip code data
popNYC <- sf::st_transform(popData, st_crs(zpNYC))

Step 5: Aggregating Census Data to Zip Code Level

# Aggregate census data to zip codes using centroids
covidPopZipNYC <- sf::st_join(zpNYC, 
                             popNYC %>% sf::st_centroid(), 
                             join = st_contains) %>% 
  group_by(ZIPCODE, PO_NAME, POPULATION, COUNTY, COVID_CASE_COUNT, TOTAL_COVID_TESTS) %>%
  summarise(totPop = sum(totPop),
            malePctg = sum(malePop)/sum(totPop)*100,
            asianPop = sum(asianPop),
            blackPop = sum(blackPop),
            hispanicPop = sum(hispanicPop),
            whitePop = sum(whitePop)) 

# Check for ZIP codes with missing population data
missingPop <- covidPopZipNYC %>% 
  dplyr::filter(is.na(totPop)) %>%
  st_drop_geometry() %>%
  select(ZIPCODE, PO_NAME, COUNTY)

kable(missingPop, caption = "ZIP Codes with Missing Population Data") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = F)

#Final Results

# Map COVID case counts
ggplot(covidPopZipNYC) +
  geom_sf(aes(fill = COVID_CASE_COUNT)) +
  scale_fill_viridis_c(option = "inferno") +
  theme_minimal() +
  labs(title = "COVID-19 Cases by NYC ZIP Code",
       fill = "Case Count") +
  theme(legend.position = "bottom")

# Map Asian population distribution
ggplot(covidPopZipNYC) +
  geom_sf(aes(fill = asianPop)) +
  scale_fill_viridis_c() +
  theme_minimal() +
  labs(title = "Asian Population by NYC ZIP Code",
       fill = "Population") +
  theme(legend.position = "bottom")

Correlation Analysis

# Extract numeric columns and calculate correlations
cor_data <- covidPopZipNYC %>%
  st_drop_geometry() %>%
  select(COVID_CASE_COUNT, TOTAL_COVID_TESTS, totPop, FoodStoreNum, 
         NursingHomeNum, asianPop, blackPop, hispanicPop, whitePop)

# Calculate correlation matrix
cor_matrix <- cor(cor_data, use = "complete.obs")

# Visualize correlation matrix
corrplot::corrplot(cor_matrix, method = "circle", type = "upper", 
                   tl.col = "black", tl.srt = 45)

Conclusion

This analysis integrates COVID-19 testing data, food access indicators, healthcare infrastructure, and demographic information at the ZIP code level. To explore relationships between social determinants of health and COVID-19 outcomes in New York City.