This analysis combines multiple spatial and demographic datasets to study potential relationships between COVID-19 cases, food access, healthcare facilities, and demographic characteristics across New York City zip codes. The integration of these datasets provides a foundation for examining social determinants of health in the context of the pandemic.
# Load required libraries
library(dplyr)
library(sf)
library(ggplot2)
library(magrittr)
library(stringr)
library(kableExtra)
This analysis follows these key steps:
# Load zip code boundaries
zpNYC <- sf::st_read("NYC_zip_code_shapefile.shp")
# Load COVID-19 test data
covidData <- read.csv("tests-by-zcta_2021_04_23.csv")
# Join COVID data to zip code polygons
zpNYC <- merge(zpNYC, covidData, by.x = "ZIPCODE", by.y = "ZCTA")
# Examine the first few records
head(zpNYC) %>%
kable() %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = F)
# Filter food stores (types A, J, or D) and aggregate to zip codes
nycFoodStoresByZip <- nycFoodStoreSF %>%
dplyr::filter(stringr::str_detect(Estbl_T, '[AJD]')) %>%
sf::st_join(zpNYC, ., join = st_contains) %>%
group_by(ZIPCODE) %>%
summarise(FoodStoreNum = n())
# Map food store counts by zip code
ggplot(nycFoodStoresByZip) +
geom_sf(aes(fill = FoodStoreNum)) +
scale_fill_viridis_c(option = "plasma") +
theme_minimal() +
labs(title = "Number of Food Stores by NYC Zip Code",
fill = "Food Store Count") +
theme(legend.position = "bottom")
We filter for nursing homes and then count them by zip code.
# Filter for nursing homes
nycNursingHome <- nycHealthFacilities %>%
dplyr::filter(Short.Description == 'NH')
# Aggregate nursing homes to zip codes
nursingHomesByZip <- zpNYC %>%
sf::st_join(nycNursingHome, join = st_contains) %>%
group_by(ZIPCODE) %>%
summarise(NursingHomeNum = n())
# Map nursing home counts
ggplot(nursingHomesByZip) +
geom_sf(aes(fill = NursingHomeNum)) +
scale_fill_viridis_c(option = "magma") +
theme_minimal() +
labs(title = "Number of Nursing Homes by NYC Zip Code",
fill = "Nursing Home Count") +
theme(legend.position = "bottom")
# Load NYC census tract boundaries
nycCensus <- sf::st_read('./R-Spatial_II_Lab/geo_export_1dc7b645-647b-4806-b9a0-7b79660f120a.shp',
stringsAsFactors = FALSE)
# Add county FIPS codes based on borough names
nycCensus %<>% dplyr::mutate(cntyFIPS = case_when(
boro_name == 'Bronx' ~ '005',
boro_name == 'Brooklyn' ~ '047',
boro_name == 'Manhattan' ~ '061',
boro_name == 'Queens' ~ '081',
boro_name == 'Staten Island' ~ '085'),
tractFIPS = paste(cntyFIPS, ct2010, sep='')
)
# Load ACS demographic data, bypassing problematic second line
acsData <- readLines("./R-Spatial_II_Lab/ACSDP5Y2018.DP05_data_with_overlays_2020-04-22T132935.csv") %>%
magrittr::extract(-2) %>%
textConnection() %>%
read.csv(header = TRUE, quote = "\"") %>%
dplyr::select(GEO_ID,
totPop = DP05_0001E,
elderlyPop = DP05_0024E, # >= 65
malePop = DP05_0002E,
femalePop = DP05_0003E,
whitePop = DP05_0037E,
blackPop = DP05_0038E,
asianPop = DP05_0067E,
hispanicPop = DP05_0071E,
adultPop = DP05_0021E,
citizenAdult = DP05_0087E) %>%
dplyr::mutate(censusCode = stringr::str_sub(GEO_ID, -9, -1))
# Merge ACS data with census tract boundaries
popData <- merge(nycCensus, acsData, by.x = 'tractFIPS', by.y = 'censusCode')
# Verify total population matches expectations
sum(popData$totPop)
# Transform to match coordinate system of zip code data
popNYC <- sf::st_transform(popData, st_crs(zpNYC))
# Aggregate census data to zip codes using centroids
covidPopZipNYC <- sf::st_join(zpNYC,
popNYC %>% sf::st_centroid(),
join = st_contains) %>%
group_by(ZIPCODE, PO_NAME, POPULATION, COUNTY, COVID_CASE_COUNT, TOTAL_COVID_TESTS) %>%
summarise(totPop = sum(totPop),
malePctg = sum(malePop)/sum(totPop)*100,
asianPop = sum(asianPop),
blackPop = sum(blackPop),
hispanicPop = sum(hispanicPop),
whitePop = sum(whitePop))
# Check for ZIP codes with missing population data
missingPop <- covidPopZipNYC %>%
dplyr::filter(is.na(totPop)) %>%
st_drop_geometry() %>%
select(ZIPCODE, PO_NAME, COUNTY)
kable(missingPop, caption = "ZIP Codes with Missing Population Data") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = F)
#Final Results
# Map COVID case counts
ggplot(covidPopZipNYC) +
geom_sf(aes(fill = COVID_CASE_COUNT)) +
scale_fill_viridis_c(option = "inferno") +
theme_minimal() +
labs(title = "COVID-19 Cases by NYC ZIP Code",
fill = "Case Count") +
theme(legend.position = "bottom")
# Map Asian population distribution
ggplot(covidPopZipNYC) +
geom_sf(aes(fill = asianPop)) +
scale_fill_viridis_c() +
theme_minimal() +
labs(title = "Asian Population by NYC ZIP Code",
fill = "Population") +
theme(legend.position = "bottom")
# Extract numeric columns and calculate correlations
cor_data <- covidPopZipNYC %>%
st_drop_geometry() %>%
select(COVID_CASE_COUNT, TOTAL_COVID_TESTS, totPop, FoodStoreNum,
NursingHomeNum, asianPop, blackPop, hispanicPop, whitePop)
# Calculate correlation matrix
cor_matrix <- cor(cor_data, use = "complete.obs")
# Visualize correlation matrix
corrplot::corrplot(cor_matrix, method = "circle", type = "upper",
tl.col = "black", tl.srt = 45)
This analysis integrates COVID-19 testing data, food access indicators, healthcare infrastructure, and demographic information at the ZIP code level. To explore relationships between social determinants of health and COVID-19 outcomes in New York City.