Spatial Data Manipulation in R

Setting up

task 1: Join the COVID-19 data to the NYC zip code area data (sf or sp polygons).

Prepare data from last lab

zip_codes <- st_read('../assign7/R-Spatial_I_Lab/ZIP_CODE_040114/ZIP_CODE_040114.shp')
## Reading layer `ZIP_CODE_040114' from data source 
##   `/Users/williamcornejo/Desktop/Desktop - william’s MacBook Air/school/gtech705/gtech78520/assign7/R-Spatial_I_Lab/ZIP_CODE_040114/ZIP_CODE_040114.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 263 features and 12 fields
## Geometry type: POLYGON
## Dimension:     XY
## Bounding box:  xmin: 913129 ymin: 120020.9 xmax: 1067494 ymax: 272710.9
## Projected CRS: NAD83 / New York Long Island (ftUS)
retail_df_xy <- read.csv('../assign7/R-Spatial_I_Lab/nys_retail_food_store_xy.csv')
r_sf <- retail_df_xy[!is.na(retail_df_xy$X),]
r_sf <- r_sf[!is.na(r_sf$Y),]
r_sf <- st_as_sf(r_sf, coords = c("X", "Y"), crs="4326")

health_facilities <- read.csv('../assign7/R-Spatial_I_Lab/NYS_Health_Facility 3(in).csv')
hf_df <- health_facilities[!is.na(health_facilities$Facility.Longitude), ]
hf_df <- hf_df[!is.na(hf_df$Facility.Latitude), ]
hf_df <- hf_df[!is.na(hf_df$Facility.Longitude), ]
hf_df <- hf_df[hf_df$Facility.Latitude != 0, ]
hf_df <- hf_df[hf_df$Facility.Longitude != 0, ]
hf_sf <- st_as_sf(hf_df, coords = c("Facility.Longitude", "Facility.Latitude"))
# Read covid data

#covid_1 <- read.csv('R-Spatial_II_Lab/tests-by-zcta_2020_04_12.csv')
#covid_2 <- read.csv('R-Spatial_II_Lab/tests-by-zcta_2020_04_19.csv')
covid_3 <- read.csv('R-Spatial_II_Lab/tests-by-zcta_2021_04_23.csv')

# Read zip code data as spatial
zip_codes_1 <- st_transform(zip_codes, 4326) 

# Join by zipcode
# Convert zcta to char
covid_3 <- covid_3[!is.na(covid_3$MODIFIED_ZCTA), ]
covid_3_sf <- st_as_sf(covid_3, coords = c("lon", "lat"), crs=4326)
covid_3$MODZCTA_2 <- as.character(covid_3$MODIFIED_ZCTA)
covid_3_sf$MODZCTA_2 <- as.character(covid_3_sf$MODIFIED_ZCTA)
covid_3_join <- base::merge(covid_3, zip_codes_1, by.x='MODZCTA_2', by.y='ZIPCODE')
covid_3_join <- st_as_sf(covid_3_join, coords = c("lon", "lat"), crs=4326)
covid_3_join <- covid_3_join %>% 
  rename(ZIPCODE = MODZCTA_2)
# Join with just st_join
covid_3_sf <- st_join(zip_codes_1, covid_3_sf)

#mapview(covid_3_join, zcol=c('COVID_CASE_COUNT'))
  knitr::include_graphics('Screen Shot 2025-03-26 at 4.59.23 PM.png')

task 2: Aggregate the NYC food retails store data (points) to the zip code data, so that we know how many retail stores in each zip code area. Note that not all locations are for food retail. And we need to choose the specific types according to the data.

#r_sf$zip_code <- as.character(r_sf$Zip.Code)
nyc_food_store <- st_read('R-Spatial_II_Lab/nycFoodStore.shp')
## Reading layer `nycFoodStore' from data source 
##   `/Users/williamcornejo/Desktop/Desktop - william’s MacBook Air/school/gtech705/gtech78520/assign8/R-Spatial_II_Lab/nycFoodStore.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 11300 features and 16 fields
## Geometry type: POINT
## Dimension:     XY
## Bounding box:  xmin: -74.2484 ymin: 40.50782 xmax: -73.67061 ymax: 40.91008
## Geodetic CRS:  WGS 84
st_crs(r_sf) <- 4326

r_sf_filt <- r_sf %>%
  dplyr::filter(stringr::str_detect(Establishment.Type, '[ABDJ]'))

retail_zips <- sf::st_join(zip_codes_1, r_sf_filt, join= st_contains) %>%
  group_by(ZIPCODE) %>%
  summarise(FoodStoreNum = n())
covid_3_sf_new <- covid_3_join %>%
  left_join(st_drop_geometry(retail_zips), by ="ZIPCODE")
#mapview(covid_3_sf_new, zcol=c('FoodStoreNum'))
#mapview(retail_zips_2)
knitr::include_graphics('Screen Shot 2025-03-27 at 12.47.54 PM.png')

task 3: Aggregate the NYC health facilities (points) to the zip code data. Similarly, choose appropriate subtypes such as nursing homes from the facilities.

# Adjust CRS of health facilities
st_crs(hf_sf) <- 4326
# Filter for hospitals
hf_sf_hospital <- hf_sf %>%
  filter(Short.Description == 'HOSP')
# Join zipcodes sf to hospitals to get count by zip
hf_zip <- sf::st_join(zip_codes_1, hf_sf_hospital, join= st_contains) %>%
  group_by(ZIPCODE) %>%
  summarise(hfNum = n())
# Ignore, other join based in geometry
#hf_zip_1 <- zip_codes_1 %>%
#  st_join(hf_sf_hospital) %>%
#  group_by(geometry) %>%
#  summarise(hfNum = n())
# join hospital count by zip to sf which will have everything (retail, health and acs)
covid_3_sf_new_1 <- covid_3_sf_new %>%
  left_join(st_drop_geometry(hf_zip), by = "ZIPCODE")
mapview(hf_zip['hfNum'])
knitr::include_graphics('Screen Shot 2025-03-28 at 2.51.17 PM.png')

task 4: Join the Census ACS population, race, and age data to the NYC Planning Census Tract Data.

# Some code from key tasks file
acsData <- readLines("R-Spatial_II_Lab/ACSDP5Y2018.DP05_data_with_overlays_2020-04-22T132935.csv") %>%
  magrittr::extract(-2) %>% 
  textConnection() %>%
  read.csv(header=TRUE, quote= "\"") %>%
  dplyr::select(GEO_ID, 
                totPop = DP05_0001E, elderlyPop = DP05_0024E, # >= 65
                malePop = DP05_0002E, femalePop = DP05_0003E,  
                whitePop = DP05_0037E, blackPop = DP05_0038E,
                asianPop = DP05_0067E, hispanicPop = DP05_0071E,
                adultPop = DP05_0021E, citizenAdult = DP05_0087E) %>%
  dplyr::mutate(censusCode = stringr::str_sub(GEO_ID, -9,-1));
nycCensus <- sf::st_read('R-Spatial_II_Lab/2010 Census Tracts/geo_export_1dc7b645-647b-4806-b9a0-7b79660f120a.shp', stringsAsFactors = FALSE)
## Reading layer `geo_export_1dc7b645-647b-4806-b9a0-7b79660f120a' from data source `/Users/williamcornejo/Desktop/Desktop - william’s MacBook Air/school/gtech705/gtech78520/assign8/R-Spatial_II_Lab/2010 Census Tracts/geo_export_1dc7b645-647b-4806-b9a0-7b79660f120a.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 2165 features and 11 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: -74.25559 ymin: 40.49612 xmax: -73.70001 ymax: 40.91553
## Geodetic CRS:  WGS84(DD)
nycCensus %<>% dplyr::mutate(cntyFIPS = case_when(
  boro_name == 'Bronx' ~ '005',
  boro_name == 'Brooklyn' ~ '047',
  boro_name == 'Manhattan' ~ '061',
  boro_name == 'Queens' ~ '081',
  boro_name == 'Staten Island' ~ '085'),
  tractFIPS = paste(cntyFIPS, ct2010, sep='')
)
# Make new column from nycCensus$boroct201, where we add 1400000US3600 to it
# And name it GEO_ID, which is optional
acsData %>%
  magrittr::extract(1:10,)
##                  GEO_ID totPop elderlyPop malePop femalePop whitePop blackPop
## 1  1400000US36005000100   7080         51    6503       577     1773     4239
## 2  1400000US36005000200   4542        950    2264      2278     2165     1279
## 3  1400000US36005000400   5634        710    2807      2827     2623     1699
## 4  1400000US36005001600   5917        989    2365      3552     2406     2434
## 5  1400000US36005001900   2765         76    1363      1402      585     1041
## 6  1400000US36005002000   9409        977    4119      5290     3185     4487
## 7  1400000US36005002300   4600        648    2175      2425      479     2122
## 8  1400000US36005002400    172          0     121        51       69       89
## 9  1400000US36005002500   5887        548    2958      2929      903     1344
## 10 1400000US36005002701   2868        243    1259      1609      243      987
##    asianPop hispanicPop adultPop citizenAdult censusCode
## 1       130        2329     6909         6100  005000100
## 2       119        3367     3582         2952  005000200
## 3       226        3873     4507         4214  005000400
## 4        68        3603     4416         3851  005001600
## 5       130        1413     2008         1787  005001900
## 6        29        5905     6851         6170  005002000
## 7        27        2674     3498         3056  005002300
## 8        14           0      131           42  005002400
## 9        68        4562     4237         2722  005002500
## 10        0        1985     1848         1412  005002701
# Merge (JOIN) ACS data to the census tracts
popData <- merge(nycCensus, acsData, by.x ='tractFIPS', by.y = 'censusCode')
#mapview(popData, zcol=c('totPop'))
knitr::include_graphics('Screen Shot 2025-03-26 at 10.17.40 PM.png')

task 5: Aggregate the ACS census data to zip code area data.

popData_1 <- st_transform(popData, 4326)

# Ignore
#covid_3_sf_new_f <- covid_3_sf_new_1 %>%
#  left_join(
#    st_drop_geometry(popData_1),  # Ensure no geometry conflict
#    by = c("CTY_FIPS" = "cntyFIPS")  # Correct join syntax
#  )
# Final dataframe with zipcodes, census tracts,
c_final <- st_join(covid_3_sf_new_1, popData_1)
#mapview(c_final['asianPop'])
#plot(covid_3_sf_new_f['totPop'])

Confirming all the data is in one table with colnames

colnames(c_final)
##  [1] "ZIPCODE"           "MODIFIED_ZCTA"     "NEIGHBORHOOD_NAME"
##  [4] "BOROUGH_GROUP"     "label"             "COVID_CASE_COUNT" 
##  [7] "COVID_CASE_RATE"   "POP_DENOMINATOR"   "COVID_DEATH_COUNT"
## [10] "COVID_DEATH_RATE"  "PERCENT_POSITIVE"  "TOTAL_COVID_TESTS"
## [13] "BLDGZIP"           "PO_NAME"           "POPULATION"       
## [16] "AREA"              "STATE"             "COUNTY"           
## [19] "ST_FIPS"           "CTY_FIPS"          "URL"              
## [22] "SHAPE_AREA"        "SHAPE_LEN"         "FoodStoreNum"     
## [25] "hfNum"             "tractFIPS"         "boro_code"        
## [28] "boro_ct201"        "boro_name"         "cdeligibil"       
## [31] "ct2010"            "ctlabel"           "ntacode"          
## [34] "ntaname"           "puma"              "shape_area"       
## [37] "shape_leng"        "cntyFIPS"          "GEO_ID"           
## [40] "totPop"            "elderlyPop"        "malePop"          
## [43] "femalePop"         "whitePop"          "blackPop"         
## [46] "asianPop"          "hispanicPop"       "adultPop"         
## [49] "citizenAdult"      "geometry"

Plotting Asian Population by Zip Code

knitr::include_graphics('Screen Shot 2025-03-28 at 8.44.06 PM.png')

In the end, we should have the confirmed and tested cases of covid-19, numbers of specific types of food stores, numbers of specific types of health facilities, and population (total population, elderly, by race, etc.) at the zip code level. We should also have boroughs, names, etc. for each zip code area.