GTECH_38520_2025S_Week08_24178392

R Spatial Lab Assignment # 2

task 0: Import necessary libraries and data path

task 1: Join the COVID-19 data to the NYC zip code area

## Read COVID-19 file
covid_19_cases_df <- read.csv(file.path(data_dir,'tests-by-zcta_2021_04_23.csv'))


## Reference: Convert method from https://stackoverflow.com/questions/57711323/convert-dataframe-numeric-column-to-character
covid_19_cases_df$MODIFIED_ZCTA<-as.character(covid_19_cases_df$MODIFIED_ZCTA)

## Convert COVID-19 file to spatial file
covid_19_cases_sf <- covid_19_cases_df %>%
                      ## remove the na values from the list, otherwise sf() cannot work
                      filter(!is.na(lon) , !is.na(lat))

## Read Zip Code Shapefile
zip_code_sf <- st_read(file.path(data_dir,'ZIP_CODE_040114/ZIP_CODE_040114.shp'))

## Reading layer `ZIP_CODE_040114' from data source 
##   `/Users/ziruizheng/Desktop/College Class/Now/Data Analysis & Viz/2025_Spring_week 7-10/Section_08/R-Spatial_II_Lab/ZIP_CODE_040114/ZIP_CODE_040114.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 263 features and 12 fields
## Geometry type: POLYGON
## Dimension:     XY
## Bounding box:  xmin: 913129 ymin: 120020.9 xmax: 1067494 ymax: 272710.9
## Projected CRS: NAD83 / New York Long Island (ftUS)

zip_code_sf <- st_transform(zip_code_sf, crs = 4326)


long_text <- 'Notice:\n\nI did the assignment in two methods: left join if it is a combo of df & sf. st_join if it is a combo of two sfs. )'
wrapped_text <- strwrap(long_text, width = 70)
cat(wrapped_text, sep = "\n")

## Notice:
## 
## I did the assignment in two methods: left join if it is a combo of df
## & sf. st_join if it is a combo of two sfs. )

## Join the Covid-19 dataframe and Zip Code sf in st_join
Covid_zip_code_merged_sf_st_join  <- zip_code_sf %>%
  left_join(covid_19_cases_df, by = c("ZIPCODE" = "MODIFIED_ZCTA"))  


cat('1. left_join result for Covi-19 with Zipcode:')

## 1. left_join result for Covi-19 with Zipcode:

glimpse(Covid_zip_code_merged_sf_st_join)

## Rows: 263
## Columns: 25
## $ ZIPCODE           <chr> "11436", "11213", "11212", "11225", "11218", "11226"…
## $ BLDGZIP           <chr> "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0…
## $ PO_NAME           <chr> "Jamaica", "Brooklyn", "Brooklyn", "Brooklyn", "Broo…
## $ POPULATION        <dbl> 18681, 62426, 83866, 56527, 72280, 106132, 92561, 67…
## $ AREA              <dbl> 22699295, 29631004, 41972104, 23698630, 36868799, 39…
## $ STATE             <chr> "NY", "NY", "NY", "NY", "NY", "NY", "NY", "NY", "NY"…
## $ COUNTY            <chr> "Queens", "Kings", "Kings", "Kings", "Kings", "Kings…
## $ ST_FIPS           <chr> "36", "36", "36", "36", "36", "36", "36", "36", "36"…
## $ CTY_FIPS          <chr> "081", "047", "047", "047", "047", "047", "047", "04…
## $ URL               <chr> "http://www.usps.com/", "http://www.usps.com/", "htt…
## $ SHAPE_AREA        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ SHAPE_LEN         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ NEIGHBORHOOD_NAME <chr> "South Jamaica/South Ozone Park", "Crown Heights (Ea…
## $ BOROUGH_GROUP     <chr> "Queens", "Brooklyn", "Brooklyn", "Brooklyn", "Brook…
## $ label             <chr> "11436", "11213", "11212", "11225", "11218", "11226"…
## $ lat               <dbl> 40.67582, 40.67107, 40.66293, 40.66306, 40.64348, 40…
## $ lon               <dbl> -73.79662, -73.93633, -73.91301, -73.95423, -73.9760…
## $ COVID_CASE_COUNT  <int> 1888, 5166, 7182, 3833, 6199, 7279, 8429, 5380, 1104…
## $ COVID_CASE_RATE   <dbl> 9419.96, 7996.75, 9709.74, 6664.50, 8377.49, 7476.75…
## $ POP_DENOMINATOR   <dbl> 20042.54, 64601.26, 73966.99, 57513.69, 73995.92, 97…
## $ COVID_DEATH_COUNT <int> 64, 203, 330, 177, 218, 368, 256, 206, 380, 219, 113…
## $ COVID_DEATH_RATE  <dbl> 319.32, 314.24, 446.14, 307.75, 294.61, 378.00, 284.…
## $ PERCENT_POSITIVE  <dbl> 17.57, 13.72, 15.64, 11.62, 13.93, 13.33, 15.64, 15.…
## $ TOTAL_COVID_TESTS <int> 11082, 38560, 47319, 33709, 45884, 56287, 55444, 350…
## $ geometry          <POLYGON [°]> POLYGON ((-73.80585 40.6829..., POLYGON ((-7…

task 2: Aggregate the NYC food retails store to the zip code data

## Read Food Retail Shapefile
retail_food_sf <- st_read(file.path(data_dir,'nycFoodStore.shp'))

## Reading layer `nycFoodStore' from data source 
##   `/Users/ziruizheng/Desktop/College Class/Now/Data Analysis & Viz/2025_Spring_week 7-10/Section_08/R-Spatial_II_Lab/nycFoodStore.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 11300 features and 16 fields
## Geometry type: POINT
## Dimension:     XY
## Bounding box:  xmin: -74.2484 ymin: 40.50782 xmax: -73.67061 ymax: 40.91008
## Geodetic CRS:  WGS 84

retail_food_sf <- st_transform(retail_food_sf, crs = 4326)


## Aggregate food retail and zipcode
Food_retail_zip_code_sf <- retail_food_sf %>% 
  filter(City == "NEW YORK") %>%
  st_join(zip_code_sf) %>%
  group_by(ZIPCODE) %>%
  summarize(store_number = n()) 

cat('2. st_join result for Food retail store with Zipcode:')

## 2. st_join result for Food retail store with Zipcode:

glimpse(Food_retail_zip_code_sf)

## Rows: 52
## Columns: 3
## $ ZIPCODE      <chr> "10001", "10002", "10003", "10004", "10005", "10006", "10…
## $ store_number <int> 54, 199, 94, 10, 9, 5, 20, 84, 48, 110, 46, 116, 55, 63, …
## $ geometry     <MULTIPOINT [°]> MULTIPOINT ((-73.99079 40.7..., MULTIPOINT ((-…

task 3: Aggregate the NYC Health Facilities to the zip code data

## Read Health Facilities file
health_facility_df <- read.csv(file.path(data_dir,'NYS_Health_Facility.csv'))

## Filter df with non NAs
health_facility_df <- health_facility_df %>%
                      ## remove the na values from the list, otherwise sf() cannot work
                      filter(!is.na(Facility.Longitude) , !is.na(Facility.Latitude)) %>%
                      filter(Short.Description == 'NH') %>%
                      group_by(Facility.Zip.Code) %>%
                      summarise(number_of_nurse_home = n())


## Aggregate the facility of nurse home with zip code
health_facility_zip_code_merge_df <- zip_code_sf %>%
  left_join(health_facility_df, by = c("ZIPCODE" = "Facility.Zip.Code")) %>%
  select(ZIPCODE, POPULATION, STATE, COUNTY, number_of_nurse_home, geometry)

cat('3. left_join result for health facility with Zipcode:')

## 3. left_join result for health facility with Zipcode:

glimpse(health_facility_zip_code_merge_df)

## Rows: 263
## Columns: 6
## $ ZIPCODE              <chr> "11436", "11213", "11212", "11225", "11218", "112…
## $ POPULATION           <dbl> 18681, 62426, 83866, 56527, 72280, 106132, 92561,…
## $ STATE                <chr> "NY", "NY", "NY", "NY", "NY", "NY", "NY", "NY", "…
## $ COUNTY               <chr> "Queens", "Kings", "Kings", "Kings", "Kings", "Ki…
## $ number_of_nurse_home <int> NA, 1, 1, NA, 1, 3, 3, NA, 1, NA, 3, NA, 2, 5, 8,…
## $ geometry             <POLYGON [°]> POLYGON ((-73.80585 40.6829..., POLYGON (…

Task 4: Join the Census ACS population, race, and age data to the NYC Planning Census Tract Data

## NYC Planning Census Tract
Tract_sf <- st_read(file.path(data_dir,'2010 Census Tracts/geo_export_1dc7b645-647b-4806-b9a0-7b79660f120a.shp'))

## Reading layer `geo_export_1dc7b645-647b-4806-b9a0-7b79660f120a' from data source `/Users/ziruizheng/Desktop/College Class/Now/Data Analysis & Viz/2025_Spring_week 7-10/Section_08/R-Spatial_II_Lab/2010 Census Tracts/geo_export_1dc7b645-647b-4806-b9a0-7b79660f120a.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 2165 features and 11 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: -74.25559 ymin: 40.49612 xmax: -73.70001 ymax: 40.91553
## Geodetic CRS:  WGS84(DD)

long_text <- 'Notice:\n\n Task 4 is a challenging topic the common factors in two Tract and Census data files are unclear in the first place. So I had to standardize the files a little bit in order to use the left_join function. I used "id" and "boro_name" columns to match these two files. Process is specified below: 
\n 1.Census file have boro and state glue in one string, the separate() function is used to separate them into individual columns. 
\n 2.Census and Tract have different boro names, ex: "New York State" & "Manhattan", case_when() is used to mutate new column to fix it. 
\n 3.Only left_join by boro name will create many-to-many warning, therefore "id" col is applied to uniquely join elements in two files'
wrapped_text <- strwrap(long_text, width = 70)
cat(wrapped_text, sep = "\n")

## Notice:
## 
## Task 4 is a challenging topic the common factors in two Tract and
## Census data files are unclear in the first place. So I had to
## standardize the files a little bit in order to use the left_join
## function. I used "id" and "boro_name" columns to match these two
## files. Process is specified below:
## 
## 1.Census file have boro and state glue in one string, the separate()
## function is used to separate them into individual columns.
## 
## 2.Census and Tract have different boro names, ex: "New York State" &
## "Manhattan", case_when() is used to mutate new column to fix it.
## 
## 3.Only left_join by boro name will create many-to-many warning,
## therefore "id" col is applied to uniquely join elements in two files

## Read Census ACS
Census_ACS_df <- read.csv(file.path(data_dir,'ACSDP5Y2018.DP05_data_with_overlays_2020-04-22T132935.csv'))

colnames(Census_ACS_df) <- Census_ACS_df[1,] # Adjust to the correct colname

## Reference: Separate one column value into multi-columns from https://forum.posit.co/t/separating-data-into-multiple-columns-from-one-column-in-r/77470
colnames(Census_ACS_df) <- make.unique(colnames(Census_ACS_df))
library(tidyverse)
Census_ACS_df <- Census_ACS_df %>%
                separate(`Geographic Area Name`, into = c ('tract field', 'county', 'state'),  sep =", ")

## Warning: Expected 3 pieces. Missing pieces filled with `NA` in 1 rows [1].

## Reference: substr to get the unique id from https://stackoverflow.com/questions/27830976/using-substring-on-a-column-in-r
id_length <- str_length(Census_ACS_df$id)
Census_ACS_df$id <- substr(Census_ACS_df$id, id_length - 5, id_length)


## Reference: Case when to covert census boro to match tract boro data from https://dplyr.tidyverse.org/reference/case_when.html
Census_ACS_df <- Census_ACS_df %>% 
                  mutate (boro_name = case_when(
                    county == 'New York County' ~ 'Manhattan',
                    county == 'Bronx County' ~ 'Bronx',
                    county == 'Queens County' ~ 'Queens',
                    county == 'Kings County' ~ 'Staten Island'
                  ))

## Join the Census ACS dataframe and Tract sf
Census_ACS_Tract_merged <- Tract_sf %>%
  left_join(Census_ACS_df, by = c("boro_name" = "boro_name", "ct2010" = "id"))  


cat('4. left_join result for census with tract (Data is re-selected for readability):')

## 4. left_join result for census with tract (Data is re-selected for readability):

Census_ACS_Tract_merged <- Census_ACS_Tract_merged %>%
                            select(1:10)
glimpse(Census_ACS_Tract_merged)

## Rows: 2,165
## Columns: 11
## $ boro_code  <chr> "5", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1",…
## $ boro_ct201 <chr> "5000900", "1009800", "1010000", "1010200", "1010400", "101…
## $ boro_name  <chr> "Staten Island", "Manhattan", "Manhattan", "Manhattan", "Ma…
## $ cdeligibil <chr> "E", "I", "I", "I", "I", "I", "I", "I", "I", "I", "E", "I",…
## $ ct2010     <chr> "000900", "009800", "010000", "010200", "010400", "011300",…
## $ ctlabel    <chr> "9", "98", "100", "102", "104", "113", "114.02", "130", "14…
## $ ntacode    <chr> "SI22", "MN19", "MN19", "MN17", "MN17", "MN17", "MN40", "MN…
## $ ntaname    <chr> "West New Brighton-New Brighton-St. George", "Turtle Bay-Ea…
## $ puma       <chr> "3903", "3808", "3808", "3807", "3807", "3807", "3805", "38…
## $ shape_area <dbl> 2497009.7, 1906016.4, 1860938.4, 1860992.7, 1864600.4, 1890…
## $ geometry   <MULTIPOLYGON [°]> MULTIPOLYGON (((-74.07921 4..., MULTIPOLYGON (…

Task 5: Aggregate the ACS census data to zip code area

ACS_census_zip_code_sf <- Census_ACS_Tract_merged %>% 
  st_transform(4326) %>%
  st_join(zip_code_sf) %>%
  group_by(ZIPCODE) %>%
  summarize(ACS_number = n()) 

cat('5. st_join result for ACS census with Zipcode:')

## 5. st_join result for ACS census with Zipcode:

glimpse(ACS_census_zip_code_sf)

## Rows: 249
## Columns: 3
## $ ZIPCODE    <chr> "00083", "10001", "10002", "10003", "10004", "10005", "1000…
## $ ACS_number <int> 25, 14, 27, 18, 7, 4, 4, 10, 20, 13, 18, 14, 15, 12, 12, 12…
## $ geometry   <GEOMETRY [°]> POLYGON ((-73.95817 40.8005..., POLYGON ((-73.9932…

GTECH_38520_2025S_Week08_24178392_ZiruiZheng25

Zirui Zheng

2/06/2025

Explanation of the template

R Spatial Lab Assignment # 2

task 0: Import necessary libraries and data path

task 1: Join the COVID-19 data to the NYC zip code area

task 2: Aggregate the NYC food retails store to the zip code data

task 3: Aggregate the NYC Health Facilities to the zip code data

Task 4: Join the Census ACS population, race, and age data to the NYC Planning Census Tract Data

Task 5: Aggregate the ACS census data to zip code area