R Spatial Lab Assignment # 2
task 0: Import necessary libraries and data path
task 1: Join the COVID-19 data to the NYC zip code area
## Read COVID-19 file
covid_19_cases_df <- read.csv(file.path(data_dir,'tests-by-zcta_2021_04_23.csv'))
## Reference: Convert method from https://stackoverflow.com/questions/57711323/convert-dataframe-numeric-column-to-character
covid_19_cases_df$MODIFIED_ZCTA<-as.character(covid_19_cases_df$MODIFIED_ZCTA)
## Convert COVID-19 file to spatial file
covid_19_cases_sf <- covid_19_cases_df %>%
## remove the na values from the list, otherwise sf() cannot work
filter(!is.na(lon) , !is.na(lat))
## Read Zip Code Shapefile
zip_code_sf <- st_read(file.path(data_dir,'ZIP_CODE_040114/ZIP_CODE_040114.shp'))
## Reading layer `ZIP_CODE_040114' from data source
## `/Users/ziruizheng/Desktop/College Class/Now/Data Analysis & Viz/2025_Spring_week 7-10/Section_08/R-Spatial_II_Lab/ZIP_CODE_040114/ZIP_CODE_040114.shp'
## using driver `ESRI Shapefile'
## Simple feature collection with 263 features and 12 fields
## Geometry type: POLYGON
## Dimension: XY
## Bounding box: xmin: 913129 ymin: 120020.9 xmax: 1067494 ymax: 272710.9
## Projected CRS: NAD83 / New York Long Island (ftUS)
zip_code_sf <- st_transform(zip_code_sf, crs = 4326)
long_text <- 'Notice:\n\nI did the assignment in two methods: left join if it is a combo of df & sf. st_join if it is a combo of two sfs. )'
wrapped_text <- strwrap(long_text, width = 70)
cat(wrapped_text, sep = "\n")
## Notice:
##
## I did the assignment in two methods: left join if it is a combo of df
## & sf. st_join if it is a combo of two sfs. )
## Join the Covid-19 dataframe and Zip Code sf in st_join
Covid_zip_code_merged_sf_st_join <- zip_code_sf %>%
left_join(covid_19_cases_df, by = c("ZIPCODE" = "MODIFIED_ZCTA"))
cat('1. left_join result for Covi-19 with Zipcode:')
## 1. left_join result for Covi-19 with Zipcode:
glimpse(Covid_zip_code_merged_sf_st_join)
## Rows: 263
## Columns: 25
## $ ZIPCODE <chr> "11436", "11213", "11212", "11225", "11218", "11226"…
## $ BLDGZIP <chr> "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0…
## $ PO_NAME <chr> "Jamaica", "Brooklyn", "Brooklyn", "Brooklyn", "Broo…
## $ POPULATION <dbl> 18681, 62426, 83866, 56527, 72280, 106132, 92561, 67…
## $ AREA <dbl> 22699295, 29631004, 41972104, 23698630, 36868799, 39…
## $ STATE <chr> "NY", "NY", "NY", "NY", "NY", "NY", "NY", "NY", "NY"…
## $ COUNTY <chr> "Queens", "Kings", "Kings", "Kings", "Kings", "Kings…
## $ ST_FIPS <chr> "36", "36", "36", "36", "36", "36", "36", "36", "36"…
## $ CTY_FIPS <chr> "081", "047", "047", "047", "047", "047", "047", "04…
## $ URL <chr> "http://www.usps.com/", "http://www.usps.com/", "htt…
## $ SHAPE_AREA <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ SHAPE_LEN <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ NEIGHBORHOOD_NAME <chr> "South Jamaica/South Ozone Park", "Crown Heights (Ea…
## $ BOROUGH_GROUP <chr> "Queens", "Brooklyn", "Brooklyn", "Brooklyn", "Brook…
## $ label <chr> "11436", "11213", "11212", "11225", "11218", "11226"…
## $ lat <dbl> 40.67582, 40.67107, 40.66293, 40.66306, 40.64348, 40…
## $ lon <dbl> -73.79662, -73.93633, -73.91301, -73.95423, -73.9760…
## $ COVID_CASE_COUNT <int> 1888, 5166, 7182, 3833, 6199, 7279, 8429, 5380, 1104…
## $ COVID_CASE_RATE <dbl> 9419.96, 7996.75, 9709.74, 6664.50, 8377.49, 7476.75…
## $ POP_DENOMINATOR <dbl> 20042.54, 64601.26, 73966.99, 57513.69, 73995.92, 97…
## $ COVID_DEATH_COUNT <int> 64, 203, 330, 177, 218, 368, 256, 206, 380, 219, 113…
## $ COVID_DEATH_RATE <dbl> 319.32, 314.24, 446.14, 307.75, 294.61, 378.00, 284.…
## $ PERCENT_POSITIVE <dbl> 17.57, 13.72, 15.64, 11.62, 13.93, 13.33, 15.64, 15.…
## $ TOTAL_COVID_TESTS <int> 11082, 38560, 47319, 33709, 45884, 56287, 55444, 350…
## $ geometry <POLYGON [°]> POLYGON ((-73.80585 40.6829..., POLYGON ((-7…
task 2: Aggregate the NYC food retails store to the zip code
data
## Read Food Retail Shapefile
retail_food_sf <- st_read(file.path(data_dir,'nycFoodStore.shp'))
## Reading layer `nycFoodStore' from data source
## `/Users/ziruizheng/Desktop/College Class/Now/Data Analysis & Viz/2025_Spring_week 7-10/Section_08/R-Spatial_II_Lab/nycFoodStore.shp'
## using driver `ESRI Shapefile'
## Simple feature collection with 11300 features and 16 fields
## Geometry type: POINT
## Dimension: XY
## Bounding box: xmin: -74.2484 ymin: 40.50782 xmax: -73.67061 ymax: 40.91008
## Geodetic CRS: WGS 84
retail_food_sf <- st_transform(retail_food_sf, crs = 4326)
## Aggregate food retail and zipcode
Food_retail_zip_code_sf <- retail_food_sf %>%
filter(City == "NEW YORK") %>%
st_join(zip_code_sf) %>%
group_by(ZIPCODE) %>%
summarize(store_number = n())
cat('2. st_join result for Food retail store with Zipcode:')
## 2. st_join result for Food retail store with Zipcode:
glimpse(Food_retail_zip_code_sf)
## Rows: 52
## Columns: 3
## $ ZIPCODE <chr> "10001", "10002", "10003", "10004", "10005", "10006", "10…
## $ store_number <int> 54, 199, 94, 10, 9, 5, 20, 84, 48, 110, 46, 116, 55, 63, …
## $ geometry <MULTIPOINT [°]> MULTIPOINT ((-73.99079 40.7..., MULTIPOINT ((-…
task 3: Aggregate the NYC Health Facilities to the zip code
data
## Read Health Facilities file
health_facility_df <- read.csv(file.path(data_dir,'NYS_Health_Facility.csv'))
## Filter df with non NAs
health_facility_df <- health_facility_df %>%
## remove the na values from the list, otherwise sf() cannot work
filter(!is.na(Facility.Longitude) , !is.na(Facility.Latitude)) %>%
filter(Short.Description == 'NH') %>%
group_by(Facility.Zip.Code) %>%
summarise(number_of_nurse_home = n())
## Aggregate the facility of nurse home with zip code
health_facility_zip_code_merge_df <- zip_code_sf %>%
left_join(health_facility_df, by = c("ZIPCODE" = "Facility.Zip.Code")) %>%
select(ZIPCODE, POPULATION, STATE, COUNTY, number_of_nurse_home, geometry)
cat('3. left_join result for health facility with Zipcode:')
## 3. left_join result for health facility with Zipcode:
glimpse(health_facility_zip_code_merge_df)
## Rows: 263
## Columns: 6
## $ ZIPCODE <chr> "11436", "11213", "11212", "11225", "11218", "112…
## $ POPULATION <dbl> 18681, 62426, 83866, 56527, 72280, 106132, 92561,…
## $ STATE <chr> "NY", "NY", "NY", "NY", "NY", "NY", "NY", "NY", "…
## $ COUNTY <chr> "Queens", "Kings", "Kings", "Kings", "Kings", "Ki…
## $ number_of_nurse_home <int> NA, 1, 1, NA, 1, 3, 3, NA, 1, NA, 3, NA, 2, 5, 8,…
## $ geometry <POLYGON [°]> POLYGON ((-73.80585 40.6829..., POLYGON (…
Task 4: Join the Census ACS population, race, and age data to the
NYC Planning Census Tract Data
## NYC Planning Census Tract
Tract_sf <- st_read(file.path(data_dir,'2010 Census Tracts/geo_export_1dc7b645-647b-4806-b9a0-7b79660f120a.shp'))
## Reading layer `geo_export_1dc7b645-647b-4806-b9a0-7b79660f120a' from data source `/Users/ziruizheng/Desktop/College Class/Now/Data Analysis & Viz/2025_Spring_week 7-10/Section_08/R-Spatial_II_Lab/2010 Census Tracts/geo_export_1dc7b645-647b-4806-b9a0-7b79660f120a.shp'
## using driver `ESRI Shapefile'
## Simple feature collection with 2165 features and 11 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: -74.25559 ymin: 40.49612 xmax: -73.70001 ymax: 40.91553
## Geodetic CRS: WGS84(DD)
long_text <- 'Notice:\n\n Task 4 is a challenging topic the common factors in two Tract and Census data files are unclear in the first place. So I had to standardize the files a little bit in order to use the left_join function. I used "id" and "boro_name" columns to match these two files. Process is specified below:
\n 1.Census file have boro and state glue in one string, the separate() function is used to separate them into individual columns.
\n 2.Census and Tract have different boro names, ex: "New York State" & "Manhattan", case_when() is used to mutate new column to fix it.
\n 3.Only left_join by boro name will create many-to-many warning, therefore "id" col is applied to uniquely join elements in two files'
wrapped_text <- strwrap(long_text, width = 70)
cat(wrapped_text, sep = "\n")
## Notice:
##
## Task 4 is a challenging topic the common factors in two Tract and
## Census data files are unclear in the first place. So I had to
## standardize the files a little bit in order to use the left_join
## function. I used "id" and "boro_name" columns to match these two
## files. Process is specified below:
##
## 1.Census file have boro and state glue in one string, the separate()
## function is used to separate them into individual columns.
##
## 2.Census and Tract have different boro names, ex: "New York State" &
## "Manhattan", case_when() is used to mutate new column to fix it.
##
## 3.Only left_join by boro name will create many-to-many warning,
## therefore "id" col is applied to uniquely join elements in two files
## Read Census ACS
Census_ACS_df <- read.csv(file.path(data_dir,'ACSDP5Y2018.DP05_data_with_overlays_2020-04-22T132935.csv'))
colnames(Census_ACS_df) <- Census_ACS_df[1,] # Adjust to the correct colname
## Reference: Separate one column value into multi-columns from https://forum.posit.co/t/separating-data-into-multiple-columns-from-one-column-in-r/77470
colnames(Census_ACS_df) <- make.unique(colnames(Census_ACS_df))
library(tidyverse)
Census_ACS_df <- Census_ACS_df %>%
separate(`Geographic Area Name`, into = c ('tract field', 'county', 'state'), sep =", ")
## Warning: Expected 3 pieces. Missing pieces filled with `NA` in 1 rows [1].
## Reference: substr to get the unique id from https://stackoverflow.com/questions/27830976/using-substring-on-a-column-in-r
id_length <- str_length(Census_ACS_df$id)
Census_ACS_df$id <- substr(Census_ACS_df$id, id_length - 5, id_length)
## Reference: Case when to covert census boro to match tract boro data from https://dplyr.tidyverse.org/reference/case_when.html
Census_ACS_df <- Census_ACS_df %>%
mutate (boro_name = case_when(
county == 'New York County' ~ 'Manhattan',
county == 'Bronx County' ~ 'Bronx',
county == 'Queens County' ~ 'Queens',
county == 'Kings County' ~ 'Staten Island'
))
## Join the Census ACS dataframe and Tract sf
Census_ACS_Tract_merged <- Tract_sf %>%
left_join(Census_ACS_df, by = c("boro_name" = "boro_name", "ct2010" = "id"))
cat('4. left_join result for census with tract (Data is re-selected for readability):')
## 4. left_join result for census with tract (Data is re-selected for readability):
Census_ACS_Tract_merged <- Census_ACS_Tract_merged %>%
select(1:10)
glimpse(Census_ACS_Tract_merged)
## Rows: 2,165
## Columns: 11
## $ boro_code <chr> "5", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1",…
## $ boro_ct201 <chr> "5000900", "1009800", "1010000", "1010200", "1010400", "101…
## $ boro_name <chr> "Staten Island", "Manhattan", "Manhattan", "Manhattan", "Ma…
## $ cdeligibil <chr> "E", "I", "I", "I", "I", "I", "I", "I", "I", "I", "E", "I",…
## $ ct2010 <chr> "000900", "009800", "010000", "010200", "010400", "011300",…
## $ ctlabel <chr> "9", "98", "100", "102", "104", "113", "114.02", "130", "14…
## $ ntacode <chr> "SI22", "MN19", "MN19", "MN17", "MN17", "MN17", "MN40", "MN…
## $ ntaname <chr> "West New Brighton-New Brighton-St. George", "Turtle Bay-Ea…
## $ puma <chr> "3903", "3808", "3808", "3807", "3807", "3807", "3805", "38…
## $ shape_area <dbl> 2497009.7, 1906016.4, 1860938.4, 1860992.7, 1864600.4, 1890…
## $ geometry <MULTIPOLYGON [°]> MULTIPOLYGON (((-74.07921 4..., MULTIPOLYGON (…
Task 5: Aggregate the ACS census data to zip code area
ACS_census_zip_code_sf <- Census_ACS_Tract_merged %>%
st_transform(4326) %>%
st_join(zip_code_sf) %>%
group_by(ZIPCODE) %>%
summarize(ACS_number = n())
cat('5. st_join result for ACS census with Zipcode:')
## 5. st_join result for ACS census with Zipcode:
glimpse(ACS_census_zip_code_sf)
## Rows: 249
## Columns: 3
## $ ZIPCODE <chr> "00083", "10001", "10002", "10003", "10004", "10005", "1000…
## $ ACS_number <int> 25, 14, 27, 18, 7, 4, 4, 10, 20, 13, 18, 14, 15, 12, 12, 12…
## $ geometry <GEOMETRY [°]> POLYGON ((-73.95817 40.8005..., POLYGON ((-73.9932…