Loading the libraries

library(here)
## here() starts at /run/media/gonzalo/Windows-SSD/Users/mhgon/Documents/Academia/Posgrado/Doctorado/Clases/4. Semestre/Data analysis
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.6     ✓ dplyr   1.0.8
## ✓ tidyr   1.2.0     ✓ stringr 1.4.0
## ✓ readr   2.1.2     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(sf)
## Linking to GEOS 3.9.1, GDAL 3.4.0, PROJ 8.2.0; sf_use_s2() is TRUE


Task 1: Join the COVID-19 data to the NYC zip code area —————-

sf_zip <- st_read(dsn = here("Module 9/Data/ZIP_CODE_040114.shp")) %>% 
  janitor::clean_names()
## Reading layer `ZIP_CODE_040114' from data source 
##   `/run/media/gonzalo/Windows-SSD/Users/mhgon/Documents/Academia/Posgrado/Doctorado/Clases/4. Semestre/Data analysis/Module 9/Data/ZIP_CODE_040114.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 263 features and 12 fields
## Geometry type: POLYGON
## Dimension:     XY
## Bounding box:  xmin: 913129 ymin: 120020.9 xmax: 1067494 ymax: 272710.9
## Projected CRS: NAD83 / New York Long Island (ftUS)
sf_zip_2831 <- st_transform(sf_zip, 2831)

d_covid <- read_csv(file = here("Module 10/Data/tests-by-zcta_2021_04_23.csv")) %>% 
  janitor::clean_names()
## Rows: 177 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (3): NEIGHBORHOOD_NAME, BOROUGH_GROUP, label
## dbl (10): MODIFIED_ZCTA, lat, lon, COVID_CASE_COUNT, COVID_CASE_RATE, POP_DE...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
sf_zip_2831$zipcode <- as.numeric(sf_zip_2831$zipcode)

## Joining

j_zip_covid <- inner_join(x = sf_zip_2831, y =  d_covid,
                       by = c("zipcode" = "modified_zcta"))

head(j_zip_covid)
## Simple feature collection with 6 features and 24 fields
## Geometry type: POLYGON
## Dimension:     XY
## Bounding box:  xmin: 300682.9 ymin: 51483.11 xmax: 317920 ymax: 57723.1
## Projected CRS: NAD83(HARN) / New York Long Island
##   zipcode bldgzip  po_name population     area state county st_fips cty_fips
## 1   11436       0  Jamaica      18681 22699295    NY Queens      36      081
## 2   11213       0 Brooklyn      62426 29631004    NY  Kings      36      047
## 3   11212       0 Brooklyn      83866 41972104    NY  Kings      36      047
## 4   11225       0 Brooklyn      56527 23698630    NY  Kings      36      047
## 5   11218       0 Brooklyn      72280 36868799    NY  Kings      36      047
## 6   11226       0 Brooklyn     106132 39408598    NY  Kings      36      047
##                    url shape_area shape_len
## 1 http://www.usps.com/          0         0
## 2 http://www.usps.com/          0         0
## 3 http://www.usps.com/          0         0
## 4 http://www.usps.com/          0         0
## 5 http://www.usps.com/          0         0
## 6 http://www.usps.com/          0         0
##                                neighborhood_name borough_group label      lat
## 1                 South Jamaica/South Ozone Park        Queens 11436 40.67582
## 2                           Crown Heights (East)      Brooklyn 11213 40.67107
## 3                         Ocean Hill-Brownsville      Brooklyn 11212 40.66293
## 4 Crown Heights (West)/Prospect Lefferts Gardens      Brooklyn 11225 40.66306
## 5                     Kensington/Windsor Terrace      Brooklyn 11218 40.64348
## 6             Flatbush/Prospect Lefferts Gardens      Brooklyn 11226 40.64646
##         lon covid_case_count covid_case_rate pop_denominator covid_death_count
## 1 -73.79662             1888         9419.96        20042.54                64
## 2 -73.93633             5166         7996.75        64601.26               203
## 3 -73.91301             7182         9709.74        73966.99               330
## 4 -73.95423             3833         6664.50        57513.69               177
## 5 -73.97604             6199         8377.49        73995.92               218
## 6 -73.95665             7279         7476.75        97355.08               368
##   covid_death_rate percent_positive total_covid_tests
## 1           319.32            17.57             11082
## 2           314.24            13.72             38560
## 3           446.14            15.64             47319
## 4           307.75            11.62             33709
## 5           294.61            13.93             45884
## 6           378.00            13.33             56287
##                         geometry
## 1 POLYGON ((316413.1 57343.78...
## 2 POLYGON ((305292.6 56974.38...
## 3 POLYGON ((308206.6 55989.84...
## 4 POLYGON ((303553.6 55965.85...
## 5 POLYGON ((302361.4 53737.72...
## 6 POLYGON ((303222.3 54212.67...


Task 2: Aggregate retail stores —————————————–

Aggregate the NYC food retails store data (points) to the zip code data, so that we know how many retail stores in each zip code area. Note that not all locations are for food retail. And we need to choose the specific types according to the data.

sf_food <- st_read(dsn = here("Module 9/Data/week_11_hmk.gpkg"), 
                   layer = 'retail_stores')
## Reading layer `retail_stores' from data source 
##   `/run/media/gonzalo/Windows-SSD/Users/mhgon/Documents/Academia/Posgrado/Doctorado/Clases/4. Semestre/Data analysis/Module 9/Data/week_11_hmk.gpkg' 
##   using driver `GPKG'
## Simple feature collection with 23974 features and 13 fields
## Geometry type: POINT
## Dimension:     XY
## Bounding box:  xmin: -79.75953 ymin: 40.50782 xmax: -71.93873 ymax: 44.99484
## Geodetic CRS:  WGS 84
sf_food_2831 <-st_transform(x = sf_food, crs = 2831)

sf_zip_food <-  st_join(x = sf_zip_2831, y = sf_food_2831,
                          join = st_contains)

sf_zip_food %>% 
  select(-url, -license_number, -state.y, -zip_code,
         -street_name, -street_number, -shape_len,
         -st_fips, -cty_fips) %>% 
mutate(shape_area = st_area(geometry)) -> sf_zip_food

head(sf_zip_food)
## Simple feature collection with 6 features and 16 fields
## Geometry type: POLYGON
## Dimension:     XY
## Bounding box:  xmin: 304391.5 ymin: 55137.87 xmax: 317920 ymax: 57723.1
## Projected CRS: NAD83(HARN) / New York Long Island
##     zipcode bldgzip  po_name population     area state.x county    shape_area
## 1     11436       0  Jamaica      18681 22699295      NY Queens 2108841 [m^2]
## 1.1   11436       0  Jamaica      18681 22699295      NY Queens 2108841 [m^2]
## 1.2   11436       0  Jamaica      18681 22699295      NY Queens 2108841 [m^2]
## 2     11213       0 Brooklyn      62426 29631004      NY  Kings 2752820 [m^2]
## 2.1   11213       0 Brooklyn      62426 29631004      NY  Kings 2752820 [m^2]
## 2.2   11213       0 Brooklyn      62426 29631004      NY  Kings 2752820 [m^2]
##     operation_type establishment_type                       entity_name
## 1            Store                JAC           142 BEST SUPER DELI INC
## 1.1          Store                JAC                 ALSAMET FOOD CORP
## 1.2          Store                JAC                   HUQAIS SHUAIB M
## 2            Store                JAC 108 DELI GROCERY & COFFEE BAR COR
## 2.1          Store                JAC             188 UTICA GROCERY INC
## 2.2          Store                JAC              235 GOURMET DELI INC
##                    dba_name     city square_footage      lat      long
## 1       142 BEST SUPER DELI  JAMAICA              0 40.67410 -73.79860
## 1.1            ALSAMET FOOD  JAMAICA           2000 40.67341 -73.79064
## 1.2 KING OF ROCKAWAY DELI &  JAMAICA           2000 40.67405 -73.79814
## 2   108 DELI GROCERY & COFF BROOKLYN            800 40.67679 -73.93872
## 2.1       188 UTICA GROCERY BROOKLYN           1800 40.67250 -73.93077
## 2.2        235 GOURMET DELI BROOKLYN            625 40.67089 -73.93371
##                           geometry
## 1   POLYGON ((316413.1 57343.78...
## 1.1 POLYGON ((316413.1 57343.78...
## 1.2 POLYGON ((316413.1 57343.78...
## 2   POLYGON ((305292.6 56974.38...
## 2.1 POLYGON ((305292.6 56974.38...
## 2.2 POLYGON ((305292.6 56974.38...


Filtered only food stores based on their name, and then summarizing by zipcode


pattern_food <- c("FOOD", "DELI", "GROCERY", "MEAT", 
                  "BUTCHER", "FISH", "FRUIT")

matching_food <- str_c(pattern_food, collapse = "|")

v_retail <- str_subset(sf_zip_food$entity_name, matching_food)

sf_retail <- dplyr::filter(.data = sf_zip_food,
                           entity_name %in% v_retail)

sf_sum_zf <- sf_zip_food %>% 
  group_by(zipcode) %>%
  summarise(n_stores = n(),
            zip_area = max(shape_area),
            dens_store = n_stores/zip_area * 1e6)

head(sf_sum_zf) 
## Simple feature collection with 6 features and 4 fields
## Geometry type: GEOMETRY
## Dimension:     XY
## Bounding box:  xmin: 296001.9 ymin: 57437.93 xmax: 304285.5 ymax: 70390.5
## Projected CRS: NAD83(HARN) / New York Long Island
## # A tibble: 6 × 5
##   zipcode n_stores zip_area dens_store                                  geometry
##     <dbl>    <int>    [m^2]    [1/m^2]                            <GEOMETRY [m]>
## 1      83        1 3558291.      0.281 POLYGON ((304285.5 69986.39, 304277.3 69…
## 2   10001       54 1653210.     32.7   POLYGON ((299301.7 65063.18, 299308.3 65…
## 3   10002      199 2441513.     81.5   POLYGON ((302161.1 63268.61, 302158.2 63…
## 4   10003       94 1443568.     65.1   POLYGON ((301701 63107.48, 301666.3 6304…
## 5   10004       13  713462.     18.2   MULTIPOLYGON (((297937 57439.28, 297925.…
## 6   10005        9  193509.     46.5   POLYGON ((299495.9 59703.66, 299493.3 59…



Task 3: Aggregate the NYC health facilities to the zip code data ——–

sf_health <- st_read(dsn = here("Module 9/Data/week_11_hmk.gpkg"), 
                     layer='health_facilities')
## Reading layer `health_facilities' from data source 
##   `/run/media/gonzalo/Windows-SSD/Users/mhgon/Documents/Academia/Posgrado/Doctorado/Clases/4. Semestre/Data analysis/Module 9/Data/week_11_hmk.gpkg' 
##   using driver `GPKG'
## Simple feature collection with 3843 features and 36 fields
## Geometry type: POINT
## Dimension:     XY
## Bounding box:  xmin: -79.6299 ymin: 40.51677 xmax: -72.17 ymax: 44.97849
## Geodetic CRS:  WGS 84
sf_health_2831 <- st_transform(x = sf_health, crs = 2831)

sf_zip_health <- st_join(x = sf_zip_2831, sf_health_2831,
                         join = st_contains)

sf_zip_health %>% 
  select(zipcode, population, county, facility_name,
         description, operator_name, ownership_type,
         facility_longitude, facility_latitude) %>% 
  rename(long = facility_longitude, 
         lat = facility_latitude) %>% 
  mutate(zip_area = st_area(geometry)) -> sf_zip_health

pattern_health <- c("nursing", "health", "hospital", "clinic", 
                  "rehabilitation", "center", "therapy", "dental")

matching_health <- str_c(pattern_health, collapse = "|")

sf_zip_health$facility_name <- str_to_lower(string = sf_zip_health$facility_name)

v_health <- str_subset(sf_zip_health$facility_name, matching_health)
  
sf_health <- dplyr::filter(.data = sf_zip_health, 
                    facility_name %in% v_health)
sf_health %>% 
  group_by(zipcode) %>% 
  summarise(n_fac = n(),
            zip_area = max(zip_area),
            dens_store = n_fac/zip_area * 1e6) -> sf_sum_zh

head(sf_sum_zh)
## Simple feature collection with 6 features and 4 fields
## Geometry type: POLYGON
## Dimension:     XY
## Bounding box:  xmin: 298505.2 ymin: 59194.22 xmax: 302415 ymax: 65630.25
## Projected CRS: NAD83(HARN) / New York Long Island
## # A tibble: 6 × 5
##   zipcode n_fac zip_area dens_store                                     geometry
##     <dbl> <int>    [m^2]    [1/m^2]                                <POLYGON [m]>
## 1   10001     6 1653210.       3.63 ((299301.7 65063.18, 299308.3 65086.04, 299…
## 2   10002    12 2441513.       4.91 ((302161.1 63268.61, 302158.2 63257.84, 302…
## 3   10003     6 1443568.       4.16 ((301701 63107.48, 301666.3 63044.81, 30162…
## 4   10004     1  371848.       2.69 ((299069.7 59914.07, 299157.4 59870.7, 2991…
## 5   10007     1  494977.       2.02 ((298869 61159.92, 298910.7 61132.43, 29893…
## 6   10009     4 1477491.       2.71 ((302242.3 62633.83, 302246.5 62629.11, 302…


Task 4: Joining Census ACS ————————————————

Join the Census ACS population, race, and age data to the NYC Planning Census Tract Data. Aggregate the ACS census data to zip code area data.

d_census <- read_csv(file = here("Module 10", "Data",
                                 "ACSDP5Y2018.DP05_data_with_overlays_2020-04-22T132935.csv")) %>% 
  slice(-1) %>% 
  janitor::clean_names()  
  
sf_census <- st_read(dsn = here("Module 10", "Data",
                                "geo_export_1dc7b645-647b-4806-b9a0-7b79660f120a.shp")) %>% 
  janitor::clean_names() %>% 
  select(-cdeligibil, -puma, -shape_area)
## Reading layer `geo_export_1dc7b645-647b-4806-b9a0-7b79660f120a' from data source `/run/media/gonzalo/Windows-SSD/Users/mhgon/Documents/Academia/Posgrado/Doctorado/Clases/4. Semestre/Data analysis/Module 10/Data/geo_export_1dc7b645-647b-4806-b9a0-7b79660f120a.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 2165 features and 11 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: -74.25559 ymin: 40.49612 xmax: -73.70001 ymax: 40.91553
## Geodetic CRS:  WGS84(DD)
# To join the data sets I used the GEO_ID and the census track code. However, I first
# extracted the code from the string.

d_census <- d_census %>% 
  mutate(geo_id = str_split_fixed(string = geo_id, pattern = "S",
                                  n = 2)[,2]) 

# This did not work since the census track code is different in the ACS data.
# So I decided to join the data using the county name and the census track number.

d_acs <- d_census %>% 
  separate(col = name, into = c("ct_code", "county", "state"),
           sep = ",", remove = T) %>% 
  mutate(ct_label = gsub("[^0-9.]", "",  ct_code),
         county = str_remove(string = county, pattern = "County"),
         county = str_trim(string = county, side = "both")) %>% 
  mutate(across(.cols = starts_with(match = "dp"),
                .fns = as.numeric)) %>%
  select(-state)  

d_acs$county <- as_factor(d_acs$county)

d_acs$county <- forcats::fct_recode(.f = d_acs$county, 
                    Bronx = "Bronx",
                    Brooklyn = "Kings",
                    Manhattan = "New York",
                    Queens = "Queens",
                    `Staten Island` = "Richmond")

sf_j_census <- dplyr::inner_join(x = sf_census, y = d_acs, 
                                 by = c("boro_name" = "county",
                                        "ctlabel" = "ct_label")) 


Task 5: Aggregating census data by zipcode ——————————

Equalizing the CRS of census and zipcode data.

sf_j_census <- st_transform(x = sf_j_census, crs = 2831)


sf_zip_2831 <- select(.data = sf_zip_2831,
                      -state, -st_fips, -url,
                      -shape_area, -shape_len)

sf_j_zc <- st_join(x = sf_zip_2831, y = sf_j_census,
                   join = st_contains)


sf_j_zc %>% 
  mutate(zip_area = st_area(geometry)) %>% 
  group_by(zipcode)  -> sf_sum_zc


head(sf_sum_zc)
## Simple feature collection with 6 features and 374 fields
## Geometry type: POLYGON
## Dimension:     XY
## Bounding box:  xmin: 304391.5 ymin: 55137.87 xmax: 317920 ymax: 57723.1
## Projected CRS: NAD83(HARN) / New York Long Island
## # A tibble: 6 × 375
## # Groups:   zipcode [2]
##   zipcode bldgzip po_name population   area county cty_fips boro_code boro_ct201
##     <dbl> <chr>   <chr>        <dbl>  <dbl> <chr>  <chr>    <chr>     <chr>     
## 1   11436 0       Jamaica      18681 2.27e7 Queens 081      4         4018800   
## 2   11436 0       Jamaica      18681 2.27e7 Queens 081      4         4018200   
## 3   11436 0       Jamaica      18681 2.27e7 Queens 081      4         4018600   
## 4   11436 0       Jamaica      18681 2.27e7 Queens 081      4         4079000   
## 5   11213 0       Brookl…      62426 2.96e7 Kings  047      3         3034900   
## 6   11213 0       Brookl…      62426 2.96e7 Kings  047      3         3035100   
## # … with 366 more variables: boro_name <chr>, ct2010 <chr>, ctlabel <chr>,
## #   ntacode <chr>, ntaname <chr>, shape_leng <dbl>, geo_id <chr>,
## #   ct_code <chr>, dp05_0031pm <dbl>, dp05_0032e <dbl>, dp05_0032m <dbl>,
## #   dp05_0032pe <dbl>, dp05_0032pm <dbl>, dp05_0033e <dbl>, dp05_0033m <dbl>,
## #   dp05_0033pe <dbl>, dp05_0033pm <dbl>, dp05_0034e <dbl>, dp05_0034m <dbl>,
## #   dp05_0034pe <dbl>, dp05_0034pm <dbl>, dp05_0035e <dbl>, dp05_0035m <dbl>,
## #   dp05_0035pe <dbl>, dp05_0035pm <dbl>, dp05_0036e <dbl>, dp05_0036m <dbl>, …