library(here)
## here() starts at /run/media/gonzalo/Windows-SSD/Users/mhgon/Documents/Academia/Posgrado/Doctorado/Clases/4. Semestre/Data analysis
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.6 ✓ dplyr 1.0.8
## ✓ tidyr 1.2.0 ✓ stringr 1.4.0
## ✓ readr 2.1.2 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(sf)
## Linking to GEOS 3.9.1, GDAL 3.4.0, PROJ 8.2.0; sf_use_s2() is TRUE
sf_zip <- st_read(dsn = here("Module 9/Data/ZIP_CODE_040114.shp")) %>%
janitor::clean_names()
## Reading layer `ZIP_CODE_040114' from data source
## `/run/media/gonzalo/Windows-SSD/Users/mhgon/Documents/Academia/Posgrado/Doctorado/Clases/4. Semestre/Data analysis/Module 9/Data/ZIP_CODE_040114.shp'
## using driver `ESRI Shapefile'
## Simple feature collection with 263 features and 12 fields
## Geometry type: POLYGON
## Dimension: XY
## Bounding box: xmin: 913129 ymin: 120020.9 xmax: 1067494 ymax: 272710.9
## Projected CRS: NAD83 / New York Long Island (ftUS)
sf_zip_2831 <- st_transform(sf_zip, 2831)
d_covid <- read_csv(file = here("Module 10/Data/tests-by-zcta_2021_04_23.csv")) %>%
janitor::clean_names()
## Rows: 177 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): NEIGHBORHOOD_NAME, BOROUGH_GROUP, label
## dbl (10): MODIFIED_ZCTA, lat, lon, COVID_CASE_COUNT, COVID_CASE_RATE, POP_DE...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
sf_zip_2831$zipcode <- as.numeric(sf_zip_2831$zipcode)
## Joining
j_zip_covid <- inner_join(x = sf_zip_2831, y = d_covid,
by = c("zipcode" = "modified_zcta"))
head(j_zip_covid)
## Simple feature collection with 6 features and 24 fields
## Geometry type: POLYGON
## Dimension: XY
## Bounding box: xmin: 300682.9 ymin: 51483.11 xmax: 317920 ymax: 57723.1
## Projected CRS: NAD83(HARN) / New York Long Island
## zipcode bldgzip po_name population area state county st_fips cty_fips
## 1 11436 0 Jamaica 18681 22699295 NY Queens 36 081
## 2 11213 0 Brooklyn 62426 29631004 NY Kings 36 047
## 3 11212 0 Brooklyn 83866 41972104 NY Kings 36 047
## 4 11225 0 Brooklyn 56527 23698630 NY Kings 36 047
## 5 11218 0 Brooklyn 72280 36868799 NY Kings 36 047
## 6 11226 0 Brooklyn 106132 39408598 NY Kings 36 047
## url shape_area shape_len
## 1 http://www.usps.com/ 0 0
## 2 http://www.usps.com/ 0 0
## 3 http://www.usps.com/ 0 0
## 4 http://www.usps.com/ 0 0
## 5 http://www.usps.com/ 0 0
## 6 http://www.usps.com/ 0 0
## neighborhood_name borough_group label lat
## 1 South Jamaica/South Ozone Park Queens 11436 40.67582
## 2 Crown Heights (East) Brooklyn 11213 40.67107
## 3 Ocean Hill-Brownsville Brooklyn 11212 40.66293
## 4 Crown Heights (West)/Prospect Lefferts Gardens Brooklyn 11225 40.66306
## 5 Kensington/Windsor Terrace Brooklyn 11218 40.64348
## 6 Flatbush/Prospect Lefferts Gardens Brooklyn 11226 40.64646
## lon covid_case_count covid_case_rate pop_denominator covid_death_count
## 1 -73.79662 1888 9419.96 20042.54 64
## 2 -73.93633 5166 7996.75 64601.26 203
## 3 -73.91301 7182 9709.74 73966.99 330
## 4 -73.95423 3833 6664.50 57513.69 177
## 5 -73.97604 6199 8377.49 73995.92 218
## 6 -73.95665 7279 7476.75 97355.08 368
## covid_death_rate percent_positive total_covid_tests
## 1 319.32 17.57 11082
## 2 314.24 13.72 38560
## 3 446.14 15.64 47319
## 4 307.75 11.62 33709
## 5 294.61 13.93 45884
## 6 378.00 13.33 56287
## geometry
## 1 POLYGON ((316413.1 57343.78...
## 2 POLYGON ((305292.6 56974.38...
## 3 POLYGON ((308206.6 55989.84...
## 4 POLYGON ((303553.6 55965.85...
## 5 POLYGON ((302361.4 53737.72...
## 6 POLYGON ((303222.3 54212.67...
Aggregate the NYC food retails store data (points) to the zip code data, so that we know how many retail stores in each zip code area. Note that not all locations are for food retail. And we need to choose the specific types according to the data.
sf_food <- st_read(dsn = here("Module 9/Data/week_11_hmk.gpkg"),
layer = 'retail_stores')
## Reading layer `retail_stores' from data source
## `/run/media/gonzalo/Windows-SSD/Users/mhgon/Documents/Academia/Posgrado/Doctorado/Clases/4. Semestre/Data analysis/Module 9/Data/week_11_hmk.gpkg'
## using driver `GPKG'
## Simple feature collection with 23974 features and 13 fields
## Geometry type: POINT
## Dimension: XY
## Bounding box: xmin: -79.75953 ymin: 40.50782 xmax: -71.93873 ymax: 44.99484
## Geodetic CRS: WGS 84
sf_food_2831 <-st_transform(x = sf_food, crs = 2831)
sf_zip_food <- st_join(x = sf_zip_2831, y = sf_food_2831,
join = st_contains)
sf_zip_food %>%
select(-url, -license_number, -state.y, -zip_code,
-street_name, -street_number, -shape_len,
-st_fips, -cty_fips) %>%
mutate(shape_area = st_area(geometry)) -> sf_zip_food
head(sf_zip_food)
## Simple feature collection with 6 features and 16 fields
## Geometry type: POLYGON
## Dimension: XY
## Bounding box: xmin: 304391.5 ymin: 55137.87 xmax: 317920 ymax: 57723.1
## Projected CRS: NAD83(HARN) / New York Long Island
## zipcode bldgzip po_name population area state.x county shape_area
## 1 11436 0 Jamaica 18681 22699295 NY Queens 2108841 [m^2]
## 1.1 11436 0 Jamaica 18681 22699295 NY Queens 2108841 [m^2]
## 1.2 11436 0 Jamaica 18681 22699295 NY Queens 2108841 [m^2]
## 2 11213 0 Brooklyn 62426 29631004 NY Kings 2752820 [m^2]
## 2.1 11213 0 Brooklyn 62426 29631004 NY Kings 2752820 [m^2]
## 2.2 11213 0 Brooklyn 62426 29631004 NY Kings 2752820 [m^2]
## operation_type establishment_type entity_name
## 1 Store JAC 142 BEST SUPER DELI INC
## 1.1 Store JAC ALSAMET FOOD CORP
## 1.2 Store JAC HUQAIS SHUAIB M
## 2 Store JAC 108 DELI GROCERY & COFFEE BAR COR
## 2.1 Store JAC 188 UTICA GROCERY INC
## 2.2 Store JAC 235 GOURMET DELI INC
## dba_name city square_footage lat long
## 1 142 BEST SUPER DELI JAMAICA 0 40.67410 -73.79860
## 1.1 ALSAMET FOOD JAMAICA 2000 40.67341 -73.79064
## 1.2 KING OF ROCKAWAY DELI & JAMAICA 2000 40.67405 -73.79814
## 2 108 DELI GROCERY & COFF BROOKLYN 800 40.67679 -73.93872
## 2.1 188 UTICA GROCERY BROOKLYN 1800 40.67250 -73.93077
## 2.2 235 GOURMET DELI BROOKLYN 625 40.67089 -73.93371
## geometry
## 1 POLYGON ((316413.1 57343.78...
## 1.1 POLYGON ((316413.1 57343.78...
## 1.2 POLYGON ((316413.1 57343.78...
## 2 POLYGON ((305292.6 56974.38...
## 2.1 POLYGON ((305292.6 56974.38...
## 2.2 POLYGON ((305292.6 56974.38...
Filtered only food stores based on their name, and then summarizing by zipcode
pattern_food <- c("FOOD", "DELI", "GROCERY", "MEAT",
"BUTCHER", "FISH", "FRUIT")
matching_food <- str_c(pattern_food, collapse = "|")
v_retail <- str_subset(sf_zip_food$entity_name, matching_food)
sf_retail <- dplyr::filter(.data = sf_zip_food,
entity_name %in% v_retail)
sf_sum_zf <- sf_zip_food %>%
group_by(zipcode) %>%
summarise(n_stores = n(),
zip_area = max(shape_area),
dens_store = n_stores/zip_area * 1e6)
head(sf_sum_zf)
## Simple feature collection with 6 features and 4 fields
## Geometry type: GEOMETRY
## Dimension: XY
## Bounding box: xmin: 296001.9 ymin: 57437.93 xmax: 304285.5 ymax: 70390.5
## Projected CRS: NAD83(HARN) / New York Long Island
## # A tibble: 6 × 5
## zipcode n_stores zip_area dens_store geometry
## <dbl> <int> [m^2] [1/m^2] <GEOMETRY [m]>
## 1 83 1 3558291. 0.281 POLYGON ((304285.5 69986.39, 304277.3 69…
## 2 10001 54 1653210. 32.7 POLYGON ((299301.7 65063.18, 299308.3 65…
## 3 10002 199 2441513. 81.5 POLYGON ((302161.1 63268.61, 302158.2 63…
## 4 10003 94 1443568. 65.1 POLYGON ((301701 63107.48, 301666.3 6304…
## 5 10004 13 713462. 18.2 MULTIPOLYGON (((297937 57439.28, 297925.…
## 6 10005 9 193509. 46.5 POLYGON ((299495.9 59703.66, 299493.3 59…
sf_health <- st_read(dsn = here("Module 9/Data/week_11_hmk.gpkg"),
layer='health_facilities')
## Reading layer `health_facilities' from data source
## `/run/media/gonzalo/Windows-SSD/Users/mhgon/Documents/Academia/Posgrado/Doctorado/Clases/4. Semestre/Data analysis/Module 9/Data/week_11_hmk.gpkg'
## using driver `GPKG'
## Simple feature collection with 3843 features and 36 fields
## Geometry type: POINT
## Dimension: XY
## Bounding box: xmin: -79.6299 ymin: 40.51677 xmax: -72.17 ymax: 44.97849
## Geodetic CRS: WGS 84
sf_health_2831 <- st_transform(x = sf_health, crs = 2831)
sf_zip_health <- st_join(x = sf_zip_2831, sf_health_2831,
join = st_contains)
sf_zip_health %>%
select(zipcode, population, county, facility_name,
description, operator_name, ownership_type,
facility_longitude, facility_latitude) %>%
rename(long = facility_longitude,
lat = facility_latitude) %>%
mutate(zip_area = st_area(geometry)) -> sf_zip_health
pattern_health <- c("nursing", "health", "hospital", "clinic",
"rehabilitation", "center", "therapy", "dental")
matching_health <- str_c(pattern_health, collapse = "|")
sf_zip_health$facility_name <- str_to_lower(string = sf_zip_health$facility_name)
v_health <- str_subset(sf_zip_health$facility_name, matching_health)
sf_health <- dplyr::filter(.data = sf_zip_health,
facility_name %in% v_health)
sf_health %>%
group_by(zipcode) %>%
summarise(n_fac = n(),
zip_area = max(zip_area),
dens_store = n_fac/zip_area * 1e6) -> sf_sum_zh
head(sf_sum_zh)
## Simple feature collection with 6 features and 4 fields
## Geometry type: POLYGON
## Dimension: XY
## Bounding box: xmin: 298505.2 ymin: 59194.22 xmax: 302415 ymax: 65630.25
## Projected CRS: NAD83(HARN) / New York Long Island
## # A tibble: 6 × 5
## zipcode n_fac zip_area dens_store geometry
## <dbl> <int> [m^2] [1/m^2] <POLYGON [m]>
## 1 10001 6 1653210. 3.63 ((299301.7 65063.18, 299308.3 65086.04, 299…
## 2 10002 12 2441513. 4.91 ((302161.1 63268.61, 302158.2 63257.84, 302…
## 3 10003 6 1443568. 4.16 ((301701 63107.48, 301666.3 63044.81, 30162…
## 4 10004 1 371848. 2.69 ((299069.7 59914.07, 299157.4 59870.7, 2991…
## 5 10007 1 494977. 2.02 ((298869 61159.92, 298910.7 61132.43, 29893…
## 6 10009 4 1477491. 2.71 ((302242.3 62633.83, 302246.5 62629.11, 302…
Join the Census ACS population, race, and age data to the NYC Planning Census Tract Data. Aggregate the ACS census data to zip code area data.
d_census <- read_csv(file = here("Module 10", "Data",
"ACSDP5Y2018.DP05_data_with_overlays_2020-04-22T132935.csv")) %>%
slice(-1) %>%
janitor::clean_names()
sf_census <- st_read(dsn = here("Module 10", "Data",
"geo_export_1dc7b645-647b-4806-b9a0-7b79660f120a.shp")) %>%
janitor::clean_names() %>%
select(-cdeligibil, -puma, -shape_area)
## Reading layer `geo_export_1dc7b645-647b-4806-b9a0-7b79660f120a' from data source `/run/media/gonzalo/Windows-SSD/Users/mhgon/Documents/Academia/Posgrado/Doctorado/Clases/4. Semestre/Data analysis/Module 10/Data/geo_export_1dc7b645-647b-4806-b9a0-7b79660f120a.shp'
## using driver `ESRI Shapefile'
## Simple feature collection with 2165 features and 11 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: -74.25559 ymin: 40.49612 xmax: -73.70001 ymax: 40.91553
## Geodetic CRS: WGS84(DD)
# To join the data sets I used the GEO_ID and the census track code. However, I first
# extracted the code from the string.
d_census <- d_census %>%
mutate(geo_id = str_split_fixed(string = geo_id, pattern = "S",
n = 2)[,2])
# This did not work since the census track code is different in the ACS data.
# So I decided to join the data using the county name and the census track number.
d_acs <- d_census %>%
separate(col = name, into = c("ct_code", "county", "state"),
sep = ",", remove = T) %>%
mutate(ct_label = gsub("[^0-9.]", "", ct_code),
county = str_remove(string = county, pattern = "County"),
county = str_trim(string = county, side = "both")) %>%
mutate(across(.cols = starts_with(match = "dp"),
.fns = as.numeric)) %>%
select(-state)
d_acs$county <- as_factor(d_acs$county)
d_acs$county <- forcats::fct_recode(.f = d_acs$county,
Bronx = "Bronx",
Brooklyn = "Kings",
Manhattan = "New York",
Queens = "Queens",
`Staten Island` = "Richmond")
sf_j_census <- dplyr::inner_join(x = sf_census, y = d_acs,
by = c("boro_name" = "county",
"ctlabel" = "ct_label"))
Equalizing the CRS of census and zipcode data.
sf_j_census <- st_transform(x = sf_j_census, crs = 2831)
sf_zip_2831 <- select(.data = sf_zip_2831,
-state, -st_fips, -url,
-shape_area, -shape_len)
sf_j_zc <- st_join(x = sf_zip_2831, y = sf_j_census,
join = st_contains)
sf_j_zc %>%
mutate(zip_area = st_area(geometry)) %>%
group_by(zipcode) -> sf_sum_zc
head(sf_sum_zc)
## Simple feature collection with 6 features and 374 fields
## Geometry type: POLYGON
## Dimension: XY
## Bounding box: xmin: 304391.5 ymin: 55137.87 xmax: 317920 ymax: 57723.1
## Projected CRS: NAD83(HARN) / New York Long Island
## # A tibble: 6 × 375
## # Groups: zipcode [2]
## zipcode bldgzip po_name population area county cty_fips boro_code boro_ct201
## <dbl> <chr> <chr> <dbl> <dbl> <chr> <chr> <chr> <chr>
## 1 11436 0 Jamaica 18681 2.27e7 Queens 081 4 4018800
## 2 11436 0 Jamaica 18681 2.27e7 Queens 081 4 4018200
## 3 11436 0 Jamaica 18681 2.27e7 Queens 081 4 4018600
## 4 11436 0 Jamaica 18681 2.27e7 Queens 081 4 4079000
## 5 11213 0 Brookl… 62426 2.96e7 Kings 047 3 3034900
## 6 11213 0 Brookl… 62426 2.96e7 Kings 047 3 3035100
## # … with 366 more variables: boro_name <chr>, ct2010 <chr>, ctlabel <chr>,
## # ntacode <chr>, ntaname <chr>, shape_leng <dbl>, geo_id <chr>,
## # ct_code <chr>, dp05_0031pm <dbl>, dp05_0032e <dbl>, dp05_0032m <dbl>,
## # dp05_0032pe <dbl>, dp05_0032pm <dbl>, dp05_0033e <dbl>, dp05_0033m <dbl>,
## # dp05_0033pe <dbl>, dp05_0033pm <dbl>, dp05_0034e <dbl>, dp05_0034m <dbl>,
## # dp05_0034pe <dbl>, dp05_0034pm <dbl>, dp05_0035e <dbl>, dp05_0035m <dbl>,
## # dp05_0035pe <dbl>, dp05_0035pm <dbl>, dp05_0036e <dbl>, dp05_0036m <dbl>, …