### task 1: Join the COVID-19 data to the NYC zip code area data (sf or sp polygons).
task 2: Aggregate the NYC food retails store data (points) to the zip code data, so that we know how many retail stores in each zip code area.
task 3: Aggregate the NYC health facilities (points) to the zip code data. Similarly, choose appropriate subtypes such as nursing homes from the facilities.
``` r
covid_apr12 <- read_csv("R-Spatial_II_Lab/tests-by-zcta_2020_04_12.csv")
covid_apr19 <- read_csv("R-Spatial_II_Lab/tests-by-zcta_2020_04_19.csv")
covid_apr21 <- read_csv("R-Spatial_II_Lab/tests-by-zcta_2021_04_23.csv")
covid_apr21 <- covid_apr21 %>% rename_at(1, ~'MODZCTA')
covid_apr12 <- covid_apr12 %>% filter(MODZCTA != "NA")
covid_apr19 <- covid_apr19 %>% filter(MODZCTA != "NA")
covid_apr21 <- covid_apr21 %>% filter(MODZCTA != "NA")
covid_apr12 <- covid_apr12 %>% mutate(MODZCTA = as.character(MODZCTA))
covid_apr19 <- covid_apr19 %>% mutate(MODZCTA = as.character(MODZCTA))
covid_apr21 <- covid_apr21 %>% mutate(MODZCTA = as.character(MODZCTA))
zipcodes <- st_read("data/nyc/nyczipcodes.shp")
## Reading layer `nyczipcodes' from data source
## `C:\Users\student\OneDrive\.HUNTER\[4] SPRING 26\GTECH38520\work\R-spatial\data\nyc\nyczipcodes.shp'
## using driver `ESRI Shapefile'
## Simple feature collection with 178 features and 4 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: -74.25559 ymin: 40.49612 xmax: -73.70001 ymax: 40.91553
## Geodetic CRS: WGS 84
covid_combined <- covid_apr12 %>%
select(MODZCTA, Positive_apr12 = Positive, Total_apr12 = Total) %>%
full_join(
covid_apr19 %>%
select(MODZCTA, Positive_apr19 = Positive, Total_apr19 = Total),
by = "MODZCTA"
) %>%
# Calculate combined Positive and Total from April 12 and April 19
mutate(
Positive_combined_apr12_19 = coalesce(Positive_apr12, 0) + coalesce(Positive_apr19, 0),
Total_combined_apr12_19 = coalesce(Total_apr12, 0) + coalesce(Total_apr19, 0),
# Calculate percentage for the combined period
perc_pos_combined_apr12_19 = (Positive_combined_apr12_19 / Total_combined_apr12_19) * 100
) %>%
# Join with April 2021 data
full_join(
covid_apr21 %>%
select(MODZCTA,
COVID_CASE_COUNT,
TOTAL_COVID_TESTS,
PERCENT_POSITIVE
),
by = "MODZCTA"
) %>%
# Calculate final combined metrics
mutate(
# Combined Positive from all three time periods
Positive_all_dates = coalesce(Positive_combined_apr12_19, 0) + coalesce(COVID_CASE_COUNT, 0),
# Combined Total tests from all three time periods
Total_all_dates = coalesce(Total_combined_apr12_19, 0) + coalesce(TOTAL_COVID_TESTS, 0),
# Overall positivity rate across all periods
overall_positivity_rate = (Positive_all_dates / Total_all_dates) * 100
)
zipcodes <- zipcodes %>% rename_at(1, ~'MODZCTA')
zipcodes_with_data <- zipcodes %>%
left_join(covid_combined, by = "MODZCTA")
ggplot(zipcodes_with_data) +
geom_sf(aes(fill = overall_positivity_rate), color = "white", size = 0.1) +
scale_fill_viridis_c(option = "plasma",
name = "Positivity Rate (%)"
) +
theme_minimal() +
labs(title = "COVID-19 Overall Positivity Rate by ZIP Code, 4/12-4/23",
)

stores_df <- read_csv("data/nys_retail_food_store_xy.csv")
stores_aggregated <- stores_df %>%
# Remove rows with missing zip codes
filter(!is.na(Zip.Code)) %>%
# Clean zip codes (extract 5-digit codes)
mutate(Zip.Code = as.character(Zip.Code),
Zip.Code = str_extract(Zip.Code, "\\d{5}")) %>%
filter(!is.na(Zip.Code)) %>%
# Count stores per zip code
group_by(Zip.Code) %>%
summarise(
store_count = n(),
.groups = 'drop'
)
names(stores_aggregated)[1] <- "MODZCTA"
zipcodes_with_data <- zipcodes_with_data %>%
mutate(MODZCTA = as.character(MODZCTA)) %>%
left_join(stores_aggregated, by = "MODZCTA") %>%
mutate(store_count = replace_na(store_count, 0))
ggplot(zipcodes_with_data) +
geom_sf(aes(fill = store_count), color = "white", size = 0.1) +
scale_fill_viridis_c(option = "plasma",
name = "Number of Stores",
na.value = "grey90") +
theme_minimal() +
labs(title = "Number of Stores by ZIP Code",
)

facilities <- read.csv("data/NYS_Health_Facility.csv", stringsAsFactors = FALSE)
facilities_sf <- facilities %>%
filter(!is.na(Facility.Latitude) & !is.na(Facility.Longitude)) %>%
st_as_sf(coords = c("Facility.Longitude", "Facility.Latitude"), crs = 4326)
facility_counts <- facilities_sf %>%
st_drop_geometry() %>%
group_by(Facility.Zip.Code) %>%
summarise(
total_facilities = n(),
# Count by facility type
hospitals = sum(Short.Description == "HOSP", na.rm = TRUE),
hospice = sum(Short.Description == "HSPC", na.rm = TRUE),
hha = sum(Short.Description == "CHHA", na.rm = TRUE),
dtc = sum(Short.Description == "DTC" | Short.Description == "DTC-EC", na.rm = TRUE),
hospital_ext_clinics = sum(grepl("HOSP-EC|HOSP-SB", Short.Description), na.rm = TRUE),
# For nursing homes specifically
residential_health = sum(Short.Description == "NH", na.rm = TRUE),
) %>%
arrange(desc(total_facilities))
names(facility_counts)[1] <- "MODZCTA"
zipcodes_with_data <- zipcodes_with_data %>%
left_join(facility_counts, by = "MODZCTA")
zipcodes_with_data <- zipcodes_with_data %>% replace(is.na(.), 0)
ggplot(zipcodes_with_data) +
geom_sf(aes(fill = total_facilities), color = "white", size = 0.1) +
scale_fill_viridis_c(option = "plasma",
name = "Number of Health Facilities",
na.value = "grey90") +
theme_minimal() +
labs(title = "Number of Health Facilities by ZIP Code",
)

str(zipcodes_with_data)
## Classes 'sf' and 'data.frame': 178 obs. of 26 variables:
## $ MODZCTA : chr "10001" "10002" "10003" "10026" ...
## $ label : chr "10001, 10118" "10002" "10003" "10026" ...
## $ zcta : chr "10001, 10119, 10199" "10002" "10003" "10026" ...
## $ pop_est : num 23072 74993 54682 39363 3028 ...
## $ Positive_apr12 : num 211 539 279 308 23 38 10 34 381 166 ...
## $ Total_apr12 : num 448 1024 662 605 59 ...
## $ Positive_apr19 : num 260 712 347 428 24 44 14 40 518 201 ...
## $ Total_apr19 : num 571 1358 830 856 64 ...
## $ Positive_combined_apr12_19: num 471 1251 626 736 47 ...
## $ Total_combined_apr12_19 : num 1019 2382 1492 1461 123 ...
## $ perc_pos_combined_apr12_19: num 46.2 52.5 42 50.4 38.2 ...
## $ COVID_CASE_COUNT : num 1542 5902 2803 2586 247 ...
## $ TOTAL_COVID_TESTS : num 20158 48197 41076 23569 3599 ...
## $ PERCENT_POSITIVE : num 7.86 12.63 6.93 11.21 6.92 ...
## $ Positive_all_dates : num 2013 7153 3429 3322 294 ...
## $ Total_all_dates : num 21177 50579 42568 25030 3722 ...
## $ overall_positivity_rate : num 9.51 14.14 8.06 13.27 7.9 ...
## $ store_count : int 60 192 102 50 10 11 4 26 88 56 ...
## $ total_facilities : num 7 20 16 4 1 1 1 2 5 12 ...
## $ hospitals : num 0 0 3 0 0 0 0 0 0 0 ...
## $ hospice : num 0 0 0 0 0 0 1 0 0 0 ...
## $ hha : num 2 0 0 0 0 0 0 0 0 0 ...
## $ dtc : num 4 8 7 4 1 1 0 0 3 5 ...
## $ hospital_ext_clinics : num 0 4 4 0 0 0 0 1 1 6 ...
## $ residential_health : num 0 2 0 0 0 0 0 0 0 0 ...
## $ geometry :sfc_MULTIPOLYGON of length 178; first list element: List of 1
## ..$ :List of 1
## .. ..$ : num [1:84, 1:2] -74 -74 -74 -74 -74 ...
## ..- attr(*, "class")= chr [1:3] "XY" "MULTIPOLYGON" "sfg"
## - attr(*, "sf_column")= chr "geometry"
## - attr(*, "agr")= Factor w/ 3 levels "constant","aggregate",..: NA NA NA NA NA NA NA NA NA NA ...
## ..- attr(*, "names")= chr [1:25] "MODZCTA" "label" "zcta" "pop_est" ...
task 4: Join the Census ACS population, race, and age data to the
NYC Planning Census Tract Data.
acsdata <- read.csv("R-Spatial_II_Lab/ACSDP5Y2018.DP05_data_with_overlays_2020-04-22T132935.csv")
dcpdata <- read.csv("data/nyc/nyc_census_tracts_pophu.csv")
acsdata$GEO_ID <- sub("^1400000US", "", acsdata$GEO_ID)
dcpdata <- dcpdata %>% rename_at(1, ~'GEO_ID')
dcpdata <- dcpdata %>% mutate(GEO_ID = as.character(GEO_ID))
acsdata <- acsdata %>% mutate(GEO_ID = as.character(GEO_ID))
merged_data <- full_join(acsdata, dcpdata, by = "GEO_ID")
task 5: Aggregate the ACS census data to zip code area data.
nycacs_census_tracts_sf <- st_read('data/nyc/nyc_acs_tracts.shp')
## Reading layer `nyc_acs_tracts' from data source
## `C:\Users\student\OneDrive\.HUNTER\[4] SPRING 26\GTECH38520\work\R-spatial\data\nyc\nyc_acs_tracts.shp'
## using driver `ESRI Shapefile'
## Simple feature collection with 2166 features and 113 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: -74.25559 ymin: 40.49612 xmax: -73.70001 ymax: 40.91553
## Geodetic CRS: NAD83
str(nycacs_census_tracts_sf)
## Classes 'sf' and 'data.frame': 2166 obs. of 114 variables:
## $ UNEMP_RATE: num 0 0.0817 0.1706 0 0.088 ...
## $ cartodb_id: num 1 2 3 4 5 6 7 8 9 10 ...
## $ withssi : int 0 228 658 0 736 0 261 0 39 638 ...
## $ withsocial: int 0 353 1577 0 1382 99 1122 10 264 895 ...
## $ withpubass: int 0 47 198 0 194 0 145 0 5 334 ...
## $ struggling: int 0 694 2589 0 2953 337 3085 17 131 1938 ...
## $ profession: int 0 0 36 0 19 745 18 49 35 0 ...
## $ popunemplo: int 0 92 549 0 379 321 432 26 106 494 ...
## $ poptot : int 0 2773 8339 0 10760 7024 10955 849 1701 5923 ...
## $ popover18 : int 0 2351 6878 0 8867 6637 8932 711 1241 4755 ...
## $ popinlabou: int 0 1126 3218 0 4305 5702 5114 657 735 2283 ...
## $ poororstru: int 0 2026 4833 0 7044 1041 6376 112 232 4115 ...
## $ poor : int 0 1332 2244 0 4091 704 3291 95 101 2177 ...
## $ pacificune: int 0 0 0 0 13 0 0 0 0 0 ...
## $ pacificinl: int 0 0 0 0 13 0 0 0 0 0 ...
## $ pacific : int 0 0 0 0 13 0 0 0 0 0 ...
## $ otherunemp: int 0 0 103 0 46 0 0 0 17 151 ...
## $ otherinlab: int 0 144 348 0 609 112 184 48 90 525 ...
## $ otherethni: int 0 598 1175 0 1799 112 377 48 183 1664 ...
## $ onlyprofes: int 0 0 102 0 20 890 116 78 77 0 ...
## $ onlymaster: int 0 77 412 0 292 2162 408 233 328 16 ...
## $ onlylessth: int 0 878 2164 0 3793 121 4384 34 115 1846 ...
## $ onlyhighsc: int 0 1088 4068 0 3868 5508 3882 631 1093 2343 ...
## $ onlydoctor: int 0 0 66 0 1 145 98 29 42 0 ...
## $ onlycolleg: int 0 471 2355 0 2290 5371 2223 585 884 1111 ...
## $ onlybachel: int 0 271 1269 0 1322 4685 1262 481 687 200 ...
## $ okay : int 0 742 3474 0 3499 5982 4579 733 1469 1798 ...
## $ mixedunemp: int 0 16 21 0 14 24 0 0 51 31 ...
## $ mixedinlab: int 0 72 134 0 100 136 91 10 62 140 ...
## $ mixed : int 0 175 234 0 251 224 115 16 102 334 ...
## $ master : int 0 77 310 0 272 1272 292 155 251 16 ...
## $ maleunempl: int 0 76 349 0 179 204 197 8 72 277 ...
## $ maleover18: int 0 1101 3134 0 4068 3557 3992 454 613 1935 ...
## $ malepro : int 0 0 36 0 19 473 48 51 61 0 ...
## $ malemastr : int 0 10 143 0 126 1034 181 123 139 0 ...
## $ male_lesHS: int 0 302 1063 0 1693 29 2013 17 103 714 ...
## $ male_HS : int 0 607 1893 0 1651 2985 1702 414 485 985 ...
## $ male_doctr: int 0 0 24 0 0 77 30 29 38 0 ...
## $ male_collg: int 0 227 1225 0 997 2924 871 381 368 462 ...
## $ male_BA : int 0 132 668 0 562 2411 499 288 292 43 ...
## $ maleinlabo: int 0 612 1640 0 2059 3299 2466 423 392 991 ...
## $ maledrop : int 0 16 8 0 0 0 0 0 0 0 ...
## $ male16to19: int 0 65 253 0 162 37 236 20 32 187 ...
## $ male : int 0 1318 3850 0 4796 3840 5244 533 897 2514 ...
## $ lessthanhi: int 0 878 2164 0 3793 121 4384 34 115 1846 ...
## $ lessthan10: int 0 212 760 0 1100 289 625 2 51 463 ...
## $ households: int 0 986 3382 0 3838 4104 3950 367 739 2224 ...
## $ hispanicun: int 0 30 269 0 115 27 0 0 68 335 ...
## $ hispanicin: int 0 357 1171 0 819 297 236 77 201 1145 ...
## $ hispanic : int 0 1187 3503 0 2608 374 318 108 360 3478 ...
## $ highschool: int 0 617 1713 0 1578 137 1659 46 209 1232 ...
## $ geo_state : int 36 36 36 36 36 36 36 36 36 36 ...
## $ geo_place : int 51000 51000 51000 51000 51000 51000 51000 51000 51000 51000 ...
## $ geo_county: int 61 61 61 61 61 61 61 61 61 61 ...
## $ field_1 : int 1 2 3 4 5 6 7 8 9 10 ...
## $ femaleunem: int 0 16 200 0 200 117 235 18 34 217 ...
## $ femaleover: int 0 1250 3744 0 4799 3080 4940 257 628 2820 ...
## $ fem_profes: int 0 0 66 0 1 417 68 27 16 0 ...
## $ fem_master: int 0 67 269 0 166 1128 227 110 189 16 ...
## $ fem_lessHS: int 0 576 1101 0 2100 92 2371 17 12 1132 ...
## $ fem_HS : int 0 481 2175 0 2217 2523 2180 217 608 1358 ...
## $ fem_doctor: int 0 0 42 0 1 68 68 0 4 0 ...
## $ fem_colleg: int 0 244 1130 0 1293 2447 1352 204 516 649 ...
## $ fem_BA : int 0 139 601 0 760 2274 763 193 395 157 ...
## $ femaleinla: int 0 514 1578 0 2246 2403 2648 234 343 1292 ...
## $ femaledrop: int 0 0 1 0 0 0 0 0 0 14 ...
## $ femal16_19: int 0 84 124 0 271 2 126 2 32 242 ...
## $ female : int 0 1455 4489 0 5964 3184 5711 316 804 3409 ...
## $ europeanun: int 0 14 328 0 56 188 71 26 22 132 ...
## $ europeanin: int 0 303 1641 0 525 4058 492 540 543 510 ...
## $ european : int 0 540 4091 0 1181 4975 929 717 1327 1645 ...
## $ doctorate : int 0 0 66 0 1 145 98 29 42 0 ...
## $ com_90plus: int 0 49 101 0 36 61 82 17 27 178 ...
## $ comm_5less: int 0 99 0 0 1 220 46 26 0 0 ...
## $ comm_60_89: int 0 40 215 0 370 136 687 32 41 331 ...
## $ comm_5_14 : int 0 121 226 0 331 913 718 87 58 99 ...
## $ comm_45_59: int 0 142 171 0 284 287 409 46 84 351 ...
## $ comm_30_44: int 0 217 923 0 1442 1514 1118 198 210 493 ...
## $ comm_15_29: int 0 352 970 0 1250 1840 1363 192 134 304 ...
## $ college : int 0 200 1086 0 968 686 961 104 197 911 ...
## $ bachelor : int 0 194 857 0 1030 2523 854 248 359 184 ...
## $ asianunemp: int 0 62 38 0 207 108 352 0 16 61 ...
## $ asianinlab: int 0 559 615 0 2736 1286 4283 42 35 491 ...
## $ asian : int 0 1249 1724 0 6549 1598 9448 51 75 922 ...
## $ americanun: int 0 0 0 0 0 1 0 0 0 24 ...
## $ americanin: int 0 0 0 0 57 1 0 0 0 24 ...
## $ american : int 0 43 0 0 57 1 0 0 0 51 ...
## $ africanune: int 0 0 59 0 43 0 9 0 0 95 ...
## $ africaninl: int 0 48 434 0 265 109 64 17 5 593 ...
## $ african : int 0 168 1115 0 910 114 86 17 14 1307 ...
## $ puma : chr "3810" "3809" "3809" "3810" ...
## $ ntaname : chr "park-cemetery-etc-Manhattan" "Lower East Side" "Lower East Side" "park-cemetery-etc-Manhattan" ...
## $ ntacode : chr "MN99" "MN28" "MN28" "MN99" ...
## $ ctlabel : chr "1" "2.01" "2.02" "5" ...
## $ cdeligibil: chr "I" "E" "E" "I" ...
## $ boroname : chr "Manhattan" "Manhattan" "Manhattan" "Manhattan" ...
## $ medianinco: num NA 17282 24371 NA 18832 ...
## $ medianagem: num NA 39.3 44.9 NA 43.4 29.7 42 33.2 41.5 31.2 ...
## $ medianagef: num NA 43.8 47.9 NA 43 28.3 43 31.9 45.7 47.5 ...
## [list output truncated]
## - attr(*, "sf_column")= chr "geometry"
## - attr(*, "agr")= Factor w/ 3 levels "constant","aggregate",..: NA NA NA NA NA NA NA NA NA NA ...
## ..- attr(*, "names")= chr [1:113] "UNEMP_RATE" "cartodb_id" "withssi" "withsocial" ...
nycacs_census_tracts_sf <- st_transform(nycacs_census_tracts_sf, st_crs(zipcodes))
tracts_with_zips <- st_join(nycacs_census_tracts_sf, zipcodes)
aggregated <- tracts_with_zips %>%
st_drop_geometry() %>%
group_by(MODZCTA) %>%
summarise(
across(where(is.numeric) & !matches("UNEMP_RATE"), sum, na.rm = TRUE),
UNEMP_RATE = mean(UNEMP_RATE, na.rm = TRUE)
)