### task 1: Join the COVID-19 data to the NYC zip code area data (sf or sp polygons). 
task 2: Aggregate the NYC food retails store data (points) to the zip code data, so that we know how many retail stores in each zip code area.
task 3: Aggregate the NYC health facilities (points) to the zip code data. Similarly, choose appropriate subtypes such as nursing homes from the facilities.


``` r
covid_apr12 <- read_csv("R-Spatial_II_Lab/tests-by-zcta_2020_04_12.csv")
covid_apr19 <- read_csv("R-Spatial_II_Lab/tests-by-zcta_2020_04_19.csv")
covid_apr21 <- read_csv("R-Spatial_II_Lab/tests-by-zcta_2021_04_23.csv")
covid_apr21 <- covid_apr21 %>% rename_at(1, ~'MODZCTA')
covid_apr12 <- covid_apr12 %>% filter(MODZCTA != "NA")
covid_apr19 <- covid_apr19 %>% filter(MODZCTA != "NA")
covid_apr21 <- covid_apr21 %>% filter(MODZCTA != "NA")
covid_apr12 <- covid_apr12 %>% mutate(MODZCTA = as.character(MODZCTA))
covid_apr19 <- covid_apr19 %>% mutate(MODZCTA = as.character(MODZCTA))
covid_apr21 <- covid_apr21 %>% mutate(MODZCTA = as.character(MODZCTA))

zipcodes <- st_read("data/nyc/nyczipcodes.shp")
## Reading layer `nyczipcodes' from data source 
##   `C:\Users\student\OneDrive\.HUNTER\[4] SPRING 26\GTECH38520\work\R-spatial\data\nyc\nyczipcodes.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 178 features and 4 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: -74.25559 ymin: 40.49612 xmax: -73.70001 ymax: 40.91553
## Geodetic CRS:  WGS 84
covid_combined <- covid_apr12 %>%
  select(MODZCTA, Positive_apr12 = Positive, Total_apr12 = Total) %>%
  full_join(
    covid_apr19 %>%
      select(MODZCTA, Positive_apr19 = Positive, Total_apr19 = Total),
    by = "MODZCTA"
  ) %>%
  # Calculate combined Positive and Total from April 12 and April 19
  mutate(
    Positive_combined_apr12_19 = coalesce(Positive_apr12, 0) + coalesce(Positive_apr19, 0),
    Total_combined_apr12_19 = coalesce(Total_apr12, 0) + coalesce(Total_apr19, 0),
    # Calculate percentage for the combined period
    perc_pos_combined_apr12_19 = (Positive_combined_apr12_19 / Total_combined_apr12_19) * 100
  ) %>%
  # Join with April 2021 data
  full_join(
    covid_apr21 %>%
      select(MODZCTA, 
             COVID_CASE_COUNT, 
             TOTAL_COVID_TESTS,
             PERCENT_POSITIVE
             ),
    by = "MODZCTA"
  ) %>%
  # Calculate final combined metrics
  mutate(
    # Combined Positive from all three time periods
    Positive_all_dates = coalesce(Positive_combined_apr12_19, 0) + coalesce(COVID_CASE_COUNT, 0),
    # Combined Total tests from all three time periods
    Total_all_dates = coalesce(Total_combined_apr12_19, 0) + coalesce(TOTAL_COVID_TESTS, 0),
    # Overall positivity rate across all periods
    overall_positivity_rate = (Positive_all_dates / Total_all_dates) * 100
  )


zipcodes <- zipcodes %>% rename_at(1, ~'MODZCTA')
zipcodes_with_data <- zipcodes %>%
  left_join(covid_combined, by = "MODZCTA")

ggplot(zipcodes_with_data) +
  geom_sf(aes(fill = overall_positivity_rate), color = "white", size = 0.1) +
  scale_fill_viridis_c(option = "plasma", 
                       name = "Positivity Rate (%)"
                       ) +
  theme_minimal() +
  labs(title = "COVID-19 Overall Positivity Rate by ZIP Code, 4/12-4/23",
       )

stores_df <- read_csv("data/nys_retail_food_store_xy.csv")

stores_aggregated <- stores_df %>%
  # Remove rows with missing zip codes
  filter(!is.na(Zip.Code)) %>%
  # Clean zip codes (extract 5-digit codes)
  mutate(Zip.Code = as.character(Zip.Code),
         Zip.Code = str_extract(Zip.Code, "\\d{5}")) %>%
  filter(!is.na(Zip.Code)) %>%
  # Count stores per zip code
  group_by(Zip.Code) %>%
  summarise(
    store_count = n(),
    .groups = 'drop'
  )

names(stores_aggregated)[1] <- "MODZCTA"

zipcodes_with_data <- zipcodes_with_data %>%
  mutate(MODZCTA = as.character(MODZCTA)) %>%
  left_join(stores_aggregated, by = "MODZCTA") %>%
  mutate(store_count = replace_na(store_count, 0))

ggplot(zipcodes_with_data) +
  geom_sf(aes(fill = store_count), color = "white", size = 0.1) +
  scale_fill_viridis_c(option = "plasma", 
                       name = "Number of Stores",
                       na.value = "grey90") +
  theme_minimal() +
  labs(title = "Number of Stores by ZIP Code",
      )

facilities <- read.csv("data/NYS_Health_Facility.csv", stringsAsFactors = FALSE)

facilities_sf <- facilities %>%
  filter(!is.na(Facility.Latitude) & !is.na(Facility.Longitude)) %>%
  st_as_sf(coords = c("Facility.Longitude", "Facility.Latitude"), crs = 4326)

facility_counts <- facilities_sf %>%
  st_drop_geometry() %>%
  group_by(Facility.Zip.Code) %>%
  summarise(
    total_facilities = n(),
    # Count by facility type
    hospitals = sum(Short.Description == "HOSP", na.rm = TRUE),
    hospice = sum(Short.Description == "HSPC", na.rm = TRUE),
    hha = sum(Short.Description == "CHHA", na.rm = TRUE),
    dtc = sum(Short.Description == "DTC" | Short.Description == "DTC-EC", na.rm = TRUE),
    hospital_ext_clinics = sum(grepl("HOSP-EC|HOSP-SB", Short.Description), na.rm = TRUE),
    # For nursing homes specifically
    residential_health = sum(Short.Description == "NH", na.rm = TRUE),
  ) %>%
  arrange(desc(total_facilities))

names(facility_counts)[1] <- "MODZCTA"
zipcodes_with_data <- zipcodes_with_data %>%
  left_join(facility_counts, by = "MODZCTA")
zipcodes_with_data <- zipcodes_with_data %>% replace(is.na(.), 0)

ggplot(zipcodes_with_data) +
  geom_sf(aes(fill = total_facilities), color = "white", size = 0.1) +
  scale_fill_viridis_c(option = "plasma", 
                       name = "Number of Health Facilities",
                       na.value = "grey90") +
  theme_minimal() +
  labs(title = "Number of Health Facilities by ZIP Code",
  )

str(zipcodes_with_data)
## Classes 'sf' and 'data.frame':   178 obs. of  26 variables:
##  $ MODZCTA                   : chr  "10001" "10002" "10003" "10026" ...
##  $ label                     : chr  "10001, 10118" "10002" "10003" "10026" ...
##  $ zcta                      : chr  "10001, 10119, 10199" "10002" "10003" "10026" ...
##  $ pop_est                   : num  23072 74993 54682 39363 3028 ...
##  $ Positive_apr12            : num  211 539 279 308 23 38 10 34 381 166 ...
##  $ Total_apr12               : num  448 1024 662 605 59 ...
##  $ Positive_apr19            : num  260 712 347 428 24 44 14 40 518 201 ...
##  $ Total_apr19               : num  571 1358 830 856 64 ...
##  $ Positive_combined_apr12_19: num  471 1251 626 736 47 ...
##  $ Total_combined_apr12_19   : num  1019 2382 1492 1461 123 ...
##  $ perc_pos_combined_apr12_19: num  46.2 52.5 42 50.4 38.2 ...
##  $ COVID_CASE_COUNT          : num  1542 5902 2803 2586 247 ...
##  $ TOTAL_COVID_TESTS         : num  20158 48197 41076 23569 3599 ...
##  $ PERCENT_POSITIVE          : num  7.86 12.63 6.93 11.21 6.92 ...
##  $ Positive_all_dates        : num  2013 7153 3429 3322 294 ...
##  $ Total_all_dates           : num  21177 50579 42568 25030 3722 ...
##  $ overall_positivity_rate   : num  9.51 14.14 8.06 13.27 7.9 ...
##  $ store_count               : int  60 192 102 50 10 11 4 26 88 56 ...
##  $ total_facilities          : num  7 20 16 4 1 1 1 2 5 12 ...
##  $ hospitals                 : num  0 0 3 0 0 0 0 0 0 0 ...
##  $ hospice                   : num  0 0 0 0 0 0 1 0 0 0 ...
##  $ hha                       : num  2 0 0 0 0 0 0 0 0 0 ...
##  $ dtc                       : num  4 8 7 4 1 1 0 0 3 5 ...
##  $ hospital_ext_clinics      : num  0 4 4 0 0 0 0 1 1 6 ...
##  $ residential_health        : num  0 2 0 0 0 0 0 0 0 0 ...
##  $ geometry                  :sfc_MULTIPOLYGON of length 178; first list element: List of 1
##   ..$ :List of 1
##   .. ..$ : num [1:84, 1:2] -74 -74 -74 -74 -74 ...
##   ..- attr(*, "class")= chr [1:3] "XY" "MULTIPOLYGON" "sfg"
##  - attr(*, "sf_column")= chr "geometry"
##  - attr(*, "agr")= Factor w/ 3 levels "constant","aggregate",..: NA NA NA NA NA NA NA NA NA NA ...
##   ..- attr(*, "names")= chr [1:25] "MODZCTA" "label" "zcta" "pop_est" ...

task 4: Join the Census ACS population, race, and age data to the NYC Planning Census Tract Data.

acsdata <- read.csv("R-Spatial_II_Lab/ACSDP5Y2018.DP05_data_with_overlays_2020-04-22T132935.csv")
dcpdata <- read.csv("data/nyc/nyc_census_tracts_pophu.csv")

acsdata$GEO_ID <- sub("^1400000US", "", acsdata$GEO_ID)
dcpdata <- dcpdata %>% rename_at(1, ~'GEO_ID')
dcpdata <- dcpdata %>% mutate(GEO_ID = as.character(GEO_ID))
acsdata <- acsdata %>% mutate(GEO_ID = as.character(GEO_ID))
merged_data <- full_join(acsdata, dcpdata, by = "GEO_ID")

task 5: Aggregate the ACS census data to zip code area data.

nycacs_census_tracts_sf <- st_read('data/nyc/nyc_acs_tracts.shp')
## Reading layer `nyc_acs_tracts' from data source 
##   `C:\Users\student\OneDrive\.HUNTER\[4] SPRING 26\GTECH38520\work\R-spatial\data\nyc\nyc_acs_tracts.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 2166 features and 113 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: -74.25559 ymin: 40.49612 xmax: -73.70001 ymax: 40.91553
## Geodetic CRS:  NAD83
str(nycacs_census_tracts_sf)
## Classes 'sf' and 'data.frame':   2166 obs. of  114 variables:
##  $ UNEMP_RATE: num  0 0.0817 0.1706 0 0.088 ...
##  $ cartodb_id: num  1 2 3 4 5 6 7 8 9 10 ...
##  $ withssi   : int  0 228 658 0 736 0 261 0 39 638 ...
##  $ withsocial: int  0 353 1577 0 1382 99 1122 10 264 895 ...
##  $ withpubass: int  0 47 198 0 194 0 145 0 5 334 ...
##  $ struggling: int  0 694 2589 0 2953 337 3085 17 131 1938 ...
##  $ profession: int  0 0 36 0 19 745 18 49 35 0 ...
##  $ popunemplo: int  0 92 549 0 379 321 432 26 106 494 ...
##  $ poptot    : int  0 2773 8339 0 10760 7024 10955 849 1701 5923 ...
##  $ popover18 : int  0 2351 6878 0 8867 6637 8932 711 1241 4755 ...
##  $ popinlabou: int  0 1126 3218 0 4305 5702 5114 657 735 2283 ...
##  $ poororstru: int  0 2026 4833 0 7044 1041 6376 112 232 4115 ...
##  $ poor      : int  0 1332 2244 0 4091 704 3291 95 101 2177 ...
##  $ pacificune: int  0 0 0 0 13 0 0 0 0 0 ...
##  $ pacificinl: int  0 0 0 0 13 0 0 0 0 0 ...
##  $ pacific   : int  0 0 0 0 13 0 0 0 0 0 ...
##  $ otherunemp: int  0 0 103 0 46 0 0 0 17 151 ...
##  $ otherinlab: int  0 144 348 0 609 112 184 48 90 525 ...
##  $ otherethni: int  0 598 1175 0 1799 112 377 48 183 1664 ...
##  $ onlyprofes: int  0 0 102 0 20 890 116 78 77 0 ...
##  $ onlymaster: int  0 77 412 0 292 2162 408 233 328 16 ...
##  $ onlylessth: int  0 878 2164 0 3793 121 4384 34 115 1846 ...
##  $ onlyhighsc: int  0 1088 4068 0 3868 5508 3882 631 1093 2343 ...
##  $ onlydoctor: int  0 0 66 0 1 145 98 29 42 0 ...
##  $ onlycolleg: int  0 471 2355 0 2290 5371 2223 585 884 1111 ...
##  $ onlybachel: int  0 271 1269 0 1322 4685 1262 481 687 200 ...
##  $ okay      : int  0 742 3474 0 3499 5982 4579 733 1469 1798 ...
##  $ mixedunemp: int  0 16 21 0 14 24 0 0 51 31 ...
##  $ mixedinlab: int  0 72 134 0 100 136 91 10 62 140 ...
##  $ mixed     : int  0 175 234 0 251 224 115 16 102 334 ...
##  $ master    : int  0 77 310 0 272 1272 292 155 251 16 ...
##  $ maleunempl: int  0 76 349 0 179 204 197 8 72 277 ...
##  $ maleover18: int  0 1101 3134 0 4068 3557 3992 454 613 1935 ...
##  $ malepro   : int  0 0 36 0 19 473 48 51 61 0 ...
##  $ malemastr : int  0 10 143 0 126 1034 181 123 139 0 ...
##  $ male_lesHS: int  0 302 1063 0 1693 29 2013 17 103 714 ...
##  $ male_HS   : int  0 607 1893 0 1651 2985 1702 414 485 985 ...
##  $ male_doctr: int  0 0 24 0 0 77 30 29 38 0 ...
##  $ male_collg: int  0 227 1225 0 997 2924 871 381 368 462 ...
##  $ male_BA   : int  0 132 668 0 562 2411 499 288 292 43 ...
##  $ maleinlabo: int  0 612 1640 0 2059 3299 2466 423 392 991 ...
##  $ maledrop  : int  0 16 8 0 0 0 0 0 0 0 ...
##  $ male16to19: int  0 65 253 0 162 37 236 20 32 187 ...
##  $ male      : int  0 1318 3850 0 4796 3840 5244 533 897 2514 ...
##  $ lessthanhi: int  0 878 2164 0 3793 121 4384 34 115 1846 ...
##  $ lessthan10: int  0 212 760 0 1100 289 625 2 51 463 ...
##  $ households: int  0 986 3382 0 3838 4104 3950 367 739 2224 ...
##  $ hispanicun: int  0 30 269 0 115 27 0 0 68 335 ...
##  $ hispanicin: int  0 357 1171 0 819 297 236 77 201 1145 ...
##  $ hispanic  : int  0 1187 3503 0 2608 374 318 108 360 3478 ...
##  $ highschool: int  0 617 1713 0 1578 137 1659 46 209 1232 ...
##  $ geo_state : int  36 36 36 36 36 36 36 36 36 36 ...
##  $ geo_place : int  51000 51000 51000 51000 51000 51000 51000 51000 51000 51000 ...
##  $ geo_county: int  61 61 61 61 61 61 61 61 61 61 ...
##  $ field_1   : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ femaleunem: int  0 16 200 0 200 117 235 18 34 217 ...
##  $ femaleover: int  0 1250 3744 0 4799 3080 4940 257 628 2820 ...
##  $ fem_profes: int  0 0 66 0 1 417 68 27 16 0 ...
##  $ fem_master: int  0 67 269 0 166 1128 227 110 189 16 ...
##  $ fem_lessHS: int  0 576 1101 0 2100 92 2371 17 12 1132 ...
##  $ fem_HS    : int  0 481 2175 0 2217 2523 2180 217 608 1358 ...
##  $ fem_doctor: int  0 0 42 0 1 68 68 0 4 0 ...
##  $ fem_colleg: int  0 244 1130 0 1293 2447 1352 204 516 649 ...
##  $ fem_BA    : int  0 139 601 0 760 2274 763 193 395 157 ...
##  $ femaleinla: int  0 514 1578 0 2246 2403 2648 234 343 1292 ...
##  $ femaledrop: int  0 0 1 0 0 0 0 0 0 14 ...
##  $ femal16_19: int  0 84 124 0 271 2 126 2 32 242 ...
##  $ female    : int  0 1455 4489 0 5964 3184 5711 316 804 3409 ...
##  $ europeanun: int  0 14 328 0 56 188 71 26 22 132 ...
##  $ europeanin: int  0 303 1641 0 525 4058 492 540 543 510 ...
##  $ european  : int  0 540 4091 0 1181 4975 929 717 1327 1645 ...
##  $ doctorate : int  0 0 66 0 1 145 98 29 42 0 ...
##  $ com_90plus: int  0 49 101 0 36 61 82 17 27 178 ...
##  $ comm_5less: int  0 99 0 0 1 220 46 26 0 0 ...
##  $ comm_60_89: int  0 40 215 0 370 136 687 32 41 331 ...
##  $ comm_5_14 : int  0 121 226 0 331 913 718 87 58 99 ...
##  $ comm_45_59: int  0 142 171 0 284 287 409 46 84 351 ...
##  $ comm_30_44: int  0 217 923 0 1442 1514 1118 198 210 493 ...
##  $ comm_15_29: int  0 352 970 0 1250 1840 1363 192 134 304 ...
##  $ college   : int  0 200 1086 0 968 686 961 104 197 911 ...
##  $ bachelor  : int  0 194 857 0 1030 2523 854 248 359 184 ...
##  $ asianunemp: int  0 62 38 0 207 108 352 0 16 61 ...
##  $ asianinlab: int  0 559 615 0 2736 1286 4283 42 35 491 ...
##  $ asian     : int  0 1249 1724 0 6549 1598 9448 51 75 922 ...
##  $ americanun: int  0 0 0 0 0 1 0 0 0 24 ...
##  $ americanin: int  0 0 0 0 57 1 0 0 0 24 ...
##  $ american  : int  0 43 0 0 57 1 0 0 0 51 ...
##  $ africanune: int  0 0 59 0 43 0 9 0 0 95 ...
##  $ africaninl: int  0 48 434 0 265 109 64 17 5 593 ...
##  $ african   : int  0 168 1115 0 910 114 86 17 14 1307 ...
##  $ puma      : chr  "3810" "3809" "3809" "3810" ...
##  $ ntaname   : chr  "park-cemetery-etc-Manhattan" "Lower East Side" "Lower East Side" "park-cemetery-etc-Manhattan" ...
##  $ ntacode   : chr  "MN99" "MN28" "MN28" "MN99" ...
##  $ ctlabel   : chr  "1" "2.01" "2.02" "5" ...
##  $ cdeligibil: chr  "I" "E" "E" "I" ...
##  $ boroname  : chr  "Manhattan" "Manhattan" "Manhattan" "Manhattan" ...
##  $ medianinco: num  NA 17282 24371 NA 18832 ...
##  $ medianagem: num  NA 39.3 44.9 NA 43.4 29.7 42 33.2 41.5 31.2 ...
##  $ medianagef: num  NA 43.8 47.9 NA 43 28.3 43 31.9 45.7 47.5 ...
##   [list output truncated]
##  - attr(*, "sf_column")= chr "geometry"
##  - attr(*, "agr")= Factor w/ 3 levels "constant","aggregate",..: NA NA NA NA NA NA NA NA NA NA ...
##   ..- attr(*, "names")= chr [1:113] "UNEMP_RATE" "cartodb_id" "withssi" "withsocial" ...
nycacs_census_tracts_sf <- st_transform(nycacs_census_tracts_sf, st_crs(zipcodes))
tracts_with_zips <- st_join(nycacs_census_tracts_sf, zipcodes)

aggregated <- tracts_with_zips %>%
  st_drop_geometry() %>%
  group_by(MODZCTA) %>%
  summarise( 
    across(where(is.numeric) & !matches("UNEMP_RATE"), sum, na.rm = TRUE),
    UNEMP_RATE = mean(UNEMP_RATE, na.rm = TRUE)
    )