R Spaital Lab Assignment # 1

library(sf); library(rgdal); library(sp); require(magrittr)
library(raster); library(dplyr); library(rgeos); library(rstudioapi)


wd <- dirname(getActiveDocumentContext()$path)
setwd(wd)


#read in as SF
NYC_zip_sf <- st_read("../Wk 11/ZIP_CODE_040114/ZIP_CODE_040114.shp")
## Reading layer `ZIP_CODE_040114' from data source `/Volumes/Storage/School/'21 (1) Spring/GTECH 38520 R/Homework/Wk 11_15/Wk 11/ZIP_CODE_040114/ZIP_CODE_040114.shp' using driver `ESRI Shapefile'
## Simple feature collection with 263 features and 12 fields
## Geometry type: POLYGON
## Dimension:     XY
## Bounding box:  xmin: 913129 ymin: 120020.9 xmax: 1067494 ymax: 272710.9
## Projected CRS: NAD83 / New York Long Island (ftUS)
as.numeric(as.character(NYC_zip_sf))
##  [1] NA NA NA NA NA NA NA NA NA NA NA NA NA
str(NYC_zip_sf)
## Classes 'sf' and 'data.frame':   263 obs. of  13 variables:
##  $ ZIPCODE   : chr  "11436" "11213" "11212" "11225" ...
##  $ BLDGZIP   : chr  "0" "0" "0" "0" ...
##  $ PO_NAME   : chr  "Jamaica" "Brooklyn" "Brooklyn" "Brooklyn" ...
##  $ POPULATION: num  18681 62426 83866 56527 72280 ...
##  $ AREA      : num  22699295 29631004 41972104 23698630 36868799 ...
##  $ STATE     : chr  "NY" "NY" "NY" "NY" ...
##  $ COUNTY    : chr  "Queens" "Kings" "Kings" "Kings" ...
##  $ ST_FIPS   : chr  "36" "36" "36" "36" ...
##  $ CTY_FIPS  : chr  "081" "047" "047" "047" ...
##  $ URL       : chr  "http://www.usps.com/" "http://www.usps.com/" "http://www.usps.com/" "http://www.usps.com/" ...
##  $ SHAPE_AREA: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ SHAPE_LEN : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ geometry  :sfc_POLYGON of length 263; first list element: List of 1
##   ..$ : num [1:159, 1:2] 1038098 1038142 1038171 1038280 1038521 ...
##   ..- attr(*, "class")= chr [1:3] "XY" "POLYGON" "sfg"
##  - attr(*, "sf_column")= chr "geometry"
##  - attr(*, "agr")= Factor w/ 3 levels "constant","aggregate",..: NA NA NA NA NA NA NA NA NA NA ...
##   ..- attr(*, "names")= chr [1:12] "ZIPCODE" "BLDGZIP" "PO_NAME" "POPULATION" ...
Covid_Data <- read.csv("tests-by-zcta_2021_04_23.csv")

str(Covid_Data)
## 'data.frame':    177 obs. of  13 variables:
##  $ MODIFIED_ZCTA    : int  10001 10002 10003 10004 10005 10006 10007 10009 10010 10011 ...
##  $ NEIGHBORHOOD_NAME: chr  "Chelsea/NoMad/West Chelsea" "Chinatown/Lower East Side" "East Village/Gramercy/Greenwich Village" "Financial District" ...
##  $ BOROUGH_GROUP    : chr  "Manhattan" "Manhattan" "Manhattan" "Manhattan" ...
##  $ label            : chr  "10001, 10118" "10002" "10003" "10004" ...
##  $ lat              : num  40.8 40.7 40.7 40.7 40.7 ...
##  $ lon              : num  -74 -74 -74 -74 -74 ...
##  $ COVID_CASE_COUNT : int  1542 5902 2803 247 413 164 379 3605 1686 2542 ...
##  $ COVID_CASE_RATE  : num  5584 7836 5193 8311 4716 ...
##  $ POP_DENOMINATOR  : num  27613 75323 53978 2972 8757 ...
##  $ COVID_DEATH_COUNT: int  35 264 48 2 0 1 4 118 37 62 ...
##  $ COVID_DEATH_RATE : num  126.8 350.5 88.9 67.3 0 ...
##  $ PERCENT_POSITIVE : num  7.86 12.63 6.93 6.92 6.72 ...
##  $ TOTAL_COVID_TESTS: int  20158 48197 41076 3599 6102 2441 6342 38773 27864 35539 ...
#left join of the two dfs accounting for difference in type. 
# filter out zipcodes where population is 0 and remove some NA areas. 
left_join(NYC_zip_sf %>% mutate(ZIPCODE = as.numeric(as.character(ZIPCODE))),
         Covid_Data, by = c("ZIPCODE" = "MODIFIED_ZCTA")) %>% 
         filter(POPULATION != 0) %>% 
         filter(BOROUGH_GROUP !=  'NA') -> Covid_ZIP 
str(Covid_ZIP)
## Classes 'sf' and 'data.frame':   189 obs. of  25 variables:
##  $ ZIPCODE          : num  11436 11213 11212 11225 11218 ...
##  $ BLDGZIP          : chr  "0" "0" "0" "0" ...
##  $ PO_NAME          : chr  "Jamaica" "Brooklyn" "Brooklyn" "Brooklyn" ...
##  $ POPULATION       : num  18681 62426 83866 56527 72280 ...
##  $ AREA             : num  22699295 29631004 41972104 23698630 36868799 ...
##  $ STATE            : chr  "NY" "NY" "NY" "NY" ...
##  $ COUNTY           : chr  "Queens" "Kings" "Kings" "Kings" ...
##  $ ST_FIPS          : chr  "36" "36" "36" "36" ...
##  $ CTY_FIPS         : chr  "081" "047" "047" "047" ...
##  $ URL              : chr  "http://www.usps.com/" "http://www.usps.com/" "http://www.usps.com/" "http://www.usps.com/" ...
##  $ SHAPE_AREA       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ SHAPE_LEN        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ NEIGHBORHOOD_NAME: chr  "South Jamaica/South Ozone Park" "Crown Heights (East)" "Ocean Hill-Brownsville" "Crown Heights (West)/Prospect Lefferts Gardens" ...
##  $ BOROUGH_GROUP    : chr  "Queens" "Brooklyn" "Brooklyn" "Brooklyn" ...
##  $ label            : chr  "11436" "11213" "11212" "11225" ...
##  $ lat              : num  40.7 40.7 40.7 40.7 40.6 ...
##  $ lon              : num  -73.8 -73.9 -73.9 -74 -74 ...
##  $ COVID_CASE_COUNT : int  1888 5166 7182 3833 6199 7279 8429 5380 11044 7331 ...
##  $ COVID_CASE_RATE  : num  9420 7997 9710 6664 8377 ...
##  $ POP_DENOMINATOR  : num  20043 64601 73967 57514 73996 ...
##  $ COVID_DEATH_COUNT: int  64 203 330 177 218 368 256 206 380 219 ...
##  $ COVID_DEATH_RATE : num  319 314 446 308 295 ...
##  $ PERCENT_POSITIVE : num  17.6 13.7 15.6 11.6 13.9 ...
##  $ TOTAL_COVID_TESTS: int  11082 38560 47319 33709 45884 56287 55444 35070 59585 43449 ...
##  $ geometry         :sfc_POLYGON of length 189; first list element: List of 1
##   ..$ : num [1:159, 1:2] 1038098 1038142 1038171 1038280 1038521 ...
##   ..- attr(*, "class")= chr [1:3] "XY" "POLYGON" "sfg"
##  - attr(*, "sf_column")= chr "geometry"
##  - attr(*, "agr")= Factor w/ 3 levels "constant","aggregate",..: NA NA NA NA NA NA NA NA NA NA ...
##   ..- attr(*, "names")= chr [1:24] "ZIPCODE" "BLDGZIP" "PO_NAME" "POPULATION" ...
# Map of Percent positive per zipcode. 

plot(Covid_ZIP['PERCENT_POSITIVE'])

# Task 2

st_read("./nycFoodstore.shp") -> nycFoodStoreSF
## Reading layer `nycFoodStore' from data source `/Volumes/Storage/School/'21 (1) Spring/GTECH 38520 R/Homework/Wk 11_15/Wk 12/nycFoodStore.shp' using driver `ESRI Shapefile'
## Simple feature collection with 11300 features and 16 fields
## Geometry type: POINT
## Dimension:     XY
## Bounding box:  xmin: -74.2484 ymin: 40.50782 xmax: -73.67061 ymax: 40.91008
## Geodetic CRS:  WGS 84
st_crs(nycFoodStoreSF)
## Coordinate Reference System:
##   User input: WGS 84 
##   wkt:
## GEOGCRS["WGS 84",
##     DATUM["World Geodetic System 1984",
##         ELLIPSOID["WGS 84",6378137,298.257223563,
##             LENGTHUNIT["metre",1]]],
##     PRIMEM["Greenwich",0,
##         ANGLEUNIT["degree",0.0174532925199433]],
##     CS[ellipsoidal,2],
##         AXIS["latitude",north,
##             ORDER[1],
##             ANGLEUNIT["degree",0.0174532925199433]],
##         AXIS["longitude",east,
##             ORDER[2],
##             ANGLEUNIT["degree",0.0174532925199433]],
##     ID["EPSG",4326]]
st_crs(Covid_ZIP)
## Coordinate Reference System:
##   User input: NAD83 / New York Long Island (ftUS) 
##   wkt:
## PROJCRS["NAD83 / New York Long Island (ftUS)",
##     BASEGEOGCRS["NAD83",
##         DATUM["North American Datum 1983",
##             ELLIPSOID["GRS 1980",6378137,298.257222101,
##                 LENGTHUNIT["metre",1]]],
##         PRIMEM["Greenwich",0,
##             ANGLEUNIT["degree",0.0174532925199433]],
##         ID["EPSG",4269]],
##     CONVERSION["SPCS83 New York Long Island zone (US Survey feet)",
##         METHOD["Lambert Conic Conformal (2SP)",
##             ID["EPSG",9802]],
##         PARAMETER["Latitude of false origin",40.1666666666667,
##             ANGLEUNIT["degree",0.0174532925199433],
##             ID["EPSG",8821]],
##         PARAMETER["Longitude of false origin",-74,
##             ANGLEUNIT["degree",0.0174532925199433],
##             ID["EPSG",8822]],
##         PARAMETER["Latitude of 1st standard parallel",41.0333333333333,
##             ANGLEUNIT["degree",0.0174532925199433],
##             ID["EPSG",8823]],
##         PARAMETER["Latitude of 2nd standard parallel",40.6666666666667,
##             ANGLEUNIT["degree",0.0174532925199433],
##             ID["EPSG",8824]],
##         PARAMETER["Easting at false origin",984250,
##             LENGTHUNIT["US survey foot",0.304800609601219],
##             ID["EPSG",8826]],
##         PARAMETER["Northing at false origin",0,
##             LENGTHUNIT["US survey foot",0.304800609601219],
##             ID["EPSG",8827]]],
##     CS[Cartesian,2],
##         AXIS["easting (X)",east,
##             ORDER[1],
##             LENGTHUNIT["US survey foot",0.304800609601219]],
##         AXIS["northing (Y)",north,
##             ORDER[2],
##             LENGTHUNIT["US survey foot",0.304800609601219]],
##     USAGE[
##         SCOPE["unknown"],
##         AREA["USA - New York - SPCS - Long Island"],
##         BBOX[40.47,-74.26,41.3,-71.8]],
##     ID["EPSG",2263]]
st_transform(nycFoodStoreSF, st_crs(2263)) -> nycFoodStoreSFLocal



nycFoodStoreSF %<>% st_transform(st_crs(Covid_ZIP)) 

nycFoodStoreSF %>%  filter(stringr::str_detect(Estbl_T, '[AJD]')) %>% 
st_join(Covid_ZIP, ., join = st_contains) -> nycFoodStoreJoin

nycFoodStoreJoin %>% 
  group_by(ZIPCODE, Estbl_T) %>% 
  summarise(FoodStoreNUM = n()) %>% 
  magrittr::extract('FoodStoreNUM') %>%
  plot(breaks = "jenks", main = 'Number of Food Stores')

# Task 3: Aggregate the NYC health facicilities points to zipcodes. Choose appropriate subtypes. 

Health_Fac <- st_read("../Wk 11/Health_Fac_SF/Health_Fac_SF.shp")
## Reading layer `Health_Fac_SF' from data source `/Volumes/Storage/School/'21 (1) Spring/GTECH 38520 R/Homework/Wk 11_15/Wk 11/Health_Fac_SF/Health_Fac_SF.shp' using driver `ESRI Shapefile'
## Simple feature collection with 1292 features and 11 fields
## Geometry type: POINT
## Dimension:     XY
## Bounding box:  xmin: -74.19681 ymin: 40.51677 xmax: -73.69169 ymax: 40.91062
## Geodetic CRS:  WGS 84
str(Health_Fac)
## Classes 'sf' and 'data.frame':   1292 obs. of  12 variables:
##  $ Fclt_ID : int  6230 7257 9006 9970 1217 1820 4326 5574 7579 1653 ...
##  $ Fclty_N : chr  "NYU Langone Rutherford" "Park Ridge Family Health Center" "FedCare, Inc." "Parkmed NYC, LLC" ...
##  $ Shrt_Ds : chr  "HOSP-EC" "HOSP-EC" "DTC" "DTC" ...
##  $ Dscrptn : chr  "Hospital Extension Clinic" "Hospital Extension Clinic" "Diagnostic and Treatment Center" "Diagnostic and Treatment Center" ...
##  $ Fcl_A_1 : chr  "305 Second Ave" "6317 4th Avenue" "344 West 51 Street" "800 Second Avenue, 6th Floor" ...
##  $ Fcl_A_2 : chr  NA NA NA NA ...
##  $ Fclty_Ct: chr  "New York" "Brooklyn" "New York" "New York" ...
##  $ Fclty_S : chr  "New York" "New York" "New York" "New York" ...
##  $ Fcl_Z_C : chr  "10003" "11220" "10019" "10017" ...
##  $ Fcl_C_C : int  7093 7095 7093 7093 7094 7095 7094 7095 7096 7096 ...
##  $ Fclty_Cn: chr  "New York" "Kings" "New York" "New York" ...
##  $ geometry:sfc_POINT of length 1292; first list element:  'XY' num  -74 40.7
##  - attr(*, "sf_column")= chr "geometry"
##  - attr(*, "agr")= Factor w/ 3 levels "constant","aggregate",..: NA NA NA NA NA NA NA NA NA NA ...
##   ..- attr(*, "names")= chr [1:11] "Fclt_ID" "Fclty_N" "Shrt_Ds" "Dscrptn" ...
st_crs(Health_Fac)
## Coordinate Reference System:
##   User input: WGS 84 
##   wkt:
## GEOGCRS["WGS 84",
##     DATUM["World Geodetic System 1984",
##         ELLIPSOID["WGS 84",6378137,298.257223563,
##             LENGTHUNIT["metre",1]]],
##     PRIMEM["Greenwich",0,
##         ANGLEUNIT["degree",0.0174532925199433]],
##     CS[ellipsoidal,2],
##         AXIS["latitude",north,
##             ORDER[1],
##             ANGLEUNIT["degree",0.0174532925199433]],
##         AXIS["longitude",east,
##             ORDER[2],
##             ANGLEUNIT["degree",0.0174532925199433]],
##     ID["EPSG",4326]]
st_crs(Covid_ZIP)
## Coordinate Reference System:
##   User input: NAD83 / New York Long Island (ftUS) 
##   wkt:
## PROJCRS["NAD83 / New York Long Island (ftUS)",
##     BASEGEOGCRS["NAD83",
##         DATUM["North American Datum 1983",
##             ELLIPSOID["GRS 1980",6378137,298.257222101,
##                 LENGTHUNIT["metre",1]]],
##         PRIMEM["Greenwich",0,
##             ANGLEUNIT["degree",0.0174532925199433]],
##         ID["EPSG",4269]],
##     CONVERSION["SPCS83 New York Long Island zone (US Survey feet)",
##         METHOD["Lambert Conic Conformal (2SP)",
##             ID["EPSG",9802]],
##         PARAMETER["Latitude of false origin",40.1666666666667,
##             ANGLEUNIT["degree",0.0174532925199433],
##             ID["EPSG",8821]],
##         PARAMETER["Longitude of false origin",-74,
##             ANGLEUNIT["degree",0.0174532925199433],
##             ID["EPSG",8822]],
##         PARAMETER["Latitude of 1st standard parallel",41.0333333333333,
##             ANGLEUNIT["degree",0.0174532925199433],
##             ID["EPSG",8823]],
##         PARAMETER["Latitude of 2nd standard parallel",40.6666666666667,
##             ANGLEUNIT["degree",0.0174532925199433],
##             ID["EPSG",8824]],
##         PARAMETER["Easting at false origin",984250,
##             LENGTHUNIT["US survey foot",0.304800609601219],
##             ID["EPSG",8826]],
##         PARAMETER["Northing at false origin",0,
##             LENGTHUNIT["US survey foot",0.304800609601219],
##             ID["EPSG",8827]]],
##     CS[Cartesian,2],
##         AXIS["easting (X)",east,
##             ORDER[1],
##             LENGTHUNIT["US survey foot",0.304800609601219]],
##         AXIS["northing (Y)",north,
##             ORDER[2],
##             LENGTHUNIT["US survey foot",0.304800609601219]],
##     USAGE[
##         SCOPE["unknown"],
##         AREA["USA - New York - SPCS - Long Island"],
##         BBOX[40.47,-74.26,41.3,-71.8]],
##     ID["EPSG",2263]]
# Change Health Fac crs to 2263
st_transform(Health_Fac, st_crs(2263)) -> Health_Fac_Local

st_crs(Health_Fac_Local)
## Coordinate Reference System:
##   User input: EPSG:2263 
##   wkt:
## PROJCRS["NAD83 / New York Long Island (ftUS)",
##     BASEGEOGCRS["NAD83",
##         DATUM["North American Datum 1983",
##             ELLIPSOID["GRS 1980",6378137,298.257222101,
##                 LENGTHUNIT["metre",1]]],
##         PRIMEM["Greenwich",0,
##             ANGLEUNIT["degree",0.0174532925199433]],
##         ID["EPSG",4269]],
##     CONVERSION["SPCS83 New York Long Island zone (US Survey feet)",
##         METHOD["Lambert Conic Conformal (2SP)",
##             ID["EPSG",9802]],
##         PARAMETER["Latitude of false origin",40.1666666666667,
##             ANGLEUNIT["degree",0.0174532925199433],
##             ID["EPSG",8821]],
##         PARAMETER["Longitude of false origin",-74,
##             ANGLEUNIT["degree",0.0174532925199433],
##             ID["EPSG",8822]],
##         PARAMETER["Latitude of 1st standard parallel",41.0333333333333,
##             ANGLEUNIT["degree",0.0174532925199433],
##             ID["EPSG",8823]],
##         PARAMETER["Latitude of 2nd standard parallel",40.6666666666667,
##             ANGLEUNIT["degree",0.0174532925199433],
##             ID["EPSG",8824]],
##         PARAMETER["Easting at false origin",984250,
##             LENGTHUNIT["US survey foot",0.304800609601219],
##             ID["EPSG",8826]],
##         PARAMETER["Northing at false origin",0,
##             LENGTHUNIT["US survey foot",0.304800609601219],
##             ID["EPSG",8827]]],
##     CS[Cartesian,2],
##         AXIS["easting (X)",east,
##             ORDER[1],
##             LENGTHUNIT["US survey foot",0.304800609601219]],
##         AXIS["northing (Y)",north,
##             ORDER[2],
##             LENGTHUNIT["US survey foot",0.304800609601219]],
##     USAGE[
##         SCOPE["unknown"],
##         AREA["USA - New York - SPCS - Long Island"],
##         BBOX[40.47,-74.26,41.3,-71.8]],
##     ID["EPSG",2263]]
Health_Fac %<>% st_transform(st_crs(Covid_ZIP))

not_valid <- c("CHHA", "ADHCP", "HOSP-SB", "LTHHCP", "NA")

Health_Fac_Local %>%  filter(Shrt_Ds != "CHHA") %>% 
  filter(Shrt_Ds != "ADHCP") %>% 
  filter(Shrt_Ds != "HOSP-SB") %>%
  filter(Shrt_Ds != "LTHHCP") %>%
  filter(Shrt_Ds != "NA") %>% 
  st_join(Covid_ZIP, ., join = st_contains) %>%
  filter(Shrt_Ds != "NA")-> Health_Fac_Join



Health_Fac_Join %>% 
  group_by(ZIPCODE) %>% 
  summarise(HealthFacNUM = n()) %>% 
  magrittr::extract('HealthFacNUM') %>%
  plot(breaks = "jenks", main = 'Number of Health Facilities')

## Produces some blank areas. Potentially due to lack of existing facilities for that area. 


## Trouble shooting the error: 
## Error in cut.default(values, breaks, include.lowest = TRUE) : 
## 'breaks' are not unique
# st_crs(Health_Fac_Join)
# st_crs(Covid_ZIP)
# range(st_coordinates(Health_Fac_Local)) ##  127612.4 1069728.4
# range(st_coordinates(Health_Fac_Join)) ## 1 1067113 
# range(st_coordinates(nycFoodStoreJoin)) ## 1 1067113  r operates on this so this must be normal for the above join as well
# SOlved by removing Shrt_Ds from the grouping function. 


# Task 4: Join the Census ACS population, race, and age data to the NYC Planning Census Tract Data.

Census_TractSF <- st_read("./2010 Census Tracts/geo_export_1dc7b645-647b-4806-b9a0-7b79660f120a.shp", stringsAsFactors = FALSE)
## Reading layer `geo_export_1dc7b645-647b-4806-b9a0-7b79660f120a' from data source `/Volumes/Storage/School/'21 (1) Spring/GTECH 38520 R/Homework/Wk 11_15/Wk 12/2010 Census Tracts/geo_export_1dc7b645-647b-4806-b9a0-7b79660f120a.shp' using driver `ESRI Shapefile'
## Simple feature collection with 2165 features and 11 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: -74.25559 ymin: 40.49612 xmax: -73.70001 ymax: 40.91553
## Geodetic CRS:  WGS84(DD)
Census_TractSF %<>% dplyr::mutate(cntyFIPS = dplyr::case_when(
  boro_name == 'Bronx' ~ '005',
  boro_name == 'Brooklyn' ~ '047',
  boro_name == 'Manhattan' ~ '061',
  boro_name == 'Queens' ~ '081',
  boro_name == 'Staten Island' ~ '085'),
  tractFIPS = paste(cntyFIPS, ct2010, sep='')
)

## I was worried the first line would interfere with subsequent operations, but on my original join, it disappears.

ACS_DATA <- readLines("./ACSDP5Y2018.DP05_data_with_overlays_2020-04-22T132935.csv") %>% 
  magrittr::extract(-2) %>% 
  textConnection() %>%
  read.csv(header=TRUE, quote= "\"") %>%
  dplyr::select(GEO_ID, 
                totPop = DP05_0001E, elderlyPop = DP05_0024E, # >= 65
                malePop = DP05_0002E, femalePop = DP05_0003E,  
                whitePop = DP05_0037E, blackPop = DP05_0038E,
                asianPop = DP05_0067E, hispanicPop = DP05_0071E,
                adultPop = DP05_0021E, citizenAdult = DP05_0087E) %>%
  dplyr::mutate(censusCode = stringr::str_sub(GEO_ID, -9,-1));

## displays the first ten rows in console
ACS_DATA %>%
  magrittr::extract(1:10,)
##                  GEO_ID totPop elderlyPop malePop femalePop whitePop blackPop
## 1  1400000US36005000100   7080         51    6503       577     1773     4239
## 2  1400000US36005000200   4542        950    2264      2278     2165     1279
## 3  1400000US36005000400   5634        710    2807      2827     2623     1699
## 4  1400000US36005001600   5917        989    2365      3552     2406     2434
## 5  1400000US36005001900   2765         76    1363      1402      585     1041
## 6  1400000US36005002000   9409        977    4119      5290     3185     4487
## 7  1400000US36005002300   4600        648    2175      2425      479     2122
## 8  1400000US36005002400    172          0     121        51       69       89
## 9  1400000US36005002500   5887        548    2958      2929      903     1344
## 10 1400000US36005002701   2868        243    1259      1609      243      987
##    asianPop hispanicPop adultPop citizenAdult censusCode
## 1       130        2329     6909         6100  005000100
## 2       119        3367     3582         2952  005000200
## 3       226        3873     4507         4214  005000400
## 4        68        3603     4416         3851  005001600
## 5       130        1413     2008         1787  005001900
## 6        29        5905     6851         6170  005002000
## 7        27        2674     3498         3056  005002300
## 8        14           0      131           42  005002400
## 9        68        4562     4237         2722  005002500
## 10        0        1985     1848         1412  005002701
## An SQL example from github:

  #   CASE WHEN (substring(geo_id2,4,2) = '61')
  #   THEN '1'
  #   WHEN (substring(geo_id2,4,2) = '05')
  #   THEN '2'
  #   WHEN (substring(geo_id2,4,2) = '47')
  #   THEN '3'
  #   WHEN (substring(geo_id2,4,2) = '81')
  #   THEN '4' 
  #   WHEN (substring(geo_id2,4,2) = '85')
  #   THEN '5'
  #   END
  # ) || right(geo_id2,6)

## R Method
## Replace the first digit of the boro ct 201 column with the corresponding geo id code. 

##My Original Attempt at modifying the borough codes
# Census_TractSF$boro_ct201 <- gsub('^1','061',Census_TractSF$boro_ct201)
# Census_TractSF$boro_ct201 <- gsub('^2','005',Census_TractSF$boro_ct201)
# Census_TractSF$boro_ct201 <- gsub('^4','081',Census_TractSF$boro_ct201)
# Census_TractSF$boro_ct201 <- gsub('^3','047',Census_TractSF$boro_ct201)
# Census_TractSF$boro_ct201 <- gsub('^5','085',Census_TractSF$boro_ct201)

str(Census_TractSF)
## Classes 'sf' and 'data.frame':   2165 obs. of  14 variables:
##  $ boro_code : chr  "5" "1" "1" "1" ...
##  $ boro_ct201: chr  "5000900" "1009800" "1010000" "1010200" ...
##  $ boro_name : chr  "Staten Island" "Manhattan" "Manhattan" "Manhattan" ...
##  $ cdeligibil: chr  "E" "I" "I" "I" ...
##  $ ct2010    : chr  "000900" "009800" "010000" "010200" ...
##  $ ctlabel   : chr  "9" "98" "100" "102" ...
##  $ ntacode   : chr  "SI22" "MN19" "MN19" "MN17" ...
##  $ ntaname   : chr  "West New Brighton-New Brighton-St. George" "Turtle Bay-East Midtown" "Turtle Bay-East Midtown" "Midtown-Midtown South" ...
##  $ puma      : chr  "3903" "3808" "3808" "3807" ...
##  $ shape_area: num  2497010 1906016 1860938 1860993 1864600 ...
##  $ shape_leng: num  7729 5534 5692 5688 5693 ...
##  $ geometry  :sfc_MULTIPOLYGON of length 2165; first list element: List of 1
##   ..$ :List of 1
##   .. ..$ : num [1:28, 1:2] -74.1 -74.1 -74.1 -74.1 -74.1 ...
##   ..- attr(*, "class")= chr [1:3] "XY" "MULTIPOLYGON" "sfg"
##  $ cntyFIPS  : chr  "085" "061" "061" "061" ...
##  $ tractFIPS : chr  "085000900" "061009800" "061010000" "061010200" ...
##  - attr(*, "sf_column")= chr "geometry"
##  - attr(*, "agr")= Factor w/ 3 levels "constant","aggregate",..: NA NA NA NA NA NA NA NA NA NA ...
##   ..- attr(*, "names")= chr [1:13] "boro_code" "boro_ct201" "boro_name" "cdeligibil" ...
str(ACS_DATA)
## 'data.frame':    2167 obs. of  12 variables:
##  $ GEO_ID      : chr  "1400000US36005000100" "1400000US36005000200" "1400000US36005000400" "1400000US36005001600" ...
##  $ totPop      : int  7080 4542 5634 5917 2765 9409 4600 172 5887 2868 ...
##  $ elderlyPop  : int  51 950 710 989 76 977 648 0 548 243 ...
##  $ malePop     : int  6503 2264 2807 2365 1363 4119 2175 121 2958 1259 ...
##  $ femalePop   : int  577 2278 2827 3552 1402 5290 2425 51 2929 1609 ...
##  $ whitePop    : int  1773 2165 2623 2406 585 3185 479 69 903 243 ...
##  $ blackPop    : int  4239 1279 1699 2434 1041 4487 2122 89 1344 987 ...
##  $ asianPop    : int  130 119 226 68 130 29 27 14 68 0 ...
##  $ hispanicPop : int  2329 3367 3873 3603 1413 5905 2674 0 4562 1985 ...
##  $ adultPop    : int  6909 3582 4507 4416 2008 6851 3498 131 4237 1848 ...
##  $ citizenAdult: int  6100 2952 4214 3851 1787 6170 3056 42 2722 1412 ...
##  $ censusCode  : chr  "005000100" "005000200" "005000400" "005001600" ...
# Makes a new column out of the final 8 digits of GEO_ID. Should be 8 columns
# ACS_DATA %>% mutate(., boro_match = str_sub(ACS_DATA$GEO_ID ,-8,-1)) -> ACS_DATA
# str(ACS_DATA)

# left_join(Census_TractSF, ACS_DATA, by = c("boro_ct201" = "boro_match")) -> ACS_to_TRACT

ACS_to_TRACT <- merge(Census_TractSF, ACS_DATA, by.x ='tractFIPS', by.y = 'censusCode')

sum(ACS_to_TRACT$totPop)
## [1] 8443713
# Task 5

# reprojects the ACS_to_TRACT table
popNYC <- sf::st_transform(ACS_to_TRACT, st_crs(Covid_ZIP))
  
st_crs(ACS_to_TRACT)
## Coordinate Reference System:
##   User input: WGS84(DD) 
##   wkt:
## GEOGCRS["WGS84(DD)",
##     DATUM["WGS84",
##         ELLIPSOID["WGS84",6378137,298.257223563,
##             LENGTHUNIT["metre",1,
##                 ID["EPSG",9001]]]],
##     PRIMEM["Greenwich",0,
##         ANGLEUNIT["degree",0.0174532925199433]],
##     CS[ellipsoidal,2],
##         AXIS["geodetic longitude",east,
##             ORDER[1],
##             ANGLEUNIT["degree",0.0174532925199433]],
##         AXIS["geodetic latitude",north,
##             ORDER[2],
##             ANGLEUNIT["degree",0.0174532925199433]]]
ACS_to_TRACT %>% magrittr::extract('elderlyPop') %>% plot(breaks = 'jenks')

covidPopZipNYC <- sf::st_join(Covid_ZIP, 
                              popNYC %>% sf::st_centroid(), # converts census tracts to points
                              join = st_contains) %>% 
  group_by(ZIPCODE, PO_NAME, POPULATION, COUNTY, COVID_CASE_COUNT, TOTAL_COVID_TESTS) %>% 
  summarise(totPop = sum(totPop),
            malePctg = sum(malePop)/totPop*100, 
            asianPop = sum(asianPop),
            blackPop = sum(blackPop),
            hispanicPop = sum(hispanicPop),
            whitePop = sum(whitePop)) 


covidPopZipNYC %>% head()
## Simple feature collection with 6 features and 12 fields
## Geometry type: GEOMETRY
## Dimension:     XY
## Bounding box:  xmin: 971132.6 ymin: 188447.3 xmax: 992172.8 ymax: 215324.9
## Projected CRS: NAD83 / New York Long Island (ftUS)
## # A tibble: 6 x 13
## # Groups:   ZIPCODE, PO_NAME, POPULATION, COUNTY, COVID_CASE_COUNT [6]
##   ZIPCODE PO_NAME POPULATION COUNTY COVID_CASE_COUNT TOTAL_COVID_TES… totPop
##     <dbl> <chr>        <dbl> <chr>             <int>            <int>  <int>
## 1   10001 New Yo…      22413 New Y…             1542            20158  19146
## 2   10002 New Yo…      81305 New Y…             5902            48197  74310
## 3   10003 New Yo…      55878 New Y…             2803            41076  53487
## 4   10004 New Yo…       2187 New Y…              247             3599     NA
## 5   10005 New Yo…       8107 New Y…              413             6102   8809
## 6   10006 New Yo…       3011 New Y…              164             2441   4639
## # … with 6 more variables: malePctg <dbl>, asianPop <int>, blackPop <int>,
## #   hispanicPop <int>, whitePop <int>, geometry <GEOMETRY [US_survey_foot]>
sum(covidPopZipNYC$totPop, na.rm = TRUE)
## [1] 8334092
## My na's do not match nor does my population, but I suspect this is because I removed them in earlier tasks. 
## So the only ones I have left are the result of more recent operations. 

covidPopZipNYC %>% dplyr::filter(is.na(totPop))
## Simple feature collection with 7 features and 12 fields
## Geometry type: GEOMETRY
## Dimension:     XY
## Bounding box:  xmin: 971132.6 ymin: 151085.5 xmax: 1049202 ymax: 263362
## Projected CRS: NAD83 / New York Long Island (ftUS)
## # A tibble: 7 x 13
## # Groups:   ZIPCODE, PO_NAME, POPULATION, COUNTY, COVID_CASE_COUNT [7]
##   ZIPCODE PO_NAME POPULATION COUNTY COVID_CASE_COUNT TOTAL_COVID_TES… totPop
## *   <dbl> <chr>        <dbl> <chr>             <int>            <int>  <int>
## 1   10004 New Yo…       2187 New Y…              247             3599     NA
## 2   10075 New Yo…      25203 New Y…             1453            18391     NA
## 3   10464 Bronx         4438 Bronx               387             2933     NA
## 4   11109 Long I…       2752 Queens              354             4687     NA
## 5   11231 Brookl…      33144 Kings              1842            24431     NA
## 6   11693 Far Ro…      11052 Kings              1111             7027     NA
## 7   11693 Far Ro…      11052 Queens             1111             7027     NA
## # … with 6 more variables: malePctg <dbl>, asianPop <int>, blackPop <int>,
## #   hispanicPop <int>, whitePop <int>, geometry <GEOMETRY [US_survey_foot]>
plot(covidPopZipNYC["COVID_CASE_COUNT"], breaks='jenks')

plot(covidPopZipNYC["asianPop"], breaks='jenks')