Task 1:
##Task 1:Join the COVID-19 data to the NYC zip code area data (sf or sp polygons)
nyc_postal_sf <- st_read("C:\\Users\\Admin\\OneDrive\\Documents\\Data & Vis in R\\Week7_10\\R_project7_10\\data\\ZIP_CODE_040114.shp")
## Reading layer `ZIP_CODE_040114' from data source
## `C:\Users\Admin\OneDrive\Documents\Data & Vis in R\Week7_10\R_project7_10\data\ZIP_CODE_040114.shp'
## using driver `ESRI Shapefile'
## Simple feature collection with 263 features and 12 fields
## Geometry type: POLYGON
## Dimension: XY
## Bounding box: xmin: 913129 ymin: 120020.9 xmax: 1067494 ymax: 272710.9
## Projected CRS: NAD83 / New York Long Island (ftUS)
covid_data <- readr::read_csv("C:\\Users\\Admin\\OneDrive\\Documents\\Data & Vis in R\\Week7_10\\R_project7_10\\data\\R-Spatial_II_Lab\\tests-by-zcta_2021_04_23.csv")
## Rows: 177 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): NEIGHBORHOOD_NAME, BOROUGH_GROUP, label
## dbl (10): MODIFIED_ZCTA, lat, lon, COVID_CASE_COUNT, COVID_CASE_RATE, POP_DE...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
postal_covid_merged <- base::merge(nyc_postal_sf, covid_data, by.x = "ZIPCODE", by.y = "MODIFIED_ZCTA")
names(postal_covid_merged)
## [1] "ZIPCODE" "BLDGZIP" "PO_NAME"
## [4] "POPULATION" "AREA" "STATE"
## [7] "COUNTY" "ST_FIPS" "CTY_FIPS"
## [10] "URL" "SHAPE_AREA" "SHAPE_LEN"
## [13] "NEIGHBORHOOD_NAME" "BOROUGH_GROUP" "label"
## [16] "lat" "lon" "COVID_CASE_COUNT"
## [19] "COVID_CASE_RATE" "POP_DENOMINATOR" "COVID_DEATH_COUNT"
## [22] "COVID_DEATH_RATE" "PERCENT_POSITIVE" "TOTAL_COVID_TESTS"
## [25] "geometry"
st_crs(postal_covid_merged)
## Coordinate Reference System:
## User input: NAD83 / New York Long Island (ftUS)
## wkt:
## PROJCRS["NAD83 / New York Long Island (ftUS)",
## BASEGEOGCRS["NAD83",
## DATUM["North American Datum 1983",
## ELLIPSOID["GRS 1980",6378137,298.257222101,
## LENGTHUNIT["metre",1]]],
## PRIMEM["Greenwich",0,
## ANGLEUNIT["degree",0.0174532925199433]],
## ID["EPSG",4269]],
## CONVERSION["SPCS83 New York Long Island zone (US survey foot)",
## METHOD["Lambert Conic Conformal (2SP)",
## ID["EPSG",9802]],
## PARAMETER["Latitude of false origin",40.1666666666667,
## ANGLEUNIT["degree",0.0174532925199433],
## ID["EPSG",8821]],
## PARAMETER["Longitude of false origin",-74,
## ANGLEUNIT["degree",0.0174532925199433],
## ID["EPSG",8822]],
## PARAMETER["Latitude of 1st standard parallel",41.0333333333333,
## ANGLEUNIT["degree",0.0174532925199433],
## ID["EPSG",8823]],
## PARAMETER["Latitude of 2nd standard parallel",40.6666666666667,
## ANGLEUNIT["degree",0.0174532925199433],
## ID["EPSG",8824]],
## PARAMETER["Easting at false origin",984250,
## LENGTHUNIT["US survey foot",0.304800609601219],
## ID["EPSG",8826]],
## PARAMETER["Northing at false origin",0,
## LENGTHUNIT["US survey foot",0.304800609601219],
## ID["EPSG",8827]]],
## CS[Cartesian,2],
## AXIS["easting (X)",east,
## ORDER[1],
## LENGTHUNIT["US survey foot",0.304800609601219]],
## AXIS["northing (Y)",north,
## ORDER[2],
## LENGTHUNIT["US survey foot",0.304800609601219]],
## USAGE[
## SCOPE["Engineering survey, topographic mapping."],
## AREA["United States (USA) - New York - counties of Bronx; Kings; Nassau; New York; Queens; Richmond; Suffolk."],
## BBOX[40.47,-74.26,41.3,-71.8]],
## ID["EPSG",2263]]
Task 2:
##Task 2: Aggregate the NYC food retails store data (points) to the zip code data,
#so that we know how many retail stores in each zip code area. Note that not all,
#locations are for food retail. And we need to choose the specific types according to the data.
nyc_retail<- st_read("C:\\Users\\Admin\\OneDrive\\Documents\\Data & Vis in R\\Week7_10\\R_project7_10\\data\\R-Spatial_II_Lab\\nycFoodStore.shp")
## Reading layer `nycFoodStore' from data source
## `C:\Users\Admin\OneDrive\Documents\Data & Vis in R\Week7_10\R_project7_10\data\R-Spatial_II_Lab\nycFoodStore.shp'
## using driver `ESRI Shapefile'
## Simple feature collection with 11300 features and 16 fields
## Geometry type: POINT
## Dimension: XY
## Bounding box: xmin: -74.2484 ymin: 40.50782 xmax: -73.67061 ymax: 40.91008
## Geodetic CRS: WGS 84
names(nyc_retail)
## [1] "ï__Cnty" "Lcns_Nm" "Oprtn_T" "Estbl_T" "Entty_N" "DBA_Nam"
## [7] "Strt_Nmb" "Stret_Nm" "Add_L_2" "Add_L_3" "City" "State"
## [13] "Zip_Cod" "Sqr_Ftg" "Locatin" "Coords" "geometry"
filter_stores <- nyc_retail %>%
filter(Estbl_T == 'JAC') #filter for food retail
#change the CRS so they are the same
filter_stores <- sf::st_transform(filter_stores, sf::st_crs(postal_covid_merged))
# Count stores within each ZIP code
store_counts <- st_join(filter_stores, postal_covid_merged, join = st_intersects) %>%
st_drop_geometry() %>% ## because we only need the counts
count(ZIPCODE, name = "n_stores") #made a new coulum to store a count by zipcode
# Add the counts back to original Zip code data
postal_covid_merged <- postal_covid_merged %>%
left_join(store_counts, by = "ZIPCODE") %>%
replace_na(list(n_stores = 0)) # Convert NA to 0 for zipcodes that dont have any stores
Task 3:
##Task 3: Aggregate the NYC health facilities (points) to the zip code data.
#Similarly, choose appropriate subtypes such as nursing homes from the facilities.
nyc_health <- read_csv("C:\\Users\\Admin\\OneDrive\\Documents\\Data & Vis in R\\Week7_10\\R_project7_10\\data\\NYS_Health_Facility.csv")
## Rows: 3990 Columns: 36
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (28): Facility Name, Short Description, Description, Facility Open Date,...
## dbl (8): Facility ID, Facility Phone Number, Facility Fax Number, Facility ...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
nyc_health_NH <- nyc_health %>%
filter(`Short Description` == 'NH') # NH = Nursing Homes filter
#count the number of NH per zipcode
facility_counts <- nyc_health_NH %>%
count(`Facility Zip Code`, name = "n_facilities")
#add the count of NH to the rest of the data
zpNYC <- left_join(postal_covid_merged,facility_counts, by = c("ZIPCODE" = "Facility Zip Code"))
st_crs(zpNYC)
## Coordinate Reference System:
## User input: NAD83 / New York Long Island (ftUS)
## wkt:
## PROJCRS["NAD83 / New York Long Island (ftUS)",
## BASEGEOGCRS["NAD83",
## DATUM["North American Datum 1983",
## ELLIPSOID["GRS 1980",6378137,298.257222101,
## LENGTHUNIT["metre",1]]],
## PRIMEM["Greenwich",0,
## ANGLEUNIT["degree",0.0174532925199433]],
## ID["EPSG",4269]],
## CONVERSION["SPCS83 New York Long Island zone (US survey foot)",
## METHOD["Lambert Conic Conformal (2SP)",
## ID["EPSG",9802]],
## PARAMETER["Latitude of false origin",40.1666666666667,
## ANGLEUNIT["degree",0.0174532925199433],
## ID["EPSG",8821]],
## PARAMETER["Longitude of false origin",-74,
## ANGLEUNIT["degree",0.0174532925199433],
## ID["EPSG",8822]],
## PARAMETER["Latitude of 1st standard parallel",41.0333333333333,
## ANGLEUNIT["degree",0.0174532925199433],
## ID["EPSG",8823]],
## PARAMETER["Latitude of 2nd standard parallel",40.6666666666667,
## ANGLEUNIT["degree",0.0174532925199433],
## ID["EPSG",8824]],
## PARAMETER["Easting at false origin",984250,
## LENGTHUNIT["US survey foot",0.304800609601219],
## ID["EPSG",8826]],
## PARAMETER["Northing at false origin",0,
## LENGTHUNIT["US survey foot",0.304800609601219],
## ID["EPSG",8827]]],
## CS[Cartesian,2],
## AXIS["easting (X)",east,
## ORDER[1],
## LENGTHUNIT["US survey foot",0.304800609601219]],
## AXIS["northing (Y)",north,
## ORDER[2],
## LENGTHUNIT["US survey foot",0.304800609601219]],
## USAGE[
## SCOPE["Engineering survey, topographic mapping."],
## AREA["United States (USA) - New York - counties of Bronx; Kings; Nassau; New York; Queens; Richmond; Suffolk."],
## BBOX[40.47,-74.26,41.3,-71.8]],
## ID["EPSG",2263]]
Task 4 & 5
##Task 4: Join the Census ACS population, race, and age data to the NYC Planning Census Tract Data.
# The NYC census tracts
nycCensus <- sf::st_read("C:\\Users\\Admin\\OneDrive\\Documents\\Data & Vis in R\\Week7_10\\R_project7_10\\data\\geo_export_1dc7b645-647b-4806-b9a0-7b79660f120a.shp",
stringsAsFactors = FALSE)
## Reading layer `geo_export_1dc7b645-647b-4806-b9a0-7b79660f120a' from data source `C:\Users\Admin\OneDrive\Documents\Data & Vis in R\Week7_10\R_project7_10\data\geo_export_1dc7b645-647b-4806-b9a0-7b79660f120a.shp'
## using driver `ESRI Shapefile'
## Simple feature collection with 2165 features and 11 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: -74.25559 ymin: 40.49612 xmax: -73.70001 ymax: 40.91553
## Geodetic CRS: WGS84(DD)
str(nycCensus)
## Classes 'sf' and 'data.frame': 2165 obs. of 12 variables:
## $ boro_code : chr "5" "1" "1" "1" ...
## $ boro_ct201: chr "5000900" "1009800" "1010000" "1010200" ...
## $ boro_name : chr "Staten Island" "Manhattan" "Manhattan" "Manhattan" ...
## $ cdeligibil: chr "E" "I" "I" "I" ...
## $ ct2010 : chr "000900" "009800" "010000" "010200" ...
## $ ctlabel : chr "9" "98" "100" "102" ...
## $ ntacode : chr "SI22" "MN19" "MN19" "MN17" ...
## $ ntaname : chr "West New Brighton-New Brighton-St. George" "Turtle Bay-East Midtown" "Turtle Bay-East Midtown" "Midtown-Midtown South" ...
## $ puma : chr "3903" "3808" "3808" "3807" ...
## $ shape_area: num 2497010 1906016 1860938 1860993 1864600 ...
## $ shape_leng: num 7729 5534 5692 5688 5693 ...
## $ geometry :sfc_MULTIPOLYGON of length 2165; first list element: List of 1
## ..$ :List of 1
## .. ..$ : num [1:28, 1:2] -74.1 -74.1 -74.1 -74.1 -74.1 ...
## ..- attr(*, "class")= chr [1:3] "XY" "MULTIPOLYGON" "sfg"
## - attr(*, "sf_column")= chr "geometry"
## - attr(*, "agr")= Factor w/ 3 levels "constant","aggregate",..: NA NA NA NA NA NA NA NA NA NA ...
## ..- attr(*, "names")= chr [1:11] "boro_code" "boro_ct201" "boro_name" "cdeligibil" ...
# But it does not use the standard County FIPS, but boro code
nycCensus %<>% dplyr::mutate(cntyFIPS = case_when(
boro_name == 'Bronx' ~ '005',
boro_name == 'Brooklyn' ~ '047',
boro_name == 'Manhattan' ~ '061',
boro_name == 'Queens' ~ '081',
boro_name == 'Staten Island' ~ '085'),
tractFIPS = paste(cntyFIPS, ct2010, sep='')
)
#Make it the same CRS (NAD 83)
nycCensus<- st_transform(nycCensus, 2263)
# The ACS data from CENSUS website, 2018 data on population
# We used readLines to bypass the second line/row in the csv file.
# You can also manually delete that line first, and run the read_csv or read.cvs.
acsData <- readLines("C:\\Users\\Admin\\OneDrive\\Documents\\Data & Vis in R\\Week7_10\\R_project7_10\\data\\R-Spatial_II_Lab\\ACSDP5Y2018.DP05_data_with_overlays_2020-04-22T132935.csv") %>%
magrittr::extract(-2) %>%
textConnection() %>%
read.csv(header=TRUE, quote= "\"") %>%
dplyr::select(GEO_ID,
totPop = DP05_0001E, elderlyPop = DP05_0024E, # >= 65
malePop = DP05_0002E, femalePop = DP05_0003E,
whitePop = DP05_0037E, blackPop = DP05_0038E,
asianPop = DP05_0067E, hispanicPop = DP05_0071E,
adultPop = DP05_0021E, citizenAdult = DP05_0087E) %>%
dplyr::mutate(censusCode = stringr::str_sub(GEO_ID, -9,-1));
acsData %>%
magrittr::extract(1:10,)
## GEO_ID totPop elderlyPop malePop femalePop whitePop blackPop
## 1 1400000US36005000100 7080 51 6503 577 1773 4239
## 2 1400000US36005000200 4542 950 2264 2278 2165 1279
## 3 1400000US36005000400 5634 710 2807 2827 2623 1699
## 4 1400000US36005001600 5917 989 2365 3552 2406 2434
## 5 1400000US36005001900 2765 76 1363 1402 585 1041
## 6 1400000US36005002000 9409 977 4119 5290 3185 4487
## 7 1400000US36005002300 4600 648 2175 2425 479 2122
## 8 1400000US36005002400 172 0 121 51 69 89
## 9 1400000US36005002500 5887 548 2958 2929 903 1344
## 10 1400000US36005002701 2868 243 1259 1609 243 987
## asianPop hispanicPop adultPop citizenAdult censusCode
## 1 130 2329 6909 6100 005000100
## 2 119 3367 3582 2952 005000200
## 3 226 3873 4507 4214 005000400
## 4 68 3603 4416 3851 005001600
## 5 130 1413 2008 1787 005001900
## 6 29 5905 6851 6170 005002000
## 7 27 2674 3498 3056 005002300
## 8 14 0 131 42 005002400
## 9 68 4562 4237 2722 005002500
## 10 0 1985 1848 1412 005002701
# Merge (JOIN) ACS data to the census tracts
popData <- merge(nycCensus, acsData, by.x ='tractFIPS', by.y = 'censusCode')
# verify the data
sum(popData$totPop)
## [1] 8443713
st_crs(popData)
## Coordinate Reference System:
## User input: EPSG:2263
## wkt:
## PROJCRS["NAD83 / New York Long Island (ftUS)",
## BASEGEOGCRS["NAD83",
## DATUM["North American Datum 1983",
## ELLIPSOID["GRS 1980",6378137,298.257222101,
## LENGTHUNIT["metre",1]]],
## PRIMEM["Greenwich",0,
## ANGLEUNIT["degree",0.0174532925199433]],
## ID["EPSG",4269]],
## CONVERSION["SPCS83 New York Long Island zone (US survey foot)",
## METHOD["Lambert Conic Conformal (2SP)",
## ID["EPSG",9802]],
## PARAMETER["Latitude of false origin",40.1666666666667,
## ANGLEUNIT["degree",0.0174532925199433],
## ID["EPSG",8821]],
## PARAMETER["Longitude of false origin",-74,
## ANGLEUNIT["degree",0.0174532925199433],
## ID["EPSG",8822]],
## PARAMETER["Latitude of 1st standard parallel",41.0333333333333,
## ANGLEUNIT["degree",0.0174532925199433],
## ID["EPSG",8823]],
## PARAMETER["Latitude of 2nd standard parallel",40.6666666666667,
## ANGLEUNIT["degree",0.0174532925199433],
## ID["EPSG",8824]],
## PARAMETER["Easting at false origin",984250,
## LENGTHUNIT["US survey foot",0.304800609601219],
## ID["EPSG",8826]],
## PARAMETER["Northing at false origin",0,
## LENGTHUNIT["US survey foot",0.304800609601219],
## ID["EPSG",8827]]],
## CS[Cartesian,2],
## AXIS["easting (X)",east,
## ORDER[1],
## LENGTHUNIT["US survey foot",0.304800609601219]],
## AXIS["northing (Y)",north,
## ORDER[2],
## LENGTHUNIT["US survey foot",0.304800609601219]],
## USAGE[
## SCOPE["Engineering survey, topographic mapping."],
## AREA["United States (USA) - New York - counties of Bronx; Kings; Nassau; New York; Queens; Richmond; Suffolk."],
## BBOX[40.47,-74.26,41.3,-71.8]],
## ID["EPSG",2263]]
# this may take a long time to run
popData %>% magrittr::extract('elderlyPop') %>% plot(breaks = 'jenks')

# Now aggregate to the zip code level
covidPopZipNYC <- sf::st_join(zpNYC,
popData %>% sf::st_centroid(), # this essentially converts census tracts to points
join = st_contains) %>%
group_by(ZIPCODE, PO_NAME, POPULATION, COUNTY, COVID_CASE_COUNT, TOTAL_COVID_TESTS) %>% # use names(zpNYC) and names(popNYC) to see what we have
summarise(totPop = sum(totPop),
malePctg = sum(malePop)/totPop*100, # note the totPop is the newly calculated
asianPop = sum(asianPop),
blackPop = sum(blackPop),
hispanicPop = sum(hispanicPop),
whitePop = sum(whitePop))
## Warning: st_centroid assumes attributes are constant over geometries
## `summarise()` has grouped output by 'ZIPCODE', 'PO_NAME', 'POPULATION',
## 'COUNTY', 'COVID_CASE_COUNT'. You can override using the `.groups` argument.
covidPopZipNYC %>% head()
## Simple feature collection with 6 features and 12 fields
## Geometry type: GEOMETRY
## Dimension: XY
## Bounding box: xmin: 971132.6 ymin: 188447.3 xmax: 992172.8 ymax: 215324.9
## Projected CRS: NAD83 / New York Long Island (ftUS)
## # A tibble: 6 × 13
## # Groups: ZIPCODE, PO_NAME, POPULATION, COUNTY, COVID_CASE_COUNT [6]
## ZIPCODE PO_NAME POPULATION COUNTY COVID_CASE_COUNT TOTAL_COVID_TESTS totPop
## <chr> <chr> <dbl> <chr> <dbl> <dbl> <int>
## 1 10001 New York 22413 New York 1542 20158 19146
## 2 10002 New York 81305 New York 5902 48197 74310
## 3 10003 New York 55878 New York 2803 41076 53487
## 4 10004 New York 2187 New York 247 3599 NA
## 5 10005 New York 8107 New York 413 6102 8809
## 6 10006 New York 3011 New York 164 2441 4639
## # ℹ 6 more variables: malePctg <dbl>, asianPop <int>, blackPop <int>,
## # hispanicPop <int>, whitePop <int>, geometry <GEOMETRY [US_survey_foot]>
#Add facility's and stores back to final
NYC_pre <- left_join(covidPopZipNYC,facility_counts, by = c("ZIPCODE" = "Facility Zip Code"))
NYC_Final <- left_join(NYC_pre,store_counts, by = "ZIPCODE")
# Check and verify the data again
sum(NYC_Final$totPop, na.rm = T)
## [1] 8334092
# See where we have those NAs
NYC_Final %>% dplyr::filter(is.na(totPop))
## Simple feature collection with 7 features and 14 fields
## Geometry type: GEOMETRY
## Dimension: XY
## Bounding box: xmin: 971132.6 ymin: 151085.5 xmax: 1049202 ymax: 263362
## Projected CRS: NAD83 / New York Long Island (ftUS)
## # A tibble: 7 × 15
## # Groups: ZIPCODE, PO_NAME, POPULATION, COUNTY, COVID_CASE_COUNT [7]
## ZIPCODE PO_NAME POPULATION COUNTY COVID_CASE_COUNT TOTAL_COVID_TESTS totPop
## * <chr> <chr> <dbl> <chr> <dbl> <dbl> <int>
## 1 10004 New York 2187 New Y… 247 3599 NA
## 2 10075 New York 25203 New Y… 1453 18391 NA
## 3 10464 Bronx 4438 Bronx 387 2933 NA
## 4 11109 Long Isla… 2752 Queens 354 4687 NA
## 5 11231 Brooklyn 33144 Kings 1842 24431 NA
## 6 11693 Far Rocka… 11052 Kings 1111 7027 NA
## 7 11693 Far Rocka… 11052 Queens 1111 7027 NA
## # ℹ 8 more variables: malePctg <dbl>, asianPop <int>, blackPop <int>,
## # hispanicPop <int>, whitePop <int>, geometry <GEOMETRY [US_survey_foot]>,
## # n_facilities <int>, n_stores <int>
#Plot the data
plot(NYC_Final["COVID_CASE_COUNT"], breaks='jenks')

plot(NYC_Final["blackPop"], breaks='jenks')
