R Week 8 Assignment

Task 1:

##Task 1:Join the COVID-19 data to the NYC zip code area data (sf or sp polygons)
nyc_postal_sf <-  st_read("C:\\Users\\Admin\\OneDrive\\Documents\\Data & Vis in R\\Week7_10\\R_project7_10\\data\\ZIP_CODE_040114.shp")

## Reading layer `ZIP_CODE_040114' from data source 
##   `C:\Users\Admin\OneDrive\Documents\Data & Vis in R\Week7_10\R_project7_10\data\ZIP_CODE_040114.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 263 features and 12 fields
## Geometry type: POLYGON
## Dimension:     XY
## Bounding box:  xmin: 913129 ymin: 120020.9 xmax: 1067494 ymax: 272710.9
## Projected CRS: NAD83 / New York Long Island (ftUS)

covid_data <- readr::read_csv("C:\\Users\\Admin\\OneDrive\\Documents\\Data & Vis in R\\Week7_10\\R_project7_10\\data\\R-Spatial_II_Lab\\tests-by-zcta_2021_04_23.csv")

## Rows: 177 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (3): NEIGHBORHOOD_NAME, BOROUGH_GROUP, label
## dbl (10): MODIFIED_ZCTA, lat, lon, COVID_CASE_COUNT, COVID_CASE_RATE, POP_DE...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

postal_covid_merged <- base::merge(nyc_postal_sf, covid_data, by.x = "ZIPCODE", by.y = "MODIFIED_ZCTA")
names(postal_covid_merged)

##  [1] "ZIPCODE"           "BLDGZIP"           "PO_NAME"          
##  [4] "POPULATION"        "AREA"              "STATE"            
##  [7] "COUNTY"            "ST_FIPS"           "CTY_FIPS"         
## [10] "URL"               "SHAPE_AREA"        "SHAPE_LEN"        
## [13] "NEIGHBORHOOD_NAME" "BOROUGH_GROUP"     "label"            
## [16] "lat"               "lon"               "COVID_CASE_COUNT" 
## [19] "COVID_CASE_RATE"   "POP_DENOMINATOR"   "COVID_DEATH_COUNT"
## [22] "COVID_DEATH_RATE"  "PERCENT_POSITIVE"  "TOTAL_COVID_TESTS"
## [25] "geometry"

st_crs(postal_covid_merged)

## Coordinate Reference System:
##   User input: NAD83 / New York Long Island (ftUS) 
##   wkt:
## PROJCRS["NAD83 / New York Long Island (ftUS)",
##     BASEGEOGCRS["NAD83",
##         DATUM["North American Datum 1983",
##             ELLIPSOID["GRS 1980",6378137,298.257222101,
##                 LENGTHUNIT["metre",1]]],
##         PRIMEM["Greenwich",0,
##             ANGLEUNIT["degree",0.0174532925199433]],
##         ID["EPSG",4269]],
##     CONVERSION["SPCS83 New York Long Island zone (US survey foot)",
##         METHOD["Lambert Conic Conformal (2SP)",
##             ID["EPSG",9802]],
##         PARAMETER["Latitude of false origin",40.1666666666667,
##             ANGLEUNIT["degree",0.0174532925199433],
##             ID["EPSG",8821]],
##         PARAMETER["Longitude of false origin",-74,
##             ANGLEUNIT["degree",0.0174532925199433],
##             ID["EPSG",8822]],
##         PARAMETER["Latitude of 1st standard parallel",41.0333333333333,
##             ANGLEUNIT["degree",0.0174532925199433],
##             ID["EPSG",8823]],
##         PARAMETER["Latitude of 2nd standard parallel",40.6666666666667,
##             ANGLEUNIT["degree",0.0174532925199433],
##             ID["EPSG",8824]],
##         PARAMETER["Easting at false origin",984250,
##             LENGTHUNIT["US survey foot",0.304800609601219],
##             ID["EPSG",8826]],
##         PARAMETER["Northing at false origin",0,
##             LENGTHUNIT["US survey foot",0.304800609601219],
##             ID["EPSG",8827]]],
##     CS[Cartesian,2],
##         AXIS["easting (X)",east,
##             ORDER[1],
##             LENGTHUNIT["US survey foot",0.304800609601219]],
##         AXIS["northing (Y)",north,
##             ORDER[2],
##             LENGTHUNIT["US survey foot",0.304800609601219]],
##     USAGE[
##         SCOPE["Engineering survey, topographic mapping."],
##         AREA["United States (USA) - New York - counties of Bronx; Kings; Nassau; New York; Queens; Richmond; Suffolk."],
##         BBOX[40.47,-74.26,41.3,-71.8]],
##     ID["EPSG",2263]]

Task 2:

##Task 2: Aggregate the NYC food retails store data (points) to the zip code data, 
#so that we know how many retail stores in each zip code area. Note that not all,
#locations are for food retail. And we need to choose the specific types according to the data.

nyc_retail<- st_read("C:\\Users\\Admin\\OneDrive\\Documents\\Data & Vis in R\\Week7_10\\R_project7_10\\data\\R-Spatial_II_Lab\\nycFoodStore.shp")

## Reading layer `nycFoodStore' from data source 
##   `C:\Users\Admin\OneDrive\Documents\Data & Vis in R\Week7_10\R_project7_10\data\R-Spatial_II_Lab\nycFoodStore.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 11300 features and 16 fields
## Geometry type: POINT
## Dimension:     XY
## Bounding box:  xmin: -74.2484 ymin: 40.50782 xmax: -73.67061 ymax: 40.91008
## Geodetic CRS:  WGS 84

names(nyc_retail)

##  [1] "ï__Cnty"  "Lcns_Nm"  "Oprtn_T"  "Estbl_T"  "Entty_N"  "DBA_Nam" 
##  [7] "Strt_Nmb" "Stret_Nm" "Add_L_2"  "Add_L_3"  "City"     "State"   
## [13] "Zip_Cod"  "Sqr_Ftg"  "Locatin"  "Coords"   "geometry"

filter_stores <- nyc_retail %>% 
  filter(Estbl_T == 'JAC') #filter for food retail

#change the CRS so they are the same 
filter_stores <- sf::st_transform(filter_stores, sf::st_crs(postal_covid_merged))

# Count stores within each ZIP code
store_counts <- st_join(filter_stores, postal_covid_merged, join = st_intersects) %>%
  st_drop_geometry() %>% ## because we only need the counts
  count(ZIPCODE, name = "n_stores") #made a new coulum to store a count by zipcode

# Add the counts back to original Zip code data
postal_covid_merged <- postal_covid_merged %>%
  left_join(store_counts, by = "ZIPCODE") %>%
  replace_na(list(n_stores = 0))  # Convert NA to 0 for zipcodes that dont have any stores

Task 3:

##Task 3: Aggregate the NYC health facilities (points) to the zip code data.
#Similarly, choose appropriate subtypes such as nursing homes from the facilities.

nyc_health <- read_csv("C:\\Users\\Admin\\OneDrive\\Documents\\Data & Vis in R\\Week7_10\\R_project7_10\\data\\NYS_Health_Facility.csv")

## Rows: 3990 Columns: 36
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (28): Facility Name, Short Description, Description, Facility Open Date,...
## dbl  (8): Facility ID, Facility Phone Number, Facility Fax Number, Facility ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

nyc_health_NH <- nyc_health %>% 
  filter(`Short Description` == 'NH')  # NH = Nursing Homes filter

#count the number of NH per zipcode 
facility_counts <- nyc_health_NH %>%
  count(`Facility Zip Code`, name = "n_facilities")

#add the count of NH to the rest of the data
zpNYC <- left_join(postal_covid_merged,facility_counts, by = c("ZIPCODE" = "Facility Zip Code"))
st_crs(zpNYC)

## Coordinate Reference System:
##   User input: NAD83 / New York Long Island (ftUS) 
##   wkt:
## PROJCRS["NAD83 / New York Long Island (ftUS)",
##     BASEGEOGCRS["NAD83",
##         DATUM["North American Datum 1983",
##             ELLIPSOID["GRS 1980",6378137,298.257222101,
##                 LENGTHUNIT["metre",1]]],
##         PRIMEM["Greenwich",0,
##             ANGLEUNIT["degree",0.0174532925199433]],
##         ID["EPSG",4269]],
##     CONVERSION["SPCS83 New York Long Island zone (US survey foot)",
##         METHOD["Lambert Conic Conformal (2SP)",
##             ID["EPSG",9802]],
##         PARAMETER["Latitude of false origin",40.1666666666667,
##             ANGLEUNIT["degree",0.0174532925199433],
##             ID["EPSG",8821]],
##         PARAMETER["Longitude of false origin",-74,
##             ANGLEUNIT["degree",0.0174532925199433],
##             ID["EPSG",8822]],
##         PARAMETER["Latitude of 1st standard parallel",41.0333333333333,
##             ANGLEUNIT["degree",0.0174532925199433],
##             ID["EPSG",8823]],
##         PARAMETER["Latitude of 2nd standard parallel",40.6666666666667,
##             ANGLEUNIT["degree",0.0174532925199433],
##             ID["EPSG",8824]],
##         PARAMETER["Easting at false origin",984250,
##             LENGTHUNIT["US survey foot",0.304800609601219],
##             ID["EPSG",8826]],
##         PARAMETER["Northing at false origin",0,
##             LENGTHUNIT["US survey foot",0.304800609601219],
##             ID["EPSG",8827]]],
##     CS[Cartesian,2],
##         AXIS["easting (X)",east,
##             ORDER[1],
##             LENGTHUNIT["US survey foot",0.304800609601219]],
##         AXIS["northing (Y)",north,
##             ORDER[2],
##             LENGTHUNIT["US survey foot",0.304800609601219]],
##     USAGE[
##         SCOPE["Engineering survey, topographic mapping."],
##         AREA["United States (USA) - New York - counties of Bronx; Kings; Nassau; New York; Queens; Richmond; Suffolk."],
##         BBOX[40.47,-74.26,41.3,-71.8]],
##     ID["EPSG",2263]]

Task 4 & 5

##Task 4: Join the Census ACS population, race, and age data to the NYC Planning Census Tract Data.

# The NYC census tracts  
nycCensus <- sf::st_read("C:\\Users\\Admin\\OneDrive\\Documents\\Data & Vis in R\\Week7_10\\R_project7_10\\data\\geo_export_1dc7b645-647b-4806-b9a0-7b79660f120a.shp",
                         stringsAsFactors = FALSE)

## Reading layer `geo_export_1dc7b645-647b-4806-b9a0-7b79660f120a' from data source `C:\Users\Admin\OneDrive\Documents\Data & Vis in R\Week7_10\R_project7_10\data\geo_export_1dc7b645-647b-4806-b9a0-7b79660f120a.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 2165 features and 11 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: -74.25559 ymin: 40.49612 xmax: -73.70001 ymax: 40.91553
## Geodetic CRS:  WGS84(DD)

str(nycCensus)

## Classes 'sf' and 'data.frame':   2165 obs. of  12 variables:
##  $ boro_code : chr  "5" "1" "1" "1" ...
##  $ boro_ct201: chr  "5000900" "1009800" "1010000" "1010200" ...
##  $ boro_name : chr  "Staten Island" "Manhattan" "Manhattan" "Manhattan" ...
##  $ cdeligibil: chr  "E" "I" "I" "I" ...
##  $ ct2010    : chr  "000900" "009800" "010000" "010200" ...
##  $ ctlabel   : chr  "9" "98" "100" "102" ...
##  $ ntacode   : chr  "SI22" "MN19" "MN19" "MN17" ...
##  $ ntaname   : chr  "West New Brighton-New Brighton-St. George" "Turtle Bay-East Midtown" "Turtle Bay-East Midtown" "Midtown-Midtown South" ...
##  $ puma      : chr  "3903" "3808" "3808" "3807" ...
##  $ shape_area: num  2497010 1906016 1860938 1860993 1864600 ...
##  $ shape_leng: num  7729 5534 5692 5688 5693 ...
##  $ geometry  :sfc_MULTIPOLYGON of length 2165; first list element: List of 1
##   ..$ :List of 1
##   .. ..$ : num [1:28, 1:2] -74.1 -74.1 -74.1 -74.1 -74.1 ...
##   ..- attr(*, "class")= chr [1:3] "XY" "MULTIPOLYGON" "sfg"
##  - attr(*, "sf_column")= chr "geometry"
##  - attr(*, "agr")= Factor w/ 3 levels "constant","aggregate",..: NA NA NA NA NA NA NA NA NA NA ...
##   ..- attr(*, "names")= chr [1:11] "boro_code" "boro_ct201" "boro_name" "cdeligibil" ...

# But it does not use the standard County FIPS, but boro code
nycCensus %<>% dplyr::mutate(cntyFIPS = case_when(
  boro_name == 'Bronx' ~ '005',
  boro_name == 'Brooklyn' ~ '047',
  boro_name == 'Manhattan' ~ '061',
  boro_name == 'Queens' ~ '081',
  boro_name == 'Staten Island' ~ '085'),
  tractFIPS = paste(cntyFIPS, ct2010, sep='')
)
#Make it the same CRS (NAD 83)
nycCensus<- st_transform(nycCensus, 2263)



# The ACS data from CENSUS website, 2018 data on population
# We used readLines to bypass the second line/row in the csv file. 
# You can also manually delete that line first, and run the read_csv or read.cvs. 
acsData <- readLines("C:\\Users\\Admin\\OneDrive\\Documents\\Data & Vis in R\\Week7_10\\R_project7_10\\data\\R-Spatial_II_Lab\\ACSDP5Y2018.DP05_data_with_overlays_2020-04-22T132935.csv") %>%
  magrittr::extract(-2) %>% 
  textConnection() %>%
  read.csv(header=TRUE, quote= "\"") %>%
  dplyr::select(GEO_ID, 
                totPop = DP05_0001E, elderlyPop = DP05_0024E, # >= 65
                malePop = DP05_0002E, femalePop = DP05_0003E,  
                whitePop = DP05_0037E, blackPop = DP05_0038E,
                asianPop = DP05_0067E, hispanicPop = DP05_0071E,
                adultPop = DP05_0021E, citizenAdult = DP05_0087E) %>%
  dplyr::mutate(censusCode = stringr::str_sub(GEO_ID, -9,-1));

acsData %>%
  magrittr::extract(1:10,)

##                  GEO_ID totPop elderlyPop malePop femalePop whitePop blackPop
## 1  1400000US36005000100   7080         51    6503       577     1773     4239
## 2  1400000US36005000200   4542        950    2264      2278     2165     1279
## 3  1400000US36005000400   5634        710    2807      2827     2623     1699
## 4  1400000US36005001600   5917        989    2365      3552     2406     2434
## 5  1400000US36005001900   2765         76    1363      1402      585     1041
## 6  1400000US36005002000   9409        977    4119      5290     3185     4487
## 7  1400000US36005002300   4600        648    2175      2425      479     2122
## 8  1400000US36005002400    172          0     121        51       69       89
## 9  1400000US36005002500   5887        548    2958      2929      903     1344
## 10 1400000US36005002701   2868        243    1259      1609      243      987
##    asianPop hispanicPop adultPop citizenAdult censusCode
## 1       130        2329     6909         6100  005000100
## 2       119        3367     3582         2952  005000200
## 3       226        3873     4507         4214  005000400
## 4        68        3603     4416         3851  005001600
## 5       130        1413     2008         1787  005001900
## 6        29        5905     6851         6170  005002000
## 7        27        2674     3498         3056  005002300
## 8        14           0      131           42  005002400
## 9        68        4562     4237         2722  005002500
## 10        0        1985     1848         1412  005002701

# Merge (JOIN) ACS data to the census tracts
popData <- merge(nycCensus, acsData, by.x ='tractFIPS', by.y = 'censusCode')

# verify the data
sum(popData$totPop)

## [1] 8443713

st_crs(popData)

## Coordinate Reference System:
##   User input: EPSG:2263 
##   wkt:
## PROJCRS["NAD83 / New York Long Island (ftUS)",
##     BASEGEOGCRS["NAD83",
##         DATUM["North American Datum 1983",
##             ELLIPSOID["GRS 1980",6378137,298.257222101,
##                 LENGTHUNIT["metre",1]]],
##         PRIMEM["Greenwich",0,
##             ANGLEUNIT["degree",0.0174532925199433]],
##         ID["EPSG",4269]],
##     CONVERSION["SPCS83 New York Long Island zone (US survey foot)",
##         METHOD["Lambert Conic Conformal (2SP)",
##             ID["EPSG",9802]],
##         PARAMETER["Latitude of false origin",40.1666666666667,
##             ANGLEUNIT["degree",0.0174532925199433],
##             ID["EPSG",8821]],
##         PARAMETER["Longitude of false origin",-74,
##             ANGLEUNIT["degree",0.0174532925199433],
##             ID["EPSG",8822]],
##         PARAMETER["Latitude of 1st standard parallel",41.0333333333333,
##             ANGLEUNIT["degree",0.0174532925199433],
##             ID["EPSG",8823]],
##         PARAMETER["Latitude of 2nd standard parallel",40.6666666666667,
##             ANGLEUNIT["degree",0.0174532925199433],
##             ID["EPSG",8824]],
##         PARAMETER["Easting at false origin",984250,
##             LENGTHUNIT["US survey foot",0.304800609601219],
##             ID["EPSG",8826]],
##         PARAMETER["Northing at false origin",0,
##             LENGTHUNIT["US survey foot",0.304800609601219],
##             ID["EPSG",8827]]],
##     CS[Cartesian,2],
##         AXIS["easting (X)",east,
##             ORDER[1],
##             LENGTHUNIT["US survey foot",0.304800609601219]],
##         AXIS["northing (Y)",north,
##             ORDER[2],
##             LENGTHUNIT["US survey foot",0.304800609601219]],
##     USAGE[
##         SCOPE["Engineering survey, topographic mapping."],
##         AREA["United States (USA) - New York - counties of Bronx; Kings; Nassau; New York; Queens; Richmond; Suffolk."],
##         BBOX[40.47,-74.26,41.3,-71.8]],
##     ID["EPSG",2263]]

# this may take a long time to run
popData %>% magrittr::extract('elderlyPop') %>% plot(breaks = 'jenks')

# Now aggregate to the zip code level
covidPopZipNYC <- sf::st_join(zpNYC, 
                              popData %>% sf::st_centroid(), # this essentially converts census tracts to points
                              join = st_contains) %>% 
  group_by(ZIPCODE, PO_NAME, POPULATION, COUNTY, COVID_CASE_COUNT, TOTAL_COVID_TESTS) %>% # use names(zpNYC) and names(popNYC) to see what we have
  summarise(totPop = sum(totPop),
            malePctg = sum(malePop)/totPop*100, # note the totPop is the newly calculated
            asianPop = sum(asianPop),
            blackPop = sum(blackPop),
            hispanicPop = sum(hispanicPop),
            whitePop = sum(whitePop))

## Warning: st_centroid assumes attributes are constant over geometries

## `summarise()` has grouped output by 'ZIPCODE', 'PO_NAME', 'POPULATION',
## 'COUNTY', 'COVID_CASE_COUNT'. You can override using the `.groups` argument.

covidPopZipNYC %>% head()

## Simple feature collection with 6 features and 12 fields
## Geometry type: GEOMETRY
## Dimension:     XY
## Bounding box:  xmin: 971132.6 ymin: 188447.3 xmax: 992172.8 ymax: 215324.9
## Projected CRS: NAD83 / New York Long Island (ftUS)
## # A tibble: 6 × 13
## # Groups:   ZIPCODE, PO_NAME, POPULATION, COUNTY, COVID_CASE_COUNT [6]
##   ZIPCODE PO_NAME  POPULATION COUNTY   COVID_CASE_COUNT TOTAL_COVID_TESTS totPop
##   <chr>   <chr>         <dbl> <chr>               <dbl>             <dbl>  <int>
## 1 10001   New York      22413 New York             1542             20158  19146
## 2 10002   New York      81305 New York             5902             48197  74310
## 3 10003   New York      55878 New York             2803             41076  53487
## 4 10004   New York       2187 New York              247              3599     NA
## 5 10005   New York       8107 New York              413              6102   8809
## 6 10006   New York       3011 New York              164              2441   4639
## # ℹ 6 more variables: malePctg <dbl>, asianPop <int>, blackPop <int>,
## #   hispanicPop <int>, whitePop <int>, geometry <GEOMETRY [US_survey_foot]>

#Add facility's and stores back to final  
NYC_pre <- left_join(covidPopZipNYC,facility_counts, by = c("ZIPCODE" = "Facility Zip Code"))
NYC_Final <- left_join(NYC_pre,store_counts, by = "ZIPCODE")

# Check and verify the data again
sum(NYC_Final$totPop, na.rm = T)

## [1] 8334092

# See where we have those NAs 
NYC_Final %>% dplyr::filter(is.na(totPop))

## Simple feature collection with 7 features and 14 fields
## Geometry type: GEOMETRY
## Dimension:     XY
## Bounding box:  xmin: 971132.6 ymin: 151085.5 xmax: 1049202 ymax: 263362
## Projected CRS: NAD83 / New York Long Island (ftUS)
## # A tibble: 7 × 15
## # Groups:   ZIPCODE, PO_NAME, POPULATION, COUNTY, COVID_CASE_COUNT [7]
##   ZIPCODE PO_NAME    POPULATION COUNTY COVID_CASE_COUNT TOTAL_COVID_TESTS totPop
## * <chr>   <chr>           <dbl> <chr>             <dbl>             <dbl>  <int>
## 1 10004   New York         2187 New Y…              247              3599     NA
## 2 10075   New York        25203 New Y…             1453             18391     NA
## 3 10464   Bronx            4438 Bronx               387              2933     NA
## 4 11109   Long Isla…       2752 Queens              354              4687     NA
## 5 11231   Brooklyn        33144 Kings              1842             24431     NA
## 6 11693   Far Rocka…      11052 Kings              1111              7027     NA
## 7 11693   Far Rocka…      11052 Queens             1111              7027     NA
## # ℹ 8 more variables: malePctg <dbl>, asianPop <int>, blackPop <int>,
## #   hispanicPop <int>, whitePop <int>, geometry <GEOMETRY [US_survey_foot]>,
## #   n_facilities <int>, n_stores <int>

#Plot the data 
plot(NYC_Final["COVID_CASE_COUNT"], breaks='jenks')

plot(NYC_Final["blackPop"], breaks='jenks')

R Week 8 Assignment

Christina Ebrahim

3/29/2025

Task 1:

Task 2:

Task 3:

Task 4 & 5