R Week 9 Assignment

Setup

Load Packages

require(rmarkdown);

## Loading required package: rmarkdown

require(knitr);

## Loading required package: knitr

require(tidyverse);

## Loading required package: tidyverse

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

require(sf);

## Loading required package: sf
## Linking to GEOS 3.13.0, GDAL 3.8.5, PROJ 9.5.1; sf_use_s2() is TRUE

require(mapview);

## Loading required package: mapview

require(magrittr);

## Loading required package: magrittr
## 
## Attaching package: 'magrittr'
## 
## The following object is masked from 'package:purrr':
## 
##     set_names
## 
## The following object is masked from 'package:tidyr':
## 
##     extract

require(janitor);

## Loading required package: janitor
## 
## Attaching package: 'janitor'
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test

require(stringr); 
require(ggplot2); 
require(ggmap);

## Loading required package: ggmap
## ℹ Google's Terms of Service: <https://mapsplatform.google.com>
##   Stadia Maps' Terms of Service: <https://stadiamaps.com/terms-of-service/>
##   OpenStreetMap's Tile Usage Policy: <https://operations.osmfoundation.org/policies/tiles/>
## ℹ Please cite ggmap if you use it! Use `citation("ggmap")` for details.
## Attaching package: 'ggmap'
## 
## The following object is masked from 'package:magrittr':
## 
##     inset

require(leaflet);

## Loading required package: leaflet

require(gganimate);

## Loading required package: gganimate
## No renderer backend detected. gganimate will default to writing frames to separate files
## Consider installing:
## - the `gifski` package for gif output
## - the `av` package for video output
## and restarting the R session

require(tmap)

## Loading required package: tmap

require(scales)

## Loading required package: scales
## 
## Attaching package: 'scales'
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor

require(ggpubr)

## Loading required package: ggpubr

nyc_zipdata <- st_read('/Users/alex/Desktop/Data Viz with R/Second Section/zip_data4.geojson')

## Reading layer `zip_data4' from data source 
##   `/Users/alex/Desktop/Data Viz with R/Second Section/zip_data4.geojson' 
##   using driver `GeoJSON'
## Simple feature collection with 248 features and 12 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: 913129 ymin: 120020.9 xmax: 1067494 ymax: 272710.9
## Projected CRS: NAD83 / New York Long Island (ftUS)

acs_popbyzip <- st_read("~/Desktop/Data Viz with R/Second Section/acsPopByZip.gpkg")

## Reading layer `NYC_COVID_Pop_by_Zip' from data source 
##   `/Users/alex/Desktop/Data Viz with R/Second Section/acsPopByZip.gpkg' 
##   using driver `GPKG'
## Simple feature collection with 180 features and 13 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: 913129 ymin: 120020.9 xmax: 1067113 ymax: 272710.9
## Projected CRS: NAD83 / New York Long Island (ftUS)

glimpse(nyc_zipdata)

## Rows: 248
## Columns: 13
## $ Zipcode           <chr> "00083", "10001", "10002", "10003", "10004", "10005"…
## $ PO_NAME           <chr> "Central Park", "New York", "New York", "New York", …
## $ COUNTY            <chr> "New York", "New York", "New York", "New York", "New…
## $ NEIGHBORHOOD_NAME <chr> NA, "Chelsea/NoMad/West Chelsea", "Chinatown/Lower E…
## $ POPULATION        <dbl> 25, 22413, 81305, 55878, 2187, 8107, 3011, 7323, 614…
## $ COVID_CASE_COUNT  <dbl> NA, 1542, 5902, 2803, 247, 413, 164, 379, 3605, 1686…
## $ TOTAL_COVID_TESTS <dbl> NA, 20158, 48197, 41076, 3599, 6102, 2441, 6342, 387…
## $ PERCENT_POSITIVE  <dbl> NA, 7.86, 12.63, 6.93, 6.92, 6.72, 6.84, 6.07, 9.49,…
## $ COVID_CASE_RATE   <dbl> NA, 5584.31, 7835.62, 5192.87, 8310.58, 4716.11, 484…
## $ COVID_DEATH_RATE  <dbl> NA, 126.75, 350.49, 88.93, 67.29, 0.00, 29.57, 57.21…
## $ TotalStores       <int> NA, 60, 192, 102, 10, 11, 4, 26, 88, 56, 102, 45, 11…
## $ TotalHealthFac    <int> 0, 14, 40, 40, 2, 2, 2, 2, 40, 14, 14, 40, 40, 26, 1…
## $ geometry          <MULTIPOLYGON [US_survey_foot]> MULTIPOLYGON (((998309.7 2…

glimpse(acs_popbyzip)

## Rows: 180
## Columns: 14
## $ ZIPCODE <chr> "10001", "10002", "10003", "10004", "10005", "10006", "10007",…
## $ PO_NAME <chr> "New York", "New York", "New York", "New York", "New York", "N…
## $ POPULAT <dbl> 22413, 81305, 55878, 2187, 8107, 3011, 7323, 61455, 29881, 505…
## $ COUNTY  <chr> "New York", "New York", "New York", "New York", "New York", "N…
## $ Positiv <int> 260, 712, 347, 24, 44, 14, 40, 518, 201, 394, 116, 179, 233, 5…
## $ Total   <int> 571, 1358, 830, 64, 137, 54, 130, 1180, 561, 852, 330, 426, 52…
## $ totPop  <dbl> 25645, 76815, 54181, NA, 8809, 2354, 9145, 56178, 33297, 51288…
## $ malPctg <dbl> 51.89706, 48.23277, 50.34053, NA, 42.81984, 39.59218, 51.89721…
## $ asianPp <dbl> 6788, 32502, 8027, NA, 1974, 640, 1729, 8150, 6188, 5666, 4680…
## $ blackPp <dbl> 1897, 7882, 3443, NA, 291, 48, 354, 6302, 1613, 3632, 973, 124…
## $ hspncPp <dbl> 3679, 21621, 4375, NA, 504, 245, 532, 13116, 3712, 5834, 1441,…
## $ whitePp <dbl> 16725, 27156, 42259, NA, 6603, 1693, 7332, 37268, 25437, 42528…
## $ eldrlyP <dbl> 3239, 15770, 6391, NA, 206, 66, 601, 8602, 4716, 8176, 3269, 4…
## $ geom    <MULTIPOLYGON [US_survey_foot]> MULTIPOLYGON (((981958.6 21..., MULT…

#test map
ggplot(nyc_zipdata) + 
  geom_sf(aes(fill=COVID_CASE_COUNT))

#Trying quantile breaks

range(nyc_zipdata$COVID_CASE_RATE)

## [1] NA NA

require(classInt)

## Loading required package: classInt

breaks_qt <- classIntervals(c(min(nyc_zipdata$COVID_CASE_RATE) - .00001,
                              nyc_zipdata$COVID_CASE_RATE), n = 7, style = "quantile")

## Warning in classIntervals(c(min(nyc_zipdata$COVID_CASE_RATE) - 1e-05,
## nyc_zipdata$COVID_CASE_RATE), : var has missing values, omitted in finding
## classes

breaks_qt

## style: quantile
##  [3413.15,5516.366) [5516.366,7187.643) [7187.643,8171.006) [8171.006,9071.344) 
##                  26                  25                  25                  25 
##  [9071.344,9996.29)  [9996.29,11124.31) [11124.31,16211.67] 
##                  25                  25                  26

# Static Map 1, COVID Data

nyc_zipdata_a <- mutate(nyc_zipdata, COVID_CASE_RATE = cut(COVID_CASE_RATE, breaks_qt$brks, dig.lab=6, digits = 1))
                        
ggplot(nyc_zipdata_a) + 
  geom_sf(aes(fill=COVID_CASE_RATE), color=NA) +
  scale_fill_brewer(palette = "RdPu", name = 'Case Rate') +
  labs(y = "Latitude", x = "Longitude", title="COVID Case Rates by NYC Zipcode, Week of 4/23/21") +
  coord_sf(crs = st_crs(2263)) +
  theme_minimal()

# Static Map 2, Health Facilities

sapply(nyc_zipdata, class)

## $Zipcode
## [1] "character"
## 
## $PO_NAME
## [1] "character"
## 
## $COUNTY
## [1] "character"
## 
## $NEIGHBORHOOD_NAME
## [1] "character"
## 
## $POPULATION
## [1] "numeric"
## 
## $COVID_CASE_COUNT
## [1] "numeric"
## 
## $TOTAL_COVID_TESTS
## [1] "numeric"
## 
## $PERCENT_POSITIVE
## [1] "numeric"
## 
## $COVID_CASE_RATE
## [1] "numeric"
## 
## $COVID_DEATH_RATE
## [1] "numeric"
## 
## $TotalStores
## [1] "integer"
## 
## $TotalHealthFac
## [1] "integer"
## 
## $geometry
## [1] "sfc_MULTIPOLYGON" "sfc"

ggplot(nyc_zipdata) + 
  geom_sf(aes(fill=TotalHealthFac), color=NA) +
  scale_fill_gradient(low = "lightblue", high = "darkblue", name = 'Number of Health Facilities') +
  labs(y = "Latitude", x = "Longitude", title="NYC Health Facilities by Zipcode") +
  coord_sf(default_crs = sf::st_crs(2263)) +
  theme_minimal()

#Checking that I did it right in terms of health facilities map

nys_health_2263 <- st_read("~/Desktop/Data Viz with R/Second Section/nys_health_2263.geojson")

## Reading layer `nys_health_2263' from data source 
##   `/Users/alex/Desktop/Data Viz with R/Second Section/nys_health_2263.geojson' 
##   using driver `GeoJSON'
## Simple feature collection with 7686 features and 34 fields
## Geometry type: POINT
## Dimension:     XY
## Bounding box:  xmin: -544951.7 ymin: 127612.4 xmax: 1489595 ymax: 1755603
## Projected CRS: NAD83 / New York Long Island (ftUS)

tot_stores <- read_csv("~/Desktop/Data Viz with R/Second Section/tot_stores.csv")

## Rows: 192 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (2): Zip.Code, TotalStores
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

tot_healthfac <- read_csv("~/Desktop/Data Viz with R/Second Section/tot_healthfac.geojson")

## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)

## Rows: 357 Columns: 1
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): {
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

NYC_Stores <- read_csv("~/Desktop/Data Viz with R/Second Section/NYC_Stores.geojson")

## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)

## Rows: 14238 Columns: 1
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): {
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

nys_health_2263 <- nys_health_2263

filtered_healthfacs_points <- st_intersection(nys_health_2263, nyc_zipdata)

## Warning: attribute variables are assumed to be spatially constant throughout
## all geometries

ggplot(nyc_zipdata) + 
  geom_sf(aes(fill=TotalHealthFac)) +
  geom_sf(data=filtered_healthfacs_points, color="gold", fill=NA, size = 0.001) +
  scale_fill_gradient(low = "lightblue", high = "darkblue", name = 'Number of Health Facilities') +
  labs(y = "Latitude", x = "Longitude", title="NYC Health Facilities by Zipcode") +
  coord_sf(default_crs = sf::st_crs(2263)) +
  theme_minimal()

#Comparing with an interactive map to triple check

mapview(nyc_zipdata, zcol = "TotalHealthFac", color = "lightblue", legend = TRUE) + mapview(filtered_healthfacs_points, color = "chartreuse", fill=NA,
          cex = 3, legend = FALSE)

#There clearly seems to be a mistake in my earlier data processing, based on the interactive map

#Troubleshooting

sum(nyc_zipdata$TotalHealthFac)

## [1] 4386

sum(tot_healthfac$TotalHealthFac)

## Warning: Unknown or uninitialised column: `TotalHealthFac`.

## [1] 0

nrow(filtered_healthfacs_points)

## [1] 2586

sum(filtered_healthfacs_points$TotalHealthFac)

## [1] 64024

nrow(NYC_Stores)

## [1] 14238

sum(tot_stores$TotalStores)

## [1] 14233

nrow(tot_stores)

## [1] 192

sum(nyc_zipdata$TotalStores)

## [1] NA

nyc_zipdata_b <- nyc_zipdata %>% select(-TotalStores, -TotalHealthFac)


head(nyc_zipdata_b)

## Simple feature collection with 6 features and 10 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: 971132.6 ymin: 188447.3 xmax: 998309.7 ymax: 230942.5
## Projected CRS: NAD83 / New York Long Island (ftUS)
##   Zipcode      PO_NAME   COUNTY                       NEIGHBORHOOD_NAME
## 1   00083 Central Park New York                                    <NA>
## 2   10001     New York New York              Chelsea/NoMad/West Chelsea
## 3   10002     New York New York               Chinatown/Lower East Side
## 4   10003     New York New York East Village/Gramercy/Greenwich Village
## 5   10004     New York New York                      Financial District
## 6   10005     New York New York                      Financial District
##   POPULATION COVID_CASE_COUNT TOTAL_COVID_TESTS PERCENT_POSITIVE
## 1         25               NA                NA               NA
## 2      22413             1542             20158             7.86
## 3      81305             5902             48197            12.63
## 4      55878             2803             41076             6.93
## 5       2187              247              3599             6.92
## 6       8107              413              6102             6.72
##   COVID_CASE_RATE COVID_DEATH_RATE                       geometry
## 1              NA               NA MULTIPOLYGON (((998309.7 22...
## 2         5584.31           126.75 MULTIPOLYGON (((981958.6 21...
## 3         7835.62           350.49 MULTIPOLYGON (((991339.9 20...
## 4         5192.87            88.93 MULTIPOLYGON (((989830.5 20...
## 5         8310.58            67.29 MULTIPOLYGON (((977503.7 18...
## 6         4716.11             0.00 MULTIPOLYGON (((982595.7 19...

#Re-add the stores and health facility totals from the original 

class(filtered_healthfacs_points$Short.Description)

## [1] "character"

sum(is.na(filtered_healthfacs_points$Short.Description))

## [1] 0

sum(filtered_healthfacs_points$Short.Description == "")

## [1] 228

filtered_healthfacs_points <- filtered_healthfacs_points %>%
  mutate(Short.Description = ifelse(Short.Description == "", "Uncategorized", Short.Description))
                            

readd1 <- st_join(filtered_healthfacs_points, nyc_zipdata_b,join=st_within)

hf_typecount <- readd1 %>%
  st_drop_geometry() %>%
  count(Zipcode.x,  Short.Description) %>% group_by(Zipcode.x) 

hf_typewide <- hf_typecount %>%
  pivot_wider(
    names_from = Short.Description,
    values_from = n,
    values_fill = 0
  )

hf_totalcount <- readd1 %>%
  st_drop_geometry() %>%
  count(Zipcode.x, name="TotalHealthFac")

sum(hf_totalcount$TotalHealthFac)

## [1] 2586

hf_rejoined <- left_join(nyc_zipdata_b,
                           hf_totalcount, c("Zipcode" = "Zipcode.x"))

hf_rejoined_2<- left_join(hf_rejoined, hf_typewide, c("Zipcode" = "Zipcode.x"))

sum(hf_rejoined_2$TotalHealthFac, na.rm=TRUE) # double checking that the sum is still the same

## [1] 2586

head(tot_stores)

## # A tibble: 6 × 2
##   Zip.Code TotalStores
##      <dbl>       <dbl>
## 1    10001          60
## 2    10002         192
## 3    10003         102
## 4    10004          10
## 5    10005          11
## 6    10006           4

tot_stores$Zip.Code <- as.character(tot_stores$Zip.Code)

nyc_zipdata_c <- left_join(hf_rejoined_2, tot_stores, c("Zipcode" = "Zip.Code"))
sum(tot_stores$TotalStores)

## [1] 14233

sum(nyc_zipdata_c$TotalStores, na.rm=TRUE)

## [1] 14232

#close enough, only 1 off

BACK TO MAPPING

# Static Map 2, Final Version

nyc_zipdata_c$TotalHealthFac <- replace_na(nyc_zipdata_c$TotalHealthFac, 0)

ggplot(nyc_zipdata_c) + 
  geom_sf(aes(fill=TotalHealthFac), color=NA) +
  scale_fill_gradient(low = "grey", high = "darkblue", name = 'Number of Health Facilities') +
  labs(y = "Latitude", x = "Longitude", title="NYC Health Facilities by Zipcode") +
  coord_sf(default_crs = sf::st_crs(2263)) +
  theme_minimal()

ggplot(nyc_zipdata_c) + 
  geom_sf(aes(fill=TotalHealthFac), color=NA) +
  geom_sf(data=filtered_healthfacs_points, color="gold", fill=NA, size = 0.0001) +
  scale_fill_gradient(low = "grey", high = "darkblue", name = 'Number of Health Facilities') +
  labs(y = "Latitude", x = "Longitude", title="NYC Health Facilities by Zipcode") +
  coord_sf(default_crs = sf::st_crs(2263)) +
  theme_minimal()

# Multi-map Figure

g1 <- ggplot(nyc_zipdata_a) + 
  geom_sf(aes(fill=COVID_CASE_RATE), color=NA) +
  scale_fill_brewer(palette = "RdPu", name = 'Case Rate') +
  labs(y = "Latitude", x = "Longitude", title = "COVID Case Rates for the Week of 4/21/23") +
  coord_sf(crs = st_crs(2263)) +
  theme_minimal()

g2 <- ggplot(nyc_zipdata_c) + 
  geom_sf(aes(fill=TotalStores), color=NA) +
  scale_fill_gradient(low = "yellow", high = "darkgreen", name = 'Number of Stores') +
  labs(y = "Latitude", x = "Longitude", title = "Number of Stores by NYC Zipcode") +
  coord_sf(crs = st_crs(2263)) +
  theme_minimal() 


multiplot <- ggarrange(g1, g2, nrow = 1, ncol = 2) 

# Arrange the plots side by side
multiplot <- ggarrange(g1, g2, nrow = 1, ncol = 2)

# Save the arranged plot to a file with the desired width
ggsave('/Users/alex/Desktop/Data Viz with R/Second Section/multiplot_export.jpg', 
       plot = multiplot, 
       width = 18,  # Increase width to make the plot wider
       height = 7,  # Adjust height as needed
       units = "in")  # Specify units (inches)

library(knitr)

knitr::include_graphics('/Users/alex/Desktop/Data Viz with R/Second Section/multiplot_export.jpg')

# Interactive Map with tmap


covid_interactive <- nyc_zipdata_c %>% dplyr::select("NEIGHBORHOOD_NAME", "Zipcode", "POPULATION", "COVID_CASE_RATE", "COVID_DEATH_RATE")

tm_shape(covid_interactive) + tm_polygons("COVID_DEATH_RATE", fill.legend = tm_legend("COVID Death Rate by Zipcode"), fill.scale = tm_scale(values = "OrRd"))

## [cols4all] color palettes: use palettes from the R package cols4all. Run
## `cols4all::c4a_gui()` to explore them. The old palette name "OrRd" is named
## "brewer.or_rd"
## Multiple palettes called "or_rd" found: "brewer.or_rd", "matplotlib.or_rd". The first one, "brewer.or_rd", is returned.

tmap_mode("view")

## ℹ tmap mode set to "view".

intmap <- tmap_last()

tmap_save(intmap, "covid_death_rate_interactive.html")

## [cols4all] color palettes: use palettes from the R package cols4all. Run
## `cols4all::c4a_gui()` to explore them. The old palette name "OrRd" is named
## "brewer.or_rd"
## Multiple palettes called "or_rd" found: "brewer.or_rd", "matplotlib.or_rd". The first one, "brewer.or_rd", is returned.
## 
## Interactive map saved to /Users/alex/Desktop/Data Viz with R/Second Section/covid_death_rate_interactive.html

R Week 9 Assignment

Alex

2025-04-04

Setup

Load Packages

BACK TO MAPPING