Mapping the proportion of people with West Indian ancestry in Brooklyn, New York.

library(tidyverse)
library(tidycensus)
library(sf)
library(scales)
library(viridis)
library(plotly)

Methods

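Census tract boundaries and data are from the 2016-2020 5-year American Community Survey, accessed with the tidycensus R package. Census data includes the total population reporting ancestry (B04006_001) and the population reporting West Indian ancestry (B04006_094), both from table B04006 (People Reporting Ancestry).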

Borough boundaries are represented with a GeoJSON boundary file that was downloaded from NYC Open Data.

Get Data

# Optional: list every 2016-2020 ACS 5-year variable to look up table codes
#acs201620 <- load_variables(2020, "acs5", cache = TRUE)

# Import table of PEOPLE REPORTING ANCESTRY: B04006
# One row per census tract in Kings County, NY, with tract geometries attached.
# output = "wide" yields a pair of columns per requested variable: the
# estimate (suffix "E") and its margin of error (suffix "M"),
# e.g. west_indianE / west_indianM.
raw_ancestry <- get_acs(geography = "tract",
                        variables = c(ancestry_pop = "B04006_001",
                                      west_indian = "B04006_094"),
                        state = "NY",
                        county = "Kings",
                        # spell out TRUE: `T` is an ordinary variable that can
                        # be reassigned, TRUE is a reserved word
                        geometry = TRUE,
                        year = 2020,
                        output = "wide")
## Warning: • You have not set a Census API key. Users without a key are limited to 500
## queries per day and may experience performance limitations.
## ℹ For best results, get a Census API key at
## http://api.census.gov/data/key_signup.html and then supply the key to the
## `census_api_key()` function to use it throughout your tidycensus session.
## This warning is displayed once per session.
# Add the West Indian share of each tract's ancestry-reporting population
# (a proportion between 0 and 1, computed from the estimate columns)
west_indian <- mutate(
  raw_ancestry,
  pct_west_indian = west_indianE / ancestry_popE
)

Check Data

# Explore the West Indian share before deciding how to map it
## summary statistics, which also report how many values are missing
summary(west_indian[["pct_west_indian"]])
## shape of the distribution
hist(west_indian$pct_west_indian)

## pull out the tracts with a missing share to confirm they are truly missing
na_tracts <- filter(west_indian, is.na(pct_west_indian))
## recompute the share, storing undefined results (NaN) as a regular NA
west_indian <- raw_ancestry |>
  mutate(pct_west_indian = west_indianE / ancestry_popE) |>
  mutate(
    pct_west_indian = ifelse(is.nan(pct_west_indian), NA, pct_west_indian)
  )

Bring in Shapefile Map Data

https://www.nyc.gov/site/planning/data-maps/open-data/census-download-metadata.page

https://data.cityofnewyork.us/City-Government/Borough-Boundaries/tqmj-j8zm

## Borough boundaries (GeoJSON downloaded from NYC Open Data)
boros <- st_read(dsn = "~/Downloads/Methods 1_R Learning/part2/data/raw/geo/BoroughBoundaries.geojson")
## 2020 Neighborhood Tabulation Areas (NTAs) for NYC, read from a shapefile
nabes <- st_read(dsn = "~/Downloads/Methods 1_R Learning/part2/data/raw/geo/nynta2020.shp")

Map Borough

# Choropleth of the West Indian ancestry share by census tract, with Brooklyn
# neighborhood and borough outlines drawn on top.
# `text` is not an aesthetic geom_sf() knows (hence the warning printed below);
# it is mapped only so ggplotly() can show it as the hover tooltip.
west_indian_map <- ggplot() +
  # layer 1: tracts that have a share, filled by that share, no tract borders
  geom_sf(
    aes(
      fill = pct_west_indian,
      text = paste0(
        NAME, ":",
        "<br>Percent West Indian ancestry:",
        percent(pct_west_indian, accuracy=1)
      )
    ),
    data = filter(west_indian, !is.na(pct_west_indian)),
    color = "transparent"
  ) +
  # layer 2: Brooklyn neighborhood outlines (thin line)
  geom_sf(
    data = filter(nabes, BoroName == "Brooklyn"),
    fill = NA, color = "#4D84BB", lwd = 0.1
  ) +
  # layer 3: Brooklyn borough outline (heavier line)
  geom_sf(
    data = filter(boros, boro_name == "Brooklyn"),
    fill = NA, color = "#244C95", lwd = .25
  ) +
  # fill legend: 0-100% in 20-point steps, labelled as whole percents
  scale_fill_distiller(
    name = "Percent West Indian Ancestry (%)",
    breaks = c(0, .2, .4, .6, .8, 1),
    labels = percent_format(accuracy = 1L),
    direction = 1,
    na.value = "#fafafa" # alternative: "transparent"
  ) +
  labs(
    title = "Brooklyn, West Indian Ancestry by Census Tract",
    caption = "Source: American Community Survey, 2016-20"
  ) +
  theme_void()
## Warning in layer_sf(geom = GeomSf, data = data, mapping = mapping, stat = stat,
## : Ignoring unknown aesthetics: text
# Interactive version; the tooltip shows only the `text` aesthetic
ggplotly(p = west_indian_map, tooltip = "text")

Prep for Neighborhood File

## Compare the coordinate reference systems of the tract and neighborhood layers
st_crs(west_indian)
st_crs(nabes)

## Reproject the tracts to EPSG:2263 and confirm the new CRS
west_indian_2263 <- west_indian |> st_transform(crs = 2263)
st_crs(west_indian_2263)

# Keep only the identifying fields of the neighborhood layer
nabes_selected <- select(nabes, BoroCode, BoroName, NTA2020, NTAName)

# Tag every tract with the fields of the neighborhood it overlaps the most
# (largest = TRUE); left = TRUE keeps tracts that match no neighborhood
west_indian_nabes <- st_join(
  west_indian_2263,
  nabes_selected,
  join = st_intersects,
  left = TRUE,
  largest = TRUE
)
## Warning: attribute variables are assumed to be spatially constant throughout
## all geometries
library(knitr)
# Preview the first two joined rows as a formatted table
kable(head(west_indian_nabes, 2))

Map Neighborhood

## Neighborhood-level map: tracts joined to Crown Heights (North) only.
## `text` is not an aesthetic geom_sf() knows (hence the warning printed below);
## it is mapped only so ggplotly() can show it as the hover tooltip.
west_indian_map_nabe <- ggplot() +
  # layer 1: the neighborhood's tracts that have a share, filled by that share
  geom_sf(
    aes(
      fill = pct_west_indian,
      text = paste0(
        NAME, ":",
        "<br>Percent West Indian ancestry:",
        percent(pct_west_indian, accuracy=1)
      )
    ),
    data = filter(
      west_indian_nabes,
      !is.na(pct_west_indian),
      NTAName == "Crown Heights (North)"
    ),
    color = "transparent"
  ) +
  # layer 2: outline of the neighborhood itself
  geom_sf(
    data = filter(nabes, NTAName == "Crown Heights (North)"),
    fill = NA, color = "#244C95", lwd = 0.25
  ) +
  # fill legend: 0-100% in 20-point steps, labelled as whole percents
  scale_fill_distiller(
    name = "Percent West Indian Ancestry (%)",
    breaks = c(0, .2, .4, .6, .8, 1),
    labels = percent_format(accuracy = 1L),
    direction = 1,
    na.value = "transparent"
  ) +
  labs(
    title = "Crown Heights North, West Indian Ancestry by Census Tract",
    caption = "Source: American Community Survey, 2016-20"
  ) +
  theme_void()
## Warning in layer_sf(geom = GeomSf, data = data, mapping = mapping, stat = stat,
## : Ignoring unknown aesthetics: text
# Interactive version; the tooltip shows only the `text` aesthetic
ggplotly(p = west_indian_map_nabe, tooltip = "text")

Create Summary Table

## Calculate summary statistics per neighborhood (NTA).
## The geometry column is dropped first, so the result is a plain
## (non-spatial) table with one row per NTAName.
west_indian_nabe_stats <- st_drop_geometry(west_indian_nabes) |>
  group_by(NTAName) |>
  summarise(Borough = first(BoroName),
            `Est. Total Population` = sum(ancestry_popE),
            # sum the tract ESTIMATES (suffix "E"); the "M" column holds the
            # margins of error and must not be used as a population count
            `Est. Total West Indian Population` = sum(west_indianE)) |>
  # neighborhood-level share, formatted as a whole-percent string
  mutate(
    `Est. Percent West Indian Ancestry` = percent(
      `Est. Total West Indian Population` / `Est. Total Population`,
      accuracy = 1
    )
  )

Missing Values

A missing value is a way to signal an absence of information in a dataset.

Common reasons for missing values:

  • the information is not available for that area
  • the information is not reliable for that area
  • there was an error in data collection or processing
  • data joined improperly
  • join ids don’t match in all areas

Missing values are a part of messy, real-world data. Understanding how missing data are defined in R and how to perform operations with them will be a critical component of your data cleaning and analysis work.

As you begin to work with a new dataset, you should always investigate and document the following:

  • Are missing values present in my data?
  • Where are the missing values?
  • How will these missing values affect my analysis?

If NA values are not represented by NA you can:

  • Define the NA value while reading data into R
  • Change the data type of a column to force NA conversion
  • Use ifelse() to redefine the value to NA
# Handy tools when working with missing values:
#   na.rm = TRUE  argument that tells aggregate functions to skip NA values
#   is.na()       tests if a value is NA
#   is.nan()      tests if a value is NaN (Not a Number)

# numeric vector that contains a single missing value
vector1 <- c(4, 6, 2, 8, NA_real_, 9)

# display the vector's type, length, and contents
str(object = vector1)
##  num [1:6] 4 6 2 8 NA 9

In R, missing values typically look like an NA appearing in a variable, a vector, or a dataframe:

You may also encounter missing values in datasets that aren’t NA.

Sometimes the creators of a dataset will use a numeric value to indicate missing data (such as 999) or a string of characters (such as “N/A” or “–”).

You Can…

# define the dataset's NA value during data import
#deseg_pp_clean_na <- read_csv("part1/data/raw/invol_data_propublica.csv", 
#                              na = "N/A")
# change a column's data type from character to numeric
#deseg_conversion_clean <- deseg_pp_raw |>
#  mutate(Year.Placed = as.numeric(Year.Lifted))
# use ifelse() to replace the "N/A" placeholder string with NA
#deseg_conversion_clean_ifelse <- deseg_pp_raw |>
#  mutate(Year.Placed = ifelse(Year.Placed == "N/A", 
#                              NA, 
#                              Year.Placed))
# attempt to calculate the median year deseg orders were placed
#median(deseg_pp_clean_na$Year.Placed)

It’s important not to ignore missing values when you are trying to run calculations with your data. It’s so important that R will not let you ignore it:

New functions and concepts

multiple geoms in one ggplot

When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the Preview button or press Cmd+Shift+K to preview the HTML file).