Mapping the proportion of people with West Indian ancestry in Brooklyn, New York.

library(tidyverse)
library(tidycensus)
library(sf)
library(scales)
library(viridis)
library(plotly)

Methods

Census tract boundaries and data are from the 2016-2020 5-year American Community Survey, accessed with the tidycensus R package. Census data includes:

the number of people with West Indian ancestry
the number of people reporting ancestry

Borough boundaries are represented with a GeoJSON file that was downloaded from NYC Open Data.

Get Data

# load all acs variables
#acs201620 <- load_variables(2020, "acs5", cache = TRUE)

# Import table of PEOPLE REPORTING ANCESTRY: B04006
# One row per Kings County (Brooklyn) census tract, with tract geometry.
# output = "wide" yields one estimate column (suffix E) and one margin-of-error
# column (suffix M) per requested variable, e.g. west_indianE / west_indianM.
# TRUE is spelled out because the shorthand T is an ordinary, reassignable
# variable.
raw_ancestry <- get_acs(geography = "tract",
                        variables = c(ancestry_pop = "B04006_001",
                                      west_indian = "B04006_094"),
                        state = "NY",
                        county = "Kings",
                        geometry = TRUE,
                        year = 2020,
                        output = "wide")

## Warning: • You have not set a Census API key. Users without a key are limited to 500
## queries per day and may experience performance limitations.
## ℹ For best results, get a Census API key at
## http://api.census.gov/data/key_signup.html and then supply the key to the
## `census_api_key()` function to use it throughout your tidycensus session.
## This warning is displayed once per session.

# add each tract's West Indian share of the people reporting ancestry
west_indian <- mutate(raw_ancestry,
                      pct_west_indian = west_indianE / ancestry_popE)

Check Data

# Inspect the West Indian share before deciding how to map it.
## summary() reports the count of NAs alongside the quartiles
summary(west_indian$pct_west_indian)
## the histogram shows how the values are distributed
hist(west_indian$pct_west_indian)

## pull out the tracts with a missing share to see whether they are true NAs
na_tracts <- filter(west_indian, is.na(pct_west_indian))
## rebuild the share from the raw table, recoding NaN (e.g. from 0 / 0) as NA
west_indian <- raw_ancestry |>
  mutate(pct_west_indian = west_indianE / ancestry_popE) |>
  mutate(pct_west_indian = ifelse(is.nan(pct_west_indian),
                                  NA,
                                  pct_west_indian))

Bring in Shapefile Map Data

Download the shapefile of the 2020 Neighborhood Tabulation Areas from NYC Planning
Unzip and save to your part2/data/raw/geo folder

https://www.nyc.gov/site/planning/data-maps/open-data/census-download-metadata.page

Download the geojson of the NYC Borough Boundaries from NYC Open Data
Move it to your part2/data/raw/geo folder

https://data.cityofnewyork.us/City-Government/Borough-Boundaries/tqmj-j8zm

## read the NYC borough boundaries (GeoJSON from NYC Open Data)
boros <- st_read(dsn = "~/Downloads/Methods 1_R Learning/part2/data/raw/geo/BoroughBoundaries.geojson")
## read the 2020 Neighborhood Tabulation Areas (shapefile) for NYC
nabes <- st_read(dsn = "~/Downloads/Methods 1_R Learning/part2/data/raw/geo/nynta2020.shp")

Map Borough

# Choropleth of Brooklyn census tracts with neighborhood and borough outlines
# drawn on top. geom_sf() does not draw the `text` aesthetic (hence the
# "unknown aesthetics" warning); it is carried along so that
# ggplotly(tooltip = "text") can use it as the hover label.
west_indian_map <- ggplot() +
  # layer 1: tracts with a known share, filled by that share
  geom_sf(
    data = filter(west_indian, !is.na(pct_west_indian)),
    mapping = aes(
      fill = pct_west_indian,
      text = paste0(NAME, ":",
                    "<br>Percent West Indian ancestry:",
                    percent(pct_west_indian, accuracy=1))
    ),
    color = "transparent"
  ) +
  # layer 2: neighborhood (NTA) outlines
  geom_sf(
    data = filter(nabes, BoroName == "Brooklyn"),
    color = "#4D84BB", fill = NA, lwd = 0.1
  ) +
  # layer 3: borough outline
  geom_sf(
    data = filter(boros, boro_name == "Brooklyn"),
    color = "#244C95", fill = NA, lwd = .25
  ) +
  scale_fill_distiller(
    name = "Percent West Indian Ancestry (%)",
    breaks = c(0, .2, .4, .6, .8, 1),
    labels = percent_format(accuracy = 1L),
    direction = 1,
    na.value = "#fafafa" # "transparent" is an alternative
  ) +
  labs(
    title = "Brooklyn, West Indian Ancestry by Census Tract",
    caption = "Source: American Community Survey, 2016-20"
  ) +
  theme_void()

## Warning in layer_sf(geom = GeomSf, data = data, mapping = mapping, stat = stat,
## : Ignoring unknown aesthetics: text

# interactive version of the map; hover labels come from the `text` aesthetic
west_indian_map |>
  ggplotly(tooltip = "text")

Prep for Neighborhood File

## Check projection of the census tract data and of the neighborhood layer
st_crs(west_indian)
st_crs(nabes)

# The spatial join with the neighborhoods needs both layers in the same CRS,
# so reproject the tracts to whatever CRS `nabes` uses instead of hard-coding
# an EPSG code.
# NOTE(review): the `_2263` suffix is kept because later code refers to this
# name; it assumes `nabes` is in EPSG:2263 -- confirm against the
# st_crs(nabes) output above.
west_indian_2263 <- st_transform(west_indian, st_crs(nabes))
st_crs(west_indian_2263)

# keep just the borough and NTA code/name fields of the neighborhood layer
# (sf carries the geometry column along automatically)
nabes_selected <- select(nabes, BoroCode, BoroName, NTA2020, NTAName)

# attach neighborhood attributes to every tract: left = TRUE keeps tracts
# without a match, largest = TRUE picks the neighborhood with the largest
# overlap when a tract intersects several
west_indian_nabes <- st_join(
  x = west_indian_2263,
  y = nabes_selected,
  join = st_intersects,
  left = TRUE,
  largest = TRUE
)

## Warning: attribute variables are assumed to be spatially constant throughout
## all geometries

library(knitr)
# preview the first two joined rows as a formatted table
west_indian_nabes |>
  head(n = 2) |>
  kable()

Map Neighborhood

## create neighborhood data map
# Same choropleth as the borough map, restricted to the tracts assigned to
# Crown Heights (North) and outlined with that neighborhood's boundary.
# geom_sf() does not draw the `text` aesthetic; it is carried along so that
# ggplotly(tooltip = "text") can use it as the hover label.
west_indian_map_nabe <- ggplot() +
  # layer 1: Crown Heights (North) tracts with a known share
  geom_sf(
    data = filter(west_indian_nabes,
                  !is.na(pct_west_indian),
                  NTAName == "Crown Heights (North)"),
    mapping = aes(
      fill = pct_west_indian,
      text = paste0(NAME, ":",
                    "<br>Percent West Indian ancestry:",
                    percent(pct_west_indian, accuracy=1))
    ),
    color = "transparent"
  ) +
  # layer 2: neighborhood outline
  geom_sf(
    data = filter(nabes, NTAName == "Crown Heights (North)"),
    color = "#244C95", fill = NA, lwd = 0.25
  ) +
  scale_fill_distiller(
    name = "Percent West Indian Ancestry (%)",
    breaks = c(0, .2, .4, .6, .8, 1),
    labels = percent_format(accuracy = 1L),
    direction = 1,
    na.value = "transparent"
  ) +
  labs(
    title = "Crown Heights North, West Indian Ancestry by Census Tract",
    caption = "Source: American Community Survey, 2016-20"
  ) +
  theme_void()

## Warning in layer_sf(geom = GeomSf, data = data, mapping = mapping, stat = stat,
## : Ignoring unknown aesthetics: text

# interactive version of the neighborhood map; hover labels come from `text`
west_indian_map_nabe |>
  ggplotly(tooltip = "text")

Create Summary Table

## Calculate summary statistics per neighborhood (NTA)
# Geometry is dropped first so the result is a plain table.
# Totals are built from the estimate columns (suffix E). The matching M
# columns hold margins of error and must not be summed as population counts.
west_indian_nabe_stats <- st_drop_geometry(west_indian_nabes) |>
  group_by(NTAName) |>
  summarise(Borough = first(BoroName),
            `Est. Total Population` = sum(ancestry_popE),
            `Est. Total West Indian Population` = sum(west_indianE)) |>
  mutate(`Est. Percent West Indian Ancestry` = percent(
    `Est. Total West Indian Population` / `Est. Total Population`,
    accuracy = 1
  ))

Missing Values

A missing value is a way to signal an absence of information in a dataset.

Common reasons for missing values:

the information is not available for that area
the information is not reliable for that area
there was an error in data collection or processing
data joined improperly
join ids don’t match in all areas

Missing values are a part of messy, real-world data. Understanding how missing data are defined in R and how to perform operations with them will be a critical component of your data cleaning and analysis work.

As you begin to work with a new dataset, you should always investigate and document the following:

Are missing values present in my data?
Where are the missing values?
How will these missing values affect my analysis?

If NA values are not represented by NA you can:

Define the NA value while reading data into R
Change the data type of a column to force NA conversion
Use ifelse() to redefine the value as NA

#na.rm = TRUE: #argument that drops NA values before a calculation
#is.na(): #tests if a value is NA
#is.nan(): #tests if a value is NaN (Not a Number)

# build a numeric vector in which one element is missing (NA)
vector1 <- c(4, 6, 2, 8, NA, 9)

# inspect its structure: the NA is kept in place inside the vector
str(object = vector1)

##  num [1:6] 4 6 2 8 NA 9

In R, missing values typically look like an NA appearing in a variable, a vector, or a dataframe:

You may also encounter missing values in datasets that aren’t NA.

Sometimes the creators of a dataset will use a numeric value to indicate missing data (such as 999) or a string of characters (such as “N/A” or “–”).

You Can…

# define the dataset's NA value during data import
#deseg_pp_clean_na <- read_csv("part1/data/raw/invol_data_propublica.csv", 
#                              na = "N/A")

# change a column's data type from character to numeric
#deseg_conversion_clean <- deseg_pp_raw |>
#  mutate(Year.Placed = as.numeric(Year.Placed))

# use ifelse() to redefine the "N/A" string as NA
#deseg_conversion_clean_ifelse <- deseg_pp_raw |>
#  mutate(Year.Placed = ifelse(Year.Placed == "N/A", 
#                              NA, 
#                              Year.Placed))

# attempt to calculate the median year deseg orders were placed
#median(deseg_pp_clean_na$Year.Placed)

It’s important not to ignore missing values when you are trying to run calculations with your data. It’s so important that R will not let you ignore it:

Calculate the median year a desegregation order was placed:
If there is even one NA in the column, mathematical calculations return NA

New functions and concepts

multiple geoms in one ggplot

st_join()
st_crs()
st_transform()

When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the Preview button or press Cmd+Shift+K to preview the HTML file).

West Indian Ancestry Map in Brooklyn