Create Neighborhoods from Census Tracts

Overview

This document outlines the procedure to identify neighborhoods and define their boundaries for any given county in the US. Generally, this method works better for cities and is less helpful for rural areas or small towns. By this method, neighborhoods are comprised of one or several census tracts. The general method is as follows:

Download tract boundaries
Make a hexagonal grid over area
Use Mapzen API to get ‘places’
Save “places” in sf object
Download empty crosswalk
Make map to build crosswalk
Build crosswalk
Summarize by neighborhood
Map neighborhoods

## Set working directory and clear memory
setwd("/Users/shannoncarter/Documents/JanuaryAdvisors/civicap")
rm(list = ls(all = T))

## Load required packages
require(tidyverse)
require(sf)
require(leaflet)
require(tigris)
require(htmlwidgets)
require(lwgeom)
require(raster)
require(rmapzen)
require(geojsonio)

## Set CRS to use for all spatial data
wgs84 <- st_crs("+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs +towgs84=0,0,0")

1. Download tract boundaries

You can do this part for one or many counties. Note though that later parts should be done one county at a time because they are computationally intensive. For a non-county starting point, download shapefile of whichever boundary (e.g.,city) you choose and st_join with census tracts to include only those census tracts in that boundary.

### COUNTY BOUNDARIES ###

# use tigris to get county boundaries - 48 = FIPS code for TX. google for desired state
counties <- tigris::counties(state = 48) %>% 
  # make a simple features object
  st_as_sf() %>% 
  # filter desired county/ies. can use name, or 3-digit FIPS code
  filter(NAME == "Collin" | NAME == "Dallas" | NAME == "Fort Bend" | NAME == "Tarrant" | NAME == "Travis") %>% 
  # transform to our chosen CRS
  st_transform(crs = wgs84) %>%
  # select only the columns we need and give better names
  dplyr::select(GEOID = "GEOID", county = "NAME", geometry = "geometry")


### CENSUS TRACT BOUNDARIES ###

# use tigris to get census tract boundaries for all 5 counties
# specify counties in call here, bc it lots of data, so don't want it loading the whole state
tracts <- tigris::tracts("48", c("085", "113", "157", "439", "453"), cb = T) %>% 
  # make a simple features object
  st_as_sf() %>% 
  # transform to our chosen CRS
  st_transform(crs = wgs84) %>% 
  # rename FIPS codes to county names
  mutate(county = case_when(
    COUNTYFP == "085" ~ "Collin",
    COUNTYFP == "113" ~ "Dallas",
    COUNTYFP == "157" ~ "Fort Bend",
    COUNTYFP == "439" ~ "Tarrant",
    COUNTYFP == "453" ~ "Travis")) %>% 
  # select only the columns we need and give better names
  dplyr::select(county = "county", GEOID = "GEOID", geometry = "geometry")

2. Make a hexagonal grid over area

The algorithm will scan over every cell of this grid and extract ‘places’ from the Mapzen API. Adjust the ‘cellsize’ argument for different resolutions. Larger cell size will give bigger places (e.g., cities and towns). 0.02 works well for neighborhood-level resolution.

## Make a variable-sized hex grid across one or all counties
# size is variable -- 0.02 seems like goo resolution
hexes <- counties %>% 
  # can do one or multiple counties
  filter(county == "Travis") %>%  
  # 0.02 works well for neighborhoods in cities. Consider trying other resolutions for different purposes
  st_make_grid(cellsize = .02, square = FALSE) %>% 
  # make an sf object
  st_sf()  %>%
  # add a cell id for bookkeeping purposes
  mutate(cell_id = row_number())

## Map hexes
leaflet() %>%
  addProviderTiles("CartoDB.Positron") %>% 
  setView(-97.8, 30.3, zoom = 9) %>%  
  # county boundary
  addPolygons(
    data = counties,
    fill = "white",
    fillOpacity = 0,
    weight = 2,
    color = 'black'
  ) %>%
  # hexes
  addPolygons(
    data = hexes,
    fill = 'blue', 
    color = 'blue',
    weight = 1,
    fillOpacity = 0.5,
    label = ~paste0(cell_id)
  )

3. Use Mapzen API to get ‘places’

Now, loop through every hex grid cell above and extract the ‘places’ within with the Mapzen API. This takes about 10 minutes per county.

## Mapzen API key
# https://cran.r-project.org/web/packages/rmapzen/vignettes/rmapzen-introduction.html
#mz_set_tile_host_nextzen(key = "yourAPIkeyhere")

## Make a function to extract places from a bound box
get_vector_tiles <- function(bbox){
  mz_box = mz_rect(bbox$xmin,bbox$ymin,bbox$xmax,bbox$ymax)
  mz_vector_tiles(mz_box)
}

## Loop through all hexes and extract places
# for each row of hexes (sf object made above)
for (i in 1:nrow(hexes)) {
  
  # subset data by row
  row <- hexes[i,]
  
  # make a bound box
  bounds <- extent(row) 
  lon_min = as.numeric(bounds@xmin)
  lon_max = as.numeric(bounds@xmax)
  lat_min = as.numeric(bounds@ymin)
  lat_max = as.numeric(bounds@ymax)
  bbox <- st_bbox(c(xmin = lon_min, xmax = lon_max, ymax = lat_max, ymin = lat_min), crs = st_crs(4326))
  
  # pull vector tiles for bounding box using function above
  vector_tiles <- get_vector_tiles(bbox)
  
  # some hexes don't have places-- write logic to fill these with NA
  # if no places...
  if (length(vector_tiles[["places"]][["features"]]) == 0) { 
    # make a df with NA for all places-associated fields
    places <- st_sf(kind = NA, name = NA, geometry = NA,
                    cell_id = row$cell_id,
                    # needs a geo col to make an sf, so need to keep this
                    cell_geo = row$geometry) 
    # if places...
    } else {
    # extract places and make a df with cell info
    places <- as_sf(vector_tiles$places) %>% 
      dplyr::select(kind, name, geometry) %>% 
      dplyr::mutate(cell_id = row$cell_id,
                    cell_geo = row$geometry)
    }
  
  # monitor progress
  #print(row$cell_id)
  
  # make/add to df
  if (i == 1) {
    combined <- places
  } else {
    combined <- rbind(combined, places)
  }
}

4. Save “places” in sf object

Clean and save places simple features object

## Clean and save combined

# remove geometry for cells/hexes
combined$cell_geo <- NULL
combined <- combined %>% 
  # remove NAs
  filter(!is.na(name)) %>%
  # remove unnecessary columns
  dplyr::select(-c(kind, cell_id)) 

# save as R data file
saveRDS(combined, 'places.rds')

5. Download empty crosswalk

Make a csv with the census tracts. Copy to an excel workbook– this is where you will populate the crosswalk.

## Pull census tract names for selected counties to make neighborhood crosswalk
tracts_for_crosswalk <- tracts %>%
  # filter county/ies of interest
  filter(county == "Travis") %>%
  # order by tract ID to make things easier later
  arrange(county, GEOID) %>% 
  # helpful to pull out last 4 digits-- easier for searching/matching later
  mutate(tract = str_sub(GEOID, 8, 11),
         neighborhood = NA)

# remove geometry
tracts_for_crosswalk$geometry <- NULL

# save to csv
# ** HIGHLY recommend copying contents to excel file and populating there so you don't accidently overwrite 
write.csv(tracts_for_crosswalk, "tracts_for_crosswalk.csv")

6. Make map to build crosswalk

Build a map with census tracts and ‘places’ extracted from mapzen. You’ll use this to build the crossalk between census tract and neighborhood.

## Load places, tract, and county data
# if any of these contain multiple counties, filter to do one county at a time
# the map used to build crosswalk has a lot of data and won't render with multiple counties
p <- readRDS("places.rds") %>% 
  st_as_sf(crs = wgs84) %>% 
  st_join(counties) %>%
  filter(county == "Travis")
t <- tracts %>% 
  filter(county == "Travis") %>% 
  mutate(tract = substr(GEOID, 8, 11))
c <- counties %>% 
  filter(county == "Travis")

## Map places over census tracts
places_map <- leaflet() %>%
  addProviderTiles("CartoDB.Positron") %>% 
   setView(-97.8, 30.3, zoom = 9) %>% 
  # county boundary
  addPolygons(
    data = c,
    fill = "white",
    fillOpacity = 0,
    weight = 2,
    color = 'black'
  ) %>% 
  # census tracts, permanent 4-digit label
  # 4-digit code, not a popup, makes it easier to build crosswalk
  addPolygons(
    data = t,
    fill = 'white',
    fillOpacity = 0,
    weight = 1.5,
    color = 'blue',
    label = ~paste0(tract),
    labelOptions = labelOptions(noHide = T, direction = 'middle', textOnly = T, textsize = 14)
  ) %>% 
  # census tracts, popup full census tract label
  # sometimes there are replicate 4-digit ('tract' in df). have full tract available 
  addPolygons(
    data = t,
    fill = 'white',
    fillOpacity = 0,
    weight = 1.5,
    color = 'blue',
    label = ~paste0(GEOID)
  ) %>%
  # places -- mapped without boundaries, text only
  addLabelOnlyMarkers(
    data = p,
    label =  ~as.character(name),
    labelOptions = labelOptions(noHide = T, direction = 'top', textOnly = T, textsize = 16))

## Save as html file
saveWidget(places_map, file = "tracts_places_map.html")

7. Build crosswalk

Open tracts_places_map.html and crosswalk.xlsx (copied from tracts_for_crosswalk.csv). Make column “neighborhood” in crosswalk.xlsx and populate with “places” from the map. Assign a neighborhood to each census tract. Multiple census tracts can/should comprise a neighborhood. Some hints:

The 4-digit code printed on the map is unique for most/all census tracts (depending on the size of the county), but some may be duplicated. Hover over tract on map for the full census tract ID
You can ctrl-f on map to find census tracts
Use “places” extracted from Mapzen as much as possible, but you can also use places printed on the map base.
- This is particularly helpful for towns
- If a tract has two places, you can decide which to name by seeing if one is bigger/more central on the map
- If a collection of tracts don’t have any ‘places’ nearby, see if there’s something on the map. Occasionally, I made up distinctions relative to the nearest biggest place, e.g., “North Arlington”
If a single census tract has two big places, name it with a slash, e.g., “Eldridge / West Oaks”
Do your best. It’s easy to make changes later, so don’t worry about getting everything perfect.

8. Summarize by neighborhood

The crosswalk assigns each census tract ID to a neighborhood. Now, we need to add back in the census tract geometry and summarize to dissolve the inner boundaries, where multiple census tracts comprise a neighborhood. This will leave us with the names and outlines of the neighborhoods. Save this as an sf object

# Read in crosswalk and summarize by neighborhood
n <- read.csv("tract_neighborhood_crosswalk.csv", header = T) %>% 
  mutate(GEOID = as.character(GEOID)) %>% 
  # join with tracts-- this brings geometry to each tract
  left_join(tracts) %>% 
  st_sf() %>% 
  # group by neighborhood
  group_by(neighborhood) %>% 
  # dissolves interior boundaries, keep county name
  summarize(county = first(county)) 

# Save as R data file
saveRDS(n, 'neighborhoods.rds')

9. Map neighborhoods

See what you’ve made, share with locals, and make any necessary changes to the crosswalk.

# Read in crosswalk
crosswalk <- read.csv("tract_neighborhood_crosswalk.csv")

# Add a variable to map to color
neighborhoods <- n %>% 
  mutate(category = factor(sample.int(12L, nrow(.), TRUE)))

# Join with tract shapefile
data2 <- tracts %>% 
  mutate(GEOID = as.numeric(GEOID)) %>% 
  left_join(crosswalk) %>% 
  st_join(neighborhoods)

# make palettes
set.seed(999) 
pc1 <- colorFactor(rainbow(12), neighborhoods$category)

## Map places over census tracts
n_map <- leaflet() %>%
  addProviderTiles("CartoDB.Positron") %>% 
  setView(-97.7, 30.27, zoom = 9) %>%
  addPolygons(
    data = counties,
    fill = "white",
    fillOpacity = 0,
    weight = 2,
    color = 'black'
  ) %>% 
  addPolygons(
    data = tracts,
    color = "black",
    stroke = T,
    fillOpacity = 0,
    weight = 0.7,
    dashArray = "3"
  ) %>%
  addPolygons(
    data = neighborhoods,
    color = ~pc1(category),
    stroke = T,
    fillOpacity = 0.4,
    weight = 1.5
  ) %>% 
  addPolygons(
    data = neighborhoods,
    fill = "white",
    fillOpacity = 0,
    weight = 1.5,
    color = 'black',
    label = ~paste0(neighborhood)
  ) 
n_map