Mapping the proportion of people with West Indian ancestry in Brooklyn, New York.
library(tidyverse)
library(tidycensus)
library(sf)
library(scales)
library(viridis)
library(plotly)
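Census tract boundaries and data are from the 2016-2020 5-year American Community Survey, accessed with the tidycensus R package. Census data includes the total population reporting ancestry (B04006_001) and the population reporting West Indian ancestry (B04006_094).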
Borough boundaries are represented with a GeoJSON file that was downloaded from NYC Open Data.
# load all acs variables
#acs201620 <- load_variables(2020, "acs5", cache = T)
# Import ACS table B04006 (People Reporting Ancestry) for census tracts in
# Kings County, NY, using the 5-year ACS ending in 2020.
# With `output = "wide"` each named variable becomes two columns: the estimate
# (suffix "E", e.g. west_indianE) and its margin of error (suffix "M",
# e.g. west_indianM). `geometry = TRUE` attaches the tract geometries.
raw_ancestry <- get_acs(
  geography = "tract",
  variables = c(
    ancestry_pop = "B04006_001",
    west_indian = "B04006_094"
  ),
  state = "NY",
  county = "Kings",
  # Spell out TRUE: `T` is an ordinary variable that can be reassigned.
  geometry = TRUE,
  year = 2020,
  output = "wide"
)
## Warning: • You have not set a Census API key. Users without a key are limited to 500
## queries per day and may experience performance limitations.
## ℹ For best results, get a Census API key at
## http://api.census.gov/data/key_signup.html and then supply the key to the
## `census_api_key()` function to use it throughout your tidycensus session.
## This warning is displayed once per session.
# Share of the ancestry-reporting population that reports West Indian ancestry
west_indian <- mutate(
  raw_ancestry,
  pct_west_indian = west_indianE / ancestry_popE
)

# Inspect the share before deciding how to map it
## summary statistics, including the count of missing values
summary(west_indian$pct_west_indian)
## overall shape of the distribution
hist(west_indian$pct_west_indian)

## pull out the tracts with a missing share to check whether they are true NAs
na_tracts <- filter(west_indian, is.na(pct_west_indian))

## rebuild the share, converting NaN results (e.g. from 0 / 0) to NA
west_indian <- mutate(
  raw_ancestry,
  pct_west_indian = west_indianE / ancestry_popE,
  pct_west_indian = ifelse(is.nan(pct_west_indian), NA, pct_west_indian)
)
Download the shapefile of the 2020 Neighborhood Tabulation Areas from NYC Planning
Unzip and save to your part2/data/raw/geo folder
https://www.nyc.gov/site/planning/data-maps/open-data/census-download-metadata.page
Download the geojson of the NYC Borough Boundaries from NYC Open Data
Move it to your part2/data/raw/geo folder (the same folder the import code below reads from)
https://data.cityofnewyork.us/City-Government/Borough-Boundaries/tqmj-j8zm
## Borough boundaries (GeoJSON downloaded from NYC Open Data)
boros <- st_read(
  dsn = "~/Downloads/Methods 1_R Learning/part2/data/raw/geo/BoroughBoundaries.geojson"
)

## 2020 Neighborhood Tabulation Areas for NYC (shapefile)
nabes <- st_read(
  dsn = "~/Downloads/Methods 1_R Learning/part2/data/raw/geo/nynta2020.shp"
)
# Choropleth of the West Indian ancestry share by Brooklyn census tract, with
# neighborhood (NTA) and borough outlines drawn on top.
west_indian_map <- ggplot() +
  # Tract fill layer; tracts with a missing share are dropped first.
  # `text` is not a ggplot2 aesthetic (ggplot2 warns about it); it is carried
  # along so the ggplotly() call further down can use it as the tooltip.
  geom_sf(
    data = filter(west_indian, !is.na(pct_west_indian)),
    mapping = aes(
      fill = pct_west_indian,
      text = paste0(
        NAME, ":",
        "<br>Percent West Indian ancestry:",
        percent(pct_west_indian, accuracy = 1)
      )
    ),
    color = "transparent"
  ) +
  # Neighborhood outlines (thin, lighter blue)
  geom_sf(
    data = filter(nabes, BoroName == "Brooklyn"),
    color = "#4D84BB",
    fill = NA,
    lwd = 0.1
  ) +
  # Borough outline (thicker, darker blue)
  geom_sf(
    data = filter(boros, boro_name == "Brooklyn"),
    color = "#244C95",
    fill = NA,
    lwd = .25
  ) +
  # Fill scale labelled as whole percentages
  scale_fill_distiller(
    breaks = c(0, .2, .4, .6, .8, 1),
    direction = 1,
    na.value = "#fafafa", # alternative: "transparent"
    name = "Percent West Indian Ancestry (%)",
    labels = percent_format(accuracy = 1L)
  ) +
  labs(
    title = "Brooklyn, West Indian Ancestry by Census Tract",
    caption = "Source: American Community Survey, 2016-20"
  ) +
  theme_void()
## Warning in layer_sf(geom = GeomSf, data = data, mapping = mapping, stat = stat,
## : Ignoring unknown aesthetics: text
# Interactive version of the Brooklyn map; the tooltip shows the `text` aesthetic
west_indian_map |>
  ggplotly(tooltip = "text")
## Compare the coordinate reference systems of the tract and neighborhood layers
st_crs(west_indian)
st_crs(nabes)

## Reproject the tracts to EPSG:2263 and confirm the result
## (presumably the CRS reported for `nabes` above; confirm from that output)
west_indian_2263 <- west_indian |>
  st_transform(crs = 2263)
st_crs(west_indian_2263)
# Keep only the identifying fields of the neighborhood layer
nabes_selected <- select(nabes, BoroCode, BoroName, NTA2020, NTAName)

# Spatial left join of neighborhoods onto tracts using st_intersects;
# with `largest = TRUE` each tract takes the attributes of the neighborhood
# it overlaps most.
west_indian_nabes <- st_join(
  west_indian_2263,
  nabes_selected,
  join = st_intersects,
  left = TRUE,
  largest = TRUE
)
## Warning: attribute variables are assumed to be spatially constant throughout
## all geometries
library(knitr)
# Render the first two rows of the joined data as a table
west_indian_nabes |>
  head(n = 2) |>
  kable()
## Neighborhood map: West Indian ancestry share by census tract within
## Crown Heights (North), with the neighborhood outline drawn on top.
west_indian_map_nabe <- ggplot() +
  # Tract fill layer: only tracts joined to Crown Heights (North) that have a
  # non-missing share. `text` is carried along for the ggplotly() tooltip.
  geom_sf(
    data = filter(
      west_indian_nabes,
      !is.na(pct_west_indian),
      NTAName == "Crown Heights (North)"
    ),
    mapping = aes(
      fill = pct_west_indian,
      text = paste0(
        NAME, ":",
        "<br>Percent West Indian ancestry:",
        percent(pct_west_indian, accuracy = 1)
      )
    ),
    color = "transparent"
  ) +
  # Neighborhood outline
  geom_sf(
    data = filter(nabes, NTAName == "Crown Heights (North)"),
    color = "#244C95",
    fill = NA,
    lwd = 0.25
  ) +
  # Fill scale labelled as whole percentages
  scale_fill_distiller(
    breaks = c(0, .2, .4, .6, .8, 1),
    direction = 1,
    na.value = "transparent",
    name = "Percent West Indian Ancestry (%)",
    labels = percent_format(accuracy = 1L)
  ) +
  labs(
    title = "Crown Heights North, West Indian Ancestry by Census Tract",
    caption = "Source: American Community Survey, 2016-20"
  ) +
  theme_void()
## Warning in layer_sf(geom = GeomSf, data = data, mapping = mapping, stat = stat,
## : Ignoring unknown aesthetics: text
# Interactive version of the neighborhood map; the tooltip shows the `text` aesthetic
west_indian_map_nabe |>
  ggplotly(tooltip = "text")
## Calculate summary statistics per neighborhood (NTA): the total population
## reporting ancestry, the total West Indian population, and the West Indian
## share formatted as a whole-number percentage string.
## Totals are built from the estimate columns (suffix "E"). The "M" columns are
## margins of error, so summing west_indianM would not give a population count.
west_indian_nabe_stats <- st_drop_geometry(west_indian_nabes) |>
  group_by(NTAName) |>
  summarise(
    Borough = first(BoroName),
    `Est. Total Population` = sum(ancestry_popE),
    `Est. Total West Indian Population` = sum(west_indianE)
  ) |>
  mutate(
    `Est. Percent West Indian Ancestry` = percent(
      `Est. Total West Indian Population` / `Est. Total Population`,
      accuracy = 1
    )
  )
A missing value is a way to signal an absence of information in a dataset.
Missing values are a part of messy, real-world data. Understanding how missing data are defined in R and how to perform operations with them will be a critical component of your data cleaning and analysis work.
#na.rm = TRUE
#is.na(): #tests if a value is NA
#is.nan(): #tests if a value is NaN (Not a Number)
# Numeric vector of six values; the fifth one is missing (NA)
vector1 <- c(
  4, 6, 2, 8,
  NA,
  9
)
# Print a compact description of vector1: its type, length, and values
vector1 |> str()
## num [1:6] 4 6 2 8 NA 9
In R, missing values typically look like an NA appearing in a variable, a vector, or a dataframe:
You may also encounter missing values in datasets that aren’t NA.
Sometimes the creators of a dataset will use a numeric value to indicate missing data (such as 999) or a string of characters (such as “N/A” or “–”).
# define the dataset's NA value during data import
#deseg_pp_clean_na <- read_csv("part1/data/raw/invol_data_propublica.csv",
# na = "N/A")
# change a column's data type from character to numeric
#deseg_conversion_clean <- deseg_pp_raw |>
# mutate(Year.Placed = as.numeric(Year.Lifted))
# replace the "N/A" placeholder string in a column with a true NA value
#deseg_conversion_clean_ifelse <- deseg_pp_raw |>
# mutate(Year.Placed = ifelse(Year.Placed == "N/A",
# NA,
# Year.Placed))
# attempt to calculate the median year deseg orders were placed
#median(deseg_pp_clean_na$Year.Placed)
It’s important not to ignore missing values when you are trying to run calculations with your data. It’s so important that R will not let you ignore it:
Calculate the median year a desegregation order was placed:
If there is even one NA in the column, mathematical calculations return NA unless you tell R to drop the missing values first (for example, `median(x, na.rm = TRUE)`).
multiple geoms in one ggplot
When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the Preview button or press Cmd+Shift+K to preview the HTML file).