I looked at the median income and population of each census tract in Corona and in Queens Community Districts 3 and 4, of which Corona is a part.
I started by loading libraries, searching for the relevant variables in the American Community Survey (ACS), and importing datasets with geographic data for NYC boroughs and Neighborhood Tabulation Areas (NTAs). I then made a new dataset filtering the ACS data by population and median income in Queens, NYC.
library(tidyverse)
library(tidycensus)
library(sf)
library(scales)
library(viridis)
# load all acs variables
acs201620 <- load_variables(2020, "acs5", cache = T)
## import borough shapefiles from NYC Open Data
boros <- st_read("C:/Users/likms/Desktop/DUE/methods1/main_data/raw/geo/Borough Boundaries.geojson", quiet = TRUE)
## import Neighborhood Tabulation Areas for NYC
nabes <- st_read("C:/Users/likms/Desktop/DUE/methods1/main_data/raw/geo/nynta2020_22b/nynta2020.shp", quiet = TRUE)
# create a new dataframe of population and median income in Queens county
raw_median_income <- get_acs(geography = "tract",
variables = c(median_income = "B06011_001",
total_population = "B01003_001"),
state='NY',
county = 'Queens',
geometry = T,
year = 2020,
output = "wide")
Fix the projection to 2263.
# transform the projection to 2263
raw_median_income_2263 <- st_transform(raw_median_income, 2263)
Filter to the relevant areas and information
#remove unnecessary fields in the neighborhood shapefile
nabes_selected <- nabes %>%
select(BoroCode, BoroName, NTA2020, NTAName)
# create a new dataframe and do a spatial join - join neighborhoods to census tracts
nabes_median_income_2263 <- raw_median_income_2263 %>%
st_join(nabes_selected,
left = TRUE,
join = st_intersects, # if they intersect
largest = TRUE) # if they overlap, choose the one with the largest overlap
## Warning: attribute variables are assumed to be spatially constant throughout all
## geometries
# filter to Corona and North Corona
corona_nabes_median_income_2263 <- nabes_median_income_2263 %>%
filter(NTAName == "Corona" | NTAName == "North Corona")
# filter to Community Districts 3 and 4, of which Corona is a part
cd3_4_nabes_median_income_2263 <- nabes_median_income_2263 %>%
filter(NTAName == "Corona" | NTAName == "North Corona" |
NTAName == "East Elmhurst" | NTAName == "Jackson Heights" | NTAName == "Elmhurst")
I mapped median income in Corona by census tract
# map median income in Corona by census tract
ggplot() +
geom_sf(data = corona_nabes_median_income_2263, mapping = aes(fill = median_incomeE),
color = "#ffffff",
lwd = 0) +
theme_void() +
scale_fill_distiller(direction = 1,
na.value = "transparent",
name="Median Income",
labels=number_format(accuracy = 1L)) +
labs(
title = "Median Income in Corona, Queens",
caption = "Source: American Community Survey, 2016-20") +
geom_sf(data = nabes %>% filter(NTAName == "Corona" | NTAName == "North Corona"),
color = "black", fill = NA, lwd = .5)
there are two census tracts that stand out: one with a significantly higher median income and one with a significantly lower median income. Why is this? Does this relate to the size of the population? Small sample size can have a large effect on results.
# map population of Corona by census tract
ggplot() +
geom_sf(data = corona_nabes_median_income_2263, mapping = aes(fill = total_populationE),
color = "#ffffff",
lwd = 0) +
theme_void() +
scale_fill_distiller(direction = 1,
na.value = "transparent",
name="Population",
labels=number_format(accuracy = 1L)) +
labs(
title = "Population in Corona, Queens Census Tracts",
caption = "Source: American Community Survey, 2016-20") +
geom_sf(data = nabes %>% filter(NTAName == "Corona" | NTAName == "North Corona"),
color = "black", fill = NA, lwd = .5)
I can’t see a clear relationship between the population size and income of these census tracts. As far as I can tell the population size does not have a relationship with the median income here.
I moved on to mapping the broader area. In this case I focused on Queens Community Districts 3 and 4. I mapped income first.
# map median income in Queens Community districts 3 and 4
ggplot() +
geom_sf(data = cd3_4_nabes_median_income_2263, mapping = aes(fill = median_incomeE),
color = "#ffffff",
lwd = 0) +
theme_void() +
scale_fill_distiller(direction = 1,
na.value = "transparent",
name="Median Income",
labels=number_format(accuracy = 1L)) +
labs(
title = "Median Income in Queens Community Districts 3 and 4",
caption = "Source: American Community Survey, 2016-20") +
geom_sf(data = nabes %>% filter(NTAName == "Corona" | NTAName == "North Corona"
| NTAName == "East Elmhurst" | NTAName == "Jackson Heights"
| NTAName == "Elmhurst"),
color = "black", fill = NA, lwd = .5)
I then mapped the population of Queens Community Districts 3 and 4
# map population of census tracts in Queens Community Districts 3 and 4
ggplot() +
geom_sf(data = cd3_4_nabes_median_income_2263, mapping = aes(fill = total_populationE),
color = "#ffffff",
lwd = 0) +
theme_void() +
scale_fill_distiller(direction = 1,
na.value = "transparent",
name="Population",
labels=number_format(accuracy = 1L)) +
labs(
title = "Population of Census Tracts in Queens Community Districts 3 and 4",
caption = "Source: American Community Survey, 2016-20") +
geom_sf(data = nabes %>% filter(NTAName == "Corona" | NTAName == "North Corona"
| NTAName == "East Elmhurst" | NTAName == "Jackson Heights"
| NTAName == "Elmhurst"),
color = "black", fill = NA, lwd = .5)
Lastly, I calculated summary statistics for these neighborhoods.
# calculate summary statistics for median income and population in Queens Community Districts 3 and 4
cd3_4_nabes_stats <- st_drop_geometry(cd3_4_nabes_median_income_2263) %>%
group_by(NTAName) %>%
summarise(NTAName = first(NTAName),
`Est. Average Median Income` = mean(median_incomeE),
`Est. Total Population` = sum(total_populationE),
`Est. Average Tract Population` = mean(total_populationE))
cd3_4_nabes_stats