Overview

I looked at the median income and population of each census tract in Corona and in Queens Community Districts 3 and 4, of which Corona is a part.

Data Processing

I started by loading libraries, searching for the relevant variables in the American Community Survey (ACS), and importing datasets with geographic data for NYC boroughs and Neighborhood Tabulation Areas (NTAs). I then made a new dataset filtering the ACS data by population and median income in Queens, NYC.

library(tidyverse)
library(tidycensus)
library(sf)
library(scales)
library(viridis)

# load all acs variables
acs201620 <- load_variables(2020, "acs5", cache = T)

## import borough shapefiles from NYC Open Data
boros <- st_read("C:/Users/likms/Desktop/DUE/methods1/main_data/raw/geo/Borough Boundaries.geojson", quiet = TRUE)

## import Neighborhood Tabulation Areas for NYC
nabes <- st_read("C:/Users/likms/Desktop/DUE/methods1/main_data/raw/geo/nynta2020_22b/nynta2020.shp", quiet = TRUE)

# create a new dataframe of population and median income in Queens county
raw_median_income <- get_acs(geography = "tract",
                         variables = c(median_income = "B06011_001", 
                                       total_population = "B01003_001"),
                         state='NY',
                         county = 'Queens',
                         geometry = T,
                         year = 2020,
                         output = "wide")

Fix the projection to 2263.

# transform the projection to 2263
raw_median_income_2263 <- st_transform(raw_median_income, 2263)

Filter to the relevant areas and information

#remove unnecessary fields in the neighborhood shapefile
nabes_selected <- nabes %>%
  select(BoroCode, BoroName, NTA2020, NTAName)

# create a new dataframe and do a spatial join - join neighborhoods to census tracts
nabes_median_income_2263 <- raw_median_income_2263 %>%
  st_join(nabes_selected, 
          left = TRUE,
          join = st_intersects, # if they intersect
          largest = TRUE) # if they overlap, choose the one with the largest overlap
## Warning: attribute variables are assumed to be spatially constant throughout all
## geometries
# filter to Corona and North Corona
corona_nabes_median_income_2263 <- nabes_median_income_2263 %>% 
  filter(NTAName == "Corona" | NTAName == "North Corona")

# filter to Community Districts 3 and 4, of which Corona is a part
cd3_4_nabes_median_income_2263 <- nabes_median_income_2263 %>% 
  filter(NTAName == "Corona" | NTAName == "North Corona" | 
  NTAName == "East Elmhurst" | NTAName == "Jackson Heights" | NTAName == "Elmhurst")

I mapped median income in Corona by census tract

# map median income in Corona by census tract
ggplot()  + 
  geom_sf(data = corona_nabes_median_income_2263, mapping = aes(fill = median_incomeE), 
          color = "#ffffff",
          lwd = 0) +
  theme_void() +
  scale_fill_distiller(direction = 1,
                       na.value = "transparent",
                       name="Median Income",
                       labels=number_format(accuracy = 1L)) +
  labs(
    title = "Median Income in Corona, Queens",
    caption = "Source: American Community Survey, 2016-20") +
  geom_sf(data = nabes %>% filter(NTAName == "Corona" | NTAName == "North Corona"), 
          color = "black", fill = NA, lwd = .5)

there are two census tracts that stand out: one with a significantly higher median income and one with a significantly lower median income. Why is this? Does this relate to the size of the population? Small sample size can have a large effect on results.

# map population of Corona by census tract
ggplot()  + 
  geom_sf(data = corona_nabes_median_income_2263, mapping = aes(fill = total_populationE), 
          color = "#ffffff",
          lwd = 0) +
  theme_void() +
  scale_fill_distiller(direction = 1,
                       na.value = "transparent",
                       name="Population",
                       labels=number_format(accuracy = 1L)) +
  labs(
    title = "Population in Corona, Queens Census Tracts",
    caption = "Source: American Community Survey, 2016-20") +
  geom_sf(data = nabes %>% filter(NTAName == "Corona" | NTAName == "North Corona"), 
          color = "black", fill = NA, lwd = .5)

I can’t see a clear relationship between the population size and income of these census tracts. As far as I can tell the population size does not have a relationship with the median income here.

I moved on to mapping the broader area. In this case I focused on Queens Community Districts 3 and 4. I mapped income first.

# map median income in Queens Community districts 3 and 4
ggplot()  + 
  geom_sf(data = cd3_4_nabes_median_income_2263, mapping = aes(fill = median_incomeE), 
          color = "#ffffff",
          lwd = 0) +
  theme_void() +
  scale_fill_distiller(direction = 1,
                       na.value = "transparent",
                       name="Median Income",
                       labels=number_format(accuracy = 1L)) +
  labs(
    title = "Median Income in Queens Community Districts 3 and 4",
    caption = "Source: American Community Survey, 2016-20") +
  geom_sf(data = nabes %>% filter(NTAName == "Corona" | NTAName == "North Corona" 
                                  | NTAName == "East Elmhurst" | NTAName == "Jackson Heights" 
                                  | NTAName == "Elmhurst"), 
          color = "black", fill = NA, lwd = .5)

I then mapped the population of Queens Community Districts 3 and 4

# map population of census tracts in Queens Community Districts 3 and 4
ggplot()  + 
  geom_sf(data = cd3_4_nabes_median_income_2263, mapping = aes(fill = total_populationE), 
          color = "#ffffff",
          lwd = 0) +
  theme_void() +
  scale_fill_distiller(direction = 1,
                       na.value = "transparent",
                       name="Population",
                       labels=number_format(accuracy = 1L)) +
  labs(
    title = "Population of Census Tracts in Queens Community Districts 3 and 4",
    caption = "Source: American Community Survey, 2016-20") +
  geom_sf(data = nabes %>% filter(NTAName == "Corona" | NTAName == "North Corona" 
                                  | NTAName == "East Elmhurst" | NTAName == "Jackson Heights" 
                                  | NTAName == "Elmhurst"), 
          color = "black", fill = NA, lwd = .5)

Lastly, I calculated summary statistics for these neighborhoods.

# calculate summary statistics for median income and population in Queens Community Districts 3 and 4
cd3_4_nabes_stats <- st_drop_geometry(cd3_4_nabes_median_income_2263) %>% 
  group_by(NTAName) %>% 
  summarise(NTAName = first(NTAName),
            `Est. Average Median Income` = mean(median_incomeE),
            `Est. Total Population` = sum(total_populationE),
            `Est. Average Tract Population` = mean(total_populationE))

cd3_4_nabes_stats