tidycensus practice using Anne Arundel county, MD, data

https://walker-data.com/umich-workshop-2022/intro-2020-census/

1 Decennial census

2020 Census data: what we have now * The currently available 2020 Census data come from the PL94-171 Redistricting Summary File, which is used for congressional apportionment & redistricting

  • Variables available include total counts (population & households), occupied / vacant housing units, total and voting-age population breakdowns by race & ethnicity, and group quarters status

  • Later in 2022 (expected): the Demographic and Housing Characteristics summary files, which will include other variables typically included in the decennial Census data (age & sex breakdowns, detailed race & ethnicity)

1.1 Population by state

pop20 <- get_decennial(
    geography = "state",
    variables = "P1_001N",
    year = 2020 #<<<< Deccenial census year
)

#pop20
dim(pop20)
str(pop20)

Understanding the printed messages * The Census Bureau is using differential privacy in an attempt to preserve respondent confidentiality in the 2020 Census data, which is required under US Code Title 13

table_p2 <- get_decennial(
    geography = "state", 
    table = "P2", #<<<< Table number
    year = 2020
)

dim(table_p2)
str(table_p2)

1.1.1 Understanding geography and variables in tidycensus

https://walker-data.com/tidycensus/articles/basic-usage.html#geography-in-tidycensus-1

1.1.2 Searching for variables

vars <- load_variables(2020, "pl")
dim(vars)
#View(vars)

1.1.3 Tables in the 2020 Census PL file

  • H1: Occupancy status (by household)
  • P1: Race
  • P2: Race by Hispanic origin
  • P3: Race for the population 18+
  • P4: Race by Hispanic origin for the population 18+
  • P5: Group quarters status

1.1.4 Data structure in tidycensus

tidy = long form in tidycensus, since it’s modeled after tidyverse

group_quarters <- get_decennial(
  geography = "state", 
  table = "P5", 
  year = 2020
)

dim(group_quarters)
table(group_quarters$variable)

group_quarters_wide <- get_decennial(
  geography = "state", 
  table = "P5",
  year = 2020,
  output = "wide"
)

dim(group_quarters_wide)
table(group_quarters_wide$variable)

1.1.4 Using named vectors of variables

  • Census variables can be hard to remember; using a named vector to request variables will replace the Census IDs with a custom input

  • In long form, these custom inputs will populate the variable column; in wide form, they will replace the column names

vacancies_wide <- get_decennial(
    geography = "county",
    state = "MD",
    variables = c(vacant_households = "H1_003N",
                  total_households = "H1_001N"),
    output = "wide",
    year = 2020
)

1.2 Population by county

  • For geographies available below the state level, the state parameter allows you to query data for a specific state

  • tidycensus translates state names and postal abbreviations internally, so you don’t need to remember the FIPS codes!

pop_co <- get_decennial(
    state = "MD", 
    geography = "county", 
    variables = "P1_001N", #<<<<< total pop
    #variables = "P2_002N", #<<<<< hispanic pop
    year = 2020
)

pop_co_table <- get_decennial(
    state = "MD", 
    geography = "county", 
    table = "P2", 
    year = 2020
)

dim(pop_co)
str(pop_co)
dim(pop_co_table)
str(pop_co_table)
table(pop_co_table$variable)

1.3 Population by tract

  • County names are also translated internally by tidycensus for sub-county queries, e.g. for Census tracts, block groups, and blocks
pop_tr <- get_decennial(
    state = "MD", 
    county = "Anne Arundel", 
    geography = "tract", 
    variables = "P1_001N",
    year = 2020
)

pop_tr_table <- get_decennial(
    state = "MD", 
    county = "Anne Arundel", 
    geography = "tract", 
    table = "P2", 
    year = 2020
)
medage_tr <- get_decennial(
    state = "MD", 
    county = "Anne Arundel", 
    geography = "tract", 
    #variables = "P1_001N", WHAT IS THE MEDIAN AGE?? 
    year = 2020
)

dim(pop_tr)
dim(pop_tr_table)

poptr

1.4 More practices to wrangle data in tidycensus

Example 1 what are the largest and smallest counties in Maryland by population?

md_population <- get_decennial(
    geography = "county",
    variables = "P1_001N",
    year = 2020,
    state = "MD"
)

arrange(md_population, value)
arrange(md_population, desc(value))

Example 2: What are the counties with a population below 30,000?

below30000 <- filter(md_population, value < 30000)
below30000

Example 3: What are % of population by race in each county?

  • Many decennial Census and ACS variables are organized in tables in which the first variable represents a summary variable, or denominator for the others

  • The parameter summary_var can be used to generate a new column in long-form data for a requested denominator, which works well for normalizing estimates

race_vars <- c(
    Hispanic = "P2_002N",
    White = "P2_005N",
    Black = "P2_006N",
    Native = "P2_007N",
    Asian = "P2_008N",
    HIPI = "P2_009N"
)

md_race <- get_decennial(
    geography = "county",
    state = "MD",
    variables = race_vars,
    summary_var = "P2_001N",
    year = 2020
)

md_race

md_race_percent <- md_race %>%
    mutate(percent = round(100 * (value / summary_value), 1)) %>%
    select(NAME, variable, percent)

arrange(md_race_percent, NAME)

Example 4: What is the largest group by county? What are the median percentages by race group?

largest_group <- md_race_percent %>%
  group_by(NAME) %>%
  filter(percent == max(percent))

largest_group
md_race_percent %>%
    group_by(variable) %>%
    summarize(
        median_pct = median(percent), 
        minimum_pct = min(percent), 
        maximum_pct = max(percent)
        )

1.5 Trend since 2010 Census?

  • tidycensus allows users to access the 2000 and 2010 decennial Census data for comparison, though variable IDs will differ
county_pop_10 <- get_decennial(
        state = "MD",
        geography = "county",
        variables = "P001001",
        year = 2010
)

    dim(county_pop_10)
    colnames(county_pop_10)

county_pop_10 <- county_pop_10 %>%
    select(GEOID, value10 = value)

    dim(county_pop_10)
    colnames(county_pop_10)

county_pop_20 <- get_decennial(
        state = "MD",
        geography = "county",
        variables = "P1_001N", #<<<<< Notice the variable name change!! 
        year = 2020)%>%
    select(GEOID, NAME, value20 = value)

county_joined <- county_pop_20 %>%
    left_join(county_pop_10, by = "GEOID")%>%
    mutate(
        total_change = value20 - value10,
        percent_change = round(100 * (total_change / value10), 1)
    )

head(county_joined, 10)

Caveat: changing geographies! * County names and boundaries can change from year to year, introducing potential problems in time-series analysis

  • This is particularly acute for small geographies like Census tracts & block groups, which we’ll cover on March 25!
county_pop_10 <- get_decennial(
        geography = "county",
        variables = "P001001",
        year = 2010)%>%
    select(GEOID, value10 = value)

county_pop_20 <- get_decennial(
        geography = "county",
        variables = "P1_001N", #<<<<< Notice the variable name change!! 
        year = 2020)%>%
    select(GEOID, NAME, value20 = value)

county_joined <- county_pop_20 %>%
    left_join(county_pop_10, by = "GEOID")%>%
    mutate(
        total_change = value20 - value10,
        percent_change = round(100 * (total_change / value10), 1)
    )

filter(county_joined, is.na(value10)) #<<<<< counties that did not exist in 2010??

1.6 Data viz with 2020 Census data

Example 1. Hispanic population

md_hispanic <- get_decennial(
        geography = "county", 
        variables = c(total = "P2_001N",
                      hispanic = "P2_002N"), 
        state = "MD",
        year = 2020,
        output = "wide") %>%
    mutate(percent = 100 * (hispanic / total))
ggplot(md_hispanic, aes(x = percent)) + 
    geom_histogram(bins = 10)  

ggplot(md_hispanic, aes(x = percent)) + 
    geom_boxplot()

options(scipen = 999) # Disable scientific notation
p<- ggplot(md_hispanic, aes(x = total, y = percent)) + 
    geom_point() 
    
p+ geom_smooth(method = "lm")

p+ scale_x_log10() +
    geom_smooth()

Example 2. Vacant housing

md_vacancies <- get_decennial(
        state = "MD",      
        geography = "county",
        variables = c(total_households = "H1_001N",
                        vacant_households = "H1_003N"),
          
        year = 2020,
        output = "wide") %>%
    mutate(percent_vacant = 100 * (vacant_households / total_households))
ggplot(md_vacancies, aes(x = percent_vacant, y = NAME)) + 
    geom_col()

library(scales)
ggplot(md_vacancies, 
       aes(x = percent_vacant, y = reorder(NAME, percent_vacant))) +
    geom_col() + 
    scale_x_continuous(labels = label_percent(scale = 1)) +
    scale_y_discrete(
        labels = function(y) 
        str_remove(y, " County, Maryland")) +
    labs(x = "Percent vacant households",
         y = "",
         title = "Household vacancies by county in Maryland",
         subtitle = "2020 decennial US Census")

ggplot(md_vacancies, 
       aes(x = percent_vacant, y = reorder(NAME, percent_vacant))) +
    geom_col(fill = "navy", color = "navy", alpha = 0.5) +
    theme_minimal(base_family = "Verdana") +
    scale_x_continuous(labels = label_percent(scale = 1)) + 
    scale_y_discrete(
        labels = function(y) 
        str_remove(y, " County, Maryland")) + 
    labs(x = "Percent vacant households",
         y = "",
         title = "Household vacancies by county in Maryland",
         subtitle = "2020 decennial US Census")

Example 3. how do the distributions of percentage Black population by Census tract vary among the five boroughs of New York City?

nyc_percent_black <- get_decennial(
        geography = "tract",
        variables = "P2_006N",
        summary_var = "P2_001N",
        state = "NY",
        county = c("New York", "Kings",
                   "Queens", "Bronx",
                   "Richmond"),
        year = 2020) %>%
    mutate(percent = 100 * (value / summary_value))

nyc_percent_black2 <- nyc_percent_black %>%
    separate(NAME, into = c("tract", "county", "state"),
           sep = ", ")
ggplot(nyc_percent_black2, 
       aes(x = percent, fill = county)) + 
    geom_density(alpha = 0.3)

ggplot(nyc_percent_black2, 
       aes(x = percent)) +
    geom_density(fill = "darkgreen", color = "darkgreen", alpha = 0.5) + 
    facet_wrap(~county) +
    scale_x_continuous(labels = label_percent(scale = 1)) +
    theme_minimal(base_size = 14) +
    theme(axis.text.y = element_blank()) +
    labs(x = "Percent Black",
         y = "",
         title = "Black population shares by Census tract, 2020")

library(ggridges)
ggplot(nyc_percent_black2, aes(x = percent, y = county)) + 
    geom_density_ridges() +
    theme_ridges() +
    labs(x = "Percent Black, 2020 (by Census tract)", 
         y = "") + 
    scale_x_continuous(labels = label_percent(scale = 1))

Example 3. Geo-faceted plots

us_percent_white <- map_dfr(c(state.abb, "DC"), function(state) {
    get_decennial(
        geography = "tract",
        variables = "P2_005N",
        summary_var = "P2_001N",
        state = state,
        year = 2020) %>%
    mutate(percent = 100 * (value / summary_value)) %>%
    separate(NAME, into = c("tract", "county", "state"),
               sep = ", ")
  })

library(geofacet)
ggplot(us_percent_white, aes(x = percent)) + 
    geom_histogram(fill = "navy", alpha = 0.8, bins = 30) + 
    theme_minimal() + 
    scale_fill_manual(values = c("darkred", "navy")) + 
    facet_geo(~state, grid = "us_state_grid2",
              label = "code", scales = "free_y") + 
    theme(axis.text = element_blank(),
          strip.text.x = element_text(size = 8)) + 
    labs(x = "", 
         y = "", 
         title = "Non-Hispanic white population shares among Census tracts", 
         fill = "", 
         caption = "Data source: 2020 decennial US Census & tidycensus R package\nX-axes range from 0% white (on the left) to 100% white (on the right).  Y-axes are unique to each state.")

2 ACS

2.0 ACS and tidycensus

American Community Survey

  • Annual survey of 3.5 million US households

  • Covers topics not available in decennial US Census data (e.g. income, education, language, housing characteristics)

  • Available as 1-year estimates (for geographies of population 65,000 and greater) and 5-year estimates (for geographies down to the block group) *** 2020 1-year data only available as experimental estimates

  • Data delivered as estimates characterized by margins of error

ACS data in tidycensus

  • The get_acs() function in tidycensus allows you to access ACS data from 2005 through 2020

  • Our focus today: the 2016-2020 ACS estimates, available with the argument year = 2020

  • Other required arguments: geography, for the level of aggregation, and variables, for one or more ACS variables
    ** cbsa: metropolitan statistical area/micropolitan statistical area ** zcta: “zip code tabulation area”

  • get_acs() defaults to the 5-year ACS with the argument survey = "acs5"; survey = "acs1" is used for 1-year ACS data

  • Smaller geographies (e.g. tracts, block groups) are available by state and county using state and county arguments

  • Access with common names - no need for FIPS codes!

  • The ACS Detailed Tables, Data Profile, and Subject Tables are available with get_acs(); tidycensus will auto-detect the dataset from the variable name and fetch from the appropriate location

  • To look up variable IDs from a dataset, use:

*** Detailed Tables: load_variables(2020, “acs5”) *** Data Profile: load_variables(2020, “acs5/profile”) *** Subject Tables: load_variables(2020, “acs5/subject”) *** Comparison Profile: load_variables(2020, “acs5/cprofile”) (GitHub only)

  • New: The geography column tells you the smallest geography at which a variable is available; the Data Profile and Subject tables are available at the tract and up, and the Comparison Profile is available at the county and up
## ----born in mexico-------------------------------------------
born_in_mexico <- get_acs(
  geography = "state", 
  #state = "MD", 
  variables = "B05006_150",
  year = 2020 #<<
)

## ----median-age-----------------------------------------------
median_age <- get_acs(
  geography = "state",
  #state = "MD", 
  variables = "B01002_001",
  year = 2020 #<<
)

median_age_cbsa <- get_acs(
  geography = "cbsa", #<<
  variables = "B01002_001",
  year = 2020
)
aa_median_age <- get_acs(
    geography = "block group",
    #geography = "tract",
    variables = "B01002_001",
    state = "MI",
    county = "Washtenaw",
    year = 2020
)
acs5vars <- load_variables(2020, "acs5")
class(acs5vars)
dim(acs5vars)

acs1vars <- load_variables(2019, "acs1")
class(acs1vars)
dim(acs1vars)

subject <- load_variables(2020, "acs5/subject")
unique(subject$concept)

2.1 Age

### Full table --------------------
age_table <- get_acs(
    geography = "state", 
    table = "B01001"
)

age_table
table(age_table$variable)

### Median age --------------------
medage <- get_acs(
    geography = "state", 
    variable = "B06002_001"
)

medage

### Median age by county --------------------
medage_co <- get_acs(
    state = "MD", 
    geography = "county", 
    variable = "B06002_001"
)

medage_co

### Median age by tract --------------------
medage_tr <- get_acs(
    state = "MD", 
    county = "Anne Arundel", 
    geography = "tract", 
    variable = "B06002_001"
)

medage_tr
colnames(medage_co)<-tolower(colnames(medage_co))

medage_co <- medage_co %>%
    mutate(
        name = gsub(", Maryland", "", name), 
        name = gsub(" County", "", name))
medage_co$name<-factor(medage_co$name, 
                    levels = unique(medage_co$name) 
                    [order(medage_co$estimate, decreasing = FALSE)])

medage_co %>% 
    plot_ly( y = ~name, x = ~estimate, type = "bar"
             ) %>%
    layout(
        title ="Median age in each county, MD", 
        yaxis = list(title = ""),
        xaxis = list(title = "Median age (year)",
                     titlefont=list(size=12)), 
        Showlegend=FALSE
        ) 
colnames(medage_tr)<-tolower(colnames(medage_tr))

medage_tr <- medage_tr %>%
    mutate(
        name = gsub(", Maryland", "", name), 
        name = gsub(" County", "", name))
medage_tr %>% 
    plot_ly( x = ~estimate, type = "box",
             hoverinfo = 'text',
             text = ~paste(
                      '</br>', name,          
                      '</br> Median age (year): ', estimate)
             ) %>%
    layout(
        title ="Median age across tracts, Anne Arundel County, MD", 
        yaxis = list(title = ""),
        xaxis = list(title = "Median age (year)",
                     titlefont=list(size=12)), 
        legend = list(font=list(size=12))
        ) 
medage_tr %>% 
    plot_ly( x = ~estimate, type = "histogram",
             hoverinfo = 'text',
             text = ~paste(
                      '</br>', name,          
                      '</br> Median age (year): ', estimate)
             ) %>%
    layout(
        title ="Median age across tracts, Anne Arundel County, MD", 
        yaxis = list(title = "Number of tracts"),
        xaxis = list(title = "Median age (year)",
                     titlefont=list(size=12)), 
        legend = list(font=list(size=12)), 
        bargap =  0.1
        ) 

2.2 Race

### Full table --------------------
racevars <- c(
    NH_White = "B03002_003", #NH White 
    NH_Black = "B03002_004",
    NH_AIAN = "B03002_005",
    NH_Asian = "B03002_006",
    NH_NHPI = "B03002_007",
    NH_Other = "B03002_008",
    NH_Multiple = "B03002_009",
    Hispanic = "B03002_012"
)

### Race by tract --------------------
race_co <- get_acs(
    state = "MD",
    geography = "county",
    variables = racevars,
    summary_var = "B03002_001", #Total population 
    year=2019
)

race_co

### Race by tract --------------------
race_tr <- get_acs(
    state = "MD",
    county = "Anne Arundel", 
    geography = "tract",
    variables = racevars,
    summary_var = "B03002_001", #Total population 
    year=2019
)

race_tr
colnames(race_co)<-tolower(colnames(race_co))

dtarace_co <- race_co %>%
    mutate(percent = round( 100 * (estimate / summary_est), 2) ) %>%
    select(name, variable, percent)%>%
    rename(race=variable)%>%
    mutate(
        name = gsub(", Maryland", "", name), 
        name = gsub(" County", "", name))
dtarace_co %>% 
    plot_ly( x = ~name, y = ~percent, type = "bar",
             color= ~race, 
             colors = brewer.pal(length(unique(dtarace_co$race)),
                                "Set3")
             ) %>%
    layout(
        title ="Percent distribution by race in each county, MD", 
        xaxis = list(title = ""),
        yaxis = list(title = "Percent",
                     titlefont=list(size=12)), 
        legend = list(font=list(size=12)),
        barmode = 'stack'
        ) 
dtarace_co <- dtarace_co %>%
    mutate(race = ifelse(race=="NH_AIAN", "NH_AIAN/NHPI/Other", 
                    ifelse(race=="NH_NHPI", "NH_AIAN/NHPI/Other", 
                    ifelse(race=="NH_Other", "NH_AIAN/NHPI/Other", race))))%>%
    group_by(name, race)%>%
    summarize(percent = sum(percent))
dtarace_co %>% 
    plot_ly( y = ~name, x = ~percent, type = "bar",
             hoverinfo = 'text',
             text = ~paste(
                      '</br>', name,          
                      '</br> Race: ', race,
                      '</br> Value(%): ', percent), 
             color= ~race, 
             colors = brewer.pal(length(unique(dtarace_co$race)),
                                "Set3")
             ) %>%
    layout(
        title ="Percent distribution by race in each county, MD", 
        yaxis = list(title = ""),
        xaxis = list(title = "Percent",
                     titlefont=list(size=12)), 
        legend = list(font=list(size=12)), 
        barmode = 'stack'
        ) 
colnames(race_tr)<-tolower(colnames(race_tr))

dtarace_tr <- race_tr %>%
    mutate(percent = round( 100 * (estimate / summary_est), 2) ) %>%
    select(name, variable, percent)%>%
    rename(race=variable)%>%
    mutate(
        name = gsub(", Maryland", "", name), 
        name = gsub(" County", "", name))%>%
    
    mutate(race = ifelse(race=="NH_AIAN", "NH_AIAN/NHPI/Other", 
                    ifelse(race=="NH_NHPI", "NH_AIAN/NHPI/Other", 
                    ifelse(race=="NH_Other", "NH_AIAN/NHPI/Other", race))))%>%
    group_by(name, race)%>%
    summarize(percent = sum(percent))
dtarace_tr %>% 
    plot_ly( x = ~percent, type = "box",
             hoverinfo = 'text',
             text = ~paste(
                      '</br>', name,          
                      '</br> Race: ', race,
                      '</br> Value(%): ', percent), 
             color= ~race, 
             colors = brewer.pal(length(unique(dtarace_tr$race)),
                                "Set3")
             ) %>%
    layout(
        title ="Distribution of percent pop by race across tracts, Anne Arundel County, MD", 
        yaxis = list(title = ""),
        xaxis = list(title = "Percent",
                     titlefont=list(size=12)), 
        showlegend = FALSE
        ) 

2.3 Income

# comparing 5 vs. 1 year data 

### 5 year data --------------------
income_15to19 <- get_acs(
    geography = "state",
    variables = "B19013_001"
)

income_15to19

### 1 year data --------------------
income_19 <- get_acs(
    geography = "state",
    variables = "B19013_001",
    survey = "acs1"
)

income_19

### Median HH income 5 year data by county --------------------
income_co <- get_acs(
    state = "MD",
    geography = "county",
    variables = "B19013_001" #Median household income
)

income_co

### Median HH income 5 year data by tract --------------------
income_tr <- get_acs(
    state = "MD",
    county = "Anne Arundel",
    geography = "tract",
    variables = "B19013_001"
)

income_tr
colnames(income_co)<-tolower(colnames(income_co))

income_co <- income_co %>%
    mutate(
        name = gsub(", Maryland", "", name), 
        name = gsub(" County", "", name))
income_co$name<-factor(income_co$name, 
                    levels = unique(income_co$name) 
                    [order(income_co$estimate, decreasing = FALSE)])

income_co %>% 
    plot_ly( y = ~name, x = ~estimate, type = "bar"
             ) %>%
    layout(
        title ="Median household income in each county, MD", 
        yaxis = list(title = ""),
        xaxis = list(title = "USD",
                     titlefont=list(size=12)), 
        Showlegend=FALSE
        ) 
colnames(income_tr)<-tolower(colnames(income_tr))

income_tr <- income_tr %>%
    mutate(
        name = gsub(", Maryland", "", name), 
        name = gsub(" County", "", name))
income_tr %>% 
    plot_ly( x = ~estimate, type = "box",
             hoverinfo = 'text',
             text = ~paste(
                      '</br>', name,          
                      '</br> Income (USD): ', estimate)
             ) %>%
    layout(
        title ="Median household income across tracts, Anne Arundel County, MD", 
        yaxis = list(title = ""),
        xaxis = list(title = "Median household income (USD)",
                     titlefont=list(size=12)), 
        legend = list(font=list(size=12))
        ) 
income_tr<-income_tr%>%mutate(county = "Anne Arundel")

#geom_quasirandom problem...???
p<-ggplot(income_tr, aes(x = county, y = estimate, color = estimate)) +
    geom_quasirandom(alpha = 0.5) +
    coord_flip() +
    theme_minimal() +
    scale_color_viridis_c(guide = FALSE) +
    scale_y_continuous(labels = scales::dollar) +
    labs(x = " ",
         y = "Median household income",
         title = "Household income distribution across tracts, Anne Arundel County, MD",
         subtitle = "Census tracts, New York City",
         caption = "Data source: 2015-2019 ACS")

#p

ggplotly(p)
income_tr<-income_tr%>%mutate(county = "Anne Arundel")

income_tr %>% 
    plot_ly( x = ~estimate, type = "histogram",
             hoverinfo = 'text',
             text = ~paste(
                      '</br>', name,          
                      '</br> Income (USD): ', estimate)
             ) %>%
    layout(
        title ="Median household income across tracts, Anne Arundel County, MD", 
        yaxis = list(title = "Number of tracts"),
        xaxis = list(title = "Median household income (USD)",
                     titlefont=list(size=12)), 
        legend = list(font=list(size=12)), 
        bargap = 0.1
        ) 

2.3.1 Bonus: Income by race

md_race_income <- get_acs(
      geography = "tract",
      state = "MD",
      county = c("Anne Arundel"),
      variables = c(White = "B03002_003",
                    Black = "B03002_004",
                    Asian = "B03002_006",
                    Hispanic = "B03002_012"),
      summary_var = "B19013_001"
    ) %>%
    #group_by(GEOID) %>%
    #filter(estimate == max(estimate, na.rm = TRUE)) %>%
    #ungroup() %>%
    filter(estimate != 0)

p<-ggplot(md_race_income, aes(x = variable, y = estimate, color = estimate)) +
    geom_quasirandom(alpha = 0.5) +
    coord_flip() +
    theme_minimal() +
    #scale_color_viridis_c(guide = FALSE) +
    scale_color_viridis_c(guide = "none") +
    scale_y_continuous(labels = scales::dollar) +
    labs(x = " ",
         y = "Median household income",
         title = "Household income distribution by largest racial/ethnic group",
         subtitle = "Census tracts, Anne Arundel County",
         caption = "Data source: 2015-2019 ACS")

p

ggplotly(p)
md_race_income <- get_acs(
        geography = "county",
        state = "MD",
        variables = c(White = "B03002_003",
                      Black = "B03002_004",
                      Asian = "B03002_006",
                      Hispanic = "B03002_012"),
        summary_var = "B19013_001"
    ) %>%
    #group_by(GEOID) %>%
    #filter(estimate == max(estimate, na.rm = TRUE)) %>%
    #ungroup() %>%
    filter(estimate != 0)

p<-ggplot(md_race_income, aes(x = variable, y = estimate, color = estimate)) +
    geom_quasirandom(alpha = 0.5) +
    coord_flip() +
    theme_minimal() +
    #scale_color_viridis_c(guide = FALSE) +
    scale_color_viridis_c(guide = "none") +
    scale_y_continuous(labels = scales::dollar) +
    labs(x = " ",
         y = "Median household income",
         title = "Household income distribution by largest racial/ethnic group",
         subtitle = "Counties, Maryland",
         caption = "Data source: 2015-2019 ACS")

p

ggplotly(p)

2.3.2 Income map

aa_income <- get_acs(
    geography = "tract",
    variables = "B19013_001",
    state = "MD",
    county = "Anne Arundel",
    geometry = TRUE
)
ggplot(aa_income, aes(fill = estimate)) + 
    geom_sf()

ggplot(aa_income, aes(fill = estimate)) + 
    geom_sf() + 
    scale_fill_viridis_c(option = "mako")

aa_income_erase <- get_acs(
        geography = "tract",
        variables = "B19013_001",
        state = "MD",
        county = "Anne Arundel",
        geometry = TRUE,
        cb = FALSE
    ) %>%
    st_transform(26982) %>%
    erase_water(area_threshold = 0.99)
ggplot(aa_income_erase, aes(fill = estimate)) + 
    geom_sf() + 
    scale_fill_viridis_c(option = "mako")

final_map <- ggplot(orleans_erase, aes(fill = estimate)) + 
    annotation_map_tile(type = "cartolight", zoom = 11) +
    theme_void(base_size = 20) + 
    geom_sf(alpha = 0.6, lwd = 0.1) + 
    scale_fill_viridis_c(option = "mako", labels = label_dollar()) + 
    labs(title = "Median household income, Anne Arundel, MD",
         subtitle = "2016-2020 ACS estimates",
         caption = "Tile source: CARTO / OpenStreetMap contributors",
         fill = "ACS estimate  ")

2.4 Home value

median_home_value <- get_acs(
        state = "MD",
        geography = "tract",
        variables = "B25077_001",
        year = 2020
    )

median_home_value_county <- get_acs(
        state = "MD",
        geography = "county",
        variables = "B25077_001",
        #year = 2020, not yet available? 
        year = 2019, 
        survey = "acs1"
    )%>%
    mutate(NAME = str_remove(NAME, 
                           " County, Maryland"))

Practice question: which tracts are in the top/bottom 10 percent in the state for median home values, and how does this break down by county?

top10percent <- median_home_value %>%
    mutate(percentile = percent_rank(estimate)) %>%
    filter(percentile >= 0.9)%>%
    separate(
        NAME,
        into = c("tract", "county", "state"),
        sep = ", "
    )

summary_top10 <- top10percent %>%
    group_by(county) %>%
    summarize(n = n()) %>%
    arrange(desc(n))

summary_top10
bottom10percent <- median_home_value %>%
    mutate(percentile = percent_rank(estimate)) %>%
    filter(percentile < 0.1)%>%
    separate(
        NAME,
        into = c("tract", "county"),
        sep = ", "
    )

summary_bottom10 <- bottom10percent %>%
    group_by(county) %>%
    summarize(n = n()) %>%
    arrange(desc(n))

summary_bottom10
#library(tidyverse)
#library(scales)
ggplot(median_home_value, 
       aes(x = estimate)) + 
    geom_histogram(bins = 30) + 
    scale_x_continuous(labels = label_dollar())

ggplot(median_home_value_county, 
       aes(x = estimate, 
           y = reorder(NAME, estimate))) + 
    geom_col() + 
    labs(title = "Median home value, 2016-2020 ACS", 
         subtitle = "Counties in Maryland", 
         x = "ACS estimate", 
         y = "") + 
    theme_minimal(base_size = 18) + 
    scale_x_continuous(labels = dollar_format(scale = 0.001, 
                                              suffix = "K"))

median_home_value_county %>%
  arrange(desc(estimate)) %>%
  head(3)
ggplot(median_home_value_county, 
       aes(x = estimate, 
           y = reorder(NAME, estimate))) + 
    geom_errorbar(aes(xmin = estimate - moe, 
                      xmax = estimate + moe),
                  width = 0.5, size = 0.5) +
    geom_point(color = "navy", size = 1) +
    labs(title = "Median home value, 2016-2020 ACS", 
         subtitle = "Counties in Maryland", 
         x = "ACS estimate", 
         y = "",
         caption = "Error bars reflect the margin of error around the ACS estimate") +
    theme_minimal(base_size = 18) + 
    scale_x_continuous(labels = dollar_format(scale = 0.001, 
                                            suffix = "K"))

2.5 Population pyramid

cohort_names <- c("0-4", "5-9", "10-14", "15-19",
                  "20-24", "25-29", "30-34", "35-39",
                  "40-44", "45-49", "50-54", "55-59",
                  "60-64", "65-69", "70-74", "75-79",
                  "80-84", "85+")
male_vars <- 2:19 %>%
    str_pad(2, "left", "0") %>%
    paste0("S0101_C03_0", .) %>%
    set_names(cohort_names)

female_vars <- 2:19 %>%
    str_pad(2, "left", "0") %>%
    paste0("S0101_C05_0", .) %>%
    set_names(cohort_names)

male_data <- get_acs(
        geography = "county",
        variables = male_vars,
        state = "MI",
        county = "Washtenaw",
        year = 2020
    ) %>%
    mutate(sex = "Male",
           estimate = estimate * -1)

female_data <- get_acs(
        geography = "county",
        variables = female_vars,
        state = "MI",
        county = "Washtenaw",
        year = 2020
    ) %>%
    mutate(sex = "Female")

pyramid_data <- bind_rows(male_data, female_data) %>%
    mutate(variable = factor(variable, levels = cohort_names))
ggplot(pyramid_data, 
                     aes(x = estimate, y = variable, 
                         fill = sex)) + 
    geom_col(width = 0.95, alpha = 0.75) + 
    theme_minimal(base_size = 18) + 
    scale_x_continuous(labels = function(x) paste0(abs(x / 1000), "k")) + 
    scale_fill_manual(values = c("#00274C", "#FFCB05")) + 
    labs(x = "", 
         y = "ACS estimate", 
         title = "Population structure in Anne Arundel County, Michigan", 
         fill = "", 
         caption = "Data source: 2016-2020 ACS & tidycensus R package")

2.6 Basic Profile

commonvars = c(
                totalpop_ = "B03002_001", 
                median_income_ = "B19013_001",
                median_age_ = "B01002_001", 
                pct_college_ = "DP02_0068P" #% of pop 25+ with a bachelor's degree 
                )

### Basic variables from 5 year data by county --------------------

basic_co_1519 <- get_acs(
    state = "MD",
    geography = "county",
    variables = commonvars, 
    output = "wide",
    year = 2019
)

basic_co_1014 <- get_acs(
    state = "MD",
    geography = "county",
    variables = commonvars, 
    output = "wide", 
    year = 2014
)
temp1<-basic_co_1519%>%mutate(period="2015-2019")
temp2<-basic_co_1014%>%mutate(period="2010-2014")

dtabasic<-rbind(temp1, temp2)
colnames(dtabasic)<-tolower(colnames(dtabasic))
str(dtabasic)

dtabasic<-dtabasic%>%
    mutate(
        name = gsub(", Maryland", "", name), 
        name = gsub(" County", "", name))    

str(dtabasic)