tidycensus practice using Anne Arundel county, MD, data

https://walker-data.com/umich-workshop-2022/intro-2020-census/

1 Decennial census

2020 Census data: what we have now * The currently available 2020 Census data come from the PL94-171 Redistricting Summary File, which is used for congressional apportionment & redistricting

Variables available include total counts (population & households), occupied / vacant housing units, total and voting-age population breakdowns by race & ethnicity, and group quarters status
Later in 2022 (expected): the Demographic and Housing Characteristics summary files, which will include other variables typically included in the decennial Census data (age & sex breakdowns, detailed race & ethnicity)

1.1 Population by state

pop20 <- get_decennial(
    geography = "state",
    variables = "P1_001N",
    year = 2020 #<<<< Deccenial census year
)

#pop20
dim(pop20)
str(pop20)

Understanding the printed messages * The Census Bureau is using differential privacy in an attempt to preserve respondent confidentiality in the 2020 Census data, which is required under US Code Title 13

Intentional errors are introduced into data, impacting the accuracy of small area counts (e.g. some blocks with children, but no adults)
Advocates argue that differential privacy is necessary to satisfy Title 13 requirements given modern database reconstruction technologies; critics contend that the method makes data less useful with no tangible privacy benefit
https://www.census.gov/library/fact-sheets/2021/protecting-the-confidentiality-of-the-2020-census-redistricting-data.html

table_p2 <- get_decennial(
    geography = "state", 
    table = "P2", #<<<< Table number
    year = 2020
)

dim(table_p2)
str(table_p2)

1.1.1 Understanding geography and variables in tidycensus

https://walker-data.com/tidycensus/articles/basic-usage.html#geography-in-tidycensus-1

1.1.2 Searching for variables

vars <- load_variables(2020, "pl")
dim(vars)
#View(vars)

1.1.3 Tables in the 2020 Census PL file

H1: Occupancy status (by household)
P1: Race
P2: Race by Hispanic origin
P3: Race for the population 18+
P4: Race by Hispanic origin for the population 18+
P5: Group quarters status

1.1.4 Data structure in tidycensus

tidy = long form in tidycensus, since it’s modeled after tidyverse

group_quarters <- get_decennial(
  geography = "state", 
  table = "P5", 
  year = 2020
)

dim(group_quarters)
table(group_quarters$variable)

group_quarters_wide <- get_decennial(
  geography = "state", 
  table = "P5",
  year = 2020,
  output = "wide"
)

dim(group_quarters_wide)
table(group_quarters_wide$variable)

1.1.4 Using named vectors of variables

Census variables can be hard to remember; using a named vector to request variables will replace the Census IDs with a custom input
In long form, these custom inputs will populate the variable column; in wide form, they will replace the column names

vacancies_wide <- get_decennial(
    geography = "county",
    state = "MD",
    variables = c(vacant_households = "H1_003N",
                  total_households = "H1_001N"),
    output = "wide",
    year = 2020
)

1.2 Population by county

For geographies available below the state level, the state parameter allows you to query data for a specific state
tidycensus translates state names and postal abbreviations internally, so you don’t need to remember the FIPS codes!

pop_co <- get_decennial(
    state = "MD", 
    geography = "county", 
    variables = "P1_001N", #<<<<< total pop
    #variables = "P2_002N", #<<<<< hispanic pop
    year = 2020
)

pop_co_table <- get_decennial(
    state = "MD", 
    geography = "county", 
    table = "P2", 
    year = 2020
)

dim(pop_co)
str(pop_co)
dim(pop_co_table)
str(pop_co_table)
table(pop_co_table$variable)

1.3 Population by tract

County names are also translated internally by tidycensus for sub-county queries, e.g. for Census tracts, block groups, and blocks

pop_tr <- get_decennial(
    state = "MD", 
    county = "Anne Arundel", 
    geography = "tract", 
    variables = "P1_001N",
    year = 2020
)

pop_tr_table <- get_decennial(
    state = "MD", 
    county = "Anne Arundel", 
    geography = "tract", 
    table = "P2", 
    year = 2020
)

medage_tr <- get_decennial(
    state = "MD", 
    county = "Anne Arundel", 
    geography = "tract", 
    #variables = "P1_001N", WHAT IS THE MEDIAN AGE?? 
    year = 2020
)

dim(pop_tr)
dim(pop_tr_table)

poptr

1.4 More practices to wrangle data in tidycensus

Example 1 what are the largest and smallest counties in Maryland by population?

md_population <- get_decennial(
    geography = "county",
    variables = "P1_001N",
    year = 2020,
    state = "MD"
)

arrange(md_population, value)
arrange(md_population, desc(value))

Example 2: What are the counties with a population below 30,000?

below30000 <- filter(md_population, value < 30000)
below30000

Example 3: What are % of population by race in each county?

Many decennial Census and ACS variables are organized in tables in which the first variable represents a summary variable, or denominator for the others
The parameter summary_var can be used to generate a new column in long-form data for a requested denominator, which works well for normalizing estimates

race_vars <- c(
    Hispanic = "P2_002N",
    White = "P2_005N",
    Black = "P2_006N",
    Native = "P2_007N",
    Asian = "P2_008N",
    HIPI = "P2_009N"
)

md_race <- get_decennial(
    geography = "county",
    state = "MD",
    variables = race_vars,
    summary_var = "P2_001N",
    year = 2020
)

md_race

md_race_percent <- md_race %>%
    mutate(percent = round(100 * (value / summary_value), 1)) %>%
    select(NAME, variable, percent)

arrange(md_race_percent, NAME)

Example 4: What is the largest group by county? What are the median percentages by race group?

largest_group <- md_race_percent %>%
  group_by(NAME) %>%
  filter(percent == max(percent))

largest_group

md_race_percent %>%
    group_by(variable) %>%
    summarize(
        median_pct = median(percent), 
        minimum_pct = min(percent), 
        maximum_pct = max(percent)
        )

1.5 Trend since 2010 Census?

tidycensus allows users to access the 2000 and 2010 decennial Census data for comparison, though variable IDs will differ

county_pop_10 <- get_decennial(
        state = "MD",
        geography = "county",
        variables = "P001001",
        year = 2010
)

    dim(county_pop_10)
    colnames(county_pop_10)

county_pop_10 <- county_pop_10 %>%
    select(GEOID, value10 = value)

    dim(county_pop_10)
    colnames(county_pop_10)

county_pop_20 <- get_decennial(
        state = "MD",
        geography = "county",
        variables = "P1_001N", #<<<<< Notice the variable name change!! 
        year = 2020)%>%
    select(GEOID, NAME, value20 = value)

county_joined <- county_pop_20 %>%
    left_join(county_pop_10, by = "GEOID")%>%
    mutate(
        total_change = value20 - value10,
        percent_change = round(100 * (total_change / value10), 1)
    )

head(county_joined, 10)

Caveat: changing geographies! * County names and boundaries can change from year to year, introducing potential problems in time-series analysis

This is particularly acute for small geographies like Census tracts & block groups, which we’ll cover on March 25!

county_pop_10 <- get_decennial(
        geography = "county",
        variables = "P001001",
        year = 2010)%>%
    select(GEOID, value10 = value)

county_pop_20 <- get_decennial(
        geography = "county",
        variables = "P1_001N", #<<<<< Notice the variable name change!! 
        year = 2020)%>%
    select(GEOID, NAME, value20 = value)

county_joined <- county_pop_20 %>%
    left_join(county_pop_10, by = "GEOID")%>%
    mutate(
        total_change = value20 - value10,
        percent_change = round(100 * (total_change / value10), 1)
    )

filter(county_joined, is.na(value10)) #<<<<< counties that did not exist in 2010??

1.6 Data viz with 2020 Census data

ggplot aesthetic specifications: https://ggplot2.tidyverse.org/articles/ggplot2-specs.html

Example 1. Hispanic population

md_hispanic <- get_decennial(
        geography = "county", 
        variables = c(total = "P2_001N",
                      hispanic = "P2_002N"), 
        state = "MD",
        year = 2020,
        output = "wide") %>%
    mutate(percent = 100 * (hispanic / total))

ggplot(md_hispanic, aes(x = percent)) + 
    geom_histogram(bins = 10)

ggplot(md_hispanic, aes(x = percent)) + 
    geom_boxplot()

options(scipen = 999) # Disable scientific notation
p<- ggplot(md_hispanic, aes(x = total, y = percent)) + 
    geom_point() 
    
p+ geom_smooth(method = "lm")

p+ scale_x_log10() +
    geom_smooth()

Example 2. Vacant housing

md_vacancies <- get_decennial(
        state = "MD",      
        geography = "county",
        variables = c(total_households = "H1_001N",
                        vacant_households = "H1_003N"),
          
        year = 2020,
        output = "wide") %>%
    mutate(percent_vacant = 100 * (vacant_households / total_households))

ggplot(md_vacancies, aes(x = percent_vacant, y = NAME)) + 
    geom_col()

library(scales)
ggplot(md_vacancies, 
       aes(x = percent_vacant, y = reorder(NAME, percent_vacant))) +
    geom_col() + 
    scale_x_continuous(labels = label_percent(scale = 1)) +
    scale_y_discrete(
        labels = function(y) 
        str_remove(y, " County, Maryland")) +
    labs(x = "Percent vacant households",
         y = "",
         title = "Household vacancies by county in Maryland",
         subtitle = "2020 decennial US Census")

ggplot(md_vacancies, 
       aes(x = percent_vacant, y = reorder(NAME, percent_vacant))) +
    geom_col(fill = "navy", color = "navy", alpha = 0.5) +
    theme_minimal(base_family = "Verdana") +
    scale_x_continuous(labels = label_percent(scale = 1)) + 
    scale_y_discrete(
        labels = function(y) 
        str_remove(y, " County, Maryland")) + 
    labs(x = "Percent vacant households",
         y = "",
         title = "Household vacancies by county in Maryland",
         subtitle = "2020 decennial US Census")

Example 3. how do the distributions of percentage Black population by Census tract vary among the five boroughs of New York City?

nyc_percent_black <- get_decennial(
        geography = "tract",
        variables = "P2_006N",
        summary_var = "P2_001N",
        state = "NY",
        county = c("New York", "Kings",
                   "Queens", "Bronx",
                   "Richmond"),
        year = 2020) %>%
    mutate(percent = 100 * (value / summary_value))

nyc_percent_black2 <- nyc_percent_black %>%
    separate(NAME, into = c("tract", "county", "state"),
           sep = ", ")

ggplot(nyc_percent_black2, 
       aes(x = percent, fill = county)) + 
    geom_density(alpha = 0.3)

ggplot(nyc_percent_black2, 
       aes(x = percent)) +
    geom_density(fill = "darkgreen", color = "darkgreen", alpha = 0.5) + 
    facet_wrap(~county) +
    scale_x_continuous(labels = label_percent(scale = 1)) +
    theme_minimal(base_size = 14) +
    theme(axis.text.y = element_blank()) +
    labs(x = "Percent Black",
         y = "",
         title = "Black population shares by Census tract, 2020")

library(ggridges)
ggplot(nyc_percent_black2, aes(x = percent, y = county)) + 
    geom_density_ridges() +
    theme_ridges() +
    labs(x = "Percent Black, 2020 (by Census tract)", 
         y = "") + 
    scale_x_continuous(labels = label_percent(scale = 1))

Example 3. Geo-faceted plots

us_percent_white <- map_dfr(c(state.abb, "DC"), function(state) {
    get_decennial(
        geography = "tract",
        variables = "P2_005N",
        summary_var = "P2_001N",
        state = state,
        year = 2020) %>%
    mutate(percent = 100 * (value / summary_value)) %>%
    separate(NAME, into = c("tract", "county", "state"),
               sep = ", ")
  })

library(geofacet)
ggplot(us_percent_white, aes(x = percent)) + 
    geom_histogram(fill = "navy", alpha = 0.8, bins = 30) + 
    theme_minimal() + 
    scale_fill_manual(values = c("darkred", "navy")) + 
    facet_geo(~state, grid = "us_state_grid2",
              label = "code", scales = "free_y") + 
    theme(axis.text = element_blank(),
          strip.text.x = element_text(size = 8)) + 
    labs(x = "", 
         y = "", 
         title = "Non-Hispanic white population shares among Census tracts", 
         fill = "", 
         caption = "Data source: 2020 decennial US Census & tidycensus R package\nX-axes range from 0% white (on the left) to 100% white (on the right).  Y-axes are unique to each state.")

2 ACS

2.0 ACS and tidycensus

American Community Survey

Annual survey of 3.5 million US households
Covers topics not available in decennial US Census data (e.g. income, education, language, housing characteristics)
Available as 1-year estimates (for geographies of population 65,000 and greater) and 5-year estimates (for geographies down to the block group) *** 2020 1-year data only available as experimental estimates
Data delivered as estimates characterized by margins of error

ACS data in tidycensus

The get_acs() function in tidycensus allows you to access ACS data from 2005 through 2020
Our focus today: the 2016-2020 ACS estimates, available with the argument year = 2020
Other required arguments: geography, for the level of aggregation, and variables, for one or more ACS variables
** cbsa: metropolitan statistical area/micropolitan statistical area ** zcta: “zip code tabulation area”
get_acs() defaults to the 5-year ACS with the argument survey = "acs5"; survey = "acs1" is used for 1-year ACS data
Smaller geographies (e.g. tracts, block groups) are available by state and county using state and county arguments
Access with common names - no need for FIPS codes!

The ACS Detailed Tables, Data Profile, and Subject Tables are available with get_acs(); tidycensus will auto-detect the dataset from the variable name and fetch from the appropriate location
To look up variable IDs from a dataset, use:

*** Detailed Tables: load_variables(2020, “acs5”) *** Data Profile: load_variables(2020, “acs5/profile”) *** Subject Tables: load_variables(2020, “acs5/subject”) *** Comparison Profile: load_variables(2020, “acs5/cprofile”) (GitHub only)

New: The geography column tells you the smallest geography at which a variable is available; the Data Profile and Subject tables are available at the tract and up, and the Comparison Profile is available at the county and up

## ----born in mexico-------------------------------------------
born_in_mexico <- get_acs(
  geography = "state", 
  #state = "MD", 
  variables = "B05006_150",
  year = 2020 #<<
)

## ----median-age-----------------------------------------------
median_age <- get_acs(
  geography = "state",
  #state = "MD", 
  variables = "B01002_001",
  year = 2020 #<<
)

median_age_cbsa <- get_acs(
  geography = "cbsa", #<<
  variables = "B01002_001",
  year = 2020
)

aa_median_age <- get_acs(
    geography = "block group",
    #geography = "tract",
    variables = "B01002_001",
    state = "MI",
    county = "Washtenaw",
    year = 2020
)

acs5vars <- load_variables(2020, "acs5")
class(acs5vars)
dim(acs5vars)

acs1vars <- load_variables(2019, "acs1")
class(acs1vars)
dim(acs1vars)

subject <- load_variables(2020, "acs5/subject")
unique(subject$concept)

2.1 Age

### Full table --------------------
age_table <- get_acs(
    geography = "state", 
    table = "B01001"
)

age_table
table(age_table$variable)

### Median age --------------------
medage <- get_acs(
    geography = "state", 
    variable = "B06002_001"
)

medage

### Median age by county --------------------
medage_co <- get_acs(
    state = "MD", 
    geography = "county", 
    variable = "B06002_001"
)

medage_co

### Median age by tract --------------------
medage_tr <- get_acs(
    state = "MD", 
    county = "Anne Arundel", 
    geography = "tract", 
    variable = "B06002_001"
)

medage_tr

colnames(medage_co)<-tolower(colnames(medage_co))

medage_co <- medage_co %>%
    mutate(
        name = gsub(", Maryland", "", name), 
        name = gsub(" County", "", name))

medage_co$name<-factor(medage_co$name, 
                    levels = unique(medage_co$name) 
                    [order(medage_co$estimate, decreasing = FALSE)])

medage_co %>% 
    plot_ly( y = ~name, x = ~estimate, type = "bar"
             ) %>%
    layout(
        title ="Median age in each county, MD", 
        yaxis = list(title = ""),
        xaxis = list(title = "Median age (year)",
                     titlefont=list(size=12)), 
        Showlegend=FALSE
        )

colnames(medage_tr)<-tolower(colnames(medage_tr))

medage_tr <- medage_tr %>%
    mutate(
        name = gsub(", Maryland", "", name), 
        name = gsub(" County", "", name))

medage_tr %>% 
    plot_ly( x = ~estimate, type = "box",
             hoverinfo = 'text',
             text = ~paste(
                      '</br>', name,          
                      '</br> Median age (year): ', estimate)
             ) %>%
    layout(
        title ="Median age across tracts, Anne Arundel County, MD", 
        yaxis = list(title = ""),
        xaxis = list(title = "Median age (year)",
                     titlefont=list(size=12)), 
        legend = list(font=list(size=12))
        )

medage_tr %>% 
    plot_ly( x = ~estimate, type = "histogram",
             hoverinfo = 'text',
             text = ~paste(
                      '</br>', name,          
                      '</br> Median age (year): ', estimate)
             ) %>%
    layout(
        title ="Median age across tracts, Anne Arundel County, MD", 
        yaxis = list(title = "Number of tracts"),
        xaxis = list(title = "Median age (year)",
                     titlefont=list(size=12)), 
        legend = list(font=list(size=12)), 
        bargap =  0.1
        )

2.2 Race

### Full table --------------------
racevars <- c(
    NH_White = "B03002_003", #NH White 
    NH_Black = "B03002_004",
    NH_AIAN = "B03002_005",
    NH_Asian = "B03002_006",
    NH_NHPI = "B03002_007",
    NH_Other = "B03002_008",
    NH_Multiple = "B03002_009",
    Hispanic = "B03002_012"
)

### Race by tract --------------------
race_co <- get_acs(
    state = "MD",
    geography = "county",
    variables = racevars,
    summary_var = "B03002_001", #Total population 
    year=2019
)

race_co

### Race by tract --------------------
race_tr <- get_acs(
    state = "MD",
    county = "Anne Arundel", 
    geography = "tract",
    variables = racevars,
    summary_var = "B03002_001", #Total population 
    year=2019
)

race_tr

colnames(race_co)<-tolower(colnames(race_co))

dtarace_co <- race_co %>%
    mutate(percent = round( 100 * (estimate / summary_est), 2) ) %>%
    select(name, variable, percent)%>%
    rename(race=variable)%>%
    mutate(
        name = gsub(", Maryland", "", name), 
        name = gsub(" County", "", name))

dtarace_co %>% 
    plot_ly( x = ~name, y = ~percent, type = "bar",
             color= ~race, 
             colors = brewer.pal(length(unique(dtarace_co$race)),
                                "Set3")
             ) %>%
    layout(
        title ="Percent distribution by race in each county, MD", 
        xaxis = list(title = ""),
        yaxis = list(title = "Percent",
                     titlefont=list(size=12)), 
        legend = list(font=list(size=12)),
        barmode = 'stack'
        )

dtarace_co <- dtarace_co %>%
    mutate(race = ifelse(race=="NH_AIAN", "NH_AIAN/NHPI/Other", 
                    ifelse(race=="NH_NHPI", "NH_AIAN/NHPI/Other", 
                    ifelse(race=="NH_Other", "NH_AIAN/NHPI/Other", race))))%>%
    group_by(name, race)%>%
    summarize(percent = sum(percent))

dtarace_co %>% 
    plot_ly( y = ~name, x = ~percent, type = "bar",
             hoverinfo = 'text',
             text = ~paste(
                      '</br>', name,          
                      '</br> Race: ', race,
                      '</br> Value(%): ', percent), 
             color= ~race, 
             colors = brewer.pal(length(unique(dtarace_co$race)),
                                "Set3")
             ) %>%
    layout(
        title ="Percent distribution by race in each county, MD", 
        yaxis = list(title = ""),
        xaxis = list(title = "Percent",
                     titlefont=list(size=12)), 
        legend = list(font=list(size=12)), 
        barmode = 'stack'
        )

colnames(race_tr)<-tolower(colnames(race_tr))

dtarace_tr <- race_tr %>%
    mutate(percent = round( 100 * (estimate / summary_est), 2) ) %>%
    select(name, variable, percent)%>%
    rename(race=variable)%>%
    mutate(
        name = gsub(", Maryland", "", name), 
        name = gsub(" County", "", name))%>%
    
    mutate(race = ifelse(race=="NH_AIAN", "NH_AIAN/NHPI/Other", 
                    ifelse(race=="NH_NHPI", "NH_AIAN/NHPI/Other", 
                    ifelse(race=="NH_Other", "NH_AIAN/NHPI/Other", race))))%>%
    group_by(name, race)%>%
    summarize(percent = sum(percent))

dtarace_tr %>% 
    plot_ly( x = ~percent, type = "box",
             hoverinfo = 'text',
             text = ~paste(
                      '</br>', name,          
                      '</br> Race: ', race,
                      '</br> Value(%): ', percent), 
             color= ~race, 
             colors = brewer.pal(length(unique(dtarace_tr$race)),
                                "Set3")
             ) %>%
    layout(
        title ="Distribution of percent pop by race across tracts, Anne Arundel County, MD", 
        yaxis = list(title = ""),
        xaxis = list(title = "Percent",
                     titlefont=list(size=12)), 
        showlegend = FALSE
        )

2.3 Income

# comparing 5 vs. 1 year data 

### 5 year data --------------------
income_15to19 <- get_acs(
    geography = "state",
    variables = "B19013_001"
)

income_15to19

### 1 year data --------------------
income_19 <- get_acs(
    geography = "state",
    variables = "B19013_001",
    survey = "acs1"
)

income_19

### Median HH income 5 year data by county --------------------
income_co <- get_acs(
    state = "MD",
    geography = "county",
    variables = "B19013_001" #Median household income
)

income_co

### Median HH income 5 year data by tract --------------------
income_tr <- get_acs(
    state = "MD",
    county = "Anne Arundel",
    geography = "tract",
    variables = "B19013_001"
)

income_tr

colnames(income_co)<-tolower(colnames(income_co))

income_co <- income_co %>%
    mutate(
        name = gsub(", Maryland", "", name), 
        name = gsub(" County", "", name))

income_co$name<-factor(income_co$name, 
                    levels = unique(income_co$name) 
                    [order(income_co$estimate, decreasing = FALSE)])

income_co %>% 
    plot_ly( y = ~name, x = ~estimate, type = "bar"
             ) %>%
    layout(
        title ="Median household income in each county, MD", 
        yaxis = list(title = ""),
        xaxis = list(title = "USD",
                     titlefont=list(size=12)), 
        Showlegend=FALSE
        )

colnames(income_tr)<-tolower(colnames(income_tr))

income_tr <- income_tr %>%
    mutate(
        name = gsub(", Maryland", "", name), 
        name = gsub(" County", "", name))

income_tr %>% 
    plot_ly( x = ~estimate, type = "box",
             hoverinfo = 'text',
             text = ~paste(
                      '</br>', name,          
                      '</br> Income (USD): ', estimate)
             ) %>%
    layout(
        title ="Median household income across tracts, Anne Arundel County, MD", 
        yaxis = list(title = ""),
        xaxis = list(title = "Median household income (USD)",
                     titlefont=list(size=12)), 
        legend = list(font=list(size=12))
        )

income_tr<-income_tr%>%mutate(county = "Anne Arundel")

#geom_quasirandom problem...???
p<-ggplot(income_tr, aes(x = county, y = estimate, color = estimate)) +
    geom_quasirandom(alpha = 0.5) +
    coord_flip() +
    theme_minimal() +
    scale_color_viridis_c(guide = FALSE) +
    scale_y_continuous(labels = scales::dollar) +
    labs(x = " ",
         y = "Median household income",
         title = "Household income distribution across tracts, Anne Arundel County, MD",
         subtitle = "Census tracts, New York City",
         caption = "Data source: 2015-2019 ACS")

#p

ggplotly(p)

income_tr<-income_tr%>%mutate(county = "Anne Arundel")

income_tr %>% 
    plot_ly( x = ~estimate, type = "histogram",
             hoverinfo = 'text',
             text = ~paste(
                      '</br>', name,          
                      '</br> Income (USD): ', estimate)
             ) %>%
    layout(
        title ="Median household income across tracts, Anne Arundel County, MD", 
        yaxis = list(title = "Number of tracts"),
        xaxis = list(title = "Median household income (USD)",
                     titlefont=list(size=12)), 
        legend = list(font=list(size=12)), 
        bargap = 0.1
        )

2.3.1 Bonus: Income by race

md_race_income <- get_acs(
      geography = "tract",
      state = "MD",
      county = c("Anne Arundel"),
      variables = c(White = "B03002_003",
                    Black = "B03002_004",
                    Asian = "B03002_006",
                    Hispanic = "B03002_012"),
      summary_var = "B19013_001"
    ) %>%
    #group_by(GEOID) %>%
    #filter(estimate == max(estimate, na.rm = TRUE)) %>%
    #ungroup() %>%
    filter(estimate != 0)

p<-ggplot(md_race_income, aes(x = variable, y = estimate, color = estimate)) +
    geom_quasirandom(alpha = 0.5) +
    coord_flip() +
    theme_minimal() +
    #scale_color_viridis_c(guide = FALSE) +
    scale_color_viridis_c(guide = "none") +
    scale_y_continuous(labels = scales::dollar) +
    labs(x = " ",
         y = "Median household income",
         title = "Household income distribution by largest racial/ethnic group",
         subtitle = "Census tracts, Anne Arundel County",
         caption = "Data source: 2015-2019 ACS")

p

ggplotly(p)

md_race_income <- get_acs(
        geography = "county",
        state = "MD",
        variables = c(White = "B03002_003",
                      Black = "B03002_004",
                      Asian = "B03002_006",
                      Hispanic = "B03002_012"),
        summary_var = "B19013_001"
    ) %>%
    #group_by(GEOID) %>%
    #filter(estimate == max(estimate, na.rm = TRUE)) %>%
    #ungroup() %>%
    filter(estimate != 0)

p<-ggplot(md_race_income, aes(x = variable, y = estimate, color = estimate)) +
    geom_quasirandom(alpha = 0.5) +
    coord_flip() +
    theme_minimal() +
    #scale_color_viridis_c(guide = FALSE) +
    scale_color_viridis_c(guide = "none") +
    scale_y_continuous(labels = scales::dollar) +
    labs(x = " ",
         y = "Median household income",
         title = "Household income distribution by largest racial/ethnic group",
         subtitle = "Counties, Maryland",
         caption = "Data source: 2015-2019 ACS")

p

ggplotly(p)

2.3.2 Income map

aa_income <- get_acs(
    geography = "tract",
    variables = "B19013_001",
    state = "MD",
    county = "Anne Arundel",
    geometry = TRUE
)

ggplot(aa_income, aes(fill = estimate)) + 
    geom_sf()

ggplot(aa_income, aes(fill = estimate)) + 
    geom_sf() + 
    scale_fill_viridis_c(option = "mako")

aa_income_erase <- get_acs(
        geography = "tract",
        variables = "B19013_001",
        state = "MD",
        county = "Anne Arundel",
        geometry = TRUE,
        cb = FALSE
    ) %>%
    st_transform(26982) %>%
    erase_water(area_threshold = 0.99)

ggplot(aa_income_erase, aes(fill = estimate)) + 
    geom_sf() + 
    scale_fill_viridis_c(option = "mako")

final_map <- ggplot(orleans_erase, aes(fill = estimate)) + 
    annotation_map_tile(type = "cartolight", zoom = 11) +
    theme_void(base_size = 20) + 
    geom_sf(alpha = 0.6, lwd = 0.1) + 
    scale_fill_viridis_c(option = "mako", labels = label_dollar()) + 
    labs(title = "Median household income, Anne Arundel, MD",
         subtitle = "2016-2020 ACS estimates",
         caption = "Tile source: CARTO / OpenStreetMap contributors",
         fill = "ACS estimate  ")

2.4 Home value

median_home_value <- get_acs(
        state = "MD",
        geography = "tract",
        variables = "B25077_001",
        year = 2020
    )

median_home_value_county <- get_acs(
        state = "MD",
        geography = "county",
        variables = "B25077_001",
        #year = 2020, not yet available? 
        year = 2019, 
        survey = "acs1"
    )%>%
    mutate(NAME = str_remove(NAME, 
                           " County, Maryland"))

Practice question: which tracts are in the top/bottom 10 percent in the state for median home values, and how does this break down by county?

top10percent <- median_home_value %>%
    mutate(percentile = percent_rank(estimate)) %>%
    filter(percentile >= 0.9)%>%
    separate(
        NAME,
        into = c("tract", "county", "state"),
        sep = ", "
    )

summary_top10 <- top10percent %>%
    group_by(county) %>%
    summarize(n = n()) %>%
    arrange(desc(n))

summary_top10

bottom10percent <- median_home_value %>%
    mutate(percentile = percent_rank(estimate)) %>%
    filter(percentile < 0.1)%>%
    separate(
        NAME,
        into = c("tract", "county"),
        sep = ", "
    )

summary_bottom10 <- bottom10percent %>%
    group_by(county) %>%
    summarize(n = n()) %>%
    arrange(desc(n))

summary_bottom10

#library(tidyverse)
#library(scales)
ggplot(median_home_value, 
       aes(x = estimate)) + 
    geom_histogram(bins = 30) + 
    scale_x_continuous(labels = label_dollar())

ggplot(median_home_value_county, 
       aes(x = estimate, 
           y = reorder(NAME, estimate))) + 
    geom_col() + 
    labs(title = "Median home value, 2016-2020 ACS", 
         subtitle = "Counties in Maryland", 
         x = "ACS estimate", 
         y = "") + 
    theme_minimal(base_size = 18) + 
    scale_x_continuous(labels = dollar_format(scale = 0.001, 
                                              suffix = "K"))

median_home_value_county %>%
  arrange(desc(estimate)) %>%
  head(3)

ggplot(median_home_value_county, 
       aes(x = estimate, 
           y = reorder(NAME, estimate))) + 
    geom_errorbar(aes(xmin = estimate - moe, 
                      xmax = estimate + moe),
                  width = 0.5, size = 0.5) +
    geom_point(color = "navy", size = 1) +
    labs(title = "Median home value, 2016-2020 ACS", 
         subtitle = "Counties in Maryland", 
         x = "ACS estimate", 
         y = "",
         caption = "Error bars reflect the margin of error around the ACS estimate") +
    theme_minimal(base_size = 18) + 
    scale_x_continuous(labels = dollar_format(scale = 0.001, 
                                            suffix = "K"))

2.5 Population pyramid

cohort_names <- c("0-4", "5-9", "10-14", "15-19",
                  "20-24", "25-29", "30-34", "35-39",
                  "40-44", "45-49", "50-54", "55-59",
                  "60-64", "65-69", "70-74", "75-79",
                  "80-84", "85+")
male_vars <- 2:19 %>%
    str_pad(2, "left", "0") %>%
    paste0("S0101_C03_0", .) %>%
    set_names(cohort_names)

female_vars <- 2:19 %>%
    str_pad(2, "left", "0") %>%
    paste0("S0101_C05_0", .) %>%
    set_names(cohort_names)

male_data <- get_acs(
        geography = "county",
        variables = male_vars,
        state = "MI",
        county = "Washtenaw",
        year = 2020
    ) %>%
    mutate(sex = "Male",
           estimate = estimate * -1)

female_data <- get_acs(
        geography = "county",
        variables = female_vars,
        state = "MI",
        county = "Washtenaw",
        year = 2020
    ) %>%
    mutate(sex = "Female")

pyramid_data <- bind_rows(male_data, female_data) %>%
    mutate(variable = factor(variable, levels = cohort_names))

ggplot(pyramid_data, 
                     aes(x = estimate, y = variable, 
                         fill = sex)) + 
    geom_col(width = 0.95, alpha = 0.75) + 
    theme_minimal(base_size = 18) + 
    scale_x_continuous(labels = function(x) paste0(abs(x / 1000), "k")) + 
    scale_fill_manual(values = c("#00274C", "#FFCB05")) + 
    labs(x = "", 
         y = "ACS estimate", 
         title = "Population structure in Anne Arundel County, Michigan", 
         fill = "", 
         caption = "Data source: 2016-2020 ACS & tidycensus R package")

2.6 Basic Profile

commonvars = c(
                totalpop_ = "B03002_001", 
                median_income_ = "B19013_001",
                median_age_ = "B01002_001", 
                pct_college_ = "DP02_0068P" #% of pop 25+ with a bachelor's degree 
                )

### Basic variables from 5 year data by county --------------------

basic_co_1519 <- get_acs(
    state = "MD",
    geography = "county",
    variables = commonvars, 
    output = "wide",
    year = 2019
)

basic_co_1014 <- get_acs(
    state = "MD",
    geography = "county",
    variables = commonvars, 
    output = "wide", 
    year = 2014
)

temp1<-basic_co_1519%>%mutate(period="2015-2019")
temp2<-basic_co_1014%>%mutate(period="2010-2014")

dtabasic<-rbind(temp1, temp2)
colnames(dtabasic)<-tolower(colnames(dtabasic))
str(dtabasic)

dtabasic<-dtabasic%>%
    mutate(
        name = gsub(", Maryland", "", name), 
        name = gsub(" County", "", name))    

str(dtabasic)