tidycensus practice using Anne Arundel county, MD, data
https://walker-data.com/umich-workshop-2022/intro-2020-census/
2020 Census data: what we have now * The currently available 2020 Census data come from the PL94-171 Redistricting Summary File, which is used for congressional apportionment & redistricting
Variables available include total counts (population & households), occupied / vacant housing units, total and voting-age population breakdowns by race & ethnicity, and group quarters status
Later in 2022 (expected): the Demographic and Housing Characteristics summary files, which will include other variables typically included in the decennial Census data (age & sex breakdowns, detailed race & ethnicity)
pop20 <- get_decennial(
geography = "state",
variables = "P1_001N",
year = 2020 #<<<< Deccenial census year
)
#pop20
dim(pop20)
str(pop20)
Understanding the printed messages * The Census Bureau is using differential privacy in an attempt to preserve respondent confidentiality in the 2020 Census data, which is required under US Code Title 13
Intentional errors are introduced into data, impacting the accuracy of small area counts (e.g. some blocks with children, but no adults)
Advocates argue that differential privacy is necessary to satisfy Title 13 requirements given modern database reconstruction technologies; critics contend that the method makes data less useful with no tangible privacy benefit
table_p2 <- get_decennial(
geography = "state",
table = "P2", #<<<< Table number
year = 2020
)
dim(table_p2)
str(table_p2)
https://walker-data.com/tidycensus/articles/basic-usage.html#geography-in-tidycensus-1
vars <- load_variables(2020, "pl")
dim(vars)
#View(vars)
tidy = long form in tidycensus
, since it’s modeled after
tidyverse
group_quarters <- get_decennial(
geography = "state",
table = "P5",
year = 2020
)
dim(group_quarters)
table(group_quarters$variable)
group_quarters_wide <- get_decennial(
geography = "state",
table = "P5",
year = 2020,
output = "wide"
)
dim(group_quarters_wide)
table(group_quarters_wide$variable)
Census variables can be hard to remember; using a named vector to request variables will replace the Census IDs with a custom input
In long form, these custom inputs will populate the variable column; in wide form, they will replace the column names
vacancies_wide <- get_decennial(
geography = "county",
state = "MD",
variables = c(vacant_households = "H1_003N",
total_households = "H1_001N"),
output = "wide",
year = 2020
)
For geographies available below the state level, the state parameter allows you to query data for a specific state
tidycensus
translates state names and postal
abbreviations internally, so you don’t need to remember the FIPS
codes!
pop_co <- get_decennial(
state = "MD",
geography = "county",
variables = "P1_001N", #<<<<< total pop
#variables = "P2_002N", #<<<<< hispanic pop
year = 2020
)
pop_co_table <- get_decennial(
state = "MD",
geography = "county",
table = "P2",
year = 2020
)
dim(pop_co)
str(pop_co)
dim(pop_co_table)
str(pop_co_table)
table(pop_co_table$variable)
pop_tr <- get_decennial(
state = "MD",
county = "Anne Arundel",
geography = "tract",
variables = "P1_001N",
year = 2020
)
pop_tr_table <- get_decennial(
state = "MD",
county = "Anne Arundel",
geography = "tract",
table = "P2",
year = 2020
)
medage_tr <- get_decennial(
state = "MD",
county = "Anne Arundel",
geography = "tract",
#variables = "P1_001N", WHAT IS THE MEDIAN AGE??
year = 2020
)
dim(pop_tr)
dim(pop_tr_table)
poptr
md_population <- get_decennial(
geography = "county",
variables = "P1_001N",
year = 2020,
state = "MD"
)
arrange(md_population, value)
arrange(md_population, desc(value))
below30000 <- filter(md_population, value < 30000)
below30000
Many decennial Census and ACS variables are organized in tables in which the first variable represents a summary variable, or denominator for the others
The parameter summary_var can be used to generate a new column in long-form data for a requested denominator, which works well for normalizing estimates
race_vars <- c(
Hispanic = "P2_002N",
White = "P2_005N",
Black = "P2_006N",
Native = "P2_007N",
Asian = "P2_008N",
HIPI = "P2_009N"
)
md_race <- get_decennial(
geography = "county",
state = "MD",
variables = race_vars,
summary_var = "P2_001N",
year = 2020
)
md_race
md_race_percent <- md_race %>%
mutate(percent = round(100 * (value / summary_value), 1)) %>%
select(NAME, variable, percent)
arrange(md_race_percent, NAME)
largest_group <- md_race_percent %>%
group_by(NAME) %>%
filter(percent == max(percent))
largest_group
md_race_percent %>%
group_by(variable) %>%
summarize(
median_pct = median(percent),
minimum_pct = min(percent),
maximum_pct = max(percent)
)
tidycensus
allows users to access the 2000 and 2010
decennial Census data for comparison, though variable IDs will
differcounty_pop_10 <- get_decennial(
state = "MD",
geography = "county",
variables = "P001001",
year = 2010
)
dim(county_pop_10)
colnames(county_pop_10)
county_pop_10 <- county_pop_10 %>%
select(GEOID, value10 = value)
dim(county_pop_10)
colnames(county_pop_10)
county_pop_20 <- get_decennial(
state = "MD",
geography = "county",
variables = "P1_001N", #<<<<< Notice the variable name change!!
year = 2020)%>%
select(GEOID, NAME, value20 = value)
county_joined <- county_pop_20 %>%
left_join(county_pop_10, by = "GEOID")%>%
mutate(
total_change = value20 - value10,
percent_change = round(100 * (total_change / value10), 1)
)
head(county_joined, 10)
Caveat: changing geographies! * County names and boundaries can change from year to year, introducing potential problems in time-series analysis
county_pop_10 <- get_decennial(
geography = "county",
variables = "P001001",
year = 2010)%>%
select(GEOID, value10 = value)
county_pop_20 <- get_decennial(
geography = "county",
variables = "P1_001N", #<<<<< Notice the variable name change!!
year = 2020)%>%
select(GEOID, NAME, value20 = value)
county_joined <- county_pop_20 %>%
left_join(county_pop_10, by = "GEOID")%>%
mutate(
total_change = value20 - value10,
percent_change = round(100 * (total_change / value10), 1)
)
filter(county_joined, is.na(value10)) #<<<<< counties that did not exist in 2010??
md_hispanic <- get_decennial(
geography = "county",
variables = c(total = "P2_001N",
hispanic = "P2_002N"),
state = "MD",
year = 2020,
output = "wide") %>%
mutate(percent = 100 * (hispanic / total))
ggplot(md_hispanic, aes(x = percent)) +
geom_histogram(bins = 10)
ggplot(md_hispanic, aes(x = percent)) +
geom_boxplot()
options(scipen = 999) # Disable scientific notation
p<- ggplot(md_hispanic, aes(x = total, y = percent)) +
geom_point()
p+ geom_smooth(method = "lm")
p+ scale_x_log10() +
geom_smooth()
md_vacancies <- get_decennial(
state = "MD",
geography = "county",
variables = c(total_households = "H1_001N",
vacant_households = "H1_003N"),
year = 2020,
output = "wide") %>%
mutate(percent_vacant = 100 * (vacant_households / total_households))
ggplot(md_vacancies, aes(x = percent_vacant, y = NAME)) +
geom_col()
library(scales)
ggplot(md_vacancies,
aes(x = percent_vacant, y = reorder(NAME, percent_vacant))) +
geom_col() +
scale_x_continuous(labels = label_percent(scale = 1)) +
scale_y_discrete(
labels = function(y)
str_remove(y, " County, Maryland")) +
labs(x = "Percent vacant households",
y = "",
title = "Household vacancies by county in Maryland",
subtitle = "2020 decennial US Census")
ggplot(md_vacancies,
aes(x = percent_vacant, y = reorder(NAME, percent_vacant))) +
geom_col(fill = "navy", color = "navy", alpha = 0.5) +
theme_minimal(base_family = "Verdana") +
scale_x_continuous(labels = label_percent(scale = 1)) +
scale_y_discrete(
labels = function(y)
str_remove(y, " County, Maryland")) +
labs(x = "Percent vacant households",
y = "",
title = "Household vacancies by county in Maryland",
subtitle = "2020 decennial US Census")
nyc_percent_black <- get_decennial(
geography = "tract",
variables = "P2_006N",
summary_var = "P2_001N",
state = "NY",
county = c("New York", "Kings",
"Queens", "Bronx",
"Richmond"),
year = 2020) %>%
mutate(percent = 100 * (value / summary_value))
nyc_percent_black2 <- nyc_percent_black %>%
separate(NAME, into = c("tract", "county", "state"),
sep = ", ")
ggplot(nyc_percent_black2,
aes(x = percent, fill = county)) +
geom_density(alpha = 0.3)
ggplot(nyc_percent_black2,
aes(x = percent)) +
geom_density(fill = "darkgreen", color = "darkgreen", alpha = 0.5) +
facet_wrap(~county) +
scale_x_continuous(labels = label_percent(scale = 1)) +
theme_minimal(base_size = 14) +
theme(axis.text.y = element_blank()) +
labs(x = "Percent Black",
y = "",
title = "Black population shares by Census tract, 2020")
library(ggridges)
ggplot(nyc_percent_black2, aes(x = percent, y = county)) +
geom_density_ridges() +
theme_ridges() +
labs(x = "Percent Black, 2020 (by Census tract)",
y = "") +
scale_x_continuous(labels = label_percent(scale = 1))
us_percent_white <- map_dfr(c(state.abb, "DC"), function(state) {
get_decennial(
geography = "tract",
variables = "P2_005N",
summary_var = "P2_001N",
state = state,
year = 2020) %>%
mutate(percent = 100 * (value / summary_value)) %>%
separate(NAME, into = c("tract", "county", "state"),
sep = ", ")
})
library(geofacet)
ggplot(us_percent_white, aes(x = percent)) +
geom_histogram(fill = "navy", alpha = 0.8, bins = 30) +
theme_minimal() +
scale_fill_manual(values = c("darkred", "navy")) +
facet_geo(~state, grid = "us_state_grid2",
label = "code", scales = "free_y") +
theme(axis.text = element_blank(),
strip.text.x = element_text(size = 8)) +
labs(x = "",
y = "",
title = "Non-Hispanic white population shares among Census tracts",
fill = "",
caption = "Data source: 2020 decennial US Census & tidycensus R package\nX-axes range from 0% white (on the left) to 100% white (on the right). Y-axes are unique to each state.")
American Community Survey
Annual survey of 3.5 million US households
Covers topics not available in decennial US Census data (e.g. income, education, language, housing characteristics)
Available as 1-year estimates (for geographies of population 65,000 and greater) and 5-year estimates (for geographies down to the block group) *** 2020 1-year data only available as experimental estimates
Data delivered as estimates characterized by margins of error
ACS data in tidycensus
The get_acs() function in tidycensus allows you to access ACS data from 2005 through 2020
Our focus today: the 2016-2020 ACS estimates, available with the
argument year = 2020
Other required arguments: geography
, for the level
of aggregation, and variables
, for one or more ACS
variables
** cbsa
: metropolitan statistical area/micropolitan
statistical area ** zcta
: “zip code tabulation
area”
get_acs() defaults to the 5-year ACS with the argument
survey = "acs5"
; survey = "acs1"
is used for
1-year ACS data
Smaller geographies (e.g. tracts, block groups) are available by state and county using state and county arguments
Access with common names - no need for FIPS codes!
The ACS Detailed Tables, Data Profile, and Subject Tables are available with get_acs(); tidycensus will auto-detect the dataset from the variable name and fetch from the appropriate location
To look up variable IDs from a dataset, use:
*** Detailed Tables: load_variables(2020, “acs5”) *** Data Profile: load_variables(2020, “acs5/profile”) *** Subject Tables: load_variables(2020, “acs5/subject”) *** Comparison Profile: load_variables(2020, “acs5/cprofile”) (GitHub only)
## ----born in mexico-------------------------------------------
born_in_mexico <- get_acs(
geography = "state",
#state = "MD",
variables = "B05006_150",
year = 2020 #<<
)
## ----median-age-----------------------------------------------
median_age <- get_acs(
geography = "state",
#state = "MD",
variables = "B01002_001",
year = 2020 #<<
)
median_age_cbsa <- get_acs(
geography = "cbsa", #<<
variables = "B01002_001",
year = 2020
)
aa_median_age <- get_acs(
geography = "block group",
#geography = "tract",
variables = "B01002_001",
state = "MI",
county = "Washtenaw",
year = 2020
)
acs5vars <- load_variables(2020, "acs5")
class(acs5vars)
dim(acs5vars)
acs1vars <- load_variables(2019, "acs1")
class(acs1vars)
dim(acs1vars)
subject <- load_variables(2020, "acs5/subject")
unique(subject$concept)
### Full table --------------------
age_table <- get_acs(
geography = "state",
table = "B01001"
)
age_table
table(age_table$variable)
### Median age --------------------
medage <- get_acs(
geography = "state",
variable = "B06002_001"
)
medage
### Median age by county --------------------
medage_co <- get_acs(
state = "MD",
geography = "county",
variable = "B06002_001"
)
medage_co
### Median age by tract --------------------
medage_tr <- get_acs(
state = "MD",
county = "Anne Arundel",
geography = "tract",
variable = "B06002_001"
)
medage_tr
colnames(medage_co)<-tolower(colnames(medage_co))
medage_co <- medage_co %>%
mutate(
name = gsub(", Maryland", "", name),
name = gsub(" County", "", name))
medage_co$name<-factor(medage_co$name,
levels = unique(medage_co$name)
[order(medage_co$estimate, decreasing = FALSE)])
medage_co %>%
plot_ly( y = ~name, x = ~estimate, type = "bar"
) %>%
layout(
title ="Median age in each county, MD",
yaxis = list(title = ""),
xaxis = list(title = "Median age (year)",
titlefont=list(size=12)),
Showlegend=FALSE
)
colnames(medage_tr)<-tolower(colnames(medage_tr))
medage_tr <- medage_tr %>%
mutate(
name = gsub(", Maryland", "", name),
name = gsub(" County", "", name))
medage_tr %>%
plot_ly( x = ~estimate, type = "box",
hoverinfo = 'text',
text = ~paste(
'</br>', name,
'</br> Median age (year): ', estimate)
) %>%
layout(
title ="Median age across tracts, Anne Arundel County, MD",
yaxis = list(title = ""),
xaxis = list(title = "Median age (year)",
titlefont=list(size=12)),
legend = list(font=list(size=12))
)
medage_tr %>%
plot_ly( x = ~estimate, type = "histogram",
hoverinfo = 'text',
text = ~paste(
'</br>', name,
'</br> Median age (year): ', estimate)
) %>%
layout(
title ="Median age across tracts, Anne Arundel County, MD",
yaxis = list(title = "Number of tracts"),
xaxis = list(title = "Median age (year)",
titlefont=list(size=12)),
legend = list(font=list(size=12)),
bargap = 0.1
)
### Full table --------------------
racevars <- c(
NH_White = "B03002_003", #NH White
NH_Black = "B03002_004",
NH_AIAN = "B03002_005",
NH_Asian = "B03002_006",
NH_NHPI = "B03002_007",
NH_Other = "B03002_008",
NH_Multiple = "B03002_009",
Hispanic = "B03002_012"
)
### Race by tract --------------------
race_co <- get_acs(
state = "MD",
geography = "county",
variables = racevars,
summary_var = "B03002_001", #Total population
year=2019
)
race_co
### Race by tract --------------------
race_tr <- get_acs(
state = "MD",
county = "Anne Arundel",
geography = "tract",
variables = racevars,
summary_var = "B03002_001", #Total population
year=2019
)
race_tr
colnames(race_co)<-tolower(colnames(race_co))
dtarace_co <- race_co %>%
mutate(percent = round( 100 * (estimate / summary_est), 2) ) %>%
select(name, variable, percent)%>%
rename(race=variable)%>%
mutate(
name = gsub(", Maryland", "", name),
name = gsub(" County", "", name))
dtarace_co %>%
plot_ly( x = ~name, y = ~percent, type = "bar",
color= ~race,
colors = brewer.pal(length(unique(dtarace_co$race)),
"Set3")
) %>%
layout(
title ="Percent distribution by race in each county, MD",
xaxis = list(title = ""),
yaxis = list(title = "Percent",
titlefont=list(size=12)),
legend = list(font=list(size=12)),
barmode = 'stack'
)
dtarace_co <- dtarace_co %>%
mutate(race = ifelse(race=="NH_AIAN", "NH_AIAN/NHPI/Other",
ifelse(race=="NH_NHPI", "NH_AIAN/NHPI/Other",
ifelse(race=="NH_Other", "NH_AIAN/NHPI/Other", race))))%>%
group_by(name, race)%>%
summarize(percent = sum(percent))
dtarace_co %>%
plot_ly( y = ~name, x = ~percent, type = "bar",
hoverinfo = 'text',
text = ~paste(
'</br>', name,
'</br> Race: ', race,
'</br> Value(%): ', percent),
color= ~race,
colors = brewer.pal(length(unique(dtarace_co$race)),
"Set3")
) %>%
layout(
title ="Percent distribution by race in each county, MD",
yaxis = list(title = ""),
xaxis = list(title = "Percent",
titlefont=list(size=12)),
legend = list(font=list(size=12)),
barmode = 'stack'
)
colnames(race_tr)<-tolower(colnames(race_tr))
dtarace_tr <- race_tr %>%
mutate(percent = round( 100 * (estimate / summary_est), 2) ) %>%
select(name, variable, percent)%>%
rename(race=variable)%>%
mutate(
name = gsub(", Maryland", "", name),
name = gsub(" County", "", name))%>%
mutate(race = ifelse(race=="NH_AIAN", "NH_AIAN/NHPI/Other",
ifelse(race=="NH_NHPI", "NH_AIAN/NHPI/Other",
ifelse(race=="NH_Other", "NH_AIAN/NHPI/Other", race))))%>%
group_by(name, race)%>%
summarize(percent = sum(percent))
dtarace_tr %>%
plot_ly( x = ~percent, type = "box",
hoverinfo = 'text',
text = ~paste(
'</br>', name,
'</br> Race: ', race,
'</br> Value(%): ', percent),
color= ~race,
colors = brewer.pal(length(unique(dtarace_tr$race)),
"Set3")
) %>%
layout(
title ="Distribution of percent pop by race across tracts, Anne Arundel County, MD",
yaxis = list(title = ""),
xaxis = list(title = "Percent",
titlefont=list(size=12)),
showlegend = FALSE
)
# comparing 5 vs. 1 year data
### 5 year data --------------------
income_15to19 <- get_acs(
geography = "state",
variables = "B19013_001"
)
income_15to19
### 1 year data --------------------
income_19 <- get_acs(
geography = "state",
variables = "B19013_001",
survey = "acs1"
)
income_19
### Median HH income 5 year data by county --------------------
income_co <- get_acs(
state = "MD",
geography = "county",
variables = "B19013_001" #Median household income
)
income_co
### Median HH income 5 year data by tract --------------------
income_tr <- get_acs(
state = "MD",
county = "Anne Arundel",
geography = "tract",
variables = "B19013_001"
)
income_tr
colnames(income_co)<-tolower(colnames(income_co))
income_co <- income_co %>%
mutate(
name = gsub(", Maryland", "", name),
name = gsub(" County", "", name))
income_co$name<-factor(income_co$name,
levels = unique(income_co$name)
[order(income_co$estimate, decreasing = FALSE)])
income_co %>%
plot_ly( y = ~name, x = ~estimate, type = "bar"
) %>%
layout(
title ="Median household income in each county, MD",
yaxis = list(title = ""),
xaxis = list(title = "USD",
titlefont=list(size=12)),
Showlegend=FALSE
)
colnames(income_tr)<-tolower(colnames(income_tr))
income_tr <- income_tr %>%
mutate(
name = gsub(", Maryland", "", name),
name = gsub(" County", "", name))
income_tr %>%
plot_ly( x = ~estimate, type = "box",
hoverinfo = 'text',
text = ~paste(
'</br>', name,
'</br> Income (USD): ', estimate)
) %>%
layout(
title ="Median household income across tracts, Anne Arundel County, MD",
yaxis = list(title = ""),
xaxis = list(title = "Median household income (USD)",
titlefont=list(size=12)),
legend = list(font=list(size=12))
)
income_tr<-income_tr%>%mutate(county = "Anne Arundel")
#geom_quasirandom problem...???
p<-ggplot(income_tr, aes(x = county, y = estimate, color = estimate)) +
geom_quasirandom(alpha = 0.5) +
coord_flip() +
theme_minimal() +
scale_color_viridis_c(guide = FALSE) +
scale_y_continuous(labels = scales::dollar) +
labs(x = " ",
y = "Median household income",
title = "Household income distribution across tracts, Anne Arundel County, MD",
subtitle = "Census tracts, New York City",
caption = "Data source: 2015-2019 ACS")
#p
ggplotly(p)
income_tr<-income_tr%>%mutate(county = "Anne Arundel")
income_tr %>%
plot_ly( x = ~estimate, type = "histogram",
hoverinfo = 'text',
text = ~paste(
'</br>', name,
'</br> Income (USD): ', estimate)
) %>%
layout(
title ="Median household income across tracts, Anne Arundel County, MD",
yaxis = list(title = "Number of tracts"),
xaxis = list(title = "Median household income (USD)",
titlefont=list(size=12)),
legend = list(font=list(size=12)),
bargap = 0.1
)
md_race_income <- get_acs(
geography = "tract",
state = "MD",
county = c("Anne Arundel"),
variables = c(White = "B03002_003",
Black = "B03002_004",
Asian = "B03002_006",
Hispanic = "B03002_012"),
summary_var = "B19013_001"
) %>%
#group_by(GEOID) %>%
#filter(estimate == max(estimate, na.rm = TRUE)) %>%
#ungroup() %>%
filter(estimate != 0)
p<-ggplot(md_race_income, aes(x = variable, y = estimate, color = estimate)) +
geom_quasirandom(alpha = 0.5) +
coord_flip() +
theme_minimal() +
#scale_color_viridis_c(guide = FALSE) +
scale_color_viridis_c(guide = "none") +
scale_y_continuous(labels = scales::dollar) +
labs(x = " ",
y = "Median household income",
title = "Household income distribution by largest racial/ethnic group",
subtitle = "Census tracts, Anne Arundel County",
caption = "Data source: 2015-2019 ACS")
p
ggplotly(p)
md_race_income <- get_acs(
geography = "county",
state = "MD",
variables = c(White = "B03002_003",
Black = "B03002_004",
Asian = "B03002_006",
Hispanic = "B03002_012"),
summary_var = "B19013_001"
) %>%
#group_by(GEOID) %>%
#filter(estimate == max(estimate, na.rm = TRUE)) %>%
#ungroup() %>%
filter(estimate != 0)
p<-ggplot(md_race_income, aes(x = variable, y = estimate, color = estimate)) +
geom_quasirandom(alpha = 0.5) +
coord_flip() +
theme_minimal() +
#scale_color_viridis_c(guide = FALSE) +
scale_color_viridis_c(guide = "none") +
scale_y_continuous(labels = scales::dollar) +
labs(x = " ",
y = "Median household income",
title = "Household income distribution by largest racial/ethnic group",
subtitle = "Counties, Maryland",
caption = "Data source: 2015-2019 ACS")
p
ggplotly(p)
aa_income <- get_acs(
geography = "tract",
variables = "B19013_001",
state = "MD",
county = "Anne Arundel",
geometry = TRUE
)
ggplot(aa_income, aes(fill = estimate)) +
geom_sf()
ggplot(aa_income, aes(fill = estimate)) +
geom_sf() +
scale_fill_viridis_c(option = "mako")
aa_income_erase <- get_acs(
geography = "tract",
variables = "B19013_001",
state = "MD",
county = "Anne Arundel",
geometry = TRUE,
cb = FALSE
) %>%
st_transform(26982) %>%
erase_water(area_threshold = 0.99)
ggplot(aa_income_erase, aes(fill = estimate)) +
geom_sf() +
scale_fill_viridis_c(option = "mako")
final_map <- ggplot(orleans_erase, aes(fill = estimate)) +
annotation_map_tile(type = "cartolight", zoom = 11) +
theme_void(base_size = 20) +
geom_sf(alpha = 0.6, lwd = 0.1) +
scale_fill_viridis_c(option = "mako", labels = label_dollar()) +
labs(title = "Median household income, Anne Arundel, MD",
subtitle = "2016-2020 ACS estimates",
caption = "Tile source: CARTO / OpenStreetMap contributors",
fill = "ACS estimate ")
median_home_value <- get_acs(
state = "MD",
geography = "tract",
variables = "B25077_001",
year = 2020
)
median_home_value_county <- get_acs(
state = "MD",
geography = "county",
variables = "B25077_001",
#year = 2020, not yet available?
year = 2019,
survey = "acs1"
)%>%
mutate(NAME = str_remove(NAME,
" County, Maryland"))
Practice question: which tracts are in the top/bottom 10 percent in the state for median home values, and how does this break down by county?
top10percent <- median_home_value %>%
mutate(percentile = percent_rank(estimate)) %>%
filter(percentile >= 0.9)%>%
separate(
NAME,
into = c("tract", "county", "state"),
sep = ", "
)
summary_top10 <- top10percent %>%
group_by(county) %>%
summarize(n = n()) %>%
arrange(desc(n))
summary_top10
bottom10percent <- median_home_value %>%
mutate(percentile = percent_rank(estimate)) %>%
filter(percentile < 0.1)%>%
separate(
NAME,
into = c("tract", "county"),
sep = ", "
)
summary_bottom10 <- bottom10percent %>%
group_by(county) %>%
summarize(n = n()) %>%
arrange(desc(n))
summary_bottom10
#library(tidyverse)
#library(scales)
ggplot(median_home_value,
aes(x = estimate)) +
geom_histogram(bins = 30) +
scale_x_continuous(labels = label_dollar())
ggplot(median_home_value_county,
aes(x = estimate,
y = reorder(NAME, estimate))) +
geom_col() +
labs(title = "Median home value, 2016-2020 ACS",
subtitle = "Counties in Maryland",
x = "ACS estimate",
y = "") +
theme_minimal(base_size = 18) +
scale_x_continuous(labels = dollar_format(scale = 0.001,
suffix = "K"))
median_home_value_county %>%
arrange(desc(estimate)) %>%
head(3)
ggplot(median_home_value_county,
aes(x = estimate,
y = reorder(NAME, estimate))) +
geom_errorbar(aes(xmin = estimate - moe,
xmax = estimate + moe),
width = 0.5, size = 0.5) +
geom_point(color = "navy", size = 1) +
labs(title = "Median home value, 2016-2020 ACS",
subtitle = "Counties in Maryland",
x = "ACS estimate",
y = "",
caption = "Error bars reflect the margin of error around the ACS estimate") +
theme_minimal(base_size = 18) +
scale_x_continuous(labels = dollar_format(scale = 0.001,
suffix = "K"))
cohort_names <- c("0-4", "5-9", "10-14", "15-19",
"20-24", "25-29", "30-34", "35-39",
"40-44", "45-49", "50-54", "55-59",
"60-64", "65-69", "70-74", "75-79",
"80-84", "85+")
male_vars <- 2:19 %>%
str_pad(2, "left", "0") %>%
paste0("S0101_C03_0", .) %>%
set_names(cohort_names)
female_vars <- 2:19 %>%
str_pad(2, "left", "0") %>%
paste0("S0101_C05_0", .) %>%
set_names(cohort_names)
male_data <- get_acs(
geography = "county",
variables = male_vars,
state = "MI",
county = "Washtenaw",
year = 2020
) %>%
mutate(sex = "Male",
estimate = estimate * -1)
female_data <- get_acs(
geography = "county",
variables = female_vars,
state = "MI",
county = "Washtenaw",
year = 2020
) %>%
mutate(sex = "Female")
pyramid_data <- bind_rows(male_data, female_data) %>%
mutate(variable = factor(variable, levels = cohort_names))
ggplot(pyramid_data,
aes(x = estimate, y = variable,
fill = sex)) +
geom_col(width = 0.95, alpha = 0.75) +
theme_minimal(base_size = 18) +
scale_x_continuous(labels = function(x) paste0(abs(x / 1000), "k")) +
scale_fill_manual(values = c("#00274C", "#FFCB05")) +
labs(x = "",
y = "ACS estimate",
title = "Population structure in Anne Arundel County, Michigan",
fill = "",
caption = "Data source: 2016-2020 ACS & tidycensus R package")
commonvars = c(
totalpop_ = "B03002_001",
median_income_ = "B19013_001",
median_age_ = "B01002_001",
pct_college_ = "DP02_0068P" #% of pop 25+ with a bachelor's degree
)
### Basic variables from 5 year data by county --------------------
basic_co_1519 <- get_acs(
state = "MD",
geography = "county",
variables = commonvars,
output = "wide",
year = 2019
)
basic_co_1014 <- get_acs(
state = "MD",
geography = "county",
variables = commonvars,
output = "wide",
year = 2014
)
temp1<-basic_co_1519%>%mutate(period="2015-2019")
temp2<-basic_co_1014%>%mutate(period="2010-2014")
dtabasic<-rbind(temp1, temp2)
colnames(dtabasic)<-tolower(colnames(dtabasic))
str(dtabasic)
dtabasic<-dtabasic%>%
mutate(
name = gsub(", Maryland", "", name),
name = gsub(" County", "", name))
str(dtabasic)