library(tidycensus)
library(tidyverse)
library(leaflet)
library(tigris)
library(sf)
library(viridis)
library(mapview)
library(ggrepel)
library(ggthemes)
# Read the Census API key from the CENSUS_API_KEY environment variable instead
# of committing a secret to source control. Set it in ~/.Renviron, or once
# interactively with census_api_key("<your key>", install = TRUE).
census_key <- Sys.getenv("CENSUS_API_KEY")
if (!nzchar(census_key)) {
  stop(
    "Set the CENSUS_API_KEY environment variable before running these examples.",
    call. = FALSE
  )
}
census_api_key(census_key)

# Let tigris reuse previously downloaded geometry files between calls.
options(tigris_use_cache = TRUE)

Simple Example

This simple example shows how to retrieve ACS data for a specific table, by state, with ‘county’ level geography for the year 2017.

# All Texas counties, ACS table B25064, tabular data only (no geometry).
df2017 <- get_acs(
  geography = "county",
  state = "TX",
  table = "B25064",
  year = 2017,
  geometry = FALSE
)

## Getting data from the 2013-2017 5-year ACS

nrow(df2017)

## [1] 254

head(df2017)

## # A tibble: 6 x 5
##   GEOID NAME                    variable   estimate   moe
##   <chr> <chr>                   <chr>         <dbl> <dbl>
## 1 48001 Anderson County, Texas  B25064_001      748    35
## 2 48003 Andrews County, Texas   B25064_001      997    96
## 3 48005 Angelina County, Texas  B25064_001      774    17
## 4 48007 Aransas County, Texas   B25064_001      822   116
## 5 48009 Archer County, Texas    B25064_001      621    45
## 6 48011 Armstrong County, Texas B25064_001      732    70

Census Tract Level Details - Single County

This example shows how to retrieve Census tracts. Notice that a county must be provided for this level of geography.

# Tract-level request: a county is supplied along with the state.
dfTractsTravis <- get_acs(
  geography = "tract",
  state = "TX",
  county = "Travis",
  table = "B25064",
  year = 2017,
  geometry = FALSE
)

## Getting data from the 2013-2017 5-year ACS

nrow(dfTractsTravis)

## [1] 218

head(dfTractsTravis)

## # A tibble: 6 x 5
##   GEOID       NAME                                 variable  estimate   moe
##   <chr>       <chr>                                <chr>        <dbl> <dbl>
## 1 48453000101 Census Tract 1.01, Travis County, T… B25064_0…     1591   165
## 2 48453000102 Census Tract 1.02, Travis County, T… B25064_0…     2975  2228
## 3 48453000203 Census Tract 2.03, Travis County, T… B25064_0…     1723    58
## 4 48453000204 Census Tract 2.04, Travis County, T… B25064_0…     1020    76
## 5 48453000205 Census Tract 2.05, Travis County, T… B25064_0…     1136    95
## 6 48453000206 Census Tract 2.06, Travis County, T… B25064_0…     1262    79

Census Tract Level Details - Multiple Counties

This example shows how to retrieve Census tracts. Notice that you can provide multiple counties by passing in a vector of county names

# Several counties can be requested at once by passing a character vector.
counties <- c("Travis", "Williamson", "Hays", "Bastrop", "Caldwell")
dfTractsAustinMetro <- get_acs(
  geography = "tract",
  state = "TX",
  county = counties,
  table = "B25064",
  year = 2017,
  geometry = FALSE
)

## Getting data from the 2013-2017 5-year ACS

nrow(dfTractsAustinMetro)

## [1] 350

head(dfTractsAustinMetro)

## # A tibble: 6 x 5
##   GEOID      NAME                                  variable  estimate   moe
##   <chr>      <chr>                                 <chr>        <dbl> <dbl>
## 1 480219501… Census Tract 9501, Bastrop County, T… B25064_0…      921    57
## 2 480219502… Census Tract 9502, Bastrop County, T… B25064_0…      937   170
## 3 480219503… Census Tract 9503, Bastrop County, T… B25064_0…     1058   138
## 4 480219504… Census Tract 9504, Bastrop County, T… B25064_0…      881    84
## 5 480219505… Census Tract 9505.01, Bastrop County… B25064_0…      631   450
## 6 480219505… Census Tract 9505.02, Bastrop County… B25064_0…      818   213

Caching a Table for Faster Future Access

This example shows how to cache a table for faster future access. The cache_table parameter controls this and is set to FALSE by default.

# Same Travis County request, with cache_table = TRUE so the table's variable
# list is cached for faster future access.
dfTractsTravis <- get_acs(
  geography = "tract",
  state = "TX",
  county = "Travis",
  table = "B25064",
  year = 2017,
  geometry = FALSE,
  cache_table = TRUE
)

## Getting data from the 2013-2017 5-year ACS

## Loading ACS5 variables for 2017 from table B25064 and caching the dataset for faster future access.

nrow(dfTractsTravis)

## [1] 218

head(dfTractsTravis)

## # A tibble: 6 x 5
##   GEOID       NAME                                 variable  estimate   moe
##   <chr>       <chr>                                <chr>        <dbl> <dbl>
## 1 48453000101 Census Tract 1.01, Travis County, T… B25064_0…     1591   165
## 2 48453000102 Census Tract 1.02, Travis County, T… B25064_0…     2975  2228
## 3 48453000203 Census Tract 2.03, Travis County, T… B25064_0…     1723    58
## 4 48453000204 Census Tract 2.04, Travis County, T… B25064_0…     1020    76
## 5 48453000205 Census Tract 2.05, Travis County, T… B25064_0…     1136    95
## 6 48453000206 Census Tract 2.06, Travis County, T… B25064_0…     1262    79

Specifying Variables from a Table

You can specify particular variables from a table rather than returning the entire table. Table B25068 from ACS contains Bedrooms by Gross Rent. This includes no bedroom, 1BR, 2BR, and 3+BR. But what if you only want the data for 3+BR and not all the other information? The variables parameter can be used. It accepts a character vector or a single character string. In this example we’ll return the information for 3+BR, which is contained within the 031-036 variables. We also specify a summary variable here using the 030 variable.

# Variables B25068_031 through B25068_036 (the 3+BR rows of table B25068).
cats <- paste0("B25068_", sprintf("%03d", 31:36))

counties <- c(
  "Kendall", "Comal", "Bexar", "Atascosa",
  "Bandera", "Guadalupe", "Medina", "Wilson"
)

# B25068_030 is requested as the summary variable, so every row also carries
# summary_est / summary_moe columns.
df <- get_acs(
  geography = "county",
  state = "TX",
  county = counties,
  variables = cats,
  summary_var = "B25068_030",
  year = 2017,
  geometry = FALSE,
  cache_table = TRUE
)

## Getting data from the 2013-2017 5-year ACS

head(df)

## # A tibble: 6 x 7
##   GEOID NAME               variable  estimate   moe summary_est summary_moe
##   <chr> <chr>              <chr>        <dbl> <dbl>       <dbl>       <dbl>
## 1 48013 Atascosa County, … B25068_0…        6     9        1352         312
## 2 48013 Atascosa County, … B25068_0…        7    12        1352         312
## 3 48013 Atascosa County, … B25068_0…      328   141        1352         312
## 4 48013 Atascosa County, … B25068_0…      441   167        1352         312
## 5 48013 Atascosa County, … B25068_0…      433   166        1352         312
## 6 48013 Atascosa County, … B25068_0…      137   101        1352         312

Retrieving Decennial Census Information

The get_decennial() function can be used to return decennial census information from the years 1990, 2000, and 2010.

vars10 <- c("P005003", "P005004", "P005006", "P004003")

# Fetch the four variables for Illinois counties with geometry, then express
# each value as a percentage of the summary variable (P001001).
il <- get_decennial(
  geography = "county",
  state = "IL",
  variables = vars10,
  summary_var = "P001001",
  year = 2010,
  geometry = TRUE
) %>%
  mutate(pct = 100 * (value / summary_value))

# One map panel per variable; fill and outline both encode the percentage.
ggplot(il, aes(fill = pct, color = pct)) + geom_sf() + facet_wrap(~variable)

Filtering with tidyverse

This example will show you how to create a subset of records with the filter() dplyr function. We’ll filter estimated median home values for 2017 from the ACS to include only records where the estimated median home value is at least 200,000.

# Median home value (B25077_001) for every county, 2017 ACS.
dfHomeValues <- get_acs(
  geography = "county",
  variables = "B25077_001",
  year = 2017,
  geometry = FALSE
)

## Getting data from the 2013-2017 5-year ACS

nrow(dfHomeValues)

## [1] 3220

#filter the data so that only counties with a median value of 200,000 or more are retained (note the >=)
dfHomeValues = filter(dfHomeValues, estimate >= 200000)
nrow(dfHomeValues)

## [1] 456

head(dfHomeValues, 20)

## # A tibble: 20 x 5
##    GEOID NAME                                 variable   estimate   moe
##    <chr> <chr>                                <chr>         <dbl> <dbl>
##  1 02016 Aleutians West Census Area, Alaska   B25077_001   238800 48045
##  2 02020 Anchorage Municipality, Alaska       B25077_001   304500  5427
##  3 02068 Denali Borough, Alaska               B25077_001   224200 32280
##  4 02090 Fairbanks North Star Borough, Alaska B25077_001   230600  6832
##  5 02100 Haines Borough, Alaska               B25077_001   243100 35366
##  6 02105 Hoonah-Angoon Census Area, Alaska    B25077_001   226800 17022
##  7 02110 Juneau City and Borough, Alaska      B25077_001   343100  8120
##  8 02122 Kenai Peninsula Borough, Alaska      B25077_001   234600  6420
##  9 02130 Ketchikan Gateway Borough, Alaska    B25077_001   265700  7682
## 10 02150 Kodiak Island Borough, Alaska        B25077_001   267700 14317
## 11 02170 Matanuska-Susitna Borough, Alaska    B25077_001   235600  2432
## 12 02195 Petersburg Borough, Alaska           B25077_001   205000 16068
## 13 02220 Sitka City and Borough, Alaska       B25077_001   350900 12649
## 14 02230 Skagway Municipality, Alaska         B25077_001   293800 22308
## 15 02261 Valdez-Cordova Census Area, Alaska   B25077_001   252200 24296
## 16 04005 Coconino County, Arizona             B25077_001   241400  9243
## 17 04013 Maricopa County, Arizona             B25077_001   225000   888
## 18 04025 Yavapai County, Arizona              B25077_001   215000  4772
## 19 06001 Alameda County, California           B25077_001   649100  3303
## 20 06003 Alpine County, California            B25077_001   343800 37678

Arranging with tidyverse

In this example you’ll learn how to use the arrange() tidyverse function to order the rows in a data frame using a variable (column). Rows can be sorted in ascending or descending order. Here we’ll arrange the median home values for 2017 from ACS in descending order.

# Median home value (B25077_001) for every county, 2017 ACS.
dfHomeValues <- get_acs(
  geography = "county",
  variables = "B25077_001",
  year = 2017,
  geometry = FALSE
)

## Getting data from the 2013-2017 5-year ACS

nrow(dfHomeValues)

## [1] 3220

#keep only counties with a median value of 200,000 or more (>=), then sort them from highest to lowest estimate
dfHomeValues = filter(dfHomeValues, estimate >= 200000)
dfHomeValues = arrange(dfHomeValues, desc(estimate))
head(dfHomeValues, 20)

## # A tibble: 20 x 5
##    GEOID NAME                             variable   estimate    moe
##    <chr> <chr>                            <chr>         <dbl>  <dbl>
##  1 25019 Nantucket County, Massachusetts  B25077_001   995900  52007
##  2 06075 San Francisco County, California B25077_001   927400   7229
##  3 06081 San Mateo County, California     B25077_001   917700   6273
##  4 36061 New York County, New York        B25077_001   915300  14939
##  5 06041 Marin County, California         B25077_001   908800   8635
##  6 06085 Santa Clara County, California   B25077_001   829600   4266
##  7 51610 Falls Church city, Virginia      B25077_001   742000  36331
##  8 56039 Teton County, Wyoming            B25077_001   739100  70137
##  9 25007 Dukes County, Massachusetts      B25077_001   674600  29662
## 10 06087 Santa Cruz County, California    B25077_001   659900   9193
## 11 06001 Alameda County, California       B25077_001   649100   3303
## 12 51013 Arlington County, Virginia       B25077_001   643300  10582
## 13 15003 Honolulu County, Hawaii          B25077_001   626400   3236
## 14 36047 Kings County, New York           B25077_001   623900   3713
## 15 06059 Orange County, California        B25077_001   620500   2291
## 16 08097 Pitkin County, Colorado          B25077_001   593600 117537
## 17 15009 Maui County, Hawaii              B25077_001   569100  11979
## 18 06055 Napa County, California          B25077_001   560500   8513
## 19 49043 Summit County, Utah              B25077_001   558300  22597
## 20 08117 Summit County, Colorado          B25077_001   547700  33439

Selecting with tidyverse

In this example you’ll learn how to use the select() tidyverse function to specify which columns to include in a data frame. The select() function can also be used to rename columns, as you’ll see.

# Median home value (B25077_001) for every county, 2017 ACS.
dfHomeValues <- get_acs(
  geography = "county",
  variables = "B25077_001",
  year = 2017,
  geometry = FALSE
)

## Getting data from the 2013-2017 5-year ACS

#notice that all columns are included initially
head(dfHomeValues)

## # A tibble: 6 x 5
##   GEOID NAME                    variable   estimate   moe
##   <chr> <chr>                   <chr>         <dbl> <dbl>
## 1 01001 Autauga County, Alabama B25077_001   143000  6496
## 2 01003 Baldwin County, Alabama B25077_001   182000  3992
## 3 01005 Barbour County, Alabama B25077_001    89300  5605
## 4 01007 Bibb County, Alabama    B25077_001   105500 12056
## 5 01009 Blount County, Alabama  B25077_001   122200  4169
## 6 01011 Bullock County, Alabama B25077_001    66800  9883

# Keep three columns; `new_name = old_name` inside select() renames moe to
# ErrorEst on the way through.
dfHomeValues <- dfHomeValues %>%
  select(NAME, estimate, ErrorEst = moe)
head(dfHomeValues)

## # A tibble: 6 x 3
##   NAME                    estimate ErrorEst
##   <chr>                      <dbl>    <dbl>
## 1 Autauga County, Alabama   143000     6496
## 2 Baldwin County, Alabama   182000     3992
## 3 Barbour County, Alabama    89300     5605
## 4 Bibb County, Alabama      105500    12056
## 5 Blount County, Alabama    122200     4169
## 6 Bullock County, Alabama    66800     9883

Separating columns with tidyverse

The separate() function from the tidyr package can be used to separate one column into multiple columns. This is often useful for splitting Census geographies into multiple columns. For example, you might want to split county and state into separate columns since they are originally stored in a single column.

# County-level B19013_001 estimates, 2017 ACS, no geometry.
dfMedHHInc <- get_acs(
  geography = "county",
  variables = "B19013_001",
  year = 2017,
  geometry = FALSE
)

## Getting data from the 2013-2017 5-year ACS

head(dfMedHHInc)

## # A tibble: 6 x 5
##   GEOID NAME                    variable   estimate   moe
##   <chr> <chr>                   <chr>         <dbl> <dbl>
## 1 01001 Autauga County, Alabama B19013_001    55317  2838
## 2 01003 Baldwin County, Alabama B19013_001    52562  1348
## 3 01005 Barbour County, Alabama B19013_001    33368  2551
## 4 01007 Bibb County, Alabama    B19013_001    43404  3431
## 5 01009 Blount County, Alabama  B19013_001    47412  2630
## 6 01011 Bullock County, Alabama B19013_001    29655  5376

# Split "County, State" at the comma; the state piece keeps its leading space.
dfMedHHInc <- dfMedHHInc %>%
  separate(NAME, into = c("COUNTY", "STATE"), sep = "\\,")
head(dfMedHHInc)

## # A tibble: 6 x 6
##   GEOID COUNTY         STATE      variable   estimate   moe
##   <chr> <chr>          <chr>      <chr>         <dbl> <dbl>
## 1 01001 Autauga County " Alabama" B19013_001    55317  2838
## 2 01003 Baldwin County " Alabama" B19013_001    52562  1348
## 3 01005 Barbour County " Alabama" B19013_001    33368  2551
## 4 01007 Bibb County    " Alabama" B19013_001    43404  3431
## 5 01009 Blount County  " Alabama" B19013_001    47412  2630
## 6 01011 Bullock County " Alabama" B19013_001    29655  5376

Adding or redefining columns with tidyverse

The mutate() function can be used to add new columns to a data frame based on information found in existing columns of the data frame. It can also be used to redefine the values of an existing column.

# County-level B19013_001 estimates, 2017 ACS, no geometry.
dfMedHHInc <- get_acs(
  geography = "county",
  variables = "B19013_001",
  year = 2017,
  geometry = FALSE
)

## Getting data from the 2013-2017 5-year ACS

# Split NAME at the comma into COUNTY and STATE (STATE still has a leading space).
dfMedHHInc <- dfMedHHInc %>%
  separate(NAME, into = c("COUNTY", "STATE"), sep = "\\,")
head(dfMedHHInc)

## # A tibble: 6 x 6
##   GEOID COUNTY         STATE      variable   estimate   moe
##   <chr> <chr>          <chr>      <chr>         <dbl> <dbl>
## 1 01001 Autauga County " Alabama" B19013_001    55317  2838
## 2 01003 Baldwin County " Alabama" B19013_001    52562  1348
## 3 01005 Barbour County " Alabama" B19013_001    33368  2551
## 4 01007 Bibb County    " Alabama" B19013_001    43404  3431
## 5 01009 Blount County  " Alabama" B19013_001    47412  2630
## 6 01011 Bullock County " Alabama" B19013_001    29655  5376

# Overwrite STATE with a whitespace-trimmed copy of itself.
dfMedHHInc <- dfMedHHInc %>%
  mutate(STATE = trimws(STATE))
head(dfMedHHInc)

## # A tibble: 6 x 6
##   GEOID COUNTY         STATE   variable   estimate   moe
##   <chr> <chr>          <chr>   <chr>         <dbl> <dbl>
## 1 01001 Autauga County Alabama B19013_001    55317  2838
## 2 01003 Baldwin County Alabama B19013_001    52562  1348
## 3 01005 Barbour County Alabama B19013_001    33368  2551
## 4 01007 Bibb County    Alabama B19013_001    43404  3431
## 5 01009 Blount County  Alabama B19013_001    47412  2630
## 6 01011 Bullock County Alabama B19013_001    29655  5376

Grouping and summarizing data with tidyverse

In this example you’ll learn how to use the group_by() function to group based on one or more columns. You’ll also learn how to use the summarize() function to produce summary statistics for a data frame.

# County-level B19013_001 estimates, 2017 ACS, no geometry.
dfMedHHInc <- get_acs(
  geography = "county",
  variables = "B19013_001",
  year = 2017,
  geometry = FALSE
)

## Getting data from the 2013-2017 5-year ACS

# Split NAME into COUNTY / STATE, then strip the space left in front of STATE.
dfMedHHInc <- dfMedHHInc %>%
  separate(NAME, into = c("COUNTY", "STATE"), sep = "\\,") %>%
  mutate(STATE = trimws(STATE))
head(dfMedHHInc)

## # A tibble: 6 x 6
##   GEOID COUNTY         STATE   variable   estimate   moe
##   <chr> <chr>          <chr>   <chr>         <dbl> <dbl>
## 1 01001 Autauga County Alabama B19013_001    55317  2838
## 2 01003 Baldwin County Alabama B19013_001    52562  1348
## 3 01005 Barbour County Alabama B19013_001    33368  2551
## 4 01007 Bibb County    Alabama B19013_001    43404  3431
## 5 01009 Blount County  Alabama B19013_001    47412  2630
## 6 01011 Bullock County Alabama B19013_001    29655  5376

# Group counties by state, then average the county estimates within each state.
# The summary column is left unnamed here, so it is called `mean(estimate)`.
dfGroup <- dfMedHHInc %>% group_by(STATE)
dfSum <- dfGroup %>% summarize(mean(estimate))
head(dfSum)

## # A tibble: 6 x 2
##   STATE      `mean(estimate)`
##   <chr>                 <dbl>
## 1 Alabama              40271.
## 2 Alaska               66425.
## 3 Arizona              45817.
## 4 Arkansas             39025.
## 5 California           61047.
## 6 Colorado             54354.

# Rename the summary column to MeanHHIncome and list the highest states first.
dfSum <- dfSum %>%
  select(STATE, MeanHHIncome = `mean(estimate)`) %>%
  arrange(desc(MeanHHIncome))
head(dfSum)

## # A tibble: 6 x 2
##   STATE                MeanHHIncome
##   <chr>                       <dbl>
## 1 District of Columbia       77649 
## 2 New Jersey                 77048.
## 3 Connecticut                74496 
## 4 Maryland                   72541.
## 5 Massachusetts              71031.
## 6 Rhode Island               69906.

Piping

Piping allows you to send the output of one function to another function without creating an intermediate dataset

# The same workflow as one pipeline: fetch, split NAME, trim STATE, average the
# county estimates per state, and sort from highest to lowest.
dfMedHHInc <- get_acs(
  geography = "county",
  variables = "B19013_001",
  year = 2017,
  geometry = FALSE
) %>%
  separate(NAME, into = c("COUNTY", "STATE"), sep = "\\,") %>%
  mutate(STATE = trimws(STATE)) %>%
  group_by(STATE) %>%
  summarize(MeanHHIncome = mean(estimate)) %>%
  arrange(desc(MeanHHIncome))

## Getting data from the 2013-2017 5-year ACS

head(dfMedHHInc)

## # A tibble: 6 x 2
##   STATE                MeanHHIncome
##   <chr>                       <dbl>
## 1 District of Columbia       77649 
## 2 New Jersey                 77048.
## 3 Connecticut                74496 
## 4 Maryland                   72541.
## 5 Massachusetts              71031.
## 6 Rhode Island               69906.

Retrieving Spatial Data with tidycensus

tidycensus can return simple feature geometry along with variables. Set geometry = TRUE in get_acs() or get_decennial(). tidycensus then uses the tigris package to retrieve the geographic dataset from the US Census Bureau and pre-merges it with the tabular data from the Census API.

# geometry = TRUE returns a simple feature collection rather than a plain tibble.
dfMedHHInc <- get_acs(
  geography = "county",
  state = "TX",
  county = c(
    "Kendall", "Comal", "Bexar", "Atascosa",
    "Bandera", "Guadalupe", "Medina", "Wilson"
  ),
  variables = "B19013_001",
  year = 2017,
  geometry = TRUE
)
head(dfMedHHInc)

## Simple feature collection with 6 features and 5 fields
## geometry type:  MULTIPOLYGON
## dimension:      XY
## bbox:           xmin: -99.60332 ymin: 28.61266 xmax: -97.63102 ymax: 30.13895
## epsg (SRID):    4269
## proj4string:    +proj=longlat +datum=NAD83 +no_defs
##   GEOID                    NAME   variable estimate  moe
## 1 48013  Atascosa County, Texas B19013_001    55194 4735
## 2 48019   Bandera County, Texas B19013_001    56413 5522
## 3 48029     Bexar County, Texas B19013_001    53999  449
## 4 48091     Comal County, Texas B19013_001    73655 1751
## 5 48187 Guadalupe County, Texas B19013_001    66187 1709
## 6 48259   Kendall County, Texas B19013_001    81023 5739
##                         geometry
## 1 MULTIPOLYGON (((-98.80479 2...
## 2 MULTIPOLYGON (((-99.60332 2...
## 3 MULTIPOLYGON (((-98.80655 2...
## 4 MULTIPOLYGON (((-98.64612 2...
## 5 MULTIPOLYGON (((-98.31093 2...
## 6 MULTIPOLYGON (((-98.92061 2...

Plotting a map with ggplot2

You can use ggplot to create a map of the output simple feature geometry. However, notice that this is not an interactive map that would allow you to zoom in or out, pan, or click a feature on the map to get more information. For that you will have to use a more advanced package like leaflet.

# Median household income (B19013_001) with geometry for eight Texas counties.
dfMedHHInc <- get_acs(
  geography = "county",
  state = "TX",
  county = c(
    "Kendall", "Comal", "Bexar", "Atascosa",
    "Bandera", "Guadalupe", "Medina", "Wilson"
  ),
  variables = "B19013_001",
  year = 2017,
  geometry = TRUE
)

# Refer to the column by name inside aes(); `dfMedHHInc$estimate` bypasses
# ggplot's data masking and produces an awkward legend title.
# EPSG:26914 is NAD83 / UTM zone 14N, the zone containing these counties;
# the previous 26911 (UTM zone 11N) is meant for the far western US and
# distorts a map of this area.
p <- ggplot(dfMedHHInc, aes(fill = estimate)) +
  geom_sf(color = NA) +
  coord_sf(crs = 26914) +
  scale_fill_viridis_c(option = "magma")
p

Plotting a map with leaflet

The leaflet package also gives you the ability to create maps but with quite a bit of added functionality including dynamic maps that can be zoomed and panned.

counties <- c(
  "Kendall", "Comal", "Bexar", "Atascosa",
  "Bandera", "Guadalupe", "Medina", "Wilson"
)

# get the median household income data (with geometry) for the counties above
dfMedHHInc <- get_acs(
  geography = "county",
  state = "TX",
  county = counties,
  variables = "B19013_001",
  year = 2017,
  geometry = TRUE
)

# wrangle the data: split NAME into county / state, keep only the first word
# of the county part as COUNTY_NAME, trim whitespace, and sort by estimate.
# The geometry is reprojected to WGS84 (EPSG:4326), the datum leaflet expects;
# get_acs() returns NAD83.
dfMedHHInc2 <- dfMedHHInc %>%
  separate(NAME, into = c("COUNTY", "STATE"), sep = "\\,") %>%
  separate(COUNTY, into = c("COUNTY_NAME", "OTHER"), sep = "\\s") %>%
  mutate(
    STATE = trimws(STATE),
    COUNTY_NAME = trimws(COUNTY_NAME)
  ) %>%
  filter(STATE == "Texas", COUNTY_NAME %in% counties) %>%
  arrange(desc(estimate)) %>%
  st_transform(4326)

# create the color palette and assign it to the estimate field with 5 bins
mypal <- colorBin(
  palette = "YlGnBu",
  domain = dfMedHHInc2$estimate,
  bins = 5,
  pretty = TRUE
)

# create the popup info window (one HTML string per county, in row order)
popup <- paste0(
  "Name: ", dfMedHHInc2$COUNTY_NAME, "<br>",
  "Value: ", dfMedHHInc2$estimate
)

# get the bounding box for the data as c(xmin, ymin, xmax, ymax)
bbox <- as.vector(st_bbox(dfMedHHInc2))

# create the map
mymap <- leaflet(options = leafletOptions(zoomSnap = 0)) %>%
  addProviderTiles("CartoDB.Positron") %>%
  fitBounds(bbox[1], bbox[2], bbox[3], bbox[4]) %>%
  # add the data; the formula is evaluated against `data`, so the column is
  # referenced by name
  addPolygons(
    data = dfMedHHInc2,
    fillColor = ~mypal(estimate),
    color = "#b2aeae",
    fillOpacity = 0.7,
    weight = 1,
    smoothFactor = 0.2,
    popup = popup
  ) %>%
  # add the labels
  addStaticLabels(dfMedHHInc2, label = dfMedHHInc2$COUNTY_NAME) %>%
  # add the legend
  addLegend(
    pal = mypal,
    values = dfMedHHInc2$estimate,
    position = "bottomright",
    title = "Household Income"
  )
print(mymap)

Creating a Bar Chart

A bar chart is a chart or graph that presents categorical data with rectangular bars with heights or length proportional to the values that they represent. The bars can be plotted vertically or horizontally. In the example below a bar chart displays the estimate home values for select variable categories for the County of Denver, CO.

# Named vector: the names become the values of the `variable` column in the
# get_acs() result.
vars <- c(
  "$200,000-$249,999" = "B25075_019",
  "$250,000-$299,999" = "B25075_020",
  "$300,000-$399,999" = "B25075_021",
  "$400,000-$499,999" = "B25075_022",
  "$500,000-$749,999" = "B25075_023"
)

# Fetch all Colorado counties, split NAME, trim STATE, and keep Denver County.
dfHomeVals <- get_acs(
  geography = "county",
  state = "CO",
  variables = vars,
  year = 2017,
  geometry = FALSE
) %>%
  separate(NAME, into = c("COUNTY", "STATE"), sep = "\\,") %>%
  mutate(STATE = trimws(STATE)) %>%
  filter(COUNTY == "Denver County")
head(dfHomeVals)

## # A tibble: 5 x 6
##   GEOID COUNTY        STATE    variable          estimate   moe
##   <chr> <chr>         <chr>    <chr>                <dbl> <dbl>
## 1 08031 Denver County Colorado $200,000-$249,999    16938   774
## 2 08031 Denver County Colorado $250,000-$299,999    16108   837
## 3 08031 Denver County Colorado $300,000-$399,999    24868   915
## 4 08031 Denver County Colorado $400,000-$499,999    18159   712
## 5 08031 Denver County Colorado $500,000-$749,999    21600   869

# One bar per value bracket; bar height is the estimate.
ggplot(dfHomeVals, aes(x = variable, y = estimate)) +
  geom_col()

Creating a Histogram

Histograms are used with continuous data and divide data into bins. The bins cover the entire range of the dataset so that each value will fall into one, and only one, bin. Each bin is then mapped to a bar.

# Tract-level table B25077 for Denver County, binned in steps of 10,000.
dfMedHomeVal <- get_acs(
  geography = "tract",
  state = "CO",
  county = "Denver",
  table = "B25077",
  year = 2017,
  geometry = FALSE
)
ggplot(dfMedHomeVal, aes(x = estimate)) +
  geom_histogram(binwidth = 10000)

Creating a Box Plot

Box plots can be used to show the spread of data for a variable. The box itself defines the Interquartile Range or IQR. It’s the range of the data between the 25th and 75th percentile. A larger box indicates a more dispersed dataset, while a smaller box indicates a clustering of data for the variable. The line that runs through the box defines the median of the dataset. The lines extended from the boxes are also known as the whiskers and represent the 0-25% and 75-100% ranges.

# Colorado tracts, table B25077: split NAME into tract / county / state, trim
# the county name, keep the ten listed counties, and group by county.
dfMedHomeVal <- get_acs(
  geography = "tract",
  state = "CO",
  table = "B25077",
  year = 2017,
  geometry = FALSE
) %>%
  separate(NAME, into = c("TRACT", "COUNTY", "STATE"), sep = "\\,") %>%
  mutate(COUNTY = trimws(COUNTY)) %>%
  filter(COUNTY %in% c(
    "Denver County", "Adams County", "Arapahoe County", "Jefferson County",
    "Douglas County", "Boulder County", "Broomfield County", "Gilpin County",
    "Clear Creek County", "Elbert County"
  )) %>%
  group_by(COUNTY)

# One box per county; x-axis labels are rotated so the county names fit.
ggplot(dfMedHomeVal, aes(x = COUNTY, y = estimate)) +
  geom_boxplot() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))

Measuring Covariation of Categorical Data with Symbol Size

The geom_count() function can be used to visualize covariation between categorical values. It counts the number of observations for each combination. For example, the chart below counts census tracts for each combination of county and median home value estimate. The larger the output symbol, the more tracts share that combination.

# Colorado tracts, table B25077, restricted to the ten listed counties.
dfMedHomeVal <- get_acs(
  geography = "tract",
  state = "CO",
  table = "B25077",
  year = 2017,
  geometry = FALSE
) %>%
  separate(NAME, into = c("TRACT", "COUNTY", "STATE"), sep = "\\,") %>%
  mutate(COUNTY = trimws(COUNTY)) %>%
  filter(COUNTY %in% c(
    "Denver County", "Adams County", "Arapahoe County", "Jefferson County",
    "Douglas County", "Boulder County", "Broomfield County", "Gilpin County",
    "Clear Creek County", "Elbert County"
  )) %>%
  group_by(COUNTY)

# Point size reflects how many rows share the same (COUNTY, estimate) pair.
ggplot(dfMedHomeVal, aes(x = COUNTY, y = estimate)) +
  geom_count() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))

Measuring Covariation – Two Continuous Variables

Covariation between two continuous variables can be measured using either the geom_bin2d() or geom_hex() functions. These functions can produce some interesting visualizations that help you spot patterns in your data.

# All Colorado tracts, table B25077; NAME is split but COUNTY is not trimmed
# or filtered in this example.
dfMedHomeVal <- get_acs(
  geography = "tract",
  state = "CO",
  table = "B25077",
  year = 2017,
  geometry = FALSE
) %>%
  separate(NAME, into = c("TRACT", "COUNTY", "STATE"), sep = "\\,")

ggplot(dfMedHomeVal, aes(x = COUNTY, y = estimate)) +
  geom_bin2d() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))

Mapping Variables to Aesthetics

Variables can be color coded by using the color property for the aes() function. The color property is set to one of the columns in the data frame, and each variable category is assigned a unique color.

# ACS years requested by the multi-year examples below.
year_range <- 2012:2016

# Geography levels pulled for each year.
geos_inc <- c(
  "us",
  "metropolitan statistical area/micropolitan statistical area",
  "state"
)

# GEOID values kept when filtering the profile data for plotting.
nm_us_metros <- c(1, 35, 10740, 22140, 29740, 42140)

# Regular expression matched against NAME to keep NM and national rows.
nm_us_w_micros <- "NM|New Mexico|United States"
#' Fetch the same ACS variables for several years and geographies
#'
#' Calls `tidycensus::get_acs()` once per (year, geography) pair with
#' `output = "tidy"`, row-binds the results, and adds a `year` column holding
#' the year that was requested.
#'
#' @param variables Variable IDs, passed straight to `get_acs()`.
#' @param geography Vector of geography levels; each one is requested
#'   separately.
#' @param year Vector of years; each one is requested separately. An empty
#'   vector yields an empty result instead of an error.
#' @param summary_var Optional summary variable, passed to `get_acs()`.
#' @return A single data frame with the rows from every request plus `year`.
get_historic_acs <- function(variables,
                             geography,
                             year,
                             summary_var = NULL) {
  # Iterate over the years directly: no 1:length() indexing (which misbehaves
  # on empty input) and no list grown element by element.
  per_year <- lapply(year, function(yr) {
    lapply(geography, function(geo) {
      tidycensus::get_acs(
        geography = geo,
        variables = variables,
        summary_var = summary_var,
        output = "tidy",
        year = yr
      )
    }) %>%
      bind_rows() %>%
      mutate(year = yr)
  })

  bind_rows(per_year)
}

# Data Profile variable IDs to chart; `label` below is parallel to this vector
# (same order, same length).
variable <- c(
  "DP02_0011P",
  "DP02_0067P",
  "DP02_0069P",
  "DP02_0090P",
  "DP02_0092P",
  "DP02_0111P",
  "DP03_0005P",
  "DP03_0021P",
  "DP03_0028P",
  "DP03_0088",
  "DP03_0096P",
  "DP03_0128P"
)

# Display labels used as facet titles.
label <- c(
  "%Householders living alone",
  "%Bachelor's degree or higher",
  "%Civilian veterans",
  "%Born different state",
  "%Foreign born",
  "%Speak English only @ home",
  "%Civilian LF - Unemployed",
  "%Public trans to work",
  "%Service occupations",
  "$Per capita income",
  "%Health insurance",
  "%Below FPL - All people"
)

# Lookup table from variable ID to label. Built with data.frame() and
# stringsAsFactors = FALSE so both columns are character on every R version
# (as.data.frame(cbind(...)) produces factors before R 4.0), matching how
# ed_table is built later in the file.
dp_table <- data.frame(variable, label, stringsAsFactors = FALSE)

# One request per year in year_range and per geography in geos_inc.
dp_data <- get_historic_acs(
  variables = variable,
  geography = geos_inc,
  year = year_range
)

# Filter data set: keep the selected GEOIDs and attach the display labels.
# GEOID is character, so the numeric ID list is converted explicitly, and the
# join key is named so it cannot silently change if the tables gain columns.
dp_data %>%
  filter(GEOID %in% as.character(nm_us_metros)) %>%
  left_join(dp_table, by = "variable") %>%

  # Build viz: one line per geography with error bars of estimate +/- moe,
  # one facet per label, each with its own y scale.
  ggplot(aes(
    x = year, y = estimate, color = NAME,
    ymin = estimate - moe, ymax = estimate + moe
  )) +
  geom_line(size = .95) +
  geom_errorbar(width = 0.1) +
  scale_colour_stata() +
  theme_fivethirtyeight() +
  theme(
    legend.position = "bottom",
    legend.title = element_blank(),
    plot.title = element_text(size = 14)
  ) +
  ylab("") + xlab("") +
  facet_wrap(~label, scales = "free_y", ncol = 3) +
  labs(
    title = "Socio-economic profiles",
    subtitle = "NM & USA, 2012-2016"
  )

Basic Scatterplot with Labels

Scatterplots are created using the geom_point() function. To create the individual points on the plot you’ll need to pass in x and y parameters to the aes() function. The values passed to the x and y parameters should be columns from your data frame. This example illustrates the creation of what is called a Cleveland Dot Plot, which is basically a scatterplot. The chart will be an ordered scatterplot of median home values by state for ACS 2017.

# State-level table B25077, 2017 ACS.
dfMedHomeVal <- get_acs(
  geography = "state",
  table = "B25077",
  year = 2017,
  geometry = FALSE
)

# States are ordered on the y axis by their estimate; each point is labelled
# with its value and the x axis is formatted as dollars.
ggplot(dfMedHomeVal, aes(x = estimate, y = reorder(NAME, estimate))) +
  geom_point(color = "navy", size = 2) +
  geom_text_repel(aes(label = estimate), size = 3) +
  scale_x_continuous(labels = scales::dollar) +
  theme_minimal(base_size = 14) +
  labs(
    x = "2017 ACS estimate",
    y = "",
    title = "Median Home Value by State",
    caption = "2017 ACS - 5 Year"
  )

Faceting

Facet plots are an interesting visualization that allows you to create comparison plots for a variable. For example, the facet plot below displays educational attainment by year for each geography in the study area. There are two functions in ggplot2 that enable you to create facet plots. facet_wrap() creates a subset at the level of a single grouping variable, as in the plot below, where NAME is the single grouping variable. facet_grid() subsets on the crossing of two grouping variables. These are great plots for comparison of one or more variables.

# Education variable IDs; ed_labels and ed_level below are parallel to this
# vector.
variable <- c(
  "DP02_0059P", "DP02_0060P", "DP02_0061P", "DP02_0062P",
  "DP02_0063P", "DP02_0064P", "DP02_0065P"
)

ed_labels <- c(
  "Less than 9th Grade",
  "9th to 12th grade, no diploma",
  "High school graduate",
  "Some college, no degree",
  "Associate's degree",
  "Bachelor's degree",
  "Grad/pro degree"
)

# Ordering index for the labels (1 = lowest attainment listed).
ed_level <- 1:7

# Lookup table with all three columns stored as character.
ed_table <- data.frame(
  variable = variable,
  ed_level = as.character(ed_level),
  ed_labels = ed_labels,
  stringsAsFactors = FALSE
)

ed_data <- get_historic_acs(
  variables = variable,
  geography = geos_inc,
  year = year_range
)

# Attach labels and ordering to each row, keep the rows whose NAME matches the
# nm_us_w_micros pattern, and give GEOID 21580 a plain-ASCII display name.
# The join key is named explicitly, and GEOID (a character column) is compared
# against a string rather than a number.
ed_data %>%
  left_join(ed_table, by = "variable") %>%
  mutate(ed_level = as.numeric(ed_level)) %>%
  filter(grepl(nm_us_w_micros, NAME)) %>%
  mutate(NAME = ifelse(GEOID == "21580", "Espanola, NM Micro Area", NAME)) %>%

  # Build viz: horizontal stacked bars per year, one facet per geography,
  # with fill segments ordered by ed_level.
  ggplot(aes(
    x = year,
    y = estimate,
    fill = reorder(ed_labels, -ed_level)
  )) +
  geom_col(color = "gray", width = .8) +
  scale_fill_brewer(palette = "BrBG") +
  theme_fivethirtyeight() +
  coord_flip() +
  facet_wrap(~NAME, ncol = 3) +
  theme(
    legend.position = "bottom",
    legend.title = element_blank(),
    plot.title = element_text(size = 14)
  ) +
  labs(
    title = "Educational attainment profiles",
    subtitle = "NM & USA, 2012-2016"
  )

Violin Plot

Violin plots are similar to box plots except that they also show the density of data at different values. Thicker areas on the plot indicate a higher probability at those values, while thinner areas indicate a lower probability. It’s common to also include a box plot inside the violin plot to get a sense of the median of the dataset along with the range of data.

# Named vector: the names become the values of the `variable` column.
vars <- c(
  "$200,000-$249,999" = "B25075_019",
  "$250,000-$299,999" = "B25075_020",
  "$300,000-$399,999" = "B25075_021",
  "$400,000-$499,999" = "B25075_022",
  "$500,000-$749,999" = "B25075_023"
)

# Colorado tracts: split NAME into tract / county / state, trim only COUNTY
# (STATE is deliberately left untrimmed), and keep the ten listed counties.
dfHomeVals <- get_acs(
  geography = "tract",
  state = "CO",
  variables = vars,
  year = 2017,
  geometry = FALSE
) %>%
  separate(NAME, into = c("TRACT", "COUNTY", "STATE"), sep = "\\,") %>%
  mutate(COUNTY = trimws(COUNTY)) %>%
  filter(COUNTY %in% c(
    "Denver County", "Adams County", "Arapahoe County", "Jefferson County",
    "Douglas County", "Boulder County", "Broomfield County", "Gilpin County",
    "Clear Creek County", "Elbert County"
  ))
head(dfHomeVals)

## # A tibble: 6 x 7
##   GEOID     TRACT          COUNTY     STATE    variable      estimate   moe
##   <chr>     <chr>          <chr>      <chr>    <chr>            <dbl> <dbl>
## 1 08001007… Census Tract … Adams Cou… " Color… $200,000-$24…       34    28
## 2 08001007… Census Tract … Adams Cou… " Color… $250,000-$29…        0    12
## 3 08001007… Census Tract … Adams Cou… " Color… $300,000-$39…        8    12
## 4 08001007… Census Tract … Adams Cou… " Color… $400,000-$49…        0    12
## 5 08001007… Census Tract … Adams Cou… " Color… $500,000-$74…        8    14
## 6 08001007… Census Tract … Adams Cou… " Color… $200,000-$24…        0    12

# One violin per county showing the distribution of the estimates.
ggplot(dfHomeVals, aes(x = COUNTY, y = estimate)) +
  geom_violin()

Summary Statistics

R also includes a number of individual functions that can be used to generate specific summary statistics. These functions include mean(), median(), var(), sd(), min(), max(), quantile(), and others. You can use these functions individually or you can calculate multiple summary statistics at once with the summary() function. The summary() function runs summary statistics for numeric columns in a data frame.

# Colorado counties, table B19013, with geometry; summary() reports the
# five-number summary plus the mean of the estimate column.
dfMedHHInc <- get_acs(
  geography = "county",
  state = "CO",
  table = "B19013",
  year = 2017,
  geometry = TRUE
)
summary(dfMedHHInc$estimate)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   29000   42462   51457   54354   65357  111154

mean(dfMedHHInc$estimate)

## [1] 54353.56

sd(dfMedHHInc$estimate)

## [1] 15978.04

Census Examples with R and tidycensus

Eric Pimpler

6/21/2019