library(tidycensus)
library(tidyverse)
library(leaflet)
library(tigris)
library(sf)
library(viridis)
library(mapview)
library(ggrepel)
library(ggthemes)
# Credentials must not live in source control: read the Census API key from
# the CENSUS_API_KEY environment variable (e.g. set it in ~/.Renviron).
census_key <- Sys.getenv("CENSUS_API_KEY")
if (!nzchar(census_key)) {
  stop(
    "CENSUS_API_KEY is not set; request a key from the Census Bureau and ",
    "export it before running this script.",
    call. = FALSE
  )
}
census_api_key(census_key)
# Cache shapefiles downloaded through tigris between calls.
options(tigris_use_cache = TRUE)
This simple example shows how to retrieve ACS data for a specific table, by state, with ‘county’ level geography for the year 2017.
# Table B25064 for every Texas county, without geometry.
df2017 <- get_acs(
  geography = "county",
  table = "B25064",
  state = "TX",
  geometry = FALSE,
  year = 2017
)
## Getting data from the 2013-2017 5-year ACS
nrow(df2017)
## [1] 254
head(df2017)
## # A tibble: 6 x 5
## GEOID NAME variable estimate moe
## <chr> <chr> <chr> <dbl> <dbl>
## 1 48001 Anderson County, Texas B25064_001 748 35
## 2 48003 Andrews County, Texas B25064_001 997 96
## 3 48005 Angelina County, Texas B25064_001 774 17
## 4 48007 Aransas County, Texas B25064_001 822 116
## 5 48009 Archer County, Texas B25064_001 621 45
## 6 48011 Armstrong County, Texas B25064_001 732 70
This example shows how to retrieve Census tracts. Here a county is supplied so that only the tracts in that county are returned; later examples omit it to retrieve the tracts for a whole state.
# Table B25064 at tract level, limited to Travis County, TX.
dfTractsTravis <- get_acs(
  geography = "tract",
  table = "B25064",
  state = "TX",
  county = "Travis",
  geometry = FALSE,
  year = 2017
)
## Getting data from the 2013-2017 5-year ACS
nrow(dfTractsTravis)
## [1] 218
head(dfTractsTravis)
## # A tibble: 6 x 5
## GEOID NAME variable estimate moe
## <chr> <chr> <chr> <dbl> <dbl>
## 1 48453000101 Census Tract 1.01, Travis County, T… B25064_0… 1591 165
## 2 48453000102 Census Tract 1.02, Travis County, T… B25064_0… 2975 2228
## 3 48453000203 Census Tract 2.03, Travis County, T… B25064_0… 1723 58
## 4 48453000204 Census Tract 2.04, Travis County, T… B25064_0… 1020 76
## 5 48453000205 Census Tract 2.05, Travis County, T… B25064_0… 1136 95
## 6 48453000206 Census Tract 2.06, Travis County, T… B25064_0… 1262 79
This example shows how to retrieve Census tracts for several counties at once. Notice that you can provide multiple counties by passing in a vector of county names.
# Counties passed together as one vector to a single tract-level request.
counties <- c("Travis", "Williamson", "Hays", "Bastrop", "Caldwell")
dfTractsAustinMetro <- get_acs(
  geography = "tract",
  table = "B25064",
  state = "TX",
  county = counties,
  geometry = FALSE,
  year = 2017
)
## Getting data from the 2013-2017 5-year ACS
nrow(dfTractsAustinMetro)
## [1] 350
head(dfTractsAustinMetro)
## # A tibble: 6 x 5
## GEOID NAME variable estimate moe
## <chr> <chr> <chr> <dbl> <dbl>
## 1 480219501… Census Tract 9501, Bastrop County, T… B25064_0… 921 57
## 2 480219502… Census Tract 9502, Bastrop County, T… B25064_0… 937 170
## 3 480219503… Census Tract 9503, Bastrop County, T… B25064_0… 1058 138
## 4 480219504… Census Tract 9504, Bastrop County, T… B25064_0… 881 84
## 5 480219505… Census Tract 9505.01, Bastrop County… B25064_0… 631 450
## 6 480219505… Census Tract 9505.02, Bastrop County… B25064_0… 818 213
This example shows how to cache a table for faster future access. The cache_table parameter controls this and is set to FALSE by default.
# Same Travis County request as before, now with cache_table = TRUE.
dfTractsTravis <- get_acs(
  geography = "tract",
  table = "B25064",
  state = "TX",
  county = "Travis",
  geometry = FALSE,
  year = 2017,
  cache_table = TRUE
)
## Getting data from the 2013-2017 5-year ACS
## Loading ACS5 variables for 2017 from table B25064 and caching the dataset for faster future access.
nrow(dfTractsTravis)
## [1] 218
head(dfTractsTravis)
## # A tibble: 6 x 5
## GEOID NAME variable estimate moe
## <chr> <chr> <chr> <dbl> <dbl>
## 1 48453000101 Census Tract 1.01, Travis County, T… B25064_0… 1591 165
## 2 48453000102 Census Tract 1.02, Travis County, T… B25064_0… 2975 2228
## 3 48453000203 Census Tract 2.03, Travis County, T… B25064_0… 1723 58
## 4 48453000204 Census Tract 2.04, Travis County, T… B25064_0… 1020 76
## 5 48453000205 Census Tract 2.05, Travis County, T… B25064_0… 1136 95
## 6 48453000206 Census Tract 2.06, Travis County, T… B25064_0… 1262 79
You can specify particular variables from a table rather than returning the entire table. Table B25068 from ACS contains Bedrooms by Gross Rent. This includes no bedroom, 1BR, 2BR, and 3+BR. But what if you only want the data for 3+BR and not all the other information? The variables parameter can be used. It accepts a single character string or a vector of them. In this example we’ll return the information for 3+BR, which is contained within the 031-036 variables. We also specify a summary variable here using the 030 variable.
# Variables B25068_031 through B25068_036, requested individually.
cats <- paste0("B25068_0", 31:36)
counties <- c(
  "Kendall", "Comal", "Bexar", "Atascosa",
  "Bandera", "Guadalupe", "Medina", "Wilson"
)
# B25068_030 is attached to every row as the summary variable.
df <- get_acs(
  geography = "county",
  state = "TX",
  county = counties,
  variables = cats,
  summary_var = "B25068_030",
  geometry = FALSE,
  year = 2017,
  cache_table = TRUE
)
## Getting data from the 2013-2017 5-year ACS
head(df)
## # A tibble: 6 x 7
## GEOID NAME variable estimate moe summary_est summary_moe
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 48013 Atascosa County, … B25068_0… 6 9 1352 312
## 2 48013 Atascosa County, … B25068_0… 7 12 1352 312
## 3 48013 Atascosa County, … B25068_0… 328 141 1352 312
## 4 48013 Atascosa County, … B25068_0… 441 167 1352 312
## 5 48013 Atascosa County, … B25068_0… 433 166 1352 312
## 6 48013 Atascosa County, … B25068_0… 137 101 1352 312
The get_decennial() function can be used to return decennial census information from the years 1990, 2000, and 2010.
# Four 2010 decennial variables for Illinois counties, with geometry;
# P001001 is attached as the summary variable used for the percentage.
vars10 <- c("P005003", "P005004", "P005006", "P004003")
il <- get_decennial(
  geography = "county",
  variables = vars10,
  year = 2010,
  summary_var = "P001001",
  state = "IL",
  geometry = TRUE
) %>%
  mutate(pct = 100 * (value / summary_value))
# One map panel per variable, filled and outlined by the percentage.
ggplot(il, aes(fill = pct, color = pct)) +
  geom_sf() +
  facet_wrap(~variable)
This example will show you how to create a subset of records with the filter() dplyr function. We’ll filter estimated median home values for 2017 from the ACS to include only records where the estimated median home value is at least 200,000.
# Median home value (B25077_001) for counties; no state is given, so the
# request is not restricted to one state.
dfHomeValues <- get_acs(
  geography = "county",
  variables = "B25077_001",
  geometry = FALSE,
  year = 2017
)
## Getting data from the 2013-2017 5-year ACS
nrow(dfHomeValues)
## [1] 3220
# Keep only counties whose median value is at least 200,000 (note the `>=`).
dfHomeValues <- dfHomeValues %>%
  filter(estimate >= 200000)
nrow(dfHomeValues)
## [1] 456
head(dfHomeValues, 20)
## # A tibble: 20 x 5
## GEOID NAME variable estimate moe
## <chr> <chr> <chr> <dbl> <dbl>
## 1 02016 Aleutians West Census Area, Alaska B25077_001 238800 48045
## 2 02020 Anchorage Municipality, Alaska B25077_001 304500 5427
## 3 02068 Denali Borough, Alaska B25077_001 224200 32280
## 4 02090 Fairbanks North Star Borough, Alaska B25077_001 230600 6832
## 5 02100 Haines Borough, Alaska B25077_001 243100 35366
## 6 02105 Hoonah-Angoon Census Area, Alaska B25077_001 226800 17022
## 7 02110 Juneau City and Borough, Alaska B25077_001 343100 8120
## 8 02122 Kenai Peninsula Borough, Alaska B25077_001 234600 6420
## 9 02130 Ketchikan Gateway Borough, Alaska B25077_001 265700 7682
## 10 02150 Kodiak Island Borough, Alaska B25077_001 267700 14317
## 11 02170 Matanuska-Susitna Borough, Alaska B25077_001 235600 2432
## 12 02195 Petersburg Borough, Alaska B25077_001 205000 16068
## 13 02220 Sitka City and Borough, Alaska B25077_001 350900 12649
## 14 02230 Skagway Municipality, Alaska B25077_001 293800 22308
## 15 02261 Valdez-Cordova Census Area, Alaska B25077_001 252200 24296
## 16 04005 Coconino County, Arizona B25077_001 241400 9243
## 17 04013 Maricopa County, Arizona B25077_001 225000 888
## 18 04025 Yavapai County, Arizona B25077_001 215000 4772
## 19 06001 Alameda County, California B25077_001 649100 3303
## 20 06003 Alpine County, California B25077_001 343800 37678
In this example you’ll learn how to use the arrange() tidyverse function to order the rows in a data frame using a variable (column). Rows can be sorted in ascending or descending order. Here we’ll arrange the median home values for 2017 from ACS in descending order.
# Median home value (B25077_001) by county from the 2017 ACS.
dfHomeValues <- get_acs(
  geography = "county",
  variables = "B25077_001",
  geometry = FALSE,
  year = 2017
)
## Getting data from the 2013-2017 5-year ACS
nrow(dfHomeValues)
## [1] 3220
# Keep counties with a median value of at least 200,000, then sort so the
# highest estimates come first.
dfHomeValues <- dfHomeValues %>%
  filter(estimate >= 200000) %>%
  arrange(desc(estimate))
head(dfHomeValues, 20)
## # A tibble: 20 x 5
## GEOID NAME variable estimate moe
## <chr> <chr> <chr> <dbl> <dbl>
## 1 25019 Nantucket County, Massachusetts B25077_001 995900 52007
## 2 06075 San Francisco County, California B25077_001 927400 7229
## 3 06081 San Mateo County, California B25077_001 917700 6273
## 4 36061 New York County, New York B25077_001 915300 14939
## 5 06041 Marin County, California B25077_001 908800 8635
## 6 06085 Santa Clara County, California B25077_001 829600 4266
## 7 51610 Falls Church city, Virginia B25077_001 742000 36331
## 8 56039 Teton County, Wyoming B25077_001 739100 70137
## 9 25007 Dukes County, Massachusetts B25077_001 674600 29662
## 10 06087 Santa Cruz County, California B25077_001 659900 9193
## 11 06001 Alameda County, California B25077_001 649100 3303
## 12 51013 Arlington County, Virginia B25077_001 643300 10582
## 13 15003 Honolulu County, Hawaii B25077_001 626400 3236
## 14 36047 Kings County, New York B25077_001 623900 3713
## 15 06059 Orange County, California B25077_001 620500 2291
## 16 08097 Pitkin County, Colorado B25077_001 593600 117537
## 17 15009 Maui County, Hawaii B25077_001 569100 11979
## 18 06055 Napa County, California B25077_001 560500 8513
## 19 49043 Summit County, Utah B25077_001 558300 22597
## 20 08117 Summit County, Colorado B25077_001 547700 33439
In this example you’ll learn how to use the select() tidyverse function to specify which columns to include in a data frame. The select() function can also be used to rename columns, as you’ll see.
dfHomeValues <- get_acs(
  geography = "county",
  variables = "B25077_001",
  geometry = FALSE,
  year = 2017
)
## Getting data from the 2013-2017 5-year ACS
# All columns are present before select() is applied.
head(dfHomeValues)
## # A tibble: 6 x 5
## GEOID NAME variable estimate moe
## <chr> <chr> <chr> <dbl> <dbl>
## 1 01001 Autauga County, Alabama B25077_001 143000 6496
## 2 01003 Baldwin County, Alabama B25077_001 182000 3992
## 3 01005 Barbour County, Alabama B25077_001 89300 5605
## 4 01007 Bibb County, Alabama B25077_001 105500 12056
## 5 01009 Blount County, Alabama B25077_001 122200 4169
## 6 01011 Bullock County, Alabama B25077_001 66800 9883
# Keep NAME and estimate, and carry moe forward under the name ErrorEst.
dfHomeValues <- dfHomeValues %>%
  select(NAME, estimate, ErrorEst = moe)
head(dfHomeValues)
## # A tibble: 6 x 3
## NAME estimate ErrorEst
## <chr> <dbl> <dbl>
## 1 Autauga County, Alabama 143000 6496
## 2 Baldwin County, Alabama 182000 3992
## 3 Barbour County, Alabama 89300 5605
## 4 Bibb County, Alabama 105500 12056
## 5 Blount County, Alabama 122200 4169
## 6 Bullock County, Alabama 66800 9883
The separate() function from the tidyr package can be used to separate one column into multiple columns. This is often useful for splitting Census geographies into multiple columns. For example, you might want to split county and state into separate columns since they are originally stored in a single column.
# Variable B19013_001 by county; NAME holds "<county>, <state>".
dfMedHHInc <- get_acs(
  geography = "county",
  variables = "B19013_001",
  geometry = FALSE,
  year = 2017
)
## Getting data from the 2013-2017 5-year ACS
head(dfMedHHInc)
## # A tibble: 6 x 5
## GEOID NAME variable estimate moe
## <chr> <chr> <chr> <dbl> <dbl>
## 1 01001 Autauga County, Alabama B19013_001 55317 2838
## 2 01003 Baldwin County, Alabama B19013_001 52562 1348
## 3 01005 Barbour County, Alabama B19013_001 33368 2551
## 4 01007 Bibb County, Alabama B19013_001 43404 3431
## 5 01009 Blount County, Alabama B19013_001 47412 2630
## 6 01011 Bullock County, Alabama B19013_001 29655 5376
# Split NAME at the comma; STATE keeps its leading space at this point.
dfMedHHInc <- dfMedHHInc %>%
  separate(NAME, into = c("COUNTY", "STATE"), sep = "\\,")
head(dfMedHHInc)
## # A tibble: 6 x 6
## GEOID COUNTY STATE variable estimate moe
## <chr> <chr> <chr> <chr> <dbl> <dbl>
## 1 01001 Autauga County " Alabama" B19013_001 55317 2838
## 2 01003 Baldwin County " Alabama" B19013_001 52562 1348
## 3 01005 Barbour County " Alabama" B19013_001 33368 2551
## 4 01007 Bibb County " Alabama" B19013_001 43404 3431
## 5 01009 Blount County " Alabama" B19013_001 47412 2630
## 6 01011 Bullock County " Alabama" B19013_001 29655 5376
The mutate() function can be used to add new columns to a data frame based on information found in existing columns of the data frame. It can also be used to redefine the values of an existing column.
# Fetch B19013_001 by county and split NAME into COUNTY and STATE.
dfMedHHInc <- get_acs(
  geography = "county",
  variables = "B19013_001",
  geometry = FALSE,
  year = 2017
)
## Getting data from the 2013-2017 5-year ACS
dfMedHHInc <- dfMedHHInc %>%
  separate(NAME, into = c("COUNTY", "STATE"), sep = "\\,")
head(dfMedHHInc)
## # A tibble: 6 x 6
## GEOID COUNTY STATE variable estimate moe
## <chr> <chr> <chr> <chr> <dbl> <dbl>
## 1 01001 Autauga County " Alabama" B19013_001 55317 2838
## 2 01003 Baldwin County " Alabama" B19013_001 52562 1348
## 3 01005 Barbour County " Alabama" B19013_001 33368 2551
## 4 01007 Bibb County " Alabama" B19013_001 43404 3431
## 5 01009 Blount County " Alabama" B19013_001 47412 2630
## 6 01011 Bullock County " Alabama" B19013_001 29655 5376
# Overwrite STATE with a copy stripped of surrounding whitespace.
dfMedHHInc <- dfMedHHInc %>%
  mutate(STATE = trimws(STATE))
head(dfMedHHInc)
## # A tibble: 6 x 6
## GEOID COUNTY STATE variable estimate moe
## <chr> <chr> <chr> <chr> <dbl> <dbl>
## 1 01001 Autauga County Alabama B19013_001 55317 2838
## 2 01003 Baldwin County Alabama B19013_001 52562 1348
## 3 01005 Barbour County Alabama B19013_001 33368 2551
## 4 01007 Bibb County Alabama B19013_001 43404 3431
## 5 01009 Blount County Alabama B19013_001 47412 2630
## 6 01011 Bullock County Alabama B19013_001 29655 5376
In this example you’ll learn how to use the group_by() function to group based on one or more columns. You’ll also learn how to use the summarize() function to produce summary statistics for a data frame.
# County-level B19013_001 with NAME split into COUNTY and a trimmed STATE.
dfMedHHInc <- get_acs(
  geography = "county",
  variables = "B19013_001",
  geometry = FALSE,
  year = 2017
)
## Getting data from the 2013-2017 5-year ACS
dfMedHHInc <- dfMedHHInc %>%
  separate(NAME, into = c("COUNTY", "STATE"), sep = "\\,")
dfMedHHInc <- dfMedHHInc %>%
  mutate(STATE = trimws(STATE))
head(dfMedHHInc)
## # A tibble: 6 x 6
## GEOID COUNTY STATE variable estimate moe
## <chr> <chr> <chr> <chr> <dbl> <dbl>
## 1 01001 Autauga County Alabama B19013_001 55317 2838
## 2 01003 Baldwin County Alabama B19013_001 52562 1348
## 3 01005 Barbour County Alabama B19013_001 33368 2551
## 4 01007 Bibb County Alabama B19013_001 43404 3431
## 5 01009 Blount County Alabama B19013_001 47412 2630
## 6 01011 Bullock County Alabama B19013_001 29655 5376
# Group the county rows by state, then average the estimates per group.
# The summary column is left unnamed, so it is called `mean(estimate)`.
dfGroup <- dfMedHHInc %>%
  group_by(STATE)
dfSum <- dfGroup %>%
  summarize(mean(estimate))
head(dfSum)
## # A tibble: 6 x 2
## STATE `mean(estimate)`
## <chr> <dbl>
## 1 Alabama 40271.
## 2 Alaska 66425.
## 3 Arizona 45817.
## 4 Arkansas 39025.
## 5 California 61047.
## 6 Colorado 54354.
# Rename the summary column to MeanHHIncome and sort it in descending order.
dfSum <- dfSum %>%
  select(STATE, MeanHHIncome = `mean(estimate)`) %>%
  arrange(desc(MeanHHIncome))
head(dfSum)
## # A tibble: 6 x 2
## STATE MeanHHIncome
## <chr> <dbl>
## 1 District of Columbia 77649
## 2 New Jersey 77048.
## 3 Connecticut 74496
## 4 Maryland 72541.
## 5 Massachusetts 71031.
## 6 Rhode Island 69906.
Piping allows you to send the output of one function to another function without creating an intermediate dataset.
# The whole fetch -> split -> trim -> group -> average -> sort sequence as a
# single pipeline; the summary column is named where it is computed.
dfMedHHInc <- get_acs(
  geography = "county",
  variables = "B19013_001",
  geometry = FALSE,
  year = 2017
) %>%
  separate(NAME, into = c("COUNTY", "STATE"), sep = "\\,") %>%
  mutate(STATE = trimws(STATE)) %>%
  group_by(STATE) %>%
  summarize(MeanHHIncome = mean(estimate)) %>%
  arrange(desc(MeanHHIncome))
## Getting data from the 2013-2017 5-year ACS
head(dfMedHHInc)
## # A tibble: 6 x 2
## STATE MeanHHIncome
## <chr> <dbl>
## 1 District of Columbia 77649
## 2 New Jersey 77048.
## 3 Connecticut 74496
## 4 Maryland 72541.
## 5 Massachusetts 71031.
## 6 Rhode Island 69906.
tidycensus can return simple feature geometry along with variables. Set geometry = TRUE in get_acs() or get_decennial(). tidycensus uses the tigris package to retrieve the geographic dataset from the US Census Bureau and pre-merges it with the tabular data from the Census API.
# geometry = TRUE makes get_acs() return the rows with a geometry column.
dfMedHHInc <- get_acs(
  geography = "county",
  variables = "B19013_001",
  geometry = TRUE,
  year = 2017,
  state = "TX",
  county = c(
    "Kendall", "Comal", "Bexar", "Atascosa",
    "Bandera", "Guadalupe", "Medina", "Wilson"
  )
)
head(dfMedHHInc)
## Simple feature collection with 6 features and 5 fields
## geometry type: MULTIPOLYGON
## dimension: XY
## bbox: xmin: -99.60332 ymin: 28.61266 xmax: -97.63102 ymax: 30.13895
## epsg (SRID): 4269
## proj4string: +proj=longlat +datum=NAD83 +no_defs
## GEOID NAME variable estimate moe
## 1 48013 Atascosa County, Texas B19013_001 55194 4735
## 2 48019 Bandera County, Texas B19013_001 56413 5522
## 3 48029 Bexar County, Texas B19013_001 53999 449
## 4 48091 Comal County, Texas B19013_001 73655 1751
## 5 48187 Guadalupe County, Texas B19013_001 66187 1709
## 6 48259 Kendall County, Texas B19013_001 81023 5739
## geometry
## 1 MULTIPOLYGON (((-98.80479 2...
## 2 MULTIPOLYGON (((-99.60332 2...
## 3 MULTIPOLYGON (((-98.80655 2...
## 4 MULTIPOLYGON (((-98.64612 2...
## 5 MULTIPOLYGON (((-98.31093 2...
## 6 MULTIPOLYGON (((-98.92061 2...
You can use ggplot to create a map of the output simple feature geometry. However, notice that this is not an interactive map that would allow you to zoom in or out, pan, or click a feature on the map to get more information. For that you will have to use a more advanced package like leaflet.
# B19013_001 with county geometry for eight Texas counties.
dfMedHHInc <- get_acs(
  geography = "county",
  variables = "B19013_001",
  geometry = TRUE,
  year = 2017,
  state = "TX",
  county = c(
    "Kendall", "Comal", "Bexar", "Atascosa",
    "Bandera", "Guadalupe", "Medina", "Wilson"
  )
)
# Map the column by bare name: aes() looks it up in the plot data, and the
# legend is titled "estimate" rather than "dfMedHHInc$estimate".
# EPSG:26914 (NAD83 / UTM zone 14N) is the UTM zone covering these counties;
# the previous 26911 (zone 11N) lies far to the west and skews the map.
p <- ggplot(dfMedHHInc, aes(fill = estimate)) +
  geom_sf(color = NA) +
  coord_sf(crs = 26914) +
  scale_fill_viridis_c(option = "magma")
p
The leaflet package also gives you the ability to create maps but with quite a bit of added functionality including dynamic maps that can be zoomed and panned.
# Counties to map; reused for both the API request and the filter below so
# the two lists cannot drift apart.
counties <- c(
  "Kendall", "Comal", "Bexar", "Atascosa",
  "Bandera", "Guadalupe", "Medina", "Wilson"
)
# get the median household income data
dfMedHHInc <- get_acs(
  geography = "county",
  variables = "B19013_001",
  geometry = TRUE,
  year = 2017,
  state = "TX",
  county = counties
)
# wrangle the data: split "<county> County, <state>" into its parts, trim
# whitespace, keep the requested counties, and sort by estimate (descending)
dfMedHHInc2 <- dfMedHHInc %>%
  separate(NAME, into = c("COUNTY", "STATE"), sep = "\\,") %>%
  separate(COUNTY, into = c("COUNTY_NAME", "OTHER"), sep = "\\s") %>%
  mutate(
    STATE = trimws(STATE),
    COUNTY_NAME = trimws(COUNTY_NAME)
  ) %>%
  filter(STATE == "Texas", COUNTY_NAME %in% counties) %>%
  arrange(desc(estimate))
# create the color palette and assign it to the estimate field with 5 bins
mypal <- colorBin(
  palette = "YlGnBu",
  domain = dfMedHHInc2$estimate,
  bins = 5,
  pretty = TRUE
)
# create the popup info window (built after sorting, so it is row-aligned
# with dfMedHHInc2)
popup <- paste0(
  "Name: ", dfMedHHInc2$COUNTY_NAME, "<br>",
  "Value: ", dfMedHHInc2$estimate
)
# get the bounding box for the data as a plain numeric vector
bbox <- st_bbox(dfMedHHInc2) %>%
  as.vector()
# create the map
mymap <- leaflet(options = leafletOptions(zoomSnap = 0)) %>%
  addProviderTiles("CartoDB.Positron") %>%
  fitBounds(bbox[1], bbox[2], bbox[3], bbox[4]) %>%
  # add the data; the formula resolves `estimate` against `data`
  addPolygons(
    data = dfMedHHInc2,
    fillColor = ~mypal(estimate),
    color = "#b2aeae",
    fillOpacity = 0.7,
    weight = 1,
    smoothFactor = 0.2,
    popup = popup
  ) %>%
  # add the labels
  addStaticLabels(dfMedHHInc2, label = dfMedHHInc2$COUNTY_NAME) %>%
  # add the legend
  addLegend(
    pal = mypal,
    values = dfMedHHInc2$estimate,
    position = "bottomright",
    title = "Household Income"
  )
print(mymap)
A bar chart is a chart or graph that presents categorical data with rectangular bars whose heights or lengths are proportional to the values that they represent. The bars can be plotted vertically or horizontally. In the example below a bar chart displays the estimated number of homes in selected home value categories for the County of Denver, CO.
# Named vector: the names replace the variable codes in the `variable` column.
vars <- c(
  "$200,000-$249,999" = "B25075_019",
  "$250,000-$299,999" = "B25075_020",
  "$300,000-$399,999" = "B25075_021",
  "$400,000-$499,999" = "B25075_022",
  "$500,000-$749,999" = "B25075_023"
)
# Colorado counties, narrowed to Denver County after splitting NAME.
dfHomeVals <- get_acs(
  geography = "county",
  variables = vars,
  geometry = FALSE,
  year = 2017,
  state = "CO"
) %>%
  separate(NAME, into = c("COUNTY", "STATE"), sep = "\\,") %>%
  mutate(STATE = trimws(STATE)) %>%
  filter(COUNTY == "Denver County")
head(dfHomeVals)
## # A tibble: 5 x 6
## GEOID COUNTY STATE variable estimate moe
## <chr> <chr> <chr> <chr> <dbl> <dbl>
## 1 08031 Denver County Colorado $200,000-$249,999 16938 774
## 2 08031 Denver County Colorado $250,000-$299,999 16108 837
## 3 08031 Denver County Colorado $300,000-$399,999 24868 915
## 4 08031 Denver County Colorado $400,000-$499,999 18159 712
## 5 08031 Denver County Colorado $500,000-$749,999 21600 869
# One bar per value bracket; bar height is the estimate.
ggplot(dfHomeVals, aes(x = variable, y = estimate)) +
  geom_col()
Histograms are used with continuous data and divide data into bins. The bins cover the entire range of the dataset so that each value will fall into one, and only one, bin. Each bin is then mapped to a bar.
# Table B25077 for the tracts of Denver County, CO.
dfMedHomeVal <- get_acs(
  table = "B25077",
  geography = "tract",
  county = "Denver",
  geometry = FALSE,
  year = 2017,
  state = "CO"
)
# Histogram of the estimates using bins 10,000 wide.
ggplot(dfMedHomeVal, aes(x = estimate)) +
  geom_histogram(binwidth = 10000)
Box plots can be used to show the spread of data for a variable. The box itself defines the Interquartile Range or IQR. It’s the range of the data between the 25th and 75th percentile. A larger box indicates a more dispersed dataset, while a smaller box indicates a clustering of data for the variable. The line that runs through the box defines the median of the dataset. The lines extending from the box are known as the whiskers; in ggplot2 they reach to the most extreme values within 1.5 × IQR of the box, and any points beyond them are drawn individually as outliers.
# Tract-level table B25077 for Colorado, reduced to ten named counties.
dfMedHomeVal <- get_acs(
  table = "B25077",
  geography = "tract",
  geometry = FALSE,
  year = 2017,
  state = "CO"
) %>%
  separate(NAME, into = c("TRACT", "COUNTY", "STATE"), sep = "\\,") %>%
  mutate(COUNTY = trimws(COUNTY)) %>%
  filter(COUNTY %in% c(
    "Denver County", "Adams County", "Arapahoe County", "Jefferson County",
    "Douglas County", "Boulder County", "Broomfield County", "Gilpin County",
    "Clear Creek County", "Elbert County"
  )) %>%
  group_by(COUNTY)
# One box per county; x labels are rotated 90 degrees.
ggplot(dfMedHomeVal, aes(x = COUNTY, y = estimate)) +
  geom_boxplot() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
The geom_count() function can be used to visualize covariation between categorical values. It counts the number of observations for each combination. For example, this slide includes a chart of the number of wildfires by cause and organization. The larger the output symbol, the more fires for that cause and organization.
# Same ten-county tract subset as the box plot example.
dfMedHomeVal <- get_acs(
  table = "B25077",
  geography = "tract",
  geometry = FALSE,
  year = 2017,
  state = "CO"
) %>%
  separate(NAME, into = c("TRACT", "COUNTY", "STATE"), sep = "\\,") %>%
  mutate(COUNTY = trimws(COUNTY)) %>%
  filter(COUNTY %in% c(
    "Denver County", "Adams County", "Arapahoe County", "Jefferson County",
    "Douglas County", "Boulder County", "Broomfield County", "Gilpin County",
    "Clear Creek County", "Elbert County"
  )) %>%
  group_by(COUNTY)
# geom_count() sizes each point by how many rows share that (x, y) pair.
ggplot(dfMedHomeVal, aes(x = COUNTY, y = estimate)) +
  geom_count() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
Covariation between two continuous variables can be measured using either the geom_bin2d() or geom_hex() functions. These functions can produce some interesting visualizations that help you spot patterns in your data.
# All Colorado tracts; COUNTY is left untrimmed in this example.
dfMedHomeVal <- get_acs(
  table = "B25077",
  geography = "tract",
  geometry = FALSE,
  year = 2017,
  state = "CO"
) %>%
  separate(NAME, into = c("TRACT", "COUNTY", "STATE"), sep = "\\,")
# 2-D binned counts of county against estimate.
ggplot(dfMedHomeVal, aes(x = COUNTY, y = estimate)) +
  geom_bin2d() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
Variables can be color coded by using the color property for the aes() function. The color property is set to one of the columns in the data frame, and each variable category is assigned a unique color.
# Years requested from the ACS.
year_range <- 2012:2016
# Geography levels pulled for every year.
geos_inc <- c(
  "us",
  "metropolitan statistical area/micropolitan statistical area",
  "state"
)
# GEOID values kept by the profile plot's filter.
nm_us_metros <- c(1, 35, 10740, 22140, 29740, 42140)
# Regular expression matched against NAME by the educational attainment plot.
nm_us_w_micros <- "NM|New Mexico|United States"
#' Download the same ACS variables for several years and geography levels
#'
#' Calls `tidycensus::get_acs()` once per (year, geography) pair, stacks the
#' geographies of each year, tags the rows with that year, and stacks the
#' years into one data frame.
#'
#' @param variables Variable IDs passed to `get_acs()`.
#' @param geography Vector of geography levels; one request is made per
#'   element.
#' @param year Vector of years; iterated with `lapply()`, so an empty vector
#'   makes no requests (the previous `1:length(year)` loop ran for indices
#'   1 and 0).
#' @param summary_var Optional summary variable passed through to `get_acs()`.
#' @return The row-bound `get_acs()` results with an added `year` column.
get_historic_acs <- function(variables,
                             geography,
                             year,
                             summary_var = NULL) {
  per_year <- lapply(year, function(yr) {
    # All geography levels for this single year.
    lapply(geography, function(geo) {
      tidycensus::get_acs(
        geography = geo,
        variables = variables,
        summary_var = summary_var,
        output = "tidy",
        year = yr
      )
    }) %>%
      bind_rows() %>%
      mutate(year = yr)
  })
  bind_rows(per_year)
}
# Data-profile variable IDs to download.
variable <- c(
  "DP02_0011P",
  "DP02_0067P",
  "DP02_0069P",
  "DP02_0090P",
  "DP02_0092P",
  "DP02_0111P",
  "DP03_0005P",
  "DP03_0021P",
  "DP03_0028P",
  "DP03_0088",
  "DP03_0096P",
  "DP03_0128P"
)
# Display labels, positionally matched to `variable`.
label <- c(
  "%Householders living alone",
  "%Bachelor's degree or higher",
  "%Civilian veterans",
  "%Born different state",
  "%Foreign born",
  "%Speak English only @ home",
  "%Civilian LF - Unemployed",
  "%Public trans to work",
  "%Service occupations",
  "$Per capita income",
  "%Health insurance",
  "%Below FPL - All people"
)
# Lookup table joined onto the downloaded data by `variable`. Built with
# stringsAsFactors = FALSE (as ed_table is) so both columns are character on
# every R version, not factors on R < 4.0.
dp_table <- data.frame(
  variable = variable,
  label = label,
  stringsAsFactors = FALSE
)
# Download every profile variable for each year and geography level.
dp_data <- get_historic_acs(
  variables = variable,
  geography = geos_inc,
  year = year_range
)
# Filter data set to the selected GEOIDs and attach the display labels.
# The join key is spelled out so it cannot change silently if either table
# gains another shared column name.
dp_data %>%
  filter(GEOID %in% nm_us_metros) %>%
  left_join(dp_table, by = "variable") %>%
  # Build viz: one line per area with margin-of-error bars, faceted by label.
  ggplot(aes(
    x = year, y = estimate, color = NAME,
    ymin = estimate - moe, ymax = estimate + moe
  )) +
  geom_line(size = .95) +
  geom_errorbar(width = 0.1) +
  scale_colour_stata() +
  theme_fivethirtyeight() +
  theme(
    legend.position = "bottom",
    legend.title = element_blank(),
    plot.title = element_text(size = 14)
  ) +
  ylab("") + xlab("") +
  facet_wrap(~label, scales = "free_y", ncol = 3) +
  labs(
    title = "Socio-economic profiles",
    subtitle = "NM & USA, 2012-2016"
  )
Scatterplots are created using the geom_point() function. To create the individual points on the plot you’ll need to pass in x and y parameters to the aes() function. The values passed to the x and y parameters should be columns from your data frame. This example illustrates the creation of what is called a Cleveland Dot Plot, which is basically a scatterplot. The chart will be an ordered scatterplot of median home values by state for ACS 2017.
# Table B25077 at state level.
dfMedHomeVal <- get_acs(
  table = "B25077",
  geography = "state",
  geometry = FALSE,
  year = 2017
)
# Dot plot: states ordered by estimate, each point labelled with its value.
ggplot(dfMedHomeVal, aes(x = estimate, y = reorder(NAME, estimate))) +
  geom_point(color = "navy", size = 2) +
  geom_text_repel(aes(label = estimate), size = 3) +
  scale_x_continuous(labels = scales::dollar) +
  theme_minimal(base_size = 14) +
  labs(
    x = "2017 ACS estimate",
    y = "",
    title = "Median Home Value by State",
    caption = "2017 ACS - 5 Year"
  )
Facet plots are an interesting visualization that allows you to create comparison plots for a variable. For example, the facet plot on this slide displays acreage burned by year for each state in the study area. There are two methods in ggplot2 that enable you to create facet plots. Facet_wrap() creates a subset at the level of a single grouping variable, similar to the facet plot seen on this slide where State is the single grouping variable. A facet_grid() subsets the crossing of two grouping variables. These are great plots for comparison of one or more variables.
# Variable IDs DP02_0059P through DP02_0065P.
variable <- paste0("DP02_00", 59:65, "P")
# Display labels, positionally matched to `variable`.
ed_labels <- c(
  "Less than 9th Grade",
  "9th to 12th grade, no diploma",
  "High school graduate",
  "Some college, no degree",
  "Associate's degree",
  "Bachelor's degree",
  "Grad/pro degree"
)
# Ordinal rank of each level.
ed_level <- seq_len(7)
# Lookup table with all-character columns; ed_level is stored as text here
# and converted back to numeric after the join.
ed_table <- data.frame(
  variable = variable,
  ed_level = as.character(ed_level),
  ed_labels = ed_labels,
  stringsAsFactors = FALSE
)
# Download the educational attainment variables for each year and geography.
ed_data <- get_historic_acs(
  variables = variable,
  geography = geos_inc,
  year = year_range
)
ed_data %>%
  # Explicit join key, so the match cannot change silently if either table
  # gains another shared column name.
  left_join(ed_table, by = "variable") %>%
  mutate(ed_level = as.numeric(ed_level)) %>%
  # Keep rows whose NAME matches the nm_us_w_micros pattern.
  filter(grepl(nm_us_w_micros, NAME)) %>%
  # Replace the NAME of GEOID 21580 with a fixed plain-ASCII label.
  mutate(NAME = ifelse(GEOID == 21580, "Espanola, NM Micro Area", NAME)) %>%
  # Build viz: stacked horizontal bars per year, one panel per area.
  ggplot(aes(
    x = year,
    y = estimate,
    fill = reorder(ed_labels, -ed_level)
  )) +
  geom_col(color = "gray", width = .8) +
  scale_fill_brewer(palette = "BrBG") +
  theme_fivethirtyeight() +
  coord_flip() +
  facet_wrap(~NAME, ncol = 3) +
  theme(
    legend.position = "bottom",
    legend.title = element_blank(),
    plot.title = element_text(size = 14)
  ) +
  labs(
    title = "Educational attainment profiles",
    subtitle = "NM & USA, 2012-2016"
  )
Violin plots are similar to box plots except that they also show the density of data at different values. Thicker areas on the plot indicate a higher probability at that value, while thinner areas indicate a lower probability for the value. It’s common to also include a box plot inside the violin plot to get a sense of the median of the dataset along with the range of data.
# Named vector: the names replace the variable codes in the `variable` column.
vars <- c(
  "$200,000-$249,999" = "B25075_019",
  "$250,000-$299,999" = "B25075_020",
  "$300,000-$399,999" = "B25075_021",
  "$400,000-$499,999" = "B25075_022",
  "$500,000-$749,999" = "B25075_023"
)
# Tract-level counts for Colorado, reduced to ten named counties.
# STATE is not trimmed here (the trimws step for it is disabled).
dfHomeVals <- get_acs(
  geography = "tract",
  variables = vars,
  geometry = FALSE,
  year = 2017,
  state = "CO"
) %>%
  separate(NAME, into = c("TRACT", "COUNTY", "STATE"), sep = "\\,") %>%
  mutate(COUNTY = trimws(COUNTY)) %>%
  filter(COUNTY %in% c(
    "Denver County", "Adams County", "Arapahoe County", "Jefferson County",
    "Douglas County", "Boulder County", "Broomfield County", "Gilpin County",
    "Clear Creek County", "Elbert County"
  ))
head(dfHomeVals)
## # A tibble: 6 x 7
## GEOID TRACT COUNTY STATE variable estimate moe
## <chr> <chr> <chr> <chr> <chr> <dbl> <dbl>
## 1 08001007… Census Tract … Adams Cou… " Color… $200,000-$24… 34 28
## 2 08001007… Census Tract … Adams Cou… " Color… $250,000-$29… 0 12
## 3 08001007… Census Tract … Adams Cou… " Color… $300,000-$39… 8 12
## 4 08001007… Census Tract … Adams Cou… " Color… $400,000-$49… 0 12
## 5 08001007… Census Tract … Adams Cou… " Color… $500,000-$74… 8 14
## 6 08001007… Census Tract … Adams Cou… " Color… $200,000-$24… 0 12
# One violin per county showing the distribution of the estimates.
ggplot(dfHomeVals, aes(x = COUNTY, y = estimate)) +
  geom_violin()
R also includes a number of individual functions that can be used to generate specific summary statistics. These functions include mean(), median(), var(), sd(), min(), max(), quantile(), and others. You can use these functions individually or you can calculate multiple summary statistics at once with the summary() function. The summary() function runs summary statistics for numeric columns in a data frame.
# Table B19013 for Colorado counties, requested with geometry.
dfMedHHInc <- get_acs(
  geography = "county",
  table = "B19013",
  geometry = TRUE,
  year = 2017,
  state = "CO"
)
# Summary statistics of the estimate column, then mean and standard
# deviation on their own.
summary(dfMedHHInc$estimate)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 29000 42462 51457 54354 65357 111154
mean(dfMedHHInc$estimate)
## [1] 54353.56
sd(dfMedHHInc$estimate)
## [1] 15978.04