This example will use R to download American Community Survey summary file tables using the tidycensus package. The goal of this example is to illustrate how to download data from the Census API using R, how to create basic descriptive maps of attributes and how to construct a change map between two time periods.
The example will use data from San Antonio, Texas from the American Community Survey summary file.
Obtain one at http://api.census.gov/data/key_signup.html
use census_api_key(key = "yourkeyhere", install = T)
one time to install your key for use in tidycensus
library(tidycensus)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.6 v dplyr 1.0.7
## v tidyr 1.2.0 v stringr 1.4.0
## v readr 2.1.0 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(sf)
## Linking to GEOS 3.9.1, GDAL 3.2.1, PROJ 7.2.1; sf_use_s2() is TRUE
library(tmap)
You should always check the variable name of your ACS variable, you cannot assume that the variable name stays the same across years.
v10_Profile <- load_variables(2010,
"acs5/profile",
cache = TRUE) #demographic profile tables
v19_Profile <- load_variables(2019,
"acs5/profile",
cache = TRUE) #demographic
#Search for variables by using grep()
v10_Profile[grep(x = v10_Profile$label,
"BELOW THE POVERTY LEVEL",
ignore.case = TRUE),
c("name", "label")]
## # A tibble: 38 x 2
## name label
## <chr> <chr>
## 1 DP03_0119 Estimate!!PERCENTAGE OF FAMILIES AND PEOPLE WHOSE INCOME IN THE P~
## 2 DP03_0119P Percent!!PERCENTAGE OF FAMILIES AND PEOPLE WHOSE INCOME IN THE PA~
## 3 DP03_0120 Estimate!!PERCENTAGE OF FAMILIES AND PEOPLE WHOSE INCOME IN THE P~
## 4 DP03_0120P Percent!!PERCENTAGE OF FAMILIES AND PEOPLE WHOSE INCOME IN THE PA~
## 5 DP03_0121 Estimate!!PERCENTAGE OF FAMILIES AND PEOPLE WHOSE INCOME IN THE P~
## 6 DP03_0121P Percent!!PERCENTAGE OF FAMILIES AND PEOPLE WHOSE INCOME IN THE PA~
## 7 DP03_0122 Estimate!!PERCENTAGE OF FAMILIES AND PEOPLE WHOSE INCOME IN THE P~
## 8 DP03_0122P Percent!!PERCENTAGE OF FAMILIES AND PEOPLE WHOSE INCOME IN THE PA~
## 9 DP03_0123 Estimate!!PERCENTAGE OF FAMILIES AND PEOPLE WHOSE INCOME IN THE P~
## 10 DP03_0123P Percent!!PERCENTAGE OF FAMILIES AND PEOPLE WHOSE INCOME IN THE PA~
## # ... with 28 more rows
v19_Profile[grep(x = v19_Profile$label,
"BELOW THE POVERTY LEVEL",
ignore.case = TRUE),
c("name", "label")]
## # A tibble: 38 x 2
## name label
## <chr> <chr>
## 1 DP03_0119 Estimate!!PERCENTAGE OF FAMILIES AND PEOPLE WHOSE INCOME IN THE P~
## 2 DP03_0119P Percent!!PERCENTAGE OF FAMILIES AND PEOPLE WHOSE INCOME IN THE PA~
## 3 DP03_0120 Estimate!!PERCENTAGE OF FAMILIES AND PEOPLE WHOSE INCOME IN THE P~
## 4 DP03_0120P Percent!!PERCENTAGE OF FAMILIES AND PEOPLE WHOSE INCOME IN THE PA~
## 5 DP03_0121 Estimate!!PERCENTAGE OF FAMILIES AND PEOPLE WHOSE INCOME IN THE P~
## 6 DP03_0121P Percent!!PERCENTAGE OF FAMILIES AND PEOPLE WHOSE INCOME IN THE PA~
## 7 DP03_0122 Estimate!!PERCENTAGE OF FAMILIES AND PEOPLE WHOSE INCOME IN THE P~
## 8 DP03_0122P Percent!!PERCENTAGE OF FAMILIES AND PEOPLE WHOSE INCOME IN THE PA~
## 9 DP03_0123 Estimate!!PERCENTAGE OF FAMILIES AND PEOPLE WHOSE INCOME IN THE P~
## 10 DP03_0123P Percent!!PERCENTAGE OF FAMILIES AND PEOPLE WHOSE INCOME IN THE PA~
## # ... with 28 more rows
The data profile tables are very useful because they contain lots of pre-calculated variables.
pov10<-get_acs(geography = "tract",
state="TX",
county = "Bexar",
year = 2010,
variables="DP03_0119PE" ,
geometry = T,
output = "wide")
## Getting data from the 2006-2010 5-year ACS
## Downloading feature geometry from the Census website. To cache shapefiles for use in future sessions, set `options(tigris_use_cache = TRUE)`.
## Using the ACS Data Profile
#rename variables and filter missing cases
pov10 <- pov10%>%
mutate(ppov = DP03_0119PE,
ppov_er = DP03_0119PM/1.645,
ppov_cv =100* (ppov_er/ppov)) %>%
filter(complete.cases(ppov), is.finite(ppov_cv)==T)%>%
select(GEOID, ppov, ppov_er,ppov_cv)
head(pov10)
Two common ways to break a continuous variable into discrete bins are quantile breaks and Jenks/Fisher breaks.
tm_shape(pov10)+
tm_polygons(c("ppov"),
title=c("% in Poverty"),
palette="Blues",
style="quantile",
n=5)+
#tm_format("World", legend.outside=T, title.size =4)+
tm_scale_bar()+
tm_layout(title="San Antonio Poverty Rate Estimates - Quantile Breaks",
title.size =1.5,
legend.frame = TRUE,
title.position = c('right', 'top'))+
tm_compass()+
tm_format("World",
legend.position = c("left", "bottom"),
main.title.position =c("center"))
tm_shape(pov10)+
tm_polygons(c("ppov"),
title=c("% in Poverty"),
palette="Blues",
style="jenks",
n=5)+
#tm_format("World", legend.outside=T, title.size =4)+
tm_scale_bar()+
tm_layout(title="San Antonio Poverty Rate Estimates - Jenks Breaks",
title.size =1.5,
legend.frame = TRUE,
title.position = c('right', 'top'))+
tm_compass()+
tm_format("World",
legend.position = c("left", "bottom"),
main.title.position =c("center"))
Here I generate a quantile break for the coefficient of variation in census tract poverty estimates.
If you don’t remember, the coefficient of variation is:
\[CV =\frac {\sigma }{\mu}\]
and is a measure of the variability in the estimate, relative to the estimate itself. If the CV is greater than 100, the estimate is very imprecise. The lower the value, the more precise the estimate. This is very important when using small area estimates from the ACS.
When presenting data from the ACS, you should always examine the coefficients of variation, as the ACS is based off a survey, and the estimates, especially for small or rare groups can be very imprecise.
p1<-tm_shape(pov10)+
tm_polygons(c("ppov"),
title=c("% in Poverty"),
palette="Blues",
style="quantile",
n=5)+
#tm_format("World", legend.outside=T, title.size =4)+
tm_scale_bar()+
tm_layout(title="San Antonio Poverty Rate Estimates",
title.size =1.5,
legend.frame = TRUE,
title.position = c('right', 'top'))+
tm_compass()+
tm_format("World",
legend.position = c("left", "bottom"),
main.title.position =c("center"))
p2<-tm_shape(pov10)+
tm_polygons(c("ppov_cv"),
title=c("CV Poverty"),
palette="Blues",
style="quantile",
n=5)+
#tm_format("World", title="San Antonio Poverty Rate CV", legend.outside=T)+
tm_layout(title="San Antonio Poverty Rate CV",
title.size =1.5,
legend.frame = TRUE,
title.position = c('right', 'top'))+
tm_scale_bar()+
tm_compass()+
tm_format("World",
legend.position = c("left", "bottom"),
main.title.position =c("center"))
tmap_arrange(p1, p2)
plot(pov10$ppov, pov10$ppov_cv, main = "Error in Estimates vs Estimate Size")
When we have data that are collected over times on the same geographies, we may be interested in whether the variable we’re mapping has changed much over time.
In the ACS, we can compare two estimates if the years used to produce the estimates do not overlap. For instance, we could compare the 2006-2010 estimates to the 2015-2019 estimates, but we could not compare the 2006-2010 to the 2008-2012, because they share years of data.
See this for the official position on the subject.
There is a fix that will let us do this. see below
Here we take the poverty rate in tracts derived from the 2006-2010 ACS and compare it to the estimate from the 2015-2019 ACS
# get the 2019 estimates
pov19<-get_acs(geography = "tract",
state="TX",
county = "Bexar",
year = 2019,
variables="DP03_0119P" ,
geometry = T,
output = "wide")
## Getting data from the 2015-2019 5-year ACS
## Downloading feature geometry from the Census website. To cache shapefiles for use in future sessions, set `options(tigris_use_cache = TRUE)`.
## Using the ACS Data Profile
#rename variables and filter missing cases
pov19<- pov19%>%
mutate(ppov19 = DP03_0119PE,
ppov19_er = DP03_0119PM/1.645,
ppov19_cv =100* (ppov19_er/ppov19)) %>%
filter(complete.cases(ppov19), is.finite(ppov19_cv)==T)%>%
select(GEOID, ppov19, ppov19_er,ppov19_cv)
head(pov19)
#merge the two years worth of data
st_geometry(pov19)<-NULL #strip the geometry from the 2019 data
mdat<-left_join(pov10, pov19, by=c("GEOID"="GEOID"))
head(mdat)
## Simple feature collection with 6 features and 7 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: -98.5152 ymin: 29.40677 xmax: -98.456 ymax: 29.58043
## Geodetic CRS: NAD83
## GEOID ppov ppov_er ppov_cv ppov19 ppov19_er ppov19_cv
## 1 48029110100 15.4 10.151976 65.92192 24.7 14.650456 59.313587
## 2 48029110300 29.9 8.085106 27.04049 31.3 12.401216 39.620498
## 3 48029110500 57.5 9.057751 15.75261 69.1 4.498480 6.510102
## 4 48029120300 2.3 1.519757 66.07638 2.6 2.370821 91.185410
## 5 48029120702 7.4 4.802432 64.89772 1.6 1.702128 106.382979
## 6 48029121108 3.5 2.127660 60.79027 6.6 3.282675 49.737497
## geometry
## 1 MULTIPOLYGON (((-98.49928 2...
## 2 MULTIPOLYGON (((-98.48687 2...
## 3 MULTIPOLYGON (((-98.51411 2...
## 4 MULTIPOLYGON (((-98.45928 2...
## 5 MULTIPOLYGON (((-98.48187 2...
## 6 MULTIPOLYGON (((-98.47468 2...
Here I create a function that implements the testing procedure used by the Census for comparing estimates across year. The test for two estimates is:
\[\text{z = }\left| \frac{est_1 - est_2}{(1-C)*\sqrt{se_1^2 + se_2^2}}\right|\]
Where \(C\) is the number of years overlap in the estimates.
Here is my function:
acstest<-function(names,geoid, est1, err1, est2, err2, alpha, yr1, yr2, span){
se1<-err1/qnorm(.90)
se2<-err2/qnorm(.90)
yrs1<-seq(yr1, to=yr1-span)
yrs2<-seq(yr2, to=yr2-span)
C<-mean(yrs2%in%yrs1)
diff<- (est1-est2)
test<-(est1-est2) / (sqrt(1-C)*sqrt(se1^2+se2^2))
crit<-qnorm(1-alpha/2)
pval<-1-pnorm(abs(test))
result<-NULL
result[pval > alpha]<-"insignificant change"
result[pval < alpha & test < 0]<- "significant increase"
result[pval < alpha & test > 0]<-"significant decrease"
data.frame(name=names,geoid=geoid, est1=est1, est2=est2, se1=se1, se2=se2,difference=diff, test=test, result=result, pval=pval)
}
A very similar function significance() from the tidycensus package does a similar thing, but with less output.
significance(est1=mdat$ppov,
est2=mdat$ppov19,
moe1=mdat$ppov_er,
moe2 = mdat$ppov19_er,
clevel = .9)
## [1] FALSE FALSE TRUE FALSE TRUE FALSE TRUE FALSE FALSE TRUE FALSE FALSE
## [13] TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE FALSE FALSE FALSE FALSE
## [25] FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE TRUE FALSE TRUE FALSE
## [37] TRUE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE TRUE FALSE FALSE
## [49] TRUE FALSE TRUE FALSE FALSE TRUE TRUE TRUE FALSE FALSE TRUE FALSE
## [61] TRUE FALSE TRUE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## [73] TRUE FALSE FALSE FALSE FALSE FALSE NA TRUE TRUE FALSE TRUE TRUE
## [85] FALSE TRUE TRUE TRUE TRUE FALSE TRUE TRUE TRUE FALSE FALSE FALSE
## [97] FALSE TRUE NA FALSE TRUE FALSE TRUE FALSE FALSE FALSE TRUE FALSE
## [109] TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [121] FALSE FALSE TRUE FALSE TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## [133] FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE
## [145] TRUE FALSE TRUE TRUE FALSE FALSE FALSE FALSE FALSE TRUE TRUE FALSE
## [157] TRUE FALSE FALSE TRUE TRUE TRUE FALSE TRUE FALSE TRUE TRUE TRUE
## [169] FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE FALSE FALSE TRUE TRUE
## [181] TRUE TRUE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE
## [193] TRUE FALSE FALSE FALSE TRUE FALSE FALSE TRUE FALSE FALSE TRUE FALSE
## [205] TRUE TRUE FALSE FALSE TRUE FALSE TRUE TRUE FALSE FALSE TRUE FALSE
## [217] FALSE FALSE FALSE TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE FALSE
## [229] FALSE TRUE FALSE TRUE TRUE TRUE FALSE FALSE FALSE TRUE FALSE TRUE
## [241] FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE
## [253] TRUE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
## [265] TRUE TRUE TRUE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE TRUE
## [277] FALSE FALSE TRUE NA FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
## [289] FALSE FALSE TRUE TRUE TRUE FALSE TRUE TRUE TRUE FALSE FALSE TRUE
## [301] TRUE FALSE TRUE FALSE TRUE FALSE TRUE TRUE TRUE FALSE FALSE FALSE
## [313] FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
## [325] FALSE FALSE NA FALSE TRUE FALSE FALSE FALSE FALSE TRUE TRUE FALSE
## [337] FALSE FALSE TRUE TRUE FALSE TRUE FALSE FALSE FALSE FALSE TRUE
mdat$signif<- significance(est1=mdat$ppov,
est2=mdat$ppov19,
moe1=mdat$ppov_er,
moe2 = mdat$ppov19_er,
clevel = .9)
Here I use the function I just made to do the comparisons
diff1019<-acstest(names = mdat$GEOID,
geoid = mdat$GEOID,
est1 = mdat$ppov,
est2 = mdat$ppov19,
err1 = mdat$ppov_er,
err2=mdat$ppov19_er,
alpha = .1,
yr1 = 2010, yr2=2019,
span = 5)
head(diff1019)
## name geoid est1 est2 se1 se2 difference test
## 1 48029110100 48029110100 15.4 24.7 7.921629 11.431812 -9.3 -0.6686694
## 2 48029110300 48029110300 29.9 31.3 6.308842 9.676720 -1.4 -0.1211949
## 3 48029110500 48029110500 57.5 69.1 7.067800 3.510183 -11.6 -1.4699429
## 4 48029120300 48029120300 2.3 2.6 1.185873 1.849961 -0.3 -0.1365238
## 5 48029120702 48029120702 7.4 1.6 3.747357 1.328177 5.8 1.4588372
## 6 48029121108 48029121108 3.5 6.6 1.660222 2.561485 -3.1 -1.0155728
## result pval
## 1 insignificant change 0.25185318
## 2 insignificant change 0.45176833
## 3 significant increase 0.07078861
## 4 insignificant change 0.44570360
## 5 significant decrease 0.07230497
## 6 insignificant change 0.15491643
table(diff1019$result)
##
## insignificant change significant decrease significant increase
## 220 71 52
Compare to tidycensus
table(mdat$signif) #
##
## FALSE TRUE
## 220 123
acs_merge<-left_join(mdat, diff1019, by=c("GEOID"="geoid"))
tmap_mode("plot")
## tmap mode set to plotting
p1<-tm_shape(acs_merge)+
tm_polygons(c("ppov"),
title=c("% in Poverty 2010"),
palette="Blues",
style="quantile",
n=5)+
#tm_format("World", legend.outside=T, title.size =4)+
tm_scale_bar()+
tm_layout(title="San Antonio Poverty Rate Estimates 2010",
title.size =1.5,
legend.frame = TRUE,
title.position = c('right', 'top'))+
tm_compass()+
tm_format("World",
legend.position = c("left", "bottom"),
main.title.position =c("center"))
p2<-tm_shape(acs_merge)+
tm_polygons(c("ppov19"),
title=c("% in Poverty 2019"),
palette="Blues",
style="quantile",
n=5)+
#tm_format("World", title="San Antonio Poverty Rate CV", legend.outside=T)+
tm_layout(title="San Antonio Poverty Rate Estimate 2019",
title.size =1.5,
legend.frame = TRUE,
title.position = c('right', 'top'))+
tm_scale_bar()+
tm_compass()+
tm_format("World",
legend.position = c("left", "bottom"),
main.title.position =c("center"))
p3 <- tm_shape(acs_merge)+
tm_polygons(c("result"),
title=c("Changes"),
palette = "Set2")+
#tm_format("World", title="San Antonio Poverty Rate CV", legend.outside=T)+
tm_layout(title="San Antonio Poverty Rate Estimate Changes",
title.size =1.5,
legend.frame = TRUE,
title.position = c('right', 'top'))+
tm_scale_bar()+
tm_compass()
tmap_arrange(p1, p2, p3, nrow=2)
tmap_mode("view")
## tmap mode set to interactive viewing
#osmtile <- tmaptools::read_osm(pov10, mergeTiles = T)
#tm_shape(osmtile)+
# tm_rgb()+
tm_shape(acs_merge)+
tm_polygons("result",
alpha = .7,
title=c("Changes"),
palette = "Set2")+
#tm_format("World", title="San Antonio Poverty Rate CV", legend.outside=T)+
tm_layout(title="San Antonio Poverty Rate Estimate Changes",
title.size =1.5)+
tm_scale_bar()
#tm_compass()+
st_write(acs_merge, dsn="C:/Users/ozd504/Documents/GitHub/DEM7093/data/lab3_dat.gpkg",
layer="lab3dat",
driver="GPKG" )