This example will use R to downloard American Coummunity Survey summary file tables using the tidycensus package. The goal of this example is to illustrate how to download data from the Census API using R and to illustrate how to create basic descriptive maps of attributes.
The example will use data from San Antonio, Texas from the 2015 American Community Survey summary file.
Obtain one at http://api.census.gov/data/key_signup.html
use census_api_key(key = "yourkeyhere", install = T)
one time to install your key for use in tidycensus
library(tidycensus); library(tidyverse); library(sf)
## -- Attaching packages ------ tidyverse 1.2.1 --
## v ggplot2 2.2.1.9000 v purrr 0.2.4
## v tibble 1.4.2 v dplyr 0.7.5
## v tidyr 0.8.1 v stringr 1.3.1
## v readr 1.1.1 v forcats 0.3.0
## -- Conflicts --------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## Linking to GEOS 3.6.1, GDAL 2.2.3, proj.4 4.9.3
v15_Profile <- load_variables(2015, "acs5/profile", cache = TRUE) #demographic profile tables
v15_tables <- load_variables(2015 , "acs5", cache = TRUE) #all tables
#v10_sf1_tables <- load_variables(2010 , "sf1", cache = TRUE) #all tables for 2010 SF1
#v10_sf1_tables <- load_variables(2000 , "sf3", cache = TRUE) #all tables for 2000 SF#
View(v15_Profile)
#Search for variables by
v15_Profile[grep(x = v15_Profile$label, "HighSchool"), c("name", "label")]
## # A tibble: 0 x 2
## # ... with 2 variables: name <chr>, label <chr>
v15_Profile[grep(x = v15_Profile$label, "Built 2000 to 2009"), c("name", "label")]
## # A tibble: 2 x 2
## name label
## <chr> <chr>
## 1 DP04_0019E Estimate!!YEAR STRUCTURE BUILT!!Total housing units!!Built ~
## 2 DP04_0019PE Percent!!YEAR STRUCTURE BUILT!!Total housing units!!Built 2~
The data profile tables are very useful because they contain lots of pre-calculated variables.
sa_acs<-get_acs(geography = "tract", state="TX", county = c("Bexar"), year = 2015,
variables=c( "DP05_0001E", "DP03_0009P", "DP03_0062E", "DP03_0119PE",
"DP05_0001E","DP02_0009PE","DP02_0008PE", "DP02_0040E","DP02_0038E",
"DP02_0066PE","DP02_0067PE","DP02_0080PE","DP02_0092PE",
"DP03_0005PE","DP03_0028PE","DP03_0062E","DP03_0099PE","DP03_0101PE",
"DP03_0119PE","DP04_0046PE","DP05_0072PE","DP05_0073PE",
"DP05_0066PE", "DP05_0072PE", "DP02_0113PE") ,
summary_var = "B01001_001",
geometry = T, output = "wide")
## Getting data from the 2011-2015 5-year ACS
## Downloading feature geometry from the Census website. To cache shapefiles for use in future sessions, set `options(tigris_use_cache = TRUE)`.
## Using the ACS Data Profile
## Using the ACS Data Profile
#create a county FIPS code - 5 digit
sa_acs$county<-substr(sa_acs$GEOID, 1, 5)
#rename variables and filter missing cases
sa_acs2<-sa_acs%>%
mutate(totpop= DP05_0001E,tpop_cv=(DP05_0001M/1.645)/DP05_0001E ,
phsormore=DP02_0066PE,phsormore_cv=(DP02_0066PM/1.645)/DP02_0066PE, medhhinc=DP03_0062E,ppov=DP03_0119PE ) %>%
# st_transform(crs = 102740)%>%
filter(complete.cases(totpop, phsormore, medhhinc))
#(acs.sub$medhhinc_ME/1.645) / acs.sub$medhhinc) * 100
class(sa_acs2)
#change the directory
sf::st_write(sa_acs2,dsn="C:/Users/ozd504/Google Drive/classes/dem7093/GIS_class_2018/data",layer="sa_tract_dp", driver="ESRI Shapefile", delete_layer=T, update=T)
Here I generate a quantile break for the coefficient of variation in census tract population estimates
library(classInt)
library(dplyr)
cv_map<-sa_acs2 %>%
mutate(cv_cut=cut(tpop_cv,breaks = quantile(tpop_cv, na.rm=T, p=seq(0,1,length.out = 6)),include.lowest = T),
ed_cut=cut(phsormore_cv,breaks = quantile(phsormore_cv, na.rm=T, p=seq(0,1,length.out = 6)),include.lowest = T) )
library(ggsn)
p1<-ggplot(cv_map, aes(fill = cv_cut, color = cv_cut)) +
geom_sf() +
ggtitle("Coefficent of Variation in Population Estimate",
subtitle = "Bexar County Texas, 2015 ACS")+
scale_fill_brewer(palette = "Blues") +
scale_color_brewer(palette = "Blues")+
theme(axis.text.x = element_blank(), axis.text.y = element_blank())+
north(cv_map)+
scalebar(cv_map, dist = 5, dd2km =T, model="GRS80", st.size = 2)
p1
p2<-ggplot(cv_map, aes(fill = ed_cut, color = ed_cut)) +
geom_sf() +
ggtitle("Coefficent of Variation in Estimate of % Greater than High School",
subtitle = "Bexar County Texas, 2015 ACS")+
scale_fill_brewer(palette = "Blues") +
scale_color_brewer(palette = "Blues")+
theme(axis.text.x = element_blank(), axis.text.y = element_blank())+
north(cv_map)+
scalebar(cv_map, dist = 5, dd2km =T, model="GRS80", st.size = 2)
p2
When we have data that are collected over time on the same geographies, we may be interested in whether the variable we’re mapping has changed much over time.
In the ACS, we can compare two estimates if the years used to produce the esimates do not overlap. For instance, we could compare the 2005-2009 estimates to the 2011-2015 estimates, but we could not compare the 2010-2014 to the 2011-2015, becuase they share years of data. See this for the official position on the subject.
Here we take the median houshold income in tracts derived from the 2007-2011 ACS and compare it to the estimate from the 2011-2015 ACS
#v11_Profile <- load_variables(2011 ,dataset = "acs5") #demographic profile
options(tigris_use_cache = TRUE)
#2009 ACS
acs_11<-get_acs(geography = "tract", state="TX", county = "029", year = 2011, variables="B19013_001",geometry = TRUE, output = "wide")
## Getting data from the 2007-2011 5-year ACS
#2015 ACS
acs_15<-get_acs(geography = "tract", state="TX", county = "Bexar", year = 2015, variables="B19013_001",geometry = TRUE, output = "wide")
## Getting data from the 2011-2015 5-year ACS
#create variables with nice names
acs_11<-acs_11%>%
mutate(inc11=B19013_001E, inc11err=B19013_001M)%>%
select(GEOID, NAME, inc11, inc11err)
#create variables with nice names
acs_15<-acs_15%>%
mutate(inc15=B19013_001E, inc15err=B19013_001M)%>%
select(GEOID, NAME, inc15, inc15err)
#merge the two years worth of data
mdat<-left_join(acs_11, as.data.frame(acs_15), by="GEOID", st_join=FALSE)
head(mdat)
## Simple feature collection with 6 features and 7 fields
## Active geometry column: geometry.x
## geometry type: POLYGON
## dimension: XY
## bbox: xmin: -98.56778 ymin: 29.32196 xmax: -98.41912 ymax: 29.44661
## epsg (SRID): 4269
## proj4string: +proj=longlat +datum=NAD83 +no_defs
## GEOID NAME.x inc11 inc11err
## 1 48029170401 Census Tract 1704.01, Bexar County, Texas 17796 3070
## 2 48029170402 Census Tract 1704.02, Bexar County, Texas 27433 3465
## 3 48029160702 Census Tract 1607.02, Bexar County, Texas 25455 5946
## 4 48029160200 Census Tract 1602, Bexar County, Texas 34821 9070
## 5 48029141600 Census Tract 1416, Bexar County, Texas 56184 7383
## 6 48029141200 Census Tract 1412, Bexar County, Texas 39897 3871
## NAME.y inc15 inc15err
## 1 Census Tract 1704.01, Bexar County, Texas 18310 2821
## 2 Census Tract 1704.02, Bexar County, Texas 24722 4437
## 3 Census Tract 1607.02, Bexar County, Texas 22076 4843
## 4 Census Tract 1602, Bexar County, Texas 36458 7223
## 5 Census Tract 1416, Bexar County, Texas 53534 4526
## 6 Census Tract 1412, Bexar County, Texas 32088 7034
## geometry.x geometry.y
## 1 POLYGON ((-98.5251 29.44358... MULTIPOLYGON (((-98.54106 2...
## 2 POLYGON ((-98.53594 29.4405... MULTIPOLYGON (((-98.54196 2...
## 3 POLYGON ((-98.55955 29.3849... MULTIPOLYGON (((-98.56773 2...
## 4 POLYGON ((-98.51153 29.3992... MULTIPOLYGON (((-98.52883 2...
## 5 POLYGON ((-98.43569 29.3351... MULTIPOLYGON (((-98.45349 2...
## 6 POLYGON ((-98.44832 29.3826... MULTIPOLYGON (((-98.44833 2...
## geometry
## 1 GEOMETRYCOLLECTION EMPTY
## 2 GEOMETRYCOLLECTION EMPTY
## 3 GEOMETRYCOLLECTION EMPTY
## 4 GEOMETRYCOLLECTION EMPTY
## 5 GEOMETRYCOLLECTION EMPTY
## 6 GEOMETRYCOLLECTION EMPTY
Here I create a function that implements the testing procedure used by the Census for comparing estimates across year
acstest<-function(names,geoid, est1, err1, est2, err2, alpha, yr1, yr2, span){
se1<-err1/qnorm(.90)
se2<-err2/qnorm(.90)
yrs1<-seq(yr1, to=yr1-span)
yrs2<-seq(yr2, to=yr2-span)
C<-mean(yrs2%in%yrs1)
diff<- (est1-est2)
test<-(est1-est2) / (sqrt(1-C)*sqrt((se1^2+se2^2)))
crit<-qnorm(1-alpha/2)
pval<-2*pnorm(abs(test),lower.tail=F)
result<-NULL
result[pval > alpha]<-"insignificant change"
result[pval < alpha & test > 0]<- "significant increase"
result[pval < alpha & test < 0]<-"significant decrease"
data.frame(name=names,geoid=geoid, est1=est1, est2=est2, se1=se1, se2=se2,difference=diff, test=test, result=result, pval=pval)
}
Here I use the function I just made to do the comparisons
diff0915<-acstest(names = mdat$NAME.x, geoid = mdat$GEOID, est1 = mdat$inc11*1.07, est2 = mdat$inc15, err1 = mdat$inc11err*1.12, err2=mdat$inc15err, alpha = .1, yr1 = 2011, yr2=2015, span = 5)
head(diff0915)
## name geoid est1 est2
## 1 Census Tract 1704.01, Bexar County, Texas 48029170401 19041.72 18310
## 2 Census Tract 1704.02, Bexar County, Texas 48029170402 29353.31 24722
## 3 Census Tract 1607.02, Bexar County, Texas 48029160702 27236.85 22076
## 4 Census Tract 1602, Bexar County, Texas 48029160200 37258.47 36458
## 5 Census Tract 1416, Bexar County, Texas 48029141600 60116.88 53534
## 6 Census Tract 1412, Bexar County, Texas 48029141200 42689.79 32088
## se1 se2 difference test result pval
## 1 2682.998 2201.238 731.72 0.2582299 insignificant change 0.79622949
## 2 3028.204 3462.209 4631.31 1.2331710 insignificant change 0.21751198
## 3 5196.451 3779.013 5160.85 0.9837296 insignificant change 0.32524846
## 4 7926.642 5636.137 800.47 0.1007977 insignificant change 0.91971109
## 5 6452.304 3531.657 6582.88 1.0960834 insignificant change 0.27304230
## 6 3383.024 5488.659 10601.79 2.0138800 significant increase 0.04402213
options(scipen=999)
acs_merge<-left_join(sa_acs, diff0915, by=c("GEOID"="geoid"))
## Warning: Column `GEOID`/`geoid` joining character vector and factor,
## coercing into character vector
brks<-quantile(c(acs_11$inc11, acs_15$inc15), probs = seq(0,1,.2), na.rm=T)
acs_11map<-acs_11%>%
mutate(inc_q = cut(inc11,breaks = brks, include.lowest = T))
p1<-ggplot(acs_11map, aes( fill=inc_q, color=inc_q))+
geom_sf() +
ggtitle("Median Household Income 2011 ",
subtitle = "Bexar County Texas")+
scale_fill_brewer(palette = "Blues") +
scale_color_brewer(palette = "Blues")+
#scale_fill_manual(values=myfil)+
theme(axis.text.x = element_blank(), axis.text.y = element_blank())+
north(acs_11map)+
scalebar(acs_11map, dist = 5, dd2km =T, model="GRS80", st.size = 2)
acs_15map<-acs_15%>%
mutate(inc_q = cut(inc15,breaks = brks, include.lowest = T))
p2<-ggplot(acs_15map, aes( fill=inc_q, color=inc_q))+
geom_sf() +
ggtitle("Median Household Income 2015 ",
subtitle = "Bexar County Texas")+
scale_fill_brewer(palette = "Blues") +
scale_color_brewer(palette = "Blues")+
#scale_fill_manual(values=myfil)+
theme(axis.text.x = element_blank(), axis.text.y = element_blank())+
north(acs_15map)+
scalebar(acs_15map, dist = 5, dd2km =T, model="GRS80", st.size = 2)
myfil<-c("grey", "red", "blue")
p3<-ggplot(acs_merge, aes( fill=result))+
geom_sf() +
ggtitle("Comparison of Median Household Income 2011 to 2015",
subtitle = "Bexar County Texas")+
#scale_fill_brewer(palette = "Accent") +
#scale_color_brewer(palette = "Accent")+
scale_fill_manual(values=myfil)+
theme(axis.text.x = element_blank(), axis.text.y = element_blank())+
north(acs_merge)+
scalebar(acs_merge, dist = 5, dd2km =T, model="GRS80", st.size = 2)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
out<-grid.arrange(p1, p2, p3, nrow = 2)
out
## TableGrob (2 x 2) "arrange": 3 grobs
## z cells name grob
## 1 1 (1-1,1-1) arrange gtable[layout]
## 2 2 (1-1,2-2) arrange gtable[layout]
## 3 3 (2-2,1-1) arrange gtable[layout]