I propose to build a regression model to predict county-level median household incomes (or potentially another measure of household finances) based on data from United States Census and American Community Survey. The data used to build the regression model will be based on some combination of Age, Gender, Race, Household Composition, Occupation, Education, Income Distribution, Housing which will be retrieved using the US Census API.
I have registered and obtained a US Census API Key and will use the censusapi package in R to retrieve relevant data. The glossaries of the Census Summary File 1 SF1 and American Community Survey AC5 were used to determine which of the large number of fields might potentially be useful for downstream analyses.
library(censusapi)
#us census api key
#censuskey= 'get your own, it's free!'
#retrieve ac5 data
ac5_occupations <- paste('B08124_', sprintf('%03i', seq(1, 7)), 'E', sep='')
ac5_education <- paste('B06009_', sprintf('%03i', seq(1, 6)), 'E', sep='')
ac5_poverty <- paste('B06012_', sprintf('%03i', seq(1, 4)), 'E', sep='')
ac5_hhincomebracket <- paste('B19001_', sprintf('%03i', seq(1, 17)), 'E', sep='')
ac5_hhincomeage <- paste('B19049_', sprintf('%03i', seq(1, 5)), 'E', sep='')
ac5_incomefrom_wagesalary <- paste('B19052_', sprintf('%03i', seq(1, 3)), 'E', sep='')
ac5_incomefrom_wageselfemploy <- paste('B19053_', sprintf('%03i', seq(1, 3)), 'E', sep='')
ac5_incomefrom_investRE <- paste('B19054_', sprintf('%03i', seq(1, 3)), 'E', sep='')
ac5_incomefrom_retirment <- paste('B19059_', sprintf('%03i', seq(1, 3)), 'E', sep='')
ac5_income_quintilemeans <- paste('B19081_', sprintf('%03i', seq(1, 5)), 'E', sep='')
ac5_income_quintileshare <- paste('B19082_', sprintf('%03i', seq(1, 5)), 'E', sep='')
ac5_gini <- 'B19083_001E'
ac5_familyincome <- paste('B19101_', sprintf('%03i', seq(1, 17)), 'E', sep='')
ac5_nonfamilyincome <- paste('B19201_', sprintf('%03i', seq(1, 17)), 'E', sep='')
ac5_2014 <- getCensus(name="acs5", vintage=2014, key=censuskey,
vars=c("NAME"
, "B01001_001E" #Total w/ Sex
, "B01001_002E" #Total Males
, "B01001_026E" #Total Females
, "B19013_001E" #Median Household Income
, "B19019_001E" #Median Household Income by HH People
, ac5_occupations
, ac5_education
, ac5_poverty
, ac5_gini
, ac5_hhincomeage
),
region="county:*")
#concatenate state and county to create county FIPS
ac5_2014$GEOID <- paste(ac5_2014$state, ac5_2014$county, sep="")
#sf1 variables to pull
sf1p_urbanrural <- paste('P002000', sprintf('%i', seq(1, 6)), sep='')
sf1p_race <- paste('P003000', sprintf('%i', seq(1, 8)), sep='')
sf1h_urbanrural <- paste('H002000', sprintf('%i', seq(1, 6)), sep='')
sf1h_occupancy <- paste('H003000', sprintf('%i', seq(1, 3)), sep='')
sf1h_tenure <- paste('H004000', sprintf('%i', seq(1, 4)), sep='')
sf1h_vacant <- paste('H005000', sprintf('%i', seq(1, 8)), sep='')
sf1h_hhsize <- paste('H013000', sprintf('%i', seq(1, 8)), sep='')
# Retrieve 2010 sf1 data
sf1_2010 <- getCensus(name="sf1", vintage=2010, key=censuskey,
vars=c(sf1p_urbanrural
,sf1p_race
,sf1h_urbanrural
,sf1h_occupancy
,sf1h_tenure
,sf1h_vacant
,sf1h_hhsize
),
region="county:*")
#concatenate state and county FIPS to create county FIPS
sf1_2010$GEOID <- paste(sf1_2010$state, sf1_2010$county, sep="")
#merge sf1 and ac5 datasets
countydata <- merge(ac5_2014, sf1_2010, by=c("GEOID"))
Following the example code from this tutorial by Kris Eberwein of Data Science Riot, I first downloaded US county shapefiles, removed territories outside of the contiguous ‘lower 48’ states, and then merged the SF1/ACS dataframe to the Shapefile dataframe using the FIPS county codes. I then plotted the dataframe as an interactive Javascript Leaflet map.
library(ggmap)
library(censusapi)
library(sp)
library(rgeos)
library(rgdal)
library(maptools)
library(dplyr)
library(leaflet)
library(scales)
# Download county shape file from Tiger.
# https://www.census.gov/geo/maps-data/data/cbf/cbf_counties.html
us.map <- readOGR(dsn = ".", layer = "cb_2015_us_county_5m", stringsAsFactors = FALSE, integer64 = "allow.loss", verbose=FALSE)
# Remove Alaska(2), Hawaii(15), Puerto Rico (72), Guam (66), Virgin Islands (78), American Samoa (60)
# Mariana Islands (69), Micronesia (64), Marshall Islands (68), Palau (70), Minor Islands (74)
us.map <- us.map[!us.map$STATEFP %in% c("02", "15", "72", "66", "78", "60", "69",
"64", "68", "70", "74"),]
# Make sure other outling islands are removed.
us.map <- us.map[!us.map$STATEFP %in% c("81", "84", "86", "87", "89", "71", "76",
"95", "79"),]
# Merge spatial df with downloade ddata.
leafmap <- merge(us.map, countydata, by=c("GEOID"))
#convert square m to square km
leafmap$ALAND <- leafmap$ALAND/1000000
leafmap$AWATER <- leafmap$AWATER/1000000
# Format popup data for leaflet map.
popup_dat <- paste0("<strong>County: </strong>",
leafmap$NAME.y,
"<br><strong>Median HH Income: </strong>$",
round(leafmap$B19049_001E,2),
"<br><strong>Population: </strong>" ,
round(leafmap$B01001_001E, 2),
"<br><strong>Gini Income Inequality: </strong>",
round(leafmap$B19083_001E,2),
"<br><strong>Population Density: </strong>" ,
round(leafmap$B01001_001E/leafmap$ALAND, 2),
"<br><strong>M/F Ratio </strong>",
round(leafmap$B01001_002E/leafmap$B01001_026E,3),
"<br><strong>Management/Professional Ratio </strong>",
round(leafmap$B08124_002E/leafmap$B08124_001E,3)
)
pal <- colorQuantile("YlOrRd", NULL, n = 9)
# Render final map in leaflet.
leaflet(data = leafmap, width="100%") %>% addTiles() %>%
addPolygons(fillColor = ~pal(leafmap$B19049_001E),
fillOpacity = 0.7,
color = "#BDBDC3",
weight = 1,
popup = popup_dat)
The preceding plot shows a chloropleth map of the US coloured according to median household income with darker areas having higher incomes and lighter areas having lower incomes. The popup tooltip shows the county name and state as well as a number of other stats derived or computed from the SF1 and AC5 datasets that could prove useful in building a regression model for predicting median household income (e.g. Population, Gini Index, Population Density, M/F Ratio, Percent of Workers in Management/Professional Occupations)