Overview

The purpose of this project was to build a regression model to predict median household incomes at the county-level based on data from United States Census and American Community Survey.

I have registered and obtained a US Census API Key and will use the censusapi package in R to retrieve relevant data. The glossaries of the Census Summary File 1 SF1 (https://api.census.gov/data/2010/sf1/variables.html) and American Community Survey AC5 (https://api.census.gov/data/2014/ac5/variables.html) were used to determine which of the large number of fields might potentially be useful for downstream analyses. In total, 100 variables were selected related to Age, Gender, Race, Household Composition, Occupation, Education, Housing, and Geographic Mobility. Income related variables (e.g. poverty, Gini inequality index) were excluded so as not to confound the predictions.

#can set EVAL=FALSE to prevent having to requery censusdata.
library(censusapi)
library(XML)

#us census api key
#censuskey= 'get your own, it's free!'

#retrieve ac5 data
ac5_sex <-  c("B01001_001E", "B01001_002E") #Total and Males
ac5_occupations  <- paste('B08124_', sprintf('%03i', seq(1, 7)), 'E', sep='')
ac5_education <- paste('B06009_', sprintf('%03i', seq(1, 6)), 'E', sep='')
#ac5_poverty <- paste('B06012_', sprintf('%03i', seq(1, 4)), 'E', sep='')
ac5_age <- paste('B01002_', sprintf('%03i', seq(1, 3)), 'E', sep='')
ac5_nativity <- paste('B05001_', sprintf('%03i', seq(1, 6)), 'E', sep='')
ac5_transportation <- paste('B08006_', sprintf('%03i', seq(1, 17)), 'E', sep='')
ac5_mobility <- paste('B07001_', sprintf('%03i', c(seq(1, 17),33,49,65,81)), 'E', sep='')
ac5_householdtype <-  paste('B11001_', sprintf('%03i', seq(1, 9)), 'E', sep='')
#ac5_hhincomebracket <- paste('B19001_', sprintf('%03i', seq(1, 17)), 'E', sep='')
#ac5_hhincomeage <- paste('B19049_', sprintf('%03i', seq(1, 5)), 'E', sep='')
#ac5_incomefrom_wagesalary <- paste('B19052_', sprintf('%03i', seq(1, 3)), 'E', sep='')
#ac5_incomefrom_wageselfemploy <- paste('B19053_', sprintf('%03i', seq(1, 3)), 'E', sep='')
#ac5_incomefrom_investRE <- paste('B19054_', sprintf('%03i', seq(1, 3)), 'E', sep='')
#ac5_incomefrom_retirment <- paste('B19059_', sprintf('%03i', seq(1, 3)), 'E', sep='')
#ac5_income_quintilemeans <- paste('B19081_', sprintf('%03i', seq(1, 5)), 'E', sep='')
#ac5_income_quintileshare <- paste('B19082_', sprintf('%03i', seq(1, 5)), 'E', sep='')
ac5_gini <- 'B19083_001E'
#ac5_familyincome <- paste('B19101_', sprintf('%03i', seq(1, 17)), 'E', sep='')
#ac5_nonfamilyincome <- paste('B19201_', sprintf('%03i', seq(1, 17)), 'E', sep='')

#```
#```{r censusdata2, message=FALSE}

#median hh income is null for a large number of counties in 2015, use 2014 data for now.
ac5_2015 <- getCensus(name="acs5", vintage=2014, key=censuskey, 
                      vars=c("NAME"
                             , ac5_sex
                             , "B19013_001E" #Median Household Income
                             #, "B19019_001E" #Median Household Income by HH People
                             , ac5_occupations
                             , ac5_education
                             #, ac5_poverty #excluded because of income
                             , ac5_age #don't divide
                             , ac5_nativity
                             , ac5_transportation
                             , ac5_mobility
                             , ac5_householdtype
                             , ac5_gini #don't divide
                             #, ac5_hhincomeage
                             ), 
                      region="county:*")

#concatenate state and county to create county FIPS
ac5_2015$GEOID <- paste(ac5_2015$state, ac5_2015$county, sep="")

#sf1 variables to pull
#sf1p_urbanrural <- paste('P002000', sprintf('%i', seq(1, 6)), sep='') #not definied in 2010 sf1
sf1p_race <- paste('P003000', sprintf('%i', seq(1, 8)), sep='')
#sf1h_urbanrural <- paste('H002000', sprintf('%i', seq(1, 6)), sep='') #not definied in 2010 sf1
sf1h_occupancy <- paste('H003000', sprintf('%i', seq(1, 3)), sep='')
sf1h_tenure <- paste('H004000', sprintf('%i', seq(1, 4)), sep='')
sf1h_vacant <- paste('H005000', sprintf('%i', seq(1, 8)), sep='') 
sf1h_hhsize <- paste('H013000', sprintf('%i', seq(1, 8)), sep='') 

# Retrieve 2010 sf1 data
sf1_2010 <- getCensus(name="sf1", vintage=2010, key=censuskey, 
                      vars=c(#sf1p_urbanrural
                            sf1p_race 
                            #,sf1h_urbanrural 
                            ,sf1h_occupancy 
                            ,sf1h_tenure 
                            ,sf1h_vacant 
                            ,sf1h_hhsize
                            ), 
                      region="county:*")

#concatenate state and county FIPS to create county FIPS
sf1_2010$GEOID <- paste(sf1_2010$state, sf1_2010$county, sep="")

#merge sf1 and ac5 datasets
countydata <- merge(ac5_2015
                    , sf1_2010[,-(1:2)] #don't duplicate state and county 
                    , by=c("GEOID")) 

#divide ac5 to get percentages where appropriate
countydata[,ac5_sex[-(1)]] <- countydata[,ac5_sex[-(1)]]/countydata[,ac5_sex[1]]
countydata[,ac5_occupations[-(1)]] <- countydata[,ac5_occupations[-(1)]]/countydata[,ac5_occupations[1]]
countydata[,ac5_education[-(1)]] <- countydata[,ac5_education[-(1)]]/countydata[,ac5_education[1]]
#countydata[,ac5_poverty[-(1)]] <- countydata[,ac5_poverty[-(1)]]/countydata[,ac5_poverty[1]] 
countydata[,ac5_nativity[-(1)]] <- countydata[,ac5_nativity[-(1)]]/countydata[,ac5_nativity[1]]
countydata[,ac5_transportation[-(1)]] <- countydata[,ac5_transportation[-(1)]]/countydata[,ac5_transportation[1]]
countydata[,ac5_mobility[-(1)]] <- countydata[,ac5_mobility[-(1)]]/countydata[,ac5_mobility[1]]
countydata[,ac5_householdtype[-(1)]] <- countydata[,ac5_householdtype[-(1)]]/countydata[,ac5_householdtype[1]]

#divide sf1 to get percentages where appropriate
countydata[,sf1p_race[-(1)]] <- countydata[,sf1p_race[-(1)]]/countydata[,sf1p_race[1]]
countydata[,sf1h_occupancy[-(1)]] <- countydata[,sf1h_occupancy[-(1)]]/countydata[,sf1h_occupancy[1]]
countydata[,sf1h_tenure[-(1)]] <- countydata[,sf1h_tenure[-(1)]]/countydata[,sf1h_tenure[1]]
countydata[,sf1h_vacant[-(1)]] <- countydata[,sf1h_vacant[-(1)]]/countydata[,sf1h_vacant[1]]
countydata[,sf1h_hhsize[-(1)]] <- countydata[,sf1h_hhsize[-(1)]]/countydata[,sf1h_hhsize[1]]

#grab variable names from acs5 and sf1
acs5_vars <- readHTMLTable("http://api.census.gov/data/2014/acs5/variables.html",header = TRUE)
acs5_vars <- acs5_vars$`Census Data API: Variables in /data/2014/acs5/variables`
sf1_vars <- readHTMLTable("http://api.census.gov/data/2010/sf1/variables.html",header = TRUE)
sf1_vars <- sf1_vars$`Census Data API: Variables in /data/2010/sf1/variables`

Exploratory Analysis

Following the example code from this tutorial by Kris Eberwein of Data Science Riot (https://www.datascienceriot.com/mapping-us-counties-in-r-with-fips/kris/), I first downloaded US county shapefiles, removed territories outside of the contiguous ‘lower 48’ states, and then merged the SF1/ACS dataframe to the Shapefile dataframe using the FIPS county codes. I then plotted the dataframe as an interactive Javascript Leaflet map.

The plot shows a chloropleth map of the US coloured according to median household income with darker areas having higher incomes and lighter areas having lower incomes. The popup tooltip shows the county name and state as well as a number of other stats derived or computed from the SF1 and AC5 datasets that could prove useful in building a regression model for predicting median household income (e.g. Population, Gini Index, Population Density, M/F Ratio, Percent of Workers in Management/Professional Occupations)

library(ggmap)
library(censusapi)
library(sp)
library(rgeos)
library(rgdal)
library(maptools)
library(dplyr)
library(leaflet)
library(scales)

# Download county shape file from Tiger.
# https://www.census.gov/geo/maps-data/data/cbf/cbf_counties.html
us.map <- readOGR(dsn = ".", layer = "cb_2015_us_county_5m", stringsAsFactors = FALSE, integer64 = "allow.loss", verbose=FALSE)

#```
#```{r chloropleth2, message=FALSE, echo=FALSE}

# Remove Alaska(2), Hawaii(15), Puerto Rico (72), Guam (66), Virgin Islands (78), American Samoa (60)
#  Mariana Islands (69), Micronesia (64), Marshall Islands (68), Palau (70), Minor Islands (74)
us.map <- us.map[!us.map$STATEFP %in% c("02", "15", "72", "66", "78", "60", "69",
                                        "64", "68", "70", "74"),]
# Make sure other outling islands are removed.
us.map <- us.map[!us.map$STATEFP %in% c("81", "84", "86", "87", "89", "71", "76",
                                        "95", "79"),]
# Merge spatial df with downloade ddata.
leafmap <- merge(us.map, countydata, by=c("GEOID"))

#convert square m to square km
leafmap$ALAND <- leafmap$ALAND/1000000
leafmap$AWATER <- leafmap$AWATER/1000000

#```
##```{r chloropleth3, message=FALSE}

# Format popup data for leaflet map.
popup_dat <- paste0("<strong>County: </strong>", 
                    leafmap$NAME.x, 
                    "<br><strong>Median HH Income: </strong>$", 
                    round(leafmap$B19013_001E,2),
                    "<br><strong>Population: </strong>" , 
                    round(leafmap$B01001_001E, 2),
                    "<br><strong>Gini Income Inequality: </strong>", 
                    round(leafmap$B19083_001E,2),
                    "<br><strong>Population Density: </strong>" , 
                    round(leafmap$B01001_001E/leafmap$ALAND, 2),
                    "<br><strong>Percent Male </strong>",
                    round(leafmap$B01001_002E,3),
                    "<br><strong>Management/Professional Ratio </strong>",
                    round(leafmap$B08124_002E,3)
                    )

pal <- colorQuantile("YlOrRd", NULL, n = 9)
# Render final map in leaflet.
leaflet(data = leafmap, width="100%") %>% addTiles() %>%
    addPolygons(fillColor = ~pal(leafmap$B19013_001E), 
                fillOpacity = 0.7, 
                color = "#BDBDC3", 
                weight = 1,
                popup = popup_dat)
#grab land and water area from maps
countydata <- merge(x=countydata, us.map@data[,c(5,8,9)], by="GEOID") 
#convert square m to square km
countydata$ALAND <- countydata$ALAND/1000000
countydata$AWATER <- countydata$AWATER/1000000
#calculate some measures
countydata$ATOTAL <- countydata$ALAND+countydata$ALAND
countydata$PopDensityAll <- countydata$B01001_001E/countydata$ATOTAL
countydata$PopDensityLand <- countydata$B01001_001E/countydata$ALAND
#countydata$pctLandArea <- countydata$ALAND/countydata$ATOTAL

Machine Learning

I set aside 1/6 of my samples as a hold-out test dataset. I then used the remaining 5/6 to build a Random Forest Model with 5-fold cross-validation.

The optimal fit was achieved by trying approximately half of the 100 input variables per node (mtry = 49). The highest scoring model had an R-squared of 0.848 and an RMSE of 4824 which seems to indicate a good fit for the in-sample cross-validation testing.

Using this model to predict the hold-out test data gave very good results with an R-squared of 0.858 and RMSE of 4466. The results on the test dataset was actually slightly better than that on the training data suggesting that overfitting is not an issue.

Since the R-squared is close to 0.85 for both the test and training datasets, we can say with some confidence that the Random Forest model is able to explain ~85% of the variability in median household income. This seems to be fairly robust as a regression model though the fit could potentially be improved further with additional parameter tuning.

library(caret)
library(randomForest)
#library(gbm)
library(foreach)
library(doParallel)
library(ggplot2)
library(mlbench)

#do some final clean up county data for learning
learndata <- countydata
#excl <- c('GEOID', 'NAME', 'state', 'county', 'B08124_001E', 'B06009_001E', 'B05001_001E', 'B08006_001E','B07001_001E', 'B11001_001E', 'B19083_001E', 'P0030001', 'H0040001', 'H0050001', 'H0130001' )
learndata <- learndata[ , !names(learndata) %in% c('GEOID', 'NAME', 'state', 'county', 'B08124_001E', 'B06009_001E', 'B05001_001E', 'B08006_001E',
          'B07001_001E', 'B11001_001E', 'B19083_001E', 'P0030001', 'H0040001', 'H0050001', 'H0130001')]#excl]

#generate training and test datasets - partitioned by state
set.seed(12345)
inTrain <- sample(seq_len(nrow(learndata)), size = floor(5/6*nrow(learndata)))
        #sample(y=learndata$state, p=5/6, list=FALSE)
    
train <- learndata[inTrain, ]
test <- learndata[-inTrain, ]


#make sure the distributions are nice
qplot(x=test$B19013_001E) #histogram of median income in the test dataset

qplot(x=train$B19013_001E) #histogram of median income in the train dataset

mean(x=test$B19013_001E) #mean of median income in the test dataset
## [1] 46648.54
mean(x=train$B19013_001E) #mean of median income in the train dataset
## [1] 46307.52
#initialize parallel processing
cl <- makeCluster(6) #use 6 of 8 cores
registerDoParallel(cl)

#setup cross-validation and other training controls
tc <- trainControl(method = "cv", 
                   number = 5, 
                   repeats = 1,
                   seeds = NA, 
                   #verboseIter = TRUE,
                   allowParallel = TRUE) 

#remove measurement/user/timestamp/window metadata to prevent overfitting
set.seed(1234)
RF <- train(B19013_001E  ~ . ,data=train, method="rf", trControl = tc)

RF$finalModel
## 
## Call:
##  randomForest(x = x, y = y, mtry = param$mtry) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 49
## 
##           Mean of squared residuals: 22050106
##                     % Var explained: 84.69
print(RF$results)
##   mtry     RMSE  Rsquared   RMSESD RsquaredSD
## 1    2 5847.830 0.8200256 335.5274 0.01722595
## 2   49 4824.251 0.8478424 279.6627 0.02170712
## 3   97 4887.070 0.8401712 347.6619 0.02667968
predict <- predict(RF, newdata=test)
postResample(predict, test$B19013_001E)
##         RMSE     Rsquared 
## 4466.2863831    0.8580545

Variable Importance

In an effort to identify the key predictive inputs for Median Household Income, I used the varImpPlot function to generate a variable importance plot. Predictor importance is heavily left skewed with the top 4 predictors accounting for the majority of the predictive power. The plot shows that many of the hypothesized predictors such as Education levels, Professional/Managerial occupation were, in fact, highly predictive. That said, there were many somewhat surprising findings not least of which is that Home Ownership with Mortgage or Loan is the attribute with the most predictive power.

Finally, it seems like Mobility between the ages of 40-75 may be a strong predictor of income and the effect could be made stronger if we grouped a few of these age ranges together (e.g. using 10 or 15 year buckets). Of course, the assumption here would be that the trend is in the same direction and would require further investigation to see if the trends are consistent or opposing through the proposed wider age ranges.

#library(plyr)
library(randomForest)

#importance graph
RFImport <- as.data.frame(RF$finalModel$importance)
RFImport <-setNames(cbind(rownames(RFImport), RFImport, row.names = NULL),c("Name", "IncNodePurity"))
#RFImport<- join_all(list(x=RFImport,sf1_vars,acs5_vars), by=c('Name'), type='left') #using plyr not dplyr

varImpPlot(RF$finalModel)