library("httr")
library("readr")
library("dplyr")
library("tidyr")
library("ggplot2")
library("knitr")
library("rgeos")
library("rgdal")
library("maptools")
library("dplyr")
library("leaflet")
library("scales")

Overview

Hypothesis

The common story of migration in the country the USA has been that people migrate for ecomomic reasons. That the rural areas are “poor” and the urban areas are “rich” and people are migrating from poor rural areas to rich urban areas.

Strategy

To test this hypothesis we are going to load data from income per capita at the county level from the US Bureau of Economic Analysis (BEA) and the US Census Bureau. We are going to look at data from 2000 and 2010 as this is the latest “official” decennial census numbers.

Load Data

Load Census Data

Census API documenation is at: http://www.census.gov/data/developers/data-sets/decennial-census.html. The US Census Bureau API requires an API key.

api_key <- readLines(".census_api_key") [[1]]
base_url <- "http://api.census.gov/data/2010/sf1"
query_list <- list( key=api_key, get="P0010001,NAME", `for`="county:*")
response <- GET(base_url, query=query_list)

payload <- content(response)

payload <- payload[2:length(payload)]

census_2010 <- data.frame(matrix(unlist(payload), 
                                 nrow=length(payload), byrow=T,),
                          stringsAsFactors = FALSE)

base_url <- "http://api.census.gov/data/2000/sf1"
query_list <- list( key=api_key, get="P001001,NAME", `for`="county:*")
response <- GET(base_url, query=query_list)
payload <- content(response)

payload <- payload[2:length(payload)]

census_2000 <- data.frame(matrix(unlist(payload), nrow=length(payload), byrow=T), stringsAsFactors = FALSE)

Census 2010 data

kable(head(census_2010))
X1 X2 X3 X4
54571 Autauga County 01 001
182265 Baldwin County 01 003
27457 Barbour County 01 005
22915 Bibb County 01 007
57322 Blount County 01 009
10914 Bullock County 01 011

Census 2000 data

kable(head(census_2000))
X1 X2 X3 X4
43671 Autauga County 01 001
140415 Baldwin County 01 003
29038 Barbour County 01 005
20826 Bibb County 01 007
51024 Blount County 01 009
11714 Bullock County 01 011

Load official state codes

Data scraped from: http://www2.census.gov/geo/docs/reference/state.txt

states <- read.delim("http://www2.census.gov/geo/docs/reference/state.txt", header = TRUE, sep = "|", colClasses=rep("character",4))
kable(head(states))
STATE STUSAB STATE_NAME STATENS
01 AL Alabama 01779775
02 AK Alaska 01785533
04 AZ Arizona 01779777
05 AR Arkansas 00068085
06 CA California 01779778
08 CO Colorado 01779779

Load income per capital data

CSV files downloaded from: http://www.bea.gov/iTable/iTable.cfm?reqid=70&step=1&isuri=1&acrdn=6#reqid=70&step=25&isuri=1&7022=20&7023=7&7024=non-industry&7001=720&7029=20&7090=70

bea_2010_length <- 
  length(read_lines("https://raw.githubusercontent.com/RaphaelNash/CUNY-DATA-607-Final-Project/master/bea_income_per_capita/ipc_2010.csv"))
                          
bea_2000_length <-  
  length(read_lines("https://raw.githubusercontent.com/RaphaelNash/CUNY-DATA-607-Final-Project/master/bea_income_per_capita/ipc_2000.csv"))

bea_2010 <- 
  read.csv( "https://raw.githubusercontent.com/RaphaelNash/CUNY-DATA-607-Final-Project/master/bea_income_per_capita/ipc_2010.csv", nrows = (bea_2010_length-16), skip = 4, stringsAsFactors = FALSE)

bea_2000 <- 
  read.csv( "https://raw.githubusercontent.com/RaphaelNash/CUNY-DATA-607-Final-Project/master/bea_income_per_capita/ipc_2000.csv", nrows = (bea_2000_length-16), skip = 4, stringsAsFactors = FALSE)

BEA 2010 data

kable(head(bea_2010))
GeoFips GeoName X2010
01001 Autauga, AL 33452
01003 Baldwin, AL 36090
01005 Barbour, AL 27923
01007 Bibb, AL 25003
01009 Blount, AL 27600
01011 Bullock, AL 23235

BEA 2000 data

kable(head(bea_2000))
GeoFips GeoName X2000
01001 Autauga, AL 23537
01003 Baldwin, AL 26678
01005 Barbour, AL 18882
01007 Bibb, AL 17617
01009 Blount, AL 20228
01011 Bullock, AL 15728

Transform Data

In order to do analysis on the data, we need to join the data from all of the sources and transform them into both a wide and long format table. We did both a wide and a long format, because at this point in time we do not know what format will be the most appropriate for the analysis we are doing. Durring this process, we are also adding derived columns for the delta and percent change from 2000 to 2010.

colnames(census_2010) <- c("population_2010","county_name","state_num", "county_num")

colnames(census_2000) <- c("population_2000","county_name","state_num", "county_num")

census_2000$population_2000 <-  as.numeric(census_2000$population_2000 )
census_2010$population_2010 <-  as.numeric(census_2010$population_2010)


census_2010 <- mutate(census_2010, fips = paste(state_num, county_num, sep="") )

census_2000 <- mutate(census_2000, fips = paste(state_num, county_num, sep="") )
 

 
colnames(bea_2000) <- c("fips", "GeoName", "income_per_capita_2000")
colnames(bea_2010) <- c("fips", "GeoName", "income_per_capita_2010")


bea_2000 <- subset(bea_2000, select = c(1,3))
bea_2010 <- subset(bea_2010, select = c(1,3))

bea_2000$income_per_capita_2000 <- as.numeric(bea_2000$income_per_capita_2000)

bea_2010$income_per_capita_2010 <- as.numeric(bea_2010$income_per_capita_2010)
 


census_2000 <- inner_join(census_2000, bea_2000, by="fips")

census_2010 <- inner_join(census_2010, bea_2010, by="fips")

census_2000_key_val_only <- subset(census_2000, select=c(1,5,6))


census_wide <- inner_join(census_2010, census_2000_key_val_only, by = "fips")

colnames(states) <- c("state_num", "state_abbr", "state_name" , "stats_ens")

states <- subset(states, select= c("state_num", "state_abbr", "state_name" ))

census_wide <- inner_join(census_wide, states, by= "state_num")

census_wide <- census_wide[complete.cases(census_wide),]

 
census_wide <- mutate(census_wide, delta_population = population_2010 - population_2000 ,
                 delta_income_per_capita = income_per_capita_2010 - income_per_capita_2000,
                 percent_change_population = delta_population/population_2000,
                 percent_change_income_per_capita = delta_income_per_capita/income_per_capita_2000
                 )


census_long <- gather(census_wide, type, data, 
                      c(population_2010, 
                        income_per_capita_2010, 
                        population_2000, 
                        delta_population, 
                        delta_income_per_capita, 
                        percent_change_population, 
                        percent_change_income_per_capita,
                        income_per_capita_2000) )



census_long$year[census_long$type %in% c("population_2010", "income_per_capita_2010" )] <- "2010"

census_long$year[census_long$type %in% c("population_2000", "income_per_capita_2000" )] <- "2010"


census_long$year[census_long$type %in% 
                   c("delta_population",
                     "delta_income_per_capita", 
                     "percent_change_population", 
                     "percent_change_income_per_capita")] <- "2000-2010"


census_long$type[census_long$type %in% c("population_2010",  "population_2000") ] <- "population"
census_long$type[census_long$type %in% c("income_per_capita_2010", "income_per_capita_2010")] <- "income_per_capita"

census_long$type <- as.factor(census_long$type)
census_long$year <- as.factor(census_long$year)

Long format table

kable(head(census_long))
county_name state_num county_num fips state_abbr state_name type data year
Autauga County 01 001 01001 AL Alabama population 54571 2010
Baldwin County 01 003 01003 AL Alabama population 182265 2010
Barbour County 01 005 01005 AL Alabama population 27457 2010
Bibb County 01 007 01007 AL Alabama population 22915 2010
Blount County 01 009 01009 AL Alabama population 57322 2010
Bullock County 01 011 01011 AL Alabama population 10914 2010

Wide format table

kable(head(census_wide))
population_2010 county_name state_num county_num fips income_per_capita_2010 population_2000 income_per_capita_2000 state_abbr state_name delta_population delta_income_per_capita percent_change_population percent_change_income_per_capita
54571 Autauga County 01 001 01001 33452 43671 23537 AL Alabama 10900 9915 0.2495936 0.4212516
182265 Baldwin County 01 003 01003 36090 140415 26678 AL Alabama 41850 9412 0.2980451 0.3528001
27457 Barbour County 01 005 01005 27923 29038 18882 AL Alabama -1581 9041 -0.0544459 0.4788158
22915 Bibb County 01 007 01007 25003 20826 17617 AL Alabama 2089 7386 0.1003073 0.4192541
57322 Blount County 01 009 01009 27600 51024 20228 AL Alabama 6298 7372 0.1234321 0.3644453
10914 Bullock County 01 011 01011 23235 11714 15728 AL Alabama -800 7507 -0.0682943 0.4773016
summary(census_wide)
##  population_2010   county_name         state_num        
##  Min.   :     82   Length:3081        Length:3081       
##  1st Qu.:  11096   Class :character   Class :character  
##  Median :  25910   Mode  :character   Mode  :character  
##  Mean   :  99058                                        
##  3rd Qu.:  67077                                        
##  Max.   :9818605                                        
##   county_num            fips           income_per_capita_2010
##  Length:3081        Length:3081        Min.   : 14360        
##  Class :character   Class :character   1st Qu.: 28544        
##  Mode  :character   Mode  :character   Median : 32333        
##                                        Mean   : 33814        
##                                        3rd Qu.: 37196        
##                                        Max.   :144395        
##  population_2000   income_per_capita_2000  state_abbr       
##  Min.   :     67   Min.   :10156          Length:3081       
##  1st Qu.:  11211   1st Qu.:20142          Class :character  
##  Median :  24747   Median :22943          Mode  :character  
##  Mean   :  90351   Mean   :23849                            
##  3rd Qu.:  62298   3rd Qu.:26132                            
##  Max.   :9519338   Max.   :88640                            
##   state_name        delta_population  delta_income_per_capita
##  Length:3081        Min.   :-240578   Min.   :-39217         
##  Class :character   1st Qu.:   -283   1st Qu.:  7217         
##  Mode  :character   Median :    630   Median :  9224         
##                     Mean   :   8707   Mean   :  9965         
##                     3rd Qu.:   4361   3rd Qu.: 11731         
##                     Max.   : 744968   Max.   : 71798         
##  percent_change_population percent_change_income_per_capita
##  Min.   :-0.46605          Min.   :-0.5791                 
##  1st Qu.:-0.02589          1st Qu.: 0.3067                 
##  Median : 0.03154          Median : 0.4019                 
##  Mean   : 0.05194          Mean   : 0.4327                 
##  3rd Qu.: 0.10163          3rd Qu.: 0.5175                 
##  Max.   : 1.10355          Max.   : 2.0657

Graph Data

Map Data

Note: Mapping code was copied from: https://www.datascienceriot.com/mapping-us-counties-in-r-with-fips/kris/ however we made some modifications, like adding the legend, changing the color schema and of course using our data.

county_pop_change <- subset(census_wide,  
                     select = c("fips", "percent_change_population")) 


colnames(county_pop_change) <- c("GEOID", "percent_change_population")


county_income_change <- subset(census_wide,  
                     select = c("fips", "percent_change_income_per_capita")) 


colnames(county_income_change) <- c("GEOID", "percent_change_income_per_capita")


us.map <- readOGR(dsn=path.expand("cb_2013_us_county_20m"), layer="cb_2013_us_county_20m")
  

# Remove Alaska(2), Hawaii(15), Puerto Rico (72), Guam (66), Virgin Islands (78), American Samoa (60)
#  Mariana Islands (69), Micronesia (64), Marshall Islands (68), Palau (70), Minor Islands (74)
us.map <- us.map[!us.map$STATEFP %in% c("02", "15", "72", "66", "78", "60", "69",
                                        "64", "68", "70", "74"),]
# Make sure other outling islands are removed.
us.map <- us.map[!us.map$STATEFP %in% c("81", "84", "86", "87", "89", "71", "76",
                                        "95", "79"),]

Map % Population Change

leafmap <- merge(us.map, county_pop_change, by=c("GEOID"))

popup_dat <- paste0("<strong>County: </strong>", 
                    leafmap$NAME, 
                    "<br><strong>Population Change: </strong>", 
                    round(leafmap$percent_change_population * 100 , 2 ) , " % " ) 

leafmap$percent_change_population <- leafmap$percent_change_population * 100
 

pal <- colorQuantile("RdYlGn", domain = leafmap$percent_change_population , n = 20)

leaflet(data = leafmap) %>% addTiles() %>%
    addPolygons(fillColor = ~pal(percent_change_population), 
                fillOpacity = 0.8, 
                color = "#BDBDC3", 
                weight = 1,
                popup = popup_dat) %>% 
  addLegend(position = "bottomright",
   pal = pal, values = leafmap$percent_change_population,
   title = "Change",
   opacity = .5) %>% 
  setView(-92, 37, zoom = 4)

Map % Income Per Capita Change

leafmap_income <- merge(us.map, county_income_change, by=c("GEOID"))

popup_dat_income <- paste0("<strong>County: </strong>", 
                    leafmap$NAME, 
                    "<br><strong>Income Per Person Change: </strong>", 
                    round(leafmap_income$percent_change_income_per_capita * 100 , 2 ) , " % " ) 

 

pal <- colorQuantile("RdYlGn", domain = leafmap_income$percent_change_income_per_capita , n = 20)
 
leaflet(data = leafmap_income) %>% addTiles() %>%
    addPolygons(fillColor = ~pal(percent_change_income_per_capita), 
                fillOpacity = 0.8, 
                color = "#BDBDC3", 
                weight = 1,
                popup = popup_dat_income) %>%
addLegend(position = "bottomright",
   pal = pal, values = leafmap_income$percent_change_income_per_capita,
   title = "Change",
   opacity = .5)  %>% 
  setView(-92, 37, zoom = 4)

Income change vs population change

ggplot(data = census_wide, aes(y=percent_change_population, 
                               x=percent_change_income_per_capita )) + 
  geom_point() + 
  stat_smooth(method = "lm")

Linear regression

model_income_change_vs_pop_change <- lm(percent_change_income_per_capita ~  percent_change_population, data = census_wide)
summary(model_income_change_vs_pop_change)
## 
## Call:
## lm(formula = percent_change_income_per_capita ~ percent_change_population, 
##     data = census_wide)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.92139 -0.11597 -0.03018  0.07880  1.54360 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                0.45998    0.00377  122.02   <2e-16 ***
## percent_change_population -0.52560    0.02665  -19.72   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1946 on 3079 degrees of freedom
## Multiple R-squared:  0.1122, Adjusted R-squared:  0.1119 
## F-statistic:   389 on 1 and 3079 DF,  p-value: < 2.2e-16

Conclusion

We found the exact opposite relationship from our hypothesis. Counties that have a decrease in population have an increase in income per capita. According to the p value this is a statiscally signifigant relationship. This could possibly be explained by gentrification pushing people out of the city center and people moving into the surrounding areas.

Difficulties and Next Steps

Our next step would be to cluster counties and find the “high” and “low” income counties in a state and see how migration patterns varied between these two clusters. We wanted to keep these clusters of “high” and “low” income counties using adjectent counties to account for cities that spill into the sourounding counties, like the NYC and Philadelphia areas. Unfortunately, the Census Bureau’s Metropolitian Statiscal Areas was not suited for this task, because they don’t account for all rural areas, especially in the west.

In order to try to get around theese difficulties Luis tried attemped some clustering. Here is a link to the clustering that he attempted: https://rpubs.com/calleja/237132