library("httr")
library("readr")
library("dplyr")
library("tidyr")
library("ggplot2")
library("knitr")
library("rgeos")
library("rgdal")
library("maptools")
library("dplyr")
library("leaflet")
library("scales")

Overview

Hypothesis

The common story of migration in the country the USA has been that people migrate for ecomomic reasons. That the rural areas are “poor” and the urban areas are “rich” and people are migrating from poor rural areas to rich urban areas.

Strategy

To test this hypothesis we are going to load data from income per capita at the county level from the US Bureau of Economic Analysis (BEA) and the US Census Bureau. We are going to look at data from 2000 and 2010 as this is the latest “official” decennial census numbers.

Load Data

Load Census Data

Census API documenation is at: http://www.census.gov/data/developers/data-sets/decennial-census.html. The US Census Bureau API requires an API key.

api_key <- readLines(".census_api_key") [[1]]
base_url <- "http://api.census.gov/data/2010/sf1"
query_list <- list( key=api_key, get="P0010001,NAME", `for`="county:*")
response <- GET(base_url, query=query_list)

payload <- content(response)

payload <- payload[2:length(payload)]

census_2010 <- data.frame(matrix(unlist(payload), 
                                 nrow=length(payload), byrow=T,),
                          stringsAsFactors = FALSE)

base_url <- "http://api.census.gov/data/2000/sf1"
query_list <- list( key=api_key, get="P001001,NAME", `for`="county:*")
response <- GET(base_url, query=query_list)
payload <- content(response)

payload <- payload[2:length(payload)]

census_2000 <- data.frame(matrix(unlist(payload), nrow=length(payload), byrow=T), stringsAsFactors = FALSE)

Census 2010 data

kable(head(census_2010))

X1	X2	X3	X4
54571	Autauga County	01	001
182265	Baldwin County	01	003
27457	Barbour County	01	005
22915	Bibb County	01	007
57322	Blount County	01	009
10914	Bullock County	01	011

Census 2000 data

kable(head(census_2000))

X1	X2	X3	X4
43671	Autauga County	01	001
140415	Baldwin County	01	003
29038	Barbour County	01	005
20826	Bibb County	01	007
51024	Blount County	01	009
11714	Bullock County	01	011

Load official state codes

Data scraped from: http://www2.census.gov/geo/docs/reference/state.txt

states <- read.delim("http://www2.census.gov/geo/docs/reference/state.txt", header = TRUE, sep = "|", colClasses=rep("character",4))

kable(head(states))

STATE	STUSAB	STATE_NAME	STATENS
01	AL	Alabama	01779775
02	AK	Alaska	01785533
04	AZ	Arizona	01779777
05	AR	Arkansas	00068085
06	CA	California	01779778
08	CO	Colorado	01779779

Load income per capital data

CSV files downloaded from: http://www.bea.gov/iTable/iTable.cfm?reqid=70&step=1&isuri=1&acrdn=6#reqid=70&step=25&isuri=1&7022=20&7023=7&7024=non-industry&7001=720&7029=20&7090=70

bea_2010_length <- 
  length(read_lines("https://raw.githubusercontent.com/RaphaelNash/CUNY-DATA-607-Final-Project/master/bea_income_per_capita/ipc_2010.csv"))
                          
bea_2000_length <-  
  length(read_lines("https://raw.githubusercontent.com/RaphaelNash/CUNY-DATA-607-Final-Project/master/bea_income_per_capita/ipc_2000.csv"))

bea_2010 <- 
  read.csv( "https://raw.githubusercontent.com/RaphaelNash/CUNY-DATA-607-Final-Project/master/bea_income_per_capita/ipc_2010.csv", nrows = (bea_2010_length-16), skip = 4, stringsAsFactors = FALSE)

bea_2000 <- 
  read.csv( "https://raw.githubusercontent.com/RaphaelNash/CUNY-DATA-607-Final-Project/master/bea_income_per_capita/ipc_2000.csv", nrows = (bea_2000_length-16), skip = 4, stringsAsFactors = FALSE)

BEA 2010 data

kable(head(bea_2010))

GeoFips	GeoName	X2010
01001	Autauga, AL	33452
01003	Baldwin, AL	36090
01005	Barbour, AL	27923
01007	Bibb, AL	25003
01009	Blount, AL	27600
01011	Bullock, AL	23235

BEA 2000 data

kable(head(bea_2000))

GeoFips	GeoName	X2000
01001	Autauga, AL	23537
01003	Baldwin, AL	26678
01005	Barbour, AL	18882
01007	Bibb, AL	17617
01009	Blount, AL	20228
01011	Bullock, AL	15728

Transform Data

In order to do analysis on the data, we need to join the data from all of the sources and transform them into both a wide and long format table. We did both a wide and a long format, because at this point in time we do not know what format will be the most appropriate for the analysis we are doing. Durring this process, we are also adding derived columns for the delta and percent change from 2000 to 2010.

colnames(census_2010) <- c("population_2010","county_name","state_num", "county_num")

colnames(census_2000) <- c("population_2000","county_name","state_num", "county_num")

census_2000$population_2000 <-  as.numeric(census_2000$population_2000 )
census_2010$population_2010 <-  as.numeric(census_2010$population_2010)


census_2010 <- mutate(census_2010, fips = paste(state_num, county_num, sep="") )

census_2000 <- mutate(census_2000, fips = paste(state_num, county_num, sep="") )
 

 
colnames(bea_2000) <- c("fips", "GeoName", "income_per_capita_2000")
colnames(bea_2010) <- c("fips", "GeoName", "income_per_capita_2010")


bea_2000 <- subset(bea_2000, select = c(1,3))
bea_2010 <- subset(bea_2010, select = c(1,3))

bea_2000$income_per_capita_2000 <- as.numeric(bea_2000$income_per_capita_2000)

bea_2010$income_per_capita_2010 <- as.numeric(bea_2010$income_per_capita_2010)
 


census_2000 <- inner_join(census_2000, bea_2000, by="fips")

census_2010 <- inner_join(census_2010, bea_2010, by="fips")

census_2000_key_val_only <- subset(census_2000, select=c(1,5,6))


census_wide <- inner_join(census_2010, census_2000_key_val_only, by = "fips")

colnames(states) <- c("state_num", "state_abbr", "state_name" , "stats_ens")

states <- subset(states, select= c("state_num", "state_abbr", "state_name" ))

census_wide <- inner_join(census_wide, states, by= "state_num")

census_wide <- census_wide[complete.cases(census_wide),]

 
census_wide <- mutate(census_wide, delta_population = population_2010 - population_2000 ,
                 delta_income_per_capita = income_per_capita_2010 - income_per_capita_2000,
                 percent_change_population = delta_population/population_2000,
                 percent_change_income_per_capita = delta_income_per_capita/income_per_capita_2000
                 )


census_long <- gather(census_wide, type, data, 
                      c(population_2010, 
                        income_per_capita_2010, 
                        population_2000, 
                        delta_population, 
                        delta_income_per_capita, 
                        percent_change_population, 
                        percent_change_income_per_capita,
                        income_per_capita_2000) )



census_long$year[census_long$type %in% c("population_2010", "income_per_capita_2010" )] <- "2010"

census_long$year[census_long$type %in% c("population_2000", "income_per_capita_2000" )] <- "2010"


census_long$year[census_long$type %in% 
                   c("delta_population",
                     "delta_income_per_capita", 
                     "percent_change_population", 
                     "percent_change_income_per_capita")] <- "2000-2010"


census_long$type[census_long$type %in% c("population_2010",  "population_2000") ] <- "population"
census_long$type[census_long$type %in% c("income_per_capita_2010", "income_per_capita_2010")] <- "income_per_capita"

census_long$type <- as.factor(census_long$type)
census_long$year <- as.factor(census_long$year)

Long format table

kable(head(census_long))

county_name	state_num	county_num	fips	state_abbr	state_name	type	data	year
Autauga County	01	001	01001	AL	Alabama	population	54571	2010
Baldwin County	01	003	01003	AL	Alabama	population	182265	2010
Barbour County	01	005	01005	AL	Alabama	population	27457	2010
Bibb County	01	007	01007	AL	Alabama	population	22915	2010
Blount County	01	009	01009	AL	Alabama	population	57322	2010
Bullock County	01	011	01011	AL	Alabama	population	10914	2010

Wide format table

kable(head(census_wide))

population_2010	county_name	state_num	county_num	fips	income_per_capita_2010	population_2000	income_per_capita_2000	state_abbr	state_name	delta_population	delta_income_per_capita	percent_change_population	percent_change_income_per_capita
54571	Autauga County	01	001	01001	33452	43671	23537	AL	Alabama	10900	9915	0.2495936	0.4212516
182265	Baldwin County	01	003	01003	36090	140415	26678	AL	Alabama	41850	9412	0.2980451	0.3528001
27457	Barbour County	01	005	01005	27923	29038	18882	AL	Alabama	-1581	9041	-0.0544459	0.4788158
22915	Bibb County	01	007	01007	25003	20826	17617	AL	Alabama	2089	7386	0.1003073	0.4192541
57322	Blount County	01	009	01009	27600	51024	20228	AL	Alabama	6298	7372	0.1234321	0.3644453
10914	Bullock County	01	011	01011	23235	11714	15728	AL	Alabama	-800	7507	-0.0682943	0.4773016

summary(census_wide)

##  population_2010   county_name         state_num        
##  Min.   :     82   Length:3081        Length:3081       
##  1st Qu.:  11096   Class :character   Class :character  
##  Median :  25910   Mode  :character   Mode  :character  
##  Mean   :  99058                                        
##  3rd Qu.:  67077                                        
##  Max.   :9818605                                        
##   county_num            fips           income_per_capita_2010
##  Length:3081        Length:3081        Min.   : 14360        
##  Class :character   Class :character   1st Qu.: 28544        
##  Mode  :character   Mode  :character   Median : 32333        
##                                        Mean   : 33814        
##                                        3rd Qu.: 37196        
##                                        Max.   :144395        
##  population_2000   income_per_capita_2000  state_abbr       
##  Min.   :     67   Min.   :10156          Length:3081       
##  1st Qu.:  11211   1st Qu.:20142          Class :character  
##  Median :  24747   Median :22943          Mode  :character  
##  Mean   :  90351   Mean   :23849                            
##  3rd Qu.:  62298   3rd Qu.:26132                            
##  Max.   :9519338   Max.   :88640                            
##   state_name        delta_population  delta_income_per_capita
##  Length:3081        Min.   :-240578   Min.   :-39217         
##  Class :character   1st Qu.:   -283   1st Qu.:  7217         
##  Mode  :character   Median :    630   Median :  9224         
##                     Mean   :   8707   Mean   :  9965         
##                     3rd Qu.:   4361   3rd Qu.: 11731         
##                     Max.   : 744968   Max.   : 71798         
##  percent_change_population percent_change_income_per_capita
##  Min.   :-0.46605          Min.   :-0.5791                 
##  1st Qu.:-0.02589          1st Qu.: 0.3067                 
##  Median : 0.03154          Median : 0.4019                 
##  Mean   : 0.05194          Mean   : 0.4327                 
##  3rd Qu.: 0.10163          3rd Qu.: 0.5175                 
##  Max.   : 1.10355          Max.   : 2.0657

Graph Data

Map Data

Note: Mapping code was copied from: https://www.datascienceriot.com/mapping-us-counties-in-r-with-fips/kris/ however we made some modifications, like adding the legend, changing the color schema and of course using our data.

county_pop_change <- subset(census_wide,  
                     select = c("fips", "percent_change_population")) 


colnames(county_pop_change) <- c("GEOID", "percent_change_population")


county_income_change <- subset(census_wide,  
                     select = c("fips", "percent_change_income_per_capita")) 


colnames(county_income_change) <- c("GEOID", "percent_change_income_per_capita")


us.map <- readOGR(dsn=path.expand("cb_2013_us_county_20m"), layer="cb_2013_us_county_20m")
  

# Remove Alaska(2), Hawaii(15), Puerto Rico (72), Guam (66), Virgin Islands (78), American Samoa (60)
#  Mariana Islands (69), Micronesia (64), Marshall Islands (68), Palau (70), Minor Islands (74)
us.map <- us.map[!us.map$STATEFP %in% c("02", "15", "72", "66", "78", "60", "69",
                                        "64", "68", "70", "74"),]
# Make sure other outling islands are removed.
us.map <- us.map[!us.map$STATEFP %in% c("81", "84", "86", "87", "89", "71", "76",
                                        "95", "79"),]

Map % Population Change

leafmap <- merge(us.map, county_pop_change, by=c("GEOID"))

popup_dat <- paste0("<strong>County: </strong>", 
                    leafmap$NAME, 
                    "<br><strong>Population Change: </strong>", 
                    round(leafmap$percent_change_population * 100 , 2 ) , " % " ) 

leafmap$percent_change_population <- leafmap$percent_change_population * 100
 

pal <- colorQuantile("RdYlGn", domain = leafmap$percent_change_population , n = 20)

leaflet(data = leafmap) %>% addTiles() %>%
    addPolygons(fillColor = ~pal(percent_change_population), 
                fillOpacity = 0.8, 
                color = "#BDBDC3", 
                weight = 1,
                popup = popup_dat) %>% 
  addLegend(position = "bottomright",
   pal = pal, values = leafmap$percent_change_population,
   title = "Change",
   opacity = .5) %>% 
  setView(-92, 37, zoom = 4)

Map % Income Per Capita Change

leafmap_income <- merge(us.map, county_income_change, by=c("GEOID"))

popup_dat_income <- paste0("<strong>County: </strong>", 
                    leafmap$NAME, 
                    "<br><strong>Income Per Person Change: </strong>", 
                    round(leafmap_income$percent_change_income_per_capita * 100 , 2 ) , " % " ) 

 

pal <- colorQuantile("RdYlGn", domain = leafmap_income$percent_change_income_per_capita , n = 20)
 
leaflet(data = leafmap_income) %>% addTiles() %>%
    addPolygons(fillColor = ~pal(percent_change_income_per_capita), 
                fillOpacity = 0.8, 
                color = "#BDBDC3", 
                weight = 1,
                popup = popup_dat_income) %>%
addLegend(position = "bottomright",
   pal = pal, values = leafmap_income$percent_change_income_per_capita,
   title = "Change",
   opacity = .5)  %>% 
  setView(-92, 37, zoom = 4)

Income change vs population change

ggplot(data = census_wide, aes(y=percent_change_population, 
                               x=percent_change_income_per_capita )) + 
  geom_point() + 
  stat_smooth(method = "lm")

Linear regression

model_income_change_vs_pop_change <- lm(percent_change_income_per_capita ~  percent_change_population, data = census_wide)
summary(model_income_change_vs_pop_change)

## 
## Call:
## lm(formula = percent_change_income_per_capita ~ percent_change_population, 
##     data = census_wide)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.92139 -0.11597 -0.03018  0.07880  1.54360 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                0.45998    0.00377  122.02   <2e-16 ***
## percent_change_population -0.52560    0.02665  -19.72   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1946 on 3079 degrees of freedom
## Multiple R-squared:  0.1122, Adjusted R-squared:  0.1119 
## F-statistic:   389 on 1 and 3079 DF,  p-value: < 2.2e-16

Conclusion

We found the exact opposite relationship from our hypothesis. Counties that have a decrease in population have an increase in income per capita. According to the p value this is a statiscally signifigant relationship. This could possibly be explained by gentrification pushing people out of the city center and people moving into the surrounding areas.

Difficulties and Next Steps

Our next step would be to cluster counties and find the “high” and “low” income counties in a state and see how migration patterns varied between these two clusters. We wanted to keep these clusters of “high” and “low” income counties using adjectent counties to account for cities that spill into the sourounding counties, like the NYC and Philadelphia areas. Unfortunately, the Census Bureau’s Metropolitian Statiscal Areas was not suited for this task, because they don’t account for all rural areas, especially in the west.

In order to try to get around theese difficulties Luis tried attemped some clustering. Here is a link to the clustering that he attempted: https://rpubs.com/calleja/237132

Economic Migration 2000-2010

Raphael Nash and Luis Calleja

12/18/2016