Set libraries and directories

library(ipumsr)
## Warning: package 'ipumsr' was built under R version 4.1.3
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.1.3
## -- Attaching packages --------------------------------------- tidyverse 1.3.2 --
## v ggplot2 3.4.0      v purrr   1.0.1 
## v tibble  3.1.8      v dplyr   1.0.10
## v tidyr   1.2.1      v stringr 1.5.0 
## v readr   2.1.3      v forcats 0.5.2
## Warning: package 'ggplot2' was built under R version 4.1.3
## Warning: package 'tibble' was built under R version 4.1.3
## Warning: package 'tidyr' was built under R version 4.1.3
## Warning: package 'readr' was built under R version 4.1.3
## Warning: package 'purrr' was built under R version 4.1.3
## Warning: package 'dplyr' was built under R version 4.1.3
## Warning: package 'stringr' was built under R version 4.1.3
## Warning: package 'forcats' was built under R version 4.1.3
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(knitr)
## Warning: package 'knitr' was built under R version 4.1.3
library(scales)
## Warning: package 'scales' was built under R version 4.1.3
## 
## Attaching package: 'scales'
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor
# Change these directories to where your data file is stored and to where you want to save plots; these are in a subdirectory of the R project where the .Rmd file is located.
dataDir <- "./data"  #read data file dir
dataDir2 <- "./plots"  #save plot files dir

options(scipen=99, digits=5)

Read the data

ddi <- read_ipums_ddi(file.path(dataDir,"usa_00001.xml")) # ACS 2019
data <- read_ipums_micro(ddi)
## Use of data from IPUMS USA is subject to conditions including that users should
## cite the data appropriately. Use command `ipums_conditions()` for more details.
kable(head(data))
YEAR SAMPLE SERIAL CBSERIAL HHWT CLUSTER STATEFIP MET2013 STRATA GQ PERNUM PERWT SEX AGE RACE RACED HISPAN HISPAND BPL BPLD CITIZEN RACAMIND RACASIAN RACBLK RACPACIS RACWHT RACOTHER RACNUM EDUC EDUCD MIGRATE1 MIGRATE1D MIGPLAC1 MIGMET131
2019 201901 1 2019010000088 11 2019000000011 1 0 220001 4 1 11 1 39 2 200 0 0 1 100 0 1 1 2 1 1 1 1 4 40 1 10 0 0
2019 201901 2 2019010000096 70 2019000000021 1 0 100001 3 1 70 2 21 1 100 0 0 13 1300 0 1 1 1 1 2 1 1 4 40 2 23 1 0
2019 201901 3 2019010000153 20 2019000000031 1 11500 110001 4 1 20 1 19 2 200 0 0 1 100 0 1 1 2 1 1 1 1 7 71 2 24 1 0
2019 201901 4 2019010000198 79 2019000000041 1 11500 110001 3 1 79 1 77 1 100 0 0 1 100 0 1 1 1 1 2 1 1 3 30 2 23 1 11500
2019 201901 5 2019010000205 53 2019000000051 1 33660 270101 3 1 53 1 41 2 200 0 0 1 100 0 1 1 2 1 1 1 1 3 30 2 23 1 33660
2019 201901 6 2019010000215 77 2019000000061 1 33860 200001 4 1 77 1 18 2 200 0 0 26 2600 0 1 1 2 1 1 1 1 6 65 3 32 26 19820
# extract variable names the traditional way
varname <- as.data.frame(colnames(data))

# extract labels
n <- ncol(data)
labels_list <- map(1:n, function(x) attr(data[[x]], "label") )

labels_vector <- map_chr(1:n, function(x) attr(data[[x]], "label") )

# ddi info
names(ddi)
##  [1] "file_name"        "file_path"        "file_type"        "ipums_project"   
##  [5] "extract_date"     "extract_notes"    "rectypes"         "rectype_idvar"   
##  [9] "rectypes_keyvars" "var_info"         "conditions"       "citation"        
## [13] "file_encoding"
# extract file
ddi$file_name
## [1] "usa_00001.dat"
# description of extract file
ddi$extract_notes
## [1] "User-provided description:  2019 ACS migration"
# variables in extract
ddi$var_info
## # A tibble: 34 x 10
##    var_name var_l~1 var_d~2 val_la~3 code_~4 start   end imp_d~5 var_t~6 recty~7
##    <chr>    <chr>   <chr>   <list>   <chr>   <dbl> <dbl>   <dbl> <chr>   <lgl>  
##  1 YEAR     Census~ "YEAR ~ <tibble>  <NA>       1     4       0 integer NA     
##  2 SAMPLE   IPUMS ~ "SAMPL~ <tibble>  <NA>       5    10       0 integer NA     
##  3 SERIAL   Househ~ "SERIA~ <tibble> "Codes~    11    18       0 numeric NA     
##  4 CBSERIAL Origin~ "CBSER~ <tibble> "Codes~    19    31       0 numeric NA     
##  5 HHWT     Househ~ "HHWT ~ <tibble> "Codes~    32    41       2 numeric NA     
##  6 CLUSTER  Househ~ "CLUST~ <tibble> "Codes~    42    54       0 numeric NA     
##  7 STATEFIP State ~ "STATE~ <tibble>  <NA>      55    56       0 integer NA     
##  8 MET2013  Metrop~ "A met~ <tibble>  <NA>      57    61       0 integer NA     
##  9 STRATA   Househ~ "STRAT~ <tibble> "Codes~    62    73       0 numeric NA     
## 10 GQ       Group ~ "GQ cl~ <tibble>  <NA>      74    74       0 integer NA     
## # ... with 24 more rows, and abbreviated variable names 1: var_label,
## #   2: var_desc, 3: val_labels, 4: code_instr, 5: imp_decim, 6: var_type,
## #   7: rectypes
# details on variable in extract
ipums_var_label(ddi, MIGRATE1)
## [1] "Migration status, 1 year [general version]"
ipums_val_labels(ddi, MIGRATE1)
## # A tibble: 6 x 2
##     val lbl                 
##   <dbl> <chr>               
## 1     0 N/A                 
## 2     1 Same house          
## 3     2 Moved within state  
## 4     3 Moved between states
## 5     4 Abroad one year ago 
## 6     9 Unknown
# make a new variable from state name from statefips labels;  these is not used in this script but useful and easy to read for later analysis
STATEFIP <- ipums_val_labels(ddi, STATEFIP)  

# add statefips labels to data as variable STATENAME
data$STATENAME <- as_factor(data$STATEFIP)

# interactively view variables, labels/descriptions, values, generates a web page
ipums_view(ddi)

# file name

Compute Unweighted Migration Rate

This code block computes the percentage of the US population that migrates interstate based on the 2019 ACS sample disregarding the person weight that counts for the significance of representation of each record.The result migration rate is 2.26%.

migrate_national_unweighted <- data %>% 
  filter(AGE>=1) %>% 
  mutate(n=1) %>% 
  summarise(pop=sum(n),
         interstate_movers=sum(n[MIGRATE1==3])) %>%
  mutate(interstate_mig_rate=interstate_movers/pop)

kable(head(migrate_national_unweighted))
pop interstate_movers interstate_mig_rate
3210231 72688 0.02264

Compute Weighted Migration Rate

This code block computes the percentage of the US population that migrates interstate based on the 2019 ACS sample counting in the person weight, displaying a more accurate estimation of the US interstate migration in 2019. The result migration rate is 2.31%

migrate_national_weighted <- data %>% 
  filter(AGE>=1) %>% 
  summarise(pop=sum(PERWT),
         interstate_movers=sum(PERWT[MIGRATE1==3])) %>%
  mutate(interstate_mig_rate=interstate_movers/pop)

kable(head(migrate_national_weighted))
pop interstate_movers interstate_mig_rate
324737606 7501876 0.0231

Compute Weighted Migration Rates by Gender/Sex

This code block computes the percentage of US interstate movers for male and female in 2019, counting in person weight. Sex 1 representing male has a migration rate of 2.38%; sex 2 representing female has a migration rate of 2.25%, which is slightly lower than that of male.

migrate_national_sex <- data %>% 
  filter(AGE>=1) %>% 
  group_by(SEX) %>% 
  summarise(pop=sum(PERWT),
         interstate_movers=sum(PERWT[MIGRATE1==3])) %>%
  mutate(interstate_mig_rate=interstate_movers/pop)

kable(head(migrate_national_sex))
SEX pop interstate_movers interstate_mig_rate
1 159795828 3796339 0.02376
2 164941778 3705537 0.02247

Compute Weighted Migration Rates by Age

This code block computes the percentage of US interstate movers for each age within the age range (age has to be older than 0) in 2019, counting in person weight. Below is the migration rate for the first six age.

migrate_age <- data %>% 
  filter(AGE>=1) %>% 
  group_by(AGE) %>% 
  summarise(pop=sum(PERWT),
         interstate_movers=sum(PERWT[MIGRATE1==3])) %>%
  mutate(interstate_mig_rate=interstate_movers/pop)

kable(head(migrate_age))
AGE pop interstate_movers interstate_mig_rate
1 3777331 114018 0.03018
2 3913567 118238 0.03021
3 4016207 94121 0.02344
4 4106801 94470 0.02300
5 3880268 91160 0.02349
6 3869326 80178 0.02072

Plot migration rates by age: a migration schedule by age

A clear pattern

ggplot(data=migrate_age, aes(x=AGE, y=interstate_mig_rate)) +
  geom_line()+
  geom_point() +
  scale_x_continuous(name="Age") +
  scale_y_continuous(name="Migration Rate")

Smooth the age schedule, add title etc, plot using percentages

Get rid of the jerky trend line and replace with a smoothed one. It does not capture the extremes but does show the general trends.

ggplot(data=migrate_age, aes(x=AGE, y=interstate_mig_rate)) +
  geom_point() +
  geom_smooth(span=.24) +
  scale_x_continuous(name="Age", limits=c(0,100), breaks=seq(0,100, by = 5)) +
  scale_y_continuous(name="Migration Rate", limits=c(0,0.06), breaks=seq(0,0.6, by =0.01), 
                     labels=percent) +
  labs(title = "Age Schedule for US Interstate Migration") +
  theme(text=element_text(size=14))
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## Warning: Removed 2 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 2 rows containing missing values (`geom_point()`).

Compute Migration Rate by Race

This code block computes the percentage of US interstate movers for each racial group (White, Black, American Indian, Asian, Other, Mixed, and Latino) in 2019, counting in person weight.

migrate_race <- data %>% 
  filter(AGE>=1) %>% 
  mutate(RACE_ETHNIC=case_when(
    HISPAN==0 & RACE==1 ~ "White",
    HISPAN==0 & RACE==2 ~ "Black",
    HISPAN==0 & RACE==3 ~ "American Indian",
    HISPAN==0 & (RACE>=4 & RACE <=6) ~ "Asian",
    HISPAN==0 & RACE==7 ~ "Other",
    HISPAN==0 & RACE>=8 ~ "Mixed",
    HISPAN>0 ~ "Latinx")) %>% 
  group_by(RACE_ETHNIC) %>% 
  summarise(pop=sum(PERWT),
         interstate_movers=sum(PERWT[MIGRATE1==3])) %>%
  mutate(interstate_mig_rate=interstate_movers/pop)

kable(migrate_race)
RACE_ETHNIC pop interstate_movers interstate_mig_rate
American Indian 2187817 54078 0.02472
Asian 18681193 503396 0.02695
Black 40193456 867732 0.02159
Latinx 59604788 1037598 0.01741
Mixed 8224028 268469 0.03264
Other 817627 14735 0.01802
White 195028697 4755868 0.02439

It appears in the table that some minority groups have a relatively lower rate of interstate migration. One possible explanation could be that minority groups are less disperse and more concentrated in the nation as they have a need to attach to their co-ethnic communities. Moving to a different state for minority groups not only means facing new networking and working/living environments, but also the need to discover local racial groups, which could be difficult in some states. Given this aspect, minority groups might be less inclined to move.

Migration Rates by Age for Each Racial Group

This code block computes the percentage of US interstate movers for each racial group by age in 2019, counting in person weight.

migrate_age_race <- data %>%
  filter(AGE>=1) %>%
  mutate(RACE_ETHNIC=case_when(
    HISPAN==0 & RACE==1 ~ "White",
    HISPAN==0 & RACE==2 ~ "Black",
    HISPAN==0 & RACE==3 ~ "American Indian",
    HISPAN==0 & (RACE>=4 & RACE <=6) ~ "Asian",
    HISPAN==0 & RACE==7 ~ "Other",
    HISPAN==0 & RACE>=8 ~ "Mixed",
    HISPAN>0 ~ "Latinx")) %>%
  group_by(AGE, RACE_ETHNIC) %>%
  summarise(pop=sum(PERWT),
            interstate_movers=sum(PERWT[MIGRATE1==3])) %>%
  mutate(interstate_mig_rate=interstate_movers/pop)
## `summarise()` has grouped output by 'AGE'. You can override using the `.groups`
## argument.
kable(head(migrate_age_race))
AGE RACE_ETHNIC pop interstate_movers interstate_mig_rate
1 American Indian 26547 1135 0.04275
1 Asian 170907 4570 0.02674
1 Black 494926 13377 0.02703
1 Latinx 984859 25604 0.02600
1 Mixed 228702 6664 0.02914
1 Other 16941 241 0.01423

Plot migration rates by age for each racial group: Age Schedule for US Interstate Migration for Each Racial Group

ggplot(data=migrate_age_race, aes(x=AGE, y=interstate_mig_rate)) +
  geom_smooth(aes(color=RACE_ETHNIC),span=.24)+
  geom_point(size=0.7) +
  scale_x_continuous(name="Age", limits=c(0,100), breaks=seq(0,100, by = 5)) +
  scale_y_continuous(name="Migration Rate", limits=c(0,0.06), breaks=seq(0,0.6, by =0.01), 
                     labels=percent) +
  labs(title = "Age Schedule for US Interstate Migration for Each Race") +
  theme(text=element_text(size=14))
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## Warning: Removed 18 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 8 rows containing missing values (`geom_smooth()`).
## Warning: Removed 18 rows containing missing values (`geom_point()`).