library(ipumsr)
## Warning: package 'ipumsr' was built under R version 4.1.3
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.1.3
## -- Attaching packages --------------------------------------- tidyverse 1.3.2 --
## v ggplot2 3.4.0 v purrr 1.0.1
## v tibble 3.1.8 v dplyr 1.0.10
## v tidyr 1.2.1 v stringr 1.5.0
## v readr 2.1.3 v forcats 0.5.2
## Warning: package 'ggplot2' was built under R version 4.1.3
## Warning: package 'tibble' was built under R version 4.1.3
## Warning: package 'tidyr' was built under R version 4.1.3
## Warning: package 'readr' was built under R version 4.1.3
## Warning: package 'purrr' was built under R version 4.1.3
## Warning: package 'dplyr' was built under R version 4.1.3
## Warning: package 'stringr' was built under R version 4.1.3
## Warning: package 'forcats' was built under R version 4.1.3
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(knitr)
## Warning: package 'knitr' was built under R version 4.1.3
library(scales)
## Warning: package 'scales' was built under R version 4.1.3
##
## Attaching package: 'scales'
##
## The following object is masked from 'package:purrr':
##
## discard
##
## The following object is masked from 'package:readr':
##
## col_factor
# Change these directories to where your data file is stored and to where you want to save plots; these are in a subdirectory of the R project where the .Rmd file is located.
dataDir <- "./data" #read data file dir
dataDir2 <- "./plots" #save plot files dir
options(scipen=99, digits=5)
ddi <- read_ipums_ddi(file.path(dataDir,"usa_00001.xml")) # ACS 2019
data <- read_ipums_micro(ddi)
## Use of data from IPUMS USA is subject to conditions including that users should
## cite the data appropriately. Use command `ipums_conditions()` for more details.
kable(head(data))
| YEAR | SAMPLE | SERIAL | CBSERIAL | HHWT | CLUSTER | STATEFIP | MET2013 | STRATA | GQ | PERNUM | PERWT | SEX | AGE | RACE | RACED | HISPAN | HISPAND | BPL | BPLD | CITIZEN | RACAMIND | RACASIAN | RACBLK | RACPACIS | RACWHT | RACOTHER | RACNUM | EDUC | EDUCD | MIGRATE1 | MIGRATE1D | MIGPLAC1 | MIGMET131 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2019 | 201901 | 1 | 2019010000088 | 11 | 2019000000011 | 1 | 0 | 220001 | 4 | 1 | 11 | 1 | 39 | 2 | 200 | 0 | 0 | 1 | 100 | 0 | 1 | 1 | 2 | 1 | 1 | 1 | 1 | 4 | 40 | 1 | 10 | 0 | 0 |
| 2019 | 201901 | 2 | 2019010000096 | 70 | 2019000000021 | 1 | 0 | 100001 | 3 | 1 | 70 | 2 | 21 | 1 | 100 | 0 | 0 | 13 | 1300 | 0 | 1 | 1 | 1 | 1 | 2 | 1 | 1 | 4 | 40 | 2 | 23 | 1 | 0 |
| 2019 | 201901 | 3 | 2019010000153 | 20 | 2019000000031 | 1 | 11500 | 110001 | 4 | 1 | 20 | 1 | 19 | 2 | 200 | 0 | 0 | 1 | 100 | 0 | 1 | 1 | 2 | 1 | 1 | 1 | 1 | 7 | 71 | 2 | 24 | 1 | 0 |
| 2019 | 201901 | 4 | 2019010000198 | 79 | 2019000000041 | 1 | 11500 | 110001 | 3 | 1 | 79 | 1 | 77 | 1 | 100 | 0 | 0 | 1 | 100 | 0 | 1 | 1 | 1 | 1 | 2 | 1 | 1 | 3 | 30 | 2 | 23 | 1 | 11500 |
| 2019 | 201901 | 5 | 2019010000205 | 53 | 2019000000051 | 1 | 33660 | 270101 | 3 | 1 | 53 | 1 | 41 | 2 | 200 | 0 | 0 | 1 | 100 | 0 | 1 | 1 | 2 | 1 | 1 | 1 | 1 | 3 | 30 | 2 | 23 | 1 | 33660 |
| 2019 | 201901 | 6 | 2019010000215 | 77 | 2019000000061 | 1 | 33860 | 200001 | 4 | 1 | 77 | 1 | 18 | 2 | 200 | 0 | 0 | 26 | 2600 | 0 | 1 | 1 | 2 | 1 | 1 | 1 | 1 | 6 | 65 | 3 | 32 | 26 | 19820 |
# extract variable names the traditional way
varname <- as.data.frame(colnames(data))
# extract labels
n <- ncol(data)
labels_list <- map(1:n, function(x) attr(data[[x]], "label") )
labels_vector <- map_chr(1:n, function(x) attr(data[[x]], "label") )
# ddi info
names(ddi)
## [1] "file_name" "file_path" "file_type" "ipums_project"
## [5] "extract_date" "extract_notes" "rectypes" "rectype_idvar"
## [9] "rectypes_keyvars" "var_info" "conditions" "citation"
## [13] "file_encoding"
# extract file
ddi$file_name
## [1] "usa_00001.dat"
# description of extract file
ddi$extract_notes
## [1] "User-provided description: 2019 ACS migration"
# variables in extract
ddi$var_info
## # A tibble: 34 x 10
## var_name var_l~1 var_d~2 val_la~3 code_~4 start end imp_d~5 var_t~6 recty~7
## <chr> <chr> <chr> <list> <chr> <dbl> <dbl> <dbl> <chr> <lgl>
## 1 YEAR Census~ "YEAR ~ <tibble> <NA> 1 4 0 integer NA
## 2 SAMPLE IPUMS ~ "SAMPL~ <tibble> <NA> 5 10 0 integer NA
## 3 SERIAL Househ~ "SERIA~ <tibble> "Codes~ 11 18 0 numeric NA
## 4 CBSERIAL Origin~ "CBSER~ <tibble> "Codes~ 19 31 0 numeric NA
## 5 HHWT Househ~ "HHWT ~ <tibble> "Codes~ 32 41 2 numeric NA
## 6 CLUSTER Househ~ "CLUST~ <tibble> "Codes~ 42 54 0 numeric NA
## 7 STATEFIP State ~ "STATE~ <tibble> <NA> 55 56 0 integer NA
## 8 MET2013 Metrop~ "A met~ <tibble> <NA> 57 61 0 integer NA
## 9 STRATA Househ~ "STRAT~ <tibble> "Codes~ 62 73 0 numeric NA
## 10 GQ Group ~ "GQ cl~ <tibble> <NA> 74 74 0 integer NA
## # ... with 24 more rows, and abbreviated variable names 1: var_label,
## # 2: var_desc, 3: val_labels, 4: code_instr, 5: imp_decim, 6: var_type,
## # 7: rectypes
# details on variable in extract
ipums_var_label(ddi, MIGRATE1)
## [1] "Migration status, 1 year [general version]"
ipums_val_labels(ddi, MIGRATE1)
## # A tibble: 6 x 2
## val lbl
## <dbl> <chr>
## 1 0 N/A
## 2 1 Same house
## 3 2 Moved within state
## 4 3 Moved between states
## 5 4 Abroad one year ago
## 6 9 Unknown
# make a new variable from state name from statefips labels; these is not used in this script but useful and easy to read for later analysis
STATEFIP <- ipums_val_labels(ddi, STATEFIP)
# add statefips labels to data as variable STATENAME
data$STATENAME <- as_factor(data$STATEFIP)
# interactively view variables, labels/descriptions, values, generates a web page
ipums_view(ddi)
# file name
This code block computes the percentage of the US population that migrates interstate based on the 2019 ACS sample disregarding the person weight that counts for the significance of representation of each record.The result migration rate is 2.26%.
migrate_national_unweighted <- data %>%
filter(AGE>=1) %>%
mutate(n=1) %>%
summarise(pop=sum(n),
interstate_movers=sum(n[MIGRATE1==3])) %>%
mutate(interstate_mig_rate=interstate_movers/pop)
kable(head(migrate_national_unweighted))
| pop | interstate_movers | interstate_mig_rate |
|---|---|---|
| 3210231 | 72688 | 0.02264 |
This code block computes the percentage of the US population that migrates interstate based on the 2019 ACS sample counting in the person weight, displaying a more accurate estimation of the US interstate migration in 2019. The result migration rate is 2.31%
migrate_national_weighted <- data %>%
filter(AGE>=1) %>%
summarise(pop=sum(PERWT),
interstate_movers=sum(PERWT[MIGRATE1==3])) %>%
mutate(interstate_mig_rate=interstate_movers/pop)
kable(head(migrate_national_weighted))
| pop | interstate_movers | interstate_mig_rate |
|---|---|---|
| 324737606 | 7501876 | 0.0231 |
This code block computes the percentage of US interstate movers for male and female in 2019, counting in person weight. Sex 1 representing male has a migration rate of 2.38%; sex 2 representing female has a migration rate of 2.25%, which is slightly lower than that of male.
migrate_national_sex <- data %>%
filter(AGE>=1) %>%
group_by(SEX) %>%
summarise(pop=sum(PERWT),
interstate_movers=sum(PERWT[MIGRATE1==3])) %>%
mutate(interstate_mig_rate=interstate_movers/pop)
kable(head(migrate_national_sex))
| SEX | pop | interstate_movers | interstate_mig_rate |
|---|---|---|---|
| 1 | 159795828 | 3796339 | 0.02376 |
| 2 | 164941778 | 3705537 | 0.02247 |
This code block computes the percentage of US interstate movers for each age within the age range (age has to be older than 0) in 2019, counting in person weight. Below is the migration rate for the first six age.
migrate_age <- data %>%
filter(AGE>=1) %>%
group_by(AGE) %>%
summarise(pop=sum(PERWT),
interstate_movers=sum(PERWT[MIGRATE1==3])) %>%
mutate(interstate_mig_rate=interstate_movers/pop)
kable(head(migrate_age))
| AGE | pop | interstate_movers | interstate_mig_rate |
|---|---|---|---|
| 1 | 3777331 | 114018 | 0.03018 |
| 2 | 3913567 | 118238 | 0.03021 |
| 3 | 4016207 | 94121 | 0.02344 |
| 4 | 4106801 | 94470 | 0.02300 |
| 5 | 3880268 | 91160 | 0.02349 |
| 6 | 3869326 | 80178 | 0.02072 |
A clear pattern
ggplot(data=migrate_age, aes(x=AGE, y=interstate_mig_rate)) +
geom_line()+
geom_point() +
scale_x_continuous(name="Age") +
scale_y_continuous(name="Migration Rate")
Get rid of the jerky trend line and replace with a smoothed one. It does not capture the extremes but does show the general trends.
ggplot(data=migrate_age, aes(x=AGE, y=interstate_mig_rate)) +
geom_point() +
geom_smooth(span=.24) +
scale_x_continuous(name="Age", limits=c(0,100), breaks=seq(0,100, by = 5)) +
scale_y_continuous(name="Migration Rate", limits=c(0,0.06), breaks=seq(0,0.6, by =0.01),
labels=percent) +
labs(title = "Age Schedule for US Interstate Migration") +
theme(text=element_text(size=14))
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## Warning: Removed 2 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 2 rows containing missing values (`geom_point()`).
This code block computes the percentage of US interstate movers for each racial group (White, Black, American Indian, Asian, Other, Mixed, and Latino) in 2019, counting in person weight.
migrate_race <- data %>%
filter(AGE>=1) %>%
mutate(RACE_ETHNIC=case_when(
HISPAN==0 & RACE==1 ~ "White",
HISPAN==0 & RACE==2 ~ "Black",
HISPAN==0 & RACE==3 ~ "American Indian",
HISPAN==0 & (RACE>=4 & RACE <=6) ~ "Asian",
HISPAN==0 & RACE==7 ~ "Other",
HISPAN==0 & RACE>=8 ~ "Mixed",
HISPAN>0 ~ "Latinx")) %>%
group_by(RACE_ETHNIC) %>%
summarise(pop=sum(PERWT),
interstate_movers=sum(PERWT[MIGRATE1==3])) %>%
mutate(interstate_mig_rate=interstate_movers/pop)
kable(migrate_race)
| RACE_ETHNIC | pop | interstate_movers | interstate_mig_rate |
|---|---|---|---|
| American Indian | 2187817 | 54078 | 0.02472 |
| Asian | 18681193 | 503396 | 0.02695 |
| Black | 40193456 | 867732 | 0.02159 |
| Latinx | 59604788 | 1037598 | 0.01741 |
| Mixed | 8224028 | 268469 | 0.03264 |
| Other | 817627 | 14735 | 0.01802 |
| White | 195028697 | 4755868 | 0.02439 |
It appears in the table that some minority groups have a relatively lower rate of interstate migration. One possible explanation could be that minority groups are less disperse and more concentrated in the nation as they have a need to attach to their co-ethnic communities. Moving to a different state for minority groups not only means facing new networking and working/living environments, but also the need to discover local racial groups, which could be difficult in some states. Given this aspect, minority groups might be less inclined to move.
This code block computes the percentage of US interstate movers for each racial group by age in 2019, counting in person weight.
migrate_age_race <- data %>%
filter(AGE>=1) %>%
mutate(RACE_ETHNIC=case_when(
HISPAN==0 & RACE==1 ~ "White",
HISPAN==0 & RACE==2 ~ "Black",
HISPAN==0 & RACE==3 ~ "American Indian",
HISPAN==0 & (RACE>=4 & RACE <=6) ~ "Asian",
HISPAN==0 & RACE==7 ~ "Other",
HISPAN==0 & RACE>=8 ~ "Mixed",
HISPAN>0 ~ "Latinx")) %>%
group_by(AGE, RACE_ETHNIC) %>%
summarise(pop=sum(PERWT),
interstate_movers=sum(PERWT[MIGRATE1==3])) %>%
mutate(interstate_mig_rate=interstate_movers/pop)
## `summarise()` has grouped output by 'AGE'. You can override using the `.groups`
## argument.
kable(head(migrate_age_race))
| AGE | RACE_ETHNIC | pop | interstate_movers | interstate_mig_rate |
|---|---|---|---|---|
| 1 | American Indian | 26547 | 1135 | 0.04275 |
| 1 | Asian | 170907 | 4570 | 0.02674 |
| 1 | Black | 494926 | 13377 | 0.02703 |
| 1 | Latinx | 984859 | 25604 | 0.02600 |
| 1 | Mixed | 228702 | 6664 | 0.02914 |
| 1 | Other | 16941 | 241 | 0.01423 |
ggplot(data=migrate_age_race, aes(x=AGE, y=interstate_mig_rate)) +
geom_smooth(aes(color=RACE_ETHNIC),span=.24)+
geom_point(size=0.7) +
scale_x_continuous(name="Age", limits=c(0,100), breaks=seq(0,100, by = 5)) +
scale_y_continuous(name="Migration Rate", limits=c(0,0.06), breaks=seq(0,0.6, by =0.01),
labels=percent) +
labs(title = "Age Schedule for US Interstate Migration for Each Race") +
theme(text=element_text(size=14))
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## Warning: Removed 18 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 8 rows containing missing values (`geom_smooth()`).
## Warning: Removed 18 rows containing missing values (`geom_point()`).