#setup
library(readr)
library(kirkegaard)
library(tidyverse)
library(survey)
library(Hmisc)
#load NLSY
nlsy<-read_csv("NLSY_Mobility.csv
#defining/renaming variables
nlsy <- nlsy %>% rename (
m_id = R0000100,
birth_yr = R0000500,
birth_mo = R0000300,
sex = R0214800,
race = R0214700,
us_born = R0000700
)
nlsy <- nlsy %>% rename(
income1978 = R0155400,
income1979 = R0312300,
income1980 = R0482600,
income1981 = R0782101,
income1982 = R1024001,
income1983 = R1410701,
income1984 = R1778501,
income1985 = R2141601,
income1986 = R2350301,
income1987 = R2722501,
income1988 = R3279401,
income1989 = R2971401,
income1990 = R3559001,
income1991 = R3897101,
income1992 = R4295101,
income1993 = R4982801,
income1995 = R5626201,
income1997 = R6364601,
income1999 = R6909701,
income2001 = R7607800,
income2003 = R8316300,
income2005 = T0912400
)
income_vars <- c(
"income1978","income1979","income1980","income1981","income1982",
"income1983","income1984","income1985","income1986","income1987",
"income1988","income1989","income1990","income1991","income1992",
"income1993","income1995","income1997","income1999","income2001",
"income2003","income2005"
)
nlsy <- nlsy %>% rename (
poverty1978 = R0217910,
poverty1979 = R0406100,
poverty1980 = R0618500,
poverty1981 = R0898700,
poverty1982 = R1144600,
poverty1983 = R1519800,
poverty1984 = R1890500,
poverty1985 = R2257600,
poverty1986 = R2444900,
poverty1987 = R2870400,
poverty1988 = R3074100,
poverty1989 = R3400800,
poverty1990 = R3656200,
poverty1991 = R4006700,
poverty1992 = R4417800,
poverty1993 = R5080800,
poverty1995 = R5166100,
poverty1997 = R6478800,
poverty1999 = R7006600,
poverty2001 = R7703900,
poverty2003 = R8496300,
poverty2005 = T0987900
)
poverty_vars <- c(
"poverty1978","poverty1979","poverty1980","poverty1981","poverty1982",
"poverty1983","poverty1984","poverty1985","poverty1986","poverty1987",
"poverty1988","poverty1989","poverty1990","poverty1991","poverty1992",
"poverty1993","poverty1995","poverty1997","poverty1999","poverty2001",
"poverty2003","poverty2005"
)
nlsy <- nlsy %>% rename(
wealth1985 = R1790803,
wealth1986 = R2153903,
wealth1987 = R2362503,
wealth1988 = R2735403,
wealth1989 = R2982903,
wealth1990 = R3293303,
wealth1992 = R3911003,
wealth1993 = R4392303,
wealth1994 = R5046603,
wealth1996 = R5728003,
wealth1998 = R6426003,
wealth2000 = R6940103,
wealth2004 = R8378703
)
wealth_vars <- c(
"wealth1985","wealth1986","wealth1987","wealth1988","wealth1989",
"wealth1990","wealth1992","wealth1993","wealth1994","wealth1996",
"wealth1998","wealth2000","wealth2004")
nlsy <- nlsy %>% rename(
highest_grade1994 = R5103900,
highest_grade2006 = T0988800
)
nlsy <- nlsy %>% rename (
asvab_gs = R0618010,
asvab_ar = R0618011,
asvab_wk = R0618012,
asvab_pc = R0618013,
asvab_no = R0618014,
asvab_cs = R0618015,
asvab_as = R0618016,
asvab_mk = R0618017,
asvab_mc = R0618018,
asvab_ei = R0618019
)
asvab_vars <- c(
"asvab_gs", "asvab_ar", "asvab_wk", "asvab_pc", "asvab_no",
"asvab_cs", "asvab_as", "asvab_mk", "asvab_mc", "asvab_ei")
#computed AFQT
nlsy <- nlsy %>% mutate (
afqt_raw = asvab_ar + asvab_mk +
2 * (asvab_wk + asvab_pc)
)
#race and sex dummies
nlsy <- nlsy %>% mutate (
white = race ==3,
black = race == 2,
hispanic = race == 1
)
nlsy <- nlsy %>% mutate (
female = sex == 2
)
#restricted dataset to females
nlsy <- nlsy %>% filter(female)
#data cleaning
R’s with no valid data in any wave for one or more of the categories removed invalid data which remained after this filter was labeled NA
nlsy <- nlsy %>%
filter(us_born > 0)
nlsy <- nlsy %>%
filter(if_any(all_of(income_vars), ~ .x >= 0))
nlsy<- nlsy %>%
filter(if_any(all_of(poverty_vars), ~.x >=0))
nlsy <- nlsy %>%
filter(if_any(all_of(wealth_vars), ~ .x <= -6 | .x >= 0))
nlsy <- nlsy %>%
filter(if_all(all_of(asvab_vars), ~.x > 0))
nlsy <- nlsy %>%
mutate(
across(all_of(income_vars), ~ if_else(.x < 0, NA_real_, .x)),
across(all_of(poverty_vars), ~ if_else(.x < 0, NA_real_, .x)),
across(all_of(wealth_vars), ~ if_else(.x %in% -1:-5, NA_real_, .x))
)
nlsy <- nlsy %>% mutate(
highest_grade1994 = if_else(
highest_grade1994 < 0,
NA_real_, highest_grade1994)
)
nlsy <- nlsy %>% mutate(
highest_grade2006 = if_else(
highest_grade2006 < 0, NA_real_, highest_grade2006)
)
#standardization
nlsy <- nlsy %>%
mutate(afqt_z = kirkegaard::standardize(afqt_raw, focal_group = white))```
nlsy <- nlsy %>% mutate(
birth_yr = as.integer(birth_yr +1900L),
birth_mo = as.integer(birth_mo)
)
years <- 1978:2006
for (yr in years) {
nlsy[[paste0("age_", yr)]] <- yr - nlsy$birth_yr -
as.integer(nlsy$birth_mo > 6) }
created a function for var_at_age applied it to income and poverty
make_var_at_age <- function(df, target_age, vars, prefix) {
years <- as.integer(sub(paste0("^", prefix), "", vars))
out <- rep(NA_real_, nrow(df))
for (i in seq_along(vars)) {
yr <- years[i]
age_col <- paste0("age_", yr)
if (!age_col %in% names(df)) next
idx <- df[[age_col]] == target_age
out[idx] <- df[[vars[i]]][idx]
}
out
}
for (a in 18:41) {
nlsy[[paste0("income_age_", a)]] <-
make_var_at_age(nlsy, a, income_vars, "income")
}
for (a in 18:41) {
nlsy[[paste0("poverty_age_", a)]] <-
make_var_at_age(nlsy, a, poverty_vars, "poverty")
}
#standardization
AFQT was standardized relative to whites: afqt_z income standardized at each age relative to whole sample income was log transformed: income_agexx_log1p_z
nlsy <- nlsy %>%
mutate(afqt_z = kirkegaard::standardize(afqt_raw, focal_group = white))
income_age_vars <- grep("^income_age_[0-9]+$", names(nlsy), value = TRUE)
nlsy <- nlsy %>%
mutate(
across(all_of(income_age_vars), ~ log1p(.x), .names = "{.col}_log1p"),
across(ends_with("_log1p"), ~ kirkegaard::standardize(.x), .names = "{.col}_z")
)
#load CNLSY
cnlsy <- read_csv("CNLSY_Mobility/CNLSY_Mobility.csv")
#define variables create income_vars
cnlsy <- cnlsy %>% rename(
cid = C0000100,
m_id = C0000200,
csex = C0005400,
crace = C0005300,
cbirth_yr = C0005700,
cbirth_mo = C0005500
)
cnlsy <- cnlsy %>% mutate(
white = crace == 3,
black = crace == 2,
hispanic = crace == 1
)
cnlsy <- cnlsy %>%
mutate(
cbirth_yr = as.integer(cbirth_yr),
cbirth_mo = as.integer(cbirth_mo)
)
cnlsy <- cnlsy %>% rename(
cincome1993 = Y0309200,
cincome1995 = Y0610500,
cincome1997 = Y0904800,
cincome1999 = Y1151700,
cincome2001 = Y1386700,
cincome2003 = Y1638400,
cincome2005 = Y1910100,
cincome2007 = Y2225400,
cincome2009 = Y2573900,
cincome2011 = Y2921600,
cincome2013 = Y3291400,
cincome2015 = Y3631700,
cincome2017 = Y4240000,
cincome2019 = Y4561200
)
cincome_vars <- c(
"cincome1993", "cincome1995", "cincome1997", "cincome1999",
"cincome2001", "cincome2003", "cincome2005", "cincome2007",
"cincome2009", "cincome2011", "cincome2013", "cincome2015",
"cincome2017", "cincome2019")
cnlsy <- cnlsy %>% rename(
chighest_grade2012 = C5718500)
cnlsy <- cnlsy %>% rename(
cmath1988 = C0799600,
creadr1988 = C0799900,
creadc1988 = C0800200,
cmath1990 = C0999100,
creadr1990 = C0999400,
creadc1990 = C0999700,
cmath1992 = C1198800,
creadr1992 = C1199100,
creadc1992 = C1199400,
cmath1994 = C1507800,
creadr1994 = C1508100,
creadc1994 = C1508400,
cmath1996 = C1564700,
creadr1996 = C1565000,
creadc1996 = C1565300,
cmath1998 = C1800100,
creadr1998 = C1800400,
creadc1998 = C1800700,
cmath2000 = C2503700,
creadr2000 = C2503900,
creadc2000 = C2504300,
cmath2002 = C2532200,
creadr2002 = C2532400,
creadc2002 = C2532800,
cmath2004 = C2803000,
creadr2004 = C2803200,
creadc2004 = C2803600
)
filtered dataset to females born between 1978 and 1990
cnlsy <- cnlsy %>%
filter(cbirth_yr >= 1978, cbirth_yr <= 1990)
cnlsy <- cnlsy %>% filter (csex ==2)
#data cleaning
removed respondents with no valid birth month or income in any wave; defined missing/invalid data as NA
cnlsy <- cnlsy %>% filter(cbirth_mo >= 0)
cnlsy <- cnlsy %>%
filter(if_any(all_of(cincome_vars), ~ .x >= 0))
cnlsy <- cnlsy %>%
mutate(
across(all_of(cincome_vars), ~ if_else(.x < 0, NA_real_, .x)))
#creating and standardizing PIAT composite
PIAT = (math + reading r + reading c)/3 Normed by age and with whites as the reference
piat_years <- c(1988,1990,1992,1994,1996,1998,2000,2002,2004)
for (yr in piat_years) {
cnlsy[[paste0("cage_", yr)]] <- yr - cnlsy$cbirth_yr
- as.integer(cnlsy$cbirth_mo > 6)
}
for (yr in piat_years) {
m <- paste0("cmath", yr)
r <- paste0("creadr", yr)
c <- paste0("creadc", yr)
comp <- paste0("piat_", yr)
}
piat_vars <- grep("^c(math|readr|readc)[0-9]{4}$", names(cnlsy), value = TRUE)
cnlsy <- cnlsy %>%
mutate(across(all_of(piat_vars), ~ if_else(.x <0, NA_real_, as.numeric(.x))))
for (yr in piat_years) {
m <- paste0("cmath", yr)
r <- paste0("creadr", yr)
c <- paste0("creadc", yr)
out <- paste0("piat", yr)
if (!all(c(m, r, c) %in% names(cnlsy))) next
cnlsy[[out]] <- ifelse(
!is.na(cnlsy[[m]]) & !is.na(cnlsy[[r]]) & !is.na(cnlsy[[c]]),
(cnlsy[[m]] + cnlsy[[r]] + cnlsy[[c]]) / 3,
NA_real_
)
}
rm(yr, m, r, c, out)
make_piat_at_age <- function(df, target_age, years) {
out <- rep(NA_real_, nrow(df))
for (yr in years) {
piat_col <- paste0("piat", yr)
age_col <- paste0("cage_", yr)
if (!all(c(piat_col, age_col) %in% names(df))) next
idx <- df[[age_col]] == target_age
out[idx] <- df[[piat_col]][idx]
}
out
}
for (a in 8:14) {
cnlsy[[paste0("piat_age_", a)]] <- make_piat_at_age(cnlsy, a, piat_years)
}
cnlsy <- cnlsy %>%
mutate(piat_age_14z = kirkegaard::standardize(
piat_age_14, focal_group = white))
```
#Income standardization
note: standardized without removing missing age 14 PIAT
```r
income_years <- as.integer(sub("^cincome", "", cincome_vars))
for (yr in income_years) {
cnlsy[[paste0("cage_", yr)]] <- yr - cnlsy$cbirth_yr -
as.integer(cnlsy$cbirth_mo > 6)
}
rm(yr)
make_income_age <- function(df, target_age, vars, prefix, age_prefix = "cage_") {
years <- as.integer(sub(paste0("^", prefix), "", vars))
out <- rep(NA_real_, nrow(df))
for (i in seq_along(vars)) {
yr <- years[i]
age_col <- paste0(age_prefix, yr)
if (!age_col %in% names(df)) next
idx <- df[[age_col]] == target_age
out[idx] <- df[[vars[i]]][idx]
}
out
}
for (a in 18:41) {
cnlsy[[paste0("cincome_age_", a)]] <- make_income_age(
cnlsy, a, cincome_vars, "cincome", age_prefix = "cage_"
)
}
rm(a)
cincome_age_vars <- grep("^cincome_age_[0-9]+$", names(cnlsy), value = TRUE)
cnlsy <- cnlsy %>%
mutate(
across(all_of(cincome_age_vars), ~ log1p(.x), .names = "{.col}_log1p"),
across(ends_with("_log1p"), ~ kirkegaard::standardize(.x), .names = "{.col}_z")
)
#final cleaning and merge
cnlsy <- cnlsy %>% filter(
!is.na(piat_age_14z)
)
nlsy_one <- nlsy %>%
select(m_id, afqt_z, matches("^income_age_[0-9]+log1p_z$")) %>%
group_by(m_id) %>%
summarise(
mother_iq_z = first(afqt_z),
across(matches("^income_age_[0-9]+log1p_z$"), first),
.groups = "drop"
)
merged <- cnlsy %>%
left_join(nlsy_one, by = "m_id")
merged <- cnlsy %>%
left_join(
nlsy %>% select(m_id, afqt_z, starts_with("income_age_")),
by = "m_id"
)
#renaming important variables
merged <- merged %>%
rename_with(
~ sub("^income_age_([0-9]+)_log1p_z$", "mother_income_\\1_z", .x),
matches("^income_age_[0-9]+_log1p_z$")
)
merged <- merged %>%
rename_with(
~ sub("^cincome_age_([0-9]+)_log1p_z$", "daughter_income_\\1_z", .x),
matches("^cincome_age_[0-9]+_log1p_z$")
)
merged <- merged %>% rename(
mother_iq_z = afqt_z,
daughter_iq_z = piat_age_14z
)
#results
Mother-Daughter IQ correlation:
cor_iq <- cor(
merged$mother_iq_z,
merged$daughter_iq_z,
use = "complete.obs"
)
cor_iq
r=0.53
Mother-Daughter income correlations:
name_map <- tibble(var = names(merged)) %>%
mutate(
who = case_when(
grepl("mother_income", var) ~ "mother",
grepl("daughter_income", var) ~ "daughter",
TRUE ~ NA_character_
),
age = parse_number(var)
) %>%
filter(!is.na(who), !is.na(age))
name_map %>% count(who)
name_map %>% group_by(who) %>%
summarise(min_age=min(age), max_age=max(age), n=n())
common_ages <- intersect(
name_map %>% filter(who=="mother") %>% pull(age),
name_map %>% filter(who=="daughter") %>% pull(age)
) %>% sort()
income_corrs <- lapply(common_ages, function(a) {
mvar <- name_map %>% filter(who=="mother", age==a) %>% pull(var) %>% first()
dvar <- name_map %>% filter(who=="daughter", age==a) %>% pull(var) %>% first()
ok <- complete.cases(merged[[mvar]], merged[[dvar]])
tibble(
age = a,
r = if (sum(ok) >= 3) cor(merged[[mvar]][ok],
merged[[dvar]][ok]) else NA_real_,
n = sum(ok),
mother_var = mvar,
daughter_var = dvar
)
}) %>% bind_rows()
income_corrs
range(income_corrs$r, na.rm = TRUE)
correlations range from -0.03 to 0.40, with mean of 0.14
Mother-Daughter differences in IQ and income:
merged <- merged %>%
mutate(delta_iq = daughter_iq_z - mother_iq_z)
delta_corrs <- data.frame(
age = common_ages,
r = NA_real_,
n = NA_integer_
)
for (i in seq_along(common_ages)) {
a <- common_ages[i]
mv <- paste0("mother_income_", a, "_z")
dv <- paste0("daughter_income_", a, "_z")
tmp <- merged %>%
transmute(
delta_inc = .data[[dv]] - .data[[mv]],
delta_iq = delta_iq
)
ok <- complete.cases(tmp)
delta_corrs$r[i] <- cor(tmp$delta_iq[ok], tmp$delta_inc[ok])
delta_corrs$n[i] <- sum(ok)
}
delta_corrs
income_corrs %>%
filter(age >= 30, age <= 45) %>%
summarise(
min = min(r),
max = max(r),
mean = mean(r)
)
range(delta_corrs$r, na.rm = TRUE)
summary(delta_corrs$r)
correlations range from 0.04 to 0.48, with a mean of 0.19
#Additional Analyses
prime-age (restricted to ages 30-45)
income_corrs %>%
filter(age >= 30, age <= 45, !is.na(r)) %>%
summarise(
min = min(r),
max = max(r),
mean = mean(r)
)
mean r=0.13
delta_corrs %>%
filter(age >= 30, age <= 45, !is.na(r)) %>%
summarise(
min = min(r),
max = max(r),
mean = mean(r)
)
mean r=0.25
also with n>=100:
income_corrs %>%
filter(age >= 30, age <= 41, n >= 100, !is.na(r)) %>%
summarise(
min = min(r),
max = max(r),
mean = mean(r)
)
r=0.11
delta_corrs %>%
filter(age >= 30, age <= 41, n >= 100, !is.na(r)) %>%
summarise(
min = min(r),
max = max(r),
mean = mean(r)
)
r=0.22
#5-year smoothed (n-weighted)
income_corrs <- income_corrs %>%
mutate(
age_bin = cut(
age,
breaks = c(18, 24, 30, 36, 42),
right = FALSE,
labels = c("18–23", "24–29", "30–35", "36–41")
)
)
income_corrs_5yr <- income_corrs %>%
filter(!is.na(r), !is.na(age_bin)) %>%
group_by(age_bin) %>%
summarise(
r_weighted = weighted.mean(r, w = n),
min_r = min(r),
max_r = max(r),
n_ages = n(),
total_n = sum(n),
.groups = "drop"
)
income_corrs_5yr
rs ranging from 0.11 to 0.15
delta_corrs <- delta_corrs %>%
mutate(
age_bin = cut(
age,
breaks = c(18, 24, 30, 36, 42),
right = FALSE,
labels = c("18–23", "24–29", "30–35", "36–41")
)
)
delta_corrs_5yr <- delta_corrs %>%
filter(!is.na(r), !is.na(age_bin)) %>%
group_by(age_bin) %>%
summarise(
r_weighted = weighted.mean(r, w = n),
min_r = min(r),
max_r = max(r),
n_ages = n(),
total_n = sum(n),
.groups = "drop"
)
delta_corrs_5yr
rs of 0.12 to 0.23.
all years together:
income_corrs %>%
filter(!is.na(r)) %>%
summarise(
r_weighted = weighted.mean(r, w = n),
n_ages = n(),
total_n = sum(n)
)
```
r=0.13
```r
delta_all <- delta_corrs %>%
filter(!is.na(r)) %>%
summarise(
r_weighted = weighted.mean(r, w = n),
n_ages = n(),
total_n = sum(n)
)
delta_all
r=0.15
#all prime years
prime_income_corr <- income_corrs %>%
filter(age >= 30, age <= 41, !is.na(r)) %>%
summarise(
r_weighted = weighted.mean(r, w = n),
min_r = min(r),
max_r = max(r),
n_ages = n(),
total_n = sum(n)
)
prime_income_corr
r=0.12
delta_prime <- delta_corrs %>%
filter(age >= 30, age <= 45, !is.na(r)) %>%
summarise(
r_weighted = weighted.mean(r, w = n),
n_ages = n(),
total_n = sum(n)
)
delta_prime
r=0.22
#summary statistics
m_map <- name_map %>%
filter(who == "mother") %>%
distinct(age, var) %>%
arrange(age)
mother_iq_income_corrs <- data.frame(age = m_map$age, r =
NA_real_, n = NA_integer_, var = m_map$var)
for (i in seq_len(nrow(m_map))) {
v <- m_map$var[i]
ok <- complete.cases(merged$mother_iq_z, merged[[v]])
mother_iq_income_corrs$r[i] <- cor(merged$mother_iq_z[ok], merged[[v]][ok])
mother_iq_income_corrs$n[i] <- sum(ok)
}
mother_iq_income_corrs
Weighted average across ages:
weighted.mean(mother_iq_income_corrs$r,
w = mother_iq_income_corrs$n, na.rm = TRUE)
```
weighted mean mother IQ-income r is 0.29
```r
prime <- mother_iq_income_corrs$age >= 30 & mother_iq_income_corrs$age <= 41
weighted.mean(mother_iq_income_corrs$r[prime],
w = mother_iq_income_corrs$n[prime], na.rm = TRUE)
when restricted to ages 30-41, r=0.22
d_map <- name_map %>%
filter(who == "daughter") %>%
distinct(age, var) %>%
arrange(age)
daughter_iq_income_corrs <- data.frame(
age = d_map$age, r = NA_real_, n = NA_integer_, var = d_map$var)
for (i in seq_len(nrow(d_map))) {
v <- d_map$var[i]
ok <- complete.cases(merged$daughter_iq_z, merged[[v]])
daughter_iq_income_corrs$r[i] <- cor(merged$daughter_iq_z[ok], merged[[v]][ok])
daughter_iq_income_corrs$n[i] <- sum(ok)
}
daughter_iq_income_corrs
Weighted average across ages:
weighted.mean(daughter_iq_income_corrs$r,
w = daughter_iq_income_corrs$n, na.rm = TRUE)
#weighted r for daughter IQ-income =0.24
prime <- daughter_iq_income_corrs$age >= 30 & daughter_iq_income_corrs$age <= 45
weighted.mean(daughter_iq_income_corrs$r[prime],
w = daughter_iq_income_corrs$n[prime], na.rm = TRUE)
when restricted to ages 30-41, r=0.18