#setup

library(readr)
library(kirkegaard)
library(tidyverse)
library(survey)
library(Hmisc)

#load NLSY

nlsy<-read_csv("NLSY_Mobility.csv

#defining/renaming variables

nlsy <- nlsy %>% rename (
  m_id = R0000100,
  birth_yr = R0000500,
  birth_mo = R0000300,
  sex = R0214800,
  race = R0214700,
  us_born = R0000700
)

nlsy <- nlsy %>% rename(
  income1978 = R0155400,
  income1979 = R0312300,
  income1980 = R0482600,
  income1981 = R0782101,
  income1982 = R1024001,
  income1983 = R1410701,
  income1984 = R1778501,
  income1985 = R2141601, 
  income1986 = R2350301,
  income1987 = R2722501,
  income1988 = R3279401,
  income1989 = R2971401,
  income1990 = R3559001,
  income1991 = R3897101,
  income1992 = R4295101,
  income1993 = R4982801,
  income1995 = R5626201,
  income1997 = R6364601,
  income1999 = R6909701,
  income2001 = R7607800,
  income2003 = R8316300,
  income2005 = T0912400
)

income_vars <- c(
  "income1978","income1979","income1980","income1981","income1982",
  "income1983","income1984","income1985","income1986","income1987",
  "income1988","income1989","income1990","income1991","income1992",
  "income1993","income1995","income1997","income1999","income2001",
  "income2003","income2005"
)


nlsy <- nlsy %>% rename (
  poverty1978 = R0217910,
  poverty1979 = R0406100,
  poverty1980 = R0618500,
  poverty1981 = R0898700,
  poverty1982 = R1144600,
  poverty1983 = R1519800,
  poverty1984 = R1890500,
  poverty1985 = R2257600,
  poverty1986 = R2444900,
  poverty1987 = R2870400,
  poverty1988 = R3074100,
  poverty1989 = R3400800,
  poverty1990 = R3656200,
  poverty1991 = R4006700,
  poverty1992 = R4417800,
  poverty1993 = R5080800,
  poverty1995 = R5166100,
  poverty1997 = R6478800,
  poverty1999 = R7006600,
  poverty2001 = R7703900,
  poverty2003 = R8496300,
  poverty2005 = T0987900
)

poverty_vars <- c(
  "poverty1978","poverty1979","poverty1980","poverty1981","poverty1982",
  "poverty1983","poverty1984","poverty1985","poverty1986","poverty1987",
  "poverty1988","poverty1989","poverty1990","poverty1991","poverty1992",
  "poverty1993","poverty1995","poverty1997","poverty1999","poverty2001",
  "poverty2003","poverty2005"
)


nlsy <- nlsy %>% rename(
  wealth1985 = R1790803,
  wealth1986 = R2153903,
  wealth1987 = R2362503,
  wealth1988 = R2735403,
  wealth1989 = R2982903,
  wealth1990 = R3293303,
  wealth1992 = R3911003,
  wealth1993 = R4392303,
  wealth1994 = R5046603,
  wealth1996 = R5728003,
  wealth1998 = R6426003,
  wealth2000 = R6940103,
  wealth2004 = R8378703
)

wealth_vars <- c(
  "wealth1985","wealth1986","wealth1987","wealth1988","wealth1989",
  "wealth1990","wealth1992","wealth1993","wealth1994","wealth1996",
  "wealth1998","wealth2000","wealth2004")


nlsy <- nlsy %>% rename(
  highest_grade1994 = R5103900,
  highest_grade2006 = T0988800
)

nlsy <- nlsy %>% rename (
  asvab_gs = R0618010,
  asvab_ar = R0618011,
  asvab_wk = R0618012,
  asvab_pc = R0618013,
  asvab_no = R0618014,
  asvab_cs = R0618015,
  asvab_as = R0618016,
  asvab_mk = R0618017,
  asvab_mc = R0618018,
  asvab_ei = R0618019
)

asvab_vars <- c(
  "asvab_gs", "asvab_ar", "asvab_wk", "asvab_pc", "asvab_no", 
  "asvab_cs", "asvab_as", "asvab_mk", "asvab_mc", "asvab_ei")
 

#computed AFQT

nlsy <- nlsy %>% mutate (
  afqt_raw = asvab_ar + asvab_mk +
    2 * (asvab_wk + asvab_pc)
)

#race and sex dummies

nlsy <- nlsy %>% mutate (
  white = race ==3,
  black = race == 2,
  hispanic = race == 1
)

nlsy <- nlsy %>% mutate (
  female = sex == 2
)

#restricted dataset to females


nlsy <- nlsy %>% filter(female)

#data cleaning

R’s with no valid data in any wave for one or more of the categories removed invalid data which remained after this filter was labeled NA

nlsy <- nlsy %>% 
  filter(us_born > 0)

nlsy <- nlsy %>% 
  filter(if_any(all_of(income_vars), ~ .x >= 0))

nlsy<- nlsy %>% 
  filter(if_any(all_of(poverty_vars), ~.x >=0))

nlsy <- nlsy %>%
  filter(if_any(all_of(wealth_vars), ~ .x <= -6 | .x >= 0))

nlsy <- nlsy %>% 
  filter(if_all(all_of(asvab_vars), ~.x > 0))

nlsy <- nlsy %>%
  mutate(
    across(all_of(income_vars),  ~ if_else(.x < 0, NA_real_, .x)),
    across(all_of(poverty_vars), ~ if_else(.x < 0, NA_real_, .x)),
    across(all_of(wealth_vars),  ~ if_else(.x %in% -1:-5, NA_real_, .x))
  )

nlsy <- nlsy %>% mutate(
  highest_grade1994 = if_else(
    highest_grade1994 < 0,
    NA_real_, highest_grade1994)
)

nlsy <- nlsy %>% mutate(
  highest_grade2006 = if_else(
    highest_grade2006 < 0, NA_real_, highest_grade2006)
)

#standardization

nlsy <- nlsy %>%
  mutate(afqt_z = kirkegaard::standardize(afqt_raw, focal_group = white))```

nlsy <- nlsy %>% mutate(
  birth_yr = as.integer(birth_yr +1900L),
  birth_mo = as.integer(birth_mo)
)

years <- 1978:2006

for (yr in years) {
  nlsy[[paste0("age_", yr)]] <- yr - nlsy$birth_yr -
    as.integer(nlsy$birth_mo > 6) }

created a function for var_at_age applied it to income and poverty

make_var_at_age <- function(df, target_age, vars, prefix) {
  years <- as.integer(sub(paste0("^", prefix), "", vars))
  out <- rep(NA_real_, nrow(df))
  
  for (i in seq_along(vars)) {
    yr <- years[i]
    age_col <- paste0("age_", yr)
    if (!age_col %in% names(df)) next
    
    idx <- df[[age_col]] == target_age
    out[idx] <- df[[vars[i]]][idx]
  }
  
  out
}

for (a in 18:41) {
  nlsy[[paste0("income_age_", a)]] <-
    make_var_at_age(nlsy, a, income_vars, "income")
}

for (a in 18:41) {
  nlsy[[paste0("poverty_age_", a)]] <-
    make_var_at_age(nlsy, a, poverty_vars, "poverty")
}

#standardization

AFQT was standardized relative to whites: afqt_z income standardized at each age relative to whole sample income was log transformed: income_agexx_log1p_z

nlsy <- nlsy %>%
  mutate(afqt_z = kirkegaard::standardize(afqt_raw, focal_group = white))

income_age_vars <- grep("^income_age_[0-9]+$", names(nlsy), value = TRUE)

 nlsy <- nlsy %>%
       mutate(
     across(all_of(income_age_vars), ~ log1p(.x), .names = "{.col}_log1p"),
across(ends_with("_log1p"), ~ kirkegaard::standardize(.x), .names = "{.col}_z")
         )

#load CNLSY

cnlsy <- read_csv("CNLSY_Mobility/CNLSY_Mobility.csv")

#define variables create income_vars

cnlsy <- cnlsy %>% rename(
  cid = C0000100,
   m_id = C0000200,
   csex = C0005400,
   crace = C0005300,
    cbirth_yr = C0005700,
    cbirth_mo = C0005500
   )
cnlsy <- cnlsy %>% mutate(
  white = crace == 3,
  black = crace == 2,
  hispanic = crace == 1
)


cnlsy <- cnlsy %>%
  mutate(
    cbirth_yr = as.integer(cbirth_yr),
    cbirth_mo = as.integer(cbirth_mo)
  )

cnlsy <- cnlsy %>% rename(
  cincome1993 = Y0309200,
  cincome1995 = Y0610500,
  cincome1997 = Y0904800,
  cincome1999 = Y1151700,
  cincome2001 = Y1386700,
  cincome2003 = Y1638400,
  cincome2005 = Y1910100,
  cincome2007 = Y2225400,
  cincome2009 = Y2573900,
  cincome2011 = Y2921600,
  cincome2013 = Y3291400,
  cincome2015 = Y3631700,
  cincome2017 = Y4240000,
  cincome2019 = Y4561200
)

cincome_vars <- c(
  "cincome1993", "cincome1995", "cincome1997", "cincome1999", 
  "cincome2001", "cincome2003", "cincome2005", "cincome2007",
  "cincome2009", "cincome2011", "cincome2013", "cincome2015",
  "cincome2017", "cincome2019")


 cnlsy <- cnlsy %>% rename(
       chighest_grade2012 = C5718500)


cnlsy <- cnlsy %>% rename(
   
             cmath1988 = C0799600,
             creadr1988 = C0799900,
             creadc1988 = C0800200,
             cmath1990 = C0999100,
             creadr1990 = C0999400,
             creadc1990 = C0999700,
             cmath1992 = C1198800,
             creadr1992 = C1199100,
             creadc1992 = C1199400,
             cmath1994 = C1507800,
             creadr1994 = C1508100,
             creadc1994 = C1508400,
             cmath1996 = C1564700,
             creadr1996 = C1565000,
             creadc1996 = C1565300,
             cmath1998 = C1800100,
             creadr1998 = C1800400,
             creadc1998 = C1800700,
             cmath2000 = C2503700,
             creadr2000 = C2503900,
             creadc2000 = C2504300,
             cmath2002 = C2532200,
             creadr2002 = C2532400,
             creadc2002 = C2532800,
             cmath2004 = C2803000,
             creadr2004 = C2803200,
             creadc2004 = C2803600
)

filtered dataset to females born between 1978 and 1990

cnlsy <- cnlsy %>%
       filter(cbirth_yr >= 1978, cbirth_yr <= 1990)
cnlsy <- cnlsy %>% filter (csex ==2)

#data cleaning

removed respondents with no valid birth month or income in any wave; defined missing/invalid data as NA

cnlsy <- cnlsy %>% filter(cbirth_mo >= 0)

cnlsy <- cnlsy %>% 
       filter(if_any(all_of(cincome_vars), ~ .x >= 0))

cnlsy <- cnlsy %>%
  mutate(
    across(all_of(cincome_vars),  ~ if_else(.x < 0, NA_real_, .x)))

#creating and standardizing PIAT composite

PIAT = (math + reading r + reading c)/3 Normed by age and with whites as the reference

piat_years <- c(1988,1990,1992,1994,1996,1998,2000,2002,2004)


for (yr in piat_years) {
  cnlsy[[paste0("cage_", yr)]] <- yr - cnlsy$cbirth_yr 
                                        - as.integer(cnlsy$cbirth_mo > 6)
}

for (yr in piat_years) {
  m <- paste0("cmath",  yr)
  r <- paste0("creadr", yr)
  c <- paste0("creadc", yr)
  comp <- paste0("piat_", yr)
}

piat_vars <- grep("^c(math|readr|readc)[0-9]{4}$", names(cnlsy), value = TRUE)

cnlsy <- cnlsy %>%
  mutate(across(all_of(piat_vars), ~ if_else(.x <0, NA_real_, as.numeric(.x))))

for (yr in piat_years) {
  m   <- paste0("cmath",  yr)
  r   <- paste0("creadr", yr)
  c   <- paste0("creadc", yr)
  out <- paste0("piat",   yr)
  
  if (!all(c(m, r, c) %in% names(cnlsy))) next
  
  cnlsy[[out]] <- ifelse(
    !is.na(cnlsy[[m]]) & !is.na(cnlsy[[r]]) & !is.na(cnlsy[[c]]),
    (cnlsy[[m]] + cnlsy[[r]] + cnlsy[[c]]) / 3,
    NA_real_
  )
}
rm(yr, m, r, c, out)

make_piat_at_age <- function(df, target_age, years) {
  out <- rep(NA_real_, nrow(df))
  for (yr in years) {
    piat_col <- paste0("piat", yr)
    age_col  <- paste0("cage_", yr)
    if (!all(c(piat_col, age_col) %in% names(df))) next
    idx <- df[[age_col]] == target_age
    out[idx] <- df[[piat_col]][idx]
  }
  out
}

for (a in 8:14) {
  cnlsy[[paste0("piat_age_", a)]] <- make_piat_at_age(cnlsy, a, piat_years)
}

cnlsy <- cnlsy %>%
  mutate(piat_age_14z = kirkegaard::standardize(
    piat_age_14, focal_group = white))
    
    ```

#Income standardization
note: standardized without removing missing age 14 PIAT

```r
income_years <- as.integer(sub("^cincome", "", cincome_vars))

for (yr in income_years) {
  cnlsy[[paste0("cage_", yr)]] <- yr - cnlsy$cbirth_yr -
    as.integer(cnlsy$cbirth_mo > 6)
}
rm(yr)

make_income_age <- function(df, target_age, vars, prefix, age_prefix = "cage_") {
  years <- as.integer(sub(paste0("^", prefix), "", vars))
  out <- rep(NA_real_, nrow(df))

  for (i in seq_along(vars)) {
    yr <- years[i]
    age_col <- paste0(age_prefix, yr)
    if (!age_col %in% names(df)) next
    idx <- df[[age_col]] == target_age
    out[idx] <- df[[vars[i]]][idx]
  }
  out
}

for (a in 18:41) {
  cnlsy[[paste0("cincome_age_", a)]] <- make_income_age(
    cnlsy, a, cincome_vars, "cincome", age_prefix = "cage_"
  )
}
rm(a)

cincome_age_vars <- grep("^cincome_age_[0-9]+$", names(cnlsy), value = TRUE)

cnlsy <- cnlsy %>%
  mutate(
    across(all_of(cincome_age_vars), ~ log1p(.x), .names = "{.col}_log1p"),
    across(ends_with("_log1p"), ~ kirkegaard::standardize(.x), .names = "{.col}_z")
  )

#final cleaning and merge

cnlsy <- cnlsy %>% filter(
  !is.na(piat_age_14z)
)

nlsy_one <- nlsy %>%
       select(m_id, afqt_z, matches("^income_age_[0-9]+log1p_z$")) %>%
       group_by(m_id) %>%
       summarise(
             mother_iq_z = first(afqt_z),
             across(matches("^income_age_[0-9]+log1p_z$"), first),
             .groups = "drop"
         )

merged <- cnlsy %>%
  left_join(nlsy_one, by = "m_id")

merged <- cnlsy %>%
       left_join(
             nlsy %>% select(m_id, afqt_z, starts_with("income_age_")),
             by = "m_id"
         )
   

#renaming important variables

 merged <- merged %>%
       rename_with(
             ~ sub("^income_age_([0-9]+)_log1p_z$", "mother_income_\\1_z", .x),
             matches("^income_age_[0-9]+_log1p_z$")
         )

  merged <- merged %>%
   rename_with(
     ~ sub("^cincome_age_([0-9]+)_log1p_z$", "daughter_income_\\1_z", .x),
     matches("^cincome_age_[0-9]+_log1p_z$")
   )
  
  merged <- merged %>% rename(
    mother_iq_z = afqt_z,
    daughter_iq_z = piat_age_14z
  )
 

#results

Mother-Daughter IQ correlation:

cor_iq <- cor(
  merged$mother_iq_z,
  merged$daughter_iq_z,
  use = "complete.obs"
)

cor_iq

r=0.53

Mother-Daughter income correlations:

name_map <- tibble(var = names(merged)) %>%
  mutate(
    who = case_when(
      grepl("mother_income", var)   ~ "mother",
      grepl("daughter_income", var) ~ "daughter",
      TRUE ~ NA_character_
    ),
age = parse_number(var) 
  ) %>%
  filter(!is.na(who), !is.na(age))

name_map %>% count(who)          
name_map %>% group_by(who) %>% 
                summarise(min_age=min(age), max_age=max(age), n=n())

common_ages <- intersect(
  name_map %>% filter(who=="mother") %>% pull(age),
  name_map %>% filter(who=="daughter") %>% pull(age)
) %>% sort()

income_corrs <- lapply(common_ages, function(a) {
mvar <- name_map %>% filter(who=="mother",  age==a) %>% pull(var) %>% first()
dvar <- name_map %>% filter(who=="daughter", age==a) %>% pull(var) %>% first()
  
 ok <- complete.cases(merged[[mvar]], merged[[dvar]])
  tibble(
    age = a,
    r = if (sum(ok) >= 3) cor(merged[[mvar]][ok], 
                                    merged[[dvar]][ok]) else NA_real_,
    n = sum(ok),
    mother_var = mvar,
    daughter_var = dvar
  )
}) %>% bind_rows()

income_corrs

range(income_corrs$r, na.rm = TRUE)

correlations range from -0.03 to 0.40, with mean of 0.14

Mother-Daughter differences in IQ and income:

merged <- merged %>%
  mutate(delta_iq = daughter_iq_z - mother_iq_z)
delta_corrs <- data.frame(
  age = common_ages,
  r = NA_real_,
  n = NA_integer_
)

for (i in seq_along(common_ages)) {
  a <- common_ages[i]
  
  mv <- paste0("mother_income_", a, "_z")
  dv <- paste0("daughter_income_", a, "_z")
  
  tmp <- merged %>%
    transmute(
      delta_inc = .data[[dv]] - .data[[mv]],
      delta_iq  = delta_iq
    )
  
  ok <- complete.cases(tmp)
  
  delta_corrs$r[i] <- cor(tmp$delta_iq[ok], tmp$delta_inc[ok])
  delta_corrs$n[i] <- sum(ok)
}

delta_corrs
income_corrs %>%
  filter(age >= 30, age <= 45) %>%
  summarise(
    min = min(r),
    max = max(r),
    mean = mean(r)
  )

range(delta_corrs$r, na.rm = TRUE)
summary(delta_corrs$r)

correlations range from 0.04 to 0.48, with a mean of 0.19

#Additional Analyses

prime-age (restricted to ages 30-45)

 income_corrs %>%
   filter(age >= 30, age <= 45, !is.na(r)) %>%
   summarise(
     min  = min(r),
     max  = max(r),
     mean = mean(r)
   )

mean r=0.13

delta_corrs %>%
  filter(age >= 30, age <= 45, !is.na(r)) %>%
  summarise(
    min  = min(r),
    max  = max(r),
    mean = mean(r)
  )

mean r=0.25

also with n>=100:

income_corrs %>%
  filter(age >= 30, age <= 41, n >= 100, !is.na(r)) %>%
  summarise(
    min  = min(r),
    max  = max(r),
    mean = mean(r)
  )

r=0.11

delta_corrs %>%
  filter(age >= 30, age <= 41, n >= 100, !is.na(r)) %>%
  summarise(
    min  = min(r),
    max  = max(r),
    mean = mean(r)
  )

r=0.22

#5-year smoothed (n-weighted)

income_corrs <- income_corrs %>%
  mutate(
    age_bin = cut(
      age,
      breaks = c(18, 24, 30, 36, 42),
      right = FALSE,
      labels = c("18–23", "24–29", "30–35", "36–41")
    )
  )

income_corrs_5yr <- income_corrs %>%
  filter(!is.na(r), !is.na(age_bin)) %>%
  group_by(age_bin) %>%
  summarise(
    r_weighted = weighted.mean(r, w = n),
    min_r = min(r),
    max_r = max(r),
    n_ages = n(),
    total_n = sum(n),
    .groups = "drop"
  )

income_corrs_5yr

rs ranging from 0.11 to 0.15


delta_corrs <- delta_corrs %>%
  mutate(
    age_bin = cut(
      age,
      breaks = c(18, 24, 30, 36, 42),
      right = FALSE,
      labels = c("18–23", "24–29", "30–35", "36–41")
    )
  )

delta_corrs_5yr <- delta_corrs %>%
  filter(!is.na(r), !is.na(age_bin)) %>%
  group_by(age_bin) %>%
  summarise(
    r_weighted = weighted.mean(r, w = n),
    min_r = min(r),
    max_r = max(r),
    n_ages = n(),
    total_n = sum(n),
    .groups = "drop"
  )

delta_corrs_5yr

rs of 0.12 to 0.23.

all years together:

 income_corrs %>%
   filter(!is.na(r)) %>%
   summarise(
     r_weighted = weighted.mean(r, w = n),
     n_ages = n(),
     total_n = sum(n)
   )
 
   ```
 
 r=0.13

 ```r
 delta_all <- delta_corrs %>%
   filter(!is.na(r)) %>%
   summarise(
     r_weighted = weighted.mean(r, w = n),
     n_ages = n(),
     total_n = sum(n)
   )
 
 delta_all

r=0.15

#all prime years

prime_income_corr <- income_corrs %>%
  filter(age >= 30, age <= 41, !is.na(r)) %>%
  summarise(
    r_weighted = weighted.mean(r, w = n),
    min_r = min(r),
    max_r = max(r),
    n_ages = n(),
    total_n = sum(n)
  )

prime_income_corr

r=0.12


  delta_prime <- delta_corrs %>%
    filter(age >= 30, age <= 45, !is.na(r)) %>%
    summarise(
      r_weighted = weighted.mean(r, w = n),
      n_ages = n(),
      total_n = sum(n)
    )
  
  delta_prime
 

r=0.22

#summary statistics

m_map <- name_map %>%
  filter(who == "mother") %>%
  distinct(age, var) %>%
  arrange(age)

mother_iq_income_corrs <- data.frame(age = m_map$age, r = 
                                  NA_real_, n = NA_integer_, var = m_map$var)

for (i in seq_len(nrow(m_map))) {
  v <- m_map$var[i]
  ok <- complete.cases(merged$mother_iq_z, merged[[v]])
  mother_iq_income_corrs$r[i] <- cor(merged$mother_iq_z[ok], merged[[v]][ok])
  mother_iq_income_corrs$n[i] <- sum(ok)
}

mother_iq_income_corrs

Weighted average across ages:

weighted.mean(mother_iq_income_corrs$r,
                                 w = mother_iq_income_corrs$n, na.rm = TRUE)
  ```

weighted mean mother IQ-income r is 0.29

```r

prime <- mother_iq_income_corrs$age >= 30 & mother_iq_income_corrs$age <= 41
weighted.mean(mother_iq_income_corrs$r[prime], 
                          w = mother_iq_income_corrs$n[prime], na.rm = TRUE)

when restricted to ages 30-41, r=0.22

d_map <- name_map %>%
  filter(who == "daughter") %>%
  distinct(age, var) %>%
  arrange(age)

daughter_iq_income_corrs <- data.frame(
  age = d_map$age, r = NA_real_, n = NA_integer_, var = d_map$var)

for (i in seq_len(nrow(d_map))) {
  v <- d_map$var[i]
ok <- complete.cases(merged$daughter_iq_z, merged[[v]])
daughter_iq_income_corrs$r[i] <- cor(merged$daughter_iq_z[ok], merged[[v]][ok])
  daughter_iq_income_corrs$n[i] <- sum(ok)
}

daughter_iq_income_corrs

Weighted average across ages:

weighted.mean(daughter_iq_income_corrs$r, 
              w = daughter_iq_income_corrs$n, na.rm = TRUE)

#weighted r for daughter IQ-income =0.24

prime <- daughter_iq_income_corrs$age >= 30 & daughter_iq_income_corrs$age <= 45
weighted.mean(daughter_iq_income_corrs$r[prime], 
              w = daughter_iq_income_corrs$n[prime], na.rm = TRUE)

when restricted to ages 30-41, r=0.18