This is the R notebook for:
options(
digits = 2
)
library(pacman)
p_load(kirkegaard,
readxl,
rms,
osfr,
olsrr,
quantreg,
cowplot
)
theme_set(theme_bw())
#get model R2 adj
get_r2adj = function(x) {
x %>% summary.lm() %>% .$adj.r.squared
}
#boomer code
#https://rdrr.io/cran/rms/src/R/ols.s#sym-print.ols
get_p = function(x) {
1 - pchisq(x$stats['Model L.R.'], x$stats['d.f.'])
}
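A quick sketch of how these two helpers are used (toy data on a built-in dataset, wrapped in if (F) so it does not run as part of the notebook):
if (F) {
  #toy fit, just to illustrate the helpers
  toy_fit = ols(Sepal.Length ~ Sepal.Width, data = iris)
  get_r2adj(toy_fit) #adjusted R2 via summary.lm()
  get_p(toy_fit) #overall model p value from the likelihood ratio chi-square
}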
#add gains
add_gains = function(x) {
x %>%
mutate(
mom_gain = mom_nonlinear_r2_adj - mom_linear_r2_adj,
dad_gain = dad_nonlinear_r2_adj - dad_linear_r2_adj
)
}
#fit models function
fit_model_comparisons = function(v, data = d) {
#make formulas
#predict mother
mom_linear_f = str_glue("{v}_mother ~ {v}_father") %>% as.formula()
mom_nonlinear_f = str_glue("{v}_mother ~ rcs({v}_father)") %>% as.formula()
dad_linear_f = str_glue("{v}_father ~ {v}_mother") %>% as.formula()
dad_nonlinear_f = str_glue("{v}_father ~ rcs({v}_mother)") %>% as.formula()
#fit models
mom_linear_fit = ols(mom_linear_f, data = data)
mom_nonlinear_fit = ols(mom_nonlinear_f, data = data)
dad_linear_fit = ols(dad_linear_f, data = data)
dad_nonlinear_fit = ols(dad_nonlinear_f, data = data)
#get results
tibble(
trait = v,
mom_linear_r2_adj = mom_linear_fit %>% get_r2adj(),
mom_nonlinear_r2_adj = mom_nonlinear_fit %>% get_r2adj(),
dad_linear_r2_adj = dad_linear_fit %>% get_r2adj(),
dad_nonlinear_r2_adj = dad_nonlinear_fit %>% get_r2adj()
)
}
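Example call (a sketch, not run here; the trait names and the default data = d are defined further below):
if (F) {
  #one row of adjusted R2 values for the four models for a single trait
  fit_model_comparisons("Intel")
}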
#function to perform non-random mating
#done by sorting the two columns based on a noisy version of one of them
#effectively results in imperfect sorting for the non-noisy version
non_random_pair = function(pair_phenotypes, noise = 1) {
pair_phenotypes %<>%
set_colnames(c("p1", "p2")) %>%
mutate(
#add initial numbers, so we can keep track of original positions
p1n = 1:n(),
p2n = 1:n()
) %>%
#sort by p2's true score
arrange(p2)
#subset the p1 part, and sort by noisy phenotype to produce imperfect true sorting
p1 = pair_phenotypes[c("p1n", "p1")] %>%
mutate(
#standardize and add noise; noise gives the SD of the added normal noise
noisy_p1 = standardize(p1) + rnorm(nrow(pair_phenotypes), sd = noise)
) %>%
#sort by noisy
arrange(noisy_p1)
#now we join the sorted p2 with the sorted p1, and these will be our semi-matched pairs
chosen_pairs = cbind(
p1$p1n,
#p2n indexes, already sorted by p2's true score
pair_phenotypes$p2n
)
chosen_pairs
}
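A minimal sanity check of the pairing function (toy data, wrapped in if (F)): matching two independent standard normal traits through the noisy sort should induce a cross-partner correlation of roughly .7 with noise = 1.
if (F) {
  set.seed(1)
  toy = tibble(p1 = rnorm(5000), p2 = rnorm(5000))
  toy_pairs = non_random_pair(toy, noise = 1)
  #correlation between the matched partners' true scores
  cor(toy$p1[toy_pairs[, 1]], toy$p2[toy_pairs[, 2]])
}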
#make function to smooth quantile
quantile_smooth = function(x, y, quantile, method = c("qgam", "Rq", "running"), k = 5) {
#resolve method choice
method = match.arg(method)
if (method == "qgam") {
#same as in example here
#https://rdrr.io/cran/qgam/man/qgam.html
fit = qgam::qgam(list(y ~ s(x, k = k, bs = "cr"), ~ s(x, k = k, bs = "cr")),
data = data.frame(x = x, y = y), qu = quantile)
return(predict(fit))
} else if (method == "Rq") {
#fit with spline
fit = rms::Rq(y ~ rcs(x), data = data.frame(x = x, y = y), tau = quantile)
return(fitted(fit) %>% as.vector())
} else if (method == "running") {
#beware, need to sort y by x first,
#then map the smoothed values back to the original order
x_order = order(x)
return(caTools::runquantile(y[x_order], k = k, probs = quantile)[order(x_order)])
} else {
stop(str_glue("method not recognized: {method}"), call. = F)
}
}
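Usage sketch (simulated data, wrapped in if (F)): with noise whose SD grows in |x|, the smoothed .1 and .9 quantile curves should fan out.
if (F) {
  set.seed(1)
  toy_x = rnorm(500)
  toy_y = toy_x + rnorm(500, sd = 1 + abs(toy_x)) #heteroscedastic noise
  q10 = quantile_smooth(toy_x, toy_y, quantile = .1, method = "Rq")
  q90 = quantile_smooth(toy_x, toy_y, quantile = .9, method = "Rq")
  plot(toy_x, q90 - q10) #band width should increase with |x|
}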
#loop convenience function
add_quantile_smooths = function(x, y, data, quantiles, method = "qgam", k = 30) {
#combinations of x variables and quantiles to add
todo = expand_grid(
x = x,
quantile = quantiles
)
#loop rows and add
for (i in seq_along_rows(todo)) {
i_var = todo$x[i]
i_quantile = todo$quantile[i]
#make var
data[[str_glue("{y}_q{i_quantile*100}")]] = NA_real_
#get values
i_vals = quantile_smooth(y = data[[y]],
x = data[[i_var]],
quantile = i_quantile,
method = method,
k = k)
#beware, functions differ in dealing with NA!
if (method == "qgam") {
data[[str_glue("{y}_q{i_quantile*100}")]][as.numeric(names(i_vals))] = i_vals
} else {
data[[str_glue("{y}_q{i_quantile*100}")]] = i_vals
}
}
data
}
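Usage sketch (assuming a data frame dd with numeric columns father and mother; not run here):
if (F) {
  dd_aug = add_quantile_smooths(x = "father", y = "mother", data = dd,
                                quantiles = c(.1, .9), method = "Rq", k = 5)
  #dd_aug gains the columns mother_q10 and mother_q90
}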
#function to find heteroscedasticity
#https://rpubs.com/EmilOWK/heteroscedasticity
test_HS = function(x, resid) {
#data
d = tibble(
resid = standardize(resid^2),
x = standardize(x),
resid_rank = rank(resid),
x_rank = rank(x)
)
#fits
mods = list(
fit_linear = ols(resid ~ x, data = d),
fit_spline = ols(resid ~ rcs(x), data = d),
fit_linear_rank = ols(resid_rank ~ x_rank, data = d),
fit_spline_rank = ols(resid_rank ~ rcs(x_rank), data = d)
)
#summarize
y = tibble(
test = c("linear raw", "spline raw", "linear rank", "spline rank"),
r2adj = map_dbl(mods, get_r2adj),
p = map_dbl(mods, get_p),
fit = mods
)
#for the rcs, we have to compute the model improvement vs. linear
#https://stackoverflow.com/questions/47937065/how-can-i-interpret-the-p-value-of-nonlinear-term-under-rms-anova
#https://stats.stackexchange.com/questions/405961/is-there-formal-test-of-non-linearity-in-linear-regression
y$p[2] = lrtest(mods$fit_linear, mods$fit_spline)$stats["P"]
y$p[4] = lrtest(mods$fit_linear_rank, mods$fit_spline_rank)$stats["P"]
y %>% mutate(log10_p = -log10(p))
}
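Sanity check sketch (simulated data, wrapped in if (F)): residuals from a fit with built-in heteroscedasticity should yield small p values across these tests.
if (F) {
  set.seed(1)
  hs_d = tibble(x = rnorm(500))
  hs_d$y = hs_d$x + rnorm(500, sd = 1 + abs(hs_d$x))
  hs_fit = ols(y ~ x, data = hs_d)
  test_HS(x = hs_d$x, resid = resid(hs_fit))
}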
#read
d = readxl::read_excel("data/DataSet_AM_4Emil.xls", na = c("", "#NULL!")) %>%
#remove code variable
select(
-code
) %>%
#reorder
select(sort(colnames(.))) %>%
#rename child
{
colnames(.) = colnames(.) %>%
str_replace("son_daughter", "child") %>%
str_replace("INTEL", "Intel") %>%
str_replace("age", "Age")
.
}
#education as ordinal
d$ed_father %<>% ordered()
d$ed_mother %<>% ordered()
d$ed_child %<>% ordered()
#sex as factor
d$sex_child %<>% factor()
#continuous traits
cont_traits = c(
"Age", "N", "E", "P", "Lies", "Intel"
)
#one extreme outlier (a very high P_mother score) is removed later, see d_rm_outlier below
#variables
d_variables = df_var_table(d)
#print
d_variables
#describe each variable
d %>% describe() %>% as.matrix()
## vars n mean sd median trimmed mad min max range skew
## Age_father 1 333 51.1 6.17 51 50.7 5.9 34 77 43 0.748
## Age_mother 2 337 48.4 5.45 48 48.1 5.9 36 71 35 0.601
## Age_child 3 339 20.3 2.41 20 20.0 1.5 17 37 20 2.360
## E_father 4 342 10.8 4.53 11 10.9 4.4 0 21 21 -0.158
## E_mother 5 342 11.4 3.70 12 11.6 3.0 2 20 18 -0.295
## E_child 6 342 13.9 3.98 14 14.2 4.4 1 21 20 -0.826
## ed_father* 7 333 2.2 1.16 2 2.1 1.5 1 4 3 0.443
## ed_mother* 8 337 1.9 1.11 2 1.8 1.5 1 4 3 0.831
## ed_child* 9 342 2.0 0.12 2 2.0 0.0 1 2 1 -8.052
## Intel_father 10 342 19.6 7.71 20 19.8 8.2 0 42 42 -0.216
## Intel_mother 11 342 18.5 7.37 19 18.6 7.4 0 37 37 -0.050
## Intel_child 12 342 25.4 5.74 26 25.5 5.2 0 42 42 -0.434
## Lies_father 13 342 12.9 3.98 13 13.1 4.4 0 20 20 -0.554
## Lies_mother 14 342 14.3 3.25 15 14.5 3.0 3 20 17 -0.586
## Lies_child 15 342 10.2 3.65 10 10.2 3.0 1 19 18 0.021
## N_father 16 342 9.0 4.85 8 8.8 4.4 0 24 24 0.481
## N_mother 17 342 11.1 5.05 11 11.0 5.9 0 23 23 0.224
## N_child 18 342 11.8 4.85 12 11.7 5.9 1 24 23 0.156
## P_father 19 342 6.0 2.99 6 5.8 3.0 0 21 21 1.003
## P_mother 20 342 5.9 2.75 6 5.7 3.0 0 20 20 0.769
## P_child 21 342 6.8 3.51 6 6.6 3.0 0 20 20 0.771
## sex_child* 22 339 1.8 0.42 2 1.8 0.0 1 2 1 -1.277
## kurtosis se
## Age_father 1.089 0.3382
## Age_mother 0.826 0.2967
## Age_child 11.214 0.1309
## E_father -0.538 0.2447
## E_mother -0.376 0.2000
## E_child 0.403 0.2152
## ed_father* -1.307 0.0636
## ed_mother* -0.735 0.0603
## ed_child* 63.027 0.0065
## Intel_father -0.141 0.4167
## Intel_mother -0.379 0.3983
## Intel_child 1.359 0.3103
## Lies_father 0.233 0.2151
## Lies_mother 0.014 0.1757
## Lies_child -0.299 0.1974
## N_father -0.130 0.2623
## N_mother -0.526 0.2730
## N_child -0.574 0.2623
## P_father 2.604 0.1616
## P_mother 1.586 0.1488
## P_child 0.474 0.1898
## sex_child* -0.371 0.0229
#pairwise cors for couples
d %>%
select(-contains("_child")) %>%
wtd.cors()
## Age_father Age_mother E_father E_mother ed_father ed_mother
## Age_father 1.0000 0.8142 -0.09101 0.0372 0.0035 0.0016
## Age_mother 0.8142 1.0000 -0.10612 -0.0015 0.0418 -0.0251
## E_father -0.0910 -0.1061 1.00000 -0.0070 -0.0119 0.0779
## E_mother 0.0372 -0.0015 -0.00703 1.0000 0.0521 0.0519
## ed_father 0.0035 0.0418 -0.01193 0.0521 1.0000 0.5251
## ed_mother 0.0016 -0.0251 0.07787 0.0519 0.5251 1.0000
## Intel_father -0.1426 -0.1410 0.06052 0.0892 0.2625 0.1914
## Intel_mother -0.0338 -0.0644 -0.00328 -0.0170 0.1396 0.1619
## Lies_father 0.1851 0.1386 0.00077 0.0473 -0.0674 -0.0320
## Lies_mother 0.1325 0.1661 -0.02303 -0.1133 -0.1654 -0.1786
## N_father -0.0726 -0.0416 0.02635 -0.0638 -0.1088 -0.0988
## N_mother -0.0172 -0.0504 0.05906 -0.1014 -0.1737 -0.1700
## P_father -0.0329 -0.0397 0.08069 -0.0426 0.0799 0.0867
## P_mother -0.1114 -0.0963 -0.00543 0.1120 0.1589 0.1467
## Intel_father Intel_mother Lies_father Lies_mother N_father
## Age_father -0.143 -0.0338 0.18509 0.132 -0.073
## Age_mother -0.141 -0.0644 0.13856 0.166 -0.042
## E_father 0.061 -0.0033 0.00077 -0.023 0.026
## E_mother 0.089 -0.0170 0.04728 -0.113 -0.064
## ed_father 0.263 0.1396 -0.06744 -0.165 -0.109
## ed_mother 0.191 0.1619 -0.03202 -0.179 -0.099
## Intel_father 1.000 0.4917 0.10227 -0.065 -0.013
## Intel_mother 0.492 1.0000 0.00862 0.019 -0.065
## Lies_father 0.102 0.0086 1.00000 0.274 -0.146
## Lies_mother -0.065 0.0195 0.27375 1.000 -0.016
## N_father -0.013 -0.0651 -0.14580 -0.016 1.000
## N_mother -0.048 -0.0833 -0.03934 -0.259 0.068
## P_father 0.045 -0.0070 -0.16844 -0.096 0.190
## P_mother -0.044 -0.0540 -0.18938 -0.406 -0.011
## N_mother P_father P_mother
## Age_father -0.0172 -0.0329 -0.1114
## Age_mother -0.0504 -0.0397 -0.0963
## E_father 0.0591 0.0807 -0.0054
## E_mother -0.1014 -0.0426 0.1120
## ed_father -0.1737 0.0799 0.1589
## ed_mother -0.1700 0.0867 0.1467
## Intel_father -0.0477 0.0446 -0.0444
## Intel_mother -0.0833 -0.0070 -0.0540
## Lies_father -0.0393 -0.1684 -0.1894
## Lies_mother -0.2593 -0.0955 -0.4055
## N_father 0.0681 0.1905 -0.0106
## N_mother 1.0000 -0.0099 0.0892
## P_father -0.0099 1.0000 0.1983
## P_mother 0.0892 0.1983 1.0000
#pairwise N's
d %>%
select(-contains("_child")) %>%
pairwiseCount()
## Age_father Age_mother E_father E_mother ed_father ed_mother
## Age_father 333 331 333 333 333 330
## Age_mother 331 337 337 337 331 335
## E_father 333 337 342 342 333 337
## E_mother 333 337 342 342 333 337
## ed_father 333 331 333 333 333 330
## ed_mother 330 335 337 337 330 337
## Intel_father 333 337 342 342 333 337
## Intel_mother 333 337 342 342 333 337
## Lies_father 333 337 342 342 333 337
## Lies_mother 333 337 342 342 333 337
## N_father 333 337 342 342 333 337
## N_mother 333 337 342 342 333 337
## P_father 333 337 342 342 333 337
## P_mother 333 337 342 342 333 337
## Intel_father Intel_mother Lies_father Lies_mother N_father
## Age_father 333 333 333 333 333
## Age_mother 337 337 337 337 337
## E_father 342 342 342 342 342
## E_mother 342 342 342 342 342
## ed_father 333 333 333 333 333
## ed_mother 337 337 337 337 337
## Intel_father 342 342 342 342 342
## Intel_mother 342 342 342 342 342
## Lies_father 342 342 342 342 342
## Lies_mother 342 342 342 342 342
## N_father 342 342 342 342 342
## N_mother 342 342 342 342 342
## P_father 342 342 342 342 342
## P_mother 342 342 342 342 342
## N_mother P_father P_mother
## Age_father 333 333 333
## Age_mother 337 337 337
## E_father 342 342 342
## E_mother 342 342 342
## ed_father 333 333 333
## ed_mother 337 337 337
## Intel_father 342 342 342
## Intel_mother 342 342 342
## Lies_father 342 342 342
## Lies_mother 342 342 342
## N_father 342 342 342
## N_mother 342 342 342
## P_father 342 342 342
## P_mother 342 342 342
#p values for those who want that
d %>%
select(-contains("_child")) %>%
map_df(as.numeric) %>%
cor_matrix(p_val = T)
## Age_father Age_mother E_father
## Age_father "1" "0.81 [p=1.08e-79]" "-0.09 [p=0.0973]"
## Age_mother "0.81 [p=1.08e-79]" "1" "-0.11 [p=0.0516]"
## E_father "-0.09 [p=0.0973]" "-0.11 [p=0.0516]" "1"
## E_mother "0.04 [p=0.499]" "0.00 [p=0.978]" "-0.01 [p=0.897]"
## ed_father "0.00 [p=0.950]" "0.04 [p=0.448]" "-0.01 [p=0.828]"
## ed_mother "0.00 [p=0.977]" "-0.03 [p=0.647]" "0.08 [p=0.154]"
## Intel_father "-0.14 [p=0.00917]" "-0.14 [p=0.00956]" "0.06 [p=0.264]"
## Intel_mother "-0.03 [p=0.539]" "-0.06 [p=0.238]" "0.00 [p=0.952]"
## Lies_father "0.19 [p=0.000688]" "0.14 [p=0.0109]" "0.00 [p=0.989]"
## Lies_mother "0.13 [p=0.0156]" "0.17 [p=0.00223]" "-0.02 [p=0.671]"
## N_father "-0.07 [p=0.186]" "-0.04 [p=0.446]" "0.03 [p=0.627]"
## N_mother "-0.02 [p=0.755]" "-0.05 [p=0.357]" "0.06 [p=0.276]"
## P_father "-0.03 [p=0.550]" "-0.04 [p=0.468]" "0.08 [p=0.136]"
## P_mother "-0.11 [p=0.0422]" "-0.10 [p=0.0776]" "-0.01 [p=0.920]"
## E_mother ed_father ed_mother
## Age_father "0.04 [p=0.499]" "0.00 [p=0.950]" "0.00 [p=0.977]"
## Age_mother "0.00 [p=0.978]" "0.04 [p=0.448]" "-0.03 [p=0.647]"
## E_father "-0.01 [p=0.897]" "-0.01 [p=0.828]" "0.08 [p=0.154]"
## E_mother "1" "0.05 [p=0.344]" "0.05 [p=0.342]"
## ed_father "0.05 [p=0.344]" "1" "0.53 [p=8.82e-25]"
## ed_mother "0.05 [p=0.342]" "0.53 [p=8.82e-25]" "1"
## Intel_father "0.09 [p=0.0997]" "0.26 [p=1.19e-06]" "0.19 [p=0.000409]"
## Intel_mother "-0.02 [p=0.755]" "0.14 [p=0.0108]" "0.16 [p=0.00287]"
## Lies_father "0.05 [p=0.383]" "-0.07 [p=0.220]" "-0.03 [p=0.558]"
## Lies_mother "-0.11 [p=0.0363]" "-0.17 [p=0.00247]" "-0.18 [p=0.000993]"
## N_father "-0.06 [p=0.239]" "-0.11 [p=0.0473]" "-0.10 [p=0.0702]"
## N_mother "-0.10 [p=0.0609]" "-0.17 [p=0.00147]" "-0.17 [p=0.00174]"
## P_father "-0.04 [p=0.433]" "0.08 [p=0.146]" "0.09 [p=0.112]"
## P_mother "0.11 [p=0.0384]" "0.16 [p=0.00365]" "0.15 [p=0.00698]"
## Intel_father Intel_mother Lies_father
## Age_father "-0.14 [p=0.00917]" "-0.03 [p=0.539]" "0.19 [p=0.000688]"
## Age_mother "-0.14 [p=0.00956]" "-0.06 [p=0.238]" "0.14 [p=0.0109]"
## E_father "0.06 [p=0.264]" "0.00 [p=0.952]" "0.00 [p=0.989]"
## E_mother "0.09 [p=0.0997]" "-0.02 [p=0.755]" "0.05 [p=0.383]"
## ed_father "0.26 [p=1.19e-06]" "0.14 [p=0.0108]" "-0.07 [p=0.220]"
## ed_mother "0.19 [p=0.000409]" "0.16 [p=0.00287]" "-0.03 [p=0.558]"
## Intel_father "1" "0.49 [p=3.2e-22]" "0.10 [p=0.0589]"
## Intel_mother "0.49 [p=3.2e-22]" "1" "0.01 [p=0.874]"
## Lies_father "0.10 [p=0.0589]" "0.01 [p=0.874]" "1"
## Lies_mother "-0.07 [p=0.227]" "0.02 [p=0.720]" "0.27 [p=2.71e-07]"
## N_father "-0.01 [p=0.817]" "-0.07 [p=0.230]" "-0.15 [p=0.00691]"
## N_mother "-0.05 [p=0.379]" "-0.08 [p=0.124]" "-0.04 [p=0.468]"
## P_father "0.04 [p=0.411]" "-0.01 [p=0.897]" "-0.17 [p=0.00177]"
## P_mother "-0.04 [p=0.413]" "-0.05 [p=0.319]" "-0.19 [p=0.000429]"
## Lies_mother N_father N_mother
## Age_father "0.13 [p=0.0156]" "-0.07 [p=0.186]" "-0.02 [p=0.755]"
## Age_mother "0.17 [p=0.00223]" "-0.04 [p=0.446]" "-0.05 [p=0.357]"
## E_father "-0.02 [p=0.671]" "0.03 [p=0.627]" "0.06 [p=0.276]"
## E_mother "-0.11 [p=0.0363]" "-0.06 [p=0.239]" "-0.10 [p=0.0609]"
## ed_father "-0.17 [p=0.00247]" "-0.11 [p=0.0473]" "-0.17 [p=0.00147]"
## ed_mother "-0.18 [p=0.000993]" "-0.10 [p=0.0702]" "-0.17 [p=0.00174]"
## Intel_father "-0.07 [p=0.227]" "-0.01 [p=0.817]" "-0.05 [p=0.379]"
## Intel_mother "0.02 [p=0.720]" "-0.07 [p=0.230]" "-0.08 [p=0.124]"
## Lies_father "0.27 [p=2.71e-07]" "-0.15 [p=0.00691]" "-0.04 [p=0.468]"
## Lies_mother "1" "-0.02 [p=0.768]" "-0.26 [p=1.17e-06]"
## N_father "-0.02 [p=0.768]" "1" "0.07 [p=0.209]"
## N_mother "-0.26 [p=1.17e-06]" "0.07 [p=0.209]" "1"
## P_father "-0.10 [p=0.0778]" "0.19 [p=0.000397]" "-0.01 [p=0.855]"
## P_mother "-0.41 [p=5.69e-15]" "-0.01 [p=0.845]" "0.09 [p=0.0995]"
## P_father P_mother
## Age_father "-0.03 [p=0.550]" "-0.11 [p=0.0422]"
## Age_mother "-0.04 [p=0.468]" "-0.10 [p=0.0776]"
## E_father "0.08 [p=0.136]" "-0.01 [p=0.920]"
## E_mother "-0.04 [p=0.433]" "0.11 [p=0.0384]"
## ed_father "0.08 [p=0.146]" "0.16 [p=0.00365]"
## ed_mother "0.09 [p=0.112]" "0.15 [p=0.00698]"
## Intel_father "0.04 [p=0.411]" "-0.04 [p=0.413]"
## Intel_mother "-0.01 [p=0.897]" "-0.05 [p=0.319]"
## Lies_father "-0.17 [p=0.00177]" "-0.19 [p=0.000429]"
## Lies_mother "-0.10 [p=0.0778]" "-0.41 [p=5.69e-15]"
## N_father "0.19 [p=0.000397]" "-0.01 [p=0.845]"
## N_mother "-0.01 [p=0.855]" "0.09 [p=0.0995]"
## P_father "1" "0.20 [p=0.000224]"
## P_mother "0.20 [p=0.000224]" "1"
#heat map
d %>%
select(-contains("_child")) %>%
GG_heatmap(reorder_vars = F)
GG_save("figs/parent_heatmap.png")
#if we treat them as latents
d %>%
select(-contains("_child")) %>%
map_df(as.numeric) %>%
mixedCor() ->
latent_cors
#heatmap again
latent_cors$rho %>%
GG_heatmap(color_label = "Correlation", reorder_vars = F)
GG_save("figs/parent_heatmap_latent.png")
We want the cross-partner plots for each of the 6 paired variables.
#attempt a fancy facet wrap approach
#convert to long form
d_cont_long = d %>%
#get continuous traits
select(contains(cont_traits + "_")) %>%
#z score
df_standardize() %>%
#add id
mutate(
id = 1:n()
) %>%
#convert form
gather(key = trait, value = value, -id) %>%
separate(col = trait, into = c("trait", "person"), sep = "_") %>%
spread(key = "person", value = "value")
#plot
d_cont_long %>%
ggplot(aes(mother, father)) +
geom_point() +
geom_smooth() +
facet_wrap("trait")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 11 rows containing non-finite values (stat_smooth).
## Warning: Removed 11 rows containing missing values (geom_point).
GG_save("figs/pairweise_trait_scatter.png")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 11 rows containing non-finite values (stat_smooth).
## Warning: Removed 11 rows containing missing values (geom_point).
We compare the linear association with the restricted cubic spline association in terms of adjusted R2.
#loop over traits
linear_nonlinear_results = map_df(cont_traits, fit_model_comparisons) %>%
#gains
add_gains()
#print
linear_nonlinear_results
#describe
linear_nonlinear_results %>% select(mom_gain, dad_gain) %>% unlist() %>% describe() %>% as.matrix()
## vars n mean sd median trimmed mad min max range skew
## X1 1 12 0.0074 0.017 0.00076 0.0043 0.0076 -0.0066 0.052 0.058 1.4
## kurtosis se
## X1 0.98 0.0049
#filter out the family with the extreme P_mother score
d_rm_outlier = d %>%
filter(P_mother <= 15)
#redo model fits
#loop over traits
linear_nonlinear_rm_outlier_results = map_df(cont_traits, fit_model_comparisons, data = d_rm_outlier) %>%
#gains
add_gains()
#print
linear_nonlinear_rm_outlier_results
#describe
linear_nonlinear_rm_outlier_results %>% select(mom_gain, dad_gain) %>% unlist() %>% describe() %>% as.matrix()
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 12 0.0038 0.011 0.0016 0.0019 0.0059 -0.007 0.033 0.04 1.5 1.3
## se
## X1 0.0032
#correlations without outlier
#if we treat them as latents
d_rm_outlier %>%
select(-contains("_child")) %>%
map_df(as.numeric) %>%
mixedCor() ->
latent_cors_rm_outlier
#heatmap again
latent_cors_rm_outlier$rho %>%
GG_heatmap(color_label = "Correlation", reorder_vars = F)
GG_save("figs/parent_heatmap_latent_rm_outlier.png")
#pairwise scatter
#convert to long form
d_cont_long_rm_outlier = d_rm_outlier %>%
#get continuous traits
select(contains(cont_traits + "_")) %>%
#z score
df_standardize() %>%
#add id
mutate(
id = 1:n()
) %>%
#convert form
gather(key = trait, value = value, -id) %>%
separate(col = trait, into = c("trait", "person"), sep = "_") %>%
spread(key = "person", value = "value")
#plot
d_cont_long_rm_outlier %>%
ggplot(aes(mother, father)) +
geom_point() +
geom_smooth() +
facet_wrap("trait")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 11 rows containing non-finite values (stat_smooth).
## Warning: Removed 11 rows containing missing values (geom_point).
GG_save("figs/pairweise_trait_scatter_rm_outlier.png")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 11 rows containing non-finite values (stat_smooth).
## Warning: Removed 11 rows containing missing values (geom_point).
Are couples who are more closely matched on one trait also more closely matched on others? Or maybe less so?
#calculate the (signed) differences between partners' z scores
#and correlate them
d_dists = map_dfc(c(cont_traits, "ed"), function(v) {
y = tibble(
#calc dist
dist = standardize(as.numeric(d[[str_glue("{v}_mother")]])) - standardize(as.numeric(d[[str_glue("{v}_father")]]))
)
colnames(y) = v
y
})
#plot cors
d_dists %>%
GG_heatmap()
GG_save("figs/cross_trait_dists.png")
Reviewer 1 asked about this. https://cran.r-project.org/web/packages/olsrr/vignettes/heteroskedasticity.html
Simulation results at: https://rpubs.com/EmilOWK/heteroscedasticity
#loop across the 6 traits, test for heteroscedasticity with Bartlett method
map_df(cont_traits, function(v) {
#get data subset
dd = d_cont_long_rm_outlier %>% filter(trait == v)
dd_ranked = dd %>% df_rank()
#do tests
tibble(
trait = v,
raw = ols_test_bartlett(dd, "father", "mother")$pval,
ranked = ols_test_bartlett(dd_ranked, "father", "mother")$pval,
)
})
#custom method
map_df(cont_traits, function(v) {
#get data subset
dd = d_cont_long_rm_outlier %>% filter(trait == v)
#do tests both ways
fit_mother_father = ols(mother ~ father, data = dd)
res_mother_father = test_HS(resid = resid(fit_mother_father), x = dd$father)
fit_father_mother = ols(father ~ mother, data = dd)
res_father_mother = test_HS(resid = resid(fit_father_mother), x = dd$mother)
bind_rows(
res_mother_father %>% mutate(direction = "mother ~ father"),
res_father_mother %>% mutate(direction = "father ~ mother")
) %>% mutate(trait = v)
}) %>%
select(-fit) %>%
tidyr::separate(test, sep = " ", into = c("form", "transformation")) ->
HS_results
#inspect
HS_results
#p values below .05
mean(HS_results$p < .05)
## [1] 0.38
HS_results %>% group_by(trait) %>% dplyr::summarize(frac_05 = mean(p < .05))
## `summarise()` ungrouping output (override with `.groups` argument)
HS_results %>% group_by(trait=="Age") %>% dplyr::summarize(frac_05 = mean(p < .05))
## `summarise()` ungrouping output (override with `.groups` argument)
HS_results %>% group_by(form) %>% dplyr::summarize(frac_05 = mean(p < .05))
## `summarise()` ungrouping output (override with `.groups` argument)
HS_results %>% group_by(direction) %>% dplyr::summarize(frac_05 = mean(p < .05))
## `summarise()` ungrouping output (override with `.groups` argument)
HS_results %>% group_by(transformation) %>% dplyr::summarize(frac_05 = mean(p < .05))
## `summarise()` ungrouping output (override with `.groups` argument)
#predict p values
ols(log10_p ~ direction + form + transformation + trait, data = HS_results)
## Linear Regression Model
##
## ols(formula = log10_p ~ direction + form + transformation + trait,
## data = HS_results)
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 48 LR chi2 18.81 R2 0.324
## sigma1.3366 d.f. 8 R2 adj 0.186
## d.f. 39 Pr(> chi2) 0.0159 g 0.976
##
## Residuals
##
## Min 1Q Median 3Q Max
## -1.9639 -0.6803 -0.1535 0.5211 5.3374
##
##
## Coef S.E. t Pr(>|t|)
## Intercept 1.8797 0.5788 3.25 0.0024
## direction=mother ~ father 0.3638 0.3858 0.94 0.3515
## form=spline 0.3351 0.3858 0.87 0.3904
## transformation=raw 0.3290 0.3858 0.85 0.3991
## trait=E -1.4996 0.6683 -2.24 0.0306
## trait=Intel -0.4853 0.6683 -0.73 0.4721
## trait=Lies -1.9364 0.6683 -2.90 0.0061
## trait=N -1.9894 0.6683 -2.98 0.0050
## trait=P -0.4526 0.6683 -0.68 0.5023
##
#inspect strongest signals
#first augment the data with the smoothed quantiles
d_cont_long_rm_outlier_aug = plyr::ddply(d_cont_long_rm_outlier, "trait", function(dd) {
#add them
dd = add_quantile_smooths(data = dd, x = "father", y = "mother", quantiles = c(.1, .9), method = "Rq", k = 5)
dd = add_quantile_smooths(data = dd, x = "mother", y = "father", quantiles = c(.1, .9), method = "Rq", k = 5)
})
## Warning in quantreg::summary.rq(fit, covariance = TRUE, se = se, hs = hs): 7
## non-positive fis
## Warning in quantreg::summary.rq(fit, covariance = TRUE, se = se, hs = hs): 3
## non-positive fis
## Warning in quantreg::summary.rq(fit, covariance = TRUE, se = se, hs = hs): 7
## non-positive fis
## Warning in quantreg::summary.rq(fit, covariance = TRUE, se = se, hs = hs): 1
## non-positive fis
#inspect the age pattern; the heteroscedasticity is obvious
d_cont_long_rm_outlier_aug %>%
filter(
trait == "Age",
!is.na(father), !is.na(mother)
) %>%
ggplot(aes(father, mother)) +
geom_point() +
geom_smooth(method = lm, se = F) +
geom_ribbon(mapping = aes(
ymin = mother_q10,
ymax = mother_q90
), alpha = .4) ->
HS_age_MF
HS_age_MF
## `geom_smooth()` using formula 'y ~ x'
GG_save("figs/age_mother~father.png")
## `geom_smooth()` using formula 'y ~ x'
#inspect the age pattern, reverse direction
d_cont_long_rm_outlier_aug %>%
filter(
trait == "Age",
!is.na(father), !is.na(mother)
) %>%
ggplot(aes(mother, father)) +
geom_point() +
geom_smooth(method = lm, se = F) +
geom_ribbon(mapping = aes(
ymin = father_q10,
ymax = father_q90
), alpha = .4) ->
HS_age_FM
HS_age_FM
## `geom_smooth()` using formula 'y ~ x'
GG_save("figs/age_father~mother.png")
## `geom_smooth()` using formula 'y ~ x'
#intel
d_cont_long_rm_outlier_aug %>%
filter(
trait == "Intel",
!is.na(father), !is.na(mother)
) %>%
ggplot(aes(father, mother)) +
geom_point() +
geom_smooth(method = lm, se = F) +
geom_ribbon(mapping = aes(
ymin = mother_q10,
ymax = mother_q90
), alpha = .4) ->
HS_intel_MF
HS_intel_MF
## `geom_smooth()` using formula 'y ~ x'
GG_save("figs/intel_mother~father.png")
## `geom_smooth()` using formula 'y ~ x'
#P
d_cont_long_rm_outlier_aug %>%
filter(
trait == "P",
!is.na(father), !is.na(mother)
) %>%
ggplot(aes(mother, father)) +
geom_point() +
geom_smooth(method = lm, se = F) +
geom_ribbon(mapping = aes(
ymin = father_q10,
ymax = father_q90
), alpha = .4) ->
HS_P_FM
HS_P_FM
## `geom_smooth()` using formula 'y ~ x'
GG_save("figs/P_father~mother.png")
## `geom_smooth()` using formula 'y ~ x'
#E
d_cont_long_rm_outlier_aug %>%
filter(
trait == "E",
!is.na(father), !is.na(mother)
) %>%
ggplot(aes(mother, father)) +
geom_point() +
geom_smooth(method = lm, se = F) +
geom_ribbon(mapping = aes(
ymin = father_q10,
ymax = father_q90
), alpha = .4) ->
HS_E_FM
HS_E_FM
## `geom_smooth()` using formula 'y ~ x'
GG_save("figs/E_father~mother.png")
## `geom_smooth()` using formula 'y ~ x'
# combined plot
#https://cran.r-project.org/web/packages/cowplot/vignettes/introduction.html
plot_grid(
HS_age_MF,
HS_intel_MF,
HS_P_FM,
HS_E_FM,
labels = c(" age", "intelligence", "psychoticism", "extroversion"),
vjust = 2
)
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
GG_save("figs/HS_4plot.png")
Do the methods work as expected?
Simulate data with nonlinear patterns and see if we can detect it. Vary the noise and sample size.
#manual example
set.seed(1)
sim_manual_data = tibble(
n = 340,
trait_mother = seq(0, pi, length.out = n),
trait_father = sin(trait_mother) + rnorm(n, sd = .5)
)
#fit models manually
(sim_manual_linear = ols(trait_father ~ trait_mother, data = sim_manual_data))
## Linear Regression Model
##
## ols(formula = trait_father ~ trait_mother, data = sim_manual_data)
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 340 LR chi2 0.00 R2 0.000
## sigma0.5544 d.f. 1 R2 adj -0.003
## d.f. 338 Pr(> chi2) 0.9746 g 0.001
##
## Residuals
##
## Min 1Q Median 3Q Max
## -1.647217 -0.365306 0.003739 0.317401 1.537018
##
##
## Coef S.E. t Pr(>|t|)
## Intercept 0.6602 0.0600 11.00 <0.0001
## trait_mother -0.0010 0.0331 -0.03 0.9747
##
(sim_manual_nonlinear = ols(trait_father ~ rcs(trait_mother), data = sim_manual_data))
## Linear Regression Model
##
## ols(formula = trait_father ~ rcs(trait_mother), data = sim_manual_data)
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 340 LR chi2 99.64 R2 0.254
## sigma0.4810 d.f. 4 R2 adj 0.245
## d.f. 335 Pr(> chi2) 0.0000 g 0.315
##
## Residuals
##
## Min 1Q Median 3Q Max
## -1.45229 -0.29993 -0.02196 0.31171 1.29158
##
##
## Coef S.E. t Pr(>|t|)
## Intercept 0.0878 0.0999 0.88 0.3802
## trait_mother 0.8826 0.2075 4.25 <0.0001
## trait_mother' -1.6219 1.2033 -1.35 0.1786
## trait_mother'' 1.7818 3.6558 0.49 0.6263
## trait_mother''' -0.3374 4.8431 -0.07 0.9445
##
sim_manual_data$linear_predict = predict(sim_manual_linear)
sim_manual_data$nonlinear_predict = predict(sim_manual_nonlinear)
#plot it
sim_manual_data %>%
ggplot(aes(trait_mother, trait_father)) +
geom_point() +
#regression lines
geom_line(
mapping = aes(y = linear_predict),
color = "red",
size = 1.5
) +
geom_line(
mapping = aes(y = nonlinear_predict),
color = "blue",
size = 1.5
)
GG_save("figs/spline_example.png")
#params for simulation
sim_pars = expand_grid(
n = seq(10, 1000, by = 10),
noise_sd = seq(0, 2, by = .5),
replicate = 1:10
)
#loop and get results
set.seed(1)
sim_results = map_df(seq_along_rows(sim_pars), function(i) {
#simulate data
x_data = tibble(
trait_mother = seq(0, pi, length.out = sim_pars$n[i]),
trait_father = sin(trait_mother) + rnorm(n = sim_pars$n[i], mean = 0, sd = sim_pars$noise_sd[i])
)
#call function for results
model_results = fit_model_comparisons("trait", data = x_data) %>%
mutate(
#sim pars
n = sim_pars$n[i],
noise_sd = sim_pars$noise_sd[i]
)
model_results
})
#summarize results
sim_results %>%
#gains
add_gains() %>%
plyr::ddply(c("n", "noise_sd"), function(x) {
x %>%
select(mom_linear_r2_adj:dad_nonlinear_r2_adj, mom_gain, dad_gain) %>%
describe() %>%
as.data.frame() %>%
rownames_to_column(var = "variable") %>% select(-n, -vars, -se)
}) ->
sim_results_summary
#dad gain only
sim_results_summary %>%
filter(
variable == "dad_gain"
) %>%
#plot
ggplot(aes(n, mean, color = ordered(round(noise_sd, 2)))) +
# geom_line() +
geom_ribbon(aes(ymin = min, ymax = max), alpha = .2) +
scale_color_ordinal("Amount of noise (SD)") +
scale_x_log10(breaks = c(10^(1:3), 3*10^(1:3))) +
scale_y_continuous("Mean gain in R2 adj.", breaks = seq(-1, 2, .1)) +
theme_classic()
GG_save("figs/simulation_results.png")
A simple two-trait model.
#simulate some random trait data
sim_n = 10000
set.seed(1)
sim_traits = tibble(
male_attractiveness = rnorm(sim_n),
male_niceness = rnorm(sim_n),
female_attractiveness = rnorm(sim_n),
female_niceness = rnorm(sim_n)
) %>%
mutate(
male_composite = (male_attractiveness + male_niceness) %>% standardize(),
female_composite = (female_attractiveness + female_niceness) %>% standardize()
)
#find mates
sim_mates = non_random_pair(sim_traits %>% select(male_composite, female_composite))
#reordered data
sim_traits_paired = cbind(
sim_traits[sim_mates[, 1], ] %>% select(starts_with("male")),
sim_traits[sim_mates[, 2], ] %>% select(starts_with("female"))
)
#verify correlations
sim_traits_paired %>% wtd.cors()
## male_attractiveness male_niceness male_composite
## male_attractiveness 1.0000 0.0048 0.72
## male_niceness 0.0048 1.0000 0.70
## male_composite 0.7163 0.7012 1.00
## female_attractiveness 0.3589 0.3417 0.49
## female_niceness 0.3433 0.3413 0.48
## female_composite 0.5008 0.4871 0.70
## female_attractiveness female_niceness female_composite
## male_attractiveness 0.359 0.343 0.50
## male_niceness 0.342 0.341 0.49
## male_composite 0.494 0.483 0.70
## female_attractiveness 1.000 -0.017 0.70
## female_niceness -0.017 1.000 0.70
## female_composite 0.704 0.698 1.00
#above some composite score
sim_traits_paired %>% filter(male_composite > 1) %>% wtd.cors()
## male_attractiveness male_niceness male_composite
## male_attractiveness 1.000 -0.661 0.43
## male_niceness -0.661 1.000 0.39
## male_composite 0.430 0.393 1.00
## female_attractiveness 0.153 0.057 0.26
## female_niceness 0.066 0.120 0.23
## female_composite 0.184 0.147 0.40
## female_attractiveness female_niceness female_composite
## male_attractiveness 0.153 0.066 0.18
## male_niceness 0.057 0.120 0.15
## male_composite 0.256 0.226 0.40
## female_attractiveness 1.000 -0.282 0.61
## female_niceness -0.282 1.000 0.59
## female_composite 0.612 0.585 1.00
sim_traits_paired %>% filter(male_composite < 1) %>% wtd.cors()
## male_attractiveness male_niceness male_composite
## male_attractiveness 1.00 -0.22 0.64
## male_niceness -0.22 1.00 0.61
## male_composite 0.64 0.61 1.00
## female_attractiveness 0.26 0.25 0.41
## female_niceness 0.26 0.24 0.40
## female_composite 0.39 0.37 0.61
## female_attractiveness female_niceness female_composite
## male_attractiveness 0.26 0.26 0.39
## male_niceness 0.25 0.24 0.37
## male_composite 0.41 0.40 0.61
## female_attractiveness 1.00 -0.11 0.67
## female_niceness -0.11 1.00 0.66
## female_composite 0.67 0.66 1.00
#plot to visualize the collider bias induced by selecting on the composite
sim_traits_paired %>%
ggplot(aes(male_attractiveness, male_niceness, color = male_composite > 1)) +
geom_point(alpha = .2) +
# geom_smooth() +
geom_smooth(aes(color = NULL, group = male_composite > 1)) +
geom_smooth(aes(color = NULL), color = "black") +
scale_x_continuous("Male attractiveness") +
scale_y_continuous("Male niceness") +
scale_color_discrete("Composite score > 1", labels = c("No", "Yes"))
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
GG_save("figs/composite_score_collider_bias.png")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
#no need to save data, we didn't modify it in any notable way
#versions
write_sessioninfo()
## R version 4.0.2 (2020-06-22)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Linux Mint 19.3
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=de_DE.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=de_DE.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=de_DE.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] cowplot_1.0.0 quantreg_5.55 olsrr_0.5.3
## [4] osfr_0.2.8 rms_5.1-4 SparseM_1.78
## [7] readxl_1.3.1 kirkegaard_2020-09-05 metafor_2.4-0
## [10] Matrix_1.2-18 psych_1.9.12.31 magrittr_1.5
## [13] assertthat_0.2.1 weights_1.0.1 mice_3.9.0
## [16] gdata_2.18.0 Hmisc_4.4-0 Formula_1.2-3
## [19] survival_3.1-12 lattice_0.20-41 forcats_0.5.0
## [22] stringr_1.4.0 dplyr_0.8.99.9003 purrr_0.3.4
## [25] readr_1.3.1 tidyr_1.1.0 tibble_3.0.1
## [28] ggplot2_3.3.2 tidyverse_1.3.0 pacman_0.5.1
##
## loaded via a namespace (and not attached):
## [1] TH.data_1.0-10 colorspace_1.4-1 rio_0.5.16
## [4] ellipsis_0.3.1 htmlTable_2.0.0 base64enc_0.1-3
## [7] fs_1.4.2 rstudioapi_0.11 httpcode_0.3.0
## [10] farver_2.0.3 MatrixModels_0.4-1 fansi_0.4.1
## [13] mvtnorm_1.1-0 lubridate_1.7.9 xml2_1.3.2
## [16] codetools_0.2-16 splines_4.0.2 mnormt_2.0.1
## [19] knitr_1.29 jsonlite_1.7.0 broom_0.5.6
## [22] cluster_2.1.0 dbplyr_1.4.4 png_0.1-7
## [25] compiler_4.0.2 httr_1.4.1 backports_1.1.8
## [28] cli_2.0.2 acepack_1.4.1 htmltools_0.5.0
## [31] tools_4.0.2 gtable_0.3.0 glue_1.4.1
## [34] Rcpp_1.0.4.6 carData_3.0-3 cellranger_1.1.0
## [37] vctrs_0.3.1 crul_0.9.0 nlme_3.1-149
## [40] xfun_0.15 openxlsx_4.1.5 rvest_0.3.5
## [43] lifecycle_0.2.0 gtools_3.8.2 goftest_1.2-2
## [46] polspline_1.1.19 MASS_7.3-52 zoo_1.8-8
## [49] scales_1.1.1 hms_0.5.3 parallel_4.0.2
## [52] sandwich_2.5-1 RColorBrewer_1.1-2 yaml_2.2.1
## [55] curl_4.3 memoise_1.1.0 gridExtra_2.3
## [58] rpart_4.1-15 latticeExtra_0.6-29 stringi_1.4.6
## [61] nortest_1.0-4 checkmate_2.0.0 zip_2.0.4
## [64] rlang_0.4.6 pkgconfig_2.0.3 evaluate_0.14
## [67] labeling_0.3 htmlwidgets_1.5.1 tidyselect_1.1.0
## [70] plyr_1.8.6 R6_2.4.1 generics_0.0.2
## [73] multcomp_1.4-13 DBI_1.1.0 mgcv_1.8-33
## [76] pillar_1.4.4 haven_2.3.1 foreign_0.8-79
## [79] withr_2.2.0 abind_1.4-5 nnet_7.3-14
## [82] modelr_0.1.8 crayon_1.3.4 car_3.0-7
## [85] tmvnsim_1.0-2 rmarkdown_2.3 jpeg_0.1-8.1
## [88] grid_4.0.2 data.table_1.12.8 blob_1.2.1
## [91] reprex_0.3.0 digest_0.6.25 munsell_0.5.0
## [94] viridisLite_0.3.0
if (F) {
#upload to OSF
#use approach in https://rpubs.com/EmilOWK/osfr_demo
#you need to set up a personal token
#good idea is to then place it somewhere safe for reuse
osf_auth(read_lines("~/.config/osf_token"))
#the project we will use
osf_proj = osf_retrieve_node("https://osf.io/qjcp6/")
#upload all files in project
#overwrite existing (versioning)
osf_upload(osf_proj, path = list.files(), conflicts = "overwrite")
}