This is the R notebook for:
options(
digits = 2
)
library(pacman)
p_load(kirkegaard,
readxl,
rms,
osfr,
olsrr,
quantreg,
cowplot
)
theme_set(theme_bw())
#get model R2 adj
get_r2adj = function(x) {
x %>% summary.lm() %>% .$adj.r.squared
}
#boomer code
#https://rdrr.io/cran/rms/src/R/ols.s#sym-print.ols
get_p = function(x) {
1 - pchisq(x$stats['Model L.R.'], x$stats['d.f.'])
}
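A quick sketch of how these two helpers are used (toy data on a built-in dataset, wrapped in if (F) so it does not run as part of the notebook):
if (F) {
  #toy fit, just to illustrate the helpers
  toy_fit = ols(Sepal.Length ~ Sepal.Width, data = iris)
  get_r2adj(toy_fit) #adjusted R2 via summary.lm()
  get_p(toy_fit) #overall model p value from the likelihood ratio chi-square
}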
#add gains
add_gains = function(x) {
x %>%
mutate(
mom_gain = mom_nonlinear_r2_adj - mom_linear_r2_adj,
dad_gain = dad_nonlinear_r2_adj - dad_linear_r2_adj
)
}
#fit models function
fit_model_comparisons = function(v, data = d) {
#make formulas
#predict mother
mom_linear_f = str_glue("{v}_mother ~ {v}_father") %>% as.formula()
mom_nonlinear_f = str_glue("{v}_mother ~ rcs({v}_father)") %>% as.formula()
dad_linear_f = str_glue("{v}_father ~ {v}_mother") %>% as.formula()
dad_nonlinear_f = str_glue("{v}_father ~ rcs({v}_mother)") %>% as.formula()
#fit models
mom_linear_fit = ols(mom_linear_f, data = data)
mom_nonlinear_fit = ols(mom_nonlinear_f, data = data)
dad_linear_fit = ols(dad_linear_f, data = data)
dad_nonlinear_fit = ols(dad_nonlinear_f, data = data)
#get results
tibble(
trait = v,
mom_linear_r2_adj = mom_linear_fit %>% get_r2adj(),
mom_nonlinear_r2_adj = mom_nonlinear_fit %>% get_r2adj(),
dad_linear_r2_adj = dad_linear_fit %>% get_r2adj(),
dad_nonlinear_r2_adj = dad_nonlinear_fit %>% get_r2adj()
)
}
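Example call (a sketch, not run here; the trait names and the default data = d are defined further below):
if (F) {
  #one row of adjusted R2 values for the four models for a single trait
  fit_model_comparisons("Intel")
}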
#function to perform non-random mating
#done by sorting the two columns based on a noisy version of one of them
#effectively results in imperfect sorting for the non-noisy version
non_random_pair = function(pair_phenotypes, noise = 1) {
pair_phenotypes %<>%
set_colnames(c("p1", "p2")) %>%
mutate(
#add initial numbers, so we can keep track of original positions
p1n = 1:n(),
p2n = 1:n()
) %>%
#sort by p2's true score
arrange(p2)
#subset the p1 part, and sort by noisy phenotype to produce imperfect true sorting
p1 = pair_phenotypes[c("p1n", "p1")] %>%
mutate(
#standardize and add noise; noise gives the SD of the added normal noise
noisy_p1 = standardize(p1) + rnorm(nrow(pair_phenotypes), sd = noise)
) %>%
#sort by noisy
arrange(noisy_p1)
#now we join the sorted p2 with the sorted p1, and these will be our semi-matched pairs
chosen_pairs = cbind(
p1$p1n,
#p2n indexes, already sorted by p2's true score
pair_phenotypes$p2n
)
chosen_pairs
}
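A minimal sanity check of the pairing function (toy data, wrapped in if (F)): matching two independent standard normal traits through the noisy sort should induce a cross-partner correlation of roughly .7 with noise = 1.
if (F) {
  set.seed(1)
  toy = tibble(p1 = rnorm(5000), p2 = rnorm(5000))
  toy_pairs = non_random_pair(toy, noise = 1)
  #correlation between the matched partners' true scores
  cor(toy$p1[toy_pairs[, 1]], toy$p2[toy_pairs[, 2]])
}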
#make function to smooth quantile
quantile_smooth = function(x, y, quantile, method = c("qgam", "Rq", "running"), k = 5) {
#resolve method choice
method = match.arg(method)
if (method == "qgam") {
#same as in example here
#https://rdrr.io/cran/qgam/man/qgam.html
fit = qgam::qgam(list(y ~ s(x, k = k, bs = "cr"), ~ s(x, k = k, bs = "cr")),
data = data.frame(x = x, y = y), qu = quantile)
return(predict(fit))
} else if (method == "Rq") {
#fit with spline
fit = rms::Rq(y ~ rcs(x), data = data.frame(x = x, y = y), tau = quantile)
return(fitted(fit) %>% as.vector())
} else if (method == "running") {
#beware, need to sort y by x first,
#then map the smoothed values back to the original order
x_order = order(x)
return(caTools::runquantile(y[x_order], k = k, probs = quantile)[order(x_order)])
} else {
stop(str_glue("method not recognized: {method}"), call. = F)
}
}
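Usage sketch (simulated data, wrapped in if (F)): with noise whose SD grows in |x|, the smoothed .1 and .9 quantile curves should fan out.
if (F) {
  set.seed(1)
  toy_x = rnorm(500)
  toy_y = toy_x + rnorm(500, sd = 1 + abs(toy_x)) #heteroscedastic noise
  q10 = quantile_smooth(toy_x, toy_y, quantile = .1, method = "Rq")
  q90 = quantile_smooth(toy_x, toy_y, quantile = .9, method = "Rq")
  plot(toy_x, q90 - q10) #band width should increase with |x|
}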
#loop convenience function
add_quantile_smooths = function(x, y, data, quantiles, method = "qgam", k = 30) {
#combinations of x variables and quantiles to add
todo = expand_grid(
x = x,
quantile = quantiles
)
#loop rows and add
for (i in seq_along_rows(todo)) {
i_var = todo$x[i]
i_quantile = todo$quantile[i]
#make var
data[[str_glue("{y}_q{i_quantile*100}")]] = NA_real_
#get values
i_vals = quantile_smooth(y = data[[y]],
x = data[[i_var]],
quantile = i_quantile,
method = method,
k = k)
#beware, functions differ in dealing with NA!
if (method == "qgam") {
data[[str_glue("{y}_q{i_quantile*100}")]][as.numeric(names(i_vals))] = i_vals
} else {
data[[str_glue("{y}_q{i_quantile*100}")]] = i_vals
}
}
data
}
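Usage sketch (assuming a data frame dd with numeric columns father and mother; not run here):
if (F) {
  dd_aug = add_quantile_smooths(x = "father", y = "mother", data = dd,
                                quantiles = c(.1, .9), method = "Rq", k = 5)
  #dd_aug gains the columns mother_q10 and mother_q90
}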
#function to find heteroscedasticity
#https://rpubs.com/EmilOWK/heteroscedasticity
test_HS = function(x, resid) {
#data
d = tibble(
resid = standardize(resid^2),
x = standardize(x),
resid_rank = rank(resid),
x_rank = rank(x)
)
#fits
mods = list(
fit_linear = ols(resid ~ x, data = d),
fit_spline = ols(resid ~ rcs(x), data = d),
fit_linear_rank = ols(resid_rank ~ x_rank, data = d),
fit_spline_rank = ols(resid_rank ~ rcs(x_rank), data = d)
)
#summarize
y = tibble(
test = c("linear raw", "spline raw", "linear rank", "spline rank"),
r2adj = map_dbl(mods, get_r2adj),
p = map_dbl(mods, get_p),
fit = mods
)
#for the rcs, we have to compute the model improvement vs. linear
#https://stackoverflow.com/questions/47937065/how-can-i-interpret-the-p-value-of-nonlinear-term-under-rms-anova
#https://stats.stackexchange.com/questions/405961/is-there-formal-test-of-non-linearity-in-linear-regression
y$p[2] = lrtest(mods$fit_linear, mods$fit_spline)$stats["P"]
y$p[4] = lrtest(mods$fit_linear_rank, mods$fit_spline_rank)$stats["P"]
y %>% mutate(log10_p = -log10(p))
}
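Sanity check sketch (simulated data, wrapped in if (F)): residuals from a fit with built-in heteroscedasticity should yield small p values across these tests.
if (F) {
  set.seed(1)
  hs_d = tibble(x = rnorm(500))
  hs_d$y = hs_d$x + rnorm(500, sd = 1 + abs(hs_d$x))
  hs_fit = ols(y ~ x, data = hs_d)
  test_HS(x = hs_d$x, resid = resid(hs_fit))
}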
#read
d = readxl::read_excel("data/DataSet_AM_4Emil.xls", na = c("", "#NULL!")) %>%
#remove code variable
select(
-code
) %>%
#reorder
select(sort(colnames(.))) %>%
#rename child
{
colnames(.) = colnames(.) %>%
str_replace("son_daughter", "child") %>%
str_replace("INTEL", "Intel") %>%
str_replace("age", "Age")
.
}
#education as ordinal
d$ed_father %<>% ordered()
d$ed_mother %<>% ordered()
d$ed_child %<>% ordered()
#sex as factor
d$sex_child %<>% factor()
#continuous traits
cont_traits = c(
"Age", "N", "E", "P", "Lies", "Intel"
)
#one extreme outlier (a very high P_mother score) is removed later, see d_rm_outlier below
#variables
d_variables = df_var_table(d)
#print
d_variables
#describe each variable
d %>% describe() %>% as.matrix()
## vars n mean sd median trimmed mad min max range skew
## Age_father 1 333 51.1 6.17 51 50.7 5.9 34 77 43 0.748
## Age_mother 2 337 48.4 5.45 48 48.1 5.9 36 71 35 0.601
## Age_child 3 339 20.3 2.41 20 20.0 1.5 17 37 20 2.360
## E_father 4 342 10.8 4.53 11 10.9 4.4 0 21 21 -0.158
## E_mother 5 342 11.4 3.70 12 11.6 3.0 2 20 18 -0.295
## E_child 6 342 13.9 3.98 14 14.2 4.4 1 21 20 -0.826
## ed_father* 7 333 2.2 1.16 2 2.1 1.5 1 4 3 0.443
## ed_mother* 8 337 1.9 1.11 2 1.8 1.5 1 4 3 0.831
## ed_child* 9 342 2.0 0.12 2 2.0 0.0 1 2 1 -8.052
## Intel_father 10 342 19.6 7.71 20 19.8 8.2 0 42 42 -0.216
## Intel_mother 11 342 18.5 7.37 19 18.6 7.4 0 37 37 -0.050
## Intel_child 12 342 25.4 5.74 26 25.5 5.2 0 42 42 -0.434
## Lies_father 13 342 12.9 3.98 13 13.1 4.4 0 20 20 -0.554
## Lies_mother 14 342 14.3 3.25 15 14.5 3.0 3 20 17 -0.586
## Lies_child 15 342 10.2 3.65 10 10.2 3.0 1 19 18 0.021
## N_father 16 342 9.0 4.85 8 8.8 4.4 0 24 24 0.481
## N_mother 17 342 11.1 5.05 11 11.0 5.9 0 23 23 0.224
## N_child 18 342 11.8 4.85 12 11.7 5.9 1 24 23 0.156
## P_father 19 342 6.0 2.99 6 5.8 3.0 0 21 21 1.003
## P_mother 20 342 5.9 2.75 6 5.7 3.0 0 20 20 0.769
## P_child 21 342 6.8 3.51 6 6.6 3.0 0 20 20 0.771
## sex_child* 22 339 1.8 0.42 2 1.8 0.0 1 2 1 -1.277
## kurtosis se
## Age_father 1.089 0.3382
## Age_mother 0.826 0.2967
## Age_child 11.214 0.1309
## E_father -0.538 0.2447
## E_mother -0.376 0.2000
## E_child 0.403 0.2152
## ed_father* -1.307 0.0636
## ed_mother* -0.735 0.0603
## ed_child* 63.027 0.0065
## Intel_father -0.141 0.4167
## Intel_mother -0.379 0.3983
## Intel_child 1.359 0.3103
## Lies_father 0.233 0.2151
## Lies_mother 0.014 0.1757
## Lies_child -0.299 0.1974
## N_father -0.130 0.2623
## N_mother -0.526 0.2730
## N_child -0.574 0.2623
## P_father 2.604 0.1616
## P_mother 1.586 0.1488
## P_child 0.474 0.1898
## sex_child* -0.371 0.0229
#pairwise cors for couples
d %>%
select(-contains("_child")) %>%
wtd.cors()
## Age_father Age_mother E_father E_mother ed_father ed_mother
## Age_father 1.0000 0.8142 -0.09101 0.0372 0.0035 0.0016
## Age_mother 0.8142 1.0000 -0.10612 -0.0015 0.0418 -0.0251
## E_father -0.0910 -0.1061 1.00000 -0.0070 -0.0119 0.0779
## E_mother 0.0372 -0.0015 -0.00703 1.0000 0.0521 0.0519
## ed_father 0.0035 0.0418 -0.01193 0.0521 1.0000 0.5251
## ed_mother 0.0016 -0.0251 0.07787 0.0519 0.5251 1.0000
## Intel_father -0.1426 -0.1410 0.06052 0.0892 0.2625 0.1914
## Intel_mother -0.0338 -0.0644 -0.00328 -0.0170 0.1396 0.1619
## Lies_father 0.1851 0.1386 0.00077 0.0473 -0.0674 -0.0320
## Lies_mother 0.1325 0.1661 -0.02303 -0.1133 -0.1654 -0.1786
## N_father -0.0726 -0.0416 0.02635 -0.0638 -0.1088 -0.0988
## N_mother -0.0172 -0.0504 0.05906 -0.1014 -0.1737 -0.1700
## P_father -0.0329 -0.0397 0.08069 -0.0426 0.0799 0.0867
## P_mother -0.1114 -0.0963 -0.00543 0.1120 0.1589 0.1467
## Intel_father Intel_mother Lies_father Lies_mother N_father
## Age_father -0.143 -0.0338 0.18509 0.132 -0.073
## Age_mother -0.141 -0.0644 0.13856 0.166 -0.042
## E_father 0.061 -0.0033 0.00077 -0.023 0.026
## E_mother 0.089 -0.0170 0.04728 -0.113 -0.064
## ed_father 0.263 0.1396 -0.06744 -0.165 -0.109
## ed_mother 0.191 0.1619 -0.03202 -0.179 -0.099
## Intel_father 1.000 0.4917 0.10227 -0.065 -0.013
## Intel_mother 0.492 1.0000 0.00862 0.019 -0.065
## Lies_father 0.102 0.0086 1.00000 0.274 -0.146
## Lies_mother -0.065 0.0195 0.27375 1.000 -0.016
## N_father -0.013 -0.0651 -0.14580 -0.016 1.000
## N_mother -0.048 -0.0833 -0.03934 -0.259 0.068
## P_father 0.045 -0.0070 -0.16844 -0.096 0.190
## P_mother -0.044 -0.0540 -0.18938 -0.406 -0.011
## N_mother P_father P_mother
## Age_father -0.0172 -0.0329 -0.1114
## Age_mother -0.0504 -0.0397 -0.0963
## E_father 0.0591 0.0807 -0.0054
## E_mother -0.1014 -0.0426 0.1120
## ed_father -0.1737 0.0799 0.1589
## ed_mother -0.1700 0.0867 0.1467
## Intel_father -0.0477 0.0446 -0.0444
## Intel_mother -0.0833 -0.0070 -0.0540
## Lies_father -0.0393 -0.1684 -0.1894
## Lies_mother -0.2593 -0.0955 -0.4055
## N_father 0.0681 0.1905 -0.0106
## N_mother 1.0000 -0.0099 0.0892
## P_father -0.0099 1.0000 0.1983
## P_mother 0.0892 0.1983 1.0000
#pairwise N's
d %>%
select(-contains("_child")) %>%
pairwiseCount()
## Age_father Age_mother E_father E_mother ed_father ed_mother
## Age_father 333 331 333 333 333 330
## Age_mother 331 337 337 337 331 335
## E_father 333 337 342 342 333 337
## E_mother 333 337 342 342 333 337
## ed_father 333 331 333 333 333 330
## ed_mother 330 335 337 337 330 337
## Intel_father 333 337 342 342 333 337
## Intel_mother 333 337 342 342 333 337
## Lies_father 333 337 342 342 333 337
## Lies_mother 333 337 342 342 333 337
## N_father 333 337 342 342 333 337
## N_mother 333 337 342 342 333 337
## P_father 333 337 342 342 333 337
## P_mother 333 337 342 342 333 337
## Intel_father Intel_mother Lies_father Lies_mother N_father
## Age_father 333 333 333 333 333
## Age_mother 337 337 337 337 337
## E_father 342 342 342 342 342
## E_mother 342 342 342 342 342
## ed_father 333 333 333 333 333
## ed_mother 337 337 337 337 337
## Intel_father 342 342 342 342 342
## Intel_mother 342 342 342 342 342
## Lies_father 342 342 342 342 342
## Lies_mother 342 342 342 342 342
## N_father 342 342 342 342 342
## N_mother 342 342 342 342 342
## P_father 342 342 342 342 342
## P_mother 342 342 342 342 342
## N_mother P_father P_mother
## Age_father 333 333 333
## Age_mother 337 337 337
## E_father 342 342 342
## E_mother 342 342 342
## ed_father 333 333 333
## ed_mother 337 337 337
## Intel_father 342 342 342
## Intel_mother 342 342 342
## Lies_father 342 342 342
## Lies_mother 342 342 342
## N_father 342 342 342
## N_mother 342 342 342
## P_father 342 342 342
## P_mother 342 342 342
#p values for those who want that
d %>%
select(-contains("_child")) %>%
map_df(as.numeric) %>%
cor_matrix(p_val = T)
## Age_father Age_mother E_father
## Age_father "1" "0.81 [p=1.08e-79]" "-0.09 [p=0.0973]"
## Age_mother "0.81 [p=1.08e-79]" "1" "-0.11 [p=0.0516]"
## E_father "-0.09 [p=0.0973]" "-0.11 [p=0.0516]" "1"
## E_mother "0.04 [p=0.499]" "0.00 [p=0.978]" "-0.01 [p=0.897]"
## ed_father "0.00 [p=0.950]" "0.04 [p=0.448]" "-0.01 [p=0.828]"
## ed_mother "0.00 [p=0.977]" "-0.03 [p=0.647]" "0.08 [p=0.154]"
## Intel_father "-0.14 [p=0.00917]" "-0.14 [p=0.00956]" "0.06 [p=0.264]"
## Intel_mother "-0.03 [p=0.539]" "-0.06 [p=0.238]" "0.00 [p=0.952]"
## Lies_father "0.19 [p=0.000688]" "0.14 [p=0.0109]" "0.00 [p=0.989]"
## Lies_mother "0.13 [p=0.0156]" "0.17 [p=0.00223]" "-0.02 [p=0.671]"
## N_father "-0.07 [p=0.186]" "-0.04 [p=0.446]" "0.03 [p=0.627]"
## N_mother "-0.02 [p=0.755]" "-0.05 [p=0.357]" "0.06 [p=0.276]"
## P_father "-0.03 [p=0.550]" "-0.04 [p=0.468]" "0.08 [p=0.136]"
## P_mother "-0.11 [p=0.0422]" "-0.10 [p=0.0776]" "-0.01 [p=0.920]"
## E_mother ed_father ed_mother
## Age_father "0.04 [p=0.499]" "0.00 [p=0.950]" "0.00 [p=0.977]"
## Age_mother "0.00 [p=0.978]" "0.04 [p=0.448]" "-0.03 [p=0.647]"
## E_father "-0.01 [p=0.897]" "-0.01 [p=0.828]" "0.08 [p=0.154]"
## E_mother "1" "0.05 [p=0.344]" "0.05 [p=0.342]"
## ed_father "0.05 [p=0.344]" "1" "0.53 [p=8.82e-25]"
## ed_mother "0.05 [p=0.342]" "0.53 [p=8.82e-25]" "1"
## Intel_father "0.09 [p=0.0997]" "0.26 [p=1.19e-06]" "0.19 [p=0.000409]"
## Intel_mother "-0.02 [p=0.755]" "0.14 [p=0.0108]" "0.16 [p=0.00287]"
## Lies_father "0.05 [p=0.383]" "-0.07 [p=0.220]" "-0.03 [p=0.558]"
## Lies_mother "-0.11 [p=0.0363]" "-0.17 [p=0.00247]" "-0.18 [p=0.000993]"
## N_father "-0.06 [p=0.239]" "-0.11 [p=0.0473]" "-0.10 [p=0.0702]"
## N_mother "-0.10 [p=0.0609]" "-0.17 [p=0.00147]" "-0.17 [p=0.00174]"
## P_father "-0.04 [p=0.433]" "0.08 [p=0.146]" "0.09 [p=0.112]"
## P_mother "0.11 [p=0.0384]" "0.16 [p=0.00365]" "0.15 [p=0.00698]"
## Intel_father Intel_mother Lies_father
## Age_father "-0.14 [p=0.00917]" "-0.03 [p=0.539]" "0.19 [p=0.000688]"
## Age_mother "-0.14 [p=0.00956]" "-0.06 [p=0.238]" "0.14 [p=0.0109]"
## E_father "0.06 [p=0.264]" "0.00 [p=0.952]" "0.00 [p=0.989]"
## E_mother "0.09 [p=0.0997]" "-0.02 [p=0.755]" "0.05 [p=0.383]"
## ed_father "0.26 [p=1.19e-06]" "0.14 [p=0.0108]" "-0.07 [p=0.220]"
## ed_mother "0.19 [p=0.000409]" "0.16 [p=0.00287]" "-0.03 [p=0.558]"
## Intel_father "1" "0.49 [p=3.2e-22]" "0.10 [p=0.0589]"
## Intel_mother "0.49 [p=3.2e-22]" "1" "0.01 [p=0.874]"
## Lies_father "0.10 [p=0.0589]" "0.01 [p=0.874]" "1"
## Lies_mother "-0.07 [p=0.227]" "0.02 [p=0.720]" "0.27 [p=2.71e-07]"
## N_father "-0.01 [p=0.817]" "-0.07 [p=0.230]" "-0.15 [p=0.00691]"
## N_mother "-0.05 [p=0.379]" "-0.08 [p=0.124]" "-0.04 [p=0.468]"
## P_father "0.04 [p=0.411]" "-0.01 [p=0.897]" "-0.17 [p=0.00177]"
## P_mother "-0.04 [p=0.413]" "-0.05 [p=0.319]" "-0.19 [p=0.000429]"
## Lies_mother N_father N_mother
## Age_father "0.13 [p=0.0156]" "-0.07 [p=0.186]" "-0.02 [p=0.755]"
## Age_mother "0.17 [p=0.00223]" "-0.04 [p=0.446]" "-0.05 [p=0.357]"
## E_father "-0.02 [p=0.671]" "0.03 [p=0.627]" "0.06 [p=0.276]"
## E_mother "-0.11 [p=0.0363]" "-0.06 [p=0.239]" "-0.10 [p=0.0609]"
## ed_father "-0.17 [p=0.00247]" "-0.11 [p=0.0473]" "-0.17 [p=0.00147]"
## ed_mother "-0.18 [p=0.000993]" "-0.10 [p=0.0702]" "-0.17 [p=0.00174]"
## Intel_father "-0.07 [p=0.227]" "-0.01 [p=0.817]" "-0.05 [p=0.379]"
## Intel_mother "0.02 [p=0.720]" "-0.07 [p=0.230]" "-0.08 [p=0.124]"
## Lies_father "0.27 [p=2.71e-07]" "-0.15 [p=0.00691]" "-0.04 [p=0.468]"
## Lies_mother "1" "-0.02 [p=0.768]" "-0.26 [p=1.17e-06]"
## N_father "-0.02 [p=0.768]" "1" "0.07 [p=0.209]"
## N_mother "-0.26 [p=1.17e-06]" "0.07 [p=0.209]" "1"
## P_father "-0.10 [p=0.0778]" "0.19 [p=0.000397]" "-0.01 [p=0.855]"
## P_mother "-0.41 [p=5.69e-15]" "-0.01 [p=0.845]" "0.09 [p=0.0995]"
## P_father P_mother
## Age_father "-0.03 [p=0.550]" "-0.11 [p=0.0422]"
## Age_mother "-0.04 [p=0.468]" "-0.10 [p=0.0776]"
## E_father "0.08 [p=0.136]" "-0.01 [p=0.920]"
## E_mother "-0.04 [p=0.433]" "0.11 [p=0.0384]"
## ed_father "0.08 [p=0.146]" "0.16 [p=0.00365]"
## ed_mother "0.09 [p=0.112]" "0.15 [p=0.00698]"
## Intel_father "0.04 [p=0.411]" "-0.04 [p=0.413]"
## Intel_mother "-0.01 [p=0.897]" "-0.05 [p=0.319]"
## Lies_father "-0.17 [p=0.00177]" "-0.19 [p=0.000429]"
## Lies_mother "-0.10 [p=0.0778]" "-0.41 [p=5.69e-15]"
## N_father "0.19 [p=0.000397]" "-0.01 [p=0.845]"
## N_mother "-0.01 [p=0.855]" "0.09 [p=0.0995]"
## P_father "1" "0.20 [p=0.000224]"
## P_mother "0.20 [p=0.000224]" "1"
#heat map
d %>%
select(-contains("_child")) %>%
GG_heatmap(reorder_vars = F)
GG_save("figs/parent_heatmap.png")
#if we treat them as latents
d %>%
select(-contains("_child")) %>%
map_df(as.numeric) %>%
mixedCor() ->
latent_cors
#heatmap again
latent_cors$rho %>%
GG_heatmap(color_label = "Correlation", reorder_vars = F)
GG_save("figs/parent_heatmap_latent.png")
We want the cross-partner plots for each of the 6 paired variables.
#attempt a fancy facet wrap approach
#convert to long form
d_cont_long = d %>%
#get continuous traits
select(contains(cont_traits + "_")) %>%
#z score
df_standardize() %>%
#add id
mutate(
id = 1:n()
) %>%
#convert form
gather(key = trait, value = value, -id) %>%
separate(col = trait, into = c("trait", "person"), sep = "_") %>%
spread(key = "person", value = "value")
#plot
d_cont_long %>%
ggplot(aes(mother, father)) +
geom_point() +
geom_smooth() +
facet_wrap("trait")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 11 rows containing non-finite values (stat_smooth).
## Warning: Removed 11 rows containing missing values (geom_point).
GG_save("figs/pairweise_trait_scatter.png")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 11 rows containing non-finite values (stat_smooth).
## Warning: Removed 11 rows containing missing values (geom_point).
We compare the linear association with the restricted cubic spline association in terms of adjusted R2.
#loop over traits
linear_nonlinear_results = map_df(cont_traits, fit_model_comparisons) %>%
#gains
add_gains()
#print
linear_nonlinear_results
#describe
linear_nonlinear_results %>% select(mom_gain, dad_gain) %>% unlist() %>% describe() %>% as.matrix()
## vars n mean sd median trimmed mad min max range skew
## X1 1 12 0.0074 0.017 0.00076 0.0043 0.0076 -0.0066 0.052 0.058 1.4
## kurtosis se
## X1 0.98 0.0049
#filter out the family with the extreme P_mother score
d_rm_outlier = d %>%
filter(P_mother <= 15)
#redo model fits
#loop over traits
linear_nonlinear_rm_outlier_results = map_df(cont_traits, fit_model_comparisons, data = d_rm_outlier) %>%
#gains
add_gains()
#print
linear_nonlinear_rm_outlier_results
#describe
linear_nonlinear_rm_outlier_results %>% select(mom_gain, dad_gain) %>% unlist() %>% describe() %>% as.matrix()
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 12 0.0038 0.011 0.0016 0.0019 0.0059 -0.007 0.033 0.04 1.5 1.3
## se
## X1 0.0032
#correlations without outlier
#if we treat them as latents
d_rm_outlier %>%
select(-contains("_child")) %>%
map_df(as.numeric) %>%
mixedCor() ->
latent_cors_rm_outlier
#heatmap again
latent_cors_rm_outlier$rho %>%
GG_heatmap(color_label = "Correlation", reorder_vars = F)
GG_save("figs/parent_heatmap_latent_rm_outlier.png")
#pairwise scatter
#convert to long form
d_cont_long_rm_outlier = d_rm_outlier %>%
#get continuous traits
select(contains(cont_traits + "_")) %>%
#z score
df_standardize() %>%
#add id
mutate(
id = 1:n()
) %>%
#convert form
gather(key = trait, value = value, -id) %>%
separate(col = trait, into = c("trait", "person"), sep = "_") %>%
spread(key = "person", value = "value")
#plot
d_cont_long_rm_outlier %>%
ggplot(aes(mother, father)) +
geom_point() +
geom_smooth() +
facet_wrap("trait")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 11 rows containing non-finite values (stat_smooth).
## Warning: Removed 11 rows containing missing values (geom_point).
GG_save("figs/pairweise_trait_scatter_rm_outlier.png")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 11 rows containing non-finite values (stat_smooth).
## Warning: Removed 11 rows containing missing values (geom_point).
Are couples who are more closely matched on one trait also more closely matched on others? Or maybe less so?
#calculate the (signed) differences between partners' z scores
#and correlate them
d_dists = map_dfc(c(cont_traits, "ed"), function(v) {
y = tibble(
#calc dist
dist = standardize(as.numeric(d[[str_glue("{v}_mother")]])) - standardize(as.numeric(d[[str_glue("{v}_father")]]))
)
colnames(y) = v
y
})
#plot cors
d_dists %>%
GG_heatmap()
GG_save("figs/cross_trait_dists.png")
Reviewer 1 asked about this. https://cran.r-project.org/web/packages/olsrr/vignettes/heteroskedasticity.html
Simulation results at: https://rpubs.com/EmilOWK/heteroscedasticity
#loop across the 6 traits, test for heteroscedasticity with Bartlett method
map_df(cont_traits, function(v) {
#get data subset
dd = d_cont_long_rm_outlier %>% filter(trait == v)
dd_ranked = dd %>% df_rank()
#do tests
tibble(
trait = v,
raw = ols_test_bartlett(dd, "father", "mother")$pval,
ranked = ols_test_bartlett(dd_ranked, "father", "mother")$pval,
)
})
#custom method
map_df(cont_traits, function(v) {
#get data subset
dd = d_cont_long_rm_outlier %>% filter(trait == v)
#do tests both ways
fit_mother_father = ols(mother ~ father, data = dd)
res_mother_father = test_HS(resid = resid(fit_mother_father), x = dd$father)
fit_father_mother = ols(father ~ mother, data = dd)
res_father_mother = test_HS(resid = resid(fit_father_mother), x = dd$mother)
bind_rows(
res_mother_father %>% mutate(direction = "mother ~ father"),
res_father_mother %>% mutate(direction = "father ~ mother")
) %>% mutate(trait = v)
}) %>%
select(-fit) %>%
tidyr::separate(test, sep = " ", into = c("form", "transformation")) ->
HS_results
#inspect
HS_results
#p values below .05
mean(HS_results$p < .05)
## [1] 0.38
HS_results %>% group_by(trait) %>% dplyr::summarize(frac_05 = mean(p < .05))
## `summarise()` ungrouping output (override with `.groups` argument)
HS_results %>% group_by(trait=="Age") %>% dplyr::summarize(frac_05 = mean(p < .05))
## `summarise()` ungrouping output (override with `.groups` argument)
HS_results %>% group_by(form) %>% dplyr::summarize(frac_05 = mean(p < .05))
## `summarise()` ungrouping output (override with `.groups` argument)
HS_results %>% group_by(direction) %>% dplyr::summarize(frac_05 = mean(p < .05))
## `summarise()` ungrouping output (override with `.groups` argument)
HS_results %>% group_by(transformation) %>% dplyr::summarize(frac_05 = mean(p < .05))
## `summarise()` ungrouping output (override with `.groups` argument)
#predict p values
ols(log10_p ~ direction + form + transformation + trait, data = HS_results)
## Linear Regression Model
##
## ols(formula = log10_p ~ direction + form + transformation + trait,
## data = HS_results)
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 48 LR chi2 18.81 R2 0.324
## sigma1.3366 d.f. 8 R2 adj 0.186
## d.f. 39 Pr(> chi2) 0.0159 g 0.976
##
## Residuals
##
## Min 1Q Median 3Q Max
## -1.9639 -0.6803 -0.1535 0.5211 5.3374
##
##
## Coef S.E. t Pr(>|t|)
## Intercept 1.8797 0.5788 3.25 0.0024
## direction=mother ~ father 0.3638 0.3858 0.94 0.3515
## form=spline 0.3351 0.3858 0.87 0.3904
## transformation=raw 0.3290 0.3858 0.85 0.3991
## trait=E -1.4996 0.6683 -2.24 0.0306
## trait=Intel -0.4853 0.6683 -0.73 0.4721
## trait=Lies -1.9364 0.6683 -2.90 0.0061
## trait=N -1.9894 0.6683 -2.98 0.0050
## trait=P -0.4526 0.6683 -0.68 0.5023
##
#inspect strongest signals
#first augment the data with the smoothed quantiles
d_cont_long_rm_outlier_aug = plyr::ddply(d_cont_long_rm_outlier, "trait", function(dd) {
#add them
dd = add_quantile_smooths(data = dd, x = "father", y = "mother", quantiles = c(.1, .9), method = "Rq", k = 5)
dd = add_quantile_smooths(data = dd, x = "mother", y = "father", quantiles = c(.1, .9), method = "Rq", k = 5)
})
## Warning in quantreg::summary.rq(fit, covariance = TRUE, se = se, hs = hs): 7
## non-positive fis
## Warning in quantreg::summary.rq(fit, covariance = TRUE, se = se, hs = hs): 3
## non-positive fis
## Warning in quantreg::summary.rq(fit, covariance = TRUE, se = se, hs = hs): 7
## non-positive fis
## Warning in quantreg::summary.rq(fit, covariance = TRUE, se = se, hs = hs): 1
## non-positive fis
#inspect the age pattern; the heteroscedasticity is obvious
d_cont_long_rm_outlier_aug %>%
filter(
trait == "Age",
!is.na(father), !is.na(mother)
) %>%
ggplot(aes(father, mother)) +
geom_point() +
geom_smooth(method = lm, se = F) +
geom_ribbon(mapping = aes(
ymin = mother_q10,
ymax = mother_q90
), alpha = .4) ->
HS_age_MF
HS_age_MF
## `geom_smooth()` using formula 'y ~ x'
GG_save("figs/age_mother~father.png")
## `geom_smooth()` using formula 'y ~ x'
#inspect the age pattern, reverse direction
d_cont_long_rm_outlier_aug %>%
filter(
trait == "Age",
!is.na(father), !is.na(mother)
) %>%
ggplot(aes(mother, father)) +
geom_point() +
geom_smooth(method = lm, se = F) +
geom_ribbon(mapping = aes(
ymin = father_q10,
ymax = father_q90
), alpha = .4) ->
HS_age_FM
HS_age_FM
## `geom_smooth()` using formula 'y ~ x'
GG_save("figs/age_father~mother.png")
## `geom_smooth()` using formula 'y ~ x'
#intel
d_cont_long_rm_outlier_aug %>%
filter(
trait == "Intel",
!is.na(father), !is.na(mother)
) %>%
ggplot(aes(father, mother)) +
geom_point() +
geom_smooth(method = lm, se = F) +
geom_ribbon(mapping = aes(
ymin = mother_q10,
ymax = mother_q90
), alpha = .4) ->
HS_intel_MF
HS_intel_MF
## `geom_smooth()` using formula 'y ~ x'
GG_save("figs/intel_mother~father.png")
## `geom_smooth()` using formula 'y ~ x'
#P
d_cont_long_rm_outlier_aug %>%
filter(
trait == "P",
!is.na(father), !is.na(mother)
) %>%
ggplot(aes(mother, father)) +
geom_point() +
geom_smooth(method = lm, se = F) +
geom_ribbon(mapping = aes(
ymin = father_q10,
ymax = father_q90
), alpha = .4) ->
HS_P_FM
HS_P_FM
## `geom_smooth()` using formula 'y ~ x'
GG_save("figs/P_father~mother.png")
## `geom_smooth()` using formula 'y ~ x'
#E
d_cont_long_rm_outlier_aug %>%
filter(
trait == "E",
!is.na(father), !is.na(mother)
) %>%
ggplot(aes(mother, father)) +
geom_point() +
geom_smooth(method = lm, se = F) +
geom_ribbon(mapping = aes(
ymin = father_q10,
ymax = father_q90
), alpha = .4) ->
HS_E_FM
HS_E_FM
## `geom_smooth()` using formula 'y ~ x'
GG_save("figs/E_father~mother.png")
## `geom_smooth()` using formula 'y ~ x'
# combined plot
#https://cran.r-project.org/web/packages/cowplot/vignettes/introduction.html
plot_grid(
HS_age_MF,
HS_intel_MF,
HS_P_FM,
HS_E_FM,
labels = c(" age", "intelligence", "psychoticism", "extroversion"),
vjust = 2
)
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
GG_save("figs/HS_4plot.png")
Do the methods work as expected?
Simulate data with nonlinear patterns and see if we can detect it. Vary the noise and sample size.
#manual example
set.seed(1)
sim_manual_data = tibble(
n = 340,
trait_mother = seq(0, pi, length.out = n),
trait_father = sin(trait_mother) + rnorm(n, sd = .5)
)
#fit models manually
(sim_manual_linear = ols(trait_father ~ trait_mother, data = sim_manual_data))
## Linear Regression Model
##
## ols(formula = trait_father ~ trait_mother, data = sim_manual_data)
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 340 LR chi2 0.00 R2 0.000
## sigma0.5544 d.f. 1 R2 adj -0.003
## d.f. 338 Pr(> chi2) 0.9746 g 0.001
##
## Residuals
##
## Min 1Q Median 3Q Max
## -1.647217 -0.365306 0.003739 0.317401 1.537018
##
##
## Coef S.E. t Pr(>|t|)
## Intercept 0.6602 0.0600 11.00 <0.0001
## trait_mother -0.0010 0.0331 -0.03 0.9747
##
(sim_manual_nonlinear = ols(trait_father ~ rcs(trait_mother), data = sim_manual_data))
## Linear Regression Model
##
## ols(formula = trait_father ~ rcs(trait_mother), data = sim_manual_data)
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 340 LR chi2 99.64 R2 0.254
## sigma0.4810 d.f. 4 R2 adj 0.245
## d.f. 335 Pr(> chi2) 0.0000 g 0.315
##
## Residuals
##
## Min 1Q Median 3Q Max
## -1.45229 -0.29993 -0.02196 0.31171 1.29158
##
##
## Coef S.E. t Pr(>|t|)
## Intercept 0.0878 0.0999 0.88 0.3802
## trait_mother 0.8826 0.2075 4.25 <0.0001
## trait_mother' -1.6219 1.2033 -1.35 0.1786
## trait_mother'' 1.7818 3.6558 0.49 0.6263
## trait_mother''' -0.3374 4.8431 -0.07 0.9445
##
sim_manual_data$linear_predict = predict(sim_manual_linear)
sim_manual_data$nonlinear_predict = predict(sim_manual_nonlinear)
#plot it
sim_manual_data %>%
ggplot(aes(trait_mother, trait_father)) +
geom_point() +
#regression lines
geom_line(
mapping = aes(y = linear_predict),
color = "red",
size = 1.5
) +
geom_line(
mapping = aes(y = nonlinear_predict),
color = "blue",
size = 1.5
)
GG_save("figs/spline_example.png")
#params for simulation
sim_pars = expand_grid(
n = seq(10, 1000, by = 10),
noise_sd = seq(0, 2, by = .5),
replicate = 1:10
)
#loop and get results
set.seed(1)
sim_results = map_df(seq_along_rows(sim_pars), function(i) {
#simulate data
x_data = tibble(
trait_mother = seq(0, pi, length.out = sim_pars$n[i]),
trait_father = sin(trait_mother) + rnorm(n = sim_pars$n[i], mean = 0, sd = sim_pars$noise_sd[i])
)
#call function for results
model_results = fit_model_comparisons("trait", data = x_data) %>%
mutate(
#sim pars
n = sim_pars$n[i],
noise_sd = sim_pars$noise_sd[i]
)
model_results
})
#summarize results
sim_results %>%
#gains
add_gains() %>%
plyr::ddply(c("n", "noise_sd"), function(x) {
x %>%
select(mom_linear_r2_adj:dad_nonlinear_r2_adj, mom_gain, dad_gain) %>%
describe() %>%
as.data.frame() %>%
rownames_to_column(var = "variable") %>% select(-n, -vars, -se)
}) ->
sim_results_summary
#dad gain only
sim_results_summary %>%
filter(
variable == "dad_gain"
) %>%
#plot
ggplot(aes(n, mean, color = ordered(round(noise_sd, 2)))) +
# geom_line() +
geom_ribbon(aes(ymin = min, ymax = max), alpha = .2) +
scale_color_ordinal("Amount of noise (SD)") +
scale_x_log10(breaks = c(10^(1:3), 3*10^(1:3))) +
scale_y_continuous("Mean gain in R2 adj.", breaks = seq(-1, 2, .1)) +
theme_classic()
GG_save("figs/simulation_results.png")
A simple two-trait model.
#simulate some random trait data
sim_n = 10000
set.seed(1)
sim_traits = tibble(
male_attractiveness = rnorm(sim_n),
male_niceness = rnorm(sim_n),
female_attractiveness = rnorm(sim_n),
female_niceness = rnorm(sim_n)
) %>%
mutate(
male_composite = (male_attractiveness + male_niceness) %>% standardize(),
female_composite = (female_attractiveness + female_niceness) %>% standardize()
)
#find mates
sim_mates = non_random_pair(sim_traits %>% select(male_composite, female_composite))
#reordered data
sim_traits_paired = cbind(
sim_traits[sim_mates[, 1], ] %>% select(starts_with("male")),
sim_traits[sim_mates[, 2], ] %>% select(starts_with("female"))
)
#verify correlations
sim_traits_paired %>% wtd.cors()
## male_attractiveness male_niceness male_composite
## male_attractiveness 1.0000 0.0048 0.72
## male_niceness 0.0048 1.0000 0.70
## male_composite 0.7163 0.7012 1.00
## female_attractiveness 0.3589 0.3417 0.49
## female_niceness 0.3433 0.3413 0.48
## female_composite 0.5008 0.4871 0.70
## female_attractiveness female_niceness female_composite
## male_attractiveness 0.359 0.343 0.50
## male_niceness 0.342 0.341 0.49
## male_composite 0.494 0.483 0.70
## female_attractiveness 1.000 -0.017 0.70
## female_niceness -0.017 1.000 0.70
## female_composite 0.704 0.698 1.00
#above some composite score
sim_traits_paired %>% filter(male_composite > 1) %>% wtd.cors()
## male_attractiveness male_niceness male_composite
## male_attractiveness 1.000 -0.661 0.43
## male_niceness -0.661 1.000 0.39
## male_composite 0.430 0.393 1.00
## female_attractiveness 0.153 0.057 0.26
## female_niceness 0.066 0.120 0.23
## female_composite 0.184 0.147 0.40
## female_attractiveness female_niceness female_composite
## male_attractiveness 0.153 0.066 0.18
## male_niceness 0.057 0.120 0.15
## male_composite 0.256 0.226 0.40
## female_attractiveness 1.000 -0.282 0.61
## female_niceness -0.282 1.000 0.59
## female_composite 0.612 0.585 1.00
sim_traits_paired %>% filter(male_composite < 1) %>% wtd.cors()
## male_attractiveness male_niceness male_composite
## male_attractiveness 1.00 -0.22 0.64
## male_niceness -0.22 1.00 0.61
## male_composite 0.64 0.61 1.00
## female_attractiveness 0.26 0.25 0.41
## female_niceness 0.26 0.24 0.40
## female_composite 0.39 0.37 0.61
## female_attractiveness female_niceness female_composite
## male_attractiveness 0.26 0.26 0.39
## male_niceness 0.25 0.24 0.37
## male_composite 0.41 0.40 0.61
## female_attractiveness 1.00 -0.11 0.67
## female_niceness -0.11 1.00 0.66
## female_composite 0.67 0.66 1.00
#plot to visualize the collider bias induced by selecting on the composite
sim_traits_paired %>%
ggplot(aes(male_attractiveness, male_niceness, color = male_composite > 1)) +
geom_point(alpha = .2) +
# geom_smooth() +
geom_smooth(aes(color = NULL, group = male_composite > 1)) +
geom_smooth(aes(color = NULL), color = "black") +
scale_x_continuous("Male attractiveness") +
scale_y_continuous("Male niceness") +
scale_color_discrete("Composite score > 1", labels = c("No", "Yes"))
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
GG_save("figs/composite_score_collider_bias.png")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
#no need to save data, we didn't modify it in any notable way
#versions
write_sessioninfo()
## R version 4.0.2 (2020-06-22)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Linux Mint 19.3
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=de_DE.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=de_DE.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=de_DE.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] cowplot_1.0.0 quantreg_5.55 olsrr_0.5.3
## [4] osfr_0.2.8 rms_5.1-4 SparseM_1.78
## [7] readxl_1.3.1 kirkegaard_2020-09-05 metafor_2.4-0
## [10] Matrix_1.2-18 psych_1.9.12.31 magrittr_1.5
## [13] assertthat_0.2.1 weights_1.0.1 mice_3.9.0
## [16] gdata_2.18.0 Hmisc_4.4-0 Formula_1.2-3
## [19] survival_3.1-12 lattice_0.20-41 forcats_0.5.0
## [22] stringr_1.4.0 dplyr_0.8.99.9003 purrr_0.3.4
## [25] readr_1.3.1 tidyr_1.1.0 tibble_3.0.1
## [28] ggplot2_3.3.2 tidyverse_1.3.0 pacman_0.5.1
##
## loaded via a namespace (and not attached):
## [1] TH.data_1.0-10 colorspace_1.4-1 rio_0.5.16
## [4] ellipsis_0.3.1 htmlTable_2.0.0 base64enc_0.1-3
## [7] fs_1.4.2 rstudioapi_0.11 httpcode_0.3.0
## [10] farver_2.0.3 MatrixModels_0.4-1 fansi_0.4.1
## [13] mvtnorm_1.1-0 lubridate_1.7.9 xml2_1.3.2
## [16] codetools_0.2-16 splines_4.0.2 mnormt_2.0.1
## [19] knitr_1.29 jsonlite_1.7.0 broom_0.5.6
## [22] cluster_2.1.0 dbplyr_1.4.4 png_0.1-7
## [25] compiler_4.0.2 httr_1.4.1 backports_1.1.8
## [28] cli_2.0.2 acepack_1.4.1 htmltools_0.5.0
## [31] tools_4.0.2 gtable_0.3.0 glue_1.4.1
## [34] Rcpp_1.0.4.6 carData_3.0-3 cellranger_1.1.0
## [37] vctrs_0.3.1 crul_0.9.0 nlme_3.1-149
## [40] xfun_0.15 openxlsx_4.1.5 rvest_0.3.5
## [43] lifecycle_0.2.0 gtools_3.8.2 goftest_1.2-2
## [46] polspline_1.1.19 MASS_7.3-52 zoo_1.8-8
## [49] scales_1.1.1 hms_0.5.3 parallel_4.0.2
## [52] sandwich_2.5-1 RColorBrewer_1.1-2 yaml_2.2.1
## [55] curl_4.3 memoise_1.1.0 gridExtra_2.3
## [58] rpart_4.1-15 latticeExtra_0.6-29 stringi_1.4.6
## [61] nortest_1.0-4 checkmate_2.0.0 zip_2.0.4
## [64] rlang_0.4.6 pkgconfig_2.0.3 evaluate_0.14
## [67] labeling_0.3 htmlwidgets_1.5.1 tidyselect_1.1.0
## [70] plyr_1.8.6 R6_2.4.1 generics_0.0.2
## [73] multcomp_1.4-13 DBI_1.1.0 mgcv_1.8-33
## [76] pillar_1.4.4 haven_2.3.1 foreign_0.8-79
## [79] withr_2.2.0 abind_1.4-5 nnet_7.3-14
## [82] modelr_0.1.8 crayon_1.3.4 car_3.0-7
## [85] tmvnsim_1.0-2 rmarkdown_2.3 jpeg_0.1-8.1
## [88] grid_4.0.2 data.table_1.12.8 blob_1.2.1
## [91] reprex_0.3.0 digest_0.6.25 munsell_0.5.0
## [94] viridisLite_0.3.0
if (F) {
#upload to OSF
#use approach in https://rpubs.com/EmilOWK/osfr_demo
#you need to set up a personal token
#good idea is to then place it somewhere safe for reuse
osf_auth(read_lines("~/.config/osf_token"))
#the project we will use
osf_proj = osf_retrieve_node("https://osf.io/qjcp6/")
#upload all files in project
#overwrite existing (versioning)
osf_upload(osf_proj, path = list.files(), conflicts = "overwrite")
}