Init
library(kirkegaard)
## Loading required package: tidyverse
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.6 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.4 ✓ stringr 1.4.0
## ✓ readr 2.1.0 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## Loading required package: weights
## Loading required package: Hmisc
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, units
## Loading required package: assertthat
##
## Attaching package: 'assertthat'
## The following object is masked from 'package:tibble':
##
## has_name
## Loading required package: magrittr
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
## Loading required package: psych
##
## Attaching package: 'psych'
## The following object is masked from 'package:Hmisc':
##
## describe
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
## Loading required package: metafor
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
## Loading the 'metafor' package (version 3.0-2). For an
## introduction to the package please type: help(metafor)
## Loading required package: rlang
##
## Attaching package: 'rlang'
## The following object is masked from 'package:magrittr':
##
## set_names
## The following object is masked from 'package:assertthat':
##
## has_name
## The following objects are masked from 'package:purrr':
##
## %@%, as_function, flatten, flatten_chr, flatten_dbl, flatten_int,
## flatten_lgl, flatten_raw, invoke, list_along, modify, prepend,
## splice
##
## Attaching package: 'kirkegaard'
## The following object is masked from 'package:rlang':
##
## is_logical
## The following object is masked from 'package:psych':
##
## rescale
## The following object is masked from 'package:assertthat':
##
## are_equal
## The following objects are masked from 'package:purrr':
##
## is_logical, is_numeric
## The following object is masked from 'package:base':
##
## +
load_packages(
mirt,
rms,
GGally,
readxl
)
## Loading required package: stats4
## Loading required package: SparseM
##
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
##
## backsolve
##
## Attaching package: 'rms'
## The following object is masked from 'package:metafor':
##
## vif
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
theme_set(theme_bw())
options(
contrasts=c("contr.treatment", "contr.treatment")
)
Data
#read admix data
d = read_tsv("data/Aggregate-data_Subtypes_of_Native_American_Ancestry_and_Death.txt", comment = "#")
## Rows: 1805 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (6): gender, region, ageclass, socioecost, education, salary
## dbl (10): hgdp, ceu_3, yri_3, mapaym, ceu_3z, yri_3z, aym, map, ceu_4, yri_4
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#recode
d %<>% mutate(
id = 1:n(),
income = salary %>% mapvalues(from = c("z<350 000", "350-450", "450+", "Missing"), to = c(1, 2, 3, NA)) %>% ordered(),
education = education %>% ordered(levels = c("Primary/secondary school", "Technical", "University/postgrade")),
social_class = socioecost %>% mapvalues(from = c("ABC1", "C2", "C3", "D", "Missing"), to = c(1, 2, 3, 4, NA)) %>% ordered() %>% fct_rev(),
age = ageclass %>% mapvalues(from = "z>32", to = ">32") %>% ordered(levels = c("<24", "24-26", "27-32", ">32")),
mapaym2 = map+aym
)
# gender (male, female)
# region (Arica, Tarapaca, Antofagasta, Atacama, Coquimbo, Valparaiso, ZMetropolitana, OHiggins, Maule, Biobio, Araucania, Rios, Lagos, Aisen, Magallanes)
# ageclass (<24, 24-26, 27-32, z>32)
# socioecost (ABC1, C2, C3, D, Missing)
# education (Primary/secondary school, Technical, University/postgrade)
# salary (z<350 000, 350-450, 450+, Missing)
# hgdp (numeric values from 0 to 1): Native American ancestry proportions from supervised ADMIXTURE with 3 references (CEU, YRI, HGDP)
# ceu_3 (numeric values from 0 to 1): European ancestry proportions from supervised ADMIXTURE with 3 references (CEU, YRI, HGDP)
# yri_3 (numeric values from 0 to 1): African ancestry proportions from supervised ADMIXTURE with 3 references (CEU, YRI, HGDP)
# mapaym (numeric values from 0 to 1): Native American ancestry proportions from supervised ADMIXTURE with 3 references (CEU, YRI, Mapuche and Aymara grouped together)
# ceu_3z (numeric values from 0 to 1): European ancestry proportions from supervised ADMIXTURE with 3 references (CEU, YRI, Mapuche and Aymara grouped together)
# yri_3z (numeric values from 0 to 1): African ancestry proportions from supervised ADMIXTURE with 3 references (CEU, YRI, Mapuche and Aymara grouped together)
# aym (numeric values from 0 to 1): Aymara ancestry proportions from supervised ADMIXTURE with 4 references (CEU, YRI, Mapuche, Aymara)
# map (numeric values from 0 to 1): Mapuche ancestry proportions from supervised ADMIXTURE with 4 references (CEU, YRI, Mapuche, Aymara)
# ceu_4 (numeric values from 0 to 1): European ancestry proportions from supervised ADMIXTURE with 4 references (CEU, YRI, Mapuche, Aymara)
# yri_4 (numeric values from 0 to 1): African ancestry proportions from supervised ADMIXTURE with 4 references (CEU, YRI, Mapuche, Aymara)
#ancestry vars
ancestry_vars = d %>%
select(hgdp:yri_4) %>%
names()
ancestry_set_1 = c("hgdp", "ceu_3", "yri_3")
ancestry_set_2 = c("mapaym", "ceu_3z", "yri_3z")
ancestry_set_3 = c("aym", "map", "ceu_4", "yri_4")
#regional data
regional = read_excel("data/regional data.xlsx")
Functions
# regression line function for ggpairs
#https://stackoverflow.com/questions/30856602/how-to-add-line-fit-on-ggally-correlation-plot-matrix
lowerfun <- function(data, mapping) {
ggplot(data = data, mapping = mapping)+
geom_point(alpha = .25) +
geom_smooth(method = "loess", formula = y ~ x,
fill = "blue", color = "red", size = 0.5)
}
Analyses
S factor
#these are ordinals
SES_vars = c("income", "education", "social_class")
#cors
d %>%
select(!!SES_vars) %>%
map_df(as.numeric) %>%
mixedCor()
## Warning in polychoric(data[, p], smooth = smooth, global = global, weight =
## weight, : The items do not have an equal number of response alternatives, global
## set to FALSE.
## Warning in matpLower(x, nvar, gminx, gmaxx, gminy, gmaxy): 2 cells were adjusted
## for 0 values using the correction for continuity. Examine your data carefully.
## Call: mixedCor(data = .)
## incom edctn scl_c
## income 1.00
## education 0.45 1.00
## social_class 0.80 0.73 1.00
#factor analysis
s_factor = irt.fa(
x = d %>%
select(!!SES_vars) %>%
map_df(as.numeric),
nfactors = 1
)
## Warning in polychoric(x): The items do not have an equal number of response
## alternatives, global set to FALSE.
## Warning in polychoric(x): 2 cells were adjusted for 0 values using the
## correction for continuity. Examine your data carefully.
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect. Try a
## different factor score estimation method.
## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected. Examine the results carefully
## Warning in sqrt(1 - t$loadings[, i]^2): NaNs produced
## Warning in sqrt(1 - t$loadings^2): NaNs produced
## Warning in irt.fa(x = d %>% select(!!SES_vars) %>% map_df(as.numeric), nfactors = 1): A discrimination with a NaN value was replaced with the maximum discrimination for factor 1 and item(s) 3
## examine the factor analysis object (fa) to identify the Heywood case.
## The item informations are probably suspect as well for this factor.
## You might try a different factor extraction technique.

s_factor
## Item Response Analysis using Factor Analysis
##
## Call: irt.fa(x = d %>% select(!!SES_vars) %>% map_df(as.numeric), nfactors = 1)
## Item Response Analysis using Factor Analysis
##
## Summary information by factor and item
## Factor = 1
## -3 -2 -1 0 1 2 3
## income 0.18 0.38 0.55 0.51 0.30 0.13 0.05
## education 0.05 0.11 0.23 0.37 0.42 0.32 0.18
## social_class 0.07 0.19 0.41 0.65 0.75 0.63 0.36
## Test Info 0.30 0.68 1.18 1.52 1.47 1.07 0.59
## SEM 1.83 1.22 0.92 0.81 0.83 0.96 1.31
## Reliability -2.37 -0.48 0.16 0.34 0.32 0.07 -0.71
##
## Factor analysis with Call: fa(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate,
## fm = fm)
##
## Test of the hypothesis that 1 factor is sufficient.
## The degrees of freedom for the model is 0 and the objective function was 0.11
## The number of observations was 1805 with Chi Square = 194.85 with prob < NA
##
## The root mean square of the residuals (RMSA) is 0.04
## The df corrected root mean square of the residuals is NA
##
## Tucker Lewis Index of factoring reliability = -Inf
s_factor$fa
## Factor Analysis using method = minres
## Call: fa(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate,
## fm = fm)
## Standardized loadings (pattern matrix) based upon correlation matrix
## MR1 h2 u2 com
## income 0.74 0.55 0.449 1
## education 0.68 0.46 0.542 1
## social_class 1.02 1.05 -0.048 1
##
## MR1
## SS loadings 2.06
## Proportion Var 0.69
##
## Mean item complexity = 1
## Test of the hypothesis that 1 factor is sufficient.
##
## The degrees of freedom for the null model are 3 and the objective function was 1.89 with Chi Square of 3400.07
## The degrees of freedom for the model are 0 and the objective function was 0.11
##
## The root mean square of the residuals (RMSR) is 0.04
## The df corrected root mean square of the residuals is NA
##
## The harmonic number of observations is 1805 with the empirical chi square 20.98 with prob < NA
## The total number of observations was 1805 with Likelihood Chi Square = 194.85 with prob < NA
##
## Tucker Lewis Index of factoring reliability = -Inf
## Fit based upon off diagonal values = 1
#mirt
s_factor_mirt = mirt(
data = d %>%
select(!!SES_vars) %>%
map_df(as.numeric),
model = 1,
itemtype = c("nominal", "nominal", "nominal")
)
##
Iteration: 1, Log-Lik: -4621.448, Max-Change: 4.13311
Iteration: 2, Log-Lik: -3225.701, Max-Change: 1.77520
Iteration: 3, Log-Lik: -3116.486, Max-Change: 0.78061
Iteration: 4, Log-Lik: -3072.821, Max-Change: 0.92416
Iteration: 5, Log-Lik: -3051.338, Max-Change: 0.70424
Iteration: 6, Log-Lik: -3040.009, Max-Change: 0.58114
Iteration: 7, Log-Lik: -3033.767, Max-Change: 0.50749
Iteration: 8, Log-Lik: -3030.280, Max-Change: 0.33793
Iteration: 9, Log-Lik: -3027.882, Max-Change: 0.40246
Iteration: 10, Log-Lik: -3024.082, Max-Change: 0.16176
Iteration: 11, Log-Lik: -3022.968, Max-Change: 0.38202
Iteration: 12, Log-Lik: -3022.472, Max-Change: 0.46090
Iteration: 13, Log-Lik: -3021.022, Max-Change: 0.14928
Iteration: 14, Log-Lik: -3020.701, Max-Change: 0.49693
Iteration: 15, Log-Lik: -3020.483, Max-Change: 0.46776
Iteration: 16, Log-Lik: -3019.766, Max-Change: 0.38457
Iteration: 17, Log-Lik: -3019.641, Max-Change: 0.39636
Iteration: 18, Log-Lik: -3019.552, Max-Change: 0.39882
Iteration: 19, Log-Lik: -3019.195, Max-Change: 0.41482
Iteration: 20, Log-Lik: -3019.120, Max-Change: 0.40918
Iteration: 21, Log-Lik: -3019.068, Max-Change: 0.40251
Iteration: 22, Log-Lik: -3018.845, Max-Change: 0.38846
Iteration: 23, Log-Lik: -3018.806, Max-Change: 0.37775
Iteration: 24, Log-Lik: -3018.776, Max-Change: 0.37795
Iteration: 25, Log-Lik: -3018.638, Max-Change: 0.00368
Iteration: 26, Log-Lik: -3018.633, Max-Change: 0.00334
Iteration: 27, Log-Lik: -3018.632, Max-Change: 0.00000
s_factor_mirt@Fit
## $G2
## [1] NaN
##
## $p
## [1] NaN
##
## $TLI
## [1] NaN
##
## $CFI
## [1] NaN
##
## $RMSEA
## [1] NaN
##
## $df
## [1] 21
##
## $AIC
## [1] 6065.264
##
## $AICc
## [1] 6065.499
##
## $BIC
## [1] 6142.241
##
## $SABIC
## [1] 6097.763
##
## $HQ
## [1] 6093.675
##
## $logLik
## [1] -3018.632
##
## $logPrior
## [1] 0
##
## $SElogLik
## [1] 0
##
## $F
## F1
## income 0.7994839
## education 0.5714313
## social_class 0.9915645
##
## $h2
## income education social_class
## 0.6391745 0.3265338 0.9832002
#scores
s_factor_mirt_scores = mirt::fscores(
s_factor_mirt,
full.scores = T,
full.scores.SE = T
)
#add to main
d$S = s_factor_mirt_scores[, 1] %>% standardize()
#unit weighted S factor
d$S_unit = d %>%
select(!!SES_vars) %>%
map_df(as.numeric) %>%
df_standardize() %>%
rowMeans() %>%
standardize()
#relations
d %>%
select(!!SES_vars, S, S_unit) %>%
map_df(as.numeric) %>%
mixedCor()
## Warning in polychoric(data[, p], smooth = smooth, global = global, weight =
## weight, : The items do not have an equal number of response alternatives, global
## set to FALSE.
## Warning in matpLower(x, nvar, gminx, gmaxx, gminy, gmaxy): 2 cells were adjusted
## for 0 values using the correction for continuity. Examine your data carefully.
## Call: mixedCor(data = .)
## incom edctn scl_c S S_unt
## income 1.00
## education 0.45 1.00
## social_class 0.80 0.73 1.00
## S 0.92 0.96 1.00 1.00
## S_unit 0.94 0.96 1.00 0.97 1.00
Descriptive stats
d %>%
select(S, !!ancestry_vars) %>%
describe2()
#ordinals
d$age %>% table2()
d$gender %>% table2()
d$region %>% table2()
Plots
#ancestries among each other
d %>%
select(S, !!ancestry_set_3) %>%
ggpairs(
lower = list(continuous = wrap(lowerfun)),
columnLabels = c("Social status", "Aymara ancestry", "Mapuche ancestry", "European ancestry", "African ancestry")
)

GG_save("figs/ggpairs.png")
#ensure ancestry sets sum to 1
d %>%
select(!!ancestry_set_1) %>%
rowSums() %>%
describe2()
d %>%
select(!!ancestry_set_2) %>%
rowSums() %>%
describe2()
d %>%
select(!!ancestry_set_3) %>%
rowSums() %>%
describe2()
#are ancestry estimates are consistent across sets?
d %>%
select(hgdp, mapaym, aym, map, mapaym2) %>%
wtd.cors()
## hgdp mapaym aym map mapaym2
## hgdp 1.0000000 0.9992532 0.8824527 -0.3326785 0.9804084
## mapaym 0.9992532 1.0000000 0.8788837 -0.3238963 0.9833253
## aym 0.8824527 0.8788837 1.0000000 -0.7356212 0.7791240
## map -0.3326785 -0.3238963 -0.7356212 1.0000000 -0.1485028
## mapaym2 0.9804084 0.9833253 0.7791240 -0.1485028 1.0000000
Regressions
#regression model function
do_reg = function(outcome, ancestries, control_sets, data) {
# browser()
#make formulas
f1 = str_glue("{outcome} ~ {str_c(ancestries, collapse = ' + ')}") %>% as.formula()
f2 = str_glue("{outcome} ~ {str_c(ancestries, collapse = ' + ')} + {str_c(control_sets[[1]], collapse = ' + ')}") %>% as.formula()
f3 = str_glue("{outcome} ~ {str_c(ancestries, collapse = ' + ')} + {str_c(control_sets[[1]], collapse = ' + ')} + {str_c(control_sets[[2]], collapse = ' + ')}") %>% as.formula()
#do regs
regs = list(
simple = ols(f1, data = d),
basic = ols(f2, data = d),
regions = ols(f3, data = d)
)
regs
}
#primary analysis
do_reg(
outcome = "S",
ancestries = ancestry_set_3[-3],
control_sets = list(c("age", "gender"), "region")) %>%
summarize_models(asterisks_only = F, collapse_factors = "region")
#other sets of ancestry variables
do_reg(
outcome = "S",
ancestries = ancestry_set_1[-2],
control_sets = list(c("age", "gender"), "region")) %>%
summarize_models(asterisks_only = F, collapse_factors = "region")
do_reg(
outcome = "S",
ancestries = ancestry_set_2[-2],
control_sets = list(c("age", "gender"), "region")) %>%
summarize_models(asterisks_only = F, collapse_factors = "region")
#manual mapaym
do_reg(
outcome = "S",
ancestries = c("mapaym2", "yri_4"),
control_sets = list(c("age", "gender"), "region")) %>%
summarize_models(asterisks_only = F, collapse_factors = "region")
#individual slopes
list(
Mapuche = ols(S ~ map, data = d),
Aymara = ols(S ~ aym, data = d),
European = ols(S ~ ceu_4, data = d),
African = ols(S ~ yri_4, data = d)
) %>% summarize_models(asterisks_only = F)
Regional
#aggregate to regional
d$region %>% table2()
#merge Rios to Lagos, and Africa to Tarapica
#says so here https://en.wikipedia.org/wiki/Ranked_lists_of_Chilean_regions#By_international_HDI
#the report says the same thing
d$region2 = d$region %>% mapvalues(from = c("Rios", "Arica"), to = c("Lagos", "Tarapaca"))
#compute means
regions_merged = d %>%
plyr::ddply("region2", function(dd) {
tibble(
euro = mean(dd$ceu_4),
aym = mean(dd$aym),
map = mean(dd$map),
african = mean(dd$yri_4),
S = mean(dd$S),
n = nrow(dd),
sqrtn = sqrt(n)
)
}) %>%
full_join(regional, by = c("region2" ="Region2"))
#cors
regions_merged %>%
select(euro:S, HDI, SIMCE, PSU) %>%
ggpairs(
lower = list(continuous = wrap(lowerfun)),
columnLabels = c("European ancestry", "Aymara ancestry", "Mapuche ancestry", "African ancestry", "Social status", "Human development index", "SIMCE", "PSU")
) +
theme(text = element_text(size = 8))

GG_save("figs/regional_cors.png")
#with p values
regions_merged %>%
select(euro:S, HDI, SIMCE, PSU) %>%
cor_matrix(p_val = T)
## euro aym map african S HDI SIMCE
## euro "1" "-0.42" "-0.17" "-0.03" "0.25" "0.16" "0.38"
## aym "-0.42" "1" "-0.82***" "0.77**" "0.09" "0.59" "-0.16"
## map "-0.17" "-0.82***" "1" "-0.85***" "-0.26" "-0.75**" "-0.07"
## african "-0.03" "0.77**" "-0.85***" "1" "0.23" "0.64" "0.02"
## S "0.25" "0.09" "-0.26" "0.23" "1" "0.59" "0.63"
## HDI "0.16" "0.59" "-0.75**" "0.64" "0.59" "1" "0.60"
## SIMCE "0.38" "-0.16" "-0.07" "0.02" "0.63" "0.60" "1"
## PSU "0.69*" "-0.42" "0.02" "-0.05" "0.05" "0.04" "0.42"
## PSU
## euro "0.69*"
## aym "-0.42"
## map "0.02"
## african "-0.05"
## S "0.05"
## HDI "0.04"
## SIMCE "0.42"
## PSU "1"
#regress
ols(HDI ~ aym + map + african, data = regions_merged) %>%
list() %>%
summarize_models()
ols(SIMCE ~ aym + map + african, data = regions_merged) %>%
list() %>%
summarize_models()
ols(PSU ~ aym + map + african, data = regions_merged) %>%
list() %>%
summarize_models()
#weights
#with p values
regions_merged %>%
select(euro:S, SIMCE:HDI) %>%
cor_matrix(asterisks_only = F, p_val = T, weights = regions_merged$sqrtn)
## euro aym map
## euro "1" "-0.68%a [p=0.011]" "0.23%a [p=0.447]"
## aym "-0.68%a [p=0.011]" "1" "-0.87%a [p=<0.001***]"
## map "0.23%a [p=0.447]" "-0.87%a [p=<0.001***]" "1"
## african "-0.27%a [p=0.379]" "0.78%a [p=0.002**]" "-0.88%a [p=<0.001***]"
## S "0.09%a [p=0.771]" "0.34%a [p=0.258]" "-0.51%a [p=0.076]"
## SIMCE "0.59%a [p=0.033]" "-0.31%a [p=0.306]" "0.01%a [p=0.969]"
## HDI "-0.03%a [p=0.923]" "0.63%a [p=0.022]" "-0.81%a [p=<0.001***]"
## african S SIMCE
## euro "-0.27%a [p=0.379]" "0.09%a [p=0.771]" "0.59%a [p=0.033]"
## aym "0.78%a [p=0.002**]" "0.34%a [p=0.258]" "-0.31%a [p=0.306]"
## map "-0.88%a [p=<0.001***]" "-0.51%a [p=0.076]" "0.01%a [p=0.969]"
## african "1" "0.46%a [p=0.112]" "-0.01%a [p=0.984]"
## S "0.46%a [p=0.112]" "1" "0.41%a [p=0.165]"
## SIMCE "-0.01%a [p=0.984]" "0.41%a [p=0.165]" "1"
## HDI "0.73%a [p=0.004**]" "0.70%a [p=0.007*]" "0.49%a [p=0.093]"
## HDI
## euro "-0.03%a [p=0.923]"
## aym "0.63%a [p=0.022]"
## map "-0.81%a [p=<0.001***]"
## african "0.73%a [p=0.004**]"
## S "0.70%a [p=0.007*]"
## SIMCE "0.49%a [p=0.093]"
## HDI "1"
#regress
ols(HDI ~ aym + map + african, data = regions_merged, weights = regions_merged$sqrtn)
## Linear Regression Model
##
## ols(formula = HDI ~ aym + map + african, data = regions_merged,
## weights = regions_merged$sqrtn)
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 13 LR chi2 15.03 R2 0.685
## sigma0.0758 d.f. 3 R2 adj 0.580
## d.f. 9 Pr(> chi2) 0.0018 g 0.038
##
## Residuals
##
## Min 1Q Median 3Q Max
## -0.0499846 -0.0133877 -0.0005058 0.0072193 0.0443796
##
##
## Coef S.E. t Pr(>|t|)
## Intercept 1.0313 0.1409 7.32 <0.0001
## aym -0.1305 0.1443 -0.90 0.3892
## map -0.4612 0.2301 -2.00 0.0760
## african 0.4734 1.7850 0.27 0.7968
##