This is an R notebook for the study (insert final name).
This script loads packages, data, merges data and does some initial filtering.
# libraries ---------------------------------------------------------------
#here we load the libraries we need
library(pacman)
p_load(plyr, psych, weights, tidyverse, forcats, magrittr, kirkegaard, gridExtra)
options(digits = 2, width = 600)
#default ggsave settings
ggsave2 = purrr::partial(ggsave, width = 10, height = 6)
#controls
#excludes persons without English as first language
exclude_nonenglish_first_language = F
# data --------------------------------------------------------------------
#here we load the data we need
d_ping_behavior = read_csv("data/PING_Behavior.csv")
## Parsed with column specification:
## cols(
## .default = col_integer(),
## SubjID = col_character(),
## Age_At_NPExam = col_double(),
## Age_At_IMGExam = col_double(),
## Age_At_PhenX_Completion = col_double(),
## Gender = col_character(),
## FDH_Respondent_Relation = col_character(),
## FDH_Guardian_1_Relation = col_character(),
## FDH_Guardian_2_Relation = col_character(),
## FDH_1_Hispanic_or_Latino = col_character(),
## FDH_2_Pacific_Islander = col_character(),
## FDH_2_Asian = col_character(),
## FDH_2_African_American = col_character(),
## FDH_2_American_Indian = col_character(),
## FDH_2_White = col_character(),
## FDH_5_Birth_By = col_character(),
## FDH_8_Birth_Weight_lbs = col_double(),
## FDH_8_Birth_Length = col_double(),
## FDH_8_Birth_Head_Circumference = col_double(),
## FDH_9_APGAR_Score_5_Minutes = col_double(),
## FDH_11_Pregnancy_Medications = col_character()
## # ... with 545 more columns
## )
## See spec(...) for full column specifications.
## Warning in rbind(names(probs), probs_f): number of columns of result is not a multiple of vector length (arg 1)
## Warning: 62 parsing failures.
## row # A tibble: 5 x 5 col row col expected actual file expected <int> <chr> <chr> <chr> <chr> actual 1 1093 TBX_dccs_pst_rt no trailing characters .5 'data/PING_Behavior.csv' file 2 1093 TBX_dccs_d_rt no trailing characters .5 'data/PING_Behavior.csv' row 3 1123 TBX_dccs_d_rt no trailing characters .5 'data/PING_Behavior.csv' col 4 1167 TBX_dccs_pst_rt no trailing characters .5 'data/PING_Behavior.csv' expected 5 1167 TBX_dccs_d_rt no trailing characters .5 'data/PING_Behavior.csv'
## ... ................. ... .............................................................................. ........ .............................................................................. ...... .............................................................................. .... .............................................................................. ... .............................................................................. ... .............................................................................. ........ ..............................................................................
## See problems(...) for more details.
d_ping_mri = read_csv("data/PING_MRI_data.csv")
## Parsed with column specification:
## cols(
## .default = col_integer(),
## ID = col_character(),
## Site = col_character(),
## Age = col_double(),
## Sex = col_character(),
## Manufacturer = col_character(),
## ManufacturersModelName = col_character(),
## DeviceSerialNumber = col_character(),
## europe = col_double(),
## africa = col_double(),
## amerind = col_double(),
## eastAsia = col_double(),
## oceania = col_double(),
## centralAsia = col_double(),
## ED = col_double(),
## vol.IntracranialVolume = col_double()
## )
## See spec(...) for full column specifications.
# merge data --------------------------------------------------------------
#here we merge the two datasets: one with behavioral data, one with MRI + genetic data.
#we do this by matching up the rows by the IDs
#we skip age from one dataset to avoid duplication
d_ping = full_join(d_ping_mri %>% dplyr::select(-Age), d_ping_behavior,by = c("ID" = "SubjID"))
This code recodes some variables, primarily the SIRE ones.
### RECODE AND RENAME VARIABLES
# exlude persons without any genomic data ---------------------------------
#only small number of people
d_ping %<>% filter(!is.na(GAF_africa))
# non-english first language ----------------------------------------------
#rename
d_ping$English_first_language = d_ping$FDH_21_English_First_Lang == "Yes"
#fill NA to T
d_ping$English_first_language %<>% mapvalues(NA, T)
d_ping$EFL = d_ping$English_first_language
#table
table2(d_ping$English_first_language)
## # A tibble: 3 x 3
## Group Count Percent
## <chr> <dbl> <dbl>
## 1 TRUE 1237 88.9
## 2 FALSE 154 11.1
## 3 <NA> 0 0
count_NA(d_ping$English_first_language, T)
## [1] 1391
#exclude
if (exclude_nonenglish_first_language) {
d_ping %<>% dplyr::filter(!English_first_language)
}
# rename ------------------------------------------------------------------
#cognitive tests
d_ping$IBAM = d_ping$TBX_ibam_scr
d_ping$list_sort = d_ping$TBX_ls
d_ping$pattern_recog = d_ping$TBX_pspac_scr
d_ping$vocab = d_ping$TBX_VOCAB_THETA
d_ping$reading = d_ping$TBX_reading_score
d_ping$flanker = d_ping$TBX_flanker_score
d_ping$card_sort = d_ping$TBX_dccs_score
#vector
v_cog = c("IBAM", "list_sort", "pattern_recog", "vocab", "reading", "flanker", "card_sort")
#we dont like attention
#S
d_ping$occupation_1 = d_ping$FDH_Guardian_1_Ocup
d_ping$occupation_2 = d_ping$FDH_Guardian_2_Ocup
d_ping$education_1 = d_ping$FDH_Guardian_1_Edu
d_ping$education_2 = d_ping$FDH_Guardian_2_Edu
d_ping$hh_income = d_ping$FDH_3_Household_Income
v_S = c("occupation_1", "occupation_2", "education_1", "education_2", "hh_income")
#sensible ancestry names
d_ping$European = d_ping$europe
d_ping$African = d_ping$africa
d_ping$Amerindian = d_ping$amerind
d_ping$East_Asian = d_ping$eastAsia
d_ping$Oceanian = d_ping$oceania
d_ping$Central_Asian = d_ping$centralAsia
#genomics
v_genomic = c("European", "African", "Amerindian", "East_Asian", "Oceanian", "Central_Asian")
# recode ------------------------------------------------------------------
#recode N sex to NA
d_ping$Sex[!d_ping$Sex %in% c("M", "F")] = NA
# rename SIRE -------------------------------------------------------------
#which are the original SIRE variables?
(v_SIRE_orig = str_detect2(names(d_ping), pattern = "FDH_[12]_", value = T))
## [1] "FDH_1_Hispanic_or_Latino" "FDH_2_Pacific_Islander" "FDH_2_Asian" "FDH_2_African_American" "FDH_2_American_Indian" "FDH_2_White"
#new names
v_SIREs = c("Hispanic", "Pacific_Islander", "Asian", "African_American", "American_Indian", "White")
#better names
d_ping$Hispanic = d_ping$FDH_1_Hispanic_or_Latino == "Yes"
d_ping$Pacific_Islander = d_ping$FDH_2_Pacific_Islander == "Yes"
d_ping$Asian = d_ping$FDH_2_Asian == "Yes"
d_ping$African_American = d_ping$FDH_2_African_American == "Yes"
d_ping$American_Indian = d_ping$FDH_2_American_Indian == "Yes"
d_ping$White = d_ping$FDH_2_White == "Yes"
#fill with FALSE
false_filler = function(x) {
x[is.na(x)] = F
x
}
d_ping$Hispanic %<>% false_filler
d_ping$Pacific_Islander %<>% false_filler
d_ping$Asian %<>% false_filler
d_ping$African_American %<>% false_filler
d_ping$American_Indian %<>% false_filler
d_ping$White %<>% false_filler
#fill in Other
d_ping$Other = d_ping[v_SIREs] %>% rowSums %>% `==`(0)
#add Other SIRE, but not last!
#adding it last causes it to be the reference class/group
v_SIREs = c("Hispanic", "Pacific_Islander", "Asian", "African_American", "American_Indian", "Other", "White")
#frequency
colMeans(d_ping[v_SIREs], na.rm = T)
## Hispanic Pacific_Islander Asian African_American American_Indian Other White
## 0.236 0.090 0.219 0.164 0.047 0.017 0.656
colMeans(d_ping[v_SIREs], na.rm = T) %>% sum
## [1] 1.4
#how much missing data?
d_ping[v_SIREs] %>% miss_amount()
## cases with missing data vars with missing data cells with missing data
## 0 0 0
# SIRE, standard ------------------------------------------------------------------
#recode SIRE
d_ping$SIRE_standard = map_chr(1:nrow(d_ping), .f = function(row) {
#extract SIREs
SIREs = d_ping[row, v_SIREs] %>% unlist
#Hispanic?
if (SIREs[1]) return(names(SIREs)[1])
#only one?
if (sum(SIREs) == 1) {
#then return the name of it
return(names(SIREs)[which(SIREs)])
} else if (sum(SIREs) == 0) {
#did not select any
return("Other")
} else {
#more than one, return multi
return("Multi_ethnic")
#return combo
return(str_c(names(SIREs)[which(SIREs)], collapse = " "))
}
})
#check results
table2(d_ping$SIRE_standard, include_NA = F) %>%
write_clipboard()
## Group Count Percent
## 1 White 571.00 41.05
## 2 Hispanic 328.00 23.58
## 3 Multi_ethnic 186.00 13.37
## 4 African_American 140.00 10.06
## 5 Asian 122.00 8.77
## 6 Other 24.00 1.73
## 7 Pacific_Islander 16.00 1.15
## 8 American_Indian 4.00 0.29
#as factor
#factors are R's built in variable type for handling categorical data
d_ping$SIRE_standard %<>% factor()
#we want White to be in front
d_ping$SIRE_standard %<>% fct_relevel("White")
#verify
assert_that(levels(d_ping$SIRE_standard)[1] == "White")
## [1] TRUE
# SIRE, combinations ------------------------------------------
d_ping$SIRE_common = map_chr(1:nrow(d_ping), .f = function(row) {
#extract SIREs
SIREs = d_ping[row, v_SIREs] %>% unlist
#only one?
if (sum(SIREs) == 1) {
#then return the name of it
return(names(SIREs)[which(SIREs)])
} else if (sum(SIREs) == 0) {
#did not select any
return("Other")
} else {
#return combo
return(str_c(names(SIREs)[which(SIREs)], collapse = ", "))
}
})
#table
table2(d_ping$SIRE_common, include_NA = F) %>%
write_clipboard()
## Group Count Percent
## 1 White 571.00 41.05
## 2 African_American 140.00 10.06
## 3 Hispanic, White 140.00 10.06
## 4 Asian 122.00 8.77
## 5 Hispanic 71.00 5.10
## 6 Asian, White 60.00 4.31
## 7 Pacific_Islander, Asian, White 32.00 2.30
## 8 Hispanic, African_American 29.00 2.08
## 9 African_American, White 25.00 1.80
## 10 Other 24.00 1.73
## 11 Pacific_Islander, Asian 17.00 1.22
## 12 Pacific_Islander 16.00 1.15
## 13 Hispanic, American_Indian, White 13.00 0.93
## 14 Hispanic, Asian 13.00 0.93
## 15 Hispanic, Pacific_Islander, Asian, White 13.00 0.93
## 16 American_Indian, White 11.00 0.79
## 17 Hispanic, Pacific_Islander, Asian 11.00 0.79
## 18 Pacific_Islander, White 9.00 0.65
## 19 Hispanic, American_Indian 7.00 0.50
## 20 Hispanic, Asian, White 6.00 0.43
## 21 African_American, American_Indian 5.00 0.36
## 22 Asian, American_Indian, White 5.00 0.36
## 23 Hispanic, Pacific_Islander 5.00 0.36
## 24 Pacific_Islander, Asian, American_Indian, White 5.00 0.36
## 25 American_Indian 4.00 0.29
## 26 Asian, African_American 4.00 0.29
## 27 Hispanic, African_American, American_Indian 4.00 0.29
## 28 African_American, American_Indian, White 3.00 0.22
## 29 Pacific_Islander, African_American, White 3.00 0.22
## 30 Pacific_Islander, Asian, African_American, White 3.00 0.22
## 31 Asian, African_American, White 2.00 0.14
## 32 Hispanic, African_American, American_Indian, White 2.00 0.14
## 33 Hispanic, Asian, American_Indian, White 2.00 0.14
## 34 Hispanic, Pacific_Islander, Asian, African_American 2.00 0.14
## 35 Hispanic, Pacific_Islander, Asian, African_American, White 2.00 0.14
## 36 Hispanic, Pacific_Islander, White 2.00 0.14
## 37 Hispanic, African_American, White 1.00 0.07
## 38 Hispanic, Asian, African_American, White 1.00 0.07
## 39 Hispanic, Asian, American_Indian 1.00 0.07
## 40 Hispanic, Pacific_Islander, African_American, White 1.00 0.07
## 41 Hispanic, Pacific_Islander, American_Indian 1.00 0.07
## 42 Hispanic, Pacific_Islander, Asian, American_Indian 1.00 0.07
## 43 Pacific_Islander, Asian, African_American 1.00 0.07
## 44 Pacific_Islander, Asian, American_Indian 1.00 0.07
#recode uncommon
d_ping$SIRE_common = fct_lump(d_ping$SIRE_common, n = 8)
#set White as the first
d_ping$SIRE_common %<>% fct_relevel("White")
#verify
assert_that(levels(d_ping$SIRE_common)[1] == "White")
## [1] TRUE
# recode SIRE - continuous ------------------------------------------------
#here we recode the variable variables as continuous variables
d_SIRE_con = adply(d_ping, .margins = 1, .expand = F, .fun = function(row) {
#extract SIREs
SIREs = row[v_SIREs] %>% unlist
#numeric
SIREs = SIREs %>% as.numeric
#sum of SIRE
SIRE_sum = sum(SIREs)
#proportion SIREs
res = SIREs / SIRE_sum
#names
names(res) = v_SIREs + "_c"
#output
res
})
#merge
d_ping = cbind(d_ping, d_SIRE_con[-1])
#vector for use
v_SIREs_con = v_SIREs + "_c"
# largest ancestry --------------------------------------------------------
#function to assign largest ancestry
ancestry_assigner = function(data = d_ping, threshold = 0) {
aaply(df_subset_by_pattern(d_ping, pattern = "GAF_"), .margins = 1, .fun = function(block) {
#vector
vect = unlist(block)
#no data?
if (all(is.na(vect))) return(NA)
#get rid of prefix
names(vect) %<>% str_replace(pattern = "GAF_", replacement = "")
#equal sized largest?
if(sum(max(vect) == vect) > 1) browser()
#threshold met?
if (max(vect) < threshold) return("Mixed")
#return largest ancestry name
names(which.max(vect))
}, .expand = F)
}
#assign
d_ping$largest_ancestry = ancestry_assigner() #default
d_ping$largest_ancestry_twothirds = ancestry_assigner(threshold = 2/3)
d_ping$largest_ancestry_half = ancestry_assigner(threshold = .5)
# age and sex -------------------------------------------------------------
#attempt to fill in missing age
sum(is.na(d_ping$Age_At_NPExam) & !is.na(d_ping$Age_At_IMGExam))
## [1] 3
sum(is.na(d_ping$Age_At_NPExam) & !is.na(d_ping$Age_At_PhenX_Completion))
## [1] 0
#we can only fill in 7
#we are also using the wrong variable
d_ping$Age = map2_dbl(d_ping$Age_At_NPExam, d_ping$Age_At_IMGExam, function(x, y) {
if (is.na(x)) return(y)
x
})
#plots
GG_denhist(d_ping, "Age")
## Warning: Removed 5 rows containing non-finite values (stat_bin).
## Warning: Removed 5 rows containing non-finite values (stat_density).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 5 rows containing non-finite values (stat_bin).
## Warning: Removed 5 rows containing non-finite values (stat_density).
ggsave("figures/age_dist.png")
## Saving 7 x 5 in image
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 5 rows containing non-finite values (stat_bin).
## Warning: Removed 5 rows containing non-finite values (stat_density).
GG_denhist(d_ping, var = "Age", group = "Gender")
## Warning in GG_denhist(d_ping, var = "Age", group = "Gender"): There were groups without any data. These were removed
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggsave("figures/age_gender.png")
## Saving 7 x 5 in image
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#numeric
d_ping$Age %>% psych::describe()
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 1386 12 5 11 12 5.9 3 21 18 0.18 -1.1 0.13
Impute missing data, factor analyze, score.
### COGNITIVE DATA
# function to control ----------------------------------------------------------------
#make a function that controls each variable for sex then for age
controller = function(data, gender=T, age=T) {
for (var in names(data)) {
if (gender) {
.model = var + " ~ Gender"
fit = lm(.model, data = cbind(data, Gender = d_ping$Gender), na.action = na.exclude)
data[[var]] = resid(fit)
}
if (age) {
.model = var + " ~ Age"
fit = loess(.model, data = cbind(data, Age = d_ping$Age), na.action = na.exclude)
data[[var]] = resid(fit)
}
}
data
}
# subset ------------------------------------------------------------------
d_cog = d_ping[v_cog]
# missing -----------------------------------------------------------------
#examine
miss_plot(d_cog)
miss_by_var(d_cog)
## IBAM list_sort pattern_recog vocab reading flanker card_sort
## 20 32 21 24 40 41 169
miss_amount(d_cog)
## cases with missing data vars with missing data cells with missing data
## 0.140 1.000 0.036
#impute
d_cog = miss_impute(d_cog)
#complete cases after imputation
d_cog %>% na.omit() %>% nrow()
## [1] 1370
# age and sex control -------------------------------------------------------------
#adjust for age and sex
d_cog = controller(d_cog)
#sample sizes
count.pairwise(d_cog)
## IBAM list_sort pattern_recog vocab reading flanker card_sort
## IBAM 1370 1369 1370 1370 1369 1369 1369
## list_sort 1369 1369 1369 1369 1369 1369 1369
## pattern_recog 1370 1369 1370 1370 1369 1369 1369
## vocab 1370 1369 1370 1370 1369 1369 1369
## reading 1369 1369 1369 1369 1369 1369 1369
## flanker 1369 1369 1369 1369 1369 1370 1370
## card_sort 1369 1369 1369 1369 1369 1370 1370
#cors
wtd.cors(d_cog)
## IBAM list_sort pattern_recog vocab reading flanker card_sort
## IBAM 1.000 0.34 0.13 0.22 0.20 0.097 0.16
## list_sort 0.337 1.00 0.18 0.34 0.31 0.218 0.30
## pattern_recog 0.130 0.18 1.00 0.13 0.13 0.290 0.32
## vocab 0.222 0.34 0.13 1.00 0.56 0.172 0.22
## reading 0.204 0.31 0.13 0.56 1.00 0.174 0.21
## flanker 0.097 0.22 0.29 0.17 0.17 1.000 0.52
## card_sort 0.159 0.30 0.32 0.22 0.21 0.519 1.00
#factor analysis
(fa = fa(d_cog))
## Factor Analysis using method = minres
## Call: fa(r = d_cog)
## Standardized loadings (pattern matrix) based upon correlation matrix
## MR1 h2 u2 com
## IBAM 0.37 0.14 0.86 1
## list_sort 0.57 0.32 0.68 1
## pattern_recog 0.38 0.14 0.86 1
## vocab 0.57 0.33 0.67 1
## reading 0.55 0.30 0.70 1
## flanker 0.49 0.24 0.76 1
## card_sort 0.57 0.33 0.67 1
##
## MR1
## SS loadings 1.79
## Proportion Var 0.26
##
## Mean item complexity = 1
## Test of the hypothesis that 1 factor is sufficient.
##
## The degrees of freedom for the null model are 21 and the objective function was 1.2 with Chi Square of 1719
## The degrees of freedom for the model are 14 and the objective function was 0.39
##
## The root mean square of the residuals (RMSR) is 0.1
## The df corrected root mean square of the residuals is 0.13
##
## The harmonic number of observations is 1369 with the empirical chi square 618 with prob < 8.3e-123
## The total number of observations was 1391 with Likelihood Chi Square = 534 with prob < 5.6e-105
##
## Tucker Lewis Index of factoring reliability = 0.54
## RMSEA index = 0.16 and the 90 % confidence intervals are 0.15 0.17
## BIC = 433
## Fit based upon off diagonal values = 0.86
## Measures of factor score adequacy
## MR1
## Correlation of (regression) scores with factors 0.85
## Multiple R square of scores with factors 0.72
## Minimum correlation of possible factor scores 0.43
#save scores
d_ping$CA = fa$scores %>% as.vector() %>% standardize()
# only age control --------------------------------------------------------
d_cog2 = controller(d_cog, gender = F)
wtd.cors(d_cog2)
## IBAM list_sort pattern_recog vocab reading flanker card_sort
## IBAM 1.000 0.34 0.13 0.22 0.20 0.099 0.16
## list_sort 0.337 1.00 0.18 0.34 0.31 0.218 0.30
## pattern_recog 0.132 0.18 1.00 0.13 0.14 0.288 0.31
## vocab 0.222 0.34 0.13 1.00 0.56 0.172 0.22
## reading 0.203 0.31 0.14 0.56 1.00 0.177 0.21
## flanker 0.099 0.22 0.29 0.17 0.18 1.000 0.52
## card_sort 0.160 0.30 0.31 0.22 0.21 0.518 1.00
#FA
(fa2 = fa(d_cog2))
## Factor Analysis using method = minres
## Call: fa(r = d_cog2)
## Standardized loadings (pattern matrix) based upon correlation matrix
## MR1 h2 u2 com
## IBAM 0.37 0.14 0.86 1
## list_sort 0.57 0.32 0.68 1
## pattern_recog 0.37 0.14 0.86 1
## vocab 0.57 0.33 0.67 1
## reading 0.55 0.31 0.69 1
## flanker 0.49 0.24 0.76 1
## card_sort 0.57 0.33 0.67 1
##
## MR1
## SS loadings 1.80
## Proportion Var 0.26
##
## Mean item complexity = 1
## Test of the hypothesis that 1 factor is sufficient.
##
## The degrees of freedom for the null model are 21 and the objective function was 1.2 with Chi Square of 1717
## The degrees of freedom for the model are 14 and the objective function was 0.38
##
## The root mean square of the residuals (RMSR) is 0.1
## The df corrected root mean square of the residuals is 0.13
##
## The harmonic number of observations is 1369 with the empirical chi square 608 with prob < 1.1e-120
## The total number of observations was 1391 with Likelihood Chi Square = 528 with prob < 1.1e-103
##
## Tucker Lewis Index of factoring reliability = 0.55
## RMSEA index = 0.16 and the 90 % confidence intervals are 0.15 0.17
## BIC = 427
## Fit based upon off diagonal values = 0.86
## Measures of factor score adequacy
## MR1
## Correlation of (regression) scores with factors 0.85
## Multiple R square of scores with factors 0.72
## Minimum correlation of possible factor scores 0.43
#save scores
d_ping$CA2 = fa2$scores %>% as.vector() %>% standardize()
# compare loadings -----------------------------------------------------------------
fa_plot_loadings(list(age_gender_controlled = fa,
age_controlled = fa2))
# factor analyze by SIRE --------------------------------------------------
#split into subset for largest SIRE groups
d_cog2_SIRE = list(
White = d_cog2[d_ping$SIRE_standard == "White", ],
African_American = d_cog2[d_ping$SIRE_standard == "African_American", ],
Hispanic = d_cog2[d_ping$SIRE_standard == "Hispanic", ],
Multi_ethnic = d_cog2[d_ping$SIRE_standard == "Multi_ethnic", ],
Asian = d_cog2[d_ping$SIRE_standard == "Asian", ]
)
#factor analyze each
d_cog2_SIRE_fa = map(d_cog2_SIRE, ~fa(.))
#extract loadings
fa_congruence_matrix(d_cog2_SIRE_fa) %>%
set_colnames(names(d_cog2_SIRE)) %>%
set_rownames(names(d_cog2_SIRE))
## White African_American Hispanic Multi_ethnic Asian
## White 1.00 0.95 0.96 0.98 0.97
## African_American 0.95 1.00 0.99 0.96 0.87
## Hispanic 0.96 0.99 1.00 0.98 0.91
## Multi_ethnic 0.98 0.96 0.98 1.00 0.97
## Asian 0.97 0.87 0.91 0.97 1.00
Impute missing data, factor analyze, score.
### S FACTOR
# missing ------------------------------------------------------------------
d_S = d_ping[v_S]
#how much missing?
miss_amount(d_S)
## cases with missing data vars with missing data cells with missing data
## 0.5 1.0 0.2
#intercors
wtd.cors(d_S) %>% MAT_half() %>% psych::describe()
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 10 0.55 0.07 0.55 0.55 0.05 0.43 0.66 0.23 -0.19 -1.1 0.02
#view distribution
miss_plot(d_S)
#impute all cases with any data
d_S = miss_impute(d_S, max_na = 4)
#without imputed numbers
d_ping[c(v_S, "CA")] %>% wtd.cors()
## occupation_1 occupation_2 education_1 education_2 hh_income CA
## occupation_1 1.00 0.53 0.64 0.43 0.57 0.32
## occupation_2 0.53 1.00 0.46 0.66 0.59 0.34
## education_1 0.64 0.46 1.00 0.57 0.54 0.34
## education_2 0.43 0.66 0.57 1.00 0.53 0.36
## hh_income 0.57 0.59 0.54 0.53 1.00 0.35
## CA 0.32 0.34 0.34 0.36 0.35 1.00
# cors --------------------------------------------------------------------
wtd.cors(cbind(d_S, d_ping$CA))
## occupation_1 occupation_2 education_1 education_2 hh_income d_ping$CA
## occupation_1 1.00 0.66 0.66 0.51 0.60 0.32
## occupation_2 0.66 1.00 0.53 0.74 0.73 0.36
## education_1 0.66 0.53 1.00 0.65 0.56 0.34
## education_2 0.51 0.74 0.65 1.00 0.63 0.38
## hh_income 0.60 0.73 0.56 0.63 1.00 0.35
## d_ping$CA 0.32 0.36 0.34 0.38 0.35 1.00
wtd.cors(d_S) %>% MAT_half() %>% averages()
## arithmetic geometric harmonic mode median trimmed midrange
## 0.63 0.62 0.62 0.66 0.64 0.63 0.63
# FA ----------------------------------------------------------------------
(fa_S = fa(d_S))
## Factor Analysis using method = minres
## Call: fa(r = d_S)
## Standardized loadings (pattern matrix) based upon correlation matrix
## MR1 h2 u2 com
## occupation_1 0.76 0.57 0.43 1
## occupation_2 0.86 0.74 0.26 1
## education_1 0.74 0.54 0.46 1
## education_2 0.80 0.65 0.35 1
## hh_income 0.80 0.64 0.36 1
##
## MR1
## SS loadings 3.14
## Proportion Var 0.63
##
## Mean item complexity = 1
## Test of the hypothesis that 1 factor is sufficient.
##
## The degrees of freedom for the null model are 10 and the objective function was 3.1 with Chi Square of 4300
## The degrees of freedom for the model are 5 and the objective function was 0.36
##
## The root mean square of the residuals (RMSR) is 0.06
## The df corrected root mean square of the residuals is 0.09
##
## The harmonic number of observations is 1380 with the empirical chi square 109 with prob < 6.8e-22
## The total number of observations was 1391 with Likelihood Chi Square = 493 with prob < 2.1e-104
##
## Tucker Lewis Index of factoring reliability = 0.77
## RMSEA index = 0.27 and the 90 % confidence intervals are 0.25 0.28
## BIC = 457
## Fit based upon off diagonal values = 0.99
## Measures of factor score adequacy
## MR1
## Correlation of (regression) scores with factors 0.95
## Multiple R square of scores with factors 0.90
## Minimum correlation of possible factor scores 0.80
#scores
d_ping$S = fa_S$scores %>% as.vector() %>% standardize()
Some analyses of ancestry and SIRE.
### ancestry analysis
# cross-tabulate ----------------------------------------------------------
#simple analyses
#binary measures of congruence between genetic ancestry and SIRE
table(d_ping$SIRE_standard, d_ping$largest_ancestry) %>%
prop.table(margin = 1) %>%
unclass() %>%
write_clipboard()
## Africa Amerind CentralAsia EastAsia Europe
## White 0.00 0.00 0.00 0.00 1.00
## African American 0.99 0.00 0.00 0.00 0.01
## American Indian 0.25 0.25 0.00 0.00 0.50
## Asian 0.00 0.00 0.13 0.83 0.04
## Hispanic 0.09 0.07 0.00 0.10 0.73
## Multi ethnic 0.10 0.00 0.02 0.39 0.49
## Other 0.21 0.00 0.17 0.00 0.62
## Pacific Islander 0.00 0.00 0.00 0.94 0.06
table(d_ping$SIRE_standard, d_ping$largest_ancestry_twothirds) %>%
prop.table(margin = 1) %>%
unclass() %>%
write_clipboard()
## Africa Amerind CentralAsia EastAsia Europe Mixed
## White 0.00 0.00 0.00 0.00 0.99 0.01
## African American 0.88 0.00 0.00 0.00 0.00 0.12
## American Indian 0.25 0.00 0.00 0.00 0.50 0.25
## Asian 0.00 0.00 0.13 0.77 0.02 0.08
## Hispanic 0.03 0.01 0.00 0.05 0.36 0.55
## Multi ethnic 0.03 0.00 0.01 0.17 0.20 0.59
## Other 0.21 0.00 0.17 0.00 0.46 0.17
## Pacific Islander 0.00 0.00 0.00 0.75 0.00 0.25
table(d_ping$SIRE_standard, d_ping$largest_ancestry_half) %>%
prop.table(margin = 1) %>%
unclass() %>%
write_clipboard()
## Africa Amerind CentralAsia EastAsia Europe Mixed
## White 0.00 0.00 0.00 0.00 1.00 0.00
## African American 0.98 0.00 0.00 0.00 0.01 0.01
## American Indian 0.25 0.25 0.00 0.00 0.50 0.00
## Asian 0.00 0.00 0.13 0.81 0.04 0.02
## Hispanic 0.08 0.05 0.00 0.06 0.59 0.22
## Multi ethnic 0.06 0.00 0.02 0.35 0.44 0.12
## Other 0.21 0.00 0.17 0.00 0.54 0.08
## Pacific Islander 0.00 0.00 0.00 0.94 0.00 0.06
# mean ancestry by SIRE ---------------------------------------------------
ddply(d_ping, .variables = "SIRE_standard", .fun = function(block) {
#select
tmp = block[v_genomic]
#mean
c(n = count_NA(tmp[[1]], reverse = T), colMeans(tmp, na.rm=T))
}) %>%
write_clipboard()
## SIRE standard N European African Amerindian East Asian Oceanian Central Asian
## 1 White 571.00 0.97 0.00 0.01 0.00 0.00 0.01
## 2 African_American 140.00 0.17 0.81 0.00 0.00 0.00 0.01
## 3 American_Indian 4.00 0.66 0.21 0.13 0.00 0.01 0.00
## 4 Asian 122.00 0.06 0.00 0.00 0.80 0.01 0.13
## 5 Hispanic 328.00 0.57 0.14 0.19 0.09 0.01 0.01
## 6 Multi_ethnic 186.00 0.45 0.11 0.01 0.37 0.03 0.03
## 7 Other 24.00 0.55 0.26 0.01 0.01 0.00 0.18
## 8 Pacific_Islander 16.00 0.11 0.01 0.00 0.70 0.17 0.00
# SD ----------------------------------------------------------------------
ddply(d_ping, .variables = "SIRE_standard", .fun = function(block) {
#select
tmp = block[v_genomic]
#summarize
c(n = count_NA(tmp[[1]], reverse = T), sapply(tmp, sd, na.rm=T))
}) %>%
write_clipboard()
## SIRE standard N European African Amerindian East Asian Oceanian Central Asian
## 1 White 571.00 0.07 0.04 0.02 0.04 0.00 0.04
## 2 African_American 140.00 0.11 0.12 0.01 0.03 0.01 0.02
## 3 American_Indian 4.00 0.42 0.42 0.25 0.00 0.01 0.00
## 4 Asian 122.00 0.17 0.02 0.01 0.36 0.01 0.32
## 5 Hispanic 328.00 0.23 0.20 0.17 0.20 0.02 0.03
## 6 Multi_ethnic 186.00 0.27 0.21 0.02 0.29 0.05 0.12
## 7 Other 24.00 0.36 0.33 0.03 0.02 0.01 0.32
## 8 Pacific_Islander 16.00 0.15 0.04 0.00 0.13 0.06 0.00
# correlations between SIRE continuous and genomic ancestry ---------------
wtd.cors(d_ping[c(v_SIREs_con, v_genomic)]) %>%
#change order manually so they match up as well as possible
.[v_SIREs_con[c(7, 4, 1, 3, 2, 5, 6)], v_genomic] %>%
write_clipboard()
## European African Amerindian East Asian Oceanian Central Asian
## White c 0.86 -0.50 -0.23 -0.44 -0.22 -0.13
## African American c -0.45 0.92 -0.14 -0.18 -0.08 -0.06
## Hispanic c -0.08 0.00 0.75 -0.15 -0.06 -0.08
## Asian c -0.56 -0.20 -0.16 0.81 0.09 0.27
## Pacific Islander c -0.28 -0.10 -0.07 0.39 0.81 -0.03
## American Indian c 0.00 0.03 0.09 -0.04 -0.01 -0.04
## Other c -0.03 0.06 -0.04 -0.06 -0.03 0.16
#SIREc sums
d_ping[v_SIREs_con] %>% colSums(na.rm=T)
## Hispanic_c Pacific_Islander_c Asian_c African_American_c American_Indian_c Other_c White_c
## 187 55 196 179 27 24 723
# overall ancestry summary stats ---------------------------------------------------
psych::describe(d_ping[v_genomic])
## vars n mean sd median trimmed mad min max range skew kurtosis se
## European 1 1391 0.63 0.37 0.73 0.66 0.4 0 1.00 1.00 -0.46 -1.3 0.01
## African 2 1391 0.14 0.27 0.00 0.06 0.0 0 1.00 1.00 1.94 2.3 0.01
## Amerindian 3 1391 0.05 0.12 0.00 0.02 0.0 0 0.83 0.83 2.96 9.3 0.00
## East_Asian 4 1391 0.15 0.30 0.00 0.07 0.0 0 1.00 1.00 1.89 2.1 0.01
## Oceanian 5 1391 0.01 0.03 0.00 0.00 0.0 0 0.25 0.25 5.22 30.2 0.00
## Central_Asian 6 1391 0.02 0.12 0.00 0.00 0.0 0 1.00 1.00 6.56 44.6 0.00
We create a few subsamples of interest.
# subgroups ---------------------------------------------------------------
#define subsets of interest
d_AA_narrow = d_ping[d_ping$SIRE_standard == "African_American", ]
d_AA_broad = d_ping[d_ping$African_American, ]
d_HA = d_ping[d_ping$Hispanic, ]
d_nW = d_ping %>% dplyr::filter(SIRE_standard != "White")
#get Ws and AAs by standard coding
d_W_AA = filter(d_ping, SIRE_standard %in% c("African_American", "White"))
#get Ws and Hs by standard coding
d_W_H = filter(d_ping, SIRE_standard %in% c("Hispanic", "White"))
#simple results
# bivariate ---------------------------------------------------------------
wtd.cors(d_ping[c("CA", "S", v_genomic)]) %>% write_clipboard()
## CA S European African Amerindian East Asian Oceanian Central Asian
## CA 1.00 0.42 0.23 -0.33 -0.15 0.05 -0.08 0.06
## S 0.42 1.00 0.32 -0.30 -0.24 -0.06 -0.20 0.12
## European 0.23 0.32 1.00 -0.50 -0.10 -0.62 -0.31 -0.20
## African -0.33 -0.30 -0.50 1.00 -0.08 -0.22 -0.10 -0.07
## Amerindian -0.15 -0.24 -0.10 -0.08 1.00 -0.16 -0.09 -0.07
## East Asian 0.05 -0.06 -0.62 -0.22 -0.16 1.00 0.42 -0.07
## Oceanian -0.08 -0.20 -0.31 -0.10 -0.09 0.42 1.00 -0.03
## Central Asian 0.06 0.12 -0.20 -0.07 -0.07 -0.07 -0.03 1.00
d_ping[c("CA", "S", v_genomic)] %>% count.pairwise()
## CA S European African Amerindian East_Asian Oceanian Central_Asian
## CA 1369 1363 1369 1369 1369 1369 1369 1369
## S 1363 1380 1380 1380 1380 1380 1380 1380
## European 1369 1380 1391 1391 1391 1391 1391 1391
## African 1369 1380 1391 1391 1391 1391 1391 1391
## Amerindian 1369 1380 1391 1391 1391 1391 1391 1391
## East_Asian 1369 1380 1391 1391 1391 1391 1391 1391
## Oceanian 1369 1380 1391 1391 1391 1391 1391 1391
## Central_Asian 1369 1380 1391 1391 1391 1391 1391 1391
#group differences
d_ping %$% SMD_matrix(CA, SIRE_standard)
## White African_American American_Indian Asian Hispanic Multi_ethnic Other Pacific_Islander
## White NA 1.03 0.092 0.027 0.66 0.25 0.36 0.82
## African_American 1.029 NA -0.936 -1.001 -0.37 -0.78 -0.67 -0.21
## American_Indian 0.092 -0.94 NA -0.065 0.57 0.16 0.27 0.72
## Asian 0.027 -1.00 -0.065 NA 0.63 0.23 0.33 0.79
## Hispanic 0.659 -0.37 0.566 0.631 NA -0.41 -0.30 0.16
## Multi_ethnic 0.253 -0.78 0.161 0.226 -0.41 NA 0.11 0.56
## Other 0.361 -0.67 0.269 0.334 -0.30 0.11 NA 0.45
## Pacific_Islander 0.816 -0.21 0.723 0.788 0.16 0.56 0.45 NA
d_ping %$% SMD_matrix(S, SIRE_standard)
## White African_American American_Indian Asian Hispanic Multi_ethnic Other Pacific_Islander
## White NA 1.07 1.58 0.23 0.94 0.50 0.61 1.81
## African_American 1.07 NA 0.51 -0.84 -0.13 -0.57 -0.46 0.74
## American_Indian 1.58 0.51 NA -1.35 -0.64 -1.08 -0.97 0.23
## Asian 0.23 -0.84 -1.35 NA 0.72 0.28 0.38 1.58
## Hispanic 0.94 -0.13 -0.64 0.72 NA -0.44 -0.33 0.86
## Multi_ethnic 0.50 -0.57 -1.08 0.28 -0.44 NA 0.11 1.31
## Other 0.61 -0.46 -0.97 0.38 -0.33 0.11 NA 1.20
## Pacific_Islander 1.81 0.74 0.23 1.58 0.86 1.31 1.20 NA
#sample sizes
table2(d_ping$SIRE_standard)
## # A tibble: 9 x 3
## Group Count Percent
## <chr> <dbl> <dbl>
## 1 White 571 41.0
## 2 Hispanic 328 23.6
## 3 Multi_ethnic 186 13.4
## 4 African_American 140 10.1
## 5 Asian 122 8.77
## 6 Other 24 1.73
## 7 Pacific_Islander 16 1.15
## 8 American_Indian 4 0.288
## 9 <NA> 0 0
#redefine function to avoid error
describe = function(...) {
y = psych::describe(...)
class(y) = "data.frame"
y
}
# full sample ---------------------------------------------------------------------
describe(d_ping$Age)
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 1386 12 5 11 12 5.9 3 21 18 0.18 -1.1 0.13
table2(d_ping$Gender)
## # A tibble: 3 x 3
## Group Count Percent
## <chr> <dbl> <dbl>
## 1 M 719 51.7
## 2 F 672 48.3
## 3 <NA> 0 0
describe(d_ping$CA)
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 1369 -9.7e-18 1 0.08 0.036 0.96 -3.4 3.2 6.6 -0.35 0.13 0.027
describe(d_ping$S)
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 1380 -6.6e-18 1 0.11 0.049 1 -3 1.8 4.8 -0.43 -0.43 0.027
# by subgroup -------------------------------------------------------------
describeBy(d_ping %>% select(Age, CA, S), d_ping$SIRE_standard)
##
## Descriptive statistics by group
## group: White
## vars n mean sd median trimmed mad min max range skew kurtosis se
## Age 1 571 11.62 4.81 11.25 11.50 5.68 3.2 21.0 17.8 0.17 -1.01 0.20
## CA 2 567 0.29 0.89 0.32 0.32 0.77 -3.4 3.2 6.6 -0.51 1.36 0.04
## S 3 570 0.41 0.78 0.52 0.45 0.76 -1.8 1.8 3.7 -0.49 -0.21 0.03
## ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
## group: African_American
## vars n mean sd median trimmed mad min max range skew kurtosis se
## Age 1 140 12.50 4.68 12.58 12.55 5.56 3.1 20.8 17.7 -0.06 -0.93 0.40
## CA 2 138 -0.68 0.88 -0.66 -0.68 0.83 -2.9 1.8 4.7 0.02 -0.09 0.07
## S 3 139 -0.56 0.96 -0.52 -0.57 1.12 -2.3 1.5 3.8 0.11 -0.77 0.08
## ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
## group: American_Indian
## vars n mean sd median trimmed mad min max range skew kurtosis se
## Age 1 4 11.62 3.49 12.16 11.62 3.40 7.33 14.83 7.5 -0.20 -2.2 1.75
## CA 2 4 0.21 1.26 -0.17 0.21 0.74 -0.84 1.99 2.8 0.53 -1.9 0.63
## S 3 4 -1.03 0.92 -0.90 -1.03 0.92 -2.13 -0.19 1.9 -0.17 -2.2 0.46
## ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
## group: Asian
## vars n mean sd median trimmed mad min max range skew kurtosis se
## Age 1 122 13.64 5.78 15.42 13.98 6.55 3.1 21.0 17.9 -0.35 -1.40 0.52
## CA 2 120 0.27 0.88 0.50 0.29 0.73 -2.5 2.5 5.0 -0.39 -0.01 0.08
## S 3 122 0.20 1.03 0.38 0.26 1.04 -3.0 1.8 4.8 -0.65 0.30 0.09
## ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
## group: Hispanic
## vars n mean sd median trimmed mad min max range skew kurtosis se
## Age 1 328 11.31 4.89 10.41 11.10 5.63 3.2 21.0 17.8 0.34 -1.01 0.27
## CA 2 323 -0.33 0.98 -0.24 -0.30 0.99 -3.1 2.1 5.3 -0.23 -0.23 0.05
## S 3 328 -0.45 0.94 -0.42 -0.44 1.05 -2.9 1.8 4.7 -0.08 -0.53 0.05
## ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
## group: Multi_ethnic
## vars n mean sd median trimmed mad min max range skew kurtosis se
## Age 1 186 10.86 5.0 9.71 10.60 5.9 3.0 20.9 17.9 0.37 -0.96 0.37
## CA 2 182 0.05 1.1 0.08 0.09 1.1 -2.6 2.8 5.4 -0.26 -0.40 0.08
## S 3 186 -0.05 1.1 0.16 0.01 1.1 -2.3 1.8 4.1 -0.42 -0.75 0.08
## ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
## group: Other
## vars n mean sd median trimmed mad min max range skew kurtosis se
## Age 1 19 14.45 4.21 14.17 14.64 3.8 5.4 20.25 14.8 -0.41 -0.81 0.96
## CA 2 19 -0.05 0.84 0.15 0.03 0.7 -2.3 0.89 3.2 -1.02 0.32 0.19
## S 3 15 -0.15 1.00 -0.18 -0.07 1.1 -2.5 1.28 3.8 -0.56 -0.20 0.26
## ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
## group: Pacific_Islander
## vars n mean sd median trimmed mad min max range skew kurtosis se
## Age 1 16 7.10 2.7 6.54 6.79 1.66 3.8 14.75 10.9 1.30 1.35 0.68
## CA 2 16 -0.48 1.3 -0.17 -0.52 0.64 -2.6 2.25 4.9 -0.22 -0.52 0.33
## S 3 16 -1.24 1.2 -1.42 -1.28 1.51 -2.5 0.61 3.1 0.38 -1.53 0.30
plyr::dlply(d_ping, "SIRE_standard", function(dd) table2(dd$Gender))
## $White
## # A tibble: 3 x 3
## Group Count Percent
## <chr> <dbl> <dbl>
## 1 M 303 53.1
## 2 F 268 46.9
## 3 <NA> 0 0
##
## $African_American
## # A tibble: 3 x 3
## Group Count Percent
## <chr> <dbl> <dbl>
## 1 M 78 55.7
## 2 F 62 44.3
## 3 <NA> 0 0
##
## $American_Indian
## # A tibble: 2 x 3
## Group Count Percent
## <chr> <dbl> <dbl>
## 1 M 4 100
## 2 <NA> 0 0
##
## $Asian
## # A tibble: 3 x 3
## Group Count Percent
## <chr> <dbl> <dbl>
## 1 F 80 65.6
## 2 M 42 34.4
## 3 <NA> 0 0
##
## $Hispanic
## # A tibble: 3 x 3
## Group Count Percent
## <chr> <dbl> <dbl>
## 1 M 175 53.4
## 2 F 153 46.6
## 3 <NA> 0 0
##
## $Multi_ethnic
## # A tibble: 3 x 3
## Group Count Percent
## <chr> <dbl> <dbl>
## 1 M 95 51.1
## 2 F 91 48.9
## 3 <NA> 0 0
##
## $Other
## # A tibble: 3 x 3
## Group Count Percent
## <chr> <dbl> <dbl>
## 1 M 15 62.5
## 2 F 9 37.5
## 3 <NA> 0 0
##
## $Pacific_Islander
## # A tibble: 3 x 3
## Group Count Percent
## <chr> <dbl> <dbl>
## 1 F 9 56.2
## 2 M 7 43.8
## 3 <NA> 0 0
##
## attr(,"split_type")
## [1] "data.frame"
## attr(,"split_labels")
## SIRE_standard
## 1 White
## 2 African_American
## 3 American_Indian
## 4 Asian
## 5 Hispanic
## 6 Multi_ethnic
## 7 Other
## 8 Pacific_Islander
After having done the initial analyses, we do the main modeling. This outputs quite a lot. Normally, one would do this interactively and only print the necessary stuff, but when the output is a static file, we will have to print everything.
### MAIN MODELING
# main modeling function ----------------------------------------------------------------
#we make a function that does the analyses for us given an outcome variable
modeller = function(outcome, genomics, include_S_as_pred = F, data = d_ping) {
outcome
genomics
data
#variations
models = expand.grid(SIRE = c("standard", "common", "numeric", "dummy", "none"),
genomics = c(T, F),
stringsAsFactors = F) %>%
as_data_frame()
#add list col
models[["fit"]] = replicate(nrow(models), list())
#remove null model
models = models[-nrow(models), ]
#loop de loop
for (i in 1:nrow(models)) {
this_to_add = c()
this_pars = models[i, ]
#build model
this_model = outcome + " ~ "
#parental S as control?
if (include_S_as_pred) {
this_model = this_model + "S + "
}
#SIRE
switch(this_pars$SIRE,
standard = {this_to_add = "SIRE_standard"},
common = {this_to_add = "SIRE_common"},
dummy = {this_to_add = v_SIREs},
numeric = {this_to_add = v_SIREs_con[-length(v_SIREs_con)]}, #drop last predictor to avoid perfect multicollinearity
none = NULL
)
#genomics
switch(this_pars$genomics,
T = {this_to_add = c(this_to_add, genomics)},
F = NULL
)
#first language
if (!exclude_nonenglish_first_language) {
this_to_add = c(this_to_add, "EFL")
}
#make model
this_model = this_model + str_c(this_to_add, collapse = " + ")
#fit model
models$fit[i] = list(lm(this_model, data = data) %>% MOD_summary(progress = F, standardize = F, kfold = F))
}
models
}
# restructure function ----------------------------------------------------
#another function that restructures a bunch of model results
MOD_restructure = function(x) {
#get coefs
coefs = map(x, "coefs")
etas = map(x, c("aov_etas")) %>% map(~as.data.frame(.) %>% `[`("Eta"))
#add id's if none present
if (!has_names(coefs)) names(coefs) = seq_along(coefs)
if (!has_names(etas)) names(coefs) = seq_along(coefs)
#restructure
list(
coefs = map(coefs, rownames_to_column, var = "Predictor") %>% ldf_to_df(by_name = "model_id"),
etas = map(etas, rownames_to_column, var = "Predictor") %>% ldf_to_df(by_name = "model_id")
)
}
# main regression -----------------------------------------------------------
#models without European (reference)
all_models_nonEuro = map(c("CA", "S"), modeller, genomics = v_genomic[-1])
#same as above, but include parental S as predictor
all_models_nonEuro_S = modeller(outcome = "CA", genomics = v_genomic[-1], include_S_as_pred = T)
#this is models only with European ancestry as predictor
#showing the importance of using multiple ancestries, unlike what was previous found
all_models_euro = map(c("CA", "S"), modeller, genomics = "European")
#restructure model results
betas = rbind(
MOD_restructure(all_models_nonEuro[[1]][["fit"]]) %>% `[[`("coefs") %>% mutate(outcome = "CA"),
MOD_restructure(all_models_nonEuro[[2]][["fit"]]) %>% `[[`("coefs") %>% mutate(outcome = "S")
)
etas = rbind(
MOD_restructure(all_models_nonEuro[[1]][["fit"]]) %>% `[[`("etas") %>% mutate(outcome = "CA"),
MOD_restructure(all_models_nonEuro[[2]][["fit"]]) %>% `[[`("etas") %>% mutate(outcome = "S")
)
#classify predictors
etas %<>% mutate(class = map_chr(Predictor, function(pred) {
if (pred %in% v_genomic) return("Genomic ancestry")
if (pred == "EFL") return("Linguistic")
"SIRE"
}))
#summarize etas
etas_summary = ddply(etas, c("model_id", "outcome", "class"), function(d) {
d$Eta %>% `^`(2) %>% sum %>% sqrt %>% set_names("total_eta")
})
#Parental models
etas_parental = MOD_restructure(all_models_nonEuro_S[["fit"]]) %>% `[[`("etas")
#classify predictors
etas_parental %<>% mutate(class = map_chr(Predictor, function(pred) {
if (pred %in% v_genomic) return("Genomic ancestry")
if (pred == "EFL") return("Linguistic")
if (pred == "S") return("Parental")
"SIRE"
}))
#summarize etas
etas_summary_parental = ddply(etas_parental, c("model_id", "class"), function(d) {
d$Eta %>% `^`(2) %>% sum %>% sqrt %>% set_names("total_eta")
})
# main models output ------------------------------------------------------------
#all non-European ancestry + SIREs
#CA
all_models_nonEuro[[1]][["fit"]]
## [[1]]
##
## ---- Model summary ----
## Model coefficients
## Beta SE CI_lower CI_upper
## SIRE_standard: White 0.000 NA NA NA
## SIRE_standard: African_American 0.305 0.184 -0.055 0.67
## SIRE_standard: American_Indian 0.408 0.464 -0.503 1.32
## SIRE_standard: Asian -0.071 0.189 -0.442 0.30
## SIRE_standard: Hispanic -0.165 0.099 -0.359 0.03
## SIRE_standard: Multi_ethnic 0.038 0.110 -0.178 0.25
## SIRE_standard: Other -0.029 0.228 -0.476 0.42
## SIRE_standard: Pacific_Islander 0.023 0.308 -0.581 0.63
## African -1.580 0.201 -1.975 -1.19
## Amerindian -1.152 0.309 -1.757 -0.55
## East_Asian 0.058 0.186 -0.307 0.42
## Oceanian -4.809 1.273 -7.307 -2.31
## Central_Asian 0.209 0.255 -0.291 0.71
## EFL 0.030 0.085 -0.137 0.20
##
##
## Model meta-data
## outcome N df R2 R2-adj. R2-cv
## 1 CA 1369 1355 0.17 0.16 NA
##
##
## Etas from analysis of variance
## Eta Eta_partial
## SIRE_standard 0.0881 0.0961
## African 0.1947 0.2087
## Amerindian 0.0925 0.1009
## East_Asian 0.0078 0.0085
## Oceanian 0.0936 0.1021
## Central_Asian 0.0203 0.0223
## EFL 0.0086 0.0095
##
## [[2]]
##
## ---- Model summary ----
## Model coefficients
## Beta SE CI_lower CI_upper
## SIRE_common: White 0.000 NA NA NA
## SIRE_common: African_American 0.086 0.212 -0.330 0.502
## SIRE_common: Asian -0.014 0.197 -0.400 0.372
## SIRE_common: Asian, White 0.315 0.153 0.015 0.615
## SIRE_common: Hispanic -0.421 0.162 -0.739 -0.104
## SIRE_common: Hispanic, African_American -0.382 0.231 -0.836 0.071
## SIRE_common: Hispanic, White -0.135 0.111 -0.354 0.083
## SIRE_common: Pacific_Islander, Asian, White -0.186 0.189 -0.557 0.185
## SIRE_common: Other -0.162 0.111 -0.379 0.056
## African -1.309 0.240 -1.780 -0.838
## Amerindian -0.953 0.324 -1.589 -0.317
## East_Asian -0.013 0.197 -0.399 0.373
## Oceanian -3.331 1.144 -5.575 -1.086
## Central_Asian 0.158 0.258 -0.348 0.665
## EFL 0.033 0.085 -0.133 0.199
##
##
## Model meta-data
## outcome N df R2 R2-adj. R2-cv
## 1 CA 1369 1354 0.17 0.17 NA
##
##
## Etas from analysis of variance
## Eta Eta_partial
## SIRE_common 0.1192 0.1301
## African 0.1346 0.1465
## Amerindian 0.0726 0.0796
## East_Asian 0.0017 0.0018
## Oceanian 0.0719 0.0789
## Central_Asian 0.0151 0.0166
## EFL 0.0096 0.0106
##
## [[3]]
##
## ---- Model summary ----
## Model coefficients
## Beta SE CI_lower CI_upper
## Hispanic_c -0.460 0.153 -0.76 -0.16
## Pacific_Islander_c -0.781 0.311 -1.39 -0.17
## Asian_c -0.038 0.223 -0.48 0.40
## African_American_c 0.287 0.238 -0.18 0.75
## American_Indian_c -0.241 0.266 -0.76 0.28
## Other_c -0.035 0.232 -0.49 0.42
## African -1.594 0.273 -2.13 -1.06
## Amerindian -0.753 0.329 -1.40 -0.11
## East_Asian 0.061 0.233 -0.40 0.52
## Oceanian -1.566 1.495 -4.50 1.37
## Central_Asian 0.178 0.278 -0.37 0.72
## EFL 0.036 0.084 -0.13 0.20
##
##
## Model meta-data
## outcome N df R2 R2-adj. R2-cv
## 1 CA 1369 1356 0.17 0.17 NA
##
##
## Etas from analysis of variance
## Eta Eta_partial
## Hispanic_c 0.0743 0.0815
## Pacific_Islander_c 0.0619 0.0680
## Asian_c 0.0042 0.0047
## African_American_c 0.0297 0.0326
## American_Indian_c 0.0224 0.0246
## Other_c 0.0037 0.0041
## African 0.1442 0.1567
## Amerindian 0.0565 0.0620
## East_Asian 0.0064 0.0071
## Oceanian 0.0259 0.0284
## Central_Asian 0.0158 0.0174
## EFL 0.0106 0.0116
##
## [[4]]
##
## ---- Model summary ----
## Model coefficients
## Beta SE CI_lower CI_upper
## Hispanic -0.182 0.081 -0.342 -0.022
## Pacific_Islander -0.440 0.134 -0.703 -0.178
## Asian 0.205 0.119 -0.028 0.438
## African_American 0.263 0.151 -0.033 0.559
## American_Indian -0.105 0.119 -0.338 0.128
## Other 0.154 0.227 -0.290 0.599
## White 0.215 0.086 0.045 0.385
## African -1.364 0.229 -1.813 -0.915
## Amerindian -0.837 0.303 -1.432 -0.242
## East_Asian 0.021 0.188 -0.348 0.390
## Oceanian -1.319 1.317 -3.904 1.265
## Central_Asian 0.149 0.248 -0.338 0.637
## EFL 0.046 0.084 -0.120 0.212
##
##
## Model meta-data
## outcome N df R2 R2-adj. R2-cv
## 1 CA 1369 1355 0.18 0.17 NA
##
##
## Etas from analysis of variance
## Eta Eta_partial
## Hispanic 0.0551 0.0607
## Pacific_Islander 0.0810 0.0889
## Asian 0.0424 0.0467
## African_American 0.0430 0.0473
## American_Indian 0.0217 0.0240
## Other 0.0168 0.0185
## White 0.0613 0.0674
## African 0.1469 0.1598
## Amerindian 0.0680 0.0747
## East_Asian 0.0028 0.0031
## Oceanian 0.0247 0.0272
## Central_Asian 0.0148 0.0163
## EFL 0.0135 0.0148
##
## [[5]]
##
## ---- Model summary ----
## Model coefficients
## Beta SE CI_lower CI_upper
## African -1.312 0.095 -1.50 -1.13
## Amerindian -1.593 0.222 -2.03 -1.16
## East_Asian 0.022 0.094 -0.16 0.21
## Oceanian -4.584 0.930 -6.41 -2.76
## Central_Asian 0.187 0.206 -0.22 0.59
## EFL 0.046 0.084 -0.12 0.21
##
##
## Model meta-data
## outcome N df R2 R2-adj. R2-cv
## 1 CA 1369 1362 0.16 0.16 NA
##
##
## Etas from analysis of variance
## Eta Eta_partial
## African 0.3439 0.3512
## Amerindian 0.1784 0.1910
## East_Asian 0.0057 0.0063
## Oceanian 0.1224 0.1323
## Central_Asian 0.0226 0.0246
## EFL 0.0136 0.0149
##
## [[6]]
##
## ---- Model summary ----
## Model coefficients
## Beta SE CI_lower CI_upper
## SIRE_standard: White 0.000 NA NA NA
## SIRE_standard: African_American -0.971 0.090 -1.15 -0.795
## SIRE_standard: American_Indian -0.089 0.473 -1.02 0.839
## SIRE_standard: Asian -0.016 0.097 -0.21 0.175
## SIRE_standard: Hispanic -0.615 0.067 -0.75 -0.484
## SIRE_standard: Multi_ethnic -0.238 0.080 -0.40 -0.081
## SIRE_standard: Other -0.332 0.221 -0.76 0.101
## SIRE_standard: Pacific_Islander -0.761 0.240 -1.23 -0.291
## EFL 0.039 0.085 -0.13 0.206
##
##
## Model meta-data
## outcome N df R2 R2-adj. R2-cv
## 1 CA 1369 1360 0.12 0.11 NA
##
##
## Etas from analysis of variance
## Eta Eta_partial
## SIRE_standard 0.341 0.341
## EFL 0.012 0.012
##
## [[7]]
##
## ---- Model summary ----
## Model coefficients
## Beta SE CI_lower CI_upper
## SIRE_common: White 0.000 NA NA NA
## SIRE_common: African_American -0.971 0.088 -1.14 -0.799
## SIRE_common: Asian -0.012 0.095 -0.20 0.176
## SIRE_common: Asian, White 0.320 0.128 0.07 0.571
## SIRE_common: Hispanic -0.868 0.120 -1.10 -0.633
## SIRE_common: Hispanic, African_American -1.222 0.179 -1.57 -0.871
## SIRE_common: Hispanic, White -0.413 0.089 -0.59 -0.239
## SIRE_common: Pacific_Islander, Asian, White -0.398 0.168 -0.73 -0.069
## SIRE_common: Other -0.524 0.074 -0.67 -0.379
## EFL 0.054 0.084 -0.11 0.219
##
##
## Model meta-data
## outcome N df R2 R2-adj. R2-cv
## 1 CA 1369 1359 0.15 0.14 NA
##
##
## Etas from analysis of variance
## Eta Eta_partial
## SIRE_common 0.386 0.387
## EFL 0.016 0.018
##
## [[8]]
##
## ---- Model summary ----
## Model coefficients
## Beta SE CI_lower CI_upper
## Hispanic_c -0.917 0.099 -1.11 -0.722
## Pacific_Islander_c -0.996 0.175 -1.34 -0.653
## Asian_c 0.036 0.088 -0.14 0.210
## African_American_c -1.016 0.084 -1.18 -0.852
## American_Indian_c -0.615 0.261 -1.13 -0.103
## Other_c -0.345 0.216 -0.77 0.078
## EFL 0.050 0.083 -0.11 0.214
##
##
## Model meta-data
## outcome N df R2 R2-adj. R2-cv
## 1 CA 1369 1361 0.15 0.15 NA
##
##
## Etas from analysis of variance
## Eta Eta_partial
## Hispanic_c 0.231 0.243
## Pacific_Islander_c 0.143 0.153
## Asian_c 0.010 0.011
## African_American_c 0.304 0.313
## American_Indian_c 0.059 0.064
## Other_c 0.040 0.043
## EFL 0.015 0.016
##
## [[9]]
##
## ---- Model summary ----
## Model coefficients
## Beta SE CI_lower CI_upper
## Hispanic -0.341 0.062 -0.46 -0.22
## Pacific_Islander -0.479 0.095 -0.67 -0.29
## Asian 0.378 0.073 0.23 0.52
## African_American -0.460 0.083 -0.62 -0.30
## American_Indian -0.108 0.121 -0.34 0.13
## Other 0.110 0.224 -0.33 0.55
## White 0.410 0.068 0.28 0.54
## EFL 0.052 0.083 -0.11 0.22
##
##
## Model meta-data
## outcome N df R2 R2-adj. R2-cv
## 1 CA 1369 1360 0.15 0.14 NA
##
##
## Etas from analysis of variance
## Eta Eta_partial
## Hispanic 0.137 0.147
## Pacific_Islander 0.126 0.135
## Asian 0.129 0.139
## African_American 0.139 0.149
## American_Indian 0.022 0.024
## Other 0.012 0.013
## White 0.151 0.161
## EFL 0.016 0.017
#S
all_models_nonEuro[[2]][["fit"]]
## [[1]]
##
## ---- Model summary ----
## Model coefficients
## Beta SE CI_lower CI_upper
## SIRE_standard: White 0.000 NA NA NA
## SIRE_standard: African_American 0.283 0.175 -0.060 0.625
## SIRE_standard: American_Indian -0.845 0.440 -1.709 0.018
## SIRE_standard: Asian -0.033 0.178 -0.383 0.316
## SIRE_standard: Hispanic -0.209 0.094 -0.393 -0.026
## SIRE_standard: Multi_ethnic -0.027 0.103 -0.229 0.175
## SIRE_standard: Other -0.387 0.237 -0.852 0.078
## SIRE_standard: Pacific_Islander -0.221 0.291 -0.791 0.350
## African -1.553 0.191 -1.928 -1.177
## Amerindian -1.906 0.291 -2.476 -1.335
## East_Asian -0.239 0.176 -0.585 0.107
## Oceanian -7.229 1.202 -9.587 -4.872
## Central_Asian 0.624 0.246 0.142 1.107
## EFL 0.109 0.080 -0.048 0.265
##
##
## Model meta-data
## outcome N df R2 R2-adj. R2-cv
## 1 S 1380 1366 0.25 0.24 NA
##
##
## Etas from analysis of variance
## Eta Eta_partial
## SIRE_standard 0.110 0.126
## African 0.190 0.214
## Amerindian 0.153 0.175
## East_Asian 0.032 0.037
## Oceanian 0.141 0.161
## Central_Asian 0.059 0.069
## EFL 0.032 0.037
##
## [[2]]
##
## ---- Model summary ----
## Model coefficients
## Beta SE CI_lower CI_upper
## SIRE_common: White 0.000 NA NA NA
## SIRE_common: African_American 0.017 0.203 -0.382 0.416
## SIRE_common: Asian -0.152 0.185 -0.515 0.212
## SIRE_common: Asian, White 0.199 0.142 -0.080 0.479
## SIRE_common: Hispanic -0.207 0.152 -0.504 0.091
## SIRE_common: Hispanic, African_American -0.398 0.219 -0.828 0.031
## SIRE_common: Hispanic, White -0.061 0.105 -0.267 0.144
## SIRE_common: Pacific_Islander, Asian, White -0.567 0.179 -0.917 -0.216
## SIRE_common: Other -0.396 0.104 -0.600 -0.192
## African -1.227 0.231 -1.680 -0.773
## Amerindian -2.063 0.305 -2.661 -1.464
## East_Asian -0.104 0.185 -0.468 0.260
## Oceanian -5.657 1.081 -7.777 -3.536
## Central_Asian 0.707 0.249 0.219 1.195
## EFL 0.139 0.079 -0.016 0.295
##
##
## Model meta-data
## outcome N df R2 R2-adj. R2-cv
## 1 S 1380 1365 0.26 0.25 NA
##
##
## Etas from analysis of variance
## Eta Eta_partial
## SIRE_common 0.149 0.171
## African 0.123 0.142
## Amerindian 0.157 0.180
## East_Asian 0.013 0.015
## Oceanian 0.122 0.140
## Central_Asian 0.066 0.077
## EFL 0.041 0.048
##
## [[3]]
##
## ---- Model summary ----
## Model coefficients
## Beta SE CI_lower CI_upper
## Hispanic_c -0.28 0.142 -0.556 0.00084
## Pacific_Islander_c -2.06 0.288 -2.628 -1.49849
## Asian_c -0.56 0.208 -0.967 -0.15100
## African_American_c 0.42 0.229 -0.030 0.86784
## American_Indian_c -0.63 0.246 -1.113 -0.14642
## Other_c -0.46 0.236 -0.919 0.00527
## African -1.76 0.262 -2.270 -1.24087
## Amerindian -1.88 0.306 -2.477 -1.27512
## East_Asian 0.33 0.217 -0.101 0.75219
## Oceanian -1.60 1.383 -4.317 1.10811
## Central_Asian 1.07 0.265 0.545 1.58527
## EFL 0.12 0.078 -0.028 0.27764
##
##
## Model meta-data
## outcome N df R2 R2-adj. R2-cv
## 1 S 1380 1367 0.28 0.27 NA
##
##
## Etas from analysis of variance
## Eta Eta_partial
## Hispanic_c 0.045 0.053
## Pacific_Islander_c 0.165 0.190
## Asian_c 0.062 0.073
## African_American_c 0.042 0.049
## American_Indian_c 0.059 0.069
## Other_c 0.045 0.052
## African 0.154 0.178
## Amerindian 0.141 0.163
## East_Asian 0.034 0.040
## Oceanian 0.027 0.031
## Central_Asian 0.092 0.108
## EFL 0.037 0.043
##
## [[4]]
##
## ---- Model summary ----
## Model coefficients
## Beta SE CI_lower CI_upper
## Hispanic -0.109 0.075 -0.256 0.039
## Pacific_Islander -0.980 0.123 -1.222 -0.739
## Asian 0.032 0.109 -0.182 0.247
## African_American 0.309 0.143 0.029 0.588
## American_Indian -0.184 0.109 -0.398 0.030
## Other -0.157 0.233 -0.614 0.300
## White 0.296 0.080 0.140 0.453
## African -1.347 0.217 -1.774 -0.921
## Amerindian -1.757 0.280 -2.306 -1.207
## East_Asian 0.097 0.174 -0.243 0.438
## Oceanian -1.576 1.214 -3.958 0.806
## Central_Asian 0.826 0.234 0.366 1.286
## EFL 0.163 0.078 0.010 0.316
##
##
## Model meta-data
## outcome N df R2 R2-adj. R2-cv
## 1 S 1380 1366 0.29 0.28 NA
##
##
## Etas from analysis of variance
## Eta Eta_partial
## Hispanic 0.0331 0.039
## Pacific_Islander 0.1820 0.211
## Asian 0.0068 0.008
## African_American 0.0493 0.058
## American_Indian 0.0386 0.046
## Other 0.0154 0.018
## White 0.0846 0.100
## African 0.1415 0.165
## Amerindian 0.1431 0.167
## East_Asian 0.0128 0.015
## Oceanian 0.0296 0.035
## Central_Asian 0.0804 0.095
## EFL 0.0478 0.057
##
## [[5]]
##
## ---- Model summary ----
## Model coefficients
## Beta SE CI_lower CI_upper
## African -1.32 0.090 -1.49 -1.138
## Amerindian -2.44 0.210 -2.85 -2.030
## East_Asian -0.24 0.089 -0.42 -0.068
## Oceanian -7.87 0.882 -9.60 -6.141
## Central_Asian 0.59 0.199 0.20 0.979
## EFL 0.13 0.079 -0.03 0.282
##
##
## Model meta-data
## outcome N df R2 R2-adj. R2-cv
## 1 S 1380 1373 0.24 0.24 NA
##
##
## Etas from analysis of variance
## Eta Eta_partial
## African 0.344 0.367
## Amerindian 0.273 0.299
## East_Asian 0.064 0.073
## Oceanian 0.210 0.234
## Central_Asian 0.069 0.079
## EFL 0.037 0.043
##
## [[6]]
##
## ---- Model summary ----
## Model coefficients
## Beta SE CI_lower CI_upper
## SIRE_standard: White 0.00 NA NA NA
## SIRE_standard: African_American -0.98 0.086 -1.144 -0.8061
## SIRE_standard: American_Indian -1.45 0.457 -2.342 -0.5497
## SIRE_standard: Asian -0.17 0.093 -0.356 0.0099
## SIRE_standard: Hispanic -0.84 0.064 -0.966 -0.7140
## SIRE_standard: Multi_ethnic -0.46 0.077 -0.607 -0.3053
## SIRE_standard: Other -0.52 0.239 -0.990 -0.0515
## SIRE_standard: Pacific_Islander -1.62 0.231 -2.075 -1.1670
## EFL 0.13 0.081 -0.033 0.2870
##
##
## Model meta-data
## outcome N df R2 R2-adj. R2-cv
## 1 S 1380 1371 0.18 0.17 NA
##
##
## Etas from analysis of variance
## Eta Eta_partial
## SIRE_standard 0.412 0.413
## EFL 0.038 0.042
##
## [[7]]
##
## ---- Model summary ----
## Model coefficients
## Beta SE CI_lower CI_upper
## SIRE_common: White 0.00 NA NA NA
## SIRE_common: African_American -0.98 0.085 -1.142 -0.810
## SIRE_common: Asian -0.16 0.092 -0.341 0.018
## SIRE_common: Asian, White 0.19 0.121 -0.043 0.432
## SIRE_common: Hispanic -1.00 0.114 -1.225 -0.776
## SIRE_common: Hispanic, African_American -1.27 0.170 -1.599 -0.932
## SIRE_common: Hispanic, White -0.55 0.085 -0.719 -0.384
## SIRE_common: Pacific_Islander, Asian, White -0.97 0.162 -1.288 -0.652
## SIRE_common: Other -0.89 0.071 -1.029 -0.749
## EFL 0.17 0.080 0.015 0.330
##
##
## Model meta-data
## outcome N df R2 R2-adj. R2-cv
## 1 S 1380 1370 0.21 0.2 NA
##
##
## Etas from analysis of variance
## Eta Eta_partial
## SIRE_common 0.449 0.450
## EFL 0.052 0.058
##
## [[8]]
##
## ---- Model summary ----
## Model coefficients
## Beta SE CI_lower CI_upper
## Hispanic_c -1.12 0.094 -1.305 -0.9379
## Pacific_Islander_c -2.09 0.166 -2.414 -1.7641
## Asian_c -0.16 0.084 -0.322 0.0065
## African_American_c -1.00 0.079 -1.158 -0.8465
## American_Indian_c -1.14 0.246 -1.623 -0.6584
## Other_c -0.53 0.231 -0.983 -0.0757
## EFL 0.14 0.079 -0.011 0.2976
##
##
## Model meta-data
## outcome N df R2 R2-adj. R2-cv
## 1 S 1380 1372 0.23 0.22 NA
##
##
## Etas from analysis of variance
## Eta Eta_partial
## Hispanic_c 0.284 0.308
## Pacific_Islander_c 0.299 0.323
## Asian_c 0.045 0.051
## African_American_c 0.300 0.323
## American_Indian_c 0.110 0.124
## Other_c 0.054 0.062
## EFL 0.043 0.049
##
## [[9]]
##
## ---- Model summary ----
## Model coefficients
## Beta SE CI_lower CI_upper
## Hispanic -0.445 0.059 -0.560 -0.330
## Pacific_Islander -1.029 0.089 -1.204 -0.854
## Asian 0.325 0.069 0.191 0.460
## African_American -0.371 0.078 -0.525 -0.218
## American_Indian -0.210 0.112 -0.430 0.011
## Other 0.024 0.236 -0.440 0.487
## White 0.498 0.064 0.373 0.624
## EFL 0.174 0.078 0.021 0.327
##
##
## Model meta-data
## outcome N df R2 R2-adj. R2-cv
## 1 S 1380 1371 0.24 0.23 NA
##
##
## Etas from analysis of variance
## Eta Eta_partial
## Hispanic 0.1787 0.2005
## Pacific_Islander 0.2721 0.2976
## Asian 0.1117 0.1269
## African_American 0.1120 0.1273
## American_Indian 0.0440 0.0504
## Other 0.0024 0.0027
## White 0.1836 0.2059
## EFL 0.0524 0.0600
#overcorrected model with all non-Euro + parental S
all_models_nonEuro_S[["fit"]]
## [[1]]
##
## ---- Model summary ----
## Model coefficients
## Beta SE CI_lower CI_upper
## S 0.3089 0.027 0.26 0.363
## SIRE_standard: White 0.0000 NA NA NA
## SIRE_standard: African_American 0.1895 0.178 -0.16 0.539
## SIRE_standard: American_Indian 0.6612 0.444 -0.21 1.532
## SIRE_standard: Asian -0.0751 0.182 -0.43 0.282
## SIRE_standard: Hispanic -0.1095 0.095 -0.30 0.078
## SIRE_standard: Multi_ethnic 0.0347 0.105 -0.17 0.241
## SIRE_standard: Other 0.1434 0.239 -0.33 0.613
## SIRE_standard: Pacific_Islander 0.0866 0.294 -0.49 0.663
## African -1.0521 0.200 -1.44 -0.659
## Amerindian -0.5346 0.300 -1.12 0.054
## East_Asian 0.1555 0.180 -0.20 0.508
## Oceanian -2.6176 1.233 -5.04 -0.199
## Central_Asian 0.0429 0.250 -0.45 0.532
## EFL -0.0041 0.081 -0.16 0.155
##
##
## Model meta-data
## outcome N df R2 R2-adj. R2-cv
## 1 CA 1363 1348 0.24 0.23 NA
##
##
## Etas from analysis of variance
## Eta Eta_partial
## S 0.2686 0.2938
## SIRE_standard 0.0688 0.0785
## African 0.1251 0.1417
## Amerindian 0.0424 0.0485
## East_Asian 0.0206 0.0236
## Oceanian 0.0505 0.0577
## Central_Asian 0.0041 0.0047
## EFL 0.0012 0.0014
##
## [[2]]
##
## ---- Model summary ----
## Model coefficients
## Beta SE CI_lower CI_upper
## S 0.302 0.028 0.248 0.356
## SIRE_common: White 0.000 NA NA NA
## SIRE_common: African_American 0.033 0.207 -0.373 0.440
## SIRE_common: Asian 0.019 0.190 -0.354 0.391
## SIRE_common: Asian, White 0.240 0.147 -0.049 0.528
## SIRE_common: Hispanic -0.371 0.155 -0.676 -0.066
## SIRE_common: Hispanic, African_American -0.301 0.224 -0.740 0.138
## SIRE_common: Hispanic, White -0.121 0.107 -0.330 0.089
## SIRE_common: Pacific_Islander, Asian, White -0.018 0.182 -0.375 0.339
## SIRE_common: Other -0.052 0.107 -0.263 0.158
## African -0.867 0.238 -1.334 -0.399
## Amerindian -0.308 0.316 -0.928 0.311
## East_Asian 0.040 0.190 -0.332 0.413
## Oceanian -1.659 1.109 -3.836 0.517
## Central_Asian -0.029 0.254 -0.528 0.470
## EFL -0.011 0.081 -0.170 0.149
##
##
## Model meta-data
## outcome N df R2 R2-adj. R2-cv
## 1 CA 1363 1347 0.24 0.23 NA
##
##
## Etas from analysis of variance
## Eta Eta_partial
## S 0.2611 0.2867
## SIRE_common 0.0866 0.0988
## African 0.0865 0.0986
## Amerindian 0.0232 0.0266
## East_Asian 0.0051 0.0058
## Oceanian 0.0355 0.0407
## Central_Asian 0.0027 0.0031
## EFL 0.0031 0.0035
##
## [[3]]
##
## ---- Model summary ----
## Model coefficients
## Beta SE CI_lower CI_upper
## S 0.3040 0.028 0.25 0.36
## Hispanic_c -0.3917 0.147 -0.68 -0.10
## Pacific_Islander_c -0.1587 0.304 -0.76 0.44
## Asian_c 0.1114 0.217 -0.31 0.54
## African_American_c 0.1039 0.236 -0.36 0.57
## American_Indian_c -0.0638 0.256 -0.57 0.44
## Other_c 0.1491 0.242 -0.33 0.62
## African -0.9809 0.275 -1.52 -0.44
## Amerindian -0.1529 0.320 -0.78 0.48
## East_Asian -0.0095 0.226 -0.45 0.43
## Oceanian -1.1523 1.433 -3.96 1.66
## Central_Asian -0.1145 0.275 -0.65 0.43
## EFL -0.0013 0.081 -0.16 0.16
##
##
## Model meta-data
## outcome N df R2 R2-adj. R2-cv
## 1 CA 1363 1349 0.24 0.23 NA
##
##
## Etas from analysis of variance
## Eta Eta_partial
## S 0.25954 0.28492
## Hispanic_c 0.06323 0.07223
## Pacific_Islander_c 0.01241 0.01421
## Asian_c 0.01220 0.01397
## African_American_c 0.01046 0.01198
## American_Indian_c 0.00592 0.00678
## Other_c 0.01465 0.01677
## African 0.08472 0.09657
## Amerindian 0.01135 0.01300
## East_Asian 0.00100 0.00115
## Oceanian 0.01911 0.02188
## Central_Asian 0.00988 0.01132
## EFL 0.00039 0.00045
##
## [[4]]
##
## ---- Model summary ----
## Model coefficients
## Beta SE CI_lower CI_upper
## S 0.2991 0.028 0.244 0.3541
## Hispanic -0.1500 0.078 -0.303 0.0035
## Pacific_Islander -0.1425 0.131 -0.400 0.1155
## Asian 0.1855 0.115 -0.039 0.4105
## African_American 0.1374 0.147 -0.152 0.4268
## American_Indian -0.0482 0.114 -0.272 0.1760
## Other 0.2723 0.240 -0.199 0.7438
## White 0.1381 0.084 -0.026 0.3023
## African -0.8858 0.228 -1.333 -0.4381
## Amerindian -0.2924 0.295 -0.871 0.2867
## East_Asian 0.0222 0.182 -0.334 0.3785
## Oceanian -0.9074 1.265 -3.389 1.5742
## Central_Asian -0.0697 0.244 -0.549 0.4095
## EFL -0.0026 0.081 -0.162 0.1570
##
##
## Model meta-data
## outcome N df R2 R2-adj. R2-cv
## 1 CA 1363 1348 0.24 0.23 NA
##
##
## Etas from analysis of variance
## Eta Eta_partial
## S 0.25367 0.27904
## Hispanic 0.04559 0.05216
## Pacific_Islander 0.02576 0.02950
## Asian 0.03847 0.04403
## African_American 0.02215 0.02537
## American_Indian 0.01003 0.01149
## Other 0.02694 0.03084
## White 0.03922 0.04489
## African 0.09230 0.10514
## Amerindian 0.02355 0.02697
## East_Asian 0.00291 0.00333
## Oceanian 0.01706 0.01953
## Central_Asian 0.00678 0.00777
## EFL 0.00076 0.00087
##
## [[5]]
##
## ---- Model summary ----
## Model coefficients
## Beta SE CI_lower CI_upper
## S 0.3122 0.027 0.26 0.37
## African -0.8788 0.098 -1.07 -0.69
## Amerindian -0.8206 0.222 -1.26 -0.39
## East_Asian 0.1067 0.090 -0.07 0.28
## Oceanian -2.1592 0.912 -3.95 -0.37
## Central_Asian 0.0254 0.200 -0.37 0.42
## EFL 0.0048 0.081 -0.15 0.16
##
##
## Model meta-data
## outcome N df R2 R2-adj. R2-cv
## 1 CA 1363 1355 0.23 0.23 NA
##
##
## Etas from analysis of variance
## Eta Eta_partial
## S 0.2737 0.2980
## African 0.2140 0.2371
## Amerindian 0.0882 0.1001
## East_Asian 0.0282 0.0321
## Oceanian 0.0564 0.0642
## Central_Asian 0.0030 0.0034
## EFL 0.0014 0.0016
##
## [[6]]
##
## ---- Model summary ----
## Model coefficients
## Beta SE CI_lower CI_upper
## S 0.3458 0.026 0.29 0.398
## SIRE_standard: White 0.0000 NA NA NA
## SIRE_standard: African_American -0.6238 0.088 -0.80 -0.451
## SIRE_standard: American_Indian 0.4161 0.446 -0.46 1.291
## SIRE_standard: Asian 0.0519 0.091 -0.13 0.231
## SIRE_standard: Hispanic -0.3212 0.067 -0.45 -0.191
## SIRE_standard: Multi_ethnic -0.0762 0.076 -0.23 0.074
## SIRE_standard: Other -0.0088 0.233 -0.47 0.449
## SIRE_standard: Pacific_Islander -0.1951 0.229 -0.64 0.254
## EFL -0.0047 0.080 -0.16 0.153
##
##
## Model meta-data
## outcome N df R2 R2-adj. R2-cv
## 1 CA 1363 1353 0.22 0.21 NA
##
##
## Etas from analysis of variance
## Eta Eta_partial
## S 0.3156 0.3357
## SIRE_standard 0.1986 0.2188
## EFL 0.0014 0.0016
##
## [[7]]
##
## ---- Model summary ----
## Model coefficients
## Beta SE CI_lower CI_upper
## S 0.3189 0.027 0.267 0.371
## SIRE_common: White 0.0000 NA NA NA
## SIRE_common: African_American -0.6500 0.088 -0.822 -0.478
## SIRE_common: Asian 0.0464 0.091 -0.131 0.224
## SIRE_common: Asian, White 0.2524 0.121 0.015 0.490
## SIRE_common: Hispanic -0.5456 0.117 -0.775 -0.317
## SIRE_common: Hispanic, African_American -0.8163 0.173 -1.156 -0.477
## SIRE_common: Hispanic, White -0.2308 0.085 -0.398 -0.063
## SIRE_common: Pacific_Islander, Asian, White -0.0840 0.162 -0.401 0.233
## SIRE_common: Other -0.2303 0.074 -0.376 -0.084
## EFL -0.0036 0.080 -0.160 0.153
##
##
## Model meta-data
## outcome N df R2 R2-adj. R2-cv
## 1 CA 1363 1352 0.23 0.22 NA
##
##
## Etas from analysis of variance
## Eta Eta_partial
## S 0.2852 0.3091
## SIRE_common 0.2327 0.2564
## EFL 0.0011 0.0012
##
## [[8]]
##
## ---- Model summary ----
## Model coefficients
## Beta SE CI_lower CI_upper
## S 0.3202 0.027 0.267 0.37
## Hispanic_c -0.5524 0.099 -0.746 -0.36
## Pacific_Islander_c -0.3229 0.175 -0.666 0.02
## Asian_c 0.0917 0.084 -0.073 0.26
## African_American_c -0.6871 0.084 -0.852 -0.52
## American_Indian_c -0.2424 0.249 -0.732 0.25
## Other_c -0.0328 0.231 -0.485 0.42
## EFL 0.0047 0.079 -0.151 0.16
##
##
## Model meta-data
## outcome N df R2 R2-adj. R2-cv
## 1 CA 1363 1354 0.23 0.23 NA
##
##
## Etas from analysis of variance
## Eta Eta_partial
## S 0.2827 0.3066
## Hispanic_c 0.1334 0.1502
## Pacific_Islander_c 0.0440 0.0501
## Asian_c 0.0260 0.0296
## African_American_c 0.1951 0.2170
## American_Indian_c 0.0232 0.0264
## Other_c 0.0034 0.0039
## EFL 0.0014 0.0016
##
## [[9]]
##
## ---- Model summary ----
## Model coefficients
## Beta SE CI_lower CI_upper
## S 0.3207 0.027 0.27 0.374
## Hispanic -0.1969 0.060 -0.32 -0.078
## Pacific_Islander -0.1472 0.095 -0.33 0.038
## Asian 0.2731 0.070 0.14 0.410
## African_American -0.3377 0.079 -0.49 -0.182
## American_Indian -0.0395 0.115 -0.26 0.185
## Other 0.2415 0.237 -0.22 0.706
## White 0.2460 0.066 0.12 0.375
## EFL -0.0029 0.079 -0.16 0.153
##
##
## Model meta-data
## outcome N df R2 R2-adj. R2-cv
## 1 CA 1363 1353 0.23 0.22 NA
##
##
## Etas from analysis of variance
## Eta Eta_partial
## S 0.28140 0.30509
## Hispanic 0.07780 0.08823
## Pacific_Islander 0.03720 0.04231
## Asian 0.09309 0.10539
## African_American 0.10149 0.11477
## American_Indian 0.00824 0.00938
## Other 0.02434 0.02770
## White 0.08905 0.10086
## EFL 0.00087 0.00098
# summarize model results -------------------------------------------------
#beta stats by predictor and outcome
#across models, mean and range
betas %>%
#only main models
filter(model_id %in% 1:4) %>%
#loop over predictors
plyr::ddply(c("Predictor", "outcome"), function(x) {
#summarize the data with mean and range
this_sum = psych::describe(x$Beta) %>%
#type
as.data.frame()
#return a nice string
data.frame(
beta = sprintf("%.2f (%.2f to %.2f)", this_sum$mean, this_sum$min, this_sum$max)
)
}) %>%
#filter to ancestries
filter(Predictor %in% v_genomic) %>%
#spread beta
spread(key = outcome, value = beta) %>%
#output
write_clipboard()
## Predictor CA S
## 1 African -1.46 (-1.59 to -1.31) -1.47 (-1.76 to -1.23)
## 2 Amerindian -0.92 (-1.15 to -0.75) -1.90 (-2.06 to -1.76)
## 3 Central_Asian 0.17 (0.15 to 0.21) 0.81 (0.62 to 1.07)
## 4 East_Asian 0.03 (-0.01 to 0.06) 0.02 (-0.24 to 0.33)
## 5 Oceanian -2.76 (-4.81 to -1.32) -4.02 (-7.23 to -1.58)
#ancestry model fit by SIRE coding and outcome
rbind(
all_models_nonEuro[[1]] %>% mutate(outcome = "CA"),
all_models_nonEuro[[2]] %>% mutate(outcome = "S")
) %>%
#only include models with genomics and SIRE
filter(genomics, SIRE != "none") %>%
#retrieve r2adj
mutate(
r2_adj = map_dbl(fit, function(x) {
# browser()
x$meta$`R2-adj.`
})
) %>%
#out
write_clipboard()
## List columns were removed because they cannot be easily transformed to rectangular format. Colnames: fit
## SIRE Genomics Outcome R2 adj
## 1 standard TRUE CA 0.16
## 2 common TRUE CA 0.17
## 3 numeric TRUE CA 0.17
## 4 dummy TRUE CA 0.17
## 5 standard TRUE S 0.24
## 6 common TRUE S 0.25
## 7 numeric TRUE S 0.27
## 8 dummy TRUE S 0.28
#african amer SIREs
v_african_american_SIREs = c("SIRE_standard: African_American",
"SIRE_common: African_American",
"African_American",
"African_American_c",
"SIRE_common: Hispanic, African_American")
betas %>%
#only main models
filter(model_id %in% 1:4, Predictor %in% v_african_american_SIREs) %T>%
print() %$%
#descrip
psych::describe(Beta)
## Predictor Beta SE CI_lower CI_upper model_id outcome
## 1 SIRE_standard: African_American 0.305 0.18 -0.055 0.666 1 CA
## 2 SIRE_common: African_American 0.086 0.21 -0.330 0.502 2 CA
## 3 SIRE_common: Hispanic, African_American -0.382 0.23 -0.836 0.071 2 CA
## 4 African_American_c 0.287 0.24 -0.181 0.754 3 CA
## 5 African_American 0.263 0.15 -0.033 0.559 4 CA
## 6 SIRE_standard: African_American 0.283 0.17 -0.060 0.625 1 S
## 7 SIRE_common: African_American 0.017 0.20 -0.382 0.416 2 S
## 8 SIRE_common: Hispanic, African_American -0.398 0.22 -0.828 0.031 2 S
## 9 African_American_c 0.419 0.23 -0.030 0.868 3 S
## 10 African_American 0.309 0.14 0.029 0.588 4 S
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 10 0.12 0.29 0.27 0.15 0.13 -0.4 0.42 0.82 -0.86 -0.97 0.09
#hispanics
betas %>%
#only main models
filter(model_id %in% 1:4, str_detect(Predictor, "Hispanic")) %T>%
print() %$%
#descrip
psych::describe(Beta)
## Predictor Beta SE CI_lower CI_upper model_id outcome
## 1 SIRE_standard: Hispanic -0.165 0.099 -0.36 0.03029 1 CA
## 2 SIRE_common: Hispanic -0.421 0.162 -0.74 -0.10358 2 CA
## 3 SIRE_common: Hispanic, African_American -0.382 0.231 -0.84 0.07100 2 CA
## 4 SIRE_common: Hispanic, White -0.135 0.111 -0.35 0.08305 2 CA
## 5 Hispanic_c -0.460 0.153 -0.76 -0.16012 3 CA
## 6 Hispanic -0.182 0.081 -0.34 -0.02241 4 CA
## 7 SIRE_standard: Hispanic -0.209 0.094 -0.39 -0.02576 1 S
## 8 SIRE_common: Hispanic -0.207 0.152 -0.50 0.09090 2 S
## 9 SIRE_common: Hispanic, African_American -0.398 0.219 -0.83 0.03088 2 S
## 10 SIRE_common: Hispanic, White -0.061 0.105 -0.27 0.14402 2 S
## 11 Hispanic_c -0.278 0.142 -0.56 0.00084 3 S
## 12 Hispanic -0.109 0.075 -0.26 0.03851 4 S
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 12 -0.25 0.13 -0.21 -0.25 0.13 -0.46 -0.06 0.4 -0.25 -1.6 0.04
# plot model results ------------------------------------------------------
#eta comparison
etas_summary %>%
#only models that have both genomics and SIRE
filter(model_id %in% 1:4) %>%
ggplot(aes(model_id, total_eta, fill = class)) +
scale_x_discrete("SIRE coding", labels = c("standard", "common comb.", "continuous", "dummy")) +
geom_bar(stat = "identity", position = "dodge") +
xlab("Model id") + ylab("Total eta") +
theme_bw() +
#facet_grid(. ~ outcome)
facet_grid(outcome ~ .)
ggsave2("figures/model_etas.png")
#summary statistics, by outcome x class
etas_summary %>%
#only models that have both genomics and SIRE
filter(model_id %in% 1:4) %>%
plyr::ddply(c("outcome", "class"), function(x) {
psych::describe(x$total_eta)
})
## outcome class vars n mean sd median trimmed mad min max range skew kurtosis se
## 1 CA Genomic ancestry 1 4 0.182 0.0363 0.167 0.182 0.0087 0.1579 0.236 0.0781 0.711 -1.7 0.0182
## 2 CA Linguistic 1 4 0.011 0.0021 0.010 0.011 0.0014 0.0086 0.013 0.0048 0.439 -1.9 0.0010
## 3 CA SIRE 1 4 0.111 0.0195 0.112 0.111 0.0219 0.0881 0.133 0.0452 -0.038 -2.1 0.0098
## 4 S Genomic ancestry 1 4 0.246 0.0308 0.238 0.246 0.0181 0.2191 0.290 0.0706 0.522 -1.8 0.0154
## 5 S Linguistic 1 4 0.039 0.0067 0.039 0.039 0.0066 0.0320 0.048 0.0159 0.157 -2.0 0.0034
## 6 S SIRE 1 4 0.168 0.0478 0.175 0.168 0.0475 0.1098 0.213 0.1037 -0.186 -2.2 0.0239
#summary statistics, by class
etas_summary %>%
#only models that have both genomics and SIRE
filter(model_id %in% 1:4) %>%
plyr::ddply(c("class"), function(x) {
psych::describe(x$total_eta)
})
## class vars n mean sd median trimmed mad min max range skew kurtosis se
## 1 Genomic ancestry 1 8 0.214 0.046 0.226 0.214 0.055 0.1579 0.290 0.132 0.11 -1.5 0.0164
## 2 Linguistic 1 8 0.025 0.016 0.023 0.025 0.020 0.0086 0.048 0.039 0.17 -2.0 0.0057
## 3 SIRE 1 8 0.140 0.046 0.126 0.140 0.034 0.0881 0.213 0.125 0.54 -1.5 0.0161
#with parental
etas_summary_parental %>%
#only models that have both genomics and SIRE
filter(model_id %in% 1:4) %>%
ggplot(aes(model_id, total_eta, fill = class)) +
scale_x_discrete("SIRE coding", labels = c("standard", "common comb.", "continuous", "dummy")) +
geom_bar(stat = "identity", position = "dodge") +
xlab("Model id") + ylab("Total eta") +
theme_bw()
ggsave2("figures/model_etas_parental.png")
#summary statistics, by class
etas_summary_parental %>%
#only models that have both genomics and SIRE
filter(model_id %in% 1:4) %>%
plyr::ddply(c("class"), function(x) {
psych::describe(x$total_eta)
})
## class vars n mean sd median trimmed mad min max range skew kurtosis se
## 1 Genomic ancestry 1 4 0.1062 0.0249 0.09677 0.1062 0.0066 0.08815 0.1430 0.0548 0.688 -1.7 0.0124
## 2 Linguistic 1 4 0.0014 0.0012 0.00098 0.0014 0.0006 0.00039 0.0031 0.0027 0.585 -1.8 0.0006
## 3 Parental 1 4 0.2607 0.0061 0.26031 0.2607 0.0055 0.25367 0.2686 0.0149 0.146 -1.9 0.0031
## 4 SIRE 1 4 0.0770 0.0098 0.07648 0.0770 0.0118 0.06827 0.0866 0.0183 0.019 -2.4 0.0049
# non-Whites --------------------------------------------------------------
#ancestry vairation
d_nW[v_genomic] %>% psych::describe()
## vars n mean sd median trimmed mad min max range skew kurtosis se
## European 1 820 0.39 0.30 0.39 0.37 0.37 0 1.00 1.00 0.28 -1.04 0.01
## African 2 820 0.23 0.32 0.04 0.17 0.06 0 1.00 1.00 1.14 -0.33 0.01
## Amerindian 3 820 0.08 0.15 0.00 0.04 0.00 0 0.83 0.83 2.10 4.19 0.01
## East_Asian 4 820 0.25 0.36 0.00 0.19 0.00 0 1.00 1.00 1.09 -0.38 0.01
## Oceanian 5 820 0.01 0.04 0.00 0.00 0.00 0 0.25 0.25 3.88 16.25 0.00
## Central_Asian 6 820 0.04 0.16 0.00 0.00 0.00 0 1.00 1.00 5.21 26.79 0.01
#zero order
wtd.cors(d_nW[c("CA", "S", v_genomic)]) %>% write_clipboard()
## CA S European African Amerindian East Asian Oceanian Central Asian
## CA 1.00 0.41 0.08 -0.32 -0.10 0.21 -0.04 0.11
## S 0.41 1.00 0.11 -0.21 -0.17 0.11 -0.17 0.18
## European 0.08 0.11 1.00 -0.32 0.25 -0.54 -0.24 -0.18
## African -0.32 -0.21 -0.32 1.00 -0.24 -0.45 -0.20 -0.13
## Amerindian -0.10 -0.17 0.25 -0.24 1.00 -0.33 -0.17 -0.10
## East Asian 0.21 0.11 -0.54 -0.45 -0.33 1.00 0.37 -0.12
## Oceanian -0.04 -0.17 -0.24 -0.20 -0.17 0.37 1.00 -0.05
## Central Asian 0.11 0.18 -0.18 -0.13 -0.10 -0.12 -0.05 1.00
count.pairwise(d_nW[c("CA", "S", v_genomic)])
## CA S European African Amerindian East_Asian Oceanian Central_Asian
## CA 802 797 802 802 802 802 802 802
## S 797 810 810 810 810 810 810 810
## European 802 810 820 820 820 820 820 820
## African 802 810 820 820 820 820 820 820
## Amerindian 802 810 820 820 820 820 820 820
## East_Asian 802 810 820 820 820 820 820 820
## Oceanian 802 810 820 820 820 820 820 820
## Central_Asian 802 810 820 820 820 820 820 820
#models
model_ca_genomic_dummy = "CA ~ " + str_c(c(v_SIREs, v_genomic[-1], "EFL"), collapse = " + ")
model_s_genomic_dummy = "S ~ " + str_c(c(v_SIREs, v_genomic[-1], "EFL"), collapse = " + ")
#fit full models
lm(model_ca_genomic_dummy, data = d_nW) %>% MOD_summary(progress = F, standardize = F, kfold = F) %>% write_clipboard()
## V1 V2 V3 V4 V5 V6
## 1 coefs
## 2 Predictor Beta SE CI lower CI upper
## 3 Hispanic -0.20 0.10 -0.40 0.00
## 4 Pacific Islander -0.45 0.14 -0.72 -0.18
## 5 Asian 0.20 0.13 -0.06 0.46
## 6 African American 0.30 0.16 -0.02 0.61
## 7 American Indian -0.12 0.13 -0.36 0.13
## 8 Other 0.16 0.25 -0.33 0.64
## 9 White 0.20 0.09 0.02 0.38
## 10 African -1.47 0.26 -1.98 -0.96
## 11 Amerindian -0.81 0.34 -1.48 -0.15
## 12 East Asian 0.00 0.21 -0.41 0.41
## 13 Oceanian -1.34 1.37 -4.02 1.34
## 14 Central Asian 0.10 0.27 -0.44 0.64
## 15 EFL 0.09 0.10 -0.10 0.29
## 16
## 17 meta
## 18 outcome N df R2 R2-adj R2-cv
## 19 CA 802 788 0.19 0.18
## 20
## 21 etas
## 22 Predictor Eta Eta partial
## 23 Hispanic 0.06 0.07
## 24 Pacific Islander 0.11 0.12
## 25 Asian 0.05 0.05
## 26 African American 0.06 0.07
## 27 American Indian 0.03 0.03
## 28 Other 0.02 0.02
## 29 White 0.07 0.08
## 30 African 0.18 0.20
## 31 Amerindian 0.08 0.08
## 32 East Asian 0.00 0.00
## 33 Oceanian 0.03 0.03
## 34 Central Asian 0.01 0.01
## 35 EFL 0.03 0.03
lm(model_s_genomic_dummy, data = d_nW) %>% MOD_summary(progress = F, standardize = F, kfold = F) %>% write_clipboard()
## V1 V2 V3 V4 V5 V6
## 1 coefs
## 2 Predictor Beta SE CI lower CI upper
## 3 Hispanic -0.16 0.10 -0.35 0.03
## 4 Pacific Islander -1.00 0.13 -1.25 -0.74
## 5 Asian 0.04 0.13 -0.21 0.29
## 6 African American 0.28 0.16 -0.03 0.59
## 7 American Indian -0.21 0.12 -0.45 0.02
## 8 Other -0.17 0.26 -0.69 0.34
## 9 White 0.28 0.09 0.11 0.45
## 10 African -1.40 0.26 -1.91 -0.90
## 11 Amerindian -1.68 0.33 -2.32 -1.04
## 12 East Asian 0.02 0.20 -0.38 0.42
## 13 Oceanian -1.53 1.31 -4.10 1.04
## 14 Central Asian 0.70 0.27 0.17 1.23
## 15 EFL 0.28 0.09 0.10 0.46
## 16
## 17 meta
## 18 outcome N df R2 R2-adj R2-cv
## 19 S 810 796 0.26 0.25
## 20
## 21 etas
## 22 Predictor Eta Eta partial
## 23 Hispanic 0.05 0.06
## 24 Pacific Islander 0.23 0.26
## 25 Asian 0.01 0.01
## 26 African American 0.05 0.06
## 27 American Indian 0.05 0.06
## 28 Other 0.02 0.02
## 29 White 0.10 0.11
## 30 African 0.17 0.19
## 31 Amerindian 0.16 0.18
## 32 East Asian 0.00 0.00
## 33 Oceanian 0.04 0.04
## 34 Central Asian 0.08 0.09
## 35 EFL 0.09 0.11
# african americans narrow -------------------------------------------------------
#ancestry vairation
d_AA_narrow[v_genomic] %>% psych::describe()
## vars n mean sd median trimmed mad min max range skew kurtosis se
## European 1 140 0.17 0.11 0.15 0.16 0.08 0.00 0.61 0.61 1.1 2.0 0.01
## African 2 140 0.81 0.12 0.84 0.82 0.09 0.39 1.00 0.61 -1.1 1.3 0.01
## Amerindian 3 140 0.00 0.01 0.00 0.00 0.00 0.00 0.05 0.05 3.2 9.5 0.00
## East_Asian 4 140 0.00 0.03 0.00 0.00 0.00 0.00 0.20 0.20 6.7 46.7 0.00
## Oceanian 5 140 0.00 0.01 0.00 0.00 0.00 0.00 0.04 0.04 5.9 36.0 0.00
## Central_Asian 6 140 0.01 0.02 0.00 0.00 0.00 0.00 0.16 0.16 4.5 19.8 0.00
#zero order
wtd.cors(d_AA_narrow[c("CA", "S", v_genomic)]) %>% write_clipboard()
## CA S European African Amerindian East Asian Oceanian Central Asian
## CA 1.00 0.31 0.17 -0.22 -0.08 0.16 0.00 0.13
## S 0.31 1.00 0.27 -0.31 0.05 0.10 0.01 0.17
## European 0.17 0.27 1.00 -0.93 0.02 0.04 -0.21 -0.05
## African -0.22 -0.31 -0.93 1.00 -0.07 -0.35 0.16 -0.26
## Amerindian -0.08 0.05 0.02 -0.07 1.00 -0.04 -0.06 -0.07
## East Asian 0.16 0.10 0.04 -0.35 -0.04 1.00 -0.03 0.52
## Oceanian 0.00 0.01 -0.21 0.16 -0.06 -0.03 1.00 0.02
## Central Asian 0.13 0.17 -0.05 -0.26 -0.07 0.52 0.02 1.00
count.pairwise(d_AA_narrow[c("CA", "S", v_genomic)])
## CA S European African Amerindian East_Asian Oceanian Central_Asian
## CA 138 137 138 138 138 138 138 138
## S 137 139 139 139 139 139 139 139
## European 138 139 140 140 140 140 140 140
## African 138 139 140 140 140 140 140 140
## Amerindian 138 139 140 140 140 140 140 140
## East_Asian 138 139 140 140 140 140 140 140
## Oceanian 138 139 140 140 140 140 140 140
## Central_Asian 138 139 140 140 140 140 140 140
#model stats
#CA
lm("CA ~ African", data = d_AA_narrow) %>% MOD_summary(standardize = F, progress = F, kfold = F) %>% write_clipboard()
## V1 V2 V3 V4 V5 V6
## 1 coefs
## 2 Predictor Beta SE CI lower CI upper
## 3 African -1.60 0.61 -2.81 -0.39
## 4
## 5 meta
## 6 outcome N df R2 R2-adj R2-cv
## 7 CA 138 136 0.048 0.041
## 8
## 9 etas
## 10 Predictor Eta Eta partial
## 11 African 0.22 0.22
lm("CA ~ European", data = d_AA_narrow) %>% MOD_summary(standardize = F, progress = F, kfold = F) %>% write_clipboard()
## V1 V2 V3 V4 V5 V6
## 1 coefs
## 2 Predictor Beta SE CI lower CI upper
## 3 European 1.36 0.66 0.06 2.66
## 4
## 5 meta
## 6 outcome N df R2 R2-adj R2-cv
## 7 CA 138 136 0.031 0.023
## 8
## 9 etas
## 10 Predictor Eta Eta partial
## 11 European 0.17 0.17
#model predictions
(AA_preds_euro = predict(lm("CA ~ European", data = d_AA_narrow), newdata = data.frame(European = c(0, 1))))
## 1 2
## -0.92 0.45
(AA_preds_african = predict(lm("CA ~ African", data = d_AA_narrow), newdata = data.frame(African = c(0, 1))))
## 1 2
## 0.62 -0.98
#gap sizes
AA_preds_euro[1] - AA_preds_euro[2]
## 1
## -1.4
AA_preds_african[2] - AA_preds_african[1]
## 2
## -1.6
#S
lm("S ~ African", data = d_AA_narrow) %>% MOD_summary(standardize = F, progress = F, kfold = F) %>% write_clipboard()
## V1 V2 V3 V4 V5 V6
## 1 coefs
## 2 Predictor Beta SE CI lower CI upper
## 3 African -2.50 0.65 -3.79 -1.22
## 4
## 5 meta
## 6 outcome N df R2 R2-adj R2-cv
## 7 S 139 137 0.098 0.091
## 8
## 9 etas
## 10 Predictor Eta Eta partial
## 11 African 0.31 0.31
lm("S ~ European", data = d_AA_narrow) %>% MOD_summary(standardize = F, progress = F, kfold = F) %>% write_clipboard()
## V1 V2 V3 V4 V5 V6
## 1 coefs
## 2 Predictor Beta SE CI lower CI upper
## 3 European 2.28 0.70 0.89 3.66
## 4
## 5 meta
## 6 outcome N df R2 R2-adj R2-cv
## 7 S 139 137 0.072 0.065
## 8
## 9 etas
## 10 Predictor Eta Eta partial
## 11 European 0.27 0.27
#model predictions
(AA_preds_euro = predict(lm("S ~ European", data = d_AA_narrow), newdata = data.frame(European = c(0, 1))))
## 1 2
## -0.96 1.32
(AA_preds_african = predict(lm("S ~ African", data = d_AA_narrow), newdata = data.frame(African = c(0, 1))))
## 1 2
## 1.5 -1.0
#gap sizes
AA_preds_euro[1] - AA_preds_euro[2]
## 1
## -2.3
AA_preds_african[2] - AA_preds_african[1]
## 2
## -2.5
#calculate group means
d_W_AA_means = d_W_AA %>% plyr::ddply("SIRE_standard", function(x) {
data.frame(
African = wtd_mean(x[["African"]]),
Amerindian = wtd_mean(x[["Amerindian"]]),
European = wtd_mean(x[["European"]]),
African_European_r = wtd.cors(x[["African"]], x[["European"]]) %>% as.vector,
CA = wtd_mean(x[["CA"]]),
n = nrow(x)
)
})
#African ancestry
ggplot(d_W_AA, aes(African, CA, color = SIRE_standard)) +
geom_point() +
geom_smooth(data = d_AA_narrow, method = lm, fullrange = T) +
scale_x_continuous("African ancestry%", limits = c(0, 1), labels = scales::percent) +
scale_color_discrete(name = "SIRE (standard)") +
scale_y_continuous("Cognitive ability", breaks = -3:3) +
geom_point(data = d_W_AA_means, size = 5) +
theme_bw()
## Warning: Removed 2 rows containing non-finite values (stat_smooth).
## Warning: Removed 6 rows containing missing values (geom_point).
ggsave2("figures/AA_African.png")
## Warning: Removed 2 rows containing non-finite values (stat_smooth).
## Warning: Removed 6 rows containing missing values (geom_point).
#European
ggplot(d_W_AA, aes(European, CA, color = SIRE_standard)) +
geom_point() +
geom_smooth(data = d_AA_narrow, method = lm, fullrange = T) +
scale_x_continuous("European ancestry%", limits = c(0, 1), labels = scales::percent) +
scale_y_continuous("Cognitive ability", breaks = -3:3) +
scale_color_discrete(name = "SIRE (standard)") +
geom_point(data = d_W_AA_means, size = 5) +
theme_bw()
## Warning: Removed 2 rows containing non-finite values (stat_smooth).
## Warning: Removed 6 rows containing missing values (geom_point).
ggsave2("figures/AA_European.png")
## Warning: Removed 2 rows containing non-finite values (stat_smooth).
## Warning: Removed 6 rows containing missing values (geom_point).
# african americans broad -------------------------------------------------------
#ancestry vairation
d_AA_broad[v_genomic] %>% psych::describe()
## vars n mean sd median trimmed mad min max range skew kurtosis se
## European 1 228 0.26 0.19 0.20 0.24 0.15 0 1.00 1.00 1.14 1.11 0.01
## African 2 228 0.68 0.23 0.77 0.71 0.19 0 1.00 1.00 -0.89 -0.03 0.02
## Amerindian 3 228 0.01 0.04 0.00 0.00 0.00 0 0.32 0.32 4.22 21.32 0.00
## East_Asian 4 228 0.03 0.12 0.00 0.00 0.00 0 1.00 1.00 4.82 27.80 0.01
## Oceanian 5 228 0.00 0.01 0.00 0.00 0.00 0 0.13 0.13 5.87 41.30 0.00
## Central_Asian 6 228 0.01 0.04 0.00 0.00 0.00 0 0.44 0.44 6.94 58.83 0.00
#zero order
wtd.cors(d_AA_broad[c("CA", "S", v_genomic)]) %>% write_clipboard()
## CA S European African Amerindian East Asian Oceanian Central Asian
## CA 1.00 0.33 0.20 -0.29 -0.08 0.23 0.05 0.14
## S 0.33 1.00 0.16 -0.19 -0.09 0.10 -0.17 0.16
## European 0.20 0.16 1.00 -0.81 0.21 -0.06 0.01 -0.13
## African -0.29 -0.19 -0.81 1.00 -0.30 -0.47 -0.30 -0.09
## Amerindian -0.08 -0.09 0.21 -0.30 1.00 -0.08 -0.07 -0.02
## East Asian 0.23 0.10 -0.06 -0.47 -0.08 1.00 0.50 0.07
## Oceanian 0.05 -0.17 0.01 -0.30 -0.07 0.50 1.00 -0.04
## Central Asian 0.14 0.16 -0.13 -0.09 -0.02 0.07 -0.04 1.00
count.pairwise(d_AA_broad[c("CA", "S", v_genomic)])
## CA S European African Amerindian East_Asian Oceanian Central_Asian
## CA 225 224 225 225 225 225 225 225
## S 224 227 227 227 227 227 227 227
## European 225 227 228 228 228 228 228 228
## African 225 227 228 228 228 228 228 228
## Amerindian 225 227 228 228 228 228 228 228
## East_Asian 225 227 228 228 228 228 228 228
## Oceanian 225 227 228 228 228 228 228 228
## Central_Asian 225 227 228 228 228 228 228 228
#model stats
#CA
lm("CA ~ African", data = d_AA_broad) %>% MOD_summary(standardize = F, progress = F, kfold = F) %>% write_clipboard()
## V1 V2 V3 V4 V5 V6
## 1 coefs
## 2 Predictor Beta SE CI lower CI upper
## 3 African -1.24 0.27 -1.77 -0.71
## 4
## 5 meta
## 6 outcome N df R2 R2-adj R2-cv
## 7 CA 225 223 0.087 0.083
## 8
## 9 etas
## 10 Predictor Eta Eta partial
## 11 African 0.29 0.29
lm("CA ~ European", data = d_AA_broad) %>% MOD_summary(standardize = F, progress = F, kfold = F) %>% write_clipboard()
## V1 V2 V3 V4 V5 V6
## 1 coefs
## 2 Predictor Beta SE CI lower CI upper
## 3 European 0.99 0.33 0.34 1.64
## 4
## 5 meta
## 6 outcome N df R2 R2-adj R2-cv
## 7 CA 225 223 0.038 0.034
## 8
## 9 etas
## 10 Predictor Eta Eta partial
## 11 European 0.20 0.20
#model predictions
(AA_preds_euro = predict(lm("CA ~ European", data = d_AA_broad), newdata = data.frame(European = c(0, 1))))
## 1 2
## -0.86 0.13
(AA_preds_african = predict(lm("CA ~ African", data = d_AA_broad), newdata = data.frame(African = c(0, 1))))
## 1 2
## 0.24 -0.99
#gap sizes
AA_preds_euro[1] - AA_preds_euro[2]
## 1
## -0.99
AA_preds_african[2] - AA_preds_african[1]
## 2
## -1.2
#S
lm("S ~ African", data = d_AA_broad) %>% MOD_summary(standardize = F, progress = F, kfold = F) %>% write_clipboard()
## V1 V2 V3 V4 V5 V6
## 1 coefs
## 2 Predictor Beta SE CI lower CI upper
## 3 African -0.77 0.27 -1.30 -0.24
## 4
## 5 meta
## 6 outcome N df R2 R2-adj R2-cv
## 7 S 227 225 0.035 0.031
## 8
## 9 etas
## 10 Predictor Eta Eta partial
## 11 African 0.19 0.19
lm("S ~ European", data = d_AA_broad) %>% MOD_summary(standardize = F, progress = F, kfold = F) %>% write_clipboard()
## V1 V2 V3 V4 V5 V6
## 1 coefs
## 2 Predictor Beta SE CI lower CI upper
## 3 European 0.81 0.33 0.17 1.45
## 4
## 5 meta
## 6 outcome N df R2 R2-adj R2-cv
## 7 S 227 225 0.027 0.022
## 8
## 9 etas
## 10 Predictor Eta Eta partial
## 11 European 0.16 0.16
#model predictions
(AA_preds_euro = predict(lm("S ~ European", data = d_AA_broad), newdata = data.frame(European = c(0, 1))))
## 1 2
## -0.735 0.074
(AA_preds_african = predict(lm("S ~ African", data = d_AA_broad), newdata = data.frame(African = c(0, 1))))
## 1 2
## 0.0018 -0.7697
#gap sizes
AA_preds_euro[1] - AA_preds_euro[2]
## 1
## -0.81
AA_preds_african[2] - AA_preds_african[1]
## 2
## -0.77
# Hispanics ---------------------------------------------------------------
#ancestry variation
d_HA[v_genomic] %>% psych::describe()
## vars n mean sd median trimmed mad min max range skew kurtosis se
## European 1 328 0.57 0.23 0.56 0.58 0.25 0 1.00 1.00 -0.19 -0.53 0.01
## African 2 328 0.14 0.20 0.05 0.09 0.08 0 0.91 0.91 1.95 3.27 0.01
## Amerindian 3 328 0.19 0.17 0.15 0.17 0.18 0 0.83 0.83 0.91 0.44 0.01
## East_Asian 4 328 0.09 0.20 0.00 0.03 0.00 0 1.00 1.00 2.71 6.85 0.01
## Oceanian 5 328 0.01 0.02 0.00 0.00 0.00 0 0.16 0.16 4.17 20.32 0.00
## Central_Asian 6 328 0.01 0.03 0.00 0.00 0.00 0 0.46 0.46 9.51 117.77 0.00
#zero order
wtd.cors(d_HA[c("CA", "S", v_genomic)]) %>% write_clipboard()
## CA S European African Amerindian East Asian Oceanian Central Asian
## CA 1.00 0.33 0.23 -0.23 -0.07 0.01 -0.09 0.08
## S 0.33 1.00 0.42 -0.24 -0.18 -0.09 -0.22 0.07
## European 0.23 0.42 1.00 -0.41 -0.21 -0.54 -0.34 -0.04
## African -0.23 -0.24 -0.41 1.00 -0.31 -0.21 -0.12 -0.06
## Amerindian -0.07 -0.18 -0.21 -0.31 1.00 -0.28 -0.29 0.01
## East Asian 0.01 -0.09 -0.54 -0.21 -0.28 1.00 0.67 -0.06
## Oceanian -0.09 -0.22 -0.34 -0.12 -0.29 0.67 1.00 -0.05
## Central Asian 0.08 0.07 -0.04 -0.06 0.01 -0.06 -0.05 1.00
count.pairwise(d_HA[c("CA", "S", v_genomic)])
## CA S European African Amerindian East_Asian Oceanian Central_Asian
## CA 323 323 323 323 323 323 323 323
## S 323 328 328 328 328 328 328 328
## European 323 328 328 328 328 328 328 328
## African 323 328 328 328 328 328 328 328
## Amerindian 323 328 328 328 328 328 328 328
## East_Asian 323 328 328 328 328 328 328 328
## Oceanian 323 328 328 328 328 328 328 328
## Central_Asian 323 328 328 328 328 328 328 328
#3 ancestries non euro
HA_3way_ca = lm(CA ~ African + Amerindian + East_Asian, data = d_HA)
HA_3way_s = lm(S ~ African + Amerindian + East_Asian, data = d_HA)
HA_3way_ca %>% MOD_summary(standardize = F, kfold = F) %>% write_clipboard()
## V1 V2 V3 V4 V5 V6
## 1 coefs
## 2 Predictor Beta SE CI lower CI upper
## 3 African -1.62 0.30 -2.22 -1.03
## 4 Amerindian -1.16 0.34 -1.84 -0.49
## 5 East Asian -0.53 0.28 -1.09 0.02
## 6
## 7 meta
## 8 outcome N df R2 R2-adj R2-cv
## 9 CA 323 319 0.089 0.08
## 10
## 11 etas
## 12 Predictor Eta Eta partial
## 13 African 0.29 0.29
## 14 Amerindian 0.18 0.19
## 15 East Asian 0.10 0.10
HA_3way_s %>% MOD_summary(standardize = F, kfold = F) %>% write_clipboard()
## V1 V2 V3 V4 V5 V6
## 1 coefs
## 2 Predictor Beta SE CI lower CI upper
## 3 African -2.00 0.26 -2.52 -1.48
## 4 Amerindian -2.09 0.30 -2.69 -1.49
## 5 East Asian -1.32 0.25 -1.81 -0.82
## 6
## 7 meta
## 8 outcome N df R2 R2-adj R2-cv
## 9 S 328 324 0.19 0.19
## 10
## 11 etas
## 12 Predictor Eta Eta partial
## 13 African 0.38 0.39
## 14 Amerindian 0.34 0.36
## 15 East Asian 0.26 0.28
#predicted gaps
#CA
(HA_3way_ca_preds = predict(HA_3way_ca, newdata = data.frame(African = c(1, 0, 0, 0),
Amerindian = c(0, 1, 0, 0),
East_Asian = c(0, 0, 1, 0))))
## 1 2 3 4
## -1.46 -1.00 -0.37 0.17
HA_3way_ca_preds[1:3] - HA_3way_ca_preds[4]
## 1 2 3
## -1.62 -1.16 -0.53
#S
(HA_3way_s_preds = predict(HA_3way_s, newdata = data.frame(African = c(1, 0, 0, 0),
Amerindian = c(0, 1, 0, 0),
East_Asian = c(0, 0, 1, 0))))
## 1 2 3 4
## -1.65 -1.74 -0.97 0.35
HA_3way_s_preds[1:3] - HA_3way_s_preds[4]
## 1 2 3
## -2.0 -2.1 -1.3
#calculate group means
d_W_H_means = d_W_H %>% plyr::ddply("SIRE_standard", function(x) {
data.frame(
African = wtd_mean(x[["African"]]),
Amerindian = wtd_mean(x[["Amerindian"]]),
European = wtd_mean(x[["European"]]),
African_European_r = wtd.cors(x[["African"]], x[["European"]]) %>% as.vector,
CA = wtd_mean(x[["CA"]]),
n = nrow(x)
)
})
#African ancestry
ggplot(d_W_H, aes(African, CA, color = SIRE_standard)) +
geom_point() +
geom_smooth(data = d_HA, method = lm, fullrange = T) +
scale_x_continuous("African ancestry%", limits = c(0, 1), labels = scales::percent) +
scale_y_continuous("Cognitive ability", breaks = -3:3) +
scale_color_discrete(name = "SIRE (standard)") +
geom_point(data = d_W_H_means, size = 5) +
theme_bw()
## Warning: Removed 5 rows containing non-finite values (stat_smooth).
## Warning: Removed 9 rows containing missing values (geom_point).
ggsave2("figures/H_African.png")
## Warning: Removed 5 rows containing non-finite values (stat_smooth).
## Warning: Removed 9 rows containing missing values (geom_point).
#Amerindian
ggplot(d_W_H, aes(Amerindian, CA, color = SIRE_standard)) +
geom_point() +
geom_smooth(data = d_HA, method = lm, fullrange = T) +
scale_x_continuous("Amerindian ancestry%", limits = c(0, 1), labels = scales::percent) +
scale_color_discrete(name = "SIRE (standard)") +
scale_y_continuous("Cognitive ability", breaks = -3:3) +
geom_point(data = d_W_H_means, size = 5) +
theme_bw()
## Warning: Removed 5 rows containing non-finite values (stat_smooth).
## Warning: Removed 9 rows containing missing values (geom_point).
ggsave2("figures/H_Amerindian.png")
## Warning: Removed 5 rows containing non-finite values (stat_smooth).
## Warning: Removed 9 rows containing missing values (geom_point).
#European
ggplot(d_W_H, aes(European, CA, color = SIRE_standard)) +
geom_point() +
geom_smooth(data = d_HA, method = lm, fullrange = T) +
scale_x_continuous("European ancestry%", limits = c(0, 1), labels = scales::percent) +
scale_y_continuous("Cognitive ability", breaks = -3:3) +
scale_color_discrete(name = "SIRE (standard)") +
geom_point(data = d_W_H_means, size = 5) +
theme_bw()
## Warning: Removed 5 rows containing non-finite values (stat_smooth).
## Warning: Removed 9 rows containing missing values (geom_point).
ggsave2("figures/H_European.png")
## Warning: Removed 5 rows containing non-finite values (stat_smooth).
## Warning: Removed 9 rows containing missing values (geom_point).
# 3d plot -----------------------------------------------------------------
#can only be done for persons where 3 ancestries sum to ~100%
#we choose european, amerindian, african for this purpose
d_triway = d_ping %>% filter(African + Amerindian + European >= .99)
nrow(d_triway)
## [1] 887
#fit model
(triway_fit = lm(CA ~ African + Amerindian, data = d_triway) %>% MOD_summary(standardize = F))
##
## ---- Model summary ----
## Model coefficients
## Beta SE CI_lower CI_upper
## African -1.4 0.099 -1.6 -1.2
## Amerindian -1.9 0.285 -2.4 -1.3
##
##
## Model meta-data
## outcome N df R2 R2-adj. R2-cv
## 1 CA 874 871 0.2 0.2 NA
##
##
## Etas from analysis of variance
## Eta Eta_partial
## African 0.42 0.42
## Amerindian 0.20 0.22
summary(triway_fit$model_obj)
##
## Call:
## lm(formula = CA ~ African + Amerindian, data = d_triway)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.630 -0.532 0.024 0.588 3.043
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.2914 0.0381 7.64 5.6e-14 ***
## African -1.3648 0.0993 -13.74 < 2e-16 ***
## Amerindian -1.8633 0.2853 -6.53 1.1e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9 on 871 degrees of freedom
## (13 observations deleted due to missingness)
## Multiple R-squared: 0.198, Adjusted R-squared: 0.196
## F-statistic: 107 on 2 and 871 DF, p-value: <2e-16
#plot in 3D
# car::scatter3d(CA ~ African + Amerindian, data = d_triway, axis.ticks = T, axis.scales = T)
#another package
library(plot3D)
#prepare surface fit
# predict values on regular xy grid
n_grid_points = 25
triway_surface_data = list()
triway_surface_data$x.pred <- seq(0, 1, length.out = n_grid_points)
triway_surface_data$y.pred <- seq(0, 1, length.out = n_grid_points)
triway_surface_data$xy <- expand.grid( African = triway_surface_data$x.pred, Amerindian = triway_surface_data$y.pred)
triway_surface_data$xy$sum = rowSums(triway_surface_data$xy)
triway_surface_data$z.pred <- matrix(predict(triway_fit$model_obj, newdata = triway_surface_data$xy),
nrow = n_grid_points, ncol = n_grid_points)
#code impossible values as NA
#ancestry sums to 1, one cannot be 100% african and 100% amerindian!
triway_surface_data$z.pred[matrix(triway_surface_data$xy$sum > 1, nrow = n_grid_points, ncol = n_grid_points)] = NA
#make title
triway_surface_data$title = glue::glue("Cognitive ability as a function of genomic ancesty\npersons with European, African and Amerindian ancestry only\nn={nrow(d_triway)}")
#plot
d_triway %$% scatter3D(x = African, y = Amerindian, z = CA,
theta = 120, phi = 20,
xlim = c(0, 1), ylim = c(0, 1),
ticktype = "detailed",
xlab = "African ancestry", ylab = "Amerindian ancestry", zlab = "Cognitive ability",
surf = list(x = triway_surface_data$x.pred, y = triway_surface_data$y.pred, z = triway_surface_data$z.pred,
facets = NA),
main = triway_surface_data$title)
#save
png("figures/triway_points_surface.png")
d_triway %$% scatter3D(x = African, y = Amerindian, z = CA,
theta = 120, phi = 20,
xlim = c(0, 1), ylim = c(0, 1),
ticktype = "detailed",
xlab = "African ancestry", ylab = "Amerindian ancestry", zlab = "Cognitive ability",
surf = list(x = triway_surface_data$x.pred, y = triway_surface_data$y.pred, z = triway_surface_data$z.pred,
facets = NA),
main = triway_surface_data$title)
dev.off()
## png
## 2
#save without caption (for paper)
png("figures/triway_points_surface_no_caption.png")
d_triway %$% scatter3D(x = African, y = Amerindian, z = CA,
theta = 120, phi = 20,
xlim = c(0, 1), ylim = c(0, 1),
ticktype = "detailed",
xlab = "African ancestry", ylab = "Amerindian ancestry", zlab = "Cognitive ability",
surf = list(x = triway_surface_data$x.pred, y = triway_surface_data$y.pred, z = triway_surface_data$z.pred,
facets = NA)
)
dev.off()
## png
## 2
# eta example -------------------------------------------------------------
#get the eta and eta^2
all_models_nonEuro[[1]]$fit[[1]]$aov %>% lsr::etaSquared() %>% as.data.frame %>% mutate(eta = sqrt(eta.sq)) %>% write_clipboard(digits = 4)
## Eta sq Eta sq part Eta
## 1 0.0078 0.0092 0.0881
## 2 0.0379 0.0436 0.1947
## 3 0.0086 0.0102 0.0925
## 4 0.0001 0.0001 0.0078
## 5 0.0088 0.0104 0.0936
## 6 0.0004 0.0005 0.0203
## 7 0.0001 0.0001 0.0086
#sum the genomic preds eta^2
all_models_nonEuro[[1]]$fit[[1]]$aov %>% lsr::etaSquared() %>% as.data.frame %>% mutate(eta = sqrt(eta.sq)) %>% `[`(2:6, 1) %>% sum
## [1] 0.056
#sqrt
all_models_nonEuro[[1]]$fit[[1]]$aov %>% lsr::etaSquared() %>% as.data.frame %>% mutate(eta = sqrt(eta.sq)) %>% `[`(2:6, 1) %>% sum %>% sqrt
## [1] 0.24
#predict african ancestry from non-euro ancestry
lm("African ~ Amerindian + Asian + Central_Asian + Oceanian", data = d_ping) %>% MOD_summary(progress = F, standardize = F)
##
## ---- Model summary ----
## Model coefficients
## Beta SE CI_lower CI_upper
## Amerindian -0.284 0.061 -0.40 -0.165
## Asian -0.138 0.018 -0.17 -0.102
## Central_Asian -0.086 0.059 -0.20 0.029
## Oceanian -0.582 0.249 -1.07 -0.093
##
##
## Model meta-data
## outcome N df R2 R2-adj. R2-cv
## 1 African 1391 1386 0.063 0.06 NA
##
##
## Etas from analysis of variance
## Eta Eta_partial
## Amerindian 0.121 0.124
## Asian 0.198 0.200
## Central_Asian 0.038 0.039
## Oceanian 0.061 0.063
# other models ------------------------------------------------------------
#some other results that may have marginal interest
#only European ancestry + SIREs
#CA
all_models_euro[[1]][["fit"]]
## [[1]]
##
## ---- Model summary ----
## Model coefficients
## Beta SE CI_lower CI_upper
## SIRE_standard: White 0.0000 NA NA NA
## SIRE_standard: African_American -0.4184 0.147 -0.706 -0.13
## SIRE_standard: American_Indian 0.1310 0.472 -0.794 1.06
## SIRE_standard: Asian 0.6110 0.164 0.289 0.93
## SIRE_standard: Hispanic -0.3409 0.088 -0.514 -0.17
## SIRE_standard: Multi_ethnic 0.1252 0.111 -0.092 0.34
## SIRE_standard: Other -0.0380 0.228 -0.484 0.41
## SIRE_standard: Pacific_Islander -0.1732 0.268 -0.700 0.35
## European 0.6890 0.146 0.403 0.98
## EFL 0.0038 0.085 -0.163 0.17
##
##
## Model meta-data
## outcome N df R2 R2-adj. R2-cv
## 1 CA 1369 1359 0.13 0.13 NA
##
##
## Etas from analysis of variance
## Eta Eta_partial
## SIRE_standard 0.2766 0.2845
## European 0.1194 0.1271
## EFL 0.0011 0.0012
##
## [[2]]
##
## ---- Model summary ----
## Model coefficients
## Beta SE CI_lower CI_upper
## SIRE_common: White 0.000 NA NA NA
## SIRE_common: African_American -0.564 0.149 -0.86 -0.272
## SIRE_common: Asian 0.447 0.166 0.12 0.773
## SIRE_common: Asian, White 0.549 0.144 0.27 0.831
## SIRE_common: Hispanic -0.654 0.135 -0.92 -0.388
## SIRE_common: Hispanic, African_American -0.894 0.203 -1.29 -0.495
## SIRE_common: Hispanic, White -0.281 0.097 -0.47 -0.092
## SIRE_common: Pacific_Islander, Asian, White -0.114 0.188 -0.48 0.255
## SIRE_common: Other -0.248 0.110 -0.46 -0.031
## European 0.507 0.151 0.21 0.803
## EFL 0.018 0.084 -0.15 0.184
##
##
## Model meta-data
## outcome N df R2 R2-adj. R2-cv
## 1 CA 1369 1358 0.16 0.15 NA
##
##
## Etas from analysis of variance
## Eta Eta_partial
## SIRE_common 0.3203 0.3293
## European 0.0837 0.0908
## EFL 0.0054 0.0058
##
## [[3]]
##
## ---- Model summary ----
## Model coefficients
## Beta SE CI_lower CI_upper
## Hispanic_c -0.676 0.125 -0.92 -0.431
## Pacific_Islander_c -0.524 0.229 -0.97 -0.075
## Asian_c 0.522 0.177 0.17 0.869
## African_American_c -0.579 0.161 -0.90 -0.263
## American_Indian_c -0.439 0.266 -0.96 0.083
## Other_c -0.118 0.227 -0.56 0.327
## European 0.543 0.172 0.21 0.880
## EFL 0.013 0.084 -0.15 0.178
##
##
## Model meta-data
## outcome N df R2 R2-adj. R2-cv
## 1 CA 1369 1360 0.16 0.15 NA
##
##
## Etas from analysis of variance
## Eta Eta_partial
## Hispanic_c 0.1350 0.1455
## Pacific_Islander_c 0.0570 0.0619
## Asian_c 0.0734 0.0797
## African_American_c 0.0894 0.0969
## American_Indian_c 0.0411 0.0447
## Other_c 0.0130 0.0141
## European 0.0788 0.0855
## EFL 0.0038 0.0042
##
## [[4]]
##
## ---- Model summary ----
## Model coefficients
## Beta SE CI_lower CI_upper
## Hispanic -0.3129 0.063 -0.436 -0.19
## Pacific_Islander -0.4107 0.097 -0.601 -0.22
## Asian 0.5510 0.089 0.376 0.73
## African_American -0.3138 0.094 -0.497 -0.13
## American_Indian -0.1187 0.120 -0.354 0.12
## Other 0.1005 0.223 -0.337 0.54
## White 0.2279 0.087 0.057 0.40
## European 0.4710 0.141 0.194 0.75
## EFL 0.0026 0.084 -0.163 0.17
##
##
## Model meta-data
## outcome N df R2 R2-adj. R2-cv
## 1 CA 1369 1359 0.16 0.15 NA
##
##
## Etas from analysis of variance
## Eta Eta_partial
## Hispanic 0.12432 0.13412
## Pacific_Islander 0.10559 0.11420
## Asian 0.15347 0.16480
## African_American 0.08353 0.09057
## American_Indian 0.02462 0.02680
## Other 0.01121 0.01221
## White 0.06531 0.07092
## European 0.08301 0.09000
## EFL 0.00076 0.00083
##
## [[5]]
##
## ---- Model summary ----
## Model coefficients
## Beta SE CI_lower CI_upper
## European 0.633 0.072 0.49 0.77
## EFL -0.051 0.085 -0.22 0.12
##
##
## Model meta-data
## outcome N df R2 R2-adj. R2-cv
## 1 CA 1369 1366 0.054 0.053 NA
##
##
## Etas from analysis of variance
## Eta Eta_partial
## European 0.232 0.232
## EFL 0.016 0.016
##
## [[6]]
##
## ---- Model summary ----
## Model coefficients
## Beta SE CI_lower CI_upper
## SIRE_standard: White 0.000 NA NA NA
## SIRE_standard: African_American -0.971 0.090 -1.15 -0.795
## SIRE_standard: American_Indian -0.089 0.473 -1.02 0.839
## SIRE_standard: Asian -0.016 0.097 -0.21 0.175
## SIRE_standard: Hispanic -0.615 0.067 -0.75 -0.484
## SIRE_standard: Multi_ethnic -0.238 0.080 -0.40 -0.081
## SIRE_standard: Other -0.332 0.221 -0.76 0.101
## SIRE_standard: Pacific_Islander -0.761 0.240 -1.23 -0.291
## EFL 0.039 0.085 -0.13 0.206
##
##
## Model meta-data
## outcome N df R2 R2-adj. R2-cv
## 1 CA 1369 1360 0.12 0.11 NA
##
##
## Etas from analysis of variance
## Eta Eta_partial
## SIRE_standard 0.341 0.341
## EFL 0.012 0.012
##
## [[7]]
##
## ---- Model summary ----
## Model coefficients
## Beta SE CI_lower CI_upper
## SIRE_common: White 0.000 NA NA NA
## SIRE_common: African_American -0.971 0.088 -1.14 -0.799
## SIRE_common: Asian -0.012 0.095 -0.20 0.176
## SIRE_common: Asian, White 0.320 0.128 0.07 0.571
## SIRE_common: Hispanic -0.868 0.120 -1.10 -0.633
## SIRE_common: Hispanic, African_American -1.222 0.179 -1.57 -0.871
## SIRE_common: Hispanic, White -0.413 0.089 -0.59 -0.239
## SIRE_common: Pacific_Islander, Asian, White -0.398 0.168 -0.73 -0.069
## SIRE_common: Other -0.524 0.074 -0.67 -0.379
## EFL 0.054 0.084 -0.11 0.219
##
##
## Model meta-data
## outcome N df R2 R2-adj. R2-cv
## 1 CA 1369 1359 0.15 0.14 NA
##
##
## Etas from analysis of variance
## Eta Eta_partial
## SIRE_common 0.386 0.387
## EFL 0.016 0.018
##
## [[8]]
##
## ---- Model summary ----
## Model coefficients
## Beta SE CI_lower CI_upper
## Hispanic_c -0.917 0.099 -1.11 -0.722
## Pacific_Islander_c -0.996 0.175 -1.34 -0.653
## Asian_c 0.036 0.088 -0.14 0.210
## African_American_c -1.016 0.084 -1.18 -0.852
## American_Indian_c -0.615 0.261 -1.13 -0.103
## Other_c -0.345 0.216 -0.77 0.078
## EFL 0.050 0.083 -0.11 0.214
##
##
## Model meta-data
## outcome N df R2 R2-adj. R2-cv
## 1 CA 1369 1361 0.15 0.15 NA
##
##
## Etas from analysis of variance
## Eta Eta_partial
## Hispanic_c 0.231 0.243
## Pacific_Islander_c 0.143 0.153
## Asian_c 0.010 0.011
## African_American_c 0.304 0.313
## American_Indian_c 0.059 0.064
## Other_c 0.040 0.043
## EFL 0.015 0.016
##
## [[9]]
##
## ---- Model summary ----
## Model coefficients
## Beta SE CI_lower CI_upper
## Hispanic -0.341 0.062 -0.46 -0.22
## Pacific_Islander -0.479 0.095 -0.67 -0.29
## Asian 0.378 0.073 0.23 0.52
## African_American -0.460 0.083 -0.62 -0.30
## American_Indian -0.108 0.121 -0.34 0.13
## Other 0.110 0.224 -0.33 0.55
## White 0.410 0.068 0.28 0.54
## EFL 0.052 0.083 -0.11 0.22
##
##
## Model meta-data
## outcome N df R2 R2-adj. R2-cv
## 1 CA 1369 1360 0.15 0.14 NA
##
##
## Etas from analysis of variance
## Eta Eta_partial
## Hispanic 0.137 0.147
## Pacific_Islander 0.126 0.135
## Asian 0.129 0.139
## African_American 0.139 0.149
## American_Indian 0.022 0.024
## Other 0.012 0.013
## White 0.151 0.161
## EFL 0.016 0.017
#S
all_models_euro[[1]][["fit"]]
## [[1]]
##
## ---- Model summary ----
## Model coefficients
## Beta SE CI_lower CI_upper
## SIRE_standard: White 0.0000 NA NA NA
## SIRE_standard: African_American -0.4184 0.147 -0.706 -0.13
## SIRE_standard: American_Indian 0.1310 0.472 -0.794 1.06
## SIRE_standard: Asian 0.6110 0.164 0.289 0.93
## SIRE_standard: Hispanic -0.3409 0.088 -0.514 -0.17
## SIRE_standard: Multi_ethnic 0.1252 0.111 -0.092 0.34
## SIRE_standard: Other -0.0380 0.228 -0.484 0.41
## SIRE_standard: Pacific_Islander -0.1732 0.268 -0.700 0.35
## European 0.6890 0.146 0.403 0.98
## EFL 0.0038 0.085 -0.163 0.17
##
##
## Model meta-data
## outcome N df R2 R2-adj. R2-cv
## 1 CA 1369 1359 0.13 0.13 NA
##
##
## Etas from analysis of variance
## Eta Eta_partial
## SIRE_standard 0.2766 0.2845
## European 0.1194 0.1271
## EFL 0.0011 0.0012
##
## [[2]]
##
## ---- Model summary ----
## Model coefficients
## Beta SE CI_lower CI_upper
## SIRE_common: White 0.000 NA NA NA
## SIRE_common: African_American -0.564 0.149 -0.86 -0.272
## SIRE_common: Asian 0.447 0.166 0.12 0.773
## SIRE_common: Asian, White 0.549 0.144 0.27 0.831
## SIRE_common: Hispanic -0.654 0.135 -0.92 -0.388
## SIRE_common: Hispanic, African_American -0.894 0.203 -1.29 -0.495
## SIRE_common: Hispanic, White -0.281 0.097 -0.47 -0.092
## SIRE_common: Pacific_Islander, Asian, White -0.114 0.188 -0.48 0.255
## SIRE_common: Other -0.248 0.110 -0.46 -0.031
## European 0.507 0.151 0.21 0.803
## EFL 0.018 0.084 -0.15 0.184
##
##
## Model meta-data
## outcome N df R2 R2-adj. R2-cv
## 1 CA 1369 1358 0.16 0.15 NA
##
##
## Etas from analysis of variance
## Eta Eta_partial
## SIRE_common 0.3203 0.3293
## European 0.0837 0.0908
## EFL 0.0054 0.0058
##
## [[3]]
##
## ---- Model summary ----
## Model coefficients
## Beta SE CI_lower CI_upper
## Hispanic_c -0.676 0.125 -0.92 -0.431
## Pacific_Islander_c -0.524 0.229 -0.97 -0.075
## Asian_c 0.522 0.177 0.17 0.869
## African_American_c -0.579 0.161 -0.90 -0.263
## American_Indian_c -0.439 0.266 -0.96 0.083
## Other_c -0.118 0.227 -0.56 0.327
## European 0.543 0.172 0.21 0.880
## EFL 0.013 0.084 -0.15 0.178
##
##
## Model meta-data
## outcome N df R2 R2-adj. R2-cv
## 1 CA 1369 1360 0.16 0.15 NA
##
##
## Etas from analysis of variance
## Eta Eta_partial
## Hispanic_c 0.1350 0.1455
## Pacific_Islander_c 0.0570 0.0619
## Asian_c 0.0734 0.0797
## African_American_c 0.0894 0.0969
## American_Indian_c 0.0411 0.0447
## Other_c 0.0130 0.0141
## European 0.0788 0.0855
## EFL 0.0038 0.0042
##
## [[4]]
##
## ---- Model summary ----
## Model coefficients
## Beta SE CI_lower CI_upper
## Hispanic -0.3129 0.063 -0.436 -0.19
## Pacific_Islander -0.4107 0.097 -0.601 -0.22
## Asian 0.5510 0.089 0.376 0.73
## African_American -0.3138 0.094 -0.497 -0.13
## American_Indian -0.1187 0.120 -0.354 0.12
## Other 0.1005 0.223 -0.337 0.54
## White 0.2279 0.087 0.057 0.40
## European 0.4710 0.141 0.194 0.75
## EFL 0.0026 0.084 -0.163 0.17
##
##
## Model meta-data
## outcome N df R2 R2-adj. R2-cv
## 1 CA 1369 1359 0.16 0.15 NA
##
##
## Etas from analysis of variance
## Eta Eta_partial
## Hispanic 0.12432 0.13412
## Pacific_Islander 0.10559 0.11420
## Asian 0.15347 0.16480
## African_American 0.08353 0.09057
## American_Indian 0.02462 0.02680
## Other 0.01121 0.01221
## White 0.06531 0.07092
## European 0.08301 0.09000
## EFL 0.00076 0.00083
##
## [[5]]
##
## ---- Model summary ----
## Model coefficients
## Beta SE CI_lower CI_upper
## European 0.633 0.072 0.49 0.77
## EFL -0.051 0.085 -0.22 0.12
##
##
## Model meta-data
## outcome N df R2 R2-adj. R2-cv
## 1 CA 1369 1366 0.054 0.053 NA
##
##
## Etas from analysis of variance
## Eta Eta_partial
## European 0.232 0.232
## EFL 0.016 0.016
##
## [[6]]
##
## ---- Model summary ----
## Model coefficients
## Beta SE CI_lower CI_upper
## SIRE_standard: White 0.000 NA NA NA
## SIRE_standard: African_American -0.971 0.090 -1.15 -0.795
## SIRE_standard: American_Indian -0.089 0.473 -1.02 0.839
## SIRE_standard: Asian -0.016 0.097 -0.21 0.175
## SIRE_standard: Hispanic -0.615 0.067 -0.75 -0.484
## SIRE_standard: Multi_ethnic -0.238 0.080 -0.40 -0.081
## SIRE_standard: Other -0.332 0.221 -0.76 0.101
## SIRE_standard: Pacific_Islander -0.761 0.240 -1.23 -0.291
## EFL 0.039 0.085 -0.13 0.206
##
##
## Model meta-data
## outcome N df R2 R2-adj. R2-cv
## 1 CA 1369 1360 0.12 0.11 NA
##
##
## Etas from analysis of variance
## Eta Eta_partial
## SIRE_standard 0.341 0.341
## EFL 0.012 0.012
##
## [[7]]
##
## ---- Model summary ----
## Model coefficients
## Beta SE CI_lower CI_upper
## SIRE_common: White 0.000 NA NA NA
## SIRE_common: African_American -0.971 0.088 -1.14 -0.799
## SIRE_common: Asian -0.012 0.095 -0.20 0.176
## SIRE_common: Asian, White 0.320 0.128 0.07 0.571
## SIRE_common: Hispanic -0.868 0.120 -1.10 -0.633
## SIRE_common: Hispanic, African_American -1.222 0.179 -1.57 -0.871
## SIRE_common: Hispanic, White -0.413 0.089 -0.59 -0.239
## SIRE_common: Pacific_Islander, Asian, White -0.398 0.168 -0.73 -0.069
## SIRE_common: Other -0.524 0.074 -0.67 -0.379
## EFL 0.054 0.084 -0.11 0.219
##
##
## Model meta-data
## outcome N df R2 R2-adj. R2-cv
## 1 CA 1369 1359 0.15 0.14 NA
##
##
## Etas from analysis of variance
## Eta Eta_partial
## SIRE_common 0.386 0.387
## EFL 0.016 0.018
##
## [[8]]
##
## ---- Model summary ----
## Model coefficients
## Beta SE CI_lower CI_upper
## Hispanic_c -0.917 0.099 -1.11 -0.722
## Pacific_Islander_c -0.996 0.175 -1.34 -0.653
## Asian_c 0.036 0.088 -0.14 0.210
## African_American_c -1.016 0.084 -1.18 -0.852
## American_Indian_c -0.615 0.261 -1.13 -0.103
## Other_c -0.345 0.216 -0.77 0.078
## EFL 0.050 0.083 -0.11 0.214
##
##
## Model meta-data
## outcome N df R2 R2-adj. R2-cv
## 1 CA 1369 1361 0.15 0.15 NA
##
##
## Etas from analysis of variance
## Eta Eta_partial
## Hispanic_c 0.231 0.243
## Pacific_Islander_c 0.143 0.153
## Asian_c 0.010 0.011
## African_American_c 0.304 0.313
## American_Indian_c 0.059 0.064
## Other_c 0.040 0.043
## EFL 0.015 0.016
##
## [[9]]
##
## ---- Model summary ----
## Model coefficients
## Beta SE CI_lower CI_upper
## Hispanic -0.341 0.062 -0.46 -0.22
## Pacific_Islander -0.479 0.095 -0.67 -0.29
## Asian 0.378 0.073 0.23 0.52
## African_American -0.460 0.083 -0.62 -0.30
## American_Indian -0.108 0.121 -0.34 0.13
## Other 0.110 0.224 -0.33 0.55
## White 0.410 0.068 0.28 0.54
## EFL 0.052 0.083 -0.11 0.22
##
##
## Model meta-data
## outcome N df R2 R2-adj. R2-cv
## 1 CA 1369 1360 0.15 0.14 NA
##
##
## Etas from analysis of variance
## Eta Eta_partial
## Hispanic 0.137 0.147
## Pacific_Islander 0.126 0.135
## Asian 0.129 0.139
## African_American 0.139 0.149
## American_Indian 0.022 0.024
## Other 0.012 0.013
## White 0.151 0.161
## EFL 0.016 0.017
### Some other analyses
# Cognitive ability stability ---------------------------------------------
#for this we need Lynn's IQs and Baten's age heaping data
#both are in the mega dataset
#one can also get them from:
#http://clio-infra.eu/
#Lynn's book
mega = read_csv("data/Megadataset_v2.0p.csv")
## Parsed with column specification:
## cols(
## .default = col_double(),
## X1 = col_character(),
## EthnicHeterogenityVanhanen2012 = col_integer(),
## EthnicConflictVanhanen2012 = col_integer(),
## SlowTimePrefWangetal2011 = col_integer(),
## Math00Mean = col_integer(),
## Math00SD = col_integer(),
## Read00Mean = col_integer(),
## Read00SD = col_integer(),
## Sci00Mean = col_integer(),
## Sci00SD = col_integer(),
## Math03Mean = col_integer(),
## Math03SD = col_integer(),
## Read03Mean = col_integer(),
## Read03SD = col_integer(),
## Sci03Mean = col_integer(),
## Sci03SD = col_integer(),
## PS03Mean = col_integer(),
## PS03SD = col_integer(),
## Read09.Native = col_integer(),
## Read09.1g = col_integer()
## # ... with 102 more columns
## )
## See spec(...) for full column specifications.
#rename some vars
mega$LynnIQ = mega$LV2012estimatedIQ
#find the age heaping vars
age_heaping_vars = str_subset(names(mega), "^AH\\d+")
#the correlations
wtd.cors(mega[c(age_heaping_vars, "LynnIQ")])
## AH1800 AH1820 AH1830 AH1840 AH1850 AH1860 AH1870 AH1880 AH1890 AH1900 AH1910 AH1920 AH1930 AH1940 AH1950 AH1960 AH1970 LynnIQ
## AH1800 1.00 0.95 0.94 0.93 0.94 0.95 0.96 0.91 0.90 0.90 0.91 0.69 0.64 0.68 0.77 0.76 NaN 0.85
## AH1820 0.95 1.00 0.99 0.97 0.94 0.88 0.94 0.81 0.76 0.69 0.71 0.58 0.58 0.21 0.64 0.74 NaN 0.62
## AH1830 0.94 0.99 1.00 0.99 0.98 0.94 0.95 0.83 0.78 0.70 0.70 0.54 0.53 0.35 0.65 0.74 NaN 0.70
## AH1840 0.93 0.97 0.99 1.00 0.99 0.95 0.96 0.86 0.81 0.74 0.74 0.60 0.59 0.42 0.71 0.79 NaN 0.70
## AH1850 0.94 0.94 0.98 0.99 1.00 0.98 0.99 0.90 0.84 0.77 0.77 0.61 0.61 0.43 0.73 0.81 NaN 0.73
## AH1860 0.95 0.88 0.94 0.95 0.98 1.00 0.97 0.87 0.81 0.72 0.75 0.61 0.60 0.49 0.69 0.67 NaN 0.59
## AH1870 0.96 0.94 0.95 0.96 0.99 0.97 1.00 0.98 0.96 0.93 0.89 0.81 0.73 0.69 0.60 0.86 NaN 0.64
## AH1880 0.91 0.81 0.83 0.86 0.90 0.87 0.98 1.00 0.98 0.94 0.93 0.89 0.86 0.73 0.71 0.86 NaN 0.52
## AH1890 0.90 0.76 0.78 0.81 0.84 0.81 0.96 0.98 1.00 0.96 0.94 0.93 0.91 0.85 0.69 0.61 NaN 0.52
## AH1900 0.90 0.69 0.70 0.74 0.77 0.72 0.93 0.94 0.96 1.00 0.96 0.95 0.94 0.90 0.73 0.55 NaN 0.47
## AH1910 0.91 0.71 0.70 0.74 0.77 0.75 0.89 0.93 0.94 0.96 1.00 0.98 0.97 0.94 0.82 0.55 NaN 0.46
## AH1920 0.69 0.58 0.54 0.60 0.61 0.61 0.81 0.89 0.93 0.95 0.98 1.00 0.98 0.95 0.81 0.56 NaN 0.47
## AH1930 0.64 0.58 0.53 0.59 0.61 0.60 0.73 0.86 0.91 0.94 0.97 0.98 1.00 0.98 0.89 0.56 NaN 0.44
## AH1940 0.68 0.21 0.35 0.42 0.43 0.49 0.69 0.73 0.85 0.90 0.94 0.95 0.98 1.00 0.91 0.64 NaN 0.45
## AH1950 0.77 0.64 0.65 0.71 0.73 0.69 0.60 0.71 0.69 0.73 0.82 0.81 0.89 0.91 1.00 0.97 -0.53 0.45
## AH1960 0.76 0.74 0.74 0.79 0.81 0.67 0.86 0.86 0.61 0.55 0.55 0.56 0.56 0.64 0.97 1.00 0.22 0.51
## AH1970 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN -0.53 0.22 1.00 0.74
## LynnIQ 0.85 0.62 0.70 0.70 0.73 0.59 0.64 0.52 0.52 0.47 0.46 0.47 0.44 0.45 0.45 0.51 0.74 1.00
#sample sizes
count.pairwise(mega[c(age_heaping_vars, "LynnIQ")])
## AH1800 AH1820 AH1830 AH1840 AH1850 AH1860 AH1870 AH1880 AH1890 AH1900 AH1910 AH1920 AH1930 AH1940 AH1950 AH1960 AH1970 LynnIQ
## AH1800 31 25 27 26 22 20 22 25 24 22 21 18 17 17 16 14 0 29
## AH1820 25 45 45 43 37 34 22 38 36 35 34 33 32 31 22 18 0 43
## AH1830 27 45 49 47 41 38 26 42 40 39 38 35 34 33 22 18 0 46
## AH1840 26 43 47 50 44 41 29 44 42 41 40 37 36 35 23 19 0 48
## AH1850 22 37 41 44 45 41 27 39 37 36 35 32 31 30 18 16 0 43
## AH1860 20 34 38 41 41 56 41 53 50 49 47 37 37 35 23 17 0 54
## AH1870 22 22 26 29 27 41 62 59 56 54 53 43 37 31 26 20 0 61
## AH1880 25 38 42 44 39 53 59 95 92 89 88 78 72 55 38 28 0 93
## AH1890 24 36 40 42 37 50 56 92 109 105 103 94 88 70 41 31 1 107
## AH1900 22 35 39 41 36 49 54 89 105 121 116 107 101 84 52 31 1 118
## AH1910 21 34 38 40 35 47 53 88 103 116 133 123 117 99 67 46 1 130
## AH1920 18 33 35 37 32 37 43 78 94 107 123 126 120 102 70 49 3 123
## AH1930 17 32 34 36 31 37 37 72 88 101 117 120 120 102 70 49 3 117
## AH1940 17 31 33 35 30 35 31 55 70 84 99 102 102 107 73 52 3 104
## AH1950 16 22 22 23 18 23 26 38 41 52 67 70 70 73 77 56 5 74
## AH1960 14 18 18 19 16 17 20 28 31 31 46 49 49 52 56 56 5 54
## AH1970 0 0 0 0 0 0 0 0 1 1 1 3 3 3 5 5 5 4
## LynnIQ 29 43 46 48 43 54 61 93 107 118 130 123 117 104 74 54 4 203
#plots
plots = map(seq(1850, 1880, by = 10), ~GG_scatter(mega, "AH" + ., "LynnIQ", case_names = "Names") + xlab(sprintf("Age heaping %d", .)) + ylab("Lynn's IQ estimates"))
GG_save("figures/age_heaping_IQ.png", plot = grid.arrange(grobs = plots, ncol = 2))
# Old numeracy / literacy data --------------------------------------------
lit_num = data_frame(
year = c(1850, 1870, 1900) %>% as.integer(),
W_literacy = c(.86, .892, .936),
W_numeracy = c(.876, .885, .954),
B_literacy = c(.585, .349, .62),
B_numeracy = c(.696, .70, .86)
) %>% mutate(
BW_literacy = qnorm(W_literacy) - qnorm(B_literacy),
BW_numeracy = qnorm(W_numeracy) - qnorm(B_numeracy),
BW_mean = (BW_literacy + BW_numeracy) / 2
) %>% write_clipboard()
## Year W literacy W numeracy B literacy B numeracy BW literacy BW numeracy BW mean
## 1 1850 0.86 0.88 0.58 0.70 0.87 0.64 0.75
## 2 1870 0.89 0.88 0.35 0.70 1.63 0.68 1.15
## 3 1900 0.94 0.95 0.62 0.86 1.22 0.60 0.91
mean(lit_num$BW_mean)
## [1] 0.94
write_sessioninfo()
## R version 3.5.1 (2018-07-02)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Linux Mint 18
##
## Matrix products: default
## BLAS: /usr/lib/openblas-base/libblas.so.3
## LAPACK: /usr/lib/libopenblasp-r0.2.18.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8 LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8 LC_PAPER=en_US.UTF-8 LC_NAME=C LC_ADDRESS=C LC_TELEPHONE=C LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] plot3D_1.1.1 bindrcpp_0.2.2 gridExtra_2.3 kirkegaard_2018.05 metafor_2.0-0 Matrix_1.2-14 assertthat_0.2.0 magrittr_1.5 forcats_0.3.0 stringr_1.3.1 dplyr_0.7.6 purrr_0.2.5 readr_1.1.1 tidyr_0.8.1 tibble_1.4.2 tidyverse_1.2.1 weights_0.90 mice_3.3.0 gdata_2.18.0 Hmisc_4.1-1 ggplot2_3.0.0 Formula_1.2-3 survival_2.42-6 lattice_0.20-35 psych_1.8.4 plyr_1.8.4 pacman_0.4.6
##
## loaded via a namespace (and not attached):
## [1] minqa_1.2.4 colorspace_1.3-2 class_7.3-14 rio_0.5.16 rprojroot_1.3-2 htmlTable_1.12 base64enc_0.1-3 rstudioapi_0.8 fansi_0.3.0 lubridate_1.7.4 xml2_1.2.0 splines_3.5.1 mnormt_1.5-5 robustbase_0.93-3 knitr_1.20 jsonlite_1.6 nloptr_1.0.4 lsr_0.5 broom_0.5.0 cluster_2.0.7-1 curry_0.1.1 compiler_3.5.1 httr_1.3.1 backports_1.1.2 lazyeval_0.2.1 cli_1.0.1 acepack_1.4.1 htmltools_0.3.6 tools_3.5.1
## [30] misc3d_0.8-4 gtable_0.2.0 glue_1.3.0 reshape2_1.4.3 Rcpp_0.12.18 carData_3.0-2 cellranger_1.1.0 nlme_3.1-137 multilevel_2.6 lmtest_0.9-36 laeken_0.4.6 openxlsx_4.1.0 lme4_1.1-18-1 rvest_0.3.2 gtools_3.8.1 pan_1.6 DEoptimR_1.0-8 MASS_7.3-50 zoo_1.8-4 scales_1.0.0 VIM_4.7.0 hms_0.4.2 parallel_3.5.1 RColorBrewer_1.1-2 psychometric_2.2 yaml_2.2.0 curl_3.2 rpart_4.1-13 latticeExtra_0.6-28
## [59] stringi_1.2.4 e1071_1.7-0 checkmate_1.8.5 boot_1.3-20 zip_1.0.0 rlang_0.2.2 pkgconfig_2.0.2 evaluate_0.11 bindr_0.1.1 htmlwidgets_1.2 labeling_0.3 tidyselect_0.2.4 R6_2.2.2 mitml_0.3-6 pillar_1.3.0 haven_1.1.2 foreign_0.8-71 withr_2.1.2 abind_1.4-5 sp_1.3-1 nnet_7.3-12 modelr_0.1.2 crayon_1.3.4 car_3.0-2 jomo_2.6-4 utf8_1.1.4 rmarkdown_1.10 grid_3.5.1 readxl_1.1.0
## [88] data.table_1.11.6 vcd_1.4-4 digest_0.6.17 munsell_0.5.0