About

This is an R notebook for the study (insert final name).

Start up code

This script loads packages, data, merges data and does some initial filtering.

# libraries ---------------------------------------------------------------
#here we load the libraries we need
library(pacman)
p_load(plyr, psych, weights, tidyverse, forcats, magrittr, kirkegaard, gridExtra)
options(digits = 2, width = 600)

#default ggsave settings
ggsave2 = purrr::partial(ggsave, width = 10, height = 6)

#controls
#excludes persons without English as first language
exclude_nonenglish_first_language = F

# data --------------------------------------------------------------------
#here we load the data we need

d_ping_behavior = read_csv("data/PING_Behavior.csv")

## Parsed with column specification:
## cols(
##   .default = col_integer(),
##   SubjID = col_character(),
##   Age_At_NPExam = col_double(),
##   Age_At_IMGExam = col_double(),
##   Age_At_PhenX_Completion = col_double(),
##   Gender = col_character(),
##   FDH_Respondent_Relation = col_character(),
##   FDH_Guardian_1_Relation = col_character(),
##   FDH_Guardian_2_Relation = col_character(),
##   FDH_1_Hispanic_or_Latino = col_character(),
##   FDH_2_Pacific_Islander = col_character(),
##   FDH_2_Asian = col_character(),
##   FDH_2_African_American = col_character(),
##   FDH_2_American_Indian = col_character(),
##   FDH_2_White = col_character(),
##   FDH_5_Birth_By = col_character(),
##   FDH_8_Birth_Weight_lbs = col_double(),
##   FDH_8_Birth_Length = col_double(),
##   FDH_8_Birth_Head_Circumference = col_double(),
##   FDH_9_APGAR_Score_5_Minutes = col_double(),
##   FDH_11_Pregnancy_Medications = col_character()
##   # ... with 545 more columns
## )

## See spec(...) for full column specifications.

## Warning in rbind(names(probs), probs_f): number of columns of result is not a multiple of vector length (arg 1)

## Warning: 62 parsing failures.
## row # A tibble: 5 x 5 col     row col             expected               actual file                     expected   <int> <chr>           <chr>                  <chr>  <chr>                    actual 1  1093 TBX_dccs_pst_rt no trailing characters .5     'data/PING_Behavior.csv' file 2  1093 TBX_dccs_d_rt   no trailing characters .5     'data/PING_Behavior.csv' row 3  1123 TBX_dccs_d_rt   no trailing characters .5     'data/PING_Behavior.csv' col 4  1167 TBX_dccs_pst_rt no trailing characters .5     'data/PING_Behavior.csv' expected 5  1167 TBX_dccs_d_rt   no trailing characters .5     'data/PING_Behavior.csv'
## ... ................. ... .............................................................................. ........ .............................................................................. ...... .............................................................................. .... .............................................................................. ... .............................................................................. ... .............................................................................. ........ ..............................................................................
## See problems(...) for more details.

d_ping_mri = read_csv("data/PING_MRI_data.csv")

## Parsed with column specification:
## cols(
##   .default = col_integer(),
##   ID = col_character(),
##   Site = col_character(),
##   Age = col_double(),
##   Sex = col_character(),
##   Manufacturer = col_character(),
##   ManufacturersModelName = col_character(),
##   DeviceSerialNumber = col_character(),
##   europe = col_double(),
##   africa = col_double(),
##   amerind = col_double(),
##   eastAsia = col_double(),
##   oceania = col_double(),
##   centralAsia = col_double(),
##   ED = col_double(),
##   vol.IntracranialVolume = col_double()
## )
## See spec(...) for full column specifications.

# merge data --------------------------------------------------------------
#here we merge the two datasets: one with behavioral data, one with MRI + genetic data.
#we do this by matching up the rows by the IDs
#we skip age from one dataset to avoid duplication

d_ping = full_join(d_ping_mri %>% dplyr::select(-Age), d_ping_behavior,by = c("ID" = "SubjID"))

Recoding

This code recodes some variables, primarily the SIRE ones.

### RECODE AND RENAME VARIABLES


# exlude persons without any genomic data ---------------------------------
#only small number of people

d_ping %<>% filter(!is.na(GAF_africa))

# non-english first language ----------------------------------------------

#rename
d_ping$English_first_language = d_ping$FDH_21_English_First_Lang == "Yes"

#fill NA to T
d_ping$English_first_language %<>% mapvalues(NA, T)
d_ping$EFL = d_ping$English_first_language

#table
table2(d_ping$English_first_language)

## # A tibble: 3 x 3
##   Group Count Percent
##   <chr> <dbl>   <dbl>
## 1 TRUE   1237    88.9
## 2 FALSE   154    11.1
## 3 <NA>      0     0

count_NA(d_ping$English_first_language, T)

## [1] 1391

#exclude
if (exclude_nonenglish_first_language) {
  d_ping %<>% dplyr::filter(!English_first_language)
}


# rename ------------------------------------------------------------------

#cognitive tests
d_ping$IBAM = d_ping$TBX_ibam_scr
d_ping$list_sort = d_ping$TBX_ls
d_ping$pattern_recog = d_ping$TBX_pspac_scr
d_ping$vocab = d_ping$TBX_VOCAB_THETA
d_ping$reading = d_ping$TBX_reading_score
d_ping$flanker = d_ping$TBX_flanker_score
d_ping$card_sort = d_ping$TBX_dccs_score

#vector
v_cog = c("IBAM", "list_sort", "pattern_recog", "vocab", "reading", "flanker", "card_sort")
#we dont like attention

#S
d_ping$occupation_1 = d_ping$FDH_Guardian_1_Ocup
d_ping$occupation_2 = d_ping$FDH_Guardian_2_Ocup
d_ping$education_1 = d_ping$FDH_Guardian_1_Edu
d_ping$education_2 = d_ping$FDH_Guardian_2_Edu
d_ping$hh_income = d_ping$FDH_3_Household_Income

v_S = c("occupation_1", "occupation_2", "education_1", "education_2", "hh_income")

#sensible ancestry names
d_ping$European = d_ping$europe
d_ping$African = d_ping$africa
d_ping$Amerindian = d_ping$amerind
d_ping$East_Asian = d_ping$eastAsia
d_ping$Oceanian = d_ping$oceania
d_ping$Central_Asian = d_ping$centralAsia

#genomics
v_genomic = c("European", "African", "Amerindian", "East_Asian", "Oceanian", "Central_Asian")


# recode ------------------------------------------------------------------
#recode N sex to NA
d_ping$Sex[!d_ping$Sex %in% c("M", "F")] = NA

# rename SIRE -------------------------------------------------------------
#which are the original SIRE variables?
(v_SIRE_orig = str_detect2(names(d_ping), pattern = "FDH_[12]_", value = T))

## [1] "FDH_1_Hispanic_or_Latino" "FDH_2_Pacific_Islander"   "FDH_2_Asian"              "FDH_2_African_American"   "FDH_2_American_Indian"    "FDH_2_White"

#new names
v_SIREs = c("Hispanic", "Pacific_Islander", "Asian", "African_American", "American_Indian", "White")

#better names
d_ping$Hispanic = d_ping$FDH_1_Hispanic_or_Latino == "Yes"
d_ping$Pacific_Islander = d_ping$FDH_2_Pacific_Islander == "Yes"
d_ping$Asian = d_ping$FDH_2_Asian == "Yes"
d_ping$African_American = d_ping$FDH_2_African_American == "Yes"
d_ping$American_Indian = d_ping$FDH_2_American_Indian == "Yes"
d_ping$White = d_ping$FDH_2_White == "Yes"

#fill with FALSE
false_filler = function(x) {
  x[is.na(x)] = F
  x
}
d_ping$Hispanic %<>% false_filler
d_ping$Pacific_Islander %<>% false_filler
d_ping$Asian %<>% false_filler
d_ping$African_American %<>% false_filler
d_ping$American_Indian %<>% false_filler
d_ping$White %<>% false_filler

#fill in Other
d_ping$Other = d_ping[v_SIREs] %>% rowSums %>% `==`(0)

#add Other SIRE, but not last!
#adding it last causes it to be the reference class/group
v_SIREs = c("Hispanic", "Pacific_Islander", "Asian", "African_American", "American_Indian", "Other", "White")

#frequency
colMeans(d_ping[v_SIREs], na.rm = T)

##         Hispanic Pacific_Islander            Asian African_American  American_Indian            Other            White 
##            0.236            0.090            0.219            0.164            0.047            0.017            0.656

colMeans(d_ping[v_SIREs], na.rm = T) %>% sum

## [1] 1.4

#how much missing data?
d_ping[v_SIREs] %>% miss_amount()

## cases with missing data  vars with missing data cells with missing data 
##                       0                       0                       0

# SIRE, standard ------------------------------------------------------------------

#recode SIRE
d_ping$SIRE_standard = map_chr(1:nrow(d_ping), .f = function(row) {

  #extract SIREs
  SIREs = d_ping[row, v_SIREs] %>% unlist
  
  #Hispanic?
  if (SIREs[1]) return(names(SIREs)[1])
  
  #only one?
  if (sum(SIREs) == 1) {
    #then return the name of it
    return(names(SIREs)[which(SIREs)])
  } else if (sum(SIREs) == 0) {
    #did not select any
    return("Other")
  } else {
    #more than one, return multi
    return("Multi_ethnic")
    #return combo
    return(str_c(names(SIREs)[which(SIREs)], collapse = " "))
  }
})

#check results
table2(d_ping$SIRE_standard, include_NA = F) %>% 
  write_clipboard()

##              Group  Count Percent
## 1            White 571.00   41.05
## 2         Hispanic 328.00   23.58
## 3     Multi_ethnic 186.00   13.37
## 4 African_American 140.00   10.06
## 5            Asian 122.00    8.77
## 6            Other  24.00    1.73
## 7 Pacific_Islander  16.00    1.15
## 8  American_Indian   4.00    0.29

#as factor
#factors are R's built in variable type for handling categorical data
d_ping$SIRE_standard %<>% factor()

#we want White to be in front
d_ping$SIRE_standard %<>% fct_relevel("White")

#verify
assert_that(levels(d_ping$SIRE_standard)[1] == "White")

## [1] TRUE

# SIRE, combinations ------------------------------------------

d_ping$SIRE_common = map_chr(1:nrow(d_ping), .f = function(row) {
  
  #extract SIREs
  SIREs = d_ping[row, v_SIREs] %>% unlist
  
  #only one?
  if (sum(SIREs) == 1) {
    #then return the name of it
    return(names(SIREs)[which(SIREs)])
  } else if (sum(SIREs) == 0) {
    #did not select any
    return("Other")
  } else {
    #return combo
    return(str_c(names(SIREs)[which(SIREs)], collapse = ", "))
  }
})

#table
table2(d_ping$SIRE_common, include_NA = F) %>% 
  write_clipboard()

##                                                         Group  Count Percent
## 1                                                       White 571.00   41.05
## 2                                            African_American 140.00   10.06
## 3                                             Hispanic, White 140.00   10.06
## 4                                                       Asian 122.00    8.77
## 5                                                    Hispanic  71.00    5.10
## 6                                                Asian, White  60.00    4.31
## 7                              Pacific_Islander, Asian, White  32.00    2.30
## 8                                  Hispanic, African_American  29.00    2.08
## 9                                     African_American, White  25.00    1.80
## 10                                                      Other  24.00    1.73
## 11                                    Pacific_Islander, Asian  17.00    1.22
## 12                                           Pacific_Islander  16.00    1.15
## 13                           Hispanic, American_Indian, White  13.00    0.93
## 14                                            Hispanic, Asian  13.00    0.93
## 15                   Hispanic, Pacific_Islander, Asian, White  13.00    0.93
## 16                                     American_Indian, White  11.00    0.79
## 17                          Hispanic, Pacific_Islander, Asian  11.00    0.79
## 18                                    Pacific_Islander, White   9.00    0.65
## 19                                  Hispanic, American_Indian   7.00    0.50
## 20                                     Hispanic, Asian, White   6.00    0.43
## 21                          African_American, American_Indian   5.00    0.36
## 22                              Asian, American_Indian, White   5.00    0.36
## 23                                 Hispanic, Pacific_Islander   5.00    0.36
## 24            Pacific_Islander, Asian, American_Indian, White   5.00    0.36
## 25                                            American_Indian   4.00    0.29
## 26                                    Asian, African_American   4.00    0.29
## 27                Hispanic, African_American, American_Indian   4.00    0.29
## 28                   African_American, American_Indian, White   3.00    0.22
## 29                  Pacific_Islander, African_American, White   3.00    0.22
## 30           Pacific_Islander, Asian, African_American, White   3.00    0.22
## 31                             Asian, African_American, White   2.00    0.14
## 32         Hispanic, African_American, American_Indian, White   2.00    0.14
## 33                    Hispanic, Asian, American_Indian, White   2.00    0.14
## 34        Hispanic, Pacific_Islander, Asian, African_American   2.00    0.14
## 35 Hispanic, Pacific_Islander, Asian, African_American, White   2.00    0.14
## 36                          Hispanic, Pacific_Islander, White   2.00    0.14
## 37                          Hispanic, African_American, White   1.00    0.07
## 38                   Hispanic, Asian, African_American, White   1.00    0.07
## 39                           Hispanic, Asian, American_Indian   1.00    0.07
## 40        Hispanic, Pacific_Islander, African_American, White   1.00    0.07
## 41                Hispanic, Pacific_Islander, American_Indian   1.00    0.07
## 42         Hispanic, Pacific_Islander, Asian, American_Indian   1.00    0.07
## 43                  Pacific_Islander, Asian, African_American   1.00    0.07
## 44                   Pacific_Islander, Asian, American_Indian   1.00    0.07

#recode uncommon
d_ping$SIRE_common = fct_lump(d_ping$SIRE_common, n = 8)

#set White as the first
d_ping$SIRE_common %<>% fct_relevel("White")

#verify
assert_that(levels(d_ping$SIRE_common)[1] == "White")

## [1] TRUE

# recode SIRE - continuous ------------------------------------------------
#here we recode the variable variables as continuous variables
d_SIRE_con = adply(d_ping, .margins = 1, .expand = F, .fun = function(row) {

  #extract SIREs
  SIREs = row[v_SIREs] %>% unlist
  
  #numeric
  SIREs = SIREs %>% as.numeric
  
  #sum of SIRE
  SIRE_sum = sum(SIREs)
  
  #proportion SIREs
  res = SIREs / SIRE_sum
  
  #names
  names(res) = v_SIREs + "_c"
  
  #output
  res
})

#merge
d_ping = cbind(d_ping, d_SIRE_con[-1])

#vector for use
v_SIREs_con = v_SIREs + "_c"


# largest ancestry --------------------------------------------------------
#function to assign largest ancestry
ancestry_assigner = function(data = d_ping, threshold = 0) {
  aaply(df_subset_by_pattern(d_ping, pattern = "GAF_"), .margins = 1, .fun = function(block) {
    #vector
    vect = unlist(block)
    
    #no data?
    if (all(is.na(vect))) return(NA)
    
    #get rid of prefix
    names(vect) %<>% str_replace(pattern = "GAF_", replacement = "")
    
    #equal sized largest?
    if(sum(max(vect) == vect) > 1) browser()
    
    #threshold met?
    if (max(vect) < threshold) return("Mixed")
    
    #return largest ancestry name
    names(which.max(vect))
  }, .expand = F)
}

#assign
d_ping$largest_ancestry = ancestry_assigner() #default
d_ping$largest_ancestry_twothirds = ancestry_assigner(threshold = 2/3)
d_ping$largest_ancestry_half = ancestry_assigner(threshold = .5)



# age and sex -------------------------------------------------------------
#attempt to fill in missing age
sum(is.na(d_ping$Age_At_NPExam) & !is.na(d_ping$Age_At_IMGExam))

## [1] 3

sum(is.na(d_ping$Age_At_NPExam) & !is.na(d_ping$Age_At_PhenX_Completion))

## [1] 0

#we can only fill in 7

#we are also using the wrong variable
d_ping$Age = map2_dbl(d_ping$Age_At_NPExam, d_ping$Age_At_IMGExam, function(x, y) {
  if (is.na(x)) return(y)
  x
})

#plots
GG_denhist(d_ping, "Age")

## Warning: Removed 5 rows containing non-finite values (stat_bin).

## Warning: Removed 5 rows containing non-finite values (stat_density).

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 5 rows containing non-finite values (stat_bin).

## Warning: Removed 5 rows containing non-finite values (stat_density).

ggsave("figures/age_dist.png")

## Saving 7 x 5 in image
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 5 rows containing non-finite values (stat_bin).

## Warning: Removed 5 rows containing non-finite values (stat_density).

GG_denhist(d_ping, var = "Age", group = "Gender")

## Warning in GG_denhist(d_ping, var = "Age", group = "Gender"): There were groups without any data. These were removed

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggsave("figures/age_gender.png")

## Saving 7 x 5 in image
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#numeric
d_ping$Age %>% psych::describe()

##    vars    n mean sd median trimmed mad min max range skew kurtosis   se
## X1    1 1386   12  5     11      12 5.9   3  21    18 0.18     -1.1 0.13

Cognitive ability analyses

Impute missing data, factor analyze, score.

### COGNITIVE DATA


# function to control ----------------------------------------------------------------
#make a function that controls each variable for sex then for age
controller = function(data, gender=T, age=T) {
  for (var in names(data)) {
    
    if (gender) {
      .model = var + " ~ Gender"
      fit = lm(.model, data = cbind(data, Gender = d_ping$Gender), na.action = na.exclude)
      data[[var]] = resid(fit)
    }
    
    if (age) {
      .model = var + " ~ Age"
      fit = loess(.model, data = cbind(data, Age = d_ping$Age), na.action = na.exclude)
      data[[var]] = resid(fit)
    }
    
  }
  data
}


# subset ------------------------------------------------------------------

d_cog = d_ping[v_cog]


# missing -----------------------------------------------------------------

#examine
miss_plot(d_cog)

miss_by_var(d_cog)

##          IBAM     list_sort pattern_recog         vocab       reading       flanker     card_sort 
##            20            32            21            24            40            41           169

miss_amount(d_cog)

## cases with missing data  vars with missing data cells with missing data 
##                   0.140                   1.000                   0.036

#impute
d_cog = miss_impute(d_cog)

#complete cases after imputation
d_cog %>% na.omit() %>% nrow()

## [1] 1370

# age and sex control -------------------------------------------------------------
#adjust for age and sex
d_cog = controller(d_cog)

#sample sizes
count.pairwise(d_cog)

##               IBAM list_sort pattern_recog vocab reading flanker card_sort
## IBAM          1370      1369          1370  1370    1369    1369      1369
## list_sort     1369      1369          1369  1369    1369    1369      1369
## pattern_recog 1370      1369          1370  1370    1369    1369      1369
## vocab         1370      1369          1370  1370    1369    1369      1369
## reading       1369      1369          1369  1369    1369    1369      1369
## flanker       1369      1369          1369  1369    1369    1370      1370
## card_sort     1369      1369          1369  1369    1369    1370      1370

#cors
wtd.cors(d_cog)

##                IBAM list_sort pattern_recog vocab reading flanker card_sort
## IBAM          1.000      0.34          0.13  0.22    0.20   0.097      0.16
## list_sort     0.337      1.00          0.18  0.34    0.31   0.218      0.30
## pattern_recog 0.130      0.18          1.00  0.13    0.13   0.290      0.32
## vocab         0.222      0.34          0.13  1.00    0.56   0.172      0.22
## reading       0.204      0.31          0.13  0.56    1.00   0.174      0.21
## flanker       0.097      0.22          0.29  0.17    0.17   1.000      0.52
## card_sort     0.159      0.30          0.32  0.22    0.21   0.519      1.00

#factor analysis
(fa = fa(d_cog))

## Factor Analysis using method =  minres
## Call: fa(r = d_cog)
## Standardized loadings (pattern matrix) based upon correlation matrix
##                MR1   h2   u2 com
## IBAM          0.37 0.14 0.86   1
## list_sort     0.57 0.32 0.68   1
## pattern_recog 0.38 0.14 0.86   1
## vocab         0.57 0.33 0.67   1
## reading       0.55 0.30 0.70   1
## flanker       0.49 0.24 0.76   1
## card_sort     0.57 0.33 0.67   1
## 
##                 MR1
## SS loadings    1.79
## Proportion Var 0.26
## 
## Mean item complexity =  1
## Test of the hypothesis that 1 factor is sufficient.
## 
## The degrees of freedom for the null model are  21  and the objective function was  1.2 with Chi Square of  1719
## The degrees of freedom for the model are 14  and the objective function was  0.39 
## 
## The root mean square of the residuals (RMSR) is  0.1 
## The df corrected root mean square of the residuals is  0.13 
## 
## The harmonic number of observations is  1369 with the empirical chi square  618  with prob <  8.3e-123 
## The total number of observations was  1391  with Likelihood Chi Square =  534  with prob <  5.6e-105 
## 
## Tucker Lewis Index of factoring reliability =  0.54
## RMSEA index =  0.16  and the 90 % confidence intervals are  0.15 0.17
## BIC =  433
## Fit based upon off diagonal values = 0.86
## Measures of factor score adequacy             
##                                                    MR1
## Correlation of (regression) scores with factors   0.85
## Multiple R square of scores with factors          0.72
## Minimum correlation of possible factor scores     0.43

#save scores
d_ping$CA = fa$scores %>% as.vector() %>% standardize()


# only age control --------------------------------------------------------

d_cog2 = controller(d_cog, gender = F)
wtd.cors(d_cog2)

##                IBAM list_sort pattern_recog vocab reading flanker card_sort
## IBAM          1.000      0.34          0.13  0.22    0.20   0.099      0.16
## list_sort     0.337      1.00          0.18  0.34    0.31   0.218      0.30
## pattern_recog 0.132      0.18          1.00  0.13    0.14   0.288      0.31
## vocab         0.222      0.34          0.13  1.00    0.56   0.172      0.22
## reading       0.203      0.31          0.14  0.56    1.00   0.177      0.21
## flanker       0.099      0.22          0.29  0.17    0.18   1.000      0.52
## card_sort     0.160      0.30          0.31  0.22    0.21   0.518      1.00

#FA
(fa2 = fa(d_cog2))

## Factor Analysis using method =  minres
## Call: fa(r = d_cog2)
## Standardized loadings (pattern matrix) based upon correlation matrix
##                MR1   h2   u2 com
## IBAM          0.37 0.14 0.86   1
## list_sort     0.57 0.32 0.68   1
## pattern_recog 0.37 0.14 0.86   1
## vocab         0.57 0.33 0.67   1
## reading       0.55 0.31 0.69   1
## flanker       0.49 0.24 0.76   1
## card_sort     0.57 0.33 0.67   1
## 
##                 MR1
## SS loadings    1.80
## Proportion Var 0.26
## 
## Mean item complexity =  1
## Test of the hypothesis that 1 factor is sufficient.
## 
## The degrees of freedom for the null model are  21  and the objective function was  1.2 with Chi Square of  1717
## The degrees of freedom for the model are 14  and the objective function was  0.38 
## 
## The root mean square of the residuals (RMSR) is  0.1 
## The df corrected root mean square of the residuals is  0.13 
## 
## The harmonic number of observations is  1369 with the empirical chi square  608  with prob <  1.1e-120 
## The total number of observations was  1391  with Likelihood Chi Square =  528  with prob <  1.1e-103 
## 
## Tucker Lewis Index of factoring reliability =  0.55
## RMSEA index =  0.16  and the 90 % confidence intervals are  0.15 0.17
## BIC =  427
## Fit based upon off diagonal values = 0.86
## Measures of factor score adequacy             
##                                                    MR1
## Correlation of (regression) scores with factors   0.85
## Multiple R square of scores with factors          0.72
## Minimum correlation of possible factor scores     0.43

#save scores
d_ping$CA2 = fa2$scores %>% as.vector() %>% standardize()


# compare loadings -----------------------------------------------------------------

fa_plot_loadings(list(age_gender_controlled = fa,
                      age_controlled = fa2))

# factor analyze by SIRE --------------------------------------------------
#split into subset for largest SIRE groups
d_cog2_SIRE = list(
  White = d_cog2[d_ping$SIRE_standard == "White", ],
  African_American = d_cog2[d_ping$SIRE_standard == "African_American", ],
  Hispanic = d_cog2[d_ping$SIRE_standard == "Hispanic", ],
  Multi_ethnic = d_cog2[d_ping$SIRE_standard == "Multi_ethnic", ],
  Asian = d_cog2[d_ping$SIRE_standard == "Asian", ]
)

#factor analyze each
d_cog2_SIRE_fa = map(d_cog2_SIRE, ~fa(.))

#extract loadings
fa_congruence_matrix(d_cog2_SIRE_fa) %>% 
  set_colnames(names(d_cog2_SIRE)) %>% 
  set_rownames(names(d_cog2_SIRE))

##                  White African_American Hispanic Multi_ethnic Asian
## White             1.00             0.95     0.96         0.98  0.97
## African_American  0.95             1.00     0.99         0.96  0.87
## Hispanic          0.96             0.99     1.00         0.98  0.91
## Multi_ethnic      0.98             0.96     0.98         1.00  0.97
## Asian             0.97             0.87     0.91         0.97  1.00

S factor analyses

Impute missing data, factor analyze, score.

### S FACTOR



# missing ------------------------------------------------------------------
d_S = d_ping[v_S]

#how much missing?
miss_amount(d_S)

## cases with missing data  vars with missing data cells with missing data 
##                     0.5                     1.0                     0.2

#intercors
wtd.cors(d_S) %>% MAT_half() %>% psych::describe()

##    vars  n mean   sd median trimmed  mad  min  max range  skew kurtosis   se
## X1    1 10 0.55 0.07   0.55    0.55 0.05 0.43 0.66  0.23 -0.19     -1.1 0.02

#view distribution
miss_plot(d_S)

#impute all cases with any data
d_S = miss_impute(d_S, max_na = 4)

#without imputed numbers
d_ping[c(v_S, "CA")] %>% wtd.cors()

##              occupation_1 occupation_2 education_1 education_2 hh_income   CA
## occupation_1         1.00         0.53        0.64        0.43      0.57 0.32
## occupation_2         0.53         1.00        0.46        0.66      0.59 0.34
## education_1          0.64         0.46        1.00        0.57      0.54 0.34
## education_2          0.43         0.66        0.57        1.00      0.53 0.36
## hh_income            0.57         0.59        0.54        0.53      1.00 0.35
## CA                   0.32         0.34        0.34        0.36      0.35 1.00

# cors --------------------------------------------------------------------

wtd.cors(cbind(d_S, d_ping$CA))

##              occupation_1 occupation_2 education_1 education_2 hh_income d_ping$CA
## occupation_1         1.00         0.66        0.66        0.51      0.60      0.32
## occupation_2         0.66         1.00        0.53        0.74      0.73      0.36
## education_1          0.66         0.53        1.00        0.65      0.56      0.34
## education_2          0.51         0.74        0.65        1.00      0.63      0.38
## hh_income            0.60         0.73        0.56        0.63      1.00      0.35
## d_ping$CA            0.32         0.36        0.34        0.38      0.35      1.00

wtd.cors(d_S) %>% MAT_half() %>% averages()

## arithmetic  geometric   harmonic       mode     median    trimmed   midrange 
##       0.63       0.62       0.62       0.66       0.64       0.63       0.63

# FA ----------------------------------------------------------------------

(fa_S = fa(d_S))

## Factor Analysis using method =  minres
## Call: fa(r = d_S)
## Standardized loadings (pattern matrix) based upon correlation matrix
##               MR1   h2   u2 com
## occupation_1 0.76 0.57 0.43   1
## occupation_2 0.86 0.74 0.26   1
## education_1  0.74 0.54 0.46   1
## education_2  0.80 0.65 0.35   1
## hh_income    0.80 0.64 0.36   1
## 
##                 MR1
## SS loadings    3.14
## Proportion Var 0.63
## 
## Mean item complexity =  1
## Test of the hypothesis that 1 factor is sufficient.
## 
## The degrees of freedom for the null model are  10  and the objective function was  3.1 with Chi Square of  4300
## The degrees of freedom for the model are 5  and the objective function was  0.36 
## 
## The root mean square of the residuals (RMSR) is  0.06 
## The df corrected root mean square of the residuals is  0.09 
## 
## The harmonic number of observations is  1380 with the empirical chi square  109  with prob <  6.8e-22 
## The total number of observations was  1391  with Likelihood Chi Square =  493  with prob <  2.1e-104 
## 
## Tucker Lewis Index of factoring reliability =  0.77
## RMSEA index =  0.27  and the 90 % confidence intervals are  0.25 0.28
## BIC =  457
## Fit based upon off diagonal values = 0.99
## Measures of factor score adequacy             
##                                                    MR1
## Correlation of (regression) scores with factors   0.95
## Multiple R square of scores with factors          0.90
## Minimum correlation of possible factor scores     0.80

#scores
d_ping$S = fa_S$scores %>% as.vector() %>% standardize()

Ancestry & SIRE

Some analyses of ancestry and SIRE.

### ancestry analysis


# cross-tabulate ----------------------------------------------------------
#simple analyses
#binary measures of congruence between genetic ancestry and SIRE
table(d_ping$SIRE_standard, d_ping$largest_ancestry) %>% 
  prop.table(margin = 1) %>% 
  unclass() %>%
  write_clipboard()

##                  Africa Amerind CentralAsia EastAsia Europe
## White              0.00    0.00        0.00     0.00   1.00
## African American   0.99    0.00        0.00     0.00   0.01
## American Indian    0.25    0.25        0.00     0.00   0.50
## Asian              0.00    0.00        0.13     0.83   0.04
## Hispanic           0.09    0.07        0.00     0.10   0.73
## Multi ethnic       0.10    0.00        0.02     0.39   0.49
## Other              0.21    0.00        0.17     0.00   0.62
## Pacific Islander   0.00    0.00        0.00     0.94   0.06

table(d_ping$SIRE_standard, d_ping$largest_ancestry_twothirds) %>% 
  prop.table(margin = 1) %>% 
  unclass() %>% 
  write_clipboard()

##                  Africa Amerind CentralAsia EastAsia Europe Mixed
## White              0.00    0.00        0.00     0.00   0.99  0.01
## African American   0.88    0.00        0.00     0.00   0.00  0.12
## American Indian    0.25    0.00        0.00     0.00   0.50  0.25
## Asian              0.00    0.00        0.13     0.77   0.02  0.08
## Hispanic           0.03    0.01        0.00     0.05   0.36  0.55
## Multi ethnic       0.03    0.00        0.01     0.17   0.20  0.59
## Other              0.21    0.00        0.17     0.00   0.46  0.17
## Pacific Islander   0.00    0.00        0.00     0.75   0.00  0.25

table(d_ping$SIRE_standard, d_ping$largest_ancestry_half) %>% 
  prop.table(margin = 1) %>% 
  unclass() %>% 
  write_clipboard()

##                  Africa Amerind CentralAsia EastAsia Europe Mixed
## White              0.00    0.00        0.00     0.00   1.00  0.00
## African American   0.98    0.00        0.00     0.00   0.01  0.01
## American Indian    0.25    0.25        0.00     0.00   0.50  0.00
## Asian              0.00    0.00        0.13     0.81   0.04  0.02
## Hispanic           0.08    0.05        0.00     0.06   0.59  0.22
## Multi ethnic       0.06    0.00        0.02     0.35   0.44  0.12
## Other              0.21    0.00        0.17     0.00   0.54  0.08
## Pacific Islander   0.00    0.00        0.00     0.94   0.00  0.06

# mean ancestry by SIRE ---------------------------------------------------

ddply(d_ping, .variables = "SIRE_standard", .fun = function(block) {
  #select
  tmp = block[v_genomic]
  
  #mean
  c(n = count_NA(tmp[[1]], reverse = T), colMeans(tmp, na.rm=T))
}) %>% 
  write_clipboard()

##      SIRE standard      N European African Amerindian East Asian Oceanian Central Asian
## 1            White 571.00     0.97    0.00       0.01       0.00     0.00          0.01
## 2 African_American 140.00     0.17    0.81       0.00       0.00     0.00          0.01
## 3  American_Indian   4.00     0.66    0.21       0.13       0.00     0.01          0.00
## 4            Asian 122.00     0.06    0.00       0.00       0.80     0.01          0.13
## 5         Hispanic 328.00     0.57    0.14       0.19       0.09     0.01          0.01
## 6     Multi_ethnic 186.00     0.45    0.11       0.01       0.37     0.03          0.03
## 7            Other  24.00     0.55    0.26       0.01       0.01     0.00          0.18
## 8 Pacific_Islander  16.00     0.11    0.01       0.00       0.70     0.17          0.00

# SD ----------------------------------------------------------------------

ddply(d_ping, .variables = "SIRE_standard", .fun = function(block) {
  #select
  tmp = block[v_genomic]
  
  #summarize
  c(n = count_NA(tmp[[1]], reverse = T), sapply(tmp, sd, na.rm=T))
}) %>%
  write_clipboard()

##      SIRE standard      N European African Amerindian East Asian Oceanian Central Asian
## 1            White 571.00     0.07    0.04       0.02       0.04     0.00          0.04
## 2 African_American 140.00     0.11    0.12       0.01       0.03     0.01          0.02
## 3  American_Indian   4.00     0.42    0.42       0.25       0.00     0.01          0.00
## 4            Asian 122.00     0.17    0.02       0.01       0.36     0.01          0.32
## 5         Hispanic 328.00     0.23    0.20       0.17       0.20     0.02          0.03
## 6     Multi_ethnic 186.00     0.27    0.21       0.02       0.29     0.05          0.12
## 7            Other  24.00     0.36    0.33       0.03       0.02     0.01          0.32
## 8 Pacific_Islander  16.00     0.15    0.04       0.00       0.13     0.06          0.00

# correlations between SIRE continuous and genomic ancestry ---------------

wtd.cors(d_ping[c(v_SIREs_con, v_genomic)]) %>% 
  #change order manually so they match up as well as possible
  .[v_SIREs_con[c(7, 4, 1, 3, 2, 5, 6)], v_genomic] %>% 
  write_clipboard()

##                    European African Amerindian East Asian Oceanian Central Asian
## White c                0.86   -0.50      -0.23      -0.44    -0.22         -0.13
## African American c    -0.45    0.92      -0.14      -0.18    -0.08         -0.06
## Hispanic c            -0.08    0.00       0.75      -0.15    -0.06         -0.08
## Asian c               -0.56   -0.20      -0.16       0.81     0.09          0.27
## Pacific Islander c    -0.28   -0.10      -0.07       0.39     0.81         -0.03
## American Indian c      0.00    0.03       0.09      -0.04    -0.01         -0.04
## Other c               -0.03    0.06      -0.04      -0.06    -0.03          0.16

#SIREc sums
d_ping[v_SIREs_con] %>% colSums(na.rm=T)

##         Hispanic_c Pacific_Islander_c            Asian_c African_American_c  American_Indian_c            Other_c            White_c 
##                187                 55                196                179                 27                 24                723

# overall ancestry summary stats ---------------------------------------------------

psych::describe(d_ping[v_genomic])

##               vars    n mean   sd median trimmed mad min  max range  skew kurtosis   se
## European         1 1391 0.63 0.37   0.73    0.66 0.4   0 1.00  1.00 -0.46     -1.3 0.01
## African          2 1391 0.14 0.27   0.00    0.06 0.0   0 1.00  1.00  1.94      2.3 0.01
## Amerindian       3 1391 0.05 0.12   0.00    0.02 0.0   0 0.83  0.83  2.96      9.3 0.00
## East_Asian       4 1391 0.15 0.30   0.00    0.07 0.0   0 1.00  1.00  1.89      2.1 0.01
## Oceanian         5 1391 0.01 0.03   0.00    0.00 0.0   0 0.25  0.25  5.22     30.2 0.00
## Central_Asian    6 1391 0.02 0.12   0.00    0.00 0.0   0 1.00  1.00  6.56     44.6 0.00

Subsamples

We create a few subsamples of interest.

# subgroups ---------------------------------------------------------------
#define subsets of interest
d_AA_narrow = d_ping[d_ping$SIRE_standard == "African_American", ]
d_AA_broad = d_ping[d_ping$African_American, ]
d_HA = d_ping[d_ping$Hispanic, ]
d_nW = d_ping %>% dplyr::filter(SIRE_standard != "White")

#get Ws and AAs by standard coding
d_W_AA = filter(d_ping, SIRE_standard %in% c("African_American", "White"))

#get Ws and Hs by standard coding
d_W_H = filter(d_ping, SIRE_standard %in% c("Hispanic", "White"))

Simple results

#simple results

# bivariate ---------------------------------------------------------------
wtd.cors(d_ping[c("CA", "S", v_genomic)]) %>% write_clipboard()

##                  CA     S European African Amerindian East Asian Oceanian Central Asian
## CA             1.00  0.42     0.23   -0.33      -0.15       0.05    -0.08          0.06
## S              0.42  1.00     0.32   -0.30      -0.24      -0.06    -0.20          0.12
## European       0.23  0.32     1.00   -0.50      -0.10      -0.62    -0.31         -0.20
## African       -0.33 -0.30    -0.50    1.00      -0.08      -0.22    -0.10         -0.07
## Amerindian    -0.15 -0.24    -0.10   -0.08       1.00      -0.16    -0.09         -0.07
## East Asian     0.05 -0.06    -0.62   -0.22      -0.16       1.00     0.42         -0.07
## Oceanian      -0.08 -0.20    -0.31   -0.10      -0.09       0.42     1.00         -0.03
## Central Asian  0.06  0.12    -0.20   -0.07      -0.07      -0.07    -0.03          1.00

d_ping[c("CA", "S", v_genomic)] %>% count.pairwise()

##                 CA    S European African Amerindian East_Asian Oceanian Central_Asian
## CA            1369 1363     1369    1369       1369       1369     1369          1369
## S             1363 1380     1380    1380       1380       1380     1380          1380
## European      1369 1380     1391    1391       1391       1391     1391          1391
## African       1369 1380     1391    1391       1391       1391     1391          1391
## Amerindian    1369 1380     1391    1391       1391       1391     1391          1391
## East_Asian    1369 1380     1391    1391       1391       1391     1391          1391
## Oceanian      1369 1380     1391    1391       1391       1391     1391          1391
## Central_Asian 1369 1380     1391    1391       1391       1391     1391          1391

#group differences
d_ping %$% SMD_matrix(CA, SIRE_standard)

##                  White African_American American_Indian  Asian Hispanic Multi_ethnic Other Pacific_Islander
## White               NA             1.03           0.092  0.027     0.66         0.25  0.36             0.82
## African_American 1.029               NA          -0.936 -1.001    -0.37        -0.78 -0.67            -0.21
## American_Indian  0.092            -0.94              NA -0.065     0.57         0.16  0.27             0.72
## Asian            0.027            -1.00          -0.065     NA     0.63         0.23  0.33             0.79
## Hispanic         0.659            -0.37           0.566  0.631       NA        -0.41 -0.30             0.16
## Multi_ethnic     0.253            -0.78           0.161  0.226    -0.41           NA  0.11             0.56
## Other            0.361            -0.67           0.269  0.334    -0.30         0.11    NA             0.45
## Pacific_Islander 0.816            -0.21           0.723  0.788     0.16         0.56  0.45               NA

d_ping %$% SMD_matrix(S, SIRE_standard)

##                  White African_American American_Indian Asian Hispanic Multi_ethnic Other Pacific_Islander
## White               NA             1.07            1.58  0.23     0.94         0.50  0.61             1.81
## African_American  1.07               NA            0.51 -0.84    -0.13        -0.57 -0.46             0.74
## American_Indian   1.58             0.51              NA -1.35    -0.64        -1.08 -0.97             0.23
## Asian             0.23            -0.84           -1.35    NA     0.72         0.28  0.38             1.58
## Hispanic          0.94            -0.13           -0.64  0.72       NA        -0.44 -0.33             0.86
## Multi_ethnic      0.50            -0.57           -1.08  0.28    -0.44           NA  0.11             1.31
## Other             0.61            -0.46           -0.97  0.38    -0.33         0.11    NA             1.20
## Pacific_Islander  1.81             0.74            0.23  1.58     0.86         1.31  1.20               NA

#sample sizes
table2(d_ping$SIRE_standard)

## # A tibble: 9 x 3
##   Group            Count Percent
##   <chr>            <dbl>   <dbl>
## 1 White              571  41.0  
## 2 Hispanic           328  23.6  
## 3 Multi_ethnic       186  13.4  
## 4 African_American   140  10.1  
## 5 Asian              122   8.77 
## 6 Other               24   1.73 
## 7 Pacific_Islander    16   1.15 
## 8 American_Indian      4   0.288
## 9 <NA>                 0   0

Descriptives

#redefine function to avoid error
describe = function(...) {
  y = psych::describe(...)
  class(y) = "data.frame"
  y
}

# full sample ---------------------------------------------------------------------

describe(d_ping$Age)

##    vars    n mean sd median trimmed mad min max range skew kurtosis   se
## X1    1 1386   12  5     11      12 5.9   3  21    18 0.18     -1.1 0.13

table2(d_ping$Gender)

## # A tibble: 3 x 3
##   Group Count Percent
##   <chr> <dbl>   <dbl>
## 1 M       719    51.7
## 2 F       672    48.3
## 3 <NA>      0     0

describe(d_ping$CA)

##    vars    n     mean sd median trimmed  mad  min max range  skew kurtosis    se
## X1    1 1369 -9.7e-18  1   0.08   0.036 0.96 -3.4 3.2   6.6 -0.35     0.13 0.027

describe(d_ping$S)

##    vars    n     mean sd median trimmed mad min max range  skew kurtosis    se
## X1    1 1380 -6.6e-18  1   0.11   0.049   1  -3 1.8   4.8 -0.43    -0.43 0.027

# by subgroup -------------------------------------------------------------

describeBy(d_ping %>% select(Age, CA, S), d_ping$SIRE_standard)

## 
##  Descriptive statistics by group 
## group: White
##     vars   n  mean   sd median trimmed  mad  min  max range  skew kurtosis   se
## Age    1 571 11.62 4.81  11.25   11.50 5.68  3.2 21.0  17.8  0.17    -1.01 0.20
## CA     2 567  0.29 0.89   0.32    0.32 0.77 -3.4  3.2   6.6 -0.51     1.36 0.04
## S      3 570  0.41 0.78   0.52    0.45 0.76 -1.8  1.8   3.7 -0.49    -0.21 0.03
## ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ 
## group: African_American
##     vars   n  mean   sd median trimmed  mad  min  max range  skew kurtosis   se
## Age    1 140 12.50 4.68  12.58   12.55 5.56  3.1 20.8  17.7 -0.06    -0.93 0.40
## CA     2 138 -0.68 0.88  -0.66   -0.68 0.83 -2.9  1.8   4.7  0.02    -0.09 0.07
## S      3 139 -0.56 0.96  -0.52   -0.57 1.12 -2.3  1.5   3.8  0.11    -0.77 0.08
## ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ 
## group: American_Indian
##     vars n  mean   sd median trimmed  mad   min   max range  skew kurtosis   se
## Age    1 4 11.62 3.49  12.16   11.62 3.40  7.33 14.83   7.5 -0.20     -2.2 1.75
## CA     2 4  0.21 1.26  -0.17    0.21 0.74 -0.84  1.99   2.8  0.53     -1.9 0.63
## S      3 4 -1.03 0.92  -0.90   -1.03 0.92 -2.13 -0.19   1.9 -0.17     -2.2 0.46
## ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ 
## group: Asian
##     vars   n  mean   sd median trimmed  mad  min  max range  skew kurtosis   se
## Age    1 122 13.64 5.78  15.42   13.98 6.55  3.1 21.0  17.9 -0.35    -1.40 0.52
## CA     2 120  0.27 0.88   0.50    0.29 0.73 -2.5  2.5   5.0 -0.39    -0.01 0.08
## S      3 122  0.20 1.03   0.38    0.26 1.04 -3.0  1.8   4.8 -0.65     0.30 0.09
## ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ 
## group: Hispanic
##     vars   n  mean   sd median trimmed  mad  min  max range  skew kurtosis   se
## Age    1 328 11.31 4.89  10.41   11.10 5.63  3.2 21.0  17.8  0.34    -1.01 0.27
## CA     2 323 -0.33 0.98  -0.24   -0.30 0.99 -3.1  2.1   5.3 -0.23    -0.23 0.05
## S      3 328 -0.45 0.94  -0.42   -0.44 1.05 -2.9  1.8   4.7 -0.08    -0.53 0.05
## ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ 
## group: Multi_ethnic
##     vars   n  mean  sd median trimmed mad  min  max range  skew kurtosis   se
## Age    1 186 10.86 5.0   9.71   10.60 5.9  3.0 20.9  17.9  0.37    -0.96 0.37
## CA     2 182  0.05 1.1   0.08    0.09 1.1 -2.6  2.8   5.4 -0.26    -0.40 0.08
## S      3 186 -0.05 1.1   0.16    0.01 1.1 -2.3  1.8   4.1 -0.42    -0.75 0.08
## ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ 
## group: Other
##     vars  n  mean   sd median trimmed mad  min   max range  skew kurtosis   se
## Age    1 19 14.45 4.21  14.17   14.64 3.8  5.4 20.25  14.8 -0.41    -0.81 0.96
## CA     2 19 -0.05 0.84   0.15    0.03 0.7 -2.3  0.89   3.2 -1.02     0.32 0.19
## S      3 15 -0.15 1.00  -0.18   -0.07 1.1 -2.5  1.28   3.8 -0.56    -0.20 0.26
## ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ 
## group: Pacific_Islander
##     vars  n  mean  sd median trimmed  mad  min   max range  skew kurtosis   se
## Age    1 16  7.10 2.7   6.54    6.79 1.66  3.8 14.75  10.9  1.30     1.35 0.68
## CA     2 16 -0.48 1.3  -0.17   -0.52 0.64 -2.6  2.25   4.9 -0.22    -0.52 0.33
## S      3 16 -1.24 1.2  -1.42   -1.28 1.51 -2.5  0.61   3.1  0.38    -1.53 0.30

plyr::dlply(d_ping, "SIRE_standard", function(dd) table2(dd$Gender))

## $White
## # A tibble: 3 x 3
##   Group Count Percent
##   <chr> <dbl>   <dbl>
## 1 M       303    53.1
## 2 F       268    46.9
## 3 <NA>      0     0  
## 
## $African_American
## # A tibble: 3 x 3
##   Group Count Percent
##   <chr> <dbl>   <dbl>
## 1 M        78    55.7
## 2 F        62    44.3
## 3 <NA>      0     0  
## 
## $American_Indian
## # A tibble: 2 x 3
##   Group Count Percent
##   <chr> <dbl>   <dbl>
## 1 M         4     100
## 2 <NA>      0       0
## 
## $Asian
## # A tibble: 3 x 3
##   Group Count Percent
##   <chr> <dbl>   <dbl>
## 1 F        80    65.6
## 2 M        42    34.4
## 3 <NA>      0     0  
## 
## $Hispanic
## # A tibble: 3 x 3
##   Group Count Percent
##   <chr> <dbl>   <dbl>
## 1 M       175    53.4
## 2 F       153    46.6
## 3 <NA>      0     0  
## 
## $Multi_ethnic
## # A tibble: 3 x 3
##   Group Count Percent
##   <chr> <dbl>   <dbl>
## 1 M        95    51.1
## 2 F        91    48.9
## 3 <NA>      0     0  
## 
## $Other
## # A tibble: 3 x 3
##   Group Count Percent
##   <chr> <dbl>   <dbl>
## 1 M        15    62.5
## 2 F         9    37.5
## 3 <NA>      0     0  
## 
## $Pacific_Islander
## # A tibble: 3 x 3
##   Group Count Percent
##   <chr> <dbl>   <dbl>
## 1 F         9    56.2
## 2 M         7    43.8
## 3 <NA>      0     0  
## 
## attr(,"split_type")
## [1] "data.frame"
## attr(,"split_labels")
##      SIRE_standard
## 1            White
## 2 African_American
## 3  American_Indian
## 4            Asian
## 5         Hispanic
## 6     Multi_ethnic
## 7            Other
## 8 Pacific_Islander

Modeling

After having done the initial analyses, we do the main modeling. This outputs quite a lot. Normally, one would do this interactively and only print the necessary stuff, but when the output is a static file, we will have to print everything.

### MAIN MODELING

# main modeling function ----------------------------------------------------------------
#we make a function that does the analyses for us given an outcome variable

modeller = function(outcome, genomics, include_S_as_pred = F, data = d_ping) {
  outcome
  genomics
  data
  
  #variations
  models = expand.grid(SIRE = c("standard", "common", "numeric", "dummy", "none"),
                       genomics = c(T, F),
                       stringsAsFactors = F) %>% 
    as_data_frame()
  
  #add list col
  models[["fit"]] = replicate(nrow(models), list())
  
  #remove null model
  models = models[-nrow(models), ]
  
  #loop de loop
  for (i in 1:nrow(models)) {
    
    this_to_add = c()
    this_pars = models[i, ]
    
    #build model
    this_model = outcome + " ~ "
    
    #parental S as control?
    if (include_S_as_pred) {
      this_model = this_model + "S + "
    }
    
    #SIRE
    switch(this_pars$SIRE,
           standard = {this_to_add = "SIRE_standard"},
           common = {this_to_add = "SIRE_common"},
           dummy = {this_to_add = v_SIREs},
           numeric = {this_to_add = v_SIREs_con[-length(v_SIREs_con)]}, #drop last predictor to avoid perfect multicollinearity
           none = NULL
    )
    
    #genomics
    switch(this_pars$genomics,
           T = {this_to_add = c(this_to_add, genomics)},
           F = NULL
    )
    
    #first language
    if (!exclude_nonenglish_first_language) {
      this_to_add = c(this_to_add, "EFL")
    }
    
    #make model
    this_model = this_model + str_c(this_to_add, collapse = " + ")
    
    #fit model
    models$fit[i] = list(lm(this_model, data = data) %>% MOD_summary(progress = F, standardize = F, kfold = F))
    
  }
  
  models
}


# restructure function ----------------------------------------------------
#another function that restructures a bunch of model results
MOD_restructure = function(x) {
  
  #get coefs
  coefs = map(x, "coefs")
  etas = map(x, c("aov_etas")) %>% map(~as.data.frame(.) %>% `[`("Eta"))
  
  #add id's if none present
  if (!has_names(coefs)) names(coefs) = seq_along(coefs)
  if (!has_names(etas)) names(coefs) = seq_along(coefs)
  
  #restructure 
  list(
    coefs = map(coefs, rownames_to_column, var = "Predictor") %>% ldf_to_df(by_name = "model_id"),
    etas = map(etas, rownames_to_column, var = "Predictor") %>% ldf_to_df(by_name = "model_id")
  )
  
}


# main regression -----------------------------------------------------------

#models without European (reference)
all_models_nonEuro = map(c("CA", "S"), modeller, genomics = v_genomic[-1])

#same as above, but include parental S as predictor
all_models_nonEuro_S = modeller(outcome = "CA", genomics = v_genomic[-1], include_S_as_pred = T)

#this is models only with European ancestry as predictor
#showing the importance of using multiple ancestries, unlike what was previous found
all_models_euro = map(c("CA", "S"), modeller, genomics = "European")

#restructure model results
betas = rbind(
  MOD_restructure(all_models_nonEuro[[1]][["fit"]]) %>% `[[`("coefs") %>% mutate(outcome = "CA"),
  MOD_restructure(all_models_nonEuro[[2]][["fit"]]) %>% `[[`("coefs") %>% mutate(outcome = "S")
)

etas = rbind(
  MOD_restructure(all_models_nonEuro[[1]][["fit"]]) %>% `[[`("etas") %>% mutate(outcome = "CA"),
  MOD_restructure(all_models_nonEuro[[2]][["fit"]]) %>% `[[`("etas") %>% mutate(outcome = "S")
)

#classify predictors
etas %<>% mutate(class = map_chr(Predictor, function(pred) {
  if (pred %in% v_genomic) return("Genomic ancestry")
  if (pred == "EFL") return("Linguistic")
  "SIRE"
}))

#summarize etas
etas_summary = ddply(etas, c("model_id", "outcome", "class"), function(d) {
  d$Eta %>% `^`(2) %>% sum %>% sqrt %>% set_names("total_eta")
})

#Parental models
etas_parental = MOD_restructure(all_models_nonEuro_S[["fit"]]) %>% `[[`("etas")

#classify predictors
etas_parental %<>% mutate(class = map_chr(Predictor, function(pred) {
  if (pred %in% v_genomic) return("Genomic ancestry")
  if (pred == "EFL") return("Linguistic")
  if (pred == "S") return("Parental")
  "SIRE"
}))

#summarize etas
etas_summary_parental = ddply(etas_parental, c("model_id", "class"), function(d) {
  d$Eta %>% `^`(2) %>% sum %>% sqrt %>% set_names("total_eta")
})



# main models output ------------------------------------------------------------

#all non-European ancestry + SIREs
#CA
all_models_nonEuro[[1]][["fit"]]

## [[1]]
## 
##     ---- Model summary ----    
## Model coefficients
##                                   Beta    SE CI_lower CI_upper
## SIRE_standard: White             0.000    NA       NA       NA
## SIRE_standard: African_American  0.305 0.184   -0.055     0.67
## SIRE_standard: American_Indian   0.408 0.464   -0.503     1.32
## SIRE_standard: Asian            -0.071 0.189   -0.442     0.30
## SIRE_standard: Hispanic         -0.165 0.099   -0.359     0.03
## SIRE_standard: Multi_ethnic      0.038 0.110   -0.178     0.25
## SIRE_standard: Other            -0.029 0.228   -0.476     0.42
## SIRE_standard: Pacific_Islander  0.023 0.308   -0.581     0.63
## African                         -1.580 0.201   -1.975    -1.19
## Amerindian                      -1.152 0.309   -1.757    -0.55
## East_Asian                       0.058 0.186   -0.307     0.42
## Oceanian                        -4.809 1.273   -7.307    -2.31
## Central_Asian                    0.209 0.255   -0.291     0.71
## EFL                              0.030 0.085   -0.137     0.20
## 
## 
## Model meta-data
##   outcome    N   df   R2 R2-adj. R2-cv
## 1      CA 1369 1355 0.17    0.16    NA
## 
## 
## Etas from analysis of variance
##                  Eta Eta_partial
## SIRE_standard 0.0881      0.0961
## African       0.1947      0.2087
## Amerindian    0.0925      0.1009
## East_Asian    0.0078      0.0085
## Oceanian      0.0936      0.1021
## Central_Asian 0.0203      0.0223
## EFL           0.0086      0.0095
## 
## [[2]]
## 
##     ---- Model summary ----    
## Model coefficients
##                                               Beta    SE CI_lower CI_upper
## SIRE_common: White                           0.000    NA       NA       NA
## SIRE_common: African_American                0.086 0.212   -0.330    0.502
## SIRE_common: Asian                          -0.014 0.197   -0.400    0.372
## SIRE_common: Asian, White                    0.315 0.153    0.015    0.615
## SIRE_common: Hispanic                       -0.421 0.162   -0.739   -0.104
## SIRE_common: Hispanic, African_American     -0.382 0.231   -0.836    0.071
## SIRE_common: Hispanic, White                -0.135 0.111   -0.354    0.083
## SIRE_common: Pacific_Islander, Asian, White -0.186 0.189   -0.557    0.185
## SIRE_common: Other                          -0.162 0.111   -0.379    0.056
## African                                     -1.309 0.240   -1.780   -0.838
## Amerindian                                  -0.953 0.324   -1.589   -0.317
## East_Asian                                  -0.013 0.197   -0.399    0.373
## Oceanian                                    -3.331 1.144   -5.575   -1.086
## Central_Asian                                0.158 0.258   -0.348    0.665
## EFL                                          0.033 0.085   -0.133    0.199
## 
## 
## Model meta-data
##   outcome    N   df   R2 R2-adj. R2-cv
## 1      CA 1369 1354 0.17    0.17    NA
## 
## 
## Etas from analysis of variance
##                  Eta Eta_partial
## SIRE_common   0.1192      0.1301
## African       0.1346      0.1465
## Amerindian    0.0726      0.0796
## East_Asian    0.0017      0.0018
## Oceanian      0.0719      0.0789
## Central_Asian 0.0151      0.0166
## EFL           0.0096      0.0106
## 
## [[3]]
## 
##     ---- Model summary ----    
## Model coefficients
##                      Beta    SE CI_lower CI_upper
## Hispanic_c         -0.460 0.153    -0.76    -0.16
## Pacific_Islander_c -0.781 0.311    -1.39    -0.17
## Asian_c            -0.038 0.223    -0.48     0.40
## African_American_c  0.287 0.238    -0.18     0.75
## American_Indian_c  -0.241 0.266    -0.76     0.28
## Other_c            -0.035 0.232    -0.49     0.42
## African            -1.594 0.273    -2.13    -1.06
## Amerindian         -0.753 0.329    -1.40    -0.11
## East_Asian          0.061 0.233    -0.40     0.52
## Oceanian           -1.566 1.495    -4.50     1.37
## Central_Asian       0.178 0.278    -0.37     0.72
## EFL                 0.036 0.084    -0.13     0.20
## 
## 
## Model meta-data
##   outcome    N   df   R2 R2-adj. R2-cv
## 1      CA 1369 1356 0.17    0.17    NA
## 
## 
## Etas from analysis of variance
##                       Eta Eta_partial
## Hispanic_c         0.0743      0.0815
## Pacific_Islander_c 0.0619      0.0680
## Asian_c            0.0042      0.0047
## African_American_c 0.0297      0.0326
## American_Indian_c  0.0224      0.0246
## Other_c            0.0037      0.0041
## African            0.1442      0.1567
## Amerindian         0.0565      0.0620
## East_Asian         0.0064      0.0071
## Oceanian           0.0259      0.0284
## Central_Asian      0.0158      0.0174
## EFL                0.0106      0.0116
## 
## [[4]]
## 
##     ---- Model summary ----    
## Model coefficients
##                    Beta    SE CI_lower CI_upper
## Hispanic         -0.182 0.081   -0.342   -0.022
## Pacific_Islander -0.440 0.134   -0.703   -0.178
## Asian             0.205 0.119   -0.028    0.438
## African_American  0.263 0.151   -0.033    0.559
## American_Indian  -0.105 0.119   -0.338    0.128
## Other             0.154 0.227   -0.290    0.599
## White             0.215 0.086    0.045    0.385
## African          -1.364 0.229   -1.813   -0.915
## Amerindian       -0.837 0.303   -1.432   -0.242
## East_Asian        0.021 0.188   -0.348    0.390
## Oceanian         -1.319 1.317   -3.904    1.265
## Central_Asian     0.149 0.248   -0.338    0.637
## EFL               0.046 0.084   -0.120    0.212
## 
## 
## Model meta-data
##   outcome    N   df   R2 R2-adj. R2-cv
## 1      CA 1369 1355 0.18    0.17    NA
## 
## 
## Etas from analysis of variance
##                     Eta Eta_partial
## Hispanic         0.0551      0.0607
## Pacific_Islander 0.0810      0.0889
## Asian            0.0424      0.0467
## African_American 0.0430      0.0473
## American_Indian  0.0217      0.0240
## Other            0.0168      0.0185
## White            0.0613      0.0674
## African          0.1469      0.1598
## Amerindian       0.0680      0.0747
## East_Asian       0.0028      0.0031
## Oceanian         0.0247      0.0272
## Central_Asian    0.0148      0.0163
## EFL              0.0135      0.0148
## 
## [[5]]
## 
##     ---- Model summary ----    
## Model coefficients
##                 Beta    SE CI_lower CI_upper
## African       -1.312 0.095    -1.50    -1.13
## Amerindian    -1.593 0.222    -2.03    -1.16
## East_Asian     0.022 0.094    -0.16     0.21
## Oceanian      -4.584 0.930    -6.41    -2.76
## Central_Asian  0.187 0.206    -0.22     0.59
## EFL            0.046 0.084    -0.12     0.21
## 
## 
## Model meta-data
##   outcome    N   df   R2 R2-adj. R2-cv
## 1      CA 1369 1362 0.16    0.16    NA
## 
## 
## Etas from analysis of variance
##                  Eta Eta_partial
## African       0.3439      0.3512
## Amerindian    0.1784      0.1910
## East_Asian    0.0057      0.0063
## Oceanian      0.1224      0.1323
## Central_Asian 0.0226      0.0246
## EFL           0.0136      0.0149
## 
## [[6]]
## 
##     ---- Model summary ----    
## Model coefficients
##                                   Beta    SE CI_lower CI_upper
## SIRE_standard: White             0.000    NA       NA       NA
## SIRE_standard: African_American -0.971 0.090    -1.15   -0.795
## SIRE_standard: American_Indian  -0.089 0.473    -1.02    0.839
## SIRE_standard: Asian            -0.016 0.097    -0.21    0.175
## SIRE_standard: Hispanic         -0.615 0.067    -0.75   -0.484
## SIRE_standard: Multi_ethnic     -0.238 0.080    -0.40   -0.081
## SIRE_standard: Other            -0.332 0.221    -0.76    0.101
## SIRE_standard: Pacific_Islander -0.761 0.240    -1.23   -0.291
## EFL                              0.039 0.085    -0.13    0.206
## 
## 
## Model meta-data
##   outcome    N   df   R2 R2-adj. R2-cv
## 1      CA 1369 1360 0.12    0.11    NA
## 
## 
## Etas from analysis of variance
##                 Eta Eta_partial
## SIRE_standard 0.341       0.341
## EFL           0.012       0.012
## 
## [[7]]
## 
##     ---- Model summary ----    
## Model coefficients
##                                               Beta    SE CI_lower CI_upper
## SIRE_common: White                           0.000    NA       NA       NA
## SIRE_common: African_American               -0.971 0.088    -1.14   -0.799
## SIRE_common: Asian                          -0.012 0.095    -0.20    0.176
## SIRE_common: Asian, White                    0.320 0.128     0.07    0.571
## SIRE_common: Hispanic                       -0.868 0.120    -1.10   -0.633
## SIRE_common: Hispanic, African_American     -1.222 0.179    -1.57   -0.871
## SIRE_common: Hispanic, White                -0.413 0.089    -0.59   -0.239
## SIRE_common: Pacific_Islander, Asian, White -0.398 0.168    -0.73   -0.069
## SIRE_common: Other                          -0.524 0.074    -0.67   -0.379
## EFL                                          0.054 0.084    -0.11    0.219
## 
## 
## Model meta-data
##   outcome    N   df   R2 R2-adj. R2-cv
## 1      CA 1369 1359 0.15    0.14    NA
## 
## 
## Etas from analysis of variance
##               Eta Eta_partial
## SIRE_common 0.386       0.387
## EFL         0.016       0.018
## 
## [[8]]
## 
##     ---- Model summary ----    
## Model coefficients
##                      Beta    SE CI_lower CI_upper
## Hispanic_c         -0.917 0.099    -1.11   -0.722
## Pacific_Islander_c -0.996 0.175    -1.34   -0.653
## Asian_c             0.036 0.088    -0.14    0.210
## African_American_c -1.016 0.084    -1.18   -0.852
## American_Indian_c  -0.615 0.261    -1.13   -0.103
## Other_c            -0.345 0.216    -0.77    0.078
## EFL                 0.050 0.083    -0.11    0.214
## 
## 
## Model meta-data
##   outcome    N   df   R2 R2-adj. R2-cv
## 1      CA 1369 1361 0.15    0.15    NA
## 
## 
## Etas from analysis of variance
##                      Eta Eta_partial
## Hispanic_c         0.231       0.243
## Pacific_Islander_c 0.143       0.153
## Asian_c            0.010       0.011
## African_American_c 0.304       0.313
## American_Indian_c  0.059       0.064
## Other_c            0.040       0.043
## EFL                0.015       0.016
## 
## [[9]]
## 
##     ---- Model summary ----    
## Model coefficients
##                    Beta    SE CI_lower CI_upper
## Hispanic         -0.341 0.062    -0.46    -0.22
## Pacific_Islander -0.479 0.095    -0.67    -0.29
## Asian             0.378 0.073     0.23     0.52
## African_American -0.460 0.083    -0.62    -0.30
## American_Indian  -0.108 0.121    -0.34     0.13
## Other             0.110 0.224    -0.33     0.55
## White             0.410 0.068     0.28     0.54
## EFL               0.052 0.083    -0.11     0.22
## 
## 
## Model meta-data
##   outcome    N   df   R2 R2-adj. R2-cv
## 1      CA 1369 1360 0.15    0.14    NA
## 
## 
## Etas from analysis of variance
##                    Eta Eta_partial
## Hispanic         0.137       0.147
## Pacific_Islander 0.126       0.135
## Asian            0.129       0.139
## African_American 0.139       0.149
## American_Indian  0.022       0.024
## Other            0.012       0.013
## White            0.151       0.161
## EFL              0.016       0.017

#S
all_models_nonEuro[[2]][["fit"]]

## [[1]]
## 
##     ---- Model summary ----    
## Model coefficients
##                                   Beta    SE CI_lower CI_upper
## SIRE_standard: White             0.000    NA       NA       NA
## SIRE_standard: African_American  0.283 0.175   -0.060    0.625
## SIRE_standard: American_Indian  -0.845 0.440   -1.709    0.018
## SIRE_standard: Asian            -0.033 0.178   -0.383    0.316
## SIRE_standard: Hispanic         -0.209 0.094   -0.393   -0.026
## SIRE_standard: Multi_ethnic     -0.027 0.103   -0.229    0.175
## SIRE_standard: Other            -0.387 0.237   -0.852    0.078
## SIRE_standard: Pacific_Islander -0.221 0.291   -0.791    0.350
## African                         -1.553 0.191   -1.928   -1.177
## Amerindian                      -1.906 0.291   -2.476   -1.335
## East_Asian                      -0.239 0.176   -0.585    0.107
## Oceanian                        -7.229 1.202   -9.587   -4.872
## Central_Asian                    0.624 0.246    0.142    1.107
## EFL                              0.109 0.080   -0.048    0.265
## 
## 
## Model meta-data
##   outcome    N   df   R2 R2-adj. R2-cv
## 1       S 1380 1366 0.25    0.24    NA
## 
## 
## Etas from analysis of variance
##                 Eta Eta_partial
## SIRE_standard 0.110       0.126
## African       0.190       0.214
## Amerindian    0.153       0.175
## East_Asian    0.032       0.037
## Oceanian      0.141       0.161
## Central_Asian 0.059       0.069
## EFL           0.032       0.037
## 
## [[2]]
## 
##     ---- Model summary ----    
## Model coefficients
##                                               Beta    SE CI_lower CI_upper
## SIRE_common: White                           0.000    NA       NA       NA
## SIRE_common: African_American                0.017 0.203   -0.382    0.416
## SIRE_common: Asian                          -0.152 0.185   -0.515    0.212
## SIRE_common: Asian, White                    0.199 0.142   -0.080    0.479
## SIRE_common: Hispanic                       -0.207 0.152   -0.504    0.091
## SIRE_common: Hispanic, African_American     -0.398 0.219   -0.828    0.031
## SIRE_common: Hispanic, White                -0.061 0.105   -0.267    0.144
## SIRE_common: Pacific_Islander, Asian, White -0.567 0.179   -0.917   -0.216
## SIRE_common: Other                          -0.396 0.104   -0.600   -0.192
## African                                     -1.227 0.231   -1.680   -0.773
## Amerindian                                  -2.063 0.305   -2.661   -1.464
## East_Asian                                  -0.104 0.185   -0.468    0.260
## Oceanian                                    -5.657 1.081   -7.777   -3.536
## Central_Asian                                0.707 0.249    0.219    1.195
## EFL                                          0.139 0.079   -0.016    0.295
## 
## 
## Model meta-data
##   outcome    N   df   R2 R2-adj. R2-cv
## 1       S 1380 1365 0.26    0.25    NA
## 
## 
## Etas from analysis of variance
##                 Eta Eta_partial
## SIRE_common   0.149       0.171
## African       0.123       0.142
## Amerindian    0.157       0.180
## East_Asian    0.013       0.015
## Oceanian      0.122       0.140
## Central_Asian 0.066       0.077
## EFL           0.041       0.048
## 
## [[3]]
## 
##     ---- Model summary ----    
## Model coefficients
##                     Beta    SE CI_lower CI_upper
## Hispanic_c         -0.28 0.142   -0.556  0.00084
## Pacific_Islander_c -2.06 0.288   -2.628 -1.49849
## Asian_c            -0.56 0.208   -0.967 -0.15100
## African_American_c  0.42 0.229   -0.030  0.86784
## American_Indian_c  -0.63 0.246   -1.113 -0.14642
## Other_c            -0.46 0.236   -0.919  0.00527
## African            -1.76 0.262   -2.270 -1.24087
## Amerindian         -1.88 0.306   -2.477 -1.27512
## East_Asian          0.33 0.217   -0.101  0.75219
## Oceanian           -1.60 1.383   -4.317  1.10811
## Central_Asian       1.07 0.265    0.545  1.58527
## EFL                 0.12 0.078   -0.028  0.27764
## 
## 
## Model meta-data
##   outcome    N   df   R2 R2-adj. R2-cv
## 1       S 1380 1367 0.28    0.27    NA
## 
## 
## Etas from analysis of variance
##                      Eta Eta_partial
## Hispanic_c         0.045       0.053
## Pacific_Islander_c 0.165       0.190
## Asian_c            0.062       0.073
## African_American_c 0.042       0.049
## American_Indian_c  0.059       0.069
## Other_c            0.045       0.052
## African            0.154       0.178
## Amerindian         0.141       0.163
## East_Asian         0.034       0.040
## Oceanian           0.027       0.031
## Central_Asian      0.092       0.108
## EFL                0.037       0.043
## 
## [[4]]
## 
##     ---- Model summary ----    
## Model coefficients
##                    Beta    SE CI_lower CI_upper
## Hispanic         -0.109 0.075   -0.256    0.039
## Pacific_Islander -0.980 0.123   -1.222   -0.739
## Asian             0.032 0.109   -0.182    0.247
## African_American  0.309 0.143    0.029    0.588
## American_Indian  -0.184 0.109   -0.398    0.030
## Other            -0.157 0.233   -0.614    0.300
## White             0.296 0.080    0.140    0.453
## African          -1.347 0.217   -1.774   -0.921
## Amerindian       -1.757 0.280   -2.306   -1.207
## East_Asian        0.097 0.174   -0.243    0.438
## Oceanian         -1.576 1.214   -3.958    0.806
## Central_Asian     0.826 0.234    0.366    1.286
## EFL               0.163 0.078    0.010    0.316
## 
## 
## Model meta-data
##   outcome    N   df   R2 R2-adj. R2-cv
## 1       S 1380 1366 0.29    0.28    NA
## 
## 
## Etas from analysis of variance
##                     Eta Eta_partial
## Hispanic         0.0331       0.039
## Pacific_Islander 0.1820       0.211
## Asian            0.0068       0.008
## African_American 0.0493       0.058
## American_Indian  0.0386       0.046
## Other            0.0154       0.018
## White            0.0846       0.100
## African          0.1415       0.165
## Amerindian       0.1431       0.167
## East_Asian       0.0128       0.015
## Oceanian         0.0296       0.035
## Central_Asian    0.0804       0.095
## EFL              0.0478       0.057
## 
## [[5]]
## 
##     ---- Model summary ----    
## Model coefficients
##                Beta    SE CI_lower CI_upper
## African       -1.32 0.090    -1.49   -1.138
## Amerindian    -2.44 0.210    -2.85   -2.030
## East_Asian    -0.24 0.089    -0.42   -0.068
## Oceanian      -7.87 0.882    -9.60   -6.141
## Central_Asian  0.59 0.199     0.20    0.979
## EFL            0.13 0.079    -0.03    0.282
## 
## 
## Model meta-data
##   outcome    N   df   R2 R2-adj. R2-cv
## 1       S 1380 1373 0.24    0.24    NA
## 
## 
## Etas from analysis of variance
##                 Eta Eta_partial
## African       0.344       0.367
## Amerindian    0.273       0.299
## East_Asian    0.064       0.073
## Oceanian      0.210       0.234
## Central_Asian 0.069       0.079
## EFL           0.037       0.043
## 
## [[6]]
## 
##     ---- Model summary ----    
## Model coefficients
##                                  Beta    SE CI_lower CI_upper
## SIRE_standard: White             0.00    NA       NA       NA
## SIRE_standard: African_American -0.98 0.086   -1.144  -0.8061
## SIRE_standard: American_Indian  -1.45 0.457   -2.342  -0.5497
## SIRE_standard: Asian            -0.17 0.093   -0.356   0.0099
## SIRE_standard: Hispanic         -0.84 0.064   -0.966  -0.7140
## SIRE_standard: Multi_ethnic     -0.46 0.077   -0.607  -0.3053
## SIRE_standard: Other            -0.52 0.239   -0.990  -0.0515
## SIRE_standard: Pacific_Islander -1.62 0.231   -2.075  -1.1670
## EFL                              0.13 0.081   -0.033   0.2870
## 
## 
## Model meta-data
##   outcome    N   df   R2 R2-adj. R2-cv
## 1       S 1380 1371 0.18    0.17    NA
## 
## 
## Etas from analysis of variance
##                 Eta Eta_partial
## SIRE_standard 0.412       0.413
## EFL           0.038       0.042
## 
## [[7]]
## 
##     ---- Model summary ----    
## Model coefficients
##                                              Beta    SE CI_lower CI_upper
## SIRE_common: White                           0.00    NA       NA       NA
## SIRE_common: African_American               -0.98 0.085   -1.142   -0.810
## SIRE_common: Asian                          -0.16 0.092   -0.341    0.018
## SIRE_common: Asian, White                    0.19 0.121   -0.043    0.432
## SIRE_common: Hispanic                       -1.00 0.114   -1.225   -0.776
## SIRE_common: Hispanic, African_American     -1.27 0.170   -1.599   -0.932
## SIRE_common: Hispanic, White                -0.55 0.085   -0.719   -0.384
## SIRE_common: Pacific_Islander, Asian, White -0.97 0.162   -1.288   -0.652
## SIRE_common: Other                          -0.89 0.071   -1.029   -0.749
## EFL                                          0.17 0.080    0.015    0.330
## 
## 
## Model meta-data
##   outcome    N   df   R2 R2-adj. R2-cv
## 1       S 1380 1370 0.21     0.2    NA
## 
## 
## Etas from analysis of variance
##               Eta Eta_partial
## SIRE_common 0.449       0.450
## EFL         0.052       0.058
## 
## [[8]]
## 
##     ---- Model summary ----    
## Model coefficients
##                     Beta    SE CI_lower CI_upper
## Hispanic_c         -1.12 0.094   -1.305  -0.9379
## Pacific_Islander_c -2.09 0.166   -2.414  -1.7641
## Asian_c            -0.16 0.084   -0.322   0.0065
## African_American_c -1.00 0.079   -1.158  -0.8465
## American_Indian_c  -1.14 0.246   -1.623  -0.6584
## Other_c            -0.53 0.231   -0.983  -0.0757
## EFL                 0.14 0.079   -0.011   0.2976
## 
## 
## Model meta-data
##   outcome    N   df   R2 R2-adj. R2-cv
## 1       S 1380 1372 0.23    0.22    NA
## 
## 
## Etas from analysis of variance
##                      Eta Eta_partial
## Hispanic_c         0.284       0.308
## Pacific_Islander_c 0.299       0.323
## Asian_c            0.045       0.051
## African_American_c 0.300       0.323
## American_Indian_c  0.110       0.124
## Other_c            0.054       0.062
## EFL                0.043       0.049
## 
## [[9]]
## 
##     ---- Model summary ----    
## Model coefficients
##                    Beta    SE CI_lower CI_upper
## Hispanic         -0.445 0.059   -0.560   -0.330
## Pacific_Islander -1.029 0.089   -1.204   -0.854
## Asian             0.325 0.069    0.191    0.460
## African_American -0.371 0.078   -0.525   -0.218
## American_Indian  -0.210 0.112   -0.430    0.011
## Other             0.024 0.236   -0.440    0.487
## White             0.498 0.064    0.373    0.624
## EFL               0.174 0.078    0.021    0.327
## 
## 
## Model meta-data
##   outcome    N   df   R2 R2-adj. R2-cv
## 1       S 1380 1371 0.24    0.23    NA
## 
## 
## Etas from analysis of variance
##                     Eta Eta_partial
## Hispanic         0.1787      0.2005
## Pacific_Islander 0.2721      0.2976
## Asian            0.1117      0.1269
## African_American 0.1120      0.1273
## American_Indian  0.0440      0.0504
## Other            0.0024      0.0027
## White            0.1836      0.2059
## EFL              0.0524      0.0600

#overcorrected model with all non-Euro + parental S
all_models_nonEuro_S[["fit"]]

## [[1]]
## 
##     ---- Model summary ----    
## Model coefficients
##                                    Beta    SE CI_lower CI_upper
## S                                0.3089 0.027     0.26    0.363
## SIRE_standard: White             0.0000    NA       NA       NA
## SIRE_standard: African_American  0.1895 0.178    -0.16    0.539
## SIRE_standard: American_Indian   0.6612 0.444    -0.21    1.532
## SIRE_standard: Asian            -0.0751 0.182    -0.43    0.282
## SIRE_standard: Hispanic         -0.1095 0.095    -0.30    0.078
## SIRE_standard: Multi_ethnic      0.0347 0.105    -0.17    0.241
## SIRE_standard: Other             0.1434 0.239    -0.33    0.613
## SIRE_standard: Pacific_Islander  0.0866 0.294    -0.49    0.663
## African                         -1.0521 0.200    -1.44   -0.659
## Amerindian                      -0.5346 0.300    -1.12    0.054
## East_Asian                       0.1555 0.180    -0.20    0.508
## Oceanian                        -2.6176 1.233    -5.04   -0.199
## Central_Asian                    0.0429 0.250    -0.45    0.532
## EFL                             -0.0041 0.081    -0.16    0.155
## 
## 
## Model meta-data
##   outcome    N   df   R2 R2-adj. R2-cv
## 1      CA 1363 1348 0.24    0.23    NA
## 
## 
## Etas from analysis of variance
##                  Eta Eta_partial
## S             0.2686      0.2938
## SIRE_standard 0.0688      0.0785
## African       0.1251      0.1417
## Amerindian    0.0424      0.0485
## East_Asian    0.0206      0.0236
## Oceanian      0.0505      0.0577
## Central_Asian 0.0041      0.0047
## EFL           0.0012      0.0014
## 
## [[2]]
## 
##     ---- Model summary ----    
## Model coefficients
##                                               Beta    SE CI_lower CI_upper
## S                                            0.302 0.028    0.248    0.356
## SIRE_common: White                           0.000    NA       NA       NA
## SIRE_common: African_American                0.033 0.207   -0.373    0.440
## SIRE_common: Asian                           0.019 0.190   -0.354    0.391
## SIRE_common: Asian, White                    0.240 0.147   -0.049    0.528
## SIRE_common: Hispanic                       -0.371 0.155   -0.676   -0.066
## SIRE_common: Hispanic, African_American     -0.301 0.224   -0.740    0.138
## SIRE_common: Hispanic, White                -0.121 0.107   -0.330    0.089
## SIRE_common: Pacific_Islander, Asian, White -0.018 0.182   -0.375    0.339
## SIRE_common: Other                          -0.052 0.107   -0.263    0.158
## African                                     -0.867 0.238   -1.334   -0.399
## Amerindian                                  -0.308 0.316   -0.928    0.311
## East_Asian                                   0.040 0.190   -0.332    0.413
## Oceanian                                    -1.659 1.109   -3.836    0.517
## Central_Asian                               -0.029 0.254   -0.528    0.470
## EFL                                         -0.011 0.081   -0.170    0.149
## 
## 
## Model meta-data
##   outcome    N   df   R2 R2-adj. R2-cv
## 1      CA 1363 1347 0.24    0.23    NA
## 
## 
## Etas from analysis of variance
##                  Eta Eta_partial
## S             0.2611      0.2867
## SIRE_common   0.0866      0.0988
## African       0.0865      0.0986
## Amerindian    0.0232      0.0266
## East_Asian    0.0051      0.0058
## Oceanian      0.0355      0.0407
## Central_Asian 0.0027      0.0031
## EFL           0.0031      0.0035
## 
## [[3]]
## 
##     ---- Model summary ----    
## Model coefficients
##                       Beta    SE CI_lower CI_upper
## S                   0.3040 0.028     0.25     0.36
## Hispanic_c         -0.3917 0.147    -0.68    -0.10
## Pacific_Islander_c -0.1587 0.304    -0.76     0.44
## Asian_c             0.1114 0.217    -0.31     0.54
## African_American_c  0.1039 0.236    -0.36     0.57
## American_Indian_c  -0.0638 0.256    -0.57     0.44
## Other_c             0.1491 0.242    -0.33     0.62
## African            -0.9809 0.275    -1.52    -0.44
## Amerindian         -0.1529 0.320    -0.78     0.48
## East_Asian         -0.0095 0.226    -0.45     0.43
## Oceanian           -1.1523 1.433    -3.96     1.66
## Central_Asian      -0.1145 0.275    -0.65     0.43
## EFL                -0.0013 0.081    -0.16     0.16
## 
## 
## Model meta-data
##   outcome    N   df   R2 R2-adj. R2-cv
## 1      CA 1363 1349 0.24    0.23    NA
## 
## 
## Etas from analysis of variance
##                        Eta Eta_partial
## S                  0.25954     0.28492
## Hispanic_c         0.06323     0.07223
## Pacific_Islander_c 0.01241     0.01421
## Asian_c            0.01220     0.01397
## African_American_c 0.01046     0.01198
## American_Indian_c  0.00592     0.00678
## Other_c            0.01465     0.01677
## African            0.08472     0.09657
## Amerindian         0.01135     0.01300
## East_Asian         0.00100     0.00115
## Oceanian           0.01911     0.02188
## Central_Asian      0.00988     0.01132
## EFL                0.00039     0.00045
## 
## [[4]]
## 
##     ---- Model summary ----    
## Model coefficients
##                     Beta    SE CI_lower CI_upper
## S                 0.2991 0.028    0.244   0.3541
## Hispanic         -0.1500 0.078   -0.303   0.0035
## Pacific_Islander -0.1425 0.131   -0.400   0.1155
## Asian             0.1855 0.115   -0.039   0.4105
## African_American  0.1374 0.147   -0.152   0.4268
## American_Indian  -0.0482 0.114   -0.272   0.1760
## Other             0.2723 0.240   -0.199   0.7438
## White             0.1381 0.084   -0.026   0.3023
## African          -0.8858 0.228   -1.333  -0.4381
## Amerindian       -0.2924 0.295   -0.871   0.2867
## East_Asian        0.0222 0.182   -0.334   0.3785
## Oceanian         -0.9074 1.265   -3.389   1.5742
## Central_Asian    -0.0697 0.244   -0.549   0.4095
## EFL              -0.0026 0.081   -0.162   0.1570
## 
## 
## Model meta-data
##   outcome    N   df   R2 R2-adj. R2-cv
## 1      CA 1363 1348 0.24    0.23    NA
## 
## 
## Etas from analysis of variance
##                      Eta Eta_partial
## S                0.25367     0.27904
## Hispanic         0.04559     0.05216
## Pacific_Islander 0.02576     0.02950
## Asian            0.03847     0.04403
## African_American 0.02215     0.02537
## American_Indian  0.01003     0.01149
## Other            0.02694     0.03084
## White            0.03922     0.04489
## African          0.09230     0.10514
## Amerindian       0.02355     0.02697
## East_Asian       0.00291     0.00333
## Oceanian         0.01706     0.01953
## Central_Asian    0.00678     0.00777
## EFL              0.00076     0.00087
## 
## [[5]]
## 
##     ---- Model summary ----    
## Model coefficients
##                  Beta    SE CI_lower CI_upper
## S              0.3122 0.027     0.26     0.37
## African       -0.8788 0.098    -1.07    -0.69
## Amerindian    -0.8206 0.222    -1.26    -0.39
## East_Asian     0.1067 0.090    -0.07     0.28
## Oceanian      -2.1592 0.912    -3.95    -0.37
## Central_Asian  0.0254 0.200    -0.37     0.42
## EFL            0.0048 0.081    -0.15     0.16
## 
## 
## Model meta-data
##   outcome    N   df   R2 R2-adj. R2-cv
## 1      CA 1363 1355 0.23    0.23    NA
## 
## 
## Etas from analysis of variance
##                  Eta Eta_partial
## S             0.2737      0.2980
## African       0.2140      0.2371
## Amerindian    0.0882      0.1001
## East_Asian    0.0282      0.0321
## Oceanian      0.0564      0.0642
## Central_Asian 0.0030      0.0034
## EFL           0.0014      0.0016
## 
## [[6]]
## 
##     ---- Model summary ----    
## Model coefficients
##                                    Beta    SE CI_lower CI_upper
## S                                0.3458 0.026     0.29    0.398
## SIRE_standard: White             0.0000    NA       NA       NA
## SIRE_standard: African_American -0.6238 0.088    -0.80   -0.451
## SIRE_standard: American_Indian   0.4161 0.446    -0.46    1.291
## SIRE_standard: Asian             0.0519 0.091    -0.13    0.231
## SIRE_standard: Hispanic         -0.3212 0.067    -0.45   -0.191
## SIRE_standard: Multi_ethnic     -0.0762 0.076    -0.23    0.074
## SIRE_standard: Other            -0.0088 0.233    -0.47    0.449
## SIRE_standard: Pacific_Islander -0.1951 0.229    -0.64    0.254
## EFL                             -0.0047 0.080    -0.16    0.153
## 
## 
## Model meta-data
##   outcome    N   df   R2 R2-adj. R2-cv
## 1      CA 1363 1353 0.22    0.21    NA
## 
## 
## Etas from analysis of variance
##                  Eta Eta_partial
## S             0.3156      0.3357
## SIRE_standard 0.1986      0.2188
## EFL           0.0014      0.0016
## 
## [[7]]
## 
##     ---- Model summary ----    
## Model coefficients
##                                                Beta    SE CI_lower CI_upper
## S                                            0.3189 0.027    0.267    0.371
## SIRE_common: White                           0.0000    NA       NA       NA
## SIRE_common: African_American               -0.6500 0.088   -0.822   -0.478
## SIRE_common: Asian                           0.0464 0.091   -0.131    0.224
## SIRE_common: Asian, White                    0.2524 0.121    0.015    0.490
## SIRE_common: Hispanic                       -0.5456 0.117   -0.775   -0.317
## SIRE_common: Hispanic, African_American     -0.8163 0.173   -1.156   -0.477
## SIRE_common: Hispanic, White                -0.2308 0.085   -0.398   -0.063
## SIRE_common: Pacific_Islander, Asian, White -0.0840 0.162   -0.401    0.233
## SIRE_common: Other                          -0.2303 0.074   -0.376   -0.084
## EFL                                         -0.0036 0.080   -0.160    0.153
## 
## 
## Model meta-data
##   outcome    N   df   R2 R2-adj. R2-cv
## 1      CA 1363 1352 0.23    0.22    NA
## 
## 
## Etas from analysis of variance
##                Eta Eta_partial
## S           0.2852      0.3091
## SIRE_common 0.2327      0.2564
## EFL         0.0011      0.0012
## 
## [[8]]
## 
##     ---- Model summary ----    
## Model coefficients
##                       Beta    SE CI_lower CI_upper
## S                   0.3202 0.027    0.267     0.37
## Hispanic_c         -0.5524 0.099   -0.746    -0.36
## Pacific_Islander_c -0.3229 0.175   -0.666     0.02
## Asian_c             0.0917 0.084   -0.073     0.26
## African_American_c -0.6871 0.084   -0.852    -0.52
## American_Indian_c  -0.2424 0.249   -0.732     0.25
## Other_c            -0.0328 0.231   -0.485     0.42
## EFL                 0.0047 0.079   -0.151     0.16
## 
## 
## Model meta-data
##   outcome    N   df   R2 R2-adj. R2-cv
## 1      CA 1363 1354 0.23    0.23    NA
## 
## 
## Etas from analysis of variance
##                       Eta Eta_partial
## S                  0.2827      0.3066
## Hispanic_c         0.1334      0.1502
## Pacific_Islander_c 0.0440      0.0501
## Asian_c            0.0260      0.0296
## African_American_c 0.1951      0.2170
## American_Indian_c  0.0232      0.0264
## Other_c            0.0034      0.0039
## EFL                0.0014      0.0016
## 
## [[9]]
## 
##     ---- Model summary ----    
## Model coefficients
##                     Beta    SE CI_lower CI_upper
## S                 0.3207 0.027     0.27    0.374
## Hispanic         -0.1969 0.060    -0.32   -0.078
## Pacific_Islander -0.1472 0.095    -0.33    0.038
## Asian             0.2731 0.070     0.14    0.410
## African_American -0.3377 0.079    -0.49   -0.182
## American_Indian  -0.0395 0.115    -0.26    0.185
## Other             0.2415 0.237    -0.22    0.706
## White             0.2460 0.066     0.12    0.375
## EFL              -0.0029 0.079    -0.16    0.153
## 
## 
## Model meta-data
##   outcome    N   df   R2 R2-adj. R2-cv
## 1      CA 1363 1353 0.23    0.22    NA
## 
## 
## Etas from analysis of variance
##                      Eta Eta_partial
## S                0.28140     0.30509
## Hispanic         0.07780     0.08823
## Pacific_Islander 0.03720     0.04231
## Asian            0.09309     0.10539
## African_American 0.10149     0.11477
## American_Indian  0.00824     0.00938
## Other            0.02434     0.02770
## White            0.08905     0.10086
## EFL              0.00087     0.00098

# summarize model results -------------------------------------------------

#beta stats by predictor and outcome
#across models, mean and range
betas %>% 
  #only main models
  filter(model_id %in% 1:4) %>% 
  #loop over predictors
  plyr::ddply(c("Predictor", "outcome"), function(x) {
    #summarize the data with mean and range
    this_sum = psych::describe(x$Beta) %>% 
      #type
      as.data.frame()
    
    #return a nice string
    data.frame(
      beta = sprintf("%.2f (%.2f to %.2f)", this_sum$mean, this_sum$min, this_sum$max)
    )
  }) %>% 
  #filter to ancestries
  filter(Predictor %in% v_genomic) %>% 
  #spread beta
  spread(key = outcome, value = beta) %>% 
  #output
  write_clipboard()

##       Predictor                     CA                      S
## 1       African -1.46 (-1.59 to -1.31) -1.47 (-1.76 to -1.23)
## 2    Amerindian -0.92 (-1.15 to -0.75) -1.90 (-2.06 to -1.76)
## 3 Central_Asian    0.17 (0.15 to 0.21)    0.81 (0.62 to 1.07)
## 4    East_Asian   0.03 (-0.01 to 0.06)   0.02 (-0.24 to 0.33)
## 5      Oceanian -2.76 (-4.81 to -1.32) -4.02 (-7.23 to -1.58)

#ancestry model fit by SIRE coding and outcome
rbind(
  all_models_nonEuro[[1]] %>% mutate(outcome = "CA"),
  all_models_nonEuro[[2]] %>% mutate(outcome = "S")
) %>% 
  #only include models with genomics and SIRE
  filter(genomics, SIRE != "none") %>% 
  #retrieve r2adj
  mutate(
    r2_adj = map_dbl(fit, function(x) {
      # browser()
      x$meta$`R2-adj.`
    })
  ) %>% 
  #out
  write_clipboard()

## List columns were removed because they cannot be easily transformed to rectangular format. Colnames: fit

##       SIRE Genomics Outcome R2 adj
## 1 standard     TRUE      CA   0.16
## 2   common     TRUE      CA   0.17
## 3  numeric     TRUE      CA   0.17
## 4    dummy     TRUE      CA   0.17
## 5 standard     TRUE       S   0.24
## 6   common     TRUE       S   0.25
## 7  numeric     TRUE       S   0.27
## 8    dummy     TRUE       S   0.28

#african amer SIREs
v_african_american_SIREs = c("SIRE_standard: African_American",
                             "SIRE_common: African_American",
                             "African_American",
                             "African_American_c",
                             "SIRE_common: Hispanic, African_American")
betas %>% 
  #only main models
  filter(model_id %in% 1:4, Predictor %in% v_african_american_SIREs) %T>% 
  print() %$% 
  #descrip
  psych::describe(Beta)

##                                  Predictor   Beta   SE CI_lower CI_upper model_id outcome
## 1          SIRE_standard: African_American  0.305 0.18   -0.055    0.666        1      CA
## 2            SIRE_common: African_American  0.086 0.21   -0.330    0.502        2      CA
## 3  SIRE_common: Hispanic, African_American -0.382 0.23   -0.836    0.071        2      CA
## 4                       African_American_c  0.287 0.24   -0.181    0.754        3      CA
## 5                         African_American  0.263 0.15   -0.033    0.559        4      CA
## 6          SIRE_standard: African_American  0.283 0.17   -0.060    0.625        1       S
## 7            SIRE_common: African_American  0.017 0.20   -0.382    0.416        2       S
## 8  SIRE_common: Hispanic, African_American -0.398 0.22   -0.828    0.031        2       S
## 9                       African_American_c  0.419 0.23   -0.030    0.868        3       S
## 10                        African_American  0.309 0.14    0.029    0.588        4       S

##    vars  n mean   sd median trimmed  mad  min  max range  skew kurtosis   se
## X1    1 10 0.12 0.29   0.27    0.15 0.13 -0.4 0.42  0.82 -0.86    -0.97 0.09

#hispanics
betas %>% 
  #only main models
  filter(model_id %in% 1:4, str_detect(Predictor, "Hispanic")) %T>% 
  print() %$% 
  #descrip
  psych::describe(Beta)

##                                  Predictor   Beta    SE CI_lower CI_upper model_id outcome
## 1                  SIRE_standard: Hispanic -0.165 0.099    -0.36  0.03029        1      CA
## 2                    SIRE_common: Hispanic -0.421 0.162    -0.74 -0.10358        2      CA
## 3  SIRE_common: Hispanic, African_American -0.382 0.231    -0.84  0.07100        2      CA
## 4             SIRE_common: Hispanic, White -0.135 0.111    -0.35  0.08305        2      CA
## 5                               Hispanic_c -0.460 0.153    -0.76 -0.16012        3      CA
## 6                                 Hispanic -0.182 0.081    -0.34 -0.02241        4      CA
## 7                  SIRE_standard: Hispanic -0.209 0.094    -0.39 -0.02576        1       S
## 8                    SIRE_common: Hispanic -0.207 0.152    -0.50  0.09090        2       S
## 9  SIRE_common: Hispanic, African_American -0.398 0.219    -0.83  0.03088        2       S
## 10            SIRE_common: Hispanic, White -0.061 0.105    -0.27  0.14402        2       S
## 11                              Hispanic_c -0.278 0.142    -0.56  0.00084        3       S
## 12                                Hispanic -0.109 0.075    -0.26  0.03851        4       S

##    vars  n  mean   sd median trimmed  mad   min   max range  skew kurtosis   se
## X1    1 12 -0.25 0.13  -0.21   -0.25 0.13 -0.46 -0.06   0.4 -0.25     -1.6 0.04

# plot model results ------------------------------------------------------
#eta comparison
etas_summary %>% 
  #only models that have both genomics and SIRE
  filter(model_id %in% 1:4) %>% 
  ggplot(aes(model_id, total_eta, fill = class)) +
  scale_x_discrete("SIRE coding", labels = c("standard", "common comb.", "continuous", "dummy")) +
  geom_bar(stat = "identity", position = "dodge") +
  xlab("Model id") + ylab("Total eta") +
  theme_bw() +
  #facet_grid(. ~ outcome)
  facet_grid(outcome ~ .)

ggsave2("figures/model_etas.png")

#summary statistics, by outcome x class
etas_summary %>% 
  #only models that have both genomics and SIRE
  filter(model_id %in% 1:4) %>% 
  plyr::ddply(c("outcome", "class"), function(x) {
    psych::describe(x$total_eta)
  })

##   outcome            class vars n  mean     sd median trimmed    mad    min   max  range   skew kurtosis     se
## 1      CA Genomic ancestry    1 4 0.182 0.0363  0.167   0.182 0.0087 0.1579 0.236 0.0781  0.711     -1.7 0.0182
## 2      CA       Linguistic    1 4 0.011 0.0021  0.010   0.011 0.0014 0.0086 0.013 0.0048  0.439     -1.9 0.0010
## 3      CA             SIRE    1 4 0.111 0.0195  0.112   0.111 0.0219 0.0881 0.133 0.0452 -0.038     -2.1 0.0098
## 4       S Genomic ancestry    1 4 0.246 0.0308  0.238   0.246 0.0181 0.2191 0.290 0.0706  0.522     -1.8 0.0154
## 5       S       Linguistic    1 4 0.039 0.0067  0.039   0.039 0.0066 0.0320 0.048 0.0159  0.157     -2.0 0.0034
## 6       S             SIRE    1 4 0.168 0.0478  0.175   0.168 0.0475 0.1098 0.213 0.1037 -0.186     -2.2 0.0239

#summary statistics, by class
etas_summary %>% 
  #only models that have both genomics and SIRE
  filter(model_id %in% 1:4) %>% 
  plyr::ddply(c("class"), function(x) {
    psych::describe(x$total_eta)
  })

##              class vars n  mean    sd median trimmed   mad    min   max range skew kurtosis     se
## 1 Genomic ancestry    1 8 0.214 0.046  0.226   0.214 0.055 0.1579 0.290 0.132 0.11     -1.5 0.0164
## 2       Linguistic    1 8 0.025 0.016  0.023   0.025 0.020 0.0086 0.048 0.039 0.17     -2.0 0.0057
## 3             SIRE    1 8 0.140 0.046  0.126   0.140 0.034 0.0881 0.213 0.125 0.54     -1.5 0.0161

#with parental
etas_summary_parental %>% 
  #only models that have both genomics and SIRE
  filter(model_id %in% 1:4) %>% 
  ggplot(aes(model_id, total_eta, fill = class)) +
  scale_x_discrete("SIRE coding", labels = c("standard", "common comb.", "continuous", "dummy")) +
  geom_bar(stat = "identity", position = "dodge") +
  xlab("Model id") + ylab("Total eta") +
  theme_bw()

ggsave2("figures/model_etas_parental.png")

#summary statistics, by class
etas_summary_parental %>% 
  #only models that have both genomics and SIRE
  filter(model_id %in% 1:4) %>% 
  plyr::ddply(c("class"), function(x) {
    psych::describe(x$total_eta)
  })

##              class vars n   mean     sd  median trimmed    mad     min    max  range  skew kurtosis     se
## 1 Genomic ancestry    1 4 0.1062 0.0249 0.09677  0.1062 0.0066 0.08815 0.1430 0.0548 0.688     -1.7 0.0124
## 2       Linguistic    1 4 0.0014 0.0012 0.00098  0.0014 0.0006 0.00039 0.0031 0.0027 0.585     -1.8 0.0006
## 3         Parental    1 4 0.2607 0.0061 0.26031  0.2607 0.0055 0.25367 0.2686 0.0149 0.146     -1.9 0.0031
## 4             SIRE    1 4 0.0770 0.0098 0.07648  0.0770 0.0118 0.06827 0.0866 0.0183 0.019     -2.4 0.0049

# non-Whites --------------------------------------------------------------

#ancestry vairation
d_nW[v_genomic] %>% psych::describe()

##               vars   n mean   sd median trimmed  mad min  max range skew kurtosis   se
## European         1 820 0.39 0.30   0.39    0.37 0.37   0 1.00  1.00 0.28    -1.04 0.01
## African          2 820 0.23 0.32   0.04    0.17 0.06   0 1.00  1.00 1.14    -0.33 0.01
## Amerindian       3 820 0.08 0.15   0.00    0.04 0.00   0 0.83  0.83 2.10     4.19 0.01
## East_Asian       4 820 0.25 0.36   0.00    0.19 0.00   0 1.00  1.00 1.09    -0.38 0.01
## Oceanian         5 820 0.01 0.04   0.00    0.00 0.00   0 0.25  0.25 3.88    16.25 0.00
## Central_Asian    6 820 0.04 0.16   0.00    0.00 0.00   0 1.00  1.00 5.21    26.79 0.01

#zero order
wtd.cors(d_nW[c("CA", "S", v_genomic)]) %>% write_clipboard()

##                  CA     S European African Amerindian East Asian Oceanian Central Asian
## CA             1.00  0.41     0.08   -0.32      -0.10       0.21    -0.04          0.11
## S              0.41  1.00     0.11   -0.21      -0.17       0.11    -0.17          0.18
## European       0.08  0.11     1.00   -0.32       0.25      -0.54    -0.24         -0.18
## African       -0.32 -0.21    -0.32    1.00      -0.24      -0.45    -0.20         -0.13
## Amerindian    -0.10 -0.17     0.25   -0.24       1.00      -0.33    -0.17         -0.10
## East Asian     0.21  0.11    -0.54   -0.45      -0.33       1.00     0.37         -0.12
## Oceanian      -0.04 -0.17    -0.24   -0.20      -0.17       0.37     1.00         -0.05
## Central Asian  0.11  0.18    -0.18   -0.13      -0.10      -0.12    -0.05          1.00

count.pairwise(d_nW[c("CA", "S", v_genomic)])

##                CA   S European African Amerindian East_Asian Oceanian Central_Asian
## CA            802 797      802     802        802        802      802           802
## S             797 810      810     810        810        810      810           810
## European      802 810      820     820        820        820      820           820
## African       802 810      820     820        820        820      820           820
## Amerindian    802 810      820     820        820        820      820           820
## East_Asian    802 810      820     820        820        820      820           820
## Oceanian      802 810      820     820        820        820      820           820
## Central_Asian 802 810      820     820        820        820      820           820

#models
model_ca_genomic_dummy = "CA ~ " + str_c(c(v_SIREs, v_genomic[-1], "EFL"), collapse = " + ")
model_s_genomic_dummy = "S ~ " + str_c(c(v_SIREs, v_genomic[-1], "EFL"), collapse = " + ")

#fit full models
lm(model_ca_genomic_dummy, data = d_nW) %>% MOD_summary(progress = F, standardize = F, kfold = F) %>% write_clipboard()

##                  V1    V2          V3       V4       V5    V6
## 1             coefs                                          
## 2         Predictor  Beta          SE CI lower CI upper      
## 3          Hispanic -0.20        0.10    -0.40     0.00      
## 4  Pacific Islander -0.45        0.14    -0.72    -0.18      
## 5             Asian  0.20        0.13    -0.06     0.46      
## 6  African American  0.30        0.16    -0.02     0.61      
## 7   American Indian -0.12        0.13    -0.36     0.13      
## 8             Other  0.16        0.25    -0.33     0.64      
## 9             White  0.20        0.09     0.02     0.38      
## 10          African -1.47        0.26    -1.98    -0.96      
## 11       Amerindian -0.81        0.34    -1.48    -0.15      
## 12       East Asian  0.00        0.21    -0.41     0.41      
## 13         Oceanian -1.34        1.37    -4.02     1.34      
## 14    Central Asian  0.10        0.27    -0.44     0.64      
## 15              EFL  0.09        0.10    -0.10     0.29      
## 16                                                           
## 17             meta                                          
## 18          outcome     N          df       R2   R2-adj R2-cv
## 19               CA   802         788     0.19     0.18      
## 20                                                           
## 21             etas                                          
## 22        Predictor   Eta Eta partial                        
## 23         Hispanic  0.06        0.07                        
## 24 Pacific Islander  0.11        0.12                        
## 25            Asian  0.05        0.05                        
## 26 African American  0.06        0.07                        
## 27  American Indian  0.03        0.03                        
## 28            Other  0.02        0.02                        
## 29            White  0.07        0.08                        
## 30          African  0.18        0.20                        
## 31       Amerindian  0.08        0.08                        
## 32       East Asian  0.00        0.00                        
## 33         Oceanian  0.03        0.03                        
## 34    Central Asian  0.01        0.01                        
## 35              EFL  0.03        0.03

lm(model_s_genomic_dummy, data = d_nW) %>% MOD_summary(progress = F, standardize = F, kfold = F) %>% write_clipboard()

##                  V1    V2          V3       V4       V5    V6
## 1             coefs                                          
## 2         Predictor  Beta          SE CI lower CI upper      
## 3          Hispanic -0.16        0.10    -0.35     0.03      
## 4  Pacific Islander -1.00        0.13    -1.25    -0.74      
## 5             Asian  0.04        0.13    -0.21     0.29      
## 6  African American  0.28        0.16    -0.03     0.59      
## 7   American Indian -0.21        0.12    -0.45     0.02      
## 8             Other -0.17        0.26    -0.69     0.34      
## 9             White  0.28        0.09     0.11     0.45      
## 10          African -1.40        0.26    -1.91    -0.90      
## 11       Amerindian -1.68        0.33    -2.32    -1.04      
## 12       East Asian  0.02        0.20    -0.38     0.42      
## 13         Oceanian -1.53        1.31    -4.10     1.04      
## 14    Central Asian  0.70        0.27     0.17     1.23      
## 15              EFL  0.28        0.09     0.10     0.46      
## 16                                                           
## 17             meta                                          
## 18          outcome     N          df       R2   R2-adj R2-cv
## 19                S   810         796     0.26     0.25      
## 20                                                           
## 21             etas                                          
## 22        Predictor   Eta Eta partial                        
## 23         Hispanic  0.05        0.06                        
## 24 Pacific Islander  0.23        0.26                        
## 25            Asian  0.01        0.01                        
## 26 African American  0.05        0.06                        
## 27  American Indian  0.05        0.06                        
## 28            Other  0.02        0.02                        
## 29            White  0.10        0.11                        
## 30          African  0.17        0.19                        
## 31       Amerindian  0.16        0.18                        
## 32       East Asian  0.00        0.00                        
## 33         Oceanian  0.04        0.04                        
## 34    Central Asian  0.08        0.09                        
## 35              EFL  0.09        0.11

# african americans narrow -------------------------------------------------------

#ancestry vairation
d_AA_narrow[v_genomic] %>% psych::describe()

##               vars   n mean   sd median trimmed  mad  min  max range skew kurtosis   se
## European         1 140 0.17 0.11   0.15    0.16 0.08 0.00 0.61  0.61  1.1      2.0 0.01
## African          2 140 0.81 0.12   0.84    0.82 0.09 0.39 1.00  0.61 -1.1      1.3 0.01
## Amerindian       3 140 0.00 0.01   0.00    0.00 0.00 0.00 0.05  0.05  3.2      9.5 0.00
## East_Asian       4 140 0.00 0.03   0.00    0.00 0.00 0.00 0.20  0.20  6.7     46.7 0.00
## Oceanian         5 140 0.00 0.01   0.00    0.00 0.00 0.00 0.04  0.04  5.9     36.0 0.00
## Central_Asian    6 140 0.01 0.02   0.00    0.00 0.00 0.00 0.16  0.16  4.5     19.8 0.00

#zero order
wtd.cors(d_AA_narrow[c("CA", "S", v_genomic)]) %>% write_clipboard()

##                  CA     S European African Amerindian East Asian Oceanian Central Asian
## CA             1.00  0.31     0.17   -0.22      -0.08       0.16     0.00          0.13
## S              0.31  1.00     0.27   -0.31       0.05       0.10     0.01          0.17
## European       0.17  0.27     1.00   -0.93       0.02       0.04    -0.21         -0.05
## African       -0.22 -0.31    -0.93    1.00      -0.07      -0.35     0.16         -0.26
## Amerindian    -0.08  0.05     0.02   -0.07       1.00      -0.04    -0.06         -0.07
## East Asian     0.16  0.10     0.04   -0.35      -0.04       1.00    -0.03          0.52
## Oceanian       0.00  0.01    -0.21    0.16      -0.06      -0.03     1.00          0.02
## Central Asian  0.13  0.17    -0.05   -0.26      -0.07       0.52     0.02          1.00

count.pairwise(d_AA_narrow[c("CA", "S", v_genomic)])

##                CA   S European African Amerindian East_Asian Oceanian Central_Asian
## CA            138 137      138     138        138        138      138           138
## S             137 139      139     139        139        139      139           139
## European      138 139      140     140        140        140      140           140
## African       138 139      140     140        140        140      140           140
## Amerindian    138 139      140     140        140        140      140           140
## East_Asian    138 139      140     140        140        140      140           140
## Oceanian      138 139      140     140        140        140      140           140
## Central_Asian 138 139      140     140        140        140      140           140

#model stats
#CA
lm("CA ~ African", data = d_AA_narrow) %>% MOD_summary(standardize = F, progress = F, kfold = F) %>% write_clipboard()

##           V1    V2          V3       V4       V5    V6
## 1      coefs                                          
## 2  Predictor  Beta          SE CI lower CI upper      
## 3    African -1.60        0.61    -2.81    -0.39      
## 4                                                     
## 5       meta                                          
## 6    outcome     N          df       R2   R2-adj R2-cv
## 7         CA   138         136    0.048    0.041      
## 8                                                     
## 9       etas                                          
## 10 Predictor   Eta Eta partial                        
## 11   African  0.22        0.22

lm("CA ~ European", data = d_AA_narrow) %>% MOD_summary(standardize = F, progress = F, kfold = F) %>% write_clipboard()

##           V1   V2          V3       V4       V5    V6
## 1      coefs                                         
## 2  Predictor Beta          SE CI lower CI upper      
## 3   European 1.36        0.66     0.06     2.66      
## 4                                                    
## 5       meta                                         
## 6    outcome    N          df       R2   R2-adj R2-cv
## 7         CA  138         136    0.031    0.023      
## 8                                                    
## 9       etas                                         
## 10 Predictor  Eta Eta partial                        
## 11  European 0.17        0.17

#model predictions
(AA_preds_euro = predict(lm("CA ~ European", data = d_AA_narrow), newdata = data.frame(European = c(0, 1))))

##     1     2 
## -0.92  0.45

(AA_preds_african = predict(lm("CA ~ African", data = d_AA_narrow), newdata = data.frame(African = c(0, 1))))

##     1     2 
##  0.62 -0.98

#gap sizes
AA_preds_euro[1] - AA_preds_euro[2]

##    1 
## -1.4

AA_preds_african[2] - AA_preds_african[1]

##    2 
## -1.6

#S
lm("S ~ African", data = d_AA_narrow) %>% MOD_summary(standardize = F, progress = F, kfold = F) %>% write_clipboard()

##           V1    V2          V3       V4       V5    V6
## 1      coefs                                          
## 2  Predictor  Beta          SE CI lower CI upper      
## 3    African -2.50        0.65    -3.79    -1.22      
## 4                                                     
## 5       meta                                          
## 6    outcome     N          df       R2   R2-adj R2-cv
## 7          S   139         137    0.098    0.091      
## 8                                                     
## 9       etas                                          
## 10 Predictor   Eta Eta partial                        
## 11   African  0.31        0.31

lm("S ~ European", data = d_AA_narrow) %>% MOD_summary(standardize = F, progress = F, kfold = F) %>% write_clipboard()

##           V1   V2          V3       V4       V5    V6
## 1      coefs                                         
## 2  Predictor Beta          SE CI lower CI upper      
## 3   European 2.28        0.70     0.89     3.66      
## 4                                                    
## 5       meta                                         
## 6    outcome    N          df       R2   R2-adj R2-cv
## 7          S  139         137    0.072    0.065      
## 8                                                    
## 9       etas                                         
## 10 Predictor  Eta Eta partial                        
## 11  European 0.27        0.27

#model predictions
(AA_preds_euro = predict(lm("S ~ European", data = d_AA_narrow), newdata = data.frame(European = c(0, 1))))

##     1     2 
## -0.96  1.32

(AA_preds_african = predict(lm("S ~ African", data = d_AA_narrow), newdata = data.frame(African = c(0, 1))))

##    1    2 
##  1.5 -1.0

#gap sizes
AA_preds_euro[1] - AA_preds_euro[2]

##    1 
## -2.3

AA_preds_african[2] - AA_preds_african[1]

##    2 
## -2.5

#calculate group means
d_W_AA_means = d_W_AA %>% plyr::ddply("SIRE_standard", function(x) {
  data.frame(
    African = wtd_mean(x[["African"]]),
    Amerindian = wtd_mean(x[["Amerindian"]]),
    European = wtd_mean(x[["European"]]),
    African_European_r = wtd.cors(x[["African"]], x[["European"]]) %>% as.vector,
    CA = wtd_mean(x[["CA"]]),
    n = nrow(x)
  )
})

#African ancestry
ggplot(d_W_AA, aes(African, CA, color = SIRE_standard)) +
  geom_point() +
  geom_smooth(data = d_AA_narrow, method = lm, fullrange = T) +
  scale_x_continuous("African ancestry%", limits = c(0, 1), labels = scales::percent) +
  scale_color_discrete(name = "SIRE (standard)") + 
  scale_y_continuous("Cognitive ability", breaks = -3:3) +
  geom_point(data = d_W_AA_means, size = 5) +
  theme_bw()

## Warning: Removed 2 rows containing non-finite values (stat_smooth).

## Warning: Removed 6 rows containing missing values (geom_point).

ggsave2("figures/AA_African.png")

## Warning: Removed 2 rows containing non-finite values (stat_smooth).

## Warning: Removed 6 rows containing missing values (geom_point).

#European
ggplot(d_W_AA, aes(European, CA, color = SIRE_standard)) +
  geom_point() +
  geom_smooth(data = d_AA_narrow, method = lm, fullrange = T) +
  scale_x_continuous("European ancestry%", limits = c(0, 1), labels = scales::percent) +
  scale_y_continuous("Cognitive ability", breaks = -3:3) +
  scale_color_discrete(name = "SIRE (standard)") + 
  geom_point(data = d_W_AA_means, size = 5) +
  theme_bw()

## Warning: Removed 2 rows containing non-finite values (stat_smooth).

## Warning: Removed 6 rows containing missing values (geom_point).

ggsave2("figures/AA_European.png")

## Warning: Removed 2 rows containing non-finite values (stat_smooth).

## Warning: Removed 6 rows containing missing values (geom_point).

# african americans broad -------------------------------------------------------

#ancestry vairation
d_AA_broad[v_genomic] %>% psych::describe()

##               vars   n mean   sd median trimmed  mad min  max range  skew kurtosis   se
## European         1 228 0.26 0.19   0.20    0.24 0.15   0 1.00  1.00  1.14     1.11 0.01
## African          2 228 0.68 0.23   0.77    0.71 0.19   0 1.00  1.00 -0.89    -0.03 0.02
## Amerindian       3 228 0.01 0.04   0.00    0.00 0.00   0 0.32  0.32  4.22    21.32 0.00
## East_Asian       4 228 0.03 0.12   0.00    0.00 0.00   0 1.00  1.00  4.82    27.80 0.01
## Oceanian         5 228 0.00 0.01   0.00    0.00 0.00   0 0.13  0.13  5.87    41.30 0.00
## Central_Asian    6 228 0.01 0.04   0.00    0.00 0.00   0 0.44  0.44  6.94    58.83 0.00

#zero order
wtd.cors(d_AA_broad[c("CA", "S", v_genomic)]) %>% write_clipboard()

##                  CA     S European African Amerindian East Asian Oceanian Central Asian
## CA             1.00  0.33     0.20   -0.29      -0.08       0.23     0.05          0.14
## S              0.33  1.00     0.16   -0.19      -0.09       0.10    -0.17          0.16
## European       0.20  0.16     1.00   -0.81       0.21      -0.06     0.01         -0.13
## African       -0.29 -0.19    -0.81    1.00      -0.30      -0.47    -0.30         -0.09
## Amerindian    -0.08 -0.09     0.21   -0.30       1.00      -0.08    -0.07         -0.02
## East Asian     0.23  0.10    -0.06   -0.47      -0.08       1.00     0.50          0.07
## Oceanian       0.05 -0.17     0.01   -0.30      -0.07       0.50     1.00         -0.04
## Central Asian  0.14  0.16    -0.13   -0.09      -0.02       0.07    -0.04          1.00

count.pairwise(d_AA_broad[c("CA", "S", v_genomic)])

##                CA   S European African Amerindian East_Asian Oceanian Central_Asian
## CA            225 224      225     225        225        225      225           225
## S             224 227      227     227        227        227      227           227
## European      225 227      228     228        228        228      228           228
## African       225 227      228     228        228        228      228           228
## Amerindian    225 227      228     228        228        228      228           228
## East_Asian    225 227      228     228        228        228      228           228
## Oceanian      225 227      228     228        228        228      228           228
## Central_Asian 225 227      228     228        228        228      228           228

#model stats
#CA
lm("CA ~ African", data = d_AA_broad) %>% MOD_summary(standardize = F, progress = F, kfold = F) %>% write_clipboard()

##           V1    V2          V3       V4       V5    V6
## 1      coefs                                          
## 2  Predictor  Beta          SE CI lower CI upper      
## 3    African -1.24        0.27    -1.77    -0.71      
## 4                                                     
## 5       meta                                          
## 6    outcome     N          df       R2   R2-adj R2-cv
## 7         CA   225         223    0.087    0.083      
## 8                                                     
## 9       etas                                          
## 10 Predictor   Eta Eta partial                        
## 11   African  0.29        0.29

lm("CA ~ European", data = d_AA_broad) %>% MOD_summary(standardize = F, progress = F, kfold = F) %>% write_clipboard()

##           V1   V2          V3       V4       V5    V6
## 1      coefs                                         
## 2  Predictor Beta          SE CI lower CI upper      
## 3   European 0.99        0.33     0.34     1.64      
## 4                                                    
## 5       meta                                         
## 6    outcome    N          df       R2   R2-adj R2-cv
## 7         CA  225         223    0.038    0.034      
## 8                                                    
## 9       etas                                         
## 10 Predictor  Eta Eta partial                        
## 11  European 0.20        0.20

#model predictions
(AA_preds_euro = predict(lm("CA ~ European", data = d_AA_broad), newdata = data.frame(European = c(0, 1))))

##     1     2 
## -0.86  0.13

(AA_preds_african = predict(lm("CA ~ African", data = d_AA_broad), newdata = data.frame(African = c(0, 1))))

##     1     2 
##  0.24 -0.99

#gap sizes
AA_preds_euro[1] - AA_preds_euro[2]

##     1 
## -0.99

AA_preds_african[2] - AA_preds_african[1]

##    2 
## -1.2

#S
lm("S ~ African", data = d_AA_broad) %>% MOD_summary(standardize = F, progress = F, kfold = F) %>% write_clipboard()

##           V1    V2          V3       V4       V5    V6
## 1      coefs                                          
## 2  Predictor  Beta          SE CI lower CI upper      
## 3    African -0.77        0.27    -1.30    -0.24      
## 4                                                     
## 5       meta                                          
## 6    outcome     N          df       R2   R2-adj R2-cv
## 7          S   227         225    0.035    0.031      
## 8                                                     
## 9       etas                                          
## 10 Predictor   Eta Eta partial                        
## 11   African  0.19        0.19

lm("S ~ European", data = d_AA_broad) %>% MOD_summary(standardize = F, progress = F, kfold = F) %>% write_clipboard()

##           V1   V2          V3       V4       V5    V6
## 1      coefs                                         
## 2  Predictor Beta          SE CI lower CI upper      
## 3   European 0.81        0.33     0.17     1.45      
## 4                                                    
## 5       meta                                         
## 6    outcome    N          df       R2   R2-adj R2-cv
## 7          S  227         225    0.027    0.022      
## 8                                                    
## 9       etas                                         
## 10 Predictor  Eta Eta partial                        
## 11  European 0.16        0.16

#model predictions
(AA_preds_euro = predict(lm("S ~ European", data = d_AA_broad), newdata = data.frame(European = c(0, 1))))

##      1      2 
## -0.735  0.074

(AA_preds_african = predict(lm("S ~ African", data = d_AA_broad), newdata = data.frame(African = c(0, 1))))

##       1       2 
##  0.0018 -0.7697

#gap sizes
AA_preds_euro[1] - AA_preds_euro[2]

##     1 
## -0.81

AA_preds_african[2] - AA_preds_african[1]

##     2 
## -0.77

# Hispanics ---------------------------------------------------------------

#ancestry variation
d_HA[v_genomic] %>% psych::describe()

##               vars   n mean   sd median trimmed  mad min  max range  skew kurtosis   se
## European         1 328 0.57 0.23   0.56    0.58 0.25   0 1.00  1.00 -0.19    -0.53 0.01
## African          2 328 0.14 0.20   0.05    0.09 0.08   0 0.91  0.91  1.95     3.27 0.01
## Amerindian       3 328 0.19 0.17   0.15    0.17 0.18   0 0.83  0.83  0.91     0.44 0.01
## East_Asian       4 328 0.09 0.20   0.00    0.03 0.00   0 1.00  1.00  2.71     6.85 0.01
## Oceanian         5 328 0.01 0.02   0.00    0.00 0.00   0 0.16  0.16  4.17    20.32 0.00
## Central_Asian    6 328 0.01 0.03   0.00    0.00 0.00   0 0.46  0.46  9.51   117.77 0.00

#zero order
wtd.cors(d_HA[c("CA", "S", v_genomic)]) %>% write_clipboard()

##                  CA     S European African Amerindian East Asian Oceanian Central Asian
## CA             1.00  0.33     0.23   -0.23      -0.07       0.01    -0.09          0.08
## S              0.33  1.00     0.42   -0.24      -0.18      -0.09    -0.22          0.07
## European       0.23  0.42     1.00   -0.41      -0.21      -0.54    -0.34         -0.04
## African       -0.23 -0.24    -0.41    1.00      -0.31      -0.21    -0.12         -0.06
## Amerindian    -0.07 -0.18    -0.21   -0.31       1.00      -0.28    -0.29          0.01
## East Asian     0.01 -0.09    -0.54   -0.21      -0.28       1.00     0.67         -0.06
## Oceanian      -0.09 -0.22    -0.34   -0.12      -0.29       0.67     1.00         -0.05
## Central Asian  0.08  0.07    -0.04   -0.06       0.01      -0.06    -0.05          1.00

count.pairwise(d_HA[c("CA", "S", v_genomic)])

##                CA   S European African Amerindian East_Asian Oceanian Central_Asian
## CA            323 323      323     323        323        323      323           323
## S             323 328      328     328        328        328      328           328
## European      323 328      328     328        328        328      328           328
## African       323 328      328     328        328        328      328           328
## Amerindian    323 328      328     328        328        328      328           328
## East_Asian    323 328      328     328        328        328      328           328
## Oceanian      323 328      328     328        328        328      328           328
## Central_Asian 323 328      328     328        328        328      328           328

#3 ancestries non euro
HA_3way_ca = lm(CA ~ African + Amerindian + East_Asian, data = d_HA)
HA_3way_s = lm(S ~ African + Amerindian + East_Asian, data = d_HA)
HA_3way_ca %>% MOD_summary(standardize = F, kfold = F) %>% write_clipboard()

##            V1    V2          V3       V4       V5    V6
## 1       coefs                                          
## 2   Predictor  Beta          SE CI lower CI upper      
## 3     African -1.62        0.30    -2.22    -1.03      
## 4  Amerindian -1.16        0.34    -1.84    -0.49      
## 5  East Asian -0.53        0.28    -1.09     0.02      
## 6                                                      
## 7        meta                                          
## 8     outcome     N          df       R2   R2-adj R2-cv
## 9          CA   323         319    0.089     0.08      
## 10                                                     
## 11       etas                                          
## 12  Predictor   Eta Eta partial                        
## 13    African  0.29        0.29                        
## 14 Amerindian  0.18        0.19                        
## 15 East Asian  0.10        0.10

HA_3way_s %>% MOD_summary(standardize = F, kfold = F) %>% write_clipboard()

##            V1    V2          V3       V4       V5    V6
## 1       coefs                                          
## 2   Predictor  Beta          SE CI lower CI upper      
## 3     African -2.00        0.26    -2.52    -1.48      
## 4  Amerindian -2.09        0.30    -2.69    -1.49      
## 5  East Asian -1.32        0.25    -1.81    -0.82      
## 6                                                      
## 7        meta                                          
## 8     outcome     N          df       R2   R2-adj R2-cv
## 9           S   328         324     0.19     0.19      
## 10                                                     
## 11       etas                                          
## 12  Predictor   Eta Eta partial                        
## 13    African  0.38        0.39                        
## 14 Amerindian  0.34        0.36                        
## 15 East Asian  0.26        0.28

#predicted gaps
#CA
(HA_3way_ca_preds = predict(HA_3way_ca, newdata = data.frame(African = c(1, 0, 0, 0),
                                         Amerindian = c(0, 1, 0, 0),
                                         East_Asian = c(0, 0, 1, 0))))

##     1     2     3     4 
## -1.46 -1.00 -0.37  0.17

HA_3way_ca_preds[1:3] - HA_3way_ca_preds[4]

##     1     2     3 
## -1.62 -1.16 -0.53

#S
(HA_3way_s_preds = predict(HA_3way_s, newdata = data.frame(African = c(1, 0, 0, 0),
                                                             Amerindian = c(0, 1, 0, 0),
                                                             East_Asian = c(0, 0, 1, 0))))

##     1     2     3     4 
## -1.65 -1.74 -0.97  0.35

HA_3way_s_preds[1:3] - HA_3way_s_preds[4]

##    1    2    3 
## -2.0 -2.1 -1.3

#calculate group means
d_W_H_means = d_W_H %>% plyr::ddply("SIRE_standard", function(x) {
  data.frame(
    African = wtd_mean(x[["African"]]),
    Amerindian = wtd_mean(x[["Amerindian"]]),
    European = wtd_mean(x[["European"]]),
    African_European_r = wtd.cors(x[["African"]], x[["European"]]) %>% as.vector,
    CA = wtd_mean(x[["CA"]]),
    n = nrow(x)
  )
})

#African ancestry
ggplot(d_W_H, aes(African, CA, color = SIRE_standard)) +
  geom_point() +
  geom_smooth(data = d_HA, method = lm, fullrange = T) +
  scale_x_continuous("African ancestry%", limits = c(0, 1), labels = scales::percent) +
  scale_y_continuous("Cognitive ability", breaks = -3:3) +
  scale_color_discrete(name = "SIRE (standard)") + 
  geom_point(data = d_W_H_means, size = 5) +
  theme_bw()

## Warning: Removed 5 rows containing non-finite values (stat_smooth).

## Warning: Removed 9 rows containing missing values (geom_point).

ggsave2("figures/H_African.png")

## Warning: Removed 5 rows containing non-finite values (stat_smooth).

## Warning: Removed 9 rows containing missing values (geom_point).

#Amerindian
ggplot(d_W_H, aes(Amerindian, CA, color = SIRE_standard)) +
  geom_point() +
  geom_smooth(data = d_HA, method = lm, fullrange = T) +
  scale_x_continuous("Amerindian ancestry%", limits = c(0, 1), labels = scales::percent) +
  scale_color_discrete(name = "SIRE (standard)") + 
  scale_y_continuous("Cognitive ability", breaks = -3:3) +
  geom_point(data = d_W_H_means, size = 5) +
  theme_bw()

## Warning: Removed 5 rows containing non-finite values (stat_smooth).

## Warning: Removed 9 rows containing missing values (geom_point).

ggsave2("figures/H_Amerindian.png")

## Warning: Removed 5 rows containing non-finite values (stat_smooth).

## Warning: Removed 9 rows containing missing values (geom_point).

#European
ggplot(d_W_H, aes(European, CA, color = SIRE_standard)) +
  geom_point() +
  geom_smooth(data = d_HA, method = lm, fullrange = T) +
  scale_x_continuous("European ancestry%", limits = c(0, 1), labels = scales::percent) +
  scale_y_continuous("Cognitive ability", breaks = -3:3) +
  scale_color_discrete(name = "SIRE (standard)") + 
  geom_point(data = d_W_H_means, size = 5) +
  theme_bw()

## Warning: Removed 5 rows containing non-finite values (stat_smooth).

## Warning: Removed 9 rows containing missing values (geom_point).

ggsave2("figures/H_European.png")

## Warning: Removed 5 rows containing non-finite values (stat_smooth).

## Warning: Removed 9 rows containing missing values (geom_point).

# 3d plot -----------------------------------------------------------------
#can only be done for persons where 3 ancestries sum to ~100%
#we choose european, amerindian, african for this purpose
d_triway = d_ping %>% filter(African + Amerindian + European >= .99)
nrow(d_triway)

## [1] 887

#fit model
(triway_fit = lm(CA ~ African + Amerindian, data = d_triway) %>% MOD_summary(standardize = F))

## 
##     ---- Model summary ----    
## Model coefficients
##            Beta    SE CI_lower CI_upper
## African    -1.4 0.099     -1.6     -1.2
## Amerindian -1.9 0.285     -2.4     -1.3
## 
## 
## Model meta-data
##   outcome   N  df  R2 R2-adj. R2-cv
## 1      CA 874 871 0.2     0.2    NA
## 
## 
## Etas from analysis of variance
##             Eta Eta_partial
## African    0.42        0.42
## Amerindian 0.20        0.22

summary(triway_fit$model_obj)

## 
## Call:
## lm(formula = CA ~ African + Amerindian, data = d_triway)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -3.630 -0.532  0.024  0.588  3.043 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   0.2914     0.0381    7.64  5.6e-14 ***
## African      -1.3648     0.0993  -13.74  < 2e-16 ***
## Amerindian   -1.8633     0.2853   -6.53  1.1e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9 on 871 degrees of freedom
##   (13 observations deleted due to missingness)
## Multiple R-squared:  0.198,  Adjusted R-squared:  0.196 
## F-statistic:  107 on 2 and 871 DF,  p-value: <2e-16

#plot in 3D
# car::scatter3d(CA ~ African + Amerindian, data = d_triway, axis.ticks = T, axis.scales = T)

#another package
library(plot3D)

#prepare surface fit
# predict values on regular xy grid
n_grid_points = 25
triway_surface_data = list()
triway_surface_data$x.pred <- seq(0, 1, length.out = n_grid_points)
triway_surface_data$y.pred <- seq(0, 1, length.out = n_grid_points)
triway_surface_data$xy <- expand.grid( African = triway_surface_data$x.pred, Amerindian = triway_surface_data$y.pred)
triway_surface_data$xy$sum = rowSums(triway_surface_data$xy)

triway_surface_data$z.pred <- matrix(predict(triway_fit$model_obj, newdata = triway_surface_data$xy), 
                 nrow = n_grid_points, ncol = n_grid_points)

#code impossible values as NA
#ancestry sums to 1, one cannot be 100% african and 100% amerindian!
triway_surface_data$z.pred[matrix(triway_surface_data$xy$sum > 1, nrow = n_grid_points, ncol = n_grid_points)] = NA

#make title
triway_surface_data$title = glue::glue("Cognitive ability as a function of genomic ancesty\npersons with European, African and Amerindian ancestry only\nn={nrow(d_triway)}")

#plot
d_triway %$% scatter3D(x = African, y = Amerindian, z = CA,
                       theta = 120, phi = 20,
                       xlim = c(0, 1), ylim = c(0, 1),
                       ticktype = "detailed",
                       xlab = "African ancestry", ylab = "Amerindian ancestry", zlab = "Cognitive ability",
                       surf = list(x = triway_surface_data$x.pred, y = triway_surface_data$y.pred, z = triway_surface_data$z.pred,  
                                   facets = NA),
                       main = triway_surface_data$title)

#save
png("figures/triway_points_surface.png")
d_triway %$% scatter3D(x = African, y = Amerindian, z = CA,
                       theta = 120, phi = 20,
                       xlim = c(0, 1), ylim = c(0, 1),
                       ticktype = "detailed",
                       xlab = "African ancestry", ylab = "Amerindian ancestry", zlab = "Cognitive ability",
                       surf = list(x = triway_surface_data$x.pred, y = triway_surface_data$y.pred, z = triway_surface_data$z.pred,  
                                   facets = NA),
                       main = triway_surface_data$title)
dev.off()

## png 
##   2

#save without caption (for paper)
png("figures/triway_points_surface_no_caption.png")
d_triway %$% scatter3D(x = African, y = Amerindian, z = CA,
                       theta = 120, phi = 20,
                       xlim = c(0, 1), ylim = c(0, 1),
                       ticktype = "detailed",
                       xlab = "African ancestry", ylab = "Amerindian ancestry", zlab = "Cognitive ability",
                       surf = list(x = triway_surface_data$x.pred, y = triway_surface_data$y.pred, z = triway_surface_data$z.pred,  
                                   facets = NA)
                       )
dev.off()

## png 
##   2

# eta example -------------------------------------------------------------
#get the eta and eta^2
all_models_nonEuro[[1]]$fit[[1]]$aov %>% lsr::etaSquared() %>% as.data.frame %>% mutate(eta = sqrt(eta.sq)) %>% write_clipboard(digits = 4)

##   Eta sq Eta sq part    Eta
## 1 0.0078      0.0092 0.0881
## 2 0.0379      0.0436 0.1947
## 3 0.0086      0.0102 0.0925
## 4 0.0001      0.0001 0.0078
## 5 0.0088      0.0104 0.0936
## 6 0.0004      0.0005 0.0203
## 7 0.0001      0.0001 0.0086

#sum the genomic preds eta^2
all_models_nonEuro[[1]]$fit[[1]]$aov %>% lsr::etaSquared() %>% as.data.frame %>% mutate(eta = sqrt(eta.sq)) %>% `[`(2:6, 1) %>% sum

## [1] 0.056

#sqrt
all_models_nonEuro[[1]]$fit[[1]]$aov %>% lsr::etaSquared() %>% as.data.frame %>% mutate(eta = sqrt(eta.sq)) %>% `[`(2:6, 1) %>% sum %>% sqrt

## [1] 0.24

#predict african ancestry from non-euro ancestry
lm("African ~ Amerindian + Asian + Central_Asian + Oceanian", data = d_ping) %>% MOD_summary(progress = F, standardize = F)

## 
##     ---- Model summary ----    
## Model coefficients
##                 Beta    SE CI_lower CI_upper
## Amerindian    -0.284 0.061    -0.40   -0.165
## Asian         -0.138 0.018    -0.17   -0.102
## Central_Asian -0.086 0.059    -0.20    0.029
## Oceanian      -0.582 0.249    -1.07   -0.093
## 
## 
## Model meta-data
##   outcome    N   df    R2 R2-adj. R2-cv
## 1 African 1391 1386 0.063    0.06    NA
## 
## 
## Etas from analysis of variance
##                 Eta Eta_partial
## Amerindian    0.121       0.124
## Asian         0.198       0.200
## Central_Asian 0.038       0.039
## Oceanian      0.061       0.063

# other models ------------------------------------------------------------
#some other results that may have marginal interest

#only European ancestry + SIREs
#CA
all_models_euro[[1]][["fit"]]

## [[1]]
## 
##     ---- Model summary ----    
## Model coefficients
##                                    Beta    SE CI_lower CI_upper
## SIRE_standard: White             0.0000    NA       NA       NA
## SIRE_standard: African_American -0.4184 0.147   -0.706    -0.13
## SIRE_standard: American_Indian   0.1310 0.472   -0.794     1.06
## SIRE_standard: Asian             0.6110 0.164    0.289     0.93
## SIRE_standard: Hispanic         -0.3409 0.088   -0.514    -0.17
## SIRE_standard: Multi_ethnic      0.1252 0.111   -0.092     0.34
## SIRE_standard: Other            -0.0380 0.228   -0.484     0.41
## SIRE_standard: Pacific_Islander -0.1732 0.268   -0.700     0.35
## European                         0.6890 0.146    0.403     0.98
## EFL                              0.0038 0.085   -0.163     0.17
## 
## 
## Model meta-data
##   outcome    N   df   R2 R2-adj. R2-cv
## 1      CA 1369 1359 0.13    0.13    NA
## 
## 
## Etas from analysis of variance
##                  Eta Eta_partial
## SIRE_standard 0.2766      0.2845
## European      0.1194      0.1271
## EFL           0.0011      0.0012
## 
## [[2]]
## 
##     ---- Model summary ----    
## Model coefficients
##                                               Beta    SE CI_lower CI_upper
## SIRE_common: White                           0.000    NA       NA       NA
## SIRE_common: African_American               -0.564 0.149    -0.86   -0.272
## SIRE_common: Asian                           0.447 0.166     0.12    0.773
## SIRE_common: Asian, White                    0.549 0.144     0.27    0.831
## SIRE_common: Hispanic                       -0.654 0.135    -0.92   -0.388
## SIRE_common: Hispanic, African_American     -0.894 0.203    -1.29   -0.495
## SIRE_common: Hispanic, White                -0.281 0.097    -0.47   -0.092
## SIRE_common: Pacific_Islander, Asian, White -0.114 0.188    -0.48    0.255
## SIRE_common: Other                          -0.248 0.110    -0.46   -0.031
## European                                     0.507 0.151     0.21    0.803
## EFL                                          0.018 0.084    -0.15    0.184
## 
## 
## Model meta-data
##   outcome    N   df   R2 R2-adj. R2-cv
## 1      CA 1369 1358 0.16    0.15    NA
## 
## 
## Etas from analysis of variance
##                Eta Eta_partial
## SIRE_common 0.3203      0.3293
## European    0.0837      0.0908
## EFL         0.0054      0.0058
## 
## [[3]]
## 
##     ---- Model summary ----    
## Model coefficients
##                      Beta    SE CI_lower CI_upper
## Hispanic_c         -0.676 0.125    -0.92   -0.431
## Pacific_Islander_c -0.524 0.229    -0.97   -0.075
## Asian_c             0.522 0.177     0.17    0.869
## African_American_c -0.579 0.161    -0.90   -0.263
## American_Indian_c  -0.439 0.266    -0.96    0.083
## Other_c            -0.118 0.227    -0.56    0.327
## European            0.543 0.172     0.21    0.880
## EFL                 0.013 0.084    -0.15    0.178
## 
## 
## Model meta-data
##   outcome    N   df   R2 R2-adj. R2-cv
## 1      CA 1369 1360 0.16    0.15    NA
## 
## 
## Etas from analysis of variance
##                       Eta Eta_partial
## Hispanic_c         0.1350      0.1455
## Pacific_Islander_c 0.0570      0.0619
## Asian_c            0.0734      0.0797
## African_American_c 0.0894      0.0969
## American_Indian_c  0.0411      0.0447
## Other_c            0.0130      0.0141
## European           0.0788      0.0855
## EFL                0.0038      0.0042
## 
## [[4]]
## 
##     ---- Model summary ----    
## Model coefficients
##                     Beta    SE CI_lower CI_upper
## Hispanic         -0.3129 0.063   -0.436    -0.19
## Pacific_Islander -0.4107 0.097   -0.601    -0.22
## Asian             0.5510 0.089    0.376     0.73
## African_American -0.3138 0.094   -0.497    -0.13
## American_Indian  -0.1187 0.120   -0.354     0.12
## Other             0.1005 0.223   -0.337     0.54
## White             0.2279 0.087    0.057     0.40
## European          0.4710 0.141    0.194     0.75
## EFL               0.0026 0.084   -0.163     0.17
## 
## 
## Model meta-data
##   outcome    N   df   R2 R2-adj. R2-cv
## 1      CA 1369 1359 0.16    0.15    NA
## 
## 
## Etas from analysis of variance
##                      Eta Eta_partial
## Hispanic         0.12432     0.13412
## Pacific_Islander 0.10559     0.11420
## Asian            0.15347     0.16480
## African_American 0.08353     0.09057
## American_Indian  0.02462     0.02680
## Other            0.01121     0.01221
## White            0.06531     0.07092
## European         0.08301     0.09000
## EFL              0.00076     0.00083
## 
## [[5]]
## 
##     ---- Model summary ----    
## Model coefficients
##            Beta    SE CI_lower CI_upper
## European  0.633 0.072     0.49     0.77
## EFL      -0.051 0.085    -0.22     0.12
## 
## 
## Model meta-data
##   outcome    N   df    R2 R2-adj. R2-cv
## 1      CA 1369 1366 0.054   0.053    NA
## 
## 
## Etas from analysis of variance
##            Eta Eta_partial
## European 0.232       0.232
## EFL      0.016       0.016
## 
## [[6]]
## 
##     ---- Model summary ----    
## Model coefficients
##                                   Beta    SE CI_lower CI_upper
## SIRE_standard: White             0.000    NA       NA       NA
## SIRE_standard: African_American -0.971 0.090    -1.15   -0.795
## SIRE_standard: American_Indian  -0.089 0.473    -1.02    0.839
## SIRE_standard: Asian            -0.016 0.097    -0.21    0.175
## SIRE_standard: Hispanic         -0.615 0.067    -0.75   -0.484
## SIRE_standard: Multi_ethnic     -0.238 0.080    -0.40   -0.081
## SIRE_standard: Other            -0.332 0.221    -0.76    0.101
## SIRE_standard: Pacific_Islander -0.761 0.240    -1.23   -0.291
## EFL                              0.039 0.085    -0.13    0.206
## 
## 
## Model meta-data
##   outcome    N   df   R2 R2-adj. R2-cv
## 1      CA 1369 1360 0.12    0.11    NA
## 
## 
## Etas from analysis of variance
##                 Eta Eta_partial
## SIRE_standard 0.341       0.341
## EFL           0.012       0.012
## 
## [[7]]
## 
##     ---- Model summary ----    
## Model coefficients
##                                               Beta    SE CI_lower CI_upper
## SIRE_common: White                           0.000    NA       NA       NA
## SIRE_common: African_American               -0.971 0.088    -1.14   -0.799
## SIRE_common: Asian                          -0.012 0.095    -0.20    0.176
## SIRE_common: Asian, White                    0.320 0.128     0.07    0.571
## SIRE_common: Hispanic                       -0.868 0.120    -1.10   -0.633
## SIRE_common: Hispanic, African_American     -1.222 0.179    -1.57   -0.871
## SIRE_common: Hispanic, White                -0.413 0.089    -0.59   -0.239
## SIRE_common: Pacific_Islander, Asian, White -0.398 0.168    -0.73   -0.069
## SIRE_common: Other                          -0.524 0.074    -0.67   -0.379
## EFL                                          0.054 0.084    -0.11    0.219
## 
## 
## Model meta-data
##   outcome    N   df   R2 R2-adj. R2-cv
## 1      CA 1369 1359 0.15    0.14    NA
## 
## 
## Etas from analysis of variance
##               Eta Eta_partial
## SIRE_common 0.386       0.387
## EFL         0.016       0.018
## 
## [[8]]
## 
##     ---- Model summary ----    
## Model coefficients
##                      Beta    SE CI_lower CI_upper
## Hispanic_c         -0.917 0.099    -1.11   -0.722
## Pacific_Islander_c -0.996 0.175    -1.34   -0.653
## Asian_c             0.036 0.088    -0.14    0.210
## African_American_c -1.016 0.084    -1.18   -0.852
## American_Indian_c  -0.615 0.261    -1.13   -0.103
## Other_c            -0.345 0.216    -0.77    0.078
## EFL                 0.050 0.083    -0.11    0.214
## 
## 
## Model meta-data
##   outcome    N   df   R2 R2-adj. R2-cv
## 1      CA 1369 1361 0.15    0.15    NA
## 
## 
## Etas from analysis of variance
##                      Eta Eta_partial
## Hispanic_c         0.231       0.243
## Pacific_Islander_c 0.143       0.153
## Asian_c            0.010       0.011
## African_American_c 0.304       0.313
## American_Indian_c  0.059       0.064
## Other_c            0.040       0.043
## EFL                0.015       0.016
## 
## [[9]]
## 
##     ---- Model summary ----    
## Model coefficients
##                    Beta    SE CI_lower CI_upper
## Hispanic         -0.341 0.062    -0.46    -0.22
## Pacific_Islander -0.479 0.095    -0.67    -0.29
## Asian             0.378 0.073     0.23     0.52
## African_American -0.460 0.083    -0.62    -0.30
## American_Indian  -0.108 0.121    -0.34     0.13
## Other             0.110 0.224    -0.33     0.55
## White             0.410 0.068     0.28     0.54
## EFL               0.052 0.083    -0.11     0.22
## 
## 
## Model meta-data
##   outcome    N   df   R2 R2-adj. R2-cv
## 1      CA 1369 1360 0.15    0.14    NA
## 
## 
## Etas from analysis of variance
##                    Eta Eta_partial
## Hispanic         0.137       0.147
## Pacific_Islander 0.126       0.135
## Asian            0.129       0.139
## African_American 0.139       0.149
## American_Indian  0.022       0.024
## Other            0.012       0.013
## White            0.151       0.161
## EFL              0.016       0.017

#S
all_models_euro[[1]][["fit"]]

## [[1]]
## 
##     ---- Model summary ----    
## Model coefficients
##                                    Beta    SE CI_lower CI_upper
## SIRE_standard: White             0.0000    NA       NA       NA
## SIRE_standard: African_American -0.4184 0.147   -0.706    -0.13
## SIRE_standard: American_Indian   0.1310 0.472   -0.794     1.06
## SIRE_standard: Asian             0.6110 0.164    0.289     0.93
## SIRE_standard: Hispanic         -0.3409 0.088   -0.514    -0.17
## SIRE_standard: Multi_ethnic      0.1252 0.111   -0.092     0.34
## SIRE_standard: Other            -0.0380 0.228   -0.484     0.41
## SIRE_standard: Pacific_Islander -0.1732 0.268   -0.700     0.35
## European                         0.6890 0.146    0.403     0.98
## EFL                              0.0038 0.085   -0.163     0.17
## 
## 
## Model meta-data
##   outcome    N   df   R2 R2-adj. R2-cv
## 1      CA 1369 1359 0.13    0.13    NA
## 
## 
## Etas from analysis of variance
##                  Eta Eta_partial
## SIRE_standard 0.2766      0.2845
## European      0.1194      0.1271
## EFL           0.0011      0.0012
## 
## [[2]]
## 
##     ---- Model summary ----    
## Model coefficients
##                                               Beta    SE CI_lower CI_upper
## SIRE_common: White                           0.000    NA       NA       NA
## SIRE_common: African_American               -0.564 0.149    -0.86   -0.272
## SIRE_common: Asian                           0.447 0.166     0.12    0.773
## SIRE_common: Asian, White                    0.549 0.144     0.27    0.831
## SIRE_common: Hispanic                       -0.654 0.135    -0.92   -0.388
## SIRE_common: Hispanic, African_American     -0.894 0.203    -1.29   -0.495
## SIRE_common: Hispanic, White                -0.281 0.097    -0.47   -0.092
## SIRE_common: Pacific_Islander, Asian, White -0.114 0.188    -0.48    0.255
## SIRE_common: Other                          -0.248 0.110    -0.46   -0.031
## European                                     0.507 0.151     0.21    0.803
## EFL                                          0.018 0.084    -0.15    0.184
## 
## 
## Model meta-data
##   outcome    N   df   R2 R2-adj. R2-cv
## 1      CA 1369 1358 0.16    0.15    NA
## 
## 
## Etas from analysis of variance
##                Eta Eta_partial
## SIRE_common 0.3203      0.3293
## European    0.0837      0.0908
## EFL         0.0054      0.0058
## 
## [[3]]
## 
##     ---- Model summary ----    
## Model coefficients
##                      Beta    SE CI_lower CI_upper
## Hispanic_c         -0.676 0.125    -0.92   -0.431
## Pacific_Islander_c -0.524 0.229    -0.97   -0.075
## Asian_c             0.522 0.177     0.17    0.869
## African_American_c -0.579 0.161    -0.90   -0.263
## American_Indian_c  -0.439 0.266    -0.96    0.083
## Other_c            -0.118 0.227    -0.56    0.327
## European            0.543 0.172     0.21    0.880
## EFL                 0.013 0.084    -0.15    0.178
## 
## 
## Model meta-data
##   outcome    N   df   R2 R2-adj. R2-cv
## 1      CA 1369 1360 0.16    0.15    NA
## 
## 
## Etas from analysis of variance
##                       Eta Eta_partial
## Hispanic_c         0.1350      0.1455
## Pacific_Islander_c 0.0570      0.0619
## Asian_c            0.0734      0.0797
## African_American_c 0.0894      0.0969
## American_Indian_c  0.0411      0.0447
## Other_c            0.0130      0.0141
## European           0.0788      0.0855
## EFL                0.0038      0.0042
## 
## [[4]]
## 
##     ---- Model summary ----    
## Model coefficients
##                     Beta    SE CI_lower CI_upper
## Hispanic         -0.3129 0.063   -0.436    -0.19
## Pacific_Islander -0.4107 0.097   -0.601    -0.22
## Asian             0.5510 0.089    0.376     0.73
## African_American -0.3138 0.094   -0.497    -0.13
## American_Indian  -0.1187 0.120   -0.354     0.12
## Other             0.1005 0.223   -0.337     0.54
## White             0.2279 0.087    0.057     0.40
## European          0.4710 0.141    0.194     0.75
## EFL               0.0026 0.084   -0.163     0.17
## 
## 
## Model meta-data
##   outcome    N   df   R2 R2-adj. R2-cv
## 1      CA 1369 1359 0.16    0.15    NA
## 
## 
## Etas from analysis of variance
##                      Eta Eta_partial
## Hispanic         0.12432     0.13412
## Pacific_Islander 0.10559     0.11420
## Asian            0.15347     0.16480
## African_American 0.08353     0.09057
## American_Indian  0.02462     0.02680
## Other            0.01121     0.01221
## White            0.06531     0.07092
## European         0.08301     0.09000
## EFL              0.00076     0.00083
## 
## [[5]]
## 
##     ---- Model summary ----    
## Model coefficients
##            Beta    SE CI_lower CI_upper
## European  0.633 0.072     0.49     0.77
## EFL      -0.051 0.085    -0.22     0.12
## 
## 
## Model meta-data
##   outcome    N   df    R2 R2-adj. R2-cv
## 1      CA 1369 1366 0.054   0.053    NA
## 
## 
## Etas from analysis of variance
##            Eta Eta_partial
## European 0.232       0.232
## EFL      0.016       0.016
## 
## [[6]]
## 
##     ---- Model summary ----    
## Model coefficients
##                                   Beta    SE CI_lower CI_upper
## SIRE_standard: White             0.000    NA       NA       NA
## SIRE_standard: African_American -0.971 0.090    -1.15   -0.795
## SIRE_standard: American_Indian  -0.089 0.473    -1.02    0.839
## SIRE_standard: Asian            -0.016 0.097    -0.21    0.175
## SIRE_standard: Hispanic         -0.615 0.067    -0.75   -0.484
## SIRE_standard: Multi_ethnic     -0.238 0.080    -0.40   -0.081
## SIRE_standard: Other            -0.332 0.221    -0.76    0.101
## SIRE_standard: Pacific_Islander -0.761 0.240    -1.23   -0.291
## EFL                              0.039 0.085    -0.13    0.206
## 
## 
## Model meta-data
##   outcome    N   df   R2 R2-adj. R2-cv
## 1      CA 1369 1360 0.12    0.11    NA
## 
## 
## Etas from analysis of variance
##                 Eta Eta_partial
## SIRE_standard 0.341       0.341
## EFL           0.012       0.012
## 
## [[7]]
## 
##     ---- Model summary ----    
## Model coefficients
##                                               Beta    SE CI_lower CI_upper
## SIRE_common: White                           0.000    NA       NA       NA
## SIRE_common: African_American               -0.971 0.088    -1.14   -0.799
## SIRE_common: Asian                          -0.012 0.095    -0.20    0.176
## SIRE_common: Asian, White                    0.320 0.128     0.07    0.571
## SIRE_common: Hispanic                       -0.868 0.120    -1.10   -0.633
## SIRE_common: Hispanic, African_American     -1.222 0.179    -1.57   -0.871
## SIRE_common: Hispanic, White                -0.413 0.089    -0.59   -0.239
## SIRE_common: Pacific_Islander, Asian, White -0.398 0.168    -0.73   -0.069
## SIRE_common: Other                          -0.524 0.074    -0.67   -0.379
## EFL                                          0.054 0.084    -0.11    0.219
## 
## 
## Model meta-data
##   outcome    N   df   R2 R2-adj. R2-cv
## 1      CA 1369 1359 0.15    0.14    NA
## 
## 
## Etas from analysis of variance
##               Eta Eta_partial
## SIRE_common 0.386       0.387
## EFL         0.016       0.018
## 
## [[8]]
## 
##     ---- Model summary ----    
## Model coefficients
##                      Beta    SE CI_lower CI_upper
## Hispanic_c         -0.917 0.099    -1.11   -0.722
## Pacific_Islander_c -0.996 0.175    -1.34   -0.653
## Asian_c             0.036 0.088    -0.14    0.210
## African_American_c -1.016 0.084    -1.18   -0.852
## American_Indian_c  -0.615 0.261    -1.13   -0.103
## Other_c            -0.345 0.216    -0.77    0.078
## EFL                 0.050 0.083    -0.11    0.214
## 
## 
## Model meta-data
##   outcome    N   df   R2 R2-adj. R2-cv
## 1      CA 1369 1361 0.15    0.15    NA
## 
## 
## Etas from analysis of variance
##                      Eta Eta_partial
## Hispanic_c         0.231       0.243
## Pacific_Islander_c 0.143       0.153
## Asian_c            0.010       0.011
## African_American_c 0.304       0.313
## American_Indian_c  0.059       0.064
## Other_c            0.040       0.043
## EFL                0.015       0.016
## 
## [[9]]
## 
##     ---- Model summary ----    
## Model coefficients
##                    Beta    SE CI_lower CI_upper
## Hispanic         -0.341 0.062    -0.46    -0.22
## Pacific_Islander -0.479 0.095    -0.67    -0.29
## Asian             0.378 0.073     0.23     0.52
## African_American -0.460 0.083    -0.62    -0.30
## American_Indian  -0.108 0.121    -0.34     0.13
## Other             0.110 0.224    -0.33     0.55
## White             0.410 0.068     0.28     0.54
## EFL               0.052 0.083    -0.11     0.22
## 
## 
## Model meta-data
##   outcome    N   df   R2 R2-adj. R2-cv
## 1      CA 1369 1360 0.15    0.14    NA
## 
## 
## Etas from analysis of variance
##                    Eta Eta_partial
## Hispanic         0.137       0.147
## Pacific_Islander 0.126       0.135
## Asian            0.129       0.139
## African_American 0.139       0.149
## American_Indian  0.022       0.024
## Other            0.012       0.013
## White            0.151       0.161
## EFL              0.016       0.017

Other figures

### Some other analyses


# Cognitive ability stability ---------------------------------------------
#for this we need Lynn's IQs and Baten's age heaping data
#both are in the mega dataset
#one can also get them from:
#http://clio-infra.eu/
#Lynn's book

mega = read_csv("data/Megadataset_v2.0p.csv")

## Parsed with column specification:
## cols(
##   .default = col_double(),
##   X1 = col_character(),
##   EthnicHeterogenityVanhanen2012 = col_integer(),
##   EthnicConflictVanhanen2012 = col_integer(),
##   SlowTimePrefWangetal2011 = col_integer(),
##   Math00Mean = col_integer(),
##   Math00SD = col_integer(),
##   Read00Mean = col_integer(),
##   Read00SD = col_integer(),
##   Sci00Mean = col_integer(),
##   Sci00SD = col_integer(),
##   Math03Mean = col_integer(),
##   Math03SD = col_integer(),
##   Read03Mean = col_integer(),
##   Read03SD = col_integer(),
##   Sci03Mean = col_integer(),
##   Sci03SD = col_integer(),
##   PS03Mean = col_integer(),
##   PS03SD = col_integer(),
##   Read09.Native = col_integer(),
##   Read09.1g = col_integer()
##   # ... with 102 more columns
## )

## See spec(...) for full column specifications.

#rename some vars
mega$LynnIQ = mega$LV2012estimatedIQ

#find the age heaping vars
age_heaping_vars = str_subset(names(mega), "^AH\\d+")

#the correlations
wtd.cors(mega[c(age_heaping_vars, "LynnIQ")])

##        AH1800 AH1820 AH1830 AH1840 AH1850 AH1860 AH1870 AH1880 AH1890 AH1900 AH1910 AH1920 AH1930 AH1940 AH1950 AH1960 AH1970 LynnIQ
## AH1800   1.00   0.95   0.94   0.93   0.94   0.95   0.96   0.91   0.90   0.90   0.91   0.69   0.64   0.68   0.77   0.76    NaN   0.85
## AH1820   0.95   1.00   0.99   0.97   0.94   0.88   0.94   0.81   0.76   0.69   0.71   0.58   0.58   0.21   0.64   0.74    NaN   0.62
## AH1830   0.94   0.99   1.00   0.99   0.98   0.94   0.95   0.83   0.78   0.70   0.70   0.54   0.53   0.35   0.65   0.74    NaN   0.70
## AH1840   0.93   0.97   0.99   1.00   0.99   0.95   0.96   0.86   0.81   0.74   0.74   0.60   0.59   0.42   0.71   0.79    NaN   0.70
## AH1850   0.94   0.94   0.98   0.99   1.00   0.98   0.99   0.90   0.84   0.77   0.77   0.61   0.61   0.43   0.73   0.81    NaN   0.73
## AH1860   0.95   0.88   0.94   0.95   0.98   1.00   0.97   0.87   0.81   0.72   0.75   0.61   0.60   0.49   0.69   0.67    NaN   0.59
## AH1870   0.96   0.94   0.95   0.96   0.99   0.97   1.00   0.98   0.96   0.93   0.89   0.81   0.73   0.69   0.60   0.86    NaN   0.64
## AH1880   0.91   0.81   0.83   0.86   0.90   0.87   0.98   1.00   0.98   0.94   0.93   0.89   0.86   0.73   0.71   0.86    NaN   0.52
## AH1890   0.90   0.76   0.78   0.81   0.84   0.81   0.96   0.98   1.00   0.96   0.94   0.93   0.91   0.85   0.69   0.61    NaN   0.52
## AH1900   0.90   0.69   0.70   0.74   0.77   0.72   0.93   0.94   0.96   1.00   0.96   0.95   0.94   0.90   0.73   0.55    NaN   0.47
## AH1910   0.91   0.71   0.70   0.74   0.77   0.75   0.89   0.93   0.94   0.96   1.00   0.98   0.97   0.94   0.82   0.55    NaN   0.46
## AH1920   0.69   0.58   0.54   0.60   0.61   0.61   0.81   0.89   0.93   0.95   0.98   1.00   0.98   0.95   0.81   0.56    NaN   0.47
## AH1930   0.64   0.58   0.53   0.59   0.61   0.60   0.73   0.86   0.91   0.94   0.97   0.98   1.00   0.98   0.89   0.56    NaN   0.44
## AH1940   0.68   0.21   0.35   0.42   0.43   0.49   0.69   0.73   0.85   0.90   0.94   0.95   0.98   1.00   0.91   0.64    NaN   0.45
## AH1950   0.77   0.64   0.65   0.71   0.73   0.69   0.60   0.71   0.69   0.73   0.82   0.81   0.89   0.91   1.00   0.97  -0.53   0.45
## AH1960   0.76   0.74   0.74   0.79   0.81   0.67   0.86   0.86   0.61   0.55   0.55   0.56   0.56   0.64   0.97   1.00   0.22   0.51
## AH1970    NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN  -0.53   0.22   1.00   0.74
## LynnIQ   0.85   0.62   0.70   0.70   0.73   0.59   0.64   0.52   0.52   0.47   0.46   0.47   0.44   0.45   0.45   0.51   0.74   1.00

#sample sizes
count.pairwise(mega[c(age_heaping_vars, "LynnIQ")])

##        AH1800 AH1820 AH1830 AH1840 AH1850 AH1860 AH1870 AH1880 AH1890 AH1900 AH1910 AH1920 AH1930 AH1940 AH1950 AH1960 AH1970 LynnIQ
## AH1800     31     25     27     26     22     20     22     25     24     22     21     18     17     17     16     14      0     29
## AH1820     25     45     45     43     37     34     22     38     36     35     34     33     32     31     22     18      0     43
## AH1830     27     45     49     47     41     38     26     42     40     39     38     35     34     33     22     18      0     46
## AH1840     26     43     47     50     44     41     29     44     42     41     40     37     36     35     23     19      0     48
## AH1850     22     37     41     44     45     41     27     39     37     36     35     32     31     30     18     16      0     43
## AH1860     20     34     38     41     41     56     41     53     50     49     47     37     37     35     23     17      0     54
## AH1870     22     22     26     29     27     41     62     59     56     54     53     43     37     31     26     20      0     61
## AH1880     25     38     42     44     39     53     59     95     92     89     88     78     72     55     38     28      0     93
## AH1890     24     36     40     42     37     50     56     92    109    105    103     94     88     70     41     31      1    107
## AH1900     22     35     39     41     36     49     54     89    105    121    116    107    101     84     52     31      1    118
## AH1910     21     34     38     40     35     47     53     88    103    116    133    123    117     99     67     46      1    130
## AH1920     18     33     35     37     32     37     43     78     94    107    123    126    120    102     70     49      3    123
## AH1930     17     32     34     36     31     37     37     72     88    101    117    120    120    102     70     49      3    117
## AH1940     17     31     33     35     30     35     31     55     70     84     99    102    102    107     73     52      3    104
## AH1950     16     22     22     23     18     23     26     38     41     52     67     70     70     73     77     56      5     74
## AH1960     14     18     18     19     16     17     20     28     31     31     46     49     49     52     56     56      5     54
## AH1970      0      0      0      0      0      0      0      0      1      1      1      3      3      3      5      5      5      4
## LynnIQ     29     43     46     48     43     54     61     93    107    118    130    123    117    104     74     54      4    203

#plots
plots = map(seq(1850, 1880, by = 10), ~GG_scatter(mega, "AH" + ., "LynnIQ", case_names = "Names") + xlab(sprintf("Age heaping %d", .)) + ylab("Lynn's IQ estimates"))
GG_save("figures/age_heaping_IQ.png", plot = grid.arrange(grobs = plots, ncol = 2))

# Old numeracy / literacy data --------------------------------------------

lit_num = data_frame(
  year = c(1850, 1870, 1900) %>% as.integer(),
  W_literacy = c(.86, .892, .936),
  W_numeracy = c(.876, .885, .954),
  B_literacy = c(.585, .349, .62),
  B_numeracy = c(.696, .70, .86)
) %>% mutate(
  BW_literacy = qnorm(W_literacy) - qnorm(B_literacy),
  BW_numeracy = qnorm(W_numeracy) - qnorm(B_numeracy),
  BW_mean = (BW_literacy + BW_numeracy) / 2
) %>% write_clipboard()

##   Year W literacy W numeracy B literacy B numeracy BW literacy BW numeracy BW mean
## 1 1850       0.86       0.88       0.58       0.70        0.87        0.64    0.75
## 2 1870       0.89       0.88       0.35       0.70        1.63        0.68    1.15
## 3 1900       0.94       0.95       0.62       0.86        1.22        0.60    0.91

mean(lit_num$BW_mean)

## [1] 0.94

Versions

write_sessioninfo()

## R version 3.5.1 (2018-07-02)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Linux Mint 18
## 
## Matrix products: default
## BLAS: /usr/lib/openblas-base/libblas.so.3
## LAPACK: /usr/lib/libopenblasp-r0.2.18.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C               LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8     LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8    LC_PAPER=en_US.UTF-8       LC_NAME=C                  LC_ADDRESS=C               LC_TELEPHONE=C             LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] plot3D_1.1.1       bindrcpp_0.2.2     gridExtra_2.3      kirkegaard_2018.05 metafor_2.0-0      Matrix_1.2-14      assertthat_0.2.0   magrittr_1.5       forcats_0.3.0      stringr_1.3.1      dplyr_0.7.6        purrr_0.2.5        readr_1.1.1        tidyr_0.8.1        tibble_1.4.2       tidyverse_1.2.1    weights_0.90       mice_3.3.0         gdata_2.18.0       Hmisc_4.1-1        ggplot2_3.0.0      Formula_1.2-3      survival_2.42-6    lattice_0.20-35    psych_1.8.4        plyr_1.8.4         pacman_0.4.6      
## 
## loaded via a namespace (and not attached):
##  [1] minqa_1.2.4         colorspace_1.3-2    class_7.3-14        rio_0.5.16          rprojroot_1.3-2     htmlTable_1.12      base64enc_0.1-3     rstudioapi_0.8      fansi_0.3.0         lubridate_1.7.4     xml2_1.2.0          splines_3.5.1       mnormt_1.5-5        robustbase_0.93-3   knitr_1.20          jsonlite_1.6        nloptr_1.0.4        lsr_0.5             broom_0.5.0         cluster_2.0.7-1     curry_0.1.1         compiler_3.5.1      httr_1.3.1          backports_1.1.2     lazyeval_0.2.1      cli_1.0.1           acepack_1.4.1       htmltools_0.3.6     tools_3.5.1        
## [30] misc3d_0.8-4        gtable_0.2.0        glue_1.3.0          reshape2_1.4.3      Rcpp_0.12.18        carData_3.0-2       cellranger_1.1.0    nlme_3.1-137        multilevel_2.6      lmtest_0.9-36       laeken_0.4.6        openxlsx_4.1.0      lme4_1.1-18-1       rvest_0.3.2         gtools_3.8.1        pan_1.6             DEoptimR_1.0-8      MASS_7.3-50         zoo_1.8-4           scales_1.0.0        VIM_4.7.0           hms_0.4.2           parallel_3.5.1      RColorBrewer_1.1-2  psychometric_2.2    yaml_2.2.0          curl_3.2            rpart_4.1-13        latticeExtra_0.6-28
## [59] stringi_1.2.4       e1071_1.7-0         checkmate_1.8.5     boot_1.3-20         zip_1.0.0           rlang_0.2.2         pkgconfig_2.0.2     evaluate_0.11       bindr_0.1.1         htmlwidgets_1.2     labeling_0.3        tidyselect_0.2.4    R6_2.2.2            mitml_0.3-6         pillar_1.3.0        haven_1.1.2         foreign_0.8-71      withr_2.1.2         abind_1.4-5         sp_1.3-1            nnet_7.3-12         modelr_0.1.2        crayon_1.3.4        car_3.0-2           jomo_2.6-4          utf8_1.1.4          rmarkdown_1.10      grid_3.5.1          readxl_1.1.0       
## [88] data.table_1.11.6   vcd_1.4-4           digest_0.6.17       munsell_0.5.0

R Notebook for: Biogeographic ancestry, cognitive ability, and socioeconomic outcomes

Emil O. W. Kirkegaard