Init

options(digits = 3)
library(pacman)
p_load(kirkegaard, rms, rvest)

#describe overwrite
describe = function(...) {
  y = psych::describe(...)
  class(y) = "data.frame"
  y
}

#standardize IDs
#VCF conversion messes them up
std_ids = function(x) {
  x %>% 
    #replace all dots with underscores
    str_replace_all("\\.", "_") %>% 
    #replace all dashes with underscores
    str_replace_all("\\-", "_") %>% 
    #trim any leading underscores
    str_replace("^_", "")
}

Data

Reich lab compilation: https://reich.hms.harvard.edu/downloadable-genotypes-worlds-published-ancient-dna-data

Read PGSs

#find PGS
pgs_files = dir("data", pattern = ".profile$")

#read files
pgs_scores_raw = map("data/" + pgs_files, ~read_table2(.))
## Parsed with column specification:
## cols(
##   FID = col_double(),
##   IID = col_character(),
##   PHENO = col_double(),
##   CNT = col_double(),
##   CNT2 = col_double(),
##   SCORE = col_double()
## )
## Parsed with column specification:
## cols(
##   FID = col_double(),
##   IID = col_character(),
##   PHENO = col_double(),
##   CNT = col_double(),
##   CNT2 = col_double(),
##   SCORE = col_double()
## )
## Parsed with column specification:
## cols(
##   FID = col_double(),
##   IID = col_character(),
##   PHENO = col_double(),
##   CNT = col_double(),
##   CNT2 = col_double(),
##   SCORE = col_double()
## )
## Parsed with column specification:
## cols(
##   FID = col_double(),
##   IID = col_character(),
##   PHENO = col_double(),
##   CNT = col_double(),
##   CNT2 = col_double(),
##   SCORE = col_double()
## )
names(pgs_scores_raw) = pgs_files %>% str_replace(".profile$", "") %>% str_replace(".(tsv|csv|txt)$", "")

#add prefixes / prep merge
for (i in seq_along(pgs_scores_raw)) {
  #remove unwanted columns
  pgs_scores_raw[[i]] = pgs_scores_raw[[i]] %>% select(-FID, -PHENO)
  
  #fix names
  names(pgs_scores_raw[[i]])[-1] = names(pgs_scores_raw)[i] + "_" +names(pgs_scores_raw[[i]])[-1]
}

#as single data frame
pgs_scores = Reduce(x = pgs_scores_raw, f = full_join)
## Joining, by = "IID"
## Joining, by = "IID"
## Joining, by = "IID"
#Lahn snps
lahn_vcf = read_csv("data/lahn.csv")
## Parsed with column specification:
## cols(
##   id = col_character(),
##   rs3762271 = col_double()
## )

Read metadata

#read data
meta = read_tsv("data/v37.2.1240K.clean4.anno") %>% df_legalize_names()
## Parsed with column specification:
## cols(
##   .default = col_character(),
##   `SNPs hit on autosomes` = col_double()
## )
## See spec(...) for full column specifications.

Merge / recode

#fix ids
pgs_scores$IID %<>% std_ids()
lahn_vcf$id %<>% std_ids()
meta$Instance_ID %<>% std_ids()

#join
d = full_join(meta, pgs_scores, by = c("Instance_ID" = "IID")) %>% 
  full_join(lahn_vcf, by = c("Instance_ID" = "id"))

#ensure correct
assert_that(nrow(d) == nrow(meta))
## [1] TRUE
assert_that(!any(duplicated(d$Instance_ID)))
## [1] TRUE
#rename
d$ybp = d$Average_of_95_4pct_date_range_in_calBP_defined_as_1950_CE %>% as.numeric()
## Warning in function_list[[k]](value): NAs introduced by coercion
d$abs_lat = d$Lat %>% as.numeric() %>% abs()
## Warning in function_list[[i]](value): NAs introduced by coercion
d$long = d$Long %>% as.numeric()
## Warning in function_list[[k]](value): NAs introduced by coercion
d$coverage = d$Coverage %>% as.numeric()
## Warning in function_list[[k]](value): NAs introduced by coercion
d$snp_hits = d$SNPs_hit_on_autosomes %>% as.numeric()
d$data_source = d$Skeletal_element %>% plyr::mapvalues(from = c("Tooth", "Petrous"), to = c("tooth", "petrous")) %>% factor()
d$country = d$Country %>% factor()
#recode into broad geographical regions
d$ISO = d$country %>% as.character() %>% plyr::mapvalues(c("..", "Orkney Islands"), c(NA, "Scotland")) %>% pu_translate()
## No exact match: BotswanaOrNamibia
## No exact match: BotswanaOrNamibia
## No exact match: BotswanaOrNamibia
## No exact match: Western Sahara (Morocco)
## No exact match: BotswanaOrNamibia
## No exact match: Western Sahara (Morocco)
## Best fuzzy match found: BotswanaOrNamibia -> Botswana with distance 9.00
## Best fuzzy match found: BotswanaOrNamibia -> Botswana with distance 9.00
## Best fuzzy match found: BotswanaOrNamibia -> Botswana with distance 9.00
## Best fuzzy match found: Western Sahara (Morocco) -> Western Sahara with distance 10.00
## Best fuzzy match found: BotswanaOrNamibia -> Botswana with distance 9.00
## Best fuzzy match found: Western Sahara (Morocco) -> Western Sahara with distance 10.00
#other national data
mega = read_csv("data/Megadataset_v2.0p.csv")
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   X1 = col_character(),
##   LVnotes = col_character(),
##   Names = col_character(),
##   Capital = col_character(),
##   se_Land = col_character(),
##   se_abbrev = col_character(),
##   de_Country = col_character(),
##   de_abbrev = col_character(),
##   de_Country_EN = col_character(),
##   de_Western_neighbor = col_logical(),
##   de_NW_European = col_logical(),
##   dk_Origin = col_character(),
##   Country_EN = col_character()
## )
## See spec(...) for full column specifications.
#population data
worldpop = read_html("data/wiki_worldpop.html") %>% html_table() %>% .[[2]] %>% df_legalize_names()
worldpop %<>% mutate(
  population2017 = Population_1_July_2017_2 %>% str_replace_all(",", "") %>% as.integer(),
  pop = population2017,
  log_pop = log10(population2017),
  country = str_replace(Country_or_area, "\\[.*", ""),
  ISO = pu_translate(country)
) %>% na.omit()
## Warning in function_list[[k]](value): NAs introduced by coercion to integer
## range
## No exact match: World
## No exact match: Guernsey and  Jersey
## Best fuzzy match found: World -> world with distance 1.00
## Best fuzzy match found: Guernsey and  Jersey -> Guernsey (GBR) with distance 11.00
#merge regions into main
d = left_join(d, worldpop %>% select(ISO, UN_continental_region_1, UN_statistical_region_1))
## Joining, by = "ISO"
## Warning: Column `ISO` has different attributes on LHS and RHS of join
d$UN_continental_region_1 = d$UN_continental_region_1 %>% factor() %>% fct_relevel("Europe")

#subsets
#olds
d2 = d %>% filter(ybp > 150)
d2 %>% nrow()
## [1] 2081
#europeans
d2_euro = d2 %>% filter(UN_continental_region_1 == "Europe")

#near modern europeans
d2_NM_euro = d %>% filter(UN_continental_region_1 == "Europe", ybp < 5000, ybp > 100)

Results

Descriptives

nrow(d)
## [1] 5081
#age distribution
d$ybp %>% GG_denhist()
## Warning: Removed 1 rows containing non-finite values (stat_bin).
## Warning: Removed 1 rows containing non-finite values (stat_density).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1 rows containing non-finite values (stat_bin).

## Warning: Removed 1 rows containing non-finite values (stat_density).

d$ybp %>% describe()
#olds
d2$ybp %>% GG_denhist()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

d2$ybp %>% describe()
#regions
(regional_table = d2$UN_continental_region_1 %>% table2())

Simple plots

#plot all
ggplot(d, aes(ybp, CAVIARBF_supl_table_SCORE)) +
  geom_point() +
  geom_smooth() +
  theme_bw()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).

#plot -- no moderns
d2 %>% 
  ggplot(aes(ybp, CAVIARBF_supl_table_SCORE)) +
  geom_point() +
  geom_smooth() +
  theme_bw()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

#plot -- no moderns, Europe only
d2_euro %>% 
  ggplot(aes(ybp, CAVIARBF_supl_table_SCORE)) +
  geom_point() +
  geom_smooth() +
  theme_bw()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

#plot -- near moderns, Europe only
d2_NM_euro %>% 
  ggplot(aes(ybp, CAVIARBF_supl_table_SCORE)) +
  geom_point() +
  geom_smooth() +
  theme_bw()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Primary models

With causal variants PGS from Lee et al 2018.

#simple
(model_1 = ols(CAVIARBF_supl_table_SCORE ~ ybp, data = d2))
## Linear Regression Model
##  
##  ols(formula = CAVIARBF_supl_table_SCORE ~ ybp, data = d2)
##  
##                 Model Likelihood     Discrimination    
##                    Ratio Test           Indexes        
##  Obs    2081    LR chi2      9.33    R2       0.004    
##  sigma0.0006    d.f.            1    R2 adj   0.004    
##  d.f.   2079    Pr(> chi2) 0.0023    g        0.000    
##  
##  Residuals
##  
##         Min         1Q     Median         3Q        Max 
##  -2.662e-03 -3.266e-04 -9.835e-05  2.968e-04  3.186e-03 
##  
##  
##            Coef   S.E.   t     Pr(>|t|)
##  Intercept 0.0001 0.0000  5.48 <0.0001 
##  ybp       0.0000 0.0000 -3.06 0.0023  
## 
#latitude
(model_2 = ols(CAVIARBF_supl_table_SCORE ~ ybp + abs_lat, data = d2))
## Frequencies of Missing Values Due to Each Variable
## CAVIARBF_supl_table_SCORE                       ybp 
##                         0                         0 
##                   abs_lat 
##                       244 
## 
## Linear Regression Model
##  
##  ols(formula = CAVIARBF_supl_table_SCORE ~ ybp + abs_lat, data = d2)
##  
##  
##                 Model Likelihood     Discrimination    
##                    Ratio Test           Indexes        
##  Obs    1837    LR chi2     37.01    R2       0.020    
##  sigma0.0006    d.f.            2    R2 adj   0.019    
##  d.f.   1834    Pr(> chi2) 0.0000    g        0.000    
##  
##  Residuals
##  
##         Min         1Q     Median         3Q        Max 
##  -2.705e-03 -3.268e-04 -7.887e-05  3.018e-04  3.166e-03 
##  
##  
##            Coef    S.E.   t     Pr(>|t|)
##  Intercept -0.0002 0.0001 -3.37 0.0008  
##  ybp        0.0000 0.0000 -3.45 0.0006  
##  abs_lat    0.0000 0.0000  5.29 <0.0001 
## 
#continental dummies
(model_3 = ols(CAVIARBF_supl_table_SCORE ~ ybp + abs_lat + UN_continental_region_1, data = d2))
## Frequencies of Missing Values Due to Each Variable
## CAVIARBF_supl_table_SCORE                       ybp 
##                         0                         0 
##                   abs_lat   UN_continental_region_1 
##                       244                         9 
## 
## Linear Regression Model
##  
##  ols(formula = CAVIARBF_supl_table_SCORE ~ ybp + abs_lat + UN_continental_region_1, 
##      data = d2)
##  
##  
##                 Model Likelihood     Discrimination    
##                    Ratio Test           Indexes        
##  Obs    1832    LR chi2     71.47    R2       0.038    
##  sigma0.0006    d.f.            6    R2 adj   0.035    
##  d.f.   1825    Pr(> chi2) 0.0000    g        0.000    
##  
##  Residuals
##  
##         Min         1Q     Median         3Q        Max 
##  -2.556e-03 -3.315e-04 -8.013e-05  3.016e-04  3.105e-03 
##  
##  
##                                   Coef    S.E.   t     Pr(>|t|)
##  Intercept                         0.0003 0.0001  2.35 0.0190  
##  ybp                               0.0000 0.0000 -4.19 <0.0001 
##  abs_lat                           0.0000 0.0000 -0.72 0.4744  
##  UN_continental_region_1=Africa   -0.0003 0.0001 -2.11 0.0350  
##  UN_continental_region_1=Americas -0.0003 0.0001 -4.26 <0.0001 
##  UN_continental_region_1=Asia     -0.0002 0.0000 -3.77 0.0002  
##  UN_continental_region_1=Oceania  -0.0007 0.0001 -5.07 <0.0001 
## 
#quality
(model_4 = ols(CAVIARBF_supl_table_SCORE ~ ybp + abs_lat + UN_continental_region_1 + CAVIARBF_supl_table_CNT + rcs(coverage) + rcs(snp_hits) + data_source, data = d2))
## Frequencies of Missing Values Due to Each Variable
## CAVIARBF_supl_table_SCORE                       ybp 
##                         0                         0 
##                   abs_lat   UN_continental_region_1 
##                       244                         9 
##   CAVIARBF_supl_table_CNT                  coverage 
##                         0                         1 
##                  snp_hits               data_source 
##                         0                         0 
## 
## Linear Regression Model
##  
##  ols(formula = CAVIARBF_supl_table_SCORE ~ ybp + abs_lat + UN_continental_region_1 + 
##      CAVIARBF_supl_table_CNT + rcs(coverage) + rcs(snp_hits) + 
##      data_source, data = d2)
##  
##  
##                 Model Likelihood     Discrimination    
##                    Ratio Test           Indexes        
##  Obs    1832    LR chi2    159.50    R2       0.083    
##  sigma0.0006    d.f.           28    R2 adj   0.069    
##  d.f.   1803    Pr(> chi2) 0.0000    g        0.000    
##  
##  Residuals
##  
##         Min         1Q     Median         3Q        Max 
##  -2.637e-03 -3.128e-04 -3.003e-05  3.012e-04  3.114e-03 
##  
##  
##                                       Coef    S.E.   t     Pr(>|t|)
##  Intercept                             0.0001 0.0001  0.63 0.5293  
##  ybp                                   0.0000 0.0000 -3.90 <0.0001 
##  abs_lat                               0.0000 0.0000 -1.25 0.2102  
##  UN_continental_region_1=Africa       -0.0003 0.0001 -2.14 0.0325  
##  UN_continental_region_1=Americas     -0.0002 0.0001 -3.50 0.0005  
##  UN_continental_region_1=Asia         -0.0001 0.0001 -2.76 0.0059  
##  UN_continental_region_1=Oceania      -0.0007 0.0001 -5.21 <0.0001 
##  CAVIARBF_supl_table_CNT               0.0000 0.0000  1.06 0.2898  
##  coverage                             -0.0012 0.0016 -0.77 0.4405  
##  coverage'                             0.2625 0.3032  0.87 0.3867  
##  coverage''                           -0.3251 0.3766 -0.86 0.3881  
##  coverage'''                           0.0625 0.0745  0.84 0.4012  
##  snp_hits                              0.0000 0.0000  0.85 0.3971  
##  snp_hits'                             0.0000 0.0000 -0.69 0.4908  
##  snp_hits''                            0.0000 0.0000  0.57 0.5670  
##  snp_hits'''                           0.0000 0.0000  0.16 0.8741  
##  data_source=bone                     -0.0001 0.0001 -0.56 0.5756  
##  data_source=bone (cranial)            0.0001 0.0002  0.65 0.5140  
##  data_source=bone (long bone)          0.0002 0.0001  2.35 0.0190  
##  data_source=bone (phalanx)            0.0001 0.0002  0.66 0.5081  
##  data_source=Hair                     -0.0009 0.0006 -1.42 0.1564  
##  data_source=petrous                   0.0001 0.0000  1.20 0.2289  
##  data_source=Rib                       0.0002 0.0006  0.34 0.7356  
##  data_source=tooth                     0.0001 0.0000  1.86 0.0626  
##  data_source=tooth (canine)            0.0004 0.0003  1.38 0.1691  
##  data_source=tooth (incisor)          -0.0001 0.0004 -0.33 0.7450  
##  data_source=tooth (molar)             0.0001 0.0001  1.32 0.1887  
##  data_source=tooth (premolar)         -0.0001 0.0002 -0.26 0.7938  
##  data_source=tooth and bone (cranial)  0.0000 0.0006 -0.06 0.9516  
## 
#nonlinear age
(model_5 = ols(CAVIARBF_supl_table_SCORE ~ rcs(ybp, 4) + abs_lat + UN_continental_region_1 + CAVIARBF_supl_table_CNT + rcs(coverage) + rcs(snp_hits) + data_source, data = d2))
## Frequencies of Missing Values Due to Each Variable
## CAVIARBF_supl_table_SCORE                       ybp 
##                         0                         0 
##                   abs_lat   UN_continental_region_1 
##                       244                         9 
##   CAVIARBF_supl_table_CNT                  coverage 
##                         0                         1 
##                  snp_hits               data_source 
##                         0                         0 
## 
## Linear Regression Model
##  
##  ols(formula = CAVIARBF_supl_table_SCORE ~ rcs(ybp, 4) + abs_lat + 
##      UN_continental_region_1 + CAVIARBF_supl_table_CNT + rcs(coverage) + 
##      rcs(snp_hits) + data_source, data = d2)
##  
##  
##                 Model Likelihood     Discrimination    
##                    Ratio Test           Indexes        
##  Obs    1832    LR chi2    167.11    R2       0.087    
##  sigma0.0006    d.f.           30    R2 adj   0.072    
##  d.f.   1801    Pr(> chi2) 0.0000    g        0.000    
##  
##  Residuals
##  
##         Min         1Q     Median         3Q        Max 
##  -2.679e-03 -3.208e-04 -1.435e-05  3.067e-04  3.071e-03 
##  
##  
##                                       Coef    S.E.   t     Pr(>|t|)
##  Intercept                            -0.0001 0.0002 -0.44 0.6564  
##  ybp                                   0.0000 0.0000  2.02 0.0436  
##  ybp'                                  0.0000 0.0000 -2.71 0.0068  
##  ybp''                                 0.0000 0.0000  2.74 0.0063  
##  abs_lat                               0.0000 0.0000 -1.18 0.2396  
##  UN_continental_region_1=Africa       -0.0002 0.0001 -1.89 0.0587  
##  UN_continental_region_1=Americas     -0.0002 0.0001 -2.85 0.0045  
##  UN_continental_region_1=Asia         -0.0001 0.0001 -2.50 0.0124  
##  UN_continental_region_1=Oceania      -0.0007 0.0001 -4.93 <0.0001 
##  CAVIARBF_supl_table_CNT               0.0000 0.0000  1.02 0.3059  
##  coverage                             -0.0012 0.0016 -0.76 0.4486  
##  coverage'                             0.2522 0.3028  0.83 0.4051  
##  coverage''                           -0.3120 0.3762 -0.83 0.4071  
##  coverage'''                           0.0595 0.0744  0.80 0.4238  
##  snp_hits                              0.0000 0.0000  0.88 0.3816  
##  snp_hits'                             0.0000 0.0000 -0.72 0.4698  
##  snp_hits''                            0.0000 0.0000  0.61 0.5388  
##  snp_hits'''                           0.0000 0.0000  0.08 0.9377  
##  data_source=bone                     -0.0001 0.0001 -0.73 0.4632  
##  data_source=bone (cranial)            0.0001 0.0002  0.70 0.4837  
##  data_source=bone (long bone)          0.0002 0.0001  2.22 0.0266  
##  data_source=bone (phalanx)            0.0001 0.0002  0.63 0.5320  
##  data_source=Hair                     -0.0010 0.0006 -1.57 0.1175  
##  data_source=petrous                   0.0001 0.0001  1.13 0.2575  
##  data_source=Rib                       0.0001 0.0006  0.23 0.8155  
##  data_source=tooth                     0.0001 0.0001  1.44 0.1488  
##  data_source=tooth (canine)            0.0004 0.0003  1.37 0.1702  
##  data_source=tooth (incisor)          -0.0001 0.0004 -0.29 0.7727  
##  data_source=tooth (molar)             0.0001 0.0001  1.56 0.1189  
##  data_source=tooth (premolar)          0.0000 0.0002 -0.09 0.9272  
##  data_source=tooth and bone (cranial)  0.0000 0.0006 -0.03 0.9735  
## 
#slope interaction
(model_6 = ols(CAVIARBF_supl_table_SCORE ~ rcs(ybp, 4) * UN_continental_region_1 + abs_lat + CAVIARBF_supl_table_CNT + rcs(coverage) + rcs(snp_hits) + data_source, data = d2 %>% filter(UN_continental_region_1 %in% regional_table$Group[1:3])))
## Frequencies of Missing Values Due to Each Variable
## CAVIARBF_supl_table_SCORE                       ybp 
##                         0                         0 
##   UN_continental_region_1                   abs_lat 
##                         0                       223 
##   CAVIARBF_supl_table_CNT                  coverage 
##                         0                         1 
##                  snp_hits               data_source 
##                         0                         0 
## 
## Linear Regression Model
##  
##  ols(formula = CAVIARBF_supl_table_SCORE ~ rcs(ybp, 4) * UN_continental_region_1 + 
##      abs_lat + CAVIARBF_supl_table_CNT + rcs(coverage) + rcs(snp_hits) + 
##      data_source, data = d2 %>% filter(UN_continental_region_1 %in% 
##      regional_table$Group[1:3]))
##  
##  
##                 Model Likelihood     Discrimination    
##                    Ratio Test           Indexes        
##  Obs    1762    LR chi2    165.81    R2       0.090    
##  sigma0.0006    d.f.           34    R2 adj   0.072    
##  d.f.   1727    Pr(> chi2) 0.0000    g        0.000    
##  
##  Residuals
##  
##         Min         1Q     Median         3Q        Max 
##  -2.646e-03 -3.098e-04 -1.505e-05  3.033e-04  2.990e-03 
##  
##  
##                                           Coef    S.E.   t     Pr(>|t|)
##  Intercept                                 0.0000 0.0002  0.22 0.8283  
##  ybp                                       0.0000 0.0000  0.83 0.4062  
##  ybp'                                      0.0000 0.0000 -2.06 0.0397  
##  ybp''                                     0.0000 0.0000  2.28 0.0227  
##  UN_continental_region_1=Americas         -0.0002 0.0002 -1.09 0.2742  
##  UN_continental_region_1=Asia             -0.0006 0.0002 -3.22 0.0013  
##  abs_lat                                   0.0000 0.0000 -0.96 0.3386  
##  CAVIARBF_supl_table_CNT                   0.0000 0.0000  0.97 0.3344  
##  coverage                                 -0.0010 0.0019 -0.52 0.6026  
##  coverage'                                 0.1987 0.3179  0.63 0.5320  
##  coverage''                               -0.2474 0.3963 -0.62 0.5326  
##  coverage'''                               0.0484 0.0791  0.61 0.5406  
##  snp_hits                                  0.0000 0.0000  0.65 0.5141  
##  snp_hits'                                 0.0000 0.0000 -0.60 0.5510  
##  snp_hits''                                0.0000 0.0000  0.50 0.6136  
##  snp_hits'''                               0.0000 0.0000  0.18 0.8599  
##  data_source=bone                         -0.0001 0.0001 -0.99 0.3246  
##  data_source=bone (cranial)                0.0001 0.0002  0.87 0.3838  
##  data_source=bone (long bone)              0.0002 0.0001  1.98 0.0479  
##  data_source=bone (phalanx)                0.0000 0.0002  0.23 0.8217  
##  data_source=Hair                         -0.0010 0.0006 -1.54 0.1236  
##  data_source=petrous                       0.0000 0.0001 -0.21 0.8342  
##  data_source=Rib                           0.0002 0.0006  0.25 0.7994  
##  data_source=tooth                         0.0000 0.0001  0.76 0.4501  
##  data_source=tooth (canine)                0.0004 0.0003  1.28 0.2015  
##  data_source=tooth (incisor)              -0.0002 0.0004 -0.49 0.6255  
##  data_source=tooth (molar)                 0.0001 0.0001  1.11 0.2678  
##  data_source=tooth (premolar)             -0.0001 0.0002 -0.23 0.8153  
##  data_source=tooth and bone (cranial)      0.0000 0.0006 -0.03 0.9738  
##  ybp * UN_continental_region_1=Americas    0.0000 0.0000 -0.78 0.4362  
##  ybp' * UN_continental_region_1=Americas   0.0000 0.0000  1.38 0.1679  
##  ybp'' * UN_continental_region_1=Americas  0.0000 0.0000 -1.53 0.1263  
##  ybp * UN_continental_region_1=Asia        0.0000 0.0000  1.71 0.0873  
##  ybp' * UN_continental_region_1=Asia       0.0000 0.0000 -0.49 0.6228  
##  ybp'' * UN_continental_region_1=Asia      0.0000 0.0000  0.14 0.8850  
## 
#steps
lrtest(model_1, model_2)
## 
## Model 1: CAVIARBF_supl_table_SCORE ~ ybp
## Model 2: CAVIARBF_supl_table_SCORE ~ ybp + abs_lat
## 
## L.R. Chisq       d.f.          P 
##   2.77e+01   1.00e+00   1.43e-07
lrtest(model_2, model_3)
## 
## Model 1: CAVIARBF_supl_table_SCORE ~ ybp + abs_lat
## Model 2: CAVIARBF_supl_table_SCORE ~ ybp + abs_lat + UN_continental_region_1
## 
## L.R. Chisq       d.f.          P 
##   3.45e+01   4.00e+00   6.00e-07
lrtest(model_3, model_4)
## 
## Model 1: CAVIARBF_supl_table_SCORE ~ ybp + abs_lat + UN_continental_region_1
## Model 2: CAVIARBF_supl_table_SCORE ~ ybp + abs_lat + UN_continental_region_1 + 
##     CAVIARBF_supl_table_CNT + rcs(coverage) + rcs(snp_hits) + 
##     data_source
## 
## L.R. Chisq       d.f.          P 
##   8.80e+01   2.20e+01   7.41e-10
lrtest(model_4, model_5)
## 
## Model 1: CAVIARBF_supl_table_SCORE ~ ybp + abs_lat + UN_continental_region_1 + 
##     CAVIARBF_supl_table_CNT + rcs(coverage) + rcs(snp_hits) + 
##     data_source
## Model 2: CAVIARBF_supl_table_SCORE ~ rcs(ybp, 4) + abs_lat + UN_continental_region_1 + 
##     CAVIARBF_supl_table_CNT + rcs(coverage) + rcs(snp_hits) + 
##     data_source
## 
## L.R. Chisq       d.f.          P 
##     7.6100     2.0000     0.0223
lrtest(model_5, model_6)
## 
## Model 1: CAVIARBF_supl_table_SCORE ~ rcs(ybp, 4) + abs_lat + UN_continental_region_1 + 
##     CAVIARBF_supl_table_CNT + rcs(coverage) + rcs(snp_hits) + 
##     data_source
## Model 2: CAVIARBF_supl_table_SCORE ~ rcs(ybp, 4) * UN_continental_region_1 + 
##     abs_lat + CAVIARBF_supl_table_CNT + rcs(coverage) + rcs(snp_hits) + 
##     data_source
## 
## L.R. Chisq       d.f.          P 
##      1.302      4.000      0.861
#plot expected age pattern
model_preds = data.frame(
  ybp = 154:90000, #full range,
  abs_lat = d2$abs_lat %>% wtd_mean(), #mean covars
  long = d2$long %>% wtd_mean(), #mean covars
  CAVIARBF_supl_table_CNT = d2$CAVIARBF_supl_table_CNT %>% wtd_mean(), 
  coverage = d2$coverage %>% wtd_mean(), 
  snp_hits = d2$snp_hits %>% wtd_mean(), 
  data_source = "petrous", #mode
  UN_continental_region_1 = "Europe" #mode
)

model_preds$model_1 = predict(model_1, newdata = model_preds)
model_preds$model_2 = predict(model_2, newdata = model_preds)
model_preds$model_3 = predict(model_3, newdata = model_preds)
model_preds$model_4 = predict(model_4, newdata = model_preds)
model_preds$model_5 = predict(model_5, newdata = model_preds)
model_preds$model_6 = predict(model_6, newdata = model_preds)

#plot predictions
model_preds %>% 
  gather(key = model, value = prediction, starts_with("model_")) %>% 
  ggplot(aes(ybp, prediction, color = model)) +
  geom_line() +
  theme_bw()

#for each continent
#plot expected age pattern by continent
model_preds2 = data.frame(
  ybp = 154:90000, #full range,
  abs_lat = d2$abs_lat %>% wtd_mean(), #mean covars
  long = d2$long %>% wtd_mean(), #mean covars
  CAVIARBF_supl_table_CNT = d2$CAVIARBF_supl_table_CNT %>% wtd_mean(), 
  coverage = d2$coverage %>% wtd_mean(), 
  snp_hits = d2$snp_hits %>% wtd_mean(), 
  data_source = "petrous", #mode
  UN_continental_region_1 = rep(c("Europe", "Asia", "Americas"),
                                length.out = length(154:90000))
)

model_preds2$model_6 = predict(model_6, newdata = model_preds2)

#plot predictions
model_preds2 %>% 
  gather(key = model, value = prediction, starts_with("model_")) %>% 
  ggplot(aes(ybp, prediction, color = UN_continental_region_1)) +
  geom_line() +
  theme_bw()

The nonlinear slope * continent model was not significant, so redisregard the weird predictions of that.

Alternative polygenic score: MTAG education lead snps

Lead SNPs MTAG, same source.

#simple
(model_1 = ols(mtag_edu_lead_snps_gwas_SCORE ~ ybp, data = d2))
## Linear Regression Model
##  
##  ols(formula = mtag_edu_lead_snps_gwas_SCORE ~ ybp, data = d2)
##  
##                 Model Likelihood     Discrimination    
##                    Ratio Test           Indexes        
##  Obs    2081    LR chi2     60.92    R2       0.029    
##  sigma0.0002    d.f.            1    R2 adj   0.028    
##  d.f.   2079    Pr(> chi2) 0.0000    g        0.000    
##  
##  Residuals
##  
##         Min         1Q     Median         3Q        Max 
##  -8.943e-04 -8.423e-05 -6.122e-06  8.322e-05  7.214e-04 
##  
##  
##            Coef   S.E.   t     Pr(>|t|)
##  Intercept 0.0001 0.0000 12.22 <0.0001 
##  ybp       0.0000 0.0000 -7.86 <0.0001 
## 
#latitude
(model_2 = ols(mtag_edu_lead_snps_gwas_SCORE ~ ybp + abs_lat, data = d2))
## Frequencies of Missing Values Due to Each Variable
## mtag_edu_lead_snps_gwas_SCORE                           ybp 
##                             0                             0 
##                       abs_lat 
##                           244 
## 
## Linear Regression Model
##  
##  ols(formula = mtag_edu_lead_snps_gwas_SCORE ~ ybp + abs_lat, 
##      data = d2)
##  
##  
##                 Model Likelihood     Discrimination    
##                    Ratio Test           Indexes        
##  Obs    1837    LR chi2     70.86    R2       0.038    
##  sigma0.0002    d.f.            2    R2 adj   0.037    
##  d.f.   1834    Pr(> chi2) 0.0000    g        0.000    
##  
##  Residuals
##  
##         Min         1Q     Median         3Q        Max 
##  -7.106e-04 -8.574e-05 -4.582e-06  8.147e-05  7.081e-04 
##  
##  
##            Coef   S.E.   t     Pr(>|t|)
##  Intercept 0.0000 0.0000  0.85 0.3938  
##  ybp       0.0000 0.0000 -8.13 <0.0001 
##  abs_lat   0.0000 0.0000  3.05 0.0023  
## 
#continental dummies
(model_3 = ols(mtag_edu_lead_snps_gwas_SCORE ~ ybp + abs_lat + UN_continental_region_1, data = d2))
## Frequencies of Missing Values Due to Each Variable
## mtag_edu_lead_snps_gwas_SCORE                           ybp 
##                             0                             0 
##                       abs_lat       UN_continental_region_1 
##                           244                             9 
## 
## Linear Regression Model
##  
##  ols(formula = mtag_edu_lead_snps_gwas_SCORE ~ ybp + abs_lat + 
##      UN_continental_region_1, data = d2)
##  
##  
##                 Model Likelihood     Discrimination    
##                    Ratio Test           Indexes        
##  Obs    1832    LR chi2    100.89    R2       0.054    
##  sigma0.0002    d.f.            6    R2 adj   0.050    
##  d.f.   1825    Pr(> chi2) 0.0000    g        0.000    
##  
##  Residuals
##  
##         Min         1Q     Median         3Q        Max 
##  -7.097e-04 -8.663e-05 -5.698e-06  8.340e-05  7.137e-04 
##  
##  
##                                   Coef    S.E.   t     Pr(>|t|)
##  Intercept                         0.0000 0.0000  0.99 0.3242  
##  ybp                               0.0000 0.0000 -7.86 <0.0001 
##  abs_lat                           0.0000 0.0000  1.26 0.2067  
##  UN_continental_region_1=Africa   -0.0001 0.0000 -3.78 0.0002  
##  UN_continental_region_1=Americas  0.0000 0.0000  1.58 0.1136  
##  UN_continental_region_1=Asia      0.0000 0.0000  1.29 0.1983  
##  UN_continental_region_1=Oceania  -0.0001 0.0000 -1.47 0.1417  
## 
#quality
(model_4 = ols(mtag_edu_lead_snps_gwas_SCORE ~ ybp + abs_lat + UN_continental_region_1 + mtag_edu_lead_snps_gwas_CNT + rcs(coverage) + rcs(snp_hits) + data_source, data = d2))
## Frequencies of Missing Values Due to Each Variable
## mtag_edu_lead_snps_gwas_SCORE                           ybp 
##                             0                             0 
##                       abs_lat       UN_continental_region_1 
##                           244                             9 
##   mtag_edu_lead_snps_gwas_CNT                      coverage 
##                             0                             1 
##                      snp_hits                   data_source 
##                             0                             0 
## 
## Linear Regression Model
##  
##  ols(formula = mtag_edu_lead_snps_gwas_SCORE ~ ybp + abs_lat + 
##      UN_continental_region_1 + mtag_edu_lead_snps_gwas_CNT + rcs(coverage) + 
##      rcs(snp_hits) + data_source, data = d2)
##  
##  
##                 Model Likelihood     Discrimination    
##                    Ratio Test           Indexes        
##  Obs    1832    LR chi2    131.81    R2       0.069    
##  sigma0.0002    d.f.           28    R2 adj   0.055    
##  d.f.   1803    Pr(> chi2) 0.0000    g        0.000    
##  
##  Residuals
##  
##         Min         1Q     Median         3Q        Max 
##  -7.001e-04 -8.540e-05 -1.404e-06  8.660e-05  7.084e-04 
##  
##  
##                                       Coef    S.E.   t     Pr(>|t|)
##  Intercept                             0.0001 0.0000  1.40 0.1624  
##  ybp                                   0.0000 0.0000 -6.78 <0.0001 
##  abs_lat                               0.0000 0.0000  0.34 0.7329  
##  UN_continental_region_1=Africa       -0.0001 0.0000 -4.23 <0.0001 
##  UN_continental_region_1=Americas      0.0000 0.0000  1.04 0.2985  
##  UN_continental_region_1=Asia          0.0000 0.0000  0.56 0.5722  
##  UN_continental_region_1=Oceania      -0.0001 0.0000 -1.98 0.0480  
##  mtag_edu_lead_snps_gwas_CNT           0.0000 0.0000 -0.59 0.5546  
##  coverage                             -0.0004 0.0004 -0.90 0.3682  
##  coverage'                             0.0928 0.0809  1.15 0.2512  
##  coverage''                           -0.1166 0.1005 -1.16 0.2459  
##  coverage'''                           0.0245 0.0199  1.23 0.2183  
##  snp_hits                              0.0000 0.0000  1.12 0.2613  
##  snp_hits'                             0.0000 0.0000 -1.08 0.2820  
##  snp_hits''                            0.0000 0.0000  1.01 0.3127  
##  snp_hits'''                           0.0000 0.0000 -0.35 0.7301  
##  data_source=bone                      0.0000 0.0000 -0.35 0.7267  
##  data_source=bone (cranial)            0.0000 0.0000  0.89 0.3759  
##  data_source=bone (long bone)          0.0000 0.0000  1.14 0.2530  
##  data_source=bone (phalanx)            0.0000 0.0000 -0.44 0.6571  
##  data_source=Hair                      0.0004 0.0002  2.20 0.0280  
##  data_source=petrous                   0.0000 0.0000 -0.44 0.6566  
##  data_source=Rib                       0.0000 0.0002 -0.12 0.9017  
##  data_source=tooth                     0.0000 0.0000 -0.71 0.4798  
##  data_source=tooth (canine)           -0.0001 0.0001 -1.17 0.2425  
##  data_source=tooth (incisor)          -0.0003 0.0001 -3.13 0.0018  
##  data_source=tooth (molar)             0.0000 0.0000 -1.46 0.1435  
##  data_source=tooth (premolar)         -0.0001 0.0001 -1.51 0.1319  
##  data_source=tooth and bone (cranial) -0.0001 0.0002 -0.61 0.5413  
## 
#nonlinear age
(model_5 = ols(mtag_edu_lead_snps_gwas_SCORE ~ rcs(ybp, 4) + abs_lat + UN_continental_region_1 + mtag_edu_lead_snps_gwas_CNT + rcs(coverage) + rcs(snp_hits) + data_source, data = d2))
## Frequencies of Missing Values Due to Each Variable
## mtag_edu_lead_snps_gwas_SCORE                           ybp 
##                             0                             0 
##                       abs_lat       UN_continental_region_1 
##                           244                             9 
##   mtag_edu_lead_snps_gwas_CNT                      coverage 
##                             0                             1 
##                      snp_hits                   data_source 
##                             0                             0 
## 
## Linear Regression Model
##  
##  ols(formula = mtag_edu_lead_snps_gwas_SCORE ~ rcs(ybp, 4) + abs_lat + 
##      UN_continental_region_1 + mtag_edu_lead_snps_gwas_CNT + rcs(coverage) + 
##      rcs(snp_hits) + data_source, data = d2)
##  
##  
##                 Model Likelihood     Discrimination    
##                    Ratio Test           Indexes        
##  Obs    1832    LR chi2    173.16    R2       0.090    
##  sigma0.0002    d.f.           30    R2 adj   0.075    
##  d.f.   1801    Pr(> chi2) 0.0000    g        0.000    
##  
##  Residuals
##  
##         Min         1Q     Median         3Q        Max 
##  -6.674e-04 -8.373e-05 -2.669e-06  8.780e-05  6.822e-04 
##  
##  
##                                       Coef    S.E.   t     Pr(>|t|)
##  Intercept                             0.0000 0.0000  0.16 0.8700  
##  ybp                                   0.0000 0.0000  2.61 0.0092  
##  ybp'                                  0.0000 0.0000 -4.97 <0.0001 
##  ybp''                                 0.0000 0.0000  5.35 <0.0001 
##  abs_lat                               0.0000 0.0000  0.30 0.7653  
##  UN_continental_region_1=Africa       -0.0001 0.0000 -4.01 <0.0001 
##  UN_continental_region_1=Americas      0.0000 0.0000  1.45 0.1463  
##  UN_continental_region_1=Asia          0.0000 0.0000  0.90 0.3704  
##  UN_continental_region_1=Oceania      -0.0001 0.0000 -2.42 0.0156  
##  mtag_edu_lead_snps_gwas_CNT           0.0000 0.0000 -0.62 0.5332  
##  coverage                             -0.0004 0.0004 -0.97 0.3327  
##  coverage'                             0.0939 0.0800  1.17 0.2409  
##  coverage''                           -0.1176 0.0994 -1.18 0.2369  
##  coverage'''                           0.0243 0.0197  1.23 0.2170  
##  snp_hits                              0.0000 0.0000  1.27 0.2048  
##  snp_hits'                             0.0000 0.0000 -1.19 0.2335  
##  snp_hits''                            0.0000 0.0000  1.12 0.2619  
##  snp_hits'''                           0.0000 0.0000 -0.42 0.6774  
##  data_source=bone                      0.0000 0.0000 -0.24 0.8083  
##  data_source=bone (cranial)            0.0000 0.0000  1.16 0.2447  
##  data_source=bone (long bone)          0.0000 0.0000  1.55 0.1217  
##  data_source=bone (phalanx)            0.0000 0.0000 -0.57 0.5715  
##  data_source=Hair                      0.0003 0.0002  2.10 0.0359  
##  data_source=petrous                   0.0000 0.0000  0.57 0.5657  
##  data_source=Rib                       0.0000 0.0002 -0.18 0.8561  
##  data_source=tooth                     0.0000 0.0000 -0.34 0.7321  
##  data_source=tooth (canine)           -0.0001 0.0001 -0.91 0.3606  
##  data_source=tooth (incisor)          -0.0003 0.0001 -2.87 0.0041  
##  data_source=tooth (molar)             0.0000 0.0000  0.26 0.7923  
##  data_source=tooth (premolar)          0.0000 0.0001 -0.84 0.4021  
##  data_source=tooth and bone (cranial) -0.0001 0.0002 -0.43 0.6655  
## 
#slope interaction
(model_6 = ols(mtag_edu_lead_snps_gwas_SCORE ~ rcs(ybp, 4) * UN_continental_region_1 + abs_lat + mtag_edu_lead_snps_gwas_CNT + rcs(coverage) + rcs(snp_hits) + data_source, data = d2 %>% filter(UN_continental_region_1 %in% regional_table$Group[1:3])))
## Frequencies of Missing Values Due to Each Variable
## mtag_edu_lead_snps_gwas_SCORE                           ybp 
##                             0                             0 
##       UN_continental_region_1                       abs_lat 
##                             0                           223 
##   mtag_edu_lead_snps_gwas_CNT                      coverage 
##                             0                             1 
##                      snp_hits                   data_source 
##                             0                             0 
## 
## Linear Regression Model
##  
##  ols(formula = mtag_edu_lead_snps_gwas_SCORE ~ rcs(ybp, 4) * UN_continental_region_1 + 
##      abs_lat + mtag_edu_lead_snps_gwas_CNT + rcs(coverage) + rcs(snp_hits) + 
##      data_source, data = d2 %>% filter(UN_continental_region_1 %in% 
##      regional_table$Group[1:3]))
##  
##  
##                 Model Likelihood     Discrimination    
##                    Ratio Test           Indexes        
##  Obs    1762    LR chi2    182.22    R2       0.098    
##  sigma0.0002    d.f.           34    R2 adj   0.080    
##  d.f.   1727    Pr(> chi2) 0.0000    g        0.000    
##  
##  Residuals
##  
##         Min         1Q     Median         3Q        Max 
##  -6.505e-04 -8.482e-05 -4.012e-06  9.009e-05  6.642e-04 
##  
##  
##                                           Coef    S.E.   t     Pr(>|t|)
##  Intercept                                 0.0001 0.0000  1.34 0.1808  
##  ybp                                       0.0000 0.0000  0.96 0.3360  
##  ybp'                                      0.0000 0.0000 -4.10 <0.0001 
##  ybp''                                     0.0000 0.0000  4.78 <0.0001 
##  UN_continental_region_1=Americas          0.0000 0.0000 -1.11 0.2688  
##  UN_continental_region_1=Asia              0.0000 0.0000  0.26 0.7981  
##  abs_lat                                   0.0000 0.0000 -0.19 0.8520  
##  mtag_edu_lead_snps_gwas_CNT               0.0000 0.0000 -0.57 0.5661  
##  coverage                                 -0.0003 0.0005 -0.70 0.4863  
##  coverage'                                 0.0749 0.0830  0.90 0.3675  
##  coverage''                               -0.0944 0.1035 -0.91 0.3619  
##  coverage'''                               0.0200 0.0207  0.97 0.3322  
##  snp_hits                                  0.0000 0.0000  0.95 0.3422  
##  snp_hits'                                 0.0000 0.0000 -0.95 0.3402  
##  snp_hits''                                0.0000 0.0000  0.92 0.3597  
##  snp_hits'''                               0.0000 0.0000 -0.36 0.7195  
##  data_source=bone                          0.0000 0.0000 -0.39 0.6997  
##  data_source=bone (cranial)                0.0001 0.0000  1.75 0.0799  
##  data_source=bone (long bone)              0.0001 0.0000  2.45 0.0142  
##  data_source=bone (phalanx)                0.0000 0.0000 -0.77 0.4425  
##  data_source=Hair                          0.0004 0.0002  2.16 0.0313  
##  data_source=petrous                       0.0000 0.0000  0.11 0.9100  
##  data_source=Rib                           0.0000 0.0002 -0.12 0.9051  
##  data_source=tooth                         0.0000 0.0000  0.28 0.7786  
##  data_source=tooth (canine)               -0.0001 0.0001 -0.78 0.4382  
##  data_source=tooth (incisor)              -0.0003 0.0001 -2.91 0.0037  
##  data_source=tooth (molar)                 0.0000 0.0000  1.12 0.2638  
##  data_source=tooth (premolar)              0.0000 0.0001 -0.56 0.5764  
##  data_source=tooth and bone (cranial)     -0.0001 0.0002 -0.33 0.7434  
##  ybp * UN_continental_region_1=Americas    0.0000 0.0000  0.68 0.4937  
##  ybp' * UN_continental_region_1=Americas   0.0000 0.0000 -0.02 0.9840  
##  ybp'' * UN_continental_region_1=Americas  0.0000 0.0000  0.04 0.9704  
##  ybp * UN_continental_region_1=Asia        0.0000 0.0000 -0.91 0.3624  
##  ybp' * UN_continental_region_1=Asia       0.0000 0.0000  1.52 0.1297  
##  ybp'' * UN_continental_region_1=Asia      0.0000 0.0000 -1.46 0.1457  
## 
#steps
lrtest(model_1, model_2)
## 
## Model 1: mtag_edu_lead_snps_gwas_SCORE ~ ybp
## Model 2: mtag_edu_lead_snps_gwas_SCORE ~ ybp + abs_lat
## 
## L.R. Chisq       d.f.          P 
##    9.93230    1.00000    0.00162
lrtest(model_2, model_3)
## 
## Model 1: mtag_edu_lead_snps_gwas_SCORE ~ ybp + abs_lat
## Model 2: mtag_edu_lead_snps_gwas_SCORE ~ ybp + abs_lat + UN_continental_region_1
## 
## L.R. Chisq       d.f.          P 
##   3.00e+01   4.00e+00   4.83e-06
lrtest(model_3, model_4)
## 
## Model 1: mtag_edu_lead_snps_gwas_SCORE ~ ybp + abs_lat + UN_continental_region_1
## Model 2: mtag_edu_lead_snps_gwas_SCORE ~ ybp + abs_lat + UN_continental_region_1 + 
##     mtag_edu_lead_snps_gwas_CNT + rcs(coverage) + rcs(snp_hits) + 
##     data_source
## 
## L.R. Chisq       d.f.          P 
##    30.9243    22.0000     0.0977
lrtest(model_4, model_5)
## 
## Model 1: mtag_edu_lead_snps_gwas_SCORE ~ ybp + abs_lat + UN_continental_region_1 + 
##     mtag_edu_lead_snps_gwas_CNT + rcs(coverage) + rcs(snp_hits) + 
##     data_source
## Model 2: mtag_edu_lead_snps_gwas_SCORE ~ rcs(ybp, 4) + abs_lat + UN_continental_region_1 + 
##     mtag_edu_lead_snps_gwas_CNT + rcs(coverage) + rcs(snp_hits) + 
##     data_source
## 
## L.R. Chisq       d.f.          P 
##   4.13e+01   2.00e+00   1.05e-09
lrtest(model_5, model_6)
## 
## Model 1: mtag_edu_lead_snps_gwas_SCORE ~ rcs(ybp, 4) + abs_lat + UN_continental_region_1 + 
##     mtag_edu_lead_snps_gwas_CNT + rcs(coverage) + rcs(snp_hits) + 
##     data_source
## Model 2: mtag_edu_lead_snps_gwas_SCORE ~ rcs(ybp, 4) * UN_continental_region_1 + 
##     abs_lat + mtag_edu_lead_snps_gwas_CNT + rcs(coverage) + rcs(snp_hits) + 
##     data_source
## 
## L.R. Chisq       d.f.          P 
##     9.0637     4.0000     0.0595
#plot expected age pattern
model_preds = data.frame(
  ybp = 154:90000, #full range,
  abs_lat = d2$abs_lat %>% wtd_mean(), #mean covars
  long = d2$long %>% wtd_mean(), #mean covars
  mtag_edu_lead_snps_gwas_CNT = d2$mtag_edu_lead_snps_gwas_CNT %>% wtd_mean(), 
  coverage = d2$coverage %>% wtd_mean(), 
  snp_hits = d2$snp_hits %>% wtd_mean(), 
  data_source = "petrous", #mode
  UN_continental_region_1 = "Europe" #mode
)

model_preds$model_1 = predict(model_1, newdata = model_preds)
model_preds$model_2 = predict(model_2, newdata = model_preds)
model_preds$model_3 = predict(model_3, newdata = model_preds)
model_preds$model_4 = predict(model_4, newdata = model_preds)
model_preds$model_5 = predict(model_5, newdata = model_preds)
model_preds$model_6 = predict(model_6, newdata = model_preds)

#plot predictions
model_preds %>% 
  gather(key = model, value = prediction, starts_with("model_")) %>% 
  ggplot(aes(ybp, prediction, color = model)) +
  geom_line() +
  theme_bw()

#for each continent
#plot expected age pattern by continent
model_preds2 = data.frame(
  ybp = 154:90000, #full range,
  abs_lat = d2$abs_lat %>% wtd_mean(), #mean covars
  long = d2$long %>% wtd_mean(), #mean covars
  mtag_edu_lead_snps_gwas_CNT = d2$mtag_edu_lead_snps_gwas_CNT %>% wtd_mean(), 
  coverage = d2$coverage %>% wtd_mean(), 
  snp_hits = d2$snp_hits %>% wtd_mean(), 
  data_source = "petrous", #mode
  UN_continental_region_1 = rep(c("Europe", "Asia", "Americas"),
                                length.out = length(154:90000))
)

model_preds2$model_6 = predict(model_6, newdata = model_preds2)

#plot predictions
model_preds2 %>% 
  gather(key = model, value = prediction, starts_with("model_")) %>% 
  ggplot(aes(ybp, prediction, color = UN_continental_region_1)) +
  geom_line() +
  theme_bw()

Europeans only

#simple
(model_1 = ols(CAVIARBF_supl_table_SCORE ~ ybp, data = d2_euro))
## Linear Regression Model
##  
##  ols(formula = CAVIARBF_supl_table_SCORE ~ ybp, data = d2_euro)
##  
##                 Model Likelihood     Discrimination    
##                    Ratio Test           Indexes        
##  Obs    1487    LR chi2     22.39    R2       0.015    
##  sigma0.0006    d.f.            1    R2 adj   0.014    
##  d.f.   1485    Pr(> chi2) 0.0000    g        0.000    
##  
##  Residuals
##  
##         Min         1Q     Median         3Q        Max 
##  -0.0023969 -0.0003313 -0.0001163  0.0002944  0.0031026 
##  
##  
##            Coef   S.E.   t     Pr(>|t|)
##  Intercept 0.0002 0.0000  8.94 <0.0001 
##  ybp       0.0000 0.0000 -4.75 <0.0001 
## 
#latitude
(model_2 = ols(CAVIARBF_supl_table_SCORE ~ ybp + abs_lat, data = d2_euro))
## Frequencies of Missing Values Due to Each Variable
## CAVIARBF_supl_table_SCORE                       ybp 
##                         0                         0 
##                   abs_lat 
##                       173 
## 
## Linear Regression Model
##  
##  ols(formula = CAVIARBF_supl_table_SCORE ~ ybp + abs_lat, data = d2_euro)
##  
##  
##                 Model Likelihood     Discrimination    
##                    Ratio Test           Indexes        
##  Obs    1314    LR chi2     20.33    R2       0.015    
##  sigma0.0006    d.f.            2    R2 adj   0.014    
##  d.f.   1311    Pr(> chi2) 0.0000    g        0.000    
##  
##  Residuals
##  
##         Min         1Q     Median         3Q        Max 
##  -0.0023899 -0.0003348 -0.0001147  0.0003025  0.0031084 
##  
##  
##            Coef   S.E.   t     Pr(>|t|)
##  Intercept 0.0001 0.0002  0.75 0.4563  
##  ybp       0.0000 0.0000 -4.49 <0.0001 
##  abs_lat   0.0000 0.0000  0.41 0.6855  
## 
#continental dummies
(model_3 = ols(CAVIARBF_supl_table_SCORE ~ ybp + abs_lat + long, data = d2_euro))
## Frequencies of Missing Values Due to Each Variable
## CAVIARBF_supl_table_SCORE                       ybp 
##                         0                         0 
##                   abs_lat                      long 
##                       173                       175 
## 
## Linear Regression Model
##  
##  ols(formula = CAVIARBF_supl_table_SCORE ~ ybp + abs_lat + long, 
##      data = d2_euro)
##  
##  
##                 Model Likelihood     Discrimination    
##                    Ratio Test           Indexes        
##  Obs    1312    LR chi2     23.17    R2       0.018    
##  sigma0.0006    d.f.            3    R2 adj   0.015    
##  d.f.   1308    Pr(> chi2) 0.0000    g        0.000    
##  
##  Residuals
##  
##         Min         1Q     Median         3Q        Max 
##  -0.0023854 -0.0003366 -0.0001112  0.0003086  0.0031245 
##  
##  
##            Coef   S.E.   t     Pr(>|t|)
##  Intercept 0.0001 0.0002  0.65 0.5168  
##  ybp       0.0000 0.0000 -4.20 <0.0001 
##  abs_lat   0.0000 0.0000  0.62 0.5377  
##  long      0.0000 0.0000 -1.71 0.0880  
## 
#quality
(model_4 = ols(CAVIARBF_supl_table_SCORE ~ ybp + abs_lat + long + CAVIARBF_supl_table_CNT + rcs(coverage) + rcs(snp_hits) + data_source, data = d2_euro))
## Frequencies of Missing Values Due to Each Variable
## CAVIARBF_supl_table_SCORE                       ybp 
##                         0                         0 
##                   abs_lat                      long 
##                       173                       175 
##   CAVIARBF_supl_table_CNT                  coverage 
##                         0                         0 
##                  snp_hits               data_source 
##                         0                         0 
## 
## Linear Regression Model
##  
##  ols(formula = CAVIARBF_supl_table_SCORE ~ ybp + abs_lat + long + 
##      CAVIARBF_supl_table_CNT + rcs(coverage) + rcs(snp_hits) + 
##      data_source, data = d2_euro)
##  
##  
##                 Model Likelihood     Discrimination    
##                    Ratio Test           Indexes        
##  Obs    1312    LR chi2    117.46    R2       0.086    
##  sigma0.0006    d.f.           23    R2 adj   0.069    
##  d.f.   1288    Pr(> chi2) 0.0000    g        0.000    
##  
##  Residuals
##  
##         Min         1Q     Median         3Q        Max 
##  -2.575e-03 -3.140e-04 -1.855e-05  2.951e-04  2.919e-03 
##  
##  
##                                       Coef    S.E.   t     Pr(>|t|)
##  Intercept                             0.0001 0.0002  0.41 0.6787  
##  ybp                                   0.0000 0.0000 -4.32 <0.0001 
##  abs_lat                               0.0000 0.0000 -0.75 0.4529  
##  long                                  0.0000 0.0000 -2.01 0.0451  
##  CAVIARBF_supl_table_CNT               0.0000 0.0000  0.63 0.5301  
##  coverage                             -0.0035 0.0021 -1.64 0.1016  
##  coverage'                             0.5190 0.2939  1.77 0.0776  
##  coverage''                           -0.6431 0.3636 -1.77 0.0772  
##  coverage'''                           0.1249 0.0703  1.78 0.0757  
##  snp_hits                              0.0000 0.0000  1.81 0.0702  
##  snp_hits'                             0.0000 0.0000 -1.55 0.1221  
##  snp_hits''                            0.0000 0.0000  1.38 0.1687  
##  snp_hits'''                           0.0000 0.0000  0.14 0.8892  
##  data_source=bone                     -0.0001 0.0001 -0.81 0.4199  
##  data_source=bone (cranial)            0.0002 0.0002  1.34 0.1808  
##  data_source=bone (long bone)          0.0002 0.0001  1.82 0.0688  
##  data_source=bone (phalanx)            0.0001 0.0002  0.45 0.6527  
##  data_source=petrous                  -0.0001 0.0001 -1.03 0.3012  
##  data_source=tooth                     0.0000 0.0001  0.63 0.5290  
##  data_source=tooth (canine)            0.0003 0.0004  0.91 0.3646  
##  data_source=tooth (incisor)          -0.0001 0.0004 -0.33 0.7436  
##  data_source=tooth (molar)             0.0000 0.0001  0.29 0.7750  
##  data_source=tooth (premolar)         -0.0002 0.0003 -0.84 0.4019  
##  data_source=tooth and bone (cranial)  0.0000 0.0006 -0.03 0.9775  
## 
#nonlinear age
(model_5 = ols(CAVIARBF_supl_table_SCORE ~ rcs(ybp, 4) + abs_lat + long + CAVIARBF_supl_table_CNT + rcs(coverage) + rcs(snp_hits) + data_source, data = d2_euro))
## Frequencies of Missing Values Due to Each Variable
## CAVIARBF_supl_table_SCORE                       ybp 
##                         0                         0 
##                   abs_lat                      long 
##                       173                       175 
##   CAVIARBF_supl_table_CNT                  coverage 
##                         0                         0 
##                  snp_hits               data_source 
##                         0                         0 
## 
## Linear Regression Model
##  
##  ols(formula = CAVIARBF_supl_table_SCORE ~ rcs(ybp, 4) + abs_lat + 
##      long + CAVIARBF_supl_table_CNT + rcs(coverage) + rcs(snp_hits) + 
##      data_source, data = d2_euro)
##  
##  
##                 Model Likelihood     Discrimination    
##                    Ratio Test           Indexes        
##  Obs    1312    LR chi2    126.10    R2       0.092    
##  sigma0.0006    d.f.           25    R2 adj   0.074    
##  d.f.   1286    Pr(> chi2) 0.0000    g        0.000    
##  
##  Residuals
##  
##         Min         1Q     Median         3Q        Max 
##  -2.562e-03 -3.205e-04 -2.609e-05  3.014e-04  2.876e-03 
##  
##  
##                                       Coef    S.E.   t     Pr(>|t|)
##  Intercept                             0.0001 0.0002  0.31 0.7537  
##  ybp                                   0.0000 0.0000  0.67 0.5051  
##  ybp'                                  0.0000 0.0000 -2.02 0.0441  
##  ybp''                                 0.0000 0.0000  2.25 0.0248  
##  abs_lat                               0.0000 0.0000 -1.05 0.2924  
##  long                                  0.0000 0.0000 -1.81 0.0712  
##  CAVIARBF_supl_table_CNT               0.0000 0.0000  0.56 0.5747  
##  coverage                             -0.0036 0.0021 -1.70 0.0900  
##  coverage'                             0.5306 0.2932  1.81 0.0706  
##  coverage''                           -0.6570 0.3628 -1.81 0.0704  
##  coverage'''                           0.1271 0.0701  1.81 0.0702  
##  snp_hits                              0.0000 0.0000  1.90 0.0580  
##  snp_hits'                             0.0000 0.0000 -1.63 0.1033  
##  snp_hits''                            0.0000 0.0000  1.47 0.1430  
##  snp_hits'''                           0.0000 0.0000  0.04 0.9662  
##  data_source=bone                     -0.0001 0.0001 -0.72 0.4735  
##  data_source=bone (cranial)            0.0003 0.0002  1.55 0.1216  
##  data_source=bone (long bone)          0.0002 0.0001  1.98 0.0482  
##  data_source=bone (phalanx)            0.0001 0.0002  0.43 0.6669  
##  data_source=petrous                   0.0000 0.0001 -0.70 0.4845  
##  data_source=tooth                     0.0000 0.0001  0.70 0.4842  
##  data_source=tooth (canine)            0.0004 0.0004  1.06 0.2885  
##  data_source=tooth (incisor)          -0.0001 0.0004 -0.26 0.7917  
##  data_source=tooth (molar)             0.0001 0.0001  1.01 0.3146  
##  data_source=tooth (premolar)         -0.0001 0.0003 -0.48 0.6292  
##  data_source=tooth and bone (cranial)  0.0000 0.0006  0.06 0.9523  
## 
(model_5b = ols(CAVIARBF_supl_table_SCORE ~ rcs(ybp, 3) + abs_lat + long + CAVIARBF_supl_table_CNT + rcs(coverage) + rcs(snp_hits) + data_source, data = d2_euro))
## Frequencies of Missing Values Due to Each Variable
## CAVIARBF_supl_table_SCORE                       ybp 
##                         0                         0 
##                   abs_lat                      long 
##                       173                       175 
##   CAVIARBF_supl_table_CNT                  coverage 
##                         0                         0 
##                  snp_hits               data_source 
##                         0                         0 
## 
## Linear Regression Model
##  
##  ols(formula = CAVIARBF_supl_table_SCORE ~ rcs(ybp, 3) + abs_lat + 
##      long + CAVIARBF_supl_table_CNT + rcs(coverage) + rcs(snp_hits) + 
##      data_source, data = d2_euro)
##  
##  
##                 Model Likelihood     Discrimination    
##                    Ratio Test           Indexes        
##  Obs    1312    LR chi2    119.31    R2       0.087    
##  sigma0.0006    d.f.           24    R2 adj   0.070    
##  d.f.   1287    Pr(> chi2) 0.0000    g        0.000    
##  
##  Residuals
##  
##         Min         1Q     Median         3Q        Max 
##  -2.562e-03 -3.145e-04 -1.742e-05  2.939e-04  2.929e-03 
##  
##  
##                                       Coef    S.E.   t     Pr(>|t|)
##  Intercept                             0.0002 0.0002  0.84 0.4014  
##  ybp                                   0.0000 0.0000 -1.97 0.0489  
##  ybp'                                  0.0000 0.0000  1.35 0.1783  
##  abs_lat                               0.0000 0.0000 -0.85 0.3959  
##  long                                  0.0000 0.0000 -1.99 0.0464  
##  CAVIARBF_supl_table_CNT               0.0000 0.0000  0.56 0.5754  
##  coverage                             -0.0035 0.0021 -1.62 0.1053  
##  coverage'                             0.5138 0.2938  1.75 0.0806  
##  coverage''                           -0.6366 0.3635 -1.75 0.0802  
##  coverage'''                           0.1235 0.0703  1.76 0.0791  
##  snp_hits                              0.0000 0.0000  1.80 0.0717  
##  snp_hits'                             0.0000 0.0000 -1.53 0.1268  
##  snp_hits''                            0.0000 0.0000  1.35 0.1761  
##  snp_hits'''                           0.0000 0.0000  0.17 0.8629  
##  data_source=bone                      0.0000 0.0001 -0.44 0.6634  
##  data_source=bone (cranial)            0.0003 0.0002  1.51 0.1306  
##  data_source=bone (long bone)          0.0002 0.0001  2.14 0.0322  
##  data_source=bone (phalanx)            0.0001 0.0002  0.59 0.5538  
##  data_source=petrous                   0.0000 0.0001 -0.57 0.5664  
##  data_source=tooth                     0.0001 0.0001  1.15 0.2522  
##  data_source=tooth (canine)            0.0004 0.0004  1.03 0.3025  
##  data_source=tooth (incisor)          -0.0001 0.0004 -0.24 0.8134  
##  data_source=tooth (molar)             0.0001 0.0001  0.81 0.4164  
##  data_source=tooth (premolar)         -0.0002 0.0003 -0.64 0.5202  
##  data_source=tooth and bone (cranial)  0.0000 0.0006  0.06 0.9558  
## 
(model_5c = ols(CAVIARBF_supl_table_SCORE ~ rcs(ybp, 5) + abs_lat + long + CAVIARBF_supl_table_CNT + rcs(coverage) + rcs(snp_hits) + data_source, data = d2_euro))
## Frequencies of Missing Values Due to Each Variable
## CAVIARBF_supl_table_SCORE                       ybp 
##                         0                         0 
##                   abs_lat                      long 
##                       173                       175 
##   CAVIARBF_supl_table_CNT                  coverage 
##                         0                         0 
##                  snp_hits               data_source 
##                         0                         0 
## 
## Linear Regression Model
##  
##  ols(formula = CAVIARBF_supl_table_SCORE ~ rcs(ybp, 5) + abs_lat + 
##      long + CAVIARBF_supl_table_CNT + rcs(coverage) + rcs(snp_hits) + 
##      data_source, data = d2_euro)
##  
##  
##                 Model Likelihood     Discrimination    
##                    Ratio Test           Indexes        
##  Obs    1312    LR chi2    129.51    R2       0.094    
##  sigma0.0006    d.f.           26    R2 adj   0.076    
##  d.f.   1285    Pr(> chi2) 0.0000    g        0.000    
##  
##  Residuals
##  
##         Min         1Q     Median         3Q        Max 
##  -2.543e-03 -3.156e-04 -2.376e-05  3.074e-04  2.819e-03 
##  
##  
##                                       Coef    S.E.   t     Pr(>|t|)
##  Intercept                            -0.0001 0.0002 -0.30 0.7637  
##  ybp                                   0.0000 0.0000  1.82 0.0689  
##  ybp'                                  0.0000 0.0000 -2.35 0.0187  
##  ybp''                                 0.0000 0.0000  2.03 0.0427  
##  ybp'''                                0.0000 0.0000 -1.94 0.0530  
##  abs_lat                               0.0000 0.0000 -0.97 0.3343  
##  long                                  0.0000 0.0000 -2.08 0.0380  
##  CAVIARBF_supl_table_CNT               0.0000 0.0000  0.51 0.6103  
##  coverage                             -0.0037 0.0021 -1.72 0.0859  
##  coverage'                             0.5345 0.2930  1.82 0.0683  
##  coverage''                           -0.6618 0.3625 -1.83 0.0682  
##  coverage'''                           0.1279 0.0701  1.83 0.0682  
##  snp_hits                              0.0000 0.0000  1.93 0.0543  
##  snp_hits'                             0.0000 0.0000 -1.65 0.0983  
##  snp_hits''                            0.0000 0.0000  1.49 0.1354  
##  snp_hits'''                           0.0000 0.0000 -0.01 0.9919  
##  data_source=bone                     -0.0001 0.0001 -0.72 0.4727  
##  data_source=bone (cranial)            0.0003 0.0002  1.53 0.1268  
##  data_source=bone (long bone)          0.0002 0.0001  1.95 0.0517  
##  data_source=bone (phalanx)            0.0001 0.0002  0.43 0.6690  
##  data_source=petrous                   0.0000 0.0001 -0.69 0.4890  
##  data_source=tooth                     0.0000 0.0001  0.73 0.4633  
##  data_source=tooth (canine)            0.0004 0.0004  1.05 0.2959  
##  data_source=tooth (incisor)          -0.0001 0.0004 -0.29 0.7754  
##  data_source=tooth (molar)             0.0001 0.0001  1.01 0.3120  
##  data_source=tooth (premolar)         -0.0001 0.0003 -0.53 0.5956  
##  data_source=tooth and bone (cranial)  0.0001 0.0006  0.12 0.9055  
## 
#steps
lrtest(model_1, model_2)
## 
## Model 1: CAVIARBF_supl_table_SCORE ~ ybp
## Model 2: CAVIARBF_supl_table_SCORE ~ ybp + abs_lat
## 
## L.R. Chisq       d.f.          P 
##      2.064      1.000      0.151
lrtest(model_2, model_3)
## 
## Model 1: CAVIARBF_supl_table_SCORE ~ ybp + abs_lat
## Model 2: CAVIARBF_supl_table_SCORE ~ ybp + abs_lat + long
## 
## L.R. Chisq       d.f.          P 
##     2.8374     1.0000     0.0921
lrtest(model_3, model_4)
## 
## Model 1: CAVIARBF_supl_table_SCORE ~ ybp + abs_lat + long
## Model 2: CAVIARBF_supl_table_SCORE ~ ybp + abs_lat + long + CAVIARBF_supl_table_CNT + 
##     rcs(coverage) + rcs(snp_hits) + data_source
## 
## L.R. Chisq       d.f.          P 
##   9.43e+01   2.00e+01   1.30e-11
lrtest(model_4, model_5)
## 
## Model 1: CAVIARBF_supl_table_SCORE ~ ybp + abs_lat + long + CAVIARBF_supl_table_CNT + 
##     rcs(coverage) + rcs(snp_hits) + data_source
## Model 2: CAVIARBF_supl_table_SCORE ~ rcs(ybp, 4) + abs_lat + long + CAVIARBF_supl_table_CNT + 
##     rcs(coverage) + rcs(snp_hits) + data_source
## 
## L.R. Chisq       d.f.          P 
##     8.6422     2.0000     0.0133
#plot expected age pattern
model_preds = data.frame(
  ybp = 154:90000, #full range,
  abs_lat = d2_euro$abs_lat %>% wtd_mean(), #mean covars
  long = d2_euro$long %>% wtd_mean(), #mean covars
  CAVIARBF_supl_table_CNT = d2_euro$CAVIARBF_supl_table_CNT %>% wtd_mean(), 
  coverage = d2_euro$coverage %>% wtd_mean(), 
  snp_hits = d2_euro$snp_hits %>% wtd_mean(), 
  data_source = "petrous", #mode
  UN_continental_region_1 = "Europe" #mode
)

model_preds$model_1 = predict(model_1, newdata = model_preds)
model_preds$model_2 = predict(model_2, newdata = model_preds)
model_preds$model_3 = predict(model_3, newdata = model_preds)
model_preds$model_4 = predict(model_4, newdata = model_preds)
model_preds$model_5 = predict(model_5, newdata = model_preds)
model_preds$model_5b = predict(model_5b, newdata = model_preds)
model_preds$model_5c = predict(model_5c, newdata = model_preds)

#plot predictions
model_preds %>% 
  gather(key = model, value = prediction, starts_with("model_")) %>% 
  ggplot(aes(ybp, prediction, color = model)) +
  geom_line() +
  theme_bw()

So the shape of the function varies depending on the number of knots used for the spline. Tricky.

Near-moden Europeans only

#simple
(model_1 = ols(CAVIARBF_supl_table_SCORE ~ ybp, data = d2_NM_euro))
## Linear Regression Model
##  
##  ols(formula = CAVIARBF_supl_table_SCORE ~ ybp, data = d2_NM_euro)
##  
##                 Model Likelihood     Discrimination    
##                    Ratio Test           Indexes        
##  Obs     889    LR chi2      0.14    R2       0.000    
##  sigma0.0007    d.f.            1    R2 adj  -0.001    
##  d.f.    887    Pr(> chi2) 0.7084    g        0.000    
##  
##  Residuals
##  
##         Min         1Q     Median         3Q        Max 
##  -0.0023940 -0.0003477 -0.0001420  0.0003343  0.0031047 
##  
##  
##            Coef   S.E.   t    Pr(>|t|)
##  Intercept 0.0001 0.0001 2.25 0.0250  
##  ybp       0.0000 0.0000 0.37 0.7088  
## 
#latitude
(model_2 = ols(CAVIARBF_supl_table_SCORE ~ ybp + abs_lat, data = d2_NM_euro))
## Frequencies of Missing Values Due to Each Variable
## CAVIARBF_supl_table_SCORE                       ybp 
##                         0                         0 
##                   abs_lat 
##                       130 
## 
## Linear Regression Model
##  
##  ols(formula = CAVIARBF_supl_table_SCORE ~ ybp + abs_lat, data = d2_NM_euro)
##  
##  
##                 Model Likelihood     Discrimination    
##                    Ratio Test           Indexes        
##  Obs     759    LR chi2      0.49    R2       0.001    
##  sigma0.0007    d.f.            2    R2 adj  -0.002    
##  d.f.    756    Pr(> chi2) 0.7814    g        0.000    
##  
##  Residuals
##  
##         Min         1Q     Median         3Q        Max 
##  -0.0023770 -0.0003565 -0.0001416  0.0003745  0.0030829 
##  
##  
##            Coef   S.E.   t     Pr(>|t|)
##  Intercept 0.0002 0.0003  0.86 0.3884  
##  ybp       0.0000 0.0000 -0.69 0.4884  
##  abs_lat   0.0000 0.0000  0.03 0.9747  
## 
#continental dummies
(model_3 = ols(CAVIARBF_supl_table_SCORE ~ ybp + abs_lat + long, data = d2_NM_euro))
## Frequencies of Missing Values Due to Each Variable
## CAVIARBF_supl_table_SCORE                       ybp 
##                         0                         0 
##                   abs_lat                      long 
##                       130                       132 
## 
## Linear Regression Model
##  
##  ols(formula = CAVIARBF_supl_table_SCORE ~ ybp + abs_lat + long, 
##      data = d2_NM_euro)
##  
##  
##                 Model Likelihood     Discrimination    
##                    Ratio Test           Indexes        
##  Obs     757    LR chi2      4.85    R2       0.006    
##  sigma0.0007    d.f.            3    R2 adj   0.002    
##  d.f.    753    Pr(> chi2) 0.1834    g        0.000    
##  
##  Residuals
##  
##         Min         1Q     Median         3Q        Max 
##  -0.0023793 -0.0003443 -0.0001395  0.0003485  0.0031042 
##  
##  
##            Coef   S.E.   t     Pr(>|t|)
##  Intercept 0.0002 0.0003  0.74 0.4617  
##  ybp       0.0000 0.0000 -0.89 0.3716  
##  abs_lat   0.0000 0.0000  0.38 0.7028  
##  long      0.0000 0.0000 -2.05 0.0412  
## 
#quality
(model_4 = ols(CAVIARBF_supl_table_SCORE ~ ybp + abs_lat + long + CAVIARBF_supl_table_CNT + rcs(coverage) + rcs(snp_hits) + data_source, data = d2_NM_euro))
## Frequencies of Missing Values Due to Each Variable
## CAVIARBF_supl_table_SCORE                       ybp 
##                         0                         0 
##                   abs_lat                      long 
##                       130                       132 
##   CAVIARBF_supl_table_CNT                  coverage 
##                         0                         0 
##                  snp_hits               data_source 
##                         0                         0 
## 
## Linear Regression Model
##  
##  ols(formula = CAVIARBF_supl_table_SCORE ~ ybp + abs_lat + long + 
##      CAVIARBF_supl_table_CNT + rcs(coverage) + rcs(snp_hits) + 
##      data_source, data = d2_NM_euro)
##  
##  
##                 Model Likelihood     Discrimination    
##                    Ratio Test           Indexes        
##  Obs     757    LR chi2     98.01    R2       0.121    
##  sigma0.0006    d.f.           21    R2 adj   0.096    
##  d.f.    735    Pr(> chi2) 0.0000    g        0.000    
##  
##  Residuals
##  
##         Min         1Q     Median         3Q        Max 
##  -2.426e-03 -3.315e-04 -9.681e-06  3.035e-04  2.968e-03 
##  
##  
##                               Coef    S.E.   t     Pr(>|t|)
##  Intercept                     0.0001 0.0003  0.44 0.6607  
##  ybp                           0.0000 0.0000  0.09 0.9318  
##  abs_lat                       0.0000 0.0000 -0.70 0.4849  
##  long                          0.0000 0.0000 -2.35 0.0192  
##  CAVIARBF_supl_table_CNT       0.0000 0.0000  0.64 0.5230  
##  coverage                     -0.0037 0.0033 -1.11 0.2694  
##  coverage'                     0.6875 0.5214  1.32 0.1877  
##  coverage''                   -0.8819 0.6621 -1.33 0.1833  
##  coverage'''                   0.1992 0.1421  1.40 0.1615  
##  snp_hits                      0.0000 0.0000  1.23 0.2201  
##  snp_hits'                     0.0000 0.0000 -1.34 0.1811  
##  snp_hits''                    0.0000 0.0000  1.34 0.1809  
##  snp_hits'''                   0.0000 0.0000 -1.16 0.2453  
##  data_source=bone             -0.0002 0.0001 -1.58 0.1137  
##  data_source=bone (cranial)    0.0009 0.0006  1.49 0.1379  
##  data_source=bone (long bone)  0.0002 0.0001  1.39 0.1652  
##  data_source=bone (phalanx)   -0.0002 0.0003 -0.57 0.5721  
##  data_source=petrous          -0.0001 0.0001 -1.43 0.1525  
##  data_source=tooth             0.0000 0.0001 -0.46 0.6440  
##  data_source=tooth (canine)    0.0008 0.0006  1.27 0.2033  
##  data_source=tooth (incisor)  -0.0001 0.0006 -0.10 0.9178  
##  data_source=tooth (molar)    -0.0002 0.0002 -1.13 0.2586  
## 
#nonlinear age
(model_5 = ols(CAVIARBF_supl_table_SCORE ~ rcs(ybp, 4) + abs_lat + long + CAVIARBF_supl_table_CNT + rcs(coverage) + rcs(snp_hits) + data_source, data = d2_NM_euro))
## Frequencies of Missing Values Due to Each Variable
## CAVIARBF_supl_table_SCORE                       ybp 
##                         0                         0 
##                   abs_lat                      long 
##                       130                       132 
##   CAVIARBF_supl_table_CNT                  coverage 
##                         0                         0 
##                  snp_hits               data_source 
##                         0                         0 
## 
## Linear Regression Model
##  
##  ols(formula = CAVIARBF_supl_table_SCORE ~ rcs(ybp, 4) + abs_lat + 
##      long + CAVIARBF_supl_table_CNT + rcs(coverage) + rcs(snp_hits) + 
##      data_source, data = d2_NM_euro)
##  
##  
##                 Model Likelihood     Discrimination    
##                    Ratio Test           Indexes        
##  Obs     757    LR chi2    104.50    R2       0.129    
##  sigma0.0006    d.f.           23    R2 adj   0.102    
##  d.f.    733    Pr(> chi2) 0.0000    g        0.000    
##  
##  Residuals
##  
##         Min         1Q     Median         3Q        Max 
##  -2.421e-03 -3.283e-04 -1.485e-05  3.067e-04  2.839e-03 
##  
##  
##                               Coef    S.E.   t     Pr(>|t|)
##  Intercept                    -0.0001 0.0003 -0.38 0.7039  
##  ybp                           0.0000 0.0000  1.90 0.0580  
##  ybp'                          0.0000 0.0000 -1.55 0.1219  
##  ybp''                         0.0000 0.0000  0.51 0.6136  
##  abs_lat                       0.0000 0.0000 -0.79 0.4326  
##  long                          0.0000 0.0000 -2.64 0.0084  
##  CAVIARBF_supl_table_CNT       0.0000 0.0000  0.63 0.5271  
##  coverage                     -0.0040 0.0033 -1.19 0.2343  
##  coverage'                     0.7199 0.5206  1.38 0.1671  
##  coverage''                   -0.9223 0.6611 -1.40 0.1634  
##  coverage'''                   0.2068 0.1419  1.46 0.1453  
##  snp_hits                      0.0000 0.0000  1.32 0.1870  
##  snp_hits'                     0.0000 0.0000 -1.43 0.1532  
##  snp_hits''                    0.0000 0.0000  1.44 0.1510  
##  snp_hits'''                   0.0000 0.0000 -1.28 0.2010  
##  data_source=bone             -0.0002 0.0001 -1.70 0.0900  
##  data_source=bone (cranial)    0.0008 0.0006  1.32 0.1883  
##  data_source=bone (long bone)  0.0002 0.0001  1.37 0.1705  
##  data_source=bone (phalanx)   -0.0002 0.0003 -0.67 0.5024  
##  data_source=petrous          -0.0001 0.0001 -1.56 0.1199  
##  data_source=tooth             0.0000 0.0001 -0.56 0.5750  
##  data_source=tooth (canine)    0.0008 0.0006  1.29 0.1963  
##  data_source=tooth (incisor)  -0.0001 0.0006 -0.12 0.9059  
##  data_source=tooth (molar)    -0.0002 0.0002 -1.11 0.2689  
## 
(model_5b = ols(CAVIARBF_supl_table_SCORE ~ rcs(ybp, 3) + abs_lat + long + CAVIARBF_supl_table_CNT + rcs(coverage) + rcs(snp_hits) + data_source, data = d2_NM_euro))
## Frequencies of Missing Values Due to Each Variable
## CAVIARBF_supl_table_SCORE                       ybp 
##                         0                         0 
##                   abs_lat                      long 
##                       130                       132 
##   CAVIARBF_supl_table_CNT                  coverage 
##                         0                         0 
##                  snp_hits               data_source 
##                         0                         0 
## 
## Linear Regression Model
##  
##  ols(formula = CAVIARBF_supl_table_SCORE ~ rcs(ybp, 3) + abs_lat + 
##      long + CAVIARBF_supl_table_CNT + rcs(coverage) + rcs(snp_hits) + 
##      data_source, data = d2_NM_euro)
##  
##  
##                 Model Likelihood     Discrimination    
##                    Ratio Test           Indexes        
##  Obs     757    LR chi2    104.19    R2       0.129    
##  sigma0.0006    d.f.           22    R2 adj   0.102    
##  d.f.    734    Pr(> chi2) 0.0000    g        0.000    
##  
##  Residuals
##  
##         Min         1Q     Median         3Q        Max 
##  -2.422e-03 -3.284e-04 -1.245e-05  3.039e-04  2.857e-03 
##  
##  
##                               Coef    S.E.   t     Pr(>|t|)
##  Intercept                     0.0000 0.0003 -0.17 0.8687  
##  ybp                           0.0000 0.0000  2.21 0.0277  
##  ybp'                          0.0000 0.0000 -2.45 0.0144  
##  abs_lat                       0.0000 0.0000 -0.83 0.4049  
##  long                          0.0000 0.0000 -2.60 0.0096  
##  CAVIARBF_supl_table_CNT       0.0000 0.0000  0.61 0.5404  
##  coverage                     -0.0039 0.0033 -1.17 0.2443  
##  coverage'                     0.7063 0.5197  1.36 0.1745  
##  coverage''                   -0.9052 0.6600 -1.37 0.1706  
##  coverage'''                   0.2033 0.1416  1.44 0.1515  
##  snp_hits                      0.0000 0.0000  1.30 0.1951  
##  snp_hits'                     0.0000 0.0000 -1.41 0.1599  
##  snp_hits''                    0.0000 0.0000  1.42 0.1572  
##  snp_hits'''                   0.0000 0.0000 -1.27 0.2033  
##  data_source=bone             -0.0002 0.0001 -1.75 0.0804  
##  data_source=bone (cranial)    0.0009 0.0006  1.34 0.1794  
##  data_source=bone (long bone)  0.0002 0.0001  1.39 0.1645  
##  data_source=bone (phalanx)   -0.0002 0.0003 -0.70 0.4844  
##  data_source=petrous          -0.0001 0.0001 -1.61 0.1083  
##  data_source=tooth            -0.0001 0.0001 -0.65 0.5153  
##  data_source=tooth (canine)    0.0008 0.0006  1.28 0.2025  
##  data_source=tooth (incisor)  -0.0001 0.0006 -0.14 0.8869  
##  data_source=tooth (molar)    -0.0002 0.0002 -1.16 0.2470  
## 
(model_5c = ols(CAVIARBF_supl_table_SCORE ~ rcs(ybp, 5) + abs_lat + long + CAVIARBF_supl_table_CNT + rcs(coverage) + rcs(snp_hits) + data_source, data = d2_NM_euro))
## Frequencies of Missing Values Due to Each Variable
## CAVIARBF_supl_table_SCORE                       ybp 
##                         0                         0 
##                   abs_lat                      long 
##                       130                       132 
##   CAVIARBF_supl_table_CNT                  coverage 
##                         0                         0 
##                  snp_hits               data_source 
##                         0                         0 
## 
## Linear Regression Model
##  
##  ols(formula = CAVIARBF_supl_table_SCORE ~ rcs(ybp, 5) + abs_lat + 
##      long + CAVIARBF_supl_table_CNT + rcs(coverage) + rcs(snp_hits) + 
##      data_source, data = d2_NM_euro)
##  
##  
##                 Model Likelihood     Discrimination    
##                    Ratio Test           Indexes        
##  Obs     757    LR chi2    104.60    R2       0.129    
##  sigma0.0006    d.f.           24    R2 adj   0.100    
##  d.f.    732    Pr(> chi2) 0.0000    g        0.000    
##  
##  Residuals
##  
##         Min         1Q     Median         3Q        Max 
##  -0.0024292 -0.0003276 -0.0000175  0.0003047  0.0028262 
##  
##  
##                               Coef    S.E.   t     Pr(>|t|)
##  Intercept                    -0.0002 0.0003 -0.48 0.6336  
##  ybp                           0.0000 0.0000  1.58 0.1148  
##  ybp'                          0.0000 0.0000 -0.96 0.3368  
##  ybp''                         0.0000 0.0000  0.39 0.6963  
##  ybp'''                        0.0000 0.0000 -0.12 0.9006  
##  abs_lat                       0.0000 0.0000 -0.72 0.4693  
##  long                          0.0000 0.0000 -2.66 0.0081  
##  CAVIARBF_supl_table_CNT       0.0000 0.0000  0.64 0.5211  
##  coverage                     -0.0040 0.0033 -1.19 0.2342  
##  coverage'                     0.7215 0.5210  1.38 0.1665  
##  coverage''                   -0.9244 0.6616 -1.40 0.1628  
##  coverage'''                   0.2074 0.1420  1.46 0.1446  
##  snp_hits                      0.0000 0.0000  1.32 0.1868  
##  snp_hits'                     0.0000 0.0000 -1.44 0.1514  
##  snp_hits''                    0.0000 0.0000  1.44 0.1492  
##  snp_hits'''                   0.0000 0.0000 -1.28 0.1995  
##  data_source=bone             -0.0002 0.0001 -1.67 0.0959  
##  data_source=bone (cranial)    0.0008 0.0006  1.31 0.1891  
##  data_source=bone (long bone)  0.0002 0.0001  1.40 0.1631  
##  data_source=bone (phalanx)   -0.0002 0.0003 -0.67 0.5023  
##  data_source=petrous          -0.0001 0.0001 -1.55 0.1220  
##  data_source=tooth             0.0000 0.0001 -0.56 0.5759  
##  data_source=tooth (canine)    0.0008 0.0006  1.29 0.1985  
##  data_source=tooth (incisor)  -0.0001 0.0006 -0.12 0.9051  
##  data_source=tooth (molar)    -0.0002 0.0002 -1.11 0.2687  
## 
#steps
lrtest(model_1, model_2)
## 
## Model 1: CAVIARBF_supl_table_SCORE ~ ybp
## Model 2: CAVIARBF_supl_table_SCORE ~ ybp + abs_lat
## 
## L.R. Chisq       d.f.          P 
##      0.353      1.000      0.552
lrtest(model_2, model_3)
## 
## Model 1: CAVIARBF_supl_table_SCORE ~ ybp + abs_lat
## Model 2: CAVIARBF_supl_table_SCORE ~ ybp + abs_lat + long
## 
## L.R. Chisq       d.f.          P 
##      4.353      1.000      0.037
lrtest(model_3, model_4)
## 
## Model 1: CAVIARBF_supl_table_SCORE ~ ybp + abs_lat + long
## Model 2: CAVIARBF_supl_table_SCORE ~ ybp + abs_lat + long + CAVIARBF_supl_table_CNT + 
##     rcs(coverage) + rcs(snp_hits) + data_source
## 
## L.R. Chisq       d.f.          P 
##   9.32e+01   1.80e+01   3.89e-12
lrtest(model_4, model_5)
## 
## Model 1: CAVIARBF_supl_table_SCORE ~ ybp + abs_lat + long + CAVIARBF_supl_table_CNT + 
##     rcs(coverage) + rcs(snp_hits) + data_source
## Model 2: CAVIARBF_supl_table_SCORE ~ rcs(ybp, 4) + abs_lat + long + CAVIARBF_supl_table_CNT + 
##     rcs(coverage) + rcs(snp_hits) + data_source
## 
## L.R. Chisq       d.f.          P 
##      6.487      2.000      0.039
#plot expected age pattern
model_preds = data.frame(
  ybp = 0:5000, #full range,
  abs_lat = d2_euro$abs_lat %>% wtd_mean(), #mean covars
  long = d2_euro$long %>% wtd_mean(), #mean covars
  CAVIARBF_supl_table_CNT = d2_euro$CAVIARBF_supl_table_CNT %>% wtd_mean(), 
  coverage = d2_euro$coverage %>% wtd_mean(), 
  snp_hits = d2_euro$snp_hits %>% wtd_mean(), 
  data_source = "petrous", #mode
  UN_continental_region_1 = "Europe" #mode
)

model_preds$model_1 = predict(model_1, newdata = model_preds)
model_preds$model_2 = predict(model_2, newdata = model_preds)
model_preds$model_3 = predict(model_3, newdata = model_preds)
model_preds$model_4 = predict(model_4, newdata = model_preds)
model_preds$model_5 = predict(model_5, newdata = model_preds)
model_preds$model_5b = predict(model_5b, newdata = model_preds)
model_preds$model_5c = predict(model_5c, newdata = model_preds)

#plot predictions
model_preds %>% 
  gather(key = model, value = prediction, starts_with("model_")) %>% 
  ggplot(aes(ybp, prediction, color = model)) +
  geom_line() +
  theme_bw()

#plot

Polygenic score correlations

#all data
wtd.cors(d %>% select(ends_with("SCORE")))
##                               CAVIARBF_supl_table_SCORE
## CAVIARBF_supl_table_SCORE                       1.00000
## GWAS_EA_excl23andMe_SCORE                      -0.00928
## MTAG_EA.to10K_SCORE                             0.15447
## mtag_edu_lead_snps_gwas_SCORE                   0.20368
##                               GWAS_EA_excl23andMe_SCORE
## CAVIARBF_supl_table_SCORE                      -0.00928
## GWAS_EA_excl23andMe_SCORE                       1.00000
## MTAG_EA.to10K_SCORE                             0.57631
## mtag_edu_lead_snps_gwas_SCORE                   0.59114
##                               MTAG_EA.to10K_SCORE
## CAVIARBF_supl_table_SCORE                   0.154
## GWAS_EA_excl23andMe_SCORE                   0.576
## MTAG_EA.to10K_SCORE                         1.000
## mtag_edu_lead_snps_gwas_SCORE               0.713
##                               mtag_edu_lead_snps_gwas_SCORE
## CAVIARBF_supl_table_SCORE                             0.204
## GWAS_EA_excl23andMe_SCORE                             0.591
## MTAG_EA.to10K_SCORE                                   0.713
## mtag_edu_lead_snps_gwas_SCORE                         1.000
#old data
wtd.cors(d2 %>% select(ends_with("SCORE")))
##                               CAVIARBF_supl_table_SCORE
## CAVIARBF_supl_table_SCORE                        1.0000
## GWAS_EA_excl23andMe_SCORE                        0.0668
## MTAG_EA.to10K_SCORE                              0.1871
## mtag_edu_lead_snps_gwas_SCORE                    0.2564
##                               GWAS_EA_excl23andMe_SCORE
## CAVIARBF_supl_table_SCORE                        0.0668
## GWAS_EA_excl23andMe_SCORE                        1.0000
## MTAG_EA.to10K_SCORE                              0.4409
## mtag_edu_lead_snps_gwas_SCORE                    0.4038
##                               MTAG_EA.to10K_SCORE
## CAVIARBF_supl_table_SCORE                   0.187
## GWAS_EA_excl23andMe_SCORE                   0.441
## MTAG_EA.to10K_SCORE                         1.000
## mtag_edu_lead_snps_gwas_SCORE               0.646
##                               mtag_edu_lead_snps_gwas_SCORE
## CAVIARBF_supl_table_SCORE                             0.256
## GWAS_EA_excl23andMe_SCORE                             0.404
## MTAG_EA.to10K_SCORE                                   0.646
## mtag_edu_lead_snps_gwas_SCORE                         1.000
#new data
wtd.cors(d %>% filter(ybp <= 150) %>% select(ends_with("SCORE")))
##                               CAVIARBF_supl_table_SCORE
## CAVIARBF_supl_table_SCORE                        1.0000
## GWAS_EA_excl23andMe_SCORE                       -0.0145
## MTAG_EA.to10K_SCORE                              0.1532
## mtag_edu_lead_snps_gwas_SCORE                    0.1873
##                               GWAS_EA_excl23andMe_SCORE
## CAVIARBF_supl_table_SCORE                       -0.0145
## GWAS_EA_excl23andMe_SCORE                        1.0000
## MTAG_EA.to10K_SCORE                              0.6051
## mtag_edu_lead_snps_gwas_SCORE                    0.6273
##                               MTAG_EA.to10K_SCORE
## CAVIARBF_supl_table_SCORE                   0.153
## GWAS_EA_excl23andMe_SCORE                   0.605
## MTAG_EA.to10K_SCORE                         1.000
## mtag_edu_lead_snps_gwas_SCORE               0.731
##                               mtag_edu_lead_snps_gwas_SCORE
## CAVIARBF_supl_table_SCORE                             0.187
## GWAS_EA_excl23andMe_SCORE                             0.627
## MTAG_EA.to10K_SCORE                                   0.731
## mtag_edu_lead_snps_gwas_SCORE                         1.000

Meta / Export

#save data for reuse
write_tsv(d, "data/merged_data.tsv", na="")

write_sessioninfo()
## R version 3.6.1 (2019-07-05)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Linux Mint 19.1
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] rvest_0.3.4        xml2_1.2.2         rms_5.1-3.1       
##  [4] SparseM_1.77       kirkegaard_2018.05 metafor_2.1-0     
##  [7] Matrix_1.2-17      psych_1.8.12       magrittr_1.5      
## [10] assertthat_0.2.1   weights_1.0        mice_3.6.0        
## [13] gdata_2.18.0       Hmisc_4.2-0        Formula_1.2-3     
## [16] survival_2.44-1.1  lattice_0.20-38    forcats_0.4.0     
## [19] stringr_1.4.0      dplyr_0.8.3        purrr_0.3.2       
## [22] readr_1.3.1        tidyr_0.8.3        tibble_2.1.3      
## [25] ggplot2_3.2.1      tidyverse_1.2.1    pacman_0.5.1      
## 
## loaded via a namespace (and not attached):
##  [1] nlme_3.1-141        lubridate_1.7.4     RColorBrewer_1.1-2 
##  [4] httr_1.4.1          tools_3.6.1         backports_1.1.4    
##  [7] R6_2.4.0            rpart_4.1-15        mgcv_1.8-28        
## [10] lazyeval_0.2.2      colorspace_1.4-1    jomo_2.6-9         
## [13] nnet_7.3-12         withr_2.1.2         tidyselect_0.2.5   
## [16] gridExtra_2.3       mnormt_1.5-5        compiler_3.6.1     
## [19] cli_1.1.0           quantreg_5.51       htmlTable_1.13.1   
## [22] sandwich_2.5-1      labeling_0.3        scales_1.0.0       
## [25] checkmate_1.9.4     mvtnorm_1.0-11      polspline_1.1.15   
## [28] digest_0.6.20       foreign_0.8-72      minqa_1.2.4        
## [31] rmarkdown_1.14      stringdist_0.9.5.2  base64enc_0.1-3    
## [34] pkgconfig_2.0.2     htmltools_0.3.6     lme4_1.1-21        
## [37] htmlwidgets_1.3     rlang_0.4.0         readxl_1.3.1       
## [40] rstudioapi_0.10     generics_0.0.2      zoo_1.8-6          
## [43] jsonlite_1.6        gtools_3.8.1        acepack_1.4.1      
## [46] Rcpp_1.0.2          munsell_0.5.0       stringi_1.4.3      
## [49] multcomp_1.4-10     yaml_2.2.0          MASS_7.3-51.4      
## [52] plyr_1.8.4          grid_3.6.1          parallel_3.6.1     
## [55] mitml_0.3-7         crayon_1.3.4        haven_2.1.1        
## [58] splines_3.6.1       hms_0.5.0           zeallot_0.1.0      
## [61] knitr_1.24          pillar_1.4.2        boot_1.3-23        
## [64] codetools_0.2-16    pan_1.6             glue_1.3.1         
## [67] evaluate_0.14       latticeExtra_0.6-28 data.table_1.12.2  
## [70] modelr_0.1.5        selectr_0.4-1       vctrs_0.2.0        
## [73] nloptr_1.2.1        MatrixModels_0.4-1  cellranger_1.1.0   
## [76] gtable_0.3.0        xfun_0.8            broom_0.5.2        
## [79] cluster_2.1.0       TH.data_1.0-10