Frequencies of Corona virus genetic variants by 1000 genomes populations

Init

options(digits = 2)
library(pacman)
p_load(kirkegaard, DT)

Data

#vcf of snps
tg_snps = read_vcf("data/tg_snps.vcf")

#tg meta
tg_meta = read_tsv(str_glue("data/tg_samples.tsv")) %>% df_legalize_names()
## Parsed with column specification:
## cols(
##   `Sample name` = col_character(),
##   Sex = col_character(),
##   `Biosample ID` = col_character(),
##   `Population code` = col_character(),
##   `Population name` = col_character(),
##   `Superpopulation code` = col_character(),
##   `Superpopulation name` = col_character(),
##   `Data collections` = col_character()
## )
tg_meta %<>% filter(Sample_name %in% tg_snps$id)

#join
d = full_join(tg_snps, tg_meta, by = c("id" = "Sample_name"))
## Warning: Column `id`/`Sample_name` has different attributes on LHS and RHS of
## join

Frequency by group

freqs_superpops = plyr::ddply(d, "Superpopulation_name", function(dd) {
  #frequency of each snp
  snps = names(dd) %>% str_subset("^rs")
  snp_freqs = map(snps, ~wtd_mean(dd[[.]] / 2))
  names(snp_freqs) = snps
  
  #data frame
  snp_freqs %>% as.list %>% as_tibble
})

freqs_superpops %>% df_round(3)
freqs_pops = plyr::ddply(d, "Population_name", function(dd) {
  #frequency of each snp
  snps = names(dd) %>% str_subset("^rs")
  snp_freqs = map(snps, ~wtd_mean(dd[[.]] / 2))
  names(snp_freqs) = snps
  
  #data frame
  snp_freqs %>% as.list %>% as_tibble
})

freqs_pops %>% df_round(3)

Meta

write_tsv(freqs_pops, "data/out/freqs_pops.tsv")
write_tsv(freqs_superpops, "data/out/freqs_superpops.tsv")

write_sessioninfo()
## R version 3.6.2 (2019-12-12)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Linux Mint 19.3
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] DT_0.12            kirkegaard_2018.05 metafor_2.1-0      Matrix_1.2-18     
##  [5] psych_1.9.12.31    magrittr_1.5       assertthat_0.2.1   weights_1.0.1     
##  [9] mice_3.8.0         gdata_2.18.0       Hmisc_4.3-1        Formula_1.2-3     
## [13] survival_3.1-8     lattice_0.20-38    forcats_0.4.0      stringr_1.4.0     
## [17] dplyr_0.8.4        purrr_0.3.3        readr_1.3.1        tidyr_1.0.2       
## [21] tibble_2.1.3       ggplot2_3.2.1      tidyverse_1.3.0    pacman_0.5.1      
## 
## loaded via a namespace (and not attached):
##  [1] nlme_3.1-144        fs_1.3.1            lubridate_1.7.4    
##  [4] RColorBrewer_1.1-2  httr_1.4.1          tools_3.6.2        
##  [7] backports_1.1.5     R6_2.4.1            rpart_4.1-15       
## [10] DBI_1.1.0           lazyeval_0.2.2      colorspace_1.4-1   
## [13] nnet_7.3-12         withr_2.1.2         tidyselect_1.0.0   
## [16] gridExtra_2.3       mnormt_1.5-6        compiler_3.6.2     
## [19] cli_2.0.2           rvest_0.3.5         htmlTable_1.13.3   
## [22] xml2_1.2.2          scales_1.1.0        checkmate_2.0.0    
## [25] digest_0.6.25       foreign_0.8-75      rmarkdown_2.1      
## [28] base64enc_0.1-3     jpeg_0.1-8.1        pkgconfig_2.0.3    
## [31] htmltools_0.4.0     dbplyr_1.4.2        htmlwidgets_1.5.1  
## [34] rlang_0.4.4         readxl_1.3.1        rstudioapi_0.11    
## [37] generics_0.0.2      jsonlite_1.6.1      gtools_3.8.1       
## [40] acepack_1.4.1       Rcpp_1.0.3          munsell_0.5.0      
## [43] fansi_0.4.1         lifecycle_0.1.0     stringi_1.4.6      
## [46] yaml_2.2.1          plyr_1.8.5          grid_3.6.2         
## [49] parallel_3.6.2      crayon_1.3.4        haven_2.2.0        
## [52] splines_3.6.2       hms_0.5.3           knitr_1.28         
## [55] pillar_1.4.3        reprex_0.3.0        glue_1.3.1         
## [58] evaluate_0.14       latticeExtra_0.6-29 data.table_1.12.8  
## [61] modelr_0.1.6        png_0.1-7           vctrs_0.2.3        
## [64] cellranger_1.1.0    gtable_0.3.0        xfun_0.12          
## [67] broom_0.5.5         cluster_2.1.0       ellipsis_0.3.0