THIS SCRIPT REQUIRES THE LOF VARIANT FILTERING AND RANKING SCRIPT (https://rpubs.com/deepsubs/nature_comms_paper_2020) TO BE RUN FIRST, AND THE SPECIFIED OUTPUT FILE FROM THAT SCRIPT TO BE SAVED IN THE SAME DIRECTORY AS THIS SCRIPT

Load tidyverse package

library("tidyverse")

Load data from LoF variant filtering and ranking analysis

masterfile_eoc_goodQ2_HIGH_ENSTcanonical_sampleAF0.01 <- read.csv("masterfile_eoc_goodQ2_HIGH_ENSTcanonical_sampleAF0.01.csv")

Remove variants with non-protein-coding biotypes

masterfile_eoc_goodQ2_HIGH_ENSTcanonical_sampleAF0.01_biotype <- filter(masterfile_eoc_goodQ2_HIGH_ENSTcanonical_sampleAF0.01,BIOTYPE%in%c("protein_coding"))

Remove variants with GnomAD PopMax AF > 0.005

masterfile_eoc_goodQ2_HIGH_ENSTcanonical_sampleAF0.01_biotype_GnomADpopmax0.005 <- filter(masterfile_eoc_goodQ2_HIGH_ENSTcanonical_sampleAF0.01_biotype,
GnomAD_v2.1_non_cancer_AF_popmax<=0.005)

Remove variants with ratio < 3

masterfile_eoc_goodQ2_HIGH_ENSTcanonical_sampleAF0.01_biotype_GnomADpopmax0.005_ratio3 <- filter(masterfile_eoc_goodQ2_HIGH_ENSTcanonical_sampleAF0.01_biotype_GnomADpopmax0.005,Sample_Gene_LOF_Freq_Ratio_GnomAD_NFE_0.005>3) 

Output values for binomial test

variant_counts_AFs <- function(vcf,Total_Sample_Alleles){
variants_heterozygous <- vcf %>% 
  select(HGVSc,Sample.GT) %>% 
  group_by(HGVSc,Sample.GT) %>% 
  summarise(n()) %>% 
  filter((Sample.GT%in%c("'0/1","'1/0"))) %>% 
  dplyr::rename(alleles = "n()") %>% 
  group_by(HGVSc) %>% 
  summarise(sum(alleles))
variants_homozygous <- vcf %>% 
  select(HGVSc,Sample.GT) %>% 
  group_by(HGVSc,Sample.GT) %>% 
  summarise(n()) %>% 
  filter((Sample.GT!="'0/1")&(Sample.GT!="'1/0")) %>% 
  mutate(alleles = `n()`*2) %>% 
  select(-`n()`) %>% 
  group_by(HGVSc) %>% 
  summarise(sum(alleles))
variants <- full_join(variants_heterozygous,variants_homozygous,by="HGVSc",copy=FALSE,suffix=c(".x",".y")) %>%
  mutate_all(funs(replace(., is.na(.), 0))) %>% 
  mutate(Total_Allele_Count=`sum(alleles).x`+`sum(alleles).y`) %>% 
  mutate(Sample_AF=Total_Allele_Count/Total_Sample_Alleles)
vcf <- left_join(vcf, variants[,c(1,4:5)],by="HGVSc",copy=FALSE)
}

masterfile_eoc_goodQ2_HIGH_ENSTcanonical_sampleAF0.01_biotype_GnomADpopmax0.005_ratio3_variantsPerPerson <- select(masterfile_eoc_goodQ2_HIGH_ENSTcanonical_sampleAF0.01_biotype_GnomADpopmax0.005_ratio3,Sample,CHROM,POS,REF,ALT,Sample.GT,SYMBOL,Gene,Feature,HGVSc)
samples <- masterfile_eoc_goodQ2_HIGH_ENSTcanonical_sampleAF0.01_biotype_GnomADpopmax0.005_ratio3_variantsPerPerson[,1] %>% unique()
samples_goodQ2_HIGH_ENSTcanonical_sampleAF0.01_biotype_GnomADpopmax0.005_ratio3_variantsPerPerson <- data.frame(Sample=factor(),No_of_Variants=double())
for(sample in samples){
  sample_data <- filter(masterfile_eoc_goodQ2_HIGH_ENSTcanonical_sampleAF0.01_biotype_GnomADpopmax0.005_ratio3_variantsPerPerson,Sample%in%c(sample))
  sample_variants <- variant_counts_AFs(sample_data,(n_distinct(samples)*2))
  no_of_variants <- sum(sample_variants$Total_Allele_Count)
  samples_goodQ2_HIGH_ENSTcanonical_sampleAF0.01_biotype_GnomADpopmax0.005_ratio3_variantsPerPerson <- add_row(samples_goodQ2_HIGH_ENSTcanonical_sampleAF0.01_biotype_GnomADpopmax0.005_ratio3_variantsPerPerson,Sample=sample,No_of_Variants=no_of_variants)
}

binomial_results_goodQ2_HIGH_ENSTcanonical_sampleAF0.01_biotype_GnomADpopmax0.005_ratio3 <- samples_goodQ2_HIGH_ENSTcanonical_sampleAF0.01_biotype_GnomADpopmax0.005_ratio3_variantsPerPerson %>%
  rename(gene_Var = "No_of_Variants") %>%
  count(gene_Var) %>%
  rename(obs_n_samples = "n") %>%
  add_row(gene_Var = 0, obs_n_samples = (510)-n_distinct(masterfile_eoc_goodQ2_HIGH_ENSTcanonical_sampleAF0.01_biotype_GnomADpopmax0.005_ratio3$Sample)) %>%
  arrange(gene_Var) %>%
  mutate(Total_No=gene_Var*obs_n_samples) %>%
  mutate(p_LoF_variant_perSample=sum(Total_No)/(510*2*(n_distinct(masterfile_eoc_goodQ2_HIGH_ENSTcanonical_sampleAF0.01_biotype_GnomADpopmax0.005_ratio3$Gene)))) %>%
  mutate(binomial_distrib_p=(dbinom(gene_Var,2*(n_distinct(masterfile_eoc_goodQ2_HIGH_ENSTcanonical_sampleAF0.01_biotype_GnomADpopmax0.005_ratio3$Gene)),p_LoF_variant_perSample,log=FALSE))) %>%
  mutate(exp_n_samples = round(binomial_distrib_p*(510),digits=0)) %>%
  write_tsv(path="binomial_results_goodQ2_HIGH_ENSTcanonical_sampleAF0.01_biotype_GnomADpopmax0.005_ratio3.tsv")

multinomial <- binomial_results_goodQ2_HIGH_ENSTcanonical_sampleAF0.01_biotype_GnomADpopmax0.005_ratio3[c(1:8),c(2,5,6)] %>%
  add_row(obs_n_samples=(sum(binomial_results_goodQ2_HIGH_ENSTcanonical_sampleAF0.01_biotype_GnomADpopmax0.005_ratio3[c(9:13),2])),
          binomial_distrib_p=(sum(binomial_results_goodQ2_HIGH_ENSTcanonical_sampleAF0.01_biotype_GnomADpopmax0.005_ratio3[c(9:13),5])),
          exp_n_samples=(sum(binomial_results_goodQ2_HIGH_ENSTcanonical_sampleAF0.01_biotype_GnomADpopmax0.005_ratio3[c(9:13),6])))

Perform chi-squared test

chisq.test(x=multinomial$obs_n_samples,
           correct=FALSE,
           p=multinomial$binomial_distrib_p,
           rescale.p=TRUE)

    Chi-squared test for given probabilities

data:  multinomial$obs_n_samples
X-squared = 10.56, df = 8, p-value = 0.2279
LS0tCnRpdGxlOiAiVGhlc2lzIExvRiBWYXJpYW50IEJ1cmRlbiBBbmFseXNpcyBTY3JpcHQiCm91dHB1dDogaHRtbF9ub3RlYm9vawotLS0KClRISVMgU0NSSVBUIFJFUVVJUkVTIFRIRSBMT0YgVkFSSUFOVCBGSUxURVJJTkcgQU5EIFJBTktJTkcgU0NSSVBUIChodHRwczovL3JwdWJzLmNvbS9kZWVwc3Vicy9uYXR1cmVfY29tbXNfcGFwZXJfMjAyMCkgVE8gQkUgUlVOIEZJUlNULCBBTkQgVEhFIFNQRUNJRklFRCBPVVRQVVQgRklMRSBGUk9NIFRIQVQgU0NSSVBUIFRPIEJFIFNBVkVEIElOIFRIRSBTQU1FIERJUkVDVE9SWSBBUyBUSElTIFNDUklQVAoKTG9hZCB0aWR5dmVyc2UgcGFja2FnZQpgYGB7cn0KbGlicmFyeSgidGlkeXZlcnNlIikKYGBgCgpMb2FkIGRhdGEgZnJvbSBMb0YgdmFyaWFudCBmaWx0ZXJpbmcgYW5kIHJhbmtpbmcgYW5hbHlzaXMKYGBge3J9Cm1hc3RlcmZpbGVfZW9jX2dvb2RRMl9ISUdIX0VOU1RjYW5vbmljYWxfc2FtcGxlQUYwLjAxIDwtIHJlYWQuY3N2KCJtYXN0ZXJmaWxlX2VvY19nb29kUTJfSElHSF9FTlNUY2Fub25pY2FsX3NhbXBsZUFGMC4wMS5jc3YiKQpgYGAKClJlbW92ZSB2YXJpYW50cyB3aXRoIG5vbi1wcm90ZWluLWNvZGluZyBiaW90eXBlcwpgYGB7cn0KbWFzdGVyZmlsZV9lb2NfZ29vZFEyX0hJR0hfRU5TVGNhbm9uaWNhbF9zYW1wbGVBRjAuMDFfYmlvdHlwZSA8LSBmaWx0ZXIobWFzdGVyZmlsZV9lb2NfZ29vZFEyX0hJR0hfRU5TVGNhbm9uaWNhbF9zYW1wbGVBRjAuMDEsQklPVFlQRSVpbiVjKCJwcm90ZWluX2NvZGluZyIpKQpgYGAKClJlbW92ZSB2YXJpYW50cyB3aXRoIEdub21BRCBQb3BNYXggQUYgPiAwLjAwNQpgYGB7cn0KbWFzdGVyZmlsZV9lb2NfZ29vZFEyX0hJR0hfRU5TVGNhbm9uaWNhbF9zYW1wbGVBRjAuMDFfYmlvdHlwZV9Hbm9tQURwb3BtYXgwLjAwNSA8LSBmaWx0ZXIobWFzdGVyZmlsZV9lb2NfZ29vZFEyX0hJR0hfRU5TVGNhbm9uaWNhbF9zYW1wbGVBRjAuMDFfYmlvdHlwZSwKR25vbUFEX3YyLjFfbm9uX2NhbmNlcl9BRl9wb3BtYXg8PTAuMDA1KQpgYGAKClJlbW92ZSB2YXJpYW50cyB3aXRoIHJhdGlvIDwgMwpgYGB7cn0KbWFzdGVyZmlsZV9lb2NfZ29vZFEyX0hJR0hfRU5TVGNhbm9uaWNhbF9zYW1wbGVBRjAuMDFfYmlvdHlwZV9Hbm9tQURwb3BtYXgwLjAwNV9yYXRpbzMgPC0gZmlsdGVyKG1hc3RlcmZpbGVfZW9jX2dvb2RRMl9ISUdIX0VOU1RjYW5vbmljYWxfc2FtcGxlQUYwLjAxX2Jpb3R5cGVfR25vbUFEcG9wbWF4MC4wMDUsU2FtcGxlX0dlbmVfTE9GX0ZyZXFfUmF0aW9fR25vbUFEX05GRV8wLjAwNT4zKSAKYGBgCgpPdXRwdXQgdmFsdWVzIGZvciBiaW5vbWlhbCB0ZXN0CmBgYHtyfQp2YXJpYW50X2NvdW50c19BRnMgPC0gZnVuY3Rpb24odmNmLFRvdGFsX1NhbXBsZV9BbGxlbGVzKXsKdmFyaWFudHNfaGV0ZXJvenlnb3VzIDwtIHZjZiAlPiUgCiAgc2VsZWN0KEhHVlNjLFNhbXBsZS5HVCkgJT4lIAogIGdyb3VwX2J5KEhHVlNjLFNhbXBsZS5HVCkgJT4lIAogIHN1bW1hcmlzZShuKCkpICU+JSAKICBmaWx0ZXIoKFNhbXBsZS5HVCVpbiVjKCInMC8xIiwiJzEvMCIpKSkgJT4lIAogIGRwbHlyOjpyZW5hbWUoYWxsZWxlcyA9ICJuKCkiKSAlPiUgCiAgZ3JvdXBfYnkoSEdWU2MpICU+JSAKICBzdW1tYXJpc2Uoc3VtKGFsbGVsZXMpKQp2YXJpYW50c19ob21venlnb3VzIDwtIHZjZiAlPiUgCiAgc2VsZWN0KEhHVlNjLFNhbXBsZS5HVCkgJT4lIAogIGdyb3VwX2J5KEhHVlNjLFNhbXBsZS5HVCkgJT4lIAogIHN1bW1hcmlzZShuKCkpICU+JSAKICBmaWx0ZXIoKFNhbXBsZS5HVCE9IicwLzEiKSYoU2FtcGxlLkdUIT0iJzEvMCIpKSAlPiUgCiAgbXV0YXRlKGFsbGVsZXMgPSBgbigpYCoyKSAlPiUgCiAgc2VsZWN0KC1gbigpYCkgJT4lIAogIGdyb3VwX2J5KEhHVlNjKSAlPiUgCiAgc3VtbWFyaXNlKHN1bShhbGxlbGVzKSkKdmFyaWFudHMgPC0gZnVsbF9qb2luKHZhcmlhbnRzX2hldGVyb3p5Z291cyx2YXJpYW50c19ob21venlnb3VzLGJ5PSJIR1ZTYyIsY29weT1GQUxTRSxzdWZmaXg9YygiLngiLCIueSIpKSAlPiUKICBtdXRhdGVfYWxsKGZ1bnMocmVwbGFjZSguLCBpcy5uYSguKSwgMCkpKSAlPiUgCiAgbXV0YXRlKFRvdGFsX0FsbGVsZV9Db3VudD1gc3VtKGFsbGVsZXMpLnhgK2BzdW0oYWxsZWxlcykueWApICU+JSAKICBtdXRhdGUoU2FtcGxlX0FGPVRvdGFsX0FsbGVsZV9Db3VudC9Ub3RhbF9TYW1wbGVfQWxsZWxlcykKdmNmIDwtIGxlZnRfam9pbih2Y2YsIHZhcmlhbnRzWyxjKDEsNDo1KV0sYnk9IkhHVlNjIixjb3B5PUZBTFNFKQp9CgptYXN0ZXJmaWxlX2VvY19nb29kUTJfSElHSF9FTlNUY2Fub25pY2FsX3NhbXBsZUFGMC4wMV9iaW90eXBlX0dub21BRHBvcG1heDAuMDA1X3JhdGlvM192YXJpYW50c1BlclBlcnNvbiA8LSBzZWxlY3QobWFzdGVyZmlsZV9lb2NfZ29vZFEyX0hJR0hfRU5TVGNhbm9uaWNhbF9zYW1wbGVBRjAuMDFfYmlvdHlwZV9Hbm9tQURwb3BtYXgwLjAwNV9yYXRpbzMsU2FtcGxlLENIUk9NLFBPUyxSRUYsQUxULFNhbXBsZS5HVCxTWU1CT0wsR2VuZSxGZWF0dXJlLEhHVlNjKQpzYW1wbGVzIDwtIG1hc3RlcmZpbGVfZW9jX2dvb2RRMl9ISUdIX0VOU1RjYW5vbmljYWxfc2FtcGxlQUYwLjAxX2Jpb3R5cGVfR25vbUFEcG9wbWF4MC4wMDVfcmF0aW8zX3ZhcmlhbnRzUGVyUGVyc29uWywxXSAlPiUgdW5pcXVlKCkKc2FtcGxlc19nb29kUTJfSElHSF9FTlNUY2Fub25pY2FsX3NhbXBsZUFGMC4wMV9iaW90eXBlX0dub21BRHBvcG1heDAuMDA1X3JhdGlvM192YXJpYW50c1BlclBlcnNvbiA8LSBkYXRhLmZyYW1lKFNhbXBsZT1mYWN0b3IoKSxOb19vZl9WYXJpYW50cz1kb3VibGUoKSkKZm9yKHNhbXBsZSBpbiBzYW1wbGVzKXsKICBzYW1wbGVfZGF0YSA8LSBmaWx0ZXIobWFzdGVyZmlsZV9lb2NfZ29vZFEyX0hJR0hfRU5TVGNhbm9uaWNhbF9zYW1wbGVBRjAuMDFfYmlvdHlwZV9Hbm9tQURwb3BtYXgwLjAwNV9yYXRpbzNfdmFyaWFudHNQZXJQZXJzb24sU2FtcGxlJWluJWMoc2FtcGxlKSkKICBzYW1wbGVfdmFyaWFudHMgPC0gdmFyaWFudF9jb3VudHNfQUZzKHNhbXBsZV9kYXRhLChuX2Rpc3RpbmN0KHNhbXBsZXMpKjIpKQogIG5vX29mX3ZhcmlhbnRzIDwtIHN1bShzYW1wbGVfdmFyaWFudHMkVG90YWxfQWxsZWxlX0NvdW50KQogIHNhbXBsZXNfZ29vZFEyX0hJR0hfRU5TVGNhbm9uaWNhbF9zYW1wbGVBRjAuMDFfYmlvdHlwZV9Hbm9tQURwb3BtYXgwLjAwNV9yYXRpbzNfdmFyaWFudHNQZXJQZXJzb24gPC0gYWRkX3JvdyhzYW1wbGVzX2dvb2RRMl9ISUdIX0VOU1RjYW5vbmljYWxfc2FtcGxlQUYwLjAxX2Jpb3R5cGVfR25vbUFEcG9wbWF4MC4wMDVfcmF0aW8zX3ZhcmlhbnRzUGVyUGVyc29uLFNhbXBsZT1zYW1wbGUsTm9fb2ZfVmFyaWFudHM9bm9fb2ZfdmFyaWFudHMpCn0KCmJpbm9taWFsX3Jlc3VsdHNfZ29vZFEyX0hJR0hfRU5TVGNhbm9uaWNhbF9zYW1wbGVBRjAuMDFfYmlvdHlwZV9Hbm9tQURwb3BtYXgwLjAwNV9yYXRpbzMgPC0gc2FtcGxlc19nb29kUTJfSElHSF9FTlNUY2Fub25pY2FsX3NhbXBsZUFGMC4wMV9iaW90eXBlX0dub21BRHBvcG1heDAuMDA1X3JhdGlvM192YXJpYW50c1BlclBlcnNvbiAlPiUKICByZW5hbWUoZ2VuZV9WYXIgPSAiTm9fb2ZfVmFyaWFudHMiKSAlPiUKICBjb3VudChnZW5lX1ZhcikgJT4lCiAgcmVuYW1lKG9ic19uX3NhbXBsZXMgPSAibiIpICU+JQogIGFkZF9yb3coZ2VuZV9WYXIgPSAwLCBvYnNfbl9zYW1wbGVzID0gKDUxMCktbl9kaXN0aW5jdChtYXN0ZXJmaWxlX2VvY19nb29kUTJfSElHSF9FTlNUY2Fub25pY2FsX3NhbXBsZUFGMC4wMV9iaW90eXBlX0dub21BRHBvcG1heDAuMDA1X3JhdGlvMyRTYW1wbGUpKSAlPiUKICBhcnJhbmdlKGdlbmVfVmFyKSAlPiUKICBtdXRhdGUoVG90YWxfTm89Z2VuZV9WYXIqb2JzX25fc2FtcGxlcykgJT4lCiAgbXV0YXRlKHBfTG9GX3ZhcmlhbnRfcGVyU2FtcGxlPXN1bShUb3RhbF9ObykvKDUxMCoyKihuX2Rpc3RpbmN0KG1hc3RlcmZpbGVfZW9jX2dvb2RRMl9ISUdIX0VOU1RjYW5vbmljYWxfc2FtcGxlQUYwLjAxX2Jpb3R5cGVfR25vbUFEcG9wbWF4MC4wMDVfcmF0aW8zJEdlbmUpKSkpICU+JQogIG11dGF0ZShiaW5vbWlhbF9kaXN0cmliX3A9KGRiaW5vbShnZW5lX1ZhciwyKihuX2Rpc3RpbmN0KG1hc3RlcmZpbGVfZW9jX2dvb2RRMl9ISUdIX0VOU1RjYW5vbmljYWxfc2FtcGxlQUYwLjAxX2Jpb3R5cGVfR25vbUFEcG9wbWF4MC4wMDVfcmF0aW8zJEdlbmUpKSxwX0xvRl92YXJpYW50X3BlclNhbXBsZSxsb2c9RkFMU0UpKSkgJT4lCiAgbXV0YXRlKGV4cF9uX3NhbXBsZXMgPSByb3VuZChiaW5vbWlhbF9kaXN0cmliX3AqKDUxMCksZGlnaXRzPTApKSAlPiUKICB3cml0ZV90c3YocGF0aD0iYmlub21pYWxfcmVzdWx0c19nb29kUTJfSElHSF9FTlNUY2Fub25pY2FsX3NhbXBsZUFGMC4wMV9iaW90eXBlX0dub21BRHBvcG1heDAuMDA1X3JhdGlvMy50c3YiKQoKbXVsdGlub21pYWwgPC0gYmlub21pYWxfcmVzdWx0c19nb29kUTJfSElHSF9FTlNUY2Fub25pY2FsX3NhbXBsZUFGMC4wMV9iaW90eXBlX0dub21BRHBvcG1heDAuMDA1X3JhdGlvM1tjKDE6OCksYygyLDUsNildICU+JQogIGFkZF9yb3cob2JzX25fc2FtcGxlcz0oc3VtKGJpbm9taWFsX3Jlc3VsdHNfZ29vZFEyX0hJR0hfRU5TVGNhbm9uaWNhbF9zYW1wbGVBRjAuMDFfYmlvdHlwZV9Hbm9tQURwb3BtYXgwLjAwNV9yYXRpbzNbYyg5OjEzKSwyXSkpLAogICAgICAgICAgYmlub21pYWxfZGlzdHJpYl9wPShzdW0oYmlub21pYWxfcmVzdWx0c19nb29kUTJfSElHSF9FTlNUY2Fub25pY2FsX3NhbXBsZUFGMC4wMV9iaW90eXBlX0dub21BRHBvcG1heDAuMDA1X3JhdGlvM1tjKDk6MTMpLDVdKSksCiAgICAgICAgICBleHBfbl9zYW1wbGVzPShzdW0oYmlub21pYWxfcmVzdWx0c19nb29kUTJfSElHSF9FTlNUY2Fub25pY2FsX3NhbXBsZUFGMC4wMV9iaW90eXBlX0dub21BRHBvcG1heDAuMDA1X3JhdGlvM1tjKDk6MTMpLDZdKSkpCmBgYAoKUGVyZm9ybSBjaGktc3F1YXJlZCB0ZXN0IApgYGB7cn0KY2hpc3EudGVzdCh4PW11bHRpbm9taWFsJG9ic19uX3NhbXBsZXMsCiAgICAgICAgICAgY29ycmVjdD1GQUxTRSwKICAgICAgICAgICBwPW11bHRpbm9taWFsJGJpbm9taWFsX2Rpc3RyaWJfcCwKICAgICAgICAgICByZXNjYWxlLnA9VFJVRSkKYGBgCg==