ALL IMPORTED FILES MUST BE IN THE SAME DIRECTORY AS THIS SCRIPT

Load tidyverse package

library("tidyverse")

Import raw vcf data from “masterfile” and exclude variants with GnomAD non-cancer AF > 0.005

masterfile_consequence_1.3_or_LoF <- read.delim("masterfile_consequence_1.3_or_LoF.tsv", header=TRUE, row.names=NULL, na.strings = ".", stringsAsFactors=FALSE) %>% 
  rename("Sample"="X.Sample") %>% 
  filter(GnomAD_v2.1_non_cancer_AF<=0.005)

masterfile_consequence_4.6 <- read.delim("masterfile_consequence_4.6.tsv", header=TRUE, row.names=NULL, na.strings = ".", stringsAsFactors=FALSE) %>% 
  rename("Sample"="X.Sample") %>% 
  filter(GnomAD_v2.1_non_cancer_AF<=0.005)

masterfile <- union(masterfile_consequence_1.3_or_LoF,masterfile_consequence_4.6)

Exclude non-epithelial, uterine-only and BRCA +ve samples

ViP_Complete_Cohort <- read.delim("ViP_LoF_Complete_Cohort.txt", stringsAsFactors=FALSE)
ViP_Complete_Cohort_list <- ViP_Complete_Cohort[,1]

masterfile_eoc <- filter(masterfile,Sample%in%c(ViP_Complete_Cohort_list))
rm(masterfile)

Append patient path data

ViP_OvCa_Path_Data <- read.delim("ViP_OvCa_Path_Data.txt", header=TRUE, row.names=NULL, stringsAsFactors=FALSE) %>% 
  dplyr::rename("Sample"=Exome.ID)
masterfile_eoc_withPath <- left_join(masterfile_eoc,ViP_OvCa_Path_Data,by="Sample",copy=FALSE)
rm(masterfile_eoc)

Exclude low-quality variants, GnomAD RF/InbreedingCoeff-flagged variants and protein-coding transcripts

masterfile_eoc_NPC_goodQ <- filter(masterfile_eoc_withPath, (QUAL>=30)&
                                 (Identified!="FilteredInAll")&
                                 (Sample.PMCDP>=10)&
                                 (Sample.PMCFREQ>=0.25)) %>% 
                                 filter((!str_detect(GnomAD_v2.1_FILTER_exome,"InbreedingCoeff")&
                                           !str_detect(GnomAD_v2.1_FILTER_exome,"RF"))|
                                       is.na(GnomAD_v2.1_FILTER_exome)) %>% 
  filter((!str_detect(GnomAD_v2.1_FILTER_genome,"InbreedingCoeff")&
            !str_detect(GnomAD_v2.1_FILTER_genome,"RF"))|
           is.na(GnomAD_v2.1_FILTER_genome)) %>% 
  filter(!BIOTYPE%in%c("protein_coding"))
rm(masterfile_eoc_withPath)

Exclude other mutation +ve samples (MLH1/MSH2/MSH6/PMS2, TP53, RAD51C/D, BRIP1)

ViP_Discovery_Cohort <- read.delim("ViP_LoF_Discovery_Cohort.txt", stringsAsFactors=FALSE)
ViP_Discovery_Cohort_list <- ViP_Discovery_Cohort[,1]

masterfile_eoc_NPC_goodQ2 <- filter(masterfile_eoc_NPC_goodQ,Sample%in%c(ViP_Discovery_Cohort_list))

Exclude MODIFIER impact variants + any non-protein-coding-transcripts with a transcribed protein ENSP ID e.g. immune system genes + any ‘pseudogenes’ (+ GnomAD popmax AF > 0.005 if needed)

masterfile_eoc_NPC_goodQ_ALL <- filter(masterfile_eoc_NPC_goodQ, IMPACT%in%c("HIGH","MODERATE","LOW")) %>% 
  filter(!grepl("ENSP",ENSP)) %>% 
  filter(!grepl("pseudogene",BIOTYPE)) %>% 
  distinct(.keep_all = FALSE)

masterfile_eoc_NPC_goodQ2_ALL <- filter(masterfile_eoc_NPC_goodQ2, IMPACT%in%c("HIGH","MODERATE","LOW")) %>% 
  filter(!grepl("ENSP",ENSP)) %>% 
  filter(!grepl("pseudogene",BIOTYPE)) %>% 
  distinct(.keep_all = FALSE)

Exclude MODIFIER impact variants + any non-protein-coding-transcripts other than ‘lincRNAs’ (+ GnomAD popmax AF > 0.005 if needed)

masterfile_eoc_lincRNA_goodQ_ALL <- filter(masterfile_eoc_NPC_goodQ, IMPACT%in%c("HIGH","MODERATE","LOW")) %>% 
  filter(BIOTYPE%in%"lincRNA") %>% 
  distinct(.keep_all = FALSE)

masterfile_eoc_lincRNA_goodQ2_ALL <- filter(masterfile_eoc_NPC_goodQ2, IMPACT%in%c("HIGH","MODERATE","LOW")) %>% 
  filter(BIOTYPE%in%"lincRNA") %>% 
  distinct(.keep_all = FALSE)

Separate Ensembl canonical and RefSeq canonical variants

masterfile_eoc_NPC_goodQ_ALL_ENSTcanonical <- filter(masterfile_eoc_NPC_goodQ_ALL,CANONICAL=="YES") 

masterfile_eoc_NPC_goodQ2_ALL_ENSTcanonical <- filter(masterfile_eoc_NPC_goodQ2_ALL,CANONICAL=="YES")

masterfile_eoc_lincRNA_goodQ_ALL_ENSTcanonical <- filter(masterfile_eoc_lincRNA_goodQ_ALL,CANONICAL=="YES") 

masterfile_eoc_lincRNA_goodQ2_ALL_ENSTcanonical <- filter(masterfile_eoc_lincRNA_goodQ2_ALL,CANONICAL=="YES") 

Sample variant counts and AFs

variants_heterozygous <- masterfile_eoc_NPC_goodQ_ALL_ENSTcanonical %>% 
  select(HGVSc,Sample.GT) %>% 
  group_by(HGVSc,Sample.GT) %>% 
  summarise(n()) %>% 
  filter((Sample.GT%in%c("'0/1","'1/0"))) %>% 
  rename(alleles = "n()") %>% 
  group_by(HGVSc) %>% 
  summarise(sum(alleles))
variants_homozygous <- masterfile_eoc_NPC_goodQ_ALL_ENSTcanonical %>% 
  select(HGVSc,Sample.GT) %>% 
  group_by(HGVSc,Sample.GT) %>% 
  summarise(n()) %>% 
  filter((Sample.GT!="'0/1")&(Sample.GT!="'1/0")) %>% 
  mutate(alleles = `n()`*2) %>% 
  select(-`n()`) %>% 
  group_by(HGVSc) %>% 
  summarise(sum(alleles))
variants <- full_join(variants_heterozygous,variants_homozygous,by="HGVSc",copy=FALSE,suffix=c(".x",".y")) %>%
  mutate_all(funs(replace(., is.na(.), 0))) %>% 
  mutate(Total_Allele_Count=`sum(alleles).x`+`sum(alleles).y`) %>% 
  mutate(Sample_AF=Total_Allele_Count/(n_distinct(masterfile_eoc_NPC_goodQ2_ALL$Sample)*2))
masterfile_eoc_NPC_goodQ_ALL_ENSTcanonical_2 <- left_join(masterfile_eoc_NPC_goodQ_ALL_ENSTcanonical,variants[,c(1,4:5)],by="HGVSc",copy=FALSE)

variants_heterozygous <- masterfile_eoc_NPC_goodQ2_ALL_ENSTcanonical %>% 
  select(HGVSc,Sample.GT) %>% 
  group_by(HGVSc,Sample.GT) %>% 
  summarise(n()) %>% 
  filter((Sample.GT%in%c("'0/1","'1/0"))) %>% 
  rename(alleles = "n()") %>% 
  group_by(HGVSc) %>% 
  summarise(sum(alleles))
variants_homozygous <- masterfile_eoc_NPC_goodQ2_ALL_ENSTcanonical %>% 
  select(HGVSc,Sample.GT) %>% 
  group_by(HGVSc,Sample.GT) %>% 
  summarise(n()) %>% 
  filter((Sample.GT!="'0/1")&(Sample.GT!="'1/0")) %>% 
  mutate(alleles = `n()`*2) %>% 
  select(-`n()`) %>% 
  group_by(HGVSc) %>% 
  summarise(sum(alleles))
variants <- full_join(variants_heterozygous,variants_homozygous,by="HGVSc",copy=FALSE,suffix=c(".x",".y")) %>%
  mutate_all(funs(replace(., is.na(.), 0))) %>% 
  mutate(Total_Allele_Count=`sum(alleles).x`+`sum(alleles).y`) %>% 
  mutate(Sample_AF=Total_Allele_Count/(n_distinct(masterfile_eoc_NPC_goodQ2_ALL$Sample)*2))
masterfile_eoc_NPC_goodQ2_ALL_ENSTcanonical_2 <- left_join(masterfile_eoc_NPC_goodQ2_ALL_ENSTcanonical,variants[,c(1,4:5)],by="HGVSc",copy=FALSE)

variants_heterozygous <- masterfile_eoc_lincRNA_goodQ_ALL_ENSTcanonical %>% 
  select(HGVSc,Sample.GT) %>% 
  group_by(HGVSc,Sample.GT) %>% 
  summarise(n()) %>% 
  filter((Sample.GT%in%c("'0/1","'1/0"))) %>% 
  rename(alleles = "n()") %>% 
  group_by(HGVSc) %>% 
  summarise(sum(alleles))
variants_homozygous <- masterfile_eoc_lincRNA_goodQ_ALL_ENSTcanonical %>% 
  select(HGVSc,Sample.GT) %>% 
  group_by(HGVSc,Sample.GT) %>% 
  summarise(n()) %>% 
  filter((Sample.GT!="'0/1")&(Sample.GT!="'1/0")) %>% 
  mutate(alleles = `n()`*2) %>% 
  select(-`n()`) %>% 
  group_by(HGVSc) %>% 
  summarise(sum(alleles))
variants <- full_join(variants_heterozygous,variants_homozygous,by="HGVSc",copy=FALSE,suffix=c(".x",".y")) %>%
  mutate_all(funs(replace(., is.na(.), 0))) %>% 
  mutate(Total_Allele_Count=`sum(alleles).x`+`sum(alleles).y`) %>% 
  mutate(Sample_AF=Total_Allele_Count/(n_distinct(masterfile_eoc_NPC_goodQ2$Sample)*2))
masterfile_eoc_lincRNA_goodQ_ALL_ENSTcanonical_2 <- left_join(masterfile_eoc_lincRNA_goodQ_ALL_ENSTcanonical,variants[,c(1,4:5)],by="HGVSc",copy=FALSE)

variants_heterozygous <- masterfile_eoc_lincRNA_goodQ2_ALL_ENSTcanonical %>% 
  select(HGVSc,Sample.GT) %>% 
  group_by(HGVSc,Sample.GT) %>% 
  summarise(n()) %>% 
  filter((Sample.GT%in%c("'0/1","'1/0"))) %>% 
  rename(alleles = "n()") %>% 
  group_by(HGVSc) %>% 
  summarise(sum(alleles))
variants_homozygous <- masterfile_eoc_lincRNA_goodQ2_ALL_ENSTcanonical %>% 
  select(HGVSc,Sample.GT) %>% 
  group_by(HGVSc,Sample.GT) %>% 
  summarise(n()) %>% 
  filter((Sample.GT!="'0/1")&(Sample.GT!="'1/0")) %>% 
  mutate(alleles = `n()`*2) %>% 
  select(-`n()`) %>% 
  group_by(HGVSc) %>% 
  summarise(sum(alleles))
variants <- full_join(variants_heterozygous,variants_homozygous,by="HGVSc",copy=FALSE,suffix=c(".x",".y")) %>%
  mutate_all(funs(replace(., is.na(.), 0))) %>% 
  mutate(Total_Allele_Count=`sum(alleles).x`+`sum(alleles).y`) %>% 
  mutate(Sample_AF=Total_Allele_Count/(n_distinct(masterfile_eoc_NPC_goodQ2$Sample)*2))
masterfile_eoc_lincRNA_goodQ2_ALL_ENSTcanonical_2 <- left_join(masterfile_eoc_lincRNA_goodQ2_ALL_ENSTcanonical,variants[,c(1,4:5)],by="HGVSc",copy=FALSE)

Exclude variants with sample AF >0.01

masterfile_eoc_NPC_goodQ_ALL_ENSTcanonical_sampleAF0.01 <- filter(masterfile_eoc_NPC_goodQ_ALL_ENSTcanonical_2,Sample_AF < 0.01)

masterfile_eoc_NPC_goodQ2_ALL_ENSTcanonical_sampleAF0.01 <- filter(masterfile_eoc_NPC_goodQ2_ALL_ENSTcanonical_2,Sample_AF < 0.01)

masterfile_eoc_lincRNA_goodQ_ALL_ENSTcanonical_sampleAF0.01 <- filter(masterfile_eoc_lincRNA_goodQ_ALL_ENSTcanonical_2,Sample_AF < 0.01)

masterfile_eoc_lincRNA_goodQ2_ALL_ENSTcanonical_sampleAF0.01 <- filter(masterfile_eoc_lincRNA_goodQ2_ALL_ENSTcanonical_2,Sample_AF < 0.01)

Output list of genes and variants with sample AF >0.01

masterfile_eoc_NPC_goodQ2_ALL_ENSTcanonical_genesandvariants0.01 <- filter(masterfile_eoc_NPC_goodQ2_ALL_ENSTcanonical_2,Sample_AF > 0.01)
masterfile_eoc_NPC_goodQ2_ALL_ENSTcanonical_genesandvariants0.01[,c(2:6,25:54)] %>% 
  distinct(.keep_all = FALSE) %>% 
  write_excel_csv(file="masterfile_eoc_NPC_goodQ2_ALL_ENSTcanonical_genesandvariants0.01.csv",na=".",append=FALSE,col_names=TRUE)

masterfile_eoc_lincRNA_goodQ2_ALL_ENSTcanonical_genesandvariants0.01 <- filter(masterfile_eoc_lincRNA_goodQ2_ALL_ENSTcanonical_2,Sample_AF > 0.01)
masterfile_eoc_lincRNA_goodQ2_ALL_ENSTcanonical_genesandvariants0.01[,c(2:6,25:54)] %>% 
  distinct(.keep_all = FALSE) %>% 
  write_excel_csv(file="masterfile_eoc_lincRNA_goodQ2_ALL_ENSTcanonical_genesandvariants0.01.csv",na=".",append=FALSE,col_names=TRUE)

Sample gene counts and frequencies

genes_oneVarAllele <- masterfile_eoc_NPC_goodQ_ALL_ENSTcanonical_sampleAF0.01 %>% select(Gene,Sample.GT) %>% 
  select(Gene,Sample.GT) %>% 
  group_by(Gene,Sample.GT) %>% 
  summarise(n()) %>% 
  filter((Sample.GT%in%c("'0/1","'1/0"))) %>% 
  rename(gene_Var = "n()") %>% 
  group_by(Gene) %>% 
  summarise(sum(gene_Var))
genes_twoVarAllele <- masterfile_eoc_NPC_goodQ_ALL_ENSTcanonical_sampleAF0.01 %>% select(Gene,Sample.GT) %>% 
  select(Gene,Sample.GT) %>% 
  group_by(Gene,Sample.GT) %>% 
  summarise(n()) %>% 
  filter((Sample.GT!="'0/1")&(Sample.GT!="'1/0")) %>% 
  mutate(gene_Var = `n()`*2) %>% 
  select(-`n()`) %>% 
  group_by(Gene) %>% 
  summarise(sum(gene_Var))
genes <- full_join(genes_oneVarAllele,genes_twoVarAllele,by="Gene",copy=FALSE,suffix=c(".x",".y")) %>% 
  mutate_all(funs(replace(., is.na(.), 0))) %>% 
  mutate(Total_Gene_Count=`sum(gene_Var).x`+`sum(gene_Var).y`) %>% 
  mutate(Sample_Gene_Freq=Total_Gene_Count/(n_distinct(masterfile_eoc_NPC_goodQ2_ALL$Sample)*2))
masterfile_eoc_NPC_goodQ_ALL_ENSTcanonical_sampleAF0.01_2 <- left_join(masterfile_eoc_NPC_goodQ_ALL_ENSTcanonical_sampleAF0.01, genes[,c(1,4:5)],by="Gene",copy=FALSE)

genes_oneVarAllele <- masterfile_eoc_NPC_goodQ2_ALL_ENSTcanonical_sampleAF0.01 %>% select(Gene,Sample.GT) %>% 
  select(Gene,Sample.GT) %>% 
  group_by(Gene,Sample.GT) %>% 
  summarise(n()) %>% 
  filter((Sample.GT%in%c("'0/1","'1/0"))) %>% 
  rename(gene_Var = "n()") %>% 
  group_by(Gene) %>% 
  summarise(sum(gene_Var))
genes_twoVarAllele <- masterfile_eoc_NPC_goodQ2_ALL_ENSTcanonical_sampleAF0.01 %>% select(Gene,Sample.GT) %>% 
  select(Gene,Sample.GT) %>% 
  group_by(Gene,Sample.GT) %>% 
  summarise(n()) %>% 
  filter((Sample.GT!="'0/1")&(Sample.GT!="'1/0")) %>% 
  mutate(gene_Var = `n()`*2) %>% 
  select(-`n()`) %>% 
  group_by(Gene) %>% 
  summarise(sum(gene_Var))
genes <- full_join(genes_oneVarAllele,genes_twoVarAllele,by="Gene",copy=FALSE,suffix=c(".x",".y")) %>% 
  mutate_all(funs(replace(., is.na(.), 0))) %>% 
  mutate(Total_Gene_Count=`sum(gene_Var).x`+`sum(gene_Var).y`) %>% 
  mutate(Sample_Gene_Freq=Total_Gene_Count/(n_distinct(masterfile_eoc_NPC_goodQ2_ALL$Sample)*2))
masterfile_eoc_NPC_goodQ2_ALL_ENSTcanonical_sampleAF0.01_2 <- left_join(masterfile_eoc_NPC_goodQ2_ALL_ENSTcanonical_sampleAF0.01, genes[,c(1,4:5)],by="Gene",copy=FALSE)

genes_oneVarAllele <- masterfile_eoc_lincRNA_goodQ_ALL_ENSTcanonical_sampleAF0.01 %>% select(Gene,Sample.GT) %>% 
  select(Gene,Sample.GT) %>% 
  group_by(Gene,Sample.GT) %>% 
  summarise(n()) %>% 
  filter((Sample.GT%in%c("'0/1","'1/0"))) %>% 
  rename(gene_Var = "n()") %>% 
  group_by(Gene) %>% 
  summarise(sum(gene_Var))
genes_twoVarAllele <- masterfile_eoc_lincRNA_goodQ_ALL_ENSTcanonical_sampleAF0.01 %>% select(Gene,Sample.GT) %>% 
  select(Gene,Sample.GT) %>% 
  group_by(Gene,Sample.GT) %>% 
  summarise(n()) %>% 
  filter((Sample.GT!="'0/1")&(Sample.GT!="'1/0")) %>% 
  mutate(gene_Var = `n()`*2) %>% 
  select(-`n()`) %>% 
  group_by(Gene) %>% 
  summarise(sum(gene_Var))
genes <- full_join(genes_oneVarAllele,genes_twoVarAllele,by="Gene",copy=FALSE,suffix=c(".x",".y")) %>% 
  mutate_all(funs(replace(., is.na(.), 0))) %>% 
  mutate(Total_Gene_Count=`sum(gene_Var).x`+`sum(gene_Var).y`) %>% 
  mutate(Sample_Gene_Freq=Total_Gene_Count/(n_distinct(masterfile_eoc_NPC_goodQ2$Sample)*2))
masterfile_eoc_lincRNA_goodQ_ALL_ENSTcanonical_sampleAF0.01_2 <- left_join(masterfile_eoc_lincRNA_goodQ_ALL_ENSTcanonical_sampleAF0.01, genes[,c(1,4:5)],by="Gene",copy=FALSE)

genes_oneVarAllele <- masterfile_eoc_lincRNA_goodQ2_ALL_ENSTcanonical_sampleAF0.01 %>% select(Gene,Sample.GT) %>% 
  select(Gene,Sample.GT) %>% 
  group_by(Gene,Sample.GT) %>% 
  summarise(n()) %>% 
  filter((Sample.GT%in%c("'0/1","'1/0"))) %>% 
  rename(gene_Var = "n()") %>% 
  group_by(Gene) %>% 
  summarise(sum(gene_Var))
genes_twoVarAllele <- masterfile_eoc_lincRNA_goodQ2_ALL_ENSTcanonical_sampleAF0.01 %>% select(Gene,Sample.GT) %>% 
  select(Gene,Sample.GT) %>% 
  group_by(Gene,Sample.GT) %>% 
  summarise(n()) %>% 
  filter((Sample.GT!="'0/1")&(Sample.GT!="'1/0")) %>% 
  mutate(gene_Var = `n()`*2) %>% 
  select(-`n()`) %>% 
  group_by(Gene) %>% 
  summarise(sum(gene_Var))
genes <- full_join(genes_oneVarAllele,genes_twoVarAllele,by="Gene",copy=FALSE,suffix=c(".x",".y")) %>% 
  mutate_all(funs(replace(., is.na(.), 0))) %>% 
  mutate(Total_Gene_Count=`sum(gene_Var).x`+`sum(gene_Var).y`) %>% 
  mutate(Sample_Gene_Freq=Total_Gene_Count/(n_distinct(masterfile_eoc_NPC_goodQ2$Sample)*2))
masterfile_eoc_lincRNA_goodQ2_ALL_ENSTcanonical_sampleAF0.01_2 <- left_join(masterfile_eoc_lincRNA_goodQ2_ALL_ENSTcanonical_sampleAF0.01, genes[,c(1,4:5)],by="Gene",copy=FALSE)

Add GnomAD gene-level data

ensembl_biotypes <- read.delim("ensembl_biotypes.tsv", stringsAsFactors=FALSE) %>% 
  select(ensembl_transcript_id,transcript_biotype) %>% 
  dplyr::rename("Feature"="ensembl_transcript_id","BIOTYPE"="transcript_biotype")

AgilentSSv6_list_ENSTcanonical_non_protein_coding <- read.delim("GnomAD.gene.stats.v2.RF.only.agilent.v6.only.tsv", header=TRUE, row.names=NULL, stringsAsFactors=FALSE) %>% 
  filter(CANONICAL=="YES") %>% 
  dplyr::rename("SYMBOL"="X.Symbol") %>% 
  dplyr::rename("Gene"="ENSG") %>% 
  left_join(ensembl_biotypes,by="Feature",copy=FALSE) %>% 
  filter(!BIOTYPE%in%c("protein_coding")) %>% 
  filter(!grepl("ENSP",ENSP)) %>% 
  filter(!grepl("pseudogene",BIOTYPE)) %>% 
  select(SYMBOL,Feature,Gene,BIOTYPE) %>% 
  distinct(.keep_all = FALSE)

AgilentSSv6_list_ENSTcanonical_lincRNA <- read.delim("GnomAD.gene.stats.v2.RF.only.agilent.v6.only.tsv", header=TRUE, row.names=NULL, stringsAsFactors=FALSE) %>% 
  filter(CANONICAL=="YES") %>% 
  dplyr::rename("SYMBOL"="X.Symbol") %>% 
  dplyr::rename("Gene"="ENSG") %>% 
  left_join(ensembl_biotypes,by="Feature",copy=FALSE) %>% 
  filter(BIOTYPE%in%"lincRNA") %>% 
  select(SYMBOL,Feature,Gene,BIOTYPE) %>% 
  distinct(.keep_all = FALSE)

GnomAD_stats_ALL_VEP_ENSTcanonical_non_protein_coding <- read.delim("agilent_v6_gnomad_v2.1.1_genestats_canonical_allIMPACTS.tsv", header=TRUE, row.names=NULL, stringsAsFactors=FALSE) %>% 
  filter(Feature%in%(AgilentSSv6_list_ENSTcanonical_non_protein_coding$Feature)) %>% 
  dplyr::rename("Gene"="ENSG")

GnomAD_stats_ALL_VEP_ENSTcanonical_lincRNA <- read.delim("agilent_v6_gnomad_v2.1.1_genestats_canonical_allIMPACTS.tsv", header=TRUE, row.names=NULL, stringsAsFactors=FALSE) %>% 
  filter(Feature%in%(AgilentSSv6_list_ENSTcanonical_lincRNA$Feature)) %>% 
  dplyr::rename("Gene"="ENSG")

masterfile_eoc_NPC_goodQ_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats <- left_join(masterfile_eoc_NPC_goodQ_ALL_ENSTcanonical_sampleAF0.01_2, GnomAD_stats_ALL_VEP_ENSTcanonical_non_protein_coding[,c(2,14:19)],by="Feature",copy=FALSE) 
masterfile_eoc_NPC_goodQ_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats2 <- mutate_at(masterfile_eoc_NPC_goodQ_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats,vars(starts_with("FILTER_")),funs(replace(., is.na(.), 0))) %>% 
  mutate_if(grepl("popmax$", names(.)),funs(ifelse(. == "NA", 0, as.numeric(.)))) %>% 
  mutate_at(vars(ends_with("popmax")),funs(replace(., is.na(.), 0)))

masterfile_eoc_NPC_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats <- left_join(masterfile_eoc_NPC_goodQ2_ALL_ENSTcanonical_sampleAF0.01_2, GnomAD_stats_ALL_VEP_ENSTcanonical_non_protein_coding[,c(2,14:19)],by="Feature",copy=FALSE) 
masterfile_eoc_NPC_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats2 <- mutate_at(masterfile_eoc_NPC_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats,vars(starts_with("FILTER_")),funs(replace(., is.na(.), 0))) %>% 
  mutate_if(grepl("popmax$", names(.)),funs(ifelse(. == "NA", 0, as.numeric(.)))) %>% 
  mutate_at(vars(ends_with("popmax")),funs(replace(., is.na(.), 0)))

masterfile_eoc_lincRNA_goodQ_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats <- left_join(masterfile_eoc_lincRNA_goodQ_ALL_ENSTcanonical_sampleAF0.01_2, GnomAD_stats_ALL_VEP_ENSTcanonical_lincRNA[,c(2,14:19)],by="Feature",copy=FALSE) 
masterfile_eoc_lincRNA_goodQ_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats2 <- mutate_at(masterfile_eoc_lincRNA_goodQ_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats,vars(starts_with("FILTER_")),funs(replace(., is.na(.), 0))) %>% 
  mutate_if(grepl("popmax$", names(.)),funs(ifelse(. == "NA", 0, as.numeric(.)))) %>% 
  mutate_at(vars(ends_with("popmax")),funs(replace(., is.na(.), 0)))

masterfile_eoc_lincRNA_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats <- left_join(masterfile_eoc_lincRNA_goodQ2_ALL_ENSTcanonical_sampleAF0.01_2, GnomAD_stats_ALL_VEP_ENSTcanonical_lincRNA[,c(2,14:19)],by="Feature",copy=FALSE) 
masterfile_eoc_lincRNA_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats2 <- mutate_at(masterfile_eoc_lincRNA_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats,vars(starts_with("FILTER_")),funs(replace(., is.na(.), 0))) %>% 
  mutate_if(grepl("popmax$", names(.)),funs(ifelse(. == "NA", 0, as.numeric(.)))) %>% 
  mutate_at(vars(ends_with("popmax")),funs(replace(., is.na(.), 0)))

Calculate Sample/GnomAD gene freq and allele freq ratios for total and NFE GnomAD figures

masterfile_eoc_NPC_goodQ_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats2 <- mutate(masterfile_eoc_NPC_goodQ_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats,
"Sample_Gene_ALL_Freq_Ratio_GnomAD_0.005"=Sample_Gene_Freq/FILTER_RF_LOW.MODERATE.HIGH_AF_0.005_ADJ,      "Sample_Gene_ALL_Freq_Ratio_GnomAD_NFE_0.005"=Sample_Gene_Freq/FILTER_RF_LOW.MODERATE.HIGH_AF_0.005_NFE_ADJ,      "Sample_Gene_ALL_AF_Ratio"=Sample_AF/GnomAD_v2.1_non_cancer_AF, 
"Sample_Gene_ALL_AF_Ratio_NFE"=Sample_AF/GnomAD_v2.1_non_cancer_AF_nfe)

masterfile_eoc_NPC_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats2 <- mutate(masterfile_eoc_NPC_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats, "Sample_Gene_ALL_Freq_Ratio_GnomAD_0.005"=Sample_Gene_Freq/FILTER_RF_LOW.MODERATE.HIGH_AF_0.005_ADJ,      "Sample_Gene_ALL_Freq_Ratio_GnomAD_NFE_0.005"=Sample_Gene_Freq/FILTER_RF_LOW.MODERATE.HIGH_AF_0.005_NFE_ADJ,      "Sample_Gene_ALL_AF_Ratio"=Sample_AF/GnomAD_v2.1_non_cancer_AF, 
"Sample_Gene_ALL_AF_Ratio_NFE"=Sample_AF/GnomAD_v2.1_non_cancer_AF_nfe)

masterfile_eoc_lincRNA_goodQ_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats2 <- mutate(masterfile_eoc_lincRNA_goodQ_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats,
"Sample_Gene_ALL_Freq_Ratio_GnomAD_0.005"=Sample_Gene_Freq/FILTER_RF_LOW.MODERATE.HIGH_AF_0.005_ADJ,      "Sample_Gene_ALL_Freq_Ratio_GnomAD_NFE_0.005"=Sample_Gene_Freq/FILTER_RF_LOW.MODERATE.HIGH_AF_0.005_NFE_ADJ,      "Sample_Gene_ALL_AF_Ratio"=Sample_AF/GnomAD_v2.1_non_cancer_AF, 
"Sample_Gene_ALL_AF_Ratio_NFE"=Sample_AF/GnomAD_v2.1_non_cancer_AF_nfe)

masterfile_eoc_lincRNA_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats2 <- mutate(masterfile_eoc_lincRNA_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats, "Sample_Gene_ALL_Freq_Ratio_GnomAD_0.005"=Sample_Gene_Freq/FILTER_RF_LOW.MODERATE.HIGH_AF_0.005_ADJ,      "Sample_Gene_ALL_Freq_Ratio_GnomAD_NFE_0.005"=Sample_Gene_Freq/FILTER_RF_LOW.MODERATE.HIGH_AF_0.005_NFE_ADJ,      "Sample_Gene_ALL_AF_Ratio"=Sample_AF/GnomAD_v2.1_non_cancer_AF, 
"Sample_Gene_ALL_AF_Ratio_NFE"=Sample_AF/GnomAD_v2.1_non_cancer_AF_nfe)

Create data frames with Agilent SureSelect whole exome genes (ENST and non-protein-coding) for volcano plots

allGenes_AgilentSSv6_list_non_protein_coding_only <- GnomAD_stats_ALL_VEP_ENSTcanonical_non_protein_coding[,c(1:3,6,10)] %>% 
  filter(MAX_AN_NFE!=0) %>% 
  left_join(AgilentSSv6_list_ENSTcanonical_non_protein_coding[,c(2,4)],by="Feature",copy=FALSE)

allGenes_masterfile_eoc_NPC_goodQ2_ALL_ENSTcanonical_sampleAF0.01_biotypeNPC_list <- tibble("SYMBOL"=masterfile_eoc_NPC_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats2$SYMBOL,"Gene"=masterfile_eoc_NPC_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats2$Gene,"BIOTYPE"=masterfile_eoc_NPC_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats2$BIOTYPE) %>% 
  distinct(.keep_all=TRUE) %>% 
  arrange(SYMBOL)

allGenes_masterfile_eoc_lincRNA_goodQ2_ALL_ENSTcanonical_sampleAF0.01_biotype_lincRNA_list <- tibble("SYMBOL"=masterfile_eoc_lincRNA_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats2$SYMBOL,"Gene"=masterfile_eoc_lincRNA_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats2$Gene,"BIOTYPE"=masterfile_eoc_lincRNA_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats2$BIOTYPE) %>% 
  distinct(.keep_all=TRUE) %>% 
  arrange(SYMBOL)

volcano_sampleAF0.01_biotypeNPC_allGenes <- masterfile_eoc_NPC_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats2 %>% 
  mutate(Total_Case_Alleles=(n_distinct(masterfile_eoc_NPC_goodQ2_ALL$Sample)*2)) %>% 
  select(SYMBOL,Gene,Total_Gene_Count,Total_Case_Alleles,Sample_Gene_ALL_Freq_Ratio_GnomAD_0.005,Sample_Gene_ALL_Freq_Ratio_GnomAD_NFE_0.005) %>% 
  distinct(SYMBOL,Gene,Total_Gene_Count,Total_Case_Alleles,Sample_Gene_ALL_Freq_Ratio_GnomAD_0.005,Sample_Gene_ALL_Freq_Ratio_GnomAD_NFE_0.005) 
volcano_sampleAF0.01_biotypeNPC_allGenes2 <- left_join(allGenes_AgilentSSv6_list_non_protein_coding_only,volcano_sampleAF0.01_biotypeNPC_allGenes[,c(2:4)],by="Gene",copy=FALSE) %>% 
  bind_rows(anti_join(allGenes_AgilentSSv6_list_non_protein_coding_only[,c(1,3)],volcano_sampleAF0.01_biotypeNPC_allGenes,by="Gene")) %>% 
  mutate_at(vars("Total_Gene_Count"),funs(replace(., is.na(.), 0))) %>% 
  mutate_at(vars("Total_Case_Alleles"),funs(replace(., is.na(.), n_distinct(masterfile_eoc_NPC_goodQ2_ALL$Sample)*2))) %>% 
  distinct(SYMBOL,Gene,Total_Gene_Count,Total_Case_Alleles) 
volcano_sampleAF0.01_biotypeNPC_allGenes3 <- right_join(
  volcano_sampleAF0.01_biotypeNPC_allGenes2,
  GnomAD_stats_ALL_VEP_ENSTcanonical_non_protein_coding,
  by="Gene",copy=FALSE) %>% 
  select("SYMBOL.x","Gene","Total_Gene_Count","Total_Case_Alleles","FILTER_RF_LOW.MODERATE.HIGH_AC_0.005","FILTER_RF_LOW.MODERATE.HIGH_AC_0.005_NFE","MAX_AN","MAX_AN_NFE","FILTER_RF_LOW.MODERATE.HIGH_AF_0.005_ADJ","FILTER_RF_LOW.MODERATE.HIGH_AF_0.005_NFE_ADJ") %>% 
  dplyr::rename("SYMBOL"="SYMBOL.x") %>% 
  mutate(FILTER_RF_LOW.MODERATE.HIGH_AC_0.005_ADJ=round((MAX_AN*FILTER_RF_LOW.MODERATE.HIGH_AF_0.005_ADJ))) %>% 
  mutate(FILTER_RF_LOW.MODERATE.HIGH_AC_0.005_NFE_ADJ=round((MAX_AN_NFE*FILTER_RF_LOW.MODERATE.HIGH_AF_0.005_NFE_ADJ))) %>% 
  distinct(.keep_all=TRUE) %>% 
  filter(!is.na(MAX_AN_NFE)) %>% 
  filter(MAX_AN_NFE!=0)

volcano_sampleAF0.01_biotype_lincRNA_allGenes <- masterfile_eoc_lincRNA_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats2 %>% 
  mutate(Total_Case_Alleles=(n_distinct(masterfile_eoc_NPC_goodQ2$Sample)*2)) %>% 
  # filter(!BIOTYPE%in%c("protein_coding")) %>% 
  select(SYMBOL,Gene,Total_Gene_Count,Total_Case_Alleles,Sample_Gene_ALL_Freq_Ratio_GnomAD_0.005,Sample_Gene_ALL_Freq_Ratio_GnomAD_NFE_0.005) %>% 
  distinct(SYMBOL,Gene,Total_Gene_Count,Total_Case_Alleles,Sample_Gene_ALL_Freq_Ratio_GnomAD_0.005,Sample_Gene_ALL_Freq_Ratio_GnomAD_NFE_0.005) 
volcano_sampleAF0.01_biotype_lincRNA_allGenes2 <- left_join(allGenes_AgilentSSv6_list_non_protein_coding_only,volcano_sampleAF0.01_biotype_lincRNA_allGenes[,c(2:4)],by="Gene",copy=FALSE) %>% 
  bind_rows(anti_join(allGenes_AgilentSSv6_list_non_protein_coding_only[,c(1,3)],volcano_sampleAF0.01_biotype_lincRNA_allGenes,by="Gene")) %>% 
  mutate_at(vars("Total_Gene_Count"),funs(replace(., is.na(.), 0))) %>% 
  mutate_at(vars("Total_Case_Alleles"),funs(replace(., is.na(.), n_distinct(masterfile_eoc_NPC_goodQ2$Sample)*2))) %>% 
  distinct(SYMBOL,Gene,Total_Gene_Count,Total_Case_Alleles) 
volcano_sampleAF0.01_biotype_lincRNA_allGenes3 <- right_join(
  volcano_sampleAF0.01_biotype_lincRNA_allGenes2,
  GnomAD_stats_ALL_VEP_ENSTcanonical_non_protein_coding,
  by="Gene",copy=FALSE) %>% 
  select("SYMBOL.x","Gene","Total_Gene_Count","Total_Case_Alleles","FILTER_RF_LOW.MODERATE.HIGH_AC_0.005","FILTER_RF_LOW.MODERATE.HIGH_AC_0.005_NFE","MAX_AN","MAX_AN_NFE","FILTER_RF_LOW.MODERATE.HIGH_AF_0.005_ADJ","FILTER_RF_LOW.MODERATE.HIGH_AF_0.005_NFE_ADJ") %>% 
  dplyr::rename("SYMBOL"="SYMBOL.x") %>% 
  mutate(FILTER_RF_LOW.MODERATE.HIGH_AC_0.005_ADJ=round((MAX_AN*FILTER_RF_LOW.MODERATE.HIGH_AF_0.005_ADJ))) %>% 
  mutate(FILTER_RF_LOW.MODERATE.HIGH_AC_0.005_NFE_ADJ=round((MAX_AN_NFE*FILTER_RF_LOW.MODERATE.HIGH_AF_0.005_NFE_ADJ))) %>% 
  distinct(.keep_all=TRUE) %>% 
  filter(!is.na(MAX_AN_NFE)) %>% 
  filter(MAX_AN_NFE!=0)

Calculate Fisher’s test values (Ensembl Canonical)

masterfile_eoc_NPC_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats_fisherresults <- mutate(masterfile_eoc_NPC_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats2,Total_Case_Alleles=(n_distinct(masterfile_eoc_NPC_goodQ2_ALL$Sample)*2)) %>% 
  left_join(GnomAD_stats_ALL_VEP_ENSTcanonical_non_protein_coding[,c(2,6,10)],by="Feature",copy=FALSE) 
masterfile_eoc_NPC_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats_fisherresults2 <- mutate(masterfile_eoc_NPC_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats_fisherresults, MAX_AN=replace(masterfile_eoc_NPC_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats_fisherresults$MAX_AN, is.na(masterfile_eoc_NPC_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats_fisherresults$MAX_AN), max(masterfile_eoc_NPC_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats_fisherresults$MAX_AN, na.rm=TRUE))) %>% 
  filter(!is.na(MAX_AN_NFE)&
           MAX_AN_NFE!=0)

masterfile_eoc_lincRNA_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats_fisherresults <- mutate(masterfile_eoc_lincRNA_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats2,Total_Case_Alleles=(n_distinct(masterfile_eoc_NPC_goodQ2$Sample)*2)) %>% 
  left_join(GnomAD_stats_ALL_VEP_ENSTcanonical_non_protein_coding[,c(2,6,10)],by="Feature",copy=FALSE) 
masterfile_eoc_lincRNA_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats_fisherresults2 <- mutate(masterfile_eoc_lincRNA_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats_fisherresults, MAX_AN=replace(masterfile_eoc_lincRNA_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats_fisherresults$MAX_AN, is.na(masterfile_eoc_lincRNA_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats_fisherresults$MAX_AN), max(masterfile_eoc_lincRNA_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats_fisherresults$MAX_AN, na.rm=TRUE))) %>% 
  filter(!is.na(MAX_AN_NFE)&
           MAX_AN_NFE!=0)

fisherresults_ENST <- apply(masterfile_eoc_NPC_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats_fisherresults2[,c(262,264,274,275)],1,function(x) {fisher.test(rbind(x[1:2],c(x[3]-x[1],x[4]-x[2])))})

masterfile_eoc_NPC_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats_fisherresults2$P_value_Fisher_0.005 = sapply(fisherresults_ENST,function(x) round(x$p.value,12))
masterfile_eoc_NPC_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats_fisherresults2$OR_0.005 = sapply(fisherresults_ENST,function(x) round(x$estimate,4))
masterfile_eoc_NPC_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats_fisherresults2$`95%CI_0.005` = sapply(fisherresults_ENST,function(x) paste(round(x$conf.int[1:2],2),collapse="~"))
masterfile_eoc_NPC_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats_fisherresults2$BH_0.005 = p.adjust(masterfile_eoc_NPC_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats_fisherresults2$P_value_Fisher_0.005,method = "BH")

fisherresults2_ENST <- apply(masterfile_eoc_NPC_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats_fisherresults2[,c(262,267,274,276)],1,function(x) {fisher.test(rbind(x[1:2],c(x[3]-x[1],x[4]-x[2])))})

masterfile_eoc_NPC_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats_fisherresults2$P_value_Fisher_0.005_NFE = sapply(fisherresults2_ENST,function(x) round(x$p.value,12))
masterfile_eoc_NPC_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats_fisherresults2$OR_0.005_NFE = sapply(fisherresults2_ENST,function(x) round(x$estimate,4))
masterfile_eoc_NPC_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats_fisherresults2$`95%CI_0.005_NFE` = sapply(fisherresults2_ENST,function(x) paste(round(x$conf.int[1:2],2),collapse="~"))
masterfile_eoc_NPC_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats_fisherresults2$BH_0.005_NFE = p.adjust(masterfile_eoc_NPC_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats_fisherresults2$P_value_Fisher_0.005_NFE,method = "BH")

fisherresults_ENST <- apply(masterfile_eoc_lincRNA_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats_fisherresults2[,c(262,264,274,275)],1,function(x) {fisher.test(rbind(x[1:2],c(x[3]-x[1],x[4]-x[2])))})

masterfile_eoc_lincRNA_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats_fisherresults2$P_value_Fisher_0.005 = sapply(fisherresults_ENST,function(x) round(x$p.value,12))
masterfile_eoc_lincRNA_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats_fisherresults2$OR_0.005 = sapply(fisherresults_ENST,function(x) round(x$estimate,4))
masterfile_eoc_lincRNA_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats_fisherresults2$`95%CI_0.005` = sapply(fisherresults_ENST,function(x) paste(round(x$conf.int[1:2],2),collapse="~"))
masterfile_eoc_lincRNA_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats_fisherresults2$BH_0.005 = p.adjust(masterfile_eoc_lincRNA_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats_fisherresults2$P_value_Fisher_0.005,method = "BH")

fisherresults2_ENST <- apply(masterfile_eoc_lincRNA_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats_fisherresults2[,c(262,267,274,276)],1,function(x) {fisher.test(rbind(x[1:2],c(x[3]-x[1],x[4]-x[2])))})

masterfile_eoc_lincRNA_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats_fisherresults2$P_value_Fisher_0.005_NFE = sapply(fisherresults2_ENST,function(x) round(x$p.value,12))
masterfile_eoc_lincRNA_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats_fisherresults2$OR_0.005_NFE = sapply(fisherresults2_ENST,function(x) round(x$estimate,4))
masterfile_eoc_lincRNA_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats_fisherresults2$`95%CI_0.005_NFE` = sapply(fisherresults2_ENST,function(x) paste(round(x$conf.int[1:2],2),collapse="~"))
masterfile_eoc_lincRNA_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats_fisherresults2$BH_0.005_NFE = p.adjust(masterfile_eoc_lincRNA_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats_fisherresults2$P_value_Fisher_0.005_NFE,method = "BH")

Calculate Fisher’s test values (volcano plot)

fisherresults_biotypeNPC_allGenes_volcano <- apply(volcano_sampleAF0.01_biotypeNPC_allGenes3[,c(3,5,4,7)],1,function(x) {fisher.test(rbind(x[1:2],c(abs(x[3]-x[1]),abs(x[4]-x[2]))))})

volcano_sampleAF0.01_biotypeNPC_allGenes3$P_value_Fisher_0.005 = sapply(fisherresults_biotypeNPC_allGenes_volcano,function(x) round(x$p.value,15))
volcano_sampleAF0.01_biotypeNPC_allGenes3$OR_0.005 = sapply(fisherresults_biotypeNPC_allGenes_volcano,function(x) round(x$estimate,15))
volcano_sampleAF0.01_biotypeNPC_allGenes3$`95%CI_0.005` = sapply(fisherresults_biotypeNPC_allGenes_volcano,function(x) paste(round(x$conf.int[1:2],2),collapse="~"))
volcano_sampleAF0.01_biotypeNPC_allGenes3$BH_0.005 = p.adjust(volcano_sampleAF0.01_biotypeNPC_allGenes3$P_value_Fisher_0.005,method = "BH")

fisherresults_biotypeNPC_NFE_volcano <- apply(volcano_sampleAF0.01_biotypeNPC_allGenes3[,c(3,6,4,8)],1,function(x) {fisher.test(rbind(x[1:2],c(abs(x[3]-x[1]),abs(x[4]-x[2]))))})

volcano_sampleAF0.01_biotypeNPC_allGenes3$P_value_Fisher_0.005_NFE = sapply(fisherresults_biotypeNPC_NFE_volcano,function(x) round(x$p.value,15))
volcano_sampleAF0.01_biotypeNPC_allGenes3$OR_0.005_NFE = sapply(fisherresults_biotypeNPC_NFE_volcano,function(x) round(x$estimate,15))
volcano_sampleAF0.01_biotypeNPC_allGenes3$`95%CI_0.005_NFE` = sapply(fisherresults_biotypeNPC_NFE_volcano,function(x) paste(round(x$conf.int[1:2],2),collapse="~"))
volcano_sampleAF0.01_biotypeNPC_allGenes3$BH_0.005_NFE = p.adjust(volcano_sampleAF0.01_biotypeNPC_allGenes3$P_value_Fisher_0.005_NFE,method = "BH")

fisherresults_biotype_lincRNA_allGenes_volcano <- apply(volcano_sampleAF0.01_biotype_lincRNA_allGenes3[,c(3,5,4,7)],1,function(x) {fisher.test(rbind(x[1:2],c(abs(x[3]-x[1]),abs(x[4]-x[2]))))})

volcano_sampleAF0.01_biotype_lincRNA_allGenes3$P_value_Fisher_0.005 = sapply(fisherresults_biotype_lincRNA_allGenes_volcano,function(x) round(x$p.value,15))
volcano_sampleAF0.01_biotype_lincRNA_allGenes3$OR_0.005 = sapply(fisherresults_biotype_lincRNA_allGenes_volcano,function(x) round(x$estimate,15))
volcano_sampleAF0.01_biotype_lincRNA_allGenes3$`95%CI_0.005` = sapply(fisherresults_biotype_lincRNA_allGenes_volcano,function(x) paste(round(x$conf.int[1:2],2),collapse="~"))
volcano_sampleAF0.01_biotype_lincRNA_allGenes3$BH_0.005 = p.adjust(volcano_sampleAF0.01_biotype_lincRNA_allGenes3$P_value_Fisher_0.005,method = "BH")

fisherresults_biotype_lincRNA_NFE_volcano <- apply(volcano_sampleAF0.01_biotype_lincRNA_allGenes3[,c(3,6,4,8)],1,function(x) {fisher.test(rbind(x[1:2],c(abs(x[3]-x[1]),abs(x[4]-x[2]))))})

volcano_sampleAF0.01_biotype_lincRNA_allGenes3$P_value_Fisher_0.005_NFE = sapply(fisherresults_biotype_lincRNA_NFE_volcano,function(x) round(x$p.value,15))
volcano_sampleAF0.01_biotype_lincRNA_allGenes3$OR_0.005_NFE = sapply(fisherresults_biotype_lincRNA_NFE_volcano,function(x) round(x$estimate,15))
volcano_sampleAF0.01_biotype_lincRNA_allGenes3$`95%CI_0.005_NFE` = sapply(fisherresults_biotype_lincRNA_NFE_volcano,function(x) paste(round(x$conf.int[1:2],2),collapse="~"))
volcano_sampleAF0.01_biotype_lincRNA_allGenes3$BH_0.005_NFE = p.adjust(volcano_sampleAF0.01_biotype_lincRNA_allGenes3$P_value_Fisher_0.005_NFE,method = "BH")

Identify variants with filtering AF > maximum credible population AF in GnomAD for hereditary EOC (0.000181)

masterfile_eoc_NPC_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats_fisherresults3 <- mutate(masterfile_eoc_NPC_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats_fisherresults2, FilteringAF_95_Exceeds_MaxCredPopAF=((GnomAD_v2.1_non_cancer_Filtering_AF_95)>0.000181)) %>% 
  mutate(FilteringAF_95_NFE_Exceeds_MaxCredPopAF=((GnomAD_v2.1_non_cancer_Filtering_AF_95_nfe)>0.000181))

masterfile_eoc_lincRNA_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats_fisherresults3 <- mutate(masterfile_eoc_lincRNA_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats_fisherresults2, FilteringAF_95_Exceeds_MaxCredPopAF=((GnomAD_v2.1_non_cancer_Filtering_AF_95)>0.000181)) %>% 
  mutate(FilteringAF_95_NFE_Exceeds_MaxCredPopAF=((GnomAD_v2.1_non_cancer_Filtering_AF_95_nfe)>0.000181))

Identify variants that PASS/FAIL the LOFTEE 50bp rule

masterfile_eoc_NPC_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats_fisherresults4 <- mutate(masterfile_eoc_NPC_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats_fisherresults3,LoF_50_BP_RULE = ifelse(grepl("50_BP_RULE:PASS", LoF_info), TRUE, ifelse(grepl("50_BP_RULE:FAIL", LoF_info), FALSE, NA)))

masterfile_eoc_lincRNA_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats_fisherresults4 <- mutate(masterfile_eoc_lincRNA_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats_fisherresults3,LoF_50_BP_RULE = ifelse(grepl("50_BP_RULE:PASS", LoF_info), TRUE, ifelse(grepl("50_BP_RULE:FAIL", LoF_info), FALSE, NA)))

Import NMD depleted gene list and annotate variants accordingly

NMD_depleted_gene_list <- read.csv("NMD_depleted_gene_list.csv", stringsAsFactors=FALSE) %>% 
  rename("SYMBOL" = gene) %>% 
  rename("Feature" = txnames) %>% 
  rename("NMD_predictor_rank" = min.rank)

masterfile_eoc_NPC_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats_fisherresults5 <- left_join(masterfile_eoc_NPC_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats_fisherresults4, NMD_depleted_gene_list[,c(2:3)],by="Feature",copy=FALSE)

masterfile_eoc_lincRNA_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats_fisherresults5 <- left_join(masterfile_eoc_lincRNA_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats_fisherresults4, NMD_depleted_gene_list[,c(2:3)],by="Feature",copy=FALSE) 

Remove redundant objects and output data

rm(list=(ls(pattern="^genes")))
rm(list=(ls(pattern="^variants")))
rm(list=(ls(pattern="^fisherresults")))

write_excel_csv(masterfile_eoc_NPC_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats_fisherresults5,file="masterfile_eoc_NPC_goodQ2_ALL_ENSTcanonical_sampleAF0.01_biotypeNPC.csv",na=".",append=FALSE,col_names=TRUE)

write_excel_csv(masterfile_eoc_lincRNA_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats_fisherresults5,file="masterfile_eoc_lincRNA_goodQ2_ALL_ENSTcanonical_sampleAF0.01_biotype_lincRNA.csv",na=".",append=FALSE,col_names=TRUE)

Calculate total LoF variants in sample vs total LoF variants in GnomAD non-cancer NFE and use for chi-squared test

oddsratio <- function (a, b = NULL, c = NULL, d = NULL, conf.level = 0.95, 
    p.calc.by.independence = TRUE) 
{
    if (is.matrix(a)) {
        if ((dim(a)[1] != 2L) | (dim(a)[2] != 2L)) {
            stop("Input matrix must be a 2x2 table.")
        }
        .a <- a[1, 1]
        .b <- a[1, 2]
        .c <- a[2, 1]
        .d <- a[2, 2]
        .data.name <- deparse(substitute(a))
    }
    else {
        .a <- a
        .b <- b
        .c <- c
        .d <- d
        .data.name <- paste(deparse(substitute(a)), deparse(substitute(b)), 
            deparse(substitute(c)), deparse(substitute(d)))
    }
    .MAT <- matrix(c(.a, .b, M1 <- .a + .b, .c, .d, M0 <- .c + 
        .d, N1 <- .a + .c, N0 <- .b + .d, Total <- .a + .b + 
        .c + .d), 3, 3)
    colnames(.MAT) <- c("Sample", "GnomAD", "Total") #("Disease", "Nondisease", "Total")
    rownames(.MAT) <- c("Variants", "No variants", "Total") #("Exposed", "Nonexposed", "Total")
    class(.MAT) <- "table"
    print(.MAT)
    ESTIMATE <- (.a /.b)/(.c/.d)
    norm.pp <- qnorm(1 - (1 - conf.level)/2)
    if (p.calc.by.independence) {
        p.v <- 2 * (1 - pnorm(abs((.a - N1 * M1/Total)/sqrt(N1 * 
            N0 * M1 * M0/Total/Total/(Total - 1)))))
    }
    else {
        p.v <- 2 * (1 - pnorm(log(ifelse(ESTIMATE > 1, ESTIMATE, 
            1/ESTIMATE))/sqrt(1/.a + 1/.b + 1/.c + 1/.d)))
    }
    ORL <- ESTIMATE * exp(-norm.pp * sqrt(1/.a + 1/.b + 1/.c + 
        1/.d))
    ORU <- ESTIMATE * exp(norm.pp * sqrt(1/.a + 1/.b + 1/.c + 
        1/.d)) %>% signif(digits=7)
    CINT <- paste(signif(ORL,digits = 7),signif(ORU,digits = 7),sep="~")
    attr(CINT, "conf.level") <- conf.level
    RVAL <- list(p.value = p.v, conf.int = CINT, estimate = ESTIMATE, 
        method = "Odds ratio estimate and its significance probability", 
        data.name = .data.name)
    class(RVAL) <- "htest"
    return(RVAL)
}

a <- sum(volcano_sampleAF0.01_biotypeNPC_allGenes3$Total_Gene_Count) %>% as.numeric()
b <- n_distinct(masterfile_eoc_NPC_goodQ2_ALL$Sample)*2*(nrow(allGenes_AgilentSSv6_list_non_protein_coding_only)) %>% as.numeric()
b1 <- b-a %>% as.numeric()
c <- sum(volcano_sampleAF0.01_biotypeNPC_allGenes3$FILTER_RF_LOW.MODERATE.HIGH_AC_0.005) %>% as.numeric()
d <- (max(volcano_sampleAF0.01_biotypeNPC_allGenes3$MAX_AN))*nrow(allGenes_AgilentSSv6_list_non_protein_coding_only) %>% as.numeric()
d1 <- d-c %>% as.numeric()
e <- sum(volcano_sampleAF0.01_biotypeNPC_allGenes3$FILTER_RF_LOW.MODERATE.HIGH_AC_0.005_NFE) %>% as.numeric()
f <- (max(volcano_sampleAF0.01_biotypeNPC_allGenes3$MAX_AN_NFE))*nrow(allGenes_AgilentSSv6_list_non_protein_coding_only) %>% as.numeric()
f1 <- f-e %>% as.numeric()

chiX2_0.005_biotypeNPC <- matrix(c(a,b1,c,d1),nrow=2,byrow=TRUE)
chiX2_0.005_NFE_biotypeNPC <- matrix(c(a,b1,e,f1),nrow=2,byrow=TRUE)

sink(file="masterfile_eoc_ALL_volcano_chiX2_results.txt",append=TRUE)

chisq.test(chiX2_0.005_biotypeNPC,correct=FALSE)
oddsratio(chiX2_0.005_biotypeNPC)
chisq.test(chiX2_0.005_NFE_biotypeNPC,correct=FALSE)
oddsratio(chiX2_0.005_NFE_biotypeNPC)

sink()

g <- sum(volcano_sampleAF0.01_biotype_lincRNA_allGenes3$Total_Gene_Count) %>% as.numeric()
h <- n_distinct(masterfile_eoc_NPC_goodQ2$Sample)*2*(nrow(allGenes_AgilentSSv6_list_non_protein_coding_only)) %>% as.numeric()
h1 <- h-g %>% as.numeric()
i <- sum(volcano_sampleAF0.01_biotype_lincRNA_allGenes3$FILTER_RF_LOW.MODERATE.HIGH_AC_0.005) %>% as.numeric()
j <- (max(volcano_sampleAF0.01_biotype_lincRNA_allGenes3$MAX_AN))*nrow(allGenes_AgilentSSv6_list_non_protein_coding_only) %>% as.numeric()
j1 <- j-i %>% as.numeric()
k <- sum(volcano_sampleAF0.01_biotype_lincRNA_allGenes3$FILTER_RF_LOW.MODERATE.HIGH_AC_0.005_NFE) %>% as.numeric()
l <- (max(volcano_sampleAF0.01_biotype_lincRNA_allGenes3$MAX_AN_NFE))*nrow(allGenes_AgilentSSv6_list_non_protein_coding_only) %>% as.numeric()
l1 <- l-k %>% as.numeric()

chiX2_0.005_biotype_lincRNA <- matrix(c(g,h1,i,j1),nrow=2,byrow=TRUE)
chiX2_0.005_NFE_biotype_lincRNA <- matrix(c(g,h1,k,l1),nrow=2,byrow=TRUE)

sink(file="masterfile_eoc_ALL_volcano_chiX2_results.txt",append=TRUE)

chisq.test(chiX2_0.005_biotype_lincRNA,correct=FALSE)
oddsratio(chiX2_0.005_biotype_lincRNA)
chisq.test(chiX2_0.005_NFE_biotype_lincRNA,correct=FALSE)
oddsratio(chiX2_0.005_NFE_biotype_lincRNA)

sink()

Output data for log p-value volcano plots

volcano_sampleAF0.01_biotypeNPC_allGenes4 <- mutate(volcano_sampleAF0.01_biotypeNPC_allGenes3,"Log10_P-value_Fisher_0.05_NFE" = log10(P_value_Fisher_0.005_NFE)) %>% 
  arrange(desc(OR_0.005_NFE)) %>% 
  mutate("Plotted_Log10_P-value_Fisher_0.05_NFE"= 
  if_else (`Log10_P-value_Fisher_0.05_NFE`<0, `Log10_P-value_Fisher_0.05_NFE`*-1, `Log10_P-value_Fisher_0.05_NFE`, missing=NULL)) %>% 
  arrange(desc(`Plotted_Log10_P-value_Fisher_0.05_NFE`),desc(OR_0.005_NFE)) %>% 
  left_join(AgilentSSv6_list_ENSTcanonical_non_protein_coding[,c(3,4)],by="Gene",copy="FALSE") %>% 
  left_join(volcano_sampleAF0.01_biotypeNPC_allGenes[,c(2,5,6)],by="Gene",copy="FALSE") %>% 
  write_excel_csv(file="volcano_sampleAF0.01_biotypeNPC_allGenes_v6.csv",na=".",append=FALSE,col_names=TRUE)

volcano_sampleAF0.01_biotype_lincRNA_allGenes4 <- mutate(volcano_sampleAF0.01_biotype_lincRNA_allGenes3,"Log10_P-value_Fisher_0.05_NFE" = log10(P_value_Fisher_0.005_NFE)) %>% 
  arrange(desc(OR_0.005_NFE)) %>% 
  mutate("Plotted_Log10_P-value_Fisher_0.05_NFE"= 
  if_else (`Log10_P-value_Fisher_0.05_NFE`<0, `Log10_P-value_Fisher_0.05_NFE`*-1, `Log10_P-value_Fisher_0.05_NFE`, missing=NULL)) %>% 
  arrange(desc(`Plotted_Log10_P-value_Fisher_0.05_NFE`),desc(OR_0.005_NFE)) %>% 
  left_join(AgilentSSv6_list_ENSTcanonical_non_protein_coding[,c(3,4)],by="Gene",copy="FALSE") %>% 
  left_join(volcano_sampleAF0.01_biotype_lincRNA_allGenes[,c(2,5,6)],by="Gene",copy="FALSE") %>% 
  write_excel_csv(file="volcano_sampleAF0.01_biotype_lincRNA_allGenes_v6.csv",na=".",append=FALSE,col_names=TRUE)
---
title: "Thesis Non-Protein-Coding Variants Filtering and Analysis Script"
output: html_notebook
---

ALL IMPORTED FILES MUST BE IN THE SAME DIRECTORY AS THIS SCRIPT

Load tidyverse package
```{r}
library("tidyverse")
```

Import raw vcf data from "masterfile" and exclude variants with GnomAD non-cancer AF > 0.005
```{r}
masterfile_consequence_1.3_or_LoF <- read.delim("masterfile_consequence_1.3_or_LoF.tsv", header=TRUE, row.names=NULL, na.strings = ".", stringsAsFactors=FALSE) %>% 
  rename("Sample"="X.Sample") %>% 
  filter(GnomAD_v2.1_non_cancer_AF<=0.005)

masterfile_consequence_4.6 <- read.delim("masterfile_consequence_4.6.tsv", header=TRUE, row.names=NULL, na.strings = ".", stringsAsFactors=FALSE) %>% 
  rename("Sample"="X.Sample") %>% 
  filter(GnomAD_v2.1_non_cancer_AF<=0.005)

masterfile <- union(masterfile_consequence_1.3_or_LoF,masterfile_consequence_4.6)
```

Exclude non-epithelial, uterine-only and BRCA +ve samples
```{r}
ViP_Complete_Cohort <- read.delim("ViP_LoF_Complete_Cohort.txt", stringsAsFactors=FALSE)
ViP_Complete_Cohort_list <- ViP_Complete_Cohort[,1]

masterfile_eoc <- filter(masterfile,Sample%in%c(ViP_Complete_Cohort_list))
rm(masterfile)
```

Append patient path data
```{r}
ViP_OvCa_Path_Data <- read.delim("ViP_OvCa_Path_Data.txt", header=TRUE, row.names=NULL, stringsAsFactors=FALSE) %>% 
  dplyr::rename("Sample"=Exome.ID)
masterfile_eoc_withPath <- left_join(masterfile_eoc,ViP_OvCa_Path_Data,by="Sample",copy=FALSE)
rm(masterfile_eoc)
```

Exclude low-quality variants, GnomAD RF/InbreedingCoeff-flagged variants and protein-coding transcripts
```{r}
masterfile_eoc_NPC_goodQ <- filter(masterfile_eoc_withPath, (QUAL>=30)&
                                 (Identified!="FilteredInAll")&
                                 (Sample.PMCDP>=10)&
                                 (Sample.PMCFREQ>=0.25)) %>% 
                                 filter((!str_detect(GnomAD_v2.1_FILTER_exome,"InbreedingCoeff")&
                                           !str_detect(GnomAD_v2.1_FILTER_exome,"RF"))|
                                       is.na(GnomAD_v2.1_FILTER_exome)) %>% 
  filter((!str_detect(GnomAD_v2.1_FILTER_genome,"InbreedingCoeff")&
            !str_detect(GnomAD_v2.1_FILTER_genome,"RF"))|
           is.na(GnomAD_v2.1_FILTER_genome)) %>% 
  filter(!BIOTYPE%in%c("protein_coding"))
rm(masterfile_eoc_withPath)
```

Exclude other mutation +ve samples (MLH1/MSH2/MSH6/PMS2, TP53, RAD51C/D, BRIP1)
```{r}
ViP_Discovery_Cohort <- read.delim("ViP_LoF_Discovery_Cohort.txt", stringsAsFactors=FALSE)
ViP_Discovery_Cohort_list <- ViP_Discovery_Cohort[,1]

masterfile_eoc_NPC_goodQ2 <- filter(masterfile_eoc_NPC_goodQ,Sample%in%c(ViP_Discovery_Cohort_list))
```

Exclude MODIFIER impact variants + any non-protein-coding-transcripts with a transcribed protein ENSP ID e.g. immune system genes + any 'pseudogenes' (+ GnomAD popmax AF > 0.005 if needed)
```{r}
masterfile_eoc_NPC_goodQ_ALL <- filter(masterfile_eoc_NPC_goodQ, IMPACT%in%c("HIGH","MODERATE","LOW")) %>% 
  filter(!grepl("ENSP",ENSP)) %>% 
  filter(!grepl("pseudogene",BIOTYPE)) %>% 
  distinct(.keep_all = FALSE)

masterfile_eoc_NPC_goodQ2_ALL <- filter(masterfile_eoc_NPC_goodQ2, IMPACT%in%c("HIGH","MODERATE","LOW")) %>% 
  filter(!grepl("ENSP",ENSP)) %>% 
  filter(!grepl("pseudogene",BIOTYPE)) %>% 
  distinct(.keep_all = FALSE)
```

Exclude MODIFIER impact variants + any non-protein-coding-transcripts other than 'lincRNAs' (+ GnomAD popmax AF > 0.005 if needed)
```{r}
masterfile_eoc_lincRNA_goodQ_ALL <- filter(masterfile_eoc_NPC_goodQ, IMPACT%in%c("HIGH","MODERATE","LOW")) %>% 
  filter(BIOTYPE%in%"lincRNA") %>% 
  distinct(.keep_all = FALSE)

masterfile_eoc_lincRNA_goodQ2_ALL <- filter(masterfile_eoc_NPC_goodQ2, IMPACT%in%c("HIGH","MODERATE","LOW")) %>% 
  filter(BIOTYPE%in%"lincRNA") %>% 
  distinct(.keep_all = FALSE)
```

Separate Ensembl canonical and RefSeq canonical variants
```{r}
masterfile_eoc_NPC_goodQ_ALL_ENSTcanonical <- filter(masterfile_eoc_NPC_goodQ_ALL,CANONICAL=="YES") 

masterfile_eoc_NPC_goodQ2_ALL_ENSTcanonical <- filter(masterfile_eoc_NPC_goodQ2_ALL,CANONICAL=="YES")

masterfile_eoc_lincRNA_goodQ_ALL_ENSTcanonical <- filter(masterfile_eoc_lincRNA_goodQ_ALL,CANONICAL=="YES") 

masterfile_eoc_lincRNA_goodQ2_ALL_ENSTcanonical <- filter(masterfile_eoc_lincRNA_goodQ2_ALL,CANONICAL=="YES") 
```

Sample variant counts and AFs
```{r}
variants_heterozygous <- masterfile_eoc_NPC_goodQ_ALL_ENSTcanonical %>% 
  select(HGVSc,Sample.GT) %>% 
  group_by(HGVSc,Sample.GT) %>% 
  summarise(n()) %>% 
  filter((Sample.GT%in%c("'0/1","'1/0"))) %>% 
  rename(alleles = "n()") %>% 
  group_by(HGVSc) %>% 
  summarise(sum(alleles))
variants_homozygous <- masterfile_eoc_NPC_goodQ_ALL_ENSTcanonical %>% 
  select(HGVSc,Sample.GT) %>% 
  group_by(HGVSc,Sample.GT) %>% 
  summarise(n()) %>% 
  filter((Sample.GT!="'0/1")&(Sample.GT!="'1/0")) %>% 
  mutate(alleles = `n()`*2) %>% 
  select(-`n()`) %>% 
  group_by(HGVSc) %>% 
  summarise(sum(alleles))
variants <- full_join(variants_heterozygous,variants_homozygous,by="HGVSc",copy=FALSE,suffix=c(".x",".y")) %>%
  mutate_all(funs(replace(., is.na(.), 0))) %>% 
  mutate(Total_Allele_Count=`sum(alleles).x`+`sum(alleles).y`) %>% 
  mutate(Sample_AF=Total_Allele_Count/(n_distinct(masterfile_eoc_NPC_goodQ2_ALL$Sample)*2))
masterfile_eoc_NPC_goodQ_ALL_ENSTcanonical_2 <- left_join(masterfile_eoc_NPC_goodQ_ALL_ENSTcanonical,variants[,c(1,4:5)],by="HGVSc",copy=FALSE)

variants_heterozygous <- masterfile_eoc_NPC_goodQ2_ALL_ENSTcanonical %>% 
  select(HGVSc,Sample.GT) %>% 
  group_by(HGVSc,Sample.GT) %>% 
  summarise(n()) %>% 
  filter((Sample.GT%in%c("'0/1","'1/0"))) %>% 
  rename(alleles = "n()") %>% 
  group_by(HGVSc) %>% 
  summarise(sum(alleles))
variants_homozygous <- masterfile_eoc_NPC_goodQ2_ALL_ENSTcanonical %>% 
  select(HGVSc,Sample.GT) %>% 
  group_by(HGVSc,Sample.GT) %>% 
  summarise(n()) %>% 
  filter((Sample.GT!="'0/1")&(Sample.GT!="'1/0")) %>% 
  mutate(alleles = `n()`*2) %>% 
  select(-`n()`) %>% 
  group_by(HGVSc) %>% 
  summarise(sum(alleles))
variants <- full_join(variants_heterozygous,variants_homozygous,by="HGVSc",copy=FALSE,suffix=c(".x",".y")) %>%
  mutate_all(funs(replace(., is.na(.), 0))) %>% 
  mutate(Total_Allele_Count=`sum(alleles).x`+`sum(alleles).y`) %>% 
  mutate(Sample_AF=Total_Allele_Count/(n_distinct(masterfile_eoc_NPC_goodQ2_ALL$Sample)*2))
masterfile_eoc_NPC_goodQ2_ALL_ENSTcanonical_2 <- left_join(masterfile_eoc_NPC_goodQ2_ALL_ENSTcanonical,variants[,c(1,4:5)],by="HGVSc",copy=FALSE)

variants_heterozygous <- masterfile_eoc_lincRNA_goodQ_ALL_ENSTcanonical %>% 
  select(HGVSc,Sample.GT) %>% 
  group_by(HGVSc,Sample.GT) %>% 
  summarise(n()) %>% 
  filter((Sample.GT%in%c("'0/1","'1/0"))) %>% 
  rename(alleles = "n()") %>% 
  group_by(HGVSc) %>% 
  summarise(sum(alleles))
variants_homozygous <- masterfile_eoc_lincRNA_goodQ_ALL_ENSTcanonical %>% 
  select(HGVSc,Sample.GT) %>% 
  group_by(HGVSc,Sample.GT) %>% 
  summarise(n()) %>% 
  filter((Sample.GT!="'0/1")&(Sample.GT!="'1/0")) %>% 
  mutate(alleles = `n()`*2) %>% 
  select(-`n()`) %>% 
  group_by(HGVSc) %>% 
  summarise(sum(alleles))
variants <- full_join(variants_heterozygous,variants_homozygous,by="HGVSc",copy=FALSE,suffix=c(".x",".y")) %>%
  mutate_all(funs(replace(., is.na(.), 0))) %>% 
  mutate(Total_Allele_Count=`sum(alleles).x`+`sum(alleles).y`) %>% 
  mutate(Sample_AF=Total_Allele_Count/(n_distinct(masterfile_eoc_NPC_goodQ2$Sample)*2))
masterfile_eoc_lincRNA_goodQ_ALL_ENSTcanonical_2 <- left_join(masterfile_eoc_lincRNA_goodQ_ALL_ENSTcanonical,variants[,c(1,4:5)],by="HGVSc",copy=FALSE)

variants_heterozygous <- masterfile_eoc_lincRNA_goodQ2_ALL_ENSTcanonical %>% 
  select(HGVSc,Sample.GT) %>% 
  group_by(HGVSc,Sample.GT) %>% 
  summarise(n()) %>% 
  filter((Sample.GT%in%c("'0/1","'1/0"))) %>% 
  rename(alleles = "n()") %>% 
  group_by(HGVSc) %>% 
  summarise(sum(alleles))
variants_homozygous <- masterfile_eoc_lincRNA_goodQ2_ALL_ENSTcanonical %>% 
  select(HGVSc,Sample.GT) %>% 
  group_by(HGVSc,Sample.GT) %>% 
  summarise(n()) %>% 
  filter((Sample.GT!="'0/1")&(Sample.GT!="'1/0")) %>% 
  mutate(alleles = `n()`*2) %>% 
  select(-`n()`) %>% 
  group_by(HGVSc) %>% 
  summarise(sum(alleles))
variants <- full_join(variants_heterozygous,variants_homozygous,by="HGVSc",copy=FALSE,suffix=c(".x",".y")) %>%
  mutate_all(funs(replace(., is.na(.), 0))) %>% 
  mutate(Total_Allele_Count=`sum(alleles).x`+`sum(alleles).y`) %>% 
  mutate(Sample_AF=Total_Allele_Count/(n_distinct(masterfile_eoc_NPC_goodQ2$Sample)*2))
masterfile_eoc_lincRNA_goodQ2_ALL_ENSTcanonical_2 <- left_join(masterfile_eoc_lincRNA_goodQ2_ALL_ENSTcanonical,variants[,c(1,4:5)],by="HGVSc",copy=FALSE)
```

Exclude variants with sample AF >0.01
```{r}
masterfile_eoc_NPC_goodQ_ALL_ENSTcanonical_sampleAF0.01 <- filter(masterfile_eoc_NPC_goodQ_ALL_ENSTcanonical_2,Sample_AF < 0.01)

masterfile_eoc_NPC_goodQ2_ALL_ENSTcanonical_sampleAF0.01 <- filter(masterfile_eoc_NPC_goodQ2_ALL_ENSTcanonical_2,Sample_AF < 0.01)

masterfile_eoc_lincRNA_goodQ_ALL_ENSTcanonical_sampleAF0.01 <- filter(masterfile_eoc_lincRNA_goodQ_ALL_ENSTcanonical_2,Sample_AF < 0.01)

masterfile_eoc_lincRNA_goodQ2_ALL_ENSTcanonical_sampleAF0.01 <- filter(masterfile_eoc_lincRNA_goodQ2_ALL_ENSTcanonical_2,Sample_AF < 0.01)
```

Output list of genes and variants with sample AF >0.01
```{r}
masterfile_eoc_NPC_goodQ2_ALL_ENSTcanonical_genesandvariants0.01 <- filter(masterfile_eoc_NPC_goodQ2_ALL_ENSTcanonical_2,Sample_AF > 0.01)
masterfile_eoc_NPC_goodQ2_ALL_ENSTcanonical_genesandvariants0.01[,c(2:6,25:54)] %>% 
  distinct(.keep_all = FALSE) %>% 
  write_excel_csv(file="masterfile_eoc_NPC_goodQ2_ALL_ENSTcanonical_genesandvariants0.01.csv",na=".",append=FALSE,col_names=TRUE)

masterfile_eoc_lincRNA_goodQ2_ALL_ENSTcanonical_genesandvariants0.01 <- filter(masterfile_eoc_lincRNA_goodQ2_ALL_ENSTcanonical_2,Sample_AF > 0.01)
masterfile_eoc_lincRNA_goodQ2_ALL_ENSTcanonical_genesandvariants0.01[,c(2:6,25:54)] %>% 
  distinct(.keep_all = FALSE) %>% 
  write_excel_csv(file="masterfile_eoc_lincRNA_goodQ2_ALL_ENSTcanonical_genesandvariants0.01.csv",na=".",append=FALSE,col_names=TRUE)
```

Sample gene counts and frequencies
```{r}
genes_oneVarAllele <- masterfile_eoc_NPC_goodQ_ALL_ENSTcanonical_sampleAF0.01 %>% select(Gene,Sample.GT) %>% 
  select(Gene,Sample.GT) %>% 
  group_by(Gene,Sample.GT) %>% 
  summarise(n()) %>% 
  filter((Sample.GT%in%c("'0/1","'1/0"))) %>% 
  rename(gene_Var = "n()") %>% 
  group_by(Gene) %>% 
  summarise(sum(gene_Var))
genes_twoVarAllele <- masterfile_eoc_NPC_goodQ_ALL_ENSTcanonical_sampleAF0.01 %>% select(Gene,Sample.GT) %>% 
  select(Gene,Sample.GT) %>% 
  group_by(Gene,Sample.GT) %>% 
  summarise(n()) %>% 
  filter((Sample.GT!="'0/1")&(Sample.GT!="'1/0")) %>% 
  mutate(gene_Var = `n()`*2) %>% 
  select(-`n()`) %>% 
  group_by(Gene) %>% 
  summarise(sum(gene_Var))
genes <- full_join(genes_oneVarAllele,genes_twoVarAllele,by="Gene",copy=FALSE,suffix=c(".x",".y")) %>% 
  mutate_all(funs(replace(., is.na(.), 0))) %>% 
  mutate(Total_Gene_Count=`sum(gene_Var).x`+`sum(gene_Var).y`) %>% 
  mutate(Sample_Gene_Freq=Total_Gene_Count/(n_distinct(masterfile_eoc_NPC_goodQ2_ALL$Sample)*2))
masterfile_eoc_NPC_goodQ_ALL_ENSTcanonical_sampleAF0.01_2 <- left_join(masterfile_eoc_NPC_goodQ_ALL_ENSTcanonical_sampleAF0.01, genes[,c(1,4:5)],by="Gene",copy=FALSE)

genes_oneVarAllele <- masterfile_eoc_NPC_goodQ2_ALL_ENSTcanonical_sampleAF0.01 %>% select(Gene,Sample.GT) %>% 
  select(Gene,Sample.GT) %>% 
  group_by(Gene,Sample.GT) %>% 
  summarise(n()) %>% 
  filter((Sample.GT%in%c("'0/1","'1/0"))) %>% 
  rename(gene_Var = "n()") %>% 
  group_by(Gene) %>% 
  summarise(sum(gene_Var))
genes_twoVarAllele <- masterfile_eoc_NPC_goodQ2_ALL_ENSTcanonical_sampleAF0.01 %>% select(Gene,Sample.GT) %>% 
  select(Gene,Sample.GT) %>% 
  group_by(Gene,Sample.GT) %>% 
  summarise(n()) %>% 
  filter((Sample.GT!="'0/1")&(Sample.GT!="'1/0")) %>% 
  mutate(gene_Var = `n()`*2) %>% 
  select(-`n()`) %>% 
  group_by(Gene) %>% 
  summarise(sum(gene_Var))
genes <- full_join(genes_oneVarAllele,genes_twoVarAllele,by="Gene",copy=FALSE,suffix=c(".x",".y")) %>% 
  mutate_all(funs(replace(., is.na(.), 0))) %>% 
  mutate(Total_Gene_Count=`sum(gene_Var).x`+`sum(gene_Var).y`) %>% 
  mutate(Sample_Gene_Freq=Total_Gene_Count/(n_distinct(masterfile_eoc_NPC_goodQ2_ALL$Sample)*2))
masterfile_eoc_NPC_goodQ2_ALL_ENSTcanonical_sampleAF0.01_2 <- left_join(masterfile_eoc_NPC_goodQ2_ALL_ENSTcanonical_sampleAF0.01, genes[,c(1,4:5)],by="Gene",copy=FALSE)

genes_oneVarAllele <- masterfile_eoc_lincRNA_goodQ_ALL_ENSTcanonical_sampleAF0.01 %>% select(Gene,Sample.GT) %>% 
  select(Gene,Sample.GT) %>% 
  group_by(Gene,Sample.GT) %>% 
  summarise(n()) %>% 
  filter((Sample.GT%in%c("'0/1","'1/0"))) %>% 
  rename(gene_Var = "n()") %>% 
  group_by(Gene) %>% 
  summarise(sum(gene_Var))
genes_twoVarAllele <- masterfile_eoc_lincRNA_goodQ_ALL_ENSTcanonical_sampleAF0.01 %>% select(Gene,Sample.GT) %>% 
  select(Gene,Sample.GT) %>% 
  group_by(Gene,Sample.GT) %>% 
  summarise(n()) %>% 
  filter((Sample.GT!="'0/1")&(Sample.GT!="'1/0")) %>% 
  mutate(gene_Var = `n()`*2) %>% 
  select(-`n()`) %>% 
  group_by(Gene) %>% 
  summarise(sum(gene_Var))
genes <- full_join(genes_oneVarAllele,genes_twoVarAllele,by="Gene",copy=FALSE,suffix=c(".x",".y")) %>% 
  mutate_all(funs(replace(., is.na(.), 0))) %>% 
  mutate(Total_Gene_Count=`sum(gene_Var).x`+`sum(gene_Var).y`) %>% 
  mutate(Sample_Gene_Freq=Total_Gene_Count/(n_distinct(masterfile_eoc_NPC_goodQ2$Sample)*2))
masterfile_eoc_lincRNA_goodQ_ALL_ENSTcanonical_sampleAF0.01_2 <- left_join(masterfile_eoc_lincRNA_goodQ_ALL_ENSTcanonical_sampleAF0.01, genes[,c(1,4:5)],by="Gene",copy=FALSE)

genes_oneVarAllele <- masterfile_eoc_lincRNA_goodQ2_ALL_ENSTcanonical_sampleAF0.01 %>% select(Gene,Sample.GT) %>% 
  select(Gene,Sample.GT) %>% 
  group_by(Gene,Sample.GT) %>% 
  summarise(n()) %>% 
  filter((Sample.GT%in%c("'0/1","'1/0"))) %>% 
  rename(gene_Var = "n()") %>% 
  group_by(Gene) %>% 
  summarise(sum(gene_Var))
genes_twoVarAllele <- masterfile_eoc_lincRNA_goodQ2_ALL_ENSTcanonical_sampleAF0.01 %>% select(Gene,Sample.GT) %>% 
  select(Gene,Sample.GT) %>% 
  group_by(Gene,Sample.GT) %>% 
  summarise(n()) %>% 
  filter((Sample.GT!="'0/1")&(Sample.GT!="'1/0")) %>% 
  mutate(gene_Var = `n()`*2) %>% 
  select(-`n()`) %>% 
  group_by(Gene) %>% 
  summarise(sum(gene_Var))
genes <- full_join(genes_oneVarAllele,genes_twoVarAllele,by="Gene",copy=FALSE,suffix=c(".x",".y")) %>% 
  mutate_all(funs(replace(., is.na(.), 0))) %>% 
  mutate(Total_Gene_Count=`sum(gene_Var).x`+`sum(gene_Var).y`) %>% 
  mutate(Sample_Gene_Freq=Total_Gene_Count/(n_distinct(masterfile_eoc_NPC_goodQ2$Sample)*2))
masterfile_eoc_lincRNA_goodQ2_ALL_ENSTcanonical_sampleAF0.01_2 <- left_join(masterfile_eoc_lincRNA_goodQ2_ALL_ENSTcanonical_sampleAF0.01, genes[,c(1,4:5)],by="Gene",copy=FALSE)
```

Add GnomAD gene-level data
```{r}
ensembl_biotypes <- read.delim("ensembl_biotypes.tsv", stringsAsFactors=FALSE) %>% 
  select(ensembl_transcript_id,transcript_biotype) %>% 
  dplyr::rename("Feature"="ensembl_transcript_id","BIOTYPE"="transcript_biotype")

AgilentSSv6_list_ENSTcanonical_non_protein_coding <- read.delim("GnomAD.gene.stats.v2.RF.only.agilent.v6.only.tsv", header=TRUE, row.names=NULL, stringsAsFactors=FALSE) %>% 
  filter(CANONICAL=="YES") %>% 
  dplyr::rename("SYMBOL"="X.Symbol") %>% 
  dplyr::rename("Gene"="ENSG") %>% 
  left_join(ensembl_biotypes,by="Feature",copy=FALSE) %>% 
  filter(!BIOTYPE%in%c("protein_coding")) %>% 
  filter(!grepl("ENSP",ENSP)) %>% 
  filter(!grepl("pseudogene",BIOTYPE)) %>% 
  select(SYMBOL,Feature,Gene,BIOTYPE) %>% 
  distinct(.keep_all = FALSE)

AgilentSSv6_list_ENSTcanonical_lincRNA <- read.delim("GnomAD.gene.stats.v2.RF.only.agilent.v6.only.tsv", header=TRUE, row.names=NULL, stringsAsFactors=FALSE) %>% 
  filter(CANONICAL=="YES") %>% 
  dplyr::rename("SYMBOL"="X.Symbol") %>% 
  dplyr::rename("Gene"="ENSG") %>% 
  left_join(ensembl_biotypes,by="Feature",copy=FALSE) %>% 
  filter(BIOTYPE%in%"lincRNA") %>% 
  select(SYMBOL,Feature,Gene,BIOTYPE) %>% 
  distinct(.keep_all = FALSE)

GnomAD_stats_ALL_VEP_ENSTcanonical_non_protein_coding <- read.delim("agilent_v6_gnomad_v2.1.1_genestats_canonical_allIMPACTS.tsv", header=TRUE, row.names=NULL, stringsAsFactors=FALSE) %>% 
  filter(Feature%in%(AgilentSSv6_list_ENSTcanonical_non_protein_coding$Feature)) %>% 
  dplyr::rename("Gene"="ENSG")

GnomAD_stats_ALL_VEP_ENSTcanonical_lincRNA <- read.delim("agilent_v6_gnomad_v2.1.1_genestats_canonical_allIMPACTS.tsv", header=TRUE, row.names=NULL, stringsAsFactors=FALSE) %>% 
  filter(Feature%in%(AgilentSSv6_list_ENSTcanonical_lincRNA$Feature)) %>% 
  dplyr::rename("Gene"="ENSG")

masterfile_eoc_NPC_goodQ_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats <- left_join(masterfile_eoc_NPC_goodQ_ALL_ENSTcanonical_sampleAF0.01_2, GnomAD_stats_ALL_VEP_ENSTcanonical_non_protein_coding[,c(2,14:19)],by="Feature",copy=FALSE) 
masterfile_eoc_NPC_goodQ_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats2 <- mutate_at(masterfile_eoc_NPC_goodQ_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats,vars(starts_with("FILTER_")),funs(replace(., is.na(.), 0))) %>% 
  mutate_if(grepl("popmax$", names(.)),funs(ifelse(. == "NA", 0, as.numeric(.)))) %>% 
  mutate_at(vars(ends_with("popmax")),funs(replace(., is.na(.), 0)))

masterfile_eoc_NPC_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats <- left_join(masterfile_eoc_NPC_goodQ2_ALL_ENSTcanonical_sampleAF0.01_2, GnomAD_stats_ALL_VEP_ENSTcanonical_non_protein_coding[,c(2,14:19)],by="Feature",copy=FALSE) 
masterfile_eoc_NPC_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats2 <- mutate_at(masterfile_eoc_NPC_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats,vars(starts_with("FILTER_")),funs(replace(., is.na(.), 0))) %>% 
  mutate_if(grepl("popmax$", names(.)),funs(ifelse(. == "NA", 0, as.numeric(.)))) %>% 
  mutate_at(vars(ends_with("popmax")),funs(replace(., is.na(.), 0)))

masterfile_eoc_lincRNA_goodQ_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats <- left_join(masterfile_eoc_lincRNA_goodQ_ALL_ENSTcanonical_sampleAF0.01_2, GnomAD_stats_ALL_VEP_ENSTcanonical_lincRNA[,c(2,14:19)],by="Feature",copy=FALSE) 
masterfile_eoc_lincRNA_goodQ_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats2 <- mutate_at(masterfile_eoc_lincRNA_goodQ_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats,vars(starts_with("FILTER_")),funs(replace(., is.na(.), 0))) %>% 
  mutate_if(grepl("popmax$", names(.)),funs(ifelse(. == "NA", 0, as.numeric(.)))) %>% 
  mutate_at(vars(ends_with("popmax")),funs(replace(., is.na(.), 0)))

masterfile_eoc_lincRNA_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats <- left_join(masterfile_eoc_lincRNA_goodQ2_ALL_ENSTcanonical_sampleAF0.01_2, GnomAD_stats_ALL_VEP_ENSTcanonical_lincRNA[,c(2,14:19)],by="Feature",copy=FALSE) 
masterfile_eoc_lincRNA_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats2 <- mutate_at(masterfile_eoc_lincRNA_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats,vars(starts_with("FILTER_")),funs(replace(., is.na(.), 0))) %>% 
  mutate_if(grepl("popmax$", names(.)),funs(ifelse(. == "NA", 0, as.numeric(.)))) %>% 
  mutate_at(vars(ends_with("popmax")),funs(replace(., is.na(.), 0)))
```

Calculate Sample/GnomAD gene freq and allele freq ratios for total and NFE GnomAD figures
```{r}
masterfile_eoc_NPC_goodQ_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats2 <- mutate(masterfile_eoc_NPC_goodQ_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats,
"Sample_Gene_ALL_Freq_Ratio_GnomAD_0.005"=Sample_Gene_Freq/FILTER_RF_LOW.MODERATE.HIGH_AF_0.005_ADJ,      "Sample_Gene_ALL_Freq_Ratio_GnomAD_NFE_0.005"=Sample_Gene_Freq/FILTER_RF_LOW.MODERATE.HIGH_AF_0.005_NFE_ADJ,      "Sample_Gene_ALL_AF_Ratio"=Sample_AF/GnomAD_v2.1_non_cancer_AF, 
"Sample_Gene_ALL_AF_Ratio_NFE"=Sample_AF/GnomAD_v2.1_non_cancer_AF_nfe)

masterfile_eoc_NPC_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats2 <- mutate(masterfile_eoc_NPC_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats, "Sample_Gene_ALL_Freq_Ratio_GnomAD_0.005"=Sample_Gene_Freq/FILTER_RF_LOW.MODERATE.HIGH_AF_0.005_ADJ,      "Sample_Gene_ALL_Freq_Ratio_GnomAD_NFE_0.005"=Sample_Gene_Freq/FILTER_RF_LOW.MODERATE.HIGH_AF_0.005_NFE_ADJ,      "Sample_Gene_ALL_AF_Ratio"=Sample_AF/GnomAD_v2.1_non_cancer_AF, 
"Sample_Gene_ALL_AF_Ratio_NFE"=Sample_AF/GnomAD_v2.1_non_cancer_AF_nfe)

masterfile_eoc_lincRNA_goodQ_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats2 <- mutate(masterfile_eoc_lincRNA_goodQ_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats,
"Sample_Gene_ALL_Freq_Ratio_GnomAD_0.005"=Sample_Gene_Freq/FILTER_RF_LOW.MODERATE.HIGH_AF_0.005_ADJ,      "Sample_Gene_ALL_Freq_Ratio_GnomAD_NFE_0.005"=Sample_Gene_Freq/FILTER_RF_LOW.MODERATE.HIGH_AF_0.005_NFE_ADJ,      "Sample_Gene_ALL_AF_Ratio"=Sample_AF/GnomAD_v2.1_non_cancer_AF, 
"Sample_Gene_ALL_AF_Ratio_NFE"=Sample_AF/GnomAD_v2.1_non_cancer_AF_nfe)

masterfile_eoc_lincRNA_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats2 <- mutate(masterfile_eoc_lincRNA_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats, "Sample_Gene_ALL_Freq_Ratio_GnomAD_0.005"=Sample_Gene_Freq/FILTER_RF_LOW.MODERATE.HIGH_AF_0.005_ADJ,      "Sample_Gene_ALL_Freq_Ratio_GnomAD_NFE_0.005"=Sample_Gene_Freq/FILTER_RF_LOW.MODERATE.HIGH_AF_0.005_NFE_ADJ,      "Sample_Gene_ALL_AF_Ratio"=Sample_AF/GnomAD_v2.1_non_cancer_AF, 
"Sample_Gene_ALL_AF_Ratio_NFE"=Sample_AF/GnomAD_v2.1_non_cancer_AF_nfe)
```

Create data frames with Agilent SureSelect whole exome genes (ENST and non-protein-coding) for volcano plots
```{r}
allGenes_AgilentSSv6_list_non_protein_coding_only <- GnomAD_stats_ALL_VEP_ENSTcanonical_non_protein_coding[,c(1:3,6,10)] %>% 
  filter(MAX_AN_NFE!=0) %>% 
  left_join(AgilentSSv6_list_ENSTcanonical_non_protein_coding[,c(2,4)],by="Feature",copy=FALSE)

allGenes_masterfile_eoc_NPC_goodQ2_ALL_ENSTcanonical_sampleAF0.01_biotypeNPC_list <- tibble("SYMBOL"=masterfile_eoc_NPC_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats2$SYMBOL,"Gene"=masterfile_eoc_NPC_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats2$Gene,"BIOTYPE"=masterfile_eoc_NPC_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats2$BIOTYPE) %>% 
  distinct(.keep_all=TRUE) %>% 
  arrange(SYMBOL)

allGenes_masterfile_eoc_lincRNA_goodQ2_ALL_ENSTcanonical_sampleAF0.01_biotype_lincRNA_list <- tibble("SYMBOL"=masterfile_eoc_lincRNA_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats2$SYMBOL,"Gene"=masterfile_eoc_lincRNA_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats2$Gene,"BIOTYPE"=masterfile_eoc_lincRNA_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats2$BIOTYPE) %>% 
  distinct(.keep_all=TRUE) %>% 
  arrange(SYMBOL)

volcano_sampleAF0.01_biotypeNPC_allGenes <- masterfile_eoc_NPC_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats2 %>% 
  mutate(Total_Case_Alleles=(n_distinct(masterfile_eoc_NPC_goodQ2_ALL$Sample)*2)) %>% 
  select(SYMBOL,Gene,Total_Gene_Count,Total_Case_Alleles,Sample_Gene_ALL_Freq_Ratio_GnomAD_0.005,Sample_Gene_ALL_Freq_Ratio_GnomAD_NFE_0.005) %>% 
  distinct(SYMBOL,Gene,Total_Gene_Count,Total_Case_Alleles,Sample_Gene_ALL_Freq_Ratio_GnomAD_0.005,Sample_Gene_ALL_Freq_Ratio_GnomAD_NFE_0.005) 
volcano_sampleAF0.01_biotypeNPC_allGenes2 <- left_join(allGenes_AgilentSSv6_list_non_protein_coding_only,volcano_sampleAF0.01_biotypeNPC_allGenes[,c(2:4)],by="Gene",copy=FALSE) %>% 
  bind_rows(anti_join(allGenes_AgilentSSv6_list_non_protein_coding_only[,c(1,3)],volcano_sampleAF0.01_biotypeNPC_allGenes,by="Gene")) %>% 
  mutate_at(vars("Total_Gene_Count"),funs(replace(., is.na(.), 0))) %>% 
  mutate_at(vars("Total_Case_Alleles"),funs(replace(., is.na(.), n_distinct(masterfile_eoc_NPC_goodQ2_ALL$Sample)*2))) %>% 
  distinct(SYMBOL,Gene,Total_Gene_Count,Total_Case_Alleles) 
volcano_sampleAF0.01_biotypeNPC_allGenes3 <- right_join(
  volcano_sampleAF0.01_biotypeNPC_allGenes2,
  GnomAD_stats_ALL_VEP_ENSTcanonical_non_protein_coding,
  by="Gene",copy=FALSE) %>% 
  select("SYMBOL.x","Gene","Total_Gene_Count","Total_Case_Alleles","FILTER_RF_LOW.MODERATE.HIGH_AC_0.005","FILTER_RF_LOW.MODERATE.HIGH_AC_0.005_NFE","MAX_AN","MAX_AN_NFE","FILTER_RF_LOW.MODERATE.HIGH_AF_0.005_ADJ","FILTER_RF_LOW.MODERATE.HIGH_AF_0.005_NFE_ADJ") %>% 
  dplyr::rename("SYMBOL"="SYMBOL.x") %>% 
  mutate(FILTER_RF_LOW.MODERATE.HIGH_AC_0.005_ADJ=round((MAX_AN*FILTER_RF_LOW.MODERATE.HIGH_AF_0.005_ADJ))) %>% 
  mutate(FILTER_RF_LOW.MODERATE.HIGH_AC_0.005_NFE_ADJ=round((MAX_AN_NFE*FILTER_RF_LOW.MODERATE.HIGH_AF_0.005_NFE_ADJ))) %>% 
  distinct(.keep_all=TRUE) %>% 
  filter(!is.na(MAX_AN_NFE)) %>% 
  filter(MAX_AN_NFE!=0)

volcano_sampleAF0.01_biotype_lincRNA_allGenes <- masterfile_eoc_lincRNA_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats2 %>% 
  mutate(Total_Case_Alleles=(n_distinct(masterfile_eoc_NPC_goodQ2$Sample)*2)) %>% 
  # filter(!BIOTYPE%in%c("protein_coding")) %>% 
  select(SYMBOL,Gene,Total_Gene_Count,Total_Case_Alleles,Sample_Gene_ALL_Freq_Ratio_GnomAD_0.005,Sample_Gene_ALL_Freq_Ratio_GnomAD_NFE_0.005) %>% 
  distinct(SYMBOL,Gene,Total_Gene_Count,Total_Case_Alleles,Sample_Gene_ALL_Freq_Ratio_GnomAD_0.005,Sample_Gene_ALL_Freq_Ratio_GnomAD_NFE_0.005) 
volcano_sampleAF0.01_biotype_lincRNA_allGenes2 <- left_join(allGenes_AgilentSSv6_list_non_protein_coding_only,volcano_sampleAF0.01_biotype_lincRNA_allGenes[,c(2:4)],by="Gene",copy=FALSE) %>% 
  bind_rows(anti_join(allGenes_AgilentSSv6_list_non_protein_coding_only[,c(1,3)],volcano_sampleAF0.01_biotype_lincRNA_allGenes,by="Gene")) %>% 
  mutate_at(vars("Total_Gene_Count"),funs(replace(., is.na(.), 0))) %>% 
  mutate_at(vars("Total_Case_Alleles"),funs(replace(., is.na(.), n_distinct(masterfile_eoc_NPC_goodQ2$Sample)*2))) %>% 
  distinct(SYMBOL,Gene,Total_Gene_Count,Total_Case_Alleles) 
volcano_sampleAF0.01_biotype_lincRNA_allGenes3 <- right_join(
  volcano_sampleAF0.01_biotype_lincRNA_allGenes2,
  GnomAD_stats_ALL_VEP_ENSTcanonical_non_protein_coding,
  by="Gene",copy=FALSE) %>% 
  select("SYMBOL.x","Gene","Total_Gene_Count","Total_Case_Alleles","FILTER_RF_LOW.MODERATE.HIGH_AC_0.005","FILTER_RF_LOW.MODERATE.HIGH_AC_0.005_NFE","MAX_AN","MAX_AN_NFE","FILTER_RF_LOW.MODERATE.HIGH_AF_0.005_ADJ","FILTER_RF_LOW.MODERATE.HIGH_AF_0.005_NFE_ADJ") %>% 
  dplyr::rename("SYMBOL"="SYMBOL.x") %>% 
  mutate(FILTER_RF_LOW.MODERATE.HIGH_AC_0.005_ADJ=round((MAX_AN*FILTER_RF_LOW.MODERATE.HIGH_AF_0.005_ADJ))) %>% 
  mutate(FILTER_RF_LOW.MODERATE.HIGH_AC_0.005_NFE_ADJ=round((MAX_AN_NFE*FILTER_RF_LOW.MODERATE.HIGH_AF_0.005_NFE_ADJ))) %>% 
  distinct(.keep_all=TRUE) %>% 
  filter(!is.na(MAX_AN_NFE)) %>% 
  filter(MAX_AN_NFE!=0)
```

Calculate Fisher's test values (Ensembl Canonical)
```{r}
masterfile_eoc_NPC_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats_fisherresults <- mutate(masterfile_eoc_NPC_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats2,Total_Case_Alleles=(n_distinct(masterfile_eoc_NPC_goodQ2_ALL$Sample)*2)) %>% 
  left_join(GnomAD_stats_ALL_VEP_ENSTcanonical_non_protein_coding[,c(2,6,10)],by="Feature",copy=FALSE) 
masterfile_eoc_NPC_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats_fisherresults2 <- mutate(masterfile_eoc_NPC_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats_fisherresults, MAX_AN=replace(masterfile_eoc_NPC_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats_fisherresults$MAX_AN, is.na(masterfile_eoc_NPC_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats_fisherresults$MAX_AN), max(masterfile_eoc_NPC_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats_fisherresults$MAX_AN, na.rm=TRUE))) %>% 
  filter(!is.na(MAX_AN_NFE)&
           MAX_AN_NFE!=0)

masterfile_eoc_lincRNA_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats_fisherresults <- mutate(masterfile_eoc_lincRNA_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats2,Total_Case_Alleles=(n_distinct(masterfile_eoc_NPC_goodQ2$Sample)*2)) %>% 
  left_join(GnomAD_stats_ALL_VEP_ENSTcanonical_non_protein_coding[,c(2,6,10)],by="Feature",copy=FALSE) 
masterfile_eoc_lincRNA_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats_fisherresults2 <- mutate(masterfile_eoc_lincRNA_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats_fisherresults, MAX_AN=replace(masterfile_eoc_lincRNA_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats_fisherresults$MAX_AN, is.na(masterfile_eoc_lincRNA_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats_fisherresults$MAX_AN), max(masterfile_eoc_lincRNA_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats_fisherresults$MAX_AN, na.rm=TRUE))) %>% 
  filter(!is.na(MAX_AN_NFE)&
           MAX_AN_NFE!=0)

fisherresults_ENST <- apply(masterfile_eoc_NPC_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats_fisherresults2[,c(262,264,274,275)],1,function(x) {fisher.test(rbind(x[1:2],c(x[3]-x[1],x[4]-x[2])))})

masterfile_eoc_NPC_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats_fisherresults2$P_value_Fisher_0.005 = sapply(fisherresults_ENST,function(x) round(x$p.value,12))
masterfile_eoc_NPC_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats_fisherresults2$OR_0.005 = sapply(fisherresults_ENST,function(x) round(x$estimate,4))
masterfile_eoc_NPC_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats_fisherresults2$`95%CI_0.005` = sapply(fisherresults_ENST,function(x) paste(round(x$conf.int[1:2],2),collapse="~"))
masterfile_eoc_NPC_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats_fisherresults2$BH_0.005 = p.adjust(masterfile_eoc_NPC_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats_fisherresults2$P_value_Fisher_0.005,method = "BH")

fisherresults2_ENST <- apply(masterfile_eoc_NPC_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats_fisherresults2[,c(262,267,274,276)],1,function(x) {fisher.test(rbind(x[1:2],c(x[3]-x[1],x[4]-x[2])))})

masterfile_eoc_NPC_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats_fisherresults2$P_value_Fisher_0.005_NFE = sapply(fisherresults2_ENST,function(x) round(x$p.value,12))
masterfile_eoc_NPC_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats_fisherresults2$OR_0.005_NFE = sapply(fisherresults2_ENST,function(x) round(x$estimate,4))
masterfile_eoc_NPC_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats_fisherresults2$`95%CI_0.005_NFE` = sapply(fisherresults2_ENST,function(x) paste(round(x$conf.int[1:2],2),collapse="~"))
masterfile_eoc_NPC_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats_fisherresults2$BH_0.005_NFE = p.adjust(masterfile_eoc_NPC_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats_fisherresults2$P_value_Fisher_0.005_NFE,method = "BH")

fisherresults_ENST <- apply(masterfile_eoc_lincRNA_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats_fisherresults2[,c(262,264,274,275)],1,function(x) {fisher.test(rbind(x[1:2],c(x[3]-x[1],x[4]-x[2])))})

masterfile_eoc_lincRNA_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats_fisherresults2$P_value_Fisher_0.005 = sapply(fisherresults_ENST,function(x) round(x$p.value,12))
masterfile_eoc_lincRNA_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats_fisherresults2$OR_0.005 = sapply(fisherresults_ENST,function(x) round(x$estimate,4))
masterfile_eoc_lincRNA_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats_fisherresults2$`95%CI_0.005` = sapply(fisherresults_ENST,function(x) paste(round(x$conf.int[1:2],2),collapse="~"))
masterfile_eoc_lincRNA_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats_fisherresults2$BH_0.005 = p.adjust(masterfile_eoc_lincRNA_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats_fisherresults2$P_value_Fisher_0.005,method = "BH")

fisherresults2_ENST <- apply(masterfile_eoc_lincRNA_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats_fisherresults2[,c(262,267,274,276)],1,function(x) {fisher.test(rbind(x[1:2],c(x[3]-x[1],x[4]-x[2])))})

masterfile_eoc_lincRNA_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats_fisherresults2$P_value_Fisher_0.005_NFE = sapply(fisherresults2_ENST,function(x) round(x$p.value,12))
masterfile_eoc_lincRNA_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats_fisherresults2$OR_0.005_NFE = sapply(fisherresults2_ENST,function(x) round(x$estimate,4))
masterfile_eoc_lincRNA_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats_fisherresults2$`95%CI_0.005_NFE` = sapply(fisherresults2_ENST,function(x) paste(round(x$conf.int[1:2],2),collapse="~"))
masterfile_eoc_lincRNA_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats_fisherresults2$BH_0.005_NFE = p.adjust(masterfile_eoc_lincRNA_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats_fisherresults2$P_value_Fisher_0.005_NFE,method = "BH")
```

Calculate Fisher's test values (volcano plot)
```{r}
fisherresults_biotypeNPC_allGenes_volcano <- apply(volcano_sampleAF0.01_biotypeNPC_allGenes3[,c(3,5,4,7)],1,function(x) {fisher.test(rbind(x[1:2],c(abs(x[3]-x[1]),abs(x[4]-x[2]))))})

volcano_sampleAF0.01_biotypeNPC_allGenes3$P_value_Fisher_0.005 = sapply(fisherresults_biotypeNPC_allGenes_volcano,function(x) round(x$p.value,15))
volcano_sampleAF0.01_biotypeNPC_allGenes3$OR_0.005 = sapply(fisherresults_biotypeNPC_allGenes_volcano,function(x) round(x$estimate,15))
volcano_sampleAF0.01_biotypeNPC_allGenes3$`95%CI_0.005` = sapply(fisherresults_biotypeNPC_allGenes_volcano,function(x) paste(round(x$conf.int[1:2],2),collapse="~"))
volcano_sampleAF0.01_biotypeNPC_allGenes3$BH_0.005 = p.adjust(volcano_sampleAF0.01_biotypeNPC_allGenes3$P_value_Fisher_0.005,method = "BH")

fisherresults_biotypeNPC_NFE_volcano <- apply(volcano_sampleAF0.01_biotypeNPC_allGenes3[,c(3,6,4,8)],1,function(x) {fisher.test(rbind(x[1:2],c(abs(x[3]-x[1]),abs(x[4]-x[2]))))})

volcano_sampleAF0.01_biotypeNPC_allGenes3$P_value_Fisher_0.005_NFE = sapply(fisherresults_biotypeNPC_NFE_volcano,function(x) round(x$p.value,15))
volcano_sampleAF0.01_biotypeNPC_allGenes3$OR_0.005_NFE = sapply(fisherresults_biotypeNPC_NFE_volcano,function(x) round(x$estimate,15))
volcano_sampleAF0.01_biotypeNPC_allGenes3$`95%CI_0.005_NFE` = sapply(fisherresults_biotypeNPC_NFE_volcano,function(x) paste(round(x$conf.int[1:2],2),collapse="~"))
volcano_sampleAF0.01_biotypeNPC_allGenes3$BH_0.005_NFE = p.adjust(volcano_sampleAF0.01_biotypeNPC_allGenes3$P_value_Fisher_0.005_NFE,method = "BH")

fisherresults_biotype_lincRNA_allGenes_volcano <- apply(volcano_sampleAF0.01_biotype_lincRNA_allGenes3[,c(3,5,4,7)],1,function(x) {fisher.test(rbind(x[1:2],c(abs(x[3]-x[1]),abs(x[4]-x[2]))))})

volcano_sampleAF0.01_biotype_lincRNA_allGenes3$P_value_Fisher_0.005 = sapply(fisherresults_biotype_lincRNA_allGenes_volcano,function(x) round(x$p.value,15))
volcano_sampleAF0.01_biotype_lincRNA_allGenes3$OR_0.005 = sapply(fisherresults_biotype_lincRNA_allGenes_volcano,function(x) round(x$estimate,15))
volcano_sampleAF0.01_biotype_lincRNA_allGenes3$`95%CI_0.005` = sapply(fisherresults_biotype_lincRNA_allGenes_volcano,function(x) paste(round(x$conf.int[1:2],2),collapse="~"))
volcano_sampleAF0.01_biotype_lincRNA_allGenes3$BH_0.005 = p.adjust(volcano_sampleAF0.01_biotype_lincRNA_allGenes3$P_value_Fisher_0.005,method = "BH")

fisherresults_biotype_lincRNA_NFE_volcano <- apply(volcano_sampleAF0.01_biotype_lincRNA_allGenes3[,c(3,6,4,8)],1,function(x) {fisher.test(rbind(x[1:2],c(abs(x[3]-x[1]),abs(x[4]-x[2]))))})

volcano_sampleAF0.01_biotype_lincRNA_allGenes3$P_value_Fisher_0.005_NFE = sapply(fisherresults_biotype_lincRNA_NFE_volcano,function(x) round(x$p.value,15))
volcano_sampleAF0.01_biotype_lincRNA_allGenes3$OR_0.005_NFE = sapply(fisherresults_biotype_lincRNA_NFE_volcano,function(x) round(x$estimate,15))
volcano_sampleAF0.01_biotype_lincRNA_allGenes3$`95%CI_0.005_NFE` = sapply(fisherresults_biotype_lincRNA_NFE_volcano,function(x) paste(round(x$conf.int[1:2],2),collapse="~"))
volcano_sampleAF0.01_biotype_lincRNA_allGenes3$BH_0.005_NFE = p.adjust(volcano_sampleAF0.01_biotype_lincRNA_allGenes3$P_value_Fisher_0.005_NFE,method = "BH")
```

Identify variants with filtering AF > maximum credible population AF in GnomAD for hereditary EOC (0.000181)
```{r}
masterfile_eoc_NPC_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats_fisherresults3 <- mutate(masterfile_eoc_NPC_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats_fisherresults2, FilteringAF_95_Exceeds_MaxCredPopAF=((GnomAD_v2.1_non_cancer_Filtering_AF_95)>0.000181)) %>% 
  mutate(FilteringAF_95_NFE_Exceeds_MaxCredPopAF=((GnomAD_v2.1_non_cancer_Filtering_AF_95_nfe)>0.000181))

masterfile_eoc_lincRNA_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats_fisherresults3 <- mutate(masterfile_eoc_lincRNA_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats_fisherresults2, FilteringAF_95_Exceeds_MaxCredPopAF=((GnomAD_v2.1_non_cancer_Filtering_AF_95)>0.000181)) %>% 
  mutate(FilteringAF_95_NFE_Exceeds_MaxCredPopAF=((GnomAD_v2.1_non_cancer_Filtering_AF_95_nfe)>0.000181))
```

Identify variants that PASS/FAIL the LOFTEE 50bp rule
```{r}
masterfile_eoc_NPC_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats_fisherresults4 <- mutate(masterfile_eoc_NPC_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats_fisherresults3,LoF_50_BP_RULE = ifelse(grepl("50_BP_RULE:PASS", LoF_info), TRUE, ifelse(grepl("50_BP_RULE:FAIL", LoF_info), FALSE, NA)))

masterfile_eoc_lincRNA_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats_fisherresults4 <- mutate(masterfile_eoc_lincRNA_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats_fisherresults3,LoF_50_BP_RULE = ifelse(grepl("50_BP_RULE:PASS", LoF_info), TRUE, ifelse(grepl("50_BP_RULE:FAIL", LoF_info), FALSE, NA)))
```

Import NMD depleted gene list and annotate variants accordingly
```{r}
NMD_depleted_gene_list <- read.csv("NMD_depleted_gene_list.csv", stringsAsFactors=FALSE) %>% 
  rename("SYMBOL" = gene) %>% 
  rename("Feature" = txnames) %>% 
  rename("NMD_predictor_rank" = min.rank)

masterfile_eoc_NPC_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats_fisherresults5 <- left_join(masterfile_eoc_NPC_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats_fisherresults4, NMD_depleted_gene_list[,c(2:3)],by="Feature",copy=FALSE)

masterfile_eoc_lincRNA_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats_fisherresults5 <- left_join(masterfile_eoc_lincRNA_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats_fisherresults4, NMD_depleted_gene_list[,c(2:3)],by="Feature",copy=FALSE) 
```

Remove redundant objects and output data
```{r}
rm(list=(ls(pattern="^genes")))
rm(list=(ls(pattern="^variants")))
rm(list=(ls(pattern="^fisherresults")))

write_excel_csv(masterfile_eoc_NPC_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats_fisherresults5,file="masterfile_eoc_NPC_goodQ2_ALL_ENSTcanonical_sampleAF0.01_biotypeNPC.csv",na=".",append=FALSE,col_names=TRUE)

write_excel_csv(masterfile_eoc_lincRNA_goodQ2_ALL_ENSTcanonical_sampleAF0.01_withGnomADstats_fisherresults5,file="masterfile_eoc_lincRNA_goodQ2_ALL_ENSTcanonical_sampleAF0.01_biotype_lincRNA.csv",na=".",append=FALSE,col_names=TRUE)
```

Calculate total LoF variants in sample vs total LoF variants in GnomAD non-cancer NFE and use for chi-squared test
```{r}
oddsratio <- function (a, b = NULL, c = NULL, d = NULL, conf.level = 0.95, 
    p.calc.by.independence = TRUE) 
{
    if (is.matrix(a)) {
        if ((dim(a)[1] != 2L) | (dim(a)[2] != 2L)) {
            stop("Input matrix must be a 2x2 table.")
        }
        .a <- a[1, 1]
        .b <- a[1, 2]
        .c <- a[2, 1]
        .d <- a[2, 2]
        .data.name <- deparse(substitute(a))
    }
    else {
        .a <- a
        .b <- b
        .c <- c
        .d <- d
        .data.name <- paste(deparse(substitute(a)), deparse(substitute(b)), 
            deparse(substitute(c)), deparse(substitute(d)))
    }
    .MAT <- matrix(c(.a, .b, M1 <- .a + .b, .c, .d, M0 <- .c + 
        .d, N1 <- .a + .c, N0 <- .b + .d, Total <- .a + .b + 
        .c + .d), 3, 3)
    colnames(.MAT) <- c("Sample", "GnomAD", "Total") #("Disease", "Nondisease", "Total")
    rownames(.MAT) <- c("Variants", "No variants", "Total") #("Exposed", "Nonexposed", "Total")
    class(.MAT) <- "table"
    print(.MAT)
    ESTIMATE <- (.a /.b)/(.c/.d)
    norm.pp <- qnorm(1 - (1 - conf.level)/2)
    if (p.calc.by.independence) {
        p.v <- 2 * (1 - pnorm(abs((.a - N1 * M1/Total)/sqrt(N1 * 
            N0 * M1 * M0/Total/Total/(Total - 1)))))
    }
    else {
        p.v <- 2 * (1 - pnorm(log(ifelse(ESTIMATE > 1, ESTIMATE, 
            1/ESTIMATE))/sqrt(1/.a + 1/.b + 1/.c + 1/.d)))
    }
    ORL <- ESTIMATE * exp(-norm.pp * sqrt(1/.a + 1/.b + 1/.c + 
        1/.d))
    ORU <- ESTIMATE * exp(norm.pp * sqrt(1/.a + 1/.b + 1/.c + 
        1/.d)) %>% signif(digits=7)
    CINT <- paste(signif(ORL,digits = 7),signif(ORU,digits = 7),sep="~")
    attr(CINT, "conf.level") <- conf.level
    RVAL <- list(p.value = p.v, conf.int = CINT, estimate = ESTIMATE, 
        method = "Odds ratio estimate and its significance probability", 
        data.name = .data.name)
    class(RVAL) <- "htest"
    return(RVAL)
}

a <- sum(volcano_sampleAF0.01_biotypeNPC_allGenes3$Total_Gene_Count) %>% as.numeric()
b <- n_distinct(masterfile_eoc_NPC_goodQ2_ALL$Sample)*2*(nrow(allGenes_AgilentSSv6_list_non_protein_coding_only)) %>% as.numeric()
b1 <- b-a %>% as.numeric()
c <- sum(volcano_sampleAF0.01_biotypeNPC_allGenes3$FILTER_RF_LOW.MODERATE.HIGH_AC_0.005) %>% as.numeric()
d <- (max(volcano_sampleAF0.01_biotypeNPC_allGenes3$MAX_AN))*nrow(allGenes_AgilentSSv6_list_non_protein_coding_only) %>% as.numeric()
d1 <- d-c %>% as.numeric()
e <- sum(volcano_sampleAF0.01_biotypeNPC_allGenes3$FILTER_RF_LOW.MODERATE.HIGH_AC_0.005_NFE) %>% as.numeric()
f <- (max(volcano_sampleAF0.01_biotypeNPC_allGenes3$MAX_AN_NFE))*nrow(allGenes_AgilentSSv6_list_non_protein_coding_only) %>% as.numeric()
f1 <- f-e %>% as.numeric()

chiX2_0.005_biotypeNPC <- matrix(c(a,b1,c,d1),nrow=2,byrow=TRUE)
chiX2_0.005_NFE_biotypeNPC <- matrix(c(a,b1,e,f1),nrow=2,byrow=TRUE)

sink(file="masterfile_eoc_ALL_volcano_chiX2_results.txt",append=TRUE)

chisq.test(chiX2_0.005_biotypeNPC,correct=FALSE)
oddsratio(chiX2_0.005_biotypeNPC)
chisq.test(chiX2_0.005_NFE_biotypeNPC,correct=FALSE)
oddsratio(chiX2_0.005_NFE_biotypeNPC)

sink()

g <- sum(volcano_sampleAF0.01_biotype_lincRNA_allGenes3$Total_Gene_Count) %>% as.numeric()
h <- n_distinct(masterfile_eoc_NPC_goodQ2$Sample)*2*(nrow(allGenes_AgilentSSv6_list_non_protein_coding_only)) %>% as.numeric()
h1 <- h-g %>% as.numeric()
i <- sum(volcano_sampleAF0.01_biotype_lincRNA_allGenes3$FILTER_RF_LOW.MODERATE.HIGH_AC_0.005) %>% as.numeric()
j <- (max(volcano_sampleAF0.01_biotype_lincRNA_allGenes3$MAX_AN))*nrow(allGenes_AgilentSSv6_list_non_protein_coding_only) %>% as.numeric()
j1 <- j-i %>% as.numeric()
k <- sum(volcano_sampleAF0.01_biotype_lincRNA_allGenes3$FILTER_RF_LOW.MODERATE.HIGH_AC_0.005_NFE) %>% as.numeric()
l <- (max(volcano_sampleAF0.01_biotype_lincRNA_allGenes3$MAX_AN_NFE))*nrow(allGenes_AgilentSSv6_list_non_protein_coding_only) %>% as.numeric()
l1 <- l-k %>% as.numeric()

chiX2_0.005_biotype_lincRNA <- matrix(c(g,h1,i,j1),nrow=2,byrow=TRUE)
chiX2_0.005_NFE_biotype_lincRNA <- matrix(c(g,h1,k,l1),nrow=2,byrow=TRUE)

sink(file="masterfile_eoc_ALL_volcano_chiX2_results.txt",append=TRUE)

chisq.test(chiX2_0.005_biotype_lincRNA,correct=FALSE)
oddsratio(chiX2_0.005_biotype_lincRNA)
chisq.test(chiX2_0.005_NFE_biotype_lincRNA,correct=FALSE)
oddsratio(chiX2_0.005_NFE_biotype_lincRNA)

sink()
```

Output data for log p-value volcano plots
```{r}
volcano_sampleAF0.01_biotypeNPC_allGenes4 <- mutate(volcano_sampleAF0.01_biotypeNPC_allGenes3,"Log10_P-value_Fisher_0.05_NFE" = log10(P_value_Fisher_0.005_NFE)) %>% 
  arrange(desc(OR_0.005_NFE)) %>% 
  mutate("Plotted_Log10_P-value_Fisher_0.05_NFE"= 
  if_else (`Log10_P-value_Fisher_0.05_NFE`<0, `Log10_P-value_Fisher_0.05_NFE`*-1, `Log10_P-value_Fisher_0.05_NFE`, missing=NULL)) %>% 
  arrange(desc(`Plotted_Log10_P-value_Fisher_0.05_NFE`),desc(OR_0.005_NFE)) %>% 
  left_join(AgilentSSv6_list_ENSTcanonical_non_protein_coding[,c(3,4)],by="Gene",copy="FALSE") %>% 
  left_join(volcano_sampleAF0.01_biotypeNPC_allGenes[,c(2,5,6)],by="Gene",copy="FALSE") %>% 
  write_excel_csv(file="volcano_sampleAF0.01_biotypeNPC_allGenes_v6.csv",na=".",append=FALSE,col_names=TRUE)

volcano_sampleAF0.01_biotype_lincRNA_allGenes4 <- mutate(volcano_sampleAF0.01_biotype_lincRNA_allGenes3,"Log10_P-value_Fisher_0.05_NFE" = log10(P_value_Fisher_0.005_NFE)) %>% 
  arrange(desc(OR_0.005_NFE)) %>% 
  mutate("Plotted_Log10_P-value_Fisher_0.05_NFE"= 
  if_else (`Log10_P-value_Fisher_0.05_NFE`<0, `Log10_P-value_Fisher_0.05_NFE`*-1, `Log10_P-value_Fisher_0.05_NFE`, missing=NULL)) %>% 
  arrange(desc(`Plotted_Log10_P-value_Fisher_0.05_NFE`),desc(OR_0.005_NFE)) %>% 
  left_join(AgilentSSv6_list_ENSTcanonical_non_protein_coding[,c(3,4)],by="Gene",copy="FALSE") %>% 
  left_join(volcano_sampleAF0.01_biotype_lincRNA_allGenes[,c(2,5,6)],by="Gene",copy="FALSE") %>% 
  write_excel_csv(file="volcano_sampleAF0.01_biotype_lincRNA_allGenes_v6.csv",na=".",append=FALSE,col_names=TRUE)
```
