ALL IMPORTED FILES MUST BE IN THE SAME DIRECTORY AS THIS SCRIPT

Load tidyverse package.

library("tidyverse")

Import individual tumour exome somatic variants (“tumour_sample_germline_exome_somatic_caller_file.tsv”) and tumour-only variants (“tumour_sample_germline_paired_exomes_HAP.tsv”- N/A for unpaired somatic tumour data) annotated vcf files as ‘f’ and ‘g’ data objects, alongside linked sample (“ViP Samples & Candidate Genes.txt”) and tumour purity (“Tumour Purity.txt”) files. All files available from the authors on reasonable request.

max.callers = 3
f <- read.table("tumour_sample_germline_exome_somatic_caller_file.tsv", header=TRUE, stringsAsFactors=FALSE, sep="\t", comment.char="", quote="") %>%
  dplyr::rename("Tumour_Sample"="X.Tumour_Sample") %>% 
  mutate(n.callers = setNames(sapply(strsplit(unique(Identified), "-"), 
                                     function(callers){
                                       ifelse("Intersection" %in% callers, max.callers, sum(!grepl("^filter", callers)))
                                       }), unique(Identified))[Identified])
g <- read.delim("tumour_sample_germline_paired_exomes_HAP.tsv", header=TRUE, stringsAsFactors=FALSE, sep="\t", comment.char="") %>% 
  dplyr::rename("Tumour_Sample"="X.Tumour_Sample")
ViP_list <- read.delim("ViP Samples & Candidate Genes.txt", header=TRUE, stringsAsFactors=FALSE, sep="\t", comment.char="") %>% 
  arrange(Sample)
tumour_type <- select(ViP_list,Sample,Tumour.Type) %>% distinct() %>% rename("Normal_Sample"="Sample")
f$Normal_Sample <- as.character(f$Normal_Sample)
g$Normal_Sample <- as.character(g$Normal_Sample)
f <- left_join(f,tumour_type,by="Normal_Sample",copy=FALSE)
g <- left_join(g,tumour_type,by="Normal_Sample",copy=FALSE) %>% filter(CANONICAL%in%"YES")
gene_list <- select(ViP_list,Sample,SYMBOL) %>% filter(Sample%in%(f$Normal_Sample))
genes <- gene_list$SYMBOL
tumour_purity <- read.delim("Tumour Purity.txt", header=TRUE, stringsAsFactors=FALSE, sep="\t", comment.char="") %>%
  select(Tumour_Sample,Purity,No_Mutations)
f <- left_join(f,tumour_purity,by="Tumour_Sample",copy=FALSE)
g <- left_join(g,tumour_purity,by="Tumour_Sample",copy=FALSE)

Import individual tumour genome somatic variants (“tumour_germline_genome_somatic_caller_file.tsv”) annotated vcf file as ‘ff’ data object. All files available from the authors on reasonable request.

ff <- read.table("tumour_germline_genome_somatic_caller_file.tsv", header=TRUE, stringsAsFactors=FALSE, sep="\t", comment.char="", quote="") %>%
  dplyr::rename("Tumour_Sample"="X.Tumour_Sample")
ff$Normal_Sample <- as.character(ff$Normal_Sample)
ff <- left_join(ff,tumour_purity,by="Tumour_Sample",copy=FALSE)

Convert all relevant strings to integers/doubles, and (for somatic exome data only) remove variants that fail QC in one or more somatic pipeline callers or are Consequence 7 (i.e. IMPACT==MODIFIER).

f$QUAL <- as.numeric(f$QUAL) %>% replace_na(0)
f$TUMOUR.PMCAD <- as.numeric(f$TUMOUR.PMCAD) %>% replace_na(0)
f$TUMOUR.PMCDP <- as.numeric(f$TUMOUR.PMCDP) %>% replace_na(0)
f$TUMOUR.PMCFREQ <- as.numeric(f$TUMOUR.PMCFREQ) %>% replace_na(0)
f$NORMAL.PMCAD <- as.numeric(f$NORMAL.PMCAD) %>% replace_na(0)
f$NORMAL.PMCDP <- as.numeric(f$NORMAL.PMCDP) %>% replace_na(0)
f$NORMAL.PMCFREQ <- as.numeric(f$NORMAL.PMCFREQ) %>% replace_na(0)
f$GnomAD_v2.1_non_cancer_AF <- as.numeric(f$GnomAD_v2.1_non_cancer_AF) %>% replace_na(0)
f$GnomAD_v3_AF <- as.numeric(f$GnomAD_v3_AF) %>% replace_na(0)

f0 <- filter(f,!Identified%in%c("FilteredInAll")) %>% 
    filter(n.callers >= 2) %>% # n.callers >= 3 for unpaired somatic tumour data (see Supplementary Figure 1a)
    filter(Consequence_Rank < 7)

g$QUAL <- as.numeric(g$QUAL) %>% replace_na(0)
g$TUMOUR.PMCAD <- as.numeric(g$TUMOUR.PMCAD) %>% replace_na(0)
g$TUMOUR.PMCDP <- as.numeric(g$TUMOUR.PMCDP) %>% replace_na(0)
g$TUMOUR.PMCFREQ <- as.numeric(g$TUMOUR.PMCFREQ) %>% replace_na(0)
g$NORMAL.PMCAD <- as.numeric(g$NORMAL.PMCAD) %>% replace_na(0)
g$NORMAL.PMCDP <- as.numeric(g$NORMAL.PMCDP) %>% replace_na(0)
g$NORMAL.PMCFREQ <- as.numeric(g$NORMAL.PMCFREQ) %>% replace_na(0)
g$GnomAD_v2.1_non_cancer_AF <- as.numeric(g$GnomAD_v2.1_non_cancer_AF) %>% replace_na(0)
g$GnomAD_v3_AF <- as.numeric(g$GnomAD_v3_AF) %>% replace_na(0)

ff$QUAL <- as.numeric(ff$QUAL) %>% replace_na(0)
ff$TUMOUR.PMCAD <- as.numeric(ff$TUMOUR.PMCAD) %>% replace_na(0)
ff$TUMOUR.PMCDP <- as.numeric(ff$TUMOUR.PMCDP) %>% replace_na(0)
ff$TUMOUR.PMCFREQ <- as.numeric(ff$TUMOUR.PMCFREQ) %>% replace_na(0)
ff$NORMAL.PMCAD <- as.numeric(ff$NORMAL.PMCAD) %>% replace_na(0)
ff$NORMAL.PMCDP <- as.numeric(ff$NORMAL.PMCDP) %>% replace_na(0)
ff$NORMAL.PMCFREQ <- as.numeric(ff$NORMAL.PMCFREQ) %>% replace_na(0)
ff$gnomAD_AF <- as.numeric(ff$gnomAD_AF) %>% replace_na(0)

For somatic caller files, list unique, sorted elements in Identified (Somatic Caller QC results, exomes only), Consequence Rank (exomes only), NORMAL.PMCAD (germline alt allele base depth, exomes only), NORMAL.PMCADP (germline total base depth, exomes and genomes) NORMAL.PMCFREQ (germline alt allele read freq, exomes and genomes), GnomAD v2.1/3.0 AF (exomes and genomes) and QUAL score (exomes only).

sort(unique(f0$Identified))
sort(unique(f0$Consequence_Rank))
sort(unique(f0$NORMAL.PMCAD))
sort(unique(f0$NORMAL.PMCDP))
sort(unique(f0$NORMAL.PMCFREQ))
sort(unique(f0$GnomAD_v2.1_non_cancer_AF))
sort(unique(f0$GnomAD_v3_AF))
sort(unique(f0$QUAL))
sort(unique(ff$NORMAL.PMCDP))
sort(unique(ff$NORMAL.PMCFREQ))
sort(unique(ff$gnomAD_AF))

Plot pre-filtering distribution of tumour alt allele read number(TUMOUR.PMCAD, exomes only), depth (TUMOUR.PMCDP, exomes and genomes) and frequency (TUMOUR.PMCFREQ, exomes and genomes).

sort(unique(f0$TUMOUR.PMCAD))
hist(f0$TUMOUR.PMCAD, breaks=2000, main="Distribution of TUMOUR.PMCAD Values", xlab="TUMOUR.PMCAD",xlim=c(0,20))
sort(unique(f0$TUMOUR.PMCDP))
hist(f0$TUMOUR.PMCDP, breaks=4000, main="Distribution of TUMOUR.PMCDP Values", xlab="TUMOUR.PMCDP",xlim=c(0,100))
sort(unique(f0$TUMOUR.PMCFREQ))
hist(f0$TUMOUR.PMCFREQ, breaks=200, main="Distribution of TUMOUR.PMCFREQ Values", xlab="TUMOUR.PMCFREQ",xlim=c(0,1))

sort(unique(ff$TUMOUR.PMCDP))
hist(ff$TUMOUR.PMCDP, breaks=4000, main="Distribution of TUMOUR.PMCDP Values", xlab="TUMOUR.PMCDP",xlim=c(0,100))
sort(unique(ff$TUMOUR.PMCFREQ))
hist(ff$TUMOUR.PMCFREQ, breaks=200, main="Distribution of TUMOUR.PMCFREQ Values", xlab="TUMOUR.PMCFREQ",xlim=c(0,1))

Filter out and remove elements/rows with high number alt allele reads in germline (NORMAL.PMCAD, exomes only), high germline alt allele read freq (NORMAL.PMCFREQ, exomes and genomes), low germline and tumour read depths (NORMAL.PMCDP and TUMOUR.PMCDP, exomes and genomes), low number alt allele reads in tumour (TUMOUR.PMCAD, exomes only), low tumour alt allele read freq (TUMOUR.PMCFREQ, exomes and genomes) and common variants (using GnomAD v2.1/3.0 AFs, exomes and genomes).

f1<-filter(f0, (NORMAL.PMCDP >= 10) & (NORMAL.PMCAD <= 2) & (NORMAL.PMCFREQ < 0.01)) # N/A for unpaired somatic tumour data (see Supplementary Figure 1a)
f2<-filter(f1,TUMOUR.PMCDP >= 20)
f3<-filter(f2,TUMOUR.PMCAD >= 5)
f4<-filter(f3,TUMOUR.PMCFREQ >= (f3$Purity*0.5*(2/3)))
f5<-filter(f4,GnomAD_v2.1_non_cancer_AF <= 0.0001) # GnomAD_v2.1_non_cancer_AF == 0 for unpaired somatic tumour data (see Supplementary Figure 1a)
f6<-filter(f5,GnomAD_v3_AF <= 0.0001) # GnomAD_v3_AF == 0 for unpaired somatic tumour data (see Supplementary Figure 1a)

ff1<-filter(ff,(NORMAL.PMCDP >= 10) & (NORMAL.PMCAD <= 2) & (NORMAL.PMCFREQ < 0.01))
ff2<-filter(ff1,TUMOUR.PMCDP >= 10)
ff3<-filter(ff2,TUMOUR.PMCAD >= 4)
ff4<-filter(ff3,TUMOUR.PMCFREQ >= (ff3$Purity*0.5*(3/4)))
ff5<-filter(ff4,gnomAD_AF == 0) # NB Tumour genome data annotated to GnomAD v3.0 only (see Supplementary Figure 1b)

List remaining unique elements from above fields, and save output (exomes- for ENSEMBL CANONICAL transcripts only, genomes- for MANE SELECT transcripts only).

sort(unique(f2$TUMOUR.PMCAD))
sort(unique(f3$TUMOUR.PMCDP))
sort(unique(f4$TUMOUR.PMCFREQ))
sort(unique(f5$GnomAD_v2.1_non_cancer_AF))
sort(unique(f6$GnomAD_v3_AF))
samples_WES <- unique(f$Tumour_Sample)
for (sample_WES in samples_WES){
  dir.create(sample_WES)
  setwd(sample_WES)
f6 %>% 
  filter(CANONICAL%in%"YES") %>% 
  write_excel_csv(path=paste(sample_WES,"tumour_germline_exome_somatic_caller_file_filtered.csv",sep="_"), na=".",append=FALSE,col_names=TRUE)
}
sort(unique(ff2$TUMOUR.PMCDP))
sort(unique(ff3$TUMOUR.PMCAD))
sort(unique(ff4$TUMOUR.PMCFREQ))
sort(unique(ff5$gnomAD_AF))
samples_WGS <- unique(ff$Tumour_Sample)
for (sample_WGS in samples_WGS){
  dir.create(sample_WGS)
  setwd(sample_WGS)
ff5 %>% 
  filter(MANE_Status!=".") %>% 
  write_excel_csv(path=paste(sample_WGS,"tumour_germline_genome_somatic_caller_file_filtered.csv",sep="_"), na=".",append=FALSE,col_names=TRUE)
}

Plot post-filtering distribution of tumour alt allele read number (TUMOUR.PMCAD, exomes only), depth (TUMOUR.PMCDP/DP, exomes and genomes) and frequency (TUMOUR.PMCFREQ/AF, exomes and genomes).

sort(unique(f6$TUMOUR.PMCAD))
hist(f6$TUMOUR.PMCAD, breaks=2000, main="Distribution of TUMOUR.PMCAD Values", xlab="TUMOUR.PMCAD",xlim=c(0,20))
sort(unique(f6$TUMOUR.PMCDP))
hist(f6$TUMOUR.PMCDP, breaks=4000, main="Distribution of TUMOUR.PMCDP Values", xlab="TUMOUR.PMCDP",xlim=c(0,100))
sort(unique(f6$TUMOUR.PMCFREQ))
hist(f6$TUMOUR.PMCFREQ, breaks=200, main="Distribution of TUMOUR.PMCFREQ Values", xlab="TUMOUR.PMCFREQ",xlim=c(0,1))

sort(unique(ff5$TUMOUR.PMCDP))
hist(ff5$TUMOUR.PMCDP, breaks=4000, main="Distribution of TUMOUR.PMCDP Values", xlab="TUMOUR.PMCDP",xlim=c(0,100))
sort(unique(ff4$TUMOUR.PMCFREQ))
hist(ff5$TUMOUR.PMCFREQ, breaks=200, main="Distribution of TUMOUR.PMCFREQ Values", xlab="TUMOUR.PMCFREQ",xlim=c(0,1))

Extract remaining variants and columns used for mutation signature analysis, and rename sample column.

f7 <- select(f6,c(CHROM,POS,REF,ALT,Variant_Type))
unique(f7$Variant_Type)
f7$Sample <- f6$Tumour_Sample

ff6<-select(ff5,c(CHROM,POS,REF,ALT)) %>% 
    filter(!CHROM%in%"chrY")
ff6$Sample<-sample_WGS

Output mutations for SIGNAL mutation signature analysis, with count of number of variants used and filtering metrics.

mut.ref_f <- f7
mut.ref_f$CHROM<-unlist(lapply(mut.ref_f$CHROM,function(x) paste("chr",x,sep=""))) 
mut.ref_edit_f <- mut.ref_f[,c("Sample","CHROM","POS","REF","ALT")]

for (sample_WES in samples_WES){
  setwd(sample_WES)
  mut.ref2_f<-mut.ref_edit_f[mut.ref_edit_f$Sample==sample_WES,]
  mut.ref2_f<-unique(mut.ref2_f)
  mut.Num_f <- nrow(mut.ref2_f)
  s <- paste("The number of variants in sample", sample_WES, "used for mutation signature analysis is", mut.Num_f)
  t <- paste("The minimum TUMOUR.PMCAD in sample", sample_WES, "used for filtering is", min(unique(f2$TUMOUR.PMCAD)), "and the minimum figure for the variants used for mutation signature analysis is", min(unique(f7$TUMOUR.PMCAD)))
  u <- paste("The minimum TUMOUR.PMCDP in sample", sample_WES, "used for filtering is", min(unique(f3$TUMOUR.PMCDP)), "and the minimum figure for the variants used for mutation signature analysis is", min(unique(f7$TUMOUR.PMCDP)))
  v <- paste("The minimum TUMOUR.PMCFREQ in sample", sample_WES, "used for filtering is", min(unique(f4$TUMOUR.PMCFREQ)), "and the minimum figure for the variants used for mutation signature analysis is", min(unique(f7$TUMOUR.PMCFREQ)))
  w <- paste("The maximuum GnomAD_v2.1_non_cancer_AF in sample", sample_WES, "used for filtering is", max(unique(f5$GnomAD_v2.1_non_cancer_AF)), "and the maximum figure for the variants used for mutation signature analysis is", max(unique(f7$GnomAD_v2.1_non_cancer_AF)))
  x <- paste("The maximum GnomAD_v3_AF in sample", sample_WES, "used for filtering and for the variants used for mutation signature analysis is", max(unique(f7$GnomAD_v3_AF)))
  parameters <- print(c(s,t,u,v,w,x))
  mut.ref3_f = mut.ref2_f[,c(3:5)]
  mut.ref_f.chr = tibble(gsub("[chr]","", mut.ref2_f$CHROM))
  bind_cols(mut.ref_f.chr,mut.ref3_f) %>% write_tsv(path=paste(sample_WES,"unique_mutations_forSignal.tsv",sep="_"),col_names=FALSE)
  write.table(parameters,file=paste(sample_WGS,"_parameters.txt",sep=""),quote=F,row.names = F,sep="\t")
}
mut.ref_ff <- ff6
mut.ref_edit_ff<-mut.ref_ff[,c("Sample","CHROM","POS","REF","ALT")]
for (sample_WGS in samples_WGS){
  setwd(sample_WGS)
  mut.ref2_ff<-mut.ref_edit_ff[mut.ref_edit_ff$Sample==sample_WGS,]
  mut.ref2_ff<-unique(mut.ref2_ff)
  mut.Num_ff <- nrow(mut.ref2_ff)
  s <- paste("The number of variants in sample", sample_WGS, "used for mutation signature analysis is", mut.Num_ff)
  t <- paste("The minimum TUMOUR.PMCAD in sample", sample_WGS, "used for filtering is", min(unique(f2$TUMOUR.PMCAD)), "and the minimum figure for the variants used for mutation signature analysis is", min(unique(f4$TUMOUR.PMCAD)))
  u <- paste("The minimum TUMOUR.PMCDP in sample", sample_WGS, "used for filtering is", min(unique(ff2$TUMOUR.PMCDP)), "and the minimum figure for the variants used for mutation signature analysis is", min(unique(ff2$TUMOUR.PMCDP)))
  v <- paste("The minimum TUMOUR.PMCFREQ in sample", sample_WGS, "used for filtering is", min(unique(ff3$TUMOUR.PMCFREQ)), "and the minimum figure for the variants used for mutation signature analysis is", min(unique(ff3$TUMOUR.PMCFREQ)))
  w <- paste("The maximum GnomAD_v3_AF in sample", sample_WGS, "used for filtering and for the variants used for mutation signature analysis is", max(unique(ff4$gnomAD_AF)))
  parameters <- print(c(s,t,u,v,w))
  mut.ref3_ff = mut.ref2_ff[,c(3:5)]
  mut.ref_ff.chr = tibble(gsub("[chr]","", mut.ref2_ff$CHROM))
  bind_cols(mut.ref2_ff$Sample,mut.ref_ff.chr,mut.ref3_ff) %>% write_tsv(path=paste(sample_WGS,"unique_mutations_forSignal.tsv",sep="_"),col_names=FALSE)
  write.table(parameters,file=paste(sample_WGS,"_parameters.txt",sep=""),quote=F,row.names = F,sep="\t")
}

Extract rows with variants in gene(s) of interest pre- and post-filtering (e.g. TP53) from somatic variants file (exomes and genomes), and save output.

HGS_tumour_genes <- c("TP53","BRCA1","BRCA2","PTEN","NF1","RB1","CDK12","CDKN2A")

setwd(sample_WES)
f0_genes <- filter(f0,(SYMBOL%in%HGS_tumour_genes)|
                     (SYMBOL%in%c(genes))) %>% 
  write_excel_csv(path=paste(samples_WES,"tumour_germline_exome_somatic_caller_genes_unfiltered.csv",sep="_"), na=".",append=FALSE,col_names=TRUE)
f6_genes <- filter(f6,(SYMBOL%in%HGS_tumour_genes)|
                     (SYMBOL%in%c(genes))) %>% 
  write_excel_csv(sample,path=paste(samples_WES,"tumour_germline_exome_somatic_caller_genes_filtered.csv",sep="_"), na=".",append=FALSE,col_names=TRUE)
setwd(sample_WGS)
ff_genes <- filter(ff,(SYMBOL%in%HGS_tumour_genes)|
                     (SYMBOL%in%c(genes))) %>% 
  write_excel_csv(path=paste(samples_WGS,"tumour_germline_genome_somatic_caller_genes_unfiltered.csv",sep="_"), na=".",append=FALSE,col_names=TRUE)
ff5_genes <- filter(ff5,(SYMBOL%in%HGS_tumour_genes)|
                     (SYMBOL%in%c(genes))) %>% 
  write_excel_csv(sample_WGS,path=paste(samples_WGS,"tumour_germline_genome_somatic_caller_genes_filtered.csv",sep="_"), na=".",append=FALSE,col_names=TRUE)

Extract equivalent rows with variants in gene(s) of interest (one gene transcript per variant) from tumour-only variants file (exomes only- N/A for unpaired somatic tumour data), and save output.

setwd(sample_WES)
g_genes <- filter(g,(SYMBOL%in%HGS_tumour_genes)|
                     (SYMBOL%in%c(genes))) %>% 
  arrange(Transcript_Index) %>% 
  distinct(CHROM,POS,REF,ALT,.keep_all=TRUE)
write_excel_csv(g_genes,path=paste(sample_WES,"tumour_germline_paired_exomes_HAP_genes.csv",sep="_"), na=".",append=FALSE,col_names=TRUE)
---
title: "Tumour sequencing data filtering R script for 'Assessment of candidate high-grade serous ovarian carcinoma predisposition genes through integrated germline and tumour sequencing' (Subramanian et al., npj Genomic Medicine, 2024)"
output: html_notebook
---

ALL IMPORTED FILES MUST BE IN THE SAME DIRECTORY AS THIS SCRIPT

Load tidyverse package.
```{r}
library("tidyverse")
```

Import individual tumour exome somatic variants ("tumour_sample_germline_exome_somatic_caller_file.tsv") and tumour-only variants ("tumour_sample_germline_paired_exomes_HAP.tsv"- N/A for unpaired somatic tumour data) annotated vcf files as 'f' and 'g' data objects, alongside linked sample ("ViP Samples & Candidate Genes.txt") and tumour purity ("Tumour Purity.txt") files. All files available from the authors on reasonable request.
```{r}
max.callers = 3
f <- read.table("tumour_sample_germline_exome_somatic_caller_file.tsv", header=TRUE, stringsAsFactors=FALSE, sep="\t", comment.char="", quote="") %>%
  dplyr::rename("Tumour_Sample"="X.Tumour_Sample") %>% 
  mutate(n.callers = setNames(sapply(strsplit(unique(Identified), "-"), 
                                     function(callers){
                                       ifelse("Intersection" %in% callers, max.callers, sum(!grepl("^filter", callers)))
                                       }), unique(Identified))[Identified])
g <- read.delim("tumour_sample_germline_paired_exomes_HAP.tsv", header=TRUE, stringsAsFactors=FALSE, sep="\t", comment.char="") %>% 
  dplyr::rename("Tumour_Sample"="X.Tumour_Sample")
ViP_list <- read.delim("ViP Samples & Candidate Genes.txt", header=TRUE, stringsAsFactors=FALSE, sep="\t", comment.char="") %>% 
  arrange(Sample)
tumour_type <- select(ViP_list,Sample,Tumour.Type) %>% distinct() %>% rename("Normal_Sample"="Sample")
f$Normal_Sample <- as.character(f$Normal_Sample)
g$Normal_Sample <- as.character(g$Normal_Sample)
f <- left_join(f,tumour_type,by="Normal_Sample",copy=FALSE)
g <- left_join(g,tumour_type,by="Normal_Sample",copy=FALSE) %>% filter(CANONICAL%in%"YES")
gene_list <- select(ViP_list,Sample,SYMBOL) %>% filter(Sample%in%(f$Normal_Sample))
genes <- gene_list$SYMBOL
tumour_purity <- read.delim("Tumour Purity.txt", header=TRUE, stringsAsFactors=FALSE, sep="\t", comment.char="") %>%
  select(Tumour_Sample,Purity,No_Mutations)
f <- left_join(f,tumour_purity,by="Tumour_Sample",copy=FALSE)
g <- left_join(g,tumour_purity,by="Tumour_Sample",copy=FALSE)
```

Import individual tumour genome somatic variants ("tumour_germline_genome_somatic_caller_file.tsv") annotated vcf file as 'ff' data object. All files available from the authors on reasonable request.
```{r}
ff <- read.table("tumour_germline_genome_somatic_caller_file.tsv", header=TRUE, stringsAsFactors=FALSE, sep="\t", comment.char="", quote="") %>%
  dplyr::rename("Tumour_Sample"="X.Tumour_Sample")
ff$Normal_Sample <- as.character(ff$Normal_Sample)
ff <- left_join(ff,tumour_purity,by="Tumour_Sample",copy=FALSE)
```

Convert all relevant strings to integers/doubles, and (for somatic exome data only) remove variants that fail QC in one or more somatic pipeline callers or are Consequence 7 (i.e. IMPACT==MODIFIER).
```{r}
f$QUAL <- as.numeric(f$QUAL) %>% replace_na(0)
f$TUMOUR.PMCAD <- as.numeric(f$TUMOUR.PMCAD) %>% replace_na(0)
f$TUMOUR.PMCDP <- as.numeric(f$TUMOUR.PMCDP) %>% replace_na(0)
f$TUMOUR.PMCFREQ <- as.numeric(f$TUMOUR.PMCFREQ) %>% replace_na(0)
f$NORMAL.PMCAD <- as.numeric(f$NORMAL.PMCAD) %>% replace_na(0)
f$NORMAL.PMCDP <- as.numeric(f$NORMAL.PMCDP) %>% replace_na(0)
f$NORMAL.PMCFREQ <- as.numeric(f$NORMAL.PMCFREQ) %>% replace_na(0)
f$GnomAD_v2.1_non_cancer_AF <- as.numeric(f$GnomAD_v2.1_non_cancer_AF) %>% replace_na(0)
f$GnomAD_v3_AF <- as.numeric(f$GnomAD_v3_AF) %>% replace_na(0)

f0 <- filter(f,!Identified%in%c("FilteredInAll")) %>% 
    filter(n.callers >= 2) %>% # n.callers >= 3 for unpaired somatic tumour data (see Supplementary Figure 1a)
    filter(Consequence_Rank < 7)

g$QUAL <- as.numeric(g$QUAL) %>% replace_na(0)
g$TUMOUR.PMCAD <- as.numeric(g$TUMOUR.PMCAD) %>% replace_na(0)
g$TUMOUR.PMCDP <- as.numeric(g$TUMOUR.PMCDP) %>% replace_na(0)
g$TUMOUR.PMCFREQ <- as.numeric(g$TUMOUR.PMCFREQ) %>% replace_na(0)
g$NORMAL.PMCAD <- as.numeric(g$NORMAL.PMCAD) %>% replace_na(0)
g$NORMAL.PMCDP <- as.numeric(g$NORMAL.PMCDP) %>% replace_na(0)
g$NORMAL.PMCFREQ <- as.numeric(g$NORMAL.PMCFREQ) %>% replace_na(0)
g$GnomAD_v2.1_non_cancer_AF <- as.numeric(g$GnomAD_v2.1_non_cancer_AF) %>% replace_na(0)
g$GnomAD_v3_AF <- as.numeric(g$GnomAD_v3_AF) %>% replace_na(0)

ff$QUAL <- as.numeric(ff$QUAL) %>% replace_na(0)
ff$TUMOUR.PMCAD <- as.numeric(ff$TUMOUR.PMCAD) %>% replace_na(0)
ff$TUMOUR.PMCDP <- as.numeric(ff$TUMOUR.PMCDP) %>% replace_na(0)
ff$TUMOUR.PMCFREQ <- as.numeric(ff$TUMOUR.PMCFREQ) %>% replace_na(0)
ff$NORMAL.PMCAD <- as.numeric(ff$NORMAL.PMCAD) %>% replace_na(0)
ff$NORMAL.PMCDP <- as.numeric(ff$NORMAL.PMCDP) %>% replace_na(0)
ff$NORMAL.PMCFREQ <- as.numeric(ff$NORMAL.PMCFREQ) %>% replace_na(0)
ff$gnomAD_AF <- as.numeric(ff$gnomAD_AF) %>% replace_na(0)
```

For somatic caller files, list unique, sorted elements in Identified (Somatic Caller QC results, exomes only), Consequence Rank (exomes only), NORMAL.PMCAD (germline alt allele base depth, exomes only), NORMAL.PMCADP (germline total base depth, exomes and genomes) NORMAL.PMCFREQ (germline alt allele read freq, exomes and genomes), GnomAD v2.1/3.0 AF (exomes and genomes) and QUAL score (exomes only).
```{r}
sort(unique(f0$Identified))
```
```{r}
sort(unique(f0$Consequence_Rank))
```
```{r}
sort(unique(f0$NORMAL.PMCAD))
```
```{r}
sort(unique(f0$NORMAL.PMCDP))
```
```{r}
sort(unique(f0$NORMAL.PMCFREQ))
```
```{r}
sort(unique(f0$GnomAD_v2.1_non_cancer_AF))
```
```{r}
sort(unique(f0$GnomAD_v3_AF))
```
```{r}
sort(unique(f0$QUAL))
```
```{r}
sort(unique(ff$NORMAL.PMCDP))
```
```{r}
sort(unique(ff$NORMAL.PMCFREQ))
```
```{r}
sort(unique(ff$gnomAD_AF))
```

Plot pre-filtering distribution of tumour alt allele read number(TUMOUR.PMCAD, exomes only), depth (TUMOUR.PMCDP, exomes and genomes) and frequency (TUMOUR.PMCFREQ, exomes and genomes).
```{r}
sort(unique(f0$TUMOUR.PMCAD))
hist(f0$TUMOUR.PMCAD, breaks=2000, main="Distribution of TUMOUR.PMCAD Values", xlab="TUMOUR.PMCAD",xlim=c(0,20))
sort(unique(f0$TUMOUR.PMCDP))
hist(f0$TUMOUR.PMCDP, breaks=4000, main="Distribution of TUMOUR.PMCDP Values", xlab="TUMOUR.PMCDP",xlim=c(0,100))
sort(unique(f0$TUMOUR.PMCFREQ))
hist(f0$TUMOUR.PMCFREQ, breaks=200, main="Distribution of TUMOUR.PMCFREQ Values", xlab="TUMOUR.PMCFREQ",xlim=c(0,1))

sort(unique(ff$TUMOUR.PMCDP))
hist(ff$TUMOUR.PMCDP, breaks=4000, main="Distribution of TUMOUR.PMCDP Values", xlab="TUMOUR.PMCDP",xlim=c(0,100))
sort(unique(ff$TUMOUR.PMCFREQ))
hist(ff$TUMOUR.PMCFREQ, breaks=200, main="Distribution of TUMOUR.PMCFREQ Values", xlab="TUMOUR.PMCFREQ",xlim=c(0,1))
```

Filter out and remove elements/rows with high number alt allele reads in germline (NORMAL.PMCAD, exomes only), high germline alt allele read freq (NORMAL.PMCFREQ, exomes and genomes), low germline and tumour read depths (NORMAL.PMCDP and TUMOUR.PMCDP, exomes and genomes), low number alt allele reads in tumour (TUMOUR.PMCAD, exomes only), low tumour alt allele read freq (TUMOUR.PMCFREQ, exomes and genomes) and common variants (using GnomAD v2.1/3.0 AFs, exomes and genomes).
```{r}
f1<-filter(f0, (NORMAL.PMCDP >= 10) & (NORMAL.PMCAD <= 2) & (NORMAL.PMCFREQ < 0.01)) # N/A for unpaired somatic tumour data (see Supplementary Figure 1a)
f2<-filter(f1,TUMOUR.PMCDP >= 20)
f3<-filter(f2,TUMOUR.PMCAD >= 5)
f4<-filter(f3,TUMOUR.PMCFREQ >= (f3$Purity*0.5*(2/3)))
f5<-filter(f4,GnomAD_v2.1_non_cancer_AF <= 0.0001) # GnomAD_v2.1_non_cancer_AF == 0 for unpaired somatic tumour data (see Supplementary Figure 1a)
f6<-filter(f5,GnomAD_v3_AF <= 0.0001) # GnomAD_v3_AF == 0 for unpaired somatic tumour data (see Supplementary Figure 1a)

ff1<-filter(ff,(NORMAL.PMCDP >= 10) & (NORMAL.PMCAD <= 2) & (NORMAL.PMCFREQ < 0.01))
ff2<-filter(ff1,TUMOUR.PMCDP >= 10)
ff3<-filter(ff2,TUMOUR.PMCAD >= 4)
ff4<-filter(ff3,TUMOUR.PMCFREQ >= (ff3$Purity*0.5*(3/4)))
ff5<-filter(ff4,gnomAD_AF == 0) # NB Tumour genome data annotated to GnomAD v3.0 only (see Supplementary Figure 1b)
```

List remaining unique elements from above fields, and save output (exomes- for ENSEMBL CANONICAL transcripts only, genomes- for MANE SELECT transcripts only).
```{r}
sort(unique(f2$TUMOUR.PMCAD))
```
```{r}
sort(unique(f3$TUMOUR.PMCDP))
```
```{r}
sort(unique(f4$TUMOUR.PMCFREQ))
```
```{r}
sort(unique(f5$GnomAD_v2.1_non_cancer_AF))
```
```{r}
sort(unique(f6$GnomAD_v3_AF))
```
```{r}
samples_WES <- unique(f$Tumour_Sample)
for (sample_WES in samples_WES){
  dir.create(sample_WES)
  setwd(sample_WES)
f6 %>% 
  filter(CANONICAL%in%"YES") %>% 
  write_excel_csv(path=paste(sample_WES,"tumour_germline_exome_somatic_caller_file_filtered.csv",sep="_"), na=".",append=FALSE,col_names=TRUE)
}
```
```{r}
sort(unique(ff2$TUMOUR.PMCDP))
```
```{r}
sort(unique(ff3$TUMOUR.PMCAD))
```
```{r}
sort(unique(ff4$TUMOUR.PMCFREQ))
```
```{r}
sort(unique(ff5$gnomAD_AF))
```
```{r}
samples_WGS <- unique(ff$Tumour_Sample)
for (sample_WGS in samples_WGS){
  dir.create(sample_WGS)
  setwd(sample_WGS)
ff5 %>% 
  filter(MANE_Status!=".") %>% 
  write_excel_csv(path=paste(sample_WGS,"tumour_germline_genome_somatic_caller_file_filtered.csv",sep="_"), na=".",append=FALSE,col_names=TRUE)
}
```

Plot post-filtering distribution of tumour alt allele read number (TUMOUR.PMCAD, exomes only), depth (TUMOUR.PMCDP/DP, exomes and genomes) and frequency (TUMOUR.PMCFREQ/AF, exomes and genomes).
```{r}
sort(unique(f6$TUMOUR.PMCAD))
hist(f6$TUMOUR.PMCAD, breaks=2000, main="Distribution of TUMOUR.PMCAD Values", xlab="TUMOUR.PMCAD",xlim=c(0,20))
sort(unique(f6$TUMOUR.PMCDP))
hist(f6$TUMOUR.PMCDP, breaks=4000, main="Distribution of TUMOUR.PMCDP Values", xlab="TUMOUR.PMCDP",xlim=c(0,100))
sort(unique(f6$TUMOUR.PMCFREQ))
hist(f6$TUMOUR.PMCFREQ, breaks=200, main="Distribution of TUMOUR.PMCFREQ Values", xlab="TUMOUR.PMCFREQ",xlim=c(0,1))

sort(unique(ff5$TUMOUR.PMCDP))
hist(ff5$TUMOUR.PMCDP, breaks=4000, main="Distribution of TUMOUR.PMCDP Values", xlab="TUMOUR.PMCDP",xlim=c(0,100))
sort(unique(ff4$TUMOUR.PMCFREQ))
hist(ff5$TUMOUR.PMCFREQ, breaks=200, main="Distribution of TUMOUR.PMCFREQ Values", xlab="TUMOUR.PMCFREQ",xlim=c(0,1))
```

Extract remaining variants and columns used for mutation signature analysis, and rename sample column.
```{r}
f7 <- select(f6,c(CHROM,POS,REF,ALT,Variant_Type))
unique(f7$Variant_Type)
f7$Sample <- f6$Tumour_Sample

ff6<-select(ff5,c(CHROM,POS,REF,ALT)) %>% 
    filter(!CHROM%in%"chrY")
ff6$Sample<-sample_WGS
```

Output mutations for SIGNAL mutation signature analysis, with count of number of variants used and filtering metrics.
```{r}
mut.ref_f <- f7
mut.ref_f$CHROM<-unlist(lapply(mut.ref_f$CHROM,function(x) paste("chr",x,sep=""))) 
mut.ref_edit_f <- mut.ref_f[,c("Sample","CHROM","POS","REF","ALT")]

for (sample_WES in samples_WES){
  setwd(sample_WES)
  mut.ref2_f<-mut.ref_edit_f[mut.ref_edit_f$Sample==sample_WES,]
  mut.ref2_f<-unique(mut.ref2_f)
  mut.Num_f <- nrow(mut.ref2_f)
  s <- paste("The number of variants in sample", sample_WES, "used for mutation signature analysis is", mut.Num_f)
  t <- paste("The minimum TUMOUR.PMCAD in sample", sample_WES, "used for filtering is", min(unique(f2$TUMOUR.PMCAD)), "and the minimum figure for the variants used for mutation signature analysis is", min(unique(f7$TUMOUR.PMCAD)))
  u <- paste("The minimum TUMOUR.PMCDP in sample", sample_WES, "used for filtering is", min(unique(f3$TUMOUR.PMCDP)), "and the minimum figure for the variants used for mutation signature analysis is", min(unique(f7$TUMOUR.PMCDP)))
  v <- paste("The minimum TUMOUR.PMCFREQ in sample", sample_WES, "used for filtering is", min(unique(f4$TUMOUR.PMCFREQ)), "and the minimum figure for the variants used for mutation signature analysis is", min(unique(f7$TUMOUR.PMCFREQ)))
  w <- paste("The maximuum GnomAD_v2.1_non_cancer_AF in sample", sample_WES, "used for filtering is", max(unique(f5$GnomAD_v2.1_non_cancer_AF)), "and the maximum figure for the variants used for mutation signature analysis is", max(unique(f7$GnomAD_v2.1_non_cancer_AF)))
  x <- paste("The maximum GnomAD_v3_AF in sample", sample_WES, "used for filtering and for the variants used for mutation signature analysis is", max(unique(f7$GnomAD_v3_AF)))
  parameters <- print(c(s,t,u,v,w,x))
  mut.ref3_f = mut.ref2_f[,c(3:5)]
  mut.ref_f.chr = tibble(gsub("[chr]","", mut.ref2_f$CHROM))
  bind_cols(mut.ref_f.chr,mut.ref3_f) %>% write_tsv(path=paste(sample_WES,"unique_mutations_forSignal.tsv",sep="_"),col_names=FALSE)
  write.table(parameters,file=paste(sample_WGS,"_parameters.txt",sep=""),quote=F,row.names = F,sep="\t")
}
```
```{r}
mut.ref_ff <- ff6
mut.ref_edit_ff<-mut.ref_ff[,c("Sample","CHROM","POS","REF","ALT")]
for (sample_WGS in samples_WGS){
  setwd(sample_WGS)
  mut.ref2_ff<-mut.ref_edit_ff[mut.ref_edit_ff$Sample==sample_WGS,]
  mut.ref2_ff<-unique(mut.ref2_ff)
  mut.Num_ff <- nrow(mut.ref2_ff)
  s <- paste("The number of variants in sample", sample_WGS, "used for mutation signature analysis is", mut.Num_ff)
  t <- paste("The minimum TUMOUR.PMCAD in sample", sample_WGS, "used for filtering is", min(unique(f2$TUMOUR.PMCAD)), "and the minimum figure for the variants used for mutation signature analysis is", min(unique(f4$TUMOUR.PMCAD)))
  u <- paste("The minimum TUMOUR.PMCDP in sample", sample_WGS, "used for filtering is", min(unique(ff2$TUMOUR.PMCDP)), "and the minimum figure for the variants used for mutation signature analysis is", min(unique(ff2$TUMOUR.PMCDP)))
  v <- paste("The minimum TUMOUR.PMCFREQ in sample", sample_WGS, "used for filtering is", min(unique(ff3$TUMOUR.PMCFREQ)), "and the minimum figure for the variants used for mutation signature analysis is", min(unique(ff3$TUMOUR.PMCFREQ)))
  w <- paste("The maximum GnomAD_v3_AF in sample", sample_WGS, "used for filtering and for the variants used for mutation signature analysis is", max(unique(ff4$gnomAD_AF)))
  parameters <- print(c(s,t,u,v,w))
  mut.ref3_ff = mut.ref2_ff[,c(3:5)]
  mut.ref_ff.chr = tibble(gsub("[chr]","", mut.ref2_ff$CHROM))
  bind_cols(mut.ref2_ff$Sample,mut.ref_ff.chr,mut.ref3_ff) %>% write_tsv(path=paste(sample_WGS,"unique_mutations_forSignal.tsv",sep="_"),col_names=FALSE)
  write.table(parameters,file=paste(sample_WGS,"_parameters.txt",sep=""),quote=F,row.names = F,sep="\t")
}
```

Extract rows with variants in gene(s) of interest pre- and post-filtering (e.g. TP53) from somatic variants file (exomes and genomes), and save output.
```{r}
HGS_tumour_genes <- c("TP53","BRCA1","BRCA2","PTEN","NF1","RB1","CDK12","CDKN2A")

setwd(sample_WES)
f0_genes <- filter(f0,(SYMBOL%in%HGS_tumour_genes)|
                     (SYMBOL%in%c(genes))) %>% 
  write_excel_csv(path=paste(samples_WES,"tumour_germline_exome_somatic_caller_genes_unfiltered.csv",sep="_"), na=".",append=FALSE,col_names=TRUE)
f6_genes <- filter(f6,(SYMBOL%in%HGS_tumour_genes)|
                     (SYMBOL%in%c(genes))) %>% 
  write_excel_csv(sample,path=paste(samples_WES,"tumour_germline_exome_somatic_caller_genes_filtered.csv",sep="_"), na=".",append=FALSE,col_names=TRUE)
```
```{r}
setwd(sample_WGS)
ff_genes <- filter(ff,(SYMBOL%in%HGS_tumour_genes)|
                     (SYMBOL%in%c(genes))) %>% 
  write_excel_csv(path=paste(samples_WGS,"tumour_germline_genome_somatic_caller_genes_unfiltered.csv",sep="_"), na=".",append=FALSE,col_names=TRUE)
ff5_genes <- filter(ff5,(SYMBOL%in%HGS_tumour_genes)|
                     (SYMBOL%in%c(genes))) %>% 
  write_excel_csv(sample_WGS,path=paste(samples_WGS,"tumour_germline_genome_somatic_caller_genes_filtered.csv",sep="_"), na=".",append=FALSE,col_names=TRUE)
```

Extract equivalent rows with variants in gene(s) of interest (one gene transcript per variant) from tumour-only variants file (exomes only- N/A for unpaired somatic tumour data), and save output.
```{r}
setwd(sample_WES)
g_genes <- filter(g,(SYMBOL%in%HGS_tumour_genes)|
                     (SYMBOL%in%c(genes))) %>% 
  arrange(Transcript_Index) %>% 
  distinct(CHROM,POS,REF,ALT,.keep_all=TRUE)
write_excel_csv(g_genes,path=paste(sample_WES,"tumour_germline_paired_exomes_HAP_genes.csv",sep="_"), na=".",append=FALSE,col_names=TRUE)
```