ALL IMPORTED FILES MUST BE IN THE SAME DIRECTORY AS THIS SCRIPT

Load tidyverse package

library("tidyverse")

Import individual tumour exome somatic variants (‘f’) and tumour-only variants (‘g’) files

max.callers = 3
f <- read.table("tumour_sample_germline_exome_somatic_caller_file.tsv", header=TRUE, stringsAsFactors=FALSE, sep="\t", comment.char="", quote="") %>%
  dplyr::rename("Tumour_Sample"="X.Tumour_Sample") %>% 
  mutate(n.callers = setNames(sapply(strsplit(unique(Identified), "-"), 
                                     function(callers){
                                       ifelse("Intersection" %in% callers, max.callers, sum(!grepl("^filter", callers)))
                                       }), unique(Identified))[Identified])
g <- read.delim("tumour_sample_germline_paired_exomes_HAP.tsv", header=TRUE, stringsAsFactors=FALSE, sep="\t", comment.char="") %>% 
  dplyr::rename("Tumour_Sample"="X.Tumour_Sample")
names(f)
names(g)
sort(unique(f$Tumour_Sample))
sort(unique(g$Tumour_Sample))
ViP_list <- read.delim("ViP Samples & Candidate Genes.txt", header=TRUE, stringsAsFactors=FALSE, sep="\t", comment.char="") %>% 
  arrange(Sample)
tumour_type <- select(ViP_list,Sample,Tumour.Type) %>% distinct() %>% rename("Sample"="Normal_Sample")
f$Normal_Sample <- as.character(f$Normal_Sample)
g$Normal_Sample <- as.character(g$Normal_Sample)
f <- left_join(f,tumour_type,by="Normal_Sample",copy=FALSE)
g0 <- left_join(g,tumour_type,by="Normal_Sample",copy=FALSE) %>% filter(CANONICAL%in%"YES")
gene_list <- select(ViP_list,Sample,SYMBOL) %>% filter(Sample%in%(f$Normal_Sample))
genes <- gene_list$SYMBOL
tumour_purity <- read.delim("Tumour Purity.txt", header=TRUE, stringsAsFactors=FALSE, sep="\t", comment.char="") %>%
  select(Tumour_Sample,Purity,No_Mutations)
f <- left_join(f,tumour_purity,by="Tumour_Sample",copy=FALSE)
g0 <- left_join(g0,tumour_purity,by="Tumour_Sample",copy=FALSE)

Convert all relevant strings to integers/doubles, and remove variants that fail QC in one or more somatic pipeline callers or are Consequence 7 (IMPACT==MODIFIER)

f$QUAL <- as.numeric(f$QUAL) %>% replace_na(0)
f$TUMOUR.PMCAD <- as.numeric(f$TUMOUR.PMCAD) %>% replace_na(0)
f$TUMOUR.PMCDP <- as.numeric(f$TUMOUR.PMCDP) %>% replace_na(0)
f$TUMOUR.PMCFREQ <- as.numeric(f$TUMOUR.PMCFREQ) %>% replace_na(0)
f$NORMAL.PMCAD <- as.numeric(f$NORMAL.PMCAD) %>% replace_na(0)
f$NORMAL.PMCDP <- as.numeric(f$NORMAL.PMCDP) %>% replace_na(0)
f$NORMAL.PMCFREQ <- as.numeric(f$NORMAL.PMCFREQ) %>% replace_na(0)
f$GnomAD_v3_AF <- as.numeric(f$GnomAD_v3_AF) %>% replace_na(0)

g$QUAL <- as.numeric(g$QUAL) %>% replace_na(0)
g$TUMOUR.PMCAD <- as.numeric(g$TUMOUR.PMCAD) %>% replace_na(0)
g$TUMOUR.PMCDP <- as.numeric(g$TUMOUR.PMCDP) %>% replace_na(0)
g$TUMOUR.PMCFREQ <- as.numeric(g$TUMOUR.PMCFREQ) %>% replace_na(0)
g$NORMAL.PMCAD <- as.numeric(g$NORMAL.PMCAD) %>% replace_na(0)
g$NORMAL.PMCDP <- as.numeric(g$NORMAL.PMCDP) %>% replace_na(0)
g$NORMAL.PMCFREQ <- as.numeric(g$NORMAL.PMCFREQ) %>% replace_na(0)
g$GnomAD_v3_AF <- as.numeric(g$GnomAD_v3_AF) %>% replace_na(0)

f0 <- filter(f,!Identified%in%c("FilteredInAll")) %>% 
    filter(n.callers>=2) %>%
    filter(Consequence_Rank<7)

List unique, sorted elements in Identified (Somatic Caller QC results), Consequence Rank, NORMAL.PMCAD (germline alt allele base depth), NORMAL.PMCADP (germline total base depth) NORMAL.PMCFREQ (germline alt allele read freq), GnomAD v2.1/3 AF and QUAL score

sort(unique(f0$Identified))
sort(unique(f0$Consequence_Rank))
sort(unique(f0$NORMAL.PMCAD))
sort(unique(f0$NORMAL.PMCDP))
sort(unique(f0$NORMAL.PMCFREQ))
sort(unique(f0$GnomAD_v2.1_non_cancer_AF))
sort(unique(f0$GnomAD_v3_AF))
sort(unique(f0$QUAL))

Plot pre-filtering distribution of tumour alt allele read number(TUMOUR.PMCAD), depth (TUMOUR.PMCDP) and frequency (TUMOUR.PMCFREQ)

sort(unique(f0$TUMOUR.PMCAD))
hist(f0$TUMOUR.PMCAD, breaks=2000, main="Distribution of TUMOUR.PMCAD Values", xlab="TUMOUR.PMCAD",xlim=c(0,20))
sort(unique(f0$TUMOUR.PMCDP))
hist(f0$TUMOUR.PMCDP, breaks=4000, main="Distribution of TUMOUR.PMCDP Values", xlab="TUMOUR.PMCDP",xlim=c(0,100))
sort(unique(f0$TUMOUR.PMCFREQ))
hist(f0$TUMOUR.PMCFREQ, breaks=200, main="Distribution of TUMOUR.PMCFREQ Values", xlab="TUMOUR.PMCFREQ",xlim=c(0,1))

Filter out and remove elements/rows with high number alt allele reads in germline (NORMAL.PMCAD), high germline alt allele read freq (NORMAL.PMCFREQ), low germline and tumour read depths (NORMAL/TUMOUR.PMCDP), low number alt allele reads in tumour (TUMOUR.PMCAD), low tumour alt allele read freq (TUMOUR.PMCFREQ) and common variants (using GnomAD v2.1/3 AFs); list remaining unique elements from those fields; and save output (for ENSEMBL CANONICAL transcripts only)

f1<-filter(f0,(NORMAL.PMCAD <= 2) & (NORMAL.PMCDP >= 10) & (NORMAL.PMCFREQ < 0.01))
f2<-filter(f1,TUMOUR.PMCAD >= 5)
f3<-filter(f2,TUMOUR.PMCDP>=20)
f4<-filter(f3,TUMOUR.PMCFREQ >= (f3$Purity*0.5*(2/3)))
f5<-filter(f4,GnomAD_v2.1_non_cancer_AF <= 0.0001)
f6<-filter(f5,GnomAD_v3_AF <= 0.0001)

sort(unique(f1$QUAL))
sort(unique(f2$TUMOUR.PMCAD))
sort(unique(f2$QUAL))
sort(unique(f3$TUMOUR.PMCDP))
sort(unique(f3$QUAL))
sort(unique(f4$TUMOUR.PMCFREQ))
sort(unique(f4$QUAL))
sort(unique(f5$GnomAD_v2.1_non_cancer_AF))
sort(unique(f6$GnomAD_v3_AF))
samples<-unique(f$Tumour_Sample)
for (sample in samples){
  dir.create(sample)
  setwd(sample)
f6 %>% 
  filter(CANONICAL%in%"YES") %>% 
  write_excel_csv(path=paste(samples,"tumour_germline_exome_somatic_caller_file_filtered.csv",sep="_"), na=".",append=FALSE,col_names=TRUE)
}

Plot post-filtering distribution of tumour alt allele read number(TUMOUR.PMCAD), depth (TUMOUR.PMCDP) and frequency (TUMOUR.PMCFREQ)

sort(unique(f6$TUMOUR.PMCAD))
hist(f6$TUMOUR.PMCAD, breaks=2000, main="Distribution of TUMOUR.PMCAD Values", xlab="TUMOUR.PMCAD",xlim=c(0,20))
sort(unique(f6$TUMOUR.PMCDP))
hist(f6$TUMOUR.PMCDP, breaks=4000, main="Distribution of TUMOUR.PMCDP Values", xlab="TUMOUR.PMCDP",xlim=c(0,100))
sort(unique(f6$TUMOUR.PMCFREQ))
hist(f6$TUMOUR.PMCFREQ, breaks=200, main="Distribution of TUMOUR.PMCFREQ Values", xlab="TUMOUR.PMCFREQ",xlim=c(0,1))

Extract rows with variants in gene(s) of interest pre- and post-filtering (e.g. TP53) from somatic variants file, and save output

HGS_tumour_genes <- c("TP53","BRCA1","BRCA2","PTEN","NF1","RB1","CDK12","CDKN2A")
END_tumour_genes <- c("ARID1A","PIK3CA","PTEN","CTNNB1","KRAS","BRAF","TP53","MET")
setwd(sample)
f0_genes <- if (f0$Tumour.Type%in%"High-grade endometrioid")
{
  filter(f0,(SYMBOL%in%END_tumour_genes)|
                     (SYMBOL%in%c(genes))) %>% 
  write_excel_csv(path=paste(samples,"tumour_germline_exome_somatic_caller_genes_unfiltered.csv",sep="_"), na=".",append=FALSE,col_names=TRUE)
}else{
  filter(f0,(SYMBOL%in%HGS_tumour_genes)|
                     (SYMBOL%in%c(genes))) %>% 
  write_excel_csv(path=paste(samples,"tumour_germline_exome_somatic_caller_genes_unfiltered.csv",sep="_"), na=".",append=FALSE,col_names=TRUE)
}
f6_genes <- if (f6$Tumour.Type%in%"High-grade endometrioid")
{
  filter(f6,(SYMBOL%in%END_tumour_genes)|
                     (SYMBOL%in%c(genes))) %>% 
  write_excel_csv(path=paste(samples,"tumour_germline_exome_somatic_caller_genes_filtered.csv",sep="_"), na=".",append=FALSE,col_names=TRUE)
}else{
  filter(f6,(SYMBOL%in%HGS_tumour_genes)|
                     (SYMBOL%in%c(genes))) %>% 
  write_excel_csv(sample,path=paste(samples,"tumour_germline_exome_somatic_caller_genes_filtered.csv",sep="_"), na=".",append=FALSE,col_names=TRUE)
}

Extract equivalent rows with variants in gene(s) of interest (one gene transcript per variant) from tumour-only variants file, and save output

setwd(sample)
g_genes <-
  if (g0$Tumour.Type%in%"High-grade endometrioid")
{
  filter(g0,(SYMBOL%in%END_tumour_genes)|
                     (SYMBOL%in%c(genes)))
}else{
  filter(g0,(SYMBOL%in%HGS_tumour_genes)|
                     (SYMBOL%in%c(genes)))
} %>% 
  arrange(Transcript_Index) %>% 
  distinct(CHROM,POS,REF,ALT,.keep_all=TRUE)
  write_excel_csv(g_genes,path=paste(sample,"tumour_germline_paired_exomes_HAP_genes.csv",sep="_"), na=".",append=FALSE,col_names=TRUE)
---
title: "Thesis Tumour Exome Variants Filtering Script"
output: html_notebook
---

ALL IMPORTED FILES MUST BE IN THE SAME DIRECTORY AS THIS SCRIPT

Load tidyverse package
```{r}
library("tidyverse")
```

Import individual tumour exome somatic variants ('f') and tumour-only variants ('g') files
```{r}
max.callers = 3
f <- read.table("tumour_sample_germline_exome_somatic_caller_file.tsv", header=TRUE, stringsAsFactors=FALSE, sep="\t", comment.char="", quote="") %>%
  dplyr::rename("Tumour_Sample"="X.Tumour_Sample") %>% 
  mutate(n.callers = setNames(sapply(strsplit(unique(Identified), "-"), 
                                     function(callers){
                                       ifelse("Intersection" %in% callers, max.callers, sum(!grepl("^filter", callers)))
                                       }), unique(Identified))[Identified])
g <- read.delim("tumour_sample_germline_paired_exomes_HAP.tsv", header=TRUE, stringsAsFactors=FALSE, sep="\t", comment.char="") %>% 
  dplyr::rename("Tumour_Sample"="X.Tumour_Sample")
names(f)
names(g)
sort(unique(f$Tumour_Sample))
sort(unique(g$Tumour_Sample))
ViP_list <- read.delim("ViP Samples & Candidate Genes.txt", header=TRUE, stringsAsFactors=FALSE, sep="\t", comment.char="") %>% 
  arrange(Sample)
tumour_type <- select(ViP_list,Sample,Tumour.Type) %>% distinct() %>% rename("Sample"="Normal_Sample")
f$Normal_Sample <- as.character(f$Normal_Sample)
g$Normal_Sample <- as.character(g$Normal_Sample)
f <- left_join(f,tumour_type,by="Normal_Sample",copy=FALSE)
g0 <- left_join(g,tumour_type,by="Normal_Sample",copy=FALSE) %>% filter(CANONICAL%in%"YES")
gene_list <- select(ViP_list,Sample,SYMBOL) %>% filter(Sample%in%(f$Normal_Sample))
genes <- gene_list$SYMBOL
tumour_purity <- read.delim("Tumour Purity.txt", header=TRUE, stringsAsFactors=FALSE, sep="\t", comment.char="") %>%
  select(Tumour_Sample,Purity,No_Mutations)
f <- left_join(f,tumour_purity,by="Tumour_Sample",copy=FALSE)
g0 <- left_join(g0,tumour_purity,by="Tumour_Sample",copy=FALSE)
```

Convert all relevant strings to integers/doubles, and remove variants that fail QC in one or more somatic pipeline callers or are Consequence 7 (IMPACT==MODIFIER)
```{r}
f$QUAL <- as.numeric(f$QUAL) %>% replace_na(0)
f$TUMOUR.PMCAD <- as.numeric(f$TUMOUR.PMCAD) %>% replace_na(0)
f$TUMOUR.PMCDP <- as.numeric(f$TUMOUR.PMCDP) %>% replace_na(0)
f$TUMOUR.PMCFREQ <- as.numeric(f$TUMOUR.PMCFREQ) %>% replace_na(0)
f$NORMAL.PMCAD <- as.numeric(f$NORMAL.PMCAD) %>% replace_na(0)
f$NORMAL.PMCDP <- as.numeric(f$NORMAL.PMCDP) %>% replace_na(0)
f$NORMAL.PMCFREQ <- as.numeric(f$NORMAL.PMCFREQ) %>% replace_na(0)
f$GnomAD_v3_AF <- as.numeric(f$GnomAD_v3_AF) %>% replace_na(0)

g$QUAL <- as.numeric(g$QUAL) %>% replace_na(0)
g$TUMOUR.PMCAD <- as.numeric(g$TUMOUR.PMCAD) %>% replace_na(0)
g$TUMOUR.PMCDP <- as.numeric(g$TUMOUR.PMCDP) %>% replace_na(0)
g$TUMOUR.PMCFREQ <- as.numeric(g$TUMOUR.PMCFREQ) %>% replace_na(0)
g$NORMAL.PMCAD <- as.numeric(g$NORMAL.PMCAD) %>% replace_na(0)
g$NORMAL.PMCDP <- as.numeric(g$NORMAL.PMCDP) %>% replace_na(0)
g$NORMAL.PMCFREQ <- as.numeric(g$NORMAL.PMCFREQ) %>% replace_na(0)
g$GnomAD_v3_AF <- as.numeric(g$GnomAD_v3_AF) %>% replace_na(0)

f0 <- filter(f,!Identified%in%c("FilteredInAll")) %>% 
    filter(n.callers>=2) %>%
    filter(Consequence_Rank<7)
```

List unique, sorted elements in Identified (Somatic Caller QC results), Consequence Rank, NORMAL.PMCAD (germline alt allele base depth), NORMAL.PMCADP (germline total base depth) NORMAL.PMCFREQ (germline alt allele read freq), GnomAD v2.1/3 AF and QUAL score
```{r}
sort(unique(f0$Identified))
```
```{r}
sort(unique(f0$Consequence_Rank))
```
```{r}
sort(unique(f0$NORMAL.PMCAD))
```
```{r}
sort(unique(f0$NORMAL.PMCDP))
```
```{r}
sort(unique(f0$NORMAL.PMCFREQ))
```
```{r}
sort(unique(f0$GnomAD_v2.1_non_cancer_AF))
```
```{r}
sort(unique(f0$GnomAD_v3_AF))
```
```{r}
sort(unique(f0$QUAL))
```

Plot pre-filtering distribution of tumour alt allele read number(TUMOUR.PMCAD), depth (TUMOUR.PMCDP) and frequency (TUMOUR.PMCFREQ)
```{r}
sort(unique(f0$TUMOUR.PMCAD))
hist(f0$TUMOUR.PMCAD, breaks=2000, main="Distribution of TUMOUR.PMCAD Values", xlab="TUMOUR.PMCAD",xlim=c(0,20))
sort(unique(f0$TUMOUR.PMCDP))
hist(f0$TUMOUR.PMCDP, breaks=4000, main="Distribution of TUMOUR.PMCDP Values", xlab="TUMOUR.PMCDP",xlim=c(0,100))
sort(unique(f0$TUMOUR.PMCFREQ))
hist(f0$TUMOUR.PMCFREQ, breaks=200, main="Distribution of TUMOUR.PMCFREQ Values", xlab="TUMOUR.PMCFREQ",xlim=c(0,1))
```

Filter out and remove elements/rows with high number alt allele reads in germline (NORMAL.PMCAD), high germline alt allele read freq (NORMAL.PMCFREQ), low germline and tumour read depths (NORMAL/TUMOUR.PMCDP), low number alt allele reads in tumour (TUMOUR.PMCAD), low tumour alt allele read freq (TUMOUR.PMCFREQ) and common variants (using GnomAD v2.1/3 AFs); list remaining unique elements from those fields; and save output (for ENSEMBL CANONICAL transcripts only)
```{r}
f1<-filter(f0,(NORMAL.PMCAD <= 2) & (NORMAL.PMCDP >= 10) & (NORMAL.PMCFREQ < 0.01))
f2<-filter(f1,TUMOUR.PMCAD >= 5)
f3<-filter(f2,TUMOUR.PMCDP>=20)
f4<-filter(f3,TUMOUR.PMCFREQ >= (f3$Purity*0.5*(2/3)))
f5<-filter(f4,GnomAD_v2.1_non_cancer_AF <= 0.0001)
f6<-filter(f5,GnomAD_v3_AF <= 0.0001)

sort(unique(f1$QUAL))
```
```{r}
sort(unique(f2$TUMOUR.PMCAD))
sort(unique(f2$QUAL))
```
```{r}
sort(unique(f3$TUMOUR.PMCDP))
sort(unique(f3$QUAL))
```
```{r}
sort(unique(f4$TUMOUR.PMCFREQ))
sort(unique(f4$QUAL))
```
```{r}
sort(unique(f5$GnomAD_v2.1_non_cancer_AF))
```
```{r}
sort(unique(f6$GnomAD_v3_AF))
```
```{r}
samples<-unique(f$Tumour_Sample)
for (sample in samples){
  dir.create(sample)
  setwd(sample)
f6 %>% 
  filter(CANONICAL%in%"YES") %>% 
  write_excel_csv(path=paste(samples,"tumour_germline_exome_somatic_caller_file_filtered.csv",sep="_"), na=".",append=FALSE,col_names=TRUE)
}
```

Plot post-filtering distribution of tumour alt allele read number(TUMOUR.PMCAD), depth (TUMOUR.PMCDP) and frequency (TUMOUR.PMCFREQ)
```{r}
sort(unique(f6$TUMOUR.PMCAD))
hist(f6$TUMOUR.PMCAD, breaks=2000, main="Distribution of TUMOUR.PMCAD Values", xlab="TUMOUR.PMCAD",xlim=c(0,20))
sort(unique(f6$TUMOUR.PMCDP))
hist(f6$TUMOUR.PMCDP, breaks=4000, main="Distribution of TUMOUR.PMCDP Values", xlab="TUMOUR.PMCDP",xlim=c(0,100))
sort(unique(f6$TUMOUR.PMCFREQ))
hist(f6$TUMOUR.PMCFREQ, breaks=200, main="Distribution of TUMOUR.PMCFREQ Values", xlab="TUMOUR.PMCFREQ",xlim=c(0,1))
```

Extract rows with variants in gene(s) of interest pre- and post-filtering (e.g. TP53) from somatic variants file, and save output
```{r}
HGS_tumour_genes <- c("TP53","BRCA1","BRCA2","PTEN","NF1","RB1","CDK12","CDKN2A")
END_tumour_genes <- c("ARID1A","PIK3CA","PTEN","CTNNB1","KRAS","BRAF","TP53","MET")
setwd(sample)
f0_genes <- if (f0$Tumour.Type%in%"High-grade endometrioid")
{
  filter(f0,(SYMBOL%in%END_tumour_genes)|
                     (SYMBOL%in%c(genes))) %>% 
  write_excel_csv(path=paste(samples,"tumour_germline_exome_somatic_caller_genes_unfiltered.csv",sep="_"), na=".",append=FALSE,col_names=TRUE)
}else{
  filter(f0,(SYMBOL%in%HGS_tumour_genes)|
                     (SYMBOL%in%c(genes))) %>% 
  write_excel_csv(path=paste(samples,"tumour_germline_exome_somatic_caller_genes_unfiltered.csv",sep="_"), na=".",append=FALSE,col_names=TRUE)
}
f6_genes <- if (f6$Tumour.Type%in%"High-grade endometrioid")
{
  filter(f6,(SYMBOL%in%END_tumour_genes)|
                     (SYMBOL%in%c(genes))) %>% 
  write_excel_csv(path=paste(samples,"tumour_germline_exome_somatic_caller_genes_filtered.csv",sep="_"), na=".",append=FALSE,col_names=TRUE)
}else{
  filter(f6,(SYMBOL%in%HGS_tumour_genes)|
                     (SYMBOL%in%c(genes))) %>% 
  write_excel_csv(sample,path=paste(samples,"tumour_germline_exome_somatic_caller_genes_filtered.csv",sep="_"), na=".",append=FALSE,col_names=TRUE)
}
```

Extract equivalent rows with variants in gene(s) of interest (one gene transcript per variant) from tumour-only variants file, and save output
```{r}
setwd(sample)
g_genes <-
  if (g0$Tumour.Type%in%"High-grade endometrioid")
{
  filter(g0,(SYMBOL%in%END_tumour_genes)|
                     (SYMBOL%in%c(genes)))
}else{
  filter(g0,(SYMBOL%in%HGS_tumour_genes)|
                     (SYMBOL%in%c(genes)))
} %>% 
  arrange(Transcript_Index) %>% 
  distinct(CHROM,POS,REF,ALT,.keep_all=TRUE)
  write_excel_csv(g_genes,path=paste(sample,"tumour_germline_paired_exomes_HAP_genes.csv",sep="_"), na=".",append=FALSE,col_names=TRUE)
```
