ALL IMPORTED FILES MUST BE IN THE SAME DIRECTORY AS THIS SCRIPT

Load tidyverse package

setwd("~/OneDrive - The University of Melbourne/R data")
library("tidyverse")

Import raw vcf data from “masterfile” and exclude variants with GnomAD non-cancer AF > 0

masterfile <- read.delim("masterfile.tsv", header=TRUE, row.names=NULL, na.strings = ".", stringsAsFactors=FALSE) %>% 
  dplyr::rename("Sample"="X.Sample") %>% 
  filter(GnomAD_v2.1_non_cancer_AF==0)

Exclude non-epithelial, uterine-only and BRCA +ve samples

ViP_Complete_Cohort <- read.delim("ViP_LoF_Complete_Cohort.txt", stringsAsFactors=FALSE)
ViP_Complete_Cohort_list <- ViP_Complete_Cohort[,1]

masterfile_eoc <- filter(masterfile,Sample%in%c(ViP_Complete_Cohort_list))

Append patient path data and family history info

ViP_OvCa_Path_Data <- read.delim("ViP_OvCa_Path_Data.txt", header=TRUE, row.names=NULL, stringsAsFactors=FALSE) %>% 
  dplyr::rename("Sample"=Exome.ID)
NoFHxCa <- read.delim("NoFHxCa.tsv", header=TRUE, row.names=NULL, stringsAsFactors=FALSE) %>% 
  select(Exome.ID,FHx) %>% 
  dplyr::rename("Sample"=Exome.ID)
masterfile_eoc2 <- left_join(masterfile_eoc,ViP_OvCa_Path_Data,by="Sample",copy=FALSE) %>% 
  left_join(NoFHxCa,by="Sample",copy=FALSE) %>% 
  mutate_at(vars("FHx"),funs(replace(., is.na(.), "YES")))

Exclude other mutation +ve samples (MLH1/MSH2/MSH6/PMS2, TP53, RAD51C/D, BRIP1)

ViP_Discovery_Cohort <- read.delim("ViP_LoF_Discovery_Cohort.txt", stringsAsFactors=FALSE)
ViP_Discovery_Cohort_list <- ViP_Discovery_Cohort[,1]

masterfile_eoc3 <- filter(masterfile_eoc2,Sample%in%c(ViP_Discovery_Cohort_list))

Add Sample AD/DP column to calculate read freq for variants with missing PMCFREQ values

masterfile_eoc4 <- mutate(masterfile_eoc3, AD = as.numeric(sapply(strsplit(Sample.AD,","), function(x) tail(x,1))), DP = as.numeric(Sample.DP), Sample.FREQ = AD/DP)

Total_Sample_Alleles <- (n_distinct(masterfile_eoc4$Sample)*2)

Exclude low-quality variants, GnomAD RF/InbreedingCoeff-flagged variants

masterfile_eoc_lowFreq <- filter(masterfile_eoc4, (QUAL>=30)&
                                 (Identified!="FilteredInAll")&
                                 (Sample.PMCDP>=20)&
                                 (Sample.PMCFREQ<0.25)&
                                 (Sample.FREQ<0.25)) %>% 
  filter(Sample.FREQ>=0.01) %>% 
  filter((!str_detect(GnomAD_v2.1_FILTER_exome,"InbreedingCoeff")&
            !str_detect(GnomAD_v2.1_FILTER_exome,"RF"))|
           is.na(GnomAD_v2.1_FILTER_exome)) %>%
  filter((!str_detect(GnomAD_v2.1_FILTER_genome,"InbreedingCoeff")&
            !str_detect(GnomAD_v2.1_FILTER_genome,"RF"))|
           is.na(GnomAD_v2.1_FILTER_genome))

Remove HAP-only indel variants with PMCFREQ==0

masterfile_eoc_lowFreq_HAP_SNV <- filter(masterfile_eoc_lowFreq,Sample.PMCFREQ==0) %>% 
  filter(Variant_Type=="SNV")
masterfile_eoc_lowFreq2 <- filter(masterfile_eoc_lowFreq,Sample.PMCFREQ!=0) %>% 
  bind_rows(masterfile_eoc_lowFreq_HAP_SNV,.id=NULL)

Exclude MODERATE impact variants

masterfile_eoc_lowFreq_HIGH <- filter(masterfile_eoc_lowFreq2, IMPACT=="HIGH")

Separate Ensembl canonical and RefSeq canonical variants

masterfile_eoc_lowFreq_HIGH_ENSTcanonical <- filter(masterfile_eoc_lowFreq_HIGH,CANONICAL=="YES") %>% 
  write_excel_csv(path="masterfile_eoc_altAF0.25_HIGH_ENSTcanonical.csv",na=".",append=FALSE,col_names=TRUE)

Sample variant counts and AFs

variant_counts_AFs <- function(vcf){
variants_heterozygous <- vcf %>% 
  select(HGVSc,Sample.GT) %>% 
  group_by(HGVSc,Sample.GT) %>% 
  summarise(n()) %>% 
  filter((Sample.GT%in%c("'0/1","'1/0"))) %>% 
  dplyr::rename(alleles = "n()") %>% 
  group_by(HGVSc) %>% 
  summarise(sum(alleles))
variants_homozygous <- vcf %>% 
  select(HGVSc,Sample.GT) %>% 
  group_by(HGVSc,Sample.GT) %>% 
  summarise(n()) %>% 
  filter((Sample.GT!="'0/1")&(Sample.GT!="'1/0")) %>% 
  mutate(alleles = `n()`*2) %>% 
  select(-`n()`) %>% 
  group_by(HGVSc) %>% 
  summarise(sum(alleles))
variants <- full_join(variants_heterozygous,variants_homozygous,by="HGVSc",copy=FALSE,suffix=c(".x",".y")) %>%
  mutate_all(funs(replace(., is.na(.), 0))) %>% 
  mutate(Total_Allele_Count=`sum(alleles).x`+`sum(alleles).y`) %>% 
  mutate(Sample_AF=Total_Allele_Count/Total_Sample_Alleles)
vcf <- left_join(vcf, variants[,c(1,4:5)],by="HGVSc",copy=FALSE)
}

masterfile_eoc_lowFreq_HIGH_ENSTcanonical_2 <- variant_counts_AFs(masterfile_eoc_lowFreq_HIGH_ENSTcanonical)

Sample gene counts and frequencies

gene_counts_AF <- function(vcf){
genes_oneVarAllele <- vcf %>% select(SYMBOL,Sample.GT) %>% 
  select(SYMBOL,Sample.GT) %>% 
  group_by(SYMBOL,Sample.GT) %>% 
  summarise(n()) %>% 
  filter((Sample.GT%in%c("'0/1","'1/0"))) %>% 
  dplyr::rename(gene_Var = "n()") %>% 
  group_by(SYMBOL) %>% 
  summarise(sum(gene_Var))
genes_twoVarAllele <- vcf %>% select(SYMBOL,Sample.GT) %>% 
  select(SYMBOL,Sample.GT) %>% 
  group_by(SYMBOL,Sample.GT) %>% 
  summarise(n()) %>% 
  filter((Sample.GT!="'0/1")&(Sample.GT!="'1/0")) %>% 
  mutate(gene_Var = `n()`*2) %>% 
  select(-`n()`) %>% 
  group_by(SYMBOL) %>% 
  summarise(sum(gene_Var))
genes <- full_join(genes_oneVarAllele,genes_twoVarAllele,by="SYMBOL",copy=FALSE,suffix=c(".x",".y")) %>% 
  mutate_all(funs(replace(., is.na(.), 0))) %>% 
  mutate(Total_Gene_Count=`sum(gene_Var).x`+`sum(gene_Var).y`) %>% 
  mutate(Sample_Gene_Freq=Total_Gene_Count/Total_Sample_Alleles)
vcf <- left_join(vcf, genes[,c(1,4:5)],by="SYMBOL",copy=FALSE)
}

masterfile_eoc_lowFreq_HIGH_ENSTcanonical_3 <- gene_counts_AF(masterfile_eoc_lowFreq_HIGH_ENSTcanonical_2)

Add GnomAD gene-level data

GnomAD_stats_LOF_VEP <- read.delim("GnomAD_stats_LOF_VEP.tsv", header=TRUE, row.names=NULL, stringsAsFactors=FALSE)

masterfile_eoc_lowFreq_HIGH_ENSTcanonical_withGnomADstats <- left_join(masterfile_eoc_lowFreq_HIGH_ENSTcanonical_3, GnomAD_stats_LOF_VEP[,c(2,30:41)],by="Feature",copy=FALSE) 
masterfile_eoc_lowFreq_HIGH_ENSTcanonical_withGnomADstats2 <- mutate_at(masterfile_eoc_lowFreq_HIGH_ENSTcanonical_withGnomADstats,vars(starts_with("FILTER_")),funs(replace(., is.na(.), 0))) %>% 
  mutate_if(grepl("popmax$", names(.)),funs(ifelse(. == "NA", 0, as.numeric(.)))) %>% 
  mutate_at(vars(ends_with("popmax")),funs(replace(., is.na(.), 0)))

Calculate Sample/GnomAD gene freq ratios for total and NFE GnomAD figures, including and excluding variants with AF > 0.005

masterfile_eoc_lowFreq_HIGH_ENSTcanonical_withGnomADstats_ratios <- 
  mutate(masterfile_eoc_lowFreq_HIGH_ENSTcanonical_withGnomADstats2, "Sample_Gene_LOF_Freq_Ratio_GnomAD"=Sample_Gene_Freq/FILTER_RF_LOF_HIGHIMPACT_AF_1.0_ADJ,
"Sample_Gene_LOF_Freq_Ratio_GnomAD_0.005"=Sample_Gene_Freq/FILTER_RF_LOF_HIGHIMPACT_AF_0.005_ADJ,
"Ratio_Difference"=Sample_Gene_LOF_Freq_Ratio_GnomAD_0.005-Sample_Gene_LOF_Freq_Ratio_GnomAD, 
"Sample_Gene_LOF_Freq_Ratio_GnomAD_NFE"=Sample_Gene_Freq/FILTER_RF_LOF_HIGHIMPACT_AF_1.0_NFE_ADJ,
"Sample_Gene_LOF_Freq_Ratio_GnomAD_NFE_0.005"=Sample_Gene_Freq/FILTER_RF_LOF_HIGHIMPACT_AF_0.005_NFE_ADJ,
"Ratio_Difference_NFE"=Sample_Gene_LOF_Freq_Ratio_GnomAD_NFE_0.005-Sample_Gene_LOF_Freq_Ratio_GnomAD_NFE)

Calculate Sample/GnomAD allele freq ratios

masterfile_eoc_lowFreq_HIGH_ENSTcanonical_withGnomADstats_ratios2 <- mutate(masterfile_eoc_lowFreq_HIGH_ENSTcanonical_withGnomADstats_ratios, "Sample_Gene_LOF_AF_Ratio"=Sample_AF/GnomAD_v2.1_non_cancer_AF, 
"Sample_Gene_LOF_AF_Ratio_NFE"=Sample_AF/GnomAD_v2.1_non_cancer_AF_nfe)

Attach other GnomAD data for DNA repair genes analysis

masterfile_eoc_lowFreq_HIGH_ENSTcanonical_withGnomADstats_ratios3 <- mutate(masterfile_eoc_lowFreq_HIGH_ENSTcanonical_withGnomADstats_ratios2,Total_Case_Alleles=Total_Sample_Alleles) %>% 
  left_join(GnomAD_stats_LOF_VEP[,c(2,6,10,14:29)],by="Feature",copy=FALSE) 
masterfile_eoc_lowFreq_HIGH_ENSTcanonical_withGnomADstats_ratios4 <- mutate(masterfile_eoc_lowFreq_HIGH_ENSTcanonical_withGnomADstats_ratios3, MAX_AN=replace(masterfile_eoc_lowFreq_HIGH_ENSTcanonical_withGnomADstats_ratios3$MAX_AN, is.na(masterfile_eoc_lowFreq_HIGH_ENSTcanonical_withGnomADstats_ratios3$MAX_AN), max(masterfile_eoc_lowFreq_HIGH_ENSTcanonical_withGnomADstats_ratios3$MAX_AN, na.rm=TRUE))) %>% 
  mutate(MAX_AN_NFE=replace(.$MAX_AN_NFE, is.na(.$MAX_AN_NFE), max(masterfile_eoc_lowFreq_HIGH_ENSTcanonical_withGnomADstats_ratios3$MAX_AN_NFE, na.rm=TRUE)))

Post-filtering figures (ENST, excl. mutation +ve samples)

n_distinct(masterfile_eoc_lowFreq_HIGH_ENSTcanonical_withGnomADstats_ratios3$Sample)
[1] 150
n_distinct(masterfile_eoc_lowFreq_HIGH_ENSTcanonical_withGnomADstats_ratios3$Gene)
[1] 102
n_distinct(masterfile_eoc_lowFreq_HIGH_ENSTcanonical_withGnomADstats_ratios3$HGVSc)
[1] 117

Divide into groups stratified by age (<60/>=60) and presence/absence of FHx, and output files.

masterfile_eoc_lowFreq_HIGH_ENSTcanonical_withGnomADstats_ratios_ageUnder60 <- filter(masterfile_eoc_lowFreq_HIGH_ENSTcanonical_withGnomADstats_ratios3,Reported.Age.Dx<60) %>%
  write_excel_csv(path="masterfile_eoc_altAF0.25_HIGH_ENSTcanonical_ageUnder60.csv",na=".",append=FALSE,col_names=TRUE)
  
masterfile_eoc_lowFreq_HIGH_ENSTcanonical_withGnomADstats_ratios_ageOver60 <- filter(masterfile_eoc_lowFreq_HIGH_ENSTcanonical_withGnomADstats_ratios3,Reported.Age.Dx>=60) %>% 
  write_excel_csv(path="masterfile_eoc_altAF0.25_HIGH_ENSTcanonical_ageOver60.csv",na=".",append=FALSE,col_names=TRUE)
  
masterfile_eoc_lowFreq_HIGH_ENSTcanonical_withGnomADstats_ratios_FHx <- filter(masterfile_eoc_lowFreq_HIGH_ENSTcanonical_withGnomADstats_ratios3,FHx%in%c("YES")) %>% 
  write_excel_csv(path="masterfile_eoc_altAF0.25_HIGH_ENSTcanonical_FHx.csv",na=".",append=FALSE,col_names=TRUE)
  
masterfile_eoc_lowFreq_HIGH_ENSTcanonical_withGnomADstats_ratios_noFHx <- 
filter(masterfile_eoc_lowFreq_HIGH_ENSTcanonical_withGnomADstats_ratios3,!FHx%in%c("YES")) %>% 
  write_excel_csv(path="masterfile_eoc_altAF0.25_HIGH_ENSTcanonical_noFHx.csv",na=".",append=FALSE,col_names=TRUE)

Identify ViP samples with and without low-frequency variants post-filtering (incl age/FHx distribution), and perform simple linear regression comparison for age vs presence of low-frequency variants

ViP_LowFreqVar_Samples <- select(masterfile_eoc_lowFreq_HIGH_ENSTcanonical_withGnomADstats_ratios3,Sample) %>% distinct(.keep_all=FALSE)
ViP_noLowFreqVar_Samples <- filter(ViP_Discovery_Cohort,!Exome.ID%in%ViP_LowFreqVar_Samples$Sample)
ViP_noLowFreqVar_Path <- filter(ViP_OvCa_Path_Data,Sample%in%ViP_noLowFreqVar_Samples$Exome.ID)
ViP_LowFreqVar_Path <- filter(ViP_OvCa_Path_Data,Sample%in%ViP_LowFreqVar_Samples$Sample)
ViP_FHx <- select(masterfile_eoc4,Sample,FHx) %>% 
  distinct(.keep_all=FALSE)
FHx <- filter(ViP_FHx,FHx=="YES")
NoFHx <- filter(ViP_FHx,FHx=="NO")

FHx_LowFreqVar <- filter(FHx,Sample%in%ViP_LowFreqVar_Samples$Sample)
NoFHx_LowFreqVar <- filter(NoFHx,Sample%in%ViP_LowFreqVar_Samples$Sample)
FHx_noLowFreqVar <- filter(FHx,Sample%in%ViP_noLowFreqVar_Samples$Exome.ID)
NoFHxCa_noLowFreqVar <- filter(NoFHx,Sample%in%ViP_noLowFreqVar_Samples$Exome.ID) 

ViP_noLowFreqVar_Path %>% mutate(age_group = ifelse(Reported.Age.Dx <30,"<30",
                                               ifelse(Reported.Age.Dx < 40,"30-39",
                                                      ifelse(Reported.Age.Dx < 50,"40-49",
                                                             ifelse(Reported.Age.Dx < 60,"50-59",
                                                                    ifelse(Reported.Age.Dx < 70,"60-69",
                                                                           ifelse(Reported.Age.Dx < 80,"70-79",">80")))))))%>%
  group_by(age_group)%>%
  count()
ViP_LowFreqVar_Path %>% mutate(age_group = ifelse(Reported.Age.Dx <30,"<30",
                                               ifelse(Reported.Age.Dx < 40,"30-39",
                                                      ifelse(Reported.Age.Dx < 50,"40-49",
                                                             ifelse(Reported.Age.Dx < 60,"50-59",
                                                                    ifelse(Reported.Age.Dx < 70,"60-69",
                                                                           ifelse(Reported.Age.Dx < 80,"70-79",">80")))))))%>%
  group_by(age_group)%>%
  count()
Low_vs_noLowFreqVar <- bind_rows(ViP_LowFreqVar_Path,ViP_noLowFreqVar_Path,.id = "LowFreq_status")
Low_vs_noLowFreqVar$LowFreq_status <- as.factor(Low_vs_noLowFreqVar$LowFreq_status)
Age_lm <- lm(Reported.Age.Dx~LowFreq_status,Low_vs_noLowFreqVar)
summary(Age_lm)

Call:
lm(formula = Reported.Age.Dx ~ LowFreq_status, data = Low_vs_noLowFreqVar)

Residuals:
    Min      1Q  Median      3Q     Max 
-36.640  -7.386   0.614   7.614  28.614 

Coefficients:
                Estimate Std. Error t value Pr(>|t|)    
(Intercept)      61.6400     0.9157  67.312   <2e-16 ***
LowFreq_status2  -2.2539     1.0899  -2.068   0.0392 *  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 11.22 on 508 degrees of freedom
Multiple R-squared:  0.008347,  Adjusted R-squared:  0.006395 
F-statistic: 4.276 on 1 and 508 DF,  p-value: 0.03916

Extract DNA repair gene variants from sample data by repair pathway, and combine into one table

DNArepair_other_list <- tibble("SYMBOL"=c("XRCC1","PARP1","PARP2","APEX1","FEN1","OGG1","UNG","SMUG1","MBD4","TDG","MUTYH","NTHL1","MPG","NEIL1","NEIL2","NEIL3","APEX2","PNKP","APLF","PARP3","MSH3","MSH4","MSH5","MLH3","PMS1","PMS2P3","EXO1","RFC1","RFC2","RFC3","RFC4","RFC5","PCNA","ERCC4","ERCC2","ERCC5","XPC","ERCC6","GTF2H2","ERCC3","XPA","RAD23B","POLE","POLD1","RAD23A","LIG3","CETN2","DDB1","DDB2","GTF2H1","GTF2H3","GTF2H4","GTF2H5","CDK7","CCNH","MNAT1","LIG1","ERCC8","UVSSA","XAB2","MMS19","ATM","RPA1","RPA2","RPA3","RPA4","RAD51","NBN","RAD50","CHEK2","MRE11A","RAD52","RAD51B","DMC1","XRCC2","XRCC3","RAD54L","RAD54B","SHFM1","RBBP8","SLX1A","SLX1B","GEN1","ATR","ERCC1","FANCA","FANCB","FANCC","FANCD2","FANCE","FANCF","FANCG","FANCI","FANCL","FANCM","PALB2","CHEK1","SLX4","FAN1","C1orf86","C19orf40","MUS81","EME1","XRCC6","XRCC5","PRKDC","LIG4","XRCC4","DCLRE1C","NHEJ1","MGMT","ALKBH2","ALKBH3","PAXIP1","BLM","KMT2C","CRIP1","CDK12","BAP1","BARD1","WRN","BUB1","CENPE","ZW10","TTK","KNTC1","AURKB","POLB","POLH","POLQ","TDP1","TDP2","NUDT1","DUT","RRM2B","POLG","REV3L","MAD2L2","REV1","POLI","POLK","POLL","POLM","POLN","TREX1","TREX2","APTX","SPO11","ENDOV","UBE2A","UBE2B","RAD18","SHPRH","HLTF","RNF168","SPRTN","RNF8","RNF4","UBE2V2","UBE2N","H2AFX","CHAF1A","SETMAR","RECQL4","MPLKIP","DCLRE1A","DCLRE1B","PRPF19","RECQL","RECQL5","HELQ","RDM1","NABP2","ATRIP","MDC1","RAD1","RAD9A","HUS1","RAD17","TP53","TP53BP1","TOPBP1","CLK2","PER1","MLH1","MSH2","MSH6","PMS2","TP53","RAD51C","RAD51D","BRIP1","BRCA1","BRCA2"))
                                                                                                           masterfile_eoc_lowFreq_HIGH_ENSTcanonical_withGnomADstats_ratios_DNArepair_other <-                                  filter(masterfile_eoc_lowFreq_HIGH_ENSTcanonical_withGnomADstats_ratios3,SYMBOL%in%DNArepair_other_list$SYMBOL) %>% write_excel_csv(path="masterfile_eoc_altAF0.25_HIGH_ENSTcanonical_DNArepair_other.csv",na=".",append=FALSE,col_names=TRUE)

DNA repair gene filtering figures

n_distinct(masterfile_eoc_lowFreq_HIGH_ENSTcanonical_withGnomADstats_ratios_DNArepair_other$Sample)
[1] 5
n_distinct(masterfile_eoc_lowFreq_HIGH_ENSTcanonical_withGnomADstats_ratios_DNArepair_other$Gene)
[1] 4
n_distinct(masterfile_eoc_lowFreq_HIGH_ENSTcanonical_withGnomADstats_ratios_DNArepair_other$HGVSc)
[1] 5
---
title: "Thesis Low-Frequency DNA Repair Genes Somatic LoF Variants Filtering Script"
output: html_notebook
---

ALL IMPORTED FILES MUST BE IN THE SAME DIRECTORY AS THIS SCRIPT

Load tidyverse package
```{r}
setwd("~/OneDrive - The University of Melbourne/R data")
library("tidyverse")
```

Import raw vcf data from "masterfile" and exclude variants with GnomAD non-cancer AF > 0
```{r}
masterfile <- read.delim("masterfile.tsv", header=TRUE, row.names=NULL, na.strings = ".", stringsAsFactors=FALSE) %>% 
  dplyr::rename("Sample"="X.Sample") %>% 
  filter(GnomAD_v2.1_non_cancer_AF==0)
```

Exclude non-epithelial, uterine-only and BRCA +ve samples
```{r}
ViP_Complete_Cohort <- read.delim("ViP_LoF_Complete_Cohort.txt", stringsAsFactors=FALSE)
ViP_Complete_Cohort_list <- ViP_Complete_Cohort[,1]

masterfile_eoc <- filter(masterfile,Sample%in%c(ViP_Complete_Cohort_list))
```

Append patient path data and family history info
```{r}
ViP_OvCa_Path_Data <- read.delim("ViP_OvCa_Path_Data.txt", header=TRUE, row.names=NULL, stringsAsFactors=FALSE) %>% 
  dplyr::rename("Sample"=Exome.ID)
NoFHxCa <- read.delim("NoFHxCa.tsv", header=TRUE, row.names=NULL, stringsAsFactors=FALSE) %>% 
  select(Exome.ID,FHx) %>% 
  dplyr::rename("Sample"=Exome.ID)
masterfile_eoc2 <- left_join(masterfile_eoc,ViP_OvCa_Path_Data,by="Sample",copy=FALSE) %>% 
  left_join(NoFHxCa,by="Sample",copy=FALSE) %>% 
  mutate_at(vars("FHx"),funs(replace(., is.na(.), "YES")))
```

Exclude other mutation +ve samples (MLH1/MSH2/MSH6/PMS2, TP53, RAD51C/D, BRIP1)
```{r}
ViP_Discovery_Cohort <- read.delim("ViP_LoF_Discovery_Cohort.txt", stringsAsFactors=FALSE)
ViP_Discovery_Cohort_list <- ViP_Discovery_Cohort[,1]

masterfile_eoc3 <- filter(masterfile_eoc2,Sample%in%c(ViP_Discovery_Cohort_list))
```

Add Sample AD/DP column to calculate read freq for variants with missing PMCFREQ values
```{r}
masterfile_eoc4 <- mutate(masterfile_eoc3, AD = as.numeric(sapply(strsplit(Sample.AD,","), function(x) tail(x,1))), DP = as.numeric(Sample.DP), Sample.FREQ = AD/DP)

Total_Sample_Alleles <- (n_distinct(masterfile_eoc4$Sample)*2)
```

Exclude low-quality variants, GnomAD RF/InbreedingCoeff-flagged variants
```{r}
masterfile_eoc_lowFreq <- filter(masterfile_eoc4, (QUAL>=30)&
                                 (Identified!="FilteredInAll")&
                                 (Sample.PMCDP>=20)&
                                 (Sample.PMCFREQ<0.25)&
                                 (Sample.FREQ<0.25)) %>% 
  filter(Sample.FREQ>=0.01) %>% 
  filter((!str_detect(GnomAD_v2.1_FILTER_exome,"InbreedingCoeff")&
            !str_detect(GnomAD_v2.1_FILTER_exome,"RF"))|
           is.na(GnomAD_v2.1_FILTER_exome)) %>%
  filter((!str_detect(GnomAD_v2.1_FILTER_genome,"InbreedingCoeff")&
            !str_detect(GnomAD_v2.1_FILTER_genome,"RF"))|
           is.na(GnomAD_v2.1_FILTER_genome))
```

Remove HAP-only indel variants with PMCFREQ==0
```{r}
masterfile_eoc_lowFreq_HAP_SNV <- filter(masterfile_eoc_lowFreq,Sample.PMCFREQ==0) %>% 
  filter(Variant_Type=="SNV")
masterfile_eoc_lowFreq2 <- filter(masterfile_eoc_lowFreq,Sample.PMCFREQ!=0) %>% 
  bind_rows(masterfile_eoc_lowFreq_HAP_SNV,.id=NULL)
```

Exclude MODERATE impact variants
```{r}
masterfile_eoc_lowFreq_HIGH <- filter(masterfile_eoc_lowFreq2, IMPACT=="HIGH")
```

Separate Ensembl canonical and RefSeq canonical variants
```{r}
masterfile_eoc_lowFreq_HIGH_ENSTcanonical <- filter(masterfile_eoc_lowFreq_HIGH,CANONICAL=="YES") %>% 
  write_excel_csv(path="masterfile_eoc_altAF0.25_HIGH_ENSTcanonical.csv",na=".",append=FALSE,col_names=TRUE)
```

Sample variant counts and AFs
```{r}
variant_counts_AFs <- function(vcf){
variants_heterozygous <- vcf %>% 
  select(HGVSc,Sample.GT) %>% 
  group_by(HGVSc,Sample.GT) %>% 
  summarise(n()) %>% 
  filter((Sample.GT%in%c("'0/1","'1/0"))) %>% 
  dplyr::rename(alleles = "n()") %>% 
  group_by(HGVSc) %>% 
  summarise(sum(alleles))
variants_homozygous <- vcf %>% 
  select(HGVSc,Sample.GT) %>% 
  group_by(HGVSc,Sample.GT) %>% 
  summarise(n()) %>% 
  filter((Sample.GT!="'0/1")&(Sample.GT!="'1/0")) %>% 
  mutate(alleles = `n()`*2) %>% 
  select(-`n()`) %>% 
  group_by(HGVSc) %>% 
  summarise(sum(alleles))
variants <- full_join(variants_heterozygous,variants_homozygous,by="HGVSc",copy=FALSE,suffix=c(".x",".y")) %>%
  mutate_all(funs(replace(., is.na(.), 0))) %>% 
  mutate(Total_Allele_Count=`sum(alleles).x`+`sum(alleles).y`) %>% 
  mutate(Sample_AF=Total_Allele_Count/Total_Sample_Alleles)
vcf <- left_join(vcf, variants[,c(1,4:5)],by="HGVSc",copy=FALSE)
}

masterfile_eoc_lowFreq_HIGH_ENSTcanonical_2 <- variant_counts_AFs(masterfile_eoc_lowFreq_HIGH_ENSTcanonical)
```

Sample gene counts and frequencies
```{r}
gene_counts_AF <- function(vcf){
genes_oneVarAllele <- vcf %>% select(SYMBOL,Sample.GT) %>% 
  select(SYMBOL,Sample.GT) %>% 
  group_by(SYMBOL,Sample.GT) %>% 
  summarise(n()) %>% 
  filter((Sample.GT%in%c("'0/1","'1/0"))) %>% 
  dplyr::rename(gene_Var = "n()") %>% 
  group_by(SYMBOL) %>% 
  summarise(sum(gene_Var))
genes_twoVarAllele <- vcf %>% select(SYMBOL,Sample.GT) %>% 
  select(SYMBOL,Sample.GT) %>% 
  group_by(SYMBOL,Sample.GT) %>% 
  summarise(n()) %>% 
  filter((Sample.GT!="'0/1")&(Sample.GT!="'1/0")) %>% 
  mutate(gene_Var = `n()`*2) %>% 
  select(-`n()`) %>% 
  group_by(SYMBOL) %>% 
  summarise(sum(gene_Var))
genes <- full_join(genes_oneVarAllele,genes_twoVarAllele,by="SYMBOL",copy=FALSE,suffix=c(".x",".y")) %>% 
  mutate_all(funs(replace(., is.na(.), 0))) %>% 
  mutate(Total_Gene_Count=`sum(gene_Var).x`+`sum(gene_Var).y`) %>% 
  mutate(Sample_Gene_Freq=Total_Gene_Count/Total_Sample_Alleles)
vcf <- left_join(vcf, genes[,c(1,4:5)],by="SYMBOL",copy=FALSE)
}

masterfile_eoc_lowFreq_HIGH_ENSTcanonical_3 <- gene_counts_AF(masterfile_eoc_lowFreq_HIGH_ENSTcanonical_2)
```

Add GnomAD gene-level data
```{r}
GnomAD_stats_LOF_VEP <- read.delim("GnomAD_stats_LOF_VEP.tsv", header=TRUE, row.names=NULL, stringsAsFactors=FALSE)

masterfile_eoc_lowFreq_HIGH_ENSTcanonical_withGnomADstats <- left_join(masterfile_eoc_lowFreq_HIGH_ENSTcanonical_3, GnomAD_stats_LOF_VEP[,c(2,30:41)],by="Feature",copy=FALSE) 
masterfile_eoc_lowFreq_HIGH_ENSTcanonical_withGnomADstats2 <- mutate_at(masterfile_eoc_lowFreq_HIGH_ENSTcanonical_withGnomADstats,vars(starts_with("FILTER_")),funs(replace(., is.na(.), 0))) %>% 
  mutate_if(grepl("popmax$", names(.)),funs(ifelse(. == "NA", 0, as.numeric(.)))) %>% 
  mutate_at(vars(ends_with("popmax")),funs(replace(., is.na(.), 0)))
```

Calculate Sample/GnomAD gene freq ratios for total and NFE GnomAD figures, including and excluding variants with AF > 0.005
```{r}
masterfile_eoc_lowFreq_HIGH_ENSTcanonical_withGnomADstats_ratios <- 
  mutate(masterfile_eoc_lowFreq_HIGH_ENSTcanonical_withGnomADstats2, "Sample_Gene_LOF_Freq_Ratio_GnomAD"=Sample_Gene_Freq/FILTER_RF_LOF_HIGHIMPACT_AF_1.0_ADJ,
"Sample_Gene_LOF_Freq_Ratio_GnomAD_0.005"=Sample_Gene_Freq/FILTER_RF_LOF_HIGHIMPACT_AF_0.005_ADJ,
"Ratio_Difference"=Sample_Gene_LOF_Freq_Ratio_GnomAD_0.005-Sample_Gene_LOF_Freq_Ratio_GnomAD, 
"Sample_Gene_LOF_Freq_Ratio_GnomAD_NFE"=Sample_Gene_Freq/FILTER_RF_LOF_HIGHIMPACT_AF_1.0_NFE_ADJ,
"Sample_Gene_LOF_Freq_Ratio_GnomAD_NFE_0.005"=Sample_Gene_Freq/FILTER_RF_LOF_HIGHIMPACT_AF_0.005_NFE_ADJ,
"Ratio_Difference_NFE"=Sample_Gene_LOF_Freq_Ratio_GnomAD_NFE_0.005-Sample_Gene_LOF_Freq_Ratio_GnomAD_NFE)
```

Calculate Sample/GnomAD allele freq ratios 
```{r}
masterfile_eoc_lowFreq_HIGH_ENSTcanonical_withGnomADstats_ratios2 <- mutate(masterfile_eoc_lowFreq_HIGH_ENSTcanonical_withGnomADstats_ratios, "Sample_Gene_LOF_AF_Ratio"=Sample_AF/GnomAD_v2.1_non_cancer_AF, 
"Sample_Gene_LOF_AF_Ratio_NFE"=Sample_AF/GnomAD_v2.1_non_cancer_AF_nfe)
```

Attach other GnomAD data for DNA repair genes analysis
```{r}
masterfile_eoc_lowFreq_HIGH_ENSTcanonical_withGnomADstats_ratios3 <- mutate(masterfile_eoc_lowFreq_HIGH_ENSTcanonical_withGnomADstats_ratios2,Total_Case_Alleles=Total_Sample_Alleles) %>% 
  left_join(GnomAD_stats_LOF_VEP[,c(2,6,10,14:29)],by="Feature",copy=FALSE) 
masterfile_eoc_lowFreq_HIGH_ENSTcanonical_withGnomADstats_ratios4 <- mutate(masterfile_eoc_lowFreq_HIGH_ENSTcanonical_withGnomADstats_ratios3, MAX_AN=replace(masterfile_eoc_lowFreq_HIGH_ENSTcanonical_withGnomADstats_ratios3$MAX_AN, is.na(masterfile_eoc_lowFreq_HIGH_ENSTcanonical_withGnomADstats_ratios3$MAX_AN), max(masterfile_eoc_lowFreq_HIGH_ENSTcanonical_withGnomADstats_ratios3$MAX_AN, na.rm=TRUE))) %>% 
  mutate(MAX_AN_NFE=replace(.$MAX_AN_NFE, is.na(.$MAX_AN_NFE), max(masterfile_eoc_lowFreq_HIGH_ENSTcanonical_withGnomADstats_ratios3$MAX_AN_NFE, na.rm=TRUE)))
```

Post-filtering figures (ENST, excl. mutation +ve samples)
```{r}
n_distinct(masterfile_eoc_lowFreq_HIGH_ENSTcanonical_withGnomADstats_ratios3$Sample)
n_distinct(masterfile_eoc_lowFreq_HIGH_ENSTcanonical_withGnomADstats_ratios3$Gene)
n_distinct(masterfile_eoc_lowFreq_HIGH_ENSTcanonical_withGnomADstats_ratios3$HGVSc)
```

Divide into groups stratified by age (<60/>=60) and presence/absence of FHx, and output files.
```{r}
masterfile_eoc_lowFreq_HIGH_ENSTcanonical_withGnomADstats_ratios_ageUnder60 <- filter(masterfile_eoc_lowFreq_HIGH_ENSTcanonical_withGnomADstats_ratios3,Reported.Age.Dx<60) %>%
  write_excel_csv(path="masterfile_eoc_altAF0.25_HIGH_ENSTcanonical_ageUnder60.csv",na=".",append=FALSE,col_names=TRUE)
  
masterfile_eoc_lowFreq_HIGH_ENSTcanonical_withGnomADstats_ratios_ageOver60 <- filter(masterfile_eoc_lowFreq_HIGH_ENSTcanonical_withGnomADstats_ratios3,Reported.Age.Dx>=60) %>% 
  write_excel_csv(path="masterfile_eoc_altAF0.25_HIGH_ENSTcanonical_ageOver60.csv",na=".",append=FALSE,col_names=TRUE)
  
masterfile_eoc_lowFreq_HIGH_ENSTcanonical_withGnomADstats_ratios_FHx <- filter(masterfile_eoc_lowFreq_HIGH_ENSTcanonical_withGnomADstats_ratios3,FHx%in%c("YES")) %>% 
  write_excel_csv(path="masterfile_eoc_altAF0.25_HIGH_ENSTcanonical_FHx.csv",na=".",append=FALSE,col_names=TRUE)
  
masterfile_eoc_lowFreq_HIGH_ENSTcanonical_withGnomADstats_ratios_noFHx <- 
filter(masterfile_eoc_lowFreq_HIGH_ENSTcanonical_withGnomADstats_ratios3,!FHx%in%c("YES")) %>% 
  write_excel_csv(path="masterfile_eoc_altAF0.25_HIGH_ENSTcanonical_noFHx.csv",na=".",append=FALSE,col_names=TRUE)
```

Identify ViP samples with and without low-frequency variants post-filtering (incl age/FHx distribution), and perform simple linear regression comparison for age vs presence of low-frequency variants
```{r}
ViP_LowFreqVar_Samples <- select(masterfile_eoc_lowFreq_HIGH_ENSTcanonical_withGnomADstats_ratios3,Sample) %>% distinct(.keep_all=FALSE)
ViP_noLowFreqVar_Samples <- filter(ViP_Discovery_Cohort,!Exome.ID%in%ViP_LowFreqVar_Samples$Sample)
ViP_noLowFreqVar_Path <- filter(ViP_OvCa_Path_Data,Sample%in%ViP_noLowFreqVar_Samples$Exome.ID)
ViP_LowFreqVar_Path <- filter(ViP_OvCa_Path_Data,Sample%in%ViP_LowFreqVar_Samples$Sample)
ViP_FHx <- select(masterfile_eoc4,Sample,FHx) %>% 
  distinct(.keep_all=FALSE)
FHx <- filter(ViP_FHx,FHx=="YES")
NoFHx <- filter(ViP_FHx,FHx=="NO")

FHx_LowFreqVar <- filter(FHx,Sample%in%ViP_LowFreqVar_Samples$Sample)
NoFHx_LowFreqVar <- filter(NoFHx,Sample%in%ViP_LowFreqVar_Samples$Sample)
FHx_noLowFreqVar <- filter(FHx,Sample%in%ViP_noLowFreqVar_Samples$Exome.ID)
NoFHxCa_noLowFreqVar <- filter(NoFHx,Sample%in%ViP_noLowFreqVar_Samples$Exome.ID) 

ViP_noLowFreqVar_Path %>% mutate(age_group = ifelse(Reported.Age.Dx <30,"<30",
                                               ifelse(Reported.Age.Dx < 40,"30-39",
                                                      ifelse(Reported.Age.Dx < 50,"40-49",
                                                             ifelse(Reported.Age.Dx < 60,"50-59",
                                                                    ifelse(Reported.Age.Dx < 70,"60-69",
                                                                           ifelse(Reported.Age.Dx < 80,"70-79",">80")))))))%>%
  group_by(age_group)%>%
  count()
ViP_LowFreqVar_Path %>% mutate(age_group = ifelse(Reported.Age.Dx <30,"<30",
                                               ifelse(Reported.Age.Dx < 40,"30-39",
                                                      ifelse(Reported.Age.Dx < 50,"40-49",
                                                             ifelse(Reported.Age.Dx < 60,"50-59",
                                                                    ifelse(Reported.Age.Dx < 70,"60-69",
                                                                           ifelse(Reported.Age.Dx < 80,"70-79",">80")))))))%>%
  group_by(age_group)%>%
  count()
Low_vs_noLowFreqVar <- bind_rows(ViP_LowFreqVar_Path,ViP_noLowFreqVar_Path,.id = "LowFreq_status")
Low_vs_noLowFreqVar$LowFreq_status <- as.factor(Low_vs_noLowFreqVar$LowFreq_status)
Age_lm <- lm(Reported.Age.Dx~LowFreq_status,Low_vs_noLowFreqVar)
summary(Age_lm)
```

Extract DNA repair gene variants from sample data by repair pathway, and combine into one table
```{r}
DNArepair_other_list <- tibble("SYMBOL"=c("XRCC1","PARP1","PARP2","APEX1","FEN1","OGG1","UNG","SMUG1","MBD4","TDG","MUTYH","NTHL1","MPG","NEIL1","NEIL2","NEIL3","APEX2","PNKP","APLF","PARP3","MSH3","MSH4","MSH5","MLH3","PMS1","PMS2P3","EXO1","RFC1","RFC2","RFC3","RFC4","RFC5","PCNA","ERCC4","ERCC2","ERCC5","XPC","ERCC6","GTF2H2","ERCC3","XPA","RAD23B","POLE","POLD1","RAD23A","LIG3","CETN2","DDB1","DDB2","GTF2H1","GTF2H3","GTF2H4","GTF2H5","CDK7","CCNH","MNAT1","LIG1","ERCC8","UVSSA","XAB2","MMS19","ATM","RPA1","RPA2","RPA3","RPA4","RAD51","NBN","RAD50","CHEK2","MRE11A","RAD52","RAD51B","DMC1","XRCC2","XRCC3","RAD54L","RAD54B","SHFM1","RBBP8","SLX1A","SLX1B","GEN1","ATR","ERCC1","FANCA","FANCB","FANCC","FANCD2","FANCE","FANCF","FANCG","FANCI","FANCL","FANCM","PALB2","CHEK1","SLX4","FAN1","C1orf86","C19orf40","MUS81","EME1","XRCC6","XRCC5","PRKDC","LIG4","XRCC4","DCLRE1C","NHEJ1","MGMT","ALKBH2","ALKBH3","PAXIP1","BLM","KMT2C","CRIP1","CDK12","BAP1","BARD1","WRN","BUB1","CENPE","ZW10","TTK","KNTC1","AURKB","POLB","POLH","POLQ","TDP1","TDP2","NUDT1","DUT","RRM2B","POLG","REV3L","MAD2L2","REV1","POLI","POLK","POLL","POLM","POLN","TREX1","TREX2","APTX","SPO11","ENDOV","UBE2A","UBE2B","RAD18","SHPRH","HLTF","RNF168","SPRTN","RNF8","RNF4","UBE2V2","UBE2N","H2AFX","CHAF1A","SETMAR","RECQL4","MPLKIP","DCLRE1A","DCLRE1B","PRPF19","RECQL","RECQL5","HELQ","RDM1","NABP2","ATRIP","MDC1","RAD1","RAD9A","HUS1","RAD17","TP53","TP53BP1","TOPBP1","CLK2","PER1","MLH1","MSH2","MSH6","PMS2","TP53","RAD51C","RAD51D","BRIP1","BRCA1","BRCA2"))
                                                                                                           masterfile_eoc_lowFreq_HIGH_ENSTcanonical_withGnomADstats_ratios_DNArepair_other <-                                  filter(masterfile_eoc_lowFreq_HIGH_ENSTcanonical_withGnomADstats_ratios3,SYMBOL%in%DNArepair_other_list$SYMBOL) %>% write_excel_csv(path="masterfile_eoc_altAF0.25_HIGH_ENSTcanonical_DNArepair_other.csv",na=".",append=FALSE,col_names=TRUE)
```

DNA repair gene filtering figures
```{r}
n_distinct(masterfile_eoc_lowFreq_HIGH_ENSTcanonical_withGnomADstats_ratios_DNArepair_other$Sample)
n_distinct(masterfile_eoc_lowFreq_HIGH_ENSTcanonical_withGnomADstats_ratios_DNArepair_other$Gene)
n_distinct(masterfile_eoc_lowFreq_HIGH_ENSTcanonical_withGnomADstats_ratios_DNArepair_other$HGVSc)
```
