Here, I am explaining the processing step of variants for GWAS. Please, ignore the code parts unless interested in details.

Variant loci are generated from the multiple sequence alignments of all EBV genomes we sequenced. Positions and reference allele are in reference to Type 1 reference genome NC_007605.

After calling the variants, we filter out the ones that come from repeatitive regions. We keep all the variants including the synonymous ones as the reviewers suggested. This leaves us 6,495 variant loci to start with.

cd ~/Documents/EBV/ourbatch2/Secondround/ && mkdir SNP-detect_02.27.2020 %% cd SNP-detect_02.27.2020
toolDir='~/Dropbox/codes/ViralGenomeAssembly'

#This will create a vcf file out of the msa using the very first genome (NC_007605) as the reference:
bash $toolDir/scripts/snp_sites_vcf_modifier.sh my_ICed_my_edited_sequence.aln.IDreplaced.OnlyIndividuals-MinusCellLines_sub.with-Ref1.fasta

#Filter the variants in the repeat regions:
bedtools intersect -header -v -wa -a my_ICed_my_edited_sequence.aln.IDreplaced.OnlyIndividuals-MinusCellLines_sub.with-Ref1_modified.vcf -b $toolDir/resources/Annotation/Type1/NC_007605_repeatMask.bed |bedtools intersect -header -v -wa -a - -b $toolDir/resources/Annotation/Type1/NC_007605_miropeat_default_run_repeats.bed|bedtools intersect -header -v -wa -a - -b $toolDir/resources/Annotation/Type1/filter.regions.bed > my_ICed_my_edited_sequence.aln.IDreplaced.OnlyIndividuals-MinusCellLines_sub.with-Ref1_modified_filtered.vcf

#Remove the reference genome entry from the vcf data: 
cut -f1-9,11- my_ICed_my_edited_sequence.aln.IDreplaced.OnlyIndividuals-MinusCellLines_sub.with-Ref1_modified_filtered.vcf > my_ICed_my_edited_sequence.aln.IDreplaced.OnlyIndividuals-MinusCellLines_sub_modified_filtered.vcf

Do PCA and check the SNP variation:

Here, I do PCA on variants to use the PC3 as a cofactor in the association test below. PC3 is the principal component that separated the type 2 genomes into Group A and Group B in Figure 2B-lower panel. Also, reviewers wants us to account for the population structure.

summary(pops)
               Method          EBV_Type           Condition             Label   
 Direct Sequencing:33   Inter-typic: 3   eBL           :58   eBL_Cell_Line : 3  
 GenomiPhi-WGA    :13   Type1      :60   HealthyControl:40   eBL_Plasma    :14  
 Preamp-sWGA      :52   Type2      :35                       eBL_Tumor     :41  
                                                             HealthyControl:40  

Generate a genetic relatedness matrix (GRM) for EBV Genomes:

I used GEMMA for this:

#plink gets confused when IDs contain underscore, '_'
sed 's/_/-/g' my_ICed_my_edited_sequence.aln.IDreplaced.OnlyIndividuals-MinusCellLines_sub_modified_filtered.vcf > my_ICed_my_edited_sequence.aln.IDreplaced.OnlyIndividuals-MinusCellLines_sub_modified_filtered._.fixed.vcf

#First create file formats that GEMMA can take:
#plink --vcf my_ICed_my_edited_sequence.aln.IDreplaced.OnlyIndividuals-MinusCellLines_sub_modified_filtered._.fixed.vcf --allow-extra-chr --out my.plink.ped.data --make-bed
plink --vcf my_ICed_my_edited_sequence.aln.IDreplaced.OnlyIndividuals-MinusCellLines_sub_modified_filtered.vcf --allow-extra-chr --out my.plink.ped.data --make-bed
#This creates files my.plink.ped.data.bed my.plink.ped.data.bim my.plink.ped.data.fam

#Create a custom fam file with pheno info:
grep -v "##" pop.phe.data.withPCs.txt|cut -f1,2,6|awk '{print $1"\t"$1"\t"0"\t"0"\t"$3"\t"$2}'|awk '{if($5 == M) print $0"\t"1; else print $0"\t"2}'|awk '{if($6 == 2) print $0"\t"0; else print $0"\t"1}'|cut -f1-4,7-|grep -v "#"|tr '\t' ' ' > my.plink.ped.data.fam

gemma -bfile my.plink.ped.data -gk 1 -o GRM.ebv
gemma -bfile my.plink.ped.data -gk 2 -o GRM.ebv
#GEMMA 0.98.1 (2018-12-10) by Xiang Zhou and team (C) 2012-2018
#Reading Files ...
## number of total individuals = 98
## number of analyzed individuals = 98
## number of covariates = 1
## number of phenotypes = 1
## number of total SNPs/var        =     6764
## number of analyzed SNPs         =      511
#Calculating Relatedness Matrix ...
#================================================== 100%
#**** INFO: Done.
#This process created a folder 'output' with file 'GRM.ebv.cXX.txt' in it.

#Copy files for glmm.score:
cp my.plink.ped.data.* /usr/local/lib/R/3.5/site-library/GMMAT/extdata/

Genome-wide association with GMMAT

This part is the actual GWAS using generalized mixed model with GRM of EBV as the random effect and Age, Gender, EBV type, and PC3 as the fixed effects.

In the test, I mask (not test) the variant loci with less than 5% minor allele frequency and more than 50% missing data. This gives us 2,807 variant loci to be tested.

library(GMMAT)
sampleDemo$id <- rownames(sampleDemo)
GRM.ebv <- as.matrix(read.table("~/Documents/EBV/ourbatch2/Secondround/SNP-detect_02.27.2020/output/GRM.ebv.cXX.txt", check.names = FALSE))
rownames(GRM.ebv) <- sampleDemo$id
colnames(GRM.ebv) <- sampleDemo$id
#pheatmap::pheatmap(GRM.ebv, cluster_rows = F, cluster_cols = F, annotation_col = sampleDemo[,c("Type","Gender","Age","Method")])
geno.file <- strsplit(system.file("extdata", "my.plink.ped.data.bed", package = "GMMAT"), ".bed", fixed = TRUE)[[1]]
model0 <- glmmkin(Phe ~ Type + Age + Gender + PC3, data = sampleDemo, kins = GRM.ebv, id = "id", family = binomial(link = "logit"))
glmm.score(model0, 
           missing.method = "omit", 
           infile = geno.file,
           MAF.range = c(0.05, 0.5),
           miss.cutoff = 0.5,
           outfile = "glmm.score.gds.testoutfile.txt")
score.results <- read.delim("glmm.score.gds.testoutfile.txt", header=T)

Determine the genome-wide significance threshold:

Here, I determine the genome-wide significance threshold via phenotype (BL/healthy control) label permutations (1,000 times). This resulted in a suggestive significance threshold of 0.000999952 ~= 10^-3.


P.pool <- data.frame(row.names = rownames(score.results))
for( i in 1:1000){
  print(i)
#Permuting the phenotype of interest
sampleDemo.rand <- sampleDemo
sampleDemo.rand$Phe <- sample(sampleDemo$Phe, replace = T)
model.rand <- glmmkin(Phe ~ Type + Age + Gender + PC3, data = sampleDemo.rand, kins = GRM.ebv, id = "id", family = binomial(link = "logit"))

glmm.score(model.rand, 
           missing.method = "omit", 
           infile = geno.file,
           MAF.range = c(0.05, 0.5),
           miss.cutoff = 0.5,
           outfile = "glmm.score.Phe-Permutation.txt")

score.results.rand <- read.delim("glmm.score.Phe-Permutation.txt", header=T)

P.pool <- cbind(P.pool, P=score.results.rand$PVAL)
}

plist <- head(sort(apply(P.pool, 1, min)), dim(P.pool)[1]*5/100)
plist[length(plist)]#0.000999952
-log10(plist[length(plist)])
write.csv(P.pool, file = "P.pool.txt")

The Manhattan plot of the results:

The suggestive genome-wide significance threshold of ~3 (in -log10) results in No significant variant in the analysis. Please see the plot below:

---
title: "R Notebook for EBV BL - GWAS"
output: html_notebook
---
 
Here, I am explaining the processing step of variants for GWAS. Please, ignore the code parts unless interested in details. 

Variant loci are generated from the multiple sequence alignments of all EBV genomes we sequenced. Positions and reference allele are in reference to Type 1 reference genome NC_007605.

After calling the variants, we filter out the ones that come from repeatitive regions. We keep all the variants including the synonymous ones as the reviewers suggested. This leaves us 6,495 variant loci to start with.

```{bash}
cd ~/Documents/EBV/ourbatch2/Secondround/ && mkdir SNP-detect_02.27.2020 %% cd SNP-detect_02.27.2020
toolDir='~/Dropbox/codes/ViralGenomeAssembly'

#This will create a vcf file out of the msa using the very first genome (NC_007605) as the reference:
bash $toolDir/scripts/snp_sites_vcf_modifier.sh my_ICed_my_edited_sequence.aln.IDreplaced.OnlyIndividuals-MinusCellLines_sub.with-Ref1.fasta

#Filter the variants in the repeat regions:
bedtools intersect -header -v -wa -a my_ICed_my_edited_sequence.aln.IDreplaced.OnlyIndividuals-MinusCellLines_sub.with-Ref1_modified.vcf -b $toolDir/resources/Annotation/Type1/NC_007605_repeatMask.bed |bedtools intersect -header -v -wa -a - -b $toolDir/resources/Annotation/Type1/NC_007605_miropeat_default_run_repeats.bed|bedtools intersect -header -v -wa -a - -b $toolDir/resources/Annotation/Type1/filter.regions.bed > my_ICed_my_edited_sequence.aln.IDreplaced.OnlyIndividuals-MinusCellLines_sub.with-Ref1_modified_filtered.vcf

#Remove the reference genome entry from the vcf data: 
cut -f1-9,11- my_ICed_my_edited_sequence.aln.IDreplaced.OnlyIndividuals-MinusCellLines_sub.with-Ref1_modified_filtered.vcf > my_ICed_my_edited_sequence.aln.IDreplaced.OnlyIndividuals-MinusCellLines_sub_modified_filtered.vcf
```

### Do PCA and check the SNP variation:
Here, I do PCA on variants to use the PC3 as a cofactor in the association test below. PC3 is the principal component that separated the type 2 genomes into Group A and Group B in Figure 2B-lower panel. Also, reviewers wants us to account for the population structure.

```{r}
library(vcfR)
library(dartR)
vcf.ebv <- read.vcfR("~/Documents/EBV/ourbatch2/Secondround/SNP-detect_02.27.2020/my_ICed_my_edited_sequence.aln.IDreplaced.OnlyIndividuals-MinusCellLines_sub_modified_filtered.vcf")
gl <- vcfR2genlight(vcf.ebv)

#do PCA
pc <- gl.pcoa(gl, nfactors = 3)

#Create a phetype file for GWAS analysis:
pops <- read.delim("~/Dropbox/codes/ViralGenomeAssembly/workspace/data/pop_info.txt",row.names = 1, header = T)
summary(pops)
pop <- read.delim("~/Dropbox/codes/ViralGenomeAssembly/workspace/pop.phe.data.txt",header=TRUE, row.names = 1)
sampleDemo <- cbind(pop, pc$scores[rownames(pop),], data.frame(Method=gsub(" ",replacement = "-",pops[rownames(pop),"Method"])))
sampleDemo$Method <- ifelse(sampleDemo$Method == "Direct-Sequencing", 1, as.character(sampleDemo$Method))
sampleDemo$Method <- ifelse(sampleDemo$Method == "Preamp-sWGA", 2, as.character(sampleDemo$Method))
sampleDemo$Method <- ifelse(sampleDemo$Method == "GenomiPhi-WGA", 3, as.character(sampleDemo$Method))
sampleDemo$Type <- ifelse(sampleDemo$Type == 3, 2, as.factor(sampleDemo$Type))
sampleDemo$Phe <- as.factor(sampleDemo$Phe)
sampleDemo$Type <- as.factor(sampleDemo$Type)

write.table(sampleDemo, file="~/Dropbox/codes/ViralGenomeAssembly/workspace/pop.phe.data.withPCs.txt",quote = F,col.names = NA,sep = "\t" )
```


```{bash eval=FALSE, include=FALSE}
#Header for the pop.phe file of pseq:
#First, add #ID to the first column header with Vi. Then, add the header below.
cp ~/Dropbox/codes/ViralGenomeAssembly/workspace/pop.phe.data.withPCs.txt ./

vi pop.phe.data.withPCs.txt #to add #ID to the first column header AND lines below:

##Phe,Integer,0,"1/2=eBLs/Controls"
##Type,Integer,0,1/2/3=Type1/Type2/Inter-typic"
##Origin,Integer,0,"1/2/3/4=eBL_Cell_Line/HealthyControl/eBL_Tumor/eBL_Plasma"
##Age,Float,-9,"Age in years"
##Gender,String,?,"M/F=Male/Female"
##Age_isLessThan7,Integer,-9,"1/0=isAgeLessThan7"
##Gender_isMale,Integer,-9,"1/0=Male/Female"
##PC1,Float,"PC1"
##PC2,Float,"PC2"
##PC3,Float,"PC3"
##Method,Integer,-9,"1/2/3=Direct-Sequencing/mlrPCR-sWGA/GenomiPhi-WGA"
```

### Generate a genetic relatedness matrix (GRM) for EBV Genomes:

I used GEMMA for this:

```{bash}
#plink gets confused when IDs contain underscore, '_'
sed 's/_/-/g' my_ICed_my_edited_sequence.aln.IDreplaced.OnlyIndividuals-MinusCellLines_sub_modified_filtered.vcf > my_ICed_my_edited_sequence.aln.IDreplaced.OnlyIndividuals-MinusCellLines_sub_modified_filtered._.fixed.vcf

#First create file formats that GEMMA can take:
#plink --vcf my_ICed_my_edited_sequence.aln.IDreplaced.OnlyIndividuals-MinusCellLines_sub_modified_filtered._.fixed.vcf --allow-extra-chr --out my.plink.ped.data --make-bed
plink --vcf my_ICed_my_edited_sequence.aln.IDreplaced.OnlyIndividuals-MinusCellLines_sub_modified_filtered.vcf --allow-extra-chr --out my.plink.ped.data --make-bed
#This creates files my.plink.ped.data.bed my.plink.ped.data.bim my.plink.ped.data.fam

#Create a custom fam file with pheno info:
grep -v "##" pop.phe.data.withPCs.txt|cut -f1,2,6|awk '{print $1"\t"$1"\t"0"\t"0"\t"$3"\t"$2}'|awk '{if($5 == M) print $0"\t"1; else print $0"\t"2}'|awk '{if($6 == 2) print $0"\t"0; else print $0"\t"1}'|cut -f1-4,7-|grep -v "#"|tr '\t' ' ' > my.plink.ped.data.fam

gemma -bfile my.plink.ped.data -gk 1 -o GRM.ebv
gemma -bfile my.plink.ped.data -gk 2 -o GRM.ebv
#GEMMA 0.98.1 (2018-12-10) by Xiang Zhou and team (C) 2012-2018
#Reading Files ...
## number of total individuals = 98
## number of analyzed individuals = 98
## number of covariates = 1
## number of phenotypes = 1
## number of total SNPs/var        =     6764
## number of analyzed SNPs         =      511
#Calculating Relatedness Matrix ...
#================================================== 100%
#**** INFO: Done.
#This process created a folder 'output' with file 'GRM.ebv.cXX.txt' in it.

#Copy files for glmm.score:
cp my.plink.ped.data.* /usr/local/lib/R/3.5/site-library/GMMAT/extdata/
```


### Genome-wide association with GMMAT
This part is the actual GWAS using generalized mixed model with GRM of EBV as the random effect and Age, Gender, EBV type, and PC3 as the fixed effects.

In the test, I mask (not test) the variant loci with less than 5% minor allele frequency and more than 50% missing data. This gives us 2,807 variant loci to be tested.

```{r}
library(GMMAT)
sampleDemo$id <- rownames(sampleDemo)

GRM.ebv <- as.matrix(read.table("~/Documents/EBV/ourbatch2/Secondround/SNP-detect_02.27.2020/output/GRM.ebv.cXX.txt", check.names = FALSE))
rownames(GRM.ebv) <- sampleDemo$id
colnames(GRM.ebv) <- sampleDemo$id
#pheatmap::pheatmap(GRM.ebv, cluster_rows = F, cluster_cols = F, annotation_col = sampleDemo[,c("Type","Gender","Age","Method")])

geno.file <- strsplit(system.file("extdata", "my.plink.ped.data.bed", package = "GMMAT"), ".bed", fixed = TRUE)[[1]]

model0 <- glmmkin(Phe ~ Type + Age + Gender + PC3, data = sampleDemo, kins = GRM.ebv, id = "id", family = binomial(link = "logit"))

glmm.score(model0, 
           missing.method = "omit", 
           infile = geno.file,
           MAF.range = c(0.05, 0.5),
           miss.cutoff = 0.5,
           outfile = "glmm.score.gds.testoutfile.txt")

score.results <- read.delim("glmm.score.gds.testoutfile.txt", header=T)

```

### Determine the genome-wide significance threshold:
Here, I determine the genome-wide significance threshold via phenotype (BL/healthy control) label permutations (1,000 times). This resulted in a suggestive significance threshold of 0.000999952 ~= 10^-3.

```{r}

P.pool <- data.frame(row.names = rownames(score.results))
for( i in 1:1000){
  print(i)
#Permuting the phenotype of interest
sampleDemo.rand <- sampleDemo
sampleDemo.rand$Phe <- sample(sampleDemo$Phe, replace = T)
model.rand <- glmmkin(Phe ~ Type + Age + Gender + PC3, data = sampleDemo.rand, kins = GRM.ebv, id = "id", family = binomial(link = "logit"))

glmm.score(model.rand, 
           missing.method = "omit", 
           infile = geno.file,
           MAF.range = c(0.05, 0.5),
           miss.cutoff = 0.5,
           outfile = "glmm.score.Phe-Permutation.txt")

score.results.rand <- read.delim("glmm.score.Phe-Permutation.txt", header=T)

P.pool <- cbind(P.pool, P=score.results.rand$PVAL)
}

plist <- head(sort(apply(P.pool, 1, min)), dim(P.pool)[1]*5/100)
plist[length(plist)]#0.000999952
-log10(plist[length(plist)])
write.csv(P.pool, file = "P.pool.txt")
```

### The Manhattan plot of the results:

The suggestive genome-wide significance threshold of ~3 (in -log10) results in No significant variant in the analysis. Please see the plot below:

```{r}
library(tidyverse)

head(score.results)
score.results$PVAL <- ifelse(is.na(score.results$PVAL), 1.0, as.numeric(score.results$PVAL))

genome.wide.signifinance <- plist[length(plist)]
score.results.sig <- score.results[ which(score.results$PVAL < genome.wide.signifinance),]$POS

p1 <- ggplot(score.results, aes(x=POS/1000, y=-log10(PVAL)))+ geom_point(size=4, color="grey")+
  geom_hline(aes(yintercept= as.numeric(-log10(genome.wide.signifinance))),colour = "red", linetype="dashed", size = 1) + theme_bw()+
  geom_point(data=score.results[ -log10(score.results$PVAL+0.0000001) > -log10(genome.wide.signifinance),],aes(x=POS/1000, y=-log10(PVAL)),size=4, color="red")+
  ylab("-log10 (P value)")+
  xlab("Genomic Postion (kb)")+
  ylim(0,5)+
  theme(axis.text.x = element_text(color = "black",face = "bold", size=18), 
        axis.text.y = element_text(color = "black",face = "bold",size=18),
        axis.title.y = element_text(color = "black",face = "bold",size=20),
        axis.title.x = element_text(color = "black",face = "bold",size=20))

ggsave(plot=p1,width = 18,height = 9, dpi=200, filename="/Users/yasinkaymaz/Dropbox/Papers/EBV_project/workspace/Figure_temps/assoc8.pdf", useDingbats=FALSE )


results.out <- score.results[which(!is.na(score.results$PVAL)),!colnames(score.results) %in% c("SNP", "cM", "VAR")]
results.out <- results.out[c("CHR", "POS", "A2", "A1", "N", "AF", "SCORE", "PVAL")]

colnames(results.out)[colnames(results.out) == 'A2'] <- 'REF'
colnames(results.out)[colnames(results.out) == 'A1'] <- 'ALT'

write.table(results.out[order(results.out$PVAL),], file="~/Dropbox/Papers/EBV_project/workspace/data/GWAS_GLMM_results.All.txt",sep="\t",quote = FALSE,row.names = FALSE)


 ggplot(score.results, aes(x=POS/1000, y=-log10(PVAL)))+ geom_point(size=2, color="grey")+
  geom_hline(aes(yintercept= as.numeric(-log10(genome.wide.signifinance))),colour = "red", linetype="dashed", size = 1) + theme_bw()+
  geom_point(data=score.results[ -log10(score.results$PVAL+0.0000001) > -log10(genome.wide.signifinance),],aes(x=POS/1000, y=-log10(PVAL)),size=2, color="red")+
  ylab("-log10 (P value)")+
  xlab("Genomic Postion (kb)")+
  ylim(0,5)+
  theme(axis.text.x = element_text(color = "black",face = "bold", size=18), 
        axis.text.y = element_text(color = "black",face = "bold",size=18),
        axis.title.y = element_text(color = "black",face = "bold",size=20),
        axis.title.x = element_text(color = "black",face = "bold",size=20))

```
