Dataset details

The variants were called on the 2024 dataset only (8 times 12 samples from the 2022 subclone dataset).
filters applied so far : SNPs removed when found in PON + Friends

—————————————————————————–

Step 1: filter based on the allele frequencies

—————————————————————————–

reasoning: the allele frequencies are sometimes >0.5. For somatic mutations, we do not want to keep mutations that are widespread (shared by all individuals). The Pando clone is triploid, which reduces our expectation for fixation of a mutation to 0.33. We thus transform the allele frequency file to not take into account any variants whose allele frequency > 0.7, and reduce to 0.5 ane allele frequency comprised between 0.5 and 0.7.

—————————————————————————–

Step 2: Point estimates with modified allele frequencies

—————————————————————————–

I used the modified allele frequency file to calculate point estimates based on genotype likelihoods. I used the following script : /uufs/chpc.utah.edu/common/home/u6028866/Pando/replicate_analysis/scripts and the files are in this folder: /uufs/chpc.utah.edu/common/home/u6028866/Pando/replicate_analysis/variants/filtering

Download the file from the cluster to the laptop and continue analyses: filter for empty variants, singletons and for read depth>4.

#scp u6028866@kingspeak.chpc.utah.edu:/uufs/chpc.utah.edu/common/home/u6028866/Pando/replicate_analysis/variants/filtering/pntest\* /Users/rpineau3/Dropbox\ \(GaTech\)/BioSci-Ratcliff/Rozenn/4.PandoProject/11-field-lab-work/5-GBS/1_analysis_of_replicates/data/

# upoad pntest data and depth info
pn_est <- read.table("/Users/rpineau3/Dropbox (GaTech)/BioSci-Ratcliff/Rozenn/4.PandoProject/11-field-lab-work/5-GBS/1_analysis_of_replicates/1_data/pntest_friendsfil_ponfil_replicates_variants.txt", sep = " " ) # 797 x 96
depth <- read.csv("/Users/rpineau3/Dropbox (GaTech)/BioSci-Ratcliff/Rozenn/4.PandoProject/11-field-lab-work/5-GBS/1_analysis_of_replicates/1_data/depth_reps.csv") # 797 x 96
ids <- read.csv("/Users/rpineau3/Dropbox (GaTech)/BioSci-Ratcliff/Rozenn/4.PandoProject/11-field-lab-work/5-GBS/1_analysis_of_replicates/1_data/replicate_ids.csv", header = T) # 96 x 6 


# Filter based on depth
par(family = "Times New Roman")
plot(colMeans(depth), pch = 16, xlab = "individual", ylab = "depth", ylim = c(0,150)) # Mean depth per sample
abline(h=4, col = "red")

keep <- which (colMeans(depth) >= 4) # 79

pnt_est_fil <- pn_est[,keep] # 797 x 79
ids_fil <- ids[as.numeric(keep), ] # 79 x 6
depth_fil <- depth[,keep] # 797 x 79


# -----------------------------------------------------------------------------
# BINARIZATION
# -----------------------------------------------------------------------------
pn_bin <- matrix(NaN, dim(pnt_est_fil)[1], dim(pnt_est_fil)[2])
threshold <- 0.5

for (x in 1:dim(pnt_est_fil)[1]) {
  
  for (y in 1:dim(pnt_est_fil)[2]) {
    
    if (pnt_est_fil[x,y]>threshold) {pn_bin[x,y] <- 1 } # if proba of being heteroZ > threshold, label with 1
    
    else {pn_bin[x,y] <- 0 }
    
  }
}


nInd <- 79 # number of individuals in the dataset

hets <- rowSums(pn_bin)/nInd
#length(which(hets == 0)) # 110
#length(which(rowSums(pn_bin)>1)) # 429

# remove singletons, meaning present in 1/nInd individuals AND
# present in zero individuals (left after filtering out some individuals, maybe)

pn_bin_fil <- pn_bin[c(rowSums(pn_bin)>1),] # 429 x  79
af_mod$V1[c(rowSums(pn_bin)<=1)] <- 0

nSNP <-  429 # number of SNPs in the dataset
hets <- (rowSums(pn_bin_fil))/nInd

par(family="Times New Roman", cex.lab=1.2, cex.axis=1.2, cex.main=1, cex.sub=1)
hist(hets, xlab = "Heterozygosity", ylab = "counts", 100, main = "heterozygosity per SNP, after removing singletons", 
     col = "#2ca25f")

There are a lot of mutations that are above 0.8 - do we want to keep them when we calculate the replicability of our sequencing (i.e. the probability of cathing a mutation if it is there, or do we focus on somatic mutations for these values?)

—————————————————————————–

Step 3. Assess replication power (quality of our re-sequencing experiment)

—————————————————————————–

I would like to find the probability of mutation replication, i.e.: what is the probability that, if I detect a mutation in one sample within the replicate group, I detect it in all other samples within the group?

# How many mutations are found in every replicate versus not?

# 1 - dataset divided by replicate group (3 to 8 samples)

group <- c(1:6,8:12) # no data for replicate group 7 anymore
store_prop <- c()

# number of mutations found in every replicate, normalized by the mean number of mutations found in each replicate
for (rep in 1:length(group)) {
  
  inds <- which( ids_fil$replicate_group == group[rep] )

  ind_prop <- c(rowSums(pn_bin_fil[,inds])/length(inds)) # proportion of individuals with the mutation

  mean_n_mut <- mean(colSums(pn_bin_fil[,inds])) # mean number of mutations per sample
  
  #prop <- c(length(which( ind_prop == 0 )) + length(which( ind_prop == 1 )) ) / nSNP
  prop <- c( length(which( ind_prop == 1 )) ) / mean_n_mut
  store_prop <- c(store_prop, prop)
  
}

data_store_prop <- store_prop
par(family = "Times New Roman")
plot(data_store_prop, pch = 16, ylim = c(0,1), xlab = "replicate", ylab = "Prop. of SNP in common per set of reps")
abline(h=mean(store_prop), col = "red") # 0.64

# 2 - Compare to random
store_mean_prop <- c()

for (sim in 1:1000){
  

  pn_bin_rdn <- apply(pn_bin_fil, 2, sample)
  store_prop <- c()
  
  # number of mutations found in every replicate, normalized by the mean number of mutations found in each replicate
  for (rep in 1:length(group)) {
    
    rand_inds <- sample(ids_fil$replicate_group) # re-shuffle individuals  
    
    inds <- which( rand_inds == group[rep] )
    
    ind_prop <- c(rowSums(pn_bin_rdn[,inds])/length(inds)) # proportion of individuals with the mutation
    mean_n_mut <- mean(colSums(pn_bin_fil[,inds])) # mean number of mutations per sample
    #prop <- c(length(which( ind_prop == 0 )) + length(which( ind_prop == 1 )) ) / nSNP
    prop <- c( length(which( ind_prop == 1 )) ) / mean_n_mut
    store_prop <- c(store_prop, prop)
    
    }
  
  store_mean_prop[sim] <- mean(store_prop)
  
}

# Plot and compare
par(family = "Times New Roman")
hist(store_mean_prop, 10, xlab = "Prop. of SNP in common per set of reps", col = "#3690c0", xlim = c(0,1),
     main = "Comparison per replicate, count of mutations present in *all* of reps", border = "#3690c0")
abline(v=mean(data_store_prop),  col = "#045a8d")
legend("topright", legend = c("Expected by chance" , "Mean for data (0.6)"), pch = 15, col = c("#3690c0", "#045a8d"), bty = "n" )

# Allow one missing mutation by changing threshold to 0.8

We have a good replication (60%) within replicate groups, significantly better than expected by chance.

I do the same analysis, this time by reducing the threshold to 80%, meaning: I count every mutation being found in 80% or more of the samples within a replicate group.

Question for Zach: is that sufficient to assess our “replication power”, or the “quality of sequencing” in our experiment?

—————————————————————————–

Step 4: Filtering for somatic mutations

—————————————————————————–

We checked that we could trust the data. Now I will only keep the SNPs that are found in less than 60% of the samples. I am being more conservativere with a 60% threshold, when we were using a 80% threshold before, mostly because we see in the previous figure that we miss ~40% of the mutations. (is that true?) I will re-estimate the point estimates using the gl2genest_somatic.pl script, based on this new set of SNP that shoud only contain somatic mutations. I will then filter the vcf to keep those mutations and compare to our previous set of mutations for the same samples.

af_mod$V1[hets>0.6] <- 0
length(which(af_mod$V1==0)) # 505 SNPs allele frequencies set to 0
## [1] 626
# write.csv(af_mod, "/Users/rpineau3/Dropbox (GaTech)/BioSci-Ratcliff/Rozenn/4.PandoProject/11-field-lab-work/5-GBS/1_analysis_of_replicates/data/afsom_friendsfil_ponfil_replicates_variants.txt")

# upload the file on the cluster to re-calculate point estimates : 
# scp /Users/rpineau3/Dropbox\ \(GaTech\)/BioSci-Ratcliff/Rozenn/4.PandoProject/11-field-lab-work/5-GBS/1_analysis_of_replicates/data/afsom_friendsfil_ponfil_replicates_variants.txt u6028866@kingspeak.chpc.utah.edu:/uufs/chpc.utah.edu/common/home/u6028866/Pando/replicate_analysis/variants/filtering/

#scp u6028866@kingspeak.chpc.utah.edu:/uufs/chpc.utah.edu/common/home/u6028866/Pando/replicate_analysis/variants/filtering/pntest_som\* /Users/rpineau3/Dropbox\ \(GaTech\)/BioSci-Ratcliff/Rozenn/4.PandoProject/11-field-lab-work/5-GBS/1_analysis_of_replicates/data/

# upoad pntest data and depth info
pn_est_som <- read.table("/Users/rpineau3/Dropbox (GaTech)/BioSci-Ratcliff/Rozenn/4.PandoProject/11-field-lab-work/5-GBS/1_analysis_of_replicates/1_data/pntest_somatic_friendsfil_ponfil_replicates_variants.txt", sep = " " ) # 797 x 96
af_mod_2 <- af_mod

# Filter based on depth
pnt_est_som_fil <- pn_est_som[,keep] # 797 x 79

# -----------------------------------------------------------------------------
# BINARIZATION
# -----------------------------------------------------------------------------
pn_som_bin <- matrix(NaN, dim(pnt_est_som_fil)[1], dim(pnt_est_som_fil)[2])
threshold <- 0.5

for (x in 1:dim(pnt_est_som_fil)[1]) {
  
  for (y in 1:dim(pnt_est_som_fil)[2]) {
    
    if (pnt_est_som_fil[x,y]>threshold) {pn_som_bin[x,y] <- 1 } # if proba of being heteroZ > threshold, label with 1
    
    else {pn_som_bin[x,y] <- 0 }
    
  }
}


nInd <- 79 # number of individuals in the dataset

#length(which(hets == 0)) # 110
#length(which(rowSums(pn_som_bin)>1)) # 171

# remove singletons, meaning present in 1/nInd individuals AND
# present in zero individuals (left after filtering out some individuals, maybe)

pn_bin_som_fil <- pn_som_bin[c(rowSums(pn_som_bin)>1),] # 171 x  79
af_mod_2$V1[c(rowSums(pn_som_bin)<=1)] <- 0

nSNP <-  171 # number of SNPs in the dataset
hets <- (rowSums(pn_bin_som_fil))/nInd

par(family="Times New Roman", cex.lab=1.2, cex.axis=1.2, cex.main=1, cex.sub=1)
hist(hets, xlab = "Heterozygosity", ylab = "counts", 100, main = "heterozygosity per SNP, after removing singletons", 
     col = "#2ca25f")

Ok, what I find interesting here, is that I am still left with SNPs with heterozygoty with frequency > 0.6 when I got rid of them previously - so our previous filtering was “counteracted” by the use of this new script.

af_mod_2$V1[hets>0.6] <- 0

# To filter the vcf, I need (1) the list of individuals and (2) the list of SNPs
bool <- matrix(1,length(af_mod_2$V1), 1)
bool[c(af_mod_2$V1 == 0)] <- 0 # 74 somatic mutations to look at

# write.csv(bool, "/Users/rpineau3/Dropbox (GaTech)/BioSci-Ratcliff/Rozenn/4.PandoProject/11-field-lab-work/5-GBS/1_analysis_of_replicates/data/bool_to_filter_vcf.csv", row.names = F)
# write.csv(ids_fil, "/Users/rpineau3/Dropbox (GaTech)/BioSci-Ratcliff/Rozenn/4.PandoProject/11-field-lab-work/5-GBS/1_analysis_of_replicates/data/IDs_repicates_79.csv", row.names = F)
# 

I used my python script to filter the vcf based on the boolean vector. I will transfer the filtered vcf file to the cluster to remove compare each sample with its all in the subclone dataset. I also want to filter out the individuals that were filtered out based on depth.

———————————————————————————————-

Step 5. Assess the replication between experiment - how many SNPs in common with initial dataset?

———————————————————————————————-

# scp /Users/rpineau3/Dropbox\ \(GaTech\)/BioSci-Ratcliff/Rozenn/4.PandoProject/11-field-lab-work/5-GBS/1_analysis_of_replicates/data/IDs_replicates_79.csv u6028866@kingspeak.chpc.utah.edu:/uufs/chpc.utah.edu/common/home/u6028866/Pando/replicate_analysis/variants/filtering/
  
# bcftools query -l friendsfil_ponfil_optionIA_fitered_96inds.vcf | wc -l
# 96

# After filtering

# bcftools query -l friendsfil_ponfil_optionIA_filtered_79inds.vcf | wc -l
# 79

# now, compare both vcfs (subclone dataset by itself and this set of 74 mutations)

# start back from this vcf file: /uufs/chpc.utah.edu/common/home/u6028866/Pando/subclone/variants/subpando_variants.vcf and do all of the vcf filtering steps to get there

—————————————————————————–

Step 6. Spatial analysis (PCoA)

—————————————————————————–

Principal coordinate analysis and simple ordination plot

library(ape)
library(colourvalues)
library(ecodist)

# -----------------------------------------------------------------------------
# PAIRWISE SIMILARY
# -----------------------------------------------------------------------------

# All pairwise comparisons
# 1 - go sample by sample
# 2 - for each pair A and B, calculate the number of mutations shared b/w A and B
# 3 - calculate the number of mutations in each A and B, take the mean
# 4 - divide #2 by #3
# 5 - store this number in a nInd x nInd matrix

#  x first
store_mat <- matrix(NA, nInd, nInd)
n <- 1
N <- 1

for (x in n:nInd) {
  
  for (y in N:nInd) {
    
    
    n_shared <- sum(rowSums(pn_bin_som_fil[,c(x, y)]) == 2) #  calculate the number of mutations shared b/w A and B
    n_sample_x <- sum(pn_bin_som_fil[,x] == 1) # calculate the number of mutations in each sample
    n_sample_y <- sum(pn_bin_som_fil[,y] == 1)
    
    val <- n_shared / mean(n_sample_x,n_sample_y) # calculate similarity
    store_mat[x,y] <- val
    
    
  }
  
  # N <- N + 1
  
}

# what is the range of values
par(family = "Times New Roman")
hist(store_mat[lower.tri(store_mat)], 100, col = "blue", xlim = c(0,1), main = "Range of value for pairwise similarity", xlab = "Pairwise similarity")

# transform to distance object
dist_obj <- as.dist(store_mat)
pcoa_res <- pco(dist_obj, negvals = "zero", dround = 0) #, negvals = "zero", dround = 0, rm= 1) # if negvals = 0 sets all negative eigenvalues to zero; if = "rm" corrects for negative eigenvalues using method 1 of Legendre and Anderson 1999

# color by tree ID
par(mfrow=c(1,2))
col_vec <- colour_values(ids_fil$tree.ID, palette = "viridis") 
par(family = "Times New Roman", cex = 1.2)
plot(pcoa_res$vectors[,1], pcoa_res$vectors[,2], xlab = "PCoA1", ylab = "PCoA2",pch = 16,
     axes = TRUE, main = "PCoA colored by Tree ID", , col = col_vec)

plot(pcoa_res$vectors[,3], pcoa_res$vectors[,4], xlab = "PCoA3", ylab = "PCoA4",pch = 16,
     axes = TRUE, main = "PCoA colored by Tree ID", , col = col_vec)
legend("topright", legend=c("Tree A", "Tree G", "Tree H"), pch = 16, col = c("#440154FF", "#21908CFF", "#FDE725FF"), bty = "n")

# color by tissue type
par(mfrow=c(1,2))
col_vec <- colour_values(ids_fil$tissue, palette = "viridis") 
par(family = "Times New Roman", cex = 1.2)
plot(pcoa_res$vectors[,1], pcoa_res$vectors[,2], xlab = "PCoA1", ylab = "PCoA2",pch = 16,
     axes = TRUE, main = "PCoA colored by tissue type", , col = col_vec)

plot(pcoa_res$vectors[,3], pcoa_res$vectors[,4], xlab = "PCoA3", ylab = "PCoA4",pch = 16,
     axes = TRUE, main = "PCoA colored by tissue type", , col = col_vec)
legend("topright", legend=c("leaf", "root"), pch = 16, col = c("#440154FF", "#FDE725FF"), bty = "n")

# color by replicate group
par(mfrow=c(1,2))
col_vec <- colour_values(ids_fil$replicate_group, palette = "plasma") 
par(family = "Times New Roman", cex = 1.2)
plot(pcoa_res$vectors[,1], pcoa_res$vectors[,2], xlab = "PCoA1", ylab = "PCoA2",pch = 16,
     axes = TRUE, main = "PCoA colored by replicate group", , col = col_vec)

plot(pcoa_res$vectors[,3], pcoa_res$vectors[,4], xlab = "PCoA1", ylab = "PCoA2",pch = 16,
     axes = TRUE, main = "PCoA colored by replicate group", , col = col_vec)