The variants were called on the 2024 dataset only (8 times 12 samples
from the 2022 subclone dataset).
filters applied so far : SNPs
removed when found in PON + Friends
reasoning: the allele frequencies are sometimes >0.5. For somatic mutations, we do not want to keep mutations that are widespread (shared by all individuals). The Pando clone is triploid, which reduces our expectation for fixation of a mutation to 0.33. We thus transform the allele frequency file to not take into account any variants whose allele frequency > 0.7, and reduce to 0.5 ane allele frequency comprised between 0.5 and 0.7.
I used the modified allele frequency file to calculate point
estimates based on genotype likelihoods. I used the following script :
/uufs/chpc.utah.edu/common/home/u6028866/Pando/replicate_analysis/scripts
and the files are in this folder:
/uufs/chpc.utah.edu/common/home/u6028866/Pando/replicate_analysis/variants/filtering
Download the file from the cluster to the laptop and continue analyses: filter for empty variants, singletons and for read depth>4.
#scp u6028866@kingspeak.chpc.utah.edu:/uufs/chpc.utah.edu/common/home/u6028866/Pando/replicate_analysis/variants/filtering/pntest\* /Users/rpineau3/Dropbox\ \(GaTech\)/BioSci-Ratcliff/Rozenn/4.PandoProject/11-field-lab-work/5-GBS/1_analysis_of_replicates/data/
# upoad pntest data and depth info
pn_est <- read.table("/Users/rpineau3/Dropbox (GaTech)/BioSci-Ratcliff/Rozenn/4.PandoProject/11-field-lab-work/5-GBS/1_analysis_of_replicates/1_data/pntest_friendsfil_ponfil_replicates_variants.txt", sep = " " ) # 797 x 96
depth <- read.csv("/Users/rpineau3/Dropbox (GaTech)/BioSci-Ratcliff/Rozenn/4.PandoProject/11-field-lab-work/5-GBS/1_analysis_of_replicates/1_data/depth_reps.csv") # 797 x 96
ids <- read.csv("/Users/rpineau3/Dropbox (GaTech)/BioSci-Ratcliff/Rozenn/4.PandoProject/11-field-lab-work/5-GBS/1_analysis_of_replicates/1_data/replicate_ids.csv", header = T) # 96 x 6
# Filter based on depth
par(family = "Times New Roman")
plot(colMeans(depth), pch = 16, xlab = "individual", ylab = "depth", ylim = c(0,150)) # Mean depth per sample
abline(h=4, col = "red")
keep <- which (colMeans(depth) >= 4) # 79
pnt_est_fil <- pn_est[,keep] # 797 x 79
ids_fil <- ids[as.numeric(keep), ] # 79 x 6
depth_fil <- depth[,keep] # 797 x 79
# -----------------------------------------------------------------------------
# BINARIZATION
# -----------------------------------------------------------------------------
pn_bin <- matrix(NaN, dim(pnt_est_fil)[1], dim(pnt_est_fil)[2])
threshold <- 0.5
for (x in 1:dim(pnt_est_fil)[1]) {
for (y in 1:dim(pnt_est_fil)[2]) {
if (pnt_est_fil[x,y]>threshold) {pn_bin[x,y] <- 1 } # if proba of being heteroZ > threshold, label with 1
else {pn_bin[x,y] <- 0 }
}
}
nInd <- 79 # number of individuals in the dataset
hets <- rowSums(pn_bin)/nInd
#length(which(hets == 0)) # 110
#length(which(rowSums(pn_bin)>1)) # 429
# remove singletons, meaning present in 1/nInd individuals AND
# present in zero individuals (left after filtering out some individuals, maybe)
pn_bin_fil <- pn_bin[c(rowSums(pn_bin)>1),] # 429 x 79
af_mod$V1[c(rowSums(pn_bin)<=1)] <- 0
nSNP <- 429 # number of SNPs in the dataset
hets <- (rowSums(pn_bin_fil))/nInd
par(family="Times New Roman", cex.lab=1.2, cex.axis=1.2, cex.main=1, cex.sub=1)
hist(hets, xlab = "Heterozygosity", ylab = "counts", 100, main = "heterozygosity per SNP, after removing singletons",
col = "#2ca25f")
There are a lot of mutations that are above 0.8 - do we want to keep them when we calculate the replicability of our sequencing (i.e. the probability of cathing a mutation if it is there, or do we focus on somatic mutations for these values?)
I would like to find the probability of mutation replication, i.e.: what is the probability that, if I detect a mutation in one sample within the replicate group, I detect it in all other samples within the group?
# How many mutations are found in every replicate versus not?
# 1 - dataset divided by replicate group (3 to 8 samples)
group <- c(1:6,8:12) # no data for replicate group 7 anymore
store_prop <- c()
# number of mutations found in every replicate, normalized by the mean number of mutations found in each replicate
for (rep in 1:length(group)) {
inds <- which( ids_fil$replicate_group == group[rep] )
ind_prop <- c(rowSums(pn_bin_fil[,inds])/length(inds)) # proportion of individuals with the mutation
mean_n_mut <- mean(colSums(pn_bin_fil[,inds])) # mean number of mutations per sample
#prop <- c(length(which( ind_prop == 0 )) + length(which( ind_prop == 1 )) ) / nSNP
prop <- c( length(which( ind_prop == 1 )) ) / mean_n_mut
store_prop <- c(store_prop, prop)
}
data_store_prop <- store_prop
par(family = "Times New Roman")
plot(data_store_prop, pch = 16, ylim = c(0,1), xlab = "replicate", ylab = "Prop. of SNP in common per set of reps")
abline(h=mean(store_prop), col = "red") # 0.64
# 2 - Compare to random
store_mean_prop <- c()
for (sim in 1:1000){
pn_bin_rdn <- apply(pn_bin_fil, 2, sample)
store_prop <- c()
# number of mutations found in every replicate, normalized by the mean number of mutations found in each replicate
for (rep in 1:length(group)) {
rand_inds <- sample(ids_fil$replicate_group) # re-shuffle individuals
inds <- which( rand_inds == group[rep] )
ind_prop <- c(rowSums(pn_bin_rdn[,inds])/length(inds)) # proportion of individuals with the mutation
mean_n_mut <- mean(colSums(pn_bin_fil[,inds])) # mean number of mutations per sample
#prop <- c(length(which( ind_prop == 0 )) + length(which( ind_prop == 1 )) ) / nSNP
prop <- c( length(which( ind_prop == 1 )) ) / mean_n_mut
store_prop <- c(store_prop, prop)
}
store_mean_prop[sim] <- mean(store_prop)
}
# Plot and compare
par(family = "Times New Roman")
hist(store_mean_prop, 10, xlab = "Prop. of SNP in common per set of reps", col = "#3690c0", xlim = c(0,1),
main = "Comparison per replicate, count of mutations present in *all* of reps", border = "#3690c0")
abline(v=mean(data_store_prop), col = "#045a8d")
legend("topright", legend = c("Expected by chance" , "Mean for data (0.6)"), pch = 15, col = c("#3690c0", "#045a8d"), bty = "n" )
# Allow one missing mutation by changing threshold to 0.8
We have a good replication (60%) within replicate groups, significantly better than expected by chance.
I do the same analysis, this time by reducing the threshold to 80%,
meaning: I count every mutation being found in 80% or more of the
samples within a replicate group.
Question for Zach: is that sufficient to assess our “replication power”, or the “quality of sequencing” in our experiment?
We checked that we could trust the data. Now I will only keep the SNPs that are found in less than 60% of the samples. I am being more conservativere with a 60% threshold, when we were using a 80% threshold before, mostly because we see in the previous figure that we miss ~40% of the mutations. (is that true?) I will re-estimate the point estimates using the gl2genest_somatic.pl script, based on this new set of SNP that shoud only contain somatic mutations. I will then filter the vcf to keep those mutations and compare to our previous set of mutations for the same samples.
af_mod$V1[hets>0.6] <- 0
length(which(af_mod$V1==0)) # 505 SNPs allele frequencies set to 0
## [1] 626
# write.csv(af_mod, "/Users/rpineau3/Dropbox (GaTech)/BioSci-Ratcliff/Rozenn/4.PandoProject/11-field-lab-work/5-GBS/1_analysis_of_replicates/data/afsom_friendsfil_ponfil_replicates_variants.txt")
# upload the file on the cluster to re-calculate point estimates :
# scp /Users/rpineau3/Dropbox\ \(GaTech\)/BioSci-Ratcliff/Rozenn/4.PandoProject/11-field-lab-work/5-GBS/1_analysis_of_replicates/data/afsom_friendsfil_ponfil_replicates_variants.txt u6028866@kingspeak.chpc.utah.edu:/uufs/chpc.utah.edu/common/home/u6028866/Pando/replicate_analysis/variants/filtering/
#scp u6028866@kingspeak.chpc.utah.edu:/uufs/chpc.utah.edu/common/home/u6028866/Pando/replicate_analysis/variants/filtering/pntest_som\* /Users/rpineau3/Dropbox\ \(GaTech\)/BioSci-Ratcliff/Rozenn/4.PandoProject/11-field-lab-work/5-GBS/1_analysis_of_replicates/data/
# upoad pntest data and depth info
pn_est_som <- read.table("/Users/rpineau3/Dropbox (GaTech)/BioSci-Ratcliff/Rozenn/4.PandoProject/11-field-lab-work/5-GBS/1_analysis_of_replicates/1_data/pntest_somatic_friendsfil_ponfil_replicates_variants.txt", sep = " " ) # 797 x 96
af_mod_2 <- af_mod
# Filter based on depth
pnt_est_som_fil <- pn_est_som[,keep] # 797 x 79
# -----------------------------------------------------------------------------
# BINARIZATION
# -----------------------------------------------------------------------------
pn_som_bin <- matrix(NaN, dim(pnt_est_som_fil)[1], dim(pnt_est_som_fil)[2])
threshold <- 0.5
for (x in 1:dim(pnt_est_som_fil)[1]) {
for (y in 1:dim(pnt_est_som_fil)[2]) {
if (pnt_est_som_fil[x,y]>threshold) {pn_som_bin[x,y] <- 1 } # if proba of being heteroZ > threshold, label with 1
else {pn_som_bin[x,y] <- 0 }
}
}
nInd <- 79 # number of individuals in the dataset
#length(which(hets == 0)) # 110
#length(which(rowSums(pn_som_bin)>1)) # 171
# remove singletons, meaning present in 1/nInd individuals AND
# present in zero individuals (left after filtering out some individuals, maybe)
pn_bin_som_fil <- pn_som_bin[c(rowSums(pn_som_bin)>1),] # 171 x 79
af_mod_2$V1[c(rowSums(pn_som_bin)<=1)] <- 0
nSNP <- 171 # number of SNPs in the dataset
hets <- (rowSums(pn_bin_som_fil))/nInd
par(family="Times New Roman", cex.lab=1.2, cex.axis=1.2, cex.main=1, cex.sub=1)
hist(hets, xlab = "Heterozygosity", ylab = "counts", 100, main = "heterozygosity per SNP, after removing singletons",
col = "#2ca25f")
Ok, what I find interesting here, is that I am still left with SNPs with heterozygoty with frequency > 0.6 when I got rid of them previously - so our previous filtering was “counteracted” by the use of this new script.
af_mod_2$V1[hets>0.6] <- 0
# To filter the vcf, I need (1) the list of individuals and (2) the list of SNPs
bool <- matrix(1,length(af_mod_2$V1), 1)
bool[c(af_mod_2$V1 == 0)] <- 0 # 74 somatic mutations to look at
# write.csv(bool, "/Users/rpineau3/Dropbox (GaTech)/BioSci-Ratcliff/Rozenn/4.PandoProject/11-field-lab-work/5-GBS/1_analysis_of_replicates/data/bool_to_filter_vcf.csv", row.names = F)
# write.csv(ids_fil, "/Users/rpineau3/Dropbox (GaTech)/BioSci-Ratcliff/Rozenn/4.PandoProject/11-field-lab-work/5-GBS/1_analysis_of_replicates/data/IDs_repicates_79.csv", row.names = F)
#
I used my python script to filter the vcf based on the boolean vector. I will transfer the filtered vcf file to the cluster to remove compare each sample with its all in the subclone dataset. I also want to filter out the individuals that were filtered out based on depth.
# scp /Users/rpineau3/Dropbox\ \(GaTech\)/BioSci-Ratcliff/Rozenn/4.PandoProject/11-field-lab-work/5-GBS/1_analysis_of_replicates/data/IDs_replicates_79.csv u6028866@kingspeak.chpc.utah.edu:/uufs/chpc.utah.edu/common/home/u6028866/Pando/replicate_analysis/variants/filtering/
# bcftools query -l friendsfil_ponfil_optionIA_fitered_96inds.vcf | wc -l
# 96
# After filtering
# bcftools query -l friendsfil_ponfil_optionIA_filtered_79inds.vcf | wc -l
# 79
# now, compare both vcfs (subclone dataset by itself and this set of 74 mutations)
# start back from this vcf file: /uufs/chpc.utah.edu/common/home/u6028866/Pando/subclone/variants/subpando_variants.vcf and do all of the vcf filtering steps to get there
library(ape)
library(colourvalues)
library(ecodist)
# -----------------------------------------------------------------------------
# PAIRWISE SIMILARY
# -----------------------------------------------------------------------------
# All pairwise comparisons
# 1 - go sample by sample
# 2 - for each pair A and B, calculate the number of mutations shared b/w A and B
# 3 - calculate the number of mutations in each A and B, take the mean
# 4 - divide #2 by #3
# 5 - store this number in a nInd x nInd matrix
# x first
store_mat <- matrix(NA, nInd, nInd)
n <- 1
N <- 1
for (x in n:nInd) {
for (y in N:nInd) {
n_shared <- sum(rowSums(pn_bin_som_fil[,c(x, y)]) == 2) # calculate the number of mutations shared b/w A and B
n_sample_x <- sum(pn_bin_som_fil[,x] == 1) # calculate the number of mutations in each sample
n_sample_y <- sum(pn_bin_som_fil[,y] == 1)
val <- n_shared / mean(n_sample_x,n_sample_y) # calculate similarity
store_mat[x,y] <- val
}
# N <- N + 1
}
# what is the range of values
par(family = "Times New Roman")
hist(store_mat[lower.tri(store_mat)], 100, col = "blue", xlim = c(0,1), main = "Range of value for pairwise similarity", xlab = "Pairwise similarity")
# transform to distance object
dist_obj <- as.dist(store_mat)
pcoa_res <- pco(dist_obj, negvals = "zero", dround = 0) #, negvals = "zero", dround = 0, rm= 1) # if negvals = 0 sets all negative eigenvalues to zero; if = "rm" corrects for negative eigenvalues using method 1 of Legendre and Anderson 1999
# color by tree ID
par(mfrow=c(1,2))
col_vec <- colour_values(ids_fil$tree.ID, palette = "viridis")
par(family = "Times New Roman", cex = 1.2)
plot(pcoa_res$vectors[,1], pcoa_res$vectors[,2], xlab = "PCoA1", ylab = "PCoA2",pch = 16,
axes = TRUE, main = "PCoA colored by Tree ID", , col = col_vec)
plot(pcoa_res$vectors[,3], pcoa_res$vectors[,4], xlab = "PCoA3", ylab = "PCoA4",pch = 16,
axes = TRUE, main = "PCoA colored by Tree ID", , col = col_vec)
legend("topright", legend=c("Tree A", "Tree G", "Tree H"), pch = 16, col = c("#440154FF", "#21908CFF", "#FDE725FF"), bty = "n")
# color by tissue type
par(mfrow=c(1,2))
col_vec <- colour_values(ids_fil$tissue, palette = "viridis")
par(family = "Times New Roman", cex = 1.2)
plot(pcoa_res$vectors[,1], pcoa_res$vectors[,2], xlab = "PCoA1", ylab = "PCoA2",pch = 16,
axes = TRUE, main = "PCoA colored by tissue type", , col = col_vec)
plot(pcoa_res$vectors[,3], pcoa_res$vectors[,4], xlab = "PCoA3", ylab = "PCoA4",pch = 16,
axes = TRUE, main = "PCoA colored by tissue type", , col = col_vec)
legend("topright", legend=c("leaf", "root"), pch = 16, col = c("#440154FF", "#FDE725FF"), bty = "n")
# color by replicate group
par(mfrow=c(1,2))
col_vec <- colour_values(ids_fil$replicate_group, palette = "plasma")
par(family = "Times New Roman", cex = 1.2)
plot(pcoa_res$vectors[,1], pcoa_res$vectors[,2], xlab = "PCoA1", ylab = "PCoA2",pch = 16,
axes = TRUE, main = "PCoA colored by replicate group", , col = col_vec)
plot(pcoa_res$vectors[,3], pcoa_res$vectors[,4], xlab = "PCoA1", ylab = "PCoA2",pch = 16,
axes = TRUE, main = "PCoA colored by replicate group", , col = col_vec)