Replicability of the replicate samples between two different sequencing batches

Two datasets here : 2024 (replicates, 12x8) and the same 12 samples than the replicate dataset from the subclone (with 2018 and 2022 called together). Goal: I want to know how many SNPs called in this subclone + Pando dataset were found in the replicate dataset, in the same set of samples. This will tell us how many mutations we might be missing when there is only one replicate per sample.

The set of somatic mutations was chosen because found in at least 2 samples per group, and at most 8 groups total (out of 11).

rm(list= ls())

setwd("/Users/rpineau3/Dropbox (GaTech)/BioSci-Ratcliff/Rozenn/4.PandoProject/17-analyses/1_comparison_reps_singles/data/")

pnt_est <- read.table("pntest_somatic_35snps_92inds.txt", sep = " ") # 165 x 92

ids <- read.table("ids_165snps_92inds.txt", sep = "\t", header = T) # 92 x 6

If the mutation is found in the replicated sample (2024 dataset), is it also found in the original single sequenced one (2022 dataset)?

# -----------------------------------------------------------------------------
# BINARIZATION
# -----------------------------------------------------------------------------
pn_bin <- matrix(NaN, dim(pnt_est)[1], dim(pnt_est)[2])
threshold <- 0.5

for (x in 1:dim(pnt_est)[1]) {
  
  for (y in 1:dim(pnt_est)[2]) {
    
    if (pnt_est[x,y]>threshold) {pn_bin[x,y] <- 1 } # if proba of being heteroZ > threshold, label with 1
    
    else {pn_bin[x,y] <- 0 }
    
  }
}


nInd <- 92 # number of individuals in the dataset

hets <- rowSums(pn_bin)/nInd
#length(which(hets == 0)) # 5
#length(which(rowSums(pn_bin)>1)) # 191

nSNP <-  35 # number of SNPs in the dataset

par(family="Times New Roman", cex.lab=1.2, cex.axis=1.2, cex.main=1, cex.sub=1)
hist(hets, xlab = "Heterozygosity", ylab = "counts", 100, main = "Heterozygosity per SNP in replicated samples", sub = "(2022 and 2024 datasets, 35 SNPs in common)", 
     col = "#2ca25f")

# in how many samples of the replicate dataset do we find the same mutation?
rep_idx <- ids$dataset=="replicate"
pn_bin_rep <- pn_bin[,rep_idx] # 165 x 80
ids_rep <- ids[rep_idx,] # 80 x 6

pn_bin_sub <- pn_bin[,!rep_idx] # 165 x 12
ids_sub <- ids[!rep_idx,] # 12 x 6

store_prop <- c()

group <- c(1:6, 8:12)
# number of mutations found in every replicate, normalized by the mean number of mutations found in each replicate
for (rep in 1:length(group)) {
  
  inds <- which( ids_rep$group == group[rep] )

  ind_prop <- c(rowSums(pn_bin_rep[,inds])/length(inds)) # proportion of individuals with the mutation

  mean_n_mut <- mean(colSums(pn_bin_rep[,inds])) # mean number of mutations per sample

  prop <- c( length(which( ind_prop >= 0.8 )) ) / mean_n_mut
  store_prop <- c(store_prop, prop)
  
}

data_store_prop <- store_prop
par(family = "Times New Roman")
plot(data_store_prop, pch = 16, ylim = c(0,1), xlab = "replicate", ylab = "Prop. of SNP in common per set of reps")
abline(h=mean(store_prop), col = "red") # 0.61

The value for group #2 is closer to 1 because most mutations are found in all samples in this group, maybe mostly due to the group size (3 samples left in this group after filtering).

# choose threshold
t <- 2

store_replicability <- matrix(NA, 1, 11)

for (rep in 1:length(group)) {
  
  inds <- which( ids_rep$group == group[rep] ) # within group individuals for the replicate dataset
  
  snps <- which(rowSums(pn_bin_rep[,inds]) >= t ) # which SNPS are present in that group? 
  
  store_replicability[rep] <- sum(pn_bin_sub[snps,group[rep]])/length(snps) # proportion of snps found in replicate also found in single samples
  
}

mean_dt <- mean(store_replicability, na.rm=T) # 0.48


# Compare to random
N <- 1000
store_replicability_rdn <- matrix(NA, 1, N)

for (n in 1:N) {
  
  store_replicability <- matrix(NA, 1, 11)
  
  for (rep in 1:length(group)) {
    
    rdn_inds <- sample(ids_rep$group) # randomly sample through the individuals
    
    inds <- which( rdn_inds == group[rep] ) # within group individuals for the replicate dataset
    
    snps <- which(rowSums(pn_bin_rep[,inds]) >= t ) # which SNPS are present in that group?
    
    store_replicability[rep] <- sum(pn_bin_sub[snps,group[rep]])/length(snps)# proportion of snps found in replicate also found in single samples
  
    
  }
  
  store_replicability_rdn[n] <- mean(store_replicability, na.r=TRUE) 
  
    
}

mean(store_replicability_rdn, na.rm = T) # 0.24
## [1] 0.3954646
# Maybe most mutations are found in all reps?

# Plot 
par(family = "Times New Roman")
hist(store_replicability_rdn, xlab = "Prop. of SNPs found in the single sample, when found in >=2 samples in replicate dataset", 100, col = "blue", main = "")
abline(v=mean(mean_dt), col = "red", lwd = 2) 
legend("topleft", legend = c("mean for data", "mean for randomized dataset"), col = c("red", "blue"), pch = 18, bty = "n")

We chose a threshold of 2. How does this threshold affect this result?

# choose threshold
T <- 4
store_replicability <- matrix(NA, T, length(group))

for (t in 1:T) {

  for (rep in 1:length(group)) {
    
    inds <- which( ids_rep$group == group[rep] ) # within group individuals for the replicate dataset
    
    snps <- which(rowSums(pn_bin_rep[,inds]) >= t ) # which SNPS are present in that group? 
    
    store_replicability[t,rep] <- sum(pn_bin_sub[snps,group[rep]])/length(snps) # proportion of snps found in replicate also found in single samples
    
  }
  

}

mean_dt <- rowMeans(store_replicability, na.rm=T) # 0.4


# Compare to random
N <- 1000
mean_rdn <- matrix(NA, N, T)


  
for (n in 1:N) {
  
  store_replicability <- matrix(NA, T, 11)
  
  for (t in 1:T) {
    
    for (rep in 1:length(group)) {
      
      rdn_inds <- sample(ids_rep$group) # randomly sample through the individuals
      
      inds <- which( rdn_inds == group[rep] ) # within group individuals for the repicate dataset
      
      snps <- which(rowSums(pn_bin_rep[,inds]) >= t ) # which SNPS are present in that group?
      
      store_replicability[t, rep] <- sum(pn_bin_sub[snps,group[rep]])/length(snps) # proportion of snps found in replicate also found in single samples
    
    }
    
      
  }
  
  mean_rdn[n,] <- rowMeans(store_replicability, na.rm=T) # 0.4

}

# Plot 
par(family = "Times New Roman")

plot(rep(1,N), mean_rdn[,1], pch = 16, xlim = c(0,5), ylim = c(0,1), 
     xlab = "threshold t", ylab = "proportion of times the snp is found", 
     main = "proba(finding snp in the single sample) if found in >=t of the replicate samples")
points(rep(2,N), mean_rdn[,2], pch = 16)
points(rep(3,N), mean_rdn[,3], pch = 16)
points(rep(4,N), mean_rdn[,4], pch = 16)
points(1:4, mean_dt, pch = 16, cex = 1.2, col = "red")

legend("topleft", legend = c("mean for data", "mean for randomized datasets"), col = c("red", "black"), pch = 18, bty = "n")

How is that possible, that if one mutation is found in more than 4 samples in the same replicate group, the chance of finding it in the dataset as a whole is as high?