Replicability of the replicate samples between two different sequencing batches

Two datasets here : 2024 (replicates, 12x8) and the same 12 samples than the replicate dataset from the subclone (with 2018 and 2022 called together). Goal: I want to know how many SNPs called in this subclone + Pando dataset were found in the replicate dataset, in the same set of samples. This will tell us how many mutations we might be missing when there is only one replicate per sample.

The set of somatic mutations was chosen because found in at least 2 samples per group, and at most 8 groups total (out of 11).

rm(list= ls())

pnt_est <- read.table("/Users/rpineau3/Dropbox (GaTech)/BioSci-Ratcliff/Rozenn/4.PandoProject/11-field-lab-work/5-GBS/4_troubleshooting/may6/new_comparison_v2/pntest_som_reps_singles_37SNPs.txt", sep = " ") # 37 x 108

ids <- read.table("/Users/rpineau3/Dropbox (GaTech)/BioSci-Ratcliff/Rozenn/4.PandoProject/11-field-lab-work/5-GBS/4_troubleshooting/may6/new_comparison_v2/merged_ids.txt", sep = "\t", header = T) # 91 x 3

ids_full <- read.table("/Users/rpineau3/Dropbox (GaTech)/BioSci-Ratcliff/Rozenn/4.PandoProject/11-field-lab-work/5-GBS/4_troubleshooting/may6/new_comparison_v2/ids_13SNPs.txt", sep = "\t", header = F) # 108 x 3

# filter based on samples
keep <- matrix(NA,length(ids$full_ID),1)

for (i in 1:length(ids$full_ID)) {
  
 keep[i] <- which(ids$full_ID[i] == ids_full$V1)
  
}


pnt_est_fil <- pnt_est[,keep] # 37 x 91

I will do not any filtering by depth for now.

One thing that I am interested in is: if the mutation is found in the replicated sample (2024 dataset), is it also found in the original single sequenced one (2022 dataset)?

# -----------------------------------------------------------------------------
# BINARIZATION
# -----------------------------------------------------------------------------
pn_bin <- matrix(NaN, dim(pnt_est_fil)[1], dim(pnt_est_fil)[2])
threshold <- 0.9

for (x in 1:dim(pnt_est_fil)[1]) {
  
  for (y in 1:dim(pnt_est_fil)[2]) {
    
    if (pnt_est_fil[x,y]>threshold) {pn_bin[x,y] <- 1 } # if proba of being heteroZ > threshold, label with 1
    
    else {pn_bin[x,y] <- 0 }
    
  }
}


nInd <- 91 # number of individuals in the dataset

hets <- rowSums(pn_bin)/nInd
#length(which(hets == 0)) # 0
#length(which(rowSums(pn_bin)>1)) # 43

nSNP <-  37 # number of SNPs in the dataset

par(family="Times New Roman", cex.lab=1.2, cex.axis=1.2, cex.main=1, cex.sub=1)
hist(hets, xlab = "Heterozygosity", ylab = "counts", 100, main = "Heterozygosity per SNP in replicated samples", sub = "(2022 and 2024 datasets, 37 SNPs in common)", 
     col = "#2ca25f")

levels(as.factor(ids$group_ID)) 
##  [1] "1"  "2"  "3"  "4"  "5"  "6"  "7"  "8"  "9"  "10" "11" "12"
group <- c(1:7,9:12)  # missing the replicate samples for group #8

# in how many samples of the replicate dataset do we find the same mutation?
rep_idx <- ids$group=="replicates"
pn_bin_rep <- pn_bin[,rep_idx] # 37 x 79
ids_rep <- ids[rep_idx,] # 79 x 3

pn_bin_sub <- pn_bin[,!rep_idx] # 37 x 12
ids_sub <- ids[!rep_idx,] # 12 x 3

store_prop <- c()

# number of mutations found in every replicate, normalized by the mean number of mutations found in each replicate
for (rep in 1:length(group)) {
  
  inds <- which( ids_rep$group_ID == group[rep] )

  ind_prop <- c(rowSums(pn_bin_rep[,inds])/length(inds)) # proportion of individuals with the mutation

  mean_n_mut <- mean(colSums(pn_bin_rep[,inds])) # mean number of mutations per sample
  
  #prop <- c(length(which( ind_prop == 0 )) + length(which( ind_prop == 1 )) ) / nSNP
  prop <- c( length(which( ind_prop == 1 )) ) / mean_n_mut
  store_prop <- c(store_prop, prop)
  
}

data_store_prop <- store_prop
par(family = "Times New Roman")
plot(data_store_prop, pch = 16, ylim = c(0,1), xlab = "replicate", ylab = "Prop. of SNP in common per set of reps")
abline(h=mean(store_prop), col = "red") # 0.66

# choose threshold
t <- 2

store_replicability <- matrix(NA, 1, 11)

for (rep in 1:length(group)) {
  
  inds <- which( ids_rep$group_ID == group[rep] ) # within group individuals for the replicate dataset
  
  snps <- which(rowSums(pn_bin_rep[,inds]) >= t ) # which SNPS are present in that group? 
  
  store_replicability[rep] <- sum(pn_bin_sub[snps,group[rep]])/length(snps) # proportion of snps found in replicate also found in single samples
  
}

mean_dt <- mean(store_replicability, na.rm=T) # 0.4


# Compare to random
N <- 1000
store_replicability_rdn <- matrix(NA, 1, N)

for (n in 1:N) {
  
  store_replicability <- matrix(NA, 1, 11)
  
  for (rep in 1:length(group)) {
    
    rdn_inds <- sample(ids_rep$group_ID) # randomly sample through the individuals
    
    inds <- which( rdn_inds == group[rep] ) # within group individuals for the repicate dataset
    
    snps <- which(rowSums(pn_bin_rep[,inds]) >= t ) # which SNPS are present in that group?
    
    store_replicability[rep] <- sum(pn_bin_sub[snps,group[rep]])/length(snps)# proportion of snps found in replicate also found in single samples
  
    
  }
  
  store_replicability_rdn[n] <- mean(store_replicability, na.r=TRUE) 
  
    
}

mean(store_replicability_rdn, na.rm = T) # 0.24
## [1] 0.242636
# Maybe most mutations are found in all reps?

# Plot 
par(family = "Times New Roman")
hist(store_replicability_rdn, xlab = "Prop. of SNPs found in the single sample, when found in >=2 samples in replicate dataset", 100, col = "blue", main = "")
abline(v=mean(mean_dt), col = "red", lwd = 2) 
legend("topleft", legend = c("mean for data", "mean for randomized dataset"), col = c("red", "blue"), pch = 18, bty = "n")

What does this mean?
On average, when a mutation is found in at least 2 samples in one group of replicates, it is found 0.42 times on average in the single sequenced sample, which is significantly different from the null expectation.

We chose a threshold of 2. How does this threshold affect this result?

# choose threshold
T <- 4
store_replicability <- matrix(NA, T, length(group))

for (t in 1:T) {

  for (rep in 1:length(group)) {
    
    inds <- which( ids_rep$group_ID == group[rep] ) # within group individuals for the replicate dataset
    
    snps <- which(rowSums(pn_bin_rep[,inds]) >= t ) # which SNPS are present in that group? 
    
    store_replicability[t,rep] <- sum(pn_bin_sub[snps,group[rep]])/length(snps) # proportion of snps found in replicate also found in single samples
    
  }
  

}

mean_dt <- rowMeans(store_replicability, na.rm=T) # 0.4


# Compare to random
N <- 1000
mean_rdn <- matrix(NA, N, T)


  
for (n in 1:N) {
  
  store_replicability <- matrix(NA, T, 11)
  
  for (t in 1:T) {
    
    for (rep in 1:length(group)) {
      
      rdn_inds <- sample(ids_rep$group_ID) # randomly sample through the individuals
      
      inds <- which( rdn_inds == group[rep] ) # within group individuals for the repicate dataset
      
      snps <- which(rowSums(pn_bin_rep[,inds]) >= t ) # which SNPS are present in that group?
      
      store_replicability[t, rep] <- sum(pn_bin_sub[snps,group[rep]])/length(snps)# proportion of snps found in replicate also found in single samples
    
    }
    
      
  }
  
  mean_rdn[n,] <- rowMeans(store_replicability, na.rm=T) # 0.4

}

# Plot 
par(family = "Times New Roman")

plot(rep(1,N), mean_rdn[,1], pch = 16, xlim = c(0,5), ylim = c(0,1), 
     xlab = "threshold t", ylab = "proportion of times the snp is found", 
     main = "proba(finding snp in the single sample) if found in >=t of the replicate samples")
points(rep(2,N), mean_rdn[,2], pch = 16)
points(rep(3,N), mean_rdn[,3], pch = 16)
points(rep(4,N), mean_rdn[,4], pch = 16)
points(1:4, mean_dt, pch = 16, cex = 2, col = "red")

legend("topleft", legend = c("mean for data", "mean for randomized dataset"), col = c("red", "black"), pch = 18, bty = "n")