Two datasets here : 2024 (replicates, 12x8) and the same 12 samples
than the replicate dataset from the subclone (with 2018 and 2022 called
together). Goal: I want to know how many SNPs called in this subclone +
Pando dataset were found in the replicate dataset, in the same set of
samples. This will tell us how many mutations we might be missing when
there is only one replicate per sample.
The set of somatic mutations was chosen because found in at least 2
samples per group, and at most 8 groups total (out of 11).
rm(list= ls())
setwd("/Users/rpineau3/Dropbox (GaTech)/BioSci-Ratcliff/Rozenn/4.PandoProject/17-analyses/1_comparison_reps_singles/data/")
pnt_est <- read.table("pntest_somatic_35snps_92inds.txt", sep = " ") # 165 x 92
ids <- read.table("ids_165snps_92inds.txt", sep = "\t", header = T) # 92 x 6
# -----------------------------------------------------------------------------
# BINARIZATION
# -----------------------------------------------------------------------------
pn_bin <- matrix(NaN, dim(pnt_est)[1], dim(pnt_est)[2])
threshold <- 0.5
for (x in 1:dim(pnt_est)[1]) {
for (y in 1:dim(pnt_est)[2]) {
if (pnt_est[x,y]>threshold) {pn_bin[x,y] <- 1 } # if proba of being heteroZ > threshold, label with 1
else {pn_bin[x,y] <- 0 }
}
}
nInd <- 92 # number of individuals in the dataset
hets <- rowSums(pn_bin)/nInd
#length(which(hets == 0)) # 5
#length(which(rowSums(pn_bin)>1)) # 191
nSNP <- 35 # number of SNPs in the dataset
par(family="Times New Roman", cex.lab=1.2, cex.axis=1.2, cex.main=1, cex.sub=1)
hist(hets, xlab = "Heterozygosity", ylab = "counts", 100, main = "Heterozygosity per SNP in replicated samples", sub = "(2022 and 2024 datasets, 35 SNPs in common)",
col = "#2ca25f")
# in how many samples of the replicate dataset do we find the same mutation?
rep_idx <- ids$dataset=="replicate"
pn_bin_rep <- pn_bin[,rep_idx] # 165 x 80
ids_rep <- ids[rep_idx,] # 80 x 6
pn_bin_sub <- pn_bin[,!rep_idx] # 165 x 12
ids_sub <- ids[!rep_idx,] # 12 x 6
store_prop <- c()
group <- c(1:6, 8:12)
# number of mutations found in every replicate, normalized by the mean number of mutations found in each replicate
for (rep in 1:length(group)) {
inds <- which( ids_rep$group == group[rep] )
ind_prop <- c(rowSums(pn_bin_rep[,inds])/length(inds)) # proportion of individuals with the mutation
mean_n_mut <- mean(colSums(pn_bin_rep[,inds])) # mean number of mutations per sample
prop <- c( length(which( ind_prop >= 0.8 )) ) / mean_n_mut
store_prop <- c(store_prop, prop)
}
data_store_prop <- store_prop
par(family = "Times New Roman")
plot(data_store_prop, pch = 16, ylim = c(0,1), xlab = "replicate", ylab = "Prop. of SNP in common per set of reps")
abline(h=mean(store_prop), col = "red") # 0.61
The value for group #2 is closer to 1 because most mutations are found in all samples in this group, maybe mostly due to the group size (3 samples left in this group after filtering).
# choose threshold
t <- 2
store_replicability <- matrix(NA, 1, 11)
for (rep in 1:length(group)) {
inds <- which( ids_rep$group == group[rep] ) # within group individuals for the replicate dataset
snps <- which(rowSums(pn_bin_rep[,inds]) >= t ) # which SNPS are present in that group?
store_replicability[rep] <- sum(pn_bin_sub[snps,group[rep]])/length(snps) # proportion of snps found in replicate also found in single samples
}
mean_dt <- mean(store_replicability, na.rm=T) # 0.48
# Compare to random
N <- 1000
store_replicability_rdn <- matrix(NA, 1, N)
for (n in 1:N) {
store_replicability <- matrix(NA, 1, 11)
for (rep in 1:length(group)) {
rdn_inds <- sample(ids_rep$group) # randomly sample through the individuals
inds <- which( rdn_inds == group[rep] ) # within group individuals for the replicate dataset
snps <- which(rowSums(pn_bin_rep[,inds]) >= t ) # which SNPS are present in that group?
store_replicability[rep] <- sum(pn_bin_sub[snps,group[rep]])/length(snps)# proportion of snps found in replicate also found in single samples
}
store_replicability_rdn[n] <- mean(store_replicability, na.r=TRUE)
}
mean(store_replicability_rdn, na.rm = T) # 0.24
## [1] 0.3954646
# Maybe most mutations are found in all reps?
# Plot
par(family = "Times New Roman")
hist(store_replicability_rdn, xlab = "Prop. of SNPs found in the single sample, when found in >=2 samples in replicate dataset", 100, col = "blue", main = "")
abline(v=mean(mean_dt), col = "red", lwd = 2)
legend("topleft", legend = c("mean for data", "mean for randomized dataset"), col = c("red", "blue"), pch = 18, bty = "n")
We chose a threshold of 2. How does this threshold affect this result?
# choose threshold
T <- 4
store_replicability <- matrix(NA, T, length(group))
for (t in 1:T) {
for (rep in 1:length(group)) {
inds <- which( ids_rep$group == group[rep] ) # within group individuals for the replicate dataset
snps <- which(rowSums(pn_bin_rep[,inds]) >= t ) # which SNPS are present in that group?
store_replicability[t,rep] <- sum(pn_bin_sub[snps,group[rep]])/length(snps) # proportion of snps found in replicate also found in single samples
}
}
mean_dt <- rowMeans(store_replicability, na.rm=T) # 0.4
# Compare to random
N <- 1000
mean_rdn <- matrix(NA, N, T)
for (n in 1:N) {
store_replicability <- matrix(NA, T, 11)
for (t in 1:T) {
for (rep in 1:length(group)) {
rdn_inds <- sample(ids_rep$group) # randomly sample through the individuals
inds <- which( rdn_inds == group[rep] ) # within group individuals for the repicate dataset
snps <- which(rowSums(pn_bin_rep[,inds]) >= t ) # which SNPS are present in that group?
store_replicability[t, rep] <- sum(pn_bin_sub[snps,group[rep]])/length(snps) # proportion of snps found in replicate also found in single samples
}
}
mean_rdn[n,] <- rowMeans(store_replicability, na.rm=T) # 0.4
}
# Plot
par(family = "Times New Roman")
plot(rep(1,N), mean_rdn[,1], pch = 16, xlim = c(0,5), ylim = c(0,1),
xlab = "threshold t", ylab = "proportion of times the snp is found",
main = "proba(finding snp in the single sample) if found in >=t of the replicate samples")
points(rep(2,N), mean_rdn[,2], pch = 16)
points(rep(3,N), mean_rdn[,3], pch = 16)
points(rep(4,N), mean_rdn[,4], pch = 16)
points(1:4, mean_dt, pch = 16, cex = 1.2, col = "red")
legend("topleft", legend = c("mean for data", "mean for randomized datasets"), col = c("red", "black"), pch = 18, bty = "n")
How is that possible, that if one mutation is found in more than 4 samples in the same replicate group, the chance of finding it in the dataset as a whole is as high?