The variants were called on the 2024 dataset only (8 times 12 samples
from the 2022 subclone dataset).
filters applied so far : SNPs
removed when found in PON + Friends + “crap” + modified allele frequency
vector + somatic point estimate script
reasoning for the modified allele frequency vector : the allele frequencies are sometimes >0.5. For somatic mutations, we do not want to keep mutations that are widespread (shared by all individuals). The Pando clone is triploid, which reduces our expectation for fixation of a mutation to 0.33. We thus transform the allele frequency file to not take into account any variants whose allele frequency > 0.7, and reduce to 0.5 ane allele frequency comprised between 0.5 and 0.7.
Filter for mean individual read depth>4 , empty variants and singletons.
setwd("/Users/rpineau3/Dropbox (GaTech)/BioSci-Ratcliff/Rozenn/4.PandoProject/17-analyses/0_replicate_dataset/data")
# upoad pntest data and depth info
pn_est <- read.table("pntest_somatic_filtered_ponfil_friendfil_replicates.txt", sep = " ", header = F ) # 4607 x 96
depth <- read.table("depth_filtered_ponfil_friendfil_replicates.txt", sep = "\t", header = T) # 4607 x 96
ids <- read.csv("ids_filtered_ponfil_friendfil_replicates.txt", sep = "\t", header = T) # 96 x 5
# Filter based on depth
par(family = "Times New Roman")
plot(colMeans(depth), pch = 16, xlab = "individual", ylab = "depth", ylim = c(0,150)) # Mean depth per sample
abline(h=4, col = "red")
keep <- which (colMeans(depth) >= 4) # 80
pnt_est_fil <- pn_est[,keep] # 797 x 80
ids_fil <- ids[as.numeric(keep), ] # 80 x 5
depth_fil <- depth[,keep] # 797 x 80
# -----------------------------------------------------------------------------
# BINARIZATION
# -----------------------------------------------------------------------------
pn_bin <- matrix(NaN, dim(pnt_est_fil)[1], dim(pnt_est_fil)[2])
threshold <- 0.5
for (x in 1:dim(pnt_est_fil)[1]) {
for (y in 1:dim(pnt_est_fil)[2]) {
if (pnt_est_fil[x,y]>threshold) {pn_bin[x,y] <- 1 } # if proba of being heteroZ > threshold, label with 1
else {pn_bin[x,y] <- 0 }
}
}
nInd <- 80 # number of individuals in the dataset
hets <- rowSums(pn_bin)/nInd
# length(which(hets == 0)) # 182
# length(which(rowSums(pn_bin)>1)) # 4045
# length(which(rowSums(pn_bin)==1)) # 380
# remove singletons, meaning present in 1/nInd individuals AND
# present in zero individuals (left after filtering out some individuals, maybe)
pn_bin_fil <- pn_bin[c(rowSums(pn_bin)>1),] # 4045 x 80
hets <- (rowSums(pn_bin_fil))/nInd
par(family="Times New Roman", cex.lab=1.2, cex.axis=1.2, cex.main=1, cex.sub=1)
hist(hets, xlab = "Heterozygosity", ylab = "counts", 100, main = "heterozygosity per SNP, after removing singletons",
col = "#2ca25f")
# keep track of indices of the SNPs that make it through the filters
winners <- 1:c(dim(pn_est)[1]) # 4607
winners_v1 <- winners[rowSums(pn_bin)>1] # 4045
There are a lot of mutations that are above 0.8 - do we want to keep them when we calculate the replicability of our sequencing (i.e. the probability of cathing a mutation if it is there, or do we focus on somatic mutations for these values?)
What is the probability that, if I detect a mutation in one sample within the replicate group, I detect it in all other samples within the group?
setwd("/Users/rpineau3/Dropbox (GaTech)/BioSci-Ratcliff/Rozenn/4.PandoProject/17-analyses/0_replicate_dataset/figures")
#pdf(file=paste("replication_within_rep_groups.pdf", sep="" ), bg = "transparent", width=8, height=6, family = "Times New Roman")
par(family="Times New Roman", cex.lab=1.2, cex.axis=1.2, cex.main=1, cex.sub=1)
layout(matrix(c(1,2,2,1,2,2, 3,4,4,3,4,4), nrow = 4, ncol = 3, byrow = TRUE)) # layout for figure
# How many mutations are found in every replicate versus not?
ids_fil$group
## [1] 1 1 1 1 1 1 1 1 2 2 2 3 3 3 3 3 3 3 3 4 4 4 4 4 4
## [26] 5 5 5 5 5 5 5 5 6 6 6 6 6 6 7 7 8 8 8 8 8 8 8 8 9
## [51] 9 9 9 9 9 9 9 10 10 10 10 10 10 10 10 11 11 11 11 11 11 11 11 12 12
## [76] 12 12 12 12 12
# dataset divided by replicate group (3 to 8 samples)
group <- c(1:12) # 3 replicates for group 2, 2 replicates for group 7
store_prop <- c()
# number of mutations found in every replicate, normalized by the mean number of mutations found in each replicate
for (rep in 1:length(group)) {
inds <- which( ids_fil$group == group[rep] )
ind_prop <- c(rowSums(pn_bin_fil[,inds])/length(inds)) # proportion of individuals with the mutation
mean_n_mut <- mean(colSums(pn_bin_fil[,inds])) # mean number of mutations per sample
prop <- c( length(which( ind_prop == 1 )) ) / mean_n_mut
store_prop <- c(store_prop, prop)
}
data_store_prop <- store_prop
plot(data_store_prop, pch = 16, ylim = c(0,1), xlab = "Group", ylab = "Prop. of SNP found in all samples within group")
abline(h=mean(store_prop), col = "#045a8d") # 0.72
# 2 - Compare to random
store_mean_prop <- c()
for (sim in 1:1000){
pn_bin_rdn <- apply(pn_bin_fil, 2, sample)
store_prop <- c()
# number of mutations found in every replicate, normalized by the mean number of mutations found in each replicate
for (rep in 1:length(group)) {
rand_inds <- sample(ids_fil$group) # re-shuffle individuals
inds <- which( rand_inds == group[rep] )
ind_prop <- c(rowSums(pn_bin_rdn[,inds])/length(inds)) # proportion of individuals with the mutation
mean_n_mut <- mean(colSums(pn_bin_fil[,inds])) # mean number of mutations per sample
#prop <- c(length(which( ind_prop == 0 )) + length(which( ind_prop == 1 )) ) / nSNP
prop <- c( length(which( ind_prop == 1 )) ) / mean_n_mut
store_prop <- c(store_prop, prop)
}
store_mean_prop[sim] <- mean(store_prop)
}
# Plot and compare
hist(store_mean_prop, 10, xlab = "Proportion of SNP found in all samples within group", col = "#a8ddb5", xlim = c(0,1),
main = "", border = "#a8ddb5")
box()
lines(rep(mean(data_store_prop),2), c(0,150), col = "#045a8d")
legend("topleft", legend = c("Expected by chance" , "Mean for data (0.72)"), pch = 15, col = c("#a8ddb5", "#045a8d"), bty = "n" )
###### Part 2: Allow one missing mutation by changing threshold to 0.8
store_prop <- c()
# number of mutations found in every replicate, normalized by the mean number of mutations found in each replicate
for (rep in 1:length(group)) {
inds <- which( ids_fil$group == group[rep] )
ind_prop <- c(rowSums(pn_bin_fil[,inds])/length(inds)) # proportion of individuals with the mutation
mean_n_mut <- mean(colSums(pn_bin_fil[,inds])) # mean number of mutations per sample
prop <- c( length(which( ind_prop >= 0.8 )) ) / mean_n_mut
store_prop <- c(store_prop, prop)
}
data_store_prop <- store_prop
plot(data_store_prop, pch = 16, ylim = c(0,1), xlab = "Group", ylab = "Prop. of SNP found in >80% of the samples within group")
abline(h=mean(store_prop), col = "#045a8d") # 0.83
# 2 - Compare to random
store_mean_prop <- c()
for (sim in 1:1000){
pn_bin_rdn <- apply(pn_bin_fil, 2, sample)
store_prop <- c()
# number of mutations found in every replicate, normalized by the mean number of mutations found in each replicate
for (rep in 1:length(group)) {
rand_inds <- sample(ids_fil$group) # re-shuffle individuals
inds <- which( rand_inds == group[rep] )
ind_prop <- c(rowSums(pn_bin_rdn[,inds])/length(inds)) # proportion of individuals with the mutation
mean_n_mut <- mean(colSums(pn_bin_fil[,inds])) # mean number of mutations per sample
#prop <- c(length(which( ind_prop == 0 )) + length(which( ind_prop == 1 )) ) / nSNP
prop <- c( length(which( ind_prop >= 0.8 )) ) / mean_n_mut
store_prop <- c(store_prop, prop)
}
store_mean_prop[sim] <- mean(store_prop, na.rm=T)
}
# Plot and compare
hist(store_mean_prop, 10, xlab = "Proportion of SNP found in >80% of the samples within group", col = "#a8ddb5", xlim = c(0,1), border = "#a8ddb5", main="")
box()
lines(rep(mean(data_store_prop),2), c(0,250), col = "#045a8d")
legend("topleft", legend = c("Expected by chance" , "Mean for data (0.89)"), pch = 15, col = c("#a8ddb5", "#045a8d"), bty = "n" )
#dev.off()
We checked that we could trust the data. Now I will select for the somatic mutations only.
nSNP <- 4045
store_sum_group <- matrix(NA,nSNP,length(group)) # 4045 x 12
### reduce the dataset from individual information to group-level information
for (rep in 1:length(group)) {
inds <- which( ids_fil$group == group[rep] ) # filter by group
ind_sum <- rowSums(pn_bin_fil[,inds]) # nb of individuals with the mutation
store_sum_group[,rep] <- ind_sum
}
# Keep the SNP if found in at least 2 samples, and at most 9 groups
### identify the SNPs found more than once per group
occurrence <- matrix(0, nSNP, length(group))
for (G in 1:length(group)) {
occurrence[which(store_sum_group[,G] >= 2),G] <- 1
}
# occurence is a matrix where there is a "1" if the SNP is found in more than 2 samples within the group
# to find "somatic" SNPs, i.e. SNPs found in at most 8 groups, we can keep the lines (SNPs) for which the sun is less than 9
som_snps_idx <- which(rowSums(occurrence) < 9 & rowSums(occurrence) > 0) # 536 out of 4045
# Check the histogram
hets <- (rowSums(pn_bin_fil[som_snps_idx,]))/nInd
par(family="Times New Roman", cex.lab=1.2, cex.axis=1.2, cex.main=1, cex.sub=1)
hist(hets, xlab = "Heterozygosity", ylab = "counts", 50, main = "heterozygosity per SNP for somatic set",
col = "#2ca25f")
# Make a new boolean vector to export the SNPs to work with
winners_v2 <- winners_v1[som_snps_idx] # 536 mutations
bool <- matrix(0, dim(pn_bin)[1], 1)
bool[winners_v2] <- 1
#write.table(bool, "/Users/rpineau3/Dropbox (GaTech)/BioSci-Ratcliff/Rozenn/4.PandoProject/17-analyses/0_replicate_dataset/data/bool_filter_replicate_vcf_536snps_today.txt", row.names = F)
# write.table(ids_fil$full_ID, "/Users/rpineau3/Dropbox (GaTech)/BioSci-Ratcliff/Rozenn/4.PandoProject/17-analyses/0_replicate_dataset/data/ids_80inds.txt", row.names = F)
#
#
check <- pn_bin[as.factor(as.numeric(bool)),]
rowSums(check)
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 80 80 1 1 1 1 1 1 1 1 1 1
## [25] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 80 1 1
## [49] 1 80 80 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [73] 1 1 1 1 1 1 1 1 80 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [97] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 80 1 80 80 1 1 1
## [121] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [145] 1 80 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [169] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [193] 1 1 1 1 1 1 1 1 1 1 1 1 80 1 1 1 1 1 1 1 1 80 1 1
## [217] 1 1 1 1 1 1 80 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [241] 1 1 80 80 80 1 1 1 80 1 1 1 1 1 1 1 1 1 80 80 80 1 1 1
## [265] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 80 1 1 1 1 1
## [289] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [313] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [337] 1 1 1 1 1 80 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [361] 1 1 1 1 1 1 80 80 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 80
## [385] 80 80 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 80 1 1 1 1 1
## [409] 1 1 1 1 1 1 1 1 80 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [433] 1 1 1 1 1 1 1 1 1 1 1 80 1 1 1 1 1 1 1 1 1 1 1 1
## [457] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [481] 1 80 80 1 1 1 1 1 1 1 1 1 1 1 80 80 1 1 1 80 80 1 1 1
## [505] 1 1 80 80 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [529] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [553] 1 1 1 1 1 1 1 1 1 80 80 1 80 1 1 1 1 1 80 1 1 1 1 1
## [577] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [601] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [625] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [649] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [673] 80 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [697] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 80 80 1 1 1 1
## [721] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [745] 1 1 80 1 80 80 80 80 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [769] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [793] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [817] 1 1 1 1 1 1 1 1 1 1 1 1 80 1 1 1 1 1 1 1 1 1 1 80
## [841] 1 1 1 80 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [865] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [889] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 80 1 1 1 1 1 1 1 1 1
## [913] 1 1 1 80 1 1 1 1 1 1 1 1 1 1 1 80 1 1 1 1 1 1 1 1
## [937] 1 1 1 1 1 1 1 1 80 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [961] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 80 80 1 1 1
## [985] 1 1 1 1 80 80 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [1009] 1 1 1 1 80 1 80 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [1033] 1 1 1 1 80 80 80 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [1057] 1 1 1 1 1 1 1 1 1 80 1 1 1 1 1 1 1 1 1 1 1 1 80 1
## [1081] 1 1 1 1 1 1 1 1 1 1 1 1 80 1 1 1 1 1 1 1 1 1 1 1
## [1105] 80 80 1 80 80 80 1 80 80 1 1 1 1 1 1 80 80 1 1 80 1 80 1 1
## [1129] 1 1 1 1 1 1 1 1 1 80 80 1 1 1 1 80 80 1 1 1 1 1 1 1
## [1153] 1 80 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [1177] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [1201] 1 1 1 1 1 1 80 80 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 80
## [1225] 1 1 1 1 1 1 1 1 1 1 1 1 1 80 1 1 1 1 1 1 1 1 1 1
## [1249] 1 1 80 1 1 80 80 1 80 1 1 1 1 1 1 1 1 1 1 1 1 1 1 80
## [1273] 1 1 1 1 1 1 1 1 1 80 1 1 80 1 1 1 1 1 1 1 1 1 1 1
## [1297] 1 80 1 80 1 80 80 80 1 1 1 1 1 1 1 1 1 80 80 1 1 80 1 1
## [1321] 80 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [1345] 1 1 1 1 1 1 1 1 1 1 1 1 1 80 1 1 1 1 1 1 1 80 1 1
## [1369] 1 1 1 1 1 1 80 80 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [1393] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 80 1 1 1 1 80 1 1 1
## [1417] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [1441] 1 1 80 1 1 80 80 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [1465] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 80 80 80 1 1 1 1
## [1489] 80 80 80 1 80 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 80
## [1513] 1 80 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [1537] 1 1 1 1 1 1 1 1 1 80 80 1 1 1 1 1 1 1 1 1 1 1 1 1
## [1561] 1 1 1 1 1 80 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 80 1 1
## [1585] 1 80 1 1 1 1 1 80 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 80
## [1609] 1 1 1 1 1 1 1 1 1 1 1 1 80 1 80 1 1 1 1 1 1 1 1 80
## [1633] 1 1 1 1 80 80 1 80 1 1 1 1 1 80 80 1 1 1 1 80 1 1 1 1
## [1657] 1 80 80 80 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [1681] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 80 80 1
## [1705] 1 1 80 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [1729] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 80 1 1 1 1 1 1
## [1753] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [1777] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 80 1 1 1 1 1
## [1801] 80 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [1825] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 80 80 80 1
## [1849] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [1873] 1 1 1 80 80 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [1897] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [1921] 1 1 1 1 1 1 80 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [1945] 1 1 80 80 1 1 1 1 1 1 1 1 80 80 1 1 1 1 1 1 1 1 1 1
## [1969] 1 1 1 80 1 1 1 1 1 1 1 1 1 1 1 1 1 1 80 1 1 1 1 1
## [1993] 1 1 1 1 1 80 80 1 1 1 1 1 1 1 1 80 1 1 1 1 1 1 1 1
## [2017] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [2041] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [2065] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [2089] 80 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [2113] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [2137] 1 1 1 1 80 80 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 80 1 1
## [2161] 1 1 1 1 1 1 80 1 1 1 1 1 1 1 1 1 80 1 1 1 1 1 1 1
## [2185] 1 1 1 1 1 1 1 80 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [2209] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [2233] 1 1 1 1 80 1 1 1 1 1 1 1 80 1 1 1 1 1 1 1 1 1 1 1
## [2257] 1 1 1 80 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [2281] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [2305] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 80 1 80 1
## [2329] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [2353] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [2377] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [2401] 1 1 1 1 80 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [2425] 1 1 1 1 1 1 1 80 80 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [2449] 1 1 1 1 1 80 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [2473] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 80 1 1 1 80 1 1 1
## [2497] 1 1 1 1 80 1 1 1 1 1 1 1 1 1 1 1 1 1 80 1 80 1 1 1
## [2521] 1 1 1 1 1 1 1 1 1 80 1 1 80 80 80 1 1 1 1 1 1 1 1 1
## [2545] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 80 1 1 1
## [2569] 1 1 1 1 1 1 80 1 80 80 1 1 1 1 1 1 1 1 1 1 1 1 80 1
## [2593] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 80 80 1 1 1
## [2617] 1 1 1 1 1 1 1 1 1 1 1 80 1 1 1 1 1 1 1 1 1 1 1 80
## [2641] 80 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [2665] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 80 80 1 80 1 1
## [2689] 80 80 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [2713] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 80 1 1 1 1 1 1 1 1
## [2737] 1 1 1 1 1 1 1 1 1 1 80 1 1 1 1 1 1 1 1 1 1 1 1 1
## [2761] 1 1 1 1 1 1 1 1 1 1 80 80 1 1 1 1 1 80 80 80 1 80 1 1
## [2785] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 80 80 80 1 1 1 1
## [2809] 1 1 1 80 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [2833] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 80 80 1 80 1
## [2857] 1 1 1 1 80 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [2881] 1 1 80 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [2905] 1 1 1 1 1 1 1 1 1 80 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [2929] 1 1 1 1 1 1 1 1 1 1 1 1 1 80 1 1 1 1 1 1 1 1 1 1
## [2953] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [2977] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [3001] 1 1 80 80 1 1 1 1 1 1 1 1 1 1 1 1 1 1 80 1 1 1 80 1
## [3025] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 80 80 1
## [3049] 1 1 1 1 1 1 1 1 1 1 1 1 1 80 1 1 1 1 1 1 1 1 1 1
## [3073] 1 80 80 80 1 1 1 1 80 80 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [3097] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 80 1
## [3121] 1 1 1 1 1 1 80 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 80
## [3145] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [3169] 80 1 1 1 1 1 80 1 1 1 1 1 1 1 1 1 80 1 1 1 1 1 1 1
## [3193] 1 1 1 80 1 1 1 1 1 80 1 1 1 1 1 1 1 1 1 1 1 1 80 1
## [3217] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 80
## [3241] 1 1 1 1 1 1 80 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [3265] 1 1 1 1 1 1 1 1 1 1 80 1 1 1 1 1 1 1 1 1 1 1 1 1
## [3289] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [3313] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [3337] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 80
## [3361] 1 1 80 1 1 1 1 1 1 1 1 1 1 1 1 1 1 80 1 1 1 1 1 1
## [3385] 1 1 1 1 1 1 1 1 1 1 80 1 1 1 1 80 1 1 1 1 80 80 1 1
## [3409] 1 1 1 1 1 80 1 1 1 1 1 1 1 1 1 1 80 1 1 1 1 80 80 1
## [3433] 1 1 1 1 80 1 1 1 1 1 1 1 1 1 1 80 1 80 1 1 1 1 1 1
## [3457] 1 1 80 1 1 1 80 80 1 1 1 1 1 1 1 1 1 1 1 1 1 80 1 80
## [3481] 80 80 1 1 1 1 1 1 1 80 80 1 80 80 1 80 1 80 80 80 1 80 1 1
## [3505] 1 1 1 80 80 1 80 80 1 1 1 80 1 1 1 1 1 80 1 1 1 1 80 80
## [3529] 80 80 80 80 80 80 80 1 1 80 80 1 1 1 1 1 1 1 80 1 80 80 1 80
## [3553] 80 1 80 1 1 1 1 80 80 1 80 1 1 80 1 1 1 1 1 1 80 1 1 1
## [3577] 1 1 1 1 1 1 1 1 1 1 1 80 1 80 80 80 1 1 80 80 1 1 1 1
## [3601] 1 1 1 1 1 80 1 80 1 80 1 1 80 80 80 80 80 80 1 80 80 80 80 80
## [3625] 1 1 1 80 80 1 1 80 1 1 1 1 1 1 80 1 1 1 80 1 1 1 1 1
## [3649] 1 80 1 80 80 1 1 80 80 1 80 1 80 80 80 80 80 80 80 1 80 80 80 1
## [3673] 1 80 80 1 1 1 1 1 1 1 1 1 1 1 1 1 1 80 1 80 80 80 80 80
## [3697] 80 1 1 1 1 80 80 1 1 1 1 1 80 1 1 80 80 80 80 1 1 1 1 1
## [3721] 1 1 1 1 1 1 1 1 80 1 80 1 80 1 1 1 1 80 80 80 1 80 80 80
## [3745] 80 80 80 1 80 1 80 80 1 1 1 80 1 1 80 1 1 1 1 1 80 80 1 1
## [3769] 1 1 1 80 1 1 1 1 1 80 1 1 1 1 1 80 1 1 80 1 1 80 1 1
## [3793] 1 1 80 1 80 1 1 1 1 80 80 80 1 1 1 80 80 1 80 80 80 80 1 1
## [3817] 1 80 1 1 1 1 80 1 80 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [3841] 1 1 1 80 1 1 1 1 1 1 1 80 80 80 1 80 80 1 80 80 80 1 1 80
## [3865] 1 1 1 1 1 1 1 1 1 80 80 80 1 1 1 1 1 1 80 80 80 80 1 1
## [3889] 1 1 80 1 80 1 1 1 1 1 80 1 80 1 80 1 80 80 80 1 80 80 1 1
## [3913] 1 1 1 1 1 1 1 80 1 1 1 1 1 1 1 1 1 80 80 80 1 1 1 1
## [3937] 80 80 80 1 1 1 80 80 80 80 1 80 80 80 80 1 1 1 1 1 80 1 80 80
## [3961] 1 1 80 1 1 1 1 80 1 80 80 1 1 80 1 80 1 1 1 80 1 1 1 80
## [3985] 1 80 1 1 1 1 80 1 1 1 1 1 1 1 1 1 1 1 1 80 1 80 1 1
## [4009] 80 1 80 80 1 1 1 80 1 1 80 1 1 1 1 1 1 1 1 1 80 80 1 1
## [4033] 1 1 1 1 1 1 1 1 1 1 80 1 1 1 1 1 80 80 80 1 1 1 1 1
## [4057] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [4081] 1 1 1 80 1 1 80 1 1 1 1 80 1 80 1 1 1 1 1 1 1 1 1 1
## [4105] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [4129] 1 1 1 1 1 1 1 1 1 80 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [4153] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 80 1 1 1
## [4177] 1 1 80 1 80 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [4201] 1 1 1 1 1 1 1 1 1 1 80 1 1 1 80 1 1 1 1 1 1 1 1 1
## [4225] 1 1 1 80 80 1 1 1 1 1 1 1 80 1 1 1 1 80 80 1 80 1 1 80
## [4249] 1 80 1 1 1 1 80 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 80 1
## [4273] 1 1 1 80 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [4297] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [4321] 1 1 1 1 1 80 1 1 1 1 1 1 1 1 1 1 1 1 80 1 80 1 1 1
## [4345] 1 1 1 80 1 1 1 1 1 1 1 1 1 1 1 1 1 1 80 1 1 1 1 1
## [4369] 1 1 1 1 1 1 1 1 1 1 1 80 1 1 1 80 1 1 1 1 80 80 80 1
## [4393] 1 1 1 1 1 1 1 80 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [4417] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [4441] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [4465] 1 1 1 1 1 1 1 1 80 80 1 1 80 80 1 1 1 1 1 1 1 1 1 1
## [4489] 1 80 1 1 1 1 1 1 80 80 80 1 1 1 80 80 80 1 1 1 1 80 80 1
## [4513] 1 80 80 1 1 1 1 1 1 1 1 1 1 1 80 80 1 1 1 1 1 80 80 80
## [4537] 1 1 1 1 1 1 1 1 80 1 1 1 1 1 1 1 1 80 80 1 1 1 1 1
## [4561] 80 1 1 1 1 80 1 80 1 1 1 1 1 1 1 1 80 1 80 80 1 1 1 80
## [4585] 1 1 1 80 1 80 1 1 1 80 1 1 1 1 1 1 80 80 80 80 1 1 1
colSums(check)
## [1] 536 536 536 536 536 536 536 536 536 536 536 536 536 536 536
## [16] 536 536 536 536 536 536 536 536 536 536 536 536 536 536 536
## [31] 536 536 536 536 536 536 536 536 536 536 536 536 536 536 536
## [46] 536 536 536 536 536 536 536 4607 536 536 536 536 536 536 536
## [61] 536 536 536 536 536 536 536 536 536 536 536 536 536 536 536
## [76] 536 536 536 536 536
check <- pn_bin[which(bool==1),]
rowSums(check)
## [1] 35 29 8 8 9 29 20 22 11 8 26 2 12 5 12 10 31 12 10 12 9 7 5 5 8
## [26] 9 10 27 29 12 23 8 8 8 3 4 7 10 21 30 28 13 22 11 9 14 20 25 6 10
## [51] 4 17 6 46 16 9 6 27 20 15 24 30 21 7 11 12 18 6 14 13 14 8 8 10 15
## [76] 17 11 23 24 15 9 10 8 8 45 12 5 25 19 27 6 11 18 3 3 9 20 10 13 8
## [101] 6 19 18 28 11 29 23 19 4 12 15 2 24 24 6 11 11 22 10 9 4 16 15 29 26
## [126] 26 30 6 19 6 2 38 29 8 30 22 6 5 15 15 4 3 19 23 7 7 4 27 31 29
## [151] 13 10 10 12 23 21 8 14 27 20 6 27 24 22 2 2 20 36 4 7 5 10 8 15 6
## [176] 6 44 34 27 5 6 32 25 27 7 6 4 5 47 4 5 5 3 25 28 35 3 4 7 8
## [201] 24 7 11 14 17 11 6 13 4 11 2 9 21 20 26 6 8 5 4 38 12 6 4 5 29
## [226] 17 12 12 5 2 3 3 27 27 25 15 17 6 25 12 36 9 19 2 17 22 5 6 20 12
## [251] 14 12 18 20 2 23 25 9 20 27 21 31 5 23 12 15 6 21 22 8 8 9 8 9 33
## [276] 9 7 9 25 9 25 6 5 5 5 20 19 13 2 19 6 34 7 6 22 10 27 7 11 19
## [301] 13 10 13 18 25 21 12 20 7 17 32 27 3 3 4 30 8 9 7 11 30 10 24 35 6
## [326] 8 6 25 6 19 37 15 6 7 12 32 17 26 16 16 23 4 6 6 6 6 4 9 16 15
## [351] 7 11 12 7 3 3 4 30 25 18 25 10 2 3 18 2 2 14 29 28 32 32 21 21 35
## [376] 6 29 29 6 18 8 12 4 10 8 17 15 7 4 2 7 4 4 7 6 14 14 11 7 13
## [401] 11 6 7 8 16 30 14 18 11 10 9 3 6 4 23 34 10 24 20 15 15 12 24 46 17
## [426] 31 24 10 12 16 21 17 33 22 18 11 8 8 17 4 4 10 5 11 2 15 17 9 3 27
## [451] 2 5 24 8 23 5 9 10 20 32 5 17 24 16 14 18 3 24 25 6 8 25 8 20 19
## [476] 34 6 13 13 4 14 7 14 12 33 16 8 5 10 24 26 15 2 2 7 19 20 20 10 3
## [501] 5 24 18 17 4 3 9 12 9 17 15 26 17 18 7 11 9 4 5 19 7 5 5 4 3
## [526] 24 13 12 3 8 6 6 14 13 13 14
colSums(check)
## [1] 43 46 95 49 87 41 41 40 89 95 123 71 126 97 13 126 121 71 116
## [20] 138 149 155 65 137 115 116 80 54 92 93 80 67 157 130 130 157 134 140
## [39] 175 138 95 30 34 35 70 52 41 92 32 77 62 127 163 100 117 98 90
## [58] 114 151 64 62 105 80 87 118 135 172 157 111 140 44 166 156 119 57 38
## [77] 41 12 96 119
par(family="Times New Roman", cex.lab=1.2, cex.axis=1.2, cex.main=1, cex.sub=1)
hist(c(rowSums(pn_bin[which(bool==1),]))/96, xlab = "Heterozygosity", ylab = "counts", 50, main = "heterozygosity per SNP for somatic set",
col = "#2ca25f")
How many mutations in common between this set of 536 somatic mutations and the single samples from the Pando dataset?
setwd("/Users/rpineau3/Dropbox (GaTech)/BioSci-Ratcliff/Rozenn/4.PandoProject/17-analyses/0_replicate_dataset/data")
# upoad pntest data and depth info
pn_est <- read.table("pntest_somatic_101SNPs_80inds.txt", sep = " ", header = F ) # 101 x 80
# -----------------------------------------------------------------------------
# BINARIZATION
# -----------------------------------------------------------------------------
pn_bin <- matrix(NaN, dim(pn_est)[1], dim(pn_est)[2])
threshold <- 0.5
for (x in 1:dim(pn_est)[1]) {
for (y in 1:dim(pn_est)[2]) {
if (pn_est[x,y]>threshold) {pn_bin[x,y] <- 1 } # if proba of being heteroZ > threshold, label with 1
else {pn_bin[x,y] <- 0 }
}
}
nInd <- 80
hets <- (rowSums(pn_bin))/nInd
setwd("/Users/rpineau3/Dropbox (GaTech)/BioSci-Ratcliff/Rozenn/4.PandoProject/17-analyses/0_replicate_dataset/figures/")
pdf(file=paste("hets_per_snp_replicate_dataset.pdf", sep="" ), bg = "transparent", width=5, height=3, family = "Times New Roman")
par(family="Times New Roman", cex.lab=1.2, cex.axis=1.2, cex.main=1, cex.sub=1)
hist(hets, xlab = "Heterozygosity per SNP for somatic set", ylab = "counts", 50, main = "", sub = "Replicate dataset, 80 individuals and 101 SNPs",
col = "#2ca25f")
dev.off()
## quartz_off_screen
## 2
# If the mutation is found in 2 replicate samples, in how many samples is it found?
t <- 9
p <- which(rowSums(occurrence) >= t ) # found in at least 2 groups, 3725 groups
mean(rowSums(occurrence[p,])) # 11.53
## [1] 11.72422
# compare to random
N <- 100
store_means <- matrix(NA, 1, N)
for (n in 1:N) {
rdn <- matrix(NA,dim(occurrence)[1],dim(occurrence)[2])
# shuffle dataset here
for (i in 1:dim(occurrence)[2]) {
rdn[,i] <- occurrence[sample(1:dim(occurrence)[1]),i]
}
p <- which( rowSums(rdn) >= t ) # found in at least 2 groups, 3725 groups
print(length(p))
store_means[n] <- mean(rowSums(rdn[p,]))
}
## [1] 3656
## [1] 3651
## [1] 3667
## [1] 3642
## [1] 3663
## [1] 3638
## [1] 3670
## [1] 3679
## [1] 3652
## [1] 3683
## [1] 3656
## [1] 3662
## [1] 3618
## [1] 3670
## [1] 3679
## [1] 3649
## [1] 3672
## [1] 3633
## [1] 3655
## [1] 3640
## [1] 3667
## [1] 3661
## [1] 3652
## [1] 3636
## [1] 3660
## [1] 3654
## [1] 3660
## [1] 3656
## [1] 3609
## [1] 3629
## [1] 3659
## [1] 3658
## [1] 3646
## [1] 3643
## [1] 3658
## [1] 3651
## [1] 3640
## [1] 3647
## [1] 3680
## [1] 3667
## [1] 3658
## [1] 3650
## [1] 3664
## [1] 3651
## [1] 3647
## [1] 3657
## [1] 3654
## [1] 3651
## [1] 3659
## [1] 3655
## [1] 3661
## [1] 3664
## [1] 3653
## [1] 3667
## [1] 3643
## [1] 3670
## [1] 3670
## [1] 3652
## [1] 3645
## [1] 3647
## [1] 3659
## [1] 3661
## [1] 3664
## [1] 3619
## [1] 3675
## [1] 3653
## [1] 3650
## [1] 3679
## [1] 3614
## [1] 3676
## [1] 3645
## [1] 3664
## [1] 3651
## [1] 3675
## [1] 3644
## [1] 3676
## [1] 3649
## [1] 3673
## [1] 3668
## [1] 3679
## [1] 3648
## [1] 3635
## [1] 3651
## [1] 3670
## [1] 3648
## [1] 3651
## [1] 3676
## [1] 3677
## [1] 3657
## [1] 3675
## [1] 3648
## [1] 3641
## [1] 3666
## [1] 3647
## [1] 3672
## [1] 3648
## [1] 3649
## [1] 3660
## [1] 3668
## [1] 3638
mean(store_means) # 10.16
## [1] 10.42017
# Plot and compare
hist(store_means/12, 10, xlab = "Number", col = "#a8ddb5", border = "#a8ddb5", main="")
lines(rep(mean(rowSums(occurrence[p,]))/12, 2), c(0,25), col = "#045a8d")
legend("topright", legend = c("Expected by chance" , "Mean for data (0.85)"), pch = 15, col = c("#a8ddb5", "#045a8d"), bty = "n" )
par(family = "Times New Roman")
p1 <- hist(rowSums(rdn)/12, 7, plot = F)
p2 <- hist(rowSums(occurrence[p,])/12, plot = F)
plot(p2, col=rgb(0,0,1,1/4), main = "Prop. of inds with the snp when found in 2 inds", xlab = "Proportion" )
plot(p1, col=rgb(168/255,221/255,181/255,1/4), add=T)
legend("topleft", legend= c("data", "randomized dataset"), pt.bg = c(rgb(0,0,1,1/4), rgb(168/255,221/255,181/255,1/4)), bty = "n",
pch = 22)