Dataset details

The variants were called on the 2024 dataset only (8 times 12 samples from the 2022 subclone dataset).
filters applied so far : SNPs removed when found in PON + Friends + “crap” + modified allele frequency vector + somatic point estimate script

reasoning for the modified allele frequency vector : the allele frequencies are sometimes >0.5. For somatic mutations, we do not want to keep mutations that are widespread (shared by all individuals). The Pando clone is triploid, which reduces our expectation for fixation of a mutation to 0.33. We thus transform the allele frequency file to not take into account any variants whose allele frequency > 0.7, and reduce to 0.5 ane allele frequency comprised between 0.5 and 0.7.

—————————————————————————–

Step 1: point estimates with modified allele frequencies

—————————————————————————–

Filter for mean individual read depth>4 , empty variants and singletons.

setwd("/Users/rpineau3/Dropbox (GaTech)/BioSci-Ratcliff/Rozenn/4.PandoProject/17-analyses/0_replicate_dataset/data")
# upoad pntest data and depth info
pn_est <- read.table("pntest_somatic_filtered_ponfil_friendfil_replicates.txt", sep = " ", header = F ) # 4607 x 96
depth <- read.table("depth_filtered_ponfil_friendfil_replicates.txt", sep = "\t", header = T) # 4607 x 96
ids <- read.csv("ids_filtered_ponfil_friendfil_replicates.txt", sep = "\t", header = T) # 96 x 5


# Filter based on depth
par(family = "Times New Roman")
plot(colMeans(depth), pch = 16, xlab = "individual", ylab = "depth", ylim = c(0,150)) # Mean depth per sample
abline(h=4, col = "red")

keep <- which (colMeans(depth) >= 4) # 80

pnt_est_fil <- pn_est[,keep] # 797 x 80
ids_fil <- ids[as.numeric(keep), ] # 80 x 5
depth_fil <- depth[,keep] # 797 x 80


# -----------------------------------------------------------------------------
# BINARIZATION
# -----------------------------------------------------------------------------
pn_bin <- matrix(NaN, dim(pnt_est_fil)[1], dim(pnt_est_fil)[2])
threshold <- 0.5

for (x in 1:dim(pnt_est_fil)[1]) {
  
  for (y in 1:dim(pnt_est_fil)[2]) {
    
    if (pnt_est_fil[x,y]>threshold) {pn_bin[x,y] <- 1 } # if proba of being heteroZ > threshold, label with 1
    
    else {pn_bin[x,y] <- 0 }
    
  }
}


nInd <- 80 # number of individuals in the dataset
hets <- rowSums(pn_bin)/nInd
# length(which(hets == 0)) # 182
# length(which(rowSums(pn_bin)>1)) # 4045
# length(which(rowSums(pn_bin)==1)) # 380

# remove singletons, meaning present in 1/nInd individuals AND
# present in zero individuals (left after filtering out some individuals, maybe)

pn_bin_fil <- pn_bin[c(rowSums(pn_bin)>1),] # 4045 x  80
hets <- (rowSums(pn_bin_fil))/nInd

par(family="Times New Roman", cex.lab=1.2, cex.axis=1.2, cex.main=1, cex.sub=1)
hist(hets, xlab = "Heterozygosity", ylab = "counts", 100, main = "heterozygosity per SNP, after removing singletons", 
     col = "#2ca25f")

# keep track of indices of the SNPs that make it through the filters
winners <- 1:c(dim(pn_est)[1]) # 4607
winners_v1 <- winners[rowSums(pn_bin)>1] # 4045

There are a lot of mutations that are above 0.8 - do we want to keep them when we calculate the replicability of our sequencing (i.e. the probability of cathing a mutation if it is there, or do we focus on somatic mutations for these values?)

—————————————————————————–

Step 2. Assess replication power (quality of our re-sequencing experiment)

—————————————————————————–

What is the probability that, if I detect a mutation in one sample within the replicate group, I detect it in all other samples within the group?

setwd("/Users/rpineau3/Dropbox (GaTech)/BioSci-Ratcliff/Rozenn/4.PandoProject/17-analyses/0_replicate_dataset/figures")

#pdf(file=paste("replication_within_rep_groups.pdf", sep="" ), bg = "transparent", width=8, height=6, family = "Times New Roman")

par(family="Times New Roman", cex.lab=1.2, cex.axis=1.2, cex.main=1, cex.sub=1)
layout(matrix(c(1,2,2,1,2,2, 3,4,4,3,4,4), nrow = 4, ncol = 3, byrow = TRUE)) # layout for figure


# How many mutations are found in every replicate versus not?
ids_fil$group

##  [1]  1  1  1  1  1  1  1  1  2  2  2  3  3  3  3  3  3  3  3  4  4  4  4  4  4
## [26]  5  5  5  5  5  5  5  5  6  6  6  6  6  6  7  7  8  8  8  8  8  8  8  8  9
## [51]  9  9  9  9  9  9  9 10 10 10 10 10 10 10 10 11 11 11 11 11 11 11 11 12 12
## [76] 12 12 12 12 12

# dataset divided by replicate group (3 to 8 samples)
group <- c(1:12) # 3 replicates for group 2, 2 replicates for group 7 
store_prop <- c()

# number of mutations found in every replicate, normalized by the mean number of mutations found in each replicate
for (rep in 1:length(group)) {
  
  inds <- which( ids_fil$group == group[rep] )

  ind_prop <- c(rowSums(pn_bin_fil[,inds])/length(inds)) # proportion of individuals with the mutation

  mean_n_mut <- mean(colSums(pn_bin_fil[,inds])) # mean number of mutations per sample

  prop <- c( length(which( ind_prop == 1 )) ) / mean_n_mut
  
  store_prop <- c(store_prop, prop)
  
}

data_store_prop <- store_prop


plot(data_store_prop, pch = 16, ylim = c(0,1), xlab = "Group", ylab = "Prop. of SNP found in all samples within group")
abline(h=mean(store_prop), col = "#045a8d") # 0.72

# 2 - Compare to random
store_mean_prop <- c()

for (sim in 1:1000){
  

  pn_bin_rdn <- apply(pn_bin_fil, 2, sample)
  store_prop <- c()
  
  # number of mutations found in every replicate, normalized by the mean number of mutations found in each replicate
  for (rep in 1:length(group)) {
    
    rand_inds <- sample(ids_fil$group) # re-shuffle individuals  
    
    inds <- which( rand_inds == group[rep] )
    
    ind_prop <- c(rowSums(pn_bin_rdn[,inds])/length(inds)) # proportion of individuals with the mutation
    mean_n_mut <- mean(colSums(pn_bin_fil[,inds])) # mean number of mutations per sample
    #prop <- c(length(which( ind_prop == 0 )) + length(which( ind_prop == 1 )) ) / nSNP
    prop <- c( length(which( ind_prop == 1 )) ) / mean_n_mut
    store_prop <- c(store_prop, prop)
    
    }
  
  store_mean_prop[sim] <- mean(store_prop)
  
}

# Plot and compare
hist(store_mean_prop, 10, xlab = "Proportion of SNP found in all samples within group", col = "#a8ddb5", xlim = c(0,1),
     main = "", border = "#a8ddb5")
box()
lines(rep(mean(data_store_prop),2), c(0,150),  col = "#045a8d")
legend("topleft", legend = c("Expected by chance" , "Mean for data (0.72)"), pch = 15, col = c("#a8ddb5", "#045a8d"), bty = "n" )


###### Part 2: Allow one missing mutation by changing threshold to 0.8

store_prop <- c()

# number of mutations found in every replicate, normalized by the mean number of mutations found in each replicate
for (rep in 1:length(group)) {
  
  inds <- which( ids_fil$group == group[rep] )

  ind_prop <- c(rowSums(pn_bin_fil[,inds])/length(inds)) # proportion of individuals with the mutation

  mean_n_mut <- mean(colSums(pn_bin_fil[,inds])) # mean number of mutations per sample
  
  prop <- c( length(which( ind_prop >= 0.8 )) ) / mean_n_mut
  store_prop <- c(store_prop, prop)
  
}

data_store_prop <- store_prop
plot(data_store_prop, pch = 16, ylim = c(0,1), xlab = "Group", ylab = "Prop. of SNP found in >80% of the samples within group")
abline(h=mean(store_prop), col = "#045a8d") # 0.83

# 2 - Compare to random
store_mean_prop <- c()

for (sim in 1:1000){
  

  pn_bin_rdn <- apply(pn_bin_fil, 2, sample)
  store_prop <- c()
  
  # number of mutations found in every replicate, normalized by the mean number of mutations found in each replicate
  for (rep in 1:length(group)) {
    
    rand_inds <- sample(ids_fil$group) # re-shuffle individuals  
    
    inds <- which( rand_inds == group[rep] )
    
    ind_prop <- c(rowSums(pn_bin_rdn[,inds])/length(inds)) # proportion of individuals with the mutation
    mean_n_mut <- mean(colSums(pn_bin_fil[,inds])) # mean number of mutations per sample
    #prop <- c(length(which( ind_prop == 0 )) + length(which( ind_prop == 1 )) ) / nSNP
    prop <- c( length(which( ind_prop >= 0.8 )) ) / mean_n_mut
    store_prop <- c(store_prop, prop)
    
    }
  
  store_mean_prop[sim] <- mean(store_prop, na.rm=T)
  
}

# Plot and compare
hist(store_mean_prop, 10, xlab = "Proportion of SNP found in >80% of the samples within group", col = "#a8ddb5", xlim = c(0,1), border = "#a8ddb5", main="")
box()
lines(rep(mean(data_store_prop),2), c(0,250),  col = "#045a8d")
legend("topleft", legend = c("Expected by chance" , "Mean for data (0.89)"), pch = 15, col = c("#a8ddb5", "#045a8d"), bty = "n" )

#dev.off()

—————————————————————————–

Step 3: Filtering for somatic mutations

—————————————————————————–

We checked that we could trust the data. Now I will select for the somatic mutations only.

nSNP <- 4045

store_sum_group <- matrix(NA,nSNP,length(group)) # 4045 x 12

### reduce the dataset from individual information to group-level information

for (rep in 1:length(group)) {
  
  inds <- which( ids_fil$group == group[rep] ) # filter by group

  ind_sum <- rowSums(pn_bin_fil[,inds])  # nb of individuals with the mutation

  store_sum_group[,rep] <- ind_sum
  
}

# Keep the SNP if found in at least 2 samples, and at most 9 groups 

### identify the SNPs found more than once per group
occurrence <- matrix(0, nSNP, length(group)) 

for (G in 1:length(group)) {
    
      occurrence[which(store_sum_group[,G] >= 2),G] <- 1
}


# occurence is a matrix where there is a "1"  if the SNP is found in more than 2 samples within the group

# to find "somatic" SNPs, i.e. SNPs found in at most 8 groups, we can keep the lines (SNPs) for which the sun is less than 9

som_snps_idx <- which(rowSums(occurrence) < 9 & rowSums(occurrence) > 0) # 536 out of 4045

# Check the histogram
hets <- (rowSums(pn_bin_fil[som_snps_idx,]))/nInd

par(family="Times New Roman", cex.lab=1.2, cex.axis=1.2, cex.main=1, cex.sub=1)
hist(hets, xlab = "Heterozygosity", ylab = "counts", 50, main = "heterozygosity per SNP for somatic set", 
     col = "#2ca25f")

# Make a new boolean vector to export the SNPs to work with
winners_v2 <- winners_v1[som_snps_idx] # 536 mutations
bool <- matrix(0, dim(pn_bin)[1], 1)
bool[winners_v2] <- 1 

 #write.table(bool, "/Users/rpineau3/Dropbox (GaTech)/BioSci-Ratcliff/Rozenn/4.PandoProject/17-analyses/0_replicate_dataset/data/bool_filter_replicate_vcf_536snps_today.txt", row.names = F)
 # write.table(ids_fil$full_ID, "/Users/rpineau3/Dropbox (GaTech)/BioSci-Ratcliff/Rozenn/4.PandoProject/17-analyses/0_replicate_dataset/data/ids_80inds.txt", row.names = F)
 # 
 # 
check <- pn_bin[as.factor(as.numeric(bool)),]
rowSums(check)

##    [1]  1  1  1  1  1  1  1  1  1  1  1  1 80 80  1  1  1  1  1  1  1  1  1  1
##   [25]  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 80  1  1
##   [49]  1 80 80  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
##   [73]  1  1  1  1  1  1  1  1 80  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
##   [97]  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 80  1 80 80  1  1  1
##  [121]  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
##  [145]  1 80  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
##  [169]  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
##  [193]  1  1  1  1  1  1  1  1  1  1  1  1 80  1  1  1  1  1  1  1  1 80  1  1
##  [217]  1  1  1  1  1  1 80  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
##  [241]  1  1 80 80 80  1  1  1 80  1  1  1  1  1  1  1  1  1 80 80 80  1  1  1
##  [265]  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 80  1  1  1  1  1
##  [289]  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
##  [313]  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
##  [337]  1  1  1  1  1 80  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
##  [361]  1  1  1  1  1  1 80 80  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 80
##  [385] 80 80  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 80  1  1  1  1  1
##  [409]  1  1  1  1  1  1  1  1 80  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
##  [433]  1  1  1  1  1  1  1  1  1  1  1 80  1  1  1  1  1  1  1  1  1  1  1  1
##  [457]  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
##  [481]  1 80 80  1  1  1  1  1  1  1  1  1  1  1 80 80  1  1  1 80 80  1  1  1
##  [505]  1  1 80 80  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
##  [529]  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
##  [553]  1  1  1  1  1  1  1  1  1 80 80  1 80  1  1  1  1  1 80  1  1  1  1  1
##  [577]  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
##  [601]  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
##  [625]  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
##  [649]  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
##  [673] 80  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
##  [697]  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 80 80  1  1  1  1
##  [721]  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
##  [745]  1  1 80  1 80 80 80 80  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
##  [769]  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
##  [793]  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
##  [817]  1  1  1  1  1  1  1  1  1  1  1  1 80  1  1  1  1  1  1  1  1  1  1 80
##  [841]  1  1  1 80  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
##  [865]  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
##  [889]  1  1  1  1  1  1  1  1  1  1  1  1  1  1 80  1  1  1  1  1  1  1  1  1
##  [913]  1  1  1 80  1  1  1  1  1  1  1  1  1  1  1 80  1  1  1  1  1  1  1  1
##  [937]  1  1  1  1  1  1  1  1 80  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
##  [961]  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 80 80  1  1  1
##  [985]  1  1  1  1 80 80  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
## [1009]  1  1  1  1 80  1 80  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
## [1033]  1  1  1  1 80 80 80  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
## [1057]  1  1  1  1  1  1  1  1  1 80  1  1  1  1  1  1  1  1  1  1  1  1 80  1
## [1081]  1  1  1  1  1  1  1  1  1  1  1  1 80  1  1  1  1  1  1  1  1  1  1  1
## [1105] 80 80  1 80 80 80  1 80 80  1  1  1  1  1  1 80 80  1  1 80  1 80  1  1
## [1129]  1  1  1  1  1  1  1  1  1 80 80  1  1  1  1 80 80  1  1  1  1  1  1  1
## [1153]  1 80  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
## [1177]  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
## [1201]  1  1  1  1  1  1 80 80  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 80
## [1225]  1  1  1  1  1  1  1  1  1  1  1  1  1 80  1  1  1  1  1  1  1  1  1  1
## [1249]  1  1 80  1  1 80 80  1 80  1  1  1  1  1  1  1  1  1  1  1  1  1  1 80
## [1273]  1  1  1  1  1  1  1  1  1 80  1  1 80  1  1  1  1  1  1  1  1  1  1  1
## [1297]  1 80  1 80  1 80 80 80  1  1  1  1  1  1  1  1  1 80 80  1  1 80  1  1
## [1321] 80  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
## [1345]  1  1  1  1  1  1  1  1  1  1  1  1  1 80  1  1  1  1  1  1  1 80  1  1
## [1369]  1  1  1  1  1  1 80 80  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
## [1393]  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 80  1  1  1  1 80  1  1  1
## [1417]  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
## [1441]  1  1 80  1  1 80 80  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
## [1465]  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 80 80 80  1  1  1  1
## [1489] 80 80 80  1 80  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 80
## [1513]  1 80  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
## [1537]  1  1  1  1  1  1  1  1  1 80 80  1  1  1  1  1  1  1  1  1  1  1  1  1
## [1561]  1  1  1  1  1 80  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 80  1  1
## [1585]  1 80  1  1  1  1  1 80  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 80
## [1609]  1  1  1  1  1  1  1  1  1  1  1  1 80  1 80  1  1  1  1  1  1  1  1 80
## [1633]  1  1  1  1 80 80  1 80  1  1  1  1  1 80 80  1  1  1  1 80  1  1  1  1
## [1657]  1 80 80 80  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
## [1681]  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 80 80  1
## [1705]  1  1 80  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
## [1729]  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 80  1  1  1  1  1  1
## [1753]  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
## [1777]  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 80  1  1  1  1  1
## [1801] 80  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
## [1825]  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 80 80 80  1
## [1849]  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
## [1873]  1  1  1 80 80  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
## [1897]  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
## [1921]  1  1  1  1  1  1 80  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
## [1945]  1  1 80 80  1  1  1  1  1  1  1  1 80 80  1  1  1  1  1  1  1  1  1  1
## [1969]  1  1  1 80  1  1  1  1  1  1  1  1  1  1  1  1  1  1 80  1  1  1  1  1
## [1993]  1  1  1  1  1 80 80  1  1  1  1  1  1  1  1 80  1  1  1  1  1  1  1  1
## [2017]  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
## [2041]  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
## [2065]  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
## [2089] 80  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
## [2113]  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
## [2137]  1  1  1  1 80 80  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 80  1  1
## [2161]  1  1  1  1  1  1 80  1  1  1  1  1  1  1  1  1 80  1  1  1  1  1  1  1
## [2185]  1  1  1  1  1  1  1 80  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
## [2209]  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
## [2233]  1  1  1  1 80  1  1  1  1  1  1  1 80  1  1  1  1  1  1  1  1  1  1  1
## [2257]  1  1  1 80  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
## [2281]  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
## [2305]  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 80  1 80  1
## [2329]  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
## [2353]  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
## [2377]  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
## [2401]  1  1  1  1 80  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
## [2425]  1  1  1  1  1  1  1 80 80  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
## [2449]  1  1  1  1  1 80  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
## [2473]  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 80  1  1  1 80  1  1  1
## [2497]  1  1  1  1 80  1  1  1  1  1  1  1  1  1  1  1  1  1 80  1 80  1  1  1
## [2521]  1  1  1  1  1  1  1  1  1 80  1  1 80 80 80  1  1  1  1  1  1  1  1  1
## [2545]  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 80  1  1  1
## [2569]  1  1  1  1  1  1 80  1 80 80  1  1  1  1  1  1  1  1  1  1  1  1 80  1
## [2593]  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 80 80  1  1  1
## [2617]  1  1  1  1  1  1  1  1  1  1  1 80  1  1  1  1  1  1  1  1  1  1  1 80
## [2641] 80  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
## [2665]  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 80 80  1 80  1  1
## [2689] 80 80  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
## [2713]  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 80  1  1  1  1  1  1  1  1
## [2737]  1  1  1  1  1  1  1  1  1  1 80  1  1  1  1  1  1  1  1  1  1  1  1  1
## [2761]  1  1  1  1  1  1  1  1  1  1 80 80  1  1  1  1  1 80 80 80  1 80  1  1
## [2785]  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 80 80 80  1  1  1  1
## [2809]  1  1  1 80  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
## [2833]  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 80 80  1 80  1
## [2857]  1  1  1  1 80  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
## [2881]  1  1 80  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
## [2905]  1  1  1  1  1  1  1  1  1 80  1  1  1  1  1  1  1  1  1  1  1  1  1  1
## [2929]  1  1  1  1  1  1  1  1  1  1  1  1  1 80  1  1  1  1  1  1  1  1  1  1
## [2953]  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
## [2977]  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
## [3001]  1  1 80 80  1  1  1  1  1  1  1  1  1  1  1  1  1  1 80  1  1  1 80  1
## [3025]  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 80 80  1
## [3049]  1  1  1  1  1  1  1  1  1  1  1  1  1 80  1  1  1  1  1  1  1  1  1  1
## [3073]  1 80 80 80  1  1  1  1 80 80  1  1  1  1  1  1  1  1  1  1  1  1  1  1
## [3097]  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 80  1
## [3121]  1  1  1  1  1  1 80  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 80
## [3145]  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
## [3169] 80  1  1  1  1  1 80  1  1  1  1  1  1  1  1  1 80  1  1  1  1  1  1  1
## [3193]  1  1  1 80  1  1  1  1  1 80  1  1  1  1  1  1  1  1  1  1  1  1 80  1
## [3217]  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 80
## [3241]  1  1  1  1  1  1 80  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
## [3265]  1  1  1  1  1  1  1  1  1  1 80  1  1  1  1  1  1  1  1  1  1  1  1  1
## [3289]  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
## [3313]  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
## [3337]  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 80
## [3361]  1  1 80  1  1  1  1  1  1  1  1  1  1  1  1  1  1 80  1  1  1  1  1  1
## [3385]  1  1  1  1  1  1  1  1  1  1 80  1  1  1  1 80  1  1  1  1 80 80  1  1
## [3409]  1  1  1  1  1 80  1  1  1  1  1  1  1  1  1  1 80  1  1  1  1 80 80  1
## [3433]  1  1  1  1 80  1  1  1  1  1  1  1  1  1  1 80  1 80  1  1  1  1  1  1
## [3457]  1  1 80  1  1  1 80 80  1  1  1  1  1  1  1  1  1  1  1  1  1 80  1 80
## [3481] 80 80  1  1  1  1  1  1  1 80 80  1 80 80  1 80  1 80 80 80  1 80  1  1
## [3505]  1  1  1 80 80  1 80 80  1  1  1 80  1  1  1  1  1 80  1  1  1  1 80 80
## [3529] 80 80 80 80 80 80 80  1  1 80 80  1  1  1  1  1  1  1 80  1 80 80  1 80
## [3553] 80  1 80  1  1  1  1 80 80  1 80  1  1 80  1  1  1  1  1  1 80  1  1  1
## [3577]  1  1  1  1  1  1  1  1  1  1  1 80  1 80 80 80  1  1 80 80  1  1  1  1
## [3601]  1  1  1  1  1 80  1 80  1 80  1  1 80 80 80 80 80 80  1 80 80 80 80 80
## [3625]  1  1  1 80 80  1  1 80  1  1  1  1  1  1 80  1  1  1 80  1  1  1  1  1
## [3649]  1 80  1 80 80  1  1 80 80  1 80  1 80 80 80 80 80 80 80  1 80 80 80  1
## [3673]  1 80 80  1  1  1  1  1  1  1  1  1  1  1  1  1  1 80  1 80 80 80 80 80
## [3697] 80  1  1  1  1 80 80  1  1  1  1  1 80  1  1 80 80 80 80  1  1  1  1  1
## [3721]  1  1  1  1  1  1  1  1 80  1 80  1 80  1  1  1  1 80 80 80  1 80 80 80
## [3745] 80 80 80  1 80  1 80 80  1  1  1 80  1  1 80  1  1  1  1  1 80 80  1  1
## [3769]  1  1  1 80  1  1  1  1  1 80  1  1  1  1  1 80  1  1 80  1  1 80  1  1
## [3793]  1  1 80  1 80  1  1  1  1 80 80 80  1  1  1 80 80  1 80 80 80 80  1  1
## [3817]  1 80  1  1  1  1 80  1 80  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
## [3841]  1  1  1 80  1  1  1  1  1  1  1 80 80 80  1 80 80  1 80 80 80  1  1 80
## [3865]  1  1  1  1  1  1  1  1  1 80 80 80  1  1  1  1  1  1 80 80 80 80  1  1
## [3889]  1  1 80  1 80  1  1  1  1  1 80  1 80  1 80  1 80 80 80  1 80 80  1  1
## [3913]  1  1  1  1  1  1  1 80  1  1  1  1  1  1  1  1  1 80 80 80  1  1  1  1
## [3937] 80 80 80  1  1  1 80 80 80 80  1 80 80 80 80  1  1  1  1  1 80  1 80 80
## [3961]  1  1 80  1  1  1  1 80  1 80 80  1  1 80  1 80  1  1  1 80  1  1  1 80
## [3985]  1 80  1  1  1  1 80  1  1  1  1  1  1  1  1  1  1  1  1 80  1 80  1  1
## [4009] 80  1 80 80  1  1  1 80  1  1 80  1  1  1  1  1  1  1  1  1 80 80  1  1
## [4033]  1  1  1  1  1  1  1  1  1  1 80  1  1  1  1  1 80 80 80  1  1  1  1  1
## [4057]  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
## [4081]  1  1  1 80  1  1 80  1  1  1  1 80  1 80  1  1  1  1  1  1  1  1  1  1
## [4105]  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
## [4129]  1  1  1  1  1  1  1  1  1 80  1  1  1  1  1  1  1  1  1  1  1  1  1  1
## [4153]  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 80  1  1  1
## [4177]  1  1 80  1 80  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
## [4201]  1  1  1  1  1  1  1  1  1  1 80  1  1  1 80  1  1  1  1  1  1  1  1  1
## [4225]  1  1  1 80 80  1  1  1  1  1  1  1 80  1  1  1  1 80 80  1 80  1  1 80
## [4249]  1 80  1  1  1  1 80  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 80  1
## [4273]  1  1  1 80  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
## [4297]  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
## [4321]  1  1  1  1  1 80  1  1  1  1  1  1  1  1  1  1  1  1 80  1 80  1  1  1
## [4345]  1  1  1 80  1  1  1  1  1  1  1  1  1  1  1  1  1  1 80  1  1  1  1  1
## [4369]  1  1  1  1  1  1  1  1  1  1  1 80  1  1  1 80  1  1  1  1 80 80 80  1
## [4393]  1  1  1  1  1  1  1 80  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
## [4417]  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
## [4441]  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
## [4465]  1  1  1  1  1  1  1  1 80 80  1  1 80 80  1  1  1  1  1  1  1  1  1  1
## [4489]  1 80  1  1  1  1  1  1 80 80 80  1  1  1 80 80 80  1  1  1  1 80 80  1
## [4513]  1 80 80  1  1  1  1  1  1  1  1  1  1  1 80 80  1  1  1  1  1 80 80 80
## [4537]  1  1  1  1  1  1  1  1 80  1  1  1  1  1  1  1  1 80 80  1  1  1  1  1
## [4561] 80  1  1  1  1 80  1 80  1  1  1  1  1  1  1  1 80  1 80 80  1  1  1 80
## [4585]  1  1  1 80  1 80  1  1  1 80  1  1  1  1  1  1 80 80 80 80  1  1  1

colSums(check)

##  [1]  536  536  536  536  536  536  536  536  536  536  536  536  536  536  536
## [16]  536  536  536  536  536  536  536  536  536  536  536  536  536  536  536
## [31]  536  536  536  536  536  536  536  536  536  536  536  536  536  536  536
## [46]  536  536  536  536  536  536  536 4607  536  536  536  536  536  536  536
## [61]  536  536  536  536  536  536  536  536  536  536  536  536  536  536  536
## [76]  536  536  536  536  536

check <- pn_bin[which(bool==1),]
rowSums(check)

##   [1] 35 29  8  8  9 29 20 22 11  8 26  2 12  5 12 10 31 12 10 12  9  7  5  5  8
##  [26]  9 10 27 29 12 23  8  8  8  3  4  7 10 21 30 28 13 22 11  9 14 20 25  6 10
##  [51]  4 17  6 46 16  9  6 27 20 15 24 30 21  7 11 12 18  6 14 13 14  8  8 10 15
##  [76] 17 11 23 24 15  9 10  8  8 45 12  5 25 19 27  6 11 18  3  3  9 20 10 13  8
## [101]  6 19 18 28 11 29 23 19  4 12 15  2 24 24  6 11 11 22 10  9  4 16 15 29 26
## [126] 26 30  6 19  6  2 38 29  8 30 22  6  5 15 15  4  3 19 23  7  7  4 27 31 29
## [151] 13 10 10 12 23 21  8 14 27 20  6 27 24 22  2  2 20 36  4  7  5 10  8 15  6
## [176]  6 44 34 27  5  6 32 25 27  7  6  4  5 47  4  5  5  3 25 28 35  3  4  7  8
## [201] 24  7 11 14 17 11  6 13  4 11  2  9 21 20 26  6  8  5  4 38 12  6  4  5 29
## [226] 17 12 12  5  2  3  3 27 27 25 15 17  6 25 12 36  9 19  2 17 22  5  6 20 12
## [251] 14 12 18 20  2 23 25  9 20 27 21 31  5 23 12 15  6 21 22  8  8  9  8  9 33
## [276]  9  7  9 25  9 25  6  5  5  5 20 19 13  2 19  6 34  7  6 22 10 27  7 11 19
## [301] 13 10 13 18 25 21 12 20  7 17 32 27  3  3  4 30  8  9  7 11 30 10 24 35  6
## [326]  8  6 25  6 19 37 15  6  7 12 32 17 26 16 16 23  4  6  6  6  6  4  9 16 15
## [351]  7 11 12  7  3  3  4 30 25 18 25 10  2  3 18  2  2 14 29 28 32 32 21 21 35
## [376]  6 29 29  6 18  8 12  4 10  8 17 15  7  4  2  7  4  4  7  6 14 14 11  7 13
## [401] 11  6  7  8 16 30 14 18 11 10  9  3  6  4 23 34 10 24 20 15 15 12 24 46 17
## [426] 31 24 10 12 16 21 17 33 22 18 11  8  8 17  4  4 10  5 11  2 15 17  9  3 27
## [451]  2  5 24  8 23  5  9 10 20 32  5 17 24 16 14 18  3 24 25  6  8 25  8 20 19
## [476] 34  6 13 13  4 14  7 14 12 33 16  8  5 10 24 26 15  2  2  7 19 20 20 10  3
## [501]  5 24 18 17  4  3  9 12  9 17 15 26 17 18  7 11  9  4  5 19  7  5  5  4  3
## [526] 24 13 12  3  8  6  6 14 13 13 14

colSums(check)

##  [1]  43  46  95  49  87  41  41  40  89  95 123  71 126  97  13 126 121  71 116
## [20] 138 149 155  65 137 115 116  80  54  92  93  80  67 157 130 130 157 134 140
## [39] 175 138  95  30  34  35  70  52  41  92  32  77  62 127 163 100 117  98  90
## [58] 114 151  64  62 105  80  87 118 135 172 157 111 140  44 166 156 119  57  38
## [77]  41  12  96 119

par(family="Times New Roman", cex.lab=1.2, cex.axis=1.2, cex.main=1, cex.sub=1)
hist(c(rowSums(pn_bin[which(bool==1),]))/96, xlab = "Heterozygosity", ylab = "counts", 50, main = "heterozygosity per SNP for somatic set", 
     col = "#2ca25f")

How many mutations in common between this set of 536 somatic mutations and the single samples from the Pando dataset?

Here: this set of somatic SNPs was further filtered to remove crap (based on the p-values) - this is the set that I am using as a base to compare it to the singles.

setwd("/Users/rpineau3/Dropbox (GaTech)/BioSci-Ratcliff/Rozenn/4.PandoProject/17-analyses/0_replicate_dataset/data")
# upoad pntest data and depth info
pn_est <- read.table("pntest_somatic_101SNPs_80inds.txt", sep = " ", header = F ) # 101 x 80

# -----------------------------------------------------------------------------
# BINARIZATION
# -----------------------------------------------------------------------------
pn_bin <- matrix(NaN, dim(pn_est)[1], dim(pn_est)[2])
threshold <- 0.5

for (x in 1:dim(pn_est)[1]) {
  
  for (y in 1:dim(pn_est)[2]) {
    
    if (pn_est[x,y]>threshold) {pn_bin[x,y] <- 1 } # if proba of being heteroZ > threshold, label with 1
    
    else {pn_bin[x,y] <- 0 }
    
  }
}


nInd <- 80
hets <- (rowSums(pn_bin))/nInd



setwd("/Users/rpineau3/Dropbox (GaTech)/BioSci-Ratcliff/Rozenn/4.PandoProject/17-analyses/0_replicate_dataset/figures/") 
pdf(file=paste("hets_per_snp_replicate_dataset.pdf", sep="" ), bg = "transparent", width=5, height=3, family = "Times New Roman")


par(family="Times New Roman", cex.lab=1.2, cex.axis=1.2, cex.main=1, cex.sub=1)
hist(hets, xlab = "Heterozygosity per SNP for somatic set", ylab = "counts", 50, main = "", sub = "Replicate dataset, 80 individuals and 101 SNPs",
     col = "#2ca25f")

dev.off()

## quartz_off_screen 
##                 2

# If the mutation is found in 2 replicate samples, in how many samples is it found?
t <- 9
p <- which(rowSums(occurrence) >= t ) # found in at least 2 groups, 3725 groups
mean(rowSums(occurrence[p,])) # 11.53

## [1] 11.72422

# compare to random
N <- 100
store_means <- matrix(NA, 1, N)

for (n in 1:N) {
  
  rdn <- matrix(NA,dim(occurrence)[1],dim(occurrence)[2])
  # shuffle dataset here
  for (i in 1:dim(occurrence)[2]) {
  
     rdn[,i] <- occurrence[sample(1:dim(occurrence)[1]),i]
  
       }

  p <- which( rowSums(rdn) >= t ) # found in at least 2 groups, 3725 groups
  print(length(p))
  
  store_means[n] <- mean(rowSums(rdn[p,])) 
  
}

## [1] 3656
## [1] 3651
## [1] 3667
## [1] 3642
## [1] 3663
## [1] 3638
## [1] 3670
## [1] 3679
## [1] 3652
## [1] 3683
## [1] 3656
## [1] 3662
## [1] 3618
## [1] 3670
## [1] 3679
## [1] 3649
## [1] 3672
## [1] 3633
## [1] 3655
## [1] 3640
## [1] 3667
## [1] 3661
## [1] 3652
## [1] 3636
## [1] 3660
## [1] 3654
## [1] 3660
## [1] 3656
## [1] 3609
## [1] 3629
## [1] 3659
## [1] 3658
## [1] 3646
## [1] 3643
## [1] 3658
## [1] 3651
## [1] 3640
## [1] 3647
## [1] 3680
## [1] 3667
## [1] 3658
## [1] 3650
## [1] 3664
## [1] 3651
## [1] 3647
## [1] 3657
## [1] 3654
## [1] 3651
## [1] 3659
## [1] 3655
## [1] 3661
## [1] 3664
## [1] 3653
## [1] 3667
## [1] 3643
## [1] 3670
## [1] 3670
## [1] 3652
## [1] 3645
## [1] 3647
## [1] 3659
## [1] 3661
## [1] 3664
## [1] 3619
## [1] 3675
## [1] 3653
## [1] 3650
## [1] 3679
## [1] 3614
## [1] 3676
## [1] 3645
## [1] 3664
## [1] 3651
## [1] 3675
## [1] 3644
## [1] 3676
## [1] 3649
## [1] 3673
## [1] 3668
## [1] 3679
## [1] 3648
## [1] 3635
## [1] 3651
## [1] 3670
## [1] 3648
## [1] 3651
## [1] 3676
## [1] 3677
## [1] 3657
## [1] 3675
## [1] 3648
## [1] 3641
## [1] 3666
## [1] 3647
## [1] 3672
## [1] 3648
## [1] 3649
## [1] 3660
## [1] 3668
## [1] 3638

mean(store_means) # 10.16

## [1] 10.42017

# Plot and compare
hist(store_means/12, 10, xlab = "Number", col = "#a8ddb5", border = "#a8ddb5", main="")
lines(rep(mean(rowSums(occurrence[p,]))/12, 2), c(0,25),  col = "#045a8d")
legend("topright", legend = c("Expected by chance" , "Mean for data (0.85)"), pch = 15, col = c("#a8ddb5", "#045a8d"), bty = "n" )

par(family = "Times New Roman")
p1 <- hist(rowSums(rdn)/12, 7, plot = F)
p2 <- hist(rowSums(occurrence[p,])/12, plot = F)
plot(p2, col=rgb(0,0,1,1/4), main = "Prop. of inds with the snp when found in 2 inds", xlab = "Proportion" )
plot(p1, col=rgb(168/255,221/255,181/255,1/4), add=T) 
legend("topleft", legend= c("data", "randomized dataset"), pt.bg =  c(rgb(0,0,1,1/4), rgb(168/255,221/255,181/255,1/4)), bty = "n", 
       pch = 22)

Replicate dataset analysis

2024-05-09

Dataset details

—————————————————————————–

Step 1: point estimates with modified allele frequencies

—————————————————————————–

—————————————————————————–

Step 2. Assess replication power (quality of our re-sequencing experiment)

—————————————————————————–

—————————————————————————–

Step 3: Filtering for somatic mutations

—————————————————————————–

Here: this set of somatic SNPs was further filtered to remove crap (based on the p-values) - this is the set that I am using as a base to compare it to the singles.