TA: Anastacia Wienecke,

Alice Woolard

treeNode <- setClass("treeNode", slots = c(descendants = "numeric", time = "numeric"))
branch <- setClass("branch", slots = c(descendants = "numeric", branchLength = "numeric"))
selectNodeIndicesToCoalesce <- function(n){  
  nodesToCoalesce <- sample(1:n,2)
  return(nodesToCoalesce)
}

pickCoalTime <- function(n){
  coalGen <- rgeom(1,n)
  return(coalGen)
}

calcCoalProb <- function(n,N){
  gpairs <- (n*(n-1))/2
  pcoal <- 1/(2*N)
  coalProb <- gpairs*pcoal
  return(coalProb)
}
generateTree <- function(n, N){
  nodes <- list()
  for (i in 1:n){
    currNode <- treeNode(descendants=c(i), time=0)
    nodes[[i]] <- currNode
  }
  
  branches <- list()
  currentTimeElapsed <- 0
  lineageCount = n 
  
  while (lineageCount >= 2){
    coalProb <- calcCoalProb(length(nodes),N)
    #print(length(nodes))
    coalGen <- pickCoalTime(coalProb) 
    
    indicesToCoalesce <- selectNodeIndicesToCoalesce(lineageCount)
    node1 <- nodes[[indicesToCoalesce[1]]]
    node2 <- nodes[[indicesToCoalesce[2]]]

    currentTimeElapsed <- currentTimeElapsed + coalGen

    nodes[[max(indicesToCoalesce)]] = NULL
    newNode <-  c(node1@descendants,node2@descendants)
    
    nodes[[min(indicesToCoalesce)]]@descendants <-  newNode
    nodes[[min(indicesToCoalesce)]]@time = currentTimeElapsed
    branchLength1 <- currentTimeElapsed - node1@time

    newBranch1 <- branch(descendants=node1@descendants, branchLength=branchLength1)
    branches[[length(branches)+1]] <- newBranch1
    
    branchLength2 <- currentTimeElapsed - node2@time
    newBranch2 <- branch(descendants=node2@descendants, branchLength=branchLength2)
    branches[[length(branches)+1]] <- newBranch2
    
    lineageCount = lineageCount - 1 
    #print(lineageCount)
  }
  return(list("nodes" = nodes, "branches" = branches))
}
drawNumberOfMutationsOnBranch <- function(branchLength, mut)
{currMuts = rbinom(1, branchLength, mut)
  return(currMuts)}

Mutations

throwDownMutations <- function(branches, n, mu, L)
{
    seqs = c()
    for (i in 1:n){
        seqs = c(seqs, "")
    }

    totalMuts <- 0
    for (b in 1:length(branches)){
        currMuts = drawNumberOfMutationsOnBranch(branches[[b]]@branchLength, mu*L)
        mut <- 0
        while (mut < currMuts){
            for (i in 1:n){
                if (i %in% branches[[b]]@descendants){ 
                    seqs[i] <- paste(seqs[i], "1", sep = "")
                }
                else{
                    seqs[i] <- paste(seqs[i], "0", sep = "")
                }
            }
            mut <- mut + 1
        }
        totalMuts <- totalMuts + currMuts
    }
    seqMatrix <- matrix(0L, nrow=n, ncol=totalMuts)
    for (i in 1:n){
        j = 1
        while (j <= totalMuts){
            seqMatrix[i,j] <- as.integer(substr(seqs[i], j, j))
            j <- j + 1
        }
    }
    if (totalMuts > 0){
        seqMatrix <- seqMatrix[,sample(1:totalMuts)]
    }
    return(seqMatrix)
}

Create the following getTotalTreeHeight function:

getTotalTreeHeight <- function(tree)
{return(tree$nodes[[1]]@time)}
L = 100000; N = 1e4; n = 20; mu = 1.2e-8  
expection <- 4*N*(1-(1/n))

tree <- generateTree(n, N)
tmrca <- getTotalTreeHeight(tree)

cat("Expection:",expection,"\nReality:", tmrca,"\n")
## Expection: 38000 
## Reality: 83421
seqMatrix <- throwDownMutations(tree$branches, n, mu, L)

cat("A sample of the sequence matrix...\n")
## A sample of the sequence matrix...
seqMatrix[1:20,1:13]
##       [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13]
##  [1,]    1    1    1    1    0    0    1    1    0     0     0     0     0
##  [2,]    1    0    0    1    0    0    0    0    1     0     0     0     0
##  [3,]    1    0    0    1    0    0    0    0    0     0     0     0     0
##  [4,]    1    1    1    1    0    0    1    1    0     0     0     0     0
##  [5,]    1    0    0    1    0    1    0    0    0     0     0     1     0
##  [6,]    1    0    0    1    0    0    0    0    0     0     0     1     0
##  [7,]    1    0    0    1    0    0    0    0    0     0     0     1     0
##  [8,]    1    0    1    1    0    0    1    0    0     0     0     0     0
##  [9,]    1    1    1    1    0    0    1    1    0     0     0     0     0
## [10,]    1    0    0    1    0    0    0    0    0     0     0     1     0
## [11,]    1    1    1    1    0    0    1    1    0     0     0     0     0
## [12,]    1    0    1    1    1    0    1    0    0     0     0     0     1
## [13,]    1    0    0    1    0    0    0    0    1     0     0     0     0
## [14,]    1    1    1    1    0    0    1    1    0     0     0     0     0
## [15,]    1    0    0    1    0    1    0    0    0     0     0     1     0
## [16,]    1    0    0    1    0    0    0    0    1     0     0     0     0
## [17,]    0    0    0    0    0    0    0    0    0     1     1     0     0
## [18,]    1    0    1    1    0    0    1    0    0     0     0     0     0
## [19,]    1    0    0    1    0    1    0    0    0     0     0     1     0
## [20,]    1    0    0    1    0    0    0    0    0     0     0     1     0

Your Analysis here

ptm <- proc.time()
sfs <- as.data.frame(matrix())

for (i in 1:500){
  tree <- generateTree(n, N)
  tmrca <- getTotalTreeHeight(tree)
  seqMatrix <- throwDownMutations(tree$branches, n, mu, L)
  freq <- mean(colSums(seqMatrix))
  sfs[i,] <- freq
}
cat("\nTiming of the inefficiency of this loop:\nTotal time to run: ")
## 
## Timing of the inefficiency of this loop:
## Total time to run:
cat(proc.time()[3] - ptm[3],"seconds...")
## 21.346 seconds...
ggplot(sfs, aes(x=V1)) + 
  geom_histogram(aes(y=..density..), binwidth=.15,    color="gray18",fill="white")+ geom_density(alpha=.2, fill="red",size=.5, color="gray14") + ggtitle("Site Frequency Spectrum,n=500\nSmaller Bins")+ 
  scale_x_continuous(limits = c(min(sfs)-.5, max(sfs)+.5))+
  gghisto+xlab("Mean Allele Frequencies \nAcrossTrials")

ggplot(sfs, aes(x=V1)) + 
  geom_histogram(aes(y=..density..), binwidth=.35,    color="gray18",fill="white")+ geom_density(alpha=.2, fill="red",size=.5, color="gray14") + ggtitle("Site Frequency Spectrum,n=500\nLarger Bins")+ 
  scale_x_continuous(limits = c(min(sfs)-.5, max(sfs)+.5))+
  gghisto+xlab("Mean Allele Frequencies \nAcrossTrials")

  tvar <- function(n,S) {
    a1 <- sum(1/(seq(1,n-1,1)));ab <- sum(1/((seq(1,n-1,1))**2))
    b <- (n+1)/(3*(n-1));bb <- (2*((n**2)+n+3))/((9*n)*(n-1))
    c1 <- b-(1/a1);c2 <- bb-((n+2)/(a1*n))+(ab/(a1**2))
    c3 <- c2/((a1**2)+ab);var <- ((c1/a1)*S)+(c3*S*(S-1))
    return(var)
    }
L = 100000;N = 1e3;n = 20;mu = 1.2e-8  
w.theta <- c();pix <- c();td <- c();sfs <- as.data.frame(matrix())

for (i in 1:100){
  tree <- generateTree(n, N)
  tmrca <- getTotalTreeHeight(tree)
  seqMatrix <- throwDownMutations(tree$branches, n, mu, L)
  freq <- mean(colSums(seqMatrix))
  sfs[i,] <- freq

  #wattersons
  denom <- rep(0,nrow(seqMatrix))
  for (j in 1:(n-1)){
    denom[j] <- (1/j)
    realdenom <- sum(denom)}
  w.theta <- c(w.theta,ncol(seqMatrix)/(realdenom))
  
  #pi
  for (j in 1:n){
    jsum <- rowSums(seqMatrix)[j]
    p <- jsum/n
    x <- (2*p)*(n-p)*(n/(n-1))+pi
    pix <- c(pix,x)
  }

  #tajimas
  S <- ncol(seqMatrix)
  td <- c(td,(pix[i]-w.theta[i])/sqrt(tvar(nrow(seqMatrix),ncol(seqMatrix))))
}

sfs.stat<- as.data.frame(c(w.theta, pix, td))
stat.names <- c(rep("Wattersons theta",length(w.theta)), rep("p .diversity",length(pix)),rep("Tajimas D",length(td)))
stat.df <- as.data.frame(cbind(sfs.stat,stat.names))

cat("--MEANS--\nWatterson's Theta:",mean(w.theta),"\nPi:",mean(pix),"\nTajima's D:",mean(td))
## --MEANS--
## Watterson's Theta: 4.69031 
## Pi: 12.41577 
## Tajima's D: 6.017637
ggplot(sfs, aes(x=V1)) + 
  geom_histogram(aes(y=..density..), binwidth=.4,    color="gray18",fill="white")+ geom_density(alpha=.2, fill="red",size=.5, color="gray14") + ggtitle("Site Frequency Spectrum, n=200")+ 
  scale_x_continuous(limits = c(min(sfs)-.5, max(sfs)+.5))+
  gghisto+xlab("Mean Allele Frequencies \nAcrossTrials")

ggplot(stat.df, aes(x = stat.df$`c(w.theta, pix, td)`, fill = stat.names)) +geom_density(alpha = .3)+ ggtitle("Diversity of \nSimulated Sequences, n=200")+ 
  scale_x_continuous(limits = c(min(pix), 20))+
  gghisto+xlab("Value of Statistic \nAcrossTrials")