5. Simulations and HMM estimations

Simulating and estimating for 100 loci

recombination rate = 0.3

start.p<-c(0.25,0.25,0.25,0.25)
recombination.rate<-0.3
trans.p<-GetTransitionProb(recombination.rate)
child1.chromatid.mom<-SimulateChromatid(100,recombination.rate) #100 markers
child1.chromatid.dad<-SimulateChromatid(100,recombination.rate)
obs<-child1.chromatid.mom$child+child1.chromatid.dad$child
v.2<-Viterbi2 (obs,start.p, trans.p,child1.chromatid.mom,child1.chromatid.dad)
vitrowmax.2 <- apply(v.2, 1, function(x) which.max(x))
vitrowmax.2

##   [1] 1 1 2 1 1 1 2 2 2 2 2 2 2 2 1 2 2 2 2 1 1 1 3 3 3 3 3 3 4 4 2 2 1 4 4
##  [36] 4 4 4 4 4 4 4 3 1 3 3 3 1 1 2 2 1 1 3 3 4 4 4 3 1 2 2 1 1 1 1 2 2 4 3
##  [71] 3 1 4 4 3 1 1 1 1 1 2 1 1 2 2 3 3 3 3 1 1 1 2 2 2 4 4 4 4 4

simulated<-paste("mom.",child1.chromatid.mom$parent.path,"/","dad.",child1.chromatid.dad$parent.path,sep="")
estimated<-rownames(trans.p)[vitrowmax.2]
sim.results<-sum(simulated==estimated)

The number for true positives from our estimation would be given by.

sum(sim.results)

## [1] 57

and the percentage of true positives is 57%

Simulating for 10,000 loci and recombination rate r=0.3

child1.chromatid.mom<-SimulateChromatid(10000,recombination.rate) #100 markers
child1.chromatid.dad<-SimulateChromatid(10000,recombination.rate)
obs<-child1.chromatid.mom$child+child1.chromatid.dad$child
v.2<-Viterbi2 (obs,start.p, trans.p,child1.chromatid.mom,child1.chromatid.dad)
vitrowmax.2 <- apply(v.2, 1, function(x) which.max(x))
simulated<-paste("mom.",child1.chromatid.mom$parent.path,"/","dad.",child1.chromatid.dad$parent.path,sep="")
estimated<-rownames(trans.p)[vitrowmax.2]
sim.results<-sum(simulated==estimated)
sum(sim.results)

## [1] 2686

and the percentage of true positives is 26.86%. Much lower than when we simulated for 100 loci. Let’s look at some values of our matrix returned by Viterbi2 to see what is the problem.

v.2[1:5,]

##             [,1] [,2]        [,3]     [,4]
## [1,] 0.000000000 0.25 0.250000000 0.000000
## [2,] 0.052500000 0.00 0.122500000 0.000000
## [3,] 0.000000000 0.00 0.000000000 0.025725
## [4,] 0.002315250 0.00 0.005402250 0.000000
## [5,] 0.001134472 0.00 0.002647102 0.000000

v.2[1500:1505,]

##      [,1] [,2] [,3] [,4]
## [1,]    0    0    0    0
## [2,]    0    0    0    0
## [3,]    0    0    0    0
## [4,]    0    0    0    0
## [5,]    0    0    0    0
## [6,]    0    0    0    0

v.2[5000:5005,]

##      [,1] [,2] [,3] [,4]
## [1,]    0    0    0    0
## [2,]    0    0    0    0
## [3,]    0    0    0    0
## [4,]    0    0    0    0
## [5,]    0    0    0    0
## [6,]    0    0    0    0

v.2[9000:9005,]

##      [,1] [,2] [,3] [,4]
## [1,]    0    0    0    0
## [2,]    0    0    0    0
## [3,]    0    0    0    0
## [4,]    0    0    0    0
## [5,]    0    0    0    0
## [6,]    0    0    0    0

As you can see, the values get smaller to the point R does not have a representation and shows 0. To prevent this we can modify our Viterbi2 algorythm to use the log2 of the probabilities instead.

Viterbi3 <- function(obs,start.p, trans.p,mom.chromatid,dad.chromatid) {
  #initializing
  v <- matrix(NA, nr=length(obs), nc=dim(trans.p)[1])
  emit.p<-GetEmissionProb(mom.chromatid$maternal[1],
                          mom.chromatid$paternal[1],
                          dad.chromatid$maternal[1],
                          dad.chromatid$paternal[1])
  emit.p<-emit.p+10^(-12)                        
  for(i in 1:4){
    v[1,i]=log(start.p[i],2)+log(emit.p[i,obs[1]+1],2)
  }
  
  for(i in 2:length(obs)) {# from observation 2 to t
     emit.p<-GetEmissionProb(mom.chromatid$maternal[i],
                             mom.chromatid$paternal[i],
                             dad.chromatid$maternal[i],
                             dad.chromatid$paternal[i])
     for (l in 1:dim(trans.p)[1]) { #from state 1 to N (4 states)
       v[i,l] <- log(emit.p[l,obs[i]+1]+10^(-12),2) + max(v[(i-1),] + log( trans.p[l,],2))
     }
  }   
  
  return(v) 
}

You will notice that I am adding +10^(-12) before using the value from the emission matrix. We are doing that because we don’t want to calculate the log2 of 0 which would be -Inf.

Using the same simulated dat we can run our new Viterbi3 funcion.

child1.chromatid.mom<-SimulateChromatid(10000,recombination.rate) #100 markers
child1.chromatid.dad<-SimulateChromatid(10000,recombination.rate)
obs<-child1.chromatid.mom$child+child1.chromatid.dad$child
v.2<-Viterbi3 (obs,start.p, trans.p,child1.chromatid.mom,child1.chromatid.dad)
vitrowmax.2 <- apply(v.2, 1, function(x) which.max(x))
simulated<-paste("mom.",child1.chromatid.mom$parent.path,"/","dad.",child1.chromatid.dad$parent.path,sep="")
estimated<-rownames(trans.p)[vitrowmax.2]
sim.results<-sum(simulated==estimated)
sum(sim.results)

## [1] 5966

and the percentage of true positives is 59.66%. Closer to what we got when simulated for 100 loci. Now that our Viterbi function is more robust we can do more simulations.

A lot of simulations

start.p<-c(0.25,0.25,0.25,0.25)

ViterbiSimulator <- function(num.markers,num.simulations,start.p,recombination.rate){ 
  sim.results<-c()
  trans.p<-GetTransitionProb(recombination.rate)
  for (i in 1:num.simulations){
    child1.chromatid.mom<-SimulateChromatid(num.markers,recombination.rate) #100 markers
    child1.chromatid.dad<-SimulateChromatid(num.markers,recombination.rate)
    obs<-child1.chromatid.mom$child+child1.chromatid.dad$child
    v.2<-Viterbi3 (obs,start.p, trans.p,child1.chromatid.mom,child1.chromatid.dad)
    vitrowmax.2 <- apply(v.2, 1, function(x) which.max(x))
    simulated<-paste("mom.",child1.chromatid.mom$parent.path,"/","dad.",child1.chromatid.dad$parent.path,sep="")
    estimated<-rownames(trans.p)[vitrowmax.2]
    sim.results[i]<-sum(simulated==estimated)
  }
  return(sim.results)
}  

result.100.03<-ViterbiSimulator(100,100,start.p,0.3);
hist(result.100.03,main="True positives in 100 simulations with 100 markers\n r=0.3",xlab="true positives")

result.100.02<-ViterbiSimulator(100,100,start.p,0.2);
hist(result.100.02,main="True positives in 100 simulations with 100 markers\n r=0.2",xlab="true positives")

result.100.01<-ViterbiSimulator(100,100,start.p,0.1);
hist(result.100.01,main="True positives in 100 simulations with 100 markers\n r=0.1",xlab="true positives")

result.500.03<-ViterbiSimulator(500,100,start.p,0.3)
result.500.02<-ViterbiSimulator(500,100,start.p,0.2)
result.500.01<-ViterbiSimulator(500,100,start.p,0.1)

hist(result.500.03,main="True positives in 100 simulations with 500 markers\n r=0.3",xlab="true positives")

hist(result.500.01,main="True positives in 100 simulations with 500 markers\n r=0.1",xlab="true positives")

Percent mean true positives for estimations of different recombination rates and number of loci

x	100 loci	300 loci	500 loci	10,000 loci
r=0.3	59.47	58.5	51.2	59.528
r=0.2	67.23	67.3333333	59.8	67.5197
r=0.1	80.29	79.8333333	72.2	80.3359

From these results it is clear that our estimation accuracy increases with smaller recombination rates and it appears to be independent from the number of loci.

Author

Luis Avila (lmavila@gmail.com)