4. The Viterbi Algorithm

First implementation

This is just a test of a viterbi implementation with a fix Emission matrix for 5 observations. Adapted from (Krijnen 2009).

start.p<-c(0.4,0.2,0.2,0.2) #The starting probabilities chosen arbitrarily 
emit.p<-GetEmissionProb(0,0,1,0) # For this example we are having just one emission matrix, not 16 
                                 # to keep it simple for now we are not using the parent genotypes or phasing.
#obs<-child1.genotype
obs<-c(0,0,1,1,2)
trans.p<-GetTransitionProb(0.2) # Getting the transition probabilities matrix

Viterbi <- function(obs,start.p, trans.p, emit.p) {
  #initializing
  v <- matrix(NA, nr=length(obs), nc=dim(trans.p)[1]) ## we create a 10x4 empty matrix (10 obs, 4 hidden states)
  for(i in 1:4){ #for each of the hidden states we calculate the initial probabilities
    v[1,i]=start.p[i]*emit.p[i,obs[1]+1]
  }
  
  for(i in 2:length(obs)) {# from observation 2 to t
    for (l in 1:dim(trans.p)[1]) { #from state 1 to N (4 states)
      v[i,l] <- emit.p[l,obs[i]+1] * max(v[(i-1),] * trans.p[l,])
    }
  }   
      
return(v) 
}


v<-Viterbi (obs,start.p, trans.p, emit.p)
v

##           [,1]  [,2]      [,3]  [,4]
## [1,] 0.0000000 0.200 0.0000000 0.200
## [2,] 0.0000000 0.128 0.0000000 0.128
## [3,] 0.0204800 0.000 0.0204800 0.000
## [4,] 0.0131072 0.000 0.0131072 0.000
## [5,] 0.0000000 0.000 0.0000000 0.000

With the matrix v calculated, the state path given us the maximum probability can be found by the maximum values per row, with this function.

vitrowmax <- apply(v, 1, function(x) which.max(x))
vitrowmax

## [1] 2 2 1 1 1

Viterbi: Second implementation

Now we are going to adjust our implementation of the Viterbi algorithm to use the information we have about the parents phasing.

child1.chromatid.mom<-SimulateChromatid()
child1.chromatid.mom

## $maternal
##  [1] 0 1 1 0 0 0 1 1 1 0
## 
## $paternal
##  [1] 1 0 0 1 0 0 1 0 1 0
## 
## $parent.path
##  [1] "maternal" "paternal" "paternal" "paternal" "paternal" "paternal"
##  [7] "maternal" "maternal" "maternal" "paternal"
## 
## $child
##  [1] 0 0 0 1 0 0 1 1 1 0

child1.chromatid.dad<-SimulateChromatid()
child1.chromatid.dad

## $maternal
##  [1] 0 1 0 0 1 1 1 0 1 1
## 
## $paternal
##  [1] 0 0 0 0 1 1 1 0 0 0
## 
## $parent.path
##  [1] "paternal" "paternal" "paternal" "maternal" "paternal" "paternal"
##  [7] "maternal" "maternal" "paternal" "maternal"
## 
## $child
##  [1] 0 0 0 0 1 1 1 0 0 1

### for our observation in the child
obs<-child1.chromatid.mom$child+child1.chromatid.dad$child
obs

##  [1] 0 0 0 1 1 1 2 1 1 1

start.p<-c(0.25,0.25,0.25,0.25);

trans.p<-GetTransitionProb(0.3)

Viterbi2 <- function(obs,start.p, trans.p,mom.chromatid,dad.chromatid) {
  #initializing
  v <- matrix(NA, nr=length(obs), nc=dim(trans.p)[1])
  emit.p<-GetEmissionProb(mom.chromatid$maternal[1],
                          mom.chromatid$paternal[1],
                          dad.chromatid$maternal[1],
                          dad.chromatid$paternal[1])
  for(i in 1:4){
    v[1,i]=start.p[i]*emit.p[i,obs[1]+1]
  }
  
  for(i in 2:length(obs)) {# from observation 2 to t
    emit.p<-GetEmissionProb(mom.chromatid$maternal[i],
                            mom.chromatid$paternal[i],
                            dad.chromatid$maternal[i],
                            dad.chromatid$paternal[i])
    for (l in 1:dim(trans.p)[1]) { #from state 1 to N (4 states)
      v[i,l] <- emit.p[l,obs[i]+1] * max(v[(i-1),] * trans.p[l,])
    }
  }   
  
  return(v) 
}



v.2<-Viterbi2 (obs,start.p, trans.p,child1.chromatid.mom,child1.chromatid.dad)
v.2

##               [,1]         [,2]         [,3]         [,4]
##  [1,] 2.500000e-01 0.2500000000 0.000000e+00 0.000000e+00
##  [2,] 0.000000e+00 0.0000000000 0.000000e+00 5.250000e-02
##  [3,] 0.000000e+00 0.0000000000 1.102500e-02 2.572500e-02
##  [4,] 0.000000e+00 0.0000000000 5.402250e-03 1.260525e-02
##  [5,] 1.134472e-03 0.0026471025 2.647102e-03 6.176572e-03
##  [6,] 5.558915e-04 0.0012970802 1.297080e-03 3.026521e-03
##  [7,] 2.723868e-04 0.0006355693 6.355693e-04 1.482995e-03
##  [8,] 1.334696e-04 0.0003114290 0.000000e+00 0.000000e+00
##  [9,] 0.000000e+00 0.0001526002 0.000000e+00 6.540008e-05
## [10,] 3.204604e-05 0.0000000000 1.373402e-05 0.000000e+00

And to get the parent path

vitrowmax.2 <- apply(v.2, 1, function(x) which.max(x))
vitrowmax.2

##  [1] 1 4 4 4 4 4 4 2 2 1

And in order to compare our simulated states with the estimated states obtained using our HHMmodel, the observed genotype of the simulated progeny and the genotype and phasing of the simulated parents.

simulated<-paste("mom.",child1.chromatid.mom$parent.path,"/","dad.",child1.chromatid.dad$parent.path,sep="")
estimated<-rownames(trans.p)[vitrowmax.2]
simulated

##  [1] "mom.maternal/dad.paternal" "mom.paternal/dad.paternal"
##  [3] "mom.paternal/dad.paternal" "mom.paternal/dad.maternal"
##  [5] "mom.paternal/dad.paternal" "mom.paternal/dad.paternal"
##  [7] "mom.maternal/dad.maternal" "mom.maternal/dad.maternal"
##  [9] "mom.maternal/dad.paternal" "mom.paternal/dad.maternal"

estimated

##  [1] "mom.maternal/dad.maternal" "mom.paternal/dad.paternal"
##  [3] "mom.paternal/dad.paternal" "mom.paternal/dad.paternal"
##  [5] "mom.paternal/dad.paternal" "mom.paternal/dad.paternal"
##  [7] "mom.paternal/dad.paternal" "mom.maternal/dad.paternal"
##  [9] "mom.maternal/dad.paternal" "mom.maternal/dad.maternal"

simulated==estimated

##  [1] FALSE  TRUE  TRUE FALSE  TRUE  TRUE FALSE FALSE  TRUE FALSE

The model seems to be working with the simulated data, some predictions are right and some are wrong. To better assess the accuracy of the model we will run simulations with more markers and different recombination rates.

References

Krijnen WP. 2009. Applied Statistics for Bioinformatics using R. GNU Free Document License.

Author

Luis Avila (lmavila@gmail.com)