This is just a test of a viterbi implementation with a fix Emission matrix for 5 observations. Adapted from (Krijnen 2009).
start.p<-c(0.4,0.2,0.2,0.2) #The starting probabilities chosen arbitrarily
emit.p<-GetEmissionProb(0,0,1,0) # For this example we are having just one emission matrix, not 16
# to keep it simple for now we are not using the parent genotypes or phasing.
#obs<-child1.genotype
obs<-c(0,0,1,1,2)
trans.p<-GetTransitionProb(0.2) # Getting the transition probabilities matrix
Viterbi <- function(obs,start.p, trans.p, emit.p) {
#initializing
v <- matrix(NA, nr=length(obs), nc=dim(trans.p)[1]) ## we create a 10x4 empty matrix (10 obs, 4 hidden states)
for(i in 1:4){ #for each of the hidden states we calculate the initial probabilities
v[1,i]=start.p[i]*emit.p[i,obs[1]+1]
}
for(i in 2:length(obs)) {# from observation 2 to t
for (l in 1:dim(trans.p)[1]) { #from state 1 to N (4 states)
v[i,l] <- emit.p[l,obs[i]+1] * max(v[(i-1),] * trans.p[l,])
}
}
return(v)
}
v<-Viterbi (obs,start.p, trans.p, emit.p)
v
## [,1] [,2] [,3] [,4]
## [1,] 0.0000000 0.200 0.0000000 0.200
## [2,] 0.0000000 0.128 0.0000000 0.128
## [3,] 0.0204800 0.000 0.0204800 0.000
## [4,] 0.0131072 0.000 0.0131072 0.000
## [5,] 0.0000000 0.000 0.0000000 0.000
With the matrix v calculated, the state path given us the maximum probability can be found by the maximum values per row, with this function.
vitrowmax <- apply(v, 1, function(x) which.max(x))
vitrowmax
## [1] 2 2 1 1 1
Now we are going to adjust our implementation of the Viterbi algorithm to use the information we have about the parents phasing.
child1.chromatid.mom<-SimulateChromatid()
child1.chromatid.mom
## $maternal
## [1] 0 1 1 0 0 0 1 1 1 0
##
## $paternal
## [1] 1 0 0 1 0 0 1 0 1 0
##
## $parent.path
## [1] "maternal" "paternal" "paternal" "paternal" "paternal" "paternal"
## [7] "maternal" "maternal" "maternal" "paternal"
##
## $child
## [1] 0 0 0 1 0 0 1 1 1 0
child1.chromatid.dad<-SimulateChromatid()
child1.chromatid.dad
## $maternal
## [1] 0 1 0 0 1 1 1 0 1 1
##
## $paternal
## [1] 0 0 0 0 1 1 1 0 0 0
##
## $parent.path
## [1] "paternal" "paternal" "paternal" "maternal" "paternal" "paternal"
## [7] "maternal" "maternal" "paternal" "maternal"
##
## $child
## [1] 0 0 0 0 1 1 1 0 0 1
### for our observation in the child
obs<-child1.chromatid.mom$child+child1.chromatid.dad$child
obs
## [1] 0 0 0 1 1 1 2 1 1 1
start.p<-c(0.25,0.25,0.25,0.25);
trans.p<-GetTransitionProb(0.3)
Viterbi2 <- function(obs,start.p, trans.p,mom.chromatid,dad.chromatid) {
#initializing
v <- matrix(NA, nr=length(obs), nc=dim(trans.p)[1])
emit.p<-GetEmissionProb(mom.chromatid$maternal[1],
mom.chromatid$paternal[1],
dad.chromatid$maternal[1],
dad.chromatid$paternal[1])
for(i in 1:4){
v[1,i]=start.p[i]*emit.p[i,obs[1]+1]
}
for(i in 2:length(obs)) {# from observation 2 to t
emit.p<-GetEmissionProb(mom.chromatid$maternal[i],
mom.chromatid$paternal[i],
dad.chromatid$maternal[i],
dad.chromatid$paternal[i])
for (l in 1:dim(trans.p)[1]) { #from state 1 to N (4 states)
v[i,l] <- emit.p[l,obs[i]+1] * max(v[(i-1),] * trans.p[l,])
}
}
return(v)
}
v.2<-Viterbi2 (obs,start.p, trans.p,child1.chromatid.mom,child1.chromatid.dad)
v.2
## [,1] [,2] [,3] [,4]
## [1,] 2.500000e-01 0.2500000000 0.000000e+00 0.000000e+00
## [2,] 0.000000e+00 0.0000000000 0.000000e+00 5.250000e-02
## [3,] 0.000000e+00 0.0000000000 1.102500e-02 2.572500e-02
## [4,] 0.000000e+00 0.0000000000 5.402250e-03 1.260525e-02
## [5,] 1.134472e-03 0.0026471025 2.647102e-03 6.176572e-03
## [6,] 5.558915e-04 0.0012970802 1.297080e-03 3.026521e-03
## [7,] 2.723868e-04 0.0006355693 6.355693e-04 1.482995e-03
## [8,] 1.334696e-04 0.0003114290 0.000000e+00 0.000000e+00
## [9,] 0.000000e+00 0.0001526002 0.000000e+00 6.540008e-05
## [10,] 3.204604e-05 0.0000000000 1.373402e-05 0.000000e+00
And to get the parent path
vitrowmax.2 <- apply(v.2, 1, function(x) which.max(x))
vitrowmax.2
## [1] 1 4 4 4 4 4 4 2 2 1
And in order to compare our simulated states with the estimated states obtained using our HHMmodel, the observed genotype of the simulated progeny and the genotype and phasing of the simulated parents.
simulated<-paste("mom.",child1.chromatid.mom$parent.path,"/","dad.",child1.chromatid.dad$parent.path,sep="")
estimated<-rownames(trans.p)[vitrowmax.2]
simulated
## [1] "mom.maternal/dad.paternal" "mom.paternal/dad.paternal"
## [3] "mom.paternal/dad.paternal" "mom.paternal/dad.maternal"
## [5] "mom.paternal/dad.paternal" "mom.paternal/dad.paternal"
## [7] "mom.maternal/dad.maternal" "mom.maternal/dad.maternal"
## [9] "mom.maternal/dad.paternal" "mom.paternal/dad.maternal"
estimated
## [1] "mom.maternal/dad.maternal" "mom.paternal/dad.paternal"
## [3] "mom.paternal/dad.paternal" "mom.paternal/dad.paternal"
## [5] "mom.paternal/dad.paternal" "mom.paternal/dad.paternal"
## [7] "mom.paternal/dad.paternal" "mom.maternal/dad.paternal"
## [9] "mom.maternal/dad.paternal" "mom.maternal/dad.maternal"
simulated==estimated
## [1] FALSE TRUE TRUE FALSE TRUE TRUE FALSE FALSE TRUE FALSE
The model seems to be working with the simulated data, some predictions are right and some are wrong. To better assess the accuracy of the model we will run simulations with more markers and different recombination rates.
Krijnen WP. 2009. Applied Statistics for Bioinformatics using R. GNU Free Document License.