recombination rate = 0.3
start.p<-c(0.25,0.25,0.25,0.25)
recombination.rate<-0.3
trans.p<-GetTransitionProb(recombination.rate)
child1.chromatid.mom<-SimulateChromatid(100,recombination.rate) #100 markers
child1.chromatid.dad<-SimulateChromatid(100,recombination.rate)
obs<-child1.chromatid.mom$child+child1.chromatid.dad$child
v.2<-Viterbi2 (obs,start.p, trans.p,child1.chromatid.mom,child1.chromatid.dad)
vitrowmax.2 <- apply(v.2, 1, function(x) which.max(x))
vitrowmax.2
## [1] 1 1 2 1 1 1 2 2 2 2 2 2 2 2 1 2 2 2 2 1 1 1 3 3 3 3 3 3 4 4 2 2 1 4 4
## [36] 4 4 4 4 4 4 4 3 1 3 3 3 1 1 2 2 1 1 3 3 4 4 4 3 1 2 2 1 1 1 1 2 2 4 3
## [71] 3 1 4 4 3 1 1 1 1 1 2 1 1 2 2 3 3 3 3 1 1 1 2 2 2 4 4 4 4 4
simulated<-paste("mom.",child1.chromatid.mom$parent.path,"/","dad.",child1.chromatid.dad$parent.path,sep="")
estimated<-rownames(trans.p)[vitrowmax.2]
sim.results<-sum(simulated==estimated)
The number for true positives from our estimation would be given by.
sum(sim.results)
## [1] 57
and the percentage of true positives is 57%
child1.chromatid.mom<-SimulateChromatid(10000,recombination.rate) #100 markers
child1.chromatid.dad<-SimulateChromatid(10000,recombination.rate)
obs<-child1.chromatid.mom$child+child1.chromatid.dad$child
v.2<-Viterbi2 (obs,start.p, trans.p,child1.chromatid.mom,child1.chromatid.dad)
vitrowmax.2 <- apply(v.2, 1, function(x) which.max(x))
simulated<-paste("mom.",child1.chromatid.mom$parent.path,"/","dad.",child1.chromatid.dad$parent.path,sep="")
estimated<-rownames(trans.p)[vitrowmax.2]
sim.results<-sum(simulated==estimated)
sum(sim.results)
## [1] 2686
and the percentage of true positives is 26.86%. Much lower than when we simulated for 100 loci. Let’s look at some values of our matrix returned by Viterbi2 to see what is the problem.
v.2[1:5,]
## [,1] [,2] [,3] [,4]
## [1,] 0.000000000 0.25 0.250000000 0.000000
## [2,] 0.052500000 0.00 0.122500000 0.000000
## [3,] 0.000000000 0.00 0.000000000 0.025725
## [4,] 0.002315250 0.00 0.005402250 0.000000
## [5,] 0.001134472 0.00 0.002647102 0.000000
v.2[1500:1505,]
## [,1] [,2] [,3] [,4]
## [1,] 0 0 0 0
## [2,] 0 0 0 0
## [3,] 0 0 0 0
## [4,] 0 0 0 0
## [5,] 0 0 0 0
## [6,] 0 0 0 0
v.2[5000:5005,]
## [,1] [,2] [,3] [,4]
## [1,] 0 0 0 0
## [2,] 0 0 0 0
## [3,] 0 0 0 0
## [4,] 0 0 0 0
## [5,] 0 0 0 0
## [6,] 0 0 0 0
v.2[9000:9005,]
## [,1] [,2] [,3] [,4]
## [1,] 0 0 0 0
## [2,] 0 0 0 0
## [3,] 0 0 0 0
## [4,] 0 0 0 0
## [5,] 0 0 0 0
## [6,] 0 0 0 0
As you can see, the values get smaller to the point R does not have a representation and shows 0. To prevent this we can modify our Viterbi2 algorythm to use the log2 of the probabilities instead.
Viterbi3 <- function(obs,start.p, trans.p,mom.chromatid,dad.chromatid) {
#initializing
v <- matrix(NA, nr=length(obs), nc=dim(trans.p)[1])
emit.p<-GetEmissionProb(mom.chromatid$maternal[1],
mom.chromatid$paternal[1],
dad.chromatid$maternal[1],
dad.chromatid$paternal[1])
emit.p<-emit.p+10^(-12)
for(i in 1:4){
v[1,i]=log(start.p[i],2)+log(emit.p[i,obs[1]+1],2)
}
for(i in 2:length(obs)) {# from observation 2 to t
emit.p<-GetEmissionProb(mom.chromatid$maternal[i],
mom.chromatid$paternal[i],
dad.chromatid$maternal[i],
dad.chromatid$paternal[i])
for (l in 1:dim(trans.p)[1]) { #from state 1 to N (4 states)
v[i,l] <- log(emit.p[l,obs[i]+1]+10^(-12),2) + max(v[(i-1),] + log( trans.p[l,],2))
}
}
return(v)
}
You will notice that I am adding +10^(-12) before using the value from the emission matrix. We are doing that because we don’t want to calculate the log2 of 0 which would be -Inf.
Using the same simulated dat we can run our new Viterbi3 funcion.
child1.chromatid.mom<-SimulateChromatid(10000,recombination.rate) #100 markers
child1.chromatid.dad<-SimulateChromatid(10000,recombination.rate)
obs<-child1.chromatid.mom$child+child1.chromatid.dad$child
v.2<-Viterbi3 (obs,start.p, trans.p,child1.chromatid.mom,child1.chromatid.dad)
vitrowmax.2 <- apply(v.2, 1, function(x) which.max(x))
simulated<-paste("mom.",child1.chromatid.mom$parent.path,"/","dad.",child1.chromatid.dad$parent.path,sep="")
estimated<-rownames(trans.p)[vitrowmax.2]
sim.results<-sum(simulated==estimated)
sum(sim.results)
## [1] 5966
and the percentage of true positives is 59.66%. Closer to what we got when simulated for 100 loci. Now that our Viterbi function is more robust we can do more simulations.
start.p<-c(0.25,0.25,0.25,0.25)
ViterbiSimulator <- function(num.markers,num.simulations,start.p,recombination.rate){
sim.results<-c()
trans.p<-GetTransitionProb(recombination.rate)
for (i in 1:num.simulations){
child1.chromatid.mom<-SimulateChromatid(num.markers,recombination.rate) #100 markers
child1.chromatid.dad<-SimulateChromatid(num.markers,recombination.rate)
obs<-child1.chromatid.mom$child+child1.chromatid.dad$child
v.2<-Viterbi3 (obs,start.p, trans.p,child1.chromatid.mom,child1.chromatid.dad)
vitrowmax.2 <- apply(v.2, 1, function(x) which.max(x))
simulated<-paste("mom.",child1.chromatid.mom$parent.path,"/","dad.",child1.chromatid.dad$parent.path,sep="")
estimated<-rownames(trans.p)[vitrowmax.2]
sim.results[i]<-sum(simulated==estimated)
}
return(sim.results)
}
result.100.03<-ViterbiSimulator(100,100,start.p,0.3);
hist(result.100.03,main="True positives in 100 simulations with 100 markers\n r=0.3",xlab="true positives")
result.100.02<-ViterbiSimulator(100,100,start.p,0.2);
hist(result.100.02,main="True positives in 100 simulations with 100 markers\n r=0.2",xlab="true positives")
result.100.01<-ViterbiSimulator(100,100,start.p,0.1);
hist(result.100.01,main="True positives in 100 simulations with 100 markers\n r=0.1",xlab="true positives")
result.500.03<-ViterbiSimulator(500,100,start.p,0.3)
result.500.02<-ViterbiSimulator(500,100,start.p,0.2)
result.500.01<-ViterbiSimulator(500,100,start.p,0.1)
hist(result.500.03,main="True positives in 100 simulations with 500 markers\n r=0.3",xlab="true positives")
hist(result.500.01,main="True positives in 100 simulations with 500 markers\n r=0.1",xlab="true positives")
x | 100 loci | 300 loci | 500 loci | 10,000 loci |
---|---|---|---|---|
r=0.3 | 59.47 | 58.5 | 51.2 | 59.528 |
r=0.2 | 67.23 | 67.3333333 | 59.8 | 67.5197 |
r=0.1 | 80.29 | 79.8333333 | 72.2 | 80.3359 |
From these results it is clear that our estimation accuracy increases with smaller recombination rates and it appears to be independent from the number of loci.