DE genes based on 45X vs 75X coverage

Scott Garrett Daniel — Nov 22, 2013, 3:08 PM

#Hypergeometric, fisher.test, differential expression and read coverage

#To start with, lets grossly oversimplify, we'll say we have 45x coverage meaning we can have a gene with 0 to 45 reads
#We'll say we have 17e3 genes and they are all at max coverage except for gene A (17e3*45 = 51e4 reads)

#What's the probability that we have gene A with 45 reads in experimental condition and 1 read in control condition given the background of 51e4 reads?
#The answer is a 2x2 fisher test

choose(51.003e4,45)*choose(51.0001e4,1)/choose(102.0031e4,31)
[1] 1.315e+54

#Now, let's hold gene A to be 45 reads in experiment but vary from 1 to 45 reads in the control, whats the probability of those?

p_values_of_read_depth_fourtyfive=matrix(ncol=2,nrow=45)
p_values_of_read_depth_fourtyfive[1:45,1]=1:45
i=1
for (i in 1:45) {
p_values_of_read_depth_fourtyfive[i,2]=fisher.test(rbind(c(45,51e4),c(i,51e4)))$p.value
}
plot(p_values_of_read_depth_fourtyfive)

plot of chunk unnamed-chunk-1


#What range of expression will give us significant results?
powerfourtyfive=length(which(p_values_of_read_depth_fourtyfive[,2]<.05))/45

#Now, lets say we increase depth to 75
p_values_of_read_depth_seventyfive=matrix(ncol=2,nrow=75)
p_values_of_read_depth_seventyfive[1:75,1]=1:75
i=1
for (i in 1:75) {
  p_values_of_read_depth_seventyfive[i,2]=fisher.test(rbind(c(75,51e4),c(i,51e4)))$p.value
}
plot(p_values_of_read_depth_seventyfive)

plot of chunk unnamed-chunk-1


#What range of expression will give us significant results?
powerseventyfive=length(which(p_values_of_read_depth_seventyfive[,2]<.05))/75

#How much increase do we get going from 45x to 75x coverage?
(powerseventyfive-powerfourtyfive)/powerfourtyfive
[1] 0.1333

#Good enough power increase?

#However, now we have to think about the fact that both genes range from 1 to X reads
#Or do we get the same power anyways?

#Let's simulate it anyways
p_values_of_read_depth_fourtyfive=matrix(ncol=3)

i=1
j=1

for (i in 1:45) {
  for (j in 1:45) {
    temp_matrix=matrix(ncol=3,nrow=1)
    temp_matrix[1,1]=i
    temp_matrix[1,2]=j
    temp_matrix[1,3]=fisher.test(rbind(c(i,51e4),c(j,51e4)))$p.value
    p_values_of_read_depth_fourtyfive=rbind(p_values_of_read_depth_fourtyfive,temp_matrix)
  }

}

#Let's see the distribution
hist(p_values_of_read_depth_fourtyfive[,3])

plot of chunk unnamed-chunk-1


#And the pretty density plot
plot(density(p_values_of_read_depth_fourtyfive[,3],na.rm=T))

plot of chunk unnamed-chunk-1


#Now, how many are significant p-values (<.05)?
newpowerfourtyfive=length(which(p_values_of_read_depth_fourtyfive[,3]<.05))/(45^2)

#And for 75x coverage

p_values_of_read_depth_seventyfive=matrix(ncol=3)

i=1
j=1

for (i in 1:75) {
  for (j in 1:75) {
    temp_matrix=matrix(ncol=3,nrow=1)
    temp_matrix[1,1]=i
    temp_matrix[1,2]=j
    temp_matrix[1,3]=fisher.test(rbind(c(i,51e4),c(j,51e4)))$p.value
    p_values_of_read_depth_seventyfive=rbind(p_values_of_read_depth_seventyfive,temp_matrix)
  }

}

#Let's see the distribution
hist(p_values_of_read_depth_seventyfive[,3])

plot of chunk unnamed-chunk-1


#And the pretty density plot
plot(density(p_values_of_read_depth_seventyfive[,3],na.rm=T))

plot of chunk unnamed-chunk-1


#Now, how many are significant p-values (<.05)?
newpowerseventyfive=length(which(p_values_of_read_depth_seventyfive[,3]<.05))/(75^2)

#And the power increase?
(newpowerseventyfive-newpowerfourtyfive)/newpowerfourtyfive
[1] 0.2169