Análisis del score de comparación de RNA's

Lectura del fichero

rna = read.table("/home/ricardo/Documents/recerca/RNAjairo/scoresSameFunc", 
    header = TRUE)
str(rna)
## 'data.frame':    76976 obs. of  7 variables:
##  $ pdb1 : Factor w/ 417 levels "17raA","1a1tB",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ pdb2 : Factor w/ 417 levels "1a4tA","1a60A",..: 1 6 10 8 3 4 17 9 2 7 ...
##  $ align: int  12 17 14 5 19 19 17 10 13 20 ...
##  $ score: num  1.219 1.376 1.304 0.296 1.086 ...
##  $ len1 : int  21 21 21 21 21 21 21 21 21 21 ...
##  $ len2 : int  15 18 16 17 24 24 19 29 44 30 ...
##  $ same : int  0 0 1 0 0 0 0 0 0 0 ...

Score distribution

## All pairs
hist(rna$score, freq = FALSE, ylim = c(0, 1.4), xlim = c(0, 3.4), main = "Histogram of all score's")
lines(density(rna$score), col = "red")

plot of chunk unnamed-chunk-2

## Pairs NOT SAME
hist(rna$score[rna$same == 0], freq = FALSE, ylim = c(0, 1.4), xlim = c(0, 3.4), 
    main = "Histogram of score's NOT SAME function")
lines(density(rna$score[rna$same == 0]), col = "red")

plot of chunk unnamed-chunk-3

## Pairs SAME
hist(rna$score[rna$same == 1], freq = FALSE, ylim = c(0, 1.4), xlim = c(0, 3.4), 
    main = "Histogram of score's SAME function")
lines(density(rna$score[rna$same == 1]), col = "red")

plot of chunk unnamed-chunk-4

Score distribution by length

maxLen = pmax(rna$len1, rna$len2)
boxplot(rna$score ~ maxLen, main = "Score length resistant ALL pairs")

plot of chunk unnamed-chunk-5

maxLen = pmax(rna$len1[rna$same == 0], rna$len2[rna$same == 0])
boxplot(rna$score[rna$same == 0] ~ maxLen, main = "Score length resistant NOT SAME pairs")

plot of chunk unnamed-chunk-6

maxLen = pmax(rna$len1[rna$same == 1], rna$len2[rna$same == 1])
boxplot(rna$score[rna$same == 1] ~ maxLen, main = "Score length resistant SAME pairs")

plot of chunk unnamed-chunk-7

Probabilities

P(NOTSAME), P(SAME)

n = length(rna$same)
p.not.same = length(rna$same[rna$same == 0])/n
p.not.same
## [1] 0.9674
p.same = 1 - p.not.same
p.same
## [1] 0.03265

###P(Score/SAME), P(Score/NOSAME)

n = length(rna$same)
p.not.same = length(rna$same[rna$same == 0])/n
p.not.same
## [1] 0.9674
p.same = 1 - p.not.same
p.same
## [1] 0.03265

Loading RNA scores and cut from 0.0 to 1.0 by step 1/k (k=number of intervals) for RNA scores

scoreSAME = rna$score[rna$same == 1]
scoreNOSAME = rna$score[rna$same == 0]

Compute Absolute Frecuencies k intervals, breaks from 0.00 to 1.00 by=step

# Set the parameters for breaks
k <- 100
M = max(rna$score)
M
## [1] 3.084
step <- M/100
step
## [1] 0.03084
breaks <- seq(0, M, by = step)
length <- length(breaks)
# Inicializate the data.frame for results
source.counts.scores <- data.frame(scores = breaks[2:length])

Computing the frequencies….

Counts scores

counts.SAME <- as.numeric(table(cut(scoreSAME, breaks = breaks)))
counts.NOSAME <- as.numeric(table(cut(scoreNOSAME, breaks = breaks)))
source.counts.scores$absolute.freqs.SAME <- counts.SAME
source.counts.scores$absolute.freqs.NOSAME <- counts.NOSAME
fix(source.counts.scores)

Not work

#Writing counts
write.table(source.counts.scores,file=PDTMFreq,row.names=F)

Freqs

N1 = sum(source.counts.scores$absolute.freqs.SAME)
N2 = sum(source.counts.scores$absolute.freqs.NOSAME)
plot(source.counts.scores$scores, source.counts.scores$absolute.freqs.NOSAME/N2, 
    col = "blue", type = "l", ylab = "Relative frequencies", xlab = "Score")
lines(source.counts.scores$scores, source.counts.scores$absolute.freqs.SAME/N1, 
    col = "red", type = "l")
legend(x = 0.748, y = 0.099, legend = c("Not Same", "Same"), fill = c("blue", 
    "red"), cex = 0.7)

plot of chunk unnamed-chunk-13

Acums

N1 = sum(source.counts.scores$absolute.freqs.SAME)
N2 = sum(source.counts.scores$absolute.freqs.NOSAME)
plot(source.counts.scores$scores, cumsum(source.counts.scores$absolute.freqs.NOSAME)/N2, 
    col = "blue", type = "l", ylab = "Relative frequencies", xlab = "Score")
lines(source.counts.scores$scores, cumsum(source.counts.scores$absolute.freqs.SAME)/N1, 
    col = "red", type = "l")
legend(x = 0, y = 0.8, legend = c("Not Same", "Same"), fill = c("blue", "red"), 
    cex = 0.7)

plot of chunk unnamed-chunk-14

Same/(Same+NoSame) ###

plot(source.counts.scores$scores, cumsum(source.counts.scores$absolute.freqs.NOSAME)/N2, 
    col = "blue", type = "l", ylab = "Relative frequencies", xlab = "Score")
lines(source.counts.scores$scores, cumsum(source.counts.scores$absolute.freqs.SAME)/N1, 
    col = "red", type = "l")
legend(x = 0, y = 0.8, legend = c("Not Same", "Same"), fill = c("blue", "red"), 
    cex = 0.7)

plot of chunk unnamed-chunk-15