Prepare sequences
transferredRNA = read.csv('simplex.csv')
names(transferredRNA)[1] = "ID"
gff = read.gff("simplex.gff")
gff$ID = str_match(gff$attributes, "ID=[a-z0-9]*")
gff$ID = substr(gff$ID,4,100)
transferredRNA$sstart = gff$start[match(transferredRNA$ID, gff$ID)]
transferredRNA$end = gff$end[match(transferredRNA$ID, gff$ID)]
genomefilename = "GCF_001578185.1_ASM157818v1_genomic.fna"
fasta = read.FASTA(genomefilename)
names(fasta) = c("genome", "plasmid")
# initialize fasta to empty values
transferredRNA$fasta = ""
# Remove genes without known start and end points in the GFF file.
nona_trRNA = transferredRNA[!is.na(transferredRNA$sstart),]
# b. Attach positive gene with matching sequence
for (index in which(nona_trRNA$count >= 0)) {
bases = nona_trRNA$sstart[index]:nona_trRNA$end[index]
nucleotide2 = fasta$genome[bases]
nucleotide3 = lapply(nucleotide2, as.character.DNAbin)
nona_trRNA$fasta[index] = paste0( unlist(nucleotide3), collapse='')
nona_trRNA$geneLength[index] = nchar(nona_trRNA$fasta[index])
}
pos = nona_trRNA$fasta[nona_trRNA$count > 2]
genes = nona_trRNA$fasta
# Change to biostrings object, then RC and finally as.character
genesRC = lapply(lapply(lapply(genes, DNAString),reverseComplement), as.character)
# positives change to biostring object
posRC = lapply(lapply(lapply(pos, DNAString),reverseComplement), as.character)
# remove big unnecessary objects
rm(fasta, gff, nona_trRNA)
Controls
Random sampling of genes
# sampling
samples = 100
genesSample = tibble(x=1:length(posRC))
for(index in seq(samples)) {
# randomSample[index] = unlist(lapply(genes.length, randDNA))
genesSample[index] = unlist(sample(genesRC, length(posRC)))
}
Random sequences with same nucleotide ratios
# random sequences
# array with nucleotides with same ratio as transferred genes
As = sum(stri_count_fixed(posRC,"A"))
Ts = sum(stri_count_fixed(posRC,"T"))
Gs = sum(stri_count_fixed(posRC,"G"))
Cs = sum(stri_count_fixed(posRC,"C"))
pos.nucleotides = c(rep("A", As), rep("G", Gs), rep("C", Cs), rep("T", Ts))
genes.length = unlist(lapply(lapply(posRC, DNAString), length))
randomSample = matrix(nrow = length(posRC), ncol = samples)
for (resample in seq(samples)){
for (gene.index in seq(genes.length)) {
sequence = sample(pos.nucleotides, as.integer(genes.length[gene.index]))
randomSample[gene.index, resample] = paste0(sequence, collapse = "")
}
}
randomSample = unlist(randomSample)
Looking for the motive
Motive specifications
# numeric Input
maxRepetitions = 27
# motiveUnit = "([atgc][tc][atgc])"
# tc ac ag at gc tg
motiveUnit = toupper("([atgc][tc][atgc])")
# motive results array
motivesHits = data.frame( repetitions = integer(maxRepetitions))
Look the motive in transferred genes
repetitions = 0
for (index in seq(maxRepetitions)) {
repetitions = index + 5
pattern = paste0(motiveUnit,"{",repetitions, ",}")
motivesHits$repetitions[index] = repetitions
motivesHits$positiveRC[index] = sum(stri_count_regex(unlist(posRC), pattern))
}
Look the motive in random sequences
randomSD = matrix(nrow = maxRepetitions)
repetitions=0
for (index in seq(maxRepetitions)){
repetitions = index + 5
pattern = paste0(motiveUnit,"{",repetitions, ",}")
hits.per.gene = stri_count_regex(randomSample, pattern)
motivesHits$random[index] = sum(hits.per.gene) / samples
# randomSD[index] = sd(hits)
}
rm(randomSample)
Look the motive in genes samples
sampleHits = matrix(nrow = maxRepetitions, ncol = samples)
repetitions = 0
for (index in seq(maxRepetitions)) {
repetitions = index + 5
pattern = paste0(motiveUnit,"{",repetitions, ",}")
sampleHits[index,] = stri_count_regex(genesSample, pattern)
}
sampleHits = data.frame(sampleHits)
motivesHits$samples = apply(sampleHits , 1, mean)
sampleHits$repetitions = 6 : (maxRepetitions + 5)
rm(genesSample)
Arrange the data
melted.table = melt(motivesHits, id = "repetitions")
melted.table$value = round(melted.table$value)
meltSamp = melt(sampleHits, id = "repetitions")
Calculate P-value, the percentage of random sets at each length that had a fraction of genes with the motif that was higher than in the real set.
pval = (apply(sampleHits[,1:samples] > motivesHits$positiveRC, 1, sum) + 1) /
(samples + 1)
pval
[1] 0.00990099 0.00990099 0.00990099 0.00990099 0.00990099 0.00990099 0.00990099 0.00990099 0.00990099
[10] 0.00990099 0.00990099 0.00990099 0.00990099 0.00990099 0.00990099 0.00990099 0.00990099 0.02970297
[19] 0.01980198 0.00990099 0.00990099 0.00990099 0.00990099 0.00990099 0.00990099 0.00990099 0.00990099
Motive repetitions in B. simplex
# load("results/molten_##.rdata")
title = bquote(.(motiveUnit) ['n']~ "motive repetitions in " ~ italic("B. simplex"))
Xs = paste0(rep("X", samples), 1:samples)
leg.colors = c("#E69F00", "#56B4E9",rep("#999999", samples))
leg.breaks = c("positiveRC", "random", Xs)
leg = c(values = leg.colors, breaks = leg.breaks)
p = ggplot(melted.table, aes(repetitions, value, variable)) +
geom_point(aes(repetitions, value)
, data = meltSamp
, color = "grey"
, alpha = 0.3
, position = position_jitter(w = 0.5, h = 0)) +
geom_line(aes(repetitions, value, color = variable), size = 2) +
scale_color_manual(values = c("#619CFF","#F8766D","grey")
,labels = c( "Transferred genes"
, "Random sequences"
, "Random genes samples")) +
scale_y_log10(breaks = c(1,10,100,1000,10000)) +
scale_x_continuous(expand = c(0, -0.3)) +
labs(y = "Motive occurrences (#)", x = "Motive repetitions (length)"
, color = "", group = "genes") +
theme_classic() +
theme( text = element_text(size=28)
,title = element_text(size=22)
,legend.position = c(0.75,0.9)
,panel.background = element_blank()
,plot.title = element_text(hjust = 0.5)
) +
ggtitle(title)
p

Save the graph
motivename = substr(motiveUnit, 9, 10)
file.png = paste0("results/greyRandom_",motivename , ".png")
file.svg = paste0("results/greyRandom_", motivename, ".svg")
ggsave(filename = file.png, p, width = 10)
ggsave(filename = file.svg, p, width = 10)
Another graph same values
line = list(width = 3)
p2 = plot_ly(
type = "scatter"
, mode = "markers+lines"
)
p2 = add_trace( p2
, data = melted.table[melted.table$variable =="positiveRC",]
, x = ~repetitions
, y = ~value
, color = ~variable
, line = list(width = 6)
# , name = c("Transferred genes", "Random sequences")
)
p2 = add_markers( p2
, data = melted.table
, x = jitter(melted.table$repetitions)
, y = ~value
, opacity = 0.3
, marker = list(color = "grey", opacity = 0.3)
, name = "genes samples"
)
p2 = add_markers( p2
, data = melted.table[melted.table$variable =="random",]
, x = ~repetitions
, y = ~value
, marker = list(color = "orange")
, name = "Random sequences"
)
p2 = layout( p2
,yaxis = list(type = "log", title = "Hits (#)", zeroline = F, showgrid = F)
,xaxis = list(title = "Motive Repetitions", showgrid = F)
,title = "(NYN)<sub>n</sub> motive in <i>B. simplex</i> genome"
,font = list(size = 12)
,legend = list(x = 0.8, y = 0.9)
,paper_bgcolor = 'rgba(0,0,0,0)'
,plot_bgcolor = 'rgba(0,0,0,0)'
)
p2 # %>% add_annotations(text = "<b> RC = <br>Reverse <br>Complement</b>", x = 23, y = 2.2, showarrow = F)
Source code here
---
title: "motive analyzing for each sample"
author: "Rotem Hadar"
date: "December 9, 2018"
output:
  html_notebook: default
  pdf_document: default
toc: yes
---
 

```{r message=FALSE, warning=FALSE}
library(ape)
library(stringr)
library(stringi)
library(Biostrings)
library(plotly)
library(reshape2)
library(htmlTable)
library(RBioinf)
library("cowplot")
library(fgpt)
library(dplyr)

```


###  Prepare sequences
```{r sequences}
transferredRNA = read.csv('simplex.csv')
names(transferredRNA)[1] = "ID"

gff = read.gff("simplex.gff")
gff$ID = str_match(gff$attributes, "ID=[a-z0-9]*")
gff$ID = substr(gff$ID,4,100)

transferredRNA$sstart = gff$start[match(transferredRNA$ID, gff$ID)]
transferredRNA$end = gff$end[match(transferredRNA$ID, gff$ID)]

genomefilename = "GCF_001578185.1_ASM157818v1_genomic.fna"
fasta = read.FASTA(genomefilename)
names(fasta) = c("genome", "plasmid")

# initialize fasta to empty values
transferredRNA$fasta = ""
# Remove genes without known start and end points in the GFF file.
nona_trRNA = transferredRNA[!is.na(transferredRNA$sstart),]
# b. Attach positive gene with matching sequence
for (index in which(nona_trRNA$count >= 0)) {
  bases = nona_trRNA$sstart[index]:nona_trRNA$end[index]
  nucleotide2 = fasta$genome[bases]
  nucleotide3 = lapply(nucleotide2, as.character.DNAbin)
  nona_trRNA$fasta[index] = paste0( unlist(nucleotide3), collapse='')
  nona_trRNA$geneLength[index] = nchar(nona_trRNA$fasta[index])
}

pos = nona_trRNA$fasta[nona_trRNA$count > 2]
genes = nona_trRNA$fasta
      # Change to biostrings object, then RC and finally as.character
  genesRC = lapply(lapply(lapply(genes, DNAString),reverseComplement), as.character)

# positives change to biostring object
  posRC = lapply(lapply(lapply(pos, DNAString),reverseComplement), as.character)

# remove big unnecessary objects
rm(fasta, gff, nona_trRNA)

```

#### Controls
##### Random sampling of genes
```{r}
#  sampling
samples = 100
  genesSample = tibble(x=1:length(posRC))
for(index in seq(samples)) {
  # randomSample[index] = unlist(lapply(genes.length, randDNA))
      genesSample[index] = unlist(sample(genesRC, length(posRC)))
}
```

##### Random sequences with same nucleotide ratios
```{r}
#  random sequences 
#  array with nucleotides with same ratio as transferred genes
  As = sum(stri_count_fixed(posRC,"A"))
  Ts = sum(stri_count_fixed(posRC,"T"))
  Gs = sum(stri_count_fixed(posRC,"G"))
  Cs = sum(stri_count_fixed(posRC,"C"))        
pos.nucleotides = c(rep("A", As), rep("G", Gs), rep("C", Cs), rep("T", Ts))
  genes.length = unlist(lapply(lapply(posRC, DNAString), length))

  randomSample = matrix(nrow = length(posRC), ncol = samples)
for (resample in seq(samples)){
  for (gene.index in seq(genes.length)) {
    sequence = sample(pos.nucleotides, as.integer(genes.length[gene.index]))
    randomSample[gene.index, resample] = paste0(sequence, collapse = "")
  }
}
randomSample = unlist(randomSample)
```

### Looking for the motive
#### Motive specifications
```{r motiveDecleration}
# numeric Input
maxRepetitions = 27
# motiveUnit = "([atgc][tc][atgc])"
# tc ac ag at gc tg
motiveUnit = toupper("([atgc][tc][atgc])")

#  motive results array
motivesHits = data.frame( repetitions = integer(maxRepetitions))
```

#### Look the motive in transferred genes
```{r LookPositive} 
repetitions = 0
for (index in seq(maxRepetitions)) {
  repetitions = index + 5
  pattern = paste0(motiveUnit,"{",repetitions, ",}")
  motivesHits$repetitions[index] = repetitions
      motivesHits$positiveRC[index] = sum(stri_count_regex(unlist(posRC), pattern))
}
```

#### Look the motive in random sequences
```{r LookRandom}
randomSD = matrix(nrow = maxRepetitions)

repetitions=0
for (index in seq(maxRepetitions)){
  repetitions = index + 5

  pattern = paste0(motiveUnit,"{",repetitions, ",}")
  hits.per.gene = stri_count_regex(randomSample, pattern)
  motivesHits$random[index] = sum(hits.per.gene) / samples
  # randomSD[index] = sd(hits)
}

rm(randomSample)
```

#### Look the motive in genes samples
```{r LookSamples, message=FALSE, warning=FALSE}
sampleHits = matrix(nrow = maxRepetitions, ncol = samples)

repetitions = 0
for (index in seq(maxRepetitions)) {
  repetitions = index + 5
  
  pattern = paste0(motiveUnit,"{",repetitions, ",}")
  sampleHits[index,] = stri_count_regex(genesSample, pattern)
}

sampleHits = data.frame(sampleHits)

motivesHits$samples = apply(sampleHits , 1, mean)
sampleHits$repetitions = 6 : (maxRepetitions + 5)

rm(genesSample)
```

##### Arrange the data
```{r meltData}
melted.table = melt(motivesHits, id = "repetitions")
melted.table$value = round(melted.table$value)
meltSamp = melt(sampleHits, id = "repetitions")
```

Calculate P-value, the percentage of random sets at each length that had a fraction of genes with the motif that was higher than in the real set.
```{r}
pval = (apply(sampleHits[,1:samples] > motivesHits$positiveRC, 1, sum) + 1) /
  (samples + 1)
pval
```


# Motive repetitions in _B. simplex_
```{r ggplot}
# load("results/molten_##.rdata")
title = bquote(.(motiveUnit) ['n']~ "motive repetitions in " ~ italic("B. simplex"))
Xs = paste0(rep("X", samples), 1:samples)
leg.colors = c("#E69F00", "#56B4E9",rep("#999999", samples))
  leg.breaks = c("positiveRC", "random", Xs)
leg = c(values = leg.colors, breaks = leg.breaks)

   p = ggplot(melted.table, aes(repetitions, value, variable)) +
     
     geom_point(aes(repetitions, value)
                , data = meltSamp
                , color = "grey"
                , alpha = 0.3
                , position = position_jitter(w = 0.5, h = 0)) +
     
     geom_line(aes(repetitions, value, color = variable), size = 2) +
     scale_color_manual(values = c("#619CFF","#F8766D","grey")
                        ,labels = c(  "Transferred genes"
                                    , "Random sequences"
                                    , "Random genes samples")) + 
     scale_y_log10(breaks = c(1,10,100,1000,10000)) +
     
     scale_x_continuous(expand = c(0, -0.3)) +
     labs(y = "Motive occurrences (#)", x = "Motive repetitions (length)"
          , color = "", group = "genes") +
     theme_classic() + 
     theme(  text = element_text(size=28) 
            ,title = element_text(size=22) 
            ,legend.position = c(0.75,0.9) 
            ,panel.background = element_blank() 
            ,plot.title = element_text(hjust = 0.5) 
          ) + 
     ggtitle(title) 
   p
```
##### Save the graph
```{r save, message=FALSE, warning=FALSE}
motivename = substr(motiveUnit, 9, 10)
file.png = paste0("results/greyRandom_",motivename , ".png")
file.svg = paste0("results/greyRandom_", motivename, ".svg") 
ggsave(filename = file.png, p, width = 10)
ggsave(filename = file.svg, p, width = 10)
#  save the data for the plot
save(file = paste0("results/molten_", motivename, ".rdata")
     , melted.table, meltSamp, motivename, pval)
```

## Another graph same values
```{r plotly}
line = list(width = 3)
    p2 = plot_ly( 
                  type = "scatter"
                , mode = "markers+lines"
                )
    p2 = add_trace( p2
                      , data = melted.table[melted.table$variable =="positiveRC",]
                    , x = ~repetitions
                    , y = ~value
                    , color = ~variable
                    , line = list(width = 6)
                    # , name = c("Transferred genes", "Random sequences")
                   )    
    p2 = add_markers(  p2
                   , data = melted.table
                   , x = jitter(melted.table$repetitions)
                   , y = ~value
                   , opacity = 0.3
                   , marker = list(color = "grey", opacity = 0.3)
                   , name = "genes samples"
                  )
    p2 = add_markers( p2
                    , data = melted.table[melted.table$variable =="random",]
                    , x = ~repetitions
                    , y = ~value
                    , marker = list(color = "orange")
                    , name = "Random sequences"
                   )    
    p2 = layout( p2
                 ,yaxis = list(type = "log", title = "Hits (#)", zeroline = F, showgrid = F)
                 ,xaxis = list(title = "Motive Repetitions", showgrid = F)
                 ,title = "(NYN)<sub>n</sub> motive in <i>B. simplex</i> genome"
                 ,font = list(size = 12)
                 ,legend = list(x = 0.8, y = 0.9)
                 ,paper_bgcolor = 'rgba(0,0,0,0)'
                 ,plot_bgcolor = 'rgba(0,0,0,0)'
                 )
      p2 # %>% add_annotations(text = "<b> RC = <br>Reverse <br>Complement</b>", x = 23, y = 2.2, showarrow = F)
```

Source code [here](https://bitbucket.org/benzzz/rnamotive/src/master/eachRandomMotive.rmd)