Table Of content:

Size distribution of transferred genes (>2)

Motive size in positive (count > 2) genes

  1. Load transferred genes.
    trRNA = read.csv('e_coli.csv')
    names(trRNA)[1] = "ID"
    trRNA$ID = substr(trRNA$ID,8,30)
  1. Load GFF file.
    gff = read.gff("GCF_000005845.2_ASM584v2_genomic.gff")
    gff$ID = str_match(gff$attributes, "ID=[a-z0-9]*")
    gff$ID = substr(gff$ID,4,100)
  1. Match every gene with matching gff start & end values.
    trRNA$sstart = gff$start[match(trRNA$ID, gff$ID)]
    trRNA$end = gff$end[match(trRNA$ID, gff$ID)]
  1. Load FASTA file.
    genomefilename = "GCF_000005845.2_ASM584v2_genomic.fna"
    fasta = read.FASTA(genomefilename)
    names(fasta) = c("genome")
  1. with start & end coordinates get right sequence from FASTA file.
    # initialize fasta to empty values
    trRNA$fasta = ""
    # Remove genes without known start and end points in the GFF file.
    nona_trRNA = trRNA[!is.na(trRNA$sstart),]
    # b. Attach positive gene with matching sequence
    for (index in which(nona_trRNA$count >= 0)) {
      bases = nona_trRNA$sstart[index]:nona_trRNA$end[index]
      nucleotide2 = fasta$genome[bases]
      nucleotide3 = lapply(nucleotide2, as.character.DNAbin)
      nona_trRNA$fasta[index] = paste0( unlist(nucleotide3), collapse='')
      nona_trRNA$geneLength[index] = nchar(nona_trRNA$fasta[index])
    }
    # remove big unnecessary objects
    rm(fasta)

Size distribution of transferred genes (>2)

# randSample = matrix( nrow = length(nona_trRNA$geneLength[nona_trRNA$count>2]), ncol = 100)
# for ( i in seq(100)){P
  randSampleA = sample(x = nona_trRNA$geneLength, size = length(nona_trRNA$geneLength[nona_trRNA$count>2]))
# }
# randSampleA = rowMeans(randSample, na.rm = T)

densityAll = density(nona_trRNA$geneLength)
densityTransfered = density(nona_trRNA$geneLength[nona_trRNA$count>2])
densityRandom = density(randSampleA)

p = plot_ly(type = "scatter", mode = "lines")
p = (add_trace(p, x = densityAll$x, y = densityAll$y, name = "All genes"))
p = (add_trace(p, x = densityTransfered$x, y = densityTransfered$y, name = "transferred genes"))
p = (add_trace(p, x = densityRandom$x, y = densityRandom$y, name = "random"))
p = layout(p, title = "Frequency of size distribution of transferred genes (>2)"
           , xaxis = list(title = "Size (bp)", zeroline = F, showgrid = F)
           , yaxis = list(title = "Frequency (%)", showgrid = F)
           , plot_bgcolor='rgba(0,0,0,0)'
           , paper_bgcolor='rgba(0,0,0,0)'
           , legend = list(x = 0.8, y = 0.9))
p
  1. Motive ‘NYN’ with few repetitions known to locate RNA to the membrane. In order to see the prevelance of that motive in transferred genes There is comparision to whole genome of B. simplex, All genes and only positives transferred genes.
# Genome
genome = read.FASTA(genomefilename)
names(genome) = c("genome")
genome = paste0(as.character.DNAbin(genome$genome), collapse = "")
    # Change to biostrings object, then RC and finally as.character
genomeRC = lapply(lapply(lapply(genome, DNAString),reverseComplement), as.character)
# Genes
genes = nona_trRNA$fasta
    # Change to biostrings object, then RC and finally as.character
genesRC = lapply(lapply(lapply(genes, DNAString),reverseComplement), as.character)

# positives
pos = nona_trRNA$fasta[nona_trRNA$count > 2]
    # Change to biostrings object, then RC and finally as.character
posRC = lapply(lapply(lapply(pos, DNAString),reverseComplement), as.character)
    # random sequences in size and length of positives hits

# numericInput
maxRepetitions = 17
motivesHits = data.frame( repetitions = integer(maxRepetitions))
repetitions=0
for (index in seq(maxRepetitions)) {
  repetitions = index + 5
  pattern = paste0("([atgc][tc][atgc]){",repetitions, ",}")
  motivesHits$repetitions[index] = repetitions
  
  # motivesHits$Genome[index] = sum(stri_count_regex(tolower(unlist(genome)), pattern))
  # motivesHits$GenomeRC[index] = sum(stri_count_regex(tolower(unlist(genomeRC)), pattern))
  
  # motivesHits$Genes[index] = sum(stri_count_regex(tolower(unlist(genes)), pattern))
  # motivesHits$GenesRC[index] = sum(stri_count_regex(tolower(unlist(genesRC)), pattern))

  # motivesHits$positive[index] = sum(stri_count_regex(tolower(unlist(pos)), pattern))
  motivesHits$positiveRC[index] = sum(stri_count_regex(tolower(unlist(posRC)), pattern))
  
  # motivesHits$randomSample[index] = sum(stri_count_regex(tolower(unlist(randomSample)), pattern))
}
# Delete unnecessery big elements
rm(genome,genomeRC,genes)#,genesRC)
library(tibble)
samples = 100
randomSample = tibble(x=1:length(posRC))
genesSample = tibble(x=1:length(posRC))
# create random sequences in same number and lengthes of the transered genes
for(index in seq(samples)) {
  randomSample[index] = unlist(lapply(lapply(lapply(posRC, DNAString), length), randDNA))
  genesSample[index] = unlist(sample(genesRC, length(posRC)))
}

randomSD = matrix(nrow = maxRepetitions)
sampleSD = matrix(nrow = maxRepetitions)
repetitions=0

for (index in seq(maxRepetitions)){
  repetitions = index + 5

  pattern = paste0("([atgc][tc][atgc]){",repetitions, ",}")
  motivesHits$samples[index] = mean(stri_count_regex(tolower(genesSample), pattern))
  motivesHits$random[index] = mean(stri_count_regex(tolower(randomSample), pattern))
  
  sampleSD[index] = sd(stri_count_regex(tolower(genesSample), pattern))
  randomSD[index] = sd(stri_count_regex(tolower(randomSample), pattern))
}
# Delete unnecessery big elements
rm(randomSample)
# randomHits = data.frame(t(randomHits))
    # motivesHits$random = floor(apply(randomHits, 1, mean))
    temp = melt(motivesHits, id="repetitions")
    temp$sd = 0
    temp$sd[temp[,2] == "random"] = randomSD
    temp$sd[temp[,2] == "samples"] = sampleSD
    
    levels(temp$variable)[levels(temp$variable) == "random"] <- paste0("Random sequences (n=",samples,")")
    levels(temp$variable)[levels(temp$variable) == "positiveRC"] <- "Transferred genes"
    levels(temp$variable) = c(levels(temp$variable),"RC")

Motive size in positive (count > 2) genes

    p2 = plot_ly( temp
                  ,x = ~repetitions
                  ,y = ~value
                  ,color = ~variable
                  ,type = "scatter"
                  ,mode = "lines")
    p2 = layout( p2
                 ,yaxis = list(type = "log", title = "Occurrences (#)", zeroline = F, showgrid = F)
                 ,xaxis = list(title = "Motive Repetitions", showgrid = F)
                 ,title = "(NYN)<sub>N</sub> motive in <i>E. coli</i> genome"
                 ,font = list(size = 12)
                 ,legend = list(x = 0.8, y = 0.9)
                 ,paper_bgcolor = 'rgba(0,0,0,0)'
                 ,plot_bgcolor = 'rgba(0,0,0,0)'
                 )
    
    p2 
  title = expression(paste((NYN)["n"], " motive repetitions in ", italic("E. coli")))

   p2.1 = ggplot(temp, aes(repetitions, value, ymin = value - sd, ymax = value + sd, variable)) +
     geom_line(aes(repetitions, value, color = variable), size = 1.5) +
     theme_classic() + 
     scale_y_log10(breaks = c(0.1,1,10,100,1000)) + 
     geom_linerange(color = "grey", size = 3, alpha = 0.3) + 
     scale_x_continuous(expand = c(0,0)) +
     labs(y = "Occurrences (#)", x = "Motive repetitions", color = "") +
     theme(text = element_text(size=18)
            ,legend.position = c(0.75,0.9)
            ,panel.background = element_blank()
            ,plot.title = element_text(hjust = 0.5)
          ) +
     ggtitle(title) 
     plot_grid(p2.1, labels = c('B'))

  1. Positives genes with highest number of motive repetitions.
    repetitions = motivesHits$repetitions[which.min(motivesHits$positiveRC)]
    pattern = paste0("([tc][atgc][atgc]){",repetitions - 1, ",}")
    bestHits = grep(pattern, posRC, ignore.case = T)
    htmlTable(nona_trRNA[nona_trRNA$fasta == pos[bestHits],c("description", "ID")])
description ID
17 DNA translocase at septal ring sorting daughter chromsomes cds870

Source code for this file can be found in: https://bitbucket.org/benzzz/rnamotive/src/master/coli.rmd