Looking RNA motive in transferred RNA

Rotem Hadar

October 29, 2018

Table Of content:

Size distribution of transferred genes (>2)

Motive size in positive (count > 2) genes

  1. Load transferred genes.
sim_trRNA = read.csv('simplex.csv')
names(sim_trRNA)[1] = "ID"
  1. Load GFF file.
sim_gff = read.gff("simplex.gff")
sim_gff$ID = str_match(sim_gff$attributes, "ID=[a-z0-9]*")
sim_gff$ID = substr(sim_gff$ID,4,100)
  1. Match every gene with matching gff start & end values.
sim_trRNA$sstart = sim_gff$start[match(sim_trRNA$ID, sim_gff$ID)]
sim_trRNA$end = sim_gff$end[match(sim_trRNA$ID, sim_gff$ID)]
  1. Load FASTA file.
genomefilename = "GCF_001578185.1_ASM157818v1_genomic.fna"
sim_fasta = read.FASTA(genomefilename)
names(sim_fasta) = c("genome", "plasmid")
  1. with start & end coordinates get right sequence from FASTA file.
# initialize fasta to empty values
sim_trRNA$fasta = ""
# Remove genes without known start and end points in the GFF file.
nona_trRNA = sim_trRNA[!is.na(sim_trRNA$sstart),]
# b. Attach positive gene with matching sequence
for (index in which(nona_trRNA$count >= 0)) {
  bases = nona_trRNA$sstart[index]:nona_trRNA$end[index]
  nucleotide2 = sim_fasta$genome[bases]
  nucleotide3 = lapply(nucleotide2, as.character.DNAbin)
  nona_trRNA$fasta[index] = paste0( unlist(nucleotide3), collapse='')
  nona_trRNA$geneLength[index] = nchar(nona_trRNA$fasta[index])
}
# remove big unnecessary objects
rm(sim_fasta)

Size distribution of transferred genes (>2)

# randSample = matrix( nrow = length(nona_trRNA$geneLength[nona_trRNA$count>2]), ncol = 100)
# for ( i in seq(100)){P
  randSampleA = sample(x = nona_trRNA$geneLength, size = length(nona_trRNA$geneLength[nona_trRNA$count>2]))
# }
# randSampleA = rowMeans(randSample, na.rm = T)

densityAll = density(nona_trRNA$geneLength)
densityTransfered = density(nona_trRNA$geneLength[nona_trRNA$count>2])
densityRandom = density(randSampleA)

p = plot_ly(type = "scatter", mode = "lines")
p = (add_trace(p, x = densityAll$x, y = densityAll$y, name = "All genes"))
p = (add_trace(p, x = densityTransfered$x, y = densityTransfered$y, name = "transferred genes"))
p = (add_trace(p, x = densityRandom$x, y = densityRandom$y, name = "random"))
p = layout(p, title = "Frequency of size distribution of transferred genes (>2)"
           , xaxis = list(title = "Size (bp)", zeroline = F, showgrid = F)
           , yaxis = list(title = "Frequency (%)", showgrid = F)
           , plot_bgcolor='rgba(0,0,0,0)'
           , paper_bgcolor='rgba(0,0,0,0)'
           , legend = list(x = 0.8, y = 0.9))
p
  1. Motive ‘NYN’ with few repetitions known to locate RNA to the membrane. In order to see the prevelance of that motive in transferred genes There is comparision to whole genome of B. simplex, All genes and only positives transferred genes.
# Genome
genome = read.FASTA(genomefilename)
names(genome) = c("genome", "plasmid")
genome = paste0(as.character.DNAbin(genome$genome), collapse = "")
    # Change to biostrings object, then RC and finally as.character
genomeRC = lapply(lapply(lapply(genome, DNAString),reverseComplement), as.character)
# Genes
genes = nona_trRNA$fasta
    # Change to biostrings object, then RC and finally as.character
genesRC = lapply(lapply(lapply(genes, DNAString),reverseComplement), as.character)

# positives
pos = nona_trRNA$fasta[nona_trRNA$count > 2]
    # Change to biostrings object, then RC and finally as.character
posRC = lapply(lapply(lapply(pos, DNAString),reverseComplement), as.character)
    # random sequences in size and length of positives hits

# numericInput
maxRepetitions = 27
motivesHits = data.frame( repetitions = integer(maxRepetitions))
repetitions=0
for (index in seq(maxRepetitions)) {
  repetitions = index + 5
  pattern = paste0("([atgc][tc][atgc]){",repetitions, ",}")
  motivesHits$repetitions[index] = repetitions
  
  # motivesHits$Genome[index] = sum(stri_count_regex(tolower(unlist(genome)), pattern))
  # motivesHits$GenomeRC[index] = sum(stri_count_regex(tolower(unlist(genomeRC)), pattern))
  
  # motivesHits$Genes[index] = sum(stri_count_regex(tolower(unlist(genes)), pattern))
  # motivesHits$GenesRC[index] = sum(stri_count_regex(tolower(unlist(genesRC)), pattern))

  # motivesHits$positive[index] = sum(stri_count_regex(tolower(unlist(pos)), pattern))
  motivesHits$positiveRC[index] = sum(stri_count_regex(tolower(unlist(posRC)), pattern))
  
  # motivesHits$randomSample[index] = sum(stri_count_regex(tolower(unlist(randomSample)), pattern))
}
# Delete unnecessery big elements
rm(genome,genomeRC,genes)#,genesRC)
library(tibble)
samples = 50
randomSample = tibble(x=1:length(posRC))
genesSample = tibble(x=1:length(posRC))
# create random sequences in same number and lengthes of the transered genes
for(index in seq(samples)) {
  randomSample[index] = unlist(lapply(lapply(lapply(posRC, DNAString), length), randDNA))
  genesSample[index] = unlist(sample(genesRC, length(posRC)))
}

randomSD = matrix(nrow = maxRepetitions)
sampleSD = matrix(nrow = maxRepetitions)
repetitions=0

for (index in seq(maxRepetitions)){
  repetitions = index + 5

  pattern = paste0("([atgc][tc][atgc]){",repetitions, ",}")
  motivesHits$samples[index] = mean(stri_count_regex(tolower(genesSample), pattern))
  motivesHits$random[index] = mean(stri_count_regex(tolower(randomSample), pattern))
  
  sampleSD[index] = sd(stri_count_regex(tolower(genesSample), pattern))
  randomSD[index] = sd(stri_count_regex(tolower(randomSample), pattern))
}
# Delete unnecessery big elements
rm(randomSample)
# randomHits = data.frame(t(randomHits))
    # motivesHits$random = floor(apply(randomHits, 1, mean))
    # motivesHits$sampling = floor(apply(sampleHits, 1, mean))
    temp = melt(motivesHits, id="repetitions")
    temp$sd = 0
    temp$sd[temp[,2] == "random"] = randomSD
    temp$sd[temp[,2] == "samples"] = sampleSD
    
    levels(temp$variable)[levels(temp$variable) == "random"] <- paste0("Random sequences (n=",samples,")")
    levels(temp$variable)[levels(temp$variable) == "positiveRC"] <- "Transferred genes"
    levels(temp$variable) = c(levels(temp$variable),"RC")

Motive size in positive (count > 2) genes

    p2 = plot_ly( temp
                  ,x = ~repetitions
                  ,y = ~value
                  # ,error_x = ~sd
                  ,color = ~variable
                  ,type = "scatter"
                  ,mode = "lines")
    p2 = layout( p2
                 ,yaxis = list(type = "log", title = "Hits", zeroline = F, showgrid = F)
                 ,xaxis = list(title = "Motive Repetitions", showgrid = F)
                 ,title = "(NYN)<sub>N</sub> motive in <i>B. simplex</i> genome"
                 ,font = list(size = 12)
                 ,legend = list(x = 0.8, y = 0.9)
                 ,paper_bgcolor = 'rgba(0,0,0,0)'
                 ,plot_bgcolor = 'rgba(0,0,0,0)'
                 )
    p2 %>% add_annotations(text = "<b> RC = <br>Reverse <br>Complement</b>", x = 23, y = 2.2, showarrow = F)
    title = expression(paste((NYN)["n"], " motive repetitions in ", italic("B. simplex")))

   p2.1 = ggplot(temp, aes(repetitions, value, ymin = value - sd, ymax = value + sd, variable)) +
     geom_line(aes(repetitions, value, color = variable), size = 1.5) +
     theme_classic() + 
     scale_y_log10(breaks = c(1,10,100,1000,10000)) +
     geom_linerange(color = "grey", size = 3, alpha = 0.3) + 
     scale_x_continuous(expand = c(0,0)) +
     labs(y = "Occurrences (#)", x = "Motive repetitions", color = "") +
     theme(text = element_text(size=18)
            ,legend.position = c(0.75,0.9)
            ,panel.background = element_blank()
            ,plot.title = element_text(hjust = 0.5)
          ) +
     ggtitle(title) 
     plot_grid(p2.1, labels = c('A'))

  1. Positives genes with highest number of motive repetitions.
bestHits = grep(pattern, posRC, ignore.case = T)
htmlTable(nona_trRNA[nona_trRNA$fasta == pos[bestHits],c("description", "gene_id")])
## Warning in nona_trRNA$fasta == pos[bestHits]: longer object length is not a
## multiple of shorter object length
description gene_id
322 penicillin-binding protein WP_061461217.1
3996 peptidoglycan glycosyltransferase WP_081108882.1
stri_count_regex(pattern, nona_trRNA$fasta[322])
## [1] 0

Source code for this file can be found in: https://bitbucket.org/benzzz/rnamotive/src/master/simplex.rmdd