Table Of content:
Size distribution of transferred genes (>2)
Motive size in positive (count > 2) genes
- Load transferred genes.
trRNA = read.csv('e_coli.csv')
names(trRNA)[1] = "ID"
trRNA$ID = substr(trRNA$ID,8,30)- Load GFF file.
gff = read.gff("GCF_000005845.2_ASM584v2_genomic.gff")
gff$ID = str_match(gff$attributes, "ID=[a-z0-9]*")
gff$ID = substr(gff$ID,4,100)- Match every gene with matching gff start & end values.
trRNA$sstart = gff$start[match(trRNA$ID, gff$ID)]
trRNA$end = gff$end[match(trRNA$ID, gff$ID)]- Load FASTA file.
genomefilename = "GCF_000005845.2_ASM584v2_genomic.fna"
fasta = read.FASTA(genomefilename)
names(fasta) = c("genome")- with start & end coordinates get right sequence from FASTA file.
# initialize fasta to empty values
trRNA$fasta = ""
# Remove genes without known start and end points in the GFF file.
nona_trRNA = trRNA[!is.na(trRNA$sstart),]
# b. Attach positive gene with matching sequence
for (index in which(nona_trRNA$count >= 0)) {
bases = nona_trRNA$sstart[index]:nona_trRNA$end[index]
nucleotide2 = fasta$genome[bases]
nucleotide3 = lapply(nucleotide2, as.character.DNAbin)
nona_trRNA$fasta[index] = paste0( unlist(nucleotide3), collapse='')
nona_trRNA$geneLength[index] = nchar(nona_trRNA$fasta[index])
}
# remove big unnecessary objects
rm(fasta)Size distribution of transferred genes (>2)
# randSample = matrix( nrow = length(nona_trRNA$geneLength[nona_trRNA$count>2]), ncol = 100)
# for ( i in seq(100)){P
randSampleA = sample(x = nona_trRNA$geneLength, size = length(nona_trRNA$geneLength[nona_trRNA$count>2]))
# }
# randSampleA = rowMeans(randSample, na.rm = T)
densityAll = density(nona_trRNA$geneLength)
densityTransfered = density(nona_trRNA$geneLength[nona_trRNA$count>2])
densityRandom = density(randSampleA)
p = plot_ly(type = "scatter", mode = "lines")
p = (add_trace(p, x = densityAll$x, y = densityAll$y, name = "All genes"))
p = (add_trace(p, x = densityTransfered$x, y = densityTransfered$y, name = "transferred genes"))
p = (add_trace(p, x = densityRandom$x, y = densityRandom$y, name = "random"))
p = layout(p, title = "Frequency of size distribution of transferred genes (>2)"
, xaxis = list(title = "Size (bp)", zeroline = F, showgrid = F)
, yaxis = list(title = "Frequency (%)", showgrid = F)
, plot_bgcolor='rgba(0,0,0,0)'
, paper_bgcolor='rgba(0,0,0,0)'
, legend = list(x = 0.8, y = 0.9))
p- Motive ‘NYN’ with few repetitions known to locate RNA to the membrane. In order to see the prevelance of that motive in transferred genes There is comparision to whole genome of B. simplex, All genes and only positives transferred genes.
# Genome
genome = read.FASTA(genomefilename)
names(genome) = c("genome")
genome = paste0(as.character.DNAbin(genome$genome), collapse = "")
# Change to biostrings object, then RC and finally as.character
genomeRC = lapply(lapply(lapply(genome, DNAString),reverseComplement), as.character)
# Genes
genes = nona_trRNA$fasta
# Change to biostrings object, then RC and finally as.character
genesRC = lapply(lapply(lapply(genes, DNAString),reverseComplement), as.character)
# positives
pos = nona_trRNA$fasta[nona_trRNA$count > 2]
# Change to biostrings object, then RC and finally as.character
posRC = lapply(lapply(lapply(pos, DNAString),reverseComplement), as.character)
# random sequences in size and length of positives hits
# numericInput
maxRepetitions = 17
motivesHits = data.frame( repetitions = integer(maxRepetitions))
repetitions=0
for (index in seq(maxRepetitions)) {
repetitions = index + 5
pattern = paste0("([atgc][tc][atgc]){",repetitions, ",}")
motivesHits$repetitions[index] = repetitions
# motivesHits$Genome[index] = sum(stri_count_regex(tolower(unlist(genome)), pattern))
# motivesHits$GenomeRC[index] = sum(stri_count_regex(tolower(unlist(genomeRC)), pattern))
# motivesHits$Genes[index] = sum(stri_count_regex(tolower(unlist(genes)), pattern))
# motivesHits$GenesRC[index] = sum(stri_count_regex(tolower(unlist(genesRC)), pattern))
# motivesHits$positive[index] = sum(stri_count_regex(tolower(unlist(pos)), pattern))
motivesHits$positiveRC[index] = sum(stri_count_regex(tolower(unlist(posRC)), pattern))
# motivesHits$randomSample[index] = sum(stri_count_regex(tolower(unlist(randomSample)), pattern))
}
# Delete unnecessery big elements
rm(genome,genomeRC,genes)#,genesRC)library(tibble)
samples = 100
randomSample = tibble(x=1:length(posRC))
genesSample = tibble(x=1:length(posRC))
# create random sequences in same number and lengthes of the transered genes
for(index in seq(samples)) {
randomSample[index] = unlist(lapply(lapply(lapply(posRC, DNAString), length), randDNA))
genesSample[index] = unlist(sample(genesRC, length(posRC)))
}
randomSD = matrix(nrow = maxRepetitions)
sampleSD = matrix(nrow = maxRepetitions)
repetitions=0
for (index in seq(maxRepetitions)){
repetitions = index + 5
pattern = paste0("([atgc][tc][atgc]){",repetitions, ",}")
motivesHits$samples[index] = mean(stri_count_regex(tolower(genesSample), pattern))
motivesHits$random[index] = mean(stri_count_regex(tolower(randomSample), pattern))
sampleSD[index] = sd(stri_count_regex(tolower(genesSample), pattern))
randomSD[index] = sd(stri_count_regex(tolower(randomSample), pattern))
}
# Delete unnecessery big elements
rm(randomSample)
# randomHits = data.frame(t(randomHits)) # motivesHits$random = floor(apply(randomHits, 1, mean))
temp = melt(motivesHits, id="repetitions")
temp$sd = 0
temp$sd[temp[,2] == "random"] = randomSD
temp$sd[temp[,2] == "samples"] = sampleSD
levels(temp$variable)[levels(temp$variable) == "random"] <- paste0("Random sequences (n=",samples,")")
levels(temp$variable)[levels(temp$variable) == "positiveRC"] <- "Transferred genes"
levels(temp$variable) = c(levels(temp$variable),"RC")Motive size in positive (count > 2) genes
p2 = plot_ly( temp
,x = ~repetitions
,y = ~value
,color = ~variable
,type = "scatter"
,mode = "lines")
p2 = layout( p2
,yaxis = list(type = "log", title = "Occurrences (#)", zeroline = F, showgrid = F)
,xaxis = list(title = "Motive Repetitions", showgrid = F)
,title = "(NYN)<sub>N</sub> motive in <i>E. coli</i> genome"
,font = list(size = 12)
,legend = list(x = 0.8, y = 0.9)
,paper_bgcolor = 'rgba(0,0,0,0)'
,plot_bgcolor = 'rgba(0,0,0,0)'
)
p2 title = expression(paste((NYN)["n"], " motive repetitions in ", italic("E. coli")))
p2.1 = ggplot(temp, aes(repetitions, value, ymin = value - sd, ymax = value + sd, variable)) +
geom_line(aes(repetitions, value, color = variable), size = 1.5) +
theme_classic() +
scale_y_log10(breaks = c(0.1,1,10,100,1000)) +
geom_linerange(color = "grey", size = 3, alpha = 0.3) +
scale_x_continuous(expand = c(0,0)) +
labs(y = "Occurrences (#)", x = "Motive repetitions", color = "") +
theme(text = element_text(size=18)
,legend.position = c(0.75,0.9)
,panel.background = element_blank()
,plot.title = element_text(hjust = 0.5)
) +
ggtitle(title)
plot_grid(p2.1, labels = c('B'))- Positives genes with highest number of motive repetitions.
repetitions = motivesHits$repetitions[which.min(motivesHits$positiveRC)]
pattern = paste0("([tc][atgc][atgc]){",repetitions - 1, ",}")
bestHits = grep(pattern, posRC, ignore.case = T)
htmlTable(nona_trRNA[nona_trRNA$fasta == pos[bestHits],c("description", "ID")])| description | ID | |
|---|---|---|
| 17 | DNA translocase at septal ring sorting daughter chromsomes | cds870 |
Source code for this file can be found in: https://bitbucket.org/benzzz/rnamotive/src/master/coli.rmd