First, install the ggseqlogo package from CRAN:
#install.packages("ggseqlogo")
Load the packages:
# Load the required packages
library(ggplot2)
library(ggseqlogo)
## Warning: package 'ggseqlogo' was built under R version 4.1.2
# Load the bundled example data sets (seqs_dna, pfms_dna and seqs_aa)
data("ggseqlogo_sample")
This loads three sample data sets:
seqs_dna: sets of binding sites for 12 transcription factors obtained from FASTA files in JASPAR. This is represented as a named list of character vectors, where the names represent the JASPAR ID.
pfms_dna: a list of position frequency matrices for four transcription factors obtained from JASPAR. This is represented as a list of matrices, where the names represent the JASPAR ID.
seqs_aa: sets of kinase-substrate phosphorylation sites obtained from Wagih et al. This is represented as a named list of character vectors where the names represent the names of the kinases associated with the phosphosites.
### Plot a sequence logo
You can draw a sequence logo using the ggplot function with geom_logo. Let’s try this on sequences for one of the transcription factors from JASPAR:
# Build the logo layer by layer: empty canvas, logo geom, then the logo theme
ggplot() +
  geom_logo(seqs_dna[["MA0001.1"]]) +
  theme_logo()
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
You can also use ggseqlogo as a shortcut to do the same thing. This is a wrapper function which adds theme_logo by default and performs any required faceting if multiple sequence logos are to be drawn.
# Single-call shortcut for the geom_logo() + theme_logo() plot
ggseqlogo(seqs_dna[["MA0001.1"]])
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
ggseqlogo accepts two types of input:
- Sequences: a character vector of aligned sequences
- Matrices: a position frequency matrix, where each row is a letter and each column is a position. Note: the matrix must have row names giving the letters. A matrix can also be a custom height matrix.
The following generates a sequence logo using a position frequency matrix from the sample data
# Draw a logo directly from a position frequency matrix
ggseqlogo(pfms_dna[["MA0018.2"]])
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
### Plotting methods
ggseqlogo supports two sequence logo methods through the method option: ‘bits’ and ‘probability’ (abbreviated to ‘prob’ in the example below). By default, ‘bits’ is used.
# Logo of the same binding sites using the 'bits' height method
p1 <- ggseqlogo(seqs_dna[["MA0001.1"]], method = "bits")
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
# ... and using the 'prob' height method
p2 <- ggseqlogo(seqs_dna[["MA0001.1"]], method = "prob")
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
# Show both logos together in one figure
gridExtra::grid.arrange(p1, p2)
Amino acids, DNA and RNA sequence types are all supported by ggseqlogo. By default, ggseqlogo will try to guess your sequence type. You can explicitly set the sequence type through the seq_type option.
Let’s try generating an amino acid sequence logo using kinase-substrate phosphorylation data:
# State the sequence type explicitly instead of relying on auto-detection
ggseqlogo(seqs_aa[["AKT1"]], seq_type = "aa")
If you want to define a custom alphabet, you can do so by setting namespace to your desired alphabet. For example, let’s say you wanted a sequence logo of the digits 1 to 4:
# Map each nucleotide to a digit (A -> 1, T -> 2, G -> 3, C -> 4)
seqs_numeric <- chartr("ATGC", "1234", seqs_dna[["MA0001.1"]])
# Pass the digits as the custom alphabet via `namespace`
ggseqlogo(seqs_numeric, namespace = 1:4, method = "p")
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
### Colour schemes
Preset colour schemes: ggseqlogo has preset colour schemes that can be set using the col_scheme parameter. By default, col_scheme is set to auto, so that the colour scheme is chosen automatically based on your sequence type.
You can adjust the parameter. For amino acids you can pick from the following: chemistry, hydrophobicity, clustalx, taylor. For DNA and RNA sequences: nucleotide and base_pairing. For example:
# Select the preset 'base_pairing' colour scheme by name
ggseqlogo(seqs_dna[["MA0001.1"]], col_scheme = "base_pairing")
Custom colour schemes: If the presets are not enough for you, you can define custom discrete or continuous colour schemes using the make_col_scheme function.
# Two-group discrete scheme: A and T share purple, C and G share blue
cs1 <- make_col_scheme(
  chars  = c("A", "T", "C", "G"),
  groups = c("gr1", "gr1", "gr2", "gr2"),
  cols   = c("purple", "purple", "blue", "blue")
)
# Draw the logo using the custom scheme
ggseqlogo(seqs_dna[["MA0001.1"]], col_scheme = cs1)
You can plot more than one sequence logo at the same time with the help of facets. ggseqlogo will accept a named list of sequences or matrices. The names of the list will be used as the facet titles.
# Pass the whole named list to get one facet per element, four per row
ggseqlogo(seqs_dna, ncol = 4)
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
### Custom-height logos
If you have your own height metric for each letter, simply create a matrix where each cell is the desired height, and set the method to custom. You can even have negative heights. Here’s a simple example:
# Fix the RNG seed so the random heights are reproducible
set.seed(123)
# 4 x 5 matrix of standard-normal heights, one row per nucleotide
custom_mat <- matrix(rnorm(20), nrow = 4)
rownames(custom_mat) <- c("A", "T", "G", "C")
# Use the matrix entries as letter heights and relabel the y axis
ggseqlogo(custom_mat, method = "custom", seq_type = "dna") +
  ylab("my custom height")
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
### Fonts
You can adjust the font by setting the font parameter. To list all the available fonts, use the list_fonts function.
# Collect the available font names. FALSE is spelled out because the
# shorthand `F` is an ordinary variable that can be reassigned.
fonts <- list_fonts(FALSE)
# Draw the same logo once per font, titling each plot with the font name
p_list <- lapply(fonts, function(f) {
  ggseqlogo(seqs_dna$MA0001.1, font = f) + ggtitle(f)
})
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
# Arrange the per-font logos in a two-column grid
do.call(
  what = gridExtra::grid.arrange,
  args = c(p_list, list(ncol = 2))
)
### Annotating logos
Overlaying annotation onto sequence logos is straightforward in ggseqlogo with ggplot2. Here is an example of drawing rectangles, lines and text.
# Layers are added in this order: highlight rectangle, logo,
# horizontal segment, text label, and finally the logo theme
ggplot() +
  annotate(
    "rect",
    xmin = 0.5, xmax = 3.5, ymin = -0.05, ymax = 1.9,
    alpha = .1, col = "black", fill = "yellow"
  ) +
  geom_logo(seqs_dna[["MA0001.1"]], stack_width = 0.90) +
  annotate("segment", x = 4, xend = 8, y = 1.2, yend = 1.2, size = 2) +
  annotate("text", x = 6, y = 1.3, label = "Text annotation") +
  theme_logo()
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
### Combining plots
Combining sequence logos with other plots generated by ggplot2 is simple. I’ll demonstrate with an example combining a sequence logo, sequence alignment and bar plot.
# Binding sites shown in the logo panel
seqs <- seqs_dna[["MA0008.1"]]
# Logo panel with the x-axis tick labels blanked out
p1 <- ggseqlogo(seqs) +
  theme(axis.text.x = element_blank())
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
# Alignment table: one row per base, 3 species x 8 positions
aln <- data.frame(
  letter  = unlist(strsplit("AGATAAGATGATAAAAAGATAAGA", split = "")),
  species = rep(c("a", "b", "c"), each = 8),
  x       = rep(1:8, times = 3)
)
# Mark rows 2, 15, 20 and 23 as mutated; every other row is 'no'
aln$mut <- replace(rep("no", nrow(aln)), c(2, 15, 20, 23), "yes")
# Alignment panel: one text glyph per base, with colour and size both
# keyed on the `mut` flag; legend and x-axis tick labels are hidden
p2 <- ggplot(aln, aes(x, species)) +
  geom_text(aes(label = letter, color = mut, size = mut)) +
  scale_x_continuous(breaks = 1:10, expand = c(0.105, 0)) +
  xlab("") +
  scale_color_manual(values = c("black", "red")) +
  scale_size_manual(values = c(5, 6)) +
  theme_logo() +
  theme(legend.position = "none", axis.text.x = element_blank())
# Bar plot data: a randomly sampled conservation score (1-100) per position
bp_data <- data.frame(x = 1:8, conservation = sample(1:100, 8))
# Bar plot panel of the conservation scores
p3 <- ggplot(bp_data, aes(x, conservation)) +
  geom_bar(stat = "identity", fill = "grey") +
  theme_logo() +
  scale_x_continuous(breaks = 1:10, expand = c(0.105, 0)) +
  xlab("")
# Attach cowplot with library() so that a missing installation stops here
# with a clear error; require() would only return FALSE and leave
# plot_grid() to fail later with "could not find function".
suppressMessages(library(cowplot))
# Stack the three panels in a single column, aligned vertically
plot_grid(p1, p2, p3, ncol = 1, align = "v")
### Citation
Wagih, Omar. ggseqlogo: a versatile R package for drawing sequence logos. Bioinformatics (2017). https://doi.org/10.1093/bioinformatics/btx469