QUESTION: “How do I do a pairwise sequence alignment in R?”
If you have protein sequences from different species, how do you do a pairwise alignment to compare them in R?
We will use the data created below to address this question. You will need install the packages by install.packages(“compbio4all”), install.packages(“rentrez”), install.packages(“Biostrings”), install.packages(“seqinr”), and install.packages(“ggmsa”) if you have not done so before. Call library(compbio4all), library(rentrez), and library(msa).
Only do this once, then comment out of the script.
# install.packages("compbio4all")
# install.packages("rentrez")
# install.packages("Biostrings")
# install.packages("ggmsa")
library(compbio4all)
library(rentrez)
library(Biostrings)
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind, colnames,
## dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
## union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
## Loading required package: stats4
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
## Loading required package: IRanges
##
## Attaching package: 'IRanges'
## The following object is masked from 'package:grDevices':
##
## windows
## Loading required package: XVector
## Loading required package: GenomeInfoDb
##
## Attaching package: 'Biostrings'
## The following object is masked from 'package:base':
##
## strsplit
library(msa)
Make the data for a table of accession numbers
dio1_table<-c("NP_000783", "P49895","NA","Homo sapiens" , "Human", "DIO1",
"NP_001116123","NA", "NA", "Pan troglodytes" , "Chimpanzee","DIO1",
"NP_031886", "Q61153","NA","Mus musculus", "Mouse" ,"DIO1",
"NP_001091083","P24389","NA","Rattus norvegicus", "Rat", "Dio1",
"NP_001243226","Q2QEI3","NA","Xenopus tropicalis","Frog", "dio1",
"NP_001007284","F1R7E6","NA","Danio rerio", "Fish", "dio1")
# convert the vector to matrix using matrix()
dio1_table_matrix <- matrix(dio1_table, byrow = T, nrow = 6)
# convert the matrix to a dataframe using data.frame()
dio1_table <- as.data.frame(dio1_table_matrix, stringsAsFactors = F)
# name columns of dataframe using names() function
colnames(dio1_table) <- c("ncbi.protein.accession", "UniProt.id", "PDB", "species", "common.name", "gene.name")
# convert table to dataframe
dio1_table <- as.data.frame(dio1_table)
dio1s_list <- entrez_fetch_list(db = "protein",
id = dio1_table$ncbi.protein.accession,
rettype = "fasta")
# Go through list and remove FASTA header from each sequence
for(i in 1:length(dio1s_list)){
dio1s_list[[i]] <- fasta_cleaner(dio1s_list[[i]], parse = F)
}
Remove FASTA header
for(i in 1:length(dio1s_list)){
dio1s_list[[i]] <- fasta_cleaner(dio1s_list[[i]], parse = F)
}
dio1s_vector <- rep(NA, length(dio1s_list))
names(dio1s_list)
## [1] "NP_000783" "NP_001116123" "NP_031886" "NP_001091083" "NP_001243226"
## [6] "NP_001007284"
for(i in 1:length(dio1s_vector)){
dio1s_vector[i] <- dio1s_list[[i]]
}
Naming the vector
names(dio1s_vector) <- names(dio1s_list)
dio1s_vector_ss <- Biostrings::AAStringSet(dio1s_vector)
dio1s_align <-msa(dio1s_vector_ss,
method = "ClustalW")
## use default substitution matrix
Default output of MSA
msa(dio1s_vector_ss, method = "ClustalW")
Change class of alignment
class(dio1s_align) <- "AAMultipleAlignment"
# run ggmsa
ggmsa::ggmsa(dio1s_align,
start = 0,
end = 50)
## Registered S3 methods overwritten by 'ggalt':
## method from
## grid.draw.absoluteGrob ggplot2
## grobHeight.absoluteGrob ggplot2
## grobWidth.absoluteGrob ggplot2
## grobX.absoluteGrob ggplot2
## grobY.absoluteGrob ggplot2
Additional Reading
For more information on this topic, see: https://bioconductor.org/packages/release/bioc/vignettes/msa/inst/doc/msa.pdf
Keywords: 1.Biostrings 2. matrix 3. pairwiseAlignment 4. entrez_fetch_list 5. compbio4all 6. as.data.frame 7. rentrez 8. ggmsa 9. msa