QUESTION: “How do I do a pairwise sequence alignment in R?”
If you have a protein sequence, how do you do a pairwise alignment in R?
We will use the data created below to address this question. You will need install the packages by install.packages(“compbio4all”), install.packages(“rentrez”), and install.packages(“Biostrings”) if you have not done so before. Call library(compbio4all), library(Biostrings), and library(rentrez).
Only do this once, then comment out of the script.
# install.packages("compbio4all")
# install.packages("rentrez")
# install.packages("Biostrings")
library(compbio4all)
library(rentrez)
library(Biostrings)
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind, colnames,
## dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
## union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
## Loading required package: stats4
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
## Loading required package: IRanges
##
## Attaching package: 'IRanges'
## The following object is masked from 'package:grDevices':
##
## windows
## Loading required package: XVector
## Loading required package: GenomeInfoDb
##
## Attaching package: 'Biostrings'
## The following object is masked from 'package:base':
##
## strsplit
Make the data for a table of accession numbers
dio1_table<-c("NP_000783", "P49895","NA","Homo sapiens" , "Human", "DIO1",
"NP_001116123","NA", "NA", "Pan troglodytes" , "Chimpanzee","DIO1",
"NP_031886", "Q61153","NA","Mus musculus", "Mouse" ,"DIO1",
"NP_001091083","P24389","NA","Rattus norvegicus", "Rat", "Dio1",
"NP_001243226","Q2QEI3","NA","Xenopus tropicalis","Frog", "dio1",
"NP_001007284","F1R7E6","NA","Danio rerio", "Fish", "dio1")
# convert the vector to matrix using matrix()
dio1_table_matrix <- matrix(dio1_table, byrow = T, nrow = 6)
# convert the matrix to a dataframe using data.frame()
dio1_table <- as.data.frame(dio1_table_matrix, stringsAsFactors = F)
# name columns of dataframe using names() function
colnames(dio1_table) <- c("ncbi.protein.accession", "UniProt.id", "PDB", "species", "common.name", "gene.name")
# convert table to dataframe
dio1_table <- as.data.frame(dio1_table)
dio1s_list <- entrez_fetch_list(db = "protein",
id = dio1_table$ncbi.protein.accession,
rettype = "fasta")
dio1s_human <- dio1s_list[[1]]
dio1s_chimp <- dio1s_list[[2]]
is(dio1s_human)
## [1] "character" "vector"
## [3] "data.frameRowLabels" "SuperClassMethod"
## [5] "character_OR_connection" "character_OR_NULL"
## [7] "atomic" "EnumerationValue"
## [9] "vector_OR_Vector" "vector_OR_factor"
is.vector(dio1s_human)
## [1] TRUE
is.list(dio1s_human)
## [1] FALSE
length(dio1s_human)
## [1] 1
dim(dio1s_human)
## NULL
nchar(dio1s_human)
## [1] 324
is(dio1s_chimp)
## [1] "character" "vector"
## [3] "data.frameRowLabels" "SuperClassMethod"
## [5] "character_OR_connection" "character_OR_NULL"
## [7] "atomic" "EnumerationValue"
## [9] "vector_OR_Vector" "vector_OR_factor"
is.vector(dio1s_chimp)
## [1] TRUE
is.list(dio1s_chimp)
## [1] FALSE
length(dio1s_chimp)
## [1] 1
dim(dio1s_chimp)
## NULL
nchar(dio1s_chimp)
## [1] 320
dio1s_human
## [1] ">NP_000783.2 type I iodothyronine deiodinase isoform a [Homo sapiens]\nMGLPQPGLWLKRLWVLLEVAVHVVVGKVLLILFPDRVKRNILAMGEKTGMTRNPHFSHDNWIPTFFSTQY\nFWFVLKVRWQRLEDTTELGGLAPNCPVVRLSGQRCNIWEFMQGNRPLVLNFGSCTUPSFMFKFDQFKRLI\nEDFSSIADFLVIYIEEAHASDGWAFKNNMDIRNHQNLQDRLQAAHLLLARSPQCPVVVDTMQNQSSQLYA\nALPERLYIIQEGRILYKGKSGPWNYNPEEVRAVLEKLHS\n\n"
dio1s_chimp
## [1] ">NP_001116123.2 type I iodothyronine deiodinase [Pan troglodytes]\nMGLPQPGLWLKRLWVLLEVAVHVVVGKVLLILFPDRVKRNILAMGEKTGMTRNPHFSHDNWIPTFFSTQY\nFWFVLKVRWQRLEDTTELGGLAPNCPVVHLSGQRCNIWEFMQGNRPLVLNFGSCTUPSFMFKFDQFKRLI\nEDFSSIADFLVIYIEEAHASDGWAFKNNMDIRNHQNLQDRLQAAHLLLARSPQCPVVVDTMQNQSSQLYA\nALPERLYVIQEGRILYKGKSGPWNYNPEEVRAVLEKLHS\n\n"
# using two sequences of your choosing from list created
# Human and Chimpanzee pairwise alignment
align.human.vs.chimp <- pairwiseAlignment (
dio1s_human, dio1s_chimp)
# Also displays the score of the alignment. The score is used to identify regions of similarity that may indicate functional, structural and/or evolutionary relationships between two biological sequences (protein or nucleic acid)
align.human.vs.chimp
## Global PairwiseAlignmentsSingleSubject (1 of 1)
## pattern: >NP_00078---3.2 type I iodothyroni...IIQEGRILYKGKSGPWNYNPEEVRAVLEKLHS
##
##
## subject: >NP_001116123.2 type I iodothyroni...VIQEGRILYKGKSGPWNYNPEEVRAVLEKLHS
##
##
## score: 1509.864
Additional Reading
For more information on this topic, see: https://a-little-book-of-r-for-bioinformatics.readthedocs.io/en/latest/src/chapter4.html
Keywords: 1.Biostrings 2. matrix 3. pairwiseAlignment 4. entrez_fetch_list 5. compbio4all 6. as.data.frame 7. rentrez 8. seqinr 9. score