QUESTION: “How do I do a pairwise sequence alignment in R?”

If you have a protein sequence, how do you do a pairwise alignment in R?

Data Preparation

We will use the data created below to address this question. You will need install the packages by install.packages(“compbio4all”), install.packages(“rentrez”), and install.packages(“Biostrings”) if you have not done so before. Call library(compbio4all), library(Biostrings), and library(rentrez).

Preliminaries

Download packages

Only do this once, then comment out of the script.

# install.packages("compbio4all")
# install.packages("rentrez")
# install.packages("Biostrings")

Load the libraries

library(compbio4all)
library(rentrez)
library(Biostrings)
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
##     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
## Loading required package: stats4
## 
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## Loading required package: IRanges
## 
## Attaching package: 'IRanges'
## The following object is masked from 'package:grDevices':
## 
##     windows
## Loading required package: XVector
## Loading required package: GenomeInfoDb
## 
## Attaching package: 'Biostrings'
## The following object is masked from 'package:base':
## 
##     strsplit

Data

Make the data for a table of accession numbers

dio1_table<-c("NP_000783",   "P49895","NA","Homo sapiens" ,    "Human",      "DIO1",
              "NP_001116123","NA",    "NA", "Pan troglodytes" , "Chimpanzee","DIO1",
              "NP_031886",   "Q61153","NA","Mus musculus",      "Mouse"    ,"DIO1",
              "NP_001091083","P24389","NA","Rattus norvegicus", "Rat",      "Dio1",
              "NP_001243226","Q2QEI3","NA","Xenopus tropicalis","Frog",     "dio1",
              "NP_001007284","F1R7E6","NA","Danio rerio",       "Fish",     "dio1")

# convert the vector to matrix using matrix()
dio1_table_matrix <- matrix(dio1_table, byrow = T, nrow = 6)

# convert the matrix to a dataframe using data.frame()
dio1_table <- as.data.frame(dio1_table_matrix,  stringsAsFactors = F)

# name columns of dataframe using names() function
colnames(dio1_table) <- c("ncbi.protein.accession", "UniProt.id", "PDB", "species", "common.name", "gene.name")

# convert table to dataframe
dio1_table <- as.data.frame(dio1_table)

Get FASTA sequences

Download all accession numbers in the table made above.

dio1s_list <- entrez_fetch_list(db = "protein", 
                          id = dio1_table$ncbi.protein.accession, 
                          rettype = "fasta")

Pull out focal sequences

dio1s_human <- dio1s_list[[1]]
dio1s_chimp <- dio1s_list[[2]]

Evaluated the focal sequence

is(dio1s_human)
##  [1] "character"               "vector"                 
##  [3] "data.frameRowLabels"     "SuperClassMethod"       
##  [5] "character_OR_connection" "character_OR_NULL"      
##  [7] "atomic"                  "EnumerationValue"       
##  [9] "vector_OR_Vector"        "vector_OR_factor"
is.vector(dio1s_human)
## [1] TRUE
is.list(dio1s_human)
## [1] FALSE
length(dio1s_human)
## [1] 1
dim(dio1s_human)
## NULL
nchar(dio1s_human)
## [1] 324
is(dio1s_chimp)
##  [1] "character"               "vector"                 
##  [3] "data.frameRowLabels"     "SuperClassMethod"       
##  [5] "character_OR_connection" "character_OR_NULL"      
##  [7] "atomic"                  "EnumerationValue"       
##  [9] "vector_OR_Vector"        "vector_OR_factor"
is.vector(dio1s_chimp)
## [1] TRUE
is.list(dio1s_chimp)
## [1] FALSE
length(dio1s_chimp)
## [1] 1
dim(dio1s_chimp)
## NULL
nchar(dio1s_chimp)
## [1] 320
dio1s_human
## [1] ">NP_000783.2 type I iodothyronine deiodinase isoform a [Homo sapiens]\nMGLPQPGLWLKRLWVLLEVAVHVVVGKVLLILFPDRVKRNILAMGEKTGMTRNPHFSHDNWIPTFFSTQY\nFWFVLKVRWQRLEDTTELGGLAPNCPVVRLSGQRCNIWEFMQGNRPLVLNFGSCTUPSFMFKFDQFKRLI\nEDFSSIADFLVIYIEEAHASDGWAFKNNMDIRNHQNLQDRLQAAHLLLARSPQCPVVVDTMQNQSSQLYA\nALPERLYIIQEGRILYKGKSGPWNYNPEEVRAVLEKLHS\n\n"
dio1s_chimp
## [1] ">NP_001116123.2 type I iodothyronine deiodinase [Pan troglodytes]\nMGLPQPGLWLKRLWVLLEVAVHVVVGKVLLILFPDRVKRNILAMGEKTGMTRNPHFSHDNWIPTFFSTQY\nFWFVLKVRWQRLEDTTELGGLAPNCPVVHLSGQRCNIWEFMQGNRPLVLNFGSCTUPSFMFKFDQFKRLI\nEDFSSIADFLVIYIEEAHASDGWAFKNNMDIRNHQNLQDRLQAAHLLLARSPQCPVVVDTMQNQSSQLYA\nALPERLYVIQEGRILYKGKSGPWNYNPEEVRAVLEKLHS\n\n"

Making a pairwise alignment

#  using two sequences of your choosing from list created 

# Human and Chimpanzee pairwise alignment

align.human.vs.chimp <- pairwiseAlignment (
                  dio1s_human, dio1s_chimp)

Display alignment

# Also displays the score of the alignment. The score is used to identify regions of similarity that may indicate functional, structural and/or evolutionary relationships between two biological sequences (protein or nucleic acid)

align.human.vs.chimp
## Global PairwiseAlignmentsSingleSubject (1 of 1)
## pattern: >NP_00078---3.2 type I iodothyroni...IIQEGRILYKGKSGPWNYNPEEVRAVLEKLHS
## 
## 
## subject: >NP_001116123.2 type I iodothyroni...VIQEGRILYKGKSGPWNYNPEEVRAVLEKLHS
## 
## 
## score: 1509.864

Additional Reading

For more information on this topic, see: https://a-little-book-of-r-for-bioinformatics.readthedocs.io/en/latest/src/chapter4.html

Keywords: 1.Biostrings 2. matrix 3. pairwiseAlignment 4. entrez_fetch_list 5. compbio4all 6. as.data.frame 7. rentrez 8. seqinr 9. score