install Bioconductor and relevant packages
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install("Biostrings")
## Bioconductor version 3.20 (BiocManager 1.30.25), R 4.4.1 (2024-06-14 ucrt)
## Warning: package(s) not installed when version(s) same as or greater than current; use
## `force = TRUE` to re-install: 'Biostrings'
## Installation paths not writeable, unable to update packages
## path: C:/Program Files/R/R-4.4.1/library
## packages:
## boot, cluster, foreign, MASS, Matrix, nlme, survival
BiocManager::install("seqinr")
## Bioconductor version 3.20 (BiocManager 1.30.25), R 4.4.1 (2024-06-14 ucrt)
## Warning: package(s) not installed when version(s) same as or greater than current; use
## `force = TRUE` to re-install: 'seqinr'
## Installation paths not writeable, unable to update packages
## path: C:/Program Files/R/R-4.4.1/library
## packages:
## boot, cluster, foreign, MASS, Matrix, nlme, survival
library(Biostrings)
## Warning: package 'Biostrings' was built under R version 4.4.2
## Loading required package: BiocGenerics
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, aperm, append, as.data.frame, basename, cbind,
## colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
## get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
## match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
## Position, rank, rbind, Reduce, rownames, sapply, saveRDS, setdiff,
## table, tapply, union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
## Loading required package: stats4
##
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:utils':
##
## findMatches
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
## Loading required package: IRanges
## Warning: package 'IRanges' was built under R version 4.4.2
##
## Attaching package: 'IRanges'
## The following object is masked from 'package:grDevices':
##
## windows
## Loading required package: XVector
## Loading required package: GenomeInfoDb
## Warning: package 'GenomeInfoDb' was built under R version 4.4.2
##
## Attaching package: 'Biostrings'
## The following object is masked from 'package:base':
##
## strsplit
library(seqinr)
## Warning: package 'seqinr' was built under R version 4.4.2
##
## Attaching package: 'seqinr'
## The following object is masked from 'package:Biostrings':
##
## translate
Protein Sequence Analysis
Read Protein FASTA file
library(seqinr)
url <- "https://www.ncbi.nlm.nih.gov/WebSub/html/help/sample_files/protein-sample.txt"
protein_fasta <- read.fasta(url)
## Warning in readLines(file): incomplete final line found on
## 'https://www.ncbi.nlm.nih.gov/WebSub/html/help/sample_files/protein-sample.txt'
protein_fasta
## $Seq1
## [1] "l" "y" "l" "i" "f" "g" "a" "w" "a" "g" "m" "v" "g" "t" "a" "l" "s" "l"
## [19] "l" "i" "r" "a" "e" "l" "g" "q" "p" "g" "t" "l" "l" "g" "d" "d" "q" "i"
## [37] "y" "n" "v" "i" "v" "t" "a" "h" "a" "f" "v" "m" "i" "f" "f" "m" "v" "m"
## [55] "p" "i" "m" "i" "g" "g" "f" "g" "n" "w" "l" "v" "p" "l" "m" "i" "g" "a"
## [73] "p" "d" "m" "a" "f" "p" "r" "m" "n" "n" "m" "s" "f" "w" "l" "l" "p" "p"
## [91] "s" "f" "l" "l" "l" "l" "a" "s" "s" "t" "v" "e" "a" "g" "a" "g" "t" "g"
## [109] "w" "t" "v" "y" "p" "p" "l" "a" "g" "n" "l" "a" "h" "a" "g" "a" "s" "v"
## [127] "d" "l" "a" "i" "f" "s" "l" "h" "l" "a" "g" "v" "s" "s" "i" "l" "g" "a"
## [145] "i" "n" "f" "i" "t" "t" "a" "i" "n" "m" "k" "p" "p" "t" "l" "s" "q" "y"
## [163] "q" "t" "p" "l" "f" "v" "w" "s" "v" "l" "i" "t" "a" "v" "l" "l" "l" "l"
## [181] "s" "l" "p" "v" "l" "a" "a" "g" "i" "t" "m" "l" "l" "t" "d" "r" "n" "l"
## [199] "n" "t" "t" "f" "f" "d" "p" "a" "g" "g" "g" "d" "p" "v" "l" "y" "q" "h"
## [217] "l" "f" "w" "f" "f" "g" "h" "p" "e" "v" "y" "i" "l" "i" "l"
## attr(,"name")
## [1] "Seq1"
## attr(,"Annot")
## [1] ">Seq1"
## attr(,"class")
## [1] "SeqFastadna"
##
## $Seq2
## [1] "v" "g" "t" "a" "l" "x" "l" "l" "i" "r" "a" "e" "l" "x" "q" "p" "g" "a"
## [19] "l" "l" "g" "d" "d" "q" "i" "y" "n" "v" "v" "v" "t" "a" "h" "a" "f" "v"
## [37] "m" "i" "f" "f" "m" "v" "m" "p" "i" "m" "i" "g" "g" "f" "g" "n" "w" "l"
## [55] "v" "p" "l" "m" "i" "g" "a" "p" "d" "m" "a" "f" "p" "r" "m" "n" "n" "m"
## [73] "s" "f" "w" "l" "l" "p" "p" "s" "f" "l" "l" "l" "m" "a" "s" "s" "t" "v"
## [91] "e" "a" "g" "a" "g" "t" "g" "w" "t" "v" "y" "p" "p" "l" "a" "g" "n" "l"
## [109] "a" "h" "a" "g" "a" "s" "v" "d" "l" "a" "i" "f" "s" "l" "h" "l" "a" "g"
## [127] "i" "s" "s" "i" "l" "g" "a" "i" "n" "f" "i" "t" "t" "a" "i" "n" "m" "k"
## [145] "p" "p" "a" "l" "s" "q" "y" "q" "t" "p" "l" "f" "v" "w" "s" "v" "l" "i"
## [163] "t" "a" "v" "l" "l" "l" "l" "s" "l" "p" "v" "l" "a" "a" "g" "i" "t" "m"
## [181] "l" "l" "t" "d" "r" "n" "l" "n" "t" "t" "f" "f" "d" "p" "a" "g" "g" "g"
## [199] "d" "p" "v" "l" "y" "q" "h" "l" "f" "w" "f" "f" "g" "h" "p" "e" "v" "y"
## [217] "i" "l" "i" "l"
## attr(,"name")
## [1] "Seq2"
## attr(,"Annot")
## [1] ">Seq2"
## attr(,"class")
## [1] "SeqFastadna"
##
## $Seq3
## [1] "l" "y" "l" "i" "f" "g" "a" "w" "a" "g" "m" "v" "g" "t" "a" "l" "s" "l"
## [19] "l" "i" "r" "a" "e" "l" "g" "q" "p" "g" "a" "l" "l" "g" "d" "d" "q" "v"
## [37] "y" "n" "v" "v" "v" "t" "a" "h" "a" "f" "v" "m" "i" "f" "f" "m" "v" "m"
## [55] "p" "i" "m" "i" "g" "g" "f" "g" "n" "w" "l" "v" "p" "l" "m" "i" "g" "a"
## [73] "p" "d" "m" "a" "f" "p" "r" "m" "n" "n" "m" "s" "f" "w" "l" "l" "p" "p"
## [91] "s" "f" "l" "l" "l" "l" "a" "s" "s" "t" "v" "e" "a" "g" "v" "g" "t" "g"
## [109] "w" "t" "v" "y" "p" "p" "l" "a" "g" "n" "l" "a" "h" "a" "g" "a" "s" "v"
## [127] "d" "l" "a" "i" "f" "s" "l" "h" "l" "a" "g" "i" "s" "s" "i" "l" "g" "a"
## [145] "i" "n" "f" "i" "t" "t" "a" "i" "n" "m" "k" "p" "p" "a" "l" "s" "q" "y"
## [163] "q" "t" "p" "l" "f" "v" "w" "s" "v" "l" "i" "t" "a" "v" "l" "l" "l" "l"
## [181] "s" "l" "p" "v" "l" "a" "a" "g" "i" "t" "m" "l" "l" "t" "d" "r" "n" "l"
## [199] "n" "t" "t" "f" "f" "d" "p" "a" "g" "g" "g" "d" "p" "v" "l" "y" "q" "h"
## [217] "l" "f" "w" "f" "f" "g" "h" "p" "e" "v" "y" "i" "l" "i" "l"
## attr(,"name")
## [1] "Seq3"
## attr(,"Annot")
## [1] ">Seq3"
## attr(,"class")
## [1] "SeqFastadna"
##
## $Seq4
## [1] "w" "a" "g" "m" "v" "g" "t" "a" "l" "s" "l" "l" "i" "r" "a" "e" "l" "g"
## [19] "q" "p" "g" "a" "l" "l" "g" "d" "d" "q" "i" "y" "n" "v" "v" "x" "t" "a"
## [37] "h" "a" "f" "v" "m" "i" "f" "f" "m" "v" "m" "p" "i" "m" "i" "g" "g" "f"
## [55] "g" "n" "w" "l" "v" "p" "l" "m" "i" "g" "a" "p" "d" "m" "a" "f" "p" "r"
## [73] "m" "n" "n" "m" "s" "f" "w" "l" "l" "p" "p" "s" "f" "l" "l" "l" "m" "a"
## [91] "s" "s" "t" "v" "e" "a" "g" "v" "g" "t" "g" "w" "t" "v" "y" "p" "p" "l"
## [109] "a" "g" "n" "l" "a" "h" "a" "g" "a" "s" "v" "d" "l" "a" "i" "f" "s" "l"
## [127] "h" "l" "a" "g" "i" "s" "s" "i" "l" "g" "a" "i" "n" "f" "i" "t" "t" "a"
## [145] "i" "n" "m" "k" "p" "p" "a" "l" "s" "q" "y" "q" "t" "p" "l" "f" "v" "w"
## [163] "s" "v" "l" "i" "t" "a" "v" "l" "l" "l" "l" "s" "l" "p" "v" "l" "a" "a"
## [181] "g" "i" "t" "m" "l" "l" "t" "d" "r" "n" "l" "n" "t" "t" "f" "f" "d" "p"
## [199] "a" "g" "g" "g" "d" "p" "v" "l" "y" "q" "h" "l" "f" "w" "f" "f" "g" "h"
## [217] "p" "e" "v" "y" "i" "l" "i" "l"
## attr(,"name")
## [1] "Seq4"
## attr(,"Annot")
## [1] ">Seq4"
## attr(,"class")
## [1] "SeqFastadna"
##
## $Seq5
## [1] "l" "y" "l" "i" "f" "g" "a" "w" "a" "g" "m" "v" "g" "t" "a" "l" "s" "l"
## [19] "l" "i" "r" "a" "e" "l" "g" "q" "p" "g" "a" "l" "l" "g" "d" "d" "q" "v"
## [37] "y" "n" "v" "v" "v" "t" "a" "h" "a" "f" "v" "m" "i" "f" "f" "m" "v" "m"
## [55] "p" "i" "m" "i" "g" "g" "f" "g" "n" "w" "l" "v" "p" "l" "m" "i" "g" "a"
## [73] "p" "d" "m" "a" "f" "p" "r" "m" "n" "n" "m" "s" "f" "w" "l" "l" "p" "p"
## [91] "s" "f" "l" "l" "l" "l" "a" "s" "s" "t" "v" "e" "a" "g" "v" "g" "t" "g"
## [109] "w" "t" "v" "y" "p" "p" "l" "a" "g" "n" "l" "a" "h" "a" "g" "a" "s" "v"
## [127] "d" "l" "a" "i" "f" "s" "l" "h" "l" "a" "g" "i" "s" "s" "i" "l" "g" "a"
## [145] "i" "n" "f" "i" "t" "t" "a" "i" "n" "m" "k" "p" "p" "a" "l" "s" "q" "y"
## [163] "q" "t" "p" "l" "f" "v" "w" "s" "v" "l" "i" "t" "a" "v" "l" "l" "l" "l"
## [181] "s" "l" "p" "v" "l" "a" "a" "g" "i" "t" "m" "l" "l" "t" "d" "r" "n" "l"
## [199] "n" "t" "t" "f" "f" "d" "p" "a" "g" "g" "g" "d" "p" "v" "l" "y" "q" "h"
## [217] "l" "f" "w" "f" "f" "g" "h" "p" "e" "v" "y" "i" "l" "i" "x"
## attr(,"name")
## [1] "Seq5"
## attr(,"Annot")
## [1] ">Seq5"
## attr(,"class")
## [1] "SeqFastadna"
##
## $Seq6
## [1] "w" "a" "g" "m" "v" "g" "t" "a" "l" "s" "l" "l" "i" "r" "a" "e" "l" "g"
## [19] "q" "p" "g" "a" "l" "l" "g" "d" "d" "q" "i" "y" "n" "v" "v" "v" "t" "a"
## [37] "h" "a" "f" "v" "m" "i" "f" "f" "m" "v" "m" "p" "i" "m" "i" "g" "g" "f"
## [55] "g" "n" "w" "l" "v" "p" "l" "m" "i" "g" "a" "p" "d" "m" "a" "f" "p" "r"
## [73] "m" "n" "n" "m" "s" "f" "w" "l" "l" "p" "p" "s" "f" "l" "l" "l" "m" "a"
## [91] "s" "s" "t" "v" "e" "a" "g" "v" "g" "t" "g" "w" "t" "v" "y" "p" "p" "l"
## [109] "a" "g" "n" "l" "a" "h" "a" "g" "a" "s" "v" "d" "l" "a" "i" "f" "s" "l"
## [127] "h" "l" "a" "g" "i" "s" "s" "i" "l" "g" "a" "i" "n" "f" "i" "t" "t" "a"
## [145] "i" "n" "m" "k" "p" "p" "a" "l" "s" "q" "y" "q" "t" "p" "l" "f" "v" "w"
## [163] "s" "v" "l" "i" "t" "a" "v" "l" "l" "l" "l" "s" "l" "p" "v" "l" "a" "a"
## [181] "g" "i" "t" "m" "l" "l" "t" "d" "r" "n" "l" "n" "t" "t" "f" "f" "d" "p"
## [199] "a" "g" "g" "g" "d" "p" "v" "l" "y" "q" "h" "l" "f" "w" "f" "f" "g" "h"
## [217] "p" "e" "v" "y" "i" "l" "i" "l"
## attr(,"name")
## [1] "Seq6"
## attr(,"Annot")
## [1] ">Seq6"
## attr(,"class")
## [1] "SeqFastadna"
##
## $Seq7
## [1] "v" "g" "t" "a" "l" "s" "l" "l" "i" "r" "a" "e" "l" "g" "q" "p" "g" "t"
## [19] "l" "l" "g" "d" "d" "q" "i" "y" "n" "v" "i" "v" "t" "a" "h" "a" "f" "v"
## [37] "m" "i" "f" "f" "m" "v" "m" "p" "v" "m" "i" "g" "g" "f" "g" "n" "w" "l"
## [55] "v" "p" "l" "m" "i" "g" "a" "p" "d" "m" "a" "f" "p" "r" "m" "n" "n" "m"
## [73] "s" "f" "w" "l" "l" "p" "p" "s" "f" "l" "l" "l" "l" "a" "s" "s" "t" "v"
## [91] "e" "a" "g" "a" "g" "t" "g" "w" "t" "v" "y" "p" "p" "l" "a" "g" "n" "l"
## [109] "a" "h" "a" "g" "a" "s" "v" "d" "l" "a" "i" "f" "s" "l" "h" "l" "a" "g"
## [127] "v" "s" "s" "i" "l" "g" "a" "i" "n" "f" "i" "t" "t" "a" "i" "n" "m" "k"
## [145] "p" "p" "a" "l" "s" "q" "y" "q" "t" "p" "l" "f" "v" "w" "s" "v" "l" "i"
## [163] "t" "a" "v" "l" "l" "l" "l" "s" "l" "p" "v" "l" "a" "a" "g" "i" "t" "m"
## [181] "l" "l" "t" "d" "r" "n" "l" "n" "t" "t" "f" "f" "d" "p" "a" "g" "g" "g"
## [199] "d" "p" "v" "l" "y" "q" "h" "l" "f" "w" "f" "f" "g" "h" "p" "e" "v" "y"
## [217] "i" "l" "i" "l"
## attr(,"name")
## [1] "Seq7"
## attr(,"Annot")
## [1] ">Seq7"
## attr(,"class")
## [1] "SeqFastadna"
structure
# Check the structure of protein_fasta to understand how sequences are stored
str(protein_fasta)
## List of 7
## $ Seq1: 'SeqFastadna' chr [1:231] "l" "y" "l" "i" ...
## ..- attr(*, "name")= chr "Seq1"
## ..- attr(*, "Annot")= chr ">Seq1"
## $ Seq2: 'SeqFastadna' chr [1:220] "v" "g" "t" "a" ...
## ..- attr(*, "name")= chr "Seq2"
## ..- attr(*, "Annot")= chr ">Seq2"
## $ Seq3: 'SeqFastadna' chr [1:231] "l" "y" "l" "i" ...
## ..- attr(*, "name")= chr "Seq3"
## ..- attr(*, "Annot")= chr ">Seq3"
## $ Seq4: 'SeqFastadna' chr [1:224] "w" "a" "g" "m" ...
## ..- attr(*, "name")= chr "Seq4"
## ..- attr(*, "Annot")= chr ">Seq4"
## $ Seq5: 'SeqFastadna' chr [1:231] "l" "y" "l" "i" ...
## ..- attr(*, "name")= chr "Seq5"
## ..- attr(*, "Annot")= chr ">Seq5"
## $ Seq6: 'SeqFastadna' chr [1:224] "w" "a" "g" "m" ...
## ..- attr(*, "name")= chr "Seq6"
## ..- attr(*, "Annot")= chr ">Seq6"
## $ Seq7: 'SeqFastadna' chr [1:220] "v" "g" "t" "a" ...
## ..- attr(*, "name")= chr "Seq7"
## ..- attr(*, "Annot")= chr ">Seq7"
Sequence Length
sequence <- protein_fasta[[1]]
sequence_length <- length(sequence)
sequence_length
## [1] 231
SeqFastadna object to an AAString object
sequence_biostrings <- AAString(paste(sequence, collapse = ""))
sequence_biostrings
## 231-letter AAString object
## seq: LYLIFGAWAGMVGTALSLLIRAELGQPGTLLGDDQI...RNLNTTFFDPAGGGDPVLYQHLFWFFGHPEVYILIL
frequency of each amino acid
sequence_composition <- alphabetFrequency(sequence_biostrings)
sequence_composition
## A R N D C Q E G H I L K M F P S T W Y V U O B J Z X
## 23 3 9 7 0 5 3 22 5 17 38 1 11 16 16 12 15 6 6 16 0 0 0 0 0 0
## * - + .
## 0 0 0 0
Define the subsequence want to match
subseq <- "MTEY"
subseq_loc <- matchPattern(subseq, sequence_biostrings)
subseq_loc
## Views on a 231-letter AAString subject
## subject: LYLIFGAWAGMVGTALSLLIRAELGQPGTLLGDD...LNTTFFDPAGGGDPVLYQHLFWFFGHPEVYILIL
## views: NONE
Phylogenetic Tree
Library and Data
# Load necessary libraries
library(msa)
library(ape)
## Warning: package 'ape' was built under R version 4.4.2
##
## Attaching package: 'ape'
## The following objects are masked from 'package:seqinr':
##
## as.alignment, consensus
## The following object is masked from 'package:Biostrings':
##
## complement
library(seqinr)
# Protein FASTA file URL
url <- "https://www.ncbi.nlm.nih.gov/WebSub/html/help/sample_files/protein-sample.txt"
# Read the protein FASTA file
protein_fasta <- read.fasta(url, seqtype = "AA")
## Warning in readLines(file): incomplete final line found on
## 'https://www.ncbi.nlm.nih.gov/WebSub/html/help/sample_files/protein-sample.txt'
protein_fasta
## $Seq1
## [1] "L" "Y" "L" "I" "F" "G" "A" "W" "A" "G" "M" "V" "G" "T" "A" "L" "S" "L"
## [19] "L" "I" "R" "A" "E" "L" "G" "Q" "P" "G" "T" "L" "L" "G" "D" "D" "Q" "I"
## [37] "Y" "N" "V" "I" "V" "T" "A" "H" "A" "F" "V" "M" "I" "F" "F" "M" "V" "M"
## [55] "P" "I" "M" "I" "G" "G" "F" "G" "N" "W" "L" "V" "P" "L" "M" "I" "G" "A"
## [73] "P" "D" "M" "A" "F" "P" "R" "M" "N" "N" "M" "S" "F" "W" "L" "L" "P" "P"
## [91] "S" "F" "L" "L" "L" "L" "A" "S" "S" "T" "V" "E" "A" "G" "A" "G" "T" "G"
## [109] "W" "T" "V" "Y" "P" "P" "L" "A" "G" "N" "L" "A" "H" "A" "G" "A" "S" "V"
## [127] "D" "L" "A" "I" "F" "S" "L" "H" "L" "A" "G" "V" "S" "S" "I" "L" "G" "A"
## [145] "I" "N" "F" "I" "T" "T" "A" "I" "N" "M" "K" "P" "P" "T" "L" "S" "Q" "Y"
## [163] "Q" "T" "P" "L" "F" "V" "W" "S" "V" "L" "I" "T" "A" "V" "L" "L" "L" "L"
## [181] "S" "L" "P" "V" "L" "A" "A" "G" "I" "T" "M" "L" "L" "T" "D" "R" "N" "L"
## [199] "N" "T" "T" "F" "F" "D" "P" "A" "G" "G" "G" "D" "P" "V" "L" "Y" "Q" "H"
## [217] "L" "F" "W" "F" "F" "G" "H" "P" "E" "V" "Y" "I" "L" "I" "L"
## attr(,"name")
## [1] "Seq1"
## attr(,"Annot")
## [1] ">Seq1"
## attr(,"class")
## [1] "SeqFastaAA"
##
## $Seq2
## [1] "V" "G" "T" "A" "L" "X" "L" "L" "I" "R" "A" "E" "L" "X" "Q" "P" "G" "A"
## [19] "L" "L" "G" "D" "D" "Q" "I" "Y" "N" "V" "V" "V" "T" "A" "H" "A" "F" "V"
## [37] "M" "I" "F" "F" "M" "V" "M" "P" "I" "M" "I" "G" "G" "F" "G" "N" "W" "L"
## [55] "V" "P" "L" "M" "I" "G" "A" "P" "D" "M" "A" "F" "P" "R" "M" "N" "N" "M"
## [73] "S" "F" "W" "L" "L" "P" "P" "S" "F" "L" "L" "L" "M" "A" "S" "S" "T" "V"
## [91] "E" "A" "G" "A" "G" "T" "G" "W" "T" "V" "Y" "P" "P" "L" "A" "G" "N" "L"
## [109] "A" "H" "A" "G" "A" "S" "V" "D" "L" "A" "I" "F" "S" "L" "H" "L" "A" "G"
## [127] "I" "S" "S" "I" "L" "G" "A" "I" "N" "F" "I" "T" "T" "A" "I" "N" "M" "K"
## [145] "P" "P" "A" "L" "S" "Q" "Y" "Q" "T" "P" "L" "F" "V" "W" "S" "V" "L" "I"
## [163] "T" "A" "V" "L" "L" "L" "L" "S" "L" "P" "V" "L" "A" "A" "G" "I" "T" "M"
## [181] "L" "L" "T" "D" "R" "N" "L" "N" "T" "T" "F" "F" "D" "P" "A" "G" "G" "G"
## [199] "D" "P" "V" "L" "Y" "Q" "H" "L" "F" "W" "F" "F" "G" "H" "P" "E" "V" "Y"
## [217] "I" "L" "I" "L"
## attr(,"name")
## [1] "Seq2"
## attr(,"Annot")
## [1] ">Seq2"
## attr(,"class")
## [1] "SeqFastaAA"
##
## $Seq3
## [1] "L" "Y" "L" "I" "F" "G" "A" "W" "A" "G" "M" "V" "G" "T" "A" "L" "S" "L"
## [19] "L" "I" "R" "A" "E" "L" "G" "Q" "P" "G" "A" "L" "L" "G" "D" "D" "Q" "V"
## [37] "Y" "N" "V" "V" "V" "T" "A" "H" "A" "F" "V" "M" "I" "F" "F" "M" "V" "M"
## [55] "P" "I" "M" "I" "G" "G" "F" "G" "N" "W" "L" "V" "P" "L" "M" "I" "G" "A"
## [73] "P" "D" "M" "A" "F" "P" "R" "M" "N" "N" "M" "S" "F" "W" "L" "L" "P" "P"
## [91] "S" "F" "L" "L" "L" "L" "A" "S" "S" "T" "V" "E" "A" "G" "V" "G" "T" "G"
## [109] "W" "T" "V" "Y" "P" "P" "L" "A" "G" "N" "L" "A" "H" "A" "G" "A" "S" "V"
## [127] "D" "L" "A" "I" "F" "S" "L" "H" "L" "A" "G" "I" "S" "S" "I" "L" "G" "A"
## [145] "I" "N" "F" "I" "T" "T" "A" "I" "N" "M" "K" "P" "P" "A" "L" "S" "Q" "Y"
## [163] "Q" "T" "P" "L" "F" "V" "W" "S" "V" "L" "I" "T" "A" "V" "L" "L" "L" "L"
## [181] "S" "L" "P" "V" "L" "A" "A" "G" "I" "T" "M" "L" "L" "T" "D" "R" "N" "L"
## [199] "N" "T" "T" "F" "F" "D" "P" "A" "G" "G" "G" "D" "P" "V" "L" "Y" "Q" "H"
## [217] "L" "F" "W" "F" "F" "G" "H" "P" "E" "V" "Y" "I" "L" "I" "L"
## attr(,"name")
## [1] "Seq3"
## attr(,"Annot")
## [1] ">Seq3"
## attr(,"class")
## [1] "SeqFastaAA"
##
## $Seq4
## [1] "W" "A" "G" "M" "V" "G" "T" "A" "L" "S" "L" "L" "I" "R" "A" "E" "L" "G"
## [19] "Q" "P" "G" "A" "L" "L" "G" "D" "D" "Q" "I" "Y" "N" "V" "V" "X" "T" "A"
## [37] "H" "A" "F" "V" "M" "I" "F" "F" "M" "V" "M" "P" "I" "M" "I" "G" "G" "F"
## [55] "G" "N" "W" "L" "V" "P" "L" "M" "I" "G" "A" "P" "D" "M" "A" "F" "P" "R"
## [73] "M" "N" "N" "M" "S" "F" "W" "L" "L" "P" "P" "S" "F" "L" "L" "L" "M" "A"
## [91] "S" "S" "T" "V" "E" "A" "G" "V" "G" "T" "G" "W" "T" "V" "Y" "P" "P" "L"
## [109] "A" "G" "N" "L" "A" "H" "A" "G" "A" "S" "V" "D" "L" "A" "I" "F" "S" "L"
## [127] "H" "L" "A" "G" "I" "S" "S" "I" "L" "G" "A" "I" "N" "F" "I" "T" "T" "A"
## [145] "I" "N" "M" "K" "P" "P" "A" "L" "S" "Q" "Y" "Q" "T" "P" "L" "F" "V" "W"
## [163] "S" "V" "L" "I" "T" "A" "V" "L" "L" "L" "L" "S" "L" "P" "V" "L" "A" "A"
## [181] "G" "I" "T" "M" "L" "L" "T" "D" "R" "N" "L" "N" "T" "T" "F" "F" "D" "P"
## [199] "A" "G" "G" "G" "D" "P" "V" "L" "Y" "Q" "H" "L" "F" "W" "F" "F" "G" "H"
## [217] "P" "E" "V" "Y" "I" "L" "I" "L"
## attr(,"name")
## [1] "Seq4"
## attr(,"Annot")
## [1] ">Seq4"
## attr(,"class")
## [1] "SeqFastaAA"
##
## $Seq5
## [1] "L" "Y" "L" "I" "F" "G" "A" "W" "A" "G" "M" "V" "G" "T" "A" "L" "S" "L"
## [19] "L" "I" "R" "A" "E" "L" "G" "Q" "P" "G" "A" "L" "L" "G" "D" "D" "Q" "V"
## [37] "Y" "N" "V" "V" "V" "T" "A" "H" "A" "F" "V" "M" "I" "F" "F" "M" "V" "M"
## [55] "P" "I" "M" "I" "G" "G" "F" "G" "N" "W" "L" "V" "P" "L" "M" "I" "G" "A"
## [73] "P" "D" "M" "A" "F" "P" "R" "M" "N" "N" "M" "S" "F" "W" "L" "L" "P" "P"
## [91] "S" "F" "L" "L" "L" "L" "A" "S" "S" "T" "V" "E" "A" "G" "V" "G" "T" "G"
## [109] "W" "T" "V" "Y" "P" "P" "L" "A" "G" "N" "L" "A" "H" "A" "G" "A" "S" "V"
## [127] "D" "L" "A" "I" "F" "S" "L" "H" "L" "A" "G" "I" "S" "S" "I" "L" "G" "A"
## [145] "I" "N" "F" "I" "T" "T" "A" "I" "N" "M" "K" "P" "P" "A" "L" "S" "Q" "Y"
## [163] "Q" "T" "P" "L" "F" "V" "W" "S" "V" "L" "I" "T" "A" "V" "L" "L" "L" "L"
## [181] "S" "L" "P" "V" "L" "A" "A" "G" "I" "T" "M" "L" "L" "T" "D" "R" "N" "L"
## [199] "N" "T" "T" "F" "F" "D" "P" "A" "G" "G" "G" "D" "P" "V" "L" "Y" "Q" "H"
## [217] "L" "F" "W" "F" "F" "G" "H" "P" "E" "V" "Y" "I" "L" "I" "X"
## attr(,"name")
## [1] "Seq5"
## attr(,"Annot")
## [1] ">Seq5"
## attr(,"class")
## [1] "SeqFastaAA"
##
## $Seq6
## [1] "W" "A" "G" "M" "V" "G" "T" "A" "L" "S" "L" "L" "I" "R" "A" "E" "L" "G"
## [19] "Q" "P" "G" "A" "L" "L" "G" "D" "D" "Q" "I" "Y" "N" "V" "V" "V" "T" "A"
## [37] "H" "A" "F" "V" "M" "I" "F" "F" "M" "V" "M" "P" "I" "M" "I" "G" "G" "F"
## [55] "G" "N" "W" "L" "V" "P" "L" "M" "I" "G" "A" "P" "D" "M" "A" "F" "P" "R"
## [73] "M" "N" "N" "M" "S" "F" "W" "L" "L" "P" "P" "S" "F" "L" "L" "L" "M" "A"
## [91] "S" "S" "T" "V" "E" "A" "G" "V" "G" "T" "G" "W" "T" "V" "Y" "P" "P" "L"
## [109] "A" "G" "N" "L" "A" "H" "A" "G" "A" "S" "V" "D" "L" "A" "I" "F" "S" "L"
## [127] "H" "L" "A" "G" "I" "S" "S" "I" "L" "G" "A" "I" "N" "F" "I" "T" "T" "A"
## [145] "I" "N" "M" "K" "P" "P" "A" "L" "S" "Q" "Y" "Q" "T" "P" "L" "F" "V" "W"
## [163] "S" "V" "L" "I" "T" "A" "V" "L" "L" "L" "L" "S" "L" "P" "V" "L" "A" "A"
## [181] "G" "I" "T" "M" "L" "L" "T" "D" "R" "N" "L" "N" "T" "T" "F" "F" "D" "P"
## [199] "A" "G" "G" "G" "D" "P" "V" "L" "Y" "Q" "H" "L" "F" "W" "F" "F" "G" "H"
## [217] "P" "E" "V" "Y" "I" "L" "I" "L"
## attr(,"name")
## [1] "Seq6"
## attr(,"Annot")
## [1] ">Seq6"
## attr(,"class")
## [1] "SeqFastaAA"
##
## $Seq7
## [1] "V" "G" "T" "A" "L" "S" "L" "L" "I" "R" "A" "E" "L" "G" "Q" "P" "G" "T"
## [19] "L" "L" "G" "D" "D" "Q" "I" "Y" "N" "V" "I" "V" "T" "A" "H" "A" "F" "V"
## [37] "M" "I" "F" "F" "M" "V" "M" "P" "V" "M" "I" "G" "G" "F" "G" "N" "W" "L"
## [55] "V" "P" "L" "M" "I" "G" "A" "P" "D" "M" "A" "F" "P" "R" "M" "N" "N" "M"
## [73] "S" "F" "W" "L" "L" "P" "P" "S" "F" "L" "L" "L" "L" "A" "S" "S" "T" "V"
## [91] "E" "A" "G" "A" "G" "T" "G" "W" "T" "V" "Y" "P" "P" "L" "A" "G" "N" "L"
## [109] "A" "H" "A" "G" "A" "S" "V" "D" "L" "A" "I" "F" "S" "L" "H" "L" "A" "G"
## [127] "V" "S" "S" "I" "L" "G" "A" "I" "N" "F" "I" "T" "T" "A" "I" "N" "M" "K"
## [145] "P" "P" "A" "L" "S" "Q" "Y" "Q" "T" "P" "L" "F" "V" "W" "S" "V" "L" "I"
## [163] "T" "A" "V" "L" "L" "L" "L" "S" "L" "P" "V" "L" "A" "A" "G" "I" "T" "M"
## [181] "L" "L" "T" "D" "R" "N" "L" "N" "T" "T" "F" "F" "D" "P" "A" "G" "G" "G"
## [199] "D" "P" "V" "L" "Y" "Q" "H" "L" "F" "W" "F" "F" "G" "H" "P" "E" "V" "Y"
## [217] "I" "L" "I" "L"
## attr(,"name")
## [1] "Seq7"
## attr(,"Annot")
## [1] ">Seq7"
## attr(,"class")
## [1] "SeqFastaAA"
Convert sequences to character strings
# Convert sequences to character strings
protein_sequences <- sapply(protein_fasta, function(x) paste(x, collapse = ""))
protein_sequences
## Seq1
## "LYLIFGAWAGMVGTALSLLIRAELGQPGTLLGDDQIYNVIVTAHAFVMIFFMVMPIMIGGFGNWLVPLMIGAPDMAFPRMNNMSFWLLPPSFLLLLASSTVEAGAGTGWTVYPPLAGNLAHAGASVDLAIFSLHLAGVSSILGAINFITTAINMKPPTLSQYQTPLFVWSVLITAVLLLLSLPVLAAGITMLLTDRNLNTTFFDPAGGGDPVLYQHLFWFFGHPEVYILIL"
## Seq2
## "VGTALXLLIRAELXQPGALLGDDQIYNVVVTAHAFVMIFFMVMPIMIGGFGNWLVPLMIGAPDMAFPRMNNMSFWLLPPSFLLLMASSTVEAGAGTGWTVYPPLAGNLAHAGASVDLAIFSLHLAGISSILGAINFITTAINMKPPALSQYQTPLFVWSVLITAVLLLLSLPVLAAGITMLLTDRNLNTTFFDPAGGGDPVLYQHLFWFFGHPEVYILIL"
## Seq3
## "LYLIFGAWAGMVGTALSLLIRAELGQPGALLGDDQVYNVVVTAHAFVMIFFMVMPIMIGGFGNWLVPLMIGAPDMAFPRMNNMSFWLLPPSFLLLLASSTVEAGVGTGWTVYPPLAGNLAHAGASVDLAIFSLHLAGISSILGAINFITTAINMKPPALSQYQTPLFVWSVLITAVLLLLSLPVLAAGITMLLTDRNLNTTFFDPAGGGDPVLYQHLFWFFGHPEVYILIL"
## Seq4
## "WAGMVGTALSLLIRAELGQPGALLGDDQIYNVVXTAHAFVMIFFMVMPIMIGGFGNWLVPLMIGAPDMAFPRMNNMSFWLLPPSFLLLMASSTVEAGVGTGWTVYPPLAGNLAHAGASVDLAIFSLHLAGISSILGAINFITTAINMKPPALSQYQTPLFVWSVLITAVLLLLSLPVLAAGITMLLTDRNLNTTFFDPAGGGDPVLYQHLFWFFGHPEVYILIL"
## Seq5
## "LYLIFGAWAGMVGTALSLLIRAELGQPGALLGDDQVYNVVVTAHAFVMIFFMVMPIMIGGFGNWLVPLMIGAPDMAFPRMNNMSFWLLPPSFLLLLASSTVEAGVGTGWTVYPPLAGNLAHAGASVDLAIFSLHLAGISSILGAINFITTAINMKPPALSQYQTPLFVWSVLITAVLLLLSLPVLAAGITMLLTDRNLNTTFFDPAGGGDPVLYQHLFWFFGHPEVYILIX"
## Seq6
## "WAGMVGTALSLLIRAELGQPGALLGDDQIYNVVVTAHAFVMIFFMVMPIMIGGFGNWLVPLMIGAPDMAFPRMNNMSFWLLPPSFLLLMASSTVEAGVGTGWTVYPPLAGNLAHAGASVDLAIFSLHLAGISSILGAINFITTAINMKPPALSQYQTPLFVWSVLITAVLLLLSLPVLAAGITMLLTDRNLNTTFFDPAGGGDPVLYQHLFWFFGHPEVYILIL"
## Seq7
## "VGTALSLLIRAELGQPGTLLGDDQIYNVIVTAHAFVMIFFMVMPVMIGGFGNWLVPLMIGAPDMAFPRMNNMSFWLLPPSFLLLLASSTVEAGAGTGWTVYPPLAGNLAHAGASVDLAIFSLHLAGVSSILGAINFITTAINMKPPALSQYQTPLFVWSVLITAVLLLLSLPVLAAGITMLLTDRNLNTTFFDPAGGGDPVLYQHLFWFFGHPEVYILIL"
Align the protein sequences using msa
protein_alignment <- msa(protein_sequences, type = "protein")
## use default substitution matrix
protein_alignment
## CLUSTAL 2.1
##
## Call:
## msa(protein_sequences, type = "protein")
##
## MsaAAMultipleAlignment with 7 rows and 231 columns
## aln names
## [1] -------WAGMVGTALSLLIRAELGQ...GGGDPVLYQHLFWFFGHPEVYILIL Seq4
## [2] -------WAGMVGTALSLLIRAELGQ...GGGDPVLYQHLFWFFGHPEVYILIL Seq6
## [3] -----------VGTALXLLIRAELXQ...GGGDPVLYQHLFWFFGHPEVYILIL Seq2
## [4] LYLIFGAWAGMVGTALSLLIRAELGQ...GGGDPVLYQHLFWFFGHPEVYILIL Seq3
## [5] LYLIFGAWAGMVGTALSLLIRAELGQ...GGGDPVLYQHLFWFFGHPEVYILIX Seq5
## [6] LYLIFGAWAGMVGTALSLLIRAELGQ...GGGDPVLYQHLFWFFGHPEVYILIL Seq1
## [7] -----------VGTALSLLIRAELGQ...GGGDPVLYQHLFWFFGHPEVYILIL Seq7
## Con -------WAGMVGTALSLLIRAELGQ...GGGDPVLYQHLFWFFGHPEVYILIL Consensus
Compute the distance matrix
dist_matrix <- dist.alignment(alignment_ape)
dist_matrix
## Seq4 Seq6 Seq2 Seq3 Seq5 Seq1
## Seq6 0.00000000
## Seq2 0.06788442 0.06772855
## Seq3 0.09470274 0.09449112 0.11730928
## Seq5 0.09491580 0.09470274 0.11757927 0.00000000
## Seq1 0.16402997 0.16366342 0.15144563 0.16116459 0.16151457
## Seq7 0.16552118 0.16514456 0.15144563 0.16514456 0.16552118 0.09534626
Construct the phylogenetic tree using NJ
phylo_tree <- nj(as.dist(dist_matrix))
phylo_tree
##
## Phylogenetic tree with 7 tips and 5 internal nodes.
##
## Tip labels:
## Seq4, Seq6, Seq2, Seq3, Seq5, Seq1, ...
##
## Unrooted; includes branch length(s).
Plot the phylogenetic tree
plot(phylo_tree, main = "Neighbor-Joining Phylogenetic Tree (Protein Sequences)")
