install Bioconductor and relevant packages

if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install("Biostrings")
## Bioconductor version 3.20 (BiocManager 1.30.25), R 4.4.1 (2024-06-14 ucrt)
## Warning: package(s) not installed when version(s) same as or greater than current; use
##   `force = TRUE` to re-install: 'Biostrings'
## Installation paths not writeable, unable to update packages
##   path: C:/Program Files/R/R-4.4.1/library
##   packages:
##     boot, cluster, foreign, MASS, Matrix, nlme, survival
BiocManager::install("seqinr")
## Bioconductor version 3.20 (BiocManager 1.30.25), R 4.4.1 (2024-06-14 ucrt)
## Warning: package(s) not installed when version(s) same as or greater than current; use
##   `force = TRUE` to re-install: 'seqinr'
## Installation paths not writeable, unable to update packages
##   path: C:/Program Files/R/R-4.4.1/library
##   packages:
##     boot, cluster, foreign, MASS, Matrix, nlme, survival
library(Biostrings)
## Warning: package 'Biostrings' was built under R version 4.4.2
## Loading required package: BiocGenerics
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, aperm, append, as.data.frame, basename, cbind,
##     colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
##     get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, saveRDS, setdiff,
##     table, tapply, union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
## Loading required package: stats4
## 
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:utils':
## 
##     findMatches
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## Loading required package: IRanges
## Warning: package 'IRanges' was built under R version 4.4.2
## 
## Attaching package: 'IRanges'
## The following object is masked from 'package:grDevices':
## 
##     windows
## Loading required package: XVector
## Loading required package: GenomeInfoDb
## Warning: package 'GenomeInfoDb' was built under R version 4.4.2
## 
## Attaching package: 'Biostrings'
## The following object is masked from 'package:base':
## 
##     strsplit
library(seqinr)
## Warning: package 'seqinr' was built under R version 4.4.2
## 
## Attaching package: 'seqinr'
## The following object is masked from 'package:Biostrings':
## 
##     translate

Protein Sequence Analysis

Read Protein FASTA file

library(seqinr)
url <- "https://www.ncbi.nlm.nih.gov/WebSub/html/help/sample_files/protein-sample.txt"
protein_fasta <- read.fasta(url)
## Warning in readLines(file): incomplete final line found on
## 'https://www.ncbi.nlm.nih.gov/WebSub/html/help/sample_files/protein-sample.txt'
protein_fasta
## $Seq1
##   [1] "l" "y" "l" "i" "f" "g" "a" "w" "a" "g" "m" "v" "g" "t" "a" "l" "s" "l"
##  [19] "l" "i" "r" "a" "e" "l" "g" "q" "p" "g" "t" "l" "l" "g" "d" "d" "q" "i"
##  [37] "y" "n" "v" "i" "v" "t" "a" "h" "a" "f" "v" "m" "i" "f" "f" "m" "v" "m"
##  [55] "p" "i" "m" "i" "g" "g" "f" "g" "n" "w" "l" "v" "p" "l" "m" "i" "g" "a"
##  [73] "p" "d" "m" "a" "f" "p" "r" "m" "n" "n" "m" "s" "f" "w" "l" "l" "p" "p"
##  [91] "s" "f" "l" "l" "l" "l" "a" "s" "s" "t" "v" "e" "a" "g" "a" "g" "t" "g"
## [109] "w" "t" "v" "y" "p" "p" "l" "a" "g" "n" "l" "a" "h" "a" "g" "a" "s" "v"
## [127] "d" "l" "a" "i" "f" "s" "l" "h" "l" "a" "g" "v" "s" "s" "i" "l" "g" "a"
## [145] "i" "n" "f" "i" "t" "t" "a" "i" "n" "m" "k" "p" "p" "t" "l" "s" "q" "y"
## [163] "q" "t" "p" "l" "f" "v" "w" "s" "v" "l" "i" "t" "a" "v" "l" "l" "l" "l"
## [181] "s" "l" "p" "v" "l" "a" "a" "g" "i" "t" "m" "l" "l" "t" "d" "r" "n" "l"
## [199] "n" "t" "t" "f" "f" "d" "p" "a" "g" "g" "g" "d" "p" "v" "l" "y" "q" "h"
## [217] "l" "f" "w" "f" "f" "g" "h" "p" "e" "v" "y" "i" "l" "i" "l"
## attr(,"name")
## [1] "Seq1"
## attr(,"Annot")
## [1] ">Seq1"
## attr(,"class")
## [1] "SeqFastadna"
## 
## $Seq2
##   [1] "v" "g" "t" "a" "l" "x" "l" "l" "i" "r" "a" "e" "l" "x" "q" "p" "g" "a"
##  [19] "l" "l" "g" "d" "d" "q" "i" "y" "n" "v" "v" "v" "t" "a" "h" "a" "f" "v"
##  [37] "m" "i" "f" "f" "m" "v" "m" "p" "i" "m" "i" "g" "g" "f" "g" "n" "w" "l"
##  [55] "v" "p" "l" "m" "i" "g" "a" "p" "d" "m" "a" "f" "p" "r" "m" "n" "n" "m"
##  [73] "s" "f" "w" "l" "l" "p" "p" "s" "f" "l" "l" "l" "m" "a" "s" "s" "t" "v"
##  [91] "e" "a" "g" "a" "g" "t" "g" "w" "t" "v" "y" "p" "p" "l" "a" "g" "n" "l"
## [109] "a" "h" "a" "g" "a" "s" "v" "d" "l" "a" "i" "f" "s" "l" "h" "l" "a" "g"
## [127] "i" "s" "s" "i" "l" "g" "a" "i" "n" "f" "i" "t" "t" "a" "i" "n" "m" "k"
## [145] "p" "p" "a" "l" "s" "q" "y" "q" "t" "p" "l" "f" "v" "w" "s" "v" "l" "i"
## [163] "t" "a" "v" "l" "l" "l" "l" "s" "l" "p" "v" "l" "a" "a" "g" "i" "t" "m"
## [181] "l" "l" "t" "d" "r" "n" "l" "n" "t" "t" "f" "f" "d" "p" "a" "g" "g" "g"
## [199] "d" "p" "v" "l" "y" "q" "h" "l" "f" "w" "f" "f" "g" "h" "p" "e" "v" "y"
## [217] "i" "l" "i" "l"
## attr(,"name")
## [1] "Seq2"
## attr(,"Annot")
## [1] ">Seq2"
## attr(,"class")
## [1] "SeqFastadna"
## 
## $Seq3
##   [1] "l" "y" "l" "i" "f" "g" "a" "w" "a" "g" "m" "v" "g" "t" "a" "l" "s" "l"
##  [19] "l" "i" "r" "a" "e" "l" "g" "q" "p" "g" "a" "l" "l" "g" "d" "d" "q" "v"
##  [37] "y" "n" "v" "v" "v" "t" "a" "h" "a" "f" "v" "m" "i" "f" "f" "m" "v" "m"
##  [55] "p" "i" "m" "i" "g" "g" "f" "g" "n" "w" "l" "v" "p" "l" "m" "i" "g" "a"
##  [73] "p" "d" "m" "a" "f" "p" "r" "m" "n" "n" "m" "s" "f" "w" "l" "l" "p" "p"
##  [91] "s" "f" "l" "l" "l" "l" "a" "s" "s" "t" "v" "e" "a" "g" "v" "g" "t" "g"
## [109] "w" "t" "v" "y" "p" "p" "l" "a" "g" "n" "l" "a" "h" "a" "g" "a" "s" "v"
## [127] "d" "l" "a" "i" "f" "s" "l" "h" "l" "a" "g" "i" "s" "s" "i" "l" "g" "a"
## [145] "i" "n" "f" "i" "t" "t" "a" "i" "n" "m" "k" "p" "p" "a" "l" "s" "q" "y"
## [163] "q" "t" "p" "l" "f" "v" "w" "s" "v" "l" "i" "t" "a" "v" "l" "l" "l" "l"
## [181] "s" "l" "p" "v" "l" "a" "a" "g" "i" "t" "m" "l" "l" "t" "d" "r" "n" "l"
## [199] "n" "t" "t" "f" "f" "d" "p" "a" "g" "g" "g" "d" "p" "v" "l" "y" "q" "h"
## [217] "l" "f" "w" "f" "f" "g" "h" "p" "e" "v" "y" "i" "l" "i" "l"
## attr(,"name")
## [1] "Seq3"
## attr(,"Annot")
## [1] ">Seq3"
## attr(,"class")
## [1] "SeqFastadna"
## 
## $Seq4
##   [1] "w" "a" "g" "m" "v" "g" "t" "a" "l" "s" "l" "l" "i" "r" "a" "e" "l" "g"
##  [19] "q" "p" "g" "a" "l" "l" "g" "d" "d" "q" "i" "y" "n" "v" "v" "x" "t" "a"
##  [37] "h" "a" "f" "v" "m" "i" "f" "f" "m" "v" "m" "p" "i" "m" "i" "g" "g" "f"
##  [55] "g" "n" "w" "l" "v" "p" "l" "m" "i" "g" "a" "p" "d" "m" "a" "f" "p" "r"
##  [73] "m" "n" "n" "m" "s" "f" "w" "l" "l" "p" "p" "s" "f" "l" "l" "l" "m" "a"
##  [91] "s" "s" "t" "v" "e" "a" "g" "v" "g" "t" "g" "w" "t" "v" "y" "p" "p" "l"
## [109] "a" "g" "n" "l" "a" "h" "a" "g" "a" "s" "v" "d" "l" "a" "i" "f" "s" "l"
## [127] "h" "l" "a" "g" "i" "s" "s" "i" "l" "g" "a" "i" "n" "f" "i" "t" "t" "a"
## [145] "i" "n" "m" "k" "p" "p" "a" "l" "s" "q" "y" "q" "t" "p" "l" "f" "v" "w"
## [163] "s" "v" "l" "i" "t" "a" "v" "l" "l" "l" "l" "s" "l" "p" "v" "l" "a" "a"
## [181] "g" "i" "t" "m" "l" "l" "t" "d" "r" "n" "l" "n" "t" "t" "f" "f" "d" "p"
## [199] "a" "g" "g" "g" "d" "p" "v" "l" "y" "q" "h" "l" "f" "w" "f" "f" "g" "h"
## [217] "p" "e" "v" "y" "i" "l" "i" "l"
## attr(,"name")
## [1] "Seq4"
## attr(,"Annot")
## [1] ">Seq4"
## attr(,"class")
## [1] "SeqFastadna"
## 
## $Seq5
##   [1] "l" "y" "l" "i" "f" "g" "a" "w" "a" "g" "m" "v" "g" "t" "a" "l" "s" "l"
##  [19] "l" "i" "r" "a" "e" "l" "g" "q" "p" "g" "a" "l" "l" "g" "d" "d" "q" "v"
##  [37] "y" "n" "v" "v" "v" "t" "a" "h" "a" "f" "v" "m" "i" "f" "f" "m" "v" "m"
##  [55] "p" "i" "m" "i" "g" "g" "f" "g" "n" "w" "l" "v" "p" "l" "m" "i" "g" "a"
##  [73] "p" "d" "m" "a" "f" "p" "r" "m" "n" "n" "m" "s" "f" "w" "l" "l" "p" "p"
##  [91] "s" "f" "l" "l" "l" "l" "a" "s" "s" "t" "v" "e" "a" "g" "v" "g" "t" "g"
## [109] "w" "t" "v" "y" "p" "p" "l" "a" "g" "n" "l" "a" "h" "a" "g" "a" "s" "v"
## [127] "d" "l" "a" "i" "f" "s" "l" "h" "l" "a" "g" "i" "s" "s" "i" "l" "g" "a"
## [145] "i" "n" "f" "i" "t" "t" "a" "i" "n" "m" "k" "p" "p" "a" "l" "s" "q" "y"
## [163] "q" "t" "p" "l" "f" "v" "w" "s" "v" "l" "i" "t" "a" "v" "l" "l" "l" "l"
## [181] "s" "l" "p" "v" "l" "a" "a" "g" "i" "t" "m" "l" "l" "t" "d" "r" "n" "l"
## [199] "n" "t" "t" "f" "f" "d" "p" "a" "g" "g" "g" "d" "p" "v" "l" "y" "q" "h"
## [217] "l" "f" "w" "f" "f" "g" "h" "p" "e" "v" "y" "i" "l" "i" "x"
## attr(,"name")
## [1] "Seq5"
## attr(,"Annot")
## [1] ">Seq5"
## attr(,"class")
## [1] "SeqFastadna"
## 
## $Seq6
##   [1] "w" "a" "g" "m" "v" "g" "t" "a" "l" "s" "l" "l" "i" "r" "a" "e" "l" "g"
##  [19] "q" "p" "g" "a" "l" "l" "g" "d" "d" "q" "i" "y" "n" "v" "v" "v" "t" "a"
##  [37] "h" "a" "f" "v" "m" "i" "f" "f" "m" "v" "m" "p" "i" "m" "i" "g" "g" "f"
##  [55] "g" "n" "w" "l" "v" "p" "l" "m" "i" "g" "a" "p" "d" "m" "a" "f" "p" "r"
##  [73] "m" "n" "n" "m" "s" "f" "w" "l" "l" "p" "p" "s" "f" "l" "l" "l" "m" "a"
##  [91] "s" "s" "t" "v" "e" "a" "g" "v" "g" "t" "g" "w" "t" "v" "y" "p" "p" "l"
## [109] "a" "g" "n" "l" "a" "h" "a" "g" "a" "s" "v" "d" "l" "a" "i" "f" "s" "l"
## [127] "h" "l" "a" "g" "i" "s" "s" "i" "l" "g" "a" "i" "n" "f" "i" "t" "t" "a"
## [145] "i" "n" "m" "k" "p" "p" "a" "l" "s" "q" "y" "q" "t" "p" "l" "f" "v" "w"
## [163] "s" "v" "l" "i" "t" "a" "v" "l" "l" "l" "l" "s" "l" "p" "v" "l" "a" "a"
## [181] "g" "i" "t" "m" "l" "l" "t" "d" "r" "n" "l" "n" "t" "t" "f" "f" "d" "p"
## [199] "a" "g" "g" "g" "d" "p" "v" "l" "y" "q" "h" "l" "f" "w" "f" "f" "g" "h"
## [217] "p" "e" "v" "y" "i" "l" "i" "l"
## attr(,"name")
## [1] "Seq6"
## attr(,"Annot")
## [1] ">Seq6"
## attr(,"class")
## [1] "SeqFastadna"
## 
## $Seq7
##   [1] "v" "g" "t" "a" "l" "s" "l" "l" "i" "r" "a" "e" "l" "g" "q" "p" "g" "t"
##  [19] "l" "l" "g" "d" "d" "q" "i" "y" "n" "v" "i" "v" "t" "a" "h" "a" "f" "v"
##  [37] "m" "i" "f" "f" "m" "v" "m" "p" "v" "m" "i" "g" "g" "f" "g" "n" "w" "l"
##  [55] "v" "p" "l" "m" "i" "g" "a" "p" "d" "m" "a" "f" "p" "r" "m" "n" "n" "m"
##  [73] "s" "f" "w" "l" "l" "p" "p" "s" "f" "l" "l" "l" "l" "a" "s" "s" "t" "v"
##  [91] "e" "a" "g" "a" "g" "t" "g" "w" "t" "v" "y" "p" "p" "l" "a" "g" "n" "l"
## [109] "a" "h" "a" "g" "a" "s" "v" "d" "l" "a" "i" "f" "s" "l" "h" "l" "a" "g"
## [127] "v" "s" "s" "i" "l" "g" "a" "i" "n" "f" "i" "t" "t" "a" "i" "n" "m" "k"
## [145] "p" "p" "a" "l" "s" "q" "y" "q" "t" "p" "l" "f" "v" "w" "s" "v" "l" "i"
## [163] "t" "a" "v" "l" "l" "l" "l" "s" "l" "p" "v" "l" "a" "a" "g" "i" "t" "m"
## [181] "l" "l" "t" "d" "r" "n" "l" "n" "t" "t" "f" "f" "d" "p" "a" "g" "g" "g"
## [199] "d" "p" "v" "l" "y" "q" "h" "l" "f" "w" "f" "f" "g" "h" "p" "e" "v" "y"
## [217] "i" "l" "i" "l"
## attr(,"name")
## [1] "Seq7"
## attr(,"Annot")
## [1] ">Seq7"
## attr(,"class")
## [1] "SeqFastadna"

structure

# Check the structure of protein_fasta to understand how sequences are stored
str(protein_fasta)
## List of 7
##  $ Seq1: 'SeqFastadna' chr [1:231] "l" "y" "l" "i" ...
##   ..- attr(*, "name")= chr "Seq1"
##   ..- attr(*, "Annot")= chr ">Seq1"
##  $ Seq2: 'SeqFastadna' chr [1:220] "v" "g" "t" "a" ...
##   ..- attr(*, "name")= chr "Seq2"
##   ..- attr(*, "Annot")= chr ">Seq2"
##  $ Seq3: 'SeqFastadna' chr [1:231] "l" "y" "l" "i" ...
##   ..- attr(*, "name")= chr "Seq3"
##   ..- attr(*, "Annot")= chr ">Seq3"
##  $ Seq4: 'SeqFastadna' chr [1:224] "w" "a" "g" "m" ...
##   ..- attr(*, "name")= chr "Seq4"
##   ..- attr(*, "Annot")= chr ">Seq4"
##  $ Seq5: 'SeqFastadna' chr [1:231] "l" "y" "l" "i" ...
##   ..- attr(*, "name")= chr "Seq5"
##   ..- attr(*, "Annot")= chr ">Seq5"
##  $ Seq6: 'SeqFastadna' chr [1:224] "w" "a" "g" "m" ...
##   ..- attr(*, "name")= chr "Seq6"
##   ..- attr(*, "Annot")= chr ">Seq6"
##  $ Seq7: 'SeqFastadna' chr [1:220] "v" "g" "t" "a" ...
##   ..- attr(*, "name")= chr "Seq7"
##   ..- attr(*, "Annot")= chr ">Seq7"

Sequence Length

sequence <- protein_fasta[[1]]
sequence_length <- length(sequence)
sequence_length
## [1] 231

SeqFastadna object to an AAString object

sequence_biostrings <- AAString(paste(sequence, collapse = ""))
sequence_biostrings
## 231-letter AAString object
## seq: LYLIFGAWAGMVGTALSLLIRAELGQPGTLLGDDQI...RNLNTTFFDPAGGGDPVLYQHLFWFFGHPEVYILIL

frequency of each amino acid

sequence_composition <- alphabetFrequency(sequence_biostrings)
sequence_composition
##  A  R  N  D  C  Q  E  G  H  I  L  K  M  F  P  S  T  W  Y  V  U  O  B  J  Z  X 
## 23  3  9  7  0  5  3 22  5 17 38  1 11 16 16 12 15  6  6 16  0  0  0  0  0  0 
##  *  -  +  . 
##  0  0  0  0

Define the subsequence want to match

subseq <- "MTEY"
subseq_loc <- matchPattern(subseq, sequence_biostrings)
subseq_loc
## Views on a 231-letter AAString subject
## subject: LYLIFGAWAGMVGTALSLLIRAELGQPGTLLGDD...LNTTFFDPAGGGDPVLYQHLFWFFGHPEVYILIL
## views: NONE

Multiple sequence alignments

MSA package

#BiocManager::install("msa")
library(msa)

Create Sequence

protein_sequences <- sapply(protein_fasta, function(x) paste(x, collapse = ""))
print(protein_sequences)
##                                                                                                                                                                                                                                      Seq1 
## "lylifgawagmvgtalslliraelgqpgtllgddqiynvivtahafvmiffmvmpimiggfgnwlvplmigapdmafprmnnmsfwllppsfllllasstveagagtgwtvypplagnlahagasvdlaifslhlagvssilgainfittainmkpptlsqyqtplfvwsvlitavllllslpvlaagitmlltdrnlnttffdpagggdpvlyqhlfwffghpevyilil" 
##                                                                                                                                                                                                                                      Seq2 
##            "vgtalxlliraelxqpgallgddqiynvvvtahafvmiffmvmpimiggfgnwlvplmigapdmafprmnnmsfwllppsflllmasstveagagtgwtvypplagnlahagasvdlaifslhlagissilgainfittainmkppalsqyqtplfvwsvlitavllllslpvlaagitmlltdrnlnttffdpagggdpvlyqhlfwffghpevyilil" 
##                                                                                                                                                                                                                                      Seq3 
## "lylifgawagmvgtalslliraelgqpgallgddqvynvvvtahafvmiffmvmpimiggfgnwlvplmigapdmafprmnnmsfwllppsfllllasstveagvgtgwtvypplagnlahagasvdlaifslhlagissilgainfittainmkppalsqyqtplfvwsvlitavllllslpvlaagitmlltdrnlnttffdpagggdpvlyqhlfwffghpevyilil" 
##                                                                                                                                                                                                                                      Seq4 
##        "wagmvgtalslliraelgqpgallgddqiynvvxtahafvmiffmvmpimiggfgnwlvplmigapdmafprmnnmsfwllppsflllmasstveagvgtgwtvypplagnlahagasvdlaifslhlagissilgainfittainmkppalsqyqtplfvwsvlitavllllslpvlaagitmlltdrnlnttffdpagggdpvlyqhlfwffghpevyilil" 
##                                                                                                                                                                                                                                      Seq5 
## "lylifgawagmvgtalslliraelgqpgallgddqvynvvvtahafvmiffmvmpimiggfgnwlvplmigapdmafprmnnmsfwllppsfllllasstveagvgtgwtvypplagnlahagasvdlaifslhlagissilgainfittainmkppalsqyqtplfvwsvlitavllllslpvlaagitmlltdrnlnttffdpagggdpvlyqhlfwffghpevyilix" 
##                                                                                                                                                                                                                                      Seq6 
##        "wagmvgtalslliraelgqpgallgddqiynvvvtahafvmiffmvmpimiggfgnwlvplmigapdmafprmnnmsfwllppsflllmasstveagvgtgwtvypplagnlahagasvdlaifslhlagissilgainfittainmkppalsqyqtplfvwsvlitavllllslpvlaagitmlltdrnlnttffdpagggdpvlyqhlfwffghpevyilil" 
##                                                                                                                                                                                                                                      Seq7 
##            "vgtalslliraelgqpgtllgddqiynvivtahafvmiffmvmpvmiggfgnwlvplmigapdmafprmnnmsfwllppsfllllasstveagagtgwtvypplagnlahagasvdlaifslhlagvssilgainfittainmkppalsqyqtplfvwsvlitavllllslpvlaagitmlltdrnlnttffdpagggdpvlyqhlfwffghpevyilil"

Sequence Alignment

protein_alignment <- msa(protein_sequences, type = "protein")
## use default substitution matrix
print(protein_alignment)
## CLUSTAL 2.1  
## 
## Call:
##    msa(protein_sequences, type = "protein")
## 
## MsaAAMultipleAlignment with 7 rows and 231 columns
##     aln                                                    names
## [1] -------WAGMVGTALSLLIRAELGQ...GGGDPVLYQHLFWFFGHPEVYILIL Seq4
## [2] -------WAGMVGTALSLLIRAELGQ...GGGDPVLYQHLFWFFGHPEVYILIL Seq6
## [3] -----------VGTALXLLIRAELXQ...GGGDPVLYQHLFWFFGHPEVYILIL Seq2
## [4] LYLIFGAWAGMVGTALSLLIRAELGQ...GGGDPVLYQHLFWFFGHPEVYILIL Seq3
## [5] LYLIFGAWAGMVGTALSLLIRAELGQ...GGGDPVLYQHLFWFFGHPEVYILIX Seq5
## [6] LYLIFGAWAGMVGTALSLLIRAELGQ...GGGDPVLYQHLFWFFGHPEVYILIL Seq1
## [7] -----------VGTALSLLIRAELGQ...GGGDPVLYQHLFWFFGHPEVYILIL Seq7
## Con -------WAGMVGTALSLLIRAELGQ...GGGDPVLYQHLFWFFGHPEVYILIL Consensus

Nucleotide Sequence Analysis

Load necessary libraries

library(rvest)
library(Biostrings)

Read the HTML content from the URL

# Define the URL
url <- "http://bioinformatics.org/annhyb/examples/seq_fasta.html"

# Read the HTML content from the URL
webpage <- read_html(url)

Extract the FASTA sequence

# Extract the FASTA sequence (the sequence is usually inside a <pre> tag)
fasta_sequence <- html_text(html_nodes(webpage, "pre"))

Clean the extracted sequence:

# - Remove example text and unnecessary parts
# - Keep only lines starting with '>' and the sequence itself
fasta_sequence_cleaned <- gsub("Example of.*format:|\\r", "", fasta_sequence)  # Remove example text and \r
fasta_sequence_cleaned <- gsub("\n+", "\n", fasta_sequence_cleaned)  # Remove extra newlines
fasta_sequence_cleaned <- trimws(fasta_sequence_cleaned)  # Trim extra spaces

Save it as a cleaned FASTA file

# Now save it as a cleaned FASTA file
writeLines(fasta_sequence_cleaned, "cleaned_fasta.fasta")

# Read the FASTA sequence using Biostrings package
nucleotide_fasta <- readDNAStringSet("cleaned_fasta.fasta")

# Print the content to check
print(nucleotide_fasta)
## DNAStringSet object of length 2:
##     width seq                                               names               
## [1]   120 GGTAAGTCCTCTAGTACAAACAC...GCCATCTTCTTTGAAGCGTTGTC sequence A
## [2]   140 GGTAAGTGCTCTAGTACAAACAC...GTCTATGCATCGATCGACGACTG sequence B

Sequence length

sequence_length <- width(nucleotide_fasta)
sequence_length
## [1] 120 140

sequence_composition

sequence_composition <- alphabetFrequency(nucleotide_fasta, baseOnly = TRUE)
sequence_composition
##       A  C  G  T other
## [1,] 39 23 17 41     0
## [2,] 40 27 23 50     0

Sequence Alignment

# Align the sequences using the msa function
nucleotide_alignment <- msa(nucleotide_fasta)
## use default substitution matrix
# View the alignment result
print(nucleotide_alignment)
## CLUSTAL 2.1  
## 
## Call:
##    msa(nucleotide_fasta)
## 
## MsaDNAMultipleAlignment with 2 rows and 140 columns
##     aln                                                    names
## [1] GGTAAGTCCTCTAGTACAAACACCCC...TTGTC-------------------- sequence A
## [2] GGTAAGTGCTCTAGTACAAACACCCC...TTGTCTATGCATCGATCGACGACTG sequence B
## Con GGTAAGT?CTCTAGTACAAACACCCC...TTGTC???????????????????? Consensus

Phylogenetic Analysis

Convert to ape alignment class

alignment_ape <- msaConvert(nucleotide_alignment, type = "seqinr::alignment")
alignment_ape
## $nb
## [1] 2
## 
## $nam
## [1] "sequence A" "sequence B"
## 
## $seq
## [1] "GGTAAGTCCTCTAGTACAAACACCCCCAATATTGTGATATAATTAAAATTATATTCATATTCTGTTGCCAGAAAAAACACTTTTAGGCTATATTAGAGCCATCTTCTTTGAAGCGTTGTC--------------------"
## [2] "GGTAAGTGCTCTAGTACAAACACCCCCAATATTGTGATATAATTAAAATTATATTCATATTCTGTTGCCAGATTTTACACTTTTAGGCTATATTAGAGCCATCTTCTTTGAAGCGTTGTCTATGCATCGATCGACGACTG"
## 
## $com
## [1] NA
## 
## attr(,"class")
## [1] "alignment"

Compute the distance matrix

dist_matrix <- dist.alignment(alignment_ape)
print(dist_matrix)
##            sequence A
## sequence B  0.2041241

Compute the distance matrix

Phylogenetic Tree

Library and Data

# Load necessary libraries
library(msa)
library(ape)
## Warning: package 'ape' was built under R version 4.4.2
## 
## Attaching package: 'ape'
## The following objects are masked from 'package:seqinr':
## 
##     as.alignment, consensus
## The following object is masked from 'package:Biostrings':
## 
##     complement
library(seqinr)

# Protein FASTA file URL
url <- "https://www.ncbi.nlm.nih.gov/WebSub/html/help/sample_files/protein-sample.txt"

# Read the protein FASTA file
protein_fasta <- read.fasta(url, seqtype = "AA")
## Warning in readLines(file): incomplete final line found on
## 'https://www.ncbi.nlm.nih.gov/WebSub/html/help/sample_files/protein-sample.txt'
protein_fasta
## $Seq1
##   [1] "L" "Y" "L" "I" "F" "G" "A" "W" "A" "G" "M" "V" "G" "T" "A" "L" "S" "L"
##  [19] "L" "I" "R" "A" "E" "L" "G" "Q" "P" "G" "T" "L" "L" "G" "D" "D" "Q" "I"
##  [37] "Y" "N" "V" "I" "V" "T" "A" "H" "A" "F" "V" "M" "I" "F" "F" "M" "V" "M"
##  [55] "P" "I" "M" "I" "G" "G" "F" "G" "N" "W" "L" "V" "P" "L" "M" "I" "G" "A"
##  [73] "P" "D" "M" "A" "F" "P" "R" "M" "N" "N" "M" "S" "F" "W" "L" "L" "P" "P"
##  [91] "S" "F" "L" "L" "L" "L" "A" "S" "S" "T" "V" "E" "A" "G" "A" "G" "T" "G"
## [109] "W" "T" "V" "Y" "P" "P" "L" "A" "G" "N" "L" "A" "H" "A" "G" "A" "S" "V"
## [127] "D" "L" "A" "I" "F" "S" "L" "H" "L" "A" "G" "V" "S" "S" "I" "L" "G" "A"
## [145] "I" "N" "F" "I" "T" "T" "A" "I" "N" "M" "K" "P" "P" "T" "L" "S" "Q" "Y"
## [163] "Q" "T" "P" "L" "F" "V" "W" "S" "V" "L" "I" "T" "A" "V" "L" "L" "L" "L"
## [181] "S" "L" "P" "V" "L" "A" "A" "G" "I" "T" "M" "L" "L" "T" "D" "R" "N" "L"
## [199] "N" "T" "T" "F" "F" "D" "P" "A" "G" "G" "G" "D" "P" "V" "L" "Y" "Q" "H"
## [217] "L" "F" "W" "F" "F" "G" "H" "P" "E" "V" "Y" "I" "L" "I" "L"
## attr(,"name")
## [1] "Seq1"
## attr(,"Annot")
## [1] ">Seq1"
## attr(,"class")
## [1] "SeqFastaAA"
## 
## $Seq2
##   [1] "V" "G" "T" "A" "L" "X" "L" "L" "I" "R" "A" "E" "L" "X" "Q" "P" "G" "A"
##  [19] "L" "L" "G" "D" "D" "Q" "I" "Y" "N" "V" "V" "V" "T" "A" "H" "A" "F" "V"
##  [37] "M" "I" "F" "F" "M" "V" "M" "P" "I" "M" "I" "G" "G" "F" "G" "N" "W" "L"
##  [55] "V" "P" "L" "M" "I" "G" "A" "P" "D" "M" "A" "F" "P" "R" "M" "N" "N" "M"
##  [73] "S" "F" "W" "L" "L" "P" "P" "S" "F" "L" "L" "L" "M" "A" "S" "S" "T" "V"
##  [91] "E" "A" "G" "A" "G" "T" "G" "W" "T" "V" "Y" "P" "P" "L" "A" "G" "N" "L"
## [109] "A" "H" "A" "G" "A" "S" "V" "D" "L" "A" "I" "F" "S" "L" "H" "L" "A" "G"
## [127] "I" "S" "S" "I" "L" "G" "A" "I" "N" "F" "I" "T" "T" "A" "I" "N" "M" "K"
## [145] "P" "P" "A" "L" "S" "Q" "Y" "Q" "T" "P" "L" "F" "V" "W" "S" "V" "L" "I"
## [163] "T" "A" "V" "L" "L" "L" "L" "S" "L" "P" "V" "L" "A" "A" "G" "I" "T" "M"
## [181] "L" "L" "T" "D" "R" "N" "L" "N" "T" "T" "F" "F" "D" "P" "A" "G" "G" "G"
## [199] "D" "P" "V" "L" "Y" "Q" "H" "L" "F" "W" "F" "F" "G" "H" "P" "E" "V" "Y"
## [217] "I" "L" "I" "L"
## attr(,"name")
## [1] "Seq2"
## attr(,"Annot")
## [1] ">Seq2"
## attr(,"class")
## [1] "SeqFastaAA"
## 
## $Seq3
##   [1] "L" "Y" "L" "I" "F" "G" "A" "W" "A" "G" "M" "V" "G" "T" "A" "L" "S" "L"
##  [19] "L" "I" "R" "A" "E" "L" "G" "Q" "P" "G" "A" "L" "L" "G" "D" "D" "Q" "V"
##  [37] "Y" "N" "V" "V" "V" "T" "A" "H" "A" "F" "V" "M" "I" "F" "F" "M" "V" "M"
##  [55] "P" "I" "M" "I" "G" "G" "F" "G" "N" "W" "L" "V" "P" "L" "M" "I" "G" "A"
##  [73] "P" "D" "M" "A" "F" "P" "R" "M" "N" "N" "M" "S" "F" "W" "L" "L" "P" "P"
##  [91] "S" "F" "L" "L" "L" "L" "A" "S" "S" "T" "V" "E" "A" "G" "V" "G" "T" "G"
## [109] "W" "T" "V" "Y" "P" "P" "L" "A" "G" "N" "L" "A" "H" "A" "G" "A" "S" "V"
## [127] "D" "L" "A" "I" "F" "S" "L" "H" "L" "A" "G" "I" "S" "S" "I" "L" "G" "A"
## [145] "I" "N" "F" "I" "T" "T" "A" "I" "N" "M" "K" "P" "P" "A" "L" "S" "Q" "Y"
## [163] "Q" "T" "P" "L" "F" "V" "W" "S" "V" "L" "I" "T" "A" "V" "L" "L" "L" "L"
## [181] "S" "L" "P" "V" "L" "A" "A" "G" "I" "T" "M" "L" "L" "T" "D" "R" "N" "L"
## [199] "N" "T" "T" "F" "F" "D" "P" "A" "G" "G" "G" "D" "P" "V" "L" "Y" "Q" "H"
## [217] "L" "F" "W" "F" "F" "G" "H" "P" "E" "V" "Y" "I" "L" "I" "L"
## attr(,"name")
## [1] "Seq3"
## attr(,"Annot")
## [1] ">Seq3"
## attr(,"class")
## [1] "SeqFastaAA"
## 
## $Seq4
##   [1] "W" "A" "G" "M" "V" "G" "T" "A" "L" "S" "L" "L" "I" "R" "A" "E" "L" "G"
##  [19] "Q" "P" "G" "A" "L" "L" "G" "D" "D" "Q" "I" "Y" "N" "V" "V" "X" "T" "A"
##  [37] "H" "A" "F" "V" "M" "I" "F" "F" "M" "V" "M" "P" "I" "M" "I" "G" "G" "F"
##  [55] "G" "N" "W" "L" "V" "P" "L" "M" "I" "G" "A" "P" "D" "M" "A" "F" "P" "R"
##  [73] "M" "N" "N" "M" "S" "F" "W" "L" "L" "P" "P" "S" "F" "L" "L" "L" "M" "A"
##  [91] "S" "S" "T" "V" "E" "A" "G" "V" "G" "T" "G" "W" "T" "V" "Y" "P" "P" "L"
## [109] "A" "G" "N" "L" "A" "H" "A" "G" "A" "S" "V" "D" "L" "A" "I" "F" "S" "L"
## [127] "H" "L" "A" "G" "I" "S" "S" "I" "L" "G" "A" "I" "N" "F" "I" "T" "T" "A"
## [145] "I" "N" "M" "K" "P" "P" "A" "L" "S" "Q" "Y" "Q" "T" "P" "L" "F" "V" "W"
## [163] "S" "V" "L" "I" "T" "A" "V" "L" "L" "L" "L" "S" "L" "P" "V" "L" "A" "A"
## [181] "G" "I" "T" "M" "L" "L" "T" "D" "R" "N" "L" "N" "T" "T" "F" "F" "D" "P"
## [199] "A" "G" "G" "G" "D" "P" "V" "L" "Y" "Q" "H" "L" "F" "W" "F" "F" "G" "H"
## [217] "P" "E" "V" "Y" "I" "L" "I" "L"
## attr(,"name")
## [1] "Seq4"
## attr(,"Annot")
## [1] ">Seq4"
## attr(,"class")
## [1] "SeqFastaAA"
## 
## $Seq5
##   [1] "L" "Y" "L" "I" "F" "G" "A" "W" "A" "G" "M" "V" "G" "T" "A" "L" "S" "L"
##  [19] "L" "I" "R" "A" "E" "L" "G" "Q" "P" "G" "A" "L" "L" "G" "D" "D" "Q" "V"
##  [37] "Y" "N" "V" "V" "V" "T" "A" "H" "A" "F" "V" "M" "I" "F" "F" "M" "V" "M"
##  [55] "P" "I" "M" "I" "G" "G" "F" "G" "N" "W" "L" "V" "P" "L" "M" "I" "G" "A"
##  [73] "P" "D" "M" "A" "F" "P" "R" "M" "N" "N" "M" "S" "F" "W" "L" "L" "P" "P"
##  [91] "S" "F" "L" "L" "L" "L" "A" "S" "S" "T" "V" "E" "A" "G" "V" "G" "T" "G"
## [109] "W" "T" "V" "Y" "P" "P" "L" "A" "G" "N" "L" "A" "H" "A" "G" "A" "S" "V"
## [127] "D" "L" "A" "I" "F" "S" "L" "H" "L" "A" "G" "I" "S" "S" "I" "L" "G" "A"
## [145] "I" "N" "F" "I" "T" "T" "A" "I" "N" "M" "K" "P" "P" "A" "L" "S" "Q" "Y"
## [163] "Q" "T" "P" "L" "F" "V" "W" "S" "V" "L" "I" "T" "A" "V" "L" "L" "L" "L"
## [181] "S" "L" "P" "V" "L" "A" "A" "G" "I" "T" "M" "L" "L" "T" "D" "R" "N" "L"
## [199] "N" "T" "T" "F" "F" "D" "P" "A" "G" "G" "G" "D" "P" "V" "L" "Y" "Q" "H"
## [217] "L" "F" "W" "F" "F" "G" "H" "P" "E" "V" "Y" "I" "L" "I" "X"
## attr(,"name")
## [1] "Seq5"
## attr(,"Annot")
## [1] ">Seq5"
## attr(,"class")
## [1] "SeqFastaAA"
## 
## $Seq6
##   [1] "W" "A" "G" "M" "V" "G" "T" "A" "L" "S" "L" "L" "I" "R" "A" "E" "L" "G"
##  [19] "Q" "P" "G" "A" "L" "L" "G" "D" "D" "Q" "I" "Y" "N" "V" "V" "V" "T" "A"
##  [37] "H" "A" "F" "V" "M" "I" "F" "F" "M" "V" "M" "P" "I" "M" "I" "G" "G" "F"
##  [55] "G" "N" "W" "L" "V" "P" "L" "M" "I" "G" "A" "P" "D" "M" "A" "F" "P" "R"
##  [73] "M" "N" "N" "M" "S" "F" "W" "L" "L" "P" "P" "S" "F" "L" "L" "L" "M" "A"
##  [91] "S" "S" "T" "V" "E" "A" "G" "V" "G" "T" "G" "W" "T" "V" "Y" "P" "P" "L"
## [109] "A" "G" "N" "L" "A" "H" "A" "G" "A" "S" "V" "D" "L" "A" "I" "F" "S" "L"
## [127] "H" "L" "A" "G" "I" "S" "S" "I" "L" "G" "A" "I" "N" "F" "I" "T" "T" "A"
## [145] "I" "N" "M" "K" "P" "P" "A" "L" "S" "Q" "Y" "Q" "T" "P" "L" "F" "V" "W"
## [163] "S" "V" "L" "I" "T" "A" "V" "L" "L" "L" "L" "S" "L" "P" "V" "L" "A" "A"
## [181] "G" "I" "T" "M" "L" "L" "T" "D" "R" "N" "L" "N" "T" "T" "F" "F" "D" "P"
## [199] "A" "G" "G" "G" "D" "P" "V" "L" "Y" "Q" "H" "L" "F" "W" "F" "F" "G" "H"
## [217] "P" "E" "V" "Y" "I" "L" "I" "L"
## attr(,"name")
## [1] "Seq6"
## attr(,"Annot")
## [1] ">Seq6"
## attr(,"class")
## [1] "SeqFastaAA"
## 
## $Seq7
##   [1] "V" "G" "T" "A" "L" "S" "L" "L" "I" "R" "A" "E" "L" "G" "Q" "P" "G" "T"
##  [19] "L" "L" "G" "D" "D" "Q" "I" "Y" "N" "V" "I" "V" "T" "A" "H" "A" "F" "V"
##  [37] "M" "I" "F" "F" "M" "V" "M" "P" "V" "M" "I" "G" "G" "F" "G" "N" "W" "L"
##  [55] "V" "P" "L" "M" "I" "G" "A" "P" "D" "M" "A" "F" "P" "R" "M" "N" "N" "M"
##  [73] "S" "F" "W" "L" "L" "P" "P" "S" "F" "L" "L" "L" "L" "A" "S" "S" "T" "V"
##  [91] "E" "A" "G" "A" "G" "T" "G" "W" "T" "V" "Y" "P" "P" "L" "A" "G" "N" "L"
## [109] "A" "H" "A" "G" "A" "S" "V" "D" "L" "A" "I" "F" "S" "L" "H" "L" "A" "G"
## [127] "V" "S" "S" "I" "L" "G" "A" "I" "N" "F" "I" "T" "T" "A" "I" "N" "M" "K"
## [145] "P" "P" "A" "L" "S" "Q" "Y" "Q" "T" "P" "L" "F" "V" "W" "S" "V" "L" "I"
## [163] "T" "A" "V" "L" "L" "L" "L" "S" "L" "P" "V" "L" "A" "A" "G" "I" "T" "M"
## [181] "L" "L" "T" "D" "R" "N" "L" "N" "T" "T" "F" "F" "D" "P" "A" "G" "G" "G"
## [199] "D" "P" "V" "L" "Y" "Q" "H" "L" "F" "W" "F" "F" "G" "H" "P" "E" "V" "Y"
## [217] "I" "L" "I" "L"
## attr(,"name")
## [1] "Seq7"
## attr(,"Annot")
## [1] ">Seq7"
## attr(,"class")
## [1] "SeqFastaAA"

Convert sequences to character strings

# Convert sequences to character strings
protein_sequences <- sapply(protein_fasta, function(x) paste(x, collapse = ""))
protein_sequences
##                                                                                                                                                                                                                                      Seq1 
## "LYLIFGAWAGMVGTALSLLIRAELGQPGTLLGDDQIYNVIVTAHAFVMIFFMVMPIMIGGFGNWLVPLMIGAPDMAFPRMNNMSFWLLPPSFLLLLASSTVEAGAGTGWTVYPPLAGNLAHAGASVDLAIFSLHLAGVSSILGAINFITTAINMKPPTLSQYQTPLFVWSVLITAVLLLLSLPVLAAGITMLLTDRNLNTTFFDPAGGGDPVLYQHLFWFFGHPEVYILIL" 
##                                                                                                                                                                                                                                      Seq2 
##            "VGTALXLLIRAELXQPGALLGDDQIYNVVVTAHAFVMIFFMVMPIMIGGFGNWLVPLMIGAPDMAFPRMNNMSFWLLPPSFLLLMASSTVEAGAGTGWTVYPPLAGNLAHAGASVDLAIFSLHLAGISSILGAINFITTAINMKPPALSQYQTPLFVWSVLITAVLLLLSLPVLAAGITMLLTDRNLNTTFFDPAGGGDPVLYQHLFWFFGHPEVYILIL" 
##                                                                                                                                                                                                                                      Seq3 
## "LYLIFGAWAGMVGTALSLLIRAELGQPGALLGDDQVYNVVVTAHAFVMIFFMVMPIMIGGFGNWLVPLMIGAPDMAFPRMNNMSFWLLPPSFLLLLASSTVEAGVGTGWTVYPPLAGNLAHAGASVDLAIFSLHLAGISSILGAINFITTAINMKPPALSQYQTPLFVWSVLITAVLLLLSLPVLAAGITMLLTDRNLNTTFFDPAGGGDPVLYQHLFWFFGHPEVYILIL" 
##                                                                                                                                                                                                                                      Seq4 
##        "WAGMVGTALSLLIRAELGQPGALLGDDQIYNVVXTAHAFVMIFFMVMPIMIGGFGNWLVPLMIGAPDMAFPRMNNMSFWLLPPSFLLLMASSTVEAGVGTGWTVYPPLAGNLAHAGASVDLAIFSLHLAGISSILGAINFITTAINMKPPALSQYQTPLFVWSVLITAVLLLLSLPVLAAGITMLLTDRNLNTTFFDPAGGGDPVLYQHLFWFFGHPEVYILIL" 
##                                                                                                                                                                                                                                      Seq5 
## "LYLIFGAWAGMVGTALSLLIRAELGQPGALLGDDQVYNVVVTAHAFVMIFFMVMPIMIGGFGNWLVPLMIGAPDMAFPRMNNMSFWLLPPSFLLLLASSTVEAGVGTGWTVYPPLAGNLAHAGASVDLAIFSLHLAGISSILGAINFITTAINMKPPALSQYQTPLFVWSVLITAVLLLLSLPVLAAGITMLLTDRNLNTTFFDPAGGGDPVLYQHLFWFFGHPEVYILIX" 
##                                                                                                                                                                                                                                      Seq6 
##        "WAGMVGTALSLLIRAELGQPGALLGDDQIYNVVVTAHAFVMIFFMVMPIMIGGFGNWLVPLMIGAPDMAFPRMNNMSFWLLPPSFLLLMASSTVEAGVGTGWTVYPPLAGNLAHAGASVDLAIFSLHLAGISSILGAINFITTAINMKPPALSQYQTPLFVWSVLITAVLLLLSLPVLAAGITMLLTDRNLNTTFFDPAGGGDPVLYQHLFWFFGHPEVYILIL" 
##                                                                                                                                                                                                                                      Seq7 
##            "VGTALSLLIRAELGQPGTLLGDDQIYNVIVTAHAFVMIFFMVMPVMIGGFGNWLVPLMIGAPDMAFPRMNNMSFWLLPPSFLLLLASSTVEAGAGTGWTVYPPLAGNLAHAGASVDLAIFSLHLAGVSSILGAINFITTAINMKPPALSQYQTPLFVWSVLITAVLLLLSLPVLAAGITMLLTDRNLNTTFFDPAGGGDPVLYQHLFWFFGHPEVYILIL"

Align the protein sequences using msa

protein_alignment <- msa(protein_sequences, type = "protein")
## use default substitution matrix
protein_alignment
## CLUSTAL 2.1  
## 
## Call:
##    msa(protein_sequences, type = "protein")
## 
## MsaAAMultipleAlignment with 7 rows and 231 columns
##     aln                                                    names
## [1] -------WAGMVGTALSLLIRAELGQ...GGGDPVLYQHLFWFFGHPEVYILIL Seq4
## [2] -------WAGMVGTALSLLIRAELGQ...GGGDPVLYQHLFWFFGHPEVYILIL Seq6
## [3] -----------VGTALXLLIRAELXQ...GGGDPVLYQHLFWFFGHPEVYILIL Seq2
## [4] LYLIFGAWAGMVGTALSLLIRAELGQ...GGGDPVLYQHLFWFFGHPEVYILIL Seq3
## [5] LYLIFGAWAGMVGTALSLLIRAELGQ...GGGDPVLYQHLFWFFGHPEVYILIX Seq5
## [6] LYLIFGAWAGMVGTALSLLIRAELGQ...GGGDPVLYQHLFWFFGHPEVYILIL Seq1
## [7] -----------VGTALSLLIRAELGQ...GGGDPVLYQHLFWFFGHPEVYILIL Seq7
## Con -------WAGMVGTALSLLIRAELGQ...GGGDPVLYQHLFWFFGHPEVYILIL Consensus

Convert the alignment to a compatible format

alignment_ape <- msaConvert(protein_alignment, type = "seqinr::alignment")
alignment_ape
## $nb
## [1] 7
## 
## $nam
## [1] "Seq4" "Seq6" "Seq2" "Seq3" "Seq5" "Seq1" "Seq7"
## 
## $seq
## [1] "-------WAGMVGTALSLLIRAELGQPGALLGDDQIYNVVXTAHAFVMIFFMVMPIMIGGFGNWLVPLMIGAPDMAFPRMNNMSFWLLPPSFLLLMASSTVEAGVGTGWTVYPPLAGNLAHAGASVDLAIFSLHLAGISSILGAINFITTAINMKPPALSQYQTPLFVWSVLITAVLLLLSLPVLAAGITMLLTDRNLNTTFFDPAGGGDPVLYQHLFWFFGHPEVYILIL"
## [2] "-------WAGMVGTALSLLIRAELGQPGALLGDDQIYNVVVTAHAFVMIFFMVMPIMIGGFGNWLVPLMIGAPDMAFPRMNNMSFWLLPPSFLLLMASSTVEAGVGTGWTVYPPLAGNLAHAGASVDLAIFSLHLAGISSILGAINFITTAINMKPPALSQYQTPLFVWSVLITAVLLLLSLPVLAAGITMLLTDRNLNTTFFDPAGGGDPVLYQHLFWFFGHPEVYILIL"
## [3] "-----------VGTALXLLIRAELXQPGALLGDDQIYNVVVTAHAFVMIFFMVMPIMIGGFGNWLVPLMIGAPDMAFPRMNNMSFWLLPPSFLLLMASSTVEAGAGTGWTVYPPLAGNLAHAGASVDLAIFSLHLAGISSILGAINFITTAINMKPPALSQYQTPLFVWSVLITAVLLLLSLPVLAAGITMLLTDRNLNTTFFDPAGGGDPVLYQHLFWFFGHPEVYILIL"
## [4] "LYLIFGAWAGMVGTALSLLIRAELGQPGALLGDDQVYNVVVTAHAFVMIFFMVMPIMIGGFGNWLVPLMIGAPDMAFPRMNNMSFWLLPPSFLLLLASSTVEAGVGTGWTVYPPLAGNLAHAGASVDLAIFSLHLAGISSILGAINFITTAINMKPPALSQYQTPLFVWSVLITAVLLLLSLPVLAAGITMLLTDRNLNTTFFDPAGGGDPVLYQHLFWFFGHPEVYILIL"
## [5] "LYLIFGAWAGMVGTALSLLIRAELGQPGALLGDDQVYNVVVTAHAFVMIFFMVMPIMIGGFGNWLVPLMIGAPDMAFPRMNNMSFWLLPPSFLLLLASSTVEAGVGTGWTVYPPLAGNLAHAGASVDLAIFSLHLAGISSILGAINFITTAINMKPPALSQYQTPLFVWSVLITAVLLLLSLPVLAAGITMLLTDRNLNTTFFDPAGGGDPVLYQHLFWFFGHPEVYILIX"
## [6] "LYLIFGAWAGMVGTALSLLIRAELGQPGTLLGDDQIYNVIVTAHAFVMIFFMVMPIMIGGFGNWLVPLMIGAPDMAFPRMNNMSFWLLPPSFLLLLASSTVEAGAGTGWTVYPPLAGNLAHAGASVDLAIFSLHLAGVSSILGAINFITTAINMKPPTLSQYQTPLFVWSVLITAVLLLLSLPVLAAGITMLLTDRNLNTTFFDPAGGGDPVLYQHLFWFFGHPEVYILIL"
## [7] "-----------VGTALSLLIRAELGQPGTLLGDDQIYNVIVTAHAFVMIFFMVMPVMIGGFGNWLVPLMIGAPDMAFPRMNNMSFWLLPPSFLLLLASSTVEAGAGTGWTVYPPLAGNLAHAGASVDLAIFSLHLAGVSSILGAINFITTAINMKPPALSQYQTPLFVWSVLITAVLLLLSLPVLAAGITMLLTDRNLNTTFFDPAGGGDPVLYQHLFWFFGHPEVYILIL"
## 
## $com
## [1] NA
## 
## attr(,"class")
## [1] "alignment"

Compute the distance matrix

dist_matrix <- dist.alignment(alignment_ape)
dist_matrix
##            Seq4       Seq6       Seq2       Seq3       Seq5       Seq1
## Seq6 0.00000000                                                       
## Seq2 0.06788442 0.06772855                                            
## Seq3 0.09470274 0.09449112 0.11730928                                 
## Seq5 0.09491580 0.09470274 0.11757927 0.00000000                      
## Seq1 0.16402997 0.16366342 0.15144563 0.16116459 0.16151457           
## Seq7 0.16552118 0.16514456 0.15144563 0.16514456 0.16552118 0.09534626

Construct the phylogenetic tree using NJ

phylo_tree <- nj(as.dist(dist_matrix))
phylo_tree
## 
## Phylogenetic tree with 7 tips and 5 internal nodes.
## 
## Tip labels:
##   Seq4, Seq6, Seq2, Seq3, Seq5, Seq1, ...
## 
## Unrooted; includes branch length(s).

Plot the phylogenetic tree

plot(phylo_tree, main = "Neighbor-Joining Phylogenetic Tree (Protein Sequences)")