This code summarizes information about the gene FADS1.
This gene encodes a protein that is part of the fatty acid desaturase gene family, and these genes regulate the unsaturation of fatty acids. Diseases associated with FADS1 include Lipid Metabolism Disorder and Fetal Akinesia Deformation Sequence 4.
This code also generates alignments and a phylogeneitc tree to indicating the evolutionary relationship betweeen the human version of the gene and its homologs in other species.
Key information use to make this script can be found here: * Refseq Gene: https://www.ncbi.nlm.nih.gov/gene/3992 * Refseq Homologene: https://www.ncbi.nlm.nih.gov/homologene/?term=Homo+sapiens+FADS1
Other interesting resources and online tools include: * REPPER: https://toolkit.tuebingen.mpg.de/jobs/7803820 * Sub-cellular locations prediction: https://wolfpsort.hgc.jp/
Download and load necessary packages from Bioconductor
library(BiocManager)
## Bioconductor version '3.13' is out-of-date; the current release version '3.14'
## is available with R version '4.1'; see https://bioconductor.org/install
library(drawProteins)
# github packages
library(compbio4all)
library(ggmsa)
## Registered S3 methods overwritten by 'ggalt':
## method from
## grid.draw.absoluteGrob ggplot2
## grobHeight.absoluteGrob ggplot2
## grobWidth.absoluteGrob ggplot2
## grobX.absoluteGrob ggplot2
## grobY.absoluteGrob ggplot2
# CRAN packages
library(rentrez)
library(seqinr)
library(ape)
##
## Attaching package: 'ape'
## The following objects are masked from 'package:seqinr':
##
## as.alignment, consensus
library(pander)
library(ggplot2)
# Bioconductor packages
library(drawProteins)
library(msa)
## Loading required package: Biostrings
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind, colnames,
## dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
## union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
## Loading required package: stats4
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
## Loading required package: IRanges
## Loading required package: XVector
## Loading required package: GenomeInfoDb
##
## Attaching package: 'Biostrings'
## The following object is masked from 'package:ape':
##
## complement
## The following object is masked from 'package:seqinr':
##
## translate
## The following object is masked from 'package:base':
##
## strsplit
##
## Attaching package: 'msa'
## The following object is masked from 'package:BiocManager':
##
## version
## Biostrings
library(Biostrings)
library(HGNChelper)
# RefSeq Uniprot PDB sci name common name gene name
fads1_table<-c("NP_037534" , "A0A0A0MR51","NA","Homo sapiens" , "Human", "FADS1",
"XP_001150290", "H2R2R9","NA","Pan troglodytes" , "Chimpansee", "FADS1",
"XP_002699331", "UPI0001D56C35","NA","Bos taurus" , "Hybrid Cattle", "FADS1",
"NP_666206", "UPI00000211B7","NA","Mus musculus" , "Mouse", "Fads1",
"NP_445897", "UPI00000E8549","NA","Rattus norvegicus" , "Rat", "Fads1",
"XP_421052", "UPI0000E805B3","NA","Gallus gallus" , "Chicken", "Fads1",
"XP_002943012", "UPI00034F8DFC","NA","Xenopus tropicalis" , "Western clawed frog", "fads1",
"XP_010709579", "UPI000938BD5F","NA","Meleagris gallopavo" , "Turkey", "FADS1",
"XP_009287621", "UPI0004F4B077","NA","Aptenodytes forsteri" , "Emperor penguin", "FADS1",
"XP_004714995", "UPI000333EC69","NA","Echinops telfairi" , "Hedgehog", "FADS1" )
# convert the table above into a matrix
fads1_matrix <- matrix(fads1_table, nrow = 10, byrow = TRUE)
# convert matrix into a dataframe
fads1_df <- as.data.frame(fads1_matrix)
# add column names to the dataframe
colnames(fads1_df) <- c("ncbi.protein.accession", "UniProt.id", "PDB", "species", "common.name")
The Finished Table
pander::pander(fads1_df)
| ncbi.protein.accession | UniProt.id | PDB | species |
|---|---|---|---|
| NP_037534 | A0A0A0MR51 | NA | Homo sapiens |
| XP_001150290 | H2R2R9 | NA | Pan troglodytes |
| XP_002699331 | UPI0001D56C35 | NA | Bos taurus |
| NP_666206 | UPI00000211B7 | NA | Mus musculus |
| NP_445897 | UPI00000E8549 | NA | Rattus norvegicus |
| XP_421052 | UPI0000E805B3 | NA | Gallus gallus |
| XP_002943012 | UPI00034F8DFC | NA | Xenopus tropicalis |
| XP_010709579 | UPI000938BD5F | NA | Meleagris gallopavo |
| XP_009287621 | UPI0004F4B077 | NA | Aptenodytes forsteri |
| XP_004714995 | UPI000333EC69 | NA | Echinops telfairi |
| common.name | NA |
|---|---|
| Human | FADS1 |
| Chimpansee | FADS1 |
| Hybrid Cattle | FADS1 |
| Mouse | Fads1 |
| Rat | Fads1 |
| Chicken | Fads1 |
| Western clawed frog | fads1 |
| Turkey | FADS1 |
| Emperor penguin | FADS1 |
| Hedgehog | FADS1 |
All sequences were downloaded using a wrapper compbio4all::entrez_fetch_list() which uses rentrez::entrez_fetch() to access NCBI databases.
fads1_list <- compbio4all::entrez_fetch_list(db = "protein",
id = fads1_df$ncbi.protein.accession,
rettype = "fasta")
Number of FASTA files obtained:
length(fads1_list)
## [1] 10
The first entry:
fads1_list[[1]]
## [1] ">NP_037534.5 acyl-CoA (8-3)-desaturase [Homo sapiens]\nMGTRAARPAGLPCGAENPARRRLALGARQQIHSWSPRTPSTRLTAPAGPARGVARPAMAPDPVAAETAAQ\nGPTPRYFTWDEVAQRSGCEERWLVIDRKVYNISEFTRRHPGGSRVISHYAGQDATDPFVAFHINKGLVKK\nYMNSLLIGELSPEQPSFEPTKNKELTDEFRELRATVERMGLMKANHVFFLLYLLHILLLDGAAWLTLWVF\nGTSFLPFLLCAVLLSAVQAQAGWLQHDFGHLSVFSTSKWNHLLHHFVIGHLKGAPASWWNHMHFQHHAKP\nNCFRKDPDINMHPFFFALGKILSVELGKQKKKYMPYNHQHKYFFLIGPPALLPLYFQWYIFYFVIQRKKW\nVDLAWMITFYVRFFLTYVPLLGLKAFLGLFFIVRFLESNWFVWVTQMNHIPMHIDHDRNMDWVSTQLQAT\nCNVHKSAFNDWFSGHLNFQIEHHLFPTMPRHNYHKVAPLVQSLCAKHGIEYQSKPLLSAFADIIHSLKES\nGQLWLDAYLHQ\n\n"
Remove FASTA Header
for(i in 1:length(fads1_list)){
fads1_list[[i]] <- compbio4all::fasta_cleaner(fads1_list[[i]], parse = F)
}
fads1_json <- drawProteins::get_features("A0A0A0MR51")
## [1] "Download has worked"
is(fads1_json)
## [1] "list" "vector" "list_OR_List" "vector_OR_Vector"
## [5] "vector_OR_factor"
Raw data is converted into a dataframe
fads1_draw_df <- drawProteins::feature_to_dataframe(fads1_json)
is(fads1_draw_df)
## [1] "data.frame" "list" "oldClass" "vector"
## [5] "list_OR_List" "vector_OR_Vector" "vector_OR_factor"
Making the drawing
my_canvas <- draw_canvas(fads1_draw_df)
my_canvas <- draw_chains(my_canvas, fads1_draw_df, label_size = 2.5)
my_canvas <- draw_recept_dom(my_canvas, fads1_draw_df)
my_canvas
Prepare data
fads1_vector <- compbio4all::fasta_cleaner(fads1_list[1], parse = T)
dotPlot(fads1_vector, fads1_vector)
Investigating Different Dotplot Settings
# set up 2 x 2 grid, and make margins
par(mfrow = c(2,2),
mar = c(0,0,2,1))
# plot 1: Defaults
dotPlot(fads1_vector, fads1_vector,
wsize = 1,
nmatch = 1,
main = "Default Settings")
# plot 2 size = 10, nmatch = 1
dotPlot(fads1_vector, fads1_vector,
wsize = 10,
nmatch = 1,
main = "size = 10, nmatch = 1")
# plot 3: size = 10, nmatch = 5
dotPlot(fads1_vector, fads1_vector,
wsize = 10,
nmatch = 5,
main = "size = 10, nmatch = 5")
# plot 4: size = 20, nmatch = 5
dotPlot(fads1_vector, fads1_vector,
wsize = 20,
nmatch = 5,
main = "size = 20, nmatch = 5")
# reset par() - run this or other plots will be small!
par(mfrow = c(1,1),
mar = c(4,4,4,4))
Best Settings Found
dotPlot(fads1_vector, fads1_vector,
wsize = 20,
nmatch = 5,
main = "Best Settings: size = 20, nmatch = 5")
Note: There was no information available on the presence of disorganized regions or tandem repeats.
| Source | Protein Property | Link |
|---|---|---|
| PFAM | There are two interesting domains. The first is a Cyt-b5 domain from AA 21-94. The second is a FA desaturase domain from AA 155-418. | http://pfam.xfam.org/protein/FADS1_HUMAN |
| Uniprot | FADS1 is a transmembrane protein found in the Endoplasmic reticulum and Mitochondrion | https://www.uniprot.org/uniprot/O60427 |
| Alphafold | There was no direct entry in alphafold for FADS1, but through analyzing the sequence, Alphafold predicts with high confidence that the protein coded for by FADS1 is made almost entirely of alpha helices | https://alphafold.ebi.ac.uk/entry/O60427 |
Subcellular Location - FADS1 is found in the endoplasmic reticulum and the mitochondrion.
First, we need the data from Chou and Zhang (1994) Table 5.
# get string of sequence
aa.1.1 <- c("A","R","N","D","C","Q","E","G","H","I",
"L","K","M","F","P","S","T","W","Y","V")
# alpha proteins
alpha <- c(285, 53, 97, 163, 22, 67, 134, 197, 111, 91,
221, 249, 48, 123, 82, 122, 119, 33, 63, 167)
# beta proteins
beta <- c(203, 67, 139, 121, 75, 122, 86, 297, 49, 120,
177, 115, 16, 85, 127, 341, 253, 44, 110, 229)
# alpha + beta
a.plus.b <- c(175, 78, 120, 111, 74, 74, 86, 171, 33, 93,
110, 112, 25, 52, 71, 126, 117, 30, 108, 123)
# alpha/beta
a.div.b <- c(361, 146, 183, 244, 63, 114, 257, 377, 107, 239,
339, 321, 91, 158, 188, 327, 238, 72, 130, 378)
pander(data.frame(aa.1.1, alpha, beta, a.plus.b, a.div.b))
| aa.1.1 | alpha | beta | a.plus.b | a.div.b |
|---|---|---|---|---|
| A | 285 | 203 | 175 | 361 |
| R | 53 | 67 | 78 | 146 |
| N | 97 | 139 | 120 | 183 |
| D | 163 | 121 | 111 | 244 |
| C | 22 | 75 | 74 | 63 |
| Q | 67 | 122 | 74 | 114 |
| E | 134 | 86 | 86 | 257 |
| G | 197 | 297 | 171 | 377 |
| H | 111 | 49 | 33 | 107 |
| I | 91 | 120 | 93 | 239 |
| L | 221 | 177 | 110 | 339 |
| K | 249 | 115 | 112 | 321 |
| M | 48 | 16 | 25 | 91 |
| F | 123 | 85 | 52 | 158 |
| P | 82 | 127 | 71 | 188 |
| S | 122 | 341 | 126 | 327 |
| T | 119 | 253 | 117 | 238 |
| W | 33 | 44 | 30 | 72 |
| Y | 63 | 110 | 108 | 130 |
| V | 167 | 229 | 123 | 378 |
Then, we need to convert the data to frequencies.
# calculate proportions
alpha.prop <- alpha/sum(alpha)
beta.prop <- beta/sum(beta)
a.plus.b.prop <- a.plus.b/sum(a.plus.b)
a.div.b <- a.div.b/sum(a.div.b)
aa.prop <- data.frame(alpha.prop,
beta.prop,
a.plus.b.prop,
a.div.b)
#row labels
row.names(aa.prop) <- aa.1.1
pander::pander(aa.prop)
| alpha.prop | beta.prop | a.plus.b.prop | a.div.b | |
|---|---|---|---|---|
| A | 0.1165 | 0.07313 | 0.09264 | 0.08331 |
| R | 0.02166 | 0.02414 | 0.04129 | 0.03369 |
| N | 0.03964 | 0.05007 | 0.06353 | 0.04223 |
| D | 0.06661 | 0.04359 | 0.05876 | 0.05631 |
| C | 0.008991 | 0.02702 | 0.03917 | 0.01454 |
| Q | 0.02738 | 0.04395 | 0.03917 | 0.02631 |
| E | 0.05476 | 0.03098 | 0.04553 | 0.05931 |
| G | 0.08051 | 0.107 | 0.09052 | 0.08701 |
| H | 0.04536 | 0.01765 | 0.01747 | 0.02469 |
| I | 0.03719 | 0.04323 | 0.04923 | 0.05516 |
| L | 0.09031 | 0.06376 | 0.05823 | 0.07824 |
| K | 0.1018 | 0.04143 | 0.05929 | 0.07408 |
| M | 0.01962 | 0.005764 | 0.01323 | 0.021 |
| F | 0.05027 | 0.03062 | 0.02753 | 0.03646 |
| P | 0.03351 | 0.04575 | 0.03759 | 0.04339 |
| S | 0.04986 | 0.1228 | 0.0667 | 0.07547 |
| T | 0.04863 | 0.09114 | 0.06194 | 0.05493 |
| W | 0.01349 | 0.01585 | 0.01588 | 0.01662 |
| Y | 0.02575 | 0.03963 | 0.05717 | 0.03 |
| V | 0.06825 | 0.08249 | 0.06511 | 0.08724 |
First, we need to get the sequence for FADS1, clean it, and find out how many of each amino acid there are.
# download
NP_037534 <- rentrez::entrez_fetch(id = "NP_037534.3",
db = "protein",
rettype = "fasta")
# clean and turn into vector
NP_037534 <- compbio4all::fasta_cleaner(NP_037534, parse = TRUE)
# get frequency table
NP_037534.freq.table <- table(NP_037534)/length(NP_037534)
Function to convert a table into a vector.
table_to_vector <- function(table_x){
table_names <- attr(table_x, "dimnames")[[1]]
table_vect <- as.vector(table_x)
names(table_vect) <- table_names
return(table_vect)
}
FADS1.human.aa.freq <- table_to_vector(NP_037534.freq.table)
Check for presence of U, an unknown amino acid.
aa.names <- names(FADS1.human.aa.freq)
i.U <- which(aa.names == "U")
aa.names[i.U]
## character(0)
There are no U in data, but lets just make sure.
FADS1.human.aa.freq[i.U]
## named numeric(0)
Add data on my focal protein to the amino acid frequency table.
aa.prop$FADS1.human.aa.freq <- FADS1.human.aa.freq
pander::pander(aa.prop)
| alpha.prop | beta.prop | a.plus.b.prop | a.div.b | FADS1.human.aa.freq | |
|---|---|---|---|---|---|
| A | 0.1165 | 0.07313 | 0.09264 | 0.08331 | 0.08583 |
| R | 0.02166 | 0.02414 | 0.04129 | 0.03369 | 0.01198 |
| N | 0.03964 | 0.05007 | 0.06353 | 0.04223 | 0.03393 |
| D | 0.06661 | 0.04359 | 0.05876 | 0.05631 | 0.03593 |
| C | 0.008991 | 0.02702 | 0.03917 | 0.01454 | 0.07385 |
| Q | 0.02738 | 0.04395 | 0.03917 | 0.02631 | 0.05589 |
| E | 0.05476 | 0.03098 | 0.04553 | 0.05931 | 0.06188 |
| G | 0.08051 | 0.107 | 0.09052 | 0.08701 | 0.04192 |
| H | 0.04536 | 0.01765 | 0.01747 | 0.02469 | 0.0499 |
| I | 0.03719 | 0.04323 | 0.04923 | 0.05516 | 0.1138 |
| L | 0.09031 | 0.06376 | 0.05823 | 0.07824 | 0.02595 |
| K | 0.1018 | 0.04143 | 0.05929 | 0.07408 | 0.03593 |
| M | 0.01962 | 0.005764 | 0.01323 | 0.021 | 0.06387 |
| F | 0.05027 | 0.03062 | 0.02753 | 0.03646 | 0.04391 |
| P | 0.03351 | 0.04575 | 0.03759 | 0.04339 | 0.0519 |
| S | 0.04986 | 0.1228 | 0.0667 | 0.07547 | 0.0519 |
| T | 0.04863 | 0.09114 | 0.06194 | 0.05493 | 0.04192 |
| W | 0.01349 | 0.01585 | 0.01588 | 0.01662 | 0.05389 |
| Y | 0.02575 | 0.03963 | 0.05717 | 0.03 | 0.03393 |
| V | 0.06825 | 0.08249 | 0.06511 | 0.08724 | 0.03194 |
Two custom functions are needed: one to calculate correlates between two columns of our table, and one to calculate correlation similarities.
# Corrleation used in Chou adn Zhange 1992.
chou_cor <- function(x,y){
numerator <- sum(x*y)
denominator <- sqrt((sum(x^2))*(sum(y^2)))
result <- numerator/denominator
return(result)
}
# Cosine similarity used in Higgs and Attwood (2005).
chou_cosine <- function(z.1, z.2){
z.1.abs <- sqrt(sum(z.1^2))
z.2.abs <- sqrt(sum(z.2^2))
my.cosine <- sum(z.1*z.2)/(z.1.abs*z.2.abs)
return(my.cosine)
}
Calculate correlation between each column
corr.alpha <- chou_cor(aa.prop[,5], aa.prop[,1])
corr.beta <- chou_cor(aa.prop[,5], aa.prop[,2])
corr.apb <- chou_cor(aa.prop[,5], aa.prop[,3])
corr.adb <- chou_cor(aa.prop[,5], aa.prop[,4])
Calculate cosine similarity
cos.alpha <- chou_cosine(aa.prop[,5], aa.prop[,1])
cos.beta <- chou_cosine(aa.prop[,5], aa.prop[,2])
cos.apb <- chou_cosine(aa.prop[,5], aa.prop[,3])
cos.adb <- chou_cosine(aa.prop[,5], aa.prop[,4])
Calculate distance. Note: we need to flip the dataframe on its side using a command called t()
aa.prop.flipped <- t(aa.prop)
round(aa.prop.flipped,2)
## A R N D C Q E G H I L K
## alpha.prop 0.12 0.02 0.04 0.07 0.01 0.03 0.05 0.08 0.05 0.04 0.09 0.10
## beta.prop 0.07 0.02 0.05 0.04 0.03 0.04 0.03 0.11 0.02 0.04 0.06 0.04
## a.plus.b.prop 0.09 0.04 0.06 0.06 0.04 0.04 0.05 0.09 0.02 0.05 0.06 0.06
## a.div.b 0.08 0.03 0.04 0.06 0.01 0.03 0.06 0.09 0.02 0.06 0.08 0.07
## FADS1.human.aa.freq 0.09 0.01 0.03 0.04 0.07 0.06 0.06 0.04 0.05 0.11 0.03 0.04
## M F P S T W Y V
## alpha.prop 0.02 0.05 0.03 0.05 0.05 0.01 0.03 0.07
## beta.prop 0.01 0.03 0.05 0.12 0.09 0.02 0.04 0.08
## a.plus.b.prop 0.01 0.03 0.04 0.07 0.06 0.02 0.06 0.07
## a.div.b 0.02 0.04 0.04 0.08 0.05 0.02 0.03 0.09
## FADS1.human.aa.freq 0.06 0.04 0.05 0.05 0.04 0.05 0.03 0.03
Get the distance matrix
dist(aa.prop.flipped, method = "euclidean")
## alpha.prop beta.prop a.plus.b.prop a.div.b
## beta.prop 0.13342098
## a.plus.b.prop 0.09281824 0.08289406
## a.div.b 0.06699039 0.08659174 0.06175113
## FADS1.human.aa.freq 0.16832404 0.17524935 0.14129167 0.15102886
Individual Distances using dist()
dist.alpha <- dist((aa.prop.flipped[c(1,5),]), method = "euclidean")
dist.beta <- dist((aa.prop.flipped[c(2,5),]), method = "euclidean")
dist.apb <- dist((aa.prop.flipped[c(3,5),]), method = "euclidean")
dist.adb <- dist((aa.prop.flipped[c(4,5),]), method = "euclidean")
Compile the information. Rounding makes it easier to read
# fold types
fold.type <- c("alpha","beta","alpha plus beta", "alpha/beta")
# data
corr.sim <- round(c(corr.alpha,corr.beta,corr.apb,corr.adb),5)
cosine.sim <- round(c(cos.alpha,cos.beta,cos.apb,cos.adb),5)
Euclidean.dist <- round(c(dist.alpha,dist.beta,dist.apb,dist.adb),5)
# summary
sim.sum <- c("","","most.sim","")
dist.sum <- c("","","min.dist","")
df <- data.frame(fold.type,
corr.sim ,
cosine.sim ,
Euclidean.dist ,
sim.sum ,
dist.sum )
Display output
pander::pander(df)
| fold.type | corr.sim | cosine.sim | Euclidean.dist | sim.sum | dist.sum |
|---|---|---|---|---|---|
| alpha | 0.7771 | 0.7771 | 0.1683 | ||
| beta | 0.7622 | 0.7622 | 0.1752 | ||
| alpha plus beta | 0.8321 | 0.8321 | 0.1413 | most.sim | min.dist |
| alpha/beta | 0.8118 | 0.8118 | 0.151 |
Convert all FASTA records intro entries in a single vector. FASTA entries are contained in a list produced at the beginning of the script. They were cleaned to remove the header and newline characters.
names(fads1_list)
## [1] "NP_037534" "XP_001150290" "XP_002699331" "NP_666206" "NP_445897"
## [6] "XP_421052" "XP_002943012" "XP_010709579" "XP_009287621" "XP_004714995"
length(fads1_list)
## [1] 10
Each entry is a full entry with no spaces or parsing, and no header.
fads1_list[1]
## $NP_037534
## [1] "MGTRAARPAGLPCGAENPARRRLALGARQQIHSWSPRTPSTRLTAPAGPARGVARPAMAPDPVAAETAAQGPTPRYFTWDEVAQRSGCEERWLVIDRKVYNISEFTRRHPGGSRVISHYAGQDATDPFVAFHINKGLVKKYMNSLLIGELSPEQPSFEPTKNKELTDEFRELRATVERMGLMKANHVFFLLYLLHILLLDGAAWLTLWVFGTSFLPFLLCAVLLSAVQAQAGWLQHDFGHLSVFSTSKWNHLLHHFVIGHLKGAPASWWNHMHFQHHAKPNCFRKDPDINMHPFFFALGKILSVELGKQKKKYMPYNHQHKYFFLIGPPALLPLYFQWYIFYFVIQRKKWVDLAWMITFYVRFFLTYVPLLGLKAFLGLFFIVRFLESNWFVWVTQMNHIPMHIDHDRNMDWVSTQLQATCNVHKSAFNDWFSGHLNFQIEHHLFPTMPRHNYHKVAPLVQSLCAKHGIEYQSKPLLSAFADIIHSLKESGQLWLDAYLHQ"
To convert each entry into a vector, we can just use the fasta cleaner function.
fads1_vector <- fads1_list
for(i in 1:length(fads1_vector)){
fads1_vector[[i]] <- fasta_cleaner(fads1_vector[[i]], parse = T)
}
Pairwise Alignments for human(1), chimpanzee(2), mouse(4), and chicken(6).
align01.02 <- pairwiseAlignment( fads1_list[[1]], fads1_list[[2]] )
align01.04 <- pairwiseAlignment( fads1_list[[1]], fads1_list[[4]] )
align01.06 <- pairwiseAlignment( fads1_list[[1]], fads1_list[[6]] )
align02.04 <- pairwiseAlignment( fads1_list[[2]], fads1_list[[4]] )
align02.06 <- pairwiseAlignment( fads1_list[[2]], fads1_list[[6]] )
align04.06 <- pairwiseAlignment( fads1_list[[4]], fads1_list[[6]] )
Biostrings::pid(align01.02)
## [1] 99.8004
Biostrings::pid(align01.04)
## [1] 79.28287
Biostrings::pid(align01.06)
## [1] 71.81996
Biostrings::pid(align02.04)
## [1] 79.28287
Biostrings::pid(align02.06)
## [1] 71.56673
Biostrings::pid(align04.06)
## [1] 70.27559
Build Matrix
pids <- c(1, NA, NA, NA,
pid(align01.02), 1, NA, NA,
pid(align01.04), pid(align02.04), 1, NA,
pid(align01.06), pid(align02.06), pid(align04.06), 1)
mat <- matrix(pids, nrow = 4, byrow = T)
row.names(mat) <- c("Human","Chimpanzee","Mouse","Chicken")
colnames(mat) <- c("Human","Chimpanzee","Mouse","Chicken")
pander::pander(mat)
| Human | Chimpanzee | Mouse | Chicken | |
|---|---|---|---|---|
| Human | 1 | NA | NA | NA |
| Chimpanzee | 99.8 | 1 | NA | NA |
| Mouse | 79.28 | 79.28 | 1 | NA |
| Chicken | 71.82 | 71.57 | 70.28 | 1 |
A comparison of PID calculation methods for the human vs chimpanzee alignment.
method <- c("PID1", "PID2","PID3","PID4")
denominator <- c("(aligned positions + internal gap positions)",
"(aligned positions)",
"(length shorter sequence)",
"(average length of the two sequences)")
PID <- c(pid(align01.02, type = "PID1"),
pid(align01.02, type = "PID2"),
pid(align01.02, type = "PID3"),
pid(align01.02, type = "PID4")
)
# make the dataframe
pid_comparison <- data.frame(method, PID, denominator)
# display the dataframe
pander::pander(pid_comparison)
| method | PID | denominator |
|---|---|---|
| PID1 | 99.8 | (aligned positions + internal gap positions) |
| PID2 | 99.8 | (aligned positions) |
| PID3 | 99.8 | (length shorter sequence) |
| PID4 | 99.8 | (average length of the two sequences) |
For use with R bioinformatics tools we need to convert our named vector to a string set using Biostrings::AAStringSet(). Note the _ss tag at the end of the object we’re assigning the output to, which designates this as a string set.
fads1_vector_ss <- Biostrings::AAStringSet(unlist(fads1_list))
fads1_align <- msa(fads1_vector_ss,
method = "ClustalW")
## use default substitution matrix
msa produces a species MSA objects
class(fads1_align)
## [1] "MsaAAMultipleAlignment"
## attr(,"package")
## [1] "msa"
is(fads1_align)
## [1] "MsaAAMultipleAlignment" "AAMultipleAlignment" "MsaMetaData"
## [4] "MultipleAlignment"
Default output of MSA
fads1_align
## CLUSTAL 2.1
##
## Call:
## msa(fads1_vector_ss, method = "ClustalW")
##
## MsaAAMultipleAlignment with 10 rows and 508 columns
## aln names
## [1] MLNARRPHAAPEGAGARQRGPMGVE...LTAFADIVYSLKDSGELWLDAYLHK XP_421052
## [2] -------------------------...LTAFADIVYSLKDSGELWLDAYLHK XP_010709579
## [3] -------------------------...LTAFADIVHSLKDSGDLWLDAYLHK XP_009287621
## [4] -MGTRAARPAGLPCGAENPARRRLA...LSAFADIIHSLKESGQLWLDAYLHQ NP_037534
## [5] -MGTRAARPAGLPCGAENPALRRLA...LSAFADIIHSLKESGQLWLDAYLHQ XP_001150290
## [6] -------------------------...FSAFADIVHSLKESGQLWLDAYLHQ XP_002699331
## [7] -------------------------...LTAFADIVYSLKESGQLWLDAYLHQ NP_666206
## [8] -------------------------...LTAFADIVYSLKESGQLWLDAYLHQ NP_445897
## [9] -------------------------...LSAFADIVYSLKESGQLWLDAYLHQ XP_004714995
## [10] -------------------------...FTAFADIVHSLRESGELWLDAYLHK XP_002943012
## Con -------------------------...LTAFADIV?SLKESGQLWLDAYLHQ Consensus
We need to change the class of the alignment
class(fads1_align)
## [1] "MsaAAMultipleAlignment"
## attr(,"package")
## [1] "msa"
class(fads1_align) <- "AAMultipleAlignment"
class(fads1_align)
## [1] "AAMultipleAlignment"
Convert to seqinr format
fads1_align_seqinr <- msaConvert(fads1_align,
type = "seqinr::alignment")
class(fads1_align_seqinr)
## [1] "alignment"
MSA output
compbio4all::print_msa(fads1_align_seqinr)
## [1] "MLNARRPHAAPEGAGARQRGPMGVELRREGRVLASRRANERRASGAARNRRLRPSAAGGL 0"
## [1] "------------------------------------------------------------ 0"
## [1] "------------------------------------------------------------ 0"
## [1] "-MGTRAARPAGLPCGAENPARRRLALGARQQIHSWSPRTPSTRLTAPAGPARGVARPAMA 0"
## [1] "-MGTRAARPAGLPCGAENPALRRLALGARQQIHSWSPRTPSTRLTAPAGPARGVARPAMA 0"
## [1] "----------------------------------------------------------MA 0"
## [1] "----------------------------------------------------------MA 0"
## [1] "----------------------------------------------------------MA 0"
## [1] "----------------------------------------------------------MA 0"
## [1] "------------------------------------------------------------ 0"
## [1] " "
## [1] "GPAVGRAMEERGAEPEQRRFTWEEIAQRTGRGPAADERWLVIDRKVYDISRFHRRHPGGS 0"
## [1] "------------------------------------------------------------ 0"
## [1] "------------------------MIQR-----------IQLQMIVYGVKK--------- 0"
## [1] "PDPVAA--ETAAQGPTPRYFTWDEVAQRSGCE----ERWLVIDRKVYNISEFTRRHPGGS 0"
## [1] "PDPVAA--ETAAQGPTPRYFTWDEVAQRSGCE----ERWLVIDRKVYNISEFTRRHPGGS 0"
## [1] "P-------ETPAQGPTPRYFTWDEVAQRSGREK---ERWLVIDRKVYNISEFVRRHPGGS 0"
## [1] "PDPVPTPGPASAQLRQTRYFTWEEVAQRSGREK---ERWLVIDRKVYNISDFSRRHPGGS 0"
## [1] "PDPVQTPDPASAQLRQMRYFTWEEVAQRSGREK---ERWLVIDRKVYNISDFSRRHPGGS 0"
## [1] "PDPKAA--EPPVVGSGQRYFTWDEVAQRSGREK---ERWLVIDRKVYNISEFVRRHPGGS 0"
## [1] "----------MGSTKELTCYTWEEVKKRCTRE----ERWLVINRKVYDITRFVNIHPGGP 0"
## [1] " "
## [1] "RVISHYAGQDATDPFIAFHLDKTLVKKYMSPLLIGELAPDQPSFEPSKNKKLVEDFRELR 0"
## [1] "----------------------------MSPLLIGELAPDQPSFEPSKNKKLVEDFRELR 0"
## [1] "-----------ADPFIAFHLDKALVRKYMSPLLIGELAPDQPSFEPSKNKKLVEDFRELR 0"
## [1] "RVISHYAGQDATDPFVAFHINKGLVKKYMNSLLIGELSPEQPSFEPTKNKELTDEFRELR 0"
## [1] "RVISHYAGQDATDPFVAFHINKGLVKKYMNSLLIGELSPEQPSFEPTKNKELTDEFRELR 0"
## [1] "RVISHYAGQDATDPFVAFHINKGLVRKYMNSLLIGELSPEQPSFEPTKNKELIHEFRELR 0"
## [1] "RVISHYAGQDATDPFVAFHINKGLVRKYMNSLLIGELAPEQPSFEPTKNKALTDEFRELR 0"
## [1] "RVISHYAGQDATDPFVAFHINKGLVRKYMNSLLIGELAPEQPSFEPTKNKALTDEFRELR 0"
## [1] "RVISHYAGQDATDPFTAFHIDKALVRKYMNSLLIGELSPEQPSFEPTKNKELVDEFRELR 0"
## [1] "RVISHYAGQDATDPFVAFHIDQELVKKRMCCLLIGELAPGEPSIEPFKDAAMVEDFRALR 0"
## [1] " "
## [1] "ATVEKMGLLKPNRTFFLLHLCHILALDVAAWLTIWYFGSSTVPFLFSALLLGTVQAQAGW 0"
## [1] "ATVEKMGLLKPNRTFFLLHLCHILVLDVAAWLTIWYFGSSTMPFLFSALLLGTVQAQAGW 0"
## [1] "ATVEKMGLLNPNRTFFILYLCHILVLDVAAWLTIWYFGASTVPFLFSAVLLGTVQAQAGW 0"
## [1] "ATVERMGLMKANHVFFLLYLLHILLLDGAAWLTLWVFGTSFLPFLLCAVLLSAVQAQAGW 0"
## [1] "ATVERMGLMKANHVFFLLYLLHILLLDGAAWLTLWVFGTSFLPFLLCAVLLSAVQAQAGW 0"
## [1] "ATVERMGLMKANPVFFLLYLLHILLLDVAAWLTLWLFGTSLVPFLLCSVLLSIVQAQAGW 0"
## [1] "ATVERMGLMKANHLFFLVYLLHILLLDVAAWLTLWIFGTSLVPFILCAVLLSTVQAQAGW 0"
## [1] "ATVERMGLMKANHLFFLFYLLHILLLDVAAWLTLWIFGTSLVPFTLCAVLLSTVQAQAGW 0"
## [1] "ASVEQMGLMKPNLGFFLLYLLHILLMDVMAWLVLWSFGMSLVPFLLSAVLLSAVQAQAGW 0"
## [1] "TTVEQMGLFHPSKLFFFVTLLHVLLLDVLAYVTMYYGGTSLISLLVTALLLATVQAQAGW 0"
## [1] " "
## [1] "LQHDFGHLSVFSESKWNHWVHKFVIGHLKGAPASWWNHLHFQHHAKPNCFRKDPDVNMHP 0"
## [1] "LQHDFGHLSVFSKSKWNHWVHKFVIGHLKGAPASWWNHLHFQHHAKPNCFRKDPDVNMHP 0"
## [1] "LQHDFGHLSVFSKSRWNHWVHKFVIGHLKGAPASWWNHLHFQHHAKPNCFRKDPDVNMHP 0"
## [1] "LQHDFGHLSVFSTSKWNHLLHHFVIGHLKGAPASWWNHMHFQHHAKPNCFRKDPDINMHP 0"
## [1] "LQHDFGHLSVFSTSKWNHLLHHFVIGHLKGAPASWWNHMHFQHHAKPNCFRKDPDINMHP 0"
## [1] "LQHDFGHLSVFGTSKWNHLLHHFVIGHLKGAPASWWNHLHFQHHAKPNCFRKDPDINMHP 0"
## [1] "LQHDFGHLSVFGTSTWNHLLHHFVIGHLKGAPASWWNHMHFQHHAKPNCFRKDPDINMHP 0"
## [1] "LQHDFGHLSVFSTSTWNHLVHHFVIGHLKGAPASWWNHMHFQHHAKPNCFRKDPDINMHP 0"
## [1] "LQHDFGHLSVFSTSKWNHLIHHFVIGHLKGAPASWWNHMHFQHHAKPNCFRKDPDLNMHP 0"
## [1] "LQHDFGHLSVFSRSSWNHVVHQFVIGHLKGAPASWWNHLHFQHHAKPNCFRKDPDINMHP 0"
## [1] " "
## [1] "LFFALGKKLSVELGEQKKKFMPYNHQHKYFFIIGPPALVPLYFQWYIFYFVVQRKQWVDL 0"
## [1] "LFFALGKKLSVELGEQKKKFMPYNHQHKYFFIIGPPALVPLYFQWYIFYFVVQRKKWVDL 0"
## [1] "LFFALGKTLSVELGVQKKKFMPYNHQHKYFFIIGPPALVPLYFQWYIFYFVVQRKQWVDL 0"
## [1] "FFFALGKILSVELGKQKKKYMPYNHQHKYFFLIGPPALLPLYFQWYIFYFVIQRKKWVDL 0"
## [1] "FFFALGKILSVELGKQKKKYMPYNHQHKYFFLIGPPALLPLYFQWYIFYFVIQRKKWVDL 0"
## [1] "FFFALGKVLSVELGKQKKKYMPYNHQHKYFFLIGPPALLPVYFQWYIFYFVIHRKKWVDL 0"
## [1] "LFFALGKVLPVELGREKKKHMPYNHQHKYFFLIGPPALLPLYFQWYIFYFVVQRKKWVDL 0"
## [1] "LFFALGKVLSVELGKEKKKHMPYNHQHKYFFLIGPPALLPLYFQWYIFYFVVQRKKWVDL 0"
## [1] "FFFTLGKILSVELGKQKKKYMPYNHQHKYFCLIGPPALVIFYFQWYIFYFAVQRKKWVDL 0"
## [1] "LLFALGKKLSVELGMKKKKYMPYNHQHKYFFFIGPPALIPVYFQWYIFYFAIRRKKWADL 0"
## [1] " "
## [1] "AWMLTFYIRFFLTYLPLLGVKGILGLHLLVRFIESNWFVWITQMNHIPMHIDYDKNVDWF 0"
## [1] "AWMLTFYIRFFLTYLPLLGVKGILGLHLLVRFIESNWFVWITQMNHIPMHIDYDKNVDWF 0"
## [1] "AWMLTFYIRFFLTYLPLLGVKGILGLHLLVRFIESNWFVWITQMNHIPMHIDYDKNVDWF 0"
## [1] "AWMITFYVRFFLTYVPLLGLKAFLGLFFIVRFLESNWFVWVTQMNHIPMHIDHDRNMDWV 0"
## [1] "AWMITFYVRFFLTYVPLLGLKAFLGLFFIVRFLESNWFVWVTQMNHIPMHIDHDRNMDWV 0"
## [1] "AWMITFYVRIFLTYVPLLGLKGFLGLVFMVRFLESNWFVWVTQMNHIPMHIDHDRNMDWV 0"
## [1] "AWMLSFYARIFFTYMPLLGLKGFLGLFFIVRFLESNWFVWVTQMNHIPMHIDHDRNVDWV 0"
## [1] "AWMLSFYVRVFFTYMPLLGLKGLLCLFFIVRFLESNWFVWVTQMNHIPMHIDHDRNVDWV 0"
## [1] "VWMLSFYARMALAYVPLVGLKGFLGLFLLVRFLESHWFVWVTQMNHIPMHIEYDQNKDWL 0"
## [1] "AWMISFYVRFGLCYIPFLGVSGTIALFMVVRFIESNWFVWVTQMNHIPMNIDYDQNKEWL 0"
## [1] " "
## [1] "STQLQATCNVRQSLFNDWFSGHLNFQIEHHLFPTMPRHNYWKVAPLVKSLCAKHGIEYHC 0"
## [1] "STQLQATCNVRQSFFNDWFSGHLNFQIEHHLFPTMPRHNYWKVAPLVKSLCAKHGIEYHC 0"
## [1] "STQLQATCNVHQSLFNDWFSGHLNFQIEHHLFPTMPRHNYWKVAPLVKSLCAKHGIEYQC 0"
## [1] "STQLQATCNVHKSAFNDWFSGHLNFQIEHHLFPTMPRHNYHKVAPLVQSLCAKHGIEYQS 0"
## [1] "STQLQATCNVHKSAFNDWFSGHLNFQIEHHLFPTMPRHNYHKVAPLVQSLCAKHGIEYQS 0"
## [1] "STQLQATCNVHKSAFNDWFSGHLNFQIEHHLFPTMPRHNYHKVAPLVQSLCAKHGIKYQS 0"
## [1] "STQLQATCNVHQSAFNNWFSGHLNFQIEHHLFPTMPRHNYHKVAPLVQSLCAKYGIKYES 0"
## [1] "STQLQATCNVHQSAFNNWFSGHLNFQIEHHLFPTMPRHNYHKVAPLVQSLCAKYGIKYES 0"
## [1] "STQLQATCNVHKSFFNDWFSGHLNFQIEHHLFPTMPRHNYHKVAPLVRSLCTKHGIKYQS 0"
## [1] "STQLQATCNVDQSLFNDWFSGHLNFQIEHHLFPTMPRHNYWKAAPLVRSLCKKYGIEYQS 0"
## [1] " "
## [1] "KPLLTAFADIVYSLKDSGELWLDAYLHK 32"
## [1] "KPLLTAFADIVYSLKDSGELWLDAYLHK 32"
## [1] "KPLLTAFADIVHSLKDSGDLWLDAYLHK 32"
## [1] "KPLLSAFADIIHSLKESGQLWLDAYLHQ 32"
## [1] "KPLLSAFADIIHSLKESGQLWLDAYLHQ 32"
## [1] "KPLFSAFADIVHSLKESGQLWLDAYLHQ 32"
## [1] "KPLLTAFADIVYSLKESGQLWLDAYLHQ 32"
## [1] "KPLLTAFADIVYSLKESGQLWLDAYLHQ 32"
## [1] "KPLLSAFADIVYSLKESGQLWLDAYLHQ 32"
## [1] "KPLFTAFADIVHSLRESGELWLDAYLHK 32"
## [1] " "
Only displaying a small portion of the MSA so that it is readable.
ggmsa::ggmsa(fads1_align, start=100, end=150)
Make a distance matrix
fads1_dist <- seqinr::dist.alignment(fads1_align_seqinr,
matrix = "identity")
This produces a “dist” class object.
is(fads1_dist)
## [1] "dist" "oldClass"
class(fads1_dist)
## [1] "dist"
Round for display
fads1_align_seqinr_rnd <- round(fads1_dist, 3)
fads1_align_seqinr_rnd
## XP_421052 XP_010709579 XP_009287621 NP_037534 XP_001150290
## XP_010709579 0.118
## XP_009287621 0.272 0.204
## NP_037534 0.544 0.438 0.468
## XP_001150290 0.544 0.438 0.468 0.045
## XP_002699331 0.467 0.447 0.471 0.239 0.239
## NP_666206 0.473 0.438 0.468 0.332 0.332
## NP_445897 0.468 0.431 0.462 0.329 0.329
## XP_004714995 0.481 0.453 0.479 0.371 0.371
## XP_002943012 0.506 0.489 0.522 0.532 0.532
## XP_002699331 NP_666206 NP_445897 XP_004714995
## XP_010709579
## XP_009287621
## NP_037534
## XP_001150290
## XP_002699331
## NP_666206 0.323
## NP_445897 0.330 0.171
## XP_004714995 0.366 0.408 0.408
## XP_002943012 0.526 0.528 0.521 0.539
Building a phylogenetic tree from the distance matrix
# Note - not using rounded values
tree_subset <- nj(fads1_dist)
# plot tree
plot.phylo(tree_subset, main="Phylogenetic Tree",
#type = "unrooted",
use.edge.length = F)
# add label
mtext(text = "FADS1 family gene tree")