This code compiles summary information about the gene EGLN1 (Egl-9 Family Hypoxia Inducible Factor 1).
Links: RefSeq page: https://www.ncbi.nlm.nih.gov/gene/54583 Homologene page: https://www.ncbi.nlm.nih.gov/homologene/56936 UniProt page: https://www.uniprot.org/uniprot/F6P6J7 PDB page: NA
To compare the evolutionary relationship between the human version of EGLN1 and its homologs in other species, this code creates pairwise and multiple sequence alignments and a phylogenetic tree.
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install("drawProteins")
## Bioconductor version 3.13 (BiocManager 1.30.16), R 4.1.2 (2021-11-01)
## Warning: package(s) not installed when version(s) same as current; use `force = TRUE` to
## re-install: 'drawProteins'
## Old packages: 'ade4', 'aplot', 'backports', 'brio', 'broom', 'car', 'cli',
## 'conquer', 'corrplot', 'cpp11', 'crayon', 'credentials', 'crosstalk',
## 'data.table', 'desc', 'diffobj', 'digest', 'fs', 'generics', 'gert', 'glue',
## 'Hmisc', 'hms', 'htmlTable', 'knitr', 'lifecycle', 'maps', 'Matrix',
## 'matrixStats', 'memoise', 'mime', 'nloptr', 'openxlsx', 'pillar', 'pkgbuild',
## 'pkgload', 'plotly', 'rcmdcheck', 'RcppArmadillo', 'RCurl', 'readr',
## 'remotes', 'rio', 'rlang', 'rsconnect', 'S4Vectors', 'sessioninfo', 'sp',
## 'stringi', 'testthat', 'tibble', 'tidyr', 'tzdb', 'usethis', 'viridis',
## 'vroom', 'withr', 'xfun', 'XML', 'xml2', 'yulab.utils'
# installed.packages("BiocManager")
# install.packages("drawProteins")
# install.packages("HGNChelper")
library(BiocManager)
## Bioconductor version '3.13' is out-of-date; the current release version '3.14'
## is available with R version '4.1'; see https://bioconductor.org/install
library(drawProteins)
# github packages
library(compbio4all)
library(ggmsa)
## Registered S3 methods overwritten by 'ggalt':
## method from
## grid.draw.absoluteGrob ggplot2
## grobHeight.absoluteGrob ggplot2
## grobWidth.absoluteGrob ggplot2
## grobX.absoluteGrob ggplot2
## grobY.absoluteGrob ggplot2
# CRAN packages
library(rentrez)
library(seqinr)
library(ape)
##
## Attaching package: 'ape'
## The following objects are masked from 'package:seqinr':
##
## as.alignment, consensus
library(pander)
library(ggplot2)
library(msa)
## Loading required package: Biostrings
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind, colnames,
## dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
## union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
## Loading required package: stats4
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
## Loading required package: IRanges
## Loading required package: XVector
## Loading required package: GenomeInfoDb
##
## Attaching package: 'Biostrings'
## The following object is masked from 'package:ape':
##
## complement
## The following object is masked from 'package:seqinr':
##
## translate
## The following object is masked from 'package:base':
##
## strsplit
##
## Attaching package: 'msa'
## The following object is masked from 'package:BiocManager':
##
## version
library(drawProteins)
# Biostrings
library(Biostrings)
library(HGNChelper)
RefSeq, Refseq Homlogene, UniProt and PDB were used to obtain accession numbers.
A protein BLAST search was carried out excluding vertebrates to determine if it occurred outside of vertebrates. The gene does not appear in non-vertebrates and so a second search was conducted to exclude mammals.
Does not occur: Outside of vertebrates Drosophila Melanogaster Neanderthal
# RefSeq Uniprot PDB sci name common name gene name
egln1_table <- c("NP_071334", "Q9GZT9", "2Y34", "Homo sapiens", "human", "EGLN1",
"NP_444437.2", "Q91YE3", "NA", "Mus musculus", "house mouse", "EGLN1",
"NP_001002595.2", "F6P6J7", "NA", "Danio rerio", "zebrafish", "egln1",
"XP_525092.2", "NA", "NA", "Pan troglodytes", "chimpanzee", "EGLN1",
"XP_001104870.1", "A0A5F8AKN1", "NA", "Macaca mulatta", "rhesus monkey", "EGLN1",
"XP_546089.3", "NA", "NA", "Canis lupus familiaris", "dog", "EGLN1",
"XP_002728657.3", "NA", "NA", "Rattus norvegicus", "Norway rat", "EGLN1",
"XP_020929239.1", "NA", "NA", "Sus scrofa", "Wild boar", "EGLN1",
"XP_026268254.1", "NA", "NA", "Urocitellus parryii", "Arctic ground squirrel", "EGLN1",
"XP_032955515.1", "NA", "NA", "Rhinolophus ferrumequinum", "greater horseshoe bat", "EGLN1")
Convert vector information into a table
ncbi.protein.accession <- c("NP_071334", "NP_444437.2", "NP_001002595.2", "XP_525092.2", "XP_001104870.1", "XP_546089.3", "XP_002728657.3", "XP_020929239.1", "XP_026268254.1", "XP_032955515.1")
UniProt.id <- c("Q9GZT9", "Q91YE3", "F6P6J7", "NA", "A0A5F8AKN1", "NA", "NA", "NA", "NA", "NA")
PDB <- c("2Y34", "NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA")
species <- c("Homo sapiens", "Mus musculus", "Danio rerio", "Pan troglodytes", "Macaca mulatta", "Canis lupus familiaris", "Rattus norvegicus", "Sus scrofa", "Urocitellus parryii", "Rhinolophus ferrumequinum")
common.name <- c("human", "house mouse", "zebrafish", "chimpanzee", "rhesus monkey", "dog", "Norway rat", "Wild boar", "Arctic ground squirrel", "greater horseshoe bat")
gene.name <- c("EGLN1", "EGLN1", "egln1", "EGLN1", "EGLN1", "EGLN1", "EGLN1", "EGLN1", "EGLN1", "EGLN1")
egln1_table1 <- data.frame(ncbi.protein.accession = ncbi.protein.accession,
UniProt.id = UniProt.id,
PDB = PDB,
species = species,
common.name = common.name,
gene.name = gene.name)
egln1_table1
## ncbi.protein.accession UniProt.id PDB species
## 1 NP_071334 Q9GZT9 2Y34 Homo sapiens
## 2 NP_444437.2 Q91YE3 NA Mus musculus
## 3 NP_001002595.2 F6P6J7 NA Danio rerio
## 4 XP_525092.2 NA NA Pan troglodytes
## 5 XP_001104870.1 A0A5F8AKN1 NA Macaca mulatta
## 6 XP_546089.3 NA NA Canis lupus familiaris
## 7 XP_002728657.3 NA NA Rattus norvegicus
## 8 XP_020929239.1 NA NA Sus scrofa
## 9 XP_026268254.1 NA NA Urocitellus parryii
## 10 XP_032955515.1 NA NA Rhinolophus ferrumequinum
## common.name gene.name
## 1 human EGLN1
## 2 house mouse EGLN1
## 3 zebrafish egln1
## 4 chimpanzee EGLN1
## 5 rhesus monkey EGLN1
## 6 dog EGLN1
## 7 Norway rat EGLN1
## 8 Wild boar EGLN1
## 9 Arctic ground squirrel EGLN1
## 10 greater horseshoe bat EGLN1
All sequences were downloaded using a wrapper compbio4all::entrez_fetch_list() which uses rentrez::entrez_fetch() to access NCBI databases.
entrez_fetch_list <- function(db, id, rettype, ...){
#setup list for storing output
n.seq <- length(id)
list.output <- as.list(rep(NA, n.seq))
names(list.output) <- id
# get output
for(i in 1:length(id)){
list.output[[i]] <- rentrez::entrez_fetch(db = db,
id = id[i],
rettype = rettype)
}
return(list.output)
}
egln1_list <- entrez_fetch_list(db = "protein",
id = egln1_table1$ncbi.protein.accession,
rettype = "fasta")
Removing FASTA header
for(i in 1:length(egln1_list)){
egln1_list[[i]] <- compbio4all::fasta_cleaner(egln1_list[[i]], parse = F)
}
# Q9GZT9
Q9GZT9 <- drawProteins::get_features("Q9GZT9")
## [1] "Download has worked"
is(Q9GZT9)
## [1] "list" "vector" "list_OR_List" "vector_OR_Vector"
## [5] "vector_OR_factor"
my_prot_df1 <- drawProteins::feature_to_dataframe(Q9GZT9)
is(my_prot_df1)
## [1] "data.frame" "list" "oldClass" "vector"
## [5] "list_OR_List" "vector_OR_Vector" "vector_OR_factor"
# Q91YE3
Q91YE3 <- drawProteins::get_features("Q91YE3")
## [1] "Download has worked"
is(Q91YE3)
## [1] "list" "vector" "list_OR_List" "vector_OR_Vector"
## [5] "vector_OR_factor"
my_prot_df2 <- drawProteins::feature_to_dataframe(Q91YE3)
is(my_prot_df2)
## [1] "data.frame" "list" "oldClass" "vector"
## [5] "list_OR_List" "vector_OR_Vector" "vector_OR_factor"
# F6P6J7
F6P6J7 <- drawProteins::get_features("F6P6J7")
## [1] "Download has worked"
is(F6P6J7)
## [1] "list" "vector" "list_OR_List" "vector_OR_Vector"
## [5] "vector_OR_factor"
my_prot_df3 <- drawProteins::feature_to_dataframe(F6P6J7)
is(my_prot_df3)
## [1] "data.frame" "list" "oldClass" "vector"
## [5] "list_OR_List" "vector_OR_Vector" "vector_OR_factor"
# Q9GZT9
Q9GZT9 <- drawProteins::get_features("Q9GZT9")
## [1] "Download has worked"
is(Q9GZT9)
## [1] "list" "vector" "list_OR_List" "vector_OR_Vector"
## [5] "vector_OR_factor"
my_prot_df4 <- drawProteins::feature_to_dataframe(Q9GZT9)
is(my_prot_df4)
## [1] "data.frame" "list" "oldClass" "vector"
## [5] "list_OR_List" "vector_OR_Vector" "vector_OR_factor"
The information available on a protein on UniProt varies a lot depending on how much its been studied. drawProteins can extract information about the following things:
domains chains regions motifs phosphorylated sites repeats and others
Looking at a dataframe to see what is available to plot.
my_prot_df3[,-2]
## type begin end length accession entryName taxid order
## featuresTemp DOMAIN 18 55 37 F6P6J7 F6P6J7_DANRE 7955 1
## featuresTemp.1 DOMAIN 229 327 98 F6P6J7 F6P6J7_DANRE 7955 1
## featuresTemp.2 REGION 40 126 86 F6P6J7 F6P6J7_DANRE 7955 1
## featuresTemp.3 REGION 336 358 22 F6P6J7 F6P6J7_DANRE 7955 1
## featuresTemp.4 COMPBIAS 40 75 35 F6P6J7 F6P6J7_DANRE 7955 1
## featuresTemp.5 COMPBIAS 76 97 21 F6P6J7 F6P6J7_DANRE 7955 1
## featuresTemp.6 COMPBIAS 104 126 22 F6P6J7 F6P6J7_DANRE 7955 1
## 1 CHAIN 1 358 357 F6P6J7 F6P6J7_DANRE 7955 1
# Q9GZT9
# my_canvas1 <- draw_canvas(my_prot_df1)
# my_canvas1 <- draw_chains(my_canvas1, my_prot_df1,
# label_size = 2.5)
#m y_canvas1 <- draw_domains(my_canvas1, my_prot_df1)
# my_canvas1
# Q91YE3
# my_canvas2 <- draw_canvas(my_prot_df2)
# my_canvas2 <- draw_chains(my_canvas2, my_prot_df2,
# label_size = 2.5)
# my_canvas2 <- draw_domains(my_canvas2, my_prot_df2)
# my_canvas2
# F6P6J7
my_canvas3 <- draw_canvas(my_prot_df3)
my_canvas3 <- draw_chains(my_canvas3, my_prot_df3,
label_size = 2.5)
my_canvas3 <- draw_domains(my_canvas3, my_prot_df3)
my_canvas3
# Q9GZT9
# my_canvas4 <- draw_canvas(my_prot_df4)
# my_canvas4 <- draw_chains(my_canvas4, my_prot_df4,
# label_size = 2.5)
# my_canvas4 <- draw_domains(my_canvas4, my_prot_df4)
# my_canvas4
# sequence 3: NP_001002595.2
egln1_daniorerio_fasta <- rentrez::entrez_fetch(db ="protein",
id = "NP_001002595.2",
rettype = "fasta")
# converting to a vector
egln1_daniorerio_vector <- compbio4all::fasta_cleaner(egln1_daniorerio_fasta)
# set up 2 x 2 grid, make margins thinner
par(mfrow = c(2,2),
mar = c(0,0,2,1))
# plot 1: EGLN1
dotPlot(egln1_daniorerio_vector, egln1_daniorerio_vector,
wsize = 30,
nmatch = 5,
main = "EGLN1 30 / 5")
# plot 2 EGLN1 - size = 30, nmatch = 10
dotPlot(egln1_daniorerio_vector, egln1_daniorerio_vector,
wsize = 30,
nmatch = 10,
main = "EGLN1 - size = 30, nmatch = 10")
# plot 3: size = 5, nmatch = 2
dotPlot(egln1_daniorerio_vector, egln1_daniorerio_vector,
wsize = 5,
nmatch = 2,
main = "EGLN1 - size = 5, nmatch = 2")
# plot 4: size = 12, nmatch = 4
dotPlot(egln1_daniorerio_vector, egln1_daniorerio_vector,
wsize = 12,
nmatch = 4,
main = "EGLN1 - size = 12 , nmatch = 4")
# reset par()
par(mfrow = c(1,1),
mar = c(4,4,4,4))
dotPlot(egln1_daniorerio_vector, egln1_daniorerio_vector,
wsize = 30,
nmatch = 5,
main = "Danio rerio: EGLN1 30 / 5")
Below are links to relevant information.
Pfam; http://pfam.xfam.org/protein/F6P6J7; entire protein is indicated to be “F6P6J7_DANRE”. zf-MYND domain from 18 to 55; 2OG-FeII_Oxy_3 domain from 233 to 326. DisProt: no information available RepeatsDB: no information available UniProt sub-cellular locations: nucleus and cytoplasm PDB secondary structural location: no PDB or CATH entry available Alphafold; (https://alphafold.ebi.ac.uk/entry/F6P6J7). The predicted structure contains alpha helices, beta sheets, and disordered regions.
from Chou and Zhang data
# enter once
aa.1.1 <- c("A","R","N","D","C","Q","E","G","H","I",
"L","K","M","F","P","S","T","W","Y","V")
# enter again
aa.1.2 <- c("A","R","N","D","C","Q","E","G","H","I",
"L","K","M","F","P","S","T","W","Y","V")
# checking accuracy
length(aa.1.1) == length(aa.1.2)
## [1] TRUE
length(unique(aa.1.1)) == length(unique(aa.1.2))
## [1] TRUE
# alpha proteins
alpha <- c(285, 53, 97, 163, 22, 67, 134, 197, 111, 91,
221, 249, 48, 123, 82, 122, 119, 33, 63, 167)
# check against chou's total
sum(alpha) == 2447
## [1] TRUE
# beta proteins
beta <- c(203, 67, 139, 121, 75, 122, 86, 297, 49, 120,
177, 115, 16, 85, 127, 341, 253, 44, 110, 229)
# check against chou's total
sum(beta) == 2776
## [1] TRUE
# alpha + beta
a.plus.b <- c(175, 78, 120, 111, 74, 74, 86, 171, 33, 93,
110, 112, 25, 52, 71, 126, 117, 30, 108, 123)
sum(a.plus.b) == 1889
## [1] TRUE
# alpha/beta
a.div.b <- c(361, 146, 183, 244, 63, 114, 257, 377, 107, 239,
339, 321, 91, 158, 188, 327, 238, 72, 130, 378)
sum(a.div.b) == 4333
## [1] TRUE
data.frame(aa.1.1, alpha, beta, a.plus.b, a.div.b)
## aa.1.1 alpha beta a.plus.b a.div.b
## 1 A 285 203 175 361
## 2 R 53 67 78 146
## 3 N 97 139 120 183
## 4 D 163 121 111 244
## 5 C 22 75 74 63
## 6 Q 67 122 74 114
## 7 E 134 86 86 257
## 8 G 197 297 171 377
## 9 H 111 49 33 107
## 10 I 91 120 93 239
## 11 L 221 177 110 339
## 12 K 249 115 112 321
## 13 M 48 16 25 91
## 14 F 123 85 52 158
## 15 P 82 127 71 188
## 16 S 122 341 126 327
## 17 T 119 253 117 238
## 18 W 33 44 30 72
## 19 Y 63 110 108 130
## 20 V 167 229 123 378
alpha.prop <- alpha/sum(alpha)
beta.prop <- beta/sum(beta)
a.plus.b.prop <- a.plus.b/sum(a.plus.b)
a.div.b <- a.div.b/sum(a.div.b)
#dataframe
aa.prop <- data.frame(alpha.prop,
beta.prop,
a.plus.b.prop,
a.div.b)
#row labels
row.names(aa.prop) <- aa.1.1
aa_pander <- pander::pander(aa.prop)
Adding my protein data
# frequency of my protein
egln1.freq.table <- table(egln1_daniorerio_vector)/length(egln1_daniorerio_vector)
# converting table into a vector
table_to_vector <- function(table_x){
table_names <- attr(table_x, "dimnames")[[1]]
table_vect <- as.vector(table_x)
names(table_vect) <- table_names
return(table_vect)
}
egln1.daniorerio.aa.freq <- table_to_vector(egln1.freq.table)
aa.names <- names(egln1.daniorerio.aa.freq)
any(aa.names == "U")
## [1] FALSE
aa.prop$egln1.daniorerio.aa.freq <- egln1.daniorerio.aa.freq
pander::pander(aa.prop)
| alpha.prop | beta.prop | a.plus.b.prop | a.div.b | |
|---|---|---|---|---|
| A | 0.1165 | 0.07313 | 0.09264 | 0.08331 |
| R | 0.02166 | 0.02414 | 0.04129 | 0.03369 |
| N | 0.03964 | 0.05007 | 0.06353 | 0.04223 |
| D | 0.06661 | 0.04359 | 0.05876 | 0.05631 |
| C | 0.008991 | 0.02702 | 0.03917 | 0.01454 |
| Q | 0.02738 | 0.04395 | 0.03917 | 0.02631 |
| E | 0.05476 | 0.03098 | 0.04553 | 0.05931 |
| G | 0.08051 | 0.107 | 0.09052 | 0.08701 |
| H | 0.04536 | 0.01765 | 0.01747 | 0.02469 |
| I | 0.03719 | 0.04323 | 0.04923 | 0.05516 |
| L | 0.09031 | 0.06376 | 0.05823 | 0.07824 |
| K | 0.1018 | 0.04143 | 0.05929 | 0.07408 |
| M | 0.01962 | 0.005764 | 0.01323 | 0.021 |
| F | 0.05027 | 0.03062 | 0.02753 | 0.03646 |
| P | 0.03351 | 0.04575 | 0.03759 | 0.04339 |
| S | 0.04986 | 0.1228 | 0.0667 | 0.07547 |
| T | 0.04863 | 0.09114 | 0.06194 | 0.05493 |
| W | 0.01349 | 0.01585 | 0.01588 | 0.01662 |
| Y | 0.02575 | 0.03963 | 0.05717 | 0.03 |
| V | 0.06825 | 0.08249 | 0.06511 | 0.08724 |
| egln1.daniorerio.aa.freq | |
|---|---|
| A | 0.04469 |
| R | 0.03631 |
| N | 0.06983 |
| D | 0.08101 |
| C | 0.03073 |
| Q | 0.07821 |
| E | 0.01955 |
| G | 0.03631 |
| H | 0.07263 |
| I | 0.05587 |
| L | 0.02235 |
| K | 0.04469 |
| M | 0.05028 |
| F | 0.04469 |
| P | 0.07821 |
| S | 0.0838 |
| T | 0.05587 |
| W | 0.04749 |
| Y | 0.01397 |
| V | 0.03352 |
# Corrleation used in Chou adn Zhange 1992.
chou_cor <- function(x,y){
numerator <- sum(x*y)
denominator <- sqrt((sum(x^2))*(sum(y^2)))
result <- numerator/denominator
return(result)
}
# Cosine similarity used in Higgs and Attwood (2005).
chou_cosine <- function(z.1, z.2){
z.1.abs <- sqrt(sum(z.1^2))
z.2.abs <- sqrt(sum(z.2^2))
my.cosine <- sum(z.1*z.2)/(z.1.abs*z.2.abs)
return(my.cosine)
}
Data exploration
par(mfrow = c(2,2), mar = c(1,4,1,1))
plot(alpha.prop ~ egln1.daniorerio.aa.freq, data = aa.prop)
plot(beta.prop ~ egln1.daniorerio.aa.freq, data = aa.prop)
plot(a.plus.b.prop ~ egln1.daniorerio.aa.freq, data = aa.prop)
plot(a.div.b ~ egln1.daniorerio.aa.freq, data = aa.prop)
Calculating correlation between each column
par(mfrow = c(1,1), mar = c(1,1,1,1))
corr.alpha <- chou_cor(aa.prop[,5], aa.prop[,1])
corr.beta <- chou_cor(aa.prop[,5], aa.prop[,2])
corr.apb <- chou_cor(aa.prop[,5], aa.prop[,3])
corr.adb <- chou_cor(aa.prop[,5], aa.prop[,4])
Calculating cosine similarity
cos.alpha <- chou_cosine(aa.prop[,5], aa.prop[,1])
cos.beta <- chou_cosine(aa.prop[,5], aa.prop[,2])
cos.apb <- chou_cosine(aa.prop[,5], aa.prop[,3])
cos.adb <- chou_cosine(aa.prop[,5], aa.prop[,4])
Calculating distance
aa.prop.flipped <- t(aa.prop)
round(aa.prop.flipped,2)
## A R N D C Q E G H I L
## alpha.prop 0.12 0.02 0.04 0.07 0.01 0.03 0.05 0.08 0.05 0.04 0.09
## beta.prop 0.07 0.02 0.05 0.04 0.03 0.04 0.03 0.11 0.02 0.04 0.06
## a.plus.b.prop 0.09 0.04 0.06 0.06 0.04 0.04 0.05 0.09 0.02 0.05 0.06
## a.div.b 0.08 0.03 0.04 0.06 0.01 0.03 0.06 0.09 0.02 0.06 0.08
## egln1.daniorerio.aa.freq 0.04 0.04 0.07 0.08 0.03 0.08 0.02 0.04 0.07 0.06 0.02
## K M F P S T W Y V
## alpha.prop 0.10 0.02 0.05 0.03 0.05 0.05 0.01 0.03 0.07
## beta.prop 0.04 0.01 0.03 0.05 0.12 0.09 0.02 0.04 0.08
## a.plus.b.prop 0.06 0.01 0.03 0.04 0.07 0.06 0.02 0.06 0.07
## a.div.b 0.07 0.02 0.04 0.04 0.08 0.05 0.02 0.03 0.09
## egln1.daniorerio.aa.freq 0.04 0.05 0.04 0.08 0.08 0.06 0.05 0.01 0.03
Generating a distance matrix
dist(aa.prop.flipped, method = "euclidean")
## alpha.prop beta.prop a.plus.b.prop a.div.b
## beta.prop 0.13342098
## a.plus.b.prop 0.09281824 0.08289406
## a.div.b 0.06699039 0.08659174 0.06175113
## egln1.daniorerio.aa.freq 0.16845549 0.15500920 0.14200299 0.15021813
Individal distances
dist.alpha <- dist((aa.prop.flipped[c(1,5),]), method = "euclidean")
dist.beta <- dist((aa.prop.flipped[c(2,5),]), method = "euclidean")
dist.apb <- dist((aa.prop.flipped[c(3,5),]), method = "euclidean")
dist.adb <- dist((aa.prop.flipped[c(4,5),]), method = "euclidean")
Compiling the information
# fold types
fold.type <- c("alpha","beta","alpha plus beta", "alpha/beta")
# data
corr.sim <- round(c(corr.alpha,corr.beta,corr.apb,corr.adb),5)
cosine.sim <- round(c(cos.alpha,cos.beta,cos.apb,cos.adb),5)
Euclidean.dist <- round(c(dist.alpha,dist.beta,dist.apb,dist.adb),5)
# summary
sim.sum <- c("","","most.sim","")
dist.sum <- c("","","min.dist","")
df <- data.frame(fold.type,
corr.sim ,
cosine.sim ,
Euclidean.dist ,
sim.sum ,
dist.sum )
# display output
pander::pander(df)
| fold.type | corr.sim | cosine.sim | Euclidean.dist | sim.sum | dist.sum |
|---|---|---|---|---|---|
| alpha | 0.7754 | 0.7754 | 0.1685 | ||
| beta | 0.8135 | 0.8135 | 0.155 | ||
| alpha plus beta | 0.829 | 0.829 | 0.142 | most.sim | min.dist |
| alpha/beta | 0.8125 | 0.8125 | 0.1502 |
Converting all FASTA records into entries in a single vector.
names(egln1_list)
## [1] "NP_071334" "NP_444437.2" "NP_001002595.2" "XP_525092.2"
## [5] "XP_001104870.1" "XP_546089.3" "XP_002728657.3" "XP_020929239.1"
## [9] "XP_026268254.1" "XP_032955515.1"
length(egln1_list)
## [1] 10
egln1_list[1]
## $NP_071334
## [1] "MANDSGGPGGPSPSERDRQYCELCGKMENLLRCSRCRSSFYCCKEHQRQDWKKHKLVCQGSEGALGHGVGPHQHSGPAPPAAVPPPRAGAREPRKAAARRDNASGDAAKGKVKAKPPADPAAAASPCRAAAGGQGSAVAAEAEPGKEEPPARSSLFQEKANLYPPSNTPGDALSPGGGLRPNGQTKPLPALKLALEYIVPCMNKHGICVVDDFLGKETGQQIGDEVRALHDTGKFTDGQLVSQKSDSSKDIRGDKITWIEGKEPGCETIGLLMSSMDDLIRHCNGKLGSYKINGRTKAMVACYPGNGTGYVRHVDNPNGDGRCVTCIYYLNKDWDAKVSGGILRIFPEGKAQFADIEPKFDRLLFFWSDRRNPHEVQPAYATRYAITVWYFDADERARAKVKYLTGEKGVRVELNKPSDSVGKDVF"
Making each entry of the list into a vector.
# naming the vector
egln_vector <- rep(NA, length(egln1_list))
# the loop function
for(i in 1:length(egln_vector)){
egln_vector[i] <- egln1_list[[i]]
}
names(egln_vector) <- names(egln1_list)
egln1_human_vector <- egln_vector[1]
egln1_mouse_vector <- egln_vector[2]
egln1_zfish_vector <- egln_vector[3]
egln1_chimp_vector <- egln_vector[4]
Doing pairwise alignments for humans, chimps, mice, and zebrafish
data(BLOSUM50)
globalAlignh.v.c <- pairwiseAlignment(egln1_human_vector, egln1_chimp_vector,
substitutionMatrix = "BLOSUM50",
gapOpening = -2,
gapExtension = -8,
scoreOnly = FALSE)
globalAlignh.v.m <- pairwiseAlignment(egln1_human_vector, egln1_mouse_vector,
substitutionMatrix = "BLOSUM50",
gapOpening = -2,
gapExtension = -8,
scoreOnly = FALSE)
globalAlignh.v.z <- pairwiseAlignment(egln1_human_vector, egln1_zfish_vector,
substitutionMatrix = "BLOSUM50",
gapOpening = -2,
gapExtension = -8,
scoreOnly = FALSE)
globalAlignsc.v.m <- pairwiseAlignment(egln1_chimp_vector, egln1_mouse_vector,
substitutionMatrix = "BLOSUM50",
gapOpening = -2,
gapExtension = -8,
scoreOnly = FALSE)
globalAlignc.v.z <- pairwiseAlignment(egln1_chimp_vector, egln1_zfish_vector,
substitutionMatrix = "BLOSUM50",
gapOpening = -2,
gapExtension = -8,
scoreOnly = FALSE)
globalAlignm.v.z <- pairwiseAlignment(egln1_mouse_vector, egln1_zfish_vector,
substitutionMatrix = "BLOSUM50",
gapOpening = -2,
gapExtension = -8,
scoreOnly = FALSE)
Biostrings::pid(globalAlignh.v.c)
## [1] 77.2093
Biostrings::pid(globalAlignh.v.m)
## [1] 79.76471
Biostrings::pid(globalAlignh.v.z)
## [1] 61.70213
Biostrings::pid(globalAlignsc.v.m)
## [1] 68.72038
Biostrings::pid(globalAlignc.v.z)
## [1] 55.47619
Biostrings::pid(globalAlignm.v.z)
## [1] 64.67662
Building a matrix
pids <- c(1, NA, NA, NA,
pid(globalAlignh.v.c), 1, NA, NA,
pid(globalAlignh.v.m), pid(globalAlignsc.v.m), 1, NA,
pid(globalAlignh.v.z), pid(globalAlignc.v.z), pid(globalAlignm.v.z), 1)
mat <- matrix(pids, nrow = 4, byrow = T)
row.names(mat) <- c("Homo","Chimp","Mouse","Fish")
colnames(mat) <- c("Homo","Chimp","Mouse","Fish")
pander::pander(mat)
| Homo | Chimp | Mouse | Fish | |
|---|---|---|---|---|
| Homo | 1 | NA | NA | NA |
| Chimp | 77.21 | 1 | NA | NA |
| Mouse | 79.76 | 68.72 | 1 | NA |
| Fish | 61.7 | 55.48 | 64.68 | 1 |
Comparing different PID methods (humans vs. chimps)
PID <- c(pid(globalAlignh.v.c, type = "PID1"), pid(globalAlignh.v.c, type = "PID2"), pid(globalAlignh.v.c, type = "PID3"), pid(globalAlignh.v.c, type = "PID4"))
method <- c("PID1", "PID2", "PID3", "PID4")
denominator <- c("(aligned positions + internal gap positions)", "(aligned positions)", "(length shorter sequence)", "(average length of the two sequences)")
comparePID_table <- data.frame(method = method,
PID = PID,
denominator = denominator)
comparePID_table
## method PID denominator
## 1 PID1 77.20930 (aligned positions + internal gap positions)
## 2 PID2 79.42584 (aligned positions)
## 3 PID3 78.67299 (length shorter sequence)
## 4 PID4 78.30189 (average length of the two sequences)
Converting our named vector to a string set
egln_vector_ss <- Biostrings::AAStringSet(egln_vector)
egln_vector_ss
## AAStringSet object of length 10:
## width seq names
## [1] 426 MANDSGGPGGPSPSERDRQYCEL...TGEKGVRVELNKPSDSVGKDVF NP_071334
## [2] 400 MASDSGGPGVLSASERDRQYCEL...KYLTGEKGVRVELKPNSVSKDV NP_444437.2
## [3] 358 MEGNSRESERLERERQFCELCGK...KYSTSSGERGVKVELNKPSEPS NP_001002595.2
## [4] 422 MYKEKHIAHVCIWQNPQYRDKQD...TGEKGVRVELNKPSDSVSKDVF XP_525092.2
## [5] 426 MANDSGGPGGPSPSERDRQYCEL...TGEKGVRVELNKPSDSVGKDVF XP_001104870.1
## [6] 386 MVFQGGEGAPGSGAAQQWHPVPV...TGEKGVRVELNKPSDSISKDVL XP_546089.3
## [7] 388 MNKHGICVVDDFLGRETGQQIGD...KYLTGEKGVRVELKPNSVSKDV XP_002728657.3
## [8] 427 MANDSGGPGGPSPSERDRQYCEL...TGEKGVRVELNKPSDSVSKDVL XP_020929239.1
## [9] 413 MASDSGGPGGPSASERDRQYCEL...LTGEKGVRVELNKPSDPVGKDV XP_026268254.1
## [10] 427 MANDSGGPGGPSPSERDRQYCEL...TGEKGVRVELNKPSDSISKDVL XP_032955515.1
This chunk performs the multiple sequence alignment on the dataset and puts it on the object “shrooms_align”
egln1_align <- msa (egln_vector_ss,
method = "ClustalW")
## use default substitution matrix
egln1_align
## CLUSTAL 2.1
##
## Call:
## msa(egln_vector_ss, method = "ClustalW")
##
## MsaAAMultipleAlignment with 10 rows and 439 columns
## aln names
## [1] -MASDSGGPGVLSASERDRQYCELC...LT--GEKGVRVELKPNS--VSKDV- NP_444437.2
## [2] ---MNKHGICVVDDFLGRETGQQIG...LT--GEKGVRVELKPNS--VSKDV- XP_002728657.3
## [3] -MASDSGGPGGPSASERDRQYCELC...LT--GEKGVRVELNKPSDPVGKDV- XP_026268254.1
## [4] -MANDSGGPGGPSPSERDRQYCELC...LT--GEKGVRVELNKPSDSVGKDVF NP_071334
## [5] -MANDSGGPGGPSPSERDRQYCELC...LT--GEKGVRVELNKPSDSVGKDVF XP_001104870.1
## [6] MYKEKHIAHVCIWQNPQYRDKQDRN...LT--GEKGVRVELNKPSDSVSKDVF XP_525092.2
## [7] ----------MVFQGG---------...LT--GEKGVRVELNKPSDSISKDVL XP_546089.3
## [8] -MANDSGGPGGPSPSERDRQYCELC...LT--GEKGVRVELNKPSDSVSKDVL XP_020929239.1
## [9] -MANDSGGPGGPSPSERDRQYCELC...LT--GEKGVRVELNKPSDSISKDVL XP_032955515.1
## [10] ----MEGNSRESERLERERQFCELC...STSSGERGVKVELNKPSEPS----- NP_001002595.2
## Con -MA?DSGGPGGPS?SERDRQYCELC...LT--GEKGVRVELNKPSDSVSKDV? Consensus
msa produces a species MSA objects
class(egln1_align)
## [1] "MsaAAMultipleAlignment"
## attr(,"package")
## [1] "msa"
is(egln1_align)
## [1] "MsaAAMultipleAlignment" "AAMultipleAlignment" "MsaMetaData"
## [4] "MultipleAlignment"
Changing class of alignment, converting to seqinr format, and showing output with print_msa
class(egln1_align) <- "AAMultipleAlignment"
egln1_align_seqinr <- msaConvert(egln1_align, type = "seqinr::alignment")
print_msa(alignment = egln1_align_seqinr,
chunksize = 60)
## [1] "-MASDSGGPGVLSASERDRQYCELCGKMENLLRCGRCRS-SFYCCKEHQRQDWKKHKLVC 0"
## [1] "---MNKHGICVVDDFLGRETGQQIGDEVRALHDTGKFTDGQLVSQKSDSSKDIRGDKITW 0"
## [1] "-MASDSGGPGGPSASERDRQYCELCGKMENLLRCGRCRS-SFYCCKEHQRQDWKKHKLVC 0"
## [1] "-MANDSGGPGGPSPSERDRQYCELCGKMENLLRCSRCRS-SFYCCKEHQRQDWKKHKLVC 0"
## [1] "-MANDSGGPGGPSPSERDRQYCELCGKMENLLRCSRCRS-SFYCCKEHQRQDWKKHKLVC 0"
## [1] "MYKEKHIAHVCIWQNPQYRDKQDRNSHLATLTGASLAHP-APAWRKSSHDYAAFAPLPAC 0"
## [1] "----------MVFQGG---------------EGAPGSGA-AQQWH----------PVPVL 0"
## [1] "-MANDSGGPGGPSPSERDRQYCELCGKMENLLRCGRCRS-SFYCSKEHQRQDWKKHKLVC 0"
## [1] "-MANDSGGPGGPSPSERDRQYCELCGKMENLLRCSRCRS-SFYCSKEHQRQDWKKHKLVC 0"
## [1] "----MEGNSRESERLERERQFCELCGKMENLMKCGRCRS-SFYCSKEHQRQDWKKHKRVC 0"
## [1] " "
## [1] "QGGE---------------APRAQPAPAQPRVAPPPGGAPGAARAGGAARRGDS-----A 0"
## [1] "IEGK---------------EPGWRPS------------APGAARAGGAARRGDSS----T 0"
## [1] "QGGDPAGPAAA-----PRAQPGPAPPAAAPPTRAGAAAGNAGARAGKAAGQRQGS----A 0"
## [1] "QGSEGALGHGVGPHQHSGPAPPAAVPPPRAGAREPRKAAARRDNASGDAAKGKVKAKPPA 0"
## [1] "QGSESALGPGVGPHQHSGPAPPVAAPPPRAGAREPRKAAARRDNASGDAAKAKAKGKPPA 0"
## [1] "GWRPPPAGVGAAALP---PRLFRRFAGPCALGVPRRRPAAGPKAPGKRPGKSTVK--PPA 0"
## [1] "AASAPPPGEAREARE---PREARKAAP----RRDSAAESAGPRAAG-DAAKAKAK--PAA 0"
## [1] "QGGEGAPAPGAGQQPQPGPAPLLRKAVPRRDRVAAVEAAGPRAAGDAAKAKAKAKP--AA 0"
## [1] "QRGDGAPGPGASQHPHPGPAPPAAAPPPRATGAKEARTAAPRRDSAAAGDAADAK----A 0"
## [1] "KEAD-----------------------------------KQQQPVGEEEEEQPSD----T 0"
## [1] " "
## [1] "AASRVPGPEDAAQARSGPGPAEPGS--------EDPPLSRSPGP----ERASLCPAGGGP 0"
## [1] "AASRVPGPEDATQAGSGPGPAEPSS--------EDPPPSRSPGP----ERASLCPAGGGP 0"
## [1] "EAARAPAPGDAAQARGPRGAAECGQ--------EEPPARRSPCP----ERASLCPAGSSP 0"
## [1] "DPAAAASPCRAAAGGQGSAVAAEAE-----PGKEEPPARSSLFQ----EKANLYPPSNTP 0"
## [1] "DPAAAASPSRAAPGGQGSAVAAEAE-----PGKEEPPARSSLFQ----EKANLYPPSNTP 0"
## [1] "DPAAAASPSRAAAGGQGSAVAAEAE-----PGKEEPPARSSLFQ----EKANLYPPSNTP 0"
## [1] "DPAAAASPPRAAAGGQGPAAAAAAA-----AAAACAAAEAEPAKEEDPEKTNLYPPSGTP 0"
## [1] "DPAAAASPPRAAPGGAGPAAAAAAA--EAEPAKEEPPARSSLYQ----EKANLYPPSNTP 0"
## [1] "KPAAAASPPRAAPGGQGPAAAAATATAEAEPAKEEPLGRSSLFQ----DKANLYPPGNTP 0"
## [1] "SKQSSTSQSNSTLTSQD-------------------------------EKMPDFIKSATG 0"
## [1] " "
## [1] "GEALSPGGGLRPNGQTKPLPALKLALEYIVPCMNKHGICVVDDFLGRETGQQIGDEVRAL 0"
## [1] "GEALSPSGGLRPNGQTKPLPALKLALEYIVPCMNKHGICVVDDFLGRETGQQIGDEVRAL 0"
## [1] "GEALSPGGGLRPNGQTKPLPALKLALEYIVPCMNKHGICVVDDFLGRETGQQIGDEVRAL 0"
## [1] "GDALSPGGGLRPNGQTKPLPALKLALEYIVPCMNKHGICVVDDFLGKETGQQIGDEVRAL 0"
## [1] "GDALSPSGGLRPNGQTKPLPALKLALEYIVPCMNKHGICVVDDFLGKETGQQIGDEVRAL 0"
## [1] "GDALSPSGGLRPNGQTKPLPALKLALEYIVPCMNKHGICVVDDFLGKETGQQIGDEVRAL 0"
## [1] "GEGLSPGGGLRPNGQTKPLPALKLALEYIVPCMNKHGICVVDDFLGKETGQQIGDEVRAL 0"
## [1] "GEGLSHGGGLRPNGQTKPLPALKLALEYIVPCMNKHGICVVDDFLGKETGQQIGDEVRAL 0"
## [1] "GEALSHGGGPRPNGQTKPLPALKLALEYIVPCMNKHGICVVDDFLGKETGQQIGEEVRAL 0"
## [1] "SDTKPTPDSSKPNGQTR-SPPQKLATDYIVPCMNKHGICVVDNFLGNEVGRSILEDVRAL 0"
## [1] " "
## [1] "HDTGKFTDGQLVSQKSDSSKDIRGDQITWIEGKEPGCETIGLLMSSMDDLIRHCSGKLGN 0"
## [1] "HDTGKFTDGQLVSQKSDSSKDIRGDKITWIEGKEPGCETIGLLMSSMDDLIRHCSGKLGN 0"
## [1] "HDTGKFTDGQLVSQKSDSSKDIRGDKITWIEGKEPGCETIGLLMSSMDDLIRHCNGKLGN 0"
## [1] "HDTGKFTDGQLVSQKSDSSKDIRGDKITWIEGKEPGCETIGLLMSSMDDLIRHCNGKLGS 0"
## [1] "HDTGKFTDGQLVSQKSDSSKDIRGDKITWIEGKEPGCETIGLLMSSMDDLIRHCNGKLGS 0"
## [1] "HDTGKFTDGQLVSQKSDSSKDIRGDKITWIEGKEPGCETIGLLMSSMDDLIRHCNGKLGS 0"
## [1] "HDTGKFTDGQLVSQKSDSSKDIRGDKITWIEGKEPGCETIGLLMSSMDDLIRHCNGKLGN 0"
## [1] "HDTGKFTDGQLVSQKSDSSKDIRGDKITWIEGKEPGCETIGLLMSSMDDLIRHCNGKLGN 0"
## [1] "HDTGKFTDGQLVSQKSDSSKDIRGDKITWIEGKEPGCETIGLLMSSMDDLIRHCNGKLGN 0"
## [1] "YLTGGFTDGQLVSQRSDSSKDIRGDKITWVEGKEPGCERIAFLMSRMDDLIRHCNGKLGN 0"
## [1] " "
## [1] "YRINGRTKAMVACYPGNGTGYVRHVDNPNGDGRCVTCIYYLNKDWDAKVSGGILRIFPEG 0"
## [1] "YRINGRTKAMVACYPGNGTGYVRHVDNPNGDGRCVTCIYYLNKDWDAKVSGGILRIFPEG 0"
## [1] "YKINGRTKAMVACYPGNGTGYVRHVDNPNGDGRCVTCIYYLNKDWDAKVSGGILRIFPEG 0"
## [1] "YKINGRTKAMVACYPGNGTGYVRHVDNPNGDGRCVTCIYYLNKDWDAKVSGGILRIFPEG 0"
## [1] "YKINGRTKAMVACYPGNGTGYVRHVDNPNGDGRCVTCIYYLNKDWDAKVSGGILRIFPEG 0"
## [1] "YKINGRTKAMVACYPGNGTGYVRHVDNPNGDGRCVTCIYYLNKDWDAKVSGGILRIFPEG 0"
## [1] "YKINGRTKAMVACYPGNGTGYVRHVDNPNGDGRCVTCIYYLNKDWDAKVSGGILRIFPEG 0"
## [1] "YKINGRTKAMVACYPGNGTGYVRHVDNPNGDGRCVTCIYYLNKDWDAKVSGGILRIFPEG 0"
## [1] "YKINGRTKAMVACYPGNGTGYVRHVDNPNGDGRCVTCIYYLNKDWDAKVSGGILRIFPEG 0"
## [1] "YRINGRTKAMVACYPGNGTGYVRHVDNPNGDGRCVTCIYYLNKDWDATEHGGLLRIFPEG 0"
## [1] " "
## [1] "KAQFADIEPKFDRLLFFWSDRRNPHEVQPAYATRYAITVWYFDADERARAKVKYLT--GE 0"
## [1] "KAQFADIEPKFDRLLFFWSDRRNPHEVQPAYATRYAITVWYFDADERARAKVKYLT--GE 0"
## [1] "KAQFADIEPKFDRLLFFWSDRRNPHEVQPAYATRYAITVWYFDADERARAKVKYLT--GE 0"
## [1] "KAQFADIEPKFDRLLFFWSDRRNPHEVQPAYATRYAITVWYFDADERARAKVKYLT--GE 0"
## [1] "KAQFADIEPKFDRLLFFWSDRRNPHEVQPAYATRYAITVWYFDADERARAKVKYLT--GE 0"
## [1] "KAQFADIEPKFDRLLFFWSDRRNPHEVQPAYATRYAITVWYFDADERARAKVKYLT--GE 0"
## [1] "KAQFADIEPKFDRLLFFWSDRRNPHEVQPAYSTRYAITVWYFDADERARAKVKYLT--GE 0"
## [1] "KAQFADIEPKFDRLLFFWSDRRNPHEVQPAYATRYAITVWYFDADERARAKVKYLT--GE 0"
## [1] "KAQFADIEPKFDRLLFFWSDRRNPHEVQPAYATRYAITVWYFDADERARAKVKYLT--GE 0"
## [1] "TAQFADIEPKFDRLLLFWSDRRNPHEVQPAYATRYAITVWYFDADERARAKEKYSTSSGE 0"
## [1] " "
## [1] "KGVRVELKPNS--VSKDV- 41"
## [1] "KGVRVELKPNS--VSKDV- 41"
## [1] "KGVRVELNKPSDPVGKDV- 41"
## [1] "KGVRVELNKPSDSVGKDVF 41"
## [1] "KGVRVELNKPSDSVGKDVF 41"
## [1] "KGVRVELNKPSDSVSKDVF 41"
## [1] "KGVRVELNKPSDSISKDVL 41"
## [1] "KGVRVELNKPSDSVSKDVL 41"
## [1] "KGVRVELNKPSDSISKDVL 41"
## [1] "RGVKVELNKPSEPS----- 41"
## [1] " "
ggmsa(egln1_align, start = 15, end = 65)
Make a distance matrix
egln1_dist <- seqinr::dist.alignment(egln1_align_seqinr,
matrix = "identity")
is(egln1_dist)
## [1] "dist" "oldClass"
class(egln1_dist)
## [1] "dist"
egln1_align_seqinr_rnd <- round(egln1_dist, 3)
egln1_align_seqinr_rnd
## NP_444437.2 XP_002728657.3 XP_026268254.1 NP_071334
## XP_002728657.3 0.407
## XP_026268254.1 0.361 0.495
## NP_071334 0.433 0.552 0.432
## XP_001104870.1 0.442 0.552 0.435 0.153
## XP_525092.2 0.574 0.555 0.569 0.480
## XP_546089.3 0.518 0.513 0.516 0.472
## XP_020929239.1 0.436 0.545 0.429 0.357
## XP_032955515.1 0.447 0.557 0.443 0.358
## NP_001002595.2 0.563 0.636 0.556 0.553
## XP_001104870.1 XP_525092.2 XP_546089.3 XP_020929239.1
## XP_002728657.3
## XP_026268254.1
## NP_071334
## XP_001104870.1
## XP_525092.2 0.480
## XP_546089.3 0.469 0.449
## XP_020929239.1 0.350 0.512 0.452
## XP_032955515.1 0.351 0.522 0.470 0.318
## NP_001002595.2 0.551 0.629 0.593 0.551
## XP_032955515.1
## XP_002728657.3
## XP_026268254.1
## NP_071334
## XP_001104870.1
## XP_525092.2
## XP_546089.3
## XP_020929239.1
## XP_032955515.1
## NP_001002595.2 0.551
Building a phylogenetic tree from the distance matrix
egln1_tree <- nj(egln1_dist)
# plot tree
plot.phylo(egln1_tree, main="Phylogenetic Tree",
use.edge.length = F)
# add label
mtext(text = "EGLN1 family gene tree - rooted, no branch lengths")