This code compiles summary information about the gene CREBRF.
It also generates alignments and a phylogeneitc tree to indicating the evolutionary relationship betweeen the human version of the gene and its homologs in other species.
Key information use to make this script can be found here:
Refseq Gene:https://www.ncbi.nlm.nih.gov/gene/153222/
Refseq Homologene: https://www.ncbi.nlm.nih.gov/homologene/12672
Uniprot gene: https://www.uniprot.org/uniprot/Q8IUR6
Other resources consulted includes
Neanderthal genome: http://neandertal.ensemblgenomes.org/index.html
Other interesting resources and online tools include:
REPPER: https://toolkit.tuebingen.mpg.de/jobs/7803820 Sub-cellular locations prediction: https://wolfpsort.hgc.jp/
###Preparation Load necessary packages:
Download and load drawProteins from Bioconductor
#install.packages("BiocManager")
library(BiocManager)
## Bioconductor version '3.13' is out-of-date; the current release version '3.14'
## is available with R version '4.1'; see https://bioconductor.org/install
# github packages
#install.packages("compbio4all")
#install.packages("ggmsa")
#install.packages("msa")
library(compbio4all)
library(ggmsa)
## Registered S3 methods overwritten by 'ggalt':
## method from
## grid.draw.absoluteGrob ggplot2
## grobHeight.absoluteGrob ggplot2
## grobWidth.absoluteGrob ggplot2
## grobX.absoluteGrob ggplot2
## grobY.absoluteGrob ggplot2
library(msa)
## Loading required package: Biostrings
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind, colnames,
## dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
## union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
## Loading required package: stats4
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
## Loading required package: IRanges
##
## Attaching package: 'IRanges'
## The following object is masked from 'package:grDevices':
##
## windows
## Loading required package: XVector
## Loading required package: GenomeInfoDb
##
## Attaching package: 'Biostrings'
## The following object is masked from 'package:base':
##
## strsplit
##
## Attaching package: 'msa'
## The following object is masked from 'package:BiocManager':
##
## version
# CRAN packages
# install.packages("rentrez",dependencies = TRUE)
# install.packages("pander")
# install.packages("ape")
# install.packages("seqinr")
library(rentrez)
library(seqinr)
##
## Attaching package: 'seqinr'
## The following object is masked from 'package:Biostrings':
##
## translate
library(ape)
##
## Attaching package: 'ape'
## The following objects are masked from 'package:seqinr':
##
## as.alignment, consensus
## The following object is masked from 'package:Biostrings':
##
## complement
library(pander)
library(ggplot2)
# Bioconductor packages
# BiocManager::install("msa")
#BiocManager::install("drawProteins")
#library(msa)
library(drawProteins)
## Biostrings
#install.packages("Biostrings")
library(Biostrings)
#install.packages("HGNChelper")
library(HGNChelper)
## Warning: package 'HGNChelper' was built under R version 4.1.2
Accession numbers were obtained from RefSeq, Refseq Homlogene, UniProt and PDB. UniProt accession numbers can be found by searching for the gene name. PDB accessions can be found by searching with a UniProt accession or a gene name, though many proteins are not in PDB. The the Neanderthal genome database was searched but did not yield sequence information on.
A protein BLAST search (https://blast.ncbi.nlm.nih.gov/Blast.cgi?PROGRAM=blastp&PAGE_TYPE=BlastSearch&LINK_LOC=blasthome) was carried out excluding vertebrates to determine if it occurred outside of vertebreates. The gene does not appear in non-vertebrates and so a second search was conducted to exclude mammals.
HGNChelper::checkGeneSymbols(x = c("CREBRF"))
## Maps last updated on: Thu Oct 24 12:31:05 2019
## x Approved Suggested.Symbol
## 1 CREBRF TRUE CREBRF
# RefSeq Uniprot PDB sci name common name genename
crebrf_table<-c("NP_705835.2", "Q8IUR6","NA","Homo sapiens" , "Human", "CREBRF",
"XP_518103.2","K6ZKI8", "NA", "Pan troglodytes","Chimpanzee","CREBRF",
"NP_084146.1", "Q8CDG5","NA","Mus musculus", "Mouse" ,"CREBRF",
"NP_001264086.1","D4A811","NA","Rattus norvegicus", "Rat", "CREBRF",
"XP_004912902.1","F7AUN5","NA","Xenopus tropicalis","Frog", "CREBRF",
"XP_005170343.1","A0A0R4IJW4","NA","Danio rerio", "Fish", "CREBRF",
"XP_005619260.1","F6V6Q2","NA","Canis lupus","Dog","CREBRF",
"XP_005221488.1","Q8SQ19","NA","Bos taurus","Bovine","CREBRF",
"XP_004016921.1","W5P1R9","NA","Ovis aries","Sheep","CREBRF",
"XP_005694592.1","A0A452E994","NA","Capra hircus","Goat","CREBRF"
)
crebrf_table
## [1] "NP_705835.2" "Q8IUR6" "NA"
## [4] "Homo sapiens" "Human" "CREBRF"
## [7] "XP_518103.2" "K6ZKI8" "NA"
## [10] "Pan troglodytes" "Chimpanzee" "CREBRF"
## [13] "NP_084146.1" "Q8CDG5" "NA"
## [16] "Mus musculus" "Mouse" "CREBRF"
## [19] "NP_001264086.1" "D4A811" "NA"
## [22] "Rattus norvegicus" "Rat" "CREBRF"
## [25] "XP_004912902.1" "F7AUN5" "NA"
## [28] "Xenopus tropicalis" "Frog" "CREBRF"
## [31] "XP_005170343.1" "A0A0R4IJW4" "NA"
## [34] "Danio rerio" "Fish" "CREBRF"
## [37] "XP_005619260.1" "F6V6Q2" "NA"
## [40] "Canis lupus" "Dog" "CREBRF"
## [43] "XP_005221488.1" "Q8SQ19" "NA"
## [46] "Bos taurus" "Bovine" "CREBRF"
## [49] "XP_004016921.1" "W5P1R9" "NA"
## [52] "Ovis aries" "Sheep" "CREBRF"
## [55] "XP_005694592.1" "A0A452E994" "NA"
## [58] "Capra hircus" "Goat" "CREBRF"
#making matrix from table
crebrf_matrix<-matrix(crebrf_table,nrow=10,ncol=6,byrow = TRUE)
#crebrf_matrix
#converting matrix to dataframe
crebrf_table<-data.frame(crebrf_matrix)
All sequences were downloaded using a wrapper compbio4all::entrez_fetch_list() which uses rentrez::entrez_fetch() to access NCBI databases.
id_list<- crebrf_table[,1]
crebrf_fasta_list<-entrez_fetch_list(db = "protein",
id = id_list,
rettype = "fasta")
Number of FASTA files obtained
length(crebrf_fasta_list)
## [1] 10
The first entry
crebrf_fasta_list[1]
## $NP_705835.2
## [1] ">NP_705835.2 CREB3 regulatory factor isoform 1 [Homo sapiens]\nMPQPSVSGMDPPFGDAFRSHTFSEQTLMSTDLLANSSDPDFMYELDREMNYQQNPRDNFLSLEDCKDIEN\nLESFTDVLDNEGALTSNWEQWDTYCEDLTKYTKLTSCDIWGTKEVDYLGLDDFSSPYQDEEVISKTPTLA\nQLNSEDSQSVSDSLYYPDSLFSVKQNPLPSSFPGKKITSRAAAPVCSSKTLQAEVPLSDCVQKASKPTSS\nTQIMVKTNMYHNEKVNFHVECKDYVKKAKVKINPVQQSRPLLSQIHTDAAKENTCYCGAVAKRQEKKGME\nPLQGHATPALPFKETQELLLSPLPQEGPGSLAAGESSSLSASTSVSDSSQKKEEHNYSLFVSDNLGEQPT\nKCSPEEDEEDEEDVDDEDHDEGFGSEHELSENEEEEEEEEDYEDDKDDDISDTFSEPGYENDSVEDLKEV\nTSISSRKRGKRRYFWEYSEQLTPSQQERMLRPSEWNRDTLPSNMYQKNGLHHGKYAVKKSRRTDVEDLTP\nNPKKLLQIGNELRKLNKVISDLTPVSELPLTARPRSRKEKNKLASRACRLKKKAQYEANKVKLWGLNTEY\nDNLLFVINSIKQEIVNRVQNPRDERGPNMGQKLEILIKDTLGLPVAGQTSEFVNQVLEKTAEGNPTGGLV\nGLRIPTSKV\n\n"
Remove FASTA header
crebrf_list<-list()
for(i in 1:length(crebrf_fasta_list)){
crebrf_list[[i]] <- compbio4all::fasta_cleaner(crebrf_fasta_list[[i]], parse = F)
}
names(crebrf_list)<-crebrf_table[,1]
Specific additional cleaning steps will be as needed for particular analyses
human_features <- drawProteins::get_features(crebrf_table[1,2])
## [1] "Download has worked"
my_prot_df <- drawProteins::feature_to_dataframe(human_features)
my_canvas <- draw_canvas(my_prot_df)
my_canvas <- draw_chains(my_canvas, my_prot_df,
label_size = 2.5)
my_canvas <- draw_domains(my_canvas, my_prot_df)
my_canvas
### Draw dotplot Prepare data
crebrf_human <- fasta_cleaner(crebrf_fasta_list[1],
parse = TRUE)
making 4 plots with different wsize and nmatch
# set up 2 x 2 grid, make margins things
par(mfrow = c(2,2),
mar = c(0,0,2,1))
# plot 1: Defaults
dotPlot(crebrf_human, crebrf_human,
wsize =1 ,
nmatch =1 ,
main = "Defaults")
# plot 2
dotPlot(crebrf_human, crebrf_human,
wsize =10 ,
nmatch =5 ,
main = "wsize=10, nmatch=5")
# plot 3:
dotPlot(crebrf_human, crebrf_human,
wsize = 10,
nmatch =2 ,
main = "wsize=10, nmatch=3")
# plot 4:
dotPlot(crebrf_human, crebrf_human,
wsize =20 ,
nmatch =3 ,
main = "wsize=20, nmatch=3")
best version of the plot:
# be sure to run par - re-run just in case
par(mfrow = c(1,1),
mar = c(4,4,4,4))
dotPlot(crebrf_human, crebrf_human,
wsize =20 ,
nmatch =3 ,
main = "wsize=20, nmatch=3")
## Protein properties compiled from databases Below are links to relevant information. This particular protein is not in
| Pfam;http://pfam.xfam.org/protein/Q8IUR6, there are some low complexity, disorder and coiled region, no named domain | DisProt: no information available | RepeatDB: no information available | UniProt sub-cellular locations: nuclear foci | PDB secondary structural location: no PDB entry available | as shown in alphafold(https://alphafold.ebi.ac.uk/entry/Q8IUR6), the protein has mostly alpha helix with a little short beta sheets and many disorder region
### Protein feature prediction Multivariate statistcal techniques were used to confirm the information about protein structure and location in the line database. / Alphafold indicates that there are a mix of alpha helices and beta sheets. I therefore predict that machine-learning methods will indicate an a+b and a/b structure.
NOTE: My protein contains a “U” for an unknown amino acid. I removed this from the sequence because it is otherwise undefined.
First, I need the data from Chou and Zhang (1994) Table 5. Code to build this table is available at https://rpubs.com/lowbrowR/843543\
The table looks like this:
aa.1.1 <- c("A","R","N","D","C","Q","E","G","H","I",
"L","K","M","F","P","S","T","W","Y","V")
alpha <- c(285, 53, 97, 163, 22, 67, 134, 197, 111, 91,
221, 249, 48, 123, 82, 122, 119, 33, 63, 167)
beta <- c(203, 67, 139, 121, 75, 122, 86, 297, 49, 120,
177, 115, 16, 85, 127, 341, 253, 44, 110, 229)
a.plus.b <- c(175, 78, 120, 111, 74, 74, 86, 171, 33, 93,
110, 112, 25, 52, 71, 126, 117, 30, 108, 123)
a.div.b <- c(361, 146, 183, 244, 63, 114, 257, 377, 107, 239,
339, 321, 91, 158, 188, 327, 238, 72, 130, 378)
pander(data.frame(aa.1.1, alpha, beta, a.plus.b, a.div.b))
| aa.1.1 | alpha | beta | a.plus.b | a.div.b |
|---|---|---|---|---|
| A | 285 | 203 | 175 | 361 |
| R | 53 | 67 | 78 | 146 |
| N | 97 | 139 | 120 | 183 |
| D | 163 | 121 | 111 | 244 |
| C | 22 | 75 | 74 | 63 |
| Q | 67 | 122 | 74 | 114 |
| E | 134 | 86 | 86 | 257 |
| G | 197 | 297 | 171 | 377 |
| H | 111 | 49 | 33 | 107 |
| I | 91 | 120 | 93 | 239 |
| L | 221 | 177 | 110 | 339 |
| K | 249 | 115 | 112 | 321 |
| M | 48 | 16 | 25 | 91 |
| F | 123 | 85 | 52 | 158 |
| P | 82 | 127 | 71 | 188 |
| S | 122 | 341 | 126 | 327 |
| T | 119 | 253 | 117 | 238 |
| W | 33 | 44 | 30 | 72 |
| Y | 63 | 110 | 108 | 130 |
| V | 167 | 229 | 123 | 378 |
Convert to frequencies
alpha.prop <- alpha/sum(alpha)
beta.prop <- beta/sum(beta)
a.plus.b.prop <- a.plus.b/sum(a.plus.b)
a.div.b <- a.div.b/sum(a.div.b)
#dataframe
aa.prop <- data.frame(alpha.prop,
beta.prop,
a.plus.b.prop,
a.div.b)
#row labels
row.names(aa.prop) <- aa.1.1
Table 5 therefore becomes this
pander::pander(aa.prop)
| alpha.prop | beta.prop | a.plus.b.prop | a.div.b | |
|---|---|---|---|---|
| A | 0.1165 | 0.07313 | 0.09264 | 0.08331 |
| R | 0.02166 | 0.02414 | 0.04129 | 0.03369 |
| N | 0.03964 | 0.05007 | 0.06353 | 0.04223 |
| D | 0.06661 | 0.04359 | 0.05876 | 0.05631 |
| C | 0.008991 | 0.02702 | 0.03917 | 0.01454 |
| Q | 0.02738 | 0.04395 | 0.03917 | 0.02631 |
| E | 0.05476 | 0.03098 | 0.04553 | 0.05931 |
| G | 0.08051 | 0.107 | 0.09052 | 0.08701 |
| H | 0.04536 | 0.01765 | 0.01747 | 0.02469 |
| I | 0.03719 | 0.04323 | 0.04923 | 0.05516 |
| L | 0.09031 | 0.06376 | 0.05823 | 0.07824 |
| K | 0.1018 | 0.04143 | 0.05929 | 0.07408 |
| M | 0.01962 | 0.005764 | 0.01323 | 0.021 |
| F | 0.05027 | 0.03062 | 0.02753 | 0.03646 |
| P | 0.03351 | 0.04575 | 0.03759 | 0.04339 |
| S | 0.04986 | 0.1228 | 0.0667 | 0.07547 |
| T | 0.04863 | 0.09114 | 0.06194 | 0.05493 |
| W | 0.01349 | 0.01585 | 0.01588 | 0.01662 |
| Y | 0.02575 | 0.03963 | 0.05717 | 0.03 |
| V | 0.06825 | 0.08249 | 0.06511 | 0.08724 |
Determine the number of each amino acid in my protein.
crebrf.freq.table <- table(crebrf_human)/length(crebrf_human)
table_to_vector <- function(table_x){
table_names <- attr(table_x, "dimnames")[[1]]
table_vect <- as.vector(table_x)
names(table_vect) <- table_names
return(table_vect)
}
crebrf.human.aa.freq <- table_to_vector(crebrf.freq.table)
crebrf.human.aa.freq
## A C D E F G
## 0.042253521 0.015649452 0.076682316 0.103286385 0.026604069 0.045383412
## H I K L M N
## 0.015649452 0.026604069 0.079812207 0.092331768 0.017214397 0.053208138
## P Q R S T V
## 0.061032864 0.050078247 0.039123631 0.103286385 0.057902973 0.053208138
## W Y
## 0.009389671 0.031298905
Check for the presence of “U” (unknown aa.)
aa.names <- names(crebrf.human.aa.freq)
any(aa.names == "U")
## [1] FALSE
There is no Unkown aa in my case, so no need to remove anything. Now add data on my focal protein to the amino acid frequency table.
aa.prop$crebrf.human.aa.freq <- crebrf.human.aa.freq
pander::pander(aa.prop)
| alpha.prop | beta.prop | a.plus.b.prop | a.div.b | crebrf.human.aa.freq | |
|---|---|---|---|---|---|
| A | 0.1165 | 0.07313 | 0.09264 | 0.08331 | 0.04225 |
| R | 0.02166 | 0.02414 | 0.04129 | 0.03369 | 0.01565 |
| N | 0.03964 | 0.05007 | 0.06353 | 0.04223 | 0.07668 |
| D | 0.06661 | 0.04359 | 0.05876 | 0.05631 | 0.1033 |
| C | 0.008991 | 0.02702 | 0.03917 | 0.01454 | 0.0266 |
| Q | 0.02738 | 0.04395 | 0.03917 | 0.02631 | 0.04538 |
| E | 0.05476 | 0.03098 | 0.04553 | 0.05931 | 0.01565 |
| G | 0.08051 | 0.107 | 0.09052 | 0.08701 | 0.0266 |
| H | 0.04536 | 0.01765 | 0.01747 | 0.02469 | 0.07981 |
| I | 0.03719 | 0.04323 | 0.04923 | 0.05516 | 0.09233 |
| L | 0.09031 | 0.06376 | 0.05823 | 0.07824 | 0.01721 |
| K | 0.1018 | 0.04143 | 0.05929 | 0.07408 | 0.05321 |
| M | 0.01962 | 0.005764 | 0.01323 | 0.021 | 0.06103 |
| F | 0.05027 | 0.03062 | 0.02753 | 0.03646 | 0.05008 |
| P | 0.03351 | 0.04575 | 0.03759 | 0.04339 | 0.03912 |
| S | 0.04986 | 0.1228 | 0.0667 | 0.07547 | 0.1033 |
| T | 0.04863 | 0.09114 | 0.06194 | 0.05493 | 0.0579 |
| W | 0.01349 | 0.01585 | 0.01588 | 0.01662 | 0.05321 |
| Y | 0.02575 | 0.03963 | 0.05717 | 0.03 | 0.00939 |
| V | 0.06825 | 0.08249 | 0.06511 | 0.08724 | 0.0313 |
Two custom functions are needed: one to calculate correlates between two columns of our table, and one to calculate correlation similarities.
# Corrleation used in Chou adn Zhange 1992.
chou_cor <- function(x,y){
numerator <- sum(x*y)
denominator <- sqrt((sum(x^2))*(sum(y^2)))
result <- numerator/denominator
return(result)
}
# Cosine similarity used in Higgs and Attwood (2005).
chou_cosine <- function(z.1, z.2){
z.1.abs <- sqrt(sum(z.1^2))
z.2.abs <- sqrt(sum(z.2^2))
my.cosine <- sum(z.1*z.2)/(z.1.abs*z.2.abs)
return(my.cosine)
}
Calculate correlation between each column
corr.alpha <- chou_cor(aa.prop[,5], aa.prop[,1])
corr.beta <- chou_cor(aa.prop[,5], aa.prop[,2])
corr.apb <- chou_cor(aa.prop[,5], aa.prop[,3])
corr.adb <- chou_cor(aa.prop[,5], aa.prop[,4])
cos.alpha <- chou_cosine(aa.prop[,5], aa.prop[,1])
cos.beta <- chou_cosine(aa.prop[,5], aa.prop[,2])
cos.apb <- chou_cosine(aa.prop[,5], aa.prop[,3])
cos.adb <- chou_cosine(aa.prop[,5], aa.prop[,4])
Calculate distance.
Note: we need to flip the dataframe on its side using a command called t()
aa.prop.flipped <- t(aa.prop)
round(aa.prop.flipped,2)
## A R N D C Q E G H I L
## alpha.prop 0.12 0.02 0.04 0.07 0.01 0.03 0.05 0.08 0.05 0.04 0.09
## beta.prop 0.07 0.02 0.05 0.04 0.03 0.04 0.03 0.11 0.02 0.04 0.06
## a.plus.b.prop 0.09 0.04 0.06 0.06 0.04 0.04 0.05 0.09 0.02 0.05 0.06
## a.div.b 0.08 0.03 0.04 0.06 0.01 0.03 0.06 0.09 0.02 0.06 0.08
## crebrf.human.aa.freq 0.04 0.02 0.08 0.10 0.03 0.05 0.02 0.03 0.08 0.09 0.02
## K M F P S T W Y V
## alpha.prop 0.10 0.02 0.05 0.03 0.05 0.05 0.01 0.03 0.07
## beta.prop 0.04 0.01 0.03 0.05 0.12 0.09 0.02 0.04 0.08
## a.plus.b.prop 0.06 0.01 0.03 0.04 0.07 0.06 0.02 0.06 0.07
## a.div.b 0.07 0.02 0.04 0.04 0.08 0.05 0.02 0.03 0.09
## crebrf.human.aa.freq 0.05 0.06 0.05 0.04 0.10 0.06 0.05 0.01 0.03
We can get distance matrix like this
dist(aa.prop.flipped, method = "euclidean")
## alpha.prop beta.prop a.plus.b.prop a.div.b
## beta.prop 0.13342098
## a.plus.b.prop 0.09281824 0.08289406
## a.div.b 0.06699039 0.08659174 0.06175113
## crebrf.human.aa.freq 0.18209101 0.17456055 0.16419774 0.16592236
Individual distancesusing dist()
dist.alpha <- dist((aa.prop.flipped[c(1,5),]), method = "euclidean")
dist.beta <- dist((aa.prop.flipped[c(2,5),]), method = "euclidean")
dist.apb <- dist((aa.prop.flipped[c(3,5),]), method = "euclidean")
dist.adb <- dist((aa.prop.flipped[c(4,5),]), method = "euclidean")
Compile the information. Rounding makes it easier to read
# fold types
fold.type <- c("alpha","beta","alpha plus beta", "alpha/beta")
# data
corr.sim <- round(c(corr.alpha,corr.beta,corr.apb,corr.adb),5)
cosine.sim <- round(c(cos.alpha,cos.beta,cos.apb,cos.adb),5)
Euclidean.dist <- round(c(dist.alpha,dist.beta,dist.apb,dist.adb),5)
# summary
sim.sum <- c("","","most.sim","")
dist.sum <- c("","","min.dist","")
df <- data.frame(fold.type,
corr.sim ,
cosine.sim ,
Euclidean.dist ,
sim.sum ,
dist.sum )
Display output
pander::pander(df)
| fold.type | corr.sim | cosine.sim | Euclidean.dist | sim.sum | dist.sum |
|---|---|---|---|---|---|
| alpha | 0.7501 | 0.7501 | 0.1821 | ||
| beta | 0.7735 | 0.7735 | 0.1746 | ||
| alpha plus beta | 0.7857 | 0.7857 | 0.1642 | most.sim | min.dist |
| alpha/beta | 0.7845 | 0.7845 | 0.1659 |
Convert all FASTA records intro entries in a single vector. FASTA entries are contained in a list produced at the beginning of the script. They were cleaned to remove the header and newline characters.
names(crebrf_list)
## [1] "NP_705835.2" "XP_518103.2" "NP_084146.1" "NP_001264086.1"
## [5] "XP_004912902.1" "XP_005170343.1" "XP_005619260.1" "XP_005221488.1"
## [9] "XP_004016921.1" "XP_005694592.1"
length(crebrf_list)
## [1] 10
Each entry is a full entry with no spaces or parsing, and no header
crebrf_list[1]
## $NP_705835.2
## [1] "MPQPSVSGMDPPFGDAFRSHTFSEQTLMSTDLLANSSDPDFMYELDREMNYQQNPRDNFLSLEDCKDIENLESFTDVLDNEGALTSNWEQWDTYCEDLTKYTKLTSCDIWGTKEVDYLGLDDFSSPYQDEEVISKTPTLAQLNSEDSQSVSDSLYYPDSLFSVKQNPLPSSFPGKKITSRAAAPVCSSKTLQAEVPLSDCVQKASKPTSSTQIMVKTNMYHNEKVNFHVECKDYVKKAKVKINPVQQSRPLLSQIHTDAAKENTCYCGAVAKRQEKKGMEPLQGHATPALPFKETQELLLSPLPQEGPGSLAAGESSSLSASTSVSDSSQKKEEHNYSLFVSDNLGEQPTKCSPEEDEEDEEDVDDEDHDEGFGSEHELSENEEEEEEEEDYEDDKDDDISDTFSEPGYENDSVEDLKEVTSISSRKRGKRRYFWEYSEQLTPSQQERMLRPSEWNRDTLPSNMYQKNGLHHGKYAVKKSRRTDVEDLTPNPKKLLQIGNELRKLNKVISDLTPVSELPLTARPRSRKEKNKLASRACRLKKKAQYEANKVKLWGLNTEYDNLLFVINSIKQEIVNRVQNPRDERGPNMGQKLEILIKDTLGLPVAGQTSEFVNQVLEKTAEGNPTGGLVGLRIPTSKV"
Make each entry of the list into a vector. There are several ways to do this.
crebrf_vector<- rep(NA, length(crebrf_list))
for(i in 1:length(crebrf_list)){
crebrf_vector[i] <- crebrf_list[[i]]
}
Name the vector
names(crebrf_vector) <- names(crebrf_list)
Do pairwise alignments for humans, chimps and 2-other species.
01:human
02:chimps
03: mouse
04: rat
align01.02<- Biostrings::pairwiseAlignment(crebrf_vector[1], crebrf_vector[2])
align01.03<- Biostrings::pairwiseAlignment(crebrf_vector[1], crebrf_vector[3])
align01.04<- Biostrings::pairwiseAlignment(crebrf_vector[1], crebrf_vector[4])
align02.03<- Biostrings::pairwiseAlignment(crebrf_vector[2], crebrf_vector[3])
align02.04<- Biostrings::pairwiseAlignment(crebrf_vector[2], crebrf_vector[4])
align03.04<- Biostrings::pairwiseAlignment(crebrf_vector[3], crebrf_vector[4])
Biostrings::pid(align01.03)
## [1] 93.75
Biostrings::pid(align01.04)
## [1] 94.375
Build matrix
pids <- c(1, NA, NA, NA,
pid(align01.02), 1, NA, NA,
pid(align01.03), pid(align02.03), 1, NA,
pid(align01.04), pid(align02.04), pid(align03.04), 1)
mat <- matrix(pids, nrow = 4, byrow = T)
row.names(mat) <- c("Homo","Pan","Mouse","Rat")
colnames(mat) <- c("Homo","Pan","Mouse","Rat")
pander::pander(mat)
| Homo | Pan | Mouse | Rat | |
|---|---|---|---|---|
| Homo | 1 | NA | NA | NA |
| Pan | 100 | 1 | NA | NA |
| Mouse | 93.75 | 93.75 | 1 | NA |
| Rat | 94.38 | 94.38 | 98.12 | 1 |
Compare different PID methods.
We were told to use the chimps’ data, but in this case the pid is 100 and all pid methods should be the same, so I use human vs. mouse instead
pid_1<-pid(align01.03, type="PID1")
pid_2<-pid(align01.03, type="PID2")
pid_3<-pid(align01.03, type="PID3")
pid_4<-pid(align01.03, type="PID4")
pid_comparison<-data.frame(c("PID1","PID2","PID3","PID4"),c(pid_1,pid_2,pid_3,pid_4),c("(aligned positions + internal gap positions)","(aligned positions)","(length shorter sequence)","(average length of the two sequences)"))
colnames(pid_comparison)<-c("method","PID","denominator")
pander(pid_comparison)
| method | PID | denominator |
|---|---|---|
| PID1 | 93.75 | (aligned positions + internal gap positions) |
| PID2 | 93.9 | (aligned positions) |
| PID3 | 93.9 | (length shorter sequence) |
| PID4 | 93.82 | (average length of the two sequences) |
For use with R bioinformatics tools we need to convert our named vector to a string set using Biostrings::AAStringSet(). Note the _ss tag at the end of the object we’re assigning the output to, which designates this as a string set.
names(crebrf_vector)<-names(crebrf_list)
crebrf_vector_ss <- Biostrings::AAStringSet(crebrf_vector)
crebrf_align <- msa(crebrf_vector_ss,
method = "ClustalW")
## use default substitution matrix
msa produces a species MSA objects
class(crebrf_align)
## [1] "MsaAAMultipleAlignment"
## attr(,"package")
## [1] "msa"
is(crebrf_align)
## [1] "MsaAAMultipleAlignment" "AAMultipleAlignment" "MsaMetaData"
## [4] "MultipleAlignment"
crebrf_align
## CLUSTAL 2.1
##
## Call:
## msa(crebrf_vector_ss, method = "ClustalW")
##
## MsaAAMultipleAlignment with 10 rows and 727 columns
## aln names
## [1] -------------------------...QVLGKTAEGNPTGGLVGLRIPASKV NP_084146.1
## [2] -------------------------...QVLGKTAEGNPTGGLVGLRIPASKV NP_001264086.1
## [3] -------------------------...QVLEKTAEGNPTGGLVGLRIPTSKV NP_705835.2
## [4] -------------------------...QVLEKTAEGNPTGGLVGLRIPTSKV XP_518103.2
## [5] -------------------------...QVLEKTAEGNPTGGLVGLRIPTSKV XP_005619260.1
## [6] -------------------------...QVLEKTAEGNPTGGLVGLRIPTSKV XP_004016921.1
## [7] -------------------------...QVLEKTAEGNPTGGLVGLRIPTSKV XP_005694592.1
## [8] -------------------------...QVLEKTAEGNPTGGLVGLRIPTSKV XP_005221488.1
## [9] -MYSSFVIDSTEKLQKRSVGQQRKY...QVLEKTSDGDATGGLVGLRIPTSKV XP_004912902.1
## [10] MFKTKLPPEIDDKIGRLRQRRCQKS...QILENTGKGDPTGGLVGLRVPTSKV XP_005170343.1
## Con -------------------------...QVLEKTAEGNPTGGLVGLRIPTSKV Consensus
class(crebrf_align) <- "AAMultipleAlignment"
crebrf_align_seqinr <- msaConvert(crebrf_align,
type = "seqinr::alignment")
compbio4all::print_msa(alignment = crebrf_align_seqinr,
chunksize = 60)
## [1] "------------------------------------MPQPSVSGMDPPFGDAFRSHTFSE 0"
## [1] "------------------------------------MPQPSVSGMDPPFGDAFRSHTFSE 0"
## [1] "------------------------------------MPQPSVSGMDPPFGDAFRSHTFSE 0"
## [1] "------------------------------------MPQPSVSGMDPPFGDAFRSHTFSE 0"
## [1] "------------------------------------MPQPSVSGMDPPFGDAFRSHTFSE 0"
## [1] "------------------------------------MPQPSVSGMDPPFGDAFRSHTFSE 0"
## [1] "------------------------------------MPQPSVSGMDPPFGDAFRSHTFSE 0"
## [1] "------------------------------------MPQPSVSGMDPPFGDAFRSHTFSE 0"
## [1] "-MYSSFVIDSTEKLQKRSVGQQRKYLGLSPENLGFEMPQPSVSGMDPPFGDAFRSPTFPE 0"
## [1] "MFKTKLPPEIDDKIGRLRQRRCQKSFRQGTEDLCSAMPQPSISGMDPPFGDAFQNCSFAD 0"
## [1] " "
## [1] "QTLMSTDLLANSSDPDFMYELDREMNYQQNPRDNFLSLEDCKDIENLETFTDVLDNEDAL 0"
## [1] "QTLMSTDLLANSSDPDFMYELDREMNYQQNPRDNFLSLEDCKDIENLETFTDVLDNEDAL 0"
## [1] "QTLMSTDLLANSSDPDFMYELDREMNYQQNPRDNFLSLEDCKDIENLESFTDVLDNEGAL 0"
## [1] "QTLMSTDLLANSSDPDFMYELDREMNYQQNPRDNFLSLEDCKDIENLESFTDVLDNEGAL 0"
## [1] "QTLMSTDLLANSSDPDFMYELDREMNYQQNPRDNFLSLEDCKDIENLESFTDVLDNEGAL 0"
## [1] "QTLMSTDLLANSSDPDFMYELDREMNYQQNPRDNFLSLEDCKDIENLESFTDVLDNEGTL 0"
## [1] "QTLMSTDLLANSSDPDFMYELDREMNYQQNPRDNFLSLEDCKDIENLESFTDVLDNEGTL 0"
## [1] "QTLMSTDLLANSSDPDFMYELDREMNYQQNPRDNFLSLEDCKDIENLESFTDVLDNEGTL 0"
## [1] "QTLMSTDLLANSSDPMFMYELDQELGYQQSPTENFLMLEDGKDFENLESFTDVLESSPPV 0"
## [1] "QALTSTDLLANSSDPDFMYELDRDMTCRQSPRGDIFTVGDCKDLDTMDHLPELVDSDPAC 0"
## [1] " "
## [1] "TSNWEQWDTYCEDLTKYTKLTSCDIWGTKEVDYLGLDDFSSPYQDEEVISKTPTLAQLNS 0"
## [1] "TSNWEQWDTYCEDLTKYTKLTSCDIWGTKEVDYLGLDDFSSPYQDEEVISKTPTLAQLNS 0"
## [1] "TSNWEQWDTYCEDLTKYTKLTSCDIWGTKEVDYLGLDDFSSPYQDEEVISKTPTLAQLNS 0"
## [1] "TSNWEQWDTYCEDLTKYTKLTSCDIWGTKEVDYLGLDDFSSPYQDEEVISKTPTLAQLNS 0"
## [1] "TSNWEQWDTYCEDLTKYTKLTSCDIWGTKEVDYLGLDDFSSPYQDEEVISKTPTLAQLNS 0"
## [1] "TSNWEQWDTYCEDLTKYTKLTSCDIWGTKEVDYLGLDDFSSPYQDEEVISKTPTLAQLNS 0"
## [1] "TSNWEQWDTYCEDLTKYTKLTSCDIWGTKEVDYLGLDDFSSPYQDEEVISKTPTLAQLNS 0"
## [1] "TSNWEQWDTYCEDLTKYTKLTSCDIWGTKEVDYLGLDDFSSPYQDEEVISKTPTLAQLNS 0"
## [1] "TSNWEQWDTYCEDLTKYTKLTSCDIWGTKEVDYLGLDDFSSPYQDEEVISKTPTLAQLNS 0"
## [1] "NSNFEQWDTYWEDLTKYTRLTSCDIWGTKEVDFLGLDDFSSPYQDEEVISRTPTLAQLNS 0"
## [1] " "
## [1] "EDSQSVSDSLYYPDSLFSVKQNPLPPSSFPSKKITN-RAAAPVCSSKTLQAEVP-SSDCV 0"
## [1] "EDSQSVSDSLYYPDSLFSVKQNPLPPSSFPSKKITS-RAAAPVCSSKILQAEVP-SSDCV 0"
## [1] "EDSQSVSDSLYYPDSLFSVKQNPLP-SSFPGKKITS-RAAAPVCSSKTLQAEVP-LSDCV 0"
## [1] "EDSQSVSDSLYYPDSLFSVKQNPLP-SSFPGKKITS-RAAAPVCSSKTLQAEVP-LSDCV 0"
## [1] "EDSQSVSDSLYYPDSLFSVKQNPLP-SSFPSKKISN-RAAAPVCSSKTLQAEVP-LSDCV 0"
## [1] "EDSQSVSDSLYYPDSLFSVKQNPLA-SSFPGKKITS-RAAAPVCSSKALQAEVP-LSDCV 0"
## [1] "EDSQSVSDSLYYPDSLFSVKQNPLA-SSFPGKKITS-RAAAPVCSSKALQAEVP-LSDCV 0"
## [1] "EDSQSVSDSLYYPDSLFSVKQNPLA-SSFPGKKITS-RAAAPVCSSKTLQAEVP-LSDCV 0"
## [1] "EDSQSVADSLYYTDSFLGTKT-----SYVPGKKVTAVRVTAPVCSSKTSHTEVP-LADCS 0"
## [1] "EDSQPVCDTLYHPDLLVGQKQLLFP--TPTMAKKTN-RFSNPSTGASSSRNPLPDFAEGS 0"
## [1] " "
## [1] "QKASKPT-SSTQIMVKTNMYHNEKVN-FHVECKDYVKKAKVKINP--------------V 0"
## [1] "QKASKPT-SSTQIMVKTNMYHNEKVN-FHVECKDYVKKAKVKINP--------------V 0"
## [1] "QKASKPT-SSTQIMVKTNMYHNEKVN-FHVECKDYVKKAKVKINP--------------V 0"
## [1] "QKASKPT-SSTQIMVKTNMYHNEKVN-FHVECKDYVKKAKVKINP--------------V 0"
## [1] "QKASKPT-SSTQIMVKTNMYHNEKVN-FHVECKDYVKKAKVKINP--------------V 0"
## [1] "QKASKPT-SSTQIMVKTNMYHNEKVN-FHVECKDYVKKAKVKINP--------------V 0"
## [1] "QKASKPT-SSTQIMVKTNMYHNEKVN-FHVECKDYVKKAKVKINP--------------V 0"
## [1] "QKASKPT-SSTQIMVKTNMYHNEKVN-FHVECKDYVKKAKVKINP--------------V 0"
## [1] "QKATKSV-ASIQMAVKTNVSCEEKIN-LHGGCKDYVKKAKVKISS--------------V 0"
## [1] "QKATWPVPSSTDTMAKAQIHGPSKTSTELLENTDYVRKAKVCFSVPRKPLVESSAQPVSI 0"
## [1] " "
## [1] "QQGRPLLSQVHIDAAKENTCYCGAVAKRQERRGVEPHQGRGTP-ALPFKETQELLLSPLT 0"
## [1] "QQGRPLLSQVHIDAAKENTCYCGAVVKRQEKRGVEPHQGHGTP-ALPFRETQELLLSPLP 0"
## [1] "QQSRPLLSQIHTDAAKENTCYCGAVAKRQEKKGMEPLQGHATP-ALPFKETQELLLSPLP 0"
## [1] "QQSRPLLSQIHTDAAKENTCYCGAVAKRQEKKGMEPLQGHATP-ALPFKETQELLLSPLP 0"
## [1] "QQSRPLLSQIHADAAKENTCYCGAVAKRPEKKGMEPLQGHATP-ALPFKETQELLLSPLS 0"
## [1] "QQSRPLLSQVHADAAKENTCYCGAVAKRPEKRGTEPLQGHATP-ALPFKEAQELLLSPLP 0"
## [1] "QQSRPLLSQVHADAAKENTCYCGAVAKRPEKRGTEPLQGHATP-ALPFKEAQELLLSPLP 0"
## [1] "QQSRPLLSQVHADAAKENTCYCGAVAKRPEKKGTEPLQGHATP-ALPFKETQELLLSPLP 0"
## [1] "LPCRPTTTMVAADAAKENTCFCGALAKKQEKAIGEPPIGRKNYGILPFKETQKLFCP--- 0"
## [1] "SSSTPIQESTASLVSEDPAVITAANTASSISCEKRVAIPKREVTSGPAPEVGTLRAVPHK 0"
## [1] " "
## [1] "QDSPGLVATAE-SGSLSASTSVSDSSQKKEE-HNYSLFVS---------DNMREQPTKYS 0"
## [1] "QHGPGLVATAE-SGSLSASTSVSDSSQKKEE-HNYSLFVS---------DNMREQPTKYS 0"
## [1] "QEGPGSLAAGE-SSSLSASTSVSDSSQKKEE-HNYSLFVS---------DNLGEQPTKCS 0"
## [1] "QEGPGSLAAGE-SSSLSASTSVSDSSQKKEE-HNYSLFVS---------DNLGEQPTKCS 0"
## [1] "QEGPGSIAAGE-SSSLSASTLVSDSSQKKEE-HNYSLFVS---------DNLGEQPTKCS 0"
## [1] "QEGPGPVAAGE-SSSLSASTSVSDSSQKKEE-HNYSLFVS---------DNLGEQPTKCS 0"
## [1] "QEGPGSVAAGE-SSSLSASTSVSDSSQKKEE-HNYSLFVS---------DNLGEQPTKCS 0"
## [1] "QEGPGSVAAGE-SSSLSASTSVSDSSQKKEE-HNYSLFVS---------DNLGEQPTKCS 0"
## [1] "--APALITIGPGDGNTSAGASVSDPSQKKEE-HNYSLFVS---------EGLTEPTVKSE 0"
## [1] "LHFPSATGSETLLTMSALGSDASAADKKKEEEHNYSLFLTRARLAGKVTADIEEEEEEEE 0"
## [1] " "
## [1] "P----EDDEDDEDE-------FDDEDHDEGFGSEHELSENEEEEEEEE---------DYE 0"
## [1] "P----EDDEDDEDD-------IDDEDHDEGFGSEHELSENEEEEEEEE---------DYE 0"
## [1] "P----EEDEEDEED-------VDDEDHDEGFGSEHELSENEEEEEEEE---------DYE 0"
## [1] "P----EEDEEDEED-------VDDEDHDEGFGSEHELSENEEEEEEEE---------DYE 0"
## [1] "P----EEDEEDEED-------VDDEDHDEGFGSEHELSENEEEEEEEE---------DYE 0"
## [1] "P----EEDEEDEED-------VDDEDHDEGFGSEHELSENEEEEEEEE---------DYE 0"
## [1] "P----EEDEEDEED-------VDDEDHDEGFGSEHELSENEEEEEEEE---------DYE 0"
## [1] "P----EEDEEDEED-------VDDEDHDEGFGSEHELSENEEEEEEEE---------DYE 0"
## [1] "ISQQQEEEEEEEAE-------LDEEDHDEGFGSEHELSENEEEEEEEEEEEEEEEEEDYE 0"
## [1] "EEVDDEEEEEEEEEEKAEELDLDDEDHDEGFGSEHELSENDEEEEEEDD-------EDYE 0"
## [1] " "
## [1] "DDRDDDISDTFSEPGYENDSVEDLKEMTSISSRKRGKRRYFWEYSEQLTPSQQERILRPS 0"
## [1] "DDRDDDISDTFSEPGYENDSVEDLKEMTSISSRKRGKRRYFWEYSEQLTPSQQERILRPS 0"
## [1] "DDKDDDISDTFSEPGYENDSVEDLKEVTSISSRKRGKRRYFWEYSEQLTPSQQERMLRPS 0"
## [1] "DDKDDDISDTFSEPGYENDSVEDLKEVTSISSRKRGKRRYFWEYSEQLTPSQQERMLRPS 0"
## [1] "DDKDDDISDTFSEPGYENDSVEDLKEMTSISSRKRGKRRYFWEYSEQLTPSQQERMLRPS 0"
## [1] "DDKDDDISDTFSEPGYENDSVEDLKEMTSITSRKRGKRRYFWEYSEQLTPSQQERMLRPS 0"
## [1] "DDKDDDISDTFSEPGYENDSVEDLKEMTSITSRKRGKRRYFWEYSEQLTPSQQERMLRPS 0"
## [1] "DDKDDDISDTFSEPGYENDSVEDLKEMTSVTSRKRGKRRYFWEYSEQLTPSQQERMLRPS 0"
## [1] "DDKDD-ISDTFSEPGYENDSVEDLKGITAISSRKRGKRRYFWEYSEQLLPSQQEGTLRPS 0"
## [1] "GDKDDYASDAFSEPGCDTDMVDDVKGLTAGISRKRGKRRYFWEYSEQIIPFKQERMLKPS 0"
## [1] " "
## [1] "EWNRDTLPSNMYQKNGLHHGKYAVKKSRRTDVEDLTPNPKKLLQIGNELRKLNKVISDLT 0"
## [1] "EWNRDTLPSNMYQKNGLHHGKYAVKKSRRTDVEDLTPNPKKLLQIGNELRKLNKVISDLT 0"
## [1] "EWNRDTLPSNMYQKNGLHHGKYAVKKSRRTDVEDLTPNPKKLLQIGNELRKLNKVISDLT 0"
## [1] "EWNRDTLPSNMYQKNGLHHGKYAVKKSRRTDVEDLTPNPKKLLQIGNELRKLNKVISDLT 0"
## [1] "EWNRDTLPSNMYQKNGLHHGKYAVKKSRRTDVEDLTPNPKKLLQIGNELRKLNKVISDLT 0"
## [1] "EWNRETLPSNMYQKNGLHHGKYAVKKSRRTDVEDLTPNPKKLLQIGNELRKLNKVISDLT 0"
## [1] "EWNRETLPSNMYQKNGLHHGKYAVKKSRRTDVEDLTPNPKKLLQIGNELRKLNKVISDLT 0"
## [1] "EWNRETLPSNMYQKNGLHHGKYAVKKSRRTDVEDLTPNPKKLLQIGNELRKLNKVISDLT 0"
## [1] "EWNRDTLPSNMYQKNGLHHGKYTVKKSRRTDVEDLTPNPRKLLQIGGELRKLNKVISDLT 0"
## [1] "EWDRDTLPSNMYQKNGLHHGKYTLKKSRRTDVEDLTPNPRKLLQIGNELRKLNKVISDLT 0"
## [1] " "
## [1] "PVSELPLTARPRSRKEKNKLASRACRLKKKAQYEANKVKLWGLNTEYDNLLFVINSIKQD 0"
## [1] "PVSELPLTARPRSRKEKNKLASRACRLKKKAQYEANKVKLWGLNTEYDNLLFVINSIKQD 0"
## [1] "PVSELPLTARPRSRKEKNKLASRACRLKKKAQYEANKVKLWGLNTEYDNLLFVINSIKQE 0"
## [1] "PVSELPLTARPRSRKEKNKLASRACRLKKKAQYEANKVKLWGLNTEYDNLLFVINSIKQE 0"
## [1] "PVSELPLTARPRSRKEKNKLASRACRLKKKAQYEANKVKLWGLNTEYDNLLFVINSIKQE 0"
## [1] "PVSELPLTARPRSRKEKNKLASRACRLKKKAQYEANKVKLWGLNTEYDNLLFVINSIKQE 0"
## [1] "PVSELPLTARPRSRKEKNKLASRACRLKKKAQYEANKVKLWGLNTEYDNLLFVINSIKQE 0"
## [1] "PVSELPLTARPRSRKEKNKLASRACRLKKKAQYEANKVKLWGLNTEYDNLLFVINSIKQE 0"
## [1] "PVSELPLTARPRSRKEKNKLASRACRLKKKAQYEANKVKLWGLNTEYDNLLFVINSIKQE 0"
## [1] "PVSELPLTARPRSRKEKNKLASRACRLKKKAQYEANKVKLWGLGTEYDRLLFVINAIKEE 0"
## [1] " "
## [1] "IVNRVQNPREEREPSMGQKLEILIKDTLGLP-VAGQTSEFVNQVLGKTAEGNPTGGLVGL 0"
## [1] "IVNRVQNPREERGPSMGQKLEILIKDTLGLP-VAGQTSEFVNQVLGKTAEGNPTGGLVGL 0"
## [1] "IVNRVQNPRDERGPNMGQKLEILIKDTLGLP-VAGQTSEFVNQVLEKTAEGNPTGGLVGL 0"
## [1] "IVNRVQNPRDERGPNMGQKLEILIKDTLGLP-VAGQTSEFVNQVLEKTAEGNPTGGLVGL 0"
## [1] "IVNRVQNPREERGPNMGQKLEILIKDTLGLP-VAGQTSEFVNQVLEKTAEGNPTGGLVGL 0"
## [1] "IVNRVQNPREERGPNMGQKLEILIKDTLGLP-VAGQTSEFVNQVLEKTAEGNPTGGLVGL 0"
## [1] "IVNRVQNPREERGPNMGQKLEILIKDTLGLP-VAGQTSEFVNQVLEKTAEGNPTGGLVGL 0"
## [1] "IVNRVQSPREERGPNMGQKLEILIKDTLGLP-VAGQTSEFVNQVLEKTAEGNPTGGLVGL 0"
## [1] "IMNRIQIPCEEWSASMGQKLDAMIKDTIGSP-VAGQTSEFVNQVLEKTSDGDATGGLVGL 0"
## [1] "IVNKVQDISQDKTTSMTEKLDKLIEDTLVEPPVASQTSDFVNQILENTGKGDPTGGLVGL 0"
## [1] " "
## [1] "RIPASKV 53"
## [1] "RIPASKV 53"
## [1] "RIPTSKV 53"
## [1] "RIPTSKV 53"
## [1] "RIPTSKV 53"
## [1] "RIPTSKV 53"
## [1] "RIPTSKV 53"
## [1] "RIPTSKV 53"
## [1] "RIPTSKV 53"
## [1] "RVPTSKV 53"
## [1] " "
Based on the output from drawProtiens, the 500-800 aa is shown to be bZIP, so I print the msa of 650-750(550-800 is too much)
class(crebrf_align) <- "AAMultipleAlignment"
# run ggmsa
ggmsa::ggmsa(crebrf_align,
start = 650,
end = 750)
Make a distance matrix
This produces a “dist” class object.
crebrf_dist <- seqinr::dist.alignment(crebrf_align_seqinr,
matrix = "identity")
is(crebrf_dist)
## [1] "dist" "oldClass"
class(crebrf_dist)
## [1] "dist"
crebrf_align_seqinr_rnd <- round(crebrf_dist, 3)
crebrf_align_seqinr_rnd
## NP_084146.1 NP_001264086.1 NP_705835.2 XP_518103.2
## NP_001264086.1 0.137
## NP_705835.2 0.247 0.234
## XP_518103.2 0.247 0.234 0.000
## XP_005619260.1 0.244 0.241 0.125 0.125
## XP_004016921.1 0.253 0.237 0.153 0.153
## XP_005694592.1 0.253 0.237 0.148 0.148
## XP_005221488.1 0.256 0.244 0.143 0.143
## XP_004912902.1 0.473 0.480 0.465 0.465
## XP_005170343.1 0.637 0.638 0.630 0.630
## XP_005619260.1 XP_004016921.1 XP_005694592.1 XP_005221488.1
## NP_001264086.1
## NP_705835.2
## XP_518103.2
## XP_005619260.1
## XP_004016921.1 0.158
## XP_005694592.1 0.153 0.040
## XP_005221488.1 0.148 0.097 0.088
## XP_004912902.1 0.467 0.468 0.468 0.467
## XP_005170343.1 0.630 0.633 0.633 0.633
## XP_004912902.1
## NP_001264086.1
## NP_705835.2
## XP_518103.2
## XP_005619260.1
## XP_004016921.1
## XP_005694592.1
## XP_005221488.1
## XP_004912902.1
## XP_005170343.1 0.651
Build a phylogenetic tree from distance matrix
tree_subset <- nj(crebrf_dist)
# plot tree
plot.phylo(tree_subset, main="Phylogenetic Tree",
use.edge.length = F)
# add label
mtext(text = "CREBRF family gene tree - rooted, no branch lenths")