Introduction

This code compiles summary information about the gene NOS2 (nitric oxide synthase 2).

It also generates pairwise alignments and creates a phylogenetic tree to indicate the evolutionary relationship between the human version of the gene and other homologs from different species.

Preparation

library(BiocManager)
## Bioconductor version '3.13' is out-of-date; the current release version '3.14'
##   is available with R version '4.1'; see https://bioconductor.org/install
library(drawProteins)

library(compbio4all)
library(ggmsa)
## Registered S3 methods overwritten by 'ggalt':
##   method                  from   
##   grid.draw.absoluteGrob  ggplot2
##   grobHeight.absoluteGrob ggplot2
##   grobWidth.absoluteGrob  ggplot2
##   grobX.absoluteGrob      ggplot2
##   grobY.absoluteGrob      ggplot2
library(rentrez)
library(seqinr)
library(ape)
## 
## Attaching package: 'ape'
## The following objects are masked from 'package:seqinr':
## 
##     as.alignment, consensus
library(pander)


library(ggplot2)

library(msa)
## Loading required package: Biostrings
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
##     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
## Loading required package: stats4
## 
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## Loading required package: IRanges
## 
## Attaching package: 'IRanges'
## The following object is masked from 'package:grDevices':
## 
##     windows
## Loading required package: XVector
## Loading required package: GenomeInfoDb
## 
## Attaching package: 'Biostrings'
## The following object is masked from 'package:ape':
## 
##     complement
## The following object is masked from 'package:seqinr':
## 
##     translate
## The following object is masked from 'package:base':
## 
##     strsplit
## 
## Attaching package: 'msa'
## The following object is masked from 'package:BiocManager':
## 
##     version
library(drawProteins)

library(Biostrings)

Accession numbers

Accession numbers were obtained from RefSeq, Refseq Homlogene, UniProt and PDB. UniProt accession numbers can be found by searching for the gene name. PDB accessions can be found by searching with a UniProt accession or a gene name, though many proteins are not in PDB. The Neanderthal genome database was searched but did not have any information on Nos2. PDB only had information on the human Nos2 gene.

A protein BLAST search (https://blast.ncbi.nlm.nih.gov/Blast.cgi?PROGRAM=blastp&PAGE_TYPE=BlastSearch&LINK_LOC=blasthome) was carried out excluding vertebrates and mammals to determine if it occurred outside of vertebrates and mammals. The gene does not appear in non-vertebrates or non-mammals.

Not available: Neanderthal

Does not occur: Outside of mammals and vertebrates

Accession number table

name = c("Human", "Zebrafish", "Norway rat", "Cattle", "Dog", "Sheep", "Pig", "House mouse", "Chicken", "Turkey")
id = c("NP_000616.3", "NP_001098407.1", "NP_036743.3", " NP_851380.2", "NP_001300777.1", "NP_001123373.1", "NP_999460.1", "NP_001300850.1", "NP_990292.2", "NP_001290142.1")

uni = c("P35228", "A7WK81", "Q06518", "A0A3Q1LSD1", "O62699", "Q4U3W6", "Q28969", "P29477", "NA", "H6S058")

sci_name = c("Homo sapiens", "Danio rerio", "Rattus norvegicus", "Bos taurus", "Canis lupus familiaris", "Ovis aries", "Sus scrofa", "Mus musculus", "Gallus gallus", "Meleagris gallopavo")

gene_name = c("NOS2", "nos2a","Nos2", "NOS3", "NOS2", "NOS3", "NOS3", "Nos2", " NOS2", "NOS2"  )

pdb = c("1NSI", "NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA")
nos2_table <- data.frame(ncbi.protein.accession = id,  UniProt.id= uni, PDB = pdb, species = sci_name, common.name = name, gene.name = gene_name)

pander(nos2_table)
Table continues below
ncbi.protein.accession UniProt.id PDB species
NP_000616.3 P35228 1NSI Homo sapiens
NP_001098407.1 A7WK81 NA Danio rerio
NP_036743.3 Q06518 NA Rattus norvegicus
NP_851380.2 A0A3Q1LSD1 NA Bos taurus
NP_001300777.1 O62699 NA Canis lupus familiaris
NP_001123373.1 Q4U3W6 NA Ovis aries
NP_999460.1 Q28969 NA Sus scrofa
NP_001300850.1 P29477 NA Mus musculus
NP_990292.2 NA NA Gallus gallus
NP_001290142.1 H6S058 NA Meleagris gallopavo
common.name gene.name
Human NOS2
Zebrafish nos2a
Norway rat Nos2
Cattle NOS3
Dog NOS2
Sheep NOS3
Pig NOS3
House mouse Nos2
Chicken NOS2
Turkey NOS2

Data preparation

All sequences were downloaded using a wrapper compbio4all::entrez_fetch_list() which uses rentrez::entrez_fetch() to access NCBI databases.

nos2_list <- entrez_fetch_list(db = "protein", 
                          id = nos2_table$ncbi.protein.accession, 
                          rettype = "fasta")
length(nos2_list)
## [1] 10
nos2_list[[1]]
## [1] ">NP_000616.3 nitric oxide synthase, inducible [Homo sapiens]\nMACPWKFLFKTKFHQYAMNGEKDINNNVEKAPCATSSPVTQDDLQYHNLSKQQNESPQPLVETGKKSPES\nLVKLDATPLSSPRHVRIKNWGSGMTFQDTLHHKAKGILTCRSKSCLGSIMTPKSLTRGPRDKPTPPDELL\nPQAIEFVNQYYGSFKEAKIEEHLARVEAVTKEIETTGTYQLTGDELIFATKQAWRNAPRCIGRIQWSNLQ\nVFDARSCSTAREMFEHICRHVRYSTNNGNIRSAITVFPQRSDGKHDFRVWNAQLIRYAGYQMPDGSIRGD\nPANVEFTQLCIDLGWKPKYGRFDVVPLVLQANGRDPELFEIPPDLVLEVAMEHPKYEWFRELELKWYALP\nAVANMLLEVGGLEFPGCPFNGWYMGTEIGVRDFCDVQRYNILEEVGRRMGLETHKLASLWKDQAVVEINI\nAVLHSFQKQNVTIMDHHSAAESFMKYMQNEYRSRGGCPADWIWLVPPMSGSITPVFHQEMLNYVLSPFYY\nYQVEAWKTHVWQDEKRRPKRREIPLKVLVKAVLFACMLMRKTMASRVRVTILFATETGKSEALAWDLGAL\nFSCAFNPKVVCMDKYRLSCLEEERLLLVVTSTFGNGDCPGNGEKLKKSLFMLKELNNKFRYAVFGLGSSM\nYPRFCAFAHDIDQKLSHLGASQLTPMGEGDELSGQEDAFRSWAVQTFKAACETFDVRGKQHIQIPKLYTS\nNVTWDPHHYRLVQDSQPLDLSKALSSMHAKNVFTMRLKSRQNLQSPTSSRATILVELSCEDGQGLNYLPG\nEHLGVCPGNQPALVQGILERVVDGPTPHQTVRLEALDESGSYWVSDKRLPPCSLSQALTYFLDITTPPTQ\nLLLQKLAQVATEEPERQRLEALCQPSEYSKWKFTNSPTFLEVLEEFPSLRVSAGFLLSQLPILKPRFYSI\nSSSRDHTPTEIHLTVAVVTYHTRDGQGPLHHGVCSTWLNSLKPQDPVPCFVRNASGFHLPEDPSHPCILI\nGPGTGIAPFRSFWQQRLHDSQHKGVRGGRMTLVFGCRRPDEDHIYQEEMLEMAQKGVLHAVHTAYSRLPG\nKPKVYVQDILRQQLASEVLRVLHKEPGHLYVCGDVRMARDVAHTLKQLVAAKLKLNEEQVEDYFFQLKSQ\nKRYHEDIFGAVFPYEAKKDRVAVQPSSLEMSAL\n\n"
for(i in 1:length(nos2_list)){
  nos2_list[[i]] <- compbio4all::fasta_cleaner(nos2_list[[i]], parse = F)
}

General Protein information Protein diagram

P35228_json  <- drawProteins::get_features("P35228")
## [1] "Download has worked"
my_prot_df <- drawProteins::feature_to_dataframe(P35228_json)

my_canvas <- draw_canvas(my_prot_df)  
my_canvas <- draw_chains(my_canvas, my_prot_df, 
                         label_size = 2.5)
my_canvas <- draw_domains(my_canvas, my_prot_df)
my_canvas

Draw dotplot

nos2_human <- rentrez::entrez_fetch(id = "NP_000616.3",
                                      db = "protein", 
                                      rettype="fasta")
nos2_human_vector <- fasta_cleaner(nos2_human)

par(mfrow = c(2,2), 
    mar = c(0,0,2,1))

dotPlot(nos2_human_vector, 
        nos2_human_vector, 
        wsize = 1, 
        nmatch = 1)

dotPlot(nos2_human_vector, nos2_human_vector, 
        wsize = 10, 
        nmatch = 1)

dotPlot(nos2_human_vector,nos2_human_vector,
        wsize = 10, 
        nmatch = 5)

dotPlot(nos2_human_vector, nos2_human_vector, 
        wsize = 20,
        nmatch =5)

par(mfrow = c(1,1), 
    mar = c(4,4,4,4))

dotPlot(nos2_human_vector, 
        nos2_human_vector,
        wsize = 1, 
        nmatch = 1)

Protein properties compiled from databases

The human homolog is listed in Alphafold (https://alphafold.ebi.ac.uk/entry/P35228). The predicted structure contains alpha helices, beta sheets, and disordered regions.

labels <- c("Major feature from Pfam", "Presense of disordered regions from DisProt", "Presense of tandem repeats from RepeatsDB", "Subcellular location from Uniprot", "Protein secondary structural classifications from PDB")
feature <- c("Flavodoxin_1 domain", "No information available", "No information available", "Cytoplasm and Cytosol", "Alpha and Beta")

URL <- c("http://pfam.xfam.org/protein/P29474#tabview=tab0", "No information available", "No information available", "https://www.uniprot.org/uniprot/P35228#subcellular_location", "https://www.rcsb.org/3d-view/1NSI/1")

df1 <- data.frame(labels, feature, URL)
pander(df1)
Table continues below
labels feature
Major feature from Pfam Flavodoxin_1 domain
Presense of disordered regions from DisProt No information available
Presense of tandem repeats from RepeatsDB No information available
Subcellular location from Uniprot Cytoplasm and Cytosol
Protein secondary structural classifications from PDB Alpha and Beta
URL
http://pfam.xfam.org/protein/P29474#tabview=tab0
No information available
No information available
https://www.uniprot.org/uniprot/P35228#subcellular_location
https://www.rcsb.org/3d-view/1NSI/1

Predict protein fold

Alphafold indicates that there are a mix of alpha helices and beta sheets. I predict that the machine-learning methods will indicate an a+b and a/b structure.

aa.1.1 <- c("A","R","N","D","C","Q","E","G","H","I",
            "L","K","M","F","P","S","T","W","Y","V")
alpha <- c(285, 53, 97, 163, 22, 67, 134, 197, 111, 91, 
           221, 249, 48, 123, 82, 122, 119, 33, 63, 167)
beta <- c(203, 67, 139, 121, 75, 122, 86, 297, 49, 120, 
          177, 115, 16, 85, 127, 341, 253, 44, 110, 229)

a.plus.b <- c(175, 78, 120, 111, 74, 74, 86, 171, 33, 93,
              110, 112, 25, 52, 71, 126, 117, 30, 108, 123)
a.div.b <- c(361, 146, 183, 244, 63, 114, 257, 377, 107, 239, 
             339, 321, 91, 158, 188, 327, 238, 72, 130, 378)
data.frame(aa.1.1, alpha, beta, a.plus.b, a.div.b)
##    aa.1.1 alpha beta a.plus.b a.div.b
## 1       A   285  203      175     361
## 2       R    53   67       78     146
## 3       N    97  139      120     183
## 4       D   163  121      111     244
## 5       C    22   75       74      63
## 6       Q    67  122       74     114
## 7       E   134   86       86     257
## 8       G   197  297      171     377
## 9       H   111   49       33     107
## 10      I    91  120       93     239
## 11      L   221  177      110     339
## 12      K   249  115      112     321
## 13      M    48   16       25      91
## 14      F   123   85       52     158
## 15      P    82  127       71     188
## 16      S   122  341      126     327
## 17      T   119  253      117     238
## 18      W    33   44       30      72
## 19      Y    63  110      108     130
## 20      V   167  229      123     378
alpha.prop <- alpha/sum(alpha)
beta.prop <- beta/sum(beta)
a.plus.b.prop <- a.plus.b/sum(a.plus.b)
a.div.b <- a.div.b/sum(a.div.b)

aa.prop <- data.frame(alpha.prop,
                      beta.prop,
                      a.plus.b.prop,
                      a.div.b)

row.names(aa.prop) <- aa.1.1

aa.prop
##    alpha.prop   beta.prop a.plus.b.prop    a.div.b
## A 0.116469146 0.073126801    0.09264161 0.08331410
## R 0.021659174 0.024135447    0.04129169 0.03369490
## N 0.039640376 0.050072046    0.06352567 0.04223402
## D 0.066612178 0.043587896    0.05876125 0.05631202
## C 0.008990601 0.027017291    0.03917417 0.01453958
## Q 0.027380466 0.043948127    0.03917417 0.02630972
## E 0.054760932 0.030979827    0.04552673 0.05931225
## G 0.080506743 0.106988473    0.09052409 0.08700669
## H 0.045361667 0.017651297    0.01746956 0.02469421
## I 0.037188394 0.043227666    0.04923240 0.05515809
## L 0.090314671 0.063760807    0.05823187 0.07823679
## K 0.101757254 0.041426513    0.05929063 0.07408262
## M 0.019615856 0.005763689    0.01323452 0.02100162
## F 0.050265631 0.030619597    0.02752779 0.03646434
## P 0.033510421 0.045749280    0.03758602 0.04338795
## S 0.049856968 0.122838617    0.06670196 0.07546734
## T 0.048630977 0.091138329    0.06193753 0.05492730
## W 0.013485901 0.015850144    0.01588142 0.01661666
## Y 0.025745811 0.039625360    0.05717311 0.03000231
## V 0.068246833 0.082492795    0.06511382 0.08723748
par(mfrow = c(1,1), mar = c(4,4,4,4))
human_id <- rentrez::entrez_fetch(id = "NP_000616.3",
                                     db = "protein",
                                     rettype = "fasta")

human_id <- compbio4all::fasta_cleaner(human_id, parse = TRUE)

human_id.freq.table <- table(human_id)/length(human_id)


table_to_vector <- function(table_x){
  table_names <- attr(table_x, "dimnames")[[1]]
  table_vect <- as.vector(table_x)
  names(table_vect) <- table_names
  return(table_vect)
}
nos2.human.aa.freq <- table_to_vector(human_id.freq.table)
aa.names <- names(nos2.human.aa.freq)
any(aa.names == "U")
## [1] FALSE
aa.prop$nos2.human.aa.freq <- nos2.human.aa.freq
aa.prop
##    alpha.prop   beta.prop a.plus.b.prop    a.div.b nos2.human.aa.freq
## A 0.116469146 0.073126801    0.09264161 0.08331410         0.06244579
## R 0.021659174 0.024135447    0.04129169 0.03369490         0.02341717
## N 0.039640376 0.050072046    0.06352567 0.04223402         0.04509974
## D 0.066612178 0.043587896    0.05876125 0.05631202         0.06591500
## C 0.008990601 0.027017291    0.03917417 0.01453958         0.04423244
## Q 0.027380466 0.043948127    0.03917417 0.02630972         0.05897658
## E 0.054760932 0.030979827    0.04552673 0.05931225         0.03469211
## G 0.080506743 0.106988473    0.09052409 0.08700669         0.03555941
## H 0.045361667 0.017651297    0.01746956 0.02469421         0.05810928
## I 0.037188394 0.043227666    0.04923240 0.05515809         0.10147441
## L 0.090314671 0.063760807    0.05823187 0.07823679         0.02515178
## K 0.101757254 0.041426513    0.05929063 0.07408262         0.03209020
## M 0.019615856 0.005763689    0.01323452 0.02100162         0.06331310
## F 0.050265631 0.030619597    0.02752779 0.03646434         0.05377277
## P 0.033510421 0.045749280    0.03758602 0.04338795         0.05810928
## S 0.049856968 0.122838617    0.06670196 0.07546734         0.06764961
## T 0.048630977 0.091138329    0.06193753 0.05492730         0.05117086
## W 0.013485901 0.015850144    0.01588142 0.01661666         0.06851691
## Y 0.025745811 0.039625360    0.05717311 0.03000231         0.01821336
## V 0.068246833 0.082492795    0.06511382 0.08723748         0.03209020
chou_cor <- function(x,y){
  numerator <- sum(x*y)
denominator <- sqrt((sum(x^2))*(sum(y^2)))
result <- numerator/denominator
return(result)
}

chou_cosine <- function(z.1, z.2){
  z.1.abs <- sqrt(sum(z.1^2))
  z.2.abs <- sqrt(sum(z.2^2))
  my.cosine <- sum(z.1*z.2)/(z.1.abs*z.2.abs)
  return(my.cosine)
}

par(mfrow = c(2,2), mar = c(1,4,1,1))
plot(alpha.prop ~ nos2.human.aa.freq, data = aa.prop)
plot(beta.prop ~ nos2.human.aa.freq, data = aa.prop)
plot(a.plus.b.prop  ~ nos2.human.aa.freq, data = aa.prop)
plot(a.div.b ~ nos2.human.aa.freq, data = aa.prop)

par(mfrow = c(1,1), mar = c(1,1,1,1))

corr.alpha <- chou_cor(aa.prop[,5], aa.prop[,1])
corr.beta  <- chou_cor(aa.prop[,5], aa.prop[,2])
corr.apb   <- chou_cor(aa.prop[,5], aa.prop[,3])
corr.adb   <- chou_cor(aa.prop[,5], aa.prop[,4])


cos.alpha <- chou_cosine(aa.prop[,5], aa.prop[,1])
cos.beta  <- chou_cosine(aa.prop[,5], aa.prop[,2])
cos.apb   <- chou_cosine(aa.prop[,5], aa.prop[,3])
cos.adb   <- chou_cosine(aa.prop[,5], aa.prop[,4])

aa.prop.flipped <- t(aa.prop)
round(aa.prop.flipped,2)
##                       A    R    N    D    C    Q    E    G    H    I    L    K
## alpha.prop         0.12 0.02 0.04 0.07 0.01 0.03 0.05 0.08 0.05 0.04 0.09 0.10
## beta.prop          0.07 0.02 0.05 0.04 0.03 0.04 0.03 0.11 0.02 0.04 0.06 0.04
## a.plus.b.prop      0.09 0.04 0.06 0.06 0.04 0.04 0.05 0.09 0.02 0.05 0.06 0.06
## a.div.b            0.08 0.03 0.04 0.06 0.01 0.03 0.06 0.09 0.02 0.06 0.08 0.07
## nos2.human.aa.freq 0.06 0.02 0.05 0.07 0.04 0.06 0.03 0.04 0.06 0.10 0.03 0.03
##                       M    F    P    S    T    W    Y    V
## alpha.prop         0.02 0.05 0.03 0.05 0.05 0.01 0.03 0.07
## beta.prop          0.01 0.03 0.05 0.12 0.09 0.02 0.04 0.08
## a.plus.b.prop      0.01 0.03 0.04 0.07 0.06 0.02 0.06 0.07
## a.div.b            0.02 0.04 0.04 0.08 0.05 0.02 0.03 0.09
## nos2.human.aa.freq 0.06 0.05 0.06 0.07 0.05 0.07 0.02 0.03
dist(aa.prop.flipped, method = "euclidean")
##                    alpha.prop  beta.prop a.plus.b.prop    a.div.b
## beta.prop          0.13342098                                    
## a.plus.b.prop      0.09281824 0.08289406                         
## a.div.b            0.06699039 0.08659174    0.06175113           
## nos2.human.aa.freq 0.16808467 0.16522702    0.14320188 0.14812862
dist.alpha <- dist((aa.prop.flipped[c(1,5),]),  method = "euclidean")
dist.beta  <- dist((aa.prop.flipped[c(2,5),]),  method = "euclidean")
dist.apb   <- dist((aa.prop.flipped[c(3,5),]),  method = "euclidean")
dist.adb  <- dist((aa.prop.flipped[c(4,5),]), method = "euclidean")

fold.type <- c("alpha","beta","alpha plus beta", "alpha/beta")

corr.sim <- round(c(corr.alpha,corr.beta,corr.apb,corr.adb),5)
cosine.sim <- round(c(cos.alpha,cos.beta,cos.apb,cos.adb),5)
Euclidean.dist <- round(c(dist.alpha,dist.beta,dist.apb,dist.adb),5)


sim.sum <- c("","","most.sim","")
dist.sum <- c("","","min.dist","")

df <- data.frame(fold.type,
           corr.sim ,
           cosine.sim ,
           Euclidean.dist ,
           sim.sum ,
           dist.sum )


pander::pander(df)
fold.type corr.sim cosine.sim Euclidean.dist sim.sum dist.sum
alpha 0.7748 0.7748 0.1681
beta 0.7863 0.7863 0.1652
alpha plus beta 0.8244 0.8244 0.1432 most.sim min.dist
alpha/beta 0.8161 0.8161 0.1481

Percent Identity Comparisons (PID)

data("BLOSUM50")


human_string = nos2_list[[1]]


zebrafish_string = nos2_list[[2]]


rat_string = nos2_list[[3]]

chimp_string <- entrez_fetch(db = "protein", 
                          id = "XP_016801373", 
                          rettype = "fasta")

chimp_string <- fasta_cleaner(chimp_string,  parse = F)


align_human_chimp <- Biostrings::pairwiseAlignment(pattern = human_string, 
                              subject = chimp_string, 
                              type = "global",
                              gapOpening = -9.5,
                              gapExtension = -0.5)

align_human_zebrafish <- Biostrings::pairwiseAlignment(pattern = human_string, 
                              subject = zebrafish_string, 
                              type = "global",
                              gapOpening = -9.5,
                              gapExtension = -0.5)

align_human_rat <- Biostrings::pairwiseAlignment(pattern = human_string, 
                              subject = rat_string, 
                              type = "global",
                              gapOpening = -9.5,
                              gapExtension = -0.5)

align_chimp_rat <- Biostrings::pairwiseAlignment(pattern = chimp_string, 
                              subject = rat_string, 
                              type = "global",
                              gapOpening = -9.5,
                              gapExtension = -0.5)

align_zebrafish_rat <- Biostrings::pairwiseAlignment(pattern = zebrafish_string, 
                              subject = rat_string, 
                              type = "global",
                              gapOpening = -9.5,
                              gapExtension = -0.5)

align_chimp_zebrafish <- Biostrings::pairwiseAlignment(pattern = chimp_string, 
                              subject = zebrafish_string, 
                              type = "global",
                              gapOpening = -9.5,
                              gapExtension = -0.5)


Biostrings::pid(align_human_chimp)
## [1] 32.64095
Biostrings::pid(align_human_zebrafish)
## [1] 42.55469
Biostrings::pid(align_human_rat)
## [1] 74.97956

PID table

pids <- c(1,                  NA,     NA,     NA,
          pid(align_human_chimp),          1,     NA,     NA,
          pid(align_human_zebrafish), pid(align_chimp_zebrafish),      1,     NA,
          pid(align_human_rat), pid(align_chimp_rat), pid(align_zebrafish_rat), 1)

mat <- matrix(pids, nrow = 4, byrow = T)
row.names(mat) <- c("Human","Chimp","Zebrafish","Rat")   
colnames(mat) <- c("Human","Chimp","Zebrafish","Rat")   
pander::pander(mat)  
  Human Chimp Zebrafish Rat
Human 1 NA NA NA
Chimp 32.64 1 NA NA
Zebrafish 42.55 29.59 1 NA
Rat 74.98 29.3 39.56 1

PID methods comparison

method <- c("PID1", "PID2", "PID3","PID4")

PID <- c(pid(align_human_chimp,type = "PID1"),pid(align_human_chimp,type = "PID2"),pid(align_human_chimp,type = "PID3"),pid(align_human_chimp,type = "PID4") )

denominator <- c("(aligned positions + internal gap positions)", "(aligned positions + internal gap positions)", "(length shorter sequence)", "(average length of the two sequences)")

nos2_df <- data.frame(method = method, PID = PID, denominator = denominator)

pander(nos2_df)
method PID denominator
PID1 32.64 (aligned positions + internal gap positions)
PID2 75.03 (aligned positions + internal gap positions)
PID3 47.7 (length shorter sequence)
PID4 44.73 (average length of the two sequences)

Multiple sequence alignment

MSA data preparation

nos2_vector <- unlist(nos2_list)
names(nos2_vector) <- names(nos2_list)
nos2_vector_ss <- Biostrings::AAStringSet(nos2_vector)
nos2_align <- msa(nos2_vector_ss,
                     method = "ClustalW")
## use default substitution matrix
class(nos2_align) <- "AAMultipleAlignment"
nos2_align_seqinr <- msaConvert(nos2_align, 
                                   type = "seqinr::alignment")


ggmsa(nos2_align, start = 750, end = 800, char_width = 0.5, seq_name = T) 

Phylogenetic trees of sequences

nos2_dist <- seqinr::dist.alignment(nos2_align_seqinr, 
                                       matrix = "identity")
nos2_align_seqinr_rnd <- round(nos2_dist, 3)
tree_subset <- nj(nos2_dist)
plot.phylo(tree_subset, main="Nos2 family gene tree- rooted, no branch lengths",  
            use.edge.length = F)