Final portfolio assignment Nos2 - nitric oxide synthase 2

Introduction

This code compiles summary information about the gene NOS2 (nitric oxide synthase 2).

It also generates pairwise alignments and creates a phylogenetic tree to indicate the evolutionary relationship between the human version of the gene and other homologs from different species.

Preparation

library(BiocManager)

## Bioconductor version '3.13' is out-of-date; the current release version '3.14'
##   is available with R version '4.1'; see https://bioconductor.org/install

library(drawProteins)

library(compbio4all)
library(ggmsa)

## Registered S3 methods overwritten by 'ggalt':
##   method                  from   
##   grid.draw.absoluteGrob  ggplot2
##   grobHeight.absoluteGrob ggplot2
##   grobWidth.absoluteGrob  ggplot2
##   grobX.absoluteGrob      ggplot2
##   grobY.absoluteGrob      ggplot2

library(rentrez)
library(seqinr)
library(ape)

## 
## Attaching package: 'ape'

## The following objects are masked from 'package:seqinr':
## 
##     as.alignment, consensus

library(pander)


library(ggplot2)

library(msa)

## Loading required package: Biostrings

## Loading required package: BiocGenerics

## Loading required package: parallel

## 
## Attaching package: 'BiocGenerics'

## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB

## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs

## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
##     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which.max, which.min

## Loading required package: S4Vectors

## Loading required package: stats4

## 
## Attaching package: 'S4Vectors'

## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname

## Loading required package: IRanges

## 
## Attaching package: 'IRanges'

## The following object is masked from 'package:grDevices':
## 
##     windows

## Loading required package: XVector

## Loading required package: GenomeInfoDb

## 
## Attaching package: 'Biostrings'

## The following object is masked from 'package:ape':
## 
##     complement

## The following object is masked from 'package:seqinr':
## 
##     translate

## The following object is masked from 'package:base':
## 
##     strsplit

## 
## Attaching package: 'msa'

## The following object is masked from 'package:BiocManager':
## 
##     version

library(drawProteins)

library(Biostrings)

Accession numbers

Accession numbers were obtained from RefSeq, Refseq Homlogene, UniProt and PDB. UniProt accession numbers can be found by searching for the gene name. PDB accessions can be found by searching with a UniProt accession or a gene name, though many proteins are not in PDB. The Neanderthal genome database was searched but did not have any information on Nos2. PDB only had information on the human Nos2 gene.

A protein BLAST search (https://blast.ncbi.nlm.nih.gov/Blast.cgi?PROGRAM=blastp&PAGE_TYPE=BlastSearch&LINK_LOC=blasthome) was carried out excluding vertebrates and mammals to determine if it occurred outside of vertebrates and mammals. The gene does not appear in non-vertebrates or non-mammals.

Not available: Neanderthal

Does not occur: Outside of mammals and vertebrates

Accession number table

name = c("Human", "Zebrafish", "Norway rat", "Cattle", "Dog", "Sheep", "Pig", "House mouse", "Chicken", "Turkey")
id = c("NP_000616.3", "NP_001098407.1", "NP_036743.3", " NP_851380.2", "NP_001300777.1", "NP_001123373.1", "NP_999460.1", "NP_001300850.1", "NP_990292.2", "NP_001290142.1")

uni = c("P35228", "A7WK81", "Q06518", "A0A3Q1LSD1", "O62699", "Q4U3W6", "Q28969", "P29477", "NA", "H6S058")

sci_name = c("Homo sapiens", "Danio rerio", "Rattus norvegicus", "Bos taurus", "Canis lupus familiaris", "Ovis aries", "Sus scrofa", "Mus musculus", "Gallus gallus", "Meleagris gallopavo")

gene_name = c("NOS2", "nos2a","Nos2", "NOS3", "NOS2", "NOS3", "NOS3", "Nos2", " NOS2", "NOS2"  )

pdb = c("1NSI", "NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA")

nos2_table <- data.frame(ncbi.protein.accession = id,  UniProt.id= uni, PDB = pdb, species = sci_name, common.name = name, gene.name = gene_name)

pander(nos2_table)

Table continues below
ncbi.protein.accession	UniProt.id	PDB	species
NP_000616.3	P35228	1NSI	Homo sapiens
NP_001098407.1	A7WK81	NA	Danio rerio
NP_036743.3	Q06518	NA	Rattus norvegicus
NP_851380.2	A0A3Q1LSD1	NA	Bos taurus
NP_001300777.1	O62699	NA	Canis lupus familiaris
NP_001123373.1	Q4U3W6	NA	Ovis aries
NP_999460.1	Q28969	NA	Sus scrofa
NP_001300850.1	P29477	NA	Mus musculus
NP_990292.2	NA	NA	Gallus gallus
NP_001290142.1	H6S058	NA	Meleagris gallopavo

common.name	gene.name
Human	NOS2
Zebrafish	nos2a
Norway rat	Nos2
Cattle	NOS3
Dog	NOS2
Sheep	NOS3
Pig	NOS3
House mouse	Nos2
Chicken	NOS2
Turkey	NOS2

Data preparation

All sequences were downloaded using a wrapper compbio4all::entrez_fetch_list() which uses rentrez::entrez_fetch() to access NCBI databases.

nos2_list <- entrez_fetch_list(db = "protein", 
                          id = nos2_table$ncbi.protein.accession, 
                          rettype = "fasta")
length(nos2_list)

## [1] 10

nos2_list[[1]]

## [1] ">NP_000616.3 nitric oxide synthase, inducible [Homo sapiens]\nMACPWKFLFKTKFHQYAMNGEKDINNNVEKAPCATSSPVTQDDLQYHNLSKQQNESPQPLVETGKKSPES\nLVKLDATPLSSPRHVRIKNWGSGMTFQDTLHHKAKGILTCRSKSCLGSIMTPKSLTRGPRDKPTPPDELL\nPQAIEFVNQYYGSFKEAKIEEHLARVEAVTKEIETTGTYQLTGDELIFATKQAWRNAPRCIGRIQWSNLQ\nVFDARSCSTAREMFEHICRHVRYSTNNGNIRSAITVFPQRSDGKHDFRVWNAQLIRYAGYQMPDGSIRGD\nPANVEFTQLCIDLGWKPKYGRFDVVPLVLQANGRDPELFEIPPDLVLEVAMEHPKYEWFRELELKWYALP\nAVANMLLEVGGLEFPGCPFNGWYMGTEIGVRDFCDVQRYNILEEVGRRMGLETHKLASLWKDQAVVEINI\nAVLHSFQKQNVTIMDHHSAAESFMKYMQNEYRSRGGCPADWIWLVPPMSGSITPVFHQEMLNYVLSPFYY\nYQVEAWKTHVWQDEKRRPKRREIPLKVLVKAVLFACMLMRKTMASRVRVTILFATETGKSEALAWDLGAL\nFSCAFNPKVVCMDKYRLSCLEEERLLLVVTSTFGNGDCPGNGEKLKKSLFMLKELNNKFRYAVFGLGSSM\nYPRFCAFAHDIDQKLSHLGASQLTPMGEGDELSGQEDAFRSWAVQTFKAACETFDVRGKQHIQIPKLYTS\nNVTWDPHHYRLVQDSQPLDLSKALSSMHAKNVFTMRLKSRQNLQSPTSSRATILVELSCEDGQGLNYLPG\nEHLGVCPGNQPALVQGILERVVDGPTPHQTVRLEALDESGSYWVSDKRLPPCSLSQALTYFLDITTPPTQ\nLLLQKLAQVATEEPERQRLEALCQPSEYSKWKFTNSPTFLEVLEEFPSLRVSAGFLLSQLPILKPRFYSI\nSSSRDHTPTEIHLTVAVVTYHTRDGQGPLHHGVCSTWLNSLKPQDPVPCFVRNASGFHLPEDPSHPCILI\nGPGTGIAPFRSFWQQRLHDSQHKGVRGGRMTLVFGCRRPDEDHIYQEEMLEMAQKGVLHAVHTAYSRLPG\nKPKVYVQDILRQQLASEVLRVLHKEPGHLYVCGDVRMARDVAHTLKQLVAAKLKLNEEQVEDYFFQLKSQ\nKRYHEDIFGAVFPYEAKKDRVAVQPSSLEMSAL\n\n"

for(i in 1:length(nos2_list)){
  nos2_list[[i]] <- compbio4all::fasta_cleaner(nos2_list[[i]], parse = F)
}

General Protein information Protein diagram

P35228_json  <- drawProteins::get_features("P35228")

## [1] "Download has worked"

my_prot_df <- drawProteins::feature_to_dataframe(P35228_json)

my_canvas <- draw_canvas(my_prot_df)  
my_canvas <- draw_chains(my_canvas, my_prot_df, 
                         label_size = 2.5)
my_canvas <- draw_domains(my_canvas, my_prot_df)
my_canvas

Draw dotplot

nos2_human <- rentrez::entrez_fetch(id = "NP_000616.3",
                                      db = "protein", 
                                      rettype="fasta")
nos2_human_vector <- fasta_cleaner(nos2_human)

par(mfrow = c(2,2), 
    mar = c(0,0,2,1))

dotPlot(nos2_human_vector, 
        nos2_human_vector, 
        wsize = 1, 
        nmatch = 1)

dotPlot(nos2_human_vector, nos2_human_vector, 
        wsize = 10, 
        nmatch = 1)

dotPlot(nos2_human_vector,nos2_human_vector,
        wsize = 10, 
        nmatch = 5)

dotPlot(nos2_human_vector, nos2_human_vector, 
        wsize = 20,
        nmatch =5)

par(mfrow = c(1,1), 
    mar = c(4,4,4,4))

dotPlot(nos2_human_vector, 
        nos2_human_vector,
        wsize = 1, 
        nmatch = 1)

Protein properties compiled from databases

The human homolog is listed in Alphafold (https://alphafold.ebi.ac.uk/entry/P35228). The predicted structure contains alpha helices, beta sheets, and disordered regions.

labels <- c("Major feature from Pfam", "Presense of disordered regions from DisProt", "Presense of tandem repeats from RepeatsDB", "Subcellular location from Uniprot", "Protein secondary structural classifications from PDB")
feature <- c("Flavodoxin_1 domain", "No information available", "No information available", "Cytoplasm and Cytosol", "Alpha and Beta")

URL <- c("http://pfam.xfam.org/protein/P29474#tabview=tab0", "No information available", "No information available", "https://www.uniprot.org/uniprot/P35228#subcellular_location", "https://www.rcsb.org/3d-view/1NSI/1")

df1 <- data.frame(labels, feature, URL)
pander(df1)

Table continues below
labels	feature
Major feature from Pfam	Flavodoxin_1 domain
Presense of disordered regions from DisProt	No information available
Presense of tandem repeats from RepeatsDB	No information available
Subcellular location from Uniprot	Cytoplasm and Cytosol
Protein secondary structural classifications from PDB	Alpha and Beta

URL
http://pfam.xfam.org/protein/P29474#tabview=tab0
No information available
No information available
https://www.uniprot.org/uniprot/P35228#subcellular_location
https://www.rcsb.org/3d-view/1NSI/1

Predict protein fold

Alphafold indicates that there are a mix of alpha helices and beta sheets. I predict that the machine-learning methods will indicate an a+b and a/b structure.

aa.1.1 <- c("A","R","N","D","C","Q","E","G","H","I",
            "L","K","M","F","P","S","T","W","Y","V")
alpha <- c(285, 53, 97, 163, 22, 67, 134, 197, 111, 91, 
           221, 249, 48, 123, 82, 122, 119, 33, 63, 167)
beta <- c(203, 67, 139, 121, 75, 122, 86, 297, 49, 120, 
          177, 115, 16, 85, 127, 341, 253, 44, 110, 229)

a.plus.b <- c(175, 78, 120, 111, 74, 74, 86, 171, 33, 93,
              110, 112, 25, 52, 71, 126, 117, 30, 108, 123)
a.div.b <- c(361, 146, 183, 244, 63, 114, 257, 377, 107, 239, 
             339, 321, 91, 158, 188, 327, 238, 72, 130, 378)
data.frame(aa.1.1, alpha, beta, a.plus.b, a.div.b)

##    aa.1.1 alpha beta a.plus.b a.div.b
## 1       A   285  203      175     361
## 2       R    53   67       78     146
## 3       N    97  139      120     183
## 4       D   163  121      111     244
## 5       C    22   75       74      63
## 6       Q    67  122       74     114
## 7       E   134   86       86     257
## 8       G   197  297      171     377
## 9       H   111   49       33     107
## 10      I    91  120       93     239
## 11      L   221  177      110     339
## 12      K   249  115      112     321
## 13      M    48   16       25      91
## 14      F   123   85       52     158
## 15      P    82  127       71     188
## 16      S   122  341      126     327
## 17      T   119  253      117     238
## 18      W    33   44       30      72
## 19      Y    63  110      108     130
## 20      V   167  229      123     378

alpha.prop <- alpha/sum(alpha)
beta.prop <- beta/sum(beta)
a.plus.b.prop <- a.plus.b/sum(a.plus.b)
a.div.b <- a.div.b/sum(a.div.b)

aa.prop <- data.frame(alpha.prop,
                      beta.prop,
                      a.plus.b.prop,
                      a.div.b)

row.names(aa.prop) <- aa.1.1

aa.prop

##    alpha.prop   beta.prop a.plus.b.prop    a.div.b
## A 0.116469146 0.073126801    0.09264161 0.08331410
## R 0.021659174 0.024135447    0.04129169 0.03369490
## N 0.039640376 0.050072046    0.06352567 0.04223402
## D 0.066612178 0.043587896    0.05876125 0.05631202
## C 0.008990601 0.027017291    0.03917417 0.01453958
## Q 0.027380466 0.043948127    0.03917417 0.02630972
## E 0.054760932 0.030979827    0.04552673 0.05931225
## G 0.080506743 0.106988473    0.09052409 0.08700669
## H 0.045361667 0.017651297    0.01746956 0.02469421
## I 0.037188394 0.043227666    0.04923240 0.05515809
## L 0.090314671 0.063760807    0.05823187 0.07823679
## K 0.101757254 0.041426513    0.05929063 0.07408262
## M 0.019615856 0.005763689    0.01323452 0.02100162
## F 0.050265631 0.030619597    0.02752779 0.03646434
## P 0.033510421 0.045749280    0.03758602 0.04338795
## S 0.049856968 0.122838617    0.06670196 0.07546734
## T 0.048630977 0.091138329    0.06193753 0.05492730
## W 0.013485901 0.015850144    0.01588142 0.01661666
## Y 0.025745811 0.039625360    0.05717311 0.03000231
## V 0.068246833 0.082492795    0.06511382 0.08723748

par(mfrow = c(1,1), mar = c(4,4,4,4))
human_id <- rentrez::entrez_fetch(id = "NP_000616.3",
                                     db = "protein",
                                     rettype = "fasta")

human_id <- compbio4all::fasta_cleaner(human_id, parse = TRUE)

human_id.freq.table <- table(human_id)/length(human_id)


table_to_vector <- function(table_x){
  table_names <- attr(table_x, "dimnames")[[1]]
  table_vect <- as.vector(table_x)
  names(table_vect) <- table_names
  return(table_vect)
}
nos2.human.aa.freq <- table_to_vector(human_id.freq.table)
aa.names <- names(nos2.human.aa.freq)
any(aa.names == "U")

## [1] FALSE

aa.prop$nos2.human.aa.freq <- nos2.human.aa.freq
aa.prop

##    alpha.prop   beta.prop a.plus.b.prop    a.div.b nos2.human.aa.freq
## A 0.116469146 0.073126801    0.09264161 0.08331410         0.06244579
## R 0.021659174 0.024135447    0.04129169 0.03369490         0.02341717
## N 0.039640376 0.050072046    0.06352567 0.04223402         0.04509974
## D 0.066612178 0.043587896    0.05876125 0.05631202         0.06591500
## C 0.008990601 0.027017291    0.03917417 0.01453958         0.04423244
## Q 0.027380466 0.043948127    0.03917417 0.02630972         0.05897658
## E 0.054760932 0.030979827    0.04552673 0.05931225         0.03469211
## G 0.080506743 0.106988473    0.09052409 0.08700669         0.03555941
## H 0.045361667 0.017651297    0.01746956 0.02469421         0.05810928
## I 0.037188394 0.043227666    0.04923240 0.05515809         0.10147441
## L 0.090314671 0.063760807    0.05823187 0.07823679         0.02515178
## K 0.101757254 0.041426513    0.05929063 0.07408262         0.03209020
## M 0.019615856 0.005763689    0.01323452 0.02100162         0.06331310
## F 0.050265631 0.030619597    0.02752779 0.03646434         0.05377277
## P 0.033510421 0.045749280    0.03758602 0.04338795         0.05810928
## S 0.049856968 0.122838617    0.06670196 0.07546734         0.06764961
## T 0.048630977 0.091138329    0.06193753 0.05492730         0.05117086
## W 0.013485901 0.015850144    0.01588142 0.01661666         0.06851691
## Y 0.025745811 0.039625360    0.05717311 0.03000231         0.01821336
## V 0.068246833 0.082492795    0.06511382 0.08723748         0.03209020

chou_cor <- function(x,y){
  numerator <- sum(x*y)
denominator <- sqrt((sum(x^2))*(sum(y^2)))
result <- numerator/denominator
return(result)
}

chou_cosine <- function(z.1, z.2){
  z.1.abs <- sqrt(sum(z.1^2))
  z.2.abs <- sqrt(sum(z.2^2))
  my.cosine <- sum(z.1*z.2)/(z.1.abs*z.2.abs)
  return(my.cosine)
}

par(mfrow = c(2,2), mar = c(1,4,1,1))
plot(alpha.prop ~ nos2.human.aa.freq, data = aa.prop)
plot(beta.prop ~ nos2.human.aa.freq, data = aa.prop)
plot(a.plus.b.prop  ~ nos2.human.aa.freq, data = aa.prop)
plot(a.div.b ~ nos2.human.aa.freq, data = aa.prop)

par(mfrow = c(1,1), mar = c(1,1,1,1))

corr.alpha <- chou_cor(aa.prop[,5], aa.prop[,1])
corr.beta  <- chou_cor(aa.prop[,5], aa.prop[,2])
corr.apb   <- chou_cor(aa.prop[,5], aa.prop[,3])
corr.adb   <- chou_cor(aa.prop[,5], aa.prop[,4])


cos.alpha <- chou_cosine(aa.prop[,5], aa.prop[,1])
cos.beta  <- chou_cosine(aa.prop[,5], aa.prop[,2])
cos.apb   <- chou_cosine(aa.prop[,5], aa.prop[,3])
cos.adb   <- chou_cosine(aa.prop[,5], aa.prop[,4])

aa.prop.flipped <- t(aa.prop)
round(aa.prop.flipped,2)

##                       A    R    N    D    C    Q    E    G    H    I    L    K
## alpha.prop         0.12 0.02 0.04 0.07 0.01 0.03 0.05 0.08 0.05 0.04 0.09 0.10
## beta.prop          0.07 0.02 0.05 0.04 0.03 0.04 0.03 0.11 0.02 0.04 0.06 0.04
## a.plus.b.prop      0.09 0.04 0.06 0.06 0.04 0.04 0.05 0.09 0.02 0.05 0.06 0.06
## a.div.b            0.08 0.03 0.04 0.06 0.01 0.03 0.06 0.09 0.02 0.06 0.08 0.07
## nos2.human.aa.freq 0.06 0.02 0.05 0.07 0.04 0.06 0.03 0.04 0.06 0.10 0.03 0.03
##                       M    F    P    S    T    W    Y    V
## alpha.prop         0.02 0.05 0.03 0.05 0.05 0.01 0.03 0.07
## beta.prop          0.01 0.03 0.05 0.12 0.09 0.02 0.04 0.08
## a.plus.b.prop      0.01 0.03 0.04 0.07 0.06 0.02 0.06 0.07
## a.div.b            0.02 0.04 0.04 0.08 0.05 0.02 0.03 0.09
## nos2.human.aa.freq 0.06 0.05 0.06 0.07 0.05 0.07 0.02 0.03

dist(aa.prop.flipped, method = "euclidean")

##                    alpha.prop  beta.prop a.plus.b.prop    a.div.b
## beta.prop          0.13342098                                    
## a.plus.b.prop      0.09281824 0.08289406                         
## a.div.b            0.06699039 0.08659174    0.06175113           
## nos2.human.aa.freq 0.16808467 0.16522702    0.14320188 0.14812862

dist.alpha <- dist((aa.prop.flipped[c(1,5),]),  method = "euclidean")
dist.beta  <- dist((aa.prop.flipped[c(2,5),]),  method = "euclidean")
dist.apb   <- dist((aa.prop.flipped[c(3,5),]),  method = "euclidean")
dist.adb  <- dist((aa.prop.flipped[c(4,5),]), method = "euclidean")

fold.type <- c("alpha","beta","alpha plus beta", "alpha/beta")

corr.sim <- round(c(corr.alpha,corr.beta,corr.apb,corr.adb),5)
cosine.sim <- round(c(cos.alpha,cos.beta,cos.apb,cos.adb),5)
Euclidean.dist <- round(c(dist.alpha,dist.beta,dist.apb,dist.adb),5)


sim.sum <- c("","","most.sim","")
dist.sum <- c("","","min.dist","")

df <- data.frame(fold.type,
           corr.sim ,
           cosine.sim ,
           Euclidean.dist ,
           sim.sum ,
           dist.sum )


pander::pander(df)

fold.type	corr.sim	cosine.sim	Euclidean.dist	sim.sum	dist.sum
alpha	0.7748	0.7748	0.1681
beta	0.7863	0.7863	0.1652
alpha plus beta	0.8244	0.8244	0.1432	most.sim	min.dist
alpha/beta	0.8161	0.8161	0.1481

Percent Identity Comparisons (PID)

data("BLOSUM50")


human_string = nos2_list[[1]]


zebrafish_string = nos2_list[[2]]


rat_string = nos2_list[[3]]

chimp_string <- entrez_fetch(db = "protein", 
                          id = "XP_016801373", 
                          rettype = "fasta")

chimp_string <- fasta_cleaner(chimp_string,  parse = F)


align_human_chimp <- Biostrings::pairwiseAlignment(pattern = human_string, 
                              subject = chimp_string, 
                              type = "global",
                              gapOpening = -9.5,
                              gapExtension = -0.5)

align_human_zebrafish <- Biostrings::pairwiseAlignment(pattern = human_string, 
                              subject = zebrafish_string, 
                              type = "global",
                              gapOpening = -9.5,
                              gapExtension = -0.5)

align_human_rat <- Biostrings::pairwiseAlignment(pattern = human_string, 
                              subject = rat_string, 
                              type = "global",
                              gapOpening = -9.5,
                              gapExtension = -0.5)

align_chimp_rat <- Biostrings::pairwiseAlignment(pattern = chimp_string, 
                              subject = rat_string, 
                              type = "global",
                              gapOpening = -9.5,
                              gapExtension = -0.5)

align_zebrafish_rat <- Biostrings::pairwiseAlignment(pattern = zebrafish_string, 
                              subject = rat_string, 
                              type = "global",
                              gapOpening = -9.5,
                              gapExtension = -0.5)

align_chimp_zebrafish <- Biostrings::pairwiseAlignment(pattern = chimp_string, 
                              subject = zebrafish_string, 
                              type = "global",
                              gapOpening = -9.5,
                              gapExtension = -0.5)


Biostrings::pid(align_human_chimp)

## [1] 32.64095

Biostrings::pid(align_human_zebrafish)

## [1] 42.55469

Biostrings::pid(align_human_rat)

## [1] 74.97956

PID table

pids <- c(1,                  NA,     NA,     NA,
          pid(align_human_chimp),          1,     NA,     NA,
          pid(align_human_zebrafish), pid(align_chimp_zebrafish),      1,     NA,
          pid(align_human_rat), pid(align_chimp_rat), pid(align_zebrafish_rat), 1)

mat <- matrix(pids, nrow = 4, byrow = T)
row.names(mat) <- c("Human","Chimp","Zebrafish","Rat")   
colnames(mat) <- c("Human","Chimp","Zebrafish","Rat")   
pander::pander(mat)

	Human	Chimp	Zebrafish	Rat
Human	1	NA	NA	NA
Chimp	32.64	1	NA	NA
Zebrafish	42.55	29.59	1	NA
Rat	74.98	29.3	39.56	1

PID methods comparison

method <- c("PID1", "PID2", "PID3","PID4")

PID <- c(pid(align_human_chimp,type = "PID1"),pid(align_human_chimp,type = "PID2"),pid(align_human_chimp,type = "PID3"),pid(align_human_chimp,type = "PID4") )

denominator <- c("(aligned positions + internal gap positions)", "(aligned positions + internal gap positions)", "(length shorter sequence)", "(average length of the two sequences)")

nos2_df <- data.frame(method = method, PID = PID, denominator = denominator)

pander(nos2_df)

method	PID	denominator
PID1	32.64	(aligned positions + internal gap positions)
PID2	75.03	(aligned positions + internal gap positions)
PID3	47.7	(length shorter sequence)
PID4	44.73	(average length of the two sequences)

Multiple sequence alignment

MSA data preparation

nos2_vector <- unlist(nos2_list)
names(nos2_vector) <- names(nos2_list)
nos2_vector_ss <- Biostrings::AAStringSet(nos2_vector)
nos2_align <- msa(nos2_vector_ss,
                     method = "ClustalW")

## use default substitution matrix

class(nos2_align) <- "AAMultipleAlignment"
nos2_align_seqinr <- msaConvert(nos2_align, 
                                   type = "seqinr::alignment")


ggmsa(nos2_align, start = 750, end = 800, char_width = 0.5, seq_name = T)

Phylogenetic trees of sequences

nos2_dist <- seqinr::dist.alignment(nos2_align_seqinr, 
                                       matrix = "identity")
nos2_align_seqinr_rnd <- round(nos2_dist, 3)
tree_subset <- nj(nos2_dist)
plot.phylo(tree_subset, main="Nos2 family gene tree- rooted, no branch lengths",  
            use.edge.length = F)

Final portfolio assignment Nos2 - nitric oxide synthase 2

Niti Negi

12/9/2021