Introduction
This code compiles summary information about the gene NOS2 (nitric oxide synthase 2).
It also generates pairwise alignments and creates a phylogenetic tree to indicate the evolutionary relationship between the human version of the gene and other homologs from different species.
Preparation
library(BiocManager)
## Bioconductor version '3.13' is out-of-date; the current release version '3.14'
## is available with R version '4.1'; see https://bioconductor.org/install
library(drawProteins)
library(compbio4all)
library(ggmsa)
## Registered S3 methods overwritten by 'ggalt':
## method from
## grid.draw.absoluteGrob ggplot2
## grobHeight.absoluteGrob ggplot2
## grobWidth.absoluteGrob ggplot2
## grobX.absoluteGrob ggplot2
## grobY.absoluteGrob ggplot2
library(rentrez)
library(seqinr)
library(ape)
##
## Attaching package: 'ape'
## The following objects are masked from 'package:seqinr':
##
## as.alignment, consensus
library(pander)
library(ggplot2)
library(msa)
## Loading required package: Biostrings
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind, colnames,
## dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
## union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
## Loading required package: stats4
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
## Loading required package: IRanges
##
## Attaching package: 'IRanges'
## The following object is masked from 'package:grDevices':
##
## windows
## Loading required package: XVector
## Loading required package: GenomeInfoDb
##
## Attaching package: 'Biostrings'
## The following object is masked from 'package:ape':
##
## complement
## The following object is masked from 'package:seqinr':
##
## translate
## The following object is masked from 'package:base':
##
## strsplit
##
## Attaching package: 'msa'
## The following object is masked from 'package:BiocManager':
##
## version
library(drawProteins)
library(Biostrings)
Accession numbers
Accession numbers were obtained from RefSeq, Refseq Homlogene, UniProt and PDB. UniProt accession numbers can be found by searching for the gene name. PDB accessions can be found by searching with a UniProt accession or a gene name, though many proteins are not in PDB. The Neanderthal genome database was searched but did not have any information on Nos2. PDB only had information on the human Nos2 gene.
A protein BLAST search (https://blast.ncbi.nlm.nih.gov/Blast.cgi?PROGRAM=blastp&PAGE_TYPE=BlastSearch&LINK_LOC=blasthome) was carried out excluding vertebrates and mammals to determine if it occurred outside of vertebrates and mammals. The gene does not appear in non-vertebrates or non-mammals.
Not available: Neanderthal
Does not occur: Outside of mammals and vertebrates
Accession number table
name = c("Human", "Zebrafish", "Norway rat", "Cattle", "Dog", "Sheep", "Pig", "House mouse", "Chicken", "Turkey")
id = c("NP_000616.3", "NP_001098407.1", "NP_036743.3", " NP_851380.2", "NP_001300777.1", "NP_001123373.1", "NP_999460.1", "NP_001300850.1", "NP_990292.2", "NP_001290142.1")
uni = c("P35228", "A7WK81", "Q06518", "A0A3Q1LSD1", "O62699", "Q4U3W6", "Q28969", "P29477", "NA", "H6S058")
sci_name = c("Homo sapiens", "Danio rerio", "Rattus norvegicus", "Bos taurus", "Canis lupus familiaris", "Ovis aries", "Sus scrofa", "Mus musculus", "Gallus gallus", "Meleagris gallopavo")
gene_name = c("NOS2", "nos2a","Nos2", "NOS3", "NOS2", "NOS3", "NOS3", "Nos2", " NOS2", "NOS2" )
pdb = c("1NSI", "NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA")
nos2_table <- data.frame(ncbi.protein.accession = id, UniProt.id= uni, PDB = pdb, species = sci_name, common.name = name, gene.name = gene_name)
pander(nos2_table)
| ncbi.protein.accession | UniProt.id | PDB | species |
|---|---|---|---|
| NP_000616.3 | P35228 | 1NSI | Homo sapiens |
| NP_001098407.1 | A7WK81 | NA | Danio rerio |
| NP_036743.3 | Q06518 | NA | Rattus norvegicus |
| NP_851380.2 | A0A3Q1LSD1 | NA | Bos taurus |
| NP_001300777.1 | O62699 | NA | Canis lupus familiaris |
| NP_001123373.1 | Q4U3W6 | NA | Ovis aries |
| NP_999460.1 | Q28969 | NA | Sus scrofa |
| NP_001300850.1 | P29477 | NA | Mus musculus |
| NP_990292.2 | NA | NA | Gallus gallus |
| NP_001290142.1 | H6S058 | NA | Meleagris gallopavo |
| common.name | gene.name |
|---|---|
| Human | NOS2 |
| Zebrafish | nos2a |
| Norway rat | Nos2 |
| Cattle | NOS3 |
| Dog | NOS2 |
| Sheep | NOS3 |
| Pig | NOS3 |
| House mouse | Nos2 |
| Chicken | NOS2 |
| Turkey | NOS2 |
Data preparation
All sequences were downloaded using a wrapper compbio4all::entrez_fetch_list() which uses rentrez::entrez_fetch() to access NCBI databases.
nos2_list <- entrez_fetch_list(db = "protein",
id = nos2_table$ncbi.protein.accession,
rettype = "fasta")
length(nos2_list)
## [1] 10
nos2_list[[1]]
## [1] ">NP_000616.3 nitric oxide synthase, inducible [Homo sapiens]\nMACPWKFLFKTKFHQYAMNGEKDINNNVEKAPCATSSPVTQDDLQYHNLSKQQNESPQPLVETGKKSPES\nLVKLDATPLSSPRHVRIKNWGSGMTFQDTLHHKAKGILTCRSKSCLGSIMTPKSLTRGPRDKPTPPDELL\nPQAIEFVNQYYGSFKEAKIEEHLARVEAVTKEIETTGTYQLTGDELIFATKQAWRNAPRCIGRIQWSNLQ\nVFDARSCSTAREMFEHICRHVRYSTNNGNIRSAITVFPQRSDGKHDFRVWNAQLIRYAGYQMPDGSIRGD\nPANVEFTQLCIDLGWKPKYGRFDVVPLVLQANGRDPELFEIPPDLVLEVAMEHPKYEWFRELELKWYALP\nAVANMLLEVGGLEFPGCPFNGWYMGTEIGVRDFCDVQRYNILEEVGRRMGLETHKLASLWKDQAVVEINI\nAVLHSFQKQNVTIMDHHSAAESFMKYMQNEYRSRGGCPADWIWLVPPMSGSITPVFHQEMLNYVLSPFYY\nYQVEAWKTHVWQDEKRRPKRREIPLKVLVKAVLFACMLMRKTMASRVRVTILFATETGKSEALAWDLGAL\nFSCAFNPKVVCMDKYRLSCLEEERLLLVVTSTFGNGDCPGNGEKLKKSLFMLKELNNKFRYAVFGLGSSM\nYPRFCAFAHDIDQKLSHLGASQLTPMGEGDELSGQEDAFRSWAVQTFKAACETFDVRGKQHIQIPKLYTS\nNVTWDPHHYRLVQDSQPLDLSKALSSMHAKNVFTMRLKSRQNLQSPTSSRATILVELSCEDGQGLNYLPG\nEHLGVCPGNQPALVQGILERVVDGPTPHQTVRLEALDESGSYWVSDKRLPPCSLSQALTYFLDITTPPTQ\nLLLQKLAQVATEEPERQRLEALCQPSEYSKWKFTNSPTFLEVLEEFPSLRVSAGFLLSQLPILKPRFYSI\nSSSRDHTPTEIHLTVAVVTYHTRDGQGPLHHGVCSTWLNSLKPQDPVPCFVRNASGFHLPEDPSHPCILI\nGPGTGIAPFRSFWQQRLHDSQHKGVRGGRMTLVFGCRRPDEDHIYQEEMLEMAQKGVLHAVHTAYSRLPG\nKPKVYVQDILRQQLASEVLRVLHKEPGHLYVCGDVRMARDVAHTLKQLVAAKLKLNEEQVEDYFFQLKSQ\nKRYHEDIFGAVFPYEAKKDRVAVQPSSLEMSAL\n\n"
for(i in 1:length(nos2_list)){
nos2_list[[i]] <- compbio4all::fasta_cleaner(nos2_list[[i]], parse = F)
}
General Protein information Protein diagram
P35228_json <- drawProteins::get_features("P35228")
## [1] "Download has worked"
my_prot_df <- drawProteins::feature_to_dataframe(P35228_json)
my_canvas <- draw_canvas(my_prot_df)
my_canvas <- draw_chains(my_canvas, my_prot_df,
label_size = 2.5)
my_canvas <- draw_domains(my_canvas, my_prot_df)
my_canvas
Draw dotplot
nos2_human <- rentrez::entrez_fetch(id = "NP_000616.3",
db = "protein",
rettype="fasta")
nos2_human_vector <- fasta_cleaner(nos2_human)
par(mfrow = c(2,2),
mar = c(0,0,2,1))
dotPlot(nos2_human_vector,
nos2_human_vector,
wsize = 1,
nmatch = 1)
dotPlot(nos2_human_vector, nos2_human_vector,
wsize = 10,
nmatch = 1)
dotPlot(nos2_human_vector,nos2_human_vector,
wsize = 10,
nmatch = 5)
dotPlot(nos2_human_vector, nos2_human_vector,
wsize = 20,
nmatch =5)
par(mfrow = c(1,1),
mar = c(4,4,4,4))
dotPlot(nos2_human_vector,
nos2_human_vector,
wsize = 1,
nmatch = 1)
Protein properties compiled from databases
The human homolog is listed in Alphafold (https://alphafold.ebi.ac.uk/entry/P35228). The predicted structure contains alpha helices, beta sheets, and disordered regions.
labels <- c("Major feature from Pfam", "Presense of disordered regions from DisProt", "Presense of tandem repeats from RepeatsDB", "Subcellular location from Uniprot", "Protein secondary structural classifications from PDB")
feature <- c("Flavodoxin_1 domain", "No information available", "No information available", "Cytoplasm and Cytosol", "Alpha and Beta")
URL <- c("http://pfam.xfam.org/protein/P29474#tabview=tab0", "No information available", "No information available", "https://www.uniprot.org/uniprot/P35228#subcellular_location", "https://www.rcsb.org/3d-view/1NSI/1")
df1 <- data.frame(labels, feature, URL)
pander(df1)
| labels | feature |
|---|---|
| Major feature from Pfam | Flavodoxin_1 domain |
| Presense of disordered regions from DisProt | No information available |
| Presense of tandem repeats from RepeatsDB | No information available |
| Subcellular location from Uniprot | Cytoplasm and Cytosol |
| Protein secondary structural classifications from PDB | Alpha and Beta |
| URL |
|---|
| http://pfam.xfam.org/protein/P29474#tabview=tab0 |
| No information available |
| No information available |
| https://www.uniprot.org/uniprot/P35228#subcellular_location |
| https://www.rcsb.org/3d-view/1NSI/1 |
Predict protein fold
Alphafold indicates that there are a mix of alpha helices and beta sheets. I predict that the machine-learning methods will indicate an a+b and a/b structure.
aa.1.1 <- c("A","R","N","D","C","Q","E","G","H","I",
"L","K","M","F","P","S","T","W","Y","V")
alpha <- c(285, 53, 97, 163, 22, 67, 134, 197, 111, 91,
221, 249, 48, 123, 82, 122, 119, 33, 63, 167)
beta <- c(203, 67, 139, 121, 75, 122, 86, 297, 49, 120,
177, 115, 16, 85, 127, 341, 253, 44, 110, 229)
a.plus.b <- c(175, 78, 120, 111, 74, 74, 86, 171, 33, 93,
110, 112, 25, 52, 71, 126, 117, 30, 108, 123)
a.div.b <- c(361, 146, 183, 244, 63, 114, 257, 377, 107, 239,
339, 321, 91, 158, 188, 327, 238, 72, 130, 378)
data.frame(aa.1.1, alpha, beta, a.plus.b, a.div.b)
## aa.1.1 alpha beta a.plus.b a.div.b
## 1 A 285 203 175 361
## 2 R 53 67 78 146
## 3 N 97 139 120 183
## 4 D 163 121 111 244
## 5 C 22 75 74 63
## 6 Q 67 122 74 114
## 7 E 134 86 86 257
## 8 G 197 297 171 377
## 9 H 111 49 33 107
## 10 I 91 120 93 239
## 11 L 221 177 110 339
## 12 K 249 115 112 321
## 13 M 48 16 25 91
## 14 F 123 85 52 158
## 15 P 82 127 71 188
## 16 S 122 341 126 327
## 17 T 119 253 117 238
## 18 W 33 44 30 72
## 19 Y 63 110 108 130
## 20 V 167 229 123 378
alpha.prop <- alpha/sum(alpha)
beta.prop <- beta/sum(beta)
a.plus.b.prop <- a.plus.b/sum(a.plus.b)
a.div.b <- a.div.b/sum(a.div.b)
aa.prop <- data.frame(alpha.prop,
beta.prop,
a.plus.b.prop,
a.div.b)
row.names(aa.prop) <- aa.1.1
aa.prop
## alpha.prop beta.prop a.plus.b.prop a.div.b
## A 0.116469146 0.073126801 0.09264161 0.08331410
## R 0.021659174 0.024135447 0.04129169 0.03369490
## N 0.039640376 0.050072046 0.06352567 0.04223402
## D 0.066612178 0.043587896 0.05876125 0.05631202
## C 0.008990601 0.027017291 0.03917417 0.01453958
## Q 0.027380466 0.043948127 0.03917417 0.02630972
## E 0.054760932 0.030979827 0.04552673 0.05931225
## G 0.080506743 0.106988473 0.09052409 0.08700669
## H 0.045361667 0.017651297 0.01746956 0.02469421
## I 0.037188394 0.043227666 0.04923240 0.05515809
## L 0.090314671 0.063760807 0.05823187 0.07823679
## K 0.101757254 0.041426513 0.05929063 0.07408262
## M 0.019615856 0.005763689 0.01323452 0.02100162
## F 0.050265631 0.030619597 0.02752779 0.03646434
## P 0.033510421 0.045749280 0.03758602 0.04338795
## S 0.049856968 0.122838617 0.06670196 0.07546734
## T 0.048630977 0.091138329 0.06193753 0.05492730
## W 0.013485901 0.015850144 0.01588142 0.01661666
## Y 0.025745811 0.039625360 0.05717311 0.03000231
## V 0.068246833 0.082492795 0.06511382 0.08723748
par(mfrow = c(1,1), mar = c(4,4,4,4))
human_id <- rentrez::entrez_fetch(id = "NP_000616.3",
db = "protein",
rettype = "fasta")
human_id <- compbio4all::fasta_cleaner(human_id, parse = TRUE)
human_id.freq.table <- table(human_id)/length(human_id)
table_to_vector <- function(table_x){
table_names <- attr(table_x, "dimnames")[[1]]
table_vect <- as.vector(table_x)
names(table_vect) <- table_names
return(table_vect)
}
nos2.human.aa.freq <- table_to_vector(human_id.freq.table)
aa.names <- names(nos2.human.aa.freq)
any(aa.names == "U")
## [1] FALSE
aa.prop$nos2.human.aa.freq <- nos2.human.aa.freq
aa.prop
## alpha.prop beta.prop a.plus.b.prop a.div.b nos2.human.aa.freq
## A 0.116469146 0.073126801 0.09264161 0.08331410 0.06244579
## R 0.021659174 0.024135447 0.04129169 0.03369490 0.02341717
## N 0.039640376 0.050072046 0.06352567 0.04223402 0.04509974
## D 0.066612178 0.043587896 0.05876125 0.05631202 0.06591500
## C 0.008990601 0.027017291 0.03917417 0.01453958 0.04423244
## Q 0.027380466 0.043948127 0.03917417 0.02630972 0.05897658
## E 0.054760932 0.030979827 0.04552673 0.05931225 0.03469211
## G 0.080506743 0.106988473 0.09052409 0.08700669 0.03555941
## H 0.045361667 0.017651297 0.01746956 0.02469421 0.05810928
## I 0.037188394 0.043227666 0.04923240 0.05515809 0.10147441
## L 0.090314671 0.063760807 0.05823187 0.07823679 0.02515178
## K 0.101757254 0.041426513 0.05929063 0.07408262 0.03209020
## M 0.019615856 0.005763689 0.01323452 0.02100162 0.06331310
## F 0.050265631 0.030619597 0.02752779 0.03646434 0.05377277
## P 0.033510421 0.045749280 0.03758602 0.04338795 0.05810928
## S 0.049856968 0.122838617 0.06670196 0.07546734 0.06764961
## T 0.048630977 0.091138329 0.06193753 0.05492730 0.05117086
## W 0.013485901 0.015850144 0.01588142 0.01661666 0.06851691
## Y 0.025745811 0.039625360 0.05717311 0.03000231 0.01821336
## V 0.068246833 0.082492795 0.06511382 0.08723748 0.03209020
chou_cor <- function(x,y){
numerator <- sum(x*y)
denominator <- sqrt((sum(x^2))*(sum(y^2)))
result <- numerator/denominator
return(result)
}
chou_cosine <- function(z.1, z.2){
z.1.abs <- sqrt(sum(z.1^2))
z.2.abs <- sqrt(sum(z.2^2))
my.cosine <- sum(z.1*z.2)/(z.1.abs*z.2.abs)
return(my.cosine)
}
par(mfrow = c(2,2), mar = c(1,4,1,1))
plot(alpha.prop ~ nos2.human.aa.freq, data = aa.prop)
plot(beta.prop ~ nos2.human.aa.freq, data = aa.prop)
plot(a.plus.b.prop ~ nos2.human.aa.freq, data = aa.prop)
plot(a.div.b ~ nos2.human.aa.freq, data = aa.prop)
par(mfrow = c(1,1), mar = c(1,1,1,1))
corr.alpha <- chou_cor(aa.prop[,5], aa.prop[,1])
corr.beta <- chou_cor(aa.prop[,5], aa.prop[,2])
corr.apb <- chou_cor(aa.prop[,5], aa.prop[,3])
corr.adb <- chou_cor(aa.prop[,5], aa.prop[,4])
cos.alpha <- chou_cosine(aa.prop[,5], aa.prop[,1])
cos.beta <- chou_cosine(aa.prop[,5], aa.prop[,2])
cos.apb <- chou_cosine(aa.prop[,5], aa.prop[,3])
cos.adb <- chou_cosine(aa.prop[,5], aa.prop[,4])
aa.prop.flipped <- t(aa.prop)
round(aa.prop.flipped,2)
## A R N D C Q E G H I L K
## alpha.prop 0.12 0.02 0.04 0.07 0.01 0.03 0.05 0.08 0.05 0.04 0.09 0.10
## beta.prop 0.07 0.02 0.05 0.04 0.03 0.04 0.03 0.11 0.02 0.04 0.06 0.04
## a.plus.b.prop 0.09 0.04 0.06 0.06 0.04 0.04 0.05 0.09 0.02 0.05 0.06 0.06
## a.div.b 0.08 0.03 0.04 0.06 0.01 0.03 0.06 0.09 0.02 0.06 0.08 0.07
## nos2.human.aa.freq 0.06 0.02 0.05 0.07 0.04 0.06 0.03 0.04 0.06 0.10 0.03 0.03
## M F P S T W Y V
## alpha.prop 0.02 0.05 0.03 0.05 0.05 0.01 0.03 0.07
## beta.prop 0.01 0.03 0.05 0.12 0.09 0.02 0.04 0.08
## a.plus.b.prop 0.01 0.03 0.04 0.07 0.06 0.02 0.06 0.07
## a.div.b 0.02 0.04 0.04 0.08 0.05 0.02 0.03 0.09
## nos2.human.aa.freq 0.06 0.05 0.06 0.07 0.05 0.07 0.02 0.03
dist(aa.prop.flipped, method = "euclidean")
## alpha.prop beta.prop a.plus.b.prop a.div.b
## beta.prop 0.13342098
## a.plus.b.prop 0.09281824 0.08289406
## a.div.b 0.06699039 0.08659174 0.06175113
## nos2.human.aa.freq 0.16808467 0.16522702 0.14320188 0.14812862
dist.alpha <- dist((aa.prop.flipped[c(1,5),]), method = "euclidean")
dist.beta <- dist((aa.prop.flipped[c(2,5),]), method = "euclidean")
dist.apb <- dist((aa.prop.flipped[c(3,5),]), method = "euclidean")
dist.adb <- dist((aa.prop.flipped[c(4,5),]), method = "euclidean")
fold.type <- c("alpha","beta","alpha plus beta", "alpha/beta")
corr.sim <- round(c(corr.alpha,corr.beta,corr.apb,corr.adb),5)
cosine.sim <- round(c(cos.alpha,cos.beta,cos.apb,cos.adb),5)
Euclidean.dist <- round(c(dist.alpha,dist.beta,dist.apb,dist.adb),5)
sim.sum <- c("","","most.sim","")
dist.sum <- c("","","min.dist","")
df <- data.frame(fold.type,
corr.sim ,
cosine.sim ,
Euclidean.dist ,
sim.sum ,
dist.sum )
pander::pander(df)
| fold.type | corr.sim | cosine.sim | Euclidean.dist | sim.sum | dist.sum |
|---|---|---|---|---|---|
| alpha | 0.7748 | 0.7748 | 0.1681 | ||
| beta | 0.7863 | 0.7863 | 0.1652 | ||
| alpha plus beta | 0.8244 | 0.8244 | 0.1432 | most.sim | min.dist |
| alpha/beta | 0.8161 | 0.8161 | 0.1481 |
Percent Identity Comparisons (PID)
data("BLOSUM50")
human_string = nos2_list[[1]]
zebrafish_string = nos2_list[[2]]
rat_string = nos2_list[[3]]
chimp_string <- entrez_fetch(db = "protein",
id = "XP_016801373",
rettype = "fasta")
chimp_string <- fasta_cleaner(chimp_string, parse = F)
align_human_chimp <- Biostrings::pairwiseAlignment(pattern = human_string,
subject = chimp_string,
type = "global",
gapOpening = -9.5,
gapExtension = -0.5)
align_human_zebrafish <- Biostrings::pairwiseAlignment(pattern = human_string,
subject = zebrafish_string,
type = "global",
gapOpening = -9.5,
gapExtension = -0.5)
align_human_rat <- Biostrings::pairwiseAlignment(pattern = human_string,
subject = rat_string,
type = "global",
gapOpening = -9.5,
gapExtension = -0.5)
align_chimp_rat <- Biostrings::pairwiseAlignment(pattern = chimp_string,
subject = rat_string,
type = "global",
gapOpening = -9.5,
gapExtension = -0.5)
align_zebrafish_rat <- Biostrings::pairwiseAlignment(pattern = zebrafish_string,
subject = rat_string,
type = "global",
gapOpening = -9.5,
gapExtension = -0.5)
align_chimp_zebrafish <- Biostrings::pairwiseAlignment(pattern = chimp_string,
subject = zebrafish_string,
type = "global",
gapOpening = -9.5,
gapExtension = -0.5)
Biostrings::pid(align_human_chimp)
## [1] 32.64095
Biostrings::pid(align_human_zebrafish)
## [1] 42.55469
Biostrings::pid(align_human_rat)
## [1] 74.97956
PID table
pids <- c(1, NA, NA, NA,
pid(align_human_chimp), 1, NA, NA,
pid(align_human_zebrafish), pid(align_chimp_zebrafish), 1, NA,
pid(align_human_rat), pid(align_chimp_rat), pid(align_zebrafish_rat), 1)
mat <- matrix(pids, nrow = 4, byrow = T)
row.names(mat) <- c("Human","Chimp","Zebrafish","Rat")
colnames(mat) <- c("Human","Chimp","Zebrafish","Rat")
pander::pander(mat)
| Human | Chimp | Zebrafish | Rat | |
|---|---|---|---|---|
| Human | 1 | NA | NA | NA |
| Chimp | 32.64 | 1 | NA | NA |
| Zebrafish | 42.55 | 29.59 | 1 | NA |
| Rat | 74.98 | 29.3 | 39.56 | 1 |
PID methods comparison
method <- c("PID1", "PID2", "PID3","PID4")
PID <- c(pid(align_human_chimp,type = "PID1"),pid(align_human_chimp,type = "PID2"),pid(align_human_chimp,type = "PID3"),pid(align_human_chimp,type = "PID4") )
denominator <- c("(aligned positions + internal gap positions)", "(aligned positions + internal gap positions)", "(length shorter sequence)", "(average length of the two sequences)")
nos2_df <- data.frame(method = method, PID = PID, denominator = denominator)
pander(nos2_df)
| method | PID | denominator |
|---|---|---|
| PID1 | 32.64 | (aligned positions + internal gap positions) |
| PID2 | 75.03 | (aligned positions + internal gap positions) |
| PID3 | 47.7 | (length shorter sequence) |
| PID4 | 44.73 | (average length of the two sequences) |
Multiple sequence alignment
MSA data preparation
nos2_vector <- unlist(nos2_list)
names(nos2_vector) <- names(nos2_list)
nos2_vector_ss <- Biostrings::AAStringSet(nos2_vector)
nos2_align <- msa(nos2_vector_ss,
method = "ClustalW")
## use default substitution matrix
class(nos2_align) <- "AAMultipleAlignment"
nos2_align_seqinr <- msaConvert(nos2_align,
type = "seqinr::alignment")
ggmsa(nos2_align, start = 750, end = 800, char_width = 0.5, seq_name = T)
Phylogenetic trees of sequences
nos2_dist <- seqinr::dist.alignment(nos2_align_seqinr,
matrix = "identity")
nos2_align_seqinr_rnd <- round(nos2_dist, 3)
tree_subset <- nj(nos2_dist)
plot.phylo(tree_subset, main="Nos2 family gene tree- rooted, no branch lengths",
use.edge.length = F)