The Apolipoprotein 1 gene, better known as APOL1, is a protein involved in processes around the kidneys and liver. Genetic mutations in this gene have been found to be cause of kidney diseases. The following code will give a detailed summary of the properties, phylogenies, alignments, and evolutionary characteristics of the APOL1 gene in humans to other species containing the gene.
RefSeq Gene: https://www.ncbi.nlm.nih.gov/search/all/?term=NP_001130012.1 RefSeq Homologene: N/A Neanderthal Genome: N/A UniProt: https://www.uniprot.org/uniprot/O14791 PDB: https://www.rcsb.org/structure/7L6K
Download and load necessary packages: Comment the installation commands out after they are installed!
###CRAN PACKAGES
##downloaded using install.packages()
#install.packages("rentrez",dependencies = TRUE)
#install.packages("devtools")
#install.packages("ape")
#install.packages("seqinr")
#install.packages("BiocManager")
#install.packages("ggplot2")
library(rentrez)
## Warning: package 'rentrez' was built under R version 4.1.2
library(devtools)
## Loading required package: usethis
library(ape)
## Warning: package 'ape' was built under R version 4.1.2
library(seqinr)
##
## Attaching package: 'seqinr'
## The following objects are masked from 'package:ape':
##
## as.alignment, consensus
library(BiocManager)
## Bioconductor version '3.13' is out-of-date; the current release version '3.14'
## is available with R version '4.1'; see https://bioconductor.org/install
##
## Attaching package: 'BiocManager'
## The following object is masked from 'package:devtools':
##
## install
library(ggplot2)
###BIOCONDUCTOR PACKAGES
##downloaded with BiocManager::install()
#BiocManager::install("msa")
#BiocManager::install("Biostrings")
#BiocManager::install("drawProteins")
#BiocManager::install("HGNChelper")
library(msa)
## Loading required package: Biostrings
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind, colnames,
## dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
## union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
## Loading required package: stats4
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
## Loading required package: IRanges
##
## Attaching package: 'IRanges'
## The following object is masked from 'package:grDevices':
##
## windows
## Loading required package: XVector
## Loading required package: GenomeInfoDb
##
## Attaching package: 'Biostrings'
## The following object is masked from 'package:seqinr':
##
## translate
## The following object is masked from 'package:ape':
##
## complement
## The following object is masked from 'package:base':
##
## strsplit
##
## Attaching package: 'msa'
## The following object is masked from 'package:BiocManager':
##
## version
library(Biostrings)
library(drawProteins)
library(HGNChelper)
## Warning: package 'HGNChelper' was built under R version 4.1.2
###GITHUB PACKAGES
##requires devtools package and its function install_github()
#devtools::install_github("brouwern/compbio4all")
#devtools::install_github("YuLab-SMU/ggmsa")
library(compbio4all)
library(ggmsa)
## Registered S3 methods overwritten by 'ggalt':
## method from
## grid.draw.absoluteGrob ggplot2
## grobHeight.absoluteGrob ggplot2
## grobWidth.absoluteGrob ggplot2
## grobX.absoluteGrob ggplot2
## grobY.absoluteGrob ggplot2
## ggmsa v1.1.2 Document: http://yulab-smu.top/ggmsa/
##
## If you use ggmsa in published research, please cite: DOI: 10.18129/B9.bioc.ggmsa
The identification numbers, known as accession numbers, for the APOL1 gene were obtained from RefSeq, UniProt, and Protein Data Bank (PDB). Another resource for accession numbers is RefSeq Homologene, but this gene does not have a Homologene page, therefore no accession number from this source. However, that makes for interesting results to be shown later! ### Accession NUmber Table Not available: Neanderthal, RefSeq Homologene
#Data
apol1.table <- c("NP_001130012.1", "O14791", "7L6K", "Homo sapiens", "Human", "APOL1",
"XP_018873842.1", "A0A2I2ZQF1", "NA", "Gorilla gorilla", "Gorilla", "APOL1",
"NP_808412.1", "Q8CCA5", "NA", "Mus musculus", "Mouse", "APOL10A",
"NP_001025309.1", "Q7T131", "NA", "Danio rerio", "Zebrafish", "APOL1",
"XP_006242011.1", "NA", "NA", "Rattus norvegicus", "Rat", "APOL1",
"XP_003264742", "NA", "NA", "Nomascus leucogenys", "Northern White Cheeked Gibbon", "APOL1",
"XP_025255312", "NA", "NA", "Theropithecus gelada", "Gelada", "APOL1",
"XP_011834337", "NA", "NA", "Mandrillus leucophaeus", "Drill", "APOL1",
"NP_001289030.1", "R4TIY2", "NA", "Papio anubis", "Olive Baboon", "APOL1",
"XP_008588416", "NA", "NA", "Galeopterus variegatus", "Sunda Flying Lemur", "APOL1")
#Getting information on the table
is(apol1.table)
## [1] "character" "vector"
## [3] "data.frameRowLabels" "SuperClassMethod"
## [5] "character_OR_connection" "character_OR_NULL"
## [7] "atomic" "EnumerationValue"
## [9] "vector_OR_Vector" "vector_OR_factor"
class(apol1.table)
## [1] "character"
length(apol1.table)
## [1] 60
#Creating the matrix and cleaning it up with Pander
apol1.matrix <- matrix(apol1.table, nrow = 10, byrow = T)
apol1.df <- data.frame(apol1.matrix, stringsAsFactors = F)
names(apol1.df) <- c("RefSeq", "Uniprot", "PDB", "Scientific Name", "Common Name", "Gene Name")
apol1.df
## RefSeq Uniprot PDB Scientific Name
## 1 NP_001130012.1 O14791 7L6K Homo sapiens
## 2 XP_018873842.1 A0A2I2ZQF1 NA Gorilla gorilla
## 3 NP_808412.1 Q8CCA5 NA Mus musculus
## 4 NP_001025309.1 Q7T131 NA Danio rerio
## 5 XP_006242011.1 NA NA Rattus norvegicus
## 6 XP_003264742 NA NA Nomascus leucogenys
## 7 XP_025255312 NA NA Theropithecus gelada
## 8 XP_011834337 NA NA Mandrillus leucophaeus
## 9 NP_001289030.1 R4TIY2 NA Papio anubis
## 10 XP_008588416 NA NA Galeopterus variegatus
## Common Name Gene Name
## 1 Human APOL1
## 2 Gorilla APOL1
## 3 Mouse APOL10A
## 4 Zebrafish APOL1
## 5 Rat APOL1
## 6 Northern White Cheeked Gibbon APOL1
## 7 Gelada APOL1
## 8 Drill APOL1
## 9 Olive Baboon APOL1
## 10 Sunda Flying Lemur APOL1
pander::pander(apol1.df)
| RefSeq | Uniprot | PDB | Scientific Name |
|---|---|---|---|
| NP_001130012.1 | O14791 | 7L6K | Homo sapiens |
| XP_018873842.1 | A0A2I2ZQF1 | NA | Gorilla gorilla |
| NP_808412.1 | Q8CCA5 | NA | Mus musculus |
| NP_001025309.1 | Q7T131 | NA | Danio rerio |
| XP_006242011.1 | NA | NA | Rattus norvegicus |
| XP_003264742 | NA | NA | Nomascus leucogenys |
| XP_025255312 | NA | NA | Theropithecus gelada |
| XP_011834337 | NA | NA | Mandrillus leucophaeus |
| NP_001289030.1 | R4TIY2 | NA | Papio anubis |
| XP_008588416 | NA | NA | Galeopterus variegatus |
| Common Name | Gene Name |
|---|---|
| Human | APOL1 |
| Gorilla | APOL1 |
| Mouse | APOL10A |
| Zebrafish | APOL1 |
| Rat | APOL1 |
| Northern White Cheeked Gibbon | APOL1 |
| Gelada | APOL1 |
| Drill | APOL1 |
| Olive Baboon | APOL1 |
| Sunda Flying Lemur | APOL1 |
#Extracting RefSeq Accession Numbers from the Data Frame
apol1.df$RefSeq
## [1] "NP_001130012.1" "XP_018873842.1" "NP_808412.1" "NP_001025309.1"
## [5] "XP_006242011.1" "XP_003264742" "XP_025255312" "XP_011834337"
## [9] "NP_001289030.1" "XP_008588416"
#Putting the Accession Numbers into a Vector
apol1 <- entrez_fetch(db = "protein",
id = apol1.df$RefSeq,
rettype = "fasta")
#A cleaner version of all the accession number (very long)
cat(apol1)
#Storing the data
entrez_fetch_list <- function(db, id, rettype, ...){
#setup list for storing output
n.seq <- length(id)
list.output <- as.list(rep(NA, n.seq))
names(list.output) <- id
# get output
for(i in 1:length(id)){
list.output[[i]] <- rentrez::entrez_fetch(db = db,
id = id[i],
rettype = rettype)
}
return(list.output)
}
#Downloading the Sequences
apol1.list <- entrez_fetch_list(db = "protein",
id = apol1.df$RefSeq,
rettype = "fasta")
#Getting characteristics of the list
length(apol1.list)
## [1] 10
apol1.list[[1]]
## [1] ">NP_001130012.1 apolipoprotein L1 isoform a precursor [Homo sapiens]\nMEGAALLRVSVLCIWMSALFLGVGVRAEEAGARVQQNVPSGTDTGDPQSKPLGDWAAGTMDPESSIFIED\nAIKYFKEKVSTQNLLLLLTDNEAWNGFVAAAELPRNEADELRKALDNLARQMIMKDKNWHDKGQQYRNWF\nLKEFPRLKSELEDNIRRLRALADGVQKVHKGTTIANVVSGSLSISSGILTLVGMGLAPFTEGGSLVLLEP\nGMELGITAALTGITSSTMDYGKKWWTQAQAHDLVIKSLDKLKEVREFLGENISNFLSLAGNTYQLTRGIG\nKDIRALRRARANLQSVPHASASRPRVTEPISAESGEQVERVNEPSILEMSRGVKLTDVAPVSFFLVLDVV\nYLVYESKHLHEGAKSETAEELKKVAQELEEKLNILNNNYKILQADQEL\n\n"
#Using a for loop to clean each FASTA file. This leaves less room for error and is more concise than writing 10 separate lines for each species.
for(i in 1:length(apol1.list)){
apol1.list[[i]] <- fasta_cleaner(apol1.list[[i]], parse = F)
}
Make sure to have the following packages installed and loaded: BiocManager drawProteins ggplot2
O14791_apol1 <- drawProteins::get_features("O14791")
## [1] "Download has worked"
is(O14791_apol1)
## [1] "list" "vector" "list_OR_List" "vector_OR_Vector"
## [5] "vector_OR_factor"
my_prot_df <- drawProteins::feature_to_dataframe(O14791_apol1)
is(my_prot_df)
## [1] "data.frame" "list" "oldClass" "vector"
## [5] "list_OR_List" "vector_OR_Vector" "vector_OR_factor"
my_prot_df[,-2]
## type begin end length accession entryName taxid order
## featuresTemp SIGNAL 1 27 26 O14791 APOL1_HUMAN 9606 1
## featuresTemp.1 CHAIN 28 398 370 O14791 APOL1_HUMAN 9606 1
## featuresTemp.2 REGION 35 55 20 O14791 APOL1_HUMAN 9606 1
## featuresTemp.3 REGION 297 317 20 O14791 APOL1_HUMAN 9606 1
## featuresTemp.4 MOD_RES 311 311 0 O14791 APOL1_HUMAN 9606 1
## featuresTemp.5 MOD_RES 314 314 0 O14791 APOL1_HUMAN 9606 1
## featuresTemp.6 CARBOHYD 261 261 0 O14791 APOL1_HUMAN 9606 1
## featuresTemp.7 VAR_SEQ 1 1 0 O14791 APOL1_HUMAN 9606 1
## featuresTemp.8 VAR_SEQ 16 33 17 O14791 APOL1_HUMAN 9606 1
## featuresTemp.9 VARIANT 150 150 0 O14791 APOL1_HUMAN 9606 1
## featuresTemp.10 VARIANT 188 188 0 O14791 APOL1_HUMAN 9606 1
## featuresTemp.11 VARIANT 228 228 0 O14791 APOL1_HUMAN 9606 1
## featuresTemp.12 VARIANT 255 255 0 O14791 APOL1_HUMAN 9606 1
## featuresTemp.13 VARIANT 337 337 0 O14791 APOL1_HUMAN 9606 1
## featuresTemp.14 VARIANT 342 342 0 O14791 APOL1_HUMAN 9606 1
## featuresTemp.15 VARIANT 384 384 0 O14791 APOL1_HUMAN 9606 1
## featuresTemp.16 CONFLICT 24 24 0 O14791 APOL1_HUMAN 9606 1
## featuresTemp.17 CONFLICT 256 256 0 O14791 APOL1_HUMAN 9606 1
## featuresTemp.18 CONFLICT 346 346 0 O14791 APOL1_HUMAN 9606 1
O14791_apol1 <- drawProteins::get_features("O14791")
## [1] "Download has worked"
my_prot_df <- drawProteins::feature_to_dataframe(O14791_apol1)
my_canvas <- draw_canvas(my_prot_df)
my_canvas <- draw_chains(my_canvas, my_prot_df, label_size = 2.5)
my_canvas <- draw_regions(my_canvas, my_prot_df)
my_canvas <- draw_motif(my_canvas, my_prot_df)
my_canvas <- draw_phospho(my_canvas, my_prot_df)
my_canvas <- draw_repeat(my_canvas, my_prot_df)
my_canvas <- draw_recept_dom(my_canvas, my_prot_df)
my_canvas <- draw_folding(my_canvas, my_prot_df)
my_canvas
my_prot_df
## type description begin end
## featuresTemp SIGNAL 1 27
## featuresTemp.1 CHAIN Apolipoprotein L1 28 398
## featuresTemp.2 REGION Disordered 35 55
## featuresTemp.3 REGION Disordered 297 317
## featuresTemp.4 MOD_RES Phosphoserine; by FAM20C 311 311
## featuresTemp.5 MOD_RES Phosphoserine; by FAM20C 314 314
## featuresTemp.6 CARBOHYD N-linked (GlcNAc...) asparagine 261 261
## featuresTemp.7 VAR_SEQ NONE 1 1
## featuresTemp.8 VAR_SEQ NONE 16 33
## featuresTemp.9 VARIANT in dbSNP:rs2239785 150 150
## featuresTemp.10 VARIANT in a breast cancer sample; somatic mutation 188 188
## featuresTemp.11 VARIANT in dbSNP:rs136175 228 228
## featuresTemp.12 VARIANT in dbSNP:rs136176 255 255
## featuresTemp.13 VARIANT in dbSNP:rs16996616 337 337
## featuresTemp.14 VARIANT in FSGS4; dbSNP:rs73885319 342 342
## featuresTemp.15 VARIANT in FSGS4; dbSNP:rs60910145 384 384
## featuresTemp.16 CONFLICT in Ref. 1; AAG53690 and 2; AAK11591 24 24
## featuresTemp.17 CONFLICT in Ref. 3; AAK20210 256 256
## featuresTemp.18 CONFLICT in Ref. 3; AAK20210 346 346
## length accession entryName taxid order
## featuresTemp 26 O14791 APOL1_HUMAN 9606 1
## featuresTemp.1 370 O14791 APOL1_HUMAN 9606 1
## featuresTemp.2 20 O14791 APOL1_HUMAN 9606 1
## featuresTemp.3 20 O14791 APOL1_HUMAN 9606 1
## featuresTemp.4 0 O14791 APOL1_HUMAN 9606 1
## featuresTemp.5 0 O14791 APOL1_HUMAN 9606 1
## featuresTemp.6 0 O14791 APOL1_HUMAN 9606 1
## featuresTemp.7 0 O14791 APOL1_HUMAN 9606 1
## featuresTemp.8 17 O14791 APOL1_HUMAN 9606 1
## featuresTemp.9 0 O14791 APOL1_HUMAN 9606 1
## featuresTemp.10 0 O14791 APOL1_HUMAN 9606 1
## featuresTemp.11 0 O14791 APOL1_HUMAN 9606 1
## featuresTemp.12 0 O14791 APOL1_HUMAN 9606 1
## featuresTemp.13 0 O14791 APOL1_HUMAN 9606 1
## featuresTemp.14 0 O14791 APOL1_HUMAN 9606 1
## featuresTemp.15 0 O14791 APOL1_HUMAN 9606 1
## featuresTemp.16 0 O14791 APOL1_HUMAN 9606 1
## featuresTemp.17 0 O14791 APOL1_HUMAN 9606 1
## featuresTemp.18 0 O14791 APOL1_HUMAN 9606 1
O14791_apol1 <- drawProteins::get_features("O14791")
## [1] "Download has worked"
my_prot_df <- drawProteins::feature_to_dataframe(O14791_apol1)
my_canvas <- draw_canvas(my_prot_df)
my_canvas <- draw_chains(my_canvas, my_prot_df, label_size = 2.5)
my_canvas <- draw_phospho(my_canvas, my_prot_df)
my_canvas <- draw_folding(my_canvas, my_prot_df)
my_canvas
## Draw Dotplot ### Prepare Data
apol1.human.fasta <- rentrez::entrez_fetch(id = "NP_001130012.1",
db = "protein",
rettype="fasta")
apol1.human.vector <- fasta_cleaner(apol1.human.fasta)
apol1.human.string <- fasta_cleaner(apol1.human.vector,
parse = F)
#Creating the Dotplot
dotPlot(apol1.human.vector, apol1.human.vector)
#Creating Variations of the Dotplot to Find the Most Interesting Plot
par(mfrow = c(2,2),
mar = c(0,0,2,1))
# plot 1: Defaults
dotPlot(apol1.human.vector, apol1.human.vector,
wsize = 1 ,
nmatch = 1 ,
main = "Dotplot 1")
# plot 2 size = 10, nmatch = 1
dotPlot(apol1.human.vector, apol1.human.vector,
wsize = 10 ,
nmatch = 1 ,
main = "Dotplot 2")
# plot 3: size = 10, nmatch = 5
dotPlot(apol1.human.vector, apol1.human.vector,
wsize = 10 ,
nmatch = 5 ,
main = "Dotplot 3")
# plot 4: size = 20, nmatch = 5
dotPlot(apol1.human.vector, apol1.human.vector,
wsize = 20 ,
nmatch = 5 ,
main = "Dotplot 4")
# reset par() - run this or other plots will be small!
par(mfrow = c(1,1),
mar = c(4,4,4,4))
#Creating a Property Table
sources <- c("UniProt", "PDB", "AlphaFold")
property <- c("The APOL1 protein is disordered and is one large domain.", "This protein is a membrane protein.", "The APOL1 gene is involved in lipid exchange and transport, and commonly is seen working in the kidneys and liver.")
link <- c("https://www.uniprot.org/uniprot/O14791/protvista", "https://www.rcsb.org/structure/7L6K", "https://alphafold.ebi.ac.uk/entry/O14791")
property.df <- data.frame("Source" = sources, "Protein Property" = property, "Link" = link)
pander::pander(property.df)
| Source | Protein.Property |
|---|---|
| UniProt | The APOL1 protein is disordered and is one large domain. |
| PDB | This protein is a membrane protein. |
| AlphaFold | The APOL1 gene is involved in lipid exchange and transport, and commonly is seen working in the kidneys and liver. |
| Link |
|---|
| https://www.uniprot.org/uniprot/O14791/protvista |
| https://www.rcsb.org/structure/7L6K |
| https://alphafold.ebi.ac.uk/entry/O14791 |
Make sure to have the pander package installed and loaded! ### Introduction Chou is a scientist who worked to discover information about secondary structure in amino acids. The methods used were multivariate statistics, and at first this work did not look at all at the actual sequence and what it could mean. All of this went to working on trying to predict full 3D structures of a protein. ### Amino Acid Compositions for Four Primary Folding Types The four primary folding types are Alpha, Beta, Alpha + Beta, and Alpha/Beta. There is a fifth folding type called Disordered, but the main four are what are focused on most. These compositions can be determined and then used to further investigate qualities of a protein. ### Amino Acid Frequencies
#Making a vector of amino acid single letter codes
aa.1.1 <- c("A","R","N","D","C","Q","E","G","H","I",
"L","K","M","F","P","S","T","W","Y","V")
#We enter it again so we can double check that it is correct!
aa.1.2 <- c("A","R","N","D","C","Q","E","G","H","I",
"L","K","M","F","P","S","T","W","Y","V")
#Checking length
length(aa.1.1) == length(aa.1.2)
## [1] TRUE
#Checking to see if all values are the same
aa.1.1 == aa.1.2
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [16] TRUE TRUE TRUE TRUE TRUE
#Seeing how many unique values are in the object
unique(aa.1.1)
## [1] "A" "R" "N" "D" "C" "Q" "E" "G" "H" "I" "L" "K" "M" "F" "P" "S" "T" "W" "Y"
## [20] "V"
#Checking that the vectors have the same number of unique values
length(unique(aa.1.1)) == length(unique(aa.1.2))
## [1] TRUE
#Setting Up Data and Doing Logical Checks
alpha <- c(285, 53, 97, 163, 22, 67, 134, 197, 111, 91,
221, 249, 48, 123, 82, 122, 119, 33, 63, 167)
sum(alpha) == 2447
## [1] TRUE
beta <- c(203, 67, 139, 121, 75, 122, 86, 297, 49, 120,
177, 115, 16, 85, 127, 341, 253, 44, 110, 229)
sum(beta) == 2776
## [1] TRUE
a.plus.b <- c(175, 78, 120, 111, 74, 74, 86, 171, 33, 93,
110, 112, 25, 52, 71, 126, 117, 30, 108, 123)
sum(a.plus.b) == 1889
## [1] TRUE
a.div.b <- c(361, 146, 183, 244, 63, 114, 257, 377, 107, 239,
339, 321, 91, 158, 188, 327, 238, 72, 130, 378)
sum(a.div.b) == 4333
## [1] TRUE
#Making a Data Frame for the Folding Types
data.frame(aa.1.1, alpha, beta, a.plus.b, a.div.b)
## aa.1.1 alpha beta a.plus.b a.div.b
## 1 A 285 203 175 361
## 2 R 53 67 78 146
## 3 N 97 139 120 183
## 4 D 163 121 111 244
## 5 C 22 75 74 63
## 6 Q 67 122 74 114
## 7 E 134 86 86 257
## 8 G 197 297 171 377
## 9 H 111 49 33 107
## 10 I 91 120 93 239
## 11 L 221 177 110 339
## 12 K 249 115 112 321
## 13 M 48 16 25 91
## 14 F 123 85 52 158
## 15 P 82 127 71 188
## 16 S 122 341 126 327
## 17 T 119 253 117 238
## 18 W 33 44 30 72
## 19 Y 63 110 108 130
## 20 V 167 229 123 378
#Creating Vectors
alpha.prop <- alpha/sum(alpha)
beta.prop <- beta/sum(beta)
a.plus.b.prop <- a.plus.b/sum(a.plus.b)
a.div.b <- a.div.b/sum(a.div.b)
#Creating a Data Frame for Frquencies
aa.prop <- data.frame(alpha.prop,
beta.prop,
a.plus.b.prop,
a.div.b)
row.names(aa.prop) <- aa.1.1
aa.prop
## alpha.prop beta.prop a.plus.b.prop a.div.b
## A 0.116469146 0.073126801 0.09264161 0.08331410
## R 0.021659174 0.024135447 0.04129169 0.03369490
## N 0.039640376 0.050072046 0.06352567 0.04223402
## D 0.066612178 0.043587896 0.05876125 0.05631202
## C 0.008990601 0.027017291 0.03917417 0.01453958
## Q 0.027380466 0.043948127 0.03917417 0.02630972
## E 0.054760932 0.030979827 0.04552673 0.05931225
## G 0.080506743 0.106988473 0.09052409 0.08700669
## H 0.045361667 0.017651297 0.01746956 0.02469421
## I 0.037188394 0.043227666 0.04923240 0.05515809
## L 0.090314671 0.063760807 0.05823187 0.07823679
## K 0.101757254 0.041426513 0.05929063 0.07408262
## M 0.019615856 0.005763689 0.01323452 0.02100162
## F 0.050265631 0.030619597 0.02752779 0.03646434
## P 0.033510421 0.045749280 0.03758602 0.04338795
## S 0.049856968 0.122838617 0.06670196 0.07546734
## T 0.048630977 0.091138329 0.06193753 0.05492730
## W 0.013485901 0.015850144 0.01588142 0.01661666
## Y 0.025745811 0.039625360 0.05717311 0.03000231
## V 0.068246833 0.082492795 0.06511382 0.08723748
#Plotting the Dotplots to Determine Correlation
plot(aa.prop,panel = panel.smooth)
#Correlating Values and Rounding Them
cor(aa.prop)
## alpha.prop beta.prop a.plus.b.prop a.div.b
## alpha.prop 1.0000000 0.4941143 0.6969508 0.8555289
## beta.prop 0.4941143 1.0000000 0.7977771 0.7706654
## a.plus.b.prop 0.6969508 0.7977771 1.0000000 0.8198043
## a.div.b 0.8555289 0.7706654 0.8198043 1.0000000
round(cor(aa.prop), 3)
## alpha.prop beta.prop a.plus.b.prop a.div.b
## alpha.prop 1.000 0.494 0.697 0.856
## beta.prop 0.494 1.000 0.798 0.771
## a.plus.b.prop 0.697 0.798 1.000 0.820
## a.div.b 0.856 0.771 0.820 1.000
#Setting Up Parameters for the Plots
par(mfrow = c(1,3), mar = c(4,4,1,0))
plot(alpha.prop ~ beta.prop, data = aa.prop)
plot(alpha.prop ~ a.plus.b.prop, data = aa.prop)
plot(alpha.prop ~ a.div.b, data = aa.prop)
par(mfrow = c(1,1), mar = c(4,4,4,4))
#Retrieving the FASTA file and cleaning it
NP_001130012.1 <- rentrez::entrez_fetch(id = "NP_001130012.1",
db = "protein",
rettype = "fasta")
NP_001130012.1 <- compbio4all::fasta_cleaner(NP_001130012.1, parse = TRUE)
NP_001130012.1.freq.table <- table(NP_001130012.1)/length(NP_001130012.1)
table_to_vector <- function(table_x){
table_names <- attr(table_x, "dimnames")[[1]]
table_vect <- as.vector(table_x)
names(table_vect) <- table_names
return(table_vect)
}
APOL1.human.aa.freq <- table_to_vector(NP_001130012.1.freq.table)
aa.names <- names(APOL1.human.aa.freq)
any(aa.names == "U")
## [1] FALSE
sum(APOL1.human.aa.freq)
## [1] 1
aa.prop$APOL1.human.aa.freq <- APOL1.human.aa.freq
chou_cor <- function(x,y){
numerator <- sum(x*y)
denominator <- sqrt((sum(x^2))*(sum(y^2)))
result <- numerator/denominator
return(result)
}
chou_cosine <- function(z.1, z.2){
z.1.abs <- sqrt(sum(z.1^2))
z.2.abs <- sqrt(sum(z.2^2))
my.cosine <- sum(z.1*z.2)/(z.1.abs*z.2.abs)
return(my.cosine)
}
par(mfrow = c(2,2), mar = c(1,4,1,1))
plot(alpha.prop ~ APOL1.human.aa.freq, data = aa.prop)
plot(beta.prop ~ APOL1.human.aa.freq, data = aa.prop)
plot(a.plus.b.prop ~ APOL1.human.aa.freq, data = aa.prop)
plot(a.div.b ~ APOL1.human.aa.freq, data = aa.prop)
par(mfrow = c(1,1), mar = c(1,1,1,1))
corr.alpha <- chou_cor(aa.prop[,5], aa.prop[,1])
corr.beta <- chou_cor(aa.prop[,5], aa.prop[,2])
corr.apb <- chou_cor(aa.prop[,5], aa.prop[,3])
corr.adb <- chou_cor(aa.prop[,5], aa.prop[,4])
cos.alpha <- chou_cosine(aa.prop[,5], aa.prop[,1])
cos.beta <- chou_cosine(aa.prop[,5], aa.prop[,2])
cos.apb <- chou_cosine(aa.prop[,5], aa.prop[,3])
cos.adb <- chou_cosine(aa.prop[,5], aa.prop[,4])
aa.prop.flipped <- t(aa.prop)
round(aa.prop.flipped,2)
## A R N D C Q E G H I L K
## alpha.prop 0.12 0.02 0.04 0.07 0.01 0.03 0.05 0.08 0.05 0.04 0.09 0.10
## beta.prop 0.07 0.02 0.05 0.04 0.03 0.04 0.03 0.11 0.02 0.04 0.06 0.04
## a.plus.b.prop 0.09 0.04 0.06 0.06 0.04 0.04 0.05 0.09 0.02 0.05 0.06 0.06
## a.div.b 0.08 0.03 0.04 0.06 0.01 0.03 0.06 0.09 0.02 0.06 0.08 0.07
## APOL1.human.aa.freq 0.09 0.00 0.05 0.09 0.03 0.07 0.02 0.05 0.06 0.13 0.02 0.05
## M F P S T W Y V
## alpha.prop 0.02 0.05 0.03 0.05 0.05 0.01 0.03 0.07
## beta.prop 0.01 0.03 0.05 0.12 0.09 0.02 0.04 0.08
## a.plus.b.prop 0.01 0.03 0.04 0.07 0.06 0.02 0.06 0.07
## a.div.b 0.02 0.04 0.04 0.08 0.05 0.02 0.03 0.09
## APOL1.human.aa.freq 0.03 0.04 0.05 0.07 0.05 0.07 0.02 0.02
dist(aa.prop.flipped, method = "euclidean")
## alpha.prop beta.prop a.plus.b.prop a.div.b
## beta.prop 0.13342098
## a.plus.b.prop 0.09281824 0.08289406
## a.div.b 0.06699039 0.08659174 0.06175113
## APOL1.human.aa.freq 0.17086517 0.17492990 0.15359108 0.16166376
dist.alpha <- dist((aa.prop.flipped[c(1,5),]), method = "euclidean")
dist.beta <- dist((aa.prop.flipped[c(2,5),]), method = "euclidean")
dist.apb <- dist((aa.prop.flipped[c(3,5),]), method = "euclidean")
dist.adb <- dist((aa.prop.flipped[c(4,5),]), method = "euclidean")
fold.type <- c("alpha","beta","alpha plus beta", "alpha/beta")
corr.sim <- round(c(corr.alpha,corr.beta,corr.apb,corr.adb),5)
cosine.sim <- round(c(cos.alpha,cos.beta,cos.apb,cos.adb),5)
Euclidean.dist <- round(c(dist.alpha,dist.beta,dist.apb,dist.adb),5)
sim.sum <- c("","","most.sim","")
dist.sum <- c("","","min.dist","")
df <- data.frame(fold.type,
corr.sim ,
cosine.sim ,
Euclidean.dist ,
sim.sum ,
dist.sum )
pander::pander(df)
| fold.type | corr.sim | cosine.sim | Euclidean.dist | sim.sum | dist.sum |
|---|---|---|---|---|---|
| alpha | 0.7824 | 0.7824 | 0.1709 | ||
| beta | 0.7749 | 0.7749 | 0.1749 | ||
| alpha plus beta | 0.8155 | 0.8155 | 0.1536 | most.sim | min.dist |
| alpha/beta | 0.7982 | 0.7982 | 0.1617 |
names(apol1.list)
## [1] "NP_001130012.1" "XP_018873842.1" "NP_808412.1" "NP_001025309.1"
## [5] "XP_006242011.1" "XP_003264742" "XP_025255312" "XP_011834337"
## [9] "NP_001289030.1" "XP_008588416"
length(apol1.list)
## [1] 10
apol1.list[1]
## $NP_001130012.1
## [1] "MEGAALLRVSVLCIWMSALFLGVGVRAEEAGARVQQNVPSGTDTGDPQSKPLGDWAAGTMDPESSIFIEDAIKYFKEKVSTQNLLLLLTDNEAWNGFVAAAELPRNEADELRKALDNLARQMIMKDKNWHDKGQQYRNWFLKEFPRLKSELEDNIRRLRALADGVQKVHKGTTIANVVSGSLSISSGILTLVGMGLAPFTEGGSLVLLEPGMELGITAALTGITSSTMDYGKKWWTQAQAHDLVIKSLDKLKEVREFLGENISNFLSLAGNTYQLTRGIGKDIRALRRARANLQSVPHASASRPRVTEPISAESGEQVERVNEPSILEMSRGVKLTDVAPVSFFLVLDVVYLVYESKHLHEGAKSETAEELKKVAQELEEKLNILNNNYKILQADQEL"
#Storing the List to a Vector
apol1.vector <- rep(NA, length(apol1.list))
#Using a for loop to make each entry of the list into a vector
for(i in 1:length(apol1.vector)){
apol1.vector[i] <- apol1.list[[i]]
}
#Naming the Vector
names(apol1.vector) <- names(apol1.list)
#Human vs Gorilla
align.human.vs.gorilla <- Biostrings::pairwiseAlignment(apol1.list[[1]],
apol1.list[[2]])
align.human.vs.gorilla
## Global PairwiseAlignmentsSingleSubject (1 of 1)
## pattern: MEGAALLRVSVLCIWMSALFLGVGVRAEEAGARV...SETAEELKKVAQELEEKLNILNNNYKILQADQEL
## subject: MEGAALLRVCVLCIWMNALFLGVGVRAEEAGARV...SETAEELKKVAQELEEKLNILNNNYKILQAGQEL
## score: 1607.438
Biostrings::pid(align.human.vs.gorilla)
## [1] 97.48744
#Human vs Mouse
align.human.vs.mouse <- Biostrings::pairwiseAlignment(apol1.list[[1]],
apol1.list[[3]])
align.human.vs.mouse
## Global PairwiseAlignmentsSingleSubject (1 of 1)
## pattern: MEGAALLRVSVLCIWMSALFLGVGVRAEEAGARV...SETAEELKKVAQELEEKLNILNNNYKILQADQEL
## subject: MDWNEILED----IRK------VERRIVE-----...TESGGALRDLAHKLEE--N--------------L
## score: -1326.387
Biostrings::pid(align.human.vs.mouse)
## [1] 28.57143
#Human vs Rat
align.human.vs.rat <- Biostrings::pairwiseAlignment(apol1.list[[1]],
apol1.list[[5]])
align.human.vs.rat
## Global PairwiseAlignmentsSingleSubject (1 of 1)
## pattern: MEGAALLRV-S----VLCI--WMSA----LFLGV...SETAEELKKVAQELEEKLNILNNNYKILQADQEL
## subject: MSCRWNYRRLSHLTWVLGTELWSSAKHGVLLLNT...TESAGALRELCHKLDDKVKIFEQIYKALQSDLPQ
## score: -1379.513
Biostrings::pid(align.human.vs.rat)
## [1] 32.19616
#Gorilla vs Rat
align.gorilla.vs.rat <- Biostrings::pairwiseAlignment(apol1.list[[2]],
apol1.list[[5]])
align.gorilla.vs.rat
## Global PairwiseAlignmentsSingleSubject (1 of 1)
## pattern: M------------------E-------GAALL--...SETAEELKKVAQELEEKLNILNNNYKILQAGQEL
## subject: MSCRWNYRRLSHLTWVLGTELWSSAKHGVLLLNT...TESAGALRELCHKLDDKVKIFEQIYKALQSDLPQ
## score: -1372.902
Biostrings::pid(align.gorilla.vs.rat)
## [1] 31.48936
#Gorilla vs Mouse
align.gorilla.vs.mouse <- Biostrings::pairwiseAlignment(apol1.list[[2]],
apol1.list[[3]])
align.gorilla.vs.mouse
## Global PairwiseAlignmentsSingleSubject (1 of 1)
## pattern: MEGAALLRVCVLCIWMNALFLGVGVRAEEAGARV...SETAEELKKVAQELEEKLNILNNNYKILQAGQEL
## subject: MDWNEILED----IRK------VERRIVE-----...TESGGALRDLAHKLEE--N--------------L
## score: -1299.499
Biostrings::pid(align.gorilla.vs.mouse)
## [1] 29.32692
#Mouse vs Rat
align.mouse.vs.rat <- Biostrings::pairwiseAlignment(apol1.list[[3]],
apol1.list[[5]])
align.mouse.vs.rat
## Global PairwiseAlignmentsSingleSubject (1 of 1)
## pattern: MD--WN----------------------------...TESGGALRDLAHKLE------EN----L------
## subject: MSCRWNYRRLSHLTWVLGTELWSSAKHGVLLLNT...TESAGALRELCHKLDDKVKIFEQIYKALQSDLPQ
## score: -756.525
Biostrings::pid(align.mouse.vs.rat)
## [1] 39.65142
pids <- c(1, NA, NA, NA,
pid(align.human.vs.gorilla), 1, NA, NA,
pid(align.human.vs.rat), pid(align.gorilla.vs.rat), 1, NA,
pid(align.human.vs.mouse), pid(align.gorilla.vs.mouse), pid(align.mouse.vs.rat), 1)
mat <- matrix(pids, nrow = 4, byrow = T)
row.names(mat) <- c("Homo","Gorilla","Mouse","Rat")
colnames(mat) <- c("Homo","Gorilla","Mouse","Rat")
pander::pander(mat)
| Homo | Gorilla | Mouse | Rat | |
|---|---|---|---|---|
| Homo | 1 | NA | NA | NA |
| Gorilla | 97.49 | 1 | NA | NA |
| Mouse | 32.2 | 31.49 | 1 | NA |
| Rat | 28.57 | 29.33 | 39.65 | 1 |
#Comparing Types of PID Methods
pid(align.human.vs.gorilla, type = "PID1")
## [1] 97.48744
pid(align.human.vs.gorilla, type = "PID2")
## [1] 97.48744
pid(align.human.vs.gorilla, type = "PID3")
## [1] 97.48744
pid(align.human.vs.gorilla, type = "PID4")
## [1] 97.48744
#Converting Vector to a String Set
apol1.vector.ss <- Biostrings::AAStringSet(apol1.vector)
apol1.align <- msa(apol1.vector.ss,
method = "ClustalW")
## use default substitution matrix
class(apol1.align)
## [1] "MsaAAMultipleAlignment"
## attr(,"package")
## [1] "msa"
is(apol1.align)
## [1] "MsaAAMultipleAlignment" "AAMultipleAlignment" "MsaMetaData"
## [4] "MultipleAlignment"
#Default Output of MSA
apol1.align
## CLUSTAL 2.1
##
## Call:
## msa(apol1.vector.ss, method = "ClustalW")
##
## MsaAAMultipleAlignment with 10 rows and 527 columns
## aln names
## [1] -------------------------...ENL---------------------- NP_808412.1
## [2] MSCRWNYRRLSHLTWVLGTELWSSA...DKVKIFEQIYKALQSDLPQ------ XP_006242011.1
## [3] -------------------------...EKLNILNNNYKILQADQEL------ NP_001130012.1
## [4] -------------------------...EKLNILNNNYKILQAGQEL------ XP_018873842.1
## [5] -------------------------...GKLNILTKIHEILQADQEL------ XP_003264742
## [6] -------------------------...KKLNILNKKYETLSQEP-------- XP_025255312
## [7] -------------------------...KKLNILNKKYETLSQEP-------- XP_011834337
## [8] -------------------------...KKLNILNKKYETLRQEP-------- NP_001289030.1
## [9] -------------------------...KKLEELTQIHKGLQLPPN------- XP_008588416
## [10] -------------------------...EGLMELNVIRDELRMTTTVKEDQTS NP_001025309.1
## Con -------------------------...?KLNILN??Y??LQ????------- Consensus
#Changing the Class of Alignment
class(apol1.align)
## [1] "MsaAAMultipleAlignment"
## attr(,"package")
## [1] "msa"
class(apol1.align) <- "AAMultipleAlignment"
class(apol1.align)
## [1] "AAMultipleAlignment"
#Converting to Seqinr Format
apol1.align.seqinr <- msaConvert(apol1.align,
type = "seqinr::alignment")
#Displaying the Output
compbio4all::print_msa(apol1.align.seqinr)
## [1] "------------------------------------------------------------ 0"
## [1] "MSCRWNYRRLSHLTWVLGTELWSSAKHGVLLLNTETLLSMAEEQGGPGRHRAGCVLRSHM 0"
## [1] "----------------------------------------------------------ME 0"
## [1] "----------------------------------------------------------ME 0"
## [1] "----------------------------------------------------------ME 0"
## [1] "----------------------------------------------------------ME 0"
## [1] "----------------------------------------------------------ME 0"
## [1] "----------------------------------------------------------ME 0"
## [1] "----------------------------------------------------------MC 0"
## [1] "------------------------------------------------------------ 0"
## [1] " "
## [1] "---------------------------MDWNEILEDIR---------------------- 0"
## [1] "PELFLSLSCTGYIMPRACAWLSLRRAAGDWIHTAKDSRDTSKLAYRKIPCPDTCRTVRSP 0"
## [1] "GAALLRVSVLCIWMSALFLGVG-VRAEEAGARVQQNVPSG--TDTGDPQSKPLGDWAAGT 0"
## [1] "GAALLRVCVLCIWMNALFLGVG-VRAEEAGARVQQNVPSG--TDTGDPQSKPLGDWAAGT 0"
## [1] "GAALLTVFVLCIWTSALLLGVG-VRAEEAGGRVQQNVPSGTVTDTGNPQGKPLGDWAAGS 0"
## [1] "GAALLRLFVLCIWTSGLFLGVG-VRAEEATARVQQNHPGG--TDTGDPQGKPLSDRAAGT 0"
## [1] "GAALLRLFVLCIWTSGLFLGVG-VRAEEAAVRVQQNHPGG--TDTGDPQGKPLSDRAAGT 0"
## [1] "GAALLRLFVLCIWTSGLFLGVG-VRAEEATARVQQNHPGG--TDTGDLQ----------- 0"
## [1] "SSQAEGTIQHQHPAEAPSKGQSQTKNDKDEIQVQTHLQTSGVVQDPSSSAHPFCDFKTAA 0"
## [1] "------------------------------------------------------------ 0"
## [1] " "
## [1] "----------------------------------------KVERRIVEKFIEDLTENFLR 0"
## [1] "MAHALDR-----------------------------AAVSKVARRSVEELTDFLTEVLCK 0"
## [1] "MDPESS--------------------------------------IFIEDAIKYFKEKVST 0"
## [1] "MDPESS--------------------------------------IFIEDAIKYFKEKVST 0"
## [1] "MDPESS--------------------------------------IFIEDAIKYFQEKVST 0"
## [1] "MDPESN--------------------------------------VSLEDYLNYFQKNVSP 0"
## [1] "MDPESN--------------------------------------VSLEDYLNYFQKNVSP 0"
## [1] "---ESN--------------------------------------VSLEDYLNYFQKNVSP 0"
## [1] "LAPESHRKKGERTRRLCLGLEVRAEGAGARVKQNLPAWTQTDSKSFIEQVIKYFKDYVRR 0"
## [1] "----------------------------------------------MEWLDDFRLSETQR 0"
## [1] " "
## [1] "TDLRSLITEDGAWNGFLETAELSSEEKAALRDALKESSAQKPSGENDRPERKPQ-KEQIL 0"
## [1] "RDLKSLITEDGVWKGFVEAAELSSEEEAALYDALKEHLQQHPTDENDGPQREQE-KERFL 0"
## [1] "QNLLLLLTDNEAWNGFVAAAELPRNEADELRKALDNLARQMIMKDKNWHDKGQQYRNWFL 0"
## [1] "ENLLLLLTDNEAWNGFVAAAELPSNEADELRKALDNLARQMIMKDKNWHDKGQQYRNWFL 0"
## [1] "QNLLLLLTDDEAWNGFVAAAELTRDEADEFHKALNKLARHVIMKDKNWHDKDQQHRKWFL 0"
## [1] "QELLLLLSDHKAWERVVATAELPRDEADELYKALNKLIRHMVMKDKNWLEEVQQHRKRFL 0"
## [1] "QEMLLLLSDHKAWERVVATAELPRDEADELYKALNKLIRHMVMKDKNWLEEVQQHRKRFL 0"
## [1] "QEMLLLLSDHKAWERVVATAELPRDEADELYKALNKLIRHMVMKDKNWLEEVQQHRKRFL 0"
## [1] "EALELLLTEGGVWESLLAEAGLSRDEANELYEALRKLMTVMATEDKAMLQKEEQDRKRFL 0"
## [1] "EDEEDLEGLMDWWNTVEQWEDMPKNDNMTEKEEAKAFAVT--------ADKVQKGIRVFN 0"
## [1] " "
## [1] "REVPQLKKKLEVHISKLRELADKLDLVHKCCTISNVVSLTLSAASGVLKLLDFVLSQIYG 0"
## [1] "RVFPQLKKKLEDHIKKLRDLADHLDQVHHGCTISNVVSSSVSTVSGILG---LVLLPFTG 0"
## [1] "KEFPRLKSELEDNIRRLRALADGVQKVHKGTTIANVVSGSLSISSGILTLVGMGLAPFTE 0"
## [1] "KVFPRLKSELEDNIRRLRALADGVQKVHKGTTIANVVSGSLSISSGILTLVGMGLAPFTE 0"
## [1] "REFPHLKRELQDHIIRLHVLADRVEQVHRGTTITNVVSSTAGVISDILTLISVGLAPFTE 0"
## [1] "EEFPRLERELQDKIRRLCDLAGEVQKVHKGATIANAFSSTLGVASGVLTFLGLGLAPFTA 0"
## [1] "EEFPRLERELQDKIRRLCDLAGEVQKVHKGATIANAFSSTLGVASGVLTFLGMGLAPFTA 0"
## [1] "EEFPRLERELQDKIRRLCDLAGEVQKVHKGATIANAFSSTLGVASGVLTFLGLGLAPFTA 0"
## [1] "NEFPQTKMELEELIGKLHALADKVDKVHRDCTISNVVATSTSIVSGILTILGLSLAPVTA 0"
## [1] "KLFSERAEGLWQHVIDLHAIADGLDRFNKKTKIAQITGGSTSAVGGVATITGLILAPFTM 0"
## [1] " "
## [1] "MPRRALSATSEGLGMASDMINITTTIVGESFRQSYESEARRLVGASINILYEIINITPMI 0"
## [1] "GASLILSATAVGLGVTASVNGLVTNIVEESIKLSDESEASRLVGASMDVLGEILKITPKI 0"
## [1] "GGSLVLLEPGMELGITAALTGITSSTMDYGKKWWTQAQAHDLVIKSLDKLKEVREFLGEN 0"
## [1] "GGSLVLLEPGMELGITAALTGITSSAMDYGKKWWTQAQAHDLVIKSLDKLKEVKEFLGEN 0"
## [1] "EISLGLSETGMGKGIVAAVTRIAGNAVEQAKKLWAQAQAGNLDQRDPDVAKVIKEFMGEN 0"
## [1] "GSSLVLLEPVTGLGIAAALTGITSGSVEYAKKRWAQAEAHDLVNKSLDTVEEMNEFLYHN 0"
## [1] "GSSLVLLEPVTGLAIAAALTGITSGSVEYAKKRSAQAEAHELVNKSLDTVEEMNEFLYHN 0"
## [1] "GSSLVLLEPVTGLGIAAALTGITSGSVEYAKKRWAQAEAHELVNKSLDTVEEMNEFLYHN 0"
## [1] "GASLVLSATGMGLGTAAAVTGVSSNIVDYSSSLWAKAEADHLVSAGISGEELVMKVVGHA 0"
## [1] "GTSLIVTAVGLGVAMAGGLTSASAGITSTVNNSLDRKKVERIVGDYQAKMGDLNKCMKFI 0"
## [1] " "
## [1] "TVKLYYTVKDLVEAFKTLTGQIRAIRTAISNSDLGTQARNPASTGRSSGQ---------- 0"
## [1] "TIKLYNTGKELVEAFKTLRDQIRAIRRARSISQGAAAARSLTSTGRGSVQSAREMG---- 0"
## [1] "ISNFLSLAGNTYQLTRGIGKDIRALRRARANLQSVPHASASRPRVTEPISAESGEQ---- 0"
## [1] "ISNFLSLAGNTYQLTRGIGKDIRALRRARANLQSVPHASASRPRVTEPISAESGEQ---- 0"
## [1] "TPNFLSLIDNSYQVVQGIGKDIRAIRQAGSNSQLVP--SATPLEIIGQISAERDEE---- 0"
## [1] "ISNFISLRVNLVKFTEDTGKAIRAIRQARANPHSVPHVPASLHRVTEPVLATSFEE---- 0"
## [1] "IPNFISLRVNLVKFTEDTGKAIRAIRQARANPHSVPHVPASLHRVTEPVSATSFEE---- 0"
## [1] "IPNFISLRVNLVKFTEDTGKAIRAIRQARANPHSVSHVPASLHRVTEPVSATSVEERARV 0"
## [1] "ASEIKSVTEKCIPLIQRIGKHIRAIKKAKANPHLLAEAKSFMTAGTASVQCSGQVQ---- 0"
## [1] "KQGITNLRKFNLNKMKKHAYNRDFPCFDNVYEDGAMAGKAILTNANEIMRVMQIAN---- 0"
## [1] " "
## [1] "--------------------------GVPQMISR-ARIREIGLTVPFLEQDLRDLAQQFE 0"
## [1] "----------------------AAFTGSPLSTTRGARFGAAGVTSFFLAWDVYHLVADSM 0"
## [1] "--------------------VERVNEPSILEMSRGVKLTDVAPVSFFLVLDVVYLVYESK 0"
## [1] "--------------------VERVNEPRILEISRGVKLTDVAPVSFFLVLDVVYLVYESK 0"
## [1] "--------------------VGRVTESPTLEMSRGTKIVGVATGGILLVLDVVSLAYESK 0"
## [1] "--------MERVGEMERVGEMERVAESRTTEVIRGAKIVDKVFEGALFVLDVVSLVCQLK 0"
## [1] "--------MERVGEMERVGEMERVAESLTTEVIRGAKIVDKVFEGALFVLDVVGLVCQLK 0"
## [1] "VEMERVGEMERVGEMERVGEMERVAESRTTEVIRGAKIVDEVFEGALFVLDVVSLVCQLK 0"
## [1] "----------------------EAFGGTALVMTKGARIAGAATAGVLLLADVISLVKESK 0"
## [1] "-----------------------VAGTTAARAVQIASISTGVLTGLFVGMDIYFVAKDSH 0"
## [1] " "
## [1] "VLKDGAKTESGGALRDLAHKLEENL---------------------- 13"
## [1] "DLYDGAKTESAGALRELCHKLDDKVKIFEQIYKALQSDLPQ------ 13"
## [1] "HLHEGAKSETAEELKKVAQELEEKLNILNNNYKILQADQEL------ 13"
## [1] "HLHEGAKSETAEELKKVAQELEEKLNILNNNYKILQAGQEL------ 13"
## [1] "HLHEGAKSESAEELKERAQELEGKLNILTKIHEILQADQEL------ 13"
## [1] "HLHEGAKSKTAEELKKVAQELEKKLNILNKKYETLSQEP-------- 13"
## [1] "HLHEGAKSKTAEELKKVAQELEKKLNILNKKYETLSQEP-------- 13"
## [1] "HLHEGAKSKTAEELKKVAQELEKKLNILNKKYETLRQEP-------- 13"
## [1] "HLHEGAKAESGKELRQRAQELEKKLEELTQIHKGLQLPPN------- 13"
## [1] "ELKNGAKSEFAAKIREVAEQLHEGLMELNVIRDELRMTTTVKEDQTS 13"
## [1] " "
class(apol1.align) <- "AAMultipleAlignment"
ggmsa::ggmsa(apol1.align,
start = 400,
end = 500)
#Making a Subset of Sequences
names(apol1.vector.ss)
## [1] "NP_001130012.1" "XP_018873842.1" "NP_808412.1" "NP_001025309.1"
## [5] "XP_006242011.1" "XP_003264742" "XP_025255312" "XP_011834337"
## [9] "NP_001289030.1" "XP_008588416"
names.focal <- c("NP_001130012.1","NP_808412.1","NP_001025309.1","NP_001289030.1", "XP_018873842.1","XP_006242011.1")
apol1.vector.ss[names.focal]
## AAStringSet object of length 6:
## width seq names
## [1] 398 MEGAALLRVSVLCIWMSALFLGV...QELEEKLNILNNNYKILQADQEL NP_001130012.1
## [2] 318 MDWNEILEDIRKVERRIVEKFIE...KDGAKTESGGALRDLAHKLEENL NP_808412.1
## [3] 326 MEWLDDFRLSETQREDEEDLEGL...LMELNVIRDELRMTTTVKEDQTS NP_001025309.1
## [4] 406 MEGAALLRLFVLCIWTSGLFLGV...VAQELEKKLNILNKKYETLRQEP NP_001289030.1
## [5] 398 MEGAALLRVCVLCIWMNALFLGV...QELEEKLNILNNNYKILQAGQEL XP_018873842.1
## [6] 462 MSCRWNYRRLSHLTWVLGTELWS...HKLDDKVKIFEQIYKALQSDLPQ XP_006242011.1
apol1.vector.ss.subset <- apol1.vector.ss[names.focal]
apol1.align.subset <- msa(apol1.vector.ss.subset,
method = "ClustalW")
## use default substitution matrix
class(apol1.align.subset) <- "AAMultipleAlignment"
apol1.align.subset.seqinr <- msaConvert(apol1.align.subset, type = "seqinr::alignment")
ggmsa::ggmsa(apol1.align.subset,
start = 200,
end = 270)
apol1.subset.dist <- seqinr::dist.alignment(apol1.align.subset.seqinr,
matrix = "identity")
is(apol1.subset.dist)
## [1] "dist" "oldClass"
class(apol1.subset.dist)
## [1] "dist"
apol1.subset.dist.rounded <- round(apol1.subset.dist,
digits = 3)
apol1.subset.dist.rounded
## NP_001130012.1 XP_018873842.1 NP_001289030.1 NP_808412.1
## XP_018873842.1 0.159
## NP_001289030.1 0.614 0.620
## NP_808412.1 0.852 0.850 0.865
## XP_006242011.1 0.844 0.844 0.860 0.667
## NP_001025309.1 0.894 0.893 0.899 0.909
## XP_006242011.1
## XP_018873842.1
## NP_001289030.1
## NP_808412.1
## XP_006242011.1
## NP_001025309.1 0.891
tree_subset <- nj(apol1.subset.dist)
plot.phylo(tree_subset, main="Phylogenetic Tree",
type = "phylogram",
use.edge.length = F)
mtext(text = "APOL1 Gene Tree")