https://docs.google.com/spreadsheets/d/1NE5nCe6rFW9X-ck4PhIlGwUuhIO2BhLeE9mlcn51-M0/edit#gid=0 https://rpubs.com/lowbrowR/844554
#Introduction # Q9BRR0 (ZKSC3_HUMAN): The gene, SLC30A9 plays a key role in activating transcription for Wnt-responsive genes and is a secondary coactivator for nuclear receptors.
#Resources/References #https://www.genecards.org/cgi-bin/carddisp.pl?gene=SLC30A9#:~:text=UniProtKB%2FSwiss%2DProt%20Summary%20for%20SLC30A9%20Gene&text=Functions%20as%20a%20secondary%20coactivator,responsive%20genes%20(By%20similarity). #https://www.uniprot.org/uniprot/Q9BRR0
library(BiocManager)
## Bioconductor version '3.13' is out-of-date; the current release version '3.14'
## is available with R version '4.1'; see https://bioconductor.org/install
#install("drawProteins")
library(drawProteins)
# github packages
library(compbio4all)
library(ggmsa)
## Registered S3 methods overwritten by 'ggalt':
## method from
## grid.draw.absoluteGrob ggplot2
## grobHeight.absoluteGrob ggplot2
## grobWidth.absoluteGrob ggplot2
## grobX.absoluteGrob ggplot2
## grobY.absoluteGrob ggplot2
# CRAN packages
library(rentrez)
library(seqinr)
library(ape)
##
## Attaching package: 'ape'
## The following objects are masked from 'package:seqinr':
##
## as.alignment, consensus
library(pander)
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.1.2
# Bioconductor packages
library(msa)
## Loading required package: Biostrings
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind, colnames,
## dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
## union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
## Loading required package: stats4
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
## Loading required package: IRanges
##
## Attaching package: 'IRanges'
## The following object is masked from 'package:grDevices':
##
## windows
## Loading required package: XVector
## Loading required package: GenomeInfoDb
##
## Attaching package: 'Biostrings'
## The following object is masked from 'package:ape':
##
## complement
## The following object is masked from 'package:seqinr':
##
## translate
## The following object is masked from 'package:base':
##
## strsplit
##
## Attaching package: 'msa'
## The following object is masked from 'package:BiocManager':
##
## version
library(drawProteins)
## Biostrings
library(Biostrings)
#install("HGNChelper")
library(HGNChelper)
## Warning: package 'HGNChelper' was built under R version 4.1.2
#Accession Numbers #RefSeq, Refseq Homlogene, UniProt and PDB were used to obtain the accession numbers. #Accession number table
dio1_table<-c("NC_000004.12", "Q6PML9","NA","Homo sapiens" , "Human", "SLC30A9",
"NC_036883.1", "CK820", "NA", "Pan troglodytes" , "Chimpanzee","SLC30A9",
"NC_000071", "C57BL","NA","Mus musculus", "Mouse", "SLC30A9",
"NC_036907.1", "G7P5H4","NA","Rattus norvegicus", "Organutan", "SLC30A9",
"NC_044606.1", "G3QHK4","NA","Gorilla gorilla gorilla","Gorilla", "SLC30A9",
"NC_048243.1","P9597","NA","Pan paniscus", "Pygmy chimpanzee", "SLC30A9",
"NC_030677.2", "F6ZBB2", "NA", "Xenopus tropicalis", "Frog", "SLC30A9",
"NC_051349.1", "D3ZWW5", "NA", "Rattus norvegicus", "Rat", "SLC30A9",
"NC_007125.7", "D9CICH", "NA", "Danio rerio", "Fish", "SLC30A9",
"NC_010450.4", "F6Q0R2", "NA", "Sus scrofa", "Pig", "SLC30A9")
pander::pander(dio1_table)
NC_000004.12, Q6PML9, NA, Homo sapiens, Human, SLC30A9, NC_036883.1, CK820, NA, Pan troglodytes, Chimpanzee, SLC30A9, NC_000071, C57BL, NA, Mus musculus, Mouse, SLC30A9, NC_036907.1, G7P5H4, NA, Rattus norvegicus, Organutan, SLC30A9, NC_044606.1, G3QHK4, NA, Gorilla gorilla gorilla, Gorilla, SLC30A9, NC_048243.1, P9597, NA, Pan paniscus, Pygmy chimpanzee, SLC30A9, NC_030677.2, F6ZBB2, NA, Xenopus tropicalis, Frog, SLC30A9, NC_051349.1, D3ZWW5, NA, Rattus norvegicus, Rat, SLC30A9, NC_007125.7, D9CICH, NA, Danio rerio, Fish, SLC30A9, NC_010450.4, F6Q0R2, NA, Sus scrofa, Pig and SLC30A9
human_fasta <- rentrez::entrez_fetch(db = "gene",
id = "NC_000004.12",
rettype = "fasta")
chimp_fasta <- rentrez::entrez_fetch(db = "gene",
id = "NC_036883.1",
rettype = "fasta")
mouse_fasta <- rentrez::entrez_fetch(db = "gene",
id = "NC_000071",
rettype = "fasta")
organutan_fasta <- rentrez::entrez_fetch(db = "gene",
id = "NC_036907.1",
rettype = "fasta")
gorilla_fasta <- rentrez::entrez_fetch(db = "gene",
id = "NC_044606.1",
rettype = "fasta")
pygmy_fasta <- rentrez::entrez_fetch(db = "gene",
id = "NC_048243.1",
rettype = "fasta")
frog_fasta <- rentrez::entrez_fetch(db = "gene",
id = "NC_030677.2",
rettype = "fasta")
rat_fasta <- rentrez::entrez_fetch(db = "gene",
id = "NC_051349.1",
rettype = "fasta")
fish_fasta <- rentrez::entrez_fetch(db = "gene",
id = "NC_007125.7",
rettype = "fasta")
pig_fasta <- rentrez::entrez_fetch(db = "gene",
id = "NC_010450.4",
rettype = "fasta")
SLC30A9_FASTA <- c(pig_fasta, fish_fasta, rat_fasta, frog_fasta, pygmy_fasta, gorilla_fasta, organutan_fasta, mouse_fasta, chimp_fasta, human_fasta)
length(SLC30A9_FASTA)
## [1] 10
#FASTA Cleaning
for(i in 1:length(SLC30A9_FASTA)){
SLC30A9_FASTA[[i]] <- compbio4all::fasta_cleaner(SLC30A9_FASTA[[i]], parse = F)
}
#Protein Diagram; ZNF306; Q9BRR0 (ZKSC3_HUMAN)
Q6PML9_json <- drawProteins::get_features("Q9BRR0")
## [1] "Download has worked"
is(Q6PML9_json)
## [1] "list" "vector" "list_OR_List" "vector_OR_Vector"
## [5] "vector_OR_factor"
my_prot_df <- drawProteins::feature_to_dataframe(Q6PML9_json)
is(my_prot_df)
## [1] "data.frame" "list" "oldClass" "vector"
## [5] "list_OR_List" "vector_OR_Vector" "vector_OR_factor"
my_prot_df[,-2]
## type begin end length accession entryName taxid order
## featuresTemp CHAIN 1 538 537 Q9BRR0 ZKSC3_HUMAN 9606 1
## featuresTemp.1 DOMAIN 46 128 82 Q9BRR0 ZKSC3_HUMAN 9606 1
## featuresTemp.2 DOMAIN 214 274 60 Q9BRR0 ZKSC3_HUMAN 9606 1
## featuresTemp.3 ZN_FING 314 336 22 Q9BRR0 ZKSC3_HUMAN 9606 1
## featuresTemp.4 ZN_FING 342 364 22 Q9BRR0 ZKSC3_HUMAN 9606 1
## featuresTemp.5 ZN_FING 370 392 22 Q9BRR0 ZKSC3_HUMAN 9606 1
## featuresTemp.6 ZN_FING 398 420 22 Q9BRR0 ZKSC3_HUMAN 9606 1
## featuresTemp.7 ZN_FING 426 448 22 Q9BRR0 ZKSC3_HUMAN 9606 1
## featuresTemp.8 ZN_FING 480 502 22 Q9BRR0 ZKSC3_HUMAN 9606 1
## featuresTemp.9 ZN_FING 508 530 22 Q9BRR0 ZKSC3_HUMAN 9606 1
## featuresTemp.10 REGION 226 274 48 Q9BRR0 ZKSC3_HUMAN 9606 1
## featuresTemp.11 COMPBIAS 237 274 37 Q9BRR0 ZKSC3_HUMAN 9606 1
## featuresTemp.12 MOD_RES 42 42 0 Q9BRR0 ZKSC3_HUMAN 9606 1
## featuresTemp.13 MOD_RES 207 207 0 Q9BRR0 ZKSC3_HUMAN 9606 1
## featuresTemp.14 MOD_RES 449 449 0 Q9BRR0 ZKSC3_HUMAN 9606 1
## featuresTemp.15 CROSSLNK 171 171 0 Q9BRR0 ZKSC3_HUMAN 9606 1
## featuresTemp.16 VAR_SEQ 1 148 147 Q9BRR0 ZKSC3_HUMAN 9606 1
## featuresTemp.17 VARIANT 3 3 0 Q9BRR0 ZKSC3_HUMAN 9606 1
## featuresTemp.18 VARIANT 33 33 0 Q9BRR0 ZKSC3_HUMAN 9606 1
## featuresTemp.19 VARIANT 34 34 0 Q9BRR0 ZKSC3_HUMAN 9606 1
## featuresTemp.20 VARIANT 189 189 0 Q9BRR0 ZKSC3_HUMAN 9606 1
## featuresTemp.21 VARIANT 200 200 0 Q9BRR0 ZKSC3_HUMAN 9606 1
## featuresTemp.22 VARIANT 200 200 0 Q9BRR0 ZKSC3_HUMAN 9606 1
## featuresTemp.23 VARIANT 200 200 0 Q9BRR0 ZKSC3_HUMAN 9606 1
## featuresTemp.24 VARIANT 246 246 0 Q9BRR0 ZKSC3_HUMAN 9606 1
## featuresTemp.25 CONFLICT 206 207 1 Q9BRR0 ZKSC3_HUMAN 9606 1
## featuresTemp.26 CONFLICT 348 350 2 Q9BRR0 ZKSC3_HUMAN 9606 1
## featuresTemp.27 CONFLICT 437 437 0 Q9BRR0 ZKSC3_HUMAN 9606 1
## featuresTemp.28 CONFLICT 459 479 20 Q9BRR0 ZKSC3_HUMAN 9606 1
my_canvas <- draw_canvas(my_prot_df)
my_canvas <- draw_chains(my_canvas, my_prot_df,
label_size = 2.5)
my_canvas <- draw_domains(my_canvas, my_prot_df)
my_canvas
my_prot_df <- drawProteins::feature_to_dataframe(Q6PML9_json)
my_canvas <- draw_canvas(my_prot_df)
my_canvas <- draw_chains(my_canvas, my_prot_df, label_size = 2.5)
my_canvas <- draw_recept_dom(my_canvas, my_prot_df)
my_canvas
#DotPlot
Q9BRR0_FASTA <- rentrez::entrez_fetch(id = "Q9BRR0",
db = "gene",
rettype="fasta")
Q9BRR0_vector <- fasta_cleaner(Q9BRR0_FASTA)
Q9BRR0_string <- fasta_cleaner(Q9BRR0_FASTA, parse = F)
table(Q9BRR0_vector)
## Q9BRR0_vector
## - ( ) , . : ; [ ] _ 0 1 2 3 4 5 6 7 8 9 a A b c C
## 7 41 3 3 3 4 9 5 1 1 1 7 17 4 2 2 1 1 1 8 3 37 9 1 10 4
## d D e f g h H i I l L m M n N o O p r s S t T u y
## 2 2 33 8 1 6 1 13 3 14 1 14 3 19 13 17 3 4 21 21 1 20 5 1 13
dotPlot(Q9BRR0_vector, Q9BRR0_vector)
#This source shows one repeat in the protein. https://prosite.expasy.org/rule/PRU00187
#Protein feature prediction #ZNF306; Q9BRR0 (ZKSC3_HUMAN): Uniprot had some information on the Q9BRR0 zinc finger protein. It is a transcriptional factor that binds to a 5’ - 3’ sequence that represses the process of autophagy which is the process of cleaning/removing old and damaged cells. It also can progress cancer by acting as a transcription activator. # Source: https://www.uniprot.org/uniprot/Q9BRR0
#Predict Protein Fold
# enter once
aa.1.1 <- c("A","R","N","D","C","Q","E","G","H","I",
"L","K","M","F","P","S","T","W","Y","V")
# enter again
aa.1.2 <- c("A","R","N","D","C","Q","E","G","H","I",
"L","K","M","F","P","S","T","W","Y","V")
# alpha proteins
alpha <- c(285, 53, 97, 163, 22, 67, 134, 197, 111, 91,
221, 249, 48, 123, 82, 122, 119, 33, 63, 167)
sum(alpha) == 2447
## [1] TRUE
beta <- c(203, 67, 139, 121, 75, 122, 86, 297, 49, 120,
177, 115, 16, 85, 127, 341, 253, 44, 110, 229)
sum(beta) == 2776
## [1] TRUE
# alpha + beta
a.plus.b <- c(175, 78, 120, 111, 74, 74, 86, 171, 33, 93,
110, 112, 25, 52, 71, 126, 117, 30, 108, 123)
# alpha/beta
a.div.b <- c(361, 146, 183, 244, 63, 114, 257, 377, 107, 239, 339, 321, 91, 158, 188, 327, 238, 72, 130, 378)
sum(a.div.b) == 4333
## [1] TRUE
pander(data.frame(aa.1.1, alpha, beta, a.plus.b, a.div.b))
| aa.1.1 | alpha | beta | a.plus.b | a.div.b |
|---|---|---|---|---|
| A | 285 | 203 | 175 | 361 |
| R | 53 | 67 | 78 | 146 |
| N | 97 | 139 | 120 | 183 |
| D | 163 | 121 | 111 | 244 |
| C | 22 | 75 | 74 | 63 |
| Q | 67 | 122 | 74 | 114 |
| E | 134 | 86 | 86 | 257 |
| G | 197 | 297 | 171 | 377 |
| H | 111 | 49 | 33 | 107 |
| I | 91 | 120 | 93 | 239 |
| L | 221 | 177 | 110 | 339 |
| K | 249 | 115 | 112 | 321 |
| M | 48 | 16 | 25 | 91 |
| F | 123 | 85 | 52 | 158 |
| P | 82 | 127 | 71 | 188 |
| S | 122 | 341 | 126 | 327 |
| T | 119 | 253 | 117 | 238 |
| W | 33 | 44 | 30 | 72 |
| Y | 63 | 110 | 108 | 130 |
| V | 167 | 229 | 123 | 378 |
alpha.prop <- alpha/sum(alpha)
beta.prop <- beta/sum(beta)
a.plus.b.prop <- a.plus.b/sum(a.plus.b)
a.div.b <- a.div.b/sum(a.div.b)
aa.prop <- data.frame(alpha.prop,
beta.prop,
a.plus.b.prop,
a.div.b)
#row labels
row.names(aa.prop) <- aa.1.1
aa.prop
## alpha.prop beta.prop a.plus.b.prop a.div.b
## A 0.116469146 0.073126801 0.09264161 0.08331410
## R 0.021659174 0.024135447 0.04129169 0.03369490
## N 0.039640376 0.050072046 0.06352567 0.04223402
## D 0.066612178 0.043587896 0.05876125 0.05631202
## C 0.008990601 0.027017291 0.03917417 0.01453958
## Q 0.027380466 0.043948127 0.03917417 0.02630972
## E 0.054760932 0.030979827 0.04552673 0.05931225
## G 0.080506743 0.106988473 0.09052409 0.08700669
## H 0.045361667 0.017651297 0.01746956 0.02469421
## I 0.037188394 0.043227666 0.04923240 0.05515809
## L 0.090314671 0.063760807 0.05823187 0.07823679
## K 0.101757254 0.041426513 0.05929063 0.07408262
## M 0.019615856 0.005763689 0.01323452 0.02100162
## F 0.050265631 0.030619597 0.02752779 0.03646434
## P 0.033510421 0.045749280 0.03758602 0.04338795
## S 0.049856968 0.122838617 0.06670196 0.07546734
## T 0.048630977 0.091138329 0.06193753 0.05492730
## W 0.013485901 0.015850144 0.01588142 0.01661666
## Y 0.025745811 0.039625360 0.05717311 0.03000231
## V 0.068246833 0.082492795 0.06511382 0.08723748
NP_000783 <- rentrez::entrez_fetch(id = "NC_000004.12",
db = "gene",
rettype = "fasta")
NP_000783.freq.table <- table(NP_000783)/length(NP_000783)
# clean and turn into vector
NP_000783 <- compbio4all::fasta_cleaner(NP_000783, parse = TRUE)
NP_000784 <- rentrez::entrez_fetch(id = "NC_036883.1",
db = "gene",
rettype = "fasta")
NP_000784.freq.table <- table(NP_000784)/length(NP_000784)
# clean and turn into vector
NP_000784 <- compbio4all::fasta_cleaner(NP_000784, parse = TRUE)
NP_000783.freq.table <- table(NP_000783)/length(NP_000783)
#table_to_vector <- function(table_x){table_names <- attr(table_x, "dimnames")[[1]] table_vect <- as.vector(table_x) names(table_vect) <- table_names return(table_vect)}
slc30a9.human.aa.freq <- table_to_vector(NP_000783.freq.table)
aa.names <- names(slc30a9.human.aa.freq)
any(aa.names == "U")
i.U <- which(aa.names == "U")
aa.names[i.U]
slc30a9.human.aa.freq[i.U]
which(aa.names %in% c("U"))
slc30a9.human.aa.freq <- slc30a9.human.aa.freq[-i.U]
# sum is just under 1; ok for our purposes
sum(slc30a9.human.aa.freq)
aa.prop <- slc30a9.human.aa.freq
pander::pander(NP_000784)
table_to_vector <- function(table_x){
table_names <- attr(table_x, "dimnames")[[1]]
table_vect <- as.vector(table_x)
names(table_vect) <- table_names
return(table_vect)}
slc30a9_human_vector <- human_fasta
slc30a9_human_table <- table(slc30a9_human_vector)/length(slc30a9_human_vector)
slc30a9.human.aa.freq <- table_to_vector(slc30a9_human_table)
slc30a9.human.aa.freq
## \n1. A12M1\nadenovirus-12 chromosome modification site 1C [Homo sapiens (human)]\nOther Designations: Adenovirus-12 chromosome modification site-1q1\nChromosome: 1; Location: 1q42-q43\nThis record was discontinued.\nID: 4\n\n2. SERPINA3\nOfficial Symbol: SERPINA3 and Name: serpin family A member 3 [Homo sapiens (human)]\nOther Aliases: AACT, ACT, GIG24, GIG25\nOther Designations: alpha-1-antichymotrypsin; cell growth-inhibiting gene 24/25 protein; growth-inhibiting protein 24; growth-inhibiting protein 25; serine (or cysteine) proteinase inhibitor, clade A, member 3; serpin A3; serpin peptidase inhibitor, clade A (alpha-1 antiproteinase, antitrypsin), member 3\nChromosome: 14; Location: 14q32.13\nAnnotation: Chromosome 14 NC_000014.9 (94612391..94624053)\nMIM: 107280\nID: 12\n\n
## 1
aa.names <- names(slc30a9.human.aa.freq)
i.U <- which(aa.names == "U")
aa.names[i.U]
## character(0)
slc30a9.human.aa.freq[i.U]
## named numeric(0)
aa.prop$slc30a9.human.aa.freq <- slc30a9.human.aa.freq
pander::pander(aa.prop)
| alpha.prop | beta.prop | a.plus.b.prop | a.div.b | slc30a9.human.aa.freq | |
|---|---|---|---|---|---|
| A | 0.1165 | 0.07313 | 0.09264 | 0.08331 | 1 |
| R | 0.02166 | 0.02414 | 0.04129 | 0.03369 | 1 |
| N | 0.03964 | 0.05007 | 0.06353 | 0.04223 | 1 |
| D | 0.06661 | 0.04359 | 0.05876 | 0.05631 | 1 |
| C | 0.008991 | 0.02702 | 0.03917 | 0.01454 | 1 |
| Q | 0.02738 | 0.04395 | 0.03917 | 0.02631 | 1 |
| E | 0.05476 | 0.03098 | 0.04553 | 0.05931 | 1 |
| G | 0.08051 | 0.107 | 0.09052 | 0.08701 | 1 |
| H | 0.04536 | 0.01765 | 0.01747 | 0.02469 | 1 |
| I | 0.03719 | 0.04323 | 0.04923 | 0.05516 | 1 |
| L | 0.09031 | 0.06376 | 0.05823 | 0.07824 | 1 |
| K | 0.1018 | 0.04143 | 0.05929 | 0.07408 | 1 |
| M | 0.01962 | 0.005764 | 0.01323 | 0.021 | 1 |
| F | 0.05027 | 0.03062 | 0.02753 | 0.03646 | 1 |
| P | 0.03351 | 0.04575 | 0.03759 | 0.04339 | 1 |
| S | 0.04986 | 0.1228 | 0.0667 | 0.07547 | 1 |
| T | 0.04863 | 0.09114 | 0.06194 | 0.05493 | 1 |
| W | 0.01349 | 0.01585 | 0.01588 | 0.01662 | 1 |
| Y | 0.02575 | 0.03963 | 0.05717 | 0.03 | 1 |
| V | 0.06825 | 0.08249 | 0.06511 | 0.08724 | 1 |
#Functions to calculate similarities
# Corrleation used in Chou adn Zhange 1992.
chou_cor <- function(x,y){numerator <- sum(x*y)
denominator <- sqrt((sum(x^2))*(sum(y^2)))
result <- numerator/denominator
return(result)}
# Cosine similarity used in Higgs and Attwood (2005).
#chou_cosine <- function(z.1, z.2){z.1.abs <- sqrt(sum(z.1^2))z.2.abs <- sqrt(sum(z.2^2)) my.cosine <- sum(z.1*z.2)/(z.1.abs*z.2.abs) return(my.cosine)}
#Correlation
cor(NP_000784.freq.table, NP_000783.freq.table)
#Cosine similarity
#install.packages(lsa)
library(lsa)
cos.mat.numeric <- habk.df2[ , c(-1, -2, -3)]
cos.mat.centered <- cos.mat.numeric
cos.mat.standard <- cos.mat.centered
cos.mat.dist <- cos.mat.standard
cos.mat <- lsa::cosine(t(cos.mat.standard))
cos.mat.rnd <- round(cos.mat,2)
mdendro.complete <- mdendro::linkage(prox = cos.mat.dist,
type.prox = "similarity",
weighted =T,
method = "complete")
mdendro.arith <- mdendro::linkage(prox = cos.mat.dist,
type.prox = "similarity",
weighted =T,
method = "arithmetic")
plot(mdendro.arith)
aa.prop.flipped <- t(aa.prop$slc30a9.human.aa.freq)
round(aa.prop.flipped,2)
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13] [,14]
## [1,] 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [,15] [,16] [,17] [,18] [,19] [,20]
## [1,] 1 1 1 1 1 1
#Distance
#Note: we need to flip the dataframe on its side using a command called t()
aa.prop.flipped <- t(aa.prop$slc30a9.human.aa.freq)
round(aa.prop.flipped,2)
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13] [,14]
## [1,] 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [,15] [,16] [,17] [,18] [,19] [,20]
## [1,] 1 1 1 1 1 1
dist(aa.prop.flipped)
## dist(0)
#Percent Identity Comparisons (PID)
names(SLC30A9_FASTA)
## NULL
length(SLC30A9_FASTA)
## [1] 10
names(SLC30A9_vector) <- names(SLC30A9_FASTA)
pids <- c(1, NA, NA, NA,
pid(align01.02), 1, NA, NA,
pid(align01.05), pid(align02.05), 1, NA,
pid(align01.06), pid(align02.06), pid(align05.06), 1)
mat <- matrix(pids, nrow = 4, byrow = T)
row.names(mat) <- c("Homo","Pan","Mus","Rattus", "Gorilla", "Pan", "Xenopus", "Rattus", "Danio", "Sus")
colnames(mat) <- c("Homo","Pan","Mus","Rattus", "Gorilla", "Pan", "Xenopus", "Rattus", "Danio", "Sus")
pander::pander(mat)
slc30a9_vector <- c(SLC30A9_FASTA)
names(slc30a9_vector) <- names(SLC30A9_FASTA)
#PID methods comparison:
#Human vs Chimp PID comparison: ######################################## # Program: needle # Rundate: Fri 17 Dec 2021 23:59:53 # Commandline: needle # -auto # -stdout # -asequence emboss_needle-I20211218-000323-0340-3146481-p2m.asequence # -bsequence emboss_needle-I20211218-000323-0340-3146481-p2m.bsequence # -datafile EBLOSUM62 # -gapopen 10.0 # -gapextend 0.5 # -endopen 10.0 # -endextend 0.5 # -aformat3 pair # -sprotein1 # -sprotein2 # Align_format: pair # Report_file: stdout ########################################
#======================================= # # Aligned_sequences: 2 # 1: EMBOSS_001 # 2: EMBOSS_001 # Matrix: EBLOSUM62 # Gap_penalty: 10.0 # Extend_penalty: 0.5 # # Length: 569 # Identity: 545/569 (95.8%) # Similarity: 556/569 (97.7%) # Gaps: 1/569 ( 0.2%) # Score: 2781.0 # # #======================================= #https://www.ebi.ac.uk/Tools/services/web/toolresult.ebi?jobId=emboss_needle-I20211218-000323-0340-3146481-p2m
#Multiple sequence alignment
hShroom2 <- entrez_fetch(db = "protein",
id = "Q6PML9",
rettype = "fasta")
mShroom3 <- entrez_fetch(db = "protein",
id = "CK820",
rettype = "fasta")
hShroom4 <- entrez_fetch(db = "protein",
id = "C57BL",
rettype = "fasta")
sShroom1 <- entrez_fetch(db = "protein",
id = "G7P5H4",
rettype = "fasta")
fasta_cleaner <- function(fasta_object, parse = TRUE){
fasta_object <- sub("^(>)(.*?)(\\n)(.*)(\\n\\n)","\\4",fasta_object)
fasta_object <- gsub("\n", "", fasta_object)
if(parse == TRUE){
fasta_object <- stringr::str_split(fasta_object,
pattern = "",
simplify = FALSE)
}
return(fasta_object[[1]])
}
hShroom4 <- fasta_cleaner(hShroom4, parse = F)
mShroom3 <- fasta_cleaner(mShroom3, parse = F)
hShroom2 <- fasta_cleaner(hShroom2, parse = F)
sShroom1 <- fasta_cleaner(sShroom1, parse = F)
shrooms_align <- msa(shrooms_vector_ss,
method = "ClustalW")
class(shrooms_align) <- "AAMultipleAlignment"
shrooms_align_seqinr <- msaConvert(shrooms_align, type = "seqinr::alignment")
compbio4all::print_msa(alignment = shrooms_align_seqinr,
chunksize = 60)
# key step - must have class set properly
class(shrooms_align) <- "AAMultipleAlignment"
# run ggmsa
ggmsa::ggmsa(shrooms_align, # shrooms_align, NOT shrooms_align_seqinr
start = 2000,
end = 2100)
msaPrettyPrint(shrooms_align_subset, # alignment
file = "shroom_msa_subset.pdf", # file name
y=c(2030, 2100), # range
askForOverwrite=FALSE)
#Distance matrix
shrooms_subset_dist <- seqinr::dist.alignment(shrooms_align_subset_seqinr,
matrix = "identity")
shrooms_subset_dist_alt <- matrix(data = NA,
nrow = 5,
ncol = 5)
distances <- c(0.8260049,
0.8478722, 0.9000568,
0.9244596, 0.9435187, 0.9372139,
0.9238779, 0.9370038, 0.9323225,0.9413209)
shrooms_subset_dist_alt[lower.tri(shrooms_subset_dist_alt)] <- distances
seqnames <- c("EAA12598","ABA81834","XP_392427", "XP_783573","CAA78718")
colnames(shrooms_subset_dist_alt) <- seqnames
row.names(shrooms_subset_dist_alt) <- seqnames
shrooms_subset_dist_alt <- as.dist(shrooms_subset_dist_alt)
shrooms_subset_dist <- shrooms_subset_dist_alt
shrooms_subset_dist_rounded <- round(shrooms_subset_dist,
digits = 3)
#Phylognetic trees of sequences
# Note - not using rounded values
tree_subset <- nj(shrooms_subset_dist)
# plot tree
plot.phylo(tree_subset, main="Phylogenetic Tree",
type = "unrooted",
use.edge.length = F)
# add label
mtext(text = "Shroom family gene tree - UNrooted, no branch lengths")
# plot tree
plot.phylo(tree_subset, main="Phylogenetic Tree",
use.edge.length = F)
# add label
mtext(text = "Shroom family gene tree - rooted, no branch lenths")