https://docs.google.com/spreadsheets/d/1NE5nCe6rFW9X-ck4PhIlGwUuhIO2BhLeE9mlcn51-M0/edit#gid=0 https://rpubs.com/lowbrowR/844554

#Introduction # Q9BRR0 (ZKSC3_HUMAN): The gene, SLC30A9 plays a key role in activating transcription for Wnt-responsive genes and is a secondary coactivator for nuclear receptors.

#Resources/References #https://www.genecards.org/cgi-bin/carddisp.pl?gene=SLC30A9#:~:text=UniProtKB%2FSwiss%2DProt%20Summary%20for%20SLC30A9%20Gene&text=Functions%20as%20a%20secondary%20coactivator,responsive%20genes%20(By%20similarity). #https://www.uniprot.org/uniprot/Q9BRR0

library(BiocManager)
## Bioconductor version '3.13' is out-of-date; the current release version '3.14'
##   is available with R version '4.1'; see https://bioconductor.org/install
#install("drawProteins")
library(drawProteins)
# github packages
library(compbio4all)
library(ggmsa)
## Registered S3 methods overwritten by 'ggalt':
##   method                  from   
##   grid.draw.absoluteGrob  ggplot2
##   grobHeight.absoluteGrob ggplot2
##   grobWidth.absoluteGrob  ggplot2
##   grobX.absoluteGrob      ggplot2
##   grobY.absoluteGrob      ggplot2
# CRAN packages
library(rentrez)
library(seqinr)
library(ape)
## 
## Attaching package: 'ape'
## The following objects are masked from 'package:seqinr':
## 
##     as.alignment, consensus
library(pander)


library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.1.2
# Bioconductor packages

library(msa)
## Loading required package: Biostrings
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
##     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
## Loading required package: stats4
## 
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## Loading required package: IRanges
## 
## Attaching package: 'IRanges'
## The following object is masked from 'package:grDevices':
## 
##     windows
## Loading required package: XVector
## Loading required package: GenomeInfoDb
## 
## Attaching package: 'Biostrings'
## The following object is masked from 'package:ape':
## 
##     complement
## The following object is masked from 'package:seqinr':
## 
##     translate
## The following object is masked from 'package:base':
## 
##     strsplit
## 
## Attaching package: 'msa'
## The following object is masked from 'package:BiocManager':
## 
##     version
library(drawProteins)

## Biostrings
library(Biostrings)
#install("HGNChelper")
library(HGNChelper)
## Warning: package 'HGNChelper' was built under R version 4.1.2

#Accession Numbers #RefSeq, Refseq Homlogene, UniProt and PDB were used to obtain the accession numbers. #Accession number table

dio1_table<-c("NC_000004.12", "Q6PML9","NA","Homo sapiens" , "Human", "SLC30A9",
              "NC_036883.1", "CK820", "NA", "Pan troglodytes" , "Chimpanzee","SLC30A9",
              "NC_000071", "C57BL","NA","Mus musculus", "Mouse", "SLC30A9",
              "NC_036907.1", "G7P5H4","NA","Rattus norvegicus", "Organutan",      "SLC30A9",
              "NC_044606.1", "G3QHK4","NA","Gorilla gorilla gorilla","Gorilla", "SLC30A9",
              "NC_048243.1","P9597","NA","Pan paniscus", "Pygmy chimpanzee", "SLC30A9", 
              "NC_030677.2", "F6ZBB2", "NA", "Xenopus tropicalis", "Frog", "SLC30A9", 
              "NC_051349.1", "D3ZWW5", "NA", "Rattus norvegicus", "Rat", "SLC30A9", 
              "NC_007125.7", "D9CICH", "NA", "Danio rerio", "Fish", "SLC30A9", 
              "NC_010450.4", "F6Q0R2", "NA", "Sus scrofa", "Pig", "SLC30A9")
pander::pander(dio1_table)

NC_000004.12, Q6PML9, NA, Homo sapiens, Human, SLC30A9, NC_036883.1, CK820, NA, Pan troglodytes, Chimpanzee, SLC30A9, NC_000071, C57BL, NA, Mus musculus, Mouse, SLC30A9, NC_036907.1, G7P5H4, NA, Rattus norvegicus, Organutan, SLC30A9, NC_044606.1, G3QHK4, NA, Gorilla gorilla gorilla, Gorilla, SLC30A9, NC_048243.1, P9597, NA, Pan paniscus, Pygmy chimpanzee, SLC30A9, NC_030677.2, F6ZBB2, NA, Xenopus tropicalis, Frog, SLC30A9, NC_051349.1, D3ZWW5, NA, Rattus norvegicus, Rat, SLC30A9, NC_007125.7, D9CICH, NA, Danio rerio, Fish, SLC30A9, NC_010450.4, F6Q0R2, NA, Sus scrofa, Pig and SLC30A9

human_fasta <- rentrez::entrez_fetch(db = "gene", 
                          id = "NC_000004.12", 
                          rettype = "fasta")
chimp_fasta <- rentrez::entrez_fetch(db = "gene", 
                          id = "NC_036883.1", 
                          rettype = "fasta")
mouse_fasta <- rentrez::entrez_fetch(db = "gene", 
                          id = "NC_000071", 
                          rettype = "fasta")
organutan_fasta <- rentrez::entrez_fetch(db = "gene", 
                          id = "NC_036907.1", 
                          rettype = "fasta")
gorilla_fasta <- rentrez::entrez_fetch(db = "gene", 
                          id = "NC_044606.1", 
                          rettype = "fasta")
pygmy_fasta <- rentrez::entrez_fetch(db = "gene", 
                          id = "NC_048243.1", 
                          rettype = "fasta")
frog_fasta <- rentrez::entrez_fetch(db = "gene", 
                          id = "NC_030677.2", 
                          rettype = "fasta")
rat_fasta <- rentrez::entrez_fetch(db = "gene", 
                          id = "NC_051349.1", 
                          rettype = "fasta")
fish_fasta <- rentrez::entrez_fetch(db = "gene", 
                          id = "NC_007125.7", 
                          rettype = "fasta")
pig_fasta <- rentrez::entrez_fetch(db = "gene", 
                          id = "NC_010450.4", 
                          rettype = "fasta")
SLC30A9_FASTA <- c(pig_fasta, fish_fasta, rat_fasta, frog_fasta, pygmy_fasta, gorilla_fasta, organutan_fasta, mouse_fasta, chimp_fasta, human_fasta)
length(SLC30A9_FASTA)
## [1] 10

#FASTA Cleaning

for(i in 1:length(SLC30A9_FASTA)){
  SLC30A9_FASTA[[i]] <- compbio4all::fasta_cleaner(SLC30A9_FASTA[[i]], parse = F)
}

#Protein Diagram; ZNF306; Q9BRR0 (ZKSC3_HUMAN)

Q6PML9_json  <- drawProteins::get_features("Q9BRR0")
## [1] "Download has worked"
is(Q6PML9_json)
## [1] "list"             "vector"           "list_OR_List"     "vector_OR_Vector"
## [5] "vector_OR_factor"
my_prot_df <- drawProteins::feature_to_dataframe(Q6PML9_json)
is(my_prot_df)
## [1] "data.frame"       "list"             "oldClass"         "vector"          
## [5] "list_OR_List"     "vector_OR_Vector" "vector_OR_factor"
my_prot_df[,-2]
##                     type begin end length accession   entryName taxid order
## featuresTemp       CHAIN     1 538    537    Q9BRR0 ZKSC3_HUMAN  9606     1
## featuresTemp.1    DOMAIN    46 128     82    Q9BRR0 ZKSC3_HUMAN  9606     1
## featuresTemp.2    DOMAIN   214 274     60    Q9BRR0 ZKSC3_HUMAN  9606     1
## featuresTemp.3   ZN_FING   314 336     22    Q9BRR0 ZKSC3_HUMAN  9606     1
## featuresTemp.4   ZN_FING   342 364     22    Q9BRR0 ZKSC3_HUMAN  9606     1
## featuresTemp.5   ZN_FING   370 392     22    Q9BRR0 ZKSC3_HUMAN  9606     1
## featuresTemp.6   ZN_FING   398 420     22    Q9BRR0 ZKSC3_HUMAN  9606     1
## featuresTemp.7   ZN_FING   426 448     22    Q9BRR0 ZKSC3_HUMAN  9606     1
## featuresTemp.8   ZN_FING   480 502     22    Q9BRR0 ZKSC3_HUMAN  9606     1
## featuresTemp.9   ZN_FING   508 530     22    Q9BRR0 ZKSC3_HUMAN  9606     1
## featuresTemp.10   REGION   226 274     48    Q9BRR0 ZKSC3_HUMAN  9606     1
## featuresTemp.11 COMPBIAS   237 274     37    Q9BRR0 ZKSC3_HUMAN  9606     1
## featuresTemp.12  MOD_RES    42  42      0    Q9BRR0 ZKSC3_HUMAN  9606     1
## featuresTemp.13  MOD_RES   207 207      0    Q9BRR0 ZKSC3_HUMAN  9606     1
## featuresTemp.14  MOD_RES   449 449      0    Q9BRR0 ZKSC3_HUMAN  9606     1
## featuresTemp.15 CROSSLNK   171 171      0    Q9BRR0 ZKSC3_HUMAN  9606     1
## featuresTemp.16  VAR_SEQ     1 148    147    Q9BRR0 ZKSC3_HUMAN  9606     1
## featuresTemp.17  VARIANT     3   3      0    Q9BRR0 ZKSC3_HUMAN  9606     1
## featuresTemp.18  VARIANT    33  33      0    Q9BRR0 ZKSC3_HUMAN  9606     1
## featuresTemp.19  VARIANT    34  34      0    Q9BRR0 ZKSC3_HUMAN  9606     1
## featuresTemp.20  VARIANT   189 189      0    Q9BRR0 ZKSC3_HUMAN  9606     1
## featuresTemp.21  VARIANT   200 200      0    Q9BRR0 ZKSC3_HUMAN  9606     1
## featuresTemp.22  VARIANT   200 200      0    Q9BRR0 ZKSC3_HUMAN  9606     1
## featuresTemp.23  VARIANT   200 200      0    Q9BRR0 ZKSC3_HUMAN  9606     1
## featuresTemp.24  VARIANT   246 246      0    Q9BRR0 ZKSC3_HUMAN  9606     1
## featuresTemp.25 CONFLICT   206 207      1    Q9BRR0 ZKSC3_HUMAN  9606     1
## featuresTemp.26 CONFLICT   348 350      2    Q9BRR0 ZKSC3_HUMAN  9606     1
## featuresTemp.27 CONFLICT   437 437      0    Q9BRR0 ZKSC3_HUMAN  9606     1
## featuresTemp.28 CONFLICT   459 479     20    Q9BRR0 ZKSC3_HUMAN  9606     1
my_canvas <- draw_canvas(my_prot_df)  
my_canvas <- draw_chains(my_canvas, my_prot_df, 
                         label_size = 2.5)
my_canvas <- draw_domains(my_canvas, my_prot_df)
my_canvas

my_prot_df <- drawProteins::feature_to_dataframe(Q6PML9_json)

my_canvas <- draw_canvas(my_prot_df)
my_canvas <- draw_chains(my_canvas, my_prot_df, label_size = 2.5)
my_canvas <- draw_recept_dom(my_canvas, my_prot_df)
my_canvas

#DotPlot

Q9BRR0_FASTA <- rentrez::entrez_fetch(id = "Q9BRR0",
                                      db = "gene", 
                                      rettype="fasta")
Q9BRR0_vector <- fasta_cleaner(Q9BRR0_FASTA)
Q9BRR0_string <- fasta_cleaner(Q9BRR0_FASTA, parse = F)
table(Q9BRR0_vector)
## Q9BRR0_vector
##  -     (  )  ,  .  :  ;  [  ]  _  0  1  2  3  4  5  6  7  8  9  a  A  b  c  C 
##  7 41  3  3  3  4  9  5  1  1  1  7 17  4  2  2  1  1  1  8  3 37  9  1 10  4 
##  d  D  e  f  g  h  H  i  I  l  L  m  M  n  N  o  O  p  r  s  S  t  T  u  y 
##  2  2 33  8  1  6  1 13  3 14  1 14  3 19 13 17  3  4 21 21  1 20  5  1 13
dotPlot(Q9BRR0_vector, Q9BRR0_vector)

ZNF306; Q9BRR0 (ZKSC3_HUMAN) Protein Properties

#This source shows one repeat in the protein. https://prosite.expasy.org/rule/PRU00187

#Protein feature prediction #ZNF306; Q9BRR0 (ZKSC3_HUMAN): Uniprot had some information on the Q9BRR0 zinc finger protein. It is a transcriptional factor that binds to a 5’ - 3’ sequence that represses the process of autophagy which is the process of cleaning/removing old and damaged cells. It also can progress cancer by acting as a transcription activator. # Source: https://www.uniprot.org/uniprot/Q9BRR0

#Predict Protein Fold

# enter once
aa.1.1 <- c("A","R","N","D","C","Q","E","G","H","I",
            "L","K","M","F","P","S","T","W","Y","V")

# enter again
aa.1.2 <- c("A","R","N","D","C","Q","E","G","H","I",
            "L","K","M","F","P","S","T","W","Y","V")
# alpha proteins
alpha <- c(285, 53, 97, 163, 22, 67, 134, 197, 111, 91, 
           221, 249, 48, 123, 82, 122, 119, 33, 63, 167)
sum(alpha) == 2447
## [1] TRUE
beta <- c(203, 67, 139, 121, 75, 122, 86, 297, 49, 120, 
          177, 115, 16, 85, 127, 341, 253, 44, 110, 229)
sum(beta) == 2776
## [1] TRUE
# alpha + beta
a.plus.b <- c(175, 78, 120, 111, 74, 74, 86, 171, 33, 93,
              110, 112, 25, 52, 71, 126, 117, 30, 108, 123)
# alpha/beta
a.div.b <- c(361, 146, 183, 244, 63, 114, 257, 377, 107, 239, 339, 321, 91, 158, 188, 327, 238, 72, 130, 378)

sum(a.div.b) == 4333
## [1] TRUE
pander(data.frame(aa.1.1, alpha, beta, a.plus.b, a.div.b))
aa.1.1 alpha beta a.plus.b a.div.b
A 285 203 175 361
R 53 67 78 146
N 97 139 120 183
D 163 121 111 244
C 22 75 74 63
Q 67 122 74 114
E 134 86 86 257
G 197 297 171 377
H 111 49 33 107
I 91 120 93 239
L 221 177 110 339
K 249 115 112 321
M 48 16 25 91
F 123 85 52 158
P 82 127 71 188
S 122 341 126 327
T 119 253 117 238
W 33 44 30 72
Y 63 110 108 130
V 167 229 123 378
alpha.prop <- alpha/sum(alpha)
beta.prop <- beta/sum(beta)
a.plus.b.prop <- a.plus.b/sum(a.plus.b)
a.div.b <- a.div.b/sum(a.div.b)
aa.prop <- data.frame(alpha.prop,
                      beta.prop,
                      a.plus.b.prop,
                      a.div.b)
#row labels
row.names(aa.prop) <- aa.1.1
aa.prop
##    alpha.prop   beta.prop a.plus.b.prop    a.div.b
## A 0.116469146 0.073126801    0.09264161 0.08331410
## R 0.021659174 0.024135447    0.04129169 0.03369490
## N 0.039640376 0.050072046    0.06352567 0.04223402
## D 0.066612178 0.043587896    0.05876125 0.05631202
## C 0.008990601 0.027017291    0.03917417 0.01453958
## Q 0.027380466 0.043948127    0.03917417 0.02630972
## E 0.054760932 0.030979827    0.04552673 0.05931225
## G 0.080506743 0.106988473    0.09052409 0.08700669
## H 0.045361667 0.017651297    0.01746956 0.02469421
## I 0.037188394 0.043227666    0.04923240 0.05515809
## L 0.090314671 0.063760807    0.05823187 0.07823679
## K 0.101757254 0.041426513    0.05929063 0.07408262
## M 0.019615856 0.005763689    0.01323452 0.02100162
## F 0.050265631 0.030619597    0.02752779 0.03646434
## P 0.033510421 0.045749280    0.03758602 0.04338795
## S 0.049856968 0.122838617    0.06670196 0.07546734
## T 0.048630977 0.091138329    0.06193753 0.05492730
## W 0.013485901 0.015850144    0.01588142 0.01661666
## Y 0.025745811 0.039625360    0.05717311 0.03000231
## V 0.068246833 0.082492795    0.06511382 0.08723748
NP_000783 <- rentrez::entrez_fetch(id = "NC_000004.12",
                                     db = "gene",
                                     rettype = "fasta")
NP_000783.freq.table <- table(NP_000783)/length(NP_000783)
# clean and turn into vector
NP_000783 <- compbio4all::fasta_cleaner(NP_000783, parse = TRUE)
NP_000784 <- rentrez::entrez_fetch(id = "NC_036883.1",
                                     db = "gene",
                                     rettype = "fasta")

NP_000784.freq.table <- table(NP_000784)/length(NP_000784)
# clean and turn into vector
NP_000784 <- compbio4all::fasta_cleaner(NP_000784, parse = TRUE)
NP_000783.freq.table <- table(NP_000783)/length(NP_000783)
#table_to_vector <- function(table_x){table_names <- attr(table_x, "dimnames")[[1]] table_vect <- as.vector(table_x) names(table_vect) <- table_names return(table_vect)}
slc30a9.human.aa.freq <- table_to_vector(NP_000783.freq.table)
aa.names <- names(slc30a9.human.aa.freq)
any(aa.names == "U")
i.U <- which(aa.names == "U")

aa.names[i.U]
slc30a9.human.aa.freq[i.U]
which(aa.names %in% c("U"))
slc30a9.human.aa.freq <- slc30a9.human.aa.freq[-i.U]
# sum is just under 1; ok for our purposes
sum(slc30a9.human.aa.freq)
aa.prop <- slc30a9.human.aa.freq
pander::pander(NP_000784)
table_to_vector <- function(table_x){
  table_names <- attr(table_x, "dimnames")[[1]]
  table_vect <- as.vector(table_x)
  names(table_vect) <- table_names
  return(table_vect)}
slc30a9_human_vector <- human_fasta
slc30a9_human_table <- table(slc30a9_human_vector)/length(slc30a9_human_vector)
slc30a9.human.aa.freq <- table_to_vector(slc30a9_human_table)
slc30a9.human.aa.freq
## \n1. A12M1\nadenovirus-12 chromosome modification site 1C [Homo sapiens (human)]\nOther Designations: Adenovirus-12 chromosome modification site-1q1\nChromosome: 1; Location: 1q42-q43\nThis record was discontinued.\nID: 4\n\n2. SERPINA3\nOfficial Symbol: SERPINA3 and Name: serpin family A member 3 [Homo sapiens (human)]\nOther Aliases: AACT, ACT, GIG24, GIG25\nOther Designations: alpha-1-antichymotrypsin; cell growth-inhibiting gene 24/25 protein; growth-inhibiting protein 24; growth-inhibiting protein 25; serine (or cysteine) proteinase inhibitor, clade A, member 3; serpin A3; serpin peptidase inhibitor, clade A (alpha-1 antiproteinase, antitrypsin), member 3\nChromosome: 14; Location: 14q32.13\nAnnotation: Chromosome 14 NC_000014.9 (94612391..94624053)\nMIM: 107280\nID: 12\n\n 
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     1
aa.names <- names(slc30a9.human.aa.freq)
i.U <- which(aa.names == "U")
aa.names[i.U]
## character(0)
slc30a9.human.aa.freq[i.U]
## named numeric(0)
aa.prop$slc30a9.human.aa.freq <- slc30a9.human.aa.freq
pander::pander(aa.prop)
  alpha.prop beta.prop a.plus.b.prop a.div.b slc30a9.human.aa.freq
A 0.1165 0.07313 0.09264 0.08331 1
R 0.02166 0.02414 0.04129 0.03369 1
N 0.03964 0.05007 0.06353 0.04223 1
D 0.06661 0.04359 0.05876 0.05631 1
C 0.008991 0.02702 0.03917 0.01454 1
Q 0.02738 0.04395 0.03917 0.02631 1
E 0.05476 0.03098 0.04553 0.05931 1
G 0.08051 0.107 0.09052 0.08701 1
H 0.04536 0.01765 0.01747 0.02469 1
I 0.03719 0.04323 0.04923 0.05516 1
L 0.09031 0.06376 0.05823 0.07824 1
K 0.1018 0.04143 0.05929 0.07408 1
M 0.01962 0.005764 0.01323 0.021 1
F 0.05027 0.03062 0.02753 0.03646 1
P 0.03351 0.04575 0.03759 0.04339 1
S 0.04986 0.1228 0.0667 0.07547 1
T 0.04863 0.09114 0.06194 0.05493 1
W 0.01349 0.01585 0.01588 0.01662 1
Y 0.02575 0.03963 0.05717 0.03 1
V 0.06825 0.08249 0.06511 0.08724 1

#Functions to calculate similarities

# Corrleation used in Chou adn Zhange 1992.
chou_cor <- function(x,y){numerator <- sum(x*y)
denominator <- sqrt((sum(x^2))*(sum(y^2)))
result <- numerator/denominator
return(result)}

# Cosine similarity used in Higgs and Attwood (2005). 
#chou_cosine <- function(z.1, z.2){z.1.abs <- sqrt(sum(z.1^2))z.2.abs <- sqrt(sum(z.2^2)) my.cosine <- sum(z.1*z.2)/(z.1.abs*z.2.abs) return(my.cosine)}

#Correlation

cor(NP_000784.freq.table, NP_000783.freq.table)

#Cosine similarity

#install.packages(lsa)
library(lsa)
cos.mat.numeric <- habk.df2[ , c(-1, -2, -3)]

cos.mat.centered <- cos.mat.numeric

cos.mat.standard <- cos.mat.centered

cos.mat.dist <- cos.mat.standard

cos.mat <- lsa::cosine(t(cos.mat.standard))
cos.mat.rnd <- round(cos.mat,2)


mdendro.complete <- mdendro::linkage(prox = cos.mat.dist,
                                type.prox = "similarity",
                                weighted =T,
                                method = "complete")

mdendro.arith <- mdendro::linkage(prox = cos.mat.dist,
                                type.prox = "similarity",
                                weighted =T,
                                method = "arithmetic")
plot(mdendro.arith)
aa.prop.flipped <- t(aa.prop$slc30a9.human.aa.freq)
round(aa.prop.flipped,2)
##      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13] [,14]
## [1,]    1    1    1    1    1    1    1    1    1     1     1     1     1     1
##      [,15] [,16] [,17] [,18] [,19] [,20]
## [1,]     1     1     1     1     1     1

#Distance

#Note: we need to flip the dataframe on its side using a command called t()

aa.prop.flipped <- t(aa.prop$slc30a9.human.aa.freq)
round(aa.prop.flipped,2)
##      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13] [,14]
## [1,]    1    1    1    1    1    1    1    1    1     1     1     1     1     1
##      [,15] [,16] [,17] [,18] [,19] [,20]
## [1,]     1     1     1     1     1     1
dist(aa.prop.flipped)
## dist(0)

#Percent Identity Comparisons (PID)

names(SLC30A9_FASTA)
## NULL
length(SLC30A9_FASTA)
## [1] 10
names(SLC30A9_vector) <- names(SLC30A9_FASTA)
pids <- c(1, NA,     NA,     NA,
          pid(align01.02),          1,     NA,     NA,
          pid(align01.05), pid(align02.05),      1,     NA,
          pid(align01.06), pid(align02.06), pid(align05.06), 1)

mat <- matrix(pids, nrow = 4, byrow = T)
row.names(mat) <- c("Homo","Pan","Mus","Rattus", "Gorilla", "Pan", "Xenopus", "Rattus", "Danio", "Sus")   
colnames(mat) <- c("Homo","Pan","Mus","Rattus", "Gorilla", "Pan", "Xenopus", "Rattus", "Danio", "Sus")   
pander::pander(mat)  
slc30a9_vector <- c(SLC30A9_FASTA)
names(slc30a9_vector) <- names(SLC30A9_FASTA)

#PID methods comparison:

#Human vs Chimp PID comparison: ######################################## # Program: needle # Rundate: Fri 17 Dec 2021 23:59:53 # Commandline: needle # -auto # -stdout # -asequence emboss_needle-I20211218-000323-0340-3146481-p2m.asequence # -bsequence emboss_needle-I20211218-000323-0340-3146481-p2m.bsequence # -datafile EBLOSUM62 # -gapopen 10.0 # -gapextend 0.5 # -endopen 10.0 # -endextend 0.5 # -aformat3 pair # -sprotein1 # -sprotein2 # Align_format: pair # Report_file: stdout ########################################

#======================================= # # Aligned_sequences: 2 # 1: EMBOSS_001 # 2: EMBOSS_001 # Matrix: EBLOSUM62 # Gap_penalty: 10.0 # Extend_penalty: 0.5 # # Length: 569 # Identity: 545/569 (95.8%) # Similarity: 556/569 (97.7%) # Gaps: 1/569 ( 0.2%) # Score: 2781.0 # # #======================================= #https://www.ebi.ac.uk/Tools/services/web/toolresult.ebi?jobId=emboss_needle-I20211218-000323-0340-3146481-p2m

#Multiple sequence alignment


hShroom2 <- entrez_fetch(db = "protein", 
                          id = "Q6PML9", 
                          rettype = "fasta")

mShroom3 <- entrez_fetch(db = "protein", 
                          id = "CK820", 
                          rettype = "fasta")


hShroom4 <- entrez_fetch(db = "protein", 
                          id = "C57BL", 
                          rettype = "fasta")



sShroom1 <- entrez_fetch(db = "protein", 
                          id = "G7P5H4", 
                          rettype = "fasta")
fasta_cleaner <- function(fasta_object, parse = TRUE){

  fasta_object <- sub("^(>)(.*?)(\\n)(.*)(\\n\\n)","\\4",fasta_object)
  fasta_object <- gsub("\n", "", fasta_object)

  if(parse == TRUE){
    fasta_object <- stringr::str_split(fasta_object,
                                       pattern = "",
                                       simplify = FALSE)
  }

  return(fasta_object[[1]])
}

hShroom4  <- fasta_cleaner(hShroom4,  parse = F)
mShroom3 <- fasta_cleaner(mShroom3, parse = F)
hShroom2  <- fasta_cleaner(hShroom2,  parse = F)
sShroom1   <- fasta_cleaner(sShroom1,   parse = F)



shrooms_align <- msa(shrooms_vector_ss,
                     method = "ClustalW")


class(shrooms_align) <- "AAMultipleAlignment"


shrooms_align_seqinr <- msaConvert(shrooms_align, type = "seqinr::alignment")


compbio4all::print_msa(alignment = shrooms_align_seqinr, 
          chunksize = 60)


# key step - must have class set properly
class(shrooms_align) <- "AAMultipleAlignment"

# run ggmsa
ggmsa::ggmsa(shrooms_align,   # shrooms_align, NOT shrooms_align_seqinr
      start = 2000, 
      end = 2100) 

msaPrettyPrint(shrooms_align_subset,             # alignment
               file = "shroom_msa_subset.pdf",   # file name
               y=c(2030, 2100),           # range
               askForOverwrite=FALSE)

#Distance matrix

shrooms_subset_dist <- seqinr::dist.alignment(shrooms_align_subset_seqinr, 
                                       matrix = "identity")

shrooms_subset_dist_alt <- matrix(data = NA,
                              nrow = 5, 
                              ncol = 5)

distances <- c(0.8260049, 
               0.8478722, 0.9000568, 
               0.9244596, 0.9435187, 0.9372139, 
               0.9238779, 0.9370038, 0.9323225,0.9413209)
shrooms_subset_dist_alt[lower.tri(shrooms_subset_dist_alt)] <- distances


seqnames <- c("EAA12598","ABA81834","XP_392427", "XP_783573","CAA78718")
colnames(shrooms_subset_dist_alt) <- seqnames
row.names(shrooms_subset_dist_alt)  <- seqnames
shrooms_subset_dist_alt <- as.dist(shrooms_subset_dist_alt)
shrooms_subset_dist <- shrooms_subset_dist_alt


shrooms_subset_dist_rounded <- round(shrooms_subset_dist,
                              digits = 3)

#Phylognetic trees of sequences


# Note - not using rounded values
tree_subset <- nj(shrooms_subset_dist)


# plot tree
plot.phylo(tree_subset, main="Phylogenetic Tree", 
            type = "unrooted", 
            use.edge.length = F)

# add label
mtext(text = "Shroom family gene tree - UNrooted, no branch lengths")
# plot tree
plot.phylo(tree_subset, main="Phylogenetic Tree", 
            use.edge.length = F)

# add label
mtext(text = "Shroom family gene tree - rooted, no branch lenths")