Introduction

Gene APOBEC3G is a member of the cytidine deaminase gene family. It is one of seven related genes or pseudogenes found in a cluster, thought to result from gene duplication, on chromosome 22. The protein encoded by this gene catalyzes site-specific deamination of both RNA and single-stranded DNA. The encoded protein has been found to be a specific inhibitor of human immunodeficiency virus-1 (HIV-1) infectivity.

Resources / References

Refseq Gene: https://www.ncbi.nlm.nih.gov/gene/60489

Refseq Homologene: https://www.ncbi.nlm.nih.gov/homologene?LinkName=gene_homologene&from_uid=60489

UniProt: https://www.uniprot.org/uniprot/Q9HC16

PDB: https://www.rcsb.org/structure/2KBO

Preparation

library(BiocManager)
## Bioconductor version '3.13' is out-of-date; the current release version '3.14'
##   is available with R version '4.1'; see https://bioconductor.org/install
library(compbio4all)
library(ggmsa)
## Registered S3 methods overwritten by 'ggalt':
##   method                  from   
##   grid.draw.absoluteGrob  ggplot2
##   grobHeight.absoluteGrob ggplot2
##   grobWidth.absoluteGrob  ggplot2
##   grobX.absoluteGrob      ggplot2
##   grobY.absoluteGrob      ggplot2
library(rentrez)
library(seqinr)
library(ape)
## 
## Attaching package: 'ape'
## The following objects are masked from 'package:seqinr':
## 
##     as.alignment, consensus
library(pander)
library(ggplot2)
library(msa)
## Loading required package: Biostrings
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
##     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
## Loading required package: stats4
## 
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## Loading required package: IRanges
## Loading required package: XVector
## Loading required package: GenomeInfoDb
## 
## Attaching package: 'Biostrings'
## The following object is masked from 'package:ape':
## 
##     complement
## The following object is masked from 'package:seqinr':
## 
##     translate
## The following object is masked from 'package:base':
## 
##     strsplit
## 
## Attaching package: 'msa'
## The following object is masked from 'package:BiocManager':
## 
##     version
library(Biostrings)
library(HGNChelper)
library(drawProteins)

Accession numbers

The chromosome accession number (NC_000022.11) was obtained from the NCBI database by searching for the gene name. The protein accession number (NP_068594.1) was also located on the NCBI website under the GenBank section, which provided more information about the function and purpose of the gene. Using a protein BLAST search, only experimental versions of the protein were found in bonobos, gorillas, and orangutans, and none were identified in mice, frogs, or fruit flies. With a more exploratory search, also completed using BLAST, accession numbers for drill monkeys (Mandrillus leucophaeus) and chimpanzees (Pan troglodytes) were found in the mRNA form. A further search was conducted with the same method for mRNA accession numbers of the same gene in other species.

Accession Numbers Table

# Accession table for the ten primate APOBEC3G proteins used in this report.
# Every vector holds one entry per species, in the same order.
RefSeq_Accession_Numbers <- c(
  "NP_068594", "NP_001009001", "NP_001332812", "NP_001279005",
  "NP_001292891", "NP_001332845", "NP_001185622", "XP_034804992",
  "XP_024095464", "XP_025254494"
)
Uniprot_Accession_Numbers <- c(
  "Q9HC16", "Q7YR24", "Q694B7", "AEY75958", "AGX93028",
  "NA", "AGE34493", "Q694B6", "PNJ49698", "NA"
)
# No PDB entry is recorded for any species; the placeholder stays the
# string "NA" so the column type and printed table are unchanged.
PDB <- rep("NA", length(RefSeq_Accession_Numbers))
# Full binomial names throughout (the sooty mangabey was previously
# abbreviated to "C atys").
Scientific_Name <- c(
  "Homo sapiens", "Pan troglodytes", "Papio anubis", "Chlorocebus sabaeus",
  "Cercocebus atys", "Mandrillus leucophaeus", "Macaca mulatta",
  "Pan paniscus", "Pongo abelii", "Theropithecus gelada"
)
Common_Name <- c(
  "Human", "Chimpanzee", "Olive Baboon", "Green Monkey", "Sooty Mangabey",
  "Drill Monkey", "Rhesus monkey", "Pygmy Chimpanzee", "Sumatran Orangutan",
  "Gelada Monkey"
)
# Same gene in every row.
Gene_Name <- rep("APOBEC3G", length(RefSeq_Accession_Numbers))

APOBEC3G <- data.frame(RefSeq_Accession_Numbers, Uniprot_Accession_Numbers,
                       PDB, Scientific_Name, Common_Name, Gene_Name)


pander(APOBEC3G)
Table continues below
RefSeq_Accession_Numbers Uniprot_Accession_Numbers PDB
NP_068594 Q9HC16 NA
NP_001009001 Q7YR24 NA
NP_001332812 Q694B7 NA
NP_001279005 AEY75958 NA
NP_001292891 AGX93028 NA
NP_001332845 NA NA
NP_001185622 AGE34493 NA
XP_034804992 Q694B6 NA
XP_024095464 PNJ49698 NA
XP_025254494 NA NA
Scientific_Name Common_Name Gene_Name
Homo sapiens Human APOBEC3G
Pan troglodytes Chimpanzee APOBEC3G
Papio anubis Olive Baboon APOBEC3G
Chlorocebus sabaeus Green Monkey APOBEC3G
C atys Sooty Mangabey APOBEC3G
Mandrillus leucophaeus Drill Monkey APOBEC3G
Macaca mulatta Rhesus monkey APOBEC3G
Pan paniscus Pygmy Chimpanzee APOBEC3G
Pongo abelii Sumatran Orangutan APOBEC3G
Theropithecus gelada Gelada Monkey APOBEC3G

Data preparation

# Fetch one FASTA record per RefSeq accession from the Entrez "protein"
# database; the result is a list with one element per accession.
APOBEC3G_list <- entrez_fetch_list(
  id = RefSeq_Accession_Numbers,
  db = "protein",
  rettype = "fasta"
)

Number of FASTA files obtained

length(APOBEC3G_list)
## [1] 10

The first entry

APOBEC3G_list[[1]]
## [1] ">NP_068594.1 DNA dC-_dU-editing enzyme APOBEC-3G isoform 1 [Homo sapiens]\nMKPHFRNTVERMYRDTFSYNFYNRPILSRRNTVWLCYEVKTKGPSRPPLDAKIFRGQVYSELKYHPEMRF\nFHWFSKWRKLHRDQEYEVTWYISWSPCTKCTRDMATFLAEDPKVTLTIFVARLYYFWDPDYQEALRSLCQ\nKRDGPRATMKIMNYDEFQHCWSKFVYSQRELFEPWNNLPKYYILLHIMLGEILRHSMDPPTFTFNFNNEP\nWVRGRHETYLCYEVERMHNDTWVLLNQRRGFLCNQAPHKHGFLEGRHAELCFLDVIPFWKLDLDQDYRVT\nCFTSWSPCFSCAQEMAKFISKNKHVSLCIFTARIYDDQGRCQEGLRTLAEAGAKISIMTYSEFKHCWDTF\nVDHQGCPFQPWDGLDEHSQDLSGRLRAILQNQEN\n\n"

Initial data cleaning

Remove FASTA header

# Strip the FASTA header and line breaks from every record. parse = FALSE
# keeps each cleaned sequence as a single string rather than splitting it
# into characters. seq_along() makes the loop a no-op on an empty list,
# where 1:length() would iterate over c(1, 0).
for (i in seq_along(APOBEC3G_list)) {
  APOBEC3G_list[[i]] <- compbio4all::fasta_cleaner(APOBEC3G_list[[i]],
                                                    parse = FALSE)
}

# General Protein information

Protein diagram

APOBEC3G_json <- drawProteins::get_features("Q9HC16")
## [1] "Download has worked"
is(APOBEC3G_json)
## [1] "list"             "vector"           "list_OR_List"     "vector_OR_Vector"
## [5] "vector_OR_factor"
my_prot_df <- drawProteins::feature_to_dataframe(APOBEC3G_json)
is(my_prot_df)
## [1] "data.frame"       "list"             "oldClass"         "vector"          
## [5] "list_OR_List"     "vector_OR_Vector" "vector_OR_factor"
my_prot_df[,-2]
##                     type begin end length accession   entryName taxid order
## featuresTemp       CHAIN     1 384    383    Q9HC16 ABC3G_HUMAN  9606     1
## featuresTemp.1    DOMAIN    29 138    109    Q9HC16 ABC3G_HUMAN  9606     1
## featuresTemp.2    DOMAIN   214 328    114    Q9HC16 ABC3G_HUMAN  9606     1
## featuresTemp.3    REGION     1  60     59    Q9HC16 ABC3G_HUMAN  9606     1
## featuresTemp.4    REGION   209 336    127    Q9HC16 ABC3G_HUMAN  9606     1
## featuresTemp.5    REGION   213 215      2    Q9HC16 ABC3G_HUMAN  9606     1
## featuresTemp.6    REGION   313 320      7    Q9HC16 ABC3G_HUMAN  9606     1
## featuresTemp.7  ACT_SITE   259 259      0    Q9HC16 ABC3G_HUMAN  9606     1
## featuresTemp.8     METAL    65  65      0    Q9HC16 ABC3G_HUMAN  9606     1
## featuresTemp.9     METAL    97  97      0    Q9HC16 ABC3G_HUMAN  9606     1
## featuresTemp.10    METAL   100 100      0    Q9HC16 ABC3G_HUMAN  9606     1
## featuresTemp.11    METAL   257 257      0    Q9HC16 ABC3G_HUMAN  9606     1
## featuresTemp.12    METAL   288 288      0    Q9HC16 ABC3G_HUMAN  9606     1
## featuresTemp.13    METAL   291 291      0    Q9HC16 ABC3G_HUMAN  9606     1
## featuresTemp.14     SITE   244 244      0    Q9HC16 ABC3G_HUMAN  9606     1
## featuresTemp.15  MOD_RES    32  32      0    Q9HC16 ABC3G_HUMAN  9606     1
## featuresTemp.16  MOD_RES   218 218      0    Q9HC16 ABC3G_HUMAN  9606     1
## featuresTemp.17  VAR_SEQ    58  79     21    Q9HC16 ABC3G_HUMAN  9606     1
## featuresTemp.18  VAR_SEQ    80 384    304    Q9HC16 ABC3G_HUMAN  9606     1
## featuresTemp.19  VARIANT   186 186      0    Q9HC16 ABC3G_HUMAN  9606     1
## featuresTemp.20  VARIANT   256 256      0    Q9HC16 ABC3G_HUMAN  9606     1
## featuresTemp.21  VARIANT   275 275      0    Q9HC16 ABC3G_HUMAN  9606     1
## featuresTemp.22  MUTAGEN    67  67      0    Q9HC16 ABC3G_HUMAN  9606     1
## featuresTemp.23  MUTAGEN    67  67      0    Q9HC16 ABC3G_HUMAN  9606     1
## featuresTemp.24  MUTAGEN    67  67      0    Q9HC16 ABC3G_HUMAN  9606     1
## featuresTemp.25  MUTAGEN    74  74      0    Q9HC16 ABC3G_HUMAN  9606     1
## featuresTemp.26  MUTAGEN    80  80      0    Q9HC16 ABC3G_HUMAN  9606     1
## featuresTemp.27  MUTAGEN    81  81      0    Q9HC16 ABC3G_HUMAN  9606     1
## featuresTemp.28  MUTAGEN    85  85      0    Q9HC16 ABC3G_HUMAN  9606     1
## featuresTemp.29  MUTAGEN    86  86      0    Q9HC16 ABC3G_HUMAN  9606     1
## featuresTemp.30  MUTAGEN    97  97      0    Q9HC16 ABC3G_HUMAN  9606     1
## featuresTemp.31  MUTAGEN   100 100      0    Q9HC16 ABC3G_HUMAN  9606     1
## featuresTemp.32  MUTAGEN   107 107      0    Q9HC16 ABC3G_HUMAN  9606     1
## featuresTemp.33  MUTAGEN   128 128      0    Q9HC16 ABC3G_HUMAN  9606     1
## featuresTemp.34  MUTAGEN   213 213      0    Q9HC16 ABC3G_HUMAN  9606     1
## featuresTemp.35  MUTAGEN   213 213      0    Q9HC16 ABC3G_HUMAN  9606     1
## featuresTemp.36  MUTAGEN   215 215      0    Q9HC16 ABC3G_HUMAN  9606     1
## featuresTemp.37  MUTAGEN   217 217      0    Q9HC16 ABC3G_HUMAN  9606     1
## featuresTemp.38  MUTAGEN   218 218      0    Q9HC16 ABC3G_HUMAN  9606     1
## featuresTemp.39  MUTAGEN   218 218      0    Q9HC16 ABC3G_HUMAN  9606     1
## featuresTemp.40  MUTAGEN   221 221      0    Q9HC16 ABC3G_HUMAN  9606     1
## featuresTemp.41  MUTAGEN   244 244      0    Q9HC16 ABC3G_HUMAN  9606     1
## featuresTemp.42  MUTAGEN   247 247      0    Q9HC16 ABC3G_HUMAN  9606     1
## featuresTemp.43  MUTAGEN   256 256      0    Q9HC16 ABC3G_HUMAN  9606     1
## featuresTemp.44  MUTAGEN   257 257      0    Q9HC16 ABC3G_HUMAN  9606     1
## featuresTemp.45  MUTAGEN   259 259      0    Q9HC16 ABC3G_HUMAN  9606     1
## featuresTemp.46  MUTAGEN   259 259      0    Q9HC16 ABC3G_HUMAN  9606     1
## featuresTemp.47  MUTAGEN   259 259      0    Q9HC16 ABC3G_HUMAN  9606     1
## featuresTemp.48  MUTAGEN   285 285      0    Q9HC16 ABC3G_HUMAN  9606     1
## featuresTemp.49  MUTAGEN   288 288      0    Q9HC16 ABC3G_HUMAN  9606     1
## featuresTemp.50  MUTAGEN   291 291      0    Q9HC16 ABC3G_HUMAN  9606     1
## featuresTemp.51  MUTAGEN   313 313      0    Q9HC16 ABC3G_HUMAN  9606     1
## featuresTemp.52  MUTAGEN   315 315      0    Q9HC16 ABC3G_HUMAN  9606     1
## featuresTemp.53  MUTAGEN   320 320      0    Q9HC16 ABC3G_HUMAN  9606     1
## featuresTemp.54  MUTAGEN   320 320      0    Q9HC16 ABC3G_HUMAN  9606     1
## featuresTemp.55  MUTAGEN   323 323      0    Q9HC16 ABC3G_HUMAN  9606     1
## featuresTemp.56 CONFLICT   162 162      0    Q9HC16 ABC3G_HUMAN  9606     1
## featuresTemp.57 CONFLICT   370 370      0    Q9HC16 ABC3G_HUMAN  9606     1
## featuresTemp.58   STRAND   195 197      2    Q9HC16 ABC3G_HUMAN  9606     1
## featuresTemp.59    HELIX   199 206      7    Q9HC16 ABC3G_HUMAN  9606     1
## featuresTemp.60    HELIX   209 211      2    Q9HC16 ABC3G_HUMAN  9606     1
## featuresTemp.61   STRAND   213 217      4    Q9HC16 ABC3G_HUMAN  9606     1
## featuresTemp.62   STRAND   219 228      9    Q9HC16 ABC3G_HUMAN  9606     1
## featuresTemp.63   STRAND   231 234      3    Q9HC16 ABC3G_HUMAN  9606     1
## featuresTemp.64    HELIX   236 238      2    Q9HC16 ABC3G_HUMAN  9606     1
## featuresTemp.65   STRAND   240 243      3    Q9HC16 ABC3G_HUMAN  9606     1
## featuresTemp.66   STRAND   247 250      3    Q9HC16 ABC3G_HUMAN  9606     1
## featuresTemp.67    HELIX   258 265      7    Q9HC16 ABC3G_HUMAN  9606     1
## featuresTemp.68    HELIX   266 269      3    Q9HC16 ABC3G_HUMAN  9606     1
## featuresTemp.69   STRAND   273 275      2    Q9HC16 ABC3G_HUMAN  9606     1
## featuresTemp.70   STRAND   277 285      8    Q9HC16 ABC3G_HUMAN  9606     1
## featuresTemp.71    HELIX   289 301     12    Q9HC16 ABC3G_HUMAN  9606     1
## featuresTemp.72   STRAND   305 313      8    Q9HC16 ABC3G_HUMAN  9606     1
## featuresTemp.73   STRAND   318 320      2    Q9HC16 ABC3G_HUMAN  9606     1
## featuresTemp.74    HELIX   321 330      9    Q9HC16 ABC3G_HUMAN  9606     1
## featuresTemp.75   STRAND   334 337      3    Q9HC16 ABC3G_HUMAN  9606     1
## featuresTemp.76    HELIX   340 350     10    Q9HC16 ABC3G_HUMAN  9606     1
## featuresTemp.77     TURN   353 355      2    Q9HC16 ABC3G_HUMAN  9606     1
## featuresTemp.78    HELIX   364 379     15    Q9HC16 ABC3G_HUMAN  9606     1
# Build the protein diagram in one expression: an empty canvas sized from the
# feature table, then the chain (with labels), then the domain boxes on top.
my_canvas <- drawProteins::draw_domains(
  drawProteins::draw_chains(
    drawProteins::draw_canvas(my_prot_df),
    my_prot_df,
    label_size = 2.5
  ),
  my_prot_df
)
# Print the finished plot.
my_canvas

Dotplot

Preparing data

APOBEC3G_human_vector <- fasta_cleaner(APOBEC3G_list[[1]])

2x2 panel

# 2 x 2 grid with tight margins, leaving room only for the titles.
par(mar = c(0, 0, 2, 1),
    mfrow = c(2, 2))

# Self-comparison dot plots of the human sequence, one panel per
# (window size, required matches, title) triple, drawn in the order listed.
invisible(Map(
  function(window, matches, title) {
    dotPlot(APOBEC3G_human_vector, APOBEC3G_human_vector,
            wsize = window, nmatch = matches, main = title)
  },
  c(1, 10, 10, 20),
  c(1, 1, 5, 5),
  c("Defaults",
    "size = 10, nmatch = 1",
    "size = 10, nmatch = 5",
    "size = 20, nmatch = 5")
))

Reset par()

par(mfrow = c(1,1), 
    mar = c(4,4,4,4))

Plot

# Full-size self-comparison dot plot of the human sequence.
# The title now reports the window size actually passed (25); it previously
# read "size = 20", which did not match the wsize argument.
dotPlot(APOBEC3G_human_vector, APOBEC3G_human_vector,
        wsize = 25,
        nmatch = 5,
        main = "size = 25, nmatch = 5")

Protein properties compiled from databases

PDB: https://www.rcsb.org/structure/2KBO Uniprot: https://www.uniprot.org/uniprot/Q9HC16 Pfam: http://pfam.xfam.org/family/PF18782.3 Alphafold: https://alphafold.ebi.ac.uk/entry/Q9HC16

# Database each summarised property was taken from.
Source <- c("PDB", "Uniprot", "Pfam", "Alphafold")

# One property per source, in the same order as Source.
# "helices" was previously misspelled "helicies" in the displayed table.
Property <- c(
  "Classification - HYDROLASE",
  "Location: Nucleus & Cytoplasm",
  "Domain NAD2 from 10aa - 193aa & Domain APOBEC_C from 303aa - 379aa",
  "Structure: Alpha helices, and beta-pleated sheets"
)

Protein.Propertiesdf <- data.frame(Source, Property)

pander::pander(Protein.Propertiesdf)
Source Property
PDB Classification - HYDROLASE
Uniprot Location: Nucleus & Cytoplasm
Pfam Domain NAD2 from 10aa - 193aa & Domain APOBEC_C from 303aa - 379aa
Alphafold Structure: Alpha helicies, and beta-pleated sheets

Protein feature prediction

Multivariate statistical techniques were used to confirm the information about protein structure and location found in the online databases.

Using Uniprot, the conclusion indicates that this protein is located primarily in the nucleus, and secondarily in the cytoplasm.

## Predict Protein Fold

Alphafold indicates that there is a mix of alpha helices and beta sheets. I therefore predict that machine-learning methods will indicate an a+b or a/b structure.

Chou & Zhang Data Table

# One-letter amino-acid codes, in the row order used by the count vectors.
aa.1.1 <- strsplit("ARNDCQEGHILKMFPSTWYV", split = "")[[1]]

# Amino-acid counts per fold class (one value per code in aa.1.1).
alpha <- c(
  285,  53,  97, 163,  22,
   67, 134, 197, 111,  91,
  221, 249,  48, 123,  82,
  122, 119,  33,  63, 167
)

beta <- c(
  203,  67, 139, 121,  75,
  122,  86, 297,  49, 120,
  177, 115,  16,  85, 127,
  341, 253,  44, 110, 229
)

a.plus.b <- c(
  175,  78, 120, 111,  74,
   74,  86, 171,  33,  93,
  110, 112,  25,  52,  71,
  126, 117,  30, 108, 123
)

a.div.b <- c(
  361, 146, 183, 244,  63,
  114, 257, 377, 107, 239,
  339, 321,  91, 158, 188,
  327, 238,  72, 130, 378
)

pander(data.frame(aa.1.1, alpha, beta, a.plus.b, a.div.b))
aa.1.1 alpha beta a.plus.b a.div.b
A 285 203 175 361
R 53 67 78 146
N 97 139 120 183
D 163 121 111 244
C 22 75 74 63
Q 67 122 74 114
E 134 86 86 257
G 197 297 171 377
H 111 49 33 107
I 91 120 93 239
L 221 177 110 339
K 249 115 112 321
M 48 16 25 91
F 123 85 52 158
P 82 127 71 188
S 122 341 126 327
T 119 253 117 238
W 33 44 30 72
Y 63 110 108 130
V 167 229 123 378

Convert to frequencies

# Turn each fold-class count vector into proportions that sum to 1.
alpha.prop <- prop.table(alpha)
beta.prop <- prop.table(beta)
a.plus.b.prop <- prop.table(a.plus.b)
# Note: a.div.b is overwritten in place, so from here on it holds
# proportions rather than raw counts.
a.div.b <- prop.table(a.div.b)

# One row per amino acid (labelled with its one-letter code), one column per
# fold class.
aa.prop <- data.frame(alpha.prop, beta.prop, a.plus.b.prop, a.div.b,
                      row.names = aa.1.1)

pander::pander(aa.prop)
  alpha.prop beta.prop a.plus.b.prop a.div.b
A 0.1165 0.07313 0.09264 0.08331
R 0.02166 0.02414 0.04129 0.03369
N 0.03964 0.05007 0.06353 0.04223
D 0.06661 0.04359 0.05876 0.05631
C 0.008991 0.02702 0.03917 0.01454
Q 0.02738 0.04395 0.03917 0.02631
E 0.05476 0.03098 0.04553 0.05931
G 0.08051 0.107 0.09052 0.08701
H 0.04536 0.01765 0.01747 0.02469
I 0.03719 0.04323 0.04923 0.05516
L 0.09031 0.06376 0.05823 0.07824
K 0.1018 0.04143 0.05929 0.07408
M 0.01962 0.005764 0.01323 0.021
F 0.05027 0.03062 0.02753 0.03646
P 0.03351 0.04575 0.03759 0.04339
S 0.04986 0.1228 0.0667 0.07547
T 0.04863 0.09114 0.06194 0.05493
W 0.01349 0.01585 0.01588 0.01662
Y 0.02575 0.03963 0.05717 0.03
V 0.06825 0.08249 0.06511 0.08724

Determine the number of each amino acid in APOBEC3G

APOBEC3G.table <- table(APOBEC3G_human_vector)/length(APOBEC3G_human_vector)

Convert a table into a vector

# Flatten a table into a plain vector whose element names are the labels of
# the table's first dimension.
#
# table_x: a table object (as produced by table()).
# Returns: the table's values as a vector, named by its first dimnames entry.
table_to_vector <- function(table_x) {
  setNames(as.vector(table_x), dimnames(table_x)[[1]])
}


APOBEC3G_human_table <- table(APOBEC3G_human_vector)/length(APOBEC3G_human_vector)
APOBEC3G.human.aa.freq <- table_to_vector(APOBEC3G_human_table) 
pander(APOBEC3G.human.aa.freq) 
Table continues below
A C D E F G H I
0.03906 0.03906 0.05729 0.0625 0.07031 0.03646 0.04167 0.03906
Table continues below
K L M N P Q R S
0.05208 0.08333 0.02865 0.04167 0.05208 0.04427 0.07812 0.05208
T V W Y
0.05469 0.03906 0.03646 0.05208

Check for the presence of “U”

aa.names <- names(APOBEC3G.human.aa.freq)
i.U <- which(aa.names == "U")
aa.names[i.U]
## character(0)

Add data on APOBEC3G to the amino acid frequency table

# Add the APOBEC3G frequencies as a column, matched to rows BY AMINO-ACID NAME.
# table() returns residues in alphabetical order (A, C, D, E, ...), while the
# rows of aa.prop follow aa.1.1 (A, R, N, D, ...). A positional assignment
# pairs each frequency with the wrong residue, so index by row name instead.
aa.prop$APOBEC3G.human.aa.freq <-
  unname(APOBEC3G.human.aa.freq[row.names(aa.prop)])
# A residue that never occurs in the protein is absent from the table and
# comes back as NA from the lookup above; its frequency is 0.
aa.prop$APOBEC3G.human.aa.freq[is.na(aa.prop$APOBEC3G.human.aa.freq)] <- 0
# NOTE(review): every similarity/distance computed from this column changes
# with this fix; the rendered output must be regenerated.
pander::pander(aa.prop)
  alpha.prop beta.prop a.plus.b.prop a.div.b APOBEC3G.human.aa.freq
A 0.1165 0.07313 0.09264 0.08331 0.03906
R 0.02166 0.02414 0.04129 0.03369 0.03906
N 0.03964 0.05007 0.06353 0.04223 0.05729
D 0.06661 0.04359 0.05876 0.05631 0.0625
C 0.008991 0.02702 0.03917 0.01454 0.07031
Q 0.02738 0.04395 0.03917 0.02631 0.03646
E 0.05476 0.03098 0.04553 0.05931 0.04167
G 0.08051 0.107 0.09052 0.08701 0.03906
H 0.04536 0.01765 0.01747 0.02469 0.05208
I 0.03719 0.04323 0.04923 0.05516 0.08333
L 0.09031 0.06376 0.05823 0.07824 0.02865
K 0.1018 0.04143 0.05929 0.07408 0.04167
M 0.01962 0.005764 0.01323 0.021 0.05208
F 0.05027 0.03062 0.02753 0.03646 0.04427
P 0.03351 0.04575 0.03759 0.04339 0.07812
S 0.04986 0.1228 0.0667 0.07547 0.05208
T 0.04863 0.09114 0.06194 0.05493 0.05469
W 0.01349 0.01585 0.01588 0.01662 0.03906
Y 0.02575 0.03963 0.05717 0.03 0.03646
V 0.06825 0.08249 0.06511 0.08724 0.05208

Functions to calculate similarities

# Similarity of two numeric vectors: the sum of element-wise products divided
# by the square root of the product of their sums of squares.
#
# x, y: numeric vectors of equal length.
# Returns: a single numeric value.
chou_cor <- function(x, y) {
  cross_product <- sum(x * y)
  scale_factor <- sqrt(sum(x^2) * sum(y^2))
  cross_product / scale_factor
}

# Cosine similarity: dot product of the two vectors divided by the product of
# their Euclidean lengths.
#
# z.1, z.2: numeric vectors of equal length.
# Returns: a single numeric value.
chou_cosine <- function(z.1, z.2) {
  euclid_length <- function(v) sqrt(sum(v^2))
  sum(z.1 * z.2) / (euclid_length(z.1) * euclid_length(z.2))
}

Calculate correlation between each column

# Compare the APOBEC3G composition (5th column of aa.prop) with each of the
# four fold-class columns, looking the columns up by name.
corr.alpha <- with(aa.prop, chou_cor(APOBEC3G.human.aa.freq, alpha.prop))
corr.beta  <- with(aa.prop, chou_cor(APOBEC3G.human.aa.freq, beta.prop))
corr.apb   <- with(aa.prop, chou_cor(APOBEC3G.human.aa.freq, a.plus.b.prop))
corr.adb   <- with(aa.prop, chou_cor(APOBEC3G.human.aa.freq, a.div.b))
# Same comparisons using the cosine function.
cos.alpha <- with(aa.prop, chou_cosine(APOBEC3G.human.aa.freq, alpha.prop))
cos.beta  <- with(aa.prop, chou_cosine(APOBEC3G.human.aa.freq, beta.prop))
cos.apb   <- with(aa.prop, chou_cosine(APOBEC3G.human.aa.freq, a.plus.b.prop))
cos.adb   <- with(aa.prop, chou_cosine(APOBEC3G.human.aa.freq, a.div.b))

Calculate cosine similarity

# Cosine similarity between the APOBEC3G composition (5th column of aa.prop)
# and each fold-class column, with columns referenced by name.
cos.alpha <- with(aa.prop, chou_cosine(APOBEC3G.human.aa.freq, alpha.prop))
cos.beta  <- with(aa.prop, chou_cosine(APOBEC3G.human.aa.freq, beta.prop))
cos.apb   <- with(aa.prop, chou_cosine(APOBEC3G.human.aa.freq, a.plus.b.prop))
cos.adb   <- with(aa.prop, chou_cosine(APOBEC3G.human.aa.freq, a.div.b))

Calculate distance

aa.prop.flipped <- t(aa.prop)
round(aa.prop.flipped,2) 
##                           A    R    N    D    C    Q    E    G    H    I    L
## alpha.prop             0.12 0.02 0.04 0.07 0.01 0.03 0.05 0.08 0.05 0.04 0.09
## beta.prop              0.07 0.02 0.05 0.04 0.03 0.04 0.03 0.11 0.02 0.04 0.06
## a.plus.b.prop          0.09 0.04 0.06 0.06 0.04 0.04 0.05 0.09 0.02 0.05 0.06
## a.div.b                0.08 0.03 0.04 0.06 0.01 0.03 0.06 0.09 0.02 0.06 0.08
## APOBEC3G.human.aa.freq 0.04 0.04 0.06 0.06 0.07 0.04 0.04 0.04 0.05 0.08 0.03
##                           K    M    F    P    S    T    W    Y    V
## alpha.prop             0.10 0.02 0.05 0.03 0.05 0.05 0.01 0.03 0.07
## beta.prop              0.04 0.01 0.03 0.05 0.12 0.09 0.02 0.04 0.08
## a.plus.b.prop          0.06 0.01 0.03 0.04 0.07 0.06 0.02 0.06 0.07
## a.div.b                0.07 0.02 0.04 0.04 0.08 0.05 0.02 0.03 0.09
## APOBEC3G.human.aa.freq 0.04 0.05 0.04 0.08 0.05 0.05 0.04 0.04 0.05

Get distance matrix

dist(aa.prop.flipped, method = "euclidean")
##                        alpha.prop  beta.prop a.plus.b.prop    a.div.b
## beta.prop              0.13342098                                    
## a.plus.b.prop          0.09281824 0.08289406                         
## a.div.b                0.06699039 0.08659174    0.06175113           
## APOBEC3G.human.aa.freq 0.16154202 0.15384231    0.12227476 0.13300972

Individual distances

# Euclidean distance between the APOBEC3G row and each fold-class row of the
# transposed table, selecting the two rows by name instead of by position.
dist.alpha <- dist(
  aa.prop.flipped[c("alpha.prop", "APOBEC3G.human.aa.freq"), ],
  method = "euclidean"
)
dist.beta <- dist(
  aa.prop.flipped[c("beta.prop", "APOBEC3G.human.aa.freq"), ],
  method = "euclidean"
)
dist.apb <- dist(
  aa.prop.flipped[c("a.plus.b.prop", "APOBEC3G.human.aa.freq"), ],
  method = "euclidean"
)
dist.adb <- dist(
  aa.prop.flipped[c("a.div.b", "APOBEC3G.human.aa.freq"), ],
  method = "euclidean"
)

Compile the information. Rounding makes it easier to read

# Fold classes, in the same order as the similarity/distance values below.
fold.type <- c("alpha", "beta", "alpha plus beta", "alpha/beta")

# Similarity and distance of APOBEC3G to each fold class, rounded for display.
corr.sim <- round(c(corr.alpha, corr.beta, corr.apb, corr.adb), 5)
cosine.sim <- round(c(cos.alpha, cos.beta, cos.apb, cos.adb), 5)
Euclidean.dist <- round(c(dist.alpha, dist.beta, dist.apb, dist.adb), 5)

# Flag the best-matching fold class. The flags are derived from the values
# (highest cosine similarity, smallest distance) rather than hard-coded to a
# fixed row, so they stay correct if the underlying frequencies change.
sim.sum <- rep("", length(fold.type))
sim.sum[which.max(cosine.sim)] <- "most.sim"
dist.sum <- rep("", length(fold.type))
dist.sum[which.min(Euclidean.dist)] <- "min.dist"

df <- data.frame(fold.type,
                 corr.sim,
                 cosine.sim,
                 Euclidean.dist,
                 sim.sum,
                 dist.sum)

Display output

pander::pander(df)
fold.type corr.sim cosine.sim Euclidean.dist sim.sum dist.sum
alpha 0.7882 0.7882 0.1615
beta 0.8124 0.8124 0.1538
alpha plus beta 0.8688 0.8688 0.1223 most.sim min.dist
alpha/beta 0.8485 0.8485 0.133

Percent Identity Comparisons (PID)

# Label each sequence with the accession it was requested under. The names
# come from RefSeq_Accession_Numbers (the ids passed to the fetch call)
# instead of ten hand-copied literals, so the labels cannot drift from the ids.
# NOTE(review): assumes the fetched list is in the same order as the ids, as
# the original hard-coded names also did.
stopifnot(length(APOBEC3G_list) == length(RefSeq_Accession_Numbers))
names(APOBEC3G_list) <- RefSeq_Accession_Numbers
  
APOBEC3G_list[1]
## $NP_068594
## [1] "MKPHFRNTVERMYRDTFSYNFYNRPILSRRNTVWLCYEVKTKGPSRPPLDAKIFRGQVYSELKYHPEMRFFHWFSKWRKLHRDQEYEVTWYISWSPCTKCTRDMATFLAEDPKVTLTIFVARLYYFWDPDYQEALRSLCQKRDGPRATMKIMNYDEFQHCWSKFVYSQRELFEPWNNLPKYYILLHIMLGEILRHSMDPPTFTFNFNNEPWVRGRHETYLCYEVERMHNDTWVLLNQRRGFLCNQAPHKHGFLEGRHAELCFLDVIPFWKLDLDQDYRVTCFTSWSPCFSCAQEMAKFISKNKHVSLCIFTARIYDDQGRCQEGLRTLAEAGAKISIMTYSEFKHCWDTFVDHQGCPFQPWDGLDEHSQDLSGRLRAILQNQEN"
APOBEC3G_list
## $NP_068594
## [1] "MKPHFRNTVERMYRDTFSYNFYNRPILSRRNTVWLCYEVKTKGPSRPPLDAKIFRGQVYSELKYHPEMRFFHWFSKWRKLHRDQEYEVTWYISWSPCTKCTRDMATFLAEDPKVTLTIFVARLYYFWDPDYQEALRSLCQKRDGPRATMKIMNYDEFQHCWSKFVYSQRELFEPWNNLPKYYILLHIMLGEILRHSMDPPTFTFNFNNEPWVRGRHETYLCYEVERMHNDTWVLLNQRRGFLCNQAPHKHGFLEGRHAELCFLDVIPFWKLDLDQDYRVTCFTSWSPCFSCAQEMAKFISKNKHVSLCIFTARIYDDQGRCQEGLRTLAEAGAKISIMTYSEFKHCWDTFVDHQGCPFQPWDGLDEHSQDLSGRLRAILQNQEN"
## 
## $NP_001009001
## [1] "MKPQFRNPVERMYQDTFSDNFYNRPILSRRNTVWLCYEVKTKGPSRPPLDAKIFRGQVYSKLKYHPEMRFFHWFSKWRKLHRDQEYEVTWYISWSPCTKCTRDVATFLAEDPKVTLTIFVARLYYFWDPDYQEALRSLCQKRDGPRATMKIMNYDEFQHCWSKFVYSQRELFEPWNNLPKYYILLHIMLGEILRHSMDPPTFTSNFNNELWVRGRHETYLCYEVERLHNDTWVLLNQRRGFLCNQAPHKHGFLEGRHAELCFLDVIPFWKLDLHQDYRVTCFTSWSPCFSCAQEMAKFISNNKHVSLCIFAARIYDDQGRCQEGLRTLAKAGAKISIMTYSEFKHCWDTFVDHQGCPFQPWDGLEEHSQALSERLQAILQNQGN"
## 
## $NP_001332812
## [1] "MVKRMKADIFVSNFNNRPILSGRNTVWLCCEVNTKDPSGPPLDAKIFRGKVYSKAKYHPEMRFLHWFRKWRQLHRDQEYEVTWYVSWSPCTGCANSVATFLAEDPKVTLTIFVARLYYFWKPDYQEALRVLCQKRGSPHATMKIMNYNEFQHCWNKFVRGRREPFEPWENLPKHYTLLHATLGELLRHLMDPGTFTSNFYNKPWVSGQHETYLCYKVERLHNGTWVPLNQHRGFLRNQAPDIHGFPKGRHAELCFLDLIPFWKLDGQQYRVTCFTSWSPCFSCAQEMAKFISNNEHVSLCIFAARIYDDQGRCQEGLRTLHRDGAKIAMMNYSEFEYCWDTFVDRQGRPFQPWDGLDEHSQDLSGRLRAILQNQGN"
## 
## $NP_001279005
## [1] "MVERMKPGIFVYYFNNRPILSGRNIVWLCCEVKTKDPSGPPLDANIFQGELYPEAKDHPEMKFLHWFRKWRQLHRDQEYEVTWYVSWSPCTRCANSVATFLAEDPKVTLTIFVARLYYFWKPHYQEALRILCQKRGGPHATMKIMNYNEFQHCWNEFVDGQGKPFKPRKNLPKHYTLLHATLGELLRHVMDPGTFTSNFNNKPWVSGQRETYLCYKVERSHNDTWVLLNQHRGFLRNQAPDRHGFPKGRHAELCFLDLIPFWKLDDQQYRVTCFTSWSPCFSCAQKMAKFISNNKHVSLCIFAARIYDDQGRCQEGLRTLHRDGAKIAVMNYSEFEYCWDTFVDRQGRPFQPWDGLDEHSQALSGRLRAILQNQGN"
## 
## $NP_001292891
## [1] "MVEPMKTGIFVSNFNNKPILSGRNTVWLCCEVKTKDPSGPPLDAKIFRGKVYSKAKYHPEMRFLRWFLKWRQLHRDQEYEVTWYVSWSPCTGCANSVATFLAKDPKVTLTIFVARLYYFWKPDYQEALRVLCQKRGSPHATMKIMNYNEFQHCWNKFVRGRREPFEPWENLPKHYTLLHATLGELLRHLMDPGTFTSNFNNKLWVSGQHETYLCYKVERPHNDTWVLLNQHRGFLQNQAPDIHGFPKGRHAELCFLDLIPLWKLDGQQYRVTCFTSWSPCFNCAQEMAKFISNNKHVSLRIFAARIYDDQGRCQEGLRTLHRDGAKIAMMNYSEFEYCWDTFVDRQGRPFQPWDGLDEHSQALSERLRAILQNQGN"
## 
## $NP_001332845
## [1] "MVKRMKPGIFVSNFNNKPILSGRNTVWLCCEVKTKDPSGPPLDAKIFRDKVYSKAKYHPEMRFLRWFRKWRQLHRDQEYEVTWYVSWSPCTGCANSVATFLAEDPKVTLTIFVARLYYFWKPDYQEALRVLCQKRGSPHATMKIMNYNEFQHCWNKFVRGRREPFEPWENLPKHYTLLHATLGELLRHLMDPGTFTSNFNNKLWVSGQHETYLCYKVERPHNDTWVLLNQHRGFLQNQAPDIHGFPKGRHAELCFLDLIPFWKLDDQQYRVTCFTSWSPCFNCAQEMAKFISDNKHVSLRIFAARIYDDQGRCQEGLRTLHRDGAKIAMMNYSEFEYCWDTFVDRQGRPFQPWDGLDEHSQDLSGRLRAILQNQGN"
## 
## $NP_001185622
## [1] "MNPQIRNMVEPMDPRTFVSNFNNRPILSGLNTVWLCCEVKTKDPSGPPLDAKIFQGKVLRSKAKYHPEMRFLQWFREWRQLHHDQEYKVTWYVSWSPCTRCANSVATFLAKDPKVTLTIFVARLYYFWKPNYQQALRILCQKRDGPHATMKIMNYNEFQDCWNKFVDGRGKPFKPWNNLPKHYTLLQATLGELLRHLMDPGTFTSNFNNKPWVSGQHETYLCYKVERLHNDTWVPLNQHRGFLRNQAPNIHGFPKGRHAELCFLDLIPFWKLDGQQYRVTCFTSWSPCFSCAQEMAKFISNNEHVSLCIFAARIYDDQGRYQEGLRTLHRDGAKIAMMNYSEFEYCWDTFVDCQGCPFQPWDGLDEHSQALSERLRAILQNQGN"
## 
## $XP_034804992
## [1] "MKPHFRNPVERMYQDTFSDNFYNRPILSRRNTVWLCYEVKTKGPSRPPLDAKIFRGQVYSKLKYHPEMRFFHWFSKWRKLHRDQEYEVTWYISWSPCTKCTRDVATFLAEDPKVTLTIFVARLYYFWDPDYQEALRSLCQKRDGPRATMKIMNYDEFQHCWSKFVYSQRELFEPWNNLPKYYILLHIMLGEILRHSMDPPTFTSNFNNELWVRGRHETYLCYEVERLHNDTRVLLNQRRGFLCNQAPHKHGFLEGRHAELCFLDVIPFWKLDLHQDYRVTCFTSWSPCFSCAQEMAKFISNNKHVSLCIFAARIYDDQGRCQEGLRTLAKAGAEISIMTYSEFKHCWDTFVDHQGCPFQPWDGLEEHSQALSERLQAILQNQGN"
## 
## $XP_024095464
## [1] "MLQTKILVRTSRPMMNPQFRNMVDGMDPHKFSYNFKNRPILSRRNTVWLCYEVKTKGPSRPPLDAKIFRGQVYFELKNHPEMRFFHWFSKWRTLHRDQECEVTWYMSWSPCTKCTRNVATFLAEDPKVTLTIFVARLYYFWDPDYQEALRSLCRERDGPRANMKIMNYDEFQHCWNKFVYSQRELFEPWNNLPKYYIVLHIILGEILRHSMDPLTFTSNFNNEPCVEGRHETYLCYKVERLHNDTWVLLNQRRGFLCNQAPAIHGFPEGRHAELCFLDVIPFWKLDGKQRYRVTCFTSWSPCFRCAQEMAKFISNNQHVSLCIFAARIYDDQGRCKEGLRTLDEAEAKISIMTYSEFQHCWDTFVDHQGRPFQPWDGLEEHSEAWSGKLQAILQNQGN"
## 
## $XP_025254494
## [1] "MKPQFRNTVERMYRDTFFYNFNNRPILSRRNTVWLCYEVKTRGPSMPTWGTKIFRGQVYSKAKYHPEMRFLHWFRKWRQLHRDQEYEVTWYVSWSPCTGCANSVATFLAEDPKVTLTIFVARLYYFWKPDYQEALRVLCQKRGSPHATMKIMNYNEFQHCWNKFVRGRREPFEPWENLPKHYTLLHATLGELLRHLMDPGMFTSNFYNKSWVSGQHETYLCYKVERPHNDTWVLLNQHRGFLRNQAPDIHGFPKGRHAELCFLDLIPFWKLDGQQYRVTCFTSWSPCFNCAQEMAKFISNNKHVSLCIFAARIYDDQGRCQEGLRTLHRDGAKIAMMNYSEFEYCWDTFVDRQGRPFQPWDGLDEHSQALSERLRAILQNQGN"
APOBEC3G_vector <- unlist(APOBEC3G_list)
names(APOBEC3G_vector) <- names(APOBEC3G_list)

APOBEC3G_vector[1]
##                                                                                                                                                                                                                                                                                                                                                                                          NP_068594 
## "MKPHFRNTVERMYRDTFSYNFYNRPILSRRNTVWLCYEVKTKGPSRPPLDAKIFRGQVYSELKYHPEMRFFHWFSKWRKLHRDQEYEVTWYISWSPCTKCTRDMATFLAEDPKVTLTIFVARLYYFWDPDYQEALRSLCQKRDGPRATMKIMNYDEFQHCWSKFVYSQRELFEPWNNLPKYYILLHIMLGEILRHSMDPPTFTFNFNNEPWVRGRHETYLCYEVERMHNDTWVLLNQRRGFLCNQAPHKHGFLEGRHAELCFLDVIPFWKLDLDQDYRVTCFTSWSPCFSCAQEMAKFISKNKHVSLCIFTARIYDDQGRCQEGLRTLAEAGAKISIMTYSEFKHCWDTFVDHQGCPFQPWDGLDEHSQDLSGRLRAILQNQEN"

PID Table

# PID - Human vs Chimpanzee
align01.02 <- Biostrings::pairwiseAlignment(APOBEC3G_vector[1], APOBEC3G_vector[2])
pid(align01.02)
## [1] 95.3125
# PID - Human vs Olive Baboon 
align01.03 <- Biostrings::pairwiseAlignment(APOBEC3G_vector[1], APOBEC3G_vector[3])
pid(align01.03)
## [1] 78.38542
# PID - Human vs Green Monkey
align01.04 <- Biostrings::pairwiseAlignment(APOBEC3G_vector[1], APOBEC3G_vector[4])
pid(align01.04)
## [1] 76.5625
# PID - Chimpanzee vs Olive Baboon 
align02.03 <- Biostrings::pairwiseAlignment(APOBEC3G_vector[2], APOBEC3G_vector[3])
pid(align02.03)
## [1] 78.90625
# PID - Chimpanzee vs Green Monkey
align02.04 <- Biostrings::pairwiseAlignment(APOBEC3G_vector[2], APOBEC3G_vector[4])
pid(align02.04)
## [1] 76.30208
# PID - Olive Baboon vs Green Monkey
align03.04 <- Biostrings::pairwiseAlignment(APOBEC3G_vector[3], APOBEC3G_vector[4])
pid(align03.04)
## [1] 89.62766
# Lower-triangular percent-identity matrix for the four species compared.
# pid() reports percentages (e.g. 95.3125), so a sequence compared with itself
# is 100 on the diagonal, not 1. The upper triangle is left NA.
pid_val <- c(100, NA, NA, NA,
             pid(align01.02), 100, NA, NA,
             pid(align01.03), pid(align02.03), 100, NA,
             pid(align01.04), pid(align02.04), pid(align03.04), 100)

# Fill row by row so each group of four values above becomes one matrix row.
pid_mat <- matrix(pid_val, nrow = 4, byrow = TRUE)
row.names(pid_mat) <- c("Homo", "Pan", "Olive", "Green")
colnames(pid_mat) <- c("Homo", "Pan", "Olive", "Green")
pander::pander(pid_mat)  
  Homo Pan Olive Green
Homo 1 NA NA NA
Pan 95.31 1 NA NA
Olive 78.39 78.91 1 NA
Green 76.56 76.3 89.63 1

PID methods comparison

# Percent identity of the human-vs-chimpanzee alignment under each of the four
# PID definitions, evaluated in one pass over the method names.
pid_method <- paste0("PID", 1:4)
pid_value <- vapply(
  pid_method,
  function(pid_type) pid(align01.02, type = pid_type),
  numeric(1),
  USE.NAMES = FALSE
)
# Keep the individual results available under their original names.
pid1 <- pid_value[1]
pid2 <- pid_value[2]
pid3 <- pid_value[3]
pid4 <- pid_value[4]

# Denominator used by each definition, in the same order as pid_method.
denominator <- c("(aligned positions + internal gap positions)",
                 "(aligned positions)",
                 "(length shorter sequence)",
                 "(average length of the two sequences)")

pid_df <- data.frame(pid_method, pid_value, denominator)
pander::pander(pid_df)
pid_method pid_value denominator
PID1 95.31 (aligned positions + internal gap positions)
PID2 95.31 (aligned positions)
PID3 95.31 (length shorter sequence)
PID4 95.31 (average length of the two sequences)

Multiple Sequence Alignment

MSA Data Preparation

APOBEC3G_vector_ss <- Biostrings::AAStringSet(APOBEC3G_vector)

Build MSA

APOBEC3G_align <- msa(APOBEC3G_vector_ss, method = "ClustalW")
## use default substitution matrix

Clean & Set up MSA

class(APOBEC3G_align)
## [1] "MsaAAMultipleAlignment"
## attr(,"package")
## [1] "msa"
is(APOBEC3G_align)
## [1] "MsaAAMultipleAlignment" "AAMultipleAlignment"    "MsaMetaData"           
## [4] "MultipleAlignment"
APOBEC3G_align
## CLUSTAL 2.1  
## 
## Call:
##    msa(APOBEC3G_vector_ss, method = "ClustalW")
## 
## MsaAAMultipleAlignment with 10 rows and 399 columns
##      aln                                                   names
##  [1] ---------------------MVEP...PWDGLDEHSQALSERLRAILQNQGN NP_001292891
##  [2] ---------------------MVKR...PWDGLDEHSQDLSGRLRAILQNQGN NP_001332845
##  [3] ---------------------MVKR...PWDGLDEHSQDLSGRLRAILQNQGN NP_001332812
##  [4] ---------------------MVER...PWDGLDEHSQALSGRLRAILQNQGN NP_001279005
##  [5] --------------MNPQIRNMVEP...PWDGLDEHSQALSERLRAILQNQGN NP_001185622
##  [6] --------------MKPQFRNTVER...PWDGLDEHSQALSERLRAILQNQGN XP_025254494
##  [7] --------------MKPQFRNPVER...PWDGLEEHSQALSERLQAILQNQGN NP_001009001
##  [8] --------------MKPHFRNPVER...PWDGLEEHSQALSERLQAILQNQGN XP_034804992
##  [9] --------------MKPHFRNTVER...PWDGLDEHSQDLSGRLRAILQNQEN NP_068594
## [10] MLQTKILVRTSRPMMNPQFRNMVDG...PWDGLEEHSEAWSGKLQAILQNQGN XP_024095464
##  Con --------------M?P?FRNMVER...PWDGLDEHSQALS?RLRAILQNQGN Consensus
# Relabel the object with the class "AAMultipleAlignment" (one of the classes
# is() reports for it) instead of "MsaAAMultipleAlignment".
# NOTE(review): presumably required so that ggmsa() accepts the object -- confirm.
# Keep this statement before the conversion below; the order is left untouched.
class(APOBEC3G_align) <- "AAMultipleAlignment"

# Convert the alignment into the "seqinr::alignment" representation, which is
# what the seqinr distance step and compbio4all::print_msa() are given.
APOBEC3G_align_seqinr <- msaConvert(APOBEC3G_align, type = "seqinr::alignment")

# Print the whole alignment to the console, one block of columns at a time.
compbio4all::print_msa(APOBEC3G_align_seqinr)
## [1] "---------------------MVEPMKTGIFVSNFNNKPILSGRNTVWLCCEVKTKDPSG 0"
## [1] "---------------------MVKRMKPGIFVSNFNNKPILSGRNTVWLCCEVKTKDPSG 0"
## [1] "---------------------MVKRMKADIFVSNFNNRPILSGRNTVWLCCEVNTKDPSG 0"
## [1] "---------------------MVERMKPGIFVYYFNNRPILSGRNIVWLCCEVKTKDPSG 0"
## [1] "--------------MNPQIRNMVEPMDPRTFVSNFNNRPILSGLNTVWLCCEVKTKDPSG 0"
## [1] "--------------MKPQFRNTVERMYRDTFFYNFNNRPILSRRNTVWLCYEVKTRGPSM 0"
## [1] "--------------MKPQFRNPVERMYQDTFSDNFYNRPILSRRNTVWLCYEVKTKGPSR 0"
## [1] "--------------MKPHFRNPVERMYQDTFSDNFYNRPILSRRNTVWLCYEVKTKGPSR 0"
## [1] "--------------MKPHFRNTVERMYRDTFSYNFYNRPILSRRNTVWLCYEVKTKGPSR 0"
## [1] "MLQTKILVRTSRPMMNPQFRNMVDGMDPHKFSYNFKNRPILSRRNTVWLCYEVKTKGPSR 0"
## [1] " "
## [1] "PPLDAKIFRG-KVYSKAKYHPEMRFLRWFLKWRQLHRDQEYEVTWYVSWSPCTGCANSVA 0"
## [1] "PPLDAKIFRD-KVYSKAKYHPEMRFLRWFRKWRQLHRDQEYEVTWYVSWSPCTGCANSVA 0"
## [1] "PPLDAKIFRG-KVYSKAKYHPEMRFLHWFRKWRQLHRDQEYEVTWYVSWSPCTGCANSVA 0"
## [1] "PPLDANIFQG-ELYPEAKDHPEMKFLHWFRKWRQLHRDQEYEVTWYVSWSPCTRCANSVA 0"
## [1] "PPLDAKIFQGKVLRSKAKYHPEMRFLQWFREWRQLHHDQEYKVTWYVSWSPCTRCANSVA 0"
## [1] "PTWGTKIFRG-QVYSKAKYHPEMRFLHWFRKWRQLHRDQEYEVTWYVSWSPCTGCANSVA 0"
## [1] "PPLDAKIFRG-QVYSKLKYHPEMRFFHWFSKWRKLHRDQEYEVTWYISWSPCTKCTRDVA 0"
## [1] "PPLDAKIFRG-QVYSKLKYHPEMRFFHWFSKWRKLHRDQEYEVTWYISWSPCTKCTRDVA 0"
## [1] "PPLDAKIFRG-QVYSELKYHPEMRFFHWFSKWRKLHRDQEYEVTWYISWSPCTKCTRDMA 0"
## [1] "PPLDAKIFRG-QVYFELKNHPEMRFFHWFSKWRTLHRDQECEVTWYMSWSPCTKCTRNVA 0"
## [1] " "
## [1] "TFLAKDPKVTLTIFVARLYYFWKPDYQEALRVLCQKRGSPHATMKIMNYNEFQHCWNKFV 0"
## [1] "TFLAEDPKVTLTIFVARLYYFWKPDYQEALRVLCQKRGSPHATMKIMNYNEFQHCWNKFV 0"
## [1] "TFLAEDPKVTLTIFVARLYYFWKPDYQEALRVLCQKRGSPHATMKIMNYNEFQHCWNKFV 0"
## [1] "TFLAEDPKVTLTIFVARLYYFWKPHYQEALRILCQKRGGPHATMKIMNYNEFQHCWNEFV 0"
## [1] "TFLAKDPKVTLTIFVARLYYFWKPNYQQALRILCQKRDGPHATMKIMNYNEFQDCWNKFV 0"
## [1] "TFLAEDPKVTLTIFVARLYYFWKPDYQEALRVLCQKRGSPHATMKIMNYNEFQHCWNKFV 0"
## [1] "TFLAEDPKVTLTIFVARLYYFWDPDYQEALRSLCQKRDGPRATMKIMNYDEFQHCWSKFV 0"
## [1] "TFLAEDPKVTLTIFVARLYYFWDPDYQEALRSLCQKRDGPRATMKIMNYDEFQHCWSKFV 0"
## [1] "TFLAEDPKVTLTIFVARLYYFWDPDYQEALRSLCQKRDGPRATMKIMNYDEFQHCWSKFV 0"
## [1] "TFLAEDPKVTLTIFVARLYYFWDPDYQEALRSLCRERDGPRANMKIMNYDEFQHCWNKFV 0"
## [1] " "
## [1] "RGRREPFEPWENLPKHYTLLHATLGELLRHLMDPGTFTSNFNNKLWVSGQHETYLCYKVE 0"
## [1] "RGRREPFEPWENLPKHYTLLHATLGELLRHLMDPGTFTSNFNNKLWVSGQHETYLCYKVE 0"
## [1] "RGRREPFEPWENLPKHYTLLHATLGELLRHLMDPGTFTSNFYNKPWVSGQHETYLCYKVE 0"
## [1] "DGQGKPFKPRKNLPKHYTLLHATLGELLRHVMDPGTFTSNFNNKPWVSGQRETYLCYKVE 0"
## [1] "DGRGKPFKPWNNLPKHYTLLQATLGELLRHLMDPGTFTSNFNNKPWVSGQHETYLCYKVE 0"
## [1] "RGRREPFEPWENLPKHYTLLHATLGELLRHLMDPGMFTSNFYNKSWVSGQHETYLCYKVE 0"
## [1] "YSQRELFEPWNNLPKYYILLHIMLGEILRHSMDPPTFTSNFNNELWVRGRHETYLCYEVE 0"
## [1] "YSQRELFEPWNNLPKYYILLHIMLGEILRHSMDPPTFTSNFNNELWVRGRHETYLCYEVE 0"
## [1] "YSQRELFEPWNNLPKYYILLHIMLGEILRHSMDPPTFTFNFNNEPWVRGRHETYLCYEVE 0"
## [1] "YSQRELFEPWNNLPKYYIVLHIILGEILRHSMDPLTFTSNFNNEPCVEGRHETYLCYKVE 0"
## [1] " "
## [1] "RPHNDTWVLLNQHRGFLQNQAPDIHGFPKGRHAELCFLDLIPLWKLDGQQ-YRVTCFTSW 0"
## [1] "RPHNDTWVLLNQHRGFLQNQAPDIHGFPKGRHAELCFLDLIPFWKLDDQQ-YRVTCFTSW 0"
## [1] "RLHNGTWVPLNQHRGFLRNQAPDIHGFPKGRHAELCFLDLIPFWKLDGQQ-YRVTCFTSW 0"
## [1] "RSHNDTWVLLNQHRGFLRNQAPDRHGFPKGRHAELCFLDLIPFWKLDDQQ-YRVTCFTSW 0"
## [1] "RLHNDTWVPLNQHRGFLRNQAPNIHGFPKGRHAELCFLDLIPFWKLDGQQ-YRVTCFTSW 0"
## [1] "RPHNDTWVLLNQHRGFLRNQAPDIHGFPKGRHAELCFLDLIPFWKLDGQQ-YRVTCFTSW 0"
## [1] "RLHNDTWVLLNQRRGFLCNQAPHKHGFLEGRHAELCFLDVIPFWKLDLHQDYRVTCFTSW 0"
## [1] "RLHNDTRVLLNQRRGFLCNQAPHKHGFLEGRHAELCFLDVIPFWKLDLHQDYRVTCFTSW 0"
## [1] "RMHNDTWVLLNQRRGFLCNQAPHKHGFLEGRHAELCFLDVIPFWKLDLDQDYRVTCFTSW 0"
## [1] "RLHNDTWVLLNQRRGFLCNQAPAIHGFPEGRHAELCFLDVIPFWKLDGKQRYRVTCFTSW 0"
## [1] " "
## [1] "SPCFNCAQEMAKFISNNKHVSLRIFAARIYDDQGRCQEGLRTLHRDGAKIAMMNYSEFEY 0"
## [1] "SPCFNCAQEMAKFISDNKHVSLRIFAARIYDDQGRCQEGLRTLHRDGAKIAMMNYSEFEY 0"
## [1] "SPCFSCAQEMAKFISNNEHVSLCIFAARIYDDQGRCQEGLRTLHRDGAKIAMMNYSEFEY 0"
## [1] "SPCFSCAQKMAKFISNNKHVSLCIFAARIYDDQGRCQEGLRTLHRDGAKIAVMNYSEFEY 0"
## [1] "SPCFSCAQEMAKFISNNEHVSLCIFAARIYDDQGRYQEGLRTLHRDGAKIAMMNYSEFEY 0"
## [1] "SPCFNCAQEMAKFISNNKHVSLCIFAARIYDDQGRCQEGLRTLHRDGAKIAMMNYSEFEY 0"
## [1] "SPCFSCAQEMAKFISNNKHVSLCIFAARIYDDQGRCQEGLRTLAKAGAKISIMTYSEFKH 0"
## [1] "SPCFSCAQEMAKFISNNKHVSLCIFAARIYDDQGRCQEGLRTLAKAGAEISIMTYSEFKH 0"
## [1] "SPCFSCAQEMAKFISKNKHVSLCIFTARIYDDQGRCQEGLRTLAEAGAKISIMTYSEFKH 0"
## [1] "SPCFRCAQEMAKFISNNQHVSLCIFAARIYDDQGRCKEGLRTLDEAEAKISIMTYSEFQH 0"
## [1] " "
## [1] "CWDTFVDRQGRPFQPWDGLDEHSQALSERLRAILQNQGN 21"
## [1] "CWDTFVDRQGRPFQPWDGLDEHSQDLSGRLRAILQNQGN 21"
## [1] "CWDTFVDRQGRPFQPWDGLDEHSQDLSGRLRAILQNQGN 21"
## [1] "CWDTFVDRQGRPFQPWDGLDEHSQALSGRLRAILQNQGN 21"
## [1] "CWDTFVDCQGCPFQPWDGLDEHSQALSERLRAILQNQGN 21"
## [1] "CWDTFVDRQGRPFQPWDGLDEHSQALSERLRAILQNQGN 21"
## [1] "CWDTFVDHQGCPFQPWDGLEEHSQALSERLQAILQNQGN 21"
## [1] "CWDTFVDHQGCPFQPWDGLEEHSQALSERLQAILQNQGN 21"
## [1] "CWDTFVDHQGCPFQPWDGLDEHSQDLSGRLRAILQNQEN 21"
## [1] "CWDTFVDHQGRPFQPWDGLEEHSEAWSGKLQAILQNQGN 21"
## [1] " "

Print MSA

# Draw a graphical view of the alignment restricted to positions 25 through 100
ggmsa::ggmsa(msa = APOBEC3G_align, start = 25, end = 100)

Distance Matrix

# Pairwise distances between all aligned sequences, based on sequence identity.
# NOTE(review): per the seqinr documentation the entries are the square root of
# (1 - fractional identity), not the raw difference -- confirm before
# interpreting the numbers as proportions.
APOBEC3G_dist <- seqinr::dist.alignment(
  x = APOBEC3G_align_seqinr,
  matrix = "identity"
)
# Confirm the result is a "dist" object
is(APOBEC3G_dist)
## [1] "dist"     "oldClass"
class(APOBEC3G_dist)
## [1] "dist"
# Rounded copy for display only; the unrounded object is kept for later steps
APOBEC3G_dist_round <- round(APOBEC3G_dist, digits = 3)
APOBEC3G_dist_round
##              NP_001292891 NP_001332845 NP_001332812 NP_001279005 NP_001185622
## NP_001332845        0.171                                                    
## NP_001332812        0.236        0.213                                       
## NP_001279005        0.334        0.322        0.322                          
## NP_001185622        0.326        0.346        0.318        0.338             
## XP_025254494        0.273        0.282        0.268        0.361        0.368
## NP_001009001        0.444        0.450        0.444        0.473        0.468
## XP_034804992        0.450        0.455        0.450        0.478        0.477
## NP_068594           0.464        0.455        0.450        0.473        0.487
## XP_024095464        0.475        0.475        0.467        0.487        0.493
##              XP_025254494 NP_001009001 XP_034804992 NP_068594
## NP_001332845                                                 
## NP_001332812                                                 
## NP_001279005                                                 
## NP_001185622                                                 
## XP_025254494                                                 
## NP_001009001        0.424                                    
## XP_034804992        0.434        0.088                       
## NP_068594           0.437        0.217        0.222          
## XP_024095464        0.468        0.346        0.357     0.364

Phylogenetic Trees from Sequences

# Estimate a neighbor-joining tree from the unrounded distance object
tree_subset <- ape::nj(X = APOBEC3G_dist)

# Draw the tree topology only (branch lengths are ignored) and add a caption.
# NOTE(review): the caption says "rooted", but no explicit rooting step is
# visible here -- confirm the wording matches how the tree was built.
ape::plot.phylo(
  x = tree_subset,
  use.edge.length = FALSE,
  main = "Phylogenetic Tree"
)
graphics::mtext(text = "APOBEC3G family gene tree - rooted, no branch lengths")