Gene APOBEC3G is a member of the cytidine deaminase gene family. It is one of seven related genes or pseudogenes found in a cluster, thought to result from gene duplication, on chromosome 22. The protein encoded by this gene catalyzes site-specific deamination of both RNA and single-stranded DNA. The encoded protein has been found to be a specific inhibitor of human immunodeficiency virus-1 (HIV-1) infectivity.
Refseq Gene: https://www.ncbi.nlm.nih.gov/gene/60489
Refseq Homologene: https://www.ncbi.nlm.nih.gov/homologene?LinkName=gene_homologene&from_uid=60489
library(BiocManager)
## Bioconductor version '3.13' is out-of-date; the current release version '3.14'
## is available with R version '4.1'; see https://bioconductor.org/install
library(compbio4all)
library(ggmsa)
## Registered S3 methods overwritten by 'ggalt':
## method from
## grid.draw.absoluteGrob ggplot2
## grobHeight.absoluteGrob ggplot2
## grobWidth.absoluteGrob ggplot2
## grobX.absoluteGrob ggplot2
## grobY.absoluteGrob ggplot2
library(rentrez)
library(seqinr)
library(ape)
##
## Attaching package: 'ape'
## The following objects are masked from 'package:seqinr':
##
## as.alignment, consensus
library(pander)
library(ggplot2)
library(msa)
## Loading required package: Biostrings
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind, colnames,
## dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
## union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
## Loading required package: stats4
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
## Loading required package: IRanges
## Loading required package: XVector
## Loading required package: GenomeInfoDb
##
## Attaching package: 'Biostrings'
## The following object is masked from 'package:ape':
##
## complement
## The following object is masked from 'package:seqinr':
##
## translate
## The following object is masked from 'package:base':
##
## strsplit
##
## Attaching package: 'msa'
## The following object is masked from 'package:BiocManager':
##
## version
library(Biostrings)
library(HGNChelper)
library(drawProteins)
Chromosome accession number (NC_000022.11) was obtained through the NCBI database by searching the respective gene name. Protein accession number (NP_068594.1) was located also on the NCBI website under the GenBank section, which provided more information about the function and purpose of the gene. Using a protein BLAST search, only experimental versions of the protein was found in bonobos, gorillas, and orangutans, and and none were identified in mice, frogs, and fruit flies. With a more exploratory search completed also using BLAST, accession numbers for Drill monkeys (Mandrillus leucophaeus) and chimpanzees (Pan troglodytes) in the mRNA form. A search was conducted further for mRNA accession numbers from the same gene in this method in other species.
RefSeq_Accession_Numbers <- c("NP_068594", "NP_001009001", "NP_001332812", "NP_001279005", "NP_001292891", "NP_001332845", "NP_001185622", "XP_034804992", "XP_024095464", "XP_025254494")
Uniprot_Accession_Numbers <- c("Q9HC16","Q7YR24","Q694B7","AEY75958","AGX93028", "NA", "AGE34493", "Q694B6", "PNJ49698", "NA")
PDB <- c("NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA")
Scientific_Name <- c("Homo sapiens", "Pan troglodytes", "Papio anubis", "Chlorocebus sabaeus", "C atys", "Mandrillus leucophaeus", "Macaca mulatta", "Pan paniscus", "Pongo abelii", "Theropithecus gelada")
Common_Name <- c("Human", "Chimpanzee", "Olive Baboon", "Green Monkey", "Sooty Mangabey", "Drill Monkey", "Rhesus monkey", "Pygmy Chimpanzee", "Sumatran Orangutan", "Gelada Monkey")
Gene_Name <- c("APOBEC3G", "APOBEC3G", "APOBEC3G", "APOBEC3G", "APOBEC3G", "APOBEC3G", "APOBEC3G", "APOBEC3G", "APOBEC3G", "APOBEC3G")
APOBEC3G <- data.frame(RefSeq_Accession_Numbers,Uniprot_Accession_Numbers,PDB,Scientific_Name, Common_Name,Gene_Name)
pander(APOBEC3G)
| RefSeq_Accession_Numbers | Uniprot_Accession_Numbers | PDB |
|---|---|---|
| NP_068594 | Q9HC16 | NA |
| NP_001009001 | Q7YR24 | NA |
| NP_001332812 | Q694B7 | NA |
| NP_001279005 | AEY75958 | NA |
| NP_001292891 | AGX93028 | NA |
| NP_001332845 | NA | NA |
| NP_001185622 | AGE34493 | NA |
| XP_034804992 | Q694B6 | NA |
| XP_024095464 | PNJ49698 | NA |
| XP_025254494 | NA | NA |
| Scientific_Name | Common_Name | Gene_Name |
|---|---|---|
| Homo sapiens | Human | APOBEC3G |
| Pan troglodytes | Chimpanzee | APOBEC3G |
| Papio anubis | Olive Baboon | APOBEC3G |
| Chlorocebus sabaeus | Green Monkey | APOBEC3G |
| C atys | Sooty Mangabey | APOBEC3G |
| Mandrillus leucophaeus | Drill Monkey | APOBEC3G |
| Macaca mulatta | Rhesus monkey | APOBEC3G |
| Pan paniscus | Pygmy Chimpanzee | APOBEC3G |
| Pongo abelii | Sumatran Orangutan | APOBEC3G |
| Theropithecus gelada | Gelada Monkey | APOBEC3G |
APOBEC3G_list <- entrez_fetch_list(db = "protein",
id = RefSeq_Accession_Numbers,
rettype = "fasta")
Number of FASTA files obtained
length(APOBEC3G_list)
## [1] 10
The first entry
APOBEC3G_list[[1]]
## [1] ">NP_068594.1 DNA dC-_dU-editing enzyme APOBEC-3G isoform 1 [Homo sapiens]\nMKPHFRNTVERMYRDTFSYNFYNRPILSRRNTVWLCYEVKTKGPSRPPLDAKIFRGQVYSELKYHPEMRF\nFHWFSKWRKLHRDQEYEVTWYISWSPCTKCTRDMATFLAEDPKVTLTIFVARLYYFWDPDYQEALRSLCQ\nKRDGPRATMKIMNYDEFQHCWSKFVYSQRELFEPWNNLPKYYILLHIMLGEILRHSMDPPTFTFNFNNEP\nWVRGRHETYLCYEVERMHNDTWVLLNQRRGFLCNQAPHKHGFLEGRHAELCFLDVIPFWKLDLDQDYRVT\nCFTSWSPCFSCAQEMAKFISKNKHVSLCIFTARIYDDQGRCQEGLRTLAEAGAKISIMTYSEFKHCWDTF\nVDHQGCPFQPWDGLDEHSQDLSGRLRAILQNQEN\n\n"
Remove FASTA header
for(i in 1:length(APOBEC3G_list)){APOBEC3G_list[[i]]<-compbio4all::fasta_cleaner(APOBEC3G_list[[i]], parse = F)}
#General Protein information
Protein diagram
APOBEC3G_json <- drawProteins::get_features("Q9HC16")
## [1] "Download has worked"
is(APOBEC3G_json)
## [1] "list" "vector" "list_OR_List" "vector_OR_Vector"
## [5] "vector_OR_factor"
my_prot_df <- drawProteins::feature_to_dataframe(APOBEC3G_json)
is(my_prot_df)
## [1] "data.frame" "list" "oldClass" "vector"
## [5] "list_OR_List" "vector_OR_Vector" "vector_OR_factor"
my_prot_df[,-2]
## type begin end length accession entryName taxid order
## featuresTemp CHAIN 1 384 383 Q9HC16 ABC3G_HUMAN 9606 1
## featuresTemp.1 DOMAIN 29 138 109 Q9HC16 ABC3G_HUMAN 9606 1
## featuresTemp.2 DOMAIN 214 328 114 Q9HC16 ABC3G_HUMAN 9606 1
## featuresTemp.3 REGION 1 60 59 Q9HC16 ABC3G_HUMAN 9606 1
## featuresTemp.4 REGION 209 336 127 Q9HC16 ABC3G_HUMAN 9606 1
## featuresTemp.5 REGION 213 215 2 Q9HC16 ABC3G_HUMAN 9606 1
## featuresTemp.6 REGION 313 320 7 Q9HC16 ABC3G_HUMAN 9606 1
## featuresTemp.7 ACT_SITE 259 259 0 Q9HC16 ABC3G_HUMAN 9606 1
## featuresTemp.8 METAL 65 65 0 Q9HC16 ABC3G_HUMAN 9606 1
## featuresTemp.9 METAL 97 97 0 Q9HC16 ABC3G_HUMAN 9606 1
## featuresTemp.10 METAL 100 100 0 Q9HC16 ABC3G_HUMAN 9606 1
## featuresTemp.11 METAL 257 257 0 Q9HC16 ABC3G_HUMAN 9606 1
## featuresTemp.12 METAL 288 288 0 Q9HC16 ABC3G_HUMAN 9606 1
## featuresTemp.13 METAL 291 291 0 Q9HC16 ABC3G_HUMAN 9606 1
## featuresTemp.14 SITE 244 244 0 Q9HC16 ABC3G_HUMAN 9606 1
## featuresTemp.15 MOD_RES 32 32 0 Q9HC16 ABC3G_HUMAN 9606 1
## featuresTemp.16 MOD_RES 218 218 0 Q9HC16 ABC3G_HUMAN 9606 1
## featuresTemp.17 VAR_SEQ 58 79 21 Q9HC16 ABC3G_HUMAN 9606 1
## featuresTemp.18 VAR_SEQ 80 384 304 Q9HC16 ABC3G_HUMAN 9606 1
## featuresTemp.19 VARIANT 186 186 0 Q9HC16 ABC3G_HUMAN 9606 1
## featuresTemp.20 VARIANT 256 256 0 Q9HC16 ABC3G_HUMAN 9606 1
## featuresTemp.21 VARIANT 275 275 0 Q9HC16 ABC3G_HUMAN 9606 1
## featuresTemp.22 MUTAGEN 67 67 0 Q9HC16 ABC3G_HUMAN 9606 1
## featuresTemp.23 MUTAGEN 67 67 0 Q9HC16 ABC3G_HUMAN 9606 1
## featuresTemp.24 MUTAGEN 67 67 0 Q9HC16 ABC3G_HUMAN 9606 1
## featuresTemp.25 MUTAGEN 74 74 0 Q9HC16 ABC3G_HUMAN 9606 1
## featuresTemp.26 MUTAGEN 80 80 0 Q9HC16 ABC3G_HUMAN 9606 1
## featuresTemp.27 MUTAGEN 81 81 0 Q9HC16 ABC3G_HUMAN 9606 1
## featuresTemp.28 MUTAGEN 85 85 0 Q9HC16 ABC3G_HUMAN 9606 1
## featuresTemp.29 MUTAGEN 86 86 0 Q9HC16 ABC3G_HUMAN 9606 1
## featuresTemp.30 MUTAGEN 97 97 0 Q9HC16 ABC3G_HUMAN 9606 1
## featuresTemp.31 MUTAGEN 100 100 0 Q9HC16 ABC3G_HUMAN 9606 1
## featuresTemp.32 MUTAGEN 107 107 0 Q9HC16 ABC3G_HUMAN 9606 1
## featuresTemp.33 MUTAGEN 128 128 0 Q9HC16 ABC3G_HUMAN 9606 1
## featuresTemp.34 MUTAGEN 213 213 0 Q9HC16 ABC3G_HUMAN 9606 1
## featuresTemp.35 MUTAGEN 213 213 0 Q9HC16 ABC3G_HUMAN 9606 1
## featuresTemp.36 MUTAGEN 215 215 0 Q9HC16 ABC3G_HUMAN 9606 1
## featuresTemp.37 MUTAGEN 217 217 0 Q9HC16 ABC3G_HUMAN 9606 1
## featuresTemp.38 MUTAGEN 218 218 0 Q9HC16 ABC3G_HUMAN 9606 1
## featuresTemp.39 MUTAGEN 218 218 0 Q9HC16 ABC3G_HUMAN 9606 1
## featuresTemp.40 MUTAGEN 221 221 0 Q9HC16 ABC3G_HUMAN 9606 1
## featuresTemp.41 MUTAGEN 244 244 0 Q9HC16 ABC3G_HUMAN 9606 1
## featuresTemp.42 MUTAGEN 247 247 0 Q9HC16 ABC3G_HUMAN 9606 1
## featuresTemp.43 MUTAGEN 256 256 0 Q9HC16 ABC3G_HUMAN 9606 1
## featuresTemp.44 MUTAGEN 257 257 0 Q9HC16 ABC3G_HUMAN 9606 1
## featuresTemp.45 MUTAGEN 259 259 0 Q9HC16 ABC3G_HUMAN 9606 1
## featuresTemp.46 MUTAGEN 259 259 0 Q9HC16 ABC3G_HUMAN 9606 1
## featuresTemp.47 MUTAGEN 259 259 0 Q9HC16 ABC3G_HUMAN 9606 1
## featuresTemp.48 MUTAGEN 285 285 0 Q9HC16 ABC3G_HUMAN 9606 1
## featuresTemp.49 MUTAGEN 288 288 0 Q9HC16 ABC3G_HUMAN 9606 1
## featuresTemp.50 MUTAGEN 291 291 0 Q9HC16 ABC3G_HUMAN 9606 1
## featuresTemp.51 MUTAGEN 313 313 0 Q9HC16 ABC3G_HUMAN 9606 1
## featuresTemp.52 MUTAGEN 315 315 0 Q9HC16 ABC3G_HUMAN 9606 1
## featuresTemp.53 MUTAGEN 320 320 0 Q9HC16 ABC3G_HUMAN 9606 1
## featuresTemp.54 MUTAGEN 320 320 0 Q9HC16 ABC3G_HUMAN 9606 1
## featuresTemp.55 MUTAGEN 323 323 0 Q9HC16 ABC3G_HUMAN 9606 1
## featuresTemp.56 CONFLICT 162 162 0 Q9HC16 ABC3G_HUMAN 9606 1
## featuresTemp.57 CONFLICT 370 370 0 Q9HC16 ABC3G_HUMAN 9606 1
## featuresTemp.58 STRAND 195 197 2 Q9HC16 ABC3G_HUMAN 9606 1
## featuresTemp.59 HELIX 199 206 7 Q9HC16 ABC3G_HUMAN 9606 1
## featuresTemp.60 HELIX 209 211 2 Q9HC16 ABC3G_HUMAN 9606 1
## featuresTemp.61 STRAND 213 217 4 Q9HC16 ABC3G_HUMAN 9606 1
## featuresTemp.62 STRAND 219 228 9 Q9HC16 ABC3G_HUMAN 9606 1
## featuresTemp.63 STRAND 231 234 3 Q9HC16 ABC3G_HUMAN 9606 1
## featuresTemp.64 HELIX 236 238 2 Q9HC16 ABC3G_HUMAN 9606 1
## featuresTemp.65 STRAND 240 243 3 Q9HC16 ABC3G_HUMAN 9606 1
## featuresTemp.66 STRAND 247 250 3 Q9HC16 ABC3G_HUMAN 9606 1
## featuresTemp.67 HELIX 258 265 7 Q9HC16 ABC3G_HUMAN 9606 1
## featuresTemp.68 HELIX 266 269 3 Q9HC16 ABC3G_HUMAN 9606 1
## featuresTemp.69 STRAND 273 275 2 Q9HC16 ABC3G_HUMAN 9606 1
## featuresTemp.70 STRAND 277 285 8 Q9HC16 ABC3G_HUMAN 9606 1
## featuresTemp.71 HELIX 289 301 12 Q9HC16 ABC3G_HUMAN 9606 1
## featuresTemp.72 STRAND 305 313 8 Q9HC16 ABC3G_HUMAN 9606 1
## featuresTemp.73 STRAND 318 320 2 Q9HC16 ABC3G_HUMAN 9606 1
## featuresTemp.74 HELIX 321 330 9 Q9HC16 ABC3G_HUMAN 9606 1
## featuresTemp.75 STRAND 334 337 3 Q9HC16 ABC3G_HUMAN 9606 1
## featuresTemp.76 HELIX 340 350 10 Q9HC16 ABC3G_HUMAN 9606 1
## featuresTemp.77 TURN 353 355 2 Q9HC16 ABC3G_HUMAN 9606 1
## featuresTemp.78 HELIX 364 379 15 Q9HC16 ABC3G_HUMAN 9606 1
my_canvas <- drawProteins::draw_canvas(my_prot_df)
my_canvas <- drawProteins::draw_chains(my_canvas, my_prot_df,
label_size = 2.5)
my_canvas <- drawProteins::draw_domains(my_canvas, my_prot_df)
my_canvas
Preparing data
APOBEC3G_human_vector <- fasta_cleaner(APOBEC3G_list[[1]])
2x2 panel
par(mfrow = c(2,2),
mar = c(0,0,2,1))
#plot 1:
dotPlot(APOBEC3G_human_vector, APOBEC3G_human_vector, wsize = 1, nmatch = 1, main = "Defaults")
# plot 2:
dotPlot(APOBEC3G_human_vector, APOBEC3G_human_vector, wsize = 10, nmatch = 1, main = "size = 10, nmatch = 1")
# plot 3:
dotPlot(APOBEC3G_human_vector, APOBEC3G_human_vector, wsize = 10, nmatch = 5, main = "size = 10, nmatch = 5")
# plot 4:
dotPlot(APOBEC3G_human_vector, APOBEC3G_human_vector, wsize = 20, nmatch = 5,main = "size = 20, nmatch = 5")
Reset par()
par(mfrow = c(1,1),
mar = c(4,4,4,4))
Plot
dotPlot(APOBEC3G_human_vector, APOBEC3G_human_vector,
wsize = 25,
nmatch = 5,
main = "size = 20, nmatch = 5")
PDB: https://www.rcsb.org/structure/2KBO Uniprot:https://www.uniprot.org/uniprot/Q9HC16 Pfam: http://pfam.xfam.org/family/PF18782.3 Alphafold: http://pfam.xfam.org/family/PF18782.3
Source <- c("PDB", "Uniprot", "Pfam", "Alphafold")
Property <- c("Classification - HYDROLASE",
"Location: Nucleus & Cytoplasm",
"Domain NAD2 from 10aa - 193aa & Domain APOBEC_C from 303aa - 379aa",
"Structure: Alpha helicies, and beta-pleated sheets")
Protein.Propertiesdf <- data.frame(Source, Property)
pander::pander(Protein.Propertiesdf)
| Source | Property |
|---|---|
| PDB | Classification - HYDROLASE |
| Uniprot | Location: Nucleus & Cytoplasm |
| Pfam | Domain NAD2 from 10aa - 193aa & Domain APOBEC_C from 303aa - 379aa |
| Alphafold | Structure: Alpha helicies, and beta-pleated sheets |
Multivariate statistcal techniques were used to confirm the information about protein structure and location in the line database.
Using Uniprot, the conclusion indicates that this protein is located primarily in the nucleus, and secondarily in the cytoplasm.
##Predict Protien Fold
Alphafold indicates that there are a mix of alpha helices and beta sheets. I therefore predict that machine-learning methods will indicate an a+b and a/b structure.
Chou & Zhang Data Table
aa.1.1 <- c("A","R","N","D","C","Q","E","G","H","I",
"L","K","M","F","P","S","T","W","Y","V")
alpha <- c(285, 53, 97, 163, 22, 67, 134, 197, 111, 91,
221, 249, 48, 123, 82, 122, 119, 33, 63, 167)
beta <- c(203, 67, 139, 121, 75, 122, 86, 297, 49, 120,
177, 115, 16, 85, 127, 341, 253, 44, 110, 229)
a.plus.b <- c(175, 78, 120, 111, 74, 74, 86, 171, 33, 93,
110, 112, 25, 52, 71, 126, 117, 30, 108, 123)
a.div.b <- c(361, 146, 183, 244, 63, 114, 257, 377, 107, 239,
339, 321, 91, 158, 188, 327, 238, 72, 130, 378)
pander(data.frame(aa.1.1, alpha, beta, a.plus.b, a.div.b))
| aa.1.1 | alpha | beta | a.plus.b | a.div.b |
|---|---|---|---|---|
| A | 285 | 203 | 175 | 361 |
| R | 53 | 67 | 78 | 146 |
| N | 97 | 139 | 120 | 183 |
| D | 163 | 121 | 111 | 244 |
| C | 22 | 75 | 74 | 63 |
| Q | 67 | 122 | 74 | 114 |
| E | 134 | 86 | 86 | 257 |
| G | 197 | 297 | 171 | 377 |
| H | 111 | 49 | 33 | 107 |
| I | 91 | 120 | 93 | 239 |
| L | 221 | 177 | 110 | 339 |
| K | 249 | 115 | 112 | 321 |
| M | 48 | 16 | 25 | 91 |
| F | 123 | 85 | 52 | 158 |
| P | 82 | 127 | 71 | 188 |
| S | 122 | 341 | 126 | 327 |
| T | 119 | 253 | 117 | 238 |
| W | 33 | 44 | 30 | 72 |
| Y | 63 | 110 | 108 | 130 |
| V | 167 | 229 | 123 | 378 |
Convert to frequencies
alpha.prop <- alpha/sum(alpha)
beta.prop <- beta/sum(beta)
a.plus.b.prop <- a.plus.b/sum(a.plus.b)
a.div.b <- a.div.b/sum(a.div.b)
aa.prop <- data.frame(alpha.prop,
beta.prop,
a.plus.b.prop,
a.div.b)
row.names(aa.prop) <- aa.1.1
pander::pander(aa.prop)
| alpha.prop | beta.prop | a.plus.b.prop | a.div.b | |
|---|---|---|---|---|
| A | 0.1165 | 0.07313 | 0.09264 | 0.08331 |
| R | 0.02166 | 0.02414 | 0.04129 | 0.03369 |
| N | 0.03964 | 0.05007 | 0.06353 | 0.04223 |
| D | 0.06661 | 0.04359 | 0.05876 | 0.05631 |
| C | 0.008991 | 0.02702 | 0.03917 | 0.01454 |
| Q | 0.02738 | 0.04395 | 0.03917 | 0.02631 |
| E | 0.05476 | 0.03098 | 0.04553 | 0.05931 |
| G | 0.08051 | 0.107 | 0.09052 | 0.08701 |
| H | 0.04536 | 0.01765 | 0.01747 | 0.02469 |
| I | 0.03719 | 0.04323 | 0.04923 | 0.05516 |
| L | 0.09031 | 0.06376 | 0.05823 | 0.07824 |
| K | 0.1018 | 0.04143 | 0.05929 | 0.07408 |
| M | 0.01962 | 0.005764 | 0.01323 | 0.021 |
| F | 0.05027 | 0.03062 | 0.02753 | 0.03646 |
| P | 0.03351 | 0.04575 | 0.03759 | 0.04339 |
| S | 0.04986 | 0.1228 | 0.0667 | 0.07547 |
| T | 0.04863 | 0.09114 | 0.06194 | 0.05493 |
| W | 0.01349 | 0.01585 | 0.01588 | 0.01662 |
| Y | 0.02575 | 0.03963 | 0.05717 | 0.03 |
| V | 0.06825 | 0.08249 | 0.06511 | 0.08724 |
Determine the number of each amino acid in APOBEC3G
APOBEC3G.table <- table(APOBEC3G_human_vector)/length(APOBEC3G_human_vector)
Convert a table into a vector
table_to_vector <- function(table_x){
table_names <- attr(table_x, "dimnames")[[1]]
table_vect <- as.vector(table_x)
names(table_vect) <- table_names
return(table_vect)
}
APOBEC3G_human_table <- table(APOBEC3G_human_vector)/length(APOBEC3G_human_vector)
APOBEC3G.human.aa.freq <- table_to_vector(APOBEC3G_human_table)
pander(APOBEC3G.human.aa.freq)
| A | C | D | E | F | G | H | I |
|---|---|---|---|---|---|---|---|
| 0.03906 | 0.03906 | 0.05729 | 0.0625 | 0.07031 | 0.03646 | 0.04167 | 0.03906 |
| K | L | M | N | P | Q | R | S |
|---|---|---|---|---|---|---|---|
| 0.05208 | 0.08333 | 0.02865 | 0.04167 | 0.05208 | 0.04427 | 0.07812 | 0.05208 |
| T | V | W | Y |
|---|---|---|---|
| 0.05469 | 0.03906 | 0.03646 | 0.05208 |
Check for the presence of “U”
aa.names <- names(APOBEC3G.human.aa.freq)
i.U <- which(aa.names == "U")
aa.names[i.U]
## character(0)
Add data on APOBEC3G to the amino acid frequency table
aa.prop$APOBEC3G.human.aa.freq <- APOBEC3G.human.aa.freq
pander::pander(aa.prop)
| alpha.prop | beta.prop | a.plus.b.prop | a.div.b | APOBEC3G.human.aa.freq | |
|---|---|---|---|---|---|
| A | 0.1165 | 0.07313 | 0.09264 | 0.08331 | 0.03906 |
| R | 0.02166 | 0.02414 | 0.04129 | 0.03369 | 0.03906 |
| N | 0.03964 | 0.05007 | 0.06353 | 0.04223 | 0.05729 |
| D | 0.06661 | 0.04359 | 0.05876 | 0.05631 | 0.0625 |
| C | 0.008991 | 0.02702 | 0.03917 | 0.01454 | 0.07031 |
| Q | 0.02738 | 0.04395 | 0.03917 | 0.02631 | 0.03646 |
| E | 0.05476 | 0.03098 | 0.04553 | 0.05931 | 0.04167 |
| G | 0.08051 | 0.107 | 0.09052 | 0.08701 | 0.03906 |
| H | 0.04536 | 0.01765 | 0.01747 | 0.02469 | 0.05208 |
| I | 0.03719 | 0.04323 | 0.04923 | 0.05516 | 0.08333 |
| L | 0.09031 | 0.06376 | 0.05823 | 0.07824 | 0.02865 |
| K | 0.1018 | 0.04143 | 0.05929 | 0.07408 | 0.04167 |
| M | 0.01962 | 0.005764 | 0.01323 | 0.021 | 0.05208 |
| F | 0.05027 | 0.03062 | 0.02753 | 0.03646 | 0.04427 |
| P | 0.03351 | 0.04575 | 0.03759 | 0.04339 | 0.07812 |
| S | 0.04986 | 0.1228 | 0.0667 | 0.07547 | 0.05208 |
| T | 0.04863 | 0.09114 | 0.06194 | 0.05493 | 0.05469 |
| W | 0.01349 | 0.01585 | 0.01588 | 0.01662 | 0.03906 |
| Y | 0.02575 | 0.03963 | 0.05717 | 0.03 | 0.03646 |
| V | 0.06825 | 0.08249 | 0.06511 | 0.08724 | 0.05208 |
chou_cor <- function(x,y){
numerator <- sum(x*y)
denominator <- sqrt((sum(x^2))*(sum(y^2)))
result <- numerator/denominator
return(result)
}
chou_cosine <- function(z.1, z.2){
z.1.abs <- sqrt(sum(z.1^2))
z.2.abs <- sqrt(sum(z.2^2))
my.cosine <- sum(z.1*z.2)/(z.1.abs*z.2.abs)
return(my.cosine)
}
Calculate correlation between each column
corr.alpha <- chou_cor(aa.prop[,5], aa.prop[,1])
corr.beta <- chou_cor(aa.prop[,5], aa.prop[,2])
corr.apb <- chou_cor(aa.prop[,5], aa.prop[,3])
corr.adb <- chou_cor(aa.prop[,5], aa.prop[,4])
cos.alpha <- chou_cosine(aa.prop[,5], aa.prop[,1])
cos.beta <- chou_cosine(aa.prop[,5], aa.prop[,2])
cos.apb <- chou_cosine(aa.prop[,5], aa.prop[,3])
cos.adb <- chou_cosine(aa.prop[,5], aa.prop[,4])
Calculate cosine similarity
cos.alpha <- chou_cosine(aa.prop[,5], aa.prop[,1])
cos.beta <- chou_cosine(aa.prop[,5], aa.prop[,2])
cos.apb <- chou_cosine(aa.prop[,5], aa.prop[,3])
cos.adb <- chou_cosine(aa.prop[,5], aa.prop[,4])
Calculate distance
aa.prop.flipped <- t(aa.prop)
round(aa.prop.flipped,2)
## A R N D C Q E G H I L
## alpha.prop 0.12 0.02 0.04 0.07 0.01 0.03 0.05 0.08 0.05 0.04 0.09
## beta.prop 0.07 0.02 0.05 0.04 0.03 0.04 0.03 0.11 0.02 0.04 0.06
## a.plus.b.prop 0.09 0.04 0.06 0.06 0.04 0.04 0.05 0.09 0.02 0.05 0.06
## a.div.b 0.08 0.03 0.04 0.06 0.01 0.03 0.06 0.09 0.02 0.06 0.08
## APOBEC3G.human.aa.freq 0.04 0.04 0.06 0.06 0.07 0.04 0.04 0.04 0.05 0.08 0.03
## K M F P S T W Y V
## alpha.prop 0.10 0.02 0.05 0.03 0.05 0.05 0.01 0.03 0.07
## beta.prop 0.04 0.01 0.03 0.05 0.12 0.09 0.02 0.04 0.08
## a.plus.b.prop 0.06 0.01 0.03 0.04 0.07 0.06 0.02 0.06 0.07
## a.div.b 0.07 0.02 0.04 0.04 0.08 0.05 0.02 0.03 0.09
## APOBEC3G.human.aa.freq 0.04 0.05 0.04 0.08 0.05 0.05 0.04 0.04 0.05
Get distance matrix
dist(aa.prop.flipped, method = "euclidean")
## alpha.prop beta.prop a.plus.b.prop a.div.b
## beta.prop 0.13342098
## a.plus.b.prop 0.09281824 0.08289406
## a.div.b 0.06699039 0.08659174 0.06175113
## APOBEC3G.human.aa.freq 0.16154202 0.15384231 0.12227476 0.13300972
Individual distances
dist.alpha <- dist((aa.prop.flipped[c(1,5),]), method = "euclidean")
dist.beta <- dist((aa.prop.flipped[c(2,5),]), method = "euclidean")
dist.apb <- dist((aa.prop.flipped[c(3,5),]), method = "euclidean")
dist.adb <- dist((aa.prop.flipped[c(4,5),]), method = "euclidean")
Compile the information. Rounding makes it easier to read
# fold types
fold.type <- c("alpha","beta","alpha plus beta", "alpha/beta")
# data
corr.sim <- round(c(corr.alpha,corr.beta,corr.apb,corr.adb),5)
cosine.sim <- round(c(cos.alpha,cos.beta,cos.apb,cos.adb),5)
Euclidean.dist <- round(c(dist.alpha,dist.beta,dist.apb,dist.adb),5)
# summary
sim.sum <- c("","","most.sim","")
dist.sum <- c("","","min.dist","")
df <- data.frame(fold.type,
corr.sim ,
cosine.sim ,
Euclidean.dist ,
sim.sum ,
dist.sum )
Display output
pander::pander(df)
| fold.type | corr.sim | cosine.sim | Euclidean.dist | sim.sum | dist.sum |
|---|---|---|---|---|---|
| alpha | 0.7882 | 0.7882 | 0.1615 | ||
| beta | 0.8124 | 0.8124 | 0.1538 | ||
| alpha plus beta | 0.8688 | 0.8688 | 0.1223 | most.sim | min.dist |
| alpha/beta | 0.8485 | 0.8485 | 0.133 |
names(APOBEC3G_list)[1] <- "NP_068594"
names(APOBEC3G_list)[2] <- "NP_001009001"
names(APOBEC3G_list)[3] <- "NP_001332812"
names(APOBEC3G_list)[4] <- "NP_001279005"
names(APOBEC3G_list)[5] <- "NP_001292891"
names(APOBEC3G_list)[6] <- "NP_001332845"
names(APOBEC3G_list)[7] <- "NP_001185622"
names(APOBEC3G_list)[8] <- "XP_034804992"
names(APOBEC3G_list)[9] <- "XP_024095464"
names(APOBEC3G_list)[10] <- "XP_025254494"
APOBEC3G_list[1]
## $NP_068594
## [1] "MKPHFRNTVERMYRDTFSYNFYNRPILSRRNTVWLCYEVKTKGPSRPPLDAKIFRGQVYSELKYHPEMRFFHWFSKWRKLHRDQEYEVTWYISWSPCTKCTRDMATFLAEDPKVTLTIFVARLYYFWDPDYQEALRSLCQKRDGPRATMKIMNYDEFQHCWSKFVYSQRELFEPWNNLPKYYILLHIMLGEILRHSMDPPTFTFNFNNEPWVRGRHETYLCYEVERMHNDTWVLLNQRRGFLCNQAPHKHGFLEGRHAELCFLDVIPFWKLDLDQDYRVTCFTSWSPCFSCAQEMAKFISKNKHVSLCIFTARIYDDQGRCQEGLRTLAEAGAKISIMTYSEFKHCWDTFVDHQGCPFQPWDGLDEHSQDLSGRLRAILQNQEN"
APOBEC3G_list
## $NP_068594
## [1] "MKPHFRNTVERMYRDTFSYNFYNRPILSRRNTVWLCYEVKTKGPSRPPLDAKIFRGQVYSELKYHPEMRFFHWFSKWRKLHRDQEYEVTWYISWSPCTKCTRDMATFLAEDPKVTLTIFVARLYYFWDPDYQEALRSLCQKRDGPRATMKIMNYDEFQHCWSKFVYSQRELFEPWNNLPKYYILLHIMLGEILRHSMDPPTFTFNFNNEPWVRGRHETYLCYEVERMHNDTWVLLNQRRGFLCNQAPHKHGFLEGRHAELCFLDVIPFWKLDLDQDYRVTCFTSWSPCFSCAQEMAKFISKNKHVSLCIFTARIYDDQGRCQEGLRTLAEAGAKISIMTYSEFKHCWDTFVDHQGCPFQPWDGLDEHSQDLSGRLRAILQNQEN"
##
## $NP_001009001
## [1] "MKPQFRNPVERMYQDTFSDNFYNRPILSRRNTVWLCYEVKTKGPSRPPLDAKIFRGQVYSKLKYHPEMRFFHWFSKWRKLHRDQEYEVTWYISWSPCTKCTRDVATFLAEDPKVTLTIFVARLYYFWDPDYQEALRSLCQKRDGPRATMKIMNYDEFQHCWSKFVYSQRELFEPWNNLPKYYILLHIMLGEILRHSMDPPTFTSNFNNELWVRGRHETYLCYEVERLHNDTWVLLNQRRGFLCNQAPHKHGFLEGRHAELCFLDVIPFWKLDLHQDYRVTCFTSWSPCFSCAQEMAKFISNNKHVSLCIFAARIYDDQGRCQEGLRTLAKAGAKISIMTYSEFKHCWDTFVDHQGCPFQPWDGLEEHSQALSERLQAILQNQGN"
##
## $NP_001332812
## [1] "MVKRMKADIFVSNFNNRPILSGRNTVWLCCEVNTKDPSGPPLDAKIFRGKVYSKAKYHPEMRFLHWFRKWRQLHRDQEYEVTWYVSWSPCTGCANSVATFLAEDPKVTLTIFVARLYYFWKPDYQEALRVLCQKRGSPHATMKIMNYNEFQHCWNKFVRGRREPFEPWENLPKHYTLLHATLGELLRHLMDPGTFTSNFYNKPWVSGQHETYLCYKVERLHNGTWVPLNQHRGFLRNQAPDIHGFPKGRHAELCFLDLIPFWKLDGQQYRVTCFTSWSPCFSCAQEMAKFISNNEHVSLCIFAARIYDDQGRCQEGLRTLHRDGAKIAMMNYSEFEYCWDTFVDRQGRPFQPWDGLDEHSQDLSGRLRAILQNQGN"
##
## $NP_001279005
## [1] "MVERMKPGIFVYYFNNRPILSGRNIVWLCCEVKTKDPSGPPLDANIFQGELYPEAKDHPEMKFLHWFRKWRQLHRDQEYEVTWYVSWSPCTRCANSVATFLAEDPKVTLTIFVARLYYFWKPHYQEALRILCQKRGGPHATMKIMNYNEFQHCWNEFVDGQGKPFKPRKNLPKHYTLLHATLGELLRHVMDPGTFTSNFNNKPWVSGQRETYLCYKVERSHNDTWVLLNQHRGFLRNQAPDRHGFPKGRHAELCFLDLIPFWKLDDQQYRVTCFTSWSPCFSCAQKMAKFISNNKHVSLCIFAARIYDDQGRCQEGLRTLHRDGAKIAVMNYSEFEYCWDTFVDRQGRPFQPWDGLDEHSQALSGRLRAILQNQGN"
##
## $NP_001292891
## [1] "MVEPMKTGIFVSNFNNKPILSGRNTVWLCCEVKTKDPSGPPLDAKIFRGKVYSKAKYHPEMRFLRWFLKWRQLHRDQEYEVTWYVSWSPCTGCANSVATFLAKDPKVTLTIFVARLYYFWKPDYQEALRVLCQKRGSPHATMKIMNYNEFQHCWNKFVRGRREPFEPWENLPKHYTLLHATLGELLRHLMDPGTFTSNFNNKLWVSGQHETYLCYKVERPHNDTWVLLNQHRGFLQNQAPDIHGFPKGRHAELCFLDLIPLWKLDGQQYRVTCFTSWSPCFNCAQEMAKFISNNKHVSLRIFAARIYDDQGRCQEGLRTLHRDGAKIAMMNYSEFEYCWDTFVDRQGRPFQPWDGLDEHSQALSERLRAILQNQGN"
##
## $NP_001332845
## [1] "MVKRMKPGIFVSNFNNKPILSGRNTVWLCCEVKTKDPSGPPLDAKIFRDKVYSKAKYHPEMRFLRWFRKWRQLHRDQEYEVTWYVSWSPCTGCANSVATFLAEDPKVTLTIFVARLYYFWKPDYQEALRVLCQKRGSPHATMKIMNYNEFQHCWNKFVRGRREPFEPWENLPKHYTLLHATLGELLRHLMDPGTFTSNFNNKLWVSGQHETYLCYKVERPHNDTWVLLNQHRGFLQNQAPDIHGFPKGRHAELCFLDLIPFWKLDDQQYRVTCFTSWSPCFNCAQEMAKFISDNKHVSLRIFAARIYDDQGRCQEGLRTLHRDGAKIAMMNYSEFEYCWDTFVDRQGRPFQPWDGLDEHSQDLSGRLRAILQNQGN"
##
## $NP_001185622
## [1] "MNPQIRNMVEPMDPRTFVSNFNNRPILSGLNTVWLCCEVKTKDPSGPPLDAKIFQGKVLRSKAKYHPEMRFLQWFREWRQLHHDQEYKVTWYVSWSPCTRCANSVATFLAKDPKVTLTIFVARLYYFWKPNYQQALRILCQKRDGPHATMKIMNYNEFQDCWNKFVDGRGKPFKPWNNLPKHYTLLQATLGELLRHLMDPGTFTSNFNNKPWVSGQHETYLCYKVERLHNDTWVPLNQHRGFLRNQAPNIHGFPKGRHAELCFLDLIPFWKLDGQQYRVTCFTSWSPCFSCAQEMAKFISNNEHVSLCIFAARIYDDQGRYQEGLRTLHRDGAKIAMMNYSEFEYCWDTFVDCQGCPFQPWDGLDEHSQALSERLRAILQNQGN"
##
## $XP_034804992
## [1] "MKPHFRNPVERMYQDTFSDNFYNRPILSRRNTVWLCYEVKTKGPSRPPLDAKIFRGQVYSKLKYHPEMRFFHWFSKWRKLHRDQEYEVTWYISWSPCTKCTRDVATFLAEDPKVTLTIFVARLYYFWDPDYQEALRSLCQKRDGPRATMKIMNYDEFQHCWSKFVYSQRELFEPWNNLPKYYILLHIMLGEILRHSMDPPTFTSNFNNELWVRGRHETYLCYEVERLHNDTRVLLNQRRGFLCNQAPHKHGFLEGRHAELCFLDVIPFWKLDLHQDYRVTCFTSWSPCFSCAQEMAKFISNNKHVSLCIFAARIYDDQGRCQEGLRTLAKAGAEISIMTYSEFKHCWDTFVDHQGCPFQPWDGLEEHSQALSERLQAILQNQGN"
##
## $XP_024095464
## [1] "MLQTKILVRTSRPMMNPQFRNMVDGMDPHKFSYNFKNRPILSRRNTVWLCYEVKTKGPSRPPLDAKIFRGQVYFELKNHPEMRFFHWFSKWRTLHRDQECEVTWYMSWSPCTKCTRNVATFLAEDPKVTLTIFVARLYYFWDPDYQEALRSLCRERDGPRANMKIMNYDEFQHCWNKFVYSQRELFEPWNNLPKYYIVLHIILGEILRHSMDPLTFTSNFNNEPCVEGRHETYLCYKVERLHNDTWVLLNQRRGFLCNQAPAIHGFPEGRHAELCFLDVIPFWKLDGKQRYRVTCFTSWSPCFRCAQEMAKFISNNQHVSLCIFAARIYDDQGRCKEGLRTLDEAEAKISIMTYSEFQHCWDTFVDHQGRPFQPWDGLEEHSEAWSGKLQAILQNQGN"
##
## $XP_025254494
## [1] "MKPQFRNTVERMYRDTFFYNFNNRPILSRRNTVWLCYEVKTRGPSMPTWGTKIFRGQVYSKAKYHPEMRFLHWFRKWRQLHRDQEYEVTWYVSWSPCTGCANSVATFLAEDPKVTLTIFVARLYYFWKPDYQEALRVLCQKRGSPHATMKIMNYNEFQHCWNKFVRGRREPFEPWENLPKHYTLLHATLGELLRHLMDPGMFTSNFYNKSWVSGQHETYLCYKVERPHNDTWVLLNQHRGFLRNQAPDIHGFPKGRHAELCFLDLIPFWKLDGQQYRVTCFTSWSPCFNCAQEMAKFISNNKHVSLCIFAARIYDDQGRCQEGLRTLHRDGAKIAMMNYSEFEYCWDTFVDRQGRPFQPWDGLDEHSQALSERLRAILQNQGN"
APOBEC3G_vector <- unlist(APOBEC3G_list)
names(APOBEC3G_vector) <- names(APOBEC3G_list)
APOBEC3G_vector[1]
## NP_068594
## "MKPHFRNTVERMYRDTFSYNFYNRPILSRRNTVWLCYEVKTKGPSRPPLDAKIFRGQVYSELKYHPEMRFFHWFSKWRKLHRDQEYEVTWYISWSPCTKCTRDMATFLAEDPKVTLTIFVARLYYFWDPDYQEALRSLCQKRDGPRATMKIMNYDEFQHCWSKFVYSQRELFEPWNNLPKYYILLHIMLGEILRHSMDPPTFTFNFNNEPWVRGRHETYLCYEVERMHNDTWVLLNQRRGFLCNQAPHKHGFLEGRHAELCFLDVIPFWKLDLDQDYRVTCFTSWSPCFSCAQEMAKFISKNKHVSLCIFTARIYDDQGRCQEGLRTLAEAGAKISIMTYSEFKHCWDTFVDHQGCPFQPWDGLDEHSQDLSGRLRAILQNQEN"
# PID - Human vs Chimpanzee
align01.02 <- Biostrings::pairwiseAlignment(APOBEC3G_vector[1], APOBEC3G_vector[2])
pid(align01.02)
## [1] 95.3125
# PID - Human vs Olive Baboon
align01.03 <- Biostrings::pairwiseAlignment(APOBEC3G_vector[1], APOBEC3G_vector[3])
pid(align01.03)
## [1] 78.38542
# PID - Human vs Green Monkey
align01.04 <- Biostrings::pairwiseAlignment(APOBEC3G_vector[1], APOBEC3G_vector[4])
pid(align01.04)
## [1] 76.5625
# PID - Chimpanzee vs Olive Baboon
align02.03 <- Biostrings::pairwiseAlignment(APOBEC3G_vector[2], APOBEC3G_vector[3])
pid(align02.03)
## [1] 78.90625
# PID - Chimpanzee vs Green Monkey
align02.04 <- Biostrings::pairwiseAlignment(APOBEC3G_vector[2], APOBEC3G_vector[4])
pid(align02.04)
## [1] 76.30208
# PID - Olive Baboon vs Green Monkey
align03.04 <- Biostrings::pairwiseAlignment(APOBEC3G_vector[3], APOBEC3G_vector[4])
pid(align03.04)
## [1] 89.62766
pid_val <- c(1, NA, NA, NA,
pid(align01.02), 1, NA, NA,
pid(align01.03), pid(align02.03), 1, NA,
pid(align01.04), pid(align02.04), pid(align03.04), 1)
pid_mat <- matrix(pid_val, nrow = 4, byrow = T)
row.names(pid_mat) <- c("Homo","Pan","Olive","Green")
colnames(pid_mat) <- c("Homo","Pan","Olive","Green")
pander::pander(pid_mat)
| Homo | Pan | Olive | Green | |
|---|---|---|---|---|
| Homo | 1 | NA | NA | NA |
| Pan | 95.31 | 1 | NA | NA |
| Olive | 78.39 | 78.91 | 1 | NA |
| Green | 76.56 | 76.3 | 89.63 | 1 |
pid1 <- pid(align01.02, type = "PID1")
pid2 <- pid(align01.02, type = "PID2")
pid3 <- pid(align01.02, type = "PID3")
pid4 <- pid(align01.02, type = "PID4")
pid_method <- c("PID1", "PID2", "PID3", "PID4")
pid_value <- c(pid1, pid2, pid3, pid4)
denominator <- c("(aligned positions + internal gap positions)", "(aligned positions)", "(length shorter sequence)", "(average length of the two sequences)")
pid_df <- data.frame(pid_method, pid_value, denominator)
pander::pander(pid_df)
| pid_method | pid_value | denominator |
|---|---|---|
| PID1 | 95.31 | (aligned positions + internal gap positions) |
| PID2 | 95.31 | (aligned positions) |
| PID3 | 95.31 | (length shorter sequence) |
| PID4 | 95.31 | (average length of the two sequences) |
MSA Data Preparation
APOBEC3G_vector_ss <- Biostrings::AAStringSet(APOBEC3G_vector)
Build MSA
APOBEC3G_align <- msa(APOBEC3G_vector_ss, method = "ClustalW")
## use default substitution matrix
Clean & Set up MSA
class(APOBEC3G_align)
## [1] "MsaAAMultipleAlignment"
## attr(,"package")
## [1] "msa"
is(APOBEC3G_align)
## [1] "MsaAAMultipleAlignment" "AAMultipleAlignment" "MsaMetaData"
## [4] "MultipleAlignment"
APOBEC3G_align
## CLUSTAL 2.1
##
## Call:
## msa(APOBEC3G_vector_ss, method = "ClustalW")
##
## MsaAAMultipleAlignment with 10 rows and 399 columns
## aln names
## [1] ---------------------MVEP...PWDGLDEHSQALSERLRAILQNQGN NP_001292891
## [2] ---------------------MVKR...PWDGLDEHSQDLSGRLRAILQNQGN NP_001332845
## [3] ---------------------MVKR...PWDGLDEHSQDLSGRLRAILQNQGN NP_001332812
## [4] ---------------------MVER...PWDGLDEHSQALSGRLRAILQNQGN NP_001279005
## [5] --------------MNPQIRNMVEP...PWDGLDEHSQALSERLRAILQNQGN NP_001185622
## [6] --------------MKPQFRNTVER...PWDGLDEHSQALSERLRAILQNQGN XP_025254494
## [7] --------------MKPQFRNPVER...PWDGLEEHSQALSERLQAILQNQGN NP_001009001
## [8] --------------MKPHFRNPVER...PWDGLEEHSQALSERLQAILQNQGN XP_034804992
## [9] --------------MKPHFRNTVER...PWDGLDEHSQDLSGRLRAILQNQEN NP_068594
## [10] MLQTKILVRTSRPMMNPQFRNMVDG...PWDGLEEHSEAWSGKLQAILQNQGN XP_024095464
## Con --------------M?P?FRNMVER...PWDGLDEHSQALS?RLRAILQNQGN Consensus
class(APOBEC3G_align) <- "AAMultipleAlignment"
APOBEC3G_align_seqinr <- msaConvert(APOBEC3G_align, type = "seqinr::alignment")
compbio4all::print_msa(APOBEC3G_align_seqinr)
## [1] "---------------------MVEPMKTGIFVSNFNNKPILSGRNTVWLCCEVKTKDPSG 0"
## [1] "---------------------MVKRMKPGIFVSNFNNKPILSGRNTVWLCCEVKTKDPSG 0"
## [1] "---------------------MVKRMKADIFVSNFNNRPILSGRNTVWLCCEVNTKDPSG 0"
## [1] "---------------------MVERMKPGIFVYYFNNRPILSGRNIVWLCCEVKTKDPSG 0"
## [1] "--------------MNPQIRNMVEPMDPRTFVSNFNNRPILSGLNTVWLCCEVKTKDPSG 0"
## [1] "--------------MKPQFRNTVERMYRDTFFYNFNNRPILSRRNTVWLCYEVKTRGPSM 0"
## [1] "--------------MKPQFRNPVERMYQDTFSDNFYNRPILSRRNTVWLCYEVKTKGPSR 0"
## [1] "--------------MKPHFRNPVERMYQDTFSDNFYNRPILSRRNTVWLCYEVKTKGPSR 0"
## [1] "--------------MKPHFRNTVERMYRDTFSYNFYNRPILSRRNTVWLCYEVKTKGPSR 0"
## [1] "MLQTKILVRTSRPMMNPQFRNMVDGMDPHKFSYNFKNRPILSRRNTVWLCYEVKTKGPSR 0"
## [1] " "
## [1] "PPLDAKIFRG-KVYSKAKYHPEMRFLRWFLKWRQLHRDQEYEVTWYVSWSPCTGCANSVA 0"
## [1] "PPLDAKIFRD-KVYSKAKYHPEMRFLRWFRKWRQLHRDQEYEVTWYVSWSPCTGCANSVA 0"
## [1] "PPLDAKIFRG-KVYSKAKYHPEMRFLHWFRKWRQLHRDQEYEVTWYVSWSPCTGCANSVA 0"
## [1] "PPLDANIFQG-ELYPEAKDHPEMKFLHWFRKWRQLHRDQEYEVTWYVSWSPCTRCANSVA 0"
## [1] "PPLDAKIFQGKVLRSKAKYHPEMRFLQWFREWRQLHHDQEYKVTWYVSWSPCTRCANSVA 0"
## [1] "PTWGTKIFRG-QVYSKAKYHPEMRFLHWFRKWRQLHRDQEYEVTWYVSWSPCTGCANSVA 0"
## [1] "PPLDAKIFRG-QVYSKLKYHPEMRFFHWFSKWRKLHRDQEYEVTWYISWSPCTKCTRDVA 0"
## [1] "PPLDAKIFRG-QVYSKLKYHPEMRFFHWFSKWRKLHRDQEYEVTWYISWSPCTKCTRDVA 0"
## [1] "PPLDAKIFRG-QVYSELKYHPEMRFFHWFSKWRKLHRDQEYEVTWYISWSPCTKCTRDMA 0"
## [1] "PPLDAKIFRG-QVYFELKNHPEMRFFHWFSKWRTLHRDQECEVTWYMSWSPCTKCTRNVA 0"
## [1] " "
## [1] "TFLAKDPKVTLTIFVARLYYFWKPDYQEALRVLCQKRGSPHATMKIMNYNEFQHCWNKFV 0"
## [1] "TFLAEDPKVTLTIFVARLYYFWKPDYQEALRVLCQKRGSPHATMKIMNYNEFQHCWNKFV 0"
## [1] "TFLAEDPKVTLTIFVARLYYFWKPDYQEALRVLCQKRGSPHATMKIMNYNEFQHCWNKFV 0"
## [1] "TFLAEDPKVTLTIFVARLYYFWKPHYQEALRILCQKRGGPHATMKIMNYNEFQHCWNEFV 0"
## [1] "TFLAKDPKVTLTIFVARLYYFWKPNYQQALRILCQKRDGPHATMKIMNYNEFQDCWNKFV 0"
## [1] "TFLAEDPKVTLTIFVARLYYFWKPDYQEALRVLCQKRGSPHATMKIMNYNEFQHCWNKFV 0"
## [1] "TFLAEDPKVTLTIFVARLYYFWDPDYQEALRSLCQKRDGPRATMKIMNYDEFQHCWSKFV 0"
## [1] "TFLAEDPKVTLTIFVARLYYFWDPDYQEALRSLCQKRDGPRATMKIMNYDEFQHCWSKFV 0"
## [1] "TFLAEDPKVTLTIFVARLYYFWDPDYQEALRSLCQKRDGPRATMKIMNYDEFQHCWSKFV 0"
## [1] "TFLAEDPKVTLTIFVARLYYFWDPDYQEALRSLCRERDGPRANMKIMNYDEFQHCWNKFV 0"
## [1] " "
## [1] "RGRREPFEPWENLPKHYTLLHATLGELLRHLMDPGTFTSNFNNKLWVSGQHETYLCYKVE 0"
## [1] "RGRREPFEPWENLPKHYTLLHATLGELLRHLMDPGTFTSNFNNKLWVSGQHETYLCYKVE 0"
## [1] "RGRREPFEPWENLPKHYTLLHATLGELLRHLMDPGTFTSNFYNKPWVSGQHETYLCYKVE 0"
## [1] "DGQGKPFKPRKNLPKHYTLLHATLGELLRHVMDPGTFTSNFNNKPWVSGQRETYLCYKVE 0"
## [1] "DGRGKPFKPWNNLPKHYTLLQATLGELLRHLMDPGTFTSNFNNKPWVSGQHETYLCYKVE 0"
## [1] "RGRREPFEPWENLPKHYTLLHATLGELLRHLMDPGMFTSNFYNKSWVSGQHETYLCYKVE 0"
## [1] "YSQRELFEPWNNLPKYYILLHIMLGEILRHSMDPPTFTSNFNNELWVRGRHETYLCYEVE 0"
## [1] "YSQRELFEPWNNLPKYYILLHIMLGEILRHSMDPPTFTSNFNNELWVRGRHETYLCYEVE 0"
## [1] "YSQRELFEPWNNLPKYYILLHIMLGEILRHSMDPPTFTFNFNNEPWVRGRHETYLCYEVE 0"
## [1] "YSQRELFEPWNNLPKYYIVLHIILGEILRHSMDPLTFTSNFNNEPCVEGRHETYLCYKVE 0"
## [1] " "
## [1] "RPHNDTWVLLNQHRGFLQNQAPDIHGFPKGRHAELCFLDLIPLWKLDGQQ-YRVTCFTSW 0"
## [1] "RPHNDTWVLLNQHRGFLQNQAPDIHGFPKGRHAELCFLDLIPFWKLDDQQ-YRVTCFTSW 0"
## [1] "RLHNGTWVPLNQHRGFLRNQAPDIHGFPKGRHAELCFLDLIPFWKLDGQQ-YRVTCFTSW 0"
## [1] "RSHNDTWVLLNQHRGFLRNQAPDRHGFPKGRHAELCFLDLIPFWKLDDQQ-YRVTCFTSW 0"
## [1] "RLHNDTWVPLNQHRGFLRNQAPNIHGFPKGRHAELCFLDLIPFWKLDGQQ-YRVTCFTSW 0"
## [1] "RPHNDTWVLLNQHRGFLRNQAPDIHGFPKGRHAELCFLDLIPFWKLDGQQ-YRVTCFTSW 0"
## [1] "RLHNDTWVLLNQRRGFLCNQAPHKHGFLEGRHAELCFLDVIPFWKLDLHQDYRVTCFTSW 0"
## [1] "RLHNDTRVLLNQRRGFLCNQAPHKHGFLEGRHAELCFLDVIPFWKLDLHQDYRVTCFTSW 0"
## [1] "RMHNDTWVLLNQRRGFLCNQAPHKHGFLEGRHAELCFLDVIPFWKLDLDQDYRVTCFTSW 0"
## [1] "RLHNDTWVLLNQRRGFLCNQAPAIHGFPEGRHAELCFLDVIPFWKLDGKQRYRVTCFTSW 0"
## [1] " "
## [1] "SPCFNCAQEMAKFISNNKHVSLRIFAARIYDDQGRCQEGLRTLHRDGAKIAMMNYSEFEY 0"
## [1] "SPCFNCAQEMAKFISDNKHVSLRIFAARIYDDQGRCQEGLRTLHRDGAKIAMMNYSEFEY 0"
## [1] "SPCFSCAQEMAKFISNNEHVSLCIFAARIYDDQGRCQEGLRTLHRDGAKIAMMNYSEFEY 0"
## [1] "SPCFSCAQKMAKFISNNKHVSLCIFAARIYDDQGRCQEGLRTLHRDGAKIAVMNYSEFEY 0"
## [1] "SPCFSCAQEMAKFISNNEHVSLCIFAARIYDDQGRYQEGLRTLHRDGAKIAMMNYSEFEY 0"
## [1] "SPCFNCAQEMAKFISNNKHVSLCIFAARIYDDQGRCQEGLRTLHRDGAKIAMMNYSEFEY 0"
## [1] "SPCFSCAQEMAKFISNNKHVSLCIFAARIYDDQGRCQEGLRTLAKAGAKISIMTYSEFKH 0"
## [1] "SPCFSCAQEMAKFISNNKHVSLCIFAARIYDDQGRCQEGLRTLAKAGAEISIMTYSEFKH 0"
## [1] "SPCFSCAQEMAKFISKNKHVSLCIFTARIYDDQGRCQEGLRTLAEAGAKISIMTYSEFKH 0"
## [1] "SPCFRCAQEMAKFISNNQHVSLCIFAARIYDDQGRCKEGLRTLDEAEAKISIMTYSEFQH 0"
## [1] " "
## [1] "CWDTFVDRQGRPFQPWDGLDEHSQALSERLRAILQNQGN 21"
## [1] "CWDTFVDRQGRPFQPWDGLDEHSQDLSGRLRAILQNQGN 21"
## [1] "CWDTFVDRQGRPFQPWDGLDEHSQDLSGRLRAILQNQGN 21"
## [1] "CWDTFVDRQGRPFQPWDGLDEHSQALSGRLRAILQNQGN 21"
## [1] "CWDTFVDCQGCPFQPWDGLDEHSQALSERLRAILQNQGN 21"
## [1] "CWDTFVDRQGRPFQPWDGLDEHSQALSERLRAILQNQGN 21"
## [1] "CWDTFVDHQGCPFQPWDGLEEHSQALSERLQAILQNQGN 21"
## [1] "CWDTFVDHQGCPFQPWDGLEEHSQALSERLQAILQNQGN 21"
## [1] "CWDTFVDHQGCPFQPWDGLDEHSQDLSGRLRAILQNQEN 21"
## [1] "CWDTFVDHQGRPFQPWDGLEEHSEAWSGKLQAILQNQGN 21"
## [1] " "
Print MSA
ggmsa::ggmsa(APOBEC3G_align, start = 25, end = 100)
APOBEC3G_dist <- seqinr::dist.alignment(APOBEC3G_align_seqinr, matrix = "identity")
is(APOBEC3G_dist)
## [1] "dist" "oldClass"
class(APOBEC3G_dist)
## [1] "dist"
APOBEC3G_dist_round <- round(APOBEC3G_dist, 3)
APOBEC3G_dist_round
## NP_001292891 NP_001332845 NP_001332812 NP_001279005 NP_001185622
## NP_001332845 0.171
## NP_001332812 0.236 0.213
## NP_001279005 0.334 0.322 0.322
## NP_001185622 0.326 0.346 0.318 0.338
## XP_025254494 0.273 0.282 0.268 0.361 0.368
## NP_001009001 0.444 0.450 0.444 0.473 0.468
## XP_034804992 0.450 0.455 0.450 0.478 0.477
## NP_068594 0.464 0.455 0.450 0.473 0.487
## XP_024095464 0.475 0.475 0.467 0.487 0.493
## XP_025254494 NP_001009001 XP_034804992 NP_068594
## NP_001332845
## NP_001332812
## NP_001279005
## NP_001185622
## XP_025254494
## NP_001009001 0.424
## XP_034804992 0.434 0.088
## NP_068594 0.437 0.217 0.222
## XP_024095464 0.468 0.346 0.357 0.364
tree_subset <- nj(APOBEC3G_dist)
plot.phylo(tree_subset, main = "Phylogenetic Tree", use.edge.length = FALSE)
mtext(text = "APOBEC3G family gene tree - rooted, no branch lengths")