This code compiles summary information about the gene HFE(Homeostatic Iron Regulator).
It also generates alignments and a phylogeneitc tree to indicating the evolutionary relationship betweeen the human version of the gene and its homologs in other species.
RefSeq Gene: https://www.ncbi.nlm.nih.gov/gene/3077 Refseq Homologene: https://www.ncbi.nlm.nih.gov/homologene/88330 Uniprot: https://www.uniprot.org/uniprot/Q30201 PDB: https://www.rcsb.org/structure/1A6Z
library(BiocManager)
## Bioconductor version '3.13' is out-of-date; the current release version '3.14'
## is available with R version '4.1'; see https://bioconductor.org/install
#install("drawProteins")
# github packages
library(compbio4all)
library(ggmsa)
## Registered S3 methods overwritten by 'ggalt':
## method from
## grid.draw.absoluteGrob ggplot2
## grobHeight.absoluteGrob ggplot2
## grobWidth.absoluteGrob ggplot2
## grobX.absoluteGrob ggplot2
## grobY.absoluteGrob ggplot2
# CRAN packages
library(rentrez)
library(seqinr)
library(ape)
##
## Attaching package: 'ape'
## The following objects are masked from 'package:seqinr':
##
## as.alignment, consensus
library(pander)
library(ggplot2)
# Bioconductor packages
library(msa)
## Loading required package: Biostrings
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind, colnames,
## dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
## union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
## Loading required package: stats4
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
## Loading required package: IRanges
## Loading required package: XVector
## Loading required package: GenomeInfoDb
##
## Attaching package: 'Biostrings'
## The following object is masked from 'package:ape':
##
## complement
## The following object is masked from 'package:seqinr':
##
## translate
## The following object is masked from 'package:base':
##
## strsplit
##
## Attaching package: 'msa'
## The following object is masked from 'package:BiocManager':
##
## version
library(drawProteins)
## Biostrings
library(Biostrings)
library(HGNChelper)
Accession numbers were obtained from RefSeq, Refseq Homlogene, UniProt and PDB. UniProt accession numbers can be found by searching for the gene name. PDB accessions can be found by searching with a UniProt accession or a gene name, though many proteins are not in PDB. The the Neanderthal genome database was searched but did not yield sequence information on.
HGNChelper::checkGeneSymbols(x = c("HFE"))
## Maps last updated on: Thu Oct 24 12:31:05 2019
## x Approved Suggested.Symbol
## 1 HFE TRUE HFE
RefSeq <- c("NP_000401.1", "NP_034554.2", "NP_001009101.1", "NP_001012399.1", "NP_445753.1", "XP_001085245.2", "XP_003434224.2", "XP_004043416.1", "XP_007971683.1", "XP_010363199.1")
UniProt <- c("Q30201", "P70387", "A0A2I3SQG8", "NA", "O35799", "NA", "NA", "G3QU39", "NA", "A0A2K6NRY4")
PDB <- c("NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA")
SciName <- c("Homo sapiens", "Mus musculus", "Pan troglodytes", "Bos taurus", "Rattus norvegicus", "Macaca mulatta", "Canis lupus", "Gorilla gorilla gorilla", "Chlorocebus sabaeus", "Rhinopithecus roxellana")
CommonName <- c("Human", "House mouse", "Chimpanzee", "Cattle", "Norway rat", "Rhesus monkey", "Dog", "Western lowland gorilla", "Green monkey", "Golden snub-nosed monkey")
GeneName <- c("HFE", "HFE", "HFE", "HFE", "HFE", "HFE","HFE", "HFE", "HFE", "HFE")
HFE_table <- data.frame(RefSeq,
UniProt,
PDB,
SciName,
CommonName,
GeneName)
pander::pander(HFE_table)
| RefSeq | UniProt | PDB | SciName |
|---|---|---|---|
| NP_000401.1 | Q30201 | NA | Homo sapiens |
| NP_034554.2 | P70387 | NA | Mus musculus |
| NP_001009101.1 | A0A2I3SQG8 | NA | Pan troglodytes |
| NP_001012399.1 | NA | NA | Bos taurus |
| NP_445753.1 | O35799 | NA | Rattus norvegicus |
| XP_001085245.2 | NA | NA | Macaca mulatta |
| XP_003434224.2 | NA | NA | Canis lupus |
| XP_004043416.1 | G3QU39 | NA | Gorilla gorilla gorilla |
| XP_007971683.1 | NA | NA | Chlorocebus sabaeus |
| XP_010363199.1 | A0A2K6NRY4 | NA | Rhinopithecus roxellana |
| CommonName | GeneName |
|---|---|
| Human | HFE |
| House mouse | HFE |
| Chimpanzee | HFE |
| Cattle | HFE |
| Norway rat | HFE |
| Rhesus monkey | HFE |
| Dog | HFE |
| Western lowland gorilla | HFE |
| Green monkey | HFE |
| Golden snub-nosed monkey | HFE |
#downloading sequences
HFE <- rentrez::entrez_fetch(db = "protein",
id = HFE_table$RefSeq,
rettype = "fasta")
HFE_list <- entrez_fetch_list(db = "protein",
id = HFE_table$RefSeq,
rettype = "fasta")
#number of fasta files obtained
length(HFE_list)
## [1] 10
#first entry
HFE_list[[1]]
## [1] ">NP_000401.1 hereditary hemochromatosis protein isoform 1 precursor [Homo sapiens]\nMGPRARPALLLLMLLQTAVLQGRLLRSHSLHYLFMGASEQDLGLSLFEALGYVDDQLFVFYDHESRRVEP\nRTPWVSSRISSQMWLQLSQSLKGWDHMFTVDFWTIMENHNHSKESHTLQVILGCEMQEDNSTEGYWKYGY\nDGQDHLEFCPDTLDWRAAEPRAWPTKLEWERHKIRARQNRAYLERDCPAQLQQLLELGRGVLDQQVPPLV\nKVTHHVTSSVTTLRCRALNYYPQNITMKWLKDKQPMDAKEFEPKDVLPNGDGTYQGWITLAVPPGEEQRY\nTCQVEHPGLDQPLIVIWEPSPSGTLVIGVISGIAVFVVILFIGILFIILRKRQGSRGAMGHYVLAERE\n\n"
#cleaning sequence
for(i in 1:length(HFE_list)){HFE_list[[i]] <- compbio4all::fasta_cleaner(HFE_list[[i]], parse = F)}
#ownloading UniProt
Q30201_HFE <- drawProteins::get_features("Q30201")
## [1] "Download has worked"
is(Q30201_HFE)
## [1] "list" "vector" "list_OR_List" "vector_OR_Vector"
## [5] "vector_OR_factor"
#Create dataframe
Human_Uniprot_df <- drawProteins::feature_to_dataframe(Q30201_HFE)
is(Human_Uniprot_df)
## [1] "data.frame" "list" "oldClass" "vector"
## [5] "list_OR_List" "vector_OR_Vector" "vector_OR_factor"
#View dataframe
Human_Uniprot_df[,-2]
## type begin end length accession entryName taxid order
## featuresTemp SIGNAL 1 22 21 Q30201 HFE_HUMAN 9606 1
## featuresTemp.1 CHAIN 23 348 325 Q30201 HFE_HUMAN 9606 1
## featuresTemp.2 TOPO_DOM 23 306 283 Q30201 HFE_HUMAN 9606 1
## featuresTemp.3 TRANSMEM 307 330 23 Q30201 HFE_HUMAN 9606 1
## featuresTemp.4 TOPO_DOM 331 348 17 Q30201 HFE_HUMAN 9606 1
## featuresTemp.5 DOMAIN 207 298 91 Q30201 HFE_HUMAN 9606 1
## featuresTemp.6 REGION 23 114 91 Q30201 HFE_HUMAN 9606 1
## featuresTemp.7 REGION 115 205 90 Q30201 HFE_HUMAN 9606 1
## featuresTemp.8 REGION 206 297 91 Q30201 HFE_HUMAN 9606 1
## featuresTemp.9 REGION 298 306 8 Q30201 HFE_HUMAN 9606 1
## featuresTemp.10 CARBOHYD 110 110 0 Q30201 HFE_HUMAN 9606 1
## featuresTemp.11 CARBOHYD 130 130 0 Q30201 HFE_HUMAN 9606 1
## featuresTemp.12 CARBOHYD 234 234 0 Q30201 HFE_HUMAN 9606 1
## featuresTemp.13 DISULFID 124 187 63 Q30201 HFE_HUMAN 9606 1
## featuresTemp.14 DISULFID 225 282 57 Q30201 HFE_HUMAN 9606 1
## featuresTemp.15 VAR_SEQ 26 114 88 Q30201 HFE_HUMAN 9606 1
## featuresTemp.16 VAR_SEQ 26 49 23 Q30201 HFE_HUMAN 9606 1
## featuresTemp.17 VAR_SEQ 26 26 0 Q30201 HFE_HUMAN 9606 1
## featuresTemp.18 VAR_SEQ 26 26 0 Q30201 HFE_HUMAN 9606 1
## featuresTemp.19 VAR_SEQ 27 298 271 Q30201 HFE_HUMAN 9606 1
## featuresTemp.20 VAR_SEQ 27 206 179 Q30201 HFE_HUMAN 9606 1
## featuresTemp.21 VAR_SEQ 114 219 105 Q30201 HFE_HUMAN 9606 1
## featuresTemp.22 VAR_SEQ 114 205 91 Q30201 HFE_HUMAN 9606 1
## featuresTemp.23 VAR_SEQ 144 161 17 Q30201 HFE_HUMAN 9606 1
## featuresTemp.24 VAR_SEQ 162 348 186 Q30201 HFE_HUMAN 9606 1
## featuresTemp.25 VAR_SEQ 207 220 13 Q30201 HFE_HUMAN 9606 1
## featuresTemp.26 VAR_SEQ 275 276 1 Q30201 HFE_HUMAN 9606 1
## featuresTemp.27 VAR_SEQ 277 348 71 Q30201 HFE_HUMAN 9606 1
## featuresTemp.28 VARIANT 6 6 0 Q30201 HFE_HUMAN 9606 1
## featuresTemp.29 VARIANT 43 43 0 Q30201 HFE_HUMAN 9606 1
## featuresTemp.30 VARIANT 53 53 0 Q30201 HFE_HUMAN 9606 1
## featuresTemp.31 VARIANT 59 59 0 Q30201 HFE_HUMAN 9606 1
## featuresTemp.32 VARIANT 63 63 0 Q30201 HFE_HUMAN 9606 1
## featuresTemp.33 VARIANT 65 65 0 Q30201 HFE_HUMAN 9606 1
## featuresTemp.34 VARIANT 66 66 0 Q30201 HFE_HUMAN 9606 1
## featuresTemp.35 VARIANT 93 93 0 Q30201 HFE_HUMAN 9606 1
## featuresTemp.36 VARIANT 105 105 0 Q30201 HFE_HUMAN 9606 1
## featuresTemp.37 VARIANT 127 127 0 Q30201 HFE_HUMAN 9606 1
## featuresTemp.38 VARIANT 176 176 0 Q30201 HFE_HUMAN 9606 1
## featuresTemp.39 VARIANT 217 217 0 Q30201 HFE_HUMAN 9606 1
## featuresTemp.40 VARIANT 224 224 0 Q30201 HFE_HUMAN 9606 1
## featuresTemp.41 VARIANT 224 224 0 Q30201 HFE_HUMAN 9606 1
## featuresTemp.42 VARIANT 277 277 0 Q30201 HFE_HUMAN 9606 1
## featuresTemp.43 VARIANT 282 282 0 Q30201 HFE_HUMAN 9606 1
## featuresTemp.44 VARIANT 283 283 0 Q30201 HFE_HUMAN 9606 1
## featuresTemp.45 VARIANT 295 295 0 Q30201 HFE_HUMAN 9606 1
## featuresTemp.46 VARIANT 330 330 0 Q30201 HFE_HUMAN 9606 1
## featuresTemp.47 CONFLICT 230 230 0 Q30201 HFE_HUMAN 9606 1
## featuresTemp.48 CONFLICT 248 248 0 Q30201 HFE_HUMAN 9606 1
## featuresTemp.49 CONFLICT 256 256 0 Q30201 HFE_HUMAN 9606 1
## featuresTemp.50 CONFLICT 275 275 0 Q30201 HFE_HUMAN 9606 1
## featuresTemp.51 CONFLICT 311 311 0 Q30201 HFE_HUMAN 9606 1
## featuresTemp.52 CONFLICT 339 339 0 Q30201 HFE_HUMAN 9606 1
## featuresTemp.53 STRAND 28 38 10 Q30201 HFE_HUMAN 9606 1
## featuresTemp.54 STRAND 43 45 2 Q30201 HFE_HUMAN 9606 1
## featuresTemp.55 STRAND 48 53 5 Q30201 HFE_HUMAN 9606 1
## featuresTemp.56 STRAND 56 65 9 Q30201 HFE_HUMAN 9606 1
## featuresTemp.57 STRAND 68 70 2 Q30201 HFE_HUMAN 9606 1
## featuresTemp.58 HELIX 75 78 3 Q30201 HFE_HUMAN 9606 1
## featuresTemp.59 TURN 79 82 3 Q30201 HFE_HUMAN 9606 1
## featuresTemp.60 HELIX 83 107 24 Q30201 HFE_HUMAN 9606 1
## featuresTemp.61 TURN 108 110 2 Q30201 HFE_HUMAN 9606 1
## featuresTemp.62 STRAND 112 114 2 Q30201 HFE_HUMAN 9606 1
## featuresTemp.63 STRAND 117 126 9 Q30201 HFE_HUMAN 9606 1
## featuresTemp.64 STRAND 132 140 8 Q30201 HFE_HUMAN 9606 1
## featuresTemp.65 STRAND 143 149 6 Q30201 HFE_HUMAN 9606 1
## featuresTemp.66 HELIX 150 152 2 Q30201 HFE_HUMAN 9606 1
## featuresTemp.67 STRAND 154 159 5 Q30201 HFE_HUMAN 9606 1
## featuresTemp.68 HELIX 160 162 2 Q30201 HFE_HUMAN 9606 1
## featuresTemp.69 HELIX 163 170 7 Q30201 HFE_HUMAN 9606 1
## featuresTemp.70 HELIX 174 184 10 Q30201 HFE_HUMAN 9606 1
## featuresTemp.71 HELIX 186 198 12 Q30201 HFE_HUMAN 9606 1
## featuresTemp.72 TURN 199 201 2 Q30201 HFE_HUMAN 9606 1
## featuresTemp.73 STRAND 209 216 7 Q30201 HFE_HUMAN 9606 1
## featuresTemp.74 STRAND 221 233 12 Q30201 HFE_HUMAN 9606 1
## featuresTemp.75 STRAND 236 241 5 Q30201 HFE_HUMAN 9606 1
## featuresTemp.76 HELIX 248 250 2 Q30201 HFE_HUMAN 9606 1
## featuresTemp.77 STRAND 255 258 3 Q30201 HFE_HUMAN 9606 1
## featuresTemp.78 STRAND 264 272 8 Q30201 HFE_HUMAN 9606 1
## featuresTemp.79 HELIX 276 279 3 Q30201 HFE_HUMAN 9606 1
## featuresTemp.80 STRAND 280 285 5 Q30201 HFE_HUMAN 9606 1
## featuresTemp.81 STRAND 289 291 2 Q30201 HFE_HUMAN 9606 1
## featuresTemp.82 STRAND 293 296 3 Q30201 HFE_HUMAN 9606 1
my_canvas <- draw_canvas(Human_Uniprot_df)
my_canvas <- draw_chains(my_canvas, Human_Uniprot_df,
label_size = 2.5)
my_canvas <- draw_domains(my_canvas, Human_Uniprot_df)
my_canvas
#This protein only has one domain
#creates vector
Human_HFE_vector <- compbio4all:: fasta_cleaner(HFE_list[[1]])
#default dot plot
seqinr::dotPlot(Human_HFE_vector,
Human_HFE_vector)
par(mfrow = c(2,2),
mar = c(0,0,2,1))
#2x2 panel to show different values for dotplot settings
#Plot 1: HFE Defaults
dotPlot(Human_HFE_vector,
Human_HFE_vector,
wsize = 1,
nmatch = 1,
main = "HFE Defaults")
#Plot 2: HFE size = 10, nmatch = 1
dotPlot(Human_HFE_vector, Human_HFE_vector,
wsize = 10,
nmatch = 1,
main = "HFE size = 10, nmatch = 1")
#Plot 3: HFE size = 10, nmatch = 5
dotPlot(Human_HFE_vector, Human_HFE_vector,
wsize = 10,
nmatch = 5,
main = "HFE size = 10, nmatch = 5")
#Plot 4: HFE size = 20, nmatch = 5
dotPlot(Human_HFE_vector, Human_HFE_vector,
wsize = 20,
nmatch =5,
main = "HFE size = 20, nmatch = 5")
#single large plot with the best version of the plot
par(mfrow = c(1,1),
mar = c(4,4,4,4))
dotPlot(Human_HFE_vector,
Human_HFE_vector,
wsize = 10,
nmatch = 5,
main = "HFE window = 10, nmatch = 5")
databases <- c("Pfam", "Disprot", "RepeatsDB", "Uniprot", "PDB")
info <- c("The HFE gene has two domains: C1-set and MHC", "NA", "NA", "The subcellular location is the cytoplasm", "NA")
databasedf <- data.frame(databases,
info)
pander(databasedf)
| databases | info |
|---|---|
| Pfam | The HFE gene has two domains: C1-set and MHC |
| Disprot | NA |
| RepeatsDB | NA |
| Uniprot | The subcellular location is the cytoplasm |
| PDB | NA |
Multivariate statistcal techniques were used to confirm the information about protein structure and location in the line database.
Uniprot (which uses http://www.csbio.sjtu.edu.cn/bioinf/Cell-PLoc-2/ I believe) indicates that the protein is a membrane-bound protein, particularlly in the ER.
#create 2 vectors if amino acids and check to make sure they are the same length and contain the same values
aa.1.1 <- c("A","R","N","D","C","Q","E","G","H","I",
"L","K","M","F","P","S","T","W","Y","V")
aa.1.2 <- c("A","R","N","D","C","Q","E","G","H","I",
"L","K","M","F","P","S","T","W","Y","V")
length(aa.1.1) == length(aa.1.2)
## [1] TRUE
aa.1.1 == aa.1.2
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [16] TRUE TRUE TRUE TRUE TRUE
#shows the unique values present
unique(aa.1.1)
## [1] "A" "R" "N" "D" "C" "Q" "E" "G" "H" "I" "L" "K" "M" "F" "P" "S" "T" "W" "Y"
## [20] "V"
#shows they both have same number of unique calues present
length(unique(aa.1.1)) == length(unique(aa.1.2))
## [1] TRUE
#vector of the frequency of the amino acids
alpha <- c(285, 53, 97, 163, 22, 67, 134, 197, 111, 91,
221, 249, 48, 123, 82, 122, 119, 33, 63, 167)
#checks the total
sum(alpha) == 2447
## [1] TRUE
#repeat with beta proteins
beta <- c(203, 67, 139, 121, 75, 122, 86, 297, 49, 120,
177, 115, 16, 85, 127, 341, 253, 44, 110, 229)
sum(beta) == 2776
## [1] TRUE
#repeat with a.plus.b proteins
a.plus.b <- c(175, 78, 120, 111, 74, 74, 86, 171, 33, 93,
110, 112, 25, 52, 71, 126, 117, 30, 108, 123)
sum(a.plus.b) == 1889
## [1] TRUE
#same thing with a.div.b proteins
a.div.b <- c(361, 146, 183, 244, 63, 114, 257, 377, 107, 239,
339, 321, 91, 158, 188, 327, 238, 72, 130, 378)
sum(a.div.b) == 4333
## [1] TRUE
#creates dataframe
pfdf <- data.frame(aa.1.1, alpha, beta, a.plus.b, a.div.b)
#prints dataframe
pander(proteinfold.df)
Quitting from lines 207-375 (FinalPortfolio.Rmd) Error in pander(proteinfold.df) : object ‘proteinfold.df’ not found Calls:
#convert to frequencies
a.prop <- alpha/sum(alpha)
b.prop <- beta/sum(beta)
a.plus.b.prop <- a.plus.b/sum(a.plus.b)
a.div.b.prop <- a.div.b/sum(a.div.b)
#create dataframe and row labels
aa.prop <- data.frame(a.prop,
b.prop,
a.plus.b.prop,
a.div.b.prop)
row.names(aa.prop) <- aa.1.1
#prints dataframe
pander(aa.prop)
| a.prop | b.prop | a.plus.b.prop | a.div.b.prop | |
|---|---|---|---|---|
| A | 0.1165 | 0.07313 | 0.09264 | 0.08331 |
| R | 0.02166 | 0.02414 | 0.04129 | 0.03369 |
| N | 0.03964 | 0.05007 | 0.06353 | 0.04223 |
| D | 0.06661 | 0.04359 | 0.05876 | 0.05631 |
| C | 0.008991 | 0.02702 | 0.03917 | 0.01454 |
| Q | 0.02738 | 0.04395 | 0.03917 | 0.02631 |
| E | 0.05476 | 0.03098 | 0.04553 | 0.05931 |
| G | 0.08051 | 0.107 | 0.09052 | 0.08701 |
| H | 0.04536 | 0.01765 | 0.01747 | 0.02469 |
| I | 0.03719 | 0.04323 | 0.04923 | 0.05516 |
| L | 0.09031 | 0.06376 | 0.05823 | 0.07824 |
| K | 0.1018 | 0.04143 | 0.05929 | 0.07408 |
| M | 0.01962 | 0.005764 | 0.01323 | 0.021 |
| F | 0.05027 | 0.03062 | 0.02753 | 0.03646 |
| P | 0.03351 | 0.04575 | 0.03759 | 0.04339 |
| S | 0.04986 | 0.1228 | 0.0667 | 0.07547 |
| T | 0.04863 | 0.09114 | 0.06194 | 0.05493 |
| W | 0.01349 | 0.01585 | 0.01588 | 0.01662 |
| Y | 0.02575 | 0.03963 | 0.05717 | 0.03 |
| V | 0.06825 | 0.08249 | 0.06511 | 0.08724 |
#number of each amino acid in protein
table(Human_HFE_vector)
## Human_HFE_vector
## A C D E F G H I K L M N P Q R S T V W Y
## 17 5 18 23 11 24 13 16 12 43 10 7 20 23 23 20 17 23 11 12
#convert table to vector
table_to_vector <- function(table_x){
table_names <- attr(table_x, "dimnames")[[1]]
table_vect <- as.vector(table_x)
names(table_vect) <- table_names
return(table_vect)
}
#converts table of amino acid frequencies into vector
HFE_human_table <- table(Human_HFE_vector)/length(Human_HFE_vector)
HFE.human.aa.freq <- table_to_vector(HFE_human_table)
HFE.human.aa.freq
## A C D E F G H
## 0.04885057 0.01436782 0.05172414 0.06609195 0.03160920 0.06896552 0.03735632
## I K L M N P Q
## 0.04597701 0.03448276 0.12356322 0.02873563 0.02011494 0.05747126 0.06609195
## R S T V W Y
## 0.06609195 0.05747126 0.04885057 0.06609195 0.03160920 0.03448276
#Checks for U, an unknown amino acid
aa.names <- names(HFE.human.aa.freq)
any(aa.names == "U")
## [1] FALSE
aa.prop$HFE.human.aa.freq <- HFE.human.aa.freq
pander::pander(aa.prop)
| a.prop | b.prop | a.plus.b.prop | a.div.b.prop | HFE.human.aa.freq | |
|---|---|---|---|---|---|
| A | 0.1165 | 0.07313 | 0.09264 | 0.08331 | 0.04885 |
| R | 0.02166 | 0.02414 | 0.04129 | 0.03369 | 0.01437 |
| N | 0.03964 | 0.05007 | 0.06353 | 0.04223 | 0.05172 |
| D | 0.06661 | 0.04359 | 0.05876 | 0.05631 | 0.06609 |
| C | 0.008991 | 0.02702 | 0.03917 | 0.01454 | 0.03161 |
| Q | 0.02738 | 0.04395 | 0.03917 | 0.02631 | 0.06897 |
| E | 0.05476 | 0.03098 | 0.04553 | 0.05931 | 0.03736 |
| G | 0.08051 | 0.107 | 0.09052 | 0.08701 | 0.04598 |
| H | 0.04536 | 0.01765 | 0.01747 | 0.02469 | 0.03448 |
| I | 0.03719 | 0.04323 | 0.04923 | 0.05516 | 0.1236 |
| L | 0.09031 | 0.06376 | 0.05823 | 0.07824 | 0.02874 |
| K | 0.1018 | 0.04143 | 0.05929 | 0.07408 | 0.02011 |
| M | 0.01962 | 0.005764 | 0.01323 | 0.021 | 0.05747 |
| F | 0.05027 | 0.03062 | 0.02753 | 0.03646 | 0.06609 |
| P | 0.03351 | 0.04575 | 0.03759 | 0.04339 | 0.06609 |
| S | 0.04986 | 0.1228 | 0.0667 | 0.07547 | 0.05747 |
| T | 0.04863 | 0.09114 | 0.06194 | 0.05493 | 0.04885 |
| W | 0.01349 | 0.01585 | 0.01588 | 0.01662 | 0.06609 |
| Y | 0.02575 | 0.03963 | 0.05717 | 0.03 | 0.03161 |
| V | 0.06825 | 0.08249 | 0.06511 | 0.08724 | 0.03448 |
#correlation used in chou
chou_cor <- function(x,y){
numerator <- sum(x*y)
denominator <- sqrt((sum(x^2))*(sum(y^2)))
result <- numerator/denominator
return(result)
}
#cosine similarity used in Higgs and Attwood
chou_cosine <- function(z.1, z.2){
z.1.abs <- sqrt(sum(z.1^2))
z.2.abs <- sqrt(sum(z.2^2))
my.cosine <- sum(z.1*z.2)/(z.1.abs*z.2.abs)
return(my.cosine)
}
#calculates and displays correlation between each column
corr.alpha <- chou_cor(aa.prop[,5], aa.prop[,1])
corr.beta <- chou_cor(aa.prop[,5], aa.prop[,2])
corr.apb <- chou_cor(aa.prop[,5], aa.prop[,3])
corr.adb <- chou_cor(aa.prop[,5], aa.prop[,4])
corr.alpha
## [1] 0.7398139
corr.beta
## [1] 0.7745986
corr.apb
## [1] 0.8111314
corr.adb
## [1] 0.7998707
#cosine similarity
cos.alpha <- chou_cosine(aa.prop[,5], aa.prop[,1])
cos.beta <- chou_cosine(aa.prop[,5], aa.prop[,2])
cos.apb <- chou_cosine(aa.prop[,5], aa.prop[,3])
cos.adb <- chou_cosine(aa.prop[,5], aa.prop[,4])
cos.alpha
## [1] 0.7398139
cos.beta
## [1] 0.7745986
cos.apb
## [1] 0.8111314
cos.adb
## [1] 0.7998707
#euclidean distance
aa.prop.flipped <- t(aa.prop)
round(aa.prop.flipped,2)
## A R N D C Q E G H I L K
## a.prop 0.12 0.02 0.04 0.07 0.01 0.03 0.05 0.08 0.05 0.04 0.09 0.10
## b.prop 0.07 0.02 0.05 0.04 0.03 0.04 0.03 0.11 0.02 0.04 0.06 0.04
## a.plus.b.prop 0.09 0.04 0.06 0.06 0.04 0.04 0.05 0.09 0.02 0.05 0.06 0.06
## a.div.b.prop 0.08 0.03 0.04 0.06 0.01 0.03 0.06 0.09 0.02 0.06 0.08 0.07
## HFE.human.aa.freq 0.05 0.01 0.05 0.07 0.03 0.07 0.04 0.05 0.03 0.12 0.03 0.02
## M F P S T W Y V
## a.prop 0.02 0.05 0.03 0.05 0.05 0.01 0.03 0.07
## b.prop 0.01 0.03 0.05 0.12 0.09 0.02 0.04 0.08
## a.plus.b.prop 0.01 0.03 0.04 0.07 0.06 0.02 0.06 0.07
## a.div.b.prop 0.02 0.04 0.04 0.08 0.05 0.02 0.03 0.09
## HFE.human.aa.freq 0.06 0.07 0.07 0.06 0.05 0.07 0.03 0.03
#distance matrix
dist(aa.prop.flipped, method = "euclidean")
## a.prop b.prop a.plus.b.prop a.div.b.prop
## b.prop 0.13342098
## a.plus.b.prop 0.09281824 0.08289406
## a.div.b.prop 0.06699039 0.08659174 0.06175113
## HFE.human.aa.freq 0.18245313 0.17123772 0.15058824 0.15645720
#Individual distances using dist()
dist.alpha <- dist((aa.prop.flipped[c(1,5),]), method = "euclidean")
dist.beta <- dist((aa.prop.flipped[c(2,5),]), method = "euclidean")
dist.apb <- dist((aa.prop.flipped[c(3,5),]), method = "euclidean")
dist.adb <- dist((aa.prop.flipped[c(4,5),]), method = "euclidean")
dist.alpha
## a.prop
## HFE.human.aa.freq 0.1824531
dist.beta
## b.prop
## HFE.human.aa.freq 0.1712377
dist.apb
## a.plus.b.prop
## HFE.human.aa.freq 0.1505882
dist.adb
## a.div.b.prop
## HFE.human.aa.freq 0.1564572
#Compile the information. Rounding makes it easier to read
# fold types
fold.type <- c("alpha","beta","alpha plus beta", "alpha/beta")
# data
corr.sim <- round(c(corr.alpha,corr.beta,corr.apb,corr.adb),5)
cosine.sim <- round(c(cos.alpha,cos.beta,cos.apb,cos.adb),5)
Euclidean.dist <- round(c(dist.alpha,dist.beta,dist.apb,dist.adb),5)
# summary
sim.sum <- c("","","most.sim","")
dist.sum <- c("","","min.dist","")
df <- data.frame(fold.type,
corr.sim ,
cosine.sim ,
Euclidean.dist ,
sim.sum ,
dist.sum )
#display output
pander::pander(df)
| fold.type | corr.sim | cosine.sim | Euclidean.dist | sim.sum | dist.sum |
|---|---|---|---|---|---|
| alpha | 0.7398 | 0.7398 | 0.1825 | ||
| beta | 0.7746 | 0.7746 | 0.1712 | ||
| alpha plus beta | 0.8111 | 0.8111 | 0.1506 | most.sim | min.dist |
| alpha/beta | 0.7999 | 0.7999 | 0.1565 |
#Convert all FASTA records intro entries in a single vector. FASTA entries are contained in a list produced at the beginning of the script. They were cleaned to remove the header and newline characters.
names(HFE_list)
## [1] "NP_000401.1" "NP_034554.2" "NP_001009101.1" "NP_001012399.1"
## [5] "NP_445753.1" "XP_001085245.2" "XP_003434224.2" "XP_004043416.1"
## [9] "XP_007971683.1" "XP_010363199.1"
length(HFE_list)
## [1] 10
HFE_list[1]
## $NP_000401.1
## [1] "MGPRARPALLLLMLLQTAVLQGRLLRSHSLHYLFMGASEQDLGLSLFEALGYVDDQLFVFYDHESRRVEPRTPWVSSRISSQMWLQLSQSLKGWDHMFTVDFWTIMENHNHSKESHTLQVILGCEMQEDNSTEGYWKYGYDGQDHLEFCPDTLDWRAAEPRAWPTKLEWERHKIRARQNRAYLERDCPAQLQQLLELGRGVLDQQVPPLVKVTHHVTSSVTTLRCRALNYYPQNITMKWLKDKQPMDAKEFEPKDVLPNGDGTYQGWITLAVPPGEEQRYTCQVEHPGLDQPLIVIWEPSPSGTLVIGVISGIAVFVVILFIGILFIILRKRQGSRGAMGHYVLAERE"
#Creates and prints empty vector
HFE_vector <- rep(NA, length(HFE_list))
HFE_vector
## [1] NA NA NA NA NA NA NA NA NA NA
#Makes each entry of the list into a vector.
for(i in 1:length(HFE_vector)){
HFE_vector[i] <- HFE_list[[i]]
}
# name the vector
names(HFE_vector) <- names(HFE_list)
#Turns sequences into vectors
humanHFE <- fasta_cleaner(HFE_list[[1]], parse = F)
chimpHFE <- fasta_cleaner(HFE_list[[3]], parse = F)
housemouseHFE <- fasta_cleaner(HFE_list[[2]], parse = F)
cattleHFE <- fasta_cleaner(HFE_list[[4]], parse = F)
#Global pairwise alignments on sequences
align.01.03 <- Biostrings::pairwiseAlignment(
humanHFE,
chimpHFE)
align.01.02 <- Biostrings::pairwiseAlignment(
humanHFE,
housemouseHFE)
align.01.04 <- Biostrings::pairwiseAlignment(
humanHFE,
cattleHFE)
Do pairwise alignments for humans, chimps and 2-other species.
Biostrings::pid(align.01.03)
## [1] 100
Biostrings::pid(align.01.02)
## [1] 66.94444
Biostrings::pid(align.01.04)
## [1] 76.40449
#Global pairwise alignments for matrix
align.03.02 <- Biostrings::pairwiseAlignment(
chimpHFE,
housemouseHFE)
align.03.04 <- Biostrings::pairwiseAlignment(
chimpHFE,
cattleHFE)
align.02.04 <- Biostrings::pairwiseAlignment(
housemouseHFE,
cattleHFE)
#Build matrix of PID
pids <- c(1, NA, NA, NA,
pid(align.01.03), 1, NA, NA,
pid(align.01.02), pid(align.03.02), 1, NA,
pid(align.01.04), pid(align.03.04), pid(align.02.04), 1)
mat <- matrix(pids, nrow = 4, byrow = T)
row.names(mat) <- c("Homo","Pan","Mus","Bos")
colnames(mat) <- c("Homo","Pan","Mus","Bos")
pander::pander(mat)
| Homo | Pan | Mus | Bos | |
|---|---|---|---|---|
| Homo | 1 | NA | NA | NA |
| Pan | 100 | 1 | NA | NA |
| Mus | 66.94 | 66.94 | 1 | NA |
| Bos | 76.4 | 76.4 | 68.61 | 1 |
#Comparisons between humans and chimps
PID1 <- pid(align.01.03, type = "PID1")
PID2 <- pid(align.01.03, type = "PID2")
PID3 <- pid(align.01.03, type = "PID3")
PID4 <- pid(align.01.03, type = "PID4")
#Creates dataframe
method <- c("PID1", "PID2", "PID3", "PID4")
PID <- c(PID1, PID2, PID3, PID4)
denominator <- c("(aligned positions + internal gap positions)", "(aligned positions)", "(length shorter sequence)", "(average length of the two sequences)")
PIDdf <- data.frame(method,
PID,
denominator)
pander(PIDdf)
| method | PID | denominator |
|---|---|---|
| PID1 | 100 | (aligned positions + internal gap positions) |
| PID2 | 100 | (aligned positions) |
| PID3 | 100 | (length shorter sequence) |
| PID4 | 100 | (average length of the two sequences) |
For use with R bioinformatics tools we need to convert our named vector to a string set using Biostrings::AAStringSet(). Note the _ss tag at the end of the object we’re assigning the output to, which designates this as a string set.
HFE_vector_ss <- Biostrings::AAStringSet(HFE_vector)
#uses default substitution matrix
HFE_align <- msa(HFE_vector_ss,
method = "ClustalW")
## use default substitution matrix
msa produces a species MSA objects
#Shows information about stringset
class(HFE_align)
## [1] "MsaAAMultipleAlignment"
## attr(,"package")
## [1] "msa"
is(HFE_align)
## [1] "MsaAAMultipleAlignment" "AAMultipleAlignment" "MsaMetaData"
## [4] "MultipleAlignment"
#default output of MSA
HFE_align
## CLUSTAL 2.1
##
## Call:
## msa(HFE_vector_ss, method = "ClustalW")
##
## MsaAAMultipleAlignment with 10 rows and 504 columns
## aln names
## [1] -------------------------...------------------------- NP_000401.1
## [2] -------------------------...------------------------- NP_001009101.1
## [3] -------------------------...------------------------- XP_004043416.1
## [4] -------------------------...------------------------- XP_001085245.2
## [5] -------------------------...------------------------- XP_007971683.1
## [6] -------------------------...------------------------- XP_010363199.1
## [7] -------------------------...------------------------- NP_001012399.1
## [8] MGLFRSDKVQSTFLHQGRRPRRTDT...ARARGRFPAHQSDSDPSHTPLHPDG XP_003434224.2
## [9] -------------------------...------------------------- NP_034554.2
## [10] -------------------------...------------------------- NP_445753.1
## Con -------------------------...------------------------- Consensus
#Changes class of alignment
class(HFE_align) <- "AAMultipleAlignment"
#Converts to seqinr format
HFE_align_seqinr <- msaConvert(HFE_align,
type = "seqinr::alignment")
#Shows information
class(HFE_align_seqinr)
## [1] "alignment"
is(HFE_align_seqinr)
## [1] "alignment"
#Display output
compbio4all::print_msa(HFE_align_seqinr)
## [1] "------------------------------------------------------------ 0"
## [1] "------------------------------------------------------------ 0"
## [1] "------------------------------------------------------------ 0"
## [1] "------------------------------------------------------------ 0"
## [1] "------------------------------------------------------------ 0"
## [1] "------------------------------------------------------------ 0"
## [1] "------------------------------------------------------------ 0"
## [1] "MGLFRSDKVQSTFLHQGRRPRRTDTSQGQRGPGRFHQQVLGIEESSSSGVSFPPTARKRI 0"
## [1] "------------------------------------------------------------ 0"
## [1] "------------------------------------------------------------ 0"
## [1] " "
## [1] "-------------------------------------------MGPRARPA--------- 0"
## [1] "-------------------------------------------MGPRARPA--------- 0"
## [1] "-------------------------------------------MGPRARPA--------- 0"
## [1] "-------------------------------------------MGPRARPA--------- 0"
## [1] "-------------------------------------------MGPRARPA--------- 0"
## [1] "-------------------------------------------MGPRARPA--------- 0"
## [1] "-------------------------------------------MGPRARPA--------- 0"
## [1] "ALARKFCLASPQPGRRLQGDLPAGPAPPPHAGGGSSGRAAAAEMSPGARRARLLLLLLLL 0"
## [1] "-------------------------------------------MSLSAGLPVR------P 0"
## [1] "-------------------------------------------MDRSAGLPVRL-----L 0"
## [1] " "
## [1] "-LLLLMLLQTAVLQGRLLRSHSLHYLFMGASEQDLGLSLFEALGYVDDQLFVFYDHESRR 0"
## [1] "-LLLLMLLQTAVLQGRLLRSHSLHYLFMGASEQDLGLSLFEALGYVDDQLFVFYDHESRR 0"
## [1] "-LLLLMLLQTAVLQGRLLRSHSLHYLFMGASEQDLGLSLFEALGYVDDQLFVFYDHESRR 0"
## [1] "-LLLLMLLQTAVLQGRLLRSHSLHYLFMGSSEQDLGLSLFEALGYVDDQLFVFYDHESRR 0"
## [1] "-LLLLMLLQTAVLQGRLLRSHSLHYLFMGSSEQDLGLSLFEALGYVDDQLFVFYDHESRR 0"
## [1] "-LLLLMLLQTAVLQGRLLQSH--------------------------------------- 0"
## [1] "-LLLLILLRTAATQGRPPRSHSLRFLFMGASKPDLGLPLFEALGYVDDQLFVSYDHESRR 0"
## [1] "LLFLLLQLPTVAAQRRPPRSHSLRYLFMGASVPDLGLPLFEARGYVDDQLFVSYSHESRR 0"
## [1] "LLLLLLLLWSVAPQALPPRSHSLRYLFMGASEPDLGLPLFEARGYVDDQLFVSYNHESRR 0"
## [1] "LLLLLLLLWSVAPQALRPGSHSLRYLFMGASKPDLGLPFFEALGYVDDQLFVSYNHESRR 0"
## [1] " "
## [1] "VEPRTPWVSSRISSQMWLQLSQSLKGWDHMFTVDFWTIMENHNHS--------KESHTLQ 0"
## [1] "VEPRTPWVSSRISSQMWLQLSQSLKGWDHMFTVDFWTIMENHNHS--------KESHTLQ 0"
## [1] "VEPRTPWVSSRISSQVWLQLSQSLKGWDHMFTVDFWTIMENHNHS--------KESHTLQ 0"
## [1] "VEPRTPWVSGRTSSQMWLQLSQSLKGWDHMFTVDFWTIMENHNHS--------KESHTLQ 0"
## [1] "VEPRTPWVSGRTSSQMWLQLSQSLKGWDHMFTVDFWTIMENHNHS--------KESHTLQ 0"
## [1] "---------------------------------------------------------TLQ 0"
## [1] "AERRAPWLWGRATSQLWLQLSQNLKGWDHMFIVDFWTIMDNHNQSKVTKLGALPESHTLQ 0"
## [1] "AEPRAQWVRTGAASQLWLQLSQSLKGWDHMFIVDFWTIMDNHNHSKVTKLGVSSESHTLQ 0"
## [1] "AEPRAPWILEQTSSQLWLHLSQSLKGWDYMFIVDFWTIMGNYNHSKVTKLGVVSESHILQ 0"
## [1] "AEPRAPWILGQTSSQLWLQLSQSLKGWDYMFIVDFWTIMGNYNHSKVTKLRVVPESHILQ 0"
## [1] " "
## [1] "VILGCEMQEDNSTEGYWKYGYDGQDHLEFCPDTLDWRAAEPRAWPTKLEWERHKIRARQN 0"
## [1] "VILGCEMQEDNSTEGYWKYGYDGQDHLEFCPDTLDWRAAEPRAWPTKLEWERHKIRARQN 0"
## [1] "VILGCEMQEDNSTEGYWKYGYDGQDHLEFCPDTLDWRAAEPRAWPTKLEWERHKIRARQN 0"
## [1] "VILGCKMQEDNSTEGFWKYGYDGQDHLEFCPDTLDWRAAEPRAWPTKLEWERHKIRARQN 0"
## [1] "VILGCKMQEDNSTEGFWKYGYDGQDHLEFCPDTLDWKAAEPRAWPTKLEWERHKIRARQN 0"
## [1] "VILGCKMQEDNSTEGFWKYGYDGQDHLEFCPDTLDWRAAEPRAWPTKLEWERHKIRARQN 0"
## [1] "VILGCELQEDNSTRGFWKYGYDGQDHLEFRPETLDWRAAEPRAQVTKLEWEVNKIRAKQN 0"
## [1] "VILGCEVQEDNSTTGFWKYGYDGQNHLEFCPETLDWRAAEPKAQATKLEWEVNKIRAKQN 0"
## [1] "VVLGCEVHEDNSTSGFWRYGYDGQDHLEFCPKTLNWSAAEPGAWATKVEWDEHKIRAKQN 0"
## [1] "VILGCEVHEDNSTSGFWKYGYDGQDHLEFCPKTLNWSAAEPRAWATKMEWEEHRIRARQS 0"
## [1] " "
## [1] "RAYLERDCPAQLQQLLELGRGVLDQQVPPLVKVTHHVTSSVTTLRCRALNYYPQNITMKW 0"
## [1] "RAYLERDCPAQLQQLLELGRGVLDQQVPPLVKVTHHVTSSVTTLRCRALNYYPQNITMKW 0"
## [1] "RAYLERDCPAQLQQLLELGRGLLDQQVPPLVKVTHHVTSSVTTLRCRALNYYPQNITMKW 0"
## [1] "RAYLERDCPVQLQQLLELGRGVFDRPVPPLVKVTHHVTSSVTTLRCRALNYYPQNITMKW 0"
## [1] "RAYLERDCPVQLQQLLELGRGVFDRPVPPLVKVTHHVTSSVTTLRCRALNYYPQNITMKW 0"
## [1] "RAYLERDCPAQLQQLLELGRGVFDRPVPPLVKVTHHVTSSVTTLRCRALNYYPQNITMKW 0"
## [1] "RAYLDRDCPEQLLHLLELGRGPLEQQVPPLVKVTHHVTSSLTTLRCRALNFYPQNITIRW 0"
## [1] "RAYLQRDCPEQLRQLLELGRGVLDRQVPPLVKMTHHVTSAVTTLRCQALSFYPQNITMKW 0"
## [1] "RDYLEKDCPEQLKRLLELGRGVLGQQVPTLVKVTRHWASTGTSLRCQALDFFPQNITMRW 0"
## [1] "RDYLQRDCPQQLKQVLELQRGVLGQQVPTLVKVTRHWASTGTSLRCQALNFFPQNITMRW 0"
## [1] " "
## [1] "LKDKQPMDAKEFEPKDVLPNGDGTYQGWITLAVPPGEEQRYTCQVEHPGLDQPLIVIWEP 0"
## [1] "LKDKQPMDAKEFEPKDVLPNGDGTYQGWITLAVPPGEEQRYTCQVEHPGLDQPLIVIWEP 0"
## [1] "LKDKQPMDAKEFEPKDVLPNGDGTYQGWITLAVPPGEEQRYTCQVEHPGLDQPLIVIWEP 0"
## [1] "LKDRQSMDAKEVEPKDVLPNGDGTYQGWITLTVPPGEEQRYTCQVEHPGLDQPLLAFWEP 0"
## [1] "LKDRQPMDAKEVERKDVLPNGDGTYQGWITLTVPPGEEQRYTCQVEHPGLDQPLLAFWEP 0"
## [1] "LKDRQPMDAKEVKPKDVLPNGDGTYQGWITLTVPPGEEQRYTCQVEHPGLDQPLLAIWDP 0"
## [1] "LKDKQFLDAKEIKPEDVLPNGDGTYQAWVALAMLPGEEQRYSCQVEHPGLDQPLTATWEP 0"
## [1] "LKDRQPLDAEDVEPKDVLPNGDGTYQRWVALAVAPGEEQRYTCQVEHPGLDQPLTASWEA 0"
## [1] "LKDNQPLDAKDVNPEKVLPNGDETYQGWLTLAVAPGDETRFTCQVEHPGLDQPLTASWEP 0"
## [1] "LKDSQPLDAKDVNPENVLPNGDGTYQGWLTLAVAPGEETRFSCQVEHPGLDQPLTATWEP 0"
## [1] " "
## [1] "SPSGTLVIGVISGIAVFVVILFIGILFIILRKRQGSRGAMGHYVLAERE----------- 0"
## [1] "SPSGTLVIGVISGIAVFVVILFIGILFIILRKRQGSRGAMGHYVLAERE----------- 0"
## [1] "SPSGTLVIGVISGIAVFFVILFIGILFIILRKRQGSRGAMGHYVLAERE----------- 0"
## [1] "SPSRTLVIGVISGIAVFVIILFIGILFIILRKRQTSRGVMGHYVLAERE----------- 0"
## [1] "SPSRTLVIGVISGIAVFVIILFIGILFIILRKRQTSRGVMGHYVLAERF----------- 0"
## [1] "SPSGTLVIGVISGIAVFVIILFIGILFIILRKRQTSRGVMGHYVLAERE----------- 0"
## [1] "SLSGTLVTGILSGIAVCVIIFLIGILFRILKRRQSSRGAAVNYALAECE----------- 0"
## [1] "PMSGTLVVGIISGIAVCIIVLFTGILFRILRKRQASRGAMGDYVLAEYEENEARSVSQPA 0"
## [1] "LQSQAMIIGIISGVTVCA-IFLVGILFLILRKRKASGGTMGGYVLTDCE----------- 0"
## [1] "SRSQDMIIGIISGITICA-IFFVGILILVLRKRKVSGGTMGDYVLTECE----------- 0"
## [1] " "
## [1] "------------------------ 36"
## [1] "------------------------ 36"
## [1] "------------------------ 36"
## [1] "------------------------ 36"
## [1] "------------------------ 36"
## [1] "------------------------ 36"
## [1] "------------------------ 36"
## [1] "RARGRFPAHQSDSDPSHTPLHPDG 36"
## [1] "------------------------ 36"
## [1] "------------------------ 36"
## [1] " "
Based on the output from drawProtiens, the first 50 amino acids appears to contain an interesting helical section.
#First step changes class of alignment; second step displays MSA
class(HFE_align) <- "AAMultipleAlignment"
ggmsa::ggmsa(HFE_align,
start = 225,
end = 275)
Make a distance matrix This produces a “dist” class object.
#calculates genetic distance using MSA
HFE_subset_dist <- seqinr::dist.alignment(HFE_align_seqinr,
matrix = "identity")
is(HFE_subset_dist)
## [1] "dist" "oldClass"
class(HFE_subset_dist)
## [1] "dist"
#round for display
HFE_align_seqinr_rnd <- round(HFE_subset_dist, 3)
HFE_align_seqinr_rnd
## NP_000401.1 NP_001009101.1 XP_004043416.1 XP_001085245.2
## NP_001009101.1 0.000
## XP_004043416.1 0.093 0.093
## XP_001085245.2 0.240 0.240 0.257
## XP_007971683.1 0.251 0.251 0.268 0.107
## XP_010363199.1 0.248 0.248 0.263 0.164
## NP_001012399.1 0.470 0.470 0.473 0.485
## XP_003434224.2 0.473 0.473 0.476 0.479
## NP_034554.2 0.560 0.560 0.563 0.568
## NP_445753.1 0.550 0.550 0.553 0.555
## XP_007971683.1 XP_010363199.1 NP_001012399.1 XP_003434224.2
## NP_001009101.1
## XP_004043416.1
## XP_001085245.2
## XP_007971683.1
## XP_010363199.1 0.186
## NP_001012399.1 0.494 0.488
## XP_003434224.2 0.485 0.484 0.459
## NP_034554.2 0.571 0.599 0.557 0.520
## NP_445753.1 0.558 0.583 0.546 0.532
## NP_034554.2
## NP_001009101.1
## XP_004043416.1
## XP_001085245.2
## XP_007971683.1
## XP_010363199.1
## NP_001012399.1
## XP_003434224.2
## NP_034554.2
## NP_445753.1 0.350
Build a phylogenetic tree from distance matrix
# Uses genetic distances to generate clades from cluster sequences
tree <- nj(HFE_subset_dist)
# Plots phylogenetic tree and adds a label
plot.phylo(tree, main="Phylogenetic Tree",
use.edge.length = F)
mtext(text = "HFE family gene tree - rooted, no branch lengths")