INTRODUCTION
TODO: Brief statement in of the gene and its full name and in your own words what is done in your script
EXAMPLE: This code compiles summary information about the gene DIO1 (iodothyronine deiodinase 1). It also generates alignments and a phylogeneitc tree to indicating the evolutionary relationship betweeen the human version of the gene and its homologs in other species.
setup
Install any needed packages.
library(BiocManager)
## Bioconductor version '3.13' is out-of-date; the current release version '3.14'
## is available with R version '4.1'; see https://bioconductor.org/install
BiocManager::install("drawProteins")
## Bioconductor version 3.13 (BiocManager 1.30.16), R 4.1.2 (2021-11-01)
## Warning: package(s) not installed when version(s) same as current; use `force = TRUE` to
## re-install: 'drawProteins'
## Installation paths not writeable, unable to update packages
## path: C:/Program Files/R/R-4.1.2/library
## packages:
## Matrix
## Old packages: 'digest', 'glue', 'rlang', 'tibble', 'XML'
library(drawProteins)
#install.packages('HGNChelper')
#install.packages("cli")
load the necessary packages
#library(devtools)
# github packages
library(compbio4all)
library(ggmsa)
## Registered S3 methods overwritten by 'ggalt':
## method from
## grid.draw.absoluteGrob ggplot2
## grobHeight.absoluteGrob ggplot2
## grobWidth.absoluteGrob ggplot2
## grobX.absoluteGrob ggplot2
## grobY.absoluteGrob ggplot2
# CRAN packages
library(rentrez)
library(seqinr)
library(ape)
##
## Attaching package: 'ape'
## The following objects are masked from 'package:seqinr':
##
## as.alignment, consensus
library(pander)
library(ggplot2)
# Bioconductor packages
## msa
library(msa)
## Loading required package: Biostrings
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind, colnames,
## dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
## union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
## Loading required package: stats4
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
## Loading required package: IRanges
##
## Attaching package: 'IRanges'
## The following object is masked from 'package:grDevices':
##
## windows
## Loading required package: XVector
## Loading required package: GenomeInfoDb
##
## Attaching package: 'Biostrings'
## The following object is masked from 'package:ape':
##
## complement
## The following object is masked from 'package:seqinr':
##
## translate
## The following object is masked from 'package:base':
##
## strsplit
##
## Attaching package: 'msa'
## The following object is masked from 'package:BiocManager':
##
## version
library(drawProteins)
## Biostrings
library(Biostrings)
#library(HGNChelper)
ACCESSION NUMBERS
acession number table
#Getting all the info
acc_num <- c("NP_001123361","XP_016780936.1","NP_034154.2","NP_001271395.1","NP_001012684.1","NP_001017161.1","NP_571630.2","XP_004626354.1","XP_025731052.1","XP_004746206.1")
unip_acc <- c("P40126","A0A2I3SUR0","P29812","F1PYI1","F1MC37","F7EEJ9","F1QRR2","A0A6P3EM25","A0A3Q7PLZ6","M3YKA6")
pdb_acc <- c("NA","NA","NA","NA","NA","NA","NA","NA","NA","NA")
sci_name <- c("Homo sapiens","Pan troglodytes","Mus musculus","Canis lupus","Bos taurus","Xenopus tropicalis","Danio rerio","Octodon degus","Callorhinus ursinus","Mustela putorius furo")
com_name <- c("Human","Chimpanzee","Mouse","Dog","Cattle","Frog","Fish","Common degu","Northern fur seal","Ferret")
gene_name <- c("DCT","DCT","DCT","DCT","DCT","DCT","DCT","DCT","DCT","DCT")
the finished table
pander::pander(dct_df)
Table continues below
| NP_001123361 |
P40126 |
NA |
Homo sapiens |
| XP_016780936.1 |
A0A2I3SUR0 |
NA |
Pan troglodytes |
| NP_034154.2 |
P29812 |
NA |
Mus musculus |
| NP_001271395.1 |
F1PYI1 |
NA |
Canis lupus |
| NP_001012684.1 |
F1MC37 |
NA |
Bos taurus |
| NP_001017161.1 |
F7EEJ9 |
NA |
Xenopus tropicalis |
| NP_571630.2 |
F1QRR2 |
NA |
Danio rerio |
| XP_004626354.1 |
A0A6P3EM25 |
NA |
Octodon degus |
| XP_025731052.1 |
A0A3Q7PLZ6 |
NA |
Callorhinus ursinus |
| XP_004746206.1 |
M3YKA6 |
NA |
Mustela putorius furo |
| Human |
DCT |
| Chimpanzee |
DCT |
| Mouse |
DCT |
| Dog |
DCT |
| Cattle |
DCT |
| Frog |
DCT |
| Fish |
DCT |
| Common degu |
DCT |
| Northern fur seal |
DCT |
| Ferret |
DCT |
PROTEIN FEATURE PREDICTION
secondary structure prediction
# enter once
aa.1.1 <- c("A","R","N","D","C","Q","E","G","H","I",
"L","K","M","F","P","S","T","W","Y","V")
# enter again
aa.1.2 <- c("A","R","N","D","C","Q","E","G","H","I",
"L","K","M","F","P","S","T","W","Y","V")
# are they the same length?
length(aa.1.1) == length(aa.1.2)
## [1] TRUE
make vectors with the frequency of each amino acid in the databse of proteins of each type used by Chou
# alpha proteins
alpha <- c(285, 53, 97, 163, 22, 67, 134, 197, 111, 91,
221, 249, 48, 123, 82, 122, 119, 33, 63, 167)
# check against chou's total
sum(alpha) == 2447
## [1] TRUE
# beta proteins
beta <- c(203, 67, 139, 121, 75, 122, 86, 297, 49, 120,
177, 115, 16, 85, 127, 341, 253, 44, 110, 229)
# check against chou's total
sum(beta) == 2776
## [1] TRUE
# alpha + beta
a.plus.b <- c(175, 78, 120, 111, 74, 74, 86, 171, 33, 93,
110, 112, 25, 52, 71, 126, 117, 30, 108, 123)
sum(a.plus.b) == 1889
## [1] TRUE
# alpha/beta
a.div.b <- c(361, 146, 183, 244, 63, 114, 257, 377, 107, 239,
339, 321, 91, 158, 188, 327, 238, 72, 130, 378)
sum(a.div.b) == 4333
## [1] TRUE
build table
data.frame(aa.1.1, alpha, beta, a.plus.b, a.div.b)
## aa.1.1 alpha beta a.plus.b a.div.b
## 1 A 285 203 175 361
## 2 R 53 67 78 146
## 3 N 97 139 120 183
## 4 D 163 121 111 244
## 5 C 22 75 74 63
## 6 Q 67 122 74 114
## 7 E 134 86 86 257
## 8 G 197 297 171 377
## 9 H 111 49 33 107
## 10 I 91 120 93 239
## 11 L 221 177 110 339
## 12 K 249 115 112 321
## 13 M 48 16 25 91
## 14 F 123 85 52 158
## 15 P 82 127 71 188
## 16 S 122 341 126 327
## 17 T 119 253 117 238
## 18 W 33 44 30 72
## 19 Y 63 110 108 130
## 20 V 167 229 123 378
pander(data.frame(aa.1.1, alpha, beta, a.plus.b, a.div.b))
| A |
285 |
203 |
175 |
361 |
| R |
53 |
67 |
78 |
146 |
| N |
97 |
139 |
120 |
183 |
| D |
163 |
121 |
111 |
244 |
| C |
22 |
75 |
74 |
63 |
| Q |
67 |
122 |
74 |
114 |
| E |
134 |
86 |
86 |
257 |
| G |
197 |
297 |
171 |
377 |
| H |
111 |
49 |
33 |
107 |
| I |
91 |
120 |
93 |
239 |
| L |
221 |
177 |
110 |
339 |
| K |
249 |
115 |
112 |
321 |
| M |
48 |
16 |
25 |
91 |
| F |
123 |
85 |
52 |
158 |
| P |
82 |
127 |
71 |
188 |
| S |
122 |
341 |
126 |
327 |
| T |
119 |
253 |
117 |
238 |
| W |
33 |
44 |
30 |
72 |
| Y |
63 |
110 |
108 |
130 |
| V |
167 |
229 |
123 |
378 |
calculate proportions for each of the four protein fold types
alpha.prop <- alpha/sum(alpha)
beta.prop <- beta/sum(beta)
a.plus.b.prop <- a.plus.b/sum(a.plus.b)
a.div.b <- a.div.b/sum(a.div.b)
create another data frame
#dataframe
aa.prop <- data.frame(alpha.prop,
beta.prop,
a.plus.b.prop,
a.div.b)
#row labels
row.names(aa.prop) <- aa.1.1
aa.prop
## alpha.prop beta.prop a.plus.b.prop a.div.b
## A 0.116469146 0.073126801 0.09264161 0.08331410
## R 0.021659174 0.024135447 0.04129169 0.03369490
## N 0.039640376 0.050072046 0.06352567 0.04223402
## D 0.066612178 0.043587896 0.05876125 0.05631202
## C 0.008990601 0.027017291 0.03917417 0.01453958
## Q 0.027380466 0.043948127 0.03917417 0.02630972
## E 0.054760932 0.030979827 0.04552673 0.05931225
## G 0.080506743 0.106988473 0.09052409 0.08700669
## H 0.045361667 0.017651297 0.01746956 0.02469421
## I 0.037188394 0.043227666 0.04923240 0.05515809
## L 0.090314671 0.063760807 0.05823187 0.07823679
## K 0.101757254 0.041426513 0.05929063 0.07408262
## M 0.019615856 0.005763689 0.01323452 0.02100162
## F 0.050265631 0.030619597 0.02752779 0.03646434
## P 0.033510421 0.045749280 0.03758602 0.04338795
## S 0.049856968 0.122838617 0.06670196 0.07546734
## T 0.048630977 0.091138329 0.06193753 0.05492730
## W 0.013485901 0.015850144 0.01588142 0.01661666
## Y 0.025745811 0.039625360 0.05717311 0.03000231
## V 0.068246833 0.082492795 0.06511382 0.08723748
determine the number of each amino acid in my protein
table_to_vector <- function(table_x){
table_names <- attr(table_x, "dimnames")[[1]]
table_vect <- as.vector(table_x)
names(table_vect) <- table_names
return(table_vect)
}
#build new table with specific frequencies of DCT
dct_human_table <- table(dct_human_vector)/length(dct_human_vector)
DCT.human.aa.freq <- table_to_vector(dct_human_table)
DCT.human.aa.freq
## A C D E F G H
## 0.05253623 0.03442029 0.05434783 0.04528986 0.05797101 0.06521739 0.03079710
## I K L M N P Q
## 0.02717391 0.03442029 0.11775362 0.01811594 0.05615942 0.06702899 0.03985507
## R S T V W Y
## 0.06521739 0.05978261 0.06159420 0.05615942 0.02355072 0.03260870
check for presence of “U”
aa.names <- names(DCT.human.aa.freq)
any(aa.names == "U")
## [1] FALSE
#i.U <- which(aa.names == "U")
#aa.names[i.U]
#DCT.human.aa.freq[i.U]
add data on my focal protein to the amino acid frequency table.
aa.prop$DCT.human.aa.freq <- DCT.human.aa.freq
pander::pander(aa.prop)
| A |
0.1165 |
0.07313 |
0.09264 |
0.08331 |
0.05254 |
| R |
0.02166 |
0.02414 |
0.04129 |
0.03369 |
0.03442 |
| N |
0.03964 |
0.05007 |
0.06353 |
0.04223 |
0.05435 |
| D |
0.06661 |
0.04359 |
0.05876 |
0.05631 |
0.04529 |
| C |
0.008991 |
0.02702 |
0.03917 |
0.01454 |
0.05797 |
| Q |
0.02738 |
0.04395 |
0.03917 |
0.02631 |
0.06522 |
| E |
0.05476 |
0.03098 |
0.04553 |
0.05931 |
0.0308 |
| G |
0.08051 |
0.107 |
0.09052 |
0.08701 |
0.02717 |
| H |
0.04536 |
0.01765 |
0.01747 |
0.02469 |
0.03442 |
| I |
0.03719 |
0.04323 |
0.04923 |
0.05516 |
0.1178 |
| L |
0.09031 |
0.06376 |
0.05823 |
0.07824 |
0.01812 |
| K |
0.1018 |
0.04143 |
0.05929 |
0.07408 |
0.05616 |
| M |
0.01962 |
0.005764 |
0.01323 |
0.021 |
0.06703 |
| F |
0.05027 |
0.03062 |
0.02753 |
0.03646 |
0.03986 |
| P |
0.03351 |
0.04575 |
0.03759 |
0.04339 |
0.06522 |
| S |
0.04986 |
0.1228 |
0.0667 |
0.07547 |
0.05978 |
| T |
0.04863 |
0.09114 |
0.06194 |
0.05493 |
0.06159 |
| W |
0.01349 |
0.01585 |
0.01588 |
0.01662 |
0.05616 |
| Y |
0.02575 |
0.03963 |
0.05717 |
0.03 |
0.02355 |
| V |
0.06825 |
0.08249 |
0.06511 |
0.08724 |
0.03261 |
functions to calculate similarity
# correlation
chou_cor <- function(x,y){
numerator <- sum(x*y)
denominator <- sqrt((sum(x^2))*(sum(y^2)))
result <- numerator/denominator
return(result)
}
# cosine similarity
chou_cosine <- function(z.1, z.2){
z.1.abs <- sqrt(sum(z.1^2))
z.2.abs <- sqrt(sum(z.2^2))
my.cosine <- sum(z.1*z.2)/(z.1.abs*z.2.abs)
return(my.cosine)
}
data exploration
par(mfrow = c(2,2), mar = c(1,4,1,1))
plot(alpha.prop ~ DCT.human.aa.freq, data = aa.prop)
plot(beta.prop ~ DCT.human.aa.freq, data = aa.prop)
plot(a.plus.b.prop ~ DCT.human.aa.freq, data = aa.prop)
plot(a.div.b ~ DCT.human.aa.freq, data = aa.prop)

calculate correlation between each column and cosine similarity and distance
#correlation
corr.alpha <- chou_cor(aa.prop[,5], aa.prop[,1])
corr.beta <- chou_cor(aa.prop[,5], aa.prop[,2])
corr.apb <- chou_cor(aa.prop[,5], aa.prop[,3])
corr.adb <- chou_cor(aa.prop[,5], aa.prop[,4])
#cosine sim
cos.alpha <- chou_cosine(aa.prop[,5], aa.prop[,1])
cos.beta <- chou_cosine(aa.prop[,5], aa.prop[,2])
cos.apb <- chou_cosine(aa.prop[,5], aa.prop[,3])
cos.adb <- chou_cosine(aa.prop[,5], aa.prop[,4])
#distance
aa.prop.flipped <- t(aa.prop)
round(aa.prop.flipped,2)
## A R N D C Q E G H I L K
## alpha.prop 0.12 0.02 0.04 0.07 0.01 0.03 0.05 0.08 0.05 0.04 0.09 0.10
## beta.prop 0.07 0.02 0.05 0.04 0.03 0.04 0.03 0.11 0.02 0.04 0.06 0.04
## a.plus.b.prop 0.09 0.04 0.06 0.06 0.04 0.04 0.05 0.09 0.02 0.05 0.06 0.06
## a.div.b 0.08 0.03 0.04 0.06 0.01 0.03 0.06 0.09 0.02 0.06 0.08 0.07
## DCT.human.aa.freq 0.05 0.03 0.05 0.05 0.06 0.07 0.03 0.03 0.03 0.12 0.02 0.06
## M F P S T W Y V
## alpha.prop 0.02 0.05 0.03 0.05 0.05 0.01 0.03 0.07
## beta.prop 0.01 0.03 0.05 0.12 0.09 0.02 0.04 0.08
## a.plus.b.prop 0.01 0.03 0.04 0.07 0.06 0.02 0.06 0.07
## a.div.b 0.02 0.04 0.04 0.08 0.05 0.02 0.03 0.09
## DCT.human.aa.freq 0.07 0.04 0.07 0.06 0.06 0.06 0.02 0.03
distance matrix
dist(aa.prop.flipped, method = "euclidean")
## alpha.prop beta.prop a.plus.b.prop a.div.b
## beta.prop 0.13342098
## a.plus.b.prop 0.09281824 0.08289406
## a.div.b 0.06699039 0.08659174 0.06175113
## DCT.human.aa.freq 0.18109963 0.17293345 0.14646291 0.15634679
individual distances
dist.alpha <- dist((aa.prop.flipped[c(1,5),]), method = "euclidean")
dist.beta <- dist((aa.prop.flipped[c(2,5),]), method = "euclidean")
dist.apb <- dist((aa.prop.flipped[c(3,5),]), method = "euclidean")
dist.adb <- dist((aa.prop.flipped[c(4,5),]), method = "euclidean")
compile the info
# fold types
fold.type <- c("alpha","beta","alpha plus beta", "alpha/beta")
# data
corr.sim <- round(c(corr.alpha,corr.beta,corr.apb,corr.adb),5)
cosine.sim <- round(c(cos.alpha,cos.beta,cos.apb,cos.adb),5)
Euclidean.dist <- round(c(dist.alpha,dist.beta,dist.apb,dist.adb),5)
# summary
sim.sum <- c("","","most.sim","")
dist.sum <- c("","","min.dist","")
df <- data.frame(fold.type,
corr.sim ,
cosine.sim ,
Euclidean.dist ,
sim.sum ,
dist.sum )
# display
pander::pander(df)
| alpha |
0.7409 |
0.7409 |
0.1811 |
|
|
| beta |
0.7678 |
0.7678 |
0.1729 |
|
|
| alpha plus beta |
0.8188 |
0.8188 |
0.1465 |
most.sim |
min.dist |
| alpha/beta |
0.7976 |
0.7976 |
0.1563 |
|
|
PERCENT IDENTITY COMPARISONS (PID)
PID table
#converting the FASTA files in the original list into a single vector
dct_vector <- unlist(dct_list)
cat(dct_vector)
## MSPLWWGFLLSCLGCKILPGAQGQFPRVCMTVDSLVNKECCPRLGAESANVCGSQQGRGQCTEVRADTRPWSGPYILRNQDDRELWPRKFFHRTCKCTGNFAGYNCGDCKFGWTGPNCERKKPPVIRQNIHSLSPQEREQFLGALDLAKKRVHPDYVITTQHWLGLLGPNGTQPQFANCSVYDFFVWLHYYSVRDTLLGPGRPYRAIDFSHQGPAFVTWHRYHLLCLERDLQRLIGNESFALPYWNFATGRNECDVCTDQLFGAARPDDPTLISRNSRFSSWETVCDSLDDYNHLVTLCNGTYEGLLRRNQMGRNSMKLPTLKDIRDCLSLQKFDNPPFFQNSTFSFRNALEGFDKADGTLDSQVMSLHNLVHSFLNGTNALPHSAANDPIFVVISNRLLYNATTNILEHVRKEKATKELPSLHVLVLHSFTDAIFDEWMKRFNPPADAWPQELAPIGHNRMYNMVPFFPPVTNEELFLTSDQLGYSYAIDLPVSVEETPGWPTTLLVVMGTLVALVGLFVLLAFLQYRRLRKGYTPLMETHLSSKRYTEEA MSPLWWGFLLSCLGCKILPGAQAQFPRVCMTVDSLVNKECCPRLGAESANVCGSQQGRGQCTEVRADTRPWSGPYILRNQDDRELWPRKFFHRTCKCTGNFAGYNCGDCKFGWTGPNCERKKPPVIRQNIHSLSPQEREQFLGALDLAKKSVHPDYVITTQHWLGLLGPNGTQPQFANCSVYDFFVWLHYYSVRDTLLGPGRPYRAIDFSHQGPAFVTWHRYHLLCLERDLQRLIGNESFALPYWNFATGRNECDVCTDQLFGEARPDDPTLISRNSRFSSWETVCDSLDDYNHRVTLCNGTYEGLLRRNQMGRNSMKLPTLKDIRDCLSLQKFDNPPFFQNSTFSFRNALEGFDKADGTLDSQVMSLHNLVHSFLNGTNALPHSAANDPIYVVISNRLLYNATTNILEHVRKEKATKELPSLHVLVLHSFTDAIFDEWMKRFNPPADAWPQELAPIGHNRMYNMVPFFPPVTNEELFLTSDQLGYSYAIDLPVSVEETPGWPTTLLVVMGMLVALVGLFVLLAFLQYRRLRKGYTPLMETHLSSKRYTEEA MGLVGWGLLLGCLGCGILLRARAQFPRVCMTLDGVLNKECCPPLGPEATNICGFLEGRGQCAEVQTDTRPWSGPYILRNQDDREQWPRKFFNRTCKCTGNFAGYNCGGCKFGWTGPDCNRKKPAILRRNIHSLTAQEREQFLGALDLAKKSIHPDYVITTQHWLGLLGPNGTQPQIANCSVYDFFVWLHYYSVRDTLLGPGRPYKAIDFSHQGPAFVTWHRYHLLWLERELQRLTGNESFALPYWNFATGKNECDVCTDELLGAARQDDPTLISRNSRFSTWEIVCDSLDDYNRRVTLCNGTYEGLLRRNKVGRNNEKLPTLKNVQDCLSLQKFDSPPFFQNSTFSFRNALEGFDKADGTLDSQVMNLHNLAHSFLNGTNALPHSAANDPVFVVLHSFTDAIFDEWLKRNNPSTDAWPQELAPIGHNRMYNMVPFFPPVTNEELFLTAEQLGYNYAVDLSEEEAPVWSTTLSVVIGILGAFVLLLGLLAFLQYRRLRKGYAPLMETGLSSKRYTEEA MQAQELKSGRGEGQDRIREWKARGSSPADKAMSPLGWGLVLCCLGGGLLPGAWAQFPRVCMTVDNLMAKECCPPLGLEPANICGSQEGRGQCMEVQTDARPWSGPYVLRNQDDREGWPRKFFHRTCQCTGHYGGYNCGVCKFGWTGHDCNQRKARVIRKNIHSLTPEEREQFLEALDLAKNTTHPDYVITTQHWLGLLGPNGTQPQIANCSIYDFFVWLHYYSVRDTLLGPGRPYKAIDFSHQGPAFVTWHRYHLLWLERDLQQLIGNESFALPYWNFATGRNDCDVCTDQLLGAARQDDPTLISENSRFSNWEIVCDSLDDYNRRVTLCNGTYEGLLRRNQVGLNSEKLPTLKDIQDCLSLKKFDSPPFFQNSTFSFRNALEGFDKANGILDSQVMSLHNLVHSFLNGTSALPHSAANDPVFVVLHSFTDAIFDEWMKRSNPSVDAWPQELAPIGHNRMYNMVPFFPPVTNEELFLTADQLGYSYAIDLPVEETPAWTTTLLAVMGMLVALVGVFVVLFFLQYRRLRKGYTPLMETHLSDRKYTEEA MSPLGWGLLLGCLGCALPSGARAQFPRVCMTVGSLQAKECCPPLGADPANVCGSREGRGQCAEVQTDTRPWSGPYVLRNQDDRERWPRKFFDRTCRCTGNFAGYNCGNCRFGWTGPKCDQKKPLVVRRDVHSLTPQEREQFLDALDLAKYTPHPDYVITTQHWLGLLGPNGTQPQIANCSIYDFFVWLHYYSVRDTLLGPGRPYKAIDFSHQGPAFVTWHRYHLLWLERDLQRLTGNESFALPYWNFATGRNECDVCTDQLLGAARQDDPTLISQNSRFSSWEIVCDSLNDYNRRVTLCNGTYEGLLKRNQMGRNSEKLPTLKDIQNCLSLKKFDSPPFFQNSTLSFRNALEGFGKADGTLDSQVMNFHNLVHSFLNGTSALPHSAANDPVFVVLHSFTDAIFDEWMKRFNPPVDAWPRELAPIGHNRMYNMVPFFPPVTNEELFLTADQLGYSYAIDLPVEETPDWTTVLSVVTGMLVVLVVLFALLLFLQYRRLRKGYTPLVETQLSNKRYTEEA MECKGVRAVLLLACCLQAVRAQFPRPCVTVEALTSKRCCPALGSDLNNVCGSLEGRGQCTPVVVDERPWSGPYTLRNVDDRERWPLKFFNSICRCTGNFAGFNCGECKFGWTGPTCEQRKPPVVRKNIHSLTAQERTQFLDALDQAKNTIHPDYVIATQHWLSILGPNGTEPQVANTSIYNYFVWLHYYSVRDTLLGPGRPFTAIDFSHQGPAFVTWHRYHLLLLERDLQRMTGNESFALPYWNFATGRNECDVCTDELFGAPRLDDPNLISAGSRFSRWGIVCNSLNDYNRLVTLCNGTNEGFLQRNPLGGAEGRLPSMEDVQKCLSLNEFDNPPFFRNSTFSFRNALEGFDEPDGTLNSTAMSLHNLVHSFLNGTSAQSHSSANDPIFVVLHSFTDAIFDEWMKRFQPSDVAWPQELAPIGHNRMYNMVPFFPPVTNEELFLPAEQLGYVYSIDLPASIEDTRAVAVLVGSTIGGILLAVLLLLLLLVLFMQRKRQQGFEPLMNATFTNKRYTEDA MKRFVLMLCCFGCMYLSEAQFPRVCCTVEGIVSKECCPALGSDPANVCGLLAGRGNCSDVHVDSKPWGGPYSLRNVDDRERWPTKFFNRTCRCFGNFAGYNCGECKFGWTGLNCDQRKAPVLRKDIHSLSPEELQEFLNALDLSKTTVHPDYMIASQHWLGLLGPNGTEPQFSNISIYDFFVWQHYYSVRDTLLGPGRPFKAIDFSHKGPAFITWHRYHLLSLERELQKLTGNENFAVPYWNFATGQSECDVCTDSLLGGRDPSDPSRISPRSRFARWGVVCDSLDEYNRLVTLCNGTNEGSLRRGAAMSTNVTLPTMNDVRSCLNLRDFDSPPYFTNSTFSFRNALEGYDKPDGELDTSVSNLHNLVHQFLNGTSALSHSAANDPIFLVLHAFTDAIFDEWMRRFRENETFPDEMAPIGHNRHYNMVPFFPPVTNEEIYIASEQLGYSYSIDLEELDNRPAMFLLGSTLGGIFLGLLLLLIVLVLYIQQRRKREFEPLLNAEFTNTKYSGEA MRLLPWGLLFCCLGCRIQPGVWAQFPRICMTLDSLRSKECCPALGAEPGNVCGSRDGRGQCSEVQADTRPWSGPYVLRNQDDREQWPRKFFNRTCRCTGNFAGYNCGDCWFGWTGPNCDQRKPLVVRQNIHSLTPQEREQFLGALDLAKKTTHPDYVITTQHWLGLLGPNGTQPQITNCSIYDFFVWLHYYSVRDTLLGPGRPFKAIDFSHQGPAFVTWHRYHLLWLERDLQRLTGNESFALPYWNFATGRNECDVCTDQLLGAARPDDPTLISQNSRFSSWEIVCDSLDDYNRRVTLCNGTYEGLLRRHQVGRNSETLPTLKNIQDCLSLQKFDNAPFFQNSTFSFRNALEGFDKADGTLDSQVVSLHNLVHSFLNGTSAFPHSAANDPVFVVLHSFTDGIFDEWMKRHKPAADAWPEELAPIGHNRMYNMVPFFPPVTNEELFLTTDHLGYSYAIDWPVTVEETPTWSAVLSVVVSTLAAVVGLFVLLIFLQYRRLRRGYAPLIETHLIRKKYTEEAEGAQTVP MSPLGWGLVLSCLGCGILPGAHAQFPRVCMTVDSLMTKECCPSLGPEPDNICGSQEGRGQCMEVQADTRPWSGPYVLRNQDDREGWPRKFFHRTCRCTGHFAGYNCGVCKFGWTGHDCNQRKAPVIRKNIHSLTAQEREQFLDALDLAKNTTHPNYVITTQHWLGLLGPNGTQPQIANCSIYDFFVWLHYYSVRDTLLGPGRPYKAIDFSHQGPAFVTWHRYHLLWLERDLQQLIDNESFALPYWNFATGRNDCDVCTDQLLGAARQDDPTLISQNSRFSSWEIICDSLDDYNRRVTLCNGTYEGLLRRNQVGLNSEKLPTLKDIQDCLSLKKFDSPPFFQNSTFSFRNALEGFDKANGILDSQVMSLHNLVHSFLNGTSALPHSAANDPVFVVLHSFTDAIFDEWMKRSNPSVDAWPQELAPIGHNRMYNMVPFFPPVTNEEFFLTADQLGYSYAIDLPAEETPGWTTTFLVVAGMLGALAGVFVVLFFLQYRRLRKGYTPLVETHLSNKKYTEEA MCPLGWGLLLSCLGCALLPGAHAQFPRVCMTVDSLMSKECCPSLGPEPDNICGSQEGRGQCTEVQADTRPWSGPYVLRNQDDREGWPRKFFHRTCRCTGHFAGYNCGGCKFGWTGHDCSQRKAPVTRKNLHSLTAQEREQFLDALDLAKNTTHPDYVITTQHWLGLLGPNGTQPQIANCSIYDFFVWLHYYSVRDTLLGPGRPYKAIDFSHQGPAFVTWHRYHLLWLERDLQQLIGNESFALPYWNFATGRNDCDVCTDQLLGAARQDDPTLISQNSRFSNWEIVCDSLDDYNRRVTLCNGTYEGLLRRNQVGLNSAKLPTLKDIQDCLSLKKFDSPPFFQNSTFSFRNALEGFDKANGILDSQVMSLHNLVHSFLNGTSALPHSAANDPIFVVLHSFTDAIFDEWMKRSNPSLDAWPQELAPIGHNRMYNMVPFFPPVTNEEFFLTADQLGYSYAINLPAEETPGWTPTLLVVMGMLVALAGVFVVLFFLQYRRLRKGYTPLAETHLNNKKYTEEA
dct_vector
## [1] "MSPLWWGFLLSCLGCKILPGAQGQFPRVCMTVDSLVNKECCPRLGAESANVCGSQQGRGQCTEVRADTRPWSGPYILRNQDDRELWPRKFFHRTCKCTGNFAGYNCGDCKFGWTGPNCERKKPPVIRQNIHSLSPQEREQFLGALDLAKKRVHPDYVITTQHWLGLLGPNGTQPQFANCSVYDFFVWLHYYSVRDTLLGPGRPYRAIDFSHQGPAFVTWHRYHLLCLERDLQRLIGNESFALPYWNFATGRNECDVCTDQLFGAARPDDPTLISRNSRFSSWETVCDSLDDYNHLVTLCNGTYEGLLRRNQMGRNSMKLPTLKDIRDCLSLQKFDNPPFFQNSTFSFRNALEGFDKADGTLDSQVMSLHNLVHSFLNGTNALPHSAANDPIFVVISNRLLYNATTNILEHVRKEKATKELPSLHVLVLHSFTDAIFDEWMKRFNPPADAWPQELAPIGHNRMYNMVPFFPPVTNEELFLTSDQLGYSYAIDLPVSVEETPGWPTTLLVVMGTLVALVGLFVLLAFLQYRRLRKGYTPLMETHLSSKRYTEEA"
## [2] "MSPLWWGFLLSCLGCKILPGAQAQFPRVCMTVDSLVNKECCPRLGAESANVCGSQQGRGQCTEVRADTRPWSGPYILRNQDDRELWPRKFFHRTCKCTGNFAGYNCGDCKFGWTGPNCERKKPPVIRQNIHSLSPQEREQFLGALDLAKKSVHPDYVITTQHWLGLLGPNGTQPQFANCSVYDFFVWLHYYSVRDTLLGPGRPYRAIDFSHQGPAFVTWHRYHLLCLERDLQRLIGNESFALPYWNFATGRNECDVCTDQLFGEARPDDPTLISRNSRFSSWETVCDSLDDYNHRVTLCNGTYEGLLRRNQMGRNSMKLPTLKDIRDCLSLQKFDNPPFFQNSTFSFRNALEGFDKADGTLDSQVMSLHNLVHSFLNGTNALPHSAANDPIYVVISNRLLYNATTNILEHVRKEKATKELPSLHVLVLHSFTDAIFDEWMKRFNPPADAWPQELAPIGHNRMYNMVPFFPPVTNEELFLTSDQLGYSYAIDLPVSVEETPGWPTTLLVVMGMLVALVGLFVLLAFLQYRRLRKGYTPLMETHLSSKRYTEEA"
## [3] "MGLVGWGLLLGCLGCGILLRARAQFPRVCMTLDGVLNKECCPPLGPEATNICGFLEGRGQCAEVQTDTRPWSGPYILRNQDDREQWPRKFFNRTCKCTGNFAGYNCGGCKFGWTGPDCNRKKPAILRRNIHSLTAQEREQFLGALDLAKKSIHPDYVITTQHWLGLLGPNGTQPQIANCSVYDFFVWLHYYSVRDTLLGPGRPYKAIDFSHQGPAFVTWHRYHLLWLERELQRLTGNESFALPYWNFATGKNECDVCTDELLGAARQDDPTLISRNSRFSTWEIVCDSLDDYNRRVTLCNGTYEGLLRRNKVGRNNEKLPTLKNVQDCLSLQKFDSPPFFQNSTFSFRNALEGFDKADGTLDSQVMNLHNLAHSFLNGTNALPHSAANDPVFVVLHSFTDAIFDEWLKRNNPSTDAWPQELAPIGHNRMYNMVPFFPPVTNEELFLTAEQLGYNYAVDLSEEEAPVWSTTLSVVIGILGAFVLLLGLLAFLQYRRLRKGYAPLMETGLSSKRYTEEA"
## [4] "MQAQELKSGRGEGQDRIREWKARGSSPADKAMSPLGWGLVLCCLGGGLLPGAWAQFPRVCMTVDNLMAKECCPPLGLEPANICGSQEGRGQCMEVQTDARPWSGPYVLRNQDDREGWPRKFFHRTCQCTGHYGGYNCGVCKFGWTGHDCNQRKARVIRKNIHSLTPEEREQFLEALDLAKNTTHPDYVITTQHWLGLLGPNGTQPQIANCSIYDFFVWLHYYSVRDTLLGPGRPYKAIDFSHQGPAFVTWHRYHLLWLERDLQQLIGNESFALPYWNFATGRNDCDVCTDQLLGAARQDDPTLISENSRFSNWEIVCDSLDDYNRRVTLCNGTYEGLLRRNQVGLNSEKLPTLKDIQDCLSLKKFDSPPFFQNSTFSFRNALEGFDKANGILDSQVMSLHNLVHSFLNGTSALPHSAANDPVFVVLHSFTDAIFDEWMKRSNPSVDAWPQELAPIGHNRMYNMVPFFPPVTNEELFLTADQLGYSYAIDLPVEETPAWTTTLLAVMGMLVALVGVFVVLFFLQYRRLRKGYTPLMETHLSDRKYTEEA"
## [5] "MSPLGWGLLLGCLGCALPSGARAQFPRVCMTVGSLQAKECCPPLGADPANVCGSREGRGQCAEVQTDTRPWSGPYVLRNQDDRERWPRKFFDRTCRCTGNFAGYNCGNCRFGWTGPKCDQKKPLVVRRDVHSLTPQEREQFLDALDLAKYTPHPDYVITTQHWLGLLGPNGTQPQIANCSIYDFFVWLHYYSVRDTLLGPGRPYKAIDFSHQGPAFVTWHRYHLLWLERDLQRLTGNESFALPYWNFATGRNECDVCTDQLLGAARQDDPTLISQNSRFSSWEIVCDSLNDYNRRVTLCNGTYEGLLKRNQMGRNSEKLPTLKDIQNCLSLKKFDSPPFFQNSTLSFRNALEGFGKADGTLDSQVMNFHNLVHSFLNGTSALPHSAANDPVFVVLHSFTDAIFDEWMKRFNPPVDAWPRELAPIGHNRMYNMVPFFPPVTNEELFLTADQLGYSYAIDLPVEETPDWTTVLSVVTGMLVVLVVLFALLLFLQYRRLRKGYTPLVETQLSNKRYTEEA"
## [6] "MECKGVRAVLLLACCLQAVRAQFPRPCVTVEALTSKRCCPALGSDLNNVCGSLEGRGQCTPVVVDERPWSGPYTLRNVDDRERWPLKFFNSICRCTGNFAGFNCGECKFGWTGPTCEQRKPPVVRKNIHSLTAQERTQFLDALDQAKNTIHPDYVIATQHWLSILGPNGTEPQVANTSIYNYFVWLHYYSVRDTLLGPGRPFTAIDFSHQGPAFVTWHRYHLLLLERDLQRMTGNESFALPYWNFATGRNECDVCTDELFGAPRLDDPNLISAGSRFSRWGIVCNSLNDYNRLVTLCNGTNEGFLQRNPLGGAEGRLPSMEDVQKCLSLNEFDNPPFFRNSTFSFRNALEGFDEPDGTLNSTAMSLHNLVHSFLNGTSAQSHSSANDPIFVVLHSFTDAIFDEWMKRFQPSDVAWPQELAPIGHNRMYNMVPFFPPVTNEELFLPAEQLGYVYSIDLPASIEDTRAVAVLVGSTIGGILLAVLLLLLLLVLFMQRKRQQGFEPLMNATFTNKRYTEDA"
## [7] "MKRFVLMLCCFGCMYLSEAQFPRVCCTVEGIVSKECCPALGSDPANVCGLLAGRGNCSDVHVDSKPWGGPYSLRNVDDRERWPTKFFNRTCRCFGNFAGYNCGECKFGWTGLNCDQRKAPVLRKDIHSLSPEELQEFLNALDLSKTTVHPDYMIASQHWLGLLGPNGTEPQFSNISIYDFFVWQHYYSVRDTLLGPGRPFKAIDFSHKGPAFITWHRYHLLSLERELQKLTGNENFAVPYWNFATGQSECDVCTDSLLGGRDPSDPSRISPRSRFARWGVVCDSLDEYNRLVTLCNGTNEGSLRRGAAMSTNVTLPTMNDVRSCLNLRDFDSPPYFTNSTFSFRNALEGYDKPDGELDTSVSNLHNLVHQFLNGTSALSHSAANDPIFLVLHAFTDAIFDEWMRRFRENETFPDEMAPIGHNRHYNMVPFFPPVTNEEIYIASEQLGYSYSIDLEELDNRPAMFLLGSTLGGIFLGLLLLLIVLVLYIQQRRKREFEPLLNAEFTNTKYSGEA"
## [8] "MRLLPWGLLFCCLGCRIQPGVWAQFPRICMTLDSLRSKECCPALGAEPGNVCGSRDGRGQCSEVQADTRPWSGPYVLRNQDDREQWPRKFFNRTCRCTGNFAGYNCGDCWFGWTGPNCDQRKPLVVRQNIHSLTPQEREQFLGALDLAKKTTHPDYVITTQHWLGLLGPNGTQPQITNCSIYDFFVWLHYYSVRDTLLGPGRPFKAIDFSHQGPAFVTWHRYHLLWLERDLQRLTGNESFALPYWNFATGRNECDVCTDQLLGAARPDDPTLISQNSRFSSWEIVCDSLDDYNRRVTLCNGTYEGLLRRHQVGRNSETLPTLKNIQDCLSLQKFDNAPFFQNSTFSFRNALEGFDKADGTLDSQVVSLHNLVHSFLNGTSAFPHSAANDPVFVVLHSFTDGIFDEWMKRHKPAADAWPEELAPIGHNRMYNMVPFFPPVTNEELFLTTDHLGYSYAIDWPVTVEETPTWSAVLSVVVSTLAAVVGLFVLLIFLQYRRLRRGYAPLIETHLIRKKYTEEAEGAQTVP"
## [9] "MSPLGWGLVLSCLGCGILPGAHAQFPRVCMTVDSLMTKECCPSLGPEPDNICGSQEGRGQCMEVQADTRPWSGPYVLRNQDDREGWPRKFFHRTCRCTGHFAGYNCGVCKFGWTGHDCNQRKAPVIRKNIHSLTAQEREQFLDALDLAKNTTHPNYVITTQHWLGLLGPNGTQPQIANCSIYDFFVWLHYYSVRDTLLGPGRPYKAIDFSHQGPAFVTWHRYHLLWLERDLQQLIDNESFALPYWNFATGRNDCDVCTDQLLGAARQDDPTLISQNSRFSSWEIICDSLDDYNRRVTLCNGTYEGLLRRNQVGLNSEKLPTLKDIQDCLSLKKFDSPPFFQNSTFSFRNALEGFDKANGILDSQVMSLHNLVHSFLNGTSALPHSAANDPVFVVLHSFTDAIFDEWMKRSNPSVDAWPQELAPIGHNRMYNMVPFFPPVTNEEFFLTADQLGYSYAIDLPAEETPGWTTTFLVVAGMLGALAGVFVVLFFLQYRRLRKGYTPLVETHLSNKKYTEEA"
## [10] "MCPLGWGLLLSCLGCALLPGAHAQFPRVCMTVDSLMSKECCPSLGPEPDNICGSQEGRGQCTEVQADTRPWSGPYVLRNQDDREGWPRKFFHRTCRCTGHFAGYNCGGCKFGWTGHDCSQRKAPVTRKNLHSLTAQEREQFLDALDLAKNTTHPDYVITTQHWLGLLGPNGTQPQIANCSIYDFFVWLHYYSVRDTLLGPGRPYKAIDFSHQGPAFVTWHRYHLLWLERDLQQLIGNESFALPYWNFATGRNDCDVCTDQLLGAARQDDPTLISQNSRFSNWEIVCDSLDDYNRRVTLCNGTYEGLLRRNQVGLNSAKLPTLKDIQDCLSLKKFDSPPFFQNSTFSFRNALEGFDKANGILDSQVMSLHNLVHSFLNGTSALPHSAANDPIFVVLHSFTDAIFDEWMKRSNPSLDAWPQELAPIGHNRMYNMVPFFPPVTNEEFFLTADQLGYSYAINLPAEETPGWTPTLLVVMGMLVALAGVFVVLFFLQYRRLRKGYTPLAETHLNNKKYTEEA"
names(dct_vector) <- names(dct_list)
length(dct_vector)
## [1] 10
pairwise alignments
align01.02 <- pairwiseAlignment(dct_vector[1], dct_vector[2])
align01.03 <- pairwiseAlignment(dct_vector[1], dct_vector[3])
align01.04 <- pairwiseAlignment(dct_vector[1], dct_vector[4])
Biostrings::pid(align01.02)
## [1] 98.91304
Biostrings::pid(align01.03)
## [1] 78.26087
Biostrings::pid(align01.04)
## [1] 75.12864
align02.03 <- pairwiseAlignment(dct_vector[2], dct_vector[3])
align02.04 <- pairwiseAlignment(dct_vector[2], dct_vector[4])
align03.04 <- pairwiseAlignment(dct_vector[3], dct_vector[4])
pids <- c(1, NA, NA, NA,
pid(align01.02), 1, NA, NA,
pid(align01.03), pid(align02.03), 1, NA,
pid(align01.04), pid(align02.04), pid(align03.04), 1)
mat <- matrix(pids, nrow = 4, byrow = T)
row.names(mat) <- c("Homo","Pan","Mouse","Dog")
colnames(mat) <- c("Homo","Pan","Mouse","Dog")
pander::pander(mat)
| Homo |
1 |
NA |
NA |
NA |
| Pan |
98.91 |
1 |
NA |
NA |
| Mouse |
78.26 |
78.44 |
1 |
NA |
| Dog |
75.13 |
75.3 |
77.74 |
1 |
PID methods comparison
type1 <- Biostrings::pid(align01.02, type="PID1")
type2 <- Biostrings::pid(align01.02, type="PID2")
type3 <- Biostrings::pid(align01.02, type="PID3")
type4 <- Biostrings::pid(align01.02, type="PID4")
method <- c("PID1","PID2","PID3","PID4")
PID_ <- c(type1, type2, type3, type4)
denominator <- c("(aligned positions + internal gap positions)", "(aligned positions)", "(length shorter sequence)", "(average length of the two sequences)")
methods.df <- data.frame(method, PID_, denominator)
pander::pander(methods.df)
| PID1 |
98.91 |
(aligned positions + internal gap positions) |
| PID2 |
98.91 |
(aligned positions) |
| PID3 |
98.91 |
(length shorter sequence) |
| PID4 |
98.91 |
(average length of the two sequences) |