# install.packages("ape")
# install.packages("phangorn")
library(ape)
## Warning: package 'ape' was built under R version 4.0.2
library(phangorn)
## Warning: package 'phangorn' was built under R version 4.0.2
The 5x5 matrix below was constructed using data gathered from a sequence analysis of five geographically labeled subspecies of chimpanzees. First, a brief DNA sequence from each of their genomes was entered into Excel. Then 10 polymorphic loci were selected and aligned. Each subspecies’ ‘polymorphic loci sequence’ was compared pairwise with the others to determine the percent identity. Percent identity is determined by dividing the number of positions that have the same nucleotide by the length of the sequence being compared. The PID values were then entered into a PID matrix. Lastly, the PID matrix was converted into a dissimilarity matrix (1 - PID).
This matrix is based on the proportion of bases that are identical between sequences. This is often referred to as PID, for Proportion Identical or Percentage Identical.
BLAST reports PID in its main output. PID is a very simple metric of similarity; more sophisticated measures are used in practice.
Make a similarity matrix with the matrix() command. Note that I have to declare the number of rows; otherwise matrix() returns a single column.
# Bad matrix 1: no nrow is given, so all nine values end up in one column
matrix(
  c(
    1.0, 0.5, 0.3,
    0.5, 1.0, 0.4,
    0.3, 0.4, 1.0
  )
)
## [,1]
## [1,] 1.0
## [2,] 0.5
## [3,] 0.3
## [4,] 0.5
## [5,] 1.0
## [6,] 0.4
## [7,] 0.3
## [8,] 0.4
## [9,] 1.0
# Good matrix: declaring nrow gives the intended 3 x 3 shape
matrix(
  c(
    1.0, 0.5, 0.3,
    0.5, 1.0, 0.4,
    0.3, 0.4, 1.0
  ),
  nrow = 3
)
## [,1] [,2] [,3]
## [1,] 1.0 0.5 0.3
## [2,] 0.5 1.0 0.4
## [3,] 0.3 0.4 1.0
Store the matrix
# Store the 3 x 3 similarity (PID) matrix for the later steps.
# byrow = TRUE fills the matrix row by row, so the layout typed below is the
# layout of the resulting matrix. TRUE is spelled out because T is an ordinary
# variable that can be reassigned, which would silently change the fill order.
my_sim_mat <- matrix(
  c(
    1.0, 0.5, 0.3,
    0.5, 1.0, 0.4,
    0.3, 0.4, 1.0
  ),
  nrow = 3,
  byrow = TRUE
)
Label the matrix with row.names() and colnames()
# Site codes: G = Gombe, T = Tai, M = Mahale
colnames(my_sim_mat) <- c("G", "T", "M")
row.names(my_sim_mat) <- colnames(my_sim_mat)
Similarity, dissimilarity, and distance are all related. Most methods use distance, not similarity.
We can do vectorized math to recalculate the matrix
# Subtracting from 1 is applied to every cell, turning similarity into distance
my_dist_mat <- 1 - my_sim_mat
print(my_dist_mat)
## G T M
## G 0.0 0.5 0.7
## T 0.5 0.0 0.6
## M 0.7 0.6 0.0
# as.dist() converts the square matrix to a "dist" object, which drops the
# duplicated (mirrored) entries
my_dist_mat2 <- stats::as.dist(my_dist_mat)
print(my_dist_mat2)
## G T
## T 0.5
## M 0.7 0.6
Neighbor Joining is one of the most common ways to build a tree from molecular data that’s been converted to distances; it’s one of the options within BLAST.
Build the tree with nj()
# Neighbor-joining tree estimated from the three-site distance object
my_nj <- ape::nj(X = my_dist_mat2)
Plot the tree as an “unrooted” tree
# The second positional argument of plot() for a tree is the layout type
plot(my_nj, type = "unrooted")
Plot the tree as a “rooted” tree
# "phylogram" is the default layout; it is written out here for clarity
plot(my_nj, type = "phylogram")
UPGMA/WPGMA are other algorithms that work with distance matrices. They are not commonly used now but are useful for teaching because they can easily be done by hand on small datasets. They are clustering algorithms that put similar things closer together and dissimilar things farther apart.
# UPGMA tree built from the same three-site distance object
my_upgma <- phangorn::upgma(D = my_dist_mat2)
Plot the UPGMA tree
# "phylogram" is the default layout; it is written out here for clarity
plot(my_upgma, type = "phylogram")
Compare the rooted NJ and the UPGMA
# Draw the NJ and UPGMA trees side by side in a 1 x 2 panel layout.
# par() returns the previous settings, which are restored afterwards so that
# later plots are not squeezed into the two-panel layout.
old_par <- par(mfrow = c(1, 2))
plot(my_nj)
plot(my_upgma)
par(old_par)
WPGMA tree
# Namespace-qualified to match the phangorn::upgma() call used for UPGMA
plot(phangorn::wpgma(my_dist_mat2))
Minimum evolution tree
# fastme.ols() comes from ape; qualified to match the ape::nj() call
plot(ape::fastme.ols(my_dist_mat2))
Be sure to add the nrow = … statement.
# Similarity (PID) matrix for the five chimpanzee populations.
# Only the diagonal and the lower triangle are typed in; byrow = TRUE keeps the
# layout below identical to the layout of the resulting matrix. TRUE is spelled
# out because T is an ordinary variable that can be reassigned.
five_sim_mat <- matrix(
  c(
    1.0, 0.0, 0.0, 0.0, 0.0,
    0.6, 1.0, 0.0, 0.0, 0.0,
    0.8, 0.7, 1.0, 0.0, 0.0,
    0.3, 0.3, 0.1, 1.0, 0.0,
    0.4, 0.4, 0.2, 0.7, 1.0
  ),
  nrow = 5,
  byrow = TRUE
)
# Mirror the lower triangle into the upper triangle so the stored matrix is a
# symmetric similarity matrix rather than one that reports 0 similarity above
# the diagonal. The lower triangle, which as.dist() reads, is left untouched.
five_sim_mat[upper.tri(five_sim_mat)] <- t(five_sim_mat)[upper.tri(five_sim_mat)]
Name things
# Use the same population labels for the columns and the rows
colnames(five_sim_mat) <- c(
  "Mahale_East", "Bodongo_East", "Gombe_East", "Tai_West", "Mali_West"
)
row.names(five_sim_mat) <- colnames(five_sim_mat)
Turn into a distance matrix. This is 2 steps and requires the as.dist() command
# Step 1: similarity -> dissimilarity, applied to every cell
five_dist_mat <- 1 - five_sim_mat
# Step 2: square matrix -> "dist" object
five_dist_mat2 <- stats::as.dist(five_dist_mat)
print(five_dist_mat2)
## Mahale_East Bodongo_East Gombe_East Tai_West
## Bodongo_East 0.4
## Gombe_East 0.2 0.3
## Tai_West 0.7 0.7 0.9
## Mali_West 0.6 0.6 0.8 0.3
Neighbor-Joining tree with nj()
# Neighbor-joining tree estimated from the five-population distance object
five_nj <- ape::nj(X = five_dist_mat2)
Plot unrooted NJ tree
# The second positional argument of plot() for a tree is the layout type
plot(five_nj, type = "unrooted")
Plot rooted NJ tree
# "phylogram" is the default layout; it is written out here for clarity
plot(five_nj, type = "phylogram")
Build UPGMA tree
# UPGMA tree built from the same five-population distance object
five_upgma <- phangorn::upgma(D = five_dist_mat2)
Plot UPGMA tree
# "phylogram" is the default layout; it is written out here for clarity
plot(five_upgma, type = "phylogram")
Compare rooted NJ and UPGMA plots
# Draw the NJ and UPGMA trees side by side in a 1 x 2 panel layout.
# par() returns the previous settings, which are restored afterwards so that
# later plots are not squeezed into the two-panel layout.
old_par <- par(mfrow = c(1, 2))
plot(five_nj)
plot(five_upgma)
par(old_par)
Build WPGMA tree
# Namespace-qualified to match the phangorn::upgma() call used for UPGMA
plot(phangorn::wpgma(five_dist_mat2))
Compare rooted WPGMA and UPGMA plots
# Draw the WPGMA and UPGMA trees side by side in a 1 x 2 panel layout.
# par() returns the previous settings, which are restored afterwards so that
# later plots are not squeezed into the two-panel layout.
old_par <- par(mfrow = c(1, 2))
plot(phangorn::wpgma(five_dist_mat2))
plot(five_upgma)
par(old_par)
Build Minimum evolution tree
# fastme.ols() comes from ape; qualified to match the ape::nj() call
plot(ape::fastme.ols(five_dist_mat2))