————————————————————————-

Cài đặt các packges cần thiết”

install.packages(“remotes”)

remotes::install_github(“GuangchuangYu/treeio”)

install.packages(“BiocManager”)

BiocManager::install(“ggtree”)

BiocManager::install(“DECIPHER”)

install.packages(“ape”)

install.packages(“seqinr”)

install.packages(“adegenet”)

install.packages(“viridis”)

install.packages(“ggplot2”)

Gọi các packages

library(seqinr)
library(ape)
## 
## Attaching package: 'ape'
## The following objects are masked from 'package:seqinr':
## 
##     as.alignment, consensus
library(ggtree)
## Registered S3 method overwritten by 'ggtree':
##   method      from 
##   identify.gg ggfun
## ggtree v3.2.1  For help: https://yulab-smu.top/treedata-book/
## 
## If you use ggtree in published research, please cite the most appropriate paper(s):
## 
## 1. Guangchuang Yu. Using ggtree to visualize data on tree-like structures. Current Protocols in Bioinformatics. 2020, 69:e96. doi:10.1002/cpbi.96
## 2. Guangchuang Yu, Tommy Tsan-Yuk Lam, Huachen Zhu, Yi Guan. Two methods for mapping and visualizing associated data on phylogeny using ggtree. Molecular Biology and Evolution. 2018, 35(12):3041-3043. doi:10.1093/molbev/msy194
## 3. Guangchuang Yu, David Smith, Huachen Zhu, Yi Guan, Tommy Tsan-Yuk Lam. ggtree: an R package for visualization and annotation of phylogenetic trees with their covariates and other associated data. Methods in Ecology and Evolution. 2017, 8(1):28-36. doi:10.1111/2041-210X.12628
## 
## Attaching package: 'ggtree'
## The following object is masked from 'package:ape':
## 
##     rotate
library(DECIPHER)
## Loading required package: Biostrings
## Loading required package: BiocGenerics
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
##     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
## Loading required package: stats4
## 
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:ggtree':
## 
##     expand
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## Loading required package: IRanges
## 
## Attaching package: 'IRanges'
## The following object is masked from 'package:ggtree':
## 
##     collapse
## Loading required package: XVector
## Loading required package: GenomeInfoDb
## 
## Attaching package: 'Biostrings'
## The following object is masked from 'package:ape':
## 
##     complement
## The following object is masked from 'package:seqinr':
## 
##     translate
## The following object is masked from 'package:base':
## 
##     strsplit
## Loading required package: RSQLite
## Loading required package: parallel
library(viridis)
## Loading required package: viridisLite
library(ggplot2)
library(ade4)
## 
## Attaching package: 'ade4'
## The following object is masked from 'package:Biostrings':
## 
##     score
## The following object is masked from 'package:BiocGenerics':
## 
##     score

Truy xuất trình tự

seqs <- readDNAStringSet("Litho_ITS.txt", format = "fasta")

Kiểm tra trình tự (ko bắt buộc)

seqs
## DNAStringSet object of length 15:
##      width seq                                              names               
##  [1]   575 TCGAAACCTGCCCCGCAGAACGA...GCCCCGGCGGCGCTCGCAACGC AF389105.1 Lithoc...
##  [2]   581 TCGAAACCTGCCCCGCAGAACGA...GCCCCGGCGGCGCTCGCAACGC AY040419.1 Lithoc...
##  [3]   580 TCGAAACCTACACAGCAGAACGA...GCCTCGGCGGCGCTCACAACGC AY040407.1 Lithoc...
##  [4]   411 AGGATCATTGTCGAACCTGCACA...CCCCCAAAACCCCCGGGTTCGG MT274517.1 Lithoc...
##  [5]   579 TCGAAACCTACACAGCAGAACGA...GCCTCGGCGGCGCTCACAACGC AY040406.1 Lithoc...
##  ...   ... ...
## [11]   396 AAACCGAACCCCGGTGCAGAACA...GTGGTTTTTCGACCCTCGTTCC KJ685201.1 Lithoc...
## [12]   396 AAACAGAACCCCAGTGCGGAATG...GTGGTTTTTCAACCTCCGTTCC KJ685198.1 Lithoc...
## [13]   396 AAACCGAACGCTGCACGGAACAC...GTGATTTTTCGACCTTCGTTCC KJ685169.1 Lithoc...
## [14]   590 TCGAAACCTGCACAGCAGAACGA...GCCTCGGCGGCGCTCGCAACGC AF389107.1 Lithoc...
## [15]   594 TCGAAACCTGCCCCGCAGAACGA...GCCCCGGCGGCGCTCGCAACGC AF389106.1 Lithoc...

Các trình tự nucleotide phải cùng chiều, nếu chưa cùng chiều sẽ được định dạng lại cho cùng chiều.

seqs <- OrientNucleotides(seqs)
## ========================================================================================================================================================================================================
## 
## Time difference of 0.02 secs

Thực hiện sắp hàng các trình tự

aligned <- AlignSeqs(seqs)
## Determining distance matrix based on shared 9-mers:
## ================================================================================
## 
## Time difference of 0 secs
## 
## Clustering into groups by similarity:
## ================================================================================
## 
## Time difference of 0 secs
## 
## Aligning Sequences:
## ================================================================================
## 
## Time difference of 0.06 secs
## 
## Iteration 1 of 2:
## 
## Determining distance matrix based on alignment:
## ================================================================================
## 
## Time difference of 0 secs
## 
## Reclustering into groups by similarity:
## ================================================================================
## 
## Time difference of 0 secs
## 
## Realigning Sequences:
## ================================================================================
## 
## Time difference of 0.03 secs
## 
## Iteration 2 of 2:
## 
## Determining distance matrix based on alignment:
## ================================================================================
## 
## Time difference of 0 secs
## 
## Reclustering into groups by similarity:
## ================================================================================
## 
## Time difference of 0 secs
## 
## Realigning Sequences:
## ================================================================================
## 
## Time difference of 0 secs
## 
## Refining the alignment:
## ================================================================================
## 
## Time difference of 0.01 secs

View các trình tự trong trình duyệt (ko bắt buộc) bằng lệnh sau:

BrowseSeqs(aligned, highlight=0)

Ghi trình tự đã được sắp dưới dạng file FASTA

writeXStringSet(aligned, file=“Litho_aligned.fasta”)

Đọc đọc file fasta đã aligned

dna <- read.alignment("Litho_aligned.fasta", format = "fasta")

Tạo ma trận khoảng cách

D <- dist.alignment(dna, matrix = "similarity")
temp <- as.data.frame(as.matrix(D))

Biểu diễn ma trận khoảng cách dưới dạng biểu đồ bằng cú pháp

table.paint(temp, cleg=0, clabel.row=.5, clabel.col=.5)+scale_color_viridis()

## NULL

màu tối hơn biểu thị giá trị khoảng cách lớn hơn

Tạo dữ liệu cây phát sinh

tre <- nj(D)
class(tre) # tất cả cây được tạo ra sử dụng packages ape
## [1] "phylo"
tre <- ladderize(tre)

Trực quan các cây

Biểu diễn cây

plot(tre, cex = 0.6)

Thêm tiêu đề cây

plot(tre, cex = 0.6)
title("similarity in Lithocarpus (ITS)")

Hoặc dùng phương pháp trung bình sử dụng cho cây UPGMA

h_cluster <- hclust(D, method = "average", members = NULL) 
plot(h_cluster, cex = 0.6)

Trực quan cây phát sinh băng ggtree

Cây dạng hình quạt

ggtree(tre, yscale = "NA")+
  geom_tiplab(hjust = -0.3, size=4, align = TRUE)+
  xlim(0,0.5)
## Warning: The tree contained negative edge length. If you want to ignore the edge, you
## can set 'options(ignore.negative.edge=TRUE)', then re-run ggtree.
## Warning in scaleY(as.phylo(model), res, yscale, layout, ...): yscale is not available...

Cây dạng cơ bản

ggtree(tre) + 
  geom_tiplab(hjust = -0.3, size=4, align = TRUE)+
  xlim(0,0.5)
## Warning: The tree contained negative edge length. If you want to ignore the edge, you
## can set 'options(ignore.negative.edge=TRUE)', then re-run ggtree.

~~~~~~~~ Tuỳ biến cây ~~~~~~~~

Trực quan dùngggtree và highlight các nhánh

ggtree(tre) + 
  geom_tiplab(hjust = -0.3, size=4, align = TRUE) + 
  geom_hilight(node=19, fill="purple", alpha = 0.2) + 
  geom_hilight(node=17, fill="dark green", alpha = 0.2) +
  geom_hilight(node=20, fill="gold", alpha = 0.2) +
  xlim(0,0.5)
## Warning: The tree contained negative edge length. If you want to ignore the edge, you
## can set 'options(ignore.negative.edge=TRUE)', then re-run ggtree.

highlight các nhánh và thêm các line cho các nhóm

ggtree(tre) + 
  geom_tiplab(hjust = -0.3, size=4, align = TRUE) + 
  geom_hilight(node=19, fill="purple", alpha = 0.2) + 
  geom_hilight(node=17, fill="dark green", alpha = 0.2) +
  geom_hilight(node=20, fill="gold", alpha = 0.2) +
  geom_cladelabel(node=19, label=" Cluster 1", 
                  color="purple", offset=.1, barsize = 2,
                  fontsize = 5, align=TRUE, alpha = 0.5) + 
  geom_cladelabel(node=17, label=" Cluster 2", 
                  color="dark green", offset=.1, barsize = 2,
                  fontsize = 5, align=TRUE, alpha = 0.5) + 
  geom_cladelabel(node=20, label=" Cluster 3", 
                  color="gold", offset=.1, barsize = 2,
                  fontsize = 5, align=TRUE, alpha = 0.5) + 
  xlim(0,0.5) 
## Warning: The tree contained negative edge length. If you want to ignore the edge, you
## can set 'options(ignore.negative.edge=TRUE)', then re-run ggtree.

~~~~~~ Trực quan trình tự cùng với cây phát sinh ~~~~~~

Lets plot the alignment with the tree, to do this we first have to
- Match the names to the tip labels
- Set our tree into a new name
tre.new <- tre
# change tip labels to full alignment names
tre.new$tip.label <- aligned@ranges@NAMES

# plot the alignment 
msaplot(p=ggtree(tre.new), fasta="Litho_aligned.fasta", 
        window=c(150, 175))+
  scale_fill_viridis_d(alpha = 0.8)
## Warning: The tree contained negative edge length. If you want to ignore the edge, you
## can set 'options(ignore.negative.edge=TRUE)', then re-run ggtree.
## Scale for 'fill' is already present. Adding another scale for 'fill', which
## will replace the existing scale.