rm(list = ls())
#BiocManager::install("msa")
library(msa)
## Loading required package: Biostrings
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind, colnames,
## dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
## union, unique, unsplit, which, which.max, which.min
## Loading required package: S4Vectors
## Loading required package: stats4
##
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:base':
##
## expand.grid
## Loading required package: IRanges
##
## Attaching package: 'IRanges'
## The following object is masked from 'package:grDevices':
##
## windows
## Loading required package: XVector
##
## Attaching package: 'Biostrings'
## The following object is masked from 'package:base':
##
## strsplit
library(msaR)
## Warning: package 'msaR' was built under R version 4.0.5
library(seqinr)
## Warning: package 'seqinr' was built under R version 4.1.0
##
## Attaching package: 'seqinr'
## The following object is masked from 'package:Biostrings':
##
## translate
library(ape)
## Warning: package 'ape' was built under R version 4.1.0
##
## Attaching package: 'ape'
## The following objects are masked from 'package:seqinr':
##
## as.alignment, consensus
## The following object is masked from 'package:Biostrings':
##
## complement
#install.packages("msaR")
#Data Sample
getwd()
## [1] "C:/Users/liyix/OneDrive/Desktop"
myseq <- readDNAStringSet("viral.1.1.genomic.fna.gz")
head(myseq,5)
## DNAStringSet object of length 5:
## width seq names
## [1] 154675 AGTCCCCGTCTTGCCGCGCGGGG...GGGGGAAAAGAGGCGGGGCGGG NC_001798.2 Human...
## [2] 8908 TGTTGCGTTAACAACAAATCAAT...GTTGTGGCTTTGTTGTAGCGCA NC_030692.1 Borna...
## [3] 8957 CAGCTCTCGCACCAAAACCCAGT...GATGTGTTGGTTTGGTTTCTTT NC_027892.1 Canar...
## [4] 9006 TGTTGCGTTAACAACAAACCAAC...CGTTGTGGTTTTGTTGTAGCGC NC_029642.1 Aquat...
## [5] 8910 GTTGCGTTAACAACAAACCACTC...TGTTGCTTTGTTGTAGCGCGTT NC_001607.1 Borna...
length(myseq)
## [1] 9142
myseq <- myseq[c(2,2,3,3)]
myseq
## DNAStringSet object of length 4:
## width seq names
## [1] 8908 TGTTGCGTTAACAACAAATCAAT...AGTTGTGGCTTTGTTGTAGCGCA NC_030692.1 Borna...
## [2] 8908 TGTTGCGTTAACAACAAATCAAT...AGTTGTGGCTTTGTTGTAGCGCA NC_030692.1 Borna...
## [3] 8957 CAGCTCTCGCACCAAAACCCAGT...GGATGTGTTGGTTTGGTTTCTTT NC_027892.1 Canar...
## [4] 8957 CAGCTCTCGCACCAAAACCCAGT...GGATGTGTTGGTTTGGTTTCTTT NC_027892.1 Canar...
names(myseq) <- paste0(gsub("\\..*", "", names(myseq)),"-",1:4)
##Visualizing Our Data Sample
#msaR(myseq, seqlogo = F)
#Multiple Sequence Alignment
cap_msa <- msa(myseq)
## use default substitution matrix
cap_msa
## CLUSTAL 2.1
##
## Call:
## msa(myseq)
##
## MsaDNAMultipleAlignment with 4 rows and 8987 columns
## aln names
## [1] -------------------CAGCTCT...AAGGATGTGTTGGTTTGGTTTCTTT NC_027892-3
## [2] -------------------CAGCTCT...AAGGATGTGTTGGTTTGGTTTCTTT NC_027892-4
## [3] TGTTGCGTTAACAACAAATCAATCAC...------------------------- NC_030692-1
## [4] TGTTGCGTTAACAACAAATCAATCAC...------------------------- NC_030692-2
## Con ???????????????????CA?????...????????????????????????? Consensus
#Then we will convert our msa object into alignment.
cap_align <- msaConvert(cap_msa, type="seqinr::alignment")
cap_align$nb
## [1] 4
cap_align$nam
## [1] "NC_027892-3" "NC_027892-4" "NC_030692-1" "NC_030692-2"
#cap_align$seq[1]
cap_align$com
## [1] NA
#Which now we can calculate the score of similarites of the alignment.
#identity matrix (for protein and DNA sequences).
cap_ident <- dist.alignment(cap_align, matrix = "identity") ###note here is dist
class(cap_ident)
## [1] "dist"
cap_ident
## NC_027892-3 NC_027892-4 NC_030692-1
## NC_027892-4 0.0000000
## NC_030692-1 0.5795571 0.5795571
## NC_030692-2 0.5795571 0.5795571 0.0000000
as.matrix(cap_ident)
## NC_027892-3 NC_027892-4 NC_030692-1 NC_030692-2
## NC_027892-3 0.0000000 0.0000000 0.5795571 0.5795571
## NC_027892-4 0.0000000 0.0000000 0.5795571 0.5795571
## NC_030692-1 0.5795571 0.5795571 0.0000000 0.0000000
## NC_030692-2 0.5795571 0.5795571 0.0000000 0.0000000
##############
library(reshape2)
## Warning: package 'reshape2' was built under R version 4.1.0
df <- melt(as.matrix(cap_ident), varnames = c("row", "col"))
df
## row col value
## 1 NC_027892-3 NC_027892-3 0.0000000
## 2 NC_027892-4 NC_027892-3 0.0000000
## 3 NC_030692-1 NC_027892-3 0.5795571
## 4 NC_030692-2 NC_027892-3 0.5795571
## 5 NC_027892-3 NC_027892-4 0.0000000
## 6 NC_027892-4 NC_027892-4 0.0000000
## 7 NC_030692-1 NC_027892-4 0.5795571
## 8 NC_030692-2 NC_027892-4 0.5795571
## 9 NC_027892-3 NC_030692-1 0.5795571
## 10 NC_027892-4 NC_030692-1 0.5795571
## 11 NC_030692-1 NC_030692-1 0.0000000
## 12 NC_030692-2 NC_030692-1 0.0000000
## 13 NC_027892-3 NC_030692-2 0.5795571
## 14 NC_027892-4 NC_030692-2 0.5795571
## 15 NC_030692-1 NC_030692-2 0.0000000
## 16 NC_030692-2 NC_030692-2 0.0000000
str(df)
## 'data.frame': 16 obs. of 3 variables:
## $ row : Factor w/ 4 levels "NC_027892-3",..: 1 2 3 4 1 2 3 4 1 2 ...
## $ col : Factor w/ 4 levels "NC_027892-3",..: 1 1 1 1 2 2 2 2 3 3 ...
## $ value: num 0 0 0.58 0.58 0 ...
df[as.numeric(df$row) > as.numeric(df$col), ]
## row col value
## 2 NC_027892-4 NC_027892-3 0.0000000
## 3 NC_030692-1 NC_027892-3 0.5795571
## 4 NC_030692-2 NC_027892-3 0.5795571
## 7 NC_030692-1 NC_027892-4 0.5795571
## 8 NC_030692-2 NC_027892-4 0.5795571
## 12 NC_030692-2 NC_030692-1 0.0000000
library(Biostrings)
a1 <- pairwiseAlignment(pattern=myseq[1], subject=myseq[3], gapOpening=1, gapExtension=0)
a1
## Global PairwiseAlignmentsSingleSubject (1 of 1)
## pattern: --TG-T-T-GCGTTAACAACAAATCAATCACCAC...GTTTGGGTT-GAGTTGTGGCTTTGTTGTAGCGCA
## subject: CA-GCTCTCGC----AC--CAAA------ACC--...-TT--GGTTTG-GTT-T--CTTT-----------
## score: 10054.94
hc <- hclust(cap_ident)
library(ggdendro)
## Warning: package 'ggdendro' was built under R version 4.1.0
ggdendrogram(hc)

ggdendrogram(hc, rotate = TRUE, theme_dendro = FALSE)

dend <- as.dendrogram(hc)
dend
## 'dendrogram' with 2 branches and 4 members total, at height 0.5795571
dend_data <- dendro_data(dend, type = "rectangle")
names(dend_data)
## [1] "segments" "labels" "leaf_labels" "class"
head(dend_data$segments)
## x y xend yend
## 1 2.5 0.5795571 1.5 0.5795571
## 2 1.5 0.5795571 1.5 0.0000000
## 3 1.5 0.0000000 1.0 0.0000000
## 4 1.0 0.0000000 1.0 0.0000000
## 5 1.5 0.0000000 2.0 0.0000000
## 6 2.0 0.0000000 2.0 0.0000000
# Plot line segments and add labels
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.1.0
ggplot(dend_data$segments) +
geom_segment(aes(x = x, y = y, xend = xend, yend = yend))+
geom_text(data = dend_data$labels, aes(x, y, label = label),
hjust = 1, angle = 90, size = 3)+
ylim(-0.2, 0.6)

#Tweaking ggdendro.R
dendro_data_k <- function(hc, k) {
hcdata <- ggdendro::dendro_data(hc, type = "rectangle")
seg <- hcdata$segments
labclust <- cutree(hc, k)[hc$order]
segclust <- rep(0L, nrow(seg))
heights <- sort(hc$height, decreasing = TRUE)
height <- mean(c(heights[k], heights[k - 1L]), na.rm = TRUE)
for (i in 1:k) {
xi <- hcdata$labels$x[labclust == i]
idx1 <- seg$x >= min(xi) & seg$x <= max(xi)
idx2 <- seg$xend >= min(xi) & seg$xend <= max(xi)
idx3 <- seg$yend < height
idx <- idx1 & idx2 & idx3
segclust[idx] <- i
}
idx <- which(segclust == 0L)
segclust[idx] <- segclust[idx + 1L]
hcdata$segments$clust <- segclust
hcdata$segments$line <- as.integer(segclust < 1L)
hcdata$labels$clust <- labclust
hcdata
}
set_labels_params <- function(nbLabels,
direction = c("tb", "bt", "lr", "rl"),
fan = FALSE) {
if (fan) {
angle <- 360 / nbLabels * 1:nbLabels + 90
idx <- angle >= 90 & angle <= 270
angle[idx] <- angle[idx] + 180
hjust <- rep(0, nbLabels)
hjust[idx] <- 1
} else {
angle <- rep(0, nbLabels)
hjust <- 0
if (direction %in% c("tb", "bt")) { angle <- angle + 45 }
if (direction %in% c("tb", "rl")) { hjust <- 1 }
}
list(angle = angle, hjust = hjust, vjust = 0.5)
}
plot_ggdendro <- function(hcdata,
direction = c("lr", "rl", "tb", "bt"),
fan = FALSE,
scale.color = NULL,
branch.size = 1,
label.size = 3,
nudge.label = 0.01,
expand.y = 0.1) {
direction <- match.arg(direction) # if fan = FALSE
ybreaks <- pretty(segment(hcdata)$y, n = 5)
ymax <- max(segment(hcdata)$y)
## branches
p <- ggplot() +
geom_segment(data = segment(hcdata),
aes(x = x,
y = y,
xend = xend,
yend = yend,
linetype = factor(line),
colour = factor(clust)),
lineend = "round",
show.legend = FALSE,
size = branch.size)
## orientation
if (fan) {
p <- p +
coord_polar(direction = -1) +
scale_x_continuous(breaks = NULL,
limits = c(0, nrow(label(hcdata)))) +
scale_y_reverse(breaks = ybreaks)
} else {
p <- p + scale_x_continuous(breaks = NULL)
if (direction %in% c("rl", "lr")) {
p <- p + coord_flip()
}
if (direction %in% c("bt", "lr")) {
p <- p + scale_y_reverse(breaks = ybreaks)
} else {
p <- p + scale_y_continuous(breaks = ybreaks)
nudge.label <- -(nudge.label)
}
}
# labels
labelParams <- set_labels_params(nrow(hcdata$labels), direction, fan)
hcdata$labels$angle <- labelParams$angle
p <- p +
geom_text(data = label(hcdata),
aes(x = x,
y = y,
label = label,
colour = factor(clust),
angle = angle),
vjust = labelParams$vjust,
hjust = labelParams$hjust,
nudge_y = ymax * nudge.label,
size = label.size,
show.legend = FALSE)
# colors and limits
if (!is.null(scale.color)) {
p <- p + scale_color_manual(values = scale.color)
}
ylim <- -round(ymax * expand.y, 1)
p <- p + expand_limits(y = ylim)
p
}
################################
#Basic dendrogram
hcdata <- dendro_data_k(hc, 2)
plot_ggdendro(hcdata,
direction = "lr",
expand.y = 0.2)

################################
ggsave(filename = paste0(Sys.Date(),"-","dendrogram.tif"),
plot = last_plot(), device = "tiff",
width =15, height = 15, units = "cm",
dpi = 300, limitsize = TRUE, compression = "lzw")
#REF http://www.sthda.com/english/wiki/beautiful-dendrogram-visualizations-in-r-5-must-known-methods-unsupervised-machine-learning
#https://atrebas.github.io/post/2019-06-08-lightweight-dendrograms/
#https://rpubs.com/TX-YXL/662586
##https://rpubs.com/HandoyoSjarif/DecodeDNA