DNA-alignment-similarity-BLAST-cluster-plot.R

rm(list = ls())
#BiocManager::install("msa")
library(msa)

## Loading required package: Biostrings

## Loading required package: BiocGenerics

## Loading required package: parallel

## 
## Attaching package: 'BiocGenerics'

## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB

## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs

## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
##     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which, which.max, which.min

## Loading required package: S4Vectors

## Loading required package: stats4

## 
## Attaching package: 'S4Vectors'

## The following object is masked from 'package:base':
## 
##     expand.grid

## Loading required package: IRanges

## 
## Attaching package: 'IRanges'

## The following object is masked from 'package:grDevices':
## 
##     windows

## Loading required package: XVector

## 
## Attaching package: 'Biostrings'

## The following object is masked from 'package:base':
## 
##     strsplit

library(msaR)

## Warning: package 'msaR' was built under R version 4.0.5

library(seqinr)

## Warning: package 'seqinr' was built under R version 4.1.0

## 
## Attaching package: 'seqinr'

## The following object is masked from 'package:Biostrings':
## 
##     translate

library(ape)

## Warning: package 'ape' was built under R version 4.1.0

## 
## Attaching package: 'ape'

## The following objects are masked from 'package:seqinr':
## 
##     as.alignment, consensus

## The following object is masked from 'package:Biostrings':
## 
##     complement

#install.packages("msaR")
#Data Sample
getwd()

## [1] "C:/Users/liyix/OneDrive/Desktop"

myseq <- readDNAStringSet("viral.1.1.genomic.fna.gz")
head(myseq,5)

## DNAStringSet object of length 5:
##      width seq                                              names               
## [1] 154675 AGTCCCCGTCTTGCCGCGCGGGG...GGGGGAAAAGAGGCGGGGCGGG NC_001798.2 Human...
## [2]   8908 TGTTGCGTTAACAACAAATCAAT...GTTGTGGCTTTGTTGTAGCGCA NC_030692.1 Borna...
## [3]   8957 CAGCTCTCGCACCAAAACCCAGT...GATGTGTTGGTTTGGTTTCTTT NC_027892.1 Canar...
## [4]   9006 TGTTGCGTTAACAACAAACCAAC...CGTTGTGGTTTTGTTGTAGCGC NC_029642.1 Aquat...
## [5]   8910 GTTGCGTTAACAACAAACCACTC...TGTTGCTTTGTTGTAGCGCGTT NC_001607.1 Borna...

length(myseq)

## [1] 9142

myseq <- myseq[c(2,2,3,3)]
myseq

## DNAStringSet object of length 4:
##     width seq                                               names               
## [1]  8908 TGTTGCGTTAACAACAAATCAAT...AGTTGTGGCTTTGTTGTAGCGCA NC_030692.1 Borna...
## [2]  8908 TGTTGCGTTAACAACAAATCAAT...AGTTGTGGCTTTGTTGTAGCGCA NC_030692.1 Borna...
## [3]  8957 CAGCTCTCGCACCAAAACCCAGT...GGATGTGTTGGTTTGGTTTCTTT NC_027892.1 Canar...
## [4]  8957 CAGCTCTCGCACCAAAACCCAGT...GGATGTGTTGGTTTGGTTTCTTT NC_027892.1 Canar...

names(myseq) <- paste0(gsub("\\..*", "", names(myseq)),"-",1:4)
##Visualizing Our Data Sample
#msaR(myseq, seqlogo = F)
#Multiple Sequence Alignment
cap_msa <- msa(myseq)

## use default substitution matrix

cap_msa

## CLUSTAL 2.1  
## 
## Call:
##    msa(myseq)
## 
## MsaDNAMultipleAlignment with 4 rows and 8987 columns
##     aln                                                    names
## [1] -------------------CAGCTCT...AAGGATGTGTTGGTTTGGTTTCTTT NC_027892-3
## [2] -------------------CAGCTCT...AAGGATGTGTTGGTTTGGTTTCTTT NC_027892-4
## [3] TGTTGCGTTAACAACAAATCAATCAC...------------------------- NC_030692-1
## [4] TGTTGCGTTAACAACAAATCAATCAC...------------------------- NC_030692-2
## Con ???????????????????CA?????...????????????????????????? Consensus

#Then we will convert our msa object into alignment.
cap_align <- msaConvert(cap_msa, type="seqinr::alignment")
cap_align$nb

## [1] 4

cap_align$nam

## [1] "NC_027892-3" "NC_027892-4" "NC_030692-1" "NC_030692-2"

#cap_align$seq[1]
cap_align$com

## [1] NA

#Which now we can calculate the score of similarites of the alignment.
#identity matrix (for protein and DNA sequences).
cap_ident <- dist.alignment(cap_align, matrix = "identity") ###note here is dist
class(cap_ident)

## [1] "dist"

cap_ident

##             NC_027892-3 NC_027892-4 NC_030692-1
## NC_027892-4   0.0000000                        
## NC_030692-1   0.5795571   0.5795571            
## NC_030692-2   0.5795571   0.5795571   0.0000000

as.matrix(cap_ident)

##             NC_027892-3 NC_027892-4 NC_030692-1 NC_030692-2
## NC_027892-3   0.0000000   0.0000000   0.5795571   0.5795571
## NC_027892-4   0.0000000   0.0000000   0.5795571   0.5795571
## NC_030692-1   0.5795571   0.5795571   0.0000000   0.0000000
## NC_030692-2   0.5795571   0.5795571   0.0000000   0.0000000

##############
library(reshape2)

## Warning: package 'reshape2' was built under R version 4.1.0

df <- melt(as.matrix(cap_ident), varnames = c("row", "col"))
df

##            row         col     value
## 1  NC_027892-3 NC_027892-3 0.0000000
## 2  NC_027892-4 NC_027892-3 0.0000000
## 3  NC_030692-1 NC_027892-3 0.5795571
## 4  NC_030692-2 NC_027892-3 0.5795571
## 5  NC_027892-3 NC_027892-4 0.0000000
## 6  NC_027892-4 NC_027892-4 0.0000000
## 7  NC_030692-1 NC_027892-4 0.5795571
## 8  NC_030692-2 NC_027892-4 0.5795571
## 9  NC_027892-3 NC_030692-1 0.5795571
## 10 NC_027892-4 NC_030692-1 0.5795571
## 11 NC_030692-1 NC_030692-1 0.0000000
## 12 NC_030692-2 NC_030692-1 0.0000000
## 13 NC_027892-3 NC_030692-2 0.5795571
## 14 NC_027892-4 NC_030692-2 0.5795571
## 15 NC_030692-1 NC_030692-2 0.0000000
## 16 NC_030692-2 NC_030692-2 0.0000000

str(df)

## 'data.frame':    16 obs. of  3 variables:
##  $ row  : Factor w/ 4 levels "NC_027892-3",..: 1 2 3 4 1 2 3 4 1 2 ...
##  $ col  : Factor w/ 4 levels "NC_027892-3",..: 1 1 1 1 2 2 2 2 3 3 ...
##  $ value: num  0 0 0.58 0.58 0 ...

df[as.numeric(df$row) > as.numeric(df$col), ]

##            row         col     value
## 2  NC_027892-4 NC_027892-3 0.0000000
## 3  NC_030692-1 NC_027892-3 0.5795571
## 4  NC_030692-2 NC_027892-3 0.5795571
## 7  NC_030692-1 NC_027892-4 0.5795571
## 8  NC_030692-2 NC_027892-4 0.5795571
## 12 NC_030692-2 NC_030692-1 0.0000000

library(Biostrings)
a1 <- pairwiseAlignment(pattern=myseq[1], subject=myseq[3], gapOpening=1, gapExtension=0)
a1

## Global PairwiseAlignmentsSingleSubject (1 of 1)
## pattern: --TG-T-T-GCGTTAACAACAAATCAATCACCAC...GTTTGGGTT-GAGTTGTGGCTTTGTTGTAGCGCA
## subject: CA-GCTCTCGC----AC--CAAA------ACC--...-TT--GGTTTG-GTT-T--CTTT-----------
## score: 10054.94

hc <- hclust(cap_ident)
library(ggdendro)

## Warning: package 'ggdendro' was built under R version 4.1.0

ggdendrogram(hc)

ggdendrogram(hc, rotate = TRUE, theme_dendro = FALSE)

dend <- as.dendrogram(hc)
dend

## 'dendrogram' with 2 branches and 4 members total, at height 0.5795571

dend_data <- dendro_data(dend, type = "rectangle")
names(dend_data)

## [1] "segments"    "labels"      "leaf_labels" "class"

head(dend_data$segments)

##     x         y xend      yend
## 1 2.5 0.5795571  1.5 0.5795571
## 2 1.5 0.5795571  1.5 0.0000000
## 3 1.5 0.0000000  1.0 0.0000000
## 4 1.0 0.0000000  1.0 0.0000000
## 5 1.5 0.0000000  2.0 0.0000000
## 6 2.0 0.0000000  2.0 0.0000000

# Plot line segments and add labels
library(ggplot2)

## Warning: package 'ggplot2' was built under R version 4.1.0

ggplot(dend_data$segments) + 
  geom_segment(aes(x = x, y = y, xend = xend, yend = yend))+
  geom_text(data = dend_data$labels, aes(x, y, label = label),
            hjust = 1, angle = 90, size = 3)+
  ylim(-0.2, 0.6)

#Tweaking ggdendro.R
dendro_data_k <- function(hc, k) {
  
  hcdata    <-  ggdendro::dendro_data(hc, type = "rectangle")
  seg       <-  hcdata$segments
  labclust  <-  cutree(hc, k)[hc$order]
  segclust  <-  rep(0L, nrow(seg))
  heights   <-  sort(hc$height, decreasing = TRUE)
  height    <-  mean(c(heights[k], heights[k - 1L]), na.rm = TRUE)
  
  for (i in 1:k) {
    xi      <-  hcdata$labels$x[labclust == i]
    idx1    <-  seg$x    >= min(xi) & seg$x    <= max(xi)
    idx2    <-  seg$xend >= min(xi) & seg$xend <= max(xi)
    idx3    <-  seg$yend < height
    idx     <-  idx1 & idx2 & idx3
    segclust[idx] <- i
  }
  
  idx                    <-  which(segclust == 0L)
  segclust[idx]          <-  segclust[idx + 1L]
  hcdata$segments$clust  <-  segclust
  hcdata$segments$line   <-  as.integer(segclust < 1L)
  hcdata$labels$clust    <-  labclust
  
  hcdata
}
set_labels_params <- function(nbLabels,
                              direction = c("tb", "bt", "lr", "rl"),
                              fan       = FALSE) {
  if (fan) {
    angle       <-  360 / nbLabels * 1:nbLabels + 90
    idx         <-  angle >= 90 & angle <= 270
    angle[idx]  <-  angle[idx] + 180
    hjust       <-  rep(0, nbLabels)
    hjust[idx]  <-  1
  } else {
    angle       <-  rep(0, nbLabels)
    hjust       <-  0
    if (direction %in% c("tb", "bt")) { angle <- angle + 45 }
    if (direction %in% c("tb", "rl")) { hjust <- 1 }
  }
  list(angle = angle, hjust = hjust, vjust = 0.5)
}
plot_ggdendro <- function(hcdata,
                          direction   = c("lr", "rl", "tb", "bt"),
                          fan         = FALSE,
                          scale.color = NULL,
                          branch.size = 1,
                          label.size  = 3,
                          nudge.label = 0.01,
                          expand.y    = 0.1) {
  
  direction <- match.arg(direction) # if fan = FALSE
  ybreaks   <- pretty(segment(hcdata)$y, n = 5)
  ymax      <- max(segment(hcdata)$y)
  
  ## branches
  p <- ggplot() +
    geom_segment(data         =  segment(hcdata),
                 aes(x        =  x,
                     y        =  y,
                     xend     =  xend,
                     yend     =  yend,
                     linetype =  factor(line),
                     colour   =  factor(clust)),
                 lineend      =  "round",
                 show.legend  =  FALSE,
                 size         =  branch.size)
  
  ## orientation
  if (fan) {
    p <- p +
      coord_polar(direction = -1) +
      scale_x_continuous(breaks = NULL,
                         limits = c(0, nrow(label(hcdata)))) +
      scale_y_reverse(breaks = ybreaks)
  } else {
    p <- p + scale_x_continuous(breaks = NULL)
    if (direction %in% c("rl", "lr")) {
      p <- p + coord_flip()
    }
    if (direction %in% c("bt", "lr")) {
      p <- p + scale_y_reverse(breaks = ybreaks)
    } else {
      p <- p + scale_y_continuous(breaks = ybreaks)
      nudge.label <- -(nudge.label)
    }
  }
  
  # labels
  labelParams <- set_labels_params(nrow(hcdata$labels), direction, fan)
  hcdata$labels$angle <- labelParams$angle
  
  p <- p +
    geom_text(data        =  label(hcdata),
              aes(x       =  x,
                  y       =  y,
                  label   =  label,
                  colour  =  factor(clust),
                  angle   =  angle),
              vjust       =  labelParams$vjust,
              hjust       =  labelParams$hjust,
              nudge_y     =  ymax * nudge.label,
              size        =  label.size,
              show.legend =  FALSE)
  
  # colors and limits
  if (!is.null(scale.color)) {
    p <- p + scale_color_manual(values = scale.color)
  }
  
  ylim <- -round(ymax * expand.y, 1)
  p    <- p + expand_limits(y = ylim)
  
  p
}
################################
#Basic dendrogram
hcdata <- dendro_data_k(hc, 2)
plot_ggdendro(hcdata,
              direction   = "lr",
              expand.y    = 0.2)

################################
ggsave(filename = paste0(Sys.Date(),"-","dendrogram.tif"), 
       plot = last_plot(), device = "tiff",
       width =15, height = 15, units = "cm",
       dpi = 300, limitsize = TRUE, compression = "lzw")
#REF http://www.sthda.com/english/wiki/beautiful-dendrogram-visualizations-in-r-5-must-known-methods-unsupervised-machine-learning
#https://atrebas.github.io/post/2019-06-08-lightweight-dendrograms/
#https://rpubs.com/TX-YXL/662586
##https://rpubs.com/HandoyoSjarif/DecodeDNA

DNA-alignment-similarity-BLAST-cluster-plot.R

liyix

2021-08-22