Preliminaries

Load packages

library(seqinr)
library(rentrez)
library(compbio4all)
library(Biostrings)
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
##     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
## Loading required package: stats4
## 
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## Loading required package: IRanges
## Loading required package: XVector
## Loading required package: GenomeInfoDb
## 
## Attaching package: 'Biostrings'
## The following object is masked from 'package:seqinr':
## 
##     translate
## The following object is masked from 'package:base':
## 
##     strsplit

Get data

Download sequence P73709

P73709_FASTA <- rentrez::entrez_fetch(id = "P73709",
                                      db = "protein", 
                                      rettype="fasta")

Clean and set up sequence as vector.

NOTE: no arguments besides sequence passed to fasta_cleaner() - we do this differently for parwise alignment

P73709_vector <- fasta_cleaner(P73709_FASTA)

Set up as 1 continuous string

P73709_FASTA_str <- fasta_cleaner(P73709_FASTA, 
                               parse = F)
length(P73709_FASTA_str)
## [1] 1
nchar(P73709_FASTA_str)
## [1] 331

Compare structure of each type

str(P73709_FASTA)
##  chr ">sp|P73709.1|Y1819_SYNY3 RecName: Full=Uncharacterized protein slr1819\nMEAKELVQRYRNGETLFTGLKLPGINLEAADLIGIVLNE"| __truncated__
str(P73709_vector)
##  chr [1:331] "M" "E" "A" "K" "E" "L" "V" "Q" "R" "Y" "R" "N" "G" "E" "T" ...
str(P73709_FASTA_str)
##  chr "MEAKELVQRYRNGETLFTGLKLPGINLEAADLIGIVLNEADLRGANLLFCYLNRANLGQANLVAANLSGASLNQADLAGADLRSANFHGAMLQGAILRDSDMTLATLQDTN"| __truncated__
length(P73709_FASTA_str)
## [1] 1
nchar(P73709_FASTA_str)
## [1] 331

For fun - make a global pairwiser alignment

Takes data in STRING form!

align <- pairwiseAlignment(P73709_FASTA_str, 
                           P73709_FASTA_str, 
                           type = "global")

Look at PID

pid(align)
## [1] 100

Explore vector data

Raw fasta

str()

Vectors

[ ]

P73709_vector[1]
## [1] "M"
P73709_vector[2]
## [1] "E"
P73709_vector[7]
## [1] "V"

[x:y]

colon (not semi colon, not a dash, not sace)

P73709_vector[1:2]
## [1] "M" "E"
P73709_vector[1:50]
##  [1] "M" "E" "A" "K" "E" "L" "V" "Q" "R" "Y" "R" "N" "G" "E" "T" "L" "F" "T" "G"
## [20] "L" "K" "L" "P" "G" "I" "N" "L" "E" "A" "A" "D" "L" "I" "G" "I" "V" "L" "N"
## [39] "E" "A" "D" "L" "R" "G" "A" "N" "L" "L" "F" "C"

length()

table()

table(P73709_vector)
## P73709_vector
##  A  C  D  E  F  G  H  I  K  L  M  N  P  Q  R  S  T  V  Y 
## 53  3 21 13  8 27  4 10 11 52 12 34  2 13 20 15 17 10  6
P73709_vector[50:60]
##  [1] "C" "Y" "L" "N" "R" "A" "N" "L" "G" "Q" "A"
table(P73709_vector[50:60])
## 
## A C G L N Q R Y 
## 2 1 1 2 2 1 1 1
P73709_vector[40:79]
##  [1] "A" "D" "L" "R" "G" "A" "N" "L" "L" "F" "C" "Y" "L" "N" "R" "A" "N" "L" "G"
## [20] "Q" "A" "N" "L" "V" "A" "A" "N" "L" "S" "G" "A" "S" "L" "N" "Q" "A" "D" "L"
## [39] "A" "G"
table(P73709_vector[40:79])
## 
## A C D F G L N Q R S V Y 
## 9 1 2 1 4 9 6 2 2 2 1 1

Default dotplot from R

Note orientation. Any strong diagonals?

dotPlot(P73709_vector[40:79], 
        P73709_vector[40:79])

dotPlot(P73709_vector, 
        P73709_vector)

Arguements of dotPlot

Default is exact match (binary):

filter out “noise” amplify the signal

dotPlot(P73709_vector, 
        P73709_vector, 
        wsize = 1, 
        wstep = 1,
        nmatch = 1)

Don’t vary wstep

Running average

dotPlot(P73709_vector, 
        P73709_vector, 
        wsize = 1, 
        nmatch = 1)

main = … sets a title

I’ll use “Default: wsize = 1, nmatch = 1

dotPlot(P73709_vector, 
        P73709_vector, 
        wsize = 1, 
        nmatch = 1, 
        main = "Default: wsize = 1, nmatch = 1")

We can make a grid of plots with the par() command (we’ll leave this as a black box for now) mfrow sets layout of grid mar sets margins

# set up 2 x 2 grid, make margins thingger
par(mfrow = c(2,2), 
    mar = c(0,0,1,1))

# plot 1: Defaults
dotPlot(P73709_vector, P73709_vector, 
        wsize = 1, 
        nmatch = 1, 
        main = "Defaults")

# plot 2 size = 10, nmatch = 1
dotPlot(P73709_vector, P73709_vector, 
        wsize = 10, 
        nmatch = 1, 
        main = "size = 10, nmatch = 1")

# plot 3: size = 10, nmatch = 5
dotPlot(P73709_vector, P73709_vector, 
        wsize = 10, 
        nmatch = 5, 
        main = "size = 10, nmatch = 5")

# plot 4: size = 20, nmatch = 5
dotPlot(P73709_vector, P73709_vector, 
        wsize = 20,
        nmatch = 5,
        main = "size = 20, nmatch = 5")

# reset par() - run this or other plots will be small!
par(mfrow = c(1,1), 
    mar = c(4,4,4,4))

Dialed in

# be sure to run par - re-run just in cae
par(mfrow = c(1,1), 
    mar = c(4,4,4,4))


dotPlot(P73709_vector, 
        P73709_vector,
        wsize = 20, 
        wstep = 1,
        nmatch = 5)

Diagonal dropped

Make new function

dot_plot <- function(seq1, seq2, wsize = 1, wstep = 1, nmatch = 1, col = c("white", 
    "black"), xlab = deparse(substitute(seq1)), ylab = deparse(substitute(seq2)), 
    ...) {
  
  # make sure input wors
    if (nchar(seq1[1]) > 1) 
        stop("seq1 should be provided as a vector of single chars")
    if (nchar(seq2[1]) > 1) 
        stop("seq2 should be provided as a vector of single chars")
    if (wsize < 1) 
        stop("non allowed value for wsize")
    if (wstep < 1) 
        stop("non allowed value for wstep")
    if (nmatch < 1) 
        stop("non allowed value for nmatch")
    if (nmatch > wsize) 
        stop("nmatch > wsize is not allowed")
  
  # internal function
    mkwin <- function(seq, wsize, wstep) {
        sapply(seq(from = 1, to = length(seq) - wsize + 1, by = wstep), 
            function(i) c2s(seq[i:(i + wsize - 1)]))
    }
    wseq1 <- mkwin(seq1, wsize, wstep)
    wseq2 <- mkwin(seq2, wsize, wstep)
    if (nmatch == wsize) {
        xy <- outer(wseq1, wseq2, "==")
    }
    else {
        "%==%" <- function(x, y) colSums(sapply(x, s2c) == sapply(y, 
            s2c)) >= nmatch
        xy <- outer(wseq1, wseq2, "%==%")
    }
    
    # compile output in list
    out <- list(x = seq(from = 1, to = length(seq1), length = length(wseq1)),
                y = seq(from = 1, to = length(seq2), length = length(wseq2)),
                z = xy)
}

Use new function, save output (doesn’t autoplot)

my_dot_out <- dot_plot(P73709_vector,
                       P73709_vector, 
        wsize = 20, 
        wstep = 1,
        nmatch = 8) # threshold for number of "hits" or number of "matchs
                    # within the window
                    # 5/20 matches;  

Get rid of upper triangular portion

my_dot_out$z[lower.tri(my_dot_out$z)] <- FALSE

Do some weird prep (don’t worry about it)

my_dot_out$z <- my_dot_out$z[, nrow(my_dot_out$z):1]

Plot using image() command

# seriously - it will drive you crazy if you forget abouthis
par(mfrow = c(1,1), 
    mar = c(4,4,4,4))

# plot with image()
image(x = my_dot_out$x, 
      y = my_dot_out$y, 
      z = my_dot_out$z)

Other examples

P24587

P24587 <- rentrez::entrez_fetch(id = "P24587",
                                db = "protein", 
                                rettype="fasta")

P24587 <- fasta_cleaner(P24587)

length(P24587)
## [1] 714
# Use [ : ] to subset 300 to 400 
dotPlot(P24587[300:400],
        P24587[300:400],
        wsize = 15, 
        wstep = 1,
        nmatch = 5)

P02840

P02840 <- rentrez::entrez_fetch(id = "P02840",db = "protein", rettype="fasta")

P02840 <- fasta_cleaner(P02840)
length(P02840)
## [1] 307
# set limit to 80 to 113
dotPlot(P02840[],
        P02840[],
        wsize = 5, 
        wstep = 1,
        nmatch = 5)

P19246

P19246 <- rentrez::entrez_fetch(id = "P19246",db = "protein", rettype="fasta")

P19246 <- fasta_cleaner(P19246)
length(P19246)
## [1] 1090
# full
dotPlot(P19246,P19246,
        wsize = 1, 
        wstep = 1,
        nmatch = 1)

# set limit to 525:550
dotPlot(P19246[],P19246[],
        wsize = 1, 
        wstep = 1,
        nmatch = 1)

Q55837

Q55837 <- rentrez::entrez_fetch(id = "Q55837",db = "protein", rettype="fasta")

Q55837 <- fasta_cleaner(Q55837)
length(Q55837)
## [1] 166
dotPlot(Q55837,
        Q55837)