Adjusting dotplot setting in R to investigate sequence repeats

In this exercise we’ll look at a sequence with known tandem repeats. We’ll load the data, explore it in R, then use the dotPlot() function to make various dotplots to see how changing settings for dotPlots() help make repeat patterns stand out.

Add the necessary code to make this script functional.

Preliminaries

Load packages

library(seqinr)
library(rentrez)
library(compbio4all)
library(Biostrings)

## Loading required package: BiocGenerics

## Loading required package: parallel

## 
## Attaching package: 'BiocGenerics'

## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB

## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs

## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
##     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which.max, which.min

## Loading required package: S4Vectors

## Loading required package: stats4

## 
## Attaching package: 'S4Vectors'

## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname

## Loading required package: IRanges

## Loading required package: XVector

## Loading required package: GenomeInfoDb

## 
## Attaching package: 'Biostrings'

## The following object is masked from 'package:seqinr':
## 
##     translate

## The following object is masked from 'package:base':
## 
##     strsplit

Get data

Download sequence P73709

P73709_FASTA <- rentrez::entrez_fetch(id = "P73709" ,
                                      db = "protein", 
                                      rettype="fasta")

Clean and set up sequence as vector.

NOTE: no arguments besides sequence passed to fasta_cleaner() - we do this differently for pairwise alignment

P73709_vector <- fasta_cleaner(P73709_FASTA)

Set up as 1 continuous string

P73709_string <- fasta_cleaner(P73709_vector, 
                               parse = F)

Compare structure of each type

str(P73709_FASTA)

##  chr ">sp|P73709.1|Y1819_SYNY3 RecName: Full=Uncharacterized protein slr1819\nMEAKELVQRYRNGETLFTGLKLPGINLEAADLIGIVLNE"| __truncated__

str(P73709_vector)

##  chr [1:331] "M" "E" "A" "K" "E" "L" "V" "Q" "R" "Y" "R" "N" "G" "E" "T" ...

str(P73709_string)

##  chr "M"

# could compare length() or # of characters nchar()

For fun - make a global pairwiser alignment

Takes data in STRING form!

align <- pairwiseAlignment(P73709_FASTA, 
                           P73709_FASTA, 
                           type = "global")

Look at PID

pid(align)

## [1] 100

# PID = 100% because the same thing is being compared

Explore vector data

Raw fasta

str()

Vectors

[ ]

P73709_vector[1]

## [1] "M"

# gets the element at that index in vector

[x:y]

P73709_vector[1:2]  # shows first 2

## [1] "M" "E"

P73709_vector[1:50]   # shows first 50

##  [1] "M" "E" "A" "K" "E" "L" "V" "Q" "R" "Y" "R" "N" "G" "E" "T" "L" "F" "T" "G"
## [20] "L" "K" "L" "P" "G" "I" "N" "L" "E" "A" "A" "D" "L" "I" "G" "I" "V" "L" "N"
## [39] "E" "A" "D" "L" "R" "G" "A" "N" "L" "L" "F" "C"

length(P73709_vector)

## [1] 331

# Make a table relevant to data
# returns table of frequency of characters
#  A B C D
#  2 5 1 5
table(P73709_vector)

## P73709_vector
##  A  C  D  E  F  G  H  I  K  L  M  N  P  Q  R  S  T  V  Y 
## 53  3 21 13  8 27  4 10 11 52 12 34  2 13 20 15 17 10  6

Default dotplot from R

Note orientation. Any strong diagonals?

dotPlot(P73709_vector[40:70], 
        P73709_vector[40:70])

Arguements of dotPlot

window size: wsize
window step: wstep
number matching within window: nmatch (the threshold)

Default is exact match (binary):

# dotPlot() defualt makes a binary dotplot
# filter noise and amplify signal 

dotPlot(P73709_vector, 
        P73709_vector, 
        wsize = 1,   #window size
        wstep = 1,  
        nmatch = 1)   #"threshold"

Don’t vary wstep

# running average = moving window and smooth out big spikes and get rid of noise
dotPlot(P73709_vector, 
        P73709_vector, 
        wsize = 1, 
        nmatch = 1)

main = … sets a title

I’ll use "Default: wsize = 1, nmatch = 1

dotPlot(P73709_vector, 
        P73709_vector, 
        wsize = 1, 
        nmatch = 1, 
        main = "Deafult: wsize = 1, nmatch = 1")

We can make a grid of plots with the par() command (we’ll leave this as a black box for now) mfrow sets layout of grid mar sets margins

# set up 2 x 2 grid, make margins things
par(mfrow = c(2,2), 
    mar = c(0,0,2,1))

# plot 1: Defaults
dotPlot(P73709_vector, P73709_vector, 
        wsize = 1, 
        nmatch = 1, 
        main = "Size = 1, nmatch = 1")

# plot 2 size = 10, nmatch = 1
dotPlot(P73709_vector, P73709_vector, 
        wsize =10 , 
        nmatch =1 , 
        main = "Size = 10, nmatch = 1")

# plot 3: size = 10, nmatch = 5
dotPlot(P73709_vector, P73709_vector, 
        wsize = 10, 
        nmatch = 5, 
        main = "Size = 10, nmatch = 5")

# plot 4: size = 20, nmatch = 5
dotPlot(P73709_vector, P73709_vector, 
        wsize = 20,
        nmatch = 5,
        main = "Size = 20, nmatch = 5")

# reset par() - run this or other plots will be small!
par(mfrow = c(1,1), 
    mar = c(4,4,4,4))

Dialed in

wsize = 20, wstep = 1, nmatch = 5

# be sure to run par - re-run just in case
par(mfrow = c(1,1), 
    mar = c(4,4,4,4))


dotPlot(P73709_vector, 
        P73709_vector
        )

Diagonal dropped

Make new function

dot_plot <- function(seq1, seq2, wsize = 1, wstep = 1, nmatch = 1, col = c("white", 
    "black"), xlab = deparse(substitute(seq1)), ylab = deparse(substitute(seq2)), 
    ...) {
  
  # make sure input works
    if (nchar(seq1[1]) > 1) 
        stop("seq1 should be provided as a vector of single chars")
    if (nchar(seq2[1]) > 1) 
        stop("seq2 should be provided as a vector of single chars")
    if (wsize < 1) 
        stop("non allowed value for wsize")
    if (wstep < 1) 
        stop("non allowed value for wstep")
    if (nmatch < 1) 
        stop("non allowed value for nmatch")
    if (nmatch > wsize) 
        stop("nmatch > wsize is not allowed")
  
  # internal function
    # function defined within a function!
    mkwin <- function(seq, wsize, wstep) {
        sapply(seq(from = 1, to = length(seq) - wsize + 1, by = wstep), 
            function(i) c2s(seq[i:(i + wsize - 1)]))
    }
    wseq1 <- mkwin(seq1, wsize, wstep)
    wseq2 <- mkwin(seq2, wsize, wstep)
    if (nmatch == wsize) {
        xy <- outer(wseq1, wseq2, "==")
    }
    else {
        "%==%" <- function(x, y) colSums(sapply(x, s2c) == sapply(y, 
            s2c)) >= nmatch
        xy <- outer(wseq1, wseq2, "%==%")
    }
    
    # compile output in list
    out <- list(x = seq(from = 1, to = length(seq1), length = length(wseq1)),
                y = seq(from = 1, to = length(seq2), length = length(wseq2)),
                z = xy)
}

Use new function, save output (doesn’t autoplot)

my_dot_out <- dot_plot(P73709_vector,
                       P73709_vector, 
        wsize = 15, 
        wstep = 1,
        nmatch = 5) # threshold of matches or hits in the window out of 15, only 5 need to match

Get rid of upper triangular portion

my_dot_out$z[lower.tri(my_dot_out$z)] <- FALSE

Do some weird prep (don’t worry about it)

my_dot_out$z <- my_dot_out$z[, nrow(my_dot_out$z):1]

Plot using image() command

# seriously - it will drive you crazy if you forget about this
par(mfrow = c(1,1), 
    mar = c(4,4,4,4)) # will leave this as a black box for now

# plot with image()
image(x = my_dot_out$x, 
      y = my_dot_out$y, 
      z = my_dot_out$z)

Other examples

P24587

P24587 <- rentrez::entrez_fetch(id = "P24587",
                                db = "protein", 
                                rettype="fasta")

P24587 <- fasta_cleaner(P24587)

length(P24587)

## [1] 714

# Use [ : ] to subset 300 to 400 
dotPlot(P24587[300:400],
        P24587[300:400],
        wsize = 15, 
        wstep = 1,
        nmatch = 5)

P02840

P02840 <- rentrez::entrez_fetch(id = "P02840",db = "protein", rettype="fasta")

P02840 <- fasta_cleaner(P02840)
length(P02840)

## [1] 307

# set limit to 80 to 113
dotPlot(P02840[80:113],P02840[80:113],
        wsize = 5, 
        wstep = 1,
        nmatch = 5)

P19246

P19246 <- rentrez::entrez_fetch(id = "P19246",
                                db = "protein", 
                                rettype="fasta")

P19246 <- fasta_cleaner(P19246)
length(P19246)

## [1] 1090

# full
dotPlot(P19246,P19246,
        wsize = 1, 
        wstep = 1,
        nmatch = 1)

# set limit to 525:550
dotPlot(P19246[525:550],P19246[525:550],
        wsize = 1, 
        wstep = 1,
        nmatch = 1)

Q55837

Q55837 <- rentrez::entrez_fetch(id = "Q55837",
                                db = "protein", 
                                rettype="fasta")

Q55837 <- fasta_cleaner(Q55837)
dotPlot(Q55837,
        Q55837)