Edits and some detail added by David Reed Hall

  1. Change the XXXX of the title to your gene name.
  2. Change the names and text appropriately to reflect your gene / protein.
  3. Add the necessary code to make this script functional.
  4. Download the PROTEIN sequence of your gene.
  5. Adapting the code below, make 2 grids of 4 plots (8 plots total) exploring different settings for window size and the match threshold (nmatch)
  6. Use the modified dotplot function to make a plot of your whole gene with only one part shown.
  7. Use the modified dotplot function to plot just a focal section of your gene.
  8. Make sure that all plots have titles relevant to your gene.

This script is based on the Shroom example used in a previous Portfolio assignment.

Preliminaries

Load packages

library(seqinr)
library(rentrez)
library(compbio4all)
library(Biostrings)
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
##     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
## Loading required package: stats4
## 
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## Loading required package: IRanges
## 
## Attaching package: 'IRanges'
## The following object is masked from 'package:grDevices':
## 
##     windows
## Loading required package: XVector
## Loading required package: GenomeInfoDb
## 
## Attaching package: 'Biostrings'
## The following object is masked from 'package:seqinr':
## 
##     translate
## The following object is masked from 'package:base':
## 
##     strsplit

Get gene PON2 data

Download sequence of PON2.

PON2_FASTA <- rentrez::entrez_fetch(id = "NP_000296.2",
                                      db = "protein", 
                                      rettype="fasta")

Clean and set up sequence as vector.

PON2_vector<- fasta_cleaner(PON2_FASTA)

Confirm set up.

str(PON2_vector)
##  chr [1:354] "M" "G" "R" "L" "V" "A" "V" "G" "L" "L" "G" "I" "A" "L" "A" ...

Make a 2 x 2 grid of dotplots to explore effect of changing window size and nmatch

# set up 2 x 2 grid, make margins thinner
par(mfrow = c(2,2), 
    mar = c(0,0,2,1))

# plot 1:  - Defaults
dotPlot(PON2_vector, PON2_vector, 
        wsize = 1, 
        nmatch = 1, 
        main = "PON2 Defaults")

# plot - size = X, nmatch = X
dotPlot(PON2_vector, PON2_vector, 
        wsize = 5, 
        nmatch = 3, 
        main = "PON2 - size = 5, nmatch = 3")

# plot 3:  - size = , nmatch = 
dotPlot(PON2_vector, PON2_vector, 
        wsize = 10, 
        nmatch = 3, 
        main = "PON2 - size = 10, nmatch = 3")

# plot 4: size = , nmatch = 
dotPlot(PON2_vector, PON2_vector, 
        wsize = 10,
        nmatch = 5,
        main = "PON2 - size = 10, nmatch = 5")

# reset par() - run this or other plots will be small!
par(mfrow = c(1,1), 
    mar = c(4,4,4,4))

Make ANOTER 2 x 2 grid of dotplots to explore effect of changing window size and nmatch

# set up 2 x 2 grid, make margins thinner
par(mfrow = c(2,2), 
    mar = c(0,0,2,1))

# plot 1: PON2 - size = 20, nmatch = 5
dotPlot(PON2_vector, PON2_vector, 
        wsize = 20, 
        nmatch = 5, 
        main = "PON2 - size = 20, nmatch = 5")

# plot 2 PON2 - size = 20, nmatch = 10
dotPlot(PON2_vector, PON2_vector, 
        wsize = 20, 
        nmatch = 10, 
        main = "PON2 - size = 20, nmatch = 10")

# plot 3: PON2 - size = 30, nmatch = 5
dotPlot(PON2_vector, PON2_vector, 
        wsize = 30, 
        nmatch = 5, 
        main = "PON2 - size = 30, nmatch = 5")

# plot 4: PON2 - size = 30, nmatch = 10
dotPlot(PON2_vector, PON2_vector, 
        wsize = 30,
        nmatch = 10,
        main = "PON2 - size = 30 , nmatch = 10")

# reset par() - run this or other plots will be small!
par(mfrow = c(1,1), 
    mar = c(4,4,4,4))

Best plot using normal dotplot

This is the most interesting gene PON2 dotplot.

# be sure to run par - re-run just in case
par(mfrow = c(1,1), 
    mar = c(4,4,4,4))

dotPlot(PON2_vector, 
        PON2_vector,
        wsize = 30, 
        nmatch = 5, )

New dotplot function

The code below makes a new dotplot function

Make new function

dot_plot <- function(seq1, seq2, wsize = 1, wstep = 1, nmatch = 1, col = c("white", 
    "black"), xlab = deparse(substitute(seq1)), ylab = deparse(substitute(seq2)), 
    ...) {
  
  # make sure input works
    if (nchar(seq1[1]) > 1) 
        stop("seq1 should be provided as a vector of single chars")
    if (nchar(seq2[1]) > 1) 
        stop("seq2 should be provided as a vector of single chars")
    if (wsize < 1) 
        stop("non allowed value for wsize")
    if (wstep < 1) 
        stop("non allowed value for wstep")
    if (nmatch < 1) 
        stop("non allowed value for nmatch")
    if (nmatch > wsize) 
        stop("nmatch > wsize is not allowed")
  
  # internal function
    mkwin <- function(seq, wsize, wstep) {
        sapply(seq(from = 1, to = length(seq) - wsize + 1, by = wstep), 
            function(i) c2s(seq[i:(i + wsize - 1)]))
    }
    wseq1 <- mkwin(seq1, wsize, wstep)
    wseq2 <- mkwin(seq2, wsize, wstep)
    if (nmatch == wsize) {
        xy <- outer(wseq1, wseq2, "==")
    }
    else {
        "%==%" <- function(x, y) colSums(sapply(x, s2c) == sapply(y, 
            s2c)) >= nmatch
        xy <- outer(wseq1, wseq2, "%==%")
    }
    
    # compile output in list
    out <- list(x = seq(from = 1, to = length(seq1), length = length(wseq1)),
                y = seq(from = 1, to = length(seq2), length = length(wseq2)),
                z = xy)
}

Full-length plot

Use new function on the full-length protein, save output (doesn’t autoplot)

my_dot_out <- dot_plot(PON2_vector,
                       PON2_vector, 
        wsize = 30, 
        wstep = 1,
        nmatch = 5)

Get rid of upper triangular portion

# don't change this
my_dot_out$z[lower.tri(my_dot_out$z)] <- FALSE

Do some weird prep (don’t worry about it)

# don't change this
my_dot_out$z <- my_dot_out$z[, nrow(my_dot_out$z):1]

Plot using image() command

# seriously - it will drive you crazy if you forget about this
par(mfrow = c(1,1), 
    mar = c(4,4,4,4))

# plot with image()
image(x = my_dot_out$x, 
      y = my_dot_out$y, 
      z = my_dot_out$z)

Focal subplot

Use new function on the part of the protein, save output (doesn’t autoplot)

my_dot_out <- dot_plot(PON2_vector[1:100],
                       PON2_vector[1:100], 
        wsize = 30, 
        wstep = 1,
        nmatch = 5)

Get rid of upper triangular portion

# don't change this
my_dot_out$z[lower.tri(my_dot_out$z)] <- FALSE

Do some weird prep (don’t worry about it)

# don't change this
my_dot_out$z <- my_dot_out$z[, nrow(my_dot_out$z):1]

Plot using image() command

# seriously - it will drive you crazy if you forget about this
par(mfrow = c(1,1), 
    mar = c(4,4,4,4))

# plot with image()
# don't change this
image(x = my_dot_out$x, 
      y = my_dot_out$y, 
      z = my_dot_out$z)