Using R to investigate gene fads2 for sequence repeates

Change the XXXXX of the title to your gene name.
Change the names and text appropriately to reflect your gene / protein.
Add the necessary code to make this script functional.
Download the PROTEIN sequence of your gene.
Adapting the code below, make 2 grids of 4 plots (8 plots total) exploring different settings for window size and the match threshold (nmatch)
Use the modified dotplot function to make a plot of your whole gene with only one part shown.
Use the modified dotplot function to plot just a focal section of your gene.
Make sure that all plots have titles relevant to your gene.

This script is based on the Shroom example used in a previous Portfolio assignment.

Preliminaries

Load packages

library(seqinr)
library(rentrez)
library(compbio4all)
library(Biostrings)

## Loading required package: BiocGenerics

## Loading required package: parallel

## 
## Attaching package: 'BiocGenerics'

## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB

## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs

## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
##     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which.max, which.min

## Loading required package: S4Vectors

## Loading required package: stats4

## 
## Attaching package: 'S4Vectors'

## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname

## Loading required package: IRanges

## Loading required package: XVector

## Loading required package: GenomeInfoDb

## 
## Attaching package: 'Biostrings'

## The following object is masked from 'package:seqinr':
## 
##     translate

## The following object is masked from 'package:base':
## 
##     strsplit

Get gene FADS2 data

Download sequence of NP_004256.1.

 fads2 <- rentrez::entrez_fetch(id = "NP_004256.1",
                                      db = "protein", 
                                      rettype="fasta")

Clean and set up sequence as vector.

 fads2_vector <- fasta_cleaner(fads2)

Confirm set up.

str(fads2_vector)

##  chr [1:444] "M" "G" "K" "G" "G" "N" "Q" "G" "E" "G" "A" "A" "E" "R" "E" ...

length(fads2_vector)

## [1] 444

Make a 2 x 2 grid of dotplots to explore effect of changing window size and nmatch

# set up 2 x 2 grid, make margins thinner
par(mfrow = c(2,2), 
    mar = c(0,0,2,1))

# plot 1:  - Defaults
dotPlot(fads2_vector,
        fads2_vector, 
        wsize = 1, 
        nmatch = 1, 
        main = "fads2 Defaults")

# plot - size = 10, nmatch = 1
dotPlot(fads2_vector, 
        fads2_vector, 
        wsize = 10, 
        nmatch = 1, 
        main = " fads2 - size = 10, nmatch = 1")

# plot 3:  - size = 10, nmatch = 5
dotPlot(fads2_vector, 
        fads2_vector, 
        wsize = 10, 
        nmatch = 5, 
        main = "fads2 - size = 10, nmatch = 5")

# plot 4: size = 20, nmatch = 5
dotPlot(fads2_vector, 
        fads2_vector, 
        wsize = 20,
        nmatch = 5,
        main = "fads2 - size = 20, nmatch = 5")

# reset par() - run this or other plots will be small!
par(mfrow = c(1,1), 
    mar = c(4,4,4,4))

Make ANOTER 2 x 2 grid of dotplots to explore effect of changing window size and nmatch

# set up 2 x 2 grid, make margins thinner
par(mfrow = c(2,2), 
    mar = c(0,0,2,1))

# plot 1: 
dotPlot(fads2_vector, 
        fads2_vector, 
        wsize = 30, 
        nmatch = 5, 
        main = "fads2 30 / 5")

# plot 2 fads2 - size = 30, nmatch = 10
dotPlot(fads2_vector, 
        fads2_vector, 
        wsize = 30, 
        nmatch = 10, 
        main = "fads2 - size = 30, nmatch = 10")

# plot 3: fads2 - size = 5, nmatch = 2
dotPlot(fads2_vector, 
        fads2_vector, 
        wsize = 5, 
        nmatch = 2, 
        main = "fads2 - size = 5, nmatch = 2")

# plot 4: size = 12, nmatch = 4
dotPlot(fads2_vector, 
        fads2_vector, 
        wsize = 12,
        nmatch = 4,
        main = "fads2 - size = 12, nmatch = 4")

# reset par() - run this or other plots will be small!
par(mfrow = c(1,1), 
    mar = c(4,4,4,4))

Best plot using normal dotplot

This is the most interesting gene fads2 dotplot.

# be sure to run par - re-run just in case
par(mfrow = c(1,1), 
    mar = c(4,4,4,4))

dotPlot(fads2_vector, 
        fads2_vector,
        wsize = 20, 
        nmatch = 5,
        main = "fads2 size = 20, nmatch = 5")

New dotplot function

The code below makes a new dotplot function

Make new function

dot_plot <- function(seq1, seq2, wsize = 1, wstep = 1, nmatch = 1, col = c("white", 
    "black"), xlab = deparse(substitute(seq1)), ylab = deparse(substitute(seq2)), 
    ...) {
  
  # make sure input works
    if (nchar(seq1[1]) > 1) 
        stop("seq1 should be provided as a vector of single chars")
    if (nchar(seq2[1]) > 1) 
        stop("seq2 should be provided as a vector of single chars")
    if (wsize < 1) 
        stop("non allowed value for wsize")
    if (wstep < 1) 
        stop("non allowed value for wstep")
    if (nmatch < 1) 
        stop("non allowed value for nmatch")
    if (nmatch > wsize) 
        stop("nmatch > wsize is not allowed")
  
  # internal function
    mkwin <- function(seq, wsize, wstep) {
        sapply(seq(from = 1, to = length(seq) - wsize + 1, by = wstep), 
            function(i) c2s(seq[i:(i + wsize - 1)]))
    }
    wseq1 <- mkwin(seq1, wsize, wstep)
    wseq2 <- mkwin(seq2, wsize, wstep)
    if (nmatch == wsize) {
        xy <- outer(wseq1, wseq2, "==")
    }
    else {
        "%==%" <- function(x, y) colSums(sapply(x, s2c) == sapply(y, 
            s2c)) >= nmatch
        xy <- outer(wseq1, wseq2, "%==%")
    }
    
    # compile output in list
    out <- list(x = seq(from = 1, to = length(seq1), length = length(wseq1)),
                y = seq(from = 1, to = length(seq2), length = length(wseq2)),
                z = xy)
}

Full-length plot

Use new function on the full-length protein, save output (doesn’t autoplot)

my_dot_out <- dot_plot(fads2_vector,
                       fads2_vector, 
        wsize = 20, 
        wstep = 5,
        nmatch = 5)

Get rid of upper triangular portion

# don't change this
my_dot_out$z[lower.tri(my_dot_out$z)] <- FALSE

Do some weird prep (don’t worry about it)

# don't change this
my_dot_out$z <- my_dot_out$z[, nrow(my_dot_out$z):1]

Plot using image() command

# seriously - it will drive you crazy if you forget about this
par(mfrow = c(1,1), 
    mar = c(4,4,4,4))

# plot with image()
image(x = my_dot_out$x, 
      y = my_dot_out$y, 
      z = my_dot_out$z,
      main = "fads2 Dotplot")

Focal subplot