This script is based on the Shroom example used in a previous Portfolio assignment.
library(seqinr)
library(rentrez)
library(compbio4all)
library(Biostrings)
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind, colnames,
## dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
## union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
## Loading required package: stats4
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
## Loading required package: IRanges
## Loading required package: XVector
## Loading required package: GenomeInfoDb
##
## Attaching package: 'Biostrings'
## The following object is masked from 'package:seqinr':
##
## translate
## The following object is masked from 'package:base':
##
## strsplit
Download sequence of NP_004256.1.
fads2 <- rentrez::entrez_fetch(id = "NP_004256.1",
db = "protein",
rettype="fasta")
Clean and set up sequence as vector.
fads2_vector <- fasta_cleaner(fads2)
Confirm set up.
str(fads2_vector)
## chr [1:444] "M" "G" "K" "G" "G" "N" "Q" "G" "E" "G" "A" "A" "E" "R" "E" ...
length(fads2_vector)
## [1] 444
Make a 2 x 2 grid of dotplots to explore effect of changing window size and nmatch
# set up 2 x 2 grid, make margins thinner
par(mfrow = c(2,2),
mar = c(0,0,2,1))
# plot 1: - Defaults
dotPlot(fads2_vector,
fads2_vector,
wsize = 1,
nmatch = 1,
main = "fads2 Defaults")
# plot - size = 10, nmatch = 1
dotPlot(fads2_vector,
fads2_vector,
wsize = 10,
nmatch = 1,
main = " fads2 - size = 10, nmatch = 1")
# plot 3: - size = 10, nmatch = 5
dotPlot(fads2_vector,
fads2_vector,
wsize = 10,
nmatch = 5,
main = "fads2 - size = 10, nmatch = 5")
# plot 4: size = 20, nmatch = 5
dotPlot(fads2_vector,
fads2_vector,
wsize = 20,
nmatch = 5,
main = "fads2 - size = 20, nmatch = 5")
# reset par() - run this or other plots will be small!
par(mfrow = c(1,1),
mar = c(4,4,4,4))
Make ANOTER 2 x 2 grid of dotplots to explore effect of changing window size and nmatch
# set up 2 x 2 grid, make margins thinner
par(mfrow = c(2,2),
mar = c(0,0,2,1))
# plot 1:
dotPlot(fads2_vector,
fads2_vector,
wsize = 30,
nmatch = 5,
main = "fads2 30 / 5")
# plot 2 fads2 - size = 30, nmatch = 10
dotPlot(fads2_vector,
fads2_vector,
wsize = 30,
nmatch = 10,
main = "fads2 - size = 30, nmatch = 10")
# plot 3: fads2 - size = 5, nmatch = 2
dotPlot(fads2_vector,
fads2_vector,
wsize = 5,
nmatch = 2,
main = "fads2 - size = 5, nmatch = 2")
# plot 4: size = 12, nmatch = 4
dotPlot(fads2_vector,
fads2_vector,
wsize = 12,
nmatch = 4,
main = "fads2 - size = 12, nmatch = 4")
# reset par() - run this or other plots will be small!
par(mfrow = c(1,1),
mar = c(4,4,4,4))
This is the most interesting gene fads2 dotplot.
# be sure to run par - re-run just in case
par(mfrow = c(1,1),
mar = c(4,4,4,4))
dotPlot(fads2_vector,
fads2_vector,
wsize = 20,
nmatch = 5,
main = "fads2 size = 20, nmatch = 5")
The code below makes a new dotplot function
Make new function
dot_plot <- function(seq1, seq2, wsize = 1, wstep = 1, nmatch = 1, col = c("white",
"black"), xlab = deparse(substitute(seq1)), ylab = deparse(substitute(seq2)),
...) {
# make sure input works
if (nchar(seq1[1]) > 1)
stop("seq1 should be provided as a vector of single chars")
if (nchar(seq2[1]) > 1)
stop("seq2 should be provided as a vector of single chars")
if (wsize < 1)
stop("non allowed value for wsize")
if (wstep < 1)
stop("non allowed value for wstep")
if (nmatch < 1)
stop("non allowed value for nmatch")
if (nmatch > wsize)
stop("nmatch > wsize is not allowed")
# internal function
mkwin <- function(seq, wsize, wstep) {
sapply(seq(from = 1, to = length(seq) - wsize + 1, by = wstep),
function(i) c2s(seq[i:(i + wsize - 1)]))
}
wseq1 <- mkwin(seq1, wsize, wstep)
wseq2 <- mkwin(seq2, wsize, wstep)
if (nmatch == wsize) {
xy <- outer(wseq1, wseq2, "==")
}
else {
"%==%" <- function(x, y) colSums(sapply(x, s2c) == sapply(y,
s2c)) >= nmatch
xy <- outer(wseq1, wseq2, "%==%")
}
# compile output in list
out <- list(x = seq(from = 1, to = length(seq1), length = length(wseq1)),
y = seq(from = 1, to = length(seq2), length = length(wseq2)),
z = xy)
}
Use new function on the full-length protein, save output (doesn’t autoplot)
my_dot_out <- dot_plot(fads2_vector,
fads2_vector,
wsize = 20,
wstep = 5,
nmatch = 5)
Get rid of upper triangular portion
# don't change this
my_dot_out$z[lower.tri(my_dot_out$z)] <- FALSE
Do some weird prep (don’t worry about it)
# don't change this
my_dot_out$z <- my_dot_out$z[, nrow(my_dot_out$z):1]
Plot using image() command
# seriously - it will drive you crazy if you forget about this
par(mfrow = c(1,1),
mar = c(4,4,4,4))
# plot with image()
image(x = my_dot_out$x,
y = my_dot_out$y,
z = my_dot_out$z,
main = "fads2 Dotplot")
Use new function on the full-length protein, save output (doesn’t autoplot)
my_dot_out <- dot_plot(fads2_vector[100:300],
fads2_vector[100:300],
wsize = 25,
wstep = 1,
nmatch = 5)
Get rid of upper triangular portion
# don't change this
my_dot_out$z[lower.tri(my_dot_out$z)] <- FALSE
Do some weird prep (don’t worry about it)
# don't change this
my_dot_out$z <- my_dot_out$z[, nrow(my_dot_out$z):1]
Plot using image() command
# seriously - it will drive you crazy if you forget about this
par(mfrow = c(1,1),
mar = c(4,4,4,4))
# plot with image()
# don't change this
image(x = my_dot_out$x,
y = my_dot_out$y,
z = my_dot_out$z,
main = "fads2 100-300 dotplot")