In this exercise we’ll look at a sequence with known tandem repeats. We’ll load the data, explore it in R, then use the dotPlot() function to make various dotplots to see how changing settings for dotPlots() help make repeat patterns stand out.
Add the necessary code to make this script functional.
library(seqinr)
library(rentrez)
library(compbio4all)
library(Biostrings)
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind, colnames,
## dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
## union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
## Loading required package: stats4
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
## Loading required package: IRanges
## Loading required package: XVector
## Loading required package: GenomeInfoDb
##
## Attaching package: 'Biostrings'
## The following object is masked from 'package:seqinr':
##
## translate
## The following object is masked from 'package:base':
##
## strsplit
## CRAN PACKAGES
# downloaded using install.packages()
# install.packages("seqinr")
### BiocManager - CRAN package to download
##### Bioconductor packages
# requires BiocManager
# install.packages("BiocManager")
## BioConductor packages
### downloaded with BiocManager::install(), NOT install.packages()
# BiocManager::install("Biostrings")
## GitHub packages
### requires devtools package and its function install_github()
# library(devtools)
# devtools::install_github("brouwern/combio4all")
Download sequence P73709
P73709_FASTA <- rentrez::entrez_fetch(id ="P73709" ,
db = "protein",
rettype="fasta")
Clean and set up sequence as vector.
NOTE: no arguments besides sequence passed to fasta_cleaner() - we do this differently for pairwise alignment
P73709_vector <- fasta_cleaner(P73709_FASTA)
Set up as 1 continuous string
P73709_string <- fasta_cleaner(P73709_FASTA,
parse = F)
Compare structure of each type
str(P73709_FASTA)
## chr ">sp|P73709.1|Y1819_SYNY3 RecName: Full=Uncharacterized protein slr1819\nMEAKELVQRYRNGETLFTGLKLPGINLEAADLIGIVLNE"| __truncated__
str(P73709_vector)
## chr [1:331] "M" "E" "A" "K" "E" "L" "V" "Q" "R" "Y" "R" "N" "G" "E" "T" ...
str(P73709_string) #nchar 331 length=1
## chr "MEAKELVQRYRNGETLFTGLKLPGINLEAADLIGIVLNEADLRGANLLFCYLNRANLGQANLVAANLSGASLNQADLAGADLRSANFHGAMLQGAILRDSDMTLATLQDTN"| __truncated__
Takes data in STRING form!
align <- pairwiseAlignment(P73709_string,
P73709_string,
type = "global")
Look at PID
pid(align) #100
## [1] 100
str()
P73709_FASTA
## [1] ">sp|P73709.1|Y1819_SYNY3 RecName: Full=Uncharacterized protein slr1819\nMEAKELVQRYRNGETLFTGLKLPGINLEAADLIGIVLNEADLRGANLLFCYLNRANLGQANLVAANLSGA\nSLNQADLAGADLRSANFHGAMLQGAILRDSDMTLATLQDTNLIGADLRGADLSGATLTGACLRGANMRQE\nKKGYYTNLQAAILGRADLQGANMKGVDLSRADLSYANLKEANLRDVDLRKADLSYANLKGALLTDANLSG\nAKLNGADLQNANLMRAKISEAEMTAVNCQGAIMTHVNLNRTNLTGSNLSFTRMNSADLSRANLTKANLQE\nAELIEAFFARANLTEANFINANLVRADLMSANMVGADFQGATMPDGQVRHH\n\n"
is(P73709_FASTA)
## [1] "character" "vector"
## [3] "data.frameRowLabels" "SuperClassMethod"
## [5] "character_OR_connection" "character_OR_NULL"
## [7] "atomic" "EnumerationValue"
## [9] "vector_OR_Vector" "vector_OR_factor"
length(P73709_FASTA) #1
## [1] 1
[ ]
P73709_vector
## [1] "M" "E" "A" "K" "E" "L" "V" "Q" "R" "Y" "R" "N" "G" "E" "T" "L" "F" "T"
## [19] "G" "L" "K" "L" "P" "G" "I" "N" "L" "E" "A" "A" "D" "L" "I" "G" "I" "V"
## [37] "L" "N" "E" "A" "D" "L" "R" "G" "A" "N" "L" "L" "F" "C" "Y" "L" "N" "R"
## [55] "A" "N" "L" "G" "Q" "A" "N" "L" "V" "A" "A" "N" "L" "S" "G" "A" "S" "L"
## [73] "N" "Q" "A" "D" "L" "A" "G" "A" "D" "L" "R" "S" "A" "N" "F" "H" "G" "A"
## [91] "M" "L" "Q" "G" "A" "I" "L" "R" "D" "S" "D" "M" "T" "L" "A" "T" "L" "Q"
## [109] "D" "T" "N" "L" "I" "G" "A" "D" "L" "R" "G" "A" "D" "L" "S" "G" "A" "T"
## [127] "L" "T" "G" "A" "C" "L" "R" "G" "A" "N" "M" "R" "Q" "E" "K" "K" "G" "Y"
## [145] "Y" "T" "N" "L" "Q" "A" "A" "I" "L" "G" "R" "A" "D" "L" "Q" "G" "A" "N"
## [163] "M" "K" "G" "V" "D" "L" "S" "R" "A" "D" "L" "S" "Y" "A" "N" "L" "K" "E"
## [181] "A" "N" "L" "R" "D" "V" "D" "L" "R" "K" "A" "D" "L" "S" "Y" "A" "N" "L"
## [199] "K" "G" "A" "L" "L" "T" "D" "A" "N" "L" "S" "G" "A" "K" "L" "N" "G" "A"
## [217] "D" "L" "Q" "N" "A" "N" "L" "M" "R" "A" "K" "I" "S" "E" "A" "E" "M" "T"
## [235] "A" "V" "N" "C" "Q" "G" "A" "I" "M" "T" "H" "V" "N" "L" "N" "R" "T" "N"
## [253] "L" "T" "G" "S" "N" "L" "S" "F" "T" "R" "M" "N" "S" "A" "D" "L" "S" "R"
## [271] "A" "N" "L" "T" "K" "A" "N" "L" "Q" "E" "A" "E" "L" "I" "E" "A" "F" "F"
## [289] "A" "R" "A" "N" "L" "T" "E" "A" "N" "F" "I" "N" "A" "N" "L" "V" "R" "A"
## [307] "D" "L" "M" "S" "A" "N" "M" "V" "G" "A" "D" "F" "Q" "G" "A" "T" "M" "P"
## [325] "D" "G" "Q" "V" "R" "H" "H"
P73709_vector[1] # "M"
## [1] "M"
length(P73709_vector) #331
## [1] 331
[x:y]
P73709_vector[1:50] #vector of single chars of the seq
## [1] "M" "E" "A" "K" "E" "L" "V" "Q" "R" "Y" "R" "N" "G" "E" "T" "L" "F" "T" "G"
## [20] "L" "K" "L" "P" "G" "I" "N" "L" "E" "A" "A" "D" "L" "I" "G" "I" "V" "L" "N"
## [39] "E" "A" "D" "L" "R" "G" "A" "N" "L" "L" "F" "C"
length()
length(P73709_FASTA) #1
## [1] 1
length(P73709_vector) #331
## [1] 331
length( P73709_vector[1:50] )
## [1] 50
# 50
table()
#create table, R figure's out what's relevant
table(P73709_vector)
## P73709_vector
## A C D E F G H I K L M N P Q R S T V Y
## 53 3 21 13 8 27 4 10 11 52 12 34 2 13 20 15 17 10 6
# output - frequency of each char in the vector
Note orientation. Any strong diagonals? Diagonal from bottom left to top right
seqinr::dotPlot(P73709_vector, P73709_vector)
Default is exact match (binary):
seqinr::dotPlot(P73709_vector, P73709_vector
,
wsize = 1,
wstep = 1,
nmatch = 1)
Don’t vary wstep
seqinr::dotPlot(P73709_vector, P73709_vector,
wsize = 1,
nmatch = 1)
main = … sets a title
I’ll use "Default: wsize = 1, nmatch = 1
seqinr::dotPlot(P73709_vector, P73709_vector
,
wsize = 1,
nmatch = 1,
main = "DotPlot of P73709 x P73709")
We can make a grid of plots with the par() command (we’ll leave this as a black box for now)
mfrow sets layout of grid mar sets margins
# set up 2 x 2 grid, make margins things
par(mfrow = c(2,2),
mar = c(0,0,2,1))
# plot 1: Defaults
seqinr::dotPlot(P73709_vector, P73709_vector,
wsize = 1,
nmatch = 1,
main = "size=1, num match=1")
# plot 2 size = 10, nmatch = 1
seqinr::dotPlot(P73709_vector, P73709_vector,
wsize = 10,
nmatch = 1,
main = "size = 10, nmatch = 1")
# plot 3: size = 10, nmatch = 5
seqinr::dotPlot(P73709_vector, P73709_vector,
wsize = 10,
nmatch = 5,
main = "size = 10, nmatch = 5")
# plot 4: size = 20, nmatch = 5
seqinr::dotPlot(P73709_vector, P73709_vector,
wsize = 20,
nmatch = 5,
main = "size = 20, nmatch = 5")
# reset par() - run this or other plots will be small!
par(mfrow = c(1,1),
mar = c(4,4,4,4))
wsize = 20, wstep = 1, nmatch = 5
# be sure to run par - re-run just in case
par(mfrow = c(1,1),
mar = c(4,4,4,4))
seqinr::dotPlot(P73709_vector,
P73709_vector,
wsize=20,
wstep=1,
nmatch=5
)
Make new function
dot_plot <- function(seq1, seq2, wsize = 1, wstep = 1, nmatch = 1, col = c("white",
"black"), xlab = deparse(substitute(seq1)), ylab = deparse(substitute(seq2)),
...) {
# make sure input works
if (nchar(seq1[1]) > 1)
stop("seq1 should be provided as a vector of single chars")
if (nchar(seq2[1]) > 1)
stop("seq2 should be provided as a vector of single chars")
if (wsize < 1)
stop("non allowed value for wsize")
if (wstep < 1)
stop("non allowed value for wstep")
if (nmatch < 1)
stop("non allowed value for nmatch")
if (nmatch > wsize)
stop("nmatch > wsize is not allowed")
# internal function
mkwin <- function(seq, wsize, wstep) {
sapply(seq(from = 1, to = length(seq) - wsize + 1, by = wstep),
function(i) c2s(seq[i:(i + wsize - 1)]))
}
wseq1 <- mkwin(seq1, wsize, wstep)
wseq2 <- mkwin(seq2, wsize, wstep)
if (nmatch == wsize) {
xy <- outer(wseq1, wseq2, "==")
}
else {
"%==%" <- function(x, y) colSums(sapply(x, s2c) == sapply(y,
s2c)) >= nmatch
xy <- outer(wseq1, wseq2, "%==%")
}
# compile output in list
out <- list(x = seq(from = 1, to = length(seq1), length = length(wseq1)),
y = seq(from = 1, to = length(seq2), length = length(wseq2)),
z = xy)
}
Use new function, save output (doesn’t autoplot)
my_dot_out <- dot_plot(P73709_vector,
P73709_vector,
wsize = 15,
wstep = 1,
nmatch = 5)
Get rid of upper triangular portion
my_dot_out$z[lower.tri(my_dot_out$z)] <- FALSE
Do some weird prep (don’t worry about it)
my_dot_out$z <- my_dot_out$z[, nrow(my_dot_out$z):1]
Plot using image() command
# seriously - it will drive you crazy if you forget about this
par(mfrow = c(1,1),
mar = c(4,4,4,4))
# plot with image()
image(x = my_dot_out$x,
y = my_dot_out$y,
z = my_dot_out$z)
P24587 <- rentrez::entrez_fetch(id = "P24587",
db = "protein",
rettype="fasta")
P24587 <- fasta_cleaner(P24587)
length(P24587)
## [1] 714
# Use [ : ] to subset 300 to 400
dotPlot(P24587[300:400],
P24587[300:400],
wsize = 15,
wstep = 1,
nmatch = 5)
P02840 <- rentrez::entrez_fetch(id = "P02840",db = "protein", rettype="fasta")
P02840 <- fasta_cleaner(P02840)
length(P02840)
## [1] 307
# set limit to 80 to 113
dotPlot(P02840[80:113],P02840[80:113],
wsize = 5,
wstep = 1,
nmatch = 5)
P19246 <- rentrez::entrez_fetch(id = "P19246",
db = "protein",
rettype="fasta")
P19246 <- fasta_cleaner(P19246)
length(P19246)
## [1] 1090
# full
dotPlot(P19246,P19246,
wsize = 1,
wstep = 1,
nmatch = 1)
# set limit to 525:550
dotPlot(P19246[525:550],P19246[525:550],
wsize = 1,
wstep = 1,
nmatch = 1)
Q55837 <- rentrez::entrez_fetch(id = "Q55837",
db = "protein",
rettype="fasta")
Q55837 <- fasta_cleaner(Q55837)
dotPlot(Q55837,
Q55837)