# Load the needed packages
# Biostrings needed for pattern recognition
# BSgenome is the genome of interest
# ggplot2 for plotting the graph
# plyr for transforming the dataframes
# scales is needed for log2 transformation of the y-axis
library(Biostrings)
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## 
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## 
## The following object is masked from 'package:stats':
## 
##     xtabs
## 
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, as.vector, cbind,
##     colnames, do.call, duplicated, eval, evalq, Filter, Find, get,
##     intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rep.int, rownames, sapply, setdiff, sort,
##     table, tapply, union, unique, unlist
## 
## Loading required package: IRanges
## Loading required package: XVector
library(BSgenome.Mmusculus.UCSC.mm10)
## Loading required package: BSgenome
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
library(ggplot2)
library(plyr)
## 
## Attaching package: 'plyr'
## 
## The following object is masked from 'package:XVector':
## 
##     compact
## 
## The following objects are masked from 'package:IRanges':
## 
##     desc, rename
library(scales)

mdf=data.frame();
# Perform pattern recognition one chromosome at a time for all the chromosomes in mm10
for (i in 1:length(BSgenome.Mmusculus.UCSC.mm10)){
  print(i)
  # Obtain the co-ordinates of the matching pattern as XStringViews object
  # gaps() function extracts the co-ordinates of the genomic fragments between the patterns identified
  # Extract the start and end co-ordinates as integer vectors from SStringViews object and subtract 4 from the start co-ordinates to include the pattern start co-ordinate
  # Replace the negative integers at chr start with 0
  # add the information to the whole genome dataframe (rbind)
  m<-matchPattern("CCGG", Mmusculus[[i]])
  starts <- start(gaps(m))
  ends <-  end(gaps(m))
  temp_df<-data.frame(chr=seqnames(Mmusculus)[i],start=starts-4,end=ends) #actually end = ends
  temp_df$start<-replace(temp_df$start, temp_df$start == -3, 0)
  mdf<-rbind(mdf,temp_df)
}
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 9
## [1] 10
## [1] 11
## [1] 12
## [1] 13
## [1] 14
## [1] 15
## [1] 16
## [1] 17
## [1] 18
## [1] 19
## [1] 20
## [1] 21
## [1] 22
## [1] 23
## [1] 24
## [1] 25
## [1] 26
## [1] 27
## [1] 28
## [1] 29
## [1] 30
## [1] 31
## [1] 32
## [1] 33
## [1] 34
## [1] 35
## [1] 36
## [1] 37
## [1] 38
## [1] 39
## [1] 40
## [1] 41
## [1] 42
## [1] 43
## [1] 44
## [1] 45
## [1] 46
## [1] 47
## [1] 48
## [1] 49
## [1] 50
## [1] 51
## [1] 52
## [1] 53
## [1] 54
## [1] 55
## [1] 56
## [1] 57
## [1] 58
## [1] 59
## [1] 60
## [1] 61
## [1] 62
## [1] 63
## [1] 64
## [1] 65
## [1] 66
# Calculate the width of the digested fragments from the start and end co-ordinates
# filter the reads of defined length (here >= 40 and <= 600)
# count the number of times a width size is repeated and make a df of the width and count (ddply)
mdf$width=mdf$end-mdf$start
ml<-mdf[mdf$width>39&mdf$width<601,]
counts<-ddply(ml,.(width), nrow)
write.table(ml,file="msp_digsn_mmus10.bed", row.names=F, col.names=F, sep="\t", quote=F)
# plot
p<-ggplot(counts,aes(x=width, y=V1))+geom_line()
p<-p+scale_y_continuous(trans=log2_trans())
p+coord_cartesian(xlim = c(40,600 ))

plot of chunk unnamed-chunk-1