Read in the broadPeak file generated by MACS2 to do various exploratory data analysis

setwd("/Users/mtang1/projects/diff_ChIP_test/results")
list.dirs(".")
## [1] "."                                       
## [2] "./panc1H3K27ac_homer_tag_dir"            
## [3] "./panc1H3k27acRep1_with_model_broad"     
## [4] "./panc1H3k27acRep1_without_model_broad"  
## [5] "./panc1H3k27acRep1_without_model_regular"
no_model_broad_peaks<- read.table("./panc1H3k27acRep1_without_model_broad/panc1H3k27acRep1_peaks.broadPeak", sep="\t")

colnames(no_model_broad_peaks)<- c("chrom","start","end","name","pileup", "strand", "fold", "-log10(pvalue)","-log10(qvalue)")

head(no_model_broad_peaks)
##   chrom  start    end                    name pileup strand    fold
## 1  chr1 713240 714754 panc1H3k27acRep1_peak_1     15      . 2.14219
## 2  chr1 715377 715523 panc1H3k27acRep1_peak_2     25      . 2.19918
## 3  chr1 740617 740763 panc1H3k27acRep1_peak_3     64      . 4.98146
## 4  chr1 761091 761376 panc1H3k27acRep1_peak_4     54      . 4.54966
## 5  chr1 762061 762835 panc1H3k27acRep1_peak_5     13      . 2.03326
## 6  chr1 838831 842471 panc1H3k27acRep1_peak_6     17      . 2.34735
##   -log10(pvalue) -log10(qvalue)
## 1        2.45276        1.50432
## 2        3.24609        2.51438
## 3        8.20847        6.49046
## 4        7.07291        5.44006
## 5        2.27348        1.36058
## 6        2.76044        1.72095
model_broad_peaks<- read.table("./panc1H3k27acRep1_with_model_broad/panc1H3k27acRep1_peaks.broadPeak", sep="\t")

colnames(model_broad_peaks)<- c("chrom","start","end","name","pileup", "strand", "fold", "-log10(pvalue)","-log10(qvalue)")

## if you want to do some range based overlapping like what bedtools does, read the peak files into GRanges object and use GenomicRanges package to do overlapping.

see how Dave Tang did

peak length distribution

summary(model_broad_peaks$end - model_broad_peaks$start)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     219     644    1397    3083    3154  385200
summary(no_model_broad_peaks$end - no_model_broad_peaks$start)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     146     421     876    2053    2171  356100
library(dplyr)
## 
## Attaching package: 'dplyr'
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## how many peaks are longer than 5000 ?
model_broad_peaks %>% filter((end-start) > 5000) %>% count()
## Source: local data frame [1 x 1]
## 
##      n
## 1 7563
no_model_broad_peaks %>% filter((end-start) > 5000) %>% count()
## Source: local data frame [1 x 1]
## 
##      n
## 1 5398

you can use cowplot which is in an early development stage to arrange the ggplot into grids. (plot_grid function)

I will use gridExtra

library(ggplot2)
library(gridExtra)
## Loading required package: grid
## there are some really long "peaks" which you might want to filter them out for your
## subsequent analysis

histo_1<- ggplot(model_broad_peaks %>% filter((end-start) < 5000)) + geom_histogram(aes(x=end-start), color="white", binwidth=500) + xlab("peak length") + ggtitle("peak length distribution with model building")

histo_2<- ggplot(no_model_broad_peaks %>% filter((end-start) < 5000)) + geom_histogram(aes(x=end-start), color="white", binwidth=500) + xlab("peak length") + ggtitle("peak length distribution without model building")

grid.arrange(histo_1, histo_2, ncol=1)

It looks like the peaks are shorter without model building.

Let’s make some box-plot to see the distribution

d1<- data.frame(peak_length=model_broad_peaks$end - model_broad_peaks$start, category= rep("model", nrow(model_broad_peaks)))

d2<- data.frame(peak_length=no_model_broad_peaks$end - no_model_broad_peaks$start, category= rep("no_model", nrow(no_model_broad_peaks)))

d<- rbind(d1,d2)

ylim<- boxplot.stats(d$peak_length)$stats[c(1,5)]
ggplot(d) + geom_boxplot(aes(x=category, y=peak_length)) + coord_cartesian(ylim=ylim)

## the peak_length is not normally distrituted (from the histogram), to test whether they are statistically different, we will use non-parametric test: mann-whitney U tests which is also called Wilcoxon rank-sum test

wilcox.test(peak_length ~ category, data=d, alternative="greater")
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  peak_length by category
## W = 2173700000, p-value < 2.2e-16
## alternative hypothesis: true location shift is greater than 0

Note if you use + ylim(c(0,5000)), ggplot will throw away outliers and then determine the quantiles. To keep the outliers in calculation but not draw them, use coord_catesian

plot a scatter plot for fold change vs -log10(pvalue), if it were a gene-expression data (RNA-seq, microarray), it is the so-called volcano plot.

p1<- ggplot(model_broad_peaks, aes(x=fold, y= `-log10(pvalue)`)) + geom_point(alpha=0.1) + ggtitle("peaks with model building")
p2<- ggplot(no_model_broad_peaks, aes(x=fold, y= `-log10(pvalue)`)) + geom_point(alpha=0.1) + ggtitle("peaks without model building")

grid.arrange(p1,p2, ncol=2)