Read in the broadPeak file generated by MACS2 to do various exploratory data analysis
setwd("/Users/mtang1/projects/diff_ChIP_test/results")
list.dirs(".")
## [1] "."
## [2] "./panc1H3K27ac_homer_tag_dir"
## [3] "./panc1H3k27acRep1_with_model_broad"
## [4] "./panc1H3k27acRep1_without_model_broad"
## [5] "./panc1H3k27acRep1_without_model_regular"
no_model_broad_peaks<- read.table("./panc1H3k27acRep1_without_model_broad/panc1H3k27acRep1_peaks.broadPeak", sep="\t")
colnames(no_model_broad_peaks)<- c("chrom","start","end","name","pileup", "strand", "fold", "-log10(pvalue)","-log10(qvalue)")
head(no_model_broad_peaks)
## chrom start end name pileup strand fold
## 1 chr1 713240 714754 panc1H3k27acRep1_peak_1 15 . 2.14219
## 2 chr1 715377 715523 panc1H3k27acRep1_peak_2 25 . 2.19918
## 3 chr1 740617 740763 panc1H3k27acRep1_peak_3 64 . 4.98146
## 4 chr1 761091 761376 panc1H3k27acRep1_peak_4 54 . 4.54966
## 5 chr1 762061 762835 panc1H3k27acRep1_peak_5 13 . 2.03326
## 6 chr1 838831 842471 panc1H3k27acRep1_peak_6 17 . 2.34735
## -log10(pvalue) -log10(qvalue)
## 1 2.45276 1.50432
## 2 3.24609 2.51438
## 3 8.20847 6.49046
## 4 7.07291 5.44006
## 5 2.27348 1.36058
## 6 2.76044 1.72095
model_broad_peaks<- read.table("./panc1H3k27acRep1_with_model_broad/panc1H3k27acRep1_peaks.broadPeak", sep="\t")
colnames(model_broad_peaks)<- c("chrom","start","end","name","pileup", "strand", "fold", "-log10(pvalue)","-log10(qvalue)")
## if you want to do some range based overlapping like what bedtools does, read the peak files into GRanges object and use GenomicRanges package to do overlapping.
see how Dave Tang did
peak length distribution
summary(model_broad_peaks$end - model_broad_peaks$start)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 219 644 1397 3083 3154 385200
summary(no_model_broad_peaks$end - no_model_broad_peaks$start)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 146 421 876 2053 2171 356100
library(dplyr)
##
## Attaching package: 'dplyr'
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## how many peaks are longer than 5000 ?
model_broad_peaks %>% filter((end-start) > 5000) %>% count()
## Source: local data frame [1 x 1]
##
## n
## 1 7563
no_model_broad_peaks %>% filter((end-start) > 5000) %>% count()
## Source: local data frame [1 x 1]
##
## n
## 1 5398
you can use cowplot which is in an early development stage to arrange the ggplot into grids. (plot_grid function)
I will use gridExtra
library(ggplot2)
library(gridExtra)
## Loading required package: grid
## there are some really long "peaks" which you might want to filter them out for your
## subsequent analysis
histo_1<- ggplot(model_broad_peaks %>% filter((end-start) < 5000)) + geom_histogram(aes(x=end-start), color="white", binwidth=500) + xlab("peak length") + ggtitle("peak length distribution with model building")
histo_2<- ggplot(no_model_broad_peaks %>% filter((end-start) < 5000)) + geom_histogram(aes(x=end-start), color="white", binwidth=500) + xlab("peak length") + ggtitle("peak length distribution without model building")
grid.arrange(histo_1, histo_2, ncol=1)
It looks like the peaks are shorter without model building.
Let’s make some box-plot to see the distribution
d1<- data.frame(peak_length=model_broad_peaks$end - model_broad_peaks$start, category= rep("model", nrow(model_broad_peaks)))
d2<- data.frame(peak_length=no_model_broad_peaks$end - no_model_broad_peaks$start, category= rep("no_model", nrow(no_model_broad_peaks)))
d<- rbind(d1,d2)
ylim<- boxplot.stats(d$peak_length)$stats[c(1,5)]
ggplot(d) + geom_boxplot(aes(x=category, y=peak_length)) + coord_cartesian(ylim=ylim)
## the peak_length is not normally distrituted (from the histogram), to test whether they are statistically different, we will use non-parametric test: mann-whitney U tests which is also called Wilcoxon rank-sum test
wilcox.test(peak_length ~ category, data=d, alternative="greater")
##
## Wilcoxon rank sum test with continuity correction
##
## data: peak_length by category
## W = 2173700000, p-value < 2.2e-16
## alternative hypothesis: true location shift is greater than 0
Note if you use + ylim(c(0,5000)), ggplot will throw away outliers and then determine the quantiles. To keep the outliers in calculation but not draw them, use coord_catesian
plot a scatter plot for fold change vs -log10(pvalue), if it were a gene-expression data (RNA-seq, microarray), it is the so-called volcano plot.
p1<- ggplot(model_broad_peaks, aes(x=fold, y= `-log10(pvalue)`)) + geom_point(alpha=0.1) + ggtitle("peaks with model building")
p2<- ggplot(no_model_broad_peaks, aes(x=fold, y= `-log10(pvalue)`)) + geom_point(alpha=0.1) + ggtitle("peaks without model building")
grid.arrange(p1,p2, ncol=2)