counts<- read.csv("/Users/mtang1/projects/diff_ChIP_test/results/broad_peaks_all/counts_table.csv",
                  header = F)

## give some meaniningful names
names(counts)<- c("peak_id", "chr", "start", "end", "strand", "length",
                  "subRead_Mcf7_input", "subRead_Mcf7Rep1", "subRead_Mcf7Rep2",
                  "subRead_Panc1_input", "subRead_Panc1Rep1", "subRead_Panc1Rep2",
                  "bedtools_Mcf7_input", "bedtools_Mcf7Rep1", "bedtools_Mcf7Rep2",
                  "bedtools_Panc1_input", "bedtools_Panc1Rep1", "bedtools_Panc1Rep2")

plot with ggplot2

library(ggplot2)

ggplot(counts) + geom_point(aes(x=log2(subRead_Mcf7Rep1 +1 ), y=log2(bedtools_Mcf7Rep1 +1))) +
        geom_abline(intercept = 0, slope = 1, col="red")

Subread counts and bedtools counts are very similar. see correlation below.

cor(log2(counts$subRead_Mcf7Rep1+1), log2(counts$bedtools_Mcf7Rep1+1))
## [1] 0.999985

counts between replicates:

ggplot(counts) + geom_point(aes(x=log2(subRead_Mcf7Rep1 +1 ), y=log2(subRead_Mcf7Rep2 +1))) +
        geom_abline(intercept = 0, slope = 1, col="red")

ggplot(counts) + geom_point(aes(x=log2(subRead_Panc1Rep1 +1 ), y=log2(subRead_Panc1Rep2 +1))) +
        geom_abline(intercept = 0, slope = 1, col="red")

counts between different cell lines

ggplot(counts) + geom_point(aes(x=log2(subRead_Mcf7Rep1 +1 ), y=log2(subRead_Panc1Rep1 +1))) +
        geom_abline(intercept = 0, slope = 1, col="red")

It is more variable between cell lines. See paper Sequence and chromatin determinants of cell-type–specific transcription factor binding Fig5B.

pearson correlation of different counts

cor(log2(counts$subRead_Mcf7Rep1+1), log2(counts$subRead_Mcf7Rep2+1))
## [1] 0.9742128
cor(log2(counts$subRead_Panc1Rep1+1), log2(counts$subRead_Panc1Rep2+1))
## [1] 0.960262
cor(log2(counts$subRead_Mcf7Rep1+1), log2(counts$subRead_Panc1Rep1+1))
## [1] 0.5210859

PCA analysis

subread_counts<- counts[,7:12]

X<- subread_counts
sv = svd(t(X))
U = sv$u
V = sv$v
D = sv$d

Z<- t(X) %*% V

cols = as.numeric(as.factor(colnames(X)))
plot(Z[,1], Z[,2], type ="n", xlab="PC1", ylab="PC2")
text(Z[,1], Z[,2], colnames(X), col=cols)

MCF7 replicates cluster together, Panc1 replicates cluster together and inputs cluster together.