This is a script that will do a meta-gene analysis for ChIP-seq experiments that will plot the coverage of a specific region around the TSS of certain gene loci. In the case of this output, I am looking at a region 2000 bp downstream and upstream of the TSS. Output will look like this:
This project is looking at the differences in expression pattern for all genes that showed differential chromatin binding in the presence of an RNAi knockdown of two histone modifiers, gene1 and gene2, compared to an RNAi control, control. Differential binding (DB) was found using the pearl program diffReps (Shen et al, 2013).
The first step was to load in the necessary packages and files.
library(GenomicRanges)
library(GenomicAlignments)
library(rtracklayer)
library(IRanges)
library(ggplot2)
library(dplyr)
library(tidyr)
setwd("~/Desktop/Meta.Gene.Files/")
### Planaria Transcript Cordinates in Granges format
genes <- readRDS("genes.gr")
genes <- c(genes[[1]], genes[[2]], genes[[3]])
#### Read in gene1 file of DB targets
gene1 <- read.table("diffReps_output_gene1_X1.txt", header=TRUE, sep="\t")
## filter out DB regions that are more than 500 bp from TSS (H3K4me3 property)
gene1 <- gene1[which(gene1$Distance.TSS < 500),]
##Filter out DB regions that didn't have a log2FC of -1.5
gene1 <- gene1[which(gene1$log2FC < -log2(1.5)),]
#### Read in gene2 file of DB targets and filter like above
gene2 <- read.table("diffReps_output_gene2_X1.txt", header=TRUE, sep="\t")
gene2 <- gene2[which(gene2$Distance.TSS < 500),]
gene2 <- gene2[which(gene2$log2FC < -log2(1.5)),]
My script to convert aligned bam files to RPM data objects used below can be found here
###Load in premade coverage files in RLE format
###gene1
setwd("~/Desktop/Meta.Gene.Files/")
load("gene1_RNAi_1_150_rpm.RData")
load("gene1_RNAi_2_150_rpm.RData")
load("control_1_1_150_rpm.RData")
load("control_1_2_150_rpm.RData")
###gene2
load("gene2_RNAi_2_150_rpm.RData")
load("gene2_RNAi_1_150_rpm.RData")
load("control_2_1_150_rpm.RData")
load("control_2_2_150_rpm.RData")
The next step is to get the coordinates of the DB genes 2000 upstream and downstream of the TSS
##Pull out coverages of 2000 bp up and downsteam of TSS
#####################
##### gene1 #####
#####################
## Make a GRanges object of the genes of DB regions we found in diffReps
genes.in.gene1 <- genes[gene1$Gene]
start.gene1 <- start(genes.in.gene1)
## Get the Chromosome lengths to make sure the end of the gene isn't 2000 from end of Chrom
## The reason to do this is if the gene is on the - strand 2000 upstream of TSS, the gene
## must be greator than 2000 away from chrom length to match
chr.len.gene1 <- seqlengths(seqinfo(genes.in.gene1))
is.close.end <- end(genes.in.gene1) + 2000 > chr.len.gene1[as.character(seqnames(genes.in.gene1))]
## Make sure we have 2000 upstream of gene for genes on "+" strand (start)
## Make sure we have 2000 upstream of gene for genes of "-" strand !(is.close.end)
genes.in.gene1 <- genes.in.gene1[start.gene1 > 2000 & !(is.close.end)]
## Get 4000 bp of coverage around the TSS, 2000 upstream & 2000 downstream
genes.in.gene1 <- promoters(genes.in.gene1, upstream = 2000, downstream = 2000)
## 389 genes that fit this cutoff
## Now do the same for gene2 DB regions
#####################
##### gene2 #####
#####################
genes.in.gene2 <- genes[gene2$Gene]
start.gene2 <- start(genes.in.gene2)
chr.len.gene2 <- seqlengths(seqinfo(genes.in.gene2))
is.close.end <- end(genes.in.gene2) + 2000 > chr.len.gene2[as.character(seqnames(genes.in.gene2))]
genes.in.gene2 <- genes.in.gene2[start.gene2 > 2000 & !(is.close.end)]
genes.in.gene2 <- promoters(genes.in.gene2, upstream = 2000, downstream = 2000)
## 244 genes that fil this cuttoff
The next step is to predefine a function that can extract a coverage array for each gene for the regions defined above. Then, the function will combine them to make a matrix of all genes’ coverage.
###################################################################
#### Coverage of Gene 2000 upstream and 2000 Downstream of TSS ####
###################################################################
metagene <- function(file,file1, index, genes) {
## Find Chromosome file gene is on
ch <- seqnames(genes[index])
## Get the coverage array at each basepair for the gene (2000 Downstream & 2000 upstream of TSS)
view <- Views(file[ch][[1]], ranges(genes[index]))
view.f1 <- Views(file1[ch][[1]], ranges(genes[index]))
## Change the class of the gene to numeric and then if the gene is in the reverse orientation,
## reverse the orientation of the array
cov <- as.numeric(view[1][[1]])
if (as.logical(strand(genes[index]) == "-")) {
cov <- rev(cov)
}
cov.mtx <- sapply(1:4000, function(x) {cov[x]})
## Same For Replicate 2
cov.f1 <- as.numeric(view.f1[1][[1]])
if (as.logical(strand(genes[index]) == "-")) {
cov.f1 <- rev(cov.f1)
}
cov.mtx.f1 <- sapply(1:4000, function(x) {cov.f1[x]})
## Get the average coverage for both replicates at each basepair
overall.mean <- ceiling((cov.mtx + cov.mtx.f1)/2)
## Convert it into a matrix in an outputable way
cov.frame <- t(matrix(overall.mean))
## have the rownames for each coverage matrix be the associated gene
row.names(cov.frame) <- names(genes[index])
## Return the coverage frame
return(cov.frame)
}
Next, I lapply-d the function to each list of genes defined above. For each graph, one curve will be made for the control dataset (control) and one will be made for the RNAi knockdown dataset (gene1 or gene2) to plot the differences in chromatin profile around the TSS for the DB genes found in each experiment.
###### gene1 #######
gene1.list <- lapply(1:length(genes.in.gene1), metagene, file=gene1_1, file1=gene1_2, genes=genes.in.gene1)
gene1.lis <- do.call(rbind, gene1.list)
control.gene1.list <- lapply(1:length(genes.in.gene1), metagene, file=control1_1, file1=control1_2, genes=genes.in.gene1)
control.gene1.lis <- do.call(rbind, control.gene1.list)
###### gene2 #######
gene2.list <- lapply(1:length(genes.in.gene2), metagene, file=gene2_1, file1=gene2_2, genes=genes.in.gene2)
gene2.lis <- do.call(rbind, gene2.list)
control.gene2.list <- lapply(1:length(genes.in.gene2), metagene, file=control_1, file1=control_2, genes=genes.in.gene2)
control.gene2.lis <- do.call(rbind, control.gene2.list)
Additionaly, we need to define a plot function that reshapes the data and then prints it in a nice ggplot output.
####################
#### ggplot def ####
####################
### Make a column standard deviation function to compute the standard deviation for SE error bars
colstd <- function(x) {
sapply(1:4000, function(y) { sd(x[,y])})
}
ggplot_avg_gene1 <- function(y, z) {
## Make a dataframe of the average coverage across all genes at each basepair
## for gene1 and gene2
df <- data.frame(cbind(colMeans(y), colMeans(z)))
colnames(df) <- c("gene1", "control")
## Reshape the dataframe so it is 'ggplot'-able
df.gg <- df %>% gather(Expression, RPM, gene1:control)
df.gg$Index <- rep(1:4000, 2)
## Add a column for standard error
df.gg$se <- c(colstd(y)/sqrt(nrow(y)), colstd(z)/sqrt(nrow(z)))
##ggplot function
ggplot(df.gg, aes(x=Index, y=RPM, Group=factor(Expression))) +
geom_ribbon(aes(ymin=RPM-se, ymax=RPM+se),
alpha=0.2) +
geom_line(aes(colour=factor(Expression))) +
scale_colour_manual(values = c("#F8766D", "grey10")) + ###7CAE00 ###F8766D
scale_x_continuous(breaks = c(0, 1000, 2000, 3000, 4000),
labels = c("-2000", "-1000", "TSS", "+1000", "+2000")) +
geom_vline(xintercept=2000, colour="red", linetype="longdash") +
labs(colour="RNAi") + xlab("Position(BP)") +
ggtitle("Metagene plot of RNAi Affected Genes") +
theme_bw()
}
###plot for gene2
ggplot_avg_gene2 <- function(y, z) {
df <- data.frame(cbind(colMeans(y), colMeans(z)))
colnames(df) <- c("gene2", "control")
df.gg <- df %>% gather(Expression, RPM, gene2:control)
df.gg$Index <- rep(1:4000, 2)
df.gg$se <- c(colstd(y)/sqrt(nrow(y)), colstd(z)/sqrt(nrow(z)))
ggplot(df.gg, aes(x=Index, y=RPM, Group=factor(Expression))) +
geom_ribbon(aes(ymin=RPM-se, ymax=RPM+se),
alpha=0.2) +
geom_line(aes(colour=factor(Expression))) +
scale_colour_manual(values = c("#7CAE00", "grey10")) +
scale_x_continuous(breaks = c(0, 1000, 2000, 3000, 4000),
labels = c("-2000", "-1000", "TSS", "+1000", "+2000")) +
geom_vline(xintercept=2000, colour="red", linetype="longdash") +
labs(colour="RNAi") + xlab("Position (BP)") +
ggtitle("Metagene plot of RNAi Affected Genes") +
theme_bw()
}
Finally, all we need to do is plot the final results using the defined ggplot function.
ggplot_avg_gene1(gene1.lis, control.gene1.lis)
ggplot_avg_gene2(gene2.lis, control.gene2.lis)