Meta-Gene Analysis for ChIP-seq Data

Overview

This is a script that will do a meta-gene analysis for ChIP-seq experiments that will plot the coverage of a specific region around the TSS of certain gene loci. In the case of this output, I am looking at a region 2000 bp downstream and upstream of the TSS. Output will look like this:

Project Details

This project is looking at the differences in expression pattern for all genes that showed differential chromatin binding in the presence of an RNAi knockdown of two histone modifiers, gene1 and gene2, compared to an RNAi control, control. Differential binding (DB) was found using the pearl program diffReps (Shen et al, 2013).

Loading In Files

The first step was to load in the necessary packages and files.

library(GenomicRanges)
library(GenomicAlignments)
library(rtracklayer)
library(IRanges)
library(ggplot2)
library(dplyr)
library(tidyr)

setwd("~/Desktop/Meta.Gene.Files/")
### Planaria Transcript Cordinates in Granges format
genes <- readRDS("genes.gr")
genes <- c(genes[[1]], genes[[2]], genes[[3]])


#### Read in gene1 file of DB targets
gene1 <- read.table("diffReps_output_gene1_X1.txt", header=TRUE, sep="\t")
## filter out DB regions that are more than 500 bp from TSS (H3K4me3 property)
gene1 <- gene1[which(gene1$Distance.TSS < 500),]
##Filter out DB regions that didn't have a log2FC of -1.5 
gene1 <- gene1[which(gene1$log2FC < -log2(1.5)),]

#### Read in gene2 file of DB targets and filter like above
gene2 <- read.table("diffReps_output_gene2_X1.txt", header=TRUE, sep="\t")
gene2 <- gene2[which(gene2$Distance.TSS <  500),]
gene2 <- gene2[which(gene2$log2FC < -log2(1.5)),]

My script to convert aligned bam files to RPM data objects used below can be found here

###Load in premade coverage files in RLE format
###gene1
setwd("~/Desktop/Meta.Gene.Files/")
load("gene1_RNAi_1_150_rpm.RData")
load("gene1_RNAi_2_150_rpm.RData")
load("control_1_1_150_rpm.RData")
load("control_1_2_150_rpm.RData")

###gene2
load("gene2_RNAi_2_150_rpm.RData")
load("gene2_RNAi_1_150_rpm.RData")
load("control_2_1_150_rpm.RData")
load("control_2_2_150_rpm.RData")

Find Cordinates Of Regions That Are Differentially Bound

The next step is to get the coordinates of the DB genes 2000 upstream and downstream of the TSS

##Pull out coverages of 2000 bp up and downsteam of TSS

#####################
#####   gene1    #####
#####################
## Make a GRanges object of the genes of DB regions we found in diffReps
genes.in.gene1 <- genes[gene1$Gene]
start.gene1 <- start(genes.in.gene1)

## Get the Chromosome lengths to make sure the end of the gene isn't 2000 from end of Chrom
## The reason to do this is if the gene is on the - strand 2000 upstream of TSS, the gene
## must be greator than 2000 away from chrom length to match 
chr.len.gene1 <- seqlengths(seqinfo(genes.in.gene1))
is.close.end <- end(genes.in.gene1) + 2000 > chr.len.gene1[as.character(seqnames(genes.in.gene1))]

## Make sure we have 2000 upstream of gene for genes on "+" strand (start)
## Make sure we have 2000 upstream of gene for genes of "-" strand !(is.close.end)
genes.in.gene1 <- genes.in.gene1[start.gene1 > 2000 & !(is.close.end)]

## Get 4000 bp of coverage around the TSS, 2000 upstream & 2000 downstream
genes.in.gene1 <- promoters(genes.in.gene1, upstream = 2000, downstream = 2000)
## 389 genes that fit this cutoff

## Now do the same for gene2 DB regions

#####################
#####   gene2    #####
#####################
genes.in.gene2 <- genes[gene2$Gene]
start.gene2 <- start(genes.in.gene2)
chr.len.gene2 <- seqlengths(seqinfo(genes.in.gene2))
is.close.end <- end(genes.in.gene2) + 2000 > chr.len.gene2[as.character(seqnames(genes.in.gene2))]
genes.in.gene2 <- genes.in.gene2[start.gene2 > 2000 & !(is.close.end)]
genes.in.gene2 <- promoters(genes.in.gene2, upstream = 2000, downstream = 2000)
## 244 genes that fil this cuttoff

Function That Gets Coverage Of Predifined Regions

The next step is to predefine a function that can extract a coverage array for each gene for the regions defined above. Then, the function will combine them to make a matrix of all genes’ coverage.

###################################################################
#### Coverage of Gene 2000 upstream and 2000 Downstream of TSS ####
###################################################################
metagene <- function(file,file1, index, genes) {
    ## Find Chromosome file gene is on
    ch <- seqnames(genes[index])
    
    ## Get the coverage array at each basepair for the gene (2000 Downstream & 2000 upstream of TSS)
    view  <- Views(file[ch][[1]], ranges(genes[index]))
    view.f1 <- Views(file1[ch][[1]], ranges(genes[index]))
    
    ## Change the class of the gene to numeric and then if the gene is in the reverse orientation,
    ## reverse the orientation of the array
    cov <- as.numeric(view[1][[1]])
    if (as.logical(strand(genes[index]) == "-")) {
        cov <- rev(cov)
    }
    cov.mtx <- sapply(1:4000, function(x) {cov[x]})
    
    
    ## Same For Replicate 2
    cov.f1 <- as.numeric(view.f1[1][[1]])
    if (as.logical(strand(genes[index]) == "-")) {
        cov.f1 <- rev(cov.f1)
    }
    cov.mtx.f1 <- sapply(1:4000, function(x) {cov.f1[x]})
    
    ## Get the average coverage for both replicates at each basepair 
    overall.mean <- ceiling((cov.mtx + cov.mtx.f1)/2)
    
    ## Convert it into a matrix in an outputable way
    cov.frame <- t(matrix(overall.mean))
    
    ## have the rownames for each coverage matrix be the associated gene
    row.names(cov.frame) <- names(genes[index])
    
    ## Return the coverage frame
    return(cov.frame)
}

Lapply The Function To All Genes In Each Set

Next, I lapply-d the function to each list of genes defined above. For each graph, one curve will be made for the control dataset (control) and one will be made for the RNAi knockdown dataset (gene1 or gene2) to plot the differences in chromatin profile around the TSS for the DB genes found in each experiment.

###### gene1 #######
gene1.list <- lapply(1:length(genes.in.gene1), metagene, file=gene1_1, file1=gene1_2, genes=genes.in.gene1)
gene1.lis <- do.call(rbind, gene1.list)

control.gene1.list <- lapply(1:length(genes.in.gene1), metagene, file=control1_1, file1=control1_2, genes=genes.in.gene1)
control.gene1.lis <- do.call(rbind, control.gene1.list)

###### gene2 #######
gene2.list <- lapply(1:length(genes.in.gene2), metagene, file=gene2_1, file1=gene2_2, genes=genes.in.gene2)
gene2.lis <- do.call(rbind, gene2.list)

control.gene2.list <- lapply(1:length(genes.in.gene2), metagene, file=control_1, file1=control_2, genes=genes.in.gene2)
control.gene2.lis <- do.call(rbind, control.gene2.list)

Define The Meta-Gene ggplot Function

Additionaly, we need to define a plot function that reshapes the data and then prints it in a nice ggplot output.

####################
#### ggplot def ####
####################

### Make a column standard deviation function to compute the standard deviation for SE error bars
colstd <- function(x) {
    sapply(1:4000, function(y) { sd(x[,y])})
}

ggplot_avg_gene1 <- function(y, z) {
    ## Make a dataframe of the average coverage across all genes at each basepair
    ## for gene1 and gene2
    df <- data.frame(cbind(colMeans(y), colMeans(z)))
    colnames(df) <- c("gene1", "control")
    
    ## Reshape the dataframe so it is 'ggplot'-able
    df.gg <- df %>% gather(Expression, RPM, gene1:control)
    df.gg$Index <- rep(1:4000, 2)
    
    ## Add a column for standard error
    df.gg$se <- c(colstd(y)/sqrt(nrow(y)), colstd(z)/sqrt(nrow(z)))
    
    ##ggplot function
    ggplot(df.gg, aes(x=Index, y=RPM, Group=factor(Expression))) +
           geom_ribbon(aes(ymin=RPM-se, ymax=RPM+se),
                       alpha=0.2) +
           geom_line(aes(colour=factor(Expression))) +
           scale_colour_manual(values = c("#F8766D", "grey10")) + ###7CAE00 ###F8766D
           scale_x_continuous(breaks = c(0, 1000, 2000, 3000, 4000),
                              labels = c("-2000", "-1000", "TSS", "+1000", "+2000")) +
           geom_vline(xintercept=2000, colour="red", linetype="longdash") +
           labs(colour="RNAi") + xlab("Position(BP)") +
           ggtitle("Metagene plot of RNAi Affected Genes") +
           theme_bw()        
}

###plot for gene2
ggplot_avg_gene2 <- function(y, z) {
    df <- data.frame(cbind(colMeans(y), colMeans(z)))
    colnames(df) <- c("gene2", "control")
    df.gg <- df %>% gather(Expression, RPM, gene2:control)
    df.gg$Index <- rep(1:4000, 2)
    df.gg$se <- c(colstd(y)/sqrt(nrow(y)), colstd(z)/sqrt(nrow(z)))
    ggplot(df.gg, aes(x=Index, y=RPM, Group=factor(Expression))) +
           geom_ribbon(aes(ymin=RPM-se, ymax=RPM+se),
                       alpha=0.2) +
           geom_line(aes(colour=factor(Expression))) +
           scale_colour_manual(values = c("#7CAE00", "grey10")) + 
           scale_x_continuous(breaks = c(0, 1000, 2000, 3000, 4000),
                              labels = c("-2000", "-1000", "TSS", "+1000", "+2000")) +
           geom_vline(xintercept=2000, colour="red", linetype="longdash") +
           labs(colour="RNAi") + xlab("Position (BP)") +
           ggtitle("Metagene plot of RNAi Affected Genes") +
           theme_bw()        
}

Plot The Results

Finally, all we need to do is plot the final results using the defined ggplot function.

ggplot_avg_gene1(gene1.lis, control.gene1.lis)

ggplot_avg_gene2(gene2.lis, control.gene2.lis)