library(dplyr)
library(biomaRt)
library(tximport)
library(rhdf5)
library(gplots)
library(org.Hs.eg.db)
library(DESeq2)
library(DT)
library(apeglm)
library(RColorBrewer)
library(IHW)
library(PCAtools)
library(pheatmap)
library(clusterProfiler)
library(EnhancedVolcano)
library(ComplexHeatmap)
library(circlize)
library(fgsea)
library(tidyverse)
library(ggpubr)

Introduction

This R markdown document walks through an RNA-Seq differential expression analysis using DESeq2 for MA5112. Quality control plots, differential expression results, plots and pathway analysis are covered in the document.

The tutorial assumes you have 9 Kallisto quantification directories downloaded to your local machine. Assign the path to a variable now:

# path where your extracted the tar.gz folder to.
# strip the trailing '/'
quant_dir <- "/data/github/quant"
list.files(quant_dir)
##  [1] "A375_1"                 "A375_2"                 "A375_3"                
##  [4] "A549_1"                 "A549_2"                 "A549_3"                
##  [7] "c5.bp.v7.0.symbols.gmt" "ctrl_1"                 "ctrl_2"                
## [10] "ctrl_3"                 "samples.csv"            "tx2gene.txt"

Reading Input Data

Metadata

Read in the file samples.csv which contains the experimental metadata. Please make sure that the rownames of this dataframe match the kallisto quantification directory names at quant/.

samples <- read.csv(paste0(quant_dir, "/samples.csv"), header=T, row.names = "sample", stringsAsFactors = T)
samples
##        condition replicate
## ctrl_1   control         1
## ctrl_2   control         2
## ctrl_3   control         3
## A549_1      lung         1
## A549_2      lung         2
## A549_3      lung         3
## A375_1  melanoma         1
## A375_2  melanoma         2
## A375_3  melanoma         3

Convert Numerics to Factors

When performing differential expression analysis using DESeq2, if a covariate in your metadata file is numeric and provided to the design formula, DESeq2 will produce the following message:

the design formula contains a numeric variable with integer values, specifying a model with increasing fold change for higher values. did you mean for this to be a factor? if so, first convert this variable to a factor using the factor() function

What does this mean? From Mike Love (DESeq2 author):

There is a constant fold change for every unit of change in replicates. So if the estimated fold change is 2, this implies that replicates 2 = 2x replicates 1, replicates 3 = 2x replicates 2, etc. Or in other words, the relationship is linear on the log counts scale.

This is not what we want! Be really careful with your metadata file as it defines our statistical model design.

To fix this, change the replicate column to a categorical factor using factor().

samples$replicate <- factor(samples$replicate)

# check its ok:
factor_cols <- sapply(samples, is.factor)
factor_cols
## condition replicate 
##      TRUE      TRUE

Stage Kallisto files

We need to create a file handle object (a named character list) containing the sample IDs and the paths to the Kallisto quantification .h5 files. If the rownames of your metadata object (samples) do not match the quantification directory names, you will get an error during the TXI object step.

files <- file.path(quant_dir, rownames(samples), "abundance.h5")
names(files) <- paste0(rownames(samples))
files
##                                   ctrl_1 
## "/data/github/quant/ctrl_1/abundance.h5" 
##                                   ctrl_2 
## "/data/github/quant/ctrl_2/abundance.h5" 
##                                   ctrl_3 
## "/data/github/quant/ctrl_3/abundance.h5" 
##                                   A549_1 
## "/data/github/quant/A549_1/abundance.h5" 
##                                   A549_2 
## "/data/github/quant/A549_2/abundance.h5" 
##                                   A549_3 
## "/data/github/quant/A549_3/abundance.h5" 
##                                   A375_1 
## "/data/github/quant/A375_1/abundance.h5" 
##                                   A375_2 
## "/data/github/quant/A375_2/abundance.h5" 
##                                   A375_3 
## "/data/github/quant/A375_3/abundance.h5"

Tximport

tximport imports transcript-level abundances from quantification tools (kallisto in our case) and converts them to gene counts for downstream analyses using a differential expression analysis package.

BiomaRt

We will use biomaRt to connect to the ENSEMBL databases and map transcript IDs to gene IDs. Firstly, we will create the mart object specifying which database to use:

N.B: biomaRt is super finicky, fails to conect one seconds, connects the next..

mart <- useMart(biomart = "ensembl", dataset = "hsapiens_gene_ensembl")

Transcript to gene (tx2gene)

Recall we used the ENSEMBL reference cDNA file Homo_sapiens.GRCh38.cdna.all.fa for Kallisto quant on LUGH. This means that our quantification files have ENSEMBL transcript ID’s. We can map them to gene symbols running the code below:

## show how to identify attribute type
# $ head /data/github/quant/ctrl_1/abundance.tsv

## show how to query mart
#listAttributes(mart)

tx2gene <- getBM(attributes = c("ensembl_transcript_id_version", "hgnc_symbol"), mart = mart, useCache = FALSE)
head(tx2gene)
##   ensembl_transcript_id_version hgnc_symbol
## 1             ENST00000387314.1       MT-TF
## 2             ENST00000389680.2     MT-RNR1
## 3             ENST00000387342.1       MT-TV
## 4             ENST00000387347.2     MT-RNR2
## 5             ENST00000386347.1      MT-TL1
## 6             ENST00000361390.2      MT-ND1

TXI object

Create a txi object summarizing kallisto transcript abundance to gene-level counts:

txi <- tximport(files, type = "kallisto", tx2gene = tx2gene)
head(txi$counts)
##             ctrl_1       ctrl_2       ctrl_3       A549_1       A549_2
##       186431.11394 2.129361e+05 1.658358e+05 171876.16396 112013.71701
## A1BG     142.22844 1.401601e+02 1.220176e+02    185.24378    123.08770
## A1CF      10.44957 1.087808e+00 4.041526e+00     12.20734     10.21507
## A2M      511.29496 3.668799e+02 4.075007e+02    410.37397    342.34454
## A2ML1     17.19034 1.808698e+01 8.119118e+00     14.23547     11.12270
## A2MP1     92.00000 9.400000e+01 6.000000e+01    117.00000     93.00000
##             A549_3      A375_1       A375_2       A375_3
##       112813.56142 134301.9883 1.571526e+05 1.589701e+05
## A1BG     112.27716    100.8454 1.258208e+02 9.175603e+01
## A1CF       3.19809      2.0000 8.308615e+00 4.296704e+00
## A2M      345.62160    312.5389 3.079068e+02 4.671517e+02
## A2ML1     15.12763     17.2735 1.711676e+01 2.616338e+01
## A2MP1     88.00000     75.0000 7.300000e+01 9.800000e+01

DESeq2

Now we are ready to create the DDS object for DESeq2 analysis that contains the counts matrix, experiment metadata and model design.

Design

The model design is one of the most important steps for an RNA-Seq analysis. Here, we are going to specify the design ~ replicate + condition (columns of the metadata file). The factor of interest goes last in the model terms, we want to compare control vs. lung vs. melanoma in the experiment. Replicate has been included as we want to control for the technical/biological effect of sample replicates.

DDS

We are going to create the DDS object using DESeqDataSetFromTximport() as we used tximport to convert transcript abundances to gene-level counts.

dds <- DESeqDataSetFromTximport(txi, colData = samples, design = ~ replicate + condition )

Factor levels

By default, R will choose a reference level for factors based on alphabetical order. We will explicitly set control as the reference level so you can see how it’s done using relevel().

N.B this step must be done prior to creating the dds object using DESeq()! If you want to change the reference level, you must re-run DESeq().

Check that relevel() worked correctly by running resultsNames(dds) - we want control to be the reference level so we can compare lung vs control and melanoma vs. control.

dds$condition <- relevel(dds$condition, ref = "control")
dds <- DESeq(dds)
resultsNames(dds)
## [1] "Intercept"                     "replicate_2_vs_1"             
## [3] "replicate_3_vs_1"              "condition_lung_vs_control"    
## [5] "condition_melanoma_vs_control"
## show post hoc re-leveling doesn't work:
#dds$condition <- relevel(dds$condition, ref = "lung")
#resultsNames(dds)

Available contrasts

The output of resultsNames(dds) indicates that we can compare lung vs control and melanoma vs control. If you wanted to compare melanoma vs lung, you will have to change the reference level and create a new dds object.

Quality Control

We will use the gene-level counts to perform some quality control checks on the samples in the experiment. We will choose a suitable transformation for the gene counts…

Extract Counts

Extract gene-level counts from the DDS object.

counts <- counts(dds, normalized=TRUE)

Transform Counts

We will take a look at three transformations: log2(), and rlog().

## DESeq2 is weird about extracting transformations as a matrix - you must use `assay()` 
log2 <- assay(normTransform(dds))
rld <- assay(rlog(dds))

Generate mean sd plots to check noise from low count genes (low information genes).

library(vsn)
library(hexbin)

## x-axis is the transformed mean not the raw mean..

log2_plt <- meanSdPlot(log2, ranks=FALSE, plot=FALSE)
log2_plt$gg + ggtitle("Log2 + PC Transformation") + xlim(0,20)

rld_plt <- meanSdPlot(rld, ranks=FALSE, plot=FALSE)
rld_plt$gg + ggtitle("Rlog Transformation") + xlim(0,20)

Save Counts

Write the counts files to text files in ~/RNA-Seq

#dir.create("~/RNA-Seq/counts")
#write.table(counts, "~/RNA-Seq/counts/normalised_counts.txt", sep="\t", quote = F)
#write.table(rld, "~/RNA-Seq/counts/rld_transformed_counts.txt", sep="\t", quote = F)

Sample Heatmap

The first step of our Quality control is to check that samples cluster together as expected.

## Calculate distance between samples
sampleDists <- dist(t(rld))

## Place distances in matrix
sampleDistMatrix <- as.matrix(sampleDists)

## Optional, remove colnames
colnames(sampleDistMatrix) <- NULL

## create annotation dataframe
ann <- data.frame(Condition = samples$condition)

col <- c("blue", "forestgreen", "red1")
names(col) <- c("melanoma", "lung", "control")
ann_col <- list(Condition = col)

## match annotation rownames to distance mat
rownames(ann) <- rownames(sampleDistMatrix)

pheatmap(mat=sampleDistMatrix,
         ## pass distance metric calculated to heatmap
         clustering_distance_rows=sampleDists,
         clustering_distance_cols=sampleDists,
         ## pass annotation dataframe 
         annotation_col = ann,
         ## add colors
         annotation_colors = ann_col,
         ## heatmap colours
         col=hcl.colors(100,"GnBu",rev=T))

## type hcl.pals(type=NULL) to inspect their color palettes, they spruce up plots very well...  

PCA

PCA plots the variance explained by samples in each principal component. Typically PC1 & PC2 explain the most variation in the dataset, you should look at these first and foremost.

PCA can give you a good idea of how successful the Differential Expression analysis will be - samples that are very close together in PC feature space will not produce as many DE genes as those that are separated by a large distance.

p <- pca(rld, metadata = samples)

biplot(p,
       colby = 'condition',
       colkey = c('melanoma'='royalblue', 'control'='red1', 'lung'='forestgreen'),
       hline = 0,
       vline = 0,
       legendPosition = 'right',
       legendLabSize = 12,
       legendIconSize = 8.0,
       title = 'PCA bi-plot',
       subtitle = 'PC1 versus PC2')

Differential Expression Analysis

DESeq results()

We can extract differentially expressed genes between phenotypes of interest by using results() on the DDS object.

Furthermore, we will apply the apeglm shrinkage estimator on our results. apeglm shrinks low confidence (high inter sample variation) differentially expressed genes towards 0, producing a robust set of differentially expressed genes.

The argument coef in lfcShrink refers to the contrast of interest returned by resultsNames(dds). For lung vs. control, it is the 4th character string returned thus coef=4 for lung vs. control.

Set up lung vs control, melanoma vs control:

# make lung vs control object
lung_v_ctrl <- results(dds, filterFun=ihw, alpha=0.05, c("condition", "lung", "control"))
res1 <- lfcShrink(dds=dds, res=lung_v_ctrl, coef=4, type="apeglm")
summary(res1)
## 
## out of 22530 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)       : 2716, 12%
## LFC < 0 (down)     : 3313, 15%
## outliers [1]       : 0, 0%
## [1] see 'cooksCutoff' argument of ?results
## see metadata(res)$ihwResult on hypothesis weighting
# make melanoma vs control object
melanoma_v_ctrl <- results(dds, filterFun=ihw, alpha=0.05, c("condition", "melanoma", "control"))
res2 <- lfcShrink(dds=dds, res=melanoma_v_ctrl, coef=5, type="apeglm")
summary(res2)
## 
## out of 22530 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)       : 455, 2%
## LFC < 0 (down)     : 411, 1.8%
## outliers [1]       : 0, 0%
## [1] see 'cooksCutoff' argument of ?results
## see metadata(res)$ihwResult on hypothesis weighting

Set up lung vs melanoma. We need to reconfigure the DDS object so ‘melanoma’ is the reference level in ‘condition’, and re-run DESeq().

Check resultsNames(dds) to ensure the correct contrasts are set:

# to make lung vs melanoma, relevel the dds object reference level and redo the DESeq call
dds$condition <- relevel(dds$condition, ref = "melanoma")
dds <- DESeq(dds)

# double check it worked
resultsNames(dds)
## [1] "Intercept"                     "replicate_2_vs_1"             
## [3] "replicate_3_vs_1"              "condition_control_vs_melanoma"
## [5] "condition_lung_vs_melanoma"

Now extract the contrast results and perform apeglm.

# make lung vs melanoma
lung_v_melanoma <- results(dds, filterFun=ihw, alpha=0.05, c("condition", "lung", "melanoma"))
res3 <- lfcShrink(dds=dds, res=lung_v_melanoma, coef=5, type="apeglm")
summary(res3)
## 
## out of 22530 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)       : 2699, 12%
## LFC < 0 (down)     : 3260, 14%
## outliers [1]       : 0, 0%
## [1] see 'cooksCutoff' argument of ?results
## see metadata(res)$ihwResult on hypothesis weighting

Take a look at the summary statistics of the number of DE genes. Do they conform to comments made about the PCA plot?

Resusable Functions

Below are some handy functions I use all the time for extracting differentially expressed genes and performing annotation of DE genes.

Extract Up Regulated Genes
get_upregulated <- function(df){

    key <- intersect(rownames(df)[which(df$log2FoldChange>=1)], rownames(df)[which(df$pvalue<=0.05)])

    results <- as.data.frame((df)[which(rownames(df) %in% key),])
    return(results)
}
Extract Down Regulated genes
get_downregulated <- function(df){

    key <- intersect(rownames(df)[which(df$log2FoldChange<=-1)], rownames(df)[which(df$pvalue<=0.05)])

    results <- as.data.frame((df)[which(rownames(df) %in% key),])
    return(results)
}
Annotate DE genes
annotate_de_genes <- function(df){

    df$hgnc_symbol <- rownames(df)
    mart <- useMart(biomart = "ensembl", dataset = "hsapiens_gene_ensembl")
    info <- getBM(attributes=c("hgnc_symbol",
                               "ensembl_gene_id_version",
                               "chromosome_name",
                               "start_position",
                               "end_position",
                               "strand",
                               "entrezgene_description"),
                  filters = c("hgnc_symbol"),
                  values = df$hgnc_symbol,
                  mart = mart,
                  useCache=FALSE)

    tmp <- merge(df, info, by="hgnc_symbol")
    tmp$strand <- gsub("-1", "-", tmp$strand)
    tmp$strand <- gsub("1", "+", tmp$strand)
    tmp$hgnc_symbol <- make.names(tmp$hgnc_symbol, unique = T)
    tmp <- tmp[!grepl("CHR", tmp$chromosome_name),]

    output_col <- c("Gene", "Ensembl ID", "Chromosome", "Start", "Stop", "Strand", "Description", "Log2FC", "P-value", "Adj P-value")
    tmp <- subset(tmp, select=c(hgnc_symbol, ensembl_gene_id_version, chromosome_name, start_position, end_position, strand, entrezgene_description, log2FoldChange, pvalue, padj))
    colnames(tmp) <- output_col

    if(min(tmp$Log2FC) > 0){
        tmp <- tmp[order(-tmp$Log2FC),]
    }else{
        tmp <- tmp[order(tmp$Log2FC),]
    }

    return(tmp)

}

Write DE results

Use the above functions to extract up/down regulated genes and annotate them using biomaRt, and write to a file. Please note that the get_(up/down)regulated function requires a dataframe as input.

Note: we will focus on lung vs control for the remainder of the practical

de_up <- get_upregulated(as.data.frame(res1))
de_down <- get_downregulated(as.data.frame(res1))

# comment out if biomaRt is being difficult
#upregulated_genes <- annotate_de_genes(de_up)
#downregulated_genes <- annotate_de_genes(de_down)

#dir.create("~/RNA-Seq/DESeq_results/")
#write.table(upregulated_genes, "~/RNA-Seq/DESeq_results/lung_vs_control_upregulated.txt", sep="\t", row.names=F, quote=F)
#write.table(downregulated_genes, "~/RNA-Seq/DESeq_results/lung_vs_control_downregulated.txt", sep="\t", row.names=F, quote=F)

Results Plots

Volcano Plot

Volcano plots are useful to show how many genes are differentially expressed in the experimental contrast of interest. Labels are optional, I have included them so you know how to use them.

N.B Volcano plots use -log10 on the Y-axis.

## remove NA values from results
res1 <- na.omit(res1)

## calculate min/max axis values for plot (optional)
min_width <- min(res1$log2FoldChange)
max_width <- max(res1$log2FoldChange)
max_height <- -log10(min(res1[res1$pvalue>0, 5]))

## Grab top 10 up-reg genes for plot
up <- subset(res1, res1$log2FoldChange > 1 & res1$pvalue <= 0.05)
up <- up[order(-up$log2FoldChange),]
up_list <- head(rownames(up), n=10L)

## Grab top 10 down-reg genes for plot
down <- subset(res1, res1$log2FoldChange < -1 & res1$pvalue <= 0.05)
down <- down[order(down$log2FoldChange),]
down_list <- head(rownames(down), n=10L)

## place top 20 DE genes in vector (optinal...)
plot_top_20 <- c(up_list, down_list)

EnhancedVolcano(res1,
                lab=rownames(res1),
                x="log2FoldChange",
                y="pvalue",
                selectLab=plot_top_20,
                drawConnectors=TRUE,
                legendPosition = "none",
                FCcutoff=1.0,
                pCutoff=0.05,
                title="Volcano Plot",
                subtitle="Lung vs. Control",
                caption = paste0('Total Genes = ', nrow(res1)),
                xlim=c(min_width, max_width),
                ylim=c(0, max_height))

apeglm visualised

plotMA(lung_v_ctrl, ylim=c(-10,10))

plotMA(res1, ylim=c(-10,10))

Heatmap

Heatmaps are another way to show the differentially expressed genes in the experimental contrast of interest.

# subset the counts matrix to get the lung and control samples
subset <- rld[, 1:6]

# now select de_up, de_down, i.e DE genes that passed the filtering our function produced
up <- rownames(de_up)
down <- rownames(de_down)

# subset matrix to include only DE genes
key <- c(up, down)
subset <- subset[which(rownames(subset) %in% key),]

# scale and center the values
mat <- as.matrix(scale(t(subset), center = T))

# basic plot to check we're plotting something sensible
#pheatmap(t(mat))

# spruce it up a bit..
ann <- data.frame(Condition = c(rep("Control", 3), rep("Lung", 3)))
rownames(ann) <- rownames(mat)
col <- c("blue", "forestgreen")
names(col) <- c("Control", "Lung")
ann_col <- list(Condition = col)

pheatmap(t(mat), 
         show_rownames = FALSE,
         annotation_col = ann,
         annotation_colors = ann_col,
         color = hcl.colors(100, "PRGn",rev=F))

Pathway Analysis

fgsea

Load the GO Biological processes file c5.bp.v7.0.symbols.gmt - it should be in the quant/ directory I gave you.

Create ranked gene list

Extract the gene names and associated log2FoldChanges from our lung vs control study to generate a ranked gene list.

## convert result object to dataframe
res <- as.data.frame(res1) # lung vs control
res$hgnc_symbol <- rownames(res)

# compute summary stat
fgsea_rank <- res %>%
              dplyr::select(hgnc_symbol, log2FoldChange) %>%
              na.omit() %>%
              distinct() %>%
              group_by(hgnc_symbol) %>%
              summarize(stat=mean(log2FoldChange))

fgsea_rank
## # A tibble: 22,530 × 2
##    hgnc_symbol    stat
##    <chr>         <dbl>
##  1 ""          -0.161 
##  2 "A1BG"      -0.166 
##  3 "A1CF"       0.0557
##  4 "A2M"        0.190 
##  5 "A2ML1"      0.381 
##  6 "A2MP1"      0.671 
##  7 "A3GALT2"    0.0226
##  8 "A4GALT"     0.267 
##  9 "A4GNT"     -0.0156
## 10 "AAAS"      -0.439 
## # … with 22,520 more rows

Convert to a named list

# create named list
rank <- deframe(fgsea_rank)
head(rank, 20)
##                      A1BG         A1CF          A2M        A2ML1        A2MP1 
## -0.160560295 -0.166224204  0.055662907  0.189671714  0.381380365  0.670570018 
##      A3GALT2       A4GALT        A4GNT         AAAS         AACS       AACSP1 
##  0.022582068  0.267407058 -0.015577910 -0.439099072 -0.664341345 -0.078231057 
##        AADAC      AADACL3      AADACP1        AADAT        AAGAB         AAK1 
## -0.013629602  0.006091806  0.037048200 -0.169815621  0.120694518  0.144493709 
##        AAMDC         AAMP 
## -0.559084516  0.181110370

Read GMT file

# read in gmt file
pathway <- gmtPathways("/data/github/quant/c5.bp.v7.0.symbols.gmt")
head(pathway, 1)
## $GO_POSITIVE_REGULATION_OF_VIRAL_TRANSCRIPTION
##  [1] "CDK9"    "CHD1"    "DHX9"    "EP300"   "SNW1"    "RRP1B"   "NELFB"  
##  [8] "GTF2B"   "GTF2F1"  "GTF2F2"  "MDFIC"   "HPN"     "JUN"     "LEF1"   
## [15] "ZNF639"  "NELFCD"  "RSF1"    "POLR2A"  "POLR2B"  "POLR2C"  "POLR2D" 
## [22] "POLR2E"  "POLR2F"  "POLR2G"  "POLR2H"  "POLR2I"  "POLR2J"  "POLR2K" 
## [29] "POLR2L"  "NUCKS1"  "SMARCA4" "SMARCB1" "SP1"     "SUPT4H1" "SUPT5H" 
## [36] "TAF11"   "TFAP4"   "NELFA"   "NELFE"   "CCNT1"   "CTDP1"

Run fgsea

# run fgsea
fgsea <- fgsea(pathways=pathway, stats=rank, nperm=1000)

fgseaResTidy <- fgsea %>%
  as_tibble() %>%
  arrange(desc(NES))

# Show in a nice table:
fgseaResTidy %>%
  dplyr::select(-leadingEdge, -ES, -nMoreExtreme) %>%
  arrange(padj) %>%
  DT::datatable()

Enrichment plots

I will show you can example of a pathway enriched in our lung samples, and a pathway that is enriched in Control (i.e negative NES score)

filtered_pathway <- subset(fgsea, NES > 2.35)

filt_up <- as.vector(filtered_pathway$pathway)

for (i in filt_up){
    plt <- plotEnrichment(pathway = pathway[[i]],
    gseaParam = 1, ticksSize = 0.5, stats= rank) +
    labs(title=i) + theme(plot.title = element_text(hjust = 0.5, face="bold"))
    print(plt)
}

filtered_pathway <- subset(fgsea, NES < -2.556)

filt_down <- as.vector(filtered_pathway$pathway)

for (i in filt_down){
    plt <- plotEnrichment(pathway = pathway[[i]],
    gseaParam = 1, ticksSize = 0.5, stats= rank) +
    labs(title=i) + theme(plot.title = element_text(hjust = 0.5, face="bold"))
    print(plt)
}