Libraries

library(airway) # for my data
library(Rsamtools) # to import the BAM files
library(GenomicFeatures) # to import GTF file
library(GenomicAlignments) # for summarizeoverlaps, counting reads
library(DESeq2) # DE analysis
library(pheatmap) # plot heatmap sample distances
library(RColorBrewer) # create nice color plaettes
library(ggplot2) # customise our plots
library(genefilter) # to select genes with most variance
library(AnnotationDbi) # functions to annotate the data
library(org.Hs.eg.db) # human database for annotation

Import the Data

Import the metadata to R

# this is for something
sampleTable <- read.csv("data/sample_table.csv", row.names = 1)

Import bam files

filenames <- paste0(sampleTable$Run, "_subset.bam")
filepath <- file.path("data", filenames)
bamfiles <- BamFileList(filepath, yieldSize = 2000000)

Import the GTF file

gtfpath <- "data/Homo_sapiens.GRCh37.75_subset.gtf"
txdb <- makeTxDbFromGFF(gtfpath, format = "gtf", circ_seqs = character(0))
## Import genomic features from the file as a GRanges object ... OK
## Prepare the 'metadata' data frame ... OK
## Make the TxDb object ...
## Warning in .get_cds_IDX(mcols0$type, mcols0$phase): The "phase" metadata column contains non-NA values for features of type
##   stop_codon. This information was ignored.
## OK
ebg <- exonsBy(txdb,by="gene")

Read counting

se <- summarizeOverlaps(features = ebg,
                        reads = bamfiles,
                        mode = "Union",
                        fragments = TRUE,
                        singleEnd = FALSE
                        )
colData(se) <- DataFrame(sampleTable)

SWAP DATA

Because we are in the workshp and we don’t want to do DE on only 20 genes.

data("airway")
se <- airway

DESeq2 create the object

Creates own object with 2 differences: - assay matrix needs to be coutns - includes design formula ## Design formula

~ cell + dex

se$dex <- relevel(se$dex, "untrt")
dds <- DESeqDataSet(se, design = ~ cell + dex)

Exploratory analysis

Pre-filter the dataset

keep <- rowSums(counts(dds)) > 0
dds <- dds[keep, ]

transformation rlog

rld <- rlog(dds, blind = FALSE)

Sample distance

sampleDists <- dist(t(assay(rld)), diag = TRUE)
sampleDistMatrix <- as.matrix(sampleDists)
rownames(sampleDistMatrix) <- paste(rld$cell, rld$dex, sep="-")
colnames(sampleDistMatrix) <- paste(rld$cell, rld$dex, sep="-")

pheatmap(sampleDistMatrix, 
         color = colorRampPalette(rev(brewer.pal(9, "Blues")))(255),
         clustering_distance_rows = sampleDists,
         clustering_distance_cols = sampleDists
         )

PCA plot

pcadata <- plotPCA(rld, intgroup = c("dex", "cell"), returnData = TRUE)
percentVar <- round(attr(pcadata, "percentVar") * 100)

ggplot(data = pcadata, mapping = aes(x = PC1, y = PC2, color = dex, shape =cell)) + 
  geom_point() +
  xlab(paste("PC1: ", percentVar[1], "% variance")) +
  ylab(paste("PC2: ", percentVar[2], "% variance"))

Differential expression analysis

dds <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
res <- results(dds)

summary(res)
## 
## out of 33469 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up)       : 2608, 7.8%
## LFC < 0 (down)     : 2216, 6.6%
## outliers [1]       : 0, 0%
## low counts [2]     : 15573, 47%
## (mean count < 5)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
# adjusting the FDR
res.05 <- results(dds, alpha = 0.05)
table(res.05$padj < 0.05)
## 
## FALSE  TRUE 
## 13230  4017
# raising the FC threshold
resLFC1 <- results(dds, lfcThreshold = 1)
table(resLFC1$padj < 0.1)
## 
## FALSE  TRUE 
## 20251   240

extract significant results

resSig <- subset(res.05, padj < 0.05)
head(resSig[order(resSig$log2FoldChange), ])
## log2 fold change (MLE): dex trt vs untrt 
## Wald test p-value: dex trt vs untrt 
## DataFrame with 6 rows and 6 columns
##                  baseMean log2FoldChange     lfcSE      stat      pvalue
##                 <numeric>      <numeric> <numeric> <numeric>   <numeric>
## ENSG00000128285   6.62474       -5.32591  1.257816  -4.23425 2.29319e-05
## ENSG00000267339  26.23357       -4.61155  0.673094  -6.85127 7.31984e-12
## ENSG00000019186  14.08760       -4.32591  0.857768  -5.04321 4.57776e-07
## ENSG00000146006  46.80760       -4.21185  0.528852  -7.96414 1.66379e-15
## ENSG00000141469  53.43653       -4.12478  1.129798  -3.65091 2.61317e-04
## ENSG00000108700  15.02281       -4.04536  0.812001  -4.98196 6.29439e-07
##                        padj
##                   <numeric>
## ENSG00000128285 2.27826e-04
## ENSG00000267339 1.96338e-10
## ENSG00000019186 6.33044e-06
## ENSG00000146006 6.84596e-14
## ENSG00000141469 1.98544e-03
## ENSG00000108700 8.44820e-06
head(resSig[order(resSig$log2FoldChange, decreasing = TRUE), ])
## log2 fold change (MLE): dex trt vs untrt 
## Wald test p-value: dex trt vs untrt 
## DataFrame with 6 rows and 6 columns
##                  baseMean log2FoldChange     lfcSE      stat      pvalue
##                 <numeric>      <numeric> <numeric> <numeric>   <numeric>
## ENSG00000179593  67.24305        9.50597  1.054502   9.01465 1.97493e-19
## ENSG00000109906 385.07103        7.35263  0.536390  13.70761 9.14199e-43
## ENSG00000250978  56.31819        6.32738  0.677797   9.33521 1.00787e-20
## ENSG00000127954 286.38412        5.20716  0.493083  10.56042 4.54630e-26
## ENSG00000249364   8.83906        5.09811  1.159614   4.39638 1.10069e-05
## ENSG00000171819 254.88437        5.08157  0.757302   6.71010 1.94486e-11
##                        padj
##                   <numeric>
## ENSG00000179593 1.19935e-17
## ENSG00000109906 2.15989e-40
## ENSG00000250978 6.89793e-19
## ENSG00000127954 4.84013e-24
## ENSG00000249364 1.17256e-04
## ENSG00000171819 4.94006e-10

extract other comparisons

cellRes <- results(dds, contrast = c("cell", "N061011", "N61311"))
summary(cellRes)
## 
## out of 33469 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up)       : 923, 2.8%
## LFC < 0 (down)     : 757, 2.3%
## outliers [1]       : 0, 0%
## low counts [2]     : 16871, 50%
## (mean count < 8)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results

Plotting results

counts

topGene <- rownames(res)[which.min(res$pvalue)]
countData <- plotCounts(dds, gene=topGene, intgroup = c("dex", "cell"), returnData = TRUE)

ggplot(countData, aes(x = dex, y = count, color=cell, group = cell)) + 
  geom_point() +
 # geom_jitter(width = 0.1) + 
  geom_line()

  scale_y_log10()
## <ScaleContinuousPosition>
##  Range:  
##  Limits:    0 --    1

MA plot

plotMA(res, ylim = c(-5, 5))

plotMA(res.05, ylim = c(-5, 5))

Gene Clustering

Select genes to plot

topVarGenes <- head(order(rowVars(assay(rld)), decreasing = TRUE), 20)
mat <- assay(rld)[topVarGenes,]
mat <- mat - rowMeans(mat)
df <- as.data.frame(colData(rld)[,c("cell", "dex")])
pheatmap(mat, annotation_col = df)

Annotation

res$symbol <- mapIds(org.Hs.eg.db,
                     keys = row.names(res),
                     column = "SYMBOL",
                     keytype = "ENSEMBL",
                     multiVals = "first")
## 'select()' returned 1:many mapping between keys and columns
res$genename <- mapIds(org.Hs.eg.db,
                       keys =  row.names(res),
                       keytype = "ENSEMBL",
                       column = "GENENAME",
                       multiVals = "first")
## 'select()' returned 1:many mapping between keys and columns

Export the results

resOrd <- res[order(res$pvalue), ]
resOrdDf <- as.data.frame(resOrd)
write.csv(resOrdDf, file = "results/de_genes.csv")