library(airway) # for my data
library(Rsamtools) # to import the BAM files
library(GenomicFeatures) # to import GTF file
library(GenomicAlignments) # for summarizeoverlaps, counting reads
library(DESeq2) # DE analysis
library(pheatmap) # plot heatmap sample distances
library(RColorBrewer) # create nice color plaettes
library(ggplot2) # customise our plots
library(genefilter) # to select genes with most variance
library(AnnotationDbi) # functions to annotate the data
library(org.Hs.eg.db) # human database for annotation
Import the metadata to R
# this is for something
sampleTable <- read.csv("data/sample_table.csv", row.names = 1)
Import bam files
filenames <- paste0(sampleTable$Run, "_subset.bam")
filepath <- file.path("data", filenames)
bamfiles <- BamFileList(filepath, yieldSize = 2000000)
Import the GTF file
gtfpath <- "data/Homo_sapiens.GRCh37.75_subset.gtf"
txdb <- makeTxDbFromGFF(gtfpath, format = "gtf", circ_seqs = character(0))
## Import genomic features from the file as a GRanges object ... OK
## Prepare the 'metadata' data frame ... OK
## Make the TxDb object ...
## Warning in .get_cds_IDX(mcols0$type, mcols0$phase): The "phase" metadata column contains non-NA values for features of type
## stop_codon. This information was ignored.
## OK
ebg <- exonsBy(txdb,by="gene")
se <- summarizeOverlaps(features = ebg,
reads = bamfiles,
mode = "Union",
fragments = TRUE,
singleEnd = FALSE
)
colData(se) <- DataFrame(sampleTable)
Because we are in the workshp and we don’t want to do DE on only 20 genes.
data("airway")
se <- airway
Creates own object with 2 differences: - assay matrix needs to be coutns - includes design formula ## Design formula
~ cell + dex
se$dex <- relevel(se$dex, "untrt")
dds <- DESeqDataSet(se, design = ~ cell + dex)
keep <- rowSums(counts(dds)) > 0
dds <- dds[keep, ]
rld <- rlog(dds, blind = FALSE)
sampleDists <- dist(t(assay(rld)), diag = TRUE)
sampleDistMatrix <- as.matrix(sampleDists)
rownames(sampleDistMatrix) <- paste(rld$cell, rld$dex, sep="-")
colnames(sampleDistMatrix) <- paste(rld$cell, rld$dex, sep="-")
pheatmap(sampleDistMatrix,
color = colorRampPalette(rev(brewer.pal(9, "Blues")))(255),
clustering_distance_rows = sampleDists,
clustering_distance_cols = sampleDists
)
pcadata <- plotPCA(rld, intgroup = c("dex", "cell"), returnData = TRUE)
percentVar <- round(attr(pcadata, "percentVar") * 100)
ggplot(data = pcadata, mapping = aes(x = PC1, y = PC2, color = dex, shape =cell)) +
geom_point() +
xlab(paste("PC1: ", percentVar[1], "% variance")) +
ylab(paste("PC2: ", percentVar[2], "% variance"))
dds <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
res <- results(dds)
summary(res)
##
## out of 33469 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up) : 2608, 7.8%
## LFC < 0 (down) : 2216, 6.6%
## outliers [1] : 0, 0%
## low counts [2] : 15573, 47%
## (mean count < 5)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
# adjusting the FDR
res.05 <- results(dds, alpha = 0.05)
table(res.05$padj < 0.05)
##
## FALSE TRUE
## 13230 4017
# raising the FC threshold
resLFC1 <- results(dds, lfcThreshold = 1)
table(resLFC1$padj < 0.1)
##
## FALSE TRUE
## 20251 240
resSig <- subset(res.05, padj < 0.05)
head(resSig[order(resSig$log2FoldChange), ])
## log2 fold change (MLE): dex trt vs untrt
## Wald test p-value: dex trt vs untrt
## DataFrame with 6 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue
## <numeric> <numeric> <numeric> <numeric> <numeric>
## ENSG00000128285 6.62474 -5.32591 1.257816 -4.23425 2.29319e-05
## ENSG00000267339 26.23357 -4.61155 0.673094 -6.85127 7.31984e-12
## ENSG00000019186 14.08760 -4.32591 0.857768 -5.04321 4.57776e-07
## ENSG00000146006 46.80760 -4.21185 0.528852 -7.96414 1.66379e-15
## ENSG00000141469 53.43653 -4.12478 1.129798 -3.65091 2.61317e-04
## ENSG00000108700 15.02281 -4.04536 0.812001 -4.98196 6.29439e-07
## padj
## <numeric>
## ENSG00000128285 2.27826e-04
## ENSG00000267339 1.96338e-10
## ENSG00000019186 6.33044e-06
## ENSG00000146006 6.84596e-14
## ENSG00000141469 1.98544e-03
## ENSG00000108700 8.44820e-06
head(resSig[order(resSig$log2FoldChange, decreasing = TRUE), ])
## log2 fold change (MLE): dex trt vs untrt
## Wald test p-value: dex trt vs untrt
## DataFrame with 6 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue
## <numeric> <numeric> <numeric> <numeric> <numeric>
## ENSG00000179593 67.24305 9.50597 1.054502 9.01465 1.97493e-19
## ENSG00000109906 385.07103 7.35263 0.536390 13.70761 9.14199e-43
## ENSG00000250978 56.31819 6.32738 0.677797 9.33521 1.00787e-20
## ENSG00000127954 286.38412 5.20716 0.493083 10.56042 4.54630e-26
## ENSG00000249364 8.83906 5.09811 1.159614 4.39638 1.10069e-05
## ENSG00000171819 254.88437 5.08157 0.757302 6.71010 1.94486e-11
## padj
## <numeric>
## ENSG00000179593 1.19935e-17
## ENSG00000109906 2.15989e-40
## ENSG00000250978 6.89793e-19
## ENSG00000127954 4.84013e-24
## ENSG00000249364 1.17256e-04
## ENSG00000171819 4.94006e-10
cellRes <- results(dds, contrast = c("cell", "N061011", "N61311"))
summary(cellRes)
##
## out of 33469 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up) : 923, 2.8%
## LFC < 0 (down) : 757, 2.3%
## outliers [1] : 0, 0%
## low counts [2] : 16871, 50%
## (mean count < 8)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
topGene <- rownames(res)[which.min(res$pvalue)]
countData <- plotCounts(dds, gene=topGene, intgroup = c("dex", "cell"), returnData = TRUE)
ggplot(countData, aes(x = dex, y = count, color=cell, group = cell)) +
geom_point() +
# geom_jitter(width = 0.1) +
geom_line()
scale_y_log10()
## <ScaleContinuousPosition>
## Range:
## Limits: 0 -- 1
plotMA(res, ylim = c(-5, 5))
plotMA(res.05, ylim = c(-5, 5))
Select genes to plot
topVarGenes <- head(order(rowVars(assay(rld)), decreasing = TRUE), 20)
mat <- assay(rld)[topVarGenes,]
mat <- mat - rowMeans(mat)
df <- as.data.frame(colData(rld)[,c("cell", "dex")])
pheatmap(mat, annotation_col = df)
res$symbol <- mapIds(org.Hs.eg.db,
keys = row.names(res),
column = "SYMBOL",
keytype = "ENSEMBL",
multiVals = "first")
## 'select()' returned 1:many mapping between keys and columns
res$genename <- mapIds(org.Hs.eg.db,
keys = row.names(res),
keytype = "ENSEMBL",
column = "GENENAME",
multiVals = "first")
## 'select()' returned 1:many mapping between keys and columns
resOrd <- res[order(res$pvalue), ]
resOrdDf <- as.data.frame(resOrd)
write.csv(resOrdDf, file = "results/de_genes.csv")