The workflow for Detect differentially expressed genes from bulk RNA-seq data project, steps 5-7. Previous batch scripts on cluster here.
fcdata <- read.table("featurecounts.txt", header=TRUE, row.names=1)
# head(fcdata)
Check gene expression patterns and remove first five columns, by removing (chr, start, end, strand, length)
fcdata <- fcdata[ ,6:ncol(fcdata)]
Extract file names. i.e. ...bam.NonTNBC3_align.sorted.bam to NonTNBC3
colnames(fcdata) <- gsub("_align.sorted.bam$", "", colnames(fcdata))
colnames(fcdata) <- gsub("^.*bam\\.", "", colnames(fcdata))
# head(fcdata)
Use DESeq2::DESeqDataSetFromMatrix() to read in the counts from FeatureCounts and specifying the experimental group of each sample
fcdata <- as.matrix(fcdata)
group <- factor(c(rep("HER2", 3), rep("NonTNBC", 3), rep("TNBC", 3), rep("Normal", 3)))
condition = factor(c(rep("Tumor", 9), rep("Normal", 3)))
coldata <- data.frame(row.names=colnames(fcdata), group, factor(c("1", "2", "3", "1", "2", "3", "1", "2", "3", "1", "2", "3")), condition)
dds <- DESeq2::DESeqDataSetFromMatrix(countData=fcdata, colData=coldata, design=~group + factor(c("1", "2", "3", "1", "2", "3", "1", "2", "3", "1", "2", "3")))
Run the DESeq2::DESeq()` function and save its output to a variable.
dds <- DESeq2::DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
Remove the dependence of the variance on the mean by running DESeq2::rlog() on dds. For visualisation, we typically do not want to consider the experimental groups and set blind=TRUE. See here for background.
Assess how the samples cluster based on their gene expression profiles using, for example, DESeq2::plotPCA(). Draw PCA using the 500 most variably expressed genes
png("PCA.png", 1200, 1200, pointsize=20)
DESeq2::plotPCA(dds_rlog, intgroup="group", ntop=500) +
theme_bw() + # change theme
geom_point(size=5) + # add point size
ggtitle(label="Principal Component Analysis (PCA)", subtitle="Top 500 most variable genes")
dev.off()
knitr::include_graphics("PCA.png")
Extract the results for at least one pairwise contrast and order them by adjusted p-value
result1 <- DESeq2::results(dds, contrast=c("group", "TNBC", "NonTNBC"), alpha = 0.05)
result1 <- result1[order(result1$padj), ]
result2 <- DESeq2::results(dds, contrast=c("group", "HER2", "TNBC"), alpha = 0.05)
result2 <- result2[order(result2$padj), ]
sum(result1$padj < 0.05, na.rm=TRUE)
## [1] 13734
sum(result2$padj < 0.05, na.rm=TRUE)
## [1] 14514
summary(result1)
##
## out of 56167 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 4152, 7.4%
## LFC < 0 (down) : 9582, 17%
## outliers [1] : 0, 0%
## low counts [2] : 19519, 35%
## (mean count < 3)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
summary(result2)
##
## out of 56167 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 9362, 17%
## LFC < 0 (down) : 5152, 9.2%
## outliers [1] : 0, 0%
## low counts [2] : 17350, 31%
## (mean count < 2)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
Mean expression against log-fold change. Genes with p-adjusted below alpha will be shown in blue, all others in grey. These figures are not very often used now.
png("MA.png", 1200, 1200, pointsize=20)
par(mfrow=c(2,1))
plotMA(result1, main="TNBC vs NonTNBC", alpha=0.05)
plotMA(result2, main="HER2 vs. TNBC", alpha=0.05)
dev.off()
knitr::include_graphics("MA.png")
Heatmap using the 50 most highly expressed genes
png("Heatmap.png", 1200, 1200, pointsize=20)
select <- order(rowMeans(counts(dds,normalized=TRUE)),decreasing=TRUE)[1:50]
hmcol <- colorRampPalette(brewer.pal(9, "GnBu"))(100)
heatmap.2(assay(dds_rlog)[select,], col = hmcol, trace="none", margin=c(10,6),labCol=colnames(dds), cexRow = 1/log10(length(select)))
dev.off()
knitr::include_graphics("Heatmap.png")
png("vulcano.png", 1200, 1200, pointsize=20)
vol_data1 <- data.frame(gene=row.names(result1), pval=-log10(result1$padj), lfc=result1$log2FoldChange)
# remove na
vol_data1 <- na.omit(vol_data1)
# set upper and lower threshold
vol_data1 <- mutate(vol_data1, color=case_when(
vol_data1$lfc > 0 & vol_data1$pval > 1.3 ~ "Increased",
vol_data1$lfc < 0 & vol_data1$pval > 1.3 ~ "Decreased",
vol_data1$pval < 1.3 ~ "nonsignificant"))
vol1 <- ggplot(vol_data1, aes(x=lfc, y=pval, color=color))
vol_data2 <- data.frame(gene=row.names(result2), pval=-log10(result2$padj), lfc=result2$log2FoldChange)
# remove na
vol_data2 <- na.omit(vol_data2)
# set upper and lower threshold
vol_data2 <- mutate(vol_data2, color=case_when(
vol_data2$lfc > 0 & vol_data2$pval > 1.3 ~ "Increased",
vol_data2$lfc < 0 & vol_data2$pval > 1.3 ~ "Decreased",
vol_data2$pval < 1.3 ~ "nonsignificant"))
vol2 <- ggplot(vol_data2, aes(x=lfc, y=pval, color=color))
library(cowplot)
plot_grid(vol1 + ggtitle(label="Volcano Plot") +
geom_point(size=2.5, alpha=0.8, na.rm=T) +
scale_color_manual(name="Directionality",
values=c(Increased="#008B00", Decreased="#CD4F39",
nonsignificant="darkgray")) +
theme_bw(base_size=14) +
theme(legend.position="right") +
xlab(expression(log[2]("TNBC vs. NonTNBC"))) +
ylab(expression(-log[10]("adjusted p-value"))) +
geom_hline(yintercept=1.3, colour="darkgrey") +
scale_y_continuous(trans="log1p")
, vol2 + geom_point(size=2.5, alpha=0.8, na.rm=T) +
scale_color_manual(name="Directionality",
values=c(Increased="#008B00", Decreased="#CD4F39",
nonsignificant="darkgray")) +
theme_bw(base_size=14) +
theme(legend.position="right") +
xlab(expression(log[2]("HER2 vs. TNBC"))) +
ylab(expression(-log[10]("adjusted p-value"))) +
geom_hline(yintercept=1.3, colour="darkgrey") +
scale_y_continuous(trans="log1p")
, byrow = TRUE, nrow = 2)
dev.off()
## png
## 2
knitr::include_graphics("vulcano.png")
Merge with normalized count data
resdata1 <- merge(as.data.frame(result1), as.data.frame(counts(dds, normalized=TRUE)), by="row.names")
resdata2 <- merge(as.data.frame(result2), as.data.frame(counts(dds, normalized=TRUE)), by="row.names")
names(resdata1)[1] <- "Gene"
names(resdata2)[1] <- "Gene"
# write csv
write.csv(resdata1, file="diffexpr-results_1.csv")
write.csv(resdata2, file="diffexpr-results_2.csv")
# Expression comparison of chosen genes
png("expression.png", 1200, 1200, pointsize=20)
par(mfrow=c(2,2))
SPARC <- plotCounts(dds, "ENSG00000113140", returnData = TRUE)
boxplot(count ~ group , data=SPARC, main = "Expression of SPARC")
boxplot(count ~ condition , data=SPARC, main = "Expression of SPARC")
HRAS <- plotCounts(dds,"ENSG00000174775", returnData = TRUE)
plot(count ~ group , data=HRAS, main = "Expression of HRAS")
plot(count ~ condition , data=HRAS, main = "Expression of HRAS")
dev.off()
knitr::include_graphics("expression.png")
# Output data frame
as.data.frame(SPARC)
as.data.frame(HRAS)
# GO of pairwise comparison
data1 <- resdata1
geneList <- as.character(data1$Gene)
data(geneList)
de <- names(geneList)[abs(geneList) > 2]
gene.df1 <- bitr(de, fromType = "ENTREZID",
toType = c("ENSEMBL", "SYMBOL"),
OrgDb = "org.Hs.eg.db")
## 'select()' returned 1:many mapping between keys and columns
## Warning in bitr(de, fromType = "ENTREZID", toType = c("ENSEMBL", "SYMBOL"), :
## 0.48% of input gene IDs are fail to map...
data2 <- resdata2
geneList <- as.character(data2$Gene)
data(geneList)
de <- names(geneList)[abs(geneList) > 2]
gene.df2 <- bitr(de, fromType = "ENTREZID",
toType = c("ENSEMBL", "SYMBOL"),
OrgDb = "org.Hs.eg.db")
## 'select()' returned 1:many mapping between keys and columns
## Warning in bitr(de, fromType = "ENTREZID", toType = c("ENSEMBL", "SYMBOL"), :
## 0.48% of input gene IDs are fail to map...
# GO
ego1 <- enrichGO(gene = gene.df1$ENSEMBL,
OrgDb = "org.Hs.eg.db",
keyType = 'ENSEMBL',
ont = "BP",
pAdjustMethod = "BH",
pvalueCutoff = 0.05,
readable = TRUE)
head(ego1)
ego2 <- enrichGO(gene = gene.df2$ENSEMBL,
OrgDb = "org.Hs.eg.db",
keyType = 'ENSEMBL',
ont = "BP",
pAdjustMethod = "BH",
pvalueCutoff = 0.05,
readable = TRUE)
head(ego2)
# Visualization dot plot
png("dotplot1.png", 1200, 1200, pointsize=20)
dotplot(ego1, showCategory=30) + ggtitle("Over-representation analysis")
dev.off()
## png
## 2
png("dotplot2.png", 1200, 1200, pointsize=20)
dotplot(ego2, showCategory=30) + ggtitle("Over-representation analysis")
dev.off()
## png
## 2
knitr::include_graphics(c("dotplot1.png", "dotplot2.png"))
sessionInfo()
## R version 4.1.2 (2021-11-01)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 22000)
##
## Matrix products: default
##
## locale:
## [1] LC_COLLATE=Chinese (Simplified)_China.936
## [2] LC_CTYPE=Chinese (Simplified)_China.936
## [3] LC_MONETARY=Chinese (Simplified)_China.936
## [4] LC_NUMERIC=C
## [5] LC_TIME=Chinese (Simplified)_China.936
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] cowplot_1.1.1 gplots_3.1.1
## [3] RColorBrewer_1.1-2 ggupset_0.3.0
## [5] ggstance_0.3.5.9000 enrichplot_1.14.1
## [7] forcats_0.5.1 dplyr_1.0.7
## [9] ggplot2_3.3.5 DOSE_3.20.1
## [11] org.Hs.eg.db_3.14.0 AnnotationDbi_1.56.2
## [13] clusterProfiler_4.2.2 DESeq2_1.34.0
## [15] SummarizedExperiment_1.24.0 Biobase_2.54.0
## [17] MatrixGenerics_1.6.0 matrixStats_0.61.0
## [19] GenomicRanges_1.46.1 GenomeInfoDb_1.30.0
## [21] IRanges_2.28.0 S4Vectors_0.32.3
## [23] BiocGenerics_0.40.0
##
## loaded via a namespace (and not attached):
## [1] shadowtext_0.1.1 fastmatch_1.1-3 plyr_1.8.6
## [4] igraph_1.2.11 lazyeval_0.2.2 splines_4.1.2
## [7] BiocParallel_1.28.3 digest_0.6.28 yulab.utils_0.0.4
## [10] htmltools_0.5.2 GOSemSim_2.20.0 viridis_0.6.2
## [13] GO.db_3.14.0 fansi_0.5.0 magrittr_2.0.1
## [16] memoise_2.0.1 Biostrings_2.62.0 annotate_1.72.0
## [19] graphlayouts_0.8.0 colorspace_2.0-2 blob_1.2.2
## [22] ggrepel_0.9.1 xfun_0.29 crayon_1.4.2
## [25] RCurl_1.98-1.5 jsonlite_1.7.3 scatterpie_0.1.7
## [28] genefilter_1.76.0 survival_3.2-13 ape_5.6-1
## [31] glue_1.6.0 polyclip_1.10-0 gtable_0.3.0
## [34] zlibbioc_1.40.0 XVector_0.34.0 DelayedArray_0.20.0
## [37] scales_1.1.1 DBI_1.1.2 Rcpp_1.0.8
## [40] viridisLite_0.4.0 xtable_1.8-4 gridGraphics_0.5-1
## [43] tidytree_0.3.7 bit_4.0.4 httr_1.4.2
## [46] fgsea_1.20.0 ellipsis_0.3.2 pkgconfig_2.0.3
## [49] XML_3.99-0.8 farver_2.1.0 sass_0.4.0
## [52] locfit_1.5-9.4 utf8_1.2.2 ggplotify_0.1.0
## [55] tidyselect_1.1.1 labeling_0.4.2 rlang_0.4.11
## [58] reshape2_1.4.4 munsell_0.5.0 tools_4.1.2
## [61] cachem_1.0.6 downloader_0.4 generics_0.1.1
## [64] RSQLite_2.2.9 evaluate_0.14 stringr_1.4.0
## [67] fastmap_1.1.0 yaml_2.2.1 ggtree_3.2.1
## [70] knitr_1.37 bit64_4.0.5 tidygraph_1.2.0
## [73] caTools_1.18.2 purrr_0.3.4 KEGGREST_1.34.0
## [76] ggraph_2.0.5 nlme_3.1-155 aplot_0.1.2
## [79] DO.db_2.9 compiler_4.1.2 png_0.1-7
## [82] treeio_1.18.1 tibble_3.1.6 tweenr_1.0.2
## [85] geneplotter_1.72.0 bslib_0.3.1 stringi_1.7.6
## [88] highr_0.9 lattice_0.20-45 Matrix_1.4-0
## [91] vctrs_0.3.8 pillar_1.6.4 lifecycle_1.0.1
## [94] jquerylib_0.1.4 data.table_1.14.2 bitops_1.0-7
## [97] patchwork_1.1.1 qvalue_2.26.0 R6_2.5.1
## [100] KernSmooth_2.23-20 gridExtra_2.3 MASS_7.3-55
## [103] gtools_3.9.2 assertthat_0.2.1 withr_2.4.3
## [106] GenomeInfoDbData_1.2.7 parallel_4.1.2 grid_4.1.2
## [109] ggfun_0.0.5 tidyr_1.1.4 rmarkdown_2.11
## [112] ggforce_0.3.3