The workflow for Detect differentially expressed genes from bulk RNA-seq data project, steps 5-7. Previous batch scripts on cluster here.

Load librarys and data

fcdata <- read.table("featurecounts.txt", header=TRUE, row.names=1)
# head(fcdata)

Exploratory data analysis

Check gene expression patterns and remove first five columns, by removing (chr, start, end, strand, length)

fcdata <- fcdata[ ,6:ncol(fcdata)]

Extract file names. i.e. ...bam.NonTNBC3_align.sorted.bam to NonTNBC3

colnames(fcdata) <- gsub("_align.sorted.bam$", "", colnames(fcdata))
colnames(fcdata) <- gsub("^.*bam\\.", "", colnames(fcdata))
# head(fcdata)

Use DESeq2::DESeqDataSetFromMatrix() to read in the counts from FeatureCounts and specifying the experimental group of each sample

fcdata <- as.matrix(fcdata)
group <- factor(c(rep("HER2", 3), rep("NonTNBC", 3), rep("TNBC", 3), rep("Normal", 3)))
condition = factor(c(rep("Tumor", 9), rep("Normal", 3)))
coldata <- data.frame(row.names=colnames(fcdata), group, factor(c("1", "2", "3", "1", "2", "3", "1", "2", "3", "1", "2", "3")), condition)

dds <- DESeq2::DESeqDataSetFromMatrix(countData=fcdata, colData=coldata, design=~group + factor(c("1", "2", "3", "1", "2", "3", "1", "2", "3", "1", "2", "3")))

Run the DESeq2::DESeq()` function and save its output to a variable.

dds <- DESeq2::DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing

Remove the dependence of the variance on the mean by running DESeq2::rlog() on dds. For visualisation, we typically do not want to consider the experimental groups and set blind=TRUE. See here for background.

Assess how the samples cluster based on their gene expression profiles using, for example, DESeq2::plotPCA(). Draw PCA using the 500 most variably expressed genes

png("PCA.png", 1200, 1200, pointsize=20)
DESeq2::plotPCA(dds_rlog, intgroup="group", ntop=500) +
  theme_bw() + # change theme
  geom_point(size=5) + # add point size
  ggtitle(label="Principal Component Analysis (PCA)", subtitle="Top 500 most variable genes")
dev.off()
knitr::include_graphics("PCA.png")

Differential expression analysis

Extract the results for at least one pairwise contrast and order them by adjusted p-value

result1 <- DESeq2::results(dds, contrast=c("group", "TNBC", "NonTNBC"), alpha = 0.05)
result1 <- result1[order(result1$padj), ]

result2 <- DESeq2::results(dds, contrast=c("group",  "HER2", "TNBC"), alpha = 0.05)
result2 <- result2[order(result2$padj), ]

sum(result1$padj < 0.05, na.rm=TRUE)
## [1] 13734
sum(result2$padj < 0.05, na.rm=TRUE)
## [1] 14514
summary(result1)
## 
## out of 56167 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)       : 4152, 7.4%
## LFC < 0 (down)     : 9582, 17%
## outliers [1]       : 0, 0%
## low counts [2]     : 19519, 35%
## (mean count < 3)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
summary(result2)
## 
## out of 56167 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)       : 9362, 17%
## LFC < 0 (down)     : 5152, 9.2%
## outliers [1]       : 0, 0%
## low counts [2]     : 17350, 31%
## (mean count < 2)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results

Mean expression against log-fold change. Genes with p-adjusted below alpha will be shown in blue, all others in grey. These figures are not very often used now.

png("MA.png", 1200, 1200, pointsize=20)
par(mfrow=c(2,1))
plotMA(result1, main="TNBC vs NonTNBC", alpha=0.05)
plotMA(result2, main="HER2 vs. TNBC", alpha=0.05)
dev.off()
knitr::include_graphics("MA.png")

Heatmap using the 50 most highly expressed genes

png("Heatmap.png", 1200, 1200, pointsize=20)

select <- order(rowMeans(counts(dds,normalized=TRUE)),decreasing=TRUE)[1:50]
hmcol <- colorRampPalette(brewer.pal(9, "GnBu"))(100)
heatmap.2(assay(dds_rlog)[select,], col = hmcol, trace="none", margin=c(10,6),labCol=colnames(dds), cexRow = 1/log10(length(select)))
dev.off()
knitr::include_graphics("Heatmap.png")

png("vulcano.png", 1200, 1200, pointsize=20)

vol_data1 <- data.frame(gene=row.names(result1), pval=-log10(result1$padj), lfc=result1$log2FoldChange)
# remove na
vol_data1 <- na.omit(vol_data1)
# set upper and lower threshold
vol_data1 <- mutate(vol_data1, color=case_when(
    vol_data1$lfc > 0 & vol_data1$pval > 1.3 ~ "Increased",
    vol_data1$lfc < 0 & vol_data1$pval > 1.3 ~ "Decreased",
    vol_data1$pval < 1.3 ~ "nonsignificant"))
vol1 <- ggplot(vol_data1, aes(x=lfc, y=pval, color=color))

vol_data2 <- data.frame(gene=row.names(result2), pval=-log10(result2$padj), lfc=result2$log2FoldChange)
# remove na
vol_data2 <- na.omit(vol_data2)
# set upper and lower threshold
vol_data2 <- mutate(vol_data2, color=case_when(
    vol_data2$lfc > 0 & vol_data2$pval > 1.3 ~ "Increased",
    vol_data2$lfc < 0 & vol_data2$pval > 1.3 ~ "Decreased",
    vol_data2$pval < 1.3 ~ "nonsignificant"))
vol2 <- ggplot(vol_data2, aes(x=lfc, y=pval, color=color))
library(cowplot)
plot_grid(vol1 +  ggtitle(label="Volcano Plot") +
            geom_point(size=2.5, alpha=0.8, na.rm=T) +
  scale_color_manual(name="Directionality",
                     values=c(Increased="#008B00", Decreased="#CD4F39",
                              nonsignificant="darkgray")) +
theme_bw(base_size=14) +
theme(legend.position="right") +
xlab(expression(log[2]("TNBC vs. NonTNBC"))) +
ylab(expression(-log[10]("adjusted p-value"))) +
geom_hline(yintercept=1.3, colour="darkgrey") +
scale_y_continuous(trans="log1p")

, vol2 + geom_point(size=2.5, alpha=0.8, na.rm=T) +
  scale_color_manual(name="Directionality",
                     values=c(Increased="#008B00", Decreased="#CD4F39",
                              nonsignificant="darkgray")) +
theme_bw(base_size=14) +
theme(legend.position="right") +
xlab(expression(log[2]("HER2 vs. TNBC"))) +
ylab(expression(-log[10]("adjusted p-value"))) +
geom_hline(yintercept=1.3, colour="darkgrey") +
scale_y_continuous(trans="log1p")
, byrow = TRUE, nrow = 2)
dev.off()
## png 
##   2
knitr::include_graphics("vulcano.png")

Merge with normalized count data

resdata1 <- merge(as.data.frame(result1), as.data.frame(counts(dds, normalized=TRUE)), by="row.names")
resdata2 <- merge(as.data.frame(result2), as.data.frame(counts(dds, normalized=TRUE)), by="row.names")
names(resdata1)[1] <- "Gene"
names(resdata2)[1] <- "Gene"
# write csv
write.csv(resdata1, file="diffexpr-results_1.csv")
write.csv(resdata2, file="diffexpr-results_2.csv")
# Expression comparison of chosen genes 
png("expression.png", 1200, 1200, pointsize=20)
par(mfrow=c(2,2))

SPARC <- plotCounts(dds, "ENSG00000113140", returnData = TRUE)
boxplot(count ~ group , data=SPARC, main = "Expression of SPARC")
boxplot(count ~ condition , data=SPARC, main = "Expression of SPARC")

HRAS <- plotCounts(dds,"ENSG00000174775", returnData = TRUE)
plot(count ~ group , data=HRAS, main = "Expression of HRAS")
plot(count ~ condition , data=HRAS, main = "Expression of HRAS")
dev.off()
knitr::include_graphics("expression.png")

# Output data frame 
as.data.frame(SPARC)
as.data.frame(HRAS)

Overrepresentation analysis

# GO of pairwise comparison
data1     <-  resdata1
geneList <- as.character(data1$Gene)
data(geneList)
de       <- names(geneList)[abs(geneList) > 2]

gene.df1 <- bitr(de, fromType = "ENTREZID",
                 toType = c("ENSEMBL", "SYMBOL"),
                 OrgDb = "org.Hs.eg.db")
## 'select()' returned 1:many mapping between keys and columns
## Warning in bitr(de, fromType = "ENTREZID", toType = c("ENSEMBL", "SYMBOL"), :
## 0.48% of input gene IDs are fail to map...
data2     <-  resdata2
geneList <- as.character(data2$Gene)
data(geneList)
de       <- names(geneList)[abs(geneList) > 2]

gene.df2 <- bitr(de, fromType = "ENTREZID",
                 toType = c("ENSEMBL", "SYMBOL"),
                 OrgDb = "org.Hs.eg.db")
## 'select()' returned 1:many mapping between keys and columns
## Warning in bitr(de, fromType = "ENTREZID", toType = c("ENSEMBL", "SYMBOL"), :
## 0.48% of input gene IDs are fail to map...
# GO 
ego1  <- enrichGO(gene         = gene.df1$ENSEMBL,
                  OrgDb         = "org.Hs.eg.db",
                  keyType       = 'ENSEMBL',
                  ont           = "BP",
                  pAdjustMethod = "BH",
                  pvalueCutoff  = 0.05,
                  readable      = TRUE)
head(ego1)
ego2  <- enrichGO(gene         = gene.df2$ENSEMBL,
                  OrgDb         = "org.Hs.eg.db",
                  keyType       = 'ENSEMBL',
                  ont           = "BP",
                  pAdjustMethod = "BH",
                  pvalueCutoff  = 0.05,
                  readable      = TRUE)
head(ego2)
# Visualization dot plot
png("dotplot1.png", 1200, 1200, pointsize=20)
dotplot(ego1, showCategory=30) + ggtitle("Over-representation analysis")
dev.off()
## png 
##   2
png("dotplot2.png", 1200, 1200, pointsize=20)
dotplot(ego2, showCategory=30) + ggtitle("Over-representation analysis")
dev.off()
## png 
##   2
knitr::include_graphics(c("dotplot1.png", "dotplot2.png"))

SessionInfo

sessionInfo()
## R version 4.1.2 (2021-11-01)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 22000)
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=Chinese (Simplified)_China.936 
## [2] LC_CTYPE=Chinese (Simplified)_China.936   
## [3] LC_MONETARY=Chinese (Simplified)_China.936
## [4] LC_NUMERIC=C                              
## [5] LC_TIME=Chinese (Simplified)_China.936    
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] cowplot_1.1.1               gplots_3.1.1               
##  [3] RColorBrewer_1.1-2          ggupset_0.3.0              
##  [5] ggstance_0.3.5.9000         enrichplot_1.14.1          
##  [7] forcats_0.5.1               dplyr_1.0.7                
##  [9] ggplot2_3.3.5               DOSE_3.20.1                
## [11] org.Hs.eg.db_3.14.0         AnnotationDbi_1.56.2       
## [13] clusterProfiler_4.2.2       DESeq2_1.34.0              
## [15] SummarizedExperiment_1.24.0 Biobase_2.54.0             
## [17] MatrixGenerics_1.6.0        matrixStats_0.61.0         
## [19] GenomicRanges_1.46.1        GenomeInfoDb_1.30.0        
## [21] IRanges_2.28.0              S4Vectors_0.32.3           
## [23] BiocGenerics_0.40.0        
## 
## loaded via a namespace (and not attached):
##   [1] shadowtext_0.1.1       fastmatch_1.1-3        plyr_1.8.6            
##   [4] igraph_1.2.11          lazyeval_0.2.2         splines_4.1.2         
##   [7] BiocParallel_1.28.3    digest_0.6.28          yulab.utils_0.0.4     
##  [10] htmltools_0.5.2        GOSemSim_2.20.0        viridis_0.6.2         
##  [13] GO.db_3.14.0           fansi_0.5.0            magrittr_2.0.1        
##  [16] memoise_2.0.1          Biostrings_2.62.0      annotate_1.72.0       
##  [19] graphlayouts_0.8.0     colorspace_2.0-2       blob_1.2.2            
##  [22] ggrepel_0.9.1          xfun_0.29              crayon_1.4.2          
##  [25] RCurl_1.98-1.5         jsonlite_1.7.3         scatterpie_0.1.7      
##  [28] genefilter_1.76.0      survival_3.2-13        ape_5.6-1             
##  [31] glue_1.6.0             polyclip_1.10-0        gtable_0.3.0          
##  [34] zlibbioc_1.40.0        XVector_0.34.0         DelayedArray_0.20.0   
##  [37] scales_1.1.1           DBI_1.1.2              Rcpp_1.0.8            
##  [40] viridisLite_0.4.0      xtable_1.8-4           gridGraphics_0.5-1    
##  [43] tidytree_0.3.7         bit_4.0.4              httr_1.4.2            
##  [46] fgsea_1.20.0           ellipsis_0.3.2         pkgconfig_2.0.3       
##  [49] XML_3.99-0.8           farver_2.1.0           sass_0.4.0            
##  [52] locfit_1.5-9.4         utf8_1.2.2             ggplotify_0.1.0       
##  [55] tidyselect_1.1.1       labeling_0.4.2         rlang_0.4.11          
##  [58] reshape2_1.4.4         munsell_0.5.0          tools_4.1.2           
##  [61] cachem_1.0.6           downloader_0.4         generics_0.1.1        
##  [64] RSQLite_2.2.9          evaluate_0.14          stringr_1.4.0         
##  [67] fastmap_1.1.0          yaml_2.2.1             ggtree_3.2.1          
##  [70] knitr_1.37             bit64_4.0.5            tidygraph_1.2.0       
##  [73] caTools_1.18.2         purrr_0.3.4            KEGGREST_1.34.0       
##  [76] ggraph_2.0.5           nlme_3.1-155           aplot_0.1.2           
##  [79] DO.db_2.9              compiler_4.1.2         png_0.1-7             
##  [82] treeio_1.18.1          tibble_3.1.6           tweenr_1.0.2          
##  [85] geneplotter_1.72.0     bslib_0.3.1            stringi_1.7.6         
##  [88] highr_0.9              lattice_0.20-45        Matrix_1.4-0          
##  [91] vctrs_0.3.8            pillar_1.6.4           lifecycle_1.0.1       
##  [94] jquerylib_0.1.4        data.table_1.14.2      bitops_1.0-7          
##  [97] patchwork_1.1.1        qvalue_2.26.0          R6_2.5.1              
## [100] KernSmooth_2.23-20     gridExtra_2.3          MASS_7.3-55           
## [103] gtools_3.9.2           assertthat_0.2.1       withr_2.4.3           
## [106] GenomeInfoDbData_1.2.7 parallel_4.1.2         grid_4.1.2            
## [109] ggfun_0.0.5            tidyr_1.1.4            rmarkdown_2.11        
## [112] ggforce_0.3.3