SQ NOTE: use this to process the mouse bulk RNAseq data, three FAD vs three Ctrl samples. can run all, no problem.
By default, the ctrl (ref) samples will be on the left side of the csv file, and fad sample on the right side of csv file.
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install("DESeq2")
library(DESeq2)
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: BiocGenerics
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind, colnames,
## dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
## union, unique, unsplit, which.max, which.min
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
## Loading required package: IRanges
##
## Attaching package: 'IRanges'
## The following object is masked from 'package:grDevices':
##
## windows
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
##
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
##
## colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
## colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
## colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
## colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
## colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
## colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
## colWeightedMeans, colWeightedMedians, colWeightedSds,
## colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
## rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
## rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
## rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
## rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
## rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
## rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
## rowWeightedSds, rowWeightedVars
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
##
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
##
## rowMedians
## The following objects are masked from 'package:matrixStats':
##
## anyMissing, rowMedians
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.2.1
library(org.Mm.eg.db)
## Loading required package: AnnotationDbi
##
ls
## MM_STAR_dir
## Mus_musculus.GRCm39.109.gtf
## Mus_musculus.GRCm39.dna_sm.primary_assembly.fa
## Sanbomics_RNAseq_video6.ipynb
## bams
## count.out
## count.out.summary
## count_table.csv
## deseq_results_all_genes.csv
## deseq_results_sig_genes.csv
## diff_expr.Rmd
## diff_expr_MS.Rmd
## diff_expr_MS.html
## diff_expr_MS.nb.html
## ensemble_key_mapper_mm.csv
## fastq
## heatmap_v1.pdf
## heatmap_v1.png
## mapped
## ref
## rnaseq_ms.Rproj
## rsconnect
Counts <- read.delim("count_table.csv", header = TRUE, row.names = 1, sep = ",")
Counts
Counts <- Counts[which(rowSums(Counts) > 25),]
Counts
condition <- factor(c("wt","wt","wt","fad","fad","fad"))
coldata <- data.frame(row.names = colnames(Counts), condition)
coldata
dds <- DESeqDataSetFromMatrix(countData = Counts, colData = coldata, design = ~condition)
dds$condition <- relevel(dds$condition, ref = "wt")
dds <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
vsdata <- vst(dds, blind=FALSE)
plotPCA(vsdata, intgroup = "condition")+ geom_text(aes(label=name),vjust=2)
plotDispEsts(dds)
resultsNames(dds)
## [1] "Intercept" "condition_fad_vs_wt"
res <- results(dds)
res
## log2 fold change (MLE): condition fad vs wt
## Wald test p-value: condition fad vs wt
## DataFrame with 21326 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue
## <numeric> <numeric> <numeric> <numeric> <numeric>
## ENSMUSG00000100480 8.80389 -1.1773226 0.802749 -1.466614 0.142481
## ENSMUSG00000051285 4597.07526 0.0439428 0.110797 0.396606 0.691658
## ENSMUSG00000103509 382.25020 0.1376425 0.239029 0.575840 0.564723
## ENSMUSG00000048538 4.54758 -1.7152120 1.250257 -1.371887 0.170099
## ENSMUSG00000097797 29.82424 -0.3425033 0.532200 -0.643561 0.519860
## ... ... ... ... ... ...
## ENSMUSG00000064368 18875.954 -0.2148991 0.0921759 -2.331402 0.0197321
## ENSMUSG00000064370 176355.018 -0.1403333 0.1078287 -1.301447 0.1931055
## ENSMUSG00000064372 1230.703 -0.0946784 0.1440312 -0.657346 0.5109582
## ENSMUSG00000095041 9465.453 0.9065034 0.5485486 1.652549 0.0984227
## ENSMUSG00000095742 212.973 0.0377310 0.2497472 0.151077 0.8799153
## padj
## <numeric>
## ENSMUSG00000100480 0.934367
## ENSMUSG00000051285 0.992713
## ENSMUSG00000103509 0.989088
## ENSMUSG00000048538 0.944566
## ENSMUSG00000097797 0.989088
## ... ...
## ENSMUSG00000064368 0.714208
## ENSMUSG00000064370 0.953216
## ENSMUSG00000064372 0.989088
## ENSMUSG00000095041 0.889714
## ENSMUSG00000095742 0.997788
summary(res) #summary of results
##
## out of 21326 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up) : 7, 0.033%
## LFC < 0 (down) : 67, 0.31%
## outliers [1] : 8, 0.038%
## low counts [2] : 0, 0%
## (mean count < 4)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
res <- res[order(res$padj, decreasing = FALSE),]
head(res)
## log2 fold change (MLE): condition fad vs wt
## Wald test p-value: condition fad vs wt
## DataFrame with 6 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue
## <numeric> <numeric> <numeric> <numeric> <numeric>
## ENSMUSG00000048108 269.8040 -4.14532 0.421351 -9.83816 7.71079e-23
## ENSMUSG00000068323 93.3203 -5.53643 0.621219 -8.91221 5.00247e-19
## ENSMUSG00000004655 134.0642 -4.24178 0.514565 -8.24343 1.67337e-16
## ENSMUSG00000019232 498.5344 -1.41205 0.177111 -7.97267 1.55282e-15
## ENSMUSG00000039672 68.1150 -5.39696 0.678155 -7.95829 1.74437e-15
## ENSMUSG00000026579 166.2382 -3.33617 0.434970 -7.66990 1.72127e-14
## padj
## <numeric>
## ENSMUSG00000048108 1.64379e-18
## ENSMUSG00000068323 5.33213e-15
## ENSMUSG00000004655 1.18910e-12
## ENSMUSG00000019232 7.43730e-12
## ENSMUSG00000039672 7.43730e-12
## ENSMUSG00000026579 6.11568e-11
res.df <- as.data.frame(res)
res.df
res.df$symbol <- mapIds(org.Mm.eg.db, keys = rownames(res.df), keytype = "ENSEMBL", column = "SYMBOL")
## 'select()' returned 1:many mapping between keys and columns
write.csv(res, file = "deseq_results_all_genes.csv")
sigs <- na.omit(res)
sigs <- sigs[sigs$padj<0.05,]
sigs <- sigs[order(sigs$padj, decreasing = FALSE),]
sigs.df <- as.data.frame(sigs)
# additional filters to apply if needed
# sigs.df <- sigs.df[(sigs.df$baseMean > 50) & (abs(sigs.df$log2FoldChange) > 1.2),]
sigs.df$symbol <- mapIds(org.Mm.eg.db, keys = rownames(sigs.df), keytype = "ENSEMBL", column = "SYMBOL")
## 'select()' returned 1:1 mapping between keys and columns
sigs.df
write.csv(sigs, file = "deseq_results_sig_genes.csv")
ls
## MM_STAR_dir
## Mus_musculus.GRCm39.109.gtf
## Mus_musculus.GRCm39.dna_sm.primary_assembly.fa
## Sanbomics_RNAseq_video6.ipynb
## bams
## count.out
## count.out.summary
## count_table.csv
## deseq_results_all_genes.csv
## deseq_results_sig_genes.csv
## diff_expr.Rmd
## diff_expr_MS.Rmd
## diff_expr_MS.html
## diff_expr_MS.nb.html
## diff_expr_MS_files
## ensemble_key_mapper_mm.csv
## fastq
## heatmap_v1.pdf
## heatmap_v1.png
## mapped
## ref
## rnaseq_ms.Rproj
## rsconnect
sigs
## log2 fold change (MLE): condition fad vs wt
## Wald test p-value: condition fad vs wt
## DataFrame with 59 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue
## <numeric> <numeric> <numeric> <numeric> <numeric>
## ENSMUSG00000048108 269.8040 -4.14532 0.421351 -9.83816 7.71079e-23
## ENSMUSG00000068323 93.3203 -5.53643 0.621219 -8.91221 5.00247e-19
## ENSMUSG00000004655 134.0642 -4.24178 0.514565 -8.24343 1.67337e-16
## ENSMUSG00000019232 498.5344 -1.41205 0.177111 -7.97267 1.55282e-15
## ENSMUSG00000039672 68.1150 -5.39696 0.678155 -7.95829 1.74437e-15
## ... ... ... ... ... ...
## ENSMUSG00000082045 7.93306 6.556752 1.693515 3.87168 0.000108087
## ENSMUSG00000037474 281.29224 0.874760 0.226225 3.86676 0.000110290
## ENSMUSG00000041577 368.76267 -1.005360 0.260310 -3.86217 0.000112385
## ENSMUSG00000032338 425.76013 -0.798469 0.207023 -3.85691 0.000114831
## ENSMUSG00000004098 176.61809 -0.866527 0.225835 -3.83699 0.000124554
## padj
## <numeric>
## ENSMUSG00000048108 1.64379e-18
## ENSMUSG00000068323 5.33213e-15
## ENSMUSG00000004655 1.18910e-12
## ENSMUSG00000019232 7.43730e-12
## ENSMUSG00000039672 7.43730e-12
## ... ...
## ENSMUSG00000082045 0.0418945
## ENSMUSG00000037474 0.0419849
## ENSMUSG00000041577 0.0420320
## ENSMUSG00000032338 0.0422064
## ENSMUSG00000004098 0.0450040
df <- as.data.frame(sigs)
df
ensembl_map <- read.csv('ensemble_key_mapper_mm.csv', header = FALSE)
keys <- ensembl_map$V1
values <- ensembl_map$V2
l <- list()
for (i in 1:length(keys)){
l[keys[i]] <- values[i]
}
#for non-mapped labels
no_values <- setdiff(rownames(df), keys)
for (i in 1:length(no_values)){
l[no_values[i]] <- 'NA'
}
df$symbol <- unlist(l[rownames(df)], use.names = FALSE)
df.top <- df[ (df$baseMean > 40) & (abs(df$log2FoldChange) > 0.75),]
# df.top <- df[ (df$baseMean > 40) & (abs(df$log2FoldChange) > 1.1),]
df.top
df.top <- df.top[order(df.top$log2FoldChange, decreasing = TRUE),]
rlog_out <- rlog(dds, blind=FALSE) #get normalized count data from dds object
mat<-assay(rlog_out)[rownames(df.top), rownames(coldata)] #sig genes x samples
colnames(mat) <- rownames(coldata)
base_mean <- rowMeans(mat)
mat.scaled <- t(apply(mat, 1, scale)) #center and scale each column (Z-score) then transpose
colnames(mat.scaled)<-colnames(mat)
num_keep <- 25
#1 to num_keep len-num_keep to len
rows_keep <- c(seq(1:num_keep), seq((nrow(mat.scaled)-num_keep), nrow(mat.scaled)) )
l2_val <- as.matrix(df.top[rows_keep,]$log2FoldChange) #getting log2 value for each gene we are keeping
colnames(l2_val)<-"logFC"
mean <- as.matrix(df.top[rows_keep,]$baseMean) #getting mean value for each gene we are keeping
colnames(mean)<-"AveExpr"
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install("ComplexHeatmap")
library(ComplexHeatmap)
## Warning: package 'ComplexHeatmap' was built under R version 4.2.1
## Loading required package: grid
## ========================================
## ComplexHeatmap version 2.12.1
## Bioconductor page: http://bioconductor.org/packages/ComplexHeatmap/
## Github page: https://github.com/jokergoo/ComplexHeatmap
## Documentation: http://jokergoo.github.io/ComplexHeatmap-reference
##
## If you use it in published research, please cite either one:
## - Gu, Z. Complex heatmaps reveal patterns and correlations in multidimensional
## genomic data. Bioinformatics 2016.
## - Gu, Z. Complex Heatmap Visualization. iMeta 2022.
##
##
## The new InteractiveComplexHeatmap package can directly export static
## complex heatmaps into an interactive Shiny app with zero effort. Have a try!
##
## This message can be suppressed by:
## suppressPackageStartupMessages(library(ComplexHeatmap))
## ========================================
library(RColorBrewer)
library(circlize)
## ========================================
## circlize version 0.4.15
## CRAN page: https://cran.r-project.org/package=circlize
## Github page: https://github.com/jokergoo/circlize
## Documentation: https://jokergoo.github.io/circlize_book/book/
##
## If you use it in published research, please cite:
## Gu, Z. circlize implements and enhances circular visualization
## in R. Bioinformatics 2014.
##
## This message can be suppressed by:
## suppressPackageStartupMessages(library(circlize))
## ========================================
#maps values between b/w/r for min and max l2 values
col_logFC <- colorRamp2(c(min(l2_val),0, max(l2_val)), c("blue", "white", "red"))
#maps between 0% quantile, and 75% quantile of mean values --- 0, 25, 50, 75, 100
col_AveExpr <- colorRamp2(c(quantile(mean)[1], quantile(mean)[4]), c("white", "red"))
ha <- HeatmapAnnotation(summary = anno_summary(gp = gpar(fill = 2),
height = unit(2, "cm")))
h1 <- Heatmap(mat.scaled[rows_keep,], cluster_rows = F,
column_labels = colnames(mat.scaled), name="Z-score",
cluster_columns = T)
h2 <- Heatmap(l2_val, row_labels = df.top$symbol[rows_keep],
cluster_rows = F, name="logFC", top_annotation = ha, col = col_logFC,
cell_fun = function(j, i, x, y, w, h, col) { # add text to each grid
grid.text(round(l2_val[i, j],2), x, y)
})
h3 <- Heatmap(mean, row_labels = df.top$symbol[rows_keep],
cluster_rows = F, name = "AveExpr", col=col_AveExpr,
cell_fun = function(j, i, x, y, w, h, col) { # add text to each grid
grid.text(round(mean[i, j],2), x, y)
})
h<-h1+h2+h3
h
png("./heatmap_v1.png", res = 300, width = 3000, height = 5500)
print(h)
dev.off()
## png
## 2
pdf("./heatmap_v1.pdf", width = 6, height = 9, pointsize = 6 )
print(h)
dev.off()
sessionInfo()
## R version 4.2.0 (2022-04-22 ucrt)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19044)
##
## Matrix products: default
##
## locale:
## [1] LC_COLLATE=English_United States.utf8
## [2] LC_CTYPE=English_United States.utf8
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.utf8
##
## attached base packages:
## [1] grid stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] circlize_0.4.15 RColorBrewer_1.1-3
## [3] ComplexHeatmap_2.12.1 org.Mm.eg.db_3.15.0
## [5] AnnotationDbi_1.58.0 ggplot2_3.3.6
## [7] DESeq2_1.36.0 SummarizedExperiment_1.26.1
## [9] Biobase_2.56.0 MatrixGenerics_1.8.1
## [11] matrixStats_0.62.0 GenomicRanges_1.48.0
## [13] GenomeInfoDb_1.32.2 IRanges_2.30.0
## [15] S4Vectors_0.34.0 BiocGenerics_0.42.0
##
## loaded via a namespace (and not attached):
## [1] bitops_1.0-7 bit64_4.0.5 doParallel_1.0.17
## [4] httr_1.4.3 tools_4.2.0 bslib_0.3.1
## [7] utf8_1.2.2 R6_2.5.1 DBI_1.1.3
## [10] colorspace_2.0-3 GetoptLong_1.0.5 withr_2.5.0
## [13] tidyselect_1.1.2 bit_4.0.4 compiler_4.2.0
## [16] cli_3.3.0 Cairo_1.5-15 DelayedArray_0.22.0
## [19] labeling_0.4.2 sass_0.4.1 scales_1.2.0
## [22] genefilter_1.78.0 stringr_1.4.0 digest_0.6.29
## [25] rmarkdown_2.14 XVector_0.36.0 pkgconfig_2.0.3
## [28] htmltools_0.5.2 fastmap_1.1.0 highr_0.9
## [31] GlobalOptions_0.1.2 rlang_1.0.3 rstudioapi_0.13
## [34] RSQLite_2.2.14 shape_1.4.6 jquerylib_0.1.4
## [37] generics_0.1.2 farver_2.1.0 jsonlite_1.8.0
## [40] BiocParallel_1.30.3 dplyr_1.0.9 RCurl_1.98-1.6
## [43] magrittr_2.0.3 GenomeInfoDbData_1.2.8 Matrix_1.5-3
## [46] Rcpp_1.0.8.3 munsell_0.5.0 fansi_1.0.3
## [49] lifecycle_1.0.1 stringi_1.7.6 yaml_2.3.5
## [52] zlibbioc_1.42.0 blob_1.2.3 parallel_4.2.0
## [55] crayon_1.5.1 lattice_0.20-45 Biostrings_2.64.0
## [58] splines_4.2.0 annotate_1.74.0 KEGGREST_1.36.2
## [61] locfit_1.5-9.5 knitr_1.39 pillar_1.7.0
## [64] rjson_0.2.21 geneplotter_1.74.0 codetools_0.2-18
## [67] XML_3.99-0.10 glue_1.6.2 evaluate_0.15
## [70] png_0.1-7 vctrs_0.4.1 foreach_1.5.2
## [73] gtable_0.3.0 purrr_0.3.4 clue_0.3-62
## [76] assertthat_0.2.1 cachem_1.0.6 xfun_0.31
## [79] xtable_1.8-4 survival_3.3-1 tibble_3.1.7
## [82] iterators_1.0.14 memoise_2.0.1 cluster_2.1.3
## [85] ellipsis_0.3.2