R Notebook

SQ NOTE: use this to process the mouse bulk RNAseq data, three FAD vs three Ctrl samples. can run all, no problem.

By default, the ctrl (ref) samples will be on the left side of the csv file, and fad sample on the right side of csv file.

if (!requireNamespace("BiocManager", quietly = TRUE))
    install.packages("BiocManager")
BiocManager::install("DESeq2")

library(DESeq2)

## Loading required package: S4Vectors

## Loading required package: stats4

## Loading required package: BiocGenerics

## 
## Attaching package: 'BiocGenerics'

## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs

## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
##     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which.max, which.min

## 
## Attaching package: 'S4Vectors'

## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname

## Loading required package: IRanges

## 
## Attaching package: 'IRanges'

## The following object is masked from 'package:grDevices':
## 
##     windows

## Loading required package: GenomicRanges

## Loading required package: GenomeInfoDb

## Loading required package: SummarizedExperiment

## Loading required package: MatrixGenerics

## Loading required package: matrixStats

## 
## Attaching package: 'MatrixGenerics'

## The following objects are masked from 'package:matrixStats':
## 
##     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
##     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
##     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
##     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
##     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
##     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
##     colWeightedMeans, colWeightedMedians, colWeightedSds,
##     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
##     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
##     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
##     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
##     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
##     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
##     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
##     rowWeightedSds, rowWeightedVars

## Loading required package: Biobase

## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.

## 
## Attaching package: 'Biobase'

## The following object is masked from 'package:MatrixGenerics':
## 
##     rowMedians

## The following objects are masked from 'package:matrixStats':
## 
##     anyMissing, rowMedians

library(ggplot2)

## Warning: package 'ggplot2' was built under R version 4.2.1

library(org.Mm.eg.db)

## Loading required package: AnnotationDbi

##

ls

## MM_STAR_dir
## Mus_musculus.GRCm39.109.gtf
## Mus_musculus.GRCm39.dna_sm.primary_assembly.fa
## Sanbomics_RNAseq_video6.ipynb
## bams
## count.out
## count.out.summary
## count_table.csv
## deseq_results_all_genes.csv
## deseq_results_sig_genes.csv
## diff_expr.Rmd
## diff_expr_MS.Rmd
## diff_expr_MS.html
## diff_expr_MS.nb.html
## ensemble_key_mapper_mm.csv
## fastq
## heatmap_v1.pdf
## heatmap_v1.png
## mapped
## ref
## rnaseq_ms.Rproj
## rsconnect

Counts <- read.delim("count_table.csv", header = TRUE, row.names = 1, sep = ",")

Counts

Counts <- Counts[which(rowSums(Counts) > 25),]

Counts

condition <- factor(c("wt","wt","wt","fad","fad","fad"))

coldata <- data.frame(row.names = colnames(Counts), condition)

coldata

dds <- DESeqDataSetFromMatrix(countData = Counts, colData = coldata, design = ~condition)
dds$condition <- relevel(dds$condition, ref = "wt")

dds <- DESeq(dds)

## estimating size factors

## estimating dispersions

## gene-wise dispersion estimates

## mean-dispersion relationship

## final dispersion estimates

## fitting model and testing

vsdata <- vst(dds, blind=FALSE)

plotPCA(vsdata, intgroup = "condition")+ geom_text(aes(label=name),vjust=2)

plotDispEsts(dds)

resultsNames(dds)

## [1] "Intercept"           "condition_fad_vs_wt"

res <- results(dds)

res

## log2 fold change (MLE): condition fad vs wt 
## Wald test p-value: condition fad vs wt 
## DataFrame with 21326 rows and 6 columns
##                      baseMean log2FoldChange     lfcSE      stat    pvalue
##                     <numeric>      <numeric> <numeric> <numeric> <numeric>
## ENSMUSG00000100480    8.80389     -1.1773226  0.802749 -1.466614  0.142481
## ENSMUSG00000051285 4597.07526      0.0439428  0.110797  0.396606  0.691658
## ENSMUSG00000103509  382.25020      0.1376425  0.239029  0.575840  0.564723
## ENSMUSG00000048538    4.54758     -1.7152120  1.250257 -1.371887  0.170099
## ENSMUSG00000097797   29.82424     -0.3425033  0.532200 -0.643561  0.519860
## ...                       ...            ...       ...       ...       ...
## ENSMUSG00000064368  18875.954     -0.2148991 0.0921759 -2.331402 0.0197321
## ENSMUSG00000064370 176355.018     -0.1403333 0.1078287 -1.301447 0.1931055
## ENSMUSG00000064372   1230.703     -0.0946784 0.1440312 -0.657346 0.5109582
## ENSMUSG00000095041   9465.453      0.9065034 0.5485486  1.652549 0.0984227
## ENSMUSG00000095742    212.973      0.0377310 0.2497472  0.151077 0.8799153
##                         padj
##                    <numeric>
## ENSMUSG00000100480  0.934367
## ENSMUSG00000051285  0.992713
## ENSMUSG00000103509  0.989088
## ENSMUSG00000048538  0.944566
## ENSMUSG00000097797  0.989088
## ...                      ...
## ENSMUSG00000064368  0.714208
## ENSMUSG00000064370  0.953216
## ENSMUSG00000064372  0.989088
## ENSMUSG00000095041  0.889714
## ENSMUSG00000095742  0.997788

summary(res) #summary of results

## 
## out of 21326 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up)       : 7, 0.033%
## LFC < 0 (down)     : 67, 0.31%
## outliers [1]       : 8, 0.038%
## low counts [2]     : 0, 0%
## (mean count < 4)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results

res <- res[order(res$padj, decreasing = FALSE),]
head(res)

## log2 fold change (MLE): condition fad vs wt 
## Wald test p-value: condition fad vs wt 
## DataFrame with 6 rows and 6 columns
##                     baseMean log2FoldChange     lfcSE      stat      pvalue
##                    <numeric>      <numeric> <numeric> <numeric>   <numeric>
## ENSMUSG00000048108  269.8040       -4.14532  0.421351  -9.83816 7.71079e-23
## ENSMUSG00000068323   93.3203       -5.53643  0.621219  -8.91221 5.00247e-19
## ENSMUSG00000004655  134.0642       -4.24178  0.514565  -8.24343 1.67337e-16
## ENSMUSG00000019232  498.5344       -1.41205  0.177111  -7.97267 1.55282e-15
## ENSMUSG00000039672   68.1150       -5.39696  0.678155  -7.95829 1.74437e-15
## ENSMUSG00000026579  166.2382       -3.33617  0.434970  -7.66990 1.72127e-14
##                           padj
##                      <numeric>
## ENSMUSG00000048108 1.64379e-18
## ENSMUSG00000068323 5.33213e-15
## ENSMUSG00000004655 1.18910e-12
## ENSMUSG00000019232 7.43730e-12
## ENSMUSG00000039672 7.43730e-12
## ENSMUSG00000026579 6.11568e-11

res.df <- as.data.frame(res)

res.df

res.df$symbol <- mapIds(org.Mm.eg.db, keys = rownames(res.df), keytype = "ENSEMBL", column = "SYMBOL")

## 'select()' returned 1:many mapping between keys and columns

write.csv(res, file = "deseq_results_all_genes.csv")

sigs <- na.omit(res)
sigs <- sigs[sigs$padj<0.05,]

sigs <- sigs[order(sigs$padj, decreasing = FALSE),]

sigs.df <- as.data.frame(sigs)
# additional filters to apply if needed
# sigs.df <- sigs.df[(sigs.df$baseMean > 50) & (abs(sigs.df$log2FoldChange) > 1.2),]
sigs.df$symbol <- mapIds(org.Mm.eg.db, keys = rownames(sigs.df), keytype = "ENSEMBL", column = "SYMBOL")

## 'select()' returned 1:1 mapping between keys and columns

sigs.df

write.csv(sigs, file = "deseq_results_sig_genes.csv")

A new section for complexheatmap

ls

## MM_STAR_dir
## Mus_musculus.GRCm39.109.gtf
## Mus_musculus.GRCm39.dna_sm.primary_assembly.fa
## Sanbomics_RNAseq_video6.ipynb
## bams
## count.out
## count.out.summary
## count_table.csv
## deseq_results_all_genes.csv
## deseq_results_sig_genes.csv
## diff_expr.Rmd
## diff_expr_MS.Rmd
## diff_expr_MS.html
## diff_expr_MS.nb.html
## diff_expr_MS_files
## ensemble_key_mapper_mm.csv
## fastq
## heatmap_v1.pdf
## heatmap_v1.png
## mapped
## ref
## rnaseq_ms.Rproj
## rsconnect

sigs

## log2 fold change (MLE): condition fad vs wt 
## Wald test p-value: condition fad vs wt 
## DataFrame with 59 rows and 6 columns
##                     baseMean log2FoldChange     lfcSE      stat      pvalue
##                    <numeric>      <numeric> <numeric> <numeric>   <numeric>
## ENSMUSG00000048108  269.8040       -4.14532  0.421351  -9.83816 7.71079e-23
## ENSMUSG00000068323   93.3203       -5.53643  0.621219  -8.91221 5.00247e-19
## ENSMUSG00000004655  134.0642       -4.24178  0.514565  -8.24343 1.67337e-16
## ENSMUSG00000019232  498.5344       -1.41205  0.177111  -7.97267 1.55282e-15
## ENSMUSG00000039672   68.1150       -5.39696  0.678155  -7.95829 1.74437e-15
## ...                      ...            ...       ...       ...         ...
## ENSMUSG00000082045   7.93306       6.556752  1.693515   3.87168 0.000108087
## ENSMUSG00000037474 281.29224       0.874760  0.226225   3.86676 0.000110290
## ENSMUSG00000041577 368.76267      -1.005360  0.260310  -3.86217 0.000112385
## ENSMUSG00000032338 425.76013      -0.798469  0.207023  -3.85691 0.000114831
## ENSMUSG00000004098 176.61809      -0.866527  0.225835  -3.83699 0.000124554
##                           padj
##                      <numeric>
## ENSMUSG00000048108 1.64379e-18
## ENSMUSG00000068323 5.33213e-15
## ENSMUSG00000004655 1.18910e-12
## ENSMUSG00000019232 7.43730e-12
## ENSMUSG00000039672 7.43730e-12
## ...                        ...
## ENSMUSG00000082045   0.0418945
## ENSMUSG00000037474   0.0419849
## ENSMUSG00000041577   0.0420320
## ENSMUSG00000032338   0.0422064
## ENSMUSG00000004098   0.0450040

df <- as.data.frame(sigs)
df

ensembl_map <- read.csv('ensemble_key_mapper_mm.csv', header = FALSE)
keys <- ensembl_map$V1
values <- ensembl_map$V2
l <- list()
for (i in 1:length(keys)){
  l[keys[i]] <- values[i]
}

#for non-mapped labels
no_values <- setdiff(rownames(df), keys)
for (i in 1:length(no_values)){
  l[no_values[i]] <- 'NA'
}

df$symbol <- unlist(l[rownames(df)], use.names = FALSE)

df.top <- df[ (df$baseMean > 40) & (abs(df$log2FoldChange) > 0.75),]
# df.top <- df[ (df$baseMean > 40) & (abs(df$log2FoldChange) > 1.1),]
df.top

df.top <- df.top[order(df.top$log2FoldChange, decreasing = TRUE),]

rlog_out <- rlog(dds, blind=FALSE) #get normalized count data from dds object
mat<-assay(rlog_out)[rownames(df.top), rownames(coldata)] #sig genes x samples
colnames(mat) <- rownames(coldata)
base_mean <- rowMeans(mat)
mat.scaled <- t(apply(mat, 1, scale)) #center and scale each column (Z-score) then transpose
colnames(mat.scaled)<-colnames(mat)

num_keep <- 25
#1 to num_keep len-num_keep to len
rows_keep <- c(seq(1:num_keep), seq((nrow(mat.scaled)-num_keep), nrow(mat.scaled)) )

l2_val <- as.matrix(df.top[rows_keep,]$log2FoldChange) #getting log2 value for each gene we are keeping
colnames(l2_val)<-"logFC"
mean <- as.matrix(df.top[rows_keep,]$baseMean) #getting mean value for each gene we are keeping
colnames(mean)<-"AveExpr"

if (!requireNamespace("BiocManager", quietly = TRUE))
    install.packages("BiocManager")
BiocManager::install("ComplexHeatmap")

library(ComplexHeatmap)

## Warning: package 'ComplexHeatmap' was built under R version 4.2.1

## Loading required package: grid

## ========================================
## ComplexHeatmap version 2.12.1
## Bioconductor page: http://bioconductor.org/packages/ComplexHeatmap/
## Github page: https://github.com/jokergoo/ComplexHeatmap
## Documentation: http://jokergoo.github.io/ComplexHeatmap-reference
## 
## If you use it in published research, please cite either one:
## - Gu, Z. Complex heatmaps reveal patterns and correlations in multidimensional 
##     genomic data. Bioinformatics 2016.
## - Gu, Z. Complex Heatmap Visualization. iMeta 2022.
## 
## 
## The new InteractiveComplexHeatmap package can directly export static 
## complex heatmaps into an interactive Shiny app with zero effort. Have a try!
## 
## This message can be suppressed by:
##   suppressPackageStartupMessages(library(ComplexHeatmap))
## ========================================

library(RColorBrewer)
library(circlize)

## ========================================
## circlize version 0.4.15
## CRAN page: https://cran.r-project.org/package=circlize
## Github page: https://github.com/jokergoo/circlize
## Documentation: https://jokergoo.github.io/circlize_book/book/
## 
## If you use it in published research, please cite:
## Gu, Z. circlize implements and enhances circular visualization
##   in R. Bioinformatics 2014.
## 
## This message can be suppressed by:
##   suppressPackageStartupMessages(library(circlize))
## ========================================

#maps values between b/w/r for min and max l2 values
col_logFC <- colorRamp2(c(min(l2_val),0, max(l2_val)), c("blue", "white", "red")) 

#maps between 0% quantile, and 75% quantile of mean values --- 0, 25, 50, 75, 100
col_AveExpr <- colorRamp2(c(quantile(mean)[1], quantile(mean)[4]), c("white", "red"))

ha <- HeatmapAnnotation(summary = anno_summary(gp = gpar(fill = 2), 
                                               height = unit(2, "cm")))
h1 <- Heatmap(mat.scaled[rows_keep,], cluster_rows = F, 
            column_labels = colnames(mat.scaled), name="Z-score",
            cluster_columns = T)
h2 <- Heatmap(l2_val, row_labels = df.top$symbol[rows_keep], 
            cluster_rows = F, name="logFC", top_annotation = ha, col = col_logFC,
            cell_fun = function(j, i, x, y, w, h, col) { # add text to each grid
              grid.text(round(l2_val[i, j],2), x, y)
            })

h3 <- Heatmap(mean, row_labels = df.top$symbol[rows_keep], 
            cluster_rows = F, name = "AveExpr", col=col_AveExpr,
            cell_fun = function(j, i, x, y, w, h, col) { # add text to each grid
              grid.text(round(mean[i, j],2), x, y)
            })
h<-h1+h2+h3
h

png("./heatmap_v1.png", res = 300, width = 3000, height = 5500)
print(h)
dev.off()

## png 
##   2

pdf("./heatmap_v1.pdf", width = 6, height = 9, pointsize = 6 )
print(h)
dev.off()

sessionInfo()

## R version 4.2.0 (2022-04-22 ucrt)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19044)
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=English_United States.utf8 
## [2] LC_CTYPE=English_United States.utf8   
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.utf8    
## 
## attached base packages:
## [1] grid      stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] circlize_0.4.15             RColorBrewer_1.1-3         
##  [3] ComplexHeatmap_2.12.1       org.Mm.eg.db_3.15.0        
##  [5] AnnotationDbi_1.58.0        ggplot2_3.3.6              
##  [7] DESeq2_1.36.0               SummarizedExperiment_1.26.1
##  [9] Biobase_2.56.0              MatrixGenerics_1.8.1       
## [11] matrixStats_0.62.0          GenomicRanges_1.48.0       
## [13] GenomeInfoDb_1.32.2         IRanges_2.30.0             
## [15] S4Vectors_0.34.0            BiocGenerics_0.42.0        
## 
## loaded via a namespace (and not attached):
##  [1] bitops_1.0-7           bit64_4.0.5            doParallel_1.0.17     
##  [4] httr_1.4.3             tools_4.2.0            bslib_0.3.1           
##  [7] utf8_1.2.2             R6_2.5.1               DBI_1.1.3             
## [10] colorspace_2.0-3       GetoptLong_1.0.5       withr_2.5.0           
## [13] tidyselect_1.1.2       bit_4.0.4              compiler_4.2.0        
## [16] cli_3.3.0              Cairo_1.5-15           DelayedArray_0.22.0   
## [19] labeling_0.4.2         sass_0.4.1             scales_1.2.0          
## [22] genefilter_1.78.0      stringr_1.4.0          digest_0.6.29         
## [25] rmarkdown_2.14         XVector_0.36.0         pkgconfig_2.0.3       
## [28] htmltools_0.5.2        fastmap_1.1.0          highr_0.9             
## [31] GlobalOptions_0.1.2    rlang_1.0.3            rstudioapi_0.13       
## [34] RSQLite_2.2.14         shape_1.4.6            jquerylib_0.1.4       
## [37] generics_0.1.2         farver_2.1.0           jsonlite_1.8.0        
## [40] BiocParallel_1.30.3    dplyr_1.0.9            RCurl_1.98-1.6        
## [43] magrittr_2.0.3         GenomeInfoDbData_1.2.8 Matrix_1.5-3          
## [46] Rcpp_1.0.8.3           munsell_0.5.0          fansi_1.0.3           
## [49] lifecycle_1.0.1        stringi_1.7.6          yaml_2.3.5            
## [52] zlibbioc_1.42.0        blob_1.2.3             parallel_4.2.0        
## [55] crayon_1.5.1           lattice_0.20-45        Biostrings_2.64.0     
## [58] splines_4.2.0          annotate_1.74.0        KEGGREST_1.36.2       
## [61] locfit_1.5-9.5         knitr_1.39             pillar_1.7.0          
## [64] rjson_0.2.21           geneplotter_1.74.0     codetools_0.2-18      
## [67] XML_3.99-0.10          glue_1.6.2             evaluate_0.15         
## [70] png_0.1-7              vctrs_0.4.1            foreach_1.5.2         
## [73] gtable_0.3.0           purrr_0.3.4            clue_0.3-62           
## [76] assertthat_0.2.1       cachem_1.0.6           xfun_0.31             
## [79] xtable_1.8-4           survival_3.3-1         tibble_3.1.7          
## [82] iterators_1.0.14       memoise_2.0.1          cluster_2.1.3         
## [85] ellipsis_0.3.2