The most recent update of this html document occurred: Wed May 16 10:53:51 2018

library(knitr)

library(ggplot2)
library(reshape)
library(DESeq2)
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, cbind, colMeans,
##     colnames, colSums, do.call, duplicated, eval, evalq, Filter,
##     Find, get, grep, grepl, intersect, is.unsorted, lapply,
##     lengths, Map, mapply, match, mget, order, paste, pmax,
##     pmax.int, pmin, pmin.int, Position, rank, rbind, Reduce,
##     rowMeans, rownames, rowSums, sapply, setdiff, sort, table,
##     tapply, union, unique, unsplit, which, which.max, which.min
## 
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:reshape':
## 
##     expand, rename
## The following object is masked from 'package:base':
## 
##     expand.grid
## Loading required package: IRanges
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## Loading required package: SummarizedExperiment
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## Loading required package: DelayedArray
## Loading required package: matrixStats
## 
## Attaching package: 'matrixStats'
## The following objects are masked from 'package:Biobase':
## 
##     anyMissing, rowMedians
## 
## Attaching package: 'DelayedArray'
## The following objects are masked from 'package:matrixStats':
## 
##     colMaxs, colMins, colRanges, rowMaxs, rowMins, rowRanges
## The following object is masked from 'package:base':
## 
##     apply
library(genefilter)
## 
## Attaching package: 'genefilter'
## The following objects are masked from 'package:matrixStats':
## 
##     rowSds, rowVars
library(CHBUtils)
library(gtools)
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:Biobase':
## 
##     combine
## The following object is masked from 'package:BiocGenerics':
## 
##     combine
library(devtools)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:gridExtra':
## 
##     combine
## The following object is masked from 'package:matrixStats':
## 
##     count
## The following object is masked from 'package:Biobase':
## 
##     combine
## The following objects are masked from 'package:GenomicRanges':
## 
##     intersect, setdiff, union
## The following object is masked from 'package:GenomeInfoDb':
## 
##     intersect
## The following objects are masked from 'package:IRanges':
## 
##     collapse, desc, intersect, setdiff, slice, union
## The following objects are masked from 'package:S4Vectors':
## 
##     first, intersect, rename, setdiff, setequal, union
## The following objects are masked from 'package:BiocGenerics':
## 
##     combine, intersect, setdiff, union
## The following object is masked from 'package:reshape':
## 
##     rename
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(isomiRs)
## Loading required package: DiscriMiner
## Loading required package: TMB
## Loading required package: RcppEigen
library(pheatmap)

knitr::opts_chunk$set(tidy=TRUE, highlight=TRUE, dev="png", fig.width=6,fig.heigh=6,
               cache=FALSE, highlight=TRUE, autodep=TRUE, 
               warning=FALSE, error=FALSE,
               message=FALSE, prompt=TRUE, comment='', fig.cap='', bootstrap.show.code=FALSE)

root_path = "C:/rstudio/mirna/final/"
root_file = file.path(root_path, "srna_out_files")
dir.create(root_file, showWarnings = FALSE)

metadata_fn =  list.files(file.path(root_path), pattern = "summary.csv$",recursive = T, full.names = T)
metadata = read.csv(metadata_fn, row.names="sample_id")
condition = names(metadata)[1]
design = metadata
formula = ~ condition # modify this to get your own formula, it should be a column in your metadata
isde=TRUE # turn this true to make DE ananlysis

Exploratory analysis

In this section we will see descriptive figures about quality of the data, reads with adapter, reads mapped to miRNAs, reads mapped to other small RNAs.

size distribution

After adapter removal, we can plot the size distribution of the small RNAs.

> files = list.files(file.path(root_path), pattern = "trimming_stats", recursive = T)
> isadapter = length(files) > 0
> names(files) = sapply(files, function(x) {
+     gsub("-ready.trimming_stats", "", basename(x))
+ })
> 
> 
> tab = data.frame()
> for (sample in rownames(metadata)) {
+     d = read.table(file.path(root_path, files[sample]), sep = " ")
+     tab = rbind(tab, d %>% mutate(sample = sample, group = metadata[sample, 
+         condition]))
+ }
> 
> 
> reads_adapter = tab %>% group_by(sample, group) %>% summarise(total = sum(V2))
> ggplot(reads_adapter, aes(x = sample, y = total, fill = group)) + geom_bar(stat = "identity", 
+     position = "dodge") + ggtitle("total number of reads with adapter") + ylab("# reads") + 
+     theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

> ggplot(tab, aes(x = V1, y = V2, group = sample)) + geom_bar(stat = "identity", 
+     position = "dodge") + facet_wrap(~group, ncol = 2) + ggtitle("size distribution") + 
+     ylab("# reads") + xlab("size") + theme(axis.text.x = element_text(angle = 90, 
+     vjust = 0.5, hjust = 1))

> files = list.files(file.path(root_path), pattern = "mirbase-ready", recursive = T, 
+     full.names = T)
> ismirbase = length(files) > 0
> mirdeep2_files = list.files(file.path(root_path), pattern = "novel-ready", recursive = T, 
+     full.names = T)
> ismirdeep2 = length(mirdeep2_files) > 0

miRNA

total miRNA expression annotated with mirbase

> names(files) = sapply(files, function(x) {
+     gsub("-mirbase-ready.counts", "", basename(x))
+ })
> 
> obj <- IsomirDataSeqFromFiles(files = files[rownames(design)], coldata = design, 
+     header = T, skip = 0)
> ggplot(data.frame(sample = colnames(counts(obj)), total = colSums(counts(obj)))) + 
+     geom_bar(aes(x = sample, y = total), stat = "identity") + theme(axis.text.x = element_text(angle = 90, 
+     vjust = 0.5, hjust = 1))

> mirna_step <- as.data.frame(colSums(counts(obj)))

Distribution of mirna expression

> ggplot(melt(counts(obj))) + geom_boxplot(aes(x = X2, y = value)) + scale_y_log10() + 
+     theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

cumulative distribution of miRNAs

> cs <- as.data.frame(apply(counts(obj), 2, function(x) {
+     cumsum(sort(x, decreasing = T))
+ }))
> cs$pos <- 1:nrow(cs)
> 
> ggplot((melt(cs, id.vars = "pos"))) + geom_line(aes(x = pos, y = value, color = variable)) + 
+     scale_y_log10()

Clustering

> counts = counts(obj)
> dds = DESeqDataSetFromMatrix(counts[rowSums(counts > 0) > 10, ], colData = design, 
+     design = ~1)
> vst = rlog(dds)
> 
> # pheatmap(assay(vst), annotation_col = design, show_rownames = F,
> # clustering_distance_cols = 'correlation', clustering_method = 'ward.D')

MDS plot

> # mds(assay(vst), condition = design[,condition])

complexity

Number of miRNAs with > 3 counts.

> kable(as.data.frame(colSums(counts > 10)))
colSums(counts > 10)
SRR2496797 254
SRR2496785 264
SRR2496788 139
SRR2496792 340
SRR2496795 304
SRR2496783 104
SRR2496791 208
SRR2496787 246
SRR2496782 290
SRR2496784 265
SRR2496786 82
SRR2496798 266
SRR2496796 202
SRR2496781 101
SRR2496790 293
SRR2496793 92
SRR2496789 130
SRR2496794 321

Differential expression

DESeq2 is used for this analysis.

> library(DESeq2)
> # library(DEGreport)
> library(vsn)
> #' save file
> save_file <- function(dat, fn, basedir = ".") {
+     tab <- cbind(id = data.frame(id = row.names(dat)), as.data.frame(dat))
+     write.table(tab, file.path(basedir, fn), quote = F, sep = "\t", row.names = F)
+ }
> 
> filter_handle <- function(res) {
+     res_nona <- res[!is.na(res$padj), ]
+     keep <- res_nona$padj < 0.1
+     res_nona[keep, ]
+ }
> 
> handle_deseq2 = function(dds, summarydata, column, prefix, all_combs = NULL) {
+     if (is.null(all_combs)) 
+         all_combs = combn(levels(summarydata[, column]), 2, simplify = FALSE)
+     all_results = list()
+     contrast_strings = list()
+     rlog = rlog(dds)
+     for (comb in all_combs) {
+         contrast_string = paste(comb, collapse = "_vs_")
+         cat("\n\n## Comparison: ", contrast_string, "\n")
+         contrast = c(column, comb)
+         res = results(dds, contrast = contrast)
+         res = res[order(res$padj), ]
+         all_results = c(all_results, res)
+         contrast_strings = c(contrast_strings, contrast_string)
+         print_out(dds, rlog, res, paste0(prefix, "_", contrast_string))
+     }
+     names(all_results) = contrast_strings
+     return(all_results)
+ }
> 
> do_de = function(raw, summarydata, condition, minc = 3) {
+     dss = DESeqDataSetFromMatrix(countData = raw[rowMeans(raw) > minc, ], colData = summarydata, 
+         design = ~condition)
+     dss = DESeq(dss)
+     dss
+ }
> 
> do_norm = function(dss, path, prefix) {
+     rlog_ma = assay(rlog(dss))
+     count_ma = counts(dss, normalized = TRUE)
+     raw = counts(dss, normalized = FALSE)
+     
+     fn_log = paste0(prefix, "_log_matrix.txt")
+     save_file(rlog_ma, fn_log, path)
+     
+     fn_count = paste0(prefix, "_norm_matrix.txt")
+     save_file(count_ma, fn_count, path)
+     
+     fn_raw = paste0(prefix, "_raw_matrix.txt")
+     save_file(raw, fn_raw, path)
+ }
> 
> print_out = function(dss, rlog = NULL, res = NULL, prefix = "standard_") {
+     plotDispEsts(dss)
+     if (is.null(res)) 
+         res = results(dss)
+     if (is.null(rlog)) 
+         rlog = rlog(dss)
+     
+     rlogmat = assay(rlog)
+     design = as.data.frame(colData(dss)[, names(colData(dss)) != "sizeFactor", 
+         drop = FALSE])
+     
+     out_df = as.data.frame(res)
+     out_df = out_df[!is.na(out_df$padj), ]
+     out_df = out_df[order(out_df$padj), ]
+     do_norm(dss, root_file, prefix)
+     
+     cat("\n", paste(capture.output(summary(res))[1:8], collapse = "<br>"), "\n")
+     cat("\n\n### MA plot plot\n\n")
+     DESeq2::plotMA(res)
+     
+     cat("\n\n### Top DE miRNAs\n\n")
+     print(kable(head(out_df, 20)))
+     fn = paste(prefix, ".tsv", sep = "")
+     save_file(out_df, fn, root_file)
+     
+     sign = row.names(out_df)[out_df$padj < 0.05 & !is.na(out_df$padj) & abs(out_df$log2FoldChange) > 
+         0.5]
+     
+     cat("\n\n### Heatmap most significand(", length(sign), "), padj<0.05 and log2FC > 0.5\n")
+     if (length(sign) < 5000) {
+         cat("Too few genes to plot.")
+     } else {
+         pheatmap(rlogmat[sign, ], show_rownames = F, annotation_col = design, 
+             clustering_distance_cols = "correlation", clustering_method = "ward.D2")
+         mds(rlogmat[sign, ], condition = design[, condition])
+     }
+     
+ }

Analysis for miRNA

> counts = counts(obj)
> dss = DESeqDataSetFromMatrix(countData = counts[rowSums(counts > 0) > 3, ], 
+     colData = design, design = ~batch)
> dss = DESeq(dss)
> all_results = handle_deseq2(dss, design, condition, "mirna_")

Comparison: F_vs_M


out of 406 with nonzero total read count
adjusted p-value < 0.1
LFC > 0 (up) : 20, 4.9%
LFC < 0 (down) : 2, 0.49%
outliers [1] : 0, 0%
low counts [2] : 142, 35%
(mean count < 4)

MA plot plot

Top DE miRNAs

baseMean log2FoldChange lfcSE stat pvalue padj
hsa-miR-4646-5p 3.405981e+01 8.797824 1.7809461 4.939972 0.0000008 0.0002063
hsa-miR-219a-2-3p 5.981795e+01 7.451697 1.7239601 4.322431 0.0000154 0.0020370
hsa-miR-1275 9.207448e+01 6.228503 1.5449361 4.031560 0.0000554 0.0048759
hsa-miR-10b-5p 4.028161e+05 -2.416169 0.6273645 -3.851300 0.0001175 0.0051697
hsa-miR-486-5p 1.611402e+06 -2.413232 0.6208915 -3.886720 0.0001016 0.0051697
hsa-miR-576-3p 2.427332e+01 6.639493 1.7238433 3.851564 0.0001174 0.0051697
hsa-miR-7847-3p 2.851261e+01 6.423556 1.7263968 3.720788 0.0001986 0.0074901
hsa-miR-2110 3.624389e+01 5.169466 1.4117892 3.661641 0.0002506 0.0082699
hsa-miR-766-5p 4.702677e+01 9.299753 2.6627057 3.492595 0.0004784 0.0140316
hsa-miR-548k 4.989899e+00 6.149741 1.8623425 3.302153 0.0009595 0.0253296
hsa-miR-193b-5p 4.921041e+01 3.502641 1.0763114 3.254301 0.0011367 0.0272813
hsa-miR-320d 1.178751e+02 3.052208 0.9528917 3.203100 0.0013596 0.0299105
hsa-miR-181c-5p 9.698695e+01 3.028320 0.9577253 3.161993 0.0015669 0.0305699
hsa-miR-4665-5p 4.865710e+01 9.350295 2.9663909 3.152078 0.0016211 0.0305699
hsa-miR-3620-5p 2.632829e+01 5.576037 1.8232027 3.058375 0.0022254 0.0391672
hsa-miR-6894-5p 8.263860e+00 6.009911 2.0274985 2.964200 0.0030347 0.0500727
hsa-miR-6740-5p 2.187887e+02 3.333372 1.1501896 2.898106 0.0037542 0.0583010
hsa-miR-30c-1-3p 1.282674e+02 3.310783 1.1676563 2.835409 0.0045767 0.0635921
hsa-miR-7975 6.217590e+00 6.263759 2.1988187 2.848693 0.0043899 0.0635921
hsa-miR-378a-3p 6.583108e+03 2.154846 0.7784957 2.767962 0.0056408 0.0744587

Heatmap most significand( 15 ), padj<0.05 and log2FC > 0.5

Too few genes to plot.

> DESeq2::plotCounts(dds, put_gene_name_here)

Analysis for novel miRNA

> counts = counts(obj_mirdeep)
> dss_mirdeep2 = DESeqDataSetFromMatrix(countData = counts[rowSums(counts > 0) > 
+     3, ], colData = design, design = ~batch)
> 
> dss = DESeq(dss)
> all_results = handle_deseq2(dss_mirdeep2, design, condition, "mirdeep2_")

Analysis for isomiRs

> counts_iso = counts(isoCounts(obj, ref = T, iso5 = T, iso3 = T, add = T, subs = T))
> dss_iso = DESeqDataSetFromMatrix(countData = counts_iso[rowSums(counts_iso > 
+     0) > 3, ], colData = design, design = ~batch)
> 
> dss_iso = DESeq(dss_iso)
> all_results = handle_deseq2(dss_iso, design, condition, "isomirs_")

Comparison: F_vs_M


out of 2332 with nonzero total read count
adjusted p-value < 0.1
LFC > 0 (up) : 106, 4.5%
LFC < 0 (down) : 28, 1.2%
outliers [1] : 0, 0%
low counts [2] : 769, 33%
(mean count < 2)

MA plot plot

Top DE miRNAs

baseMean log2FoldChange lfcSE stat pvalue padj
hsa-miR-483-5p.iso.t5:0.t3:0.ad:A.mm:0 168.88734 11.226455 1.696447 6.617626 0.00e+00 0.0000001
hsa-miR-193b-5p.iso.t5:0.t3:tga.ad:0.mm:0 116.42661 9.109118 1.475957 6.171668 0.00e+00 0.0000005
hsa-miR-320a.iso.t5:0.t3:0.ad:TAT.mm:0 34.67457 8.921150 1.720230 5.186020 2.00e-07 0.0000969
hsa-miR-584-5p.iso.t5:0.t3:g.ad:AT.mm:0 10.22043 7.249124 1.405099 5.159157 2.00e-07 0.0000969
hsa-miR-1307-3p.iso.t5:0.t3:G.ad:0.mm:0 19.78330 8.166836 1.620583 5.039443 5.00e-07 0.0001303
hsa-miR-320a.iso.t5:0.t3:cga.ad:ATT.mm:0 12.57756 7.528003 1.514466 4.970733 7.00e-07 0.0001303
hsa-miR-320b.iso.t5:0.t3:caa.ad:ATT.mm:0 12.57756 7.528003 1.514466 4.970733 7.00e-07 0.0001303
hsa-miR-320c.iso.t5:0.t3:t.ad:ATT.mm:0 12.57756 7.528003 1.514466 4.970733 7.00e-07 0.0001303
hsa-miR-30c-1-3p.iso.t5:0.t3:cc.ad:0.mm:0 54.38555 8.168848 1.666152 4.902821 9.00e-07 0.0001641
hsa-miR-6740-5p.iso.t5:ag.t3:A.ad:0.mm:0 22.77918 8.273814 1.817105 4.553295 5.30e-06 0.0007724
hsa-miR-873-3p.ref.t5:0.t3:0.ad:0.mm:0 18.28245 8.007490 1.760961 4.547227 5.40e-06 0.0007724
hsa-miR-6740-5p.ref.t5:0.t3:0.ad:0.mm:0 33.27529 7.614839 1.683390 4.523516 6.10e-06 0.0007922
hsa-miR-320a.iso.t5:G.t3:cga.ad:AT.mm:0 22.23981 7.916747 1.821125 4.347175 1.38e-05 0.0014369
hsa-miR-320b.iso.t5:G.t3:caa.ad:AT.mm:0 22.23981 7.916747 1.821125 4.347175 1.38e-05 0.0014369
hsa-miR-320c.iso.t5:G.t3:t.ad:AT.mm:0 22.23981 7.916747 1.821125 4.347175 1.38e-05 0.0014369
hsa-miR-576-3p.iso.t5:0.t3:C.ad:0.mm:0 21.64953 8.209513 1.922180 4.270938 1.95e-05 0.0019015
hsa-miR-9-5p.iso.t5:0.t3:a.ad:T.mm:0 13.14213 7.578587 1.821900 4.159717 3.19e-05 0.0029296
hsa-miR-584-5p.iso.t5:0.t3:g.ad:A.mm:0 34.96398 6.050823 1.465173 4.129767 3.63e-05 0.0031532
hsa-miR-320a.iso.t5:0.t3:a.ad:TT.mm:0 611.38604 4.212629 1.028334 4.096556 4.19e-05 0.0034142
hsa-miR-378a-3p.iso.t5:a.t3:0.ad:A.mm:0 22.81826 6.364778 1.557300 4.087059 4.37e-05 0.0034142

Heatmap most significand( 95 ), padj<0.05 and log2FC > 0.5

Too few genes to plot.

> DESeq2::plotCounts(dds, put_gene_name_here)

Files

Files generated contains raw count, normalized counts, log2 normalized counts and DESeq2 results.

R Session Info

(useful if replicating these results)

> sessionInfo()
R version 3.4.4 (2018-03-15)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows 10 x64 (build 17134)

Matrix products: default

locale:
[1] LC_COLLATE=Russian_Russia.1251  LC_CTYPE=Russian_Russia.1251   
[3] LC_MONETARY=Russian_Russia.1251 LC_NUMERIC=C                   
[5] LC_TIME=Russian_Russia.1251    

attached base packages:
[1] parallel  stats4    stats     graphics  grDevices utils     datasets 
[8] methods   base     

other attached packages:
 [1] vsn_3.46.0                 bindrcpp_0.2              
 [3] pheatmap_1.0.8             isomiRs_1.6.0             
 [5] RcppEigen_0.3.3.4.0        TMB_1.7.12                
 [7] DiscriMiner_0.1-29         dplyr_0.7.4               
 [9] devtools_1.13.5            gridExtra_2.3             
[11] gtools_3.5.0               CHBUtils_0.1              
[13] genefilter_1.60.0          DESeq2_1.18.1             
[15] SummarizedExperiment_1.8.1 DelayedArray_0.4.1        
[17] matrixStats_0.53.1         Biobase_2.38.0            
[19] GenomicRanges_1.30.0       GenomeInfoDb_1.14.0       
[21] IRanges_2.12.0             S4Vectors_0.16.0          
[23] BiocGenerics_0.24.0        reshape_0.8.7             
[25] ggplot2_2.2.1              knitr_1.20                

loaded via a namespace (and not attached):
 [1] minqa_1.2.4            colorspace_1.3-2       rprojroot_1.3-2       
 [4] htmlTable_1.11.2       XVector_0.18.0         base64enc_0.1-3       
 [7] rstudioapi_0.7         affyio_1.48.0          bit64_0.9-7           
[10] AnnotationDbi_1.40.0   splines_3.4.4          geneplotter_1.56.0    
[13] Formula_1.2-2          nloptr_1.0.4           annotate_1.56.1       
[16] cluster_2.0.6          gamlss.dist_5.0-4      readr_1.1.1           
[19] compiler_3.4.4         backports_1.1.2        assertthat_0.2.0      
[22] Matrix_1.2-12          lazyeval_0.2.1         limma_3.34.9          
[25] formatR_1.5            acepack_1.4.1          htmltools_0.3.6       
[28] tools_3.4.4            gtable_0.2.0           glue_1.2.0            
[31] GenomeInfoDbData_1.0.0 affy_1.56.0            Rcpp_0.12.16          
[34] gdata_2.18.0           preprocessCore_1.40.0  nlme_3.1-131.1        
[37] stringr_1.3.0          lme4_1.1-15            XML_3.98-1.10         
[40] zlibbioc_1.24.0        MASS_7.3-49            scales_0.5.0          
[43] BiocInstaller_1.28.0   hms_0.4.2              gamlss.data_5.0-0     
[46] RColorBrewer_1.1-2     yaml_2.1.18            memoise_1.1.0         
[49] rpart_4.1-13           latticeExtra_0.6-28    stringi_1.1.7         
[52] RSQLite_2.0            highr_0.6              checkmate_1.8.5       
[55] caTools_1.17.1         BiocParallel_1.12.0    rlang_0.2.0           
[58] pkgconfig_2.0.1        bitops_1.0-6           evaluate_0.10.1       
[61] lattice_0.20-35        purrr_0.2.4            bindr_0.1.1           
[64] htmlwidgets_1.0        labeling_0.3           bit_1.1-12            
[67] GGally_1.3.2           plyr_1.8.4             magrittr_1.5          
[70] R6_2.2.2               gplots_3.0.1           Hmisc_4.1-1           
[73] DBI_0.8                pillar_1.2.1           foreign_0.8-69        
[76] withr_2.1.2            survival_2.41-3        RCurl_1.95-4.10       
[79] nnet_7.3-12            tibble_1.4.2           gamlss_5.0-6          
[82] KernSmooth_2.23-15     rmarkdown_1.9          locfit_1.5-9.1        
[85] grid_3.4.4             data.table_1.10.4-3    blob_1.1.0            
[88] digest_0.6.15          xtable_1.8-2           tidyr_0.8.0           
[91] munsell_0.4.3