Objective

Perform meta analysis of TB microarray AND RNA-seq data using the Rank Sum approach.
Overview:

  1. Microarray data: load log(fold change) data that Diana generated with Geo2r
  2. RNA-seq data: load published tables
  3. Aggregate microarray data by gene symbol; where the same gene is represented by several probes, calculate the median log(fold change)
  4. Merge fold change data from all datasets into a single matrix (outer join)
  5. Filter out genes with missing values (NA): keep only genes with a maximum number of missing values
  6. In each dataset rank genes by fold change (convert fold changes to ranks; NAs stay NAs; maximum ranks will vary by NA count)
  7. Replace NAs with average ranks in the other datasets
  8. Now rank again so number of ranks in each dataset is the same
  9. Calculate rank sum statistics and p values using RankProd package
  10. Generate heatmaps of top 100 regulated genes
# CHECK R VERSION
# stopifnot(R.version.string == "R version 3.4.3 (2017-11-30)")

# CLEANUP
# Clear all variables:
rm(list=ls(all=TRUE))

# Unload current packages:
# if (!is.null(names(utils::sessionInfo()[["otherPkgs"]]))) pacman::p_unload("all")
# KNITR DEFAULTS
knitr::opts_chunk$set(eval = TRUE, fig.height = 10)
# LOAD PACKAGES:
if (!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager")
if (!requireNamespace("ComplexHeatmap", quietly = TRUE)) BiocManager::install("ComplexHeatmap")
if (!require(stringr)) install.packages("stringr")
if (!require(rstudioapi)) install.packages("rstudioapi")
if (!require(tidyr)) install.packages("tidyr")
if (!require(tibble)) install.packages("tibble")
if (!require(readxl)) install.packages("readxl")
if (!require(purrr)) install.packages("purrr")
if (!require(RColorBrewer)) install.packages("RColorBrewer")
if (!require(devtools)) install.packages("devtools")
if (!require(magrittr)) install.packages("magrittr")
if (!require(dplyr)) install.packages("dplyr")
if (!require(scales)) install.packages("scales")
if (!require(knitr)) install.packages("knitr")
if (!require(kableExtra)) install.packages("kableExtra")
if (!requireNamespace("RankProd", quietly = TRUE)) BiocManager::install("RankProd")

library(magrittr)
library(dplyr)
library(scales)
library(knitr)
library(kableExtra)
library(RankProd)
library(stringr)
# GET CURRENT SCRIPT NAME
(this.script <- rstudioapi::getActiveDocumentContext() %>% .$path %>% basename)
getwd()
list.files()
stopifnot(this.script != "")

Data: microarray studies

Geo2r data files

geo2r.data.files <- list.files(path="../source_data", recursive = T, 
                               pattern="^GSE.*Geo2r.*txt$", full.names = T)
geo2r.data.files <- grep(pattern = "GSE108363", geo2r.data.files, invert = T, value = T)
cat(basename(geo2r.data.files), sep="<br>\n")
tailleux.data.files <- list.files(path="../source_data", recursive = T,
                                  pattern="*tailleux.*txt", full.names = T)
cat(basename(tailleux.data.files), sep="<br>\n")
muarray.data.files <- c(geo2r.data.files, tailleux.data.files)
muarray.data.files <- grep("GSE29731", muarray.data.files, invert = T, value=T)
cat(basename(muarray.data.files), sep="<br>\n")

GSE103092_Geo2r_symbols.txt
GSE34151_Geo2r.txt
02_tailleux_18hrs_a.Rmd.tt.DC.2020_05_22.txt
02_tailleux_18hrs_a.Rmd.tt.MP.2020_05_22.txt

Load Geo2r data

geo2r.data.list <- lapply(muarray.data.files, function(f) {
  a <- read.table(f, sep="\t", header=T, quote='"', stringsAsFactors = F)
  names(a)[grep("symbol", names(a), ignore.case = T)] <- "Gene.symbol"
  names(a) <- stringr::str_remove(names(a), "^X\\.")
  names(a) <- stringr::str_remove(names(a), "\\.$")
  a
}) %>% 
  set_names(stringr::str_remove_all(basename(muarray.data.files), "\\.txt"))


names(geo2r.data.list) <- stringr::str_remove(names(geo2r.data.list), "^X")
names(geo2r.data.list)[grep("DC", names(geo2r.data.list))] <- "A-BUGS-23_DC"
names(geo2r.data.list)[grep("MP", names(geo2r.data.list))] <- "A-BUGS-23_MP"

# class(geo2r.data.list) # "list"
# length(geo2r.data.list) # 4
# sapply(geo2r.data.list, class) # "data.frame"
# sapply(geo2r.data.list, nrow) %>% unname # 29102 47231 24501 24501
data.frame(List.item = names(geo2r.data.list),
           class = sapply(geo2r.data.list, class),
           Columns = sapply(geo2r.data.list, ncol),
           Rows = comma(sapply(geo2r.data.list, nrow)),
           Has.Gene.Symbols = sapply(geo2r.data.list, function(x) {any(grepl("Gene.symbol", names(x)))})
           # Unique.genes = sapply(geo2r.data.list, function(x) {length(unique(x$))})
           ) %>% 
  mutate(Has.Gene.Symbols = cell_spec(Has.Gene.Symbols, "html", color = ifelse(Has.Gene.Symbols == "TRUE", "black", "red"))) %>% 
  knitr::kable(., row.names = F, align=c("l", "l", "c", "c", "c"), format = "html", escape = F) %>% 
  kableExtra::kable_styling("striped", full_width = T)
List.item class Columns Rows Has.Gene.Symbols
GSE103092_Geo2r_symbols data.frame 7 29,102 TRUE
GSE34151_Geo2r data.frame 8 47,231 TRUE
A-BUGS-23_DC data.frame 9 24,501 TRUE
A-BUGS-23_MP data.frame 9 24,501 TRUE
# head(geo2r.data.list[[2]])

Aggregate logFC by gene symbol

geo.fold.change.list <- lapply(seq_along(geo2r.data.list), function(i, list.names) {
  df <- geo2r.data.list[[i]]
  x <- dplyr::select(df, Gene.symbol, logFC) %>% 
    dplyr::mutate(Gene.symbol = ifelse(Gene.symbol=="", NA, Gene.symbol)) %>% 
    tidyr::drop_na(.) %>% 
    dplyr::mutate(logFC = as.numeric(logFC)) %>% 
    dplyr::mutate(Gene.symbol = stringr::str_remove_all(Gene.symbol, '\\"')) %>% 
    tidyr::drop_na(.) %>% 
    dplyr::group_by(Gene.symbol) %>% 
    dplyr::summarise(logFC = median(logFC), .groups="drop") %>% 
    tidyr::drop_na(.) %>% 
    as.data.frame(., stringsAsFactors=FALSE)
  # names(x)[names(x) == "logFC"] <- list.names[i]
  x
}, list.names = names(geo2r.data.list)) %>% 
  set_names(names(geo2r.data.list))
names(geo.fold.change.list) <- stringr::str_remove(names(geo.fold.change.list), "_Geo2r.*$")
data.frame(
  List.item = names(geo.fold.change.list),
  Class = sapply(geo.fold.change.list, function(x) {paste(class(x), collapse=",")}),
  Columns = sapply(geo.fold.change.list, ncol),
  Rows = comma(sapply(geo.fold.change.list, nrow)),
  Duplicated.genes = sapply(geo.fold.change.list, function(x) {any(duplicated(x$Gene.symbol))})
) %>% 
  knitr::kable(., row.names = F, align=c("l", "l", "c", "c", "c"), format = "html", escape = F) %>% 
  kableExtra::kable_styling("striped", full_width = T)
List.item Class Columns Rows Duplicated.genes
GSE103092 data.frame 2 21,025 FALSE
GSE34151 data.frame 2 20,762 FALSE
A-BUGS-23_DC data.frame 2 13,768 FALSE
A-BUGS-23_MP data.frame 2 13,768 FALSE

Data: RNA-seq studies

# <a href="https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE64179">GSE64179</a>
# https://pubmed.ncbi.nlm.nih.gov/26392366/
add.geo.hyperlink <- function(id) {
  a <- "<a href='https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc="
  b <- "'>"
  c <- "</a>"
  res <- paste0(a, id, b, id, c)
  res
}

add.pubmed.hyperlink <- function(id) {
  a <- "<a href='https://pubmed.ncbi.nlm.nih.gov/"
  b <- "/'>"
  c <- "</a>"
  res <- paste0(a, id, b, id, c)
  res
}
rs.dataset.table <- data.frame(
  c=c("GSE64179", "26392366", "6", "Dendritic cells"),
  b=c("GSE67427", "26586179", "8", "Monocyte derived macrophages"),
  a=c("GSE148731", "32341411", "6", "M1, M2 macrophages")
) %>%
  t %>% 
  as.data.frame(., stringsAsFactors=F) %>% 
  set_colnames(c("Accession", "Pubmed", "Replicates", "Cell.Type")) %>% 
  mutate(Accession=add.geo.hyperlink(Accession)) %>% 
  mutate(Pubmed = add.pubmed.hyperlink(Pubmed))
knitr::kable(rs.dataset.table, row.names = F, escape=F, format = "html") %>% 
  kableExtra::kable_styling("striped", full_width = T)
Accession Pubmed Replicates Cell.Type
GSE64179 26392366 6 Dendritic cells
GSE67427 26586179 8 Monocyte derived macrophages
GSE148731 32341411 6 M1, M2 macrophages

Load RNA-seq data: GSE64179

# list.files("../source_data/rna_seq/GSE64179")
rna.seq.data.files <- list()
rna.seq.data.files$GSE64179 <- "../source_data/rna_seq/GSE64179/Pacis_2015_TableS6_DEG.xlsx"
stopifnot(file.exists(rna.seq.data.files$GSE64179))
rna.seq.data.list <- list()
rna.seq.data.list$GSE64179 <- readxl::read_xlsx(path=rna.seq.data.files$GSE64179, range = "B3:C18818") %>%
set_colnames(c("Gene.symbol", "logFC")) %>% 
mutate(Gene.symbol = toupper(Gene.symbol)) %>% 
group_by(Gene.symbol) %>% 
summarise(logFC = median(logFC), .groups="drop") %>% 
as.data.frame(., stringsAsFactors=FALSE)
# WITHOUT GROUPING / AGGREGATING:
# dim(rna.seq.data$GSE64179) # 18815     2
# sapply(rna.seq.data$GSE64179, class) # "character"   "numeric"
# range(rna.seq.data$GSE64179$logFC) # -9.426613 11.612170
# sum(rna.seq.data$GSE64179$logFC == 0) # 0
# sum(duplicated(rna.seq.data$GSE64179$Gene.symbol)) # 37
# dups <- rna.seq.data$GSE64179$Gene.symbol[duplicated(rna.seq.data$GSE64179$Gene.symbol)]
# subset(rna.seq.data$GSE64179, Gene.symbol %in% dups) %>% 
#   arrange(Gene.symbol, logFC)
# head(rna.seq.data$GSE64179)
# WITH AGGREGATING: 
# dim(rna.seq.data$GSE64179) # 18776     2
# sapply(rna.seq.data$GSE64179, class) # "character"   "numeric"
# range(rna.seq.data$GSE64179$logFC) # -9.426613 11.612170
# sum(rna.seq.data$GSE64179$logFC == 0) # 0
# sum(duplicated(rna.seq.data$GSE64179$Gene.symbol)) # 0
head(rna.seq.data.list$GSE64179) %>% 
  knitr::kable(., row.names = F, format="html") %>% 
  kableExtra::kable_styling("striped", full_width = T)
Gene.symbol logFC
A1BG 0.0601550
A1CF -0.0382101
A2M -0.9667654
A2ML1 0.5481361
A3GALT2 0.6965051
A4GALT -0.0035594

Load RNA-seq data: GSE67427

rna.seq.data.files$GSE67427 <-"../source_data/rna_seq/GSE67427/GSE67427_table_s2.txt"
stopifnot(file.exists(rna.seq.data.files$GSE67427))
#rna.seq.data.list <-list()
rna.seq.data.list$GSE67427<- read.table(file = rna.seq.data.files$GSE67427, header = T, sep = "\t", stringsAsFactors = FALSE)  %>% 
#str(rna.seq.data.list$GSE67427) #data.frame':  12728 obs. of  140 variables:
  # head(rna.seq.data.list$GSE67427)
  
  # names(rna.seq.data.list$GSE67427)
  # select(matches(c("name", "Rv.18")))
 select(matches(c("name","Rv.18.logFC"), ignore.case = TRUE)) %>%
   # head(rna.seq.data.list$GSE67427)
  set_colnames(c("Gene.symbol", "logFC")) %>%
  mutate(Gene.symbol = toupper(Gene.symbol)) %>% 
  group_by(Gene.symbol) %>% 
  summarise(logFC = median(logFC), .groups="drop") %>% 
  as.data.frame(., stringsAsFactors=FALSE)
 head(rna.seq.data.list$GSE67427)
# WITHOUT GROUPING / AGGREGATING:
dim(rna.seq.data.list$GSE67427) #  12724     2
[1] 12724     2
sapply(rna.seq.data.list$GSE67427, class) # "character"   "numeric"
Gene.symbol       logFC 
"character"   "numeric" 
range(rna.seq.data.list$GSE67427$logFC) # -4.155479  8.205439
[1] -4.155479  8.205439
sum(rna.seq.data.list$GSE67427$logFC == 0) # 0
[1] 0
sum(duplicated(rna.seq.data.list$GSE67427$Gene.symbol)) # 0
[1] 0
# dups <- rna.seq.data$GSE64179$Gene.symbol[duplicated(rna.seq.data$GSE64179$Gene.symbol)]
# subset(rna.seq.data$GSE64179, Gene.symbol %in% dups) %>% 
#   arrange(Gene.symbol, logFC)
# head(rna.seq.data$GSE64179)
# WITH AGGREGATING: 
# dim(rna.seq.data$GSE64179) # 18776     2
# sapply(rna.seq.data$GSE64179, class) # "character"   "numeric"
# range(rna.seq.data$GSE64179$logFC) # -9.426613 11.612170
# sum(rna.seq.data$GSE64179$logFC == 0) # 0
# sum(duplicated(rna.seq.data$GSE64179$Gene.symbol)) # 0
head(rna.seq.data.list$GSE67427) %>% 
  knitr::kable(., row.names = F, format="html") %>% 
  kableExtra::kable_styling("striped", full_width = T)

Gene.symbol logFC
A1BG -0.1946990
A2M -0.4127574
A4GALT 3.1197860
AAAS -0.4116309
AACS -0.4685173
AAED1 0.0936937

NA

Load RNA-seq data: GSE48731.MF1

# list.files("../source_data/rna_seq/GSE148731")
# rna.seq.data.files <- list()
rna.seq.data.files$GSE148731.MF1 <- "../source_data/rna_seq/GSE148731/GSE148731_toptable_MF1_01.Rmd.2020_07_09.txt"
stopifnot(file.exists(rna.seq.data.files$GSE148731.MF1))
 # rna.seq.data.list <- list()
rna.seq.data.list$GSE148731.MF1 <- read.table(file = rna.seq.data.files$GSE148731.MF1, header = TRUE, sep = "\t", stringsAsFactors = FALSE) %>%
   select(c("Gene.symbol", "logFC")) %>% 
  mutate(Gene.symbol = toupper(Gene.symbol)) %>% 
  # summarise(logFC = median(logFC), .groups="drop") %>% 
   as.data.frame(., stringsAsFactors=FALSE)
head(rna.seq.data.list$GSE148731.MF1)
# WITH AGGREGATING:
dim(rna.seq.data.list$GSE148731.MF1) # 17802     2
[1] 17802     2
sapply(rna.seq.data.list$GSE148731.MF1, class) # "character"   "numeric"
Gene.symbol       logFC 
"character"   "numeric" 
range(rna.seq.data.list$GSE148731.MF1$logFC) # -2.386124  9.704865
[1] -2.386124  9.704865
sum(rna.seq.data.list$GSE148731.MF1$logFC == 0) #
[1] 0
sum(duplicated(rna.seq.data.list$GSE148731.MF1$Gene.symbol)) # 0
[1] 0
head(rna.seq.data.list$GSE148731.MF1) %>% 
  knitr::kable(., row.names = F, format="html") %>% 
  kableExtra::kable_styling("striped", full_width = T)
Gene.symbol logFC
RSAD2 9.704866
IFIT1 7.715897
CMPK2 6.811519
GBP4 6.700888
SIGLEC1 6.601206
IFIT2 6.441537

Load RNA-seq data: GSE48731.MF2

 # list.files("../source_data/rna_seq/GSE148731")
# rna.seq.data.files <- list()
rna.seq.data.files$GSE148731.MF2 <- "../source_data/rna_seq/GSE148731/GSE148731_toptable_MF2_01.Rmd.2020_07_09.txt"
stopifnot(file.exists(rna.seq.data.files$GSE148731.MF2))
 # rna.seq.data.list <- list()
rna.seq.data.list$GSE148731.MF2 <- read.table(file = rna.seq.data.files$GSE148731.MF2, header = TRUE, sep = "\t", stringsAsFactors = FALSE) %>%
   select(c("Gene.symbol", "logFC")) %>% 
  mutate(Gene.symbol = toupper(Gene.symbol)) %>% 
 # summarise(logFC = median(logFC), .groups="drop") %>% 
   as.data.frame(., stringsAsFactors=FALSE)
head(rna.seq.data.list$GSE148731.MF2)
# WITH AGGREGATING:
dim(rna.seq.data.list$GSE148731.MF2) #  17085     2
[1] 17085     2
sapply(rna.seq.data.list$GSE148731.MF2, class) # "character"   "numeric"
Gene.symbol       logFC 
"character"   "numeric" 
range(rna.seq.data.list$GSE148731.MF2$logFC) # -7.524136  9.795100
[1] -7.524136  9.795100
sum(rna.seq.data.list$GSE148731.MF2$logFC == 0) #
[1] 0
sum(duplicated(rna.seq.data.list$GSE148731.MF2$Gene.symbol)) # 0
[1] 0
head(rna.seq.data.list$GSE148731.MF2) %>% 
  knitr::kable(., row.names = F, format="html") %>% 
  kableExtra::kable_styling("striped", full_width = T)
Gene.symbol logFC
CXCL8 9.795100
IL1B 9.369129
INHBA 9.227757
AC005027.3 9.200438
CXCL1 8.877153
IL7R 7.901586

Combine all data frames

# length(rna.seq.data.list) #4
# length(geo.fold.change.list)# 4
all.data.list <- c(geo.fold.change.list, rna.seq.data.list)
all(sapply(all.data.list, class) == "data.frame") # TRUE
[1] TRUE
lapply(all.data.list, names)
$GSE103092
[1] "Gene.symbol" "logFC"      

$GSE34151
[1] "Gene.symbol" "logFC"      

$`A-BUGS-23_DC`
[1] "Gene.symbol" "logFC"      

$`A-BUGS-23_MP`
[1] "Gene.symbol" "logFC"      

$GSE64179
[1] "Gene.symbol" "logFC"      

$GSE67427
[1] "Gene.symbol" "logFC"      

$GSE148731.MF1
[1] "Gene.symbol" "logFC"      

$GSE148731.MF2
[1] "Gene.symbol" "logFC"      
length(all.data.list) # 8
[1] 8

Merge data frames

merged.df <- all.data.list %>% purrr::reduce(dplyr::full_join, by="Gene.symbol") %>% 
  tibble::column_to_rownames("Gene.symbol") %>% 
  set_colnames(paste0("dataset", seq_along(.)))

dim(merged.df) # 30687     8
[1] 30687     8
head(merged.df) %>% 
  knitr::kable(., row.names = T, format="html") %>% 
  kableExtra::kable_styling("striped", full_width = T)
dataset1 dataset2 dataset3 dataset4 dataset5 dataset6 dataset7 dataset8
A1BG -0.0991062 0.025601 NA NA 0.0601550 -0.1946990 -0.3256752 -0.2898109
A1BG-AS1 0.1676623 0.010700 NA NA NA NA -0.0930987 -0.0342790
A1CF 0.2107242 -0.020400 0.0479557 0.0408764 -0.0382101 NA NA NA
A2M -1.2083956 -0.382000 -0.3817807 0.0638831 -0.9667654 -0.4127574 -0.0881266 -1.2132680
A2M-AS1 0.4142154 NA NA NA NA NA -0.3492692 -1.5406447
A2ML1 0.1463684 0.003970 NA NA 0.5481361 NA NA NA

Summary table:

df.summary <- data.frame(
  Column = names(merged.df),
  Dataset=names(all.data.list),
  Min = round(sapply(merged.df, min, na.rm=T), 2),
  Max = round(sapply(merged.df, max, na.rm=T), 2),
  Count.zero = sapply(merged.df, function(x) {sum(x==0, na.rm=T)}),
  Count.NA = comma(sapply(merged.df, function(x) {sum(is.na(x))}))
) %>% 
  mutate(Count.zero = comma(as.integer(Count.zero), accuracy=1))
knitr::kable(df.summary, row.names = F, format="html", align=rep("l", 5)) %>% 
  kableExtra::kable_styling("striped", full_width = T)
Column Dataset Min Max Count.zero Count.NA
dataset1 GSE103092 -3.48 7.02 0 9,662
dataset2 GSE34151 -3.75 5.65 2 9,925
dataset3 A-BUGS-23_DC -8.78 9.84 1,670 16,919
dataset4 A-BUGS-23_MP -6.26 8.72 14 16,919
dataset5 GSE64179 -9.43 11.61 0 11,911
dataset6 GSE67427 -4.16 8.21 0 17,963
dataset7 GSE148731.MF1 -2.39 9.70 0 12,885
dataset8 GSE148731.MF2 -7.52 9.80 0 13,602
cat("Number of rows in merged data frame:<b>", comma(nrow(merged.df)), "</b><br>\n")

Number of rows in merged data frame: 30,687

cat("Number of gene symbols in merged data frame:<b>", comma(length(unique(row.names(merged.df)))), "</b><br>\n")

Number of gene symbols in merged data frame: 30,687

cat("Number of complete data rows: <b>", comma(sum(complete.cases(merged.df))), "</b> (",
    percent(sum(complete.cases(merged.df))/length(unique(row.names(merged.df)))),
    ")<br>\n", sep="")

Number of complete data rows: 7,729 (25%)

Filter rows: Three NA max

three.na.max.count <- as.matrix(merged.df) %>% 
  apply(., 1, function(x) {  # 1 = select rows
    n <- sum(is.na(x))
    n
  })
class(three.na.max.count) # integer
[1] "integer"
length(three.na.max.count) # 30687
[1] 30687
merged.df.filt.na <- merged.df[three.na.max.count <= 3,]
dim(merged.df.filt.na)# 14743 8 vs 11873     8  vs  8626    8 (one NA max)
[1] 14743     8
 dim(merged.df)    #  30687     8  
[1] 30687     8
cat("Number of rows with max three NA: <b>", comma(nrow(merged.df.filt.na)), "</b> (",
    percent(nrow(merged.df.filt.na)/length(unique(row.names(merged.df)))),
    ")<br>\n", sep="")
Number of rows with max three NA: <b>14,743</b> (48%)<br>

Print 10 rows:

head(merged.df.filt.na, 10) %>% 
  knitr::kable(., row.names = T, format="html", align=rep("l", 5)) %>% 
  kableExtra::kable_styling("striped", full_width = T)
dataset1 dataset2 dataset3 dataset4 dataset5 dataset6 dataset7 dataset8
A1BG -0.0991062 0.025601 NA NA 0.0601550 -0.1946990 -0.3256752 -0.2898109
A1CF 0.2107242 -0.020400 0.0479557 0.0408764 -0.0382101 NA NA NA
A2M -1.2083956 -0.382000 -0.3817807 0.0638831 -0.9667654 -0.4127574 -0.0881266 -1.2132680
A4GALT -0.3770078 0.288000 0.1716445 0.3668996 -0.0035594 3.1197860 0.3486892 2.8026054
A4GNT -0.5358127 0.002210 0.0000000 0.0000000 -2.9383732 NA NA NA
AAAS -0.0636266 -0.224000 0.1201606 -0.0527861 -0.6797045 -0.4116309 -0.3497130 -0.3449479
AACS -0.3830068 0.090400 -0.5272623 0.0146090 0.0995290 -0.4685173 -0.2095024 -0.2474777
AAGAB 0.1637777 -0.150000 -0.3421747 -0.6694862 -0.2844965 -0.2943462 0.0484454 -0.2547554
AAK1 0.0656871 -0.006945 0.0000000 0.0000000 -0.0675787 0.3417414 0.3352014 -0.1097963
AAMDC -0.1248942 -0.022400 -0.2027101 -0.1717276 -0.6259770 -0.0881718 0.0590204 0.0251869

Summary table after filtering NAs:

df.summary.2 <- data.frame(
  Column = names(merged.df.filt.na),
  Dataset=names(all.data.list),
  Min = round(sapply(merged.df.filt.na, min, na.rm=T), 2),
  Max = round(sapply(merged.df.filt.na, max, na.rm=T), 2),
  Count.zero = sapply(merged.df.filt.na, function(x) {sum(x==0, na.rm=T)}),
  Count.NA = comma(sapply(merged.df.filt.na, function(x) {sum(is.na(x))}))
) %>% 
  mutate(Count.zero = comma(as.integer(Count.zero), accuracy=1))
knitr::kable(df.summary.2, row.names = F, format="html", align=rep("l", 5)) %>% 
  kableExtra::kable_styling("striped", full_width = T)

Column Dataset Min Max Count.zero Count.NA
dataset1 GSE103092 -3.48 7.02 0 505
dataset2 GSE34151 -3.75 5.65 1 252
dataset3 A-BUGS-23_DC -8.78 9.84 1,300 2,705
dataset4 A-BUGS-23_MP -6.26 8.72 12 2,705
dataset5 GSE64179 -9.43 11.10 0 278
dataset6 GSE67427 -4.11 8.21 0 3,063
dataset7 GSE148731.MF1 -2.39 9.70 0 3,131
dataset8 GSE148731.MF2 -7.52 9.80 0 3,362

rm(df.summary.2)

Calculate ranks (1)

Key paramters:

  • NAs stay NAs
  • ties are averaged
  • keeping NAs results in different maximum ranks
  • (will have to rank again after filling NAs)

Ranks: down-regulated

ranks.down.mx <- apply(merged.df.filt.na, 2, rank, ties.method="average", na.last="keep")
# class(ranks.down.mx) # matrix
# dim(ranks.down.mx) # 12786     5
# ranks.down.mx[1:10,]

Print 6 rows (sorted):

ranks.down.mx[order(rowMeans(ranks.down.mx, na.rm=T)),] %>% head
       dataset1 dataset2 dataset3 dataset4 dataset5 dataset6
MS4A6A      302      2.0        4        6       17      295
TREM2       195     58.5       12       37       62       88
ADORA3      185      4.0      531       24        4       22
CD36        140    139.0       58        1      298      167
S100A4      284     68.5       75       86       54      896
FZD2        510     37.5      175       95      313      400
       dataset7 dataset8
MS4A6A        4        3
TREM2       337      236
ADORA3       NA       NA
CD36        297      413
S100A4       49      506
FZD2         NA      313

The maximum ranks will vary by dataset:

apply(ranks.down.mx, 2, max, na.rm=T) %>%
  as.data.frame(., stringsAsFactors=F) %>% 
  tibble::rownames_to_column() %>% 
  set_colnames(c("Dataset (down)", "Max.Rank")) %>% 
  mutate(Max.Rank = comma(Max.Rank)) %>% 
  knitr::kable(., row.names = F, align=c("l","r"), format="html") %>% 
  kableExtra::kable_styling("striped")
Dataset (down) Max.Rank
dataset1 14,238
dataset2 14,491
dataset3 12,038
dataset4 12,038
dataset5 14,465
dataset6 11,680
dataset7 11,612
dataset8 11,381

Ranks: up-regulated

ranks.up.mx <- apply((merged.df.filt.na * -1), 2, rank, ties.method="average", na.last="keep")

Print 6 rows (sorted):

ranks.up.mx[order(rowMeans(ranks.up.mx, na.rm=T)),] %>% head
        dataset1 dataset2 dataset3 dataset4 dataset5 dataset6
CCL20         53      8.0        5        2        7        3
CCR7           1      4.0       25       16       41        4
IL1B          48     11.0       18       59       36       37
IDO1         167      1.0       22       10       12       72
TNFAIP6       17    176.0       50       50       45       43
IL6          131    129.5       64        6       46       17
        dataset7 dataset8
CCL20         NA       NA
CCR7          NA       NA
IL1B          51        2
IDO1          NA       NA
TNFAIP6       NA       NA
IL6           NA       NA
apply(ranks.up.mx, 2, max, na.rm=T) %>%
  as.data.frame(., stringsAsFactors=F) %>% 
  tibble::rownames_to_column() %>% 
  set_colnames(c("Dataset (up)", "Max.Rank")) %>% 
  mutate(Max.Rank = comma(Max.Rank)) %>% 
  knitr::kable(., row.names = F, align=c("l","r"), format="html") %>% 
  kableExtra::kable_styling("striped")
Dataset (up) Max.Rank
dataset1 14,238
dataset2 14,491
dataset3 12,038
dataset4 12,038
dataset5 14,465
dataset6 11,680
dataset7 11,612
dataset8 11,381

Replace missing values with averages

na.to.average <- function(x) {
  any.na <- any(is.na(x))
  if (isTRUE(any.na)) {
    indx.na <- which(is.na(x))
    ave <- mean(x, na.rm=T)
    x[indx.na] <- ave
  }
  x
}

Replace NAs: down-regulation

ranks.down.nona.mx <- t(apply(ranks.down.mx, 1, na.to.average))
# dim(ranks.down.mx) # 12786     5
# dim(ranks.down.nona.mx) # 12786     5
# sum(is.na(ranks.down.mx)) # 1436
# sum(is.na(ranks.down.nona.mx)) # 0
# ranks.down.nona.mx[1:4,1:4]
ranks.down.nona.mx[order(rowMeans(ranks.down.nona.mx, na.rm=T)),] %>% 
  head
       dataset1 dataset2 dataset3 dataset4 dataset5 dataset6
MS4A6A      302      2.0        4        6       17      295
TREM2       195     58.5       12       37       62       88
ADORA3      185      4.0      531       24        4       22
CD36        140    139.0       58        1      298      167
S100A4      284     68.5       75       86       54      896
FZD2        510     37.5      175       95      313      400
       dataset7 dataset8
MS4A6A   4.0000   3.0000
TREM2  337.0000 236.0000
ADORA3 128.3333 128.3333
CD36   297.0000 413.0000
S100A4  49.0000 506.0000
FZD2   263.3571 313.0000

Replace NAs: up-regulation

ranks.up.nona.mx <- t(apply(ranks.up.mx, 1, na.to.average))
# dim(ranks.up.mx) # 12786     5
# dim(ranks.up.nona.mx) # 12786     5
# sum(is.na(ranks.up.mx)) # 1436
# sum(is.na(ranks.up.nona.mx)) # 0
# ranks.up.nona.mx[1:4,1:4]
ranks.up.nona.mx[order(rowMeans(ranks.up.nona.mx, na.rm=T)),] %>% 
  head
        dataset1 dataset2 dataset3 dataset4 dataset5 dataset6
CCL20         53      8.0        5        2        7        3
CCR7           1      4.0       25       16       41        4
IL1B          48     11.0       18       59       36       37
IDO1         167      1.0       22       10       12       72
TNFAIP6       17    176.0       50       50       45       43
IL6          131    129.5       64        6       46       17
        dataset7 dataset8
CCL20   13.00000 13.00000
CCR7    15.16667 15.16667
IL1B    51.00000  2.00000
IDO1    47.33333 47.33333
TNFAIP6 63.50000 63.50000
IL6     65.58333 65.58333

Calculate ranks (2)

Having replaced missing values with averages, repeat the ranking such that maximum ranks will be the same for each dataset.
Key paramters:

  • ties are averaged

Ranks: down-regulated (2)

ranks.down.mx.2 <- apply(ranks.down.nona.mx, 2, rank, ties.method="average", na.last="keep")
# class(ranks.down.mx.2) # matrix
# dim(ranks.down.mx.2) # 12786     5
# ranks.down.mx.2[1:10,]
ranks.down.mx.2[order(rowMeans(ranks.down.mx.2, na.rm=T)),] %>% 
  head
       dataset1 dataset2 dataset3 dataset4 dataset5 dataset6
MS4A6A      302      2.0        4        6       17      295
TREM2       195     58.5       12       37       62       88
ADORA3      185      4.0      532       24        4       22
CD36        140    139.0       58        1      298      167
S100A4      284     68.5       75       86       54      897
FZD2        511     37.5      175       95      313      400
       dataset7 dataset8
MS4A6A        4        3
TREM2       339      237
ADORA3      129      129
CD36        299      414
S100A4       49      507
FZD2        265      314
apply(ranks.down.mx.2, 2, max) %>%
  as.data.frame(., stringsAsFactors=F) %>% 
  tibble::rownames_to_column() %>% 
  set_colnames(c("Dataset (down)", "Max.Rank")) %>% 
  knitr::kable(., row.names = F, align=c("l","r"), format="html") %>% 
  kableExtra::kable_styling("striped")
Dataset (down) Max.Rank
dataset1 14743
dataset2 14743
dataset3 14743
dataset4 14743
dataset5 14743
dataset6 14743
dataset7 14743
dataset8 14743

Ranks: up-regulated (2)

ranks.up.mx.2 <- apply(ranks.up.nona.mx, 2, rank, ties.method="average", na.last="keep")
# ranks.up.mx.2[1:10,]
ranks.up.mx.2[order(rowMeans(ranks.up.mx.2, na.rm=T)),] %>% head
        dataset1 dataset2 dataset3 dataset4 dataset5 dataset6
CCL20         53      8.0        5        2        7        3
CCR7           1      4.0       25       16       41        4
IL1B          48     11.0       18       59       36       37
IDO1         167      1.0       22       10       12       72
TNFAIP6       17    176.0       50       50       45       43
IL6          131    129.5       64        6       46       17
        dataset7 dataset8
CCL20       13.5     13.5
CCR7        17.0     17.0
IL1B        54.0      2.0
IDO1        50.0     50.0
TNFAIP6     67.0     67.0
IL6         70.0     70.0
apply(ranks.up.mx.2, 2, max) %>%
  as.data.frame(., stringsAsFactors=F) %>% 
  tibble::rownames_to_column() %>% 
  set_colnames(c("Dataset (up)", "Max.Rank")) %>% 
  knitr::kable(., row.names = F, align=c("l","r"), format="html") %>% 
  kableExtra::kable_styling("striped")
Dataset (up) Max.Rank
dataset1 14743
dataset2 14743
dataset3 14743
dataset4 14743
dataset5 14743
dataset6 14743
dataset7 14743
dataset8 14743

Plot correlations

Plot fold changes

panel.cor <- function(x, y, digits = 3, prefix = "", cex.cor, ...)
{
     usr <- par("usr"); on.exit(par(usr))
     par(usr = c(0, 1, 0, 1))
     r <- cor(x, y, use="pairwise.complete.obs")
     txt <- format(c(r, 0.123456789), digits = digits)[1]
     txt <- paste0("r=", txt)
     # if(missing(cex.cor)) cex.cor <- 0.8/strwidth(txt)
     # text(0.5, 0.5, txt, cex = cex.cor * r)
     text(0.5, 0.5, txt, cex=1.5, col="blue", font = 3)
}

cor.df<- merged.df %>% 
  set_colnames(names(all.data.list)) 
  colnames(cor.df) <- str_replace(colnames(cor.df), "\\_DC", " \nDC")
  colnames(cor.df) <- str_replace(colnames(cor.df), "\\_MP", " \nMP")
  colnames(cor.df) <- str_replace(colnames(cor.df), "\\.MF1", " \nMF1")
  colnames(cor.df) <- str_replace(colnames(cor.df), "\\.MF2", " \nMF2") 
  colnames(cor.df) <- str_replace(colnames(cor.df), "GSE103092", "GSE103092\nMP")
  colnames(cor.df) <- str_replace(colnames(cor.df), "GSE64179", "GSE64179\nDC")              
  colnames(cor.df) <- str_replace(colnames(cor.df),"GSE34151", "GSE34151\nDC")
  colnames(cor.df) <- str_replace(colnames(cor.df),"GSE67427",
"GSE67427\nMP")
  
  pairs(cor.df, col = alpha("black", 0.3), pch=20, upper.panel = panel.cor,
        xaxt='n', yaxt='n',
        text.panel = function(x,y,lab,cex,font) {text(x,y,lab, cex=1.0, font=2)}) 

NA
NA

Save plot as a png

out.file <- paste("data_out", stringr::str_replace(this.script, "\\.[Rr][Mm][Dd]$", ".png"), sep="/")
out.file
[1] "data_out/rank_product_01g.3NA.png"
png(file=out.file, width = 680, height= 480, units = "px")
cor.df<- merged.df %>% 
  set_colnames(names(all.data.list)) 
  colnames(cor.df) <- str_replace(colnames(cor.df), "\\_DC", " \nDC")
  colnames(cor.df) <- str_replace(colnames(cor.df), "\\_MP", " \nMP")
  colnames(cor.df) <- str_replace(colnames(cor.df), "\\.MF1", " \nMF1")
  colnames(cor.df) <- str_replace(colnames(cor.df), "\\.MF2", " \nMF2") 
    colnames(cor.df) <- str_replace(colnames(cor.df), "GSE103092", "GSE103092\nMP")
  colnames(cor.df) <- str_replace(colnames(cor.df), "GSE64179", "GSE64179\nDC")              
  colnames(cor.df) <- str_replace(colnames(cor.df),"GSE34151", "GSE34151\nDC")
  colnames(cor.df) <- str_replace(colnames(cor.df),"GSE67427",
"GSE67427\nMP")
  
  pairs(cor.df, col = alpha("black", 0.3), pch=20, upper.panel = panel.cor,
        xaxt='n', yaxt='n',
        text.panel = function(x,y,lab,cex,font) {text(x,y,lab, cex=1.4, font=2)}) 
  
dev.off()
null device 
          1 

Plot ranks

panel.cor.spearman <- function(x, y, digits = 3, prefix = "", cex.cor, ...)
{
     usr <- par("usr"); on.exit(par(usr))
     par(usr = c(0, 1, 0, 1))
     r <- cor(x, y, use="pairwise.complete.obs", method="spearman")
     txt <- format(c(r, 0.123456789), digits = digits)[1]
     txt <- paste0("r=", txt)
     text(0.5, 0.5, txt, cex=2.5, col="purple", font = 3)
}
ranks.down.mx.2 %>% 
  set_colnames(names(all.data.list)) %>% 
  pairs(., col = alpha("black", 0.3), pch=18, upper.panel = panel.cor.spearman,
        xaxt='n', yaxt='n',
      text.panel = function(x,y,lab,cex,font) {text(x,y,lab, cex=3, font=2)})

Calculate rank sum statistics

# browseVignettes("RankProd")
# help(package="RankProd")
# help(RankProducts, package="RankProd")
get.rank.sum <- function(mx) {
  cl.down <- rep(1, ncol(mx))
  rank.sum <- RankProd::RankProducts(
    data=mx,
    cl=cl.down,
    calculateProduct=FALSE,
    gene.names = row.names(ranks.down.mx.2)
    )
  rank.sum
}
get.rank.sum.stats <- function(rank.sum.result) {
  # Get rank sums:
  RS <- as.data.frame(rank.sum.result$RSs) %>% 
    tibble::rownames_to_column(.) %>% 
    dplyr::select(1:2) %>% 
    set_colnames(c("Gene", "Rank.Sum")) %>% 
    arrange(Rank.Sum)
  
  # Get p values:
  PVAL <- as.data.frame(rank.sum.result$pval) %>% 
  tibble::rownames_to_column(.) %>% 
  dplyr::select(1:2) %>% 
  set_colnames(c("Gene", "P.Val"))
  
  # Merge:
  RS.PVAL <- merge(x=RS, y=PVAL, by="Gene", all=T)
  
  # Adjust p values:
  RS.PVAL$P.Val.Adj <- p.adjust(RS.PVAL$P.Val, method = "fdr")
  
  RS.PVAL
}

Rank sum stats: down-regulation

rank.sum.down <- get.rank.sum(ranks.down.mx.2) %>% 
  get.rank.sum.stats(.)
Rank Product analysis for paired case 
 

 done  
# class(rank.sum.down) # "data.frame"
# dim(rank.sum.down) # 11684     4
top.10.down<- rank.sum.down %>% 
  arrange(Rank.Sum) %>% 
  head(., 10) 
  # knitr::kable(., row.names = F, align=rep("l", 4), format = "html", escape = F) %>% 
  # kableExtra::kable_styling("striped", full_width = T)
top.10.down
NA

Save top 10 down-regulated genes

out.table.down <- paste0("data_out/",this.script, ".top.down.txt")
out.table.down
[1] "data_out/rank_product_01g.3NA.Rmd.top.down.txt"
write.table(top.10.down, file = out.table.down, sep = "\t")

Rank sum stats: up-regulation

rank.sum.up <- get.rank.sum(ranks.up.mx.2) %>% 
  get.rank.sum.stats(.)
Rank Product analysis for paired case 
 

 done  
# class(rank.sum.up) # "data.frame"
# dim(rank.sum.up) # 12786     4
top.10.up <- rank.sum.up %>% 
  arrange(Rank.Sum) %>% 
  head(., 10) 
  # knitr::kable(., row.names = F, align=rep("l", 4), format = "html", escape = F) %>% 
  # kableExtra::kable_styling("striped", full_width = T)
top.10.up

` ### Save top 10 up-regulated genes

out.table.up <- paste0("data_out/",this.script, ".top.up.txt")
out.table.up
[1] "data_out/rank_product_01g.3NA.Rmd.top.up.txt"
write.table(top.10.up, file = out.table.up, sep = "\t")

Heatmaps for top regulated genes

(Datanovia Tutorial)

get.heatmap <- function(mx, heat.cols, ha) {
  ComplexHeatmap::Heatmap(
  mx, name = "Rank",
  col = heat.cols,
  top_annotation = ha,
  cluster_rows = FALSE,
  show_row_names = FALSE,
  show_column_names = TRUE
  )
}

Heatmap: 100 most down-regulated genes

# nrow(subset(rank.sum.up, P.Val.Adj <= 0.05)) # 594
# nrow(subset(rank.sum.up, P.Val.Adj <= 0.01)) # 317
down.100 <- dplyr::arrange(rank.sum.down, P.Val.Adj, Rank.Sum) %>% 
  head(.,100) %>% .$Gene
# down.100
mx.down.100 <- ranks.down.mx.2[down.100,] %>% 
  set_colnames(names(all.data.list))
 colnames(mx.down.100) <- str_replace(colnames(mx.down.100), "_DC", " (DC)")
 colnames(mx.down.100) <- str_replace(colnames(mx.down.100), "_MP", " (MP)")
colnames(mx.down.100) <- str_replace(colnames(mx.down.100), ".MF1", " (MF1)")
colnames(mx.down.100) <- str_replace(colnames(mx.down.100), ".MF2", " (MF2)")
# dim(mx.down.100) # 100   5
head(mx.down.100)
       GSE103092 GSE34151 A-BUGS-23 (DC) A-BUGS-23 (MP) GSE64179
MS4A6A       302      2.0              4              6       17
ADORA3       185      4.0            532             24        4
TREM2        195     58.5             12             37       62
CD36         140    139.0             58              1      298
S100A4       284     68.5             75             86       54
FZD2         511     37.5            175             95      313
       GSE67427 GSE148731 (MF1) GSE148731 (MF2)
MS4A6A      295               4               3
ADORA3       22             129             129
TREM2        88             339             237
CD36        167             299             414
S100A4      897              49             507
FZD2        400             265             314

Save top 100 up-regulated genes

out.file <- paste("data_out", stringr::str_replace(this.script,"\\.[Rr][Mm][Dd]$", ".down.100.txt"), sep="/")

out.file
[1] "data_out/rank_product_01g.3NA.down.100.txt"
cat(down.100, file = out.file, sep = "\n")
down.100.df <- as.data.frame(t(mx.down.100))
# dim(down.100.df) # 5 100

row.names(down.100.df) <- names(all.data.list)

down.100.df <- as.data.frame(t(mx.down.100)) %>% 
  set_rownames(names(all.data.list)) %>% 
  tibble::rownames_to_column("Dataset") %>% 
  dplyr::mutate(Technology = "Microarray") %>%
  dplyr::mutate(Cell.type = "Macrophages") %>%
  dplyr::select(Dataset, Technology, Cell.type, dplyr::everything())
down.100.df$Technology[down.100.df$Dataset %in% names(rna.seq.data.list)] <- "RNA-seq"
down.100.df$Cell.type[down.100.df$Dataset %in% c("GSE64179", "A-BUGS-23_DC", "GSE34151")] <- "Dendritic cells"
down.100.df[,1:6]
# help(HeatmapAnnotation, package="ComplexHeatmap")
col.down <- list(Technology = c("Microarray" = "Darkblue", "RNA-seq" = "lightblue"),
            Cell.type = c("Macrophages" = "grey", "Dendritic cells" = "black"))
ha.down <- ComplexHeatmap::HeatmapAnnotation(
  Technology = down.100.df$Technology,
  Cell.type = down.100.df$Cell.type,
  col=col.down
)
rm(col.down)
# "YlOrRd", "YlOrBr", "YlGnBu"*, "PuBuGn", "YlGn", "RdPu"-, "PuRd"-, "BuGn"
# "Blues", "Greens", "Purples", "Oranges"
heat.cols <- colorRampPalette(RColorBrewer::brewer.pal(7, "Oranges"))(256) %>% rev
get.heatmap(mx.down.100, heat.cols, ha.down)

Save heatmap for down-regulated genes

out.file.map.down <- paste("data_out",
                           str_replace(this.script,"\\.[Rr][Mm][Dd]$", ".map.down.pdf"),
                           sep="/")
out.file.map.down
[1] "data_out/rank_product_01g.3NA.map.down.pdf"
pdf(file = out.file.map.down, width=6, height=8.5)
get.heatmap(mx.down.100, heat.cols, ha.down)
dev.off()
null device 
          1 

Heatmap: 100 most up-regulated genes

up.100 <- dplyr::arrange(rank.sum.up, P.Val.Adj, Rank.Sum) %>% 
  head(.,100) %>% .$Gene
head(up.100)
[1] "CCL20"   "CCR7"    "IL1B"    "IDO1"    "TNFAIP6" "IL6"    
mx.up.100 <- ranks.up.mx.2[up.100,] %>% 
  set_colnames(names(all.data.list))
colnames(mx.down.100) <- str_replace(colnames(mx.down.100), "_DC", " (DC)")
colnames(mx.down.100) <- str_replace(colnames(mx.down.100), "_MP", " (MP)")
colnames(mx.down.100) <- str_replace(colnames(mx.down.100), ".MF1", " (MF1)")
colnames(mx.down.100) <- str_replace(colnames(mx.down.100), ".MF2", " (MF2)")
 dim(mx.up.100) # 100   6
[1] 100   8
head(mx.up.100)
        GSE103092 GSE34151 A-BUGS-23_DC A-BUGS-23_MP GSE64179
CCL20          53      8.0            5            2        7
CCR7            1      4.0           25           16       41
IL1B           48     11.0           18           59       36
IDO1          167      1.0           22           10       12
TNFAIP6        17    176.0           50           50       45
IL6           131    129.5           64            6       46
        GSE67427 GSE148731.MF1 GSE148731.MF2
CCL20          3          13.5          13.5
CCR7           4          17.0          17.0
IL1B          37          54.0           2.0
IDO1          72          50.0          50.0
TNFAIP6       43          67.0          67.0
IL6           17          70.0          70.0

Save top 100 up-regulated genes

out.file <- paste("data_out", stringr::str_replace(this.script,"\\.[Rr][Mm][Dd]$", ".up.100.txt"), sep="/")

out.file
[1] "data_out/rank_product_01g.3NA.up.100.txt"
cat(up.100, file = out.file, sep = "\n")
up.100.df <- as.data.frame(t(mx.up.100))
# dim(down.100.df) # 5 100
row.names(up.100.df) <- names(all.data.list)

up.100.df <- as.data.frame(t(mx.up.100)) %>% 
  set_rownames(names(all.data.list)) %>% 
  tibble::rownames_to_column("Dataset") %>% 
  dplyr::mutate(Technology = "Microarray") %>%
  dplyr::mutate(Cell.type = "Macrophages") %>%
  dplyr::select(Dataset, Technology, Cell.type, dplyr::everything())
up.100.df$Technology[up.100.df$Dataset %in% names(rna.seq.data.list)] <- "RNA-seq"
up.100.df$Cell.type[up.100.df$Dataset %in% c("GSE64179", "A-BUGS-23_DC","GSE34151")] <- "Dendritic cells"
col.up <- list(Technology = c("Microarray" = "Darkblue", "RNA-seq" = "lightblue"),
            Cell.type = c("Macrophages" = "grey", "Dendritic cells" = "black"))
ha.up <- ComplexHeatmap::HeatmapAnnotation(
  Technology = up.100.df$Technology,
  Cell.type = up.100.df$Cell.type,
  col=col.up
)
heat.cols <- colorRampPalette(RColorBrewer::brewer.pal(7, "Oranges"))(256) %>% rev
get.heatmap(mx.up.100, heat.cols, ha.up)

Save heatmap for up-regulated genes

out.file.map.up <- paste("data_out", stringr::str_replace(this.script,"\\.[Rr][Mm][Dd]$", ".map.up.pdf"), sep="/")
out.file.map.up
[1] "data_out/rank_product_01g.3NA.map.up.pdf"
pdf(file = out.file.map.up, width=6, height=8.5)
get.heatmap(mx.up.100, heat.cols, ha.up)
dev.off()
null device 
          1 

Save genes

Save top down-regulated genes

gs.down.01 <- rank.sum.down %>% 
  dplyr::filter(P.Val.Adj <= 0.01) %>% 
  use_series("Gene")
 length(gs.down.01) # 824

[1] 824

cat("Number of consistently <b><i>down</i></b>regulated genes (p <= 0.01):<b>", length(gs.down.01), "</b>")

Number of consistently downregulated genes (p <= 0.01): 824

out.file.down.01 <- paste0("data_out/", this.script, ".down.01.txt")
cat(gs.down.01, sep="\n", file=out.file.down.01)
cat("File saved:", out.file.down.01)
File saved: data_out/rank_product_01g.3NA.Rmd.down.01.txt

Save down-regulated genes with adjusted p value <= 0.001

gs.down.001 <- rank.sum.down %>% 
  dplyr::filter(P.Val.Adj <= 0.001) %>% 
  use_series("Gene")
 length(gs.down.001) # 299
[1] 299
cat("Number of consistently <b><i>down</i></b>regulated genes (p <= 0.001):<b>", length(gs.down.001), "</b>")
Number of consistently <b><i>down</i></b>regulated genes (p <= 0.001):<b> 299 </b>
out.file.down.001 <- paste0("data_out/", this.script, ".down.001.txt")
cat(gs.down.001, sep="\n", file=out.file.down.001)
cat("File saved:", out.file.down.001)
File saved: data_out/rank_product_01g.3NA.Rmd.down.001.txt

Save top up-regulated genes

gs.up.01 <- rank.sum.up %>% 
  dplyr::filter(P.Val.Adj <= 0.01) %>% 
  use_series("Gene")
length(gs.up.01) # 912

[1] 912

cat("Number of consistently <b><i>up</i></b>regulated genes (p <= 0.01):<b>", length(gs.up.01), "</b>")

Number of consistently upregulated genes (p <= 0.01): 912

out.file.up.01 <- paste0("data_out/", this.script, ".up.01.txt")
cat(gs.up.01, sep="\n", file=out.file.up.01)
cat("File saved:", out.file.up.01)
File saved: data_out/rank_product_01g.3NA.Rmd.up.01.txt

Save up-regulated genes with adjusted p value <= 0.001

gs.up.001 <- rank.sum.up %>% 
  dplyr::filter(P.Val.Adj <= 0.001) %>% 
  use_series("Gene")
length(gs.up.001) # 479
[1] 479
cat("Number of consistently <b><i>up</i></b>regulated genes (p <= 0.001):<b>", length(gs.up.001), "</b>")
Number of consistently <b><i>up</i></b>regulated genes (p <= 0.001):<b> 479 </b>
out.file.up.001 <- paste0("data_out/", this.script, ".up.001.txt")
cat(gs.up.001, sep="\n", file=out.file.up.001)
cat("File saved:", out.file.up.001)
File saved: data_out/rank_product_01g.3NA.Rmd.up.001.txt

Session info

Sys.time()
[1] "2020-09-09 12:00:33 BST"
# sessionInfo()
devtools::session_info()
─ Session info ──────────────────────────────────────────────────

─ Packages ──────────────────────────────────────────────────────
 package        * version date       lib source        
 assertthat       0.2.1   2019-03-21 [2] CRAN (R 3.5.3)
 backports        1.1.4   2019-04-10 [2] CRAN (R 3.5.3)
 BiocManager      1.30.10 2019-11-16 [1] CRAN (R 3.5.3)
 callr            3.4.3   2020-03-28 [1] CRAN (R 3.5.3)
 cellranger       1.1.0   2016-07-27 [1] CRAN (R 3.5.3)
 circlize         0.4.10  2020-06-15 [1] CRAN (R 3.5.3)
 cli              2.0.2   2020-02-28 [1] CRAN (R 3.5.3)
 colorspace       1.4-1   2019-03-18 [2] CRAN (R 3.5.3)
 ComplexHeatmap   1.20.0  2018-10-30 [1] Bioconductor  
 crayon           1.3.4   2017-09-16 [2] CRAN (R 3.5.3)
 desc             1.2.0   2018-05-01 [1] CRAN (R 3.5.3)
 devtools       * 2.3.0   2020-04-10 [1] CRAN (R 3.5.3)
 digest           0.6.25  2020-02-23 [1] CRAN (R 3.5.3)
 dplyr          * 1.0.0   2020-05-29 [1] CRAN (R 3.5.3)
 ellipsis         0.3.1   2020-05-15 [1] CRAN (R 3.5.3)
 evaluate         0.14    2019-05-28 [2] CRAN (R 3.5.3)
 fansi            0.4.0   2018-10-05 [2] CRAN (R 3.5.3)
 farver           2.0.3   2020-01-16 [1] CRAN (R 3.5.3)
 fs               1.4.2   2020-06-30 [1] CRAN (R 3.5.3)
 generics         0.0.2   2018-11-29 [1] CRAN (R 3.5.3)
 GetoptLong       1.0.2   2020-07-06 [1] CRAN (R 3.5.3)
 GlobalOptions    0.1.2   2020-06-10 [1] CRAN (R 3.5.3)
 glue             1.4.1   2020-05-13 [1] CRAN (R 3.5.3)
 gmp            * 0.6-0   2020-06-09 [1] CRAN (R 3.5.3)
 highr            0.8     2019-03-20 [2] CRAN (R 3.5.3)
 hms              0.5.3   2020-01-08 [1] CRAN (R 3.5.3)
 htmltools        0.5.0   2020-06-16 [1] CRAN (R 3.5.3)
 httr             1.4.1   2019-08-05 [1] CRAN (R 3.5.3)
 kableExtra     * 1.1.0   2019-03-16 [1] CRAN (R 3.5.3)
 knitr          * 1.25    2019-09-18 [2] CRAN (R 3.5.3)
 lifecycle        0.2.0   2020-03-06 [1] CRAN (R 3.5.3)
 magrittr       * 1.5     2014-11-22 [2] CRAN (R 3.5.3)
 memoise          1.1.0   2017-04-21 [1] CRAN (R 3.5.3)
 munsell          0.5.0   2018-06-12 [2] CRAN (R 3.5.3)
 packrat          0.5.0   2018-11-14 [1] CRAN (R 3.5.3)
 pillar           1.4.5   2020-07-09 [1] CRAN (R 3.5.3)
 pkgbuild         1.0.8   2020-05-07 [1] CRAN (R 3.5.3)
 pkgconfig        2.0.2   2018-08-16 [2] CRAN (R 3.5.3)
 pkgload          1.1.0   2020-05-29 [1] CRAN (R 3.5.3)
 prettyunits      1.0.2   2015-07-13 [2] CRAN (R 3.5.3)
 processx         3.4.1   2019-07-18 [2] CRAN (R 3.5.3)
 ps               1.3.0   2018-12-21 [2] CRAN (R 3.5.3)
 purrr          * 0.3.4   2020-04-17 [1] CRAN (R 3.5.3)
 R6               2.4.0   2019-02-14 [2] CRAN (R 3.5.3)
 RankProd       * 3.8.0   2018-10-30 [1] Bioconductor  
 RColorBrewer   * 1.1-2   2014-12-07 [2] CRAN (R 3.5.3)
 Rcpp             1.0.2   2019-07-25 [2] CRAN (R 3.5.3)
 readr            1.3.1   2018-12-21 [1] CRAN (R 3.5.3)
 readxl         * 1.3.1   2019-03-13 [1] CRAN (R 3.5.3)
 rematch          1.0.1   2016-04-21 [2] CRAN (R 3.5.3)
 remotes          2.1.1   2020-02-15 [1] CRAN (R 3.5.3)
 rjson            0.2.20  2018-06-08 [1] CRAN (R 3.5.3)
 rlang            0.4.7   2020-07-09 [1] CRAN (R 3.5.3)
 rmarkdown        2.3     2020-06-18 [1] CRAN (R 3.5.3)
 Rmpfr          * 0.8-1   2020-01-24 [1] CRAN (R 3.5.3)
 rprojroot        1.3-2   2018-01-03 [1] CRAN (R 3.5.3)
 rstudioapi     * 0.11    2020-02-07 [1] CRAN (R 3.5.3)
 rvest            0.3.5   2019-11-08 [1] CRAN (R 3.5.3)
 scales         * 1.1.1   2020-05-11 [1] CRAN (R 3.5.3)
 sessioninfo      1.1.1   2018-11-05 [1] CRAN (R 3.5.3)
 shape            1.4.4   2018-02-07 [1] CRAN (R 3.5.3)
 stringi          1.4.3   2019-03-12 [2] CRAN (R 3.5.3)
 stringr        * 1.4.0   2019-02-10 [2] CRAN (R 3.5.3)
 testthat         2.3.2   2020-03-02 [1] CRAN (R 3.5.3)
 tibble         * 3.0.2   2020-07-07 [1] CRAN (R 3.5.3)
 tidyr          * 1.1.0   2020-05-20 [1] CRAN (R 3.5.3)
 tidyselect       1.1.0   2020-05-11 [1] CRAN (R 3.5.3)
 usethis        * 1.6.1   2020-04-29 [1] CRAN (R 3.5.3)
 vctrs            0.3.1   2020-06-05 [1] CRAN (R 3.5.3)
 viridisLite      0.3.0   2018-02-01 [2] CRAN (R 3.5.3)
 webshot          0.5.1   2018-09-28 [2] CRAN (R 3.5.3)
 withr            2.1.2   2018-03-15 [2] CRAN (R 3.5.3)
 xfun             0.9     2019-08-21 [2] CRAN (R 3.5.3)
 xml2             1.3.2   2020-04-23 [1] CRAN (R 3.5.3)

[1] /homes/homedirs20/sghms/student/users/m1902840/R/x86_64-pc-linux-gnu-library/3.5
[2] /usr/local/lib64/R/library
knitr::purl(this.script, documentation = 0)


processing file: rank_product_01g.3NA.Rmd

  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |                                                                 |   1%
  |                                                                       
  |.                                                                |   1%
  |                                                                       
  |.                                                                |   2%
  |                                                                       
  |..                                                               |   2%
  |                                                                       
  |..                                                               |   3%
  |                                                                       
  |...                                                              |   4%
  |                                                                       
  |...                                                              |   5%
  |                                                                       
  |....                                                             |   6%
  |                                                                       
  |.....                                                            |   7%
  |                                                                       
  |.....                                                            |   8%
  |                                                                       
  |......                                                           |   9%
  |                                                                       
  |......                                                           |  10%
  |                                                                       
  |.......                                                          |  10%
  |                                                                       
  |.......                                                          |  11%
  |                                                                       
  |........                                                         |  12%
  |                                                                       
  |........                                                         |  13%
  |                                                                       
  |.........                                                        |  13%
  |                                                                       
  |.........                                                        |  14%
  |                                                                       
  |..........                                                       |  15%
  |                                                                       
  |..........                                                       |  16%
  |                                                                       
  |...........                                                      |  16%
  |                                                                       
  |...........                                                      |  17%
  |                                                                       
  |............                                                     |  18%
  |                                                                       
  |............                                                     |  19%
  |                                                                       
  |.............                                                    |  20%
  |                                                                       
  |..............                                                   |  21%
  |                                                                       
  |..............                                                   |  22%
  |                                                                       
  |...............                                                  |  23%
  |                                                                       
  |...............                                                  |  24%
  |                                                                       
  |................                                                 |  24%
  |                                                                       
  |................                                                 |  25%
  |                                                                       
  |.................                                                |  25%
  |                                                                       
  |.................                                                |  26%
  |                                                                       
  |.................                                                |  27%
  |                                                                       
  |..................                                               |  27%
  |                                                                       
  |..................                                               |  28%
  |                                                                       
  |...................                                              |  29%
  |                                                                       
  |....................                                             |  30%
  |                                                                       
  |....................                                             |  31%
  |                                                                       
  |.....................                                            |  32%
  |                                                                       
  |.....................                                            |  33%
  |                                                                       
  |......................                                           |  34%
  |                                                                       
  |.......................                                          |  35%
  |                                                                       
  |.......................                                          |  36%
  |                                                                       
  |........................                                         |  36%
  |                                                                       
  |........................                                         |  37%
  |                                                                       
  |........................                                         |  38%
  |                                                                       
  |.........................                                        |  38%
  |                                                                       
  |.........................                                        |  39%
  |                                                                       
  |..........................                                       |  39%
  |                                                                       
  |..........................                                       |  40%
  |                                                                       
  |...........................                                      |  41%
  |                                                                       
  |...........................                                      |  42%
  |                                                                       
  |............................                                     |  43%
  |                                                                       
  |.............................                                    |  44%
  |                                                                       
  |.............................                                    |  45%
  |                                                                       
  |..............................                                   |  46%
  |                                                                       
  |..............................                                   |  47%
  |                                                                       
  |...............................                                  |  47%
  |                                                                       
  |...............................                                  |  48%
  |                                                                       
  |................................                                 |  49%
  |                                                                       
  |................................                                 |  50%
  |                                                                       
  |.................................                                |  50%
  |                                                                       
  |.................................                                |  51%
  |                                                                       
  |..................................                               |  52%
  |                                                                       
  |..................................                               |  53%
  |                                                                       
  |...................................                              |  53%
  |                                                                       
  |...................................                              |  54%
  |                                                                       
  |....................................                             |  55%
  |                                                                       
  |....................................                             |  56%
  |                                                                       
  |.....................................                            |  57%
  |                                                                       
  |......................................                           |  58%
  |                                                                       
  |......................................                           |  59%
  |                                                                       
  |.......................................                          |  60%
  |                                                                       
  |.......................................                          |  61%
  |                                                                       
  |........................................                         |  61%
  |                                                                       
  |........................................                         |  62%
  |                                                                       
  |.........................................                        |  62%
  |                                                                       
  |.........................................                        |  63%
  |                                                                       
  |.........................................                        |  64%
  |                                                                       
  |..........................................                       |  64%
  |                                                                       
  |..........................................                       |  65%
  |                                                                       
  |...........................................                      |  66%
  |                                                                       
  |............................................                     |  67%
  |                                                                       
  |............................................                     |  68%
  |                                                                       
  |.............................................                    |  69%
  |                                                                       
  |.............................................                    |  70%
  |                                                                       
  |..............................................                   |  71%
  |                                                                       
  |...............................................                  |  72%
  |                                                                       
  |...............................................                  |  73%
  |                                                                       
  |................................................                 |  73%
  |                                                                       
  |................................................                 |  74%
  |                                                                       
  |................................................                 |  75%
  |                                                                       
  |.................................................                |  75%
  |                                                                       
  |.................................................                |  76%
  |                                                                       
  |..................................................               |  76%
  |                                                                       
  |..................................................               |  77%
  |                                                                       
  |...................................................              |  78%
  |                                                                       
  |...................................................              |  79%
  |                                                                       
  |....................................................             |  80%
  |                                                                       
  |.....................................................            |  81%
  |                                                                       
  |.....................................................            |  82%
  |                                                                       
  |......................................................           |  83%
  |                                                                       
  |......................................................           |  84%
  |                                                                       
  |.......................................................          |  84%
  |                                                                       
  |.......................................................          |  85%
  |                                                                       
  |........................................................         |  86%
  |                                                                       
  |........................................................         |  87%
  |                                                                       
  |.........................................................        |  87%
  |                                                                       
  |.........................................................        |  88%
  |                                                                       
  |..........................................................       |  89%
  |                                                                       
  |..........................................................       |  90%
  |                                                                       
  |...........................................................      |  90%
  |                                                                       
  |...........................................................      |  91%
  |                                                                       
  |............................................................     |  92%
  |                                                                       
  |............................................................     |  93%
  |                                                                       
  |.............................................................    |  94%
  |                                                                       
  |..............................................................   |  95%
  |                                                                       
  |..............................................................   |  96%
  |                                                                       
  |...............................................................  |  97%
  |                                                                       
  |...............................................................  |  98%
  |                                                                       
  |................................................................ |  98%
  |                                                                       
  |................................................................ |  99%
  |                                                                       
  |.................................................................|  99%
  |                                                                       
  |.................................................................| 100%
output file: rank_product_01g.3NA.R
[1] "rank_product_01g.3NA.R"
---
title: "Meta analysis - rank product method"
author: '9401'
date: "`r Sys.Date()`"
output:
  html_notebook:
    df_print: paged
    toc: yes
    highlight: tango
    number_sections: no
    fig_caption: yes
    theme: sandstone
    code_folding: hide
    toc_depth: 3
    toc_float: yes
  pdf_document:
    toc: yes
    toc_depth: '3'
  html_document:
    toc: yes
    toc_depth: '3'
    df_print: paged
---
  
<style>
h1 {background: darkblue;color: white;padding-left: 7px;}
h2 {color: darkblue;}
.code-folding-btn {display: none;}
</style>
  
<script>
  function show_span(id) {
    var x = document.getElementById(id);
    if (x.style.display === 'none') {
      x.style.display = 'inline';
    } else {
      x.style.display = 'none';
    }
  }
function myFunction(id) {
  var x = document.getElementById(id);
  if (x.style.display === 'none') {
    x.style.display = 'block';
  } else {
    x.style.display = 'none';
  }
}
</script>

## Objective  

Perform meta analysis of TB microarray AND RNA-seq data using the Rank Sum approach.  
Overview:  

1. Microarray data: load log(fold change) data that Diana generated with Geo2r  
2. RNA-seq data: load published tables  
3. Aggregate microarray data by gene symbol; where the same gene is represented by several probes, calculate the _**median**_ log(fold change)  
4. Merge fold change data from all datasets into a single matrix (**outer join**)  
5. Filter out genes with missing values (NA): keep only genes with a maximum number of missing values  
6. In each dataset rank genes by fold change (convert fold changes to ranks; NAs stay NAs; maximum ranks will vary by NA count)  
7. Replace NAs with average ranks in the other datasets  
8. Now rank again so number of ranks in each dataset is the same  
9. Calculate **rank sum** statistics and p values using RankProd package  
10. Generate heatmaps of top 100 regulated genes  
  
```{r include=FALSE}
# About <style> chunk above:
# * provides custom formatting for level-1 and level-2 headers
# * hides the 'code' buttons next to every code chunk in the html output; leaves only one 'code' button at the top of the html notebook.

# About the <script> chunk at the top and bottom of the file:
# * allows to insert <div>'s whose content visbility is toggled with a button.
# * I use this to show/hide te output of sessionInfo() at the end of the script (see below)
```


```{r include=FALSE}
# RMARKDOWN HELP
# Rmarkdown:
# https://bookdown.org/yihui/rmarkdown/

# Chunk options:
# https://yihui.name/knitr/options
```

```{r cleanup, warning=FALSE, message=FALSE, error=FALSE}
# CHECK R VERSION
# stopifnot(R.version.string == "R version 3.4.3 (2017-11-30)")

# CLEANUP
# Clear all variables:
rm(list=ls(all=TRUE))

# Unload current packages:
# if (!is.null(names(utils::sessionInfo()[["otherPkgs"]]))) pacman::p_unload("all")
```

```{r setup}
# KNITR DEFAULTS
knitr::opts_chunk$set(eval = TRUE, fig.height = 10)
```

```{r packages, results="hide", warning=FALSE, message=FALSE, error=FALSE}
# LOAD PACKAGES:
if (!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager")
if (!requireNamespace("ComplexHeatmap", quietly = TRUE)) BiocManager::install("ComplexHeatmap")
if (!require(stringr)) install.packages("stringr")
if (!require(rstudioapi)) install.packages("rstudioapi")
if (!require(tidyr)) install.packages("tidyr")
if (!require(tibble)) install.packages("tibble")
if (!require(readxl)) install.packages("readxl")
if (!require(purrr)) install.packages("purrr")
if (!require(RColorBrewer)) install.packages("RColorBrewer")
if (!require(devtools)) install.packages("devtools")
if (!require(magrittr)) install.packages("magrittr")
if (!require(dplyr)) install.packages("dplyr")
if (!require(scales)) install.packages("scales")
if (!require(knitr)) install.packages("knitr")
if (!require(kableExtra)) install.packages("kableExtra")
if (!requireNamespace("RankProd", quietly = TRUE)) BiocManager::install("RankProd")

library(magrittr)
library(dplyr)
library(scales)
library(knitr)
library(kableExtra)
library(RankProd)
library(stringr)
```


```{r setup_script, eval=TRUE, results='hide'}
# GET CURRENT SCRIPT NAME
(this.script <- rstudioapi::getActiveDocumentContext() %>% .$path %>% basename)
getwd()
list.files()
stopifnot(this.script != "")
```

## Data: microarray studies    

### Geo2r data files  
```{r results="hide"}
geo2r.data.files <- list.files(path="../source_data", recursive = T, 
                               pattern="^GSE.*Geo2r.*txt$", full.names = T)
geo2r.data.files <- grep(pattern = "GSE108363", geo2r.data.files, invert = T, value = T)
cat(basename(geo2r.data.files), sep="<br>\n")
```

```{r results="hide"}
tailleux.data.files <- list.files(path="../source_data", recursive = T,
                                  pattern="*tailleux.*txt", full.names = T)
cat(basename(tailleux.data.files), sep="<br>\n")

```

```{r results="asis"}
muarray.data.files <- c(geo2r.data.files, tailleux.data.files)
muarray.data.files <- grep("GSE29731", muarray.data.files, invert = T, value=T)
cat(basename(muarray.data.files), sep="<br>\n")
```
### Load Geo2r data  
```{r results="hide"}
geo2r.data.list <- lapply(muarray.data.files, function(f) {
  a <- read.table(f, sep="\t", header=T, quote='"', stringsAsFactors = F)
  names(a)[grep("symbol", names(a), ignore.case = T)] <- "Gene.symbol"
  names(a) <- stringr::str_remove(names(a), "^X\\.")
  names(a) <- stringr::str_remove(names(a), "\\.$")
  a
}) %>% 
  set_names(stringr::str_remove_all(basename(muarray.data.files), "\\.txt"))


names(geo2r.data.list) <- stringr::str_remove(names(geo2r.data.list), "^X")
names(geo2r.data.list)[grep("DC", names(geo2r.data.list))] <- "A-BUGS-23_DC"
names(geo2r.data.list)[grep("MP", names(geo2r.data.list))] <- "A-BUGS-23_MP"

# class(geo2r.data.list) # "list"
# length(geo2r.data.list) # 4
# sapply(geo2r.data.list, class) # "data.frame"
# sapply(geo2r.data.list, nrow) %>% unname # 29102 47231 24501 24501
```

```{r}
data.frame(List.item = names(geo2r.data.list),
           class = sapply(geo2r.data.list, class),
           Columns = sapply(geo2r.data.list, ncol),
           Rows = comma(sapply(geo2r.data.list, nrow)),
           Has.Gene.Symbols = sapply(geo2r.data.list, function(x) {any(grepl("Gene.symbol", names(x)))})
           # Unique.genes = sapply(geo2r.data.list, function(x) {length(unique(x$))})
           ) %>% 
  mutate(Has.Gene.Symbols = cell_spec(Has.Gene.Symbols, "html", color = ifelse(Has.Gene.Symbols == "TRUE", "black", "red"))) %>% 
  knitr::kable(., row.names = F, align=c("l", "l", "c", "c", "c"), format = "html", escape = F) %>% 
  kableExtra::kable_styling("striped", full_width = T)
```

```{r results="hide"}
# head(geo2r.data.list[[2]])
```


### Aggregate logFC by gene symbol  
```{r}
geo.fold.change.list <- lapply(seq_along(geo2r.data.list), function(i, list.names) {
  df <- geo2r.data.list[[i]]
  x <- dplyr::select(df, Gene.symbol, logFC) %>% 
    dplyr::mutate(Gene.symbol = ifelse(Gene.symbol=="", NA, Gene.symbol)) %>% 
    tidyr::drop_na(.) %>% 
    dplyr::mutate(logFC = as.numeric(logFC)) %>% 
    dplyr::mutate(Gene.symbol = stringr::str_remove_all(Gene.symbol, '\\"')) %>% 
    tidyr::drop_na(.) %>% 
    dplyr::group_by(Gene.symbol) %>% 
    dplyr::summarise(logFC = median(logFC), .groups="drop") %>% 
    tidyr::drop_na(.) %>% 
    as.data.frame(., stringsAsFactors=FALSE)
  # names(x)[names(x) == "logFC"] <- list.names[i]
  x
}, list.names = names(geo2r.data.list)) %>% 
  set_names(names(geo2r.data.list))
names(geo.fold.change.list) <- stringr::str_remove(names(geo.fold.change.list), "_Geo2r.*$")
```


```{r}
data.frame(
  List.item = names(geo.fold.change.list),
  Class = sapply(geo.fold.change.list, function(x) {paste(class(x), collapse=",")}),
  Columns = sapply(geo.fold.change.list, ncol),
  Rows = comma(sapply(geo.fold.change.list, nrow)),
  Duplicated.genes = sapply(geo.fold.change.list, function(x) {any(duplicated(x$Gene.symbol))})
) %>% 
  knitr::kable(., row.names = F, align=c("l", "l", "c", "c", "c"), format = "html", escape = F) %>% 
  kableExtra::kable_styling("striped", full_width = T)
```

## Data: RNA-seq studies  
```{r}
# <a href="https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE64179">GSE64179</a>
# https://pubmed.ncbi.nlm.nih.gov/26392366/
add.geo.hyperlink <- function(id) {
  a <- "<a href='https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc="
  b <- "'>"
  c <- "</a>"
  res <- paste0(a, id, b, id, c)
  res
}

add.pubmed.hyperlink <- function(id) {
  a <- "<a href='https://pubmed.ncbi.nlm.nih.gov/"
  b <- "/'>"
  c <- "</a>"
  res <- paste0(a, id, b, id, c)
  res
}
```


```{r}
rs.dataset.table <- data.frame(
  c=c("GSE64179", "26392366", "6", "Dendritic cells"),
  b=c("GSE67427", "26586179", "8", "Monocyte derived macrophages"),
  a=c("GSE148731", "32341411", "6", "M1, M2 macrophages")
) %>%
  t %>% 
  as.data.frame(., stringsAsFactors=F) %>% 
  set_colnames(c("Accession", "Pubmed", "Replicates", "Cell.Type")) %>% 
  mutate(Accession=add.geo.hyperlink(Accession)) %>% 
  mutate(Pubmed = add.pubmed.hyperlink(Pubmed))
knitr::kable(rs.dataset.table, row.names = F, escape=F, format = "html") %>% 
  kableExtra::kable_styling("striped", full_width = T)
```


### Load RNA-seq data: GSE64179  
```{r message=FALSE}
# list.files("../source_data/rna_seq/GSE64179")
rna.seq.data.files <- list()
rna.seq.data.files$GSE64179 <- "../source_data/rna_seq/GSE64179/Pacis_2015_TableS6_DEG.xlsx"
stopifnot(file.exists(rna.seq.data.files$GSE64179))
rna.seq.data.list <- list()
rna.seq.data.list$GSE64179 <- readxl::read_xlsx(path=rna.seq.data.files$GSE64179, range = "B3:C18818") %>%
set_colnames(c("Gene.symbol", "logFC")) %>% 
mutate(Gene.symbol = toupper(Gene.symbol)) %>% 
group_by(Gene.symbol) %>% 
summarise(logFC = median(logFC), .groups="drop") %>% 
as.data.frame(., stringsAsFactors=FALSE)
```


```{r}
# WITHOUT GROUPING / AGGREGATING:
# dim(rna.seq.data$GSE64179) # 18815     2
# sapply(rna.seq.data$GSE64179, class) # "character"   "numeric"
# range(rna.seq.data$GSE64179$logFC) # -9.426613 11.612170
# sum(rna.seq.data$GSE64179$logFC == 0) # 0
# sum(duplicated(rna.seq.data$GSE64179$Gene.symbol)) # 37
# dups <- rna.seq.data$GSE64179$Gene.symbol[duplicated(rna.seq.data$GSE64179$Gene.symbol)]
# subset(rna.seq.data$GSE64179, Gene.symbol %in% dups) %>% 
#   arrange(Gene.symbol, logFC)
# head(rna.seq.data$GSE64179)
```

```{r}
# WITH AGGREGATING: 
# dim(rna.seq.data$GSE64179) # 18776     2
# sapply(rna.seq.data$GSE64179, class) # "character"   "numeric"
# range(rna.seq.data$GSE64179$logFC) # -9.426613 11.612170
# sum(rna.seq.data$GSE64179$logFC == 0) # 0
# sum(duplicated(rna.seq.data$GSE64179$Gene.symbol)) # 0
head(rna.seq.data.list$GSE64179) %>% 
  knitr::kable(., row.names = F, format="html") %>% 
  kableExtra::kable_styling("striped", full_width = T)
```




### Load RNA-seq data: GSE67427
```{r}
rna.seq.data.files$GSE67427 <-"../source_data/rna_seq/GSE67427/GSE67427_table_s2.txt"
stopifnot(file.exists(rna.seq.data.files$GSE67427))
#rna.seq.data.list <-list()
rna.seq.data.list$GSE67427<- read.table(file = rna.seq.data.files$GSE67427, header = T, sep = "\t", stringsAsFactors = FALSE)  %>% 
#str(rna.seq.data.list$GSE67427) #data.frame':	12728 obs. of  140 variables:
  # head(rna.seq.data.list$GSE67427)
  
  # names(rna.seq.data.list$GSE67427)
  # select(matches(c("name", "Rv.18")))
 select(matches(c("name","Rv.18.logFC"), ignore.case = TRUE)) %>%
   # head(rna.seq.data.list$GSE67427)
  set_colnames(c("Gene.symbol", "logFC")) %>%
  mutate(Gene.symbol = toupper(Gene.symbol)) %>% 
  group_by(Gene.symbol) %>% 
  summarise(logFC = median(logFC), .groups="drop") %>% 
  as.data.frame(., stringsAsFactors=FALSE)
 head(rna.seq.data.list$GSE67427)
```

```{r}
# WITHOUT GROUPING / AGGREGATING:
dim(rna.seq.data.list$GSE67427) #  12724     2
sapply(rna.seq.data.list$GSE67427, class) # "character"   "numeric"
range(rna.seq.data.list$GSE67427$logFC) # -4.155479  8.205439
sum(rna.seq.data.list$GSE67427$logFC == 0) # 0
sum(duplicated(rna.seq.data.list$GSE67427$Gene.symbol)) # 0
# dups <- rna.seq.data$GSE64179$Gene.symbol[duplicated(rna.seq.data$GSE64179$Gene.symbol)]
# subset(rna.seq.data$GSE64179, Gene.symbol %in% dups) %>% 
#   arrange(Gene.symbol, logFC)
# head(rna.seq.data$GSE64179)
```

```{r}
# WITH AGGREGATING: 
# dim(rna.seq.data$GSE64179) # 18776     2
# sapply(rna.seq.data$GSE64179, class) # "character"   "numeric"
# range(rna.seq.data$GSE64179$logFC) # -9.426613 11.612170
# sum(rna.seq.data$GSE64179$logFC == 0) # 0
# sum(duplicated(rna.seq.data$GSE64179$Gene.symbol)) # 0
head(rna.seq.data.list$GSE67427) %>% 
  knitr::kable(., row.names = F, format="html") %>% 
  kableExtra::kable_styling("striped", full_width = T)

```

### Load RNA-seq data: GSE48731.MF1
```{r message=FALSE}
# list.files("../source_data/rna_seq/GSE148731")
# rna.seq.data.files <- list()
rna.seq.data.files$GSE148731.MF1 <- "../source_data/rna_seq/GSE148731/GSE148731_toptable_MF1_01.Rmd.2020_07_09.txt"
stopifnot(file.exists(rna.seq.data.files$GSE148731.MF1))
 # rna.seq.data.list <- list()
rna.seq.data.list$GSE148731.MF1 <- read.table(file = rna.seq.data.files$GSE148731.MF1, header = TRUE, sep = "\t", stringsAsFactors = FALSE) %>%
   select(c("Gene.symbol", "logFC")) %>% 
  mutate(Gene.symbol = toupper(Gene.symbol)) %>% 
  # summarise(logFC = median(logFC), .groups="drop") %>% 
   as.data.frame(., stringsAsFactors=FALSE)
head(rna.seq.data.list$GSE148731.MF1)
```
```{r}
# WITH AGGREGATING:
dim(rna.seq.data.list$GSE148731.MF1) # 17802     2
sapply(rna.seq.data.list$GSE148731.MF1, class) # "character"   "numeric"
range(rna.seq.data.list$GSE148731.MF1$logFC) # -2.386124  9.704865
sum(rna.seq.data.list$GSE148731.MF1$logFC == 0) #
sum(duplicated(rna.seq.data.list$GSE148731.MF1$Gene.symbol)) # 0

```

```{r}
head(rna.seq.data.list$GSE148731.MF1) %>% 
  knitr::kable(., row.names = F, format="html") %>% 
  kableExtra::kable_styling("striped", full_width = T)
```


### Load RNA-seq data: GSE48731.MF2
```{r message=FALSE}
 # list.files("../source_data/rna_seq/GSE148731")
# rna.seq.data.files <- list()
rna.seq.data.files$GSE148731.MF2 <- "../source_data/rna_seq/GSE148731/GSE148731_toptable_MF2_01.Rmd.2020_07_09.txt"
stopifnot(file.exists(rna.seq.data.files$GSE148731.MF2))
 # rna.seq.data.list <- list()
rna.seq.data.list$GSE148731.MF2 <- read.table(file = rna.seq.data.files$GSE148731.MF2, header = TRUE, sep = "\t", stringsAsFactors = FALSE) %>%
   select(c("Gene.symbol", "logFC")) %>% 
  mutate(Gene.symbol = toupper(Gene.symbol)) %>% 
 # summarise(logFC = median(logFC), .groups="drop") %>% 
   as.data.frame(., stringsAsFactors=FALSE)
head(rna.seq.data.list$GSE148731.MF2)
```


```{r}
# WITH AGGREGATING:
dim(rna.seq.data.list$GSE148731.MF2) #  17085     2
sapply(rna.seq.data.list$GSE148731.MF2, class) # "character"   "numeric"
range(rna.seq.data.list$GSE148731.MF2$logFC) # -7.524136  9.795100
sum(rna.seq.data.list$GSE148731.MF2$logFC == 0) #
sum(duplicated(rna.seq.data.list$GSE148731.MF2$Gene.symbol)) # 0

```

```{r}
head(rna.seq.data.list$GSE148731.MF2) %>% 
  knitr::kable(., row.names = F, format="html") %>% 
  kableExtra::kable_styling("striped", full_width = T)
```

## Combine all data frames  
```{r}
# length(rna.seq.data.list) #4
# length(geo.fold.change.list)# 4
all.data.list <- c(geo.fold.change.list, rna.seq.data.list)
all(sapply(all.data.list, class) == "data.frame") # TRUE
```

```{r}
lapply(all.data.list, names)
length(all.data.list) # 8
```


### Merge data frames  
```{r}
merged.df <- all.data.list %>% purrr::reduce(dplyr::full_join, by="Gene.symbol") %>% 
  tibble::column_to_rownames("Gene.symbol") %>% 
  set_colnames(paste0("dataset", seq_along(.)))

dim(merged.df) # 30687     8
head(merged.df) %>% 
  knitr::kable(., row.names = T, format="html") %>% 
  kableExtra::kable_styling("striped", full_width = T)
```

Summary table:  
```{r}
df.summary <- data.frame(
  Column = names(merged.df),
  Dataset=names(all.data.list),
  Min = round(sapply(merged.df, min, na.rm=T), 2),
  Max = round(sapply(merged.df, max, na.rm=T), 2),
  Count.zero = sapply(merged.df, function(x) {sum(x==0, na.rm=T)}),
  Count.NA = comma(sapply(merged.df, function(x) {sum(is.na(x))}))
) %>% 
  mutate(Count.zero = comma(as.integer(Count.zero), accuracy=1))
knitr::kable(df.summary, row.names = F, format="html", align=rep("l", 5)) %>% 
  kableExtra::kable_styling("striped", full_width = T)
```


```{r results="asis"}
cat("Number of rows in merged data frame:<b>", comma(nrow(merged.df)), "</b><br>\n")
cat("Number of gene symbols in merged data frame:<b>", comma(length(unique(row.names(merged.df)))), "</b><br>\n")
```

```{r results="asis"}
cat("Number of complete data rows: <b>", comma(sum(complete.cases(merged.df))), "</b> (",
    percent(sum(complete.cases(merged.df))/length(unique(row.names(merged.df)))),
    ")<br>\n", sep="")
```

### Filter rows: Three NA max  
```{r}
three.na.max.count <- as.matrix(merged.df) %>% 
  apply(., 1, function(x) {  # 1 = select rows
    n <- sum(is.na(x))
    n
  })
class(three.na.max.count) # integer
length(three.na.max.count) # 30687
merged.df.filt.na <- merged.df[three.na.max.count <= 3,]
dim(merged.df.filt.na)# 14743 8 vs 11873     8  vs  8626    8 (one NA max)
 dim(merged.df)    #  30687     8  
```


```{r}
cat("Number of rows with max three NA: <b>", comma(nrow(merged.df.filt.na)), "</b> (",
    percent(nrow(merged.df.filt.na)/length(unique(row.names(merged.df)))),
    ")<br>\n", sep="")

```

Print 10 rows:  
```{r}
head(merged.df.filt.na, 10) %>% 
  knitr::kable(., row.names = T, format="html", align=rep("l", 5)) %>% 
  kableExtra::kable_styling("striped", full_width = T)
```

### Summary table after filtering NAs:  
```{r}
df.summary.2 <- data.frame(
  Column = names(merged.df.filt.na),
  Dataset=names(all.data.list),
  Min = round(sapply(merged.df.filt.na, min, na.rm=T), 2),
  Max = round(sapply(merged.df.filt.na, max, na.rm=T), 2),
  Count.zero = sapply(merged.df.filt.na, function(x) {sum(x==0, na.rm=T)}),
  Count.NA = comma(sapply(merged.df.filt.na, function(x) {sum(is.na(x))}))
) %>% 
  mutate(Count.zero = comma(as.integer(Count.zero), accuracy=1))
knitr::kable(df.summary.2, row.names = F, format="html", align=rep("l", 5)) %>% 
  kableExtra::kable_styling("striped", full_width = T)
rm(df.summary.2)
```



## Calculate ranks (1)  

Key paramters:  

* NAs stay NAs  
* ties are averaged  
* keeping NAs results in different maximum ranks  
* (will have to rank again after filling NAs)  

### Ranks: down-regulated  
```{r}
ranks.down.mx <- apply(merged.df.filt.na, 2, rank, ties.method="average", na.last="keep")
# class(ranks.down.mx) # matrix
# dim(ranks.down.mx) # 12786     5
# ranks.down.mx[1:10,]
```

Print 6 rows (sorted):  
```{r}
ranks.down.mx[order(rowMeans(ranks.down.mx, na.rm=T)),] %>% head
```

The maximum ranks will vary by dataset:  
```{r}
apply(ranks.down.mx, 2, max, na.rm=T) %>%
  as.data.frame(., stringsAsFactors=F) %>% 
  tibble::rownames_to_column() %>% 
  set_colnames(c("Dataset (down)", "Max.Rank")) %>% 
  mutate(Max.Rank = comma(Max.Rank)) %>% 
  knitr::kable(., row.names = F, align=c("l","r"), format="html") %>% 
  kableExtra::kable_styling("striped")
```

### Ranks: up-regulated  
```{r}
ranks.up.mx <- apply((merged.df.filt.na * -1), 2, rank, ties.method="average", na.last="keep")
```

Print 6 rows (sorted):  
```{r}
ranks.up.mx[order(rowMeans(ranks.up.mx, na.rm=T)),] %>% head
```


```{r}
apply(ranks.up.mx, 2, max, na.rm=T) %>%
  as.data.frame(., stringsAsFactors=F) %>% 
  tibble::rownames_to_column() %>% 
  set_colnames(c("Dataset (up)", "Max.Rank")) %>% 
  mutate(Max.Rank = comma(Max.Rank)) %>% 
  knitr::kable(., row.names = F, align=c("l","r"), format="html") %>% 
  kableExtra::kable_styling("striped")
```

## Replace missing values with averages  
```{r}
na.to.average <- function(x) {
  any.na <- any(is.na(x))
  if (isTRUE(any.na)) {
    indx.na <- which(is.na(x))
    ave <- mean(x, na.rm=T)
    x[indx.na] <- ave
  }
  x
}
```

### Replace NAs: down-regulation  
```{r}
ranks.down.nona.mx <- t(apply(ranks.down.mx, 1, na.to.average))
# dim(ranks.down.mx) # 12786     5
# dim(ranks.down.nona.mx) # 12786     5
# sum(is.na(ranks.down.mx)) # 1436
# sum(is.na(ranks.down.nona.mx)) # 0
# ranks.down.nona.mx[1:4,1:4]
ranks.down.nona.mx[order(rowMeans(ranks.down.nona.mx, na.rm=T)),] %>% 
  head
```

### Replace NAs: up-regulation  
```{r}
ranks.up.nona.mx <- t(apply(ranks.up.mx, 1, na.to.average))
# dim(ranks.up.mx) # 12786     5
# dim(ranks.up.nona.mx) # 12786     5
# sum(is.na(ranks.up.mx)) # 1436
# sum(is.na(ranks.up.nona.mx)) # 0
# ranks.up.nona.mx[1:4,1:4]
ranks.up.nona.mx[order(rowMeans(ranks.up.nona.mx, na.rm=T)),] %>% 
  head
```


## Calculate ranks (2)  
Having replaced missing values with averages, repeat the ranking such that maximum ranks will be the same for each dataset.  
Key paramters:  

* ties are averaged  

### Ranks: down-regulated (2)  
```{r}
ranks.down.mx.2 <- apply(ranks.down.nona.mx, 2, rank, ties.method="average", na.last="keep")
# class(ranks.down.mx.2) # matrix
# dim(ranks.down.mx.2) # 12786     5
# ranks.down.mx.2[1:10,]
ranks.down.mx.2[order(rowMeans(ranks.down.mx.2, na.rm=T)),] %>% 
  head
```

```{r}
apply(ranks.down.mx.2, 2, max) %>%
  as.data.frame(., stringsAsFactors=F) %>% 
  tibble::rownames_to_column() %>% 
  set_colnames(c("Dataset (down)", "Max.Rank")) %>% 
  knitr::kable(., row.names = F, align=c("l","r"), format="html") %>% 
  kableExtra::kable_styling("striped")
```

### Ranks: up-regulated (2)  
```{r}
ranks.up.mx.2 <- apply(ranks.up.nona.mx, 2, rank, ties.method="average", na.last="keep")
# ranks.up.mx.2[1:10,]
ranks.up.mx.2[order(rowMeans(ranks.up.mx.2, na.rm=T)),] %>% head
```

```{r}
apply(ranks.up.mx.2, 2, max) %>%
  as.data.frame(., stringsAsFactors=F) %>% 
  tibble::rownames_to_column() %>% 
  set_colnames(c("Dataset (up)", "Max.Rank")) %>% 
  knitr::kable(., row.names = F, align=c("l","r"), format="html") %>% 
  kableExtra::kable_styling("striped")
```



## Plot correlations  
### Plot fold changes  
```{r}
panel.cor <- function(x, y, digits = 3, prefix = "", cex.cor, ...)
{
     usr <- par("usr"); on.exit(par(usr))
     par(usr = c(0, 1, 0, 1))
     r <- cor(x, y, use="pairwise.complete.obs")
     txt <- format(c(r, 0.123456789), digits = digits)[1]
     txt <- paste0("r=", txt)
     # if(missing(cex.cor)) cex.cor <- 0.8/strwidth(txt)
     # text(0.5, 0.5, txt, cex = cex.cor * r)
     text(0.5, 0.5, txt, cex=1.5, col="blue", font = 3)
}

cor.df<- merged.df %>% 
  set_colnames(names(all.data.list)) 
  colnames(cor.df) <- str_replace(colnames(cor.df), "\\_DC", " \nDC")
  colnames(cor.df) <- str_replace(colnames(cor.df), "\\_MP", " \nMP")
  colnames(cor.df) <- str_replace(colnames(cor.df), "\\.MF1", " \nMF1")
  colnames(cor.df) <- str_replace(colnames(cor.df), "\\.MF2", " \nMF2") 
  colnames(cor.df) <- str_replace(colnames(cor.df), "GSE103092", "GSE103092\nMP")
  colnames(cor.df) <- str_replace(colnames(cor.df), "GSE64179", "GSE64179\nDC")              
  colnames(cor.df) <- str_replace(colnames(cor.df),"GSE34151", "GSE34151\nDC")
  colnames(cor.df) <- str_replace(colnames(cor.df),"GSE67427",
"GSE67427\nMP")
  
  pairs(cor.df, col = alpha("black", 0.3), pch=20, upper.panel = panel.cor,
        xaxt='n', yaxt='n',
        text.panel = function(x,y,lab,cex,font) {text(x,y,lab, cex=1.0, font=2)}) 
  
  
```

### Save plot as a png
```{r}
out.file <- paste("data_out", stringr::str_replace(this.script, "\\.[Rr][Mm][Dd]$", ".png"), sep="/")
out.file
```

```{r}
png(file=out.file, width = 680, height= 480, units = "px")
cor.df<- merged.df %>% 
  set_colnames(names(all.data.list)) 
  colnames(cor.df) <- str_replace(colnames(cor.df), "\\_DC", " \nDC")
  colnames(cor.df) <- str_replace(colnames(cor.df), "\\_MP", " \nMP")
  colnames(cor.df) <- str_replace(colnames(cor.df), "\\.MF1", " \nMF1")
  colnames(cor.df) <- str_replace(colnames(cor.df), "\\.MF2", " \nMF2") 
    colnames(cor.df) <- str_replace(colnames(cor.df), "GSE103092", "GSE103092\nMP")
  colnames(cor.df) <- str_replace(colnames(cor.df), "GSE64179", "GSE64179\nDC")              
  colnames(cor.df) <- str_replace(colnames(cor.df),"GSE34151", "GSE34151\nDC")
  colnames(cor.df) <- str_replace(colnames(cor.df),"GSE67427",
"GSE67427\nMP")
  
  pairs(cor.df, col = alpha("black", 0.3), pch=20, upper.panel = panel.cor,
        xaxt='n', yaxt='n',
        text.panel = function(x,y,lab,cex,font) {text(x,y,lab, cex=1.4, font=2)}) 
  
dev.off()
```


### Plot ranks  
```{r}
panel.cor.spearman <- function(x, y, digits = 3, prefix = "", cex.cor, ...)
{
     usr <- par("usr"); on.exit(par(usr))
     par(usr = c(0, 1, 0, 1))
     r <- cor(x, y, use="pairwise.complete.obs", method="spearman")
     txt <- format(c(r, 0.123456789), digits = digits)[1]
     txt <- paste0("r=", txt)
     text(0.5, 0.5, txt, cex=2.5, col="purple", font = 3)
}
ranks.down.mx.2 %>% 
  set_colnames(names(all.data.list)) %>% 
  pairs(., col = alpha("black", 0.3), pch=18, upper.panel = panel.cor.spearman,
        xaxt='n', yaxt='n',
      text.panel = function(x,y,lab,cex,font) {text(x,y,lab, cex=3, font=2)})
```

## Calculate rank sum statistics  
```{r}
# browseVignettes("RankProd")
# help(package="RankProd")
# help(RankProducts, package="RankProd")
```

```{r}
get.rank.sum <- function(mx) {
  cl.down <- rep(1, ncol(mx))
  rank.sum <- RankProd::RankProducts(
    data=mx,
    cl=cl.down,
    calculateProduct=FALSE,
    gene.names = row.names(ranks.down.mx.2)
    )
  rank.sum
}
```

```{r}
get.rank.sum.stats <- function(rank.sum.result) {
  # Get rank sums:
  RS <- as.data.frame(rank.sum.result$RSs) %>% 
    tibble::rownames_to_column(.) %>% 
    dplyr::select(1:2) %>% 
    set_colnames(c("Gene", "Rank.Sum")) %>% 
    arrange(Rank.Sum)
  
  # Get p values:
  PVAL <- as.data.frame(rank.sum.result$pval) %>% 
  tibble::rownames_to_column(.) %>% 
  dplyr::select(1:2) %>% 
  set_colnames(c("Gene", "P.Val"))
  
  # Merge:
  RS.PVAL <- merge(x=RS, y=PVAL, by="Gene", all=T)
  
  # Adjust p values:
  RS.PVAL$P.Val.Adj <- p.adjust(RS.PVAL$P.Val, method = "fdr")
  
  RS.PVAL
}
```


### Rank sum stats: down-regulation  
```{r message=FALSE}
rank.sum.down <- get.rank.sum(ranks.down.mx.2) %>% 
  get.rank.sum.stats(.)
# class(rank.sum.down) # "data.frame"
# dim(rank.sum.down) # 11684     4
top.10.down<- rank.sum.down %>% 
  arrange(Rank.Sum) %>% 
  head(., 10) 
  # knitr::kable(., row.names = F, align=rep("l", 4), format = "html", escape = F) %>% 
  # kableExtra::kable_styling("striped", full_width = T)
top.10.down

```
### Save top 10 down-regulated genes
```{r}
out.table.down <- paste0("data_out/",this.script, ".top.down.txt")
out.table.down
write.table(top.10.down, file = out.table.down, sep = "\t")
```

### Rank sum stats: up-regulation  
```{r message=FALSE}
rank.sum.up <- get.rank.sum(ranks.up.mx.2) %>% 
  get.rank.sum.stats(.)
# class(rank.sum.up) # "data.frame"
# dim(rank.sum.up) # 12786     4
top.10.up <- rank.sum.up %>% 
  arrange(Rank.Sum) %>% 
  head(., 10) 
  # knitr::kable(., row.names = F, align=rep("l", 4), format = "html", escape = F) %>% 
  # kableExtra::kable_styling("striped", full_width = T)
top.10.up
```
`
### Save top 10 up-regulated genes
```{r}
out.table.up <- paste0("data_out/",this.script, ".top.up.txt")
out.table.up
write.table(top.10.up, file = out.table.up, sep = "\t")
```


## Heatmaps for top regulated genes  
([Datanovia Tutorial](https://www.datanovia.com/en/lessons/heatmap-in-r-static-and-interactive-visualization/#r-base-heatmap-heatmap))

```{r}
get.heatmap <- function(mx, heat.cols, ha) {
  ComplexHeatmap::Heatmap(
  mx, name = "Rank",
  col = heat.cols,
  top_annotation = ha,
  cluster_rows = FALSE,
  show_row_names = FALSE,
  show_column_names = TRUE
  )
}

```

### Heatmap: 100 most down-regulated genes  
```{r}
# nrow(subset(rank.sum.up, P.Val.Adj <= 0.05)) # 594
# nrow(subset(rank.sum.up, P.Val.Adj <= 0.01)) # 317
down.100 <- dplyr::arrange(rank.sum.down, P.Val.Adj, Rank.Sum) %>% 
  head(.,100) %>% .$Gene
# down.100
mx.down.100 <- ranks.down.mx.2[down.100,] %>% 
  set_colnames(names(all.data.list))
 colnames(mx.down.100) <- str_replace(colnames(mx.down.100), "_DC", " (DC)")
 colnames(mx.down.100) <- str_replace(colnames(mx.down.100), "_MP", " (MP)")
colnames(mx.down.100) <- str_replace(colnames(mx.down.100), ".MF1", " (MF1)")
colnames(mx.down.100) <- str_replace(colnames(mx.down.100), ".MF2", " (MF2)")
# dim(mx.down.100) # 100   5
head(mx.down.100)
```
### Save top 100 up-regulated genes
```{r}
out.file <- paste("data_out", stringr::str_replace(this.script,"\\.[Rr][Mm][Dd]$", ".down.100.txt"), sep="/")

out.file
cat(down.100, file = out.file, sep = "\n")
```



```{r}
down.100.df <- as.data.frame(t(mx.down.100))
# dim(down.100.df) # 5 100

row.names(down.100.df) <- names(all.data.list)

down.100.df <- as.data.frame(t(mx.down.100)) %>% 
  set_rownames(names(all.data.list)) %>% 
  tibble::rownames_to_column("Dataset") %>% 
  dplyr::mutate(Technology = "Microarray") %>%
  dplyr::mutate(Cell.type = "Macrophages") %>%
  dplyr::select(Dataset, Technology, Cell.type, dplyr::everything())
down.100.df$Technology[down.100.df$Dataset %in% names(rna.seq.data.list)] <- "RNA-seq"
down.100.df$Cell.type[down.100.df$Dataset %in% c("GSE64179", "A-BUGS-23_DC", "GSE34151")] <- "Dendritic cells"
down.100.df[,1:6]
```

```{r}
# help(HeatmapAnnotation, package="ComplexHeatmap")
col.down <- list(Technology = c("Microarray" = "Darkblue", "RNA-seq" = "lightblue"),
            Cell.type = c("Macrophages" = "grey", "Dendritic cells" = "black"))
ha.down <- ComplexHeatmap::HeatmapAnnotation(
  Technology = down.100.df$Technology,
  Cell.type = down.100.df$Cell.type,
  col=col.down
)
rm(col.down)
```

```{r}
# "YlOrRd", "YlOrBr", "YlGnBu"*, "PuBuGn", "YlGn", "RdPu"-, "PuRd"-, "BuGn"
# "Blues", "Greens", "Purples", "Oranges"
heat.cols <- colorRampPalette(RColorBrewer::brewer.pal(7, "Oranges"))(256) %>% rev
get.heatmap(mx.down.100, heat.cols, ha.down)
```
### Save heatmap for down-regulated genes
```{r}
out.file.map.down <- paste("data_out",
                           str_replace(this.script,"\\.[Rr][Mm][Dd]$", ".map.down.pdf"),
                           sep="/")
out.file.map.down
```

```{r}
pdf(file = out.file.map.down, width=6, height=8.5)
get.heatmap(mx.down.100, heat.cols, ha.down)
dev.off()
```


### Heatmap: 100 most up-regulated genes  
```{r}
up.100 <- dplyr::arrange(rank.sum.up, P.Val.Adj, Rank.Sum) %>% 
  head(.,100) %>% .$Gene
head(up.100)
mx.up.100 <- ranks.up.mx.2[up.100,] %>% 
  set_colnames(names(all.data.list))
colnames(mx.down.100) <- str_replace(colnames(mx.down.100), "_DC", " (DC)")
colnames(mx.down.100) <- str_replace(colnames(mx.down.100), "_MP", " (MP)")
colnames(mx.down.100) <- str_replace(colnames(mx.down.100), ".MF1", " (MF1)")
colnames(mx.down.100) <- str_replace(colnames(mx.down.100), ".MF2", " (MF2)")
 dim(mx.up.100) # 100   6
head(mx.up.100)

```
### Save top 100 up-regulated genes
```{r}
out.file <- paste("data_out", stringr::str_replace(this.script,"\\.[Rr][Mm][Dd]$", ".up.100.txt"), sep="/")

out.file
cat(up.100, file = out.file, sep = "\n")

```


```{r}
up.100.df <- as.data.frame(t(mx.up.100))
# dim(down.100.df) # 5 100
row.names(up.100.df) <- names(all.data.list)

up.100.df <- as.data.frame(t(mx.up.100)) %>% 
  set_rownames(names(all.data.list)) %>% 
  tibble::rownames_to_column("Dataset") %>% 
  dplyr::mutate(Technology = "Microarray") %>%
  dplyr::mutate(Cell.type = "Macrophages") %>%
  dplyr::select(Dataset, Technology, Cell.type, dplyr::everything())
up.100.df$Technology[up.100.df$Dataset %in% names(rna.seq.data.list)] <- "RNA-seq"
up.100.df$Cell.type[up.100.df$Dataset %in% c("GSE64179", "A-BUGS-23_DC","GSE34151")] <- "Dendritic cells"

```

```{r}
col.up <- list(Technology = c("Microarray" = "Darkblue", "RNA-seq" = "lightblue"),
            Cell.type = c("Macrophages" = "grey", "Dendritic cells" = "black"))
ha.up <- ComplexHeatmap::HeatmapAnnotation(
  Technology = up.100.df$Technology,
  Cell.type = up.100.df$Cell.type,
  col=col.up
)
heat.cols <- colorRampPalette(RColorBrewer::brewer.pal(7, "Oranges"))(256) %>% rev
get.heatmap(mx.up.100, heat.cols, ha.up)
```

### Save heatmap for up-regulated genes
```{r}
out.file.map.up <- paste("data_out", stringr::str_replace(this.script,"\\.[Rr][Mm][Dd]$", ".map.up.pdf"), sep="/")
out.file.map.up
```

```{r}
pdf(file = out.file.map.up, width=6, height=8.5)
get.heatmap(mx.up.100, heat.cols, ha.up)
dev.off()
```


## Save genes  
### Save top down-regulated genes  
```{r results="asis"}
gs.down.01 <- rank.sum.down %>% 
  dplyr::filter(P.Val.Adj <= 0.01) %>% 
  use_series("Gene")
 length(gs.down.01) # 824
cat("Number of consistently <b><i>down</i></b>regulated genes (p <= 0.01):<b>", length(gs.down.01), "</b>")

```
```{r}
out.file.down.01 <- paste0("data_out/", this.script, ".down.01.txt")
cat(gs.down.01, sep="\n", file=out.file.down.01)
cat("File saved:", out.file.down.01)

```

### Save down-regulated genes with adjusted p value <= 0.001
```{r}
gs.down.001 <- rank.sum.down %>% 
  dplyr::filter(P.Val.Adj <= 0.001) %>% 
  use_series("Gene")
 length(gs.down.001) # 299
cat("Number of consistently <b><i>down</i></b>regulated genes (p <= 0.001):<b>", length(gs.down.001), "</b>")


```

```{r}
out.file.down.001 <- paste0("data_out/", this.script, ".down.001.txt")
cat(gs.down.001, sep="\n", file=out.file.down.001)
cat("File saved:", out.file.down.001)

```


### Save top up-regulated genes  
```{r results="asis"}
gs.up.01 <- rank.sum.up %>% 
  dplyr::filter(P.Val.Adj <= 0.01) %>% 
  use_series("Gene")
length(gs.up.01) # 912
cat("Number of consistently <b><i>up</i></b>regulated genes (p <= 0.01):<b>", length(gs.up.01), "</b>")
```

```{r}
out.file.up.01 <- paste0("data_out/", this.script, ".up.01.txt")
cat(gs.up.01, sep="\n", file=out.file.up.01)
cat("File saved:", out.file.up.01)
```
### Save up-regulated genes with adjusted p value <= 0.001
```{r}
gs.up.001 <- rank.sum.up %>% 
  dplyr::filter(P.Val.Adj <= 0.001) %>% 
  use_series("Gene")
length(gs.up.001) # 479
cat("Number of consistently <b><i>up</i></b>regulated genes (p <= 0.001):<b>", length(gs.up.001), "</b>")
```

```{r}
out.file.up.001 <- paste0("data_out/", this.script, ".up.001.txt")
cat(gs.up.001, sep="\n", file=out.file.up.001)
cat("File saved:", out.file.up.001)
```


## Session info  
<button class="button" onclick="myFunction('DIV_5')">Show/hide session info</button>
<div id="DIV_5" class="div_default_hide">

```{r print_date_and_time}
Sys.time()
```

```{r print_session_info}
# sessionInfo()
devtools::session_info()
```
</div>
  
<script>
  var divsToHide = document.getElementsByClassName("div_default_hide");
for(var i = 0; i < divsToHide.length; i++)
{
  divsToHide[i].style.display = 'none';
}
</script>

```{r}
knitr::purl(this.script, documentation = 0)
```

