1 Correlation analysis

This report contains a correlation analysis between DNA methylation levels of probes and their annotated gene nearest genes. The main purpose is to compare the differences in the DNA methylation metadata in GDC legacy database vs GDC harmonized data.

A pair of probe and genes are consider:

Insignificant if FDR > 0.05
Strongly negatively correlated (SNC) when the rho value was less than -0.5 (FDR < 0.05);
Weakly negatively correlated (WNC) when the rho value was between -0.5 and -0.25 (FDR < 0.05);
No negative correlation (NNC) when the rho value was greater than -0.25 (FDR < 0.05);

As the purpose is to compare DNA methylation annotation, the correlation of both analyses (hg19 and hg38) used the same gene expression data (GDC harmonized - FPKM-UQ).

Annotations

hg19: Illumina manifest
- Source: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GPL13534
hg38: Zhou W, Laird PW and Shen H, Comprehensive characterization, annotation and innovative use of Infinium DNA Methylation BeadChip probes, Nucleic Acids Research 2017
- Source: http://zwdzwd.github.io/InfiniumAnnotation (Also available in harmonized GDC DNA methylation data)

2 Evaluating associations by annotation

2.1 Negative distances

2.1.1 Unique pair probe genes (with max distance TSS 1500 bp)

This sections compared hg38 associations and h19 associations. The hg38 annotation was retrieved (same used in GDC) from http://zwdzwd.github.io/InfiniumAnnotation and only genes & probes with distance between 0 and -1500 (negative distance) are kept. The hg19 annotation was retrieved the ilumina manifest. We kept only TSS200 and TSS1500 associations.

# The code to create the object (associated.genes_id and associated.genes_id.hg19) is below
associated.genes_id <- associated.genes_id[as.numeric(as.character(associated.genes_id$distance)) < 0,]
associated.genes_id

associated.genes_id$distance <- NULL # ignore different distances
associated.genes_id <- unique(associated.genes_id)
associated.genes_id

# hg19: Illumina manifest. Max distance TSS 1500
unique(associated.genes_id.hg19[,c("probe", "ensembl_gene_id","group_type")])

hg19.pairs <- unique(associated.genes_id.hg19[,c("probe", "ensembl_gene_id")])
colnames(hg19.pairs)[1] <- "probe"
hg38.pairs <- unique(associated.genes_id[,c("probe","ensembl_gene_id")])
shared.pairs <- inner_join(as_tibble(hg19.pairs),as_tibble(hg38.pairs))
unique.hg19.pairs <- anti_join(as_tibble(hg19.pairs),as_tibble(hg38.pairs))
unique.hg38.pairs <- anti_join(as_tibble(hg38.pairs),as_tibble(hg19.pairs))
unique.hg19.genes <- setdiff(unique(hg19.pairs$ensembl_gene_id),unique(hg38.pairs$ensembl_gene_id))
unique.hg38.genes <- setdiff(unique(hg38.pairs$ensembl_gene_id),unique(hg19.pairs$ensembl_gene_id))
shared.genes <- intersect(unique(hg38.pairs$ensembl_gene_id),unique(hg19.pairs$ensembl_gene_id))

2.1.2 Comparing number of unique associate probes between hg38 and hg19

library(VennDiagram)
draw.pairwise.venn(length(unique(associated.genes_id$probe)), 
                   length(unique(associated.genes_id.hg19$probe)),
                   length(intersect(unique(associated.genes_id$probe),
                                    unique(associated.genes_id.hg19$probe))),
                   category = c("Probes hg38", "Probes hg19"), 
                   lty = rep("blank", 2), 
                   fill = c("light blue", "pink"), 
                   alpha = rep(0.5, 2), 
                   cat.pos = c(0, 0), 
                   cat.dist = rep(0.025, 2))

## (polygon[GRID.polygon.11], polygon[GRID.polygon.12], polygon[GRID.polygon.13], polygon[GRID.polygon.14], text[GRID.text.15], text[GRID.text.16], lines[GRID.lines.17], text[GRID.text.18], text[GRID.text.19], text[GRID.text.20])

2.1.3 Comparing number of unique associate genes between hg38 and hg19

draw.pairwise.venn(length(unique(associated.genes_id$ensembl_gene_id)), 
                   length(unique(associated.genes_id.hg19$ensembl_gene_id)),
                   length(intersect(unique(associated.genes_id$ensembl_gene_id),
                                    unique(associated.genes_id.hg19$ensembl_gene_id))),
                   category = c("Genes associated to probes hg38", "Genes associated to probes hg19"), 
                   lty = rep("blank", 2), 
                   fill = c("light blue", "pink"), 
                   alpha = rep(0.5, 2), 
                   cat.pos = c(0, 0), 
                   cat.dist = rep(0.025, 2))

## (polygon[GRID.polygon.21], polygon[GRID.polygon.22], polygon[GRID.polygon.23], polygon[GRID.polygon.24], text[GRID.text.25], text[GRID.text.26], lines[GRID.lines.27], text[GRID.text.28], text[GRID.text.29], text[GRID.text.30])

unique.hg19.genes <- setdiff(associated.genes_id.hg19$ensembl_gene_id,associated.genes_id$ensembl_gene_id)
unique.hg38.genes <- setdiff(associated.genes_id$ensembl_gene_id,associated.genes_id.hg19$ensembl_gene_id)

2.1.4 Comparing number of unique pairs (gene-probe) between hg38 and hg19

g1 <- unique(associated.genes_id[,c("probe","ensembl_gene_id")])
g2 <- unique(associated.genes_id.hg19[,c("probe","ensembl_gene_id")])
draw.pairwise.venn(nrow(g1), 
                   nrow(g2), 
                   sum(paste0(g1$probe,"_",g1$ensembl_gene_id) %in% paste0(g2$probe,"_",g2$ensembl_gene_id)),
                   category = c("Pairs associated to probes hg38", "Pairs associated to probes hg19"), 
                   lty = rep("blank", 2), 
                   fill = c("light blue", "pink"), 
                   alpha = rep(0.5, 2), 
                   cat.pos = c(0, 0), 
                   cat.dist = rep(0.025, 2))

## (polygon[GRID.polygon.31], polygon[GRID.polygon.32], polygon[GRID.polygon.33], polygon[GRID.polygon.34], text[GRID.text.35], text[GRID.text.36], lines[GRID.lines.37], text[GRID.text.38], text[GRID.text.39], text[GRID.text.40])

2.1.5 Correlation status between probe and associated gene

2.1.5.1 Considering all pairs

library(plotly)

# Each correlation file has pairs of gene probes (max distance 1500bp)
# correlation values (spearman pvalue, rho and status)
# we read it and summarize
correlation.status <- NULL
files <- dir(pattern = "correlation_final.rda",recursive = T,full.names = T)
for(f in files){
  load(f)
  # to keep only the same distance signal
  hg38_correlation <- left_join(as_tibble(unique(associated.genes_id[,c("probe", "ensembl_gene_id")])),
                                as_tibble(hg38_correlation))
  
  tab <- plyr::count(hg19_correlation$status)
  hg19.col <- gsub("\\.\\/","",gsub("_correlation_final.rda","_hg19",f))
  colnames(tab) <- c("Status",hg19.col)
  if(is.null(correlation.status)) {
    correlation.status <- tab
  } else {
    correlation.status <- full_join(correlation.status, tab)
  }
  tab <- plyr::count(hg38_correlation$status)
  hg38.col <- gsub("\\.\\/","",gsub("_correlation_final.rda","_hg38",f))
  colnames(tab) <- c("Status",hg38.col)
  correlation.status <- full_join(correlation.status, tab)
}
correlation.status[is.na(correlation.status)] <- 0

# Each correlation file has pairs of gene probes (max distance 1500bp)
# correlation values (spearman pvalue, rho and status)
# we read it and select only Strongly negatively correlated (SNC)
SNC.type <- NULL
phantom <- NULL

files <- dir(pattern = "correlation_final.rda", recursive = T, full.names = T)
for(f in files){
  load(f)
  # to keep only the same distance signal
  hg38_correlation <- left_join(as_tibble(unique(associated.genes_id[,c("probe", "ensembl_gene_id")])),
                                as_tibble(hg38_correlation))
  
  hg19 <- hg19_correlation[hg19_correlation$status == "Strongly negatively correlated (SNC)",]
  paired.genes <- unique(hg19$ensembl_gene_id)
  tab <- plyr::count(hg38_correlation[match(paired.genes, hg38_correlation$ensembl_gene_id),]$group_type)
  phantom.hg19 <- c(sum(table(hg19[!duplicated(hg19$probe),]$Phantom)),sum(is.na(hg19[!duplicated(hg19$probe),]$Phantom)))
  hg19.col <- gsub("\\.\\/","", gsub("_correlation_final.rda","_hg19",f))
  colnames(tab) <- c("Gene_group_type", hg19.col)
  if(is.null(SNC.type)) {
    SNC.type <- tab
  } else {
    SNC.type <- full_join(SNC.type, tab)
  }
  aux <- hg38_correlation[hg38_correlation$status == "Strongly negatively correlated (SNC)",]
  z <- hg19_correlation[hg19_correlation$probe %in% aux$probe,]
  z <- z[!duplicated(z$probe),]
  phantom.hg38 <- c(sum(table(z$Phantom)),
                    sum(is.na(z$Phantom)))
  phantom <- rbind(phantom,phantom.hg19,phantom.hg38)
  tab <- plyr::count(aux[!duplicated(aux$ensembl_gene_id),]$group_type)
  hg38.col <- gsub("\\.\\/", "", gsub("_correlation_final.rda","_hg38",f))
  colnames(tab) <- c("Gene_group_type", hg38.col)
  SNC.type <- full_join(SNC.type, tab)
}

colnames(phantom) <- c("Phantom","No phantom")
rownames(phantom) <- sapply(files, function(x) return(c(paste0("hg38",x),paste0("hg19",x))))

# Each correlation file has pairs of gene probes (max distance 1500bp)
# correlation values (spearman pvalue, rho and status)
# we read it and select only Weakly negatively correlated (WNC)
WNC.type <- NULL
files <- dir(pattern = "correlation_final.rda", recursive = T, full.names = T)
for(f in files){
  load(f)
  # to keep only the same distance signal
  hg38_correlation <- left_join(as_tibble(unique(associated.genes_id[,c("probe", "ensembl_gene_id")])),
                                as_tibble(hg38_correlation))
  
  paired.genes <- unique(hg19_correlation[hg19_correlation$status == "Weakly negatively correlated (WNC)",]$ensembl_gene_id)
  tab <- plyr::count(hg38_correlation[match(paired.genes, hg38_correlation$ensembl_gene_id),]$group_type)
  hg19.col <- gsub("\\.\\/","", gsub("_correlation_final.rda","_hg19",f))
  colnames(tab) <- c("Gene_group_type", hg19.col)
  if(is.null(WNC.type)) {
    WNC.type <- tab
  } else {
    WNC.type <- full_join(WNC.type, tab)
  }
  aux <- hg38_correlation[hg38_correlation$status == "Weakly negatively correlated (WNC)",]
  tab <- plyr::count(aux[!duplicated(aux$ensembl_gene_id),]$group_type)
  hg38.col <- gsub("\\.\\/", "", gsub("_correlation_final.rda","_hg38",f))
  colnames(tab) <- c("Gene_group_type", hg38.col)
  WNC.type <- full_join(WNC.type, tab)
}
colnames(correlation.status)[-1] <- paste0(colnames(correlation.status)[-1],"_negative")
colnames(SNC.type)[-1] <- paste0(colnames(SNC.type)[-1],"_negative")
colnames(WNC.type)[-1] <- paste0(colnames(WNC.type)[-1],"_negative")
WNC.type.negative <- WNC.type
SNC.type.negative <- SNC.type
correlation.status.negative <- correlation.status

2.1.5.1.1 Summary

DT::datatable(correlation.status,
              options = list(scrollX = TRUE),
              filter = 'top')

DT::datatable(SNC.type,
              options = list(scrollX = TRUE),
              filter = 'top')

2.2 Positive distances

This sections compared hg38 associations and h19 associations. The hg38 annotation was retrieved (same used in GDC) from http://zwdzwd.github.io/InfiniumAnnotation and only genes & probes with distance between 0 and +1500 (positive distance) are kept. The hg19 annotation was retrieved the ilumina manifest. We kept only TSS200 and TSS1500 associations.

2.2.1 Unique pair probe genes (with max distance TSS 1500 bp)

# Source: http://zwdzwd.github.io/InfiniumAnnotation
# Only keeping genes & probes with distance between 0 and 1500
associated.genes_id

associated.genes_id$distance <- NULL # ignore different distances
associated.genes_id <- unique(associated.genes_id)
associated.genes_id

# hg19: Illumina manifest. Max distance TSS 1500
unique(associated.genes_id.hg19[,c("probe", "ensembl_gene_id","group_type")])

hg19.pairs <- unique(associated.genes_id.hg19[,c("probe", "ensembl_gene_id")])
colnames(hg19.pairs)[1] <- "probe"
hg38.pairs <- unique(associated.genes_id[,c("probe","ensembl_gene_id")])
shared.pairs <- inner_join(as_tibble(hg19.pairs),as_tibble(hg38.pairs))
unique.hg19.pairs <- anti_join(as_tibble(hg19.pairs),as_tibble(hg38.pairs))
unique.hg38.pairs <- anti_join(as_tibble(hg38.pairs),as_tibble(hg19.pairs))
unique.hg19.genes <- setdiff(unique(hg19.pairs$ensembl_gene_id),unique(hg38.pairs$ensembl_gene_id))
unique.hg38.genes <- setdiff(unique(hg38.pairs$ensembl_gene_id),unique(hg19.pairs$ensembl_gene_id))
shared.genes <- intersect(unique(hg38.pairs$ensembl_gene_id),unique(hg19.pairs$ensembl_gene_id))

2.2.2 Comparing number of unique associate probes between hg38 and hg19

library(VennDiagram)
draw.pairwise.venn(length(unique(associated.genes_id$probe)), 
                   length(unique(associated.genes_id.hg19$probe)),
                   length(intersect(unique(associated.genes_id$probe),
                                    unique(associated.genes_id.hg19$probe))),
                   category = c("Probes hg38", "Probes hg19"), 
                   lty = rep("blank", 2), 
                   fill = c("light blue", "pink"), 
                   alpha = rep(0.5, 2), 
                   cat.pos = c(0, 0), 
                   cat.dist = rep(0.025, 2))

## (polygon[GRID.polygon.41], polygon[GRID.polygon.42], polygon[GRID.polygon.43], polygon[GRID.polygon.44], text[GRID.text.45], text[GRID.text.46], text[GRID.text.47], text[GRID.text.48], text[GRID.text.49])

2.2.3 Comparing number of unique associate genes between hg38 and hg19

draw.pairwise.venn(length(unique(associated.genes_id$ensembl_gene_id)), 
                   length(unique(associated.genes_id.hg19$ensembl_gene_id)),
                   length(intersect(unique(associated.genes_id$ensembl_gene_id),
                                    unique(associated.genes_id.hg19$ensembl_gene_id))),
                   category = c("Genes associated to probes hg38", "Genes associated to probes hg19"), 
                   lty = rep("blank", 2), 
                   fill = c("light blue", "pink"), 
                   alpha = rep(0.5, 2), 
                   cat.pos = c(0, 0), 
                   cat.dist = rep(0.025, 2))

## (polygon[GRID.polygon.50], polygon[GRID.polygon.51], polygon[GRID.polygon.52], polygon[GRID.polygon.53], text[GRID.text.54], text[GRID.text.55], lines[GRID.lines.56], text[GRID.text.57], text[GRID.text.58], text[GRID.text.59])

unique.hg19.genes <- setdiff(associated.genes_id.hg19$ensembl_gene_id,associated.genes_id$ensembl_gene_id)
unique.hg38.genes <- setdiff(associated.genes_id$ensembl_gene_id,associated.genes_id.hg19$ensembl_gene_id)

2.2.4 Comparing number of unique pairs (gene-probe) between hg38 and hg19

g1 <- unique(associated.genes_id[,c("probe","ensembl_gene_id")])
g2 <- unique(associated.genes_id.hg19[,c("probe","ensembl_gene_id")])
draw.pairwise.venn(nrow(g1), 
                   nrow(g2), 
                   sum(paste0(g1$probe,"_",g1$ensembl_gene_id) %in% paste0(g2$probe,"_",g2$ensembl_gene_id)),
                   category = c("Pairs associated to probes hg38", "Pairs associated to probes hg19"), 
                   lty = rep("blank", 2), 
                   fill = c("light blue", "pink"), 
                   alpha = rep(0.5, 2), 
                   cat.pos = c(0, 0), 
                   cat.dist = rep(0.025, 2))

## (polygon[GRID.polygon.60], polygon[GRID.polygon.61], polygon[GRID.polygon.62], polygon[GRID.polygon.63], text[GRID.text.64], text[GRID.text.65], text[GRID.text.66], text[GRID.text.67], text[GRID.text.68])

2.3 All distances (positive and negative)

This sections compared hg38 associations and h19 associations. The hg38 annotation was retrieved (same used in GDC) from http://zwdzwd.github.io/InfiniumAnnotation and only genes & probes with distance between -1500 and +1500 are kept. The hg19 annotation was retrieved the ilumina manifest. We kept only TSS200 and TSS1500 associations.

2.3.1 Unique pair probe genes (with max distance TSS 1500 bp)

# Source: http://zwdzwd.github.io/InfiniumAnnotation
# Only keeping genes & probes with distance < 1500
associated.genes_id

associated.genes_id$distance <- NULL # ignore different distances
associated.genes_id <- unique(associated.genes_id)
associated.genes_id

# hg19: Illumina manifest. Max distance TSS 1500
unique(associated.genes_id.hg19[,c("probe", "ensembl_gene_id","group_type")])

hg19.pairs <- unique(associated.genes_id.hg19[,c("probe", "ensembl_gene_id")])
colnames(hg19.pairs)[1] <- "probe"
hg38.pairs <- unique(associated.genes_id[,c("probe","ensembl_gene_id")])
shared.pairs <- inner_join(as_tibble(hg19.pairs),as_tibble(hg38.pairs))
unique.hg19.pairs <- anti_join(as_tibble(hg19.pairs),as_tibble(hg38.pairs))
unique.hg38.pairs <- anti_join(as_tibble(hg38.pairs),as_tibble(hg19.pairs))
unique.hg19.genes <- setdiff(unique(hg19.pairs$ensembl_gene_id),unique(hg38.pairs$ensembl_gene_id))
unique.hg38.genes <- setdiff(unique(hg38.pairs$ensembl_gene_id),unique(hg19.pairs$ensembl_gene_id))
shared.genes <- intersect(unique(hg38.pairs$ensembl_gene_id),unique(hg19.pairs$ensembl_gene_id))

2.3.2 Comparing number of unique associate probes between hg38 and hg19

library(VennDiagram)
draw.pairwise.venn(length(unique(associated.genes_id$probe)), 
                   length(unique(associated.genes_id.hg19$probe)),
                   length(intersect(unique(associated.genes_id$probe),
                                    unique(associated.genes_id.hg19$probe))),
                   category = c("Probes hg38", "Probes hg19"), 
                   lty = rep("blank", 2), 
                   fill = c("light blue", "pink"), 
                   alpha = rep(0.5, 2), 
                   cat.pos = c(0, 0), 
                   cat.dist = rep(0.025, 2))

## (polygon[GRID.polygon.69], polygon[GRID.polygon.70], polygon[GRID.polygon.71], polygon[GRID.polygon.72], text[GRID.text.73], text[GRID.text.74], lines[GRID.lines.75], text[GRID.text.76], text[GRID.text.77], text[GRID.text.78])

2.3.3 Comparing number of unique associate genes between hg38 and hg19

draw.pairwise.venn(length(unique(associated.genes_id$ensembl_gene_id)), 
                   length(unique(associated.genes_id.hg19$ensembl_gene_id)),
                   length(intersect(unique(associated.genes_id$ensembl_gene_id),
                                    unique(associated.genes_id.hg19$ensembl_gene_id))),
                   category = c("Genes associated to probes hg38", "Genes associated to probes hg19"), 
                   lty = rep("blank", 2), 
                   fill = c("light blue", "pink"), 
                   alpha = rep(0.5, 2), 
                   cat.pos = c(0, 0), 
                   cat.dist = rep(0.025, 2))

## (polygon[GRID.polygon.79], polygon[GRID.polygon.80], polygon[GRID.polygon.81], polygon[GRID.polygon.82], text[GRID.text.83], text[GRID.text.84], lines[GRID.lines.85], text[GRID.text.86], text[GRID.text.87], text[GRID.text.88])

unique.hg19.genes <- setdiff(associated.genes_id.hg19$ensembl_gene_id,associated.genes_id$ensembl_gene_id)
unique.hg38.genes <- setdiff(associated.genes_id$ensembl_gene_id,associated.genes_id.hg19$ensembl_gene_id)

2.3.4 Comparing number of unique pairs (gene-probe) between hg38 and hg19

g1 <- unique(associated.genes_id[,c("probe","ensembl_gene_id")])
g2 <- unique(associated.genes_id.hg19[,c("probe","ensembl_gene_id")])
draw.pairwise.venn(nrow(g1), 
                   nrow(g2), 
                   sum(paste0(g1$probe,"_",g1$ensembl_gene_id) %in% paste0(g2$probe,"_",g2$ensembl_gene_id)),
                   category = c("Pairs associated to probes hg38", "Pairs associated to probes hg19"), 
                   lty = rep("blank", 2), 
                   fill = c("light blue", "pink"), 
                   alpha = rep(0.5, 2), 
                   cat.pos = c(0, 0), 
                   cat.dist = rep(0.025, 2))

## (polygon[GRID.polygon.89], polygon[GRID.polygon.90], polygon[GRID.polygon.91], polygon[GRID.polygon.92], text[GRID.text.93], text[GRID.text.94], lines[GRID.lines.95], text[GRID.text.96], text[GRID.text.97], text[GRID.text.98])

2.3.5 Correlation status between probe and associated gene

2.3.5.1 Considering all pairs

load("~/paper_elmer/GDAN/associated.genes_id_hg19.rda")
load("~/paper_elmer/GDAN/associated.genes_id.rda")
colnames(associated.genes_id.hg19)[2] <- "probe"


associated.genes_id.negative <- associated.genes_id[as.numeric(as.character(associated.genes_id$distance)) < 0,]
associated.genes_id.negative <- unique(associated.genes_id.negative[,c("group_type","ensembl_gene_id")])
associated.genes_id <- unique(associated.genes_id[,c("group_type","ensembl_gene_id")])
gene.hg19 <- unique(associated.genes_id.hg19$ensembl_gene_id)
associated.genes_id.hg19 <- data.frame(ensembl_gene_id = gene.hg19, 
                                       group_type = associated.genes_id$group_type[match(gene.hg19,
                                                                                         associated.genes_id$ensembl_gene_id)]
)
tab.h19 <- plyr::count(associated.genes_id.hg19$group_type)
colnames(tab.h19) <- c("Gene_group_type", "hg19_negative")

tab.h38 <- plyr::count(associated.genes_id.negative$group_type)
colnames(tab.h38) <- c("Gene_group_type", "hg38_negative")
tab.h38.all <- plyr::count(associated.genes_id$group_type)
colnames(tab.h38.all) <- c("Gene_group_type", "hg38_all")


all.type <- NULL
all.type <- full_join(tab.h38, tab.h38.all)
all.type <- full_join(all.type, tab.h19)

library(plotly)

# Each correlation file has pairs of gene probes (max distance 1500bp)
# correlation values (spearman pvalue, rho and status)
# we read it and summarize
files <- dir(pattern = "correlation_final.rda",recursive = T,full.names = T)
correlation.status <- NULL
for(f in files){
  load(f)
  tab <- plyr::count(hg19_correlation$status)
  hg19.col <- gsub("\\.\\/","",gsub("_correlation_final.rda","_hg19",f))
  colnames(tab) <- c("Status",hg19.col)
  if(is.null(correlation.status)) {
    correlation.status <- tab
  } else {
    correlation.status <- full_join(correlation.status, tab)
  }
  tab <- plyr::count(hg38_correlation$status)
  hg38.col <- gsub("\\.\\/","",gsub("_correlation_final.rda","_hg38",f))
  colnames(tab) <- c("Status",hg38.col)
  correlation.status <- full_join(correlation.status, tab)
}

# Each correlation file has pairs of gene probes (max distance 1500bp)
# correlation values (spearman pvalue, rho and status)
# we read it and select only Strongly negatively correlated (SNC)
SNC.type <- NULL
phantom <- NULL

files <- dir(pattern = "correlation_final.rda", recursive = T, full.names = T)
for(f in files){
  message(f)
  load(f)
  if(!any(hg19_correlation$status == "Strongly negatively correlated (SNC)")) next
  hg19 <- hg19_correlation[hg19_correlation$status == "Strongly negatively correlated (SNC)",]
  paired.genes <- unique(hg19$ensembl_gene_id)
  tab <- plyr::count(hg38_correlation[match(paired.genes, hg38_correlation$ensembl_gene_id),]$group_type)
  phantom.hg19 <- c(sum(table(hg19[!duplicated(hg19$probe),]$Phantom)),sum(is.na(hg19[!duplicated(hg19$probe),]$Phantom)))
  hg19.col <- gsub("\\.\\/","", gsub("_correlation_final.rda","_hg19",f))
  colnames(tab) <- c("Gene_group_type", hg19.col)
  if(is.null(SNC.type)) {
    SNC.type <- tab
  } else {
    SNC.type <- full_join(SNC.type, tab)
  }
  aux <- hg38_correlation[hg38_correlation$status == "Strongly negatively correlated (SNC)",]
  tab <- plyr::count(aux[!duplicated(aux$ensembl_gene_id),]$group_type)
  hg38.col <- gsub("\\.\\/", "", gsub("_correlation_final.rda","_hg38",f))
  colnames(tab) <- c("Gene_group_type", hg38.col)
  SNC.type <- full_join(SNC.type, tab)
}

# Each correlation file has pairs of gene probes (max distance 1500bp)
# correlation values (spearman pvalue, rho and status)
# we read it and select only Weakly negatively correlated (WNC)
WNC.type <- NULL
files <- dir(pattern = "correlation_final.rda", recursive = T, full.names = T)
for(f in files){
  load(f)
  if(!any(hg19_correlation$status == "Weakly negatively correlated (WNC)")) next
  paired.genes <- unique(hg19_correlation[hg19_correlation$status == "Weakly negatively correlated (WNC)",]$ensembl_gene_id)
  tab <- plyr::count(hg38_correlation[match(paired.genes, hg38_correlation$ensembl_gene_id),]$group_type)
  hg19.col <- gsub("\\.\\/","", gsub("_correlation_final.rda","_hg19",f))
  colnames(tab) <- c("Gene_group_type", hg19.col)
  if(is.null(WNC.type)) {
    WNC.type <- tab
  } else {
    WNC.type <- full_join(WNC.type, tab)
  }
  aux <- hg38_correlation[hg38_correlation$status == "Weakly negatively correlated (WNC)",]
  tab <- plyr::count(aux[!duplicated(aux$ensembl_gene_id),]$group_type)
  hg38.col <- gsub("\\.\\/", "", gsub("_correlation_final.rda","_hg38",f))
  colnames(tab) <- c("Gene_group_type", hg38.col)
  WNC.type <- full_join(WNC.type, tab)
}

colnames(correlation.status)[-1]  <- paste0(colnames(correlation.status)[-1] ,"_all")
colnames(SNC.type)[-1] <- paste0(colnames(SNC.type)[-1] ,"_all")
colnames(WNC.type)[-1] <- paste0(colnames(WNC.type)[-1] ,"_all")
WNC.type.all <- WNC.type
SNC.type.all <- SNC.type
correlation.status.all <- correlation.status

WNC.type <- full_join(WNC.type.negative, WNC.type.all)
SNC.type <- full_join(SNC.type.negative, SNC.type.all)
correlation.status <- full_join(correlation.status.negative,correlation.status.all)
save(WNC.type,SNC.type,correlation.status,file = "plot.rda")

load("plot.rda")

2.3.5.1.1 Summary

DT::datatable(correlation.status,
              options = list(scrollX = TRUE),
              filter = 'top')

DT::datatable(SNC.type,
              options = list(scrollX = TRUE),
              filter = 'top')

3 Plots

This section will evaluate the transcript type for all associations and for each tumor type their status and fthe most significant anti-correlated associations (SNC and WNC) the transcript type for all associated genes.

3.1 All linked gene group types

3.1.1 All gene types

3.1.2 All gene types - no protein coding

3.2 ACC

3.2.1 All pairs status

3.2.2 All probes - gene types

3.2.2.1 No protein_coding gene type

3.2.3 Unique Strongly negatively correlated (SNC) gene types

3.2.3.1 All gene types

3.2.3.2 No protein_coding gene type

3.2.4 Unique Weakly negatively correlated (WNC) gene types

3.2.4.1 All gene types

3.2.4.2 No protein_coding gene type

3.3 BRCA

3.3.1 All pairs status

3.3.2 All probes - gene types

3.3.2.1 No protein_coding gene type

3.3.3 Unique Strongly negatively correlated (SNC) gene types

3.3.3.1 All gene types

3.3.3.2 No protein_coding gene type

3.3.4 Unique Weakly negatively correlated (WNC) gene types

3.3.4.1 All gene types

3.3.4.2 No protein_coding gene type

3.4 CESC

3.4.1 All pairs status

3.4.2 All probes - gene types

3.4.2.1 No protein_coding gene type

3.4.3 Unique Strongly negatively correlated (SNC) gene types

3.4.3.1 All gene types

3.4.3.2 No protein_coding gene type

3.4.4 Unique Weakly negatively correlated (WNC) gene types

3.4.4.1 All gene types

3.4.4.2 No protein_coding gene type

3.5 CHOL

3.5.1 All pairs status

3.5.2 All probes - gene types

3.5.2.1 No protein_coding gene type

3.5.3 Unique Strongly negatively correlated (SNC) gene types

3.5.3.1 All gene types

3.5.3.2 No protein_coding gene type

3.5.4 Unique Weakly negatively correlated (WNC) gene types

3.5.4.1 All gene types

3.5.4.2 No protein_coding gene type

3.6 DLBC

3.6.1 All pairs status

3.6.2 All probes - gene types

3.6.2.1 No protein_coding gene type

3.6.3 Unique Strongly negatively correlated (SNC) gene types

3.6.3.1 All gene types

3.6.3.2 No protein_coding gene type

3.6.4 Unique Weakly negatively correlated (WNC) gene types

3.6.4.1 All gene types

3.6.4.2 No protein_coding gene type

3.7 ESCA

3.7.1 All pairs status

3.7.2 All probes - gene types

3.7.2.1 No protein_coding gene type

3.7.3 Unique Strongly negatively correlated (SNC) gene types

3.7.3.1 All gene types

3.7.3.2 No protein_coding gene type

3.7.4 Unique Weakly negatively correlated (WNC) gene types

3.7.4.1 All gene types

3.7.4.2 No protein_coding gene type

3.8 GBM

3.8.1 All pairs status

3.8.2 All probes - gene types

3.8.2.1 No protein_coding gene type

3.8.3 Unique Strongly negatively correlated (SNC) gene types

3.8.3.1 All gene types

3.8.3.2 No protein_coding gene type

3.8.4 Unique Weakly negatively correlated (WNC) gene types

3.8.4.1 All gene types

3.8.4.2 No protein_coding gene type

3.9 HNSC

3.9.1 All pairs status

3.9.2 All probes - gene types

3.9.2.1 No protein_coding gene type

3.9.3 Unique Strongly negatively correlated (SNC) gene types

3.9.3.1 All gene types

3.9.3.2 No protein_coding gene type

3.9.4 Unique Weakly negatively correlated (WNC) gene types

3.9.4.1 All gene types

3.9.4.2 No protein_coding gene type

3.10 KICH

3.10.1 All pairs status

3.10.2 All probes - gene types

3.10.2.1 No protein_coding gene type

3.10.3 Unique Strongly negatively correlated (SNC) gene types

3.10.3.1 All gene types

3.10.3.2 No protein_coding gene type

3.10.4 Unique Weakly negatively correlated (WNC) gene types

3.10.4.1 All gene types

3.10.4.2 No protein_coding gene type

3.11 KIRC

3.11.1 All pairs status

3.11.2 All probes - gene types

3.11.2.1 No protein_coding gene type

3.11.3 Unique Strongly negatively correlated (SNC) gene types

3.11.3.1 All gene types

3.11.3.2 No protein_coding gene type

3.11.4 Unique Weakly negatively correlated (WNC) gene types

3.11.4.1 All gene types

3.11.4.2 No protein_coding gene type

3.12 LAML

3.12.1 All pairs status

3.12.2 All probes - gene types

3.12.2.1 No protein_coding gene type

3.12.3 Unique Strongly negatively correlated (SNC) gene types

3.12.3.1 All gene types

3.12.3.2 No protein_coding gene type

3.12.4 Unique Weakly negatively correlated (WNC) gene types

3.12.4.1 All gene types

3.12.4.2 No protein_coding gene type

3.13 LGG

3.13.1 All pairs status

3.13.2 All probes - gene types

3.13.2.1 No protein_coding gene type

3.13.3 Unique Strongly negatively correlated (SNC) gene types

3.13.3.1 All gene types

3.13.3.2 No protein_coding gene type

3.13.4 Unique Weakly negatively correlated (WNC) gene types

3.13.4.1 All gene types

3.13.4.2 No protein_coding gene type

3.14 LUAD

3.14.1 All pairs status

3.14.2 All probes - gene types

3.14.2.1 No protein_coding gene type

3.14.3 Unique Strongly negatively correlated (SNC) gene types

3.14.3.1 All gene types

3.14.3.2 No protein_coding gene type

3.14.4 Unique Weakly negatively correlated (WNC) gene types

3.14.4.1 All gene types

3.14.4.2 No protein_coding gene type

3.15 LUSC

3.15.1 All pairs status

3.15.2 All probes - gene types

3.15.2.1 No protein_coding gene type

3.15.3 Unique Strongly negatively correlated (SNC) gene types

3.15.3.1 All gene types

3.15.3.2 No protein_coding gene type

3.15.4 Unique Weakly negatively correlated (WNC) gene types

3.15.4.1 All gene types

3.15.4.2 No protein_coding gene type

3.16 MESO

3.16.1 All pairs status

3.16.2 All probes - gene types

3.16.2.1 No protein_coding gene type

3.16.3 Unique Strongly negatively correlated (SNC) gene types

3.16.3.1 All gene types

3.16.3.2 No protein_coding gene type

3.16.4 Unique Weakly negatively correlated (WNC) gene types

3.16.4.1 All gene types

3.16.4.2 No protein_coding gene type

3.17 PCPG

3.17.1 All pairs status

3.17.2 All probes - gene types

3.17.2.1 No protein_coding gene type

3.17.3 Unique Strongly negatively correlated (SNC) gene types

3.17.3.1 All gene types

3.17.3.2 No protein_coding gene type

3.17.4 Unique Weakly negatively correlated (WNC) gene types

3.17.4.1 All gene types

3.17.4.2 No protein_coding gene type

3.18 READ

3.18.1 All pairs status

3.18.2 All probes - gene types

3.18.2.1 No protein_coding gene type

3.18.3 Unique Strongly negatively correlated (SNC) gene types

3.18.3.1 All gene types

3.18.3.2 No protein_coding gene type

3.18.4 Unique Weakly negatively correlated (WNC) gene types

3.18.4.1 All gene types

3.18.4.2 No protein_coding gene type

3.19 SARC

3.19.1 All pairs status

3.19.2 All probes - gene types

3.19.2.1 No protein_coding gene type

3.19.3 Unique Strongly negatively correlated (SNC) gene types

3.19.3.1 All gene types

3.19.3.2 No protein_coding gene type

3.19.4 Unique Weakly negatively correlated (WNC) gene types

3.19.4.1 All gene types

3.19.4.2 No protein_coding gene type

3.20 SKCM

3.20.1 All pairs status

3.20.2 All probes - gene types

3.20.2.1 No protein_coding gene type

3.20.3 Unique Strongly negatively correlated (SNC) gene types

3.20.3.1 All gene types

3.20.3.2 No protein_coding gene type

3.20.4 Unique Weakly negatively correlated (WNC) gene types

3.20.4.1 All gene types

3.20.4.2 No protein_coding gene type

3.21 STAD

3.21.1 All pairs status

3.21.2 All probes - gene types

3.21.2.1 No protein_coding gene type

3.21.3 Unique Strongly negatively correlated (SNC) gene types

3.21.3.1 All gene types

3.21.3.2 No protein_coding gene type

3.21.4 Unique Weakly negatively correlated (WNC) gene types

3.21.4.1 All gene types

3.21.4.2 No protein_coding gene type

3.22 TGCT

3.22.1 All pairs status

3.22.2 All probes - gene types

3.22.2.1 No protein_coding gene type

3.22.3 Unique Strongly negatively correlated (SNC) gene types

3.22.3.1 All gene types

3.22.3.2 No protein_coding gene type

3.22.4 Unique Weakly negatively correlated (WNC) gene types

3.22.4.1 All gene types

3.22.4.2 No protein_coding gene type

3.23 THCA

3.23.1 All pairs status

3.23.2 All probes - gene types

3.23.2.1 No protein_coding gene type

3.23.3 Unique Strongly negatively correlated (SNC) gene types

3.23.3.1 All gene types

3.23.3.2 No protein_coding gene type

3.23.4 Unique Weakly negatively correlated (WNC) gene types

3.23.4.1 All gene types

3.23.4.2 No protein_coding gene type

3.24 THYM

3.24.1 All pairs status

3.24.2 All probes - gene types

3.24.2.1 No protein_coding gene type

3.24.3 Unique Strongly negatively correlated (SNC) gene types

3.24.3.1 All gene types

3.24.3.2 No protein_coding gene type

3.24.4 Unique Weakly negatively correlated (WNC) gene types

3.24.4.1 All gene types

3.24.4.2 No protein_coding gene type

3.25 UCEC

3.25.1 All pairs status

3.25.2 All probes - gene types

3.25.2.1 No protein_coding gene type

3.25.3 Unique Strongly negatively correlated (SNC) gene types

3.25.3.1 All gene types

3.25.3.2 No protein_coding gene type

3.25.4 Unique Weakly negatively correlated (WNC) gene types

3.25.4.1 All gene types

3.25.4.2 No protein_coding gene type

3.26 UCS

3.26.1 All pairs status

3.26.2 All probes - gene types

3.26.2.1 No protein_coding gene type

3.26.3 Unique Strongly negatively correlated (SNC) gene types

3.26.3.1 All gene types

3.26.3.2 No protein_coding gene type

3.26.4 Unique Weakly negatively correlated (WNC) gene types

3.26.4.1 All gene types

3.26.4.2 No protein_coding gene type

4 Code

4.1 hg38 analysis

library(doParallel)
library(parallel)
library(dplyr)
library(ELMER)
library(SummarizedExperiment)
library(TCGAbiolinks)
library(readr)
cores <- 2
registerDoParallel(cores)
files <- dir(pattern = "correlation_final.csv",recursive = T,full.names = T)
tumors <- unique(basename(dirname(dirname(files))))

file <- "associated.genes_id.rda"
associated.genes_id <- NULL
if(file.exists(file)) {
  associated.genes_id <- unique(get(load(file)))
  associated.genes_id$distance <- NULL
  associated.genes_id <- unique(associated.genes_id)
}
for(tumor in tumors){
  print(tumor)
  if(file.exists(paste0(paste0(tumor,"/hg38/"),tumor,"_hg38_correlation.csv"))) next
  dir.create(paste0(tumor,"/hg38/"),showWarnings = FALSE,recursive = T)
  met <- get(load(paste0("~/paper_elmer/Data/",tumor,"/",tumor,"_meth_hg38_no_filter.rda")))
  rna <- get(load(paste0("~/paper_elmer/Data/",tumor,"/",tumor,"_RNA_hg38_no_filter.rda")))
  gene.metadata <- values(rna)[,1:2]
  met <- met[rowRanges(met)$Gene_Symbol != ".",]
  
  if(is.null(associated.genes_id)){
    associated.genes_id <- plyr::adply(.data = values(met),
                                       .margins = 1, 
                                       .fun = function(x){
                                         genes <- x["Gene_Symbol"] %>% stringr::str_split(";") %>% unlist
                                         groups <- x["Gene_Type"] %>% stringr::str_split(";") %>% unlist
                                         distance <- x["Position_to_TSS"] %>% stringr::str_split(";") %>% unlist
                                         idx <- which(abs(as.numeric(distance)) <= 1500)
                                         data.frame(gene = genes[idx],
                                                    group_type = groups[idx],
                                                    distance = distance[idx],
                                                    probe = rep(x$Composite.Element.REF,
                                                                length(idx)))
                                       },.id = NULL,.expand = T,.progress = T,.parallel = T)
    save(associated.genes_id, file = "associated.genes_id_not_merged_max_dist_1500.rda")
    
    load("Human_genes__GRCh38_p12__tss.rda")
    idx <- !associated.genes_id$gene %in% tss$external_gene_name
    associated.genes_id$gene[idx] <- as.character(HGNChelper::checkGeneSymbols(associated.genes_id$gene[idx])$Suggested.Symbol)
    
    associated.genes_id <- unique(
      merge(associated.genes_id, 
            unique(tss[,c("external_gene_name","ensembl_gene_id")]), 
            by.x = "gene", 
            by.y = "external_gene_name")
    )
    save(associated.genes_id,file = file)
  }
  
  # This will keep samples with the both DNA methylation and gene expression
  # Take the log2(exp + 1)
  # Put samples in the right order
  # Keep only probes in 2KB close to TSS
  # Remove probes that have NA for more than 50% of samples
  mae.file <- paste0(paste0(tumor),"/",tumor,"_mae_hg38.rda")
  if(file.exists(mae.file)) {
    mae <- get(load(mae.file))
  } else {
    mae <- createMAE(exp = rna,
                     met = met[rownames(met) %in% associated.genes_id$probe,],
                     TCGA = TRUE,
                     linearize.exp = T,
                     save = T,
                     save.filename = paste0(tumor,"/",tumor,"_mae_hg38.rda"),
                     met.platform = "450K",
                     genome = "hg38",
                     met.na.cut = 0.5)
  }
  correlations <- plyr::adply(unique(associated.genes_id[,3:4]),
                              .margins = 1,
                              .fun =  function(x){
                                GeneID <- as.character(x$ensembl_gene_id)
                                probe <- as.character(x$probe)
                                if(!probe %in% names(getMet(mae))) {
                                  return(tibble::tibble(rho = NA, 
                                                        pval = NA, 
                                                        probe,
                                                        ensembl_gene_id =  GeneID))
                                }
                                cor <- cor.test(x = as.numeric(assay(getMet(mae)[probe])), 
                                                y = as.numeric(assay(getExp(mae)[GeneID])),
                                                method = c("spearman"))
                                corval <- as.numeric(cor$estimate)
                                pvalue <- as.numeric(cor$p.value)
                                
                                df <- tibble::tibble(rho = corval, 
                                                     pval = pvalue, 
                                                     probe, 
                                                     ensembl_gene_id = GeneID)
                                return(df)
                              },.id = NULL,.expand = T,.progress = "text",.parallel = F)
  #correlations <- correlations[!is.na(correlations$rho),] # Remove associations not found
  correlations$rho <- as.numeric(as.character(correlations$rho))
  correlations$pval <- as.numeric(as.character(correlations$pval))
  correlations$FDR <- p.adjust(as.numeric(as.character(correlations$pval)),method = "BH")
  
  # If FDR > 0.05 - Insignificant
  # If FDR <= 0.05:
  # - Strongly negatively correlated (SNC) when the rho value was less than -0.5;
  # - Weakly negatively correlated (WNC) when the rho value was between -0.5 and -0.25;
  # - No negative correlation (NNC) when the rho value was greater than -0.25.
  
  correlations$status <- "Insignificant"
  correlations$status[correlations$FDR <= 0.05 & as.numeric(correlations$rho) >= -0.25] <- "No negative correlation"
  correlations$status[correlations$FDR <= 0.05 & correlations$rho < -0.25 & correlations$rho > -0.5] <- "Weakly negatively correlated (WNC)"
  correlations$status[correlations$FDR <= 0.05 & correlations$rho <= -0.5] <- "Strongly negatively correlated (SNC)"
  correlations <- correlations[with(correlations,order(rho,pval)),]
  
  # save
  readr::write_csv(as.data.frame(correlations),
                   path = paste0(paste0(tumor,"/hg38/"),"/",tumor,"_hg38_correlation.csv"))
  
  aux <- left_join(as_tibble(correlations),
                   unique(as_tibble(associated.genes_id)))
  readr::write_csv(aux,
                   path = paste0(paste0(tumor,"/hg38/"),"/",tumor,"_hg38_correlation_all_info.csv"))
  
}

4.2 hg19 analysis

library(doParallel)
library(parallel)
library(dplyr)
library(ELMER)
library(SummarizedExperiment)
library(TCGAbiolinks)


# Illumina manifest
# Source: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GPL13534
metadata.hg19 <- readr::read_tsv(file = "~/paper_elmer/GDAN/450k-manifest-GPL13534-11288.txt",skip = 37)
metadata.hg19 <- metadata.hg19[!is.na(metadata.hg19$UCSC_RefGene_Name),]
metadata.hg19 <- metadata.hg19[grep("TSS1500|TSS200",metadata.hg19$UCSC_RefGene_Group),]

file <- "associated.genes_id_hg19.rda"
associated.genes_id.hg19 <- NULL
if(file.exists(file)) {
  associated.genes_id.hg19 <- get(load(file))
  colnames(associated.genes_id.hg19)[3] <- "probe"
}


if(is.null(associated.genes_id.hg19)){
  associated.genes_id.hg19 <- plyr::adply(.data = metadata.hg19,
                                          .margins = 1, 
                                          .fun = function(x){
                                            genes <- x["UCSC_RefGene_Name"] %>% stringr::str_split(";") %>% unlist
                                            groups <- x["UCSC_RefGene_Group"] %>% stringr::str_split(";") %>% unlist
                                            idx <- grep("TSS",groups)
                                            data.frame(gene = genes[idx],group_type = groups[idx])
                                          },.id = NULL,.expand = T,.progress = T,.parallel = TRUE)
  
  # get all LOC gene names and update them there are not in ENSEMBL database with LOC 
  symbols <- gsub("LOC","",unique(grep("^LOC",associated.genes_id.hg19$gene,value = T)))
  mapping <- select(org.Hs.eg.db, symbols, c("ENTREZID","GENENAME","ENSEMBL","SYMBOL"))
  
  # Replace LOC by gene names. LOC should be removed as this gene names are LOC + entrez ID
  aux <- associated.genes_id.hg19
  idx <- grep("^LOC",aux$gene)
  aux$gene[idx] <- as.character(mapping$SYMBOL[match(gsub("LOC","",aux$gene[idx]),mapping$ENTREZID)])
  idx <- !aux$gene %in% tss$external_gene_name
  aux$gene[idx] <- HGNChelper::checkGeneSymbols(aux$gene[idx])$Suggested.Symbol
  associated.genes_id.hg19 <- aux
  
  associated.genes_id.hg19 <- merge(associated.genes_id.hg19, 
                                    values(rna)[,1:2], 
                                    by.x = "gene", 
                                    by.y = "external_gene_name")
  
  save(associated.genes_id.hg19,file = file)
}

library(doParallel)
library(parallel)
library(dplyr)
library(ELMER)
cores <- 2
registerDoParallel(cores)


files <- dir(pattern = "correlation_final.csv",recursive = T,full.names = T)
tumors <- unique(basename(dirname(dirname(files))))
for(tumor in tumors){
  print(tumor)
  if(file.exists(paste0(paste0(tumor,"/hg19/"),"/",tumor,"_hg19_correlation.csv"))) next
  dir.create(paste0(tumor,"/hg19/"),showWarnings = FALSE,recursive = TRUE)
  met <- get(load(paste0("~/paper_elmer/Data/",tumor,"/",tumor,"_meth_hg38_no_filter.rda")))
  met <- met[unique(associated.genes_id.hg19$ID),]
  rna <- get(load(paste0("~/paper_elmer/Data/",tumor,"/",tumor,"_RNA_hg38_no_filter.rda")))
  
  # This will keep samples with the both DNA methylation and gene expression
  # Take the log2(exp + 1)
  # Put samples in the right order
  # Keep only probes in 2KB close to TSS
  # Remove probes that have NA for more than 50% of samples
  dir.create(paste0(paste0(tumor,"/hg19/")),showWarnings = F,recursive = T)
  mae.file <- paste0(paste0(tumor,"/hg19/"),"/",tumor,"_mae_hg19.rda")
  if(file.exists(mae.file)) {
    mae <- get(load(mae.file))
  } else {
    mae <- createMAE(exp = rna,
                     met = met,
                     TCGA = TRUE,
                     linearize.exp = T,
                     save = T,
                     save.filename = mae.file,
                     met.platform = "450K",
                     genome = "hg19",
                     met.na.cut = 0.5)
    save(mae, file = mae.file)
    
  }
  correlations <- plyr::adply(associated.genes_id.hg19,
                              .margins = 1,
                              .fun =  function(x){
                                GeneID <- as.character(x$ensembl_gene_id)
                                probe <- as.character(x$probe)
                                if(!probe %in% names(getMet(mae))) {
                                  return(tibble::tibble(rho = NA, pval = NA, ensembl_gene_id = GeneID, probe))
                                }
                                cor <- cor.test(x = as.numeric(assay(getMet(mae)[probe])), 
                                                y = as.numeric(assay(getExp(mae)[GeneID])),
                                                method = c("spearman"))
                                corval <- as.numeric(cor$estimate)
                                pvalue <- as.numeric(cor$p.value)
                                tibble::tibble(rho = corval,pval = pvalue, ensembl_gene_id = GeneID, probe)
                              },.id = NULL,
                              .expand = T,
                              .progress = "text",
                              .parallel = T)
  #correlations <- correlations[!is.na(correlations$rho),] # Remove associations not found
  correlations$rho <- as.numeric(as.character(correlations$rho))
  correlations$pval <- as.numeric(as.character(correlations$pval))
  correlations$FDR <- p.adjust(as.numeric(as.character(correlations$pval)),method = "BH")
  
  # If FDR > 0.05 - Insignificant
  # If FDR <= 0.05:
  # - Strongly negatively correlated (SNC) when the rho value was less than -0.5;
  # - Weakly negatively correlated (WNC) when the rho value was between -0.5 and -0.25;
  # - No negative correlation (NNC) when the rho value was greater than -0.25.
  correlations <- correlations[!is.na( as.numeric(correlations$rho)),]
  correlations$status <- "Insignificant"
  correlations[correlations$FDR <= 0.05 & as.numeric(correlations$rho) >= -0.25,]$status <- "No negative correlation"
  correlations[correlations$FDR <= 0.05 & correlations$rho < -0.25 & correlations$rho > -0.5,]$status <- "Weakly negatively correlated (WNC)"
  correlations[correlations$FDR <= 0.05 & correlations$rho <= -0.5,]$status <- "Strongly negatively correlated (SNC)"
  correlations <- correlations[with(correlations,order(rho,pval)),]
  
  # save
  readr::write_csv(unique(as.data.frame(correlations)),
                   path = paste0(paste0(tumor,"/hg19/"),"/",tumor,"_hg19_correlation.csv"))
}

library(readr)
library(dplyr)
#load("~/paper_elmer/GDAN/associated.genes_id_hg19.rda")
load("~/paper_elmer/GDAN/associated.genes_id_hg19.rda")
load("~/paper_elmer/GDAN/associated.genes_id.rda")
colnames(associated.genes_id.hg19)[2] <- "probe"
associated.genes_id$distance <- NULL
associated.genes_id <- unique(associated.genes_id)

files <- dir(pattern = "correlation.csv",recursive = T,full.names = T)
tumors <- basename(dirname(dirname(files)))
tumors <- "BRCA"
snc.in.hg38.not.in.hg19 <- NULL
for(tumor in tumors){
  print(tumor)
  f <- sort(grep(tumor, files,value = T))
  hg19_correlation <- read_csv(f[1])
  colnames(hg19_correlation)[grep("cg",hg19_correlation[1,])] <- "probe"
  hg38_correlation <- read_csv(f[2])
  colnames(hg38_correlation)[grep("ENSG",hg38_correlation[1,])] <- "ensembl_gene_id"
  hg19_correlation <- left_join(as_tibble(hg19_correlation),unique(as_tibble(associated.genes_id.hg19)))
  hg38_correlation <- left_join(as_tibble(hg38_correlation),unique(as_tibble(associated.genes_id)))
  save(hg19_correlation,hg38_correlation, file = paste0(tumor,"_correlation_final.rda"))
  
  if(any(hg19_correlation$status == "Strongly negatively correlated (SNC)")){
    y <- hg19_correlation[hg19_correlation$status == "Strongly negatively correlated (SNC)",]
    z <- hg38_correlation[hg38_correlation$status == "Strongly negatively correlated (SNC)",]
    z <- unique(z[!z$ensembl_gene_id %in% y$ensembl_gene_id,c(8:9)])
    z$tumor <- tumor
    snc.in.hg38.not.in.hg19 <- rbind(snc.in.hg38.not.in.hg19,z)
  }
}
readr::write_csv(unique(as.data.frame(snc.in.hg38.not.in.hg19)),path = "snc_in_hg38_not_in_hg19.csv")

# Evaluating  only negative
library(readr)
#load("~/paper_elmer/GDAN/associated.genes_id_hg19.rda")
load("~/paper_elmer/GDAN/associated.genes_id_hg19.rda")
load("~/paper_elmer/GDAN/associated.genes_id.rda")
colnames(associated.genes_id.hg19)[2] <- "probe"
associated.genes_id <- associated.genes_id[as.numeric(as.character(associated.genes_id$distance)) < 0,]
associated.genes_id$distance <- NULL
associated.genes_id <- unique(associated.genes_id)

files <- dir(pattern = "correlation.csv",recursive = T,full.names = T)
tumors <- basename(dirname(dirname(files)))
tumors <- "BRCA"
snc.in.hg38.not.in.hg19 <- NULL
wnc.in.hg38.not.in.hg19 <- NULL
wnc.in.hg38.not.in.hg19.all <- NULL
snc.in.hg38.not.in.hg19.all <- NULL
for(tumor in tumors){
  print(tumor)
  f <- sort(grep(tumor, files,value = T))
  hg19_correlation <- read_csv(f[1])
  colnames(hg19_correlation)[grep("cg",hg19_correlation[1,])] <- "probe"
  hg38_correlation <- read_csv(f[2])
  colnames(hg38_correlation)[grep("ENSG",hg38_correlation[1,])] <- "ensembl_gene_id"
  hg19_correlation <- left_join(as_tibble(hg19_correlation),unique(as_tibble(associated.genes_id.hg19)))
  hg38_correlation <- left_join(as_tibble(hg38_correlation),unique(as_tibble(associated.genes_id)))
  save(hg19_correlation,hg38_correlation, file = paste0(tumor,"_correlation_only_negative.rda"))
  
  if(any(hg19_correlation$status == "Strongly negatively correlated (SNC)")){
    y <- hg19_correlation[hg19_correlation$status ==  "Strongly negatively correlated (SNC)",]
    z <- hg38_correlation[hg38_correlation$status ==  "Strongly negatively correlated (SNC)",]
    z$tumor <- tumor
    aux <- unique(z[!z$ensembl_gene_id %in% y$ensembl_gene_id,c(8:9)])
    snc.in.hg38.not.in.hg19 <- rbind(snc.in.hg38.not.in.hg19,aux)
    aux <- unique(z[!z$ensembl_gene_id %in% y$ensembl_gene_id,])
    snc.in.hg38.not.in.hg19.all <- rbind(snc.in.hg38.not.in.hg19.all,aux)
  }
  if(any(hg19_correlation$status == "Weakly negatively correlated (WNC)")){
    y <- hg19_correlation[hg19_correlation$status == "Weakly negatively correlated (WNC)",]
    z <- hg38_correlation[hg38_correlation$status == "Weakly negatively correlated (WNC)",]
    z$tumor <- tumor
    aux <- unique(z[!z$ensembl_gene_id %in% y$ensembl_gene_id,c(8:9)])
    wnc.in.hg38.not.in.hg19 <- rbind(wnc.in.hg38.not.in.hg19,aux)
    aux <- unique(z[!z$ensembl_gene_id %in% y$ensembl_gene_id,])
    wnc.in.hg38.not.in.hg19.all <- rbind(wnc.in.hg38.not.in.hg19.all,aux)
  }
}
readr::write_csv(unique(as.data.frame(snc.in.hg38.not.in.hg19.all)),
                 path = "snc_in_hg38_negative_1500bp_not_in_hg19_negative_1500bp_all_info.csv")
readr::write_csv(unique(as.data.frame(wnc.in.hg38.not.in.hg19.all)),
                 path = "wnc_in_hg38_negative_1500bp_not_in_hg19_negative_1500bp_all_info.csv")

readr::write_csv(unique(as.data.frame(snc.in.hg38.not.in.hg19)),
                 path = "snc_in_hg38_negative_1500bp_not_in_hg19_negative_1500bp.csv")
readr::write_csv(unique(as.data.frame(wnc.in.hg38.not.in.hg19)),
                 path = "wnc_in_hg38_negative_1500bp_not_in_hg19_negative_1500bp.csv")


# Plots
files <- dir(pattern = "correlation.csv",recursive = T,full.names = T)
for(tumor in tumors){
  mae.file <- grep(tumor,dir(pattern = "mae_hg38",recursive = T,full.names = T),value = TRUE)
  load(mae.file)
  f <- sort(grep(tumor, files,value = T))
  hg38_correlation <- read_csv(f[2])
  colnames(hg38_correlation)[grep("ENSG",hg38_correlation[1,])] <- "ensembl_gene_id"
  for(i in 1:100){
    library(ELMER)
    dir.create(paste0(tumor,"/hg38/plots/"),recursive = T)
    scatter.plot(data = mae,
                 dir.out = paste0(tumor,"/hg38/plots/"),
                 save = TRUE,
                 byPair = list(probe = as.character(hg38_correlation$probe[i]),
                               gene = as.character(hg38_correlation$ensembl_gene_id[i])), 
                 category = "definition")
  }
}

files <- dir(pattern = "correlation.csv",recursive = T,full.names = T)
for(tumor in tumors){
  f <- sort(grep(tumor, files,value = T))
  hg19_correlation <- read_csv(f[1])
  colnames(hg19_correlation)[grep("cg",hg19_correlation[1,])] <- "probe"
  mae.file <- grep(tumor,dir(pattern = "mae_hg19",recursive = T,full.names = T),value = TRUE)
  load(mae.file)
  for(i in 1:100){
    library(ELMER)
    dir.create(paste0(tumor,"/hg19/plots/"),recursive = T)
    scatter.plot(data = mae,
                 dir.out = paste0(tumor,"/hg19/plots/"),
                 save = TRUE,
                 byPair = list(probe = as.character(hg19_correlation$probe[i]),
                               gene = as.character(hg19_correlation$ensembl_gene_id[i])), 
                 category = "definition")
  }
}

GDAN - DNA methylation legacy vs harmonized GDC databases