This report contains a correlation analysis between DNA methylation levels of probes and their annotated gene nearest genes. The main purpose is to compare the differences in the DNA methylation metadata in GDC legacy database vs GDC harmonized data.
A pair of probe and genes are consider:
As the purpose is to compare DNA methylation annotation the correlation of both analysis used the same gene expression data (GDC harmonized - FPKM-UQ).
Source: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GPL13534
# Source: http://zwdzwd.github.io/InfiniumAnnotation
# Only keeping genes & probes with distance between 0 and -1500
associated.genes_id
## DataFrame with 1057281 rows and 5 columns
## gene group_type distance probe ensembl_gene_id
## <factor> <factor> <factor> <factor> <character>
## 1 5S_rRNA rRNA -1473 cg02415963 ENSG00000276442
## 2 5S_rRNA rRNA -1473 cg02415963 ENSG00000274759
## 3 5S_rRNA rRNA -1473 cg02415963 ENSG00000275305
## 4 5S_rRNA rRNA -1473 cg02415963 ENSG00000212595
## 5 5S_rRNA rRNA -1473 cg02415963 ENSG00000252830
## ... ... ... ... ... ...
## 1057277 ZZZ3 protein_coding -260 cg24762437 ENSG00000036549
## 1057278 ZZZ3 protein_coding -486 cg05029193 ENSG00000036549
## 1057279 ZZZ3 protein_coding -171 cg01485247 ENSG00000036549
## 1057280 ZZZ3 protein_coding -1035 cg02988267 ENSG00000036549
## 1057281 ZZZ3 protein_coding -41 cg02490994 ENSG00000036549
associated.genes_id$distance <- NULL # ignore different distances
associated.genes_id <- unique(associated.genes_id)
associated.genes_id
## DataFrame with 531638 rows and 4 columns
## gene group_type probe ensembl_gene_id
## <factor> <factor> <factor> <character>
## 1 5S_rRNA rRNA cg02415963 ENSG00000276442
## 2 5S_rRNA rRNA cg02415963 ENSG00000274759
## 3 5S_rRNA rRNA cg02415963 ENSG00000275305
## 4 5S_rRNA rRNA cg02415963 ENSG00000212595
## 5 5S_rRNA rRNA cg02415963 ENSG00000252830
## ... ... ... ... ...
## 531634 ZZZ3 protein_coding cg02490994 ENSG00000036549
## 531635 ZZZ3 protein_coding cg01485247 ENSG00000036549
## 531636 ZZZ3 protein_coding cg17864040 ENSG00000036549
## 531637 ZZZ3 protein_coding cg22600394 ENSG00000036549
## 531638 ZZZ3 protein_coding cg02988267 ENSG00000036549
# hg19: Illumina manifest. Max distance TSS 1500
unique(associated.genes_id.hg19[,c("Name", "ensembl_gene_id","group_type")])
## DataFrame with 135007 rows and 3 columns
## Name ensembl_gene_id group_type
## <character> <character> <factor>
## 1 cg17498272 ENSG00000121410 TSS200
## 2 cg07739758 ENSG00000121410 TSS1500
## 3 cg10162823 ENSG00000121410 TSS1500
## 4 cg02957155 ENSG00000121410 TSS200
## 5 cg04269689 ENSG00000121410 TSS200
## ... ... ... ...
## 135003 cg02988267 ENSG00000036549 TSS1500
## 135004 cg26534213 ENSG00000036549 TSS1500
## 135005 cg05029193 ENSG00000036549 TSS1500
## 135006 cg05776075 ENSG00000036549 TSS1500
## 135007 cg01485247 ENSG00000036549 TSS200
hg19.pairs <- unique(associated.genes_id.hg19[,c("Name", "ensembl_gene_id")])
colnames(hg19.pairs)[1] <- "probe"
hg38.pairs <- unique(associated.genes_id[,c("probe","ensembl_gene_id")])
shared.pairs <- inner_join(as_tibble(hg19.pairs),as_tibble(hg38.pairs))
unique.hg19.pairs <- anti_join(as_tibble(hg19.pairs),as_tibble(hg38.pairs))
unique.hg38.pairs <- anti_join(as_tibble(hg38.pairs),as_tibble(hg19.pairs))
unique.hg19.genes <- setdiff(unique(hg19.pairs$ensembl_gene_id),unique(hg38.pairs$ensembl_gene_id))
unique.hg38.genes <- setdiff(unique(hg38.pairs$ensembl_gene_id),unique(hg19.pairs$ensembl_gene_id))
shared.genes <- intersect(unique(hg38.pairs$ensembl_gene_id),unique(hg19.pairs$ensembl_gene_id))
library(VennDiagram)
draw.pairwise.venn(length(unique(associated.genes_id$probe)),
length(unique(associated.genes_id.hg19$Name)),
length(intersect(unique(associated.genes_id$probe),
unique(associated.genes_id.hg19$Name))),
category = c("Probes hg38", "Probes hg19"),
lty = rep("blank", 2),
fill = c("light blue", "pink"),
alpha = rep(0.5, 2),
cat.pos = c(0, 0),
cat.dist = rep(0.025, 2))
## (polygon[GRID.polygon.11], polygon[GRID.polygon.12], polygon[GRID.polygon.13], polygon[GRID.polygon.14], text[GRID.text.15], text[GRID.text.16], lines[GRID.lines.17], text[GRID.text.18], text[GRID.text.19], text[GRID.text.20])
draw.pairwise.venn(length(unique(associated.genes_id$ensembl_gene_id)),
length(unique(associated.genes_id.hg19$ensembl_gene_id)),
length(intersect(unique(associated.genes_id$ensembl_gene_id),
unique(associated.genes_id.hg19$ensembl_gene_id))),
category = c("Genes associated to probes hg38", "Genes associated to probes hg19"),
lty = rep("blank", 2),
fill = c("light blue", "pink"),
alpha = rep(0.5, 2),
cat.pos = c(0, 0),
cat.dist = rep(0.025, 2))
## (polygon[GRID.polygon.21], polygon[GRID.polygon.22], polygon[GRID.polygon.23], polygon[GRID.polygon.24], text[GRID.text.25], text[GRID.text.26], lines[GRID.lines.27], text[GRID.text.28], text[GRID.text.29], text[GRID.text.30])
g1 <- unique(associated.genes_id[,c("probe","ensembl_gene_id")])
g2 <- unique(associated.genes_id.hg19[,c("Name","ensembl_gene_id")])
draw.pairwise.venn(nrow(g1),
nrow(g2),
sum(paste0(g1$probe,"_",g1$ensembl_gene_id) %in% paste0(g2$Name,"_",g2$ensembl_gene_id)),
category = c("Pairs associated to probes hg38", "Pairs associated to probes hg19"),
lty = rep("blank", 2),
fill = c("light blue", "pink"),
alpha = rep(0.5, 2),
cat.pos = c(0, 0),
cat.dist = rep(0.025, 2))
## (polygon[GRID.polygon.31], polygon[GRID.polygon.32], polygon[GRID.polygon.33], polygon[GRID.polygon.34], text[GRID.text.35], text[GRID.text.36], lines[GRID.lines.37], text[GRID.text.38], text[GRID.text.39], text[GRID.text.40])
library(plotly)
# Each correlation file has pairs of gene probes (max distance 1500bp)
# correlation values (spearman pvalue, rho and status)
# we read it and summarize
correlation.status <- NULL
files <- dir(pattern = "correlation.rda",recursive = T,full.names = T)
for(f in files){
load(f)
# to keep only the same distance signal
hg38_correlation <- left_join(as_tibble(unique(associated.genes_id[,c("probe", "ensembl_gene_id")])),
as_tibble(hg38_correlation))
tab <- plyr::count(hg19_correlation$status)
hg19.col <- gsub("\\.\\/","",gsub("_correlation.rda","_hg19",f))
colnames(tab) <- c("Status",hg19.col)
if(is.null(correlation.status)) {
correlation.status <- tab
} else {
correlation.status <- full_join(correlation.status, tab)
}
tab <- plyr::count(hg38_correlation$status)
hg38.col <- gsub("\\.\\/","",gsub("_correlation.rda","_hg38",f))
colnames(tab) <- c("Status",hg38.col)
correlation.status <- full_join(correlation.status, tab)
}
correlation.status[is.na(correlation.status)] <- 0
# Each correlation file has pairs of gene probes (max distance 1500bp)
# correlation values (spearman pvalue, rho and status)
# we read it and select only Strongly negatively correlated (SNC)
SNC.type <- NULL
phantom <- NULL
files <- dir(pattern = "correlation.rda", recursive = T, full.names = T)
for(f in files){
load(f)
# to keep only the same distance signal
hg38_correlation <- left_join(as_tibble(unique(associated.genes_id[,c("probe", "ensembl_gene_id")])),
as_tibble(hg38_correlation))
hg19 <- hg19_correlation[hg19_correlation$status == "Strongly negatively correlated (SNC)",]
paired.genes <- unique(hg19$ensembl_gene_id)
tab <- plyr::count(hg38_correlation[match(paired.genes, hg38_correlation$ensembl_gene_id),]$group_type)
phantom.hg19 <- c(sum(table(hg19[!duplicated(hg19$Name),]$Phantom)),sum(is.na(hg19[!duplicated(hg19$Name),]$Phantom)))
hg19.col <- gsub("\\.\\/","", gsub("_correlation.rda","_hg19",f))
colnames(tab) <- c("Gene_group_type", hg19.col)
if(is.null(SNC.type)) {
SNC.type <- tab
} else {
SNC.type <- full_join(SNC.type, tab)
}
aux <- hg38_correlation[hg38_correlation$status == "Strongly negatively correlated (SNC)",]
z <- hg19_correlation[hg19_correlation$Name %in% aux$probe,]
z <- z[!duplicated(z$Name),]
phantom.hg38 <- c(sum(table(z$Phantom)),
sum(is.na(z$Phantom)))
phantom <- rbind(phantom,phantom.hg19,phantom.hg38)
tab <- plyr::count(aux[!duplicated(aux$ensembl_gene_id),]$group_type)
hg38.col <- gsub("\\.\\/", "", gsub("_correlation.rda","_hg38",f))
colnames(tab) <- c("Gene_group_type", hg38.col)
SNC.type <- full_join(SNC.type, tab)
}
colnames(phantom) <- c("Phantom","No phantom")
rownames(phantom) <- sapply(files, function(x) return(c(paste0("hg38",x),paste0("hg19",x))))
# Each correlation file has pairs of gene probes (max distance 1500bp)
# correlation values (spearman pvalue, rho and status)
# we read it and select only Weakly negatively correlated (WNC)
WNC.type <- NULL
files <- dir(pattern = "correlation.rda", recursive = T, full.names = T)
for(f in files){
load(f)
# to keep only the same distance signal
hg38_correlation <- left_join(as_tibble(unique(associated.genes_id[,c("probe", "ensembl_gene_id")])),
as_tibble(hg38_correlation))
paired.genes <- unique(hg19_correlation[hg19_correlation$status == "Weakly negatively correlated (WNC)",]$ensembl_gene_id)
tab <- plyr::count(hg38_correlation[match(paired.genes, hg38_correlation$ensembl_gene_id),]$group_type)
hg19.col <- gsub("\\.\\/","", gsub("_correlation.rda","_hg19",f))
colnames(tab) <- c("Gene_group_type", hg19.col)
if(is.null(WNC.type)) {
WNC.type <- tab
} else {
WNC.type <- full_join(WNC.type, tab)
}
aux <- hg38_correlation[hg38_correlation$status == "Weakly negatively correlated (WNC)",]
tab <- plyr::count(aux[!duplicated(aux$ensembl_gene_id),]$group_type)
hg38.col <- gsub("\\.\\/", "", gsub("_correlation.rda","_hg38",f))
colnames(tab) <- c("Gene_group_type", hg38.col)
WNC.type <- full_join(WNC.type, tab)
}
colnames(correlation.status)[-1] <- paste0(colnames(correlation.status)[-1],"_negative")
colnames(SNC.type)[-1] <- paste0(colnames(SNC.type)[-1],"_negative")
colnames(WNC.type)[-1] <- paste0(colnames(WNC.type)[-1],"_negative")
WNC.type.negative <- WNC.type
SNC.type.negative <- SNC.type
correlation.status.negative <- correlation.status
# Source: http://zwdzwd.github.io/InfiniumAnnotation
# Only keeping genes & probes with distance between 0 and 1500
associated.genes_id
## DataFrame with 473320 rows and 5 columns
## gene group_type distance probe ensembl_gene_id
## <factor> <factor> <factor> <factor> <character>
## 1 5S_rRNA rRNA 27 cg24534731 ENSG00000276442
## 2 5S_rRNA rRNA 27 cg24534731 ENSG00000274759
## 3 5S_rRNA rRNA 27 cg24534731 ENSG00000275305
## 4 5S_rRNA rRNA 27 cg24534731 ENSG00000212595
## 5 5S_rRNA rRNA 27 cg24534731 ENSG00000252830
## ... ... ... ... ... ...
## 473316 ZZZ3 protein_coding 99 cg04127303 ENSG00000036549
## 473317 ZZZ3 protein_coding 536 cg17864040 ENSG00000036549
## 473318 ZZZ3 protein_coding 275 cg05029193 ENSG00000036549
## 473319 ZZZ3 protein_coding 82 cg04127303 ENSG00000036549
## 473320 ZZZ3 protein_coding 693 cg24577193 ENSG00000036549
associated.genes_id$distance <- NULL # ignore different distances
associated.genes_id <- unique(associated.genes_id)
associated.genes_id
## DataFrame with 173538 rows and 4 columns
## gene group_type probe ensembl_gene_id
## <factor> <factor> <factor> <character>
## 1 5S_rRNA rRNA cg24534731 ENSG00000276442
## 2 5S_rRNA rRNA cg24534731 ENSG00000274759
## 3 5S_rRNA rRNA cg24534731 ENSG00000275305
## 4 5S_rRNA rRNA cg24534731 ENSG00000212595
## 5 5S_rRNA rRNA cg24534731 ENSG00000252830
## ... ... ... ... ...
## 173534 ZZZ3 protein_coding cg17224732 ENSG00000036549
## 173535 ZZZ3 protein_coding cg04127303 ENSG00000036549
## 173536 ZZZ3 protein_coding cg24762437 ENSG00000036549
## 173537 ZZZ3 protein_coding cg17864040 ENSG00000036549
## 173538 ZZZ3 protein_coding cg05029193 ENSG00000036549
# hg19: Illumina manifest. Max distance TSS 1500
unique(associated.genes_id.hg19[,c("Name", "ensembl_gene_id","group_type")])
## DataFrame with 135007 rows and 3 columns
## Name ensembl_gene_id group_type
## <character> <character> <factor>
## 1 cg17498272 ENSG00000121410 TSS200
## 2 cg07739758 ENSG00000121410 TSS1500
## 3 cg10162823 ENSG00000121410 TSS1500
## 4 cg02957155 ENSG00000121410 TSS200
## 5 cg04269689 ENSG00000121410 TSS200
## ... ... ... ...
## 135003 cg02988267 ENSG00000036549 TSS1500
## 135004 cg26534213 ENSG00000036549 TSS1500
## 135005 cg05029193 ENSG00000036549 TSS1500
## 135006 cg05776075 ENSG00000036549 TSS1500
## 135007 cg01485247 ENSG00000036549 TSS200
hg19.pairs <- unique(associated.genes_id.hg19[,c("Name", "ensembl_gene_id")])
colnames(hg19.pairs)[1] <- "probe"
hg38.pairs <- unique(associated.genes_id[,c("probe","ensembl_gene_id")])
shared.pairs <- inner_join(as_tibble(hg19.pairs),as_tibble(hg38.pairs))
unique.hg19.pairs <- anti_join(as_tibble(hg19.pairs),as_tibble(hg38.pairs))
unique.hg38.pairs <- anti_join(as_tibble(hg38.pairs),as_tibble(hg19.pairs))
unique.hg19.genes <- setdiff(unique(hg19.pairs$ensembl_gene_id),unique(hg38.pairs$ensembl_gene_id))
unique.hg38.genes <- setdiff(unique(hg38.pairs$ensembl_gene_id),unique(hg19.pairs$ensembl_gene_id))
shared.genes <- intersect(unique(hg38.pairs$ensembl_gene_id),unique(hg19.pairs$ensembl_gene_id))
library(VennDiagram)
draw.pairwise.venn(length(unique(associated.genes_id$probe)),
length(unique(associated.genes_id.hg19$Name)),
length(intersect(unique(associated.genes_id$probe),
unique(associated.genes_id.hg19$Name))),
category = c("Probes hg38", "Probes hg19"),
lty = rep("blank", 2),
fill = c("light blue", "pink"),
alpha = rep(0.5, 2),
cat.pos = c(0, 0),
cat.dist = rep(0.025, 2))
## (polygon[GRID.polygon.41], polygon[GRID.polygon.42], polygon[GRID.polygon.43], polygon[GRID.polygon.44], text[GRID.text.45], text[GRID.text.46], text[GRID.text.47], text[GRID.text.48], text[GRID.text.49])
draw.pairwise.venn(length(unique(associated.genes_id$ensembl_gene_id)),
length(unique(associated.genes_id.hg19$ensembl_gene_id)),
length(intersect(unique(associated.genes_id$ensembl_gene_id),
unique(associated.genes_id.hg19$ensembl_gene_id))),
category = c("Genes associated to probes hg38", "Genes associated to probes hg19"),
lty = rep("blank", 2),
fill = c("light blue", "pink"),
alpha = rep(0.5, 2),
cat.pos = c(0, 0),
cat.dist = rep(0.025, 2))
## (polygon[GRID.polygon.50], polygon[GRID.polygon.51], polygon[GRID.polygon.52], polygon[GRID.polygon.53], text[GRID.text.54], text[GRID.text.55], lines[GRID.lines.56], text[GRID.text.57], text[GRID.text.58], text[GRID.text.59])
g1 <- unique(associated.genes_id[,c("probe","ensembl_gene_id")])
g2 <- unique(associated.genes_id.hg19[,c("Name","ensembl_gene_id")])
draw.pairwise.venn(nrow(g1),
nrow(g2),
sum(paste0(g1$probe,"_",g1$ensembl_gene_id) %in% paste0(g2$Name,"_",g2$ensembl_gene_id)),
category = c("Pairs associated to probes hg38", "Pairs associated to probes hg19"),
lty = rep("blank", 2),
fill = c("light blue", "pink"),
alpha = rep(0.5, 2),
cat.pos = c(0, 0),
cat.dist = rep(0.025, 2))
## (polygon[GRID.polygon.60], polygon[GRID.polygon.61], polygon[GRID.polygon.62], polygon[GRID.polygon.63], text[GRID.text.64], text[GRID.text.65], text[GRID.text.66], text[GRID.text.67], text[GRID.text.68])
# Source: http://zwdzwd.github.io/InfiniumAnnotation
# Only keeping genes & probes with distance < 1500
associated.genes_id
## DataFrame with 1531317 rows and 5 columns
## gene group_type distance probe ensembl_gene_id
## <factor> <factor> <factor> <factor> <character>
## 1 5S_rRNA rRNA -1473 cg02415963 ENSG00000276442
## 2 5S_rRNA rRNA -1473 cg02415963 ENSG00000274759
## 3 5S_rRNA rRNA -1473 cg02415963 ENSG00000275305
## 4 5S_rRNA rRNA -1473 cg02415963 ENSG00000212595
## 5 5S_rRNA rRNA -1473 cg02415963 ENSG00000252830
## ... ... ... ... ... ...
## 1531313 ZZZ3 protein_coding 82 cg04127303 ENSG00000036549
## 1531314 ZZZ3 protein_coding 693 cg24577193 ENSG00000036549
## 1531315 ZZZ3 protein_coding -171 cg01485247 ENSG00000036549
## 1531316 ZZZ3 protein_coding -1035 cg02988267 ENSG00000036549
## 1531317 ZZZ3 protein_coding -41 cg02490994 ENSG00000036549
associated.genes_id$distance <- NULL # ignore different distances
associated.genes_id <- unique(associated.genes_id)
associated.genes_id
## DataFrame with 639893 rows and 4 columns
## gene group_type probe ensembl_gene_id
## <factor> <factor> <factor> <character>
## 1 5S_rRNA rRNA cg02415963 ENSG00000276442
## 2 5S_rRNA rRNA cg02415963 ENSG00000274759
## 3 5S_rRNA rRNA cg02415963 ENSG00000275305
## 4 5S_rRNA rRNA cg02415963 ENSG00000212595
## 5 5S_rRNA rRNA cg02415963 ENSG00000252830
## ... ... ... ... ...
## 639889 ZZZ3 protein_coding cg02490994 ENSG00000036549
## 639890 ZZZ3 protein_coding cg04718055 ENSG00000036549
## 639891 ZZZ3 protein_coding cg17864040 ENSG00000036549
## 639892 ZZZ3 protein_coding cg04127303 ENSG00000036549
## 639893 ZZZ3 protein_coding cg02988267 ENSG00000036549
# hg19: Illumina manifest. Max distance TSS 1500
unique(associated.genes_id.hg19[,c("Name", "ensembl_gene_id","group_type")])
## DataFrame with 135007 rows and 3 columns
## Name ensembl_gene_id group_type
## <character> <character> <factor>
## 1 cg17498272 ENSG00000121410 TSS200
## 2 cg07739758 ENSG00000121410 TSS1500
## 3 cg10162823 ENSG00000121410 TSS1500
## 4 cg02957155 ENSG00000121410 TSS200
## 5 cg04269689 ENSG00000121410 TSS200
## ... ... ... ...
## 135003 cg02988267 ENSG00000036549 TSS1500
## 135004 cg26534213 ENSG00000036549 TSS1500
## 135005 cg05029193 ENSG00000036549 TSS1500
## 135006 cg05776075 ENSG00000036549 TSS1500
## 135007 cg01485247 ENSG00000036549 TSS200
hg19.pairs <- unique(associated.genes_id.hg19[,c("Name", "ensembl_gene_id")])
colnames(hg19.pairs)[1] <- "probe"
hg38.pairs <- unique(associated.genes_id[,c("probe","ensembl_gene_id")])
shared.pairs <- inner_join(as_tibble(hg19.pairs),as_tibble(hg38.pairs))
unique.hg19.pairs <- anti_join(as_tibble(hg19.pairs),as_tibble(hg38.pairs))
unique.hg38.pairs <- anti_join(as_tibble(hg38.pairs),as_tibble(hg19.pairs))
unique.hg19.genes <- setdiff(unique(hg19.pairs$ensembl_gene_id),unique(hg38.pairs$ensembl_gene_id))
unique.hg38.genes <- setdiff(unique(hg38.pairs$ensembl_gene_id),unique(hg19.pairs$ensembl_gene_id))
shared.genes <- intersect(unique(hg38.pairs$ensembl_gene_id),unique(hg19.pairs$ensembl_gene_id))
library(VennDiagram)
draw.pairwise.venn(length(unique(associated.genes_id$probe)),
length(unique(associated.genes_id.hg19$Name)),
length(intersect(unique(associated.genes_id$probe),
unique(associated.genes_id.hg19$Name))),
category = c("Probes hg38", "Probes hg19"),
lty = rep("blank", 2),
fill = c("light blue", "pink"),
alpha = rep(0.5, 2),
cat.pos = c(0, 0),
cat.dist = rep(0.025, 2))
## (polygon[GRID.polygon.69], polygon[GRID.polygon.70], polygon[GRID.polygon.71], polygon[GRID.polygon.72], text[GRID.text.73], text[GRID.text.74], lines[GRID.lines.75], text[GRID.text.76], text[GRID.text.77], text[GRID.text.78])
draw.pairwise.venn(length(unique(associated.genes_id$ensembl_gene_id)),
length(unique(associated.genes_id.hg19$ensembl_gene_id)),
length(intersect(unique(associated.genes_id$ensembl_gene_id),
unique(associated.genes_id.hg19$ensembl_gene_id))),
category = c("Genes associated to probes hg38", "Genes associated to probes hg19"),
lty = rep("blank", 2),
fill = c("light blue", "pink"),
alpha = rep(0.5, 2),
cat.pos = c(0, 0),
cat.dist = rep(0.025, 2))
## (polygon[GRID.polygon.79], polygon[GRID.polygon.80], polygon[GRID.polygon.81], polygon[GRID.polygon.82], text[GRID.text.83], text[GRID.text.84], lines[GRID.lines.85], text[GRID.text.86], text[GRID.text.87], text[GRID.text.88])
g1 <- unique(associated.genes_id[,c("probe","ensembl_gene_id")])
g2 <- unique(associated.genes_id.hg19[,c("Name","ensembl_gene_id")])
draw.pairwise.venn(nrow(g1),
nrow(g2),
sum(paste0(g1$probe,"_",g1$ensembl_gene_id) %in% paste0(g2$Name,"_",g2$ensembl_gene_id)),
category = c("Pairs associated to probes hg38", "Pairs associated to probes hg19"),
lty = rep("blank", 2),
fill = c("light blue", "pink"),
alpha = rep(0.5, 2),
cat.pos = c(0, 0),
cat.dist = rep(0.025, 2))
## (polygon[GRID.polygon.89], polygon[GRID.polygon.90], polygon[GRID.polygon.91], polygon[GRID.polygon.92], text[GRID.text.93], text[GRID.text.94], lines[GRID.lines.95], text[GRID.text.96], text[GRID.text.97], text[GRID.text.98])
load("~/paper_elmer/GDAN/associated.genes_id_hg19.rda")
load("~/paper_elmer/GDAN/associated.genes_id.rda")
associated.genes_id.negative <- associated.genes_id[as.numeric(as.character(associated.genes_id$distance)) < 0,]
associated.genes_id.negative <- unique(associated.genes_id.negative[,c("group_type","ensembl_gene_id")])
associated.genes_id <- unique(associated.genes_id[,c("group_type","ensembl_gene_id")])
gene.hg19 <- unique(associated.genes_id.hg19$ensembl_gene_id)
associated.genes_id.hg19 <- data.frame(ensembl_gene_id = gene.hg19,
group_type = associated.genes_id$group_type[match(gene.hg19,
associated.genes_id$ensembl_gene_id)]
)
tab.h19 <- plyr::count(associated.genes_id.hg19$group_type)
colnames(tab.h19) <- c("Gene_group_type", "hg19_negative")
tab.h38 <- plyr::count(associated.genes_id.negative$group_type)
colnames(tab.h38) <- c("Gene_group_type", "hg38_negative")
tab.h38.all <- plyr::count(associated.genes_id$group_type)
colnames(tab.h38.all) <- c("Gene_group_type", "hg38_all")
all.type <- NULL
all.type <- full_join(tab.h38, tab.h38.all)
all.type <- full_join(all.type, tab.h19)
library(plotly)
# Each correlation file has pairs of gene probes (max distance 1500bp)
# correlation values (spearman pvalue, rho and status)
# we read it and summarize
files <- dir(pattern = "correlation.rda",recursive = T,full.names = T)
correlation.status <- NULL
for(f in files){
load(f)
tab <- plyr::count(hg19_correlation$status)
hg19.col <- gsub("\\.\\/","",gsub("_correlation.rda","_hg19",f))
colnames(tab) <- c("Status",hg19.col)
if(is.null(correlation.status)) {
correlation.status <- tab
} else {
correlation.status <- full_join(correlation.status, tab)
}
tab <- plyr::count(hg38_correlation$status)
hg38.col <- gsub("\\.\\/","",gsub("_correlation.rda","_hg38",f))
colnames(tab) <- c("Status",hg38.col)
correlation.status <- full_join(correlation.status, tab)
}
# Each correlation file has pairs of gene probes (max distance 1500bp)
# correlation values (spearman pvalue, rho and status)
# we read it and select only Strongly negatively correlated (SNC)
SNC.type <- NULL
phantom <- NULL
files <- dir(pattern = "correlation.rda", recursive = T, full.names = T)
for(f in files){
message(f)
load(f)
if(!any(hg19_correlation$status == "Strongly negatively correlated (SNC)")) next
hg19 <- hg19_correlation[hg19_correlation$status == "Strongly negatively correlated (SNC)",]
paired.genes <- unique(hg19$ensembl_gene_id)
tab <- plyr::count(hg38_correlation[match(paired.genes, hg38_correlation$ensembl_gene_id),]$group_type)
phantom.hg19 <- c(sum(table(hg19[!duplicated(hg19$Name),]$Phantom)),sum(is.na(hg19[!duplicated(hg19$Name),]$Phantom)))
hg19.col <- gsub("\\.\\/","", gsub("_correlation.rda","_hg19",f))
colnames(tab) <- c("Gene_group_type", hg19.col)
if(is.null(SNC.type)) {
SNC.type <- tab
} else {
SNC.type <- full_join(SNC.type, tab)
}
aux <- hg38_correlation[hg38_correlation$status == "Strongly negatively correlated (SNC)",]
tab <- plyr::count(aux[!duplicated(aux$ensembl_gene_id),]$group_type)
hg38.col <- gsub("\\.\\/", "", gsub("_correlation.rda","_hg38",f))
colnames(tab) <- c("Gene_group_type", hg38.col)
SNC.type <- full_join(SNC.type, tab)
}
# Each correlation file has pairs of gene probes (max distance 1500bp)
# correlation values (spearman pvalue, rho and status)
# we read it and select only Weakly negatively correlated (WNC)
WNC.type <- NULL
files <- dir(pattern = "correlation.rda", recursive = T, full.names = T)
for(f in files){
load(f)
if(!any(hg19_correlation$status == "Weakly negatively correlated (WNC)")) next
paired.genes <- unique(hg19_correlation[hg19_correlation$status == "Weakly negatively correlated (WNC)",]$ensembl_gene_id)
tab <- plyr::count(hg38_correlation[match(paired.genes, hg38_correlation$ensembl_gene_id),]$group_type)
hg19.col <- gsub("\\.\\/","", gsub("_correlation.rda","_hg19",f))
colnames(tab) <- c("Gene_group_type", hg19.col)
if(is.null(WNC.type)) {
WNC.type <- tab
} else {
WNC.type <- full_join(WNC.type, tab)
}
aux <- hg38_correlation[hg38_correlation$status == "Weakly negatively correlated (WNC)",]
tab <- plyr::count(aux[!duplicated(aux$ensembl_gene_id),]$group_type)
hg38.col <- gsub("\\.\\/", "", gsub("_correlation.rda","_hg38",f))
colnames(tab) <- c("Gene_group_type", hg38.col)
WNC.type <- full_join(WNC.type, tab)
}
colnames(correlation.status)[-1] <- paste0(colnames(correlation.status)[-1] ,"_all")
colnames(SNC.type)[-1] <- paste0(colnames(SNC.type)[-1] ,"_all")
colnames(WNC.type)[-1] <- paste0(colnames(WNC.type)[-1] ,"_all")
WNC.type.all <- WNC.type
SNC.type.all <- SNC.type
correlation.status.all <- correlation.status
WNC.type <- full_join(WNC.type.negative, WNC.type.all)
SNC.type <- full_join(SNC.type.negative, SNC.type.all)
correlation.status <- full_join(correlation.status.negative,correlation.status.all)
save(WNC.type,SNC.type,correlation.status,file = "plot.rda")
library(doParallel)
library(parallel)
library(dplyr)
library(ELMER)
library(SummarizedExperiment)
library(TCGAbiolinks)
library(readr)
cores <- 2
registerDoParallel(cores)
projects <- getGDCprojects()$project_id
projects <- grep("TCGA",projects,value = TRUE)
tumors <- sort(gsub("TCGA-","",projects),decreasing = T)
file <- "associated.genes_id.rda"
associated.genes_id <- NULL
if(file.exists(file)) {
associated.genes_id <- unique(get(load(file)))
associated.genes_id$distance <- NULL
associated.genes_id <- unique(associated.genes_id)
}
for(tumor in tumors){
print(tumor)
if(file.exists(paste0(paste0(tumor,"/hg38/"),tumor,"_hg38_correlation.csv"))) next
dir.create(paste0(tumor,"/hg38/"),showWarnings = FALSE,recursive = T)
met <- get(load(paste0("~/paper_elmer/Data/",tumor,"/",tumor,"_meth_hg38_no_filter.rda")))
rna <- get(load(paste0("~/paper_elmer/Data/",tumor,"/",tumor,"_RNA_hg38_no_filter.rda")))
gene.metadata <- values(rna)[,1:2]
met <- met[rowRanges(met)$Gene_Symbol != ".",]
if(is.null(associated.genes_id)){
associated.genes_id <- plyr::adply(.data = values(met),
.margins = 1,
.fun = function(x){
genes <- x["Gene_Symbol"] %>% stringr::str_split(";") %>% unlist
groups <- x["Gene_Type"] %>% stringr::str_split(";") %>% unlist
distance <- x["Position_to_TSS"] %>% stringr::str_split(";") %>% unlist
idx <- which(abs(as.numeric(distance)) <= 1500)
data.frame(gene = genes[idx],
group_type = groups[idx],
distance = distance[idx],
probe = rep(x$Composite.Element.REF,
length(idx)))
},.id = NULL,.expand = T,.progress = T,.parallel = FALSE)
associated.genes_id <- full_join(associated.genes_id,
values(rna)[,1:2],
by.x = "gene",
by.y = "external_gene_name")
save(associated.genes_id,file = file)
}
# This will keep samples with the both DNA methylation and gene expression
# Take the log2(exp + 1)
# Put samples in the right order
# Keep only probes in 2KB close to TSS
# Remove probes that have NA for more than 50% of samples
mae.file <- paste0(paste0(tumor),"/",tumor,"_mae_hg38.rda")
if(file.exists(mae.file)) {
mae <- get(load(mae.file))
} else {
mae <- createMAE(exp = rna,
met = met[rownames(met) %in% associated.genes_id$probe,],
TCGA = TRUE,
linearize.exp = T,
save = T,
save.filename = paste0(tumor,"/",tumor,"_mae_hg38.rda"),
met.platform = "450K",
genome = "hg38",
met.na.cut = 0.5)
}
correlations <- plyr::adply(unique(associated.genes_id[,3:4]),
.margins = 1,
.fun = function(x){
GeneID <- as.character(x$ensembl_gene_id)
probe <- as.character(x$probe)
if(!probe %in% names(getMet(mae))) {
return(tibble::tibble(rho = NA,
pval = NA,
probe,
ensembl_gene_id = GeneID))
}
cor <- cor.test(x = as.numeric(assay(getMet(mae)[probe])),
y = as.numeric(assay(getExp(mae)[GeneID])),
method = c("spearman"))
corval <- as.numeric(cor$estimate)
pvalue <- as.numeric(cor$p.value)
df <- tibble::tibble(rho = corval,
pval = pvalue,
probe,
ensembl_gene_id = GeneID)
return(df)
},.id = NULL,.expand = T,.progress = "text",.parallel = F)
#correlations <- correlations[!is.na(correlations$rho),] # Remove associations not found
correlations$rho <- as.numeric(as.character(correlations$rho))
correlations$pval <- as.numeric(as.character(correlations$pval))
correlations$FDR <- p.adjust(as.numeric(as.character(correlations$pval)),method = "BH")
# If FDR > 0.05 - Insignificant
# If FDR <= 0.05:
# - Strongly negatively correlated (SNC) when the rho value was less than -0.5;
# - Weakly negatively correlated (WNC) when the rho value was between -0.5 and -0.25;
# - No negative correlation (NNC) when the rho value was greater than -0.25.
correlations$status <- "Insignificant"
correlations$status[correlations$FDR <= 0.05 & as.numeric(correlations$rho) >= -0.25] <- "No negative correlation"
correlations$status[correlations$FDR <= 0.05 & correlations$rho < -0.25 & correlations$rho > -0.5] <- "Weakly negatively correlated (WNC)"
correlations$status[correlations$FDR <= 0.05 & correlations$rho <= -0.5] <- "Strongly negatively correlated (SNC)"
correlations <- correlations[with(correlations,order(rho,pval)),]
# save
readr::write_csv(as.data.frame(correlations),
path = paste0(paste0(tumor,"/hg38/"),"/",tumor,"_hg38_correlation.csv"))
aux <- left_join(as_tibble(correlations),
unique(as_tibble(associated.genes_id)))
readr::write_csv(aux,
path = paste0(paste0(tumor,"/hg38/"),"/",tumor,"_hg38_correlation_all_info.csv"))
}
library(doParallel)
library(parallel)
library(dplyr)
library(ELMER)
library(SummarizedExperiment)
library(TCGAbiolinks)
# Illumina manifest
# Source: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GPL13534
metadata.hg19 <- readr::read_tsv(file = "~/paper_elmer/GDAN/450k-manifest-GPL13534-11288.txt",skip = 37)
metadata.hg19 <- metadata.hg19[!is.na(metadata.hg19$UCSC_RefGene_Name),]
metadata.hg19 <- metadata.hg19[grep("TSS1500|TSS200",metadata.hg19$UCSC_RefGene_Group),]
file <- "associated.genes_id_hg19.rda"
associated.genes_id.hg19 <- NULL
if(file.exists(file)) {
associated.genes_id.hg19 <- get(load(file))
colnames(associated.genes_id.hg19)[3] <- "probe"
}
if(is.null(associated.genes_id.hg19)){
associated.genes_id.hg19 <- plyr::adply(.data = metadata.hg19,
.margins = 1,
.fun = function(x){
genes <- x["UCSC_RefGene_Name"] %>% stringr::str_split(";") %>% unlist
groups <- x["UCSC_RefGene_Group"] %>% stringr::str_split(";") %>% unlist
idx <- grep("TSS",groups)
data.frame(gene = genes[idx],group_type = groups[idx])
},.id = NULL,.expand = T,.progress = T,.parallel = TRUE)
# get all LOC gene names and update them there are not in ENSEMBL database with LOC
symbols <- gsub("LOC","",unique(grep("LOC",associated.genes_id.hg19$gene,value = T)))
mapping <- select(org.Hs.eg.db, symbols, c("ENTREZID","GENENAME","ENSEMBL","SYMBOL"))
# Replace LOC by gene names. LOC should be removed as this gene names are LOC + entrez ID
idx <- grep("LOC",associated.genes_id.hg19$gene)
#gene.<- as.character(mapping$ENSEMBL[match(gsub("LOC","",associated.genes_id.hg19$gene[idx]), mapping$ENTREZID)])
#associated.genes_id.hg19$gene[idx] <-
associated.genes_id.hg19 <- merge(associated.genes_id.hg19,
values(rna)[,1:2],
by.x = "gene",
by.x = "external_gene_name")
save(associated.genes_id.hg19,file = file)
}
library(doParallel)
library(parallel)
library(dplyr)
library(ELMER)
cores <- 2
registerDoParallel(cores)
projects <- getGDCprojects()$project_id
projects <- grep("TCGA",projects,value = TRUE)
tumors <- sort(gsub("TCGA-","",projects))
#tumors <- c("ESCA","BRCA","LUAD","LUSC")
for(tumor in tumors){
print(tumor)
if(file.exists(paste0(paste0(tumor,"/hg19/"),"/",tumor,"_hg19_correlation.csv"))) next
dir.create(paste0(tumor,"/hg19/"),showWarnings = FALSE,recursive = TRUE)
met <- get(load(paste0("~/paper_elmer/Data/",tumor,"/",tumor,"_meth_hg38_no_filter.rda")))
met <- met[unique(associated.genes_id.hg19$ID),]
rna <- get(load(paste0("~/paper_elmer/Data/",tumor,"/",tumor,"_RNA_hg38_no_filter.rda")))
# This will keep samples with the both DNA methylation and gene expression
# Take the log2(exp + 1)
# Put samples in the right order
# Keep only probes in 2KB close to TSS
# Remove probes that have NA for more than 50% of samples
dir.create(paste0(paste0(tumor,"/hg19/")),showWarnings = F,recursive = T)
mae.file <- paste0(paste0(tumor,"/hg19/"),"/",tumor,"_mae_hg19.rda")
if(file.exists(mae.file)) {
mae <- get(load(mae.file))
} else {
mae <- createMAE(exp = rna,
met = met,
TCGA = TRUE,
linearize.exp = T,
save = T,
save.filename = mae.file,
met.platform = "450K",
genome = "hg19",
met.na.cut = 0.5)
save(mae, file = mae.file)
}
correlations <- plyr::adply(associated.genes_id.hg19,
.margins = 1,
.fun = function(x){
GeneID <- as.character(x$ensembl_gene_id)
probe <- as.character(x$probe)
if(!probe %in% names(getMet(mae))) {
return(tibble::tibble(rho = NA, pval = NA, ensembl_gene_id = GeneID, probe))
}
cor <- cor.test(x = as.numeric(assay(getMet(mae)[probe])),
y = as.numeric(assay(getExp(mae)[GeneID])),
method = c("spearman"))
corval <- as.numeric(cor$estimate)
pvalue <- as.numeric(cor$p.value)
tibble::tibble(rho = corval,pval = pvalue, ensembl_gene_id = GeneID, probe)
},.id = NULL,
.expand = T,
.progress = "text",
.parallel = T)
#correlations <- correlations[!is.na(correlations$rho),] # Remove associations not found
correlations$rho <- as.numeric(as.character(correlations$rho))
correlations$pval <- as.numeric(as.character(correlations$pval))
correlations$FDR <- p.adjust(as.numeric(as.character(correlations$pval)),method = "BH")
# If FDR > 0.05 - Insignificant
# If FDR <= 0.05:
# - Strongly negatively correlated (SNC) when the rho value was less than -0.5;
# - Weakly negatively correlated (WNC) when the rho value was between -0.5 and -0.25;
# - No negative correlation (NNC) when the rho value was greater than -0.25.
correlations <- correlations[!is.na( as.numeric(correlations$rho)),]
correlations$status <- "Insignificant"
correlations[correlations$FDR <= 0.05 & as.numeric(correlations$rho) >= -0.25,]$status <- "No negative correlation"
correlations[correlations$FDR <= 0.05 & correlations$rho < -0.25 & correlations$rho > -0.5,]$status <- "Weakly negatively correlated (WNC)"
correlations[correlations$FDR <= 0.05 & correlations$rho <= -0.5,]$status <- "Strongly negatively correlated (SNC)"
correlations <- correlations[with(correlations,order(rho,pval)),]
# save
readr::write_csv(unique(as.data.frame(correlations)),
path = paste0(paste0(tumor,"/hg19/"),"/",tumor,"_hg19_correlation.csv"))
}
library(readr)
library(dplyr)
load("~/paper_elmer/GDAN/associated.genes_id_hg19.rda")
load("~/paper_elmer/GDAN/associated.genes_id.rda")
colnames(associated.genes_id.hg19)[3] <- "probe"
associated.genes_id$distance <- NULL
associated.genes_id <- unique(associated.genes_id)
files <- dir(pattern = "correlation.csv",recursive = T,full.names = T)
tumors <- basename(dirname(dirname(files)))
tumors <- tumors[duplicated(tumors)]
snc.in.hg38.not.in.hg19 <- NULL
for(tumor in tumors){
print(tumor)
f <- sort(grep(tumor, files,value = T))
hg19_correlation <- read_csv(f[1])
colnames(hg19_correlation)[grep("cg",hg19_correlation[1,])] <- "probe"
hg38_correlation <- read_csv(f[2])
colnames(hg38_correlation)[grep("ENSG",hg38_correlation[1,])] <- "ensembl_gene_id"
hg19_correlation <- left_join(as_tibble(hg19_correlation),unique(as_tibble(associated.genes_id.hg19)))
hg38_correlation <- left_join(as_tibble(hg38_correlation),unique(as_tibble(associated.genes_id)))
save(hg19_correlation,hg38_correlation, file = paste0(tumor,"_correlation.rda"))
if(any(hg19_correlation$status == "Strongly negatively correlated (SNC)")){
y <- hg19_correlation[hg19_correlation$status == "Strongly negatively correlated (SNC)",]
z <- hg38_correlation[hg38_correlation$status == "Strongly negatively correlated (SNC)",]
z <- unique(z[!z$ensembl_gene_id %in% y$ensembl_gene_id,c(8:9)])
z$tumor <- tumor
snc.in.hg38.not.in.hg19 <- rbind(snc.in.hg38.not.in.hg19,z)
}
}
readr::write_csv(unique(as.data.frame(snc.in.hg38.not.in.hg19)),path = "snc_in_hg38_not_in_hg19.csv")
# Evaluating only negative
library(readr)
load("~/paper_elmer/GDAN/associated.genes_id_hg19.rda")
load("~/paper_elmer/GDAN/associated.genes_id.rda")
colnames(associated.genes_id.hg19)[3] <- "probe"
associated.genes_id <- associated.genes_id[as.numeric(as.character(associated.genes_id$distance)) < 0,]
associated.genes_id$distance <- NULL
associated.genes_id <- unique(associated.genes_id)
files <- dir(pattern = "correlation.csv",recursive = T,full.names = T)
tumors <- basename(dirname(dirname(files)))
tumors <- tumors[duplicated(tumors)]
snc.in.hg38.not.in.hg19 <- NULL
wnc.in.hg38.not.in.hg19 <- NULL
wnc.in.hg38.not.in.hg19.all <- NULL
snc.in.hg38.not.in.hg19.all <- NULL
for(tumor in tumors){
print(tumor)
f <- sort(grep(tumor, files,value = T))
hg19_correlation <- read_csv(f[1])
colnames(hg19_correlation)[grep("cg",hg19_correlation[1,])] <- "probe"
hg38_correlation <- read_csv(f[2])
colnames(hg38_correlation)[grep("ENSG",hg38_correlation[1,])] <- "ensembl_gene_id"
hg19_correlation <- left_join(as_tibble(hg19_correlation),unique(as_tibble(associated.genes_id.hg19)))
hg38_correlation <- left_join(as_tibble(hg38_correlation),unique(as_tibble(associated.genes_id)))
save(hg19_correlation,hg38_correlation, file = paste0(tumor,"_correlation_only_negative.rda"))
if(any(hg19_correlation$status == "Strongly negatively correlated (SNC)")){
y <- hg19_correlation[hg19_correlation$status == "Strongly negatively correlated (SNC)",]
z <- hg38_correlation[hg38_correlation$status == "Strongly negatively correlated (SNC)",]
z$tumor <- tumor
aux <- unique(z[!z$ensembl_gene_id %in% y$ensembl_gene_id,c(8:9)])
snc.in.hg38.not.in.hg19 <- rbind(snc.in.hg38.not.in.hg19,aux)
aux <- unique(z[!z$ensembl_gene_id %in% y$ensembl_gene_id,])
snc.in.hg38.not.in.hg19.all <- rbind(snc.in.hg38.not.in.hg19.all,aux)
}
if(any(hg19_correlation$status == "Weakly negatively correlated (WNC)")){
y <- hg19_correlation[hg19_correlation$status == "Weakly negatively correlated (WNC)",]
z <- hg38_correlation[hg38_correlation$status == "Weakly negatively correlated (WNC)",]
z$tumor <- tumor
aux <- unique(z[!z$ensembl_gene_id %in% y$ensembl_gene_id,c(8:9)])
wnc.in.hg38.not.in.hg19 <- rbind(wnc.in.hg38.not.in.hg19,aux)
aux <- unique(z[!z$ensembl_gene_id %in% y$ensembl_gene_id,])
wnc.in.hg38.not.in.hg19.all <- rbind(wnc.in.hg38.not.in.hg19.all,aux)
}
}
readr::write_csv(unique(as.data.frame(snc.in.hg38.not.in.hg19.all)),
path = "snc_in_hg38_negative_1500bp_not_in_hg19_negative_1500bp_all_info.csv")
readr::write_csv(unique(as.data.frame(wnc.in.hg38.not.in.hg19.all)),
path = "wnc_in_hg38_negative_1500bp_not_in_hg19_negative_1500bp_all_info.csv")
readr::write_csv(unique(as.data.frame(snc.in.hg38.not.in.hg19)),
path = "snc_in_hg38_negative_1500bp_not_in_hg19_negative_1500bp.csv")
readr::write_csv(unique(as.data.frame(wnc.in.hg38.not.in.hg19)),
path = "wnc_in_hg38_negative_1500bp_not_in_hg19_negative_1500bp.csv")
# Plots
files <- dir(pattern = "correlation.csv",recursive = T,full.names = T)
for(tumor in tumors){
mae.file <- grep(tumor,dir(pattern = "mae_hg38",recursive = T,full.names = T),value = TRUE)
load(mae.file)
f <- sort(grep(tumor, files,value = T))
hg38_correlation <- read_csv(f[2])
colnames(hg38_correlation)[grep("ENSG",hg38_correlation[1,])] <- "ensembl_gene_id"
for(i in 1:100){
library(ELMER)
dir.create(paste0(tumor,"/hg38/plots/"),recursive = T)
scatter.plot(data = mae,
dir.out = paste0(tumor,"/hg38/plots/"),
save = TRUE,
byPair = list(probe = as.character(hg38_correlation$probe[i]),
gene = as.character(hg38_correlation$ensembl_gene_id[i])),
category = "definition")
}
}
files <- dir(pattern = "correlation.csv",recursive = T,full.names = T)
for(tumor in tumors){
f <- sort(grep(tumor, files,value = T))
hg19_correlation <- read_csv(f[1])
colnames(hg19_correlation)[grep("cg",hg19_correlation[1,])] <- "probe"
mae.file <- grep(tumor,dir(pattern = "mae_hg19",recursive = T,full.names = T),value = TRUE)
load(mae.file)
for(i in 1:100){
library(ELMER)
dir.create(paste0(tumor,"/hg19/plots/"),recursive = T)
scatter.plot(data = mae,
dir.out = paste0(tumor,"/hg19/plots/"),
save = TRUE,
byPair = list(probe = as.character(hg19_correlation$probe[i]),
gene = as.character(hg19_correlation$ensembl_gene_id[i])),
category = "definition")
}
}