1 Main libraries and configuration

dir.result <- "./DATASETS/matched_analysis_results/"
dir.dmr <- "../meta_analysis_region_results/step4_dmr_vs_cpgs/" 
dir.single.cpg <- "../meta_analysis_single_cpg_results/"        
dir.combp <- "../../../Michael/matchedAnalysis_revision_7-13-2020/comb-p_7_15_2020/"
dir.revision <- "../../../DRAFT_REVISION_NatComm_6-2020/Individual files/"
dir.fig <- "./FIGURES"
library(dplyr)
library(tidyr)
library(ExperimentHub)
library(GenomicRanges)
library(e1071)
library(meta)

2 Matched analysis

devtools::source_gist("https://gist.github.com/tiagochst/d3a7b1639acf603916c315d23b1efb3e")
## Sourcing https://gist.githubusercontent.com/tiagochst/d3a7b1639acf603916c315d23b1efb3e/raw/a14424662da343c1301b7b2f03210d28d16ae05c/functions.R
## SHA-1 hash of file is ef6f39dc4e5eddb5ca1c6e5af321e75ff06e9362
files <- dir(
  path = "../DATASETS",
  pattern = paste0(".*neuro"),
  recursive = TRUE,
  full.names = TRUE,
  ignore.case = TRUE
)
files
## [1] "../DATASETS/GASPARONI/step8_neuron_comp/pheno_withNeuronProp_df.RDS"    
## [2] "../DATASETS/LONDON/step6_neuron_comp/pheno107_PFC_withNeuronProp_df.RDS"
## [3] "../DATASETS/MtSinai/step6_neuron_comp/pheno141_withNeuronProp_df.RDS"   
## [4] "../DATASETS/ROSMAP/step8_neuron_comp/pheno_PFC_withNeuronProp_df.RDS"
pheno.ls <- lapply(files, function (f){
  readRDS(f)
})
names(pheno.ls) <- files %>% dirname %>% dirname %>% basename
lapply(pheno.ls,dim)
## $GASPARONI
## [1] 56  8
## 
## $LONDON
## [1] 107   9
## 
## $MtSinai
## [1] 141   8
## 
## $ROSMAP
## [1] 726  28

2.1 Rosmap dataset

rosmap <- pheno.ls$ROSMAP[, c("Sample", "msex", "braaksc", "age_death")]
rosmap$age <- rosmap$age_death
rosmap$status <- ifelse(rosmap$braaksc < 3, "Control", "Case")
rosmap$status <- as.factor(rosmap$status)
rosmap$sex <- ifelse(rosmap$msex == 1, "Male", "Female")
rosmap$sex <- as.factor(rosmap$sex)
str(rosmap)
## 'data.frame':    726 obs. of  7 variables:
##  $ Sample   : chr  "PT-313T" "PT-3142" "PT-318X" "PT-35N6" ...
##  $ msex     : num  1 0 1 1 0 0 0 0 1 1 ...
##  $ braaksc  : num  1 5 4 6 3 3 3 4 4 3 ...
##  $ age_death: num  84.4 90 87.9 89 90 ...
##  $ age      : num  84.4 90 87.9 89 90 ...
##  $ status   : Factor w/ 2 levels "Case","Control": 2 1 1 1 1 1 1 1 1 1 ...
##  $ sex      : Factor w/ 2 levels "Female","Male": 2 1 2 2 1 1 1 1 2 2 ...
{
  set.seed(5) # matchControl uses a distance calculation, if ties, it uses sample function
  m_ros <- matchControls(
    formula = status ~ age + sex,
    data = rosmap,
    contlabel = "Case",
    caselabel = "Control"
  )
}

outFile <- data.frame(
  "matchedControls" = rosmap[m_ros$cases, "Sample"], 
  "matchedCases" = rosmap[m_ros$controls, "Sample"]
)

outFileControl <- merge(
  x = outFile, 
  y = rosmap[, c("Sample", "status", "age", "sex")],
  by.x = "matchedControls",
  by.y = "Sample"
)

colnames(outFileControl)[3:5] <- c("status_control", "age_control", "sex_control")

outFileControlCases <- merge(
  x = outFileControl, 
  y = rosmap[, c("Sample", "status", "age", "sex")],
  by.x = "matchedCases",
  by.y = "Sample"
)

colnames(outFileControlCases)[6:8] <- c("status_case", "age_case", "sex_case")

outFileControlCases <- outFileControlCases[, c(2, 3, 4, 5, 1, 6, 7, 8)]
write.csv(
  outFileControlCases,
  paste0(dir.result, "Rosmap_matchedCaseControls.csv"),
  row.names = FALSE
)
ROSMAP_matchAgeSex <-  outFileControlCases %>% 
  dplyr::filter(abs(age_control - age_case) <= 1)
dim(ROSMAP_matchAgeSex)
## [1] 122   8
table(ROSMAP_matchAgeSex$status_control, ROSMAP_matchAgeSex$sex_control)
##          
##           Female Male
##   Case         0    0
##   Control     71   51
table(ROSMAP_matchAgeSex$status_case, ROSMAP_matchAgeSex$sex_case)
##          
##           Female Male
##   Case        71   51
##   Control      0    0
ROSMAP_samples <- c(
  as.character(ROSMAP_matchAgeSex$matchedControls),
  as.character(ROSMAP_matchAgeSex$matchedCases)
)
saveRDS(
  ROSMAP_samples,
  paste0(dir.result, "ROSMAP_samples.RDS")
)

2.1.1 Samples

length(readRDS(paste0(dir.result, "ROSMAP_samples.RDS")))
## [1] 244

2.1.2 Linear model

### Import datasets
cohort <- "ROSMAP"
pheno <- "pheno"

info_df <- readRDS(
  dir(path = "../DATASETS/ROSMAP////", pattern = "info", recursive = TRUE, full.names = TRUE)
)
mediansMval_df <- readRDS(
  dir(path = "../DATASETS/ROSMAP/", pattern = "medians", recursive = TRUE, full.names = TRUE)
)
pheno_df <- readRDS(
  dir(path = "../DATASETS/ROSMAP/", pattern = "Neuron", recursive = TRUE, full.names = TRUE)
)
samples <- readRDS(
  paste0(dir.result, "ROSMAP_samples.RDS")
)

### Limit samples
mediansMval_df <- mediansMval_df[, samples]
pheno_df <- pheno_df[pheno_df$Sample %in% samples, ]

mediansMval_df <- mediansMval_df[, pheno_df$Sample]

identical(pheno_df$Sample, colnames(mediansMval_df))

str(pheno_df)

pheno_df$age_death <- as.numeric(as.character(pheno_df$age_death))
pheno_df$msex <- as.factor(as.character(pheno_df$msex))
pheno_df$Slide <- as.factor(as.character(pheno_df$Slide))
pheno_df$batch <- as.factor(as.character(pheno_df$batch))

predictors_char <- "braaksc"
covariates_char <- c("prop.neuron", "batch")

res_df <- TestAllRegions_noInfo(
  predictors_char = predictors_char,
  covariates_char = covariates_char,
  pheno_df = pheno_df,
  summarizedRegions_df = mediansMval_df,
  cores = 4
)

colnames(res_df) <- c(
  paste0(cohort, "_estimate"),
  paste0(cohort, "_se"),
  paste0(cohort, "_pVal"),
  paste0(cohort, "_fdr")
)

res_withInfo_df <- cbind(info_df, res_df)
saveRDS(
  res_withInfo_df,
  paste0(dir.result, cohort, "_matched_linear_df.rds")
)

2.1.3 Single cpg linear model

### Import datasets
beta_mat <- readRDS(
  "../DATASETS/ROSMAP/step7_pca_filtering/ROSMAP_QNBMIQ_PCfiltered.RDS"
) 
pheno_df <- readRDS(
  dir(path = "../DATASETS/ROSMAP/", pattern = "Neuron", recursive = TRUE, full.names = TRUE)
)
samples <- readRDS(
  paste0(dir.result, "ROSMAP_samples.RDS")
)

### Limit samples
beta_mat <- beta_mat[, samples]
pheno_df <- pheno_df[pheno_df$Sample %in% samples, ]

beta_mat <- beta_mat[, pheno_df$Sample]

identical(pheno_df$Sample, colnames(beta_mat))
## [1] TRUE
### Compute M values
mval_mat <- log2(beta_mat / (1 - beta_mat))

pheno_df$sex <- as.factor(pheno_df$sex)
pheno_df$slide <- as.factor(pheno_df$Sentrix_ID)
pheno_df$batch <- as.factor(pheno_df$batch)

str(pheno_df)
## 'data.frame':    244 obs. of  29 variables:
##  $ Sample                : chr  "PT-313T" "PT-318X" "PT-35PM" "PT-BY89" ...
##  $ Sentrix_ID            : num  5.82e+09 6.04e+09 5.82e+09 5.82e+09 5.82e+09 ...
##  $ Sentrix_Position      : chr  "R04C01" "R02C02" "R05C01" "R04C02" ...
##  $ Sample_Plate          : chr  "WG0001091-MSA4" "WG0003259-MSA4" "WG0001091-MSA4" "WG0001338-MSA4" ...
##  $ Sample_Well           : chr  "H08" "H01" "A03" "F12" ...
##  $ Sample_Group          : chr  "PT-313T" "PT-318X" "PT-35PM" "PT-BY89" ...
##  $ batch                 : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 1 1 1 2 ...
##  $ Sentrix               : chr  "5822020006_R04C01" "6042324057_R02C02" "5822020007_R05C01" "5822020002_R04C02" ...
##  $ Slide                 : num  5.82e+09 6.04e+09 5.82e+09 5.82e+09 5.82e+09 ...
##  $ projid                : num  11464261 3713990 22209673 75990666 32705437 ...
##  $ Study                 : chr  "ROS" "MAP" "ROS" "MAP" ...
##  $ msex                  : num  1 1 0 1 1 1 0 0 1 1 ...
##  $ educ                  : num  16 12 18 12 16 13 16 16 14 13 ...
##  $ race                  : Factor w/ 4 levels "1","2","3","7": 1 1 2 1 1 1 1 1 1 1 ...
##  $ spanish               : Factor w/ 2 levels "1","2": 2 2 2 2 2 2 2 2 2 2 ...
##  $ apoe_genotype         : Factor w/ 6 levels "22","23","24",..: 2 4 5 5 4 4 4 4 2 4 ...
##  $ age_at_visit_max      : chr  "83.507186858316217" "87.668720054757017" "68.668035592060235" "87.463381245722104" ...
##  $ age_first_ad_dx       : chr  NA NA NA NA ...
##  $ age_death             : num  84.4 87.9 69.2 88 90 ...
##  $ cts_mmse30_first_ad_dx: num  NA NA NA NA NA NA NA NA NA NA ...
##  $ cts_mmse30_lv         : num  24 30 26 12 24 27 29 28 29 27 ...
##  $ pmi                   : num  4 4.33 8.5 7.58 13.47 ...
##  $ braaksc               : num  1 4 3 4 4 1 2 2 1 2 ...
##  $ ceradsc               : num  4 2 2 2 1 3 4 2 4 4 ...
##  $ cogdx                 : num  4 1 6 5 2 3 1 1 1 1 ...
##  $ stage3                : chr  "0-2" "3-4" "3-4" "3-4" ...
##  $ sex                   : Factor w/ 2 levels "Female","Male": 2 2 1 2 2 2 1 1 2 2 ...
##  $ prop.neuron           : num  0.458 0.395 0.41 0.401 0.569 ...
##  $ slide                 : Factor w/ 65 levels "5772325072","5772325089",..: 30 65 31 26 44 28 17 10 3 38 ...
is(pheno_df$braaksc,"numeric")
## [1] TRUE
is(pheno_df$prop.neuron,"numeric")
## [1] TRUE
predictors_char <- "braaksc"
covariates_char <- c("prop.neuron", "batch")

doParallel::registerDoParallel(cores = parallel::detectCores()/2)
devtools::source_gist("https://gist.github.com/tiagochst/d3a7b1639acf603916c315d23b1efb3e")

results_ordered_df <- plyr::adply(mval_mat,1, function(row){
  
  sumOneRegion_df <- as.data.frame(t(row))
  
  result <- TestSingleRegion(
    predictors_char = predictors_char,
    covariates_char = covariates_char,
    pheno_df = pheno_df,
    sumOneRegion_df = sumOneRegion_df
  )
  result
}, .progress = "time",.parallel = TRUE,.id = "cpg")
colnames(results_ordered_df)[1] <- "cpg"

identical(row.names(mval_mat), results_ordered_df$cpg %>% as.character())

write.csv(
  results_ordered_df,
  paste0(dir.result, "ROSMAP_matched_single_cpg_linear_df.csv"),
  row.names = FALSE
)
results_ordered_df <- readr::read_csv(
  paste0(dir.result, "ROSMAP_matched_single_cpg_linear_df.csv"),
  col_types = readr::cols()
)
dim(results_ordered_df)
## [1] 431803      4
results_ordered_df %>% head

2.2 London dataset

london <- pheno.ls$LONDON[, c("sample", "sex", "stage", "age.brain")]
london$status <- ifelse(london$stage < 3, "Control", "Case")
london$status <- as.factor(london$status)
london$sex <- as.factor(london$sex)
str(london)

{
  set.seed(5)
  m_london <- matchControls(
    formula = status ~ age.brain + sex,
    data = london,
    contlabel = "Case",
    caselabel = "Control"
  )
}
outFile <-  data.frame(
  "matchedControls" = london[m_london$cases, "sample"],
  "matchedCases" = london[m_london$controls, "sample"]
)

outFileControl <- merge(
  x = outFile,
  y = london[, c("sample", "status", "age.brain", "sex")],
  by.x = "matchedControls",
  by.y = "sample"
)
colnames(outFileControl)[3:5] <- c("status_control", "age_control", "sex_control")

outFileControlCases <- merge(
  x = outFileControl,
  y = london[, c("sample", "status", "age.brain", "sex")],
  by.x = "matchedCases",
  by.y = "sample"
)
colnames(outFileControlCases)[6:8] <- c("status_case", "age_case", "sex_case")
outFileControlCases <- outFileControlCases[, c(2, 3, 4, 5, 1, 6, 7, 8)]

write.csv(
  outFileControlCases,
  paste0(dir.result, "London_matchedCaseControls.csv"),
  row.names = FALSE
)

London_matchAgeSex <- outFileControlCases %>% 
  dplyr::filter(abs(age_control - age_case) <= 1)

dim(London_matchAgeSex)

table(London_matchAgeSex$status_control, London_matchAgeSex$sex_control)
table(London_matchAgeSex$status_case, London_matchAgeSex$sex_case)
London_samples <- c(
  as.character(London_matchAgeSex$matchedControls),
  as.character(London_matchAgeSex$matchedCases)
)
saveRDS(
  London_samples,
  paste0(dir.result, "London_samples.RDS")
)

2.2.1 Samples

length(readRDS(paste0(dir.result, "London_samples.RDS")))
## [1] 46

2.2.2 Linear model

### Import datasets
cohort <- "London"
pheno <- "pheno"

info_df <- readRDS(
  dir(path = "../DATASETS/LONDON///", pattern = "info", recursive = TRUE, full.names = TRUE)
)
mediansMval_df <- readRDS(
  dir(path = "../DATASETS/LONDON/", pattern = "medians", recursive = TRUE, full.names = TRUE)
)
pheno_df <- readRDS(
  dir(path = "../DATASETS/LONDON/", pattern = "NeuronProp_df", recursive = TRUE, full.names = TRUE)
)
samples <- readRDS(
  paste0(dir.result, "London_samples.RDS")
)

### Limit samples
mediansMval_df <- mediansMval_df[, samples]
pheno_df <- pheno_df[pheno_df$sample %in% samples, ]

mediansMval_df <- mediansMval_df[, pheno_df$sample]

### Check variables before fitting model
pheno_df$Sample <- pheno_df$sample

identical(pheno_df$Sample, colnames(mediansMval_df))
## [1] TRUE
str(pheno_df)
## 'data.frame':    46 obs. of  10 variables:
##  $ sample     : chr  "GSM1443251" "GSM1443256" "GSM1443263" "GSM1443269" ...
##  $ subject.id : num  1 2 4 5 6 7 9 10 13 18 ...
##  $ sentrix_id : chr  "6042316048_R05C01" "6042316066_R05C01" "7786923107_R02C01" "6042316121_R04C02" ...
##  $ slide      : chr  "6042316048" "6042316066" "7786923107" "6042316121" ...
##  $ age.brain  : num  82 82 81 92 78 86 90 82 80 82 ...
##  $ sex        : chr  "FEMALE" "FEMALE" "FEMALE" "FEMALE" ...
##  $ stage      : num  0 2 1 2 1 3 6 6 5 6 ...
##  $ stage3     : chr  "0-2" "0-2" "0-2" "0-2" ...
##  $ prop.neuron: num  0.377 0.226 0.206 0.456 0.424 ...
##  $ Sample     : chr  "GSM1443251" "GSM1443256" "GSM1443263" "GSM1443269" ...
pheno_df$sex <- as.factor(pheno_df$sex)
pheno_df$slide <- as.factor(pheno_df$slide)
# If rosmap cohort, don't forget batch effect
devtools::source_gist("https://gist.github.com/tiagochst/d3a7b1639acf603916c315d23b1efb3e")
predictors_char <- "stage"
covariates_char <- "prop.neuron"

res_df <- TestAllRegions_noInfo(
  predictors_char = predictors_char,
  covariates_char = covariates_char,
  pheno_df = pheno_df,
  summarizedRegions_df = mediansMval_df,
  cores = 4
)

colnames(res_df) <- c(
  paste0(cohort, "_estimate"),
  paste0(cohort, "_se"),
  paste0(cohort, "_pVal"),
  paste0(cohort, "_fdr")
)

res_withInfo_df <- cbind(info_df, res_df)
saveRDS(
  res_withInfo_df,
  paste0(dir.result, cohort, "_matched_linear_df.rds")
)

2.2.3 Single cpg linear model

### Import datasets
beta_mat <- readRDS(
  "../DATASETS/LONDON/step5_pca_filtering/London_PFC_QNBMIQ_PCfiltered.RDS"
) 
pheno_df <- readRDS(
  dir("../DATASETS/LONDON/",pattern = "NeuronProp_df",recursive = TRUE,full.names = TRUE)
)
samples <- readRDS(
  paste0(dir.result, "London_samples.RDS")
)

### Limit samples
pheno_df$Sample <- pheno_df$sample

beta_mat <- beta_mat[, samples]
pheno_df <- pheno_df[pheno_df$Sample %in% samples, ]

beta_mat <- beta_mat[, pheno_df$Sample]

identical(pheno_df$Sample, colnames(beta_mat))
## [1] TRUE
### Compute M values
mval_mat <- log2(beta_mat / (1 - beta_mat))

pheno_df$sex <- as.factor(pheno_df$sex)
pheno_df$slide <- as.factor(pheno_df$slide)

str(pheno_df)
## 'data.frame':    46 obs. of  10 variables:
##  $ sample     : chr  "GSM1443251" "GSM1443256" "GSM1443263" "GSM1443269" ...
##  $ subject.id : num  1 2 4 5 6 7 9 10 13 18 ...
##  $ sentrix_id : chr  "6042316048_R05C01" "6042316066_R05C01" "7786923107_R02C01" "6042316121_R04C02" ...
##  $ slide      : Factor w/ 12 levels "6042316035","6042316048",..: 2 3 9 8 6 1 5 11 7 2 ...
##  $ age.brain  : num  82 82 81 92 78 86 90 82 80 82 ...
##  $ sex        : Factor w/ 2 levels "FEMALE","MALE": 1 1 1 1 2 2 1 2 1 1 ...
##  $ stage      : num  0 2 1 2 1 3 6 6 5 6 ...
##  $ stage3     : chr  "0-2" "0-2" "0-2" "0-2" ...
##  $ prop.neuron: num  0.377 0.226 0.206 0.456 0.424 ...
##  $ Sample     : chr  "GSM1443251" "GSM1443256" "GSM1443263" "GSM1443269" ...
is(pheno_df$stage,"numeric")
## [1] TRUE
is(pheno_df$prop.neuron,"numeric")
## [1] TRUE
predictors_char <- "stage"
covariates_char <- "prop.neuron"

doParallel::registerDoParallel(cores = parallel::detectCores()/2)
devtools::source_gist("https://gist.github.com/tiagochst/d3a7b1639acf603916c315d23b1efb3e")

results_ordered_df <- plyr::adply(mval_mat,1, function(row){
  
  sumOneRegion_df <- as.data.frame(t(row))
  
  result <- TestSingleRegion(
    predictors_char = predictors_char,
    covariates_char = covariates_char,
    pheno_df = pheno_df,
    sumOneRegion_df = sumOneRegion_df
  )
  result
}, .progress = "time", .parallel = TRUE, .id = "cpg")
colnames(results_ordered_df)[1] <- "cpg"

identical(row.names(mval_mat), results_ordered_df$cpg %>% as.character())
write.csv(
  results_ordered_df,
  paste0(dir.result, "London_matched_single_cpg_linear_df.csv"),
  row.names = FALSE
)
results_ordered_df <- readr::read_csv(
  paste0(dir.result, "London_matched_single_cpg_linear_df.csv"),
  col_types = readr::cols()
)
dim(results_ordered_df)
## [1] 450793      4
results_ordered_df %>% head

2.3 Gasparoni dataset

write.csv(
  outFileControlCases,
  paste0(dir.result, "Gasparoni_matchedCaseControls.csv"),
  row.names = FALSE
)
Gasparoni_matchAgeSex <-  outFileControlCases %>% 
  dplyr::filter(abs(age_control - age_case) <= 1)
dim(Gasparoni_matchAgeSex)

table(Gasparoni_matchAgeSex$status_control, Gasparoni_matchAgeSex$sex_control)
table(Gasparoni_matchAgeSex$status_case, Gasparoni_matchAgeSex$sex_case)

Gasparoni_samples <- c(
  as.character(Gasparoni_matchAgeSex$matchedControls),
  as.character(Gasparoni_matchAgeSex$matchedCases)
)
saveRDS(
  Gasparoni_samples,
  paste0(dir.result, "Gasparoni_samples.RDS")
)

2.4 Mt.Sinai dataset

mtsinai <- pheno.ls$MtSinai[, c("sample", "sex", "stage", "age.brain")]
mtsinai$status <- ifelse(mtsinai$stage < 3, "Control", "Case")
mtsinai$status <- as.factor(mtsinai$status)
mtsinai$sex <- as.factor(mtsinai$sex)
str(mtsinai)

{
  set.seed(5)
  m_mtsinai <- matchControls(
    formula = status ~ age.brain + sex,
    data = mtsinai,
    contlabel = "Case",
    caselabel = "Control"
  )
}
outFile <- data.frame(
  "matchedControls" = mtsinai[m_mtsinai$cases, "sample"], 
  "matchedCases" = mtsinai[m_mtsinai$controls, "sample"]
)

outFileControl <- merge(
  outFile, 
  mtsinai[, c("sample", "status", "age.brain", "sex")], 
  by.x = "matchedControls", 
  by.y = "sample"
)

colnames(outFileControl)[3:5] <- c("status_control", "age_control", "sex_control")
outFileControlCases <-  merge(
  outFileControl, 
  mtsinai[, c("sample", "status", "age.brain", "sex")], 
  by.x = "matchedCases", 
  by.y = "sample"
)
colnames(outFileControlCases)[6:8] <- c("status_case", "age_case", "sex_case")
outFileControlCases <- outFileControlCases[, c(2, 3, 4, 5, 1, 6, 7, 8)]
write.csv(
  outFileControlCases,
  paste0(dir.result, "MtSinai_matchedCaseControls.csv"),
  row.names = FALSE
)
MtSinai_matchAgeSex <-  outFileControlCases %>% 
  dplyr::filter(abs(age_control - age_case) <= 1)
dim(MtSinai_matchAgeSex)

table(MtSinai_matchAgeSex$status_control, MtSinai_matchAgeSex$sex_control)
table(MtSinai_matchAgeSex$status_case, MtSinai_matchAgeSex$sex_case)

MtSinai_samples <- c(
  as.character(MtSinai_matchAgeSex$matchedControls),
  as.character(MtSinai_matchAgeSex$matchedCases)
)
saveRDS(
  MtSinai_samples,
  paste0(dir.result, "MtSinai_samples.RDS")
)

2.4.1 Samples

length(readRDS(paste0(dir.result, "MtSinai_samples.RDS")))
## [1] 56

2.4.2 Linear model

### Import datasets
cohort <- "MtSinai"
pheno <- "pheno"

info_df <- readRDS(
  dir("../DATASETS/MtSinai//",pattern = "info",recursive = TRUE,full.names = TRUE)
)
mediansMval_df <- readRDS(
  dir("../DATASETS/MtSinai/",pattern = "medians",recursive = TRUE,full.names = TRUE)
)
pheno_df <- readRDS(
  dir("../DATASETS/MtSinai/",pattern = "NeuronProp",recursive = TRUE,full.names = TRUE)
)
samples <- readRDS(
  paste0(dir.result, "MtSinai_samples.RDS")
)

### Limit samples
mediansMval_df <- mediansMval_df[, samples]
pheno_df <- pheno_df[pheno_df$sample %in% samples, ]

mediansMval_df <- mediansMval_df[, pheno_df$sample]

### Check variables before fitting model
pheno_df$Sample <- pheno_df$sample

identical(pheno_df$Sample, colnames(mediansMval_df))
## [1] TRUE
str(pheno_df)
## 'data.frame':    56 obs. of  9 variables:
##  $ sample     : chr  "GSM2139163" "GSM2139164" "GSM2139166" "GSM2139170" ...
##  $ subject.id : chr  "MS66" "MS546" "MS1549" "MS842" ...
##  $ sentrix_id : chr  "7796806144_R01C01" "7796806144_R01C02" "7796806144_R03C01" "7796806144_R05C01" ...
##  $ slide      : chr  "7796806144" "7796806144" "7796806144" "7796806144" ...
##  $ age.brain  : num  89 80 78 83 78 73 81 93 93 73 ...
##  $ sex        : chr  "male" "female" "male" "female" ...
##  $ stage      : num  5 2 4 1 2 6 2 6 2 6 ...
##  $ prop.neuron: num  0.243 0.202 0.304 0.329 0.235 ...
##  $ Sample     : chr  "GSM2139163" "GSM2139164" "GSM2139166" "GSM2139170" ...
pheno_df$sex <- as.factor(pheno_df$sex)
pheno_df$slide <- as.factor(pheno_df$slide)
# If rosmap cohort, don't forget batch effect
devtools::source_gist("https://gist.github.com/tiagochst/d3a7b1639acf603916c315d23b1efb3e")
predictors_char <- "stage"
covariates_char <- "prop.neuron"

res_df <- TestAllRegions_noInfo(
  predictors_char = predictors_char,
  covariates_char = covariates_char,
  pheno_df = pheno_df,
  summarizedRegions_df = mediansMval_df,
  cores = 4
)

colnames(res_df) <- c(
  paste0(cohort, "_estimate"),
  paste0(cohort, "_se"),
  paste0(cohort, "_pVal"),
  paste0(cohort, "_fdr")
)

res_withInfo_df <- cbind(info_df, res_df)
saveRDS(
  res_withInfo_df,
  paste0(dir.result, cohort, "_matched_linear_df.rds")
)

2.4.3 Single cpg linear model

### Import datasets
beta_mat <- readRDS(
  "../DATASETS/MtSinai/step5_pca_filtering/MtSinai_QNBMIQ_PCfiltered.RDS"
) 
pheno_df <- readRDS(
  dir("../DATASETS/MtSinai/",pattern = "Neuron",recursive = TRUE,full.names = TRUE)
)
samples <- readRDS(
  paste0(dir.result, "MtSinai_samples.RDS")
)

### Limit samples
pheno_df$Sample <- pheno_df$sample

beta_mat <- beta_mat[, samples]
pheno_df <- pheno_df[pheno_df$Sample %in% samples, ]

beta_mat <- beta_mat[, pheno_df$Sample]

identical(pheno_df$Sample, colnames(beta_mat))
## [1] TRUE
### Compute M values
mval_mat <- log2(beta_mat / (1 - beta_mat))

pheno_df$sex <- as.factor(pheno_df$sex)
pheno_df$slide <- as.factor(pheno_df$slide)

str(pheno_df)
## 'data.frame':    56 obs. of  9 variables:
##  $ sample     : chr  "GSM2139163" "GSM2139164" "GSM2139166" "GSM2139170" ...
##  $ subject.id : chr  "MS66" "MS546" "MS1549" "MS842" ...
##  $ sentrix_id : chr  "7796806144_R01C01" "7796806144_R01C02" "7796806144_R03C01" "7796806144_R05C01" ...
##  $ slide      : Factor w/ 12 levels "7796806144","7796814008",..: 1 1 1 1 1 2 2 2 2 2 ...
##  $ age.brain  : num  89 80 78 83 78 73 81 93 93 73 ...
##  $ sex        : Factor w/ 2 levels "female","male": 2 1 2 1 2 2 2 1 1 1 ...
##  $ stage      : num  5 2 4 1 2 6 2 6 2 6 ...
##  $ prop.neuron: num  0.243 0.202 0.304 0.329 0.235 ...
##  $ Sample     : chr  "GSM2139163" "GSM2139164" "GSM2139166" "GSM2139170" ...
is(pheno_df$stage,"numeric")
## [1] TRUE
is(pheno_df$prop.neuron,"numeric")
## [1] TRUE
predictors_char <- "stage"
covariates_char <- "prop.neuron"

doParallel::registerDoParallel(cores = parallel::detectCores()/2)
devtools::source_gist("https://gist.github.com/tiagochst/d3a7b1639acf603916c315d23b1efb3e")

results_ordered_df <- plyr::adply(mval_mat,1, function(row){
  
  sumOneRegion_df <- as.data.frame(t(row))
  
  result <- TestSingleRegion(
    predictors_char = predictors_char,
    covariates_char = covariates_char,
    pheno_df = pheno_df,
    sumOneRegion_df = sumOneRegion_df
  )
  result
}, .progress = "time",.parallel = TRUE,.id = "cpg")
colnames(results_ordered_df)[1] <- "cpg"

identical(row.names(mval_mat), results_ordered_df$cpg %>% as.character())
write.csv(
  results_ordered_df,
  paste0(dir.result, "MtSinai_matched_single_cpg_linear_df.csv"),
  row.names = FALSE
)
results_ordered_df <- readr::read_csv(
  paste0(dir.result, "MtSinai_matched_single_cpg_linear_df.csv"),
  col_types = readr::cols()
)
dim(results_ordered_df)
## [1] 434610      4
results_ordered_df %>% head

3 Meta-analysis of Genomic Regions

3.1 Import datasets and pre-process for each cohort

preMeta <- function(cohort){
  
  ### Load data
  file <- dir(
    path = "./DATASETS/matched_analysis_results",
    pattern = paste0(cohort,".*matched_linear_df"),
    recursive = T,
    full.names = TRUE,
    ignore.case = T
  )
  linear.results.final <- readRDS(file)
  
  ### select the most sig cometh region for each input region
  pva.col <- grep("_pVal",colnames(linear.results.final),value = TRUE)
  colnames(linear.results.final)[grep("inputRegion",colnames(linear.results.final))] <- "inputRegion"
  
  # !! is used to unquote an input 
  # https://dplyr.tidyverse.org/articles/programming.html
  result_sig <- linear.results.final %>%
    group_by(inputRegion) %>%
    filter((!!as.symbol(pva.col)) == min(!!as.symbol(pva.col)))
  
  data.frame(result_sig, stringsAsFactors = FALSE)
}

London_PFC <- preMeta(cohort = "London")
dim(London_PFC)

MtSinai <- preMeta(cohort = "MtSinai") 
dim(MtSinai)

ROSMAP <- preMeta(cohort = "ROSMAP")
dim(ROSMAP)

3.2 Merge cohorts

### merge datasets
cohort_ls <- list(
  London_PFC = London_PFC,
  MtSinai = MtSinai,
  ROSMAP = ROSMAP
)

### outer join input region
multi_cohorts <- Reduce(
  function(x,y, ...) merge(x, y, by = "inputRegion", all = TRUE, ...),
  cohort_ls
)

3.3 Meta analysis

doParallel::registerDoParallel(cores = parallel::detectCores()/2)
meta_df <- plyr::adply(
  .data = multi_cohorts, 
  .margins = 1, 
  .fun =  function(rowOne_df){
    
    est <- rowOne_df[grep("estimate",colnames(rowOne_df))] %>% as.numeric
    
    direction <-  paste(ifelse(
      is.na(est), ".",
      ifelse(est > 0, "+", "-")
    ),collapse = "")
    
    se <- rowOne_df[grep("se",colnames(rowOne_df))] %>% as.numeric
    cohort <- gsub("_se","",grep("se",colnames(rowOne_df),value = T))
    rowOne_reform_df <- data.frame(
      cohort = cohort,
      est = est,
      se = se,
      stringsAsFactors = FALSE
    )
    
    f <- metagen(
      TE = est,
      seTE = se,
      data = rowOne_reform_df
    )
    
    tibble::tibble(
      inputRegion = rowOne_df$inputRegion,
      estimate = f$TE.fixed,
      se = f$seTE.fixed,
      pVal.fixed = f$pval.fixed,
      pVal.random = f$pval.random,
      pValQ = f$pval.Q,
      direction = direction
    )
  }  , .progress = "time",
  .parallel = TRUE,
  .id = NULL
)

### create final pVal
meta_df$pVal.final <- ifelse(
  meta_df$pValQ > 0.05, meta_df$pVal.fixed, meta_df$pVal.random
)

### calculate fdr
meta_df$fdr <- p.adjust(meta_df$pVal.final, method = "fdr")

### order meta_df
meta_final_df <- meta_df[, c(grep("_",colnames(meta_df),invert = T),
                             grep("_",colnames(meta_df),invert = F))]
meta_final_ordered_df <- meta_final_df[order(meta_final_df$pVal.final),]

3.4 Add annotation to input regions

### find annotations
library(coMethDMR)

splited_input <- meta_final_ordered_df %>% 
  tidyr::separate(col = inputRegion,into =  c("chrom", "start", "end"),remove = FALSE)
splited_input_annot <- AnnotateResults(splited_input[,c("chrom", "start", "end")],nCores_int = 10) 

### merge annotation with meta analysis data
meta_ordered_withAnnot_df <- cbind(
  meta_final_ordered_df, splited_input_annot[, 4:7]
)

### order columns
meta_ordered_withAnnot_df <- meta_ordered_withAnnot_df %>% 
  dplyr::select(c(1,
                  (ncol(meta_ordered_withAnnot_df) - 3):ncol(meta_ordered_withAnnot_df),
                  2:(ncol(meta_ordered_withAnnot_df) - 4))
  )
### save dataset
write.csv(
  meta_ordered_withAnnot_df,
  paste0(dir.result, "meta_analysis_matched_df.csv"),
  row.names = FALSE
)

meta_all_sig <- meta_ordered_withAnnot_df[
  !is.na(meta_ordered_withAnnot_df$fdr) &
    (meta_ordered_withAnnot_df$fdr < 0.05),  
  ] #dim: 36 34
row.names(meta_all_sig) <- NULL
write.csv(
  meta_all_sig,
  paste0(dir.result, "meta_analysis_matched_sig_df.csv"),
  row.names = FALSE
)

3.5 Delete sig DMRs with cross-hybridizing or smoking probes

### call in all cross hybridizing probes
eh = ExperimentHub()
query(eh, "DMRcate")
crosshyb <- eh[["EH3129"]]

### Get significant smoking probes
smoking.file <- "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5267325/bin/NIHMS817273-supplement-001506_-_Supplemental_Tables.xlsx"

if(!file.exists(basename(smoking.file))) downloader::download(smoking.file,basename(smoking.file))

smoking <- readxl::read_xlsx(
  basename(smoking.file),
  sheet = "02",
  skip = 2
)
smoking.sig.probes <- smoking %>% dplyr::filter(`P-value` < 1*10^(-7)) %>% pull("Probe ID") 
length(smoking.sig.probes)

### Call in meta analysis final results
meta_all <- read.csv(
  paste0(dir.result, "meta_analysis_matched_sig_df.csv")
) #dim: 36 34

### Find files with regions and probes
files <- dir(
  path = "../DATASETS",
  pattern = paste0(".*_residuals_cometh_input_ls.rds"),
  recursive = T,
  full.names = TRUE,
  ignore.case = TRUE
)
files <- grep("LONDON_PFC|MtSinai|ROSMAP",files,value = TRUE,ignore.case = TRUE)

### Read files and Limit the cohort_ls to cohort_coMethRegion in meta_all 
cometh.probes.list <- lapply(files, function(f){
  print(f)
  all.clusters <- readRDS(f)$coMeth_ls # Read files
  cohort <- f %>% dirname %>% dirname %>% basename # get cohort from folder name
  
  col <- grep(paste0(cohort,"_coMethRegion"),
              colnames(meta_all),
              ignore.case = TRUE,
              value = TRUE) # get column with cohort sig regions
  
  # keep sig regions only
  all.clusters[names(all.clusters) %in% meta_all[[col]]]
})
names(cometh.probes.list) <-  files %>% dirname %>% dirname %>% basename

lapply(cometh.probes.list,length)

### Map probes in each region to smoking and crosshybrdizing
extractCrosHybridization <- function(probes.list){
  crosshyb.extracted <- plyr::laply(probes.list,function(probes){
    paste(intersect(probes, crosshyb), 
          collapse = ", "
    )
  })
  smoking.extracted <- plyr::laply(probes.list,function(probes){
    paste(intersect(probes, smoking.sig.probes), 
          collapse = ", "
    )
  })
  tibble::tibble(
    "coMethRegion" = names(probes.list),
    "crossHyb" = crosshyb.extracted, 
    "crossHyb_bi" = ifelse(crosshyb.extracted == "",0,1),
    "smoke" = smoking.extracted,
    "smoke_bi" = ifelse(smoking.extracted == "",0,1)
  )
}

London_PFC_crossHyb_df <- extractCrosHybridization(cometh.probes.list$LONDON)
colnames(London_PFC_crossHyb_df) <- paste0("London_",colnames(London_PFC_crossHyb_df))
plyr::count(
  London_PFC_crossHyb_df, 
  vars = grep("_bi",colnames(London_PFC_crossHyb_df),value = TRUE)
)

MtSinai_crossHyb_df <- extractCrosHybridization(cometh.probes.list$MtSinai)
colnames(MtSinai_crossHyb_df) <- paste0("MtSinai_",colnames(MtSinai_crossHyb_df))
plyr::count(
  MtSinai_crossHyb_df, 
  vars = grep("_bi",colnames(MtSinai_crossHyb_df),value = TRUE)
)

ROSMAP_crossHyb_df <- extractCrosHybridization(cometh.probes.list$ROSMAP)
colnames(ROSMAP_crossHyb_df) <- paste0("ROSMAP_",colnames(ROSMAP_crossHyb_df))
plyr::count(ROSMAP_crossHyb_df, vars = grep("_bi",colnames(ROSMAP_crossHyb_df),value = TRUE))

### Merge smoking and crossHyb probes information with meta analysis results 
meta_all_final <- meta_all %>%
  left_join(London_PFC_crossHyb_df) %>% 
  left_join(MtSinai_crossHyb_df) %>% 
  left_join(ROSMAP_crossHyb_df)

### Add information with input regions with any cross hybridization in cohorts 
meta_all_final$crossHyb_bi <- rowSums(meta_all_final[,grep("crossHyb_bi",colnames(meta_all_final))],na.rm = TRUE) > 0
meta_all_final$smoke_bi <- rowSums(meta_all_final[,grep("smoke_bi",colnames(meta_all_final))],na.rm = TRUE) > 0

# Sort by region meta analysis FDR
# Cluster columns of the projects together
meta_all_final <- meta_all_final[
  order(meta_all_final$fdr),
  c(grep("MtSinai|London|ROSMAP",colnames(meta_all_final),ignore.case = TRUE,invert = TRUE),
    grep("Gasparoni",colnames(meta_all_final),ignore.case = TRUE),
    grep("MtSinai",colnames(meta_all_final),ignore.case = TRUE),
    grep("London",colnames(meta_all_final),ignore.case = TRUE),
    grep("ROSMAP",colnames(meta_all_final),ignore.case = TRUE)
  )
 ]
str(meta_all_final)

meta_sig_final <- meta_all_final[
  meta_all_final$crossHyb_bi == 0 &
    meta_all_final$smoke_bi == 0,
] #dim: 32 48
### Save
write.csv(
  meta_sig_final %>% as.data.frame,
  paste0(dir.result, "meta_analysis_matched_sig_no_crossHyb_smoking_df.csv"),
  row.names = FALSE
)

3.6 Results

4 Meta-analysis Single CpGs

  1. merge cohorts
  2. meta analysis
  3. add annotation
  4. Delete cross-hybridizing and smoking probes from sig probes

4.1 Import datasets

results.files <- dir(
  path = "./DATASETS/matched_analysis_results/",
  pattern = "single_cpg_linear_df.csv",
  recursive = TRUE,
  full.names = TRUE,
  ignore.case = TRUE
)

for(i in results.files){
  data <- readr::read_csv(i)
  dataset <- unlist(stringr::str_split(basename(i),"\\/|\\_"))[1]  %>% as.character()
  aux <- paste0(dataset,c("_estimate", "_se", "_pValue"))
  colnames(data) <- c("cpg", aux)
  assign(dataset,data)
}

4.2 Merge cohorts

cohort_ls <- list(
  London_PFC = London,
  MtSinai = MtSinai,
  ROSMAP = ROSMAP
)

### outer join input region
multi_cohorts <- Reduce(
  function(x,y, ...) merge(x, y, by = "cpg", all = TRUE, ...),
  cohort_ls
) #dim: 450793 10
dim(multi_cohorts)

4.3 Meta analysis

### calculate meta analysis z scores and p values
doParallel::registerDoParallel(cores = parallel::detectCores()/2)
meta_df <- plyr::adply(
  .data = multi_cohorts, 
  .margins = 1, 
  .fun =  function(rowOne_df){
    
    est <- rowOne_df[grep("estimate",colnames(rowOne_df))] %>% as.numeric
    
    direction <-  paste(ifelse(
      is.na(est), ".",
      ifelse(est > 0, "+", "-")
    ),collapse = "")
    
    se <- rowOne_df[grep("se",colnames(rowOne_df))] %>% as.numeric
    cohort <- gsub("_se","",grep("se",colnames(rowOne_df),value = T))
    rowOne_reform_df <- data.frame(
      cohort = cohort,
      est = est,
      se = se,
      stringsAsFactors = FALSE
    )
    
    f <- metagen(
      TE = est,
      seTE = se,
      data = rowOne_reform_df
    )
    
    tibble::tibble(
      cpg = rowOne_df$cpg,
      estimate = f$TE.fixed,
      se = f$seTE.fixed,
      pVal.fixed = f$pval.fixed,
      pVal.random = f$pval.random,
      pValQ = f$pval.Q,
      direction = direction
    )
  }  , .progress = "time",
  .parallel = TRUE,
  .id = NULL
)

### create final pVal
meta_df$pVal.final <- ifelse(
  meta_df$pValQ > 0.05, meta_df$pVal.fixed, meta_df$pVal.random
)

### calculate fdr
meta_df$fdr <- p.adjust(meta_df$pVal.final, method = "fdr")

### order meta_df
meta_final_df <- meta_df[, c(grep("_",colnames(meta_df),invert = T),
                             grep("_",colnames(meta_df),invert = F))]
meta_final_ordered_df <- meta_final_df[order(meta_final_df$pVal.final),]

4.4 Add annotation

library(sesame)
probes.info <- sesameDataGet("HM450.hg19.manifest")
probes.info <- probes.info[meta_final_ordered_df$cpg %>% as.character()] %>% 
  as.data.frame %>% 
  dplyr::select(c("seqnames","start","end"))

result_annot_df <- merge(
  y = meta_final_ordered_df,
  x = probes.info,
  by.y = "cpg",
  by.x = "row.names",
  all.y = TRUE,
  sort = FALSE
)
### final raw data
write.csv(
  result_annot_df  %>% as.data.frame(),
  paste0(dir.result, "meta_analysis_matched_single_cpg_df.csv"),
  row.names = FALSE
)
### prepare data for comb-p
result_for_combp_df <- result_annot_df[
  , c("seqnames", "start", "end", "pVal.final")
  ]
colnames(result_for_combp_df)[c(1,4)] <- c("chr", "pValue")
result_for_combp_df$chr <- as.character(result_for_combp_df$chr)
write.csv(
  result_for_combp_df,
  paste0(dir.result, "meta_analysis_matched_single_cpg_df_for_combp.csv"),
  row.names = FALSE
)

4.5 Delete cross-hybridizing and smoking probes from sig probes

### Exclude non-significant probes
result_annot_sig_df <- result_annot_df %>% filter(fdr < 0.05) 
write.csv(
  result_annot_sig_df %>% as.data.frame(),
  paste0(dir.result, "meta_analysis_matched_single_cpg_sig_df.csv"),
  row.names = FALSE
)
### Get crosshybrdizing probes
library(ExperimentHub)

eh = ExperimentHub()
query(eh, "DMRcate")
crosshyb <- eh[["EH3129"]]

### Get significant smoking probes
smoking.file <- "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5267325/bin/NIHMS817273-supplement-001506_-_Supplemental_Tables.xlsx"

if(!file.exists(basename(smoking.file))) downloader::download(smoking.file,basename(smoking.file))

smoking <- readxl::read_xlsx(
  basename(smoking.file),
  sheet = "02",
  skip = 2
)
smoking.sig.probes <- smoking %>% dplyr::filter(`P-value` < 1*10^(-7)) %>% pull("Probe ID")


### Exclude cross-hybridizing and smoking probes
result_annot_sig_df$Row.names <- as.character(result_annot_sig_df$Row.names)

result_annot_sig_no_crossHyb_smoking_df <- result_annot_sig_df[
  !((result_annot_sig_df$Row.names %in% crosshyb) |
      (result_annot_sig_df$Row.names %in% smoking.sig.probes)),
  ] #dim: 642 24

### Add annotation
library(coMethDMR)
result_annot_sig_no_crossHyb_smoking_df$chrom <- as.character(
  result_annot_sig_no_crossHyb_smoking_df$seqnames
)
result_final <- AnnotateResults(result_annot_sig_no_crossHyb_smoking_df)

result_final_ordered <- result_final[
  ,c(1:4, 23:26, 5:21)
]
write.csv(
  result_final_ordered %>% as.data.frame(),
  paste0(dir.result, "meta_analysis_matched_single_cpg_sig_no_crossHyb_smoking_df.csv"),
  row.names = FALSE
)           

4.6 Results

5 Compare matched-analysis results with main meta-analysis results

sig_cpgs <- readxl::read_xlsx(
    path = dir(dir.revision,pattern = "ALL",full.names = TRUE),
    sheet = "Supp Table 1",
    skip = 3
)

sig_dmrs <- readxl::read_xlsx(
    path = dir(dir.revision,pattern = "ALL",full.names = TRUE),
    sheet = "Supp Table 2",
    skip = 3
)

matched_cpgs <- read.csv(
  paste0(dir.result, "meta_analysis_matched_single_cpg_sig_no_crossHyb_smoking_df.csv")
)

matched_dmrs <- read.csv(
  paste0(dir.result, "meta_analysis_matched_sig_no_crossHyb_smoking_df.csv")
)
sig_cpgs <- sig_cpgs[, 1:17]
matched_cpgs <- matched_cpgs[, c(1, 9:16)]
colnames(matched_cpgs) <- c(
    "cpg", paste0("matched_", colnames(matched_cpgs)[2:ncol(matched_cpgs)])
)
sig_matched_cpgs <- merge(
    sig_cpgs, 
    matched_cpgs,
    by = "cpg",
    sort = FALSE
)
nrow(sig_matched_cpgs)
## [1] 129
meta_cpg_direction <- ifelse(
    sig_matched_cpgs$estimate > 0, "+", "-"
)
matched_cpg_direction <- ifelse(
    sig_matched_cpgs$matched_estimate > 0, "+", "-"
)
table(meta_cpg_direction == matched_cpg_direction)
## 
## TRUE 
##  129
sig_dmrs <- sig_dmrs[, 1:15]
matched_dmrs <- matched_dmrs[, c(1, 6:13)]
colnames(matched_dmrs) <- c(
    "DMR", paste0("matched_", colnames(matched_dmrs)[2:ncol(matched_dmrs)])
)
sig_matched_dmrs <- merge(
    sig_dmrs, matched_dmrs,
    by = "DMR",
    sort = FALSE
)
nrow(sig_matched_dmrs)
## [1] 16
meta_dmr_direction <- ifelse(
    sig_matched_dmrs$estimate > 0, "+", "-"
)
matched_dmr_direction <- ifelse(
    sig_matched_dmrs$matched_estimate > 0, "+", "-"
)
table(meta_dmr_direction == matched_dmr_direction)
## 
## TRUE 
##   16
write.csv(
   sig_matched_cpgs,
   paste0(dir.result, "meta_matched_common_cpgs.csv"),
   row.names = FALSE
)

write.csv(
   sig_matched_dmrs,
   paste0(dir.result, "meta_matched_common_dmrs.csv"),
   row.names = FALSE
)

6 Session information

devtools::session_info()
## ─ Session info ───────────────────────────────────────────────────────────────
##  setting  value                       
##  version  R version 4.0.2 (2020-06-22)
##  os       macOS Catalina 10.15.6      
##  system   x86_64, darwin17.0          
##  ui       X11                         
##  language (EN)                        
##  collate  en_US.UTF-8                 
##  ctype    en_US.UTF-8                 
##  tz       America/New_York            
##  date     2020-08-11                  
## 
## ─ Packages ───────────────────────────────────────────────────────────────────
##  package                * version  date       lib source        
##  AnnotationDbi            1.50.3   2020-07-25 [1] Bioconductor  
##  AnnotationHub          * 2.20.0   2020-04-27 [1] Bioconductor  
##  assertthat               0.2.1    2019-03-21 [1] CRAN (R 4.0.2)
##  backports                1.1.8    2020-06-17 [1] CRAN (R 4.0.2)
##  base64enc                0.1-3    2015-07-28 [1] CRAN (R 4.0.2)
##  Biobase                  2.48.0   2020-04-27 [1] Bioconductor  
##  BiocFileCache          * 1.12.0   2020-04-27 [1] Bioconductor  
##  BiocGenerics           * 0.34.0   2020-04-27 [1] Bioconductor  
##  BiocManager              1.30.10  2019-11-16 [1] CRAN (R 4.0.2)
##  BiocVersion              3.11.1   2020-04-07 [1] Bioconductor  
##  bit                      4.0.3    2020-07-30 [1] CRAN (R 4.0.2)
##  bit64                    4.0.2    2020-07-30 [1] CRAN (R 4.0.2)
##  bitops                   1.0-6    2013-08-17 [1] CRAN (R 4.0.2)
##  blob                     1.2.1    2020-01-20 [1] CRAN (R 4.0.2)
##  boot                     1.3-25   2020-04-26 [1] CRAN (R 4.0.2)
##  callr                    3.4.3    2020-03-28 [1] CRAN (R 4.0.2)
##  cellranger               1.1.0    2016-07-27 [1] CRAN (R 4.0.2)
##  class                    7.3-17   2020-04-26 [1] CRAN (R 4.0.2)
##  cli                      2.0.2    2020-02-28 [1] CRAN (R 4.0.2)
##  cluster                  2.1.0    2019-06-19 [1] CRAN (R 4.0.2)
##  CompQuadForm             1.4.3    2017-04-12 [1] CRAN (R 4.0.2)
##  crayon                   1.3.4    2017-09-16 [1] CRAN (R 4.0.2)
##  curl                     4.3      2019-12-02 [1] CRAN (R 4.0.1)
##  DBI                      1.1.0    2019-12-15 [1] CRAN (R 4.0.2)
##  dbplyr                 * 1.4.4    2020-05-27 [1] CRAN (R 4.0.2)
##  desc                     1.2.0    2018-05-01 [1] CRAN (R 4.0.2)
##  devtools                 2.3.1    2020-07-21 [1] CRAN (R 4.0.2)
##  digest                   0.6.25   2020-02-23 [1] CRAN (R 4.0.2)
##  dplyr                  * 1.0.1    2020-07-31 [1] CRAN (R 4.0.2)
##  e1071                  * 1.7-3    2019-11-26 [1] CRAN (R 4.0.2)
##  ellipsis                 0.3.1    2020-05-15 [1] CRAN (R 4.0.2)
##  evaluate                 0.14     2019-05-28 [1] CRAN (R 4.0.1)
##  ExperimentHub          * 1.14.0   2020-04-27 [1] Bioconductor  
##  fansi                    0.4.1    2020-01-08 [1] CRAN (R 4.0.2)
##  fastmap                  1.0.1    2019-10-08 [1] CRAN (R 4.0.2)
##  fs                       1.5.0    2020-07-31 [1] CRAN (R 4.0.2)
##  generics                 0.0.2    2018-11-29 [1] CRAN (R 4.0.2)
##  GenomeInfoDb           * 1.24.2   2020-06-15 [1] Bioconductor  
##  GenomeInfoDbData         1.2.3    2020-07-23 [1] Bioconductor  
##  GenomicRanges          * 1.40.0   2020-04-27 [1] Bioconductor  
##  glue                     1.4.1    2020-05-13 [1] CRAN (R 4.0.2)
##  hms                      0.5.3    2020-01-08 [1] CRAN (R 4.0.2)
##  htmltools                0.5.0    2020-06-16 [1] CRAN (R 4.0.2)
##  httpuv                   1.5.4    2020-06-06 [1] CRAN (R 4.0.2)
##  httr                     1.4.2    2020-07-20 [1] CRAN (R 4.0.2)
##  interactiveDisplayBase   1.26.3   2020-06-02 [1] Bioconductor  
##  IRanges                * 2.22.2   2020-05-21 [1] Bioconductor  
##  jsonlite                 1.7.0    2020-06-25 [1] CRAN (R 4.0.2)
##  knitr                    1.29     2020-06-23 [1] CRAN (R 4.0.2)
##  later                    1.1.0.1  2020-06-05 [1] CRAN (R 4.0.2)
##  lattice                  0.20-41  2020-04-02 [1] CRAN (R 4.0.2)
##  lifecycle                0.2.0    2020-03-06 [1] CRAN (R 4.0.2)
##  lme4                     1.1-23   2020-04-07 [1] CRAN (R 4.0.1)
##  magrittr                 1.5      2014-11-22 [1] CRAN (R 4.0.2)
##  MASS                     7.3-51.6 2020-04-26 [1] CRAN (R 4.0.2)
##  Matrix                   1.2-18   2019-11-27 [1] CRAN (R 4.0.2)
##  memoise                  1.1.0    2017-04-21 [1] CRAN (R 4.0.2)
##  meta                   * 4.13-0   2020-07-03 [1] CRAN (R 4.0.2)
##  metafor                  2.4-0    2020-03-19 [1] CRAN (R 4.0.2)
##  mime                     0.9      2020-02-04 [1] CRAN (R 4.0.2)
##  minqa                    1.2.4    2014-10-09 [1] CRAN (R 4.0.2)
##  nlme                     3.1-148  2020-05-24 [1] CRAN (R 4.0.2)
##  nloptr                   1.2.2.2  2020-07-02 [1] CRAN (R 4.0.2)
##  pillar                   1.4.6    2020-07-10 [1] CRAN (R 4.0.2)
##  pkgbuild                 1.1.0    2020-07-13 [1] CRAN (R 4.0.2)
##  pkgconfig                2.0.3    2019-09-22 [1] CRAN (R 4.0.2)
##  pkgload                  1.1.0    2020-05-29 [1] CRAN (R 4.0.2)
##  prettyunits              1.1.1    2020-01-24 [1] CRAN (R 4.0.2)
##  processx                 3.4.3    2020-07-05 [1] CRAN (R 4.0.2)
##  promises                 1.1.1    2020-06-09 [1] CRAN (R 4.0.2)
##  ps                       1.3.3    2020-05-08 [1] CRAN (R 4.0.2)
##  purrr                    0.3.4    2020-04-17 [1] CRAN (R 4.0.2)
##  R6                       2.4.1    2019-11-12 [1] CRAN (R 4.0.2)
##  rappdirs                 0.3.1    2016-03-28 [1] CRAN (R 4.0.2)
##  Rcpp                     1.0.5    2020-07-06 [1] CRAN (R 4.0.2)
##  RCurl                    1.98-1.2 2020-04-18 [1] CRAN (R 4.0.2)
##  readr                    1.3.1    2018-12-21 [1] CRAN (R 4.0.2)
##  readxl                   1.3.1    2019-03-13 [1] CRAN (R 4.0.2)
##  remotes                  2.2.0    2020-07-21 [1] CRAN (R 4.0.2)
##  rlang                    0.4.7    2020-07-09 [1] CRAN (R 4.0.2)
##  rmarkdown                2.3      2020-06-18 [1] CRAN (R 4.0.2)
##  rprojroot                1.3-2    2018-01-03 [1] CRAN (R 4.0.2)
##  RSQLite                  2.2.0    2020-01-07 [1] CRAN (R 4.0.2)
##  S4Vectors              * 0.26.1   2020-05-16 [1] Bioconductor  
##  sessioninfo              1.1.1    2018-11-05 [1] CRAN (R 4.0.2)
##  shiny                    1.5.0    2020-06-23 [1] CRAN (R 4.0.2)
##  statmod                  1.4.34   2020-02-17 [1] CRAN (R 4.0.2)
##  stringi                  1.4.6    2020-02-17 [1] CRAN (R 4.0.2)
##  stringr                  1.4.0    2019-02-10 [1] CRAN (R 4.0.2)
##  testthat                 2.3.2    2020-03-02 [1] CRAN (R 4.0.2)
##  tibble                   3.0.3    2020-07-10 [1] CRAN (R 4.0.2)
##  tidyr                  * 1.1.1    2020-07-31 [1] CRAN (R 4.0.2)
##  tidyselect               1.1.0    2020-05-11 [1] CRAN (R 4.0.2)
##  usethis                  1.6.1    2020-04-29 [1] CRAN (R 4.0.2)
##  vctrs                    0.3.2    2020-07-15 [1] CRAN (R 4.0.2)
##  withr                    2.2.0    2020-04-20 [1] CRAN (R 4.0.2)
##  xfun                     0.16     2020-07-24 [1] CRAN (R 4.0.2)
##  xtable                   1.8-4    2019-04-21 [1] CRAN (R 4.0.2)
##  XVector                  0.28.0   2020-04-27 [1] Bioconductor  
##  yaml                     2.2.1    2020-02-01 [1] CRAN (R 4.0.2)
##  zlibbioc                 1.34.0   2020-04-27 [1] Bioconductor  
## 
## [1] /Library/Frameworks/R.framework/Versions/4.0/Resources/library
---
title: "Meta-analysis with matched samples (by age at death and sex)"
author: "Lanyu Zhang, Tiago C. Silva, Lily Wang"
date: "`r Sys.Date()`"
output:
  rmarkdown::html_document:
    theme: lumen
    toc: true
    number_sections: true
    df_print: paged
    code_download: true
    toc_float:
      collapsed: yes
    toc_depth: 3
editor_options:
  chunk_output_type: inline    
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, warning = FALSE)
```

# Main libraries and configuration

```{R}
dir.result <- "./DATASETS/matched_analysis_results/"
dir.dmr <- "../meta_analysis_region_results/step4_dmr_vs_cpgs/" 
dir.single.cpg <- "../meta_analysis_single_cpg_results/"        
dir.combp <- "../../../Michael/matchedAnalysis_revision_7-13-2020/comb-p_7_15_2020/"
dir.revision <- "../../../DRAFT_REVISION_NatComm_6-2020/Individual files/"
dir.fig <- "./FIGURES"
```

```{R, message = FALSE, results = "hide"}
library(dplyr)
library(tidyr)
library(ExperimentHub)
library(GenomicRanges)
library(e1071)
library(meta)
```

# Matched analysis

```{R}
devtools::source_gist("https://gist.github.com/tiagochst/d3a7b1639acf603916c315d23b1efb3e")
```

```{R}
files <- dir(
  path = "../DATASETS",
  pattern = paste0(".*neuro"),
  recursive = TRUE,
  full.names = TRUE,
  ignore.case = TRUE
)
```

```{R, echo = FALSE}
files <- files[c(1,2,4,5)]
```

```{R}
files
pheno.ls <- lapply(files, function (f){
  readRDS(f)
})
names(pheno.ls) <- files %>% dirname %>% dirname %>% basename
lapply(pheno.ls,dim)
```

## Rosmap dataset

```{R, eval = TRUE}
rosmap <- pheno.ls$ROSMAP[, c("Sample", "msex", "braaksc", "age_death")]
rosmap$age <- rosmap$age_death
rosmap$status <- ifelse(rosmap$braaksc < 3, "Control", "Case")
rosmap$status <- as.factor(rosmap$status)
rosmap$sex <- ifelse(rosmap$msex == 1, "Male", "Female")
rosmap$sex <- as.factor(rosmap$sex)
str(rosmap)


{
  set.seed(5) # matchControl uses a distance calculation, if ties, it uses sample function
  m_ros <- matchControls(
    formula = status ~ age + sex,
    data = rosmap,
    contlabel = "Case",
    caselabel = "Control"
  )
}

outFile <- data.frame(
  "matchedControls" = rosmap[m_ros$cases, "Sample"], 
  "matchedCases" = rosmap[m_ros$controls, "Sample"]
)

outFileControl <- merge(
  x = outFile, 
  y = rosmap[, c("Sample", "status", "age", "sex")],
  by.x = "matchedControls",
  by.y = "Sample"
)

colnames(outFileControl)[3:5] <- c("status_control", "age_control", "sex_control")

outFileControlCases <- merge(
  x = outFileControl, 
  y = rosmap[, c("Sample", "status", "age", "sex")],
  by.x = "matchedCases",
  by.y = "Sample"
)

colnames(outFileControlCases)[6:8] <- c("status_case", "age_case", "sex_case")

outFileControlCases <- outFileControlCases[, c(2, 3, 4, 5, 1, 6, 7, 8)]
```

```{R, eval = FALSE}
write.csv(
  outFileControlCases,
  paste0(dir.result, "Rosmap_matchedCaseControls.csv"),
  row.names = FALSE
)
```

```{R}
ROSMAP_matchAgeSex <-  outFileControlCases %>% 
  dplyr::filter(abs(age_control - age_case) <= 1)
dim(ROSMAP_matchAgeSex)

table(ROSMAP_matchAgeSex$status_control, ROSMAP_matchAgeSex$sex_control)
table(ROSMAP_matchAgeSex$status_case, ROSMAP_matchAgeSex$sex_case)

ROSMAP_samples <- c(
  as.character(ROSMAP_matchAgeSex$matchedControls),
  as.character(ROSMAP_matchAgeSex$matchedCases)
)
```

```{R, eval = FALSE}
saveRDS(
  ROSMAP_samples,
  paste0(dir.result, "ROSMAP_samples.RDS")
)
```

### Samples

```{R}
length(readRDS(paste0(dir.result, "ROSMAP_samples.RDS")))
```

### Linear model

```{R ROSMAP_DMR, eval = FALSE}
### Import datasets
cohort <- "ROSMAP"
pheno <- "pheno"

info_df <- readRDS(
  dir(path = "../DATASETS/ROSMAP////", pattern = "info", recursive = TRUE, full.names = TRUE)
)
mediansMval_df <- readRDS(
  dir(path = "../DATASETS/ROSMAP/", pattern = "medians", recursive = TRUE, full.names = TRUE)
)
pheno_df <- readRDS(
  dir(path = "../DATASETS/ROSMAP/", pattern = "Neuron", recursive = TRUE, full.names = TRUE)
)
samples <- readRDS(
  paste0(dir.result, "ROSMAP_samples.RDS")
)

### Limit samples
mediansMval_df <- mediansMval_df[, samples]
pheno_df <- pheno_df[pheno_df$Sample %in% samples, ]

mediansMval_df <- mediansMval_df[, pheno_df$Sample]

identical(pheno_df$Sample, colnames(mediansMval_df))

str(pheno_df)

pheno_df$age_death <- as.numeric(as.character(pheno_df$age_death))
pheno_df$msex <- as.factor(as.character(pheno_df$msex))
pheno_df$Slide <- as.factor(as.character(pheno_df$Slide))
pheno_df$batch <- as.factor(as.character(pheno_df$batch))

predictors_char <- "braaksc"
covariates_char <- c("prop.neuron", "batch")

res_df <- TestAllRegions_noInfo(
  predictors_char = predictors_char,
  covariates_char = covariates_char,
  pheno_df = pheno_df,
  summarizedRegions_df = mediansMval_df,
  cores = 4
)

colnames(res_df) <- c(
  paste0(cohort, "_estimate"),
  paste0(cohort, "_se"),
  paste0(cohort, "_pVal"),
  paste0(cohort, "_fdr")
)

res_withInfo_df <- cbind(info_df, res_df)
```

```{R, eval = FALSE}
saveRDS(
  res_withInfo_df,
  paste0(dir.result, cohort, "_matched_linear_df.rds")
)
```

### Single cpg linear model

```{R ROSMAP_singleCpG, eval = TRUE}
### Import datasets
beta_mat <- readRDS(
  "../DATASETS/ROSMAP/step7_pca_filtering/ROSMAP_QNBMIQ_PCfiltered.RDS"
) 
pheno_df <- readRDS(
  dir(path = "../DATASETS/ROSMAP/", pattern = "Neuron", recursive = TRUE, full.names = TRUE)
)
samples <- readRDS(
  paste0(dir.result, "ROSMAP_samples.RDS")
)

### Limit samples
beta_mat <- beta_mat[, samples]
pheno_df <- pheno_df[pheno_df$Sample %in% samples, ]

beta_mat <- beta_mat[, pheno_df$Sample]

identical(pheno_df$Sample, colnames(beta_mat))

### Compute M values
mval_mat <- log2(beta_mat / (1 - beta_mat))

pheno_df$sex <- as.factor(pheno_df$sex)
pheno_df$slide <- as.factor(pheno_df$Sentrix_ID)
pheno_df$batch <- as.factor(pheno_df$batch)

str(pheno_df)

is(pheno_df$braaksc,"numeric")
is(pheno_df$prop.neuron,"numeric")
```

```{R, eval = FALSE}
predictors_char <- "braaksc"
covariates_char <- c("prop.neuron", "batch")

doParallel::registerDoParallel(cores = parallel::detectCores()/2)
devtools::source_gist("https://gist.github.com/tiagochst/d3a7b1639acf603916c315d23b1efb3e")

results_ordered_df <- plyr::adply(mval_mat,1, function(row){
  
  sumOneRegion_df <- as.data.frame(t(row))
  
  result <- TestSingleRegion(
    predictors_char = predictors_char,
    covariates_char = covariates_char,
    pheno_df = pheno_df,
    sumOneRegion_df = sumOneRegion_df
  )
  result
}, .progress = "time",.parallel = TRUE,.id = "cpg")
colnames(results_ordered_df)[1] <- "cpg"

identical(row.names(mval_mat), results_ordered_df$cpg %>% as.character())

write.csv(
  results_ordered_df,
  paste0(dir.result, "ROSMAP_matched_single_cpg_linear_df.csv"),
  row.names = FALSE
)
```

```{R}
results_ordered_df <- readr::read_csv(
  paste0(dir.result, "ROSMAP_matched_single_cpg_linear_df.csv"),
  col_types = readr::cols()
)
dim(results_ordered_df)
results_ordered_df %>% head
```

## London dataset

```{R, eval = FALSE}
london <- pheno.ls$LONDON[, c("sample", "sex", "stage", "age.brain")]
london$status <- ifelse(london$stage < 3, "Control", "Case")
london$status <- as.factor(london$status)
london$sex <- as.factor(london$sex)
str(london)

{
  set.seed(5)
  m_london <- matchControls(
    formula = status ~ age.brain + sex,
    data = london,
    contlabel = "Case",
    caselabel = "Control"
  )
}
outFile <-  data.frame(
  "matchedControls" = london[m_london$cases, "sample"],
  "matchedCases" = london[m_london$controls, "sample"]
)

outFileControl <- merge(
  x = outFile,
  y = london[, c("sample", "status", "age.brain", "sex")],
  by.x = "matchedControls",
  by.y = "sample"
)
colnames(outFileControl)[3:5] <- c("status_control", "age_control", "sex_control")

outFileControlCases <- merge(
  x = outFileControl,
  y = london[, c("sample", "status", "age.brain", "sex")],
  by.x = "matchedCases",
  by.y = "sample"
)
colnames(outFileControlCases)[6:8] <- c("status_case", "age_case", "sex_case")
outFileControlCases <- outFileControlCases[, c(2, 3, 4, 5, 1, 6, 7, 8)]

write.csv(
  outFileControlCases,
  paste0(dir.result, "London_matchedCaseControls.csv"),
  row.names = FALSE
)

London_matchAgeSex <- outFileControlCases %>% 
  dplyr::filter(abs(age_control - age_case) <= 1)

dim(London_matchAgeSex)

table(London_matchAgeSex$status_control, London_matchAgeSex$sex_control)
table(London_matchAgeSex$status_case, London_matchAgeSex$sex_case)
London_samples <- c(
  as.character(London_matchAgeSex$matchedControls),
  as.character(London_matchAgeSex$matchedCases)
)
```

```{R, eval = FALSE}
saveRDS(
  London_samples,
  paste0(dir.result, "London_samples.RDS")
)

```

### Samples

```{R}
length(readRDS(paste0(dir.result, "London_samples.RDS")))
```

### Linear model

```{R London_DMR, eval = TRUE}
### Import datasets
cohort <- "London"
pheno <- "pheno"

info_df <- readRDS(
  dir(path = "../DATASETS/LONDON///", pattern = "info", recursive = TRUE, full.names = TRUE)
)
mediansMval_df <- readRDS(
  dir(path = "../DATASETS/LONDON/", pattern = "medians", recursive = TRUE, full.names = TRUE)
)
pheno_df <- readRDS(
  dir(path = "../DATASETS/LONDON/", pattern = "NeuronProp_df", recursive = TRUE, full.names = TRUE)
)
samples <- readRDS(
  paste0(dir.result, "London_samples.RDS")
)

### Limit samples
mediansMval_df <- mediansMval_df[, samples]
pheno_df <- pheno_df[pheno_df$sample %in% samples, ]

mediansMval_df <- mediansMval_df[, pheno_df$sample]

### Check variables before fitting model
pheno_df$Sample <- pheno_df$sample

identical(pheno_df$Sample, colnames(mediansMval_df))

str(pheno_df)

pheno_df$sex <- as.factor(pheno_df$sex)
pheno_df$slide <- as.factor(pheno_df$slide)
# If rosmap cohort, don't forget batch effect
```

```{R, eval = FALSE}
devtools::source_gist("https://gist.github.com/tiagochst/d3a7b1639acf603916c315d23b1efb3e")
predictors_char <- "stage"
covariates_char <- "prop.neuron"

res_df <- TestAllRegions_noInfo(
  predictors_char = predictors_char,
  covariates_char = covariates_char,
  pheno_df = pheno_df,
  summarizedRegions_df = mediansMval_df,
  cores = 4
)

colnames(res_df) <- c(
  paste0(cohort, "_estimate"),
  paste0(cohort, "_se"),
  paste0(cohort, "_pVal"),
  paste0(cohort, "_fdr")
)

res_withInfo_df <- cbind(info_df, res_df)
```

```{R, eval = FALSE}
saveRDS(
  res_withInfo_df,
  paste0(dir.result, cohort, "_matched_linear_df.rds")
)
```

### Single cpg linear model

```{R London_singleCpG, eval = TRUE}
### Import datasets
beta_mat <- readRDS(
  "../DATASETS/LONDON/step5_pca_filtering/London_PFC_QNBMIQ_PCfiltered.RDS"
) 
pheno_df <- readRDS(
  dir("../DATASETS/LONDON/",pattern = "NeuronProp_df",recursive = TRUE,full.names = TRUE)
)
samples <- readRDS(
  paste0(dir.result, "London_samples.RDS")
)

### Limit samples
pheno_df$Sample <- pheno_df$sample

beta_mat <- beta_mat[, samples]
pheno_df <- pheno_df[pheno_df$Sample %in% samples, ]

beta_mat <- beta_mat[, pheno_df$Sample]

identical(pheno_df$Sample, colnames(beta_mat))

### Compute M values
mval_mat <- log2(beta_mat / (1 - beta_mat))

pheno_df$sex <- as.factor(pheno_df$sex)
pheno_df$slide <- as.factor(pheno_df$slide)

str(pheno_df)

is(pheno_df$stage,"numeric")
is(pheno_df$prop.neuron,"numeric")
```

```{R, eval = FALSE}
predictors_char <- "stage"
covariates_char <- "prop.neuron"

doParallel::registerDoParallel(cores = parallel::detectCores()/2)
devtools::source_gist("https://gist.github.com/tiagochst/d3a7b1639acf603916c315d23b1efb3e")

results_ordered_df <- plyr::adply(mval_mat,1, function(row){
  
  sumOneRegion_df <- as.data.frame(t(row))
  
  result <- TestSingleRegion(
    predictors_char = predictors_char,
    covariates_char = covariates_char,
    pheno_df = pheno_df,
    sumOneRegion_df = sumOneRegion_df
  )
  result
}, .progress = "time", .parallel = TRUE, .id = "cpg")
colnames(results_ordered_df)[1] <- "cpg"

identical(row.names(mval_mat), results_ordered_df$cpg %>% as.character())
```

```{R, eval = FALSE}
write.csv(
  results_ordered_df,
  paste0(dir.result, "London_matched_single_cpg_linear_df.csv"),
  row.names = FALSE
)
```

```{R}
results_ordered_df <- readr::read_csv(
  paste0(dir.result, "London_matched_single_cpg_linear_df.csv"),
  col_types = readr::cols()
)
dim(results_ordered_df)
results_ordered_df %>% head
```

## Gasparoni dataset

```{R, eval = FALSE, include = FALSE}
gasparoni <- pheno.ls$GASPARONI[, c("sample", "sex", "stage", "age.brain")]
gasparoni <- gasparoni[, c("sample", "sex", "stage", "age.brain")]
gasparoni$status <- ifelse(gasparoni$stage < 3, "Control", "Case")
gasparoni$status <- as.factor(gasparoni$status)
gasparoni$sex <- as.factor(gasparoni$sex)
str(gasparoni)

{
  set.seed(5)
  m_gasparoni <- matchControls(
    formula = status ~ age.brain + sex,
    data = gasparoni,
    contlabel = "Case",
    caselabel = "Control"
  )
}

outFile <- data.frame(
  "matchedControls" = gasparoni[m_gasparoni$cases, "sample"], 
  "matchedCases" = gasparoni[m_gasparoni$controls, "sample"]
)

outFileControl <- merge(
  x = outFile, 
  y = gasparoni[, c("sample", "status", "age.brain", "sex")],
  by.x = "matchedControls",
  by.y = "sample"
)
colnames(outFileControl)[3:5] <- c("status_control", "age_control", "sex_control")

outFileControlCases <- merge(
  x = outFileControl,
  y = gasparoni[, c("sample", "status", "age.brain", "sex")],
  by.x = "matchedCases",
  by.y = "sample"
)
colnames(outFileControlCases)[6:8] <- c("status_case", "age_case", "sex_case")

outFileControlCases <- outFileControlCases[, c(2, 3, 4, 5, 1, 6, 7, 8)]

table(outFileControlCases$status_control,
      outFileControlCases$sex_control)
table(outFileControlCases$status_case,
      outFileControlCases$sex_case)
```

```{R, eval = FALSE}
write.csv(
  outFileControlCases,
  paste0(dir.result, "Gasparoni_matchedCaseControls.csv"),
  row.names = FALSE
)
```

```{R, eval = FALSE}
Gasparoni_matchAgeSex <-  outFileControlCases %>% 
  dplyr::filter(abs(age_control - age_case) <= 1)
dim(Gasparoni_matchAgeSex)

table(Gasparoni_matchAgeSex$status_control, Gasparoni_matchAgeSex$sex_control)
table(Gasparoni_matchAgeSex$status_case, Gasparoni_matchAgeSex$sex_case)

Gasparoni_samples <- c(
  as.character(Gasparoni_matchAgeSex$matchedControls),
  as.character(Gasparoni_matchAgeSex$matchedCases)
)
```

```{R, eval = FALSE}
saveRDS(
  Gasparoni_samples,
  paste0(dir.result, "Gasparoni_samples.RDS")
)

```

## Mt.Sinai dataset

```{R, eval = FALSE}
mtsinai <- pheno.ls$MtSinai[, c("sample", "sex", "stage", "age.brain")]
mtsinai$status <- ifelse(mtsinai$stage < 3, "Control", "Case")
mtsinai$status <- as.factor(mtsinai$status)
mtsinai$sex <- as.factor(mtsinai$sex)
str(mtsinai)

{
  set.seed(5)
  m_mtsinai <- matchControls(
    formula = status ~ age.brain + sex,
    data = mtsinai,
    contlabel = "Case",
    caselabel = "Control"
  )
}
outFile <- data.frame(
  "matchedControls" = mtsinai[m_mtsinai$cases, "sample"], 
  "matchedCases" = mtsinai[m_mtsinai$controls, "sample"]
)

outFileControl <- merge(
  outFile, 
  mtsinai[, c("sample", "status", "age.brain", "sex")], 
  by.x = "matchedControls", 
  by.y = "sample"
)

colnames(outFileControl)[3:5] <- c("status_control", "age_control", "sex_control")
outFileControlCases <-  merge(
  outFileControl, 
  mtsinai[, c("sample", "status", "age.brain", "sex")], 
  by.x = "matchedCases", 
  by.y = "sample"
)
colnames(outFileControlCases)[6:8] <- c("status_case", "age_case", "sex_case")
outFileControlCases <- outFileControlCases[, c(2, 3, 4, 5, 1, 6, 7, 8)]
```

```{R, eval = FALSE}
write.csv(
  outFileControlCases,
  paste0(dir.result, "MtSinai_matchedCaseControls.csv"),
  row.names = FALSE
)
```

```{R, eval = FALSE}
MtSinai_matchAgeSex <-  outFileControlCases %>% 
  dplyr::filter(abs(age_control - age_case) <= 1)
dim(MtSinai_matchAgeSex)

table(MtSinai_matchAgeSex$status_control, MtSinai_matchAgeSex$sex_control)
table(MtSinai_matchAgeSex$status_case, MtSinai_matchAgeSex$sex_case)

MtSinai_samples <- c(
  as.character(MtSinai_matchAgeSex$matchedControls),
  as.character(MtSinai_matchAgeSex$matchedCases)
)
```

```{R, eval = FALSE}
saveRDS(
  MtSinai_samples,
  paste0(dir.result, "MtSinai_samples.RDS")
)
```

### Samples

```{R}
length(readRDS(paste0(dir.result, "MtSinai_samples.RDS")))
```

### Linear model

```{R MtSinai_DMR, eval = TRUE}
### Import datasets
cohort <- "MtSinai"
pheno <- "pheno"

info_df <- readRDS(
  dir("../DATASETS/MtSinai//",pattern = "info",recursive = TRUE,full.names = TRUE)
)
mediansMval_df <- readRDS(
  dir("../DATASETS/MtSinai/",pattern = "medians",recursive = TRUE,full.names = TRUE)
)
pheno_df <- readRDS(
  dir("../DATASETS/MtSinai/",pattern = "NeuronProp",recursive = TRUE,full.names = TRUE)
)
samples <- readRDS(
  paste0(dir.result, "MtSinai_samples.RDS")
)

### Limit samples
mediansMval_df <- mediansMval_df[, samples]
pheno_df <- pheno_df[pheno_df$sample %in% samples, ]

mediansMval_df <- mediansMval_df[, pheno_df$sample]

### Check variables before fitting model
pheno_df$Sample <- pheno_df$sample

identical(pheno_df$Sample, colnames(mediansMval_df))

str(pheno_df)

pheno_df$sex <- as.factor(pheno_df$sex)
pheno_df$slide <- as.factor(pheno_df$slide)
# If rosmap cohort, don't forget batch effect
```

```{R, eval = FALSE}
devtools::source_gist("https://gist.github.com/tiagochst/d3a7b1639acf603916c315d23b1efb3e")
predictors_char <- "stage"
covariates_char <- "prop.neuron"

res_df <- TestAllRegions_noInfo(
  predictors_char = predictors_char,
  covariates_char = covariates_char,
  pheno_df = pheno_df,
  summarizedRegions_df = mediansMval_df,
  cores = 4
)

colnames(res_df) <- c(
  paste0(cohort, "_estimate"),
  paste0(cohort, "_se"),
  paste0(cohort, "_pVal"),
  paste0(cohort, "_fdr")
)

res_withInfo_df <- cbind(info_df, res_df)
```

```{R, eval = FALSE}
saveRDS(
  res_withInfo_df,
  paste0(dir.result, cohort, "_matched_linear_df.rds")
)
```

### Single cpg linear model

```{R MtSinai_singleCpG, eval = TRUE}
### Import datasets
beta_mat <- readRDS(
  "../DATASETS/MtSinai/step5_pca_filtering/MtSinai_QNBMIQ_PCfiltered.RDS"
) 
pheno_df <- readRDS(
  dir("../DATASETS/MtSinai/",pattern = "Neuron",recursive = TRUE,full.names = TRUE)
)
samples <- readRDS(
  paste0(dir.result, "MtSinai_samples.RDS")
)

### Limit samples
pheno_df$Sample <- pheno_df$sample

beta_mat <- beta_mat[, samples]
pheno_df <- pheno_df[pheno_df$Sample %in% samples, ]

beta_mat <- beta_mat[, pheno_df$Sample]

identical(pheno_df$Sample, colnames(beta_mat))

### Compute M values
mval_mat <- log2(beta_mat / (1 - beta_mat))

pheno_df$sex <- as.factor(pheno_df$sex)
pheno_df$slide <- as.factor(pheno_df$slide)

str(pheno_df)

is(pheno_df$stage,"numeric")
is(pheno_df$prop.neuron,"numeric")
```

```{R, eval = FALSE}
predictors_char <- "stage"
covariates_char <- "prop.neuron"

doParallel::registerDoParallel(cores = parallel::detectCores()/2)
devtools::source_gist("https://gist.github.com/tiagochst/d3a7b1639acf603916c315d23b1efb3e")

results_ordered_df <- plyr::adply(mval_mat,1, function(row){
  
  sumOneRegion_df <- as.data.frame(t(row))
  
  result <- TestSingleRegion(
    predictors_char = predictors_char,
    covariates_char = covariates_char,
    pheno_df = pheno_df,
    sumOneRegion_df = sumOneRegion_df
  )
  result
}, .progress = "time",.parallel = TRUE,.id = "cpg")
colnames(results_ordered_df)[1] <- "cpg"

identical(row.names(mval_mat), results_ordered_df$cpg %>% as.character())
```

```{R, eval = FALSE}
write.csv(
  results_ordered_df,
  paste0(dir.result, "MtSinai_matched_single_cpg_linear_df.csv"),
  row.names = FALSE
)
```

```{R}
results_ordered_df <- readr::read_csv(
  paste0(dir.result, "MtSinai_matched_single_cpg_linear_df.csv"),
  col_types = readr::cols()
)
dim(results_ordered_df)
results_ordered_df %>% head
```

# Meta-analysis of Genomic Regions

## Import datasets and pre-process for each cohort 

```{R, eval = FALSE}
preMeta <- function(cohort){
  
  ### Load data
  file <- dir(
    path = "./DATASETS/matched_analysis_results",
    pattern = paste0(cohort,".*matched_linear_df"),
    recursive = T,
    full.names = TRUE,
    ignore.case = T
  )
  linear.results.final <- readRDS(file)
  
  ### select the most sig cometh region for each input region
  pva.col <- grep("_pVal",colnames(linear.results.final),value = TRUE)
  colnames(linear.results.final)[grep("inputRegion",colnames(linear.results.final))] <- "inputRegion"
  
  # !! is used to unquote an input 
  # https://dplyr.tidyverse.org/articles/programming.html
  result_sig <- linear.results.final %>%
    group_by(inputRegion) %>%
    filter((!!as.symbol(pva.col)) == min(!!as.symbol(pva.col)))
  
  data.frame(result_sig, stringsAsFactors = FALSE)
}

London_PFC <- preMeta(cohort = "London")
dim(London_PFC)

MtSinai <- preMeta(cohort = "MtSinai") 
dim(MtSinai)

ROSMAP <- preMeta(cohort = "ROSMAP")
dim(ROSMAP)
```

## Merge cohorts 

```{R, eval = FALSE}
### merge datasets
cohort_ls <- list(
  London_PFC = London_PFC,
  MtSinai = MtSinai,
  ROSMAP = ROSMAP
)

### outer join input region
multi_cohorts <- Reduce(
  function(x,y, ...) merge(x, y, by = "inputRegion", all = TRUE, ...),
  cohort_ls
)
```

## Meta analysis

```{R, eval = FALSE}
doParallel::registerDoParallel(cores = parallel::detectCores()/2)
meta_df <- plyr::adply(
  .data = multi_cohorts, 
  .margins = 1, 
  .fun =  function(rowOne_df){
    
    est <- rowOne_df[grep("estimate",colnames(rowOne_df))] %>% as.numeric
    
    direction <-  paste(ifelse(
      is.na(est), ".",
      ifelse(est > 0, "+", "-")
    ),collapse = "")
    
    se <- rowOne_df[grep("se",colnames(rowOne_df))] %>% as.numeric
    cohort <- gsub("_se","",grep("se",colnames(rowOne_df),value = T))
    rowOne_reform_df <- data.frame(
      cohort = cohort,
      est = est,
      se = se,
      stringsAsFactors = FALSE
    )
    
    f <- metagen(
      TE = est,
      seTE = se,
      data = rowOne_reform_df
    )
    
    tibble::tibble(
      inputRegion = rowOne_df$inputRegion,
      estimate = f$TE.fixed,
      se = f$seTE.fixed,
      pVal.fixed = f$pval.fixed,
      pVal.random = f$pval.random,
      pValQ = f$pval.Q,
      direction = direction
    )
  }  , .progress = "time",
  .parallel = TRUE,
  .id = NULL
)

### create final pVal
meta_df$pVal.final <- ifelse(
  meta_df$pValQ > 0.05, meta_df$pVal.fixed, meta_df$pVal.random
)

### calculate fdr
meta_df$fdr <- p.adjust(meta_df$pVal.final, method = "fdr")

### order meta_df
meta_final_df <- meta_df[, c(grep("_",colnames(meta_df),invert = T),
                             grep("_",colnames(meta_df),invert = F))]
meta_final_ordered_df <- meta_final_df[order(meta_final_df$pVal.final),]
```

## Add annotation to input regions

```{R, eval = FALSE}
### find annotations
library(coMethDMR)

splited_input <- meta_final_ordered_df %>% 
  tidyr::separate(col = inputRegion,into =  c("chrom", "start", "end"),remove = FALSE)
splited_input_annot <- AnnotateResults(splited_input[,c("chrom", "start", "end")],nCores_int = 10) 

### merge annotation with meta analysis data
meta_ordered_withAnnot_df <- cbind(
  meta_final_ordered_df, splited_input_annot[, 4:7]
)

### order columns
meta_ordered_withAnnot_df <- meta_ordered_withAnnot_df %>% 
  dplyr::select(c(1,
                  (ncol(meta_ordered_withAnnot_df) - 3):ncol(meta_ordered_withAnnot_df),
                  2:(ncol(meta_ordered_withAnnot_df) - 4))
  )
```

```{R, eval = FALSE}
### save dataset
write.csv(
  meta_ordered_withAnnot_df,
  paste0(dir.result, "meta_analysis_matched_df.csv"),
  row.names = FALSE
)

meta_all_sig <- meta_ordered_withAnnot_df[
  !is.na(meta_ordered_withAnnot_df$fdr) &
    (meta_ordered_withAnnot_df$fdr < 0.05),  
  ] #dim: 36 34
row.names(meta_all_sig) <- NULL
```

```{R, eval = FALSE}
write.csv(
  meta_all_sig,
  paste0(dir.result, "meta_analysis_matched_sig_df.csv"),
  row.names = FALSE
)

```

```{R, eval = FALSE, include = FALSE}
## Overlap with comb-p DMRs
matched_dmrs <- read.csv(
  paste0(dir.result, "meta_analysis_matched_sig_df.csv")
)
matched_dmrs_gr <- matched_dmrs %>%
  tidyr::separate("inputRegion", c("chrom", "start", "end")) %>%
  makeGRangesFromDataFrame()

combp_dmrs <- read.table(
  paste0(dir.combp, "cnew.regions-p.bed"),
  header = TRUE
)
colnames(combp_dmrs) <- c(
  "chrom", "start", "end", "min_p", "n_probes", "z_p", "z_sidak_p"
)
combp_sig <- combp_dmrs %>% filter(z_sidak_p < 0.05 & n_probes > 2)
combp_sig_gr <- makeGRangesFromDataFrame(combp_sig)

overlapping.results <- matched_dmrs[
  queryHits(findOverlaps(matched_dmrs_gr,combp_sig_gr)),
]
#No overlap with combp

# overlapping.results.unique <- unique(overlapping.results)
# 
# write.csv(
#     overlapping.results.unique,
#     paste0(dir.combp, "matched_analysis_ov_comb_p.csv"),
#     row.names = FALSE
# )
```

## Delete sig DMRs with cross-hybridizing or smoking probes 

```{R, eval = FALSE}
### call in all cross hybridizing probes
eh = ExperimentHub()
query(eh, "DMRcate")
crosshyb <- eh[["EH3129"]]

### Get significant smoking probes
smoking.file <- "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5267325/bin/NIHMS817273-supplement-001506_-_Supplemental_Tables.xlsx"

if(!file.exists(basename(smoking.file))) downloader::download(smoking.file,basename(smoking.file))

smoking <- readxl::read_xlsx(
  basename(smoking.file),
  sheet = "02",
  skip = 2
)
smoking.sig.probes <- smoking %>% dplyr::filter(`P-value` < 1*10^(-7)) %>% pull("Probe ID") 
length(smoking.sig.probes)

### Call in meta analysis final results
meta_all <- read.csv(
  paste0(dir.result, "meta_analysis_matched_sig_df.csv")
) #dim: 36 34

### Find files with regions and probes
files <- dir(
  path = "../DATASETS",
  pattern = paste0(".*_residuals_cometh_input_ls.rds"),
  recursive = T,
  full.names = TRUE,
  ignore.case = TRUE
)
files <- grep("LONDON_PFC|MtSinai|ROSMAP",files,value = TRUE,ignore.case = TRUE)

### Read files and Limit the cohort_ls to cohort_coMethRegion in meta_all 
cometh.probes.list <- lapply(files, function(f){
  print(f)
  all.clusters <- readRDS(f)$coMeth_ls # Read files
  cohort <- f %>% dirname %>% dirname %>% basename # get cohort from folder name
  
  col <- grep(paste0(cohort,"_coMethRegion"),
              colnames(meta_all),
              ignore.case = TRUE,
              value = TRUE) # get column with cohort sig regions
  
  # keep sig regions only
  all.clusters[names(all.clusters) %in% meta_all[[col]]]
})
names(cometh.probes.list) <-  files %>% dirname %>% dirname %>% basename

lapply(cometh.probes.list,length)

### Map probes in each region to smoking and crosshybrdizing
extractCrosHybridization <- function(probes.list){
  crosshyb.extracted <- plyr::laply(probes.list,function(probes){
    paste(intersect(probes, crosshyb), 
          collapse = ", "
    )
  })
  smoking.extracted <- plyr::laply(probes.list,function(probes){
    paste(intersect(probes, smoking.sig.probes), 
          collapse = ", "
    )
  })
  tibble::tibble(
    "coMethRegion" = names(probes.list),
    "crossHyb" = crosshyb.extracted, 
    "crossHyb_bi" = ifelse(crosshyb.extracted == "",0,1),
    "smoke" = smoking.extracted,
    "smoke_bi" = ifelse(smoking.extracted == "",0,1)
  )
}

London_PFC_crossHyb_df <- extractCrosHybridization(cometh.probes.list$LONDON)
colnames(London_PFC_crossHyb_df) <- paste0("London_",colnames(London_PFC_crossHyb_df))
plyr::count(
  London_PFC_crossHyb_df, 
  vars = grep("_bi",colnames(London_PFC_crossHyb_df),value = TRUE)
)

MtSinai_crossHyb_df <- extractCrosHybridization(cometh.probes.list$MtSinai)
colnames(MtSinai_crossHyb_df) <- paste0("MtSinai_",colnames(MtSinai_crossHyb_df))
plyr::count(
  MtSinai_crossHyb_df, 
  vars = grep("_bi",colnames(MtSinai_crossHyb_df),value = TRUE)
)

ROSMAP_crossHyb_df <- extractCrosHybridization(cometh.probes.list$ROSMAP)
colnames(ROSMAP_crossHyb_df) <- paste0("ROSMAP_",colnames(ROSMAP_crossHyb_df))
plyr::count(ROSMAP_crossHyb_df, vars = grep("_bi",colnames(ROSMAP_crossHyb_df),value = TRUE))

### Merge smoking and crossHyb probes information with meta analysis results 
meta_all_final <- meta_all %>%
  left_join(London_PFC_crossHyb_df) %>% 
  left_join(MtSinai_crossHyb_df) %>% 
  left_join(ROSMAP_crossHyb_df)

### Add information with input regions with any cross hybridization in cohorts 
meta_all_final$crossHyb_bi <- rowSums(meta_all_final[,grep("crossHyb_bi",colnames(meta_all_final))],na.rm = TRUE) > 0
meta_all_final$smoke_bi <- rowSums(meta_all_final[,grep("smoke_bi",colnames(meta_all_final))],na.rm = TRUE) > 0

# Sort by region meta analysis FDR
# Cluster columns of the projects together
meta_all_final <- meta_all_final[
  order(meta_all_final$fdr),
  c(grep("MtSinai|London|ROSMAP",colnames(meta_all_final),ignore.case = TRUE,invert = TRUE),
    grep("Gasparoni",colnames(meta_all_final),ignore.case = TRUE),
    grep("MtSinai",colnames(meta_all_final),ignore.case = TRUE),
    grep("London",colnames(meta_all_final),ignore.case = TRUE),
    grep("ROSMAP",colnames(meta_all_final),ignore.case = TRUE)
  )
 ]
str(meta_all_final)

meta_sig_final <- meta_all_final[
  meta_all_final$crossHyb_bi == 0 &
    meta_all_final$smoke_bi == 0,
] #dim: 32 48
```

```{R, eval = FALSE}
### Save
write.csv(
  meta_sig_final %>% as.data.frame,
  paste0(dir.result, "meta_analysis_matched_sig_no_crossHyb_smoking_df.csv"),
  row.names = FALSE
)
```

## Results

```{R, echo = FALSE}
res <- readr::read_csv(
  paste0(dir.result, "meta_analysis_matched_sig_no_crossHyb_smoking_df.csv"),
  col_types = readr::cols()
) 
res <- res[order(res$fdr),]
res
```


```{R, eval = FALSE, include = FALSE}
## Compare with sig DMRs from main analysis

### Call in datasets
main_dmrs <- read.csv(
  paste0(dir.dmr, "meta_analysis_sig_no_crossHyb_smoking_ov_comb_p_with_sig_single_cpgs.csv")
) #dim: 119 60

matched_dmrs <- read.csv(
  paste0(dir.result, "meta_analysis_matched_sig_no_crossHyb_smoking_df.csv")
) #dim: 83 59

### Turn input regions into granges
main_dmrs_gr <- main_dmrs %>%
  tidyr::separate("inputRegion",c("chrom","start","end")) %>%
  makeGRangesFromDataFrame() %>%
  unique

matched_dmrs_gr <- matched_dmrs %>%
  tidyr::separate("inputRegion", c("chrom", "start", "end")) %>%
  makeGRangesFromDataFrame() %>%
  unique

### Create Venn Diagram
library(ChIPpeakAnno)
library(ggplot2)

methods      <- c("main analysis", "matched analysis")
methodsLabel <- c("main analysis sig DMRs", "matched analysis sig DMRs")

n <- length(methods)

gg_color_hue <- function(n){
  hues = seq(15, 375, length = n + 1)
  hcl(h = hues, l = 65, c = 100)[1:n]
}

cols = gg_color_hue(n)

ranges.list <- list(main_dmrs_gr, matched_dmrs_gr)

# pdf(
#  file = paste0(dir.result,"venn_main_matched_sig_DMRs.pdf"),
#  width = 5, height = 5
#)
makeVennDiagram(
  ranges.list,
  NameOfPeaks = methodsLabel[1:n],
  totalTest = 37159, 
  by = "region",
  main = paste0(" "),
  col = c("#440154ff", '#21908dff'),
  fill = c(alpha("#440154ff",0.3), alpha('#21908dff',0.3)),
  cex = 1,
  fontfamily = "sans",
  cat.cex = 1,
  cat.default.pos = "outer",
  cat.pos = c(180, 180),
  cat.dist = c(-0.055, -0.055),
  cat.fontfamily = "sans",
  cat.col = c("#440154ff", '#21908dff'))
# dev.off()
```

# Meta-analysis Single CpGs

1. merge cohorts
2. meta analysis
3. add annotation
4. Delete cross-hybridizing and smoking probes from sig probes

## Import datasets  

```{R, eval = FALSE}
results.files <- dir(
  path = "./DATASETS/matched_analysis_results/",
  pattern = "single_cpg_linear_df.csv",
  recursive = TRUE,
  full.names = TRUE,
  ignore.case = TRUE
)

for(i in results.files){
  data <- readr::read_csv(i)
  dataset <- unlist(stringr::str_split(basename(i),"\\/|\\_"))[1]  %>% as.character()
  aux <- paste0(dataset,c("_estimate", "_se", "_pValue"))
  colnames(data) <- c("cpg", aux)
  assign(dataset,data)
}
```

## Merge cohorts

```{R, eval = FALSE}
cohort_ls <- list(
  London_PFC = London,
  MtSinai = MtSinai,
  ROSMAP = ROSMAP
)

### outer join input region
multi_cohorts <- Reduce(
  function(x,y, ...) merge(x, y, by = "cpg", all = TRUE, ...),
  cohort_ls
) #dim: 450793 10
dim(multi_cohorts)
```

## Meta analysis  

```{R, eval = FALSE}
### calculate meta analysis z scores and p values
doParallel::registerDoParallel(cores = parallel::detectCores()/2)
meta_df <- plyr::adply(
  .data = multi_cohorts, 
  .margins = 1, 
  .fun =  function(rowOne_df){
    
    est <- rowOne_df[grep("estimate",colnames(rowOne_df))] %>% as.numeric
    
    direction <-  paste(ifelse(
      is.na(est), ".",
      ifelse(est > 0, "+", "-")
    ),collapse = "")
    
    se <- rowOne_df[grep("se",colnames(rowOne_df))] %>% as.numeric
    cohort <- gsub("_se","",grep("se",colnames(rowOne_df),value = T))
    rowOne_reform_df <- data.frame(
      cohort = cohort,
      est = est,
      se = se,
      stringsAsFactors = FALSE
    )
    
    f <- metagen(
      TE = est,
      seTE = se,
      data = rowOne_reform_df
    )
    
    tibble::tibble(
      cpg = rowOne_df$cpg,
      estimate = f$TE.fixed,
      se = f$seTE.fixed,
      pVal.fixed = f$pval.fixed,
      pVal.random = f$pval.random,
      pValQ = f$pval.Q,
      direction = direction
    )
  }  , .progress = "time",
  .parallel = TRUE,
  .id = NULL
)

### create final pVal
meta_df$pVal.final <- ifelse(
  meta_df$pValQ > 0.05, meta_df$pVal.fixed, meta_df$pVal.random
)

### calculate fdr
meta_df$fdr <- p.adjust(meta_df$pVal.final, method = "fdr")

### order meta_df
meta_final_df <- meta_df[, c(grep("_",colnames(meta_df),invert = T),
                             grep("_",colnames(meta_df),invert = F))]
meta_final_ordered_df <- meta_final_df[order(meta_final_df$pVal.final),]
```

## Add annotation  

```{R, eval = FALSE}
library(sesame)
probes.info <- sesameDataGet("HM450.hg19.manifest")
probes.info <- probes.info[meta_final_ordered_df$cpg %>% as.character()] %>% 
  as.data.frame %>% 
  dplyr::select(c("seqnames","start","end"))

result_annot_df <- merge(
  y = meta_final_ordered_df,
  x = probes.info,
  by.y = "cpg",
  by.x = "row.names",
  all.y = TRUE,
  sort = FALSE
)
```

```{R, eval = FALSE}
### final raw data
write.csv(
  result_annot_df  %>% as.data.frame(),
  paste0(dir.result, "meta_analysis_matched_single_cpg_df.csv"),
  row.names = FALSE
)
```

```{R, eval = FALSE}
### prepare data for comb-p
result_for_combp_df <- result_annot_df[
  , c("seqnames", "start", "end", "pVal.final")
  ]
colnames(result_for_combp_df)[c(1,4)] <- c("chr", "pValue")
result_for_combp_df$chr <- as.character(result_for_combp_df$chr)
```

```{R, eval = FALSE}
write.csv(
  result_for_combp_df,
  paste0(dir.result, "meta_analysis_matched_single_cpg_df_for_combp.csv"),
  row.names = FALSE
)
```

## Delete cross-hybridizing and smoking probes from sig probes 

```{R, eval = FALSE}
### Exclude non-significant probes
result_annot_sig_df <- result_annot_df %>% filter(fdr < 0.05) 
```

```{R, eval = FALSE}
write.csv(
  result_annot_sig_df %>% as.data.frame(),
  paste0(dir.result, "meta_analysis_matched_single_cpg_sig_df.csv"),
  row.names = FALSE
)
```

```{R, eval = FALSE}
### Get crosshybrdizing probes
library(ExperimentHub)

eh = ExperimentHub()
query(eh, "DMRcate")
crosshyb <- eh[["EH3129"]]

### Get significant smoking probes
smoking.file <- "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5267325/bin/NIHMS817273-supplement-001506_-_Supplemental_Tables.xlsx"

if(!file.exists(basename(smoking.file))) downloader::download(smoking.file,basename(smoking.file))

smoking <- readxl::read_xlsx(
  basename(smoking.file),
  sheet = "02",
  skip = 2
)
smoking.sig.probes <- smoking %>% dplyr::filter(`P-value` < 1*10^(-7)) %>% pull("Probe ID")


### Exclude cross-hybridizing and smoking probes
result_annot_sig_df$Row.names <- as.character(result_annot_sig_df$Row.names)

result_annot_sig_no_crossHyb_smoking_df <- result_annot_sig_df[
  !((result_annot_sig_df$Row.names %in% crosshyb) |
      (result_annot_sig_df$Row.names %in% smoking.sig.probes)),
  ] #dim: 642 24

### Add annotation
library(coMethDMR)
result_annot_sig_no_crossHyb_smoking_df$chrom <- as.character(
  result_annot_sig_no_crossHyb_smoking_df$seqnames
)
result_final <- AnnotateResults(result_annot_sig_no_crossHyb_smoking_df)

result_final_ordered <- result_final[
  ,c(1:4, 23:26, 5:21)
]
```

```{R, eval = FALSE}
write.csv(
  result_final_ordered %>% as.data.frame(),
  paste0(dir.result, "meta_analysis_matched_single_cpg_sig_no_crossHyb_smoking_df.csv"),
  row.names = FALSE
)           
```

## Results

```{R, echo = FALSE}
res <- readr::read_csv(
  paste0(dir.result, "meta_analysis_matched_single_cpg_sig_no_crossHyb_smoking_df.csv"),
  col_types = readr::cols()
) 
res <- res[order(res$fdr),]
res
```

```{R, eval = FALSE, include = FALSE}
## Compare with sig CpGs from main analysis

### Call in datasets
main_cpgs <- read.csv(
  paste0(dir.single.cpg, "meta_analysis_single_cpg_sig_no_crossHyb_smoking_df.csv")
) #dim: 3751 32
main_cpgs <- main_cpgs %>% pull(cpg) %>% as.character

matched_cpgs <- read.csv(
  paste0(dir.result, "meta_analysis_matched_single_cpg_sig_no_crossHyb_smoking_df.csv")
) #dim: 151 25
matched_cpgs <- matched_cpgs %>% pull(1) %>% as.character


### Create Venn Diagram
library(VennDiagram)
library(ggplot2)

# Make the plot
venn.diagram(
  x = list(
    main_cpgs %>% unique ,
    matched_cpgs %>% unique
  ), 
  category.names = c("Main analysis sig. probes" , "Matched analysis sig. probes" ),
  filename = file.path(dir.result, '/venn_main_matched_sig_cpgs.png'),
  output = TRUE ,
  imagetype = "png" ,
  height = 700 ,
  width = 700 ,
  resolution = 300,
  compression = "lzw",
  lwd = 1,
  col = c("#440154ff", '#21908dff'),
  fill = c(alpha("#440154ff",0.3), alpha('#21908dff',0.3)),
  cex = 0.5,
  cat.cex = 0.3,
  cat.default.pos = "outer",
  cat.pos = c(0, 0),
  cat.dist = c(0.01, 0.01)
)
```

# Compare matched-analysis results with main meta-analysis results

```{R}
sig_cpgs <- readxl::read_xlsx(
    path = dir(dir.revision,pattern = "ALL",full.names = TRUE),
    sheet = "Supp Table 1",
    skip = 3
)

sig_dmrs <- readxl::read_xlsx(
    path = dir(dir.revision,pattern = "ALL",full.names = TRUE),
    sheet = "Supp Table 2",
    skip = 3
)

matched_cpgs <- read.csv(
  paste0(dir.result, "meta_analysis_matched_single_cpg_sig_no_crossHyb_smoking_df.csv")
)

matched_dmrs <- read.csv(
  paste0(dir.result, "meta_analysis_matched_sig_no_crossHyb_smoking_df.csv")
)
```

```{R}
sig_cpgs <- sig_cpgs[, 1:17]
matched_cpgs <- matched_cpgs[, c(1, 9:16)]
colnames(matched_cpgs) <- c(
    "cpg", paste0("matched_", colnames(matched_cpgs)[2:ncol(matched_cpgs)])
)
sig_matched_cpgs <- merge(
    sig_cpgs, 
    matched_cpgs,
    by = "cpg",
    sort = FALSE
)
nrow(sig_matched_cpgs)

meta_cpg_direction <- ifelse(
    sig_matched_cpgs$estimate > 0, "+", "-"
)
matched_cpg_direction <- ifelse(
    sig_matched_cpgs$matched_estimate > 0, "+", "-"
)
table(meta_cpg_direction == matched_cpg_direction)
```

```{R}
sig_dmrs <- sig_dmrs[, 1:15]
matched_dmrs <- matched_dmrs[, c(1, 6:13)]
colnames(matched_dmrs) <- c(
    "DMR", paste0("matched_", colnames(matched_dmrs)[2:ncol(matched_dmrs)])
)
sig_matched_dmrs <- merge(
    sig_dmrs, matched_dmrs,
    by = "DMR",
    sort = FALSE
)
nrow(sig_matched_dmrs)

meta_dmr_direction <- ifelse(
    sig_matched_dmrs$estimate > 0, "+", "-"
)
matched_dmr_direction <- ifelse(
    sig_matched_dmrs$matched_estimate > 0, "+", "-"
)
table(meta_dmr_direction == matched_dmr_direction)
```

```{R, eval = FALSE}
write.csv(
   sig_matched_cpgs,
   paste0(dir.result, "meta_matched_common_cpgs.csv"),
   row.names = FALSE
)

write.csv(
   sig_matched_dmrs,
   paste0(dir.result, "meta_matched_common_dmrs.csv"),
   row.names = FALSE
)
```

# Session information

```{R}
devtools::session_info()
```