1 Main libraries and configuration

dir.result <- "matched_analysis_results/"
dir.dmr <- "meta_analysis_region_results/step4_dmr_vs_cpgs/" 
dir.single.cpg <- "meta_analysis_single_cpg_results/"        
dir.combp <- "../../Michael/data_for_combp-matchedAnalysis_1-29-2020/results/"  
dir.create(dir.result)

2 Matched analysis

library(dplyr)
library(ExperimentHub)
library(GenomicRanges)
library(e1071)
devtools::source_gist("https://gist.github.com/tiagochst/d3a7b1639acf603916c315d23b1efb3e")
files <- dir(path = "DATASETS",
             pattern = paste0(".*neuro"),
             recursive = TRUE,
             full.names = TRUE,
             ignore.case = TRUE)
files
## [1] "DATASETS/GASPARONI/step8_neuron_comp/pheno_withNeuronProp_df.RDS"    
## [2] "DATASETS/LONDON/step6_neuron_comp/pheno107_PFC_withNeuronProp_df.RDS"
## [3] "DATASETS/MtSinai/step6_neuron_comp/pheno141_withNeuronProp_df.RDS"   
## [4] "DATASETS/ROSMAP/step8_neuron_comp/pheno_PFC_withNeuronProp_df.RDS"
pheno.ls <- lapply(files, function (f){
  readRDS(f)
})
names(pheno.ls) <- files %>% dirname %>% dirname %>% basename
lapply(pheno.ls,dim)
## $GASPARONI
## [1] 56  8
## 
## $LONDON
## [1] 107   9
## 
## $MtSinai
## [1] 141   8
## 
## $ROSMAP
## [1] 726  28

2.1 Rosmap dataset

rosmap <- pheno.ls$ROSMAP[, c("Sample", "msex", "braaksc", "age_death")]
rosmap$age_death <- as.numeric(as.character(rosmap$age_death))
rosmap$age <- ifelse(is.na(rosmap$age_death), 90, rosmap$age_death)
rosmap$status <- ifelse(rosmap$braaksc < 3, "Control", "Case")
rosmap$status <- as.factor(rosmap$status)
rosmap$sex <- ifelse(rosmap$msex == 1, "Male", "Female")
rosmap$sex <- as.factor(rosmap$sex)
str(rosmap)


{
  set.seed(5) # matchControl uses a distance calculation, if ties, it uses sample function
  m_ros <- matchControls(
    formula = status ~ age,
    data = rosmap,
    contlabel = "Case",
    caselabel = "Control"
  )
}

outFile <- data.frame(
  "matchedControls" = rosmap[m_ros$cases, "Sample"], 
  "matchedCases" = rosmap[m_ros$controls, "Sample"]
)

outFileControl <- merge(outFile, rosmap[, c("Sample", "status", "age", "sex")], 
                        by.x = "matchedControls",
                        by.y = "Sample")

colnames(outFileControl)[3:5] <- c("status_control", "age_control", "sex_control")

outFileControlCases <- merge(outFileControl, 
                             rosmap[, c("Sample", "status", "age", "sex")],
                             by.x = "matchedCases",
                             by.y = "Sample")

colnames(outFileControlCases)[6:8] <- c("status_case", "age_case", "sex_case")

outFileControlCases <- outFileControlCases[, c(2, 3, 4, 5, 1, 6, 7, 8)]

write.csv(
  outFileControlCases,
  paste0(dir.result, "Rosmap_matchedCaseControls.csv"),
  row.names = FALSE
)

ROSMAP_matchAgeSex <-  outFileControlCases %>% 
  dplyr::filter(abs(age_control - age_case) <= 1)
dim(ROSMAP_matchAgeSex)

table(ROSMAP_matchAgeSex$status_control, ROSMAP_matchAgeSex$sex_control)
table(ROSMAP_matchAgeSex$status_case, ROSMAP_matchAgeSex$sex_case)

ROSMAP_samples <- c(
  as.character(ROSMAP_matchAgeSex$matchedControls),
  as.character(ROSMAP_matchAgeSex$matchedCases)
)
saveRDS(
  ROSMAP_samples,
  paste0(dir.result, "ROSMAP_samples.RDS")
)

2.1.1 Samples

length(readRDS(paste0(dir.result, "ROSMAP_samples.RDS")))
## [1] 256

2.1.2 Linear model

### Import datasets
cohort <- "ROSMAP"
pheno <- "pheno"

info_df <- readRDS(
  dir("DATASETS/ROSMAP////",pattern = "info",recursive = TRUE,full.names = TRUE)
)
mediansMval_df <- readRDS(
  dir("DATASETS/ROSMAP/",pattern = "medians",recursive = TRUE,full.names = TRUE)
)
pheno_df <- readRDS(
  dir("DATASETS/ROSMAP/",pattern = "Neuron",recursive = TRUE,full.names = TRUE)
)
samples <- readRDS(
  paste0(dir.result, "ROSMAP_samples.RDS")
)

### Limit samples
mediansMval_df <- mediansMval_df[, samples]
pheno_df <- pheno_df[pheno_df$Sample %in% samples, ]

mediansMval_df <- mediansMval_df[, pheno_df$Sample]

identical(pheno_df$Sample, colnames(mediansMval_df))

str(pheno_df)

pheno_df$age_death <- as.numeric(as.character(pheno_df$age_death))
pheno_df$msex <- as.factor(as.character(pheno_df$msex))
pheno_df$Slide <- as.factor(as.character(pheno_df$Slide))
pheno_df$batch <- as.factor(as.character(pheno_df$batch))

predictors_char <- "braaksc"
covariates_char <- c("msex", "prop.neuron", "Slide", "batch")


res_df <- TestAllRegions_noInfo(
  predictors_char = predictors_char,
  covariates_char = covariates_char,
  pheno_df = pheno_df,
  summarizedRegions_df = mediansMval_df,
  cores = 4
)

colnames(res_df) <- c(
  paste0(cohort, "_estimate"),
  paste0(cohort, "_se"),
  paste0(cohort, "_pVal"),
  paste0(cohort, "_fdr")
)

res_withInfo_df <- cbind(info_df, res_df)

saveRDS(
  res_withInfo_df,
  paste0(dir.result, cohort, "_matched_linear_df.rds")
)

2.1.3 Single cpg linear model

### Import datasets
beta_mat <- readRDS(
  "DATASETS/ROSMAP/step7_pca_filtering/ROSMAP_QNBMIQ_PCfiltered.RDS"
) 
pheno_df <- readRDS(
  dir("DATASETS/ROSMAP/",pattern = "Neuron",recursive = TRUE,full.names = TRUE)
)
samples <- readRDS(
  paste0(dir.result, "ROSMAP_samples.RDS")
)

### Limit samples
beta_mat <- beta_mat[, samples]
pheno_df <- pheno_df[pheno_df$Sample %in% samples, ]

beta_mat <- beta_mat[, pheno_df$Sample]

identical(pheno_df$Sample, colnames(beta_mat))
## [1] TRUE
### Compute M values
mval_mat <- log2(beta_mat / (1 - beta_mat))

pheno_df$sex <- as.factor(pheno_df$sex)
pheno_df$slide <- as.factor(pheno_df$Sentrix_ID)
pheno_df$batch <- as.factor(pheno_df$batch)

str(pheno_df)
## 'data.frame':    256 obs. of  29 variables:
##  $ Sample                : chr  "PT-313T" "PT-35PM" "PT-BY8G" "PT-BY9P" ...
##  $ Sentrix_ID            : num  5.82e+09 5.82e+09 6.04e+09 5.82e+09 5.82e+09 ...
##  $ Sentrix_Position      : chr  "R04C01" "R05C01" "R05C02" "R05C02" ...
##  $ Sample_Plate          : chr  "WG0001091-MSA4" "WG0001091-MSA4" "WG0003259-MSA4" "WG0001091-MSA4" ...
##  $ Sample_Well           : chr  "H08" "A03" "C02" "C05" ...
##  $ Sample_Group          : chr  "PT-313T" "PT-35PM" "PT-BY8G" "PT-BY9P" ...
##  $ batch                 : Factor w/ 2 levels "0","1": 2 2 2 2 1 1 1 2 2 2 ...
##  $ Sentrix               : chr  "5822020006_R04C01" "5822020007_R05C01" "6042324057_R05C02" "5822020004_R05C02" ...
##  $ Slide                 : num  5.82e+09 5.82e+09 6.04e+09 5.82e+09 5.82e+09 ...
##  $ projid                : num  11464261 22209673 78452313 39484737 46251007 ...
##  $ Study                 : chr  "ROS" "ROS" "MAP" "MAP" ...
##  $ msex                  : num  1 0 1 1 0 0 1 1 0 0 ...
##  $ educ                  : num  16 18 16 13 16 16 14 13 20 11 ...
##  $ race                  : Factor w/ 4 levels "1","2","3","7": 1 2 1 1 1 1 1 1 1 1 ...
##  $ spanish               : Factor w/ 2 levels "1","2": 2 2 2 2 2 2 2 2 2 2 ...
##  $ apoe_genotype         : Factor w/ 6 levels "22","23","24",..: 2 5 5 4 4 4 2 4 4 5 ...
##  $ age_at_visit_max      : chr  "83.507186858316217" "68.668035592060235" "75.493497604380565" "85.30595482546201" ...
##  $ age_first_ad_dx       : chr  NA NA "71.444216290212182" NA ...
##  $ age_death             : num  84.4 69.2 76 86.3 89.1 ...
##  $ cts_mmse30_first_ad_dx: num  NA NA 21 NA NA NA NA NA 23 NA ...
##  $ cts_mmse30_lv         : num  24 26 2 27 29 28 29 27 5 30 ...
##  $ pmi                   : num  4 8.5 6.08 8.73 6.12 ...
##  $ braaksc               : num  1 3 5 1 2 2 1 2 3 5 ...
##  $ ceradsc               : num  4 2 1 3 4 2 4 4 1 1 ...
##  $ cogdx                 : num  4 6 4 3 1 1 1 1 4 1 ...
##  $ stage3                : chr  "0-2" "3-4" "5-6" "0-2" ...
##  $ sex                   : Factor w/ 2 levels "Female","Male": 2 1 2 2 1 1 2 2 1 1 ...
##  $ prop.neuron           : num  0.458 0.41 0.41 0.449 0.463 ...
##  $ slide                 : Factor w/ 65 levels "5772325072","5772325089",..: 30 31 65 28 17 10 3 38 21 6 ...
is(pheno_df$braaksc,"numeric")
## [1] TRUE
is(pheno_df$prop.neuron,"numeric")
## [1] TRUE
predictors_char <- "braaksc"
covariates_char <- c("sex", "prop.neuron", "slide", "batch")


doParallel::registerDoParallel(cores = parallel::detectCores()/2)
devtools::source_gist("https://gist.github.com/tiagochst/d3a7b1639acf603916c315d23b1efb3e")

results_ordered_df <- plyr::adply(mval_mat,1, function(row){
  
  sumOneRegion_df <- as.data.frame(t(row))
  
  result <- TestSingleRegion(
    predictors_char = predictors_char,
    covariates_char = covariates_char,
    pheno_df = pheno_df,
    sumOneRegion_df = sumOneRegion_df
  )
  result
}, .progress = "time",.parallel = TRUE,.id = "cpg")
colnames(results_ordered_df)[1] <- "cpg"

identical(row.names(mval_mat), results_ordered_df$cpg %>% as.character())

write.csv(
  results_ordered_df,
  paste0(dir.result, "ROSMAP_matched_single_cpg_linear_df.csv"),
  row.names = FALSE
)
results_ordered_df <- readr::read_csv(
  paste0(dir.result, "ROSMAP_matched_single_cpg_linear_df.csv"),
  col_types = readr::cols()
)
dim(results_ordered_df)
## [1] 431803      4
results_ordered_df %>% head

2.2 London dataset

london <- pheno.ls$LONDON[, c("sample", "sex", "stage", "age.brain")]
london$status <- ifelse(london$stage < 3, "Control", "Case")
london$status <- as.factor(london$status)
london$sex <- as.factor(london$sex)
str(london)

{
  set.seed(5)
  m_london <- matchControls(
    formula = status ~ age.brain,
    data = london,
    contlabel = "Case",
    caselabel = "Control"
  )
}
outFile <-  data.frame(
  "matchedControls" = london[m_london$cases, "sample"],
  "matchedCases" = london[m_london$controls, "sample"]
)

outFileControl <- merge(outFile,
                        london[, c("sample", "status", "age.brain", "sex")],
                        by.x = "matchedControls",
                        by.y = "sample")
colnames(outFileControl)[3:5] <- c("status_control", "age_control", "sex_control")

outFileControlCases <- merge(outFileControl,
                             london[, c("sample", "status", "age.brain", "sex")],
                             by.x = "matchedCases",
                             by.y = "sample")
colnames(outFileControlCases)[6:8] <- c("status_case", "age_case", "sex_case")
outFileControlCases <- outFileControlCases[, c(2, 3, 4, 5, 1, 6, 7, 8)]

write.csv(
  outFileControlCases,
  paste0(dir.result, "London_matchedCaseControls.csv"),
  row.names = FALSE
)



London_matchAgeSex <- outFileControlCases %>% 
  dplyr::filter(abs(age_control - age_case) <= 1)

dim(London_matchAgeSex)

table(London_matchAgeSex$status_control, London_matchAgeSex$sex_control)
table(London_matchAgeSex$status_case, London_matchAgeSex$sex_case)
London_samples <- c(
  as.character(London_matchAgeSex$matchedControls),
  as.character(London_matchAgeSex$matchedCases)
)
saveRDS(
  London_samples,
  paste0(dir.result, "London_samples.RDS")
)

2.2.1 Samples

length(readRDS(paste0(dir.result, "London_samples.RDS")))
## [1] 46

2.2.2 Linear model

### Import datasets
cohort <- "London"
pheno <- "pheno"

info_df <- readRDS(
  dir("DATASETS/LONDON///",pattern = "info",recursive = TRUE,full.names = TRUE)
)
mediansMval_df <- readRDS(
  dir("DATASETS/LONDON/",pattern = "medians",recursive = TRUE,full.names = TRUE)
)
pheno_df <- readRDS(
  dir("DATASETS/LONDON/",pattern = "NeuronProp_df",recursive = TRUE,full.names = TRUE)
)
samples <- readRDS(
  paste0(dir.result, "London_samples.RDS")
)

### Limit samples
mediansMval_df <- mediansMval_df[, samples]
pheno_df <- pheno_df[pheno_df$sample %in% samples, ]

mediansMval_df <- mediansMval_df[, pheno_df$sample]

### Check variables before fitting model
pheno_df$Sample <- pheno_df$sample

identical(pheno_df$Sample, colnames(mediansMval_df))
## [1] TRUE
str(pheno_df)
## 'data.frame':    46 obs. of  10 variables:
##  $ sample     : chr  "GSM1443251" "GSM1443256" "GSM1443263" "GSM1443269" ...
##  $ subject.id : num  1 2 4 5 6 10 13 18 25 26 ...
##  $ sentrix_id : chr  "6042316048_R05C01" "6042316066_R05C01" "7786923107_R02C01" "6042316121_R04C02" ...
##  $ slide      : chr  "6042316048" "6042316066" "7786923107" "6042316121" ...
##  $ age.brain  : num  82 82 81 92 78 82 80 82 97 87 ...
##  $ sex        : chr  "FEMALE" "FEMALE" "FEMALE" "FEMALE" ...
##  $ stage      : num  0 2 1 2 1 6 5 6 5 3 ...
##  $ stage3     : chr  "0-2" "0-2" "0-2" "0-2" ...
##  $ prop.neuron: num  0.377 0.226 0.206 0.456 0.424 ...
##  $ Sample     : chr  "GSM1443251" "GSM1443256" "GSM1443263" "GSM1443269" ...
pheno_df$sex <- as.factor(pheno_df$sex)
pheno_df$slide <- as.factor(pheno_df$slide)
# If rosmap cohort, don't forget batch effect
devtools::source_gist("https://gist.github.com/tiagochst/d3a7b1639acf603916c315d23b1efb3e")
predictors_char <- "stage"
covariates_char <- c("sex", "prop.neuron", "slide")

res_df <- TestAllRegions_noInfo(
  predictors_char = predictors_char,
  covariates_char = covariates_char,
  pheno_df = pheno_df,
  summarizedRegions_df = mediansMval_df,
  cores = 4
)

colnames(res_df) <- c(
  paste0(cohort, "_estimate"),
  paste0(cohort, "_se"),
  paste0(cohort, "_pVal"),
  paste0(cohort, "_fdr")
)

res_withInfo_df <- cbind(info_df, res_df)

saveRDS(
  res_withInfo_df,
  paste0(dir.result, cohort, "_matched_linear_df.rds")
)

2.2.3 Single cpg linear model

### Import datasets
beta_mat <- readRDS(
  "DATASETS/LONDON/step5_pca_filtering/London_PFC_QNBMIQ_PCfiltered.RDS"
) 
pheno_df <- readRDS(
  dir("DATASETS/LONDON/",pattern = "NeuronProp_df",recursive = TRUE,full.names = TRUE)
)
samples <- readRDS(
  paste0(dir.result, "London_samples.RDS")
)

### Limit samples
pheno_df$Sample <- pheno_df$sample

beta_mat <- beta_mat[, samples]
pheno_df <- pheno_df[pheno_df$Sample %in% samples, ]

beta_mat <- beta_mat[, pheno_df$Sample]

identical(pheno_df$Sample, colnames(beta_mat))
## [1] TRUE
### Compute M values
mval_mat <- log2(beta_mat / (1 - beta_mat))

pheno_df$sex <- as.factor(pheno_df$sex)
pheno_df$slide <- as.factor(pheno_df$slide)

str(pheno_df)
## 'data.frame':    46 obs. of  10 variables:
##  $ sample     : chr  "GSM1443251" "GSM1443256" "GSM1443263" "GSM1443269" ...
##  $ subject.id : num  1 2 4 5 6 10 13 18 25 26 ...
##  $ sentrix_id : chr  "6042316048_R05C01" "6042316066_R05C01" "7786923107_R02C01" "6042316121_R04C02" ...
##  $ slide      : Factor w/ 12 levels "6042316035","6042316048",..: 2 3 9 8 6 11 7 2 2 9 ...
##  $ age.brain  : num  82 82 81 92 78 82 80 82 97 87 ...
##  $ sex        : Factor w/ 2 levels "FEMALE","MALE": 1 1 1 1 2 2 1 1 1 1 ...
##  $ stage      : num  0 2 1 2 1 6 5 6 5 3 ...
##  $ stage3     : chr  "0-2" "0-2" "0-2" "0-2" ...
##  $ prop.neuron: num  0.377 0.226 0.206 0.456 0.424 ...
##  $ Sample     : chr  "GSM1443251" "GSM1443256" "GSM1443263" "GSM1443269" ...
is(pheno_df$stage,"numeric")
## [1] TRUE
is(pheno_df$prop.neuron,"numeric")
## [1] TRUE
predictors_char <- "stage"
covariates_char <- c("sex", "prop.neuron", "slide")


doParallel::registerDoParallel(cores = parallel::detectCores()/2)
devtools::source_gist("https://gist.github.com/tiagochst/d3a7b1639acf603916c315d23b1efb3e")

results_ordered_df <- plyr::adply(mval_mat,1, function(row){
  
  sumOneRegion_df <- as.data.frame(t(row))
  
  result <- TestSingleRegion(
    predictors_char = predictors_char,
    covariates_char = covariates_char,
    pheno_df = pheno_df,
    sumOneRegion_df = sumOneRegion_df
  )
  result
}, .progress = "time",.parallel = TRUE,.id = "cpg")
colnames(results_ordered_df)[1] <- "cpg"

identical(row.names(mval_mat), results_ordered_df$cpg %>% as.character())

write.csv(
  results_ordered_df,
  paste0(dir.result, "London_matched_single_cpg_linear_df.csv"),
  row.names = FALSE
)
results_ordered_df <- readr::read_csv(
  paste0(dir.result, "London_matched_single_cpg_linear_df.csv"),
  col_types = readr::cols()
)
dim(results_ordered_df)
## [1] 450793      4
results_ordered_df %>% head
results_ordered_df <- readr::read_csv(
  paste0(dir.result, "Gasparoni_matched_single_cpg_linear_df.csv"),
  col_types = readr::cols()
)
dim(results_ordered_df)
## [1] 446792      4
results_ordered_df %>% head

2.3 Mt.Sinai dataset

mtsinai <- pheno.ls$MtSinai[, c("sample", "sex", "stage", "age.brain")]
mtsinai$status <- ifelse(mtsinai$stage < 3, "Control", "Case")
mtsinai$status <- as.factor(mtsinai$status)
mtsinai$sex <- as.factor(mtsinai$sex)
str(mtsinai)

{
  set.seed(5)
  m_mtsinai <- matchControls(
    formula = status ~ age.brain,
    data = mtsinai,
    contlabel = "Case",
    caselabel = "Control"
  )
}
outFile <- data.frame(
  "matchedControls" = mtsinai[m_mtsinai$cases, "sample"], 
  "matchedCases" = mtsinai[m_mtsinai$controls, "sample"]
)

outFileControl <- merge(outFile, 
                        mtsinai[, c("sample", "status", "age.brain", "sex")], 
                        by.x = "matchedControls", 
                        by.y = "sample")

colnames(outFileControl)[3:5] <- c("status_control", "age_control", "sex_control")
outFileControlCases <-  merge(outFileControl, 
                              mtsinai[, c("sample", "status", "age.brain", "sex")], 
                              by.x = "matchedCases", 
                              by.y = "sample")
colnames(outFileControlCases)[6:8] <- c("status_case", "age_case", "sex_case")
outFileControlCases <- outFileControlCases[, c(2, 3, 4, 5, 1, 6, 7, 8)]

write.csv(
  outFileControlCases,
  paste0(dir.result, "MtSinai_matchedCaseControls.csv"),
  row.names = FALSE
)



MtSinai_matchAgeSex <-  outFileControlCases %>% 
  dplyr::filter(abs(age_control - age_case) <= 1)
dim(MtSinai_matchAgeSex)


table(MtSinai_matchAgeSex$status_control, MtSinai_matchAgeSex$sex_control)
table(MtSinai_matchAgeSex$status_case, MtSinai_matchAgeSex$sex_case)

MtSinai_samples <- c(
  as.character(MtSinai_matchAgeSex$matchedControls),
  as.character(MtSinai_matchAgeSex$matchedCases)
)

saveRDS(
  MtSinai_samples,
  paste0(dir.result, "MtSinai_samples.RDS")
)

2.3.1 Samples

length(readRDS(paste0(dir.result, "MtSinai_samples.RDS")))
## [1] 70

2.3.2 Linear model

### Import datasets
cohort <- "MtSinai"
pheno <- "pheno"

info_df <- readRDS(
  dir("DATASETS/MtSinai//",pattern = "info",recursive = TRUE,full.names = TRUE)
)
mediansMval_df <- readRDS(
  dir("DATASETS/MtSinai/",pattern = "medians",recursive = TRUE,full.names = TRUE)
)
pheno_df <- readRDS(
  dir("DATASETS/MtSinai/",pattern = "NeuronProp",recursive = TRUE,full.names = TRUE)
)
samples <- readRDS(
  paste0(dir.result, "MtSinai_samples.RDS")
)

### Limit samples
mediansMval_df <- mediansMval_df[, samples]
pheno_df <- pheno_df[pheno_df$sample %in% samples, ]

mediansMval_df <- mediansMval_df[, pheno_df$sample]

### Check variables before fitting model
pheno_df$Sample <- pheno_df$sample

identical(pheno_df$Sample, colnames(mediansMval_df))
## [1] TRUE
str(pheno_df)
## 'data.frame':    70 obs. of  9 variables:
##  $ sample     : chr  "GSM2139163" "GSM2139164" "GSM2139166" "GSM2139169" ...
##  $ subject.id : chr  "MS66" "MS546" "MS1549" "MS504" ...
##  $ sentrix_id : chr  "7796806144_R01C01" "7796806144_R01C02" "7796806144_R03C01" "7796806144_R04C02" ...
##  $ slide      : chr  "7796806144" "7796806144" "7796806144" "7796806144" ...
##  $ age.brain  : num  89 80 78 85 83 78 86 73 81 93 ...
##  $ sex        : chr  "male" "female" "male" "male" ...
##  $ stage      : num  5 2 4 5 1 2 6 6 2 6 ...
##  $ prop.neuron: num  0.243 0.202 0.304 0.248 0.329 ...
##  $ Sample     : chr  "GSM2139163" "GSM2139164" "GSM2139166" "GSM2139169" ...
pheno_df$sex <- as.factor(pheno_df$sex)
pheno_df$slide <- as.factor(pheno_df$slide)
# If rosmap cohort, don't forget batch effect
devtools::source_gist("https://gist.github.com/tiagochst/d3a7b1639acf603916c315d23b1efb3e")
predictors_char <- "stage"
covariates_char <- c("sex", "prop.neuron", "slide")

res_df <- TestAllRegions_noInfo(
  predictors_char = predictors_char,
  covariates_char = covariates_char,
  pheno_df = pheno_df,
  summarizedRegions_df = mediansMval_df,
  cores = 4
)

colnames(res_df) <- c(
  paste0(cohort, "_estimate"),
  paste0(cohort, "_se"),
  paste0(cohort, "_pVal"),
  paste0(cohort, "_fdr")
)

res_withInfo_df <- cbind(info_df, res_df)

saveRDS(
  res_withInfo_df,
  paste0(dir.result, cohort, "_matched_linear_df.rds")
)

2.3.3 Single cpg linear model

### Import datasets
beta_mat <- readRDS(
  "DATASETS/MtSinai/step5_pca_filtering/MtSinai_QNBMIQ_PCfiltered.RDS"
) 
pheno_df <- readRDS(
  dir("DATASETS/MtSinai/",pattern = "Neuron",recursive = TRUE,full.names = TRUE)
)
samples <- readRDS(
  paste0(dir.result, "MtSinai_samples.RDS")
)

### Limit samples
pheno_df$Sample <- pheno_df$sample

beta_mat <- beta_mat[, samples]
pheno_df <- pheno_df[pheno_df$Sample %in% samples, ]

beta_mat <- beta_mat[, pheno_df$Sample]

identical(pheno_df$Sample, colnames(beta_mat))
## [1] TRUE
### Compute M values
mval_mat <- log2(beta_mat / (1 - beta_mat))

pheno_df$sex <- as.factor(pheno_df$sex)
pheno_df$slide <- as.factor(pheno_df$slide)

str(pheno_df)
## 'data.frame':    70 obs. of  9 variables:
##  $ sample     : chr  "GSM2139163" "GSM2139164" "GSM2139166" "GSM2139169" ...
##  $ subject.id : chr  "MS66" "MS546" "MS1549" "MS504" ...
##  $ sentrix_id : chr  "7796806144_R01C01" "7796806144_R01C02" "7796806144_R03C01" "7796806144_R04C02" ...
##  $ slide      : Factor w/ 12 levels "7796806144","7796814008",..: 1 1 1 1 1 1 1 2 2 2 ...
##  $ age.brain  : num  89 80 78 85 83 78 86 73 81 93 ...
##  $ sex        : Factor w/ 2 levels "female","male": 2 1 2 2 1 2 1 2 2 1 ...
##  $ stage      : num  5 2 4 5 1 2 6 6 2 6 ...
##  $ prop.neuron: num  0.243 0.202 0.304 0.248 0.329 ...
##  $ Sample     : chr  "GSM2139163" "GSM2139164" "GSM2139166" "GSM2139169" ...
is(pheno_df$stage,"numeric")
## [1] TRUE
is(pheno_df$prop.neuron,"numeric")
## [1] TRUE
predictors_char <- "stage"
covariates_char <- c("sex", "prop.neuron", "slide")


doParallel::registerDoParallel(cores = parallel::detectCores()/2)
devtools::source_gist("https://gist.github.com/tiagochst/d3a7b1639acf603916c315d23b1efb3e")

results_ordered_df <- plyr::adply(mval_mat,1, function(row){
  
  sumOneRegion_df <- as.data.frame(t(row))
  
  result <- TestSingleRegion(
    predictors_char = predictors_char,
    covariates_char = covariates_char,
    pheno_df = pheno_df,
    sumOneRegion_df = sumOneRegion_df
  )
  result
}, .progress = "time",.parallel = TRUE,.id = "cpg")
colnames(results_ordered_df)[1] <- "cpg"

identical(row.names(mval_mat), results_ordered_df$cpg %>% as.character())

write.csv(
  results_ordered_df,
  paste0(dir.result, "MtSinai_matched_single_cpg_linear_df.csv"),
  row.names = FALSE
)
results_ordered_df <- readr::read_csv(
  paste0(dir.result, "MtSinai_matched_single_cpg_linear_df.csv"),
  col_types = readr::cols()
)
dim(results_ordered_df)
## [1] 434610      4
results_ordered_df %>% head

3 Meta-analysis of Genomic Regions

3.1 Import datasets and pre-process for each cohort

library(dplyr)
library(tidyr)
preMeta <- function(cohort){
  
  ### Load data
  file <- dir(path = "matched_analysis_results",
              pattern = paste0(cohort,".*matched_linear_df"),
              recursive = T,
              full.names = TRUE,
              ignore.case = T)
  linear.results.final <- readRDS(file)
  
  ### select the most sig cometh region for each input region
  pva.col <- grep("_pVal",colnames(linear.results.final),value = TRUE)
  colnames(linear.results.final)[grep("inputRegion",colnames(linear.results.final))] <- "inputRegion"
  
  # !! is used to unquote an input 
  # https://dplyr.tidyverse.org/articles/programming.html
  result_sig <- linear.results.final %>%
    group_by(inputRegion) %>%
    filter((!!as.symbol(pva.col)) == min(!!as.symbol(pva.col)))
  
  data.frame(result_sig, stringsAsFactors = FALSE)
}

London_PFC <- preMeta(cohort = "London")
dim(London_PFC)

MtSinai <- preMeta(cohort = "MtSinai") 
dim(MtSinai)

ROSMAP <- preMeta(cohort = "ROSMAP")
dim(ROSMAP)

3.2 Merge cohorts

### merge datasets
cohort_ls <- list(
  London_PFC = London_PFC,
  MtSinai = MtSinai,
  ROSMAP = ROSMAP
)

### outer join input region
multi_cohorts <- Reduce(
  function(x,y, ...) merge(x, y, by = "inputRegion", all = TRUE, ...),
  cohort_ls
)

3.3 Meta analysis

library(meta)

doParallel::registerDoParallel(cores = parallel::detectCores()/2)
meta_df <- plyr::adply(.data = multi_cohorts, 
                       .margins = 1, 
                       .fun =  function(rowOne_df){
                         
                         est <- rowOne_df[grep("estimate",colnames(rowOne_df))] %>% as.numeric
                         
                         direction <-  paste(ifelse(
                           is.na(est), ".",
                           ifelse(est > 0, "+", "-")
                         ),collapse = "")
                         
                         se <- rowOne_df[grep("se",colnames(rowOne_df))] %>% as.numeric
                         cohort <- gsub("_se","",grep("se",colnames(rowOne_df),value = T))
                         rowOne_reform_df <- data.frame(
                           cohort = cohort,
                           est = est,
                           se = se,
                           stringsAsFactors = FALSE
                         )
                         
                         f <- metagen(
                           TE = est,
                           seTE = se,
                           data = rowOne_reform_df
                         )
                         
                         tibble::tibble(
                           inputRegion = rowOne_df$inputRegion,
                           estimate = f$TE.fixed,
                           se = f$seTE.fixed,
                           pVal.fixed = f$pval.fixed,
                           pVal.random = f$pval.random,
                           pValQ = f$pval.Q,
                           direction = direction
                         )
                       }  , .progress = "time",
                       .parallel = TRUE,
                       .id = NULL
)

### create final pVal
meta_df$pVal.final <- ifelse(
  meta_df$pValQ > 0.05, meta_df$pVal.fixed, meta_df$pVal.random
)

### calculate fdr
meta_df$fdr <- p.adjust(meta_df$pVal.final, method = "fdr")

### order meta_df
meta_final_df <- meta_df[, c(grep("_",colnames(meta_df),invert = T),
                             grep("_",colnames(meta_df),invert = F))]
meta_final_ordered_df <- meta_final_df[order(meta_final_df$pVal.final),]

3.4 Add annotation to input regions

### find annotations
library(coMethDMR)

splited_input <- meta_final_ordered_df %>% 
  tidyr::separate(col = inputRegion,into =  c("chrom", "start", "end"),remove = FALSE)
splited_input_annot <- AnnotateResults(splited_input[,c("chrom", "start", "end")],nCores_int = 10) 

### merge annotation with meta analysis data
meta_ordered_withAnnot_df <- cbind(
  meta_final_ordered_df, splited_input_annot[, 4:7]
)

### order columns
meta_ordered_withAnnot_df <- meta_ordered_withAnnot_df %>% 
  dplyr::select(c(1,
                  (ncol(meta_ordered_withAnnot_df) - 3):ncol(meta_ordered_withAnnot_df),
                  2:(ncol(meta_ordered_withAnnot_df) - 4))
  )
### save dataset
write.csv(
  meta_ordered_withAnnot_df,
  paste0(dir.result, "meta_analysis_matched_df.csv"),
  row.names = FALSE
)

meta_all_sig <- meta_ordered_withAnnot_df[
  !is.na(meta_ordered_withAnnot_df$fdr) &
    (meta_ordered_withAnnot_df$fdr < 0.05),  
  ] #dim: 102 41
row.names(meta_all_sig) <- NULL

write.csv(
  meta_all_sig,
  paste0(dir.result, "meta_analysis_matched_sig_df.csv"),
  row.names = FALSE
)
library(GenomicRanges)

3.5 Delete sig DMRs with cross-hybridizing or smoking probes

### call in all cross hybridizing probes
eh = ExperimentHub()
query(eh, "DMRcate")
crosshyb <- eh[["EH3129"]]

### Get significant smoking probes
smoking.file <- "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5267325/bin/NIHMS817273-supplement-001506_-_Supplemental_Tables.xlsx"

if(!file.exists(basename(smoking.file))) downloader::download(smoking.file,basename(smoking.file))

smoking <- readxl::read_xlsx(
  basename(smoking.file),
  sheet = "02",
  skip = 2
)
smoking.sig.probes <- smoking %>% dplyr::filter(`P-value` < 1*10^(-7)) %>% pull("Probe ID") 
length(smoking.sig.probes)

### Call in meta analysis final results
meta_all <- read.csv(
  paste0(dir.result, "meta_analysis_matched_sig_df.csv")
) #dim: 102 41

### Find files with regions and probes
files <- dir(pattern = paste0(".*_residuals_cometh_input_ls.rds"),
             recursive = T,
             full.names = TRUE,
             ignore.case = T)
files <- grep("GASPARONI|LONDON_PFC|MtSinai|ROSMAP",files,value = TRUE,ignore.case = TRUE)

### Read files and Limit the cohort_ls to cohort_coMethRegion in meta_all 
cometh.probes.list <- lapply(files, function (f){
  print(f)
  all.clusters <- readRDS(f)$coMeth_ls # Read files
  cohort <- f %>% dirname %>% dirname %>% basename # get cohort from folder name
  
  col <- grep(paste0(cohort,"_coMethRegion"),
              colnames(meta_all),
              ignore.case = TRUE,
              value = TRUE) # get column with cohort sig regions
  
  # keep sig regions only
  all.clusters[names(all.clusters) %in% meta_all[[col]]]
})
names(cometh.probes.list) <-  files %>% dirname %>% dirname %>% basename

lapply(cometh.probes.list,length)

### Map probes in each region to smoking and crosshybrdizing
extractCrosHybridization <- function(probes.list){
  crosshyb.extracted <- plyr::laply(probes.list,function(probes){
    paste(intersect(probes, crosshyb), 
          collapse = ", "
    )
  })
  smoking.extracted <- plyr::laply(probes.list,function(probes){
    paste(intersect(probes, smoking.sig.probes), 
          collapse = ", "
    )
  })
  tibble::tibble(
    "coMethRegion" = names(probes.list),
    "crossHyb" = crosshyb.extracted, 
    "crossHyb_bi" = ifelse(crosshyb.extracted == "",0,1),
    "smoke" = smoking.extracted,
    "smoke_bi" = ifelse(smoking.extracted == "",0,1)
  )
}

Gasparoni_crossHyb_df <- extractCrosHybridization(cometh.probes.list$GASPARONI)
colnames(Gasparoni_crossHyb_df) <- paste0("GASPARONI_",colnames(Gasparoni_crossHyb_df))
plyr::count(
  Gasparoni_crossHyb_df, 
  vars = grep("_bi",colnames(Gasparoni_crossHyb_df),value = TRUE)
)

London_PFC_crossHyb_df <- extractCrosHybridization(cometh.probes.list$LONDON)
colnames(London_PFC_crossHyb_df) <- paste0("London_",colnames(London_PFC_crossHyb_df))
plyr::count(
  London_PFC_crossHyb_df, 
  vars = grep("_bi",colnames(London_PFC_crossHyb_df),value = TRUE)
)

MtSinai_crossHyb_df <- extractCrosHybridization(cometh.probes.list$MtSinai)
colnames(MtSinai_crossHyb_df) <- paste0("MtSinai_",colnames(MtSinai_crossHyb_df))
plyr::count(
  MtSinai_crossHyb_df, 
  vars = grep("_bi",colnames(MtSinai_crossHyb_df),value = TRUE)
)

ROSMAP_crossHyb_df <- extractCrosHybridization(cometh.probes.list$ROSMAP)
colnames(ROSMAP_crossHyb_df) <- paste0("ROSMAP_",colnames(ROSMAP_crossHyb_df))
plyr::count(ROSMAP_crossHyb_df, vars = grep("_bi",colnames(ROSMAP_crossHyb_df),value = TRUE))

### Merge smoking and crossHyb probes  information with meta analysis results 
meta_all_final <- meta_all %>% left_join(Gasparoni_crossHyb_df)  %>% 
  left_join(London_PFC_crossHyb_df) %>% 
  left_join(MtSinai_crossHyb_df) %>% 
  left_join(ROSMAP_crossHyb_df)

### Add information with input regions with any cross hybridization in cohorts 
meta_all_final$crossHyb_bi <- rowSums(meta_all_final[,grep("crossHyb_bi",colnames(meta_all_final))],na.rm = TRUE) > 0
meta_all_final$smoke_bi <- rowSums(meta_all_final[,grep("smoke_bi",colnames(meta_all_final))],na.rm = TRUE) > 0

# Sort by region meta analysis FDR
# Cluster columns of the projects together
meta_all_final <- meta_all_final[
  order(meta_all_final$fdr),
  c(grep("Gasparoni|MtSinai|London|ROSMAP",colnames(meta_all_final),ignore.case = TRUE,invert = TRUE),
    grep("Gasparoni",colnames(meta_all_final),ignore.case = TRUE),
    grep("MtSinai",colnames(meta_all_final),ignore.case = TRUE),
    grep("London",colnames(meta_all_final),ignore.case = TRUE),
    grep("ROSMAP",colnames(meta_all_final),ignore.case = TRUE)
  )
  ]
str(meta_all_final)

meta_sig_final <- meta_all_final[
  meta_all_final$crossHyb_bi == 0 &
    meta_all_final$smoke_bi == 0,
  ] #dim: 83 59
### Save
write.csv(
  meta_sig_final %>% as.data.frame,
  paste0(dir.result, "meta_analysis_matched_sig_no_crossHyb_smoking_df.csv"),
  row.names = FALSE
)

3.6 Results

4 Meta-analysis Single CpGs

  1. merge cohorts
  2. meta analysis
  3. add annotation
  4. Delete cross-hybridizing and smoking probes from sig probes

4.1 Import datasets

library(dplyr)
library(tidyr)

results.files <- dir("matched_analysis_results/",
                     pattern = "single_cpg_linear_df.csv",
                     recursive = TRUE,
                     full.names = TRUE,
                     ignore.case = TRUE)

for(i in results.files){
  data <- readr::read_csv(i)
  dataset <- unlist(stringr::str_split(basename(i),"\\/|\\_"))[1]  %>% as.character()
  aux <- paste0(dataset,c("_estimate", "_se", "_pValue"))
  colnames(data) <- c("cpg", aux)
  assign(dataset,data)
}

4.2 Merge cohorts

cohort_ls <- list(
  Gasparoni = Gasparoni,
  London_PFC = London,
  MtSinai = MtSinai,
  ROSMAP = ROSMAP
)

### outer join input region
multi_cohorts <- Reduce(
  function(x,y, ...) merge(x, y, by = "cpg", all = TRUE, ...),
  cohort_ls
) #dim: 450793 13
dim(multi_cohorts)

4.3 Meta analysis

### calculate meta analysis z scores and p values
library(meta)

doParallel::registerDoParallel(cores = parallel::detectCores()/2)
meta_df <- plyr::adply(.data = multi_cohorts, 
                       .margins = 1, 
                       .fun =  function(rowOne_df){
                         
                         est <- rowOne_df[grep("estimate",colnames(rowOne_df))] %>% as.numeric
                         
                         direction <-  paste(ifelse(
                           is.na(est), ".",
                           ifelse(est > 0, "+", "-")
                         ),collapse = "")
                         
                         se <- rowOne_df[grep("se",colnames(rowOne_df))] %>% as.numeric
                         cohort <- gsub("_se","",grep("se",colnames(rowOne_df),value = T))
                         rowOne_reform_df <- data.frame(
                           cohort = cohort,
                           est = est,
                           se = se,
                           stringsAsFactors = FALSE
                         )
                         
                         f <- metagen(
                           TE = est,
                           seTE = se,
                           data = rowOne_reform_df
                         )
                         
                         tibble::tibble(
                           cpg = rowOne_df$cpg,
                           estimate = f$TE.fixed,
                           se = f$seTE.fixed,
                           pVal.fixed = f$pval.fixed,
                           pVal.random = f$pval.random,
                           pValQ = f$pval.Q,
                           direction = direction
                         )
                       }  , .progress = "time",
                       .parallel = TRUE,
                       .id = NULL
)

### create final pVal
meta_df$pVal.final <- ifelse(
  meta_df$pValQ > 0.05, meta_df$pVal.fixed, meta_df$pVal.random
)

### calculate fdr
meta_df$fdr <- p.adjust(meta_df$pVal.final, method = "fdr")

### order meta_df
meta_final_df <- meta_df[, c(grep("_",colnames(meta_df),invert = T),
                             grep("_",colnames(meta_df),invert = F))]
meta_final_ordered_df <- meta_final_df[order(meta_final_df$pVal.final),]

4.4 Add annotation

library(sesame)
probes.info <- sesameDataGet("HM450.hg19.manifest")
probes.info <- probes.info[meta_final_ordered_df$cpg %>% as.character()] %>% as.data.frame %>% dplyr::select(c("seqnames","start","end"))

result_annot_df <- merge(
  y = meta_final_ordered_df,
  x = probes.info,
  by.y = "cpg",
  by.x = "row.names",
  all.y = TRUE,
  sort = FALSE
)

### final raw data
write.csv(
  result_annot_df  %>% as.data.frame(),
  paste0(dir.result, "meta_analysis_matched_single_cpg_df.csv"),
  row.names = FALSE
)

### prepare data for comb-p
result_for_combp_df <- result_annot_df[
  , c("seqnames", "start", "end", "pVal.final")
  ]
colnames(result_for_combp_df)[c(1,4)] <- c("chr", "pValue")
result_for_combp_df$chr <- as.character(result_for_combp_df$chr)

write.csv(
  result_for_combp_df,
  paste0(dir.result, "meta_analysis_matched_single_cpg_df_for_combp.csv"),
  row.names = FALSE
)

4.5 Delete cross-hybridizing and smoking probes from sig probes

### Exclude non-significant probes
result_annot_sig_df <- result_annot_df %>% filter(fdr < 0.05) 

write.csv(
  result_annot_sig_df %>% as.data.frame(),
  paste0(dir.result, "meta_analysis_matched_single_cpg_sig_df.csv"),
  row.names = FALSE
)

### Get crosshybrdizing probes
library(ExperimentHub)

eh = ExperimentHub()
query(eh, "DMRcate")
crosshyb <- eh[["EH3129"]]


### Get significant smoking probes
smoking.file <- "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5267325/bin/NIHMS817273-supplement-001506_-_Supplemental_Tables.xlsx"

if(!file.exists(basename(smoking.file))) downloader::download(smoking.file,basename(smoking.file))

smoking <- readxl::read_xlsx(
  basename(smoking.file),
  sheet = "02",
  skip = 2
)
smoking.sig.probes <- smoking %>% dplyr::filter(`P-value` < 1*10^(-7)) %>% pull("Probe ID")


### Exclude cross-hybridizing and smoking probes
result_annot_sig_df$Row.names <- as.character(result_annot_sig_df$Row.names)

result_annot_sig_no_crossHyb_smoking_df <- result_annot_sig_df[
  !((result_annot_sig_df$Row.names %in% crosshyb) |
      (result_annot_sig_df$Row.names %in% smoking.sig.probes)),
  ] #dim: 642 24

### Add annotation
library(coMethDMR)
result_annot_sig_no_crossHyb_smoking_df$chrom <- as.character(
  result_annot_sig_no_crossHyb_smoking_df$seqnames
)
result_final <- AnnotateResults(result_annot_sig_no_crossHyb_smoking_df)

result_final_ordered <- result_final[
  ,c(1:4, 26:29, 5:24)
  ]
write.csv(
  result_final_ordered %>% as.data.frame(),
  paste0(dir.result, "meta_analysis_matched_single_cpg_sig_no_crossHyb_smoking_df.csv"),
  row.names = FALSE
)           

4.6 Results

5 Session information

devtools::session_info()
## ─ Session info ───────────────────────────────────────────────────────────────
##  setting  value                                             
##  version  R Under development (unstable) (2020-02-25 r77857)
##  os       macOS Catalina 10.15.3                            
##  system   x86_64, darwin15.6.0                              
##  ui       X11                                               
##  language (EN)                                              
##  collate  en_US.UTF-8                                       
##  ctype    en_US.UTF-8                                       
##  tz       America/New_York                                  
##  date     2020-04-26                                        
## 
## ─ Packages ───────────────────────────────────────────────────────────────────
##  package                * version     date       lib
##  AnnotationDbi            1.49.1      2020-01-25 [1]
##  AnnotationHub          * 2.19.11     2020-04-16 [1]
##  assertthat               0.2.1       2019-03-21 [1]
##  backports                1.1.6       2020-04-05 [1]
##  base64enc                0.1-3       2015-07-28 [1]
##  Biobase                  2.47.3      2020-03-16 [1]
##  BiocFileCache          * 1.11.6      2020-04-16 [1]
##  BiocGenerics           * 0.33.3      2020-03-23 [1]
##  BiocManager              1.30.10     2019-11-16 [1]
##  BiocVersion              3.11.1      2019-11-13 [1]
##  bit                      1.1-15.2    2020-02-10 [1]
##  bit64                    0.9-7       2017-05-08 [1]
##  bitops                   1.0-6       2013-08-17 [1]
##  blob                     1.2.1       2020-01-20 [1]
##  callr                    3.4.3       2020-03-28 [1]
##  class                    7.3-16      2020-03-25 [1]
##  cli                      2.0.2       2020-02-28 [1]
##  crayon                   1.3.4       2017-09-16 [1]
##  curl                     4.3         2019-12-02 [1]
##  DBI                      1.1.0       2019-12-15 [1]
##  dbplyr                 * 1.4.3       2020-04-19 [1]
##  desc                     1.2.0       2018-05-01 [1]
##  devtools                 2.3.0       2020-04-10 [1]
##  digest                   0.6.25      2020-02-23 [1]
##  dplyr                  * 0.8.99.9002 2020-04-02 [1]
##  e1071                  * 1.7-3       2019-11-26 [1]
##  ellipsis                 0.3.0       2019-09-20 [1]
##  evaluate                 0.14        2019-05-28 [1]
##  ExperimentHub          * 1.13.7      2020-04-16 [1]
##  fansi                    0.4.1       2020-01-08 [1]
##  fastmap                  1.0.1       2019-10-08 [1]
##  fs                       1.4.1       2020-04-04 [1]
##  generics                 0.0.2       2018-11-29 [1]
##  GenomeInfoDb           * 1.23.17     2020-04-13 [1]
##  GenomeInfoDbData         1.2.3       2020-04-20 [1]
##  GenomicRanges          * 1.39.3      2020-04-08 [1]
##  glue                     1.4.0       2020-04-03 [1]
##  hms                      0.5.3       2020-01-08 [1]
##  htmltools                0.4.0       2019-10-04 [1]
##  httpuv                   1.5.2       2019-09-11 [1]
##  httr                     1.4.1       2019-08-05 [1]
##  interactiveDisplayBase   1.25.0      2019-11-06 [1]
##  IRanges                * 2.21.8      2020-03-25 [1]
##  jsonlite                 1.6.1       2020-02-02 [1]
##  knitr                    1.28        2020-02-06 [1]
##  later                    1.0.0       2019-10-04 [1]
##  lifecycle                0.2.0       2020-03-06 [1]
##  magrittr                 1.5         2014-11-22 [1]
##  memoise                  1.1.0       2017-04-21 [1]
##  mime                     0.9         2020-02-04 [1]
##  pillar                   1.4.3       2019-12-20 [1]
##  pkgbuild                 1.0.6       2019-10-09 [1]
##  pkgconfig                2.0.3       2019-09-22 [1]
##  pkgload                  1.0.2       2018-10-29 [1]
##  prettyunits              1.1.1       2020-01-24 [1]
##  processx                 3.4.2       2020-02-09 [1]
##  promises                 1.1.0       2019-10-04 [1]
##  ps                       1.3.2       2020-02-13 [1]
##  purrr                    0.3.4       2020-04-17 [1]
##  R6                       2.4.1       2019-11-12 [1]
##  rappdirs                 0.3.1       2016-03-28 [1]
##  Rcpp                     1.0.4.6     2020-04-09 [1]
##  RCurl                    1.98-1.2    2020-04-18 [1]
##  readr                    1.3.1       2018-12-21 [1]
##  remotes                  2.1.1       2020-02-15 [1]
##  rlang                    0.4.5.9000  2020-03-20 [1]
##  rmarkdown                2.1         2020-01-20 [1]
##  rprojroot                1.3-2       2018-01-03 [1]
##  RSQLite                  2.2.0       2020-01-07 [1]
##  S4Vectors              * 0.25.15     2020-04-04 [1]
##  sessioninfo              1.1.1       2018-11-05 [1]
##  shiny                    1.4.0.2     2020-03-13 [1]
##  stringi                  1.4.6       2020-02-17 [1]
##  stringr                  1.4.0       2019-02-10 [1]
##  testthat                 2.3.2       2020-03-02 [1]
##  tibble                   3.0.1       2020-04-20 [1]
##  tidyselect               1.0.0       2020-01-27 [1]
##  usethis                  1.6.0       2020-04-09 [1]
##  vctrs                    0.2.99.9010 2020-04-02 [1]
##  withr                    2.1.2       2018-03-15 [1]
##  xfun                     0.13        2020-04-13 [1]
##  xtable                   1.8-4       2019-04-21 [1]
##  XVector                  0.27.2      2020-03-24 [1]
##  yaml                     2.2.1       2020-02-01 [1]
##  zlibbioc                 1.33.1      2020-01-24 [1]
##  source                                     
##  Bioconductor                               
##  Bioconductor                               
##  CRAN (R 4.0.0)                             
##  CRAN (R 4.0.0)                             
##  CRAN (R 4.0.0)                             
##  Bioconductor                               
##  Bioconductor                               
##  Bioconductor                               
##  CRAN (R 4.0.0)                             
##  Bioconductor                               
##  CRAN (R 4.0.0)                             
##  CRAN (R 4.0.0)                             
##  CRAN (R 4.0.0)                             
##  CRAN (R 4.0.0)                             
##  CRAN (R 4.0.0)                             
##  CRAN (R 4.0.0)                             
##  CRAN (R 4.0.0)                             
##  CRAN (R 4.0.0)                             
##  CRAN (R 4.0.0)                             
##  CRAN (R 4.0.0)                             
##  CRAN (R 4.0.0)                             
##  CRAN (R 4.0.0)                             
##  CRAN (R 4.0.0)                             
##  CRAN (R 4.0.0)                             
##  Github (tidyverse/dplyr@affb977)           
##  CRAN (R 4.0.0)                             
##  CRAN (R 4.0.0)                             
##  CRAN (R 4.0.0)                             
##  Bioconductor                               
##  CRAN (R 4.0.0)                             
##  CRAN (R 4.0.0)                             
##  CRAN (R 4.0.0)                             
##  CRAN (R 4.0.0)                             
##  Bioconductor                               
##  Bioconductor                               
##  Github (Bioconductor/GenomicRanges@70e6e69)
##  CRAN (R 4.0.0)                             
##  CRAN (R 4.0.0)                             
##  CRAN (R 4.0.0)                             
##  CRAN (R 4.0.0)                             
##  CRAN (R 4.0.0)                             
##  Bioconductor                               
##  Bioconductor                               
##  CRAN (R 4.0.0)                             
##  CRAN (R 4.0.0)                             
##  CRAN (R 4.0.0)                             
##  CRAN (R 4.0.0)                             
##  CRAN (R 4.0.0)                             
##  CRAN (R 4.0.0)                             
##  CRAN (R 4.0.0)                             
##  CRAN (R 4.0.0)                             
##  CRAN (R 4.0.0)                             
##  CRAN (R 4.0.0)                             
##  CRAN (R 4.0.0)                             
##  CRAN (R 4.0.0)                             
##  CRAN (R 4.0.0)                             
##  CRAN (R 4.0.0)                             
##  CRAN (R 4.0.0)                             
##  CRAN (R 4.0.0)                             
##  CRAN (R 4.0.0)                             
##  CRAN (R 4.0.0)                             
##  CRAN (R 4.0.0)                             
##  CRAN (R 4.0.0)                             
##  CRAN (R 4.0.0)                             
##  CRAN (R 4.0.0)                             
##  Github (r-lib/rlang@a90b04b)               
##  CRAN (R 4.0.0)                             
##  CRAN (R 4.0.0)                             
##  CRAN (R 4.0.0)                             
##  Bioconductor                               
##  CRAN (R 4.0.0)                             
##  CRAN (R 4.0.0)                             
##  CRAN (R 4.0.0)                             
##  CRAN (R 4.0.0)                             
##  CRAN (R 4.0.0)                             
##  CRAN (R 4.0.0)                             
##  CRAN (R 4.0.0)                             
##  CRAN (R 4.0.0)                             
##  Github (r-lib/vctrs@fd24927)               
##  CRAN (R 4.0.0)                             
##  CRAN (R 4.0.0)                             
##  CRAN (R 4.0.0)                             
##  Bioconductor                               
##  CRAN (R 4.0.0)                             
##  Bioconductor                               
## 
## [1] /Library/Frameworks/R.framework/Versions/4.0/Resources/library
---
title: "Meta-analysis dataset"
author: "Lanyu Zhang, Tiago C. Silva, Lily Wang"
date: "`r Sys.Date()`"
output:
  rmarkdown::html_document:
    theme: lumen
    toc: true
    number_sections: true
    df_print: paged
    code_download: true
    toc_float:
      collapsed: yes
    toc_depth: 3
editor_options:
  chunk_output_type: inline    
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, warning = FALSE)
```


# Main libraries and configuration

```{R}
dir.result <- "matched_analysis_results/"
dir.dmr <- "meta_analysis_region_results/step4_dmr_vs_cpgs/" 
dir.single.cpg <- "meta_analysis_single_cpg_results/"        
dir.combp <- "../../Michael/data_for_combp-matchedAnalysis_1-29-2020/results/"  
dir.create(dir.result)
```

# Matched analysis

```{R Matched_libs,message = FALSE, results = "hide"}
library(dplyr)
library(ExperimentHub)
library(GenomicRanges)
library(e1071)
devtools::source_gist("https://gist.github.com/tiagochst/d3a7b1639acf603916c315d23b1efb3e")
```

```{R}
files <- dir(path = "DATASETS",
             pattern = paste0(".*neuro"),
             recursive = TRUE,
             full.names = TRUE,
             ignore.case = TRUE)
```

```{R, echo = FALSE}
files <- files[c(1,2,4,5)]
```

```{R}
files
pheno.ls <- lapply(files, function (f){
  readRDS(f)
})
names(pheno.ls) <- files %>% dirname %>% dirname %>% basename
lapply(pheno.ls,dim)
```

## Rosmap dataset

```{R, eval = FALSE}
rosmap <- pheno.ls$ROSMAP[, c("Sample", "msex", "braaksc", "age_death")]
rosmap$age_death <- as.numeric(as.character(rosmap$age_death))
rosmap$age <- ifelse(is.na(rosmap$age_death), 90, rosmap$age_death)
rosmap$status <- ifelse(rosmap$braaksc < 3, "Control", "Case")
rosmap$status <- as.factor(rosmap$status)
rosmap$sex <- ifelse(rosmap$msex == 1, "Male", "Female")
rosmap$sex <- as.factor(rosmap$sex)
str(rosmap)


{
  set.seed(5) # matchControl uses a distance calculation, if ties, it uses sample function
  m_ros <- matchControls(
    formula = status ~ age,
    data = rosmap,
    contlabel = "Case",
    caselabel = "Control"
  )
}

outFile <- data.frame(
  "matchedControls" = rosmap[m_ros$cases, "Sample"], 
  "matchedCases" = rosmap[m_ros$controls, "Sample"]
)

outFileControl <- merge(outFile, rosmap[, c("Sample", "status", "age", "sex")], 
                        by.x = "matchedControls",
                        by.y = "Sample")

colnames(outFileControl)[3:5] <- c("status_control", "age_control", "sex_control")

outFileControlCases <- merge(outFileControl, 
                             rosmap[, c("Sample", "status", "age", "sex")],
                             by.x = "matchedCases",
                             by.y = "Sample")

colnames(outFileControlCases)[6:8] <- c("status_case", "age_case", "sex_case")

outFileControlCases <- outFileControlCases[, c(2, 3, 4, 5, 1, 6, 7, 8)]

write.csv(
  outFileControlCases,
  paste0(dir.result, "Rosmap_matchedCaseControls.csv"),
  row.names = FALSE
)

ROSMAP_matchAgeSex <-  outFileControlCases %>% 
  dplyr::filter(abs(age_control - age_case) <= 1)
dim(ROSMAP_matchAgeSex)

table(ROSMAP_matchAgeSex$status_control, ROSMAP_matchAgeSex$sex_control)
table(ROSMAP_matchAgeSex$status_case, ROSMAP_matchAgeSex$sex_case)

ROSMAP_samples <- c(
  as.character(ROSMAP_matchAgeSex$matchedControls),
  as.character(ROSMAP_matchAgeSex$matchedCases)
)
saveRDS(
  ROSMAP_samples,
  paste0(dir.result, "ROSMAP_samples.RDS")
)
```

### Samples

```{R}
length(readRDS(paste0(dir.result, "ROSMAP_samples.RDS")))
```

### Linear model

```{R ROSMAP_DMR, eval = FALSE}
### Import datasets
cohort <- "ROSMAP"
pheno <- "pheno"

info_df <- readRDS(
  dir("DATASETS/ROSMAP////",pattern = "info",recursive = TRUE,full.names = TRUE)
)
mediansMval_df <- readRDS(
  dir("DATASETS/ROSMAP/",pattern = "medians",recursive = TRUE,full.names = TRUE)
)
pheno_df <- readRDS(
  dir("DATASETS/ROSMAP/",pattern = "Neuron",recursive = TRUE,full.names = TRUE)
)
samples <- readRDS(
  paste0(dir.result, "ROSMAP_samples.RDS")
)

### Limit samples
mediansMval_df <- mediansMval_df[, samples]
pheno_df <- pheno_df[pheno_df$Sample %in% samples, ]

mediansMval_df <- mediansMval_df[, pheno_df$Sample]

identical(pheno_df$Sample, colnames(mediansMval_df))

str(pheno_df)

pheno_df$age_death <- as.numeric(as.character(pheno_df$age_death))
pheno_df$msex <- as.factor(as.character(pheno_df$msex))
pheno_df$Slide <- as.factor(as.character(pheno_df$Slide))
pheno_df$batch <- as.factor(as.character(pheno_df$batch))

predictors_char <- "braaksc"
covariates_char <- c("msex", "prop.neuron", "Slide", "batch")


res_df <- TestAllRegions_noInfo(
  predictors_char = predictors_char,
  covariates_char = covariates_char,
  pheno_df = pheno_df,
  summarizedRegions_df = mediansMval_df,
  cores = 4
)

colnames(res_df) <- c(
  paste0(cohort, "_estimate"),
  paste0(cohort, "_se"),
  paste0(cohort, "_pVal"),
  paste0(cohort, "_fdr")
)

res_withInfo_df <- cbind(info_df, res_df)

saveRDS(
  res_withInfo_df,
  paste0(dir.result, cohort, "_matched_linear_df.rds")
)
```

### Single cpg linear model

```{R ROSMAP_singleCpG, eval = TRUE}
### Import datasets
beta_mat <- readRDS(
  "DATASETS/ROSMAP/step7_pca_filtering/ROSMAP_QNBMIQ_PCfiltered.RDS"
) 
pheno_df <- readRDS(
  dir("DATASETS/ROSMAP/",pattern = "Neuron",recursive = TRUE,full.names = TRUE)
)
samples <- readRDS(
  paste0(dir.result, "ROSMAP_samples.RDS")
)

### Limit samples
beta_mat <- beta_mat[, samples]
pheno_df <- pheno_df[pheno_df$Sample %in% samples, ]

beta_mat <- beta_mat[, pheno_df$Sample]

identical(pheno_df$Sample, colnames(beta_mat))

### Compute M values
mval_mat <- log2(beta_mat / (1 - beta_mat))

pheno_df$sex <- as.factor(pheno_df$sex)
pheno_df$slide <- as.factor(pheno_df$Sentrix_ID)
pheno_df$batch <- as.factor(pheno_df$batch)

str(pheno_df)

is(pheno_df$braaksc,"numeric")
is(pheno_df$prop.neuron,"numeric")
```

```{R, eval = FALSE}
predictors_char <- "braaksc"
covariates_char <- c("sex", "prop.neuron", "slide", "batch")


doParallel::registerDoParallel(cores = parallel::detectCores()/2)
devtools::source_gist("https://gist.github.com/tiagochst/d3a7b1639acf603916c315d23b1efb3e")

results_ordered_df <- plyr::adply(mval_mat,1, function(row){
  
  sumOneRegion_df <- as.data.frame(t(row))
  
  result <- TestSingleRegion(
    predictors_char = predictors_char,
    covariates_char = covariates_char,
    pheno_df = pheno_df,
    sumOneRegion_df = sumOneRegion_df
  )
  result
}, .progress = "time",.parallel = TRUE,.id = "cpg")
colnames(results_ordered_df)[1] <- "cpg"

identical(row.names(mval_mat), results_ordered_df$cpg %>% as.character())

write.csv(
  results_ordered_df,
  paste0(dir.result, "ROSMAP_matched_single_cpg_linear_df.csv"),
  row.names = FALSE
)
```

```{R}
results_ordered_df <- readr::read_csv(
  paste0(dir.result, "ROSMAP_matched_single_cpg_linear_df.csv"),
  col_types = readr::cols()
)
dim(results_ordered_df)
results_ordered_df %>% head
```


## London dataset

```{R, eval = FALSE}
london <- pheno.ls$LONDON[, c("sample", "sex", "stage", "age.brain")]
london$status <- ifelse(london$stage < 3, "Control", "Case")
london$status <- as.factor(london$status)
london$sex <- as.factor(london$sex)
str(london)

{
  set.seed(5)
  m_london <- matchControls(
    formula = status ~ age.brain,
    data = london,
    contlabel = "Case",
    caselabel = "Control"
  )
}
outFile <-  data.frame(
  "matchedControls" = london[m_london$cases, "sample"],
  "matchedCases" = london[m_london$controls, "sample"]
)

outFileControl <- merge(outFile,
                        london[, c("sample", "status", "age.brain", "sex")],
                        by.x = "matchedControls",
                        by.y = "sample")
colnames(outFileControl)[3:5] <- c("status_control", "age_control", "sex_control")

outFileControlCases <- merge(outFileControl,
                             london[, c("sample", "status", "age.brain", "sex")],
                             by.x = "matchedCases",
                             by.y = "sample")
colnames(outFileControlCases)[6:8] <- c("status_case", "age_case", "sex_case")
outFileControlCases <- outFileControlCases[, c(2, 3, 4, 5, 1, 6, 7, 8)]

write.csv(
  outFileControlCases,
  paste0(dir.result, "London_matchedCaseControls.csv"),
  row.names = FALSE
)



London_matchAgeSex <- outFileControlCases %>% 
  dplyr::filter(abs(age_control - age_case) <= 1)

dim(London_matchAgeSex)

table(London_matchAgeSex$status_control, London_matchAgeSex$sex_control)
table(London_matchAgeSex$status_case, London_matchAgeSex$sex_case)
London_samples <- c(
  as.character(London_matchAgeSex$matchedControls),
  as.character(London_matchAgeSex$matchedCases)
)
saveRDS(
  London_samples,
  paste0(dir.result, "London_samples.RDS")
)

```



### Samples


```{R}
length(readRDS(paste0(dir.result, "London_samples.RDS")))
```



### Linear model


```{R London_DMR, eval = TRUE}
### Import datasets
cohort <- "London"
pheno <- "pheno"

info_df <- readRDS(
  dir("DATASETS/LONDON///",pattern = "info",recursive = TRUE,full.names = TRUE)
)
mediansMval_df <- readRDS(
  dir("DATASETS/LONDON/",pattern = "medians",recursive = TRUE,full.names = TRUE)
)
pheno_df <- readRDS(
  dir("DATASETS/LONDON/",pattern = "NeuronProp_df",recursive = TRUE,full.names = TRUE)
)
samples <- readRDS(
  paste0(dir.result, "London_samples.RDS")
)

### Limit samples
mediansMval_df <- mediansMval_df[, samples]
pheno_df <- pheno_df[pheno_df$sample %in% samples, ]

mediansMval_df <- mediansMval_df[, pheno_df$sample]

### Check variables before fitting model
pheno_df$Sample <- pheno_df$sample

identical(pheno_df$Sample, colnames(mediansMval_df))

str(pheno_df)

pheno_df$sex <- as.factor(pheno_df$sex)
pheno_df$slide <- as.factor(pheno_df$slide)
# If rosmap cohort, don't forget batch effect
```

```{R, eval = FALSE}
devtools::source_gist("https://gist.github.com/tiagochst/d3a7b1639acf603916c315d23b1efb3e")
predictors_char <- "stage"
covariates_char <- c("sex", "prop.neuron", "slide")

res_df <- TestAllRegions_noInfo(
  predictors_char = predictors_char,
  covariates_char = covariates_char,
  pheno_df = pheno_df,
  summarizedRegions_df = mediansMval_df,
  cores = 4
)

colnames(res_df) <- c(
  paste0(cohort, "_estimate"),
  paste0(cohort, "_se"),
  paste0(cohort, "_pVal"),
  paste0(cohort, "_fdr")
)

res_withInfo_df <- cbind(info_df, res_df)

saveRDS(
  res_withInfo_df,
  paste0(dir.result, cohort, "_matched_linear_df.rds")
)
```

### Single cpg linear model

```{R London_singleCpG, eval = TRUE}
### Import datasets
beta_mat <- readRDS(
  "DATASETS/LONDON/step5_pca_filtering/London_PFC_QNBMIQ_PCfiltered.RDS"
) 
pheno_df <- readRDS(
  dir("DATASETS/LONDON/",pattern = "NeuronProp_df",recursive = TRUE,full.names = TRUE)
)
samples <- readRDS(
  paste0(dir.result, "London_samples.RDS")
)

### Limit samples
pheno_df$Sample <- pheno_df$sample

beta_mat <- beta_mat[, samples]
pheno_df <- pheno_df[pheno_df$Sample %in% samples, ]

beta_mat <- beta_mat[, pheno_df$Sample]

identical(pheno_df$Sample, colnames(beta_mat))

### Compute M values
mval_mat <- log2(beta_mat / (1 - beta_mat))

pheno_df$sex <- as.factor(pheno_df$sex)
pheno_df$slide <- as.factor(pheno_df$slide)

str(pheno_df)

is(pheno_df$stage,"numeric")
is(pheno_df$prop.neuron,"numeric")
```

```{R, eval = FALSE}
predictors_char <- "stage"
covariates_char <- c("sex", "prop.neuron", "slide")


doParallel::registerDoParallel(cores = parallel::detectCores()/2)
devtools::source_gist("https://gist.github.com/tiagochst/d3a7b1639acf603916c315d23b1efb3e")

results_ordered_df <- plyr::adply(mval_mat,1, function(row){
  
  sumOneRegion_df <- as.data.frame(t(row))
  
  result <- TestSingleRegion(
    predictors_char = predictors_char,
    covariates_char = covariates_char,
    pheno_df = pheno_df,
    sumOneRegion_df = sumOneRegion_df
  )
  result
}, .progress = "time",.parallel = TRUE,.id = "cpg")
colnames(results_ordered_df)[1] <- "cpg"

identical(row.names(mval_mat), results_ordered_df$cpg %>% as.character())

write.csv(
  results_ordered_df,
  paste0(dir.result, "London_matched_single_cpg_linear_df.csv"),
  row.names = FALSE
)
```

```{R}
results_ordered_df <- readr::read_csv(
  paste0(dir.result, "London_matched_single_cpg_linear_df.csv"),
  col_types = readr::cols()
)
dim(results_ordered_df)
results_ordered_df %>% head
```


```{R, eval = FALSE, include = FALSE}

## Gasparoni dataset
gasparoni <- pheno.ls$GASPARONI[, c("sample", "sex", "stage", "age.brain")]
gasparoni <- gasparoni[, c("sample", "sex", "stage", "age.brain")]
gasparoni$status <- ifelse(gasparoni$stage < 3, "Control", "Case")
gasparoni$status <- as.factor(gasparoni$status)
gasparoni$sex <- as.factor(gasparoni$sex)
str(gasparoni)

{
  set.seed(5)
  m_gasparoni <- matchControls(
    formula = status ~ age.brain,
    data = gasparoni,
    contlabel = "Case",
    caselabel = "Control"
  )
}

outFile <- data.frame("matchedControls" = gasparoni[m_gasparoni$cases, "sample"], 
                      "matchedCases" = gasparoni[m_gasparoni$controls, "sample"]
)

outFileControl <- merge(outFile,
                        gasparoni[, c("sample", "status", "age.brain", "sex")],
                        by.x = "matchedControls",
                        by.y = "sample")
colnames(outFileControl)[3:5] <-   c("status_control", "age_control", "sex_control")

outFileControlCases <- merge(outFileControl,
                             gasparoni[, c("sample", "status", "age.brain", "sex")],
                             by.x = "matchedCases",
                             by.y = "sample")
colnames(outFileControlCases)[6:8] <- c("status_case", "age_case", "sex_case")

outFileControlCases <- outFileControlCases[, c(2, 3, 4, 5, 1, 6, 7, 8)]

table(outFileControlCases$status_control,
      outFileControlCases$sex_control)
table(outFileControlCases$status_case,
      outFileControlCases$sex_case)

write.csv(
  outFileControlCases,
  paste0(dir.result, "Gasparoni_matchedCaseControls.csv"),
  row.names = FALSE
)


Gasparoni_matchAgeSex <-  outFileControlCases %>% 
  dplyr::filter(abs(age_control - age_case) <= 1)
dim(Gasparoni_matchAgeSex)

table(Gasparoni_matchAgeSex$status_control, Gasparoni_matchAgeSex$sex_control)
table(Gasparoni_matchAgeSex$status_case, Gasparoni_matchAgeSex$sex_case)

Gasparoni_samples <- c(
  as.character(Gasparoni_matchAgeSex$matchedControls),
  as.character(Gasparoni_matchAgeSex$matchedCases)
)
saveRDS(
  Gasparoni_samples,
  paste0(dir.result, "Gasparoni_samples.RDS")
)

```



```{R, eval= FALSE, include = FALSE}
### Samples
length(readRDS(paste0(dir.result, "Gasparoni_samples.RDS")))
```

```{R gasparoni_DMR, eval = FALSE, include = FALSE}
### Linear model
### Import datasets
cohort <- "Gasparoni"
pheno <- "pheno"

info_df <- readRDS(
  dir("DATASETS/GASPARONI/",pattern = "info",recursive = TRUE,full.names = TRUE)
)
mediansMval_df <- readRDS(
  dir("DATASETS/GASPARONI/",pattern = "medians",recursive = TRUE,full.names = TRUE)
)
pheno_df <- readRDS(
  dir("DATASETS/GASPARONI/step8_neuron_comp/",pattern = "",recursive = TRUE,full.names = TRUE)
)
samples <- readRDS(
  paste0(dir.result, "Gasparoni_samples.RDS")
)

### Limit samples
mediansMval_df <- mediansMval_df[, samples]
pheno_df <- pheno_df[pheno_df$sample %in% samples, ]

mediansMval_df <- mediansMval_df[, pheno_df$sample]

### Check variables before fitting model
pheno_df$Sample <- pheno_df$sample

identical(pheno_df$Sample, colnames(mediansMval_df))

str(pheno_df)

pheno_df$sex <- as.factor(pheno_df$sex)
pheno_df$slide <- as.factor(pheno_df$slide)
# If rosmap cohort, don't forget batch effect
```

```{R, eval = FALSE, include = FALSE}
devtools::source_gist("https://gist.github.com/tiagochst/d3a7b1639acf603916c315d23b1efb3e")
predictors_char <- "stage"
covariates_char <- c("sex", "prop.neuron", "slide")

res_df <- TestAllRegions_noInfo(
  predictors_char = predictors_char,
  covariates_char = covariates_char,
  pheno_df = pheno_df,
  summarizedRegions_df = mediansMval_df,
  cores = 4
)

colnames(res_df) <- c(
  paste0(cohort, "_estimate"),
  paste0(cohort, "_se"),
  paste0(cohort, "_pVal"),
  paste0(cohort, "_fdr")
)

res_withInfo_df <- cbind(info_df, res_df)

saveRDS(
  res_withInfo_df,
  paste0(dir.result, cohort, "_matched_linear_df.rds")
)

```

```{R gasparoni_singleCpG, eval = FALSE, include = FALSE}
### Single cpg linear model 

### Import datasets
beta_mat <- readRDS(
  "DATASETS/GASPARONI/step7_pca_filtering/Gasparoni_QNBMIQ_PCfiltered.RDS"
) 
pheno_df <- readRDS(
  dir("DATASETS/GASPARONI/",pattern = "Neuron",recursive = TRUE,full.names = TRUE)
)
samples <- readRDS(
  paste0(dir.result, "Gasparoni_samples.RDS")
)

### Limit samples
pheno_df$Sample <- pheno_df$sample

beta_mat <- beta_mat[, samples]
pheno_df <- pheno_df[pheno_df$Sample %in% samples, ]

beta_mat <- beta_mat[, pheno_df$Sample]

identical(pheno_df$Sample, colnames(beta_mat))

### Compute M values
mval_mat <- log2(beta_mat / (1 - beta_mat))

pheno_df$sex <- as.factor(pheno_df$sex)
pheno_df$slide <- as.factor(pheno_df$slide)

str(pheno_df)

is(pheno_df$stage,"numeric")
is(pheno_df$prop.neuron,"numeric")
```


```{R, eval = FALSE, include = FALSE}
predictors_char <- "stage"
covariates_char <- c("sex", "prop.neuron", "slide")


doParallel::registerDoParallel(cores = parallel::detectCores()/2)
devtools::source_gist("https://gist.github.com/tiagochst/d3a7b1639acf603916c315d23b1efb3e")

results_ordered_df <- plyr::adply(mval_mat,1, function(row){
  
  sumOneRegion_df <- as.data.frame(t(row))
  
  result <- TestSingleRegion(
    predictors_char = predictors_char,
    covariates_char = covariates_char,
    pheno_df = pheno_df,
    sumOneRegion_df = sumOneRegion_df
  )
  result
}, .progress = "time",.parallel = TRUE,.id = "cpg")
colnames(results_ordered_df)[1] <- "cpg"

identical(row.names(mval_mat), results_ordered_df$cpg %>% as.character())

write.csv(
  results_ordered_df,
  paste0(dir.result, "Gasparoni_matched_single_cpg_linear_df.csv"),
  row.names = FALSE
)
```
```{R}
results_ordered_df <- readr::read_csv(
  paste0(dir.result, "Gasparoni_matched_single_cpg_linear_df.csv"),
  col_types = readr::cols()
)
dim(results_ordered_df)
results_ordered_df %>% head
```


## Mt.Sinai dataset

```{R, eval = FALSE}
mtsinai <- pheno.ls$MtSinai[, c("sample", "sex", "stage", "age.brain")]
mtsinai$status <- ifelse(mtsinai$stage < 3, "Control", "Case")
mtsinai$status <- as.factor(mtsinai$status)
mtsinai$sex <- as.factor(mtsinai$sex)
str(mtsinai)

{
  set.seed(5)
  m_mtsinai <- matchControls(
    formula = status ~ age.brain,
    data = mtsinai,
    contlabel = "Case",
    caselabel = "Control"
  )
}
outFile <- data.frame(
  "matchedControls" = mtsinai[m_mtsinai$cases, "sample"], 
  "matchedCases" = mtsinai[m_mtsinai$controls, "sample"]
)

outFileControl <- merge(outFile, 
                        mtsinai[, c("sample", "status", "age.brain", "sex")], 
                        by.x = "matchedControls", 
                        by.y = "sample")

colnames(outFileControl)[3:5] <- c("status_control", "age_control", "sex_control")
outFileControlCases <-  merge(outFileControl, 
                              mtsinai[, c("sample", "status", "age.brain", "sex")], 
                              by.x = "matchedCases", 
                              by.y = "sample")
colnames(outFileControlCases)[6:8] <- c("status_case", "age_case", "sex_case")
outFileControlCases <- outFileControlCases[, c(2, 3, 4, 5, 1, 6, 7, 8)]

write.csv(
  outFileControlCases,
  paste0(dir.result, "MtSinai_matchedCaseControls.csv"),
  row.names = FALSE
)



MtSinai_matchAgeSex <-  outFileControlCases %>% 
  dplyr::filter(abs(age_control - age_case) <= 1)
dim(MtSinai_matchAgeSex)


table(MtSinai_matchAgeSex$status_control, MtSinai_matchAgeSex$sex_control)
table(MtSinai_matchAgeSex$status_case, MtSinai_matchAgeSex$sex_case)

MtSinai_samples <- c(
  as.character(MtSinai_matchAgeSex$matchedControls),
  as.character(MtSinai_matchAgeSex$matchedCases)
)

saveRDS(
  MtSinai_samples,
  paste0(dir.result, "MtSinai_samples.RDS")
)
```

### Samples

```{R}
length(readRDS(paste0(dir.result, "MtSinai_samples.RDS")))
```


### Linear model

```{R MtSinai_DMR, eval = TRUE}

### Import datasets
cohort <- "MtSinai"
pheno <- "pheno"

info_df <- readRDS(
  dir("DATASETS/MtSinai//",pattern = "info",recursive = TRUE,full.names = TRUE)
)
mediansMval_df <- readRDS(
  dir("DATASETS/MtSinai/",pattern = "medians",recursive = TRUE,full.names = TRUE)
)
pheno_df <- readRDS(
  dir("DATASETS/MtSinai/",pattern = "NeuronProp",recursive = TRUE,full.names = TRUE)
)
samples <- readRDS(
  paste0(dir.result, "MtSinai_samples.RDS")
)

### Limit samples
mediansMval_df <- mediansMval_df[, samples]
pheno_df <- pheno_df[pheno_df$sample %in% samples, ]

mediansMval_df <- mediansMval_df[, pheno_df$sample]

### Check variables before fitting model
pheno_df$Sample <- pheno_df$sample

identical(pheno_df$Sample, colnames(mediansMval_df))

str(pheno_df)

pheno_df$sex <- as.factor(pheno_df$sex)
pheno_df$slide <- as.factor(pheno_df$slide)
# If rosmap cohort, don't forget batch effect
```

```{R, eval = FALSE}
devtools::source_gist("https://gist.github.com/tiagochst/d3a7b1639acf603916c315d23b1efb3e")
predictors_char <- "stage"
covariates_char <- c("sex", "prop.neuron", "slide")

res_df <- TestAllRegions_noInfo(
  predictors_char = predictors_char,
  covariates_char = covariates_char,
  pheno_df = pheno_df,
  summarizedRegions_df = mediansMval_df,
  cores = 4
)

colnames(res_df) <- c(
  paste0(cohort, "_estimate"),
  paste0(cohort, "_se"),
  paste0(cohort, "_pVal"),
  paste0(cohort, "_fdr")
)

res_withInfo_df <- cbind(info_df, res_df)

saveRDS(
  res_withInfo_df,
  paste0(dir.result, cohort, "_matched_linear_df.rds")
)
```

### Single cpg linear model

```{R MtSinai_singleCpG, eval = TRUE}
### Import datasets
beta_mat <- readRDS(
  "DATASETS/MtSinai/step5_pca_filtering/MtSinai_QNBMIQ_PCfiltered.RDS"
) 
pheno_df <- readRDS(
  dir("DATASETS/MtSinai/",pattern = "Neuron",recursive = TRUE,full.names = TRUE)
)
samples <- readRDS(
  paste0(dir.result, "MtSinai_samples.RDS")
)

### Limit samples
pheno_df$Sample <- pheno_df$sample

beta_mat <- beta_mat[, samples]
pheno_df <- pheno_df[pheno_df$Sample %in% samples, ]

beta_mat <- beta_mat[, pheno_df$Sample]

identical(pheno_df$Sample, colnames(beta_mat))

### Compute M values
mval_mat <- log2(beta_mat / (1 - beta_mat))

pheno_df$sex <- as.factor(pheno_df$sex)
pheno_df$slide <- as.factor(pheno_df$slide)

str(pheno_df)

is(pheno_df$stage,"numeric")
is(pheno_df$prop.neuron,"numeric")
```

```{R, eval = FALSE}
predictors_char <- "stage"
covariates_char <- c("sex", "prop.neuron", "slide")


doParallel::registerDoParallel(cores = parallel::detectCores()/2)
devtools::source_gist("https://gist.github.com/tiagochst/d3a7b1639acf603916c315d23b1efb3e")

results_ordered_df <- plyr::adply(mval_mat,1, function(row){
  
  sumOneRegion_df <- as.data.frame(t(row))
  
  result <- TestSingleRegion(
    predictors_char = predictors_char,
    covariates_char = covariates_char,
    pheno_df = pheno_df,
    sumOneRegion_df = sumOneRegion_df
  )
  result
}, .progress = "time",.parallel = TRUE,.id = "cpg")
colnames(results_ordered_df)[1] <- "cpg"

identical(row.names(mval_mat), results_ordered_df$cpg %>% as.character())

write.csv(
  results_ordered_df,
  paste0(dir.result, "MtSinai_matched_single_cpg_linear_df.csv"),
  row.names = FALSE
)
```
```{R}
results_ordered_df <- readr::read_csv(
  paste0(dir.result, "MtSinai_matched_single_cpg_linear_df.csv"),
  col_types = readr::cols()
)
dim(results_ordered_df)
results_ordered_df %>% head
```


# Meta-analysis of Genomic Regions

## Import datasets and pre-process for each cohort 

```{R, eval = FALSE}
library(dplyr)
library(tidyr)
preMeta <- function(cohort){
  
  ### Load data
  file <- dir(path = "matched_analysis_results",
              pattern = paste0(cohort,".*matched_linear_df"),
              recursive = T,
              full.names = TRUE,
              ignore.case = T)
  linear.results.final <- readRDS(file)
  
  ### select the most sig cometh region for each input region
  pva.col <- grep("_pVal",colnames(linear.results.final),value = TRUE)
  colnames(linear.results.final)[grep("inputRegion",colnames(linear.results.final))] <- "inputRegion"
  
  # !! is used to unquote an input 
  # https://dplyr.tidyverse.org/articles/programming.html
  result_sig <- linear.results.final %>%
    group_by(inputRegion) %>%
    filter((!!as.symbol(pva.col)) == min(!!as.symbol(pva.col)))
  
  data.frame(result_sig, stringsAsFactors = FALSE)
}

London_PFC <- preMeta(cohort = "London")
dim(London_PFC)

MtSinai <- preMeta(cohort = "MtSinai") 
dim(MtSinai)

ROSMAP <- preMeta(cohort = "ROSMAP")
dim(ROSMAP)
```


## Merge cohorts 

```{R, eval = FALSE}
### merge datasets
cohort_ls <- list(
  London_PFC = London_PFC,
  MtSinai = MtSinai,
  ROSMAP = ROSMAP
)

### outer join input region
multi_cohorts <- Reduce(
  function(x,y, ...) merge(x, y, by = "inputRegion", all = TRUE, ...),
  cohort_ls
)
```

## Meta analysis

```{R, eval = FALSE}
library(meta)

doParallel::registerDoParallel(cores = parallel::detectCores()/2)
meta_df <- plyr::adply(.data = multi_cohorts, 
                       .margins = 1, 
                       .fun =  function(rowOne_df){
                         
                         est <- rowOne_df[grep("estimate",colnames(rowOne_df))] %>% as.numeric
                         
                         direction <-  paste(ifelse(
                           is.na(est), ".",
                           ifelse(est > 0, "+", "-")
                         ),collapse = "")
                         
                         se <- rowOne_df[grep("se",colnames(rowOne_df))] %>% as.numeric
                         cohort <- gsub("_se","",grep("se",colnames(rowOne_df),value = T))
                         rowOne_reform_df <- data.frame(
                           cohort = cohort,
                           est = est,
                           se = se,
                           stringsAsFactors = FALSE
                         )
                         
                         f <- metagen(
                           TE = est,
                           seTE = se,
                           data = rowOne_reform_df
                         )
                         
                         tibble::tibble(
                           inputRegion = rowOne_df$inputRegion,
                           estimate = f$TE.fixed,
                           se = f$seTE.fixed,
                           pVal.fixed = f$pval.fixed,
                           pVal.random = f$pval.random,
                           pValQ = f$pval.Q,
                           direction = direction
                         )
                       }  , .progress = "time",
                       .parallel = TRUE,
                       .id = NULL
)

### create final pVal
meta_df$pVal.final <- ifelse(
  meta_df$pValQ > 0.05, meta_df$pVal.fixed, meta_df$pVal.random
)

### calculate fdr
meta_df$fdr <- p.adjust(meta_df$pVal.final, method = "fdr")

### order meta_df
meta_final_df <- meta_df[, c(grep("_",colnames(meta_df),invert = T),
                             grep("_",colnames(meta_df),invert = F))]
meta_final_ordered_df <- meta_final_df[order(meta_final_df$pVal.final),]
```

## Add annotation to input regions

```{R, eval = FALSE}
### find annotations
library(coMethDMR)

splited_input <- meta_final_ordered_df %>% 
  tidyr::separate(col = inputRegion,into =  c("chrom", "start", "end"),remove = FALSE)
splited_input_annot <- AnnotateResults(splited_input[,c("chrom", "start", "end")],nCores_int = 10) 

### merge annotation with meta analysis data
meta_ordered_withAnnot_df <- cbind(
  meta_final_ordered_df, splited_input_annot[, 4:7]
)

### order columns
meta_ordered_withAnnot_df <- meta_ordered_withAnnot_df %>% 
  dplyr::select(c(1,
                  (ncol(meta_ordered_withAnnot_df) - 3):ncol(meta_ordered_withAnnot_df),
                  2:(ncol(meta_ordered_withAnnot_df) - 4))
  )
### save dataset
write.csv(
  meta_ordered_withAnnot_df,
  paste0(dir.result, "meta_analysis_matched_df.csv"),
  row.names = FALSE
)

meta_all_sig <- meta_ordered_withAnnot_df[
  !is.na(meta_ordered_withAnnot_df$fdr) &
    (meta_ordered_withAnnot_df$fdr < 0.05),  
  ] #dim: 102 41
row.names(meta_all_sig) <- NULL

write.csv(
  meta_all_sig,
  paste0(dir.result, "meta_analysis_matched_sig_df.csv"),
  row.names = FALSE
)

```



```{R eval = FALSE}
library(GenomicRanges)
```

```{R, eval = FALSE, include = FALSE}

## Overlap with comb-p DMRs
matched_dmrs <- read.csv(
  paste0(dir.result, "meta_analysis_matched_sig_df.csv")
)
matched_dmrs_gr <- matched_dmrs %>%
  tidyr::separate("inputRegion", c("chrom", "start", "end")) %>%
  makeGRangesFromDataFrame()

combp_dmrs <- read.table(
  paste0(dir.combp, "cnew.regions-p.bed"),
  header = TRUE
)
colnames(combp_dmrs) <- c(
  "chrom", "start", "end", "min_p", "n_probes", "z_p", "z_sidak_p"
)
combp_sig <- combp_dmrs %>% filter(z_sidak_p < 0.05 & n_probes > 2)
combp_sig_gr <- makeGRangesFromDataFrame(combp_sig)

overlapping.results <- matched_dmrs[
  queryHits(findOverlaps(matched_dmrs_gr,combp_sig_gr)),
  ]
#No overlap with combp

# overlapping.results.unique <- unique(overlapping.results)
# 
# write.csv(
#     overlapping.results.unique,
#     paste0(dir.combp, "matched_analysis_ov_comb_p.csv"),
#     row.names = FALSE
# )
```

## Delete sig DMRs with cross-hybridizing or smoking probes 

```{R, eval = FALSE}
### call in all cross hybridizing probes
eh = ExperimentHub()
query(eh, "DMRcate")
crosshyb <- eh[["EH3129"]]

### Get significant smoking probes
smoking.file <- "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5267325/bin/NIHMS817273-supplement-001506_-_Supplemental_Tables.xlsx"

if(!file.exists(basename(smoking.file))) downloader::download(smoking.file,basename(smoking.file))

smoking <- readxl::read_xlsx(
  basename(smoking.file),
  sheet = "02",
  skip = 2
)
smoking.sig.probes <- smoking %>% dplyr::filter(`P-value` < 1*10^(-7)) %>% pull("Probe ID") 
length(smoking.sig.probes)

### Call in meta analysis final results
meta_all <- read.csv(
  paste0(dir.result, "meta_analysis_matched_sig_df.csv")
) #dim: 102 41

### Find files with regions and probes
files <- dir(pattern = paste0(".*_residuals_cometh_input_ls.rds"),
             recursive = T,
             full.names = TRUE,
             ignore.case = T)
files <- grep("GASPARONI|LONDON_PFC|MtSinai|ROSMAP",files,value = TRUE,ignore.case = TRUE)

### Read files and Limit the cohort_ls to cohort_coMethRegion in meta_all 
cometh.probes.list <- lapply(files, function (f){
  print(f)
  all.clusters <- readRDS(f)$coMeth_ls # Read files
  cohort <- f %>% dirname %>% dirname %>% basename # get cohort from folder name
  
  col <- grep(paste0(cohort,"_coMethRegion"),
              colnames(meta_all),
              ignore.case = TRUE,
              value = TRUE) # get column with cohort sig regions
  
  # keep sig regions only
  all.clusters[names(all.clusters) %in% meta_all[[col]]]
})
names(cometh.probes.list) <-  files %>% dirname %>% dirname %>% basename

lapply(cometh.probes.list,length)

### Map probes in each region to smoking and crosshybrdizing
extractCrosHybridization <- function(probes.list){
  crosshyb.extracted <- plyr::laply(probes.list,function(probes){
    paste(intersect(probes, crosshyb), 
          collapse = ", "
    )
  })
  smoking.extracted <- plyr::laply(probes.list,function(probes){
    paste(intersect(probes, smoking.sig.probes), 
          collapse = ", "
    )
  })
  tibble::tibble(
    "coMethRegion" = names(probes.list),
    "crossHyb" = crosshyb.extracted, 
    "crossHyb_bi" = ifelse(crosshyb.extracted == "",0,1),
    "smoke" = smoking.extracted,
    "smoke_bi" = ifelse(smoking.extracted == "",0,1)
  )
}

Gasparoni_crossHyb_df <- extractCrosHybridization(cometh.probes.list$GASPARONI)
colnames(Gasparoni_crossHyb_df) <- paste0("GASPARONI_",colnames(Gasparoni_crossHyb_df))
plyr::count(
  Gasparoni_crossHyb_df, 
  vars = grep("_bi",colnames(Gasparoni_crossHyb_df),value = TRUE)
)

London_PFC_crossHyb_df <- extractCrosHybridization(cometh.probes.list$LONDON)
colnames(London_PFC_crossHyb_df) <- paste0("London_",colnames(London_PFC_crossHyb_df))
plyr::count(
  London_PFC_crossHyb_df, 
  vars = grep("_bi",colnames(London_PFC_crossHyb_df),value = TRUE)
)

MtSinai_crossHyb_df <- extractCrosHybridization(cometh.probes.list$MtSinai)
colnames(MtSinai_crossHyb_df) <- paste0("MtSinai_",colnames(MtSinai_crossHyb_df))
plyr::count(
  MtSinai_crossHyb_df, 
  vars = grep("_bi",colnames(MtSinai_crossHyb_df),value = TRUE)
)

ROSMAP_crossHyb_df <- extractCrosHybridization(cometh.probes.list$ROSMAP)
colnames(ROSMAP_crossHyb_df) <- paste0("ROSMAP_",colnames(ROSMAP_crossHyb_df))
plyr::count(ROSMAP_crossHyb_df, vars = grep("_bi",colnames(ROSMAP_crossHyb_df),value = TRUE))

### Merge smoking and crossHyb probes  information with meta analysis results 
meta_all_final <- meta_all %>% left_join(Gasparoni_crossHyb_df)  %>% 
  left_join(London_PFC_crossHyb_df) %>% 
  left_join(MtSinai_crossHyb_df) %>% 
  left_join(ROSMAP_crossHyb_df)

### Add information with input regions with any cross hybridization in cohorts 
meta_all_final$crossHyb_bi <- rowSums(meta_all_final[,grep("crossHyb_bi",colnames(meta_all_final))],na.rm = TRUE) > 0
meta_all_final$smoke_bi <- rowSums(meta_all_final[,grep("smoke_bi",colnames(meta_all_final))],na.rm = TRUE) > 0

# Sort by region meta analysis FDR
# Cluster columns of the projects together
meta_all_final <- meta_all_final[
  order(meta_all_final$fdr),
  c(grep("Gasparoni|MtSinai|London|ROSMAP",colnames(meta_all_final),ignore.case = TRUE,invert = TRUE),
    grep("Gasparoni",colnames(meta_all_final),ignore.case = TRUE),
    grep("MtSinai",colnames(meta_all_final),ignore.case = TRUE),
    grep("London",colnames(meta_all_final),ignore.case = TRUE),
    grep("ROSMAP",colnames(meta_all_final),ignore.case = TRUE)
  )
  ]
str(meta_all_final)

meta_sig_final <- meta_all_final[
  meta_all_final$crossHyb_bi == 0 &
    meta_all_final$smoke_bi == 0,
  ] #dim: 83 59
### Save
write.csv(
  meta_sig_final %>% as.data.frame,
  paste0(dir.result, "meta_analysis_matched_sig_no_crossHyb_smoking_df.csv"),
  row.names = FALSE
)
```

## Results

```{R, echo = FALSE}
res <- readr::read_csv(
  paste0(dir.result, "meta_analysis_matched_sig_no_crossHyb_smoking_df.csv"),
  col_types = readr::cols()
) 
res <- res[order(res$fdr),]
res
```


```{R, eval = FALSE, include = FALSE}
## Compare with sig DMRs from main analysis

### Call in datasets
main_dmrs <- read.csv(
  paste0(dir.dmr, "meta_analysis_sig_no_crossHyb_smoking_ov_comb_p_with_sig_single_cpgs.csv")
) #dim: 119 60

matched_dmrs <- read.csv(
  paste0(dir.result, "meta_analysis_matched_sig_no_crossHyb_smoking_df.csv")
) #dim: 83 59

### Turn input regions into granges
library(dplyr)
library(GenomicRanges)
main_dmrs_gr <- main_dmrs %>%
  tidyr::separate("inputRegion",c("chrom","start","end")) %>%
  makeGRangesFromDataFrame() %>%
  unique

matched_dmrs_gr <- matched_dmrs %>%
  tidyr::separate("inputRegion", c("chrom", "start", "end")) %>%
  makeGRangesFromDataFrame() %>%
  unique

### Create Venn Diagram
library(ChIPpeakAnno)
library(ggplot2)

methods      <- c("main analysis", "matched analysis")
methodsLabel <- c("main analysis sig DMRs", "matched analysis sig DMRs")


n <- length(methods)

gg_color_hue <- function(n){
  hues = seq(15, 375, length = n + 1)
  hcl(h = hues, l = 65, c = 100)[1:n]
}

cols = gg_color_hue(n)

ranges.list <- list(main_dmrs_gr, matched_dmrs_gr)

# pdf(
#  file = paste0(dir.result,"venn_main_matched_sig_DMRs.pdf"),
#  width = 5, height = 5
#)
makeVennDiagram(
  ranges.list,
  NameOfPeaks = methodsLabel[1:n],
  totalTest = 37159, 
  by = "region",
  main = paste0(" "),
  col = c("#440154ff", '#21908dff'),
  fill = c(alpha("#440154ff",0.3), alpha('#21908dff',0.3)),
  cex = 1,
  fontfamily = "sans",
  cat.cex = 1,
  cat.default.pos = "outer",
  cat.pos = c(180, 180),
  cat.dist = c(-0.055, -0.055),
  cat.fontfamily = "sans",
  cat.col = c("#440154ff", '#21908dff'))
# dev.off()
```


# Meta-analysis Single CpGs

1. merge cohorts
2. meta analysis
3. add annotation
4. Delete cross-hybridizing and smoking probes from sig probes

## Import datasets  

```{R, eval = FALSE}
library(dplyr)
library(tidyr)

results.files <- dir("matched_analysis_results/",
                     pattern = "single_cpg_linear_df.csv",
                     recursive = TRUE,
                     full.names = TRUE,
                     ignore.case = TRUE)

for(i in results.files){
  data <- readr::read_csv(i)
  dataset <- unlist(stringr::str_split(basename(i),"\\/|\\_"))[1]  %>% as.character()
  aux <- paste0(dataset,c("_estimate", "_se", "_pValue"))
  colnames(data) <- c("cpg", aux)
  assign(dataset,data)
}
```

## Merge cohorts

```{R, eval = FALSE}
cohort_ls <- list(
  Gasparoni = Gasparoni,
  London_PFC = London,
  MtSinai = MtSinai,
  ROSMAP = ROSMAP
)

### outer join input region
multi_cohorts <- Reduce(
  function(x,y, ...) merge(x, y, by = "cpg", all = TRUE, ...),
  cohort_ls
) #dim: 450793 13
dim(multi_cohorts)
```

## Meta analysis  

```{R, eval = FALSE}
### calculate meta analysis z scores and p values
library(meta)

doParallel::registerDoParallel(cores = parallel::detectCores()/2)
meta_df <- plyr::adply(.data = multi_cohorts, 
                       .margins = 1, 
                       .fun =  function(rowOne_df){
                         
                         est <- rowOne_df[grep("estimate",colnames(rowOne_df))] %>% as.numeric
                         
                         direction <-  paste(ifelse(
                           is.na(est), ".",
                           ifelse(est > 0, "+", "-")
                         ),collapse = "")
                         
                         se <- rowOne_df[grep("se",colnames(rowOne_df))] %>% as.numeric
                         cohort <- gsub("_se","",grep("se",colnames(rowOne_df),value = T))
                         rowOne_reform_df <- data.frame(
                           cohort = cohort,
                           est = est,
                           se = se,
                           stringsAsFactors = FALSE
                         )
                         
                         f <- metagen(
                           TE = est,
                           seTE = se,
                           data = rowOne_reform_df
                         )
                         
                         tibble::tibble(
                           cpg = rowOne_df$cpg,
                           estimate = f$TE.fixed,
                           se = f$seTE.fixed,
                           pVal.fixed = f$pval.fixed,
                           pVal.random = f$pval.random,
                           pValQ = f$pval.Q,
                           direction = direction
                         )
                       }  , .progress = "time",
                       .parallel = TRUE,
                       .id = NULL
)

### create final pVal
meta_df$pVal.final <- ifelse(
  meta_df$pValQ > 0.05, meta_df$pVal.fixed, meta_df$pVal.random
)

### calculate fdr
meta_df$fdr <- p.adjust(meta_df$pVal.final, method = "fdr")

### order meta_df
meta_final_df <- meta_df[, c(grep("_",colnames(meta_df),invert = T),
                             grep("_",colnames(meta_df),invert = F))]
meta_final_ordered_df <- meta_final_df[order(meta_final_df$pVal.final),]
```

## Add annotation  

```{R, eval = FALSE}
library(sesame)
probes.info <- sesameDataGet("HM450.hg19.manifest")
probes.info <- probes.info[meta_final_ordered_df$cpg %>% as.character()] %>% as.data.frame %>% dplyr::select(c("seqnames","start","end"))

result_annot_df <- merge(
  y = meta_final_ordered_df,
  x = probes.info,
  by.y = "cpg",
  by.x = "row.names",
  all.y = TRUE,
  sort = FALSE
)

### final raw data
write.csv(
  result_annot_df  %>% as.data.frame(),
  paste0(dir.result, "meta_analysis_matched_single_cpg_df.csv"),
  row.names = FALSE
)

### prepare data for comb-p
result_for_combp_df <- result_annot_df[
  , c("seqnames", "start", "end", "pVal.final")
  ]
colnames(result_for_combp_df)[c(1,4)] <- c("chr", "pValue")
result_for_combp_df$chr <- as.character(result_for_combp_df$chr)

write.csv(
  result_for_combp_df,
  paste0(dir.result, "meta_analysis_matched_single_cpg_df_for_combp.csv"),
  row.names = FALSE
)
```

## Delete cross-hybridizing and smoking probes from sig probes 

```{R, eval = FALSE}
### Exclude non-significant probes
result_annot_sig_df <- result_annot_df %>% filter(fdr < 0.05) 

write.csv(
  result_annot_sig_df %>% as.data.frame(),
  paste0(dir.result, "meta_analysis_matched_single_cpg_sig_df.csv"),
  row.names = FALSE
)

### Get crosshybrdizing probes
library(ExperimentHub)

eh = ExperimentHub()
query(eh, "DMRcate")
crosshyb <- eh[["EH3129"]]


### Get significant smoking probes
smoking.file <- "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5267325/bin/NIHMS817273-supplement-001506_-_Supplemental_Tables.xlsx"

if(!file.exists(basename(smoking.file))) downloader::download(smoking.file,basename(smoking.file))

smoking <- readxl::read_xlsx(
  basename(smoking.file),
  sheet = "02",
  skip = 2
)
smoking.sig.probes <- smoking %>% dplyr::filter(`P-value` < 1*10^(-7)) %>% pull("Probe ID")


### Exclude cross-hybridizing and smoking probes
result_annot_sig_df$Row.names <- as.character(result_annot_sig_df$Row.names)

result_annot_sig_no_crossHyb_smoking_df <- result_annot_sig_df[
  !((result_annot_sig_df$Row.names %in% crosshyb) |
      (result_annot_sig_df$Row.names %in% smoking.sig.probes)),
  ] #dim: 642 24

### Add annotation
library(coMethDMR)
result_annot_sig_no_crossHyb_smoking_df$chrom <- as.character(
  result_annot_sig_no_crossHyb_smoking_df$seqnames
)
result_final <- AnnotateResults(result_annot_sig_no_crossHyb_smoking_df)

result_final_ordered <- result_final[
  ,c(1:4, 26:29, 5:24)
  ]
write.csv(
  result_final_ordered %>% as.data.frame(),
  paste0(dir.result, "meta_analysis_matched_single_cpg_sig_no_crossHyb_smoking_df.csv"),
  row.names = FALSE
)           
```

## Results

```{R, echo = FALSE}
res <- readr::read_csv(
  paste0(dir.result, "meta_analysis_matched_single_cpg_sig_no_crossHyb_smoking_df.csv"),
  col_types = readr::cols()
) 
res <- res[order(res$fdr),]
res
```


```{R, eval = FALSE, include = FALSE}
## Compare with sig CpGs from main analysis

### Call in datasets
main_cpgs <- read.csv(
  paste0(dir.single.cpg, "meta_analysis_single_cpg_sig_no_crossHyb_smoking_df.csv")
) #dim: 3751 28
main_cpgs <- main_cpgs %>% pull(cpg) %>% as.character

matched_cpgs <- read.csv(
  paste0(dir.result, "meta_analysis_matched_single_cpg_sig_no_crossHyb_smoking_df.csv")
) #dim: 642 24
matched_cpgs <- matched_cpgs %>% pull(1) %>% as.character


### Create Venn Diagram
library(VennDiagram)
library(ggplot2)

# Make the plot
venn <- venn.diagram(
  x = list(
    main_cpgs %>% unique ,
    matched_cpgs %>% unique
  ), 
  category.names = c("Main analysis sig. probes" , "Matched analysis sig. probes" ),
  filename = file.path(dir.result, '/venn_main_matched_sig_cpgs.png'),
  output = TRUE ,
  imagetype = "png" ,
  height = 700 ,
  width = 700 ,
  resolution = 300,
  compression = "lzw",
  lwd = 1,
  col = c("#440154ff", '#21908dff'),
  fill = c(alpha("#440154ff",0.3), alpha('#21908dff',0.3)),
  cex = 0.5,
  cat.cex = 0.3,
  cat.default.pos = "outer",
  cat.pos = c(0, 0),
  cat.dist = c(0.01, 0.01)
)
```




# Session information
```{R}
devtools::session_info()
```

