Here we are again. This time we’re going to look at Day 10 and Day 135, initially with treatment as the predictor variable and time as a covariate. We may run this again with time as the predictor variable after we look at the results.
Set the working directory and create a vector of sample names.
setwd("~/Documents/Genewiz_hdd/Day10-Day135")
samples.vec <- c("EPI-103", "EPI-104", "EPI-111", "EPI-113", "EPI-119", "EPI-120", "EPI-127", "EPI-128", "EPI-135", "EPI-136", "EPI-143", "EPI-145", "EPI-151", "EPI-152", "EPI-153", "EPI-154", "EPI-159", "EPI-160", "EPI-161", "EPI-162", "EPI-167", "EPI-168", "EPI-169", "EPI-170")
Next, we use VCFtools to build the relatedness matrix: bgzip and tabix-index each sample’s SNV VCF, merge them, and run vcftools --relatedness on the merged file.
setwd("~/Documents/Genewiz_hdd/Day10-Day135/SNPvcfs")
for(i in 1:length(samples.vec)) {
system(paste0("bgzip ", samples.vec[i], "_SNVs.vcf"))
system(paste0("tabix ", samples.vec[i], "_SNVs.vcf.gz"))
}
system("vcf-merge *.gz > day10-mergedSNV.vcf")
system("vcftools --relatedness --vcf day10-mergedSNV.vcf --out Day-10-Day135-relatedness")
system("head Day-10-Day135-relatedness.relatedness")
As usual, I have to rearrange the relatedness output into a square matrix, which I did in Excel.
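If you’d rather skip the Excel step, the long-format vcftools output can be reshaped into a square, headerless matrix directly in R. This is just a minimal sketch, assuming the standard three-column --relatedness output (INDV1, INDV2, RELATEDNESS_AJK); column names may differ slightly between vcftools versions, and the row/column order should be checked against the sample order in the count files.
library(data.table)
# vcftools --relatedness writes one row per pair of individuals (including self-pairs)
rel <- fread("Day-10-Day135-relatedness.relatedness", header = TRUE)
setnames(rel, c("INDV1", "INDV2", "AJK"))
# Cast to a wide matrix, then fill the missing half from the transpose so it is symmetric
rel.wide <- dcast(rel, INDV1 ~ INDV2, value.var = "AJK")
rel.mat <- as.matrix(rel.wide[, -1, with = FALSE])
rownames(rel.mat) <- rel.wide$INDV1
rel.mat[is.na(rel.mat)] <- t(rel.mat)[is.na(rel.mat)]
# Write a plain n x n numeric matrix with no row or column names for MACAU
write.table(rel.mat, "Day-10-Day135-relatedness.csv", sep = ",", quote = FALSE, row.names = FALSE, col.names = FALSE)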
system("head ~/Documents/Genewiz_hdd/Day10-Day135/macau/Day-10-Day135-relatedness.csv")
0.772756,0.0616173,-0.0555682,-0.0301271,-0.140959,0.175972,-0.0575158,-0.104365,-0.065484,-0.0876808,-0.108867,-0.0268435,0.0651811,-0.0976301,0.0519872,0.0335265,-0.0786845,0.000404752,-0.0206828,-0.111164,0.0741324,-0.00148568,-0.155082,-0.0339459
0.0616173,0.765444,0.0559054,-0.158057,-0.128915,0.0123093,-0.0802873,-0.089678,0.0867827,0.00236708,0.021073,0.015912,0.0338877,-0.0702592,0.0188069,-0.0122614,-0.056052,-0.0668107,-0.0732663,-0.105306,-0.0371107,0.0204452,-0.0729856,0.0879318
-0.0555682,0.0559054,0.717083,-0.0607101,-0.0277436,-0.145559,-0.0217134,0.0900564,0.0971049,0.000131313,0.0206886,-0.00412024,-0.0760375,-0.190183,0.00286808,-0.140265,-0.0397838,-0.0526699,-0.0802194,-0.0595176,-0.0198178,-0.0559708,-0.107856,-0.0449497
-0.0301271,-0.158057,-0.0607101,0.798623,-0.0523297,0.069677,0.097757,-0.0735476,-0.0186603,-0.0324117,0.0457297,-0.147807,-0.0760616,-0.147098,-0.124806,-0.0736548,0.0239895,0.089279,0.0235495,-0.029184,0.0320727,-0.0529963,0.0834451,-0.175084
-0.140959,-0.128915,-0.0277436,-0.0523297,0.746344,0.111047,-0.0756017,0.277983,-0.057765,-0.0790771,0.0200264,-0.0771036,-0.0990917,-0.0183635,-0.0395786,-0.0520509,-0.108949,-0.0997594,0.0412607,0.0483842,-0.0589136,-0.133022,-0.00189873,-0.136011
0.175972,0.0123093,-0.145559,0.069677,0.111047,0.624014,-0.072909,-0.0580862,-0.0262967,-0.0958482,-0.184225,-0.0698374,-0.0193632,-0.166235,0.0208795,-0.0183182,-0.0231917,-0.129077,-0.145037,0.0282283,-0.118999,-0.0536829,-0.0281362,-0.053337
-0.0575158,-0.0802873,-0.0217134,0.097757,-0.0756017,-0.072909,0.722275,-0.253142,-0.046049,-0.109786,0.167895,-0.0425923,-0.119257,-0.18964,0.0396593,0.0027129,-0.11265,0.0128236,-0.00503317,0.019215,-0.0547299,-0.142602,0.097075,-0.138677
-0.104365,-0.089678,0.0900564,-0.0735476,0.277983,-0.0580862,-0.253142,0.74351,-0.126914,-0.0275129,-0.00323909,-0.0918203,-0.0059337,0.140407,0.00210494,-0.0988238,-0.0975985,0.0346344,-0.0187337,-0.147393,-0.0677,-0.0680102,-0.079573,-0.0102327
-0.065484,0.0867827,0.0971049,-0.0186603,-0.057765,-0.0262967,-0.046049,-0.126914,0.824493,0.056567,0.0225954,-0.127503,-0.00172593,-0.0699872,-0.0783149,-0.0225787,-0.0104318,-0.175903,-0.033088,0.0416627,0.053026,0.0326844,-0.0700631,-0.255086
-0.0876808,0.00236708,0.000131313,-0.0324117,-0.0790771,-0.0958482,-0.109786,-0.0275129,0.056567,0.825498,0.0700155,-0.0323732,0.024468,-0.096485,0.0404942,0.0299229,-0.00675917,0.102415,-0.0248466,-0.0736544,0.0450776,0.00812042,-0.0592398,-0.16154
-0.108867,0.021073,0.0206886,0.0457297,0.0200264,-0.184225,0.167895,-0.00323909,0.0225954,0.0700155,0.773341,-0.0306455,0.0905632,-0.092057,-0.0851138,-0.11483,-0.020056,0.0772618,-0.0992651,-0.0706117,-0.109878,-0.119295,-0.121216,-0.061719
-0.0268435,0.015912,-0.00412024,-0.147807,-0.0771036,-0.0698374,-0.0425923,-0.0918203,-0.127503,-0.0323732,-0.0306455,0.71388,-0.021926,-0.10943,0.0108184,0.110448,-0.0752085,0.0832381,-0.0421874,0.00976156,-0.037776,-0.119867,0.029008,0.216124
0.0651811,0.0338877,-0.0760375,-0.0760616,-0.0990917,-0.0193632,-0.119257,-0.0059337,-0.00172593,0.024468,0.0905632,-0.021926,0.843098,-0.0245746,-0.0072884,0.106193,0.0955382,-0.0831017,-0.0410608,-0.107784,-0.00894402,-0.111853,-0.10372,-0.0220078
-0.0976301,-0.0702592,-0.190183,-0.147098,-0.0183635,-0.166235,-0.18964,0.140407,-0.0699872,-0.096485,-0.092057,-0.10943,-0.0245746,0.864135,0.0782344,0.00872868,-0.0181127,-0.0859465,0.0314602,0.0573296,-0.208583,0.228856,-0.0242416,-0.135552
0.0519872,0.0188069,0.00286808,-0.124806,-0.0395786,0.0208795,0.0396593,0.00210494,-0.0783149,0.0404942,-0.0851138,0.0108184,-0.0072884,0.0782344,0.841472,0.0415415,-0.0238836,-0.0471489,-0.0277895,-0.165357,-0.0748297,-0.0709018,-0.164315,-0.0799694
0.0335265,-0.0122614,-0.140265,-0.0736548,-0.0520509,-0.0183182,0.0027129,-0.0988238,-0.0225787,0.0299229,-0.11483,0.110448,0.106193,0.00872868,0.0415415,0.93084,0.0378955,-0.0293963,-0.0389262,0.069167,0.035613,-0.161289,-0.0497545,-0.115228
-0.0786845,-0.056052,-0.0397838,0.0239895,-0.108949,-0.0231917,-0.11265,-0.0975985,-0.0104318,-0.00675917,-0.020056,-0.0752085,0.0955382,-0.0181127,-0.0238836,0.0378955,0.973497,-0.0914245,0.0657328,-0.0306448,-0.118127,0.0588192,0.010824,0.0975144
0.000404752,-0.0668107,-0.0526699,0.089279,-0.0997594,-0.129077,0.0128236,0.0346344,-0.175903,0.102415,0.0772618,0.0832381,-0.0831017,-0.0859465,-0.0471489,-0.0293963,-0.0914245,0.734248,-0.00715264,-0.0414363,0.0382714,-0.0317608,-0.090624,-0.0743369
-0.0206828,-0.0732663,-0.0802194,0.0235495,0.0412607,-0.145037,-0.00503317,-0.0187337,-0.033088,-0.0248466,-0.0992651,-0.0421874,-0.0410608,0.0314602,-0.0277895,-0.0389262,0.0657328,-0.00715264,0.793274,-0.00636344,-0.0320081,-0.0581993,-0.106283,-0.0875273
-0.111164,-0.105306,-0.0595176,-0.029184,0.0483842,0.0282283,0.019215,-0.147393,0.0416627,-0.0736544,-0.0706117,0.00976156,-0.107784,0.0573296,-0.165357,0.069167,-0.0306448,-0.0414363,-0.00636344,0.715446,0.120514,-0.0265643,0.0132095,-0.126289
0.0741324,-0.0371107,-0.0198178,0.0320727,-0.0589136,-0.118999,-0.0547299,-0.0677,0.053026,0.0450776,-0.109878,-0.037776,-0.00894402,-0.208583,-0.0748297,0.035613,-0.118127,0.0382714,-0.0320081,0.120514,0.789858,-0.0850307,-0.12556,-0.135722
-0.00148568,0.0204452,-0.0559708,-0.0529963,-0.133022,-0.0536829,-0.142602,-0.0680102,0.0326844,0.00812042,-0.119295,-0.119867,-0.111853,0.228856,-0.0709018,-0.161289,0.0588192,-0.0317608,-0.0581993,-0.0265643,-0.0850307,0.787642,0.0486647,-0.0446784
-0.155082,-0.0729856,-0.107856,0.0834451,-0.00189873,-0.0281362,0.097075,-0.079573,-0.0700631,-0.0592398,-0.121216,0.029008,-0.10372,-0.0242416,-0.164315,-0.0497545,0.010824,-0.090624,-0.106283,0.0132095,-0.12556,0.0486647,0.798765,0.278712
-0.0339459,0.0879318,-0.0449497,-0.175084,-0.136011,-0.053337,-0.138677,-0.0102327,-0.255086,-0.16154,-0.061719,0.216124,-0.0220078,-0.135552,-0.0799694,-0.115228,0.0975144,-0.0743369,-0.0875273,-0.126289,-0.135722,-0.0446784,0.278712,0.622827
Yup, there are some numbers! Now let’s deal with the count files.
First we combine all of the individual sample count files.
library(readr)
library(data.table)
setwd("~/Documents/Genewiz_hdd/Day10-Day135/CGOutput")
data <- list.files(path = ".", pattern = "*.output")
print(data)
db <- read_delim("~/Documents/Genewiz_hdd/Day10-Day135/CGOutput/EPI-103_CG.output", "\t", escape_double = FALSE, trim_ws = TRUE, na = ".")
db <- as.data.table(db)
db$EPI103Meth <- rowSums(cbind(db[,4], db[,7]), na.rm = TRUE)
db$EPI103TotCov <- rowSums(cbind(db[,5], db[,8]), na.rm = TRUE)
meth.db <- db[,c(1,2,3,10)]
totcov.db <- db[,c(1,2,3,11)]
for(i in 2:length(data)) {
temp <- read_delim(paste0("~/Documents/Genewiz_hdd/Day10-Day135/CGOutput/", data[i]), "\t", escape_double = FALSE, trim_ws = TRUE, na = ".")
temp <- as.data.table(temp)
temp$Meth <- rowSums(cbind(temp[,4], temp[,7]), na.rm = TRUE)
temp$TotCov <- rowSums(cbind(temp[,5], temp[,8]), na.rm = TRUE)
meth.db <- merge(meth.db, temp[,c(1,2,3,10)], by = names(meth.db[,c(1,2,3)]), all = TRUE)
colnames(meth.db)[ncol(meth.db)] <- paste0(substr(data[i], 1, 7))
meth.db <- as.data.table(meth.db)
totcov.db <- merge(totcov.db, temp[,c(1,2,3,11)], by = names(totcov.db[,c(1,2,3)]), all = TRUE)
colnames(totcov.db)[ncol(totcov.db)] <- paste0(substr(data[i], 1, 7))
totcov.db <- as.data.table(totcov.db)
}
Now that that’s combined, we need to get rid of every site with missing data or with 10x or lower coverage in any sample.
setwd("~/Documents/Genewiz_hdd/Day10-Day135/macau")
meth.db2 <- meth.db[complete.cases(meth.db[, 4:ncol(meth.db)]),]
totcov.db2 <- totcov.db[complete.cases(totcov.db[, 4:ncol(totcov.db)]),]
#meth.db3 <- meth.db2[apply(meth.db2[, c(4:15)], MARGIN = 1, function(x) all(x > 10)), ] <-- this is bad. Don't do this (it would filter on methylated read counts rather than total coverage).
totcov.db3 <- totcov.db2[apply(totcov.db2[, 4:ncol(totcov.db2)], MARGIN = 1, function(x) all(x > 10)), ]
meth.db4 <- as.data.table(cbind(paste0(meth.db2$`#CHROM`, "-", meth.db2$POS), meth.db2[,4:ncol(meth.db2)]))
colnames(meth.db4)[1] <- "Site"
totcov.db4 <- as.data.table(cbind(paste0(totcov.db3$`#CHROM`, "-", totcov.db3$POS), totcov.db3[,4:ncol(totcov.db3)]))
colnames(totcov.db4)[1] <- "Site"
meth.db5 <- meth.db4[meth.db4$Site %in% totcov.db4$Site,]
nrow(meth.db5)
[1] 15558
nrow(totcov.db4)
[1] 15558
length(totcov.db4$Site %in% meth.db5$Site)
[1] 15558
setwd("~/Documents/Genewiz_hdd/Day10-Day135/macau")
write.table(meth.db5, file = "Day10-Day135-10xCov-macau.methcount.txt", sep = " ", quote = FALSE, row.names = FALSE)
write.table(totcov.db4, file = "Day10-Day135-10xCov-macau.totalcount.txt", sep = " ", quote = FALSE, row.names = FALSE)
Well, that was quick! Now let’s run MACAU. This will not be quick. Not in the least.
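For reference, the -p and -c files passed to MACAU below are plain numeric tables with one row per individual, in the same order as the sample columns of the count files, and (as far as I understand MACAU’s input conventions) the covariate file needs a leading column of 1s for the intercept, which is why the log reports two covariates. Here is a rough sketch of how those files could be written; the treatment and day assignments are placeholders, not the actual experimental design.
# Placeholder design only: the real treatment/day assignment per sample is not
# shown in this notebook, so substitute the actual metadata here.
treatment <- rep(c(0, 1), each = 12)   # predictor: 0 = control, 1 = treated (placeholder)
day <- rep(c(10, 135), times = 12)     # covariate: sampling day (placeholder)
# Predictor file: one value per individual, same order as the count-file columns
write.table(treatment, "Day10-Day135-Predictor.csv", sep = ",", quote = FALSE, row.names = FALSE, col.names = FALSE)
# Covariate file: intercept column of 1s plus the day covariate (2 covariates total)
write.table(cbind(1, day), "Day10-Day135-Covariate.csv", sep = ",", quote = FALSE, row.names = FALSE, col.names = FALSE)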
setwd("~/Documents/Genewiz_hdd/Day10-Day135/macau")
system("macau -g Day10-Day135-10xCov-macau.methcount.txt -t Day10-Day135-10xCov-macau.totalcount.txt -p Day10-Day135-Predictor.csv -k Day-10-Day135-relatedness.csv -c Day10-Day135-Covariate.csv -o Day10-Day135-results -s 10000 -seed 123")
Reading Files ...
## number of total individuals = 24
## number of analyzed individuals = 24
## number of covariates = 2
## number of total genes/sites = 15558
Performing Analysis ==================================================100.00%
Three hours later we have some results! Below we parse them, extract any significant DMLs, and write them to a BED file.
setwd("~/Documents/Genewiz_hdd/Day10-Day135/macau")
result_assoc <- read_delim("~/Documents/Genewiz_hdd/Day10-Day135/macau/output/Day10-Day135-results.assoc.txt", "\t", escape_double = FALSE, trim_ws = TRUE)
Parsed with column specification:
cols(
id = col_character(),
n = col_integer(),
acpt_rate = col_double(),
beta = col_double(),
se_beta = col_double(),
pvalue = col_double(),
h = col_double(),
se_h = col_double(),
sigma2 = col_double(),
se_sigma2 = col_double(),
alpha0 = col_double(),
se_alpha0 = col_double(),
alpha1 = col_double(),
se_alpha1 = col_double()
)
sig.results <- result_assoc[result_assoc$pvalue <= 0.05,]
for(i in 1:nrow(sig.results)) {
temp <- strsplit(sig.results$id[i], "-")
sig.results[i,13] <- temp[[1]][1]
sig.results[i,14] <- temp[[1]][2]
}
colnames(sig.results)[c(13,14)] <- c("Scaffold", "Loc")
bed.test <- as.data.frame(cbind(sig.results[,13], (sig.results[,14]), (sig.results[,14]), sig.results$beta))
colnames(bed.test)[4] <- "beta"
write.table(bed.test, file = "Day10-Day135-10xCov-DiffMethRegions.bed", sep = "\t", quote = FALSE, col.names = FALSE, row.names = FALSE)
nrow(sig.results)
[1] 401
Next we need to deal with the multiple comparisons issue. Below I try two implementations of the q-value statistic: one uses a bootstrap method to estimate pi0, the other a smoother method. According to the qvalue documentation, the bootstrap method is better for pathological situations, which they don’t define.
Lastly, I try the Benjamini and Yekutieli implementation of FDR just for fun. It builds on the FDR idea of Benjamini and Hochberg, but is designed to handle a low number of true positives and potential dependence among the tests.
library(qvalue)
hist(result_assoc$pvalue)
bootstrap.qobj.robust <- qvalue(result_assoc$pvalue, robust = TRUE, pi0.meth = "bootstrap", lambda = 0.5)
summary(bootstrap.qobj.robust)
Call:
qvalue(p = result_assoc$pvalue, robust = TRUE, pi0.meth = "bootstrap", lambda = 0.5)
pi0: 1
Cumulative number of significant calls:
<1e-04 <0.001 <0.01 <0.025 <0.05 <0.1 <1
p-value 1 8 70 190 401 888 15132
q-value 0 0 0 0 0 0 15132
local FDR 0 0 0 0 0 0 0
plot(bootstrap.qobj.robust)
hist(result_assoc$pvalue)
hist(bootstrap.qobj.robust)
smoother.qobj.robust <- qvalue(result_assoc$pvalue, robust = TRUE, pi0.meth = "smoother")
summary(smoother.qobj.robust)
Call:
qvalue(p = result_assoc$pvalue, robust = TRUE, pi0.meth = "smoother")
pi0: 1
Cumulative number of significant calls:
<1e-04 <0.001 <0.01 <0.025 <0.05 <0.1 <1
p-value 1 8 70 190 401 888 15132
q-value 0 0 0 0 0 0 15132
local FDR 0 0 0 0 0 0 0
plot(smoother.qobj.robust)
hist(result_assoc$pvalue)
hist(smoother.qobj.robust)
p.adjust.obj.BY <- p.adjust(result_assoc$pvalue, method = "BY", n = length(result_assoc$pvalue))
summary(p.adjust.obj.BY)
Min. 1st Qu. Median Mean 3rd Qu. Max.
1 1 1 1 1 1
plot(p.adjust.obj.BY)
Well, that’s a bummer. It looks like we get a 100% FDR with the data we have: every BY-adjusted p-value comes out as 1.
How about potential clustering!
setwd("~/Documents/Genewiz_hdd/Day10-Day135/clustering")
The working directory was changed to /home/sean/Documents/Genewiz_hdd/Day10-Day135/clustering inside a notebook chunk. The working directory will be reset when the chunk is finished running. Use the knitr root.dir option in the setup chunk to change the working directory for notebook chunks.
sample.files <- list.files(".", pattern = "*.bedgraph")
DMR.info <- read_delim("~/Documents/Genewiz_hdd/Day10-Day135/clustering/EPI-103_percmeth1.bedgraph", " ", escape_double = FALSE, col_names = FALSE, col_types = cols(X4 = col_double()), trim_ws = TRUE)
The following named parsers don't match the column names: X4
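The notebook ends here, but as a rough sketch of where the clustering step is headed: read each sample’s percent-methylation bedgraph, merge them on position, and cluster the samples on their methylation profiles. The four-column layout (scaffold, start, end, percent methylation) and the sample-name prefix in the file names are assumptions based on the read_delim call above.
library(readr)
# Read each sample's bedgraph and keep scaffold, start, and the percent-methylation column
perc.list <- lapply(sample.files, function(f) {
  x <- read_delim(f, " ", escape_double = FALSE, col_names = FALSE, trim_ws = TRUE)
  colnames(x) <- c("chrom", "start", "end", substr(f, 1, 7))
  x[, c("chrom", "start", substr(f, 1, 7))]
})
# Merge all samples on position and keep only sites covered in every sample
perc.all <- Reduce(function(a, b) merge(a, b, by = c("chrom", "start")), perc.list)
perc.mat <- as.matrix(perc.all[, -(1:2)])
perc.mat <- perc.mat[complete.cases(perc.mat), ]
# Hierarchical clustering of samples on their percent-methylation profiles
plot(hclust(dist(t(perc.mat))))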