We will perform differential methylation analyses for Control samples
and 50ppb SFLX samples from the following experiment: #
We detect differential methylation for 500 base pair windows, shifting
every 250 base pairs. We reached this choice by first testing for
differential methylation at specific CpG sites (DMNs) as well as 50bp
tilling windows (DMRs). We concluded that we would need more data to
reliably detect DMNs. It is also preferable to find DMRs at Highly
methylated regions (HMRs), which are typically, in insects, in the order
of 500bp size, than 50bp DMRs.
For this final analysis, we will use data obtained using the Megalodon methylation caller. We have performed the same analysis with Megalodon methylation caller data and found highly similar results.
First we will compile general descriptive statistics for our data.
We will then move on to find, for the three control timepoints, T1-control, T2-learning, T3-memory, for DMRs of 500bp length.
Then, for our candidate DMRs, will check how these sites behave after exposure to 50ppb SFLX: T2 CTL - T2 50ppb = we are looking for methylation/removal site affected by 50ppb Sulfoxaflor for the learning phase
Last we could compare all control bees with all bees that have been treated with Sulfoxaflor. M1 and M3 samples (a previous experiment for which we have data here) are all data from the T3 time point, and we will include them in this last analysis
The sample list is the following:
M5-M5B_b1 (MEELIH-05 and 5b, barcode 1) T1-CTL (T1 timepoint, Control sample) M5-M5B_b2 T1-CTL M5-M5B_b3 T1-CTL M5-M5B_b4 T2-CTL M5-M5B_b5 T2-CTL M5-M5B_b6 T2-CTL M5-M5B_b7 T3-CTL M5-M5B_b8 T3-CTL M5-M5B_b9 T3-CTL M6-M5B_b10 retrieval-T3-50ppb M6-M5B_b11 retrieval-T3-50ppb M6-M5B_b12 retrieval-T3-50ppb M6_b1 T2-50ppb M6_b2 T2-50ppb M6_b3 T2-50ppb M6_b4 T3-50ppb M6_b5 T3-50ppb M6_b6 T3-50ppb
additional datapoints that will be used in a general analysis in the end: M1_b1(MEELIH-01 barcode 1) T3-CTL (T3 timepoint, Control sample) M1_b2 T3-CTL M1_b3 T3-CTL M1_b4 T3-50ppb M1_b5 T3-50ppb M1_b6 T3-50ppb M3_b1 T3-CTL M3_b2 T3-5ppb M3_b3 T3-5ppb M3_b4 T3-5ppb M3_b5 T3-5ppb M3_b6 T3-50ppb
#Part 1, descriptive statistics
First, install and load the required packages
if (!require("tidyverse")) install.packages("tidyverse",repos = "http://cran.us.r-project.org")
if (!require("data.table")) install.packages("data.table",repos = "http://cran.us.r-project.org")
if (!require("ggplot2")) install.packages("ggplot2",repos = "http://cran.us.r-project.org")
if (!require("BiocManager")) install.packages("BiocManager",repos = "http://cran.us.r-project.org")
if (!require("genomation")) BiocManager::install("genomation")
if (!require("methylKit")) BiocManager::install("methylKit")
Create a list with the files to be loaded. Here we load the files generated by Megalodon methylation calling software. Each barcode corresponds to each sample.
library(genomation)
library(methylKit)
meelih_input <- as.list(list.files(path="./bed_files_from_methCallers/",pattern = "*Megalodon.bed.tsv",full.names = TRUE))
print(meelih_input[1:18])
## [[1]]
## [1] "./bed_files_from_methCallers/T1_CTL_m5b1_Megalodon.bed.tsv"
##
## [[2]]
## [1] "./bed_files_from_methCallers/T1_CTL_m5b2_Megalodon.bed.tsv"
##
## [[3]]
## [1] "./bed_files_from_methCallers/T1_CTL_m5b3_Megalodon.bed.tsv"
##
## [[4]]
## [1] "./bed_files_from_methCallers/T2_50ppb_m6b1_Megalodon.bed.tsv"
##
## [[5]]
## [1] "./bed_files_from_methCallers/T2_50ppb_m6b2_Megalodon.bed.tsv"
##
## [[6]]
## [1] "./bed_files_from_methCallers/T2_50ppb_m6b3_Megalodon.bed.tsv"
##
## [[7]]
## [1] "./bed_files_from_methCallers/T2_CTL_m5b4_Megalodon.bed.tsv"
##
## [[8]]
## [1] "./bed_files_from_methCallers/T2_CTL_m5b5_Megalodon.bed.tsv"
##
## [[9]]
## [1] "./bed_files_from_methCallers/T2_CTL_m5b6_Megalodon.bed.tsv"
##
## [[10]]
## [1] "./bed_files_from_methCallers/T3_50ppb_m1b4_Megalodon.bed.tsv"
##
## [[11]]
## [1] "./bed_files_from_methCallers/T3_50ppb_m1b5_Megalodon.bed.tsv"
##
## [[12]]
## [1] "./bed_files_from_methCallers/T3_50ppb_m1b6_Megalodon.bed.tsv"
##
## [[13]]
## [1] "./bed_files_from_methCallers/T3_50ppb_m3b6_Megalodon.bed.tsv"
##
## [[14]]
## [1] "./bed_files_from_methCallers/T3_50ppb_m6b4_Megalodon.bed.tsv"
##
## [[15]]
## [1] "./bed_files_from_methCallers/T3_50ppb_m6b5_Megalodon.bed.tsv"
##
## [[16]]
## [1] "./bed_files_from_methCallers/T3_50ppb_m6b6_Megalodon.bed.tsv"
##
## [[17]]
## [1] "./bed_files_from_methCallers/T3_5ppb_m3b2_Megalodon.bed.tsv"
##
## [[18]]
## [1] "./bed_files_from_methCallers/T3_5ppb_m3b3_Megalodon.bed.tsv"
obj <- methRead(meelih_input[1:18],
sample.id=list("T1_CTL_m5b1","T1_CTL_m5b2","T1_CTL_m5b3","T2_50ppb_m6b1","T2_50ppb_m6b2","T2_50ppb_m6b3","T2_CTL_m5b4","T2_CTL_m5b5","T2_CTL_m5b6","T3_50ppb_m6b4","T3_50ppb_m6b5","T3_50ppb_m6b6","T3_CTL_m5b7","T3_CTL_m5b8","T3_CTL_m5b9","T3_ret_50ppb_m6b10","T3_ret_50ppb_m6b11","T3_ret_50ppb_m6b12"),
# 0 is control, 1 is treatment group
treatment = c(0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5),
# Assembly is just a character vector and doesn't affect anything
assembly="amel3.1",header=TRUE,
# We have called CpG methylation
context="CpG",
# We want base resolution
resolution="base",
# The pipeline function is used because Megalodon has given us a methylation file
# that methylKit doesn't know how to read.
# So we have to specify which columns correspond to what.
#pipeline=list(fraction=FALSE,chr.col=1,start.col=2,end.col=3,strand.col=FALSE,coverage.col=5,freqC.col=6), #dorado
pipeline=list(fraction=FALSE,chr.col=1,start.col=2,end.col=3, coverage.col=4, strand.col=FALSE, freqC.col=5), #megalodon
#we want minimum coverage to be considered
mincov = 1,
)
Combine the files together into one object
meth <- methylKit::unite(obj, destrand=TRUE, min.per.group=1L)
Methylation stats
for (i in 1:18){
getMethylationStats(obj[[i]],plot=FALSE,both.strands=FALSE)
}
## methylation statistics per base
## summary:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 0.000 5.294 0.000 100.000
## percentiles:
## 0% 10% 20% 30% 40% 50% 60%
## 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
## 70% 80% 90% 95% 99% 99.5% 99.9%
## 0.000000 7.142857 20.000000 33.333333 66.666667 100.000000 100.000000
## 100%
## 100.000000
##
## methylation statistics per base
## summary:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 0.000 5.284 0.000 100.000
## percentiles:
## 0% 10% 20% 30% 40% 50% 60% 70%
## 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000
## 80% 90% 95% 99% 99.5% 99.9% 100%
## 0.00000 20.00000 33.33333 66.66667 100.00000 100.00000 100.00000
##
## methylation statistics per base
## summary:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 0.000 5.023 0.000 100.000
## percentiles:
## 0% 10% 20% 30% 40% 50% 60% 70%
## 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000
## 80% 90% 95% 99% 99.5% 99.9% 100%
## 0.00000 20.00000 33.33333 100.00000 100.00000 100.00000 100.00000
##
## methylation statistics per base
## summary:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 0.000 3.806 0.000 100.000
## percentiles:
## 0% 10% 20% 30% 40% 50% 60% 70% 80% 90% 95% 99% 99.5%
## 0 0 0 0 0 0 0 0 0 0 25 100 100
## 99.9% 100%
## 100 100
##
## methylation statistics per base
## summary:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 0.000 3.786 0.000 100.000
## percentiles:
## 0% 10% 20% 30% 40% 50% 60% 70%
## 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000
## 80% 90% 95% 99% 99.5% 99.9% 100%
## 0.00000 0.00000 33.33333 100.00000 100.00000 100.00000 100.00000
##
## methylation statistics per base
## summary:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 0.000 3.727 0.000 100.000
## percentiles:
## 0% 10% 20% 30% 40% 50% 60% 70% 80% 90% 95% 99% 99.5%
## 0 0 0 0 0 0 0 0 0 0 25 100 100
## 99.9% 100%
## 100 100
##
## methylation statistics per base
## summary:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 0.000 5.196 0.000 100.000
## percentiles:
## 0% 10% 20% 30% 40% 50% 60% 70%
## 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000
## 80% 90% 95% 99% 99.5% 99.9% 100%
## 0.00000 20.00000 33.33333 71.42857 100.00000 100.00000 100.00000
##
## methylation statistics per base
## summary:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 0.000 5.286 0.000 100.000
## percentiles:
## 0% 10% 20% 30% 40% 50% 60% 70%
## 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000
## 80% 90% 95% 99% 99.5% 99.9% 100%
## 0.00000 20.00000 33.33333 66.66667 100.00000 100.00000 100.00000
##
## methylation statistics per base
## summary:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 0.000 4.475 0.000 100.000
## percentiles:
## 0% 10% 20% 30% 40% 50% 60% 70% 80% 90% 95% 99% 99.5%
## 0 0 0 0 0 0 0 0 0 0 50 100 100
## 99.9% 100%
## 100 100
##
## methylation statistics per base
## summary:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 0.000 2.712 0.000 100.000
## percentiles:
## 0% 10% 20% 30% 40% 50% 60% 70%
## 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000
## 80% 90% 95% 99% 99.5% 99.9% 100%
## 0.00000 0.00000 20.00000 66.66667 87.50000 100.00000 100.00000
##
## methylation statistics per base
## summary:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 0.000 2.703 0.000 100.000
## percentiles:
## 0% 10% 20% 30% 40% 50% 60% 70%
## 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000
## 80% 90% 95% 99% 99.5% 99.9% 100%
## 0.00000 0.00000 20.00000 66.66667 100.00000 100.00000 100.00000
##
## methylation statistics per base
## summary:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 0.000 2.758 0.000 100.000
## percentiles:
## 0% 10% 20% 30% 40% 50% 60% 70% 80% 90% 95% 99% 99.5%
## 0 0 0 0 0 0 0 0 0 0 0 100 100
## 99.9% 100%
## 100 100
##
## methylation statistics per base
## summary:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 0.000 4.059 0.000 100.000
## percentiles:
## 0% 10% 20% 30% 40% 50% 60% 70%
## 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000
## 80% 90% 95% 99% 99.5% 99.9% 100%
## 0.00000 0.00000 33.33333 100.00000 100.00000 100.00000 100.00000
##
## methylation statistics per base
## summary:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 0.000 3.867 0.000 100.000
## percentiles:
## 0% 10% 20% 30% 40% 50% 60% 70%
## 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000
## 80% 90% 95% 99% 99.5% 99.9% 100%
## 0.00000 0.00000 33.33333 100.00000 100.00000 100.00000 100.00000
##
## methylation statistics per base
## summary:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 0.000 3.914 0.000 100.000
## percentiles:
## 0% 10% 20% 30% 40% 50% 60% 70%
## 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000
## 80% 90% 95% 99% 99.5% 99.9% 100%
## 0.00000 0.00000 33.33333 100.00000 100.00000 100.00000 100.00000
##
## methylation statistics per base
## summary:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 0.000 3.858 0.000 100.000
## percentiles:
## 0% 10% 20% 30% 40% 50% 60% 70%
## 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000
## 80% 90% 95% 99% 99.5% 99.9% 100%
## 0.00000 0.00000 33.33333 100.00000 100.00000 100.00000 100.00000
##
## methylation statistics per base
## summary:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 0.000 4.267 0.000 100.000
## percentiles:
## 0% 10% 20% 30% 40% 50% 60% 70%
## 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000
## 80% 90% 95% 99% 99.5% 99.9% 100%
## 3.97351 14.28571 25.00000 60.00000 80.00000 100.00000 100.00000
##
## methylation statistics per base
## summary:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 0.000 4.157 0.000 100.000
## percentiles:
## 0% 10% 20% 30% 40% 50% 60% 70%
## 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000
## 80% 90% 95% 99% 99.5% 99.9% 100%
## 0.00000 14.28571 33.33333 66.66667 100.00000 100.00000 100.00000
Coverage stats
for (i in 1:18){
getCoverageStats(obj[[1]],plot=FALSE,both.strands=FALSE)
}
## read coverage statistics per base
## summary:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 5.000 6.000 6.937 8.000 2914.000
## percentiles:
## 0% 10% 20% 30% 40% 50% 60% 70% 80% 90% 95% 99% 99.5%
## 1 3 4 5 6 6 7 8 9 10 12 15 18
## 99.9% 100%
## 98 2914
##
## read coverage statistics per base
## summary:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 5.000 6.000 6.937 8.000 2914.000
## percentiles:
## 0% 10% 20% 30% 40% 50% 60% 70% 80% 90% 95% 99% 99.5%
## 1 3 4 5 6 6 7 8 9 10 12 15 18
## 99.9% 100%
## 98 2914
##
## read coverage statistics per base
## summary:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 5.000 6.000 6.937 8.000 2914.000
## percentiles:
## 0% 10% 20% 30% 40% 50% 60% 70% 80% 90% 95% 99% 99.5%
## 1 3 4 5 6 6 7 8 9 10 12 15 18
## 99.9% 100%
## 98 2914
##
## read coverage statistics per base
## summary:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 5.000 6.000 6.937 8.000 2914.000
## percentiles:
## 0% 10% 20% 30% 40% 50% 60% 70% 80% 90% 95% 99% 99.5%
## 1 3 4 5 6 6 7 8 9 10 12 15 18
## 99.9% 100%
## 98 2914
##
## read coverage statistics per base
## summary:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 5.000 6.000 6.937 8.000 2914.000
## percentiles:
## 0% 10% 20% 30% 40% 50% 60% 70% 80% 90% 95% 99% 99.5%
## 1 3 4 5 6 6 7 8 9 10 12 15 18
## 99.9% 100%
## 98 2914
##
## read coverage statistics per base
## summary:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 5.000 6.000 6.937 8.000 2914.000
## percentiles:
## 0% 10% 20% 30% 40% 50% 60% 70% 80% 90% 95% 99% 99.5%
## 1 3 4 5 6 6 7 8 9 10 12 15 18
## 99.9% 100%
## 98 2914
##
## read coverage statistics per base
## summary:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 5.000 6.000 6.937 8.000 2914.000
## percentiles:
## 0% 10% 20% 30% 40% 50% 60% 70% 80% 90% 95% 99% 99.5%
## 1 3 4 5 6 6 7 8 9 10 12 15 18
## 99.9% 100%
## 98 2914
##
## read coverage statistics per base
## summary:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 5.000 6.000 6.937 8.000 2914.000
## percentiles:
## 0% 10% 20% 30% 40% 50% 60% 70% 80% 90% 95% 99% 99.5%
## 1 3 4 5 6 6 7 8 9 10 12 15 18
## 99.9% 100%
## 98 2914
##
## read coverage statistics per base
## summary:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 5.000 6.000 6.937 8.000 2914.000
## percentiles:
## 0% 10% 20% 30% 40% 50% 60% 70% 80% 90% 95% 99% 99.5%
## 1 3 4 5 6 6 7 8 9 10 12 15 18
## 99.9% 100%
## 98 2914
##
## read coverage statistics per base
## summary:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 5.000 6.000 6.937 8.000 2914.000
## percentiles:
## 0% 10% 20% 30% 40% 50% 60% 70% 80% 90% 95% 99% 99.5%
## 1 3 4 5 6 6 7 8 9 10 12 15 18
## 99.9% 100%
## 98 2914
##
## read coverage statistics per base
## summary:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 5.000 6.000 6.937 8.000 2914.000
## percentiles:
## 0% 10% 20% 30% 40% 50% 60% 70% 80% 90% 95% 99% 99.5%
## 1 3 4 5 6 6 7 8 9 10 12 15 18
## 99.9% 100%
## 98 2914
##
## read coverage statistics per base
## summary:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 5.000 6.000 6.937 8.000 2914.000
## percentiles:
## 0% 10% 20% 30% 40% 50% 60% 70% 80% 90% 95% 99% 99.5%
## 1 3 4 5 6 6 7 8 9 10 12 15 18
## 99.9% 100%
## 98 2914
##
## read coverage statistics per base
## summary:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 5.000 6.000 6.937 8.000 2914.000
## percentiles:
## 0% 10% 20% 30% 40% 50% 60% 70% 80% 90% 95% 99% 99.5%
## 1 3 4 5 6 6 7 8 9 10 12 15 18
## 99.9% 100%
## 98 2914
##
## read coverage statistics per base
## summary:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 5.000 6.000 6.937 8.000 2914.000
## percentiles:
## 0% 10% 20% 30% 40% 50% 60% 70% 80% 90% 95% 99% 99.5%
## 1 3 4 5 6 6 7 8 9 10 12 15 18
## 99.9% 100%
## 98 2914
##
## read coverage statistics per base
## summary:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 5.000 6.000 6.937 8.000 2914.000
## percentiles:
## 0% 10% 20% 30% 40% 50% 60% 70% 80% 90% 95% 99% 99.5%
## 1 3 4 5 6 6 7 8 9 10 12 15 18
## 99.9% 100%
## 98 2914
##
## read coverage statistics per base
## summary:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 5.000 6.000 6.937 8.000 2914.000
## percentiles:
## 0% 10% 20% 30% 40% 50% 60% 70% 80% 90% 95% 99% 99.5%
## 1 3 4 5 6 6 7 8 9 10 12 15 18
## 99.9% 100%
## 98 2914
##
## read coverage statistics per base
## summary:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 5.000 6.000 6.937 8.000 2914.000
## percentiles:
## 0% 10% 20% 30% 40% 50% 60% 70% 80% 90% 95% 99% 99.5%
## 1 3 4 5 6 6 7 8 9 10 12 15 18
## 99.9% 100%
## 98 2914
##
## read coverage statistics per base
## summary:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 5.000 6.000 6.937 8.000 2914.000
## percentiles:
## 0% 10% 20% 30% 40% 50% 60% 70% 80% 90% 95% 99% 99.5%
## 1 3 4 5 6 6 7 8 9 10 12 15 18
## 99.9% 100%
## 98 2914
Clustering of samples
par(mar=c(20,2,2,2))
clusterSamples(meth, dist="correlation", method="ward", plot=TRUE, chunk.size = 200,)#chunk.size = 1e+03)
##
## Call:
## hclust(d = d, method = HCLUST.METHODS[hclust.method])
##
## Cluster method : ward.D
## Distance : pearson
## Number of objects: 18
#title(xlab=,mgp=c(10,1,0))
Principal component analysis of samples
PCASamples(meth, screeplot=TRUE,chunk.size=200)
pc=PCASamples(meth,obj.return = TRUE,transpose=FALSE, adj.lim=c(0.1,0.1),chunk.size=200, center=TRUE)
Lets see how the coverages for each sample look like for all samples
coverages_df <- data.frame(matrix(ncol = 3, nrow = 0))
colnames(coverages_df) <-c("sample_ID", "coverage","methylated")
#combine all coverage data in this flat dataframe
#include the T1 coverages
for (i in c(1:18)) {
b<-getData(obj[[i]])[,5:6]
t<-NROW(b)
a<-rep(obj[[i]]@sample.id,times=t)
a<-cbind(a,b)
coverages_df<-rbind(coverages_df,a)
}
str(coverages_df)
## 'data.frame': 151157387 obs. of 3 variables:
## $ a : chr "T1_CTL_m5b1" "T1_CTL_m5b1" "T1_CTL_m5b1" "T1_CTL_m5b1" ...
## $ coverage: int 1 2 2 2 1 1 1 2 1 2 ...
## $ numCs : int 1 2 2 2 1 1 1 2 1 2 ...
#calculating coverages and methylation levels, the CTL and 50ppb M3 will be merged in M1
colnames(coverages_df) <-c("sample_ID", "coverage","methylated")
coverages_df$sample_ID<-as.factor(coverages_df$sample_ID)
coverages_df$coverage<-as.numeric(coverages_df$coverage)
coverages_df$methylated<-as.numeric(coverages_df$methylated)
sapply(coverages_df, levels)
## $sample_ID
## [1] "T1_CTL_m5b1" "T1_CTL_m5b2" "T1_CTL_m5b3"
## [4] "T2_50ppb_m6b1" "T2_50ppb_m6b2" "T2_50ppb_m6b3"
## [7] "T2_CTL_m5b4" "T2_CTL_m5b5" "T2_CTL_m5b6"
## [10] "T3_50ppb_m6b4" "T3_50ppb_m6b5" "T3_50ppb_m6b6"
## [13] "T3_CTL_m5b7" "T3_CTL_m5b8" "T3_CTL_m5b9"
## [16] "T3_ret_50ppb_m6b10" "T3_ret_50ppb_m6b11" "T3_ret_50ppb_m6b12"
##
## $coverage
## NULL
##
## $methylated
## NULL
library(dplyr)
coverages_df <- coverages_df %>%
mutate(group = case_when(
startsWith(as.character(sample_ID), "T1_CTL_m5") ~ "T1_CTL_m5",
startsWith(as.character(sample_ID), "T2_CTL_m5") ~ "T2_CTL_m5",
startsWith(as.character(sample_ID), "T3_CTL_m5") ~ "T3_CTL_m5",
startsWith(as.character(sample_ID), "T2_50ppb_m6") ~ "T2_50ppb_m6",
startsWith(as.character(sample_ID), "T3_50ppb_m6") ~ "T3_50ppb_m6",
startsWith(as.character(sample_ID), "T3_ret_50ppb_m6") ~ "T3_ret_50ppb_m6",
))
coverages_df$group <-as.factor(coverages_df$group)
#coverages_df$group <- factor(coverages_df$group, levels=c("T1_CTL_m5","T2_CTL_m5","T3_CTL_m5","T2_50ppb_m6","T3_50ppb_m6","T3_ret_50ppb_m6")) #attempting to sort the results at the plots
#coverages_df <- coverages_df[order(levels(coverages_df$group)),]
sapply(coverages_df, levels)
## $sample_ID
## [1] "T1_CTL_m5b1" "T1_CTL_m5b2" "T1_CTL_m5b3"
## [4] "T2_50ppb_m6b1" "T2_50ppb_m6b2" "T2_50ppb_m6b3"
## [7] "T2_CTL_m5b4" "T2_CTL_m5b5" "T2_CTL_m5b6"
## [10] "T3_50ppb_m6b4" "T3_50ppb_m6b5" "T3_50ppb_m6b6"
## [13] "T3_CTL_m5b7" "T3_CTL_m5b8" "T3_CTL_m5b9"
## [16] "T3_ret_50ppb_m6b10" "T3_ret_50ppb_m6b11" "T3_ret_50ppb_m6b12"
##
## $coverage
## NULL
##
## $methylated
## NULL
##
## $group
## [1] "T1_CTL_m5" "T2_50ppb_m6" "T2_CTL_m5" "T3_50ppb_m6"
## [5] "T3_CTL_m5" "T3_ret_50ppb_m6"
str(coverages_df)
## 'data.frame': 151157387 obs. of 4 variables:
## $ sample_ID : Factor w/ 18 levels "T1_CTL_m5b1",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ coverage : num 1 2 2 2 1 1 1 2 1 2 ...
## $ methylated: num 1 2 2 2 1 1 1 2 1 2 ...
## $ group : Factor w/ 6 levels "T1_CTL_m5","T2_50ppb_m6",..: 1 1 1 1 1 1 1 1 1 1 ...
Lets calculate the overall level of methylation in each sample and then plot these levels for each group and test using ANOVA if there is significant difference in methylation levels between the groups.
frac<-function(x,y){
z<-x/y
return(z)
}
a<-mapply(frac,coverages_df$methylated,coverages_df$coverage)
coverages_df<-cbind(coverages_df, fraction=a)
coverages_df<-coverages_df[,1:5]
Calculating the overall methylation level for each sample.
coverages_sum<-tapply(coverages_df$coverage,as.factor(coverages_df$sample_ID),sum)
methylated_sum<-tapply(coverages_df$methylated,as.factor(coverages_df$sample_ID),sum)
bigFractions<-mapply(frac,methylated_sum,coverages_sum)
bigFractions<-as.data.frame(bigFractions)
bigFractions
calculate coverage
genome_length<-225250884 #as calculated using grep -v ">" Apis_mellifera.Amel_HAv3.1.dna.toplevel.fa | tr -d '\n' | wc -c
cc<-1:6
for (i in 0:5){
cc[i+1]<-(coverages_sum[[i*3+1]]+coverages_sum[[i*3+2]]+coverages_sum[[i*3+3]])/genome_length
cc
}
bigFractions$row_names<-row.names(bigFractions)
bigFractions <- bigFractions %>%
mutate(group = case_when(
startsWith(as.character(row_names), "T1_CTL_m5") ~ "T1_CTL_m5",
startsWith(as.character(row_names), "T2_CTL_m5") ~ "T2_CTL_m5",
startsWith(as.character(row_names), "T3_CTL_m5") ~ "T3_CTL_m5",
startsWith(as.character(row_names), "T2_50ppb_m6") ~ "T2_50ppb_m6",
startsWith(as.character(row_names), "T3_50ppb_m6") ~ "T3_50ppb_m6",
startsWith(as.character(row_names), "T3_ret_50ppb_m6") ~ "T3_ret_50ppb_m6",
))
bigFractions$group <-as.factor(bigFractions$group)
sapply(bigFractions, levels)
## $bigFractions
## NULL
##
## $row_names
## NULL
##
## $group
## [1] "T1_CTL_m5" "T2_50ppb_m6" "T2_CTL_m5" "T3_50ppb_m6"
## [5] "T3_CTL_m5" "T3_ret_50ppb_m6"
Plotting the average methylation levels of all samples.
library(ggplot2)
if (!require("ggstatsplot")) install.packages("ggstatsplot",repos = "http://cran.us.r-project.org")
library(ggstatsplot)
library(tidyverse)
gghistostats(
data = bigFractions,
x = bigFractions,
y = group,
title="Average Methylation level of all samples",
k=5L,
)
bigFractionsMeans<-with(bigFractions, tapply(bigFractions, group, mean))
bigFractionsMeans
## T1_CTL_m5 T2_50ppb_m6 T2_CTL_m5 T3_50ppb_m6 T3_CTL_m5
## 0.04760118 0.03909307 0.04664171 0.02913959 0.04044047
## T3_ret_50ppb_m6
## 0.04024405
if (!require("rstantools")) install.packages("rstantools",repos = "http://cran.us.r-project.org")
library(rstantools)
library(ggplot2)
library(ggstatsplot)
library(tidyverse)
#library(rsantools)
ggbetweenstats(
data = bigFractions,
x = group,
y = bigFractions,
title="Average Methylation level of all samples, grouped by sample type (timepoint-treatment-experiment)",
centrality.point.args = list(size = 3, color = "darkred"),
k=4
)
Now lets plot the coverage levels for each sample and group.
# Horizontal half violin plot
library(ggplot2)
coverages_df<-coverages_df[,1:5] #because if we run too many times the previous code chunk we get duplicate columns
ggplot(coverages_df[,1:2], ) + geom_violin(aes(coverage, sample_ID, fill = sample_ID )) +coord_flip() + theme(legend.position = "none") + coord_trans( x='log10') + scale_x_continuous(breaks=c(0,1,2,5,10,20,100,1000,6000)) + scale_fill_manual(values=c("cyan3","cyan3","cyan3","orange","orange","orange","aquamarine","aquamarine","aquamarine","red","red","red","forestgreen","forestgreen","forestgreen","purple","purple","purple"))
library(ggplot2)
ggplot(coverages_df, aes(x = coverage, fill = group)) + xlim(NA,30) + geom_histogram() + facet_grid(group ~ .) + scale_fill_manual(values=c("cyan3","orange","aquamarine","red","forestgreen","purple"))
get bedgraphs to upload to UCSC Genomic viewer
#bedgraph(myDiff25p,file.name="diff.bedgraph",col.name="meth.diff")
Optional, check if the coverage from Dorado methylation caller is better than from Megalodon.
library(readr)
#coverages_Megalodon<-data.frame(coverage = coverages_sum)
#coverages_Megalodon$sample <- attr(coverages_sum, "dimnames")[[1]]
#write_csv(coverages_Megalodon, file="./output_data_frames/coverages_sum_Meg.csv")
coverages_Megalodon<-read_csv(file="./output_data_frames/coverages_sum_Meg.csv")
coverages_Dorado<-read_csv(file="./output_data_frames/coverages_sum_Dor.csv")
library(dplyr)
library(ggplot2)
library(tidyr)
combined_coverages <- left_join(coverages_Dorado, coverages_Megalodon, by = "sample", suffix = c("_Dorado", "_Megalodon"))
long_format <- pivot_longer(combined_coverages, cols = c("coverage_Dorado", "coverage_Megalodon"),
names_to = "dataset", values_to = "coverage")
ggplot(long_format, aes(x = sample, y = coverage, color = dataset)) +
geom_point() +
scale_color_manual(values = c("coverage_Dorado" = "red", "coverage_Megalodon" = "green")) +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
labs(x = "Sample", y = "Coverage", color = "Dataset")
#Part 2, DMRs for control samples
We will start over by erasing and re-loading the data, so that one can start this part of the analysis from here. Remove everything…
rm(list = ls(all.names = TRUE))
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 11510719 614.8 67476106 3603.7 257400911 13746.7
## Vcells 37115710 283.2 3427109745 26146.8 4283887181 32683.5
Lets first import the Megalodon methylation data
library(genomation)
library(methylKit)
megFiles <- as.list(list.files(path="./bed_files_from_methCallers/",pattern =".*CTL_m[5,6].*Megalodon.bed.tsv",full.names = TRUE))
print(megFiles)
## [[1]]
## [1] "./bed_files_from_methCallers/T1_CTL_m5b1_Megalodon.bed.tsv"
##
## [[2]]
## [1] "./bed_files_from_methCallers/T1_CTL_m5b2_Megalodon.bed.tsv"
##
## [[3]]
## [1] "./bed_files_from_methCallers/T1_CTL_m5b3_Megalodon.bed.tsv"
##
## [[4]]
## [1] "./bed_files_from_methCallers/T2_CTL_m5b4_Megalodon.bed.tsv"
##
## [[5]]
## [1] "./bed_files_from_methCallers/T2_CTL_m5b5_Megalodon.bed.tsv"
##
## [[6]]
## [1] "./bed_files_from_methCallers/T2_CTL_m5b6_Megalodon.bed.tsv"
##
## [[7]]
## [1] "./bed_files_from_methCallers/T3_CTL_m5b7_Megalodon.bed.tsv"
##
## [[8]]
## [1] "./bed_files_from_methCallers/T3_CTL_m5b8_Megalodon.bed.tsv"
##
## [[9]]
## [1] "./bed_files_from_methCallers/T3_CTL_m5b9_Megalodon.bed.tsv"
DataMegalodon <- methRead(megFiles,
sample.id=list("T1_CTL_m5b1","T1_CTL_m5b2","T1_CTL_m5b3","T2_CTL_m5b4","T2_CTL_m5b5","T2_CTL_m5b6","T3_CTL_m5b7","T3_CTL_m5b8","T3_CTL_m5b9"),
treatment = c(1,1,1,2,2,2,3,3,3),
assembly="amel3.1",header=TRUE,
context="CpG",
resolution="base",
pipeline=list(fraction=FALSE,chr.col=1,start.col=2,end.col=3, coverage.col=4, strand.col=FALSE, freqC.col=5),
mincov = 1,
)
filteredDataMegalodon <- filterByCoverage(DataMegalodon, lo.count = 2, lo.perc=0.01, hi.count=500, hi.perc = 99.9)
normalizedFilteredDataMegalodon<-normalizeCoverage(filteredDataMegalodon)
unitedNormalizedFilteredDataMegalodon <- methylKit::unite(normalizedFilteredDataMegalodon, destrand=FALSE)
Pool and tile data, change annotation from Refseq to Ensembl
pooledData=pool(unitedNormalizedFilteredDataMegalodon,sample.ids=c("T1","T2","T3"))
tilesData=tileMethylCounts(pooledData,win.size=500,step.size=250)
#change the chr column from Refseq to Ensembl, if for Dorado
#chromosome_names<-read.csv(file="./annotation/Chromosome_names_Ensembl_Refseq.csv", header=TRUE)
#chromosome_names<-chromosome_names[,c(2,3)]
#tilesData$chr<-chromosome_names$Ensembl[match(tilesData$chr,chromosome_names$Refseq)]
Remove objects for efficiency (optional)
rm(DataMegalodon,filteredDataMegalodon,normalizedFilteredDataMegalodon,unitedNormalizedFilteredDataMegalodon,pooledData)
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 11541552 616.4 53980885 2882.9 257400911 13746.7
## Vcells 40982847 312.7 2741687796 20917.5 4283887181 32683.5
diffMeth <- suppressWarnings(calculateDiffMeth(tilesData, treatment=c(1,1,1,2,2,2,3,3,3), num.cores=6))
significantDiffMeth <- suppressWarnings(getMethylDiff(diffMeth, difference=10, qvalue=0.05))
Notes from the calculation: more than two groups detected: Calculated methylation difference as the difference of max(x) - min(x), where x is vector of mean methylation per group per region, but the statistical test remains the same.
Now annotate the DMRs
#get annotation
gene.obj=readTranscriptFeatures("./annotation/amel3.1.bed12", remove.unusual = F)
annotatedDMRs=annotateWithGeneParts(as(significantDiffMeth,"GRanges"),gene.obj)
genomation::plotTargetAnnotation(annotatedDMRs,precedence=TRUE, main="T1_CTL - T2_CTL - T3_CTL multiple comparison DMR annotation")
#plot distances to TSS
DMRtoTSS=getAssociationWithTSS(annotatedDMRs)
hist(DMRtoTSS$dist.to.feature[abs(DMRtoTSS$dist.to.feature)<=100000], main="distance of T1_CTL - T2_CTL - T3_CTL DMRs to nearest TSS",xlab="distance in bp",breaks=50,col="green2")
Lets see the same plot, but now for the distances of random positions to the TSS of randomly placed genes, on a genome of the same size and number of genes to Apis mellifera. This is done to get an intuition of what we expect for this plot to look like if we would have picked the positions by chance alone:
randomTSSpositions<-sample(1:225250884,28410) #based on latest genome size and number of genes
randomDistancesToNearestTSS<-vector(mode='list',length(30000))
for (x in 1:30000){
randomMethylatedPosition<-runif(1,1,225250884)
minAbsDistance<- 1000000000
minDistance<- 1000000000
for (i in randomTSSpositions){
if(abs(randomMethylatedPosition-i)<minAbsDistance){
minAbsDistance<-abs(randomMethylatedPosition-i)
minDistance<-randomMethylatedPosition-i
}
}
randomDistancesToNearestTSS[x]<-minDistance
}
hist(as.numeric(unlist(randomDistancesToNearestTSS)),main="distance to nearest TSS of random position",xlab="distance in bp",breaks=50,col="brown4")
plot the DMR methylation levels in respect to the TSS, for each timepoint
library(ggplot2)
library(dplyr)
# Extract the relevant information from annotatedDMRs
dmrs_df<-cbind(methylKit::getData(significantDiffMeth)[1:nrow(annotatedDMRs@dist.to.TSS[,2:4]),] , annotatedDMRs@dist.to.TSS[,2:4], annotatedDMRs@members[1:nrow(annotatedDMRs@dist.to.TSS[,2:4]),])
#str(dmrs_df)
mb_data <- as(tilesData, "data.frame")
methylation_data <- data.frame(
chr = mb_data$chr,
start = mb_data$start,
end = mb_data$end,
strand = mb_data$strand,
coverageT1 = mb_data[[tilesData@coverage.index[1]]],
numCT1 = mb_data[[tilesData@numCs.index[1]]],
coverageT2 = mb_data[[tilesData@coverage.index[2]]],
numCT2 = mb_data[[tilesData@numCs.index[2]]],
coverageT3 = mb_data[[tilesData@coverage.index[3]]],
numCT3 = mb_data[[tilesData@numCs.index[3]]]
)
# Calculate methylation fraction for each timepoint
methylation_data$fracMethT1 <- methylation_data$numCT1 / methylation_data$coverageT1
methylation_data$fracMethT2 <- methylation_data$numCT2 / methylation_data$coverageT2
methylation_data$fracMethT3 <- methylation_data$numCT3 / methylation_data$coverageT3
# Merge this methylation data with DMR information
dmrs_methylation <- merge(dmrs_df, methylation_data, by = c("chr", "start", "end"))
# Now plot the data, filtering for DMRs within 1 kilobase of the TSS
dmrs_near_tss <- dmrs_methylation %>%
filter(-4000 <= dist.to.feature) %>%
filter(dist.to.feature <= 6000)
# Melting the data for plotting with ggplot2
dmrs_melted <- reshape2::melt(dmrs_near_tss,
id.vars = c("chr", "start", "end", "dist.to.feature"),
measure.vars = c("fracMethT1", "fracMethT2", "fracMethT3"),
variable.name = "Timepoint",
value.name = "MethylationFraction")
TSS_DMRs_plot<-ggplot(dmrs_melted, aes(x = dist.to.feature, y = MethylationFraction, color = Timepoint)) +
geom_point(alpha = 0.1) +
geom_smooth(method = "loess", span=0.20, se = TRUE) +
labs(x = "Distance to TSS (bp)", y = "Methylation Fraction") +
theme_minimal() +
theme(legend.position = "bottom")
TSS_DMRs_plot
Clustering of Control DMRs. From here on, we will only deal with DMRs that are in exon, intron and promoter regions.
library(dplyr)
library(pheatmap)
# Keep the DMRs in exons, introns and promoters
dmrs_in_genes <- dmrs_methylation %>%
dplyr::filter(prom==1 | exon==1 | intron==1)
# Melting the data for plotting with ggplot2
dmrs_melted2 <- reshape2::melt(dmrs_in_genes,
id.vars = c("chr", "start", "end", "dist.to.feature"),
measure.vars = c("fracMethT1", "fracMethT2", "fracMethT3"),
variable.name = "Timepoint",
value.name = "MethylationFraction")
# Convert it to a wide format for clustering
dmrs_wide <- dmrs_melted2 %>%
spread(key = Timepoint, value = MethylationFraction)
# Remove unnecessary columns for clustering (like chr, start, end, dist.to.feature)
dmrs_for_clustering <- dplyr::select(dmrs_wide, -chr, -start, -end, -dist.to.feature)
# Perform hierarchical clustering
dist_matrix <- dist(dmrs_for_clustering)
hc <- hclust(dist_matrix)
# Cut the tree into clusters, choose the number of clusters k
k <- 3
clusters <- cutree(hc, k = k)
# Add cluster information to the DMRs data frame
dmrs_wide$Cluster <- clusters
# Convert the cluster information to a factor for ordered display in heatmap
dmrs_wide$Cluster <- factor(dmrs_wide$Cluster, levels = 1:k)
dmrs_wide$Cluster <- as.factor(dmrs_wide$Cluster)
Make DMR heatmap
# Define annotation colors (adjust the color values as needed)
annotation_cs <- list(Cluster = c("1" = "blue", "2" = "red", "3" = "green"))
# Prepare annotation data for pheatmap
annotation_data <- as.data.frame(dmrs_wide["Cluster"])
rownames(annotation_data) <- rownames(dmrs_for_clustering)
# Heatmap of DMRs clustering
#pheatmap(as.matrix(dmrs_for_clustering), scale = "row", cluster_rows = TRUE)
# Ensure the columns are in the desired order
dmrs_for_clustering_ordered <- dmrs_for_clustering[, c("fracMethT1", "fracMethT2", "fracMethT3")]
DMRs_heatmap<-pheatmap(as.matrix(dmrs_for_clustering_ordered),
scale = "row",
cluster_rows = TRUE, cluster_cols=FALSE,
# annotation_row = annotation_data,
# annotation_colors = annotation_cs,
show_rownames = FALSE)
DMRs_heatmap
Renaming the dataset to plot each cluster
library(dplyr)
library(tidyr)
library(ggplot2)
library(ggpubr)
# Assuming dmrs_wide contains the cluster information and methylation data
dmrs_long <- dmrs_wide %>%
gather(key = "Phase", value = "MethylationFraction", fracMethT1, fracMethT2, fracMethT3) %>%
mutate(Phase = factor(Phase, levels = c("fracMethT1", "fracMethT2", "fracMethT3"),
labels = c("Basal", "Learning", "Memory"))) %>%
dplyr::select(-chr, -start, -end, -dist.to.feature) # Removing unnecessary columns
# Rename Cluster for clarity
dmrs_long$Cluster <- paste("Cluster", dmrs_long$Cluster)
Plot each cluster
# Plot style settings
cluster_colors <- c("Cluster 1" = "#29AF7FFF", "Cluster 2" = "#DCE319FF", "Cluster 3" = "#404788FF")
# Create the plot
methylation_plot <- ggplot(dmrs_long, aes(x = Phase, y = MethylationFraction, group = Cluster)) +
geom_point(aes(color = Cluster), position = position_dodge(width = 0.2), size = 3) +
stat_summary(geom = "line", fun = mean, aes(color = Cluster), size = 1) +
facet_wrap(~ Cluster, scales = "free") +
scale_color_manual(values = cluster_colors) +
geom_hline(yintercept = 0, linetype = "dashed", color = "black", size = 0.5) +
theme_minimal() +
theme(plot.title = element_text(size = 14),
axis.text = element_text(size = 12),
axis.title = element_text(size = 12),
legend.title = element_blank(),
legend.text = element_text(size = 12),
strip.text = element_text(size = 14),
strip.background = element_blank()) +
labs(x = "Phase", y = "Methylation Fraction", title = "Methylation Changes Across Phases in Different Clusters")
# Display the plot
methylation_plot
Output the data for further analysis
DMRs_final<- dmrs_in_genes %>%
dplyr::left_join(dmrs_wide %>% dplyr::select(chr,start,end,Cluster), by= c("chr","start","end"))
if (!require("ape")) install.packages("ape",repos = "http://cran.us.r-project.org")
library(ape)
gff<-read.gff(file="./annotation/GCF_003254395.2_Amel_HAv3.1_genomic.gff", GFF3 = TRUE)
getAttributeField <- function (x, field, attrsep = ";") {
s = strsplit(x, split = attrsep, fixed = TRUE)
sapply(s, function(atts) {
a = strsplit(atts, split = "=", fixed = TRUE)
m = match(field, sapply(a, "[", 1))
if (!is.na(m)) {
rv = a[[m]][2]
}
else {
rv = as.character(NA)
}
return(rv)
})
}
gff$name <- getAttributeField(gff$attributes, "Name")
gff$gene <- getAttributeField(gff$attributes, "gene")
DMRs_final<- DMRs_final %>%
mutate(
gene_ID = gff$gene[pmatch(feature.name,gff$name, duplicates.ok = TRUE)],
#gene_attributes = gff$attributes[pmatch(feature.name, gff$name, duplicates.ok = TRUE)]
)
library(readr)
write_csv(DMRs_final,file="./output_data_frames/DMRs_of_genes_final_500bp_CTL_Megalodon_d10_q005.csv")
Output plots
ggsave("./output_plots/methylation_plot_500bp_CTL_Megalodon_d10_q005.png", methylation_plot, width = 14, height = 6)
ggsave("./output_plots/TSS_DMRs_plot_500bp_CTL_Megalodon_d10_q005.png", TSS_DMRs_plot, width = 10, height = 6)
# Saving a heatmap generated by pheatmap
png("./output_plots/DMR_heatmap_500bp_CTL_Megalodon_d10_q005.png", width = 200, height = 800)
print(DMRs_heatmap)
dev.off()
## png
## 2
#Part 3, combining the Sulfoxaflor treatment methylation data
Remove what we dont need …
rm(list = ls(all.names = TRUE))
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 11722594 626.1 34547767 1845.1 257400911 13746.7
## Vcells 37679394 287.5 1754680190 13387.2 4283887181 32683.5
Get SFLX data and instersect them with the DMRs above.
DMRs_final<-read_csv("./output_data_frames/DMRs_of_genes_final_500bp_CTL_Megalodon_d10_q005.csv") #if you want the analysis to start from here
library(genomation)
library(methylKit)
megFilesSFLX <- as.list(list.files(path="./bed_files_from_methCallers/",pattern =".*50ppb_m6b.*Megalodon.bed.tsv",full.names = TRUE))
print(megFilesSFLX)
## [[1]]
## [1] "./bed_files_from_methCallers/T2_50ppb_m6b1_Megalodon.bed.tsv"
##
## [[2]]
## [1] "./bed_files_from_methCallers/T2_50ppb_m6b2_Megalodon.bed.tsv"
##
## [[3]]
## [1] "./bed_files_from_methCallers/T2_50ppb_m6b3_Megalodon.bed.tsv"
##
## [[4]]
## [1] "./bed_files_from_methCallers/T3_50ppb_m6b4_Megalodon.bed.tsv"
##
## [[5]]
## [1] "./bed_files_from_methCallers/T3_50ppb_m6b5_Megalodon.bed.tsv"
##
## [[6]]
## [1] "./bed_files_from_methCallers/T3_50ppb_m6b6_Megalodon.bed.tsv"
##
## [[7]]
## [1] "./bed_files_from_methCallers/T3_ret50ppb_m6b10_Megalodon.bed.tsv"
##
## [[8]]
## [1] "./bed_files_from_methCallers/T3_ret50ppb_m6b11_Megalodon.bed.tsv"
##
## [[9]]
## [1] "./bed_files_from_methCallers/T3_ret50ppb_m6b12_Megalodon.bed.tsv"
DataMegalodonSFLX <- methRead(megFilesSFLX,
sample.id=list("T2_50ppb_m6b1","T2_50ppb_m6b2","T2_50ppb_m6b3","T3_50ppb_m6b4","T3_50ppb_m6b5","T3_50ppb_m6b6","T3_ret50ppb_m6b10","T3_ret50ppb_m6b11","T3_ret50ppb_m6b12"),
treatment = c(1,1,1,2,2,2,3,3,3),
assembly="amel3.1",header=TRUE,
context="CpG",
resolution="base",
#pipeline=list(fraction=FALSE,chr.col=1,start.col=2,end.col=3, coverage.col=5, strand.col=FALSE, freqC.col=6), #dorado
pipeline=list(fraction=FALSE,chr.col=1,start.col=2,end.col=3, coverage.col=4, strand.col=FALSE, freqC.col=5), #megalodon
mincov = 1,
)
#filteredDataMegalodonSFLX <- filterByCoverage(DataMegalodonSFLX, lo.count = 2, lo.perc=0.01, hi.count=500, hi.perc = 99.9)
#normalizedFilteredDataMegalodonSFLX<-normalizeCoverage(filteredDataMegalodonSFLX)
unitedDataMegalodonSFLX <- methylKit::unite(DataMegalodonSFLX, destrand=TRUE, min.per.group=1L)
Pool and tile data
pooledDataMegSFLX=pool(unitedDataMegalodonSFLX,sample.ids=c("T2S","T3S","T3Sr"))
tilesDataMegSFLX=tileMethylCounts(pooledDataMegSFLX,win.size=500,step.size=250)
Remove objects for efficiency (optional)
rm(DataMegalodonSFLX,unitedDataMegalodonSFLX,pooledDataMegSFLX)
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 11731541 626.6 33197856 1773.0 257400911 13746.7
## Vcells 47405310 361.7 1403744152 10709.8 4283887181 32683.5
(not done for Megalodon)Now read the chromosome correspondence between RefSeq and Ensembl, and match and change into Refseq everything:
library(dplyr)
tilesMegSFLX_df<-methylKit::getData(tilesDataMegSFLX)
#chromosome_names<-read.csv(file="./annotation/Chromosome_names_Ensembl_Refseq.csv", header=TRUE)
#chromosome_names<-chromosome_names[,c(2,3)]
#tilesMegSFLX_df$chr<-chromosome_names$Ensembl[match(tilesMegSFLX_df$chr,chromosome_names$Refseq)]
subset_info <- DMRs_final %>%
dplyr::select(chr,start,end)
subset_info<-as.data.frame(subset_info)
# Subsetting the tiles data frame
tilesMegSFLX_DMRintersection <- tilesMegSFLX_df %>%
dplyr::inner_join(subset_info, by = c("chr", "start", "end"))
tilesMegSFLX_DMRintersection$learning50ppbMegMethFreq <- tilesMegSFLX_DMRintersection$numCs1 / tilesMegSFLX_DMRintersection$coverage1
tilesMegSFLX_DMRintersection$memory50ppbMegMethFreq <- tilesMegSFLX_DMRintersection$numCs2 / tilesMegSFLX_DMRintersection$coverage2
tilesMegSFLX_DMRintersection$memoryRet50ppbMegMethFreq <- tilesMegSFLX_DMRintersection$numCs3 / tilesMegSFLX_DMRintersection$coverage3
DMRs_final_withSFLX <- DMRs_final %>%
dplyr::inner_join(tilesMegSFLX_DMRintersection, by = c("chr", "start", "end"))
#DMRs_final_withSFLX <- DMRs_final_withSFLX %>%
# dplyr::full_join(tilesMegSFLX_DMRintersection, by = c("chr", "start", "end"))
DMRs_final_withSFLX_minimal<- DMRs_final_withSFLX %>%
dplyr::select(chr, start, end, fracMethT1,fracMethT2,fracMethT3,Cluster,learning50ppbMegMethFreq,memory50ppbMegMethFreq,memoryRet50ppbMegMethFreq)
DMRs_final_withSFLX_minimal<-DMRs_final_withSFLX_minimal[complete.cases(DMRs_final_withSFLX_minimal),]
colnames(DMRs_final_withSFLX_minimal)[colnames(DMRs_final_withSFLX_minimal)=="fracMethT1"] <- "controlMethFrac"
colnames(DMRs_final_withSFLX_minimal)[colnames(DMRs_final_withSFLX_minimal)=="fracMethT2"] <- "learningMethFrac"
colnames(DMRs_final_withSFLX_minimal)[colnames(DMRs_final_withSFLX_minimal)=="fracMethT3"] <- "memoryMethFrac"
#if (!requireNamespace("BiocManager", quietly = TRUE))install.packages("BiocManager")
#BiocManager::install("ComplexHeatmap")
# Extracting relevant columns for clustering
clustering_data <- DMRs_final_withSFLX_minimal %>%
dplyr::select(controlMethFrac, learningMethFrac, memoryMethFrac)
# Perform k-means clustering with k = 3
set.seed(123) # Set a seed for reproducibility
clusters <- kmeans(clustering_data, centers = 3, nstart = 25)
# Add cluster assignments to your data frame
DMRs_final_withSFLX_minimal$Cluster_kmeans <- clusters$cluster
#DMRs_final_withSFLX_minimal<- DMRs_final_withSFLX_minimal %>%
# dplyr::select(-Cluster)
library(ComplexHeatmap)
library(circlize)
library(RColorBrewer)
# Define row order
row_order <- order(DMRs_final_withSFLX_minimal$Cluster_kmeans)
# Define the colors for the annotation
cluster_colors <- setNames(c("red", "blue", "green"), c(1, 2, 3))
# Create the cluster annotation
cluster_annotation <- rowAnnotation(
cluster = anno_block(
DMRs_final_withSFLX_minimal$Cluster_kmeans[row_order],
gp = gpar(fill = setNames(c("red", "blue", "green"), 1:3))
)
)
# Prepare your heatmap data
heatmap_data <- as.matrix(DMRs_final_withSFLX_minimal[, c("controlMethFrac", "learningMethFrac", "memoryMethFrac", "learning50ppbMegMethFreq", "memory50ppbMegMethFreq", "memoryRet50ppbMegMethFreq")])
colnames(heatmap_data)<-c("control Methylation Fraction", "learning Meth.Frac.", "memory Meth.Frac.", "learning SFLX Meth.Frac.", "memory SFLX Meth.Frac.", "memory ret. SFLX Meth.Frac.")
# Create the heatmap
heatmap <- Heatmap(
heatmap_data[row_order, ],
col = colorRampPalette(rev(brewer.pal(3, "Set1")))(256),
name = "Methylation",
cluster_columns=FALSE,
show_row_names = FALSE,
show_column_names = TRUE,
column_title = "Timepoints and Treatments",
column_names_rot = 60,
row_dend_width=unit(.2, "npc"),
#left_annotation = cluster_annotation
)
print(heatmap)
# Saving a heatmap generated (its best if you print screen from Rstudio for this plot)
#png("./output_plots/DMR_heatmap_500bp_Megalodon_CTL_with_SFLX_added_d10_p005.png", width = 300, height = 600)
#print(heatmap)
#dev.off()
#Part 4, combine DEGs with DMRs
remove all objects from previous parts
rm(list = ls(all.names = TRUE))
Lets first import the Megalodon methylation data and the EdgeR Expression data. The data describe the expression levels of genes and methylation fractions of 50bp tiles across 3 timepoints: during the control, learning and memory phase of the experiment, where the bees are before, during and after the learning test conducted on them with or without the effect of SFLX.
library(genomation)
library(methylKit)
dmrs<-read_csv("./output_data_frames/DMRs_of_genes_final_500bp_CTL_Megalodon_d10_q005.csv")
degs<-read_csv("./DEGs_data/DEGs_xx.csv")
colnames(dmrs)[colnames(dmrs)=="gene_ID"] <- "gene_id"
colnames(dmrs)[colnames(dmrs)=="fracMethT1"] <- "controlMethFrac"
colnames(dmrs)[colnames(dmrs)=="fracMethT2"] <- "learningMethFrac"
colnames(dmrs)[colnames(dmrs)=="fracMethT3"] <- "memoryMethFrac"
colnames(degs)[colnames(degs)=="control"] <- "controlExp"
colnames(degs)[colnames(degs)=="learning"] <- "learningExp"
colnames(degs)[colnames(degs)=="memory"] <- "memoryExp"
Plot the DMR data and filter for proximity to TSS, using the change in methylation we have observed close to the TSS as a guide for what to keep.
library(ggplot2)
library(scales)
dmrs$Color <- ifelse(dmrs$exon,"1",
ifelse(dmrs$intron, "2",
ifelse(dmrs$prom,"3","0")))
dmrs$index<- seq_along(dmrs$chr)
# Custom symlog transformation
symlog_trans <- function() {
trans_new(name = 'symlog',
transform = function(x) sign(x) * log1p(abs(x)),
inverse = function(x) sign(x) * (exp(abs(x)) - 1))
}
#Prefintering plot
ggplot(dmrs, aes(x =index, y = dist.to.feature, color = Color)) +
geom_point(alpha=.15) + #=.05 for 50bp DMRs
scale_color_manual(values = c("1" = "orange", "2" = "aquamarine", "3" = "magenta", "0" = "black")) +
scale_y_continuous(trans = symlog_trans())
dmrs <- dmrs %>%
filter(dist.to.feature >= 0 & dist.to.feature <= 3000) #-2000, 5000 is good, in Meg i get correlation for 0 - 3000
ggplot(dmrs, aes(x =index, y = dist.to.feature, color = Color)) +
geom_point(alpha=.15) + #=.05 for 50bp DMRs
scale_color_manual(values = c("1" = "orange", "2" = "aquamarine", "3" = "magenta", "0" = "black")) +
scale_y_continuous(trans = symlog_trans())
Normalize the respective data, integrate into one dataset, test for correlations.
library(dplyr)
degs_simple<- degs %>% dplyr::select(cluster:memoryExp)
dmrs_simple<- dmrs %>% dplyr::select(controlMethFrac:gene_id)
# Normalize DEGs
degs_normalized <- degs_simple %>%
mutate(across(controlExp:memoryExp, ~ as.vector(scale(.)))) # Scale normalization for expression levels
# Normalize DMRs
dmrs_normalized <- dmrs_simple %>%
mutate(across(controlMethFrac:memoryMethFrac, ~ as.vector(scale(.)))) # Scale normalization for methylation fractions
# For DEGs: Calculate changes in normalized expression levels
degs_changes <- degs_normalized %>%
mutate(change_control_to_learning_Exp = learningExp - controlExp,
change_learning_to_memory_Exp = memoryExp - learningExp)
# For DMRs: Calculate changes in normalized methylation fractions
dmrs_changes <- dmrs_normalized %>%
mutate(change_control_to_learning_Meth = learningMethFrac - controlMethFrac,
change_learning_to_memory_Meth = memoryMethFrac - learningMethFrac)
# Aggregate DMR changes for each gene
dmrs_aggregated <- dmrs_changes %>%
group_by(gene_id) %>%
summarize(controlMethFrac = mean(controlMethFrac, na.rm = TRUE),
learningMethFrac = mean(learningMethFrac, na.rm = TRUE),
memoryMethFrac = mean(memoryMethFrac, na.rm = TRUE),
avg_change_control_to_learning_Meth = mean(change_control_to_learning_Meth, na.rm = TRUE),
avg_change_learning_to_memory_Meth = mean(change_learning_to_memory_Meth, na.rm = TRUE),
.groups = 'drop')
# Merge DEGs, DEG changes with DMRs and aggregated DMR changes
integrated_data <- left_join(degs_changes, dmrs_aggregated, by = "gene_id")
# Filter out rows where DMR data is missing
integrated_data <- integrated_data %>%
filter(!is.na(avg_change_control_to_learning_Meth) & !is.na(avg_change_learning_to_memory_Meth))
# Spearman correlation for changes between timepoints
correlation_control_to_learning <- cor.test(integrated_data$change_control_to_learning_Exp,
integrated_data$avg_change_control_to_learning_Meth,
method = "spearman", use = "complete.obs")
correlation_learning_to_memory <- cor.test(integrated_data$change_learning_to_memory_Exp,
integrated_data$avg_change_learning_to_memory_Meth,
method = "spearman", use = "complete.obs")
# Results
list(
Correlation_Control_to_Learning = correlation_control_to_learning,
Correlation_Learning_to_Memory = correlation_learning_to_memory
)
## $Correlation_Control_to_Learning
##
## Spearman's rank correlation rho
##
## data: integrated_data$change_control_to_learning_Exp and integrated_data$avg_change_control_to_learning_Meth
## S = 6722, p-value = 0.4926
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## -0.1233289
##
##
## $Correlation_Learning_to_Memory
##
## Spearman's rank correlation rho
##
## data: integrated_data$change_learning_to_memory_Exp and integrated_data$avg_change_learning_to_memory_Meth
## S = 6952.1, p-value = 0.3684
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## -0.1617782
Correlation plots for respective changes, control to learning and learning to memory
library(ggplot2)
# Scatter plot for Control to Learning Changes
ggplot(integrated_data, aes(x = change_control_to_learning_Exp, y = avg_change_control_to_learning_Meth)) +
geom_point(alpha = 0.5) +
geom_smooth(method = "lm", color = "blue") +
labs(x = "Normalized Change in Expression (Control to Learning)",
y = "Normalized Average Change in Methylation (Control to Learning)",
title = "Correlation between DEGs and DMRs (Control to Learning)") +
theme_minimal()
# Scatter plot for Learning to Memory Changes
ggplot(integrated_data, aes(x = change_learning_to_memory_Exp, y = avg_change_learning_to_memory_Meth)) +
geom_point(alpha = 0.5) +
geom_smooth(method = "lm", color = "blue") +
labs(x = "Normalized Change in Expression (Learning to Memory)",
y = "Normalized Average Change in Methylation (Learning to Memory)",
title = "Correlation between DEGs and DMRs (Learning to Memory)") +
theme_minimal()
Non-normalized data tests
library(dplyr)
library(tidyr)
library(stringr)
dmrs_simple_aggregated <- dmrs_simple %>%
group_by(gene_id) %>%
summarize(controlMethFrac = mean(controlMethFrac, na.rm = TRUE),
learningMethFrac = mean(learningMethFrac, na.rm = TRUE),
memoryMethFrac = mean(memoryMethFrac, na.rm = TRUE),
.groups = 'drop')
# Reshape dmrs_simple to long format
dmrs_long <- pivot_longer(dmrs_simple_aggregated,
cols = c(controlMethFrac, learningMethFrac, memoryMethFrac),
names_to = "timepoint",
values_to = "methFrac")
# Reshape degs_simple to long format
degs_long <- pivot_longer(degs_simple,
cols = c(controlExp, learningExp, memoryExp),
names_to = "timepoint",
values_to = "expression")
dmrs_long<-dmrs_long %>%
mutate(timepoint = str_sub(timepoint, 1, -9))
degs_long<-degs_long %>%
mutate(timepoint = str_sub(timepoint, 1, -4))
# Merge the long-format data frames on gene_id and timepoint
degs_dmrs_pairs <- inner_join(dmrs_long, degs_long, by = c("gene_id", "timepoint"))
# Result
correlation_all_pairs <- cor.test(degs_dmrs_pairs$expression,
degs_dmrs_pairs$methFrac,
method = "pearson", use = "complete.obs")
correlation_all_pairs
##
## Pearson's product-moment correlation
##
## data: degs_dmrs_pairs$expression and degs_dmrs_pairs$methFrac
## t = -0.46925, df = 97, p-value = 0.6399
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.2427229 0.1512413
## sample estimates:
## cor
## -0.04759147
# Scatter plot for all pairs
ggplot(degs_dmrs_pairs, aes(x = expression, y = methFrac)) +
geom_point(alpha = 0.5) +
geom_smooth(method = "lm", color = "green") +
labs(x = "expression level",
y = "methylation fraction",
title = "Correlation between DEG and DMR derived pairs of expression levels and
methylation fraction (DMRs found in the 0 to +2500bp region) of a gene ") +
theme_minimal()
time delay tests
# Correlation: Methylation (Control to Learning) with Expression (Learning to Memory)
correlation_methylation_control_learning_with_expression_learning_memory <- cor.test(integrated_data$avg_change_control_to_learning_Meth,
integrated_data$change_learning_to_memory_Exp,
method = "spearman", use = "complete.obs")
# Correlation: Expression (Control to Learning) with Methylation (Learning to Memory)
correlation_expression_control_learning_with_methylation_learning_memory <- cor.test(integrated_data$change_control_to_learning_Exp,
integrated_data$avg_change_learning_to_memory_Meth,
method = "spearman", use = "complete.obs")
# Results
list(
Correlation_Methylation_Control_Learning_with_Expression_Learning_Memory = correlation_methylation_control_learning_with_expression_learning_memory,
Correlation_Expression_Control_Learning_with_Methylation_Learning_Memory = correlation_expression_control_learning_with_methylation_learning_memory
)
## $Correlation_Methylation_Control_Learning_with_Expression_Learning_Memory
##
## Spearman's rank correlation rho
##
## data: integrated_data$avg_change_control_to_learning_Meth and integrated_data$change_learning_to_memory_Exp
## S = 5622, p-value = 0.7373
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.06049465
##
##
## $Correlation_Expression_Control_Learning_with_Methylation_Learning_Memory
##
## Spearman's rank correlation rho
##
## data: integrated_data$change_control_to_learning_Exp and integrated_data$avg_change_learning_to_memory_Meth
## S = 6619.1, p-value = 0.5567
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## -0.1061252
# Methylation (T1 to T2) with Expression (Learning to Memory)
ggplot(integrated_data, aes(x = avg_change_control_to_learning_Meth, y = change_learning_to_memory_Exp)) +
geom_point(alpha = 0.5) +
geom_smooth(method = "lm", color = "blue") +
labs(x = "Average Change in Methylation (Control to Learning)",
y = "Change in Expression (Learning to Memory)",
title = "Time-Delayed Correlation: Methylation to Expression") +
theme_minimal()
# Expression (Control to Learning) with Methylation (Learning to Memory)
ggplot(integrated_data, aes(x = change_control_to_learning_Exp, y = avg_change_learning_to_memory_Meth)) +
geom_point(alpha = 0.5) +
geom_smooth(method = "lm", color = "blue") +
labs(x = "Change in Expression (Control to Learning)",
y = "Average Change in Methylation (Learning to Memory)",
title = "Time-Delayed Correlation: Expression to Methylation") +
theme_minimal()
Hierarchical clustering using all 6 datapoints (3 for DEGs, 3 for DMRs)
#if (!requireNamespace("BiocManager", quietly = TRUE))
# install.packages("BiocManager")
#BiocManager::install("ComplexHeatmap")
#BiocManager::install("circlize")
library(ComplexHeatmap)
library(circlize)
library(RColorBrewer)
# Preparing the dataset for clustering
clustering_data <- integrated_data %>%
dplyr::select(controlExp, learningExp, memoryExp, controlMethFrac, learningMethFrac, memoryMethFrac)
clustering_data_matrix <- as.matrix(clustering_data)
rownames(clustering_data_matrix) <- integrated_data$gene_id
# Assuming clustering_data_matrix is correctly set up
# Define the color palette for the heatmap
heatmap_colors <- colorRampPalette(c("blue", "white", "red"))(100)
# Calculate hierarchical clustering for rows and columns
row_hc <- hclust(dist(clustering_data_matrix), method = "complete")
column_hc <- hclust(dist(t(clustering_data_matrix)), method = "complete")
# Create the heatmap with hierarchical clustering for rows and columns
heatmap_plot <- ComplexHeatmap::Heatmap(clustering_data_matrix,
name = "Expression/Methylation",
col = colorRampPalette(rev(brewer.pal(3, "Set1")))(256),
row_dend_reorder = TRUE,
column_dend_reorder = TRUE,
show_row_names = TRUE,
show_column_names = TRUE,
cluster_columns = FALSE,
clustering_distance_rows = "euclidean",
#clustering_distance_columns = "euclidean",
#clustering_method = "complete",
row_title = "Genes",
column_title = "Variables",
heatmap_legend_param = list(title = "Z-score", at = c(-2, 0, 2), labels = c("Low", "Medium", "High")))
# Print the heatmap
print(heatmap_plot)
add SFLX data to the heatmaps above? add methylation data in DEGs regardless the differential status?
#Part 5, detect differential methylation in SFLX samples.
remove all objects from previous parts
rm(list = ls(all.names = TRUE))
Create a list with the files to be loaded. Here we load the files generated by Megalodon methylation calling software on the highest accuracy settings (SUP) generated in the Hypatia cluster. Each barcode corresponds to each sample, and where analysed separately in the cluster.
library(genomation)
library(methylKit)
meelih_input <- as.list(list.files(path="./bed_files_from_methCallers/",pattern ="m[56].*Megalodon.bed.tsv",full.names = TRUE))
print(meelih_input)
## [[1]]
## [1] "./bed_files_from_methCallers/T1_CTL_m5b1_Megalodon.bed.tsv"
##
## [[2]]
## [1] "./bed_files_from_methCallers/T1_CTL_m5b2_Megalodon.bed.tsv"
##
## [[3]]
## [1] "./bed_files_from_methCallers/T1_CTL_m5b3_Megalodon.bed.tsv"
##
## [[4]]
## [1] "./bed_files_from_methCallers/T2_50ppb_m6b1_Megalodon.bed.tsv"
##
## [[5]]
## [1] "./bed_files_from_methCallers/T2_50ppb_m6b2_Megalodon.bed.tsv"
##
## [[6]]
## [1] "./bed_files_from_methCallers/T2_50ppb_m6b3_Megalodon.bed.tsv"
##
## [[7]]
## [1] "./bed_files_from_methCallers/T2_CTL_m5b4_Megalodon.bed.tsv"
##
## [[8]]
## [1] "./bed_files_from_methCallers/T2_CTL_m5b5_Megalodon.bed.tsv"
##
## [[9]]
## [1] "./bed_files_from_methCallers/T2_CTL_m5b6_Megalodon.bed.tsv"
##
## [[10]]
## [1] "./bed_files_from_methCallers/T3_50ppb_m6b4_Megalodon.bed.tsv"
##
## [[11]]
## [1] "./bed_files_from_methCallers/T3_50ppb_m6b5_Megalodon.bed.tsv"
##
## [[12]]
## [1] "./bed_files_from_methCallers/T3_50ppb_m6b6_Megalodon.bed.tsv"
##
## [[13]]
## [1] "./bed_files_from_methCallers/T3_CTL_m5b7_Megalodon.bed.tsv"
##
## [[14]]
## [1] "./bed_files_from_methCallers/T3_CTL_m5b8_Megalodon.bed.tsv"
##
## [[15]]
## [1] "./bed_files_from_methCallers/T3_CTL_m5b9_Megalodon.bed.tsv"
##
## [[16]]
## [1] "./bed_files_from_methCallers/T3_ret50ppb_m6b10_Megalodon.bed.tsv"
##
## [[17]]
## [1] "./bed_files_from_methCallers/T3_ret50ppb_m6b11_Megalodon.bed.tsv"
##
## [[18]]
## [1] "./bed_files_from_methCallers/T3_ret50ppb_m6b12_Megalodon.bed.tsv"
Get the data and normalize the data.
obj1 <- methRead(meelih_input,
# Provide a list with the names. As the files have been named bc1, bc2 etc
# methRead will load bc1 first, bc2 second and so forth.
#dbtype="tabix", #store in a db so no problem with memory
sample.id=list("T1_CTL_m5b1","T1_CTL_m5b2","T1_CTL_m5b3","T2_50ppb_m6b1","T2_50ppb_m6b2","T2_50ppb_m6b3","T2_CTL_m5b4","T2_CTL_m5b5","T2_CTL_m5b6","T3_50ppb_m6b4","T3_50ppb_m6b5","T3_50ppb_m6b6","T3_CTL_m5b7","T3_CTL_m5b8","T3_CTL_m5b9","T3_ret_50ppb_m6b10","T3_ret_50ppb_m6b11","T3_ret_50ppb_m6b12"),
# 0 is control, 1 is treatment group
treatment = c(0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5),
# Assembly is just a character vector and doesn't affect anything
# can be amel, can also be any random combination of letters
assembly="amel3.1",header=TRUE,
# We have called CpG methylation
context="CpG",
# We want base resolution
resolution="base",
# The pipeline function is used because megalodon has given us a methylation file
# that methylKit doesn't know how to read.
# So we have to specify which columns correspond to what.
pipeline=list(fraction=FALSE,chr.col=1,start.col=2,end.col=3, coverage.col=4, strand.col=FALSE, freqC.col=5),
#we want minimum coverage to be considered
mincov = 1,
)
#obj1<-normalizeCoverage(obj1)
—–Next lets see the T2 CTL vs T2 50ppb SFLX difference:
obj_i<-reorganize(obj1,sample.ids=c("T2_CTL_m5b4","T2_CTL_m5b5","T2_CTL_m5b6","T2_50ppb_m6b1","T2_50ppb_m6b2","T2_50ppb_m6b3"), treatment=c(0,0,0,1,1,1))
obj_i<-normalizeCoverage(obj_i)
meth4 <- methylKit::unite(obj_i, destrand=TRUE, min.per.group=1L)
rm(obj_i)
Tilling windows analysis:
pooled.meth4=pool(meth4,sample.ids=c("T2_CTL","T2_50ppb"))
tilesMeth4=tileMethylCounts(pooled.meth4,win.size=500,step.size=250)
diffTiles4=calculateDiffMeth(tilesMeth4)
Extracting DMRs
myDiffTiles4=getMethylDiff(diffTiles4,difference=20,qvalue=0.01)
rm(meth4,pooled.meth4,tilesMeth4)
We annotate the DMRs, plot their positions in the genome, and check if there is an overlap of DMRs and DMNs detected in analyses above to the set of DMRs found here.
gene.obj=readTranscriptFeatures("./annotation/amel3.1.bed12", remove.unusual = F)
diffAnnTiles4=annotateWithGeneParts(as(myDiffTiles4,"GRanges"),gene.obj)
genomation::plotTargetAnnotation(diffAnnTiles4,precedence=TRUE, main="T2_CTL - T2_50ppb_SFLX DMR annotation")
We next plot the distance to the nearest TSS for T2_CTL vs T2_50ppb DMRs.
#plot distances to TSS
tssTiles4=getAssociationWithTSS(diffAnnTiles4)
hist(tssTiles4$dist.to.feature[abs(tssTiles4$dist.to.feature)<=100000], main="distance of T2_CTL vs T2_50ppb DMR Tile to nearest TSS",xlab="distance in bp",breaks=50,col="orange")
—–Next lets see the T3 CTL vs T3 retrieval 50ppb SFLX difference:
obj_i<-reorganize(obj1,sample.ids=c("T3_CTL_m5b7","T3_CTL_m5b8","T3_CTL_m5b9","T3_ret_50ppb_m6b10","T3_ret_50ppb_m6b11","T3_ret_50ppb_m6b12"), treatment=c(0,0,0,1,1,1))
obj_i<-normalizeCoverage(obj_i)
meth5 <- methylKit::unite(obj_i, destrand=TRUE, min.per.group=1L)
rm(obj_i)
Tilling windows analysis:
pooled.meth5=pool(meth5,sample.ids=c("T3_CTL","T3_ret_50ppb"))
tilesMeth5=tileMethylCounts(pooled.meth5,win.size=500,step.size=250)
diffTiles5=calculateDiffMeth(tilesMeth5)
Extracting DMRs
myDiffTiles5=getMethylDiff(diffTiles5,difference=20,qvalue=0.01)
rm(meth5,pooled.meth5,tilesMeth5)
We annotate the DMRs, plot their positions in the genome.
diffAnnTiles5=annotateWithGeneParts(as(myDiffTiles5,"GRanges"),gene.obj)
genomation::plotTargetAnnotation(diffAnnTiles5,precedence=TRUE, main="T3_CTL - T3_ret_50ppb_SFLX DMR annotation")
We next plot the distance to the nearest TSS for T3_CTL vs T3_ret_50ppb DMRs.
#plot distances to TSS
tssTiles5=getAssociationWithTSS(diffAnnTiles5)
hist(tssTiles5$dist.to.feature[abs(tssTiles5$dist.to.feature)<=100000], main="distance of T3_CTL vs T3_ret_50ppb_SFLX DMR Tile to nearest TSS",xlab="distance in bp",breaks=50,col="purple3")
—–We test just the difference between T3 CTL - T3 50ppb SFLX.
obj_i<-reorganize(obj1,sample.ids=c("T3_CTL_m5b7","T3_CTL_m5b8","T3_CTL_m5b9","T3_50ppb_m6b4","T3_50ppb_m6b5","T3_50ppb_m6b6"), treatment=c(0,0,0,1,1,1))
obj_i<-normalizeCoverage(obj_i)
meth6 <- methylKit::unite(obj_i, destrand=TRUE, min.per.group=1L)
rm(obj_i)
Next we do Tilling windows analysis, to detect DMRs:
pooled.meth6=pool(meth6,sample.ids=c("T3_CTL","T3_50ppb"))
tilesMeth6=tileMethylCounts(pooled.meth6,win.size=500,step.size=250)
diffTiles6=calculateDiffMeth(tilesMeth6)
myDiffTiles6=getMethylDiff(diffTiles6,difference=20,qvalue=0.01)
rm(meth6,pooled.meth6,tilesMeth6)
We annotate the DMRs, plot their positions in the genome.
diffAnnTiles6=annotateWithGeneParts(as(myDiffTiles6,"GRanges"),gene.obj)
genomation::plotTargetAnnotation(diffAnnTiles6,precedence=TRUE, main="T3_CTL - T3_50ppb DMR annotation")
We next plot the distance to the nearest TSS for T3_CTL vs T3_50ppb DMRs.
#plot distances to TSS
tssTiles6=getAssociationWithTSS(diffAnnTiles6)
hist(tssTiles6$dist.to.feature[abs(tssTiles6$dist.to.feature)<=100000], main="distance of T3_CTL - T3_50ppb DMR Tile to nearest TSS",xlab="distance in bp",breaks=50,col="red3")
Format, annotate and export data. Outputing the gene ID lists in csv files. First we make the final tables of DMRs with annotation. The annotation is done on chromosome regions only, so we have either to append a table of NAs for the non annotated rows at the end of myDiffTilesx, or to just omit the results that are on scaffolds that cannot be annotated. We will do the second here, for ease of interpretation of the results. By visual inspection, the annotation only fails after the end of DMRs that map to chromosomes, so what we do bellow is ok.
TilesDataFinal_T2CTLvsT2.50ppb<-cbind(getData(myDiffTiles4)[1:nrow(diffAnnTiles4@dist.to.TSS[,2:4]),] , diffAnnTiles4@dist.to.TSS[,2:4])
TilesDataFinal_T3CTLvsT3.ret.50ppb<-cbind(getData(myDiffTiles5)[1:nrow(diffAnnTiles5@dist.to.TSS[,2:4]),] , diffAnnTiles5@dist.to.TSS[,2:4])
TilesDataFinal_T3CTLvsT3.50ppb<-cbind(getData(myDiffTiles6)[1:nrow(diffAnnTiles6@dist.to.TSS[,2:4]),] , diffAnnTiles6@dist.to.TSS[,2:4])
#Lets add the annotation for promoter/exon/intron
TilesDataFinal_T2CTLvsT2.50ppb<-cbind(TilesDataFinal_T2CTLvsT2.50ppb,diffAnnTiles4@members[1:nrow(TilesDataFinal_T2CTLvsT2.50ppb),])
TilesDataFinal_T3CTLvsT3.ret.50ppb<-cbind(TilesDataFinal_T3CTLvsT3.ret.50ppb,diffAnnTiles5@members[1:nrow(TilesDataFinal_T3CTLvsT3.ret.50ppb),])
TilesDataFinal_T3CTLvsT3.50ppb<-cbind(TilesDataFinal_T3CTLvsT3.50ppb,diffAnnTiles6@members[1:nrow(TilesDataFinal_T3CTLvsT3.50ppb),])
Next lets annotate what we find and see for overlaps between the DMRs of different comparisons. Reading the gff file so that we can give the proper gene IDs:
if (!require("ape")) install.packages("ape",repos = "http://cran.us.r-project.org")
library(ape)
gff<-read.gff(file="./annotation/GCF_003254395.2_Amel_HAv3.1_genomic.gff", GFF3 = TRUE)
getAttributeField <- function (x, field, attrsep = ";") {
s = strsplit(x, split = attrsep, fixed = TRUE)
sapply(s, function(atts) {
a = strsplit(atts, split = "=", fixed = TRUE)
m = match(field, sapply(a, "[", 1))
if (!is.na(m)) {
rv = a[[m]][2]
}
else {
rv = as.character(NA)
}
return(rv)
})
}
gff$name <- getAttributeField(gff$attributes, "Name")
gff$gene <- getAttributeField(gff$attributes, "gene")
Next we add the corresponding gene IDs and gene attributes from the gff annotation gene.obj object.
library(dplyr)
TilesDataFinal_T2CTLvsT2.50ppb<-TilesDataFinal_T2CTLvsT2.50ppb %>%
mutate( gene_ID = gff$gene[pmatch(feature.name,gff$name, duplicates.ok = TRUE)] ,
gene_attributes = gff$attributes[pmatch(feature.name, gff$name, duplicates.ok = TRUE)]
)
TilesDataFinal_T3CTLvsT3.ret.50ppb<-TilesDataFinal_T3CTLvsT3.ret.50ppb %>%
mutate( gene_ID = gff$gene[pmatch(feature.name,gff$name, duplicates.ok = TRUE)] ,
gene_attributes = gff$attributes[pmatch(feature.name, gff$name, duplicates.ok = TRUE)]
)
TilesDataFinal_T3CTLvsT3.50ppb<-TilesDataFinal_T3CTLvsT3.50ppb %>%
mutate( gene_ID = gff$gene[pmatch(feature.name,gff$name, duplicates.ok = TRUE)] ,
gene_attributes = gff$attributes[pmatch(feature.name, gff$name, duplicates.ok = TRUE)]
)
Write data.
write.table(TilesDataFinal_T2CTLvsT2.50ppb, file="./output_data_frames/final_T2CTL_vs_T250ppbSFLX_DMRs.csv", row.names=FALSE, col.names=TRUE, quote=FALSE, sep=", ", na ="NA")
write.table(TilesDataFinal_T3CTLvsT3.50ppb, file="./output_data_frames/final_T3CTL_vs_T350ppbSFLX_DMRs.csv", row.names=FALSE, col.names=TRUE, quote=FALSE, sep=", ", na ="NA")
write.table(TilesDataFinal_T3CTLvsT3.ret.50ppb, file="./output_data_frames/final_T3CTL_vs_T3ret50ppbSFLX_DMRs.csv", row.names=FALSE, col.names=TRUE, quote=FALSE, sep=", ", na ="NA")
Gene ontology results after using the gene lists in Hymenopteramine. All possitive results are collated here.
#Treatment 0
GO_T1_2_3CTL_CC<-read.table("./GO_output/DMRs_of_genes_final_500bp_CTL_Megalodon_d10_q005_Gene_Ontology_celular_component_Hymenopteramine.tsv", sep = '\t', header = F)
GO_T1_2_3CTL_MF<-read.table("./GO_output/DMRs_of_genes_final_500bp_CTL_Megalodon_d10_q005_Gene_Ontology_molecular_function_Hymenopteramine.tsv", sep = '\t', header = F)
GO_T1_2_3CTL_PE<-read.table("./GO_output/DMRs_of_genes_final_500bp_CTL_Megalodon_d10_q005_Gene_Ontology_Pathway_enrichment_Hymenopteramine.tsv", sep = '\t', header = F)
colnames(GO_T1_2_3CTL_CC)<-c("GOterm","Pvalue","Genes","GO")
colnames(GO_T1_2_3CTL_MF)<-c("GOterm","Pvalue","Genes","GO")
colnames(GO_T1_2_3CTL_PE)<-c("GOterm","Pvalue","Genes","GO")
GO_T1_2_3CTL_CC$GOroot<-"cellular components"
GO_T1_2_3CTL_MF$GOroot<-"molecular functions"
GO_T1_2_3CTL_PE$GOroot<-"pathway enrichment"
GO_T1_2_3CTL<-rbind(GO_T1_2_3CTL_CC,GO_T1_2_3CTL_MF,GO_T1_2_3CTL_PE)
GO_T1_2_3CTL$Treatment<-"T1_2_3_CTL"
#Treatment 1
GO_T2CTLvsT250ppb_BP<-read.table("./GO_output/final_T2CTL_vs_T250ppbSFLX_DMRs_GeneOntology_biological_process_Hymenopteramine.tsv", sep = '\t', header = F)
GO_T2CTLvsT250ppb_CC<-read.table("./GO_output/final_T2CTL_vs_T250ppbSFLX_DMRs_GeneOntology_cellular_component_Hymenopteramine.tsv", sep = '\t', header = F)
GO_T2CTLvsT250ppb_MF<-read.table("./GO_output/final_T2CTL_vs_T250ppbSFLX_DMRs_GeneOntology_molecular_function_Hymenopteramine.tsv", sep = '\t', header = F)
colnames(GO_T2CTLvsT250ppb_BP)<-c("GOterm","Pvalue","Genes","GO")
colnames(GO_T2CTLvsT250ppb_CC)<-c("GOterm","Pvalue","Genes","GO")
colnames(GO_T2CTLvsT250ppb_MF)<-c("GOterm","Pvalue","Genes","GO")
GO_T2CTLvsT250ppb_BP$GOroot<-"biological process"
GO_T2CTLvsT250ppb_CC$GOroot<-"cellular components"
GO_T2CTLvsT250ppb_MF$GOroot<-"molecular functions"
GO_T2CTLvsT250ppb<-rbind(GO_T2CTLvsT250ppb_BP,GO_T2CTLvsT250ppb_CC,GO_T2CTLvsT250ppb_MF)
GO_T2CTLvsT250ppb$Treatment<-"T2CTL_vs_T2SFLX"
#Treatment 1
GO_T3CTLvsT350ppb_BP<-read.table("./GO_output/final_T3CTL_vs_T350ppbSFLX_DMRs_GeneOntology_biological_process_Hymenopteramine.tsv", sep = '\t', header = F)
GO_T3CTLvsT350ppb_CC<-read.table("./GO_output/final_T3CTL_vs_T350ppbSFLX_DMRs_GeneOntology_cellular_component_Hymenopteramine.tsv", sep = '\t', header = F)
GO_T3CTLvsT350ppb_MF<-read.table("./GO_output/final_T3CTL_vs_T350ppbSFLX_DMRs_GeneOntology_molecular_function_Hymenopteramine.tsv", sep = '\t', header = F)
colnames(GO_T3CTLvsT350ppb_BP)<-c("GOterm","Pvalue","Genes","GO")
colnames(GO_T3CTLvsT350ppb_CC)<-c("GOterm","Pvalue","Genes","GO")
colnames(GO_T3CTLvsT350ppb_MF)<-c("GOterm","Pvalue","Genes","GO")
GO_T3CTLvsT350ppb_BP$GOroot<-"biological process"
GO_T3CTLvsT350ppb_CC$GOroot<-"cellular components"
GO_T3CTLvsT350ppb_MF$GOroot<-"molecular functions"
GO_T3CTLvsT350ppb<-rbind(GO_T3CTLvsT350ppb_BP,GO_T3CTLvsT350ppb_CC,GO_T3CTLvsT350ppb_MF)
GO_T3CTLvsT350ppb$Treatment<-"T3CTL_vs_T3SFLX"
#Treatment 1
GO_T3CTLvsT3ret50ppb_BP<-read.table("./GO_output/final_T3CTL_vs_T3ret50ppbSFLX_DMRs_GeneOntology_biological_process_Hymenopteramine.tsv", sep = '\t', header = F)
GO_T3CTLvsT3ret50ppb_CC<-read.table("./GO_output/final_T3CTL_vs_T3ret50ppbSFLX_DMRs_GeneOntology_cellular_component_Hymenopteramine.tsv", sep = '\t', header = F)
GO_T3CTLvsT3ret50ppb_MF<-read.table("./GO_output/final_T3CTL_vs_T3ret50ppbSFLX_DMRs_GeneOntology_molecular_function_Hymenopteramine.tsv", sep = '\t', header = F)
colnames(GO_T3CTLvsT3ret50ppb_BP)<-c("GOterm","Pvalue","Genes","GO")
colnames(GO_T3CTLvsT3ret50ppb_CC)<-c("GOterm","Pvalue","Genes","GO")
colnames(GO_T3CTLvsT3ret50ppb_MF)<-c("GOterm","Pvalue","Genes","GO")
GO_T3CTLvsT3ret50ppb_BP$GOroot<-"biological process"
GO_T3CTLvsT3ret50ppb_CC$GOroot<-"cellular components"
GO_T3CTLvsT3ret50ppb_MF$GOroot<-"molecular functions"
GO_T3CTLvsT3ret50ppb<-rbind(GO_T3CTLvsT3ret50ppb_BP,GO_T3CTLvsT3ret50ppb_CC,GO_T3CTLvsT3ret50ppb_MF)
GO_T3CTLvsT3ret50ppb$Treatment<-"T3CTL_vs_T3retSFLX"
AllGO<-rbind(GO_T1_2_3CTL,GO_T2CTLvsT250ppb,GO_T3CTLvsT350ppb,GO_T3CTLvsT3ret50ppb)
#extract count
AllGO$CountGenes<-sapply(strsplit(AllGO$Genes,","),FUN=function(x){length(x[x!="Null"])})
#plot GO with pvalue (log10pvalue)
AllGO$log10pvalue<--log10(AllGO$Pvalue)
#filter to reduce results
GO_threshold<-8
filtered_AllGO<- AllGO %>%
filter(log10pvalue>GO_threshold)
treatment_colors <- c("T1_2_3_CTL" = "green3","T2CTL_vs_T2SFLX" = "orange", "T3CTL_vs_T3SFLX" = "red2", "T3CTL_vs_T3retSFLX" = "purple3")
GOplot <- ggplot(subset(filtered_AllGO, log10pvalue >= 1), aes(reorder(GOterm, log10pvalue), log10pvalue, fill = Treatment)) +
geom_col(width = 0.7, color = "black") +
scale_fill_manual(values = treatment_colors) +
#facet_wrap(~ Treatment) +
coord_flip() +
theme_minimal() +
theme(
plot.title = element_text(size = 12),
axis.line = element_line(colour = "black"),
axis.text = element_text(size = 10, colour = "black"),
axis.title = element_text(size = 10),
legend.title = element_blank(),
legend.text = element_text(size = 10)) +
labs(x = "-Log10(p.value)", y = "", title = "GO terms enrichment") +
ylab("")
GOplot
GOplot_T0 <- ggplot(subset(filtered_AllGO, log10pvalue >= 1 & Treatment=="T1_2_3_CTL"), aes(reorder(GOterm, log10pvalue), log10pvalue, fill = Treatment)) +
geom_col(width = 0.7, color = "black") +
scale_fill_manual(values = treatment_colors) + # Use scale_fill_manual to specify the fill colors based on treatment_colors
facet_wrap(~ Treatment) +
coord_flip() +
theme_minimal() +
theme(
plot.title = element_text(size = 12),
axis.line = element_line(colour = "black"),
axis.text = element_text(size = 10, colour = "black"),
axis.title = element_text(size = 10),
legend.title = element_blank(),
legend.text = element_text(size = 10)) +
labs(x = "-Log10(p.value)", y = "", title = "GO terms enrichment") +
ylab("")
GOplot_T0
GOplot_T1 <- ggplot(subset(filtered_AllGO, log10pvalue >= 1 & Treatment=="T2CTL_vs_T2SFLX"), aes(reorder(GOterm, log10pvalue), log10pvalue, fill = Treatment)) +
geom_col(width = 0.7, color = "black") +
scale_fill_manual(values = treatment_colors) + # Use scale_fill_manual to specify the fill colors based on treatment_colors
facet_wrap(~ Treatment) +
coord_flip() +
theme_minimal() +
theme(
plot.title = element_text(size = 12),
axis.line = element_line(colour = "black"),
axis.text = element_text(size = 10, colour = "black"),
axis.title = element_text(size = 10),
legend.title = element_blank(),
legend.text = element_text(size = 10)) +
labs(x = "-Log10(p.value)", y = "", title = "GO terms enrichment") +
ylab("")
GOplot_T1
GOplot_T2 <- ggplot(subset(filtered_AllGO, log10pvalue >= 1 & Treatment=="T3CTL_vs_T3SFLX"), aes(reorder(GOterm, log10pvalue), log10pvalue, fill = Treatment)) +
geom_col(width = 0.7, color = "black") +
scale_fill_manual(values = treatment_colors) + # Use scale_fill_manual to specify the fill colors based on treatment_colors
facet_wrap(~ Treatment) +
coord_flip() +
theme_minimal() +
theme(
plot.title = element_text(size = 12),
axis.line = element_line(colour = "black"),
axis.text = element_text(size = 10, colour = "black"),
axis.title = element_text(size = 10),
legend.title = element_blank(),
legend.text = element_text(size = 10)) +
labs(x = "-Log10(p.value)", y = "", title = "GO terms enrichment") +
ylab("")
GOplot_T2
GOplot_T3 <- ggplot(subset(filtered_AllGO, log10pvalue >= 1 & Treatment=="T3CTL_vs_T3retSFLX"), aes(reorder(GOterm, log10pvalue), log10pvalue, fill = Treatment)) +
geom_col(width = 0.7, color = "black") +
scale_fill_manual(values = treatment_colors) + # Use scale_fill_manual to specify the fill colors based on treatment_colors
facet_wrap(~ Treatment) +
coord_flip() +
theme_minimal() +
theme(
plot.title = element_text(size = 12),
axis.line = element_line(colour = "black"),
axis.text = element_text(size = 10, colour = "black"),
axis.title = element_text(size = 10),
legend.title = element_blank(),
legend.text = element_text(size = 10)) +
labs(x = "-Log10(p.value)", y = "", title = "GO terms enrichment") +
ylab("")
GOplot_T3
library(gridExtra)
grid.arrange(GOplot_T0,GOplot_T1,GOplot_T2,GOplot_T3,nrow=1, ncol=4)