We will perform differential methylation analyses for Control samples
and 50ppb SFLX samples from the following experiment: #
We detect differential methylation per base and per 50 base pair
windows, shifting every 25 base pairs.
The comparisons we will perform first are on the effects of learning on the methylation patterns of the bee brain: T1-T2 = we are looking for methylation acquisition/removal sites specifically for the learning phase T2-T3= we are looking for methylation acquisition/removal sites specifically for the memory retrieval T1-T3= we are looking for methylation acquisition/removal sites specifically for the memory acquisition
Then, if we have good candidate sites, will check how these sites behave after exposure to 50ppb SFLX: T2 CTL - T2 50ppb = we are looking for methylation/removal site affected by 50ppb for the learning phase
Last we will compare all control bees with all 50ppb bees. M1 and M3 samples are all T3 time point, so perhaps we should compare only the T3 CTL versus T3 50ppb.
The sample list is the following: M1_b1(MEELIH-01 barcode 1) T3-CTL
M1_b2 T3-CTL M1_b3 T3-CTL M1_b4 T3-50ppb M1_b5 T3-50ppb M1_b6 T3-50ppb M3_b1 T3-CTL M3_b2 T3-5ppb M3_b3 T3-5ppb M3_b4 T3-5ppb M3_b5 T3-5ppb M3_b6 T3-50ppb M5-M5B_b1 T1-CTL M5-M5B_b2 T1-CTL M5-M5B_b3 T1-CTL M5-M5B_b4 T2-CTL M5-M5B_b5 T2-CTL M5-M5B_b6 T2-CTL M5-M5B_b7 T3-CTL M5-M5B_b8 T3-CTL M5-M5B_b9 T3-CTL M6-M5B_b10 retrieval-T3-50ppb M6-M5B_b11 retrieval-T3-50ppb M6-M5B_b12 retrieval-T3-50ppb M6_b1 T2-50ppb M6_b2 T2-50ppb M6_b3 T2-50ppb M6_b4 T3-50ppb M6_b5 T3-50ppb M6_b6 T3-50ppb
First, install and load the required packages
if (!require("tidyverse")) install.packages("tidyverse")
if (!require("data.table")) install.packages("data.table")
if (!require("ggplot2")) install.packages("ggplot2")
if (!require("ggstatsplot")) install.packages("ggstatsplot")
if (!require("genomation")) BiocManager::install("genomation")
if (!require("methylKit")) BiocManager::install("methylKit")
Create a list with the files to be loaded. Here we load the files generated by Megalodon methylation calling software on the highest accuracy settings (SUP) generated in the Hypatia cluster. Each barcode corresponds to each sample, and where analysed separately in the cluster.
meelih_input <- as.list(list.files(path=".",pattern = "M5_M5B*",full.names = FALSE))
print(meelih_input)
## [[1]]
## [1] "M5_M5B_b1_T1-CTL_modified_bases.5mC.bed.freq-per-CG-combStrand.tsv"
##
## [[2]]
## [1] "M5_M5B_b2_T1-CTL_modified_bases.5mC.bed.freq-per-CG-combStrand.tsv"
##
## [[3]]
## [1] "M5_M5B_b3_T1-CTL_modified_bases.5mC.bed.freq-per-CG-combStrand.tsv"
##
## [[4]]
## [1] "M5_M5B_b4_T2-CTL_modified_bases.5mC.bed.freq-per-CG-combStrand.tsv"
##
## [[5]]
## [1] "M5_M5B_b5_T2-CTL_modified_bases.5mC.bed.freq-per-CG-combStrand.tsv"
##
## [[6]]
## [1] "M5_M5B_b6_T2-CTL_modified_bases.5mC.bed.freq-per-CG-combStrand.tsv"
##
## [[7]]
## [1] "M5_M5B_b7_T3-CTL_modified_bases.5mC.bed.freq-per-CG-combStrand.tsv"
##
## [[8]]
## [1] "M5_M5B_b8_T3-CTL_modified_bases.5mC.bed.freq-per-CG-combStrand.tsv"
##
## [[9]]
## [1] "M5_M5B_b9_T3-CTL_modified_bases.5mC.bed.freq-per-CG-combStrand.tsv"
str(meelih_input[1:6])
## List of 6
## $ : chr "M5_M5B_b1_T1-CTL_modified_bases.5mC.bed.freq-per-CG-combStrand.tsv"
## $ : chr "M5_M5B_b2_T1-CTL_modified_bases.5mC.bed.freq-per-CG-combStrand.tsv"
## $ : chr "M5_M5B_b3_T1-CTL_modified_bases.5mC.bed.freq-per-CG-combStrand.tsv"
## $ : chr "M5_M5B_b4_T2-CTL_modified_bases.5mC.bed.freq-per-CG-combStrand.tsv"
## $ : chr "M5_M5B_b5_T2-CTL_modified_bases.5mC.bed.freq-per-CG-combStrand.tsv"
## $ : chr "M5_M5B_b6_T2-CTL_modified_bases.5mC.bed.freq-per-CG-combStrand.tsv"
obj1 <- methRead(meelih_input[1:6],
# Provide a list with the names. As the files have been named bc1, bc2 etc
# methRead will load bc1 first, bc2 second and so forth.
sample.id=list("T1_CTL_m5b1","T1_CTL_m5b2","T1_CTL_m5b3","T2_CTL_m5b4","T2_CTL_m5b5","T2_CTL_m5b6"),
# 0 is control, 1 is treatment group
treatment = c(0,0,0,1,1,1),
# Assembly is just a character vector and doesn't affect anything
# can be amel, can also be any random combination of letters
assembly="amel3.1",header=TRUE,
# We have called CpG methylation
context="CpG",
# We want base resolution
resolution="base",
# The pipeline function is used because megalodon has given us a methylation file
# that methylKit doesn't know how to read.
# So we have to specify which columns correspond to what.
pipeline=list(fraction=FALSE,chr.col=1,start.col=2,end.col=3, coverage.col=4, strand.col=FALSE, freqC.col=5),
#we want minimum coverage to be considered
mincov = 1,
)
Plotting the methylation and coverage stats per sample.
obj1@treatment
## [1] 0 0 0 1 1 1
for (i in 1:6) {
# Methylation stats
getMethylationStats(obj1[[i]],plot=TRUE)
# Coverage stats
getCoverageStats(obj1[[i]],plot=TRUE)
getCoverageStats(obj1[[i]])
}
## read coverage statistics per base
## summary:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 5.000 6.000 6.937 8.000 2914.000
## percentiles:
## 0% 10% 20% 30% 40% 50% 60% 70% 80% 90% 95% 99% 99.5%
## 1 3 4 5 6 6 7 8 9 10 12 15 18
## 99.9% 100%
## 98 2914
## read coverage statistics per base
## summary:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 4.000 6.000 6.233 7.000 2732.000
## percentiles:
## 0% 10% 20% 30% 40% 50% 60% 70% 80% 90% 95% 99% 99.5%
## 1 3 4 4 5 6 6 7 8 9 10 13 16
## 99.9% 100%
## 94 2732
## read coverage statistics per base
## summary:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 3.000 4.000 4.427 5.000 1802.000
## percentiles:
## 0% 10% 20% 30% 40% 50% 60% 70% 80% 90% 95% 99% 99.5%
## 1 2 2 3 3 4 4 5 6 7 8 10 12
## 99.9% 100%
## 65 1802
## read coverage statistics per base
## summary:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 4.000 5.000 5.915 7.000 2118.000
## percentiles:
## 0% 10% 20% 30% 40% 50% 60% 70% 80% 90% 95% 99% 99.5%
## 1 2 3 4 5 5 6 7 8 9 10 13 15
## 99.9% 100%
## 88 2118
## read coverage statistics per base
## summary:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 4.000 6.000 6.368 8.000 2876.000
## percentiles:
## 0% 10% 20% 30% 40% 50% 60% 70% 80% 90% 95% 99% 99.5%
## 1 3 4 4 5 6 6 7 8 10 11 14 16
## 99.9% 100%
## 91 2876
## read coverage statistics per base
## summary:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 1.00 1.00 1.81 2.00 1772.00
## percentiles:
## 0% 10% 20% 30% 40% 50% 60% 70% 80% 90% 95% 99% 99.5%
## 1 1 1 1 1 1 2 2 2 3 4 5 6
## 99.9% 100%
## 25 1772
We can now combine the files together into one object that we can use for Differentially Methylated Nucleotide (DMN) and Diff. Meth. Region (DMR) analysis.
meth1 <- methylKit::unite(obj1, destrand=TRUE)
First, we look at the correlation between samples, cluster them, and look at the PCA plots.
# Correlation
getCorrelation(meth1,plot=TRUE)
## T1_CTL_m5b1 T1_CTL_m5b2 T1_CTL_m5b3 T2_CTL_m5b4 T2_CTL_m5b5
## T1_CTL_m5b1 1.0000000 0.3706860 0.3596811 0.3691495 0.3673136
## T1_CTL_m5b2 0.3706860 1.0000000 0.3313421 0.3817389 0.3585796
## T1_CTL_m5b3 0.3596811 0.3313421 1.0000000 0.3292470 0.3265476
## T2_CTL_m5b4 0.3691495 0.3817389 0.3292470 1.0000000 0.3579771
## T2_CTL_m5b5 0.3673136 0.3585796 0.3265476 0.3579771 1.0000000
## T2_CTL_m5b6 0.2627289 0.2578668 0.2345226 0.2565070 0.2629208
## T2_CTL_m5b6
## T1_CTL_m5b1 0.2627289
## T1_CTL_m5b2 0.2578668
## T1_CTL_m5b3 0.2345226
## T2_CTL_m5b4 0.2565070
## T2_CTL_m5b5 0.2629208
## T2_CTL_m5b6 1.0000000
# Clustering
clusterSamples(meth1, dist="correlation", method="average", plot=TRUE)
##
## Call:
## hclust(d = d, method = HCLUST.METHODS[hclust.method])
##
## Cluster method : average
## Distance : pearson
## Number of objects: 6
# Also save the clustering as an object
hc1 = clusterSamples(meth1, dist="correlation", method="ward", plot=FALSE)
# PCA
PCASamples(meth1, screeplot=TRUE)
PCASamples(meth1)
Next we calculate the Differential Methylation per base.
myDiff1=calculateDiffMeth(meth1)
We extract the DMNs as well as the hypo- and hyper-methylated nucleotides separately. Here, due to low number of detected DMNs, I’ve set the threshold difference to 5%.
myDiff1_5p=getMethylDiff(myDiff1,difference=5,qvalue=0.01)
#str(myDiff20p)
# Hypomethylated
myDiff1_5p.hypo=getMethylDiff(myDiff1,difference=5,qvalue=0.01,type="hypo")
#str(myDiff20p.hypo)
# Hypermethylated
myDiff1_5p.hyper=getMethylDiff(myDiff1,difference=5,qvalue=0.01,type="hyper")
#str(myDiff20p.hyper)
We annotate the DMNs. First, we need to read the BED12 file that has the genome annotation
gene.obj=readTranscriptFeatures("./amel3.1.bed12", remove.unusual = F)
We annotate the DMNs per chromosome
diffMethPerChr(myDiff1, plot=TRUE, qvalue.cutoff=0.01, meth.cutoff=5)
We annotate the DMNs for their position in introns, exons, etc
diffAnn1=annotateWithGeneParts(as(myDiff1_5p,"GRanges"),gene.obj)
genomation::plotTargetAnnotation(diffAnn1,precedence=TRUE, main="T1_CTL - T2_CTL DMN annotation")
We then describe the DMNs by displaying the structure of the diffAnn object, as well as specific attributes, like the gene name that is closest to them (feature name). We also plot distance to nearest TSS. We extract the gene ID lists at the end of the analysis.
#str(diffAnn)
#here we see the way to access the annotation for each DMN.
#diffAnn@dist.to.TSS$dist.to.feature
#diffAnn@dist.to.TSS$feature.name
#diffAnn@members
tss.assoc.1=getAssociationWithTSS(diffAnn1)
hist(tss.assoc.1$dist.to.feature[abs(tss.assoc.1$dist.to.feature)<=100000],main="T1_CTL - T2_CTL DMN distance to nearest TSS",xlab="distance in bp",breaks=50,col="brown4")
Next lets do tilling windows analysis. The windows are set at 50bp length and we check for Diff. Meth. Regions (DMRs) every 25bp:
pooled.meth1=pool(meth1,sample.ids=c("T1","T2"))
tilesMeth1=tileMethylCounts(pooled.meth1,win.size=50,step.size=25)
#head(tilesMeth1)
diffTiles1=calculateDiffMeth(tilesMeth1)
#head(diffTiles1)
myDiffTiles1_10p=getMethylDiff(diffTiles1,difference=10,qvalue=0.01)
We show some attributes of the DMRs, like the ID of the closest gene, and plot their position:
diffAnnTiles1=annotateWithGeneParts(as(myDiffTiles1_10p,"GRanges"),gene.obj)
genomation::plotTargetAnnotation(diffAnnTiles1,precedence=TRUE, main="T1_CTL - T2_CTL DMR annotation")
#str(diffAnnTiles)
#diffAnnTiles@dist.to.TSS$dist.to.feature
#diffAnnTiles@dist.to.TSS$feature.name
#diffAnnTiles@members
We then plot the distance to the nearest TSS for the DMRs. Note that our null assumption is a bell curve, so it is likely that we have more DMRs in gene coding regions than expected by chance, as the plot bellow suggests:
#plot distances to TSS
tssTiles1=getAssociationWithTSS(diffAnnTiles1)
hist(tssTiles1$dist.to.feature[abs(tssTiles1$dist.to.feature)<=100000], main="T1_CTL - T2_CTL DMR distance of Tile to nearest TSS",xlab="distance in bp",breaks=50,col="brown2")
Lets see the same plot, but now for the distances of random positions to the TSS of randomly placed genes, on a genome of the same size and number of genes to Apis mellifera. This is done to get an intuition of what we expect for this plot to look like if we would have picked the positions by chance alone:
randomTSSpositions<-sample(1:225250884,28410) #based on latest genome size and number of genes
randomDistancesToNearestTSS<-vector(mode='list',length(30000))
for (x in 1:30000){
randomMethylatedPosition<-runif(1,1,225250884)
minAbsDistance<- 1000000000
minDistance<- 1000000000
for (i in randomTSSpositions){
if(abs(randomMethylatedPosition-i)<minAbsDistance){
minAbsDistance<-abs(randomMethylatedPosition-i)
minDistance<-randomMethylatedPosition-i
}
}
randomDistancesToNearestTSS[x]<-minDistance
}
hist(as.numeric(unlist(randomDistancesToNearestTSS)),main="distance to nearest TSS of random position",xlab="distance in bp",breaks=50,col="brown4")
Lets make a plot of the annotation of random genomic positions as well: We will change the positions but not the chromosomes at the moment.
randomTiles<-myDiffTiles1_10p
#str(randomTiles)
randomTiles$start[1:20]
## [1] 1176576 1176601 4559076 4559101 6721276 6721301 7065326 9332651
## [9] 9332676 11509926 11509951 11545601 19595276 19595301 21039251 22678451
## [17] 22779851 24619476 3941051 3941076
plusMinusRand<-function(x){
z<-x+sample(c(-10000:-9000,9000:10000),1)
return(z)
}
plusOne<-function(x){
z<-x+1
z<-as.integer(z)
return(z)
}
randomPositionsStart<-sapply(randomTiles$start,plusMinusRand)
randomTiles$start<-randomPositionsStart
randomPositionsEnd<-sapply(randomTiles$start,plusOne)
randomTiles$end<-randomPositionsEnd
randomTiles$start[1:20]
## [1] 1167431 1166820 4568332 4549363 6730310 6730589 7055738 9323034
## [9] 9322687 11500763 11519578 11555336 19585718 19604810 21048567 22687528
## [17] 22770117 24609829 3950989 3931165
#randomTilesAnn<-randomizeFeature(randomTilesAnn, chrom.sizes = NULL,
# stranded = FALSE, keep.strand.prop = TRUE, keep.chrom = TRUE,
# exclude = NULL, include = NULL, seed = NULL, nrand = 1)
randomTilesAnn<-annotateWithGeneParts(as(randomTiles,"GRanges"),gene.obj)
genomation::plotTargetAnnotation(randomTilesAnn,precedence=TRUE, main="Annotation of 682 random positions")
Next we perform the same set of DMN and DMR analyses, lightly truncated, for comparison between T2_CTL and T3_CTL
#-------------T2-T3 comparison-------------------------------------
str(meelih_input[4:9])
## List of 6
## $ : chr "M5_M5B_b4_T2-CTL_modified_bases.5mC.bed.freq-per-CG-combStrand.tsv"
## $ : chr "M5_M5B_b5_T2-CTL_modified_bases.5mC.bed.freq-per-CG-combStrand.tsv"
## $ : chr "M5_M5B_b6_T2-CTL_modified_bases.5mC.bed.freq-per-CG-combStrand.tsv"
## $ : chr "M5_M5B_b7_T3-CTL_modified_bases.5mC.bed.freq-per-CG-combStrand.tsv"
## $ : chr "M5_M5B_b8_T3-CTL_modified_bases.5mC.bed.freq-per-CG-combStrand.tsv"
## $ : chr "M5_M5B_b9_T3-CTL_modified_bases.5mC.bed.freq-per-CG-combStrand.tsv"
obj2 <- methRead(meelih_input[4:9],
sample.id=list("T2_CTL_m5b4","T2_CTL_m5b5","T2_CTL_m5b6","T3_CTL_m5b7","T3_CTL_m5b8","T3_CTL_m5b9"),
treatment = c(0,0,0,1,1,1),
assembly="amel3.1",header=TRUE,
context="CpG",
resolution="base",
pipeline=list(fraction=FALSE,chr.col=1,start.col=2,end.col=3, coverage.col=4, strand.col=FALSE, freqC.col=5),
mincov = 1,
)
Basic statistics for each sample:
obj2@treatment
## [1] 0 0 0 1 1 1
for (i in 4:6) {
# Methylation stats
getMethylationStats(obj2[[i]],plot=TRUE)
# Coverage stats
getCoverageStats(obj2[[i]],plot=TRUE)
getCoverageStats(obj2[[i]])
}
## read coverage statistics per base
## summary:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 3.000 5.000 5.085 6.000 1614.000
## percentiles:
## 0% 10% 20% 30% 40% 50% 60% 70% 80% 90% 95% 99% 99.5%
## 1 2 3 3 4 5 5 6 7 8 9 12 14
## 99.9% 100%
## 68 1614
## read coverage statistics per base
## summary:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 1.000 1.000 1.676 2.000 876.000
## percentiles:
## 0% 10% 20% 30% 40% 50% 60% 70% 80% 90% 95% 99% 99.5%
## 1 1 1 1 1 1 2 2 2 3 3 5 6
## 99.9% 100%
## 23 876
## read coverage statistics per base
## summary:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 1.000 1.000 1.654 2.000 566.000
## percentiles:
## 0% 10% 20% 30% 40% 50% 60% 70% 80% 90% 95% 99% 99.5%
## 1 1 1 1 1 1 2 2 2 3 3 5 5
## 99.9% 100%
## 22 566
Now lets calculate the T2 vs T3 CTL DMNs and DMRs
meth2 <- methylKit::unite(obj2, destrand=TRUE)
PCASamples(meth2)
DMN calculation:
myDiff2=calculateDiffMeth(meth2)
myDiff2_5p=getMethylDiff(myDiff2,difference=5,qvalue=0.01)
# Hypomethylated
myDiff2_5p.hypo=getMethylDiff(myDiff2,difference=5,qvalue=0.01,type="hypo")
# Hypermethylated
myDiff2_5p.hyper=getMethylDiff(myDiff2,difference=5,qvalue=0.01,type="hyper")
Annotation of the DMNs, as well as display of DMN attributes. We check to see if DMNs found for T1-T2 are also found here for T2-T3 (?result). Last we plot the position of DMNs generally and relative to TSSs:
diffAnn2=annotateWithGeneParts(as(myDiff2_5p,"GRanges"),gene.obj)
genomation::plotTargetAnnotation(diffAnn2,precedence=TRUE, main="T2_CTL - T3_CTL DMN annotation")
#str(diffAnn2)
#diffAnn2@dist.to.TSS$dist.to.feature
#diffAnn2@dist.to.TSS$feature.name
#diffAnn2@members
#diffAnn@dist.to.TSS$feature.name %in% diffAnn2@dist.to.TSS$feature.name
tss.assoc.2=getAssociationWithTSS(diffAnn2)
hist(tss.assoc.2$dist.to.feature[abs(tss.assoc.2$dist.to.feature)<=100000],main="distance of T2_CTL - T3_CTL DMN to nearest TSS",xlab="distance in bp",breaks=50,col="orange4")
Next we do Tilling windows analysis, to detect DMRs, as above:
pooled.meth2=pool(meth2,sample.ids=c("T2","T3"))
tilesMeth2=tileMethylCounts(pooled.meth2,win.size=50,step.size=25)
#head(tilesMeth2)
diffTiles2=calculateDiffMeth(tilesMeth2)
#head(diffTiles2)
Detecting DMRs:
myDiffTiles2_10p=getMethylDiff(diffTiles2,difference=10,qvalue=0.01)
We annotate the DMRs, plot their positions in the genome, and check if there is an overlap of DMRs and DMNs detected in analyses above to the set of DMRs found here.
diffAnnTiles2=annotateWithGeneParts(as(myDiffTiles2_10p,"GRanges"),gene.obj)
genomation::plotTargetAnnotation(diffAnnTiles2,precedence=TRUE, main="T2_CTL - T3_CTL DMR annotation")
#str(diffAnnTiles2)
#diffAnnTiles2@dist.to.TSS$dist.to.feature
#diffAnnTiles2@dist.to.TSS$feature.name
#diffAnnTiles2@members
#diffAnn@dist.to.TSS$feature.name %in% diffAnnTiles2@dist.to.TSS$feature.name
#diffAnnTiles@dist.to.TSS$feature.name %in% diffAnnTiles2@dist.to.TSS$feature.name
#diffAnn2@dist.to.TSS$feature.name %in% diffAnnTiles2@dist.to.TSS$feature.name
We next plot the distance to the nearest TSS for T2vsT3 CTL DMRs.
#plot distances to TSS
tssTiles2=getAssociationWithTSS(diffAnnTiles2)
hist(tssTiles2$dist.to.feature[abs(tssTiles2$dist.to.feature)<=100000], main="distance of T2_CTL vs T3_CTL DMR Tile to nearest TSS",xlab="distance in bp",breaks=50,col="orange2")
Next lets see the T1 vs T3 CTL differential methylation:
obj3 <- methRead(meelih_input[c(1:3,7:9)],
sample.id=list("T1_CTL_m5b1","T1_CTL_m5b2","T1_CTL_m5b3","T3_CTL_m5b7","T3_CTL_m5b8","T3_CTL_m5b9"),
treatment = c(0,0,0,1,1,1),
assembly="amel3.1",header=TRUE,
context="CpG",
resolution="base",
pipeline=list(fraction=FALSE,chr.col=1,start.col=2,end.col=3, coverage.col=4, strand.col=FALSE, freqC.col=5),
mincov = 1)
Now lets calculate the T1 vs T3 CTL DMNs and DMRs
meth3 <- methylKit::unite(obj3, destrand=TRUE)
PCASamples(meth3)
DMN calculation
myDiff3=calculateDiffMeth(meth3)
myDiff3_5p=getMethylDiff(myDiff3,difference=5,qvalue=0.01)
myDiff3_5p.hypo=getMethylDiff(myDiff3,difference=5,qvalue=0.01,type="hypo")
myDiff3_5p.hyper=getMethylDiff(myDiff3,difference=5,qvalue=0.01,type="hyper")
Annotation of the DMNs,
diffAnn3=annotateWithGeneParts(as(myDiff3_5p,"GRanges"),gene.obj)
genomation::plotTargetAnnotation(diffAnn3,precedence=TRUE, main="T1_CTL - T3_CTL DMN annotation")
tss.assoc.3=getAssociationWithTSS(diffAnn3)
hist(tss.assoc.3$dist.to.feature[abs(tss.assoc.3$dist.to.feature)<=100000],main="distance of T1_CTL - T3_CTL DMN to nearest TSS",xlab="distance in bp",breaks=50,col="purple4")
Next we do Tilling windows analysis, to detect DMRs:
pooled.meth3=pool(meth3,sample.ids=c("T1","T3"))
tilesMeth3=tileMethylCounts(pooled.meth3,win.size=50,step.size=25)
diffTiles3=calculateDiffMeth(tilesMeth3)
myDiffTiles3_10p=getMethylDiff(diffTiles3,difference=10,qvalue=0.01)
We annotate the DMRs and plot their positions in the genome.
diffAnnTiles3=annotateWithGeneParts(as(myDiffTiles3_10p,"GRanges"),gene.obj)
genomation::plotTargetAnnotation(diffAnnTiles3,precedence=TRUE, main="T1_CTL - T3_CTL DMR annotation")
We next plot the distance to the nearest TSS for T1-T3 DMRs.
#plot distances to TSS
tssTiles3=getAssociationWithTSS(diffAnnTiles3)
hist(tssTiles3$dist.to.feature[abs(tssTiles3$dist.to.feature)<=100000], main="distance of T1_CTL - T3_CTL DMR Tile to nearest TSS",xlab="distance in bp",breaks=50,col="purple2")
——-Effect of Sulfoxaflor (SFLX) on the methylation patterns at timepoints T2 and T3——————
—–Next lets see the T2 CTL vs T2 50ppb SFLX difference:
meelih_all <- as.list(list.files(path=".",pattern = "M*.tsv",full.names = FALSE))
meelih_all[c(16:18,22:24)]
## [[1]]
## [1] "M5_M5B_b4_T2-CTL_modified_bases.5mC.bed.freq-per-CG-combStrand.tsv"
##
## [[2]]
## [1] "M5_M5B_b5_T2-CTL_modified_bases.5mC.bed.freq-per-CG-combStrand.tsv"
##
## [[3]]
## [1] "M5_M5B_b6_T2-CTL_modified_bases.5mC.bed.freq-per-CG-combStrand.tsv"
##
## [[4]]
## [1] "M6_b1_T2-50ppb_modified_bases.5mC.bed.freq-per-CG-combStrand.tsv"
##
## [[5]]
## [1] "M6_b2_T2-50ppb_modified_bases.5mC.bed.freq-per-CG-combStrand.tsv"
##
## [[6]]
## [1] "M6_b3_T2-50ppb_modified_bases.5mC.bed.freq-per-CG-combStrand.tsv"
Now lets calculate the T2_CTL vs T2_50ppb DMNs and DMRs
obj4 <- methRead(meelih_all[c(16:18,22:24)],
sample.id=list("T2_CTL_m5b4","T2_CTL_m5b5","T2_CTL_m5b6","T2_50ppb_m6b1","T2_50ppb_m6b2","T2_50ppb_m6b3"),
treatment = c(0,0,0,1,1,1),
assembly="amel3.1",header=TRUE,
context="CpG",
resolution="base",
pipeline=list(fraction=FALSE,chr.col=1,start.col=2,end.col=3, coverage.col=4, strand.col=FALSE, freqC.col=5),
mincov = 1)
meth4 <- methylKit::unite(obj4, destrand=TRUE)
PCASamples(meth4)
DMN calculation
myDiff4=calculateDiffMeth(meth4)
myDiff4_5p=getMethylDiff(myDiff4,difference=5,qvalue=0.01)
myDiff4_5p.hypo=getMethylDiff(myDiff4,difference=5,qvalue=0.01,type="hypo")
myDiff4_5p.hyper=getMethylDiff(myDiff4,difference=5,qvalue=0.01,type="hyper")
Annotation of the DMNs
diffAnn4=annotateWithGeneParts(as(myDiff4_5p,"GRanges"),gene.obj)
genomation::plotTargetAnnotation(diffAnn4,precedence=TRUE, main="T2_CTL - T2_50ppb_SFLX DMN annotation")
tss.assoc.4=getAssociationWithTSS(diffAnn4)
hist(tss.assoc.4$dist.to.feature[abs(tss.assoc.4$dist.to.feature)<=100000],main="distance of T2_CTL - T2_50ppb_SFLX DMN to nearest TSS",xlab="distance in bp",breaks=50,col="gray4")
Next we do Tilling windows analysis, to detect DMRs:
pooled.meth4=pool(meth4,sample.ids=c("T2_CTL","T2_50ppb"))
tilesMeth4=tileMethylCounts(pooled.meth4,win.size=50,step.size=25)
diffTiles4=calculateDiffMeth(tilesMeth4)
myDiffTiles4_10p=getMethylDiff(diffTiles4,difference=10,qvalue=0.01)
We annotate the DMRs, plot their positions in the genome, and check if there is an overlap of DMRs and DMNs detected in analyses above to the set of DMRs found here.
diffAnnTiles4=annotateWithGeneParts(as(myDiffTiles4_10p,"GRanges"),gene.obj)
genomation::plotTargetAnnotation(diffAnnTiles4,precedence=TRUE, main="T2_CTL - T2_50ppb_SFLX DMR annotation")
We next plot the distance to the nearest TSS for T2_CTL vs T2_50ppb DMRs.
#plot distances to TSS
tssTiles4=getAssociationWithTSS(diffAnnTiles4)
hist(tssTiles4$dist.to.feature[abs(tssTiles4$dist.to.feature)<=100000], main="distance of T2_CTL vs T2_50ppb DMR Tile to nearest TSS",xlab="distance in bp",breaks=50,col="gray2")
—–We we now test the difference between T3 CTL - T3 50ppb SFLX, for all the samples that we have, including M1, M3:
meelih_all[c(1:7,12,19:21,25:27)]
## [[1]]
## [1] "M1_b1_T3-CTL_modified_bases.5mC.bed.freq-per-CG-combStrand.tsv"
##
## [[2]]
## [1] "M1_b2_T3-CTL_modified_bases.5mC.bed.freq-per-CG-combStrand.tsv"
##
## [[3]]
## [1] "M1_b3_T3-CTL_modified_bases.5mC.bed.freq-per-CG-combStrand.tsv"
##
## [[4]]
## [1] "M1_b4_T3-50ppb_modified_bases.5mC.bed.freq-per-CG-combStrand.tsv"
##
## [[5]]
## [1] "M1_b5_T3-50ppb_modified_bases.5mC.bed.freq-per-CG-combStrand.tsv"
##
## [[6]]
## [1] "M1_b6_T3-50ppb_modified_bases.5mC.bed.freq-per-CG-combStrand.tsv"
##
## [[7]]
## [1] "M3_b1_T3-CTL_modified_bases.5mC.bed.freq-per-CG-combStrand.tsv"
##
## [[8]]
## [1] "M3_b6_T3-50ppb_modified_bases.5mC.bed.freq-per-CG-combStrand.tsv"
##
## [[9]]
## [1] "M5_M5B_b7_T3-CTL_modified_bases.5mC.bed.freq-per-CG-combStrand.tsv"
##
## [[10]]
## [1] "M5_M5B_b8_T3-CTL_modified_bases.5mC.bed.freq-per-CG-combStrand.tsv"
##
## [[11]]
## [1] "M5_M5B_b9_T3-CTL_modified_bases.5mC.bed.freq-per-CG-combStrand.tsv"
##
## [[12]]
## [1] "M6_b4_T3-50ppb_modified_bases.5mC.bed.freq-per-CG-combStrand.tsv"
##
## [[13]]
## [1] "M6_b5_T3-50ppb_modified_bases.5mC.bed.freq-per-CG-combStrand.tsv"
##
## [[14]]
## [1] "M6_b6_T3-50ppb_modified_bases.5mC.bed.freq-per-CG-combStrand.tsv"
Now lets calculate the T3_CTL vs T3_50ppb DMNs and DMRs
obj5 <- methRead(meelih_all[c(1:7,12,19:21,25:27)],
sample.id=list("T3_CTL_m1b1","T3_CTL_m1b2","T3_CTL_m1b3","T3_50ppb_m1b4","T3_50ppb_m1b5","T3_50ppb_m1b6","T3_CTL_m3b1","T3_50ppb_m3b6","T3_CTL_m5b7","T3_CTL_m5b8","T3_CTL_m5b9","T3_50ppb_m6b4","T3_50ppb_m6b5","T3_50ppb_m6b6"),
treatment = c(0,0,0,1,1,1,0,1,0,0,0,1,1,1),
assembly="amel3.1",header=TRUE,
context="CpG",
resolution="base",
pipeline=list(fraction=FALSE,chr.col=1,start.col=2,end.col=3, coverage.col=4, strand.col=FALSE, freqC.col=5),
mincov = 1)
meth5 <- methylKit::unite(obj5, destrand=TRUE)
PCASamples(meth5)
DMN calculation
myDiff5=calculateDiffMeth(meth5)
myDiff5_5p=getMethylDiff(myDiff5,difference=5,qvalue=0.01)
myDiff5_5p.hypo=getMethylDiff(myDiff5,difference=5,qvalue=0.01,type="hypo")
myDiff5_5p.hyper=getMethylDiff(myDiff5,difference=5,qvalue=0.01,type="hyper")
Annotation of the DMNs
diffAnn5=annotateWithGeneParts(as(myDiff5_5p,"GRanges"),gene.obj)
genomation::plotTargetAnnotation(diffAnn5,precedence=TRUE, main="full set T3_CTL - T3_50ppb DMN annotation")
tss.assoc.5=getAssociationWithTSS(diffAnn5)
hist(tss.assoc.5$dist.to.feature[abs(tss.assoc.5$dist.to.feature)<=100000],main="distance of full set T3_CTL - T3_50ppb DMN to nearest TSS",xlab="distance in bp",breaks=50,col="pink")
Next we do Tilling windows analysis, to detect DMRs:
pooled.meth5=pool(meth5,sample.ids=c("T3_CTL","T3_50ppb"))
tilesMeth5=tileMethylCounts(pooled.meth5,win.size=50,step.size=25)
diffTiles5=calculateDiffMeth(tilesMeth5)
myDiffTiles5_10p=getMethylDiff(diffTiles5,difference=10,qvalue=0.01)
We annotate the DMRs, plot their positions in the genome, and check if there is an overlap of DMRs and DMNs detected in analyses above to the set of DMRs found here.
diffAnnTiles5=annotateWithGeneParts(as(myDiffTiles5_10p,"GRanges"),gene.obj)
genomation::plotTargetAnnotation(diffAnnTiles5,precedence=TRUE, main="full set T3_CTL - T3_50ppb DMR annotation")
We next plot the distance to the nearest TSS for T3_CTL vs T3_50ppb DMRs.
#plot distances to TSS
tssTiles5=getAssociationWithTSS(diffAnnTiles5)
hist(tssTiles5$dist.to.feature[abs(tssTiles5$dist.to.feature)<=100000], main="distance of full set T3_CTL - T3_50ppb DMR Tile to nearest TSS",xlab="distance in bp",breaks=50,col="pink3")
—–We also test just the difference between T3 CTL - T3 50ppb SFLX, for just M5 and M6:
meelih_all[c(19:21,25:27)]
## [[1]]
## [1] "M5_M5B_b7_T3-CTL_modified_bases.5mC.bed.freq-per-CG-combStrand.tsv"
##
## [[2]]
## [1] "M5_M5B_b8_T3-CTL_modified_bases.5mC.bed.freq-per-CG-combStrand.tsv"
##
## [[3]]
## [1] "M5_M5B_b9_T3-CTL_modified_bases.5mC.bed.freq-per-CG-combStrand.tsv"
##
## [[4]]
## [1] "M6_b4_T3-50ppb_modified_bases.5mC.bed.freq-per-CG-combStrand.tsv"
##
## [[5]]
## [1] "M6_b5_T3-50ppb_modified_bases.5mC.bed.freq-per-CG-combStrand.tsv"
##
## [[6]]
## [1] "M6_b6_T3-50ppb_modified_bases.5mC.bed.freq-per-CG-combStrand.tsv"
Now lets calculate the T3_CTL vs T3_50ppb DMNs and DMRs
obj6 <- methRead(meelih_all[c(19:21,25:27)],
sample.id=list("T3_CTL_m5b7","T3_CTL_m5b8","T3_CTL_m5b9","T3_50ppb_m6b4","T3_50ppb_m6b5","T3_50ppb_m6b6"),
treatment = c(0,0,0,1,1,1),
assembly="amel3.1",header=TRUE,
context="CpG",
resolution="base",
pipeline=list(fraction=FALSE,chr.col=1,start.col=2,end.col=3, coverage.col=4, strand.col=FALSE, freqC.col=5),
mincov = 1)
meth6 <- methylKit::unite(obj6, destrand=TRUE)
PCASamples(meth6)
DMN calculation
myDiff6=calculateDiffMeth(meth6)
myDiff6_5p=getMethylDiff(myDiff6,difference=5,qvalue=0.01)
myDiff6_5p.hypo=getMethylDiff(myDiff6,difference=5,qvalue=0.01,type="hypo")
myDiff6_5p.hyper=getMethylDiff(myDiff6,difference=5,qvalue=0.01,type="hyper")
Annotation of the DMNs #ommitted due to error, only one DMN without corresponding annotation in the myDiff6_5p diffAnn6=annotateWithGeneParts(as(myDiff6_5p,“GRanges”),gene.obj) genomation::plotTargetAnnotation(diffAnn6,precedence=TRUE, main=“T3_CTL - T3_50ppb DMN annotation”)
tss.assoc.6=getAssociationWithTSS(diffAnn6) hist(tss.assoc.6\(dist.to.feature[abs(tss.assoc.6\)dist.to.feature)<=100000],main=“distance of T3_CTL - T3_50ppb DMN to nearest TSS”,xlab=“distance in bp”,breaks=50,col=“gray”)
Next we do Tilling windows analysis, to detect DMRs:
pooled.meth6=pool(meth6,sample.ids=c("T3_CTL","T3_50ppb"))
tilesMeth6=tileMethylCounts(pooled.meth6,win.size=50,step.size=25)
diffTiles6=calculateDiffMeth(tilesMeth6)
myDiffTiles6_10p=getMethylDiff(diffTiles6,difference=10,qvalue=0.01)
We annotate the DMRs, plot their positions in the genome, and check if there is an overlap of DMRs and DMNs detected in analyses above to the set of DMRs found here.
diffAnnTiles6=annotateWithGeneParts(as(myDiffTiles6_10p,"GRanges"),gene.obj)
genomation::plotTargetAnnotation(diffAnnTiles6,precedence=TRUE, main="T3_CTL - T3_50ppb DMR annotation")
We next plot the distance to the nearest TSS for T3_CTL vs T3_50ppb DMRs.
#plot distances to TSS
tssTiles6=getAssociationWithTSS(diffAnnTiles6)
hist(tssTiles6$dist.to.feature[abs(tssTiles6$dist.to.feature)<=100000], main="distance of T3_CTL - T3_50ppb DMR Tile to nearest TSS",xlab="distance in bp",breaks=50,col="gray")
VVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVV
Lets see how the coverages for each sample look like for all samples (T1, T2, T3)
coverages_df <- data.frame(matrix(ncol = 3, nrow = 0))
colnames(coverages_df) <-c("sample_ID", "coverage","methylated")
#combine all coverage data in this flat dataframe
for (i in 1:6) {
b<-getData(obj2[[i]])[,5:6]
t<-NROW(b)
a<-rep(obj2[[i]]@sample.id,times=t)
a<-cbind(a,b)
coverages_df<-rbind(coverages_df,a)
}
#include the T1 coverages
for (i in c(1:3)) {
b<-getData(obj1[[i]])[,5:6]
t<-NROW(b)
a<-rep(obj1[[i]]@sample.id,times=t)
a<-cbind(a,b)
coverages_df<-rbind(coverages_df,a)
}
#include the T2_50ppb coverages
for (i in c(4:6)) {
b<-getData(obj4[[i]])[,5:6]
t<-NROW(b)
a<-rep(obj4[[i]]@sample.id,times=t)
a<-cbind(a,b)
coverages_df<-rbind(coverages_df,a)
}
#include the T3_50ppb coverages
for (i in c(1:8,12:14)) {
b<-getData(obj5[[i]])[,5:6]
t<-NROW(b)
a<-rep(obj5[[i]]@sample.id,times=t)
a<-cbind(a,b)
coverages_df<-rbind(coverages_df,a)
}
str(coverages_df)
## 'data.frame': 191842863 obs. of 3 variables:
## $ a : chr "T2_CTL_m5b4" "T2_CTL_m5b4" "T2_CTL_m5b4" "T2_CTL_m5b4" ...
## $ coverage: int 1 1 1 1 1 1 1 1 1 1 ...
## $ numCs : int 0 0 0 0 0 0 0 0 0 0 ...
#calculating coverages and methylation levels
colnames(coverages_df) <-c("sample_ID", "coverage","methylated")
coverages_df$sample_ID<-as.factor(coverages_df$sample_ID)
coverages_df$coverage<-as.numeric(coverages_df$coverage)
coverages_df$methylated<-as.numeric(coverages_df$methylated)
sapply(coverages_df, levels)
## $sample_ID
## [1] "T1_CTL_m5b1" "T1_CTL_m5b2" "T1_CTL_m5b3" "T2_50ppb_m6b1"
## [5] "T2_50ppb_m6b2" "T2_50ppb_m6b3" "T2_CTL_m5b4" "T2_CTL_m5b5"
## [9] "T2_CTL_m5b6" "T3_50ppb_m1b4" "T3_50ppb_m1b5" "T3_50ppb_m1b6"
## [13] "T3_50ppb_m3b6" "T3_50ppb_m6b4" "T3_50ppb_m6b5" "T3_50ppb_m6b6"
## [17] "T3_CTL_m1b1" "T3_CTL_m1b2" "T3_CTL_m1b3" "T3_CTL_m3b1"
## [21] "T3_CTL_m5b7" "T3_CTL_m5b8" "T3_CTL_m5b9"
##
## $coverage
## NULL
##
## $methylated
## NULL
library(dplyr)
coverages_df <- coverages_df %>%
mutate(group = case_when(
startsWith(as.character(sample_ID), "T1_CTL_m5") ~ "T1_CTL_m5",
startsWith(as.character(sample_ID), "T2_CTL_m5") ~ "T2_CTL_m5",
startsWith(as.character(sample_ID), "T3_CTL_m5") ~ "T3_CTL_m5",
startsWith(as.character(sample_ID), "T2_50ppb_m6") ~ "T2_50ppb_m6",
startsWith(as.character(sample_ID), "T3_50ppb_m6") ~ "T3_50ppb_m6",
startsWith(as.character(sample_ID), "T3_CTL_m1") ~ "T3_CTL_m1",
startsWith(as.character(sample_ID), "T3_CTL_m3") ~ "T3_CTL_m3",
startsWith(as.character(sample_ID), "T3_50ppb_m1") ~ "T3_50ppb_m1",
startsWith(as.character(sample_ID), "T3_50ppb_m3") ~ "T3_50ppb_m3"
))
coverages_df$group <-as.factor(coverages_df$group)
sapply(coverages_df, levels)
## $sample_ID
## [1] "T1_CTL_m5b1" "T1_CTL_m5b2" "T1_CTL_m5b3" "T2_50ppb_m6b1"
## [5] "T2_50ppb_m6b2" "T2_50ppb_m6b3" "T2_CTL_m5b4" "T2_CTL_m5b5"
## [9] "T2_CTL_m5b6" "T3_50ppb_m1b4" "T3_50ppb_m1b5" "T3_50ppb_m1b6"
## [13] "T3_50ppb_m3b6" "T3_50ppb_m6b4" "T3_50ppb_m6b5" "T3_50ppb_m6b6"
## [17] "T3_CTL_m1b1" "T3_CTL_m1b2" "T3_CTL_m1b3" "T3_CTL_m3b1"
## [21] "T3_CTL_m5b7" "T3_CTL_m5b8" "T3_CTL_m5b9"
##
## $coverage
## NULL
##
## $methylated
## NULL
##
## $group
## [1] "T1_CTL_m5" "T2_50ppb_m6" "T2_CTL_m5" "T3_50ppb_m1" "T3_50ppb_m3"
## [6] "T3_50ppb_m6" "T3_CTL_m1" "T3_CTL_m3" "T3_CTL_m5"
str(coverages_df)
## 'data.frame': 191842863 obs. of 4 variables:
## $ sample_ID : Factor w/ 23 levels "T1_CTL_m5b1",..: 7 7 7 7 7 7 7 7 7 7 ...
## $ coverage : num 1 1 1 1 1 1 1 1 1 1 ...
## $ methylated: num 0 0 0 0 0 0 0 0 0 0 ...
## $ group : Factor w/ 9 levels "T1_CTL_m5","T2_50ppb_m6",..: 3 3 3 3 3 3 3 3 3 3 ...
Lets calculate the overall level of methylation in each sample and then plot these levels for each group and test using ANOVA if there is significant difference in methylation levels between the groups.
frac<-function(x,y){
z<-x/y
return(z)
}
a<-mapply(frac,coverages_df$methylated,coverages_df$coverage)
coverages_df<-cbind(coverages_df, fraction=a)
coverages_df<-coverages_df[,1:5]
Calculating the overall methylation level for each sample.
coverages_sum<-tapply(coverages_df$coverage,as.factor(coverages_df$sample_ID),sum)
methylated_sum<-tapply(coverages_df$methylated,as.factor(coverages_df$sample_ID),sum)
bigFractions<-mapply(frac,methylated_sum,coverages_sum)
bigFractions<-as.data.frame(bigFractions)
bigFractions
bigFractions$row_names<-row.names(bigFractions)
bigFractions <- bigFractions %>%
mutate(group = case_when(
startsWith(as.character(row_names), "T1_CTL_m5") ~ "T1_CTL_m5",
startsWith(as.character(row_names), "T2_CTL_m5") ~ "T2_CTL_m5",
startsWith(as.character(row_names), "T3_CTL_m5") ~ "T3_CTL_m5",
startsWith(as.character(row_names), "T2_50ppb_m6") ~ "T2_50ppb_m6",
startsWith(as.character(row_names), "T3_50ppb_m6") ~ "T3_50ppb_m6",
startsWith(as.character(row_names), "T3_CTL_m1") ~ "T3_CTL_m1",
startsWith(as.character(row_names), "T3_CTL_m3") ~ "T3_CTL_m3",
startsWith(as.character(row_names), "T3_50ppb_m1") ~ "T3_50ppb_m1",
startsWith(as.character(row_names), "T3_50ppb_m3") ~ "T3_50ppb_m3"
))
bigFractions$group <-as.factor(bigFractions$group)
sapply(bigFractions, levels)
## $bigFractions
## NULL
##
## $row_names
## NULL
##
## $group
## [1] "T1_CTL_m5" "T2_50ppb_m6" "T2_CTL_m5" "T3_50ppb_m1" "T3_50ppb_m3"
## [6] "T3_50ppb_m6" "T3_CTL_m1" "T3_CTL_m3" "T3_CTL_m5"
Plotting the average methylation levels of all samples.
library(ggplot2)
library(ggstatsplot)
library(tidyverse)
gghistostats(
data = bigFractions,
x = bigFractions,
y = bigFractions,
title="Average Methylation level of all samples"
)
library(ggplot2)
library(ggstatsplot)
library(tidyverse)
#library(rsantools)
ggbetweenstats(
data = bigFractions,
x = group,
y = bigFractions,
title="Average Methylation level of all samples, grouped by sample type (timepoint-treatment-experiment)",
centrality.point.args = list(size = 3, color = "darkred"),
)
Now lets plot the coverage levels for each sample and group.
# Horizontal half violin plot
library(ggplot2)
coverages_df<-coverages_df[,1:5] #because if we run too many times the previous code chunk we get duplicate columns
ggplot(coverages_df[,1:2], ) + geom_violin(aes(coverage, sample_ID, fill = sample_ID )) +coord_flip() + theme(legend.position = "none") + coord_trans( x='log10') + scale_x_continuous(breaks=c(0,1,2,5,10,20,100,1000,6000)) + scale_fill_manual(values=c("cyan","cyan","cyan","orange","orange","orange","cyan3","cyan3","cyan3","purple","purple","purple","purple3","red","red","red","blue","blue","blue","blue3","cyan4","cyan4","cyan4"))
library(ggplot2)
ggplot(coverages_df, aes(x = coverage, fill = group)) + xlim(NA,30) + geom_histogram() + facet_grid(group ~ .) + scale_fill_manual(values=c("cyan","orange","cyan3","purple","purple3","red","blue","blue3","cyan4"))
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
Next lets annotate what we find and see for overlaps between the DMRs of different comparisons. Reading the gff file so that we can give the proper gene IDs:
if (!require("ape")) install.packages("ape")
library(ape)
gff<-read.gff(file="GCF_003254395.2_Amel_HAv3.1_genomic.gff", GFF3 = TRUE)
getAttributeField <- function (x, field, attrsep = ";") {
s = strsplit(x, split = attrsep, fixed = TRUE)
sapply(s, function(atts) {
a = strsplit(atts, split = "=", fixed = TRUE)
m = match(field, sapply(a, "[", 1))
if (!is.na(m)) {
rv = a[[m]][2]
}
else {
rv = as.character(NA)
}
return(rv)
})
}
gff$name <- getAttributeField(gff$attributes, "Name")
gff$gene <- getAttributeField(gff$attributes, "gene")
Outputing the gene ID lists in csv files. First we make the final tables of DMRs with annotation. The annotation is done on chromosome regions only, so we have to append a table of NAs for the non annotated rows at the end of myDiffTilesx
TilesDataFinal_T1CTLvsT2CTL<-cbind(getData(myDiffTiles1_10p),rbind(diffAnnTiles1@dist.to.TSS[,2:4],c(NA,NA,NA)))
TilesDataFinal_T2CTLvsT3CTL<-cbind(getData(myDiffTiles2_10p),diffAnnTiles2@dist.to.TSS[,2:4])
TilesDataFinal_T1CTLvsT3CTL<-cbind(getData(myDiffTiles3_10p),rbind(diffAnnTiles3@dist.to.TSS[,2:4],data.frame(matrix(NA,nrow=6,ncol=3,dimnames=list(c(1:6),colnames(diffAnnTiles3@dist.to.TSS[,2:4]))))))
TilesDataFinal_T2CTLvsT2.50ppb<-cbind(getData(myDiffTiles4_10p),rbind(diffAnnTiles4@dist.to.TSS[,2:4],data.frame(matrix(NA,nrow=4,ncol=3,dimnames=list(c(1:4),colnames(diffAnnTiles4@dist.to.TSS[,2:4]))))))
TilesDataFinal_T3CTLvsT3.50ppb_ALL<-cbind(getData(myDiffTiles5_10p),rbind(diffAnnTiles5@dist.to.TSS[,2:4],data.frame(matrix(NA,nrow=5,ncol=3,dimnames=list(c(1:5),colnames(diffAnnTiles5@dist.to.TSS[,2:4]))))))
TilesDataFinal_T3CTLvsT3.50ppb_M5.6<-cbind(getData(myDiffTiles6_10p),diffAnnTiles6@dist.to.TSS[,2:4])
Lets add the annotation for promoter/exon/intron
TilesDataFinal_T1CTLvsT2CTL<-cbind(TilesDataFinal_T1CTLvsT2CTL,diffAnnTiles1@members)
TilesDataFinal_T2CTLvsT3CTL<-cbind(TilesDataFinal_T2CTLvsT3CTL,diffAnnTiles2@members)
TilesDataFinal_T1CTLvsT3CTL<-cbind(TilesDataFinal_T1CTLvsT3CTL,diffAnnTiles3@members)
TilesDataFinal_T2CTLvsT2.50ppb<-cbind(TilesDataFinal_T2CTLvsT2.50ppb,diffAnnTiles4@members)
TilesDataFinal_T3CTLvsT3.50ppb_ALL<-cbind(TilesDataFinal_T3CTLvsT3.50ppb_ALL,diffAnnTiles5@members)
TilesDataFinal_T3CTLvsT3.50ppb_M5.6<-cbind(TilesDataFinal_T3CTLvsT3.50ppb_M5.6,diffAnnTiles6@members)
Next we add the corresponding gene IDs and gene attributes from the gff annotation gene.obj object.
library(dplyr)
TilesDataFinal_T1CTLvsT2CTL<-TilesDataFinal_T1CTLvsT2CTL %>%
mutate(
gene_ID = gff$gene[pmatch(feature.name,gff$name, duplicates.ok = TRUE)],
gene_attributes = gff$attributes[pmatch(feature.name, gff$name, duplicates.ok = TRUE)]
)
TilesDataFinal_T2CTLvsT3CTL<-TilesDataFinal_T2CTLvsT3CTL %>%
mutate( gene_ID = gff$gene[pmatch(feature.name,gff$name, duplicates.ok = TRUE)] ,
gene_attributes = gff$attributes[pmatch(feature.name, gff$name, duplicates.ok = TRUE)]
)
TilesDataFinal_T1CTLvsT3CTL<-TilesDataFinal_T1CTLvsT3CTL %>%
mutate( gene_ID = gff$gene[pmatch(feature.name,gff$name, duplicates.ok = TRUE)] ,
gene_attributes = gff$attributes[pmatch(feature.name, gff$name, duplicates.ok = TRUE)]
)
TilesDataFinal_T2CTLvsT2.50ppb<-TilesDataFinal_T2CTLvsT2.50ppb %>%
mutate( gene_ID = gff$gene[pmatch(feature.name,gff$name, duplicates.ok = TRUE)] ,
gene_attributes = gff$attributes[pmatch(feature.name, gff$name, duplicates.ok = TRUE)]
)
TilesDataFinal_T3CTLvsT3.50ppb_ALL<-TilesDataFinal_T3CTLvsT3.50ppb_ALL %>%
mutate( gene_ID = gff$gene[pmatch(feature.name,gff$name, duplicates.ok = TRUE)] ,
gene_attributes = gff$attributes[pmatch(feature.name, gff$name, duplicates.ok = TRUE)]
)
TilesDataFinal_T3CTLvsT3.50ppb_M5.6<-TilesDataFinal_T3CTLvsT3.50ppb_M5.6 %>%
mutate( gene_ID = gff$gene[pmatch(feature.name,gff$name, duplicates.ok = TRUE)] ,
gene_attributes = gff$attributes[pmatch(feature.name, gff$name, duplicates.ok = TRUE)]
)
Check for intersections between the DMRs from different comparisons. First compare the DMR start:
intersect(TilesDataFinal_T1CTLvsT2CTL$start,TilesDataFinal_T1CTLvsT3CTL$start)
## [1] 11509926 11509951 303226 13474401 6525751 5169651 5169676 5843451
## [9] 5843476
intersect(TilesDataFinal_T1CTLvsT2CTL$start,TilesDataFinal_T2CTLvsT3CTL$start)
## integer(0)
intersect(TilesDataFinal_T1CTLvsT3CTL$start,TilesDataFinal_T2CTLvsT3CTL$start)
## [1] 7424176
intersect(TilesDataFinal_T1CTLvsT2CTL$start,TilesDataFinal_T2CTLvsT2.50ppb$start)
## integer(0)
intersect(TilesDataFinal_T1CTLvsT3CTL$start,TilesDataFinal_T2CTLvsT2.50ppb$start)
## integer(0)
intersect(TilesDataFinal_T1CTLvsT2CTL$start,TilesDataFinal_T3CTLvsT3.50ppb_ALL$start)
## integer(0)
intersect(TilesDataFinal_T1CTLvsT3CTL$start,TilesDataFinal_T3CTLvsT3.50ppb_ALL$start)
## integer(0)
intersect(TilesDataFinal_T1CTLvsT2CTL$start,TilesDataFinal_T3CTLvsT3.50ppb_M5.6$start)
## integer(0)
intersect(TilesDataFinal_T1CTLvsT3CTL$start,TilesDataFinal_T3CTLvsT3.50ppb_M5.6$start)
## [1] 9650601
intersect(TilesDataFinal_T2CTLvsT2.50ppb$start,TilesDataFinal_T3CTLvsT3.50ppb_M5.6$start)
## integer(0)
intersect(TilesDataFinal_T2CTLvsT2.50ppb$start,TilesDataFinal_T3CTLvsT3.50ppb_ALL$start)
## integer(0)
intersect(TilesDataFinal_T3CTLvsT3.50ppb_M5.6$start,TilesDataFinal_T3CTLvsT3.50ppb_ALL$start)
## integer(0)
Then compare the proximal (to the DMR) genes detected
intersect(TilesDataFinal_T1CTLvsT2CTL$gene_ID,TilesDataFinal_T1CTLvsT3CTL$gene_ID)
## [1] "LOC551612" "Mir6047a" "LOC411695" "LOC409303" "LOC102656106"
## [6] "LOC102654517" "LOC552850" "LOC551580"
intersect(TilesDataFinal_T1CTLvsT2CTL$gene_ID,TilesDataFinal_T2CTLvsT3CTL$gene_ID)
## character(0)
intersect(TilesDataFinal_T1CTLvsT3CTL$gene_ID,TilesDataFinal_T2CTLvsT3CTL$gene_ID)
## [1] "LOC113218900"
intersect(TilesDataFinal_T1CTLvsT2CTL$gene_ID,TilesDataFinal_T2CTLvsT2.50ppb$gene_ID)
## [1] NA "LOC551580"
intersect(TilesDataFinal_T1CTLvsT3CTL$gene_ID,TilesDataFinal_T2CTLvsT2.50ppb$gene_ID)
## [1] "LOC551580"
intersect(TilesDataFinal_T1CTLvsT2CTL$gene_ID,TilesDataFinal_T3CTLvsT3.50ppb_ALL$gene_ID)
## [1] "LOC551580"
intersect(TilesDataFinal_T1CTLvsT3CTL$gene_ID,TilesDataFinal_T3CTLvsT3.50ppb_ALL$gene_ID)
## [1] "LOC551580"
intersect(TilesDataFinal_T1CTLvsT2CTL$gene_ID,TilesDataFinal_T3CTLvsT3.50ppb_M5.6$gene_ID)
## [1] "LOC102656106"
intersect(TilesDataFinal_T1CTLvsT3CTL$gene_ID,TilesDataFinal_T3CTLvsT3.50ppb_M5.6$gene_ID)
## [1] "LOC102656106"
intersect(TilesDataFinal_T2CTLvsT2.50ppb$gene_ID,TilesDataFinal_T3CTLvsT3.50ppb_M5.6$gene_ID)
## character(0)
intersect(TilesDataFinal_T2CTLvsT2.50ppb$gene_ID,TilesDataFinal_T3CTLvsT3.50ppb_ALL$gene_ID)
## [1] "LOC100577216" "LOC551580"
intersect(TilesDataFinal_T3CTLvsT3.50ppb_M5.6$gene_ID,TilesDataFinal_T3CTLvsT3.50ppb_ALL$gene_ID)
## character(0)
We can see that only the comparison between T1->T2 CTLs and T1->T3 CTLs yields a good number of common DMRs. LOC551580 is a spurious annotation, since it comes when the feature_name is NA. We can ignore this as it refers to DMRs on contigs like QIUM02000109 that are not on chromosomes. LOC102656106 is very interesting, as it appears on both T1->T2 CTLs and T1->T3 CTLs as well as T3CTL-T3SFLX. This is a gene that changes methylation during the training phase but also responds to SFLX 50ppb treatment. According to NCBI it is an ncRNA. However, a quick check in the tables and we can see that the DMR detected is distal to the gene, and, interestingly, it is methylated during normal training but demethylated under SFLX treatment:
TilesDataFinal_T1CTLvsT2CTL[pmatch(intersect(TilesDataFinal_T1CTLvsT2CTL$gene_ID,TilesDataFinal_T3CTLvsT3.50ppb_M5.6$gene_ID),TilesDataFinal_T1CTLvsT2CTL$gene_ID),]
TilesDataFinal_T3CTLvsT3.50ppb_M5.6[pmatch(intersect(TilesDataFinal_T1CTLvsT2CTL$gene_ID,TilesDataFinal_T3CTLvsT3.50ppb_M5.6$gene_ID),TilesDataFinal_T3CTLvsT3.50ppb_M5.6$gene_ID),]
Next lets make a final table with all the DMRs present in multiple comparisons
Intersection_of_DMRs<-TilesDataFinal_T1CTLvsT2CTL[pmatch(intersect(TilesDataFinal_T1CTLvsT2CTL$start,TilesDataFinal_T1CTLvsT3CTL$start),TilesDataFinal_T1CTLvsT2CTL$start),]
Intersection_of_DMRs<-Intersection_of_DMRs %>%
mutate( from_comparison="T1CTLvsT2CTL")
tmp<-TilesDataFinal_T1CTLvsT3CTL[pmatch(intersect(TilesDataFinal_T1CTLvsT2CTL$start,TilesDataFinal_T1CTLvsT3CTL$start),TilesDataFinal_T1CTLvsT3CTL$start),]
tmp<-tmp %>%
mutate( from_comparison="T1CTLvsT3CTL")
Intersection_of_DMRs<-rbind(Intersection_of_DMRs,tmp)
tmp<-TilesDataFinal_T1CTLvsT3CTL[pmatch(intersect(TilesDataFinal_T1CTLvsT3CTL$start,TilesDataFinal_T2CTLvsT3CTL$start),TilesDataFinal_T1CTLvsT3CTL$start),]
tmp<-tmp %>%
mutate( from_comparison="T1CTLvsT3CTL")
Intersection_of_DMRs<-rbind(Intersection_of_DMRs,tmp)
tmp<-TilesDataFinal_T2CTLvsT3CTL[pmatch(intersect(TilesDataFinal_T1CTLvsT3CTL$start,TilesDataFinal_T2CTLvsT3CTL$start),TilesDataFinal_T2CTLvsT3CTL$start),]
tmp<-tmp %>%
mutate( from_comparison="T2CTLvsT3CTL")
Intersection_of_DMRs<-rbind(Intersection_of_DMRs,tmp)
tmp<-TilesDataFinal_T1CTLvsT2CTL[pmatch(intersect(TilesDataFinal_T1CTLvsT2CTL$gene_ID,TilesDataFinal_T3CTLvsT3.50ppb_M5.6$gene_ID),TilesDataFinal_T1CTLvsT2CTL$gene_ID),]
tmp<-tmp %>%
mutate( from_comparison="T1CTLvsT2CTL")
Intersection_of_DMRs<-rbind(Intersection_of_DMRs,tmp)
tmp<-TilesDataFinal_T3CTLvsT3.50ppb_M5.6[pmatch(intersect(TilesDataFinal_T1CTLvsT2CTL$gene_ID,TilesDataFinal_T3CTLvsT3.50ppb_M5.6$gene_ID),TilesDataFinal_T3CTLvsT3.50ppb_M5.6$gene_ID),]
tmp<-tmp %>%
mutate( from_comparison="T3CTLvsT3.50ppb_M5.6")
Intersection_of_DMRs<-rbind(Intersection_of_DMRs,tmp)
tmp<-TilesDataFinal_T1CTLvsT3CTL[pmatch(intersect(TilesDataFinal_T1CTLvsT3CTL$gene_ID,TilesDataFinal_T3CTLvsT3.50ppb_M5.6$gene_ID),TilesDataFinal_T1CTLvsT3CTL$gene_ID),]
tmp<-tmp %>%
mutate( from_comparison="T1CTLvsT3CTL")
Intersection_of_DMRs<-rbind(Intersection_of_DMRs,tmp)
tmp<-TilesDataFinal_T2CTLvsT2.50ppb[pmatch(intersect(TilesDataFinal_T2CTLvsT2.50ppb$gene_ID,TilesDataFinal_T3CTLvsT3.50ppb_ALL$gene_ID),TilesDataFinal_T2CTLvsT2.50ppb$gene_ID),]
tmp<-tmp %>%
mutate( from_comparison="T2CTLvsT2.50ppb")
Intersection_of_DMRs<-rbind(Intersection_of_DMRs,tmp)
tmp<-TilesDataFinal_T3CTLvsT3.50ppb_ALL[pmatch(intersect(TilesDataFinal_T2CTLvsT2.50ppb$gene_ID,TilesDataFinal_T3CTLvsT3.50ppb_ALL$gene_ID),TilesDataFinal_T3CTLvsT3.50ppb_ALL$gene_ID),]
tmp<-tmp %>%
mutate( from_comparison="T3CTLvsT3.50ppb_ALL")
Intersection_of_DMRs<-rbind(Intersection_of_DMRs,tmp)
Intersection_of_DMRs<-arrange(Intersection_of_DMRs,start,from_comparison)
Intersection_of_DMRs<-Intersection_of_DMRs %>% relocate(from_comparison)
We write everything in .csv files.
write.table(Intersection_of_DMRs, file="T-all_Intersection_of_DMRs_from_all_comparisons.DMRs_present_in_at_least_two_comparisons.csv", row.names=FALSE, col.names=TRUE, quote=FALSE, sep=", ", na ="NA")
write.table(TilesDataFinal_T1CTLvsT2CTL, file="T1-CTL_vs_T2-CTL_M5_M6_DMRs.csv", row.names=FALSE, col.names=TRUE, quote=FALSE, sep=", ", na ="NA")
write.table(TilesDataFinal_T1CTLvsT3CTL, file="T1-CTL_vs_T3-CTL_M5_M6_DMRs.csv", row.names=FALSE, col.names=TRUE, quote=FALSE, sep=", ", na ="NA")
write.table(TilesDataFinal_T2CTLvsT3CTL, file="T2-CTL_vs_T3-CTL_M5_M6_DMRs.csv", row.names=FALSE, col.names=TRUE, quote=FALSE, sep=", ", na ="NA")
write.table(TilesDataFinal_T2CTLvsT2.50ppb, file="T2-CTL_vs_T2-50ppb-SFLX_M5_M6_DMRs.csv", row.names=FALSE, col.names=TRUE, quote=FALSE, sep=", ", na ="NA")
write.table(TilesDataFinal_T3CTLvsT3.50ppb_ALL, file="T3-CTL_vs_T3-50ppb-SFLX_All_M1_M3_M5_M6_DMRs.csv", row.names=FALSE, col.names=TRUE, quote=FALSE, sep=", ", na ="NA")
write.table(TilesDataFinal_T3CTLvsT3.50ppb_M5.6, file="T3-CTL_vs_T3-50ppb-SFLX_M5_M6_DMRs.csv", row.names=FALSE, col.names=TRUE, quote=FALSE, sep=", ", na ="NA")
All data can be found in the files above, as well as an Excel spreadsheet I prepared from them. The lists of gene_IDs can be taken to HymenopteraMine for further analysis.