My DMPs vs Reference DMPs

library(GenomicRanges)

## Loading required package: stats4

## Loading required package: BiocGenerics

## 
## Attaching package: 'BiocGenerics'

## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs

## The following objects are masked from 'package:base':
## 
##     anyDuplicated, aperm, append, as.data.frame, basename, cbind,
##     colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
##     get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, saveRDS, setdiff,
##     table, tapply, union, unique, unsplit, which.max, which.min

## Loading required package: S4Vectors

## 
## Attaching package: 'S4Vectors'

## The following object is masked from 'package:utils':
## 
##     findMatches

## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname

## Loading required package: IRanges

## Loading required package: GenomeInfoDb

library(rtracklayer)
library(ggplot2)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:GenomicRanges':
## 
##     intersect, setdiff, union

## The following object is masked from 'package:GenomeInfoDb':
## 
##     intersect

## The following objects are masked from 'package:IRanges':
## 
##     collapse, desc, intersect, setdiff, slice, union

## The following objects are masked from 'package:S4Vectors':
## 
##     first, intersect, rename, setdiff, setequal, union

## The following objects are masked from 'package:BiocGenerics':
## 
##     combine, intersect, setdiff, union

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(gridExtra)

## 
## Attaching package: 'gridExtra'

## The following object is masked from 'package:dplyr':
## 
##     combine

## The following object is masked from 'package:BiocGenerics':
## 
##     combine

setwd("/Users/joelbrunomendes/Documents/Methylation_data/run0/Control_DMP")

# Function to read BED.gz file
read_bed_gz <- function(bed_file) {
  # Read the BED file (assuming standard BED format: chr, start, end, ...)
  bed_data <- read.table(bed_file, header = FALSE, sep = "\t", stringsAsFactors = FALSE)
  
  # Create GRanges object (assuming first 3 columns are chr, start, end)
  gr <- GRanges(
    seqnames = bed_data[,1],
    ranges = IRanges(start = bed_data[,2] + 1, end = bed_data[,3]),  # Convert to 1-based
    strand = "*"
  )
  
  # Add additional columns if they exist
  if(ncol(bed_data) > 3) {
    mcols(gr) <- bed_data[,4:ncol(bed_data)]
  }
  
  return(gr)
}

# Function to analyze DMR distances
analyze_dmr_distances <- function(dmrs, genes, dataset_name) {
  cat(sprintf("Analyzing %s dataset...\n", dataset_name))
  
  # Use distanceToNearest which returns a Hits object
  nearest_results <- distanceToNearest(dmrs, genes, ignore.strand = TRUE)
  
  # Extract the components
  query_hits <- queryHits(nearest_results)
  subject_hits <- subjectHits(nearest_results)
  distances <- mcols(nearest_results)$distance
  
  cat(sprintf("%s: DMRs with nearest genes found: %d out of %d\n", 
      dataset_name, length(query_hits), length(dmrs)))
  
  # Check for missing matches
  missing_dmrs <- setdiff(1:length(dmrs), query_hits)
  if(length(missing_dmrs) > 0) {
    cat(sprintf("Warning: %d DMRs in %s had no nearest gene match\n", 
        length(missing_dmrs), dataset_name))
  }
  
  # Create results data frame
  results <- data.frame(
    dataset = dataset_name,
    dmr_index = query_hits,
    nearest_gene_index = subject_hits,
    distance = distances,
    gene_id = mcols(genes)$gene_id[subject_hits],
    gene_name = ifelse(is.null(mcols(genes)$gene_name), 
                       mcols(genes)$gene_id[subject_hits],
                       mcols(genes)$gene_name[subject_hits])
  )
  
  return(list(results = results, distances = distances))
}

# Function to create distance categories
create_distance_categories <- function(distances, dataset_name) {
  data.frame(
    Dataset = dataset_name,
    Category = c("Overlapping (0 bp)", "≤1 kb", "≤5 kb", "≤10 kb", "≤50 kb", ">50 kb"),
    Count = c(
      sum(distances == 0),
      sum(distances <= 1000),
      sum(distances <= 5000),
      sum(distances <= 10000),
      sum(distances <= 50000),
      sum(distances > 50000)
    ),
    Total = length(distances)
  )
}

# Read your files
cat("Reading first DMR BED file...\n")

## Reading first DMR BED file...

dmrs1 <- read_bed_gz("/Users/joelbrunomendes/Documents/Methylation_data/run0/Control_DMP/GSE186458_blocks.s205.bed.gz")  # Replace with your first DMR file path
dataset1_name <- "Reference"  # Change this name as needed
seqlevels(dmrs1) <- sub("^chr", "", seqlevels(dmrs1))

cat("Reading second DMR BED file...\n")

## Reading second DMR BED file...

dmrs2 <- read_bed_gz("/Users/joelbrunomendes/Documents/Methylation_data/run0/Control_DMP/dmp_gr_for_ucsc1.bed")  # Replace with your second DMR file path
dataset2_name <- "My DMPs"  # Change this name as needed
seqlevels(dmrs2) <- sub("^chr", "", seqlevels(dmrs2))

cat("Reading GTF file...\n")

## Reading GTF file...

gtf <- import("/Users/joelbrunomendes/Documents/Methylation_data/run0/Control_DMP/GRCh38.gtf")  # Replace with your actual file path

# Filter GTF for genes only (not transcripts/exons)
genes <- gtf[gtf$type == "gene"]

# If no 'gene' entries, try 'transcript' and collapse to gene level
if(length(genes) == 0) {
  transcripts <- gtf[gtf$type == "transcript"]
  # Group by gene_id to get gene boundaries
  genes_df <- as.data.frame(transcripts) %>%
    group_by(gene_id) %>%
    summarise(
      seqnames = first(seqnames),
      start = min(start),
      end = max(end),
      strand = first(strand),
      gene_name = first(gene_name),
      .groups = 'drop'
    )
  
  genes <- GRanges(
    seqnames = genes_df$seqnames,
    ranges = IRanges(start = genes_df$start, end = genes_df$end),
    strand = genes_df$strand,
    gene_id = genes_df$gene_id,
    gene_name = genes_df$gene_name
  )
}

cat(sprintf("Found %d DMRs in dataset 1, %d DMRs in dataset 2, and %d genes\n", 
    length(dmrs1), length(dmrs2), length(genes)))

## Found 7104162 DMRs in dataset 1, 8042 DMRs in dataset 2, and 78686 genes

# Analyze both datasets
analysis1 <- analyze_dmr_distances(dmrs1, genes, dataset1_name)

## Analyzing Reference dataset...

## Warning in .merge_two_Seqinfo_objects(x, y): Each of the 2 combined objects has sequence levels not in the other:
##   - in 'x': M
##   - in 'y': MT
##   Make sure to always combine/compare objects based on the same reference
##   genome (use suppressWarnings() to suppress this warning).

## Reference: DMRs with nearest genes found: 7104055 out of 7104162
## Warning: 107 DMRs in Reference had no nearest gene match

analysis2 <- analyze_dmr_distances(dmrs2, genes, dataset2_name)

## Analyzing My DMPs dataset...
## My DMPs: DMRs with nearest genes found: 8042 out of 8042

distances1 <- analysis1$distances
distances2 <- analysis2$distances

# Combine results
combined_results <- rbind(analysis1$results, analysis2$results)

# Summary statistics comparison
cat("\n=== COMPARATIVE SUMMARY STATISTICS ===\n")

## 
## === COMPARATIVE SUMMARY STATISTICS ===

cat(sprintf("Dataset 1 (%s):\n", dataset1_name))

## Dataset 1 (Reference):

cat(sprintf("  Count: %d DMRs\n", length(distances1)))

##   Count: 7104055 DMRs

cat(sprintf("  Median: %.0f bp\n", median(distances1)))

##   Median: 0 bp

cat(sprintf("  Mean: %.0f bp\n", mean(distances1)))

##   Mean: 18539 bp

cat(sprintf("  Q1: %.0f bp, Q3: %.0f bp\n", quantile(distances1, 0.25), quantile(distances1, 0.75)))

##   Q1: 0 bp, Q3: 2001 bp

cat(sprintf("\nDataset 2 (%s):\n", dataset2_name))

## 
## Dataset 2 (My DMPs):

cat(sprintf("  Count: %d DMRs\n", length(distances2)))

##   Count: 8042 DMRs

cat(sprintf("  Median: %.0f bp\n", median(distances2)))

##   Median: 0 bp

cat(sprintf("  Mean: %.0f bp\n", mean(distances2)))

##   Mean: 63 bp

cat(sprintf("  Q1: %.0f bp, Q3: %.0f bp\n", quantile(distances2, 0.25), quantile(distances2, 0.75)))

##   Q1: 0 bp, Q3: 0 bp

# Statistical test
wilcox_test <- wilcox.test(distances1, distances2)
cat(sprintf("\nWilcoxon rank-sum test p-value: %.2e\n", wilcox_test$p.value))

## 
## Wilcoxon rank-sum test p-value: 1.62e-202

# Distance categories comparison
cat_data1 <- create_distance_categories(distances1, dataset1_name)
cat_data2 <- create_distance_categories(distances2, dataset2_name)
cat_data1$Percentage <- round(cat_data1$Count / cat_data1$Total * 100, 1)
cat_data2$Percentage <- round(cat_data2$Count / cat_data2$Total * 100, 1)

combined_categories <- rbind(cat_data1, cat_data2)

cat("\n=== DISTANCE CATEGORIES COMPARISON ===\n")

## 
## === DISTANCE CATEGORIES COMPARISON ===

print(combined_categories)

##      Dataset           Category   Count   Total Percentage
## 1  Reference Overlapping (0 bp) 5052329 7104055       71.1
## 2  Reference              ≤1 kb 5201746 7104055       73.2
## 3  Reference              ≤5 kb 5619392 7104055       79.1
## 4  Reference             ≤10 kb 5939978 7104055       83.6
## 5  Reference             ≤50 kb 6719248 7104055       94.6
## 6  Reference             >50 kb  384807 7104055        5.4
## 7    My DMPs Overlapping (0 bp)    6576    8042       81.8
## 8    My DMPs              ≤1 kb    7927    8042       98.6
## 9    My DMPs              ≤5 kb    8041    8042      100.0
## 10   My DMPs             ≤10 kb    8041    8042      100.0
## 11   My DMPs             ≤50 kb    8042    8042      100.0
## 12   My DMPs             >50 kb       0    8042        0.0

# Create comparative visualizations
cat("\nCreating comparative plots...\n")

## 
## Creating comparative plots...

# Prepare data for plotting
plot_data <- data.frame(
  distance = c(distances1, distances2),
  dataset = c(rep(dataset1_name, length(distances1)), 
              rep(dataset2_name, length(distances2)))
)

# 1. Comparative histogram
p1 <- ggplot(plot_data, aes(x = distance, fill = dataset)) +
  geom_histogram(bins = 50, alpha = 0.7, position = "identity") +
  scale_x_log10(labels = scales::comma) +
  scale_fill_manual(values = c("steelblue", "coral")) +
  labs(title = "Comparative Distribution of DMR-to-Gene Distances",
       subtitle = paste("Dataset 1: n =", length(distances1), 
                        "| Dataset 2: n =", length(distances2)),
       x = "Distance to nearest gene (bp, log scale)",
       y = "Count",
       fill = "Dataset") +
  theme_minimal() +
  theme(legend.position = "top")

# 2. Comparative density plots
p2 <- ggplot(plot_data, aes(x = distance, fill = dataset)) +
  geom_density(alpha = 0.5) +
  scale_x_log10(labels = scales::comma) +
  scale_fill_manual(values = c("steelblue", "coral")) +
  labs(title = "Comparative Density Distribution",
       x = "Distance to nearest gene (bp, log scale)",
       y = "Density",
       fill = "Dataset") +
  theme_minimal() +
  theme(legend.position = "top")

# 3. Side-by-side boxplots
p3 <- ggplot(plot_data, aes(x = dataset, y = distance, fill = dataset)) +
  geom_boxplot(alpha = 0.7) +
  scale_y_log10(labels = scales::comma) +
  scale_fill_manual(values = c("steelblue", "coral")) +
  labs(title = "Distance Distribution Comparison",
       subtitle = paste("Wilcoxon test p-value:", format(wilcox_test$p.value, scientific = TRUE)),
       x = "Dataset",
       y = "Distance (bp, log scale)") +
  theme_minimal() +
  theme(legend.position = "none")

# 4. Cumulative distribution comparison
distances1_sorted <- sort(distances1)
distances2_sorted <- sort(distances2)
cum_data <- data.frame(
  distance = c(distances1_sorted, distances2_sorted),
  cumulative = c((1:length(distances1_sorted)) / length(distances1_sorted),
                 (1:length(distances2_sorted)) / length(distances2_sorted)),
  dataset = c(rep(dataset1_name, length(distances1_sorted)),
              rep(dataset2_name, length(distances2_sorted)))
)

p4 <- ggplot(cum_data, aes(x = distance, y = cumulative, color = dataset)) +
  geom_line(size = 1) +
  scale_x_log10(labels = scales::comma) +
  scale_y_continuous(labels = scales::percent) +
  scale_color_manual(values = c("steelblue", "coral")) +
  labs(title = "Cumulative Distribution Comparison",
       x = "Distance to nearest gene (bp, log scale)",
       y = "Cumulative percentage of DMRs",
       color = "Dataset") +
  theme_minimal() +
  theme(legend.position = "top")

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# 5. Category comparison barplot
p5 <- ggplot(combined_categories, aes(x = Category, y = Percentage, fill = Dataset)) +
  geom_bar(stat = "identity", position = "dodge", alpha = 0.8) +
  scale_fill_manual(values = c("steelblue", "coral")) +
  labs(title = "Distance Categories Comparison",
       x = "Distance Category",
       y = "Percentage of DMRs",
       fill = "Dataset") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        legend.position = "top")

# Display plots
print(p1)

## Warning in scale_x_log10(labels = scales::comma): log-10 transformation
## introduced infinite values.

## Warning: Removed 5058905 rows containing non-finite outside the scale range
## (`stat_bin()`).

print(p2)

## Warning in scale_x_log10(labels = scales::comma): log-10 transformation
## introduced infinite values.

## Warning: Removed 5058905 rows containing non-finite outside the scale range
## (`stat_density()`).

print(p3)

## Warning in scale_y_log10(labels = scales::comma): log-10 transformation
## introduced infinite values.

## Warning: Removed 5058905 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

print(p4)

## Warning in scale_x_log10(labels = scales::comma): log-10 transformation
## introduced infinite values.

print(p5)

# Create a summary plot combining key comparisons
p_summary <- grid.arrange(p3, p4, ncol = 2, 
                         top = "DMR Distance Distribution Summary")

## Warning in scale_y_log10(labels = scales::comma): log-10 transformation introduced infinite values.
## Removed 5058905 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## Warning in scale_x_log10(labels = scales::comma): log-10 transformation
## introduced infinite values.

# Save results
cat("\nSaving results...\n")

## 
## Saving results...

write.csv(combined_results, "comparative_dmr_gene_distances.csv", row.names = FALSE)
write.csv(combined_categories, "comparative_distance_summary.csv", row.names = FALSE)

# Save plots
ggsave("comparative_histogram.png", p1, width = 12, height = 8, dpi = 300)

## Warning in scale_x_log10(labels = scales::comma): log-10 transformation
## introduced infinite values.

## Warning: Removed 5058905 rows containing non-finite outside the scale range
## (`stat_bin()`).

ggsave("comparative_density.png", p2, width = 12, height = 8, dpi = 300)

## Warning in scale_x_log10(labels = scales::comma): log-10 transformation
## introduced infinite values.

## Warning: Removed 5058905 rows containing non-finite outside the scale range
## (`stat_density()`).

ggsave("comparative_boxplot.png", p3, width = 10, height = 8, dpi = 300)

## Warning in scale_y_log10(labels = scales::comma): log-10 transformation
## introduced infinite values.

## Warning: Removed 5058905 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

ggsave("comparative_cumulative.png", p4, width = 12, height = 8, dpi = 300)

## Warning in scale_x_log10(labels = scales::comma): log-10 transformation
## introduced infinite values.

ggsave("comparative_categories.png", p5, width = 12, height = 8, dpi = 300)
ggsave("comparative_summary.png", p_summary, width = 16, height = 8, dpi = 300)

# Additional analysis: overlap between datasets
cat("\n=== OVERLAP ANALYSIS ===\n")

## 
## === OVERLAP ANALYSIS ===

overlap <- findOverlaps(dmrs1, dmrs2)
overlapping_dmrs1 <- unique(queryHits(overlap))
overlapping_dmrs2 <- unique(subjectHits(overlap))

cat(sprintf("DMRs overlapping between datasets:\n"))

## DMRs overlapping between datasets:

cat(sprintf("  Dataset 1: %d/%d (%.1f%%)\n", 
    length(overlapping_dmrs1), length(dmrs1), 
    length(overlapping_dmrs1)/length(dmrs1)*100))

##   Dataset 1: 4426/7104162 (0.1%)

cat(sprintf("  Dataset 2: %d/%d (%.1f%%)\n", 
    length(overlapping_dmrs2), length(dmrs2), 
    length(overlapping_dmrs2)/length(dmrs2)*100))

##   Dataset 2: 4932/8042 (61.3%)

cat("Analysis complete! Check the generated comparative plots and CSV files.\n")

## Analysis complete! Check the generated comparative plots and CSV files.

# Create comparative visualizations
cat("\nCreating comparative plots...\n")

## 
## Creating comparative plots...

# Prepare data for plotting
plot_data <- data.frame(
  distance = c(distances1, distances2),
  dataset = c(rep(dataset1_name, length(distances1)), 
              rep(dataset2_name, length(distances2)))
)

# 1. Comparative histogram
p1 <- ggplot(plot_data, aes(x = distance, fill = dataset)) +
  geom_histogram(bins = 50, alpha = 0.7, position = "identity") +
  scale_x_log10(labels = scales::comma) +
  scale_fill_manual(values = c("steelblue", "coral")) +
  labs(title = "Comparative Distribution of DMR-to-Gene Distances",
       subtitle = paste("Dataset 1: n =", length(distances1), 
                        "| Dataset 2: n =", length(distances2)),
       x = "Distance to nearest gene (bp, log scale)",
       y = "Count",
       fill = "Dataset") +
  theme_minimal() +
  theme(legend.position = "top")

# 2. Comparative density plots
p2 <- ggplot(plot_data, aes(x = distance, fill = dataset)) +
  geom_density(alpha = 0.5) +
  scale_x_log10(labels = scales::comma) +
  scale_fill_manual(values = c("steelblue", "coral")) +
  labs(title = "Comparative Density Distribution",
       x = "Distance to nearest gene (bp, log scale)",
       y = "Density",
       fill = "Dataset") +
  theme_minimal() +
  theme(legend.position = "top")

# 3. Side-by-side boxplots
p3 <- ggplot(plot_data, aes(x = dataset, y = distance, fill = dataset)) +
  geom_boxplot(alpha = 0.7) +
  scale_y_log10(labels = scales::comma) +
  scale_fill_manual(values = c("steelblue", "coral")) +
  labs(title = "Distance Distribution Comparison",
       subtitle = paste("Wilcoxon test p-value:", format(wilcox_test$p.value, scientific = TRUE)),
       x = "Dataset",
       y = "Distance (bp, log scale)") +
  theme_minimal() +
  theme(legend.position = "none")

# 4. Cumulative distribution comparison
distances1_sorted <- sort(distances1)
distances2_sorted <- sort(distances2)
cum_data <- data.frame(
  distance = c(distances1_sorted, distances2_sorted),
  cumulative = c((1:length(distances1_sorted)) / length(distances1_sorted),
                 (1:length(distances2_sorted)) / length(distances2_sorted)),
  dataset = c(rep(dataset1_name, length(distances1_sorted)),
              rep(dataset2_name, length(distances2_sorted)))
)

p4 <- ggplot(cum_data, aes(x = distance, y = cumulative, color = dataset)) +
  geom_line(size = 1) +
  scale_x_log10(labels = scales::comma) +
  scale_y_continuous(labels = scales::percent) +
  scale_color_manual(values = c("steelblue", "coral")) +
  labs(title = "Cumulative Distribution Comparison",
       x = "Distance to nearest gene (bp, log scale)",
       y = "Cumulative percentage of DMRs",
       color = "Dataset") +
  theme_minimal() +
  theme(legend.position = "top")

# 5. Category comparison barplot
p5 <- ggplot(combined_categories, aes(x = Category, y = Percentage, fill = Dataset)) +
  geom_bar(stat = "identity", position = "dodge", alpha = 0.8) +
  scale_fill_manual(values = c("steelblue", "coral")) +
  labs(title = "Distance Categories Comparison",
       x = "Distance Category",
       y = "Percentage of DMRs",
       fill = "Dataset") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        legend.position = "top")

# Display plots
print(p1)

## Warning in scale_x_log10(labels = scales::comma): log-10 transformation
## introduced infinite values.

## Warning: Removed 5058905 rows containing non-finite outside the scale range
## (`stat_bin()`).

print(p2)

## Warning in scale_x_log10(labels = scales::comma): log-10 transformation
## introduced infinite values.

## Warning: Removed 5058905 rows containing non-finite outside the scale range
## (`stat_density()`).

print(p3)

## Warning in scale_y_log10(labels = scales::comma): log-10 transformation
## introduced infinite values.

## Warning: Removed 5058905 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

print(p4)

## Warning in scale_x_log10(labels = scales::comma): log-10 transformation
## introduced infinite values.

print(p5)

# Create a summary plot combining key comparisons
p_summary <- grid.arrange(p3, p4, ncol = 2, 
                         top = "DMR Distance Distribution Summary")

## Warning in scale_y_log10(labels = scales::comma): log-10 transformation introduced infinite values.
## Removed 5058905 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## Warning in scale_x_log10(labels = scales::comma): log-10 transformation
## introduced infinite values.

# Save results
cat("\nSaving results...\n")

## 
## Saving results...

write.csv(combined_results, "comparative_dmr_gene_distances.csv", row.names = FALSE)
write.csv(combined_categories, "comparative_distance_summary.csv", row.names = FALSE)

# Save plots
ggsave("comparative_histogram.png", p1, width = 12, height = 8, dpi = 300)

## Warning in scale_x_log10(labels = scales::comma): log-10 transformation
## introduced infinite values.

## Warning: Removed 5058905 rows containing non-finite outside the scale range
## (`stat_bin()`).

ggsave("comparative_density.png", p2, width = 12, height = 8, dpi = 300)

## Warning in scale_x_log10(labels = scales::comma): log-10 transformation
## introduced infinite values.

## Warning: Removed 5058905 rows containing non-finite outside the scale range
## (`stat_density()`).

ggsave("comparative_boxplot.png", p3, width = 10, height = 8, dpi = 300)

## Warning in scale_y_log10(labels = scales::comma): log-10 transformation
## introduced infinite values.

## Warning: Removed 5058905 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

ggsave("comparative_cumulative.png", p4, width = 12, height = 8, dpi = 300)

## Warning in scale_x_log10(labels = scales::comma): log-10 transformation
## introduced infinite values.

ggsave("comparative_categories.png", p5, width = 12, height = 8, dpi = 300)
ggsave("comparative_summary.png", p_summary, width = 16, height = 8, dpi = 300)

# Additional analysis: overlap between datasets
cat("\n=== OVERLAP ANALYSIS ===\n")

## 
## === OVERLAP ANALYSIS ===

overlap <- findOverlaps(dmrs1, dmrs2)
overlapping_dmrs1 <- unique(queryHits(overlap))
overlapping_dmrs2 <- unique(subjectHits(overlap))

cat(sprintf("DMRs overlapping between datasets:\n"))

## DMRs overlapping between datasets:

cat(sprintf("  Dataset 1: %d/%d (%.1f%%)\n", 
    length(overlapping_dmrs1), length(dmrs1), 
    length(overlapping_dmrs1)/length(dmrs1)*100))

##   Dataset 1: 4426/7104162 (0.1%)

cat(sprintf("  Dataset 2: %d/%d (%.1f%%)\n", 
    length(overlapping_dmrs2), length(dmrs2), 
    length(overlapping_dmrs2)/length(dmrs2)*100))

##   Dataset 2: 4932/8042 (61.3%)

cat("Analysis complete! Check the generated comparative plots and CSV files.\n")

## Analysis complete! Check the generated comparative plots and CSV files.