Intersecting BED sequences

Bioinformatics
Tidyverse
R
Author

Agalic Rodriguez-Duboc

Published

May 24, 2023

Setup

file_path1 <- system.file("extdata", "example_merge.bed", package = "bedtorch")

(tbl_x <- read_bed(file_path1, genome = "hs37-1kg"))
GRanges object with 20 ranges and 1 metadata column:
       seqnames    ranges strand |     score
          <Rle> <IRanges>  <Rle> | <integer>
   [1]       21       4-6      * |         8
   [2]       21     11-16      * |         5
   [3]       21     13-17      * |         4
   [4]       21     16-22      * |         5
   [5]       21     23-25      * |         7
   ...      ...       ...    ... .       ...
  [16]       22     52-59      * |         9
  [17]       22     54-57      * |         5
  [18]       22     58-65      * |         9
  [19]       22     63-70      * |         8
  [20]       22     66-72      * |         2
  -------
  seqinfo: 92 sequences from GRCh37 genome
df_x <- as.data.frame(tbl_x)
file_path2 <- system.file("extdata", "example_intersect_y.bed", package = "bedtorch")

(tbl_y <- read_bed(file_path2, genome = "hs37-1kg"))
GRanges object with 3 ranges and 0 metadata columns:
      seqnames    ranges strand
         <Rle> <IRanges>  <Rle>
  [1]       21     23-36      *
  [2]       21     48-55      *
  [3]       22     41-60      *
  -------
  seqinfo: 92 sequences from GRCh37 genome
df_y <- as.data.frame(tbl_y)

1 Using the tidyverse:

(inner_join(df_x, df_y, join_by(seqnames, overlaps(x$start, x$end, y$start, y$end))) 
  |> rowwise()
  |> mutate(
    start = min(intersect(start.x:end.x, start.y:end.y)), 
    end = max(intersect(start.x:end.x, start.y:end.y)),
    width = end - start + 1
  )
  |> ungroup() 
  |> select(seqnames, start, end, width, strand = strand.x, score)
)
# A tibble: 12 × 6
   seqnames start   end width strand score
   <fct>    <int> <int> <dbl> <fct>  <int>
 1 21          23    25     3 *          7
 2 21          27    30     4 *          7
 3 21          30    35     6 *          9
 4 21          48    49     2 *          1
 5 21          48    50     3 *          2
 6 21          54    55     2 *          5
 7 22          41    41     1 *          8
 8 22          43    44     2 *          9
 9 22          47    50     4 *          0
10 22          52    59     8 *          9
11 22          54    57     4 *          5
12 22          58    60     3 *          9

A more efficient solution using R’s vectorized pmin & pmax:

(inner_join(df_x, df_y, join_by(seqnames, overlaps(x$start, x$end, y$start, y$end))) 
  |> mutate(
    start = pmax(start.x, start.y), 
    end = pmin(end.x, end.y), 
    width = end - start + 1
  )
  |> select(seqnames, start, end, width, strand = strand.x, score)
)
   seqnames start end width strand score
1        21    23  25     3      *     7
2        21    27  30     4      *     7
3        21    30  35     6      *     9
4        21    48  49     2      *     1
5        21    48  50     3      *     2
6        21    54  55     2      *     5
7        22    41  41     1      *     8
8        22    43  44     2      *     9
9        22    47  50     4      *     0
10       22    52  59     8      *     9
11       22    54  57     4      *     5
12       22    58  60     3      *     9

2 Using the specialized package bedtorch:

intersect_bed(tbl_x, tbl_y)
GRanges object with 12 ranges and 1 metadata column:
       seqnames    ranges strand |     score
          <Rle> <IRanges>  <Rle> | <integer>
   [1]       21     23-25      * |         7
   [2]       21     27-30      * |         7
   [3]       21     30-35      * |         9
   [4]       21     48-49      * |         1
   [5]       21     48-50      * |         2
   ...      ...       ...    ... .       ...
   [8]       22     43-44      * |         9
   [9]       22     47-50      * |         0
  [10]       22     52-59      * |         9
  [11]       22     54-57      * |         5
  [12]       22     58-60      * |         9
  -------
  seqinfo: 92 sequences from GRCh37 genome