Load the libraries you need:
library(tidyverse)
library(annotables)
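Define a function that builds a BED-like table (chromosome, start, end, strand, gene symbol) for all genes of a given biotype in an annotable: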
#' Build a BED-like table of genes of one biotype from an annotable
#'
#' Keeps the rows of `annotable` whose `biotype` equals `mybiotype`, reduces
#' them to the columns `chr`, `start`, `end`, `strand` and `symbol`, drops
#' duplicated rows, and sorts the result by `chr`, `start`, `end`, `strand`.
#'
#' @param annotable A data frame with at least the columns `biotype`, `chr`,
#'   `start`, `end`, `strand` and `symbol` (e.g. `grch37` or `grch38`).
#' @param mybiotype The biotype value to keep, e.g. `"lincRNA"`.
#' @return A data frame with the columns `chr`, `start`, `end`, `strand`,
#'   `symbol`, one row per distinct combination, sorted by position.
#'
#' NOTE(review): coordinates and strand are passed through unchanged. BED
#' files conventionally use 0-based starts and a chrom/start/end/name/score/
#' strand column order -- confirm what downstream tools expect before writing
#' this table to a .bed file.
create_bed_from_annotable <- function(annotable, mybiotype) {
  annotable %>%
    # `.env$` pins `mybiotype` to the function argument; a bare `mybiotype`
    # would be shadowed by a column of the same name in `annotable`.
    filter(.data$biotype == .env$mybiotype) %>%
    select(chr, start, end, strand, symbol) %>%
    distinct() %>%
    arrange(chr, start, end, strand)
}
LincRNAs on grch37:
# lincRNA rows of the `grch37` table.
grch37 %>%
  create_bed_from_annotable("lincRNA")
## # A tibble: 7,340 x 5
## chr start end strand symbol
## <chr> <int> <int> <int> <chr>
## 1 1 29554 31109 1 MIR1302-10
## 2 1 34554 36081 -1 FAM138A
## 3 1 89295 133566 -1 RP11-34P13.7
## 4 1 89551 91105 -1 RP11-34P13.8
## 5 1 160446 161525 1 RP11-34P13.9
## 6 1 227615 267253 -1 AP006222.2
## 7 1 317720 453948 1 RP4-669L17.10
## 8 1 453633 460480 -1 RP4-669L17.2
## 9 1 459656 461954 1 RP5-857K21.15
## 10 1 521369 523833 -1 RP5-857K21.1
## # ... with 7,330 more rows
Or with a `chr` prefix added to the chromosome names:
# Same lincRNA table, with "chr" prepended to every chromosome name.
grch37 %>%
  create_bed_from_annotable("lincRNA") %>%
  mutate(chr = paste0("chr", chr))
## # A tibble: 7,340 x 5
## chr start end strand symbol
## <chr> <int> <int> <int> <chr>
## 1 chr1 29554 31109 1 MIR1302-10
## 2 chr1 34554 36081 -1 FAM138A
## 3 chr1 89295 133566 -1 RP11-34P13.7
## 4 chr1 89551 91105 -1 RP11-34P13.8
## 5 chr1 160446 161525 1 RP11-34P13.9
## 6 chr1 227615 267253 -1 AP006222.2
## 7 chr1 317720 453948 1 RP4-669L17.10
## 8 chr1 453633 460480 -1 RP4-669L17.2
## 9 chr1 459656 461954 1 RP5-857K21.15
## 10 chr1 521369 523833 -1 RP5-857K21.1
## # ... with 7,330 more rows
Do the same for protein-coding genes on GRCh38:
# protein_coding rows of the `grch38` table.
grch38 %>%
  create_bed_from_annotable("protein_coding")
## # A tibble: 22,375 x 5
## chr start end strand symbol
## <chr> <int> <int> <int> <chr>
## 1 1 65419 71585 1 OR4F5
## 2 1 450703 451697 -1 OR4F29
## 3 1 685679 686673 -1 OR4F16
## 4 1 923928 944581 1 SAMD11
## 5 1 944204 959309 -1 NOC2L
## 6 1 960587 965715 1 KLHL17
## 7 1 966497 975865 1 PLEKHN1
## 8 1 975204 982093 -1 PERM1
## 9 1 998962 1000172 -1 HES4
## 10 1 1001138 1014541 1 ISG15
## # ... with 22,365 more rows
Processed pseudogenes might make a useful negative control (this is only a tentative suggestion):
# processed_pseudogene rows of the `grch38` table.
grch38 %>%
  create_bed_from_annotable("processed_pseudogene")
## # A tibble: 10,803 x 5
## chr start end strand symbol
## <chr> <int> <int> <int> <chr>
## 1 1 131025 134836 1 CICP27
## 2 1 135141 135895 -1 AL627309.7
## 3 1 137682 137965 -1 AL627309.8
## 4 1 347982 348366 -1 RPL23AP24
## 5 1 439870 440232 1 WBP1LP7
## 6 1 487101 489906 1 CICP7
## 7 1 674842 675265 1 WBP1LP6
## 8 1 722092 724903 1 CICP3
## 9 1 1008076 1008229 -1 AL645608.4
## 10 1 1378666 1379032 1 NDUFB4P8
## # ... with 10,793 more rows
Note that the corresponding biotype has a different name in GRCh37 (b37), where it is `pseudogene`:
# pseudogene rows of the `grch37` table.
grch37 %>%
  create_bed_from_annotable("pseudogene")
## # A tibble: 15,583 x 5
## chr start end strand symbol
## <chr> <int> <int> <int> <chr>
## 1 1 11869 14412 1 DDX11L1
## 2 1 14363 29806 -1 WASH7P
## 3 1 52473 54936 1 OR4G4P
## 4 1 62948 63887 1 OR4G11P
## 5 1 131025 134836 1 CICP27
## 6 1 135141 135895 -1 RP11-34P13.15
## 7 1 137682 137965 -1 RP11-34P13.16
## 8 1 228292 228775 -1 AP006222.1
## 9 1 326096 328112 1 RP4-669L17.8
## 10 1 329431 332236 -1 CICP7
## # ... with 15,573 more rows