Quick and dirty lncRNA bed file from annotables

Load the libs you need

library(tidyverse)
library(annotables)

Function to create it

#' Build a BED-like table of genes of the requested biotype(s)
#'
#' Keeps the rows of `annotable` whose `biotype` matches `mybiotype`, reduces
#' them to the `chr`, `start`, `end`, `strand`, and `symbol` columns, drops
#' duplicated rows, and sorts the result by `chr`, `start`, `end`, `strand`.
#'
#' Coordinates and strand are passed through exactly as stored in
#' `annotable`; no conversion is applied here.
#' NOTE(review): the UCSC BED specification uses 0-based starts and `+`/`-`
#' strands -- convert before writing a strict .bed file if a tool needs that.
#'
#' @param annotable A data frame with `biotype`, `chr`, `start`, `end`,
#'   `strand`, and `symbol` columns (e.g. `annotables::grch37`).
#' @param mybiotype Biotype(s) to keep. A single string works as before; a
#'   character vector keeps rows matching any of the listed biotypes.
#' @return The filtered, de-duplicated, sorted table with the columns
#'   `chr`, `start`, `end`, `strand`, `symbol`.
create_bed_from_annotable <- function(annotable, mybiotype) {
  annotable %>%
    # `%in%` (not `==`) so several biotypes are matched as a set instead of
    # being silently recycled against the column; `!!` forces the function
    # argument even if `annotable` happens to have a `mybiotype` column.
    filter(biotype %in% !!mybiotype) %>%
    select(chr, start, end, strand, symbol) %>%
    distinct() %>%
    arrange(chr, start, end, strand)
}

LincRNAs on grch37:

# lincRNA entries from the grch37 annotation table.
grch37 %>% create_bed_from_annotable("lincRNA")

## # A tibble: 7,340 x 5
##    chr    start    end strand symbol       
##    <chr>  <int>  <int>  <int> <chr>        
##  1 1      29554  31109      1 MIR1302-10   
##  2 1      34554  36081     -1 FAM138A      
##  3 1      89295 133566     -1 RP11-34P13.7 
##  4 1      89551  91105     -1 RP11-34P13.8 
##  5 1     160446 161525      1 RP11-34P13.9 
##  6 1     227615 267253     -1 AP006222.2   
##  7 1     317720 453948      1 RP4-669L17.10
##  8 1     453633 460480     -1 RP4-669L17.2 
##  9 1     459656 461954      1 RP5-857K21.15
## 10 1     521369 523833     -1 RP5-857K21.1 
## # ... with 7,330 more rows

Or with the "chr" prefix on the chromosome names

# Same table, with "chr" prepended to every chromosome name.
grch37 %>%
  create_bed_from_annotable("lincRNA") %>%
  mutate(chr = paste0("chr", chr))

## # A tibble: 7,340 x 5
##    chr    start    end strand symbol       
##    <chr>  <int>  <int>  <int> <chr>        
##  1 chr1   29554  31109      1 MIR1302-10   
##  2 chr1   34554  36081     -1 FAM138A      
##  3 chr1   89295 133566     -1 RP11-34P13.7 
##  4 chr1   89551  91105     -1 RP11-34P13.8 
##  5 chr1  160446 161525      1 RP11-34P13.9 
##  6 chr1  227615 267253     -1 AP006222.2   
##  7 chr1  317720 453948      1 RP4-669L17.10
##  8 chr1  453633 460480     -1 RP4-669L17.2 
##  9 chr1  459656 461954      1 RP5-857K21.15
## 10 chr1  521369 523833     -1 RP5-857K21.1 
## # ... with 7,330 more rows

Do it with protein coding on GRCh38

# Protein-coding entries from the grch38 annotation table.
grch38 %>% create_bed_from_annotable("protein_coding")

## # A tibble: 22,375 x 5
##    chr     start     end strand symbol 
##    <chr>   <int>   <int>  <int> <chr>  
##  1 1       65419   71585      1 OR4F5  
##  2 1      450703  451697     -1 OR4F29 
##  3 1      685679  686673     -1 OR4F16 
##  4 1      923928  944581      1 SAMD11 
##  5 1      944204  959309     -1 NOC2L  
##  6 1      960587  965715      1 KLHL17 
##  7 1      966497  975865      1 PLEKHN1
##  8 1      975204  982093     -1 PERM1  
##  9 1      998962 1000172     -1 HES4   
## 10 1     1001138 1014541      1 ISG15  
## # ... with 22,365 more rows

Pseudogenes might make a nice negative control? Maybe??

# Processed-pseudogene entries from the grch38 annotation table.
grch38 %>% create_bed_from_annotable("processed_pseudogene")

## # A tibble: 10,803 x 5
##    chr     start     end strand symbol    
##    <chr>   <int>   <int>  <int> <chr>     
##  1 1      131025  134836      1 CICP27    
##  2 1      135141  135895     -1 AL627309.7
##  3 1      137682  137965     -1 AL627309.8
##  4 1      347982  348366     -1 RPL23AP24 
##  5 1      439870  440232      1 WBP1LP7   
##  6 1      487101  489906      1 CICP7     
##  7 1      674842  675265      1 WBP1LP6   
##  8 1      722092  724903      1 CICP3     
##  9 1     1008076 1008229     -1 AL645608.4
## 10 1     1378666 1379032      1 NDUFB4P8  
## # ... with 10,793 more rows

Note that the biotype is called something different in b37 (GRCh37): `pseudogene` rather than `processed_pseudogene`.

# Entries with biotype "pseudogene" from the grch37 annotation table.
grch37 %>% create_bed_from_annotable("pseudogene")

## # A tibble: 15,583 x 5
##    chr    start    end strand symbol       
##    <chr>  <int>  <int>  <int> <chr>        
##  1 1      11869  14412      1 DDX11L1      
##  2 1      14363  29806     -1 WASH7P       
##  3 1      52473  54936      1 OR4G4P       
##  4 1      62948  63887      1 OR4G11P      
##  5 1     131025 134836      1 CICP27       
##  6 1     135141 135895     -1 RP11-34P13.15
##  7 1     137682 137965     -1 RP11-34P13.16
##  8 1     228292 228775     -1 AP006222.1   
##  9 1     326096 328112      1 RP4-669L17.8 
## 10 1     329431 332236     -1 CICP7        
## # ... with 15,573 more rows

Quick and dirty lncRNA bed file from annotables

Stephen Turner

4/4/2018