Why to find unannotated transcribed regions?

Usually is a hard challenge to find the count reads that we have outside of annotated regions within a RNAseq experiment. In essence, we have some genes that we know about and we are able to see that they are transcribed as the have aligned read coverage, but there are another regiones that don’t lie in any annotations and we have to know the locations of those alignments of the reads. Hence, we will find such regions with Bioconductor library and some libraries.

The dataset is called windows.bam and genes.gff and it will be a synthetic genome around 6.000 bp region and 2 gene features with reads and third unannotated region with aligning reads (https://github.com/PacktPublishing/R-Bioinformatics-Cookbook/tree/master). As shown in the following image:

Example used in the R Bioinformatics Cookbook
Example used in the R Bioinformatics Cookbook

Required Bioconductor installation

if (!require("BiocManager", quietly = TRUE))
    install.packages("BiocManager")
BiocManager::install(version = "3.21")
BiocManager::install("IRanges") #
BiocManager::install("csaw")# 3 required packages for this purpose
BiocManager::install("rtracklayer")#

library(SummarizedExperiment)

### Also we need the readr package
install.packages("readr")
## package 'readr' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\USUARIO\AppData\Local\Temp\RtmpC4iMKc\downloaded_packages
install.packages("magrittr")
## package 'magrittr' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\USUARIO\AppData\Local\Temp\RtmpC4iMKc\downloaded_packages
install.packages("ggplot2")
## package 'ggplot2' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\USUARIO\AppData\Local\Temp\RtmpC4iMKc\downloaded_packages
install.packages("forcats")
## package 'forcats' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\USUARIO\AppData\Local\Temp\RtmpC4iMKc\downloaded_packages
library(ggplot2)

Set up loading function

get_annotated_regions_from_gff <- function(genes.gff) {
  gff <- rtracklayer::import.gff(genes.gff)
  as(gff, "GRanges")
}

if (!requireNamespace("Rsamtools", quietly = TRUE)) {
  BiocManager::install("Rsamtools")
}
library(Rsamtools)

bam_file <- file.path("datasets", "windows.bam")
indexBam(bam_file)   # Esto crea windows.bam.bai
##       datasets/windows.bam 
## "datasets/windows.bam.bai"

Getting genome counts with csaw

# Input files
bam_file <- file.path("datasets", "windows.bam")
gff_file <- file.path("datasets", "genes.gff")

# Read BAM with csaw
whole_genome <- csaw::windowCounts(
  bam_file,
  bin = TRUE,
  filter = 0,
  width = 500,
  param = csaw::readParam(minq = 20, dedup = TRUE, pe = "both")
)

colnames(whole_genome) <- "small_data"

# Import annotations
annotated_regions <- get_annotated_regions_from_gff(gff_file)
# Find overlap between genome windows and gene annotations
windows_in_genes <- IRanges::overlapsAny(
  SummarizedExperiment::rowRanges(whole_genome),
  annotated_regions
)

# Separate counts
annotated_window_counts <- whole_genome[windows_in_genes, ]
non_annotated_window_counts <- whole_genome[!windows_in_genes, ]

# Show counts for non-annotated regions
assay(non_annotated_window_counts)
##      small_data
## [1,]          0
## [2,]          0
## [3,]         24
## [4,]         25
## [5,]          0
## [6,]          0
library(dplyr)
library(ggplot2)

# Extraer conteos y coordenadas
df <- data.frame(
  seqnames = as.character(GenomicRanges::seqnames(SummarizedExperiment::rowRanges(whole_genome))),
  start = GenomicRanges::start(SummarizedExperiment::rowRanges(whole_genome)),
  end = GenomicRanges::end(SummarizedExperiment::rowRanges(whole_genome)),
  count = SummarizedExperiment::assay(whole_genome)[, 1] # primera (y única) muestra
)

# Identificar ventanas anotadas
df$annotated <- IRanges::overlapsAny(
  SummarizedExperiment::rowRanges(whole_genome),
  annotated_regions
)

# Graficar cobertura en el genoma
ggplot(df, aes(x = start, y = count, fill = annotated)) +
  geom_col(width = 400) + # barras por cada ventana (ancho cercano a 500 bp)
  scale_fill_manual(values = c("TRUE" = "steelblue", "FALSE" = "tomato"),
                    labels = c("FALSE" = "Unannotated", "TRUE" = "Annotated")) +
  labs(
    title = "Read coverage across genome windows",
    x = "Genomic position (bp)",
    y = "Read counts",
    fill = "Region type"
  ) +
  theme_minimal(base_size = 13)