Read primer DB

Pariksheet Nanda

2017-05-01

Fetch the data

Download the original data, but extract only the “human_primer_*.csv" files.

url <- "http://www.wisdom.weizmann.ac.il/~atanay/4cseq_pipe/4cseq_primer_db.tgz"
file_tarball <- basename(url)
if (! file.exists(file_tarball))
    download.file(url, file_tarball)
primers <- c("Csp6I_BfaI",
             "DpnII_BfaI",
             "DpnII_Csp6I",
             "NlaIII_BfaI",
             "NlaIII_Csp6I",
             "NlaIII_DpnII")
files <- file.path(tools::file_path_sans_ext(file_tarball),
                   paste0("human_primers_", primers, ".csv"))
if (! all(file.exists(files)))
    untar(file_tarball, files = files)

Tidy up the data as restriction sites and fragments

Unfortunately the TSV files have varying column names, named after the primer. Fix the naming using the tidyr package’s gather() function.

Additionally, create a corresponding fragments object to visualize the cut fragments. We will reuse the pp_id column which is unique to each fragment. In our case, creating the fragments by summarising with head() and str_c() are slow, about 20 seconds per file, because there are millions of length 2 pp_id groups. Summary operations are much faster when there are fewer groups. A better fast, though more complicated alternative would be to split into 2 tables and then perform the summary operations. But we instead use cache() from Biobase to speed up these operations.

suppressPackageStartupMessages({
    library(Biobase)     # cache
    library(tidyverse)   # %>% gather mutate rename group_by summarise
    library(stringr)     # str_split str_c
})

read_tsv_sites <- function(file) {
    ## The restriction site column names are based on the filenames,
    ## so we need to do standardize the column names we can combine
    ## the file rows together.
    col_sites <- str_split(tools::file_path_sans_ext(file),
                           "[_.]", simplify = TRUE) %>%
        as.character() %>% tail(2)
    col_sites_orig <- str_c("loc", col_sites, "site", sep = "_")
    ## Don't read all the columns; just these 4.
    col_names <- c("chr_alias", col_sites_orig, "pp_id")
    ## We must specify "chr_alias" as a character, otherwise it would
    ## be assumed to be an integer, and we would lose the X and Y
    ## chromosomes.
    columns <- list(col_character(), col_integer(), col_integer(),
                    col_integer())
    names(columns) <- col_names
    names(col_sites_orig) <- col_sites
    read_tsv(file, col_types = do.call(cols_only, columns)) %>%
        rename_(.dots = col_sites_orig) %>%
        gather_(key = "site", value = "loc", gather_cols = col_sites)
}
cache(
    sites_ <- lapply(files, read_tsv_sites) %>% bind_rows() %>%
        rename_("chrom" = "chr_alias") %>% arrange(chrom, loc) %>%
        mutate(chrom = str_c("chr", chrom))
)
cache(
    fragments_ <- sites_ %>%
        group_by(pp_id) %>%
        summarise(chrom = head(chrom, 1),
                  start = min(loc),
                  end = max(loc),
                  name = str_c(site, collapse = "_"))
)

sites_
#> # A tibble: 5,634,098 × 4
#>    chrom    pp_id   site    loc
#>    <chr>    <int>  <chr>  <int>
#> 1   chr1 10872946  Csp6I  55158
#> 2   chr1 10872946   BfaI  55575
#> 3   chr1     3258 NlaIII  62708
#> 4   chr1     3258  DpnII  63095
#> 5   chr1 12832397  Csp6I 565605
#> 6   chr1 12832397  DpnII 567010
#> 7   chr1 12832401  Csp6I 567565
#> 8   chr1 12832401  DpnII 568209
#> 9   chr1 12833144  DpnII 718860
#> 10  chr1 12833144  Csp6I 719929
#> # ... with 5,634,088 more rows
fragments_
#> # A tibble: 2,611,623 × 5
#>    pp_id chrom  start    end         name
#>    <int> <chr>  <int>  <int>        <chr>
#> 1   3258  chr1  62708  63095 NlaIII_DpnII
#> 2   8869  chr1 749642 750061 DpnII_NlaIII
#> 3   9015  chr1 768660 769142 DpnII_NlaIII
#> 4   9076  chr1 777269 777656 NlaIII_DpnII
#> 5   9323  chr1 797184 797691 DpnII_NlaIII
#> 6   9586  chr1 820923 821239 NlaIII_DpnII
#> 7   9703  chr1 831435 832016 DpnII_NlaIII
#> 8   9774  chr1 839455 840285 NlaIII_DpnII
#> 9   9803  chr1 845800 846453 DpnII_NlaIII
#> 10  9835  chr1 850402 850704 DpnII_NlaIII
#> # ... with 2,611,613 more rows

Save as BED files for viewing in the UCSC browser

suppressPackageStartupMessages({
    library(rtracklayer)                # export.bed
})

## Coerce sites to GRanges.
sites <- sites_ %>% dplyr::rename(start = loc, name = site) %>%
    mutate(end = start) %>% GRanges()
## Coercing fragments requires no work.
fragments <- GRanges(fragments_)

save_bed <- function(gr) {
    file <- substitute(gr) %>% deparse() %>% str_c(".bed.gz")
    if (! file.exists(file))
        export.bed(gr, file)
}
save_bed(fragments)
save_bed(sites)

Visualize in R with ggbio

suppressPackageStartupMessages({
    library(ggbio)                      # autoplot
})

hg19 <- GenomeInfoDb::Seqinfo(genome="hg19") %>%
    keepStandardChromosomes()
seqinfo(sites) <- hg19
seqinfo(fragments) <- hg19
#> Warning in valid.GenomicRanges.seqinfo(x, suggest.trim = TRUE): GRanges object contains 9930 out-of-bound ranges located on
#>   sequences chr11, chr12, chr13, chr14, chr20, chr21, chr22, chr15,
#>   chr16, chr17, chr18, and chr19. Note that only ranges located on a
#>   non-circular sequence whose length is not NA can be considered
#>   out-of-bound (use seqlengths() and isCircular() to get the lengths
#>   and circularity flags of the underlying sequences). You can use
#>   trim() to trim these ranges. See ?`trim,GenomicRanges-method` for
#>   more information.

set.seed(123)
idx <- sample.int(length(sites), size = 10000)
autoplot(sites[idx], layout = "karyogram")
#> Scale for 'x' is already present. Adding another scale for 'x', which
#> will replace the existing scale.
#> Scale for 'x' is already present. Adding another scale for 'x', which
#> will replace the existing scale.