Download the original data, but extract only the “human_primer_*.csv" files.
url <- "http://www.wisdom.weizmann.ac.il/~atanay/4cseq_pipe/4cseq_primer_db.tgz"
file_tarball <- basename(url)
if (! file.exists(file_tarball))
download.file(url, file_tarball)
primers <- c("Csp6I_BfaI",
"DpnII_BfaI",
"DpnII_Csp6I",
"NlaIII_BfaI",
"NlaIII_Csp6I",
"NlaIII_DpnII")
files <- file.path(tools::file_path_sans_ext(file_tarball),
paste0("human_primers_", primers, ".csv"))
if (! all(file.exists(files)))
untar(file_tarball, files = files)Unfortunately the TSV files have varying column names, named after the primer. Fix the naming using the tidyr package’s gather() function.
Additionally, create a corresponding fragments object to visualize the cut fragments. We will reuse the pp_id column which is unique to each fragment. In our case, creating the fragments by summarising with head() and str_c() are slow, about 20 seconds per file, because there are millions of length 2 pp_id groups. Summary operations are much faster when there are fewer groups. A better fast, though more complicated alternative would be to split into 2 tables and then perform the summary operations. But we instead use cache() from Biobase to speed up these operations.
suppressPackageStartupMessages({
library(Biobase) # cache
library(tidyverse) # %>% gather mutate rename group_by summarise
library(stringr) # str_split str_c
})
read_tsv_sites <- function(file) {
## The restriction site column names are based on the filenames,
## so we need to do standardize the column names we can combine
## the file rows together.
col_sites <- str_split(tools::file_path_sans_ext(file),
"[_.]", simplify = TRUE) %>%
as.character() %>% tail(2)
col_sites_orig <- str_c("loc", col_sites, "site", sep = "_")
## Don't read all the columns; just these 4.
col_names <- c("chr_alias", col_sites_orig, "pp_id")
## We must specify "chr_alias" as a character, otherwise it would
## be assumed to be an integer, and we would lose the X and Y
## chromosomes.
columns <- list(col_character(), col_integer(), col_integer(),
col_integer())
names(columns) <- col_names
names(col_sites_orig) <- col_sites
read_tsv(file, col_types = do.call(cols_only, columns)) %>%
rename_(.dots = col_sites_orig) %>%
gather_(key = "site", value = "loc", gather_cols = col_sites)
}
cache(
sites_ <- lapply(files, read_tsv_sites) %>% bind_rows() %>%
rename_("chrom" = "chr_alias") %>% arrange(chrom, loc) %>%
mutate(chrom = str_c("chr", chrom))
)
cache(
fragments_ <- sites_ %>%
group_by(pp_id) %>%
summarise(chrom = head(chrom, 1),
start = min(loc),
end = max(loc),
name = str_c(site, collapse = "_"))
)
sites_
#> # A tibble: 5,634,098 × 4
#> chrom pp_id site loc
#> <chr> <int> <chr> <int>
#> 1 chr1 10872946 Csp6I 55158
#> 2 chr1 10872946 BfaI 55575
#> 3 chr1 3258 NlaIII 62708
#> 4 chr1 3258 DpnII 63095
#> 5 chr1 12832397 Csp6I 565605
#> 6 chr1 12832397 DpnII 567010
#> 7 chr1 12832401 Csp6I 567565
#> 8 chr1 12832401 DpnII 568209
#> 9 chr1 12833144 DpnII 718860
#> 10 chr1 12833144 Csp6I 719929
#> # ... with 5,634,088 more rows
fragments_
#> # A tibble: 2,611,623 × 5
#> pp_id chrom start end name
#> <int> <chr> <int> <int> <chr>
#> 1 3258 chr1 62708 63095 NlaIII_DpnII
#> 2 8869 chr1 749642 750061 DpnII_NlaIII
#> 3 9015 chr1 768660 769142 DpnII_NlaIII
#> 4 9076 chr1 777269 777656 NlaIII_DpnII
#> 5 9323 chr1 797184 797691 DpnII_NlaIII
#> 6 9586 chr1 820923 821239 NlaIII_DpnII
#> 7 9703 chr1 831435 832016 DpnII_NlaIII
#> 8 9774 chr1 839455 840285 NlaIII_DpnII
#> 9 9803 chr1 845800 846453 DpnII_NlaIII
#> 10 9835 chr1 850402 850704 DpnII_NlaIII
#> # ... with 2,611,613 more rowssuppressPackageStartupMessages({
library(rtracklayer) # export.bed
})
## Coerce sites to GRanges.
sites <- sites_ %>% dplyr::rename(start = loc, name = site) %>%
mutate(end = start) %>% GRanges()
## Coercing fragments requires no work.
fragments <- GRanges(fragments_)
save_bed <- function(gr) {
file <- substitute(gr) %>% deparse() %>% str_c(".bed.gz")
if (! file.exists(file))
export.bed(gr, file)
}
save_bed(fragments)
save_bed(sites)suppressPackageStartupMessages({
library(ggbio) # autoplot
})
hg19 <- GenomeInfoDb::Seqinfo(genome="hg19") %>%
keepStandardChromosomes()
seqinfo(sites) <- hg19
seqinfo(fragments) <- hg19
#> Warning in valid.GenomicRanges.seqinfo(x, suggest.trim = TRUE): GRanges object contains 9930 out-of-bound ranges located on
#> sequences chr11, chr12, chr13, chr14, chr20, chr21, chr22, chr15,
#> chr16, chr17, chr18, and chr19. Note that only ranges located on a
#> non-circular sequence whose length is not NA can be considered
#> out-of-bound (use seqlengths() and isCircular() to get the lengths
#> and circularity flags of the underlying sequences). You can use
#> trim() to trim these ranges. See ?`trim,GenomicRanges-method` for
#> more information.
set.seed(123)
idx <- sample.int(length(sites), size = 10000)
autoplot(sites[idx], layout = "karyogram")
#> Scale for 'x' is already present. Adding another scale for 'x', which
#> will replace the existing scale.
#> Scale for 'x' is already present. Adding another scale for 'x', which
#> will replace the existing scale.