ゲノムデータを読み込んでsummaryを出力する
fastaファイル(genome)を読み込んでNXとLXもしくはNGXとLGXを計算する関数
- N1~N99の値を計算する。
- ゲノムサイズを指定しない場合はアセンブリの合計サイズをゲノムサイズとする。
# 5つのアセンブリに対してN1~N99までの値を求める.
nxd_cgr <- nx(cgr, asmbl = "Cgr1.0")
nxd_c_gr <- nx(c_gr, asmbl = "C_griseus1.0")
nxd_crgr <- nx(crgr, asmbl = "CriGri_1.0")
nxd_crgrp <- nx(crgrp, asmbl = "CriGri_PICR")
nxd_chkgs <- nx(chkgs, asmbl = "CHOK1GS_HDv1")
# 1つのデータフレームにまとめる.
dats <- list(nxd_cgr, nxd_c_gr, nxd_crgr, nxd_crgrp, nxd_chkgs)
ngdats <- Reduce(rbind, dats) %>%
mutate_at(vars(assembly), list(~ factor(., levels = unique(.))))
head(ngdats)
## NGX LGX bp Mbp assembly
## 1 1 2 11220275 11.220275 Cgr1.0
## 2 2 5 8029756 8.029756 Cgr1.0
## 3 3 8 7032613 7.032613 Cgr1.0
## 4 4 11 6490304 6.490304 Cgr1.0
## 5 5 15 5477275 5.477275 Cgr1.0
## 6 6 20 5227729 5.227729 Cgr1.0

fastaファイル(genome)を読み込んでsummaryを出力する関数
- スキャッフォールド数、全塩基数、N50, L50, GC(%), N(%) , etc
- ゲノムデータの統計値, 塩基の頻度のテーブル, 各コンディグの塩基数
genome_summary <- function(in_f, N, genome = NULL){
# read fasta file ----
if (class(in_f) == "DNAStringSet") {
seq <- in_f
} else if (file.exists(in_f)) {
seq <- Biostrings::readDNAStringSet(in_f)
} else {
stop("'in_f' must be 'DNAStringSet' object, or fasta file PATH")
}
# genome size ----
if (is.null(genome)) {
genome <- sum(as.numeric(Biostrings::width(seq)))
}
# contig or scaffold ----
ncontig <- length(seq)
# total bases ---
wseq <- Biostrings::width(seq)
bp <- sum(as.numeric(wseq))
max_cntg <- max(wseq)
min_cntg <- min(wseq)
res <- rskoseq::nx(in_f = seq, genome)
nxbp <- res$bp[res$NGX %in% N]
lxn <- res$LGX[res$NGX %in% N]
# GC content(%), N content (%) ----
frq_base <- apply(Biostrings::alphabetFrequency(seq), 2, sum)
at <- sum(as.numeric(frq_base[names(frq_base) %in% c("A","T")]))
gc <- sum(as.numeric(frq_base[names(frq_base) %in% c("C","G")]))
gc_rate <- gc/(at + gc)
n_rate <- frq_base[which(names(frq_base) == "N")]/bp
value <- as.character(c(bp, ncontig, max_cntg, min_cntg, nxbp, lxn,
round(n_rate, digits = 3), round(gc_rate, digits = 3)))
tag <- c("Total sequence length(bp)", "Number of contigs", "Longest contig(bp)", "Shortest contig(bp)",
paste0("N", N), paste0("L", N), "N", "GC")
gsum <- data.frame(tag, value, stringsAsFactors = F)
return(list(summary = gsum, base_freq = frq_base, dist_cntg = wseq))
}
ゲノムデータのサマリー
Cgr <- genome_summary(cgr, N = c(90, 50, 10))
C_gr <- genome_summary(c_gr, N = c(90, 50, 10))
CriGri <- genome_summary(crgr, N = c(90, 50, 10))
CriGri_PICR <- genome_summary(crgrp, N = c(90, 50, 10))
CHK1GS <- genome_summary(chkgs, N = c(90, 50, 10))
# 結果
rsko::kb_utls(Cgr$summary)
rsko::kb_utls(stack(Cgr$base_freq))
asmbl <- c("Cgr1.0", "C_griseus1.0", "CriGri_1.0", "CriGri_PICR", "CHOK1GS_HDv1")
smls <- list(Cgr, C_gr, CriGri, CriGri_PICR, CHK1GS)
cho_sum <- lapply(seq_along(smls), function(i){
setNames(smls[[i]]$summary, c("tag", asmbl[i]))}) %>%
Reduce(function(x,y) merge(x, y, by = "tag"), .) %>%
mutate_at(vars(tag), as.character) %>%
.[match(Cgr$summary$tag, .$tag),]
knitr::kable(cho_sum, format = "pandoc", align = "l", caption="C. griceus genome summary")
tag
|
value
|
Total sequence length(bp)
|
2332774290
|
Number of contigs
|
28749
|
Longest contig(bp)
|
14658418
|
Shortest contig(bp)
|
830
|
N90
|
4040277
|
N50
|
1236516
|
N10
|
180686
|
L90
|
41
|
L50
|
501
|
L10
|
2251
|
N
|
0.104
|
GC
|
0.413
|
values
|
ind
|
612896951
|
A
|
430937235
|
C
|
431194278
|
G
|
613965601
|
T
|
11240
|
M
|
23120
|
R
|
10003
|
W
|
6824
|
S
|
22657
|
Y
|
11009
|
K
|
422
|
V
|
517
|
H
|
493
|
D
|
331
|
B
|
243693609
|
N
|
0
|
|
0
|
|
0
|
.
|
C. griceus genome summary
12 |
Total sequence length(bp) |
2332774290 |
2360130144 |
2399786748 |
2368906908 |
2358167390 |
10 |
Number of contigs |
28749 |
52710 |
109152 |
1830 |
8265 |
5 |
Longest contig(bp) |
14658418 |
8324132 |
8779783 |
80584097 |
224834208 |
11 |
Shortest contig(bp) |
830 |
201 |
200 |
568 |
2000 |
9 |
N90 |
4040277 |
3925365 |
3787113 |
64410965 |
169160581 |
8 |
N50 |
1236516 |
1558295 |
1147233 |
19581774 |
62039716 |
7 |
N10 |
180686 |
395288 |
102441 |
4400567 |
14197385 |
4 |
L90 |
41 |
47 |
49 |
4 |
2 |
3 |
L50 |
501 |
450 |
547 |
33 |
12 |
2 |
L10 |
2251 |
1558 |
3177 |
122 |
42 |
6 |
N |
0.104 |
0.025 |
0.034 |
0.001 |
0.015 |
1 |
GC |
0.413 |
0.414 |
0.414 |
0.415 |
0.415 |
scaffold長のヒストグラム
# 配列長
w.cgr <- width(cgr)
w.c_gr <- width(c_gr)
w.crigri <- width(crgr)
w.crgrp <- width(crgrp)
w.chkgs <- width(chkgs)
# スキャッフォールド長データ
pacman::p_load(ggplot2, dplyr)
dat <- data.frame(key = c(rep("Cgr1.0", length(w.cgr)),
rep("C_griseus1.0", length(w.c_gr)),
rep("CriGri_1.0", length(w.crigri)),
rep("CriGri_PICR", length(w.crgrp)),
rep("CHOK1GS_HDv1", length(w.chkgs))
),
scf_len = c(w.cgr, w.c_gr, w.crigri, w.crgrp, w.chkgs),
check.names = F) %>%
mutate(key = factor(key,
levels = c("Cgr1.0","C_griseus1.0","CriGri_1.0",
"CriGri_PICR", "CHOK1GS_HDv1")))
# スキャッフォールド長サマリー
sum_dat <- dat %>%
group_by(key) %>%
summarize_at(vars(scf_len), list(mean = mean, median = median,
max = max, min = min))
knitr::kable(sum_dat, format = "pandoc", align = "l",
caption = "C. griceus genome scaffolds")
C. griceus genome scaffolds
Cgr1.0 |
81142.80 |
1927 |
14658418 |
830 |
C_griseus1.0 |
44775.76 |
363 |
8324132 |
201 |
CriGri_1.0 |
21985.73 |
503 |
8779783 |
200 |
CriGri_PICR |
1294484.65 |
37031 |
80584097 |
568 |
CHOK1GS_HDv1 |
285319.71 |
3141 |
224834208 |
2000 |


gff読み込み
options(readr.show_progress = F)
# gffファイルパス ----
f.cgr.gff <- "~/db/genome/Cgr1.0/GCA_000448345.1_Cgr1.0_genomic.gff.gz"
f.c_gr.gff <- "~/db/genome/C_griseus1.0/GCA_000419365.1_C_griseus_v1.0_genomic.gff.gz"
f.crgr.gff <- "~/db/genome/CriGri_1.0/Cricetulus_griseus_crigri.CriGri_1.0.90.gff3"
f.crgrp.gff <- "~/db/genome/CriGri_PICR/Cricetulus_griseus_picr.CriGri-PICR.96.gff3.gz"
f.chok1gs.gff <- "~/db/genome/CHOK1GS_HDv1/CHOK1GS_HDv1.90.gff3.gz"
# gffファイル読み込み ----
cgr.gff <- read_tsv(f.cgr.gff, col_names = F, col_types = "ccccccccc", comment = "#")
c_gr.gff <- read_tsv(f.c_gr.gff, col_names = F, col_types = "ccccccccc", comment = "#")
crgr.gff <- read_tsv(f.crgr.gff, col_names = F, col_types = "ccccccccc", comment = "#")
crgrp.gff <- read_tsv(f.crgrp.gff, col_names=F, col_types="ccccccccc", comment = "#")
chkgs.gff <- read_tsv(f.chok1gs.gff, col_names=F, col_types="ccccccccc", comment = "#")
Cgr1.0
X1
|
X2
|
X3
|
X4
|
X5
|
X6
|
X7
|
X8
|
X9
|
KE662775.1
|
Genbank
|
region
|
1
|
7004031
|
.
|
|
.
|
ID=id0;Dbxref=taxon:10029;Name=1;chromosome=1;gbkey=Src;genome=genomic;map=unlocalized;mol_type=genomic DNA;strain=17A/GY
|
KE662775.1
|
Genbank
|
gene
|
632
|
3313
|
.
|
|
.
|
ID=gene0;Name=H671_1g0001;gbkey=Gene;gene_biotype=protein_coding;locus_tag=H671_1g0001
|
KE662775.1
|
Genbank
|
mRNA
|
632
|
3313
|
.
|
|
.
|
ID=rna0;Parent=gene0;gbkey=mRNA;product=glutathione S-transferase theta-1-like protein
|
KE662775.1
|
Genbank
|
exon
|
632
|
1635
|
.
|
|
.
|
ID=id1;Parent=rna0;gbkey=mRNA;product=glutathione S-transferase theta-1-like protein
|
KE662775.1
|
Genbank
|
exon
|
2910
|
3313
|
.
|
|
.
|
ID=id2;Parent=rna0;gbkey=mRNA;product=glutathione S-transferase theta-1-like protein
|
KE662775.1
|
Genbank
|
CDS
|
1462
|
1635
|
.
|
|
0
|
ID=cds0;Parent=rna0;Dbxref=InterPro:IPR004046,InterPro:IPR010987,InterPro:IPR017933,NCBI_GP:ERE92752.1;Name=ERE92752.1;gbkey=CDS;product=glutathione S-transferase theta-1-like protein;protein_id=ERE92752.1
|
C_griseus_v1.0
X1
|
X2
|
X3
|
X4
|
X5
|
X6
|
X7
|
X8
|
X9
|
AMDS01163391.1
|
Genbank
|
region
|
1
|
201
|
.
|
|
.
|
ID=id0;Dbxref=taxon:10029;collection-date=Feb-2011;country=China;gbkey=Src;mol_type=genomic DNA
|
AMDS01163392.1
|
Genbank
|
region
|
1
|
201
|
.
|
|
.
|
ID=id1;Dbxref=taxon:10029;collection-date=Feb-2011;country=China;gbkey=Src;mol_type=genomic DNA
|
AMDS01163393.1
|
Genbank
|
region
|
1
|
201
|
.
|
|
.
|
ID=id2;Dbxref=taxon:10029;collection-date=Feb-2011;country=China;gbkey=Src;mol_type=genomic DNA
|
AMDS01163394.1
|
Genbank
|
region
|
1
|
201
|
.
|
|
.
|
ID=id3;Dbxref=taxon:10029;collection-date=Feb-2011;country=China;gbkey=Src;mol_type=genomic DNA
|
AMDS01163395.1
|
Genbank
|
region
|
1
|
201
|
.
|
|
.
|
ID=id4;Dbxref=taxon:10029;collection-date=Feb-2011;country=China;gbkey=Src;mol_type=genomic DNA
|
AMDS01163396.1
|
Genbank
|
region
|
1
|
201
|
.
|
|
.
|
ID=id5;Dbxref=taxon:10029;collection-date=Feb-2011;country=China;gbkey=Src;mol_type=genomic DNA
|
CriGri_1.0
X1
|
X2
|
X3
|
X4
|
X5
|
X6
|
X7
|
X8
|
X9
|
JH000001.1
|
UCSC computational genomics laboratory
|
supercontig
|
1
|
8779783
|
.
|
.
|
.
|
ID=supercontig:JH000001.1;Alias=scaffold329,NW_003613580.1
|
JH000001.1
|
.
|
biological_region
|
323137
|
323205
|
0.7
|
|
.
|
external_name=rank %3D 1;logic_name=firstef
|
JH000001.1
|
.
|
biological_region
|
1257524
|
1257825
|
0.551
|
|
.
|
external_name=rank %3D 1;logic_name=firstef
|
JH000001.1
|
.
|
biological_region
|
1280596
|
1280668
|
22.3
|
|
.
|
external_name=Pseudo;logic_name=trnascan
|
JH000001.1
|
.
|
biological_region
|
1460954
|
1461024
|
23.5
|
|
.
|
external_name=Pseudo;logic_name=trnascan
|
JH000001.1
|
.
|
biological_region
|
1578225
|
1578873
|
0.535
|
|
.
|
external_name=rank %3D 1;logic_name=firstef
|
CriGri_PICR
X1
|
X2
|
X3
|
X4
|
X5
|
X6
|
X7
|
X8
|
X9
|
RAZU01000001.1
|
Ensembl
|
region
|
1
|
17833227
|
.
|
.
|
.
|
ID=region:RAZU01000001.1;Alias=picr_41_new
|
RAZU01000001.1
|
.
|
biological_region
|
320691
|
321248
|
631
|
.
|
.
|
external_name=oe %3D 1.05;logic_name=cpg
|
RAZU01000001.1
|
.
|
biological_region
|
320718
|
320720
|
0.999
|
|
.
|
logic_name=eponine
|
RAZU01000001.1
|
.
|
biological_region
|
320969
|
320972
|
0.999
|
|
.
|
logic_name=eponine
|
RAZU01000001.1
|
.
|
biological_region
|
321094
|
321097
|
0.999
|
|
.
|
logic_name=eponine
|
RAZU01000001.1
|
.
|
biological_region
|
321155
|
321157
|
0.999
|
|
.
|
logic_name=eponine
|
CHOK1GS_HDv1
X1
|
X2
|
X3
|
X4
|
X5
|
X6
|
X7
|
X8
|
X9
|
JH000001.1
|
UCSC computational genomics laboratory
|
supercontig
|
1
|
8779783
|
.
|
.
|
.
|
ID=supercontig:JH000001.1;Alias=scaffold329,NW_003613580.1
|
JH000001.1
|
.
|
biological_region
|
323137
|
323205
|
0.7
|
|
.
|
external_name=rank %3D 1;logic_name=firstef
|
JH000001.1
|
.
|
biological_region
|
1257524
|
1257825
|
0.551
|
|
.
|
external_name=rank %3D 1;logic_name=firstef
|
JH000001.1
|
.
|
biological_region
|
1280596
|
1280668
|
22.3
|
|
.
|
external_name=Pseudo;logic_name=trnascan
|
JH000001.1
|
.
|
biological_region
|
1460954
|
1461024
|
23.5
|
|
.
|
external_name=Pseudo;logic_name=trnascan
|
JH000001.1
|
.
|
biological_region
|
1578225
|
1578873
|
0.535
|
|
.
|
external_name=rank %3D 1;logic_name=firstef
|
環境
## R version 3.5.2 (2018-12-20)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS High Sierra 10.13.6
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
##
## locale:
## [1] ja_JP.UTF-8/ja_JP.UTF-8/ja_JP.UTF-8/C/ja_JP.UTF-8/ja_JP.UTF-8
##
## attached base packages:
## [1] stats4 parallel stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] ggplot2_3.3.3 dplyr_0.8.5 readr_1.4.0
## [4] Biostrings_2.50.2 XVector_0.22.0 IRanges_2.16.0
## [7] S4Vectors_0.20.1 BiocGenerics_0.28.0
##
## loaded via a namespace (and not attached):
## [1] colorspace_2.0-0 ellipsis_0.3.1
## [3] htmlTable_2.1.0 GenomicRanges_1.34.0
## [5] base64enc_0.1-3 rstudioapi_0.13
## [7] farver_2.0.3 bit64_4.0.5
## [9] AnnotationDbi_1.44.0 fansi_0.4.2
## [11] xml2_1.3.2 codetools_0.2-18
## [13] splines_3.5.2 cachem_1.0.4
## [15] DESeq_1.34.1 geneplotter_1.60.0
## [17] knitr_1.31 Formula_1.2-4
## [19] jsonlite_1.7.2 TCC_1.22.1
## [21] rskoseq_0.1.0 annotate_1.60.1
## [23] baySeq_2.16.0 cluster_2.1.1
## [25] compiler_3.5.2 httr_1.4.2
## [27] backports_1.2.1 assertthat_0.2.1
## [29] Matrix_1.2-18 fastmap_1.1.0
## [31] limma_3.38.3 htmltools_0.5.1.1
## [33] tools_3.5.2 gtable_0.3.0
## [35] glue_1.4.2 GenomeInfoDbData_1.2.0
## [37] Rcpp_1.0.6 Biobase_2.42.0
## [39] jquerylib_0.1.3 vctrs_0.3.7
## [41] svglite_1.2.3 iterators_1.0.13
## [43] xfun_0.21 stringr_1.4.0
## [45] ROC_1.58.0 rvest_0.3.6
## [47] lifecycle_1.0.0 pacman_0.5.1
## [49] XML_3.99-0.3 edgeR_3.24.3
## [51] zlibbioc_1.28.0 scales_1.1.1
## [53] hms_1.0.0 SummarizedExperiment_1.12.0
## [55] RColorBrewer_1.1-2 yaml_2.2.1
## [57] memoise_2.0.0 gridExtra_2.3
## [59] gdtools_0.2.1 sass_0.3.1
## [61] rpart_4.1-15 latticeExtra_0.6-28
## [63] stringi_1.5.3 RSQLite_2.2.3
## [65] highr_0.8 genefilter_1.64.0
## [67] foreach_1.5.1 checkmate_2.0.0
## [69] BiocParallel_1.16.6 GenomeInfoDb_1.18.2
## [71] systemfonts_0.1.1 rlang_0.4.10
## [73] pkgconfig_2.0.3 matrixStats_0.58.0
## [75] bitops_1.0-6 evaluate_0.14
## [77] lattice_0.20-41 purrr_0.3.4
## [79] htmlwidgets_1.5.3 labeling_0.4.2
## [81] bit_4.0.4 tidyselect_1.1.0
## [83] plyr_1.8.6 magrittr_2.0.1
## [85] DESeq2_1.22.2 R6_2.5.0
## [87] Hmisc_4.4-2 DelayedArray_0.8.0
## [89] DBI_1.1.1 pillar_1.5.1
## [91] foreign_0.8-72 withr_2.4.1
## [93] survival_3.2-7 abind_1.4-5
## [95] RCurl_1.98-1.2 nnet_7.3-15
## [97] tibble_3.1.0 crayon_1.4.1
## [99] utf8_1.2.1 rmarkdown_2.7
## [101] rsko_0.1.0 locfit_1.5-9.4
## [103] grid_3.5.2 data.table_1.14.0
## [105] blob_1.2.1 webshot_0.5.2
## [107] digest_0.6.27 xtable_1.8-4
## [109] munsell_0.5.0 viridisLite_0.3.0
## [111] kableExtra_1.3.4 bslib_0.2.4