library(readr)
setwd("~/db/genome/GRCh38")
gtf82 <- read_delim("Homo_sapiens.GRCh38.82.gtf.gz",
delim = "\t", col_names = F, skip = 5,
col_types = "cccnncccc")
gtf83 <- read_delim("Homo_sapiens.GRCh38.83.gtf.gz",
delim = "\t", col_names = F, skip = 5,
col_types = "cccnncccc")
gtf85 <- read_delim("Homo_sapiens.GRCh38.85.gtf.gz",
delim = "\t", col_names = F, skip = 5,
col_types = "cccnncccc")
#check
dim(gtf82); dim(gtf83); dim(gtf85)## [1] 2561563 9
## [1] 2569150 9
## [1] 2575871 9
## transcriptのみを抽出
gtf82.transcript <- subset(gtf82, gtf82$X3=="transcript")
gtf83.transcript <- subset(gtf83, gtf83$X3=="transcript")
gtf85.transcript <- subset(gtf85, gtf85$X3=="transcript")
## 文字列置換とデータ整形
suppressMessages(suppressWarnings(library(dplyr)))
tr82 <- lapply(strsplit(gtf82.transcript$X9, ";"), "[", c(3, 1, 5, 7))
tr82 <- tr82 %>%
lapply(function(x){sub("gene_id \"", "", x)}) %>%
lapply(function(x){sub(" transcript_id \"", "", x)}) %>%
lapply(function(x){sub(" gene_name \"", "", x)}) %>%
lapply(function(x){sub(" gene_biotype \"", "", x)}) %>%
lapply(function(x){gsub("\"", "", x)})
tr83 <- lapply(strsplit(gtf83.transcript$X9, ";"), "[", c(3, 1, 5, 7))
tr83 <- tr83 %>%
lapply(function(x){sub("gene_id \"", "", x)}) %>%
lapply(function(x){sub(" transcript_id \"", "", x)}) %>%
lapply(function(x){sub(" gene_name \"", "", x)}) %>%
lapply(function(x){sub(" gene_biotype \"", "", x)}) %>%
lapply(function(x){gsub("\"", "", x)})
tr85 <- lapply(strsplit(gtf85.transcript$X9, ";"), "[", c(3, 1, 5, 7))
tr85 <- tr85 %>%
lapply(function(x){sub("gene_id \"", "", x)}) %>%
lapply(function(x){sub(" transcript_id \"", "", x)}) %>%
lapply(function(x){sub(" gene_name \"", "", x)}) %>%
lapply(function(x){sub(" gene_biotype \"", "", x)}) %>%
lapply(function(x){gsub("\"", "", x)})
tr82.dat <- data.frame(matrix(unlist(tr82), nc=4, byrow = T), row.names = NULL, stringsAsFactors = F)
names(tr82.dat) <- c("transcript_id", "gene_id", "gene_name", "gene_biotype")
tr83.dat <- data.frame(matrix(unlist(tr83), nc=4, byrow = T), row.names = NULL, stringsAsFactors = F)
names(tr83.dat) <- c("transcript_id", "gene_id", "gene_name", "gene_biotype")
tr85.dat <- data.frame(matrix(unlist(tr85), nc=4, byrow = T), row.names = NULL, stringsAsFactors = F)
names(tr85.dat) <- c("transcript_id", "gene_id", "gene_name", "gene_biotype")
## transcriptの数、geneの数
length(tr82.dat$transcript_id); length(tr83.dat$transcript_id); length(tr85.dat$transcript_id)## [1] 198634
## [1] 199184
## [1] 198002
length(unique(tr82.dat$gene_id)); length(unique(tr83.dat$gene_id)); length(unique(tr85.dat$gene_id))## [1] 60619
## [1] 60675
## [1] 58051
# check
knitr::kable(head(tr82.dat), format = "pandoc", caption="release82 (1-6行)")| transcript_id | gene_id | gene_name | gene_biotype |
|---|---|---|---|
| ENST00000456328 | ENSG00000223972 | DDX11L1 | transcribed_unprocessed_pseudogene |
| ENST00000450305 | ENSG00000223972 | DDX11L1 | transcribed_unprocessed_pseudogene |
| ENST00000488147 | ENSG00000227232 | WASH7P | unprocessed_pseudogene |
| ENST00000619216 | ENSG00000278267 | MIR6859-1 | miRNA |
| ENST00000473358 | ENSG00000243485 | RP11-34P13.3 | lincRNA |
| ENST00000469289 | ENSG00000243485 | RP11-34P13.3 | lincRNA |
knitr::kable(head(tr83.dat), format = "pandoc", caption="release83 (1-6行)")| transcript_id | gene_id | gene_name | gene_biotype |
|---|---|---|---|
| ENST00000456328 | ENSG00000223972 | DDX11L1 | transcribed_unprocessed_pseudogene |
| ENST00000450305 | ENSG00000223972 | DDX11L1 | transcribed_unprocessed_pseudogene |
| ENST00000488147 | ENSG00000227232 | WASH7P | unprocessed_pseudogene |
| ENST00000619216 | ENSG00000278267 | MIR6859-1 | miRNA |
| ENST00000473358 | ENSG00000243485 | RP11-34P13.3 | lincRNA |
| ENST00000469289 | ENSG00000243485 | RP11-34P13.3 | lincRNA |
knitr::kable(head(tr85.dat), format = "pandoc", caption="release85 (1-6行)")| transcript_id | gene_id | gene_name | gene_biotype |
|---|---|---|---|
| ENST00000456328 | ENSG00000223972 | DDX11L1 | transcribed_unprocessed_pseudogene |
| ENST00000450305 | ENSG00000223972 | DDX11L1 | transcribed_unprocessed_pseudogene |
| ENST00000488147 | ENSG00000227232 | WASH7P | unprocessed_pseudogene |
| ENST00000619216 | ENSG00000278267 | MIR6859-1 | miRNA |
| ENST00000473358 | ENSG00000243485 | MIR1302-2 | lincRNA |
| ENST00000469289 | ENSG00000243485 | MIR1302-2 | lincRNA |
suppressMessages(suppressWarnings(library(gplots)))
# 結合
all.dat <- rbind(tr82.dat, tr83.dat, tr85.dat)
all.dat <- all.dat[match(unique(all.dat$transcript_id),
all.dat$transcript_id),]
# transcript ---
tr82.tid <- unique(tr82.dat$transcript_id); length(tr82.tid)## [1] 198634
tr83.tid <- unique(tr83.dat$transcript_id); length(tr83.tid)## [1] 199184
tr85.tid <- unique(tr85.dat$transcript_id); length(tr85.tid)## [1] 198002
data <- list(release82=tr82.tid, release83=tr83.tid, release85=tr85.tid)
## 積集合,和集合, 差集合
insec <- intersect(tr82.tid, intersect(tr83.tid, tr85.tid))
uni <- union(tr82.tid, union(tr83.tid, tr85.tid))
dif <- setdiff(uni, insec)
all.dif <- all.dat[all.dat$transcript_id %in% dif,]; dim(all.dif)## [1] 6611 4
## plot
par(mfrow=c(2,2))
par(mar=c(1,1,1,1))
venn(data)
par(mar=c(4,10,2,1))
barplot(table(all.dif$gene_biotype), horiz = T, las=2, cex.names = 0.6, main = "changed transcript_id")
# gene ----
tr82.gid <- unique(tr82.dat$gene_id)
tr83.gid <- unique(tr83.dat$gene_id)
tr85.gid <- unique(tr85.dat$gene_id)
data <-list(release82=tr82.gid, release83=tr83.gid, release85=tr85.gid)
## 積集合,和集合, 差集合
insec <- intersect(tr82.gid, intersect(tr83.gid, tr85.gid))
uni <- union(tr82.gid, union(tr83.gid, tr85.gid))
dif <- setdiff(uni, insec)
all.dif <- all.dat[all.dat$gene_id %in% dif,]; dim(all.dif)## [1] 3885 4
## plot
par(mar=c(1,1,1,1))
venn(data)
par(mar=c(4,10,2,1))
barplot(table(all.dif$gene_biotype), horiz = T, las=2, cex.names = 0.6, main = "changed gene_id")sessionInfo()## R version 3.3.0 (2016-05-03)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
## Running under: OS X 10.10.5 (Yosemite)
##
## locale:
## [1] ja_JP.UTF-8/ja_JP.UTF-8/ja_JP.UTF-8/C/ja_JP.UTF-8/ja_JP.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] gplots_3.0.1 dplyr_0.4.3 readr_0.2.2
##
## loaded via a namespace (and not attached):
## [1] Rcpp_0.12.4 gtools_3.5.0 digest_0.6.9
## [4] assertthat_0.1 bitops_1.0-6 R6_2.1.2
## [7] DBI_0.4-1 formatR_1.4 magrittr_1.5
## [10] evaluate_0.9 KernSmooth_2.23-15 highr_0.6
## [13] stringi_1.0-1 gdata_2.17.0 rmarkdown_0.9.6
## [16] tools_3.3.0 stringr_1.0.0 yaml_2.1.13
## [19] parallel_3.3.0 caTools_1.17.1 htmltools_0.3.5
## [22] knitr_1.13