• ヒトゲノムGRCh38の最新のgtfファイルは ここから ダウンロードする。
  • 過去のリリースはここからダウンロード

1 gtfファイル読み込み

library(readr)
setwd("~/db/genome/GRCh38")
gtf82 <- read_delim("Homo_sapiens.GRCh38.82.gtf.gz", 
                    delim = "\t", col_names = F, skip = 5, 
                    col_types = "cccnncccc")

gtf83 <- read_delim("Homo_sapiens.GRCh38.83.gtf.gz", 
                    delim = "\t", col_names = F, skip = 5, 
                    col_types = "cccnncccc")

gtf85 <- read_delim("Homo_sapiens.GRCh38.85.gtf.gz",
                    delim = "\t", col_names = F, skip = 5,
                    col_types = "cccnncccc")

#check
dim(gtf82); dim(gtf83); dim(gtf85)
## [1] 2561563       9
## [1] 2569150       9
## [1] 2575871       9

1.1 transcriptのみを抽出

## transcriptのみを抽出
gtf82.transcript <- subset(gtf82, gtf82$X3=="transcript")
gtf83.transcript <- subset(gtf83, gtf83$X3=="transcript")
gtf85.transcript <- subset(gtf85, gtf85$X3=="transcript")


## 文字列置換とデータ整形
suppressMessages(suppressWarnings(library(dplyr)))

tr82 <- lapply(strsplit(gtf82.transcript$X9, ";"), "[", c(3, 1, 5, 7))
tr82 <- tr82 %>%
  lapply(function(x){sub("gene_id \"", "", x)}) %>%
  lapply(function(x){sub(" transcript_id \"", "", x)}) %>%
  lapply(function(x){sub(" gene_name \"", "", x)}) %>%
  lapply(function(x){sub(" gene_biotype \"", "", x)}) %>%
  lapply(function(x){gsub("\"", "", x)})


tr83 <- lapply(strsplit(gtf83.transcript$X9, ";"), "[", c(3, 1, 5, 7))
tr83 <- tr83 %>%
  lapply(function(x){sub("gene_id \"", "", x)}) %>%
  lapply(function(x){sub(" transcript_id \"", "", x)}) %>%
  lapply(function(x){sub(" gene_name \"", "", x)}) %>%
  lapply(function(x){sub(" gene_biotype \"", "", x)}) %>%
  lapply(function(x){gsub("\"", "", x)})

tr85 <- lapply(strsplit(gtf85.transcript$X9, ";"), "[", c(3, 1, 5, 7))
tr85 <- tr85 %>%
  lapply(function(x){sub("gene_id \"", "", x)}) %>%
  lapply(function(x){sub(" transcript_id \"", "", x)}) %>%
  lapply(function(x){sub(" gene_name \"", "", x)}) %>%
  lapply(function(x){sub(" gene_biotype \"", "", x)}) %>%
  lapply(function(x){gsub("\"", "", x)})


tr82.dat <- data.frame(matrix(unlist(tr82), nc=4, byrow = T), row.names = NULL, stringsAsFactors = F)
names(tr82.dat) <- c("transcript_id", "gene_id", "gene_name", "gene_biotype")

tr83.dat <- data.frame(matrix(unlist(tr83), nc=4, byrow = T), row.names = NULL, stringsAsFactors = F)
names(tr83.dat) <- c("transcript_id", "gene_id", "gene_name", "gene_biotype")

tr85.dat <- data.frame(matrix(unlist(tr85), nc=4, byrow = T), row.names = NULL, stringsAsFactors = F)
names(tr85.dat) <- c("transcript_id", "gene_id", "gene_name", "gene_biotype")

## transcriptの数、geneの数
length(tr82.dat$transcript_id); length(tr83.dat$transcript_id); length(tr85.dat$transcript_id)
## [1] 198634
## [1] 199184
## [1] 198002
length(unique(tr82.dat$gene_id)); length(unique(tr83.dat$gene_id)); length(unique(tr85.dat$gene_id))
## [1] 60619
## [1] 60675
## [1] 58051
# check
knitr::kable(head(tr82.dat), format = "pandoc", caption="release82 (1-6行)")
release82 (1-6行)
transcript_id gene_id gene_name gene_biotype
ENST00000456328 ENSG00000223972 DDX11L1 transcribed_unprocessed_pseudogene
ENST00000450305 ENSG00000223972 DDX11L1 transcribed_unprocessed_pseudogene
ENST00000488147 ENSG00000227232 WASH7P unprocessed_pseudogene
ENST00000619216 ENSG00000278267 MIR6859-1 miRNA
ENST00000473358 ENSG00000243485 RP11-34P13.3 lincRNA
ENST00000469289 ENSG00000243485 RP11-34P13.3 lincRNA
knitr::kable(head(tr83.dat), format = "pandoc", caption="release83 (1-6行)")
release83 (1-6行)
transcript_id gene_id gene_name gene_biotype
ENST00000456328 ENSG00000223972 DDX11L1 transcribed_unprocessed_pseudogene
ENST00000450305 ENSG00000223972 DDX11L1 transcribed_unprocessed_pseudogene
ENST00000488147 ENSG00000227232 WASH7P unprocessed_pseudogene
ENST00000619216 ENSG00000278267 MIR6859-1 miRNA
ENST00000473358 ENSG00000243485 RP11-34P13.3 lincRNA
ENST00000469289 ENSG00000243485 RP11-34P13.3 lincRNA
knitr::kable(head(tr85.dat), format = "pandoc", caption="release85 (1-6行)")
release85 (1-6行)
transcript_id gene_id gene_name gene_biotype
ENST00000456328 ENSG00000223972 DDX11L1 transcribed_unprocessed_pseudogene
ENST00000450305 ENSG00000223972 DDX11L1 transcribed_unprocessed_pseudogene
ENST00000488147 ENSG00000227232 WASH7P unprocessed_pseudogene
ENST00000619216 ENSG00000278267 MIR6859-1 miRNA
ENST00000473358 ENSG00000243485 MIR1302-2 lincRNA
ENST00000469289 ENSG00000243485 MIR1302-2 lincRNA

1.2 gtfのversion違いによる変更箇所

suppressMessages(suppressWarnings(library(gplots)))
# 結合
all.dat <- rbind(tr82.dat, tr83.dat, tr85.dat)
all.dat <- all.dat[match(unique(all.dat$transcript_id),
                         all.dat$transcript_id),]

# transcript ---
tr82.tid <- unique(tr82.dat$transcript_id); length(tr82.tid)
## [1] 198634
tr83.tid <- unique(tr83.dat$transcript_id); length(tr83.tid)
## [1] 199184
tr85.tid <- unique(tr85.dat$transcript_id); length(tr85.tid)
## [1] 198002
data <- list(release82=tr82.tid, release83=tr83.tid, release85=tr85.tid)

## 積集合,和集合, 差集合
insec <- intersect(tr82.tid, intersect(tr83.tid, tr85.tid))
uni   <- union(tr82.tid, union(tr83.tid, tr85.tid))
dif <- setdiff(uni, insec)

all.dif <- all.dat[all.dat$transcript_id %in% dif,]; dim(all.dif)
## [1] 6611    4
## plot
par(mfrow=c(2,2))
par(mar=c(1,1,1,1))
venn(data)
par(mar=c(4,10,2,1))
barplot(table(all.dif$gene_biotype), horiz = T, las=2, cex.names = 0.6, main = "changed transcript_id")

# gene ----
tr82.gid <- unique(tr82.dat$gene_id)
tr83.gid <- unique(tr83.dat$gene_id)
tr85.gid <- unique(tr85.dat$gene_id)

data <-list(release82=tr82.gid, release83=tr83.gid, release85=tr85.gid)

## 積集合,和集合, 差集合
insec <- intersect(tr82.gid, intersect(tr83.gid, tr85.gid))
uni   <- union(tr82.gid, union(tr83.gid, tr85.gid))
dif <- setdiff(uni, insec)

all.dif <- all.dat[all.dat$gene_id %in% dif,]; dim(all.dif)
## [1] 3885    4
## plot
par(mar=c(1,1,1,1))
venn(data)
par(mar=c(4,10,2,1))
barplot(table(all.dif$gene_biotype), horiz = T, las=2, cex.names = 0.6, main = "changed gene_id")

2 環境

sessionInfo()
## R version 3.3.0 (2016-05-03)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
## Running under: OS X 10.10.5 (Yosemite)
## 
## locale:
## [1] ja_JP.UTF-8/ja_JP.UTF-8/ja_JP.UTF-8/C/ja_JP.UTF-8/ja_JP.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] gplots_3.0.1 dplyr_0.4.3  readr_0.2.2 
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_0.12.4        gtools_3.5.0       digest_0.6.9      
##  [4] assertthat_0.1     bitops_1.0-6       R6_2.1.2          
##  [7] DBI_0.4-1          formatR_1.4        magrittr_1.5      
## [10] evaluate_0.9       KernSmooth_2.23-15 highr_0.6         
## [13] stringi_1.0-1      gdata_2.17.0       rmarkdown_0.9.6   
## [16] tools_3.3.0        stringr_1.0.0      yaml_2.1.13       
## [19] parallel_3.3.0     caTools_1.17.1     htmltools_0.3.5   
## [22] knitr_1.13