Pipeline method
# Baseline ("for loop") construction of one genomic range per gene. The whole
# computation is wrapped in system.time() and its timing is stored in `t1`.
#
# Reads:  `gtf`, which must already exist and must coerce through
#         as.data.frame() into a table with the columns gene_id, seqnames,
#         start, end, strand and gene_name.
# Writes: `annotation`, `genes`, `anno` and `annoData` (a GRanges object with
#         one range per gene, named by gene id, with a `symbol` column).
#
# NOTE(review): the gene subset is recomputed for every field inside the loop.
# That is left untouched here because this block appears to be the reference
# implementation whose run time is being measured; confirm before optimizing.
t1 <- system.time({
  # Flatten the annotation and drop rows that are exact duplicates.
  annotation <- as.data.frame(gtf)
  annotation <- annotation[!duplicated(annotation), ]
  genes <- unique(as.character(annotation$gene_id))

  # Preallocate the per-gene summary table (all NA), one row per gene id.
  anno <- matrix(nrow = length(genes), ncol = 6)
  anno <- as.data.frame(anno)
  colnames(anno) <- c("chr", "start", "end", "strand", "gene", "symbol")
  anno$gene <- genes

  # seq_len() instead of 1:nrow(anno): with zero genes, 1:0 would iterate over
  # c(1, 0) and index rows that do not exist; seq_len(0) skips the loop.
  for (i in seq_len(nrow(anno))) {
    # NOTE(review): unique() is assumed to yield exactly one chromosome,
    # strand and gene name per gene id -- verify against the input annotation.
    anno$chr[i] <- unique(as.character(
      annotation[as.character(annotation$gene_id) == anno$gene[i], ]$seqnames
    ))
    # Gene span = smallest start and largest end over all of its rows.
    anno$start[i] <- min(as.numeric(as.character(
      annotation[as.character(annotation$gene_id) == anno$gene[i], ]$start
    )))
    anno$end[i] <- max(as.numeric(as.character(
      annotation[as.character(annotation$gene_id) == anno$gene[i], ]$end
    )))
    anno$strand[i] <- unique(as.character(
      annotation[as.character(annotation$gene_id) == anno$gene[i], ]$strand
    ))
    anno$symbol[i] <- unique(as.character(
      annotation[as.character(annotation$gene_id) == anno$gene[i], ]$gene_name
    ))
  }

  # Assemble the summary table into a GRanges object; ranges are named by
  # gene id and the gene symbol is kept as a metadata column.
  annoData <- with(anno, GRanges(
    seqnames = chr,
    ranges = IRanges(start = start, end = end, names = gene),
    strand = strand,
    symbol = symbol
  ))
})
# Print the resulting GRanges object.
annoData
## GRanges object with 8491 ranges and 1 metadata column:
## seqnames ranges strand | symbol
## <Rle> <IRanges> <Rle> | <character>
## ENSG00000223972.5 chr1 [11869, 14409] + | DDX11L1
## ENSG00000227232.5 chr1 [14404, 29570] - | WASH7P
## ENSG00000278267.1 chr1 [17369, 17436] - | MIR6859-1
## ENSG00000243485.5 chr1 [29554, 31109] + | RP11-34P13.3
## ENSG00000284332.1 chr1 [30366, 30503] + | MIR1302-2
## ... ... ... ... . ...
## ENSG00000082126.17 chr2 [201644870, 201698694] - | MPP4
## ENSG00000222972.1 chr2 [201646716, 201646820] + | RNU6-651P
## ENSG00000003393.14 chr2 [201700554, 201781189] - | ALS2
## ENSG00000230799.1 chr2 [201762737, 201763573] + | AC007279.2
## ENSG00000138395.14 chr2 [201790461, 201895550] + | CDK15
## -------
## seqinfo: 2 sequences from an unspecified genome; no seqlengths
Improved method
## GRanges object with 8491 ranges and 1 metadata column:
## seqnames ranges strand | symbol
## <Rle> <IRanges> <Rle> | <character>
## [1] chr1 [11869, 14409] + | DDX11L1
## [2] chr1 [14404, 29570] - | WASH7P
## [3] chr1 [17369, 17436] - | MIR6859-1
## [4] chr1 [29554, 31109] + | RP11-34P13.3
## [5] chr1 [30366, 30503] + | MIR1302-2
## ... ... ... ... . ...
## [8487] chr2 [201644870, 201698694] - | MPP4
## [8488] chr2 [201646716, 201646820] + | RNU6-651P
## [8489] chr2 [201700554, 201781189] - | ALS2
## [8490] chr2 [201762737, 201763573] + | AC007279.2
## [8491] chr2 [201790461, 201895550] + | CDK15
## -------
## seqinfo: 25 sequences from an unspecified genome; no seqlengths
Check if results are the same
## [1] TRUE
Is parallelizing faster?
## GRanges object with 8491 ranges and 1 metadata column:
## seqnames ranges strand | symbol
## <Rle> <IRanges> <Rle> | <character>
## [1] chr1 [11869, 14409] + | DDX11L1
## [2] chr1 [14404, 29570] - | WASH7P
## [3] chr1 [17369, 17436] - | MIR6859-1
## [4] chr1 [29554, 31109] + | RP11-34P13.3
## [5] chr1 [30366, 30503] + | MIR1302-2
## ... ... ... ... . ...
## [8487] chr2 [201644870, 201698694] - | MPP4
## [8488] chr2 [201646716, 201646820] + | RNU6-651P
## [8489] chr2 [201700554, 201781189] - | ALS2
## [8490] chr2 [201762737, 201763573] + | AC007279.2
## [8491] chr2 [201790461, 201895550] + | CDK15
## -------
## seqinfo: 25 sequences from an unspecified genome; no seqlengths
Check if results are the same
## [1] TRUE
Times
## user.self sys.self elapsed user.child sys.child
## For loop 509.532 0.004 509.538 0.000 0.000
## plyr 244.536 0.000 244.535 0.000 0.000
## plyr parallel 4.432 0.216 67.965 273.888 2.688