Preparing gene level count table. It is adapted from RNA-seq workflow from bioconductor.org. Full version can be found here: https://www.bioconductor.org/help/workflows/rnaseqGene/. TPM vs. RPKM is adapted from https://gist.github.com/slowkow/6e34ccb4d1311b8fe62e#file-rpkm_versus_tpm-r.
library("airway")
indir <- system.file("extdata", package="airway", mustWork=TRUE)
list.files(indir)
[1] "GSE52778_series_matrix.txt" "Homo_sapiens.GRCh37.75_subset.gtf"
[3] "sample_table.csv" "SraRunInfo_SRP033351.csv"
[5] "SRR1039508_subset.bam" "SRR1039509_subset.bam"
[7] "SRR1039512_subset.bam" "SRR1039513_subset.bam"
[9] "SRR1039516_subset.bam" "SRR1039517_subset.bam"
[11] "SRR1039520_subset.bam" "SRR1039521_subset.bam"
# design table
csvfile <- file.path(indir, "sample_table.csv")
sampleTable <- read.csv(csvfile, row.names = 1)
sampleTable
filenames <- file.path(indir, paste0(sampleTable$Run, "_subset.bam"))
file.exists(filenames)
[1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
library("Rsamtools")
bamfiles <- BamFileList(filenames, yieldSize=2000000)
bamfiles
BamFileList of length 8
names(8): SRR1039508_subset.bam SRR1039509_subset.bam ... SRR1039521_subset.bam
seqinfo(bamfiles[1])
Seqinfo object with 84 sequences from an unspecified genome:
seqnames seqlengths isCircular genome
1 249250621 <NA> <NA>
10 135534747 <NA> <NA>
11 135006516 <NA> <NA>
12 133851895 <NA> <NA>
13 115169878 <NA> <NA>
... ... ... ...
GL000210.1 27682 <NA> <NA>
GL000231.1 27386 <NA> <NA>
GL000229.1 19913 <NA> <NA>
GL000226.1 15008 <NA> <NA>
GL000207.1 4262 <NA> <NA>
sapply(scanBam(bamfiles[[1]])[[1]], function(x)head(x,1))
$qname
[1] "SRR1039508.8900698"
$flag
[1] 99
$rname
[1] 1
$strand
[1] 1
$pos
[1] 11053773
$qwidth
[1] 63
$mapq
[1] 255
$cigar
[1] "63M"
$mrnm
[1] 1
$mpos
[1] 11053840
$isize
[1] 130
$seq
A DNAStringSet instance of length 1
width seq
[1] 63 ATACAAAAATTAGCCGGGCGTGGTGGCGCACGCCCGCAATCCCAGCTACTCAGGAAGCTGAGG
$qual
A PhredQuality instance of length 1
width seq
[1] 63 HJJJJJIJJJJJAGGIJJIJJJJFIJJJJJHBDFDDDDDDDDDDDDDDDDDDDDDDDDDDDDD
library("GenomicFeatures")
gtffile <- file.path(indir,"Homo_sapiens.GRCh37.75_subset.gtf")
txdb <- makeTxDbFromGFF(gtffile, format = "gtf", circ_seqs = character())
Import genomic features from the file as a GRanges object ... OK
Prepare the 'metadata' data frame ... OK
Make the TxDb object ... Named parameters not used in query: internal_chrom_id, chrom, length, is_circularNamed parameters not used in query: internal_id, name, type, chrom, strand, start, endNamed parameters not used in query: internal_id, name, chrom, strand, start, endNamed parameters not used in query: internal_id, name, chrom, strand, start, endNamed parameters not used in query: internal_tx_id, exon_rank, internal_exon_id, internal_cds_idNamed parameters not used in query: gene_id, internal_tx_idOK
txdb
TxDb object:
# Db type: TxDb
# Supporting package: GenomicFeatures
# Data source: /Users/lis/Library/R/3.3/library/airway/extdata/Homo_sapiens.GRCh37.75_subset.gtf
# Organism: NA
# Taxonomy ID: NA
# miRBase build ID: NA
# Genome: NA
# transcript_nrow: 65
# exon_nrow: 279
# cds_nrow: 158
# Db created by: GenomicFeatures package from Bioconductor
# Creation time: 2017-05-14 23:26:01 -0400 (Sun, 14 May 2017)
# GenomicFeatures version at creation time: 1.26.4
# RSQLite version at creation time: 1.1-2
# DBSCHEMAVERSION: 1.1
ebg <- exonsBy(txdb, by="gene")
ebg
GRangesList object of length 20:
$ENSG00000009724
GRanges object with 18 ranges and 2 metadata columns:
seqnames ranges strand | exon_id exon_name
<Rle> <IRanges> <Rle> | <integer> <character>
[1] 1 [11086580, 11087705] - | 98 ENSE00000818830
[2] 1 [11090233, 11090307] - | 99 ENSE00000472123
[3] 1 [11090805, 11090939] - | 100 ENSE00000743084
[4] 1 [11094885, 11094963] - | 101 ENSE00000743085
[5] 1 [11097750, 11097868] - | 102 ENSE00003482788
... ... ... ... . ... ...
[14] 1 [11106948, 11107176] - | 111 ENSE00003467404
[15] 1 [11106948, 11107176] - | 112 ENSE00003489217
[16] 1 [11107260, 11107280] - | 113 ENSE00001833377
[17] 1 [11107260, 11107284] - | 114 ENSE00001472289
[18] 1 [11107260, 11107290] - | 115 ENSE00001881401
...
<19 more elements>
-------
seqinfo: 1 sequence from an unspecified genome; no seqlengths
gene.length=sapply(ebg, function(x)sum(width(reduce(x))))
gene.length
ENSG00000009724 ENSG00000116649 ENSG00000120942 ENSG00000120948 ENSG00000171819 ENSG00000171824
4251 1898 5082 4622 2238 6003
ENSG00000175262 ENSG00000198793 ENSG00000207213 ENSG00000207451 ENSG00000215785 ENSG00000225602
3573 12119 84 107 494 763
ENSG00000226849 ENSG00000230337 ENSG00000238173 ENSG00000238199 ENSG00000253086 ENSG00000264181
2682 536 150 437 103 88
ENSG00000271794 ENSG00000271895
107 870
library("GenomicAlignments")
library("BiocParallel")
register(SerialParam())
se <- summarizeOverlaps(features=ebg, reads=bamfiles,
mode="Union",
singleEnd=FALSE,
ignore.strand=TRUE,
fragments=TRUE )
head(assay(se), 3)
SRR1039508_subset.bam SRR1039509_subset.bam SRR1039512_subset.bam
ENSG00000009724 38 28 66
ENSG00000116649 1004 1255 1122
ENSG00000120942 218 256 233
SRR1039513_subset.bam SRR1039516_subset.bam SRR1039517_subset.bam
ENSG00000009724 24 42 41
ENSG00000116649 1313 1100 1879
ENSG00000120942 252 269 465
SRR1039520_subset.bam SRR1039521_subset.bam
ENSG00000009724 47 36
ENSG00000116649 745 1536
ENSG00000120942 207 400
colData(se)=DataFrame(sampleTable)
colData(se)
DataFrame with 8 rows and 9 columns
SampleName cell dex albut Run avgLength Experiment
<character> <character> <character> <character> <character> <integer> <character>
SRR1039508 GSM1275862 N61311 untrt untrt SRR1039508 126 SRX384345
SRR1039509 GSM1275863 N61311 trt untrt SRR1039509 126 SRX384346
SRR1039512 GSM1275866 N052611 untrt untrt SRR1039512 126 SRX384349
SRR1039513 GSM1275867 N052611 trt untrt SRR1039513 87 SRX384350
SRR1039516 GSM1275870 N080611 untrt untrt SRR1039516 120 SRX384353
SRR1039517 GSM1275871 N080611 trt untrt SRR1039517 126 SRX384354
SRR1039520 GSM1275874 N061011 untrt untrt SRR1039520 101 SRX384357
SRR1039521 GSM1275875 N061011 trt untrt SRR1039521 98 SRX384358
Sample BioSample
<character> <character>
SRR1039508 SRS508568 SAMN02422669
SRR1039509 SRS508567 SAMN02422675
SRR1039512 SRS508571 SAMN02422678
SRR1039513 SRS508572 SAMN02422670
SRR1039516 SRS508575 SAMN02422682
SRR1039517 SRS508576 SAMN02422673
SRR1039520 SRS508579 SAMN02422683
SRR1039521 SRS508580 SAMN02422677
se
class: RangedSummarizedExperiment
dim: 20 8
metadata(0):
assays(1): counts
rownames(20): ENSG00000009724 ENSG00000116649 ... ENSG00000271794 ENSG00000271895
rowData names(0):
colnames(8): SRR1039508 SRR1039509 ... SRR1039520 SRR1039521
colData names(9): SampleName cell ... Sample BioSample
rpkm <- function(counts, lengths) {
rate <- counts / lengths
rate / sum(counts) * 1e6
}
tpm <- function(counts, lengths) {
rate <- counts / lengths
rate / sum(rate) * 1e6
}
counts=assay(se)
rpkms <- apply(counts, 2, function(x) rpkm(x, gene.length))
head(rpkms)
SRR1039508_subset.bam SRR1039509_subset.bam SRR1039512_subset.bam
ENSG00000009724 1.3799125 1.013180 2.016594
ENSG00000116649 81.6575905 101.710866 76.782514
ENSG00000120942 6.6218736 7.748634 5.955071
ENSG00000120948 91.8797290 69.223448 94.225683
ENSG00000171819 0.2759046 3.436606 1.102705
ENSG00000171824 22.3465503 27.546090 24.125271
SRR1039513_subset.bam SRR1039516_subset.bam SRR1039517_subset.bam
ENSG00000009724 0.8301324 1.23361571 0.8890026
ENSG00000116649 101.7175154 72.36326993 91.2516787
ENSG00000120942 7.2911008 6.60905438 8.4339026
ENSG00000120948 51.3453140 95.06289414 74.1064578
ENSG00000171819 35.6752457 0.05579068 0.4118606
ENSG00000171824 25.7431447 19.63474915 21.5733839
SRR1039520_subset.bam SRR1039521_subset.bam
ENSG00000009724 2.104344 0.9237124
ENSG00000116649 74.708496 88.2714789
ENSG00000120942 7.752569 8.5852061
ENSG00000120948 91.418263 46.9622105
ENSG00000171819 1.190633 52.0031598
ENSG00000171824 23.716095 28.8904413
tpms <- apply(counts, 2, function(x) tpm(x,gene.length))
head(tpms)
SRR1039508_subset.bam SRR1039509_subset.bam SRR1039512_subset.bam
ENSG00000009724 5904.669 4232.317 8778.839
ENSG00000116649 349414.209 424872.683 334257.287
ENSG00000120942 28335.109 32368.054 25924.208
ENSG00000120948 393154.912 289164.307 410192.625
ENSG00000171819 1180.601 14355.596 4800.404
ENSG00000171824 95621.266 115067.165 105024.533
SRR1039513_subset.bam SRR1039516_subset.bam SRR1039517_subset.bam
ENSG00000009724 3272.639 5564.3846 3914.086
ENSG00000116649 401001.946 326403.9665 401761.335
ENSG00000120942 28743.777 29811.0017 37132.643
ENSG00000120948 202419.129 428793.5820 326274.649
ENSG00000171819 140642.866 251.6511 1813.333
ENSG00000171824 101487.449 88565.0968 94982.927
SRR1039520_subset.bam SRR1039521_subset.bam
ENSG00000009724 9228.910 3666.959
ENSG00000116649 327645.123 350420.649
ENSG00000120942 34000.032 34081.603
ENSG00000120948 400928.267 186430.866
ENSG00000171819 5221.696 206442.457
ENSG00000171824 104010.431 114689.448
colSums(rpkms)
SRR1039508_subset.bam SRR1039509_subset.bam SRR1039512_subset.bam SRR1039513_subset.bam
233.6985 239.3914 229.7108 253.6584
SRR1039516_subset.bam SRR1039517_subset.bam SRR1039520_subset.bam SRR1039521_subset.bam
221.6985 227.1291 228.0165 251.9015
colSums(tpms)
SRR1039508_subset.bam SRR1039509_subset.bam SRR1039512_subset.bam SRR1039513_subset.bam
1e+06 1e+06 1e+06 1e+06
SRR1039516_subset.bam SRR1039517_subset.bam SRR1039520_subset.bam SRR1039521_subset.bam
1e+06 1e+06 1e+06 1e+06
colMeans(rpkms)
SRR1039508_subset.bam SRR1039509_subset.bam SRR1039512_subset.bam SRR1039513_subset.bam
11.68493 11.96957 11.48554 12.68292
SRR1039516_subset.bam SRR1039517_subset.bam SRR1039520_subset.bam SRR1039521_subset.bam
11.08493 11.35645 11.40083 12.59507
colMeans(tpms)
SRR1039508_subset.bam SRR1039509_subset.bam SRR1039512_subset.bam SRR1039513_subset.bam
50000 50000 50000 50000
SRR1039516_subset.bam SRR1039517_subset.bam SRR1039520_subset.bam SRR1039521_subset.bam
50000 50000 50000 50000