library(AnnotationHub)
Attaching package: ‘AnnotationHub’
The following object is masked from ‘package:Biobase’:
cache
library(GEOquery)
Setting options('download.file.method.GEOquery'='auto')
Setting options('GEOquery.inmemory.gpl'=FALSE)
library(data.table)
data.table 1.10.4
The fastest way to learn (by data.table authors): https://www.datacamp.com/courses/data-analysis-the-data-table-way
Documentation: ?data.table, example(data.table) and browseVignettes("data.table")
Release notes, videos and slides: http://r-datatable.com
Attaching package: ‘data.table’
The following objects are masked from ‘package:reshape2’:
dcast, melt
The following object is masked from ‘package:SummarizedExperiment’:
shift
The following object is masked from ‘package:GenomicRanges’:
shift
The following object is masked from ‘package:IRanges’:
shift
The following objects are masked from ‘package:S4Vectors’:
first, second
suppressMessages(library("DESeq2"))
hub = AnnotationHub()
snapshotDate(): 2017-04-25
gse = query(hub, "GSE62944")
suppl <- GEOquery::getGEOSuppFiles("GSE62944")
https://ftp.ncbi.nlm.nih.gov/geo/series/GSE62nnn/GSE62944/suppl/
OK
trying URL 'https://ftp.ncbi.nlm.nih.gov/geo/series/GSE62nnn/GSE62944/suppl//GSE62944_01_27_15_TCGA_20_420_Clinical_Variables_7706_Samples.txt.gz'
Content type 'application/x-gzip' length 529880 bytes (517 KB)
==================================================
downloaded 517 KB
trying URL 'https://ftp.ncbi.nlm.nih.gov/geo/series/GSE62nnn/GSE62944/suppl//GSE62944_01_27_15_TCGA_20_CancerType_Samples.txt.gz'
Content type 'application/x-gzip' length 57132 bytes (55 KB)
==================================================
downloaded 55 KB
trying URL 'https://ftp.ncbi.nlm.nih.gov/geo/series/GSE62nnn/GSE62944/suppl//GSE62944_06_01_15_TCGA_24_548_Clinical_Variables_9264_Samples.txt.gz'
Content type 'application/x-gzip' length 1226302 bytes (1.2 MB)
==================================================
downloaded 1.2 MB
trying URL 'https://ftp.ncbi.nlm.nih.gov/geo/series/GSE62nnn/GSE62944/suppl//GSE62944_06_01_15_TCGA_24_CancerType_Samples.txt.gz'
Content type 'application/x-gzip' length 73327 bytes (71 KB)
==================================================
downloaded 71 KB
trying URL 'https://ftp.ncbi.nlm.nih.gov/geo/series/GSE62nnn/GSE62944/suppl//GSE62944_06_01_15_TCGA_24_Normal_CancerType_Samples.txt.gz'
Content type 'application/x-gzip' length 5585 bytes
==================================================
downloaded 5585 bytes
trying URL 'https://ftp.ncbi.nlm.nih.gov/geo/series/GSE62nnn/GSE62944/suppl//GSE62944_RAW.tar'
Content type 'application/x-tar' length 6303805440 bytes (6011.8 MB)
==================================================
downloaded 6011.8 MB
trying URL 'https://ftp.ncbi.nlm.nih.gov/geo/series/GSE62nnn/GSE62944/suppl//filelist.txt'
Content type 'text/plain' length 997 bytes
==================================================
downloaded 997 bytes
clinvar <- read.table('../GSE62944_06_01_15_TCGA_24_548_Clinical_Variables_9264_Samples.txt', sep='\t', quote='', stringsAsFactors = F)
CancerType <- read.table('../GSE62944_06_01_15_TCGA_24_CancerType_Samples.txt', stringsAsFactors = F)
colnames(CancerType) <- c('ID', 'type')
NormalType <- read.table('../GSE62944_06_01_15_TCGA_24_Normal_CancerType_Samples.txt', stringsAsFactors = F)
colnames(NormalType) <- c('ID', 'type')
gbm.tumor <- subset(CancerType, type=='GBM')
gbm.normal <- subset(NormalType, type=='GBM')
gbm.normal.participants <- as.vector(sapply(gbm.normal$ID, function(x) paste(strsplit(x, "-")[[1]][1:3], collapse='-')))
gbm.tumor.participants <- as.vector(sapply(gbm.tumor$ID, function(x) paste(strsplit(x, "-")[[1]][1:3], collapse='-')))
gbm.tumor[gbm.tumor.participants %in% gbm.normal.participants,]
So we don’t have matched normals for GBM. But do we have it for say BRCA? (more easily obtainable tissue than brain)
brca.tumor <- subset(CancerType, type=='BRCA')
brca.normal <- subset(NormalType, type=='BRCA')
brca.normal.participants <- as.vector(sapply(brca.normal$ID, function(x) paste(strsplit(x, "-")[[1]][1:3], collapse='-')))
brca.tumor.participants <- as.vector(sapply(brca.tumor$ID, function(x) paste(strsplit(x, "-")[[1]][1:3], collapse='-')))
brca.tumor[brca.tumor.participants %in% brca.normal.participants,]
Loads of them! So we have no way but to work with GBM sameples from the same center.
gbm.normal.participants <- as.vector(sapply(gbm.normal$ID, function(x) paste(strsplit(x, "-")[[1]][1:2], collapse='-')))
gbm.tumor.participants <- as.vector(sapply(gbm.tumor$ID, function(x) paste(strsplit(x, "-")[[1]][1:2], collapse='-')))
gbm.06.samples <- gbm.tumor[gbm.tumor.participants %in% gbm.normal.participants,]
gbm.06.samples
all.tumor.counts <- read.table('../GBM_TCGA_featurecounts/GSM1536837_06_01_15_TCGA_24.tumor_Rsubread_FeatureCounts.txt')
all.normal.counts <- read.table('../GBM_TCGA_featurecounts/GSM1697009_06_01_15_TCGA_24.normal_Rsubread_FeatureCounts.txt')
gbm.tumor.counts <- all.tumor.counts[, colnames(all.tumor.counts) %in% gsub("-", ".", gbm.06.samples$ID)]
gbm.normal.counts <- all.normal.counts[, colnames(all.normal.counts) %in% gsub("-", ".", gbm.normal$ID)]
gbm.merged.counts <- cbind(gbm.tumor.counts, gbm.normal.counts)
write.table(gbm.merged.counts, '../GBM_TCGA_featurecounts/GBM.normal.tumour.featureCounts.tsv')
colData <- data.frame('ID' = c(gbm.normal$ID, gbm.06.samples$ID), 'condition' = c(rep('normal', length(gbm.normal$ID)), rep('gbm', length(gbm.06.samples$ID))) )
rownames(colData) <- gsub("-", ".", colData$ID)
write.table(colData, '../GBM_TCGA_featurecounts/GBM.normal.tumour.featureCounts.conditions.tsv')
#merge(x=gbm.normal.counts, y=gbm.tumor.counts, all=TRUE)
do_DE_analysis <- function(countData, colData){
stopifnot(rownames(colData) %in% colnames(countData))
countData <- countData[, rownames(colData)]
ddsCounts <- DESeqDataSetFromMatrix(countData=countData, colData=colData, design=~condition)
featureData <- data.frame(gene=rownames(countData))
mcols(ddsCounts) <- DataFrame(mcols(ddsCounts), featureData)
## We dont need version numbers
## Filter genes with atleast 2 count
ddsCounts <- ddsCounts[ rowSums(counts(ddsCounts)) > 1, ]
ddsCounts$condition<-factor(ddsCounts$condition, levels=c('normal','gbm'))
dds <- DESeq(ddsCounts)
res <- results(dds, contrast=c('condition', 'gbm', 'normal'))
resOrdered <- res[order(res$padj),]
return(resOrdered)
}
countData <- as.matrix(gbm.merged.counts)
results.full <- do_DE_analysis(countData, colData)
estimating size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
-- replacing outliers and refitting for 864 genes
-- DESeq argument 'minReplicatesForReplace' = 7
-- original counts are preserved in counts(dds)
estimating dispersions
fitting model and testing
set.seed(42)
## randomly sample 5 samples from GBM so that we match the control samples
subsampled.gbm <- sample(gbm.06.samples$ID, 5)
subsampled.tumor.counts <- all.tumor.counts[, colnames(all.tumor.counts) %in% gsub("-", ".", subsampled.gbm)]
subsampled.counts <- cbind(subsampled.tumor.counts, gbm.normal.counts)
subsampled.colData <- data.frame('ID' = c(gbm.normal$ID, subsampled.gbm), 'condition' = c(rep('normal', length(gbm.normal$ID)), rep('gbm', length(subsampled.gbm))) )
rownames(subsampled.colData) <- gsub("-", ".", subsampled.colData$ID)
results.subsampled <- do_DE_analysis(as.matrix(subsampled.counts), subsampled.colData)
estimating size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
resSig.full <- as.data.frame(subset(results.full, padj<0.01))
resSig.subsampled <- as.data.frame(subset(results.subsampled, padj<0.01))
length(intersect(rownames(resSig.full), rownames(resSig.subsampled)))
[1] 6001
length(rownames(resSig.full))
[1] 8605
length(rownames(resSig.subsampled))
[1] 6965
length(intersect(rownames(resSig.full), rownames(resSig.subsampled)))/length(union(rownames(resSig.full), rownames(resSig.subsampled)))
[1] 0.6271293