We would like to calculate the run time needed to perform two-sample testing (for differential expression analysis) using various datasets containing varying numbers of probes (n).
We will begin by loading necessary packages and the photorec dataset
library(plyr)
library(lattice)
prDat <- read.table("GSE4051_data.tsv")
str(prDat, max.level = 0)
## 'data.frame': 29949 obs. of 39 variables:
prDes <- readRDS("GSE4051_design.rds")
str(prDes)
## 'data.frame': 39 obs. of 4 variables:
## $ sidChar : chr "Sample_20" "Sample_21" "Sample_22" "Sample_23" ...
## $ sidNum : num 20 21 22 23 16 17 6 24 25 26 ...
## $ devStage: Factor w/ 5 levels "E16","P2","P6",..: 1 1 1 1 1 1 1 2 2 2 ...
## $ gType : Factor w/ 2 levels "wt","NrlKO": 1 1 1 1 2 2 2 1 1 1 ...
Let's draw increasing numbers (n) of probes from the dataset
n <- seq(from = 50, to = 1000, by = 100)
set.seed(72)
Now we will write a function that prepares a dataframe of n probes and then calculates run time for differential gene expression analysis
Calc.Run.Time <- function(x) {
random.probes <- sample(1:nrow(prDat), x)
mini.df <- prDat[random.probes, ]
mini.df <- data.frame(gExp = as.vector(t(as.matrix(mini.df))), gene = factor(rep(rownames(mini.df),
each = ncol(mini.df)), levels = rownames(mini.df)))
mini.df <- suppressWarnings(data.frame(prDes, mini.df))
run.time <- system.time(ppval <- suppressWarnings(ddply(mini.df, ~gene,
function(w) {
tt <- t.test(gExp ~ gType, w)
wt <- wilcox.test(gExp ~ gType, w)
ks <- with(w, ks.test(gExp[gType == "NrlKO"], gExp[gType == "wt"]))
round(c(t.test = tt$p.value, Wilcoxon = wt$p.value, KS = ks$p.value),
4)
})))
return(data.frame(probes = x, elapsed = run.time["elapsed"]))
}
Now let's compute run times for each value of n
nRunTimes <- lapply(n, Calc.Run.Time)
run.time.df <- do.call(rbind, nRunTimes)
head(run.time.df)
## probes elapsed
## elapsed 50 0.289
## elapsed1 150 0.787
## elapsed2 250 1.318
## elapsed3 350 1.830
## elapsed4 450 2.400
## elapsed5 550 2.974
Plot the elapsed time against number of probes in dataset
xyplot(elapsed ~ probes, run.time.df, xlab = "Number of probes", ylab = "Run time")
As we would expect, the run time increases linearly with increasing number of probes in the dataset