Seminar 4: Run Time Calculation

We would like to calculate the run time needed to perform two-sample testing (for differential expression analysis) using various datasets containing varying numbers of probes (n).

We will begin by loading necessary packages and the photorec dataset

library(plyr)
library(lattice)
prDat <- read.table("GSE4051_data.tsv")
str(prDat, max.level = 0)

## 'data.frame':    29949 obs. of  39 variables:

prDes <- readRDS("GSE4051_design.rds")
str(prDes)

## 'data.frame':    39 obs. of  4 variables:
##  $ sidChar : chr  "Sample_20" "Sample_21" "Sample_22" "Sample_23" ...
##  $ sidNum  : num  20 21 22 23 16 17 6 24 25 26 ...
##  $ devStage: Factor w/ 5 levels "E16","P2","P6",..: 1 1 1 1 1 1 1 2 2 2 ...
##  $ gType   : Factor w/ 2 levels "wt","NrlKO": 1 1 1 1 2 2 2 1 1 1 ...

Let's draw increasing numbers (n) of probes from the dataset

n <- seq(from = 50, to = 1000, by = 100)
set.seed(72)

Now we will write a function that prepares a dataframe of n probes and then calculates run time for differential gene expression analysis

Calc.Run.Time <- function(x) {
    random.probes <- sample(1:nrow(prDat), x)
    mini.df <- prDat[random.probes, ]
    mini.df <- data.frame(gExp = as.vector(t(as.matrix(mini.df))), gene = factor(rep(rownames(mini.df), 
        each = ncol(mini.df)), levels = rownames(mini.df)))
    mini.df <- suppressWarnings(data.frame(prDes, mini.df))
    run.time <- system.time(ppval <- suppressWarnings(ddply(mini.df, ~gene, 
        function(w) {
            tt <- t.test(gExp ~ gType, w)
            wt <- wilcox.test(gExp ~ gType, w)
            ks <- with(w, ks.test(gExp[gType == "NrlKO"], gExp[gType == "wt"]))
            round(c(t.test = tt$p.value, Wilcoxon = wt$p.value, KS = ks$p.value), 
                4)
        })))
    return(data.frame(probes = x, elapsed = run.time["elapsed"]))
}

Now let's compute run times for each value of n

nRunTimes <- lapply(n, Calc.Run.Time)
run.time.df <- do.call(rbind, nRunTimes)
head(run.time.df)

##          probes elapsed
## elapsed      50   0.289
## elapsed1    150   0.787
## elapsed2    250   1.318
## elapsed3    350   1.830
## elapsed4    450   2.400
## elapsed5    550   2.974

Plot the elapsed time against number of probes in dataset

xyplot(elapsed ~ probes, run.time.df, xlab = "Number of probes", ylab = "Run time")

plot of chunk unnamed-chunk-1

As we would expect, the run time increases linearly with increasing number of probes in the dataset