libraries <- c("vpc", "PKPDmisc", "data.table", "fastread")
sapply(libraries, require, character.only = T)
##        vpc   PKPDmisc data.table   fastread 
##       TRUE       TRUE       TRUE       TRUE
library(knitr)
opts_chunk$set(cache=T, message=F, warning=F)

The objective is to compare the read speed of several functions for loading NONMEM output tables into R.

# base R read.table with default settings
bm_read.table <- function() {
  r <- read.table("nonmem/run004_notitle_csv.vpc", header = TRUE)
}

# sed-based table reader
bm_read_nonmem_sed <- function() {
  r <- read_nonmem_sed("nonmem/run004_notitle_csv.vpc")
}

# 'R-optimized' reader: pre-allocated rows and fixed column classes
bm_read_nonmem <- function() {
  r <- read_nonmem("nonmem/run004_notitle_csv.vpc",
                   colClasses = "character", nrow = 120199,
                   stringsAsFactors = FALSE)
}

# perl-based table reader
bm_read_table_nm <- function() {
  r <- read_table_nm("nonmem/run004_notitle_csv.vpc")
}

# data.table::fread
bm_fread <- function() {
  r <- fread("nonmem/run004_notitle_csv.vpc", header = FALSE)
}

# fastread's read_csv
bm_read_csv <- function() {
  r <- read_csv("nonmem/run004_notitle_csv.vpc", col_names = FALSE)
}

The data design consisted of 50 IDs with rich sampling in a crossover of IV and oral dosing: 12 samples per occasion, for a total of 24 samples per ID.

For run004, an NSUB of 100 was used, resulting in a final dataset of approximately 120,000 rows, totaling approximately 20 MB.
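
As a quick sanity check (a sketch that assumes the run004 table sits at the path used in the wrappers above), the file size and row count can be confirmed directly:

# confirm the on-disk size and row count of the run004 table
file.size("nonmem/run004_notitle_csv.vpc") / 1024^2             # ~20 MB
nrow(fread("nonmem/run004_notitle_csv.vpc", header = FALSE))    # ~120,000 rows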

suppressMessages(library(ggplot2))
suppressMessages(library(microbenchmark))

# benchmark each read method, 20 replicates each
tm <- microbenchmark(bm_fread(), 
                     bm_read_csv(),
                     bm_read_nonmem(),
                     bm_read_nonmem_sed(),
                     bm_read_table_nm(),
                     bm_read.table(), times = 20L)
tm
## Unit: milliseconds
##                  expr       min        lq      mean    median        uq       max neval
##            bm_fread()  146.2494  164.7170  221.1164  225.5674  235.4835  477.6478    20
##         bm_read_csv()  166.4415  218.7338  245.1720  257.4366  271.1599  290.7194    20
##      bm_read_nonmem() 2475.1878 2556.0454 2635.0976 2629.8077 2710.5933 2826.1461    20
##  bm_read_nonmem_sed() 1335.0996 1362.0040 1394.5717 1396.5219 1417.2148 1506.1472    20
##    bm_read_table_nm() 3146.6488 3181.8222 3273.9012 3261.3065 3336.9847 3454.4616    20
##       bm_read.table() 3714.9453 3828.6276 3887.9343 3855.6954 3902.7690 4216.8223    20
autoplot(tm) + scale_y_continuous(breaks = c(100, 500, 1000, 2000, 3000, 4000)) +
  ylab("time, milliseconds")

# same read wrappers as above, now pointed at the larger run005 table
bm_read.table <- function() {
  r <- read.table("nonmem/run005_notitle_csv.vpc", header = TRUE)
}

bm_read_nonmem_sed <- function() {
  r <- read_nonmem_sed("nonmem/run005_notitle_csv.vpc")
}

bm_read_nonmem <- function() {
  r <- read_nonmem("nonmem/run005_notitle_csv.vpc",
                   colClasses = "character", nrow = 120199,
                   stringsAsFactors = FALSE)
}

bm_read_table_nm <- function() {
  r <- read_table_nm("nonmem/run005_notitle_csv.vpc")
}

bm_fread <- function() {
  r <- fread("nonmem/run005_notitle_csv.vpc", header = FALSE)
}

bm_read_csv <- function() {
  r <- read_csv("nonmem/run005_notitle_csv.vpc", col_names = FALSE)
}

For run005, an NSUB of 1000 was used, resulting in a final dataset of approximately 1,200,000 rows, totaling approximately 200 MB.
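
A similar check (again a sketch, assuming the file is present locally) confirms the on-disk and in-memory size of the run005 table:

# confirm the on-disk size and the in-memory footprint of the run005 table
run005 <- fread("nonmem/run005_notitle_csv.vpc", header = FALSE)
file.size("nonmem/run005_notitle_csv.vpc") / 1024^2   # ~200 MB on disk
print(object.size(run005), units = "MB")              # size once loaded into R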

suppressMessages(library(ggplot2))
suppressMessages(library(microbenchmark))

# benchmark each read method, 20 replicates each
tm2 <- microbenchmark(bm_fread(), 
                     bm_read_csv(),
                     bm_read_nonmem(),
                     bm_read_nonmem_sed(),
                     bm_read_table_nm(),
                     bm_read.table(), times = 20L)
## (fread progress messages printed repeatedly during the benchmark, e.g.:)
## Read 1201999 rows and 27 (of 27) columns from 0.187 GB file in 00:00:03
tm2
## Unit: seconds
##                  expr       min        lq      mean    median        uq       max neval
##            bm_fread()  1.579058  1.710019  1.803121  1.780046  1.913566  2.113301    20
##         bm_read_csv()  1.908847  2.118259  2.271644  2.206372  2.331011  3.738626    20
##      bm_read_nonmem()  2.444360  2.562065  2.668116  2.685839  2.778880  2.865107    20
##  bm_read_nonmem_sed() 13.003735 13.334322 13.716798 13.805448 13.906848 14.487845    20
##    bm_read_table_nm() 29.409739 30.078192 30.596502 30.304177 31.189465 32.102037    20
##       bm_read.table() 32.758065 33.457997 34.500843 34.335992 35.825961 36.412897    20
autoplot(tm2) + scale_y_continuous(breaks = c(1, 5, 10, 15, 20, 25, 30, 35)) + 
  ylab("time, seconds")

In both scenarios (the roughly 20 MB and 200 MB files), the relationship between the default read.table and the fastread and fread functions held, with both fast-read implementations being approximately 15-20x faster.
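
The speed ratios can be pulled straight from the microbenchmark summaries; the helper below (an illustrative sketch, not an existing function, operating on the tm and tm2 objects created above) expresses each method's median time relative to the fastest one:

# median time of each method relative to the fastest, from the summaries above
speedup <- function(bm) {
  s <- summary(bm)
  data.frame(expr = s$expr, relative = round(s$median / min(s$median), 1))
}
speedup(tm)    # ~20 MB table
speedup(tm2)   # ~200 MB table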

The sed-based reader outperformed the perl-based one in all cases, falling between the fast-read implementations and the plain read.table call. Simply pre-allocating memory (specifying nrow) and setting colClasses beat both the sed and perl examples significantly for the larger dataset (roughly 5x and 10x, respectively); for the smaller dataset, however, the sed implementation was almost twice as fast. That said, for the smaller dataset the discrepancies are not nearly as unmanageable, as no method took more than about 4 seconds to read the file. For the larger simulated dataset, on the other hand, the gains are substantial.
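
For reference, the 'R-optimized' base approach amounts to handing read.table the row count and column classes up front so it can skip type guessing and repeated reallocation. A minimal sketch (read_nm_prealloc is an illustrative name, and the row count shown is specific to run004):

# pre-allocated base-R read: nrows and colClasses avoid type guessing
# and incremental growth of the result
read_nm_prealloc <- function(path, n_rows) {
  read.table(path, header = TRUE, nrows = n_rows,
             colClasses = "character", stringsAsFactors = FALSE)
}
# tab <- read_nm_prealloc("nonmem/run004_notitle_csv.vpc", 120199)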

Takeaways

At this point, the perl implementation (read_table_nm) can be dropped. This also removes the requirement of having perl installed on the machine doing the post-processing, which is not guaranteed when modeling is run on a cluster but post-processing is done locally. The simpler 'R-optimized' method can be kept as a fallback for end users who have neither fastread nor data.table available.
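
One way to encode that fallback logic (read_nm_table is a hypothetical helper, not an existing function; the package checks reflect the packages used above):

# prefer fread, then fastread's read_csv, then the pre-allocated base-R read
read_nm_table <- function(path, n_rows = -1) {
  if (requireNamespace("data.table", quietly = TRUE)) {
    return(data.table::fread(path, header = FALSE))
  }
  if (requireNamespace("fastread", quietly = TRUE)) {
    return(fastread::read_csv(path, col_names = FALSE))
  }
  read.table(path, header = TRUE, nrows = n_rows,
             colClasses = "character", stringsAsFactors = FALSE)
}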