Different Read Implementations for NONMEM Output Tables

The benchmark file is 74 MB, with 16 columns and ~360,000 rows

Understanding the internals of sedFread with a small file



# Debugging variant of sedFread: convert a blank-separated table to csv with
# sed, print the first five converted lines, then parse the result with fread.
#
# Args:
#   file:   path to the input table; a leading "~" is expanded by R.
#   sedCmd: sed script, already quoted for the shell. NULL selects the default
#           script (strip leading blanks, turn each run of blanks into a comma).
#   ...:    forwarded to fread().
# Returns:
#   Whatever fread() returns for the converted file.
sedFread2 <- function(file, sedCmd = NULL, ...) {
    if (is.null(sedCmd)) {
        # default : sed script converting a blank separated table to csv.
        # Thanks NeronLevelu !
        sedCmd <- "'s/^[[:blank:]]*//;s/[[:blank:]]\\{1,\\}/,/g'"
    }
    # sed into a temp file that is removed however this function exits
    tmpPath <- tempfile(pattern = "tmp", fileext = ".txt")
    on.exit(unlink(tmpPath), add = TRUE)

    # Expand "~" in R (quoting would stop the shell from doing it) and quote
    # both paths so blanks or shell metacharacters in them are passed literally.
    sysCmd <- paste("sed", sedCmd, shQuote(path.expand(file)), ">", shQuote(tmpPath))
    status <- system(sysCmd)
    if (status != 0L) {
        stop("sed exited with status ", status, " for file: ", file, call. = FALSE)
    }

    print(readLines(tmpPath, n = 5))  # see what the internal csv output looks like
    fread(tmpPath, ...)
}

# Attach the packages before anything calls fread(): sedFread2() looks fread up
# at call time, so data.table must already be on the search path.
suppressMessages(library(data.table))
suppressMessages(library(ggplot2))
suppressMessages(library(microbenchmark))

# comparing what before, during, and after looks like
readLines("~/BEproj/covariate_check/1001.ETA", n = 5)  # before
## [1] "TABLE NO.  5"                                                                                                            
## [2] " ID          REP         CLI         V2I         KAI         AMRTI       F8I         F2I         K72I        K82I"       
## [3] "  3.0000E+00  0.0000E+00  3.5036E-02  9.4326E-02  7.3651E-02 -1.9508E-01  6.1227E-01  0.0000E+00  0.0000E+00  1.4106E-01"
## [4] "  6.0000E+00  0.0000E+00  2.7961E-01  5.6817E-02  2.9769E-02 -1.1283E-01  2.3112E-01  0.0000E+00  0.0000E+00  1.7832E-01"
## [5] "  8.0000E+00  0.0000E+00  1.9332E-01  7.6032E-02  3.2132E-02 -3.1584E-01  1.5444E-01  0.0000E+00  0.0000E+00 -1.9636E-01"

sedFread2("~/BEproj/covariate_check/1001.ETA")  # during and after
## [1] "TABLE,NO.,5"                                                                                                    
## [2] "ID,REP,CLI,V2I,KAI,AMRTI,F8I,F2I,K72I,K82I"                                                                     
## [3] "3.0000E+00,0.0000E+00,3.5036E-02,9.4326E-02,7.3651E-02,-1.9508E-01,6.1227E-01,0.0000E+00,0.0000E+00,1.4106E-01" 
## [4] "6.0000E+00,0.0000E+00,2.7961E-01,5.6817E-02,2.9769E-02,-1.1283E-01,2.3112E-01,0.0000E+00,0.0000E+00,1.7832E-01" 
## [5] "8.0000E+00,0.0000E+00,1.9332E-01,7.6032E-02,3.2132E-02,-3.1584E-01,1.5444E-01,0.0000E+00,0.0000E+00,-1.9636E-01"
# NOTE: the five lines above are the csv preview printed inside sedFread2. In
# the recorded run the call then stopped with
#   Error: could not find function "fread"
# because data.table was only attached after this call; with the packages
# attached first, the parsed table is returned instead (output not recorded).


## helper functions
# Count the lines of `file` by shelling out to `wc -l`.
#
# Args:
#   file: path to a text file; a leading "~" is expanded by R before the path
#         is handed to the shell.
# Returns:
#   The line count as a numeric scalar, or invisible NULL when `file` does not
#   exist. `wc -l` counts newline characters, so a final line that lacks a
#   trailing newline is not counted.
fileRowsCount <- function(file) {
    if (!file.exists(file)) {
        return(invisible(NULL))
    }
    # Quote the expanded path so blanks or shell metacharacters in it survive,
    # and feed the file on stdin so wc prints the count alone (no file name).
    sysCmd <- paste("wc -l <", shQuote(path.expand(file)))
    rowCount <- system(sysCmd, intern = TRUE)
    # Some wc implementations pad the count with several leading blanks
    as.numeric(gsub("^\\s+|\\s+$", "", rowCount))
}

# Convert a blank-separated table to csv with sed, then parse it with fread.
#
# Args:
#   file:   path to the input table; a leading "~" is expanded by R.
#   sedCmd: sed script, already quoted for the shell. NULL selects the default
#           script (strip leading blanks, turn each run of blanks into a comma).
#   ...:    forwarded to fread().
# Returns:
#   Whatever fread() returns for the converted file.
sedFread <- function(file, sedCmd = NULL, ...) {
    if (is.null(sedCmd)) {
        # default : sed script converting a blank separated table to csv.
        # Thanks NeronLevelu !
        sedCmd <- "'s/^[[:blank:]]*//;s/[[:blank:]]\\{1,\\}/,/g'"
    }
    # sed into a temp file that is removed however this function exits
    tmpPath <- tempfile(pattern = "tmp", fileext = ".txt")
    on.exit(unlink(tmpPath), add = TRUE)

    # Expand "~" in R (quoting would stop the shell from doing it) and quote
    # both paths so blanks or shell metacharacters in them are passed literally.
    sysCmd <- paste("sed", sedCmd, shQuote(path.expand(file)), ">", shQuote(tmpPath))
    status <- system(sysCmd)
    if (status != 0L) {
        stop("sed exited with status ", status, " for file: ", file, call. = FALSE)
    }
    fread(tmpPath, ...)
}


# Baseline reader: read.table() with all of its defaults.
#
# Args:
#   file: table to read; defaults to the benchmark file used in this report.
# Returns:
#   The data.frame from read.table(), invisibly.
normal_read <- function(file = "~/BEproj/covariate_check/sdtab4000sim") {
    invisible(read.table(file))
}

# Reader that declares every column numeric via colClasses.
#
# Args:
#   file: table to read; defaults to the benchmark file used in this report.
# Returns:
#   The data.frame from read.table(), invisibly.
colClasses_read <- function(file = "~/BEproj/covariate_check/sdtab4000sim") {
    invisible(read.table(file, colClasses = "numeric"))
}

# Reader that declares every column numeric and also passes the file's line
# count (from fileRowsCount) as nrows.
#
# Args:
#   file: table to read; defaults to the benchmark file used in this report.
#         The same path is used for the line count and for the read.
# Returns:
#   The data.frame from read.table(), invisibly.
colClasses_nrows_read <- function(file = "~/BEproj/covariate_check/sdtab4000sim") {
    invisible(read.table(file, colClasses = "numeric", nrows = fileRowsCount(file)))
}


# sed to convert to csv, then fread
# Reader that goes through sedFread with its default sed script.
#
# Args:
#   file: table to read; defaults to the benchmark file used in this report.
# Returns:
#   The result of sedFread(), invisibly.
sed_Fread <- function(file = "~/BEproj/covariate_check/sdtab4000sim") {
    invisible(sedFread(file))
}

# Time the four reader variants: 20 evaluations each, reported in seconds.
bm <- microbenchmark(normal_read(), colClasses_read(), colClasses_nrows_read(), 
    sed_Fread(), times = 20L, unit = "s")
bm
## Unit: seconds
##                     expr   min    lq median    uq   max neval
##            normal_read() 8.708 8.854  8.933 9.171 9.407    20
##        colClasses_read() 6.273 6.324  6.409 6.481 8.215    20
##  colClasses_nrows_read() 6.205 6.244  6.290 6.350 6.751    20
##              sed_Fread() 8.487 8.530  8.554 8.602 8.856    20
# In the recorded run above both colClasses variants have medians near 6.3-6.4 s,
# while plain read.table (8.9 s) and sed + fread (8.6 s) are slower.
# Draw the benchmark results as a plot.
autoplot(bm)

Figure: plot produced by autoplot(bm) (knitr chunk unnamed-chunk-4)


# Record the R version, platform, locale and package versions this report ran under.
sessionInfo()
## R version 3.0.2 (2013-09-25)
## Platform: x86_64-pc-linux-gnu (64-bit)
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C         LC_TIME=C           
##  [4] LC_COLLATE=C         LC_MONETARY=C        LC_MESSAGES=C       
##  [7] LC_PAPER=C           LC_NAME=C            LC_ADDRESS=C        
## [10] LC_TELEPHONE=C       LC_MEASUREMENT=C     LC_IDENTIFICATION=C 
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] microbenchmark_1.3-0 ggplot2_0.9.3.1      data.table_1.9.2    
## [4] knitr_1.5           
## 
## loaded via a namespace (and not attached):
##  [1] MASS_7.3-29        RColorBrewer_1.0-5 Rcpp_0.11.0       
##  [4] codetools_0.2-8    colorspace_1.2-4   dichromat_2.0-0   
##  [7] digest_0.6.4       evaluate_0.5.1     formatR_0.10      
## [10] grid_3.0.2         gtable_0.1.2       labeling_0.2      
## [13] munsell_0.4.2      plyr_1.8.1         proto_0.3-10      
## [16] reshape2_1.2.2     scales_0.2.3       stringr_0.6.2     
## [19] tools_3.0.2