# The input file is 74 MB, with 16 columns and ~360,000 rows.
# Understanding the internals of sedFread using a small file
# Convert a text table to csv with sed, print the first lines of the
# intermediate csv, and read the result with fread().
#
# Args:
#   file:   path to the input text file (a leading "~" is expanded).
#   sedCmd: sed script, already quoted for the shell. NULL selects the default
#           script that turns a blank-separated table into csv.
#   ...:    further arguments forwarded to fread().
#
# Returns: the value of fread() for the converted file.
# Stops with an error if `file` does not exist or sed exits with a non-zero
# status.
sedFread2 <- function(file, sedCmd = NULL, ...) {
  stopifnot(is.character(file), length(file) == 1L)
  if (is.null(sedCmd)) {
    # default: drop leading blanks, then turn each run of blanks into a comma
    # (blank separated table -> csv). Thanks NeronLevelu!
    sedCmd <- "'s/^[[:blank:]]*//;s/[[:blank:]]\\{1,\\}/,/g'"
  }
  # expand "~" ourselves: the shell will not do it once the path is quoted
  srcPath <- path.expand(file)
  if (!file.exists(srcPath)) {
    stop("input file not found: ", file, call. = FALSE)
  }
  # sed into a temp file; remove it on exit even when fread() fails
  tmpPath <- tempfile(pattern = "tmp", fileext = ".txt")
  on.exit(unlink(tmpPath), add = TRUE)
  # quote both paths so spaces or shell metacharacters cannot break the command
  sysCmd <- paste("sed", sedCmd, shQuote(srcPath), ">", shQuote(tmpPath))
  status <- system(sysCmd)
  if (status != 0) {
    stop("sed exited with status ", status, call. = FALSE)
  }
  print(readLines(tmpPath, n = 5)) # see what the internal csv output looks like
  fread(tmpPath, ...)
}
# Compare what the data looks like before, during, and after the conversion:
# "before" is the raw input file, "during" is the intermediate csv printed by
# sedFread2, and "after" is the value sedFread2 returns.
readLines("~/BEproj/covariate_check/1001.ETA", n = 5) # before: first 5 raw lines
## [1] "TABLE NO. 5"
## [2] " ID REP CLI V2I KAI AMRTI F8I F2I K72I K82I"
## [3] " 3.0000E+00 0.0000E+00 3.5036E-02 9.4326E-02 7.3651E-02 -1.9508E-01 6.1227E-01 0.0000E+00 0.0000E+00 1.4106E-01"
## [4] " 6.0000E+00 0.0000E+00 2.7961E-01 5.6817E-02 2.9769E-02 -1.1283E-01 2.3112E-01 0.0000E+00 0.0000E+00 1.7832E-01"
## [5] " 8.0000E+00 0.0000E+00 1.9332E-01 7.6032E-02 3.2132E-02 -3.1584E-01 1.5444E-01 0.0000E+00 0.0000E+00 -1.9636E-01"
# NOTE: sedFread2 calls fread(), so data.table must be attached before this
# call. The "could not find function" error in the output below comes from
# running it before library(data.table).
sedFread2("~/BEproj/covariate_check/1001.ETA") # during and after
## [1] "TABLE,NO.,5"
## [2] "ID,REP,CLI,V2I,KAI,AMRTI,F8I,F2I,K72I,K82I"
## [3] "3.0000E+00,0.0000E+00,3.5036E-02,9.4326E-02,7.3651E-02,-1.9508E-01,6.1227E-01,0.0000E+00,0.0000E+00,1.4106E-01"
## [4] "6.0000E+00,0.0000E+00,2.7961E-01,5.6817E-02,2.9769E-02,-1.1283E-01,2.3112E-01,0.0000E+00,0.0000E+00,1.7832E-01"
## [5] "8.0000E+00,0.0000E+00,1.9332E-01,7.6032E-02,3.2132E-02,-3.1584E-01,1.5444E-01,0.0000E+00,0.0000E+00,-1.9636E-01"
## Error: could not find function "fread"
suppressMessages(library(data.table))
suppressMessages(library(ggplot2))
suppressMessages(library(microbenchmark))
## helper functions
# Count the lines of a file with `wc -l`.
#
# Args:
#   file: path to the file (a leading "~" is expanded).
#
# Returns: the line count reported by `wc -l` as a numeric scalar, or NULL
# (invisibly) when the file does not exist.
fileRowsCount <- function(file) {
  # expand "~" ourselves: the shell will not do it once the path is quoted
  srcPath <- path.expand(file)
  if (!file.exists(srcPath)) {
    return(invisible(NULL))
  }
  # quote the path so spaces or shell metacharacters cannot break the command
  sysCmd <- paste("wc -l", shQuote(srcPath))
  wcOut <- system(sysCmd, intern = TRUE)
  # some wc implementations left-pad the count with several blanks, so strip
  # the whole run of leading whitespace before taking the first field
  wcOut <- sub("^\\s+", "", wcOut)
  as.numeric(strsplit(wcOut, "\\s+")[[1]][1])
}
# Convert a text table to csv with sed and read the result with fread().
#
# Args:
#   file:   path to the input text file (a leading "~" is expanded).
#   sedCmd: sed script, already quoted for the shell. NULL selects the default
#           script that turns a blank-separated table into csv.
#   ...:    further arguments forwarded to fread().
#
# Returns: the value of fread() for the converted file.
# Stops with an error if `file` does not exist or sed exits with a non-zero
# status.
sedFread <- function(file, sedCmd = NULL, ...) {
  stopifnot(is.character(file), length(file) == 1L)
  if (is.null(sedCmd)) {
    # default: drop leading blanks, then turn each run of blanks into a comma
    # (blank separated table -> csv). Thanks NeronLevelu!
    sedCmd <- "'s/^[[:blank:]]*//;s/[[:blank:]]\\{1,\\}/,/g'"
  }
  # expand "~" ourselves: the shell will not do it once the path is quoted
  srcPath <- path.expand(file)
  if (!file.exists(srcPath)) {
    stop("input file not found: ", file, call. = FALSE)
  }
  # sed into a temp file; remove it on exit even when fread() fails
  tmpPath <- tempfile(pattern = "tmp", fileext = ".txt")
  on.exit(unlink(tmpPath), add = TRUE)
  # quote both paths so spaces or shell metacharacters cannot break the command
  sysCmd <- paste("sed", sedCmd, shQuote(srcPath), ">", shQuote(tmpPath))
  status <- system(sysCmd)
  if (status != 0) {
    stop("sed exited with status ", status, call. = FALSE)
  }
  fread(tmpPath, ...)
}
# Baseline reader: read.table() with its default arguments.
#
# Args:
#   file: path to the table; defaults to the file timed in this script.
#
# Returns: the data.frame from read.table(), invisibly.
normal_read <- function(file = "~/BEproj/covariate_check/sdtab4000sim") {
  invisible(read.table(file))
}
# Reader that tells read.table() up front that every column is numeric.
#
# Args:
#   file: path to the table; defaults to the file timed in this script.
#
# Returns: the data.frame from read.table(), invisibly.
colClasses_read <- function(file = "~/BEproj/covariate_check/sdtab4000sim") {
  invisible(read.table(file, colClasses = "numeric"))
}
# Reader that gives read.table() both the column type (all numeric) and the
# row count, the latter obtained from fileRowsCount().
#
# Args:
#   file: path to the table; defaults to the file timed in this script.
#
# Returns: the data.frame from read.table(), invisibly.
colClasses_nrows_read <- function(file = "~/BEproj/covariate_check/sdtab4000sim") {
  invisible(read.table(file, colClasses = "numeric",
                       nrows = fileRowsCount(file)))
}
# sed to convert to csv, then fread
# Reader that goes through sedFread() with its default sed script.
#
# Args:
#   file: path to the table; defaults to the file timed in this script.
#
# Returns: the value of sedFread(), invisibly.
sed_Fread <- function(file = "~/BEproj/covariate_check/sdtab4000sim") {
  invisible(sedFread(file))
}
# Time the four readers, 20 evaluations each, reporting in seconds.
bm <- microbenchmark(normal_read(), colClasses_read(), colClasses_nrows_read(),
sed_Fread(), times = 20L, unit = "s")
# show the timing summary
bm
## Unit: seconds
## expr min lq median uq max neval
## normal_read() 8.708 8.854 8.933 9.171 9.407 20
## colClasses_read() 6.273 6.324 6.409 6.481 8.215 20
## colClasses_nrows_read() 6.205 6.244 6.290 6.350 6.751 20
## sed_Fread() 8.487 8.530 8.554 8.602 8.856 20
# plot the benchmark timings
autoplot(bm)
# record the R and package versions used for this run
sessionInfo()
## R version 3.0.2 (2013-09-25)
## Platform: x86_64-pc-linux-gnu (64-bit)
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C LC_TIME=C
## [4] LC_COLLATE=C LC_MONETARY=C LC_MESSAGES=C
## [7] LC_PAPER=C LC_NAME=C LC_ADDRESS=C
## [10] LC_TELEPHONE=C LC_MEASUREMENT=C LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] microbenchmark_1.3-0 ggplot2_0.9.3.1 data.table_1.9.2
## [4] knitr_1.5
##
## loaded via a namespace (and not attached):
## [1] MASS_7.3-29 RColorBrewer_1.0-5 Rcpp_0.11.0
## [4] codetools_0.2-8 colorspace_1.2-4 dichromat_2.0-0
## [7] digest_0.6.4 evaluate_0.5.1 formatR_0.10
## [10] grid_3.0.2 gtable_0.1.2 labeling_0.2
## [13] munsell_0.4.2 plyr_1.8.1 proto_0.3-10
## [16] reshape2_1.2.2 scales_0.2.3 stringr_0.6.2
## [19] tools_3.0.2