Filtering a large number of rows
library(data.table)
library(dplyr)
##
## Attaching package: 'dplyr'
##
## The following object is masked from 'package:data.table':
##
## last
##
## The following objects are masked from 'package:stats':
##
## filter, lag
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(microbenchmark)
# Reproducible test data: 1e7 rows, each tagged with one of 10,000 person ids.
set.seed(1)
mydf <- data.frame(
  person = sample(10000, size = 1e+07, replace = TRUE),
  value = runif(n = 1e+07)
)
# data.table copy of the same rows for the data.table benchmark.
DT <- data.table(mydf)
# Shared by every benchmarked function:
# A = the distinct person ids, B = a random 10% of them (the ids to keep).
A <- unique(mydf[["person"]])
B <- sample(A, size = ceiling(0.1 * length(A)), replace = FALSE)
#' Base R row filter using a logical mask.
#'
#' @param df Data frame with a `person` column. Defaults to the global `mydf`
#'   (evaluated lazily, so `fun1a()` behaves exactly as before).
#' @param ids Values of `person` to keep. Defaults to the global `B`.
#' @return The rows of `df` whose `person` value occurs in `ids`, with all
#'   columns and the original row names.
fun1a <- function(df = mydf, ids = B) {
  # `%in%` never returns NA, so the mask can index rows directly.
  df[df$person %in% ids, ]
}
#' Base R row filter using integer positions.
#'
#' Same selection as the logical-mask variant, but the mask is converted to
#' row positions with `which()` before indexing.
#'
#' @param df Data frame with a `person` column. Defaults to the global `mydf`
#'   (evaluated lazily, so `fun1b()` behaves exactly as before).
#' @param ids Values of `person` to keep. Defaults to the global `B`.
#' @return The rows of `df` whose `person` value occurs in `ids`, with all
#'   columns and the original row names.
fun1b <- function(df = mydf, ids = B) {
  rows <- which(df$person %in% ids)
  df[rows, ]
}
#' dplyr row filter.
#'
#' @param df Data frame with a `person` column. Defaults to the global `mydf`
#'   (evaluated lazily, so `fun2()` behaves exactly as before).
#' @param ids Values of `person` to keep. Defaults to the global `B`.
#'   NOTE(review): inside `filter()` a column of `df` named `ids` would take
#'   precedence over this argument.
#' @return The rows of `df` whose `person` value occurs in `ids`.
fun2 <- function(df = mydf, ids = B) {
  # Namespace-qualified: `filter` also exists in stats, so the bare name
  # depends on the order in which packages were attached.
  dplyr::filter(df, person %in% ids)
}
#' data.table row filter.
#'
#' @param dt data.table with a `person` column. Defaults to the global `DT`
#'   (evaluated lazily, so `fun3()` behaves exactly as before).
#' @param ids Values of `person` to keep. Defaults to the global `B`.
#'   NOTE(review): inside `[` a column of `dt` named `ids` would take
#'   precedence over this argument.
#' @return The rows of `dt` whose `person` value occurs in `ids`.
fun3 <- function(dt = DT, ids = B) {
  # `person` is resolved as a column of `dt`; `which()` turns the mask into
  # row positions, mirroring the base R integer-position variant.
  dt[which(person %in% ids)]
}
# Time each filtering strategy, 20 evaluations per expression.
microbenchmark(
  fun1a(),
  fun1b(),
  fun2(),
  fun3(),
  times = 20
)
## Unit: milliseconds
## expr min lq median uq max neval
## fun1a() 1752.0 1791.8 1799.8 1828.0 2230.9 20
## fun1b() 711.7 744.9 774.6 811.9 1022.1 20
## fun2() 673.4 692.2 718.5 750.4 981.0 20
## fun3() 647.7 659.9 668.4 710.6 744.3 20
# Sanity check: the base R, dplyr and data.table approaches must select the
# same rows. Only column contents are compared (check.attributes = FALSE):
# the base R subset keeps the original row indices as row names and the
# data.table result carries a different class, so an attribute-sensitive
# comparison can report differences even when the selected data are identical.
all.equal(fun1b(), fun2(), check.attributes = FALSE)
## [1] TRUE
all.equal(fun1b(), fun3(), check.attributes = FALSE)
## [1] TRUE