This analysis compares performance between the odbc package and the RStudio Professional Drivers against the RPostgreSQL package. These tests are compared on small and large tables:
There are many ways to collect data into R from a database. These methods were examined, ranked in order from best performing to least:
knitr::opts_chunk$set(collapse = TRUE)
library(DBI)
library(RPostgreSQL)
library(microbenchmark)
library(tidyverse)
library(config)
conf <- get()
con1 <- dbConnect(odbc::odbc(),
Driver = conf$driver,
Database = conf$database,
Port = conf$port,
Server = conf$server,
UID = conf$UID,
PWD = conf$PWD
)
con2 <- dbConnect(RPostgreSQL::PostgreSQL(),
dbname = conf$database,
port = conf$port,
host = conf$server,
user = conf$UID,
password = conf$PWD
)
Median result (odbc / RPostgreSQL):
cat("# dbSendQuery\n")
# dbSendQuery
f <- function(con){
rs <- dbSendQuery(con, "select * from mtcars")
dbFetch(rs)
dbClearResult(rs)
}
microbenchmark(
odbc = f(con1),
RPostgreSQL = f(con2),
times = 100
)
Unit: milliseconds
expr min lq mean median uq max neval
odbc 4.713763 4.994552 5.26611 5.132599 5.259943 15.93048 100
RPostgreSQL 11.618141 12.353627 14.13958 12.702549 13.121257 72.67674 100
cat("# dbGetQuery\n")
# dbGetQuery
microbenchmark(
odbc = dbGetQuery(con1, "select * from mtcars"),
RPostgreSQL = dbGetQuery(con2, "select * from mtcars"),
times = 100
)
Unit: milliseconds
expr min lq mean median uq max neval
odbc 4.635817 4.919612 5.135489 5.102818 5.288199 7.078986 100
RPostgreSQL 11.991524 12.559285 15.137716 12.964886 13.385518 215.381654 100
cat("# dbReadTable\n")
# dbReadTable
microbenchmark(
odbc = dbReadTable(con1, "mtcars"),
RPostgreSQL = dbReadTable(con2, "mtcars"),
times = 100
)
Unit: milliseconds
expr min lq mean median uq max neval
odbc 5.345695 5.684614 6.239272 5.879808 6.09132 20.97787 100
RPostgreSQL 12.131515 12.582161 13.306330 12.887246 13.55050 27.46183 100
cat("# collect\n")
# collect
t1 <- tbl(con1, "mtcars")
t2 <- tbl(con2, "mtcars")
microbenchmark(
odbc = collect(t1),
RPostgreSQL = collect(t2),
times = 100
)
Unit: milliseconds
expr min lq mean median uq max neval
odbc 7.337842 7.516808 8.036092 7.639473 7.950758 20.11218 100
RPostgreSQL 14.095849 14.473211 15.475779 14.699705 15.466099 41.38101 100
cat("# dplyr\n")
# dplyr
microbenchmark(
odbc = tbl(con1, "mtcars") %>% collect,
RPostgreSQL = tbl(con2, "mtcars") %>% collect,
times = 100
)
Unit: milliseconds
expr min lq mean median uq max neval
odbc 14.38527 14.94451 15.40478 15.11696 15.38241 21.05757 100
RPostgreSQL 26.59585 27.89762 29.22869 28.61760 29.11067 80.26529 100
Median result (small table / large table):
cat("# Small table\n")
# Small table
microbenchmark(
odbc = dbGetQuery(con1, "select * from mtcars"),
RPostgreSQL = dbGetQuery(con2, "select * from mtcars"),
times = 100
)
Unit: milliseconds
expr min lq mean median uq max neval
odbc 4.726137 5.057129 5.351527 5.213648 5.430405 14.34878 100
RPostgreSQL 12.027707 12.913718 15.706497 13.275593 13.692919 214.52382 100
cat("# Large table\n")
# Large table
microbenchmark(
odbc = dbGetQuery(con1, "select * from flights"),
RPostgreSQL = dbGetQuery(con2, "select * from flights"),
times = 5
)
Unit: seconds
expr min lq mean median uq max neval
odbc 2.548805 2.722749 2.697878 2.728980 2.731829 2.757028 5
RPostgreSQL 4.755813 4.782716 4.886354 4.958044 4.966141 4.969057 5
Median result (small table / large table):
cat("# Small table\n")
# Small table
microbenchmark(
odbc = dbGetQuery(con1, "select count(*) from mtcars"),
RPostgreSQL = dbGetQuery(con2, "select count(*) from mtcars"),
times = 100
)
Unit: milliseconds
expr min lq mean median uq max neval
odbc 4.286390 4.576856 4.756610 4.716919 4.837279 6.141267 100
RPostgreSQL 2.210132 2.318461 2.501728 2.426959 2.698684 4.173576 100
cat("# Large table\n")
# Large table
microbenchmark(
odbc = dbGetQuery(con1, "select count(*) from flights"),
RPostgreSQL = dbGetQuery(con2, "select count(*) from flights"),
times = 100
)
Unit: milliseconds
expr min lq mean median uq max neval
odbc 30.28474 31.02547 32.36276 31.28322 31.62073 96.40938 100
RPostgreSQL 27.63234 28.30320 28.69686 28.67596 28.96722 30.84177 100
Median result (month / origin & dest):
cat("# By month\n")
# By month
microbenchmark(
odbc = dbGetQuery(con1, "select month, count(*) from flights group by month"),
RPostgreSQL = dbGetQuery(con2, "select month, count(*) from flights group by month"),
times = 25
)
Unit: milliseconds
expr min lq mean median uq max neval
odbc 67.62446 68.79539 69.95684 69.17403 69.90676 85.68561 25
RPostgreSQL 66.93089 67.32935 67.69938 67.59283 67.87060 69.81070 25
cat("# By origin & dest\n")
# By origin & dest
microbenchmark(
odbc = dbGetQuery(con1, "select origin, dest, count(*) from flights group by origin, dest"),
RPostgreSQL = dbGetQuery(con2, "select origin, dest, count(*) from flights group by origin, dest"),
times = 25
)
Unit: milliseconds
expr min lq mean median uq max neval
odbc 125.1644 126.6711 127.6276 127.8869 128.4314 129.493 25
RPostgreSQL 123.0584 125.9069 129.3218 126.5206 127.9533 170.240 25
Median result:
cat("# Join and aggregate\n")
# Join and aggregate
microbenchmark(
odbc = dbGetQuery(con1, "select name, count(*) from flights, airports where origin = faa group by name"),
RPostgreSQL = dbGetQuery(con2, "select name, count(*) from flights, airports where origin = faa group by name"),
times = 25
)
Unit: milliseconds
expr min lq mean median uq max neval
odbc 153.1274 153.8547 155.2041 154.6656 156.9790 159.3070 25
RPostgreSQL 150.9227 152.8797 154.9814 153.8712 156.3154 178.3588 25
Median result (small / large):
cat("# Small table\n")
# Small table
microbenchmark(
odbc = dbWriteTable(con1, "tmp1", mtcars, overwrite = TRUE),
RPostgreSQL = dbWriteTable(con2, "tmp2", mtcars, overwrite = TRUE),
times = 25
)
Unit: milliseconds
expr min lq mean median uq max neval
odbc 86.44851 87.82760 89.44893 88.75644 90.06617 101.0527 25
RPostgreSQL 36.15477 37.30507 49.84299 39.25221 40.89235 264.8357 25
cat("# Large table\n")
# Large table
tmpdat <- nycflights13::flights[1:1000,]
microbenchmark(
odbc = dbWriteTable(con1, "tmp3", tmpdat, overwrite = TRUE),
RPostgreSQL = dbWriteTable(con2, "tmp4", tmpdat, overwrite = TRUE),
times = 5
)
Unit: milliseconds
expr min lq mean median uq max neval
odbc 865.7683 871.08201 932.02205 876.78658 879.12109 1167.35222 5
RPostgreSQL 46.9099 47.91023 50.68187 49.70822 49.95069 58.93033 5
There was no one decisive winner for all tests. The odbc R package along with the RStudio Professional Drivers was decisively faster than RPostgreSQL when collecting records. For all other operations, the differences in performance between the two packages were comparable.