This analysis compares performance between the odbc package and the RStudio Professional Drivers against the RPostgreSQL package. These tests are compared on small and large tables:

There are many ways to collect data into R from a database. These methods were examined, ranked in order from best performing to least:

odbc connection
con1 <- dbConnect(odbc::odbc(),
Driver = conf$driver,
Database = conf$database,
Port = conf$port,
Server = conf$server,
UID = conf$UID,
PWD = conf$PWD
)
RPostgreSQL connection
con2 <- dbConnect(RPostgreSQL::PostgreSQL(),
dbname = conf$database,
port = conf$port,
host = conf$server,
user = conf$UID,
password = conf$PWD
)
Query functions
Median result (odbc / RPostgreSQL):
- dbSendQuery (4.9 ms / 12.0 ms)
- dbGetQuery (4.9 ms / 12.5 ms)
- dbReadTable (5.8 ms / 12.9 ms)
- collect (7.4 ms / 14.4 ms)
- dplyr (14.9 ms / 27.6 ms)
cat("# dbSendQuery\n")
# dbSendQuery
f <- function(con){
rs <- dbSendQuery(con, "select * from mtcars")
dbFetch(rs)
dbClearResult(rs)
}
microbenchmark(
odbc = f(con1),
RPostgreSQL = f(con2),
times = 100
)
Unit: milliseconds
expr min lq mean median uq max neval
odbc 4.280109 4.424589 5.123516 4.52174 4.628829 59.40869 100
RPostgreSQL 10.708193 10.941653 12.106602 11.01144 11.139357 86.00633 100
cat("# dbGetQuery\n")
# dbGetQuery
microbenchmark(
odbc = dbGetQuery(con1, "select * from mtcars"),
RPostgreSQL = dbGetQuery(con2, "select * from mtcars"),
times = 100
)
Unit: milliseconds
expr min lq mean median uq max neval
odbc 4.369826 4.462516 4.639547 4.605441 4.78284 5.970596 100
RPostgreSQL 11.303488 11.523170 11.844457 11.642717 11.84386 21.660472 100
cat("# dbReadTable\n")
# dbReadTable
microbenchmark(
odbc = dbReadTable(con1, "mtcars"),
RPostgreSQL = dbReadTable(con2, "mtcars"),
times = 100
)
Unit: milliseconds
expr min lq mean median uq max neval
odbc 5.014894 5.224398 5.615451 5.402754 5.563106 16.63474 100
RPostgreSQL 11.410670 11.746018 12.680341 11.995234 12.305746 45.56513 100
cat("# collect\n")
# collect
t1 <- tbl(con1, "mtcars")
t2 <- tbl(con2, "mtcars")
microbenchmark(
odbc = collect(t1),
RPostgreSQL = collect(t2),
times = 100
)
Unit: milliseconds
expr min lq mean median uq max neval
odbc 6.823231 7.073533 7.937514 7.219613 7.46282 27.94842 100
RPostgreSQL 13.195611 13.699607 14.808574 13.937911 14.55319 71.37980 100
cat("# dplyr\n")
# dplyr
microbenchmark(
odbc = tbl(con1, "mtcars") %>% collect,
RPostgreSQL = tbl(con2, "mtcars") %>% collect,
times = 100
)
Unit: milliseconds
expr min lq mean median uq max neval
odbc 13.38358 13.94573 14.34430 14.22169 14.56148 19.21916 100
RPostgreSQL 24.56986 25.83352 27.05007 26.66041 27.83492 47.06476 100
Select all
Median result (small table / large table):
- odbc (5.2 ms / 2.7 s)
- RPostgres (13.2 ms / 5.0 s)
cat("# Small table\n")
# Small table
microbenchmark(
odbc = dbGetQuery(con1, "select * from mtcars"),
RPostgreSQL = dbGetQuery(con2, "select * from mtcars"),
times = 100
)
Unit: milliseconds
expr min lq mean median uq max neval
odbc 4.321348 4.611888 4.858451 4.855945 5.062818 6.202066 100
RPostgreSQL 11.200706 12.134937 12.514874 12.486769 12.801218 16.967667 100
cat("# Large table\n")
# Large table
microbenchmark(
odbc = dbGetQuery(con1, "select * from flights"),
RPostgreSQL = dbGetQuery(con2, "select * from flights"),
times = 5
)
Unit: seconds
expr min lq mean median uq max neval
odbc 2.591428 2.633310 2.763152 2.636291 2.692626 3.262106 5
RPostgreSQL 4.637700 4.650428 4.713604 4.658108 4.718158 4.903626 5
Count records
Median result (small table / large table):
- odbc (4.6 ms / 31.0 ms)
- RPostgreSQL (2.3 ms / 28.8 ms)
cat("# Small table\n")
# Small table
microbenchmark(
odbc = dbGetQuery(con1, "select count(*) from mtcars"),
RPostgreSQL = dbGetQuery(con2, "select count(*) from mtcars"),
times = 100
)
Unit: milliseconds
expr min lq mean median uq max neval
odbc 3.869836 3.972087 4.277864 4.034261 4.095987 26.725616 100
RPostgreSQL 2.092483 2.139317 2.263315 2.186958 2.311471 4.324677 100
cat("# Large table\n")
# Large table
microbenchmark(
odbc = dbGetQuery(con1, "select count(*) from flights"),
RPostgreSQL = dbGetQuery(con2, "select count(*) from flights"),
times = 100
)
Unit: milliseconds
expr min lq mean median uq max neval
odbc 29.93849 30.42176 30.72725 30.72235 30.93322 33.8406 100
RPostgreSQL 28.07182 28.47232 29.79854 28.66594 28.86219 105.1817 100
Aggregate
Median result (month / origin & dest):
- odbc (70.0 ms / 127.1 ms)
- RPostgreSQL (68.4 ms / 126.0 ms)
cat("# By month\n")
# By month
microbenchmark(
odbc = dbGetQuery(con1, "select month, count(*) from flights group by month"),
RPostgreSQL = dbGetQuery(con2, "select month, count(*) from flights group by month"),
times = 25
)
Unit: milliseconds
expr min lq mean median uq max neval
odbc 68.27169 68.52957 69.49427 69.54900 70.02755 71.79220 25
RPostgreSQL 67.36743 67.98919 69.43012 68.41578 68.78742 92.34194 25
cat("# By origin & dest\n")
# By origin & dest
microbenchmark(
odbc = dbGetQuery(con1, "select origin, dest, count(*) from flights group by origin, dest"),
RPostgreSQL = dbGetQuery(con2, "select origin, dest, count(*) from flights group by origin, dest"),
times = 25
)
Unit: milliseconds
expr min lq mean median uq max neval
odbc 124.9043 125.8146 131.6079 126.9300 127.8341 190.1420 25
RPostgreSQL 124.2582 125.3030 126.0579 125.9069 126.5720 131.3172 25
Join and aggregate
Median result:
- odbc (154.7 ms)
- RPostgreSQL (153.9 ms)
cat("# Join and aggregate\n")
# Join and aggregate
microbenchmark(
odbc = dbGetQuery(con1, "select name, count(*) from flights, airports where origin = faa group by name"),
RPostgreSQL = dbGetQuery(con2, "select name, count(*) from flights, airports where origin = faa group by name"),
times = 25
)
Unit: milliseconds
expr min lq mean median uq max neval
odbc 151.2754 154.3244 156.4881 154.7501 155.4205 198.0783 25
RPostgreSQL 150.3930 151.8969 154.4057 153.2702 154.3652 177.8400 25
Write table
Median result (small / large):
- odbc (88.8 ms / 876.8 ms)
- RPostgreSQL (39.3 ms / 49.7 ms)
cat("# Small table\n")
# Small table
microbenchmark(
odbc = dbWriteTable(con1, "tmp1", mtcars, overwrite = TRUE),
RPostgreSQL = dbWriteTable(con2, "tmp2", mtcars, overwrite = TRUE),
times = 25
)
Unit: milliseconds
expr min lq mean median uq max neval
odbc 78.44993 80.11186 81.54664 80.57682 81.30703 99.96081 25
RPostgreSQL 32.84248 33.83805 34.73810 34.11803 35.70990 37.38110 25
cat("# Large table\n")
# Large table
tmpdat <- nycflights13::flights[1:1000,]
microbenchmark(
odbc = dbWriteTable(con1, "tmp3", tmpdat, overwrite = TRUE),
RPostgreSQL = dbWriteTable(con2, "tmp4", tmpdat, overwrite = TRUE),
times = 5
)
Unit: milliseconds
expr min lq mean median uq max neval
odbc 769.39862 776.23975 796.75253 794.52786 820.40447 823.1920 5
RPostgreSQL 45.63735 46.53974 47.50063 47.15208 47.90107 50.2729 5
Summary
There was no one decisive winner for all tests. The odbc R package along with the RStudio Professional Drivers was decisively faster than RPostgreSQL when collecting records. For all other operations, the differences in performance between the two packages were comparable.
- Select all (odbc faster)
- Count all records (RPostgreSQL faster)
- Aggregate (RPostgreSQL faster)
- Join and Aggregate (odbc faster)
- Write table (odbc faster)
---
title: 'Performance comparison: odbc vs RPostgreSQL'
output: html_notebook
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(collapse = TRUE)
library(DBI)
library(RPostgreSQL)
library(microbenchmark)
library(tidyverse)
library(config)
conf <- get()
```

This analysis compares performance between the odbc package and the RStudio Professional Drivers against the RPostgreSQL package. These tests are compared on small and large tables:

```{r echo=FALSE}
data.frame(
  Test = c("Read", "Count", "Group", "Join", "Write"),
  odbc = c(2700, 31, 127.1, 154.8, 876.8),
  RPostgreSQL = c(5100, 28.8, 126.0, 155.0, 49.7),
  stringsAsFactors = FALSE
) %>%
  gather("package", "milliseconds", odbc, RPostgreSQL) %>%
  ggplot(aes(reorder(Test, milliseconds, mean), milliseconds, fill=package)) + 
  geom_bar(stat = "identity", position = "dodge") + 
  coord_flip() +
  ggtitle("Performance Comparison: odbc vs RPostgreSQL", "Median time in log10 milliseconds") +
  xlab("") +
  scale_fill_brewer(palette="Blues") +
  theme_minimal()
```

There are many ways to collect data into R from a database. These methods were examined, ranked in order from best performing to least:

```{r, echo=FALSE}
data.frame(
  Function = c("dbSendQuery", "dbGetQuery", "dbReadTable", "dplyr collect only", "dplyr"),
  MS = c(12.0, 12.5, 12.9, 14.4, 27.6)
) %>%
  ggplot(aes(reorder(Function, -MS), MS)) +
  geom_bar(stat = "Identity", fill = "steelblue") +
  coord_flip() +
  ggtitle("Performance comparison: Query methods", "Median time in milliseconds") +
  xlab("") +
  scale_fill_brewer(palette="Blues") +
  theme_minimal()
```

### odbc connection

```{r}
con1 <- dbConnect(odbc::odbc(),
  Driver = conf$driver,
  Database = conf$database,
  Port = conf$port,
  Server = conf$server, 
  UID = conf$UID, 
  PWD = conf$PWD
)
```

### RPostgreSQL connection

```{r}
con2 <- dbConnect(RPostgreSQL::PostgreSQL(),
  dbname = conf$database,
  port = conf$port,
  host = conf$server, 
  user = conf$UID, 
  password = conf$PWD
)
```

### Query functions

Median result (odbc / RPostgreSQL):

* dbSendQuery (4.9 ms / 12.0 ms)
* dbGetQuery (4.9 ms / 12.5 ms)
* dbReadTable (5.8 ms / 12.9 ms)
* collect (7.4 ms / 14.4 ms)
* dplyr (14.9 ms / 27.6 ms)

```{r, collapse=TRUE}
cat("# dbSendQuery\n")
f <- function(con){
  rs <- dbSendQuery(con, "select * from mtcars")
  dbFetch(rs)
  dbClearResult(rs)
}
microbenchmark(
  odbc = f(con1),
  RPostgreSQL = f(con2),
  times = 100
)
cat("# dbGetQuery\n")
microbenchmark(
  odbc = dbGetQuery(con1, "select * from mtcars"),
  RPostgreSQL = dbGetQuery(con2, "select * from mtcars"),
  times = 100
)
cat("# dbReadTable\n")
microbenchmark(
  odbc = dbReadTable(con1, "mtcars"),
  RPostgreSQL = dbReadTable(con2, "mtcars"),
  times = 100
)
cat("# collect\n")
t1 <- tbl(con1, "mtcars")
t2 <- tbl(con2, "mtcars")
microbenchmark(
  odbc = collect(t1),
  RPostgreSQL = collect(t2),
  times = 100
)
cat("# dplyr\n")
microbenchmark(
  odbc = tbl(con1, "mtcars") %>% collect,
  RPostgreSQL = tbl(con2, "mtcars") %>% collect,
  times = 100
)
```

### Select all

Median result (small table / large table):

* odbc (5.2 ms / 2.7 s)
* RPostgres (13.2 ms / 5.0 s)

```{r, collapse=TRUE}
cat("# Small table\n")
microbenchmark(
  odbc = dbGetQuery(con1, "select * from mtcars"),
  RPostgreSQL = dbGetQuery(con2, "select * from mtcars"),
  times = 100
)
cat("# Large table\n")
microbenchmark(
  odbc = dbGetQuery(con1, "select * from flights"),
  RPostgreSQL = dbGetQuery(con2, "select * from flights"),
  times = 5
)
```

### Count records

Median result (small table / large table):

* odbc (4.6 ms / 31.0 ms)
* RPostgreSQL (2.3  ms / 28.8 ms)

```{r, collapse=TRUE}
cat("# Small table\n")
microbenchmark(
  odbc = dbGetQuery(con1, "select count(*) from mtcars"),
  RPostgreSQL = dbGetQuery(con2, "select count(*) from mtcars"),
  times = 100
)
cat("# Large table\n")
microbenchmark(
  odbc = dbGetQuery(con1, "select count(*) from flights"),
  RPostgreSQL = dbGetQuery(con2, "select count(*) from flights"),
  times = 100
)
```

### Aggregate

Median result (month / origin & dest):

* odbc (70.0 ms / 127.1 ms)
* RPostgreSQL (68.4 ms / 126.0 ms)

```{r, collapse=TRUE}
cat("# By month\n")
microbenchmark(
  odbc = dbGetQuery(con1, "select month, count(*) from flights group by month"),
  RPostgreSQL = dbGetQuery(con2, "select month, count(*) from flights group by month"),
  times = 25
)
cat("# By origin & dest\n")
microbenchmark(
  odbc = dbGetQuery(con1, "select origin, dest, count(*) from flights group by origin, dest"),
  RPostgreSQL = dbGetQuery(con2, "select origin, dest, count(*) from flights group by origin, dest"),
  times = 25
)
```

### Join and aggregate

Median result:

* odbc (154.7 ms)
* RPostgreSQL (153.9 ms)

```{r, collapse=TRUE}
cat("# Join and aggregate\n")
microbenchmark(
  odbc = dbGetQuery(con1, "select name, count(*) from flights, airports where origin = faa group by name"),
  RPostgreSQL = dbGetQuery(con2, "select name, count(*) from flights, airports where origin = faa group by name"),
  times = 25
)
```

### Write table

Median result (small / large):

* odbc (88.8 ms / 876.8 ms)
* RPostgreSQL (39.3 ms / 49.7 ms)

```{r, collapse=TRUE}
cat("# Small table\n")
microbenchmark(
  odbc = dbWriteTable(con1, "tmp1", mtcars, overwrite = TRUE),
  RPostgreSQL = dbWriteTable(con2, "tmp2", mtcars, overwrite = TRUE),
  times = 25
)
cat("# Large table\n")
tmpdat <- nycflights13::flights[1:1000,]
microbenchmark(
  odbc = dbWriteTable(con1, "tmp3", tmpdat, overwrite = TRUE),
  RPostgreSQL = dbWriteTable(con2, "tmp4", tmpdat, overwrite = TRUE),
  times = 5
)
```

### Summary

There was no one decisive winner for all tests. The odbc R package along with the RStudio Professional Drivers was decisively faster than RPostgreSQL when collecting records. For all other operations, the differences in performance between the two packages were comparable.

* Select all (odbc faster)
* Count all records (RPostgreSQL faster)
* Aggregate (RPostgreSQL faster)
* Join and Aggregate (odbc faster)
* Write table (odbc faster)
