This analysis compares performance between the odbc package and the RStudio Professional Drivers against the RPostgreSQL package. These tests are compared on small and large tables:

There are many ways to collect data into R from a database. These methods were examined, ranked in order from best performing to least:

odbc connection

con1 <- dbConnect(odbc::odbc(),
  Driver = conf$driver,
  Database = conf$database,
  Port = conf$port,
  Server = conf$server, 
  UID = conf$UID, 
  PWD = conf$PWD
)

RPostgreSQL connection

con2 <- dbConnect(RPostgreSQL::PostgreSQL(),
  dbname = conf$database,
  port = conf$port,
  host = conf$server, 
  user = conf$UID, 
  password = conf$PWD
)

Query functions

Median result (odbc / RPostgreSQL):

cat("# dbSendQuery\n")
# dbSendQuery
f <- function(con){
  rs <- dbSendQuery(con, "select * from mtcars")
  dbFetch(rs)
  dbClearResult(rs)
}
microbenchmark(
  odbc = f(con1),
  RPostgreSQL = f(con2),
  times = 100
)
Unit: milliseconds
        expr       min        lq      mean   median        uq      max neval
        odbc  4.280109  4.424589  5.123516  4.52174  4.628829 59.40869   100
 RPostgreSQL 10.708193 10.941653 12.106602 11.01144 11.139357 86.00633   100
cat("# dbGetQuery\n")
# dbGetQuery
microbenchmark(
  odbc = dbGetQuery(con1, "select * from mtcars"),
  RPostgreSQL = dbGetQuery(con2, "select * from mtcars"),
  times = 100
)
Unit: milliseconds
        expr       min        lq      mean    median       uq       max neval
        odbc  4.369826  4.462516  4.639547  4.605441  4.78284  5.970596   100
 RPostgreSQL 11.303488 11.523170 11.844457 11.642717 11.84386 21.660472   100
cat("# dbReadTable\n")
# dbReadTable
microbenchmark(
  odbc = dbReadTable(con1, "mtcars"),
  RPostgreSQL = dbReadTable(con2, "mtcars"),
  times = 100
)
Unit: milliseconds
        expr       min        lq      mean    median        uq      max neval
        odbc  5.014894  5.224398  5.615451  5.402754  5.563106 16.63474   100
 RPostgreSQL 11.410670 11.746018 12.680341 11.995234 12.305746 45.56513   100
cat("# collect\n")
# collect
t1 <- tbl(con1, "mtcars")
t2 <- tbl(con2, "mtcars")
microbenchmark(
  odbc = collect(t1),
  RPostgreSQL = collect(t2),
  times = 100
)
Unit: milliseconds
        expr       min        lq      mean    median       uq      max neval
        odbc  6.823231  7.073533  7.937514  7.219613  7.46282 27.94842   100
 RPostgreSQL 13.195611 13.699607 14.808574 13.937911 14.55319 71.37980   100
cat("# dplyr\n")
# dplyr
microbenchmark(
  odbc = tbl(con1, "mtcars") %>% collect,
  RPostgreSQL = tbl(con2, "mtcars") %>% collect,
  times = 100
)
Unit: milliseconds
        expr      min       lq     mean   median       uq      max neval
        odbc 13.38358 13.94573 14.34430 14.22169 14.56148 19.21916   100
 RPostgreSQL 24.56986 25.83352 27.05007 26.66041 27.83492 47.06476   100

Select all

Median result (small table / large table):

cat("# Small table\n")
# Small table
microbenchmark(
  odbc = dbGetQuery(con1, "select * from mtcars"),
  RPostgreSQL = dbGetQuery(con2, "select * from mtcars"),
  times = 100
)
Unit: milliseconds
        expr       min        lq      mean    median        uq       max neval
        odbc  4.321348  4.611888  4.858451  4.855945  5.062818  6.202066   100
 RPostgreSQL 11.200706 12.134937 12.514874 12.486769 12.801218 16.967667   100
cat("# Large table\n")
# Large table
microbenchmark(
  odbc = dbGetQuery(con1, "select * from flights"),
  RPostgreSQL = dbGetQuery(con2, "select * from flights"),
  times = 5
)
Unit: seconds
        expr      min       lq     mean   median       uq      max neval
        odbc 2.591428 2.633310 2.763152 2.636291 2.692626 3.262106     5
 RPostgreSQL 4.637700 4.650428 4.713604 4.658108 4.718158 4.903626     5

Count records

Median result (small table / large table):

cat("# Small table\n")
# Small table
microbenchmark(
  odbc = dbGetQuery(con1, "select count(*) from mtcars"),
  RPostgreSQL = dbGetQuery(con2, "select count(*) from mtcars"),
  times = 100
)
Unit: milliseconds
        expr      min       lq     mean   median       uq       max neval
        odbc 3.869836 3.972087 4.277864 4.034261 4.095987 26.725616   100
 RPostgreSQL 2.092483 2.139317 2.263315 2.186958 2.311471  4.324677   100
cat("# Large table\n")
# Large table
microbenchmark(
  odbc = dbGetQuery(con1, "select count(*) from flights"),
  RPostgreSQL = dbGetQuery(con2, "select count(*) from flights"),
  times = 100
)
Unit: milliseconds
        expr      min       lq     mean   median       uq      max neval
        odbc 29.93849 30.42176 30.72725 30.72235 30.93322  33.8406   100
 RPostgreSQL 28.07182 28.47232 29.79854 28.66594 28.86219 105.1817   100

Aggregate

Median result (month / origin & dest):

cat("# By month\n")
# By month
microbenchmark(
  odbc = dbGetQuery(con1, "select month, count(*) from flights group by month"),
  RPostgreSQL = dbGetQuery(con2, "select month, count(*) from flights group by month"),
  times = 25
)
Unit: milliseconds
        expr      min       lq     mean   median       uq      max neval
        odbc 68.27169 68.52957 69.49427 69.54900 70.02755 71.79220    25
 RPostgreSQL 67.36743 67.98919 69.43012 68.41578 68.78742 92.34194    25
cat("# By origin & dest\n")
# By origin & dest
microbenchmark(
  odbc = dbGetQuery(con1, "select origin, dest, count(*) from flights group by origin, dest"),
  RPostgreSQL = dbGetQuery(con2, "select origin, dest, count(*) from flights group by origin, dest"),
  times = 25
)
Unit: milliseconds
        expr      min       lq     mean   median       uq      max neval
        odbc 124.9043 125.8146 131.6079 126.9300 127.8341 190.1420    25
 RPostgreSQL 124.2582 125.3030 126.0579 125.9069 126.5720 131.3172    25

Join and aggregate

Median result:

cat("# Join and aggregate\n")
# Join and aggregate
microbenchmark(
  odbc = dbGetQuery(con1, "select name, count(*) from flights, airports where origin = faa group by name"),
  RPostgreSQL = dbGetQuery(con2, "select name, count(*) from flights, airports where origin = faa group by name"),
  times = 25
)
Unit: milliseconds
        expr      min       lq     mean   median       uq      max neval
        odbc 151.2754 154.3244 156.4881 154.7501 155.4205 198.0783    25
 RPostgreSQL 150.3930 151.8969 154.4057 153.2702 154.3652 177.8400    25

Write table

Median result (small / large):

cat("# Small table\n")
# Small table
microbenchmark(
  odbc = dbWriteTable(con1, "tmp1", mtcars, overwrite = TRUE),
  RPostgreSQL = dbWriteTable(con2, "tmp2", mtcars, overwrite = TRUE),
  times = 25
)
Unit: milliseconds
        expr      min       lq     mean   median       uq      max neval
        odbc 78.44993 80.11186 81.54664 80.57682 81.30703 99.96081    25
 RPostgreSQL 32.84248 33.83805 34.73810 34.11803 35.70990 37.38110    25
cat("# Large table\n")
# Large table
tmpdat <- nycflights13::flights[1:1000,]
microbenchmark(
  odbc = dbWriteTable(con1, "tmp3", tmpdat, overwrite = TRUE),
  RPostgreSQL = dbWriteTable(con2, "tmp4", tmpdat, overwrite = TRUE),
  times = 5
)
Unit: milliseconds
        expr       min        lq      mean    median        uq      max neval
        odbc 769.39862 776.23975 796.75253 794.52786 820.40447 823.1920     5
 RPostgreSQL  45.63735  46.53974  47.50063  47.15208  47.90107  50.2729     5

Summary

There was no one decisive winner for all tests. The odbc R package along with the RStudio Professional Drivers was decisively faster than RPostgreSQL when collecting records. For all other operations, the differences in performance between the two packages were comparable.

---
title: 'Performance comparison: odbc vs RPostgreSQL'
output: html_notebook
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(collapse = TRUE)
library(DBI)
library(RPostgreSQL)
library(microbenchmark)
library(tidyverse)
library(config)
conf <- get()
```

This analysis compares performance between the odbc package and the RStudio Professional Drivers against the RPostgreSQL package. These tests are compared on small and large tables:

```{r echo=FALSE}
data.frame(
  Test = c("Read", "Count", "Group", "Join", "Write"),
  odbc = c(2700, 31, 127.1, 154.8, 876.8),
  RPostgreSQL = c(5100, 28.8, 126.0, 155.0, 49.7),
  stringsAsFactors = FALSE
) %>%
  gather("package", "milliseconds", odbc, RPostgreSQL) %>%
  ggplot(aes(reorder(Test, milliseconds, mean), milliseconds, fill=package)) + 
  geom_bar(stat = "identity", position = "dodge") + 
  coord_flip() +
  ggtitle("Performance Comparison: odbc vs RPostgreSQL", "Median time in log10 milliseconds") +
  xlab("") +
  scale_fill_brewer(palette="Blues") +
  theme_minimal()
```

There are many ways to collect data into R from a database. These methods were examined, ranked in order from best performing to least:

```{r, echo=FALSE}
data.frame(
  Function = c("dbSendQuery", "dbGetQuery", "dbReadTable", "dplyr collect only", "dplyr"),
  MS = c(12.0, 12.5, 12.9, 14.4, 27.6)
) %>%
  ggplot(aes(reorder(Function, -MS), MS)) +
  geom_bar(stat = "Identity", fill = "steelblue") +
  coord_flip() +
  ggtitle("Performance comparison: Query methods", "Median time in milliseconds") +
  xlab("") +
  scale_fill_brewer(palette="Blues") +
  theme_minimal()
```

### odbc connection

```{r}
con1 <- dbConnect(odbc::odbc(),
  Driver = conf$driver,
  Database = conf$database,
  Port = conf$port,
  Server = conf$server, 
  UID = conf$UID, 
  PWD = conf$PWD
)
```

### RPostgreSQL connection

```{r}
con2 <- dbConnect(RPostgreSQL::PostgreSQL(),
  dbname = conf$database,
  port = conf$port,
  host = conf$server, 
  user = conf$UID, 
  password = conf$PWD
)
```

### Query functions

Median result (odbc / RPostgreSQL):

* dbSendQuery (4.9 ms / 12.0 ms)
* dbGetQuery (4.9 ms / 12.5 ms)
* dbReadTable (5.8 ms / 12.9 ms)
* collect (7.4 ms / 14.4 ms)
* dplyr (14.9 ms / 27.6 ms)

```{r, collapse=TRUE}
cat("# dbSendQuery\n")
f <- function(con){
  rs <- dbSendQuery(con, "select * from mtcars")
  dbFetch(rs)
  dbClearResult(rs)
}
microbenchmark(
  odbc = f(con1),
  RPostgreSQL = f(con2),
  times = 100
)
cat("# dbGetQuery\n")
microbenchmark(
  odbc = dbGetQuery(con1, "select * from mtcars"),
  RPostgreSQL = dbGetQuery(con2, "select * from mtcars"),
  times = 100
)
cat("# dbReadTable\n")
microbenchmark(
  odbc = dbReadTable(con1, "mtcars"),
  RPostgreSQL = dbReadTable(con2, "mtcars"),
  times = 100
)
cat("# collect\n")
t1 <- tbl(con1, "mtcars")
t2 <- tbl(con2, "mtcars")
microbenchmark(
  odbc = collect(t1),
  RPostgreSQL = collect(t2),
  times = 100
)
cat("# dplyr\n")
microbenchmark(
  odbc = tbl(con1, "mtcars") %>% collect,
  RPostgreSQL = tbl(con2, "mtcars") %>% collect,
  times = 100
)
```

### Select all

Median result (small table / large table):

* odbc (5.2 ms / 2.7 s)
* RPostgres (13.2 ms / 5.0 s)

```{r, collapse=TRUE}
cat("# Small table\n")
microbenchmark(
  odbc = dbGetQuery(con1, "select * from mtcars"),
  RPostgreSQL = dbGetQuery(con2, "select * from mtcars"),
  times = 100
)
cat("# Large table\n")
microbenchmark(
  odbc = dbGetQuery(con1, "select * from flights"),
  RPostgreSQL = dbGetQuery(con2, "select * from flights"),
  times = 5
)
```

### Count records

Median result (small table / large table):

* odbc (4.6 ms / 31.0 ms)
* RPostgreSQL (2.3  ms / 28.8 ms)

```{r, collapse=TRUE}
cat("# Small table\n")
microbenchmark(
  odbc = dbGetQuery(con1, "select count(*) from mtcars"),
  RPostgreSQL = dbGetQuery(con2, "select count(*) from mtcars"),
  times = 100
)
cat("# Large table\n")
microbenchmark(
  odbc = dbGetQuery(con1, "select count(*) from flights"),
  RPostgreSQL = dbGetQuery(con2, "select count(*) from flights"),
  times = 100
)
```

### Aggregate

Median result (month / origin & dest):

* odbc (70.0 ms / 127.1 ms)
* RPostgreSQL (68.4 ms / 126.0 ms)

```{r, collapse=TRUE}
cat("# By month\n")
microbenchmark(
  odbc = dbGetQuery(con1, "select month, count(*) from flights group by month"),
  RPostgreSQL = dbGetQuery(con2, "select month, count(*) from flights group by month"),
  times = 25
)
cat("# By origin & dest\n")
microbenchmark(
  odbc = dbGetQuery(con1, "select origin, dest, count(*) from flights group by origin, dest"),
  RPostgreSQL = dbGetQuery(con2, "select origin, dest, count(*) from flights group by origin, dest"),
  times = 25
)
```

### Join and aggregate

Median result:

* odbc (154.7 ms)
* RPostgreSQL (153.9 ms)

```{r, collapse=TRUE}
cat("# Join and aggregate\n")
microbenchmark(
  odbc = dbGetQuery(con1, "select name, count(*) from flights, airports where origin = faa group by name"),
  RPostgreSQL = dbGetQuery(con2, "select name, count(*) from flights, airports where origin = faa group by name"),
  times = 25
)
```

### Write table

Median result (small / large):

* odbc (88.8 ms / 876.8 ms)
* RPostgreSQL (39.3 ms / 49.7 ms)

```{r, collapse=TRUE}
cat("# Small table\n")
microbenchmark(
  odbc = dbWriteTable(con1, "tmp1", mtcars, overwrite = TRUE),
  RPostgreSQL = dbWriteTable(con2, "tmp2", mtcars, overwrite = TRUE),
  times = 25
)
cat("# Large table\n")
tmpdat <- nycflights13::flights[1:1000,]
microbenchmark(
  odbc = dbWriteTable(con1, "tmp3", tmpdat, overwrite = TRUE),
  RPostgreSQL = dbWriteTable(con2, "tmp4", tmpdat, overwrite = TRUE),
  times = 5
)
```

### Summary

There was no one decisive winner for all tests. The odbc R package along with the RStudio Professional Drivers was decisively faster than RPostgreSQL when collecting records. For all other operations, the differences in performance between the two packages were comparable.

* Select all (odbc faster)
* Count all records (RPostgreSQL faster)
* Aggregate (RPostgreSQL faster)
* Join and Aggregate (odbc faster)
* Write table (odbc faster)
