CDMConnector

CDMConnector

Connect to a database using DBI

DBI = Database Interface

Various packages the driver functionality (e.g. RPostgres, odbc, DatabaseConnector)

con <- DBI::dbConnect(RPostgres::Postgres(),
                      dbname = Sys.getenv("CDM5_POSTGRESQL_DBNAME"),
                      host = Sys.getenv("CDM5_POSTGRESQL_HOST"),
                      user = Sys.getenv("CDM5_POSTGRESQL_USER"),
                      password = Sys.getenv("CDM5_POSTGRESQL_PASSWORD"))

Example CDM datasets (Eunomia)

library(CDMConnector)
exampleDatasets()
 [1] "GiBleed"                             "synthea-allergies-10k"              
 [3] "synthea-anemia-10k"                  "synthea-breast_cancer-10k"          
 [5] "synthea-contraceptives-10k"          "synthea-covid19-10k"                
 [7] "synthea-covid19-200k"                "synthea-dermatitis-10k"             
 [9] "synthea-heart-10k"                   "synthea-hiv-10k"                    
[11] "synthea-lung_cancer-10k"             "synthea-medications-10k"            
[13] "synthea-metabolic_syndrome-10k"      "synthea-opioid_addiction-10k"       
[15] "synthea-rheumatoid_arthritis-10k"    "synthea-snf-10k"                    
[17] "synthea-surgery-10k"                 "synthea-total_joint_replacement-10k"
[19] "synthea-veteran_prostate_cancer-10k" "synthea-veterans-10k"               
[21] "synthea-weight_loss-10k"             "empty_cdm"                          

Example CDM datasets (Eunomia)

Download and connect to an example CDM database

library(CDMConnector)
con <- DBI::dbConnect(duckdb::duckdb(), 
                      dbdir = eunomiaDir("synthea-covid19-200k"))

Create a cdm object reference

cdm <- cdmFromCon(
  con,
  cdmSchema = "main",
  writeSchema = "main"
)

print(cdm)
── # OMOP CDM reference (duckdb) of Synthea ────────────────────────────────────
• omop tables: person, observation_period, visit_occurrence, visit_detail,
condition_occurrence, drug_exposure, procedure_occurrence, device_exposure,
measurement, observation, death, note, note_nlp, specimen, fact_relationship,
location, care_site, provider, payer_plan_period, cost, drug_era, dose_era,
condition_era, metadata, cdm_source, concept, vocabulary, domain,
concept_class, concept_relationship, relationship, concept_synonym,
concept_ancestor, source_to_concept_map, drug_strength, cohort_definition,
attribute_definition
• cohort tables: -
• achilles tables: -
• other tables: -

Create a cdm object reference

cdmFromCon function creates a list of references to the cdm tables and does various validation checks

  • Do the tables have the correct column names?
  • Do we have write access to the cdm?

Create a cdm object reference

This object is a list of table references (like Andromda)

cdm$person
# Source:   table<main.person> [?? x 18]
# Database: DuckDB v0.10.0 [root@Darwin 23.0.0:R 4.3.1//var/folders/xx/01v98b6546ldnm1rg1_bvk000000gn/T//RtmpGm2Ysp/file516f73404c4b.duckdb]
   person_id gender_concept_id year_of_birth month_of_birth day_of_birth
       <int>             <int>         <int>          <int>        <int>
 1         1              8532          1961             10            7
 2         2              8507          2020              1           13
 3         3              8532          1954              5           26
 4         4              8507          1967              4           22
 5         5              8532          1959              2           21
 6         6              8507          1938              6            1
 7         7              8532          1970              4           26
 8         8              8532          1945              8           13
 9         9              8532          2000              3           17
10        10              8507          1974              2            5
# ℹ more rows
# ℹ 13 more variables: birth_datetime <dttm>, race_concept_id <int>,
#   ethnicity_concept_id <int>, location_id <int>, provider_id <int>,
#   care_site_id <int>, person_source_value <chr>, gender_source_value <chr>,
#   gender_source_concept_id <int>, race_source_value <chr>,
#   race_source_concept_id <int>, ethnicity_source_value <chr>,
#   ethnicity_source_concept_id <int>

Database Characterization

We can get some basic info about the cdm using snapshot.

snapshot(cdm) %>% tidyr::gather()
# A tibble: 13 × 2
   key                                    value                                 
   <chr>                                  <chr>                                 
 1 cdm_name                               "Synthea"                             
 2 cdm_source_name                        "Synthea"                             
 3 cdm_description                        "Synthea Synthetic Data"              
 4 cdm_documentation_reference            "https://synthetichealth.github.io/sy…
 5 cdm_version                            "5.3.1"                               
 6 cdm_holder                             ""                                    
 7 cdm_release_date                       "2023-06-16"                          
 8 vocabulary_version                     "v5.0 22-JUN-22"                      
 9 person_count                           "213953"                              
10 observation_period_count               "213953"                              
11 earliest_observation_period_start_date "1923-11-26"                          
12 latest_observation_period_end_date     "2023-06-23"                          
13 snapshot_date                          "2024-04-26"                          

CDM Queries

  • Write dplyr code to query the CDM
  • dplyr gets translated to SQL automatically
  • We test on Postgres, SQL Server, Snowflake, Redshift, duckdb
  • Adding Databricks/Spark tests soon
  • Aim to support all OHDSI dbms and any R driver backend

CDM Queries

Most common drug ingredients

library(dplyr, warn.conflicts = F)

cdm$drug_era %>% 
  left_join(cdm$concept, by = c("drug_concept_id" = "concept_id")) %>% 
  select(drug_concept = concept_name, drug_concept_id) %>% 
  count(drug_concept, drug_concept_id, sort = T) 
# Source:     SQL [?? x 3]
# Database:   DuckDB v0.10.0 [root@Darwin 23.0.0:R 4.3.1//var/folders/xx/01v98b6546ldnm1rg1_bvk000000gn/T//RtmpGm2Ysp/file516f73404c4b.duckdb]
# Ordered by: desc(n)
   drug_concept  drug_concept_id     n
   <chr>                   <int> <dbl>
 1 clopidogrel           1322184 34028
 2 nitroglycerin         1361711 25986
 3 simvastatin           1539403 22232
 4 amlodipine            1332418 21961
 5 digoxin               1326303 15389
 6 warfarin              1310149 15389
 7 verapamil             1307863 15385
 8 alteplase             1347450 11109
 9 acetaminophen         1125315 11024
10 hydrocodone           1174888  7957
# ℹ more rows

CDM Queries

Inspect the SQL

cdm$drug_exposure %>% 
  left_join(cdm$concept, by = c("drug_concept_id" = "concept_id")) %>% 
  select(drug_concept = concept_name) %>% 
  count(drug_concept, sort = T) %>% 
  show_query()
<SQL>
SELECT drug_concept, COUNT(*) AS n
FROM (
  SELECT concept_name AS drug_concept
  FROM main.drug_exposure
  LEFT JOIN main.concept
    ON (drug_exposure.drug_concept_id = concept.concept_id)
) q01
GROUP BY drug_concept
ORDER BY n DESC

Query functions

CDMConnector has some helper functions to make cross platform queries easier.

A good example is the quantile function which is quite important for data analysis but not implemented the same on all database systems.

Query functions

cdm$person %>% 
  group_by(gender_concept_id) %>% 
  summarize_quantile(year_of_birth, probs = seq(0,1, by = .1)) 
# Source:     SQL [2 x 12]
# Database:   DuckDB v0.10.0 [root@Darwin 23.0.0:R 4.3.1//var/folders/xx/01v98b6546ldnm1rg1_bvk000000gn/T//RtmpGm2Ysp/file516f73404c4b.duckdb]
# Ordered by: year_of_birth
  gender_concept_id p0_value p10_value p20_value p30_value p40_value p50_value
              <int>    <int>     <int>     <int>     <int>     <int>     <int>
1              8507     1923      1930      1939      1948      1958      1968
2              8532     1923      1932      1942      1952      1962      1972
# ℹ 5 more variables: p60_value <int>, p70_value <int>, p80_value <int>,
#   p90_value <int>, p100_value <int>

Query functions

This saves us from writing a lot of SQL

cdm$person %>% 
  group_by(gender_concept_id) %>% 
  summarize_quantile(year_of_birth, probs = seq(0,1, by = .1)) %>% 
  show_query()
<SQL>
SELECT
  gender_concept_id,
  MIN(CASE WHEN (accumulated >= (0.0 * total)) THEN year_of_birth WHEN NOT (accumulated >= (0.0 * total)) THEN NULL END) AS p0_value,
  MIN(CASE WHEN (accumulated >= (0.1 * total)) THEN year_of_birth WHEN NOT (accumulated >= (0.1 * total)) THEN NULL END) AS p10_value,
  MIN(CASE WHEN (accumulated >= (0.2 * total)) THEN year_of_birth WHEN NOT (accumulated >= (0.2 * total)) THEN NULL END) AS p20_value,
  MIN(CASE WHEN (accumulated >= (0.3 * total)) THEN year_of_birth WHEN NOT (accumulated >= (0.3 * total)) THEN NULL END) AS p30_value,
  MIN(CASE WHEN (accumulated >= (0.4 * total)) THEN year_of_birth WHEN NOT (accumulated >= (0.4 * total)) THEN NULL END) AS p40_value,
  MIN(CASE WHEN (accumulated >= (0.5 * total)) THEN year_of_birth WHEN NOT (accumulated >= (0.5 * total)) THEN NULL END) AS p50_value,
  MIN(CASE WHEN (accumulated >= (0.6 * total)) THEN year_of_birth WHEN NOT (accumulated >= (0.6 * total)) THEN NULL END) AS p60_value,
  MIN(CASE WHEN (accumulated >= (0.7 * total)) THEN year_of_birth WHEN NOT (accumulated >= (0.7 * total)) THEN NULL END) AS p70_value,
  MIN(CASE WHEN (accumulated >= (0.8 * total)) THEN year_of_birth WHEN NOT (accumulated >= (0.8 * total)) THEN NULL END) AS p80_value,
  MIN(CASE WHEN (accumulated >= (0.9 * total)) THEN year_of_birth WHEN NOT (accumulated >= (0.9 * total)) THEN NULL END) AS p90_value,
  MIN(CASE WHEN (accumulated >= (1.0 * total)) THEN year_of_birth WHEN NOT (accumulated >= (1.0 * total)) THEN NULL END) AS p100_value
FROM (
  SELECT
    q01.*,
    SUM("..n") OVER (PARTITION BY gender_concept_id ORDER BY year_of_birth ROWS UNBOUNDED PRECEDING) AS accumulated,
    SUM("..n") OVER (PARTITION BY gender_concept_id) AS total
  FROM (
    SELECT gender_concept_id, year_of_birth, COUNT(*) AS "..n"
    FROM main.person
    GROUP BY gender_concept_id, year_of_birth
  ) q01
) q01
GROUP BY gender_concept_id

Cohort Generation

  • Put Atlas cohort json files in a folder
  • Use descriptive filenames
list.files("cohorts")
[1] "covid_19.json"        "type_2_diabetes.json"
(cohortSet <- readCohortSet("cohorts"))
# A tibble: 2 × 5
  cohort_definition_id cohort_name     cohort       json   cohort_name_snakecase
                 <int> <chr>           <list>       <list> <chr>                
1                    1 covid_19        <named list> <chr>  covid_19             
2                    2 type_2_diabetes <named list> <chr>  type_2_diabetes      

Cohort Generation

cdm <- cdm %>% 
  generateCohortSet(cohortSet,
                    name = "cohort", 
                    overwrite = TRUE)

cdm$cohort
# Source:   table<main.cohort> [?? x 4]
# Database: DuckDB v0.10.0 [root@Darwin 23.0.0:R 4.3.1//var/folders/xx/01v98b6546ldnm1rg1_bvk000000gn/T//RtmpGm2Ysp/file516f73404c4b.duckdb]
   cohort_definition_id subject_id cohort_start_date cohort_end_date
                  <int>      <dbl> <date>            <date>         
 1                    1      21611 2020-11-16        2020-12-03     
 2                    1     107582 2020-04-01        2020-04-10     
 3                    1     117111 2020-07-08        2020-07-23     
 4                    1     117630 2020-07-20        2020-08-06     
 5                    1     127865 2021-01-14        2021-02-10     
 6                    1     134426 2021-04-01        2021-05-06     
 7                    1     145533 2021-04-19        2021-05-02     
 8                    1     151087 2021-08-05        2021-09-03     
 9                    1     179975 2020-07-13        2020-08-13     
10                    1     181016 2021-02-03        2021-03-03     
# ℹ more rows

Cohort table attributes

cohort_count(cdm$cohort)
# A tibble: 2 × 3
  cohort_definition_id number_records number_subjects
                 <int>          <int>           <int>
1                    1          18744           18744
2                    2              0               0
settings(cdm$cohort)
# A tibble: 2 × 2
  cohort_definition_id cohort_name    
                 <int> <chr>          
1                    1 covid_19       
2                    2 type_2_diabetes
cohort_attrition(cdm$cohort)
# A tibble: 2 × 7
  cohort_definition_id number_records number_subjects reason_id reason          
                 <int>          <int>           <int>     <int> <chr>           
1                    1          18744           18744         1 Qualifying init…
2                    2              0               0         1 Qualifying init…
# ℹ 2 more variables: excluded_records <int>, excluded_subjects <int>

Cohort tables from concepts sets

clopidogrel <- CodelistGenerator::getDescendants(cdm, 1322184)
nitroglycerin <- CodelistGenerator::getDescendants(cdm, 1361711)

cdm <- generateConceptCohortSet(
  cdm,
  name = "cohort",
  conceptSet = list("clopidogrel" = clopidogrel$concept_id,
                    "nitroglycerin" = nitroglycerin$concept_id),
  limit = "first",
  requiredObservation = c(0, 0),
  end = "observation_period_end_date",
  overwrite = TRUE
)

cohortCount(cdm$cohort)
# A tibble: 2 × 3
  cohort_definition_id number_records number_subjects
                 <int>          <int>           <int>
1                    1          29051           29051
2                    2          24596           24596

Subsetting the CDM

cdm2 <- cdm %>% 
  cdmSubsetCohort(cohortTable = "cohort", cohortId = 1, verbose = T)

tally(cdm$person)
# Source:   SQL [1 x 1]
# Database: DuckDB v0.10.0 [root@Darwin 23.0.0:R 4.3.1//var/folders/xx/01v98b6546ldnm1rg1_bvk000000gn/T//RtmpGm2Ysp/file516f73404c4b.duckdb]
       n
   <dbl>
1 213953
tally(cdm2$person)
# Source:   SQL [1 x 1]
# Database: DuckDB v0.10.0 [root@Darwin 23.0.0:R 4.3.1//var/folders/xx/01v98b6546ldnm1rg1_bvk000000gn/T//RtmpGm2Ysp/file516f73404c4b.duckdb]
      n
  <dbl>
1 29051

Flatten a CDM

cdm2 %>% 
  cdmFlatten()
# Source:   SQL [?? x 8]
# Database: DuckDB v0.10.0 [root@Darwin 23.0.0:R 4.3.1//var/folders/xx/01v98b6546ldnm1rg1_bvk000000gn/T//RtmpGm2Ysp/file516f73404c4b.duckdb]
   person_id observation_concept_id start_date end_date   type_concept_id domain
       <int>                  <int> <date>     <date>               <int> <chr> 
 1    189991                4078793 2016-02-16 2016-02-16        38000267 proce…
 2    192041                4057420 2014-11-05 2014-11-05        38000267 proce…
 3    192876                4078793 2021-02-15 2021-02-15        38000267 proce…
 4    192930                4230911 2014-06-28 2014-06-28        38000267 proce…
 5    196310                4336464 2015-12-28 2015-12-28        38000267 proce…
 6    196350                4078793 2015-12-13 2015-12-13        38000267 proce…
 7    197665                4202451 2022-07-09 2022-07-09        38000267 proce…
 8    199025                4162736 2020-12-03 2020-12-03        38000267 proce…
 9    199510                4202451 2020-08-04 2020-08-04        38000267 proce…
10    200023                4078793 2014-07-03 2014-07-03        38000267 proce…
# ℹ more rows
# ℹ 2 more variables: observation_concept_name <chr>, type_concept_name <chr>

Save a cdm to disk

Save as csv, parquet, or duckdb

path <- tempfile()
dir.create(path)

cdm2 %>% 
  cdm_select_tbl(-tbl_group("vocab")) %>% 
  stow(path = path, format = "parquet")

Copy a cdm from one database to another

Experimental

con2 <- DBI::dbConnect(RPostgres::Postgres(), 
                      dbname = "test",
                      host = "localhost",
                      user = "postgres",
                      password = "postgres")


cdm2InPostgres <- cdm2 %>% 
  cdm_select_tbl(-tbl_group("vocab")) %>% 
  copyCdmTo(con2, ., schema = "cdm", overwrite = T)

Disconnect

DBI::dbDisconnect(con)