SQL In Positron

Introduction to SQL using Positron (DuckDB) with Synthea COVID-19 data.

I will demonstrate how to use SQL queries with the Positron (DuckDB) database to analyze synthetic COVID-19 patient data generated by Synthea. The dataset is stored in a DuckDB database file.

In Positron quarto document, you can run R code chunks to connect to the DuckDB database, execute SQL queries for filtering data, joining tables, and visualize the results.

Connecting to a database via SQL in Positron is straightforward and significantly simpler than using pgAdmin4. A SQL database only needs to be created once; thereafter, users can connect directly and execute SQL queries for data analysis. Furthermore, Positron integrates Anthropic, an AI-powered tool that assists with code generation, thereby enhancing efficiency and convenience for data analysts.

# Load necessary libraries
library(DBI)
library(duckdb)
library(RPostgres)
library(tools)
library(dplyr)

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union

Loading the DuckDB database and connecting to it.

# Close any existing connections first
if (exists("con")) {
  try(dbDisconnect(con), silent = TRUE)
}

# Connect to DuckDB database
con <- dbConnect(
  duckdb::duckdb(),
  dbdir = "/Users/nnthieu/SyntheaData/SyntheaCovid19/synthea19.duckdb"
)
# Load the PostgreSQL extension
dbExecute(con, "INSTALL postgres; LOAD postgres;")
[1] 0
# Verify tables
dbListTables(con)
 [1] "allergies"         "careplans"         "conditions"       
 [4] "devices"           "encounters"        "imaging_studies"  
 [7] "immunizations"     "medications"       "observations"     
[10] "organizations"     "patients"          "payer_transitions"
[13] "payers"            "procedures"        "providers"        
[16] "supplies"         

Data Exploration and Analysis

Filtering conditions for patients who had COVID-19.

# Query conditions for patients who had COVID-19
q1 = "SELECT 
    ca.*,
    cb.start AS covid_date
FROM 
conditions ca
LEFT JOIN (
    SELECT patient, start
    FROM 
conditions cb
    WHERE code = '840539006'
) cb
    ON ca.patient = cb.patient
WHERE ca.patient IN (
    SELECT patient 
    FROM conditions
    WHERE code = '840539006'
); "
df1 = dbGetQuery(con, q1)
head(df1)
       START       STOP                              PATIENT
1 2020-02-22 2020-03-09 e1e16824-fa7d-4c1e-927a-33454cad601e
2 2020-02-22 2020-03-09 e1e16824-fa7d-4c1e-927a-33454cad601e
3 2020-02-22 2020-03-09 e1e16824-fa7d-4c1e-927a-33454cad601e
4 2020-02-22 2020-03-09 e1e16824-fa7d-4c1e-927a-33454cad601e
5 2020-02-22 2020-02-22 e1e16824-fa7d-4c1e-927a-33454cad601e
6 2020-02-22 2020-03-09 e1e16824-fa7d-4c1e-927a-33454cad601e
                             ENCOUNTER      CODE           DESCRIPTION
1 016e9591-c1ce-4f15-be1d-3ea2846757fb  49727002       Cough (finding)
2 016e9591-c1ce-4f15-be1d-3ea2846757fb  68962001 Muscle pain (finding)
3 016e9591-c1ce-4f15-be1d-3ea2846757fb  57676002  Joint pain (finding)
4 016e9591-c1ce-4f15-be1d-3ea2846757fb 386661006       Fever (finding)
5 016e9591-c1ce-4f15-be1d-3ea2846757fb 840544004    Suspected COVID-19
6 016e9591-c1ce-4f15-be1d-3ea2846757fb 840539006              COVID-19
  covid_date
1 2020-02-22
2 2020-02-22
3 2020-02-22
4 2020-02-22
5 2020-02-22
6 2020-02-22

Covid outomes analysis.

# Analyze conditions related to COVID-19 patients
df1 %>%
  filter(
    grepl("\\(disorder\\)", DESCRIPTION) &
      !grepl("Body mass index", DESCRIPTION)
  ) %>%
  group_by(DESCRIPTION) %>%
  summarise(count = n()) %>%
  arrange(desc(count)) %>%
  head(13)
# A tibble: 13 × 2
   DESCRIPTION                             count
   <chr>                                   <int>
 1 Anemia (disorder)                       22803
 2 Hypoxemia (disorder)                    18175
 3 Pneumonia (disorder)                    18175
 4 Chronic sinusitis (disorder)            17879
 5 Viral sinusitis (disorder)              11269
 6 Acute respiratory failure (disorder)     8699
 7 Sepsis caused by virus (disorder)        6939
 8 Acute deep venous thrombosis (disorder)  5874
 9 Acute viral pharyngitis (disorder)       5818
10 Acute pulmonary embolism (disorder)      5726
11 Hypertriglyceridemia (disorder)          5350
12 Metabolic syndrome X (disorder)          5264
13 Acute bronchitis (disorder)              4767
library(ggplot2)
library(stringr)

# Create the data for plotting - first group and summarize to create count
covid_outcomes <- df1 %>%
  filter(
    grepl("\\(disorder\\)", DESCRIPTION) &
      !grepl("Body mass index", DESCRIPTION)
  ) %>%
  group_by(DESCRIPTION) %>%
  summarise(count = n()) %>%
  arrange(desc(count)) %>%
  head(13) %>%
  # Clean up the labels by removing "(finding)"
  mutate(DESCRIPTION = str_remove(DESCRIPTION, " \\(disorder\\)"))

# Create the bar chart
ggplot(covid_outcomes, aes(x = reorder(DESCRIPTION, count), y = count)) +
  geom_col(fill = "steelblue", alpha = 0.8) +
  coord_flip() +
  labs(
    title = "Most Common COVID-19 Outcomes",
    subtitle = "Based on synthetic COVID-19 patient data",
    x = "Condition",
    y = "Number of Cases",
    caption = "Data includes conditions marked as disorders"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 16, face = "bold"),
    plot.subtitle = element_text(size = 12, color = "gray60"),
    axis.text.y = element_text(size = 10),
    axis.text.x = element_text(size = 10),
    panel.grid.minor = element_blank()
  ) +
  # Add value labels on the bars
  geom_text(
    aes(label = scales::comma(count)),
    hjust = -0.1,
    size = 3.5,
    color = "black"
  )

Covid symptoms analysis.

library(ggplot2)
library(stringr)

# Create the data for plotting - first group and summarize to create count
covid_symptoms <- df1 %>%
  filter(
    grepl("\\(finding\\)", DESCRIPTION) & !grepl("Body mass index", DESCRIPTION)
  ) %>%
  group_by(DESCRIPTION) %>%
  summarise(count = n()) %>%
  arrange(desc(count)) %>%
  head(13) %>%
  # Clean up the labels by removing "(finding)"
  mutate(DESCRIPTION = str_remove(DESCRIPTION, " \\(finding\\)"))

# Create the bar chart
ggplot(covid_symptoms, aes(x = reorder(DESCRIPTION, count), y = count)) +
  geom_col(fill = "darkred", alpha = 0.8) +
  coord_flip() +
  labs(
    title = "Most Common COVID-19 Symptoms",
    subtitle = "Based on synthetic COVID-19 patient data",
    x = "Symptom",
    y = "Number of Cases",
    caption = "Data included conditions marked as findings"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 16, face = "bold"),
    plot.subtitle = element_text(size = 12, color = "gray60"),
    axis.text.y = element_text(size = 10),
    axis.text.x = element_text(size = 10),
    panel.grid.minor = element_blank()
  ) +
  # Add value labels on the bars
  geom_text(
    aes(label = scales::comma(count)),
    hjust = -0.1,
    size = 3.5,
    color = "black"
  )

Querying patient demographics for those who had COVID-19.

# Query patients who had COVID-19
q2 = "SELECT 
    pa.*,
    cb.start AS covid_date
FROM
patients pa
LEFT JOIN (
    SELECT patient, start
    FROM
conditions cb
    WHERE code = '840539006'
) cb
    ON pa.id = cb.patient
WHERE pa.id IN (
    SELECT patient 
    FROM conditions
    WHERE code = '840539006'
); "
df2 = dbGetQuery(con, q2)
head(df2)
                                    Id  BIRTHDATE DEATHDATE         SSN
1 d357913c-3c86-488f-bab9-48bb64c5ceba 1953-12-13      <NA> 999-82-9434
2 942e3c28-b469-45cf-941f-ec41d40594e2 2010-01-19      <NA> 999-93-3645
3 30d52309-4402-4d40-abf0-e4cb6ad35279 1918-07-15      <NA> 999-64-6411
4 1d7b0ebe-e589-44b7-98be-cd9141295b35 1960-01-07      <NA> 999-64-4594
5 890d93a9-1a84-4df7-a5fe-2107da8718b9 1961-01-09      <NA> 999-35-2906
6 d8a502b4-ea35-4473-b54a-28279243b581 2001-03-07      <NA> 999-27-5245
    DRIVERS   PASSPORT PREFIX      FIRST            LAST SUFFIX      MAIDEN
1 S99945090  X4537382X   Mrs.   Tammy740       Ernser583   <NA> O'Conner199
2      <NA>       <NA>   <NA>  Dallas143       Klocko335   <NA>        <NA>
3 S99992918 X70893138X   Mrs.   Anisa442          Purdy2   <NA>  Schaden604
4 S99929284 X17108809X   Mrs.  Iliana226     Schmeler639   <NA>  Simonis280
5 S99937264  X2849571X    Mr. Anthony633        Yundt842   <NA>        <NA>
6 S99969673       <NA>    Mr.  Roscoe437 Pfannerstill264   <NA>        <NA>
  MARITAL  RACE   ETHNICITY GENDER                    BIRTHPLACE
1       M white nonhispanic      F     Boston  Massachusetts  US
2    <NA> white nonhispanic      F    Mashpee  Massachusetts  US
3       M white nonhispanic      F Boxborough  Massachusetts  US
4       M white nonhispanic      F  Westfield  Massachusetts  US
5       M white nonhispanic      M     Hudson  Massachusetts  US
6    <NA> white nonhispanic      M     Boston  Massachusetts  US
                       ADDRESS       CITY         STATE            COUNTY   ZIP
1            385 Gibson Hollow Fall River Massachusetts    Bristol County 02723
2               326 Swift Walk       Stow Massachusetts  Middlesex County  <NA>
3      395 Stehr Manor Unit 55    Methuen Massachusetts      Essex County 01841
4 164 Morissette Course Apt 91   Brewster Massachusetts Barnstable County  <NA>
5   322 Murray Orchard Unit 69      Salem Massachusetts      Essex County  <NA>
6       716 Hansen Fort Unit 1 Fall River Massachusetts    Bristol County  <NA>
       LAT       LON HEALTHCARE_EXPENSES HEALTHCARE_COVERAGE covid_date
1 41.70998 -71.18521          1546025.67            11675.31 2020-03-12
2 42.43461 -71.51808           279157.32             2454.04 2020-03-11
3 42.70580 -71.14069           263100.30            34393.57 2020-02-26
4 41.75305 -70.04894          1407960.93             6477.64 2020-03-08
5 42.51219 -70.90512          1575731.48             4293.84 2020-02-29
6 41.73174 -71.19918            59437.88             4025.60 2020-03-07

Calculate number of covid patients

# Count number of unique COVID-19 patients
n_distinct(df2$Id)
[1] 88166

Calculating age of patients at the time of COVID-19 diagnosis.

# from df2, create column 'age' from birthdate and covid_date
df2 <- df2 %>%
  mutate(
    birthdate = as.Date(BIRTHDATE),
    covid_date = as.Date(covid_date),
    age = as.numeric(difftime(covid_date, birthdate, units = "weeks")) / 52.25
  ) %>%
  filter(!is.na(age) & age >= 0 & age <= 120)

Analyzing age distribution of COVID-19 patients.

# Plot age distribution of COVID-19 patients
ggplot(df2, aes(x = age)) +
  geom_histogram(
    binwidth = 5,
    fill = "darkgreen",
    color = "black",
    alpha = 0.7
  ) +
  labs(
    title = "Age Distribution of COVID-19 Patients",
    subtitle = "Based on synthetic COVID-19 patient data",
    x = "Age",
    y = "Number of Patients"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 16, face = "bold"),
    plot.subtitle = element_text(size = 12, color = "gray60"),
    axis.text.x = element_text(size = 10),
    axis.text.y = element_text(size = 10),
    panel.grid.minor = element_blank()
  ) +
  scale_x_continuous(breaks = seq(0, 100, by = 5)) +
  scale_y_continuous(breaks = scales::pretty_breaks(n = 10))

Conclusion

This document demonstrated how to use SQL queries with Positron (DuckDB) to analyze synthetic COVID-19 patient data. We connected to a DuckDB database, executed SQL queries to filter and join tables, and visualized the results using R and ggplot2. We explored patient conditions, symptoms, demographics, and visualized key insights using R and ggplot2. This approach can be extended to other datasets for comprehensive health data analysis.