Children Immunization Reports - Postgre SQL in RStudio

Author

Thieu Nguyen

Intro

In this post I demonstrate how to use SLQ commands integrated into Rstudio to analyze data. I use SQL to create tables then save to local directory and read them to RStudio to analyze data.

The reason for using RStudio integrated with SQL is that loading tables is much easier compared to importing them into pgAdmin4.

Data

I downloaded csv files from the Synthea website (https://synthea.mitre.org/downloads) to my directory ’/Users/nnthieu/SyntheaData/.

Building database

Now I create a database using ‘duckdb’

library(DBI)
library(duckdb)
library(RPostgres)

con <- dbConnect(duckdb::duckdb(), dbdir = "/Users/nnthieu/SyntheaData/synthea.duckdb")

Now at my directory already have database ‘synthea.duckdb’. Next, I load PostgreSQL extension in DuckDB

# Load the PostgreSQL extension
dbExecute(con, "INSTALL postgres; LOAD postgres;")
[1] 0

Load many CSV Files into DuckDB

library(DBI)
library(tools)

csv_files <- list.files("/Users/nnthieu/SyntheaData/", pattern = "*.csv", full.names = TRUE)

for (file in csv_files) {
  table_name <- tools::file_path_sans_ext(basename(file))
  
  # Drop table if it exists
  dbExecute(con, sprintf("DROP TABLE IF EXISTS %s", table_name))
  
  # Create table from CSV
  query <- sprintf("CREATE TABLE %s AS SELECT * FROM read_csv_auto('%s')", table_name, file)
  dbExecute(con, query)
}

# Verify tables
dbListTables(con)
 [1] "allergies"              "careplans"              "claims"                
 [4] "claims_transactions"    "conditions"             "devices"               
 [7] "encounters"             "imaging_studies"        "immunizations"         
[10] "immunizations_children" "medications"            "observations"          
[13] "organizations"          "patients"               "payer_transitions"     
[16] "payers"                 "procedures"             "providers"             
[19] "supplies"              

Analyzing data using postgre SQL

Check if SQL is working with the ‘Immunizations’ table, joined with the ‘Patients’ table, and creating new columns for ‘Age’ and ‘Birthdate’ to set data for children in the year 2019.

dbGetQuery(con, "
SELECT 
    i.date, 
    i.patient, 
    i.description, 
    i.base_cost, 
    p.birthdate, 
    DATEDIFF('day', p.birthdate, i.date) / 365.25 AS Age
FROM immunizations i
LEFT JOIN patients p ON i.patient = p.id 
WHERE (DATEDIFF('day', p.birthdate, i.date) / 365.25) <= 15
AND EXTRACT(YEAR FROM i.date) = 2019
LIMIT 10;
")
                  DATE                              PATIENT
1  2019-08-05 08:20:52 c1f1fcaa-82fd-d5b7-3544-c8f9708b06a8
2  2019-08-06 19:50:58 dc6c06d0-a7d8-100f-c08b-46c93700c188
3  2019-08-06 19:50:58 dc6c06d0-a7d8-100f-c08b-46c93700c188
4  2019-12-01 06:47:10 98e4223a-5ad8-94e0-2d73-ee5203dd4e2b
5  2019-12-01 06:47:10 98e4223a-5ad8-94e0-2d73-ee5203dd4e2b
6  2019-12-01 06:47:10 98e4223a-5ad8-94e0-2d73-ee5203dd4e2b
7  2019-12-01 06:47:10 98e4223a-5ad8-94e0-2d73-ee5203dd4e2b
8  2019-02-24 04:46:41 122b4063-18dc-f20f-204c-6f2da758717b
9  2019-02-24 04:46:41 122b4063-18dc-f20f-204c-6f2da758717b
10 2019-02-24 04:46:41 122b4063-18dc-f20f-204c-6f2da758717b
                                          DESCRIPTION BASE_COST  BIRTHDATE
1  Influenza  seasonal  injectable  preservative free    140.52 2005-07-04
2  Influenza  seasonal  injectable  preservative free    140.52 2006-07-11
3                                   HPV  quadrivalent    140.52 2006-07-11
4                                                Tdap    140.52 2008-11-09
5  Influenza  seasonal  injectable  preservative free    140.52 2008-11-09
6                                   HPV  quadrivalent    140.52 2008-11-09
7                                 meningococcal MCV4P    140.52 2008-11-09
8                                       Hib (PRP-OMP)    140.52 2018-09-16
9                               rotavirus  monovalent    140.52 2018-09-16
10                                                IPV    140.52 2018-09-16
         Age
1  14.086242
2  13.070500
3  13.070500
4  11.058179
5  11.058179
6  11.058179
7  11.058179
8   0.440794
9   0.440794
10  0.440794

Counting numbers of shotss by descriptions

library(DBI)

query <- "
WITH immunizations_filtered AS (
SELECT 
    i.date, 
    i.patient, 
    i.description, 
    i.base_cost, 
    p.birthdate, 
    DATEDIFF('day', p.birthdate, i.date) / 365.25 AS Age
FROM immunizations i
LEFT JOIN patients p ON i.patient = p.id 
WHERE (DATEDIFF('day', p.birthdate, i.date) / 365.25) <= 15
AND EXTRACT(YEAR FROM i.date) = 2019
) 
SELECT description, COUNT(*) AS n 
FROM immunizations_filtered
GROUP BY description
ORDER BY n DESC;
"

result <- dbGetQuery(con, query)
print(result)
                                          DESCRIPTION   n
1  Influenza  seasonal  injectable  preservative free 162
2                                                DTaP  56
3                       Pneumococcal conjugate PCV 13  48
4                                                 IPV  45
5                                       Hib (PRP-OMP)  38
6                                   HPV  quadrivalent  34
7                      Hep B  adolescent or pediatric  32
8                               rotavirus  monovalent  25
9                             Hep A  ped/adol  2 dose  23
10                                          varicella  23
11                                                MMR  23
12                                meningococcal MCV4P  11
13                                               Tdap  11
query <- "
SELECT COUNT(DISTINCT i.patient) AS number_of_children_getting_shots_in_2019
FROM immunizations i
LEFT JOIN patients p ON i.patient = p.id 
WHERE (DATEDIFF('day', p.birthdate, i.date) / 365.25) <= 15
  AND EXTRACT(YEAR FROM i.date) = 2019; "

result <- dbGetQuery(con, query)
print(result)
  number_of_children_getting_shots_in_2019
1                                      173

Save the output table to a local directory.

dbGetQuery(con, "
COPY (
    SELECT 
    i.date, 
    i.patient, 
    i.description, 
    i.base_cost, 
    p.birthdate, 
    DATEDIFF('day', p.birthdate, i.date) / 365.25 AS Age
FROM immunizations i
LEFT JOIN patients p ON i.patient = p.id 
WHERE (DATEDIFF('day', p.birthdate, i.date) / 365.25) <= 15
AND EXTRACT(YEAR FROM i.date) = 2019
) TO '/Users/nnthieu/SyntheaData/immunizations_children.csv' 
WITH CSV HEADER;
")
Warning in dbFetch(rs, n = n, ...): Should not call dbFetch() on results that
do not come from SELECT, got COPY
data frame with 0 columns and 0 rows

Close the connection when done

dbDisconnect(con, shutdown = TRUE)

Visualize the data

Reading the output table from local directory

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.0.4     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
data <- read.csv("/Users/nnthieu/SyntheaData/immunizations_children.csv")
head(data, 5)
                 DATE                              PATIENT
1 2019-08-05 08:20:52 c1f1fcaa-82fd-d5b7-3544-c8f9708b06a8
2 2019-08-06 19:50:58 dc6c06d0-a7d8-100f-c08b-46c93700c188
3 2019-08-06 19:50:58 dc6c06d0-a7d8-100f-c08b-46c93700c188
4 2019-12-01 06:47:10 98e4223a-5ad8-94e0-2d73-ee5203dd4e2b
5 2019-12-01 06:47:10 98e4223a-5ad8-94e0-2d73-ee5203dd4e2b
                                         DESCRIPTION BASE_COST  BIRTHDATE
1 Influenza  seasonal  injectable  preservative free    140.52 2005-07-04
2 Influenza  seasonal  injectable  preservative free    140.52 2006-07-11
3                                  HPV  quadrivalent    140.52 2006-07-11
4                                               Tdap    140.52 2008-11-09
5 Influenza  seasonal  injectable  preservative free    140.52 2008-11-09
       Age
1 14.08624
2 13.07050
3 13.07050
4 11.05818
5 11.05818
library(ggplot2)

ggplot(data, aes(x = reorder(DESCRIPTION, -table(DESCRIPTION)[DESCRIPTION]))) +
  geom_bar(fill = "lightblue") +  # Set the bar color to light blue
  geom_text(stat = "count", aes(label = ..count..), vjust = -0.5) + 
  theme_minimal() +
  labs(title = "NUMBERS OF VACCINE SHOTS for CHILDREN in 2019", x = "DESCRIPTION", y = "Count") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))  # Rotate x-axis labels for readability
Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
ℹ Please use `after_stat(count)` instead.