library(DBI)
library(duckdb)
library(RPostgres)
<- dbConnect(duckdb::duckdb(), dbdir = "/Users/nnthieu/SyntheaData/synthea.duckdb") con
Children Immunization Reports - Postgre SQL in RStudio
Intro
In this post I demonstrate how to use SLQ commands integrated into Rstudio to analyze data. I use SQL to create tables then save to local directory and read them to RStudio to analyze data.
The reason for using RStudio integrated with SQL is that loading tables is much easier compared to importing them into pgAdmin4.
Data
I downloaded csv files from the Synthea website (https://synthea.mitre.org/downloads) to my directory ’/Users/nnthieu/SyntheaData/.
Building database
Now I create a database using ‘duckdb’
Now at my directory already have database ‘synthea.duckdb’. Next, I load PostgreSQL extension in DuckDB
# Load the PostgreSQL extension
dbExecute(con, "INSTALL postgres; LOAD postgres;")
[1] 0
Load many CSV Files into DuckDB
library(DBI)
library(tools)
<- list.files("/Users/nnthieu/SyntheaData/", pattern = "*.csv", full.names = TRUE)
csv_files
for (file in csv_files) {
<- tools::file_path_sans_ext(basename(file))
table_name
# Drop table if it exists
dbExecute(con, sprintf("DROP TABLE IF EXISTS %s", table_name))
# Create table from CSV
<- sprintf("CREATE TABLE %s AS SELECT * FROM read_csv_auto('%s')", table_name, file)
query dbExecute(con, query)
}
# Verify tables
dbListTables(con)
[1] "allergies" "careplans" "claims"
[4] "claims_transactions" "conditions" "devices"
[7] "encounters" "imaging_studies" "immunizations"
[10] "immunizations_children" "medications" "observations"
[13] "organizations" "patients" "payer_transitions"
[16] "payers" "procedures" "providers"
[19] "supplies"
Analyzing data using postgre SQL
Check if SQL is working with the ‘Immunizations’ table, joined with the ‘Patients’ table, and creating new columns for ‘Age’ and ‘Birthdate’ to set data for children in the year 2019.
dbGetQuery(con, "
SELECT
i.date,
i.patient,
i.description,
i.base_cost,
p.birthdate,
DATEDIFF('day', p.birthdate, i.date) / 365.25 AS Age
FROM immunizations i
LEFT JOIN patients p ON i.patient = p.id
WHERE (DATEDIFF('day', p.birthdate, i.date) / 365.25) <= 15
AND EXTRACT(YEAR FROM i.date) = 2019
LIMIT 10;
")
DATE PATIENT
1 2019-08-05 08:20:52 c1f1fcaa-82fd-d5b7-3544-c8f9708b06a8
2 2019-08-06 19:50:58 dc6c06d0-a7d8-100f-c08b-46c93700c188
3 2019-08-06 19:50:58 dc6c06d0-a7d8-100f-c08b-46c93700c188
4 2019-12-01 06:47:10 98e4223a-5ad8-94e0-2d73-ee5203dd4e2b
5 2019-12-01 06:47:10 98e4223a-5ad8-94e0-2d73-ee5203dd4e2b
6 2019-12-01 06:47:10 98e4223a-5ad8-94e0-2d73-ee5203dd4e2b
7 2019-12-01 06:47:10 98e4223a-5ad8-94e0-2d73-ee5203dd4e2b
8 2019-02-24 04:46:41 122b4063-18dc-f20f-204c-6f2da758717b
9 2019-02-24 04:46:41 122b4063-18dc-f20f-204c-6f2da758717b
10 2019-02-24 04:46:41 122b4063-18dc-f20f-204c-6f2da758717b
DESCRIPTION BASE_COST BIRTHDATE
1 Influenza seasonal injectable preservative free 140.52 2005-07-04
2 Influenza seasonal injectable preservative free 140.52 2006-07-11
3 HPV quadrivalent 140.52 2006-07-11
4 Tdap 140.52 2008-11-09
5 Influenza seasonal injectable preservative free 140.52 2008-11-09
6 HPV quadrivalent 140.52 2008-11-09
7 meningococcal MCV4P 140.52 2008-11-09
8 Hib (PRP-OMP) 140.52 2018-09-16
9 rotavirus monovalent 140.52 2018-09-16
10 IPV 140.52 2018-09-16
Age
1 14.086242
2 13.070500
3 13.070500
4 11.058179
5 11.058179
6 11.058179
7 11.058179
8 0.440794
9 0.440794
10 0.440794
Counting numbers of shotss by descriptions
library(DBI)
<- "
query WITH immunizations_filtered AS (
SELECT
i.date,
i.patient,
i.description,
i.base_cost,
p.birthdate,
DATEDIFF('day', p.birthdate, i.date) / 365.25 AS Age
FROM immunizations i
LEFT JOIN patients p ON i.patient = p.id
WHERE (DATEDIFF('day', p.birthdate, i.date) / 365.25) <= 15
AND EXTRACT(YEAR FROM i.date) = 2019
)
SELECT description, COUNT(*) AS n
FROM immunizations_filtered
GROUP BY description
ORDER BY n DESC;
"
<- dbGetQuery(con, query)
result print(result)
DESCRIPTION n
1 Influenza seasonal injectable preservative free 162
2 DTaP 56
3 Pneumococcal conjugate PCV 13 48
4 IPV 45
5 Hib (PRP-OMP) 38
6 HPV quadrivalent 34
7 Hep B adolescent or pediatric 32
8 rotavirus monovalent 25
9 varicella 23
10 MMR 23
11 Hep A ped/adol 2 dose 23
12 Tdap 11
13 meningococcal MCV4P 11
<- "
query SELECT COUNT(DISTINCT i.patient) AS number_of_children_getting_shots_in_2019
FROM immunizations i
LEFT JOIN patients p ON i.patient = p.id
WHERE (DATEDIFF('day', p.birthdate, i.date) / 365.25) <= 15
AND EXTRACT(YEAR FROM i.date) = 2019; "
<- dbGetQuery(con, query)
result print(result)
number_of_children_getting_shots_in_2019
1 173
Save the output table to a local directory.
dbGetQuery(con, "
COPY (
SELECT
i.date,
i.patient,
i.description,
i.base_cost,
p.birthdate,
DATEDIFF('day', p.birthdate, i.date) / 365.25 AS Age
FROM immunizations i
LEFT JOIN patients p ON i.patient = p.id
WHERE (DATEDIFF('day', p.birthdate, i.date) / 365.25) <= 15
AND EXTRACT(YEAR FROM i.date) = 2019
) TO '/Users/nnthieu/SyntheaData/immunizations_children.csv'
WITH CSV HEADER;
")
Warning in dbFetch(rs, n = n, ...): Should not call dbFetch() on results that
do not come from SELECT, got COPY
data frame with 0 columns and 0 rows
Close the connection when done
dbDisconnect(con, shutdown = TRUE)
Visualize the data
Reading the output table from local directory
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.4 ✔ readr 2.1.5
✔ forcats 1.0.0 ✔ stringr 1.5.1
✔ ggplot2 3.5.1 ✔ tibble 3.2.1
✔ lubridate 1.9.4 ✔ tidyr 1.3.1
✔ purrr 1.0.4
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
<- read.csv("/Users/nnthieu/SyntheaData/immunizations_children.csv")
data head(data, 5)
DATE PATIENT
1 2019-08-05 08:20:52 c1f1fcaa-82fd-d5b7-3544-c8f9708b06a8
2 2019-08-06 19:50:58 dc6c06d0-a7d8-100f-c08b-46c93700c188
3 2019-08-06 19:50:58 dc6c06d0-a7d8-100f-c08b-46c93700c188
4 2019-12-01 06:47:10 98e4223a-5ad8-94e0-2d73-ee5203dd4e2b
5 2019-12-01 06:47:10 98e4223a-5ad8-94e0-2d73-ee5203dd4e2b
DESCRIPTION BASE_COST BIRTHDATE
1 Influenza seasonal injectable preservative free 140.52 2005-07-04
2 Influenza seasonal injectable preservative free 140.52 2006-07-11
3 HPV quadrivalent 140.52 2006-07-11
4 Tdap 140.52 2008-11-09
5 Influenza seasonal injectable preservative free 140.52 2008-11-09
Age
1 14.08624
2 13.07050
3 13.07050
4 11.05818
5 11.05818
library(ggplot2)
ggplot(data, aes(x = reorder(DESCRIPTION, -table(DESCRIPTION)[DESCRIPTION]))) +
geom_bar(fill = "lightblue") + # Set bar color to light blue
geom_text(stat = "count", aes(label = ..count..), vjust = 1.5, color = "black", size = 3) + # Place labels inside bars
theme_minimal() +
labs(title = "NUMBERS OF VACCINE SHOTS for CHILDREN in 2019",
x = "DESCRIPTION",
y = "Count") +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) # Rotate x-axis labels for readability
Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
ℹ Please use `after_stat(count)` instead.