library(DBI)
library(duckdb)
library(RPostgres)
con <- dbConnect(duckdb::duckdb(), dbdir = "/Users/nnthieu/SyntheaData/SyntheaCovid19/synthea19.duckdb")COVID-19 Reports - 2020 | Postgre SQL in RStudio
Intro
In this post I explore covid-19 patient data from Synthea_Covid19 database using SLQ commands integrated into Rstudio to analyze data.
The reason for using RStudio integrated with SQL is that loading tables is much easier compared to importing them into pgAdmin4.
Data
I downloaded csv files from the Synthea website (https://synthea.mitre.org/downloads) to my directory ’/Users/nnthieu/SyntheaData/SyntheaCovid19/.
Building database
Now I create a database using ‘duckdb’
Now at my directory already have database ‘synthea19.duckdb’. Next, I load PostgreSQL extension in DuckDB
# Load the PostgreSQL extension
dbExecute(con, "INSTALL postgres; LOAD postgres;")[1] 0
Load many CSV Files into DuckDB
library(DBI)
library(tools)
csv_files <- list.files("/Users/nnthieu/SyntheaData/SyntheaCovid19/", pattern = "*.csv", full.names = TRUE)
for (file in csv_files) {
table_name <- tools::file_path_sans_ext(basename(file))
# Drop table if it exists
dbExecute(con, sprintf("DROP TABLE IF EXISTS %s", table_name))
# Create table from CSV
query <- sprintf("CREATE TABLE %s AS SELECT * FROM read_csv_auto('%s')", table_name, file)
dbExecute(con, query)
}
# Verify tables
dbListTables(con) [1] "allergies" "careplans" "conditions"
[4] "devices" "encounters" "imaging_studies"
[7] "immunizations" "medications" "observations"
[10] "organizations" "patients" "payer_transitions"
[13] "payers" "procedures" "providers"
[16] "supplies"
Check if any csv files in the folder containing the word ‘covid’
grep -i -l 'covid' /Users/nnthieu/SyntheaData/SyntheaCovid19/*.csv/Users/nnthieu/SyntheaData/SyntheaCovid19/careplans.csv
/Users/nnthieu/SyntheaData/SyntheaCovid19/conditions.csv
/Users/nnthieu/SyntheaData/SyntheaCovid19/encounters.csv
/Users/nnthieu/SyntheaData/SyntheaCovid19/medications.csv
/Users/nnthieu/SyntheaData/SyntheaCovid19/observations.csv
/Users/nnthieu/SyntheaData/SyntheaCovid19/procedures.csv
There are 6 tables containing ‘covid’. Now I search the ‘encounters’ table for ‘covid’
awk -F',' 'NR==1 {for (i=1; i<=NF; i++) header[i]=$i; next}
{
for (i=1; i<=NF; i++) {
if (tolower($i) ~ /covid/) {
print header[i];
}
}
}' /Users/nnthieu/SyntheaData/SyntheaCovid19/encounters.csv | sort | uniqREASONDESCRIPTION
The column REASONDESCRIPTION contains the word ‘covid’.
dbGetQuery(con, "
SELECT * FROM encounters
WHERE REASONDESCRIPTION = 'COVID-19'
LIMIT 5
;
") Id START STOP
1 b7455838-3607-47f4-aaa5-fd89abea7d29 2020-02-19 06:11:02 2020-03-05 15:37:02
2 60b6cbc7-bbee-4fdd-9283-e8462a56dc34 2020-03-07 13:40:50 2020-03-28 18:09:50
3 7c4dcdf1-2c3b-4d53-a462-b1e5f62d4a82 2020-03-01 03:27:31 2020-03-07 05:40:31
4 4e304862-fd1d-4787-8f44-1b17019c73b2 2020-03-15 01:13:04 2020-04-02 07:06:04
5 f6f6f3f9-631d-4644-bdbe-2fc126c83afd 2020-03-10 05:23:24 2020-03-24 10:24:24
PATIENT ORGANIZATION
1 bd1c4ffc-7f1d-4590-adbb-1d6533fb623e fbf6180e-b800-3ebe-b91d-93d0288c400e
2 3c3b89b1-cb41-4f94-9193-2f3da4fe38e5 d692e283-0833-3201-8e55-4f868a9c0736
3 7e2c6949-dce9-44be-b6ef-50aca95840d7 465de31f-3098-365c-af70-48a071e1f5aa
4 0ab3b40f-3afd-42b8-9755-5834aa42eb67 6f122869-a856-3d65-8db9-099bf4f5bbb8
5 346b2b95-b8c5-4fe1-acf3-94de7e6e965b d692e283-0833-3201-8e55-4f868a9c0736
PROVIDER PAYER
1 7502b497-e32c-32d3-a0b8-f18f540a8a45 4d71f845-a6a9-3c39-b242-14d25ef86a8d
2 fb4c9467-dba4-387c-9148-c8d578fd1edf 047f6ec3-6215-35eb-9608-f9dda363a44c
3 84a692b5-42c9-39cf-9ffa-9a8304e00181 047f6ec3-6215-35eb-9608-f9dda363a44c
4 b394de28-e171-3124-a0d7-3be67982bbbc 047f6ec3-6215-35eb-9608-f9dda363a44c
5 fb4c9467-dba4-387c-9148-c8d578fd1edf d47b3510-2895-3b70-9897-342d681c769d
ENCOUNTERCLASS CODE DESCRIPTION
1 inpatient 1505002 Hospital admission for isolation (procedure)
2 inpatient 1505002 Hospital admission for isolation (procedure)
3 inpatient 1505002 Hospital admission for isolation (procedure)
4 inpatient 1505002 Hospital admission for isolation (procedure)
5 inpatient 1505002 Hospital admission for isolation (procedure)
BASE_ENCOUNTER_COST TOTAL_CLAIM_COST PAYER_COVERAGE REASONCODE
1 129.16 129.16 64.16 840539006
2 129.16 129.16 69.16 840539006
3 129.16 129.16 69.16 840539006
4 129.16 129.16 69.16 840539006
5 129.16 129.16 49.16 840539006
REASONDESCRIPTION
1 COVID-19
2 COVID-19
3 COVID-19
4 COVID-19
5 COVID-19
Analyzing data using postgre SQL
Count total patients in year 2020
dbGetQuery(con, "
SELECT COUNT(DISTINCT PATIENT) AS patients_counts FROM encounters
WHERE EXTRACT(YEAR FROM START) = 2020
; ") patients_counts
1 98400
There are 98,400 patients seeing care in 2020
Count total patients with covid-19 in year 2020
Firstly, I will determine the covid-19 patients in the ‘conditions’ table those with CODE = ‘840539006’ as covid-19 diagnosis.
covid_patient_ids = dbGetQuery(con, "
SELECT DISTINCT PATIENT AS covid_patient_ids FROM conditions
WHERE CODE = '840539006'
AND EXTRACT(YEAR FROM START) = 2020
")
print(head(covid_patient_ids)) covid_patient_ids
1 6442b1ac-59e0-42cb-a75c-83c424da6e10
2 dcbe0bbb-34e5-4110-82ba-221f8cdfc9b9
3 072ec772-ad1f-419a-ac8d-84f82737abcf
4 1682a25a-1cb2-4464-8c6a-baa699818cba
5 dfb4e3e2-9744-4d93-8fdd-f3ca23b141f4
6 16794f3c-66f9-4e14-9f90-7b97e71618de
cat("Number of COVID-19 patients:", nrow(covid_patient_ids), "\n")Number of COVID-19 patients: 88166
So in year 2020, there are 88,166 patients with covid-19 diagnosis.
covid_patient_ids = dbGetQuery(con, "
SELECT DISTINCT PATIENT AS covid_patient_ids FROM encounters
WHERE REASONDESCRIPTION = 'COVID-19'
AND EXTRACT(YEAR FROM START) = 2020
")
print(head(covid_patient_ids)) covid_patient_ids
1 751a782f-1a7b-4c72-8f2e-7e1aa2f3392c
2 1f4b8608-b76a-469c-ba70-31a2649672b2
3 931cc1a0-5cae-482f-aea1-7a906b886c41
4 3f743bfd-6b0b-428f-a11d-f5c4114d9827
5 b0a1a9c7-7c87-4115-84aa-9ed8b8bde753
6 e1730c57-6cea-4831-8140-e15b5618ea40
cat("Number of COVID-19 patients:", nrow(covid_patient_ids), "\n")Number of COVID-19 patients: 18180
However, there are just 18,180 patients with covid-19 diagnosis seeing care via encounters. The reason is that the patients with covid-19 diagnosis are not always seen in the encounters table.
dbGetQuery(con, "
SELECT COUNT(DISTINCT PATIENT) AS covid19_counts FROM encounters
WHERE REASONDESCRIPTION = 'COVID-19'
AND EXTRACT(YEAR FROM START) = 2020
; ") covid19_counts
1 18180
The percentage of COVID-19 patients diagnosed is 89.6% of all patients seen in 2020. A total of 18,180 COVID-19 patients were seen in 2020, which represents 20.6% of all individuals diagnosed with COVID-19.
Count total patients with covid-19 in year 2020 by age group
results = dbGetQuery(con, "
SELECT
age_group,
COUNT(DISTINCT id) AS covid_patient_count,
ROUND(
COUNT(DISTINCT id) * 100.0 /
SUM(COUNT(DISTINCT id)) OVER (),
2
) AS percent_of_total
FROM (
SELECT
p.id,
CASE
WHEN DATEDIFF('day', p.birthdate, c.start) / 365.25 < 18 THEN '0-17'
WHEN DATEDIFF('day', p.birthdate, c.start) / 365.25 BETWEEN 18 AND 34 THEN '18-34'
WHEN DATEDIFF('day', p.birthdate, c.start) / 365.25 BETWEEN 35 AND 49 THEN '35-49'
WHEN DATEDIFF('day', p.birthdate, c.start) / 365.25 BETWEEN 50 AND 64 THEN '50-64'
ELSE '65+'
END AS age_group
FROM conditions c
LEFT JOIN patients p ON c.patient = p.id
WHERE c.code = '840539006'
AND EXTRACT(YEAR FROM c.start) = 2020
) sub
GROUP BY age_group
ORDER BY age_group;
")
print(results) age_group covid_patient_count percent_of_total
1 0-17 17416 19.75
2 18-34 19364 21.96
3 35-49 15351 17.41
4 50-64 17682 20.06
5 65+ 18353 20.82
Visualize number and percent of covid-19 patients by age-group
library(ggplot2)
ggplot(results, aes(x = age_group, y = covid_patient_count)) +
geom_bar(stat = "identity", fill = "#2c7fb8") +
geom_text(
aes(label = covid_patient_count),
vjust = 1.5, # Puts the count inside the bar
color = "white", # White text for count
size = 4
) +
geom_text(
aes(label = paste("(", round(percent_of_total, 2), "%)", sep = "")),
vjust = 3.5, # Puts the percentage below the count
color = "white", # White text for percentage
size = 3
) +
labs(
title = "COVID Patients by Age Group in 2020",
x = "Age Group",
y = "Number of Patients"
) +
theme_minimal() +
theme(text = element_text(size = 12))hospitalized patients for covid-19
Number of patients hospitalized because of covid-19.
dbGetQuery(con, "
SELECT COUNT(*) AS hospitalized_patients,
ROUND(100.0 * COUNT(*) / (SELECT COUNT(DISTINCT patient)
FROM conditions
WHERE code = '840539006'), 2) AS percent_of_covid_hospitalized_patients
FROM encounters
WHERE REASONCODE = '840539006' AND CODE = '1505002' ;
") hospitalized_patients percent_of_covid_hospitalized_patients
1 18177 20.62
There are 69,989 covid-19 patients isolated at home.
Patients recovered from covid-19 whose covid-19 status is negative after home isolation.
Among 69,989 home-isolated patients, 437 patients are recovered from covid-19.
dbGetQuery(con, "
SELECT COUNT(DISTINCT c.patient) AS recovered_patients,
ROUND(100.0 * COUNT(*) / (SELECT COUNT(DISTINCT patient)
FROM conditions
WHERE code = '840539006'), 2) AS percent_of_home_covid_recovered_patients
FROM careplans c
LEFT JOIN observations o ON c.patient = o.patient
WHERE c.code = '736376001'
AND c.reasoncode = '840539006'
AND c.stop IS NOT NULL
AND o.code = '94531-1'
AND o.value = 'Not detected (qualifier value)';
") recovered_patients percent_of_home_covid_recovered_patients
1 437 0.99
Health outcomes experienced by COVID-19 patients
dbGetQuery(con, "
WITH covid_patients AS (
SELECT DISTINCT patient
FROM conditions
WHERE code = '840539006'
),
outcome_patients AS (
SELECT DISTINCT patient,
CASE
WHEN description ILIKE '%sepsis%' THEN 'Sepsis'
WHEN description ILIKE '%septic shock%' THEN 'Septic Shock'
WHEN description ILIKE '%respiratory failure%' THEN 'Respiratory Failure'
WHEN description ILIKE '%Respiratory distress%' THEN 'Respiratory distress'
WHEN description ILIKE '%acute cardiac injury%' THEN 'Acute Cardiac Injury'
WHEN description ILIKE '%heart failure%' THEN 'Heart Failure'
WHEN description ILIKE '%pneumonia%' THEN 'Pneumonia'
WHEN description ILIKE '%acute kidney failure%' THEN 'Acute Kidney Failure'
WHEN description ILIKE '%Acute pulmonary embolism%' THEN 'Acute pulmonary embolism'
END AS outcome
FROM conditions
WHERE description ILIKE '%sepsis%'
OR description ILIKE '%septic shock%'
OR description ILIKE '%respiratory failure%'
OR description ILIKE '%Respiratory distress%'
OR description ILIKE '%acute cardiac injury%'
OR description ILIKE '%heart failure%'
OR description ILIKE '%pneumonia%'
OR description ILIKE '%acute kidney failure%'
OR description ILIKE '%Acute pulmonary embolism%'
),
covid_with_outcomes AS (
SELECT cp.patient, op.outcome
FROM covid_patients cp
JOIN outcome_patients op ON cp.patient = op.patient
)
SELECT
outcome,
COUNT(DISTINCT patient) AS patient_count,
ROUND(100.0 * COUNT(DISTINCT patient) / (SELECT COUNT(*) FROM covid_patients), 2) AS percent_of_covid_patients
FROM covid_with_outcomes
GROUP BY outcome
ORDER BY percent_of_covid_patients DESC;
") outcome patient_count percent_of_covid_patients
1 Pneumonia 18187 20.63
2 Respiratory distress 18175 20.61
3 Respiratory Failure 8699 9.87
4 Sepsis 6939 7.87
5 Acute pulmonary embolism 5726 6.49
6 Heart Failure 1943 2.20
7 Septic Shock 1746 1.98
Pneumonia and respiratory distress are most common outcomes among COVID-19 patients, with 1,200 and 1,100 patients respectively.
Labs for COVID-19 patients
labs_covid19 <- dbGetQuery(con, "
SELECT
o.date,
o.patient,
o.encounter,
o.code,
o.value,
o.units,
CASE o.code
WHEN '48065-7' THEN 'D-dimer'
WHEN '2276-4' THEN 'Serum Ferritin'
WHEN '89579-7' THEN 'High Sensitivity Cardiac Troponin I'
WHEN '26881-3' THEN 'IL-6'
WHEN '731-0' THEN 'Lymphocytes'
WHEN '14804-9' THEN 'Lactate dehydrogenase'
ELSE NULL
END AS test,
c.start,
o.date - c.start AS covid_days,
c.code AS con_code,
c.description AS con_description,
CASE
WHEN p.deathdate IS NULL THEN 'survivor'
ELSE 'die'
END AS survivor
FROM observations o
LEFT JOIN conditions c ON o.patient = c.patient
LEFT JOIN patients p ON o.patient = p.id
WHERE c.code = '840539006'
AND o.code IN ('48065-7', '26881-3', '2276-4', '89579-7', '731-0', '14804-9')
")
head(labs_covid19, 5) DATE PATIENT
1 2020-03-05 ff086df1-3ea4-46d0-a36c-fde003552038
2 2020-03-05 ff086df1-3ea4-46d0-a36c-fde003552038
3 2020-03-05 ff086df1-3ea4-46d0-a36c-fde003552038
4 2020-03-05 ff086df1-3ea4-46d0-a36c-fde003552038
5 2020-03-05 ff086df1-3ea4-46d0-a36c-fde003552038
ENCOUNTER CODE VALUE UNITS
1 38a24171-27b6-48e2-a2f3-ceea6057ded0 731-0 1.0 10*3/uL
2 38a24171-27b6-48e2-a2f3-ceea6057ded0 48065-7 0.3 ug/mL
3 38a24171-27b6-48e2-a2f3-ceea6057ded0 2276-4 443.1 ug/L
4 38a24171-27b6-48e2-a2f3-ceea6057ded0 89579-7 1.7 pg/mL
5 38a24171-27b6-48e2-a2f3-ceea6057ded0 14804-9 249.1 U/L
test START covid_days con_code
1 Lymphocytes 2020-03-05 0 840539006
2 D-dimer 2020-03-05 0 840539006
3 Serum Ferritin 2020-03-05 0 840539006
4 High Sensitivity Cardiac Troponin I 2020-03-05 0 840539006
5 Lactate dehydrogenase 2020-03-05 0 840539006
con_description survivor
1 COVID-19 survivor
2 COVID-19 survivor
3 COVID-19 survivor
4 COVID-19 survivor
5 COVID-19 survivor
library(ggplot2)
options(repr.plot.width = 21, repr.plot.height = 17)
ggplot(labs_covid19, aes(x = as.factor(covid_days), y = as.numeric(VALUE), fill = survivor)) +
geom_boxplot(outlier.alpha = 0.3) +
facet_wrap(~ test, scales = "free_y", ncol = 2, nrow = 3) +
labs(
title = "Distribution of Lab Values by Days Since COVID-19 Diagnosis",
x = "Days Since COVID-19",
y = "Test Result Value"
) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1, size = 7),
strip.text = element_text(face = "bold")
)library(ggplot2)
ggplot(labs_covid19, aes(x = as.numeric(covid_days), y = as.numeric(VALUE), color = survivor)) +
geom_smooth(method = "loess", se = FALSE, span = 0.5) +
facet_wrap(~ test, scales = "free_y", ncol = 2, nrow = 3) +
labs(
title = "Smoothed Lab Value Trends by Survival Status",
x = "Days Since COVID-19",
y = "Test Result Value",
color = "Survival Status"
) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1, size = 7),
strip.text = element_text(face = "bold")
)`geom_smooth()` using formula = 'y ~ x'
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
: pseudoinverse used at 4
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
: neighborhood radius 2
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
: reciprocal condition number 7.7689e-16
Symptoms experienced by COVID-19 patients
symptoms = dbGetQuery(con, "
WITH covid_patients AS (
SELECT DISTINCT patient
FROM conditions
WHERE code = '840539006'
),
patient_symptoms AS (
SELECT c.description
FROM conditions c
JOIN covid_patients cp ON c.patient = cp.patient
WHERE LOWER(c.description) NOT IN (
'covid-19',
'suspected covid-19',
'body mass index 30+ - obesity (finding)',
'prediabetes',
'hypertension'
)
AND LOWER(c.description) NOT LIKE '%chronic%'
AND LOWER(c.description) NOT LIKE '%anemia%'
AND LOWER(c.description) NOT LIKE '%Hypoxemia%'
)
SELECT
description,
COUNT(*) AS symptom_count,
ROUND(100.0 * COUNT(*) / (SELECT COUNT(*) FROM patient_symptoms), 2) AS percent
FROM patient_symptoms
GROUP BY description
ORDER BY symptom_count DESC
LIMIT 14;
")
symptoms DESCRIPTION symptom_count percent
1 Fever (finding) 78341 12.54
2 Cough (finding) 59766 9.57
3 Loss of taste (finding) 44646 7.15
4 Fatigue (finding) 33855 5.42
5 Sputum finding (finding) 29666 4.75
6 Pneumonia (disorder) 18175 2.91
7 Respiratory distress (finding) 18175 2.91
8 Hypoxemia (disorder) 18175 2.91
9 Dyspnea (finding) 17516 2.80
10 Wheezing (finding) 17516 2.80
11 Miscarriage in first trimester 14972 2.40
12 Joint pain (finding) 13290 2.13
13 Muscle pain (finding) 13290 2.13
14 Sore throat symptom (finding) 12397 1.98
Length of symptoms in days
symptom_length <- dbGetQuery(con, "
WITH covid_patients AS (
SELECT DISTINCT patient
FROM conditions
WHERE code = '840539006'
)
SELECT
c.description,
ROUND(AVG(EXTRACT(DAY FROM c.stop) - EXTRACT(DAY FROM c.start)), 2) AS avg_duration
FROM conditions c
JOIN covid_patients cp ON c.patient = cp.patient
WHERE c.stop IS NOT NULL AND c.start IS NOT NULL
AND c.description IN (
'Fever (finding)',
'Cough (finding)' ,
'Loss of taste (finding)' ,
'Fatigue (finding)',
'Sputum finding (finding)'
)
GROUP BY c.description
ORDER BY avg_duration DESC;
")
print(symptom_length) DESCRIPTION avg_duration
1 Sputum finding (finding) 4.90
2 Loss of taste (finding) 4.78
3 Fever (finding) 4.77
4 Fatigue (finding) 4.77
5 Cough (finding) 4.76
Health outcomes experienced by COVID-19 patients
Due to the heavy load of data tables, I split the code into three parts. The first part counts the number of deaths and recoveries among patients diagnosed with COVID-19 in 2020. The second part analyzes health outcomes such as isolation and hospital admission. The third part examines the number of ICU admissions and ventilations. Finally, I combine all the results into a summary table.
Outcomes: Recovered + Death
outcomes1 = dbGetQuery(con, "
WITH outcomes1 AS (
SELECT
DISTINCT c.patient,
CASE
WHEN p.deathdate IS NOT NULL THEN 'no'
WHEN o.code = '94531-1' AND o.value = 'Not detected (qualifier value)' THEN 'yes'
ELSE 'unknown'
END AS recovered,
CASE
WHEN p.deathdate IS NOT NULL THEN 'yes'
ELSE 'no'
END AS death
FROM conditions c
LEFT JOIN patients p ON c.patient = p.id
LEFT JOIN observations o ON c.patient = o.patient
WHERE c.code = '840539006'
)
SELECT
'recovered' AS category,
recovered AS value,
COUNT(*) AS count,
ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER (), 2) AS percent
FROM outcomes1
GROUP BY recovered
UNION ALL
SELECT
'death' AS category,
death AS value,
COUNT(*) AS count,
ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER (), 2) AS percent
FROM outcomes1
GROUP BY death;
")
outcomes1 category value count percent
1 recovered unknown 84525 82.23
2 recovered yes 14629 14.23
3 recovered no 3641 3.54
4 death yes 3641 3.54
5 death no 99154 96.46
Outcomes: Isolation + Admission
outcomes2 = dbGetQuery(con, "
WITH outcomes2 AS (
SELECT
c.patient,
CASE
WHEN EXISTS (
SELECT 1
FROM careplans cp
WHERE cp.patient = c.patient
AND cp.code = '736376001'
AND cp.reasoncode = '840539006'
AND cp.stop IS NOT NULL
) THEN 'yes'
ELSE 'no'
END AS home_isolated,
CASE
WHEN EXISTS (
SELECT 1
FROM encounters e
WHERE e.patient = c.patient
AND e.reasoncode = '840539006'
AND e.code = '1505002'
) THEN 'yes'
ELSE 'no'
END AS hospital_admitted
FROM conditions c
WHERE c.code = '840539006'
GROUP BY c.patient
)
SELECT
'home_isolated' AS category,
home_isolated AS value,
COUNT(*) AS count,
ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER (), 2) AS percent
FROM outcomes2
GROUP BY home_isolated
UNION ALL
SELECT
'hospital_admitted' AS category,
hospital_admitted AS value,
COUNT(*) AS count,
ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER (), 2) AS percent
FROM outcomes2
GROUP BY hospital_admitted;
")
outcomes2 category value count percent
1 home_isolated yes 70398 79.85
2 home_isolated no 17768 20.15
3 hospital_admitted yes 18175 20.61
4 hospital_admitted no 69991 79.39
hospital admission by age groups
library(dplyr)
Attaching package: 'dplyr'
The following objects are masked from 'package:stats':
filter, lag
The following objects are masked from 'package:base':
intersect, setdiff, setequal, union
hospitalized_age = dbGetQuery(con, "
WITH patient_outcomes AS (
SELECT
p.id AS patient,
CASE
WHEN DATEDIFF('day', p.birthdate, c.start) / 365.25 < 18 THEN '0-17'
WHEN DATEDIFF('day', p.birthdate, c.start) / 365.25 BETWEEN 18 AND 34 THEN '18-34'
WHEN DATEDIFF('day', p.birthdate, c.start) / 365.25 BETWEEN 35 AND 49 THEN '35-49'
WHEN DATEDIFF('day', p.birthdate, c.start) / 365.25 BETWEEN 50 AND 64 THEN '50-64'
ELSE '65+'
END AS age_group,
CASE
WHEN EXISTS (
SELECT 1
FROM encounters e
WHERE e.patient = p.id
AND e.reasoncode = '840539006'
AND e.code = '1505002'
) THEN 'yes'
ELSE 'no'
END AS hospital_admitted
FROM patients p
JOIN (
SELECT DISTINCT patient, start
FROM conditions
WHERE code = '840539006'
) c ON c.patient = p.id
)
SELECT
age_group,
hospital_admitted,
COUNT(*) AS count,
ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER (PARTITION BY age_group), 2) AS percent
FROM patient_outcomes
GROUP BY age_group, hospital_admitted
ORDER BY age_group, hospital_admitted;
")
hospitalized_age <- hospitalized_age %>%
filter(hospital_admitted != "no")
hospitalized_age age_group hospital_admitted count percent
1 0-17 yes 976 5.60
2 18-34 yes 3106 16.04
3 35-49 yes 2744 17.88
4 50-64 yes 5557 31.43
5 65+ yes 5792 31.56
library(ggplot2)
ggplot(hospitalized_age, aes(x = age_group, y = count)) +
geom_bar(stat = "identity", fill = "lightblue") +
geom_text(
aes(label = round(count, 2)),
vjust = 1.5, # Puts the count inside the bar
color = "red", # White text for count
size = 4
) +
geom_text(
aes(label = paste("(", round(percent, 2), "%)", sep = "")),
vjust = 3.5, # Puts the percentage below the count
color = "blue", # White text for percentage
size = 3
) +
labs(
title = "Covid-19 Hospital Admissions by Age Group in 2020",
x = "Age Group",
y = "Total Admissions"
) +
theme_minimal() +
theme(text = element_text(size = 12))Outcomes: ICU Admission + Ventilated
outcomes3 = dbGetQuery(con, "
WITH outcomes3 AS (
SELECT
DISTINCT c.patient,
CASE
WHEN e.code = '305351004' THEN 'yes'
ELSE 'no'
END AS ICU_admitted,
CASE
WHEN pr.code = '26763009' THEN 'yes'
ELSE 'no'
END AS ventilated
FROM conditions c
LEFT JOIN encounters e ON c.patient = e.patient
LEFT JOIN procedures pr ON c.patient = pr.patient
WHERE c.code = '840539006'
)
SELECT
'ICU_admitted' AS category,
ICU_admitted AS value,
COUNT(*) AS count,
ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER (), 2) AS percent
FROM outcomes3
GROUP BY ICU_admitted
UNION ALL
SELECT
'ventilated' AS category,
ventilated AS value,
COUNT(*) AS count,
ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER (), 2) AS percent
FROM outcomes3
GROUP BY ventilated;
")
outcomes3 category value count percent
1 ICU_admitted yes 6591 6.75
2 ICU_admitted no 91080 93.25
3 ventilated yes 5828 5.97
4 ventilated no 91843 94.03
Combine Summary
library(dplyr)
final_table <- bind_rows(outcomes1, outcomes2, outcomes3)
final_table <- final_table %>%
filter(value != "no")
final_table category value count percent
1 recovered unknown 84525 82.23
2 recovered yes 14629 14.23
3 death yes 3641 3.54
4 home_isolated yes 70398 79.85
5 hospital_admitted yes 18175 20.61
6 ICU_admitted yes 6591 6.75
7 ventilated yes 5828 5.97
Recovered ‘unkown’ means not death but have no negative results of the confirmed covid19 tests.
Deaths with COVID-19
dbGetQuery(con, "
WITH covid_patients AS (
SELECT DISTINCT patient
FROM conditions
WHERE code = '840539006'
)
SELECT
COUNT(DISTINCT p.id) AS death_count,
ROUND(100.0 * COUNT(DISTINCT p.id) / (SELECT COUNT(*) FROM covid_patients), 2) AS percent_of_covid_patients_deaths
FROM patients p
JOIN conditions c ON p.id = c.patient
WHERE p.deathdate IS NOT NULL
AND c.code = '840539006'
") death_count percent_of_covid_patients_deaths
1 3641 4.13
The percentage of COVID-19 patients who died in 2020 is 4.13%.
Deaths with COVID-19 by age-group
age_deaths = dbGetQuery(con, "
SELECT
CASE
WHEN age < 18 THEN '0-17'
WHEN age BETWEEN 18 AND 34 THEN '18-34'
WHEN age BETWEEN 35 AND 49 THEN '35-49'
WHEN age BETWEEN 50 AND 64 THEN '50-64'
ELSE '65+'
END AS age_group,
COUNT(DISTINCT id) AS death_count,
ROUND(100.0 * COUNT(DISTINCT id) / (SELECT COUNT(DISTINCT patient)
FROM conditions
WHERE code = '840539006'), 2) AS percent_of_covid_patients
FROM (
SELECT
p.id,
DATE_PART('year', c.start) - DATE_PART('year', p.birthdate) AS age
FROM patients p
JOIN conditions c ON p.id = c.patient
WHERE c.code = '840539006'
AND p.deathdate IS NOT NULL
AND EXTRACT(YEAR FROM c.start) = 2020
) AS covid_deaths
GROUP BY age_group
ORDER BY age_group;
")
print(age_deaths) age_group death_count percent_of_covid_patients
1 0-17 6 0.01
2 18-34 166 0.19
3 35-49 206 0.23
4 50-64 819 0.93
5 65+ 2444 2.77
Visualize deaths with COVID-19 by age-group
ggplot(age_deaths, aes(x = age_group, y = death_count)) +
geom_bar(stat = "identity", fill = "gray") +
geom_text(
aes(label = round(death_count, 2)),
vjust = 1.5, # Puts the count inside the bar
color = "black", # White text for count
size = 4
) +
geom_text(
aes(label = paste("(", round(percent_of_covid_patients, 2), "%)", sep = "")),
vjust = 3.5, # Puts the percentage below the count
color = "blue", # White text for percentage
size = 3
) +
labs(
title = "Covid-19 Deaths by Age Group in 2020",
x = "Age Group",
y = "Total Deaths"
) +
theme_minimal() +
theme(text = element_text(size = 12))Costs for treating COVID-19
dbGetQuery(con, "
SELECT SUM(TOTALCOST) AS total_cost
FROM medications
WHERE REASONDESCRIPTION = 'COVID-19'
AND EXTRACT(YEAR FROM START) = 2020
") total_cost
1 770444.8
Calculate costs for each age-group
cost_covid19 = dbGetQuery(con, "
SELECT
age_group,
SUM(TOTALCOST) AS total_cost,
ROUND(
SUM(TOTALCOST) * 100.0 / SUM(SUM(TOTALCOST)) OVER (),
2
) AS percent
FROM (
SELECT
m.TOTALCOST,
CASE
WHEN DATEDIFF('day', p.birthdate, c.start) / 365.25 < 18 THEN '0-17'
WHEN DATEDIFF('day', p.birthdate, c.start) / 365.25 BETWEEN 18 AND 34 THEN '18-34'
WHEN DATEDIFF('day', p.birthdate, c.start) / 365.25 BETWEEN 35 AND 49 THEN '35-49'
WHEN DATEDIFF('day', p.birthdate, c.start) / 365.25 BETWEEN 50 AND 64 THEN '50-64'
ELSE '65+'
END AS age_group
FROM medications m
JOIN conditions c ON m.encounter = c.encounter
JOIN patients p ON c.patient = p.id
WHERE m.REASONDESCRIPTION ILIKE '%covid%'
AND EXTRACT(YEAR FROM m.START) = 2020
) sub
GROUP BY age_group
ORDER BY age_group;
")
print(cost_covid19) age_group total_cost percent
1 18-34 662677.4 20.15
2 35-49 560706.7 17.05
3 50-64 1155403.7 35.14
4 65+ 909304.0 27.65
Visualize costs for treating covid-19 patients by age-group
ggplot(cost_covid19, aes(x = age_group, y = total_cost)) +
geom_bar(stat = "identity", fill = "lightblue") +
geom_text(
aes(label = round(total_cost, 2)),
vjust = 1.5, # Puts the count inside the bar
color = "blue", # White text for count
size = 4
) +
geom_text(
aes(label = paste("(", round(percent, 2), "%)", sep = "")),
vjust = 3.5,
color = "blue",
size = 3
) +
labs(
title = "Costs for Treating COVID Patients by Age Group in 2020",
x = "Age Group",
y = "Total Costs"
) +
theme_minimal() +
theme(text = element_text(size = 12))Conclusion
The analysis of COVID-19 patients in the Synthea database for the year 2020 reveals several key findings:
- A total of 98,400 patients were seen in 2020, with 88,166 diagnosed with COVID-19.
- Among the diagnosed patients, 18,180 were seen in encounters, indicating that not all diagnosed patients sought care.
- The majority of COVID-19 patients were in the 18-34 age group, but a significant percentage being hospitalized and deaths at age 65+, 31.56% and 2.77%, respectively.
- The most common health outcomes included pneumonia and respiratory distress.
- The analysis of lab values showed trends in D-dimer, IL-6, and other markers over time.
- The costs associated with treating COVID-19 patients varied by age group, with older patients incurring higher costs.
- The summary of health outcomes indicated that a significant percentage of patients experienced isolation, hospitalization, and ICU admission.
- The analysis of symptoms revealed a range of experiences among COVID-19 patients, with some patients recovering and others facing severe outcomes.
- The findings highlight the importance of monitoring and understanding the long-term effects of COVID-19 on patients’ health and healthcare costs.