library(DBI)
library(duckdb)
library(RPostgres)
<- dbConnect(duckdb::duckdb(), dbdir = "/Users/nnthieu/SyntheaData/synthea.duckdb") con
Financial Analysis - 2019 | Postgre SQL in RStudio
Intro
This post is for analyzing financial data using SQL in RStudio.
Data
I downloaded csv files from the Synthea website (https://synthea.mitre.org/downloads) to my directory ’/Users/nnthieu/SyntheaData/.
Building database
Now I create a database using ‘duckdb’
Now at my directory already have database ‘synthea.duckdb’. Next, I load PostgreSQL extension in DuckDB
# Load the PostgreSQL extension
dbExecute(con, "INSTALL postgres; LOAD postgres;")
[1] 0
Load many CSV Files into DuckDB
library(DBI)
library(tools)
<- list.files("/Users/nnthieu/SyntheaData/", pattern = "*.csv", full.names = TRUE)
csv_files
for (file in csv_files) {
<- tools::file_path_sans_ext(basename(file))
table_name
# Drop table if it exists
dbExecute(con, sprintf("DROP TABLE IF EXISTS %s", table_name))
# Create table from CSV
<- sprintf("CREATE TABLE %s AS SELECT * FROM read_csv_auto('%s')", table_name, file)
query dbExecute(con, query)
}
# Verify tables
dbListTables(con)
[1] "allergies" "careplans" "claims"
[4] "claims_transactions" "conditions" "devices"
[7] "encounters" "imaging_studies" "immunizations"
[10] "immunizations_children" "medications" "observations"
[13] "organizations" "patients" "payer_transitions"
[16] "payers" "procedures" "providers"
[19] "supplies"
Analyzing data using postgre SQL
Number of patients and cost average
dbGetQuery(con, "
SELECT
COUNT(DISTINCT patientid) AS patient_count,
AVG(total_charges) AS avg_cost_per_patient
FROM (
SELECT patientid, SUM(AMOUNT) AS total_charges
FROM claims_transactions
WHERE TYPE = 'CHARGE'
AND EXTRACT(YEAR FROM FROMDATE) = 2019
GROUP BY patientid
) ;
")
patient_count avg_cost_per_patient
1 922 20042.12
Total charge in the year 2019
dbGetQuery(con, "
SELECT
SUM(AMOUNT) AS total_revenue
FROM claims_transactions
WHERE TYPE = 'CHARGE'
AND EXTRACT(YEAR FROM FROMDATE) = 2019;
")
total_revenue
1 18478833
Monthly charge
<- "
query SELECT
DATE_TRUNC('month', FROMDATE) AS month,
SUM(AMOUNT) AS monthly_charge
FROM claims_transactions
WHERE TYPE = 'CHARGE' AND EXTRACT(YEAR FROM FROMDATE) = 2019
GROUP BY month
ORDER BY month;
"
<- dbGetQuery(con, query)
result print(result)
month monthly_charge
1 2019-01-01 1313310
2 2019-02-01 1422386
3 2019-03-01 1550802
4 2019-04-01 1505801
5 2019-05-01 1753439
6 2019-06-01 1269097
7 2019-07-01 1987173
8 2019-08-01 1619654
9 2019-09-01 1590527
10 2019-10-01 1335336
11 2019-11-01 1565859
12 2019-12-01 1565449
Monthly charge plot
library(ggplot2)
$month <- as.Date(result$month) # Ensure it's in date format
resultggplot(result, aes(x = month, y = monthly_charge)) +
geom_bar(stat = "identity", fill = "steelblue") +
geom_text(aes(label = round(monthly_charge, 0)),
hjust = 1, vjust = 1, angle = 90, color = "white", size = 3.5) +
labs(title = "Monthly Charge (2019)", x = "Month", y = "Revenue") +
theme_minimal() +
scale_x_date(date_labels = "%b %Y", date_breaks = "1 month") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
Monthly revenue
<- "
query2 SELECT
DATE_TRUNC('month', FROMDATE) AS month,
SUM(PAYMENTS) AS monthly_revenue
FROM claims_transactions
WHERE TYPE = 'PAYMENT' AND EXTRACT(YEAR FROM FROMDATE) = 2019
GROUP BY month
ORDER BY month;
"
<- dbGetQuery(con, query2)
result2 print(result2)
month monthly_revenue
1 2019-01-01 1315065
2 2019-02-01 1425222
3 2019-03-01 1552219
4 2019-04-01 1507599
5 2019-05-01 1755629
6 2019-06-01 1271131
7 2019-07-01 1989384
8 2019-08-01 1621383
9 2019-09-01 1593102
10 2019-10-01 1336772
11 2019-11-01 1567647
12 2019-12-01 1567189
Monthly revenue plot
library(ggplot2)
$month <- as.Date(result2$month) # Ensure it's in date format
result2ggplot(result2, aes(x = month, y = monthly_revenue)) +
geom_bar(stat = "identity", fill = "lightblue") +
geom_text(aes(label = round(monthly_revenue, 0)),
hjust = 1, vjust = 1, angle = 90, color = "blue", size = 3.5) +
labs(title = "Monthly Revenue (2019)", x = "Month", y = "Revenue") +
theme_minimal() +
scale_x_date(date_labels = "%b %Y", date_breaks = "1 month") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
Percentage of monthly revenue per charge - balance
<- "WITH charges AS (
query_f SELECT
DATE_TRUNC('month', FROMDATE) AS month,
SUM(AMOUNT) AS monthly_charge
FROM claims_transactions
WHERE TYPE = 'CHARGE' AND EXTRACT(YEAR FROM FROMDATE) = 2019
GROUP BY month
),
payments AS (
SELECT
DATE_TRUNC('month', FROMDATE) AS month,
SUM(PAYMENTS) AS monthly_revenue
FROM claims_transactions
WHERE TYPE = 'PAYMENT' AND EXTRACT(YEAR FROM FROMDATE) = 2019
GROUP BY month
)
SELECT
c.month,
c.monthly_charge,
p.monthly_revenue,
round(p.monthly_revenue*100 / c.monthly_charge, 2) AS percent
FROM charges c
LEFT JOIN payments p ON c.month = p.month
ORDER BY c.month;
"
<- dbGetQuery(con, query_f)
result_f print(result_f)
month monthly_charge monthly_revenue percent
1 2019-01-01 1313310 1315065 100.13
2 2019-02-01 1422386 1425222 100.20
3 2019-03-01 1550802 1552219 100.09
4 2019-04-01 1505801 1507599 100.12
5 2019-05-01 1753439 1755629 100.12
6 2019-06-01 1269097 1271131 100.16
7 2019-07-01 1987173 1989384 100.11
8 2019-08-01 1619654 1621383 100.11
9 2019-09-01 1590527 1593102 100.16
10 2019-10-01 1335336 1336772 100.11
11 2019-11-01 1565859 1567647 100.11
12 2019-12-01 1565449 1567189 100.11
Payment Source Distribution
<- "
query SELECT
METHOD,
SUM(PAYMENTS) AS total_payments,
COUNT(*) AS transaction_count
FROM claims_transactions
WHERE TYPE = 'PAYMENT'
AND EXTRACT(YEAR FROM FROMDATE) = 2019
GROUP BY METHOD
ORDER BY total_payments DESC; "
<- dbGetQuery(con, query)
result3 print(result3)
METHOD total_payments transaction_count
1 CASH 4892864 3043
2 CC 4816750 2927
3 CHECK 4478845 2948
4 ECHECK 4192634 3519
5 COPAY 121250 2085
Payment method plots
library(ggplot2)
library(scales)
ggplot(result3, aes(x = reorder(METHOD, total_payments), y = total_payments)) +
geom_bar(stat = "identity", fill = "steelblue") +
# Add data labels inside bars
geom_text(aes(label = round(total_payments, 2)), hjust = 1.2, color = "white", size = 4) +
labs(title = "Total Payments by Methods (2019)", x = "Payment Method", y = "Total Payments") +
scale_y_continuous(labels = scales::label_number(scale = 1e-3, suffix = "K")) + # Format axis in 'K'
theme_minimal() +
coord_flip() # Rotate for better readability if many methods
library(ggplot2)
ggplot(result3, aes(x = reorder(METHOD, transaction_count), y = transaction_count)) +
geom_bar(stat = "identity", fill = "orange") +
# Add data labels inside bars
geom_text(aes(label = transaction_count), hjust = 1.2, color = "white", size = 4) +
labs(title = "Transaction Counts by Methods (2019)", x = "Payment Method", y = "Transaction Count") +
theme_minimal() +
coord_flip() # Rotate for better readability
Top insurance payers
dbGetQuery(con, "
SELECT patientinsuranceid,
SUM(PAYMENTS) AS total_paid
FROM claims_transactions
WHERE TYPE = 'PAYMENT'
AND EXTRACT(YEAR FROM FROMDATE) = 2019
GROUP BY patientinsuranceid
ORDER BY total_paid DESC
LIMIT 10;
")
PATIENTINSURANCEID total_paid
1 <NA> 8826673.7
2 e6178711-3936-e332-814d-19d8851e314d 497743.4
3 2a48a5b5-97d4-63df-1172-524a47bc49b5 452664.4
4 06629c1a-687d-5121-1e28-ae123c1fe647 262918.3
5 ef82862b-c0f8-7440-2dcb-4d7137349f0d 248540.8
6 01d78eb5-7f50-45e9-f524-921196a3dffe 196989.4
7 382bd5b8-f7b6-1132-b8fd-49ed4a44f6da 193298.8
8 2eb19e95-d892-0973-f9c1-1c915d3e622d 192750.3
9 ef5697a6-2bd3-d1c3-67ea-0f7892f4e76f 189523.8
10 41aa16da-a60c-243b-5f97-eae742c63c68 183404.3
Dupplicate transactions
dbGetQuery(con, "
SELECT id, patientid, fromdate, AMOUNT, COUNT(*) AS duplicate_count
FROM claims_transactions
WHERE EXTRACT(YEAR FROM FROMDATE) = 2019
GROUP BY id, patientid, fromdate, AMOUNT
HAVING COUNT(*) > 1;
")
[1] ID PATIENTID FROMDATE AMOUNT
[5] duplicate_count
<0 rows> (or 0-length row.names)
No dupplicate transactions
Refund and adjusment analysis
dbGetQuery(con, "
SELECT DATE_TRUNC('month', fromdate) AS month,
SUM(TRANSFERS) AS total_refunds
FROM claims_transactions
WHERE TYPE = 'TRANSFEROUT'
AND EXTRACT(YEAR FROM FROMDATE) = 2019
GROUP BY month
ORDER BY month;
")
month total_refunds
1 2019-01-01 368428.3
2 2019-02-01 306047.4
3 2019-03-01 603930.9
4 2019-04-01 566728.3
5 2019-05-01 590598.8
6 2019-06-01 355586.1
7 2019-07-01 822590.0
8 2019-08-01 534343.8
9 2019-09-01 373978.2
10 2019-10-01 324308.7
11 2019-11-01 469261.1
12 2019-12-01 368648.8
Close the connection when done
dbDisconnect(con, shutdown = TRUE)