Financial Analysis - 2019 | Postgre SQL in RStudio

Author

Thieu Nguyen

Intro

This post is for analyzing financial data using SQL in RStudio.

Data

I downloaded csv files from the Synthea website (https://synthea.mitre.org/downloads) to my directory ’/Users/nnthieu/SyntheaData/.

Building database

Now I create a database using ‘duckdb’

library(DBI)
library(duckdb)
library(RPostgres)

con <- dbConnect(duckdb::duckdb(), dbdir = "/Users/nnthieu/SyntheaData/synthea.duckdb")

Now at my directory already have database ‘synthea.duckdb’. Next, I load PostgreSQL extension in DuckDB

# Load the PostgreSQL extension
dbExecute(con, "INSTALL postgres; LOAD postgres;")
[1] 0

Load many CSV Files into DuckDB

library(DBI)
library(tools)

csv_files <- list.files("/Users/nnthieu/SyntheaData/", pattern = "*.csv", full.names = TRUE)

for (file in csv_files) {
  table_name <- tools::file_path_sans_ext(basename(file))
  
  # Drop table if it exists
  dbExecute(con, sprintf("DROP TABLE IF EXISTS %s", table_name))
  
  # Create table from CSV
  query <- sprintf("CREATE TABLE %s AS SELECT * FROM read_csv_auto('%s')", table_name, file)
  dbExecute(con, query)
}

# Verify tables
dbListTables(con)
 [1] "allergies"              "careplans"              "claims"                
 [4] "claims_transactions"    "conditions"             "devices"               
 [7] "encounters"             "imaging_studies"        "immunizations"         
[10] "immunizations_children" "medications"            "observations"          
[13] "organizations"          "patients"               "payer_transitions"     
[16] "payers"                 "procedures"             "providers"             
[19] "supplies"              

Analyzing data using postgre SQL

Number of patients and cost average

dbGetQuery(con, "
SELECT 
    COUNT(DISTINCT patientid) AS patient_count,
    AVG(total_charges) AS avg_cost_per_patient
FROM (
    SELECT patientid, SUM(AMOUNT) AS total_charges
    FROM claims_transactions
    WHERE TYPE = 'CHARGE' 
    AND EXTRACT(YEAR FROM FROMDATE) = 2019
    GROUP BY patientid
) ;

")
  patient_count avg_cost_per_patient
1           922             20042.12

Total charge in the year 2019

dbGetQuery(con, "
SELECT 
SUM(AMOUNT) AS total_revenue
FROM claims_transactions
WHERE TYPE = 'CHARGE'
AND EXTRACT(YEAR FROM FROMDATE) = 2019;
")
  total_revenue
1      18478833

Monthly charge

query <- "
SELECT 
    DATE_TRUNC('month', FROMDATE) AS month,
    SUM(AMOUNT) AS monthly_charge
FROM claims_transactions
WHERE TYPE = 'CHARGE' AND EXTRACT(YEAR FROM FROMDATE) = 2019
GROUP BY month
ORDER BY month;
"
result <- dbGetQuery(con, query)
print(result)
        month monthly_charge
1  2019-01-01        1313310
2  2019-02-01        1422386
3  2019-03-01        1550802
4  2019-04-01        1505801
5  2019-05-01        1753439
6  2019-06-01        1269097
7  2019-07-01        1987173
8  2019-08-01        1619654
9  2019-09-01        1590527
10 2019-10-01        1335336
11 2019-11-01        1565859
12 2019-12-01        1565449

Monthly charge plot

library(ggplot2)
result$month <- as.Date(result$month)  # Ensure it's in date format
ggplot(result, aes(x = month, y = monthly_charge)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  geom_text(aes(label = round(monthly_charge, 0)), 
    hjust = 1,   vjust = 1, angle = 90, color = "white", size = 3.5) + 
  labs(title = "Monthly Charge (2019)", x = "Month", y = "Revenue") +
  theme_minimal() +
  scale_x_date(date_labels = "%b %Y", date_breaks = "1 month") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Monthly revenue

query2 <- "
SELECT 
    DATE_TRUNC('month', FROMDATE) AS month,
    SUM(PAYMENTS) AS monthly_revenue
FROM claims_transactions
WHERE TYPE = 'PAYMENT' AND EXTRACT(YEAR FROM FROMDATE) = 2019
GROUP BY month
ORDER BY month;
"
result2 <- dbGetQuery(con, query2)
print(result2)
        month monthly_revenue
1  2019-01-01         1315065
2  2019-02-01         1425222
3  2019-03-01         1552219
4  2019-04-01         1507599
5  2019-05-01         1755629
6  2019-06-01         1271131
7  2019-07-01         1989384
8  2019-08-01         1621383
9  2019-09-01         1593102
10 2019-10-01         1336772
11 2019-11-01         1567647
12 2019-12-01         1567189

Monthly revenue plot

library(ggplot2)
result2$month <- as.Date(result2$month)  # Ensure it's in date format
ggplot(result2, aes(x = month, y = monthly_revenue)) +
  geom_bar(stat = "identity", fill = "lightblue") +
  geom_text(aes(label = round(monthly_revenue, 0)), 
    hjust = 1,   vjust = 1, angle = 90, color = "blue", size = 3.5) + 
  labs(title = "Monthly Revenue (2019)", x = "Month", y = "Revenue") +
  theme_minimal() +
  scale_x_date(date_labels = "%b %Y", date_breaks = "1 month") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Percentage of monthly revenue per charge - balance

query_f  <- "WITH charges AS (
    SELECT 
        DATE_TRUNC('month', FROMDATE) AS month,
        SUM(AMOUNT) AS monthly_charge
    FROM claims_transactions
    WHERE TYPE = 'CHARGE' AND EXTRACT(YEAR FROM FROMDATE) = 2019
    GROUP BY month
),
payments AS (
    SELECT 
        DATE_TRUNC('month', FROMDATE) AS month,
        SUM(PAYMENTS) AS monthly_revenue
    FROM claims_transactions
    WHERE TYPE = 'PAYMENT' AND EXTRACT(YEAR FROM FROMDATE) = 2019
    GROUP BY month
)
SELECT 
    c.month,
    c.monthly_charge,
    p.monthly_revenue,
    round(p.monthly_revenue*100 / c.monthly_charge, 2) AS percent
FROM charges c
LEFT JOIN payments p ON c.month = p.month
ORDER BY c.month;
"

result_f <- dbGetQuery(con, query_f)
print(result_f)
        month monthly_charge monthly_revenue percent
1  2019-01-01        1313310         1315065  100.13
2  2019-02-01        1422386         1425222  100.20
3  2019-03-01        1550802         1552219  100.09
4  2019-04-01        1505801         1507599  100.12
5  2019-05-01        1753439         1755629  100.12
6  2019-06-01        1269097         1271131  100.16
7  2019-07-01        1987173         1989384  100.11
8  2019-08-01        1619654         1621383  100.11
9  2019-09-01        1590527         1593102  100.16
10 2019-10-01        1335336         1336772  100.11
11 2019-11-01        1565859         1567647  100.11
12 2019-12-01        1565449         1567189  100.11

Payment Source Distribution

query <- "
SELECT 
    METHOD, 
    SUM(PAYMENTS) AS total_payments,
    COUNT(*) AS transaction_count
FROM claims_transactions 
WHERE TYPE = 'PAYMENT' 
AND EXTRACT(YEAR FROM FROMDATE) = 2019
GROUP BY METHOD
ORDER BY total_payments DESC; "

result3 <- dbGetQuery(con, query)
print(result3)
  METHOD total_payments transaction_count
1   CASH        4892864              3043
2     CC        4816750              2927
3  CHECK        4478845              2948
4 ECHECK        4192634              3519
5  COPAY         121250              2085

Payment method plots

library(ggplot2)
library(scales) 

ggplot(result3, aes(x = reorder(METHOD, total_payments), y = total_payments)) +
  geom_bar(stat = "identity", fill = "steelblue") +  
    # Add data labels inside bars
  geom_text(aes(label = round(total_payments, 2)), hjust = 1.2, color = "white", size = 4) +
  labs(title = "Total Payments by Methods (2019)", x = "Payment Method", y = "Total Payments") +
  scale_y_continuous(labels = scales::label_number(scale = 1e-3, suffix = "K")) +  # Format axis in 'K'
  
  theme_minimal() +
  coord_flip()  # Rotate for better readability if many methods

library(ggplot2)

ggplot(result3, aes(x = reorder(METHOD, transaction_count), y = transaction_count)) +
  geom_bar(stat = "identity", fill = "orange") +

  # Add data labels inside bars
  geom_text(aes(label = transaction_count), hjust = 1.2, color = "white", size = 4) +
  labs(title = "Transaction Counts by Methods (2019)", x = "Payment Method", y = "Transaction Count") +
  theme_minimal() +
  coord_flip()  # Rotate for better readability

Top insurance payers

dbGetQuery(con, "
SELECT patientinsuranceid, 
       SUM(PAYMENTS) AS total_paid
FROM claims_transactions
WHERE TYPE = 'PAYMENT'
AND EXTRACT(YEAR FROM FROMDATE) = 2019
GROUP BY patientinsuranceid
ORDER BY total_paid DESC
LIMIT 10;

")
                     PATIENTINSURANCEID total_paid
1                                  <NA>  8826673.7
2  e6178711-3936-e332-814d-19d8851e314d   497743.4
3  2a48a5b5-97d4-63df-1172-524a47bc49b5   452664.4
4  06629c1a-687d-5121-1e28-ae123c1fe647   262918.3
5  ef82862b-c0f8-7440-2dcb-4d7137349f0d   248540.8
6  01d78eb5-7f50-45e9-f524-921196a3dffe   196989.4
7  382bd5b8-f7b6-1132-b8fd-49ed4a44f6da   193298.8
8  2eb19e95-d892-0973-f9c1-1c915d3e622d   192750.3
9  ef5697a6-2bd3-d1c3-67ea-0f7892f4e76f   189523.8
10 41aa16da-a60c-243b-5f97-eae742c63c68   183404.3

Dupplicate transactions

dbGetQuery(con, "
SELECT id, patientid, fromdate, AMOUNT, COUNT(*) AS duplicate_count
FROM claims_transactions
WHERE EXTRACT(YEAR FROM FROMDATE) = 2019
GROUP BY id, patientid, fromdate, AMOUNT
HAVING COUNT(*) > 1;

")
[1] ID              PATIENTID       FROMDATE        AMOUNT         
[5] duplicate_count
<0 rows> (or 0-length row.names)

No dupplicate transactions

Refund and adjusment analysis

dbGetQuery(con, "
SELECT DATE_TRUNC('month', fromdate) AS month,
       SUM(TRANSFERS) AS total_refunds
FROM claims_transactions
WHERE TYPE = 'TRANSFEROUT'
AND EXTRACT(YEAR FROM FROMDATE) = 2019
GROUP BY month
ORDER BY month;

")
        month total_refunds
1  2019-01-01      368428.3
2  2019-02-01      306047.4
3  2019-03-01      603930.9
4  2019-04-01      566728.3
5  2019-05-01      590598.8
6  2019-06-01      355586.1
7  2019-07-01      822590.0
8  2019-08-01      534343.8
9  2019-09-01      373978.2
10 2019-10-01      324308.7
11 2019-11-01      469261.1
12 2019-12-01      368648.8

Close the connection when done

dbDisconnect(con, shutdown = TRUE)