library(readr)
library(dplyr)
library(stringr)
library(tidyverse)
library(arrow)
library(gt)
library(DBI)
## Warning: package 'DBI' was built under R version 4.3.3
library(duckdb)
library(dplyr)
library(stringr)


con <- dbConnect(duckdb::duckdb(), dbdir = ":memory:")


dbExecute(con, "
  CREATE OR REPLACE VIEW ip AS
  SELECT *
  FROM read_parquet('D:/thcic_parquet_IP/**/*.parquet', union_by_name=true);
")
## [1] 0
diag_cols_vec <- c("PRINC_DIAG_CODE", paste0("OTH_DIAG_CODE_", 1:24))
dx_list_sql <- paste(diag_cols_vec, collapse = ",")

write_codes(con, "codes_fall", fall_diags)
write_codes(con, "codes_home_scene", at_home_scene_ICD10s)
write_codes(con, "codes_hip", hip_frac_diags)
write_codes(con, "codes_tbi", TBI_ICD10s)
write_codes(con, "codes_adrd", ICD10_ADRD_NORC)
write_codes(con, "codes_covid", ICD10_COVID_conf_or_exposed)
write_codes(con, "codes_htn", HTN_ICD10s)
write_codes(con, "codes_hf", HF_ICD10s)
write_codes(con, "codes_t2dm", T2DM_ICD10s)

dbWriteTable(con, "icd_to_ccsr", ICD_DX_TO_CCSR, overwrite = TRUE)
dbExecute(con, "CREATE INDEX IF NOT EXISTS idx_icd_to_ccsr ON icd_to_ccsr(ICD10);")
## [1] 0
print("INPATIENT HOSPITALIZATIONS *FROM* ER (2021-2024)")
## [1] "INPATIENT HOSPITALIZATIONS *FROM* ER (2021-2024)"
dbGetQuery(con, "
  SELECT
    COUNT(*) AS n_all,
    SUM(has_ADRD) AS n_adrd,
    COUNT(*) - SUM(has_ADRD) AS n_noadrd
  FROM ip;
")
##     n_all n_adrd n_noadrd
## 1 4272704 622357  3650347
ip_denoms_all <- dbGetQuery(con, "
  SELECT
    20212024 AS year,
    COUNT(*) AS n_all,
    SUM(has_ADRD) AS n_adrd,
    COUNT(*) - SUM(has_ADRD) AS n_noadrd
  FROM ip;
")


ip_denoms <- dbGetQuery(con, "
  SELECT
    year,
    COUNT(*) AS n_all,
    SUM(has_ADRD) AS n_adrd,
    COUNT(*) - SUM(has_ADRD) AS n_noadrd
  FROM ip
  GROUP BY year
  ORDER BY year;
")

ip_denoms
##   year   n_all n_adrd n_noadrd
## 1 2021  983472 146190   837282
## 2 2022 1033097 148291   884806
## 3 2023 1103112 159723   943389
## 4 2024 1153023 168153   984870
ip_denoms <- dbGetQuery(con, "
  SELECT
    year,
    COUNT(*) AS n_all,
    SUM(has_ADRD) AS n_adrd,
    COUNT(*) - SUM(has_ADRD) AS n_noadrd
  FROM ip
  GROUP BY year
  ORDER BY year;
")
library(readxl)
library(dplyr)

X21to24_TX_cost_to_charge_ratios <- read_excel("21to24_TX_cost_to_charge_ratios.xlsx") %>%
    rename(CCR = 8) %>% 
    mutate(YEAR = as.integer(YEAR))

avg_TX_CCRs <- X21to24_TX_cost_to_charge_ratios %>%
    group_by(YEAR) %>%
    summarise(
        mean_CCR   = mean(CCR, na.rm = TRUE),
        median_CCR = median(CCR, na.rm = TRUE),
        n_facilities = n(),
        .groups = "drop"
    )

##note!! we've got a 550-ish per year, but 670ish for THCIC set
###// WARNING

##//  WARNING

## ORIGINAL ABSTRACT APPROACH BELOW (REQUIRES PULLING THE DATA FROM THE LAKE)

dbGetQuery(con, "SELECT * FROM ip;") ->TX_IP_21to24

## UPDATED TO ACTUALLY USE THE FALL CODES; HERE YOU GO, BUCKO

## THE ABOVE SQL/LAKE APPROACH IS GOOD, BUT IS ESPECIALLY USEFUL HERE IF YOU NEED TO SANITY CHECK THE NUMBER OF IP RECORDS (HATE THE 2024 DATA SO MUCH)

average_covered = mean(c(0.82, 0.72))

#derived from the State losses from Medicare/Medicaid
#Medicare: 82%, Medicaid: 72%
# source: https://www.tha.org/wp-content/uploads/2025/02/2025-Hospital-Payment-Sources.pdf

# 91% of all patients with a fall-related diagnosis were insured by Medicare, Medicare Advantage, Medicaid, or were uninsured — all but the latter of which are payer types that typically reimburse hospitals at 72–82% of actual care costs4.

## NEED TO BUILD NEW LOSS KEY FOR HUMAN-READABLE LABELS

pmt_src_key <- c("09" = "Self Pay",
        "HM" = "Health Maintenance Organization",
        "10" = "Central Certification",
        "LI" = "Liability",
        "11" = "Other Non-Federal Programs",
        "LM" = "Liability Medical",
        "12" = "Preferred Provider Organization (PPO)",
        "MA" = "Medicare Part A",
        "13" = "Point of Service (POS)",
        "MB" = "Medicare Part B",
        "14" = "Exclusive Provider Organization (EPO)",
        "MC" = "Medicaid",
        "15" = "Indemnity Insurance",
        "TV" = "Title V",
        "16" = "HMO Medicare Risk",
        "OF" = "Other Federal Program",
        "AM" = "Automobile Medical",
        "VA" = "Veterans Administration Plan",
        "BL" = "Blue Cross / Blue Shield",
        "WC" = "Workers' Compensation Health Claim",
        "CH" = "CHAMPUS",
        "ZZ" = "Charity / Indigent / Unknown",
        "CI" = "Commercial Insurance",
        "DS" = "Disability Insurance")


# this is the *code* vector you want to use in %in% for human readable

pmt_src_loss_key <- unname(pmt_src_key[loss_leaders])
pmt_src_loss_key
## [1] "Medicaid"                     "Medicare Part A"             
## [3] "Medicare Part B"              "HMO Medicare Risk"           
## [5] "Veterans Administration Plan"
TX_IP_21to24 %>% filter(has_hip_fracture == 1 | has_TBI == 1) %>% filter(is_fall == 1) %>% mutate(loss = ifelse(FIRST_PAYMENT_SRC %in% pmt_src_loss_key, "loss", "regular"), log_charge = log(TOTAL_CHARGES)) %>% .$loss %>% table(.) %>% as.data.frame(.) %>% dplyr::rename(., insurance=1) %>% mutate(prop = Freq/sum(Freq))
##   insurance  Freq     prop
## 1      loss 81293 0.891097
## 2   regular  9935 0.108903
# These “loss-leading” coverage types incurred, on average, ~$20,000 more in hospital charges per case than other payer groups.

cost_table<- TX_IP_21to24 %>%
    filter(has_hip_fracture == 1 | has_TBI == 1) %>% filter(is_fall == 1) %>%
    mutate(loss = ifelse(FIRST_PAYMENT_SRC %in% pmt_src_loss_key,
                         "loss", "regular"),
           log_charge = log(TOTAL_CHARGES), LENGTH_OF_STAY = as.numeric(LENGTH_OF_STAY)) %>%
    group_by(loss) %>%
    summarise(
        median_log_charge = median(log_charge, na.rm = TRUE),
        mad_log_charge   = mad(log_charge, na.rm = TRUE),
        mean_lengthofstay = mean(LENGTH_OF_STAY, na.rm=TRUE),
        sd_lengthofstay = sd(LENGTH_OF_STAY, na.rm=TRUE)
    ) %>%
    mutate(
        upper_staylength = mean_lengthofstay + 2*sd_lengthofstay,
        lower_staylength = mean_lengthofstay - 2*sd_lengthofstay,
        ## original code was median, now is mean
        median_charge_dollars = exp(median_log_charge)
    )

#total amount of "loss" per patient with lossy insurance
#// "The median cost of care for those with a fall-related injury and a loss-leading insurance, such as Medicare or Medicaid, was $86,828, resulting in an approximate loss of $19,970 per patient seen".

cost_table %>% filter(loss == "loss") %>% mutate(net_loss_profit = (1-average_covered) * median_charge_dollars ) %>% .$net_loss_profit
## [1] 19641.98
####### TOTAL CUMULATIVE COST OF CARE FOR THOSE WITH HIP FRAC OR TBI

#cumulative

##// "A total of $17.5M was charged across all years for all records with a hip fracture and/or TBI..."
TX_IP_21to24 %>% filter(has_hip_fracture == 1 | has_TBI == 1) %>% .$TOTAL_CHARGES %>% sum(., na.rm=TRUE) %>% scales::comma(.)
## [1] "21,413,064,578"
#with fall code (possible undercount; fall codes hit-or-miss)
##// "...of which an esimated $9.4M was attributable to fall..."
TX_IP_21to24 %>% filter(has_hip_fracture == 1 | has_TBI == 1) %>% filter(is_fall == 1) %>% .$TOTAL_CHARGES %>% sum(., na.rm=TRUE) %>% scales::comma(.)
## [1] "10,014,089,431"
# # cumulative with lossy insurance only
# TX_IP_21to24 %>% filter(has_hip_fracture == 1 | has_TBI == 1) %>% mutate(loss = ifelse(FIRST_PAYMENT_SRC %in% pmt_src_loss_key, "loss", "regular"), log_charge = log(TOTAL_CHARGES)) %>% filter(loss == "loss") %>% .$TOTAL_CHARGES %>% sum(., na.rm=TRUE) %>% scales::comma(.)
# 
# #annual average
# #$4B
# TX_IP_21to24 %>% filter(has_hip_fracture == 1 | has_TBI == 1) %>% mutate(loss = ifelse(FIRST_PAYMENT_SRC %in% pmt_src_loss_key, "loss", "regular"), log_charge = log(TOTAL_CHARGES)) %>% filter(loss == "loss") %>% .$TOTAL_CHARGES %>% {sum(., na.rm=TRUE)/4} %>% scales::comma(.)

## average costs across PHRs by loss/regular

##### NOTE: ONLY USE MEDIAN VALUES NOW

avg_fall_costs_PHR<- TX_IP_21to24 %>%
    mutate(PUBLIC_HEALTH_REGION = as.numeric(PUBLIC_HEALTH_REGION) %>% as.character(.), LENGTH_OF_STAY = as.numeric(LENGTH_OF_STAY)) %>%
    
    filter(has_hip_fracture == 1 | has_TBI == 1) %>% filter(is_fall == 1) %>%
    mutate(loss = ifelse(FIRST_PAYMENT_SRC %in% pmt_src_loss_key,
                         "loss", "regular"),
           log_charge = log(TOTAL_CHARGES)) %>%
    group_by(loss, PUBLIC_HEALTH_REGION) %>%
    summarise(
        median_log_charge = median(log_charge, na.rm = TRUE),
        mad_log_charge   = mad(log_charge, na.rm = TRUE),
        mean_lengthofstay = mean(LENGTH_OF_STAY, na.rm=TRUE),
        sd_lengthofstay = sd(LENGTH_OF_STAY, na.rm=TRUE)
    ) %>%
    mutate(
        upper_staylength = mean_lengthofstay + 2*sd_lengthofstay,
        lower_staylength = mean_lengthofstay - 2*sd_lengthofstay,
        upper_log_charge = median_log_charge + 2 * mad_log_charge,
        lower_log_charge = median_log_charge - 2 * mad_log_charge,
        median_charge_dollars = exp(median_log_charge)
    )
## `summarise()` has grouped output by 'loss'. You can override using the
## `.groups` argument.
## across all TX and all PHRs, but not grouped by loss status

# TX_IP_21to24 %>% filter(has_hip_fracture == 1 | has_TBI == 1) %>%
#     mutate(loss = ifelse(FIRST_PAYMENT_SRC %in% pmt_src_loss_key,
#                          "loss", "regular"),
#            log_charge = log(TOTAL_CHARGES), LENGTH_OF_STAY = as.numeric(LENGTH_OF_STAY)) %>%
#     summarise(
#         mean_log_charge = mean(log_charge, na.rm = TRUE),
#         sd_log_charge   = sd(log_charge, na.rm = TRUE),
#         mean_lengthofstay = mean(LENGTH_OF_STAY, na.rm=TRUE),
#         sd_lengthofstay = sd(LENGTH_OF_STAY, na.rm=TRUE)
#     )


# avg_fall_costs_PHR<- TX_IP_21to24 %>%
#     mutate(PUBLIC_HEALTH_REGION = case_when(
#         .$PUBLIC_HEALTH_REGION == "1" ~ "01",
#         .$PUBLIC_HEALTH_REGION == "2" ~ "02",
#         .$PUBLIC_HEALTH_REGION == "3" ~ "03",
#         .$PUBLIC_HEALTH_REGION == "4" ~ "04",
#         .$PUBLIC_HEALTH_REGION == "5" ~ "05",
#         .$PUBLIC_HEALTH_REGION == "6" ~ "06",
#         .$PUBLIC_HEALTH_REGION == "7" ~ "07",
#         .$PUBLIC_HEALTH_REGION == "8" ~ "08",
#         .$PUBLIC_HEALTH_REGION == "9" ~ "09",
#         .default = PUBLIC_HEALTH_REGION
#     ), LENGTH_OF_STAY = as.numeric(LENGTH_OF_STAY)) %>%
#
#     filter(has_hip_fracture == 1 | has_TBI == 1) %>%
#     mutate(loss = ifelse(FIRST_PAYMENT_SRC %in% pmt_src_loss_key,
#                          "loss", "regular"),
#            log_charge = log(TOTAL_CHARGES)) %>%
#     group_by(PUBLIC_HEALTH_REGION) %>%
#     summarise(
#         mean_log_charge = mean(log_charge, na.rm = TRUE),
#         sd_log_charge   = sd(log_charge, na.rm = TRUE),
#         mean_lengthofstay = mean(LENGTH_OF_STAY, na.rm=TRUE),
#         sd_lengthofstay = sd(LENGTH_OF_STAY, na.rm=TRUE)
#     ) %>%
#     mutate(
#         upper_staylength = mean_lengthofstay + 2*sd_lengthofstay,
#         lower_staylength = mean_lengthofstay - 2*sd_lengthofstay,
#         upper_log_charge = mean_log_charge + 2 * sd_log_charge,
#         lower_log_charge = mean_log_charge - 2 * sd_log_charge,
#         mean_charge_dollars = exp(mean_log_charge + 0.5 *       sd_log_charge^2), ## original code was median, now is mean
#         upper_dollars = exp(mean_log_charge + 2 * sd_log_charge),
#         lower_dollars = exp(mean_log_charge - 2 * sd_log_charge)
#     )

avg_fall_costs_sex<- TX_IP_21to24 %>%
    filter(has_hip_fracture == 1 | has_TBI == 1) %>% filter(is_fall == 1) %>%
    mutate(loss = ifelse(FIRST_PAYMENT_SRC %in% pmt_src_loss_key,
                         "loss", "regular"),
           log_charge = log(TOTAL_CHARGES),
           LENGTH_OF_STAY = as.numeric(LENGTH_OF_STAY)) %>%
    group_by(loss, SEX_CODE) %>%
    summarise(
        median_log_charge = median(log_charge, na.rm = TRUE),
        mad_log_charge   = mad(log_charge, na.rm = TRUE),
        mean_lengthofstay = mean(LENGTH_OF_STAY, na.rm=TRUE),
        sd_lengthofstay = sd(LENGTH_OF_STAY, na.rm=TRUE)
    ) %>%
    mutate(
        upper_staylength = mean_lengthofstay + 2*sd_lengthofstay,
        lower_staylength = mean_lengthofstay - 2*sd_lengthofstay,
        upper_log_charge = median_log_charge + 2 * mad_log_charge,
        lower_log_charge = median_log_charge - 2 * mad_log_charge,
        median_charge_dollars = exp(median_log_charge)
    )
## `summarise()` has grouped output by 'loss'. You can override using the
## `.groups` argument.
## total amounts of profit loss per patient by sex

#by median
avg_fall_costs_sex %>% filter(loss == "loss") %>% mutate(net_loss_profit = median_charge_dollars * (1-average_covered)) %>% dplyr::select(loss, SEX_CODE, median_charge_dollars, net_loss_profit)
## # A tibble: 4 × 4
## # Groups:   loss [1]
##   loss  SEX_CODE median_charge_dollars net_loss_profit
##   <chr> <chr>                    <dbl>           <dbl>
## 1 loss  F                       84497.          19434.
## 2 loss  M                       86266.          19841.
## 3 loss  U                       84366.          19404.
## 4 loss  <NA>                    89875.          20671.
### DR. Lee wants investigation of differences in length of stay by age

# install.packages("nortest", repos = "https://cloud.r-project.org")
# install.packages("rstatix", repos = "https://cloud.r-project.org")
library(nortest)
library(rstatix)
## 
## Attaching package: 'rstatix'
## 
## The following object is masked from 'package:stats':
## 
##     filter
## check if LOS distro is normal

#check visually

TX_IP_21to24 %>% filter(has_hip_fracture == 1 | has_TBI == 1, SEX_CODE %in% c("F","M")) %>% filter(is_fall == 1) %>% mutate(LENGTH_OF_STAY = as.numeric(LENGTH_OF_STAY)) %>%
  .$LENGTH_OF_STAY %>% table(.) %>% plot(.)

#check w/ normality test

TX_IP_21to24 %>%
    filter(has_hip_fracture == 1 | has_TBI == 1, SEX_CODE %in% c("F","M")) %>% filter(is_fall == 1) %>% mutate(LENGTH_OF_STAY = as.numeric(LENGTH_OF_STAY)) %>%.$LENGTH_OF_STAY %>% ad.test(.)
## 
##  Anderson-Darling normality test
## 
## data:  .
## A = 4929.8, p-value < 2.2e-16
##check median LOS values and the MAD for PHRs and Sex/Age

TX_IP_21to24 %>%
    mutate(PUBLIC_HEALTH_REGION = as.numeric(PUBLIC_HEALTH_REGION) %>% as.character(.), LENGTH_OF_STAY = as.numeric(LENGTH_OF_STAY)) %>%
    filter(has_hip_fracture == 1 | has_TBI == 1) %>% filter(is_fall == 1) %>%
    mutate(LENGTH_OF_STAY = as.numeric(LENGTH_OF_STAY)) %>%
    group_by(PUBLIC_HEALTH_REGION) %>%
    summarise(
        count = n(),
        mean_LOS = mean(LENGTH_OF_STAY, na.rm = TRUE),
        sd_LOS = sd(LENGTH_OF_STAY, na.rm=TRUE),
        median_LOS = median(LENGTH_OF_STAY, na.rm = TRUE),
        MAD_LOS = mad(LENGTH_OF_STAY)
    ) %>%
    mutate(upper_LOS = mean_LOS + 2*sd_LOS,
           lower_LOS = mean_LOS - 2*sd_LOS) %>% tibble(.) %>% View(.)

TX_IP_21to24 %>%
    filter(has_hip_fracture == 1 | has_TBI == 1, SEX_CODE %in% c("F","M")) %>% filter(is_fall == 1) %>% mutate(LENGTH_OF_STAY = as.numeric(LENGTH_OF_STAY)) %>%
    group_by(age_years, SEX_CODE) %>%
    summarise(
        count = n(),
        mean_LOS = mean(LENGTH_OF_STAY, na.rm = TRUE),
        sd_LOS = sd(LENGTH_OF_STAY, na.rm=TRUE),
        median_LOS = median(LENGTH_OF_STAY, na.rm = TRUE),
        MAD_LOS = mad(LENGTH_OF_STAY)
    ) %>%
    mutate(upper_LOS = mean_LOS + 2*sd_LOS,
           lower_LOS = mean_LOS - 2*sd_LOS) %>% tibble(.)
## `summarise()` has grouped output by 'age_years'. You can override using the
## `.groups` argument.
## # A tibble: 12 × 9
##    age_years SEX_CODE count mean_LOS sd_LOS median_LOS MAD_LOS upper_LOS
##    <chr>     <chr>    <int>    <dbl>  <dbl>      <dbl>   <dbl>     <dbl>
##  1 65-69     F         4574     5.56   5.33          4    2.97      16.2
##  2 65-69     M         3067     6.50   7.33          5    2.97      21.2
##  3 70-74     F         7426     5.52   5.32          4    2.97      16.2
##  4 70-74     M         4353     6.48   7.54          5    2.97      21.5
##  5 75-79     F        10250     5.53   4.54          5    2.97      14.6
##  6 75-79     M         5718     6.22   5.37          5    2.97      17.0
##  7 80-84     F        11736     5.60   4.15          5    2.97      13.9
##  8 80-84     M         6148     6.20   5.17          5    2.97      16.5
##  9 85-89     F        10883     5.44   3.59          5    2.97      12.6
## 10 85-89     M         5368     6.08   5.03          5    2.97      16.1
## 11 90+       F         9404     5.47   3.64          5    2.97      12.7
## 12 90+       M         3866     5.73   4.31          5    2.97      14.4
## # ℹ 1 more variable: lower_LOS <dbl>
## wilcox and Kruskal to see if LOS between groups is significantly different

#SEX
TX_IP_21to24 %>%
    filter(has_hip_fracture == 1 | has_TBI == 1, SEX_CODE %in% c("F","M")) %>% filter(is_fall == 1) %>%  
mutate(LENGTH_OF_STAY = as.numeric(LENGTH_OF_STAY)) %>% rstatix::wilcox_test(LENGTH_OF_STAY ~ SEX_CODE)
## # A tibble: 1 × 7
##   .y.            group1 group2    n1    n2  statistic        p
## * <chr>          <chr>  <chr>  <int> <int>      <dbl>    <dbl>
## 1 LENGTH_OF_STAY F      M      54273 28520 741063950. 4.32e-24
#PHRs
#PHR: LOS significance
TX_IP_21to24 %>%
    mutate(PUBLIC_HEALTH_REGION = as.numeric(PUBLIC_HEALTH_REGION) %>% as.character(.), LENGTH_OF_STAY = as.numeric(LENGTH_OF_STAY)) %>%
    filter(has_hip_fracture == 1 | has_TBI == 1) %>% filter(is_fall == 1) %>%
    mutate(LENGTH_OF_STAY = as.numeric(LENGTH_OF_STAY)) %>% rstatix::kruskal_test(LENGTH_OF_STAY ~ PUBLIC_HEALTH_REGION)
## # A tibble: 1 × 6
##   .y.                n statistic    df         p method        
## * <chr>          <int>     <dbl> <int>     <dbl> <chr>         
## 1 LENGTH_OF_STAY 91228      902.    10 2.20e-187 Kruskal-Wallis
#PHR: log charge significance

TX_IP_21to24 %>%
    mutate(PUBLIC_HEALTH_REGION = as.numeric(PUBLIC_HEALTH_REGION) %>% as.character(.), LENGTH_OF_STAY = as.numeric(LENGTH_OF_STAY), log_charge = log(TOTAL_CHARGES)) %>%
    filter(has_hip_fracture == 1 | has_TBI == 1) %>% filter(is_fall == 1) %>%
    mutate(LENGTH_OF_STAY = as.numeric(LENGTH_OF_STAY)) %>% 
    rstatix::kruskal_test(log_charge ~ PUBLIC_HEALTH_REGION)
## # A tibble: 1 × 6
##   .y.            n statistic    df     p method        
## * <chr>      <int>     <dbl> <int> <dbl> <chr>         
## 1 log_charge 91228     3905.    10     0 Kruskal-Wallis
### OKAY, it was, but by how much?

# install.packages("coin", repos = "https://cloud.r-project.org")
library("coin")
## Warning: package 'coin' was built under R version 4.3.3
## Loading required package: survival
## Warning: package 'survival' was built under R version 4.3.3
## 
## Attaching package: 'coin'
## 
## The following objects are masked from 'package:rstatix':
## 
##     chisq_test, friedman_test, kruskal_test, sign_test, wilcox_test
#SEX: LOS eff size
TX_IP_21to24 %>%
    filter(has_hip_fracture == 1 | has_TBI == 1, SEX_CODE %in% c("F","M")) %>%
  filter(is_fall == 1) %>%  
  mutate(LENGTH_OF_STAY = as.numeric(LENGTH_OF_STAY)) %>%   wilcox_effsize(LENGTH_OF_STAY ~ SEX_CODE)
## # A tibble: 1 × 7
##   .y.            group1 group2 effsize    n1    n2 magnitude
## * <chr>          <chr>  <chr>    <dbl> <int> <int> <ord>    
## 1 LENGTH_OF_STAY F      M       0.0352 54273 28520 small
#PHR: LOS eff size
TX_IP_21to24 %>% mutate(PUBLIC_HEALTH_REGION = as.numeric(PUBLIC_HEALTH_REGION) %>% as.character(.), LENGTH_OF_STAY = as.numeric(LENGTH_OF_STAY)) %>%
    filter(has_hip_fracture == 1 | has_TBI == 1) %>% filter(is_fall == 1) %>%
    mutate(LENGTH_OF_STAY = as.numeric(LENGTH_OF_STAY)) %>% rstatix::kruskal_effsize(LENGTH_OF_STAY ~ PUBLIC_HEALTH_REGION)
## # A tibble: 1 × 5
##   .y.                n effsize method  magnitude
## * <chr>          <int>   <dbl> <chr>   <ord>    
## 1 LENGTH_OF_STAY 91228 0.00978 eta2[H] small
#PHR: log charge eff size
TX_IP_21to24 %>%
    mutate(PUBLIC_HEALTH_REGION = as.numeric(PUBLIC_HEALTH_REGION) %>% as.character(.), LENGTH_OF_STAY = as.numeric(LENGTH_OF_STAY), log_charge = log(TOTAL_CHARGES)) %>%
    filter(has_hip_fracture == 1 | has_TBI == 1) %>% filter(is_fall == 1) %>%
    mutate(LENGTH_OF_STAY = as.numeric(LENGTH_OF_STAY)) %>% 
    rstatix::kruskal_effsize(log_charge ~ PUBLIC_HEALTH_REGION)
## # A tibble: 1 × 5
##   .y.            n effsize method  magnitude
## * <chr>      <int>   <dbl> <chr>   <ord>    
## 1 log_charge 91228  0.0427 eta2[H] small
#baby: 0.03 between SEX using Wilcox effsize
#0.00664 H-value for PHR using Kruskal effsize

TX_IP_21to24 %>%
    filter(has_hip_fracture == 1 | has_TBI == 1, SEX_CODE %in% c("F","M")) %>% filter(is_fall == 1) %>%mutate(LENGTH_OF_STAY = as.numeric(LENGTH_OF_STAY))%>%
    group_by(age_years, SEX_CODE) %>%
    summarise(
        count = n(),
        mean_LOS = mean(LENGTH_OF_STAY, na.rm = TRUE),
        sd_LOS = sd(LENGTH_OF_STAY, na.rm=TRUE),
        median_LOS = median(LENGTH_OF_STAY, na.rm = TRUE),
        MAD_LOS = mad(LENGTH_OF_STAY)
    ) %>%
    mutate(upper_LOS = mean_LOS + 2*sd_LOS,
           lower_LOS = mean_LOS - 2*sd_LOS) %>% tibble(.)
## `summarise()` has grouped output by 'age_years'. You can override using the
## `.groups` argument.
## # A tibble: 12 × 9
##    age_years SEX_CODE count mean_LOS sd_LOS median_LOS MAD_LOS upper_LOS
##    <chr>     <chr>    <int>    <dbl>  <dbl>      <dbl>   <dbl>     <dbl>
##  1 65-69     F         4574     5.56   5.33          4    2.97      16.2
##  2 65-69     M         3067     6.50   7.33          5    2.97      21.2
##  3 70-74     F         7426     5.52   5.32          4    2.97      16.2
##  4 70-74     M         4353     6.48   7.54          5    2.97      21.5
##  5 75-79     F        10250     5.53   4.54          5    2.97      14.6
##  6 75-79     M         5718     6.22   5.37          5    2.97      17.0
##  7 80-84     F        11736     5.60   4.15          5    2.97      13.9
##  8 80-84     M         6148     6.20   5.17          5    2.97      16.5
##  9 85-89     F        10883     5.44   3.59          5    2.97      12.6
## 10 85-89     M         5368     6.08   5.03          5    2.97      16.1
## 11 90+       F         9404     5.47   3.64          5    2.97      12.7
## 12 90+       M         3866     5.73   4.31          5    2.97      14.4
## # ℹ 1 more variable: lower_LOS <dbl>
### NOTE: does a "weighted" LOS by age group change much?

#by age group

TX_IP_21to24 %>%
    filter(has_hip_fracture == 1 | has_TBI == 1, SEX_CODE %in% c("F","M")) %>% filter(is_fall == 1) %>%
    mutate(LENGTH_OF_STAY = as.numeric(LENGTH_OF_STAY)) %>%
    group_by(age_years, SEX_CODE) %>%
    summarise(
        count = n(),
        median_LOS = median(LENGTH_OF_STAY, na.rm = TRUE)
    ) %>%
    ungroup() %>%
    group_by(age_years) %>%
    summarise(
        weighted_median_LOS = sum(median_LOS * count) / sum(count)
    )
## `summarise()` has grouped output by 'age_years'. You can override using the
## `.groups` argument.
## # A tibble: 6 × 2
##   age_years weighted_median_LOS
##   <chr>                   <dbl>
## 1 65-69                    4.40
## 2 70-74                    4.37
## 3 75-79                    5   
## 4 80-84                    5   
## 5 85-89                    5   
## 6 90+                      5
#by sex

TX_IP_21to24 %>%
    filter(has_hip_fracture == 1 | has_TBI == 1, SEX_CODE %in% c("F","M")) %>% filter(is_fall == 1) %>%
    mutate(LENGTH_OF_STAY = as.numeric(LENGTH_OF_STAY)) %>%
    group_by(age_years, SEX_CODE) %>%
    summarise(
        count = n(),
        median_LOS = median(LENGTH_OF_STAY, na.rm = TRUE)
    ) %>%
    ungroup() %>%
    group_by(SEX_CODE) %>%
    summarise(
        weighted_median_LOS = sum(median_LOS * count) / sum(count)
    )
## `summarise()` has grouped output by 'age_years'. You can override using the
## `.groups` argument.
## # A tibble: 2 × 2
##   SEX_CODE weighted_median_LOS
##   <chr>                  <dbl>
## 1 F                       4.78
## 2 M                       5
#answer: nope

## visualization of LOS distro by age group via boxplot

TX_IP_21to24 %>%
    filter(has_hip_fracture == 1 | has_TBI == 1, SEX_CODE %in% c("F","M")) %>% filter(is_fall == 1) %>% ggplot(., aes(x=age_years, y=LENGTH_OF_STAY, group = SEX_CODE, fill = SEX_CODE)) + geom_boxplot() + facet_wrap(~age_years) + labs(y="") +   theme(axis.ticks.y = element_blank(),                                         axis.text.y = element_blank())

04/06/2026 DAWN OF THE RISING DOCTORS BORN IN THE CRUCIBLE OF DR. LIU’S ABSTRACT STUFF REBORN AS A MORE COMPETENT VERSION OF THE ORIGINAL (PROBABLY)

BEHOLD

pmt_src_key <- c("09" = "Self Pay",
                 "9" = "Self Pay",
                 "HM" = "Health Maintenance Organization",
                 "10" = "Central Certification",
                 "LI" = "Liability",
                 "11" = "Other Non-Federal Programs",
                 "LM" = "Liability Medical",
                 "12" = "Preferred Provider Organization (PPO)",
                 "MA" = "Medicare Part A",
                 "13" = "Point of Service (POS)",
                 "MB" = "Medicare Part B",
                 "14" = "Exclusive Provider Organization (EPO)",
                 "MC" = "Medicaid",
                 "15" = "Indemnity Insurance",
                 "TV" = "Title V",
                 "16" = "HMO Medicare Risk",
                 "OF" = "Other Federal Program",
                 "AM" = "Automobile Medical",
                 "VA" = "Veterans Administration Plan",
                 "BL" = "Blue Cross / Blue Shield",
                 "WC" = "Workers' Compensation Health Claim",
                 "CH" = "CHAMPUS",
                 "ZZ" = "Charity / Indigent / Unknown",
                 "CI" = "Commercial Insurance",
                 "DS" = "Disability Insurance")


# this is the *code* vector you want to use in %in% for human readable

pmt_src_loss_key <- unname(pmt_src_key[loss_leaders])
pmt_src_loss_key
## [1] "Medicaid"                     "Medicare Part A"             
## [3] "Medicare Part B"              "HMO Medicare Risk"           
## [5] "Veterans Administration Plan"
dbGetQuery(con, "SELECT
has_ADRD,
year,
FIRST_PAYMENT_SRC,
age_years,
Sex,
Race_1,
Ethnicity_1,
has_TBI,
has_hip_fracture,
routine_DC_home,
is_fall,
SOURCE_OF_ADMISSION,
PAT_STATUS_desc,
TOTAL_CHARGES,
FROM ip;") ->TX_IP_logcols_21to24

factor_vars <- c(
    "has_ADRD",
    "age_years",
    "Sex",
    "Race_1",
    "Ethnicity_1",
    "has_TBI",
    "has_hip_fracture",
    "PAT_STATUS_desc",
    "routine_DC_home",
    "SOURCE_OF_ADMISSION",
    "is_fall"
)

TX_IP_logcols_21to24 <- TX_IP_logcols_21to24 |>
    mutate(across(all_of(factor_vars), as.factor))

## reference level establishment (inpatient)

TX_IP_logcols_21to24 <- TX_IP_logcols_21to24 %>%
    mutate(
        Race_1 = fct_relevel(Race_1, "White"),
        Ethnicity_1 = fct_relevel(Ethnicity_1, "Non-Hispanic"),
        Sex = fct_relevel(Sex, "Male"),
        age_years = fct_relevel(age_years, "65-69"),
        has_ADRD = factor(has_ADRD, levels = c(0,1)),
        has_TBI = factor(has_TBI, levels = c(0, 1)),
        is_fall = factor(is_fall, levels = c(0, 1)),
        routine_DC_home = factor(routine_DC_home, levels = c(0, 1)),
        has_hip_fracture = factor(has_hip_fracture, levels = c(0, 1)),
        PAT_STATUS_desc = fct_relevel(PAT_STATUS_desc, "Discharged to home or self-care (routine discharge)")
    )

TX_IP_logcols_21to24<- TX_IP_logcols_21to24 %>% mutate(loss = ifelse(FIRST_PAYMENT_SRC %in% pmt_src_loss_key, "loss", "regular"))

TX_IP_logcols_21to24<- left_join(TX_IP_logcols_21to24, avg_TX_CCRs, by=c("year"="YEAR")) %>% mutate(hosp_cost = TOTAL_CHARGES * median_CCR, expec_reimb = ifelse(loss == "loss", hosp_cost * average_covered, TOTAL_CHARGES), expec_gain = expec_reimb-hosp_cost)

####

TX_IP_logcols_21to24 %>% filter(has_ADRD == 1 & is_fall == 1) %>% .$TOTAL_CHARGES %>% sum(.) %>% scales::comma(.)
## [1] "6,084,374,833"
##

adrd_fall_hospcost<- TX_IP_logcols_21to24 %>% filter(has_ADRD == 1 & is_fall == 1 & loss == "loss") %>% .$hosp_cost %>% sum(.)

print("adrd_fall_hospcost")
## [1] "adrd_fall_hospcost"
adrd_fall_hospcost %>% scales::comma(.)
## [1] "2,662,122,002"
adrd_fall_hosp_reimburse<- TX_IP_logcols_21to24 %>% filter(has_ADRD == 1 & is_fall == 1 & loss == "loss") %>% .$expec_reimb %>% sum(.)

print("adrd_fall_hosp_reimburse")
## [1] "adrd_fall_hosp_reimburse"
adrd_fall_hosp_reimburse %>% scales::comma(.)
## [1] "2,049,833,942"
{adrd_fall_hospcost - adrd_fall_hosp_reimburse} %>% as.numeric(.) %>% scales::comma(.)
## [1] "612,288,061"
# From 2021 to 2024, $6.08B were charged to hospitalized patients with ADRD who experienced a fall. Of these, $5.6B were charged to patients with a loss-leading insurance. The total cost of care across all years for those with a "lossy" public source insurance was approximately $2.69B of which only $2.07B, on average, was expected to be reimbursed. Altogether, this resulted in a net loss of $618M for all hospitals across Texas, not including prospective profit.


TX_IP_logcols_21to24 %>% filter(has_ADRD == 1 & is_fall == 1) %>% .$hosp_cost %>% sum(.) %>% scales::comma(.)
## [1] "2,903,244,724"
## of the 

TX_IP_logcols_21to24 %>% filter(has_ADRD == 1 & is_fall == 1) %>% .$expec_reimb %>% sum(.) %>% scales::comma(.)
## [1] "2,555,423,168"
####

glm(TOTAL_CHARGES ~ has_hip_fracture*has_TBI*loss + age_years + Race_1+Ethnicity_1+Sex, family = Gamma(link = "log"),  data = TX_IP_logcols_21to24 %>% filter(TOTAL_CHARGES > 0 & has_ADRD ==1 & is_fall == 1)) ->ip_glm_logcharge_fall

library(marginaleffects)

mf <- model.frame(ip_glm_logcharge_fall)

preds<- avg_predictions(ip_glm_logcharge_fall, by = c("has_hip_fracture", "has_TBI", "loss"), type = "response", newdata = mf)

preds
## 
##  has_hip_fracture has_TBI    loss Estimate Std. Error      z Pr(>|z|)     S
##                 0       0 loss       88915        471 188.67   <0.001   Inf
##                 0       0 regular    84413       1441  58.57   <0.001   Inf
##                 0       1 loss      102697       1343  76.47   <0.001   Inf
##                 0       1 regular    87259       3828  22.79   <0.001 379.7
##                 1       0 loss      120106        960 125.14   <0.001   Inf
##                 1       0 regular   114596       3006  38.12   <0.001   Inf
##                 1       1 loss      151790       9130  16.63   <0.001 203.8
##                 1       1 regular    88331      18331   4.82   <0.001  19.4
##   2.5 % 97.5 %
##   87992  89839
##   81588  87237
##  100065 105329
##   79756  94762
##  118225 121987
##  108704 120487
##  133896 169685
##   52403 124258
## 
## Type: response
preds <- preds %>%
    mutate(group = case_when(
        has_hip_fracture == 0 & has_TBI == 0 ~ "None",
        has_hip_fracture == 0 & has_TBI == 1 ~ "TBI only",
        has_hip_fracture == 1 & has_TBI == 0 ~ "Hip fracture only",
        TRUE ~ "Both"
    ))

ratio_df <- preds %>%
    mutate(group = case_when(
        has_hip_fracture == 0 & has_TBI == 0 ~ "None",
        has_hip_fracture == 0 & has_TBI == 1 ~ "TBI only",
        has_hip_fracture == 1 & has_TBI == 0 ~ "Hip fracture only",
        TRUE ~ "Both"
    )) %>%
    select(group, loss, estimate) %>%
    pivot_wider(names_from = loss, values_from = estimate) %>%
    mutate(ratio = loss / regular)

ggplot(ratio_df, aes(x = group, y = ratio)) +
    geom_hline(yintercept = 1, linetype = "dashed", color = "gray40") +
    geom_point(size = 4, color = "#D55E00") +
    geom_text(aes(label = round(ratio, 2)), vjust = -1) +
    labs(
        y = "Loss / Regular Predicted Charges",
        x = "Injury Profile"
    ) +
    ylim(0.8, max(ratio_df$ratio) * 1.2)

###

cmp <- comparisons(
  ip_glm_logcharge_fall,
  variables = "loss",
  by = c("has_hip_fracture", "has_TBI"),
  type = "response"
)

cmp
## 
##  has_hip_fracture has_TBI Estimate Std. Error     z Pr(>|z|)    S   2.5 %
##                 0       0    -8160       1467 -5.56  < 0.001 25.2  -11034
##                 0       1   -18416       3949 -4.66  < 0.001 18.3  -26155
##                 1       0    -9643       3065 -3.15  0.00165  9.2  -15650
##                 1       1   -63180      20523 -3.08  0.00208  8.9 -103405
##  97.5 %
##   -5285
##  -10676
##   -3636
##  -22955
## 
## Term: loss
## Type: response
## Comparison: regular - loss
library(dplyr)
library(gt)
library(scales)
## 
## Attaching package: 'scales'
## The following object is masked from 'package:coin':
## 
##     pvalue
## The following object is masked from 'package:purrr':
## 
##     discard
## The following object is masked from 'package:readr':
## 
##     col_factor
cmp_tbl <- cmp %>%
    mutate(
        `Injury profile` = case_when(
            has_hip_fracture == 0 & has_TBI == 0 ~ "No HF/TBI",
            has_hip_fracture == 1 & has_TBI == 0 ~ "Hip fracture only",
            has_hip_fracture == 0 & has_TBI == 1 ~ "TBI only",
            has_hip_fracture == 1 & has_TBI == 1 ~ "Hip fracture + TBI"
        ),
        `Adjusted difference` = abs(estimate),
        `95% CI` = paste0(
            dollar(abs(`conf.high`), accuracy = 1),
            " to ",
            dollar(abs(`conf.low`), accuracy = 1)
        ),
        `P value` = case_when(
            `p.value` < 0.001 ~ "<0.001",
            TRUE ~ formatC(`p.value`, format = "f", digits = 3)
        )
    ) %>%
    select(
        `Injury profile`,
        `Adjusted difference`,
        `95% CI`,
        `P value`
    )

gt_tbl <- cmp_tbl %>%
    gt() %>%
    fmt_currency(
        columns = `Adjusted difference`,
        currency = "USD",
        decimals = 0
    ) %>%
    cols_label(
        `Injury profile` = "Injury profile",
        `Adjusted difference` = md("**Higher charges in loss-associated payer group**"),
        `95% CI` = "95% CI",
        `P value` = md("*p*")
    ) %>%
    tab_header(
        title = md("**Adjusted Charge Differences by Injury Profile**"),
        subtitle = "Model-based comparison of loss-associated vs other payers"
    ) %>%
    tab_source_note(
        source_note = md("Values are absolute differences from the adjusted model; comparison originally estimated as regular minus loss.")
    ) %>%
    cols_align(
        align = "left",
        columns = `Injury profile`
    ) %>%
    cols_align(
        align = "center",
        columns = c(`Adjusted difference`, `95% CI`, `P value`)
    ) %>%
    tab_style(
        style = list(
            cell_text(weight = "bold")
        ),
        locations = cells_column_labels(everything())
    ) %>%
    opt_row_striping() %>%
    tab_options(
        table.font.size = px(18),
        data_row.padding = px(8),
        heading.title.font.size = px(24),
        heading.subtitle.font.size = px(16),
        source_notes.font.size = px(12),
        table.border.top.width = px(2),
        table.border.bottom.width = px(2)
    )

gt_tbl
Adjusted Charge Differences by Injury Profile
Model-based comparison of loss-associated vs other payers
Injury profile Higher charges in loss-associated payer group 95% CI p
No HF/TBI $8,160 $5,285 to $11,034 <0.001
TBI only $18,416 $10,676 to $26,155 <0.001
Hip fracture only $9,643 $3,636 to $15,650 0.002
Hip fracture + TBI $63,180 $22,955 to $103,405 0.002
Values are absolute differences from the adjusted model; comparison originally estimated as regular minus loss.
gtsave(gt_tbl, "poster_table.png")
## file:///C:/Users/DANIEL~1.PIN/AppData/Local/Temp/RtmpsPa0vu/filecf027143303.html screenshot completed
##

library(dplyr)
library(ggplot2)
library(forcats)
library(scales)

plot_df <- preds %>%
    mutate(
        group = case_when(
            group %in% c("None") ~ "No HF/TBI",
            group %in% c("Hip fracture only") ~ "Hip fracture only",
            group %in% c("TBI only") ~ "TBI only",
            group %in% c("Both") ~ "Hip fracture + TBI",
            TRUE ~ as.character(group)
        ),
        group = factor(
            group,
            levels = c("No HF/TBI", "Hip fracture only", "TBI only", "Hip fracture + TBI")
        ),
        loss = recode(
            loss,
            "loss" = "Loss-associated payer",
            "regular" = "Other payer"
        )
    )

# label the rightmost points directly so the legend can go away
label_df <- plot_df %>%
    filter(group == "Hip fracture + TBI")

poster_chart<- ggplot(
    plot_df,
    aes(x = group, y = estimate, color = loss, group = loss)
) +
    geom_line(linewidth = 1.1, position = position_dodge(width = 0.20)) +
    geom_point(size = 3.8, position = position_dodge(width = 0.20)) +
    geom_errorbar(
        aes(ymin = conf.low, ymax = conf.high),
        width = 0.10,
        linewidth = 0.8,
        position = position_dodge(width = 0.20)
    ) +
    geom_text(
        data = label_df,
        aes(label = loss),
        hjust = -0.15,
        size = 4.2,
        show.legend = FALSE,
        position = position_dodge(width = 0.20)
    ) +
    annotate(
        "text",
        x = 3.5,
        y = max(plot_df$conf.high, na.rm = TRUE) * 0.93,
        label = "~$63K gap\nlargest difference",
        fontface = "bold",
        size = 4.4
    ) +
    scale_y_continuous(
        labels = label_dollar(scale = 1),
        expand = expansion(mult = c(0.05, 0.18))
    ) +
    coord_cartesian(clip = "off") +
    labs(
        title = "Loss-Associated Payers Have Higher Predicted Charges Across Injury Profiles",
        x = NULL,
        y = "Predicted Hospital Charges"
    )  +
    theme_minimal(base_size = 15) +
    theme(
        legend.position = "none",
        plot.title = element_text(face = "bold", size = 17),
        plot.subtitle = element_text(size = 12),
        axis.text.x = element_text(size = 12, face = "bold"),
        axis.text.y = element_text(size = 11),
        axis.title.y = element_text(size = 13, face = "bold"),
        panel.grid.minor = element_blank(),
        panel.grid.major.x = element_blank(),
        plot.margin = margin(15, 80, 15, 15)
    )

poster_chart