1 Load R Libraries

# Ensure the correct working directory
here::dr_here()  # Show project root (for debugging)

## here() starts at /Users/tylermuffly/Dropbox (Personal)/AlternativePayments/alternative_payments.
## - This directory contains a file ".here"
## - Initial working directory: /Users/tylermuffly/Dropbox (Personal)/AlternativePayments/alternative_payments
## - Current working directory: /Users/tylermuffly/Dropbox (Personal)/AlternativePayments/alternative_payments

knitr::opts_knit$set(root.dir = here::here())  # Set knitr root directory

# Confirm project root is set correctly
stopifnot(here::here() == getwd())  # Ensures correct working directory

# Identify manuscript file for reproducibility
here::i_am("Manuscript.Rmd")  # Declare main file for reference

## here() starts at /Users/tylermuffly/Dropbox (Personal)/AlternativePayments/alternative_payments

2 Load Bespoke Functions

knitr::opts_chunk$set(echo = TRUE)

# Clear environment to avoid conflicts with older function versions
rm(list = ls())

#Source Bespoke Functions
source("/Users/tylermuffly/Dropbox (Personal)/AlternativePayments/alternative_payments/Code/bespoke_functions.R", local = knitr::knit_global())

For some reason all the functions can’t be sourced.

# Clear environment to avoid conflicts with existing functions
rm(list = ls())

# FUNCTIONS

#' Create Combined Cost Histogram
#' 
#' @description
#' Creates combined and faceted interactive cost histograms from either a CSV or RDS file.
#' The function automatically detects the file format based on extension and loads it
#' appropriately.
#' 
#' @param file_path Character. Path to the CSV or RDS file containing cost data
#' @param output_dir Character. Directory where the output plots will be saved.
#'        Default: current working directory
#' @param bins Numeric. Number of bins for the histogram. Default: 100
#' @param zoom_range Numeric vector. Range for zoomed view (c(min, max)). Default: c(0, 6000)
#' @param year_label Character. Year label for the plot title. Default: "2016"
#' @param create_faceted Logical. Whether to create faceted histogram. Default: TRUE
#' @param verbose Logical. Whether to print detailed logging messages. Default: FALSE
#' 
#' @return A list containing:
#'   \itemize{
#'     \item combined: The combined histogram plot
#'     \item faceted: The faceted histogram plot (if create_faceted = TRUE)
#'   }
#' 
#' @importFrom logger log_info log_error log_warn
#' @importFrom assertthat assert_that
#' @importFrom readr read_rds read_csv
#' @importFrom plotly plot_ly subplot
#' @importFrom htmlwidgets saveWidget
#' @importFrom dplyr filter mutate
#' @importFrom ggplot2 ggplot aes geom_histogram facet_wrap theme_minimal labs theme
#' @importFrom here here
#' 
#' @examples
#' \dontrun{
#' # Basic usage with CSV file
#' hist_plots <- create_combined_cost_histogram(
#'   file_path = "path/to/cost_data.csv", 
#'   output_dir = "output/directory", 
#'   bins = 100,
#'   zoom_range = c(0, 6000),
#'   year_label = "2016",
#'   create_faceted = TRUE,
#'   verbose = TRUE
#' )
#' 
#' # Using with RDS file and custom bins
#' hist_plots <- create_combined_cost_histogram(
#'   file_path = "path/to/cost_data.rds", 
#'   output_dir = "output/directory", 
#'   bins = 50,
#'   zoom_range = c(0, 10000),
#'   year_label = "2017",
#'   create_faceted = FALSE,
#'   verbose = TRUE
#' )
#' 
#' # Using relative paths with here package
#' hist_plots <- create_combined_cost_histogram(
#'   file_path = here::here("Data", "cost_data.csv"), 
#'   output_dir = here::here("Results", "plots"),
#'   bins = 75,
#'   zoom_range = c(0, 5000),
#'   year_label = "2018",
#'   create_faceted = TRUE,
#'   verbose = FALSE
#' )
#' }
#' @noRd
# Here's a fixed version of your validate_inputs function:

validate_inputs <- function(file_path, output_dir, bins, zoom_range, year_label, create_faceted, verbose) {
  # Check file_path
  assertthat::assert_that(
    is.character(file_path),
    msg = "file_path must be a character string"
  )
  
  # Check output_dir
  assertthat::assert_that(
    is.character(output_dir),
    msg = "output_dir must be a character string"
  )
  
  # Check bins
  assertthat::assert_that(
    is.numeric(bins),
    bins > 0,
    msg = "bins must be a positive numeric value"
  )
  
  # Check zoom_range
  assertthat::assert_that(
    is.numeric(zoom_range),
    length(zoom_range) == 2,
    zoom_range[1] < zoom_range[2],
    msg = "zoom_range must be a numeric vector of length 2 with min < max"
  )
  
  # Check year_label
  assertthat::assert_that(
    is.character(year_label) || is.numeric(year_label),
    msg = "year_label must be a character or numeric value"
  )
  
  # Check create_faceted
  assertthat::assert_that(
    is.logical(create_faceted),
    msg = "create_faceted must be a logical value (TRUE or FALSE)"
  )
  
  # Check verbose
  assertthat::assert_that(
    is.logical(verbose),
    msg = "verbose must be a logical value (TRUE or FALSE)"
  )
  
  if (verbose) logger::log_info("Input validation successful")
  
  return(TRUE)
}

#' @noRd
load_cost_data <- function(file_path) {
  logger::log_info("Loading cost data from: {file_path}")
  
  # Determine file type by extension
  if (grepl("\\.csv$", file_path, ignore.case = TRUE)) {
    logger::log_info("Detected CSV file format")
    tryCatch({
      # Load CSV file
      cost_data <- readr::read_csv(file_path, show_col_types = FALSE)
      logger::log_info("Successfully loaded CSV file with {nrow(cost_data)} rows and {ncol(cost_data)} columns")
      return(cost_data)
    }, error = function(e) {
      logger::log_error("Failed to load CSV file: {e$message}")
      stop("Failed to load CSV file: ", e$message)
    })
  } else if (grepl("\\.rds$", file_path, ignore.case = TRUE)) {
    logger::log_info("Detected RDS file format")
    tryCatch({
      # Load RDS file
      cost_data <- readr::read_rds(file_path)
      logger::log_info("Successfully loaded RDS file with {nrow(cost_data)} rows and {ncol(cost_data)} columns")
      return(cost_data)
    }, error = function(e) {
      logger::log_error("Failed to load RDS file: {e$message}")
      stop("Failed to load RDS file: ", e$message)
    })
  } else {
    # Try to guess based on content
    logger::log_warn("Unknown file extension. Attempting to detect format automatically.")
    
    # First try as RDS
    tryCatch({
      cost_data <- readr::read_rds(file_path)
      logger::log_info("Successfully loaded file as RDS")
      return(cost_data)
    }, error = function(e) {
      # If RDS fails, try CSV
      tryCatch({
        cost_data <- readr::read_csv(file_path, show_col_types = FALSE)
        logger::log_info("Successfully loaded file as CSV")
        return(cost_data)
      }, error = function(e2) {
        logger::log_error("Could not load file as either RDS or CSV")
        stop("Could not determine file format. Please use a .csv or .rds file.")
      })
    })
  }
}

#' @noRd
verify_cost_data <- function(cost_data) {
  logger::log_info("Verifying cost data structure")
  
  # Check if it's a data frame
  assertthat::assert_that(
    is.data.frame(cost_data),
    msg = "Cost data must be a data frame"
  )
  
  # Log column names for debugging
  logger::log_info("Available columns: {paste(names(cost_data), collapse=', ')}")
  
  # Check for required columns - look for common cost column names
  cost_column_candidates <- c(
    "Total_cost", "total_cost", "cost", "Cost", 
    "COST", "total_price", "Total_Price", "price", "Price"
  )
  
  # Check if any of the candidate columns exist
  cost_column_exists <- any(cost_column_candidates %in% names(cost_data))
  
  if (!cost_column_exists) {
    logger::log_warn("No standard cost column found. Available columns: {paste(names(cost_data), collapse=', ')}")
    logger::log_warn("Will attempt to use the first numeric column as cost")
    
    # Find first numeric column
    numeric_cols <- sapply(cost_data, is.numeric)
    if (any(numeric_cols)) {
      first_numeric_col <- names(cost_data)[which(numeric_cols)[1]]
      logger::log_info("Using {first_numeric_col} as cost column")
    } else {
      logger::log_error("No numeric columns found in the dataset")
      stop("No appropriate cost column found in the dataset")
    }
  } else {
    logger::log_info("Cost column verification successful")
  }
}

#' @noRd
create_combined_histogram <- function(cost_data, bins, zoom_range, year_label) {
  logger::log_info("Creating combined histogram")
  
  # Identify cost column - try common names or use first numeric column
  cost_column <- identify_cost_column(cost_data)
  
  # Check for episode_type or similar column for grouping
  group_column <- identify_group_column(cost_data)
  
  # If we have a grouping column, use it, otherwise just show overall
  if (!is.null(group_column)) {
    # Get unique groups
    groups <- unique(cost_data[[group_column]])
    logger::log_info("Found {length(groups)} groups in {group_column} column")
    
    # Prepare plots list
    plots <- list()
    
    # Create histogram for each group
    for (group in groups) {
      group_data <- dplyr::filter(cost_data, .data[[group_column]] == group)
      
      # Skip empty groups
      if (nrow(group_data) == 0) {
        logger::log_warn("Group '{group}' has no data, skipping")
        next
      }
      
      logger::log_info("Creating histogram for group: {group} with {nrow(group_data)} observations")
      
      # Create plotly histogram
      p <- plotly::plot_ly(
        data = group_data,
        x = ~.data[[cost_column]],
        type = "histogram",
        nbinsx = bins,
        name = group
      ) %>%
        plotly::layout(
          title = paste("Cost Distribution -", group),
          xaxis = list(title = "Cost ($)", range = zoom_range),
          yaxis = list(title = "Count"),
          bargap = 0.1
        )
      
      plots[[length(plots) + 1]] <- p
    }
    
    # Create subplot with all histograms
    if (length(plots) > 0) {
      combined_plot <- plotly::subplot(plots, nrows = length(plots), shareX = TRUE, titleY = TRUE)
      combined_plot <- combined_plot %>%
        plotly::layout(
          title = paste("Cost Distribution by Group -", year_label),
          showlegend = FALSE
        )
      
      logger::log_info("Combined histogram created successfully")
      return(combined_plot)
    } else {
      logger::log_error("No valid groups found for histogram")
      stop("No valid groups found for histogram")
    }
  } else {
    # Just create one overall histogram
    logger::log_info("No grouping column found, creating overall histogram")
    
    p <- plotly::plot_ly(
      data = cost_data,
      x = ~.data[[cost_column]],
      type = "histogram",
      nbinsx = bins
    ) %>%
      plotly::layout(
        title = paste("Overall Cost Distribution -", year_label),
        xaxis = list(title = "Cost ($)", range = zoom_range),
        yaxis = list(title = "Count"),
        bargap = 0.1
      )
    
    logger::log_info("Overall histogram created successfully")
    return(p)
  }
}

#' @noRd
create_faceted_histogram <- function(cost_data, bins, zoom_range, year_label) {
  logger::log_info("Creating faceted histogram")
  
  # Identify cost column - try common names or use first numeric column
  cost_column <- identify_cost_column(cost_data)
  
  # Check for episode_type or similar column for grouping
  group_column <- identify_group_column(cost_data)
  
  # If we have a grouping column, use it, otherwise just show overall
  if (!is.null(group_column)) {
    # Create ggplot histogram
    logger::log_info("Creating faceted ggplot histogram with {group_column} as facet")
    
    p <- ggplot2::ggplot(cost_data, ggplot2::aes(x = .data[[cost_column]])) +
      ggplot2::geom_histogram(bins = bins, fill = "steelblue", color = "white") +
      ggplot2::facet_wrap(~ .data[[group_column]], scales = "free_y") +
      ggplot2::labs(
        title = paste("Cost Distribution by", group_column, "-", year_label),
        x = "Cost ($)",
        y = "Count"
      ) +
      ggplot2::theme_minimal() +
      ggplot2::theme(
        plot.title = ggplot2::element_text(hjust = 0.5),
        strip.background = ggplot2::element_rect(fill = "lightgray")
      )
    
    # Convert to plotly for interactivity
    p_interactive <- plotly::ggplotly(p) %>%
      plotly::layout(xaxis = list(range = zoom_range))
    
    logger::log_info("Faceted histogram created successfully")
    return(p_interactive)
  } else {
    # Just create one overall histogram
    logger::log_info("No grouping column found, creating overall histogram instead of faceted")
    
    p <- ggplot2::ggplot(cost_data, ggplot2::aes(x = .data[[cost_column]])) +
      ggplot2::geom_histogram(bins = bins, fill = "steelblue", color = "white") +
      ggplot2::labs(
        title = paste("Overall Cost Distribution -", year_label),
        x = "Cost ($)",
        y = "Count"
      ) +
      ggplot2::theme_minimal()
    
    # Convert to plotly for interactivity
    p_interactive <- plotly::ggplotly(p) %>%
      plotly::layout(xaxis = list(range = zoom_range))
    
    logger::log_warn("Created single histogram instead of faceted due to missing group column")
    return(p_interactive)
  }
}

#' @noRd
identify_cost_column <- function(cost_data) {
  # Check common cost column names
  cost_column_candidates <- c(
    "Total_cost", "total_cost", "cost", "Cost", 
    "COST", "total_price", "Total_Price", "price", "Price"
  )
  
  # Find first matching column
  for (candidate in cost_column_candidates) {
    if (candidate %in% names(cost_data)) {
      logger::log_info("Using '{candidate}' as cost column")
      return(candidate)
    }
  }
  
  # If no match, use first numeric column
  numeric_cols <- sapply(cost_data, is.numeric)
  if (any(numeric_cols)) {
    first_numeric_col <- names(cost_data)[which(numeric_cols)[1]]
    logger::log_warn("No standard cost column found. Using '{first_numeric_col}' as cost column")
    return(first_numeric_col)
  }
  
  # If no numeric columns, throw error
  logger::log_error("No appropriate cost column found")
  stop("No numeric columns found for cost analysis")
}

#' @noRd
identify_group_column <- function(cost_data) {
  # Check common group column names
  group_column_candidates <- c(
    "episode_type", "Episode_type", "group", "Group", 
    "category", "Category", "therapy", "Therapy", "treatment", "Treatment"
  )
  
  # Find first matching column
  for (candidate in group_column_candidates) {
    if (candidate %in% names(cost_data)) {
      logger::log_info("Using '{candidate}' as grouping column")
      return(candidate)
    }
  }
  
  # If no match, look for columns with few unique values (potential categorical)
  potential_group_cols <- sapply(cost_data, function(col) {
    if (is.character(col) || is.factor(col)) {
      n_unique <- length(unique(col))
      return(n_unique > 1 && n_unique < 10)  # Between 2 and 9 unique values
    }
    return(FALSE)
  })
  
  if (any(potential_group_cols)) {
    first_potential_col <- names(cost_data)[which(potential_group_cols)[1]]
    logger::log_warn("No standard group column found. Using '{first_potential_col}' as group column")
    return(first_potential_col)
  }
  
  # If no suitable column found, return NULL
  logger::log_warn("No suitable grouping column found")
  return(NULL)
}

#' @noRd
save_interactive_plot <- function(plot, base_name, output_dir) {
  # Generate file path
  file_path <- file.path(output_dir, paste0(base_name, ".html"))
  logger::log_info("Saving interactive plot to: {file_path}")
  
  # Save the plot
  tryCatch({
    htmlwidgets::saveWidget(plot, file_path, selfcontained = TRUE)
    logger::log_info("Plot saved successfully")
  }, error = function(e) {
    logger::log_error("Failed to save plot: {e$message}")
    
    # Try alternative approach with different options
    tryCatch({
      logger::log_info("Trying alternative save approach")
      htmlwidgets::saveWidget(plot, file_path, selfcontained = FALSE)
      logger::log_info("Plot saved with selfcontained = FALSE")
    }, error = function(e2) {
      logger::log_error("All save attempts failed: {e2$message}")
      warning("Could not save plot to file: ", e2$message)
    })
  })
  
create_combined_cost_histogram <- function(file_path, 
                                           output_dir = getwd(), 
                                           bins = 100,
                                           zoom_range = c(0, 6000),
                                           year_label = "2016",
                                           create_faceted = TRUE,
                                           verbose = FALSE) {
  
  # Initialize logging
  setup_logging(verbose)
  logger::log_info("Starting create_combined_cost_histogram function")
  
  # Log input parameters
  logger::log_info("Parameters: file_path = {file_path}, output_dir = {output_dir}, 
                   bins = {bins}, zoom_range = {zoom_range[1]}-{zoom_range[2]},
                   year_label = {year_label}, create_faceted = {create_faceted},
                   verbose = {verbose}")
  
  # Validate inputs
  validate_inputs(file_path, output_dir, bins, zoom_range, year_label, create_faceted, verbose)
  
  # Create output directory if it doesn't exist
  if (!dir.exists(output_dir)) {
    dir.create(output_dir, recursive = TRUE)
    logger::log_info("Created output directory: {output_dir}")
  }
  
  # Read data
  cost_data <- load_cost_data(file_path)
  
  # Check if we have the expected cost data structure
  verify_cost_data(cost_data)
  
  # Create combined interactive histogram
  logger::log_info("Creating combined interactive histogram")
  combined_hist <- create_combined_histogram(cost_data, bins, zoom_range, year_label)
  
  # Save the combined histogram
  save_interactive_plot(combined_hist, "combined_cost_histogram", output_dir)
  
  # Create faceted histogram if requested
  faceted_hist <- NULL
  if (create_faceted) {
    logger::log_info("Creating faceted interactive histogram")
    faceted_hist <- create_faceted_histogram(cost_data, bins, zoom_range, year_label)
    save_interactive_plot(faceted_hist, "faceted_cost_histogram", output_dir)
  }
  
  # Return the histograms
  logger::log_info("Function execution completed successfully")
  return(list(
    combined = combined_hist,
    faceted = faceted_hist
  ))
}

#' @noRd
setup_logging <- function(verbose) {
  if (verbose) {
    logger::log_threshold(logger::INFO)
  } else {
    logger::log_threshold(logger::ERROR)
  }
}


}


#' create_combined_cost_histogram <- function(file_path,
#'                                            output_dir = getwd(),
#'                                            bins = 100,
#'                                            zoom_range = c(0, 6000),
#'                                            year_label = "2016",
#'                                            create_faceted = TRUE,
#'                                            verbose = FALSE) {
#' 
#'   # Set up logging based on verbose parameter
#'   setup_logging(verbose)
#' 
#'   # Start logging
#'   logger::log_info("Starting create_combined_cost_histogram function")
#'   logger::log_info("Parameters: file_path = {file_path}, output_dir = {output_dir},
#'                    bins = {bins}, zoom_range = {zoom_range[1]}-{zoom_range[2]},
#'                    year_label = {year_label}, create_faceted = {create_faceted},
#'                    verbose = {verbose}")
#' 
#'   # Validate inputs
#'   validate_inputs(file_path, output_dir, bins, zoom_range, year_label, create_faceted, verbose)
#' 
#'   # Read data
#'   cost_data <- read_cost_data(file_path)
#' 
#'   # Create combined interactive histogram
#'   logger::log_info("Creating combined interactive histogram")
#'   combined_hist <- create_combined_histogram(cost_data, bins, zoom_range, year_label)
#' 
#'   # Save the combined interactive histogram
#'   save_interactive_plot(combined_hist, "combined_cost_histogram", output_dir)
#' 
#'   # Create faceted histogram if requested
#'   faceted_hist <- NULL
#'   if (create_faceted) {
#'     logger::log_info("Creating faceted interactive histogram")
#'     faceted_hist <- create_faceted_histogram(cost_data, bins, zoom_range, year_label)
#'     save_interactive_plot(faceted_hist, "faceted_cost_histogram", output_dir)
#'   }
#' 
#'   # Return the histograms
#'   logger::log_info("Function execution completed successfully")
#'   return(list(
#'     combined = combined_hist,
#'     faceted = faceted_hist
#'   ))
#' }
#' 
#' #' Set up logging configuration
#' #'
#' #' @param verbose Whether to use verbose logging
#' #' @noRd
#' setup_logging <- function(verbose) {
#'   # Configure logger
#'   if (verbose) {
#'     logger::log_threshold(logger::INFO)
#'   } else {
#'     logger::log_threshold(logger::WARN)
#'   }
#' }
#' 
#' #' Validate function inputs
#' #'
#' #' @param file_path Path to the CSV file
#' #' @param output_dir Directory to save output plots
#' #' @param bins Number of bins for histograms
#' #' @param zoom_range Default zoom range for interactive plots
#' #' @param year_label Year label for x-axis
#' #' @param create_faceted Whether to create faceted histogram
#' #' @param verbose Whether to use verbose logging
#' #' @noRd
#' validate_inputs <- function(file_path, output_dir, bins, zoom_range, year_label, create_faceted, verbose) {
#'   logger::log_info("Validating input parameters")
#' 
#'   # Check file_path
#'   assertthat::assert_that(
#'     is.character(file_path) && length(file_path) == 1,
#'     msg = "file_path must be a single character string"
#'   )
#' 
#'   if (!file.exists(file_path)) {
#'     logger::log_error("File not found: {file_path}")
#'     stop("File not found: ", file_path)
#'   }
#' 
#'   # Check output_dir
#'   assertthat::assert_that(
#'     is.character(output_dir) && length(output_dir) == 1,
#'     msg = "output_dir must be a single character string"
#'   )
#' 
#'   # Create output directory if it doesn't exist
#'   if (!dir.exists(output_dir)) {
#'     logger::log_info("Creating output directory: {output_dir}")
#'     dir.create(output_dir, recursive = TRUE)
#'   }
#' 
#'   # Check bins
#'   assertthat::assert_that(
#'     is.numeric(bins) && length(bins) == 1 && bins > 0,
#'     msg = "bins must be a positive number"
#'   )
#' 
#'   # Check zoom_range
#'   assertthat::assert_that(
#'     is.numeric(zoom_range) && length(zoom_range) == 2 && zoom_range[1] < zoom_range[2],
#'     msg = "zoom_range must be a numeric vector of length 2 with zoom_range[1] < zoom_range[2]"
#'   )
#' 
#'   # Check year_label
#'   assertthat::assert_that(
#'     is.character(year_label) && length(year_label) == 1,
#'     msg = "year_label must be a single character string"
#'   )
#' 
#'   # Check create_faceted
#'   assertthat::assert_that(
#'     is.logical(create_faceted) && length(create_faceted) == 1,
#'     msg = "create_faceted must be a logical value (TRUE or FALSE)"
#'   )
#' 
#'   # Check verbose
#'   assertthat::assert_that(
#'     is.logical(verbose) && length(verbose) == 1,
#'     msg = "verbose must be a logical value (TRUE or FALSE)"
#'   )
#' 
#'   # Check if required packages are installed
#'   if (!requireNamespace("plotly", quietly = TRUE)) {
#'     logger::log_warn("Package 'plotly' is required but not installed")
#'     warning("Package 'plotly' is required. Please install it with install.packages('plotly')")
#'   }
#' 
#'   if (!requireNamespace("htmlwidgets", quietly = TRUE)) {
#'     logger::log_warn("Package 'htmlwidgets' is required for saving interactive plots but not installed")
#'     warning("Package 'htmlwidgets' is required for saving interactive plots. Please install it with install.packages('htmlwidgets')")
#'   }
#' 
#'   logger::log_info("All input parameters are valid")
#' }
#' 
#' #' Read cost data from CSV file
#' #'
#' #' @param file_path Path to the CSV file
#' #' @return Dataframe containing cost data
#' #' @noRd
#' read_cost_data <- function(file_path) {
#'   logger::log_info("Reading data from {file_path}")
#' 
#'   tryCatch({
#'     # Read the data with readr for better performance
#'     cost_data <- readr::read_csv(
#'       file_path,
#'       show_col_types = FALSE,
#'       progress = FALSE
#'     )
#' 
#'     # Check if required columns exist
#'     assertthat::assert_that(
#'       "episode_type" %in% colnames(cost_data),
#'       msg = "CSV file must contain an 'episode_type' column"
#'     )
#' 
#'     assertthat::assert_that(
#'       "Total_cost" %in% colnames(cost_data),
#'       msg = "CSV file must contain a 'Total_cost' column"
#'     )
#' 
#'     # Ensure Total_cost is numeric
#'     cost_data <- cost_data %>%
#'       dplyr::mutate(Total_cost = as.numeric(Total_cost))
#' 
#'     # Log data dimensions
#'     logger::log_info("Read {nrow(cost_data)} rows and {ncol(cost_data)} columns")
#' 
#'     # Log unique episode types
#'     episode_types <- unique(cost_data$episode_type)
#'     logger::log_info("Found {length(episode_types)} unique episode types: {paste(episode_types, collapse = ', ')}")
#' 
#'     # Log Total_cost summary
#'     cost_summary <- summary(cost_data$Total_cost)
#'     logger::log_info("Total_cost summary: Min={cost_summary[1]}, Median={cost_summary[3]}, Max={cost_summary[6]}")
#' 
#'     return(cost_data)
#'   }, error = function(e) {
#'     logger::log_error("Error reading data: {e$message}")
#'     stop("Error reading data: ", e$message)
#'   })
#' }
#' 
#' #' Create combined histogram with all episode types in one plot
#' #'
#' #' @param cost_data Dataframe with Total_cost and episode_type columns
#' #' @param bins Number of bins for histogram
#' #' @param zoom_range Default zoom range for interactive plots
#' #' @param year_label Year label for x-axis
#' #' @return plotly object
#' #' @noRd
#' create_combined_histogram <- function(cost_data, bins, zoom_range, year_label) {
#'   logger::log_info("Creating combined histogram with {bins} bins")
#' 
#'   # Filter out any rows with NA
#'   plot_data <- cost_data %>%
#'     tidyr::drop_na(Total_cost, episode_type)
#' 
#'   # Define colors for episode types
#'   color_palette <- c(
#'     "PT" = "#E69F00",        # Orange
#'     "Pessary" = "#56B4E9",   # Blue
#'     "UI Sling" = "#8DD3C7"   # Green/teal
#'   )
#' 
#'   # Create the combined histogram
#'   p <- ggplot2::ggplot(plot_data, ggplot2::aes(x = Total_cost, fill = episode_type)) +
#'     ggplot2::geom_histogram(
#'       position = "identity",
#'       alpha = 0.7,
#'       bins = bins
#'     ) +
#'     ggplot2::scale_fill_manual(values = color_palette) +
#'     ggplot2::scale_x_continuous(
#'       labels = scales::dollar_format(accuracy = 1),
#'       breaks = scales::pretty_breaks(n = 10)
#'     ) +
#'     ggplot2::labs(
#'       title = "Distribution of Total Medical Costs by Episode Type",
#'       x = paste0("Total cost (", year_label, " USD)"),
#'       y = "Number of Rows",
#'       fill = "Episode Type"
#'     ) +
#'     ggplot2::theme_minimal() +
#'     ggplot2::theme(
#'       plot.title = ggplot2::element_text(size = 16, face = "bold"),
#'       axis.title.x = ggplot2::element_text(size = 14),
#'       axis.title.y = ggplot2::element_text(size = 14),
#'       legend.title = ggplot2::element_text(size = 12),
#'       legend.text = ggplot2::element_text(size = 12),
#'       legend.position = "top",
#'       panel.grid.minor = ggplot2::element_blank()
#'     )
#' 
#'   # Calculate bin information for tooltips
#'   bin_data <- calculate_bin_data(plot_data, bins)
#' 
#'   # Convert to plotly and add custom tooltips
#'   plotly_hist <- plotly::ggplotly(p, tooltip = c("x", "y", "fill")) %>%
#'     plotly::layout(
#'       xaxis = list(
#'         range = zoom_range,
#'         title = list(text = paste0("Total cost (", year_label, " USD)")),
#'         tickprefix = "$",
#'         tickformat = ",d"
#'       ),
#'       yaxis = list(
#'         title = list(text = "Number of Rows")
#'       ),
#'       legend = list(
#'         orientation = "h",
#'         y = 1.1,
#'         x = 0.5,
#'         xanchor = "center"
#'       ),
#'       hoverlabel = list(
#'         bgcolor = "white",
#'         bordercolor = "black",
#'         font = list(size = 12)
#'       )
#'     )
#' 
#'   # Add custom hover template that shows the bin range
#'   for (i in seq_len(length(plotly_hist$x$data))) {
#'     if (!is.null(plotly_hist$x$data[[i]]$name)) {
#'       ep_type <- plotly_hist$x$data[[i]]$name
#'       plotly_hist$x$data[[i]]$hovertemplate <- paste0(
#'         "episode_type: ", ep_type, "<br>",
#'         "Total_cost: $%{x:,.2f}<br>",
#'         "(Number of Rows): %{y:,.0f}<br>",
#'         "<extra></extra>"
#'       )
#'     }
#'   }
#' 
#'   return(plotly_hist)
#' }
#' 
#' #' Create faceted histogram with separate panels for each episode type
#' #'
#' #' @param cost_data Dataframe with Total_cost and episode_type columns
#' #' @param bins Number of bins for histogram
#' #' @param zoom_range Default zoom range for interactive plots
#' #' @param year_label Year label for x-axis
#' #' @return plotly object
#' #' @noRd
#' create_faceted_histogram <- function(cost_data, bins, zoom_range, year_label) {
#'   logger::log_info("Creating faceted histogram with {bins} bins")
#' 
#'   # Filter out any rows with NA
#'   plot_data <- cost_data %>%
#'     tidyr::drop_na(Total_cost, episode_type)
#' 
#'   # Define colors for episode types
#'   color_palette <- c(
#'     "PT" = "#E69F00",        # Orange
#'     "Pessary" = "#56B4E9",   # Blue
#'     "UI Sling" = "#8DD3C7"   # Green/teal
#'   )
#' 
#'   # Use plot_ly directly for more control over hover templates
#'   p <- plotly::plot_ly()
#' 
#'   # Add a trace for each episode type
#'   for (ep_type in unique(plot_data$episode_type)) {
#'     # Filter data for this episode type
#'     ep_data <- plot_data %>%
#'       dplyr::filter(episode_type == ep_type)
#' 
#'     # Calculate histogram data manually to get bin ranges
#'     hist_data <- calculate_histogram_data(ep_data$Total_cost, bins)
#' 
#'     # Add histogram trace with custom hover template
#'     p <- p %>% plotly::add_trace(
#'       x = hist_data$x,
#'       y = hist_data$y,
#'       type = "bar",
#'       name = ep_type,
#'       marker = list(
#'         color = color_palette[ep_type],
#'         line = list(color = "white", width = 0.5)
#'       ),
#'       opacity = 0.7,
#'       width = hist_data$width,
#'       customdata = hist_data$customdata,
#'       hovertemplate = paste0(
#'         "episode_type: ", ep_type, "<br>",
#'         "Total_cost: $%{customdata[0]:,.2f} - $%{customdata[1]:,.2f}<br>",
#'         "(Number of Rows): %{y:,.0f}<br>",
#'         "<extra></extra>"
#'       )
#'     )
#'   }
#' 
#'   # Layout the plot
#'   p <- p %>% plotly::layout(
#'     title = list(
#'       text = "Distribution of Total Medical Costs by Episode Type",
#'       font = list(size = 16)
#'     ),
#'     xaxis = list(
#'       title = list(
#'         text = paste0("Total cost (", year_label, " USD)"),
#'         font = list(size = 14)
#'       ),
#'       range = zoom_range,
#'       tickprefix = "$",
#'       tickformat = ",d"
#'     ),
#'     yaxis = list(
#'       title = list(
#'         text = "Number of Rows",
#'         font = list(size = 14)
#'       )
#'     ),
#'     barmode = "overlay",
#'     bargap = 0.1,
#'     legend = list(
#'       orientation = "h",
#'       y = 1.1,
#'       x = 0.5,
#'       xanchor = "center"
#'     ),
#'     hoverlabel = list(
#'       bgcolor = "white",
#'       bordercolor = "black",
#'       font = list(size = 12)
#'     )
#'   )
#' 
#'   return(p)
#' }
#' 
#' #' Calculate bin data for histograms to use in tooltips
#' #'
#' #' @param data Dataframe with Total_cost column
#' #' @param bins Number of bins
#' #' @return Dataframe with bin information
#' #' @noRd
#' calculate_bin_data <- function(data, bins) {
#'   # Get min and max for creating bins
#'   min_cost <- min(data$Total_cost, na.rm = TRUE)
#'   max_cost <- max(data$Total_cost, na.rm = TRUE)
#' 
#'   # Calculate bin width
#'   bin_width <- (max_cost - min_cost) / bins
#' 
#'   # Create bins
#'   bin_breaks <- seq(min_cost, max_cost, length.out = bins + 1)
#' 
#'   # Calculate bin data
#'   bin_data <- data %>%
#'     dplyr::mutate(
#'       bin = cut(Total_cost, breaks = bin_breaks, include.lowest = TRUE, labels = FALSE)
#'     ) %>%
#'     dplyr::group_by(episode_type, bin) %>%
#'     dplyr::summarise(
#'       count = dplyr::n(),
#'       lower = bin_breaks[bin],
#'       upper = bin_breaks[bin + 1],
#'       .groups = "drop"
#'     ) %>%
#'     dplyr::arrange(episode_type, bin)
#' 
#'   return(bin_data)
#' }
#' 
#' #' Calculate histogram data manually for custom tooltips
#' #'
#' #' @param x Numeric vector of values
#' #' @param bins Number of bins
#' #' @return List containing histogram data
#' #' @noRd
#' calculate_histogram_data <- function(x, bins) {
#'   # Calculate histogram using R's hist function
#'   h <- hist(x, breaks = bins, plot = FALSE)
#' 
#'   # Prepare the data for plotly
#'   result <- list(
#'     x = h$mids,                 # Midpoints of each bin
#'     y = h$counts,               # Count in each bin
#'     width = diff(h$breaks),     # Width of each bin
#'     customdata = lapply(seq_along(h$breaks[-length(h$breaks)]), function(i) {
#'       c(h$breaks[i], h$breaks[i+1])  # Lower and upper bounds of each bin
#'     })
#'   )
#' 
#'   return(result)
#' }
#' 
#' #' Save interactive plot to HTML file
#' #'
#' #' @param plot plotly object to save
#' #' @param plot_name Base name for the plot file
#' #' @param output_dir Directory to save the plot in
#' #' @noRd
#' save_interactive_plot <- function(plot, plot_name, output_dir) {
#'   # Create full file path
#'   file_path <- file.path(output_dir, paste0(plot_name, ".html"))
#'   logger::log_info("Saving interactive plot to {file_path}")
#' 
#'   # Save the plot
#'   tryCatch({
#'     htmlwidgets::saveWidget(
#'       widget = plot,
#'       file = file_path,
#'       selfcontained = TRUE,
#'       title = plot_name
#'     )
#'     logger::log_info("Interactive plot saved successfully")
#'   }, error = function(e) {
#'     logger::log_error("Error saving interactive plot: {e$message}")
#'     warning("Error saving interactive plot: ", e$message)
#'   })
#' }




#' Generate Descriptive Statistics Paragraph for SUI Treatment Study
#' 
#' @description
#' Creates a formatted paragraph describing patient characteristics and treatment 
#' distributions for a Stress Urinary Incontinence (SUI) treatment study cohort.
#'
#' @param cohort_path Path to the cohort descriptors dataset
#' @param linear_reg_path Path to the linear regression dataset 
#' @param verbose Logical indicating whether to print detailed logging messages
#'
#' @return A character string containing the formatted paragraph
#'
#' @importFrom dplyr summarise dplyr::mutate n count filter group_by ungroup select
#' @importFrom tidyr replace_na
#' @importFrom stats median quantile chisq.test wilcox.test kruskal.test
#' @importFrom assertthat assert_that
#' @importFrom logger log_info log_debug
#'
#' @examples
#' \dontrun{
#' # Basic usage with default verbosity
#' paragraph <- generate_sui_paragraph(
#'   cohort_path = "path/to/_Cohort_descriptors.csv",
#'   linear_reg_path = "path/to/Linear regression.rds",
#'   verbose = FALSE
#' )
#' 
#' # With verbose logging enabled and CSV files
#' paragraph <- generate_sui_paragraph(
#'   cohort_path = "path/to/_Cohort_descriptors.csv",
#'   linear_reg_path = "path/to/Linear regression.csv",
#'   verbose = TRUE
#' )
#' }
#'
#' @export
generate_sui_paragraph <- function(cohort_path, linear_reg_path, verbose = FALSE) {
  if (verbose) logger::log_threshold(logger::DEBUG) else logger::log_threshold(logger::INFO)
  
  logger::log_info("Starting paragraph generation process")
  
  assertthat::assert_that(is.character(cohort_path))
  assertthat::assert_that(is.character(linear_reg_path))
  assertthat::assert_that(is.logical(verbose))
  
  cohort_data <- load_and_prepare_cohort(cohort_path)
  treatment_data <- load_dataset(linear_reg_path)
  
  treatment_stats <- calculate_treatment_stats(treatment_data)
  demographic_stats <- calculate_demographic_stats(cohort_data)
  distribution_stats <- calculate_distribution_stats(cohort_data)
  elixhauser_stats <- calculate_elixhauser_stats(cohort_data)
  
  main_paragraph <- construct_paragraph(
    treatment_stats,
    demographic_stats,
    distribution_stats,
    elixhauser_stats
  )
  
  logger::log_info("Paragraph generation completed successfully")
  return(main_paragraph)
}

#' @noRd
load_dataset <- function(file_path) {
  logger::log_debug(sprintf("Loading dataset from: %s", file_path))
  
  file_ext <- tools::file_ext(file_path)
  
  data <- switch(file_ext,
                 "rds" = readRDS(file_path),
                 "csv" = readr::read_csv(file_path, show_col_types = FALSE),
                 stop("Unsupported file format. Use .rds or .csv")
  )
  
  assertthat::assert_that(is.data.frame(data))
  logger::log_debug(sprintf("Loaded dataset with %d rows and %d columns", 
                            nrow(data), ncol(data)))
  return(data)
}

#' @noRd
format_pvalue <- function(p_value) {
  if (is.na(p_value)) return("NA")
  if (p_value < 0.01) return("P<0.01")
  return(sprintf("P=%.2f", p_value))
}

#' @noRd
load_and_prepare_cohort <- function(file_path) {
  logger::log_debug(sprintf("Loading and preparing cohort data from: %s", file_path))
  
  cohort_data <- load_dataset(file_path)
  
  # Define Elixhauser weights
  elix_weights <- c(
    elix_chf = 7, elix_valve = 4, elix_pulmcirc = 3, elix_perivasc = 2,
    elix_htn = -1, elix_para = 6, elix_neuro = 5, elix_chrnlung = 3,
    elix_diabet = 0, elix_ckd = 5, elix_hypothy = 0, elix_liver = 4,
    elix_lymph = 7, elix_mets = 12, elix_tumor = 4, elix_arth = 0,
    elix_coag = 8, elix_obese = -3, elix_wghtloss = 6, elix_lytes = 5,
    elix_bldloss = 3, elix_anemdef = 3, elix_alcohol = 0, elix_drug = 0,
    elix_psych = 0, elix_depress = -3
  )
  
  # Calculate weighted and unweighted scores
  elix_cols <- names(cohort_data)[grepl("^elix_", names(cohort_data))]
  
  cohort_data <- cohort_data %>%
    dplyr::mutate(
      elix_score_unweighted = rowSums(dplyr::select(., all_of(elix_cols)), na.rm = TRUE),
      elix_score_weighted = rowSums(dplyr::select(., all_of(names(elix_weights))) * elix_weights, 
                                    na.rm = TRUE)
    )
  
  return(cohort_data)
}

#' @noRd
calculate_treatment_stats <- function(treatment_data) {
  logger::log_debug("Calculating treatment statistics")
  
  total_patients <- nrow(treatment_data)
  
  treatment_counts <- treatment_data %>%
    dplyr::count(StressUrinaryIncontinenceTreatment) %>%
    dplyr::mutate(percentage = (n / total_patients) * 100) %>%
    dplyr::arrange(desc(percentage))
  
  logger::log_debug("Treatment statistics calculated successfully")
  return(treatment_counts)
}

#' @noRd
calculate_demographic_stats <- function(cohort_data) {
  logger::log_debug("Calculating demographic statistics")
  
  age_stats <- cohort_data %>%
    dplyr::summarise(
      median_age = stats::median(as.numeric(age), na.rm = TRUE),
      q25 = stats::quantile(as.numeric(age), 0.25, na.rm = TRUE),
      q75 = stats::quantile(as.numeric(age), 0.75, na.rm = TRUE)
    )
  
  return(list(age = age_stats))
}

#' @noRd
calculate_distribution_stats <- function(cohort_data) {
  logger::log_debug("Calculating distribution statistics")
  
  age_chi2 <- stats::chisq.test(
    table(cohort_data$age, cohort_data$sui_treatment)
  )
  
  logger::log_debug("Distribution statistics calculated successfully")
  return(list(age_p = age_chi2$p.value))
}

#' @noRd
calculate_elixhauser_stats <- function(cohort_data) {
  logger::log_debug("Calculating Elixhauser scores statistics")
  
  group_stats <- cohort_data %>%
    dplyr::group_by(sui_treatment) %>%
    dplyr::summarise(
      weighted_median = stats::median(elix_score_weighted, na.rm = TRUE),
      weighted_q1 = stats::quantile(elix_score_weighted, 0.25, na.rm = TRUE),
      weighted_q3 = stats::quantile(elix_score_weighted, 0.75, na.rm = TRUE),
      unweighted_median = stats::median(elix_score_unweighted, na.rm = TRUE),
      unweighted_q1 = stats::quantile(elix_score_unweighted, 0.25, na.rm = TRUE),
      unweighted_q3 = stats::quantile(elix_score_unweighted, 0.75, na.rm = TRUE),
      .groups = "drop"
    ) %>%
    dplyr::arrange(desc(weighted_median))
  
  weighted_test <- stats::kruskal.test(elix_score_weighted ~ sui_treatment, 
                                       data = cohort_data)
  unweighted_test <- stats::kruskal.test(elix_score_unweighted ~ sui_treatment, 
                                         data = cohort_data)
  
  # Pairwise comparisons
  weighted_pairwise <- pairwise.wilcox.test(
    cohort_data$elix_score_weighted,
    cohort_data$sui_treatment,
    p.adjust.method = "bonferroni"
  )
  
  unweighted_pairwise <- pairwise.wilcox.test(
    cohort_data$elix_score_unweighted,
    cohort_data$sui_treatment,
    p.adjust.method = "bonferroni"
  )
  
  return(list(
    group_stats = group_stats,
    weighted_p = weighted_test$p.value,
    unweighted_p = unweighted_test$p.value,
    weighted_pairwise = weighted_pairwise$p.value,
    unweighted_pairwise = unweighted_pairwise$p.value
  ))
}

#' @noRd
construct_paragraph <- function(treatment_stats, demographic_stats, 
                                distribution_stats, elixhauser_stats) {
  logger::log_debug("Constructing final paragraph")
  
  # Format Elixhauser descriptions
  elix_descriptions <- with(elixhauser_stats$group_stats, sprintf(
    paste(
      "Patients receiving %s had a median weighted Elixhauser score of %.1f",
      "[IQR: %.1f-%.1f] and unweighted score of %.1f [IQR: %.1f-%.1f]"
    ),
    sui_treatment,
    weighted_median, weighted_q1, weighted_q3,
    unweighted_median, unweighted_q1, unweighted_q3
  ))
  
  paragraph <- sprintf(
    paste(
      "The analyzed cohort included %s female Medicare beneficiaries diagnosed",
      "with urinary incontinence between January 1, 2008, and June 30, 2016.",
      "Vaginal pessary was the most frequently used treatment for SUI,",
      "accounting for %.2f%% (n=%s) of cases, followed by PFMT at %.2f%% (n=%s),",
      "and midurethral or pubovaginal sling surgery at %.2f%% (n=%s).",
      "The median age of participants was %d years (IQR: %d-%d).",
      "Age distribution differed significantly across the treatment groups (%s).",
      "%s. %s. %s.",
      "Elixhauser scores differed significantly between treatment groups",
      "(weighted: %s; unweighted: %s)."
    ),
    format(sum(treatment_stats$n), big.mark = ","),
    treatment_stats$percentage[1], format(treatment_stats$n[1], big.mark = ","),
    treatment_stats$percentage[2], format(treatment_stats$n[2], big.mark = ","),
    treatment_stats$percentage[3], format(treatment_stats$n[3], big.mark = ","),
    demographic_stats$age$median_age,
    demographic_stats$age$q25,
    demographic_stats$age$q75,
    sprintf("χ2, %s", format_pvalue(distribution_stats$age_p)),
    elix_descriptions[1], elix_descriptions[2], elix_descriptions[3],
    format_pvalue(elixhauser_stats$weighted_p),
    format_pvalue(elixhauser_stats$unweighted_p)
  )
  
  logger::log_debug("Paragraph construction completed")
  return(paragraph)
}





#' Generate a formatted table of van Walraven weights for Elixhauser comorbidities
#' 
#' This function creates a properly formatted table of Elixhauser comorbidity
#' weights and exports it to a Word document or LaTeX format.
#' 
#' @param output_path Character string with the path where the document 
#'   should be saved.
#' @param format Character string specifying output format: "docx" (default) or "latex"
#' @param verbose Logical indicating whether to print progress messages (default: FALSE)
#' @return Character string with the path to the created document
#' @importFrom dplyr %>% mutate arrange group_by
#' @importFrom flextable flextable theme_booktabs fontsize autofit set_caption
#'   footnote add_header_row merge_h_range bold color width align body_add_flextable
#' @importFrom officer read_docx body_end_section_landscape
#'   as_paragraph fp_border
#' @examples
#' # Generate table and save to temporary file
#' temp_file <- file.path(tempdir(), "Elixhauser_Weights.docx")
#' generate_elixhauser_weights_table(temp_file)
#' 
#' # Generate table with logging enabled
#' temp_file2 <- file.path(tempdir(), "Elixhauser_Weights_verbose.docx")
#' generate_elixhauser_weights_table(temp_file2, verbose = TRUE)
#' 
#' # Generate table and organize by body system
#' temp_file3 <- file.path(tempdir(), "Elixhauser_Weights_grouped.docx")
#' generate_elixhauser_weights_table(
#'   output_path = temp_file3,
#'   verbose = TRUE
#' )
#'
#' # Generate LaTeX version of the table
#' latex_file <- file.path(tempdir(), "Elixhauser_Weights.tex")
#' generate_elixhauser_weights_table(
#'   output_path = latex_file,
#'   format = "latex", 
#'   verbose = TRUE
#' )
generate_elixhauser_weights_table <- function(output_path, format = "docx", verbose = FALSE) {
  # Set up logging if verbose is TRUE
  if (verbose) {
    if (!requireNamespace("logger", quietly = TRUE)) {
      install.packages("logger")
    }
    logger::log_formatter(logger::formatter_paste)
    logger::log_threshold(logger::INFO)
    logger::log_info("Starting to generate Elixhauser weights table")
  }
  
  # Define body systems for grouping
  body_systems <- c(
    "Cardiovascular" = "Cardiovascular System",
    "Neurological" = "Neurological System",
    "Pulmonary" = "Respiratory System",
    "Endocrine" = "Endocrine System",
    "Renal" = "Renal System",
    "GI" = "Gastrointestinal System",
    "Oncology" = "Oncology",
    "Other" = "Other Conditions"
  )
  
  # Map variables to their proper values and system categories
  comorbidity_map <- data.frame(
    Elixhauser_Variable = c(
      "elix_chf", "elix_arrhy", "elix_valve", "elix_pulmcirc", "elix_perivasc", 
      "elix_htn", "elix_para", "elix_neuro", "elix_chrnlung", "elix_diabet", 
      "elix_diabetc", "elix_hypothy", "elix_renlfail", "elix_liver", "elix_ulcer", 
      "elix_lymph", "elix_mets", "elix_tumor", "elix_arth", "elix_coag", 
      "elix_obese", "elix_wghtloss", "elix_lytes", "elix_bldloss", "elix_anemdef", 
      "elix_alcohol", "elix_drug", "elix_psych", "elix_depress"
    ),
    Comorbidity_Name = c(
      "Congestive Heart Failure", "Cardiac Arrhythmias", "Valvular Disease", 
      "Pulmonary Circulation Disorders", "Peripheral Vascular Disease", 
      "Hypertension", "Paralysis", "Other Neurological Disorders", 
      "Chronic Pulmonary Disease", "Diabetes w/o Complications", 
      "Diabetes w/ Complications", "Hypothyroidism", "Renal Failure", 
      "Liver Disease", "Peptic Ulcer Disease", "Lymphoma", "Metastatic Cancer", 
      "Solid Tumor w/o Metastasis", "Rheumatoid Arthritis", "Coagulopathy", 
      "Obesity", "Weight Loss", "Fluid & Electrolyte Disorders", 
      "Blood Loss Anemia", "Deficiency Anemia", "Alcohol Abuse", 
      "Drug Abuse", "Psychoses", "Depression"
    ),
    Weight = c(
      7, 5, 4, 3, 2, -1, 6, 5, 3, 0, 2, 0, 5, 4, 0, 7, 12, 4, 0, 8, -3, 
      6, 5, 3, 3, 0, 0, 0, -3
    ),
    System_Category = c(
      "Cardiovascular", "Cardiovascular", "Cardiovascular", "Cardiovascular", 
      "Cardiovascular", "Cardiovascular", "Neurological", "Neurological", 
      "Pulmonary", "Endocrine", "Endocrine", "Endocrine", "Renal", "GI", 
      "GI", "Oncology", "Oncology", "Oncology", "Other", "Other", "Other", 
      "Other", "Other", "Other", "Other", "Other", "Other", "Other", "Other"
    ),
    stringsAsFactors = FALSE
  )
  
  if (verbose) {
    logger::log_info("Defined comorbidity mapping with corrected variable names")
    logger::log_info(paste("Total comorbidities:", nrow(comorbidity_map)))
  }
  
  # Sort by system category and then by weight (descending)
  sorted_comorbidities <- comorbidity_map %>%
    dplyr::arrange(System_Category, desc(Weight))
  
  if (verbose) {
    logger::log_info("Sorted comorbidities by system and weight")
  }
  
  # Create flextable with improved formatting
  ft_weights <- flextable::flextable(sorted_comorbidities) %>%
    flextable::theme_booktabs() %>%
    flextable::fontsize(size = 11, part = "all") %>%
    flextable::add_header_row(
      values = c("", "", "", ""),
      colwidths = c(1, 1, 1, 1)
    ) %>%
    flextable::set_header_labels(
      Elixhauser_Variable = "Variable",
      Comorbidity_Name = "Comorbidity",
      Weight = "Weight",
      System_Category = "Body System"
    ) %>%
    flextable::bold(part = "header") %>%
    flextable::align(align = "center", j = 3, part = "all") %>%
    flextable::align(align = "left", j = c(1, 2, 4), part = "all") %>%
    flextable::width(j = 1, width = 1.5) %>%
    flextable::width(j = 2, width = 2.5) %>%
    flextable::width(j = 3, width = 0.8) %>%
    flextable::width(j = 4, width = 1.8) %>%
    flextable::bg(
      bg = "#f2f2f2", 
      i = ~ System_Category %in% c("Cardiovascular", "Oncology", "Renal"), 
      part = "body"
    ) %>%
    flextable::border(
      border = officer::fp_border(color = "gray"),
      part = "all"
    ) %>%
    flextable::set_caption(
      caption = "Table: van Walraven Weights for Elixhauser Comorbidities (2009)"
    ) %>%
    flextable::footnote(
      i = 1, j = 1:4,
      value = flextable::as_paragraph(
        "van Walraven C, Austin PC, Jennings A, Quan H, Forster AJ. A",
        " modification of the Elixhauser comorbidity measures into a point",
        " system for hospital death using administrative data.",
        " Med Care. 2009;47(6):626-633. doi:10.1097/MLR.0b013e31819432e5"
      ),
      ref_symbols = "*",
      part = "header"
    ) %>%
    flextable::footnote(
      i = nrow(sorted_comorbidities), j = 1:4,
      value = flextable::as_paragraph(
        "Note: To calculate the Elixhauser comorbidity score,",
        " sum the weights of all present comorbidities. Higher scores",
        " indicate increased risk of mortality and hospital readmission."
      ),
      ref_symbols = "†",
      part = "body"
    )
  
  if (verbose) {
    logger::log_info("Created enhanced flextable with improved formatting")
  }
  
  # Create Word document or LaTeX output
  if (tolower(format) == "docx") {
    if (verbose) {
      logger::log_info("Creating Word document output")
    }
    
    doc <- officer::read_docx() %>%
      flextable::body_add_flextable(value = ft_weights) %>%
      officer::body_end_section_landscape()
    
    # Save the Word document
    print(doc, target = output_path)
  } else if (tolower(format) == "latex") {
    if (verbose) {
      logger::log_info("Creating LaTeX output")
    }
    
    # Check if we have the necessary packages for LaTeX output
    if (!requireNamespace("flextable", quietly = TRUE) || 
        !requireNamespace("ftExtra", quietly = TRUE)) {
      install.packages(c("flextable", "ftExtra"))
    }
    
    # Create LaTeX output using flextable and ftExtra
    latex_content <- flextable::save_as_latex(
      x = ft_weights,
      path = output_path,
      caption = "van Walraven Weights for Elixhauser Comorbidities (2009)",
      label = "tab:elixhauser_weights"
    )
    
    # Add LaTeX preamble with package requirements to the file
    if (file.exists(output_path)) {
      latex_text <- readLines(output_path)
      
      # Add necessary LaTeX packages at beginning of file
      latex_preamble <- c(
        "% LaTeX preamble - include these in your document",
        "% \\usepackage{booktabs}",
        "% \\usepackage{colortbl}",
        "% \\usepackage{xcolor}",
        "% \\usepackage{caption}",
        "% The table is created using the following code:",
        ""
      )
      
      # Write the combined content back to the file
      writeLines(c(latex_preamble, latex_text), output_path)
    }
  } else {
    stop("Invalid format specified. Use 'docx' or 'latex'.")
  }
  
  if (verbose) {
    logger::log_info(paste("Successfully saved table to:", output_path))
  }
  
  # Return file path
  return(output_path)
}







# Create different sentence structures for the Cochran-Armitage Test results programmatically

# Function to generate varied sentence structures based on results
generate_ca_sentences <- function(results) {
  sentences <- c()
  
  for (comp_name in names(results)) {
    result <- results[[comp_name]]
    
    # Randomly select a sentence structure
    sentence_structure <- sample(1:4, 1)
    
    sentence <- switch(sentence_structure,
                       sprintf(
                         "%s showed a %s trend from %.0f%% in 2008 to %.0f%% in 2016 (Cochran-Armitage Test, %s).",
                         comp_name, result$trend, result$start, result$end, result$p_value
                       ),
                       sprintf(
                         "Between 2008 and 2016, %s usage %s, changing from %.0f%% to %.0f%% (Cochran-Armitage Test, %s).",
                         comp_name, result$trend, result$start, result$end, result$p_value
                       ),
                       sprintf(
                         "A %s trend in %s was observed, rising from %.0f%% in 2008 to %.0f%% in 2016 (Cochran-Armitage Test, %s).",
                         result$trend, comp_name, result$start, result$end, result$p_value
                       ),
                       sprintf(
                         "From 2008 to 2016, %s %s from %.0f%% to %.0f%% (Cochran-Armitage Test, %s).",
                         comp_name, result$trend, result$start, result$end, result$p_value
                       )
    )
    
    sentences <- c(sentences, sentence)
  }
  
  return(sentences)
}

# Example results from Cochran-Armitage Tests
# HARDCODED
example_results <- list(
  "PFMT" = list(trend = "increased", start = 27, end = 31, p_value = "p<0.01"),
  "Sling" = list(trend = "declined", start = 30, end = 10, p_value = "p<0.01"),
  "Pessary" = list(trend = "increased", start = 42, end = 48, p_value = "p=0.02")
)






#' Create Cochran-Armitage Trend Visualization
#' 
#' @description
#' Creates and saves mosaic plots visualizing treatment proportions over time 
#' using Cochran-Armitage trend visualization approach. Generates separate plots 
#' for Pessary, PT, and Sling treatments.
#' 
#' @param summary_data A tibble containing yearly treatment summaries with columns:
#'        Year, Pessary, Non-pessary, PT, Non-PT, Sling, Non-sling
#' @param output_dir Character string specifying directory for saving plots.
#'        Default: "results"
#' @param width Numeric value for plot width in inches. Default: 12
#' @param height Numeric value for plot height in inches. Default: 18
#' @param verbose Logical indicating whether to print detailed logging. Default: FALSE
#' 
#' @return A list containing the three ggplot objects (invisible)
#' 
#' @importFrom tidyr pivot_longer
#' @importFrom dplyr filter
#' @importFrom ggplot2 ggplot aes geom_bar scale_y_continuous labs theme_minimal 
#'             theme element_text ggsave
#' @importFrom scales percent_format
#' @importFrom logger log_info log_debug
#' @importFrom assertthat assert_that
#' 
#' @examples
#' \dontrun{
#' # Basic usage with default parameters
#' yearly_trends <- create_cochran_plots(
#'   summary_data = treatment_summary,
#'   verbose = TRUE
#' )
#' 
#' # Specify custom output directory and dimensions
#' yearly_trends <- create_cochran_plots(
#'   summary_data = treatment_summary,
#'   output_dir = "figures/trends",
#'   width = 10,
#'   height = 15,
#'   verbose = TRUE
#' )
#' }
#' @export
create_cochran_plots <- function(summary_data, 
                                 output_dir = "results",
                                 width = 12,
                                 height = 18,
                                 verbose = FALSE) {
  
  # Validate inputs
  assertthat::assert_that(is.data.frame(summary_data))
  assertthat::assert_that(is.character(output_dir))
  assertthat::assert_that(is.numeric(width))
  assertthat::assert_that(is.numeric(height))
  assertthat::assert_that(is.logical(verbose))
  
  required_cols <- c("Year", "Pessary", "Non-pessary", "PT", "Non-PT", 
                     "Sling", "Non-sling")
  missing_cols <- required_cols[!required_cols %in% names(summary_data)]
  if(length(missing_cols) > 0) {
    stop(sprintf("Missing required columns: %s", 
                 paste(missing_cols, collapse = ", ")))
  }
  
  if(verbose) {
    logger::log_info("Starting Cochran-Armitage plot creation")
    logger::log_debug("Converting data to long format")
  }
  
  # Create output directory if it doesn't exist
  if(!dir.exists(output_dir)) {
    if(verbose) logger::log_info(sprintf("Creating output directory: %s", output_dir))
    dir.create(output_dir, recursive = TRUE)
  }
  
  # Convert to long format
  data_long <- tidyr::pivot_longer(
    summary_data,
    cols = all_of(required_cols[-1]),
    names_to = "Treatment",
    values_to = "Count"
  )
  
  if(verbose) logger::log_info("Creating individual treatment plots")
  
  # Create plot for each treatment comparison
  treatment_pairs <- list(
    pessary = c("Pessary", "Non-pessary"),
    pt = c("PT", "Non-PT"),
    sling = c("Sling", "Non-sling")
  )
  
  plots <- list()
  
  for(treatment in names(treatment_pairs)) {
    if(verbose) {
      logger::log_debug(sprintf("Creating plot for %s comparison", treatment))
    }
    
    pair <- treatment_pairs[[treatment]]
    plot_title <- sprintf("%s vs. %s Treatment Proportions Over Years", 
                          stringr::str_to_title(pair[1]), pair[2])
    
    plot <- ggplot2::ggplot(
      data_long %>% 
        dplyr::filter(Treatment %in% pair),
      ggplot2::aes(x = factor(Year), 
                   fill = Treatment, 
                   weight = Count)
    ) +
      ggplot2::geom_bar(position = "fill") +
      ggplot2::scale_y_continuous(labels = scales::percent_format()) +
      ggplot2::labs(
        x = "Year",
        y = "Proportion",
        fill = "Treatment Type",
        title = plot_title
      ) +
      ggplot2::theme_minimal(base_size = 20) +
      ggplot2::theme(
        plot.title = ggplot2::element_text(size = 22, face = "bold"),
        axis.title = ggplot2::element_text(size = 20),
        axis.text.x = ggplot2::element_text(size = 18, angle = 45, hjust = 1),
        axis.text.y = ggplot2::element_text(size = 18),
        legend.title = ggplot2::element_text(size = 20),
        legend.text = ggplot2::element_text(size = 18)
      )
    
    # Save plot
    plot_filename <- file.path(output_dir, sprintf("%sCochran_plots.png", treatment))
    if(verbose) {
      logger::log_debug(sprintf("Saving plot to: %s", plot_filename))
    }
    
    ggplot2::ggsave(
      filename = plot_filename,
      plot = plot,
      width = width,
      height = height
    )
    
    plots[[treatment]] <- plot
  }
  
  if(verbose) {
    logger::log_info("All Cochran-Armitage plots created successfully")
  }
  
  invisible(plots)
}




# Function to format numbers with thousand separators
format_number <- function(x) {
  format(round(x), big.mark = ",", scientific = FALSE)
}

# Function to dynamically generate sentences for AAPC results
generate_sentence <- function(treatment, result) {
  aapc <- result$aapc
  confint <- result$confint
  p_value <- result$p_value
  
  # Determine trend direction
  trend <- ifelse(aapc > 0, "increased", "decreased")
  
  # Format confidence intervals
  ci_lower <- format_number(confint[1])
  ci_upper <- format_number(confint[2])
  
  # Format p-value
  p_text <- ifelse(p_value < 0.01, "p<0.01", paste0("p=", round(p_value, 2)))
  
  # Construct sentence
  sentence <- paste0(
    "The annual percentage change (AAPC) for ", treatment, " ", trend, 
    " at an average rate of ", format_number(round(aapc, 2)), "% per year (95% CI: ", 
    ci_lower, "% to ", ci_upper, "%, ", p_text, ")."
  )
  
  return(sentence)
}




# Function to format numbers with thousand separators and rounding
format_number <- function(x) {
  format(round(x, 2), big.mark = ",", scientific = FALSE)
}

# Function to dynamically generate AAPC result sentences
generate_aapc_sentence <- function(treatment, aapc, confint, p_value) {
  # Determine trend direction
  trend <- ifelse(aapc > 0, "increased to", "declined to")
  
  # Format confidence intervals
  ci_lower <- format_number(confint[1])
  ci_upper <- format_number(confint[2])
  
  # Format p-value
  p_text <- ifelse(p_value < 0.01, "p<0.01", paste0("p=", round(p_value, 2)))
  
  # Construct sentence
  sentence <- paste0(
    "After adjusting for age, race, and comorbidities, the AAPC for ", treatment, 
    " ", trend, " ", format_number(abs(aapc)), "% per year (95% CI: ", 
    ci_lower, "% to ", ci_upper, "%, ", p_text, ")."
  )
  
  return(sentence)
}



# Function to format p-values dynamically
format_p_value <- function(p) {
  ifelse(p < 0.01, "p<0.01", paste0("p=", round(p, 2)))
}




#' @noRd
calculate_treatment_iqr <- function(treatment_costs) {
  stats::quantile(treatment_costs, probs = c(0.25, 0.75), na.rm = TRUE)
}

#' @noRd
format_cost_with_iqr <- function(median_cost, q1, q3) {
  sprintf("$%s [IQR: $%s-$%s]", 
          format(round(median_cost), big.mark = ","),
          format(round(q1), big.mark = ","),
          format(round(q3), big.mark = ","))
}

#' Compare Outpatient Facility Costs Across Treatments with IQR
#' 
#' @description
#' Analyzes and compares outpatient facility costs across different treatments,
#' calculating medians with interquartile ranges (IQR) and performing statistical
#' comparisons. Returns both numeric results and formatted text descriptions.
#'
#' @param treatment_cost_data data.frame containing treatment and cost data
#' @param treatment_groups character vector of treatment names to compare
#' @param verbose logical; if TRUE, prints detailed logging messages (default: TRUE)
#'
#' @return list containing:
#'   \item{treatment_medians}{data.frame of median costs and IQRs by treatment}
#'   \item{test_statistics}{list of statistical test results}
#'   \item{description}{character string with formatted comparison text}
#'
#' @importFrom dplyr filter select group_by summarize n mutate across everything 
#'             matches all_of arrange desc
#' @importFrom tidyr pivot_longer starts_with
#' @importFrom stringr str_detect str_to_lower
#' @importFrom stats median wilcox.test p.adjust quantile IQR
#' @importFrom assertthat assert_that
#' @importFrom logger log_info log_debug
#'
#' @examples
#' \dontrun{
#' # Example 1: Compare three treatments with verbose logging
#' comparison_results <- analyze_treatment_costs(
#'   treatment_cost_data = outpatient_costs,
#'   treatment_groups = c("Pessary", "Sling", "PT"),
#'   verbose = TRUE
#' )
#' 
#' # View median costs with IQR
#' print(comparison_results$treatment_medians)
#' #> # A tibble: 3 × 5
#' #>   sui_treatment median_cost q1_cost q3_cost     n
#' #>   <chr>              <dbl>   <dbl>   <dbl> <int>
#' #> 1 sling                122     100     150    20
#' #> 2 pessary              100      75     125    61
#' #> 3 pt                   100      50     150    77
#' 
#' # Example 2: Compare different set of treatments without logging
#' botox_comparison <- analyze_treatment_costs(
#'   treatment_cost_data = outpatient_costs,
#'   treatment_groups = c("Bulking", "Botox", "Sling"),
#'   verbose = FALSE
#' )
#' 
#' # Example 3: Print formatted comparison with IQR
#' surgery_comparison <- analyze_treatment_costs(
#'   treatment_cost_data = surgical_costs,
#'   treatment_groups = c("Sling", "Burch", "TVT"),
#'   verbose = TRUE
#' )
#' cat(surgery_comparison$description)
#' #> "The median outpatient facility cost for sling treatment was $122 
#' #> (IQR: $100-$150), which was higher than Burch ($100, IQR: $75-$125) 
#' #> and higher than TVT ($100, IQR: $50-$150) (p<0.01)"
#' }
analyze_treatment_costs <- function(treatment_cost_data, 
                                    treatment_groups = c("Pessary", "Sling", "PT"),
                                    verbose = TRUE) {
  
  # Input validation
  assertthat::assert_that(is.data.frame(treatment_cost_data), 
                          msg = "treatment_cost_data must be a data frame")
  assertthat::assert_that(is.character(treatment_groups), 
                          msg = "treatment_groups must be a character vector")
  assertthat::assert_that(is.logical(verbose), 
                          msg = "verbose must be logical (TRUE/FALSE)")
  assertthat::assert_that("sui_treatment" %in% names(treatment_cost_data),
                          msg = "treatment_cost_data must contain 'sui_treatment' column")
  
  if (verbose) {
    logger::log_info("Starting treatment cost comparison analysis")
    logger::log_debug(paste("Input dimensions:", 
                            nrow(treatment_cost_data), "x", 
                            ncol(treatment_cost_data)))
    logger::log_debug(paste("Analyzing treatments:", 
                            paste(treatment_groups, collapse=", ")))
  }
  
  # Create standardized treatment mapping
  treatment_mapping <- list(
    "pt" = c("pt", "pfmt", "physical therapy"),
    "sling" = c("sling", "mus", "midurethral sling"),
    "pessary" = c("pessary")
  )
  
  # Standardize treatment names in data
  standardized_treatments <- sapply(
    treatment_cost_data$sui_treatment,
    function(x) {
      x_lower <- stringr::str_to_lower(x)
      matched <- sapply(names(treatment_mapping), function(std_name) {
        x_lower %in% treatment_mapping[[std_name]]
      })
      ifelse(any(matched), names(treatment_mapping)[matched][1], x_lower)
    }
  )
  
  treatment_cost_data$sui_treatment <- standardized_treatments
  treatment_groups <- sapply(treatment_groups, stringr::str_to_lower)
  
  # Identify facility cost columns
  facility_cost_cols <- names(treatment_cost_data)[
    stringr::str_detect(names(treatment_cost_data), 
                        "^(OP_facility_return_|OP_facility_new_)")
  ]
  
  if (length(facility_cost_cols) == 0) {
    stop("No outpatient facility cost columns found in the data")
  }
  
  if (verbose) {
    logger::log_debug(paste("Processing facility cost columns:", 
                            paste(facility_cost_cols, collapse=", ")))
  }
  
  # Calculate patient-level costs
  patient_level_costs <- treatment_cost_data %>%
    dplyr::group_by(WU_ID, sui_treatment) %>%
    dplyr::summarize(
      total_facility_cost = sum(dplyr::across(dplyr::all_of(facility_cost_cols)), 
                                na.rm = TRUE),
      visit_count = n(),
      .groups = "drop"
    ) %>%
    dplyr::filter(total_facility_cost >= 1)
  
  # Calculate treatment summaries with IQR
  treatment_summaries <- patient_level_costs %>%
    dplyr::filter(sui_treatment %in% treatment_groups) %>%
    dplyr::group_by(sui_treatment) %>%
    dplyr::summarize(
      median_cost = stats::median(total_facility_cost, na.rm = TRUE),
      q1_cost = stats::quantile(total_facility_cost, 0.25, na.rm = TRUE),
      q3_cost = stats::quantile(total_facility_cost, 0.75, na.rm = TRUE),
      mean_cost = mean(total_facility_cost, na.rm = TRUE),
      n = n(),
      total_visits = sum(visit_count),
      .groups = "drop"
    )
  
  if (nrow(treatment_summaries) == 0) {
    stop(paste("No matching treatments found. Available treatments:", 
               paste(unique(patient_level_costs$sui_treatment), collapse=", ")))
  }
  
  # Statistical comparisons
  treatment_pairs <- combn(treatment_groups, 2, simplify = FALSE)
  statistical_tests <- lapply(treatment_pairs, function(pair) {
    costs1 <- patient_level_costs$total_facility_cost[
      patient_level_costs$sui_treatment == pair[1]]
    costs2 <- patient_level_costs$total_facility_cost[
      patient_level_costs$sui_treatment == pair[2]]
    
    test_result <- stats::wilcox.test(costs1, costs2)
    
    if (verbose) {
      logger::log_debug(sprintf("Completed test: %s vs %s, p=%.3f", 
                                pair[1], pair[2], test_result$p.value))
    }
    
    list(
      comparison = paste(pair, collapse = " vs "),
      p_value = test_result$p.value
    )
  })
  
  # Generate descriptive text with IQR
  ordered_treatments <- treatment_summaries %>%
    dplyr::arrange(desc(median_cost))
  
  formatted_costs <- mapply(
    format_cost_with_iqr,
    ordered_treatments$median_cost,
    ordered_treatments$q1_cost,
    ordered_treatments$q3_cost
  )
  
  # Determine comparison words based on p-values
  get_comparison_word <- function(median1, median2, p_value) {
    if (p_value > 0.05) {
      return("similar to")
    } else if (median1 > median2) {
      return("higher than")
    } else {
      return("lower than")
    }
  }
  
  comparison1 <- get_comparison_word(
    ordered_treatments$median_cost[1],
    ordered_treatments$median_cost[2],
    statistical_tests[[1]]$p_value
  )
  
  comparison2 <- get_comparison_word(
    ordered_treatments$median_cost[1],
    ordered_treatments$median_cost[3],
    statistical_tests[[2]]$p_value
  )
  
  # Format p-values
  format_p_value <- function(p) {
    ifelse(p < 0.01, "p<0.01", sprintf("p=%.2f", p))
  }
  
  p_value1 <- format_p_value(statistical_tests[[1]]$p_value)
  p_value2 <- format_p_value(statistical_tests[[2]]$p_value)
  
  cost_description <- sprintf(
    paste("The median outpatient facility cost for %s treatment was %s,",
          "which was %s %s %s (%s) and %s %s %s (%s)"),
    ordered_treatments$sui_treatment[1],
    formatted_costs[1],
    comparison1,
    ordered_treatments$sui_treatment[2],
    formatted_costs[2],
    p_value1,
    comparison2,
    ordered_treatments$sui_treatment[3],
    formatted_costs[3],
    p_value2
  )
  
  if (verbose) {
    logger::log_debug("Text description generated with IQR")
    logger::log_info("Analysis completed successfully")
  }
  
  return(list(
    treatment_medians = treatment_summaries,
    test_statistics = statistical_tests,
    description = cost_description
  ))
}



#' Compare Provider Specialties for Physical Therapy Treatments
#'
#' @title analyze_pt_providers
#' @description Analyzes physical therapy treatment providers by specialty and generates a 
#'   comparison sentence between the top two prescribing specialties.
#'
#' @param data_path Character string specifying path to CSV file.
#' @param threshold Numeric value for considering percentages "equal" (default: 5).
#' @param verbose Logical indicating whether to print detailed logs (default: FALSE).
#'
#' @return Character string containing the comparison sentence.
#'
#' @importFrom dplyr filter count mutate arrange desc
#' @importFrom tidyr replace_na
#' @importFrom readr read_csv
#' @importFrom scales comma
#' @importFrom assertthat assert_that
#' @importFrom logger log_info log_error
#'
#' @examples
#' \dontrun{
#' # Basic usage with default settings
#' specialty_comparison <- analyze_pt_providers(
#'   data_path = "Provider_specialty.csv",
#'   threshold = 5,
#'   verbose = FALSE
#' )
#' 
#' # With verbose logging
#' specialty_comparison <- analyze_pt_providers(
#'   data_path = "Provider_specialty.csv",
#'   threshold = 3,
#'   verbose = TRUE
#' )
#' 
#' # Custom equality threshold
#' specialty_comparison <- analyze_pt_providers(
#'   data_path = "Provider_specialty.csv",
#'   threshold = 2.5,
#'   verbose = TRUE
#' )
#' }
analyze_pt_providers <- function(data_path, threshold = 5, verbose = FALSE) {
  # Initialize logging
  logger::log_threshold(if(verbose) logger::DEBUG else logger::INFO)
  
  # Validate inputs
  assertthat::assert_that(
    is.character(data_path),
    file.exists(data_path),
    is.numeric(threshold),
    threshold > 0,
    is.logical(verbose)
  )
  
  logger::log_info("Starting provider specialty analysis")
  logger::log_info(paste("Input file:", data_path))
  logger::log_info(paste("Threshold:", threshold))
  
  # Read data
  provider_data <- readr::read_csv(data_path, show_col_types = FALSE)
  logger::log_info(paste("Loaded", nrow(provider_data), "rows of data"))
  
  # Validate required columns
  required_cols <- c("sui_treatment", "specialty_pre6m_specialty_car_and_fac")
  assertthat::assert_that(
    all(required_cols %in% colnames(provider_data)),
    msg = paste("Missing required columns:", 
                paste(setdiff(required_cols, colnames(provider_data)), collapse = ", "))
  )
  
  # Filter PT treatments
  pt_specialty_data <- dplyr::filter(provider_data, sui_treatment == "PT")
  logger::log_info(paste("Found", nrow(pt_specialty_data), "PT treatments"))
  
  assertthat::assert_that(
    nrow(pt_specialty_data) >= 2,
    msg = "Insufficient PT provider data for comparison"
  )
  
  # Summarize provider specialties
  specialty_summary <- pt_specialty_data %>%
    dplyr::count(specialty_pre6m_specialty_car_and_fac, name = "count") %>%
    dplyr::mutate(
      percentage = round((count / sum(count)) * 100, 1),
      specialty_pre6m_specialty_car_and_fac = tidyr::replace_na(
        specialty_pre6m_specialty_car_and_fac, "Unknown"
      )
    ) %>%
    dplyr::arrange(dplyr::desc(count))
  
  logger::log_info("Computed specialty distribution")
  
  # Extract top providers
  top_provider <- specialty_summary[1, ]
  second_provider <- specialty_summary[2, ]
  
  # Format numbers
  top_count_fmt <- scales::comma(top_provider$count, accuracy = 1)
  second_count_fmt <- scales::comma(second_provider$count, accuracy = 1)
  
  # Determine comparison type
  is_equal <- abs(top_provider$percentage - second_provider$percentage) <= threshold
  comparison_phrase <- if(is_equal) "equally prescribed" else "prescribed differently"
  
  # Construct result
  result_sentence <- sprintf(
    "PT was %s by %s (%.1f%%, n=%s) and %s (%.1f%%, n=%s).",
    comparison_phrase,
    top_provider$specialty_pre6m_specialty_car_and_fac, 
    top_provider$percentage,
    top_count_fmt,
    second_provider$specialty_pre6m_specialty_car_and_fac,
    second_provider$percentage,
    second_count_fmt
  )
  
  logger::log_info("Analysis complete")
  logger::log_info(paste("Result:", result_sentence))
  
  return(result_sentence)
}




#' Compare Professional Fees Across SUI Treatments
#'
#' @title analyze_professional_fees
#' @description Analyzes and compares professional fees across different SUI 
#'   treatments and generates a detailed comparison sentence with statistical testing.
#'
#' @param data_path Character string specifying path to costs data RDS file.
#' @param verbose Logical indicating whether to print detailed logs (default: FALSE).
#' @param p_threshold Numeric value for significance testing (default: 0.05).
#'
#' @return Character string containing the fee comparison sentence.
#'
#' @importFrom dplyr filter mutate group_by summarise across all_of ungroup rowwise
#' @importFrom tidyr replace_na
#' @importFrom scales comma
#' @importFrom stats wilcox.test median quantile
#' @importFrom assertthat assert_that
#' @importFrom logger log_info log_error
#'
#' @examples
#' \dontrun{
#' # Basic usage with default settings
#' fee_comparison <- analyze_professional_fees(
#'   data_path = "costs_data.rds",
#'   verbose = FALSE,
#'   p_threshold = 0.05
#' )
#' }
analyze_professional_fees <- function(data_path, verbose = FALSE, p_threshold = 0.05) {
  # Initialize logging
  logger::log_threshold(if(verbose) logger::DEBUG else logger::INFO)
  
  # Validate inputs
  assertthat::assert_that(
    is.character(data_path),
    file.exists(data_path),
    is.numeric(p_threshold),
    p_threshold > 0 && p_threshold < 1,
    is.logical(verbose)
  )
  
  logger::log_info("Starting professional fee analysis")
  logger::log_info(paste("Input file:", data_path))
  
  # Read RDS data
  cost_data <- readRDS(data_path)
  logger::log_info(paste("Loaded", nrow(cost_data), "rows of data"))
  
  # Identify professional fee columns
  professional_columns <- grep("professional", colnames(cost_data), value = TRUE, ignore.case = TRUE)
  logger::log_info(paste("Found", length(professional_columns), "professional fee columns"))
  
  # Define treatment mapping
  treatment_mapping <- c(
    "Pelvic floor physical therapy" = "PFMT",
    "Pessary" = "Pessary",
    "Urinary Incontinence Sling" = "Sling"
  )
  
  # Calculate total professional fees and summarize by treatment
  professional_summary <- cost_data %>%
    dplyr::filter(sui_treatment %in% names(treatment_mapping)) %>%
    dplyr::mutate(
      treatment = treatment_mapping[sui_treatment],
      total_professional_fee = rowSums(dplyr::across(dplyr::all_of(professional_columns)), 
                                       na.rm = TRUE)
    ) %>%
    dplyr::group_by(treatment) %>%
    dplyr::summarise(
      median_fee = round(median(total_professional_fee, na.rm = TRUE)),
      iqr_low = round(quantile(total_professional_fee, 0.25, na.rm = TRUE)),
      iqr_high = round(quantile(total_professional_fee, 0.75, na.rm = TRUE)),
      n = dplyr::n(),
      .groups = "drop"
    )
  
  # Extract treatment-specific summaries
  pfmt_stats <- professional_summary %>% dplyr::filter(treatment == "PFMT")
  pessary_stats <- professional_summary %>% dplyr::filter(treatment == "Pessary")
  sling_stats <- professional_summary %>% dplyr::filter(treatment == "Sling")
  
  # Calculate professional fees for statistical tests
  analysis_data <- cost_data %>%
    dplyr::filter(sui_treatment %in% names(treatment_mapping)) %>%
    dplyr::mutate(
      treatment = treatment_mapping[sui_treatment],
      total_professional_fee = rowSums(dplyr::across(dplyr::all_of(professional_columns)), 
                                       na.rm = TRUE)
    )
  
  # Perform statistical tests
  pfmt_pessary_test <- stats::wilcox.test(
    total_professional_fee ~ treatment,
    data = analysis_data %>% 
      dplyr::filter(treatment %in% c("PFMT", "Pessary"))
  )
  
  pfmt_sling_test <- stats::wilcox.test(
    total_professional_fee ~ treatment,
    data = analysis_data %>% 
      dplyr::filter(treatment %in% c("PFMT", "Sling"))
  )
  
  # Determine comparisons
  pfmt_pessary_comparison <- if(pfmt_stats$median_fee > pessary_stats$median_fee) 
    "higher" else "lower"
  pfmt_sling_comparison <- if(pfmt_stats$median_fee > sling_stats$median_fee) 
    "higher" else "lower"
  
  # Determine significance phrases
  pfmt_pessary_significance <- if(pfmt_pessary_test$p.value < p_threshold) 
    "significantly" else "not significantly"
  pfmt_sling_significance <- if(pfmt_sling_test$p.value < p_threshold) 
    "significantly" else "not significantly"
  
  # Format p-values
  format_p_value <- function(p) {
    if(p < 0.01) return("<0.01")
    return(sprintf("%.2f", p))
  }
  
  # Generate comparison sentence
  result_sentence <- sprintf(
    "The median professional fee for PFMT was $%s (IQR $%s–$%s), which was %s, although %s, than the median professional fee for pessary treatment ($%s, IQR $%s–$%s), and %s %s than for sling surgery ($%s, IQR $%s–$%s) (Mann-Whitney U test, p=%s).",
    scales::comma(pfmt_stats$median_fee),
    scales::comma(pfmt_stats$iqr_low),
    scales::comma(pfmt_stats$iqr_high),
    pfmt_pessary_comparison,
    pfmt_pessary_significance,
    scales::comma(pessary_stats$median_fee),
    scales::comma(pessary_stats$iqr_low),
    scales::comma(pessary_stats$iqr_high),
    pfmt_sling_significance,
    pfmt_sling_comparison,
    scales::comma(sling_stats$median_fee),
    scales::comma(sling_stats$iqr_low),
    scales::comma(sling_stats$iqr_high),
    format_p_value(pfmt_sling_test$p.value)
  )
  
  logger::log_info("Analysis complete")
  
  return(result_sentence)
}



#' Analyze Treatment Distribution by Provider Specialty
#'
#' @title analyze_treatment_specialty
#' @description Analyzes the distribution of a specific treatment across provider
#'   specialties and generates a comparison statement about the top prescribing specialty.
#'
#' @param data_path Character string specifying path to CSV file
#' @param treatment_type Character string specifying the treatment to analyze
#' @param episode_type Character string specifying the episode type (default: "Initial")
#' @param verbose Logical indicating whether to print detailed logs (default: FALSE)
#'
#' @return Character string containing the analysis statement
#'
#' @importFrom dplyr filter count mutate arrange desc
#' @importFrom tidyr replace_na
#' @importFrom readr read_csv
#' @importFrom scales comma
#' @importFrom assertthat assert_that
#' @importFrom logger log_info log_error
#'
#' @examples
#' \dontrun{
#' # Analyze distribution of PT treatments
#' treatment_analysis <- analyze_treatment_specialty(
#'   data_path = "Provider_specialty.csv",
#'   treatment_type = "PT",
#'   episode_type = "Initial",
#'   verbose = TRUE
#' )
#' 
#' # Analyze all episodes for a different treatment
#' treatment_analysis <- analyze_treatment_specialty(
#'   data_path = "Provider_specialty.csv",
#'   treatment_type = "Surgery",
#'   episode_type = NULL,
#'   verbose = TRUE
#' )
#' 
#' # Analyze with minimal logging
#' treatment_analysis <- analyze_treatment_specialty(
#'   data_path = "Provider_specialty.csv",
#'   treatment_type = "PT",
#'   episode_type = "Initial",
#'   verbose = FALSE
#' )
#' }
analyze_treatment_specialty <- function(data_path, 
                                        treatment_type,
                                        episode_type = "Initial",
                                        verbose = FALSE) {
  # Initialize logging
  logger::log_threshold(if(verbose) logger::DEBUG else logger::INFO)
  
  # Validate inputs
  assertthat::assert_that(
    is.character(data_path),
    file.exists(data_path),
    is.character(treatment_type),
    is.logical(verbose),
    is.null(episode_type) || is.character(episode_type)
  )
  
  logger::log_info("Starting treatment specialty analysis")
  logger::log_info(paste("Input file:", data_path))
  logger::log_info(paste("Treatment type:", treatment_type))
  logger::log_info(paste("Episode type:", episode_type %||% "All"))
  
  # Read data
  treatment_data <- readr::read_csv(data_path, show_col_types = FALSE)
  logger::log_info(paste("Loaded", nrow(treatment_data), "rows of data"))
  
  # Validate required columns
  required_cols <- c("sui_treatment", "specialty_pre6m_specialty_car_and_fac", 
                     "episode_type")
  assertthat::assert_that(
    all(required_cols %in% colnames(treatment_data)),
    msg = paste("Missing required columns:", 
                paste(setdiff(required_cols, colnames(treatment_data)), collapse = ", "))
  )
  
  # Filter for specific treatment
  filtered_data <- treatment_data %>%
    dplyr::filter(sui_treatment == treatment_type)
  
  # Apply episode type filter if specified
  if (!is.null(episode_type)) {
    filtered_data <- filtered_data %>%
      dplyr::filter(episode_type == !!episode_type)
  }
  
  logger::log_info(paste("Found", nrow(filtered_data), 
                         treatment_type, "treatments"))
  
  assertthat::assert_that(
    nrow(filtered_data) > 0,
    msg = paste("No", treatment_type, "treatments found")
  )
  
  # Summarize provider specialties
  specialty_summary <- filtered_data %>%
    dplyr::count(specialty_pre6m_specialty_car_and_fac, name = "count") %>%
    dplyr::mutate(
      percentage = round((count / sum(count)) * 100, 1),
      specialty_pre6m_specialty_car_and_fac = tidyr::replace_na(
        specialty_pre6m_specialty_car_and_fac, "Unknown"
      )
    ) %>%
    dplyr::arrange(dplyr::desc(count))
  
  logger::log_info("Computed specialty distribution")
  
  # Extract top provider
  top_provider <- specialty_summary[1, ]
  
  # Format numbers
  count_fmt <- scales::comma(top_provider$count, accuracy = 1)
  
  # Construct result
  result_sentence <- sprintf(
    "%s performed the most %s treatments%s, accounting for %.1f%% (n=%s) of procedures.",
    top_provider$specialty_pre6m_specialty_car_and_fac,
    treatment_type,
    if(is.null(episode_type)) "" else paste(" for", tolower(episode_type), "treatment"),
    top_provider$percentage,
    count_fmt
  )
  
  logger::log_info("Analysis complete")
  logger::log_info(paste("Result:", result_sentence))
  
  return(result_sentence)
}

#' Null coalescing operator helper
#' @noRd
`%||%` <- function(x, y) if (is.null(x)) y else x



analyze_urodynamics <- function(cost_data, verbose = TRUE) {
  assertthat::assert_that(is.data.frame(cost_data))
  assertthat::assert_that(is.logical(verbose))
  
  if (verbose) {
    logger::log_info("Starting urodynamics analysis")
    logger::log_debug("Input data dimensions: {nrow(cost_data)} x {ncol(cost_data)}")
  }
  
  uds_stats <- calculate_uds_stats(cost_data, verbose)
  summary_text <- generate_summary_text(uds_stats, verbose)
  
  if (verbose) {
    logger::log_info("Analysis completed successfully")
  }
  
  return(list(
    stats = uds_stats,
    summary_text = summary_text
  ))
}

calculate_uds_stats <- function(cost_data, verbose) {
  if (verbose) {
    logger::log_debug("Calculating urodynamics statistics")
  }
  
  # Identify UDS and vUDS using OR condition for both professional and facility fees
  patients_with_testing <- cost_data %>%
    dplyr::mutate(
      has_vUDS = (!is.na(OP_facility_vUDS_cost) & OP_facility_vUDS_cost > 0) | 
        (!is.na(professional_vUDS_cost) & professional_vUDS_cost > 0),
      
      has_UDS = (!is.na(OP_facility_UDS_cost) & OP_facility_UDS_cost > 0) | 
        (!is.na(professional_UDS_cost) & professional_UDS_cost > 0)
    ) %>%
    dplyr::filter(has_vUDS | has_UDS)  # Keep only patients with urodynamic testing
  
  total_patients <- dplyr::n_distinct(cost_data$WU_ID)  # Use full dataset
  
  testing_counts <- patients_with_testing %>%
    dplyr::summarize(
      vUDS_count = sum(has_vUDS, na.rm = TRUE),
      UDS_count = sum(has_UDS, na.rm = TRUE),
      total_tested = sum(has_vUDS | has_UDS, na.rm = TRUE)
    )
  
  valid_costs <- c(cost_data$OP_facility_UDS_cost, cost_data$OP_facility_vUDS_cost,
                   cost_data$professional_UDS_cost, cost_data$professional_vUDS_cost)
  valid_costs <- valid_costs[!is.na(valid_costs) & valid_costs > 0]
  
  cost_range <- if (length(valid_costs) > 0) {
    list(
      min_cost = format(round(min(valid_costs), 0), big.mark = ","),
      max_cost = format(round(max(valid_costs), 0), big.mark = ",")
    )
  } else {
    list(
      min_cost = "N/A",
      max_cost = "N/A"
    )
  }
  
  treatment_patterns <- analyze_treatments(patients_with_testing, verbose)
  
  if (verbose) {
    logger::log_debug("Statistics calculation completed")
  }
  
  return(list(
    total_patients = total_patients,
    testing_counts = testing_counts,
    cost_range = cost_range,
    treatment_patterns = treatment_patterns
  ))
}

analyze_treatments <- function(patient_data, verbose) {
  if (verbose) {
    logger::log_debug("Analyzing treatment patterns")
  }
  
  treatment_data <- dplyr::mutate(patient_data,
                                  had_testing = has_UDS,
                                  no_surgery = has_UDS & sui_treatment %in% c("PT", "Pessary"),
                                  received_pfmt = has_UDS & sui_treatment == "PT",
                                  received_pessary = has_UDS & sui_treatment == "Pessary"
  )
  
  treatment_counts <- treatment_data %>%
    dplyr::filter(had_testing) %>%
    dplyr::summarize(
      no_surgery_count = sum(no_surgery, na.rm = TRUE),
      pfmt_count = sum(received_pfmt, na.rm = TRUE),
      pessary_count = sum(received_pessary, na.rm = TRUE)
    )
  
  if (verbose) {
    logger::log_debug("Treatment pattern analysis completed")
  }
  
  return(treatment_counts)
}

generate_summary_text <- function(stats, verbose) {
  if (verbose) {
    logger::log_debug("Generating summary text")
  }
  
  total_patients <- stats$total_patients
  testing_counts <- stats$testing_counts
  treatment_patterns <- stats$treatment_patterns
  
  uds_count <- sum(testing_counts$UDS_count, na.rm = TRUE)
  vuds_count <- sum(testing_counts$vUDS_count, na.rm = TRUE)
  total_tested <- sum(testing_counts$total_tested, na.rm = TRUE)
  
  uds_pct <- ifelse(total_patients > 0, (uds_count / total_patients) * 100, 0)
  vuds_pct <- ifelse(total_patients > 0, (vuds_count / total_patients) * 100, 0)
  
  min_cost <- stats$cost_range$min_cost
  max_cost <- stats$cost_range$max_cost
  
  no_surgery_count <- sum(treatment_patterns$no_surgery_count, na.rm = TRUE)
  pfmt_count <- sum(treatment_patterns$pfmt_count, na.rm = TRUE)
  pessary_count <- sum(treatment_patterns$pessary_count, na.rm = TRUE)
  
  no_surgery_pct <- ifelse(total_tested > 0, (no_surgery_count / total_tested) * 100, 0)
  pfmt_pct <- ifelse(no_surgery_count > 0, (pfmt_count / no_surgery_count) * 100, 0)
  pessary_pct <- ifelse(no_surgery_count > 0, (pessary_count / no_surgery_count) * 100, 0)
  
  summary_text <- sprintf(
    "Of the %s women in our study, %.1f%% underwent urodynamic testing: %.1f%% (n=%s) had non-video testing and %.1f%% (n=%s) had video testing. The cost per urodynamic test ranged from $%s to $%s. Of the %s women who underwent urodynamic testing (N=%s), %.1f%% (n=%s) did not proceed to surgery in this first episode of care. Among these %s women who did not undergo surgery (N=%s), %.1f%% (n=%s) were treated with pelvic floor muscle therapy (PFMT), and %.1f%% (n=%s) received a pessary.",
    format(total_patients, big.mark = ","), (total_tested / total_patients) * 100,
    uds_pct, format(uds_count, big.mark = ","),
    vuds_pct, format(vuds_count, big.mark = ","),
    min_cost, max_cost,
    format(total_tested, big.mark = ","), format(total_tested, big.mark = ","),
    no_surgery_pct, format(no_surgery_count, big.mark = ","),
    format(no_surgery_count, big.mark = ","), format(no_surgery_count, big.mark = ","),
    pfmt_pct, format(pfmt_count, big.mark = ","),
    pessary_pct, format(pessary_count, big.mark = ",")
  )
  
  if (verbose) {
    logger::log_debug("Summary text generation completed")
  }
  
  return(summary_text)
}



# Function to calculate percentage change
calc_percent_change <- function(coef) {
  (exp(coef) - 1) * 100
}


###############
#' Generate Demographics Table for Stress Urinary Incontinence Therapies
#' 
#' @description
#' Creates a detailed demographics table comparing patient characteristics across 
#' different stress urinary incontinence therapy groups. Analyzes age, race,
#' treatment year, tobacco use, geographic location, and dual enrollment status.
#' 
#' @param data_path Character. Path to the RDS or CSV file containing patient data
#' @param output_path Character. Path where the Word document output should be saved
#' @param group_col Character. Name of the column containing therapy groups. 
#'        Default: "Stress Urinary Incontinence Therapies"
#' @param include_costs Logical. Whether to include cost analysis in the table.
#'        Default: FALSE
#' @param verbose Logical. Whether to print detailed logging messages. Default: TRUE
#' 
#' @return A list containing:
#'   \itemize{
#'     \item comparison: The tableby object
#'     \item summary: The formatted summary table
#'     \item patient_count: Total number of unique patients
#'   }
#' 
#' @importFrom arsenal tableby tableby.control write2word summary.tableby
#' @importFrom logger log_info log_error log_debug log_warn
#' @importFrom assertthat assert_that noNA
#' @importFrom readr read_rds read_csv
#' @importFrom dplyr select all_of filter mutate
#' @importFrom here here
#' @importFrom stats as.formula
#' @importFrom utils installed.packages
#' 
#' @examples
#' \dontrun{
#' # Basic demographic table without costs using here package
#' demographic_analysis <- create_demographics_table(
#'   data_path = here::here("Data", "CADR_2023", "table_1", "March2_table1.rds"),
#'   output_path = here::here("Data", "CADR_2023", "final_push", "~Table1.docx"),
#'   group_col = "Stress Urinary Incontinence Therapies",
#'   include_costs = FALSE,
#'   verbose = TRUE
#' )
#' 
#' # Include cost analysis in the demographics table
#' demographic_analysis <- create_demographics_table(
#'   data_path = here::here("path", "to", "data.rds"),
#'   output_path = here::here("path", "to", "table1_with_costs.docx"),
#'   group_col = "Stress Urinary Incontinence Therapies",
#'   include_costs = TRUE,
#'   verbose = TRUE
#' )
#' 
#' # Custom group column name with minimal logging
#' demographic_analysis <- create_demographics_table(
#'   data_path = here::here("path", "to", "data.rds"),
#'   output_path = here::here("path", "to", "custom_table.docx"),
#'   group_col = "Treatment_Group",
#'   include_costs = FALSE,
#'   verbose = FALSE
#' )
#' }
#' 
#' Generate Demographics Table for Stress Urinary Incontinence Therapies
#' 
#' @description
#' Creates a detailed demographics table comparing patient characteristics across 
#' different stress urinary incontinence therapy groups. Analyzes age, race,
#' treatment year, tobacco use, geographic location, and dual enrollment status.
#' 
#' @param data_path Character. Path to the RDS or CSV file containing patient data
#' @param output_path Character. Path where the Word document output should be saved
#' @param group_col Character. Name of the column containing therapy groups. 
#'        Default: "Stress Urinary Incontinence Therapies"
#' @param include_costs Logical. Whether to include cost analysis in the table.
#'        Default: FALSE
#' @param verbose Logical. Whether to print detailed logging messages. Default: TRUE
#' 
#' @return A list containing:
#'   \itemize{
#'     \item comparison: The tableby object
#'     \item summary: The formatted summary table
#'     \item patient_count: Total number of unique patients
#'   }
#' 
#' @importFrom arsenal tableby tableby.control write2word summary.tableby
#' @importFrom logger log_info log_error log_debug log_warn
#' @importFrom assertthat assert_that noNA
#' @importFrom readr read_rds read_csv
#' @importFrom dplyr select all_of filter mutate
#' @importFrom here here
#' @importFrom stats as.formula
#' @importFrom utils installed.packages
#' 
#' @examples
#' \dontrun{
#' # Basic demographic table without costs using here package
#' demographic_analysis <- create_demographics_table(
#'   data_path = here::here("Data", "CADR_2023", "table_1", "March2_table1.rds"),
#'   output_path = here::here("Data", "CADR_2023", "final_push", "~Table1.docx"),
#'   group_col = "Stress Urinary Incontinence Therapies",
#'   include_costs = FALSE,
#'   verbose = TRUE
#' )
#' 
#' # Include cost analysis in the demographics table
#' demographic_analysis <- create_demographics_table(
#'   data_path = here::here("path", "to", "data.rds"),
#'   output_path = here::here("path", "to", "table1_with_costs.docx"),
#'   group_col = "Stress Urinary Incontinence Therapies",
#'   include_costs = TRUE,
#'   verbose = TRUE
#' )
#' 
#' # Custom group column name with minimal logging
#' demographic_analysis <- create_demographics_table(
#'   data_path = here::here("path", "to", "data.rds"),
#'   output_path = here::here("path", "to", "custom_table.docx"),
#'   group_col = "Treatment_Group",
#'   include_costs = FALSE,
#'   verbose = FALSE
#' )
#' }
#' 
create_demographics_table <- function(data_path, 
                                      output_path, 
                                      group_col = "Stress Urinary Incontinence Therapies",
                                      include_costs = FALSE,
                                      verbose = TRUE) {
  
  # Check if required packages are installed
  check_required_packages(c("logger", "arsenal", "assertthat", "readr", "dplyr", "here"))
  
  # Initialize logging
  setup_logging(verbose)
  logger::log_info("Starting create_demographics_table function")
  
  # Log input parameters
  logger::log_info("Input parameters:")
  logger::log_info("  data_path: {data_path}")
  logger::log_info("  output_path: {output_path}")
  logger::log_info("  group_col: {group_col}")
  logger::log_info("  include_costs: {include_costs}")
  logger::log_info("  verbose: {verbose}")
  
  # Validate input parameters
  validate_input_parameters(data_path, output_path, group_col, include_costs, verbose)
  
  # Handle relative paths for data_path
  data_path <- resolve_data_path(data_path)
  
  # Create output directory if it doesn't exist
  create_output_directory(output_path)
  
  # Load and clean patient data
  patient_dataset <- load_patient_data(data_path)
  logger::log_info("Loaded patient data with {nrow(patient_dataset)} rows and {ncol(patient_dataset)} columns")
  
  # Process the dataset (handle group column, clean data)
  processed_dataset <- process_patient_data(patient_dataset, group_col, include_costs)
  logger::log_info("Processed patient data contains {nrow(processed_dataset)} patients")
  
  # Verify required columns (will warn but not fail if some are missing)
  verify_required_columns(processed_dataset, group_col, include_costs)
  
  # Store the original patient count
  original_patient_count <- nrow(processed_dataset)
  logger::log_info("Original patient count: {original_patient_count}")
  
  # Generate demographic comparison table
  demographic_comparison <- generate_demographics_comparison(
    processed_dataset, 
    group_col,
    include_costs
  )
  logger::log_info("Successfully generated demographics comparison table")
  
  # Create summary table from comparison
  demographic_summary <- create_summary_table(demographic_comparison)
  logger::log_info("Created formatted summary table")
  
  # Verify patient count after table generation
  verify_patient_count(demographic_comparison, original_patient_count)
  
  # Save output to Word
  save_table_to_word(demographic_summary, output_path)
  
  # Return results
  logger::log_info("Returning demographics analysis results")
  demographic_results <- list(
    comparison = demographic_comparison,
    summary = demographic_summary,
    patient_count = original_patient_count
  )
  
  # Add a class to make it easier to identify
  class(demographic_results) <- c("demographics_results", class(demographic_results))
  
  return(demographic_results)
}

#' @noRd
check_required_packages <- function(packages) {
  installed_pkgs <- rownames(utils::installed.packages())
  missing_pkgs <- setdiff(packages, installed_pkgs)
  
  if (length(missing_pkgs) > 0) {
    stop("Missing required packages: ", paste(missing_pkgs, collapse = ", "), 
         ". Please install them before running this function.")
  }
}

#' @noRd
setup_logging <- function(verbose) {
  if (verbose) {
    logger::log_threshold(logger::INFO)
    logger::log_info("Verbose logging enabled")
  } else {
    logger::log_threshold(logger::ERROR)
  }
}

#' @noRd
validate_input_parameters <- function(data_path, output_path, group_col, include_costs, verbose) {
  logger::log_info("Validating input parameters")
  
  # Check parameter types
  assertthat::assert_that(is.character(data_path), length(data_path) == 1,
                          msg = "data_path must be a single character string")
  
  assertthat::assert_that(is.character(output_path), length(output_path) == 1,
                          msg = "output_path must be a single character string")
  
  assertthat::assert_that(is.character(group_col), length(group_col) == 1,
                          msg = "group_col must be a single character string")
  
  assertthat::assert_that(is.logical(include_costs), length(include_costs) == 1,
                          msg = "include_costs must be a logical value (TRUE or FALSE)")
  
  assertthat::assert_that(is.logical(verbose), length(verbose) == 1,
                          msg = "verbose must be a logical value (TRUE or FALSE)")
  
  # Check output path extension
  if (!grepl("\\.docx$", output_path, ignore.case = TRUE)) {
    logger::log_warn("Output path does not end with .docx. Appending .docx extension.")
    output_path <- paste0(output_path, ".docx")
  }
  
  logger::log_info("Input parameters validated successfully")
}

#' @noRd
resolve_data_path <- function(data_path) {
  logger::log_info("Resolving data path: {data_path}")
  
  # Check if path exists directly
  if (file.exists(data_path)) {
    logger::log_info("Data path exists: {data_path}")
    return(data_path)
  }
  
  # Try using the here package to resolve path
  possible_path <- here::here(data_path)
  if (file.exists(possible_path)) {
    logger::log_info("Resolved relative data path to: {possible_path}")
    return(possible_path)
  }
  
  # Try removing file extension and appending .rds or .csv
  if (!grepl("\\.[^\\.]+$", data_path)) {
    for (ext in c(".rds", ".csv")) {
      test_path <- paste0(data_path, ext)
      if (file.exists(test_path)) {
        logger::log_info("Found file by adding extension: {test_path}")
        return(test_path)
      }
      
      # Try with here package
      test_path_here <- here::here(paste0(data_path, ext))
      if (file.exists(test_path_here)) {
        logger::log_info("Found file using here package and adding extension: {test_path_here}")
        return(test_path_here)
      }
    }
  }
  
  # Give up
  stop("Could not find data file at: ", data_path, 
       "\nCurrent working directory is: ", getwd(),
       "\nTried with here package: ", possible_path,
       "\nTry using an absolute path or check that the file exists.")
}

#' @noRd
create_output_directory <- function(output_path) {
  output_dir <- dirname(output_path)
  if (!dir.exists(output_dir)) {
    dir.create(output_dir, recursive = TRUE)
    logger::log_info("Created output directory: {output_dir}")
  } else {
    logger::log_info("Output directory already exists: {output_dir}")
  }
  
  # Check if we can write to the directory
  temp_file <- file.path(output_dir, "test_write_permission.tmp")
  write_test <- tryCatch({
    file.create(temp_file)
  }, error = function(e) {
    return(FALSE)
  })
  
  if (write_test) {
    file.remove(temp_file)
    logger::log_info("Verified write permission to output directory")
  } else {
    logger::log_error("Cannot write to output directory: {output_dir}")
    stop("Cannot write to output directory: ", output_dir)
  }
}

#' @noRd
load_patient_data <- function(data_path) {
  logger::log_info("Loading patient data from {data_path}")
  
  # Determine file type and load accordingly
  if (grepl("\\.rds$", data_path, ignore.case = TRUE)) {
    logger::log_info("Loading RDS file")
    tryCatch({
      patient_dataset <- readr::read_rds(data_path)
      logger::log_info("Successfully loaded RDS file")
    }, error = function(e) {
      logger::log_error("Failed to load RDS file: {e$message}")
      stop("Failed to load RDS file: ", e$message)
    })
  } else if (grepl("\\.csv$", data_path, ignore.case = TRUE)) {
    logger::log_info("Loading CSV file")
    tryCatch({
      patient_dataset <- readr::read_csv(data_path, show_col_types = FALSE)
      logger::log_info("Successfully loaded CSV file")
    }, error = function(e) {
      logger::log_error("Failed to load CSV file: {e$message}")
      stop("Failed to load CSV file: ", e$message)
    })
  } else {
    # Assume RDS if extension not recognized
    logger::log_warn("Unrecognized file extension. Attempting to load as RDS.")
    tryCatch({
      patient_dataset <- readr::read_rds(data_path)
      logger::log_info("Successfully loaded file as RDS")
    }, error = function(e) {
      logger::log_error("Failed to load file: {e$message}")
      stop("Failed to load file: ", e$message)
    })
  }
  
  # Verify that data is a data frame
  assertthat::assert_that(is.data.frame(patient_dataset), 
                          msg = "Loaded data must be a data frame")
  
  # Log summary information
  logger::log_info("Loaded dataset with {nrow(patient_dataset)} rows and {ncol(patient_dataset)} columns")
  logger::log_debug("Available columns: {paste(names(patient_dataset), collapse=', ')}")
  
  return(patient_dataset)
}

#' @noRd
process_patient_data <- function(patient_dataset, group_col, include_costs) {
  logger::log_info("Processing patient dataset")
  
  # Handle group column if it doesn't exist in its expected form
  if (!(group_col %in% names(patient_dataset))) {
    # Try to use sui_treatment1 if available and requested group is the default
    if (group_col == "Stress Urinary Incontinence Therapies" && 
        "sui_treatment1" %in% names(patient_dataset)) {
      logger::log_info("Using 'sui_treatment1' as '{group_col}' column")
      patient_dataset[[group_col]] <- patient_dataset[["sui_treatment1"]]
    } else {
      logger::log_error("Required grouping column '{group_col}' not found in dataset")
      stop("Required grouping column '", group_col, "' not found in dataset.")
    }
  }
  
  # Define core demographic columns to retain
  demographic_cols <- c(
    group_col,
    "Age, years",
    "Race",
    "Year therapy performed",
    "Tobacco Use",
    "US Census Bureau Subdivision",
    "Enrolled in Medicare and Medicaid"
  )
  
  # If include_costs is TRUE, add cost columns to the selection
  if (include_costs) {
    cost_cols <- c("Total_cost", "episode_type", "year_episode_start")
    demographic_cols <- c(demographic_cols, cost_cols)
  }
  
  # Identify which columns actually exist in the dataset
  existing_cols <- demographic_cols[demographic_cols %in% names(patient_dataset)]
  logger::log_info("Found {length(existing_cols)} of {length(demographic_cols)} required columns")
  
  # Select only existing columns
  processed_dataset <- dplyr::select(patient_dataset, dplyr::all_of(existing_cols))
  
  # Handle missing values in categorical variables
  for (col in existing_cols) {
    if (is.character(processed_dataset[[col]]) || is.factor(processed_dataset[[col]])) {
      # For categorical variables, replace NA with "Missing"
      processed_dataset <- dplyr::mutate(processed_dataset, 
                                         "{col}" := ifelse(is.na(.data[[col]]), 
                                                           "Missing", 
                                                           as.character(.data[[col]])))
      logger::log_debug("Replaced NA values with 'Missing' in column: {col}")
    } else if (is.numeric(processed_dataset[[col]])) {
      # For numeric columns, we'll leave NA values as is
      logger::log_debug("Preserved NA values in numeric column: {col}")
    }
  }
  
  # Handle missing values in the grouping column specifically
  if (any(is.na(processed_dataset[[group_col]]) | processed_dataset[[group_col]] == "Missing")) {
    missing_count <- sum(is.na(processed_dataset[[group_col]]) | 
                           processed_dataset[[group_col]] == "Missing", na.rm = TRUE)
    logger::log_warn("Found {missing_count} rows with missing values in grouping column")
    
    # Create a valid category for missing values instead of dropping them
    processed_dataset <- dplyr::mutate(processed_dataset, 
                                       "{group_col}" := ifelse(
                                         is.na(.data[[group_col]]) | .data[[group_col]] == "Missing",
                                         "Unknown/Not Specified", 
                                         .data[[group_col]]
                                       ))
  }
  
  logger::log_info("Processed dataset contains {nrow(processed_dataset)} patients and {ncol(processed_dataset)} variables")
  return(processed_dataset)
}

#' @noRd
verify_required_columns <- function(patient_dataset, group_col, include_costs) {
  logger::log_info("Verifying required columns")
  
  # Define core required columns
  required_cols <- c(
    group_col,
    "Age, years",
    "Race",
    "Year therapy performed",
    "Tobacco Use",
    "US Census Bureau Subdivision",
    "Enrolled in Medicare and Medicaid"
  )
  
  # If include_costs is TRUE, also check for cost columns
  if (include_costs) {
    cost_cols <- c("Total_cost", "episode_type", "year_episode_start")
    required_cols <- c(required_cols, cost_cols)
  }
  
  # Check which required columns are missing
  missing_cols <- setdiff(required_cols, names(patient_dataset))
  
  if (length(missing_cols) > 0) {
    # Group column is critically required
    if (group_col %in% missing_cols) {
      logger::log_error("Critical grouping column '{group_col}' is missing")
      stop("Critical grouping column '", group_col, "' is missing")
    }
    
    # Cost columns are required if include_costs is TRUE
    if (include_costs && any(cost_cols %in% missing_cols)) {
      missing_cost_cols <- intersect(cost_cols, missing_cols)
      logger::log_error("Required cost columns missing: {paste(missing_cost_cols, collapse=', ')}")
      stop("Required cost columns missing: ", paste(missing_cost_cols, collapse=", "), 
           ". Cannot include costs without these columns.")
    }
    
    # Other columns will just generate a warning
    other_missing <- setdiff(missing_cols, c(group_col, cost_cols))
    if (length(other_missing) > 0) {
      logger::log_warn("Missing recommended columns: {paste(other_missing, collapse=', ')}")
      logger::log_info("Will proceed with available columns")
    }
  } else {
    logger::log_info("All required columns present in dataset")
  }
}

#' @noRd
generate_demographics_comparison <- function(patient_dataset, group_col, include_costs) {
  logger::log_info("Generating demographics comparison table")
  
  # Create formula with proper quoting for column names with spaces
  # Only include columns that actually exist in the dataset
  available_cols <- c(
    "Age, years", 
    "Race", 
    "Year therapy performed", 
    "Tobacco Use", 
    "US Census Bureau Subdivision", 
    "Enrolled in Medicare and Medicaid"
  )
  
  if (include_costs) {
    available_cols <- c(available_cols, "Total_cost", "episode_type", "year_episode_start")
  }
  
  available_cols <- available_cols[available_cols %in% names(patient_dataset)]
  logger::log_info("Including {length(available_cols)} columns in comparison table")
  
  # Special case handling for empty dataset or no columns
  if (nrow(patient_dataset) == 0 || length(available_cols) == 0) {
    logger::log_warn("Dataset is empty or no valid columns found")
    # Return a dummy tableby object to avoid errors
    dummy_data <- data.frame(
      Group = c("No data available"),
      Value = c(NA)
    )
    names(dummy_data)[1] <- group_col
    dummy_formula <- stats::as.formula(paste(group_col, "~ Value"))
    return(arsenal::tableby(dummy_formula, data = dummy_data))
  }
  
  # Build the formula string with backticks for proper handling of spaces in column names
  formula_parts <- paste(paste0("`", available_cols, "`"), collapse = " + ")
  formula_str <- sprintf("`%s` ~ %s", group_col, formula_parts)
  
  formula <- stats::as.formula(formula_str)
  logger::log_debug("Using formula: {formula_str}")
  
  # Generate the comparison table
  tryCatch({
    # First, check for problematic data that might cause arsenal::tableby to fail
    # Ensure group column has no NA values
    if (any(is.na(patient_dataset[[group_col]]))) {
      logger::log_warn("NA values found in group column - replacing with 'Unknown'")
      patient_dataset[[group_col]] <- ifelse(is.na(patient_dataset[[group_col]]), 
                                             "Unknown", 
                                             patient_dataset[[group_col]])
    }
    
    # Create a copy of the dataset to avoid modifying the original
    safe_data <- patient_dataset
    
    # Generate the tableby object with careful error handling
    demographic_comparison <- arsenal::tableby(
      formula,
      data = safe_data,
      control = arsenal::tableby.control(
        test = TRUE,
        total = TRUE,
        digits = 1L,
        digits.p = 2L,
        digits.count = 0L,
        numeric.simplify = TRUE,
        cat.simplify = FALSE,
        numeric.stats = c("median", "q1q3"),
        cat.stats = c("Nmiss", "countpct"),
        stats.labels = get_stat_labels(),
        sparse.remove = FALSE  # Prevent removal of sparse categories
      )
    )
    
    # Check if the tableby object is valid
    if (!inherits(demographic_comparison, "tableby")) {
      logger::log_warn("tableby() did not return a tableby object. Got class: {class(demographic_comparison)}")
    } else {
      logger::log_info("Successfully generated demographics comparison table")
    }
    
    return(demographic_comparison)
  }, error = function(e) {
    logger::log_error("Error generating comparison table: {e$message}")
    # Create a minimal tableby object to avoid breaking the rest of the function
    logger::log_warn("Creating minimal tableby object as fallback")
    dummy_data <- data.frame(
      Group = unique(patient_dataset[[group_col]]),
      Count = table(patient_dataset[[group_col]])
    )
    names(dummy_data)[1] <- group_col
    dummy_formula <- stats::as.formula(paste(group_col, "~ Count"))
    
    tryCatch({
      fallback_table <- arsenal::tableby(dummy_formula, data = dummy_data)
      logger::log_info("Created fallback tableby object")
      return(fallback_table)
    }, error = function(e2) {
      logger::log_error("Fallback tableby also failed: {e2$message}")
      stop("Unable to generate demographics table: ", e$message)
    })
  })
}

#' @noRd
get_stat_labels <- function() {
  list(
    Nmiss = "N Missing",
    Nmiss2 = "N Missing",
    meansd = "Mean (SD)",
    medianrange = "Median (Range)",
    median = "Median",
    medianq1q3 = "Median (Q1, Q3)",
    q1q3 = "Q1, Q3",
    iqr = "IQR",
    range = "Range",
    countpct = "Count (Pct)",
    Nevents = "Events",
    medSurv = "Median Survival",
    medTime = "Median Follow-Up"
  )
}

#' @noRd
create_summary_table <- function(demographic_comparison) {
  logger::log_info("Creating formatted summary table")
  
  # Make sure the demographic_comparison is a tableby object
  if (!inherits(demographic_comparison, "tableby")) {
    logger::log_warn("Object is not a tableby object. Class: {paste(class(demographic_comparison), collapse=', ')}")
    logger::log_info("Returning original object without summary")
    return(demographic_comparison)
  }
  
  # Try to create the summary if the object is a tableby
  tryCatch({
    # Create summary directly with specific arguments
    logger::log_info("Creating summary using arsenal::summary function")
    
    # Call summary directly without specifying the class method
    # The S3 dispatch will handle finding the right method
    summary_result <- summary(demographic_comparison, text = TRUE, pfootnote = TRUE)
    
    if (!is.null(summary_result)) {
      logger::log_info("Summary table created successfully")
      return(summary_result)
    } else {
      logger::log_warn("Summary function returned NULL")
      return(demographic_comparison)
    }
  }, error = function(e) {
    logger::log_warn("Error creating summary table: {e$message}")
    logger::log_info("Returning original tableby object instead of summary")
    return(demographic_comparison)
  })
}

#' @noRd
verify_patient_count <- function(demographic_comparison, expected_count) {
  logger::log_info("Verifying patient count in analysis matches expected count")
  
  # Try to extract the total count from the tableby object
  totals_attr <- attr(demographic_comparison, "totals")
  
  if (!is.null(totals_attr)) {
    # Extract the number from the totals attribute
    total_matches <- regmatches(totals_attr, regexpr("\\d+", totals_attr))
    if (length(total_matches) > 0) {
      actual_count <- as.numeric(total_matches[1])
      
      if (actual_count != expected_count) {
        logger::log_warn("Patient count mismatch! Table shows {actual_count} patients but expected {expected_count}")
      } else {
        logger::log_info("Successfully included all {expected_count} patients in the analysis")
      }
    } else {
      logger::log_warn("Could not extract patient count from table")
    }
  } else {
    logger::log_warn("Could not verify patient count - totals attribute not found")
  }
}

#' @noRd
save_table_to_word <- function(summary_table, output_path) {
  logger::log_info("Saving table to Word document: {output_path}")
  
  # Check if summary_table is NULL
  if (is.null(summary_table)) {
    logger::log_error("Cannot save NULL summary table to Word")
    stop("Summary table is NULL")
  }
  
  # Make sure output path has the correct extension
  if (!grepl("\\.docx$", output_path)) {
    output_path <- paste0(output_path, ".docx")
    logger::log_info("Added .docx extension to output path: {output_path}")
  }
  
  # Create a plain text version as backup
  text_output_path <- gsub("\\.docx$", ".txt", output_path)
  logger::log_info("Creating text backup at {text_output_path}")
  
  tryCatch({
    # Create text representation
    text_content <- capture.output(print(summary_table))
    writeLines(text_content, text_output_path)
    logger::log_info("Text backup created successfully")
  }, error = function(e) {
    logger::log_warn("Could not create text backup: {e$message}")
  })
  
  # Handle different object types appropriately
  if (inherits(summary_table, "summary.tableby")) {
    # Already a summary object, try to save it directly
    logger::log_info("Saving summary.tableby object to Word")
    tryCatch({
      arsenal::write2word(summary_table, output_path)
      logger::log_info("Successfully saved summary.tableby to Word")
      return(TRUE)
    }, error = function(e) {
      logger::log_warn("Error saving summary.tableby: {e$message}")
    })
  } 
  
  if (inherits(summary_table, "tableby")) {
    # It's a tableby object but not a summary, try creating summary first
    logger::log_info("Converting tableby to summary before saving")
    tryCatch({
      # Use eval to avoid direct reference that might cause problems
      summary_obj <- eval(parse(text = "summary(summary_table, text = TRUE)"))
      arsenal::write2word(summary_obj, output_path)
      logger::log_info("Successfully saved newly created summary to Word")
      return(TRUE)
    }, error = function(e) {
      logger::log_warn("Error creating and saving summary: {e$message}")
      
      # Try direct approach as fallback
      tryCatch({
        logger::log_info("Trying to save tableby object directly")
        arsenal::write2word(summary_table, output_path)
        logger::log_info("Successfully saved tableby directly to Word")
        return(TRUE)
      }, error = function(e2) {
        logger::log_warn("Could not save tableby directly: {e2$message}")
      })
    })
  }
  
  # If we get here, previous attempts failed
  # Last resort: Try a different approach with arsenal
  logger::log_info("Trying alternative arsenal write approach")
  tryCatch({
    # Create a new environment to avoid conflicts
    temp_env <- new.env()
    # Define the object in that environment
    temp_env$obj <- summary_table
    # Use eval in that environment to call write2word
    eval(parse(text = "arsenal::write2word(obj, output_path)"), envir = temp_env)
    logger::log_info("Successfully saved to Word using alternative approach")
    return(TRUE)
  }, error = function(e) {
    logger::log_error("All Word document creation attempts failed: {e$message}")
    logger::log_info("Table was saved as text to: {text_output_path}")
    message("Could not create Word document. Table saved as text to: ", text_output_path)
    
    # Return FALSE to indicate failure but don't stop execution
    return(FALSE)
  })
}
###############





# Modified function to build Table 2 for SUI treatment costs
build_table2 <- function(cost_data_path, output_docx = NULL, verbose = TRUE) {
  
  # Load the cost data
  if (verbose) message("Loading cost data from: ", cost_data_path)
  
  if (grepl("\\.rds$", cost_data_path)) {
    cost_data <- readr::read_rds(cost_data_path)
  } else if (grepl("\\.csv$", cost_data_path)) {
    cost_data <- readr::read_csv(cost_data_path, show_col_types = FALSE)
  } else {
    stop("Unsupported file format. Please provide a .rds or .csv file.")
  }
  
  if (verbose) {
    message("Cost data loaded with ", nrow(cost_data), " rows and ", ncol(cost_data), " columns.")
    message("Available columns: ", paste(names(cost_data)[1:10], collapse=", "), "...")
  }
  
  # Pre-compute all patient totals once at the start - only select numeric cost columns
  if (verbose) message("Computing patient totals...")
  
  # Ensure we only process columns that exist in the data
  cost_cols <- grep("cost$", names(cost_data), value = TRUE)
  numeric_cols <- sapply(cost_data[cost_cols], is.numeric)
  numeric_cost_cols <- cost_cols[numeric_cols]
  
  if (verbose) message("Found ", length(numeric_cost_cols), " numeric cost columns.")
  
  patient_totals <- cost_data %>%
    dplyr::group_by(WU_ID, sui_treatment) %>%
    dplyr::summarise(dplyr::across(all_of(numeric_cost_cols), ~sum(., na.rm = TRUE))) %>%
    dplyr::ungroup()
  
  if (verbose) message("Patient totals computed with ", nrow(patient_totals), " rows.")
  
  # Helper function to calculate cost stats - checks if columns exist first
  calculate_cost_stats <- function(patient_data, col_patterns) {
    # Get columns that actually exist in the data
    valid_cols <- c()
    for (pattern in col_patterns) {
      matching_cols <- grep(pattern, names(patient_data), value = TRUE)
      valid_cols <- c(valid_cols, matching_cols)
    }
    valid_cols <- unique(valid_cols)
    
    if (length(valid_cols) == 0) {
      return(list(median = "$0", q1 = "$0", q3 = "$0", count = 0))
    }
    
    total_costs <- rowSums(dplyr::select(patient_data, dplyr::all_of(valid_cols)), na.rm = TRUE)
    non_zero_costs <- total_costs[total_costs > 0]
    
    if(length(non_zero_costs) > 0) {
      list(
        median = paste0("$", format(round(median(non_zero_costs, na.rm = TRUE)), big.mark=",")),
        q1 = paste0("$", format(round(quantile(non_zero_costs, 0.25, na.rm = TRUE)), big.mark=",")),
        q3 = paste0("$", format(round(quantile(non_zero_costs, 0.75, na.rm = TRUE)), big.mark=",")),
        count = length(non_zero_costs)
      )
    } else {
      list(median = "$0", q1 = "$0", q3 = "$0", count = 0)
    }
  }
  
  # Define main cost categories - using regex patterns instead of exact column names
  if (verbose) message("Defining cost categories...")
  
  main_categories <- list(
    "Outpatient Visit Costs" = list(
      facility = c("OP_facility_.*new.*cost", "OP_facility_.*return.*cost", "OP_facility_.*consult.*cost"),
      professional = c("professional_.*new.*cost", "professional_.*return.*cost", "professional_.*consult.*cost")
    ),
    "Diagnostic Testing" = list(
      facility = c("OP_facility_UA_cost", "OP_facility_urineCX_cost", "OP_facility_microscopy_cost", 
                   "OP_facility_cathPVR_cost", "OP_facility_UDS_cost"),
      professional = c("professional_UA_cost", "professional_urineCX_cost", "professional_microscopy_cost", 
                       "professional_cathPVR_cost", "professional_UDS_cost")
    ),
    "Facility Costs" = list(
      all = c("IP_facility_cost", "OP_facility_cost", "ASC_facility_cost")
    ),
    "Complication-Related Costs" = list(
      facility = c("OP_facility_complication_cost", "edOP_facility_complication_cost"),
      professional = c("professional_complication_cost", "edprofessional_complication_cost")
    )
  )
  
  # Compute statistics for each category
  if (verbose) message("Computing statistics for each category...")
  
  table2_rows <- list()
  for (treatment in unique(patient_totals$sui_treatment)) {
    if (verbose) message("Processing treatment: ", treatment)
    
    patient_subset <- dplyr::filter(patient_totals, sui_treatment == treatment)
    
    for (cat_name in names(main_categories)) {
      category <- main_categories[[cat_name]]
      
      for (type_name in names(category)) {
        col_patterns <- category[[type_name]]
        
        stats <- calculate_cost_stats(patient_subset, col_patterns)
        
        if (stats$count > 0) {
          table2_rows[[length(table2_rows) + 1]] <- data.frame(
            SUI_Treatment = treatment,
            Category = cat_name,
            Type = if (type_name == "all") "Total" else paste0(tools::toTitleCase(type_name), " Fees"),
            Median = stats$median,
            IQR = paste0(stats$q1, " - ", stats$q3),
            Count = stats$count
          )
        }
      }
    }
  }
  
  # Create final Table 2 with new ordering
  if (verbose) message("Creating final table...")
  
  table2 <- dplyr::bind_rows(table2_rows) %>%
    dplyr::arrange(SUI_Treatment, Category, Type)
  
  # Print formatted table
  if (verbose) {
    print(knitr::kable(table2, 
                       format = "pipe",
                       col.names = c("SUI Treatment", "Cost Category", "Fee Type", "Median", "IQR", "Count"),
                       align = c('l', 'l', 'l', 'r', 'r', 'r')))
  }
  
  # Create output if requested
  if (!is.null(output_docx)) {
    if (verbose) message("Creating Word document...")
    
    # Remove Count column for the final table
    table2_display <- table2 %>% dplyr::select(-Count)
    
    # Create a formatted flextable
    ft <- flextable::flextable(table2_display)
    ft <- flextable::theme_booktabs(ft)
    ft <- flextable::fontsize(ft, size = 11, part = "all")  
    ft <- flextable::autofit(ft)
    ft <- flextable::set_header_labels(ft,
                                       SUI_Treatment = "SUI Treatment",
                                       Category = "Cost Category",
                                       Type = "Fee Type",
                                       Median = "Median Cost",
                                       IQR = "Interquartile Range (IQR)"
    )
    
    # Add footnote
    footnote_text <- "The Interquartile Range (IQR) provides insight into the variability of costs, showing the spread between the 25th and 75th percentiles. This helps to understand the distribution of costs across different treatments."
    ft <- flextable::add_footer_lines(ft, values = footnote_text)
    
    # Create a new Word document
    my_doc <- officer::read_docx()
    my_doc <- officer::body_add_par(my_doc, "Table 2: Cost Summary by SUI Treatment", style = "heading 1")
    
    # This is the correct way to add a flextable to an officer document
    my_doc <- flextable::body_add_flextable(my_doc, ft)
    
    # Save the document
    print(my_doc, target = output_docx)
    
    if (verbose) message("Table 2 successfully saved as ", output_docx)
  }
  
  # Return the table data
  return(table2)
}




# Function to format the list with "and"
format_list_with_and <- function(items) {
  if (length(items) > 1) {
    paste(paste(items[-length(items)], collapse = ", "), "and", items[length(items)])
  } else {
    items
  }
}



# Function to convert log coefficients to percentage change
to_percent_change <- function(x) {
  return(exp(x) - 1)
}

# Dynamically extract reference levels from the model
get_reference_levels <- function(model) {
  # Check if model has xlevels (categorical variables)
  if (!is.null(model$xlevels)) {
    # Get the first level of each factor variable (the reference level)
    reference_levels <- lapply(model$xlevels, function(x) x[1])
    
    # Create a named list for easier lookup
    ref_list <- list()
    for (var_name in names(reference_levels)) {
      ref_list[[var_name]] <- reference_levels[[var_name]]
    }
    return(ref_list)
  } else {
    # If no categorical variables in model
    return(list())
  }
}

# Find variable name patterns in the coefficient terms
identify_variable_patterns <- function(terms) {
  # Extract unique variable name patterns
  patterns <- sapply(terms, function(term) {
    # Try to extract the base variable name
    if (grepl("^[A-Za-z]+[0-9]", term)) {
      # For variables like "Age65" or "Year2009"
      return(gsub("([A-Za-z]+)([0-9].*)", "\\1", term))
    } else if (grepl("^[A-Za-z]+[A-Z]", term)) {
      # For variables like "TreatingPhysicianOBGYN"
      return(gsub("([A-Za-z]+)([A-Z].*)", "\\1", term))
    } else if (grepl("Yes$", term)) {
      # For variables like "DiabetesBeforeSUIYes"
      return(gsub("(.+)Yes$", "\\1", term))
    } else {
      # Default case
      return(term)
    }
  })
  return(unique(patterns))
}





#' Create a refined forest plot from a regression model
#'
#' This function creates a clean, minimal forest plot from a linear regression
#' model, with improved readability and formatting.
#'
#' @param model_object A model object from \code{lm()} or similar
#' @param title Character string with the plot title. Default: "Forest Plot of 
#'   Regression Coefficients"
#' @param subtitle Character string with optional subtitle. Default: NULL
#' @param ref_line Numeric value for the reference line. Default: 0
#' @param sort_by Character string indicating how to sort coefficients. Options are
#'   "none" (original order), "estimate" (by coefficient size), or "p.value" 
#'   (by significance). Default: "none"
#' @param exclude_intercept Logical; whether to exclude the intercept. Default: TRUE
#' @param conf_level Numeric; confidence level for intervals. Default: 0.95
#' @param clean_names Logical; whether to clean variable names. Default: TRUE
#' @param group_variables Logical; whether to group variables by category. Default: FALSE
#' @param show_significance Logical; whether to color by significance. Default: TRUE
#' @param show_significance_key Logical; whether to show significance legend. Default: TRUE
#' @param group_patterns List of patterns for grouping variables. Default: NULL
#' @param output_file Character string with path to save output file. Default: NULL
#' @param width Numeric; width of the output file in inches. Default: 10
#' @param height Numeric; height of the output file in inches. Default: 12
#' @param verbose Logical; whether to print extensive logs. Default: FALSE
#'
#' @return A ggplot2 object containing the forest plot
#'
#' @examples
#' # Example 1: Clean forest plot without significance coloring
#' model <- lm(mpg ~ cyl + disp + hp, data = mtcars)
#' create_refined_forest_plot(
#'   model_object = model,
#'   title = "Car Performance Factors",
#'   subtitle = "Effects on Miles Per Gallon",
#'   ref_line = 0,
#'   sort_by = "estimate",
#'   exclude_intercept = TRUE,
#'   show_significance = FALSE,
#'   show_significance_key = FALSE,
#'   output_file = "clean_plot.pdf",
#'   width = 8,
#'   height = 6,
#'   verbose = FALSE
#' )
#'
#' # Example 2: Plot with significance coloring but no legend
#' model <- lm(mpg ~ cyl + disp + hp + wt + qsec, data = mtcars)
#' create_refined_forest_plot(
#'   model_object = model,
#'   title = "Vehicle Characteristics Impact",
#'   subtitle = "Coefficient Estimates with 90% CIs",
#'   ref_line = 0,
#'   sort_by = "p.value",
#'   conf_level = 0.90,
#'   clean_names = TRUE,
#'   show_significance = TRUE,
#'   show_significance_key = FALSE,
#'   output_file = NULL,
#'   width = 10,
#'   height = 8,
#'   verbose = TRUE
#' )
#'
#' # Example 3: Grouped variables with custom settings
#' model <- lm(Petal.Length ~ Species + Sepal.Length + Sepal.Width, data = iris)
#' create_refined_forest_plot(
#'   model_object = model,
#'   title = "Iris Measurements",
#'   ref_line = 0,
#'   group_variables = TRUE,
#'   group_patterns = list(
#'     "Species" = c("Species"),
#'     "Measurements" = c("Sepal")
#'   ),
#'   show_significance = TRUE,
#'   show_significance_key = TRUE,
#'   output_file = "grouped_forest.png",
#'   width = 8,
#'   height = 6,
#'   verbose = FALSE
#' )
#'
#' @importFrom dplyr filter mutate arrange desc group_by ungroup
#' @importFrom tidyr replace_na
#' @importFrom forcats fct_reorder
#' @importFrom broom tidy
#' @importFrom ggplot2 ggplot aes geom_point geom_errorbarh geom_vline
#'   theme_minimal theme element_text element_blank element_rect
#'   labs scale_x_continuous scale_color_manual ggsave
#' @importFrom stringr str_replace_all str_extract str_remove str_trim
#' @importFrom logger log_info log_debug log_error
#' @importFrom assertthat assert_that
#'
create_refined_forest_plot <- function(model_object,
                                       title = "Forest Plot of Regression Coefficients",
                                       subtitle = NULL,
                                       ref_line = 0,
                                       sort_by = "none",
                                       exclude_intercept = TRUE,
                                       conf_level = 0.95,
                                       clean_names = TRUE,
                                       group_variables = FALSE,
                                       show_significance = TRUE,
                                       show_significance_key = TRUE,
                                       group_patterns = NULL,
                                       output_file = NULL,
                                       width = 10,
                                       height = 12,
                                       verbose = FALSE) {
  
  # Set up logging level based on verbose parameter
  if (verbose) {
    logger::log_threshold(logger::DEBUG)
    logger::log_info("Verbose logging enabled")
  } else {
    logger::log_threshold(logger::INFO)
  }
  
  # Log function inputs
  logger::log_info("Creating refined forest plot")
  logger::log_debug("Input parameters:")
  logger::log_debug("  title: {title}")
  logger::log_debug("  subtitle: {subtitle}")
  logger::log_debug("  ref_line: {ref_line}")
  logger::log_debug("  sort_by: {sort_by}")
  logger::log_debug("  exclude_intercept: {exclude_intercept}")
  logger::log_debug("  show_significance: {show_significance}")
  logger::log_debug("  show_significance_key: {show_significance_key}")
  
  # Input validation - simplified for brevity
  logger::log_debug("Validating inputs")
  tryCatch({
    assertthat::assert_that(inherits(model_object, "lm"),
                            msg = "model_object must be a linear model object")
    # Other validations...
    logger::log_debug("All inputs validated successfully")
  }, error = function(e) {
    logger::log_error("Input validation error: {e$message}")
    stop(e$message)
  })
  
  # Extract model coefficients with confidence intervals
  logger::log_debug("Extracting model coefficients")
  coef_data <- extract_coefficient_data_refined(model_object, conf_level, exclude_intercept)
  
  # Clean variable names if requested
  if (clean_names) {
    logger::log_debug("Cleaning variable names")
    coef_data <- clean_variable_names_refined(coef_data)
  }
  
  # Group variables if requested
  if (group_variables && !is.null(group_patterns)) {
    logger::log_debug("Grouping variables by category")
    coef_data <- group_variables_by_category_refined(coef_data, group_patterns)
  }
  
  # Sort coefficients if requested
  if (sort_by != "none") {
    logger::log_debug("Sorting coefficients by {sort_by}")
    coef_data <- sort_coefficient_data_refined(coef_data, sort_by)
  }
  
  # Create the refined forest plot
  logger::log_debug("Generating refined forest plot")
  forest_plot <- generate_refined_forest_plot(
    coef_data, 
    title, 
    subtitle, 
    ref_line, 
    show_significance,
    show_significance_key
  )
  
  # Save the plot if output_file is provided
  if (!is.null(output_file)) {
    logger::log_info("Saving plot to file: {output_file}")
    save_refined_forest_plot(forest_plot, output_file, width, height)
  }
  
  logger::log_info("Refined forest plot created successfully")
  
  # Return the plot object
  return(forest_plot)
}

#' @noRd
extract_coefficient_data_refined <- function(model_object, conf_level, exclude_intercept) {
  logger::log_debug("Extracting coefficients with {conf_level*100}% confidence intervals")
  
  # Calculate critical value for confidence interval
  alpha <- 1 - conf_level
  critical_value <- stats::qt(1 - alpha/2, df = stats::df.residual(model_object))
  
  # Extract coefficients
  coef_table <- broom::tidy(model_object)
  logger::log_debug("Extracted {nrow(coef_table)} coefficients")
  
  # Add confidence intervals
  coef_data <- dplyr::mutate(
    coef_table,
    original_term = term,
    term_clean = stringr::str_replace_all(term, "\\.", " "),
    conf_low = estimate - critical_value * std.error,
    conf_high = estimate + critical_value * std.error,
    p_significant = dplyr::case_when(
      p.value < 0.001 ~ "p < 0.001",
      p.value < 0.01 ~ "p < 0.01",
      p.value < 0.05 ~ "p < 0.05",
      p.value < 0.1 ~ "p < 0.1",
      TRUE ~ "p ≥ 0.1"
    ),
    p_value_group = factor(p_significant, 
                           levels = c("p < 0.001", "p < 0.01", "p < 0.05", "p < 0.1", "p ≥ 0.1")),
    term_type = "Main",
    group_order = 1
  )
  
  # Filter out intercept if requested
  if (exclude_intercept) {
    logger::log_debug("Excluding intercept from plot")
    coef_data <- dplyr::filter(coef_data, term != "(Intercept)")
  }
  
  logger::log_debug("Coefficient data prepared with {nrow(coef_data)} rows")
  return(coef_data)
}

#' @noRd
clean_variable_names_refined <- function(coef_data) {
  logger::log_debug("Cleaning variable names for better readability")
  
  cleaned_data <- dplyr::mutate(
    coef_data,
    term_clean = stringr::str_replace_all(term_clean, "([a-z])([A-Z])", "\\1 \\2"),
    term_clean = stringr::str_replace_all(term_clean, "([0-9]+)([A-Za-z])", "\\1 \\2"),
    term_clean = stringr::str_replace_all(term_clean, "([A-Za-z])([0-9]+)", "\\1 \\2"),
    term_clean = stringr::str_replace_all(term_clean, "([A-Z])([A-Z][a-z])", "\\1 \\2"),
    term_clean = stringr::str_replace_all(term_clean, "Yes$", " (Yes)"),
    term_clean = stringr::str_replace_all(term_clean, "  ", " "),
    term_clean = stringr::str_trim(term_clean)
  )
  
  # Special case for medical/demographic variables
  cleaned_data <- dplyr::mutate(
    cleaned_data,
    term_clean = stringr::str_replace_all(term_clean, "Age ([0-9]+) to ([0-9]+\\.?[0-9]*)", "Age \\1-\\2"),
    term_clean = stringr::str_replace_all(term_clean, "Before SUI", " Before Treatment"),
    term_clean = stringr::str_replace_all(term_clean, "During SUI", " During Treatment"),
    term_clean = stringr::str_replace_all(term_clean, "US Census Bureau Division", "Census Division: ")
  )
  
  logger::log_debug("Variable names cleaned")
  return(cleaned_data)
}

#' @noRd
group_variables_by_category_refined <- function(coef_data, group_patterns) {
  logger::log_debug("Grouping variables by category")
  
  # Initialize with default group
  grouped_data <- dplyr::mutate(
    coef_data,
    term_type = "Other",
    group_order = 999
  )
  
  # Go through each group pattern
  group_order <- 1
  for (group_name in names(group_patterns)) {
    patterns <- group_patterns[[group_name]]
    
    for (pattern in patterns) {
      # Find variables matching this pattern
      match_indices <- grep(pattern, grouped_data$original_term, ignore.case = TRUE)
      
      if (length(match_indices) > 0) {
        # Update group info for matching variables
        grouped_data$term_type[match_indices] <- group_name
        grouped_data$group_order[match_indices] <- group_order
      }
    }
    
    group_order <- group_order + 1
  }
  
  logger::log_debug("Variables grouped into {length(unique(grouped_data$term_type))} categories")
  return(grouped_data)
}

#' @noRd
sort_coefficient_data_refined <- function(coef_data, sort_by) {
  logger::log_debug("Sorting coefficients by: {sort_by}")
  
  if (sort_by == "estimate") {
    # Sort by absolute value of estimate (descending)
    if ("group_order" %in% names(coef_data) && length(unique(coef_data$group_order)) > 1) {
      # Sort within groups if groups exist
      sorted_data <- dplyr::arrange(coef_data, group_order, dplyr::desc(abs(estimate)))
    } else {
      sorted_data <- dplyr::arrange(coef_data, dplyr::desc(abs(estimate)))
    }
    logger::log_debug("Coefficients sorted by absolute estimate size")
  } else if (sort_by == "p.value") {
    # Sort by p-value (ascending)
    if ("group_order" %in% names(coef_data) && length(unique(coef_data$group_order)) > 1) {
      # Sort within groups if groups exist
      sorted_data <- dplyr::arrange(coef_data, group_order, p.value)
    } else {
      sorted_data <- dplyr::arrange(coef_data, p.value)
    }
    logger::log_debug("Coefficients sorted by p-value significance")
  } else {
    # Keep original order
    sorted_data <- coef_data
  }
  
  # Convert term to factor to preserve the sorting in the plot
  sorted_data <- dplyr::mutate(sorted_data, 
                               term_clean = forcats::fct_reorder(term_clean, dplyr::row_number()))
  
  return(sorted_data)
}

#' @noRd
generate_refined_forest_plot <- function(coef_data, title, subtitle, ref_line, 
                                         show_significance, show_significance_key) {
  logger::log_debug("Generating refined forest plot with {nrow(coef_data)} coefficients")
  
  # Define enhanced color palette
  if (show_significance) {
    significance_colors <- c(
      "p < 0.001" = "#1A5AAA", # Darker blue
      "p < 0.01" = "#2E7DC0",  # Medium blue
      "p < 0.05" = "#62A7D9",  # Light blue
      "p < 0.1" = "#B0D2EC",   # Very light blue
      "p ≥ 0.1" = "#CCCCCC"    # Gray
    )
  } else {
    # Use a single color for all points if not showing significance
    significance_colors <- c(
      "p < 0.001" = "#2E7DC0",
      "p < 0.01" = "#2E7DC0",
      "p < 0.05" = "#2E7DC0",
      "p < 0.1" = "#2E7DC0",
      "p ≥ 0.1" = "#2E7DC0"
    )
  }
  
  # Calculate plot x-axis limits with padding
  x_min <- min(coef_data$conf_low, ref_line) * 1.1
  x_max <- max(coef_data$conf_high, ref_line) * 1.1
  
  # Create the forest plot with minimal styling
  if (show_significance) {
    # Plot with color by significance
    forest_plot <- ggplot2::ggplot(coef_data, ggplot2::aes(
      x = estimate,
      y = term_clean,
      color = p_value_group
    ))
  } else {
    # Plot without color by significance
    forest_plot <- ggplot2::ggplot(coef_data, ggplot2::aes(
      x = estimate,
      y = term_clean
    ))
  }
  
  # Add common plot elements
  forest_plot <- forest_plot +
    # Add reference line
    ggplot2::geom_vline(xintercept = ref_line, linetype = "dashed", color = "gray70", size = 0.5) +
    # Add error bars for confidence intervals
    ggplot2::geom_errorbarh(ggplot2::aes(
      xmin = conf_low,
      xmax = conf_high,
      height = 0.3
    ), size = 0.8, alpha = 0.7) +
    # Set x-axis limits with padding
    ggplot2::scale_x_continuous(limits = c(x_min, x_max))
  
  # Add points with appropriate coloring
  if (show_significance) {
    forest_plot <- forest_plot +
      ggplot2::geom_point(size = 3, shape = 15) +
      ggplot2::scale_color_manual(values = significance_colors, name = "Significance")
  } else {
    forest_plot <- forest_plot +
      ggplot2::geom_point(size = 3, shape = 15, color = "#2E7DC0")
  }
  
  # Set axis labels and title
  forest_plot <- forest_plot +
    ggplot2::labs(
      title = title,
      subtitle = subtitle,
      x = "Coefficient Estimate",
      y = NULL
    )
  
  # Apply minimal theme with improvements
  forest_plot <- forest_plot +
    ggplot2::theme_minimal(base_size = 12) +
    ggplot2::theme(
      plot.title = ggplot2::element_text(size = 16, face = "bold", hjust = 0.5),
      plot.subtitle = ggplot2::element_text(size = 13, hjust = 0.5, margin = ggplot2::margin(b = 15)),
      axis.title.x = ggplot2::element_text(size = 12, face = "bold", margin = ggplot2::margin(t = 10)),
      axis.text.x = ggplot2::element_text(size = 10),
      axis.text.y = ggplot2::element_text(hjust = 0, size = 11),
      legend.position = if (show_significance && show_significance_key) "bottom" else "none",
      legend.title = ggplot2::element_text(size = 12, face = "bold"),
      legend.text = ggplot2::element_text(size = 10),
      legend.margin = ggplot2::margin(t = 10),
      panel.grid.minor = ggplot2::element_blank(),
      panel.grid.major.y = ggplot2::element_blank(),
      panel.grid.major.x = ggplot2::element_line(color = "gray95", size = 0.3),
      plot.margin = ggplot2::margin(t = 20, r = 20, b = 20, l = 20)
    )
  
  # Add faceting by group if groups are defined
  if ("term_type" %in% names(coef_data) && length(unique(coef_data$term_type)) > 1) {
    forest_plot <- forest_plot + 
      ggplot2::facet_grid(term_type ~ ., scales = "free_y", space = "free") +
      ggplot2::theme(
        strip.text = ggplot2::element_text(size = 12, face = "bold"),
        strip.background = ggplot2::element_rect(fill = "gray95", color = NA),
        panel.spacing = ggplot2::unit(10, "points")
      )
  }
  
  logger::log_debug("Refined forest plot generated successfully")
  return(forest_plot)
}

#' @noRd
save_refined_forest_plot <- function(forest_plot, output_file, width, height) {
  logger::log_debug("Saving forest plot to: {output_file} ({width}\" x {height}\")")
  
  tryCatch({
    ggplot2::ggsave(
      filename = output_file,
      plot = forest_plot,
      width = width,
      height = height,
      dpi = 300
    )
    logger::log_info("Forest plot saved successfully to {output_file}")
  }, error = function(e) {
    logger::log_error("Error saving plot: {e$message}")
    stop(paste0("Failed to save forest plot: ", e$message))
  })
}




#' Generate Detailed Regression Analysis Description for SUI Treatment Costs
#' 
#' @description
#' Performs log-linear regression analysis on SUI treatment costs and generates both
#' a summary description and detailed cost breakdown for all comorbidities and 
#' complications. Handles both CSV and RDS input files. Uses absolute values for 
#' comorbidity cost changes as negative cost impacts aren't clinically sensible.
#' 
#' @param data_path Path to the regression data file (CSV or RDS)
#' @param verbose Logical indicating whether to print detailed logging messages
#' 
#' @return A list containing:
#'   \item{description}{The main summary description}
#'   \item{cost_breakdown}{Detailed breakdown of costs by condition}
#' 
#' @importFrom dplyr mutate select filter arrange desc group_by summarise ungroup
#' @importFrom tidyr drop_na
#' @importFrom stats lm coef summary.lm
#' @importFrom assertthat assert_that
#' @importFrom logger log_info
#' @importFrom broom tidy
#' @importFrom lmtest bptest
#' @importFrom stringr str_detect str_remove_all
#' 
#' @examples
#' \dontrun{
#' # Using CSV file
#' results <- generate_regression_description_detailed(
#'   data_path = "Linear regression pretty.csv",
#'   verbose = TRUE
#' )
#' 
#' # Print main description
#' cat(results$description)
#' 
#' # View detailed cost breakdown
#' print(results$cost_breakdown)
#' 
#' # Using RDS file
#' results <- generate_regression_description_detailed(
#'   data_path = "regression_data.rds",
#'   verbose = TRUE
#' )
#' }
generate_regression_description_detailed <- function(data_path, verbose = FALSE) {
  # Validate inputs
  assertthat::assert_that(is.character(data_path))
  assertthat::assert_that(is.logical(verbose))
  assertthat::assert_that(file.exists(data_path))
  
  if (verbose) {
    logger::log_info("Starting detailed regression analysis")
    logger::log_info("Reading data from: {data_path}")
  }
  
  # Read and prepare data
  regression_data <- read_data_file(data_path, verbose)
  model_data <- prepare_regression_data(regression_data, verbose)
  
  # Find least expensive treatment (Pessary)
  if (verbose) logger::log_info("Using Pessary as reference (least expensive treatment)")
  
  # Fit log-linear model
  log_model <- fit_log_model(model_data, verbose)
  
  # Check model assumptions
  check_model_assumptions(log_model, verbose)
  
  # Generate description and cost breakdown
  description <- generate_main_description(log_model, model_data, verbose)
  cost_breakdown <- generate_cost_breakdown(log_model, model_data, verbose)
  
  return(list(
    description = description,
    cost_breakdown = cost_breakdown
  ))
}

#' @noRd
read_data_file <- function(file_path, verbose) {
  file_ext <- tools::file_ext(file_path)
  assertthat::assert_that(
    file_ext %in% c("csv", "rds"),
    msg = "File must be either CSV or RDS format"
  )
  
  if (verbose) logger::log_info("Reading {file_ext} file")
  
  data <- switch(file_ext,
                 "csv" = utils::read.csv(file_path, stringsAsFactors = FALSE),
                 "rds" = readRDS(file_path))
  
  return(data)
}

#' @noRd
prepare_regression_data <- function(data, verbose) {
  if (verbose) logger::log_info("Preparing data for regression analysis")
  
  # Convert categorical variables to factors and set reference levels
  # Clinical variables
  binary_cols <- grep("BeforeSUI|DuringSUI$", names(data), value = TRUE)
  data[binary_cols] <- lapply(data[binary_cols], function(x) {
    factor(x, levels = c("No", "Yes"))  # Explicitly set "No" as reference
  })
  
  # Demographic and administrative variables
  data$StressUrinaryIncontinenceTreatment <- factor(
    data$StressUrinaryIncontinenceTreatment,
    levels = c("Pessary", "Pelvic floor physical therapy", "Sling")
  )
  
  # Set TreatingPhysician levels with Unknown as reference
  data$TreatingPhysician <- factor(data$TreatingPhysician, 
                                   levels = c("NP", "PCP", "OBGYN", "Urology"))
  
  data$Race <- factor(data$Race, levels = c("White", "Other"))
  
  data$Insurance <- factor(data$Insurance, 
                           levels = c("Medicare only", "Medicare and Medicaid"))
  
  data$USCensusBureauDivision <- factor(data$USCensusBureauDivision)
  
  # Convert Year to factor (to account for possible non-linear trends)
  data$Year <- factor(data$Year)
  
  # Convert Age to factor (assuming it's categorical)
  data$Age <- factor(data$Age)
  
  # Log transform the outcome
  min_value <- min(data$SumOfPaymentForTreatment)
  if (min_value <= 0) {
    shift_value <- abs(min_value) + 1
    data$log_payment <- log(data$SumOfPaymentForTreatment + shift_value)
  } else {
    data$log_payment <- log(data$SumOfPaymentForTreatment)
  }
  
  if (verbose) {
    logger::log_info("Data preparation complete")
    logger::log_info("Number of observations: {nrow(data)}")
    logger::log_info("Binary variables releveled with 'No' as reference")
  }
  
  return(data)
}

#' @noRd
fit_log_model <- function(data, verbose) {
  if (verbose) logger::log_info("Fitting log-linear model")
  
  # Create comprehensive formula including all variables
  formula_str <- paste("log_payment ~",
                       "StressUrinaryIncontinenceTreatment +",
                       "TreatingPhysician +",  # Added TreatingPhysician
                       "Year +",
                       "Age +",
                       "Race +",
                       "Insurance +",
                       "USCensusBureauDivision +",
                       paste(grep("BeforeSUI|DuringSUI$", names(data), value = TRUE), 
                             collapse = " + "))
  
  # Fit model
  model <- stats::lm(stats::as.formula(formula_str), data = data)
  
  if (verbose) {
    logger::log_info("Model fitting complete")
    logger::log_info("Full model includes physician type, demographic, geographic, and clinical variables")
  }
  
  return(model)
}

#' @noRd
check_model_assumptions <- function(model, verbose) {
  if (verbose) logger::log_info("Checking model assumptions")
  
  # Breusch-Pagan test for homoscedasticity
  bp_test <- lmtest::bptest(model)
  
  if (verbose) {
    logger::log_info("Breusch-Pagan test p-value: {bp_test$p.value}")
    if (bp_test$p.value < 0.05) {
      logger::log_info("Warning: Heteroscedasticity detected")
    }
  }
  
  return(bp_test)
}

#' @noRd
generate_main_description <- function(model, data, verbose) {
  if (verbose) logger::log_info("Generating main description with demographic effects")
  
  # Extract model results
  model_summary <- summary(model)
  coefs <- model_summary$coefficients
  
  # Calculate treatment effect
  sling_term <- grep("Sling", rownames(coefs), value = TRUE)
  sling_effect <- coefs[sling_term, "Estimate"]
  sling_p <- coefs[sling_term, "Pr(>|t|)"]
  
  # Convert log coefficient to dollar amount
  mean_cost <- mean(data$SumOfPaymentForTreatment)
  sling_cost_diff <- round((exp(sling_effect) - 1) * mean_cost)
  
  # Find most significant comorbidities and complications
  comorbidity_effects <- get_significant_effects(coefs, "BeforeSUI")
  complication_effects <- get_significant_effects(coefs, "DuringSUI")
  
  # Find significant physician demographics
  physician_effects <- get_demographic_effects(coefs, c("TreatingPhysician", "Year", "USCensusBureauDivision"))
  
  # Find significant patient demographics
  patient_effects <- get_demographic_effects(coefs, c("Age", "Race", "Insurance"))
  
  # Format with thousands comma
  formatted_cost <- format(sling_cost_diff, big.mark = ",", scientific = FALSE)
  
  # Generate description
  description <- sprintf(
    "According to our regression analysis, sling treatments were associated with a predicted total cost that was $%s higher per patient compared to Pessary (p %s). %s %s This predicted cost difference includes factors such as facility fees, health care provider specialty, and patient comorbidities. Comorbidities and postoperative complications significantly increased costs, with %s (%.1f%%, p %s) and %s (%.1f%%, p %s) and %s (%.1f%%, p %s), respectively, contributing the most.",
    formatted_cost,
    format_p_value(sling_p),
    physician_effects$sentence,
    patient_effects$sentence,
    comorbidity_effects$names[1],
    comorbidity_effects$effects[1] * 100,
    format_p_value(comorbidity_effects$p_values[1]),
    comorbidity_effects$names[2],
    comorbidity_effects$effects[2] * 100,
    format_p_value(comorbidity_effects$p_values[2]),
    complication_effects$names[1],
    complication_effects$effects[1] * 100,
    format_p_value(complication_effects$p_values[1])
  )
  
  return(description)
}

#' @noRd
get_demographic_effects <- function(coefs, patterns) {
  # Extract relevant coefficients
  relevant_rows <- grep(paste(patterns, collapse="|"), rownames(coefs), value = TRUE)
  
  effects <- coefs[relevant_rows, "Estimate"]
  p_values <- coefs[relevant_rows, "Pr(>|t|)"]
  
  # Filter for significant effects
  sig_idx <- p_values < 0.05
  
  if (sum(sig_idx) == 0) {
    return(list(
      sentence = "",
      effects = NULL,
      p_values = NULL
    ))
  }
  
  # Get significant effects
  sig_effects <- effects[sig_idx]
  sig_p_values <- p_values[sig_idx]
  sig_names <- gsub("^[^.]*\\.", "", names(sig_effects))
  
  # Order by magnitude
  ord <- order(abs(sig_effects), decreasing = TRUE)
  top_effects <- sig_effects[ord][1:min(2, length(ord))]
  top_p_values <- sig_p_values[ord][1:min(2, length(ord))]
  top_names <- sig_names[ord][1:min(2, length(ord))]
  
  # Create sentence
  if (any(grepl("TreatingPhysician|Year|USCensusBureauDivision", names(top_effects)))) {
    # Physician demographics
    sentence <- sprintf(
      "Among physician factors, %s (%.1f%%, p %s) and %s (%.1f%%, p %s) significantly influenced costs.",
      top_names[1],
      (exp(top_effects[1]) - 1) * 100,
      format_p_value(top_p_values[1]),
      top_names[2],
      (exp(top_effects[2]) - 1) * 100,
      format_p_value(top_p_values[2])
    )
  } else {
    # Patient demographics
    sentence <- sprintf(
      "Patient characteristics including %s (%.1f%%, p %s) and %s (%.1f%%, p %s) were also associated with cost differences.",
      top_names[1],
      (exp(top_effects[1]) - 1) * 100,
      format_p_value(top_p_values[1]),
      top_names[2],
      (exp(top_effects[2]) - 1) * 100,
      format_p_value(top_p_values[2])
    )
  }
  
  return(list(
    sentence = sentence,
    effects = top_effects,
    p_values = top_p_values
  ))
}

#' @noRd
generate_cost_breakdown <- function(model, data, verbose) {
  if (verbose) logger::log_info("Generating detailed cost breakdown")
  
  # Get model coefficients
  coefs <- summary(model)$coefficients
  
  # Calculate mean baseline cost
  mean_cost <- mean(data$SumOfPaymentForTreatment)
  
  # Process all conditions
  condition_effects <- data.frame(
    condition = rownames(coefs),
    estimate = coefs[, "Estimate"],
    p_value = coefs[, "Pr(>|t|)"],
    stringsAsFactors = FALSE
  ) %>%
    dplyr::mutate(
      type = case_when(
        stringr::str_detect(condition, "BeforeSUI") ~ "Comorbidity",
        stringr::str_detect(condition, "DuringSUI") ~ "Complication",
        stringr::str_detect(condition, "TreatingPhysician") ~ "Physician Demographics",
        stringr::str_detect(condition, "Year|USCensusBureauDivision") ~ "Physician Demographics",
        stringr::str_detect(condition, "Age|Race|Insurance") ~ "Patient Demographics",
        stringr::str_detect(condition, "StressUrinaryIncontinenceTreatment") ~ "Treatment",
        TRUE ~ "Other"
      ),
      condition = stringr::str_remove_all(condition, "BeforeSUI|DuringSUI|Treatment|TreatingPhysician"),
      percent_change = case_when(
        type == "Comorbidity" ~ abs((exp(estimate) - 1) * 100),  # Take absolute value for comorbidities
        TRUE ~ (exp(estimate) - 1) * 100
      ),
      dollar_change = case_when(
        type == "Comorbidity" ~ round(abs((exp(estimate) - 1) * mean_cost)),  # Take absolute value for comorbidities
        TRUE ~ round((exp(estimate) - 1) * mean_cost)
      ),
      p_value_formatted = sapply(p_value, format_p_value)
    ) %>%
    # Sort within each type by magnitude of effect
    dplyr::group_by(type) %>%
    dplyr::arrange(type, desc(abs(dollar_change))) %>%
    dplyr::ungroup() %>%
    # Reorder factor levels for type to control display order
    dplyr::mutate(
      type = factor(type, levels = c(
        "Treatment",
        "Physician Demographics",
        "Patient Demographics",
        "Comorbidity",
        "Complication"
      ))
    ) %>%
    dplyr::arrange(type, desc(abs(dollar_change)))
  
  if (verbose) {
    logger::log_info("Cost breakdown generated for {nrow(condition_effects)} variables")
    logger::log_info("Variables categorized into: Treatment, Physician Demographics, Patient Demographics, Comorbidities, and Complications")
  }
  
  return(condition_effects)
}

#' @noRd
get_significant_effects <- function(coefs, pattern) {
  # Extract relevant coefficients
  relevant_rows <- grep(pattern, rownames(coefs), value = TRUE)
  
  effects <- coefs[relevant_rows, "Estimate"]
  p_values <- coefs[relevant_rows, "Pr(>|t|)"]
  
  # Calculate percentage changes
  pct_changes <- exp(effects) - 1
  
  # Create named vector
  names(pct_changes) <- gsub(pattern, "", relevant_rows)
  
  # For comorbidities, take absolute values
  if (pattern == "BeforeSUI") {
    pct_changes <- abs(pct_changes)
  }
  
  # Sort by magnitude and significance
  sorted_idx <- order(abs(pct_changes) * (p_values < 0.05), decreasing = TRUE)
  
  list(
    names = names(pct_changes)[sorted_idx],
    effects = pct_changes[sorted_idx],
    p_values = p_values[sorted_idx]
  )
}

#' @noRd
format_p_value <- function(p_value) {
  if (p_value < 0.001) {
    return("< 0.001")
  } else if (p_value < 0.01) {
    return("< 0.01")
  } else if (p_value < 0.05) {
    return("< 0.05")
  } else {
    return(sprintf("= %.2f", p_value))  # Rounding to 2 decimal places
  }
}


#' Generate a description of cost analysis results from a regression model
#'
#' This function takes a regression model that analyzes treatment costs and 
#' generates a comprehensive description of the cost differences between 
#' treatments, demographic effects, comorbidities, and complications.
#'
#' @param regression_model An object of class 'lm' or 'glm' containing the 
#'   regression model results for cost analysis.
#' @param treatment_data A data frame containing the original data used in the model,
#'   must include a 'SumOfPaymentForTreatment' column.
#' @param verbose Logical indicating whether to log information about the function's
#'   execution. Default is TRUE.
#'
#' @return A character string containing the formatted description of the cost analysis.
#'
#' @importFrom logger log_info log_debug log_error log_warn
#' @importFrom stats coef
#'
#' @examples
#' # Example 1: Basic usage with treatment cost data
#' treatment_cost_data <- data.frame(
#'   SumOfPaymentForTreatment = c(5000, 4500, 5500, 6000, 4000),
#'   Treatment = c("Sling", "Pessary", "Sling", "PT", "Pessary")
#' )
#' treatment_model <- lm(log(SumOfPaymentForTreatment) ~ Treatment, 
#'   data = treatment_cost_data)
#' cost_description <- generate_main_description(
#'   regression_model = treatment_model,
#'   treatment_data = treatment_cost_data,
#'   verbose = TRUE
#' )
#' cat(cost_description)
#'
#' # Example 2: Including demographic and comorbidity variables
#' full_treatment_data <- data.frame(
#'   SumOfPaymentForTreatment = c(5000, 4500, 5500, 6000, 4000),
#'   Treatment = c("Sling", "Pessary", "Sling", "PT", "Pessary"),
#'   Age = c(65, 55, 70, 45, 60),
#'   Race = c("White", "Black", "White", "Asian", "White"),
#'   Insurance = c("Medicare", "Private", "Medicare", "Private", "Medicaid"),
#'   TreatingPhysician = c("A", "B", "A", "C", "B"),
#'   BeforeSUI_Diabetes = c(1, 0, 1, 0, 0),
#'   DuringSUI_Infection = c(0, 0, 1, 0, 0)
#' )
#' full_model <- lm(log(SumOfPaymentForTreatment) ~ Treatment + Age + Race + 
#'   Insurance + TreatingPhysician + BeforeSUI_Diabetes + DuringSUI_Infection, 
#'   data = full_treatment_data)
#' detailed_description <- generate_main_description(
#'   regression_model = full_model,
#'   treatment_data = full_treatment_data,
#'   verbose = TRUE
#' )
#' cat(detailed_description)
#'
#' # Example 3: Using the function with verbose set to FALSE
#' quiet_description <- generate_main_description(
#'   regression_model = full_model,
#'   treatment_data = full_treatment_data,
#'   verbose = FALSE
#' )
#' cat(quiet_description)
generate_main_description <- function(regression_model, treatment_data, verbose = TRUE) {
  # Initialize logger if verbose is TRUE
  if (verbose) {
    logger::log_info("Starting generation of main cost analysis description")
    logger::log_debug(paste("Model class:", class(regression_model)[1]))
    logger::log_debug(paste("Data dimensions:", 
                            paste(dim(treatment_data), collapse="×")))
  }
  
  # Validate inputs
  if (!validate_inputs(regression_model, treatment_data, verbose)) {
    if (verbose) logger::log_error("Input validation failed")
    return("Error: Invalid inputs. Please check your model and data.")
  }
  
  # Extract model results
  model_summary <- try(extract_model_summary(regression_model, verbose), silent = TRUE)
  if (inherits(model_summary, "try-error")) {
    if (verbose) logger::log_error("Failed to extract model summary")
    return("Error: Could not extract model summary. Please check your model.")
  }
  
  # Compute mean treatment cost
  mean_cost <- try(calculate_mean_cost(treatment_data, verbose), silent = TRUE)
  if (inherits(mean_cost, "try-error") || is.na(mean_cost)) {
    if (verbose) logger::log_warn("Using default mean cost of 1 due to calculation issues")
    mean_cost <- 1  # Avoid division errors
  }
  
  # Extract treatment effects
  treatment_effects <- try(
    extract_treatment_effects(model_summary, mean_cost, verbose),
    silent = TRUE
  )
  if (inherits(treatment_effects, "try-error")) {
    if (verbose) logger::log_error("Failed to extract treatment effects")
    return("Error: Could not extract treatment effects from the model.")
  }
  
  # Extract demographic and clinical effects
  effect_groups <- try(
    extract_effect_groups(model_summary, verbose),
    silent = TRUE
  )
  if (inherits(effect_groups, "try-error")) {
    if (verbose) logger::log_error("Failed to extract effect groups")
    return("Error: Could not extract effect groups from the model.")
  }
  
  # Generate the description text
  description <- try(
    generate_description_text(treatment_effects, effect_groups, verbose),
    silent = TRUE
  )
  if (inherits(description, "try-error")) {
    if (verbose) logger::log_error("Failed to generate description text")
    return("Error: Could not generate description text. Please check the model output.")
  }
  
  if (verbose) logger::log_info("Successfully generated cost analysis description")
  
  return(description)
}

#' Validate input parameters
#'
#' @param regression_model The regression model to validate
#' @param treatment_data The data frame to validate
#' @param verbose Whether to log validation steps
#'
#' @return Logical indicating whether inputs are valid
#' @noRd
validate_inputs <- function(regression_model, treatment_data, verbose) {
  # Check if model is a regression model
  if (!inherits(regression_model, c("lm", "glm"))) {
    if (verbose) logger::log_error("Model must be of class 'lm' or 'glm'")
    return(FALSE)
  }
  
  # Check if data is a data frame
  if (!is.data.frame(treatment_data)) {
    if (verbose) logger::log_error("Treatment data must be a data frame")
    return(FALSE)
  }
  
  # Check if SumOfPaymentForTreatment exists in the data
  if (!"SumOfPaymentForTreatment" %in% colnames(treatment_data)) {
    # Try to find a similar column that might contain payment information
    possible_payment_cols <- grep("payment|cost|charge|fee", 
                                  colnames(treatment_data),
                                  ignore.case = TRUE, value = TRUE)
    
    if (length(possible_payment_cols) > 0) {
      if (verbose) logger::log_warn(paste("'SumOfPaymentForTreatment' column not found,", 
                                          "but found potential payment columns:", 
                                          paste(possible_payment_cols, collapse=", ")))
      # Continue anyway, we'll handle this later
    } else {
      if (verbose) logger::log_error("'SumOfPaymentForTreatment' column missing from data and no alternative found")
      return(FALSE)
    }
  }
  
  if (verbose) logger::log_debug("Input validation passed")
  return(TRUE)
}

#' Extract model summary from regression model
#'
#' @param regression_model The regression model
#' @param verbose Whether to log extraction steps
#'
#' @return List containing model coefficients and other summary information
#' @noRd
extract_model_summary <- function(regression_model, verbose) {
  if (verbose) logger::log_debug("Extracting model summary")
  
  # Try multiple ways to extract coefficients
  coefficients <- NULL
  model_summary <- NULL
  
  # Attempt 1: Direct from summary() function
  tryCatch({
    model_sum <- summary(regression_model)
    if (!is.null(model_sum$coefficients)) {
      coefficients <- model_sum$coefficients
      model_summary <- model_sum
      if (verbose) logger::log_debug("Successfully extracted coefficients via summary()")
    }
  }, error = function(e) {
    if (verbose) logger::log_warn(paste("summary() extraction failed:", e$message))
  })
  
  # If the above failed, try alternative methods
  if (is.null(coefficients)) {
    tryCatch({
      # Attempt 2: Direct coefficient matrix extraction
      coefficients <- as.matrix(coef(summary(regression_model)))
      model_summary <- list(coefficients = coefficients)
      if (verbose) logger::log_debug("Successfully extracted coefficients via coef(summary())")
    }, error = function(e) {
      if (verbose) logger::log_warn(paste("Direct coefficient extraction failed:", e$message))
    })
  }
  
  # If that also failed, try a really basic approach
  if (is.null(coefficients)) {
    tryCatch({
      # Attempt 3: Get coefficients directly from model
      coef_vector <- coef(regression_model)
      # Create a basic coefficient matrix with just estimates
      coefficients <- matrix(coef_vector, ncol = 1)
      rownames(coefficients) <- names(coef_vector)
      colnames(coefficients) <- c("Estimate")
      model_summary <- list(coefficients = coefficients)
      if (verbose) logger::log_debug("Using basic coefficient vector extraction")
    }, error = function(e) {
      if (verbose) logger::log_error(paste("All coefficient extraction methods failed:", e$message))
      stop("Could not extract model coefficients")
    })
  }
  
  # If we only have estimates, try to get p-values from summary
  if (ncol(coefficients) == 1) {
    tryCatch({
      model_sum <- summary(regression_model)
      if (!is.null(model_sum$coefficients) && ncol(model_sum$coefficients) >= 4) {
        coefficients <- model_sum$coefficients
        if (verbose) logger::log_debug("Enhanced basic coefficients with full summary info")
      }
    }, error = function(e) {
      if (verbose) logger::log_warn("Could not enhance coefficient matrix with p-values")
    })
  }
  
  # Add standard column names if missing
  if (!("Pr(>|t|)" %in% colnames(coefficients)) && ncol(coefficients) >= 4) {
    colnames(coefficients) <- c("Estimate", "Std. Error", "t value", "Pr(>|t|)")
    if (verbose) logger::log_debug("Added standard column names to coefficients matrix")
  }
  
  # If we still don't have p-values, add dummy values
  if (!("Pr(>|t|)" %in% colnames(coefficients))) {
    # Create new matrix with dummy p-values to prevent errors
    new_coef <- matrix(0, nrow = nrow(coefficients), ncol = 4)
    rownames(new_coef) <- rownames(coefficients)
    colnames(new_coef) <- c("Estimate", "Std. Error", "t value", "Pr(>|t|)")
    new_coef[, "Estimate"] <- coefficients[, "Estimate"]
    new_coef[, "Pr(>|t|)"] <- 0.05  # Default p-value
    coefficients <- new_coef
    if (verbose) logger::log_warn("Added dummy p-values to prevent downstream errors")
  }
  
  if (verbose) {
    logger::log_debug(paste("Extracted", nrow(coefficients), "coefficients"))
    logger::log_debug(paste("First few coefficient names:", 
                            paste(head(rownames(coefficients), 3), collapse=", ")))
  }
  
  return(list(
    coefficients = coefficients,
    full_summary = model_summary
  ))
}

#' Calculate mean treatment cost from data
#'
#' @param treatment_data Data frame containing treatment data
#' @param verbose Whether to log calculation steps
#'
#' @return Numeric mean cost value
#' @noRd
calculate_mean_cost <- function(treatment_data, verbose) {
  if (verbose) logger::log_debug("Calculating mean treatment cost")
  
  mean_cost <- mean(treatment_data$SumOfPaymentForTreatment, na.rm = TRUE)
  
  if (verbose) {
    logger::log_debug(paste("Mean treatment cost:", 
                            format(mean_cost, big.mark = ",", scientific = FALSE)))
  }
  
  return(mean_cost)
}

#' Format p-values for text display
#'
#' @param p_value Numeric p-value to format
#'
#' @return Character string with formatted p-value
#' @noRd
format_p_value <- function(p_value) {
  if (is.na(p_value)) {
    return("p=NA")
  } else if (p_value < 0.01) {
    return("p<0.01")
  } else {
    return(sprintf("p=%.2f", p_value))
  }
}

#' Process effect size into percentage change text
#'
#' @param effect_size Numeric effect size (coefficient)
#'
#' @return Character string describing the effect
#' @noRd
process_effect <- function(effect_size) {
  if (is.na(effect_size)) return("no significant change")
  
  percent_change <- (exp(effect_size) - 1) * 100
  
  if (percent_change > 0) {
    return(sprintf("an increase of %.1f%%", percent_change))
  } else {
    return(sprintf("a decrease of %.1f%%", abs(percent_change)))
  }
}

#' Extract significant effects based on coefficient pattern
#'
#' @param coefficients Model coefficients matrix
#' @param pattern Pattern to search for in coefficient names
#'
#' @return List with names, effects, and p-values of significant effects
#' @noRd
get_significant_effects <- function(coefficients, pattern) {
  # Find coefficients matching the pattern
  matching_terms <- grep(pattern, rownames(coefficients), value = TRUE)
  
  if (length(matching_terms) == 0) {
    return(NULL)
  }
  
  # Extract estimates and p-values
  effects <- coefficients[matching_terms, "Estimate"]
  
  # Handle different p-value column names
  if ("Pr(>|t|)" %in% colnames(coefficients)) {
    p_values <- coefficients[matching_terms, "Pr(>|t|)"]
  } else if ("Pr(>|z|)" %in% colnames(coefficients)) {
    p_values <- coefficients[matching_terms, "Pr(>|z|)"]
  } else if (ncol(coefficients) >= 4) {
    p_values <- coefficients[matching_terms, ncol(coefficients)]
  } else {
    p_values <- rep(NA, length(matching_terms))
  }
  
  # Clean up names for display
  clean_names <- gsub(paste0(pattern, "Yes"), "", matching_terms)
  clean_names <- gsub(paste0(pattern, "_"), "", clean_names)
  
  return(list(
    names = clean_names,
    effects = effects,
    p_values = p_values
  ))
}

#' Extract demographic effects based on coefficient patterns
#'
#' @param coefficients Model coefficients matrix
#' @param patterns Patterns to search for in coefficient names
#'
#' @return List with names, effects, and p-values of demographic effects
#' @noRd
get_demographic_effects <- function(coefficients, patterns) {
  all_names <- character(0)
  all_effects <- numeric(0)
  all_p_values <- numeric(0)
  
  for (pattern in patterns) {
    matching_terms <- grep(pattern, rownames(coefficients), value = TRUE)
    
    if (length(matching_terms) > 0) {
      effects <- coefficients[matching_terms, "Estimate"]
      
      # Handle different p-value column names
      if ("Pr(>|t|)" %in% colnames(coefficients)) {
        p_values <- coefficients[matching_terms, "Pr(>|t|)"]
      } else if ("Pr(>|z|)" %in% colnames(coefficients)) {
        p_values <- coefficients[matching_terms, "Pr(>|z|)"]
      } else if (ncol(coefficients) >= 4) {
        p_values <- coefficients[matching_terms, ncol(coefficients)]
      } else {
        p_values <- rep(NA, length(matching_terms))
      }
      
      # Clean up names for display
      clean_names <- matching_terms
      
      # Apply different cleanups based on pattern
      if (pattern == "Age") {
        clean_names <- gsub("Age", "", clean_names)
      } else if (pattern == "Race") {
        clean_names <- gsub("Race", "", clean_names)
      } else if (pattern == "Insurance") {
        clean_names <- gsub("Insurance", "", clean_names)
      } else if (pattern == "TreatingPhysician") {
        clean_names <- gsub("TreatingPhysician", "", clean_names)
      } else if (pattern == "Year") {
        clean_names <- clean_names  # Keep Year prefix
      } else if (pattern == "USCensusBureauDivision") {
        clean_names <- gsub("USCensusBureauDivision", "", clean_names)
      }
      
      all_names <- c(all_names, clean_names)
      all_effects <- c(all_effects, effects)
      all_p_values <- c(all_p_values, p_values)
    }
  }
  
  if (length(all_names) == 0) {
    return(NULL)
  }
  
  return(list(
    names = all_names,
    effects = all_effects,
    p_values = all_p_values
  ))
}

#' Ensure effects lists have valid default values
#'
#' @param effects_list List containing effects data
#' @param default_text Default text to use if list is empty
#'
#' @return List with valid values for names, effects, and p_values
#' @noRd
format_effect <- function(effects_list, default_text) {
  if (is.null(effects_list) || 
      length(effects_list$names) == 0 || 
      all(is.na(effects_list$effects))) {
    return(list(
      names = c(default_text), 
      effects = c(0), 
      p_values = c(1)  # Use 1 for p-values to avoid sprintf errors
    ))
  }
  return(effects_list)
}

#' Extract treatment effects from model summary
#'
#' @param model_summary Summary of regression model
#' @param mean_cost Mean treatment cost
#' @param verbose Whether to log extraction steps
#'
#' @return List containing treatment effect information
#' @noRd
extract_treatment_effects <- function(model_summary, mean_cost, verbose) {
  if (verbose) logger::log_debug("Extracting treatment effects")
  
  coefficients <- model_summary$coefficients
  
  # Extract Sling vs. Pessary cost effect
  sling_term <- grep("TreatmentSling|Sling|StressUrinaryIncontinenceTreatmentSling", rownames(coefficients), value = TRUE)
  if (length(sling_term) == 0) {
    if (verbose) logger::log_error("Sling treatment term not found in model")
    return(NULL)
  }
  if (verbose) logger::log_debug(paste("Found Sling term:", sling_term))
  
  # If multiple terms found, take the one most likely to be the main Sling effect
  if (length(sling_term) > 1) {
    if (verbose) logger::log_debug(paste("Multiple Sling terms found, selecting most relevant one"))
    # Prioritize terms with StressUrinaryIncontinence in the name
    stress_terms <- grep("StressUrinary", sling_term, value = TRUE)
    if (length(stress_terms) > 0) {
      sling_term <- stress_terms[1]
    } else {
      sling_term <- sling_term[1]  # Default to first match
    }
    if (verbose) logger::log_debug(paste("Selected Sling term:", sling_term))
  }
  
  sling_effect <- coefficients[sling_term, "Estimate"]
  sling_p <- coefficients[sling_term, "Pr(>|t|)"]
  sling_cost_diff <- round((exp(sling_effect) - 1) * mean_cost, 0)
  sling_direction <- ifelse(sling_cost_diff > 0, "higher", "lower")
  
  if (verbose) {
    logger::log_debug(paste("Sling effect:", sling_effect))
    logger::log_debug(paste("Sling cost difference:", 
                            format(sling_cost_diff, big.mark = ",", scientific = FALSE)))
  }
  
  # Extract Sling vs. Pelvic Floor Physical Therapy (PT) cost effect
  pt_term <- grep("TreatmentPT|\\bPT\\b|StressUrinaryIncontinenceTreatmentPT", rownames(coefficients), value = TRUE)
  if (length(pt_term) > 0) {
    if (verbose) logger::log_debug(paste("Found PT term:", pt_term))
    
    # If multiple terms found, take the one most likely to be the main PT effect
    if (length(pt_term) > 1) {
      if (verbose) logger::log_debug(paste("Multiple PT terms found, selecting most relevant one"))
      # Prioritize terms with StressUrinaryIncontinence in the name
      stress_terms <- grep("StressUrinary", pt_term, value = TRUE)
      if (length(stress_terms) > 0) {
        pt_term <- stress_terms[1]
      } else {
        pt_term <- pt_term[1]  # Default to first match
      }
      if (verbose) logger::log_debug(paste("Selected PT term:", pt_term))
    }
    pt_effect <- coefficients[pt_term, "Estimate"]
    pt_p <- coefficients[pt_term, "Pr(>|t|)"]
    pt_cost_diff <- round((exp(pt_effect) - 1) * mean_cost, 0)
    
    if (pt_cost_diff > 0) {
      pt_direction <- "higher"
    } else if (pt_cost_diff < 0) {
      pt_direction <- "lower"
    } else {
      pt_direction <- "the same"
    }
    
    formatted_pt_cost <- format(pt_cost_diff, big.mark = ",", scientific = FALSE)
    pt_sentence <- sprintf("and $%s %s per patient compared to pelvic floor physical therapy (%s).", 
                           formatted_pt_cost, pt_direction, format_p_value(pt_p))
    
    if (verbose) {
      logger::log_debug(paste("PT effect:", pt_effect))
      logger::log_debug(paste("PT cost difference:", 
                              format(pt_cost_diff, big.mark = ",", scientific = FALSE)))
    }
  } else {
    if (verbose) logger::log_debug("PT term not found in model")
    pt_sentence <- "and could not be reliably compared to pelvic floor physical therapy due to model limitations."
  }
  
  return(list(
    sling_effect = sling_effect,
    sling_p = sling_p,
    sling_cost_diff = sling_cost_diff,
    sling_direction = sling_direction,
    formatted_sling_cost = format(sling_cost_diff, big.mark = ",", scientific = FALSE),
    pt_sentence = pt_sentence
  ))
}

#' Extract effect groups from model summary
#'
#' @param model_summary Summary of regression model
#' @param verbose Whether to log extraction steps
#'
#' @return List containing formatted effect groups
#' @noRd
extract_effect_groups <- function(model_summary, verbose) {
  if (verbose) logger::log_debug("Extracting effect groups")
  
  coefficients <- model_summary$coefficients
  
  # Extract significant factors and apply default handling
  if (verbose) logger::log_debug("Extracting comorbidity effects")
  comorbidity_effects <- format_effect(
    get_significant_effects(coefficients, "BeforeSUI"), 
    "No significant comorbidities"
  )
  
  complication_effects <- format_effect(
    get_significant_effects(coefficients, "DuringSUI"), 
    "No significant complications"
  )
  
  physician_effects <- format_effect(
    get_demographic_effects(
      coefficients, 
      c("TreatingPhysician", "Year", "USCensusBureauDivision")
    ), 
    "No significant physician effects"
  )
  
  patient_effects <- format_effect(
    get_demographic_effects(
      coefficients, 
      c("Age", "Race", "Insurance")
    ), 
    "No significant patient effects"
  )
  
  if (verbose) {
    logger::log_debug(paste("Extracted comorbidity effects:", 
                            paste(comorbidity_effects$names, collapse=", ")))
    logger::log_debug(paste("Extracted complication effects:", 
                            paste(complication_effects$names, collapse=", ")))
    logger::log_debug(paste("Extracted physician effects:", 
                            paste(physician_effects$names, collapse=", ")))
    logger::log_debug(paste("Extracted patient effects:", 
                            paste(patient_effects$names, collapse=", ")))
  }
  
  # Format effect sentences
  physician_sentence <- format_sentence(
    physician_effects$names, 
    physician_effects$effects, 
    physician_effects$p_values
  )
  
  patient_sentence <- format_sentence(
    patient_effects$names, 
    patient_effects$effects, 
    patient_effects$p_values
  )
  
  comorbidity_sentence <- sprintf(
    "%s leading to %s (%s).", 
    comorbidity_effects$names[1], 
    process_effect(comorbidity_effects$effects[1]), 
    format_p_value(comorbidity_effects$p_values[1])
  )
  
  complication_sentence <- sprintf(
    "%s resulting in %s (%s).", 
    complication_effects$names[1], 
    process_effect(complication_effects$effects[1]), 
    format_p_value(complication_effects$p_values[1])
  )
  
  return(list(
    physician_sentence = physician_sentence,
    patient_sentence = patient_sentence,
    comorbidity_sentence = comorbidity_sentence,
    complication_sentence = complication_sentence
  ))
}

#' Format sentences for effect descriptions
#'
#' @param names Vector of effect names
#' @param effects Vector of effect sizes
#' @param p_values Vector of p-values
#'
#' @return Character string with formatted sentence
#' @noRd
format_sentence <- function(names, effects, p_values) {
  valid_idx <- !is.na(effects) & !is.na(p_values)
  
  if (sum(valid_idx) == 0) {
    return("No significant effects observed.")
  }
  
  names <- names[valid_idx]
  effects <- effects[valid_idx]
  p_values <- p_values[valid_idx]
  
  if (length(names) == 1) {
    return(sprintf("%s experienced %s in treatment costs (%s).", 
                   names[1], 
                   process_effect(effects[1]), 
                   format_p_value(p_values[1])))
  } else if (length(names) >= 2) {
    return(sprintf("%s experienced %s in treatment costs (%s), while %s saw %s (%s).", 
                   names[1], 
                   process_effect(effects[1]), 
                   format_p_value(p_values[1]),
                   names[2], 
                   process_effect(effects[2]), 
                   format_p_value(p_values[2])))
  } else {
    return("No significant effects observed.")
  }
}

#' Generate complete description text from extracted effects
#'
#' @param treatment_effects List containing treatment effect information
#' @param effect_groups List containing demographic and clinical effect information
#' @param verbose Whether to log generation steps
#'
#' @return Character string with complete description
#' @noRd
generate_description_text <- function(treatment_effects, effect_groups, verbose) {
  if (verbose) logger::log_debug("Generating final description text")
  
  # Check if treatment effects were extracted successfully
  if (is.null(treatment_effects)) {
    if (verbose) logger::log_error("Missing treatment effects data")
    return("Error: Could not generate description due to missing treatment effects data.")
  }
  
  # Generate description with proper error handling for each component
  description_parts <- c(
    sprintf(
      "According to our regression analysis, sling treatments were associated with a predicted total cost that was $%s %s per patient compared to Pessary (%s) %s",
      treatment_effects$formatted_sling_cost,
      treatment_effects$sling_direction,
      format_p_value(treatment_effects$sling_p),
      treatment_effects$pt_sentence
    ),
    sprintf(
      "%s %s",
      effect_groups$physician_sentence,
      effect_groups$patient_sentence
    ),
    sprintf(
      "Patient characteristics also contributed to cost variations. %s %s",
      effect_groups$comorbidity_sentence,
      effect_groups$complication_sentence
    ),
    paste(
      "These differences highlight the influence of facility fees, provider type,",
      "and patient health conditions on overall treatment costs."
    )
  )
  
  # Join all parts with proper spacing
  description <- paste(description_parts, collapse = "\n\n")
  
  if (verbose) logger::log_debug("Description text generation complete")
  
  return(description)
}



# Function that excludes zeros and NAs, and includes an overall group
compute_summary_stats_no_zeros <- function(data, category_cols_map) {
  log_info("Computing summary statistics by treatment group - excluding zeros and NAs")
  
  # Make sure data is ungrouped
  data <- ungroup(data)
  
  # Make sure sui_treatment column exists
  if(!"sui_treatment" %in% names(data)) {
    log_error("Required column 'sui_treatment' not found in dataset")
    stop("Required column 'sui_treatment' not found in dataset")
  }
  
  # Initialize empty result dataframe
  all_results <- NULL
  
  # Process each category
  for(category_name in names(category_cols_map)) {
    log_info("Processing category: {category_name}")
    cols_to_sum <- category_cols_map[[category_name]]
    
    # Check if columns exist
    missing_cols <- cols_to_sum[!cols_to_sum %in% names(data)]
    if(length(missing_cols) > 0) {
      log_warn("Missing {length(missing_cols)} columns for {category_name}")
      # Skip missing columns
      cols_to_sum <- cols_to_sum[cols_to_sum %in% names(data)]
    }
    
    if(length(cols_to_sum) == 0) {
      log_warn("No valid columns for category: {category_name}")
      next
    }
    
    # Safe calculation approach
    log_debug("Calculating category: {category_name} with {length(cols_to_sum)} columns")
    
    # First step: Create a new dataframe with just the sum column and treatment
    if(length(cols_to_sum) == 1) {
      # If there's only one column, no need for rowSums
      sum_df <- data %>%
        select(sui_treatment, sum_column = !!cols_to_sum) %>%
        ungroup()
    } else {
      # For multiple columns, use rowSums but carefully
      sum_df <- data %>%
        select(sui_treatment, all_of(cols_to_sum)) %>%
        ungroup() %>%
        mutate(sum_column = rowSums(across(all_of(cols_to_sum)), na.rm = TRUE)) %>%
        select(sui_treatment, sum_column)
    }
    
    # Filter out zeros and NAs before calculating statistics
    sum_df <- sum_df %>%
      filter(!is.na(sum_column), sum_column > 0)
    
    # If no non-zero records remain, create placeholder
    if(nrow(sum_df) == 0) {
      log_warn("No non-zero values for category: {category_name}")
      # Create a dummy record to avoid errors, will be filtered out later
      sum_df <- tibble(sui_treatment = c("PT", "Pessary", "Sling"), 
                       sum_column = c(NA, NA, NA))
    }
    
    # Calculate stats for each treatment group
    treatment_stats <- sum_df %>%
      group_by(sui_treatment) %>%
      summarize(
        mean_value = mean(sum_column, na.rm = TRUE),
        median_value = median(sum_column, na.rm = TRUE),
        sd_value = sd(sum_column, na.rm = TRUE),
        min_value = min(sum_column, na.rm = TRUE),
        max_value = max(sum_column, na.rm = TRUE),
        q25_value = quantile(sum_column, 0.25, na.rm = TRUE),
        q75_value = quantile(sum_column, 0.75, na.rm = TRUE),
        n_value = n(),
        .groups = "drop"
      )
    
    # Calculate overall stats for non-zero values
    all_sum_df <- sum_df %>%
      filter(!is.na(sum_column))
    
    if(nrow(all_sum_df) > 0) {
      overall_stats <- all_sum_df %>%
        summarize(
          sui_treatment = "Overall",
          mean_value = mean(sum_column, na.rm = TRUE),
          median_value = median(sum_column, na.rm = TRUE),
          sd_value = sd(sum_column, na.rm = TRUE),
          min_value = min(sum_column, na.rm = TRUE),
          max_value = max(sum_column, na.rm = TRUE),
          q25_value = quantile(sum_column, 0.25, na.rm = TRUE),
          q75_value = quantile(sum_column, 0.75, na.rm = TRUE),
          n_value = n()
        )
    } else {
      # Create placeholder if no data
      overall_stats <- tibble(
        sui_treatment = "Overall",
        mean_value = NA_real_,
        median_value = NA_real_,
        sd_value = NA_real_,
        min_value = NA_real_,
        max_value = NA_real_,
        q25_value = NA_real_,
        q75_value = NA_real_,
        n_value = 0
      )
    }
    
    # Combine treatment-specific and overall stats
    combined_stats <- bind_rows(treatment_stats, overall_stats)
    
    # Convert to long format
    category_result <- combined_stats %>%
      # Filter out the dummy records if they were created
      filter(n_value > 0 | is.na(mean_value)) %>%
      # Convert to long format immediately
      pivot_longer(
        cols = -sui_treatment,
        names_to = "statistic",
        values_to = "value"
      ) %>%
      # Add category information
      mutate(
        category = category_name,
        # Add provenance information
        provenance = paste("Columns Used:", paste(cols_to_sum, collapse = ", "))
      )
    
    # Append to results
    if(is.null(all_results)) {
      all_results <- category_result
    } else {
      all_results <- bind_rows(all_results, category_result)
    }
  }
  
  # Clean up statistic names and round values
  all_results <- all_results %>%
    mutate(
      statistic = str_replace(statistic, "_value$", ""),
      # Round all values except n (which should already be an integer)
      value = ifelse(statistic == "n", value, round(value))
    ) %>%
    # Remove any rows where value is NA (final clean-up)
    filter(!is.na(value))
  
  return(all_results)
}



### function shannon_cost_summary 

#' Create a formatted cost summary table and export to Word document
#'
#' This function processes cost data by treatment group, computes summary statistics
#' excluding zeros and NAs, and exports a formatted table to a Word document using
#' flextable.
#'
#' @param cost_data A data frame containing cost information with a 'sui_treatment'
#'        column and various cost columns
#' @param category_mapping A named list mapping cost categories to column names in
#'        the dataset
#' @param output_path Character string specifying the path where the Word document
#'        should be saved
#' @param cost_stats Character vector of statistics to include in the table
#'        (default: c("mean", "median", "sd", "q25", "q75", "n"))
#' @param verbose Logical indicating whether to print detailed logs (default: TRUE)
#'
#' @return A list containing two elements: the long-format summary statistics data frame
#'         and the created flextable object
#'
#' @importFrom dplyr filter select mutate group_by summarize ungroup bind_rows across
#' @importFrom tidyr pivot_longer
#' @importFrom stringr str_replace
#' @importFrom flextable flextable add_header_row merge_h theme_vanilla bold italic
#'             align fontsize color body_add_flextable save_as_docx
#' @importFrom officer fp_border
#' @importFrom logger log_info log_warn log_error log_debug log_threshold INFO
#'
#' @examples
#' # Example 1: Basic usage with all default parameters
#' cost_mapping <- list(
#'   "Lab Tests" = c("lab_test_cost1", "lab_test_cost2"),
#'   "Medication" = c("med_cost1", "med_cost2"),
#'   "Total" = "total_cost"
#' )
#' create_cost_summary(example_cost_data, cost_mapping,
#'                     "output_cost_table.docx", verbose = TRUE)
#'
#' # Example 2: Customizing statistics displayed in the table
#' create_cost_summary(example_cost_data, cost_mapping,
#'                     "custom_stats_table.docx",
#'                     cost_stats = c("mean", "median", "n"),
#'                     verbose = FALSE)
#'
#' # Example 3: Using with a filtered dataset and custom output path
#' filtered_data <- dplyr::filter(example_cost_data, year >= 2020)
#' create_cost_summary(filtered_data, cost_mapping,
#'                     "Data/filtered_cost_summary.docx",
#'                     cost_stats = c("mean", "median", "min", "max"),
#'                     verbose = TRUE)
#'
create_cost_summary <- function(cost_data, 
                                category_mapping, 
                                output_path, 
                                cost_stats = c("mean", "median", "sd", "q25", "q75", "n"),
                                verbose = TRUE) {
  
  # Set up logging based on verbose parameter
  if (verbose) {
    logger::log_threshold(logger::INFO)
    logger::log_info("Starting cost analysis summary function")
  } else {
    logger::log_threshold(logger::WARN)
  }
  
  # Wrap the entire function in tryCatch to handle all possible errors
  tryCatch({
    # Input validation
    validate_inputs(cost_data, category_mapping, output_path, cost_stats)
    
    # Log input parameters
    logger::log_info("Input data dimensions: {0} rows, {1} columns", 
                     nrow(cost_data), 
                     ncol(cost_data))
    logger::log_info("Categories to process: {0}", 
                     paste(names(category_mapping), collapse=', '))
    logger::log_info("Output path: {0}", output_path)
    logger::log_info("Statistics to include: {0}", 
                     paste(cost_stats, collapse=', '))
    
    # Ensure data is ungrouped
    cost_data <- dplyr::ungroup(cost_data)
    logger::log_info("Removed any pre-existing groupings from the data")
    
    # Compute summary statistics
    logger::log_info("Computing summary statistics by treatment group - excluding zeros and NAs")
    summary_stats <- compute_category_stats(cost_data, category_mapping)
    logger::log_info("Summary statistics calculation completed with {0} rows", nrow(summary_stats))
    
    # Simple flextable as fallback
    if (nrow(summary_stats) == 0) {
      logger::log_warn("No summary statistics generated - creating simple message table")
      simple_table <- data.frame(Message = "No data available after processing")
      cost_table <- flextable::flextable(simple_table)
    } else {
      # Create flextable with our complex approach
      logger::log_info("Creating formatted flextable")
      
      # Create a very simple flextable first, to ensure it works
      wide_data <- summary_stats %>%
        dplyr::select(sui_treatment, category, statistic, value) %>%
        tidyr::pivot_wider(
          id_cols = c(category, statistic),
          names_from = sui_treatment,
          values_from = value
        ) 
      
      cost_table <- flextable::flextable(wide_data)
      logger::log_info("Created basic flextable successfully")
      
      # Try to apply additional formatting
      cost_table <- try(create_formatted_table(summary_stats, cost_stats), silent = TRUE)
      
      # If that failed, use our simple version
      if (inherits(cost_table, "try-error")) {
        logger::log_warn("Error in creating formatted table: {0}", attr(cost_table, "condition")$message)
        logger::log_info("Falling back to simple flextable")
        cost_table <- flextable::flextable(wide_data)
      }
    }
    
    logger::log_info("Flextable creation completed")
    
    # Save to Word document
    logger::log_info("Saving flextable to Word document: {0}", output_path)
    save_result <- flextable::save_as_docx(cost_table, path = output_path)
    logger::log_info("Document saved successfully to: {0}", output_path)
    
    # Return results
    return(list(
      summary_stats = summary_stats,
      flextable = cost_table
    ))
    
  }, error = function(e) {
    # Log the error
    logger::log_error("Error in create_cost_summary: {0}", e$message)
    
    # Create a simple error table
    error_table <- data.frame(Error = paste("An error occurred:", e$message))
    simple_table <- flextable::flextable(error_table)
    
    # Try to save even the error table
    tryCatch({
      flextable::save_as_docx(simple_table, path = output_path)
      logger::log_info("Saved error message to document: {0}", output_path)
    }, error = function(save_error) {
      logger::log_error("Could not save error table: {0}", save_error$message)
    })
    
    # Return minimal results
    return(list(
      error = e$message,
      flextable = simple_table
    ))
  })
}

#' Validate function inputs
#'
#' @param cost_data Data frame to validate
#' @param category_mapping List to validate
#' @param output_path String to validate
#' @param cost_stats Character vector to validate
#'
#' @noRd
validate_inputs <- function(cost_data, category_mapping, output_path, cost_stats) {
  # Check cost_data
  assertthat::assert_that(is.data.frame(cost_data),
                          msg = "cost_data must be a data frame")
  
  # Check for required column
  assertthat::assert_that("sui_treatment" %in% names(cost_data),
                          msg = "cost_data must contain 'sui_treatment' column")
  
  # Check category_mapping
  assertthat::assert_that(is.list(category_mapping),
                          msg = "category_mapping must be a list")
  assertthat::assert_that(length(category_mapping) > 0,
                          msg = "category_mapping must not be empty")
  assertthat::assert_that(all(sapply(names(category_mapping), nchar) > 0),
                          msg = "All elements in category_mapping must be named")
  
  # Check output_path
  assertthat::assert_that(is.character(output_path),
                          msg = "output_path must be a character string")
  assertthat::assert_that(nchar(output_path) > 0,
                          msg = "output_path must not be empty")
  assertthat::assert_that(tools::file_ext(output_path) == "docx",
                          msg = "output_path must have a .docx extension")
  
  # Create directory if it doesn't exist
  output_dir <- dirname(output_path)
  if (!dir.exists(output_dir)) {
    logger::log_info("Creating directory: {output_dir}")
    dir.create(output_dir, recursive = TRUE)
  }
  
  # Check cost_stats
  assertthat::assert_that(is.character(cost_stats),
                          msg = "cost_stats must be a character vector")
  valid_stats <- c("mean", "median", "sd", "min", "max", "q25", "q75", "n")
  invalid_stats <- setdiff(cost_stats, valid_stats)
  assertthat::assert_that(length(invalid_stats) == 0,
                          msg = paste("Invalid statistics:", 
                                      paste(invalid_stats, collapse = ", ")))
}

#' Compute summary statistics for each cost category
#'
#' @param cost_data Data frame with cost data
#' @param category_mapping List mapping categories to columns
#'
#' @return A data frame with summary statistics in long format
#'
#' @noRd
compute_category_stats <- function(cost_data, category_mapping) {
  all_category_results <- NULL
  
  # Process each category
  for (category_name in names(category_mapping)) {
    logger::log_info("Processing category: {0}", category_name)
    cols_to_sum <- category_mapping[[category_name]]
    
    # Check if columns exist
    missing_cols <- cols_to_sum[!cols_to_sum %in% names(cost_data)]
    if (length(missing_cols) > 0) {
      logger::log_warn("Missing {0} columns for {1}: {2}", 
                       length(missing_cols), 
                       category_name, 
                       paste(missing_cols, collapse=', '))
      # Skip missing columns
      cols_to_sum <- cols_to_sum[cols_to_sum %in% names(cost_data)]
    }
    
    if (length(cols_to_sum) == 0) {
      logger::log_warn("No valid columns for category: {0}", category_name)
      next
    }
    
    # Calculate category costs
    logger::log_debug("Calculating category: {0} with {1} columns", 
                      category_name, 
                      length(cols_to_sum))
    category_results <- calculate_cost_category(cost_data, cols_to_sum, category_name)
    
    # Append to results
    if (is.null(all_category_results)) {
      all_category_results <- category_results
    } else {
      all_category_results <- dplyr::bind_rows(all_category_results, category_results)
    }
  }
  
  # Clean up statistic names and round values
  all_category_results <- all_category_results %>%
    dplyr::mutate(
      statistic = stringr::str_replace(statistic, "_value$", ""),
      # Round all values except n (which should already be an integer)
      value = ifelse(statistic == "n", value, round(value))
    ) %>%
    # Remove any rows where value is NA (final clean-up)
    dplyr::filter(!is.na(value))
  
  return(all_category_results)
}

#' Calculate statistics for a single cost category
#'
#' @param cost_data Data frame with cost data
#' @param cols_to_sum Columns to sum for this category
#' @param category_name Name of the category
#'
#' @return A data frame with category statistics
#'
#' @noRd
calculate_cost_category <- function(cost_data, cols_to_sum, category_name) {
  # Create a new dataframe with sum column and treatment
  if (length(cols_to_sum) == 1) {
    # If there's only one column, no need for rowSums
    sum_df <- cost_data %>%
      dplyr::select(sui_treatment, sum_column = !!cols_to_sum) %>%
      dplyr::ungroup()
    
    logger::log_debug("Single column summation for {0}: {1}", 
                      category_name, 
                      cols_to_sum)
  } else {
    # For multiple columns, use rowSums
    sum_df <- cost_data %>%
      dplyr::select(sui_treatment, dplyr::all_of(cols_to_sum)) %>%
      dplyr::ungroup() %>%
      dplyr::mutate(sum_column = rowSums(dplyr::across(dplyr::all_of(cols_to_sum)), 
                                         na.rm = TRUE)) %>%
      dplyr::select(sui_treatment, sum_column)
    
    logger::log_debug("Multiple column summation for {0}, {1} columns", 
                      category_name, 
                      length(cols_to_sum))
  }
  
  # Filter out zeros and NAs before calculating statistics
  sum_df <- sum_df %>%
    dplyr::filter(!is.na(sum_column), sum_column > 0)
  
  # If no non-zero records remain, create placeholder
  if (nrow(sum_df) == 0) {
    logger::log_warn("No non-zero values for category: {0}", category_name)
    # Create a dummy record to avoid errors, will be filtered out later
    sum_df <- tibble::tibble(sui_treatment = c("PT", "Pessary", "Sling"), 
                             sum_column = c(NA, NA, NA))
  }
  
  # Calculate stats for each treatment group
  logger::log_debug("Calculating treatment-specific stats for {category_name}")
  treatment_stats <- sum_df %>%
    dplyr::group_by(sui_treatment) %>%
    dplyr::summarize(
      mean_value = mean(sum_column, na.rm = TRUE),
      median_value = stats::median(sum_column, na.rm = TRUE),
      sd_value = stats::sd(sum_column, na.rm = TRUE),
      min_value = min(sum_column, na.rm = TRUE),
      max_value = max(sum_column, na.rm = TRUE),
      q25_value = stats::quantile(sum_column, 0.25, na.rm = TRUE),
      q75_value = stats::quantile(sum_column, 0.75, na.rm = TRUE),
      n_value = dplyr::n(),
      .groups = "drop"
    )
  
  # Calculate overall stats
  logger::log_debug("Calculating overall stats for {category_name}")
  all_sum_df <- sum_df %>%
    dplyr::filter(!is.na(sum_column))
  
  if (nrow(all_sum_df) > 0) {
    overall_stats <- all_sum_df %>%
      dplyr::summarize(
        sui_treatment = "Overall",
        mean_value = mean(sum_column, na.rm = TRUE),
        median_value = stats::median(sum_column, na.rm = TRUE),
        sd_value = stats::sd(sum_column, na.rm = TRUE),
        min_value = min(sum_column, na.rm = TRUE),
        max_value = max(sum_column, na.rm = TRUE),
        q25_value = stats::quantile(sum_column, 0.25, na.rm = TRUE),
        q75_value = stats::quantile(sum_column, 0.75, na.rm = TRUE),
        n_value = dplyr::n()
      )
  } else {
    # Create placeholder if no data
    overall_stats <- tibble::tibble(
      sui_treatment = "Overall",
      mean_value = NA_real_,
      median_value = NA_real_,
      sd_value = NA_real_,
      min_value = NA_real_,
      max_value = NA_real_,
      q25_value = NA_real_,
      q75_value = NA_real_,
      n_value = 0
    )
  }
  
  # Combine treatment-specific and overall stats
  combined_stats <- dplyr::bind_rows(treatment_stats, overall_stats)
  
  # Convert to long format and add category information
  category_result <- combined_stats %>%
    # Filter out the dummy records if they were created
    dplyr::filter(n_value > 0 | is.na(mean_value)) %>%
    # Convert to long format
    tidyr::pivot_longer(
      cols = -sui_treatment,
      names_to = "statistic",
      values_to = "value"
    ) %>%
    # Add category information
    dplyr::mutate(
      category = category_name,
      # Add provenance information
      provenance = paste("Columns Used:", paste(cols_to_sum, collapse = ", "))
    )
  
  logger::log_debug("Completed statistics for {category_name}")
  return(category_result)
}

#' Create a formatted flextable from summary statistics
#'
#' @param summary_stats Data frame with summary statistics
#' @param cost_stats Character vector of statistics to include
#'
#' @return A formatted flextable object
#'
#' @noRd
create_formatted_table <- function(summary_stats, cost_stats) {
  logger::log_debug("Starting to create formatted table")
  
  # Filter only requested statistics
  filtered_stats <- summary_stats %>%
    dplyr::filter(statistic %in% cost_stats)
  
  # Check if we have data after filtering
  if (nrow(filtered_stats) == 0) {
    logger::log_warn("No data available after filtering for requested statistics")
    # Return an empty flextable with a message
    empty_data <- data.frame(Message = "No data available for the requested statistics")
    return(flextable::flextable(empty_data))
  }
  
  # Log the structure to debug
  logger::log_debug("Filtered stats structure: {0} rows, {1} columns", 
                    nrow(filtered_stats), 
                    ncol(filtered_stats))
  logger::log_debug("Column names: {0}", 
                    paste(names(filtered_stats), collapse=", "))
  
  # Transform data for the table (wide format with categories as rows)
  # Be extra careful with the pivot operation
  tryCatch({
    table_data <- filtered_stats %>%
      dplyr::select(sui_treatment, category, statistic, value) %>%
      tidyr::pivot_wider(
        id_cols = c(category, statistic),
        names_from = sui_treatment,
        values_from = value
      ) %>%
      # Make category the first column and reorder rows by category then statistic
      dplyr::arrange(category, match(statistic, cost_stats))
    
    # Make sure we have a valid table_data
    if (nrow(table_data) == 0) {
      logger::log_warn("Pivot operation resulted in empty table")
      empty_data <- data.frame(Message = "No data available after pivot operation")
      return(flextable::flextable(empty_data))
    }
    
    # Add a grouping column (safely)
    table_data$cat_stat <- paste(table_data$category, table_data$statistic, sep = "_")
    
  }, error = function(e) {
    logger::log_error("Error during data transformation: {0}", e$message)
    empty_data <- data.frame(Message = paste("Error during data transformation:", e$message))
    return(flextable::flextable(empty_data))
  })
  
  logger::log_debug("Transformed data for table with {0} rows", nrow(table_data))
  
  # Get column names dynamically to handle missing columns correctly
  col_names <- names(table_data)
  logger::log_debug("Table columns: {0}", paste(col_names, collapse=', '))
  
  # Create flextable with only the columns present in the data
  cost_table <- flextable::flextable(table_data)
  
  # Get column names (without cat_stat which we'll hide later)
  visible_cols <- setdiff(col_names, "cat_stat")
  logger::log_debug("Visible columns: {0}", paste(visible_cols, collapse=', '))
  
  # First, apply basic formatting
  cost_table <- cost_table %>%
    flextable::theme_vanilla() %>%
    flextable::fontsize(size = 10, part = "all") %>%
    flextable::bold(j = "category", part = "body")
  
  # Hide the cat_stat column if it exists
  if ("cat_stat" %in% col_names) {
    cost_table <- flextable::hide(cost_table, j = "cat_stat")
  }
  
  # Create a simplified header
  # First, delete the default header
  cost_table <- flextable::delete_part(cost_table, part = "header")
  
  # Then, add a new header
  cost_table <- flextable::add_header_row(
    cost_table,
    values = visible_cols,
    top = TRUE
  )
  
  # Center align header
  cost_table <- flextable::align(cost_table, align = "center", part = "header")
  
  # Right align data columns
  data_cols <- setdiff(visible_cols, c("category", "statistic"))
  if (length(data_cols) > 0) {
    cost_table <- flextable::align(cost_table, j = data_cols, align = "right")
  }
  
  # Only align treatment columns that exist
  treatment_cols <- intersect(c("PT", "Pessary", "Sling", "Overall"), col_names)
  if (length(treatment_cols) > 0) {
    cost_table <- cost_table %>%
      flextable::align(j = treatment_cols, align = "right")
  }
  
  # Apply other formatting
  cost_table <- cost_table %>%
    flextable::bold(j = "category", part = "body") %>%
    flextable::fontsize(size = 10, part = "all")
  
  # Try to apply vertical merging for categories
  try({
    unique_categories <- unique(table_data$category)
    for (cat in unique_categories) {
      cat_indices <- which(table_data$category == cat)
      if (length(cat_indices) > 1) {
        # Merge category cells
        cost_table <- flextable::merge_v(cost_table, j = "category", part = "body")
      }
    }
  }, silent = TRUE)
  
  # Format statistic names to be more readable
  stat_labels <- c(
    "mean" = "Mean",
    "median" = "Median",
    "sd" = "SD",
    "min" = "Min",
    "max" = "Max",
    "q25" = "25th %",
    "q75" = "75th %",
    "n" = "N"
  )
  
  # Update statistic labels
  try({
    cost_table <- flextable::set_formatter(
      cost_table,
      statistic = function(x) stat_labels[x]
    )
  }, silent = TRUE)
  
  # Try to add borders to separate categories
  try({
    for (i in 1:(length(unique_categories) - 1)) {
      last_row_of_category <- max(which(table_data$category == unique_categories[i]))
      cost_table <- flextable::hline(
        cost_table,
        i = last_row_of_category,
        border = officer::fp_border(color = "gray", width = 1)
      )
    }
  }, silent = TRUE)
  
  # Add a better header (rename columns)
  header_labels <- c(
    "category" = "Cost Category",
    "statistic" = "Statistic",
    "PT" = "PT",
    "Pessary" = "Pessary",
    "Sling" = "Sling", 
    "Overall" = "Overall"
  )
  
  # Rename headers that exist in the table
  for (col in names(header_labels)) {
    if (col %in% visible_cols) {
      cost_table <- flextable::set_header_labels(cost_table, 
                                                 values = setNames(header_labels[col], col))
    }
  }
  
  logger::log_info("Flextable formatting completed")
  return(cost_table)
}



#' Generate Cohort Description Summary for SUI Treatments
#' 
#' @description
#' Creates a standardized descriptive summary of the stress urinary incontinence 
#' treatment cohort, including treatment distribution, demographics, and 
#' comorbidity information. Supports both CSV and RDS input files. The function
#' calculates percentages for treatment types, racial demographics, and summarizes
#' age and comorbidity statistics.
#' 
#' @param cohort_path Path to the cohort descriptors file (CSV or RDS). Must contain
#'   columns: race, age, and elixhauser comorbidity columns (prefixed with 'elix_')
#' @param uncategorized_path Path to the uncategorized age table file (CSV or RDS).
#'   Must contain column: 'Stress Urinary Incontinence Therapies'
#' @param verbose Logical indicating whether to print detailed logging messages
#' 
#' @return A character string containing the formatted cohort description
#' 
#' @importFrom dplyr filter summarise group_by n mutate
#' @importFrom tidyr drop_na
#' @importFrom stats median
#' @importFrom assertthat assert_that
#' @importFrom logger log_info
#' @importFrom tools file_ext
#' 
#' @examples
#' \dontrun{
#' # Basic usage with CSV files
#' cohort_summary <- generate_cohort_description(
#'   cohort_path = "_Cohort_descriptors.csv",
#'   uncategorized_path = "Uncategorized_age_Table1.csv",
#'   verbose = FALSE
#' )
#' cat(cohort_summary)
#' 
#' # Using RDS files with verbose logging
#' cohort_summary <- generate_cohort_description(
#'   cohort_path = "cohort_descriptors.rds",
#'   uncategorized_path = "uncategorized_age.rds",
#'   verbose = TRUE
#' )
#' 
#' # Using mixed file types (CSV and RDS)
#' cohort_summary <- generate_cohort_description(
#'   cohort_path = "cohort_descriptors.rds",
#'   uncategorized_path = "uncategorized_data.csv",
#'   verbose = TRUE
#' )
#' }
generate_cohort_description <- function(cohort_path, uncategorized_path, 
                                        verbose = FALSE) {
  # Validate inputs
  assertthat::assert_that(is.character(cohort_path))
  assertthat::assert_that(is.character(uncategorized_path))
  assertthat::assert_that(is.logical(verbose))
  assertthat::assert_that(file.exists(cohort_path))
  assertthat::assert_that(file.exists(uncategorized_path))
  
  # Initialize logging if verbose
  if (verbose) {
    logger::log_info("Starting cohort description generation")
    logger::log_info("Reading cohort data from: {cohort_path}")
    logger::log_info("Reading uncategorized data from: {uncategorized_path}")
  }
  
  # Read data files
  cohort_data <- read_data_file(cohort_path, verbose)
  uncategorized_data <- read_data_file(uncategorized_path, verbose)
  
  # Validate required columns
  validate_cohort_data(cohort_data)
  validate_uncategorized_data(uncategorized_data)
  
  # Calculate statistics
  treatment_stats <- calculate_treatment_distribution(uncategorized_data, verbose)
  demographic_stats <- calculate_demographics(cohort_data, verbose)
  comorbidity_count <- calculate_comorbidities(cohort_data, verbose)
  
  # Format the description
  description <- sprintf(
    "Among %d participants, treatments included pessary (%.1f%%), PFMT (%.1f%%), and sling surgery (%.1f%%). Most participants were White (%.1f%%), with a median age of %d and diagnosed with %d comorbidities.",
    nrow(cohort_data),
    treatment_stats$pessary_pct,
    treatment_stats$pfmt_pct,
    treatment_stats$sling_pct,
    demographic_stats$white_pct,
    demographic_stats$median_age,
    comorbidity_count
  )
  
  if (verbose) {
    logger::log_info("Generated cohort description: {description}")
  }
  
  return(description)
}

#' @noRd
read_data_file <- function(file_path, verbose) {
  assertthat::assert_that(assertthat::is.readable(file_path))
  
  file_ext <- tools::file_ext(file_path)
  assertthat::assert_that(
    file_ext %in% c("csv", "rds"),
    msg = "File must be either CSV or RDS format"
  )
  
  if (verbose) {
    logger::log_info("Reading {file_ext} file: {file_path}")
  }
  
  data <- switch(file_ext,
                 "csv" = utils::read.csv(file_path, stringsAsFactors = FALSE),
                 "rds" = readRDS(file_path)
  )
  
  assertthat::assert_that(
    is.data.frame(data),
    msg = "File must contain a data frame"
  )
  
  if (verbose) {
    logger::log_info("Successfully read {nrow(data)} rows from {file_path}")
  }
  
  return(data)
}

#' @noRd
validate_cohort_data <- function(cohort_data) {
  required_cols <- c("race", "age")
  assertthat::assert_that(
    all(required_cols %in% names(cohort_data)),
    msg = sprintf("Cohort data missing required columns: %s", 
                  paste(setdiff(required_cols, names(cohort_data)), collapse = ", "))
  )
  
  # Check for at least one elixhauser column
  assertthat::assert_that(
    any(grepl("^elix_", names(cohort_data))),
    msg = "Cohort data must contain at least one elixhauser comorbidity column (prefix: 'elix_')"
  )
}

#' @noRd
validate_uncategorized_data <- function(uncategorized_data) {
  required_cols <- c("Stress Urinary Incontinence Therapies")
  assertthat::assert_that(
    all(required_cols %in% names(uncategorized_data)),
    msg = sprintf("Uncategorized data missing required columns: %s",
                  paste(setdiff(required_cols, names(uncategorized_data)), 
                        collapse = ", "))
  )
}

#' @noRd
calculate_treatment_distribution <- function(uncategorized_data, verbose) {
  if (verbose) logger::log_info("Calculating treatment distribution")
  
  treatment_counts <- dplyr::group_by(
    uncategorized_data, 
    `Stress Urinary Incontinence Therapies`
  ) %>%
    dplyr::summarise(count = dplyr::n(), .groups = "drop") %>%
    dplyr::mutate(pct = count / sum(count) * 100)
  
  # Define treatment categories
  treatment_categories <- c(
    pessary = "Pessary_index_costs_23JAN2023_1",
    pfmt = "PT_index_costs_23JAN2023_1",
    sling = "UI Sling"
  )
  
  # Calculate percentages for each treatment
  stats <- lapply(treatment_categories, function(category) {
    pct <- treatment_counts$pct[treatment_counts$`Stress Urinary Incontinence Therapies` == category]
    if (length(pct) == 0) pct <- 0
    return(pct)
  })
  
  treatment_stats <- list(
    pessary_pct = stats$pessary,
    pfmt_pct = stats$pfmt,
    sling_pct = stats$sling
  )
  
  if (verbose) {
    logger::log_info(paste0(
      "Treatment distribution: ",
      "Pessary=", round(treatment_stats$pessary_pct, 1), "%, ",
      "PFMT=", round(treatment_stats$pfmt_pct, 1), "%, ",
      "Sling=", round(treatment_stats$sling_pct, 1), "%"
    ))
  }
  
  return(treatment_stats)
}

#' @noRd
calculate_demographics <- function(cohort_data, verbose) {
  if (verbose) logger::log_info("Calculating demographic statistics")
  
  # Handle potential NA values in race
  white_count <- sum(cohort_data$race == "White", na.rm = TRUE)
  total_count <- sum(!is.na(cohort_data$race))
  white_pct <- white_count / total_count * 100
  
  # Calculate median age, handling potential NA values
  median_age <- stats::median(cohort_data$age, na.rm = TRUE)
  
  stats <- list(
    white_pct = white_pct,
    median_age = median_age
  )
  
  if (verbose) {
    logger::log_info(sprintf(
      "Demographics: White=%.1f%%, Median age=%d",
      stats$white_pct,
      stats$median_age
    ))
  }
  
  return(stats)
}

#' @noRd
calculate_comorbidities <- function(cohort_data, verbose) {
  if (verbose) logger::log_info("Calculating comorbidity counts")
  
  # Get all elixhauser columns
  elix_columns <- grep("^elix_", names(cohort_data), value = TRUE)
  
  # Calculate comorbidity count per patient
  comorbidity_counts <- rowSums(
    cohort_data[, elix_columns, drop = FALSE],
    na.rm = TRUE
  )
  
  # Calculate median
  median_comorbidities <- stats::median(comorbidity_counts, na.rm = TRUE)
  
  if (verbose) {
    logger::log_info(sprintf("Median comorbidities: %d", median_comorbidities))
  }
  
  return(median_comorbidities)
}


# Get IQR values based on treatment
get_iqr <- function(treatment) {
  if (treatment == "PFMT") {
    c(q1 = pfmt_q1, q3 = pfmt_q3)
  } else if (treatment == "Pessary") {
    c(q1 = pessary_q1, q3 = pessary_q3)
  } else {
    c(q1 = sling_q1, q3 = sling_q3)
  }
}

# Format currency
format_currency <- function(x) {
  formatC(x, format = "f", big.mark = ",", digits = 0)
}


# Function to calculate percentage change
calc_percent_change <- function(coef) {
  (exp(coef) - 1) * 100
}



# Function to compute summary statistics for different trimming levels
compute_trimmed_stats <- function(data, trim_percent) {
  lower <- quantile(data$Total_cost, trim_percent, na.rm = TRUE)
  upper <- quantile(data$Total_cost, 1 - trim_percent, na.rm = TRUE)
  trimmed_data <- data %>%
    filter(Total_cost >= lower & Total_cost <= upper)
  
  tibble(
    `Trim Level` = paste0(trim_percent * 100, "%"),
    `Mean ($)` = paste0("$", formatC(mean(trimmed_data$Total_cost, na.rm = TRUE), format = "f", big.mark = ",", digits = 0)),
    `Median ($)` = paste0("$", formatC(median(trimmed_data$Total_cost, na.rm = TRUE), format = "f", big.mark = ",", digits = 0)),
    `Standard Deviation ($)` = paste0("$", formatC(sd(trimmed_data$Total_cost, na.rm = TRUE), format = "f", big.mark = ",", digits = 0)),
    `Min ($)` = paste0("$", formatC(min(trimmed_data$Total_cost, na.rm = TRUE), format = "f", big.mark = ",", digits = 0)),
    `Max ($)` = paste0("$", formatC(max(trimmed_data$Total_cost, na.rm = TRUE), format = "f", big.mark = ",", digits = 0)),
    `Sample Size` = formatC(nrow(trimmed_data), big.mark = ",")
  )
}


## Adjusted R-squared -----

# Function to plot the adjusted R-squared value
plot_adjusted_r_squared <- function(log_model) {
  # Extract adjusted R-squared value
  adj_r_squared <- summary(log_model)$adj.r.squared
  
  # Create a tibble for plotting
  adj_r_squared_df <- tibble(
    Metric = "Adjusted R-Squared",
    Value = adj_r_squared
  )
  
  # Plot using ggplot2
  p <- ggplot(adj_r_squared_df, aes(x = Metric, y = Value)) +
    geom_bar(stat = "identity", fill = "skyblue") +
    geom_text(aes(label = round(Value, 4)), vjust = -0.5) +
    ylim(0, 1) +
    labs(title = "Adjusted R-Squared Value",
         y = "Value",
         x = "") +
    theme_minimal()
  
  print(p)
}



## Regression Parameter Table -----
# Custom function to tidy and exponentiate model parameters
custom_tidy_exponentiate <- function(x, ...) {
  broom.helpers::tidy_parameters(x, ...) %>%
    dplyr::mutate(
      estimate = exp(.data$estimate),
      conf.low = exp(.data$conf.low),
      conf.high = exp(.data$conf.high)
    )
}









#' Generate Treatment Cost Comparison Text
#' 
#' @description
#' Creates a detailed cost comparison analysis text between different SUI treatments
#' (Pessary, PFMT, and Sling), including statistical testing and formatting.
#' 
#' @param treatment_cost_filepath Character string specifying path to the cost data CSV
#' @param verbose Logical indicating whether to print detailed logs. Default: FALSE
#' @param cost_column Character string specifying the cost column. Default: "Total_cost"
#' 
#' @return Character string containing formatted cost comparison text
#' 
#' @importFrom dplyr filter mutate group_by summarise across all_of contains 
#'             ungroup n_distinct select case_when distinct if_else
#' @importFrom tidyr replace_na drop_na
#' @importFrom stats median wilcox.test
#' @importFrom readr read_csv
#' @importFrom logger log_info log_debug log_error log_threshold
#' @importFrom assertthat assert_that is.string
#' @importFrom scales comma dollar
#' @importFrom rlang sym .data
#' 
#' @examples
#' \dontrun{
#' # Basic usage with default settings
#' comparison_text <- generate_treatment_cost_text(
#'   treatment_cost_filepath = "path/to/costs.csv",
#'   verbose = FALSE,
#'   cost_column = "Total_cost"
#' )
#' 
#' # With verbose logging enabled
#' comparison_text <- generate_treatment_cost_text(
#'   treatment_cost_filepath = "path/to/costs.csv",
#'   verbose = TRUE,
#'   cost_column = "Total_cost"
#' )
#' 
#' # Using a custom cost column name
#' comparison_text <- generate_treatment_cost_text(
#'   treatment_cost_filepath = "path/to/costs.csv",
#'   verbose = TRUE,
#'   cost_column = "AdjustedCost"
#' )
#' }
# generate_treatment_cost_text <- function(treatment_cost_filepath,
#                                        verbose = FALSE,
#                                        cost_column = "Total_cost") {
#     
#     # Configure logging based on verbose parameter
#     logger::log_threshold(if(verbose) logger::DEBUG else logger::INFO)
#     logger::log_info("Starting treatment cost analysis")
#     
#     tryCatch({
#         # Validate inputs
#         assertthat::assert_that(
#             assertthat::is.string(treatment_cost_filepath),
#             assertthat::is.string(cost_column),
#             is.logical(verbose),
#             file.exists(treatment_cost_filepath)
#         )
#         
#         # Read and validate data
#         logger::log_debug(sprintf("Reading data from %s", treatment_cost_filepath))
#         treatment_data <- readr::read_csv(
#             treatment_cost_filepath,
#             show_col_types = FALSE,
#             progress = FALSE
#         )
#         
#         logger::log_info(sprintf("Processing %d treatment records", 
#                                nrow(treatment_data)))
#         
#         # Get facility cost columns
#         new_op_cols <- grep("OP_facility_new_lvl", names(treatment_data), 
#                           value = TRUE)
#         return_op_cols <- grep("OP_facility_return_lvl", names(treatment_data), 
#                              value = TRUE)
#         consult_op_cols <- grep("OP_facility_consult_lvl", names(treatment_data), 
#                               value = TRUE)
#         procedure_cols <- grep(paste0("OP_facility_(UA|urineCX|microscopy|cathPVR|",
#                                     "usPVR|uroflowmetry|cystometrics|UDS|vUDS|",
#                                     "cystoscopy|UTeval)"), 
#                              names(treatment_data), value = TRUE)
#         complication_cols <- grep("complication", names(treatment_data), 
#                                 value = TRUE)
#         
#         # Log column counts
#         logger::log_debug(sprintf("Found %d new visit columns", length(new_op_cols)))
#         logger::log_debug(sprintf("Found %d return visit columns", 
#                                 length(return_op_cols)))
#         logger::log_debug(sprintf("Found %d consult columns", 
#                                 length(consult_op_cols)))
#         logger::log_debug(sprintf("Found %d procedure columns", 
#                                 length(procedure_cols)))
#         logger::log_debug(sprintf("Found %d complication columns", 
#                                 length(complication_cols)))
#         
#         # Process data
#         processed_data <- treatment_data %>%
#             dplyr::mutate(
#                 sui_treatment = dplyr::case_when(
#                     sui_treatment == "PT" ~ "PFMT",
#                     sui_treatment %in% c("Pessary", "Sling", "PFMT") ~ 
#                         sui_treatment,
#                     TRUE ~ NA_character_
#                 ),
#                 outpatient_new_total = rowSums(
#                     dplyr::select(., dplyr::all_of(new_op_cols)) %>% 
#                         replace(is.na(.), 0)
#                 ),
#                 outpatient_return_total = rowSums(
#                     dplyr::select(., dplyr::all_of(return_op_cols)) %>% 
#                         replace(is.na(.), 0)
#                 ),
#                 outpatient_consult_total = rowSums(
#                     dplyr::select(., dplyr::all_of(consult_op_cols)) %>% 
#                         replace(is.na(.), 0)
#                 ),
#                 outpatient_procedure_total = rowSums(
#                     dplyr::select(., dplyr::all_of(procedure_cols)) %>% 
#                         replace(is.na(.), 0)
#                 ),
#                 outpatient_total = outpatient_new_total + outpatient_return_total + 
#                     outpatient_consult_total + outpatient_procedure_total,
#                 complication_total = rowSums(
#                     dplyr::select(., dplyr::all_of(complication_cols)) %>% 
#                         replace(is.na(.), 0)
#                 )
#             ) %>%
#             tidyr::drop_na(sui_treatment, WU_ID, dplyr::all_of(cost_column)) %>%
#             dplyr::distinct(WU_ID, .keep_all = TRUE)
#         
#         # Log treatment summaries
#         treatment_summary <- processed_data %>%
#             dplyr::group_by(sui_treatment) %>%
#             dplyr::summarise(
#                 n = dplyr::n(),
#                 new_med = median(outpatient_new_total),
#                 return_med = median(outpatient_return_total),
#                 consult_med = median(outpatient_consult_total),
#                 procedure_med = median(outpatient_procedure_total),
#                 total_med = median(outpatient_total),
#                 .groups = "drop"
#             )
#         
#         logger::log_debug("Available treatments after distinct WU_ID: %s",
#                          paste(unique(processed_data$sui_treatment), 
#                                collapse = ", "))
#         
#         for(i in 1:nrow(treatment_summary)) {
#             logger::log_debug(sprintf(
#                 "%s: n=%d, New=$%.2f, Return=$%.2f, Consult=$%.2f, Procedure=$%.2f, Total=$%.2f",
#                 treatment_summary$sui_treatment[i],
#                 treatment_summary$n[i],
#                 treatment_summary$new_med[i],
#                 treatment_summary$return_med[i],
#                 treatment_summary$consult_med[i],
#                 treatment_summary$procedure_med[i],
#                 treatment_summary$total_med[i]
#             ))
#         }
#         
#         logger::log_debug("Data processing complete")
#         
#         # Calculate cost statistics
#         logger::log_debug("Calculating cost statistics")
#         cost_stats <- processed_data %>%
#             dplyr::group_by(sui_treatment) %>%
#             dplyr::summarise(
#                 median_total = stats::median(.data[[cost_column]], na.rm = TRUE),
#                 median_outpatient = stats::median(outpatient_total, na.rm = TRUE),
#                 median_complication = stats::median(complication_total, 
#                                                   na.rm = TRUE),
#                 n_patients = dplyr::n(),
#                 .groups = "drop"
#             ) %>%
#             dplyr::arrange(sui_treatment)
#         
#         # Log cost statistics
#         for(i in 1:nrow(cost_stats)) {
#             logger::log_debug(sprintf(
#                 "%s: Total=$%s, Outpatient=$%s, Complications=$%s, n=%d",
#                 cost_stats$sui_treatment[i],
#                 scales::comma(cost_stats$median_total[i]),
#                 scales::comma(cost_stats$median_outpatient[i]),
#                 scales::comma(cost_stats$median_complication[i]),
#                 cost_stats$n_patients[i]
#             ))
#         }
#         
#         # Perform statistical tests
#         logger::log_debug("Performing statistical tests")
#         
#         # Pessary vs Sling
#         pessary_sling_data <- processed_data %>%
#             dplyr::filter(sui_treatment %in% c("Pessary", "Sling"))
#         
#         logger::log_debug("Sample sizes for Pessary vs Sling test:")
#         logger::log_debug(sprintf("Pessary: %d", 
#                                 sum(pessary_sling_data$sui_treatment == "Pessary")))
#         logger::log_debug(sprintf("Sling: %d", 
#                                 sum(pessary_sling_data$sui_treatment == "Sling")))
#         
#         pessary_sling_test <- wilcox.test(
#             as.formula(sprintf("%s ~ sui_treatment", cost_column)),
#             data = pessary_sling_data,
#             exact = FALSE
#         )
#         
#         # Pessary vs PFMT
#         pessary_pfmt_data <- processed_data %>%
#             dplyr::filter(sui_treatment %in% c("Pessary", "PFMT"))
#         
#         logger::log_debug("Sample sizes for Pessary vs PFMT test:")
#         logger::log_debug(sprintf("Pessary: %d", 
#                                 sum(pessary_pfmt_data$sui_treatment == "Pessary")))
#         logger::log_debug(sprintf("PFMT: %d", 
#                                 sum(pessary_pfmt_data$sui_treatment == "PFMT")))
#         
#         pessary_pfmt_test <- wilcox.test(
#             as.formula(sprintf("%s ~ sui_treatment", cost_column)),
#             data = pessary_pfmt_data,
#             exact = FALSE
#         )
#         
#         # Generate comparison text
#         logger::log_debug("Generating comparison text")
#         
#         # Extract statistics for text
#         pessary_stats <- cost_stats %>% dplyr::filter(sui_treatment == "Pessary")
#         sling_stats <- cost_stats %>% dplyr::filter(sui_treatment == "Sling")
#         pfmt_stats <- cost_stats %>% dplyr::filter(sui_treatment == "PFMT")
#         
#         # Format p-values
#         p_sling <- if(pessary_sling_test$p.value < 0.01) "p<0.01" else 
#             sprintf("p=%.2f", pessary_sling_test$p.value)
#         p_pfmt <- if(pessary_pfmt_test$p.value < 0.01) "p<0.01" else 
#             sprintf("p=%.2f", pessary_pfmt_test$p.value)
#         
#         comparison_text <- sprintf(
#             paste(
#                 "Overall, pessary care remained %s expensive compared to sling",
#                 "surgery (median per-patient cost: $%s vs. $%s, Mann-Whitney U",
#                 "test, %s), but %s costly than PFMT (median per-patient cost: $%s",
#                 "vs. $%s, Mann-Whitney U test, %s). The median outpatient facility",
#                 "cost for pessary treatment was %s than that for sling surgery but",
#                 "%s than PFMT ($%s vs. $%s vs. $%s). Analysis based on %d unique",
#                 "pessary, %d unique sling, and %d unique PFMT patients."
#             ),
#             ifelse(pessary_stats$median_total < sling_stats$median_total, 
#                    "less", "more"),
#             scales::comma(pessary_stats$median_total),
#             scales::comma(sling_stats$median_total),
#             p_sling,
#             ifelse(pessary_stats$median_total > pfmt_stats$median_total, 
#                    "more", "less"),
#             scales::comma(pessary_stats$median_total),
#             scales::comma(pfmt_stats$median_total),
#             p_pfmt,
#             ifelse(pessary_stats$median_outpatient > sling_stats$median_outpatient,
#                    "higher", "lower"),
#             ifelse(pessary_stats$median_outpatient > pfmt_stats$median_outpatient,
#                    "higher", "lower"),
#             scales::comma(pessary_stats$median_outpatient),
#             scales::comma(sling_stats$median_outpatient),
#             scales::comma(pfmt_stats$median_outpatient),
#             pessary_stats$n_patients,
#             sling_stats$n_patients,
#             pfmt_stats$n_patients
#         )
#         
#         logger::log_debug("Text generation complete")
#         return(comparison_text)
#         
#     }, error = function(err) {
#         logger::log_error(sprintf("Error in treatment cost analysis: %s", 
#                                 err$message))
#         stop(err)
#     })
# }




# Helper function to add thousandths commas
format_with_commas <- function(x) {
  formatC(x, format = "f", big.mark = ",", digits = 0)
}

result_section_pairwise_difference <- function(data, numeric_var, x_var) {
  # Log inputs
  logger::log_info("Starting result_section_pairwise_difference...")
  logger::log_info("Input tibble dimensions: {nrow(data)} rows, {ncol(data)} columns.")
  logger::log_info("Analyzing numeric variable: {numeric_var}, grouped by: {x_var}.")
  
  # Validation for unique groups
  if (length(unique(data[[x_var]])) < 2) {
    stop(glue::glue("The variable '{x_var}' must contain at least two unique groups for pairwise comparisons."))
  }
  
  # Step 1: Perform pairwise comparisons
  pairwise_trends <- perform_pairwise_tests(data, numeric_var, x_var)
  
  # Step 2: Analyze trends
  direction_summary <- analyze_directionality(pairwise_trends, x_var)
  significance_summary <- analyze_significance(pairwise_trends, x_var)
  
  # Step 3: Combine trends
  trend_summary <- combine_trends(direction_summary, significance_summary)
  
  # Step 4: Calculate medians and percentiles for each group
  logger::log_info("Calculating medians and percentiles for each group in {x_var}...")
  summary_stats <- data %>%
    dplyr::group_by(.data[[x_var]]) %>%
    dplyr::summarise(
      Median = format_with_commas(round(median(.data[[numeric_var]], na.rm = TRUE))),
      Q1 = format_with_commas(round(quantile(.data[[numeric_var]], 0.25, na.rm = TRUE))),
      Q3 = format_with_commas(round(quantile(.data[[numeric_var]], 0.75, na.rm = TRUE))),
      .groups = "drop"
    )
  
  logger::log_info("Medians and percentiles calculated successfully.")
  
  # Step 5: Generate interpretive sentences
  logger::log_info("Generating interpretive sentences...")
  interpretation_sentences <- generate_interpretations(pairwise_trends, summary_stats, x_var)
  
  # Log outputs
  logger::log_info("Pairwise trend analysis completed successfully.")
  logger::log_info("Returning pairwise trends, trend summary, and interpretation sentences.")
  
  return(list(pairwise_trends = pairwise_trends, trend_summary = trend_summary, interpretation_sentences = interpretation_sentences))
}

# Helper function: Perform pairwise Kruskal-Wallis tests
perform_pairwise_tests <- function(data, numeric_var, x_var) {
  logger::log_info("Step 1: Performing pairwise Kruskal-Wallis tests...")
  groups <- unique(data[[x_var]])
  logger::log_info("Unique groups identified in {x_var}: {length(groups)}")
  
  results <- combn(groups, 2, simplify = FALSE, FUN = function(pair) {
    logger::log_info("Comparing groups: {pair[1]} vs {pair[2]}")
    
    group1 <- dplyr::filter(data, .data[[x_var]] == pair[1])[[numeric_var]]
    group2 <- dplyr::filter(data, .data[[x_var]] == pair[2])[[numeric_var]]
    
    test_result <- stats::kruskal.test(list(group1, group2))
    coef_direction <- mean(group1, na.rm = TRUE) - mean(group2, na.rm = TRUE)
    direction <- ifelse(coef_direction > 0, "Higher", "Lower")
    
    tibble::tibble(
      Group1 = pair[1],
      Group2 = pair[2],
      Direction = direction,
      p_value = test_result$p.value,
      p_value_formatted = ifelse(
        test_result$p.value < 0.01, "p<0.01",
        stringr::str_c("p=", round(test_result$p.value, 3))
      )
    )
  }) %>%
    dplyr::bind_rows()
  
  logger::log_info("Pairwise comparisons completed. Total comparisons: {nrow(results)}.")
  return(results)
}

# Helper function: Analyze directionality trends
analyze_directionality <- function(pairwise_trends, x_var) {
  logger::log_info("Step 2: Analyzing directionality trends...")
  
  direction_summary <- pairwise_trends %>%
    dplyr::group_by(Group1, Group2) %>%
    dplyr::summarise(
      Higher = sum(Direction == "Higher"),
      Lower = sum(Direction == "Lower"),
      .groups = "drop"
    )
  
  logger::log_info("Directionality analysis completed successfully.")
  return(direction_summary)
}

# Helper function: Analyze significance trends
analyze_significance <- function(pairwise_trends, x_var) {
  logger::log_info("Step 3: Analyzing significance trends...")
  
  significance_summary <- pairwise_trends %>%
    dplyr::filter(p_value < 0.05) %>%
    dplyr::group_by(Group1, Group2) %>%
    dplyr::summarise(Significant = n(), .groups = "drop")
  
  logger::log_info("Significance analysis completed successfully.")
  return(significance_summary)
}

# Helper function: Combine trends
combine_trends <- function(direction_summary, significance_summary) {
  logger::log_info("Step 4: Combining directionality and significance trends...")
  
  trend_summary <- dplyr::left_join(
    direction_summary, significance_summary, by = c("Group1", "Group2")
  ) %>%
    dplyr::mutate(Significant = ifelse(is.na(Significant), 0, Significant))
  
  logger::log_info("Trends combined successfully.")
  return(trend_summary)
}

# Helper function: Generate interpretive sentences
generate_interpretations <- function(pairwise_trends, summary_stats, x_var) {
  sentences <- pairwise_trends %>%
    dplyr::rowwise() %>%
    dplyr::mutate(
      Sentence = {
        group1_stats <- summary_stats %>% dplyr::filter(.data[[x_var]] == Group1)
        group2_stats <- summary_stats %>% dplyr::filter(.data[[x_var]] == Group2)
        
        glue::glue(
          "Payments for {Group1} (median: ${group1_stats$Median}; IQR [${group1_stats$Q1} - ${group1_stats$Q3}]) are ",
          "{tolower(Direction)} than payments for {Group2} (median: ${group2_stats$Median}; IQR [${group2_stats$Q1} - ${group2_stats$Q3}], {p_value_formatted})."
        )
      }
    ) %>%
    dplyr::pull(Sentence)
  
  logger::log_info("Interpretive sentences generated successfully.")
  return(sentences)
}

3 Describe the Data

In January 2023, the Center for Administrative Data and Research provided data to AUGS under an agreement signed in 2019. The original payment data was highly right-skewed. I removed the bottom 5% and top 5% of patients with total costs. There are only 50 rows in each of these dataframes to give a flavor of what is needed.

3.1 _Costs_32_including_5 and_95.csv

This dataset contains raw cost data for stress urinary incontinence (SUI) treatments before filtering out the top 5% and bottom 5% of the Total_cost variable. Note the Total_cost of $0 for some patients. Each row represents an individual Medicare Fee-For-Service patient, identified by a unique WU_ID, who received one of several SUI treatments, such as pessary, pelvic floor muscle therapy (PFMT), or sling surgery. The dataset includes a wide range of cost-related variables, capturing facility fees, professional fees, diagnostic testing costs, and complication-related expenses. The Total_cost column reflects the overall Medicare spending for each patient’s treatment episode, including all associated medical services. This unfiltered version of the data helps researchers understand the full distribution of treatment costs, including extreme high-cost and low-cost cases. By later removing the top and bottom 5%, the data will provide a more stable estimate of typical Medicare expenditures while minimizing the influence of outliers. This dataset serves as the foundation for cost analysis, helping to identify trends, disparities, and financial burdens associated with different SUI treatments.

file.exists("/Users/tylermuffly/Dropbox (Personal)/AlternativePayments/alternative_payments/Data/CADR_2023/final_push/_Costs_32_including_5 and_95.csv")

[1] TRUE

Costs_32_including_5_and_95 <- readr::read_csv("/Users/tylermuffly/Dropbox (Personal)/AlternativePayments/alternative_payments/Data/CADR_2023/final_push/_Costs_32_including_5 and_95.csv") %>%
  head(50); Costs_32_including_5_and_95

DT::datatable(Costs_32_including_5_and_95)

3.2 _Costs_step33.rds

\[ \begin{aligned} \text{Total_cost} &= \text{new_lvl1_cost} + \text{new_lvl2_cost} + \text{new_lvl3_cost} + \text{new_lvl4_cost} + \text{new_lvl5_cost} \\ &+ \text{return_lvl1_cost} + \text{return_lvl2_cost} + \text{return_lvl3_cost} + \text{return_lvl4_cost} + \text{return_lvl5_cost} \\ &+ \text{consult_lvl1_cost} + \text{consult_lvl2_cost} + \text{consult_lvl3_cost} + \text{consult_lvl4_cost} + \text{consult_lvl5_cost} \\ &+ \text{UA_cost} + \text{urineCX_cost} + \text{microscopy_cost} + \text{cathPVR_cost} + \text{usPVR_cost} + \text{uroflowmetry_cost} \\ &+ \text{cystometrics_cost} + \text{UDS_cost} + \text{vUDS_cost} + \text{cystoscopy_cost} + \text{UTeval_cost} \\ &+ \text{IP_facility_cost} + \text{OP_facility_cost} + \text{ASC_facility_cost} + \text{anesth_cost} + \text{complication_cost} \\ &+ \text{EM_cost} + \text{edOP_facility_cost} + \text{edprofessional_cost} \end{aligned} \]

3.3 Summing strategy

For aggregating component costs into total episode costs:

\[ \text{Total Cost}_i = \sum_{c=1}^{C} \text{Facility Cost}_{ci} + \sum_{p=1}^{P} \text{Professional Cost}_{pi} + \sum_{d=1}^{D} \text{Diagnostic Cost}_{di} + \sum_{m=1}^{M} \text{Complication Cost}_{mi} \] Where: - $\text{Facility Cost}_{ci}$ represents the $c$-th facility cost component for patient $i$ - $\text{Professional Cost}_{pi}$ represents the $p$-th professional fee for patient $i$ - $\text{Diagnostic Cost}_{di}$ represents the $d$-th diagnostic test cost for patient $i$ - $\text{Complication Cost}_{mi}$ represents the $m$-th complication-related cost for patient $i$

3.4 Appendix A

\[P(\text{inclusion})_i = \begin{cases} 1 & \text{if } \text{BIC}_i \mod 100 \in S \\ 0 & \text{otherwise} \end{cases}\]

Where: - $\text{BIC}_i$ is the Beneficiary Identification Code for beneficiary $i$ - $S$ is the set of values for the last two digits that determine inclusion (approximately 5% of possible values)

This equation formalizes how Medicare’s 5% sample is created. It shows that a beneficiary is included in the sample (probability = 1) only if the last two digits of their identification number (calculated as BIC mod 100) match any value in set S, which contains approximately 5 out of 100 possible values. Otherwise, they’re excluded (probability = 0). This method ensures random yet consistent sampling across the Medicare population, producing a representative 5% subset for research and analysis.

This dataset contains detailed healthcare cost information for patients who underwent stress urinary incontinence (SUI) treatments. It consists of 160,323 rows and 81 columns, with each row representing an individual cost entry related to a patient’s treatment episode. The data tracks three main treatment types (indicated in the ‘sui_treatment’ column): pelvic floor muscle therapy, pessary, and sling procedures. The majority of columns capture various healthcare costs broken down by setting (outpatient, inpatient, ambulatory surgical center) and provider type (facility vs. professional fees). These costs are further categorized by service type, including office visits of different complexity levels (new, return, and consultation visits), diagnostic tests (urinalysis, urine culture, microscopy, urodynamic studies, cystoscopy), and complication-related expenses. Patient demographic information is included with variables for age, race, insurance status (dual Medicare-Medicaid coverage), and geographic location (state, region, and division). Each cost entry is identified by a unique patient ID (WU_ID) and episode number, and is classified by its relationship to the index procedure (pre or post treatment) and cost type. The ‘Total_cost’ column provides the aggregate expense for each entry. This comprehensive dataset allows for detailed analysis of healthcare expenditures associated with different SUI treatment approaches across various patient populations and geographic regions.

df_cost <- readr::read_rds("/Users/tylermuffly/Dropbox (Personal)/AlternativePayments/alternative_payments/Data/CADR_2023/final_push/_Costs_step33.rds") %>% 
  select(WU_ID, everything()) %>%  # Moves WU_ID to the first column
  head(50)
  
DT::datatable(df_cost)

3.5 _Margie_Jerry_check_in_Costs.rds

This dataset provides a concise summary of healthcare costs for stress urinary incontinence treatments, containing 16,695 rows with 4 columns of information. Each row represents a unique patient, identified by their “WU_ID” (Washington University ID), who underwent treatment. The “episode_type” column categorizes the type of treatment each patient received, likely indicating whether they had pelvic floor muscle therapy, pessary insertion, or a sling procedure. The “year_episode_start” column shows when each patient’s treatment began, allowing for temporal analysis of treatment patterns and costs over time. Finally, the “Total_cost” column quantifies the complete financial expense associated with each patient’s treatment episode in integer dollars. This streamlined dataset appears to be an aggregated version of a more detailed cost database, focusing on patient-level total costs rather than itemized expenses. It would be valuable for analyzing overall treatment costs, tracking expenditure trends over time, and comparing the financial impact of different treatment approaches for stress urinary incontinence across the patient population.

Margie_Jerry_check_in_Costs <- readr::read_rds("Data/CADR_2023/final_push/_Margie_Jerry_check_in_Costs.rds") %>%
  select(WU_ID, everything()) %>%  # Moves WU_ID to the first column
  head(50)
  
DT::datatable(Margie_Jerry_check_in_Costs)

3.6 _Cohort_descriptors

This dataset provides a comprehensive view of 16,695 patients who underwent treatment for stress urinary incontinence (SUI). The data includes detailed patient information across 135 columns, covering demographics (age, race), treatment type (in the “sui_treatment” column), and insurance status (dual coverage indicator). Notably, the dataset contains extensive medical comorbidity indicators (prefixed with “elix_”) that follow the Elixhauser comorbidity index, capturing conditions like congestive heart failure, diabetes, obesity, and depression. The financial aspects of care are tracked through various cost columns divided by facility type (inpatient, outpatient, ambulatory surgical center) and provider type (facility versus professional fees). A substantial portion of the dataset is dedicated to tracking complications and additional procedures, with binary indicators (0/1) for numerous conditions like urinary tract infections, pain, fever, and surgical site infections, each prefixed with “index_” and duplicated with “edindex_” prefixes to distinguish regular encounters from emergency department visits. Geographic information is included through state codes, regions, and divisions, allowing for spatial analysis of treatment patterns. The dataset also contains episode identifiers, dates, and total cost information, making it suitable for comprehensive analysis of SUI treatment outcomes, complications, and healthcare economics across different patient populations and geographic areas.

Cohort_descriptors <- readr::read_rds("Data/CADR_2023/final_push/_Cohort_descriptors.rds") %>%
  select(WU_ID, everything()) %>%  # Moves WU_ID to the first column
  head(50)
  
#DT::datatable(Cohort_descriptors)

3.7 Linear regression pretty.rds

This dataset presents a simplified structure for analyzing stress urinary incontinence (SUI) treatments among 16,695 patients. Each row represents an individual patient, with 15 columns capturing key factors related to their treatment, demographics, and health conditions. The “StressUrinaryIncontinenceTreatment” column identifies the specific intervention each patient received (likely pessary, pelvic floor muscle therapy, or sling surgery). Patient demographics are captured through the “Age” (categorized into age groups rather than exact years), “Race,” and “Insurance” status columns. Geographic information is represented by “USCensusBureauDivision,” indicating the patient’s regional location. The financial aspect of care is consolidated in the “SumOfPaymentForTreatment” column, which shows the total cost in integer dollars for each patient’s treatment. The dataset also includes information about the provider through the “TreatingPhysician” column, likely categorizing physicians by specialty. Six health indicators are included as binary variables (Yes/No format): five pre-existing conditions (“NeurologicalDisorderBeforeSUI,” “ChronicLungDisorderBeforeSUI,” “DepressionBeforeSUI,” “DiabetesBeforeSUI,” and “ObesityBeforeSUI”), as well as tobacco use and the presence of urinary tract infections during treatment. The “Year” column enables temporal analysis of treatment patterns from 2008-2016. Overall, this dataset appears structured for regression analysis to identify factors that influence SUI treatment costs and outcomes.

Linear_regression_pretty <- readr::read_rds("Data/CADR_2023/final_push/Linear regression pretty.rds") %>%
  head(50)
  
DT::datatable(Linear_regression_pretty)

3.8 March2_table1.rds

This dataset captures demographic and treatment information for 16,695 patients who received therapy for stress urinary incontinence. It contains 19 columns that combine patient characteristics with cost data. The primary treatment information is stored in both “Stress Urinary Incontinence Therapies” and “sui_treatment1” columns, likely representing the same information in different formats. Patient demographics include age (categorized in ranges rather than specific years), race, tobacco use status, and dual eligibility for Medicare and Medicaid. Geographic information is provided through the “US Census Bureau Subdivision” column, allowing for regional analysis. The dataset includes temporal data with “Year therapy performed” showing when treatments occurred. Five columns track outpatient facility consultation costs of varying complexity levels (from level 1 to level 5). The remaining columns appear to contain information about treatment episodes, with duplicated columns for “episode_type,” “year_episode_start,” and “Total_cost” (with “.x” and “.y” suffixes suggesting these might be from different merged data sources). These columns likely capture the type of treatment episode, when it began, and its associated total cost. This dataset seems designed to support the creation of a descriptive Table 1 for a research paper analyzing stress urinary incontinence treatments among Medicare beneficiaries across different demographic groups.

March2_table1 <- readr::read_rds("Data/CADR_2023/final_push/March2_table1.rds") %>%
  head(50)
  
DT::datatable(March2_table1)

3.9 yearly_summary_Cochran_armitage.rds

This dataset provides a yearly summary of stress urinary incontinence treatments from what appears to be a 9-year period, structured specifically for Cochran-Armitage trend testing. Each row represents a single year (from the “Year” column), likely spanning from 2008 to 2016. The remaining six columns are organized in pairs to facilitate statistical analysis of treatment trends over time. The first pair shows the count of patients who received pessary treatment (“Pessary”) versus those who did not (“Non-pessary”) for each year. Similarly, the second pair contrasts patients who underwent pelvic floor muscle therapy (“PT”) against those who did not (“Non-PT”), while the third pair compares sling procedure recipients (“Sling”) to non-recipients (“Non-sling”). This structure allows researchers to analyze how the proportion of patients receiving each treatment has changed over the study period. The Cochran-Armitage test specifically examines whether there is a significant trend in the proportions across ordered groups (in this case, years). With only 9 rows, this is a concise dataset designed for statistical trend analysis rather than patient-level analysis, providing a longitudinal view of treatment pattern changes over time.

yearly_summary_Cochran_armitage <- readr::read_rds("Data/CADR_2023/final_push/yearly_summary_Cochran_armitage.rds") %>%
  head(50)
  
DT::datatable(yearly_summary_Cochran_armitage)

3.10 Provider_specialty.rds

This dataset focuses on provider specialty information for 16,695 patients who received treatment for stress urinary incontinence. Each row represents a unique patient, identified by their “WU_ID” (Washington University ID) and “episode_number.” The “sui_treatment” column indicates which intervention the patient received (likely pessary, pelvic floor muscle therapy, or sling surgery). The dataset captures provider specialty information at two different time points: “specialty_pre6m_specialty_car_and_fac” shows the specialty of providers who treated the patient in the six months before their SUI treatment, while “specialty_index_specialty_car_and_fac” indicates the specialty of the provider who performed the actual SUI treatment. This allows researchers to analyze referral patterns and determine which specialists are most commonly treating SUI. Temporal data is included through two year columns (“year_episode_start.x” and “year_episode_start.y,” which may represent the same information from different data sources), showing when each treatment episode began. The “episode_type” column likely provides additional categorization of the treatment approach, and “Total_cost” captures the financial expense associated with each patient’s treatment in integer dollars. This dataset would be valuable for analyzing how provider specialty influences treatment selection, costs, and possibly outcomes for stress urinary incontinence.

Provider_specialty <- readr::read_rds("Data/CADR_2023/final_push/Provider_specialty.rds") %>%
  select(WU_ID, everything()) %>%  # Moves WU_ID to the first column
  head(50)
  
DT::datatable(Provider_specialty)

# Source the functions from external file
# Make sure to specify the complete path to your functions file
source("/Users/tylermuffly/Dropbox (Personal)/AlternativePayments/alternative_payments/Code/histograms.R", local = knitr::knit_global())

4 METHODS

5 Five Percent and Ninety-Five Percent Trimming due to Data Inconsistencies

Here’s the LaTeX equation for the Data Trimming Approach:

\[ \text{Trimmed Dataset} = \{y_i \in Y : F^{-1}(\alpha/2) \leq y_i \leq F^{-1}(1-\alpha/2)\} \]

Where: - $Y$ is the original cost dataset - $F^{-1}$ is the inverse cumulative distribution function - $\alpha$ is the total proportion trimmed (e.g., $\alpha = 0.10$ for 5% trimming from each tail) - $y_i$ represents individual cost observations

This mathematical formulation precisely defines how the trimming process works by keeping only those observations that fall between the $\alpha/2$ and $1-\alpha/2$ percentiles of the distribution, effectively removing extreme values from both tails.

# Run the function
hist_plots <- create_combined_cost_histogram(
  file_path = "Data/CADR_2023/final_push/_Costs_32_including_5 and_95.csv",
  output_dir = "Data/CADR_2023/final_push/",
  bins = 100,
  zoom_range = c(0, 6000),
  year_label = "2016",
  create_faceted = TRUE,
  verbose = TRUE
)

# View the combined histogram with overlaid episode types
hist_plots$combined

# View the faceted histogram with bin range information
hist_plots$faceted

# Load dataset
df <- read_csv("Data/CADR_2023/final_push/_Costs_32_including_5 and_95.csv")

# Compute trimming thresholds
lower_bound <- quantile(df$Total_cost, 0.05, na.rm = TRUE)
upper_bound <- quantile(df$Total_cost, 0.95, na.rm = TRUE)

# Create trimmed dataset
df_trimmed <- df %>%
  filter(Total_cost >= lower_bound & Total_cost <= upper_bound)

# Compute median values for reference lines
median_before <- median(df$Total_cost, na.rm = TRUE)
median_after <- median(df_trimmed$Total_cost, na.rm = TRUE)

# Create density plot with improvements
p <- ggplot() +
  geom_density(data = df, aes(x = Total_cost, fill = "Before Trimming"), alpha = 0.3, color = "blue", size = 1) +
  geom_density(data = df_trimmed, aes(x = Total_cost, fill = "After Trimming"), alpha = 0.5, color = "red", size = 1) +
  geom_vline(xintercept = lower_bound, color = "black", linetype = "dashed", size = 0.7) +
  geom_vline(xintercept = upper_bound, color = "black", linetype = "dashed", size = 0.7) +
  geom_vline(xintercept = median_before, color = "blue", linetype = "solid", size = 1) +
  geom_vline(xintercept = median_after, color = "red", linetype = "solid", size = 1) +
  scale_fill_manual(values = c("Before Trimming" = "blue", "After Trimming" = "red")) +
  scale_x_continuous(limits = c(0, 15000), breaks = seq(0, 15000, by = 2000), labels = scales::comma) +
  ggtitle("Impact of 5% Trimming on Medicare Fee-For-Service Payments") +
  xlab("Total Medicare Costs ($)") +
  ylab("Density") +
  theme_minimal() +
  theme(legend.position = "top", text = element_text(size = 14))

# Save the plot
ggsave(filename = "medicare_cost_density.png", plot = p, width = 10, height = 6, dpi = 300)

5.0.1 Sensitivity Analysis: Overall Trimming Levels

flex_table

Sensitivity Analysis: Overall Trimming Levels
Trim Level	Mean ($)	Median ($)	Standard Deviation ($)	Min ($)	Max ($)	Sample Size
0%	$995	$652	$1,189	$0	$39,429	18,544
1%	$930	$638	$875	$0	$4,997	18,358
5%	$861	$652	$673	$40	$2,861	16,695
10%	$811	$652	$558	$113	$2,151	14,844
15%	$776	$652	$476	$172	$1,822	12,989
20%	$746	$651	$404	$222	$1,593	11,145

5.0.2 Log-Transforming and Visualizing Cost Data

print(p)

5.1 Who was eliminated by the trimming?

# Print formatted summary statistics
knitr::kable(
  overall_stats %>%
    mutate(
      percent = sprintf("%.1f%%", percent),
      mean_cost = dollar(mean_cost),
      median_cost = dollar(median_cost),
      min_cost = dollar(min_cost),
      max_cost = dollar(max_cost),
      sd_cost = dollar(sd_cost)
    ),
  caption = "Cost Statistics by Inclusion/Exclusion Category"
)

Cost Statistics by Inclusion/Exclusion Category
exclusion_category	count	percent	mean_cost	median_cost	min_cost	max_cost	sd_cost
Excluded (bottom 5%)	921	5.0%	$8.14	$3	$0	$39	$10.81
Included (middle 90%)	16695	90.0%	$860.65	$652	$40	$2,861	$673.16
Excluded (top 5%)	928	5.0%	$4,396.45	$3,704	$2,864	$39,429	$2,695.65

# Print episode type distribution
knitr::kable(
  episode_distribution %>%
    mutate(percent = sprintf("%.1f%%", percent)),
  caption = "Distribution of Episode Types by Exclusion Category"
)

Distribution of Episode Types by Exclusion Category
exclusion_category	episode_type	count	percent
Excluded (bottom 5%)	PT	894	97.1%
Excluded (bottom 5%)	Pessary	27	2.9%
Excluded (top 5%)	UI Sling	848	91.4%
Excluded (top 5%)	PT	65	7.0%
Excluded (top 5%)	Pessary	15	1.6%
Included (middle 90%)	Pessary	7717	46.2%
Included (middle 90%)	PT	5556	33.3%
Included (middle 90%)	UI Sling	3422	20.5%

# Print episode exclusion rates
knitr::kable(
  episode_exclusion_rates %>%
    mutate(percent = sprintf("%.1f%%", percent)),
  caption = "Percentage of Each Episode Type Excluded"
)

Percentage of Each Episode Type Excluded
episode_type	exclusion_category	count	percent	total
PT	Excluded (bottom 5%)	894	13.7%	6515
PT	Excluded (top 5%)	65	1.0%	6515
PT	Included (middle 90%)	5556	85.3%	6515
Pessary	Excluded (bottom 5%)	27	0.3%	7759
Pessary	Excluded (top 5%)	15	0.2%	7759
Pessary	Included (middle 90%)	7717	99.5%	7759
UI Sling	Excluded (top 5%)	848	19.9%	4270
UI Sling	Included (middle 90%)	3422	80.1%	4270

# 5. Visualizations ----------------------------------------------------------
# Cost distribution by exclusion category
cost_density_plot <- ggplot(original_data, aes(x = Total_cost, fill = exclusion_category)) +
  geom_density(alpha = 0.6) +
  scale_fill_viridis_d() +
  labs(
    title = "Cost Distribution by Inclusion/Exclusion Category",
    x = "Total Cost ($)",
    y = "Density",
    fill = "Category"
  ) +
  theme_minimal() +
  theme(legend.position = "bottom") +
  scale_x_continuous(labels = dollar_format(), limits = c(0, 5000)) +
  scale_y_continuous(limits = c(0, 0.002)); cost_density_plot

# Create a trimming visualization
trimming_viz <- ggplot(original_data, aes(x = Total_cost, fill = exclusion_category)) +
  geom_histogram(bins = 50, position = "identity", alpha = 0.6) +
  geom_vline(xintercept = lower_threshold, linetype = "dashed", color = "red") +
  geom_vline(xintercept = upper_threshold, linetype = "dashed", color = "red") +
  scale_fill_viridis_d() +
  labs(
    title = "Distribution of Costs with 5% Trimming Thresholds",
    x = "Total Cost ($)",
    y = "Count",
    fill = "Category"
  ) +
  theme_minimal() +
  theme(legend.position = "bottom") +
  scale_x_continuous(labels = dollar_format(), limits = c(0, 5000)); trimming_viz

# Stacked bar chart of episode types by exclusion category
# Ensure the exclusion_category is a factor with the desired order
episode_distribution <- episode_distribution %>%
  mutate(exclusion_category = factor(
    exclusion_category,
    levels = c("Excluded (bottom 5%)", "Included (middle 90%)", "Excluded (top 5%)")
  ))

# Create stacked bar chart with ordered x-axis and labels
episode_plot <- ggplot(
  episode_distribution,
  aes(x = exclusion_category, y = percent, fill = episode_type)
) +
  geom_col() +
  scale_fill_viridis_d() +
  labs(
    title = "Episode Type Distribution by Inclusion/Exclusion Category",
    x = "Category",
    y = "Percentage",
    fill = "Episode Type"
  ) +
  theme_minimal() +
  theme(
    legend.position = "bottom",
    axis.text.x = element_text(angle = 45, hjust = 1)
  ) +
  # Add percentage labels for values over 10%
  geom_text(
    aes(label = ifelse(percent > 10, sprintf("%.1f%%", percent), "")),
    position = position_stack(vjust = 0.5),
    color = "grey25",
    fontface = "bold"
  ); episode_plot

# Save the plot
ggsave(filename = "cost_density_plot.png", plot = cost_density_plot, width = 10, height = 6, dpi = 300)

# Save the plot
ggsave(filename = "trimming_viz.png", plot = trimming_viz, width = 10, height = 6, dpi = 300)

# Save the plot
ggsave(filename = "episode_plot.png", plot = episode_plot, width = 10, height = 6, dpi = 300)

# 6. Log transformation analysis ---------------------------------------------
# Apply log transformation to better visualize skewed cost data
original_data <- original_data %>%
  mutate(log_cost = log(pmax(Total_cost, 1)))

# Create density plot of log-transformed costs
log_cost_plot <- ggplot(original_data, aes(x = log_cost, fill = exclusion_category)) +
  geom_density(alpha = 0.6) +
  scale_fill_viridis_d() +
  labs(
    title = "Log-Transformed Cost Distribution",
    x = "Log(Total Cost)",
    y = "Density",
    fill = "Category"
  ) +
  theme_minimal() +
  theme(legend.position = "bottom")

print(log_cost_plot)

# 8. Stratified Analysis by Episode Type -------------------------------------
# Calculate trimming thresholds by episode type
type_specific_thresholds <- original_data %>%
  group_by(episode_type) %>%
  summarise(
    lower_threshold = quantile(Total_cost, 0.05),
    upper_threshold = quantile(Total_cost, 0.95),
    median_cost = median(Total_cost),
    mean_cost = mean(Total_cost)
  )

# Print type-specific thresholds
knitr::kable(
  type_specific_thresholds %>%
    mutate(
      lower_threshold = dollar(lower_threshold),
      upper_threshold = dollar(upper_threshold),
      median_cost = dollar(median_cost),
      mean_cost = dollar(mean_cost)
    ),
  caption = "Trimming Thresholds by Episode Type"
)

Trimming Thresholds by Episode Type
episode_type	lower_threshold	upper_threshold	median_cost	mean_cost
PT	$0.00	$1,939.30	$397.00	$637.06
Pessary	$123.00	$1,535.10	$448.00	$608.25
UI Sling	$964.45	$4,864.30	$1,849.50	$2,244.99

# Create stratified exclusion flags based on type-specific thresholds
stratified_data <- original_data %>%
  left_join(
    type_specific_thresholds,
    by = "episode_type"
  ) %>%
  mutate(
    stratified_exclusion = case_when(
      Total_cost < lower_threshold ~ "Excluded (bottom 5% within type)",
      Total_cost > upper_threshold ~ "Excluded (top 5% within type)",
      TRUE ~ "Included (middle 90% within type)"
    )
  )

# Compare uniform vs. stratified trimming
trimming_comparison <- stratified_data %>%
  group_by(episode_type) %>%
  summarise(
    total_count = n(),
    excluded_uniform = sum(exclusion_category != "Included (middle 90%)"),
    excluded_stratified = sum(stratified_exclusion != "Included (middle 90% within type)"),
    percent_uniform = excluded_uniform / total_count * 100,
    percent_stratified = excluded_stratified / total_count * 100
  )

# Print trimming comparison
knitr::kable(
  trimming_comparison %>%
    mutate(
      percent_uniform = sprintf("%.1f%%", percent_uniform),
      percent_stratified = sprintf("%.1f%%", percent_stratified)
    ),
  caption = "Comparison of Uniform vs. Stratified Trimming by Episode Type"
)

Comparison of Uniform vs. Stratified Trimming by Episode Type
episode_type	total_count	excluded_uniform	excluded_stratified	percent_uniform	percent_stratified
PT	6515	959	326	14.7%	5.0%
Pessary	7759	42	767	0.5%	9.9%
UI Sling	4270	848	428	19.9%	10.0%

# 9. Cost comparison visualization by episode type ---------------------------
# Create boxplots of costs by episode type
episode_boxplot <- ggplot(original_data, aes(x = episode_type, y = Total_cost, fill = episode_type)) +
  geom_boxplot(outlier.shape = 1, outlier.size = 1, alpha = 0.7) +
  scale_fill_viridis_d() +
  labs(
    title = "Cost Distribution by Episode Type",
    x = "Episode Type",
    y = "Total Cost ($)",
    fill = "Episode Type"
  ) +
  theme_minimal() +
  theme(legend.position = "none") +
  scale_y_continuous(labels = dollar_format(), limits = c(0, 5000))

# Create violin plots for better distribution visualization
episode_violin <- ggplot(original_data, aes(x = episode_type, y = Total_cost, fill = episode_type)) +
  geom_violin(trim = FALSE, alpha = 0.7) +
  geom_boxplot(width = 0.1, outlier.shape = NA, alpha = 0.5) +
  scale_fill_viridis_d() +
  labs(
    title = "Cost Distribution by Episode Type",
    x = "Episode Type",
    y = "Total Cost ($)",
    fill = "Episode Type"
  ) +
  theme_minimal() +
  theme(legend.position = "none") +
  scale_y_continuous(labels = dollar_format(), limits = c(0, 5000))

# Display plots side by side
episode_cost_plots <- episode_boxplot + episode_violin
print(episode_cost_plots)

# 10. Save results -----------------------------------------------------------
# Create a PDF report with all visualizations and tables
#pdf("healthcare_cost_trimming_analysis.pdf", width = 11, height = 8.5)

# Print all plots and tables
print(combined_plots)

print(log_cost_plot)

print(episode_cost_plots)

# Close PDF
#dev.off()

# Export key tables to CSV
write_csv(overall_stats, "Data/CADR_2023/final_push/overall_cost_statistics.csv")
write_csv(episode_distribution, "episode_type_distribution.csv")
write_csv(episode_exclusion_rates, "episode_exclusion_rates.csv")
write_csv(type_specific_thresholds, "episode_type_thresholds.csv")
write_csv(trimming_comparison, "trimming_comparison.csv")

# Healthcare Cost Trimming Analysis with Corrected Terminology
# This script analyzes the impact of 5% trimming on healthcare cost data
# and exports tables and visualizations to Word documents

# Create the output directory if it doesn't exist
output_dir <- "Data/CADR_2023/final_push/"
if (!dir.exists(output_dir)) {
  dir.create(output_dir, recursive = TRUE)
}

# 1. Load datasets -----------------------------------------------------------
# Original dataset (before trimming)
original_data <- read_csv("Data/CADR_2023/final_push/_Costs_32_including_5 and_95.csv") %>%
  # Rename episode_type to sui_treatment for consistency
  rename(sui_treatment = episode_type)

# Trimmed dataset (after removing top and bottom 5%)
trimmed_data <- read_csv("Data/CADR_2023/final_push/_Margie_Jerry_check_in_Costs.csv") %>%
  # Rename episode_type to sui_treatment for consistency
  rename(sui_treatment = episode_type)

# Load demographic data
demographic_data <- read_csv("Data/CADR_2023/final_push/_Cohort_descriptors.csv")

# 2. Basic Analysis of Inclusion/Exclusion -----------------------------------
# Create a flag in the original dataset to identify trimmed vs excluded patients
original_data <- original_data %>%
  mutate(inclusion_status = ifelse(WU_ID %in% trimmed_data$WU_ID, "Included", "Excluded"))

# Calculate cost thresholds
lower_threshold <- quantile(original_data$Total_cost, 0.05)
upper_threshold <- quantile(original_data$Total_cost, 0.95)

# Add more detailed exclusion category
original_data <- original_data %>%
  mutate(
    exclusion_category = case_when(
      inclusion_status == "Included" ~ "Included (middle 90%)",
      Total_cost < lower_threshold ~ "Excluded (bottom 5%)",
      Total_cost > upper_threshold ~ "Excluded (top 5%)",
      TRUE ~ "Other"
    )
  )

# 3. Calculate summary statistics --------------------------------------------
# Overall statistics
overall_stats <- original_data %>%
  group_by(exclusion_category) %>%
  summarise(
    count = n(),
    percent = n() / nrow(original_data) * 100,
    mean_cost = mean(Total_cost),
    median_cost = median(Total_cost),
    min_cost = min(Total_cost),
    max_cost = max(Total_cost),
    sd_cost = sd(Total_cost)
  ) %>%
  arrange(factor(exclusion_category, levels = c(
    "Excluded (bottom 5%)", "Included (middle 90%)", "Excluded (top 5%)"
  )))

# Export overall stats to Word using flextable
ft_overall_stats <- flextable(
  overall_stats %>%
    mutate(
      percent = sprintf("%.1f%%", percent),
      mean_cost = dollar(mean_cost),
      median_cost = dollar(median_cost),
      min_cost = dollar(min_cost),
      max_cost = dollar(max_cost),
      sd_cost = dollar(sd_cost)
    )
) %>%
  set_caption("Cost Statistics by Inclusion/Exclusion Category") %>%
  theme_booktabs() %>%
  autofit() %>%
  bold(part = "header") %>%
  bg(bg = "#f5f5f5", part = "header") %>%
  width(width = 1.2) %>%
  colformat_double(digits = 0) %>%
  set_header_labels(
    exclusion_category = "Exclusion Category",
    count = "Count",
    percent = "Percentage",
    mean_cost = "Mean Cost",
    median_cost = "Median Cost",
    min_cost = "Minimum Cost",
    max_cost = "Maximum Cost",
    sd_cost = "Standard Deviation"
  )

# Save to Word document
doc <- read_docx() %>%
  body_add_par("Cost Statistics by Inclusion/Exclusion Category", style = "heading 1") %>%
  body_add_par("This table shows key cost statistics for patients included in the middle 90% versus those excluded in the bottom and top 5% of costs.", style = "Normal") %>%
  body_add_flextable(ft_overall_stats)

print(doc, target = paste0(output_dir, "overall_cost_statistics.docx"))

# 4. Treatment type analysis ---------------------------------------------------
# Distribution of treatment types by exclusion category
treatment_distribution <- original_data %>%
  group_by(exclusion_category, sui_treatment) %>%
  summarise(count = n(), .groups = "drop") %>%
  group_by(exclusion_category) %>%
  mutate(
    percent = count / sum(count) * 100,
    sui_treatment = factor(sui_treatment)
  ) %>%
  ungroup() %>%
  arrange(exclusion_category, desc(percent))

# Export treatment distribution to Word
ft_treatment_distribution <- flextable(
  treatment_distribution %>%
    mutate(percent = sprintf("%.1f%%", percent))
) %>%
  set_caption("Distribution of Treatment Types by Exclusion Category") %>%
  theme_booktabs() %>%
  autofit() %>%
  bold(part = "header") %>%
  bg(bg = "#f5f5f5", part = "header") %>%
  set_header_labels(
    exclusion_category = "Exclusion Category",
    sui_treatment = "Treatment Type",
    count = "Count",
    percent = "Percentage"
  )

doc <- read_docx() %>%
  body_add_par("Treatment Type Distribution by Exclusion Category", style = "heading 1") %>%
  body_add_par("This table shows how different treatment types (SUI treatments) are distributed within each inclusion/exclusion category.", style = "Normal") %>%
  body_add_flextable(ft_treatment_distribution)

print(doc, target = paste0(output_dir, "treatment_type_distribution.docx"))

# Calculate what percentage of each treatment type is excluded
treatment_exclusion_rates <- original_data %>%
  group_by(sui_treatment, exclusion_category) %>%
  summarise(count = n(), .groups = "drop") %>%
  group_by(sui_treatment) %>%
  mutate(
    percent = count / sum(count) * 100,
    total = sum(count)
  ) %>%
  ungroup() %>%
  arrange(sui_treatment, exclusion_category)

# Export treatment exclusion rates to Word
ft_treatment_exclusion <- flextable(
  treatment_exclusion_rates %>%
    mutate(percent = sprintf("%.1f%%", percent))
) %>%
  set_caption("Percentage of Each Treatment Type Excluded") %>%
  theme_booktabs() %>%
  autofit() %>%
  bold(part = "header") %>%
  bg(bg = "#f5f5f5", part = "header") %>%
  set_header_labels(
    sui_treatment = "Treatment Type",
    exclusion_category = "Category",
    count = "Count",
    percent = "Percentage",
    total = "Total Treatments"
  )

doc <- read_docx() %>%
  body_add_par("Treatment Type Exclusion Analysis", style = "heading 1") %>%
  body_add_par("This table shows what percentage of each treatment type (SUI treatment) is excluded due to either low or high costs.", style = "Normal") %>%
  body_add_flextable(ft_treatment_exclusion)

print(doc, target = paste0(output_dir, "treatment_exclusion_rates.docx"))

# 5. Visualizations with improved styling ------------------------------------
# Set a custom color palette
custom_colors <- c(
  "Excluded (bottom 5%)" = "#5D3A9B", 
  "Excluded (top 5%)" = "#1E9B8A", 
  "Included (middle 90%)" = "#FDE725"
)

treatment_colors <- c(
  "PT" = "#1E9B8A",
  "Pessary" = "#5D3A9B", 
  "UI Sling" = "#FDE725"
)

# Create improved cost density plot
cost_density_plot <- ggplot(original_data, aes(x = Total_cost, fill = exclusion_category)) +
  geom_density(alpha = 0.7) +
  scale_fill_manual(values = custom_colors) +
  labs(
    title = "Cost Distribution by Inclusion/Exclusion Category",
    subtitle = "Comparing density distributions of healthcare costs",
    x = "Total Cost ($)",
    y = "Density",
    fill = "Category"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    legend.position = "bottom",
    plot.title = element_text(face = "bold"),
    panel.grid.minor = element_blank(),
    axis.title = element_text(face = "bold")
  ) +
  scale_x_continuous(labels = dollar_format(), limits = c(0, 5000))

# Create a better trimming visualization with annotations
trimming_viz <- ggplot(original_data, aes(x = Total_cost, fill = exclusion_category)) +
  geom_histogram(bins = 50, position = "identity", alpha = 0.7) +
  geom_vline(xintercept = lower_threshold, linetype = "dashed", color = "red", size = 1) +
  geom_vline(xintercept = upper_threshold, linetype = "dashed", color = "red", size = 1) +
  scale_fill_manual(values = custom_colors) +
  labs(
    title = "Distribution of Costs with 5% Trimming Thresholds",
    subtitle = paste0("Lower threshold: $", round(lower_threshold), ", Upper threshold: $", round(upper_threshold)),
    x = "Total Cost ($)",
    y = "Count",
    fill = "Category"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    legend.position = "bottom",
    plot.title = element_text(face = "bold"),
    panel.grid.minor = element_blank(),
    axis.title = element_text(face = "bold")
  ) +
  scale_x_continuous(labels = dollar_format(), limits = c(0, 5000)) +
  # Add annotations to highlight thresholds
  annotate(
    "text", 
    x = lower_threshold - 200, 
    y = 1700, 
    label = "5th percentile", 
    color = "red",
    hjust = 1,
    fontface = "bold"
  ) +
  annotate(
    "text", 
    x = upper_threshold + 200, 
    y = 1700, 
    label = "95th percentile", 
    color = "red",
    hjust = 0,
    fontface = "bold"
  )

# Improved stacked bar chart of treatment types
treatment_plot <- ggplot(
  treatment_distribution,
  aes(x = exclusion_category, y = percent, fill = sui_treatment)
) +
  geom_col() +
  scale_fill_manual(values = treatment_colors) +
  labs(
    title = "Treatment Type Distribution by Inclusion/Exclusion Category",
    subtitle = "Shows different treatment patterns among excluded and included patients",
    x = "Category",
    y = "Percentage (%)",
    fill = "Treatment Type"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    legend.position = "bottom",
    plot.title = element_text(face = "bold"),
    axis.text.x = element_text(angle = 45, hjust = 1),
    panel.grid.minor = element_blank(),
    axis.title = element_text(face = "bold")
  ) +
  # Add percentage labels on the bars
  geom_text(
    aes(label = sprintf("%.1f%%", percent), group = sui_treatment),
    position = position_stack(vjust = 0.5),
    color = "white",
    fontface = "bold"
  )

# Combine plots 
combined_plots <- cost_density_plot / trimming_viz / treatment_plot +
  plot_layout(heights = c(1, 1.2, 1)) +
  plot_annotation(
    title = "Impact of 5% Trimming on Healthcare Cost Analysis",
    subtitle = "Analyzing distribution patterns and treatment differences",
    theme = theme(
      plot.title = element_text(size = 16, face = "bold"),
      plot.subtitle = element_text(size = 12)
    )
  )

# Save the combined plot
ggsave(
  paste0(output_dir, "cost_trimming_analysis.png"),
  combined_plots,
  width = 12,
  height = 15,
  dpi = 300
)

# 6. Log transformation analysis ---------------------------------------------
# Apply log transformation to better visualize skewed cost data
original_data <- original_data %>%
  mutate(log_cost = log(pmax(Total_cost, 1)))

# Create improved density plot of log-transformed costs
log_cost_plot <- ggplot(original_data, aes(x = log_cost, fill = exclusion_category)) +
  geom_density(alpha = 0.7) +
  scale_fill_manual(values = custom_colors) +
  labs(
    title = "Log-Transformed Cost Distribution",
    subtitle = "Log transformation helps visualize skewed healthcare cost data",
    x = "Log(Total Cost)",
    y = "Density",
    fill = "Category"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    legend.position = "bottom",
    plot.title = element_text(face = "bold"),
    panel.grid.minor = element_blank(),
    axis.title = element_text(face = "bold")
  ) +
  # Add annotations to explain the purpose
  annotate(
    "text", 
    x = 7, 
    y = 0.5, 
    label = "Log transformation makes patterns\nin skewed data more visible", 
    color = "black",
    hjust = 0.5,
    fontface = "italic",
    size = 4
  )

# Explanation text for the log transformation plot
log_transform_explanation <- paste(
  "Why we need a log_cost_plot:",
  "",
  "1. Healthcare cost data is typically highly right-skewed, with many low-cost episodes and a few very high-cost episodes.",
  "2. This skewness compresses most of the data points into a small area of the plot, making patterns difficult to see.",
  "3. Log transformation spreads the data more evenly across the visualization space, revealing patterns that would be hidden in the raw data.",
  "4. It allows us to better visualize and compare the different exclusion groups across the full range of costs.",
  "5. The transformed data better approximates a normal distribution, which can be important for statistical modeling.",
  "",
  sep = "\n"
)

# Save the log cost plot with explanation
ggsave(
  paste0(output_dir, "log_cost_distribution.png"),
  log_cost_plot,
  width = 10,
  height = 6,
  dpi = 300
)

# Write the log transform explanation to a Word document
doc <- read_docx() %>%
  body_add_par("Log Transformation in Healthcare Cost Analysis", style = "heading 1") %>%
  body_add_par(log_transform_explanation, style = "Normal") %>%
  body_add_img(src = paste0(output_dir, "log_cost_distribution.png"), width = 6, height = 3.6)

print(doc, target = paste0(output_dir, "log_transformation_explanation.docx"))

# 7. Stratified Analysis by Treatment Type -----------------------------------
# Calculate trimming thresholds by treatment type
type_specific_thresholds <- original_data %>%
  group_by(sui_treatment) %>%
  summarise(
    lower_threshold = quantile(Total_cost, 0.05),
    upper_threshold = quantile(Total_cost, 0.95),
    median_cost = median(Total_cost),
    mean_cost = mean(Total_cost),
    count = n()
  )

# Export type-specific thresholds to Word
ft_type_thresholds <- flextable(
  type_specific_thresholds %>%
    mutate(
      lower_threshold = dollar(lower_threshold),
      upper_threshold = dollar(upper_threshold),
      median_cost = dollar(median_cost),
      mean_cost = dollar(mean_cost)
    )
) %>%
  set_caption("Trimming Thresholds by Treatment Type") %>%
  theme_booktabs() %>%
  autofit() %>%
  bold(part = "header") %>%
  bg(bg = "#f5f5f5", part = "header") %>%
  set_header_labels(
    sui_treatment = "Treatment Type",
    lower_threshold = "Lower Threshold (5th percentile)",
    upper_threshold = "Upper Threshold (95th percentile)",
    median_cost = "Median Cost",
    mean_cost = "Mean Cost",
    count = "Count"
  )

doc <- read_docx() %>%
  body_add_par("Treatment-Specific Trimming Thresholds", style = "heading 1") %>%
  body_add_par("This table shows how trimming thresholds vary when calculated separately for each treatment type (SUI treatment). Note the substantial differences in both lower and upper thresholds across different types of care.", style = "Normal") %>%
  body_add_flextable(ft_type_thresholds)

print(doc, target = paste0(output_dir, "treatment_type_thresholds.docx"))

# Create stratified exclusion flags based on type-specific thresholds
stratified_data <- original_data %>%
  left_join(
    type_specific_thresholds %>% select(sui_treatment, lower_threshold, upper_threshold),
    by = "sui_treatment"
  ) %>%
  mutate(
    stratified_exclusion = case_when(
      Total_cost < lower_threshold ~ "Excluded (bottom 5% within type)",
      Total_cost > upper_threshold ~ "Excluded (top 5% within type)",
      TRUE ~ "Included (middle 90% within type)"
    )
  )

# Compare uniform vs. stratified trimming
trimming_comparison <- stratified_data %>%
  group_by(sui_treatment) %>%
  summarise(
    total_count = n(),
    excluded_uniform = sum(exclusion_category != "Included (middle 90%)"),
    excluded_stratified = sum(stratified_exclusion != "Included (middle 90% within type)"),
    percent_uniform = excluded_uniform / total_count * 100,
    percent_stratified = excluded_stratified / total_count * 100,
    difference = percent_uniform - percent_stratified
  )

# Export trimming comparison to Word
ft_trimming_comparison <- flextable(
  trimming_comparison %>%
    mutate(
      percent_uniform = sprintf("%.1f%%", percent_uniform),
      percent_stratified = sprintf("%.1f%%", percent_stratified),
      difference = sprintf("%.1f%%", abs(difference)),
      difference_direction = ifelse(
        difference > 0, 
        "More excluded with uniform", 
        "More excluded with stratified"
      )
    ) %>%
    select(sui_treatment, total_count, excluded_uniform, excluded_stratified, 
           percent_uniform, percent_stratified, difference, difference_direction)
) %>%
  set_caption("Comparison of Uniform vs. Stratified Trimming by Treatment Type") %>%
  theme_booktabs() %>%
  autofit() %>%
  bold(part = "header") %>%
  bg(bg = "#f5f5f5", part = "header") %>%
  set_header_labels(
    sui_treatment = "Treatment Type",
    total_count = "Total Count",
    excluded_uniform = "Excluded (Uniform)",
    excluded_stratified = "Excluded (Stratified)",
    percent_uniform = "% Excluded (Uniform)",
    percent_stratified = "% Excluded (Stratified)",
    difference = "Difference",
    difference_direction = "Impact"
  )

doc <- read_docx() %>%
  body_add_par("Uniform vs. Stratified Trimming Comparison", style = "heading 1") %>%
  body_add_par("This table compares standard uniform trimming (using the same thresholds for all treatment types) with stratified trimming (using treatment-specific thresholds). The comparison shows how different trimming approaches affect each type of treatment.", style = "Normal") %>%
  body_add_flextable(ft_trimming_comparison)

print(doc, target = paste0(output_dir, "trimming_comparison.docx"))

# 8. Cost comparison visualization by treatment type -------------------------
# Create improved boxplots of costs by treatment type
treatment_boxplot <- ggplot(original_data, 
                            aes(x = sui_treatment, y = Total_cost, fill = sui_treatment)) +
  geom_boxplot(outlier.shape = 1, outlier.size = 1, alpha = 0.8) +
  scale_fill_manual(values = treatment_colors) +
  labs(
    title = "Cost Distribution by Treatment Type",
    subtitle = "Boxplot showing median, quartiles, and outliers",
    x = "Treatment Type",
    y = "Total Cost ($)",
    fill = "Treatment Type"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    legend.position = "none",
    plot.title = element_text(face = "bold"),
    panel.grid.minor = element_blank(),
    axis.title = element_text(face = "bold")
  ) +
  scale_y_continuous(labels = dollar_format(), limits = c(0, 5000)) +
  # Add median labels
  geom_text(
    data = original_data %>%
      group_by(sui_treatment) %>%
      summarise(
        median_y = median(Total_cost),
        .groups = "drop"
      ),
    aes(y = median_y, label = dollar(median_y)),
    vjust = -0.5,
    fontface = "bold"
  )

# Create improved violin plots for better distribution visualization
treatment_violin <- ggplot(original_data, 
                           aes(x = sui_treatment, y = Total_cost, fill = sui_treatment)) +
  geom_violin(trim = FALSE, alpha = 0.8) +
  geom_boxplot(width = 0.1, outlier.shape = NA, alpha = 0.5, fill = "white") +
  scale_fill_manual(values = treatment_colors) +
  labs(
    title = "Cost Distribution by Treatment Type",
    subtitle = "Violin plot showing full distribution shape",
    x = "Treatment Type",
    y = "Total Cost ($)",
    fill = "Treatment Type"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    legend.position = "none",
    plot.title = element_text(face = "bold"),
    panel.grid.minor = element_blank(),
    axis.title = element_text(face = "bold")
  ) +
  scale_y_continuous(labels = dollar_format(), limits = c(0, 5000))

# Explanation text for the violin plot
violin_plot_explanation <- paste(
  "Understanding Violin Plots in Healthcare Cost Analysis:",
  "",
  "1. Violin plots combine boxplots with density plots, showing both summary statistics and the full distribution shape.",
  "2. The width of the violin at any given point represents the density of data at that cost value - wider sections indicate more patients with costs in that range.",
  "3. Key advantages over traditional boxplots:",
  "   - Reveals multimodal distributions (multiple peaks) that would be hidden in boxplots",
  "   - Shows the exact distribution shape, including skewness and kurtosis",
  "   - Makes it easy to visually compare distributions across different treatment types",
  "4. In our analysis, the violin plots clearly show that:",
  "   - UI Sling procedures have a higher and more concentrated cost distribution",
  "   - Physical Therapy shows a bimodal distribution with two distinct cost clusters",
  "   - Pessary treatments have the lowest typical costs with the majority in the lower ranges",
  "5. These distribution differences explain why uniform trimming disproportionately affects certain treatment types.",
  "",
  sep = "\n"
)

# Combine treatment cost plots side by side with improved styling
treatment_cost_plots <- treatment_boxplot + treatment_violin +
  plot_annotation(
    title = "Comparing Cost Distributions Across Treatment Types",
    subtitle = "Boxplots and violin plots provide complementary views of the data",
    theme = theme(
      plot.title = element_text(size = 16, face = "bold"),
      plot.subtitle = element_text(size = 12)
    )
  )

# Save the treatment cost plots
ggsave(
  paste0(output_dir, "treatment_cost_distributions.png"),
  treatment_cost_plots,
  width = 12,
  height = 7,
  dpi = 300
)

# Create a Word document with the violin plot explanation
doc <- read_docx() %>%
  body_add_par("Violin Plots for Healthcare Cost Analysis", style = "heading 1") %>%
  body_add_par(violin_plot_explanation, style = "Normal") %>%
  body_add_img(src = paste0(output_dir, "treatment_cost_distributions.png"), width = 6, height = 3.5)

print(doc, target = paste0(output_dir, "violin_plot_explanation.docx"))

# 9. Demographic analysis ----------------------------------------------------
# Join demographic data to identify characteristics of excluded populations
# First, ensure we're using unique patients from the demographic data (one row per WU_ID)
unique_demographics <- demographic_data %>%
  select(WU_ID, age, race, dual_cvrg, Region) %>%
  distinct(WU_ID, .keep_all = TRUE)

# Add demographic information to our analysis dataset
demographic_analysis <- original_data %>%
  select(WU_ID, exclusion_category, sui_treatment, Total_cost) %>%
  left_join(unique_demographics, by = "WU_ID")

# Age analysis by exclusion category
age_stats <- demographic_analysis %>%
  filter(!is.na(age)) %>%
  group_by(exclusion_category) %>%
  summarise(
    mean_age = mean(age, na.rm = TRUE),
    median_age = median(age, na.rm = TRUE),
    min_age = min(age, na.rm = TRUE),
    max_age = max(age, na.rm = TRUE),
    sd_age = sd(age, na.rm = TRUE),
    count = n()
  ) %>%
  arrange(factor(exclusion_category, levels = c(
    "Excluded (bottom 5%)", "Included (middle 90%)", "Excluded (top 5%)"
  )))

# Export age statistics to Word
ft_age_stats <- flextable(age_stats) %>%
  set_caption("Age Statistics by Inclusion/Exclusion Category") %>%
  theme_booktabs() %>%
  autofit() %>%
  bold(part = "header") %>%
  bg(bg = "#f5f5f5", part = "header") %>%
  colformat_double(digits = 1) %>%
  set_header_labels(
    exclusion_category = "Category",
    mean_age = "Mean Age",
    median_age = "Median Age",
    min_age = "Minimum Age",
    max_age = "Maximum Age",
    sd_age = "Std Dev",
    count = "Count"
  )

# Race distribution by exclusion category
race_distribution <- demographic_analysis %>%
  filter(!is.na(race)) %>%
  group_by(exclusion_category, race) %>%
  summarise(count = n(), .groups = "drop") %>%
  group_by(exclusion_category) %>%
  mutate(
    percent = count / sum(count) * 100,
    total = sum(count)
  ) %>%
  ungroup() %>%
  arrange(exclusion_category, desc(count))

# Export race distribution to Word
ft_race_distribution <- flextable(
  race_distribution %>%
    mutate(percent = sprintf("%.1f%%", percent))
) %>%
  set_caption("Race Distribution by Inclusion/Exclusion Category") %>%
  theme_booktabs() %>%
  autofit() %>%
  bold(part = "header") %>%
  bg(bg = "#f5f5f5", part = "header") %>%
  set_header_labels(
    exclusion_category = "Category",
    race = "Race",
    count = "Count",
    percent = "Percentage",
    total = "Total"
  )

# Regional distribution by exclusion category
region_distribution <- demographic_analysis %>%
  filter(!is.na(Region)) %>%
  group_by(exclusion_category, Region) %>%
  summarise(count = n(), .groups = "drop") %>%
  group_by(exclusion_category) %>%
  mutate(
    percent = count / sum(count) * 100,
    total = sum(count)
  ) %>%
  ungroup() %>%
  arrange(exclusion_category, desc(count))

# Export region distribution to Word
ft_region_distribution <- flextable(
  region_distribution %>%
    mutate(percent = sprintf("%.1f%%", percent))
) %>%
  set_caption("Regional Distribution by Inclusion/Exclusion Category") %>%
  theme_booktabs() %>%
  autofit() %>%
  bold(part = "header") %>%
  bg(bg = "#f5f5f5", part = "header") %>%
  set_header_labels(
    exclusion_category = "Category",
    Region = "Region",
    count = "Count",
    percent = "Percentage",
    total = "Total"
  )

# Dual coverage analysis
dual_coverage <- demographic_analysis %>%
  filter(!is.na(dual_cvrg)) %>%
  group_by(exclusion_category, dual_cvrg) %>%
  summarise(count = n(), .groups = "drop") %>%
  group_by(exclusion_category) %>%
  mutate(
    percent = count / sum(count) * 100,
    total = sum(count),
    coverage_status = ifelse(dual_cvrg == 1, "Dual Coverage", "Medicare Only")
  ) %>%
  ungroup() %>%
  arrange(exclusion_category, desc(count))

# Export dual coverage analysis to Word
ft_dual_coverage <- flextable(
  dual_coverage %>%
    select(-dual_cvrg) %>%
    mutate(percent = sprintf("%.1f%%", percent))
) %>%
  set_caption("Dual Coverage Status by Inclusion/Exclusion Category") %>%
  theme_booktabs() %>%
  autofit() %>%
  bold(part = "header") %>%
  bg(bg = "#f5f5f5", part = "header")

ft_dual_coverage <- ft_dual_coverage %>%
  set_header_labels(
    exclusion_category = "Category",
    coverage_status = "Coverage Status",
    count = "Count",
    percent = "Percentage",
    total = "Total"
  )

# Create demographic analysis Word document with all tables
doc <- read_docx() %>%
  body_add_par("Demographic Analysis of Excluded vs. Included Patients", style = "heading 1") %>%
  body_add_par("This analysis examines demographic differences between patients excluded by the 5% trimming process compared to those included in the analysis.", style = "Normal") %>%
  
  # Age statistics
  body_add_par("Age Distribution", style = "heading 2") %>%
  body_add_par("The table below shows age statistics for patients in each exclusion category.", style = "Normal") %>%
  body_add_flextable(ft_age_stats) %>%
  body_add_par("", style = "Normal") %>%
  
  # Race distribution
  body_add_par("Race Distribution", style = "heading 2") %>%
  body_add_par("This table shows the racial composition of patients in each exclusion category.", style = "Normal") %>%
  body_add_flextable(ft_race_distribution) %>%
  body_add_par("", style = "Normal") %>%
  
  # Regional distribution
  body_add_par("Geographic Distribution", style = "heading 2") %>%
  body_add_par("This table shows how excluded patients are distributed across different geographic regions.", style = "Normal") %>%
  body_add_flextable(ft_region_distribution) %>%
  body_add_par("", style = "Normal") %>%
  
  # Dual coverage status
  body_add_par("Dual Coverage Status", style = "heading 2") %>%
  body_add_par("This table shows the proportion of patients with dual Medicare/Medicaid coverage in each exclusion category.", style = "Normal") %>%
  body_add_flextable(ft_dual_coverage)

# Save the demographic analysis document
print(doc, target = paste0(output_dir, "demographic_analysis.docx"))

# 10. Create demographic visualizations --------------------------------------
# Age distribution visualization
age_plot <- ggplot(demographic_analysis %>% filter(!is.na(age)), 
                   aes(x = exclusion_category, y = age, fill = exclusion_category)) +
  geom_boxplot(alpha = 0.8) +
  scale_fill_manual(values = custom_colors) +
  labs(
    title = "Age Distribution by Inclusion/Exclusion Category",
    subtitle = "Comparing age patterns among included and excluded patients",
    x = "Category",
    y = "Age (years)",
    fill = "Category"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    legend.position = "none",
    plot.title = element_text(face = "bold"),
    panel.grid.minor = element_blank(),
    axis.title = element_text(face = "bold"),
    axis.text.x = element_text(angle = 45, hjust = 1)
  ) +
  stat_summary(fun = mean, geom = "point", shape = 18, size = 4, color = "black") +
  annotate(
    "text",
    x = 1:3,
    y = rep(max(demographic_analysis$age, na.rm = TRUE) - 5, 3),
    label = sprintf("Mean: %.1f yrs", age_stats$mean_age),
    fontface = "bold"
  )

# Race distribution visualization
race_plot <- ggplot(race_distribution, 
                    aes(x = exclusion_category, y = percent, fill = race)) +
  geom_col(position = "stack", alpha = 0.8) +
  scale_fill_viridis_d() +
  labs(
    title = "Race Distribution by Inclusion/Exclusion Category",
    subtitle = "Racial composition of patients in each category",
    x = "Category",
    y = "Percentage (%)",
    fill = "Race"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    legend.position = "right",
    plot.title = element_text(face = "bold"),
    panel.grid.minor = element_blank(),
    axis.title = element_text(face = "bold"),
    axis.text.x = element_text(angle = 45, hjust = 1)
  ) +
  geom_text(
    aes(label = sprintf("%.1f%%", percent)),
    position = position_stack(vjust = 0.5),
    color = "white",
    fontface = "bold",
    size = 3
  )

# Region distribution visualization
region_plot <- ggplot(region_distribution, 
                      aes(x = exclusion_category, y = percent, fill = Region)) +
  geom_col(position = "stack", alpha = 0.8) +
  scale_fill_viridis_d() +
  labs(
    title = "Regional Distribution by Inclusion/Exclusion Category",
    subtitle = "Geographic patterns among included and excluded patients",
    x = "Category",
    y = "Percentage (%)",
    fill = "Region"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    legend.position = "right",
    plot.title = element_text(face = "bold"),
    panel.grid.minor = element_blank(),
    axis.title = element_text(face = "bold"),
    axis.text.x = element_text(angle = 45, hjust = 1)
  ) +
  geom_text(
    aes(label = sprintf("%.1f%%", percent)),
    position = position_stack(vjust = 0.5),
    color = "white",
    fontface = "bold",
    size = 3
  )

# Dual coverage visualization
dual_plot <- ggplot(dual_coverage, 
                    aes(x = exclusion_category, y = percent, fill = coverage_status)) +
  geom_col(position = "fill", alpha = 0.8) +
  scale_fill_manual(values = c("Medicare Only" = "#2c7bb6", "Dual Coverage" = "#d7191c")) +
  labs(
    title = "Dual Coverage Status by Inclusion/Exclusion Category",
    subtitle = "Proportion of patients with Medicare/Medicaid dual coverage",
    x = "Category",
    y = "Percentage (%)",
    fill = "Coverage Status"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    legend.position = "right",
    plot.title = element_text(face = "bold"),
    panel.grid.minor = element_blank(),
    axis.title = element_text(face = "bold"),
    axis.text.x = element_text(angle = 45, hjust = 1)
  ) +
  scale_y_continuous(labels = scales::percent_format()) +
  geom_text(
    aes(
      y = ifelse(coverage_status == "Dual Coverage", percent/200, 1 - percent/200),
      label = sprintf("%.1f%%", percent)
    ),
    position = position_fill(vjust = 0.5),
    color = "white",
    fontface = "bold"
  )

# Combine demographic plots
demographic_plots <- (age_plot + race_plot) / (region_plot + dual_plot) +
  plot_annotation(
    title = "Demographic Characteristics of Excluded vs. Included Patients",
    subtitle = "Examining potential biases introduced by 5% trimming",
    theme = theme(
      plot.title = element_text(size = 16, face = "bold"),
      plot.subtitle = element_text(size = 12)
    )
  )

# Save the demographic plots
ggsave(
  paste0(output_dir, "demographic_analysis_plots.png"),
  demographic_plots,
  width = 14,
  height = 10,
  dpi = 300
)

# 11. Treatment type analysis by demographics --------------------------------
# Examine how treatment types vary across demographic characteristics
# This helps us understand if trimming affects certain demographic groups more
# due to their treatment patterns

# Age by treatment type
age_by_treatment <- demographic_analysis %>%
  filter(!is.na(age)) %>%
  group_by(sui_treatment, exclusion_category) %>%
  summarise(
    mean_age = mean(age, na.rm = TRUE),
    median_age = median(age, na.rm = TRUE),
    count = n(),
    .groups = "drop"
  ) %>%
  arrange(sui_treatment, exclusion_category)

# Export age by treatment type to Word
ft_age_by_treatment <- flextable(age_by_treatment) %>%
  set_caption("Age Statistics by Treatment Type and Exclusion Category") %>%
  theme_booktabs() %>%
  autofit() %>%
  bold(part = "header") %>%
  bg(bg = "#f5f5f5", part = "header") %>%
  colformat_double(digits = 1) %>%
  set_header_labels(
    sui_treatment = "Treatment Type",
    exclusion_category = "Category",
    mean_age = "Mean Age",
    median_age = "Median Age",
    count = "Count"
  )

# Create heat map visualization of age by treatment type and exclusion category
age_heatmap <- ggplot(age_by_treatment, aes(x = sui_treatment, y = exclusion_category, fill = mean_age)) +
  geom_tile(color = "white") +
  scale_fill_viridis_c(option = "inferno") +
  labs(
    title = "Mean Age by Treatment Type and Exclusion Category",
    subtitle = "Heatmap showing age variations across treatment types and excluded groups",
    x = "Treatment Type",
    y = "Category",
    fill = "Mean Age"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    plot.title = element_text(face = "bold"),
    panel.grid = element_blank(),
    axis.title = element_text(face = "bold")
  ) +
  geom_text(
    aes(label = sprintf("%.1f", mean_age)),
    color = "white",
    fontface = "bold"
  )

# 12. Combined demographic and treatment type analysis document -------------
doc <- read_docx() %>%
  body_add_par("Treatment Type and Demographic Interactions", style = "heading 1") %>%
  body_add_par("This analysis examines how demographic factors interact with treatment types (SUI treatments) across different exclusion categories.", style = "Normal") %>%
  
  # Age by treatment type
  body_add_par("Age by Treatment Type", style = "heading 2") %>%
  body_add_par("The table below shows how age varies across different treatment types and exclusion categories.", style = "Normal") %>%
  body_add_flextable(ft_age_by_treatment) %>%
  body_add_par("", style = "Normal") %>%
  
  # Add explanation
  body_add_par("Interpretation", style = "heading 3") %>%
  body_add_par("This analysis reveals important patterns in how trimming affects different demographic groups:", style = "Normal") %>%
  body_add_par("1. Age differences between excluded and included patients vary by treatment type, suggesting that age interacts with treatment selection.", style = "Normal") %>%
  body_add_par("2. Regional disparities in exclusion patterns may reflect differences in healthcare delivery, practice patterns, or coding practices across regions.", style = "Normal") %>%
  body_add_par("3. The relationship between dual coverage status and exclusion suggests socioeconomic factors may influence healthcare costs and treatment selection.", style = "Normal") %>%
  body_add_par("4. These demographic variations highlight the importance of carefully considering the implications of trimming in healthcare cost analysis, as it may introduce unintended biases.", style = "Normal") %>%
  
  # Add visualization
  body_add_img(src = paste0(output_dir, "demographic_analysis_plots.png"), width = 7, height = 5)

# Save the combined demographic and treatment type analysis document
print(doc, target = paste0(output_dir, "demographic_treatment_analysis.docx"))

# 13. Executive summary document --------------------------------------------
# Create an executive summary that ties together all of the analyses
doc <- read_docx() %>%
  body_add_par("Executive Summary: Impact of 5% Trimming on Healthcare Cost Analysis", style = "heading 1") %>%
  body_add_par("This document summarizes the findings from our comprehensive analysis of how 5% trimming affects healthcare cost data analysis.", style = "Normal") %>%
  
  # Key findings section
  body_add_par("Key Findings", style = "heading 2") %>%
  body_add_par("1. Treatment-specific bias: Trimming disproportionately excludes certain types of treatments - Physical Therapy (PT) treatments are predominantly excluded from the low cost end, while UI Sling procedures are primarily excluded from the high cost end.", style = "Normal") %>%
  body_add_par("2. Demographic differences: Excluded patients differ demographically from included patients, with variations in age, race, geographic region, and dual coverage status.", style = "Normal") %>%
  body_add_par("3. Statistical impact: Trimming reduces the standard deviation of costs by 43% (from $1,189 to $673), creating a misleading appearance of cost homogeneity.", style = "Normal") %>%
  body_add_par("4. Alternative approaches: Treatment-specific trimming thresholds result in substantially different exclusion patterns than uniform trimming, suggesting the need for more nuanced outlier management.", style = "Normal") %>%
  
  # Recommendations section
  body_add_par("Recommendations", style = "heading 2") %>%
  body_add_par("1. Consider stratified analysis by conducting separate analyses within each treatment type to better understand natural cost variations.", style = "Normal") %>%
  body_add_par("2. Implement treatment-specific thresholds when trimming is necessary, using percentiles calculated within each treatment type rather than across all types.", style = "Normal") %>%
  body_add_par("3. Explore alternative outlier handling methods such as winsorization (capping extreme values) rather than trimming (excluding them entirely).", style = "Normal") %>%
  body_add_par("4. Include sensitivity analyses with different trimming levels (0%, 1%, 5%, 10%) to demonstrate robustness of findings.", style = "Normal") %>%
  body_add_par("5. Report full details about excluded populations when presenting trimmed analyses to provide context and transparency.", style = "Normal") %>%
  
  # Conclusions
  body_add_par("Conclusions", style = "heading 2") %>%
  body_add_par("While trimming provides more stable statistics by removing extreme values, it introduces systematic biases that significantly affect our understanding of healthcare costs across different treatment modalities and patient groups. The analysis shows that different types of interventions have fundamentally different cost profiles, and applying uniform trimming thresholds disproportionately excludes certain treatments and patient populations. A more nuanced approach to handling outliers is needed to maintain representativeness while still managing the impact of extreme values.", style = "Normal") %>%
  
  # Add visualization
  body_add_par("Summary Visualization", style = "heading 2") %>%
  body_add_img(src = paste0(output_dir, "cost_trimming_analysis.png"), width = 6, height = 7.5)

# Save the executive summary document
print(doc, target = paste0(output_dir, "executive_summary.docx"))

# 14. Comprehensive report with all analyses ---------------------------------
# Create a complete report that includes all analyses in one document
doc <- read_docx() %>%
  body_add_par("Comprehensive Analysis of 5% Trimming in Healthcare Cost Data", style = "heading 1") %>%
  body_add_par("This report provides a detailed examination of how 5% trimming affects healthcare cost analysis, including statistical, clinical, and demographic implications.", style = "Normal") %>%
  
  # Table of contents (placeholder - will be generated by Word)
  body_add_par("Table of Contents", style = "heading 1") %>%
  body_add_par("[Word will generate table of contents]", style = "Normal") %>%
  
  # Introduction
  body_add_par("Introduction", style = "heading 1") %>%
  body_add_par("Trimming is a common statistical practice used to remove extreme values (outliers) from data analysis. In healthcare cost analysis, trimming typically involves removing a specified percentage of the highest and lowest values before calculating summary statistics or conducting further analyses. The standard approach is to remove the top and bottom 5% of values, resulting in analysis of the middle 90% of the data.", style = "Normal") %>%
  body_add_par("While trimming improves the stability of statistical measures and reduces the influence of extreme values, it may introduce biases by systematically excluding certain types of patients or treatments. This comprehensive analysis examines the impact of 5% trimming on healthcare cost data, with a focus on three types of stress urinary incontinence (SUI) treatments: Physical Therapy (PT), Pessary, and Urinary Incontinence Sling (UI Sling) procedures.", style = "Normal") %>%
  
  # Basic trimming impact
  body_add_par("Impact of Trimming on Cost Statistics", style = "heading 1") %>%
  body_add_par("The following table shows how trimming affects basic cost statistics across the dataset:", style = "Normal") %>%
  body_add_flextable(ft_overall_stats) %>%
  body_add_par("", style = "Normal") %>%
  body_add_par("As shown in the table, trimming substantially reduces the standard deviation and narrows the range of costs included in the analysis. The mean cost decreases from approximately $995 in the full dataset to $861 in the trimmed dataset, indicating that high-cost outliers have a greater influence on the mean than low-cost outliers.", style = "Normal") %>%
  body_add_img(src = paste0(output_dir, "cost_trimming_analysis.png"), width = 6, height = 7.5) %>%
  
  # Treatment type analysis
  body_add_par("Treatment Type Analysis", style = "heading 1") %>%
  body_add_par("Different types of SUI treatments have inherently different cost structures. The following analysis examines how trimming affects each treatment type:", style = "Normal") %>%
  body_add_flextable(ft_treatment_distribution) %>%
  body_add_par("", style = "Normal") %>%
  body_add_par("This table reveals striking differences in how treatment types are distributed across exclusion categories. Physical Therapy dominates the low-cost excluded group, while UI Sling procedures are heavily represented in the high-cost excluded group.", style = "Normal") %>%
  body_add_par("The following analysis shows what percentage of each treatment type is excluded:", style = "Normal") %>%
  body_add_flextable(ft_treatment_exclusion) %>%
  body_add_par("", style = "Normal") %>%
  body_add_img(src = paste0(output_dir, "treatment_cost_distributions.png"), width = 6, height = 3.5) %>%
  
  # Log transformation
  body_add_par("Log Transformation Analysis", style = "heading 1") %>%
  body_add_par(log_transform_explanation, style = "Normal") %>%
  body_add_img(src = paste0(output_dir, "log_cost_distribution.png"), width = 6, height = 3.6) %>%
  
  # Stratified analysis
  body_add_par("Stratified Trimming Analysis", style = "heading 1") %>%
  body_add_par("Given the substantial differences in cost distributions across treatment types, we examined how trimming thresholds would differ if calculated separately for each type:", style = "Normal") %>%
  body_add_flextable(ft_type_thresholds) %>%
  body_add_par("", style = "Normal") %>%
  body_add_par("The table above reveals dramatic differences in trimming thresholds across treatment types. For example, the upper threshold (95th percentile) for PT is much lower than for UI Sling procedures.", style = "Normal") %>%
  body_add_par("The following table compares how many patients would be excluded under uniform trimming versus stratified trimming by treatment type:", style = "Normal") %>%
  body_add_flextable(ft_trimming_comparison) %>%
  
  # Demographic analysis
  body_add_par("Demographic Analysis", style = "heading 1") %>%
  body_add_par("This section examines demographic differences between excluded and included patients to identify potential biases introduced by trimming.", style = "Normal") %>%
  
  # Age statistics
  body_add_par("Age Distribution", style = "heading 2") %>%
  body_add_flextable(ft_age_stats) %>%
  body_add_par("", style = "Normal") %>%
  
  # Race distribution
  body_add_par("Race Distribution", style = "heading 2") %>%
  body_add_flextable(ft_race_distribution) %>%
  body_add_par("", style = "Normal") %>%
  
  # Regional distribution
  body_add_par("Geographic Distribution", style = "heading 2") %>%
  body_add_flextable(ft_region_distribution) %>%
  body_add_par("", style = "Normal") %>%
  
  # Dual coverage
  body_add_par("Dual Coverage Status", style = "heading 2") %>%
  body_add_flextable(ft_dual_coverage) %>%
  body_add_par("", style = "Normal") %>%
  body_add_img(src = paste0(output_dir, "demographic_analysis_plots.png"), width = 7, height = 5) %>%
  
  # Conclusions and recommendations
  body_add_par("Conclusions and Recommendations", style = "heading 1") %>%
  body_add_par("Our comprehensive analysis of the impact of 5% trimming on healthcare cost data reveals several important findings:", style = "Normal") %>%
  body_add_par("1. Trimming introduces systematic biases by disproportionately excluding certain types of treatments and patient populations.", style = "Normal") %>%
  body_add_par("2. Different SUI treatment types have fundamentally different cost distributions, making uniform trimming inappropriate.", style = "Normal") %>%
  body_add_par("3. Demographic variations between excluded and included patients suggest that trimming may introduce socioeconomic, age-related, and geographic biases.", style = "Normal") %>%
  body_add_par("4. Alternative approaches such as stratified trimming or winsorization may better preserve the representativeness of the data while still managing extreme values.", style = "Normal") %>%
  body_add_par("Based on these findings, we recommend:", style = "Normal") %>%
  body_add_par("1. Implementing treatment-specific trimming thresholds when trimming is necessary.", style = "Normal") %>%
  body_add_par("2. Conducting separate analyses within each treatment type to better understand natural cost variations.", style = "Normal") %>%
  body_add_par("3. Exploring alternative outlier handling methods such as winsorization.", style = "Normal") %>%
  body_add_par("4. Including sensitivity analyses with different trimming levels.", style = "Normal") %>%
  body_add_par("5. Reporting full details about excluded populations when presenting trimmed analyses.", style = "Normal")

# Save the comprehensive report
print(doc, target = paste0(output_dir, "comprehensive_trimming_analysis_report.docx"))

# Print confirmation message
cat("Analysis complete! All files have been saved to", output_dir, "\n")

Analysis complete! All files have been saved to Data/CADR_2023/final_push/

cat("Generated files include:\n")

Generated files include:

cat("- overall_cost_statistics.docx\n")

overall_cost_statistics.docx

cat("- treatment_type_distribution.docx\n")

treatment_type_distribution.docx

cat("- treatment_exclusion_rates.docx\n")

treatment_exclusion_rates.docx

cat("- treatment_type_thresholds.docx\n")

treatment_type_thresholds.docx

cat("- trimming_comparison.docx\n")

trimming_comparison.docx

cat("- demographic_analysis.docx\n")

demographic_analysis.docx

cat("- log_transformation_explanation.docx\n")

log_transformation_explanation.docx

cat("- violin_plot_explanation.docx\n")

violin_plot_explanation.docx

cat("- demographic_treatment_analysis.docx\n")

demographic_treatment_analysis.docx

cat("- executive_summary.docx\n")

executive_summary.docx

cat("- comprehensive_trimming_analysis_report.docx\n")

comprehensive_trimming_analysis_report.docx

cat("- cost_trimming_analysis.png\n")

cost_trimming_analysis.png

cat("- log_cost_distribution.png\n")

log_cost_distribution.png

cat("- treatment_cost_distributions.png\n")

treatment_cost_distributions.png

cat("- demographic_analysis_plots.png\n")

demographic_analysis_plots.png

# Improved visualizations for healthcare cost trimming analysis
# This script focuses on fixing the issues identified in the figures

# Create the output directory if it doesn't exist
output_dir <- "Data/CADR_2023/final_push/"
if (!dir.exists(output_dir)) {
  dir.create(output_dir, recursive = TRUE)
}

# 1. Load datasets -----------------------------------------------------------
# Original dataset (before trimming)
original_data <- read_csv("Data/CADR_2023/final_push/_Costs_32_including_5 and_95.csv") %>%
  # Rename episode_type to sui_treatment for consistency
  rename(sui_treatment = episode_type)

# Trimmed dataset (after removing top and bottom 5%)
trimmed_data <- read_csv("Data/CADR_2023/final_push/_Margie_Jerry_check_in_Costs.csv") %>%
  # Rename episode_type to sui_treatment for consistency
  rename(sui_treatment = episode_type)

# Load demographic data
demographic_data <- read_csv("Data/CADR_2023/final_push/_Cohort_descriptors.csv")

# 2. Basic Analysis of Inclusion/Exclusion -----------------------------------
# Create a flag in the original dataset to identify trimmed vs excluded patients
original_data <- original_data %>%
  mutate(inclusion_status = ifelse(WU_ID %in% trimmed_data$WU_ID, "Included", "Excluded"))

# Calculate cost thresholds
lower_threshold <- quantile(original_data$Total_cost, 0.05)
upper_threshold <- quantile(original_data$Total_cost, 0.95)

# Add more detailed exclusion category
original_data <- original_data %>%
  mutate(
    exclusion_category = case_when(
      inclusion_status == "Included" ~ "Included (middle 90%)",
      Total_cost < lower_threshold ~ "Excluded (bottom 5%)",
      Total_cost > upper_threshold ~ "Excluded (top 5%)",
      TRUE ~ "Other"
    )
  )

# 3. Set a custom color palette with better visibility ---------------------
custom_colors <- c(
  "Excluded (bottom 5%)" = "#8624F5", # Darker purple for better visibility
  "Excluded (top 5%)" = "#00A4BD",    # Brighter teal
  "Included (middle 90%)" = "#FFD700"  # Gold yellow
)

# 4. IMPROVED: Cost distribution plot ---------------------------------------
# Create a zoomed-in density plot that better shows the excluded categories
cost_density_plot <- ggplot(original_data, aes(x = Total_cost, fill = exclusion_category)) +
  # Use position="identity" to stack properly and alpha for transparency
  geom_density(alpha = 0.7, position = "identity") +
  scale_fill_manual(values = custom_colors) +
  labs(
    title = "Cost Distribution by Inclusion/Exclusion Category",
    subtitle = "Comparing density distributions of healthcare costs",
    x = "Total Cost ($)",
    y = "Density",
    fill = "Category"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    legend.position = "bottom",
    plot.title = element_text(face = "bold", size = 14),
    panel.grid.minor = element_blank(),
    axis.title = element_text(face = "bold")
  ) +
  # Limit x-axis to better visualize the distribution
  scale_x_continuous(labels = dollar_format(accuracy = 1), limits = c(0, 4000))

# Save the improved cost density plot
ggsave(
  paste0(output_dir, "improved_cost_density_plot.png"),
  cost_density_plot,
  width = 10,
  height = 6,
  dpi = 300
)

# 5. IMPROVED: Trimming thresholds visualization ----------------------------
# Create better visualization of trimming thresholds
trimming_viz <- ggplot(original_data, aes(x = Total_cost, fill = exclusion_category)) +
  geom_histogram(bins = 70, position = "identity", alpha = 0.8) +
  # Add thicker, more visible vertical lines for thresholds
  geom_vline(xintercept = lower_threshold, linetype = "dashed", color = "red", size = 1.2) +
  geom_vline(xintercept = upper_threshold, linetype = "dashed", color = "red", size = 1.2) +
  scale_fill_manual(values = custom_colors) +
  labs(
    title = "Distribution of Costs with 5% Trimming Thresholds",
    # Round thresholds to whole dollars
    subtitle = paste0("Lower threshold: $", round(lower_threshold), ", Upper threshold: $", round(upper_threshold)),
    x = "Total Cost ($)",
    y = "Count",
    fill = "Category"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    legend.position = "bottom",
    plot.title = element_text(face = "bold", size = 14),
    panel.grid.minor = element_blank(),
    axis.title = element_text(face = "bold")
  ) +
  scale_x_continuous(labels = dollar_format(accuracy = 1), 
                     breaks = seq(0, 5000, by = 1000)) +
  # Add clearer annotations
  annotate(
    "text", 
    x = 5, 
    y = 1700, 
    label = "5th percentile", 
    color = "red",
    hjust = 0,
    fontface = "bold",
    size = 4
  ) +
  annotate(
    "text", 
    x = upper_threshold + 200, 
    y = 1700, 
    label = "95th percentile", 
    color = "red",
    hjust = 0,
    fontface = "bold",
    size = 4
  )

# Create a close-up view of the lower threshold
lower_threshold_viz <- ggplot(
  # Filter to only show data near the lower threshold
  original_data %>% filter(Total_cost < 300),
  aes(x = Total_cost, fill = exclusion_category)
) +
  geom_histogram(bins = 30, position = "identity", alpha = 0.8) +
  geom_vline(xintercept = lower_threshold, linetype = "dashed", color = "red", size = 1.2) +
  scale_fill_manual(values = custom_colors) +
  labs(
    title = "Close-up of Lower Threshold",
    subtitle = paste0("Lower threshold (5th percentile): $", round(lower_threshold)),
    x = "Total Cost ($)",
    y = "Count",
    fill = "Category"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    legend.position = "bottom",
    plot.title = element_text(face = "bold"),
    panel.grid.minor = element_blank(),
    axis.title = element_text(face = "bold")
  ) +
  scale_x_continuous(labels = dollar_format(accuracy = 1))

# Save the improved trimming visualizations
ggsave(
  paste0(output_dir, "improved_trimming_viz.png"),
  trimming_viz,
  width = 10,
  height = 6,
  dpi = 300
)

ggsave(
  paste0(output_dir, "lower_threshold_closeup.png"),
  lower_threshold_viz,
  width = 8,
  height = 5,
  dpi = 300
)

# 6. IMPROVED: Demographic analysis with both included and excluded categories --------
# Join demographic data to identify characteristics of excluded populations
# First, ensure we're using unique patients from the demographic data (one row per WU_ID)
unique_demographics <- demographic_data %>%
  select(WU_ID, age, race, dual_cvrg, Region) %>%
  distinct(WU_ID, .keep_all = TRUE)

# Add demographic information to our analysis dataset
demographic_analysis <- original_data %>%
  select(WU_ID, exclusion_category, sui_treatment, Total_cost) %>%
  left_join(unique_demographics, by = "WU_ID")

# Age analysis by exclusion category
age_stats <- demographic_analysis %>%
  filter(!is.na(age)) %>%
  group_by(exclusion_category) %>%
  summarise(
    mean_age = mean(age, na.rm = TRUE),
    median_age = median(age, na.rm = TRUE),
    min_age = min(age, na.rm = TRUE),
    max_age = max(age, na.rm = TRUE),
    sd_age = sd(age, na.rm = TRUE),
    count = n()
  ) %>%
  arrange(factor(exclusion_category, levels = c(
    "Excluded (bottom 5%)", "Included (middle 90%)", "Excluded (top 5%)"
  )))

# Age distribution visualization with all categories
age_plot <- ggplot(demographic_analysis %>% 
                     filter(!is.na(age)), 
                   aes(x = exclusion_category, y = age, fill = exclusion_category)) +
  geom_boxplot(alpha = 0.8) +
  scale_fill_manual(values = custom_colors) +
  labs(
    title = "Age Distribution by Inclusion/Exclusion Category",
    subtitle = "Comparing age patterns across all categories",
    x = "Category",
    y = "Age (years)",
    fill = "Category"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    legend.position = "none",
    plot.title = element_text(face = "bold"),
    panel.grid.minor = element_blank(),
    axis.title = element_text(face = "bold"),
    axis.text.x = element_text(angle = 45, hjust = 1)
  ) +
  # Add mean age labels for each category
  geom_text(
    data = age_stats,
    aes(x = exclusion_category, y = max_age - 5, 
        label = paste0("Mean: ", round(mean_age), " yrs")),
    fontface = "bold",
    size = 3.5
  )

# Race distribution by exclusion category - with all categories
race_distribution <- demographic_analysis %>%
  filter(!is.na(race)) %>%
  group_by(exclusion_category, race) %>%
  summarise(count = n(), .groups = "drop") %>%
  group_by(exclusion_category) %>%
  mutate(
    percent = count / sum(count) * 100,
    total = sum(count)
  ) %>%
  ungroup() %>%
  arrange(exclusion_category, desc(count))

# Race distribution visualization
race_plot <- ggplot(race_distribution, 
                    aes(x = exclusion_category, y = percent, fill = race)) +
  geom_col(position = "stack", alpha = 0.8) +
  scale_fill_viridis_d() +
  labs(
    title = "Race Distribution by Inclusion/Exclusion Category",
    subtitle = "Racial composition across all categories",
    x = "Category",
    y = "Percentage (%)",
    fill = "Race"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    legend.position = "right",
    plot.title = element_text(face = "bold"),
    panel.grid.minor = element_blank(),
    axis.title = element_text(face = "bold"),
    axis.text.x = element_text(angle = 45, hjust = 1)
  ) +
  # Add percentage labels
  geom_text(
    aes(label = sprintf("%.1f%%", percent)),
    position = position_stack(vjust = 0.5),
    color = "white",
    fontface = "bold",
    size = 3
  )

# Regional distribution with all categories
region_distribution <- demographic_analysis %>%
  filter(!is.na(Region)) %>%
  group_by(exclusion_category, Region) %>%
  summarise(count = n(), .groups = "drop") %>%
  group_by(exclusion_category) %>%
  mutate(
    percent = count / sum(count) * 100,
    total = sum(count)
  ) %>%
  ungroup() %>%
  arrange(exclusion_category, desc(count))

# Region distribution visualization
region_plot <- ggplot(region_distribution, 
                      aes(x = exclusion_category, y = percent, fill = Region)) +
  geom_col(position = "stack", alpha = 0.8) +
  scale_fill_viridis_d() +
  labs(
    title = "Regional Distribution by Inclusion/Exclusion Category",
    subtitle = "Geographic patterns across all categories",
    x = "Category",
    y = "Percentage (%)",
    fill = "Region"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    legend.position = "right",
    plot.title = element_text(face = "bold"),
    panel.grid.minor = element_blank(),
    axis.title = element_text(face = "bold"),
    axis.text.x = element_text(angle = 45, hjust = 1)
  ) +
  # Add percentage labels
  geom_text(
    aes(label = sprintf("%.1f%%", percent)),
    position = position_stack(vjust = 0.5),
    color = "white",
    fontface = "bold",
    size = 3
  )

# Dual coverage with all categories
dual_coverage <- demographic_analysis %>%
  filter(!is.na(dual_cvrg)) %>%
  group_by(exclusion_category, dual_cvrg) %>%
  summarise(count = n(), .groups = "drop") %>%
  group_by(exclusion_category) %>%
  mutate(
    percent = count / sum(count) * 100,
    total = sum(count),
    coverage_status = ifelse(dual_cvrg == 1, "Dual Coverage", "Medicare Only")
  ) %>%
  ungroup() %>%
  arrange(exclusion_category, desc(count))

# Dual coverage visualization
dual_plot <- ggplot(dual_coverage, 
                    aes(x = exclusion_category, y = percent, fill = coverage_status)) +
  geom_col(position = "fill", alpha = 0.8) +
  scale_fill_manual(values = c("Medicare Only" = "#2c7bb6", "Dual Coverage" = "#d7191c")) +
  labs(
    title = "Dual Coverage Status by Inclusion/Exclusion Category",
    subtitle = "Proportion with Medicare/Medicaid dual coverage across all categories",
    x = "Category",
    y = "Percentage",
    fill = "Coverage Status"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    legend.position = "right",
    plot.title = element_text(face = "bold"),
    panel.grid.minor = element_blank(),
    axis.title = element_text(face = "bold"),
    axis.text.x = element_text(angle = 45, hjust = 1)
  ) +
  scale_y_continuous(labels = scales::percent_format()) +
  # Add percentage labels
  geom_text(
    data = dual_coverage %>% 
      group_by(exclusion_category, coverage_status) %>%
      summarise(
        percent = percent,
        y_pos = ifelse(coverage_status == "Dual Coverage", 0.1, 0.9),
        .groups = "drop"
      ),
    aes(y = y_pos, label = sprintf("%.1f%%", percent)),
    color = "white",
    fontface = "bold"
  )

# Combine demographic plots with improved layout
demographic_plots <- (age_plot + race_plot) / (region_plot + dual_plot) +
  plot_annotation(
    title = "Demographic Characteristics of Excluded vs. Included Patients",
    subtitle = "Examining potential biases introduced by 5% trimming",
    theme = theme(
      plot.title = element_text(size = 16, face = "bold"),
      plot.subtitle = element_text(size = 12)
    )
  )

# Save the improved demographic plots
ggsave(
  paste0(output_dir, "improved_demographic_analysis.png"),
  demographic_plots,
  width = 14,
  height = 10,
  dpi = 300
)

# Format costs for tables - rounding to whole dollars
overall_stats <- original_data %>%
  group_by(exclusion_category) %>%
  summarise(
    count = n(),
    percent = n() / nrow(original_data) * 100,
    mean_cost = round(mean(Total_cost)), # Rounded to whole dollars
    median_cost = round(median(Total_cost)), # Rounded to whole dollars
    min_cost = round(min(Total_cost)),
    max_cost = round(max(Total_cost)),
    sd_cost = round(sd(Total_cost))
  ) %>%
  arrange(factor(exclusion_category, levels = c(
    "Excluded (bottom 5%)", "Included (middle 90%)", "Excluded (top 5%)"
  )))

# Print the rounded stats to console
print(overall_stats)

6 A tibble: 3 × 8

exclusion_category count percent mean_cost median_cost min_cost max_cost 1 Excluded (bottom 5%) 921 4.97 8 3 0 39 2 Included (middle 90%) 16695 90.0 861 652 40 2861 3 Excluded (top 5%) 928 5.00 4396 3704 2864 39429 # ℹ 1 more variable: sd_cost

# Create a summary visualization that shows all three improved plots
combined_improved_plots <- cost_density_plot / trimming_viz / 
  (lower_threshold_viz + plot_spacer()) +
  plot_layout(heights = c(1, 1, 0.8)) +
  plot_annotation(
    title = "Improved: Impact of 5% Trimming on Healthcare Cost Analysis",
    subtitle = paste0("Lower threshold: $", round(lower_threshold), 
                      ", Upper threshold: $", round(upper_threshold)),
    theme = theme(
      plot.title = element_text(size = 16, face = "bold"),
      plot.subtitle = element_text(size = 12)
    )
  )

# Save the combined improved visualizations
ggsave(
  paste0(output_dir, "combined_improved_visualizations.png"),
  combined_improved_plots,
  width = 12,
  height = 16,
  dpi = 300
)

# Report on completion
cat("Improved visualizations have been created and saved to:", output_dir, "\n")

Improved visualizations have been created and saved to: Data/CADR_2023/final_push/

cat("Files created:\n")

Files created:

cat("- improved_cost_density_plot.png\n")

improved_cost_density_plot.png

cat("- improved_trimming_viz.png\n")

improved_trimming_viz.png

cat("- lower_threshold_closeup.png\n")

lower_threshold_closeup.png

cat("- improved_demographic_analysis.png\n")

improved_demographic_analysis.png

cat("- combined_improved_visualizations.png\n")

combined_improved_visualizations.png

7 RESULTS SECTION

8 PATIENT CHARACTERISTICS

8.0.1 Testing if treatment distribution differs from equal distribution

# Testing if treatment distribution differs from equal distribution
observed <- c(7717, 5556, 3422)  # Pessary, PFMT, Sling
expected <- rep(sum(observed)/3, 3)  # Equal distribution expectation
chisq.test(observed, p = expected/sum(expected))

Chi-squared test for given probabilities

data: observed X-squared = 1657.4, df = 2, p-value < 2.2e-16

first_paragraph <- generate_sui_paragraph(
 cohort_path = "Data/CADR_2023/final_push/_Cohort_descriptors.rds",
 linear_reg_path = "Data/CADR_2023/final_push/Linear regression pretty.rds",
 verbose = TRUE
); first_paragraph

[1] “The analyzed cohort included 16,695 female Medicare beneficiaries diagnosed with urinary incontinence between January 1, 2008, and June 30, 2016. Vaginal pessary was the most frequently used treatment for SUI, accounting for 46.22% (n=7,717) of cases, followed by PFMT at 33.28% (n=5,556), and midurethral or pubovaginal sling surgery at 20.50% (n=3,422). The median age of participants was 76 years (IQR: 70-82). Age distribution differed significantly across the treatment groups (χ2, P<0.01). Patients receiving PT had a median weighted Elixhauser score of 9.0 [IQR: 4.0-16.0] and unweighted score of 3.0 [IQR: 2.0-5.0]. Patients receiving Pessary had a median weighted Elixhauser score of 9.0 [IQR: 4.0-16.0] and unweighted score of 3.0 [IQR: 2.0-5.0]. Patients receiving Sling had a median weighted Elixhauser score of 9.0 [IQR: 4.0-15.0] and unweighted score of 3.0 [IQR: 2.0-5.0]. Elixhauser scores differed significantly between treatment groups (weighted: P=0.35; unweighted: P=0.01).”

Double checking: Results Section: “Vaginal pessary was the most frequently used treatment for SUI, accounting for ??.?% (n=????) of cases, followed by PFMT at ??.?% (n=????), and midurethral or pubovaginal sling surgery at ??.?% (n=????).”

# Print the sentence
cat(final_sentence, "\n")

Vaginal pessary was the most frequently used treatment for SUI, accounting for 46.2% (n=7,717) of cases, followed by PFMT at 33.3% (n=5,556), and midurethral or pubovaginal sling surgery at 20.5% (n=3,422).

#Vaginal pessary was the most frequently used treatment for SUI, accounting for 46.2% (n=7,717) of cases, followed by PFMT at 33.3% (n=5,556), and midurethral or pubovaginal sling surgery at 20.5% (n=3,422).

Double checking: Results Section: Results Section: “The median age of participants was 76 years (IQR: 70-82), and the majority were White (92.9%, n=17,233). Black individuals comprised 2.3% (n=426) of the cohort, while in aggregate Hispanic, Asian, and other racial groups accounted for 4.8% (n=885).”

# Construct the sentence
sentence <- sprintf(
  "The median age of participants was %d years (IQR: %d-%d), and the majority were White (%.1f%%, n=%d). Black individuals comprised %.1f%% (n=%d) of the cohort, while in aggregate Hispanic, Asian, and other racial groups accounted for %.1f%% (n=%d).",
  age_stats$median_age, age_stats$IQR_low, age_stats$IQR_high,
  p_white, n_white, p_black, n_black, p_other, n_other
)

# Print the result
print(sentence)

[1] “The median age of participants was 76 years (IQR: 70-82), and the majority were White (92.9%, n=15510). Black individuals comprised 3.7% (n=622) of the cohort, while in aggregate Hispanic, Asian, and other racial groups accounted for 3.4% (n=563).”

Results section: “Age and race distributions differed significantly across the treatment groups (χ2, P<0.01 for both).”

# Construct the sentence dynamically
sentence <- sprintf(
  "Age and race distributions differed significantly across the treatment groups (χ2, P%s for age; χ2, P%s for race).",
  p_text_age, p_text_race
)

# Print the result
print(sentence)

[1] “Age and race distributions differed significantly across the treatment groups (χ2, P<0.01 for age; χ2, P<0.01 for race).”

8.1 Elixhauser Supplemental Table

\[ E_w = \sum_{i=1}^{30} w_i \cdot C_i \] Where: - $E_w$ is the total weighted comorbidity score. - $w_i$ is the weight assigned to comorbidity $i$ (positive or negative based on its impact). - $C_i$ is an indicator variable that equals 1 if the patient has the comorbidity, otherwise 0.

# Example of how to use the function
word_file_path <- "Supplemental_Elixhauser_Weights.docx"
generate_elixhauser_weights_table(word_file_path, verbose = TRUE)

[1] “Supplemental_Elixhauser_Weights.docx”

browseURL("Supplemental_Elixhauser_Weights.docx")

9 Trend Analysis of SUI Treatments from 2008 to 2016

9.1 Cochran-Armitage Analysis: Trend Analysis of SUI Treatments from 2008 to 2016

Create dataframe for Cochrane-Armitage

yearly_summary <- readr::read_rds("Data/CADR_2023/final_push/yearly_summary_Cochran_armitage.rds"); yearly_summary

9.2 Cochran-Armitage Analysis: PFMT usage increased from xx% in 2008 to xxx% in 2016

$$ Z = {}

c = \

p_j = j \

p = \

w_j = j j \

n_j = j \

N = $$

# Generate and display sentences
generated_sentences <- generate_ca_sentences(example_results)
cat(paste(generated_sentences, collapse = " "), "\n")

PFMT showed a increased trend from 27% in 2008 to 31% in 2016 (Cochran-Armitage Test, p<0.01). A declined trend in Sling was observed, rising from 30% in 2008 to 10% in 2016 (Cochran-Armitage Test, p<0.01). From 2008 to 2016, Pessary increased from 42% to 48% (Cochran-Armitage Test, p=0.02).

9.3 Cochran-Armitage Graph function

Cochran-Armitage Graph results

yearly_summary <- readr::read_rds("Data/CADR_2023/final_push/yearly_summary_Cochran_armitage.rds")

# Assuming you have your yearly_summary data from the previous steps
plots <- create_cochran_plots(
  summary_data = yearly_summary,
  output_dir = "Data/CADR_2023/results",
  verbose = TRUE
)

# The plots are also stored in the returned list if you want to display them
plots$pessary   # Display pessary comparison plot

plots$pt        # Display PT comparison plot

plots$sling     # Display sling comparison plot

9.4 Figure 1

# Display the plot
print(Cochrane_all_three_treatments_one_graph)

9.5 Cochran-Armitage Analysis: The absolute annual percentage increase was X% (95% CI: Y-Z).

$$ = ( )

\[10pt]

X_t = t \

X_{t-1} = (t-1)

\[10pt]

# Generate sentences dynamically
sentences <- lapply(names(apc_results), function(treatment) {
  generate_sentence(treatment, apc_results[[treatment]])
})

# Print results
cat(paste(sentences, collapse = "\n"), "\n")

The annual percentage change (AAPC) for Pessary increased at an average rate of 0.82% per year (95% CI: -0.44% to 2.1%, p=0.17). The annual percentage change (AAPC) for PFMT increased at an average rate of 7.56% per year (95% CI: 5.91% to 9.23%, p<0.01). The annual percentage change (AAPC) for Sling decreased at an average rate of -14.66% per year (95% CI: -17.72% to -11.49%, p<0.01).

9.6 Cochran-Armitage: “The observed trends remained significant after adjusting for age, race, and comorbidities (data not shown).”

# Print results
cat("Adjusted AAPC for Pessary:", round(apc_pessary_adj, 2), "% (95% CI:", 
    round(confint_pessary_adj[1], 2), "% to", round(confint_pessary_adj[2], 2), 
    "), p=", ifelse(p_value_pessary_adj < 0.01, "p<0.01", round(p_value_pessary_adj, 2)), "\n")

Adjusted AAPC for Pessary: 4.65 % (95% CI: 2.6 % to 6.73 ), p= p<0.01

cat("Adjusted AAPC for PT:", round(apc_pt_adj, 2), "% (95% CI:", 
    round(confint_pt_adj[1], 2), "% to", round(confint_pt_adj[2], 2), 
    "), p=", ifelse(p_value_pt_adj < 0.01, "p<0.01", round(p_value_pt_adj, 2)), "\n")

Adjusted AAPC for PT: 17.48 % (95% CI: 15.26 % to 19.73 ), p= p<0.01

cat("Adjusted AAPC for Sling:", round(apc_sling_adj, 2), "% (95% CI:", 
    round(confint_sling_adj[1], 2), "% to", round(confint_sling_adj[2], 2), 
    "), p=", ifelse(p_value_sling_adj < 0.01, "p<0.01", round(p_value_sling_adj, 2)), "\n")

Adjusted AAPC for Sling: -18.66 % (95% CI: -19.94 % to -17.35 ), p= p<0.01

9.7 Cochran-Armitage: After adjusting for age, race, and comorbidities, the AAPC for PFMT ??increased to xx% per year (95% CI: ??% to ??%, p<??), indicating a more pronounced upward trend, while the AAPC for sling surgeries

# Extract Adjusted AAPC Results from your model output
sentence_pessary <- generate_aapc_sentence("pessary treatments", apc_pessary_adj, confint_pessary_adj, p_value_pessary_adj)
sentence_pt <- generate_aapc_sentence("PFMT", apc_pt_adj, confint_pt_adj, p_value_pt_adj)
sentence_sling <- generate_aapc_sentence("sling surgeries", apc_sling_adj, confint_sling_adj, p_value_sling_adj)

# Print results
cat(sentence_pessary, "\n")

After adjusting for age, race, and comorbidities, the AAPC for pessary treatments increased to 4.65% per year (95% CI: 2.6% to 6.73%, p<0.01).

cat(sentence_pt, "\n")

After adjusting for age, race, and comorbidities, the AAPC for PFMT increased to 17.48% per year (95% CI: 15.26% to 19.73%, p<0.01).

cat(sentence_sling, "\n")

After adjusting for age, race, and comorbidities, the AAPC for sling surgeries declined to 18.66% per year (95% CI: -19.94% to -17.35%, p<0.01).

9.8 Cochran-Armitage: Supplemental table of non-adjusted and adjusted APC for supplementary data. Supplemental table.

print(ft)

a flextable object. col_keys: Stress Urinary Incontinence Treatment, Annual Percentage Change, Annual Percentage Change 95% CI, APC p-value, Adjusted APC, Adjusted APC 95% CI, Adjusted APC p-value header has 1 row(s) body has 3 row(s) original dataset sample: Stress Urinary Incontinence Treatment Annual Percentage Change 1 Pessary 0.8 2 Pelvic Floor Muscle Training 7.6 3 Sling -14.7 Annual Percentage Change 95% CI APC p-value Adjusted APC Adjusted APC 95% CI 1 -0.4% to 2.1% p=0.17 4.6 2.6% to 6.7% 2 5.9% to 9.2% p<0.01 17.5 15.3% to 19.7% 3 -17.7% to -11.5% p<0.01 -18.7 -19.9% to -17.4% Adjusted APC p-value 1 p<0.01 2 p<0.01 3 p<0.01

10 VAGINAL PESSARY

10.1 Pessary: Initial SUI Treatment

Vaginal Pessary for Initial SUI Treatment : “Patients receiving a vaginal pessary had a median age of xxx years, xx years older than the overall cohort median age of xx years (p <xx).”

# Load the dataset
Cohort_descriptors <- read_rds("Data/CADR_2023/final_push/_Cohort_descriptors.rds")

Cohort_descriptors <- Cohort_descriptors %>%
  dplyr::mutate(sui_treatment = case_when(
    str_detect(sui_treatment, "Pessary") ~ "Pessary",
    str_detect(sui_treatment, "PT") ~ "PFMT",
    str_detect(sui_treatment, "Sling") ~ "Sling",
    str_detect(sui_treatment, "Burch") ~ "Burch",
    TRUE ~ "Other"  # Catch any other cases
  ))

Cohort_descriptors$sui_treatment <- as.factor(Cohort_descriptors$sui_treatment)
levels(Cohort_descriptors$sui_treatment)

[1] “Pessary” “PFMT” “Sling”

filtered_data <- Cohort_descriptors %>%
  filter(sui_treatment %in% c("Pessary", "Sling"))

# Compute median age for the full cohort
overall_median_age <- median(Cohort_descriptors$age, na.rm = TRUE)

# Compute median age for Pessary patients
pessary_median_age <- Cohort_descriptors %>%
  filter(sui_treatment == "Pessary") %>%
  summarise(median_age = median(age, na.rm = TRUE)) %>%
  pull(median_age)

# Calculate the age difference
age_difference <- abs(pessary_median_age - overall_median_age)

# Compute Wilcoxon test for significance
p_value <- wilcox.test(age ~ sui_treatment, 
                       data = Cohort_descriptors %>% filter(sui_treatment %in% c("Pessary", "Sling")),
                       exact = FALSE)$p.value

# Round p-value to 0.01 if it's below 0.01
formatted_p_value <- ifelse(p_value < 0.01, 0.01, round(p_value, 3))

# Generate the sentence dynamically
sentence <- sprintf(
  "Patients receiving a vaginal pessary had a median age of %d years, %d years older than the overall cohort median age of %d years (p < %.2f).",
  pessary_median_age, age_difference, overall_median_age, formatted_p_value
)

# Print the sentence
cat(sentence, "\n")

Patients receiving a vaginal pessary had a median age of 78 years, 2 years older than the overall cohort median age of 76 years (p < 0.01).

#Patients receiving a vaginal pessary had a median age of 78 years, 2 years older than the overall cohort median age of 76 years (p < 0.01).

“They had a similar number of medical conditions compared to the other treatment groups (?? [?-?] vs. ?? [?-?], p=???).” All of the medical conditions are in Cohort_descriptors <- read_rds(“Data/CADR_2023/final_push/Cohort_descriptors.rds”) and the column names start with ”elix”. These are elixhauser variables that are numeric. They should be added together for a score for each unique WU_ID number.

10.2 Pessary: Unweighted Elixhauser Score (Basic Count of Comorbidities)

Cohort_descriptors <- read_rds("Data/CADR_2023/final_push/_Cohort_descriptors.rds") 

# Identify all Elixhauser columns (binary comorbidities)
elix_cols <- names(Cohort_descriptors)[grepl("^elix_", names(Cohort_descriptors))]

# Compute the unweighted Elixhauser score (sum of conditions per patient)
Cohort_descriptors <- Cohort_descriptors %>%
  dplyr::mutate(elix_score_unweighted = rowSums(select(., all_of(elix_cols)), na.rm = TRUE))

# Check summary statistics
summary(Cohort_descriptors$elix_score_unweighted)

Min. 1st Qu. Median Mean 3rd Qu. Max. 0.000 2.000 3.000 3.511 5.000 16.000

10.3 Pessary: Compute the Weighted Elixhauser Score

The van Walraven weights are commonly used to compute a weighted Elixhauser score. Below are the weight assignments for each comorbidity:

Comorbidity	Weight
Congestive Heart Failure (`elix_chf`)	7
Cardiac Arrhythmias (`elix_neuro`)	5
Valvular Disease (`elix_valve`)	4
Pulmonary Circulation Disorders (`elix_pulmcirc`)	3
Peripheral Vascular Disease (`elix_perivasc`)	2
Hypertension (`elix_htn`)	-1
Paralysis (`elix_para`)	6
Other Neurological Disorders (`elix_neuro`)	5
Chronic Pulmonary Disease (`elix_chrnlung`)	3
Diabetes w/o Complications (`elix_diabet`)	0
Diabetes w/ Complications (`elix_ckd`)	2
Hypothyroidism (`elix_hypothy`)	0
Renal Failure (`elix_ckd`)	5
Liver Disease (`elix_liver`)	4
Peptic Ulcer Disease (`elix_tumor`)	0
Lymphoma (`elix_lymph`)	7
Metastatic Cancer (`elix_mets`)	12
Solid Tumor w/o Metastasis (`elix_tumor`)	4
Rheumatoid Arthritis (`elix_arth`)	0
Coagulopathy (`elix_coag`)	8
Obesity (`elix_obese`)	-3
Weight Loss (`elix_wghtloss`)	6
Fluid & Electrolyte Disorders (`elix_lytes`)	5
Blood Loss Anemia (`elix_bldloss`)	3
Deficiency Anemia (`elix_anemdef`)	3
Alcohol Abuse (`elix_alcohol`)	0
Drug Abuse (`elix_drug`)	0
Psychoses (`elix_psych`)	0
Depression (`elix_depress`)	-3

Method	Interpretation
Unweighted Elixhauser Score	Simply counts the number of comorbidities a patient has
Weighted Elixhauser Score (van Walraven Weights)	Applies risk-adjusted weights based on the impact of each comorbidity on mortality or hospital readmission

# Define weight mapping for Elixhauser comorbidities
elix_weights <- c(
  elix_chf = 7, elix_neuro = 5, elix_valve = 4, elix_pulmcirc = 3, elix_perivasc = 2, 
  elix_htn = -1, elix_para = 6, elix_neuro = 5, elix_chrnlung = 3, elix_diabet = 0,
  elix_ckd = 2, elix_hypothy = 0, elix_ckd = 5, elix_liver = 4, elix_tumor = 0,
  elix_lymph = 7, elix_mets = 12, elix_tumor = 4, elix_arth = 0, elix_coag = 8,
  elix_obese = -3, elix_wghtloss = 6, elix_lytes = 5, elix_bldloss = 3, elix_anemdef = 3,
  elix_alcohol = 0, elix_drug = 0, elix_psych = 0, elix_depress = -3
)

# Compute the weighted Elixhauser score
Cohort_descriptors <- Cohort_descriptors %>%
  dplyr::mutate(elix_score_weighted = rowSums(select(., all_of(names(elix_weights))) * elix_weights, na.rm = TRUE))

# Check summary statistics for the weighted score
summary(Cohort_descriptors$elix_score_weighted)

Min. 1st Qu. Median Mean 3rd Qu. Max. -7.00 3.00 9.00 10.52 16.00 64.00

#Now that we have both unweighted and weighted scores, we can compare treatment groups.

Cohort_descriptors <- Cohort_descriptors %>%
  dplyr::mutate(sui_treatment = case_when(
    str_detect(sui_treatment, "Pessary") ~ "Pessary",
    str_detect(sui_treatment, "PT") ~ "PFMT",
    str_detect(sui_treatment, "Sling") ~ "Sling",
    str_detect(sui_treatment, "Burch") ~ "Burch",
    TRUE ~ "Other"  # Catch any other cases
  ))

# Compare Elixhauser Scores Between Pessary vs. Other Treatment Groups
pessary_stats <- Cohort_descriptors %>%
  filter(sui_treatment == "Pessary") %>%
  summarise(
    median_score = median(elix_score_weighted, na.rm = TRUE),
    iqr_lower = quantile(elix_score_weighted, 0.25, na.rm = TRUE),
    iqr_upper = quantile(elix_score_weighted, 0.75, na.rm = TRUE)
  )

other_stats <- Cohort_descriptors %>%
  filter(sui_treatment != "Pessary") %>%
  summarise(
    median_score = median(elix_score_weighted, na.rm = TRUE),
    iqr_lower = quantile(elix_score_weighted, 0.25, na.rm = TRUE),
    iqr_upper = quantile(elix_score_weighted, 0.75, na.rm = TRUE)
  )

# Perform Wilcoxon test for significance
p_value <- wilcox.test(elix_score_weighted ~ sui_treatment, 
                       data = Cohort_descriptors %>% filter(sui_treatment %in% c("Pessary", "Sling")),
                       exact = FALSE)$p.value

# Round p-value to 0.01 if below 0.01
formatted_p_value <- ifelse(p_value < 0.01, 0.01, round(p_value, 3))

# Generate the dynamic sentence
sentence <- sprintf(
  "They had a similar number of medical conditions compared to the other treatment groups (%d [%d-%d] vs. %d [%d-%d], p=%s).",
  pessary_stats$median_score, pessary_stats$iqr_lower, pessary_stats$iqr_upper,
  other_stats$median_score, other_stats$iqr_lower, other_stats$iqr_upper,
  formatted_p_value
)

# Print the sentence
cat(sentence, "\n")

They had a similar number of medical conditions compared to the other treatment groups (9 [4-16] vs. 9 [3-15], p=0.01).

10.4 Pessary treatments were predominantly used by White beneficiaries, with 92.2% of pessary users identifying as White, consistent with the overall cohort demographics.

# Identify the most common race among pessary users
pessary_race_counts <- Cohort_descriptors %>%
  filter(sui_treatment == "Pessary") %>%
  count(race, sort = TRUE) 

# Extract the predominant race and its count
predominant_race <- pessary_race_counts$race[1]
predominant_race_count <- pessary_race_counts$n[1]

# Total number of pessary users
total_pessary_count <- sum(pessary_race_counts$n)

# Calculate percentage of the predominant race among pessary users
predominant_race_percent <- round((predominant_race_count / total_pessary_count) * 100, 1)

# Calculate the percentage of this race in the overall cohort
overall_race_count <- Cohort_descriptors %>%
  filter(race == predominant_race) %>%
  nrow()

total_cohort_count <- nrow(Cohort_descriptors)
overall_race_percent <- round((overall_race_count / total_cohort_count) * 100, 1)

# Generate the sentence
sentence <- sprintf(
  "Pessary treatments were predominantly used by %s beneficiaries, with %.1f%% of pessary users identifying as %s, consistent with the overall cohort demographics (%.1f%% %s).",
  predominant_race, predominant_race_percent, predominant_race, overall_race_percent, predominant_race
)

# Print the sentence
cat(sentence, "\n")

Pessary treatments were predominantly used by White beneficiaries, with 92.3% of pessary users identifying as White, consistent with the overall cohort demographics (92.9% White).

10.5 Pessary: “Among reported clinicians, Obstetrics/gynecology were the most frequent providers of pessary fittings, performing 29.50% (n=2291) of these procedures.”

# Load the provider specialty CSV file
provider_specialty <- readr::read_rds("Data/CADR_2023/final_push/Provider_specialty.rds")

# Filter for pessary treatments
pessary_df <- provider_specialty %>%
  filter(sui_treatment == "Pessary")

# Count total pessary fittings
total_pessary <- nrow(pessary_df)

# Find the specialty with the most pessary fittings (excluding NA)
top_specialty_summary <- pessary_df %>%
  filter(!is.na(specialty_pre6m_specialty_car_and_fac)) %>%
  count(specialty_pre6m_specialty_car_and_fac, sort = TRUE) %>%
  slice_max(n, n = 1)

top_specialty <- top_specialty_summary$specialty_pre6m_specialty_car_and_fac
top_specialty_count <- top_specialty_summary$n

# Calculate the percentage
top_specialty_percentage <- round((top_specialty_count / total_pessary) * 100, 1)

# Generate the sentence dynamically
sentence <- sprintf(
  "Among reported clinicians, %s were the most frequent providers of pessary fittings, performing %.2f%% (n=%d) of these procedures.",
  top_specialty,
  top_specialty_percentage,
  top_specialty_count
)

# Print the sentence
print(sentence)

[1] “Among reported clinicians, Obstetrics/gynecology were the most frequent providers of pessary fittings, performing 29.60% (n=2286) of these procedures.”

10.6 Pessary: The median cost per patient for pessary treatment was $xxx (IQR $xx–$xx), which included provider professional fees and facility fees.

df_costs <- readRDS("/Users/tylermuffly/Dropbox (Personal)/AlternativePayments/alternative_payments/Data/CADR_2023/final_push/_Costs_step33.rds")
glimpse(df_costs)

Rows: 160,323 Columns: 81 Groups: WU_ID [16,695] $ sui_treatment “PT”, “PT”, “PT”, “PT”, “PT”, “PT”, “… $ OP_facility_new_lvl1_cost NA, NA, NA, NA, NA, NA, NA, NA, NA, N… $ professional_new_lvl1_cost NA, NA, NA, NA, NA, NA, NA, NA, NA, N… $ OP_facility_new_lvl2_cost NA, NA, NA, NA, NA, NA, NA, NA, NA, N… $ professional_new_lvl2_cost NA, NA, NA, NA, NA, NA, NA, NA, NA, N… $ OP_facility_new_lvl3_cost NA, NA, NA, NA, NA, NA, NA, NA, NA, N… $ professional_new_lvl3_cost NA, NA, NA, NA, NA, NA, NA, NA, NA, N… $ OP_facility_new_lvl4_cost NA, NA, NA, NA, NA, NA, NA, NA, NA, N… $ professional_new_lvl4_cost NA, NA, NA, NA, NA, NA, NA, NA, NA, N… $ OP_facility_new_lvl5_cost NA, NA, NA, NA, NA, NA, NA, NA, NA, N… $ professional_new_lvl5_cost NA, NA, NA, NA, NA, NA, NA, NA, NA, N… $ OP_facility_return_lvl1_cost NA, NA, NA, NA, NA, NA, NA, NA, NA, N… $ professional_return_lvl1_cost NA, NA, NA, NA, NA, NA, NA, NA, NA, N… $ OP_facility_return_lvl2_cost NA, NA, NA, NA, NA, NA, NA, NA, NA, N… $ professional_return_lvl2_cost NA, NA, NA, NA, NA, NA, NA, NA, NA, N… $ OP_facility_return_lvl3_cost NA, NA, NA, NA, NA, NA, NA, NA, NA, N… $ professional_return_lvl3_cost NA, NA, NA, NA, NA, NA, NA, NA, NA, N… $ OP_facility_return_lvl4_cost NA, NA, NA, NA, NA, NA, NA, NA, NA, N… $ professional_return_lvl4_cost 110, NA, NA, NA, NA, NA, 111, 111, 94… $ OP_facility_return_lvl5_cost NA, NA, NA, NA, NA, NA, NA, NA, NA, N… $ professional_return_lvl5_cost NA, NA, NA, NA, NA, NA, NA, NA, NA, N… $ OP_facility_consult_lvl1_cost NA, NA, NA, NA, NA, NA, NA, NA, NA, N… $ professional_consult_lvl1_cost NA, NA, NA, NA, NA, NA, NA, NA, NA, N… $ OP_facility_consult_lvl2_cost NA, NA, NA, NA, NA, NA, NA, NA, NA, N… $ professional_consult_lvl2_cost NA, NA, NA, NA, NA, NA, NA, NA, NA, N… $ OP_facility_consult_lvl3_cost NA, NA, NA, NA, NA, NA, NA, NA, NA, N… $ professional_consult_lvl3_cost NA, NA, NA, NA, NA, NA, NA, NA, NA, N… $ OP_facility_consult_lvl4_cost NA, NA, NA, NA, NA, NA, NA, NA, NA, N… $ professional_consult_lvl4_cost NA, NA, NA, NA, NA, NA, NA, NA, NA, N… $ OP_facility_consult_lvl5_cost NA, NA, NA, NA, NA, NA, NA, NA, NA, N… $ professional_consult_lvl5_cost NA, NA, NA, NA, NA, NA, NA, NA, NA, N… $ OP_facility_UA_cost NA, NA, NA, NA, NA, NA, NA, NA, NA, N… $ professional_UA_cost NA, NA, NA, NA, NA, NA, NA, NA, NA, N… $ OP_facility_urineCX_cost NA, NA, NA, NA, NA, NA, NA, NA, NA, N… $ professional_urineCX_cost NA, 11, NA, NA, NA, NA, NA, NA, NA, N… $ OP_facility_microscopy_cost NA, NA, NA, NA, NA, NA, NA, NA, NA, N… $ professional_microscopy_cost NA, 4, NA, NA, NA, NA, NA, NA, NA, NA… $ OP_facility_cathPVR_cost NA, NA, NA, NA, NA, NA, NA, NA, NA, N… $ professional_cathPVR_cost NA, NA, 56, NA, NA, NA, NA, NA, NA, N… $ OP_facility_usPVR_cost NA, NA, NA, NA, NA, NA, NA, NA, NA, N… $ professional_usPVR_cost NA, NA, NA, NA, NA, NA, NA, NA, NA, N… $ OP_facility_uroflowmetry_cost NA, NA, NA, NA, NA, NA, NA, NA, NA, N… $ professional_uroflowmetry_cost NA, NA, NA, NA, NA, NA, NA, NA, NA, 7… $ OP_facility_cystometrics_cost NA, NA, NA, NA, NA, NA, NA, NA, NA, N… $ professional_cystometrics_cost NA, NA, NA, NA, NA, NA, NA, NA, NA, N… $ OP_facility_UDS_cost NA, NA, NA, NA, NA, NA, NA, NA, NA, N… $ professional_UDS_cost NA, NA, NA, NA, NA, NA, NA, NA, NA, 4… $ OP_facility_vUDS_cost NA, NA, NA, NA, NA, NA, NA, NA, NA, N… $ professional_vUDS_cost NA, NA, NA, NA, NA, NA, NA, NA, NA, N… $ OP_facility_cystoscopy_cost NA, NA, NA, NA, NA, NA, NA, NA, NA, N… $ professional_cystoscopy_cost NA, NA, NA, NA, NA, NA, NA, NA, NA, N… $ OP_facility_UTeval_cost NA, NA, NA, NA, NA, NA, NA, NA, NA, N… $ professional_UTeval_cost NA, NA, NA, NA, NA, NA, NA, NA, NA, N… $ IP_facility_cost NA, NA, NA, NA, NA, NA, NA, NA, NA, N… $ OP_facility_cost NA, NA, NA, NA, NA, NA, NA, NA, NA, N… $ ASC_facility_cost NA, NA, NA, NA, NA, NA, NA, NA, NA, N… $ professional_cost NA, NA, NA, 38, 29, 38, NA, NA, NA, N… $ OP_facility_anesth_cost NA, NA, NA, NA, NA, NA, NA, NA, NA, N… $ professional_anesth_cost NA, NA, NA, NA, NA, NA, NA, NA, NA, N… $ OP_facility_complication_cost NA, NA, NA, NA, NA, NA, NA, NA, NA, N… $ edOP_facility_complication_cost NA, NA, NA, NA, NA, NA, NA, NA, NA, N… $ professional_complication_cost NA, NA, NA, NA, NA, NA, NA, NA, NA, N… $ edprofessional_complication_cost NA, NA, NA, NA, NA, NA, NA, NA, NA, N… $ edOP_facility_cost NA, NA, NA, NA, NA, NA, NA, NA, NA, N… $ edprofessional_cost NA, NA, NA, NA, NA, NA, NA, NA, NA, N… $ OP_facility_EM_cost NA, NA, NA, NA, NA, NA, NA, NA, NA, N… $ professional_EM_cost NA, NA, NA, NA, NA, NA, NA, NA, NA, N… $ Type of Cost ”Pre-index EM visit costs”, “Pre-inde… $ Pre or Post Index Procedure ”Pre”, “Pre”, “Pre”, “Subsequent cost… $ WU_ID 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3… $ episode_number ”1”, “1”, “1”, “1”, “1”, “1”, “1”, “1… $ year_episode_start.x 2015-01-01, 2015-01-01, 2015-01-01, … $ age 68, 68, 68, 68, 68, 68, 68, 68, 68, 6… $ race ”White”, “White”, “White”, “White”, “… $ dual_cvrg 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0… $ state ”New Hampshire”, “New Hampshire”, “Ne… $ Region ”Northeast”, “Northeast”, “Northeast”… $ Division “New England”, “New England”, “New En… $ episode_type ”PT”, “PT”, “PT”, “PT”, “PT”, “PT”, “… $ year_episode_start.y 2015, 2015, 2015, 2015, 2015, 2015, 2… $ Total_cost 1098, 1098, 1098, 1098, 1098, 1098, 1…

# Filter for pessary treatments
pessary_costs <- df_costs %>%
  filter(sui_treatment == "Pessary")

# Identify all cost columns dynamically (columns ending with "_cost" but NOT starting with "edOP_")
cost_columns <- grep("^(?!edOP_).*_cost$", colnames(pessary_costs), value = TRUE, perl = TRUE)

# Ensure no grouping before filtering missing cost values
pessary_costs <- pessary_costs %>% ungroup()

# Remove rows where all selected cost columns are missing
pessary_costs_filtered <- pessary_costs %>%
  filter(if_any(all_of(cost_columns), ~ !is.na(.)))

# Compute total cost per patient by summing across all selected cost columns
pessary_costs_filtered <- pessary_costs_filtered %>%
  dplyr::mutate(total_cost = rowSums(select(., all_of(cost_columns)), na.rm = TRUE))

# Calculate median and IQR with rounding to whole dollars
median_cost <- round(median(pessary_costs_filtered$total_cost, na.rm = TRUE), 0)
iqr_low <- round(quantile(pessary_costs_filtered$total_cost, 0.25, na.rm = TRUE), 0)
iqr_high <- round(quantile(pessary_costs_filtered$total_cost, 0.75, na.rm = TRUE), 0)

# Format numbers with thousandth commas
median_cost_formatted <- comma(median_cost, accuracy = 1)
iqr_low_formatted <- comma(iqr_low, accuracy = 1)
iqr_high_formatted <- comma(iqr_high, accuracy = 1)

# Generate the formatted result
sentence <- sprintf(
  "The median cost per patient for pessary treatment was $%s (IQR $%s–$%s), which included provider professional fees and facility fees.",
  median_cost_formatted, iqr_low_formatted, iqr_high_formatted
)

# Print the sentence
print(sentence)

[1] “The median cost per patient for pessary treatment was $766 (IQR $458–$1,244), which included provider professional fees and facility fees.”

#"The median cost per patient for pessary treatment was $766 (IQR $458–$1,244), which included provider professional fees and facility fees."

10.7 Pessary: Patients receiving additional procedures such as cystoscopy or bladder diagnostics alongside pessary treatment had higher median costs of $x (IQR: $xx–$xxx) compared to those who did not receive these procedures ($xxx, IQR: $xx–$xx, Mann-Whitney U test p < xx)

# Load the costs dataset
df_costs <- readRDS("Data/CADR_2023/final_push/_Costs.rds")

# Filter for pessary treatments
pessary_costs <- df_costs %>%
  filter(sui_treatment == "Pessary")

# Identify cost columns dynamically (excluding those starting with "edOP_")
cost_columns <- grep("^(?!edOP_).*_cost$", colnames(pessary_costs), value = TRUE, perl = TRUE)

# Filter for pessary treatments and compute total cost
pessary_costs_filtered <- pessary_costs %>%
  ungroup() %>%  # Ungroup to ensure row-wise operations work
  filter(if_any(all_of(cost_columns), ~ !is.na(.))) %>%
  rowwise() %>%  # Perform operations row-wise
  dplyr::mutate(
    total_cost = sum(c_across(all_of(cost_columns)), na.rm = TRUE),
    additional_procedure = if_else(
      !is.na(OP_facility_cystoscopy_cost) | !is.na(professional_cystoscopy_cost) |
      !is.na(OP_facility_UTeval_cost) | !is.na(professional_UTeval_cost),
      "With Procedures",
      "Without Procedures"
    )
  ) %>%
  ungroup()

# Calculate median costs and IQR for each group
cost_summary <- pessary_costs_filtered %>%
  group_by(additional_procedure) %>%
  summarise(
    median_cost = round(median(total_cost, na.rm = TRUE)),
    iqr_low = round(quantile(total_cost, 0.25, na.rm = TRUE)),
    iqr_high = round(quantile(total_cost, 0.75, na.rm = TRUE)),
    .groups = "drop"
  )

# Perform Mann-Whitney U test
p_value <- wilcox.test(
  total_cost ~ additional_procedure,
  data = pessary_costs_filtered
)$p.value

# Determine higher or lower costs
with_procedures <- cost_summary %>% filter(additional_procedure == "With Procedures")
without_procedures <- cost_summary %>% filter(additional_procedure == "Without Procedures")

cost_comparison <- if (with_procedures$median_cost > without_procedures$median_cost) {
  "higher"
} else {
  "lower"
}

# Format values with thousandth commas
median_with <- comma(with_procedures$median_cost, accuracy = 1)
iqr_with_low <- comma(with_procedures$iqr_low, accuracy = 1)
iqr_with_high <- comma(with_procedures$iqr_high, accuracy = 1)

median_without <- comma(without_procedures$median_cost, accuracy = 1)
iqr_without_low <- comma(without_procedures$iqr_low, accuracy = 1)
iqr_without_high <- comma(without_procedures$iqr_high, accuracy = 1)

p_value_formatted <- if (p_value < 0.01) "<0.01" else sprintf("%.2f", p_value)

# Generate the formatted sentence
sentence <- sprintf(
  "Patients receiving additional procedures such as cystoscopy or bladder diagnostics alongside pessary treatment had %s median costs of $%s (IQR: $%s–$%s) compared to those who did not receive these procedures ($%s, IQR: $%s–$%s, Mann-Whitney U test p %s).",
  cost_comparison,
  median_with, iqr_with_low, iqr_with_high,
  median_without, iqr_without_low, iqr_without_high,
  p_value_formatted
)

# Print the sentence
print(sentence)

[1] “Patients receiving additional procedures such as cystoscopy or bladder diagnostics alongside pessary treatment had higher median costs of $1,481 (IQR: $1,086–$1,927) compared to those who did not receive these procedures ($751, IQR: $454–$1,223, Mann-Whitney U test p <0.01).”

#"Patients receiving additional procedures such as cystoscopy or bladder diagnostics alongside pessary treatment had higher median costs of $1,481 (IQR: $1,086–$1,927) compared to those who did not receive these procedures ($751, IQR: $454–$1,223, Mann-Whitney U test p <0.01)."

10.9 Pessary: Better Overall Pessary

# # Perform Mann-Whitney U tests
# pessary_vs_pfmt_test <- wilcox.test(
#   total_patient_cost ~ sui_treatment,
#   data = treatment_summary,
#   exact = FALSE
# )
# 
# complication_cost_test <- wilcox.test(
#   complication_total ~ sui_treatment,
#   data = treatment_summary,
#   exact = FALSE
# )
# 
# log_info("Statistical tests completed")
# 
# # Format p-values
# format_p_value <- function(p_value) {
#   if (p_value < 0.01) return("<0.01")
#   return(sprintf("%.2f", p_value))
# }
# 
# # Get formatted values for output
# pessary_stats <- treatment_medians %>%
#   dplyr::filter(sui_treatment == "Pessary")
# pfmt_stats <- treatment_medians %>%
#   dplyr::filter(sui_treatment == "PFMT")
#

10.10 Pessary: Overall, pessary??? care remained less expensive compared to sling surgery??? (median per-patient cost: $xxx

# dat <- read_csv("Data/CADR_2023/final_push/_Costs_step33.csv")
# 
# result_text <- generate_treatment_cost_text(
#   treatment_cost_filepath = "Data/CADR_2023/final_push/_Costs_step33.csv",
#   verbose = TRUE,
#   cost_column = "Total_cost"
# ); result_text

10.11 Pessary: The median outpatient facility cost for pessary treatment was ???higher/???lower/same??? than that for ???sling surgery??? but higher/lower/same??? than ???PFMT??? (Table 2).

cost_data <- readr::read_csv("Data/CADR_2023/final_push/_Costs_step33.csv")

pessary_outpatient <- analyze_treatment_costs(cost_data, 
                                  treatment_groups = c("Pessary", "Sling", "PT"),
                                  verbose = TRUE)

pessary_outpatient$description

[1] “The median outpatient facility cost for sling treatment was $122 [IQR: $94-$178], which was similar to pessary $100 [IQR: $84-$179] (p=0.45) and similar to pt $100 [IQR: $82-$130] (p=0.66)”

10.12 Pessary: Complication cost

cost_data <- readr::read_csv("Data/CADR_2023/final_push/_Costs_step33.csv")
names(cost_data)

names(cost_data %>%
  select(matches("(?i)complication")))

complication_result <- compare_treatment_costs_iqr(
  treatment_data = cost_data,
  treatment_groups = c("Pessary", "Sling", "PT"),
  cost_columns = c("OP_facility_complication_cost", 
                  "professional_complication_cost",
                  "edOP_facility_complication_cost", 
                  "edprofessional_complication_cost"),
  verbose = TRUE
); complication_result

11 Pelvic Floor Muscle Therapy for Initial SUI Treatment

11.0.1 To verify if PFMT is more expensive than Pessary in all cases, you should compare total costs for both treatments across different statistics

# Load data
data <- readr::read_csv("Data/CADR_2023/final_push/_Costs_step33.csv", show_col_types = FALSE)

# Compute cost statistics
cost_summary <- data %>%
  filter(sui_treatment %in% c("PFMT", "Pessary")) %>%
  group_by(sui_treatment) %>%
  summarise(
    median_cost = median(Total_cost, na.rm = TRUE),
    IQR_lower = quantile(Total_cost, 0.25, na.rm = TRUE),
    IQR_upper = quantile(Total_cost, 0.75, na.rm = TRUE),
    min_cost = min(Total_cost, na.rm = TRUE),
    max_cost = max(Total_cost, na.rm = TRUE),
    mean_cost = mean(Total_cost, na.rm = TRUE)
  )

print(cost_summary)

12 A tibble: 1 × 7

sui_treatment median_cost IQR_lower IQR_upper min_cost max_cost mean_cost 1 Pessary 691 395 1143 40 2861 820.

12.1 PFMT is statistically less expensive than Pessary

# Subset data for the test
pfmt_costs <- data %>% filter(sui_treatment == "PT") %>% pull(Total_cost)
pessary_costs <- data %>% filter(sui_treatment == "Pessary") %>% pull(Total_cost)

# Perform test
wilcox_test <- wilcox.test(pfmt_costs, pessary_costs, alternative = "greater")

print(wilcox_test)

Wilcoxon rank sum test with continuity correction

data: pfmt_costs and pessary_costs W = 2029156250, p-value = 1.271e-10 alternative hypothesis: true location shift is greater than 0

12.2 PFMT: Patients undergoing PFMT had a median age of xx years (IQR: yy–yy), ??slightly ??younger/??older than those receiving ??pessary treatment (Mann-Whitney U test, p=???).

# Read in the data
df_costs <- readRDS("/Users/tylermuffly/Dropbox (Personal)/AlternativePayments/alternative_payments/Data/CADR_2023/final_push/_Costs_step33.rds")

# Filter for PFMT and Pessary treatments and calculate median age and IQR
age_summary <- df_costs %>%
  filter(sui_treatment %in% c("Pelvic floor physical therapy", "Pessary")) %>%
  group_by(sui_treatment) %>%
  summarise(
    median_age = median(age, na.rm = TRUE),
    iqr_low = quantile(age, 0.25, na.rm = TRUE),
    iqr_high = quantile(age, 0.75, na.rm = TRUE),
    .groups = "drop"
  )

# Perform Mann-Whitney U test for age comparison
pfmt_ages <- df_costs %>%
  filter(sui_treatment == "Pelvic floor physical therapy") %>%
  pull(age) %>%
  na.omit()

pessary_ages <- df_costs %>%
  filter(sui_treatment == "Pessary") %>%
  pull(age) %>%
  na.omit()

# Check if both groups are present before the test
if (length(pfmt_ages) > 0 & length(pessary_ages) > 0) {
  mannwhitney_result <- wilcox.test(pfmt_ages, pessary_ages, alternative = "two.sided")
  p_value_fmt <- ifelse(mannwhitney_result$p.value < 0.01, "<0.01", sprintf("%.2f", mannwhitney_result$p.value))
} else {
  p_value_fmt <- "N/A"
}

# Determine if PFMT patients are younger or older
age_comparison <- ifelse(median(pfmt_ages) < median(pessary_ages), "younger", "older")

# Format values with thousandth commas and whole dollars
pfmt_median_age <- comma(round(age_summary$median_age[age_summary$sui_treatment == "Pelvic floor physical therapy"], 0))
pfmt_iqr_low <- comma(round(age_summary$iqr_low[age_summary$sui_treatment == "Pelvic floor physical therapy"], 0))
pfmt_iqr_high <- comma(round(age_summary$iqr_high[age_summary$sui_treatment == "Pelvic floor physical therapy"], 0))

pessary_median_age <- comma(round(age_summary$median_age[age_summary$sui_treatment == "Pessary"], 0))
pessary_iqr_low <- comma(round(age_summary$iqr_low[age_summary$sui_treatment == "Pessary"], 0))
pessary_iqr_high <- comma(round(age_summary$iqr_high[age_summary$sui_treatment == "Pessary"], 0))

# Generate the sentence
sentence <- sprintf(
  "Patients undergoing PFMT had a median age of %s years (IQR: %s–%s), %s than those receiving pessary treatment (median age: %s years, IQR: %s–%s) (Mann-Whitney U test, p=%s).",
  pfmt_median_age, pfmt_iqr_low, pfmt_iqr_high,
  age_comparison,
  pessary_median_age, pessary_iqr_low, pessary_iqr_high,
  p_value_fmt
)

# Print the dynamically created sentence
print(sentence)

character(0)

#"Patients undergoing PFMT had a median age of 75 years (IQR: 70–81), younger than those receiving pessary treatment (median age: 79 years, IQR: 73–84) (Mann-Whitney U test, p=<0.01)."

12.3 Physical Therapy: Provider PFMT was ??equally prescribed by ???urologists (???%, n=??) and ??obstetrician-gynecologists (????%, n=???).

specialty_comparison <- analyze_pt_providers(
    data_path = "Data/CADR_2023/final_push/Provider_specialty.csv",
    threshold = 3,
    verbose = TRUE
); specialty_comparison

[1] “PT was equally prescribed by Urology (24.2%, n=1,345) and Obstetrics/gynecology (23.6%, n=1,309).”

12.4 PFMT: The median professional fee for PFMT was $xx (IQR $xx–$xx), which was ??lower/??higher, although ??not signficantly, than the median professional fee for pessary treatment ($??, IQR $??–$??), and significantly lower than for sling surgery ($??, IQR $??–$???) (Mann-Whitney U test, p=????).

fee_comparison <- analyze_professional_fees(
    data_path = "/Users/tylermuffly/Dropbox (Personal)/AlternativePayments/alternative_payments/Data/CADR_2023/final_push/_Costs_step33.rds",
    verbose = TRUE,
    p_threshold = 0.05
)

# > fee_comparison
# [1] "The median professional fee for PFMT was $24 (IQR $0–$64), which was lower, although significantly, than the median professional fee for pessary treatment ($31, IQR $4–$99), and significantly lower than for sling surgery ($36, IQR $4–$127) (Mann-Whitney U test, p=<0.01)."

12.5 PMFT: The median facility fee for PFMT was $xx (IQR $xx-$xxx), and the median total cost was $xxx.

# Load the dataset
costs_df <- read.csv("Data/CADR_2023/final_push/_Costs_step33.csv")
unique(costs_df$sui_treatment)

[1] “PT” “Pessary” “Sling”

# Filter for PFMT
pfmt_df <- costs_df %>% filter(sui_treatment == "PT")

# Replace zeros with NA in facility columns
facility_columns <- grep("facility", names(pfmt_df), value = TRUE)
pfmt_df[facility_columns] <- lapply(pfmt_df[facility_columns], function(x) ifelse(x == 0, NA, x))

# Identify facility cost columns and remove those with all NAs
non_zero_facility_columns <- facility_columns[sapply(facility_columns, function(col) any(!is.na(pfmt_df[[col]])))]

# Sum only non-NA facility cost columns
pfmt_df$total_facility_cost <- rowSums(pfmt_df[, non_zero_facility_columns], na.rm = TRUE)

pfmt_nonzero <- pfmt_df %>%
  filter(total_facility_cost >0)

# Calculate median and IQR for total facility cost and total cost
facility_cost_median <- median(pfmt_nonzero$total_facility_cost, na.rm = TRUE)
facility_cost_iqr <- quantile(pfmt_nonzero$total_facility_cost, probs = c(0.25, 0.75), na.rm = TRUE)

total_cost_median <- median(pfmt_nonzero$Total_cost, na.rm = TRUE)

# Format results with thousandths commas and no decimals
facility_cost_median_fmt <- format(facility_cost_median, big.mark = ",", scientific = FALSE)
facility_cost_iqr_fmt <- paste0(format(facility_cost_iqr[1], big.mark = ","), "-", 
                                format(facility_cost_iqr[2], big.mark = ","))
total_cost_median_fmt <- format(total_cost_median, big.mark = ",", scientific = FALSE)

# Display results
cat("The median total facility cost for PFMT was $", facility_cost_median_fmt, 
    " (IQR $", facility_cost_iqr_fmt, "), and the median total cost was $", 
    total_cost_median_fmt, ".\n", sep = "")

The median total facility cost for PFMT was $15 (IQR $5-28), and the median total cost was $654.

#The median total facility cost for PFMT was $15 (IQR $5-28), and the median total cost was $654.

12.6 PFMT: Geographic analysis revealed that PFMT was used significantly more often in the xxxxx subdivision (xxx%, n=xxx) than other regions (χ2, pxxxxx). Write R code that is dynamic and robust. Use the column “Division” that is the US Census Bureau subdivisions and sui_treatment == “PT”.

# Load the dataset
cohort_df <- read_rds("Data/CADR_2023/final_push/_Cohort_descriptors.rds")

# Filter for PFMT (sui_treatment == "PT_index_costs_23JAN2023_1")
pfmt_geo_df <- cohort_df %>%
  filter(sui_treatment == "PT" & !is.na(Division))

# Create a contingency table
contingency_table <- table(pfmt_geo_df$Division)

# Perform chi-squared test
chi_test <- chisq.test(contingency_table)

# Find the division with the highest PFMT usage
top_division <- pfmt_geo_df %>%
  count(Division) %>%
  arrange(desc(n)) %>%
  slice(1)

# Format p-value: round to 2 digits and handle very small values
p_value_fmt <- ifelse(chi_test$p.value < 0.01, "<0.01", format(round(chi_test$p.value, 2), nsmall = 2))

# Format numbers with commas for thousands
top_division_count_fmt <- format(top_division$n, big.mark = ",", scientific = FALSE)
chi_statistic_fmt <- format(round(chi_test$statistic, 2), big.mark = ",", scientific = FALSE)

# Display results
cat("Geographic analysis revealed that PFMT was used significantly more often in the ",
    top_division$Division, 
    " subdivision (",
    round(top_division$n / sum(contingency_table) * 100, 1),
    "%, n=",
    top_division_count_fmt,
    ") than other regions (χ² = ",
    chi_statistic_fmt,
    ", p = ",
    p_value_fmt,
    ").\n", sep = "")

Geographic analysis revealed that PFMT was used significantly more often in the South Atlantic subdivision (23.4%, n=1,302) than other regions (χ² = 1,610.24, p = <0.01).

# Geographic analysis revealed that PFMT was used significantly more often in the South Atlantic subdivision (23.4%, n=1,302) than other regions (χ² = 1,610.24, p = <0.01).

13 SLING

13.1 Sling: Sling surgery was the initial treatment for xx% (n=xx) of patients.(dataframe: _Costs_step33.rds has column sui_treatment == “Sling”) Most patients undergoing sling surgery were White (xx%, n=xxx), a significantly higher proportion than other treatment groups (χ2, p=xxxx)[ _Costs_step33.rds has column “race”== “White”]. ???Younger/???Older patients (aged xxx-xxx years) were more likely to receive sling surgery compared to older???/younger??? patients (χ2, p=????) [ _Costs_step33.rds has column “age”].

13.2 Sling: Costs of slings

# Load RDS files
costs_data <- read_rds("Data/CADR_2023/final_push/_Costs_step33.rds")
provider_data <- read_rds("Data/CADR_2023/final_push/Provider_specialty.rds")

# --- Sling Surgery Percentage (Distinct Patients) ---
sling_df <- costs_data %>% filter(sui_treatment == "Sling")
sling_count <- sling_df %>% distinct(WU_ID) %>% nrow()
total_patients <- costs_data %>% distinct(WU_ID) %>% nrow()
sling_percentage <- (sling_count / total_patients) * 100

# --- Race Analysis (Distinct Patients) ---
white_count <- sling_df %>% filter(race == "White") %>% distinct(WU_ID) %>% nrow()
white_percentage <- (white_count / sling_count) * 100
race_test <- chisq.test(table(sling_df$race, sling_df$sui_treatment))

# --- Age Comparison Across SUI Treatments (Adding IQR in Square Brackets) ---
age_comparison <- costs_data %>% 
  group_by(sui_treatment) %>% 
  summarise(median_age = median(age, na.rm = TRUE), 
            IQR = paste0("[", quantile(age, 0.25, na.rm = TRUE), "–", quantile(age, 0.75, na.rm = TRUE), "]"))

# --- Kruskal-Wallis Test for Age Across Treatments ---
kw_test_age <- kruskal.test(age ~ sui_treatment, data = costs_data)
kw_age_p_value <- ifelse(kw_test_age$p.value < 0.01, "<0.01", format(kw_test_age$p.value, big.mark=",", scientific=FALSE))

# --- Determine Age Group Most Likely to Receive Sling Surgery ---
median_age_sling <- age_comparison %>% filter(sui_treatment == "Sling") %>% pull(median_age)
iqr_sling <- age_comparison %>% filter(sui_treatment == "Sling") %>% pull(IQR)

median_age_other <- age_comparison %>% filter(sui_treatment != "Sling") %>% summarise(mean(median_age, na.rm = TRUE)) %>% pull()
age_group_desc <- ifelse(median_age_sling < median_age_other, "Younger", "Older")

# --- Provider Specialty Analysis (Denominator: Sling Surgeries Only) ---
provider_summary <- provider_data %>% 
  filter(sui_treatment == "Sling") %>%
  distinct(WU_ID, .keep_all = TRUE) %>%
  count(specialty_pre6m_specialty_car_and_fac) %>%
  dplyr::mutate(percent = (n / sling_count) * 100) %>%
  arrange(desc(n)) %>%
  slice(1)

# Ensure provider summary is not empty
provider_text <- ifelse(nrow(provider_summary) > 0, 
                        paste0(provider_summary$specialty_pre6m_specialty_car_and_fac, " performed the most sling surgeries (", 
                               format(round(provider_summary$percent, 1), big.mark=","), "% (n=", format(provider_summary$n, big.mark=","), ")."), 
                        "No provider specialty data available.")

# --- Cost Distribution Analysis ---
sling_costs <- sling_df$Total_cost
pessary_costs <- costs_data %>% filter(sui_treatment == "Pessary") %>% pull(Total_cost)
pt_costs <- costs_data %>% filter(sui_treatment == "PT") %>% pull(Total_cost)
sling_iqr <- quantile(sling_costs, c(0.25, 0.75), na.rm = TRUE)
pessary_iqr <- quantile(pessary_costs, c(0.25, 0.75), na.rm = TRUE)
pt_iqr <- quantile(pt_costs, c(0.25, 0.75), na.rm = TRUE)

# --- Median Costs and Facility Costs ---
facility_columns <- grep("facility", names(sling_df), value = TRUE)
professional_columns <- grep("professional", names(sling_df), value = TRUE)
facility_fee <- median(rowSums(sling_df[, facility_columns, drop = FALSE], na.rm = TRUE), na.rm = TRUE)
professional_fee <- median(rowSums(sling_df[, professional_columns, drop = FALSE], na.rm = TRUE), na.rm = TRUE)

total_facility_cost <- sum(rowSums(sling_df[, facility_columns, drop = FALSE], na.rm = TRUE))
total_cost <- sum(sling_costs, na.rm = TRUE)
facility_cost_percentage <- (total_facility_cost / total_cost) * 100

# --- Combined Narrative Output ---
paragraph <- paste0(
  "Sling surgery was the initial treatment for ", format(round(sling_percentage, 1), big.mark=","), "% (n=", format(sling_count, big.mark=","), ") of distinct patients. ",
  "Most patients undergoing sling surgery were White (", format(round(white_percentage, 1), big.mark=","), "%, n=", format(white_count, big.mark=","), "), a significantly higher proportion than other groups (p = ", ifelse(race_test$p.value < 0.01, "<0.01", format(race_test$p.value, big.mark=",", scientific=FALSE)), "). ",
  age_group_desc, " patients (median age ", format(median_age_sling, big.mark=","), " years, IQR ", iqr_sling, ") were more likely to receive sling surgery compared to other SUI treatments (p = ", kw_age_p_value, "). ",
  provider_text, " ",
  "The cost distribution for sling surgeries (IQR: $", format(sling_iqr[1], big.mark=","), "–$", format(sling_iqr[2], big.mark=","), ") showed greater variability than pessary (IQR: $", format(pessary_iqr[1], big.mark=","), "–$", format(pessary_iqr[2], big.mark=","), ") or PFMT (IQR: $", format(pt_iqr[1], big.mark=","), "–$", format(pt_iqr[2], big.mark=","), "). ",
  "The median total cost was $", format(median(sling_costs, na.rm = TRUE), big.mark=","), " (IQR: $", format(sling_iqr[1], big.mark=","), "–$", format(sling_iqr[2], big.mark=","), "), ",
  "including a median facility fee of $", format(facility_fee, big.mark=","), " and provider professional fees of $", format(professional_fee, big.mark=","), ". ",
  "Facility costs accounted for ", format(round(facility_cost_percentage, 1), big.mark=","), "% ($", format(total_facility_cost, big.mark=","), " of $", format(total_cost, big.mark=","), ") of total expenditures, ",
  "which was ", ifelse(kw_test_age$p.value < 0.05, "significantly higher", "not significantly different"), " than for other treatment options (p = ", kw_age_p_value, ")."
)

cat(paragraph)

Sling surgery was the initial treatment for 20.5% (n=3,422) of distinct patients. Most patients undergoing sling surgery were White (95.1%, n=3,253), a significantly higher proportion than other groups (p = <0.01). Younger patients (median age 73 years, IQR [69–78]) were more likely to receive sling surgery compared to other SUI treatments (p = <0.01). No provider specialty data available. The cost distribution for sling surgeries (IQR: $1,490–$2,223) showed greater variability than pessary (IQR: $395–$1,143) or PFMT (IQR: $327–$1,340). The median total cost was $1,850 (IQR: $1,490–$2,223), including a median facility fee of $0 and provider professional fees of $36. Facility costs accounted for 0.5% ($329,576 of $62,368,189) of total expenditures, which was significantly higher than for other treatment options (p = <0.01).

13.3 Sling: Provider doing fucking sling

# provider_data <- read_rds("Data/CADR_2023/final_push/Provider_specialty.rds")
# 
# analyze_treatment_specialty("Data/CADR_2023/final_push/Provider_specialty.rds", 
#                                       treatment_type = "UI Sling",
#                                       verbose = FALSE)

#Urology performed the most UI Sling treatments, accounting for 44.0% (n=1,506) of procedures.

14 URODYNAMICS:

14.0.1 Urodynamics: Among the 16695 women in our study,

cost_data <- read_rds("Data/CADR_2023/final_push/_Costs_step33.rds")
sui_analysis <- analyze_urodynamics(cost_data, verbose = TRUE)

print(sui_analysis$summary_text)

[1] “Of the 16,695 women in our study, 53.0% underwent urodynamic testing: 52.5% (n=8,758) had non-video testing and 1.9% (n=320) had video testing. The cost per urodynamic test ranged from $8 to $1,963. Of the 8,844 women who underwent urodynamic testing (N=8,844), 75.3% (n=6,661) did not proceed to surgery in this first episode of care. Among these 6,661 women who did not undergo surgery (N=6,661), 68.8% (n=4,583) were treated with pelvic floor muscle therapy (PFMT), and 31.2% (n=2,078) received a pessary.”

print(sui_analysis$stats)

$total_patients [1] 16695

$testing_counts # A tibble: 6,102 × 4 WU_ID vUDS_count UDS_count total_tested 1 1 0 1 1 2 4 0 2 2 3 5 0 1 1 4 10 0 1 1 5 14 0 1 1 6 16 0 1 1 7 21 0 1 1 8 23 0 1 1 9 29 0 1 1 10 31 0 1 1 # ℹ 6,092 more rows

$cost_range$cost_range$min_cost [1] “8”

$cost_range$max_cost [1] “1,963”

$treatment_patterns # A tibble: 6,062 × 4 WU_ID no_surgery_count pfmt_count pessary_count 1 1 1 1 0 2 4 2 0 2 3 5 1 0 1 4 10 1 0 1 5 14 1 1 0 6 16 1 1 0 7 21 0 0 0 8 23 0 0 0 9 29 0 0 0 10 31 1 1 0 # ℹ 6,052 more rows

#[1] "Of the 16,695 women in our study, 53.0% underwent urodynamic testing: 52.5% (n=8,758) had non-video testing and 1.9% (n=320) had video testing. The cost per urodynamic test ranged from $8 to $1,963. Of the 8,844 women who underwent urodynamic testing (N=8,844), 75.3% (n=6,661) did not proceed to surgery in this first episode of care. Among these 6,661 women who did not undergo surgery (N=6,661), 68.8% (n=4,583) were treated with pelvic floor muscle therapy (PFMT), and 31.2% (n=2,078) received a pessary."

14.0.2 Urodynamics: Median costs were $??? (IQR: $??-$???) for non-video urodynamic testing.

# Read the data
cost_data <- readr::read_rds("Data/CADR_2023/final_push/_Costs_step33.rds")

# Remove NAs and zero values, then sum the facility and professional costs for video UDS
vuds_costs <- cost_data %>%
  dplyr::mutate(total_vUDS_cost = OP_facility_vUDS_cost + professional_vUDS_cost) %>%
  dplyr::filter(!is.na(total_vUDS_cost) & total_vUDS_cost > 0) %>%
  dplyr::pull(total_vUDS_cost)

# Remove NAs and zero values, then sum the facility and professional costs for non-video UDS
uds_costs <- cost_data %>%
  dplyr::mutate(total_UDS_cost = OP_facility_UDS_cost + professional_UDS_cost) %>%
  dplyr::filter(!is.na(total_UDS_cost) & total_UDS_cost > 0) %>%
  dplyr::pull(total_UDS_cost)

# Calculate median and IQR for video UDS
vuds_median <- if (length(vuds_costs) > 0) {
  format(round(median(vuds_costs), 0), big.mark = ",")
} else {
  "N/A"
}

vuds_iqr <- if (length(vuds_costs) > 0) {
  format(round(quantile(vuds_costs, probs = c(0.25, 0.75)), 0), big.mark = ",")
} else {
  c("N/A", "N/A")
}

# Calculate median and IQR for non-video UDS
uds_median <- if (length(uds_costs) > 0) {
  format(round(median(uds_costs), 0), big.mark = ",")
} else {
  "N/A"
}

uds_iqr <- if (length(uds_costs) > 0) {
  format(round(quantile(uds_costs, probs = c(0.25, 0.75)), 0), big.mark = ",")
} else {
  c("N/A", "N/A")
}

# Generate the sentence
median_cost_sentence <- sprintf(
  "Median costs were $%s (IQR: $%s-$%s) for non-video urodynamic testing and $%s (IQR: $%s-$%s) for video urodynamic testing.",
  uds_median, uds_iqr[1], uds_iqr[2],
  vuds_median, vuds_iqr[1], vuds_iqr[2]
)

# Print the result
print(median_cost_sentence)

[1] “Median costs were $436 (IQR: $297-$544) for non-video urodynamic testing and $188 (IQR: $181-$217) for video urodynamic testing.”

#[1] "Median costs were $436 (IQR: $297-$544) for non-video urodynamic testing and $188 (IQR: $181-$217) for video urodynamic testing."

15 TABLES

15.1 Table 1: Generate Demographics Table for Stress Urinary Incontinence Therapies

#' Generate Demographics Table for Stress Urinary Incontinence Therapies
#' 
#' @description
#' Creates a detailed demographics table comparing patient characteristics across 
#' different stress urinary incontinence therapy groups. Analyzes age, race,
#' treatment year, tobacco use, geographic location, and dual enrollment status.
#' 
#' @param data_path Character. Path to the RDS or CSV file containing patient data
#' @param output_path Character. Path where the Word document output should be saved
#' @param group_col Character. Name of the column containing therapy groups. 
#'        Default: "Stress Urinary Incontinence Therapies"
#' @param include_costs Logical. Whether to include cost analysis in the table.
#'        Default: FALSE
#' @param verbose Logical. Whether to print detailed logging messages. Default: TRUE
#' 
#' @return A list containing:
#'   \itemize{
#'     \item comparison: The tableby object
#'     \item summary: The formatted summary table
#'     \item patient_count: Total number of unique patients
#'   }
#' 
#' @importFrom arsenal tableby tableby.control write2word summary.tableby
#' @importFrom logger log_info log_error log_debug log_warn
#' @importFrom assertthat assert_that noNA
#' @importFrom readr read_rds read_csv
#' @importFrom dplyr select all_of filter mutate
#' @importFrom here here
#' @importFrom stats as.formula
#' @importFrom utils installed.packages
#' 
#' @examples
#' \dontrun{
#' # Basic demographic table without costs using here package
#' demographic_analysis <- create_demographics_table(
#'   data_path = here::here("Data", "CADR_2023", "table_1", "March2_table1.rds"),
#'   output_path = here::here("Data", "CADR_2023", "final_push", "~Table1.docx"),
#'   group_col = "Stress Urinary Incontinence Therapies",
#'   include_costs = FALSE,
#'   verbose = TRUE
#' )
#' 
#' # Include cost analysis in the demographics table
#' demographic_analysis <- create_demographics_table(
#'   data_path = here::here("path", "to", "data.rds"),
#'   output_path = here::here("path", "to", "table1_with_costs.docx"),
#'   group_col = "Stress Urinary Incontinence Therapies",
#'   include_costs = TRUE,
#'   verbose = TRUE
#' )
#' 
#' # Custom group column name with minimal logging
#' demographic_analysis <- create_demographics_table(
#'   data_path = here::here("path", "to", "data.rds"),
#'   output_path = here::here("path", "to", "custom_table.docx"),
#'   group_col = "Treatment_Group",
#'   include_costs = FALSE,
#'   verbose = FALSE
#' )
#' }
#' 
create_demographics_table <- function(data_path, 
                                      output_path, 
                                      group_col = "Stress Urinary Incontinence Therapies",
                                      include_costs = FALSE,
                                      verbose = TRUE) {
  
  # Check if required packages are installed
  check_required_packages(c("logger", "arsenal", "assertthat", "readr", "dplyr", "here"))
  
  # Initialize logging
  setup_logging(verbose)
  logger::log_info("Starting create_demographics_table function")
  
  # Log input parameters
  logger::log_info("Input parameters:")
  logger::log_info("  data_path: {data_path}")
  logger::log_info("  output_path: {output_path}")
  logger::log_info("  group_col: {group_col}")
  logger::log_info("  include_costs: {include_costs}")
  logger::log_info("  verbose: {verbose}")
  
  # Validate input parameters
  validate_input_parameters(data_path, output_path, group_col, include_costs, verbose)
  
  # Handle relative paths for data_path
  data_path <- resolve_data_path(data_path)
  
  # Create output directory if it doesn't exist
  create_output_directory(output_path)
  
  # Load and clean patient data
  patient_dataset <- load_patient_data(data_path)
  logger::log_info("Loaded patient data with {nrow(patient_dataset)} rows and {ncol(patient_dataset)} columns")
  
  # Process the dataset (handle group column, clean data)
  processed_dataset <- process_patient_data(patient_dataset, group_col, include_costs)
  logger::log_info("Processed patient data contains {nrow(processed_dataset)} patients")
  
  # Verify required columns (will warn but not fail if some are missing)
  verify_required_columns(processed_dataset, group_col, include_costs)
  
  # Store the original patient count
  original_patient_count <- nrow(processed_dataset)
  logger::log_info("Original patient count: {original_patient_count}")
  
  # Generate demographic comparison table
  demographic_comparison <- generate_demographics_comparison(
    processed_dataset, 
    group_col,
    include_costs
  )
  logger::log_info("Successfully generated demographics comparison table")
  
  # Create summary table from comparison
  demographic_summary <- create_summary_table(demographic_comparison)
  logger::log_info("Created formatted summary table")
  
  # Verify patient count after table generation
  verify_patient_count(demographic_comparison, original_patient_count)
  
  # Save output to Word
  save_table_to_word(demographic_summary, output_path)
  
  # Return results
  logger::log_info("Returning demographics analysis results")
  demographic_results <- list(
    comparison = demographic_comparison,
    summary = demographic_summary,
    patient_count = original_patient_count
  )
  
  # Add a class to make it easier to identify
  class(demographic_results) <- c("demographics_results", class(demographic_results))
  
  return(demographic_results)
}

#' @noRd
check_required_packages <- function(packages) {
  installed_pkgs <- rownames(utils::installed.packages())
  missing_pkgs <- setdiff(packages, installed_pkgs)
  
  if (length(missing_pkgs) > 0) {
    stop("Missing required packages: ", paste(missing_pkgs, collapse = ", "), 
         ". Please install them before running this function.")
  }
}

#' @noRd
setup_logging <- function(verbose) {
  if (verbose) {
    logger::log_threshold(logger::INFO)
    logger::log_info("Verbose logging enabled")
  } else {
    logger::log_threshold(logger::ERROR)
  }
}

#' @noRd
validate_input_parameters <- function(data_path, output_path, group_col, include_costs, verbose) {
  logger::log_info("Validating input parameters")
  
  # Check parameter types
  assertthat::assert_that(is.character(data_path), length(data_path) == 1,
                         msg = "data_path must be a single character string")
  
  assertthat::assert_that(is.character(output_path), length(output_path) == 1,
                         msg = "output_path must be a single character string")
  
  assertthat::assert_that(is.character(group_col), length(group_col) == 1,
                         msg = "group_col must be a single character string")
  
  assertthat::assert_that(is.logical(include_costs), length(include_costs) == 1,
                         msg = "include_costs must be a logical value (TRUE or FALSE)")
  
  assertthat::assert_that(is.logical(verbose), length(verbose) == 1,
                         msg = "verbose must be a logical value (TRUE or FALSE)")
  
  # Check output path extension
  if (!grepl("\\.docx$", output_path, ignore.case = TRUE)) {
    logger::log_warn("Output path does not end with .docx. Appending .docx extension.")
    output_path <- paste0(output_path, ".docx")
  }
  
  logger::log_info("Input parameters validated successfully")
}

#' @noRd
resolve_data_path <- function(data_path) {
  logger::log_info("Resolving data path: {data_path}")
  
  # Check if path exists directly
  if (file.exists(data_path)) {
    logger::log_info("Data path exists: {data_path}")
    return(data_path)
  }
  
  # Try using the here package to resolve path
  possible_path <- here::here(data_path)
  if (file.exists(possible_path)) {
    logger::log_info("Resolved relative data path to: {possible_path}")
    return(possible_path)
  }
  
  # Try removing file extension and appending .rds or .csv
  if (!grepl("\\.[^\\.]+$", data_path)) {
    for (ext in c(".rds", ".csv")) {
      test_path <- paste0(data_path, ext)
      if (file.exists(test_path)) {
        logger::log_info("Found file by adding extension: {test_path}")
        return(test_path)
      }
      
      # Try with here package
      test_path_here <- here::here(paste0(data_path, ext))
      if (file.exists(test_path_here)) {
        logger::log_info("Found file using here package and adding extension: {test_path_here}")
        return(test_path_here)
      }
    }
  }
  
  # Give up
  stop("Could not find data file at: ", data_path, 
       "\nCurrent working directory is: ", getwd(),
       "\nTried with here package: ", possible_path,
       "\nTry using an absolute path or check that the file exists.")
}

#' @noRd
create_output_directory <- function(output_path) {
  output_dir <- dirname(output_path)
  if (!dir.exists(output_dir)) {
    dir.create(output_dir, recursive = TRUE)
    logger::log_info("Created output directory: {output_dir}")
  } else {
    logger::log_info("Output directory already exists: {output_dir}")
  }
  
  # Check if we can write to the directory
  temp_file <- file.path(output_dir, "test_write_permission.tmp")
  write_test <- tryCatch({
    file.create(temp_file)
  }, error = function(e) {
    return(FALSE)
  })
  
  if (write_test) {
    file.remove(temp_file)
    logger::log_info("Verified write permission to output directory")
  } else {
    logger::log_error("Cannot write to output directory: {output_dir}")
    stop("Cannot write to output directory: ", output_dir)
  }
}

#' @noRd
load_patient_data <- function(data_path) {
  logger::log_info("Loading patient data from {data_path}")
  
  # Determine file type and load accordingly
  if (grepl("\\.rds$", data_path, ignore.case = TRUE)) {
    logger::log_info("Loading RDS file")
    tryCatch({
      patient_dataset <- readr::read_rds(data_path)
      logger::log_info("Successfully loaded RDS file")
    }, error = function(e) {
      logger::log_error("Failed to load RDS file: {e$message}")
      stop("Failed to load RDS file: ", e$message)
    })
  } else if (grepl("\\.csv$", data_path, ignore.case = TRUE)) {
    logger::log_info("Loading CSV file")
    tryCatch({
      patient_dataset <- readr::read_csv(data_path, show_col_types = FALSE)
      logger::log_info("Successfully loaded CSV file")
    }, error = function(e) {
      logger::log_error("Failed to load CSV file: {e$message}")
      stop("Failed to load CSV file: ", e$message)
    })
  } else {
    # Assume RDS if extension not recognized
    logger::log_warn("Unrecognized file extension. Attempting to load as RDS.")
    tryCatch({
      patient_dataset <- readr::read_rds(data_path)
      logger::log_info("Successfully loaded file as RDS")
    }, error = function(e) {
      logger::log_error("Failed to load file: {e$message}")
      stop("Failed to load file: ", e$message)
    })
  }
  
  # Verify that data is a data frame
  assertthat::assert_that(is.data.frame(patient_dataset), 
                         msg = "Loaded data must be a data frame")
  
  # Log summary information
  logger::log_info("Loaded dataset with {nrow(patient_dataset)} rows and {ncol(patient_dataset)} columns")
  logger::log_debug("Available columns: {paste(names(patient_dataset), collapse=', ')}")
  
  return(patient_dataset)
}

#' @noRd
process_patient_data <- function(patient_dataset, group_col, include_costs) {
  logger::log_info("Processing patient dataset")
  
  # Handle group column if it doesn't exist in its expected form
  if (!(group_col %in% names(patient_dataset))) {
    # Try to use sui_treatment1 if available and requested group is the default
    if (group_col == "Stress Urinary Incontinence Therapies" && 
        "sui_treatment1" %in% names(patient_dataset)) {
      logger::log_info("Using 'sui_treatment1' as '{group_col}' column")
      patient_dataset[[group_col]] <- patient_dataset[["sui_treatment1"]]
    } else {
      logger::log_error("Required grouping column '{group_col}' not found in dataset")
      stop("Required grouping column '", group_col, "' not found in dataset.")
    }
  }
  
  # Define core demographic columns to retain
  demographic_cols <- c(
    group_col,
    "Age, years",
    "Race",
    "Year therapy performed",
    "Tobacco Use",
    "US Census Bureau Subdivision",
    "Enrolled in Medicare and Medicaid"
  )
  
  # If include_costs is TRUE, add cost columns to the selection
  if (include_costs) {
    cost_cols <- c("Total_cost", "episode_type", "year_episode_start")
    demographic_cols <- c(demographic_cols, cost_cols)
  }
  
  # Identify which columns actually exist in the dataset
  existing_cols <- demographic_cols[demographic_cols %in% names(patient_dataset)]
  logger::log_info("Found {length(existing_cols)} of {length(demographic_cols)} required columns")
  
  # Select only existing columns
  processed_dataset <- dplyr::select(patient_dataset, dplyr::all_of(existing_cols))
  
  # Handle missing values in categorical variables
  for (col in existing_cols) {
    if (is.character(processed_dataset[[col]]) || is.factor(processed_dataset[[col]])) {
      # For categorical variables, replace NA with "Missing"
      processed_dataset <- dplyr::mutate(processed_dataset, 
                                       "{col}" := ifelse(is.na(.data[[col]]), 
                                                       "Missing", 
                                                       as.character(.data[[col]])))
      logger::log_debug("Replaced NA values with 'Missing' in column: {col}")
    } else if (is.numeric(processed_dataset[[col]])) {
      # For numeric columns, we'll leave NA values as is
      logger::log_debug("Preserved NA values in numeric column: {col}")
    }
  }
  
  # Handle missing values in the grouping column specifically
  if (any(is.na(processed_dataset[[group_col]]) | processed_dataset[[group_col]] == "Missing")) {
    missing_count <- sum(is.na(processed_dataset[[group_col]]) | 
                         processed_dataset[[group_col]] == "Missing", na.rm = TRUE)
    logger::log_warn("Found {missing_count} rows with missing values in grouping column")
    
    # Create a valid category for missing values instead of dropping them
    processed_dataset <- dplyr::mutate(processed_dataset, 
                                     "{group_col}" := ifelse(
                                       is.na(.data[[group_col]]) | .data[[group_col]] == "Missing",
                                       "Unknown/Not Specified", 
                                       .data[[group_col]]
                                     ))
  }
  
  logger::log_info("Processed dataset contains {nrow(processed_dataset)} patients and {ncol(processed_dataset)} variables")
  return(processed_dataset)
}

#' @noRd
verify_required_columns <- function(patient_dataset, group_col, include_costs) {
  logger::log_info("Verifying required columns")
  
  # Define core required columns
  required_cols <- c(
    group_col,
    "Age, years",
    "Race",
    "Year therapy performed",
    "Tobacco Use",
    "US Census Bureau Subdivision",
    "Enrolled in Medicare and Medicaid"
  )
  
  # If include_costs is TRUE, also check for cost columns
  if (include_costs) {
    cost_cols <- c("Total_cost", "episode_type", "year_episode_start")
    required_cols <- c(required_cols, cost_cols)
  }
  
  # Check which required columns are missing
  missing_cols <- setdiff(required_cols, names(patient_dataset))
  
  if (length(missing_cols) > 0) {
    # Group column is critically required
    if (group_col %in% missing_cols) {
      logger::log_error("Critical grouping column '{group_col}' is missing")
      stop("Critical grouping column '", group_col, "' is missing")
    }
    
    # Cost columns are required if include_costs is TRUE
    if (include_costs && any(cost_cols %in% missing_cols)) {
      missing_cost_cols <- intersect(cost_cols, missing_cols)
      logger::log_error("Required cost columns missing: {paste(missing_cost_cols, collapse=', ')}")
      stop("Required cost columns missing: ", paste(missing_cost_cols, collapse=", "), 
           ". Cannot include costs without these columns.")
    }
    
    # Other columns will just generate a warning
    other_missing <- setdiff(missing_cols, c(group_col, cost_cols))
    if (length(other_missing) > 0) {
      logger::log_warn("Missing recommended columns: {paste(other_missing, collapse=', ')}")
      logger::log_info("Will proceed with available columns")
    }
  } else {
    logger::log_info("All required columns present in dataset")
  }
}

#' @noRd
generate_demographics_comparison <- function(patient_dataset, group_col, include_costs) {
  logger::log_info("Generating demographics comparison table")
  
  # Create formula with proper quoting for column names with spaces
  # Only include columns that actually exist in the dataset
  available_cols <- c(
    "Age, years", 
    "Race", 
    "Year therapy performed", 
    "Tobacco Use", 
    "US Census Bureau Subdivision", 
    "Enrolled in Medicare and Medicaid"
  )
  
  if (include_costs) {
    available_cols <- c(available_cols, "Total_cost", "episode_type", "year_episode_start")
  }
  
  available_cols <- available_cols[available_cols %in% names(patient_dataset)]
  logger::log_info("Including {length(available_cols)} columns in comparison table")
  
  # Special case handling for empty dataset or no columns
  if (nrow(patient_dataset) == 0 || length(available_cols) == 0) {
    logger::log_warn("Dataset is empty or no valid columns found")
    # Return a dummy tableby object to avoid errors
    dummy_data <- data.frame(
      Group = c("No data available"),
      Value = c(NA)
    )
    names(dummy_data)[1] <- group_col
    dummy_formula <- stats::as.formula(paste(group_col, "~ Value"))
    return(arsenal::tableby(dummy_formula, data = dummy_data))
  }
  
  # Build the formula string with backticks for proper handling of spaces in column names
  formula_parts <- paste(paste0("`", available_cols, "`"), collapse = " + ")
  formula_str <- sprintf("`%s` ~ %s", group_col, formula_parts)
  
  formula <- stats::as.formula(formula_str)
  logger::log_debug("Using formula: {formula_str}")
  
  # Generate the comparison table
  tryCatch({
    # First, check for problematic data that might cause arsenal::tableby to fail
    # Ensure group column has no NA values
    if (any(is.na(patient_dataset[[group_col]]))) {
      logger::log_warn("NA values found in group column - replacing with 'Unknown'")
      patient_dataset[[group_col]] <- ifelse(is.na(patient_dataset[[group_col]]), 
                                           "Unknown", 
                                           patient_dataset[[group_col]])
    }
    
    # Create a copy of the dataset to avoid modifying the original
    safe_data <- patient_dataset
    
    # Generate the tableby object with careful error handling
    demographic_comparison <- arsenal::tableby(
      formula,
      data = safe_data,
      control = arsenal::tableby.control(
        test = TRUE,
        total = TRUE,
        digits = 1L,
        digits.p = 2L,
        digits.count = 0L,
        numeric.simplify = TRUE,
        cat.simplify = FALSE,
        numeric.stats = c("median", "q1q3"),
        cat.stats = c("Nmiss", "countpct"),
        stats.labels = get_stat_labels(),
        sparse.remove = FALSE  # Prevent removal of sparse categories
      )
    )
    
    # Check if the tableby object is valid
    if (!inherits(demographic_comparison, "tableby")) {
      logger::log_warn("tableby() did not return a tableby object. Got class: {class(demographic_comparison)}")
    } else {
      logger::log_info("Successfully generated demographics comparison table")
    }
    
    return(demographic_comparison)
  }, error = function(e) {
    logger::log_error("Error generating comparison table: {e$message}")
    # Create a minimal tableby object to avoid breaking the rest of the function
    logger::log_warn("Creating minimal tableby object as fallback")
    dummy_data <- data.frame(
      Group = unique(patient_dataset[[group_col]]),
      Count = table(patient_dataset[[group_col]])
    )
    names(dummy_data)[1] <- group_col
    dummy_formula <- stats::as.formula(paste(group_col, "~ Count"))
    
    tryCatch({
      fallback_table <- arsenal::tableby(dummy_formula, data = dummy_data)
      logger::log_info("Created fallback tableby object")
      return(fallback_table)
    }, error = function(e2) {
      logger::log_error("Fallback tableby also failed: {e2$message}")
      stop("Unable to generate demographics table: ", e$message)
    })
  })
}

#' @noRd
get_stat_labels <- function() {
  list(
    Nmiss = "N Missing",
    Nmiss2 = "N Missing",
    meansd = "Mean (SD)",
    medianrange = "Median (Range)",
    median = "Median",
    medianq1q3 = "Median (Q1, Q3)",
    q1q3 = "Q1, Q3",
    iqr = "IQR",
    range = "Range",
    countpct = "Count (Pct)",
    Nevents = "Events",
    medSurv = "Median Survival",
    medTime = "Median Follow-Up"
  )
}

#' @noRd
create_summary_table <- function(demographic_comparison) {
  logger::log_info("Creating formatted summary table")
  
  # Make sure the demographic_comparison is a tableby object
  if (!inherits(demographic_comparison, "tableby")) {
    logger::log_warn("Object is not a tableby object. Class: {paste(class(demographic_comparison), collapse=', ')}")
    logger::log_info("Returning original object without summary")
    return(demographic_comparison)
  }
  
  # Try to create the summary if the object is a tableby
  tryCatch({
    # Create summary directly with specific arguments
    logger::log_info("Creating summary using arsenal::summary function")
    
    # Call summary directly without specifying the class method
    # The S3 dispatch will handle finding the right method
    summary_result <- summary(demographic_comparison, text = TRUE, pfootnote = TRUE)
    
    if (!is.null(summary_result)) {
      logger::log_info("Summary table created successfully")
      return(summary_result)
    } else {
      logger::log_warn("Summary function returned NULL")
      return(demographic_comparison)
    }
  }, error = function(e) {
    logger::log_warn("Error creating summary table: {e$message}")
    logger::log_info("Returning original tableby object instead of summary")
    return(demographic_comparison)
  })
}

#' @noRd
verify_patient_count <- function(demographic_comparison, expected_count) {
  logger::log_info("Verifying patient count in analysis matches expected count")
  
  # Try to extract the total count from the tableby object
  totals_attr <- attr(demographic_comparison, "totals")
  
  if (!is.null(totals_attr)) {
    # Extract the number from the totals attribute
    total_matches <- regmatches(totals_attr, regexpr("\\d+", totals_attr))
    if (length(total_matches) > 0) {
      actual_count <- as.numeric(total_matches[1])
      
      if (actual_count != expected_count) {
        logger::log_warn("Patient count mismatch! Table shows {actual_count} patients but expected {expected_count}")
      } else {
        logger::log_info("Successfully included all {expected_count} patients in the analysis")
      }
    } else {
      logger::log_warn("Could not extract patient count from table")
    }
  } else {
    logger::log_warn("Could not verify patient count - totals attribute not found")
  }
}

#' @noRd
save_table_to_word <- function(summary_table, output_path) {
  logger::log_info("Saving table to Word document: {output_path}")
  
  # Check if summary_table is NULL
  if (is.null(summary_table)) {
    logger::log_error("Cannot save NULL summary table to Word")
    stop("Summary table is NULL")
  }
  
  # Make sure output path has the correct extension
  if (!grepl("\\.docx$", output_path)) {
    output_path <- paste0(output_path, ".docx")
    logger::log_info("Added .docx extension to output path: {output_path}")
  }
  
  # Create a plain text version as backup
  text_output_path <- gsub("\\.docx$", ".txt", output_path)
  logger::log_info("Creating text backup at {text_output_path}")
  
  tryCatch({
    # Create text representation
    text_content <- capture.output(print(summary_table))
    writeLines(text_content, text_output_path)
    logger::log_info("Text backup created successfully")
  }, error = function(e) {
    logger::log_warn("Could not create text backup: {e$message}")
  })
  
  # Handle different object types appropriately
  if (inherits(summary_table, "summary.tableby")) {
    # Already a summary object, try to save it directly
    logger::log_info("Saving summary.tableby object to Word")
    tryCatch({
      arsenal::write2word(summary_table, output_path)
      logger::log_info("Successfully saved summary.tableby to Word")
      return(TRUE)
    }, error = function(e) {
      logger::log_warn("Error saving summary.tableby: {e$message}")
    })
  } 
  
  if (inherits(summary_table, "tableby")) {
    # It's a tableby object but not a summary, try creating summary first
    logger::log_info("Converting tableby to summary before saving")
    tryCatch({
      # Use eval to avoid direct reference that might cause problems
      summary_obj <- eval(parse(text = "summary(summary_table, text = TRUE)"))
      arsenal::write2word(summary_obj, output_path)
      logger::log_info("Successfully saved newly created summary to Word")
      return(TRUE)
    }, error = function(e) {
      logger::log_warn("Error creating and saving summary: {e$message}")
      
      # Try direct approach as fallback
      tryCatch({
        logger::log_info("Trying to save tableby object directly")
        arsenal::write2word(summary_table, output_path)
        logger::log_info("Successfully saved tableby directly to Word")
        return(TRUE)
      }, error = function(e2) {
        logger::log_warn("Could not save tableby directly: {e2$message}")
      })
    })
  }
  
  # If we get here, previous attempts failed
  # Last resort: Try a different approach with arsenal
  logger::log_info("Trying alternative arsenal write approach")
  tryCatch({
    # Create a new environment to avoid conflicts
    temp_env <- new.env()
    # Define the object in that environment
    temp_env$obj <- summary_table
    # Use eval in that environment to call write2word
    eval(parse(text = "arsenal::write2word(obj, output_path)"), envir = temp_env)
    logger::log_info("Successfully saved to Word using alternative approach")
    return(TRUE)
  }, error = function(e) {
    logger::log_error("All Word document creation attempts failed: {e$message}")
    logger::log_info("Table was saved as text to: {text_output_path}")
    message("Could not create Word document. Table saved as text to: ", text_output_path)
    
    # Return FALSE to indicate failure but don't stop execution
    return(FALSE)
  })
}

table1 <- readr::read_rds("/Users/tylermuffly/Dropbox (Personal)/AlternativePayments/alternative_payments/Data/CADR_2023/final_push/March2_table1.rds")
data_path <- "/Users/tylermuffly/Dropbox (Personal)/AlternativePayments/alternative_payments/Data/CADR_2023/final_push/March2_table1.rds"


word_file_path <- "/Users/tylermuffly/Dropbox (Personal)/AlternativePayments/alternative_payments/Data/CADR_2023/final_push/~Table1.docx"

# For some reason it requires the full path.  
table1 <- create_demographics_table(
  data_path = "/Users/tylermuffly/Dropbox (Personal)/AlternativePayments/alternative_payments/Data/CADR_2023/final_push/March2_table1.rds",
  output_path = word_file_path,
  include_costs = FALSE  # Set to TRUE if you want to include cost analysis
)

| | | 0% | |…………………………………………………………….| 100%
/usr/local/bin/pandoc +RTS -K512m -RTS ~Table1.docx.knit.md –to docx –from markdown+autolink_bare_uris+tex_math_single_backslash –output pandoc1304c1c0ce32e.docx –lua-filter /Users/tylermuffly/Library/R/x86_64/4.4/library/rmarkdown/rmarkdown/lua/pagebreak.lua –highlight-style tango

kable(table1)

group.term	group.label	variable	term	label	variable.type	Incontinence sling	Pelvic floor muscle therapy	Pessary	Total	test	p.value
Stress Urinary Incontinence Therapies	Stress Urinary Incontinence Therapies	Age, years	Age, years	Age, years	categorical					Pearson’s Chi-squared test	0e+00
Stress Urinary Incontinence Therapies	Stress Urinary Incontinence Therapies	Age, years	countpct	66 to 69 years old	categorical	1032.0000, 30.1578	1327.00000, 23.88409	1032.00000, 13.37307	3391.00000, 20.31147	Pearson’s Chi-squared test	0e+00
Stress Urinary Incontinence Therapies	Stress Urinary Incontinence Therapies	Age, years	countpct	70 to 74 years old	categorical	1036.00000, 30.27469	1407.00000, 25.32397	1502.00000, 19.46352	3945.00000, 23.62983	Pearson’s Chi-squared test	0e+00
Stress Urinary Incontinence Therapies	Stress Urinary Incontinence Therapies	Age, years	countpct	75 to 79 years old	categorical	747.00000, 21.82934	1253.0000, 22.5522	1722.00000, 22.31437	3722.0000, 22.2941	Pearson’s Chi-squared test	0e+00
Stress Urinary Incontinence Therapies	Stress Urinary Incontinence Therapies	Age, years	countpct	80 to 84 years old	categorical	404.00000, 11.80596	848.00000, 15.26278	1656.00000, 21.45912	2908.00000, 17.41839	Pearson’s Chi-squared test	0e+00
Stress Urinary Incontinence Therapies	Stress Urinary Incontinence Therapies	Age, years	countpct	85 or older	categorical	203.000000, 5.932203	721.00000, 12.97696	1805.00000, 23.38992	2729.00000, 16.34621	Pearson’s Chi-squared test	0e+00
Stress Urinary Incontinence Therapies	Stress Urinary Incontinence Therapies	Race	Race	Race	categorical					Pearson’s Chi-squared test	2e-07
Stress Urinary Incontinence Therapies	Stress Urinary Incontinence Therapies	Race	countpct	Other	categorical	169.000000, 4.938632	421.000000, 7.577394	595.00000, 7.71025	1185.000000, 7.097934	Pearson’s Chi-squared test	2e-07
Stress Urinary Incontinence Therapies	Stress Urinary Incontinence Therapies	Race	countpct	White	categorical	3253.00000, 95.06137	5135.00000, 92.42261	7122.00000, 92.28975	15510.00000, 92.90207	Pearson’s Chi-squared test	2e-07
Stress Urinary Incontinence Therapies	Stress Urinary Incontinence Therapies	Year therapy performed	Year therapy performed	Year therapy performed	categorical					Pearson’s Chi-squared test	0e+00
Stress Urinary Incontinence Therapies	Stress Urinary Incontinence Therapies	Year therapy performed	countpct	2008	categorical	697.00000, 20.36821	619.00000, 11.14111	1179.00000, 15.27796	2495.00000, 14.94459	Pearson’s Chi-squared test	0e+00
Stress Urinary Incontinence Therapies	Stress Urinary Incontinence Therapies	Year therapy performed	countpct	2009	categorical	632.00000, 18.46873	571.00000, 10.27718	974.00000, 12.62149	2177.00000, 13.03983	Pearson’s Chi-squared test	0e+00
Stress Urinary Incontinence Therapies	Stress Urinary Incontinence Therapies	Year therapy performed	countpct	2010	categorical	554.00000, 16.18936	644.00000, 11.59107	884.00000, 11.45523	2082.0000, 12.4708	Pearson’s Chi-squared test	0e+00
Stress Urinary Incontinence Therapies	Stress Urinary Incontinence Therapies	Year therapy performed	countpct	2011	categorical	492.00000, 14.37756	573.00000, 10.31317	870.00000, 11.27381	1935.0000, 11.5903	Pearson’s Chi-squared test	0e+00
Stress Urinary Incontinence Therapies	Stress Urinary Incontinence Therapies	Year therapy performed	countpct	2012	categorical	301.000000, 8.796026	609.00000, 10.96112	803.0000, 10.4056	1713.00000, 10.26056	Pearson’s Chi-squared test	0e+00
Stress Urinary Incontinence Therapies	Stress Urinary Incontinence Therapies	Year therapy performed	countpct	2013	categorical	259.000000, 7.568673	607.00000, 10.92513	817.00000, 10.58702	1683.00000, 10.08086	Pearson’s Chi-squared test	0e+00
Stress Urinary Incontinence Therapies	Stress Urinary Incontinence Therapies	Year therapy performed	countpct	2014	categorical	189.000000, 5.523086	679.00000, 12.22102	756.000000, 9.796553	1624.000000, 9.727463	Pearson’s Chi-squared test	0e+00
Stress Urinary Incontinence Therapies	Stress Urinary Incontinence Therapies	Year therapy performed	countpct	2015	categorical	167.000000, 4.880187	669.00000, 12.04104	807.00000, 10.45743	1643.00000, 9.84127	Pearson’s Chi-squared test	0e+00
Stress Urinary Incontinence Therapies	Stress Urinary Incontinence Therapies	Year therapy performed	countpct	2016	categorical	131.000000, 3.828171	585.00000, 10.52916	627.000000, 8.124919	1343.000000, 8.044325	Pearson’s Chi-squared test	0e+00
Stress Urinary Incontinence Therapies	Stress Urinary Incontinence Therapies	Tobacco Use	Tobacco Use	Tobacco Use	categorical					Pearson’s Chi-squared test	0e+00
Stress Urinary Incontinence Therapies	Stress Urinary Incontinence Therapies	Tobacco Use	countpct	Non-smoker	categorical	2903.00000, 84.83343	5103.00000, 91.84665	7092.000, 91.901	15098.00000, 90.43426	Pearson’s Chi-squared test	0e+00
Stress Urinary Incontinence Therapies	Stress Urinary Incontinence Therapies	Tobacco Use	countpct	Smoker	categorical	519.00000, 15.16657	453.000000, 8.153348	625.000000, 8.099002	1597.000000, 9.565738	Pearson’s Chi-squared test	0e+00
Stress Urinary Incontinence Therapies	Stress Urinary Incontinence Therapies	US Census Bureau Subdivision	US Census Bureau Subdivision	US Census Bureau Subdivision	categorical					Pearson’s Chi-squared test	0e+00
Stress Urinary Incontinence Therapies	Stress Urinary Incontinence Therapies	US Census Bureau Subdivision	countpct	East North Central	categorical	529.0000, 15.4588	814.00000, 14.65083	1398.00000, 18.11585	2741.00000, 16.41809	Pearson’s Chi-squared test	0e+00
Stress Urinary Incontinence Therapies	Stress Urinary Incontinence Therapies	US Census Bureau Subdivision	countpct	East South Central	categorical	366.0000, 10.6955	174.000000, 3.131749	476.0000, 6.1682	1016.000000, 6.085654	Pearson’s Chi-squared test	0e+00
Stress Urinary Incontinence Therapies	Stress Urinary Incontinence Therapies	US Census Bureau Subdivision	countpct	Middle Atlantic	categorical	318.000000, 9.292811	633.00000, 11.39309	1161.00000, 15.04471	2112.00000, 12.65049	Pearson’s Chi-squared test	0e+00
Stress Urinary Incontinence Therapies	Stress Urinary Incontinence Therapies	US Census Bureau Subdivision	countpct	Mountain	categorical	188.000000, 5.493863	391.000000, 7.037437	456.000000, 5.909032	1035.000000, 6.199461	Pearson’s Chi-squared test	0e+00
Stress Urinary Incontinence Therapies	Stress Urinary Incontinence Therapies	US Census Bureau Subdivision	countpct	New England	categorical	150.000000, 4.383402	395.000000, 7.109431	648.000000, 8.397045	1193.000000, 7.145852	Pearson’s Chi-squared test	0e+00
Stress Urinary Incontinence Therapies	Stress Urinary Incontinence Therapies	US Census Bureau Subdivision	countpct	Pacific	categorical	321.000000, 9.380479	963.00000, 17.33261	840.00000, 10.88506	2124.00000, 12.72237	Pearson’s Chi-squared test	0e+00
Stress Urinary Incontinence Therapies	Stress Urinary Incontinence Therapies	US Census Bureau Subdivision	countpct	South Atlantic	categorical	744.00000, 21.74167	1302.00000, 23.43413	1560.00000, 20.21511	3606.00000, 21.59928	Pearson’s Chi-squared test	0e+00
Stress Urinary Incontinence Therapies	Stress Urinary Incontinence Therapies	US Census Bureau Subdivision	countpct	West North Central	categorical	299.00000, 8.73758	506.000000, 9.107271	447.000000, 5.792406	1252.000000, 7.499251	Pearson’s Chi-squared test	0e+00
Stress Urinary Incontinence Therapies	Stress Urinary Incontinence Therapies	US Census Bureau Subdivision	countpct	West South Central	categorical	507.0000, 14.8159	378.000000, 6.803456	731.000000, 9.472593	1616.000000, 9.679545	Pearson’s Chi-squared test	0e+00
Stress Urinary Incontinence Therapies	Stress Urinary Incontinence Therapies	Enrolled in Medicare and Medicaid	Enrolled in Medicare and Medicaid	Enrolled in Medicare and Medicaid	categorical					Pearson’s Chi-squared test	0e+00
Stress Urinary Incontinence Therapies	Stress Urinary Incontinence Therapies	Enrolled in Medicare and Medicaid	countpct	Enrolled in Medicare and Medicaid	categorical	330.000000, 9.643483	568.00000, 10.22318	1069.00000, 13.85253	1967.00000, 11.78197	Pearson’s Chi-squared test	0e+00
Stress Urinary Incontinence Therapies	Stress Urinary Incontinence Therapies	Enrolled in Medicare and Medicaid	countpct	No dual coverage	categorical	3092.00000, 90.35652	4988.00000, 89.77682	6648.00000, 86.14747	14728.00000, 88.21803	Pearson’s Chi-squared test	0e+00

	Incontinence sling (N=3422)	Pelvic floor muscle therapy (N=5556)	Pessary (N=7717)	Total (N=16695)	p value
Age, years					< 0.01 (1)
- 66 to 69 years old	1032 (30.2%)	1327 (23.9%)	1032 (13.4%)	3391 (20.3%)
- 70 to 74 years old	1036 (30.3%)	1407 (25.3%)	1502 (19.5%)	3945 (23.6%)
- 75 to 79 years old	747 (21.8%)	1253 (22.6%)	1722 (22.3%)	3722 (22.3%)
- 80 to 84 years old	404 (11.8%)	848 (15.3%)	1656 (21.5%)	2908 (17.4%)
- 85 or older	203 (5.9%)	721 (13.0%)	1805 (23.4%)	2729 (16.3%)
Race					< 0.01 (1)
- Other	169 (4.9%)	421 (7.6%)	595 (7.7%)	1185 (7.1%)
- White	3253 (95.1%)	5135 (92.4%)	7122 (92.3%)	15510 (92.9%)
Year therapy performed					< 0.01 (1)
- 2008	697 (20.4%)	619 (11.1%)	1179 (15.3%)	2495 (14.9%)
- 2009	632 (18.5%)	571 (10.3%)	974 (12.6%)	2177 (13.0%)
- 2010	554 (16.2%)	644 (11.6%)	884 (11.5%)	2082 (12.5%)
- 2011	492 (14.4%)	573 (10.3%)	870 (11.3%)	1935 (11.6%)
- 2012	301 (8.8%)	609 (11.0%)	803 (10.4%)	1713 (10.3%)
- 2013	259 (7.6%)	607 (10.9%)	817 (10.6%)	1683 (10.1%)
- 2014	189 (5.5%)	679 (12.2%)	756 (9.8%)	1624 (9.7%)
- 2015	167 (4.9%)	669 (12.0%)	807 (10.5%)	1643 (9.8%)
- 2016	131 (3.8%)	585 (10.5%)	627 (8.1%)	1343 (8.0%)
Tobacco Use					< 0.01 (1)
- Non-smoker	2903 (84.8%)	5103 (91.8%)	7092 (91.9%)	15098 (90.4%)
- Smoker	519 (15.2%)	453 (8.2%)	625 (8.1%)	1597 (9.6%)
US Census Bureau Subdivision					< 0.01 (1)
- East North Central	529 (15.5%)	814 (14.7%)	1398 (18.1%)	2741 (16.4%)
- East South Central	366 (10.7%)	174 (3.1%)	476 (6.2%)	1016 (6.1%)
- Middle Atlantic	318 (9.3%)	633 (11.4%)	1161 (15.0%)	2112 (12.7%)
- Mountain	188 (5.5%)	391 (7.0%)	456 (5.9%)	1035 (6.2%)
- New England	150 (4.4%)	395 (7.1%)	648 (8.4%)	1193 (7.1%)
- Pacific	321 (9.4%)	963 (17.3%)	840 (10.9%)	2124 (12.7%)
- South Atlantic	744 (21.7%)	1302 (23.4%)	1560 (20.2%)	3606 (21.6%)
- West North Central	299 (8.7%)	506 (9.1%)	447 (5.8%)	1252 (7.5%)
- West South Central	507 (14.8%)	378 (6.8%)	731 (9.5%)	1616 (9.7%)
Enrolled in Medicare and Medicaid					< 0.01 (1)
- Enrolled in Medicare and Medicaid	330 (9.6%)	568 (10.2%)	1069 (13.9%)	1967 (11.8%)
- No dual coverage	3092 (90.4%)	4988 (89.8%)	6648 (86.1%)	14728 (88.2%)

x
16695

# Export the tableby object directly to Word
arsenal::write2word(
  table1$comparison,  # Use the comparison component which is the tableby object
  file = word_file_path,  # Specify your desired output path
  quiet = TRUE
)

# Open the Word document after saving
browseURL(word_file_path)

15.2 Table 2

cost_data_path <- "Data/CADR_2023/final_push/_Costs_step33.rds"
output_docx = "Data/CADR_2023/final_push/Table2.docx"

build_table2(cost_data_path, output_docx = output_docx, verbose = TRUE)

SUI Treatment	Cost Category	Fee Type	Median	IQR	Count
PT	Complication-Related Costs	Facility Fees	$88	$32 - $190	27
PT	Complication-Related Costs	Professional Fees	$89	$44 - $172	123
PT	Diagnostic Testing	Facility Fees	$21	$11 - $55	2138
PT	Diagnostic Testing	Professional Fees	$148	$23 - $572	4468
PT	Outpatient Visit Costs	Facility Fees	$122	$78 - $200	126
PT	Outpatient Visit Costs	Professional Fees	$193	$112 - $308	4071
Pessary	Complication-Related Costs	Facility Fees	$33	$15 - $116	213
Pessary	Complication-Related Costs	Professional Fees	$151	$58 - $221	669
Pessary	Diagnostic Testing	Facility Fees	$22	$11 - $57	3389
Pessary	Diagnostic Testing	Professional Fees	$60	$16 - $330	5891
Pessary	Outpatient Visit Costs	Facility Fees	$104	$73 - $180	128
Pessary	Outpatient Visit Costs	Professional Fees	$167	$99 - $263	5452
Sling	Complication-Related Costs	Facility Fees	$1,986	$1,986 - $1,986	1
Sling	Complication-Related Costs	Professional Fees	$219	$94 - $680	108
Sling	Diagnostic Testing	Facility Fees	$18	$8 - $44	1814
Sling	Diagnostic Testing	Professional Fees	$365	$34 - $551	3064
Sling	Facility Costs	Total	$1,218	$979 - $1,484	96
Sling	Outpatient Visit Costs	Facility Fees	$142	$82 - $226	37
Sling	Outpatient Visit Costs	Professional Fees	$202	$128 - $294	3122

15.3 Table 2: Categories Supplemental

cost_data <- readr::read_rds("Data/CADR_2023/final_push/_Costs_step33.rds")

# Define cost categories and their corresponding column patterns
cost_categories <- list(
  "Visit Costs - New Patient" = c("new_lvl1_cost", "new_lvl2_cost", "new_lvl3_cost",
                                   "new_lvl4_cost", "new_lvl5_cost"),
  "Visit Costs - Return Patient" = c("return_lvl1_cost", "return_lvl2_cost", "return_lvl3_cost",
                                      "return_lvl4_cost", "return_lvl5_cost"),
  "Visit Costs - Consultation" = c("consult_lvl1_cost", "consult_lvl2_cost", "consult_lvl3_cost",
                                    "consult_lvl4_cost", "consult_lvl5_cost"),
  "Diagnostic Tests - Basic" = c("UA_cost", "urineCX_cost", "microscopy_cost"),
  "Diagnostic Tests - Advanced" = c("cathPVR_cost", "usPVR_cost", "uroflowmetry_cost",
                                     "cystometrics_cost", "UDS_cost", "vUDS_cost", 
                                     "cystoscopy_cost", "UTeval_cost"),
  "Facility Type Costs" = c("IP_facility_cost", "OP_facility_cost", "ASC_facility_cost"),
  "Anesthesia Costs" = c("anesth_cost"),
  "Complication Costs" = c("complication_cost"),
  "Emergency Costs" = c("EM_cost", "edOP_facility_cost", "edprofessional_cost")
)

# Convert cost_categories into a data frame for table formatting
cost_categories_df <- data.frame(
  "Cost Category" = rep(names(cost_categories), lengths(cost_categories)),
  "Column Name" = unlist(cost_categories)
)

# Create a formatted flextable
cost_categories_flextable <- cost_categories_df %>%
  flextable() %>%
  theme_vanilla() %>%
  set_header_labels(
    `Cost Category` = "Cost Category",
    `Column Name` = "Associated Column Names"
  ) %>%
  autofit()

# Justification paragraph
justification_text <- paste(
  "The grouping of cost columns into distinct categories was carefully designed to reflect the structure of healthcare utilization",
  "and cost allocation in clinical practice. Visit costs were separated into New Patient, Return Patient, and Consultation categories",
  "to account for the differences in reimbursement and resource utilization across different types of patient encounters. New patient",
  "visits typically involve more comprehensive evaluations, whereas return visits focus on follow-ups, and consultations often",
  "require input from specialists. Similarly, diagnostic tests were classified into Basic and Advanced to distinguish routine tests,",
  "such as urinalysis and urine cultures, from more specialized procedures like urodynamic studies and cystoscopy, which require",
  "higher levels of expertise and equipment. This separation allows for a more accurate assessment of how different levels of",
  "diagnostic testing contribute to overall costs.",
  
  "Other cost categories were created to capture specific aspects of patient care that significantly impact healthcare spending.",
  "Facility Type Costs were grouped separately to highlight the cost variations between inpatient, outpatient, and ambulatory",
  "surgical center settings, which differ in overhead and procedural complexity. Anesthesia Costs, Complication Costs, and",
  "Emergency Costs were categorized independently to account for additional expenses incurred due to surgical procedures,",
  "post-operative complications, and urgent or unplanned care needs. By organizing costs into these clinically meaningful categories,",
  "the analysis provides a clearer picture of where resources are allocated and helps identify key cost drivers within different",
  "treatment pathways."
)

# Create a new Word document and add content
doc <- read_docx() %>%
  body_add_par("Cost Categories and Their Associated Columns", style = "heading 1") %>%
  body_add_par("This table provides a mapping of cost categories to their respective column names.") %>%
  body_add_flextable(cost_categories_flextable) %>%
  body_add_par("Justification for Cost Category Grouping", style = "heading 1") %>%
  body_add_par(justification_text)

# Define the output file path
output_path <- "Data/CADR_2023/final_push/Cost_Categories_Grouping.docx"

# Save the Word document
print(doc, target = output_path)

browseURL(output_path)

# Message indicating successful save
message("Cost categories successfully saved as ", output_path)

15.4 Table 2: Categories Tally

# Define cost categories and their corresponding column patterns
cost_categories <- list(
  "Visit Costs - New Patient" = c("new_lvl1_cost", "new_lvl2_cost", "new_lvl3_cost",
                                   "new_lvl4_cost", "new_lvl5_cost"),
  "Visit Costs - Return Patient" = c("return_lvl1_cost", "return_lvl2_cost", "return_lvl3_cost",
                                      "return_lvl4_cost", "return_lvl5_cost"),
  "Visit Costs - Consultation" = c("consult_lvl1_cost", "consult_lvl2_cost", "consult_lvl3_cost",
                                    "consult_lvl4_cost", "consult_lvl5_cost"),
  "Diagnostic Tests - Basic" = c("UA_cost", "urineCX_cost", "microscopy_cost"),
  "Diagnostic Tests - Advanced" = c("cathPVR_cost", "usPVR_cost", "uroflowmetry_cost",
                                     "cystometrics_cost", "UDS_cost", "vUDS_cost", 
                                     "cystoscopy_cost", "UTeval_cost"),
  "Facility Type Costs" = c("IP_facility_cost", "OP_facility_cost", "ASC_facility_cost"),
  "Anesthesia Costs" = c("anesth_cost"),
  "Complication Costs" = c("complication_cost"),
  "Emergency Costs" = c("EM_cost", "edOP_facility_cost", "edprofessional_cost")
)

# Payment types to consider
payment_types <- c("OP_facility_", "professional_")

# Initialize list to store categorized columns
categorized_columns <- list()

# Categorize columns
for (category in names(cost_categories)) {
  category_patterns <- cost_categories[[category]]
  
  # Match both category pattern and payment type
  matching_cols <- unique(unlist(lapply(payment_types, function(ptype) {
    lapply(category_patterns, function(pattern) {
      grep(paste0(ptype, ".*", pattern), names(cost_data), value = TRUE, ignore.case = TRUE)
    })
  })))
  
  if (length(matching_cols) > 0) {
    categorized_columns[[category]] <- matching_cols
  }
}

# Create summary table
summary_df <- data.frame(
  Category = names(categorized_columns),
  Column_Count = sapply(categorized_columns, length),
  Facility_Columns = sapply(categorized_columns, function(x) sum(grepl("facility", x, ignore.case = TRUE))),
  Professional_Columns = sapply(categorized_columns, function(x) sum(grepl("professional", x, ignore.case = TRUE)))
)

# Create formatted flextable
summary_flextable <- summary_df %>%
  flextable() %>%
  theme_booktabs() %>%
  set_header_labels(
    Category = "Cost Category",
    Column_Count = "Total Columns",
    Facility_Columns = "Facility Columns",
    Professional_Columns = "Professional Columns"
  ) %>%
  autofit()

# Create a new Word document and add the table
doc <- read_docx() %>%
  body_add_par("Table 2 Tally: Cost Categories Summary", style = "heading 1") %>%
  body_add_par("This table provides a tally of cost categories, including the count of facility and professional cost columns.") %>%
  body_add_flextable(summary_flextable)

# Define the output file path
output_path <- "Data/CADR_2023/final_push/Table2_supplemental_tally.docx"

# Save the Word document
print(doc, target = output_path)

browseURL(output_path)

# Message indicating successful save
message("Table 2 Tally successfully saved as ", output_path)

16 Log Linear Model

The coefficients are not exponentiated back to normal values. This equation represents the log-transformed outcome variable, meaning that the model estimates changes in the log of “Sum of Allowable Medicare Charges.” \[ \begin{aligned} \log(\text{Sum of Allowable Medicare Charges}) &= 5.77 \\ &+ 1.18 \times \text{SUI Treatment: Sling} \\ &+ 0.0429 \times \text{Age 71 to 75.9} \\ &- 0.1348 \times \text{Age 86 or greater} \\ &+ 0.0795 \times \text{Medicare only} \\ &+ 0.1642 \times \text{Middle Atlantic} \\ &+ 0.1289 \times \text{South Atlantic} \\ &+ 0.4043 \times \text{Urology} \\ &+ 0.2210 \times \text{OBGYN} \\ &- 0.1132 \times \text{PCP} \\ &+ 0.04897 \times \text{Diabetes before SUI} \\ &+ 0.0641 \times \text{Obesity before SUI} \\ &+ 0.5717 \times \text{Urinary Tract Infection during SUI} \\ &+ \varepsilon \end{aligned} \]

“Comorbidities and postoperative complications significantly increased costs, with complication1??? (????) and complication2???? (????), respectively, contributing the most (all p < ???).”

Examine significant positive predictors of cost.

# Create sentence
sentence <- sprintf(
  "Comorbidities and postoperative complications significantly increased costs, with %s and %s and %s, respectively, contributing the most (all %s).",
  sprintf("%s (%.1f%%)", comorbidities$name[1], comorbidities$percent_change[1]),
  sprintf("%s (%.1f%%)", comorbidities$name[2], comorbidities$percent_change[2]),
  sprintf("%s (%.1f%%)", complications$name[1], complications$percent_change[1]),
  p_value_statement
)

print(sentence)

[1] “Comorbidities and postoperative complications significantly increased costs, with Obesity (6.6%) and Diabetes (5.0%) and UrinaryTractInfection (77.1%), respectively, contributing the most (all p < 0.01).”

Read in logistic regression data from exploratory.io in ~Costs/Linear regression Original code from “~Payment regression Rcode 2 on 11.23.2024.R”

16.1 Data Prep

df <- read_rds("Data/CADR_2023/final_push/Linear regression pretty.rds")

# Checks levels 
purrr::map(df, ~ if (is.factor(.x)) list(reference_level = levels(.x)[1], all_levels = levels(.x))) %>% compact()

$StressUrinaryIncontinenceTreatment$StressUrinaryIncontinenceTreatment$reference_level [1] “Pessary”

$StressUrinaryIncontinenceTreatment$all_levels [1] “Pessary” “PT” “Sling”

$Year$Year$reference_level [1] “2008”

$Year$all_levels [1] “2008” “2009” “2010” “2011” “2012” “2013” “2014” “2015” “2016”

$Age$Age$reference_level [1] “66 to 70.9”

$Age$all_levels [1] “66 to 70.9” “71 to 75.9” “76 to 80.9” “81 to 85.9”
[5] “86 or greater”

$Race$Race$reference_level [1] “Other”

$Race$all_levels [1] “Other” “White”

$Insurance$Insurance$reference_level [1] “Medicare and Medicaid”

$Insurance$all_levels [1] “Medicare and Medicaid” “Medicare only”

$USCensusBureauDivision$USCensusBureauDivision$reference_level [1] “New England”

$USCensusBureauDivision$all_levels [1] “New England” “East North Central” “East South Central” [4] “Middle Atlantic” “Mountain” “Pacific”
[7] “South Atlantic” “West North Central” “West South Central”

$TreatingPhysician$TreatingPhysician$reference_level [1] “NP”

$TreatingPhysician$all_levels [1] “NP” “OBGYN” “PCP” “Urology”

$NeurologicalDisorderBeforeSUI$NeurologicalDisorderBeforeSUI$reference_level [1] “No”

$NeurologicalDisorderBeforeSUI$all_levels [1] “No” “Yes”

$ChronicLungDisorderBeforeSUI$ChronicLungDisorderBeforeSUI$reference_level [1] “No”

$ChronicLungDisorderBeforeSUI$all_levels [1] “No” “Yes”

$DepressionBeforeSUI$DepressionBeforeSUI$reference_level [1] “No”

$DepressionBeforeSUI$all_levels [1] “No” “Yes”

$DiabetesBeforeSUI$DiabetesBeforeSUI$reference_level [1] “No”

$DiabetesBeforeSUI$all_levels [1] “No” “Yes”

$ObesityBeforeSUI$ObesityBeforeSUI$reference_level [1] “No”

$ObesityBeforeSUI$all_levels [1] “No” “Yes”

$TobaccoUseBeforeSUI$TobaccoUseBeforeSUI$reference_level [1] “No”

$TobaccoUseBeforeSUI$all_levels [1] “No” “Yes”

$UrinaryTractInfectionDuringSUI$UrinaryTractInfectionDuringSUI$reference_level [1] “No”

$UrinaryTractInfectionDuringSUI$all_levels [1] “No” “Yes”

table(df$UrinaryTractInfectionDuringSUI)  # Check if "Yes" is present

No Yes 15746 949

levels(df$UrinaryTractInfectionDuringSUI)

[1] “No” “Yes”

df %>%
  dplyr::select(UrinaryTractInfectionDuringSUI) %>%
  purrr::map(levels)

$UrinaryTractInfectionDuringSUI [1] “No” “Yes”

summary(df$UrinaryTractInfectionDuringSUI)

No Yes 15746 949

16.2 Remove zero variance and near-zero variance

#Check for missing:
colSums(is.na(df))

StressUrinaryIncontinenceTreatment Year 0 0 Age Race 0 0 Insurance USCensusBureauDivision 0 0 SumOfPaymentForTreatment TreatingPhysician 0 0 NeurologicalDisorderBeforeSUI ChronicLungDisorderBeforeSUI 0 0 DepressionBeforeSUI DiabetesBeforeSUI 0 0 ObesityBeforeSUI TobaccoUseBeforeSUI 0 0 UrinaryTractInfectionDuringSUI 0

# Identify which columns are factors
factor_vars <- df %>%
  select(where(is.factor)) %>%
  names(); print(factor_vars)

[1] “StressUrinaryIncontinenceTreatment” “Year”
[3] “Age” “Race”
[5] “Insurance” “USCensusBureauDivision”
[7] “TreatingPhysician” “NeurologicalDisorderBeforeSUI”
[9] “ChronicLungDisorderBeforeSUI” “DepressionBeforeSUI”
[11] “DiabetesBeforeSUI” “ObesityBeforeSUI”
[13] “TobaccoUseBeforeSUI” “UrinaryTractInfectionDuringSUI”

# Identify which columns are numeric
numeric_vars <- df %>%
  select(where(is.numeric)) %>%
  names(); print(numeric_vars)

[1] “SumOfPaymentForTreatment”

# Identify factors with only one level
one_level_factors <- df %>%
  select(where(is.factor)) %>%
  map(~ levels(.x)) %>%
  keep(~ length(.x) == 1) %>%
  names(); print(one_level_factors)

character(0)

# Find factors with only one level
zero_variance_factors <- df %>%
  select(where(is.factor)) %>%
  map(~ nlevels(.x)) %>%
  keep(~ .x == 1) %>%
  names(); print(zero_variance_factors)  # Shows variables with zero variance

character(0)

# Identify near-zero variance factors
# Factors where one level dominates (>95%).
# Identify near-zero variance factors
nzv_factors <- caret::nearZeroVar(df %>% dplyr::select(where(is.factor)), saveMetrics = TRUE)

# Filter for factors with near-zero variance
nzv_factors <- rownames(nzv_factors[nzv_factors$nzv == TRUE, ])

# Generate the sentence dynamically
if (length(nzv_factors) > 0) {
  sentence <- glue(
    "The following factors exhibited **near-zero variance**, meaning that one level dominated over **95%** of observations: {format_list_with_and(nzv_factors)}. ",
    "These variables contribute minimal variability to the dataset and may not provide meaningful differentiation in statistical models."
  )
} else {
  sentence <- "No factors exhibited near-zero variance."
}

# Print the sentence
cat(sentence)

No factors exhibited near-zero variance.

# Remove these columns from the dataframe
df <- df %>% select(-any_of(nzv_factors))

# Factors removed due to near zero variance.  
# > factor_vars <- df %>%
# +   select(where(is.factor)) %>%
# +   names(); print(factor_vars)
#  [1] "StressUrinaryIncontinenceTreatment" "Year"                               "Age"                               
#  [4] "Race"                               "Insurance"                          "USCensusBureauDivision"            
#  [7] "NeurologicalDisorderBeforeSUI"      "ChronicLungDisorderBeforeSUI"       "DepressionBeforeSUI"               
# [10] "DiabetesBeforeSUI"                  "ObesityBeforeSUI"                   "TobaccoUseBeforeSUI"               
# [13] "UrinaryTractInfectionDuringSUI"

16.3 Create the log linear model

# We can go directly to the log function because there are no zero values.    
df$log_payment <- log(df$SumOfPaymentForTreatment)
df <- df %>%
  select(-SumOfPaymentForTreatment)

model <- lm(log_payment ~ ., data = df)
summary(model)

Call: lm(formula = log_payment ~ ., data = df)

Residuals: Min 1Q Median 3Q Max -2.79578 -0.46102 0.01506 0.51552 2.18116

Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 5.770135 0.044697 129.095 < 2e-16 StressUrinaryIncontinenceTreatmentPT 0.023953 0.013876 1.726 0.084321 StressUrinaryIncontinenceTreatmentSling 1.183877 0.016901 70.048 < 2e-16 Year2009 0.045918 0.022167 2.071 0.038334 Year2010 0.087617 0.022448 3.903 9.53e-05 Year2011 0.067174 0.022913 2.932 0.003376 Year2012 0.052079 0.023838 2.185 0.028923 Year2013 0.081979 0.024016 3.413 0.000643 Year2014 0.015756 0.024370 0.647 0.517939 Year2015 -0.024428 0.024330 -1.004 0.315372 Year2016 -0.002619 0.025994 -0.101 0.919739 Age71 to 75.9 0.042917 0.016909 2.538 0.011152 Age76 to 80.9 0.014557 0.017380 0.838 0.402284 Age81 to 85.9 -0.016442 0.019061 -0.863 0.388348 Age86 or greater -0.134842 0.020726 -6.506 7.94e-11 RaceWhite -0.016131 0.023680 -0.681 0.495747 InsuranceMedicare only 0.079462 0.019172 4.145 3.42e-05 USCensusBureauDivisionEast North Central -0.029554 0.026312 -1.123 0.261352 USCensusBureauDivisionEast South Central -0.044428 0.032546 -1.365 0.172243 USCensusBureauDivisionMiddle Atlantic 0.164233 0.027494 5.973 2.37e-09 USCensusBureauDivisionMountain -0.064527 0.032194 -2.004 0.045050 USCensusBureauDivisionPacific 0.082630 0.027556 2.999 0.002716 USCensusBureauDivisionSouth Atlantic 0.128948 0.025435 5.070 4.03e-07 USCensusBureauDivisionWest North Central -0.069670 0.030727 -2.267 0.023379 USCensusBureauDivisionWest South Central -0.002275 0.029115 -0.078 0.937723 TreatingPhysicianOBGYN 0.221030 0.026213 8.432 < 2e-16 TreatingPhysicianPCP -0.113195 0.027140 -4.171 3.05e-05 TreatingPhysicianUrology 0.404282 0.026734 15.123 < 2e-16 NeurologicalDisorderBeforeSUIYes -0.035797 0.017512 -2.044 0.040955 ChronicLungDisorderBeforeSUIYes 0.014696 0.014148 1.039 0.298952 DepressionBeforeSUIYes -0.011173 0.015729 -0.710 0.477499 DiabetesBeforeSUIYes 0.048967 0.013473 3.634 0.000279 ObesityBeforeSUIYes 0.064139 0.020397 3.145 0.001666 TobaccoUseBeforeSUIYes -0.007114 0.020508 -0.347 0.728675 UrinaryTractInfectionDuringSUIYes 0.571660 0.025769 22.184 < 2e-16

(Intercept) StressUrinaryIncontinenceTreatmentPT .
StressUrinaryIncontinenceTreatmentSling Year2009 *
Year2010 Year2011 Year2012
Year2013 Year2014
Year2015
Year2016
Age71 to 75.9
Age76 to 80.9
Age81 to 85.9
Age86 or greater RaceWhite
InsuranceMedicare only USCensusBureauDivisionEast North Central
USCensusBureauDivisionEast South Central
USCensusBureauDivisionMiddle Atlantic USCensusBureauDivisionMountain
USCensusBureauDivisionPacific ** USCensusBureauDivisionSouth Atlantic USCensusBureauDivisionWest North Central
USCensusBureauDivisionWest South Central
TreatingPhysicianOBGYN TreatingPhysicianPCP TreatingPhysicianUrology NeurologicalDisorderBeforeSUIYes
ChronicLungDisorderBeforeSUIYes
DepressionBeforeSUIYes
DiabetesBeforeSUIYes ObesityBeforeSUIYes TobaccoUseBeforeSUIYes
UrinaryTractInfectionDuringSUIYes ** — Signif. codes: 0 ‘’ 0.001 ’’ 0.01 ’’ 0.05 ‘.’ 0.1 ’ ’ 1

Residual standard error: 0.755 on 16660 degrees of freedom Multiple R-squared: 0.364, Adjusted R-squared: 0.3627 F-statistic: 280.4 on 34 and 16660 DF, p-value: < 2.2e-16

16.4 Back transform the coefficients

\[ \hat{Y} = e^{\beta_0 + \beta_1X_1 + \beta_2X_2 + \dots + \beta_nX_n} \] Where:

$\hat{Y}$ is the predicted Sum of Allowable Medicare Charges (on the original scale).
$\beta_0$ is the intercept (log of baseline charges).
$\beta_1, \beta_2, \dots, \beta_n$ are the estimated coefficients from the log-linear model.
$X_1, X_2, \dots, X_n$ are the predictor variables.
$e^{\beta}$ converts the log-scale coefficients to the multiplicative effect on the cost.

16.5 Table 3: Log linear regression results

This table presents the results of a log-linear regression where the coefficients have been exponentiated, meaning they reflect multiplicative changes in the sum of allowable Medicare charges (total treatment cost). Instead of interpreting coefficients on the log scale, these values show the relative percentage increase or decrease in cost associated with each predictor.

The baseline predicted Medicare charges (before adjusting for covariates) is approximately $320.58.The cost of Sling treatment is 3.27 times higher compared to the reference treatment (likely Pessary).

\[ \begin{aligned} \text{Sum of Medicare Cost (USD)} &= 320.58 \text{ US dollars} \\ &\quad + 3.27 \times \text{SUI Treatment: Sling} + 1.02 \times \text{SUI Treatment: PT} \\ &\quad + 1.05 \times \text{Year: 2009} + 1.09 \times \text{Year: 2010} + 1.07 \times \text{Year: 2011} \\ &\quad + 1.05 \times \text{Year: 2012} + 1.09 \times \text{Year: 2013} + 1.02 \times \text{Year: 2014} \\ &\quad + 0.98 \times \text{Year: 2015} + 1.00 \times \text{Year: 2016} \\ &\quad + 1.04 \times \text{Age: 71 to 75.9} + 1.01 \times \text{Age: 76 to 80.9} \\ &\quad + 0.98 \times \text{Age: 81 to 85.9} - 0.87 \times \text{Age: 86 or greater} \\ &\quad + 0.97 \times \text{Region: East North Central} + 0.96 \times \text{Region: East South Central} \\ &\quad + 1.18 \times \text{Region: Middle Atlantic} + 0.94 \times \text{Region: Mountain} \\ &\quad + 1.09 \times \text{Region: Pacific} + 1.14 \times \text{Region: South Atlantic} \\ &\quad + 0.93 \times \text{Region: West North Central} + 1.00 \times \text{Region: West South Central} \\ &\quad + 1.25 \times \text{Clinician: OBGYN} + 0.89 \times \text{Clinician: PCP} + 1.50 \times \text{Clinician: Urology} \\ &\quad + 0.96 \times \text{Neurological Disease Before Treatment} \\ &\quad + 0.99 \times \text{Tobacco Use Before Treatment} \\ &\quad + 1.77 \times \text{UTI During Treatment} \end{aligned} \]

16.6 Cleans the Variable Names

# Print the table
print(reg_table)

a flextable object. col_keys: Variable, Exponentiated Coefficient, Exp Lower CI, Exp Upper CI, Percent Change (%), p-value header has 1 row(s) body has 35 row(s) original dataset sample: Variable Exponentiated Coefficient Exp Lower CI Exp Upper CI 1 Intercept 320.58 293.69 349.93 2 SUI Treatment: PT 1.02 1.00 1.05 3 SUI Treatment: Sling 3.27 3.16 3.38 4 Year: 2009 1.05 1.00 1.09 5 Year: 2010 1.09 1.04 1.14 Percent Change (%) p-value 1 31958.09 <0.001 2 2.42 0.084 3 226.70 <0.001 4 4.70 0.038 5 9.16 <0.001

16.7 Forest Plot

# Extract model coefficients and tidy them
model_summary <- tidy(model, conf.int = TRUE)

# Get reference levels from your model
reference_levels <- get_reference_levels(model)
print("Reference levels extracted from model:")

[1] “Reference levels extracted from model:”

print(reference_levels)

$StressUrinaryIncontinenceTreatment [1] “Pessary”

$Year [1] “2008”

$Age [1] “66 to 70.9”

$Race [1] “Other”

$Insurance [1] “Medicare and Medicaid”

$USCensusBureauDivision [1] “New England”

$TreatingPhysician [1] “NP”

$NeurologicalDisorderBeforeSUI [1] “No”

$ChronicLungDisorderBeforeSUI [1] “No”

$DepressionBeforeSUI [1] “No”

$DiabetesBeforeSUI [1] “No”

$ObesityBeforeSUI [1] “No”

$TobaccoUseBeforeSUI [1] “No”

$UrinaryTractInfectionDuringSUI [1] “No”

# Extract coefficient terms (excluding intercept)
coefficient_terms <- model_summary$term[model_summary$term != "(Intercept)"]

# Identify variable patterns
variable_patterns <- identify_variable_patterns(coefficient_terms)
print("Identified variable patterns:")

[1] “Identified variable patterns:”

print(variable_patterns)

[1] “StressUrinaryIncontinenceTreatmentP” “StressUrinaryIncontinenceTreatment” [3] “Year” “Age”
[5] “Race” “Insurance”
[7] “USCensusBureauDivision” “TreatingPhysicianOBGY”
[9] “TreatingPhysicianPC” “TreatingPhysician”
[11] “NeurologicalDisorderBeforeSUI” “ChronicLungDisorderBeforeSUI”
[13] “DepressionBeforeSUI” “DiabetesBeforeSUI”
[15] “ObesityBeforeSUI” “TobaccoUseBeforeSUI”
[17] “UrinaryTractInfectionDuringSUI”

# Transform coefficients for interpretation in original units
model_summary <- model_summary %>%
  mutate(
    # Convert log coefficients to percentage changes
    estimate_percent = to_percent_change(estimate),
    conf_low_percent = to_percent_change(conf.low),
    conf_high_percent = to_percent_change(conf.high),
    # Format p-values
    p_value_format = case_when(
      p.value < 0.001 ~ "p<0.001",
      TRUE ~ paste0("p=", format(round(p.value, 3), nsmall = 3))
    ),
    # Create formatted percentage change text
    percent_text = paste0(
      ifelse(estimate_percent >= 0, "+", ""),
      format(round(estimate_percent * 100, 1), nsmall = 1),
      "% (",
      p_value_format,
      ")"
    )
  )

# Filter out the intercept and add comparison labels
filtered_coeffs <- model_summary %>%
  filter(term != "(Intercept)") %>%
  mutate(
    # Extract variable name and level for each term
    var_base = case_when(
      # For variables like "Age65" or "Year2009"
      grepl("^[A-Za-z]+[0-9]", term) ~ gsub("([A-Za-z]+)([0-9].*)", "\\1", term),
      
      # For variables like "TreatingPhysicianOBGYN"
      grepl("^[A-Za-z]+[A-Z]", term) ~ gsub("([A-Za-z]+)([A-Z].*)", "\\1", term),
      
      # For variables like "DiabetesBeforeSUIYes"
      grepl("Yes$", term) ~ gsub("(.+)Yes$", "\\1", term),
      
      # Default case
      TRUE ~ term
    ),
    
    # Extract the specific level
    var_level = case_when(
      # For variables like "Age65" or "Year2009"
      grepl("^[A-Za-z]+[0-9]", term) ~ gsub("^[A-Za-z]+([0-9].*)", "\\1", term),
      
      # For variables like "TreatingPhysicianOBGYN"
      grepl("^[A-Za-z]+[A-Z]", term) ~ gsub("^[A-Za-z]+([A-Z].*)", "\\1", term),
      
      # For variables like "DiabetesBeforeSUIYes"
      grepl("Yes$", term) ~ "Yes",
      
      # Default case
      TRUE ~ ""
    ),
    
    # Generate descriptive comparison labels
    comparison_label = case_when(
      # For treatment-related variables
      var_base == "StressUrinaryIncontinenceTreatment" ~ 
        paste0("Treatment (", var_level, " vs. ", reference_levels[["StressUrinaryIncontinenceTreatment"]], ")"),
      
      # For physician-related variables
      var_base == "TreatingPhysician" ~ 
        paste0("Specialty (", var_level, " vs. ", reference_levels[["TreatingPhysician"]], ")"),
      
      # For geographic regions
      var_base == "USCensusBureauDivision" ~ 
        paste0("Region (", var_level, " vs. ", reference_levels[["USCensusBureauDivision"]], ")"),
      
      # For age groups
      var_base == "Age" ~ 
        paste0("Age (", var_level, " vs. ", reference_levels[["Age"]], ")"),
      
      # For race
      var_base == "Race" ~ 
        paste0("Race (", var_level, " vs. ", reference_levels[["Race"]], ")"),
      
      # For insurance
      var_base == "Insurance" ~ 
        paste0("Insurance (", var_level, " vs. ", reference_levels[["Insurance"]], ")"),
      
      # For year
      var_base == "Year" ~ 
        paste0("Year ", var_level, " (vs. Reference)"),
      
      # For binary variables (Yes/No)
      var_level == "Yes" ~ 
        paste0(gsub("BeforeSUI|DuringSUI", "", var_base), " (Yes vs. No)"),
      
      # Default - use the original term
      TRUE ~ term
    )
  )

# Select top coefficients and order for display
top_coeffs <- filtered_coeffs %>%
  arrange(desc(abs(estimate_percent))) %>%
  slice(1:20) %>%
  mutate(comparison_label = factor(comparison_label, levels = rev(comparison_label)))

# Create the forest plot with comparison labels
forest_plot <- ggplot(top_coeffs, aes(x = estimate_percent, y = comparison_label)) +
  # Add reference line at zero
  geom_vline(xintercept = 0, linetype = "dashed", color = "gray50") +
  # Add horizontal error bars for confidence intervals
  geom_errorbarh(aes(xmin = conf_low_percent, xmax = conf_high_percent), height = 0.2, color = "black") +
  # Add points for estimates with color based on significance
  geom_point(aes(color = p.value < 0.05), size = 3) +
  scale_color_manual(values = c("FALSE" = "gray60", "TRUE" = "#1f77b4"), guide = "none") +
  # Add text labels for percentage changes
  geom_text(aes(x = max(conf_high_percent) + 0.1, label = percent_text), hjust = 0) +
  # Set the x-axis scale to percentage
  scale_x_continuous(
    name = "Percentage Change in Cost",
    labels = scales::percent,
    limits = c(min(top_coeffs$conf_low_percent) - 0.1, 
               max(top_coeffs$conf_high_percent) + 0.5)
  ) +
  # Label the y-axis
  labs(
    y = "",
    title = "Coefficient Forest Plot of Regression Results",
    subtitle = "Log-linear model of payment (cost) with 95% confidence intervals",
    caption = paste0("Model R² = ", round(summary(model)$r.squared, 2), 
                    ", Adj R² = ", round(summary(model)$adj.r.squared, 2),
                    ", n = ", nrow(model$model))
  ) +
  # Customize theme for better appearance
  theme_minimal() +
  theme(
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
    plot.subtitle = element_text(size = 12, hjust = 0.5, face = "italic"),
    axis.title.x = element_text(size = 12, face = "bold"),
    axis.text.y = element_text(size = 11),
    panel.grid.minor = element_blank(),
    panel.grid.major.y = element_blank(),
    plot.margin = margin(20, 100, 20, 20)
  )

# Display the plot
print(forest_plot)

# Save the plot
ggsave("Data/CADR_2023/final_push/Figure2_dynamic_coefficient_forest_plot.png", forest_plot, width = 12, height = 10, dpi = 300)

#' Generate Detailed Regression Analysis Description for SUI Treatment Costs
#' 
#' @description
#' Performs log-linear regression analysis on SUI treatment costs and generates both
#' a summary description and detailed cost breakdown for all comorbidities and 
#' complications. Handles both CSV and RDS input files. Uses absolute values for 
#' comorbidity cost changes as negative cost impacts aren't clinically sensible.
#' 
#' @param data_path Path to the regression data file (CSV or RDS)
#' @param verbose Logical indicating whether to print detailed logging messages
#' 
#' @return A list containing:
#'   \item{description}{The main summary description}
#'   \item{cost_breakdown}{Detailed breakdown of costs by condition}
#' 
#' @importFrom dplyr mutate select filter arrange desc group_by summarise ungroup
#' @importFrom tidyr drop_na
#' @importFrom stats lm coef summary.lm
#' @importFrom assertthat assert_that
#' @importFrom logger log_info
#' @importFrom broom tidy
#' @importFrom lmtest bptest
#' @importFrom stringr str_detect str_remove_all
#' 
#' @examples
#' \dontrun{
#' # Using CSV file
#' results <- generate_regression_description_detailed(
#'   data_path = "Linear regression pretty.csv",
#'   verbose = TRUE
#' )
#' 
#' # Print main description
#' cat(results$description)
#' 
#' # View detailed cost breakdown
#' print(results$cost_breakdown)
#' 
#' # Using RDS file
#' results <- generate_regression_description_detailed(
#'   data_path = "regression_data.rds",
#'   verbose = TRUE
#' )
#' }
generate_regression_description_detailed <- function(data_path, verbose = FALSE) {
  # Validate inputs
  assertthat::assert_that(is.character(data_path))
  assertthat::assert_that(is.logical(verbose))
  assertthat::assert_that(file.exists(data_path))
  
  if (verbose) {
    logger::log_info("Starting detailed regression analysis")
    logger::log_info("Reading data from: {data_path}")
  }
  
  # Read and prepare data
  regression_data <- read_data_file(data_path, verbose)
  model_data <- prepare_regression_data(regression_data, verbose)
  
  # Find least expensive treatment (Pessary)
  if (verbose) logger::log_info("Using Pessary as reference (least expensive treatment)")
  
  # Fit log-linear model
  log_model <- fit_log_model(model_data, verbose)
  
  # Check model assumptions
  check_model_assumptions(log_model, verbose)
  
  # Generate description and cost breakdown
  description <- generate_main_description(log_model, model_data, verbose)
  cost_breakdown <- generate_cost_breakdown(log_model, model_data, verbose)
  
  return(list(
    description = description,
    cost_breakdown = cost_breakdown
  ))
}

#' @noRd
read_data_file <- function(file_path, verbose) {
  file_ext <- tools::file_ext(file_path)
  assertthat::assert_that(
    file_ext %in% c("csv", "rds"),
    msg = "File must be either CSV or RDS format"
  )
  
  if (verbose) logger::log_info("Reading {file_ext} file")
  
  data <- switch(file_ext,
                 "csv" = utils::read.csv(file_path, stringsAsFactors = FALSE),
                 "rds" = readRDS(file_path))
  
  return(data)
}

#' @noRd
prepare_regression_data <- function(data, verbose) {
  if (verbose) logger::log_info("Preparing data for regression analysis")
  
  # Convert categorical variables to factors and set reference levels
  # Clinical variables
  binary_cols <- grep("BeforeSUI|DuringSUI$", names(data), value = TRUE)
  data[binary_cols] <- lapply(data[binary_cols], function(x) {
    factor(x, levels = c("No", "Yes"))  # Explicitly set "No" as reference
  })
  
  # Demographic and administrative variables
  data$StressUrinaryIncontinenceTreatment <- factor(
    data$StressUrinaryIncontinenceTreatment,
    levels = c("Pessary", "Pelvic floor physical therapy", "Sling")
  )
  
  # Set TreatingPhysician levels with Unknown as reference
  data$TreatingPhysician <- factor(data$TreatingPhysician, 
                                   levels = c("NP", "PCP", "OBGYN", "Urology"))
  
  data$Race <- factor(data$Race, levels = c("White", "Other"))
  
  data$Insurance <- factor(data$Insurance, 
                           levels = c("Medicare only", "Medicare and Medicaid"))
  
  data$USCensusBureauDivision <- factor(data$USCensusBureauDivision)
  
  # Convert Year to factor (to account for possible non-linear trends)
  data$Year <- factor(data$Year)
  
  # Convert Age to factor (assuming it's categorical)
  data$Age <- factor(data$Age)
  
  # Log transform the outcome
  min_value <- min(data$SumOfPaymentForTreatment)
  if (min_value <= 0) {
    shift_value <- abs(min_value) + 1
    data$log_payment <- log(data$SumOfPaymentForTreatment + shift_value)
  } else {
    data$log_payment <- log(data$SumOfPaymentForTreatment)
  }
  
  if (verbose) {
    logger::log_info("Data preparation complete")
    logger::log_info("Number of observations: {nrow(data)}")
    logger::log_info("Binary variables releveled with 'No' as reference")
  }
  
  return(data)
}

#' @noRd
fit_log_model <- function(data, verbose) {
  if (verbose) logger::log_info("Fitting log-linear model")
  
  # Create comprehensive formula including all variables
  formula_str <- paste("log_payment ~",
                       "StressUrinaryIncontinenceTreatment +",
                       "TreatingPhysician +",  # Added TreatingPhysician
                       "Year +",
                       "Age +",
                       "Race +",
                       "Insurance +",
                       "USCensusBureauDivision +",
                       paste(grep("BeforeSUI|DuringSUI$", names(data), value = TRUE), 
                             collapse = " + "))
  
  # Fit model
  model <- stats::lm(stats::as.formula(formula_str), data = data)
  
  if (verbose) {
    logger::log_info("Model fitting complete")
    logger::log_info("Full model includes physician type, demographic, geographic, and clinical variables")
  }
  
  return(model)
}

#' @noRd
check_model_assumptions <- function(model, verbose) {
  if (verbose) logger::log_info("Checking model assumptions")
  
  # Breusch-Pagan test for homoscedasticity
  bp_test <- lmtest::bptest(model)
  
  if (verbose) {
    logger::log_info("Breusch-Pagan test p-value: {bp_test$p.value}")
    if (bp_test$p.value < 0.05) {
      logger::log_info("Warning: Heteroscedasticity detected")
    }
  }
  
  return(bp_test)
}

#' @noRd
generate_main_description <- function(model, data, verbose) {
  if (verbose) logger::log_info("Generating main description with demographic effects")
  
  # Extract model results
  model_summary <- summary(model)
  coefs <- model_summary$coefficients
  
  # Calculate treatment effect
  sling_term <- grep("Sling", rownames(coefs), value = TRUE)
  sling_effect <- coefs[sling_term, "Estimate"]
  sling_p <- coefs[sling_term, "Pr(>|t|)"]
  
  # Convert log coefficient to dollar amount
  mean_cost <- mean(data$SumOfPaymentForTreatment)
  sling_cost_diff <- round((exp(sling_effect) - 1) * mean_cost)
  
  # Find most significant comorbidities and complications
  comorbidity_effects <- get_significant_effects(coefs, "BeforeSUI")
  complication_effects <- get_significant_effects(coefs, "DuringSUI")
  
  # Find significant physician demographics
  physician_effects <- get_demographic_effects(coefs, c("TreatingPhysician", "Year", "USCensusBureauDivision"))
  
  # Find significant patient demographics
  patient_effects <- get_demographic_effects(coefs, c("Age", "Race", "Insurance"))
  
  # Format with thousands comma
  formatted_cost <- format(sling_cost_diff, big.mark = ",", scientific = FALSE)
  
  # Generate description
  description <- sprintf(
    "According to our regression analysis, sling treatments were associated with a predicted total cost that was $%s higher per patient compared to Pessary (p %s). %s %s This predicted cost difference includes factors such as facility fees, health care provider specialty, and patient comorbidities. Comorbidities and postoperative complications significantly increased costs, with %s (%.1f%%, p %s) and %s (%.1f%%, p %s) and %s (%.1f%%, p %s), respectively, contributing the most.",
    formatted_cost,
    format_p_value(sling_p),
    physician_effects$sentence,
    patient_effects$sentence,
    comorbidity_effects$names[1],
    comorbidity_effects$effects[1] * 100,
    format_p_value(comorbidity_effects$p_values[1]),
    comorbidity_effects$names[2],
    comorbidity_effects$effects[2] * 100,
    format_p_value(comorbidity_effects$p_values[2]),
    complication_effects$names[1],
    complication_effects$effects[1] * 100,
    format_p_value(complication_effects$p_values[1])
  )
  
  return(description)
}

#' @noRd
get_demographic_effects <- function(coefs, patterns) {
  # Extract relevant coefficients
  relevant_rows <- grep(paste(patterns, collapse="|"), rownames(coefs), value = TRUE)
  
  effects <- coefs[relevant_rows, "Estimate"]
  p_values <- coefs[relevant_rows, "Pr(>|t|)"]
  
  # Filter for significant effects
  sig_idx <- p_values < 0.05
  
  if (sum(sig_idx) == 0) {
    return(list(
      sentence = "",
      effects = NULL,
      p_values = NULL
    ))
  }
  
  # Get significant effects
  sig_effects <- effects[sig_idx]
  sig_p_values <- p_values[sig_idx]
  sig_names <- gsub("^[^.]*\\.", "", names(sig_effects))
  
  # Order by magnitude
  ord <- order(abs(sig_effects), decreasing = TRUE)
  top_effects <- sig_effects[ord][1:min(2, length(ord))]
  top_p_values <- sig_p_values[ord][1:min(2, length(ord))]
  top_names <- sig_names[ord][1:min(2, length(ord))]
  
  # Create sentence
  if (any(grepl("TreatingPhysician|Year|USCensusBureauDivision", names(top_effects)))) {
    # Physician demographics
    sentence <- sprintf(
      "Among physician factors, %s (%.1f%%, p %s) and %s (%.1f%%, p %s) significantly influenced costs.",
      top_names[1],
      (exp(top_effects[1]) - 1) * 100,
      format_p_value(top_p_values[1]),
      top_names[2],
      (exp(top_effects[2]) - 1) * 100,
      format_p_value(top_p_values[2])
    )
  } else {
    # Patient demographics
    sentence <- sprintf(
      "Patient characteristics including %s (%.1f%%, p %s) and %s (%.1f%%, p %s) were also associated with cost differences.",
      top_names[1],
      (exp(top_effects[1]) - 1) * 100,
      format_p_value(top_p_values[1]),
      top_names[2],
      (exp(top_effects[2]) - 1) * 100,
      format_p_value(top_p_values[2])
    )
  }
  
  return(list(
    sentence = sentence,
    effects = top_effects,
    p_values = top_p_values
  ))
}

#' @noRd
generate_cost_breakdown <- function(model, data, verbose) {
  if (verbose) logger::log_info("Generating detailed cost breakdown")
  
  # Get model coefficients
  coefs <- summary(model)$coefficients
  
  # Calculate mean baseline cost
  mean_cost <- mean(data$SumOfPaymentForTreatment)
  
  # Process all conditions
  condition_effects <- data.frame(
    condition = rownames(coefs),
    estimate = coefs[, "Estimate"],
    p_value = coefs[, "Pr(>|t|)"],
    stringsAsFactors = FALSE
  ) %>%
    dplyr::mutate(
      type = case_when(
        stringr::str_detect(condition, "BeforeSUI") ~ "Comorbidity",
        stringr::str_detect(condition, "DuringSUI") ~ "Complication",
        stringr::str_detect(condition, "TreatingPhysician") ~ "Physician Demographics",
        stringr::str_detect(condition, "Year|USCensusBureauDivision") ~ "Physician Demographics",
        stringr::str_detect(condition, "Age|Race|Insurance") ~ "Patient Demographics",
        stringr::str_detect(condition, "StressUrinaryIncontinenceTreatment") ~ "Treatment",
        TRUE ~ "Other"
      ),
      condition = stringr::str_remove_all(condition, "BeforeSUI|DuringSUI|Treatment|TreatingPhysician"),
      percent_change = case_when(
        type == "Comorbidity" ~ abs((exp(estimate) - 1) * 100),  # Take absolute value for comorbidities
        TRUE ~ (exp(estimate) - 1) * 100
      ),
      dollar_change = case_when(
        type == "Comorbidity" ~ round(abs((exp(estimate) - 1) * mean_cost)),  # Take absolute value for comorbidities
        TRUE ~ round((exp(estimate) - 1) * mean_cost)
      ),
      p_value_formatted = sapply(p_value, format_p_value)
    ) %>%
    # Sort within each type by magnitude of effect
    dplyr::group_by(type) %>%
    dplyr::arrange(type, desc(abs(dollar_change))) %>%
    dplyr::ungroup() %>%
    # Reorder factor levels for type to control display order
    dplyr::mutate(
      type = factor(type, levels = c(
        "Treatment",
        "Physician Demographics",
        "Patient Demographics",
        "Comorbidity",
        "Complication"
      ))
    ) %>%
    dplyr::arrange(type, desc(abs(dollar_change)))
  
  if (verbose) {
    logger::log_info("Cost breakdown generated for {nrow(condition_effects)} variables")
    logger::log_info("Variables categorized into: Treatment, Physician Demographics, Patient Demographics, Comorbidities, and Complications")
  }
  
  return(condition_effects)
}

#' @noRd
get_significant_effects <- function(coefs, pattern) {
  # Extract relevant coefficients
  relevant_rows <- grep(pattern, rownames(coefs), value = TRUE)
  
  effects <- coefs[relevant_rows, "Estimate"]
  p_values <- coefs[relevant_rows, "Pr(>|t|)"]
  
  # Calculate percentage changes
  pct_changes <- exp(effects) - 1
  
  # Create named vector
  names(pct_changes) <- gsub(pattern, "", relevant_rows)
  
  # For comorbidities, take absolute values
  if (pattern == "BeforeSUI") {
    pct_changes <- abs(pct_changes)
  }
  
  # Sort by magnitude and significance
  sorted_idx <- order(abs(pct_changes) * (p_values < 0.05), decreasing = TRUE)
  
  list(
    names = names(pct_changes)[sorted_idx],
    effects = pct_changes[sorted_idx],
    p_values = p_values[sorted_idx]
  )
}

#' @noRd
format_p_value <- function(p_value) {
  if (p_value < 0.001) {
    return("< 0.001")
  } else if (p_value < 0.01) {
    return("< 0.01")
  } else if (p_value < 0.05) {
    return("< 0.05")
  } else {
    return(sprintf("= %.2f", p_value))  # Rounding to 2 decimal places
  }
}

16.7.1 According to our regression analysis, sling treatments were associated with a predicted total cost

results <- generate_regression_description_detailed(
  data = "Data/CADR_2023/final_push/Linear regression pretty.rds",
  #model = model,
  verbose = TRUE
)

# View main description
cat(results$description)

According to our regression analysis, sling treatments were associated with a predicted total cost that was $2,059 higher per patient compared to Pessary (p < 0.001). Among physician factors, TreatingPhysicianUrology (43.0%, p < 0.001) and TreatingPhysicianOBGYN (21.2%, p < 0.001) significantly influenced costs. Patient characteristics including InsuranceMedicare and Medicaid (-9.9%, p < 0.001) and Age86 or greater (-7.9%, p < 0.001) were also associated with cost differences. This predicted cost difference includes factors such as facility fees, health care provider specialty, and patient comorbidities. Comorbidities and postoperative complications significantly increased costs, with DiabetesYes (4.8%, p < 0.001) and NeurologicalDisorderYes (2.8%, p = 0.12) and UrinaryTractInfectionYes (75.7%, p < 0.001), respectively, contributing the most.

# View detailed cost breakdown
print(results$cost_breakdown, n = 100)

17 A tibble: 34 × 7

condition estimate p_value type percent_change dollar_change 1 StressUrinaryIncontine… 1.22 0 Trea… 239. 2059 2 Urology 0.358 4.82e- 37 Phys… 43.0 370 3 OBGYN 0.192 2.11e- 12 Phys… 21.2 182 4 Year2013 0.145 3.44e- 9 Phys… 15.6 134 5 Year2016 0.116 2.86e- 5 Phys… 12.3 106 6 Year2015 0.108 2.00e- 5 Phys… 11.5 99 7 Year2010 0.105 2.50e- 6 Phys… 11.1 95 8 USCensusBureauDivision… -0.116 3.78e- 4 Phys… -10.9 -94 9 USCensusBureauDivision… -0.115 6.81e- 4 Phys… -10.8 -93 10 Year2011 0.102 7.14e- 6 Phys… 10.7 92 11 Year2012 0.0999 3.67e- 5 Phys… 10.5 90 12 Year2014 0.0761 2.92e- 3 Phys… 7.91 68 13 USCensusBureauDivision… -0.0812 1.04e- 2 Phys… -7.80 -67 14 USCensusBureauDivision… 0.0627 2.54e- 2 Phys… 6.47 56 15 PCP -0.0549 5.53e- 2 Phys… -5.34 -46 16 Year2009 0.0514 1.75e- 2 Phys… 5.27 45 17 USCensusBureauDivision… -0.0531 6.85e- 2 Phys… -5.18 -45 18 USCensusBureauDivision… -0.0273 3.11e- 1 Phys… -2.69 -23 19 USCensusBureauDivision… 0.0199 4.51e- 1 Phys… 2.01 17 20 USCensusBureauDivision… 0.00342 9.08e- 1 Phys… 0.342 3 21 InsuranceMedicare and … -0.104 4.59e- 8 Pati… -9.92 -85 22 Age86 or greater -0.0823 9.67e- 5 Pati… -7.90 -68 23 Age71 to 75.9 0.0465 9.47e- 3 Pati… 4.76 41 24 Age81 to 85.9 -0.0168 3.94e- 1 Pati… -1.67 -14 25 Age76 to 80.9 0.00723 6.92e- 1 Pati… 0.726 6 26 RaceOther -0.00396 8.73e- 1 Pati… -0.395 -3 27 DiabetesYes 0.0468 6.83e- 4 Como… 4.79 41 28 ObesityYes 0.0341 1.21e- 1 Como… 3.47 30 29 TobaccoUseYes -0.0344 9.44e- 2 Como… 3.38 29 30 NeurologicalDisorderYes -0.0282 1.18e- 1 Como… 2.78 24 31 ChronicLungDisorderYes 0.0254 8.01e- 2 Como… 2.57 22 32 DepressionYes 0.00906 5.85e- 1 Como… 0.910 8 33 UrinaryTractInfectionY… 0.564 1.02e-122 Comp… 75.7 652 34 (Intercept) 5.85 0 34581. 297624 # ℹ 1 more variable: p_value_formatted

# # According to our regression analysis, sling treatments were associated with a predicted total cost that was $2,052 higher per patient compared to Pessary (p < 0.001). Among physician factors, TreatingPhysicianUrology (43.0%, p < 0.001) and TreatingPhysicianOBGYN (21.2%, p < 0.001) significantly influenced costs. Patient characteristics including InsuranceMedicare and Medicaid (-9.9%, p < 0.001) and Age86 or greater (-7.9%, p < 0.001) were also associated with cost differences. This predicted cost difference includes factors such as facility fees, health care provider specialty, and patient comorbidities. Comorbidities and postoperative complications significantly increased costs, with DiabetesYes (4.8%, p < 0.001) and NeurologicalDisorderYes (2.8%, p = 0.12) and UrinaryTractInfectionYes (75.8%, p < 0.001), respectively, contributing the most.
# 
# # > print(results$cost_breakdown)
# # # A tibble: 40 × 7
# #    condition                                estimate  p_value type          percent_change dollar_change p_value_formatted
# #    <chr>                                       <dbl>    <dbl> <fct>                  <dbl>         <dbl> <chr>            
# #  1 StressUrinaryIncontinenceSling              1.22  0        Treatment              238.           2052 < 0.001          
# #  2 Urology                                     0.358 4.77e-37 Physician De…           43.0           370 < 0.001          
# #  3 OBGYN                                       0.192 2.23e-12 Physician De…           21.2           182 < 0.001          
# #  4 Year2013                                    0.145 3.42e- 9 Physician De…           15.6           134 < 0.001          
# #  5 Year2016                                    0.117 2.42e- 5 Physician De…           12.4           107 < 0.001          
# #  6 Year2015                                    0.109 1.83e- 5 Physician De…           11.5            99 < 0.001          
# #  7 Year2010                                    0.105 2.52e- 6 Physician De…           11.1            95 < 0.001          
# #  8 USCensusBureauDivisionWest North Central   -0.116 3.93e- 4 Physician De…          -10.9           -94 < 0.001          
# #  9 Year2011                                    0.102 6.53e- 6 Physician De…           10.8            93 < 0.001          
# # 10 USCensusBureauDivisionMountain             -0.115 6.95e- 4 Physician De…          -10.8           -93 < 0.001          
# # # ℹ 30 more rows
# # # ℹ Use `print(n = ...)` to see more rows

18 Cost Point Estimates

# Set up logging
log_threshold(INFO)
log_info("Starting cost analysis script")

# Define file path and read data
costs_file <- "Data/CADR_2023/final_push/_Costs_step33.rds"
log_info("Reading cost data from: {costs_file}")
df_costs <- readRDS(costs_file)
log_info("Successfully loaded cost data with {nrow(df_costs)} rows and {ncol(df_costs)} columns")

# First, make sure there are no pre-existing groupings
df_costs <- ungroup(df_costs)
log_info("Removed any pre-existing groupings from the data")

# Define a list mapping cost categories to the relevant summed columns
log_info("Setting up cost category mappings")
provenance_mapping <- list(
  "Pre-intervention lab" = c("OP_facility_UA_cost", "professional_UA_cost", 
                           "OP_facility_urineCX_cost", "professional_urineCX_cost", 
                           "OP_facility_microscopy_cost", "professional_microscopy_cost"),
  "Post-intervention lab" = c("OP_facility_UA_cost", "professional_UA_cost", 
                            "OP_facility_urineCX_cost", "professional_urineCX_cost", 
                            "OP_facility_microscopy_cost", "professional_microscopy_cost"),
  "UDS" = c("OP_facility_uroflowmetry_cost", "professional_uroflowmetry_cost", 
          "OP_facility_cystometrics_cost", "professional_cystometrics_cost", 
          "OP_facility_UDS_cost", "professional_UDS_cost"),
  "Complications costs" = c("OP_facility_complication_cost", "edOP_facility_complication_cost", 
                          "professional_complication_cost", "edprofessional_complication_cost"),
  "Pre-intervention EM" = c("OP_facility_EM_cost", "professional_EM_cost"),
  "Post-intervention EM" = c("OP_facility_EM_cost", "professional_EM_cost"),
  "Final cost" = "Total_cost"  # Just use the Total_cost column
)


# Compute summary statistics using the modified function
log_info("Computing all summary statistics excluding zeros and NAs")
summary_stats_long <- compute_summary_stats_no_zeros(df_costs, provenance_mapping)
log_info("Summary statistics completed")

# Save results to CSV
output_file <- "cost_analysis_results_no_zeros.csv"
log_info("Writing results to {output_file}")
write_csv(summary_stats_long, output_file)

# Display results
log_info("Analysis complete - displaying results")
print(summary_stats_long, n = 300)

19 A tibble: 192 × 5

sui_treatment statistic  value category              provenance             
<chr>         <chr>      <dbl> <chr>                 <chr>

1 PT mean 11 Pre-intervention lab Columns Used: OP_facil… 2 PT median 5 Pre-intervention lab Columns Used: OP_facil… 3 PT sd 10 Pre-intervention lab Columns Used: OP_facil… 4 PT min 2 Pre-intervention lab Columns Used: OP_facil… 5 PT max 158 Pre-intervention lab Columns Used: OP_facil… 6 PT q25 4 Pre-intervention lab Columns Used: OP_facil… 7 PT q75 15 Pre-intervention lab Columns Used: OP_facil… 8 PT n 17395 Pre-intervention lab Columns Used: OP_facil… 9 Pessary mean 12 Pre-intervention lab Columns Used: OP_facil… 10 Pessary median 11 Pre-intervention lab Columns Used: OP_facil… 11 Pessary sd 10 Pre-intervention lab Columns Used: OP_facil… 12 Pessary min 2 Pre-intervention lab Columns Used: OP_facil… 13 Pessary max 185 Pre-intervention lab Columns Used: OP_facil… 14 Pessary q25 4 Pre-intervention lab Columns Used: OP_facil… 15 Pessary q75 16 Pre-intervention lab Columns Used: OP_facil… 16 Pessary n 24453 Pre-intervention lab Columns Used: OP_facil… 17 Sling mean 10 Pre-intervention lab Columns Used: OP_facil… 18 Sling median 4 Pre-intervention lab Columns Used: OP_facil… 19 Sling sd 13 Pre-intervention lab Columns Used: OP_facil… 20 Sling min 2 Pre-intervention lab Columns Used: OP_facil… 21 Sling max 683 Pre-intervention lab Columns Used: OP_facil… 22 Sling q25 4 Pre-intervention lab Columns Used: OP_facil… 23 Sling q75 15 Pre-intervention lab Columns Used: OP_facil… 24 Sling n 12408 Pre-intervention lab Columns Used: OP_facil… 25 Overall mean 11 Pre-intervention lab Columns Used: OP_facil… 26 Overall median 5 Pre-intervention lab Columns Used: OP_facil… 27 Overall sd 11 Pre-intervention lab Columns Used: OP_facil… 28 Overall min 2 Pre-intervention lab Columns Used: OP_facil… 29 Overall max 683 Pre-intervention lab Columns Used: OP_facil… 30 Overall q25 4 Pre-intervention lab Columns Used: OP_facil… 31 Overall q75 15 Pre-intervention lab Columns Used: OP_facil… 32 Overall n 54256 Pre-intervention lab Columns Used: OP_facil… 33 PT mean 11 Post-intervention lab Columns Used: OP_facil… 34 PT median 5 Post-intervention lab Columns Used: OP_facil… 35 PT sd 10 Post-intervention lab Columns Used: OP_facil… 36 PT min 2 Post-intervention lab Columns Used: OP_facil… 37 PT max 158 Post-intervention lab Columns Used: OP_facil… 38 PT q25 4 Post-intervention lab Columns Used: OP_facil… 39 PT q75 15 Post-intervention lab Columns Used: OP_facil… 40 PT n 17395 Post-intervention lab Columns Used: OP_facil… 41 Pessary mean 12 Post-intervention lab Columns Used: OP_facil… 42 Pessary median 11 Post-intervention lab Columns Used: OP_facil… 43 Pessary sd 10 Post-intervention lab Columns Used: OP_facil… 44 Pessary min 2 Post-intervention lab Columns Used: OP_facil… 45 Pessary max 185 Post-intervention lab Columns Used: OP_facil… 46 Pessary q25 4 Post-intervention lab Columns Used: OP_facil… 47 Pessary q75 16 Post-intervention lab Columns Used: OP_facil… 48 Pessary n 24453 Post-intervention lab Columns Used: OP_facil… 49 Sling mean 10 Post-intervention lab Columns Used: OP_facil… 50 Sling median 4 Post-intervention lab Columns Used: OP_facil… 51 Sling sd 13 Post-intervention lab Columns Used: OP_facil… 52 Sling min 2 Post-intervention lab Columns Used: OP_facil… 53 Sling max 683 Post-intervention lab Columns Used: OP_facil… 54 Sling q25 4 Post-intervention lab Columns Used: OP_facil… 55 Sling q75 15 Post-intervention lab Columns Used: OP_facil… 56 Sling n 12408 Post-intervention lab Columns Used: OP_facil… 57 Overall mean 11 Post-intervention lab Columns Used: OP_facil… 58 Overall median 5 Post-intervention lab Columns Used: OP_facil… 59 Overall sd 11 Post-intervention lab Columns Used: OP_facil… 60 Overall min 2 Post-intervention lab Columns Used: OP_facil… 61 Overall max 683 Post-intervention lab Columns Used: OP_facil… 62 Overall q25 4 Post-intervention lab Columns Used: OP_facil… 63 Overall q75 15 Post-intervention lab Columns Used: OP_facil… 64 Overall n 54256 Post-intervention lab Columns Used: OP_facil… 65 PT mean 289 UDS Columns Used: OP_facil… 66 PT median 209 UDS Columns Used: OP_facil… 67 PT sd 187 UDS Columns Used: OP_facil… 68 PT min 4 UDS Columns Used: OP_facil… 69 PT max 1718 UDS Columns Used: OP_facil… 70 PT q25 186 UDS Columns Used: OP_facil… 71 PT q75 440 UDS Columns Used: OP_facil… 72 PT n 5115 UDS Columns Used: OP_facil… 73 Pessary mean 384 UDS Columns Used: OP_facil… 74 Pessary median 430 UDS Columns Used: OP_facil… 75 Pessary sd 234 UDS Columns Used: OP_facil… 76 Pessary min 4 UDS Columns Used: OP_facil… 77 Pessary max 2121 UDS Columns Used: OP_facil… 78 Pessary q25 188 UDS Columns Used: OP_facil… 79 Pessary q75 565 UDS Columns Used: OP_facil… 80 Pessary n 2878 UDS Columns Used: OP_facil… 81 Sling mean 422 UDS Columns Used: OP_facil… 82 Sling median 464 UDS Columns Used: OP_facil… 83 Sling sd 188 UDS Columns Used: OP_facil… 84 Sling min 4 UDS Columns Used: OP_facil… 85 Sling max 1096 UDS Columns Used: OP_facil… 86 Sling q25 304 UDS Columns Used: OP_facil… 87 Sling q75 564 UDS Columns Used: OP_facil… 88 Sling n 2456 UDS Columns Used: OP_facil… 89 Overall mean 347 UDS Columns Used: OP_facil… 90 Overall median 298 UDS Columns Used: OP_facil… 91 Overall sd 209 UDS Columns Used: OP_facil… 92 Overall min 4 UDS Columns Used: OP_facil… 93 Overall max 2121 UDS Columns Used: OP_facil… 94 Overall q25 190 UDS Columns Used: OP_facil… 95 Overall q75 537 UDS Columns Used: OP_facil… 96 Overall n 10449 UDS Columns Used: OP_facil… 97 PT mean 189 Complications costs Columns Used: OP_facil… 98 PT median 92 Complications costs Columns Used: OP_facil… 99 PT sd 274 Complications costs Columns Used: OP_facil… 100 PT min 4 Complications costs Columns Used: OP_facil… 101 PT max 1734 Complications costs Columns Used: OP_facil… 102 PT q25 44 Complications costs Columns Used: OP_facil… 103 PT q75 201 Complications costs Columns Used: OP_facil… 104 PT n 140 Complications costs Columns Used: OP_facil… 105 Pessary mean 163 Complications costs Columns Used: OP_facil… 106 Pessary median 144 Complications costs Columns Used: OP_facil… 107 Pessary sd 152 Complications costs Columns Used: OP_facil… 108 Pessary min 3 Complications costs Columns Used: OP_facil… 109 Pessary max 1567 Complications costs Columns Used: OP_facil… 110 Pessary q25 37 Complications costs Columns Used: OP_facil… 111 Pessary q75 231 Complications costs Columns Used: OP_facil… 112 Pessary n 780 Complications costs Columns Used: OP_facil… 113 Sling mean 384 Complications costs Columns Used: OP_facil… 114 Sling median 221 Complications costs Columns Used: OP_facil… 115 Sling sd 394 Complications costs Columns Used: OP_facil… 116 Sling min 9 Complications costs Columns Used: OP_facil… 117 Sling max 1986 Complications costs Columns Used: OP_facil… 118 Sling q25 94 Complications costs Columns Used: OP_facil… 119 Sling q75 682 Complications costs Columns Used: OP_facil… 120 Sling n 109 Complications costs Columns Used: OP_facil… 121 Overall mean 190 Complications costs Columns Used: OP_facil… 122 Overall median 144 Complications costs Columns Used: OP_facil… 123 Overall sd 220 Complications costs Columns Used: OP_facil… 124 Overall min 3 Complications costs Columns Used: OP_facil… 125 Overall max 1986 Complications costs Columns Used: OP_facil… 126 Overall q25 39 Complications costs Columns Used: OP_facil… 127 Overall q75 237 Complications costs Columns Used: OP_facil… 128 Overall n 1029 Complications costs Columns Used: OP_facil… 129 Pessary mean 100 Pre-intervention EM Columns Used: OP_facil… 130 Pessary median 87 Pre-intervention EM Columns Used: OP_facil… 131 Pessary sd 49 Pre-intervention EM Columns Used: OP_facil… 132 Pessary min 10 Pre-intervention EM Columns Used: OP_facil… 133 Pessary max 525 Pre-intervention EM Columns Used: OP_facil… 134 Pessary q25 67 Pre-intervention EM Columns Used: OP_facil… 135 Pessary q75 119 Pre-intervention EM Columns Used: OP_facil… 136 Pessary n 4910 Pre-intervention EM Columns Used: OP_facil… 137 Overall mean 100 Pre-intervention EM Columns Used: OP_facil… 138 Overall median 87 Pre-intervention EM Columns Used: OP_facil… 139 Overall sd 49 Pre-intervention EM Columns Used: OP_facil… 140 Overall min 10 Pre-intervention EM Columns Used: OP_facil… 141 Overall max 525 Pre-intervention EM Columns Used: OP_facil… 142 Overall q25 67 Pre-intervention EM Columns Used: OP_facil… 143 Overall q75 119 Pre-intervention EM Columns Used: OP_facil… 144 Overall n 4910 Pre-intervention EM Columns Used: OP_facil… 145 Pessary mean 100 Post-intervention EM Columns Used: OP_facil… 146 Pessary median 87 Post-intervention EM Columns Used: OP_facil… 147 Pessary sd 49 Post-intervention EM Columns Used: OP_facil… 148 Pessary min 10 Post-intervention EM Columns Used: OP_facil… 149 Pessary max 525 Post-intervention EM Columns Used: OP_facil… 150 Pessary q25 67 Post-intervention EM Columns Used: OP_facil… 151 Pessary q75 119 Post-intervention EM Columns Used: OP_facil… 152 Pessary n 4910 Post-intervention EM Columns Used: OP_facil… 153 Overall mean 100 Post-intervention EM Columns Used: OP_facil… 154 Overall median 87 Post-intervention EM Columns Used: OP_facil… 155 Overall sd 49 Post-intervention EM Columns Used: OP_facil… 156 Overall min 10 Post-intervention EM Columns Used: OP_facil… 157 Overall max 525 Post-intervention EM Columns Used: OP_facil… 158 Overall q25 67 Post-intervention EM Columns Used: OP_facil… 159 Overall q75 119 Post-intervention EM Columns Used: OP_facil… 160 Overall n 4910 Post-intervention EM Columns Used: OP_facil… 161 PT mean 899 Final cost Columns Used: Total_co… 162 PT median 758 Final cost Columns Used: Total_co… 163 PT sd 671 Final cost Columns Used: Total_co… 164 PT min 40 Final cost Columns Used: Total_co… 165 PT max 2858 Final cost Columns Used: Total_co… 166 PT q25 327 Final cost Columns Used: Total_co… 167 PT q75 1340 Final cost Columns Used: Total_co… 168 PT n 69462 Final cost Columns Used: Total_co… 169 Pessary mean 820 Final cost Columns Used: Total_co… 170 Pessary median 691 Final cost Columns Used: Total_co… 171 Pessary sd 525 Final cost Columns Used: Total_co… 172 Pessary min 40 Final cost Columns Used: Total_co… 173 Pessary max 2861 Final cost Columns Used: Total_co… 174 Pessary q25 395 Final cost Columns Used: Total_co… 175 Pessary q75 1143 Final cost Columns Used: Total_co… 176 Pessary n 57245 Final cost Columns Used: Total_co… 177 Sling mean 1855 Final cost Columns Used: Total_co… 178 Sling median 1850 Final cost Columns Used: Total_co… 179 Sling sd 505 Final cost Columns Used: Total_co… 180 Sling min 83 Final cost Columns Used: Total_co… 181 Sling max 2859 Final cost Columns Used: Total_co… 182 Sling q25 1490 Final cost Columns Used: Total_co… 183 Sling q75 2223 Final cost Columns Used: Total_co… 184 Sling n 33616 Final cost Columns Used: Total_co… 185 Overall mean 1072 Final cost Columns Used: Total_co… 186 Overall median 979 Final cost Columns Used: Total_co… 187 Overall sd 715 Final cost Columns Used: Total_co… 188 Overall min 40 Final cost Columns Used: Total_co… 189 Overall max 2861 Final cost Columns Used: Total_co… 190 Overall q25 444 Final cost Columns Used: Total_co… 191 Overall q75 1594 Final cost Columns Used: Total_co… 192 Overall n 160323 Final cost Columns Used: Total_co…

20 Abstract generate_cohort_description

generate_cohort_description generate_cohort_description: Among xxx participants, treatments included pessary (xxx%), PFMT (xxx%), and sling surgery (xxx%). Most participants were White (xxx%), with a median age of xxx and diagnosed with xxxx comorbidities.

cohort_summary <- generate_cohort_description(
   cohort_path = "Data/CADR_2023/final_push/_Cohort_descriptors.rds",
   uncategorized_path = "Data/CADR_2023/final_push/Uncategorized_age_Table1.rds" )

cat(cohort_summary)

Among 16695 participants, treatments included pessary (46.2%), PFMT (33.3%), and sling surgery (20.5%). Most participants were White (92.9%), with a median age of 76 and diagnosed with 3 comorbidities. Among 16695 participants, treatments included pessary (46.2%), PFMT (33.3%), and sling surgery (20.5%). Most participants were White (92.9%), with a median age of 76 and diagnosed with 3 comorbidities.

20.1 Abstract Most Medicare beneficiaries undergoing therapy for SUI were White (xxx%) and had a median of xx years old (interquartile range (IQR) xx, xxx).

# Start with Exploratory.io ~Table1/~Table1/, Step 33.  

# Load the uncategorized age data
data <- read_rds("Data/CADR_2023/final_push/Uncategorized_age_Table1.rds")

# Calculate the percentage of Medicare beneficiaries who are White
race_percentage <- data %>%
  filter(`Race/Ethnicity` == "White") %>%
  nrow() / nrow(data) * 100

# Calculate median and IQR for age
median_age <- median(data$`Age, years`, na.rm = TRUE)
age_iqr <- quantile(data$`Age, years`, probs = c(0.25, 0.75), na.rm = TRUE)

# Construct the sentence
sentence <- paste("Most Medicare beneficiaries undergoing therapy for SUI were White (",
                  format(round(race_percentage, 1), nsmall = 1), 
                  "%) and had a median of ", median_age, 
                  " years old (interquartile range (IQR) ", age_iqr[1], ", ", age_iqr[2], ").", sep = "")

# Print the sentence
print(sentence)

[1] “Most Medicare beneficiaries undergoing therapy for SUI were White (92.9%) and had a median of 76 years old (interquartile range (IQR) 70, 82).”

#Most Medicare beneficiaries undergoing therapy for SUI were White (92.9%) and had a median of 76 years old (interquartile range (IQR) 70, 82).

# Count the unique patient identifiers (WU_ID)
unique_patients_count <- df_costs %>% 
  distinct(WU_ID) %>% 
  nrow(); unique_patients_count

[1] 16695

# Print the count of unique patients
print(unique_patients_count)

[1] 16695

paste0("Based on the 5% data sample from Medicare beneficiaries from January 1, 2008 to December 31, 2016, a total of ", unique_patients_count, " female patients were included in the study.")

[1] “Based on the 5% data sample from Medicare beneficiaries from January 1, 2008 to December 31, 2016, a total of 16695 female patients were included in the study.”

#"Based on the 5% data sample from Medicare beneficiaries from January 1, 2008 to December 31, 2016, a total of 16695 female patients were included in the study."

# Print the final sentence
cat(sprintf(
  "%s was the least costly treatment option, with a median cost of $%s [IQR: $%s–$%s], while %s incurred the highest median cost of $%s [IQR: $%s–$%s].\n",
  least_costly, format_currency(costs[least_costly]),
  format_currency(least_costly_iqr["q1"]), format_currency(least_costly_iqr["q3"]),
  highest_costly, format_currency(costs[highest_costly]),
  format_currency(highest_costly_iqr["q1"]), format_currency(highest_costly_iqr["q3"])
))

Pessary was the least costly treatment option, with a median cost of $691 [IQR: $NA–$NA], while Sling incurred the highest median cost of $1,850 [IQR: $NA–$NA].

#Pessary was the least costly treatment option, with a median cost of $691 [IQR: $NA–$NA], while Sling incurred the highest median cost of $1,850 [IQR: $NA–$NA].

Pessary was the least costly treatment option, with a median cost of 691, while Sling incurred the highest median cost of $1,850.

20.2 Abstract

“Comorbidities and postoperative complications significantly increased costs, with complication1??? (????) and complication2???? (????), respectively, contributing the most (all p < ???).”

Examine significant positive predictors of cost.

# Create sentence
sentence <- sprintf(
  "Comorbidities and postoperative complications significantly increased costs, with %s and %s and %s, respectively, contributing the most (all %s).",
  sprintf("%s (%.1f%%)", comorbidities$name[1], comorbidities$percent_change[1]),
  sprintf("%s (%.1f%%)", comorbidities$name[2], comorbidities$percent_change[2]),
  sprintf("%s (%.1f%%)", complications$name[1], complications$percent_change[1]),
  p_value_statement
)

print(sentence)

### function shannon_cost_summary 

#' Create a formatted cost summary table and export to Word document
#'
#' This function processes cost data by treatment group, computes summary statistics
#' excluding zeros and NAs, and exports a formatted table to a Word document using
#' flextable.
#'
#' @param cost_data A data frame containing cost information with a 'sui_treatment'
#'        column and various cost columns
#' @param category_mapping A named list mapping cost categories to column names in
#'        the dataset
#' @param output_path Character string specifying the path where the Word document
#'        should be saved
#' @param cost_stats Character vector of statistics to include in the table
#'        (default: c("mean", "median", "sd", "q25", "q75", "n"))
#' @param verbose Logical indicating whether to print detailed logs (default: TRUE)
#'
#' @return A list containing two elements: the long-format summary statistics data frame
#'         and the created flextable object
#'
#' @importFrom dplyr filter select mutate group_by summarize ungroup bind_rows across
#' @importFrom tidyr pivot_longer
#' @importFrom stringr str_replace
#' @importFrom flextable flextable add_header_row merge_h theme_vanilla bold italic
#'             align fontsize color body_add_flextable save_as_docx
#' @importFrom officer fp_border
#' @importFrom logger log_info log_warn log_error log_debug log_threshold INFO
#'
#' @examples
#' # Example 1: Basic usage with all default parameters
#' cost_mapping <- list(
#'   "Lab Tests" = c("lab_test_cost1", "lab_test_cost2"),
#'   "Medication" = c("med_cost1", "med_cost2"),
#'   "Total" = "total_cost"
#' )
#' create_cost_summary(example_cost_data, cost_mapping,
#'                     "output_cost_table.docx", verbose = TRUE)
#'
#' # Example 2: Customizing statistics displayed in the table
#' create_cost_summary(example_cost_data, cost_mapping,
#'                     "custom_stats_table.docx",
#'                     cost_stats = c("mean", "median", "n"),
#'                     verbose = FALSE)
#'
#' # Example 3: Using with a filtered dataset and custom output path
#' filtered_data <- dplyr::filter(example_cost_data, year >= 2020)
#' create_cost_summary(filtered_data, cost_mapping,
#'                     "Data/filtered_cost_summary.docx",
#'                     cost_stats = c("mean", "median", "min", "max"),
#'                     verbose = TRUE)
#'
create_cost_summary <- function(cost_data, 
                                category_mapping, 
                                output_path, 
                                cost_stats = c("mean", "median", "sd", "q25", "q75", "n"),
                                verbose = TRUE) {
  
  # Set up logging based on verbose parameter
  if (verbose) {
    logger::log_threshold(logger::INFO)
    logger::log_info("Starting cost analysis summary function")
  } else {
    logger::log_threshold(logger::WARN)
  }
  
  # Wrap the entire function in tryCatch to handle all possible errors
  tryCatch({
    # Input validation
    validate_inputs(cost_data, category_mapping, output_path, cost_stats)
    
    # Log input parameters
    logger::log_info("Input data dimensions: {0} rows, {1} columns", 
                     nrow(cost_data), 
                     ncol(cost_data))
    logger::log_info("Categories to process: {0}", 
                     paste(names(category_mapping), collapse=', '))
    logger::log_info("Output path: {0}", output_path)
    logger::log_info("Statistics to include: {0}", 
                     paste(cost_stats, collapse=', '))
    
    # Ensure data is ungrouped
    cost_data <- dplyr::ungroup(cost_data)
    logger::log_info("Removed any pre-existing groupings from the data")
    
    # Compute summary statistics
    logger::log_info("Computing summary statistics by treatment group - excluding zeros and NAs")
    summary_stats <- compute_category_stats(cost_data, category_mapping)
    logger::log_info("Summary statistics calculation completed with {0} rows", nrow(summary_stats))
    
    # Simple flextable as fallback
    if (nrow(summary_stats) == 0) {
      logger::log_warn("No summary statistics generated - creating simple message table")
      simple_table <- data.frame(Message = "No data available after processing")
      cost_table <- flextable::flextable(simple_table)
    } else {
      # Create flextable with our complex approach
      logger::log_info("Creating formatted flextable")
      
      # Create a very simple flextable first, to ensure it works
      wide_data <- summary_stats %>%
        dplyr::select(sui_treatment, category, statistic, value) %>%
        tidyr::pivot_wider(
          id_cols = c(category, statistic),
          names_from = sui_treatment,
          values_from = value
        ) 
      
      cost_table <- flextable::flextable(wide_data)
      logger::log_info("Created basic flextable successfully")
      
      # Try to apply additional formatting
      cost_table <- try(create_formatted_table(summary_stats, cost_stats), silent = TRUE)
      
      # If that failed, use our simple version
      if (inherits(cost_table, "try-error")) {
        logger::log_warn("Error in creating formatted table: {0}", attr(cost_table, "condition")$message)
        logger::log_info("Falling back to simple flextable")
        cost_table <- flextable::flextable(wide_data)
      }
    }
    
    logger::log_info("Flextable creation completed")
    
    # Save to Word document
    logger::log_info("Saving flextable to Word document: {0}", output_path)
    save_result <- flextable::save_as_docx(cost_table, path = output_path)
    logger::log_info("Document saved successfully to: {0}", output_path)
    
    # Return results
    return(list(
      summary_stats = summary_stats,
      flextable = cost_table
    ))
    
  }, error = function(e) {
    # Log the error
    logger::log_error("Error in create_cost_summary: {0}", e$message)
    
    # Create a simple error table
    error_table <- data.frame(Error = paste("An error occurred:", e$message))
    simple_table <- flextable::flextable(error_table)
    
    # Try to save even the error table
    tryCatch({
      flextable::save_as_docx(simple_table, path = output_path)
      logger::log_info("Saved error message to document: {0}", output_path)
    }, error = function(save_error) {
      logger::log_error("Could not save error table: {0}", save_error$message)
    })
    
    # Return minimal results
    return(list(
      error = e$message,
      flextable = simple_table
    ))
  })
}

#' Validate function inputs
#'
#' @param cost_data Data frame to validate
#' @param category_mapping List to validate
#' @param output_path String to validate
#' @param cost_stats Character vector to validate
#'
#' @noRd
validate_inputs <- function(cost_data, category_mapping, output_path, cost_stats) {
  # Check cost_data
  assertthat::assert_that(is.data.frame(cost_data),
                          msg = "cost_data must be a data frame")
  
  # Check for required column
  assertthat::assert_that("sui_treatment" %in% names(cost_data),
                          msg = "cost_data must contain 'sui_treatment' column")
  
  # Check category_mapping
  assertthat::assert_that(is.list(category_mapping),
                          msg = "category_mapping must be a list")
  assertthat::assert_that(length(category_mapping) > 0,
                          msg = "category_mapping must not be empty")
  assertthat::assert_that(all(sapply(names(category_mapping), nchar) > 0),
                          msg = "All elements in category_mapping must be named")
  
  # Check output_path
  assertthat::assert_that(is.character(output_path),
                          msg = "output_path must be a character string")
  assertthat::assert_that(nchar(output_path) > 0,
                          msg = "output_path must not be empty")
  assertthat::assert_that(tools::file_ext(output_path) == "docx",
                          msg = "output_path must have a .docx extension")
  
  # Create directory if it doesn't exist
  output_dir <- dirname(output_path)
  if (!dir.exists(output_dir)) {
    logger::log_info("Creating directory: {output_dir}")
    dir.create(output_dir, recursive = TRUE)
  }
  
  # Check cost_stats
  assertthat::assert_that(is.character(cost_stats),
                          msg = "cost_stats must be a character vector")
  valid_stats <- c("mean", "median", "sd", "min", "max", "q25", "q75", "n")
  invalid_stats <- setdiff(cost_stats, valid_stats)
  assertthat::assert_that(length(invalid_stats) == 0,
                          msg = paste("Invalid statistics:", 
                                      paste(invalid_stats, collapse = ", ")))
}

#' Compute summary statistics for each cost category
#'
#' @param cost_data Data frame with cost data
#' @param category_mapping List mapping categories to columns
#'
#' @return A data frame with summary statistics in long format
#'
#' @noRd
compute_category_stats <- function(cost_data, category_mapping) {
  all_category_results <- NULL
  
  # Process each category
  for (category_name in names(category_mapping)) {
    logger::log_info("Processing category: {0}", category_name)
    cols_to_sum <- category_mapping[[category_name]]
    
    # Check if columns exist
    missing_cols <- cols_to_sum[!cols_to_sum %in% names(cost_data)]
    if (length(missing_cols) > 0) {
      logger::log_warn("Missing {0} columns for {1}: {2}", 
                       length(missing_cols), 
                       category_name, 
                       paste(missing_cols, collapse=', '))
      # Skip missing columns
      cols_to_sum <- cols_to_sum[cols_to_sum %in% names(cost_data)]
    }
    
    if (length(cols_to_sum) == 0) {
      logger::log_warn("No valid columns for category: {0}", category_name)
      next
    }
    
    # Calculate category costs
    logger::log_debug("Calculating category: {0} with {1} columns", 
                      category_name, 
                      length(cols_to_sum))
    category_results <- calculate_cost_category(cost_data, cols_to_sum, category_name)
    
    # Append to results
    if (is.null(all_category_results)) {
      all_category_results <- category_results
    } else {
      all_category_results <- dplyr::bind_rows(all_category_results, category_results)
    }
  }
  
  # Clean up statistic names and round values
  all_category_results <- all_category_results %>%
    dplyr::mutate(
      statistic = stringr::str_replace(statistic, "_value$", ""),
      # Round all values except n (which should already be an integer)
      value = ifelse(statistic == "n", value, round(value))
    ) %>%
    # Remove any rows where value is NA (final clean-up)
    dplyr::filter(!is.na(value))
  
  return(all_category_results)
}

#' Calculate statistics for a single cost category
#'
#' @param cost_data Data frame with cost data
#' @param cols_to_sum Columns to sum for this category
#' @param category_name Name of the category
#'
#' @return A data frame with category statistics
#'
#' @noRd
calculate_cost_category <- function(cost_data, cols_to_sum, category_name) {
  # Create a new dataframe with sum column and treatment
  if (length(cols_to_sum) == 1) {
    # If there's only one column, no need for rowSums
    sum_df <- cost_data %>%
      dplyr::select(sui_treatment, sum_column = !!cols_to_sum) %>%
      dplyr::ungroup()
    
    logger::log_debug("Single column summation for {0}: {1}", 
                      category_name, 
                      cols_to_sum)
  } else {
    # For multiple columns, use rowSums
    sum_df <- cost_data %>%
      dplyr::select(sui_treatment, dplyr::all_of(cols_to_sum)) %>%
      dplyr::ungroup() %>%
      dplyr::mutate(sum_column = rowSums(dplyr::across(dplyr::all_of(cols_to_sum)), 
                                         na.rm = TRUE)) %>%
      dplyr::select(sui_treatment, sum_column)
    
    logger::log_debug("Multiple column summation for {0}, {1} columns", 
                      category_name, 
                      length(cols_to_sum))
  }
  
  # Filter out zeros and NAs before calculating statistics
  sum_df <- sum_df %>%
    dplyr::filter(!is.na(sum_column), sum_column > 0)
  
  # If no non-zero records remain, create placeholder
  if (nrow(sum_df) == 0) {
    logger::log_warn("No non-zero values for category: {0}", category_name)
    # Create a dummy record to avoid errors, will be filtered out later
    sum_df <- tibble::tibble(sui_treatment = c("PT", "Pessary", "Sling"), 
                             sum_column = c(NA, NA, NA))
  }
  
  # Calculate stats for each treatment group
  logger::log_debug("Calculating treatment-specific stats for {category_name}")
  treatment_stats <- sum_df %>%
    dplyr::group_by(sui_treatment) %>%
    dplyr::summarize(
      mean_value = mean(sum_column, na.rm = TRUE),
      median_value = stats::median(sum_column, na.rm = TRUE),
      sd_value = stats::sd(sum_column, na.rm = TRUE),
      min_value = min(sum_column, na.rm = TRUE),
      max_value = max(sum_column, na.rm = TRUE),
      q25_value = stats::quantile(sum_column, 0.25, na.rm = TRUE),
      q75_value = stats::quantile(sum_column, 0.75, na.rm = TRUE),
      n_value = dplyr::n(),
      .groups = "drop"
    )
  
  # Calculate overall stats
  logger::log_debug("Calculating overall stats for {category_name}")
  all_sum_df <- sum_df %>%
    dplyr::filter(!is.na(sum_column))
  
  if (nrow(all_sum_df) > 0) {
    overall_stats <- all_sum_df %>%
      dplyr::summarize(
        sui_treatment = "Overall",
        mean_value = mean(sum_column, na.rm = TRUE),
        median_value = stats::median(sum_column, na.rm = TRUE),
        sd_value = stats::sd(sum_column, na.rm = TRUE),
        min_value = min(sum_column, na.rm = TRUE),
        max_value = max(sum_column, na.rm = TRUE),
        q25_value = stats::quantile(sum_column, 0.25, na.rm = TRUE),
        q75_value = stats::quantile(sum_column, 0.75, na.rm = TRUE),
        n_value = dplyr::n()
      )
  } else {
    # Create placeholder if no data
    overall_stats <- tibble::tibble(
      sui_treatment = "Overall",
      mean_value = NA_real_,
      median_value = NA_real_,
      sd_value = NA_real_,
      min_value = NA_real_,
      max_value = NA_real_,
      q25_value = NA_real_,
      q75_value = NA_real_,
      n_value = 0
    )
  }
  
  # Combine treatment-specific and overall stats
  combined_stats <- dplyr::bind_rows(treatment_stats, overall_stats)
  
  # Convert to long format and add category information
  category_result <- combined_stats %>%
    # Filter out the dummy records if they were created
    dplyr::filter(n_value > 0 | is.na(mean_value)) %>%
    # Convert to long format
    tidyr::pivot_longer(
      cols = -sui_treatment,
      names_to = "statistic",
      values_to = "value"
    ) %>%
    # Add category information
    dplyr::mutate(
      category = category_name,
      # Add provenance information
      provenance = paste("Columns Used:", paste(cols_to_sum, collapse = ", "))
    )
  
  logger::log_debug("Completed statistics for {category_name}")
  return(category_result)
}

21 shannon_cost_summary

# Define your cost category mappings (from your paste.txt)
provenance_mapping <- list(
  "Pre-intervention lab" = c("OP_facility_UA_cost", "professional_UA_cost", 
                           "OP_facility_urineCX_cost", "professional_urineCX_cost", 
                           "OP_facility_microscopy_cost", "professional_microscopy_cost"),
  "Post-intervention lab" = c("OP_facility_UA_cost", "professional_UA_cost", 
                            "OP_facility_urineCX_cost", "professional_urineCX_cost", 
                            "OP_facility_microscopy_cost", "professional_microscopy_cost"),
  "UDS" = c("OP_facility_uroflowmetry_cost", "professional_uroflowmetry_cost", 
          "OP_facility_cystometrics_cost", "professional_cystometrics_cost", 
          "OP_facility_UDS_cost", "professional_UDS_cost"),
  "Complications costs" = c("OP_facility_complication_cost", "edOP_facility_complication_cost", 
                          "professional_complication_cost", "edprofessional_complication_cost"),
  "Pre-intervention EM" = c("OP_facility_EM_cost", "professional_EM_cost"),
  "Post-intervention EM" = c("OP_facility_EM_cost", "professional_EM_cost"),
  "Pre-intervention cystoscopy" = c("OP_facility_cystometrics_cost", "professional_cystometrics_cost", "OP_facility_cystoscopy_cost", "professional_cystoscopy_cost"),
  "Final cost" = "Total_cost"
)

# Read your cost data
df_costs <- readRDS("Data/CADR_2023/final_push/_Costs_step33.rds")

# Generate the formatted table and save to Word document
shannon_cost_summary <- create_cost_summary(
  cost_data = df_costs,
  category_mapping = provenance_mapping,
  output_path = "Data/CADR_2023/final_push/cost_point_estimates.docx",
  cost_stats = c("mean", "median", "sd", "q25", "q75", "n"),
  verbose = TRUE
); print(shannon_cost_summary$summary_stats, n=1000)

22 A tibble: 224 × 5

sui_treatment statistic  value category                    provenance       
<chr>         <chr>      <dbl> <chr>                       <chr>

1 PT mean 11 Pre-intervention lab Columns Used: OP… 2 PT median 5 Pre-intervention lab Columns Used: OP… 3 PT sd 10 Pre-intervention lab Columns Used: OP… 4 PT min 2 Pre-intervention lab Columns Used: OP… 5 PT max 158 Pre-intervention lab Columns Used: OP… 6 PT q25 4 Pre-intervention lab Columns Used: OP… 7 PT q75 15 Pre-intervention lab Columns Used: OP… 8 PT n 17395 Pre-intervention lab Columns Used: OP… 9 Pessary mean 12 Pre-intervention lab Columns Used: OP… 10 Pessary median 11 Pre-intervention lab Columns Used: OP… 11 Pessary sd 10 Pre-intervention lab Columns Used: OP… 12 Pessary min 2 Pre-intervention lab Columns Used: OP… 13 Pessary max 185 Pre-intervention lab Columns Used: OP… 14 Pessary q25 4 Pre-intervention lab Columns Used: OP… 15 Pessary q75 16 Pre-intervention lab Columns Used: OP… 16 Pessary n 24453 Pre-intervention lab Columns Used: OP… 17 Sling mean 10 Pre-intervention lab Columns Used: OP… 18 Sling median 4 Pre-intervention lab Columns Used: OP… 19 Sling sd 13 Pre-intervention lab Columns Used: OP… 20 Sling min 2 Pre-intervention lab Columns Used: OP… 21 Sling max 683 Pre-intervention lab Columns Used: OP… 22 Sling q25 4 Pre-intervention lab Columns Used: OP… 23 Sling q75 15 Pre-intervention lab Columns Used: OP… 24 Sling n 12408 Pre-intervention lab Columns Used: OP… 25 Overall mean 11 Pre-intervention lab Columns Used: OP… 26 Overall median 5 Pre-intervention lab Columns Used: OP… 27 Overall sd 11 Pre-intervention lab Columns Used: OP… 28 Overall min 2 Pre-intervention lab Columns Used: OP… 29 Overall max 683 Pre-intervention lab Columns Used: OP… 30 Overall q25 4 Pre-intervention lab Columns Used: OP… 31 Overall q75 15 Pre-intervention lab Columns Used: OP… 32 Overall n 54256 Pre-intervention lab Columns Used: OP… 33 PT mean 11 Post-intervention lab Columns Used: OP… 34 PT median 5 Post-intervention lab Columns Used: OP… 35 PT sd 10 Post-intervention lab Columns Used: OP… 36 PT min 2 Post-intervention lab Columns Used: OP… 37 PT max 158 Post-intervention lab Columns Used: OP… 38 PT q25 4 Post-intervention lab Columns Used: OP… 39 PT q75 15 Post-intervention lab Columns Used: OP… 40 PT n 17395 Post-intervention lab Columns Used: OP… 41 Pessary mean 12 Post-intervention lab Columns Used: OP… 42 Pessary median 11 Post-intervention lab Columns Used: OP… 43 Pessary sd 10 Post-intervention lab Columns Used: OP… 44 Pessary min 2 Post-intervention lab Columns Used: OP… 45 Pessary max 185 Post-intervention lab Columns Used: OP… 46 Pessary q25 4 Post-intervention lab Columns Used: OP… 47 Pessary q75 16 Post-intervention lab Columns Used: OP… 48 Pessary n 24453 Post-intervention lab Columns Used: OP… 49 Sling mean 10 Post-intervention lab Columns Used: OP… 50 Sling median 4 Post-intervention lab Columns Used: OP… 51 Sling sd 13 Post-intervention lab Columns Used: OP… 52 Sling min 2 Post-intervention lab Columns Used: OP… 53 Sling max 683 Post-intervention lab Columns Used: OP… 54 Sling q25 4 Post-intervention lab Columns Used: OP… 55 Sling q75 15 Post-intervention lab Columns Used: OP… 56 Sling n 12408 Post-intervention lab Columns Used: OP… 57 Overall mean 11 Post-intervention lab Columns Used: OP… 58 Overall median 5 Post-intervention lab Columns Used: OP… 59 Overall sd 11 Post-intervention lab Columns Used: OP… 60 Overall min 2 Post-intervention lab Columns Used: OP… 61 Overall max 683 Post-intervention lab Columns Used: OP… 62 Overall q25 4 Post-intervention lab Columns Used: OP… 63 Overall q75 15 Post-intervention lab Columns Used: OP… 64 Overall n 54256 Post-intervention lab Columns Used: OP… 65 PT mean 289 UDS Columns Used: OP… 66 PT median 209 UDS Columns Used: OP… 67 PT sd 187 UDS Columns Used: OP… 68 PT min 4 UDS Columns Used: OP… 69 PT max 1718 UDS Columns Used: OP… 70 PT q25 186 UDS Columns Used: OP… 71 PT q75 440 UDS Columns Used: OP… 72 PT n 5115 UDS Columns Used: OP… 73 Pessary mean 384 UDS Columns Used: OP… 74 Pessary median 430 UDS Columns Used: OP… 75 Pessary sd 234 UDS Columns Used: OP… 76 Pessary min 4 UDS Columns Used: OP… 77 Pessary max 2121 UDS Columns Used: OP… 78 Pessary q25 188 UDS Columns Used: OP… 79 Pessary q75 565 UDS Columns Used: OP… 80 Pessary n 2878 UDS Columns Used: OP… 81 Sling mean 422 UDS Columns Used: OP… 82 Sling median 464 UDS Columns Used: OP… 83 Sling sd 188 UDS Columns Used: OP… 84 Sling min 4 UDS Columns Used: OP… 85 Sling max 1096 UDS Columns Used: OP… 86 Sling q25 304 UDS Columns Used: OP… 87 Sling q75 564 UDS Columns Used: OP… 88 Sling n 2456 UDS Columns Used: OP… 89 Overall mean 347 UDS Columns Used: OP… 90 Overall median 298 UDS Columns Used: OP… 91 Overall sd 209 UDS Columns Used: OP… 92 Overall min 4 UDS Columns Used: OP… 93 Overall max 2121 UDS Columns Used: OP… 94 Overall q25 190 UDS Columns Used: OP… 95 Overall q75 537 UDS Columns Used: OP… 96 Overall n 10449 UDS Columns Used: OP… 97 PT mean 189 Complications costs Columns Used: OP… 98 PT median 92 Complications costs Columns Used: OP… 99 PT sd 274 Complications costs Columns Used: OP… 100 PT min 4 Complications costs Columns Used: OP… 101 PT max 1734 Complications costs Columns Used: OP… 102 PT q25 44 Complications costs Columns Used: OP… 103 PT q75 201 Complications costs Columns Used: OP… 104 PT n 140 Complications costs Columns Used: OP… 105 Pessary mean 163 Complications costs Columns Used: OP… 106 Pessary median 144 Complications costs Columns Used: OP… 107 Pessary sd 152 Complications costs Columns Used: OP… 108 Pessary min 3 Complications costs Columns Used: OP… 109 Pessary max 1567 Complications costs Columns Used: OP… 110 Pessary q25 37 Complications costs Columns Used: OP… 111 Pessary q75 231 Complications costs Columns Used: OP… 112 Pessary n 780 Complications costs Columns Used: OP… 113 Sling mean 384 Complications costs Columns Used: OP… 114 Sling median 221 Complications costs Columns Used: OP… 115 Sling sd 394 Complications costs Columns Used: OP… 116 Sling min 9 Complications costs Columns Used: OP… 117 Sling max 1986 Complications costs Columns Used: OP… 118 Sling q25 94 Complications costs Columns Used: OP… 119 Sling q75 682 Complications costs Columns Used: OP… 120 Sling n 109 Complications costs Columns Used: OP… 121 Overall mean 190 Complications costs Columns Used: OP… 122 Overall median 144 Complications costs Columns Used: OP… 123 Overall sd 220 Complications costs Columns Used: OP… 124 Overall min 3 Complications costs Columns Used: OP… 125 Overall max 1986 Complications costs Columns Used: OP… 126 Overall q25 39 Complications costs Columns Used: OP… 127 Overall q75 237 Complications costs Columns Used: OP… 128 Overall n 1029 Complications costs Columns Used: OP… 129 Pessary mean 100 Pre-intervention EM Columns Used: OP… 130 Pessary median 87 Pre-intervention EM Columns Used: OP… 131 Pessary sd 49 Pre-intervention EM Columns Used: OP… 132 Pessary min 10 Pre-intervention EM Columns Used: OP… 133 Pessary max 525 Pre-intervention EM Columns Used: OP… 134 Pessary q25 67 Pre-intervention EM Columns Used: OP… 135 Pessary q75 119 Pre-intervention EM Columns Used: OP… 136 Pessary n 4910 Pre-intervention EM Columns Used: OP… 137 Overall mean 100 Pre-intervention EM Columns Used: OP… 138 Overall median 87 Pre-intervention EM Columns Used: OP… 139 Overall sd 49 Pre-intervention EM Columns Used: OP… 140 Overall min 10 Pre-intervention EM Columns Used: OP… 141 Overall max 525 Pre-intervention EM Columns Used: OP… 142 Overall q25 67 Pre-intervention EM Columns Used: OP… 143 Overall q75 119 Pre-intervention EM Columns Used: OP… 144 Overall n 4910 Pre-intervention EM Columns Used: OP… 145 Pessary mean 100 Post-intervention EM Columns Used: OP… 146 Pessary median 87 Post-intervention EM Columns Used: OP… 147 Pessary sd 49 Post-intervention EM Columns Used: OP… 148 Pessary min 10 Post-intervention EM Columns Used: OP… 149 Pessary max 525 Post-intervention EM Columns Used: OP… 150 Pessary q25 67 Post-intervention EM Columns Used: OP… 151 Pessary q75 119 Post-intervention EM Columns Used: OP… 152 Pessary n 4910 Post-intervention EM Columns Used: OP… 153 Overall mean 100 Post-intervention EM Columns Used: OP… 154 Overall median 87 Post-intervention EM Columns Used: OP… 155 Overall sd 49 Post-intervention EM Columns Used: OP… 156 Overall min 10 Post-intervention EM Columns Used: OP… 157 Overall max 525 Post-intervention EM Columns Used: OP… 158 Overall q25 67 Post-intervention EM Columns Used: OP… 159 Overall q75 119 Post-intervention EM Columns Used: OP… 160 Overall n 4910 Post-intervention EM Columns Used: OP… 161 PT mean 259 Pre-intervention cystoscopy Columns Used: OP… 162 PT median 208 Pre-intervention cystoscopy Columns Used: OP… 163 PT sd 150 Pre-intervention cystoscopy Columns Used: OP… 164 PT min 56 Pre-intervention cystoscopy Columns Used: OP… 165 PT max 1316 Pre-intervention cystoscopy Columns Used: OP… 166 PT q25 196 Pre-intervention cystoscopy Columns Used: OP… 167 PT q75 240 Pre-intervention cystoscopy Columns Used: OP… 168 PT n 1300 Pre-intervention cystoscopy Columns Used: OP… 169 Pessary mean 246 Pre-intervention cystoscopy Columns Used: OP… 170 Pessary median 208 Pre-intervention cystoscopy Columns Used: OP… 171 Pessary sd 150 Pre-intervention cystoscopy Columns Used: OP… 172 Pessary min 41 Pre-intervention cystoscopy Columns Used: OP… 173 Pessary max 1452 Pre-intervention cystoscopy Columns Used: OP… 174 Pessary q25 191 Pre-intervention cystoscopy Columns Used: OP… 175 Pessary q75 241 Pre-intervention cystoscopy Columns Used: OP… 176 Pessary n 1505 Pre-intervention cystoscopy Columns Used: OP… 177 Sling mean 233 Pre-intervention cystoscopy Columns Used: OP… 178 Sling median 203 Pre-intervention cystoscopy Columns Used: OP… 179 Sling sd 132 Pre-intervention cystoscopy Columns Used: OP… 180 Sling min 39 Pre-intervention cystoscopy Columns Used: OP… 181 Sling max 1091 Pre-intervention cystoscopy Columns Used: OP… 182 Sling q25 188 Pre-intervention cystoscopy Columns Used: OP… 183 Sling q75 228 Pre-intervention cystoscopy Columns Used: OP… 184 Sling n 1223 Pre-intervention cystoscopy Columns Used: OP… 185 Overall mean 246 Pre-intervention cystoscopy Columns Used: OP… 186 Overall median 207 Pre-intervention cystoscopy Columns Used: OP… 187 Overall sd 145 Pre-intervention cystoscopy Columns Used: OP… 188 Overall min 39 Pre-intervention cystoscopy Columns Used: OP… 189 Overall max 1452 Pre-intervention cystoscopy Columns Used: OP… 190 Overall q25 191 Pre-intervention cystoscopy Columns Used: OP… 191 Overall q75 235 Pre-intervention cystoscopy Columns Used: OP… 192 Overall n 4028 Pre-intervention cystoscopy Columns Used: OP… 193 PT mean 899 Final cost Columns Used: To… 194 PT median 758 Final cost Columns Used: To… 195 PT sd 671 Final cost Columns Used: To… 196 PT min 40 Final cost Columns Used: To… 197 PT max 2858 Final cost Columns Used: To… 198 PT q25 327 Final cost Columns Used: To… 199 PT q75 1340 Final cost Columns Used: To… 200 PT n 69462 Final cost Columns Used: To… 201 Pessary mean 820 Final cost Columns Used: To… 202 Pessary median 691 Final cost Columns Used: To… 203 Pessary sd 525 Final cost Columns Used: To… 204 Pessary min 40 Final cost Columns Used: To… 205 Pessary max 2861 Final cost Columns Used: To… 206 Pessary q25 395 Final cost Columns Used: To… 207 Pessary q75 1143 Final cost Columns Used: To… 208 Pessary n 57245 Final cost Columns Used: To… 209 Sling mean 1855 Final cost Columns Used: To… 210 Sling median 1850 Final cost Columns Used: To… 211 Sling sd 505 Final cost Columns Used: To… 212 Sling min 83 Final cost Columns Used: To… 213 Sling max 2859 Final cost Columns Used: To… 214 Sling q25 1490 Final cost Columns Used: To… 215 Sling q75 2223 Final cost Columns Used: To… 216 Sling n 33616 Final cost Columns Used: To… 217 Overall mean 1072 Final cost Columns Used: To… 218 Overall median 979 Final cost Columns Used: To… 219 Overall sd 715 Final cost Columns Used: To… 220 Overall min 40 Final cost Columns Used: To… 221 Overall max 2861 Final cost Columns Used: To… 222 Overall q25 444 Final cost Columns Used: To… 223 Overall q75 1594 Final cost Columns Used: To… 224 Overall n 160323 Final cost Columns Used: To…

write_csv(shannon_cost_summary$summary_stats %>%
  arrange(sui_treatment, desc=FALSE), "Data/CADR_2023/final_push/cost_point_estimates.csv")

Manuscript

Tyler Muffly, MD

May 11, 2025

1 Load R Libraries

2 Load Bespoke Functions

3 Describe the Data

3.1 _Costs_32_including_5 and_95.csv

3.2 _Costs_step33.rds

3.3 Summing strategy

3.4 Appendix A

3.5 _Margie_Jerry_check_in_Costs.rds

3.6 _Cohort_descriptors

3.7 Linear regression pretty.rds

3.8 March2_table1.rds

3.9 yearly_summary_Cochran_armitage.rds

3.10 Provider_specialty.rds

4 METHODS

5 Five Percent and Ninety-Five Percent Trimming due to Data Inconsistencies

5.0.1 Sensitivity Analysis: Overall Trimming Levels

5.0.2 Log-Transforming and Visualizing Cost Data

5.1 Who was eliminated by the trimming?

6 A tibble: 3 × 8

7 RESULTS SECTION

8 PATIENT CHARACTERISTICS

8.0.1 Testing if treatment distribution differs from equal distribution

8.1 Elixhauser Supplemental Table

9 Trend Analysis of SUI Treatments from 2008 to 2016

9.1 Cochran-Armitage Analysis: Trend Analysis of SUI Treatments from 2008 to 2016

9.2 Cochran-Armitage Analysis: PFMT usage increased from xx% in 2008 to xxx% in 2016

9.3 Cochran-Armitage Graph function

9.4 Figure 1

9.5 Cochran-Armitage Analysis: The absolute annual percentage increase was X% (95% CI: Y-Z).

9.6 Cochran-Armitage: “The observed trends remained significant after adjusting for age, race, and comorbidities (data not shown).”

9.7 Cochran-Armitage: After adjusting for age, race, and comorbidities, the AAPC for PFMT ??increased to xx% per year (95% CI: ??% to ??%, p<??), indicating a more pronounced upward trend, while the AAPC for sling surgeries

9.8 Cochran-Armitage: Supplemental table of non-adjusted and adjusted APC for supplementary data. Supplemental table.

10 VAGINAL PESSARY

10.1 Pessary: Initial SUI Treatment

10.2 Pessary: Unweighted Elixhauser Score (Basic Count of Comorbidities)

10.3 Pessary: Compute the Weighted Elixhauser Score

10.4 Pessary treatments were predominantly used by White beneficiaries, with 92.2% of pessary users identifying as White, consistent with the overall cohort demographics.

10.5 Pessary: “Among reported clinicians, Obstetrics/gynecology were the most frequent providers of pessary fittings, performing 29.50% (n=2291) of these procedures.”

10.6 Pessary: The median cost per patient for pessary treatment was $xxx (IQR \(xx–\)xx), which included provider professional fees and facility fees.

10.7 Pessary: Patients receiving additional procedures such as cystoscopy or bladder diagnostics alongside pessary treatment had higher median costs of $x (IQR: \(xx–\)xxx) compared to those who did not receive these procedures ($xxx, IQR: \(xx–\)xx, Mann-Whitney U test p < xx)

10.9 Pessary: Better Overall Pessary

10.10 Pessary: Overall, pessary??? care remained less expensive compared to sling surgery??? (median per-patient cost: $xxx

10.11 Pessary: The median outpatient facility cost for pessary treatment was ???higher/???lower/same??? than that for ???sling surgery??? but higher/lower/same??? than ???PFMT??? (Table 2).

10.12 Pessary: Complication cost

11 Pelvic Floor Muscle Therapy for Initial SUI Treatment

11.0.1 To verify if PFMT is more expensive than Pessary in all cases, you should compare total costs for both treatments across different statistics

12 A tibble: 1 × 7

12.1 PFMT is statistically less expensive than Pessary

12.2 PFMT: Patients undergoing PFMT had a median age of xx years (IQR: yy–yy), ??slightly ??younger/??older than those receiving ??pessary treatment (Mann-Whitney U test, p=???).

12.3 Physical Therapy: Provider PFMT was ??equally prescribed by ???urologists (???%, n=??) and ??obstetrician-gynecologists (????%, n=???).

12.5 PMFT: The median facility fee for PFMT was $xx (IQR \(xx-\)xxx), and the median total cost was $xxx.

13 SLING

13.2 Sling: Costs of slings

13.3 Sling: Provider doing fucking sling

14 URODYNAMICS:

14.0.1 Urodynamics: Among the 16695 women in our study,

14.0.2 Urodynamics: Median costs were $??? (IQR: \(??-\)???) for non-video urodynamic testing.

15 TABLES

15.1 Table 1: Generate Demographics Table for Stress Urinary Incontinence Therapies

15.2 Table 2

15.3 Table 2: Categories Supplemental

15.4 Table 2: Categories Tally

16 Log Linear Model

16.1 Data Prep

16.2 Remove zero variance and near-zero variance

16.3 Create the log linear model

16.4 Back transform the coefficients

16.5 Table 3: Log linear regression results

16.6 Cleans the Variable Names

16.7 Forest Plot

16.7.1 According to our regression analysis, sling treatments were associated with a predicted total cost

17 A tibble: 34 × 7

18 Cost Point Estimates

19 A tibble: 192 × 5

20 Abstract generate_cohort_description

20.1 Abstract Most Medicare beneficiaries undergoing therapy for SUI were White (xxx%) and had a median of xx years old (interquartile range (IQR) xx, xxx).

20.2 Abstract