library(haven)
library(dplyr)
library(tidyr)
library(forcats)
library(ggplot2)
library(knitr)
library(kableExtra)

setwd("C:/Users/MMDIAZ/Downloads")

1. Data Preparation and Cleaning

1.1 Data Loading and Merging

# Load data
data1 <- read_dta("Barbados_18Nov25_cleaned.dta")
data2 <- read_dta("Meddata_extraction_cleaned_20251118.dta")

# Merge datasets
df <- inner_join(data1, data2, by = "uid")

# Rename all columns to lowercase
names(df) <- tolower(names(df))

cat("Total records after merge:", nrow(df), "\n")
## Total records after merge: 992

1.2 Quality Control

# Remove records where primarydiagnosis exists but acsc is missing
records_before <- nrow(df)
dfcleaned <- df %>%
  filter(is.na(primarydiagnosis.x) | !is.na(acsc))
records_after <- nrow(dfcleaned)

cat("Records removed due to diagnosis-ACSC inconsistency:", 
    records_before - records_after, "\n")
## Records removed due to diagnosis-ACSC inconsistency: 166
# Verify data integrity
#stopifnot(all(!is.na(df$acsc[!is.na(df$primarydiagnosis.x)])))

1.3 Variable Creation

# Age groups
df <- df %>%
  mutate(
    agegroup = cut(ageatencounterstart, 
                   breaks = c(18, 40, 60, 80, 100), 
                   right = FALSE,
                   labels = c("18-39", "40-59", "60-79", "80+"))
  )

stopifnot(all(!is.na(df$agegroup)))

# Length of stay
df <- df %>%
  mutate(los_days = as.numeric(discharge_date.x - admission_date.x))

# Day of week and weekend indicator
df <- df %>%
  mutate(
    dow = as.integer(format(admission_date.x, "%w")),
    weekend = dow %in% c(0, 6),
    weekend = factor(weekend, levels = c(FALSE, TRUE), 
                    labels = c("Weekday", "Weekend"))
  )

# ACSC factor
df <- df %>%
  mutate(acsc = factor(acsc, levels = c(0, 1), labels = c("Non-ACSC", "ACSC")))

1.4 Data Filtering

# Variable to mark records for exclusion
df <- df %>%
  mutate(todrop = 0)

# Exclusion criteria
df <- df %>%
  mutate(
    todrop = ifelse(admission_date.x <= as.Date("2023-05-01"), 1, todrop),
    todrop = ifelse(los_days >= 60, 2, todrop)
  )

# Exclusion table
exclusion_table <- df %>%
  count(todrop) %>%
  mutate(
    Reason = case_when(
      todrop == 0 ~ "Included",
      todrop == 1 ~ "Admission before May 01, 2023",
      todrop == 2 ~ "Length of stay >= 60 days"
    )
  ) %>%
  select(Reason, n)

kable(exclusion_table, 
      caption = "Table 1: Exclusion Criteria",
      col.names = c("Criterion", "N")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"))
Table 1: Exclusion Criteria
Criterion N
Included 971
Length of stay >= 60 days 21
# Apply filter
df <- df %>%
  filter(todrop == 0)

1.5 Missing Diagnoses

# Identify missing diagnoses
df <- df %>%
  mutate(missdiag = primarydiagnosis.y == "" | is.na(primarydiagnosis.y))

# Overall summary
missing_summary <- df %>%
  summarise(
    `Missing proportion` = mean(missdiag),
    `Total missing` = sum(missdiag),
    `Total records` = n()
  )

kable(missing_summary, 
      digits = 3,
      caption = "Table 2: Summary of Missing Diagnoses") %>%
  kable_styling(bootstrap_options = c("striped", "hover"))
Table 2: Summary of Missing Diagnoses
Missing proportion Total missing Total records
0.165 160 971
# Comparison by weekend
missing_by_weekend <- df %>%
  group_by(weekend) %>%
  summarise(
    `Missing proportion` = mean(missdiag),
    `N missing` = sum(missdiag),
    `N total` = n()
  )

kable(missing_by_weekend, 
      digits = 3,
      caption = "Table 3: Missing Diagnoses by Day Type") %>%
  kable_styling(bootstrap_options = c("striped", "hover"))
Table 3: Missing Diagnoses by Day Type
weekend Missing proportion N missing N total
Weekday 0.163 117 719
Weekend 0.171 43 252
ggplot(missing_by_weekend, aes(x = weekend, y = `Missing proportion`, fill = weekend)) +
  geom_col(alpha = 0.8) +
  geom_text(aes(label = scales::percent(`Missing proportion`, accuracy = 0.1)),
            vjust = -0.5, size = 4) +
  scale_y_continuous(labels = scales::percent, limits = c(0, NA)) +
  scale_fill_manual(values = c("#3498db", "#e74c3c")) +
  labs(
    title = "Proportion of Missing Diagnoses",
    subtitle = "Comparison between weekdays and weekends",
    x = "",
    y = "Proportion"
  ) +
  theme_minimal() +
  theme(legend.position = "none",
        plot.title = element_text(face = "bold", size = 14))
Figure 1: Proportion of Missing Diagnoses by Day Type

Figure 1: Proportion of Missing Diagnoses by Day Type

df <- df %>%
  select(-missdiag)

2. Analysis - All Patients

2.1 ACSC Prevalence with Bounds

# Convert ACSC to numeric
df <- df %>%
  mutate(
    acsc_numeric = as.numeric(as.factor(acsc)) - 1,
    acsc100 = acsc_numeric * 100
  )

# Upper and lower bounds
df <- df %>%
  mutate(
    acsc_upper = ifelse(is.na(acsc100), 100, acsc100),
    acsc_lower = ifelse(is.na(acsc100), 0, acsc100)
  )

# Prevalence by day type
prevalence_table <- df %>%
  group_by(weekend) %>%
  summarise(
    `ACSC prevalence` = mean(acsc100, na.rm = TRUE),
    `Lower bound` = mean(acsc_lower),
    `Upper bound` = mean(acsc_upper),
    N = n()
  ) %>%
  bind_rows(
    df %>%
      summarise(
        weekend = "TOTAL",
        `ACSC prevalence` = mean(acsc100, na.rm = TRUE),
        `Lower bound` = mean(acsc_lower),
        `Upper bound` = mean(acsc_upper),
        N = n()
      )
  )

kable(prevalence_table, 
      digits = 2,
      caption = "Table 4: ACSC Prevalence (%) by Day Type",
      col.names = c("Period", "Prevalence", "Lower Bound", "Upper Bound", "N")) %>%
  kable_styling(bootstrap_options = c("striped", "hover")) %>%
  row_spec(nrow(prevalence_table), bold = TRUE, background = "#f0f0f0")
Table 4: ACSC Prevalence (%) by Day Type
Period Prevalence Lower Bound Upper Bound N
Weekday 17.77 14.88 31.15 719
Weekend 22.01 18.25 35.32 252
TOTAL 18.87 15.76 32.23 971
prevalence_plot_data <- prevalence_table %>%
  filter(weekend != "TOTAL")

ggplot(prevalence_plot_data, aes(x = weekend, y = `ACSC prevalence`)) +
  geom_col(aes(fill = weekend), alpha = 0.7, width = 0.6) +
  geom_errorbar(aes(ymin = `Lower bound`, ymax = `Upper bound`),
                width = 0.2, size = 1) +
  geom_text(aes(label = sprintf("%.1f%%", `ACSC prevalence`)),
            vjust = -1.5, size = 4, fontface = "bold") +
  scale_fill_manual(values = c("#3498db", "#e74c3c")) +
  labs(
    title = "ACSC Prevalence by Day Type",
    subtitle = "Error bars show lower and upper bounds",
    x = "",
    y = "Prevalence (%)"
  ) +
  theme_minimal() +
  theme(legend.position = "none",
        plot.title = element_text(face = "bold", size = 14))
Figure 2: ACSC Prevalence with Uncertainty Intervals

Figure 2: ACSC Prevalence with Uncertainty Intervals

2.2 Prevalence by Age and Sex

prevalence_age_sex <- df %>%
  group_by(agegroup, sex) %>%
  summarise(mean_acsc100 = mean(acsc100, na.rm = TRUE), .groups = "drop") %>%
  pivot_wider(names_from = sex, values_from = mean_acsc100)

kable(prevalence_age_sex, 
      digits = 2,
      caption = "Table 5: ACSC Prevalence (%) by Age Group and Sex",
      col.names = c("Age Group", "Female", "Male")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"))
Table 5: ACSC Prevalence (%) by Age Group and Sex
Age Group Female Male
18-39 6.35 11.36
40-59 17.65 31.34
60-79 31.25 30.97
80+ 37.93 27.59
prevalence_age_sex_long <- df %>%
  filter(!is.na(acsc100)) %>%
  group_by(agegroup, sex) %>%
  summarise(mean_acsc100 = mean(acsc100, na.rm = TRUE), .groups = "drop")

ggplot(prevalence_age_sex_long, aes(x = agegroup, y = mean_acsc100, fill = sex)) +
  geom_col(position = "dodge", alpha = 0.8) +
  geom_text(aes(label = sprintf("%.1f%%", mean_acsc100)),
            position = position_dodge(width = 0.9),
            vjust = -0.5, size = 3.5) +
  scale_fill_manual(
    values = c("F" = "#e91e63", "M" = "#2196f3"),
    labels = c("F" = "Female", "M" = "Male"),
    name = "Sex"  # Agregar nombre explícito
  ) +
  labs(
    title = "ACSC Prevalence by Age Group and Sex",
    x = "Age Group",
    y = "Prevalence (%)"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 14),
    legend.position = "top"
  )
Figure 3: ACSC Prevalence by Age and Sex

Figure 3: ACSC Prevalence by Age and Sex

df <- df %>%
  select(-acsc_upper, -acsc_lower, -acsc100)

3. Analysis - Patients with Complete Diagnosis

df_complete <- df %>%
  filter(trimws(primarydiagnosis.x) != "")

stopifnot(all(!is.na(df_complete$acsc)))

cat("Total patients with complete diagnosis:", nrow(df_complete), "\n")
## Total patients with complete diagnosis: 811
cat("Proportion of total:", scales::percent(nrow(df_complete) / nrow(df)), "\n")
## Proportion of total: 84%

3.1 ACSC Composition by Classification

composition_table <- df_complete %>%
  filter(!is.na(acsc_clasiffication_1)) %>%
  count(acsc_clasiffication_1) %>%
  arrange(desc(n)) %>%
  mutate(
    Percentage = n / sum(n) * 100,
    `Cumulative percentage` = cumsum(Percentage)
  )

kable(composition_table, 
      digits = 2,
      caption = "Table 6: ACSC Composition by GBD Classification",
      col.names = c("Classification", "N", "Percentage (%)", "Cumulative %")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"))
Table 6: ACSC Composition by GBD Classification
Classification N Percentage (%) Cumulative %
658 81.13 81.13
Chronic non-communicable diseases 101 12.45 93.59
Infectious diseases 49 6.04 99.63
Maternal, under-five and nutritional diseases 3 0.37 100.00
top_classifications <- composition_table %>%
  head(10)

ggplot(top_classifications, aes(x = reorder(acsc_clasiffication_1, n), y = n)) +
  geom_col(fill = "#9b59b6", alpha = 0.8) +
  geom_text(aes(label = sprintf("%d (%.1f%%)", n, Percentage)),
            hjust = -0.1, size = 3.5) +
  coord_flip() +
  labs(
    title = "Top 10 ACSC Classifications",
    subtitle = "By number of cases",
    x = "",
    y = "Number of Cases"
  ) +
  theme_minimal() +
  theme(plot.title = element_text(face = "bold", size = 14))
Figure 4: Distribution of ACSC Cases by Classification

Figure 4: Distribution of ACSC Cases by Classification

3.2 Length of Stay

los_table <- df_complete %>%
  group_by(acsc) %>%
  summarise(
    `Mean stay` = mean(los_days, na.rm = TRUE),
    `Median stay` = median(los_days, na.rm = TRUE),
    `Total bed-days` = sum(los_days, na.rm = TRUE),
    N = n(),
    .groups = "drop"
  ) %>%
  mutate(`% bed-days` = (`Total bed-days` / sum(`Total bed-days`)) * 100)

kable(los_table, 
      digits = 2,
      caption = "Table 7: Length of Stay and Bed-Days by Condition Type",
      col.names = c("Type", "Mean Stay", "Median Stay", 
                    "Total Bed-Days", "N", "% Bed-Days")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"))
Table 7: Length of Stay and Bed-Days by Condition Type
Type Mean Stay Median Stay Total Bed-Days N % Bed-Days
Non-ACSC 5.97 3 3925 658 78.44
ACSC 7.05 5 1079 153 21.56
los_plot_data <- df_complete %>%
  filter(!is.na(los_days), los_days < 30)  # Exclude extreme outliers

ggplot(los_plot_data, aes(x = acsc, y = los_days, fill = acsc)) +
  geom_boxplot(alpha = 0.7, outlier.alpha = 0.3) +
  stat_summary(fun = mean, geom = "point", shape = 23, size = 3, 
               fill = "white", color = "black") +
  scale_fill_manual(values = c("#27ae60", "#e74c3c")) +
  labs(
    title = "Length of Stay Distribution",
    subtitle = "White diamond indicates mean. Limited to < 30 days for visualization",
    x = "",
    y = "Days of Stay"
  ) +
  theme_minimal() +
  theme(legend.position = "none",
        plot.title = element_text(face = "bold", size = 14))
Figure 5: Length of Stay Comparison

Figure 5: Length of Stay Comparison

bed_days_plot <- los_table %>%
  ggplot(aes(x = "", y = `% bed-days`, fill = acsc)) +
  geom_col(width = 1, color = "white", size = 2) +
  geom_text(aes(label = sprintf("%.1f%%\n(%s days)", 
                                `% bed-days`, 
                                scales::comma(`Total bed-days`))),
            position = position_stack(vjust = 0.5),
            size = 5, fontface = "bold", color = "white") +
  coord_polar("y") +
  scale_fill_manual(values = c("#27ae60", "#e74c3c")) +
  labs(
    title = "Proportion of Total Bed-Days",
    fill = "Condition Type"
  ) +
  theme_void() +
  theme(plot.title = element_text(face = "bold", size = 14, hjust = 0.5),
        legend.position = "bottom")

print(bed_days_plot)
Figure 6: Proportion of Bed-Days by Condition Type

Figure 6: Proportion of Bed-Days by Condition Type

4. Analysis - ACSC Patients Only

df_acsc <- df_complete %>%
  filter(acsc == "ACSC") %>%
  group_by(primarydiagnosis.x, acsc_clasiffication_1, acsc_clasiffication_2) %>%
  summarise(i = n(), .groups = "drop") %>%
  mutate(
    nobs = sum(i),
    percent = (i / nobs) * 100
  ) %>%
  arrange(desc(percent)) %>%
  mutate(runningpercent = cumsum(percent))

# Display diagnoses >= 3%
top_diagnoses <- df_acsc %>%
  filter(percent >= 3) %>%
  select(primarydiagnosis.x, acsc_clasiffication_1, i, percent, runningpercent)

kable(top_diagnoses, 
      digits = 2,
      caption = "Table 8: Top ACSC Diagnoses (≥ 3% of cases)",
      col.names = c("Primary Diagnosis", "Classification 1", 
                    "N", "Percentage (%)", "Cumulative %")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"))
Table 8: Top ACSC Diagnoses (≥ 3% of cases)
Primary Diagnosis Classification 1 N Percentage (%) Cumulative %
Heart failure, unspecified Chronic non-communicable diseases 29 18.95 18.95
Local infection of skin and subcutaneous tissue, unspecified Infectious diseases 12 7.84 26.80
Other specified cerebrovascular diseases Chronic non-communicable diseases 11 7.19 33.99
Gastrointestinal haemorrhage, unspecified Chronic non-communicable diseases 10 6.54 40.52
Status asthmaticus Chronic non-communicable diseases 7 4.58 45.10
Urinary tract infection, site not specified Infectious diseases 7 4.58 49.67
Non-insulin-dependent diabetes mellitus: With ketoacidosis Chronic non-communicable diseases 6 3.92 53.59
Cutaneous abscess, furuncle and carbuncle of buttock Infectious diseases 5 3.27 56.86
Cutaneous abscess, furuncle and carbuncle of limb Infectious diseases 5 3.27 60.13
top_10_diagnoses <- df_acsc %>%
  head(10)

ggplot(top_10_diagnoses, aes(x = reorder(primarydiagnosis.x, percent), y = percent)) +
  geom_col(fill = "#e74c3c", alpha = 0.8) +
  geom_text(aes(label = sprintf("%.1f%%", percent)),
            hjust = -0.1, size = 3.5) +
  coord_flip() +
  labs(
    title = "Top 10 ACSC Diagnoses",
    subtitle = "By percentage of ACSC cases",
    x = "",
    y = "Percentage of Total ACSC (%)"
  ) +
  theme_minimal() +
  theme(plot.title = element_text(face = "bold", size = 14))
Figure 7: Most Frequent ACSC Diagnoses

Figure 7: Most Frequent ACSC Diagnoses

pareto_data <- df_acsc %>%
  head(15)

ggplot(pareto_data, aes(x = reorder(primarydiagnosis.x, -percent))) +
  geom_col(aes(y = percent), fill = "#e74c3c", alpha = 0.7) +
  geom_line(aes(y = runningpercent, group = 1), 
            color = "#2c3e50", size = 1.2) +
  geom_point(aes(y = runningpercent), 
             color = "#2c3e50", size = 3) +
  geom_hline(yintercept = 80, linetype = "dashed", 
             color = "#34495e", alpha = 0.7) +
  scale_y_continuous(
    name = "Individual Percentage (%)",
    sec.axis = sec_axis(~., name = "Cumulative Percentage (%)")
  ) +
  labs(
    title = "Pareto Analysis - Top 15 ACSC Diagnoses",
    subtitle = "Line shows cumulative percentage (80/20 rule)",
    x = ""
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 9),
    plot.title = element_text(face = "bold", size = 14),
    axis.title.y.right = element_text(color = "#2c3e50"),
    axis.text.y.right = element_text(color = "#2c3e50")
  )
Figure 8: Pareto Chart - ACSC Diagnoses

Figure 8: Pareto Chart - ACSC Diagnoses

5. Executive Summary

summary_stats <- list(
  `Total records analyzed` = nrow(df),
  `Records with complete diagnosis` = nrow(df_complete),
  `Total ACSC cases` = sum(df_complete$acsc == "ACSC"),
  `ACSC prevalence (%)` = mean(df_complete$acsc == "ACSC") * 100,
  `Mean stay ACSC (days)` = mean(df_complete$los_days[df_complete$acsc == "ACSC"], na.rm = TRUE),
  `Mean stay Non-ACSC (days)` = mean(df_complete$los_days[df_complete$acsc != "ACSC"], na.rm = TRUE),
  `% bed-days from ACSC` = (sum(df_complete$los_days[df_complete$acsc == "ACSC"], na.rm = TRUE) / 
                            sum(df_complete$los_days, na.rm = TRUE)) * 100
)

summary_df <- data.frame(
  Indicator = names(summary_stats),
  Value = sapply(summary_stats, function(x) {
    if(is.numeric(x)) sprintf("%.2f", x) else as.character(x)
  })
)

kable(summary_df, 
      caption = "Table 9: Key Performance Indicators Summary",
      col.names = c("Indicator", "Value"),
      row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped", "hover")) %>%
  row_spec(c(4, 7), bold = TRUE, background = "#fff3cd")
Table 9: Key Performance Indicators Summary
Indicator Value
Total records analyzed 971.00
Records with complete diagnosis 811.00
Total ACSC cases 153.00
ACSC prevalence (%) 18.87
Mean stay ACSC (days) 7.05
Mean stay Non-ACSC (days) 5.97
% bed-days from ACSC 21.56