library(haven)
library(dplyr)
library(tidyr)
library(forcats)
library(ggplot2)
library(knitr)
library(kableExtra)
setwd("C:/Users/MMDIAZ/Downloads")# Load data
data1 <- read_dta("Barbados_18Nov25_cleaned.dta")
data2 <- read_dta("Meddata_extraction_cleaned_20251118.dta")
# Merge datasets
df <- inner_join(data1, data2, by = "uid")
# Rename all columns to lowercase
names(df) <- tolower(names(df))
cat("Total records after merge:", nrow(df), "\n")## Total records after merge: 992
# Remove records where primarydiagnosis exists but acsc is missing
records_before <- nrow(df)
dfcleaned <- df %>%
filter(is.na(primarydiagnosis.x) | !is.na(acsc))
records_after <- nrow(dfcleaned)
cat("Records removed due to diagnosis-ACSC inconsistency:",
records_before - records_after, "\n")## Records removed due to diagnosis-ACSC inconsistency: 166
# Age groups
df <- df %>%
mutate(
agegroup = cut(ageatencounterstart,
breaks = c(18, 40, 60, 80, 100),
right = FALSE,
labels = c("18-39", "40-59", "60-79", "80+"))
)
stopifnot(all(!is.na(df$agegroup)))
# Length of stay
df <- df %>%
mutate(los_days = as.numeric(discharge_date.x - admission_date.x))
# Day of week and weekend indicator
df <- df %>%
mutate(
dow = as.integer(format(admission_date.x, "%w")),
weekend = dow %in% c(0, 6),
weekend = factor(weekend, levels = c(FALSE, TRUE),
labels = c("Weekday", "Weekend"))
)
# ACSC factor
df <- df %>%
mutate(acsc = factor(acsc, levels = c(0, 1), labels = c("Non-ACSC", "ACSC")))# Variable to mark records for exclusion
df <- df %>%
mutate(todrop = 0)
# Exclusion criteria
df <- df %>%
mutate(
todrop = ifelse(admission_date.x <= as.Date("2023-05-01"), 1, todrop),
todrop = ifelse(los_days >= 60, 2, todrop)
)
# Exclusion table
exclusion_table <- df %>%
count(todrop) %>%
mutate(
Reason = case_when(
todrop == 0 ~ "Included",
todrop == 1 ~ "Admission before May 01, 2023",
todrop == 2 ~ "Length of stay >= 60 days"
)
) %>%
select(Reason, n)
kable(exclusion_table,
caption = "Table 1: Exclusion Criteria",
col.names = c("Criterion", "N")) %>%
kable_styling(bootstrap_options = c("striped", "hover"))| Criterion | N |
|---|---|
| Included | 971 |
| Length of stay >= 60 days | 21 |
# Identify missing diagnoses
df <- df %>%
mutate(missdiag = primarydiagnosis.y == "" | is.na(primarydiagnosis.y))
# Overall summary
missing_summary <- df %>%
summarise(
`Missing proportion` = mean(missdiag),
`Total missing` = sum(missdiag),
`Total records` = n()
)
kable(missing_summary,
digits = 3,
caption = "Table 2: Summary of Missing Diagnoses") %>%
kable_styling(bootstrap_options = c("striped", "hover"))| Missing proportion | Total missing | Total records |
|---|---|---|
| 0.165 | 160 | 971 |
# Comparison by weekend
missing_by_weekend <- df %>%
group_by(weekend) %>%
summarise(
`Missing proportion` = mean(missdiag),
`N missing` = sum(missdiag),
`N total` = n()
)
kable(missing_by_weekend,
digits = 3,
caption = "Table 3: Missing Diagnoses by Day Type") %>%
kable_styling(bootstrap_options = c("striped", "hover"))| weekend | Missing proportion | N missing | N total |
|---|---|---|---|
| Weekday | 0.163 | 117 | 719 |
| Weekend | 0.171 | 43 | 252 |
ggplot(missing_by_weekend, aes(x = weekend, y = `Missing proportion`, fill = weekend)) +
geom_col(alpha = 0.8) +
geom_text(aes(label = scales::percent(`Missing proportion`, accuracy = 0.1)),
vjust = -0.5, size = 4) +
scale_y_continuous(labels = scales::percent, limits = c(0, NA)) +
scale_fill_manual(values = c("#3498db", "#e74c3c")) +
labs(
title = "Proportion of Missing Diagnoses",
subtitle = "Comparison between weekdays and weekends",
x = "",
y = "Proportion"
) +
theme_minimal() +
theme(legend.position = "none",
plot.title = element_text(face = "bold", size = 14))Figure 1: Proportion of Missing Diagnoses by Day Type
# Convert ACSC to numeric
df <- df %>%
mutate(
acsc_numeric = as.numeric(as.factor(acsc)) - 1,
acsc100 = acsc_numeric * 100
)
# Upper and lower bounds
df <- df %>%
mutate(
acsc_upper = ifelse(is.na(acsc100), 100, acsc100),
acsc_lower = ifelse(is.na(acsc100), 0, acsc100)
)
# Prevalence by day type
prevalence_table <- df %>%
group_by(weekend) %>%
summarise(
`ACSC prevalence` = mean(acsc100, na.rm = TRUE),
`Lower bound` = mean(acsc_lower),
`Upper bound` = mean(acsc_upper),
N = n()
) %>%
bind_rows(
df %>%
summarise(
weekend = "TOTAL",
`ACSC prevalence` = mean(acsc100, na.rm = TRUE),
`Lower bound` = mean(acsc_lower),
`Upper bound` = mean(acsc_upper),
N = n()
)
)
kable(prevalence_table,
digits = 2,
caption = "Table 4: ACSC Prevalence (%) by Day Type",
col.names = c("Period", "Prevalence", "Lower Bound", "Upper Bound", "N")) %>%
kable_styling(bootstrap_options = c("striped", "hover")) %>%
row_spec(nrow(prevalence_table), bold = TRUE, background = "#f0f0f0")| Period | Prevalence | Lower Bound | Upper Bound | N |
|---|---|---|---|---|
| Weekday | 17.77 | 14.88 | 31.15 | 719 |
| Weekend | 22.01 | 18.25 | 35.32 | 252 |
| TOTAL | 18.87 | 15.76 | 32.23 | 971 |
prevalence_plot_data <- prevalence_table %>%
filter(weekend != "TOTAL")
ggplot(prevalence_plot_data, aes(x = weekend, y = `ACSC prevalence`)) +
geom_col(aes(fill = weekend), alpha = 0.7, width = 0.6) +
geom_errorbar(aes(ymin = `Lower bound`, ymax = `Upper bound`),
width = 0.2, size = 1) +
geom_text(aes(label = sprintf("%.1f%%", `ACSC prevalence`)),
vjust = -1.5, size = 4, fontface = "bold") +
scale_fill_manual(values = c("#3498db", "#e74c3c")) +
labs(
title = "ACSC Prevalence by Day Type",
subtitle = "Error bars show lower and upper bounds",
x = "",
y = "Prevalence (%)"
) +
theme_minimal() +
theme(legend.position = "none",
plot.title = element_text(face = "bold", size = 14))Figure 2: ACSC Prevalence with Uncertainty Intervals
prevalence_age_sex <- df %>%
group_by(agegroup, sex) %>%
summarise(mean_acsc100 = mean(acsc100, na.rm = TRUE), .groups = "drop") %>%
pivot_wider(names_from = sex, values_from = mean_acsc100)
kable(prevalence_age_sex,
digits = 2,
caption = "Table 5: ACSC Prevalence (%) by Age Group and Sex",
col.names = c("Age Group", "Female", "Male")) %>%
kable_styling(bootstrap_options = c("striped", "hover"))| Age Group | Female | Male |
|---|---|---|
| 18-39 | 6.35 | 11.36 |
| 40-59 | 17.65 | 31.34 |
| 60-79 | 31.25 | 30.97 |
| 80+ | 37.93 | 27.59 |
prevalence_age_sex_long <- df %>%
filter(!is.na(acsc100)) %>%
group_by(agegroup, sex) %>%
summarise(mean_acsc100 = mean(acsc100, na.rm = TRUE), .groups = "drop")
ggplot(prevalence_age_sex_long, aes(x = agegroup, y = mean_acsc100, fill = sex)) +
geom_col(position = "dodge", alpha = 0.8) +
geom_text(aes(label = sprintf("%.1f%%", mean_acsc100)),
position = position_dodge(width = 0.9),
vjust = -0.5, size = 3.5) +
scale_fill_manual(
values = c("F" = "#e91e63", "M" = "#2196f3"),
labels = c("F" = "Female", "M" = "Male"),
name = "Sex" # Agregar nombre explÃcito
) +
labs(
title = "ACSC Prevalence by Age Group and Sex",
x = "Age Group",
y = "Prevalence (%)"
) +
theme_minimal() +
theme(
plot.title = element_text(face = "bold", size = 14),
legend.position = "top"
)Figure 3: ACSC Prevalence by Age and Sex
df_complete <- df %>%
filter(trimws(primarydiagnosis.x) != "")
stopifnot(all(!is.na(df_complete$acsc)))
cat("Total patients with complete diagnosis:", nrow(df_complete), "\n")## Total patients with complete diagnosis: 811
## Proportion of total: 84%
composition_table <- df_complete %>%
filter(!is.na(acsc_clasiffication_1)) %>%
count(acsc_clasiffication_1) %>%
arrange(desc(n)) %>%
mutate(
Percentage = n / sum(n) * 100,
`Cumulative percentage` = cumsum(Percentage)
)
kable(composition_table,
digits = 2,
caption = "Table 6: ACSC Composition by GBD Classification",
col.names = c("Classification", "N", "Percentage (%)", "Cumulative %")) %>%
kable_styling(bootstrap_options = c("striped", "hover"))| Classification | N | Percentage (%) | Cumulative % |
|---|---|---|---|
| 658 | 81.13 | 81.13 | |
| Chronic non-communicable diseases | 101 | 12.45 | 93.59 |
| Infectious diseases | 49 | 6.04 | 99.63 |
| Maternal, under-five and nutritional diseases | 3 | 0.37 | 100.00 |
top_classifications <- composition_table %>%
head(10)
ggplot(top_classifications, aes(x = reorder(acsc_clasiffication_1, n), y = n)) +
geom_col(fill = "#9b59b6", alpha = 0.8) +
geom_text(aes(label = sprintf("%d (%.1f%%)", n, Percentage)),
hjust = -0.1, size = 3.5) +
coord_flip() +
labs(
title = "Top 10 ACSC Classifications",
subtitle = "By number of cases",
x = "",
y = "Number of Cases"
) +
theme_minimal() +
theme(plot.title = element_text(face = "bold", size = 14))Figure 4: Distribution of ACSC Cases by Classification
los_table <- df_complete %>%
group_by(acsc) %>%
summarise(
`Mean stay` = mean(los_days, na.rm = TRUE),
`Median stay` = median(los_days, na.rm = TRUE),
`Total bed-days` = sum(los_days, na.rm = TRUE),
N = n(),
.groups = "drop"
) %>%
mutate(`% bed-days` = (`Total bed-days` / sum(`Total bed-days`)) * 100)
kable(los_table,
digits = 2,
caption = "Table 7: Length of Stay and Bed-Days by Condition Type",
col.names = c("Type", "Mean Stay", "Median Stay",
"Total Bed-Days", "N", "% Bed-Days")) %>%
kable_styling(bootstrap_options = c("striped", "hover"))| Type | Mean Stay | Median Stay | Total Bed-Days | N | % Bed-Days |
|---|---|---|---|---|---|
| Non-ACSC | 5.97 | 3 | 3925 | 658 | 78.44 |
| ACSC | 7.05 | 5 | 1079 | 153 | 21.56 |
los_plot_data <- df_complete %>%
filter(!is.na(los_days), los_days < 30) # Exclude extreme outliers
ggplot(los_plot_data, aes(x = acsc, y = los_days, fill = acsc)) +
geom_boxplot(alpha = 0.7, outlier.alpha = 0.3) +
stat_summary(fun = mean, geom = "point", shape = 23, size = 3,
fill = "white", color = "black") +
scale_fill_manual(values = c("#27ae60", "#e74c3c")) +
labs(
title = "Length of Stay Distribution",
subtitle = "White diamond indicates mean. Limited to < 30 days for visualization",
x = "",
y = "Days of Stay"
) +
theme_minimal() +
theme(legend.position = "none",
plot.title = element_text(face = "bold", size = 14))Figure 5: Length of Stay Comparison
bed_days_plot <- los_table %>%
ggplot(aes(x = "", y = `% bed-days`, fill = acsc)) +
geom_col(width = 1, color = "white", size = 2) +
geom_text(aes(label = sprintf("%.1f%%\n(%s days)",
`% bed-days`,
scales::comma(`Total bed-days`))),
position = position_stack(vjust = 0.5),
size = 5, fontface = "bold", color = "white") +
coord_polar("y") +
scale_fill_manual(values = c("#27ae60", "#e74c3c")) +
labs(
title = "Proportion of Total Bed-Days",
fill = "Condition Type"
) +
theme_void() +
theme(plot.title = element_text(face = "bold", size = 14, hjust = 0.5),
legend.position = "bottom")
print(bed_days_plot)Figure 6: Proportion of Bed-Days by Condition Type
df_acsc <- df_complete %>%
filter(acsc == "ACSC") %>%
group_by(primarydiagnosis.x, acsc_clasiffication_1, acsc_clasiffication_2) %>%
summarise(i = n(), .groups = "drop") %>%
mutate(
nobs = sum(i),
percent = (i / nobs) * 100
) %>%
arrange(desc(percent)) %>%
mutate(runningpercent = cumsum(percent))
# Display diagnoses >= 3%
top_diagnoses <- df_acsc %>%
filter(percent >= 3) %>%
select(primarydiagnosis.x, acsc_clasiffication_1, i, percent, runningpercent)
kable(top_diagnoses,
digits = 2,
caption = "Table 8: Top ACSC Diagnoses (≥ 3% of cases)",
col.names = c("Primary Diagnosis", "Classification 1",
"N", "Percentage (%)", "Cumulative %")) %>%
kable_styling(bootstrap_options = c("striped", "hover"))| Primary Diagnosis | Classification 1 | N | Percentage (%) | Cumulative % |
|---|---|---|---|---|
| Heart failure, unspecified | Chronic non-communicable diseases | 29 | 18.95 | 18.95 |
| Local infection of skin and subcutaneous tissue, unspecified | Infectious diseases | 12 | 7.84 | 26.80 |
| Other specified cerebrovascular diseases | Chronic non-communicable diseases | 11 | 7.19 | 33.99 |
| Gastrointestinal haemorrhage, unspecified | Chronic non-communicable diseases | 10 | 6.54 | 40.52 |
| Status asthmaticus | Chronic non-communicable diseases | 7 | 4.58 | 45.10 |
| Urinary tract infection, site not specified | Infectious diseases | 7 | 4.58 | 49.67 |
| Non-insulin-dependent diabetes mellitus: With ketoacidosis | Chronic non-communicable diseases | 6 | 3.92 | 53.59 |
| Cutaneous abscess, furuncle and carbuncle of buttock | Infectious diseases | 5 | 3.27 | 56.86 |
| Cutaneous abscess, furuncle and carbuncle of limb | Infectious diseases | 5 | 3.27 | 60.13 |
top_10_diagnoses <- df_acsc %>%
head(10)
ggplot(top_10_diagnoses, aes(x = reorder(primarydiagnosis.x, percent), y = percent)) +
geom_col(fill = "#e74c3c", alpha = 0.8) +
geom_text(aes(label = sprintf("%.1f%%", percent)),
hjust = -0.1, size = 3.5) +
coord_flip() +
labs(
title = "Top 10 ACSC Diagnoses",
subtitle = "By percentage of ACSC cases",
x = "",
y = "Percentage of Total ACSC (%)"
) +
theme_minimal() +
theme(plot.title = element_text(face = "bold", size = 14))Figure 7: Most Frequent ACSC Diagnoses
pareto_data <- df_acsc %>%
head(15)
ggplot(pareto_data, aes(x = reorder(primarydiagnosis.x, -percent))) +
geom_col(aes(y = percent), fill = "#e74c3c", alpha = 0.7) +
geom_line(aes(y = runningpercent, group = 1),
color = "#2c3e50", size = 1.2) +
geom_point(aes(y = runningpercent),
color = "#2c3e50", size = 3) +
geom_hline(yintercept = 80, linetype = "dashed",
color = "#34495e", alpha = 0.7) +
scale_y_continuous(
name = "Individual Percentage (%)",
sec.axis = sec_axis(~., name = "Cumulative Percentage (%)")
) +
labs(
title = "Pareto Analysis - Top 15 ACSC Diagnoses",
subtitle = "Line shows cumulative percentage (80/20 rule)",
x = ""
) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1, size = 9),
plot.title = element_text(face = "bold", size = 14),
axis.title.y.right = element_text(color = "#2c3e50"),
axis.text.y.right = element_text(color = "#2c3e50")
)Figure 8: Pareto Chart - ACSC Diagnoses
summary_stats <- list(
`Total records analyzed` = nrow(df),
`Records with complete diagnosis` = nrow(df_complete),
`Total ACSC cases` = sum(df_complete$acsc == "ACSC"),
`ACSC prevalence (%)` = mean(df_complete$acsc == "ACSC") * 100,
`Mean stay ACSC (days)` = mean(df_complete$los_days[df_complete$acsc == "ACSC"], na.rm = TRUE),
`Mean stay Non-ACSC (days)` = mean(df_complete$los_days[df_complete$acsc != "ACSC"], na.rm = TRUE),
`% bed-days from ACSC` = (sum(df_complete$los_days[df_complete$acsc == "ACSC"], na.rm = TRUE) /
sum(df_complete$los_days, na.rm = TRUE)) * 100
)
summary_df <- data.frame(
Indicator = names(summary_stats),
Value = sapply(summary_stats, function(x) {
if(is.numeric(x)) sprintf("%.2f", x) else as.character(x)
})
)
kable(summary_df,
caption = "Table 9: Key Performance Indicators Summary",
col.names = c("Indicator", "Value"),
row.names = FALSE) %>%
kable_styling(bootstrap_options = c("striped", "hover")) %>%
row_spec(c(4, 7), bold = TRUE, background = "#fff3cd")| Indicator | Value |
|---|---|
| Total records analyzed | 971.00 |
| Records with complete diagnosis | 811.00 |
| Total ACSC cases | 153.00 |
| ACSC prevalence (%) | 18.87 |
| Mean stay ACSC (days) | 7.05 |
| Mean stay Non-ACSC (days) | 5.97 |
| % bed-days from ACSC | 21.56 |