1 Package Installation and Loading

# ── STEP 1: Install CRAN packages if missing ─────────────────────────────────
cran_packages <- c(
  "readxl",
  "tidyverse",
  "janitor",
  "skimr",
  "naniar",
  "mice",
  "VIM",
  "tableone",
  "gtsummary",
  "flextable",
  "gt",
  "ggpubr",
  "corrplot",
  "car",
  "lme4",
  "epitools",
  "epiR",
  "survey",
  "srvyr",
  "broom",
  "pROC",
  "ResourceSelection",
  "MASS",
  "nnet",
  "Hmisc",
  "psych",
  "nortest",
  "rstatix",
  "ggcorrplot",
  "patchwork",
  "scales",
  "kableExtra",
  "knitr",
  "rmarkdown"
)

new_cran <- cran_packages[!cran_packages %in% rownames(installed.packages())]
if (length(new_cran) > 0) {
  install.packages(new_cran, dependencies = TRUE,
                   repos = "https://cran.rstudio.com/")
}

# ── STEP 2: Install ggmosaic from GitHub (not reliably on CRAN for R 4.6) ───
if (!requireNamespace("ggmosaic", quietly = TRUE)) {
  if (!requireNamespace("remotes", quietly = TRUE))
    install.packages("remotes", repos = "https://cran.rstudio.com/")
  remotes::install_github("haleyjeppson/ggmosaic")
}

# ── STEP 3: Load packages — flextable loaded LAST to avoid masking ───────────
# Load all CRAN packages
invisible(lapply(cran_packages, library, character.only = TRUE))

# Load ggmosaic
library(ggmosaic)

# Reload flextable last so its border/font/rotate are NOT masked by ggpubr
library(flextable)

cat("R version:", R.version$version.string, "\n")
## R version: R version 4.6.0 (2026-04-24)
cat("All packages loaded successfully.\n")
## All packages loaded successfully.

2 Data Import and Initial Inspection

# ── SAFE IMPORT: rename columns by POSITION to avoid special-character issues ─
# The Excel file has these 12 columns in order:
# PATH NO. | NAME | SEX | AGE | PULSE (bpm) | BLOOD PRESSURE (mmHg) |
# BLOOD GLUCOSE (mmol/L) | HEPATITIS B (HBsAg | HEPATITIS C (HCV) |
# HIV(Retr) | SYPHILIS(VDRL) | CONTACT

# ── Update this path to match where your file is saved ───────────────────────
data_path <- '/Users/emmanuelsmac/Desktop/DAMPONG complete.xlsx'   # <-- change if needed

raw <- read_excel(data_path, sheet = 1)

# Rename all 12 columns by position — bypasses ALL special character problems
names(raw) <- c(
  "path_no",     # PATH NO.
  "name",        # NAME
  "sex",         # SEX
  "age",         # AGE
  "pulse_bpm",   # PULSE (bpm)
  "bp_mmhg",     # BLOOD PRESSURE (mmHg)
  "glucose_mmol",# BLOOD GLUCOSE (mmol/L)
  "hep_b",       # HEPATITIS B (HBsAg   [note: missing closing bracket in source]
  "hep_c",       # HEPATITIS C (HCV)
  "hiv",         # HIV(Retr)
  "syphilis",    # SYPHILIS(VDRL)
  "contact"      # CONTACT
)

cat("Import successful.\n")
## Import successful.
cat("Dimensions:", nrow(raw), "rows x", ncol(raw), "columns\n")
## Dimensions: 524 rows x 12 columns
cat("\nColumn names confirmed:\n")
## 
## Column names confirmed:
print(names(raw))
##  [1] "path_no"      "name"         "sex"          "age"          "pulse_bpm"   
##  [6] "bp_mmhg"      "glucose_mmol" "hep_b"        "hep_c"        "hiv"         
## [11] "syphilis"     "contact"
cat("\nFirst 5 rows:\n")
## 
## First 5 rows:
print(head(raw, 5))
## # A tibble: 5 × 12
##   path_no name      sex     age pulse_bpm bp_mmhg glucose_mmol hep_b hep_c hiv  
##   <chr>   <chr>     <chr> <dbl>     <dbl> <chr>          <dbl> <chr> <chr> <chr>
## 1 DAM0001 GLADYS A… FEMA…    60        68 150/94           5.6 NEGA… NEGA… NON …
## 2 DAM0002 ALFRED D… MALE     59        92 154/82          15.4 NEGA… NEGA… NON …
## 3 DAM0003 AGNES AS… FEMA…    82        96 141/88           5.2 <NA>  <NA>  <NA> 
## 4 DAM0004 AKUA SER… FEMA…    62        66 128/77           6   NEGA… NEGA… NON …
## 5 DAM0005 CECILIA … FEMA…    64        80 164/91          11.1 NEGA… NEGA… NON …
## # ℹ 2 more variables: syphilis <chr>, contact <chr>
# ── Comprehensive summary ─────────────────────────────────────────────────────
skim(raw)
Data summary
Name raw
Number of rows 524
Number of columns 12
_______________________
Column type frequency:
character 9
numeric 3
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
path_no 172 0.67 6 7 0 352 0
name 0 1.00 6 26 0 523 0
sex 0 1.00 4 6 0 2 0
bp_mmhg 129 0.75 4 7 0 358 0
hep_b 3 0.99 8 8 0 2 0
hep_c 3 0.99 8 8 0 2 0
hiv 4 0.99 8 12 0 2 0
syphilis 2 1.00 8 12 0 2 0
contact 238 0.55 9 10 0 272 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
age 6 0.99 45.01 24.78 1.0 18.00 48 65.0 97.0 ▇▃▇▇▂
pulse_bpm 129 0.75 80.99 13.34 48.0 71.00 80 89.0 124.0 ▂▇▇▃▁
glucose_mmol 212 0.60 6.46 2.58 3.1 4.97 6 7.1 24.3 ▇▂▁▁▁

3 Data Cleaning and Feature Engineering

# ── STEP 1: Standardise text fields ──────────────────────────────────────────
df <- raw %>%
  mutate(
    sex      = str_trim(str_to_upper(as.character(sex))),
    hep_b    = str_trim(str_to_upper(as.character(hep_b))),
    hep_c    = str_trim(str_to_upper(as.character(hep_c))),
    hiv      = str_trim(str_to_upper(as.character(hiv))),
    syphilis = str_trim(str_to_upper(as.character(syphilis)))
  )

# ── STEP 2: Parse blood pressure robustly ────────────────────────────────────
# Handles malformed entries like "124/" (row 399 in this dataset)
df <- df %>%
  mutate(
    bp_clean  = str_trim(as.character(bp_mmhg)),
    systolic  = suppressWarnings(
                  as.numeric(str_extract(bp_clean, "^[0-9]+"))),
    diastolic = suppressWarnings(
                  as.numeric(str_extract(bp_clean, "(?<=/)[0-9]+")))
  )

# Report any malformed BP entries
bp_malformed <- df %>%
  filter(!is.na(bp_clean), bp_clean != "NA",
         (is.na(systolic) | is.na(diastolic))) %>%
  select(path_no, name, bp_mmhg)

cat("Malformed BP entries:", nrow(bp_malformed), "\n")
## Malformed BP entries: 1
if (nrow(bp_malformed) > 0) print(bp_malformed)
## # A tibble: 1 × 3
##   path_no name             bp_mmhg
##   <chr>   <chr>            <chr>  
## 1 <NA>    ELIZABETH MENSAH 124/
# ── STEP 3: Clinical categories ──────────────────────────────────────────────
df <- df %>%
  mutate(

    # Age groups
    age_group = cut(age,
                    breaks = c(0, 18, 35, 50, 65, Inf),
                    labels = c("0-18","19-35","36-50","51-65","65+"),
                    right  = TRUE),

    # BP categories (ACC/AHA 2017)
    bp_category = case_when(
      is.na(systolic) | is.na(diastolic)            ~ NA_character_,
      systolic < 120 & diastolic < 80               ~ "Normal",
      systolic >= 120 & systolic <= 129 &
        diastolic < 80                              ~ "Elevated",
      (systolic >= 130 & systolic <= 139) |
        (diastolic >= 80 & diastolic <= 89)         ~ "Stage 1 HTN",
      systolic >= 140 | diastolic >= 90             ~ "Stage 2 HTN",
      TRUE                                          ~ NA_character_
    ),
    bp_category = factor(bp_category,
                         levels = c("Normal","Elevated",
                                    "Stage 1 HTN","Stage 2 HTN")),

    # Hypertension binary flag
    hypertension = case_when(
      is.na(systolic)                     ~ NA_integer_,
      systolic >= 130 | diastolic >= 80   ~ 1L,
      TRUE                                ~ 0L
    ),

    # Hypertensive crisis
    htn_crisis = case_when(
      is.na(systolic)                      ~ NA_integer_,
      systolic > 180 | diastolic > 120    ~ 1L,
      TRUE                                ~ 0L
    ),

    # Glucose categories (WHO/ADA)
    glucose_category = case_when(
      is.na(glucose_mmol)                            ~ NA_character_,
      glucose_mmol < 3.9                             ~ "Hypoglycaemia",
      glucose_mmol >= 3.9 & glucose_mmol < 5.6      ~ "Normal",
      glucose_mmol >= 5.6 & glucose_mmol < 7.0      ~ "Pre-diabetes",
      glucose_mmol >= 7.0                            ~ "Diabetes",
      TRUE                                           ~ NA_character_
    ),
    glucose_category = factor(glucose_category,
                              levels = c("Hypoglycaemia","Normal",
                                         "Pre-diabetes","Diabetes")),

    # Diabetes binary flag
    diabetes = case_when(
      is.na(glucose_mmol) ~ NA_integer_,
      glucose_mmol >= 7.0 ~ 1L,
      TRUE                ~ 0L
    ),

    # Pulse categories
    pulse_category = case_when(
      is.na(pulse_bpm)                      ~ NA_character_,
      pulse_bpm < 60                        ~ "Bradycardia",
      pulse_bpm >= 60 & pulse_bpm <= 100    ~ "Normal",
      pulse_bpm > 100                       ~ "Tachycardia",
      TRUE                                  ~ NA_character_
    ),
    pulse_category = factor(pulse_category,
                             levels = c("Bradycardia","Normal","Tachycardia")),

    # Communicable disease flags
    hep_b_pos    = case_when(
      is.na(hep_b)         ~ NA_integer_,
      hep_b == "POSITIVE"  ~ 1L,
      TRUE                 ~ 0L
    ),
    hep_c_pos    = case_when(
      is.na(hep_c)         ~ NA_integer_,
      hep_c == "POSITIVE"  ~ 1L,
      TRUE                 ~ 0L
    ),
    hiv_pos      = case_when(
      is.na(hiv)           ~ NA_integer_,
      hiv == "REACTIVE"    ~ 1L,
      TRUE                 ~ 0L
    ),
    syphilis_pos = case_when(
      is.na(syphilis)        ~ NA_integer_,
      syphilis == "REACTIVE" ~ 1L,
      TRUE                   ~ 0L
    ),

    # Any communicable disease
    any_comm_disease = case_when(
      hep_b_pos == 1 | hep_c_pos == 1 |
        hiv_pos == 1 | syphilis_pos == 1    ~ 1L,
      !is.na(hep_b_pos) & !is.na(hep_c_pos) &
        !is.na(hiv_pos) & !is.na(syphilis_pos) ~ 0L,
      TRUE                                  ~ NA_integer_
    ),

    # Any NCD
    any_ncd = case_when(
      hypertension == 1 | diabetes == 1               ~ 1L,
      !is.na(hypertension) & !is.na(diabetes)         ~ 0L,
      TRUE                                            ~ NA_integer_
    ),

    # HTN + DM comorbidity
    htn_dm_comorbid = case_when(
      hypertension == 1 & diabetes == 1               ~ 1L,
      !is.na(hypertension) & !is.na(diabetes)         ~ 0L,
      TRUE                                            ~ NA_integer_
    )
  )

cat("Data cleaning complete.\n")
## Data cleaning complete.
cat("Final dimensions:", nrow(df), "rows x", ncol(df), "columns\n")
## Final dimensions: 524 rows x 29 columns
cat("\nKey variable check:\n")
## 
## Key variable check:
cat("  sex values:     ", paste(unique(df$sex[!is.na(df$sex)]), collapse=", "), "\n")
##   sex values:      FEMALE, MALE
cat("  hep_b values:   ", paste(unique(df$hep_b[!is.na(df$hep_b)]), collapse=", "), "\n")
##   hep_b values:    NEGATIVE, POSITIVE
cat("  hiv values:     ", paste(unique(df$hiv[!is.na(df$hiv)]), collapse=", "), "\n")
##   hiv values:      NON REACTIVE, REACTIVE
cat("  syphilis values:", paste(unique(df$syphilis[!is.na(df$syphilis)]), collapse=", "), "\n")
##   syphilis values: REACTIVE, NON REACTIVE

4 Missing Data Analysis

miss_vars <- df %>%
  select(age, pulse_bpm, systolic, diastolic,
         glucose_mmol, hep_b, hep_c, hiv, syphilis)

miss_var_summary(miss_vars) %>%
  kable(caption = "Table M1: Missing Data Summary",
        col.names = c("Variable","N Missing","% Missing"),
        digits = 1) %>%
  kable_styling(bootstrap_options = c("striped","hover","condensed"),
                full_width = FALSE)
Table M1: Missing Data Summary
Variable N Missing % Missing
glucose_mmol 212 40.5
diastolic 130 24.8
pulse_bpm 129 24.6
systolic 129 24.6
age 6 1.15
hiv 4 0.763
hep_b 3 0.573
hep_c 3 0.573
syphilis 2 0.382
gg_miss_upset(miss_vars, nsets = 8, nintersects = 20)
Figure M1: Missing Data Pattern

Figure M1: Missing Data Pattern

vis_miss(miss_vars, sort_miss = TRUE) +
  labs(title = "Missing Data Heatmap",
       subtitle = "Dampong Medical Outreach, Ashanti Region, Ghana")
Figure M2: Missing Data Heatmap

Figure M2: Missing Data Heatmap

mcar_result <- mcar_test(miss_vars)
cat("Little's MCAR Test:\n")
## Little's MCAR Test:
cat("  Chi-square:", round(mcar_result$statistic, 2), "\n")
##   Chi-square: 348.24
cat("  df:", mcar_result$df, "\n")
##   df: 60
cat("  p-value:", format.pval(mcar_result$p.value, digits = 4), "\n")
##   p-value: < 2.2e-16
if (mcar_result$p.value < 0.05) {
  cat("  Result: NOT missing completely at random (MAR likely).\n")
  cat("  Action: Multiple Imputation applied in Section 9.\n")
} else {
  cat("  Result: Cannot reject MCAR.\n")
}
##   Result: NOT missing completely at random (MAR likely).
##   Action: Multiple Imputation applied in Section 9.
df %>%
  group_by(sex) %>%
  summarise(
    n             = n(),
    miss_glucose  = sum(is.na(glucose_mmol)),
    pct_glucose   = round(100 * miss_glucose / n, 1),
    miss_bp       = sum(is.na(systolic)),
    pct_bp        = round(100 * miss_bp / n, 1),
    .groups       = "drop"
  ) %>%
  kable(caption = "Table M2: Missingness by Sex") %>%
  kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE)
Table M2: Missingness by Sex
sex n miss_glucose pct_glucose miss_bp pct_bp
FEMALE 380 126 33.2 70 18.4
MALE 144 86 59.7 59 41.0

5 Descriptive Statistics

vars_t1 <- c("age","age_group","pulse_bpm","pulse_category",
             "systolic","diastolic","bp_category",
             "glucose_mmol","glucose_category",
             "hep_b_pos","hep_c_pos","hiv_pos","syphilis_pos",
             "hypertension","diabetes","any_comm_disease",
             "any_ncd","htn_dm_comorbid")

cat_vars <- c("age_group","pulse_category","bp_category","glucose_category",
              "hep_b_pos","hep_c_pos","hiv_pos","syphilis_pos",
              "hypertension","diabetes","any_comm_disease",
              "any_ncd","htn_dm_comorbid")

tab1 <- CreateTableOne(
  vars       = vars_t1,
  data       = df,
  factorVars = cat_vars,
  strata     = "sex",
  addOverall = TRUE
)

print(tab1, showAllLevels = TRUE, quote = FALSE,
      noSpaces = TRUE, printToggle = FALSE) %>%
  kable(caption = "Table 1: Baseline Characteristics by Sex") %>%
  kable_styling(bootstrap_options = c("striped","hover","condensed"),
                full_width = TRUE, font_size = 11)
Table 1: Baseline Characteristics by Sex
level Overall FEMALE MALE p test
n 524 380 144
age (mean (SD)) 45.01 (24.78) 47.52 (23.39) 38.49 (27.11) <0.001
age_group (%) 0-18 132 (25.5) 72 (19.3) 60 (41.7) <0.001
19-35 54 (10.4) 45 (12.0) 9 (6.2)
36-50 92 (17.8) 72 (19.3) 20 (13.9)
51-65 113 (21.8) 90 (24.1) 23 (16.0)
65+ 127 (24.5) 95 (25.4) 32 (22.2)
pulse_bpm (mean (SD)) 80.99 (13.34) 81.46 (13.22) 79.28 (13.73) 0.182
pulse_category (%) Bradycardia 11 (2.8) 8 (2.6) 3 (3.5) 0.857
Normal 348 (88.1) 273 (88.1) 75 (88.2)
Tachycardia 36 (9.1) 29 (9.4) 7 (8.2)
systolic (mean (SD)) 136.17 (26.11) 136.23 (26.35) 135.93 (25.37) 0.925
diastolic (mean (SD)) 76.97 (14.85) 77.18 (15.33) 76.19 (13.03) 0.585
bp_category (%) Normal 106 (26.9) 83 (26.9) 23 (27.1) 0.719
Elevated 52 (13.2) 41 (13.3) 11 (12.9)
Stage 1 HTN 136 (34.5) 103 (33.3) 33 (38.8)
Stage 2 HTN 100 (25.4) 82 (26.5) 18 (21.2)
glucose_mmol (mean (SD)) 6.46 (2.58) 6.52 (2.65) 6.21 (2.27) 0.411
glucose_category (%) Hypoglycaemia 7 (2.2) 5 (2.0) 2 (3.4) 0.459
Normal 126 (40.4) 98 (38.6) 28 (48.3)
Pre-diabetes 95 (30.4) 80 (31.5) 15 (25.9)
Diabetes 84 (26.9) 71 (28.0) 13 (22.4)
hep_b_pos (%) 0 503 (96.5) 364 (96.3) 139 (97.2) 0.813
1 18 (3.5) 14 (3.7) 4 (2.8)
hep_c_pos (%) 0 519 (99.6) 376 (99.5) 143 (100.0) 0.938
1 2 (0.4) 2 (0.5) 0 (0.0)
hiv_pos (%) 0 515 (99.0) 374 (98.9) 141 (99.3) 1.000
1 5 (1.0) 4 (1.1) 1 (0.7)
syphilis_pos (%) 0 517 (99.0) 376 (99.2) 141 (98.6) 0.896
1 5 (1.0) 3 (0.8) 2 (1.4)
hypertension (%) 0 159 (40.3) 125 (40.3) 34 (40.0) 1.000
1 236 (59.7) 185 (59.7) 51 (60.0)
diabetes (%) 0 228 (73.1) 183 (72.0) 45 (77.6) 0.488
1 84 (26.9) 71 (28.0) 13 (22.4)
any_comm_disease (%) 0 490 (94.4) 355 (94.2) 135 (95.1) 0.852
1 29 (5.6) 22 (5.8) 7 (4.9)
any_ncd (%) 0 81 (23.5) 68 (24.7) 13 (18.8) 0.383
1 263 (76.5) 207 (75.3) 56 (81.2)
htn_dm_comorbid (%) 0 250 (81.4) 201 (80.4) 49 (86.0) 0.432
1 57 (18.6) 49 (19.6) 8 (14.0)
df %>%
  select(sex, age, age_group, pulse_bpm, systolic, diastolic,
         glucose_mmol, bp_category, glucose_category,
         hypertension, diabetes, hep_b_pos, hep_c_pos,
         hiv_pos, syphilis_pos, any_comm_disease, any_ncd) %>%
  tbl_summary(
    by = sex,
    statistic = list(
      all_continuous()  ~ "{mean} ({sd}) [{min}, {max}]",
      all_categorical() ~ "{n} ({p}%)"
    ),
    missing_text = "Missing",
    label = list(
      age              ~ "Age (years)",
      age_group        ~ "Age Group",
      pulse_bpm        ~ "Pulse (bpm)",
      systolic         ~ "Systolic BP (mmHg)",
      diastolic        ~ "Diastolic BP (mmHg)",
      glucose_mmol     ~ "Blood Glucose (mmol/L)",
      bp_category      ~ "BP Category (ACC/AHA 2017)",
      glucose_category ~ "Glucose Category (WHO/ADA)",
      hypertension     ~ "Hypertension (Stage 1+2)",
      diabetes         ~ "Diabetes (≥7.0 mmol/L)",
      hep_b_pos        ~ "Hepatitis B Positive",
      hep_c_pos        ~ "Hepatitis C Positive",
      hiv_pos          ~ "HIV Reactive",
      syphilis_pos     ~ "Syphilis Reactive",
      any_comm_disease ~ "Any Communicable Disease",
      any_ncd          ~ "Any NCD (HTN or DM)"
    )
  ) %>%
  add_overall() %>%
  add_p(test = list(all_continuous()  ~ "wilcox.test",
                    all_categorical() ~ "chisq.test")) %>%
  bold_labels() %>%
  modify_caption("**Table 2: Baseline Characteristics — Dampong, Ghana (gtsummary)**")
Table 2: Baseline Characteristics — Dampong, Ghana (gtsummary)
Characteristic Overall
N = 524
1
FEMALE
N = 380
1
MALE
N = 144
1
p-value2
Age (years) 45 (25) [1, 97] 48 (23) [1, 97] 38 (27) [1, 92] <0.001
    Missing 6 6 0
Age Group


<0.001
    0-18 132 (25%) 72 (19%) 60 (42%)
    19-35 54 (10%) 45 (12%) 9 (6.3%)
    36-50 92 (18%) 72 (19%) 20 (14%)
    51-65 113 (22%) 90 (24%) 23 (16%)
    65+ 127 (25%) 95 (25%) 32 (22%)
    Missing 6 6 0
Pulse (bpm) 81 (13) [48, 124] 81 (13) [48, 120] 79 (14) [53, 124] 0.15
    Missing 129 70 59
Systolic BP (mmHg) 136 (26) [74, 245] 136 (26) [89, 245] 136 (25) [74, 222] 0.8
    Missing 129 70 59
Diastolic BP (mmHg) 77 (15) [45, 135] 77 (15) [49, 135] 76 (13) [45, 112] >0.9
    Missing 130 71 59
Blood Glucose (mmol/L) 6.46 (2.58) [3.10, 24.30] 6.52 (2.65) [3.10, 24.30] 6.21 (2.27) [3.60, 15.40] 0.2
    Missing 212 126 86
BP Category (ACC/AHA 2017)


0.7
    Normal 106 (27%) 83 (27%) 23 (27%)
    Elevated 52 (13%) 41 (13%) 11 (13%)
    Stage 1 HTN 136 (35%) 103 (33%) 33 (39%)
    Stage 2 HTN 100 (25%) 82 (27%) 18 (21%)
    Missing 130 71 59
Glucose Category (WHO/ADA)


0.5
    Hypoglycaemia 7 (2.2%) 5 (2.0%) 2 (3.4%)
    Normal 126 (40%) 98 (39%) 28 (48%)
    Pre-diabetes 95 (30%) 80 (31%) 15 (26%)
    Diabetes 84 (27%) 71 (28%) 13 (22%)
    Missing 212 126 86
Hypertension (Stage 1+2) 236 (60%) 185 (60%) 51 (60%) >0.9
    Missing 129 70 59
Diabetes (≥7.0 mmol/L) 84 (27%) 71 (28%) 13 (22%) 0.5
    Missing 212 126 86
Hepatitis B Positive 18 (3.5%) 14 (3.7%) 4 (2.8%) 0.8
    Missing 3 2 1
Hepatitis C Positive 2 (0.4%) 2 (0.5%) 0 (0%) >0.9
    Missing 3 2 1
HIV Reactive 5 (1.0%) 4 (1.1%) 1 (0.7%) >0.9
    Missing 4 2 2
Syphilis Reactive 5 (1.0%) 3 (0.8%) 2 (1.4%) 0.9
    Missing 2 1 1
Any Communicable Disease 29 (5.6%) 22 (5.8%) 7 (4.9%) 0.9
    Missing 5 3 2
Any NCD (HTN or DM) 263 (76%) 207 (75%) 56 (81%) 0.4
    Missing 180 105 75
1 Mean (SD) [Min, Max]; n (%)
2 Wilcoxon rank sum test; Pearson’s Chi-squared test

6 Age Distribution

df %>%
  filter(!is.na(age)) %>%
  ggplot(aes(x = age, fill = sex, colour = sex)) +
  geom_histogram(aes(y = after_stat(density)),
                 bins = 30, alpha = 0.5, position = "identity") +
  geom_density(alpha = 0, linewidth = 1) +
  facet_wrap(~sex, ncol = 2) +
  scale_fill_manual(values   = c("FEMALE" = "#E76F51", "MALE" = "#264653")) +
  scale_colour_manual(values = c("FEMALE" = "#E76F51", "MALE" = "#264653")) +
  labs(title = "Age Distribution of Study Participants",
       subtitle = "Dampong, Ashanti Region, Ghana",
       x = "Age (years)", y = "Density") +
  theme_pubr() + theme(legend.position = "bottom")
Figure 1: Age Distribution by Sex

Figure 1: Age Distribution by Sex

df %>%
  filter(!is.na(age_group)) %>%
  count(age_group, sex) %>%
  group_by(sex) %>%
  mutate(pct = 100 * n / sum(n)) %>%
  ggplot(aes(x = age_group, y = pct, fill = sex)) +
  geom_col(position = position_dodge(0.8), width = 0.7) +
  geom_text(aes(label = sprintf("%.1f%%\n(n=%d)", pct, n)),
            position = position_dodge(0.8), vjust = -0.3, size = 3.2) +
  scale_fill_manual(values = c("FEMALE" = "#E76F51", "MALE" = "#264653")) +
  labs(title = "Age Group Distribution by Sex",
       x = "Age Group", y = "Percentage (%)", fill = "Sex") +
  theme_pubr() + ylim(0, 40)
Figure 2: Age Group Distribution by Sex

Figure 2: Age Group Distribution by Sex

age_vals <- df$age[!is.na(df$age)]
sw <- shapiro.test(sample(age_vals, min(5000, length(age_vals))))
cat("Shapiro-Wilk: W =", round(sw$statistic,4), ", p =", format.pval(sw$p.value), "\n")
## Shapiro-Wilk: W = 0.9451 , p = 0

7 Blood Pressure Analysis

df %>%
  filter(!is.na(systolic), !is.na(diastolic)) %>%
  ggplot(aes(x = diastolic, y = systolic, colour = bp_category)) +
  geom_point(alpha = 0.6, size = 2) +
  geom_hline(yintercept = c(130, 140), linetype="dashed",
             colour = c("orange","red")) +
  geom_vline(xintercept = c(80, 90),  linetype="dashed",
             colour = c("orange","red")) +
  scale_colour_manual(
    values   = c("Normal"="2EC4B6","Elevated"="#FFBF69",
                 "Stage 1 HTN"="#FF9F1C","Stage 2 HTN"="#E71D36"),
    na.value = "grey70"
  ) +
  labs(title  = "Systolic vs. Diastolic Blood Pressure",
       subtitle = "ACC/AHA 2017 Classification",
       x = "Diastolic BP (mmHg)", y = "Systolic BP (mmHg)",
       colour = "BP Category") +
  theme_pubr()
Figure 3: Systolic vs Diastolic BP

Figure 3: Systolic vs Diastolic BP

df %>%
  filter(!is.na(bp_category)) %>%
  count(bp_category, sex) %>%
  group_by(sex) %>%
  mutate(pct = 100 * n / sum(n)) %>%
  ggplot(aes(x = bp_category, y = pct, fill = sex)) +
  geom_col(position = position_dodge(0.8), width = 0.7) +
  geom_text(aes(label = sprintf("%.1f%%", pct)),
            position = position_dodge(0.8), vjust = -0.3, size = 3) +
  scale_fill_manual(values = c("FEMALE"="#E76F51","MALE"="#264653")) +
  labs(title="BP Category Distribution by Sex",
       x="BP Category", y="Percentage (%)", fill="Sex") +
  theme_pubr() + ylim(0, 65)
Figure 4: BP Category by Sex

Figure 4: BP Category by Sex

df %>%
  filter(!is.na(hypertension), !is.na(age_group)) %>%
  group_by(age_group) %>%
  summarise(
    n     = n(),
    htn_n = sum(hypertension, na.rm = TRUE),
    prev  = 100 * htn_n / n,
    se    = sqrt(prev * (100 - prev) / n),
    lower = pmax(0,   prev - 1.96 * se),
    upper = pmin(100, prev + 1.96 * se),
    .groups = "drop"
  ) %>%
  ggplot(aes(x = age_group, y = prev, group = 1)) +
  geom_line(colour="#E71D36", linewidth=1.2) +
  geom_point(size=4, colour="#E71D36") +
  geom_errorbar(aes(ymin=lower, ymax=upper),
                width=0.2, colour="#E71D36") +
  geom_label(aes(label=sprintf("%.1f%%\n(n=%d)", prev, n)),
             vjust=-0.5, size=3.2) +
  labs(title="Hypertension Prevalence by Age Group",
       subtitle="With 95% Confidence Intervals",
       x="Age Group", y="Prevalence (%)") +
  theme_pubr() + ylim(0, 100)
Figure 5: Hypertension Prevalence by Age Group

Figure 5: Hypertension Prevalence by Age Group

bp_sex <- df %>% filter(!is.na(systolic), !is.na(sex))
w_sbp  <- wilcox.test(systolic  ~ sex, data = bp_sex, conf.int = TRUE)
w_dbp  <- wilcox.test(diastolic ~ sex, data = bp_sex, conf.int = TRUE)

bp_sex %>%
  group_by(sex) %>%
  summarise(n      = n(),
            mean   = round(mean(systolic),1),
            sd     = round(sd(systolic),1),
            median = median(systolic),
            IQR    = IQR(systolic),
            htn_n  = sum(hypertension, na.rm=TRUE),
            htn_pct= round(100*htn_n/n,1),
            .groups="drop") %>%
  kable(caption="Systolic BP and Hypertension by Sex") %>%
  kable_styling(bootstrap_options=c("striped","hover"), full_width=FALSE)
Systolic BP and Hypertension by Sex
sex n mean sd median IQR htn_n htn_pct
FEMALE 310 136.2 26.3 131 32 185 59.7
MALE 85 135.9 25.4 136 33 51 60.0
cat("Wilcoxon (SBP): W =", w_sbp$statistic,
    ", p =", format.pval(w_sbp$p.value), "\n")
## Wilcoxon (SBP): W = 12970.5 , p = 0.82681
cat("Wilcoxon (DBP): W =", w_dbp$statistic,
    ", p =", format.pval(w_dbp$p.value), "\n")
## Wilcoxon (DBP): W = 13106 , p = 0.97769
bp_age <- df %>% filter(!is.na(systolic), !is.na(age))
cor_r  <- cor.test(bp_age$age, bp_age$systolic, method="spearman")
cat("Spearman rho =", round(cor_r$estimate,3),
    ", p =", format.pval(cor_r$p.value), "\n")
## Spearman rho = 0.428 , p = < 2.22e-16
ggscatter(bp_age, x="age", y="systolic",
          add="reg.line", conf.int=TRUE,
          add.params=list(colour="#E71D36"),
          colour="#264653", alpha=0.4,
          title="Age vs Systolic Blood Pressure",
          xlab="Age (years)", ylab="Systolic BP (mmHg)") +
  stat_cor(method="spearman", label.x=5, label.y=230)
Figure 6: Age vs Systolic BP

Figure 6: Age vs Systolic BP


8 Blood Glucose and Diabetes

df %>%
  filter(!is.na(glucose_mmol)) %>%
  ggplot(aes(x = glucose_mmol, fill = glucose_category)) +
  geom_histogram(bins=30, colour="white", alpha=0.85) +
  geom_vline(xintercept=5.6, linetype="dashed", colour="orange", linewidth=1) +
  geom_vline(xintercept=7.0, linetype="dashed", colour="red",    linewidth=1) +
  annotate("text", x=5.9, y=25, label="Pre-DM\n5.6",
           colour="orange", size=3.5) +
  annotate("text", x=7.3, y=25, label="DM\n7.0",
           colour="red",    size=3.5) +
  scale_fill_manual(
    values   = c("Hypoglycaemia"="#0077B6","Normal"="#2EC4B6",
                 "Pre-diabetes"="#FFBF69","Diabetes"="#E71D36"),
    na.value = "grey70"
  ) +
  facet_wrap(~sex, ncol=2) +
  labs(title="Blood Glucose Distribution by Sex",
       subtitle="WHO/ADA Diagnostic Thresholds Shown",
       x="Blood Glucose (mmol/L)", y="Count",
       fill="Category") +
  theme_pubr()
Figure 7: Blood Glucose Distribution by Sex

Figure 7: Blood Glucose Distribution by Sex

dm_sex <- df %>%
  filter(!is.na(diabetes), !is.na(sex)) %>%
  group_by(sex) %>%
  summarise(n=n(), dm_n=sum(diabetes,na.rm=TRUE),
            prev=round(100*dm_n/n,1),
            se  =sqrt(prev*(100-prev)/n),
            lower=round(pmax(0,   prev-1.96*se),1),
            upper=round(pmin(100, prev+1.96*se),1),
            .groups="drop")

dm_sex %>%
  kable(caption="Diabetes Prevalence by Sex",
        col.names=c("Sex","N","DM (n)","Prev (%)","SE","95% CI Low","95% CI High")) %>%
  kable_styling(bootstrap_options=c("striped","hover"), full_width=FALSE)
Diabetes Prevalence by Sex
Sex N DM (n) Prev (%) SE 95% CI Low 95% CI High
FEMALE 254 71 28.0 2.817270 22.5 33.5
MALE 58 13 22.4 5.474455 11.7 33.1
dm_chi <- chisq.test(table(df$sex, df$diabetes))
cat("Chi-square: X2 =", round(dm_chi$statistic,3),
    ", p =", format.pval(dm_chi$p.value), "\n")
## Chi-square: X2 = 0.482 , p = 0.48766
df %>%
  filter(!is.na(diabetes), !is.na(age_group)) %>%
  group_by(age_group) %>%
  summarise(n=n(), dm_n=sum(diabetes,na.rm=TRUE),
            prev=100*dm_n/n, se=sqrt(prev*(100-prev)/n),
            lower=pmax(0,prev-1.96*se), upper=pmin(100,prev+1.96*se),
            .groups="drop") %>%
  ggplot(aes(x=age_group, y=prev, group=1)) +
  geom_line(colour="#E76F51", linewidth=1.2) +
  geom_point(size=4, colour="#E76F51") +
  geom_errorbar(aes(ymin=lower,ymax=upper), width=0.2, colour="#E76F51") +
  geom_label(aes(label=sprintf("%.1f%%",prev)), vjust=-0.5, size=3.5) +
  labs(title="Diabetes Prevalence by Age Group",
       x="Age Group", y="Prevalence (%)") +
  theme_pubr() + ylim(0,60)
Figure 8: Diabetes Prevalence by Age Group

Figure 8: Diabetes Prevalence by Age Group


9 Communicable Disease Analysis

# Compute Wilson 95% CI
wilson_ci <- function(x, n) {
  z  <- qnorm(0.975)
  p  <- x / n
  lo <- (p + z^2/(2*n) - z*sqrt(p*(1-p)/n + z^2/(4*n^2))) / (1 + z^2/n)
  hi <- (p + z^2/(2*n) + z*sqrt(p*(1-p)/n + z^2/(4*n^2))) / (1 + z^2/n)
  c(p=p, lower=lo, upper=hi)
}

comm_prev <- tibble(
  Disease    = c("Hepatitis B (HBsAg)","Hepatitis C (HCV)",
                 "HIV (Retroviral)","Syphilis (VDRL/RPR)"),
  N_Tested   = c(sum(!is.na(df$hep_b_pos)), sum(!is.na(df$hep_c_pos)),
                 sum(!is.na(df$hiv_pos)),   sum(!is.na(df$syphilis_pos))),
  N_Positive = c(sum(df$hep_b_pos,    na.rm=TRUE),
                 sum(df$hep_c_pos,    na.rm=TRUE),
                 sum(df$hiv_pos,      na.rm=TRUE),
                 sum(df$syphilis_pos, na.rm=TRUE))
) %>%
  rowwise() %>%
  mutate(
    ci       = list(wilson_ci(N_Positive, N_Tested)),
    Prev_pct = round(100 * ci[["p"]],     2),
    CI_Lo    = round(100 * ci[["lower"]], 2),
    CI_Hi    = round(100 * ci[["upper"]], 2),
    CI_95    = paste0("(", CI_Lo, "–", CI_Hi, ")")
  ) %>%
  ungroup() %>%
  select(Disease, N_Tested, N_Positive, Prev_pct, CI_95)

comm_prev %>%
  kable(caption="Table 3: Communicable Disease Prevalence — Dampong, Ghana",
        col.names=c("Disease","N Tested","N Positive","Prevalence (%)","95% CI")) %>%
  kable_styling(bootstrap_options=c("striped","hover"), full_width=FALSE)
Table 3: Communicable Disease Prevalence — Dampong, Ghana
Disease N Tested N Positive Prevalence (%) 95% CI
Hepatitis B (HBsAg) 521 18 3.45 (2.2–5.39)
Hepatitis C (HCV) 521 2 0.38 (0.11–1.39)
HIV (Retroviral) 520 5 0.96 (0.41–2.23)
Syphilis (VDRL/RPR) 522 5 0.96 (0.41–2.22)
comm_prev %>%
  ggplot(aes(x=reorder(Disease, Prev_pct), y=Prev_pct, fill=Disease)) +
  geom_col(width=0.6, show.legend=FALSE) +
  geom_text(aes(label=paste0(Prev_pct,"% (n=",N_Positive,")")),
            hjust=-0.1, size=3.8) +
  scale_fill_brewer(palette="Set2") +
  coord_flip() +
  labs(title="Communicable Disease Prevalence",
       subtitle="Dampong, Ashanti Region, Ghana",
       x=NULL, y="Prevalence (%)") +
  theme_pubr() + ylim(0,6)
Figure 9: Communicable Disease Prevalence

Figure 9: Communicable Disease Prevalence

for (disease in c("hep_b_pos","hep_c_pos","hiv_pos","syphilis_pos")) {
  tbl <- table(df$sex, df[[disease]])
  ft  <- fisher.test(tbl)
  cat(disease, "— Fisher exact: OR =",
      round(ft$estimate, 3), ", p =",
      format.pval(ft$p.value), "\n")
}
## hep_b_pos — Fisher exact: OR = 0.749 , p = 0.79032 
## hep_c_pos — Fisher exact: OR = 0 , p = 1 
## hiv_pos — Fisher exact: OR = 0.664 , p = 1 
## syphilis_pos — Fisher exact: OR = 1.776 , p = 0.61809

10 Comorbidity Analysis

df_ncd <- df %>% filter(!is.na(hypertension), !is.na(diabetes))

comorbid <- df_ncd %>%
  count(hypertension, diabetes) %>%
  mutate(
    Category = case_when(
      hypertension==0 & diabetes==0 ~ "No HTN, No DM",
      hypertension==1 & diabetes==0 ~ "HTN only",
      hypertension==0 & diabetes==1 ~ "DM only",
      hypertension==1 & diabetes==1 ~ "HTN + DM (Comorbid)"
    ),
    pct = round(100*n/sum(n),1)
  ) %>%
  arrange(desc(n))

comorbid %>%
  select(Category, n, pct) %>%
  kable(caption="Table 4: NCD Comorbidity Patterns",
        col.names=c("Category","N","(%)")) %>%
  kable_styling(bootstrap_options=c("striped","hover"), full_width=FALSE)
Table 4: NCD Comorbidity Patterns
Category N (%)
HTN only 142 46.3
No HTN, No DM 81 26.4
HTN + DM (Comorbid) 57 18.6
DM only 27 8.8
comorbid %>%
  ggplot(aes(x="", y=pct, fill=Category)) +
  geom_col(width=1, colour="white") +
  coord_polar("y") +
  geom_text(aes(label=paste0(pct,"%")),
            position=position_stack(vjust=0.5), colour="white", size=4) +
  scale_fill_manual(
    values=c("No HTN, No DM"="#2EC4B6","HTN only"="#FF9F1C",
             "DM only"="#E76F51","HTN + DM (Comorbid)"="#E71D36")
  ) +
  labs(title="NCD Comorbidity Distribution", fill=NULL) +
  theme_void() +
  theme(legend.position="right",
        plot.title=element_text(face="bold", hjust=0.5))
Figure 10: NCD Comorbidity Pie Chart

Figure 10: NCD Comorbidity Pie Chart


11 Correlation Matrix

corr_df <- df %>%
  select(age, pulse_bpm, systolic, diastolic, glucose_mmol) %>%
  drop_na()

corr_mat <- cor(corr_df, method="spearman", use="complete.obs")

ggcorrplot(corr_mat, method="circle", type="lower",
           lab=TRUE, lab_size=3.5,
           colors=c("#0077B6","white","#E71D36"),
           title="Spearman Correlation — Clinical Variables",
           ggtheme=theme_pubr())
Figure 11: Spearman Correlation Matrix

Figure 11: Spearman Correlation Matrix


12 Logistic Regression

12.1 Hypertension

df_logit <- df %>%
  filter(!is.na(hypertension), !is.na(age),
         !is.na(sex), !is.na(glucose_mmol), !is.na(pulse_bpm)) %>%
  mutate(hypertension = as.integer(hypertension))

# Multivariable model
mv_htn <- glm(hypertension ~ age + sex + glucose_mmol + pulse_bpm,
              data   = df_logit,
              family = binomial(link="logit"))

tidy(mv_htn, exponentiate=TRUE, conf.int=TRUE) %>%
  filter(term != "(Intercept)") %>%
  kable(caption="Table 5: Multivariable Logistic Regression — Hypertension",
        digits=3,
        col.names=c("Predictor","OR","SE","Z","p-value","95% CI Low","95% CI High")) %>%
  kable_styling(bootstrap_options=c("striped","hover"), full_width=FALSE)
Table 5: Multivariable Logistic Regression — Hypertension
Predictor OR SE Z p-value 95% CI Low 95% CI High
age 1.048 0.009 5.451 0.000 1.031 1.066
sexMALE 0.993 0.334 -0.022 0.983 0.521 1.939
glucose_mmol 1.017 0.054 0.311 0.755 0.920 1.141
pulse_bpm 0.996 0.010 -0.452 0.651 0.976 1.015
# Hosmer-Lemeshow
hl <- hoslem.test(mv_htn$y, fitted(mv_htn), g=10)
cat("Hosmer-Lemeshow: X2 =", round(hl$statistic,3),
    ", p =", format.pval(hl$p.value), "\n")
## Hosmer-Lemeshow: X2 = 16.682 , p = 0.03359
# VIF
cat("VIF:\n"); print(vif(mv_htn))
## VIF:
##          age          sex glucose_mmol    pulse_bpm 
##     1.061836     1.016704     1.035811     1.042430
# Nagelkerke R2
ll_null <- logLik(glm(hypertension~1, data=df_logit, family=binomial))
ll_full <- logLik(mv_htn)
n       <- nobs(mv_htn)
cs  <- 1 - exp(2*(as.numeric(ll_null)-as.numeric(ll_full))/n)
nk  <- cs/(1-exp(2*as.numeric(ll_null)/n))
cat("Nagelkerke R2:", round(nk,3), "\n")
## Nagelkerke R2: 0.165
roc_htn <- roc(df_logit$hypertension, fitted(mv_htn))
plot(roc_htn, col="#E71D36", lwd=2,
     main=paste("ROC — Hypertension | AUC =", round(auc(roc_htn),3)))
abline(a=0, b=1, lty=2, col="grey60")
Figure 12: ROC Curve — Hypertension Model

Figure 12: ROC Curve — Hypertension Model

cat("AUC:", round(auc(roc_htn),3), "\n")
## AUC: 0.706

12.2 Diabetes

df_dm <- df %>%
  filter(!is.na(diabetes), !is.na(age), !is.na(sex),
         !is.na(systolic)) %>%
  mutate(diabetes = as.integer(diabetes))

mv_dm <- glm(diabetes ~ age + sex + systolic,
             data=df_dm, family=binomial(link="logit"))

tidy(mv_dm, exponentiate=TRUE, conf.int=TRUE) %>%
  filter(term != "(Intercept)") %>%
  kable(caption="Table 6: Multivariable Logistic Regression — Diabetes",
        digits=3) %>%
  kable_styling(bootstrap_options=c("striped","hover"), full_width=FALSE)
Table 6: Multivariable Logistic Regression — Diabetes
term estimate std.error statistic p.value conf.low conf.high
age 1.020 0.009 2.253 0.024 1.003 1.039
sexMALE 0.686 0.354 -1.063 0.288 0.332 1.342
systolic 1.012 0.005 2.293 0.022 1.002 1.023
roc_dm <- roc(df_dm$diabetes, fitted(mv_dm))
cat("Diabetes model AUC:", round(auc(roc_dm),3), "\n")
## Diabetes model AUC: 0.634

13 Population Burden Estimation

pop_dampong <- 12000
pop_ashanti <- 5800000

conditions <- list(
  list(label="Hypertension (Stage 1/2)",
       x=sum(df$hypertension,  na.rm=TRUE), n=sum(!is.na(df$hypertension))),
  list(label="Diabetes (≥7.0 mmol/L)",
       x=sum(df$diabetes,      na.rm=TRUE), n=sum(!is.na(df$diabetes))),
  list(label="Pre-diabetes",
       x=sum(df$glucose_mmol>=5.6 & df$glucose_mmol<7.0, na.rm=TRUE),
       n=sum(!is.na(df$glucose_mmol))),
  list(label="HTN + DM Comorbidity",
       x=sum(df$htn_dm_comorbid, na.rm=TRUE), n=sum(!is.na(df$htn_dm_comorbid))),
  list(label="Hepatitis B",
       x=sum(df$hep_b_pos,    na.rm=TRUE), n=sum(!is.na(df$hep_b_pos))),
  list(label="HIV",
       x=sum(df$hiv_pos,      na.rm=TRUE), n=sum(!is.na(df$hiv_pos))),
  list(label="Syphilis",
       x=sum(df$syphilis_pos, na.rm=TRUE), n=sum(!is.na(df$syphilis_pos))),
  list(label="Hepatitis C",
       x=sum(df$hep_c_pos,    na.rm=TRUE), n=sum(!is.na(df$hep_c_pos)))
)

pop_table <- map_dfr(conditions, function(cond) {
  ci <- wilson_ci(cond$x, cond$n)
  tibble(
    Condition    = cond$label,
    N_Tested     = cond$n,
    N_Positive   = cond$x,
    Prev_pct     = round(100*ci["p"],    2),
    CI_Lo        = round(100*ci["lower"],2),
    CI_Hi        = round(100*ci["upper"],2),
    Est_Dampong  = scales::comma(round(pop_dampong * ci["p"])),
    Est_Ashanti  = scales::comma(round(pop_ashanti * ci["p"]))
  )
})

pop_table %>%
  select(Condition, N_Tested, N_Positive, Prev_pct, CI_Lo, CI_Hi,
         Est_Dampong, Est_Ashanti) %>%
  kable(caption="Table 7: Population-Level Disease Burden Estimates",
        col.names=c("Condition","N Tested","N Positive","Prev (%)","CI Low","CI High",
                    "Est. Dampong","Est. Ashanti")) %>%
  kable_styling(bootstrap_options=c("striped","hover","condensed"),
                full_width=TRUE, font_size=11) %>%
  footnote(general=paste0(
    "Dampong est. population ≈ 12,000; Ashanti Region ≈ 5,800,000 (GSS 2021). ",
    "Wilson method 95% CI."))
Table 7: Population-Level Disease Burden Estimates
Condition N Tested N Positive Prev (%) CI Low CI High Est. Dampong Est. Ashanti
Hypertension (Stage 1/2) 395 236 59.75 54.84 64.47 7,170 3,465,316
Diabetes (≥7.0 mmol/L) 312 84 26.92 22.30 32.10 3,231 1,561,538
Pre-diabetes 312 95 30.45 25.61 35.77 3,654 1,766,026
HTN + DM Comorbidity 307 57 18.57 14.62 23.30 2,228 1,076,873
Hepatitis B 521 18 3.45 2.20 5.39 415 200,384
HIV 520 5 0.96 0.41 2.23 115 55,769
Syphilis 522 5 0.96 0.41 2.22 115 55,556
Hepatitis C 521 2 0.38 0.11 1.39 46 22,265
Note:
Dampong est. population ≈ 12,000; Ashanti Region ≈ 5,800,000 (GSS 2021). Wilson method 95% CI.
pop_table %>%
  mutate(Type=if_else(Condition %in% c("Hypertension (Stage 1/2)",
                                        "Diabetes (≥7.0 mmol/L)",
                                        "Pre-diabetes","HTN + DM Comorbidity"),
                      "Non-Communicable","Communicable")) %>%
  ggplot(aes(x=Prev_pct, y=reorder(Condition,Prev_pct),
             colour=Type, shape=Type)) +
  geom_point(size=5) +
  geom_errorbarh(aes(xmin=CI_Lo, xmax=CI_Hi), height=0.3, linewidth=1) +
  scale_colour_manual(values=c("Non-Communicable"="#E71D36",
                                "Communicable"="#0077B6")) +
  labs(title="Disease Prevalence — Dampong, Ashanti Region, Ghana",
       subtitle="With 95% Wilson Confidence Intervals",
       x="Prevalence (%)", y=NULL,
       colour="Disease Type", shape="Disease Type") +
  theme_pubr() + theme(legend.position="bottom")
Figure 13: Prevalence Forest Plot

Figure 13: Prevalence Forest Plot


14 Multiple Imputation Sensitivity Analysis

# ── Include hypertension and diabetes in the imputation dataset ───────────────
# Both must be present in mice_input so that with(mice_out, glm(...)) can
# find them inside each of the 20 imputed datasets.
mice_input <- df %>%
  select(sex, age, pulse_bpm, systolic, diastolic,
         glucose_mmol, hep_b_pos, hep_c_pos, hiv_pos, syphilis_pos,
         hypertension, diabetes) %>%          # ← these two are required
  mutate(
    sex          = factor(sex),
    hypertension = as.integer(hypertension),  # binary → logreg method
    diabetes     = as.integer(diabetes)       # binary → logreg method
  )

# Set imputation method per variable type
# pmm    = predictive mean matching (continuous/count)
# logreg = logistic regression (binary 0/1)
# ""     = do not impute (already complete)
meth <- make.method(mice_input)
meth["hypertension"] <- "logreg"
meth["diabetes"]     <- "logreg"

mice_out <- mice(mice_input,
                 m         = 20,
                 maxit     = 50,
                 method    = meth,
                 seed      = 2024,
                 printFlag = FALSE)

cat("MICE complete.\n")
## MICE complete.
# Pool logistic regression results across 20 imputed datasets
fit_imp <- with(mice_out,
                glm(hypertension ~ age + sex + glucose_mmol + pulse_bpm,
                    family = binomial))

pooled <- pool(fit_imp)

summary(pooled, conf.int = TRUE, exponentiate = TRUE) %>%
  filter(term != "(Intercept)") %>%
  kable(caption = "Table 8: Pooled MI Results — Hypertension (20 imputations)",
        digits  = 3) %>%
  kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE)
Table 8: Pooled MI Results — Hypertension (20 imputations)
term estimate std.error statistic df p.value 2.5 % 97.5 % conf.low conf.high
2 age 1.036 0.006 5.695 54.562 0.000 1.023 1.049 1.023 1.049
3 sexMALE 0.875 0.267 -0.499 119.311 0.619 0.515 1.486 0.515 1.486
4 glucose_mmol 1.056 0.053 1.026 88.589 0.308 0.950 1.173 0.950 1.173
5 pulse_bpm 1.006 0.008 0.712 253.702 0.477 0.990 1.022 0.990 1.022

15 Combined Summary Panel

p1 <- df %>% filter(!is.na(bp_category)) %>%
  count(bp_category) %>% mutate(pct=100*n/sum(n)) %>%
  ggplot(aes(x=reorder(bp_category,pct), y=pct, fill=bp_category)) +
  geom_col(show.legend=FALSE) +
  geom_text(aes(label=paste0(round(pct,1),"%")), hjust=-0.1, size=3.5) +
  coord_flip() +
  scale_fill_manual(values=c("Normal"="#2EC4B6","Elevated"="#FFBF69",
                              "Stage 1 HTN"="#FF9F1C","Stage 2 HTN"="#E71D36")) +
  labs(title="BP Categories", x=NULL, y="%") +
  theme_pubr() + ylim(0,55)

p2 <- df %>% filter(!is.na(glucose_category)) %>%
  count(glucose_category) %>% mutate(pct=100*n/sum(n)) %>%
  ggplot(aes(x=reorder(glucose_category,pct), y=pct, fill=glucose_category)) +
  geom_col(show.legend=FALSE) +
  geom_text(aes(label=paste0(round(pct,1),"%")), hjust=-0.1, size=3.5) +
  coord_flip() +
  scale_fill_manual(values=c("Hypoglycaemia"="#0077B6","Normal"="#2EC4B6",
                              "Pre-diabetes"="#FFBF69","Diabetes"="#E71D36")) +
  labs(title="Glucose Categories", x=NULL, y="%") +
  theme_pubr() + ylim(0,50)

p3 <- comm_prev %>%
  ggplot(aes(x=reorder(Disease,Prev_pct), y=Prev_pct, fill=Disease)) +
  geom_col(show.legend=FALSE) +
  geom_text(aes(label=paste0(Prev_pct,"%")), hjust=-0.1, size=3.5) +
  coord_flip() +
  scale_fill_brewer(palette="Set2") +
  labs(title="Communicable Diseases", x=NULL, y="%") +
  theme_pubr() + ylim(0,6)

p4 <- df %>% filter(!is.na(age_group), !is.na(hypertension)) %>%
  group_by(age_group) %>%
  summarise(prev=100*mean(hypertension,na.rm=TRUE), .groups="drop") %>%
  ggplot(aes(x=age_group, y=prev, fill=age_group)) +
  geom_col(show.legend=FALSE) +
  scale_fill_viridis_d() +
  labs(title="HTN by Age Group", x="Age Group", y="%") +
  theme_pubr()

(p1 + p2) / (p3 + p4) +
  plot_annotation(
    title    = "Disease Burden in Dampong, Ashanti Region, Ghana",
    subtitle = "Community Medical Outreach Survey — N = 524",
    theme    = theme(
      plot.title    = element_text(face="bold", size=16, hjust=0.5),
      plot.subtitle = element_text(hjust=0.5)
    )
  )
Figure 14: Summary Panel

Figure 14: Summary Panel


16 Session Information

sessionInfo()
## R version 4.6.0 (2026-04-24)
## Platform: aarch64-apple-darwin23
## Running under: macOS Tahoe 26.5
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.6/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.6/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.1
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: America/New_York
## tzcode source: internal
## 
## attached base packages:
## [1] grid      stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] ggmosaic_0.4.0          rmarkdown_2.31          knitr_1.51             
##  [4] kableExtra_1.4.0        scales_1.4.0            patchwork_1.3.2        
##  [7] ggcorrplot_0.1.4.1      rstatix_0.7.3           nortest_1.0-4          
## [10] psych_2.6.5             Hmisc_5.2-5             nnet_7.3-20            
## [13] MASS_7.3-65             ResourceSelection_0.3-6 pROC_1.19.0.1          
## [16] broom_1.0.13            srvyr_1.3.1             survey_4.5             
## [19] epiR_2.0.93             survival_3.8-6          epitools_0.5-10.1      
## [22] lme4_2.0-1              Matrix_1.7-5            car_3.1-5              
## [25] carData_3.0-6           corrplot_0.95           ggpubr_0.6.3           
## [28] gt_1.3.0                flextable_0.9.11        gtsummary_2.5.0        
## [31] tableone_0.13.2         VIM_7.0.0               colorspace_2.1-2       
## [34] mice_3.19.0             naniar_1.1.0            skimr_2.2.2            
## [37] janitor_2.2.1           lubridate_1.9.5         forcats_1.0.1          
## [40] stringr_1.6.0           dplyr_1.2.1             purrr_1.2.2            
## [43] readr_2.2.0             tidyr_1.3.2             tibble_3.3.1           
## [46] ggplot2_4.0.3.9000      tidyverse_2.0.0         readxl_1.5.0           
## 
## loaded via a namespace (and not attached):
##   [1] splines_4.6.0           norm_1.0-11.1           palmerpenguins_0.1.1   
##   [4] BiasedUrn_2.0.12        cellranger_1.1.0        rpart_4.1.27           
##   [7] lifecycle_1.0.5         sf_1.1-1                Rdpack_2.6.6           
##  [10] globals_0.19.1          lattice_0.22-9          backports_1.5.1        
##  [13] magrittr_2.0.5          vcd_1.4-13              plotly_4.12.0          
##  [16] sass_0.4.10             jquerylib_0.1.4         yaml_2.3.12            
##  [19] otel_0.2.0              zip_2.3.3               askpass_1.2.1          
##  [22] sp_2.2-1                DBI_1.3.0               minqa_1.2.8            
##  [25] RColorBrewer_1.1-3      abind_1.4-8             mlr3misc_0.21.0        
##  [28] labelled_2.16.0         paradox_1.0.1           gdtools_0.5.1          
##  [31] ggrepel_0.9.8           listenv_0.10.1          mlr3learners_0.14.0    
##  [34] cards_0.8.0             units_1.0-1             cardx_0.3.2            
##  [37] parallelly_1.47.0       commonmark_2.0.0        svglite_2.2.2          
##  [40] codetools_0.2-20        xml2_1.5.2              tidyselect_1.2.1       
##  [43] shape_1.4.6.1           farver_2.1.2            stats4_4.6.0           
##  [46] base64enc_0.1-6         jsonlite_2.0.0          e1071_1.7-17           
##  [49] mitml_0.4-5             Formula_1.2-5           iterators_1.0.14       
##  [52] systemfonts_1.3.2       foreach_1.5.2           tools_4.6.0            
##  [55] bbotk_1.10.0            ragg_1.5.2              Rcpp_1.1.1-1.1         
##  [58] glue_1.8.1              mnormt_2.1.2            gridExtra_2.3          
##  [61] pan_1.9                 mgcv_1.9-4              laeken_0.5.3           
##  [64] xfun_0.57               ranger_0.18.0           mlr3_1.6.0             
##  [67] withr_3.0.2             fastmap_1.2.0           mitools_2.4            
##  [70] boot_1.3-32             openssl_2.4.1           litedown_0.9           
##  [73] digest_0.6.39           timechange_0.4.0        R6_2.6.1               
##  [76] mlr3tuning_1.6.0        visdat_0.6.0            textshaping_1.0.5      
##  [79] markdown_2.0            UpSetR_1.4.1            utf8_1.2.6             
##  [82] generics_0.1.4          fontLiberation_0.1.0    data.table_1.18.4      
##  [85] robustbase_0.99-7       class_7.3-23            httr_1.4.8             
##  [88] htmlwidgets_1.6.4       pkgconfig_2.0.3         gtable_0.3.6           
##  [91] mlr3pipelines_0.11.0    lmtest_0.9-40           S7_0.2.2               
##  [94] htmltools_0.5.9         fontBitstreamVera_0.1.1 reformulas_0.4.4       
##  [97] lgr_0.5.2               snakecase_0.11.1        rstudioapi_0.18.0      
## [100] reshape2_1.4.5          tzdb_0.5.0              uuid_1.2-2             
## [103] checkmate_2.3.4         nlme_3.1-169            nloptr_2.2.1           
## [106] repr_1.1.7              proxy_0.4-29            cachem_1.1.0           
## [109] zoo_1.8-15              KernSmooth_2.23-26      parallel_4.6.0         
## [112] foreign_0.8-91          pillar_1.11.1           vctrs_0.7.3            
## [115] jomo_2.7-6              cluster_2.1.8.2         htmlTable_2.5.0        
## [118] evaluate_1.0.5          cli_3.6.6               compiler_4.6.0         
## [121] rlang_1.2.0             crayon_1.5.3            ggsignif_0.6.4         
## [124] labeling_0.4.3          classInt_0.4-11         plyr_1.8.9             
## [127] fs_2.1.0                stringi_1.8.7           pander_0.6.6           
## [130] viridisLite_0.4.3       lazyeval_0.2.3          productplots_0.1.2     
## [133] glmnet_5.0              fontquiver_0.2.1        hms_1.1.4              
## [136] future_1.70.0           haven_2.5.5             rbibutils_2.4.1        
## [139] bslib_0.11.0            DEoptimR_1.1-4          officer_0.7.5

Emmanuel Nana Arko, DrPH(s), MPH, PA-C — University at Albany, SUNY
May 28, 2026