1 Package Installation and Loading

# ── STEP 1: Install CRAN packages if missing ─────────────────────────────────
cran_packages <- c(
  "readxl",
  "tidyverse",
  "janitor",
  "skimr",
  "naniar",
  "mice",
  "VIM",
  "tableone",
  "gtsummary",
  "flextable",
  "gt",
  "ggpubr",
  "corrplot",
  "car",
  "lme4",
  "epitools",
  "epiR",
  "survey",
  "srvyr",
  "broom",
  "pROC",
  "ResourceSelection",
  "MASS",
  "nnet",
  "Hmisc",
  "psych",
  "nortest",
  "rstatix",
  "ggcorrplot",
  "patchwork",
  "scales",
  "kableExtra",
  "knitr",
  "rmarkdown"
)

new_cran <- cran_packages[!cran_packages %in% rownames(installed.packages())]
if (length(new_cran) > 0) {
  install.packages(new_cran, dependencies = TRUE,
                   repos = "https://cran.rstudio.com/")
}

# ── STEP 2: Install ggmosaic from GitHub (not reliably on CRAN for R 4.6) ───
if (!requireNamespace("ggmosaic", quietly = TRUE)) {
  if (!requireNamespace("remotes", quietly = TRUE))
    install.packages("remotes", repos = "https://cran.rstudio.com/")
  remotes::install_github("haleyjeppson/ggmosaic")
}

# ── STEP 3: Load packages — flextable loaded LAST to avoid masking ───────────
# Load all CRAN packages
invisible(lapply(cran_packages, library, character.only = TRUE))

# Load ggmosaic
library(ggmosaic)

# Reload flextable last so its border/font/rotate are NOT masked by ggpubr
library(flextable)

cat("R version:", R.version$version.string, "\n")

## R version: R version 4.6.0 (2026-04-24)

cat("All packages loaded successfully.\n")

## All packages loaded successfully.

2 Data Import and Initial Inspection

# ── SAFE IMPORT: rename columns by POSITION to avoid special-character issues ─
# The Excel file has these 12 columns in order:
# PATH NO. | NAME | SEX | AGE | PULSE (bpm) | BLOOD PRESSURE (mmHg) |
# BLOOD GLUCOSE (mmol/L) | HEPATITIS B (HBsAg | HEPATITIS C (HCV) |
# HIV(Retr) | SYPHILIS(VDRL) | CONTACT

# ── Update this path to match where your file is saved ───────────────────────
data_path <- '/Users/emmanuelsmac/Desktop/DAMPONG complete.xlsx'   # <-- change if needed

raw <- read_excel(data_path, sheet = 1)

# Rename all 12 columns by position — bypasses ALL special character problems
names(raw) <- c(
  "path_no",     # PATH NO.
  "name",        # NAME
  "sex",         # SEX
  "age",         # AGE
  "pulse_bpm",   # PULSE (bpm)
  "bp_mmhg",     # BLOOD PRESSURE (mmHg)
  "glucose_mmol",# BLOOD GLUCOSE (mmol/L)
  "hep_b",       # HEPATITIS B (HBsAg   [note: missing closing bracket in source]
  "hep_c",       # HEPATITIS C (HCV)
  "hiv",         # HIV(Retr)
  "syphilis",    # SYPHILIS(VDRL)
  "contact"      # CONTACT
)

cat("Import successful.\n")

## Import successful.

cat("Dimensions:", nrow(raw), "rows x", ncol(raw), "columns\n")

## Dimensions: 524 rows x 12 columns

cat("\nColumn names confirmed:\n")

## 
## Column names confirmed:

print(names(raw))

##  [1] "path_no"      "name"         "sex"          "age"          "pulse_bpm"   
##  [6] "bp_mmhg"      "glucose_mmol" "hep_b"        "hep_c"        "hiv"         
## [11] "syphilis"     "contact"

cat("\nFirst 5 rows:\n")

## 
## First 5 rows:

print(head(raw, 5))

## # A tibble: 5 × 12
##   path_no name      sex     age pulse_bpm bp_mmhg glucose_mmol hep_b hep_c hiv  
##   <chr>   <chr>     <chr> <dbl>     <dbl> <chr>          <dbl> <chr> <chr> <chr>
## 1 DAM0001 GLADYS A… FEMA…    60        68 150/94           5.6 NEGA… NEGA… NON …
## 2 DAM0002 ALFRED D… MALE     59        92 154/82          15.4 NEGA… NEGA… NON …
## 3 DAM0003 AGNES AS… FEMA…    82        96 141/88           5.2 <NA>  <NA>  <NA> 
## 4 DAM0004 AKUA SER… FEMA…    62        66 128/77           6   NEGA… NEGA… NON …
## 5 DAM0005 CECILIA … FEMA…    64        80 164/91          11.1 NEGA… NEGA… NON …
## # ℹ 2 more variables: syphilis <chr>, contact <chr>

# ── Comprehensive summary ─────────────────────────────────────────────────────
skim(raw)

Data summary
Name	raw
Number of rows	524
Number of columns	12
_______________________
Column type frequency:
character	9
numeric	3
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	n_unique
path_no	172	0.67	6	7	352
name	0	1.00	6	26	523
sex	0	1.00	4	6	2
bp_mmhg	129	0.75	4	7	358
hep_b	3	0.99	8	8	2
hep_c	3	0.99	8	8	2
hiv	4	0.99	8	12	2
syphilis	2	1.00	8	12	2
contact	238	0.55	9	10	272

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
age	6	0.99	45.01	24.78	1.0	18.00	48	65.0	97.0	▇▃▇▇▂
pulse_bpm	129	0.75	80.99	13.34	48.0	71.00	80	89.0	124.0	▂▇▇▃▁
glucose_mmol	212	0.60	6.46	2.58	3.1	4.97	6	7.1	24.3	▇▂▁▁▁

3 Data Cleaning and Feature Engineering

# ── STEP 1: Standardise text fields ──────────────────────────────────────────
df <- raw %>%
  mutate(
    sex      = str_trim(str_to_upper(as.character(sex))),
    hep_b    = str_trim(str_to_upper(as.character(hep_b))),
    hep_c    = str_trim(str_to_upper(as.character(hep_c))),
    hiv      = str_trim(str_to_upper(as.character(hiv))),
    syphilis = str_trim(str_to_upper(as.character(syphilis)))
  )

# ── STEP 2: Parse blood pressure robustly ────────────────────────────────────
# Handles malformed entries like "124/" (row 399 in this dataset)
df <- df %>%
  mutate(
    bp_clean  = str_trim(as.character(bp_mmhg)),
    systolic  = suppressWarnings(
                  as.numeric(str_extract(bp_clean, "^[0-9]+"))),
    diastolic = suppressWarnings(
                  as.numeric(str_extract(bp_clean, "(?<=/)[0-9]+")))
  )

# Report any malformed BP entries
bp_malformed <- df %>%
  filter(!is.na(bp_clean), bp_clean != "NA",
         (is.na(systolic) | is.na(diastolic))) %>%
  select(path_no, name, bp_mmhg)

cat("Malformed BP entries:", nrow(bp_malformed), "\n")

## Malformed BP entries: 1

if (nrow(bp_malformed) > 0) print(bp_malformed)

## # A tibble: 1 × 3
##   path_no name             bp_mmhg
##   <chr>   <chr>            <chr>  
## 1 <NA>    ELIZABETH MENSAH 124/

# ── STEP 3: Clinical categories ──────────────────────────────────────────────
df <- df %>%
  mutate(

    # Age groups
    age_group = cut(age,
                    breaks = c(0, 18, 35, 50, 65, Inf),
                    labels = c("0-18","19-35","36-50","51-65","65+"),
                    right  = TRUE),

    # BP categories (ACC/AHA 2017)
    bp_category = case_when(
      is.na(systolic) | is.na(diastolic)            ~ NA_character_,
      systolic < 120 & diastolic < 80               ~ "Normal",
      systolic >= 120 & systolic <= 129 &
        diastolic < 80                              ~ "Elevated",
      (systolic >= 130 & systolic <= 139) |
        (diastolic >= 80 & diastolic <= 89)         ~ "Stage 1 HTN",
      systolic >= 140 | diastolic >= 90             ~ "Stage 2 HTN",
      TRUE                                          ~ NA_character_
    ),
    bp_category = factor(bp_category,
                         levels = c("Normal","Elevated",
                                    "Stage 1 HTN","Stage 2 HTN")),

    # Hypertension binary flag
    hypertension = case_when(
      is.na(systolic)                     ~ NA_integer_,
      systolic >= 130 | diastolic >= 80   ~ 1L,
      TRUE                                ~ 0L
    ),

    # Hypertensive crisis
    htn_crisis = case_when(
      is.na(systolic)                      ~ NA_integer_,
      systolic > 180 | diastolic > 120    ~ 1L,
      TRUE                                ~ 0L
    ),

    # Glucose categories (WHO/ADA)
    glucose_category = case_when(
      is.na(glucose_mmol)                            ~ NA_character_,
      glucose_mmol < 3.9                             ~ "Hypoglycaemia",
      glucose_mmol >= 3.9 & glucose_mmol < 5.6      ~ "Normal",
      glucose_mmol >= 5.6 & glucose_mmol < 7.0      ~ "Pre-diabetes",
      glucose_mmol >= 7.0                            ~ "Diabetes",
      TRUE                                           ~ NA_character_
    ),
    glucose_category = factor(glucose_category,
                              levels = c("Hypoglycaemia","Normal",
                                         "Pre-diabetes","Diabetes")),

    # Diabetes binary flag
    diabetes = case_when(
      is.na(glucose_mmol) ~ NA_integer_,
      glucose_mmol >= 7.0 ~ 1L,
      TRUE                ~ 0L
    ),

    # Pulse categories
    pulse_category = case_when(
      is.na(pulse_bpm)                      ~ NA_character_,
      pulse_bpm < 60                        ~ "Bradycardia",
      pulse_bpm >= 60 & pulse_bpm <= 100    ~ "Normal",
      pulse_bpm > 100                       ~ "Tachycardia",
      TRUE                                  ~ NA_character_
    ),
    pulse_category = factor(pulse_category,
                             levels = c("Bradycardia","Normal","Tachycardia")),

    # Communicable disease flags
    hep_b_pos    = case_when(
      is.na(hep_b)         ~ NA_integer_,
      hep_b == "POSITIVE"  ~ 1L,
      TRUE                 ~ 0L
    ),
    hep_c_pos    = case_when(
      is.na(hep_c)         ~ NA_integer_,
      hep_c == "POSITIVE"  ~ 1L,
      TRUE                 ~ 0L
    ),
    hiv_pos      = case_when(
      is.na(hiv)           ~ NA_integer_,
      hiv == "REACTIVE"    ~ 1L,
      TRUE                 ~ 0L
    ),
    syphilis_pos = case_when(
      is.na(syphilis)        ~ NA_integer_,
      syphilis == "REACTIVE" ~ 1L,
      TRUE                   ~ 0L
    ),

    # Any communicable disease
    any_comm_disease = case_when(
      hep_b_pos == 1 | hep_c_pos == 1 |
        hiv_pos == 1 | syphilis_pos == 1    ~ 1L,
      !is.na(hep_b_pos) & !is.na(hep_c_pos) &
        !is.na(hiv_pos) & !is.na(syphilis_pos) ~ 0L,
      TRUE                                  ~ NA_integer_
    ),

    # Any NCD
    any_ncd = case_when(
      hypertension == 1 | diabetes == 1               ~ 1L,
      !is.na(hypertension) & !is.na(diabetes)         ~ 0L,
      TRUE                                            ~ NA_integer_
    ),

    # HTN + DM comorbidity
    htn_dm_comorbid = case_when(
      hypertension == 1 & diabetes == 1               ~ 1L,
      !is.na(hypertension) & !is.na(diabetes)         ~ 0L,
      TRUE                                            ~ NA_integer_
    )
  )

cat("Data cleaning complete.\n")

## Data cleaning complete.

cat("Final dimensions:", nrow(df), "rows x", ncol(df), "columns\n")

## Final dimensions: 524 rows x 29 columns

cat("\nKey variable check:\n")

## 
## Key variable check:

cat("  sex values:     ", paste(unique(df$sex[!is.na(df$sex)]), collapse=", "), "\n")

##   sex values:      FEMALE, MALE

cat("  hep_b values:   ", paste(unique(df$hep_b[!is.na(df$hep_b)]), collapse=", "), "\n")

##   hep_b values:    NEGATIVE, POSITIVE

cat("  hiv values:     ", paste(unique(df$hiv[!is.na(df$hiv)]), collapse=", "), "\n")

##   hiv values:      NON REACTIVE, REACTIVE

cat("  syphilis values:", paste(unique(df$syphilis[!is.na(df$syphilis)]), collapse=", "), "\n")

##   syphilis values: REACTIVE, NON REACTIVE

4 Missing Data Analysis

miss_vars <- df %>%
  select(age, pulse_bpm, systolic, diastolic,
         glucose_mmol, hep_b, hep_c, hiv, syphilis)

miss_var_summary(miss_vars) %>%
  kable(caption = "Table M1: Missing Data Summary",
        col.names = c("Variable","N Missing","% Missing"),
        digits = 1) %>%
  kable_styling(bootstrap_options = c("striped","hover","condensed"),
                full_width = FALSE)

Table M1: Missing Data Summary
Variable	N Missing	% Missing
glucose_mmol	212	40.5
diastolic	130	24.8
pulse_bpm	129	24.6
systolic	129	24.6
age	6	1.15
hiv	4	0.763
hep_b	3	0.573
hep_c	3	0.573
syphilis	2	0.382

gg_miss_upset(miss_vars, nsets = 8, nintersects = 20)

Figure M1: Missing Data Pattern

vis_miss(miss_vars, sort_miss = TRUE) +
  labs(title = "Missing Data Heatmap",
       subtitle = "Dampong Medical Outreach, Ashanti Region, Ghana")

Figure M2: Missing Data Heatmap

mcar_result <- mcar_test(miss_vars)
cat("Little's MCAR Test:\n")

## Little's MCAR Test:

cat("  Chi-square:", round(mcar_result$statistic, 2), "\n")

##   Chi-square: 348.24

cat("  df:", mcar_result$df, "\n")

##   df: 60

cat("  p-value:", format.pval(mcar_result$p.value, digits = 4), "\n")

##   p-value: < 2.2e-16

if (mcar_result$p.value < 0.05) {
  cat("  Result: NOT missing completely at random (MAR likely).\n")
  cat("  Action: Multiple Imputation applied in Section 9.\n")
} else {
  cat("  Result: Cannot reject MCAR.\n")
}

##   Result: NOT missing completely at random (MAR likely).
##   Action: Multiple Imputation applied in Section 9.

df %>%
  group_by(sex) %>%
  summarise(
    n             = n(),
    miss_glucose  = sum(is.na(glucose_mmol)),
    pct_glucose   = round(100 * miss_glucose / n, 1),
    miss_bp       = sum(is.na(systolic)),
    pct_bp        = round(100 * miss_bp / n, 1),
    .groups       = "drop"
  ) %>%
  kable(caption = "Table M2: Missingness by Sex") %>%
  kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE)

Table M2: Missingness by Sex
sex	n	miss_glucose	pct_glucose	miss_bp	pct_bp
FEMALE	380	126	33.2	70	18.4
MALE	144	86	59.7	59	41.0

5 Descriptive Statistics

vars_t1 <- c("age","age_group","pulse_bpm","pulse_category",
             "systolic","diastolic","bp_category",
             "glucose_mmol","glucose_category",
             "hep_b_pos","hep_c_pos","hiv_pos","syphilis_pos",
             "hypertension","diabetes","any_comm_disease",
             "any_ncd","htn_dm_comorbid")

cat_vars <- c("age_group","pulse_category","bp_category","glucose_category",
              "hep_b_pos","hep_c_pos","hiv_pos","syphilis_pos",
              "hypertension","diabetes","any_comm_disease",
              "any_ncd","htn_dm_comorbid")

tab1 <- CreateTableOne(
  vars       = vars_t1,
  data       = df,
  factorVars = cat_vars,
  strata     = "sex",
  addOverall = TRUE
)

print(tab1, showAllLevels = TRUE, quote = FALSE,
      noSpaces = TRUE, printToggle = FALSE) %>%
  kable(caption = "Table 1: Baseline Characteristics by Sex") %>%
  kable_styling(bootstrap_options = c("striped","hover","condensed"),
                full_width = TRUE, font_size = 11)

Table 1: Baseline Characteristics by Sex
	level	Overall	FEMALE	MALE	p
n		524	380	144
age (mean (SD))		45.01 (24.78)	47.52 (23.39)	38.49 (27.11)	<0.001
age_group (%)	0-18	132 (25.5)	72 (19.3)	60 (41.7)	<0.001
	19-35	54 (10.4)	45 (12.0)	9 (6.2)
	36-50	92 (17.8)	72 (19.3)	20 (13.9)
	51-65	113 (21.8)	90 (24.1)	23 (16.0)
	65+	127 (24.5)	95 (25.4)	32 (22.2)
pulse_bpm (mean (SD))		80.99 (13.34)	81.46 (13.22)	79.28 (13.73)	0.182
pulse_category (%)	Bradycardia	11 (2.8)	8 (2.6)	3 (3.5)	0.857
	Normal	348 (88.1)	273 (88.1)	75 (88.2)
	Tachycardia	36 (9.1)	29 (9.4)	7 (8.2)
systolic (mean (SD))		136.17 (26.11)	136.23 (26.35)	135.93 (25.37)	0.925
diastolic (mean (SD))		76.97 (14.85)	77.18 (15.33)	76.19 (13.03)	0.585
bp_category (%)	Normal	106 (26.9)	83 (26.9)	23 (27.1)	0.719
	Elevated	52 (13.2)	41 (13.3)	11 (12.9)
	Stage 1 HTN	136 (34.5)	103 (33.3)	33 (38.8)
	Stage 2 HTN	100 (25.4)	82 (26.5)	18 (21.2)
glucose_mmol (mean (SD))		6.46 (2.58)	6.52 (2.65)	6.21 (2.27)	0.411
glucose_category (%)	Hypoglycaemia	7 (2.2)	5 (2.0)	2 (3.4)	0.459
	Normal	126 (40.4)	98 (38.6)	28 (48.3)
	Pre-diabetes	95 (30.4)	80 (31.5)	15 (25.9)
	Diabetes	84 (26.9)	71 (28.0)	13 (22.4)
hep_b_pos (%)	0	503 (96.5)	364 (96.3)	139 (97.2)	0.813
	1	18 (3.5)	14 (3.7)	4 (2.8)
hep_c_pos (%)	0	519 (99.6)	376 (99.5)	143 (100.0)	0.938
	1	2 (0.4)	2 (0.5)	0 (0.0)
hiv_pos (%)	0	515 (99.0)	374 (98.9)	141 (99.3)	1.000
	1	5 (1.0)	4 (1.1)	1 (0.7)
syphilis_pos (%)	0	517 (99.0)	376 (99.2)	141 (98.6)	0.896
	1	5 (1.0)	3 (0.8)	2 (1.4)
hypertension (%)	0	159 (40.3)	125 (40.3)	34 (40.0)	1.000
	1	236 (59.7)	185 (59.7)	51 (60.0)
diabetes (%)	0	228 (73.1)	183 (72.0)	45 (77.6)	0.488
	1	84 (26.9)	71 (28.0)	13 (22.4)
any_comm_disease (%)	0	490 (94.4)	355 (94.2)	135 (95.1)	0.852
	1	29 (5.6)	22 (5.8)	7 (4.9)
any_ncd (%)	0	81 (23.5)	68 (24.7)	13 (18.8)	0.383
	1	263 (76.5)	207 (75.3)	56 (81.2)
htn_dm_comorbid (%)	0	250 (81.4)	201 (80.4)	49 (86.0)	0.432
	1	57 (18.6)	49 (19.6)	8 (14.0)

df %>%
  select(sex, age, age_group, pulse_bpm, systolic, diastolic,
         glucose_mmol, bp_category, glucose_category,
         hypertension, diabetes, hep_b_pos, hep_c_pos,
         hiv_pos, syphilis_pos, any_comm_disease, any_ncd) %>%
  tbl_summary(
    by = sex,
    statistic = list(
      all_continuous()  ~ "{mean} ({sd}) [{min}, {max}]",
      all_categorical() ~ "{n} ({p}%)"
    ),
    missing_text = "Missing",
    label = list(
      age              ~ "Age (years)",
      age_group        ~ "Age Group",
      pulse_bpm        ~ "Pulse (bpm)",
      systolic         ~ "Systolic BP (mmHg)",
      diastolic        ~ "Diastolic BP (mmHg)",
      glucose_mmol     ~ "Blood Glucose (mmol/L)",
      bp_category      ~ "BP Category (ACC/AHA 2017)",
      glucose_category ~ "Glucose Category (WHO/ADA)",
      hypertension     ~ "Hypertension (Stage 1+2)",
      diabetes         ~ "Diabetes (≥7.0 mmol/L)",
      hep_b_pos        ~ "Hepatitis B Positive",
      hep_c_pos        ~ "Hepatitis C Positive",
      hiv_pos          ~ "HIV Reactive",
      syphilis_pos     ~ "Syphilis Reactive",
      any_comm_disease ~ "Any Communicable Disease",
      any_ncd          ~ "Any NCD (HTN or DM)"
    )
  ) %>%
  add_overall() %>%
  add_p(test = list(all_continuous()  ~ "wilcox.test",
                    all_categorical() ~ "chisq.test")) %>%
  bold_labels() %>%
  modify_caption("**Table 2: Baseline Characteristics — Dampong, Ghana (gtsummary)**")

**Table 2: Baseline Characteristics — Dampong, Ghana (gtsummary)**
Characteristic	Overall N = 524¹	FEMALE N = 380¹	MALE N = 144¹	p-value²
Age (years)	45 (25) [1, 97]	48 (23) [1, 97]	38 (27) [1, 92]	<0.001
Missing	6	6	0
Age Group				<0.001
0-18	132 (25%)	72 (19%)	60 (42%)
19-35	54 (10%)	45 (12%)	9 (6.3%)
36-50	92 (18%)	72 (19%)	20 (14%)
51-65	113 (22%)	90 (24%)	23 (16%)
65+	127 (25%)	95 (25%)	32 (22%)
Missing	6	6	0
Pulse (bpm)	81 (13) [48, 124]	81 (13) [48, 120]	79 (14) [53, 124]	0.15
Missing	129	70	59
Systolic BP (mmHg)	136 (26) [74, 245]	136 (26) [89, 245]	136 (25) [74, 222]	0.8
Missing	129	70	59
Diastolic BP (mmHg)	77 (15) [45, 135]	77 (15) [49, 135]	76 (13) [45, 112]	>0.9
Missing	130	71	59
Blood Glucose (mmol/L)	6.46 (2.58) [3.10, 24.30]	6.52 (2.65) [3.10, 24.30]	6.21 (2.27) [3.60, 15.40]	0.2
Missing	212	126	86
BP Category (ACC/AHA 2017)				0.7
Normal	106 (27%)	83 (27%)	23 (27%)
Elevated	52 (13%)	41 (13%)	11 (13%)
Stage 1 HTN	136 (35%)	103 (33%)	33 (39%)
Stage 2 HTN	100 (25%)	82 (27%)	18 (21%)
Missing	130	71	59
Glucose Category (WHO/ADA)				0.5
Hypoglycaemia	7 (2.2%)	5 (2.0%)	2 (3.4%)
Normal	126 (40%)	98 (39%)	28 (48%)
Pre-diabetes	95 (30%)	80 (31%)	15 (26%)
Diabetes	84 (27%)	71 (28%)	13 (22%)
Missing	212	126	86
Hypertension (Stage 1+2)	236 (60%)	185 (60%)	51 (60%)	>0.9
Missing	129	70	59
Diabetes (≥7.0 mmol/L)	84 (27%)	71 (28%)	13 (22%)	0.5
Missing	212	126	86
Hepatitis B Positive	18 (3.5%)	14 (3.7%)	4 (2.8%)	0.8
Missing	3	2	1
Hepatitis C Positive	2 (0.4%)	2 (0.5%)	0 (0%)	>0.9
Missing	3	2	1
HIV Reactive	5 (1.0%)	4 (1.1%)	1 (0.7%)	>0.9
Missing	4	2	2
Syphilis Reactive	5 (1.0%)	3 (0.8%)	2 (1.4%)	0.9
Missing	2	1	1
Any Communicable Disease	29 (5.6%)	22 (5.8%)	7 (4.9%)	0.9
Missing	5	3	2
Any NCD (HTN or DM)	263 (76%)	207 (75%)	56 (81%)	0.4
Missing	180	105	75
¹ Mean (SD) [Min, Max]; n (%)
² Wilcoxon rank sum test; Pearson’s Chi-squared test

6 Age Distribution

df %>%
  filter(!is.na(age)) %>%
  ggplot(aes(x = age, fill = sex, colour = sex)) +
  geom_histogram(aes(y = after_stat(density)),
                 bins = 30, alpha = 0.5, position = "identity") +
  geom_density(alpha = 0, linewidth = 1) +
  facet_wrap(~sex, ncol = 2) +
  scale_fill_manual(values   = c("FEMALE" = "#E76F51", "MALE" = "#264653")) +
  scale_colour_manual(values = c("FEMALE" = "#E76F51", "MALE" = "#264653")) +
  labs(title = "Age Distribution of Study Participants",
       subtitle = "Dampong, Ashanti Region, Ghana",
       x = "Age (years)", y = "Density") +
  theme_pubr() + theme(legend.position = "bottom")

Figure 1: Age Distribution by Sex

df %>%
  filter(!is.na(age_group)) %>%
  count(age_group, sex) %>%
  group_by(sex) %>%
  mutate(pct = 100 * n / sum(n)) %>%
  ggplot(aes(x = age_group, y = pct, fill = sex)) +
  geom_col(position = position_dodge(0.8), width = 0.7) +
  geom_text(aes(label = sprintf("%.1f%%\n(n=%d)", pct, n)),
            position = position_dodge(0.8), vjust = -0.3, size = 3.2) +
  scale_fill_manual(values = c("FEMALE" = "#E76F51", "MALE" = "#264653")) +
  labs(title = "Age Group Distribution by Sex",
       x = "Age Group", y = "Percentage (%)", fill = "Sex") +
  theme_pubr() + ylim(0, 40)

Figure 2: Age Group Distribution by Sex

age_vals <- df$age[!is.na(df$age)]
sw <- shapiro.test(sample(age_vals, min(5000, length(age_vals))))
cat("Shapiro-Wilk: W =", round(sw$statistic,4), ", p =", format.pval(sw$p.value), "\n")

## Shapiro-Wilk: W = 0.9451 , p = 0

7 Blood Pressure Analysis

df %>%
  filter(!is.na(systolic), !is.na(diastolic)) %>%
  ggplot(aes(x = diastolic, y = systolic, colour = bp_category)) +
  geom_point(alpha = 0.6, size = 2) +
  geom_hline(yintercept = c(130, 140), linetype="dashed",
             colour = c("orange","red")) +
  geom_vline(xintercept = c(80, 90),  linetype="dashed",
             colour = c("orange","red")) +
  scale_colour_manual(
    values   = c("Normal"="2EC4B6","Elevated"="#FFBF69",
                 "Stage 1 HTN"="#FF9F1C","Stage 2 HTN"="#E71D36"),
    na.value = "grey70"
  ) +
  labs(title  = "Systolic vs. Diastolic Blood Pressure",
       subtitle = "ACC/AHA 2017 Classification",
       x = "Diastolic BP (mmHg)", y = "Systolic BP (mmHg)",
       colour = "BP Category") +
  theme_pubr()

Figure 3: Systolic vs Diastolic BP

df %>%
  filter(!is.na(bp_category)) %>%
  count(bp_category, sex) %>%
  group_by(sex) %>%
  mutate(pct = 100 * n / sum(n)) %>%
  ggplot(aes(x = bp_category, y = pct, fill = sex)) +
  geom_col(position = position_dodge(0.8), width = 0.7) +
  geom_text(aes(label = sprintf("%.1f%%", pct)),
            position = position_dodge(0.8), vjust = -0.3, size = 3) +
  scale_fill_manual(values = c("FEMALE"="#E76F51","MALE"="#264653")) +
  labs(title="BP Category Distribution by Sex",
       x="BP Category", y="Percentage (%)", fill="Sex") +
  theme_pubr() + ylim(0, 65)

Figure 4: BP Category by Sex

df %>%
  filter(!is.na(hypertension), !is.na(age_group)) %>%
  group_by(age_group) %>%
  summarise(
    n     = n(),
    htn_n = sum(hypertension, na.rm = TRUE),
    prev  = 100 * htn_n / n,
    se    = sqrt(prev * (100 - prev) / n),
    lower = pmax(0,   prev - 1.96 * se),
    upper = pmin(100, prev + 1.96 * se),
    .groups = "drop"
  ) %>%
  ggplot(aes(x = age_group, y = prev, group = 1)) +
  geom_line(colour="#E71D36", linewidth=1.2) +
  geom_point(size=4, colour="#E71D36") +
  geom_errorbar(aes(ymin=lower, ymax=upper),
                width=0.2, colour="#E71D36") +
  geom_label(aes(label=sprintf("%.1f%%\n(n=%d)", prev, n)),
             vjust=-0.5, size=3.2) +
  labs(title="Hypertension Prevalence by Age Group",
       subtitle="With 95% Confidence Intervals",
       x="Age Group", y="Prevalence (%)") +
  theme_pubr() + ylim(0, 100)

Figure 5: Hypertension Prevalence by Age Group

bp_sex <- df %>% filter(!is.na(systolic), !is.na(sex))
w_sbp  <- wilcox.test(systolic  ~ sex, data = bp_sex, conf.int = TRUE)
w_dbp  <- wilcox.test(diastolic ~ sex, data = bp_sex, conf.int = TRUE)

bp_sex %>%
  group_by(sex) %>%
  summarise(n      = n(),
            mean   = round(mean(systolic),1),
            sd     = round(sd(systolic),1),
            median = median(systolic),
            IQR    = IQR(systolic),
            htn_n  = sum(hypertension, na.rm=TRUE),
            htn_pct= round(100*htn_n/n,1),
            .groups="drop") %>%
  kable(caption="Systolic BP and Hypertension by Sex") %>%
  kable_styling(bootstrap_options=c("striped","hover"), full_width=FALSE)

Systolic BP and Hypertension by Sex
sex	n	mean	sd	median	IQR	htn_n	htn_pct
FEMALE	310	136.2	26.3	131	32	185	59.7
MALE	85	135.9	25.4	136	33	51	60.0

cat("Wilcoxon (SBP): W =", w_sbp$statistic,
    ", p =", format.pval(w_sbp$p.value), "\n")

## Wilcoxon (SBP): W = 12970.5 , p = 0.82681

cat("Wilcoxon (DBP): W =", w_dbp$statistic,
    ", p =", format.pval(w_dbp$p.value), "\n")

## Wilcoxon (DBP): W = 13106 , p = 0.97769

bp_age <- df %>% filter(!is.na(systolic), !is.na(age))
cor_r  <- cor.test(bp_age$age, bp_age$systolic, method="spearman")
cat("Spearman rho =", round(cor_r$estimate,3),
    ", p =", format.pval(cor_r$p.value), "\n")

## Spearman rho = 0.428 , p = < 2.22e-16

ggscatter(bp_age, x="age", y="systolic",
          add="reg.line", conf.int=TRUE,
          add.params=list(colour="#E71D36"),
          colour="#264653", alpha=0.4,
          title="Age vs Systolic Blood Pressure",
          xlab="Age (years)", ylab="Systolic BP (mmHg)") +
  stat_cor(method="spearman", label.x=5, label.y=230)

Figure 6: Age vs Systolic BP

8 Blood Glucose and Diabetes

df %>%
  filter(!is.na(glucose_mmol)) %>%
  ggplot(aes(x = glucose_mmol, fill = glucose_category)) +
  geom_histogram(bins=30, colour="white", alpha=0.85) +
  geom_vline(xintercept=5.6, linetype="dashed", colour="orange", linewidth=1) +
  geom_vline(xintercept=7.0, linetype="dashed", colour="red",    linewidth=1) +
  annotate("text", x=5.9, y=25, label="Pre-DM\n5.6",
           colour="orange", size=3.5) +
  annotate("text", x=7.3, y=25, label="DM\n7.0",
           colour="red",    size=3.5) +
  scale_fill_manual(
    values   = c("Hypoglycaemia"="#0077B6","Normal"="#2EC4B6",
                 "Pre-diabetes"="#FFBF69","Diabetes"="#E71D36"),
    na.value = "grey70"
  ) +
  facet_wrap(~sex, ncol=2) +
  labs(title="Blood Glucose Distribution by Sex",
       subtitle="WHO/ADA Diagnostic Thresholds Shown",
       x="Blood Glucose (mmol/L)", y="Count",
       fill="Category") +
  theme_pubr()

Figure 7: Blood Glucose Distribution by Sex

dm_sex <- df %>%
  filter(!is.na(diabetes), !is.na(sex)) %>%
  group_by(sex) %>%
  summarise(n=n(), dm_n=sum(diabetes,na.rm=TRUE),
            prev=round(100*dm_n/n,1),
            se  =sqrt(prev*(100-prev)/n),
            lower=round(pmax(0,   prev-1.96*se),1),
            upper=round(pmin(100, prev+1.96*se),1),
            .groups="drop")

dm_sex %>%
  kable(caption="Diabetes Prevalence by Sex",
        col.names=c("Sex","N","DM (n)","Prev (%)","SE","95% CI Low","95% CI High")) %>%
  kable_styling(bootstrap_options=c("striped","hover"), full_width=FALSE)

Diabetes Prevalence by Sex
Sex	N	DM (n)	Prev (%)	SE	95% CI Low	95% CI High
FEMALE	254	71	28.0	2.817270	22.5	33.5
MALE	58	13	22.4	5.474455	11.7	33.1

dm_chi <- chisq.test(table(df$sex, df$diabetes))
cat("Chi-square: X2 =", round(dm_chi$statistic,3),
    ", p =", format.pval(dm_chi$p.value), "\n")

## Chi-square: X2 = 0.482 , p = 0.48766

df %>%
  filter(!is.na(diabetes), !is.na(age_group)) %>%
  group_by(age_group) %>%
  summarise(n=n(), dm_n=sum(diabetes,na.rm=TRUE),
            prev=100*dm_n/n, se=sqrt(prev*(100-prev)/n),
            lower=pmax(0,prev-1.96*se), upper=pmin(100,prev+1.96*se),
            .groups="drop") %>%
  ggplot(aes(x=age_group, y=prev, group=1)) +
  geom_line(colour="#E76F51", linewidth=1.2) +
  geom_point(size=4, colour="#E76F51") +
  geom_errorbar(aes(ymin=lower,ymax=upper), width=0.2, colour="#E76F51") +
  geom_label(aes(label=sprintf("%.1f%%",prev)), vjust=-0.5, size=3.5) +
  labs(title="Diabetes Prevalence by Age Group",
       x="Age Group", y="Prevalence (%)") +
  theme_pubr() + ylim(0,60)

Figure 8: Diabetes Prevalence by Age Group

9 Communicable Disease Analysis

# Compute Wilson 95% CI
wilson_ci <- function(x, n) {
  z  <- qnorm(0.975)
  p  <- x / n
  lo <- (p + z^2/(2*n) - z*sqrt(p*(1-p)/n + z^2/(4*n^2))) / (1 + z^2/n)
  hi <- (p + z^2/(2*n) + z*sqrt(p*(1-p)/n + z^2/(4*n^2))) / (1 + z^2/n)
  c(p=p, lower=lo, upper=hi)
}

comm_prev <- tibble(
  Disease    = c("Hepatitis B (HBsAg)","Hepatitis C (HCV)",
                 "HIV (Retroviral)","Syphilis (VDRL/RPR)"),
  N_Tested   = c(sum(!is.na(df$hep_b_pos)), sum(!is.na(df$hep_c_pos)),
                 sum(!is.na(df$hiv_pos)),   sum(!is.na(df$syphilis_pos))),
  N_Positive = c(sum(df$hep_b_pos,    na.rm=TRUE),
                 sum(df$hep_c_pos,    na.rm=TRUE),
                 sum(df$hiv_pos,      na.rm=TRUE),
                 sum(df$syphilis_pos, na.rm=TRUE))
) %>%
  rowwise() %>%
  mutate(
    ci       = list(wilson_ci(N_Positive, N_Tested)),
    Prev_pct = round(100 * ci[["p"]],     2),
    CI_Lo    = round(100 * ci[["lower"]], 2),
    CI_Hi    = round(100 * ci[["upper"]], 2),
    CI_95    = paste0("(", CI_Lo, "–", CI_Hi, ")")
  ) %>%
  ungroup() %>%
  select(Disease, N_Tested, N_Positive, Prev_pct, CI_95)

comm_prev %>%
  kable(caption="Table 3: Communicable Disease Prevalence — Dampong, Ghana",
        col.names=c("Disease","N Tested","N Positive","Prevalence (%)","95% CI")) %>%
  kable_styling(bootstrap_options=c("striped","hover"), full_width=FALSE)

Table 3: Communicable Disease Prevalence — Dampong, Ghana
Disease	N Tested	N Positive	Prevalence (%)	95% CI
Hepatitis B (HBsAg)	521	18	3.45	(2.2–5.39)
Hepatitis C (HCV)	521	2	0.38	(0.11–1.39)
HIV (Retroviral)	520	5	0.96	(0.41–2.23)
Syphilis (VDRL/RPR)	522	5	0.96	(0.41–2.22)

comm_prev %>%
  ggplot(aes(x=reorder(Disease, Prev_pct), y=Prev_pct, fill=Disease)) +
  geom_col(width=0.6, show.legend=FALSE) +
  geom_text(aes(label=paste0(Prev_pct,"% (n=",N_Positive,")")),
            hjust=-0.1, size=3.8) +
  scale_fill_brewer(palette="Set2") +
  coord_flip() +
  labs(title="Communicable Disease Prevalence",
       subtitle="Dampong, Ashanti Region, Ghana",
       x=NULL, y="Prevalence (%)") +
  theme_pubr() + ylim(0,6)

Figure 9: Communicable Disease Prevalence

for (disease in c("hep_b_pos","hep_c_pos","hiv_pos","syphilis_pos")) {
  tbl <- table(df$sex, df[[disease]])
  ft  <- fisher.test(tbl)
  cat(disease, "— Fisher exact: OR =",
      round(ft$estimate, 3), ", p =",
      format.pval(ft$p.value), "\n")
}

## hep_b_pos — Fisher exact: OR = 0.749 , p = 0.79032 
## hep_c_pos — Fisher exact: OR = 0 , p = 1 
## hiv_pos — Fisher exact: OR = 0.664 , p = 1 
## syphilis_pos — Fisher exact: OR = 1.776 , p = 0.61809

10 Comorbidity Analysis

df_ncd <- df %>% filter(!is.na(hypertension), !is.na(diabetes))

comorbid <- df_ncd %>%
  count(hypertension, diabetes) %>%
  mutate(
    Category = case_when(
      hypertension==0 & diabetes==0 ~ "No HTN, No DM",
      hypertension==1 & diabetes==0 ~ "HTN only",
      hypertension==0 & diabetes==1 ~ "DM only",
      hypertension==1 & diabetes==1 ~ "HTN + DM (Comorbid)"
    ),
    pct = round(100*n/sum(n),1)
  ) %>%
  arrange(desc(n))

comorbid %>%
  select(Category, n, pct) %>%
  kable(caption="Table 4: NCD Comorbidity Patterns",
        col.names=c("Category","N","(%)")) %>%
  kable_styling(bootstrap_options=c("striped","hover"), full_width=FALSE)

Table 4: NCD Comorbidity Patterns
Category	N	(%)
HTN only	142	46.3
No HTN, No DM	81	26.4
HTN + DM (Comorbid)	57	18.6
DM only	27	8.8

comorbid %>%
  ggplot(aes(x="", y=pct, fill=Category)) +
  geom_col(width=1, colour="white") +
  coord_polar("y") +
  geom_text(aes(label=paste0(pct,"%")),
            position=position_stack(vjust=0.5), colour="white", size=4) +
  scale_fill_manual(
    values=c("No HTN, No DM"="#2EC4B6","HTN only"="#FF9F1C",
             "DM only"="#E76F51","HTN + DM (Comorbid)"="#E71D36")
  ) +
  labs(title="NCD Comorbidity Distribution", fill=NULL) +
  theme_void() +
  theme(legend.position="right",
        plot.title=element_text(face="bold", hjust=0.5))

Figure 10: NCD Comorbidity Pie Chart

11 Correlation Matrix

corr_df <- df %>%
  select(age, pulse_bpm, systolic, diastolic, glucose_mmol) %>%
  drop_na()

corr_mat <- cor(corr_df, method="spearman", use="complete.obs")

ggcorrplot(corr_mat, method="circle", type="lower",
           lab=TRUE, lab_size=3.5,
           colors=c("#0077B6","white","#E71D36"),
           title="Spearman Correlation — Clinical Variables",
           ggtheme=theme_pubr())

Figure 11: Spearman Correlation Matrix

12 Logistic Regression

12.1 Hypertension

df_logit <- df %>%
  filter(!is.na(hypertension), !is.na(age),
         !is.na(sex), !is.na(glucose_mmol), !is.na(pulse_bpm)) %>%
  mutate(hypertension = as.integer(hypertension))

# Multivariable model
mv_htn <- glm(hypertension ~ age + sex + glucose_mmol + pulse_bpm,
              data   = df_logit,
              family = binomial(link="logit"))

tidy(mv_htn, exponentiate=TRUE, conf.int=TRUE) %>%
  filter(term != "(Intercept)") %>%
  kable(caption="Table 5: Multivariable Logistic Regression — Hypertension",
        digits=3,
        col.names=c("Predictor","OR","SE","Z","p-value","95% CI Low","95% CI High")) %>%
  kable_styling(bootstrap_options=c("striped","hover"), full_width=FALSE)

Table 5: Multivariable Logistic Regression — Hypertension
Predictor	OR	SE	Z	p-value	95% CI Low	95% CI High
age	1.048	0.009	5.451	0.000	1.031	1.066
sexMALE	0.993	0.334	-0.022	0.983	0.521	1.939
glucose_mmol	1.017	0.054	0.311	0.755	0.920	1.141
pulse_bpm	0.996	0.010	-0.452	0.651	0.976	1.015

# Hosmer-Lemeshow
hl <- hoslem.test(mv_htn$y, fitted(mv_htn), g=10)
cat("Hosmer-Lemeshow: X2 =", round(hl$statistic,3),
    ", p =", format.pval(hl$p.value), "\n")

## Hosmer-Lemeshow: X2 = 16.682 , p = 0.03359

# VIF
cat("VIF:\n"); print(vif(mv_htn))

## VIF:

##          age          sex glucose_mmol    pulse_bpm 
##     1.061836     1.016704     1.035811     1.042430

# Nagelkerke R2
ll_null <- logLik(glm(hypertension~1, data=df_logit, family=binomial))
ll_full <- logLik(mv_htn)
n       <- nobs(mv_htn)
cs  <- 1 - exp(2*(as.numeric(ll_null)-as.numeric(ll_full))/n)
nk  <- cs/(1-exp(2*as.numeric(ll_null)/n))
cat("Nagelkerke R2:", round(nk,3), "\n")

## Nagelkerke R2: 0.165

roc_htn <- roc(df_logit$hypertension, fitted(mv_htn))
plot(roc_htn, col="#E71D36", lwd=2,
     main=paste("ROC — Hypertension | AUC =", round(auc(roc_htn),3)))
abline(a=0, b=1, lty=2, col="grey60")

Figure 12: ROC Curve — Hypertension Model

cat("AUC:", round(auc(roc_htn),3), "\n")

## AUC: 0.706

12.2 Diabetes

df_dm <- df %>%
  filter(!is.na(diabetes), !is.na(age), !is.na(sex),
         !is.na(systolic)) %>%
  mutate(diabetes = as.integer(diabetes))

mv_dm <- glm(diabetes ~ age + sex + systolic,
             data=df_dm, family=binomial(link="logit"))

tidy(mv_dm, exponentiate=TRUE, conf.int=TRUE) %>%
  filter(term != "(Intercept)") %>%
  kable(caption="Table 6: Multivariable Logistic Regression — Diabetes",
        digits=3) %>%
  kable_styling(bootstrap_options=c("striped","hover"), full_width=FALSE)

Table 6: Multivariable Logistic Regression — Diabetes
term	estimate	std.error	statistic	p.value	conf.low	conf.high
age	1.020	0.009	2.253	0.024	1.003	1.039
sexMALE	0.686	0.354	-1.063	0.288	0.332	1.342
systolic	1.012	0.005	2.293	0.022	1.002	1.023

roc_dm <- roc(df_dm$diabetes, fitted(mv_dm))
cat("Diabetes model AUC:", round(auc(roc_dm),3), "\n")

## Diabetes model AUC: 0.634

13 Population Burden Estimation

pop_dampong <- 12000
pop_ashanti <- 5800000

conditions <- list(
  list(label="Hypertension (Stage 1/2)",
       x=sum(df$hypertension,  na.rm=TRUE), n=sum(!is.na(df$hypertension))),
  list(label="Diabetes (≥7.0 mmol/L)",
       x=sum(df$diabetes,      na.rm=TRUE), n=sum(!is.na(df$diabetes))),
  list(label="Pre-diabetes",
       x=sum(df$glucose_mmol>=5.6 & df$glucose_mmol<7.0, na.rm=TRUE),
       n=sum(!is.na(df$glucose_mmol))),
  list(label="HTN + DM Comorbidity",
       x=sum(df$htn_dm_comorbid, na.rm=TRUE), n=sum(!is.na(df$htn_dm_comorbid))),
  list(label="Hepatitis B",
       x=sum(df$hep_b_pos,    na.rm=TRUE), n=sum(!is.na(df$hep_b_pos))),
  list(label="HIV",
       x=sum(df$hiv_pos,      na.rm=TRUE), n=sum(!is.na(df$hiv_pos))),
  list(label="Syphilis",
       x=sum(df$syphilis_pos, na.rm=TRUE), n=sum(!is.na(df$syphilis_pos))),
  list(label="Hepatitis C",
       x=sum(df$hep_c_pos,    na.rm=TRUE), n=sum(!is.na(df$hep_c_pos)))
)

pop_table <- map_dfr(conditions, function(cond) {
  ci <- wilson_ci(cond$x, cond$n)
  tibble(
    Condition    = cond$label,
    N_Tested     = cond$n,
    N_Positive   = cond$x,
    Prev_pct     = round(100*ci["p"],    2),
    CI_Lo        = round(100*ci["lower"],2),
    CI_Hi        = round(100*ci["upper"],2),
    Est_Dampong  = scales::comma(round(pop_dampong * ci["p"])),
    Est_Ashanti  = scales::comma(round(pop_ashanti * ci["p"]))
  )
})

pop_table %>%
  select(Condition, N_Tested, N_Positive, Prev_pct, CI_Lo, CI_Hi,
         Est_Dampong, Est_Ashanti) %>%
  kable(caption="Table 7: Population-Level Disease Burden Estimates",
        col.names=c("Condition","N Tested","N Positive","Prev (%)","CI Low","CI High",
                    "Est. Dampong","Est. Ashanti")) %>%
  kable_styling(bootstrap_options=c("striped","hover","condensed"),
                full_width=TRUE, font_size=11) %>%
  footnote(general=paste0(
    "Dampong est. population ≈ 12,000; Ashanti Region ≈ 5,800,000 (GSS 2021). ",
    "Wilson method 95% CI."))

Table 7: Population-Level Disease Burden Estimates
Condition	N Tested	N Positive	Prev (%)	CI Low	CI High	Est. Dampong	Est. Ashanti
Hypertension (Stage 1/2)	395	236	59.75	54.84	64.47	7,170	3,465,316
Diabetes (≥7.0 mmol/L)	312	84	26.92	22.30	32.10	3,231	1,561,538
Pre-diabetes	312	95	30.45	25.61	35.77	3,654	1,766,026
HTN + DM Comorbidity	307	57	18.57	14.62	23.30	2,228	1,076,873
Hepatitis B	521	18	3.45	2.20	5.39	415	200,384
HIV	520	5	0.96	0.41	2.23	115	55,769
Syphilis	522	5	0.96	0.41	2.22	115	55,556
Hepatitis C	521	2	0.38	0.11	1.39	46	22,265
Note:
Dampong est. population ≈ 12,000; Ashanti Region ≈ 5,800,000 (GSS 2021). Wilson method 95% CI.

pop_table %>%
  mutate(Type=if_else(Condition %in% c("Hypertension (Stage 1/2)",
                                        "Diabetes (≥7.0 mmol/L)",
                                        "Pre-diabetes","HTN + DM Comorbidity"),
                      "Non-Communicable","Communicable")) %>%
  ggplot(aes(x=Prev_pct, y=reorder(Condition,Prev_pct),
             colour=Type, shape=Type)) +
  geom_point(size=5) +
  geom_errorbarh(aes(xmin=CI_Lo, xmax=CI_Hi), height=0.3, linewidth=1) +
  scale_colour_manual(values=c("Non-Communicable"="#E71D36",
                                "Communicable"="#0077B6")) +
  labs(title="Disease Prevalence — Dampong, Ashanti Region, Ghana",
       subtitle="With 95% Wilson Confidence Intervals",
       x="Prevalence (%)", y=NULL,
       colour="Disease Type", shape="Disease Type") +
  theme_pubr() + theme(legend.position="bottom")

Figure 13: Prevalence Forest Plot

14 Multiple Imputation Sensitivity Analysis

# ── Include hypertension and diabetes in the imputation dataset ───────────────
# Both must be present in mice_input so that with(mice_out, glm(...)) can
# find them inside each of the 20 imputed datasets.
mice_input <- df %>%
  select(sex, age, pulse_bpm, systolic, diastolic,
         glucose_mmol, hep_b_pos, hep_c_pos, hiv_pos, syphilis_pos,
         hypertension, diabetes) %>%          # ← these two are required
  mutate(
    sex          = factor(sex),
    hypertension = as.integer(hypertension),  # binary → logreg method
    diabetes     = as.integer(diabetes)       # binary → logreg method
  )

# Set imputation method per variable type
# pmm    = predictive mean matching (continuous/count)
# logreg = logistic regression (binary 0/1)
# ""     = do not impute (already complete)
meth <- make.method(mice_input)
meth["hypertension"] <- "logreg"
meth["diabetes"]     <- "logreg"

mice_out <- mice(mice_input,
                 m         = 20,
                 maxit     = 50,
                 method    = meth,
                 seed      = 2024,
                 printFlag = FALSE)

cat("MICE complete.\n")

## MICE complete.

# Pool logistic regression results across 20 imputed datasets
fit_imp <- with(mice_out,
                glm(hypertension ~ age + sex + glucose_mmol + pulse_bpm,
                    family = binomial))

pooled <- pool(fit_imp)

summary(pooled, conf.int = TRUE, exponentiate = TRUE) %>%
  filter(term != "(Intercept)") %>%
  kable(caption = "Table 8: Pooled MI Results — Hypertension (20 imputations)",
        digits  = 3) %>%
  kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE)

Table 8: Pooled MI Results — Hypertension (20 imputations)
	term	estimate	std.error	statistic	df	p.value	2.5 %	97.5 %	conf.low	conf.high
2	age	1.036	0.006	5.695	54.562	0.000	1.023	1.049	1.023	1.049
3	sexMALE	0.875	0.267	-0.499	119.311	0.619	0.515	1.486	0.515	1.486
4	glucose_mmol	1.056	0.053	1.026	88.589	0.308	0.950	1.173	0.950	1.173
5	pulse_bpm	1.006	0.008	0.712	253.702	0.477	0.990	1.022	0.990	1.022

15 Combined Summary Panel

p1 <- df %>% filter(!is.na(bp_category)) %>%
  count(bp_category) %>% mutate(pct=100*n/sum(n)) %>%
  ggplot(aes(x=reorder(bp_category,pct), y=pct, fill=bp_category)) +
  geom_col(show.legend=FALSE) +
  geom_text(aes(label=paste0(round(pct,1),"%")), hjust=-0.1, size=3.5) +
  coord_flip() +
  scale_fill_manual(values=c("Normal"="#2EC4B6","Elevated"="#FFBF69",
                              "Stage 1 HTN"="#FF9F1C","Stage 2 HTN"="#E71D36")) +
  labs(title="BP Categories", x=NULL, y="%") +
  theme_pubr() + ylim(0,55)

p2 <- df %>% filter(!is.na(glucose_category)) %>%
  count(glucose_category) %>% mutate(pct=100*n/sum(n)) %>%
  ggplot(aes(x=reorder(glucose_category,pct), y=pct, fill=glucose_category)) +
  geom_col(show.legend=FALSE) +
  geom_text(aes(label=paste0(round(pct,1),"%")), hjust=-0.1, size=3.5) +
  coord_flip() +
  scale_fill_manual(values=c("Hypoglycaemia"="#0077B6","Normal"="#2EC4B6",
                              "Pre-diabetes"="#FFBF69","Diabetes"="#E71D36")) +
  labs(title="Glucose Categories", x=NULL, y="%") +
  theme_pubr() + ylim(0,50)

p3 <- comm_prev %>%
  ggplot(aes(x=reorder(Disease,Prev_pct), y=Prev_pct, fill=Disease)) +
  geom_col(show.legend=FALSE) +
  geom_text(aes(label=paste0(Prev_pct,"%")), hjust=-0.1, size=3.5) +
  coord_flip() +
  scale_fill_brewer(palette="Set2") +
  labs(title="Communicable Diseases", x=NULL, y="%") +
  theme_pubr() + ylim(0,6)

p4 <- df %>% filter(!is.na(age_group), !is.na(hypertension)) %>%
  group_by(age_group) %>%
  summarise(prev=100*mean(hypertension,na.rm=TRUE), .groups="drop") %>%
  ggplot(aes(x=age_group, y=prev, fill=age_group)) +
  geom_col(show.legend=FALSE) +
  scale_fill_viridis_d() +
  labs(title="HTN by Age Group", x="Age Group", y="%") +
  theme_pubr()

(p1 + p2) / (p3 + p4) +
  plot_annotation(
    title    = "Disease Burden in Dampong, Ashanti Region, Ghana",
    subtitle = "Community Medical Outreach Survey — N = 524",
    theme    = theme(
      plot.title    = element_text(face="bold", size=16, hjust=0.5),
      plot.subtitle = element_text(hjust=0.5)
    )
  )

Figure 14: Summary Panel

16 Session Information

sessionInfo()

## R version 4.6.0 (2026-04-24)
## Platform: aarch64-apple-darwin23
## Running under: macOS Tahoe 26.5
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.6/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.6/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.1
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: America/New_York
## tzcode source: internal
## 
## attached base packages:
## [1] grid      stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] ggmosaic_0.4.0          rmarkdown_2.31          knitr_1.51             
##  [4] kableExtra_1.4.0        scales_1.4.0            patchwork_1.3.2        
##  [7] ggcorrplot_0.1.4.1      rstatix_0.7.3           nortest_1.0-4          
## [10] psych_2.6.5             Hmisc_5.2-5             nnet_7.3-20            
## [13] MASS_7.3-65             ResourceSelection_0.3-6 pROC_1.19.0.1          
## [16] broom_1.0.13            srvyr_1.3.1             survey_4.5             
## [19] epiR_2.0.93             survival_3.8-6          epitools_0.5-10.1      
## [22] lme4_2.0-1              Matrix_1.7-5            car_3.1-5              
## [25] carData_3.0-6           corrplot_0.95           ggpubr_0.6.3           
## [28] gt_1.3.0                flextable_0.9.11        gtsummary_2.5.0        
## [31] tableone_0.13.2         VIM_7.0.0               colorspace_2.1-2       
## [34] mice_3.19.0             naniar_1.1.0            skimr_2.2.2            
## [37] janitor_2.2.1           lubridate_1.9.5         forcats_1.0.1          
## [40] stringr_1.6.0           dplyr_1.2.1             purrr_1.2.2            
## [43] readr_2.2.0             tidyr_1.3.2             tibble_3.3.1           
## [46] ggplot2_4.0.3.9000      tidyverse_2.0.0         readxl_1.5.0           
## 
## loaded via a namespace (and not attached):
##   [1] splines_4.6.0           norm_1.0-11.1           palmerpenguins_0.1.1   
##   [4] BiasedUrn_2.0.12        cellranger_1.1.0        rpart_4.1.27           
##   [7] lifecycle_1.0.5         sf_1.1-1                Rdpack_2.6.6           
##  [10] globals_0.19.1          lattice_0.22-9          backports_1.5.1        
##  [13] magrittr_2.0.5          vcd_1.4-13              plotly_4.12.0          
##  [16] sass_0.4.10             jquerylib_0.1.4         yaml_2.3.12            
##  [19] otel_0.2.0              zip_2.3.3               askpass_1.2.1          
##  [22] sp_2.2-1                DBI_1.3.0               minqa_1.2.8            
##  [25] RColorBrewer_1.1-3      abind_1.4-8             mlr3misc_0.21.0        
##  [28] labelled_2.16.0         paradox_1.0.1           gdtools_0.5.1          
##  [31] ggrepel_0.9.8           listenv_0.10.1          mlr3learners_0.14.0    
##  [34] cards_0.8.0             units_1.0-1             cardx_0.3.2            
##  [37] parallelly_1.47.0       commonmark_2.0.0        svglite_2.2.2          
##  [40] codetools_0.2-20        xml2_1.5.2              tidyselect_1.2.1       
##  [43] shape_1.4.6.1           farver_2.1.2            stats4_4.6.0           
##  [46] base64enc_0.1-6         jsonlite_2.0.0          e1071_1.7-17           
##  [49] mitml_0.4-5             Formula_1.2-5           iterators_1.0.14       
##  [52] systemfonts_1.3.2       foreach_1.5.2           tools_4.6.0            
##  [55] bbotk_1.10.0            ragg_1.5.2              Rcpp_1.1.1-1.1         
##  [58] glue_1.8.1              mnormt_2.1.2            gridExtra_2.3          
##  [61] pan_1.9                 mgcv_1.9-4              laeken_0.5.3           
##  [64] xfun_0.57               ranger_0.18.0           mlr3_1.6.0             
##  [67] withr_3.0.2             fastmap_1.2.0           mitools_2.4            
##  [70] boot_1.3-32             openssl_2.4.1           litedown_0.9           
##  [73] digest_0.6.39           timechange_0.4.0        R6_2.6.1               
##  [76] mlr3tuning_1.6.0        visdat_0.6.0            textshaping_1.0.5      
##  [79] markdown_2.0            UpSetR_1.4.1            utf8_1.2.6             
##  [82] generics_0.1.4          fontLiberation_0.1.0    data.table_1.18.4      
##  [85] robustbase_0.99-7       class_7.3-23            httr_1.4.8             
##  [88] htmlwidgets_1.6.4       pkgconfig_2.0.3         gtable_0.3.6           
##  [91] mlr3pipelines_0.11.0    lmtest_0.9-40           S7_0.2.2               
##  [94] htmltools_0.5.9         fontBitstreamVera_0.1.1 reformulas_0.4.4       
##  [97] lgr_0.5.2               snakecase_0.11.1        rstudioapi_0.18.0      
## [100] reshape2_1.4.5          tzdb_0.5.0              uuid_1.2-2             
## [103] checkmate_2.3.4         nlme_3.1-169            nloptr_2.2.1           
## [106] repr_1.1.7              proxy_0.4-29            cachem_1.1.0           
## [109] zoo_1.8-15              KernSmooth_2.23-26      parallel_4.6.0         
## [112] foreign_0.8-91          pillar_1.11.1           vctrs_0.7.3            
## [115] jomo_2.7-6              cluster_2.1.8.2         htmlTable_2.5.0        
## [118] evaluate_1.0.5          cli_3.6.6               compiler_4.6.0         
## [121] rlang_1.2.0             crayon_1.5.3            ggsignif_0.6.4         
## [124] labeling_0.4.3          classInt_0.4-11         plyr_1.8.9             
## [127] fs_2.1.0                stringi_1.8.7           pander_0.6.6           
## [130] viridisLite_0.4.3       lazyeval_0.2.3          productplots_0.1.2     
## [133] glmnet_5.0              fontquiver_0.2.1        hms_1.1.4              
## [136] future_1.70.0           haven_2.5.5             rbibutils_2.4.1        
## [139] bslib_0.11.0            DEoptimR_1.1-4          officer_0.7.5

Emmanuel Nana Arko, DrPH(s), MPH, PA-C — University at Albany, SUNY
May 28, 2026

Prevalence of Communicable and Non-Communicable Diseases Among Rural Residents of Dampong, Ashanti Region, Ghana: A Cross-Sectional Epidemiological Study

Emmanuel Nana Arko, DrPH(s), MPH, PA-C

May 28, 2026