Rosabel

library(ggplot2)
library(tidyr)
library(dplyr)

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
# --- your data (unchanged) ---
hpv <- tribble(
  ~HPV_Type,     ~Single_infection, ~Multiple_infection,
  "HPV 42 LR",   0.05, 0.10,
  "HPV 16 HR",   0.025, 0.10,
  "HPV 18 HR",   0.075, 0.05,
  "HPV 68 HR",   0.075, 0.05,
  "HPV 06 LR",    0.05, 0.05,
  "HPV 44 LR",   0.05, 0.05,
  "HPV 35 HR",   0.025, 0.075,
  "HPV 52 HR",   0.02, 0.07,
  "HPV 59 HR",   0, 0.10,
  "HPV 82 HR",   0.025, 0.05,
  "HPV 43 LR",   0, 0.05,
  "HPV 54 LR",   0, 0.05,
  "HPV 61 LR",   0.025, 0.025,
  "HPV 70 LR",   0.025, 0.025,
  "HPV 31 HR",   0.025, 0.025,
  "HPV 51 HR",   0.025, 0.025,
  "HPV 53 HR",   0, 0.05,
  "HPV 66 HR",   0, 0.05,
  "HPV 69 HR",   0.025, 0.025,
  "HPV 73 HR",   0, 0.05,
  "HPV 11 LR",   0, 0.025,
  "HPV 40 LR",   0, 0.025,
  "HPV 33 HR",   0, 0.025,
  "HPV 39 HR",   0.025, 0,
  "HPV 45 HR",   0, 0.025,
  "HPV 56 HR",   0.025, 0,
  "HPV 58 HR",   0, 0.025
)

# Long format + Risk tag + clean label + numeric ID
hpv_long <- hpv %>%
  pivot_longer(c(Single_infection, Multiple_infection),
               names_to = "Infection_Type", values_to = "Proportion") %>%
  mutate(
    Risk = ifelse(grepl("HR$", HPV_Type), "High-risk", "Low-risk"),
    HPV_Type_clean = gsub(" (LR|HR)$", "", HPV_Type),
    HPV_Number = as.numeric(sub("HPV\\s*", "", HPV_Type_clean))
  )

# ---- ORDER: Low-risk (desc numeric), then High-risk (desc numeric) ----
order_df <- hpv_long %>%
  distinct(HPV_Type_clean, Risk, HPV_Number) %>%
  mutate(Risk = factor(Risk, levels = c("Low-risk", "High-risk"))) %>%
  arrange(Risk, desc(HPV_Number))

hpv_long <- hpv_long %>%
  mutate(HPV_Type_clean = factor(HPV_Type_clean, levels = order_df$HPV_Type_clean))

# counts for split & label positions
n_low  <- sum(order_df$Risk == "Low-risk")
n_high <- sum(order_df$Risk == "High-risk")
low_center  <- n_low / 2
high_center <- n_low + n_high / 2
y_ann <- max(hpv_long$Proportion, na.rm = TRUE) * 1.05

# ---- Plot ----
ggplot(hpv_long, aes(x = HPV_Type_clean, y = Proportion*100, fill = Infection_Type)) +
  geom_bar(stat = "identity", width = 0.9) +
  geom_vline(xintercept = n_low + 0.5, linetype = "dashed", linewidth = 0.6, alpha = 0.7) +
  annotate("text", x = low_center,  y = 16, label = "Low-risk",  fontface = "bold", color = "black") +
  annotate("text", x = high_center, y = 16, label = "High-risk", fontface = "bold", color = "black") +
  scale_fill_manual(
    values = c("Multiple_infection" = "#298c8c", "Single_infection" = "#f1a226"),
    breaks = c("Multiple_infection", "Single_infection"),
    labels = c("Multiple infection", "Single infection"),
    name = "Infection type"
  ) +
  labs(title = "HPV Infection Distribution",
       x = "HPV Type", y = "Percentages (%)") +
  theme_minimal(base_size = 13) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1),
        panel.grid.major.x = element_blank()) +
  coord_cartesian(clip = "off")

# ----- OPTIONAL: Faceted version (two panels: LR vs HR) -----
ggplot(hpv_long, aes(x = HPV_Type_clean, y = Proportion*100, fill = Infection_Type)) +
  geom_bar(stat = "identity", width = 0.9) +
  scale_fill_manual(values = c("Single_infection" = "#f1a226",
                               "Multiple_infection" = "#298c8c"),
                    name = "Infection type") +
  labs(title = "HPV Infection Distribution by Risk Category",
       x = "HPV Type", y = "Percentages (%)") +
  facet_grid(~ Risk, scales = "free_x", space = "free_x") +
  theme_minimal(base_size = 13) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1),
        panel.grid.major.x = element_blank(),
        strip.text = element_text(face = "bold"))

(hpv_data <- readxl::read_xlsx("C:/Users/VUSI/Downloads/hpv_data.xlsx"))
# A tibble: 185 × 30
   HPV_Type_of_Infection HPV_type_status HPV16 HPV18 HPV26 HPV31 HPV33 HPV35
   <chr>                 <chr>           <chr> <chr> <lgl> <chr> <chr> <chr>
 1 <NA>                  <NA>            <NA>  <NA>  NA    <NA>  <NA>  <NA> 
 2 <NA>                  <NA>            <NA>  <NA>  NA    <NA>  <NA>  <NA> 
 3 Single                High-risk HPV   <NA>  <NA>  NA    <NA>  <NA>  <NA> 
 4 <NA>                  <NA>            <NA>  <NA>  NA    <NA>  <NA>  <NA> 
 5 <NA>                  <NA>            <NA>  <NA>  NA    <NA>  <NA>  <NA> 
 6 Mulitple              High-risk HPV   Low   <NA>  NA    <NA>  <NA>  <NA> 
 7 <NA>                  <NA>            <NA>  <NA>  NA    <NA>  <NA>  <NA> 
 8 <NA>                  <NA>            <NA>  <NA>  NA    <NA>  <NA>  <NA> 
 9 Single                High-risk HPV   <NA>  <NA>  NA    <NA>  <NA>  <NA> 
10 <NA>                  <NA>            <NA>  <NA>  NA    <NA>  <NA>  <NA> 
# ℹ 175 more rows
# ℹ 22 more variables: HPV39 <chr>, HPV45 <chr>, HPV51 <chr>, HPV52 <chr>,
#   HPV53 <chr>, HPV56 <chr>, HPV58 <chr>, HPV59 <chr>, HPV66 <chr>,
#   HPV68 <chr>, HPV69 <chr>, HPV73 <chr>, HPV82 <chr>, HPV6 <chr>,
#   HPV11 <chr>, HPV40 <chr>, HPV42 <chr>, HPV43 <chr>, HPV44 <chr>,
#   HPV54 <chr>, HPV61 <chr>, HPV70 <chr>
# ---- Packages ----
library(dplyr)
library(readxl)
library(gtsummary)
library(gt)

# ---- Read data ----
hpv_data <- readxl::read_xlsx("C:/Users/VUSI/Downloads/hpv_data.xlsx")

# ---- Clean: only replace empty strings in character columns ----
hpv_data <- hpv_data %>%
  mutate(across(where(is.character), ~ na_if(.x, "")))

# ---- Fix common typo and set factors ----
hpv_data <- hpv_data %>%
  mutate(
    HPV_Type_of_Infection = recode(HPV_Type_of_Infection, "Mulitple" = "Multiple"),
    HPV_Type_of_Infection = factor(HPV_Type_of_Infection, levels = c("Single", "Multiple")),
    HPV_type_status = factor(
      HPV_type_status,
      levels = c("Low-risk HPV", "High-risk HPV", "Both low-risk and high-risk")
    )
  )

# ---- Identify HPV type columns ----
hpv_cols <- hpv_data %>%
  select(matches("^HPV(16|18|26|31|33|35|39|45|51|52|53|56|58|59|66|68|69|73|82|6|11|40|42|43|44|54|61|70)$")) %>%
  names()

# ---- Ensure HPV columns are ordered categorical (Low < Intermediate < High) ----
hpv_data <- hpv_data %>%
  mutate(
    across(all_of(hpv_cols), ~ as.character(.x)),
    across(all_of(hpv_cols), ~ factor(.x, levels = c("Low","Intermediate","High"), ordered = TRUE))
  )

# ========= Choose which TWO HPV statuses to present and merge =========
status_left  <- "High-risk HPV"
status_right <- "Low-risk HPV"
# (Change the strings above if you prefer another pair, e.g., "Both low-risk and high-risk")

# ---- Build LEFT table (e.g., High-risk HPV) ----
hpv_left <- hpv_data %>%
  filter(HPV_type_status == status_left) %>%
  select(HPV_Type_of_Infection, all_of(hpv_cols)) %>%
  tbl_summary(
    by = HPV_Type_of_Infection,
    statistic = all_categorical() ~ "{n} ({p}%)",
    missing_text = "Missing"
  ) %>%
  add_overall() %>%
  modify_header(label ~ "**HPV Type**") %>%
  bold_labels()

# ---- Build RIGHT table (e.g., Low-risk HPV) ----
hpv_right <- hpv_data %>%
  filter(HPV_type_status == status_right) %>%
  select(HPV_Type_of_Infection, all_of(hpv_cols)) %>%
  tbl_summary(
    by = HPV_Type_of_Infection,
    statistic = all_categorical() ~ "{n} ({p}%)",
    missing_text = "Missing"
  ) %>%
  add_overall() %>%
  modify_header(label ~ "**HPV Type**") %>%
  bold_labels()

# ---- MERGE the two tables side-by-side ----
hpv_merged <- tbl_merge(
  tbls = list(hpv_left, hpv_right),
  tab_spanner = c(paste0("**", status_left, "**"),
                  paste0("**", status_right, "**"))
)

# ---- Add caption and render with gt styling ----
hpv_gt <- hpv_merged %>%
  modify_caption("**HPV Infection Distribution by Type of Infection (Merged by HPV Status)**") %>%
  as_gt() %>%
  gt::tab_options(
    table.font.size = "small",
    heading.align = "center"
  ) %>%
  gt::tab_style(
    style = cell_text(weight = "bold", color = "darkblue"),
    locations = cells_title(groups = "title")
  )

hpv_gt
HPV Infection Distribution by Type of Infection (Merged by HPV Status)
HPV Type
High-risk HPV
Low-risk HPV
Overall
N = 211
Single
N = 151
Multiple
N = 61
Overall
N = 81
Single
N = 81
Multiple
N = 01
HPV16





    Low 2 (50%) 0 (0%) 2 (67%) 0 (NA%) 0 (NA%) 0 (NA%)
    Intermediate 2 (50%) 1 (100%) 1 (33%) 0 (NA%) 0 (NA%) 0 (NA%)
    High 0 (0%) 0 (0%) 0 (0%) 0 (NA%) 0 (NA%) 0 (NA%)
    Missing 17 14 3 8 8 0
HPV18





    Low 3 (75%) 2 (67%) 1 (100%) 0 (NA%) 0 (NA%) 0 (NA%)
    Intermediate 1 (25%) 1 (33%) 0 (0%) 0 (NA%) 0 (NA%) 0 (NA%)
    High 0 (0%) 0 (0%) 0 (0%) 0 (NA%) 0 (NA%) 0 (NA%)
    Missing 17 12 5 8 8 0
HPV26





    Low 0 (NA%) 0 (NA%) 0 (NA%) 0 (NA%) 0 (NA%) 0 (NA%)
    Intermediate 0 (NA%) 0 (NA%) 0 (NA%) 0 (NA%) 0 (NA%) 0 (NA%)
    High 0 (NA%) 0 (NA%) 0 (NA%) 0 (NA%) 0 (NA%) 0 (NA%)
    Missing 21 15 6 8 8 0
HPV31





    Low 1 (100%) 1 (100%) 0 (NA%) 0 (NA%) 0 (NA%) 0 (NA%)
    Intermediate 0 (0%) 0 (0%) 0 (NA%) 0 (NA%) 0 (NA%) 0 (NA%)
    High 0 (0%) 0 (0%) 0 (NA%) 0 (NA%) 0 (NA%) 0 (NA%)
    Missing 20 14 6 8 8 0
HPV33





    Low 0 (NA%) 0 (NA%) 0 (NA%) 0 (NA%) 0 (NA%) 0 (NA%)
    Intermediate 0 (NA%) 0 (NA%) 0 (NA%) 0 (NA%) 0 (NA%) 0 (NA%)
    High 0 (NA%) 0 (NA%) 0 (NA%) 0 (NA%) 0 (NA%) 0 (NA%)
    Missing 21 15 6 8 8 0
HPV35





    Low 2 (67%) 0 (0%) 2 (100%) 0 (NA%) 0 (NA%) 0 (NA%)
    Intermediate 1 (33%) 1 (100%) 0 (0%) 0 (NA%) 0 (NA%) 0 (NA%)
    High 0 (0%) 0 (0%) 0 (0%) 0 (NA%) 0 (NA%) 0 (NA%)
    Missing 18 14 4 8 8 0
HPV39





    Low 1 (100%) 1 (100%) 0 (NA%) 0 (NA%) 0 (NA%) 0 (NA%)
    Intermediate 0 (0%) 0 (0%) 0 (NA%) 0 (NA%) 0 (NA%) 0 (NA%)
    High 0 (0%) 0 (0%) 0 (NA%) 0 (NA%) 0 (NA%) 0 (NA%)
    Missing 20 14 6 8 8 0
HPV45





    Low 0 (NA%) 0 (NA%) 0 (NA%) 0 (NA%) 0 (NA%) 0 (NA%)
    Intermediate 0 (NA%) 0 (NA%) 0 (NA%) 0 (NA%) 0 (NA%) 0 (NA%)
    High 0 (NA%) 0 (NA%) 0 (NA%) 0 (NA%) 0 (NA%) 0 (NA%)
    Missing 21 15 6 8 8 0
HPV51





    Low 0 (0%) 0 (0%) 0 (NA%) 0 (NA%) 0 (NA%) 0 (NA%)
    Intermediate 0 (0%) 0 (0%) 0 (NA%) 0 (NA%) 0 (NA%) 0 (NA%)
    High 1 (100%) 1 (100%) 0 (NA%) 0 (NA%) 0 (NA%) 0 (NA%)
    Missing 20 14 6 8 8 0
HPV52





    Low 1 (50%) 1 (100%) 0 (0%) 0 (NA%) 0 (NA%) 0 (NA%)
    Intermediate 1 (50%) 0 (0%) 1 (100%) 0 (NA%) 0 (NA%) 0 (NA%)
    High 0 (0%) 0 (0%) 0 (0%) 0 (NA%) 0 (NA%) 0 (NA%)
    Missing 19 14 5 8 8 0
HPV53





    Low 0 (NA%) 0 (NA%) 0 (NA%) 0 (NA%) 0 (NA%) 0 (NA%)
    Intermediate 0 (NA%) 0 (NA%) 0 (NA%) 0 (NA%) 0 (NA%) 0 (NA%)
    High 0 (NA%) 0 (NA%) 0 (NA%) 0 (NA%) 0 (NA%) 0 (NA%)
    Missing 21 15 6 8 8 0
HPV56





    Low 0 (0%) 0 (0%) 0 (NA%) 0 (NA%) 0 (NA%) 0 (NA%)
    Intermediate 1 (100%) 1 (100%) 0 (NA%) 0 (NA%) 0 (NA%) 0 (NA%)
    High 0 (0%) 0 (0%) 0 (NA%) 0 (NA%) 0 (NA%) 0 (NA%)
    Missing 20 14 6 8 8 0
HPV58





    Low 0 (NA%) 0 (NA%) 0 (NA%) 0 (NA%) 0 (NA%) 0 (NA%)
    Intermediate 0 (NA%) 0 (NA%) 0 (NA%) 0 (NA%) 0 (NA%) 0 (NA%)
    High 0 (NA%) 0 (NA%) 0 (NA%) 0 (NA%) 0 (NA%) 0 (NA%)
    Missing 21 15 6 8 8 0
HPV59





    Low 0 (0%) 0 (NA%) 0 (0%) 0 (NA%) 0 (NA%) 0 (NA%)
    Intermediate 1 (100%) 0 (NA%) 1 (100%) 0 (NA%) 0 (NA%) 0 (NA%)
    High 0 (0%) 0 (NA%) 0 (0%) 0 (NA%) 0 (NA%) 0 (NA%)
    Missing 20 15 5 8 8 0
HPV66





    Low 1 (100%) 0 (NA%) 1 (100%) 0 (NA%) 0 (NA%) 0 (NA%)
    Intermediate 0 (0%) 0 (NA%) 0 (0%) 0 (NA%) 0 (NA%) 0 (NA%)
    High 0 (0%) 0 (NA%) 0 (0%) 0 (NA%) 0 (NA%) 0 (NA%)
    Missing 20 15 5 8 8 0
HPV68





    Low 3 (100%) 3 (100%) 0 (NA%) 0 (NA%) 0 (NA%) 0 (NA%)
    Intermediate 0 (0%) 0 (0%) 0 (NA%) 0 (NA%) 0 (NA%) 0 (NA%)
    High 0 (0%) 0 (0%) 0 (NA%) 0 (NA%) 0 (NA%) 0 (NA%)
    Missing 18 12 6 8 8 0
HPV69





    Low 0 (0%) 0 (0%) 0 (0%) 0 (NA%) 0 (NA%) 0 (NA%)
    Intermediate 1 (50%) 1 (100%) 0 (0%) 0 (NA%) 0 (NA%) 0 (NA%)
    High 1 (50%) 0 (0%) 1 (100%) 0 (NA%) 0 (NA%) 0 (NA%)
    Missing 19 14 5 8 8 0
HPV73





    Low 0 (0%) 0 (NA%) 0 (0%) 0 (NA%) 0 (NA%) 0 (NA%)
    Intermediate 1 (100%) 0 (NA%) 1 (100%) 0 (NA%) 0 (NA%) 0 (NA%)
    High 0 (0%) 0 (NA%) 0 (0%) 0 (NA%) 0 (NA%) 0 (NA%)
    Missing 20 15 5 8 8 0
HPV82





    Low 2 (100%) 1 (100%) 1 (100%) 0 (NA%) 0 (NA%) 0 (NA%)
    Intermediate 0 (0%) 0 (0%) 0 (0%) 0 (NA%) 0 (NA%) 0 (NA%)
    High 0 (0%) 0 (0%) 0 (0%) 0 (NA%) 0 (NA%) 0 (NA%)
    Missing 19 14 5 8 8 0
HPV6





    Low 0 (NA%) 0 (NA%) 0 (NA%) 0 (0%) 0 (0%) 0 (NA%)
    Intermediate 0 (NA%) 0 (NA%) 0 (NA%) 1 (50%) 1 (50%) 0 (NA%)
    High 0 (NA%) 0 (NA%) 0 (NA%) 1 (50%) 1 (50%) 0 (NA%)
    Missing 21 15 6 6 6 0
HPV11





    Low 0 (NA%) 0 (NA%) 0 (NA%) 0 (NA%) 0 (NA%) 0 (NA%)
    Intermediate 0 (NA%) 0 (NA%) 0 (NA%) 0 (NA%) 0 (NA%) 0 (NA%)
    High 0 (NA%) 0 (NA%) 0 (NA%) 0 (NA%) 0 (NA%) 0 (NA%)
    Missing 21 15 6 8 8 0
HPV40





    Low 0 (NA%) 0 (NA%) 0 (NA%) 0 (NA%) 0 (NA%) 0 (NA%)
    Intermediate 0 (NA%) 0 (NA%) 0 (NA%) 0 (NA%) 0 (NA%) 0 (NA%)
    High 0 (NA%) 0 (NA%) 0 (NA%) 0 (NA%) 0 (NA%) 0 (NA%)
    Missing 21 15 6 8 8 0
HPV42





    Low 0 (NA%) 0 (NA%) 0 (NA%) 1 (50%) 1 (50%) 0 (NA%)
    Intermediate 0 (NA%) 0 (NA%) 0 (NA%) 1 (50%) 1 (50%) 0 (NA%)
    High 0 (NA%) 0 (NA%) 0 (NA%) 0 (0%) 0 (0%) 0 (NA%)
    Missing 21 15 6 6 6 0
HPV43





    Low 0 (NA%) 0 (NA%) 0 (NA%) 0 (NA%) 0 (NA%) 0 (NA%)
    Intermediate 0 (NA%) 0 (NA%) 0 (NA%) 0 (NA%) 0 (NA%) 0 (NA%)
    High 0 (NA%) 0 (NA%) 0 (NA%) 0 (NA%) 0 (NA%) 0 (NA%)
    Missing 21 15 6 8 8 0
HPV44





    Low 0 (NA%) 0 (NA%) 0 (NA%) 0 (0%) 0 (0%) 0 (NA%)
    Intermediate 0 (NA%) 0 (NA%) 0 (NA%) 2 (100%) 2 (100%) 0 (NA%)
    High 0 (NA%) 0 (NA%) 0 (NA%) 0 (0%) 0 (0%) 0 (NA%)
    Missing 21 15 6 6 6 0
HPV54





    Low 0 (NA%) 0 (NA%) 0 (NA%) 0 (NA%) 0 (NA%) 0 (NA%)
    Intermediate 0 (NA%) 0 (NA%) 0 (NA%) 0 (NA%) 0 (NA%) 0 (NA%)
    High 0 (NA%) 0 (NA%) 0 (NA%) 0 (NA%) 0 (NA%) 0 (NA%)
    Missing 21 15 6 8 8 0
HPV61





    Low 0 (NA%) 0 (NA%) 0 (NA%) 1 (100%) 1 (100%) 0 (NA%)
    Intermediate 0 (NA%) 0 (NA%) 0 (NA%) 0 (0%) 0 (0%) 0 (NA%)
    High 0 (NA%) 0 (NA%) 0 (NA%) 0 (0%) 0 (0%) 0 (NA%)
    Missing 21 15 6 7 7 0
HPV70





    Low 0 (NA%) 0 (NA%) 0 (NA%) 1 (100%) 1 (100%) 0 (NA%)
    Intermediate 0 (NA%) 0 (NA%) 0 (NA%) 0 (0%) 0 (0%) 0 (NA%)
    High 0 (NA%) 0 (NA%) 0 (NA%) 0 (0%) 0 (0%) 0 (NA%)
    Missing 21 15 6 7 7 0
1 n (%)

1. Probability of HPV DNA positivity vs CD4 count (logistic regression smooth):
This plot shows the modeled probability of HPV DNA positivity across the spectrum of CD4 counts. The fitted blue line indicates that the likelihood of HPV positivity is highest at lower CD4 counts and steadily decreases as CD4 count rises, reflecting immune recovery. The shaded gray region represents the 95% confidence interval around the predicted probabilities, showing uncertainty increases at the extremes. Overall, the plot demonstrates an inverse relationship: women with lower CD4 counts have a higher probability of being HPV DNA positive, while those with higher CD4 counts are less likely to test positive.

2. CD4 count distribution by HPV DNA status (violin and boxplots):
This violin plot compares the distribution of CD4 counts between HPV DNA–negative and HPV DNA–positive groups. The HPV-negative group displays a wider spread of CD4 counts, with many individuals reaching higher counts (>1000 cells/µL). In contrast, the HPV-positive group shows a tighter distribution, with most values clustered at lower CD4 counts (generally <500 cells/µL). The overlaid boxplots confirm this difference: the median CD4 count in the HPV-negative group is higher than in the HPV-positive group. This indicates that HPV positivity is more common among individuals with suppressed immune status (lower CD4 counts).

3. HPV DNA prevalence by CD4 count category (proportions with 95% CI):
This prevalence plot summarizes HPV positivity across ordered CD4 count categories. The prevalence of HPV DNA is highest in the lowest CD4 stratum (0–200 cells/µL) and decreases progressively across higher categories, reaching the lowest levels in the 501–1000 group. The confidence intervals are wider in the extreme categories (0–200 and >1000), reflecting smaller sample sizes and greater uncertainty in those groups. The overall pattern reinforces the trend observed in the logistic regression: HPV DNA positivity is more prevalent in individuals with lower CD4 counts, supporting the conclusion that immunosuppression increases the risk of HPV persistence or acquisition.

library(dplyr)
library(ggplot2)
library(forcats)

(df <- readxl::read_xlsx("C:/Users/VUSI/Downloads/plots.xlsx"))
# A tibble: 132 × 4
   CD4_Count CD4_count_category HIV_status `HPV DNA`
       <dbl> <chr>              <chr>      <chr>    
 1       985 501 - 1000         Positive   <NA>     
 2       440 201 - 500          Positive   Negative 
 3       591 501 - 1000         Positive   Positive 
 4       273 201 - 500          Positive   Negative 
 5       313 201 - 500          Positive   Negative 
 6       795 501 - 1000         Positive   Negative 
 7       312 201 - 500          Positive   Negative 
 8       420 201 - 500          Positive   Positive 
 9       361 201 - 500          Positive   Negative 
10       616 501 - 1000         Positive   Negative 
# ℹ 122 more rows
df1 <- df %>%
  mutate(
    HPV_DNA = na_if(`HPV DNA`, ""),                     # turn blanks into NA
    HPV_DNA = factor(HPV_DNA, levels = c("Negative","Positive")),
    CD4_count_category = fct_relevel(CD4_count_category,
      "0 - 200","201 - 500","501 - 1000","> 1000", after = 0
    )
  ) %>%
  filter(!is.na(HPV_DNA))                               # drop missing HPV DNA rows

# --- 1) MAIN PLOT: Prevalence by CD4 category w/ 95% CIs ---
prev_by_cat <- df1 %>%
  group_by(CD4_count_category) %>%
  summarise(
    n = n(),
    pos = sum(HPV_DNA == "Positive"),
    p = pos / n,
    # Wilson CI via prop.test (good enough for most sizes)
    ci_low  = suppressWarnings(prop.test(pos, n)$conf.int[1]),
    ci_high = suppressWarnings(prop.test(pos, n)$conf.int[2]),
    .groups = "drop"
  )

ggplot(prev_by_cat, aes(x = CD4_count_category, y = p)) +
  geom_pointrange(aes(ymin = ci_low, ymax = ci_high), linewidth = 0.8) +
  geom_hline(yintercept = 0.5, linetype = "dotted", alpha = 0.3) +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
  labs(
    x = "CD4 count category",
    y = "HPV DNA positive (%)",
    title = "HPV DNA prevalence by CD4 count category",
    subtitle = "Points = prevalence; bars = 95% CI (Wilson via prop.test)"
  ) +
  theme_minimal(base_size = 13)

# --- 2) RAW DISTRIBUTION: CD4 Count vs HPV DNA (violin + box + jitter) ---
ggplot(df1, aes(x = HPV_DNA, y = CD4_Count, fill = HPV_DNA)) +
  geom_violin(trim = FALSE, alpha = 0.25) +
  geom_boxplot(width = 0.2, outlier.shape = NA) +
  geom_jitter(width = 0.15, height = 0, alpha = 0.35, size = 1) +
  scale_fill_manual(values = c("Negative" = "#66c2a5", "Positive" = "#fc8d62")) +
  labs(x = "HPV DNA", y = "CD4 count (cells/µL)",
       title = "CD4 count distribution by HPV DNA status") +
  theme_minimal(base_size = 13) +
  theme(legend.position = "none")

# --- 3) QUICK CHECK: Stacked percent bars by CD4 category ---
ggplot(df1, aes(x = CD4_count_category, fill = HPV_DNA)) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = scales::percent) +
  scale_fill_manual(values = c("Negative" = "#66c2a5", "Positive" = "#fc8d62")) +
  labs(x = "CD4 count category", y = "Percentage",
       fill = "HPV DNA",
       title = "HPV DNA status by CD4 count category") +
  theme_minimal(base_size = 13)

# --- (Optional) dose–response: CD4 count (continuous) vs HPV DNA with logistic smooth ---
ggplot(df1, aes(x = CD4_Count, y = as.numeric(HPV_DNA == "Positive"))) +
  geom_point(alpha = 0.25, size = 1) +
  geom_smooth(method = "glm", method.args = list(family = binomial()), se = TRUE) +
  scale_y_continuous(labels = scales::percent, limits = c(0,1)) +
  labs(x = "CD4 count (cells/µL)", y = "Pr(HPV DNA positive)",
       title = "Probability of HPV DNA positivity vs CD4 count") +
  theme_minimal(base_size = 13)
`geom_smooth()` using formula = 'y ~ x'