Rosabel

library(ggplot2)
library(tidyr)
library(dplyr)


Attaching package: 'dplyr'

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union

# --- your data (unchanged) ---
hpv <- tribble(
  ~HPV_Type,     ~Single_infection, ~Multiple_infection,
  "HPV 42 LR",   0.05, 0.10,
  "HPV 16 HR",   0.025, 0.10,
  "HPV 18 HR",   0.075, 0.05,
  "HPV 68 HR",   0.075, 0.05,
  "HPV 06 LR",    0.05, 0.05,
  "HPV 44 LR",   0.05, 0.05,
  "HPV 35 HR",   0.025, 0.075,
  "HPV 52 HR",   0.02, 0.07,
  "HPV 59 HR",   0, 0.10,
  "HPV 82 HR",   0.025, 0.05,
  "HPV 43 LR",   0, 0.05,
  "HPV 54 LR",   0, 0.05,
  "HPV 61 LR",   0.025, 0.025,
  "HPV 70 LR",   0.025, 0.025,
  "HPV 31 HR",   0.025, 0.025,
  "HPV 51 HR",   0.025, 0.025,
  "HPV 53 HR",   0, 0.05,
  "HPV 66 HR",   0, 0.05,
  "HPV 69 HR",   0.025, 0.025,
  "HPV 73 HR",   0, 0.05,
  "HPV 11 LR",   0, 0.025,
  "HPV 40 LR",   0, 0.025,
  "HPV 33 HR",   0, 0.025,
  "HPV 39 HR",   0.025, 0,
  "HPV 45 HR",   0, 0.025,
  "HPV 56 HR",   0.025, 0,
  "HPV 58 HR",   0, 0.025
)

# Long format + Risk tag + clean label + numeric ID
hpv_long <- hpv %>%
  pivot_longer(c(Single_infection, Multiple_infection),
               names_to = "Infection_Type", values_to = "Proportion") %>%
  mutate(
    Risk = ifelse(grepl("HR$", HPV_Type), "High-risk", "Low-risk"),
    HPV_Type_clean = gsub(" (LR|HR)$", "", HPV_Type),
    HPV_Number = as.numeric(sub("HPV\\s*", "", HPV_Type_clean))
  )

# ---- ORDER: Low-risk (desc numeric), then High-risk (desc numeric) ----
order_df <- hpv_long %>%
  distinct(HPV_Type_clean, Risk, HPV_Number) %>%
  mutate(Risk = factor(Risk, levels = c("Low-risk", "High-risk"))) %>%
  arrange(Risk, desc(HPV_Number))

hpv_long <- hpv_long %>%
  mutate(HPV_Type_clean = factor(HPV_Type_clean, levels = order_df$HPV_Type_clean))

# counts for split & label positions
n_low  <- sum(order_df$Risk == "Low-risk")
n_high <- sum(order_df$Risk == "High-risk")
low_center  <- n_low / 2
high_center <- n_low + n_high / 2
y_ann <- max(hpv_long$Proportion, na.rm = TRUE) * 1.05

# ---- Plot ----
ggplot(hpv_long, aes(x = HPV_Type_clean, y = Proportion*100, fill = Infection_Type)) +
  geom_bar(stat = "identity", width = 0.9) +
  geom_vline(xintercept = n_low + 0.5, linetype = "dashed", linewidth = 0.6, alpha = 0.7) +
  annotate("text", x = low_center,  y = 16, label = "Low-risk",  fontface = "bold", color = "black") +
  annotate("text", x = high_center, y = 16, label = "High-risk", fontface = "bold", color = "black") +
  scale_fill_manual(
    values = c("Multiple_infection" = "#298c8c", "Single_infection" = "#f1a226"),
    breaks = c("Multiple_infection", "Single_infection"),
    labels = c("Multiple infection", "Single infection"),
    name = "Infection type"
  ) +
  labs(title = "HPV Infection Distribution",
       x = "HPV Type", y = "Percentages (%)") +
  theme_minimal(base_size = 13) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1),
        panel.grid.major.x = element_blank()) +
  coord_cartesian(clip = "off")

# ----- OPTIONAL: Faceted version (two panels: LR vs HR) -----
ggplot(hpv_long, aes(x = HPV_Type_clean, y = Proportion*100, fill = Infection_Type)) +
  geom_bar(stat = "identity", width = 0.9) +
  scale_fill_manual(values = c("Single_infection" = "#f1a226",
                               "Multiple_infection" = "#298c8c"),
                    name = "Infection type") +
  labs(title = "HPV Infection Distribution by Risk Category",
       x = "HPV Type", y = "Percentages (%)") +
  facet_grid(~ Risk, scales = "free_x", space = "free_x") +
  theme_minimal(base_size = 13) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1),
        panel.grid.major.x = element_blank(),
        strip.text = element_text(face = "bold"))

(hpv_data <- readxl::read_xlsx("C:/Users/VUSI/Downloads/hpv_data.xlsx"))

# A tibble: 185 × 30
   HPV_Type_of_Infection HPV_type_status HPV16 HPV18 HPV26 HPV31 HPV33 HPV35
   <chr>                 <chr>           <chr> <chr> <lgl> <chr> <chr> <chr>
 1 <NA>                  <NA>            <NA>  <NA>  NA    <NA>  <NA>  <NA> 
 2 <NA>                  <NA>            <NA>  <NA>  NA    <NA>  <NA>  <NA> 
 3 Single                High-risk HPV   <NA>  <NA>  NA    <NA>  <NA>  <NA> 
 4 <NA>                  <NA>            <NA>  <NA>  NA    <NA>  <NA>  <NA> 
 5 <NA>                  <NA>            <NA>  <NA>  NA    <NA>  <NA>  <NA> 
 6 Mulitple              High-risk HPV   Low   <NA>  NA    <NA>  <NA>  <NA> 
 7 <NA>                  <NA>            <NA>  <NA>  NA    <NA>  <NA>  <NA> 
 8 <NA>                  <NA>            <NA>  <NA>  NA    <NA>  <NA>  <NA> 
 9 Single                High-risk HPV   <NA>  <NA>  NA    <NA>  <NA>  <NA> 
10 <NA>                  <NA>            <NA>  <NA>  NA    <NA>  <NA>  <NA> 
# ℹ 175 more rows
# ℹ 22 more variables: HPV39 <chr>, HPV45 <chr>, HPV51 <chr>, HPV52 <chr>,
#   HPV53 <chr>, HPV56 <chr>, HPV58 <chr>, HPV59 <chr>, HPV66 <chr>,
#   HPV68 <chr>, HPV69 <chr>, HPV73 <chr>, HPV82 <chr>, HPV6 <chr>,
#   HPV11 <chr>, HPV40 <chr>, HPV42 <chr>, HPV43 <chr>, HPV44 <chr>,
#   HPV54 <chr>, HPV61 <chr>, HPV70 <chr>

# ---- Packages ----
library(dplyr)
library(readxl)
library(gtsummary)
library(gt)

# ---- Read data ----
hpv_data <- readxl::read_xlsx("C:/Users/VUSI/Downloads/hpv_data.xlsx")

# ---- Clean: only replace empty strings in character columns ----
hpv_data <- hpv_data %>%
  mutate(across(where(is.character), ~ na_if(.x, "")))

# ---- Fix common typo and set factors ----
hpv_data <- hpv_data %>%
  mutate(
    HPV_Type_of_Infection = recode(HPV_Type_of_Infection, "Mulitple" = "Multiple"),
    HPV_Type_of_Infection = factor(HPV_Type_of_Infection, levels = c("Single", "Multiple")),
    HPV_type_status = factor(
      HPV_type_status,
      levels = c("Low-risk HPV", "High-risk HPV", "Both low-risk and high-risk")
    )
  )

# ---- Identify HPV type columns ----
hpv_cols <- hpv_data %>%
  select(matches("^HPV(16|18|26|31|33|35|39|45|51|52|53|56|58|59|66|68|69|73|82|6|11|40|42|43|44|54|61|70)$")) %>%
  names()

# ---- Ensure HPV columns are ordered categorical (Low < Intermediate < High) ----
hpv_data <- hpv_data %>%
  mutate(
    across(all_of(hpv_cols), ~ as.character(.x)),
    across(all_of(hpv_cols), ~ factor(.x, levels = c("Low","Intermediate","High"), ordered = TRUE))
  )

# ========= Choose which TWO HPV statuses to present and merge =========
status_left  <- "High-risk HPV"
status_right <- "Low-risk HPV"
# (Change the strings above if you prefer another pair, e.g., "Both low-risk and high-risk")

# ---- Build LEFT table (e.g., High-risk HPV) ----
hpv_left <- hpv_data %>%
  filter(HPV_type_status == status_left) %>%
  select(HPV_Type_of_Infection, all_of(hpv_cols)) %>%
  tbl_summary(
    by = HPV_Type_of_Infection,
    statistic = all_categorical() ~ "{n} ({p}%)",
    missing_text = "Missing"
  ) %>%
  add_overall() %>%
  modify_header(label ~ "**HPV Type**") %>%
  bold_labels()

# ---- Build RIGHT table (e.g., Low-risk HPV) ----
hpv_right <- hpv_data %>%
  filter(HPV_type_status == status_right) %>%
  select(HPV_Type_of_Infection, all_of(hpv_cols)) %>%
  tbl_summary(
    by = HPV_Type_of_Infection,
    statistic = all_categorical() ~ "{n} ({p}%)",
    missing_text = "Missing"
  ) %>%
  add_overall() %>%
  modify_header(label ~ "**HPV Type**") %>%
  bold_labels()

# ---- MERGE the two tables side-by-side ----
hpv_merged <- tbl_merge(
  tbls = list(hpv_left, hpv_right),
  tab_spanner = c(paste0("**", status_left, "**"),
                  paste0("**", status_right, "**"))
)

# ---- Add caption and render with gt styling ----
hpv_gt <- hpv_merged %>%
  modify_caption("**HPV Infection Distribution by Type of Infection (Merged by HPV Status)**") %>%
  as_gt() %>%
  gt::tab_options(
    table.font.size = "small",
    heading.align = "center"
  ) %>%
  gt::tab_style(
    style = cell_text(weight = "bold", color = "darkblue"),
    locations = cells_title(groups = "title")
  )

hpv_gt

**HPV Infection Distribution by Type of Infection (Merged by HPV Status)**
HPV Type	High-risk HPV			Low-risk HPV
HPV Type	Overall N = 21¹	Single N = 15¹	Multiple N = 6¹	Overall N = 8¹	Single N = 8¹	Multiple N = 0¹
HPV16
Low	2 (50%)	0 (0%)	2 (67%)	0 (NA%)	0 (NA%)	0 (NA%)
Intermediate	2 (50%)	1 (100%)	1 (33%)	0 (NA%)	0 (NA%)	0 (NA%)
High	0 (0%)	0 (0%)	0 (0%)	0 (NA%)	0 (NA%)	0 (NA%)
Missing	17	14	3	8	8	0
HPV18
Low	3 (75%)	2 (67%)	1 (100%)	0 (NA%)	0 (NA%)	0 (NA%)
Intermediate	1 (25%)	1 (33%)	0 (0%)	0 (NA%)	0 (NA%)	0 (NA%)
High	0 (0%)	0 (0%)	0 (0%)	0 (NA%)	0 (NA%)	0 (NA%)
Missing	17	12	5	8	8	0
HPV26
Low	0 (NA%)	0 (NA%)	0 (NA%)	0 (NA%)	0 (NA%)	0 (NA%)
Intermediate	0 (NA%)	0 (NA%)	0 (NA%)	0 (NA%)	0 (NA%)	0 (NA%)
High	0 (NA%)	0 (NA%)	0 (NA%)	0 (NA%)	0 (NA%)	0 (NA%)
Missing	21	15	6	8	8	0
HPV31
Low	1 (100%)	1 (100%)	0 (NA%)	0 (NA%)	0 (NA%)	0 (NA%)
Intermediate	0 (0%)	0 (0%)	0 (NA%)	0 (NA%)	0 (NA%)	0 (NA%)
High	0 (0%)	0 (0%)	0 (NA%)	0 (NA%)	0 (NA%)	0 (NA%)
Missing	20	14	6	8	8	0
HPV33
Low	0 (NA%)	0 (NA%)	0 (NA%)	0 (NA%)	0 (NA%)	0 (NA%)
Intermediate	0 (NA%)	0 (NA%)	0 (NA%)	0 (NA%)	0 (NA%)	0 (NA%)
High	0 (NA%)	0 (NA%)	0 (NA%)	0 (NA%)	0 (NA%)	0 (NA%)
Missing	21	15	6	8	8	0
HPV35
Low	2 (67%)	0 (0%)	2 (100%)	0 (NA%)	0 (NA%)	0 (NA%)
Intermediate	1 (33%)	1 (100%)	0 (0%)	0 (NA%)	0 (NA%)	0 (NA%)
High	0 (0%)	0 (0%)	0 (0%)	0 (NA%)	0 (NA%)	0 (NA%)
Missing	18	14	4	8	8	0
HPV39
Low	1 (100%)	1 (100%)	0 (NA%)	0 (NA%)	0 (NA%)	0 (NA%)
Intermediate	0 (0%)	0 (0%)	0 (NA%)	0 (NA%)	0 (NA%)	0 (NA%)
High	0 (0%)	0 (0%)	0 (NA%)	0 (NA%)	0 (NA%)	0 (NA%)
Missing	20	14	6	8	8	0
HPV45
Low	0 (NA%)	0 (NA%)	0 (NA%)	0 (NA%)	0 (NA%)	0 (NA%)
Intermediate	0 (NA%)	0 (NA%)	0 (NA%)	0 (NA%)	0 (NA%)	0 (NA%)
High	0 (NA%)	0 (NA%)	0 (NA%)	0 (NA%)	0 (NA%)	0 (NA%)
Missing	21	15	6	8	8	0
HPV51
Low	0 (0%)	0 (0%)	0 (NA%)	0 (NA%)	0 (NA%)	0 (NA%)
Intermediate	0 (0%)	0 (0%)	0 (NA%)	0 (NA%)	0 (NA%)	0 (NA%)
High	1 (100%)	1 (100%)	0 (NA%)	0 (NA%)	0 (NA%)	0 (NA%)
Missing	20	14	6	8	8	0
HPV52
Low	1 (50%)	1 (100%)	0 (0%)	0 (NA%)	0 (NA%)	0 (NA%)
Intermediate	1 (50%)	0 (0%)	1 (100%)	0 (NA%)	0 (NA%)	0 (NA%)
High	0 (0%)	0 (0%)	0 (0%)	0 (NA%)	0 (NA%)	0 (NA%)
Missing	19	14	5	8	8	0
HPV53
Low	0 (NA%)	0 (NA%)	0 (NA%)	0 (NA%)	0 (NA%)	0 (NA%)
Intermediate	0 (NA%)	0 (NA%)	0 (NA%)	0 (NA%)	0 (NA%)	0 (NA%)
High	0 (NA%)	0 (NA%)	0 (NA%)	0 (NA%)	0 (NA%)	0 (NA%)
Missing	21	15	6	8	8	0
HPV56
Low	0 (0%)	0 (0%)	0 (NA%)	0 (NA%)	0 (NA%)	0 (NA%)
Intermediate	1 (100%)	1 (100%)	0 (NA%)	0 (NA%)	0 (NA%)	0 (NA%)
High	0 (0%)	0 (0%)	0 (NA%)	0 (NA%)	0 (NA%)	0 (NA%)
Missing	20	14	6	8	8	0
HPV58
Low	0 (NA%)	0 (NA%)	0 (NA%)	0 (NA%)	0 (NA%)	0 (NA%)
Intermediate	0 (NA%)	0 (NA%)	0 (NA%)	0 (NA%)	0 (NA%)	0 (NA%)
High	0 (NA%)	0 (NA%)	0 (NA%)	0 (NA%)	0 (NA%)	0 (NA%)
Missing	21	15	6	8	8	0
HPV59
Low	0 (0%)	0 (NA%)	0 (0%)	0 (NA%)	0 (NA%)	0 (NA%)
Intermediate	1 (100%)	0 (NA%)	1 (100%)	0 (NA%)	0 (NA%)	0 (NA%)
High	0 (0%)	0 (NA%)	0 (0%)	0 (NA%)	0 (NA%)	0 (NA%)
Missing	20	15	5	8	8	0
HPV66
Low	1 (100%)	0 (NA%)	1 (100%)	0 (NA%)	0 (NA%)	0 (NA%)
Intermediate	0 (0%)	0 (NA%)	0 (0%)	0 (NA%)	0 (NA%)	0 (NA%)
High	0 (0%)	0 (NA%)	0 (0%)	0 (NA%)	0 (NA%)	0 (NA%)
Missing	20	15	5	8	8	0
HPV68
Low	3 (100%)	3 (100%)	0 (NA%)	0 (NA%)	0 (NA%)	0 (NA%)
Intermediate	0 (0%)	0 (0%)	0 (NA%)	0 (NA%)	0 (NA%)	0 (NA%)
High	0 (0%)	0 (0%)	0 (NA%)	0 (NA%)	0 (NA%)	0 (NA%)
Missing	18	12	6	8	8	0
HPV69
Low	0 (0%)	0 (0%)	0 (0%)	0 (NA%)	0 (NA%)	0 (NA%)
Intermediate	1 (50%)	1 (100%)	0 (0%)	0 (NA%)	0 (NA%)	0 (NA%)
High	1 (50%)	0 (0%)	1 (100%)	0 (NA%)	0 (NA%)	0 (NA%)
Missing	19	14	5	8	8	0
HPV73
Low	0 (0%)	0 (NA%)	0 (0%)	0 (NA%)	0 (NA%)	0 (NA%)
Intermediate	1 (100%)	0 (NA%)	1 (100%)	0 (NA%)	0 (NA%)	0 (NA%)
High	0 (0%)	0 (NA%)	0 (0%)	0 (NA%)	0 (NA%)	0 (NA%)
Missing	20	15	5	8	8	0
HPV82
Low	2 (100%)	1 (100%)	1 (100%)	0 (NA%)	0 (NA%)	0 (NA%)
Intermediate	0 (0%)	0 (0%)	0 (0%)	0 (NA%)	0 (NA%)	0 (NA%)
High	0 (0%)	0 (0%)	0 (0%)	0 (NA%)	0 (NA%)	0 (NA%)
Missing	19	14	5	8	8	0
HPV6
Low	0 (NA%)	0 (NA%)	0 (NA%)	0 (0%)	0 (0%)	0 (NA%)
Intermediate	0 (NA%)	0 (NA%)	0 (NA%)	1 (50%)	1 (50%)	0 (NA%)
High	0 (NA%)	0 (NA%)	0 (NA%)	1 (50%)	1 (50%)	0 (NA%)
Missing	21	15	6	6	6	0
HPV11
Low	0 (NA%)	0 (NA%)	0 (NA%)	0 (NA%)	0 (NA%)	0 (NA%)
Intermediate	0 (NA%)	0 (NA%)	0 (NA%)	0 (NA%)	0 (NA%)	0 (NA%)
High	0 (NA%)	0 (NA%)	0 (NA%)	0 (NA%)	0 (NA%)	0 (NA%)
Missing	21	15	6	8	8	0
HPV40
Low	0 (NA%)	0 (NA%)	0 (NA%)	0 (NA%)	0 (NA%)	0 (NA%)
Intermediate	0 (NA%)	0 (NA%)	0 (NA%)	0 (NA%)	0 (NA%)	0 (NA%)
High	0 (NA%)	0 (NA%)	0 (NA%)	0 (NA%)	0 (NA%)	0 (NA%)
Missing	21	15	6	8	8	0
HPV42
Low	0 (NA%)	0 (NA%)	0 (NA%)	1 (50%)	1 (50%)	0 (NA%)
Intermediate	0 (NA%)	0 (NA%)	0 (NA%)	1 (50%)	1 (50%)	0 (NA%)
High	0 (NA%)	0 (NA%)	0 (NA%)	0 (0%)	0 (0%)	0 (NA%)
Missing	21	15	6	6	6	0
HPV43
Low	0 (NA%)	0 (NA%)	0 (NA%)	0 (NA%)	0 (NA%)	0 (NA%)
Intermediate	0 (NA%)	0 (NA%)	0 (NA%)	0 (NA%)	0 (NA%)	0 (NA%)
High	0 (NA%)	0 (NA%)	0 (NA%)	0 (NA%)	0 (NA%)	0 (NA%)
Missing	21	15	6	8	8	0
HPV44
Low	0 (NA%)	0 (NA%)	0 (NA%)	0 (0%)	0 (0%)	0 (NA%)
Intermediate	0 (NA%)	0 (NA%)	0 (NA%)	2 (100%)	2 (100%)	0 (NA%)
High	0 (NA%)	0 (NA%)	0 (NA%)	0 (0%)	0 (0%)	0 (NA%)
Missing	21	15	6	6	6	0
HPV54
Low	0 (NA%)	0 (NA%)	0 (NA%)	0 (NA%)	0 (NA%)	0 (NA%)
Intermediate	0 (NA%)	0 (NA%)	0 (NA%)	0 (NA%)	0 (NA%)	0 (NA%)
High	0 (NA%)	0 (NA%)	0 (NA%)	0 (NA%)	0 (NA%)	0 (NA%)
Missing	21	15	6	8	8	0
HPV61
Low	0 (NA%)	0 (NA%)	0 (NA%)	1 (100%)	1 (100%)	0 (NA%)
Intermediate	0 (NA%)	0 (NA%)	0 (NA%)	0 (0%)	0 (0%)	0 (NA%)
High	0 (NA%)	0 (NA%)	0 (NA%)	0 (0%)	0 (0%)	0 (NA%)
Missing	21	15	6	7	7	0
HPV70
Low	0 (NA%)	0 (NA%)	0 (NA%)	1 (100%)	1 (100%)	0 (NA%)
Intermediate	0 (NA%)	0 (NA%)	0 (NA%)	0 (0%)	0 (0%)	0 (NA%)
High	0 (NA%)	0 (NA%)	0 (NA%)	0 (0%)	0 (0%)	0 (NA%)
Missing	21	15	6	7	7	0
¹ n (%)

1. Probability of HPV DNA positivity vs CD4 count (logistic regression smooth):
This plot shows the modeled probability of HPV DNA positivity across the spectrum of CD4 counts. The fitted blue line indicates that the likelihood of HPV positivity is highest at lower CD4 counts and steadily decreases as CD4 count rises, reflecting immune recovery. The shaded gray region represents the 95% confidence interval around the predicted probabilities, showing uncertainty increases at the extremes. Overall, the plot demonstrates an inverse relationship: women with lower CD4 counts have a higher probability of being HPV DNA positive, while those with higher CD4 counts are less likely to test positive.

2. CD4 count distribution by HPV DNA status (violin and boxplots):
This violin plot compares the distribution of CD4 counts between HPV DNA–negative and HPV DNA–positive groups. The HPV-negative group displays a wider spread of CD4 counts, with many individuals reaching higher counts (>1000 cells/µL). In contrast, the HPV-positive group shows a tighter distribution, with most values clustered at lower CD4 counts (generally <500 cells/µL). The overlaid boxplots confirm this difference: the median CD4 count in the HPV-negative group is higher than in the HPV-positive group. This indicates that HPV positivity is more common among individuals with suppressed immune status (lower CD4 counts).

3. HPV DNA prevalence by CD4 count category (proportions with 95% CI):
This prevalence plot summarizes HPV positivity across ordered CD4 count categories. The prevalence of HPV DNA is highest in the lowest CD4 stratum (0–200 cells/µL) and decreases progressively across higher categories, reaching the lowest levels in the 501–1000 group. The confidence intervals are wider in the extreme categories (0–200 and >1000), reflecting smaller sample sizes and greater uncertainty in those groups. The overall pattern reinforces the trend observed in the logistic regression: HPV DNA positivity is more prevalent in individuals with lower CD4 counts, supporting the conclusion that immunosuppression increases the risk of HPV persistence or acquisition.

library(dplyr)
library(ggplot2)
library(forcats)

(df <- readxl::read_xlsx("C:/Users/VUSI/Downloads/plots.xlsx"))

# A tibble: 132 × 4
   CD4_Count CD4_count_category HIV_status `HPV DNA`
       <dbl> <chr>              <chr>      <chr>    
 1       985 501 - 1000         Positive   <NA>     
 2       440 201 - 500          Positive   Negative 
 3       591 501 - 1000         Positive   Positive 
 4       273 201 - 500          Positive   Negative 
 5       313 201 - 500          Positive   Negative 
 6       795 501 - 1000         Positive   Negative 
 7       312 201 - 500          Positive   Negative 
 8       420 201 - 500          Positive   Positive 
 9       361 201 - 500          Positive   Negative 
10       616 501 - 1000         Positive   Negative 
# ℹ 122 more rows

df1 <- df %>%
  mutate(
    HPV_DNA = na_if(`HPV DNA`, ""),                     # turn blanks into NA
    HPV_DNA = factor(HPV_DNA, levels = c("Negative","Positive")),
    CD4_count_category = fct_relevel(CD4_count_category,
      "0 - 200","201 - 500","501 - 1000","> 1000", after = 0
    )
  ) %>%
  filter(!is.na(HPV_DNA))                               # drop missing HPV DNA rows

# --- 1) MAIN PLOT: Prevalence by CD4 category w/ 95% CIs ---
prev_by_cat <- df1 %>%
  group_by(CD4_count_category) %>%
  summarise(
    n = n(),
    pos = sum(HPV_DNA == "Positive"),
    p = pos / n,
    # Wilson CI via prop.test (good enough for most sizes)
    ci_low  = suppressWarnings(prop.test(pos, n)$conf.int[1]),
    ci_high = suppressWarnings(prop.test(pos, n)$conf.int[2]),
    .groups = "drop"
  )

ggplot(prev_by_cat, aes(x = CD4_count_category, y = p)) +
  geom_pointrange(aes(ymin = ci_low, ymax = ci_high), linewidth = 0.8) +
  geom_hline(yintercept = 0.5, linetype = "dotted", alpha = 0.3) +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
  labs(
    x = "CD4 count category",
    y = "HPV DNA positive (%)",
    title = "HPV DNA prevalence by CD4 count category",
    subtitle = "Points = prevalence; bars = 95% CI (Wilson via prop.test)"
  ) +
  theme_minimal(base_size = 13)

# --- 2) RAW DISTRIBUTION: CD4 Count vs HPV DNA (violin + box + jitter) ---
ggplot(df1, aes(x = HPV_DNA, y = CD4_Count, fill = HPV_DNA)) +
  geom_violin(trim = FALSE, alpha = 0.25) +
  geom_boxplot(width = 0.2, outlier.shape = NA) +
  geom_jitter(width = 0.15, height = 0, alpha = 0.35, size = 1) +
  scale_fill_manual(values = c("Negative" = "#66c2a5", "Positive" = "#fc8d62")) +
  labs(x = "HPV DNA", y = "CD4 count (cells/µL)",
       title = "CD4 count distribution by HPV DNA status") +
  theme_minimal(base_size = 13) +
  theme(legend.position = "none")

# --- 3) QUICK CHECK: Stacked percent bars by CD4 category ---
ggplot(df1, aes(x = CD4_count_category, fill = HPV_DNA)) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = scales::percent) +
  scale_fill_manual(values = c("Negative" = "#66c2a5", "Positive" = "#fc8d62")) +
  labs(x = "CD4 count category", y = "Percentage",
       fill = "HPV DNA",
       title = "HPV DNA status by CD4 count category") +
  theme_minimal(base_size = 13)

# --- (Optional) dose–response: CD4 count (continuous) vs HPV DNA with logistic smooth ---
ggplot(df1, aes(x = CD4_Count, y = as.numeric(HPV_DNA == "Positive"))) +
  geom_point(alpha = 0.25, size = 1) +
  geom_smooth(method = "glm", method.args = list(family = binomial()), se = TRUE) +
  scale_y_continuous(labels = scales::percent, limits = c(0,1)) +
  labs(x = "CD4 count (cells/µL)", y = "Pr(HPV DNA positive)",
       title = "Probability of HPV DNA positivity vs CD4 count") +
  theme_minimal(base_size = 13)

`geom_smooth()` using formula = 'y ~ x'