Research Question: “How do immunological biomarkers and immune activation profiles differ in children with perinatally acquired HIV who are classified as long-term non-progressors (LTNPs), compared to HIV-positive progressors, HIV-positive individuals on treatment, and HIV-negative controls?”

#Step 1: Load Required Package and Read Excel File into df1 and displaying the missing percents of the data

# Load Required Libraries
library(dplyr)
library(readxl)
library(tidyr)

# Define File Path and Read Sheet 2 into df1
file_path <- "/Users/rupasree/Desktop/Dr-Alka_files/PIA_Visit1_age5.19_LTNP_rev2.2025.xlsx"
sheet_number <- 2
df1 <- read_excel(file_path, sheet = sheet_number)

# Convert All Columns to Character
df1 <- df1 %>% mutate(across(everything(), as.character))

# Count Total Missing Values in df1
total_values <- nrow(df1) * ncol(df1)
is_missing <- function(x) { is.na(x) | x == "" }
total_nulls <- sum(sapply(df1, is_missing))
null_percentage_total <- (total_nulls / total_values) * 100

# Identify Completely Empty Rows
empty_rows <- df1 %>% filter(if_all(everything(), ~ is.na(.) | . == ""))

# Display Missing Value Summary
cat("🔍 Total missing values:", total_nulls, "\n")

## 🔍 Total missing values: 1652

cat("📉 Missing percentage:", round(null_percentage_total, 2), "%\n")

## 📉 Missing percentage: 9.62 %

if (nrow(empty_rows) > 0) {
  cat("⚠️ There are", nrow(empty_rows), "completely empty rows.\n")
} else {
  cat("✅ No completely empty rows found.\n")
}

## ✅ No completely empty rows found.

# Remove Completely Empty Rows
df1 <- df1 %>% filter(!if_all(everything(), ~ is.na(.) | . == ""))

# Confirm Removal
cat("🗑️ Removed", nrow(empty_rows), "entirely empty rows.\n")

## 🗑️ Removed 0 entirely empty rows.

# Display Final Dimensions
cat("📊 Final dataset dimensions:", dim(df1)[1], "rows x", dim(df1)[2], "columns\n")

## 📊 Final dataset dimensions: 116 rows x 148 columns

# Calculate Missing Percentage Per Column
missing_summary <- df1 %>%
  summarise(across(everything(), ~ mean(is.na(.) | . == "") * 100)) %>%
  pivot_longer(cols = everything(), names_to = "Column", values_to = "Missing_Percentage") %>%
  arrange(desc(Missing_Percentage))

# Display the summary as a nice table
knitr::kable(missing_summary, caption = "Missing Value Percentage Per Column", digits = 2)

Missing Value Percentage Per Column
Column	Missing_Percentage
p13_CD8.p._HIVdex.p.	95.69
Category2_followup	64.66
p7_CD2.p.CD56.n._CD4.p._RO.p._total.IL13	48.28
p7_CD56.p._total.IFNg	43.10
p7_CD56.p._total.TNFa	43.10
WHO_Stage	42.24
p7_CD2.p.CD56.n._CD4.p._RO.p._total.IL4	41.38
p7_CD2.p.CD56.n._CD4.p._RO.p._total.IL21	39.66
p7_CD2.p.CD56.n._CD4.p._RO.p._total.TNFa	37.93
HIV_log_copies.ml	37.07
p7_CD2.p.CD56.n._CD4.p._RO.n._total.IFNg	37.07
p7_CD2.p.CD56.n._CD4.p._RO.n._total.TNFa	37.07
p7_CD2.p.CD56.n._CD4.p._RO.n._total.IL4	37.07
p7_CD2.p.CD56.n._CD8.p._total.IFNg	37.07
p7_CD2.p.CD56.n._CD8.p._total.TNFa	37.07
p7_CD2.p.CD56.n._CD8.p._total.IL4	37.07
p7_CD2.p.CD56.n._CD8.p._total.IL21	37.07
p8.d0_R6p_total.IL17A	24.14
ART_years	21.55
ART.Start_age_years	21.55
p8.d0_ROp_total.IL17A	19.83
p3_CD3.n.CD19.n._CD14.n.DR.p._CD4.p._CD303.p.CD33.n._in.PBMC	18.97
p6b_RO.p._FOXP3.p..Helios.p.	18.97
p6b_FOXP3.p.Helios.p.	18.10
p6b_RO.n._FOXP3.p..Helios.p.	18.10
p8.d0_ROp_total.IFNg	12.07
p8.d0_ROp_total.IL2	12.07
p9a_CD4.p._RO.p._CD38.p..DR.p.	10.34
p10_CD19.p.	10.34
p10_CD19.p._CD21.n.CD27.p.	10.34
p10_CD19.p._CD21.p.CD27.p.	10.34
p10_CD19.p._CD21.p.CD27.n.	10.34
p10_CD19.p._CD21.n.CD27.n.	10.34
p10_CD19.p._IgD.p.	10.34
p10_CD19.p._IgG.p.	10.34
p10_CD19.p._IgM.p.	10.34
p3_CD3.n.CD19.n._CD14.n.DR.p._CD4.p._CD123.p.CD303.p._in.PBMC	9.48
p2_CD4.p.RO.p._total.CCR9	7.76
p2_CD4.p.RO.n._total.CCR9	7.76
p3_CD3.n.CD19.n._CD14.n.DR.p._CD4.p._in.PBMC	7.76
p3_CD3.n.CD19.n._CD14.n.DR.p._CD4.p._CD303.n._in.PBMC	7.76
p3_CD3.n.CD19.n._CD14.n.DR.p._CD4.p._CD303.p..p._in.PBMC	7.76
p9a_CD8.p._.CD38.p..DR.p.	6.90
calculated Ki67 total in CD8	6.90
p9a_CD8p_ROp_Ki67	6.90
p9a_CD8p_GZB_total	6.90
p9a_CD8p_perforin_total	6.90
p1_CD4_memory_PD1.p.	6.03
p1_CD8_memory_PD1.p.	6.03
p1_CD4.p._PD1.p.CXCR5.p._in.memoryCD4	6.03
p9a_CD4.p._CD38.p..DR.p.	6.03
p9a_CD4p_memory_CD38	6.03
p9a_CD4p_memory_Ki67	6.03
p9a_CD4p_GZB_total	6.03
p9a_CD4p_memory_GZB	6.03
p9a_CD4p_naive_GZB	6.03
p9a_CD4p_perforin_total	6.03
p9a_CD4p_memory_perforin	6.03
p9a_CD4p_naive_perforin	6.03
p11_NK_CD160.p.	6.03
p11_NK_Tim3.p.	6.03
p11_NK_PD1.p.	6.03
calculated Ki67 total in CD4	5.17
p2_CD4.p.RO.p._total.CCR5	4.31
p2_CD4.p.RO.p._total.CCR6	4.31
p2_CD4.p.RO.p._total.CXCR3	4.31
p2_CD4.p.RO.p._total.CCR4	4.31
p2_CD4.p.RO.p._total.CLA	4.31
p2_CD4.p.RO.p._total.beta7	4.31
p4_totalMAIT_percent.ofTcells	4.31
p4_totalNKT_percent.ofTcells	4.31
p4_totalgdT_percent.ofTcells	4.31
p4_totalMAIT_percent.oflymph	4.31
p11m_CD4_memory_Tim3total	4.31
p11m_CD4_memory_PD1.p.2B4.p..CD160.n.Tim3.n._in.PD1total	4.31
p2_CD4.p.RO.n._total.CCR5	3.45
p2_CD4.p.RO.n._total.CCR6	3.45
p2_CD4.p.RO.n._total.CXCR3	3.45
p2_CD4.p.RO.n._total.CCR4	3.45
p2_CD4.p.RO.n._total.CLA	3.45
p2_CD4.p.RO.n._total.beta7	3.45
p11m_CD4_memory_PD1total	3.45
p11m_CD4_memory_2B4total	3.45
p11m_CD4_memory_CD160total	3.45
p11m_CD4_memory_PD1.p.2B4.p..CD160.n.Tim3.p._in.PD1total	3.45
p11m_CD4_memory_PD1.p.2B4.p..CD160.p.Tim3.p._in.PD1total	3.45
p11m_CD4_memory_PD1.p.2B4.p..CD160.p.Tim3.n._in.PD1total	3.45
p11m_CD4_memory_PD1.p.2B4.n..CD160.n.Tim3.p._in.PD1total	3.45
p11m_CD4_memory_PD1.p.2B4.n..CD160.p.Tim3.p._in.PD1total	3.45
p11m_CD4_memory_PD1.p.2B4.n..CD160.p.Tim3.n._in.PD1total	3.45
p11m_CD4_memory_PD1.p.2B4.n..CD160.n.Tim3.n._in.PD1total	3.45
p1_CD4.p._RO.p._CCR7.p.27.p._in.CD4	2.59
p1_CD4.p._RO.p._CCR7.n.27.p._in.CD4	2.59
p1_CD4.p._RO.p._CCR7.n.27.n._in.CD4	2.59
p1_CD4.p._RO.n._CCR7.n.27.n._in.CD4	2.59
p1_CD4.p._RO.n._CCR7.p.27.p._in.CD4	2.59
p1_CD4.p._totalmemory_in.CD4	2.59
p1_CD4.p._totalCD31_in.RO.n.	2.59
p1_CD4.p._totalCD31_in.RO.p.	2.59
p1_CD4.p._total.CXCR5_in.memoryCD4	2.59
p1_CD4.p._CD57.p._in.totalmemory.CD4	2.59
p1_CD4.p._CD31.p._in.totalmemory.CD4	2.59
p1_CD8.p._RO.p._CCR7.p.27.p._in.CD8	2.59
p1_CD8.p._RO.p._CCR7.n.27.p._in.CD8	2.59
p1_CD8.p._RO.p._CCR7.n.27.n._in.CD8	2.59
p1_CD8.p._RO.n._CCR7.n.27.n._in.CD8	2.59
p1_CD8.p._RO.n._CCR7.p.27.p._in.CD8	2.59
p1_CD8.p._totalmemory_in.CD8	2.59
p1_CD8.p._total.CXCR5_in.memoryCD8	2.59
p1_CD8.p._CD57.p._in.totalmemory.CD8	2.59
p1_CD8.p._CD31.p._in.totalmemory.CD8	2.59
p4_CD4.CD8.ratio	2.59
p4_CD16.n.CD56low_percent.oflymph	2.59
p4_CD16.n.CD56.p.p._percent.oflymph	2.59
p4_CD16.p.CD56.n._percent.oflymph	2.59
p4_CD16.p.p.CD56.n._percent.oflymph	2.59
p4_CD16.p.CD56.p._percent.oflymph	2.59
p4_NK.total_percent.of.lymph	2.59
p11m_CD8_memory_PD1total	2.59
p11m_CD8_memory_2B4total	2.59
p11m_CD8_memory_CD160total	2.59
p11m_CD8_memory_Tim3total	2.59
p11m_CD8_memory_PD1.p.2B4.p..CD160.n.Tim3.p._in.PD1total	2.59
p11m_CD8_memory_PD1.p.2B4.p..CD160.p.Tim3.p._in.PD1total	2.59
p11m_CD8_memory_PD1.p.2B4.p..CD160.p.Tim3.n._in.PD1total	2.59
p11m_CD8_memory_PD1.p.2B4.p..CD160.n.Tim3.n._in.PD1total	2.59
p11m_CD8_memory_PD1.p.2B4.n..CD160.n.Tim3.p._in.PD1total	2.59
p11m_CD8_memory_PD1.p.2B4.n..CD160.p.Tim3.p._in.PD1total	2.59
p11m_CD8_memory_PD1.p.2B4.n..CD160.p.Tim3.n._in.PD1total	2.59
p11m_CD8_memory_PD1.p.2B4.n..CD160.n.Tim3.n._in.PD1total	2.59
p11_CD4_naive_PD1total	2.59
p11_CD8_naive_PD1total	2.59
sCD40L_pg.mL	1.72
PIA	0.00
Visit	0.00
Entry.Date	0.00
Category	0.00
HIV.Category	0.00
HIV_SubCategory	0.00
HIV_PNP_AbsCD4	0.00
HIV_PNP_percentCD4	0.00
Sex	0.00
Age_years	0.00
Absolute_CD4	0.00
percent_CD4	0.00
sCD14_x10.6_pg.ml	0.00
sCD163_ng.ml	0.00
IFABP_pg.mL	0.00

#Step 2: Add HIV Status Category and Create df2

# Categorize Based on HIV_PNP_AbsCD4 and Create df2

# Ensure the column is numeric for comparison
df2 <- df1 %>%
  mutate(HIV_PNP_AbsCD4 = as.numeric(HIV_PNP_AbsCD4),
         Category = case_when(
           HIV_PNP_AbsCD4 == 1 ~ "HIV Negative",
           HIV_PNP_AbsCD4 == 2.3 ~ "LTNP",
           HIV_PNP_AbsCD4 == 2.4 ~ "Progressors",
           HIV_PNP_AbsCD4 == 3 ~ "HIV Positive with Treatment",
           TRUE ~ NA_character_
         ))
head(df2)

## # A tibble: 6 × 148
##   PIA   Visit Entry.Date Category                   HIV.Category HIV_SubCategory
##   <chr> <chr> <chr>      <chr>                      <chr>        <chr>          
## 1 27    1     2011-12-20 HIV Positive with Treatme… 3            3.4            
## 2 83    1     2012-02-16 HIV Negative               1            1              
## 3 141   1     2012-03-30 Progressors                2            2              
## 4 51    1     2012-01-13 HIV Positive with Treatme… 3            3.4            
## 5 145   1     2012-04-16 Progressors                2            2              
## 6 36    1     2012-01-03 HIV Negative               1            1              
## # ℹ 142 more variables: Category2_followup <chr>, HIV_PNP_AbsCD4 <dbl>,
## #   HIV_PNP_percentCD4 <chr>, Sex <chr>, Age_years <chr>, WHO_Stage <chr>,
## #   Absolute_CD4 <chr>, percent_CD4 <chr>, ART_years <chr>,
## #   ART.Start_age_years <chr>, HIV_log_copies.ml <chr>,
## #   sCD14_x10.6_pg.ml <chr>, sCD163_ng.ml <chr>, sCD40L_pg.mL <chr>,
## #   IFABP_pg.mL <chr>, p1_CD4.p._RO.p._CCR7.p.27.p._in.CD4 <chr>,
## #   p1_CD4.p._RO.p._CCR7.n.27.p._in.CD4 <chr>, …

#Step 3: Generate Summary Table by HIV Category

# Load required package for table display
library(knitr)

# Create summary table
df2_summary <- df2 %>%
  group_by(Category) %>%
  summarise(Count = n(), .groups = "drop") %>%
  arrange(match(Category, c("HIV Negative", "LTNP", "Progressors", "HIV Positive with Treatment")))

# Display summary in a formatted table
kable(df2_summary, col.names = c("HIV Category", "Count"))

HIV Category	Count
HIV Negative	43
LTNP	12
Progressors	30
HIV Positive with Treatment	31

#Step 4: Remove Selected Columns from df2 and Create df3 # removed columns with more than 37% missing data and categorical variables

# Define the columns to remove
columns_to_remove <- c(
  "p13_CD8.p._HIVdex.p.",
  "Category2_followup",
  "p7_CD2.p.CD56.n._CD4.p._RO.p._total.IL13",
  "p7_CD56.p._total.IFNg",
  "p7_CD56.p._total.TNFa",
  "WHO_Stage",
  "p7_CD2.p.CD56.n._CD4.p._RO.p._total.IL4",
  "p7_CD2.p.CD56.n._CD4.p._RO.p._total.IL21",
  "p7_CD2.p.CD56.n._CD4.p._RO.p._total.TNFa",
  "p7_CD2.p.CD56.n._CD4.p._RO.n._total.IFNg",
  "p7_CD2.p.CD56.n._CD4.p._RO.n._total.TNFa",
  "p7_CD2.p.CD56.n._CD4.p._RO.n._total.IL4",
  "p7_CD2.p.CD56.n._CD8.p._total.IFNg",
  "p7_CD2.p.CD56.n._CD8.p._total.TNFa",
  "p7_CD2.p.CD56.n._CD8.p._total.IL4",
  "p7_CD2.p.CD56.n._CD8.p._total.IL21",
  "Visit",
  "Entry.Date",
  "Sex",
  "HIV_PNP_AbsCD4",
  "HIV.Category",
  "HIV_SubCategory",
  "HIV_PNP_percentCD4"
)

# Remove columns and create df3
df3 <- df2 %>% select(-all_of(columns_to_remove))

# Print number of rows and columns in df3
cat("df3 has", nrow(df3), "rows and", ncol(df3), "columns.\n")

## df3 has 116 rows and 125 columns.

Step 5: Calculate Missing Count and Percentage per Sample, Using PIA # removing them from data frame

# Step 6: Calculate Missing Count and Percentage per Sample, Using PIA

# Confirm PIA exists
if (!"PIA" %in% colnames(df3)) {
  stop("PIA column not found in df3.")
}

# Calculate missing count and percentage
missing_summary <- df3 %>%
  mutate(across(everything(), ~ na_if(., ""))) %>%  # Treat empty strings as NA
  rowwise() %>%
  mutate(Missing_Count = sum(is.na(c_across(-PIA))),
         Missing_Percentage = (Missing_Count / (ncol(df3) - 1)) * 100) %>%  # exclude PIA
  ungroup() %>%
  select(PIA, Category, Missing_Count, Missing_Percentage)

# View top 3 rows with highest missing percentage
missing_summary %>%
  arrange(desc(Missing_Percentage)) %>%
  head(3)

## # A tibble: 3 × 4
##   PIA   Category     Missing_Count Missing_Percentage
##   <chr> <chr>                <int>              <dbl>
## 1 17    HIV Negative           116               93.5
## 2 110   HIV Negative           114               91.9
## 3 151   HIV Negative            89               71.8

#Step 6: Remove Top 3 Rows with Highest Missing Percentage and Create df4

# Identify top 3 PIA values with the highest missing percentage
top3_pia <- missing_summary %>%
  arrange(desc(Missing_Percentage)) %>%
  slice_head(n = 3) %>%
  pull(PIA)

# Remove those rows from df3 to create df4
df4 <- df3 %>% filter(!PIA %in% top3_pia)

# Confirm number of rows removed
cat("Removed", length(top3_pia), "rows. df4 has", nrow(df4), "rows and", ncol(df4), "columns.\n")

## Removed 3 rows. df4 has 113 rows and 125 columns.

#Step 7: Visualize Count of Samples per HIV Category

library(ggplot2)
library(dplyr)

# Step 8A: Count samples per Category
category_counts <- df4 %>%
  group_by(Category) %>%
  summarise(Count = n(), .groups = "drop")

# Step 8B: Bar Chart with Count Labels
ggplot(category_counts, aes(x = Category, y = Count, fill = Category)) +
  geom_bar(stat = "identity") +
  geom_text(aes(label = Count), vjust = -0.5, size = 5) +
  labs(title = "Sample Count per HIV Category", x = "HIV Category", y = "Count") +
  ylim(0, 50) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Step 8C: Pie Chart with Count Labels
category_counts <- category_counts %>%
  mutate(Label = paste0(Category, " (", Count, ")"))

ggplot(category_counts, aes(x = "", y = Count, fill = Label)) +
  geom_col(width = 1) +
  coord_polar(theta = "y") +
  labs(title = "Proportion of HIV Categories (by Count)") +
  theme_void()

#Step 8: Visualize of heatmap

# Load required libraries
library(dplyr)
library(tibble)
library(pheatmap)
library(RColorBrewer)

# Step 1: Create AgeGroup and annotation from df4
df4 <- df4 %>%
  mutate(
    Age_years = as.numeric(Age_years),
    AgeGroup = cut(Age_years, breaks = c(0, 9, 14, 19),
                   labels = c("5-9", "10-14", "15-19"))
  )

# Step 2: Create annotation dataframe (Category and AgeGroup)
annotation <- df4 %>%
  filter(!is.na(Category) & !is.na(AgeGroup)) %>%
  select(PIA, Category, AgeGroup) %>%
  column_to_rownames("PIA")

# Step 3: Extract immune markers (columns 4 to 126 = 123 markers)
heatmap_data <- df4 %>%
  filter(PIA %in% rownames(annotation)) %>%
  select(PIA, 4:126) %>%
  column_to_rownames("PIA") %>%
  mutate(across(everything(), ~ as.numeric(.))) %>%
  scale() %>%
  t() %>%
  as.data.frame()

# Step 4: Reorder columns by Category for visual "box" effect
annotation <- annotation[order(annotation$Category), ]
heatmap_data <- heatmap_data[, rownames(annotation)]

# Step 5: Define strong color contrast for AgeGroup
annotation_colors <- list(
  Category = c(
    "HIV Negative" = "#80b1d3",
    "LTNP" = "#fb8072",
    "Progressors" = "#fdb462",
    "HIV Positive with Treatment" = "#b3a2e3"
  ),
  AgeGroup = c(
    "5-9" = "light green",     # Red
    "10-14" = "#4daf4a",   # Green
    "15-19" = "dark green"    # Blue
  )
)

# Step 6: Compute gap positions for HIV Category boxes
category_gaps <- cumsum(table(annotation$Category))

# Step 7: Generate heatmap with horizontal boxes using `gaps_col`
pheatmap(
  mat = as.matrix(heatmap_data),
  annotation_col = annotation,
  annotation_colors = annotation_colors,
  gaps_col = category_gaps,               # 👈 Draws horizontal separation between groups
  cluster_cols = FALSE,                   # 👈 Keeps group order intact
  cluster_rows = TRUE,
  color = colorRampPalette(rev(brewer.pal(9, "RdBu")))(100),
  show_colnames = FALSE,
  fontsize_row = 3.2,
  main = "Immune Marker Heatmap with HIV Category Segments (Top Boxes)",
   na_col = "black"
)

# Step 8: Display heatmap structure info
cat("✅ Number of immune markers (rows):", nrow(heatmap_data), "\n")

## ✅ Number of immune markers (rows): 123

cat("✅ Number of samples (columns):", ncol(heatmap_data), "\n\n")

## ✅ Number of samples (columns): 113

cat("📊 HIV Category Counts:\n")

## 📊 HIV Category Counts:

print(table(annotation$Category))

## 
##                HIV Negative HIV Positive with Treatment 
##                          40                          31 
##                        LTNP                 Progressors 
##                          12                          30

cat("\n📊 Age Group Counts:\n")

## 
## 📊 Age Group Counts:

print(table(annotation$AgeGroup))

## 
##   5-9 10-14 15-19 
##    32    58    23

marking missing data in dark grey, remove the heads and add legend for z score

#Saves the image

# Save square-shaped heatmap image
png("heatmap_square.png", width = 3200, height = 3200, res = 300)

pheatmap(
  mat = as.matrix(heatmap_data),
  annotation_col = annotation,
  annotation_colors = annotation_colors,
  gaps_col = category_gaps,
  cluster_cols = FALSE,
  cluster_rows = TRUE,
  color = colorRampPalette(rev(brewer.pal(9, "RdBu")))(100),
  show_colnames = FALSE,
  fontsize_row = 4.5,
  main = "Immune Marker Heatmap - Square View (Category & AgeGroup)",
  na_col = "black"
)

dev.off()

## quartz_off_screen 
##                 3

#plot:Shows the expression variation in the top 6 biomarkers across all 4 groups. Visually distinguishes LTNP vs others—making it easier to observe unique biomarker patterns

# Load libraries
library(ggplot2)
library(dplyr)
library(tidyr)

# Step 1: Ensure immune marker columns are numeric
immune_cols <- colnames(df4)[4:126]
df4[immune_cols] <- df4[immune_cols] %>%
  mutate(across(everything(), as.numeric))

# Step 2: Find top 6 most variable biomarkers (by SD)
top6_markers <- df4 %>%
  summarise(across(all_of(immune_cols), ~ sd(., na.rm = TRUE))) %>%
  pivot_longer(everything(), names_to = "Marker", values_to = "SD") %>%
  arrange(desc(SD)) %>%
  slice(1:6) %>%
  pull(Marker)

# Step 3: Reshape data for plotting
df_long <- df4 %>%
  select(PIA, Category, all_of(top6_markers)) %>%
  pivot_longer(cols = all_of(top6_markers), names_to = "Marker", values_to = "Expression") %>%
  drop_na()

# Step 4: Plot faceted boxplots
ggplot(df_long, aes(x = Category, y = Expression, fill = Category)) +
  geom_boxplot(outlier.shape = NA) +
  facet_wrap(~ Marker, scales = "free_y", nrow = 2) +
  theme_minimal(base_size = 13) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        strip.text = element_text(face = "bold"),
        legend.position = "none") +
  labs(
    title = "Top 6 Variable Immune Biomarkers by HIV Category",
    y = "Expression Level",
    x = "HIV Category"
  )

The top six immune bio-markers visualized—Absolute_CD4, IFABP_pg.mL, p1_CD8.p._totalmemory_in.CD8, p9a_CD8p_GZB_total, sCD163_ng.ml, and sCD40L_pg.mL—were selected based on their high standard deviation (SD) across all samples, indicating greater inter-individual variability in immune response.

#plot:Shows the expression variation in the HIV disease progession immune bio-markers across all 4 groups. Visually distinguishes LTNP vs others—making it easier to observe unique biomarker patterns

# Load libraries
library(ggplot2)
library(dplyr)
library(tidyr)

# Step 1: Ensure immune marker columns are numeric
immune_cols <- colnames(df4)[4:126]
df4[immune_cols] <- df4[immune_cols] %>%
  mutate(across(everything(), as.numeric))

# Step 2: Specify actual biomarker names directly from df4
custom_markers <- c(
  "percent_CD4",
  "HIV_log_copies.ml",
  "p4_CD4.CD8.ratio",
  "p9a_CD4.p._RO.p._CD38.p..DR.p.",
  "p9a_CD8.p._.CD38.p..DR.p."
)

# Step 3: Reshape data for plotting
df_long_custom <- df4 %>%
  select(PIA, Category, all_of(custom_markers)) %>%
  pivot_longer(cols = all_of(custom_markers), names_to = "Marker", values_to = "Expression") %>%
  drop_na()

# Step 4: Faceted boxplot
ggplot(df_long_custom, aes(x = Category, y = Expression, fill = Category)) +
  geom_boxplot(outlier.shape = NA) +
  facet_wrap(~ Marker, scales = "free_y", nrow = 2) +
  theme_minimal(base_size = 13) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    strip.text = element_text(face = "bold"),
    legend.position = "none"
  ) +
  labs(
    title = "Selected Immune Biomarkers by HIV Category",
    y = "Expression Level",
    x = "HIV Category"
  )

#This grouped bar chart visually compares LTNP with each category for the top 10 differing immune biomarkers.

# Load libraries
library(dplyr)
library(tidyr)
library(ggplot2)

# Step 1: Ensure immune marker columns are numeric
immune_cols <- colnames(df4)[4:126]
df4[immune_cols] <- df4[immune_cols] %>%
  mutate(across(everything(), as.numeric))

# Step 2: Calculate mean expression for each Category and each immune marker
marker_means <- df4 %>%
  filter(Category %in% c("LTNP", "HIV Negative", "Progressors", "HIV Positive with Treatment")) %>%
  group_by(Category) %>%
  summarise(across(all_of(immune_cols), ~ mean(., na.rm = TRUE)), .groups = "drop")

# Step 3: Reshape to long format
marker_means_long <- marker_means %>%
  pivot_longer(cols = -Category, names_to = "Marker", values_to = "Mean_Expression")

# Step 4: Select top 10 most differentially expressed markers for visual clarity
top10_diff <- marker_means_long %>%
  pivot_wider(names_from = Category, values_from = Mean_Expression) %>%
  mutate(Diff = `LTNP` - ((`HIV Negative` + Progressors + `HIV Positive with Treatment`) / 3)) %>%
  arrange(desc(abs(Diff))) %>%
  slice(1:10) %>%
  pull(Marker)

# Step 5: Filter the data for top 10 markers
plot_df <- marker_means_long %>%
  filter(Marker %in% top10_diff)

# Step 6 (Updated): Plot grouped bar chart with correct marker names
ggplot(plot_df, aes(x = factor(Marker, levels = top10_diff), y = Mean_Expression, fill = Category)) + 
  geom_bar(stat = "identity", position = "dodge") +
  theme_minimal(base_size = 12) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Top 10 Immune Markers: LTNP vs Other Categories",
    y = "Mean Expression",
    x = "Immune Marker"
  )

# Load required libraries
library(dplyr)
library(ggplot2)

# Step 1: Convert all immune marker columns to numeric
immune_cols <- colnames(df4)[4:126]
df4[immune_cols] <- df4[immune_cols] %>%
  mutate(across(everything(), as.numeric))

# Step 2: Compute average expression per individual across all 123 markers
df4 <- df4 %>%
  rowwise() %>%
  mutate(Average_Immune_Expression = mean(c_across(all_of(immune_cols)), na.rm = TRUE)) %>%
  ungroup()

# Step 3: Group by HIV Category and compute group averages
group_summary <- df4 %>%
  group_by(Category) %>%
  summarise(Mean_Immune = mean(Average_Immune_Expression, na.rm = TRUE),
            SD_Immune = sd(Average_Immune_Expression, na.rm = TRUE),
            .groups = "drop")

# Step 4: Plot the bar chart
ggplot(group_summary, aes(x = Category, y = Mean_Immune, fill = Category)) +
  geom_bar(stat = "identity", width = 0.7) +
  geom_errorbar(aes(ymin = Mean_Immune - SD_Immune, ymax = Mean_Immune + SD_Immune),
                width = 0.2) +
  theme_minimal(base_size = 13) +
  labs(
    title = "Overall Immune Marker Burden Across HIV Categories",
    x = "HIV Category",
    y = "Average Expression of Immune Markers"
  ) +
  theme(axis.text.x = element_text(angle = 30, hjust = 1))

To assess overall immune activation profiles across HIV phenotypes, we calculated the average expression of 123 immune biomarkers per individual by taking the mean of all marker values for each participant. This composite metric represents the individual’s overall immune marker burden—a global reflection of systemic immune activity. Group-level summaries were then computed to compare the average burden across four categories: HIV Negative, HIV Positive with Treatment, LTNPs (Long-Term Non-Progressors), and Progressors. The resulting bar plot reveals that HIV-negative individuals and LTNPs exhibited the highest average immune expression, suggesting a broadly active or preserved immune profile. In contrast, the treated HIV-positive group displayed the lowest overall burden, likely due to immune suppression achieved through antiretroviral therapy. Progressors showed intermediate values, consistent with ongoing immune activation associated with disease advancement. These findings are directly aligned with the study’s aim of characterizing distinct immune activation signatures in perinatally infected children, highlighting that LTNPs may maintain immune activation levels closer to healthy controls, potentially contributing to their non-progressor phenotype despite chronic HIV infection.

# Load required libraries
library(dplyr)
library(tidyr)
library(ggplot2)
library(knitr)

# Step 1: Ensure immune marker columns are numeric
immune_cols <- colnames(df4)[4:126]  # Adjust if your marker columns are in a different range
df4[immune_cols] <- df4[immune_cols] %>%
  mutate(across(everything(), as.numeric))

# Step 2: Convert to long format (with real marker names)
df_long <- df4 %>%
  select(Category, all_of(immune_cols)) %>%
  pivot_longer(-Category, names_to = "Marker", values_to = "Expression") %>%
  filter(!is.na(Expression))

# Step 3: Calculate mean expression per marker per HIV category
marker_summary <- df_long %>%
  group_by(Category, Marker) %>%
  summarise(Mean_Expression = mean(Expression, na.rm = TRUE), .groups = "drop")

# Step 4: Plot bar chart with real marker names
ggplot(marker_summary, aes(x = Marker, y = Mean_Expression, fill = Category)) +
  geom_bar(stat = "identity", position = "dodge") +
  theme_minimal(base_size = 11) +
  labs(
    title = "Average Expression of Immune Markers by HIV Category",
    x = "Immune Marker",
    y = "Mean Expression"
  ) +
  theme(
    axis.text.x = element_text(angle = 90, size = 6, hjust = 1),
    legend.position = "top"
  )

# Step 5: Optional – Create a table of immune markers if needed
# If you want a summary or export of marker names
marker_lookup_table <- data.frame(
  Marker = immune_cols,
  stringsAsFactors = FALSE
)

# Display the marker name table in Markdown/PDF output
kable(marker_lookup_table, caption = "List of Immune Markers Used in Analysis")

List of Immune Markers Used in Analysis
Marker
Absolute_CD4
percent_CD4
ART_years
ART.Start_age_years
HIV_log_copies.ml
sCD14_x10.6_pg.ml
sCD163_ng.ml
sCD40L_pg.mL
IFABP_pg.mL
p1_CD4.p._RO.p._CCR7.p.27.p._in.CD4
p1_CD4.p._RO.p._CCR7.n.27.p._in.CD4
p1_CD4.p._RO.p._CCR7.n.27.n._in.CD4
p1_CD4.p._RO.n._CCR7.n.27.n._in.CD4
p1_CD4.p._RO.n._CCR7.p.27.p._in.CD4
p1_CD4.p._totalmemory_in.CD4
p1_CD4.p._totalCD31_in.RO.n.
p1_CD4.p._totalCD31_in.RO.p.
p1_CD4.p._total.CXCR5_in.memoryCD4
p1_CD4.p._CD57.p._in.totalmemory.CD4
p1_CD4.p._CD31.p._in.totalmemory.CD4
p1_CD8.p._RO.p._CCR7.p.27.p._in.CD8
p1_CD8.p._RO.p._CCR7.n.27.p._in.CD8
p1_CD8.p._RO.p._CCR7.n.27.n._in.CD8
p1_CD8.p._RO.n._CCR7.n.27.n._in.CD8
p1_CD8.p._RO.n._CCR7.p.27.p._in.CD8
p1_CD8.p._totalmemory_in.CD8
p1_CD8.p._total.CXCR5_in.memoryCD8
p1_CD8.p._CD57.p._in.totalmemory.CD8
p1_CD8.p._CD31.p._in.totalmemory.CD8
p1_CD4_memory_PD1.p.
p1_CD8_memory_PD1.p.
p1_CD4.p._PD1.p.CXCR5.p._in.memoryCD4
p2_CD4.p.RO.p._total.CCR5
p2_CD4.p.RO.p._total.CCR6
p2_CD4.p.RO.p._total.CXCR3
p2_CD4.p.RO.p._total.CCR4
p2_CD4.p.RO.p._total.CLA
p2_CD4.p.RO.p._total.beta7
p2_CD4.p.RO.p._total.CCR9
p2_CD4.p.RO.n._total.CCR5
p2_CD4.p.RO.n._total.CCR6
p2_CD4.p.RO.n._total.CXCR3
p2_CD4.p.RO.n._total.CCR4
p2_CD4.p.RO.n._total.CLA
p2_CD4.p.RO.n._total.beta7
p2_CD4.p.RO.n._total.CCR9
p3_CD3.n.CD19.n._CD14.n.DR.p._CD4.p._in.PBMC
p3_CD3.n.CD19.n._CD14.n.DR.p._CD4.p._CD303.n._in.PBMC
p3_CD3.n.CD19.n._CD14.n.DR.p._CD4.p._CD303.p..p._in.PBMC
p3_CD3.n.CD19.n._CD14.n.DR.p._CD4.p._CD303.p.CD33.n._in.PBMC
p3_CD3.n.CD19.n._CD14.n.DR.p._CD4.p._CD123.p.CD303.p._in.PBMC
p4_CD4.CD8.ratio
p4_totalMAIT_percent.ofTcells
p4_totalNKT_percent.ofTcells
p4_totalgdT_percent.ofTcells
p4_totalMAIT_percent.oflymph
p4_CD16.n.CD56low_percent.oflymph
p4_CD16.n.CD56.p.p._percent.oflymph
p4_CD16.p.CD56.n._percent.oflymph
p4_CD16.p.p.CD56.n._percent.oflymph
p4_CD16.p.CD56.p._percent.oflymph
p4_NK.total_percent.of.lymph
p6b_FOXP3.p.Helios.p.
p6b_RO.p._FOXP3.p..Helios.p.
p6b_RO.n._FOXP3.p..Helios.p.
p8.d0_R6p_total.IL17A
p8.d0_ROp_total.IL17A
p8.d0_ROp_total.IFNg
p8.d0_ROp_total.IL2
p9a_CD4.p._CD38.p..DR.p.
calculated Ki67 total in CD4
p9a_CD4.p._RO.p._CD38.p..DR.p.
p9a_CD8.p._.CD38.p..DR.p.
calculated Ki67 total in CD8
p9a_CD4p_memory_CD38
p9a_CD4p_memory_Ki67
p9a_CD4p_GZB_total
p9a_CD4p_memory_GZB
p9a_CD4p_naive_GZB
p9a_CD4p_perforin_total
p9a_CD4p_memory_perforin
p9a_CD4p_naive_perforin
p9a_CD8p_ROp_Ki67
p9a_CD8p_GZB_total
p9a_CD8p_perforin_total
p10_CD19.p.
p10_CD19.p._CD21.n.CD27.p.
p10_CD19.p._CD21.p.CD27.p.
p10_CD19.p._CD21.p.CD27.n.
p10_CD19.p._CD21.n.CD27.n.
p10_CD19.p._IgD.p.
p10_CD19.p._IgG.p.
p10_CD19.p._IgM.p.
p11m_CD4_memory_PD1total
p11m_CD8_memory_PD1total
p11m_CD4_memory_2B4total
p11m_CD8_memory_2B4total
p11m_CD4_memory_CD160total
p11m_CD8_memory_CD160total
p11m_CD4_memory_Tim3total
p11m_CD8_memory_Tim3total
p11m_CD8_memory_PD1.p.2B4.p..CD160.n.Tim3.p._in.PD1total
p11m_CD8_memory_PD1.p.2B4.p..CD160.p.Tim3.p._in.PD1total
p11m_CD8_memory_PD1.p.2B4.p..CD160.p.Tim3.n._in.PD1total
p11m_CD8_memory_PD1.p.2B4.p..CD160.n.Tim3.n._in.PD1total
p11m_CD8_memory_PD1.p.2B4.n..CD160.n.Tim3.p._in.PD1total
p11m_CD8_memory_PD1.p.2B4.n..CD160.p.Tim3.p._in.PD1total
p11m_CD8_memory_PD1.p.2B4.n..CD160.p.Tim3.n._in.PD1total
p11m_CD8_memory_PD1.p.2B4.n..CD160.n.Tim3.n._in.PD1total
p11m_CD4_memory_PD1.p.2B4.p..CD160.n.Tim3.p._in.PD1total
p11m_CD4_memory_PD1.p.2B4.p..CD160.p.Tim3.p._in.PD1total
p11m_CD4_memory_PD1.p.2B4.p..CD160.p.Tim3.n._in.PD1total
p11m_CD4_memory_PD1.p.2B4.p..CD160.n.Tim3.n._in.PD1total
p11m_CD4_memory_PD1.p.2B4.n..CD160.n.Tim3.p._in.PD1total
p11m_CD4_memory_PD1.p.2B4.n..CD160.p.Tim3.p._in.PD1total
p11m_CD4_memory_PD1.p.2B4.n..CD160.p.Tim3.n._in.PD1total
p11m_CD4_memory_PD1.p.2B4.n..CD160.n.Tim3.n._in.PD1total
p11_CD4_naive_PD1total
p11_CD8_naive_PD1total
p11_NK_CD160.p.
p11_NK_Tim3.p.
p11_NK_PD1.p.
AgeGroup

#Shows the distribution of average marker expression across all 123 markers for each group.

# Boxplot of expression values across markers per category
ggplot(df_long, aes(x = Category, y = Expression, fill = Category)) +
  geom_boxplot(outlier.size = 0.5) +
  labs(
    title = "Distribution of Immune Marker Expression by HIV Category",
    y = "Expression Value", x = ""
  ) +
  theme_minimal() +
  theme(legend.position = "none")

library(dplyr)
library(tidyr)

# Step 1: Ensure immune markers are numeric
immune_cols <- colnames(df4)[4:126]
df4[immune_cols] <- df4[immune_cols] %>% mutate(across(everything(), as.numeric))

# Step 2: Reshape to long format
df_long_normality <- df4 %>%
  select(Category, all_of(immune_cols)) %>%
  pivot_longer(cols = -Category, names_to = "Marker", values_to = "Value") %>%
  filter(!is.na(Value) & !is.na(Category))

# Step 3: Run Shapiro-Wilk test for normality per Marker and Category
shapiro_results <- df_long_normality %>%
  group_by(Marker, Category) %>%
  summarise(
    Shapiro_p = tryCatch(shapiro.test(Value)$p.value, error = function(e) NA_real_),
    .groups = "drop"
  ) %>%
  arrange(Shapiro_p)

# Step 4: Identify non-normal combinations (p < 0.05)
non_normal <- shapiro_results %>% filter(Shapiro_p < 0.05)

# Step 5: Output Results
cat("🧪 Shapiro-Wilk Normality Test Results (df4):\n")

## 🧪 Shapiro-Wilk Normality Test Results (df4):

print(shapiro_results)

## # A tibble: 491 × 3
##    Marker                                                   Category   Shapiro_p
##    <chr>                                                    <chr>          <dbl>
##  1 p11m_CD4_memory_Tim3total                                HIV Negat…  4.61e-11
##  2 IFABP_pg.mL                                              HIV Posit…  1.24e-10
##  3 p11m_CD4_memory_PD1.p.2B4.n..CD160.p.Tim3.p._in.PD1total HIV Negat…  1.76e-10
##  4 HIV_log_copies.ml                                        HIV Posit…  2.24e-10
##  5 p9a_CD4p_memory_perforin                                 Progresso…  2.34e-10
##  6 p9a_CD4p_perforin_total                                  Progresso…  2.36e-10
##  7 p2_CD4.p.RO.n._total.CXCR3                               Progresso…  3.47e-10
##  8 IFABP_pg.mL                                              HIV Negat…  3.59e-10
##  9 p11m_CD4_memory_PD1.p.2B4.n..CD160.p.Tim3.n._in.PD1total HIV Posit…  4.86e-10
## 10 p4_totalNKT_percent.ofTcells                             HIV Negat…  5.15e-10
## # ℹ 481 more rows

cat("\n❌ Marker-Category combinations violating normality (p < 0.05):\n")

## 
## ❌ Marker-Category combinations violating normality (p < 0.05):

print(non_normal)

## # A tibble: 256 × 3
##    Marker                                                   Category   Shapiro_p
##    <chr>                                                    <chr>          <dbl>
##  1 p11m_CD4_memory_Tim3total                                HIV Negat…  4.61e-11
##  2 IFABP_pg.mL                                              HIV Posit…  1.24e-10
##  3 p11m_CD4_memory_PD1.p.2B4.n..CD160.p.Tim3.p._in.PD1total HIV Negat…  1.76e-10
##  4 HIV_log_copies.ml                                        HIV Posit…  2.24e-10
##  5 p9a_CD4p_memory_perforin                                 Progresso…  2.34e-10
##  6 p9a_CD4p_perforin_total                                  Progresso…  2.36e-10
##  7 p2_CD4.p.RO.n._total.CXCR3                               Progresso…  3.47e-10
##  8 IFABP_pg.mL                                              HIV Negat…  3.59e-10
##  9 p11m_CD4_memory_PD1.p.2B4.n..CD160.p.Tim3.n._in.PD1total HIV Posit…  4.86e-10
## 10 p4_totalNKT_percent.ofTcells                             HIV Negat…  5.15e-10
## # ℹ 246 more rows

cat("\n📊 Number of non-normal combinations:", nrow(non_normal), "out of", nrow(shapiro_results), "\n")

## 
## 📊 Number of non-normal combinations: 256 out of 491

# Step 6: Final conclusion
if (nrow(non_normal) / nrow(shapiro_results) > 0.5) {
  cat("\n⚠️ Conclusion: The data is NOT normally distributed in the majority of Marker-Category combinations.\n")
  cat("✅ Recommendation: Use non-parametric tests (e.g., Kruskal-Wallis, Wilcoxon).\n")
} else {
  cat("\n✅ Conclusion: Most Marker-Category combinations do not violate normality.\n")
  cat("📌 Recommendation: Parametric tests (e.g., t-test, ANOVA) may be appropriate.\n")
}

## 
## ⚠️ Conclusion: The data is NOT normally distributed in the majority of Marker-Category combinations.
## ✅ Recommendation: Use non-parametric tests (e.g., Kruskal-Wallis, Wilcoxon).

#tests

# Load required libraries
library(dplyr)
library(tidyr)

# Step 0: Identify immune marker columns directly from df4
immune_cols <- colnames(df4)[!(colnames(df4) %in% c("PIA", "Category", "Age_years"))]

# Step 1: Prepare data subset for testing (retain real marker names)
immune_data <- df4 %>%
  filter(!is.na(Category)) %>%
  select(Category, all_of(immune_cols))

# Step 2: Convert to long format and clean
immune_long <- immune_data %>%
  pivot_longer(-Category, names_to = "Marker", values_to = "Value") %>%
  mutate(Value = as.numeric(Value)) %>%
  filter(!is.na(Value))

# Step 3A: T-test: LTNP vs HIV Negative
ttest_results <- immune_long %>%
  filter(Category %in% c("LTNP", "HIV Negative")) %>%
  group_by(Marker) %>%
  summarise(
    p_value = tryCatch(t.test(Value ~ Category)$p.value, error = function(e) NA_real_),
    .groups = "drop"
  ) %>%
  arrange(p_value)

# Step 3B: Kruskal-Wallis Test (All Categories)
kruskal_results <- immune_long %>%
  group_by(Marker) %>%
  summarise(
    p_value = tryCatch(kruskal.test(Value ~ Category)$p.value, error = function(e) NA_real_),
    .groups = "drop"
  ) %>%
  arrange(p_value)

# Step 3C: Mann-Whitney U Test: LTNP vs Progressors
wilcox_results <- immune_long %>%
  filter(Category %in% c("LTNP", "Progressors")) %>%
  group_by(Marker) %>%
  summarise(
    p_value = tryCatch(wilcox.test(Value ~ Category, exact = FALSE)$p.value, error = function(e) NA_real_),
    .groups = "drop"
  ) %>%
  arrange(p_value)

# Step 4: Show top 5 significant markers with real names
cat("🔍 Top 5 Significant T-test Markers (LTNP vs HIV Negative):\n")

## 🔍 Top 5 Significant T-test Markers (LTNP vs HIV Negative):

print(ttest_results %>% slice_head(n = 5))

## # A tibble: 5 × 2
##   Marker                                                    p_value
##   <chr>                                                       <dbl>
## 1 p10_CD19.p._CD21.p.CD27.p.                               1.01e-10
## 2 p11m_CD8_memory_PD1.p.2B4.n..CD160.n.Tim3.n._in.PD1total 1.06e- 9
## 3 p1_CD8.p._RO.n._CCR7.p.27.p._in.CD8                      5.01e- 7
## 4 p9a_CD8p_GZB_total                                       9.09e- 7
## 5 p10_CD19.p._IgM.p.                                       1.26e- 6

cat("\n🔍 Top 5 Significant Kruskal-Wallis Markers (All Categories):\n")

## 
## 🔍 Top 5 Significant Kruskal-Wallis Markers (All Categories):

print(kruskal_results %>% slice_head(n = 5))

## # A tibble: 5 × 2
##   Marker                        p_value
##   <chr>                           <dbl>
## 1 ART_years                    1.04e-15
## 2 HIV_log_copies.ml            1.17e-12
## 3 p4_CD4.CD8.ratio             4.66e-12
## 4 p10_CD19.p._CD21.p.CD27.p.   1.04e-11
## 5 p1_CD8.p._totalmemory_in.CD8 1.31e-11

cat("\n🔍 Top 5 Significant Mann-Whitney Markers (LTNP vs Progressors):\n")

## 
## 🔍 Top 5 Significant Mann-Whitney Markers (LTNP vs Progressors):

print(wilcox_results %>% slice_head(n = 5))

## # A tibble: 5 × 2
##   Marker                                                       p_value
##   <chr>                                                          <dbl>
## 1 Absolute_CD4                                             0.000000580
## 2 p2_CD4.p.RO.n._total.CLA                                 0.000884   
## 3 p11m_CD8_memory_PD1.p.2B4.p..CD160.p.Tim3.p._in.PD1total 0.000969   
## 4 p4_CD4.CD8.ratio                                         0.000969   
## 5 percent_CD4                                              0.00100

Kruskal wallis test

# Kruskal-Wallis Test (All Categories)
kruskal_results <- immune_long %>%
  group_by(Marker) %>%
  summarise(
    p_value = tryCatch(kruskal.test(Value ~ Category)$p.value, error = function(e) NA_real_),
    .groups = "drop"
  ) %>%
  arrange(p_value)

# Significant markers before correction
significant_kruskal <- kruskal_results %>% filter(p_value < 0.05)
top5_kruskal <- kruskal_results %>% slice_head(n = 5)

# Kruskal-Wallis with BKY correction
kruskal_results_bky <- kruskal_results %>%
  mutate(p_bky = p.adjust(p_value, method = "BY")) %>%
  arrange(p_bky)

significant_bky <- kruskal_results_bky %>% filter(p_bky < 0.05)
top5_bky <- kruskal_results_bky %>% slice_head(n = 5)

# Output
cat("🔍 Top 5 Significant Markers (Uncorrected Kruskal-Wallis):\n")

## 🔍 Top 5 Significant Markers (Uncorrected Kruskal-Wallis):

print(top5_kruskal)

## # A tibble: 5 × 2
##   Marker                        p_value
##   <chr>                           <dbl>
## 1 ART_years                    1.04e-15
## 2 HIV_log_copies.ml            1.17e-12
## 3 p4_CD4.CD8.ratio             4.66e-12
## 4 p10_CD19.p._CD21.p.CD27.p.   1.04e-11
## 5 p1_CD8.p._totalmemory_in.CD8 1.31e-11

cat("\n All Significant Markers (Uncorrected Kruskal-Wallis, p < 0.05):\n")

## 
##  All Significant Markers (Uncorrected Kruskal-Wallis, p < 0.05):

print(significant_kruskal)

## # A tibble: 104 × 2
##    Marker                               p_value
##    <chr>                                  <dbl>
##  1 ART_years                           1.04e-15
##  2 HIV_log_copies.ml                   1.17e-12
##  3 p4_CD4.CD8.ratio                    4.66e-12
##  4 p10_CD19.p._CD21.p.CD27.p.          1.04e-11
##  5 p1_CD8.p._totalmemory_in.CD8        1.31e-11
##  6 calculated Ki67 total in CD8        1.67e-11
##  7 p1_CD8.p._RO.n._CCR7.p.27.p._in.CD8 1.79e-11
##  8 Absolute_CD4                        4.46e-11
##  9 p10_CD19.p._CD21.n.CD27.n.          8.25e-11
## 10 percent_CD4                         2.31e-10
## # ℹ 94 more rows

cat("\n Number of significant markers before BKY correction:", nrow(significant_kruskal), "\n")

## 
##  Number of significant markers before BKY correction: 104

cat("\n Top 5 Significant Markers (BKY-adjusted Kruskal-Wallis):\n")

## 
##  Top 5 Significant Markers (BKY-adjusted Kruskal-Wallis):

print(top5_bky)

## # A tibble: 5 × 3
##   Marker                        p_value    p_bky
##   <chr>                           <dbl>    <dbl>
## 1 ART_years                    1.04e-15 6.94e-13
## 2 HIV_log_copies.ml            1.17e-12 3.93e-10
## 3 p4_CD4.CD8.ratio             4.66e-12 1.04e- 9
## 4 p10_CD19.p._CD21.p.CD27.p.   1.04e-11 1.71e- 9
## 5 p1_CD8.p._totalmemory_in.CD8 1.31e-11 1.71e- 9

cat("\n All Significant Markers (BKY-adjusted p < 0.05):\n")

## 
##  All Significant Markers (BKY-adjusted p < 0.05):

print(significant_bky)

## # A tibble: 82 × 3
##    Marker                               p_value    p_bky
##    <chr>                                  <dbl>    <dbl>
##  1 ART_years                           1.04e-15 6.94e-13
##  2 HIV_log_copies.ml                   1.17e-12 3.93e-10
##  3 p4_CD4.CD8.ratio                    4.66e-12 1.04e- 9
##  4 p10_CD19.p._CD21.p.CD27.p.          1.04e-11 1.71e- 9
##  5 p1_CD8.p._totalmemory_in.CD8        1.31e-11 1.71e- 9
##  6 calculated Ki67 total in CD8        1.67e-11 1.71e- 9
##  7 p1_CD8.p._RO.n._CCR7.p.27.p._in.CD8 1.79e-11 1.71e- 9
##  8 Absolute_CD4                        4.46e-11 3.74e- 9
##  9 p10_CD19.p._CD21.n.CD27.n.          8.25e-11 6.14e- 9
## 10 percent_CD4                         2.31e-10 1.55e- 8
## # ℹ 72 more rows

cat("\n Number of significant markers after BKY correction:", nrow(significant_bky), "\n")

## 
##  Number of significant markers after BKY correction: 82

Mann-Whitney U Test Chunk (LTNP vs Progressors)

# Mann-Whitney U Test: LTNP vs Progressors
wilcox_results <- immune_long %>%
  filter(Category %in% c("LTNP", "Progressors")) %>%
  group_by(Marker) %>%
  summarise(
    p_value = tryCatch(wilcox.test(Value ~ Category, exact = FALSE)$p.value, error = function(e) NA_real_),
    .groups = "drop"
  ) %>%
  arrange(p_value)

# Top 5 significant markers
top5_wilcox <- wilcox_results %>% slice_head(n = 5)

# All markers with p < 0.05
significant_wilcox <- wilcox_results %>% filter(p_value < 0.05)

# Output
cat("🔍 Top 5 Significant Mann-Whitney Markers (LTNP vs Progressors):\n")

## 🔍 Top 5 Significant Mann-Whitney Markers (LTNP vs Progressors):

print(top5_wilcox)

## # A tibble: 5 × 2
##   Marker                                                       p_value
##   <chr>                                                          <dbl>
## 1 Absolute_CD4                                             0.000000580
## 2 p2_CD4.p.RO.n._total.CLA                                 0.000884   
## 3 p11m_CD8_memory_PD1.p.2B4.p..CD160.p.Tim3.p._in.PD1total 0.000969   
## 4 p4_CD4.CD8.ratio                                         0.000969   
## 5 percent_CD4                                              0.00100

cat("\n✅ All Significant Markers (Mann-Whitney, p < 0.05):\n")

## 
## ✅ All Significant Markers (Mann-Whitney, p < 0.05):

print(significant_wilcox)

## # A tibble: 17 × 2
##    Marker                                                       p_value
##    <chr>                                                          <dbl>
##  1 Absolute_CD4                                             0.000000580
##  2 p2_CD4.p.RO.n._total.CLA                                 0.000884   
##  3 p11m_CD8_memory_PD1.p.2B4.p..CD160.p.Tim3.p._in.PD1total 0.000969   
##  4 p4_CD4.CD8.ratio                                         0.000969   
##  5 percent_CD4                                              0.00100    
##  6 ART_years                                                0.00130    
##  7 p1_CD4.p._RO.n._CCR7.p.27.p._in.CD4                      0.00209    
##  8 p1_CD4.p._RO.p._CCR7.n.27.p._in.CD4                      0.00432    
##  9 p1_CD4.p._totalmemory_in.CD4                             0.00471    
## 10 p11m_CD4_memory_CD160total                               0.0305     
## 11 p10_CD19.p._IgG.p.                                       0.0310     
## 12 AgeGroup                                                 0.0331     
## 13 p4_totalgdT_percent.ofTcells                             0.0354     
## 14 p11_CD4_naive_PD1total                                   0.0435     
## 15 p1_CD8.p._totalmemory_in.CD8                             0.0435     
## 16 ART.Start_age_years                                      0.0438     
## 17 p10_CD19.p._CD21.n.CD27.n.                               0.0442

box

# Set threshold
alpha <- 0.05

# Kruskal-Wallis interpretation
cat("🧪 Kruskal-Wallis Test (All Categories):\n")

## 🧪 Kruskal-Wallis Test (All Categories):

sig_kruskal <- kruskal_results %>% filter(!is.na(p_value), p_value < alpha)
if (nrow(sig_kruskal) > 0) {
  cat("There are", nrow(sig_kruskal),
      "immune markers showing significant differences across the 4 categories (p < 0.05).\n")
  cat("This reflects the impact of disease stage and treatment on immune activation.\n\n")
} else {
  cat("No immune markers showed significant differences across all four groups (p ≥ 0.05).\n\n")
}

## There are 104 immune markers showing significant differences across the 4 categories (p < 0.05).
## This reflects the impact of disease stage and treatment on immune activation.

# Mann-Whitney U interpretation
cat("🧪 Mann-Whitney U Test (LTNP vs Progressors):\n")

## 🧪 Mann-Whitney U Test (LTNP vs Progressors):

sig_wilcox <- wilcox_results %>% filter(!is.na(p_value), p_value < alpha)
if (nrow(sig_wilcox) > 0) {
  cat("LTNPs differ significantly from Progressors in", nrow(sig_wilcox),
      "immune markers (p < 0.05).\n")
  cat("These differences suggest potential protective immune factors in LTNPs.\n\n")
} else {
  cat("No significant difference detected between LTNP and Progressors (p ≥ 0.05).\n\n")
}

## LTNPs differ significantly from Progressors in 17 immune markers (p < 0.05).
## These differences suggest potential protective immune factors in LTNPs.

#RESULT Children with perinatally acquired HIV classified as long-term non-progressors (LTNPs) exhibit distinct immunological profiles compared to HIV-negative controls, progressors, and those on treatment. The significant differences in immune marker expression—especially the 66 markers vs HIV-negative and 17 markers vs progressors—suggest that LTNPs maintain a unique immune activation state that may contribute to their ability to control disease progression without treatment. These findings support the hypothesis that immune biomarkers can differentiate clinical trajectories in pediatric HIV.

Untitled

2025-04-15

Research Question: “How do immunological biomarkers and immune activation profiles differ in children with perinatally acquired HIV who are classified as long-term non-progressors (LTNPs), compared to HIV-positive progressors, HIV-positive individuals on treatment, and HIV-negative controls?”

Step 5: Calculate Missing Count and Percentage per Sample, Using PIA # removing them from data frame

marking missing data in dark grey, remove the heads and add legend for z score

Kruskal wallis test

Mann-Whitney U Test Chunk (LTNP vs Progressors)

box