Importing final merged mortality and longitudinal dataset with accounting for unclean inputs and duplicates

FINAL_merged_df_mod <- read_csv("Data/FINAL_merged_df_mod.csv")
View(FINAL_merged_df_mod)

Outline for Visualization

Plot 1: Spaghetti plot of POA vs Time of All individuals - randomn 10 percent graphed and 25 individuals highlighted

  • Data Preperation
# Prepare the data for plotting:
# Assuming FINAL_merged_df_mod is already loaded and contains the data.
all_ind_poa_trajectory_data <- FINAL_merged_df_mod %>%
  # Select only the necessary columns using the original column names
  select(hhidpn_mod, poa.average.score, wave) %>%
  # Remove rows where POA score is missing, as these cannot be plotted
  filter(!is.na(poa.average.score)) %>%
  # Convert ID to factor for ggplot's grouping
  mutate(
    hhidpn_mod = as.factor(hhidpn_mod),
    # Convert 'wave' into an ordered factor for correct axis order
    wave_ordered = factor(wave, 
                          levels = c("baseline", "4yrs", "8yrs", "12yrs"),
                          ordered = TRUE)
  ) %>%
  # Filter out any rows where the wave is not one of the four key points
  filter(!is.na(wave_ordered))

# Determine the number of unique individuals
n_individuals <- n_distinct(all_ind_poa_trajectory_data$hhidpn_mod)

# --- MODIFICATIONS FOR SAMPLING ---
n_gray <- round(n_individuals * 0.1) # 10% for the gray background lines
n_highlight <- 25 # Fixed number of individuals to highlight

# Get all unique IDs
all_ids <- all_ind_poa_trajectory_data %>% distinct(hhidpn_mod)

# Set seed for reproducibility
set.seed(42) 

# 1. Sample 10% of IDs for the gray background lines
gray_ids <- all_ids %>%
  sample_n(n_gray) %>%
  pull(hhidpn_mod)

# 2. Sample 25 IDs for the highlighted lines
# Note: It's okay for these two samples to overlap. The colored lines will be drawn 
# over the gray lines, ensuring they stand out.
highlight_ids <- all_ids %>%
  sample_n(n_highlight) %>%
  pull(hhidpn_mod)

# Create two datasets:
# Dataset for the gray background lines (10% sample)
gray_lines_data <- all_ind_poa_trajectory_data %>%
  filter(hhidpn_mod %in% gray_ids)

# Dataset for the highlighted lines (25 individuals)
highlight_data <- all_ind_poa_trajectory_data %>%
  filter(hhidpn_mod %in% highlight_ids)

# Report the summary
print(paste("Total unique individuals in original data:", n_individuals))
## [1] "Total unique individuals in original data: 11521"
print(paste("Number of individuals in gray background (10%):", n_gray))
## [1] "Number of individuals in gray background (10%): 1152"
print(paste("Number of individuals highlighted (fixed):", n_highlight))
## [1] "Number of individuals highlighted (fixed): 25"
  • Interactive Plot
# --- 1. Create the base ggplot ---
all_ind_poa_trajectory_plot <- ggplot() +
  
  # A. Plot the 10% sample of individual trajectories (in light grey)
  geom_line(data = gray_lines_data, # Use the 10% sampled data
            aes(
              x = wave_ordered, 
              y = poa.average.score, 
              group = hhidpn_mod
            ), 
            color = "gray80", 
            linewidth = 0.1) +
  
  # B. Plot the random sample of 25 trajectories (in color) - TEXT MAPPED FOR INTERACTIVITY
  geom_line(data = highlight_data,
            aes(
              x = wave_ordered, 
              y = poa.average.score, 
              group = hhidpn_mod, 
              color = hhidpn_mod,
              # Custom tooltip text mapped here
              text = paste("ID:", hhidpn_mod, "<br>Time Point:", wave, "<br>POA Score:", poa.average.score) 
            ), 
            linewidth = 1, 
            alpha = 0.8) +
  
  # C. Add points for the highlighted lines - TEXT MAPPED FOR INTERACTIVITY
  geom_point(data = highlight_data,
             aes(
               x = wave_ordered, 
               y = poa.average.score, 
               group = hhidpn_mod, 
               color = hhidpn_mod,
               # Custom tooltip text mapped here (needed for points)
               text = paste("ID:", hhidpn_mod, "<br>Time Point:", wave, "<br>POA Score:", poa.average.score) 
             ), 
             size = 1.5,
             alpha = 0.8) +
  
  # --- 2. Styling and Labels ---
  labs(
    title = "POA Average Score Trajectories Over 12 Years",
    subtitle = paste0("Background (gray): ", n_gray, " trajectories (10% sample). Highlighted (color): ", n_highlight, " trajectories."),
    x = "Survey Wave (Time Point)",
    y = "POA Average Score",
    color = "Sample ID"
  ) +
  # Remove the legend for the highlight colors since there are too many
  guides(color = "none") +
  # Enhance aesthetics
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 16),
    axis.title = element_text(size = 12),
    panel.grid.minor = element_blank()
  )


# --- 3. Convert to Interactive Plotly ---
# We use tooltip="text" to only show the custom hover information we defined 
ggplotly(all_ind_poa_trajectory_plot, tooltip = "text") %>%
  plotly::layout(
    xaxis = list(fixedrange = TRUE), 
    yaxis = list(fixedrange = TRUE)
  )

Plot 2: Spaghetti plot of POA vs Time grouped by ADL Onset

  • Preperation of data for ADL onset
# - - - 1. Define ADL Onset Group for each individual (hhidpn_mod) - - -
# assuming ADL occurrence is any value other than 'no' or NA.
adl_onset_groups <- FINAL_merged_df_mod %>%
  # waves for onset definition
  filter(wave %in% c("baseline", "4yrs", "8yrs", "12yrs")) %>%
  
  # first start by definig if adl occurs or not. ADL occurrence: 1 if limitation is NOT 'no' and NOT NA
  mutate(adl_occurs = ifelse(tolower(adl.limitations) != "no" & !is.na(adl.limitations), 1, 0)) %>%
  
  # Arrange by ID and time to find the first occurrence - easier to use months.since.baseline because it can be arranged numerically
  arrange(hhidpn_mod, months.since.baseline) %>%
  group_by(hhidpn_mod) %>%
  
  # Summarize to find the wave of the first ADL occurrence
  summarise(
    # Get the wave corresponding to the first '1' in adl_occurs
    first_onset_wave = wave[which(adl_occurs == 1)[1]],
    .groups = 'drop'
  ) %>%
  
  # Assign final group label based on the first onset wave
  mutate(
    adl_onset_group = case_when(
      is.na(first_onset_wave) ~ "ADL never occurs",
      first_onset_wave == "4yrs" ~ "ADL occurs at 4yrs",
      first_onset_wave == "8yrs" ~ "ADL occurs at 8yrs",
      first_onset_wave == "12yrs" ~ "ADL occurs at 12yrs",
      # Any onset at baseline or other unexpected wave is grouped into 'never' 
      # or treated as 'excluded' for the user's specific 4 groups. We'll exclude it
      TRUE ~ "Onset at Baseline/Excluded"
    )
  ) %>%
  # Filter to keep only the 4 groups requested
  filter(adl_onset_group != "Onset at Baseline/Excluded") %>% 
  select(hhidpn_mod, adl_onset_group)
  • Merge adl sorting with main trajectory data
# - - - 2. Prepare the main trajectory data and merge groups - - -
adl_onset_poa_trajectory_data <- FINAL_merged_df_mod %>%
  # selecting necessary columns from final cohort dataset
  select(hhidpn_mod, poa.average.score, wave, disability.status) %>%
  # Remove rows where POA score is missing
  filter(!is.na(poa.average.score)) %>%
  # Merge in the ADL onset groups
  left_join(adl_onset_groups, by = "hhidpn_mod") %>%
  # Filter to keep only individuals who belong to one of the 4 defined ADL onset groups
  filter(!is.na(adl_onset_group)) %>%
  
  # Convert to factors and order time points
  mutate(
    hhidpn_mod = as.factor(hhidpn_mod),
    wave_ordered = factor(wave, 
                          levels = c("baseline", "4yrs", "8yrs", "12yrs"),
                          ordered = TRUE)
  ) %>%
  # Filter out any rows where the wave is not one of the four key points
  filter(!is.na(wave_ordered))

# - - - 3. Prepare aggregated data (Mean POA by ADL Onset Group) - - -
mean_poa_adl_onset_data <- adl_onset_poa_trajectory_data %>%
  group_by(adl_onset_group, wave_ordered) %>%
  summarise(
    mean_poa = mean(poa.average.score, na.rm = TRUE),
    n_obs = n(),
    .groups = 'drop'
  )

# Report the summary
n_individuals_grouped <- n_distinct(adl_onset_poa_trajectory_data$hhidpn_mod)
print(paste("Total individuals plotted and grouped by ADL onset:", n_individuals_grouped))
## [1] "Total individuals plotted and grouped by ADL onset: 11521"
  • Graphing either 20 or 10 percent of the individuals for graph clarity and readability
# - - - 4. Prepare sampled data for spaghetti plot (20% random sample) - - -
n_individuals_grouped <- n_distinct(adl_onset_poa_trajectory_data$hhidpn_mod)
set.seed(42) # Set seed for reproducibility
sample_size <- floor(n_individuals_grouped * 0.20)
sampled_ids <- adl_onset_poa_trajectory_data %>%
  distinct(hhidpn_mod) %>%
  sample_n(sample_size) %>%
  pull(hhidpn_mod)

adl_onset_poa_sampled_data <- adl_onset_poa_trajectory_data %>%
  filter(hhidpn_mod %in% sampled_ids)
  
print(paste("Total individuals plotted in the mean line:", n_individuals_grouped))
## [1] "Total individuals plotted in the mean line: 11521"
print(paste("Number of individuals plotted in the spaghetti lines (20% sample):", sample_size))
## [1] "Number of individuals plotted in the spaghetti lines (20% sample): 2304"
  • now let’s plot!
    • making only the mean lines interactive
    • should have four groups each a distinct color
# - - - 1. Create the base ggplot - - -
color_values_4 <- c(
  "ADL never occurs" = "#1B9E77",
  "ADL occurs at 4yrs" = "#D95F02",
  "ADL occurs at 8yrs" = "#7570B3",
  "ADL occurs at 12yrs" = "#E7298A"
)
group_order_4 <- names(color_values_4)

adl_onset_trajectory_mean_plot_20_sampled <- adl_onset_poa_sampled_data %>%
  # CRITICAL FIX: Manually set factor levels for combined_group to guarantee legend order
  mutate(adl_onset_group = factor(adl_onset_group, levels = group_order_4)) %>%
  ggplot() + 
  
  # A. Plot SAMPLED individual trajectories (colored by onset group, non-interactive)
  geom_line(aes(
    x = wave_ordered, 
    y = poa.average.score, 
    group = hhidpn_mod, 
    color = adl_onset_group
    # IMPORTANT: Removed 'text' aesthetic to disable interactivity on individual lines
  ), 
  linewidth = 0.3, 
  alpha = 0.15) + 
  
  # B. Plot the MEAN line for each group (using the aggregated data)
  geom_line(data = mean_poa_adl_onset_data,
            aes(
              x = wave_ordered, 
              y = mean_poa, 
              group = adl_onset_group,
              color = adl_onset_group,
              # Text mapping kept here for interactivity on the mean line
              text = paste("Group:", adl_onset_group, "<br>Mean POA:", round(mean_poa, 3))
            ),
            linewidth = 1.5, 
            alpha = 1) +
  
  # C. Points for the mean line
  geom_point(data = mean_poa_adl_onset_data,
             aes(
               x = wave_ordered, 
               y = mean_poa, 
               group = adl_onset_group,
               color = adl_onset_group,
               # Text mapping kept here for interactivity on the mean line
               text = paste("Group:", adl_onset_group, "<br>Mean POA:", round(mean_poa, 3))
             ),
             size = 3,
             alpha = 1) +
  
  # - - - 2. Styling and Labels - - -
  labs(
    title = "POA Score Trajectories by Timing of ADL Limitation Onset (with Group Means)",
    subtitle = paste0("Individual trajectories (faint lines, 20% sample) and mean trends (thick lines, 100% cohort) plotted for ", n_individuals_grouped, " individuals."),
    x = "Survey Wave (Time Point)",
    y = "POA Average Score",
    color = "ADL Onset Group"
  ) +
  ylim(2.5, 5) + 
  scale_color_manual(
    values = color_values_4,
    breaks = group_order_4 # Enforce the custom order
  ) +
  # Add guide to make legend lines thicker
  guides(color = guide_legend(override.aes = list(linewidth = 4, alpha = 4))) +
  # Ensure only ONE legend entry is used (for color)
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 12),
    axis.title = element_text(size = 12),
    legend.position = "bottom",
    panel.grid.minor = element_blank()
  )

# - - - 3. Convert to Interactive Plotly - - -
# We use tooltip="text" to only show the custom hover information defined on the mean lines/points
ggplotly(adl_onset_trajectory_mean_plot_20_sampled, tooltip = "text") %>%
  plotly::layout(
    xaxis = list(fixedrange = TRUE), 
    yaxis = list(fixedrange = TRUE)
  )

Plot 3: Spaghetti plot of POA vs Time grouped by ADL Onset and Early Disability Status

  • Data preperation - creating a new df where each mod ID has a disability status
# - - - 1. Extract Fixed Baseline Disability Status (One value per individual) - - -
# Since disability.status is constant, we find the first non-NA status for each ID.
disability_status_fixed_df <- FINAL_merged_df_mod %>%
  group_by(hhidpn_mod) %>%
  summarise(
    disability_status_fixed = disability.status[!is.na(disability.status)][1], # take the first disability status listed
    .groups = 'drop'
  ) %>%
  # Filter out IDs where fixed status couldn't be determined
  filter(!is.na(disability_status_fixed)) %>%
  # simplify disability status for a cleaner label
  mutate(
    disability_label = ifelse(disability_status_fixed == "no.disability", 
                              "No Disability", 
                              "Early Disability Only")
  ) %>%
  select(hhidpn_mod, disability_label)
  • combining fixed disability status with adl onset for 8 groups
# - - - 2. Merge ADL Onset with Fixed Disability Status to Create 8-Level Composite Group - - -
composite_groups <- adl_onset_groups %>%
  left_join(disability_status_fixed_df, by = "hhidpn_mod") %>%
  filter(!is.na(disability_label)) %>%
  # Create the 8-level composite factor
  mutate(
    combined_group = factor(paste(adl_onset_group, disability_label, sep = " | ")) # this will tell me first what time they had adl onset, and second, their disability status
  ) %>%
  select(hhidpn_mod, combined_group)
  • take the composite and use it to merge with main poa trajectory data
# - - - 3. Prepare the main trajectory data and merge groups - - -
adl_onset_with_disability_poa_trajectory_data <- FINAL_merged_df_mod %>%
 # selecting necessary columns from final cohort dataset
 select(hhidpn_mod, poa.average.score, wave) %>%
 # Remove rows where POA score is missing
 filter(!is.na(poa.average.score)) %>%
 # Merge in the composite groups
 left_join(composite_groups, by = "hhidpn_mod") %>%
 # Filter to keep only individuals who belong to one of the 8 defined combined groups
 filter(!is.na(combined_group)) %>%

 # Convert to factors and order time points
 mutate(
  hhidpn_mod = as.factor(hhidpn_mod),
  wave_ordered = factor(wave,
             levels = c("baseline", "4yrs", "8yrs", "12yrs"),
             ordered = TRUE)
 ) %>%
 # Filter out any rows where the wave is not one of the four key points
 filter(!is.na(wave_ordered))
  • Prepare the mean lines for each of the 8 grroups
# --- 4. Prepare aggregated data (Mean POA by 8-level Combined Group) ---
mean_poa_combined_group_data <- adl_onset_with_disability_poa_trajectory_data %>%
 group_by(combined_group, wave_ordered) %>%
 summarise(
  mean_poa = mean(poa.average.score, na.rm = TRUE),
  n_obs = n(),
  .groups = 'drop'
 )
  • Sampling the data for 20% individual lines
# --- 5. Prepare sampled data for spaghetti plot (20% random sample) ---
n_individuals_grouped <- n_distinct(adl_onset_with_disability_poa_trajectory_data$hhidpn_mod)
set.seed(42) # Set seed for reproducibility
sample_size <- floor(n_individuals_grouped * 0.20)
sampled_ids <- adl_onset_with_disability_poa_trajectory_data %>%
  distinct(hhidpn_mod) %>%
  sample_n(sample_size) %>%
  pull(hhidpn_mod)

adl_onset_poa_sampled_data <- adl_onset_with_disability_poa_trajectory_data %>%
  filter(hhidpn_mod %in% sampled_ids)
  
print(paste("Total individuals plotted in the mean line (8 groups):", n_individuals_grouped))
## [1] "Total individuals plotted in the mean line (8 groups): 11521"
print(paste("Number of individuals plotted in the spaghetti lines (20% sample):", sample_size))
## [1] "Number of individuals plotted in the spaghetti lines (20% sample): 2304"
  • Plotting of the graph
# step 0 for legend ordering
# Define the order and colors for the manual scale
color_values <- c(
  "ADL never occurs | No Disability" = "#1B9E77",
  "ADL occurs at 4yrs | No Disability" = "#D95F02",
  "ADL occurs at 8yrs | No Disability" = "#7570B3",
  "ADL occurs at 12yrs | No Disability" = "#E7298A",
  "ADL never occurs | Early Disability Only" = "#66A61E",
  "ADL occurs at 4yrs | Early Disability Only" = "#E6AB02",
  "ADL occurs at 8yrs | Early Disability Only" = "#A6761D",
  "ADL occurs at 12yrs | Early Disability Only" = "#666666"
)
# Extract the names of the groups in the desired order
group_order <- names(color_values)
combined_trajectory_mean_plot <- adl_onset_poa_sampled_data %>%
  # CRITICAL FIX: Manually set factor levels for combined_group to guarantee legend order
  mutate(combined_group = factor(combined_group, levels = group_order)) %>%
  ggplot() + 
  
  # A. Plot SAMPLED individual trajectories (colored by combined group, non-interactive)
  geom_line(aes(
    x = wave_ordered, 
    y = poa.average.score, 
    group = hhidpn_mod, 
    color = combined_group
  ), 
  linewidth = 0.3, 
  alpha = 0.15) + 
  
  # B. Plot the MEAN line for each group (using the aggregated data)
  geom_line(data = mean_poa_combined_group_data,
            aes(
              x = wave_ordered, 
              y = mean_poa, 
              group = combined_group,
              color = combined_group,
              # Text mapping kept here for interactivity on the mean line
              text = paste("Group:", combined_group, "<br>Mean POA:", round(mean_poa, 3))
            ),
            linewidth = 1.2, 
            alpha = 1) +
  
  # C. Points for the mean line
  geom_point(data = mean_poa_combined_group_data,
             aes(
               x = wave_ordered, 
               y = mean_poa, 
               group = combined_group,
               color = combined_group,
               # Text mapping kept here for interactivity on the mean line
               text = paste("Group:", combined_group, "<br>Mean POA:", round(mean_poa, 3))
             ),
             size = 2.5,
             alpha = 1) +
  
  # --- 2. Styling and Labels ---
  labs(
    title = "POA Score Trajectories by ADL Onset Timing and Baseline Disability Status (8 Groups)",
    subtitle = paste0("Individual trajectories (faint lines, 20% sample) and mean trends (thick lines, 100% cohort) plotted for ", n_individuals_grouped, " individuals. Note: 8-level grouping used."),
    x = "Survey Wave (Time Point)",
    y = "POA Average Score",
    color = "Composite Group (ADL Onset | Disability Status)"
  ) +
  # Set the Y-axis limits (zoom)
  ylim(2.5, 5) + 
  # Use a more diverse palette for 8 groups
  scale_color_manual(
    values = color_values,
    breaks = group_order # Enforce the custom order
  ) +
  # Add guide to make legend lines thicker
  guides(color = guide_legend(override.aes = list(linewidth = 4, alpha = 4))) +
  # Increase plot width for 8 legends and a long title
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 11), 
    axis.title = element_text(size = 10),
    # Adjust Legend Position to the top and size slightly
    legend.position = "top",
    legend.text = element_text(size = 8), # Smaller text
    legend.title = element_text(size = 9, face = "bold"), # Smaller title
    panel.grid.minor = element_blank()
  )

# --- 3. Convert to Interactive Plotly ---
ggplotly(combined_trajectory_mean_plot, tooltip = "text") %>%
  plotly::layout(
    xaxis = list(fixedrange = TRUE), 
    yaxis = list(fixedrange = TRUE)
  ) %>%
  plotly::config(
    displaylogo = FALSE, 
    showLink = FALSE
  )

Plot 4: KM plot for those above and below mean POA scores

  • Calculate the median poa.average.score
# The na.rm = TRUE argument tells R to ignore NA (missing) values in the calculation
median_poa_score <- median(FINAL_merged_df_mod$poa.average.score, na.rm = TRUE)
print(paste("The median of all POA scores is:", median_poa_score))
## [1] "The median of all POA scores is: 4"
  • Grouping factor based on the median
# 1. Calculate the mean POA score for each individual
# This creates a new dataframe with one row per individual
individual_means <- FINAL_merged_df_mod %>%
  group_by(hhidpn_mod) %>%
  summarise(
    mean_poa_score = mean(poa.average.score, na.rm = TRUE)
  )

# 2. Calculate the median of these individual mean scores
median_of_individual_means <- median(individual_means$mean_poa_score, na.rm = TRUE)
# The median is 3.9375

# 3. Merge the individual mean back into the main dataframe
FINAL_merged_df_mod <- FINAL_merged_df_mod %>%
  left_join(individual_means, by = "hhidpn_mod")

# 4. Create the new grouping variable and label based on the median (3.9375)
median_val_str <- "3.9375"
low_label <- paste0("Low POA (< ", median_val_str, ")") # <-- Shorter label
high_label <- paste0("High POA (> ", median_val_str, ")") # <-- Shorter label

FINAL_merged_df_mod <- FINAL_merged_df_mod %>%
  mutate(
    poa_group_new = ifelse( # this will be the column we are grouping by in the km curves
      mean_poa_score > median_of_individual_means,
      high_label,
      low_label
    )
  )

# Convert the new grouping variable to a factor
FINAL_merged_df_mod$poa_group_new <- factor(
    FINAL_merged_df_mod$poa_group_new,
    levels = c(low_label, high_label) # Ensures Low group is first
)
  • Fit the KM survival curve
library(survival)
library(survminer)
# 5. Fit the Kaplan-Meier survival curve
# Ensure only valid rows (where we have both time and group) are used
fit_median <- survfit(
    Surv(months.observed, status) ~ poa_group_new,
    data = FINAL_merged_df_mod
)
# summary(fit_median)
  • plot the KM curve
# 6. Generate the static Kaplan-Meier plot
km_plot_median <- ggsurvplot(
    fit_median,
    data = FINAL_merged_df_mod,
    risk.table = FALSE,      # No Risk table 
    pval = TRUE,
    conf.int = TRUE,
    title = "KM Survival Curves for Individuals Above and Below Median POA Score",
    legend.title = "POA Score Group", # Labels are now embedded in the factor levels
    legend.labs = c(low_label, high_label), # <-- Explicitly set legend labels
    xlab = "Time in Months",
    break.time.by = 24
)

# Print the static plot
print(km_plot_median)

Plot 5: KM plot for individuals grouped by ADL onset

# Merge ADL onset groups with survival data
adl_onset_survival_data <- FINAL_merged_df_mod %>%
  select(hhidpn_mod, months.observed, status) %>%
  # Join with the pre-calculated ADL onset groups (4 levels: never, 4yrs, 8yrs, 12yrs)
  left_join(adl_onset_groups, by = "hhidpn_mod") %>%
  # Filter to keep only individuals in the 4 defined ADL onset groups
  filter(!is.na(adl_onset_group)) %>%
  # Convert to factor with the desired plotting order (earlier onset = worse outcome, usually placed first)
  mutate(
    adl_onset_group = factor(
      adl_onset_group,
      levels = c("ADL occurs at 4yrs", "ADL occurs at 8yrs", "ADL occurs at 12yrs", "ADL never occurs")
    )
  )

# Report summary
n_individuals_adl_km <- n_distinct(adl_onset_survival_data$hhidpn_mod)
print(paste("Total individuals plotted in ADL Onset KM curve:", n_individuals_adl_km))
## [1] "Total individuals plotted in ADL Onset KM curve: 11521"
# Filter the data for the problematic group
adl_12yrs_data <- adl_onset_survival_data %>%
filter(adl_onset_group == "ADL occurs at 12yrs")

# Check the count and status distribution
print("Summary of ADL Occurs at 12yrs Group:")
## [1] "Summary of ADL Occurs at 12yrs Group:"
print(table(adl_12yrs_data$status))
## 
##    0 
## 1320
  • Fit the survival curves and plot
# 1. Fit the Kaplan-Meier survival curve
fit_adl_onset <- survfit(
    Surv(months.observed, status) ~ adl_onset_group,
    data = adl_onset_survival_data
)
new_labels <- c(
"ADL Occurs at 4 yrs", # Group 1
"ADL Occurs at 8 yrs", # Group 2
"ADL Occurs at 12 yrs", # Group 3
"ADL Never Occurs"   # Group 4
)

color_palette <- c("#D95F02", "#7570B3", "#E7298A", "#1B9E77")

# 2. Generate the static Kaplan-Meier plot
km_plot_adl_onset <- ggsurvplot(
    fit_adl_onset,
    data = adl_onset_survival_data,
    pval = TRUE,
    conf.int = FALSE, # Set to FALSE for cleaner plot with 4 lines
    title = "Survival Curves by Timing of ADL Limitation Onset",
    legend.title = "ADL Onset Group",
    legend = "bottom",
    legend.labs = new_labels,
    xlab = "Time in Months",
    break.time.by = 24,
    # Use a custom color palette for the 4 groups
    palette = color_palette, # Corrected palette vector
    ggtheme = theme_minimal()
)

km_plot_adl_onset$plot <- km_plot_adl_onset$plot +
  # Use guides to control the appearance of the legend keys and text
  guides(
    col = guide_legend(
      nrow = 1, # Arrange items into 2 rows to force wrapping
      byrow = TRUE,
      # Remove the explicit variable name ("combined_group =") from the keys.
      # This is often hidden by using 'legend.title = ""', but sometimes requires a manual override.
      title.position = "top"
    )
  ) +
# Further theme adjustments to control spacing if needed
  theme(
    # Legend appearance: This hides the variable name if it appears above the labels
    legend.title = element_text(face = "bold", size = 10),
    # Ensure legend text wraps (done via nrow/byrow in guides, but theme is good for spacing)
    legend.text = element_text(size = 8)
  )

# Print the static plot
print(km_plot_adl_onset)

Plot 6: KM plot grouped by ADL onset and Early Disability

# Merge composite groups with survival data
composite_survival_data <- FINAL_merged_df_mod %>%
  select(hhidpn_mod, months.observed, status) %>%
  # Join with the pre-calculated 8-level composite groups
  left_join(composite_groups, by = "hhidpn_mod") %>%
  # Filter to keep only individuals in the 8 defined composite groups
  filter(!is.na(combined_group)) %>%
  # Convert to factor with the desired plotting order (using the order established in Plot 3)
  mutate(
    combined_group = factor(
      combined_group,
      levels = group_order # Use the group_order defined in Plot 3 for consistency
    )
  )

# Report summary
n_individuals_composite_km <- n_distinct(composite_survival_data$hhidpn_mod)
print(paste("Total individuals plotted in 8-Group Composite KM curve:", n_individuals_composite_km))
## [1] "Total individuals plotted in 8-Group Composite KM curve: 11521"
library(survival)
library(survminer)

# # --- Step 0: Define Customization Vectors ---
# 
# # 1. Color Palette (Named vector for mapping)
# color_values <- c(
#  "ADL never occurs | No Disability" = "#1B9E77",
#  "ADL occurs at 4yrs | No Disability" = "#D95F02",
#  "ADL occurs at 8yrs | No Disability" = "#7570B3",
#  "ADL occurs at 12yrs | No Disability" = "#E7298A",
#  "ADL never occurs | Early Disability Only" = "#66A61E",
#  "ADL occurs at 4yrs | Early Disability Only" = "#E6AB02",
#  "ADL occurs at 8yrs | Early Disability Only" = "#A6761D",
#  "ADL occurs at 12yrs | Early Disability Only" = "#666666"
# )

# 2. Legend Labels (New, custom text in the correct order)
# Note: These labels must match the order of names in color_values.
new_labels <- c(
 "Never ADL | No Disability",   
 "4yr ADL | No Disability",    
 "8yr ADL | No Disability",    
 "12yr ADL | No Disability",    
 "Never ADL | Early Disability",  
 "4yr ADL | Early Disability",   
 "8yr ADL | Early Disability",   
 "12yr ADL | Early Disability"   
)

# 1. Fit the Kaplan-Meier survival curve
# Assuming 'fit_composite' and 'composite_survival_data' are available.
fit_composite <- survfit(
  Surv(months.observed, status) ~ combined_group,
  data = composite_survival_data
)

# 2. Generate the static Kaplan-Meier plot using all built-in arguments
km_plot_composite <- ggsurvplot(
  fit_composite,
  data = composite_survival_data,
  pval = TRUE,
  conf.int = FALSE,
    risk.table = FALSE, # Hide table for cleaner look
  title = "Survival Curves by ADL Onset Timing and Baseline Disability Status (8 Groups)",
 
  # --- Color and Label Assignment (CRITICAL) ---
  # Use the named color vector to map colors to data levels
  palette = c("#1B9E77", "#D95F02", "#7570B3", "#E7298A", "#66A61E", "#E6AB02", "#A6761D", "#666666"), # Matches Plot 2 colors
    # Use legend.labs to provide the new, succinct text labels
  legend.labs = new_labels, 
 
  # --- Legend Position and Title ---
  legend.title = "Composite Group:",
  legend = "bottom",
 
  # --- Styling ---
  xlab = "Time in Months",
  break.time.by = 24,
  ggtheme = theme_minimal(),
    
    # --- Custom Legend Theme (Handles size and wrapping) ---
    # Apply a custom theme using surv_theme to control appearance
    surv_theme = theme(
        legend.title = element_text(face = "bold", size = 10),
        legend.text = element_text(size = 8),
        # Use guides to force wrapping (2 rows) and ensure the legend key is shown
        legend.key.width = unit(0.5, "cm"),
        legend.key.height = unit(0.5, "cm")
    )
)

# To force wrapping the legend, we modify the guides of the primary ggplot object.
km_plot_composite$plot <- km_plot_composite$plot +
  guides(
    color = guide_legend(
        nrow = 2, # Wrap to 2 rows
        byrow = TRUE,
        title.position = "top",
        # We explicitly set this to NULL to ensure the key is NOT hidden by any previous settings
        # override.aes = list(linetype = 1, linewidth = 1, alpha = 1) 
    )
  )

# Print the final static plot
print(km_plot_composite)