Outline for Visualization

Spaghetti Plots for visualizing POA scores (y) vs time wave (x) layered by adl status and early disability
Will start by graphin overall POA scores for all individuals ungrouped
Then will start grouping datasets by adl status and early disability
1. First layer of plot will be adl status : never occurs, occurs at 4 yrs, occurs at 8 yrs, occurs at 12 yrs.
2. Second layer of plot will be early disability status
KM curves - compare mean of the individuals 1-4 measures with the median POA scores
1. Calculate the median POA score for inclusive of ALL measurements
2. Mean the individual POA scores then group by if they are above or below median
3. Graph each group as two different survival curves
Make KM plots for the different ADL subsets
1. First group by ADL onset
2. Next cross with Early Disability Status

Plot 1: Spaghetti plot of POA vs Time of All individuals - randomn 10 percent graphed and 25 individuals highlighted

Data Preperation

# Prepare the data for plotting:
# Assuming FINAL_merged_df_mod is already loaded and contains the data.
all_ind_poa_trajectory_data <- FINAL_merged_df_mod %>%
  # Select only the necessary columns using the original column names
  select(hhidpn_mod, poa.average.score, wave) %>%
  # Remove rows where POA score is missing, as these cannot be plotted
  filter(!is.na(poa.average.score)) %>%
  # Convert ID to factor for ggplot's grouping
  mutate(
    hhidpn_mod = as.factor(hhidpn_mod),
    # Convert 'wave' into an ordered factor for correct axis order
    wave_ordered = factor(wave, 
                          levels = c("baseline", "4yrs", "8yrs", "12yrs"),
                          ordered = TRUE)
  ) %>%
  # Filter out any rows where the wave is not one of the four key points
  filter(!is.na(wave_ordered))

# Determine the number of unique individuals
n_individuals <- n_distinct(all_ind_poa_trajectory_data$hhidpn_mod)

# --- MODIFICATIONS FOR SAMPLING ---
n_gray <- round(n_individuals * 0.1) # 10% for the gray background lines
n_highlight <- 25 # Fixed number of individuals to highlight

# Get all unique IDs
all_ids <- all_ind_poa_trajectory_data %>% distinct(hhidpn_mod)

# Set seed for reproducibility
set.seed(42) 

# 1. Sample 10% of IDs for the gray background lines
gray_ids <- all_ids %>%
  sample_n(n_gray) %>%
  pull(hhidpn_mod)

# 2. Sample 25 IDs for the highlighted lines
# Note: It's okay for these two samples to overlap. The colored lines will be drawn 
# over the gray lines, ensuring they stand out.
highlight_ids <- all_ids %>%
  sample_n(n_highlight) %>%
  pull(hhidpn_mod)

# Create two datasets:
# Dataset for the gray background lines (10% sample)
gray_lines_data <- all_ind_poa_trajectory_data %>%
  filter(hhidpn_mod %in% gray_ids)

# Dataset for the highlighted lines (25 individuals)
highlight_data <- all_ind_poa_trajectory_data %>%
  filter(hhidpn_mod %in% highlight_ids)

# Report the summary
print(paste("Total unique individuals in original data:", n_individuals))

## [1] "Total unique individuals in original data: 11521"

print(paste("Number of individuals in gray background (10%):", n_gray))

## [1] "Number of individuals in gray background (10%): 1152"

print(paste("Number of individuals highlighted (fixed):", n_highlight))

## [1] "Number of individuals highlighted (fixed): 25"

Interactive Plot

# --- 1. Create the base ggplot ---
all_ind_poa_trajectory_plot <- ggplot() +
  
  # A. Plot the 10% sample of individual trajectories (in light grey)
  geom_line(data = gray_lines_data, # Use the 10% sampled data
            aes(
              x = wave_ordered, 
              y = poa.average.score, 
              group = hhidpn_mod
            ), 
            color = "gray80", 
            linewidth = 0.1) +
  
  # B. Plot the random sample of 25 trajectories (in color) - TEXT MAPPED FOR INTERACTIVITY
  geom_line(data = highlight_data,
            aes(
              x = wave_ordered, 
              y = poa.average.score, 
              group = hhidpn_mod, 
              color = hhidpn_mod,
              # Custom tooltip text mapped here
              text = paste("ID:", hhidpn_mod, "<br>Time Point:", wave, "<br>POA Score:", poa.average.score) 
            ), 
            linewidth = 1, 
            alpha = 0.8) +
  
  # C. Add points for the highlighted lines - TEXT MAPPED FOR INTERACTIVITY
  geom_point(data = highlight_data,
             aes(
               x = wave_ordered, 
               y = poa.average.score, 
               group = hhidpn_mod, 
               color = hhidpn_mod,
               # Custom tooltip text mapped here (needed for points)
               text = paste("ID:", hhidpn_mod, "<br>Time Point:", wave, "<br>POA Score:", poa.average.score) 
             ), 
             size = 1.5,
             alpha = 0.8) +
  
  # --- 2. Styling and Labels ---
  labs(
    title = "POA Average Score Trajectories Over 12 Years",
    subtitle = paste0("Background (gray): ", n_gray, " trajectories (10% sample). Highlighted (color): ", n_highlight, " trajectories."),
    x = "Survey Wave (Time Point)",
    y = "POA Average Score",
    color = "Sample ID"
  ) +
  # Remove the legend for the highlight colors since there are too many
  guides(color = "none") +
  # Enhance aesthetics
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 16),
    axis.title = element_text(size = 12),
    panel.grid.minor = element_blank()
  )


# --- 3. Convert to Interactive Plotly ---
# We use tooltip="text" to only show the custom hover information we defined 
ggplotly(all_ind_poa_trajectory_plot, tooltip = "text") %>%
  plotly::layout(
    xaxis = list(fixedrange = TRUE), 
    yaxis = list(fixedrange = TRUE)
  )

Plot 2: Spaghetti plot of POA vs Time grouped by ADL Onset

Preperation of data for ADL onset

# - - - 1. Define ADL Onset Group for each individual (hhidpn_mod) - - -
# assuming ADL occurrence is any value other than 'no' or NA.
adl_onset_groups <- FINAL_merged_df_mod %>%
  # waves for onset definition
  filter(wave %in% c("baseline", "4yrs", "8yrs", "12yrs")) %>%
  
  # first start by definig if adl occurs or not. ADL occurrence: 1 if limitation is NOT 'no' and NOT NA
  mutate(adl_occurs = ifelse(tolower(adl.limitations) != "no" & !is.na(adl.limitations), 1, 0)) %>%
  
  # Arrange by ID and time to find the first occurrence - easier to use months.since.baseline because it can be arranged numerically
  arrange(hhidpn_mod, months.since.baseline) %>%
  group_by(hhidpn_mod) %>%
  
  # Summarize to find the wave of the first ADL occurrence
  summarise(
    # Get the wave corresponding to the first '1' in adl_occurs
    first_onset_wave = wave[which(adl_occurs == 1)[1]],
    .groups = 'drop'
  ) %>%
  
  # Assign final group label based on the first onset wave
  mutate(
    adl_onset_group = case_when(
      is.na(first_onset_wave) ~ "ADL never occurs",
      first_onset_wave == "4yrs" ~ "ADL occurs at 4yrs",
      first_onset_wave == "8yrs" ~ "ADL occurs at 8yrs",
      first_onset_wave == "12yrs" ~ "ADL occurs at 12yrs",
      # Any onset at baseline or other unexpected wave is grouped into 'never' 
      # or treated as 'excluded' for the user's specific 4 groups. We'll exclude it
      TRUE ~ "Onset at Baseline/Excluded"
    )
  ) %>%
  # Filter to keep only the 4 groups requested
  filter(adl_onset_group != "Onset at Baseline/Excluded") %>% 
  select(hhidpn_mod, adl_onset_group)

Merge adl sorting with main trajectory data

# - - - 2. Prepare the main trajectory data and merge groups - - -
adl_onset_poa_trajectory_data <- FINAL_merged_df_mod %>%
  # selecting necessary columns from final cohort dataset
  select(hhidpn_mod, poa.average.score, wave, disability.status) %>%
  # Remove rows where POA score is missing
  filter(!is.na(poa.average.score)) %>%
  # Merge in the ADL onset groups
  left_join(adl_onset_groups, by = "hhidpn_mod") %>%
  # Filter to keep only individuals who belong to one of the 4 defined ADL onset groups
  filter(!is.na(adl_onset_group)) %>%
  
  # Convert to factors and order time points
  mutate(
    hhidpn_mod = as.factor(hhidpn_mod),
    wave_ordered = factor(wave, 
                          levels = c("baseline", "4yrs", "8yrs", "12yrs"),
                          ordered = TRUE)
  ) %>%
  # Filter out any rows where the wave is not one of the four key points
  filter(!is.na(wave_ordered))

# - - - 3. Prepare aggregated data (Mean POA by ADL Onset Group) - - -
mean_poa_adl_onset_data <- adl_onset_poa_trajectory_data %>%
  group_by(adl_onset_group, wave_ordered) %>%
  summarise(
    mean_poa = mean(poa.average.score, na.rm = TRUE),
    n_obs = n(),
    .groups = 'drop'
  )

# Report the summary
n_individuals_grouped <- n_distinct(adl_onset_poa_trajectory_data$hhidpn_mod)
print(paste("Total individuals plotted and grouped by ADL onset:", n_individuals_grouped))

## [1] "Total individuals plotted and grouped by ADL onset: 11521"

Graphing either 20 or 10 percent of the individuals for graph clarity and readability

# - - - 4. Prepare sampled data for spaghetti plot (20% random sample) - - -
n_individuals_grouped <- n_distinct(adl_onset_poa_trajectory_data$hhidpn_mod)
set.seed(42) # Set seed for reproducibility
sample_size <- floor(n_individuals_grouped * 0.20)
sampled_ids <- adl_onset_poa_trajectory_data %>%
  distinct(hhidpn_mod) %>%
  sample_n(sample_size) %>%
  pull(hhidpn_mod)

adl_onset_poa_sampled_data <- adl_onset_poa_trajectory_data %>%
  filter(hhidpn_mod %in% sampled_ids)
  
print(paste("Total individuals plotted in the mean line:", n_individuals_grouped))

## [1] "Total individuals plotted in the mean line: 11521"

print(paste("Number of individuals plotted in the spaghetti lines (20% sample):", sample_size))

## [1] "Number of individuals plotted in the spaghetti lines (20% sample): 2304"

now let’s plot!
- making only the mean lines interactive
- should have four groups each a distinct color

# - - - 1. Create the base ggplot - - -
color_values_4 <- c(
  "ADL never occurs" = "#1B9E77",
  "ADL occurs at 4yrs" = "#D95F02",
  "ADL occurs at 8yrs" = "#7570B3",
  "ADL occurs at 12yrs" = "#E7298A"
)
group_order_4 <- names(color_values_4)

adl_onset_trajectory_mean_plot_20_sampled <- adl_onset_poa_sampled_data %>%
  # CRITICAL FIX: Manually set factor levels for combined_group to guarantee legend order
  mutate(adl_onset_group = factor(adl_onset_group, levels = group_order_4)) %>%
  ggplot() + 
  
  # A. Plot SAMPLED individual trajectories (colored by onset group, non-interactive)
  geom_line(aes(
    x = wave_ordered, 
    y = poa.average.score, 
    group = hhidpn_mod, 
    color = adl_onset_group
    # IMPORTANT: Removed 'text' aesthetic to disable interactivity on individual lines
  ), 
  linewidth = 0.3, 
  alpha = 0.15) + 
  
  # B. Plot the MEAN line for each group (using the aggregated data)
  geom_line(data = mean_poa_adl_onset_data,
            aes(
              x = wave_ordered, 
              y = mean_poa, 
              group = adl_onset_group,
              color = adl_onset_group,
              # Text mapping kept here for interactivity on the mean line
              text = paste("Group:", adl_onset_group, "<br>Mean POA:", round(mean_poa, 3))
            ),
            linewidth = 1.5, 
            alpha = 1) +
  
  # C. Points for the mean line
  geom_point(data = mean_poa_adl_onset_data,
             aes(
               x = wave_ordered, 
               y = mean_poa, 
               group = adl_onset_group,
               color = adl_onset_group,
               # Text mapping kept here for interactivity on the mean line
               text = paste("Group:", adl_onset_group, "<br>Mean POA:", round(mean_poa, 3))
             ),
             size = 3,
             alpha = 1) +
  
  # - - - 2. Styling and Labels - - -
  labs(
    title = "POA Score Trajectories by Timing of ADL Limitation Onset (with Group Means)",
    subtitle = paste0("Individual trajectories (faint lines, 20% sample) and mean trends (thick lines, 100% cohort) plotted for ", n_individuals_grouped, " individuals."),
    x = "Survey Wave (Time Point)",
    y = "POA Average Score",
    color = "ADL Onset Group"
  ) +
  ylim(2.5, 5) + 
  scale_color_manual(
    values = color_values_4,
    breaks = group_order_4 # Enforce the custom order
  ) +
  # Add guide to make legend lines thicker
  guides(color = guide_legend(override.aes = list(linewidth = 4, alpha = 4))) +
  # Ensure only ONE legend entry is used (for color)
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 12),
    axis.title = element_text(size = 12),
    legend.position = "bottom",
    panel.grid.minor = element_blank()
  )

# - - - 3. Convert to Interactive Plotly - - -
# We use tooltip="text" to only show the custom hover information defined on the mean lines/points
ggplotly(adl_onset_trajectory_mean_plot_20_sampled, tooltip = "text") %>%
  plotly::layout(
    xaxis = list(fixedrange = TRUE), 
    yaxis = list(fixedrange = TRUE)
  )

Plot 3: Spaghetti plot of POA vs Time grouped by ADL Onset and Early Disability Status

Data preperation - creating a new df where each mod ID has a disability status

# - - - 1. Extract Fixed Baseline Disability Status (One value per individual) - - -
# Since disability.status is constant, we find the first non-NA status for each ID.
disability_status_fixed_df <- FINAL_merged_df_mod %>%
  group_by(hhidpn_mod) %>%
  summarise(
    disability_status_fixed = disability.status[!is.na(disability.status)][1], # take the first disability status listed
    .groups = 'drop'
  ) %>%
  # Filter out IDs where fixed status couldn't be determined
  filter(!is.na(disability_status_fixed)) %>%
  # simplify disability status for a cleaner label
  mutate(
    disability_label = ifelse(disability_status_fixed == "no.disability", 
                              "No Disability", 
                              "Early Disability Only")
  ) %>%
  select(hhidpn_mod, disability_label)

combining fixed disability status with adl onset for 8 groups

# - - - 2. Merge ADL Onset with Fixed Disability Status to Create 8-Level Composite Group - - -
composite_groups <- adl_onset_groups %>%
  left_join(disability_status_fixed_df, by = "hhidpn_mod") %>%
  filter(!is.na(disability_label)) %>%
  # Create the 8-level composite factor
  mutate(
    combined_group = factor(paste(adl_onset_group, disability_label, sep = " | ")) # this will tell me first what time they had adl onset, and second, their disability status
  ) %>%
  select(hhidpn_mod, combined_group)

take the composite and use it to merge with main poa trajectory data

# - - - 3. Prepare the main trajectory data and merge groups - - -
adl_onset_with_disability_poa_trajectory_data <- FINAL_merged_df_mod %>%
 # selecting necessary columns from final cohort dataset
 select(hhidpn_mod, poa.average.score, wave) %>%
 # Remove rows where POA score is missing
 filter(!is.na(poa.average.score)) %>%
 # Merge in the composite groups
 left_join(composite_groups, by = "hhidpn_mod") %>%
 # Filter to keep only individuals who belong to one of the 8 defined combined groups
 filter(!is.na(combined_group)) %>%

 # Convert to factors and order time points
 mutate(
  hhidpn_mod = as.factor(hhidpn_mod),
  wave_ordered = factor(wave,
             levels = c("baseline", "4yrs", "8yrs", "12yrs"),
             ordered = TRUE)
 ) %>%
 # Filter out any rows where the wave is not one of the four key points
 filter(!is.na(wave_ordered))

Prepare the mean lines for each of the 8 grroups

# --- 4. Prepare aggregated data (Mean POA by 8-level Combined Group) ---
mean_poa_combined_group_data <- adl_onset_with_disability_poa_trajectory_data %>%
 group_by(combined_group, wave_ordered) %>%
 summarise(
  mean_poa = mean(poa.average.score, na.rm = TRUE),
  n_obs = n(),
  .groups = 'drop'
 )

Sampling the data for 20% individual lines

# --- 5. Prepare sampled data for spaghetti plot (20% random sample) ---
n_individuals_grouped <- n_distinct(adl_onset_with_disability_poa_trajectory_data$hhidpn_mod)
set.seed(42) # Set seed for reproducibility
sample_size <- floor(n_individuals_grouped * 0.20)
sampled_ids <- adl_onset_with_disability_poa_trajectory_data %>%
  distinct(hhidpn_mod) %>%
  sample_n(sample_size) %>%
  pull(hhidpn_mod)

adl_onset_poa_sampled_data <- adl_onset_with_disability_poa_trajectory_data %>%
  filter(hhidpn_mod %in% sampled_ids)
  
print(paste("Total individuals plotted in the mean line (8 groups):", n_individuals_grouped))

## [1] "Total individuals plotted in the mean line (8 groups): 11521"

print(paste("Number of individuals plotted in the spaghetti lines (20% sample):", sample_size))

## [1] "Number of individuals plotted in the spaghetti lines (20% sample): 2304"

Plotting of the graph

# step 0 for legend ordering
# Define the order and colors for the manual scale
color_values <- c(
  "ADL never occurs | No Disability" = "#1B9E77",
  "ADL occurs at 4yrs | No Disability" = "#D95F02",
  "ADL occurs at 8yrs | No Disability" = "#7570B3",
  "ADL occurs at 12yrs | No Disability" = "#E7298A",
  "ADL never occurs | Early Disability Only" = "#66A61E",
  "ADL occurs at 4yrs | Early Disability Only" = "#E6AB02",
  "ADL occurs at 8yrs | Early Disability Only" = "#A6761D",
  "ADL occurs at 12yrs | Early Disability Only" = "#666666"
)
# Extract the names of the groups in the desired order
group_order <- names(color_values)
combined_trajectory_mean_plot <- adl_onset_poa_sampled_data %>%
  # CRITICAL FIX: Manually set factor levels for combined_group to guarantee legend order
  mutate(combined_group = factor(combined_group, levels = group_order)) %>%
  ggplot() + 
  
  # A. Plot SAMPLED individual trajectories (colored by combined group, non-interactive)
  geom_line(aes(
    x = wave_ordered, 
    y = poa.average.score, 
    group = hhidpn_mod, 
    color = combined_group
  ), 
  linewidth = 0.3, 
  alpha = 0.15) + 
  
  # B. Plot the MEAN line for each group (using the aggregated data)
  geom_line(data = mean_poa_combined_group_data,
            aes(
              x = wave_ordered, 
              y = mean_poa, 
              group = combined_group,
              color = combined_group,
              # Text mapping kept here for interactivity on the mean line
              text = paste("Group:", combined_group, "<br>Mean POA:", round(mean_poa, 3))
            ),
            linewidth = 1.2, 
            alpha = 1) +
  
  # C. Points for the mean line
  geom_point(data = mean_poa_combined_group_data,
             aes(
               x = wave_ordered, 
               y = mean_poa, 
               group = combined_group,
               color = combined_group,
               # Text mapping kept here for interactivity on the mean line
               text = paste("Group:", combined_group, "<br>Mean POA:", round(mean_poa, 3))
             ),
             size = 2.5,
             alpha = 1) +
  
  # --- 2. Styling and Labels ---
  labs(
    title = "POA Score Trajectories by ADL Onset Timing and Baseline Disability Status (8 Groups)",
    subtitle = paste0("Individual trajectories (faint lines, 20% sample) and mean trends (thick lines, 100% cohort) plotted for ", n_individuals_grouped, " individuals. Note: 8-level grouping used."),
    x = "Survey Wave (Time Point)",
    y = "POA Average Score",
    color = "Composite Group (ADL Onset | Disability Status)"
  ) +
  # Set the Y-axis limits (zoom)
  ylim(2.5, 5) + 
  # Use a more diverse palette for 8 groups
  scale_color_manual(
    values = color_values,
    breaks = group_order # Enforce the custom order
  ) +
  # Add guide to make legend lines thicker
  guides(color = guide_legend(override.aes = list(linewidth = 4, alpha = 4))) +
  # Increase plot width for 8 legends and a long title
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 11), 
    axis.title = element_text(size = 10),
    # Adjust Legend Position to the top and size slightly
    legend.position = "top",
    legend.text = element_text(size = 8), # Smaller text
    legend.title = element_text(size = 9, face = "bold"), # Smaller title
    panel.grid.minor = element_blank()
  )

# --- 3. Convert to Interactive Plotly ---
ggplotly(combined_trajectory_mean_plot, tooltip = "text") %>%
  plotly::layout(
    xaxis = list(fixedrange = TRUE), 
    yaxis = list(fixedrange = TRUE)
  ) %>%
  plotly::config(
    displaylogo = FALSE, 
    showLink = FALSE
  )

Plot 4: KM plot for those above and below mean POA scores

Calculate the median poa.average.score

# The na.rm = TRUE argument tells R to ignore NA (missing) values in the calculation
median_poa_score <- median(FINAL_merged_df_mod$poa.average.score, na.rm = TRUE)
print(paste("The median of all POA scores is:", median_poa_score))

## [1] "The median of all POA scores is: 4"

Grouping factor based on the median

# 1. Calculate the mean POA score for each individual
# This creates a new dataframe with one row per individual
individual_means <- FINAL_merged_df_mod %>%
  group_by(hhidpn_mod) %>%
  summarise(
    mean_poa_score = mean(poa.average.score, na.rm = TRUE)
  )

# 2. Calculate the median of these individual mean scores
median_of_individual_means <- median(individual_means$mean_poa_score, na.rm = TRUE)
# The median is 3.9375

# 3. Merge the individual mean back into the main dataframe
FINAL_merged_df_mod <- FINAL_merged_df_mod %>%
  left_join(individual_means, by = "hhidpn_mod")

# 4. Create the new grouping variable and label based on the median (3.9375)
median_val_str <- "3.9375"
low_label <- paste0("Low POA (< ", median_val_str, ")") # <-- Shorter label
high_label <- paste0("High POA (> ", median_val_str, ")") # <-- Shorter label

FINAL_merged_df_mod <- FINAL_merged_df_mod %>%
  mutate(
    poa_group_new = ifelse( # this will be the column we are grouping by in the km curves
      mean_poa_score > median_of_individual_means,
      high_label,
      low_label
    )
  )

# Convert the new grouping variable to a factor
FINAL_merged_df_mod$poa_group_new <- factor(
    FINAL_merged_df_mod$poa_group_new,
    levels = c(low_label, high_label) # Ensures Low group is first
)

Fit the KM survival curve

library(survival)
library(survminer)
# 5. Fit the Kaplan-Meier survival curve
# Ensure only valid rows (where we have both time and group) are used
fit_median <- survfit(
    Surv(months.observed, status) ~ poa_group_new,
    data = FINAL_merged_df_mod
)
# summary(fit_median)

plot the KM curve

# 6. Generate the static Kaplan-Meier plot
km_plot_median <- ggsurvplot(
    fit_median,
    data = FINAL_merged_df_mod,
    risk.table = FALSE,      # No Risk table 
    pval = TRUE,
    conf.int = TRUE,
    title = "KM Survival Curves for Individuals Above and Below Median POA Score",
    legend.title = "POA Score Group", # Labels are now embedded in the factor levels
    legend.labs = c(low_label, high_label), # <-- Explicitly set legend labels
    xlab = "Time in Months",
    break.time.by = 24
)

# Print the static plot
print(km_plot_median)

Plot 5: KM plot for individuals grouped by ADL onset

# Merge ADL onset groups with survival data
adl_onset_survival_data <- FINAL_merged_df_mod %>%
  select(hhidpn_mod, months.observed, status) %>%
  # Join with the pre-calculated ADL onset groups (4 levels: never, 4yrs, 8yrs, 12yrs)
  left_join(adl_onset_groups, by = "hhidpn_mod") %>%
  # Filter to keep only individuals in the 4 defined ADL onset groups
  filter(!is.na(adl_onset_group)) %>%
  # Convert to factor with the desired plotting order (earlier onset = worse outcome, usually placed first)
  mutate(
    adl_onset_group = factor(
      adl_onset_group,
      levels = c("ADL occurs at 4yrs", "ADL occurs at 8yrs", "ADL occurs at 12yrs", "ADL never occurs")
    )
  )

# Report summary
n_individuals_adl_km <- n_distinct(adl_onset_survival_data$hhidpn_mod)
print(paste("Total individuals plotted in ADL Onset KM curve:", n_individuals_adl_km))

## [1] "Total individuals plotted in ADL Onset KM curve: 11521"

# Filter the data for the problematic group
adl_12yrs_data <- adl_onset_survival_data %>%
filter(adl_onset_group == "ADL occurs at 12yrs")

# Check the count and status distribution
print("Summary of ADL Occurs at 12yrs Group:")

## [1] "Summary of ADL Occurs at 12yrs Group:"

print(table(adl_12yrs_data$status))

## 
##    0 
## 1320

Fit the survival curves and plot

# 1. Fit the Kaplan-Meier survival curve
fit_adl_onset <- survfit(
    Surv(months.observed, status) ~ adl_onset_group,
    data = adl_onset_survival_data
)
new_labels <- c(
"ADL Occurs at 4 yrs", # Group 1
"ADL Occurs at 8 yrs", # Group 2
"ADL Occurs at 12 yrs", # Group 3
"ADL Never Occurs"   # Group 4
)

color_palette <- c("#D95F02", "#7570B3", "#E7298A", "#1B9E77")

# 2. Generate the static Kaplan-Meier plot
km_plot_adl_onset <- ggsurvplot(
    fit_adl_onset,
    data = adl_onset_survival_data,
    pval = TRUE,
    conf.int = FALSE, # Set to FALSE for cleaner plot with 4 lines
    title = "Survival Curves by Timing of ADL Limitation Onset",
    legend.title = "ADL Onset Group",
    legend = "bottom",
    legend.labs = new_labels,
    xlab = "Time in Months",
    break.time.by = 24,
    # Use a custom color palette for the 4 groups
    palette = color_palette, # Corrected palette vector
    ggtheme = theme_minimal()
)

km_plot_adl_onset$plot <- km_plot_adl_onset$plot +
  # Use guides to control the appearance of the legend keys and text
  guides(
    col = guide_legend(
      nrow = 1, # Arrange items into 2 rows to force wrapping
      byrow = TRUE,
      # Remove the explicit variable name ("combined_group =") from the keys.
      # This is often hidden by using 'legend.title = ""', but sometimes requires a manual override.
      title.position = "top"
    )
  ) +
# Further theme adjustments to control spacing if needed
  theme(
    # Legend appearance: This hides the variable name if it appears above the labels
    legend.title = element_text(face = "bold", size = 10),
    # Ensure legend text wraps (done via nrow/byrow in guides, but theme is good for spacing)
    legend.text = element_text(size = 8)
  )

# Print the static plot
print(km_plot_adl_onset)

Plot 6: KM plot grouped by ADL onset and Early Disability

# Merge composite groups with survival data
composite_survival_data <- FINAL_merged_df_mod %>%
  select(hhidpn_mod, months.observed, status) %>%
  # Join with the pre-calculated 8-level composite groups
  left_join(composite_groups, by = "hhidpn_mod") %>%
  # Filter to keep only individuals in the 8 defined composite groups
  filter(!is.na(combined_group)) %>%
  # Convert to factor with the desired plotting order (using the order established in Plot 3)
  mutate(
    combined_group = factor(
      combined_group,
      levels = group_order # Use the group_order defined in Plot 3 for consistency
    )
  )

# Report summary
n_individuals_composite_km <- n_distinct(composite_survival_data$hhidpn_mod)
print(paste("Total individuals plotted in 8-Group Composite KM curve:", n_individuals_composite_km))

## [1] "Total individuals plotted in 8-Group Composite KM curve: 11521"

library(survival)
library(survminer)

# # --- Step 0: Define Customization Vectors ---
# 
# # 1. Color Palette (Named vector for mapping)
# color_values <- c(
#  "ADL never occurs | No Disability" = "#1B9E77",
#  "ADL occurs at 4yrs | No Disability" = "#D95F02",
#  "ADL occurs at 8yrs | No Disability" = "#7570B3",
#  "ADL occurs at 12yrs | No Disability" = "#E7298A",
#  "ADL never occurs | Early Disability Only" = "#66A61E",
#  "ADL occurs at 4yrs | Early Disability Only" = "#E6AB02",
#  "ADL occurs at 8yrs | Early Disability Only" = "#A6761D",
#  "ADL occurs at 12yrs | Early Disability Only" = "#666666"
# )

# 2. Legend Labels (New, custom text in the correct order)
# Note: These labels must match the order of names in color_values.
new_labels <- c(
 "Never ADL | No Disability",   
 "4yr ADL | No Disability",    
 "8yr ADL | No Disability",    
 "12yr ADL | No Disability",    
 "Never ADL | Early Disability",  
 "4yr ADL | Early Disability",   
 "8yr ADL | Early Disability",   
 "12yr ADL | Early Disability"   
)

# 1. Fit the Kaplan-Meier survival curve
# Assuming 'fit_composite' and 'composite_survival_data' are available.
fit_composite <- survfit(
  Surv(months.observed, status) ~ combined_group,
  data = composite_survival_data
)

# 2. Generate the static Kaplan-Meier plot using all built-in arguments
km_plot_composite <- ggsurvplot(
  fit_composite,
  data = composite_survival_data,
  pval = TRUE,
  conf.int = FALSE,
    risk.table = FALSE, # Hide table for cleaner look
  title = "Survival Curves by ADL Onset Timing and Baseline Disability Status (8 Groups)",
 
  # --- Color and Label Assignment (CRITICAL) ---
  # Use the named color vector to map colors to data levels
  palette = c("#1B9E77", "#D95F02", "#7570B3", "#E7298A", "#66A61E", "#E6AB02", "#A6761D", "#666666"), # Matches Plot 2 colors
    # Use legend.labs to provide the new, succinct text labels
  legend.labs = new_labels, 
 
  # --- Legend Position and Title ---
  legend.title = "Composite Group:",
  legend = "bottom",
 
  # --- Styling ---
  xlab = "Time in Months",
  break.time.by = 24,
  ggtheme = theme_minimal(),
    
    # --- Custom Legend Theme (Handles size and wrapping) ---
    # Apply a custom theme using surv_theme to control appearance
    surv_theme = theme(
        legend.title = element_text(face = "bold", size = 10),
        legend.text = element_text(size = 8),
        # Use guides to force wrapping (2 rows) and ensure the legend key is shown
        legend.key.width = unit(0.5, "cm"),
        legend.key.height = unit(0.5, "cm")
    )
)

# To force wrapping the legend, we modify the guides of the primary ggplot object.
km_plot_composite$plot <- km_plot_composite$plot +
  guides(
    color = guide_legend(
        nrow = 2, # Wrap to 2 rows
        byrow = TRUE,
        title.position = "top",
        # We explicitly set this to NULL to ensure the key is NOT hidden by any previous settings
        # override.aes = list(linetype = 1, linewidth = 1, alpha = 1) 
    )
  )

# Print the final static plot
print(km_plot_composite)

11.06.2025 Presentation of Cohort Analysis Data Visualization

Alex Tan

2025-11-06

Importing final merged mortality and longitudinal dataset with accounting for unclean inputs and duplicates

Outline for Visualization

Plot 1: Spaghetti plot of POA vs Time of All individuals - randomn 10 percent graphed and 25 individuals highlighted

Plot 2: Spaghetti plot of POA vs Time grouped by ADL Onset

Plot 3: Spaghetti plot of POA vs Time grouped by ADL Onset and Early Disability Status

Plot 4: KM plot for those above and below mean POA scores

Plot 5: KM plot for individuals grouped by ADL onset

Plot 6: KM plot grouped by ADL onset and Early Disability