Outline for Visualization
- Spaghetti Plots for visualizing POA scores (y) vs time wave (x)
layered by adl status and early disability
- Will start by graphin overall POA scores for all individuals
ungrouped
- Then will start grouping datasets by adl status and early disability
- First layer of plot will be adl status : never occurs, occurs at 4
yrs, occurs at 8 yrs, occurs at 12 yrs.
- Second layer of plot will be early disability status
- KM curves - compare mean of the individuals 1-4 measures with the
median POA scores
- Calculate the median POA score for inclusive of ALL
measurements
- Mean the individual POA scores then group by if they are above or
below median
- Graph each group as two different survival curves
- Make KM plots for the different ADL subsets
- First group by ADL onset
- Next cross with Early Disability Status
Plot 1: Spaghetti plot of POA vs Time of All individuals - randomn
10 percent graphed and 25 individuals highlighted
# Prepare the data for plotting:
# Assuming FINAL_merged_df_mod is already loaded and contains the data.
all_ind_poa_trajectory_data <- FINAL_merged_df_mod %>%
# Select only the necessary columns using the original column names
select(hhidpn_mod, poa.average.score, wave) %>%
# Remove rows where POA score is missing, as these cannot be plotted
filter(!is.na(poa.average.score)) %>%
# Convert ID to factor for ggplot's grouping
mutate(
hhidpn_mod = as.factor(hhidpn_mod),
# Convert 'wave' into an ordered factor for correct axis order
wave_ordered = factor(wave,
levels = c("baseline", "4yrs", "8yrs", "12yrs"),
ordered = TRUE)
) %>%
# Filter out any rows where the wave is not one of the four key points
filter(!is.na(wave_ordered))
# Determine the number of unique individuals
n_individuals <- n_distinct(all_ind_poa_trajectory_data$hhidpn_mod)
# --- MODIFICATIONS FOR SAMPLING ---
n_gray <- round(n_individuals * 0.1) # 10% for the gray background lines
n_highlight <- 25 # Fixed number of individuals to highlight
# Get all unique IDs
all_ids <- all_ind_poa_trajectory_data %>% distinct(hhidpn_mod)
# Set seed for reproducibility
set.seed(42)
# 1. Sample 10% of IDs for the gray background lines
gray_ids <- all_ids %>%
sample_n(n_gray) %>%
pull(hhidpn_mod)
# 2. Sample 25 IDs for the highlighted lines
# Note: It's okay for these two samples to overlap. The colored lines will be drawn
# over the gray lines, ensuring they stand out.
highlight_ids <- all_ids %>%
sample_n(n_highlight) %>%
pull(hhidpn_mod)
# Create two datasets:
# Dataset for the gray background lines (10% sample)
gray_lines_data <- all_ind_poa_trajectory_data %>%
filter(hhidpn_mod %in% gray_ids)
# Dataset for the highlighted lines (25 individuals)
highlight_data <- all_ind_poa_trajectory_data %>%
filter(hhidpn_mod %in% highlight_ids)
# Report the summary
print(paste("Total unique individuals in original data:", n_individuals))
## [1] "Total unique individuals in original data: 11521"
print(paste("Number of individuals in gray background (10%):", n_gray))
## [1] "Number of individuals in gray background (10%): 1152"
print(paste("Number of individuals highlighted (fixed):", n_highlight))
## [1] "Number of individuals highlighted (fixed): 25"
# --- 1. Create the base ggplot ---
all_ind_poa_trajectory_plot <- ggplot() +
# A. Plot the 10% sample of individual trajectories (in light grey)
geom_line(data = gray_lines_data, # Use the 10% sampled data
aes(
x = wave_ordered,
y = poa.average.score,
group = hhidpn_mod
),
color = "gray80",
linewidth = 0.1) +
# B. Plot the random sample of 25 trajectories (in color) - TEXT MAPPED FOR INTERACTIVITY
geom_line(data = highlight_data,
aes(
x = wave_ordered,
y = poa.average.score,
group = hhidpn_mod,
color = hhidpn_mod,
# Custom tooltip text mapped here
text = paste("ID:", hhidpn_mod, "<br>Time Point:", wave, "<br>POA Score:", poa.average.score)
),
linewidth = 1,
alpha = 0.8) +
# C. Add points for the highlighted lines - TEXT MAPPED FOR INTERACTIVITY
geom_point(data = highlight_data,
aes(
x = wave_ordered,
y = poa.average.score,
group = hhidpn_mod,
color = hhidpn_mod,
# Custom tooltip text mapped here (needed for points)
text = paste("ID:", hhidpn_mod, "<br>Time Point:", wave, "<br>POA Score:", poa.average.score)
),
size = 1.5,
alpha = 0.8) +
# --- 2. Styling and Labels ---
labs(
title = "POA Average Score Trajectories Over 12 Years",
subtitle = paste0("Background (gray): ", n_gray, " trajectories (10% sample). Highlighted (color): ", n_highlight, " trajectories."),
x = "Survey Wave (Time Point)",
y = "POA Average Score",
color = "Sample ID"
) +
# Remove the legend for the highlight colors since there are too many
guides(color = "none") +
# Enhance aesthetics
theme_minimal() +
theme(
plot.title = element_text(face = "bold", size = 16),
axis.title = element_text(size = 12),
panel.grid.minor = element_blank()
)
# --- 3. Convert to Interactive Plotly ---
# We use tooltip="text" to only show the custom hover information we defined
ggplotly(all_ind_poa_trajectory_plot, tooltip = "text") %>%
plotly::layout(
xaxis = list(fixedrange = TRUE),
yaxis = list(fixedrange = TRUE)
)
Plot 2: Spaghetti plot of POA vs Time grouped by ADL Onset
- Preperation of data for ADL onset
# - - - 1. Define ADL Onset Group for each individual (hhidpn_mod) - - -
# assuming ADL occurrence is any value other than 'no' or NA.
adl_onset_groups <- FINAL_merged_df_mod %>%
# waves for onset definition
filter(wave %in% c("baseline", "4yrs", "8yrs", "12yrs")) %>%
# first start by definig if adl occurs or not. ADL occurrence: 1 if limitation is NOT 'no' and NOT NA
mutate(adl_occurs = ifelse(tolower(adl.limitations) != "no" & !is.na(adl.limitations), 1, 0)) %>%
# Arrange by ID and time to find the first occurrence - easier to use months.since.baseline because it can be arranged numerically
arrange(hhidpn_mod, months.since.baseline) %>%
group_by(hhidpn_mod) %>%
# Summarize to find the wave of the first ADL occurrence
summarise(
# Get the wave corresponding to the first '1' in adl_occurs
first_onset_wave = wave[which(adl_occurs == 1)[1]],
.groups = 'drop'
) %>%
# Assign final group label based on the first onset wave
mutate(
adl_onset_group = case_when(
is.na(first_onset_wave) ~ "ADL never occurs",
first_onset_wave == "4yrs" ~ "ADL occurs at 4yrs",
first_onset_wave == "8yrs" ~ "ADL occurs at 8yrs",
first_onset_wave == "12yrs" ~ "ADL occurs at 12yrs",
# Any onset at baseline or other unexpected wave is grouped into 'never'
# or treated as 'excluded' for the user's specific 4 groups. We'll exclude it
TRUE ~ "Onset at Baseline/Excluded"
)
) %>%
# Filter to keep only the 4 groups requested
filter(adl_onset_group != "Onset at Baseline/Excluded") %>%
select(hhidpn_mod, adl_onset_group)
- Merge adl sorting with main trajectory data
# - - - 2. Prepare the main trajectory data and merge groups - - -
adl_onset_poa_trajectory_data <- FINAL_merged_df_mod %>%
# selecting necessary columns from final cohort dataset
select(hhidpn_mod, poa.average.score, wave, disability.status) %>%
# Remove rows where POA score is missing
filter(!is.na(poa.average.score)) %>%
# Merge in the ADL onset groups
left_join(adl_onset_groups, by = "hhidpn_mod") %>%
# Filter to keep only individuals who belong to one of the 4 defined ADL onset groups
filter(!is.na(adl_onset_group)) %>%
# Convert to factors and order time points
mutate(
hhidpn_mod = as.factor(hhidpn_mod),
wave_ordered = factor(wave,
levels = c("baseline", "4yrs", "8yrs", "12yrs"),
ordered = TRUE)
) %>%
# Filter out any rows where the wave is not one of the four key points
filter(!is.na(wave_ordered))
# - - - 3. Prepare aggregated data (Mean POA by ADL Onset Group) - - -
mean_poa_adl_onset_data <- adl_onset_poa_trajectory_data %>%
group_by(adl_onset_group, wave_ordered) %>%
summarise(
mean_poa = mean(poa.average.score, na.rm = TRUE),
n_obs = n(),
.groups = 'drop'
)
# Report the summary
n_individuals_grouped <- n_distinct(adl_onset_poa_trajectory_data$hhidpn_mod)
print(paste("Total individuals plotted and grouped by ADL onset:", n_individuals_grouped))
## [1] "Total individuals plotted and grouped by ADL onset: 11521"
- Graphing either 20 or 10 percent of the individuals for graph
clarity and readability
# - - - 4. Prepare sampled data for spaghetti plot (20% random sample) - - -
n_individuals_grouped <- n_distinct(adl_onset_poa_trajectory_data$hhidpn_mod)
set.seed(42) # Set seed for reproducibility
sample_size <- floor(n_individuals_grouped * 0.20)
sampled_ids <- adl_onset_poa_trajectory_data %>%
distinct(hhidpn_mod) %>%
sample_n(sample_size) %>%
pull(hhidpn_mod)
adl_onset_poa_sampled_data <- adl_onset_poa_trajectory_data %>%
filter(hhidpn_mod %in% sampled_ids)
print(paste("Total individuals plotted in the mean line:", n_individuals_grouped))
## [1] "Total individuals plotted in the mean line: 11521"
print(paste("Number of individuals plotted in the spaghetti lines (20% sample):", sample_size))
## [1] "Number of individuals plotted in the spaghetti lines (20% sample): 2304"
- now let’s plot!
- making only the mean lines interactive
- should have four groups each a distinct color
# - - - 1. Create the base ggplot - - -
color_values_4 <- c(
"ADL never occurs" = "#1B9E77",
"ADL occurs at 4yrs" = "#D95F02",
"ADL occurs at 8yrs" = "#7570B3",
"ADL occurs at 12yrs" = "#E7298A"
)
group_order_4 <- names(color_values_4)
adl_onset_trajectory_mean_plot_20_sampled <- adl_onset_poa_sampled_data %>%
# CRITICAL FIX: Manually set factor levels for combined_group to guarantee legend order
mutate(adl_onset_group = factor(adl_onset_group, levels = group_order_4)) %>%
ggplot() +
# A. Plot SAMPLED individual trajectories (colored by onset group, non-interactive)
geom_line(aes(
x = wave_ordered,
y = poa.average.score,
group = hhidpn_mod,
color = adl_onset_group
# IMPORTANT: Removed 'text' aesthetic to disable interactivity on individual lines
),
linewidth = 0.3,
alpha = 0.15) +
# B. Plot the MEAN line for each group (using the aggregated data)
geom_line(data = mean_poa_adl_onset_data,
aes(
x = wave_ordered,
y = mean_poa,
group = adl_onset_group,
color = adl_onset_group,
# Text mapping kept here for interactivity on the mean line
text = paste("Group:", adl_onset_group, "<br>Mean POA:", round(mean_poa, 3))
),
linewidth = 1.5,
alpha = 1) +
# C. Points for the mean line
geom_point(data = mean_poa_adl_onset_data,
aes(
x = wave_ordered,
y = mean_poa,
group = adl_onset_group,
color = adl_onset_group,
# Text mapping kept here for interactivity on the mean line
text = paste("Group:", adl_onset_group, "<br>Mean POA:", round(mean_poa, 3))
),
size = 3,
alpha = 1) +
# - - - 2. Styling and Labels - - -
labs(
title = "POA Score Trajectories by Timing of ADL Limitation Onset (with Group Means)",
subtitle = paste0("Individual trajectories (faint lines, 20% sample) and mean trends (thick lines, 100% cohort) plotted for ", n_individuals_grouped, " individuals."),
x = "Survey Wave (Time Point)",
y = "POA Average Score",
color = "ADL Onset Group"
) +
ylim(2.5, 5) +
scale_color_manual(
values = color_values_4,
breaks = group_order_4 # Enforce the custom order
) +
# Add guide to make legend lines thicker
guides(color = guide_legend(override.aes = list(linewidth = 4, alpha = 4))) +
# Ensure only ONE legend entry is used (for color)
theme_minimal() +
theme(
plot.title = element_text(face = "bold", size = 12),
axis.title = element_text(size = 12),
legend.position = "bottom",
panel.grid.minor = element_blank()
)
# - - - 3. Convert to Interactive Plotly - - -
# We use tooltip="text" to only show the custom hover information defined on the mean lines/points
ggplotly(adl_onset_trajectory_mean_plot_20_sampled, tooltip = "text") %>%
plotly::layout(
xaxis = list(fixedrange = TRUE),
yaxis = list(fixedrange = TRUE)
)
Plot 3: Spaghetti plot of POA vs Time grouped by ADL Onset and Early
Disability Status
- Data preperation - creating a new df where each mod ID has a
disability status
# - - - 1. Extract Fixed Baseline Disability Status (One value per individual) - - -
# Since disability.status is constant, we find the first non-NA status for each ID.
disability_status_fixed_df <- FINAL_merged_df_mod %>%
group_by(hhidpn_mod) %>%
summarise(
disability_status_fixed = disability.status[!is.na(disability.status)][1], # take the first disability status listed
.groups = 'drop'
) %>%
# Filter out IDs where fixed status couldn't be determined
filter(!is.na(disability_status_fixed)) %>%
# simplify disability status for a cleaner label
mutate(
disability_label = ifelse(disability_status_fixed == "no.disability",
"No Disability",
"Early Disability Only")
) %>%
select(hhidpn_mod, disability_label)
- combining fixed disability status with adl onset for 8 groups
# - - - 2. Merge ADL Onset with Fixed Disability Status to Create 8-Level Composite Group - - -
composite_groups <- adl_onset_groups %>%
left_join(disability_status_fixed_df, by = "hhidpn_mod") %>%
filter(!is.na(disability_label)) %>%
# Create the 8-level composite factor
mutate(
combined_group = factor(paste(adl_onset_group, disability_label, sep = " | ")) # this will tell me first what time they had adl onset, and second, their disability status
) %>%
select(hhidpn_mod, combined_group)
- take the composite and use it to merge with main poa trajectory
data
# - - - 3. Prepare the main trajectory data and merge groups - - -
adl_onset_with_disability_poa_trajectory_data <- FINAL_merged_df_mod %>%
# selecting necessary columns from final cohort dataset
select(hhidpn_mod, poa.average.score, wave) %>%
# Remove rows where POA score is missing
filter(!is.na(poa.average.score)) %>%
# Merge in the composite groups
left_join(composite_groups, by = "hhidpn_mod") %>%
# Filter to keep only individuals who belong to one of the 8 defined combined groups
filter(!is.na(combined_group)) %>%
# Convert to factors and order time points
mutate(
hhidpn_mod = as.factor(hhidpn_mod),
wave_ordered = factor(wave,
levels = c("baseline", "4yrs", "8yrs", "12yrs"),
ordered = TRUE)
) %>%
# Filter out any rows where the wave is not one of the four key points
filter(!is.na(wave_ordered))
- Prepare the mean lines for each of the 8 grroups
# --- 4. Prepare aggregated data (Mean POA by 8-level Combined Group) ---
mean_poa_combined_group_data <- adl_onset_with_disability_poa_trajectory_data %>%
group_by(combined_group, wave_ordered) %>%
summarise(
mean_poa = mean(poa.average.score, na.rm = TRUE),
n_obs = n(),
.groups = 'drop'
)
- Sampling the data for 20% individual lines
# --- 5. Prepare sampled data for spaghetti plot (20% random sample) ---
n_individuals_grouped <- n_distinct(adl_onset_with_disability_poa_trajectory_data$hhidpn_mod)
set.seed(42) # Set seed for reproducibility
sample_size <- floor(n_individuals_grouped * 0.20)
sampled_ids <- adl_onset_with_disability_poa_trajectory_data %>%
distinct(hhidpn_mod) %>%
sample_n(sample_size) %>%
pull(hhidpn_mod)
adl_onset_poa_sampled_data <- adl_onset_with_disability_poa_trajectory_data %>%
filter(hhidpn_mod %in% sampled_ids)
print(paste("Total individuals plotted in the mean line (8 groups):", n_individuals_grouped))
## [1] "Total individuals plotted in the mean line (8 groups): 11521"
print(paste("Number of individuals plotted in the spaghetti lines (20% sample):", sample_size))
## [1] "Number of individuals plotted in the spaghetti lines (20% sample): 2304"
# step 0 for legend ordering
# Define the order and colors for the manual scale
color_values <- c(
"ADL never occurs | No Disability" = "#1B9E77",
"ADL occurs at 4yrs | No Disability" = "#D95F02",
"ADL occurs at 8yrs | No Disability" = "#7570B3",
"ADL occurs at 12yrs | No Disability" = "#E7298A",
"ADL never occurs | Early Disability Only" = "#66A61E",
"ADL occurs at 4yrs | Early Disability Only" = "#E6AB02",
"ADL occurs at 8yrs | Early Disability Only" = "#A6761D",
"ADL occurs at 12yrs | Early Disability Only" = "#666666"
)
# Extract the names of the groups in the desired order
group_order <- names(color_values)
combined_trajectory_mean_plot <- adl_onset_poa_sampled_data %>%
# CRITICAL FIX: Manually set factor levels for combined_group to guarantee legend order
mutate(combined_group = factor(combined_group, levels = group_order)) %>%
ggplot() +
# A. Plot SAMPLED individual trajectories (colored by combined group, non-interactive)
geom_line(aes(
x = wave_ordered,
y = poa.average.score,
group = hhidpn_mod,
color = combined_group
),
linewidth = 0.3,
alpha = 0.15) +
# B. Plot the MEAN line for each group (using the aggregated data)
geom_line(data = mean_poa_combined_group_data,
aes(
x = wave_ordered,
y = mean_poa,
group = combined_group,
color = combined_group,
# Text mapping kept here for interactivity on the mean line
text = paste("Group:", combined_group, "<br>Mean POA:", round(mean_poa, 3))
),
linewidth = 1.2,
alpha = 1) +
# C. Points for the mean line
geom_point(data = mean_poa_combined_group_data,
aes(
x = wave_ordered,
y = mean_poa,
group = combined_group,
color = combined_group,
# Text mapping kept here for interactivity on the mean line
text = paste("Group:", combined_group, "<br>Mean POA:", round(mean_poa, 3))
),
size = 2.5,
alpha = 1) +
# --- 2. Styling and Labels ---
labs(
title = "POA Score Trajectories by ADL Onset Timing and Baseline Disability Status (8 Groups)",
subtitle = paste0("Individual trajectories (faint lines, 20% sample) and mean trends (thick lines, 100% cohort) plotted for ", n_individuals_grouped, " individuals. Note: 8-level grouping used."),
x = "Survey Wave (Time Point)",
y = "POA Average Score",
color = "Composite Group (ADL Onset | Disability Status)"
) +
# Set the Y-axis limits (zoom)
ylim(2.5, 5) +
# Use a more diverse palette for 8 groups
scale_color_manual(
values = color_values,
breaks = group_order # Enforce the custom order
) +
# Add guide to make legend lines thicker
guides(color = guide_legend(override.aes = list(linewidth = 4, alpha = 4))) +
# Increase plot width for 8 legends and a long title
theme_minimal() +
theme(
plot.title = element_text(face = "bold", size = 11),
axis.title = element_text(size = 10),
# Adjust Legend Position to the top and size slightly
legend.position = "top",
legend.text = element_text(size = 8), # Smaller text
legend.title = element_text(size = 9, face = "bold"), # Smaller title
panel.grid.minor = element_blank()
)
# --- 3. Convert to Interactive Plotly ---
ggplotly(combined_trajectory_mean_plot, tooltip = "text") %>%
plotly::layout(
xaxis = list(fixedrange = TRUE),
yaxis = list(fixedrange = TRUE)
) %>%
plotly::config(
displaylogo = FALSE,
showLink = FALSE
)
Plot 4: KM plot for those above and below mean POA scores
- Calculate the median poa.average.score
# The na.rm = TRUE argument tells R to ignore NA (missing) values in the calculation
median_poa_score <- median(FINAL_merged_df_mod$poa.average.score, na.rm = TRUE)
print(paste("The median of all POA scores is:", median_poa_score))
## [1] "The median of all POA scores is: 4"
- Grouping factor based on the median
# 1. Calculate the mean POA score for each individual
# This creates a new dataframe with one row per individual
individual_means <- FINAL_merged_df_mod %>%
group_by(hhidpn_mod) %>%
summarise(
mean_poa_score = mean(poa.average.score, na.rm = TRUE)
)
# 2. Calculate the median of these individual mean scores
median_of_individual_means <- median(individual_means$mean_poa_score, na.rm = TRUE)
# The median is 3.9375
# 3. Merge the individual mean back into the main dataframe
FINAL_merged_df_mod <- FINAL_merged_df_mod %>%
left_join(individual_means, by = "hhidpn_mod")
# 4. Create the new grouping variable and label based on the median (3.9375)
median_val_str <- "3.9375"
low_label <- paste0("Low POA (< ", median_val_str, ")") # <-- Shorter label
high_label <- paste0("High POA (> ", median_val_str, ")") # <-- Shorter label
FINAL_merged_df_mod <- FINAL_merged_df_mod %>%
mutate(
poa_group_new = ifelse( # this will be the column we are grouping by in the km curves
mean_poa_score > median_of_individual_means,
high_label,
low_label
)
)
# Convert the new grouping variable to a factor
FINAL_merged_df_mod$poa_group_new <- factor(
FINAL_merged_df_mod$poa_group_new,
levels = c(low_label, high_label) # Ensures Low group is first
)
- Fit the KM survival curve
library(survival)
library(survminer)
# 5. Fit the Kaplan-Meier survival curve
# Ensure only valid rows (where we have both time and group) are used
fit_median <- survfit(
Surv(months.observed, status) ~ poa_group_new,
data = FINAL_merged_df_mod
)
# summary(fit_median)
# 6. Generate the static Kaplan-Meier plot
km_plot_median <- ggsurvplot(
fit_median,
data = FINAL_merged_df_mod,
risk.table = FALSE, # No Risk table
pval = TRUE,
conf.int = TRUE,
title = "KM Survival Curves for Individuals Above and Below Median POA Score",
legend.title = "POA Score Group", # Labels are now embedded in the factor levels
legend.labs = c(low_label, high_label), # <-- Explicitly set legend labels
xlab = "Time in Months",
break.time.by = 24
)
# Print the static plot
print(km_plot_median)

Plot 5: KM plot for individuals grouped by ADL onset
# Merge ADL onset groups with survival data
adl_onset_survival_data <- FINAL_merged_df_mod %>%
select(hhidpn_mod, months.observed, status) %>%
# Join with the pre-calculated ADL onset groups (4 levels: never, 4yrs, 8yrs, 12yrs)
left_join(adl_onset_groups, by = "hhidpn_mod") %>%
# Filter to keep only individuals in the 4 defined ADL onset groups
filter(!is.na(adl_onset_group)) %>%
# Convert to factor with the desired plotting order (earlier onset = worse outcome, usually placed first)
mutate(
adl_onset_group = factor(
adl_onset_group,
levels = c("ADL occurs at 4yrs", "ADL occurs at 8yrs", "ADL occurs at 12yrs", "ADL never occurs")
)
)
# Report summary
n_individuals_adl_km <- n_distinct(adl_onset_survival_data$hhidpn_mod)
print(paste("Total individuals plotted in ADL Onset KM curve:", n_individuals_adl_km))
## [1] "Total individuals plotted in ADL Onset KM curve: 11521"
# Filter the data for the problematic group
adl_12yrs_data <- adl_onset_survival_data %>%
filter(adl_onset_group == "ADL occurs at 12yrs")
# Check the count and status distribution
print("Summary of ADL Occurs at 12yrs Group:")
## [1] "Summary of ADL Occurs at 12yrs Group:"
print(table(adl_12yrs_data$status))
##
## 0
## 1320
- Fit the survival curves and plot
# 1. Fit the Kaplan-Meier survival curve
fit_adl_onset <- survfit(
Surv(months.observed, status) ~ adl_onset_group,
data = adl_onset_survival_data
)
new_labels <- c(
"ADL Occurs at 4 yrs", # Group 1
"ADL Occurs at 8 yrs", # Group 2
"ADL Occurs at 12 yrs", # Group 3
"ADL Never Occurs" # Group 4
)
color_palette <- c("#D95F02", "#7570B3", "#E7298A", "#1B9E77")
# 2. Generate the static Kaplan-Meier plot
km_plot_adl_onset <- ggsurvplot(
fit_adl_onset,
data = adl_onset_survival_data,
pval = TRUE,
conf.int = FALSE, # Set to FALSE for cleaner plot with 4 lines
title = "Survival Curves by Timing of ADL Limitation Onset",
legend.title = "ADL Onset Group",
legend = "bottom",
legend.labs = new_labels,
xlab = "Time in Months",
break.time.by = 24,
# Use a custom color palette for the 4 groups
palette = color_palette, # Corrected palette vector
ggtheme = theme_minimal()
)
km_plot_adl_onset$plot <- km_plot_adl_onset$plot +
# Use guides to control the appearance of the legend keys and text
guides(
col = guide_legend(
nrow = 1, # Arrange items into 2 rows to force wrapping
byrow = TRUE,
# Remove the explicit variable name ("combined_group =") from the keys.
# This is often hidden by using 'legend.title = ""', but sometimes requires a manual override.
title.position = "top"
)
) +
# Further theme adjustments to control spacing if needed
theme(
# Legend appearance: This hides the variable name if it appears above the labels
legend.title = element_text(face = "bold", size = 10),
# Ensure legend text wraps (done via nrow/byrow in guides, but theme is good for spacing)
legend.text = element_text(size = 8)
)
# Print the static plot
print(km_plot_adl_onset)

Plot 6: KM plot grouped by ADL onset and Early Disability
# Merge composite groups with survival data
composite_survival_data <- FINAL_merged_df_mod %>%
select(hhidpn_mod, months.observed, status) %>%
# Join with the pre-calculated 8-level composite groups
left_join(composite_groups, by = "hhidpn_mod") %>%
# Filter to keep only individuals in the 8 defined composite groups
filter(!is.na(combined_group)) %>%
# Convert to factor with the desired plotting order (using the order established in Plot 3)
mutate(
combined_group = factor(
combined_group,
levels = group_order # Use the group_order defined in Plot 3 for consistency
)
)
# Report summary
n_individuals_composite_km <- n_distinct(composite_survival_data$hhidpn_mod)
print(paste("Total individuals plotted in 8-Group Composite KM curve:", n_individuals_composite_km))
## [1] "Total individuals plotted in 8-Group Composite KM curve: 11521"
library(survival)
library(survminer)
# # --- Step 0: Define Customization Vectors ---
#
# # 1. Color Palette (Named vector for mapping)
# color_values <- c(
# "ADL never occurs | No Disability" = "#1B9E77",
# "ADL occurs at 4yrs | No Disability" = "#D95F02",
# "ADL occurs at 8yrs | No Disability" = "#7570B3",
# "ADL occurs at 12yrs | No Disability" = "#E7298A",
# "ADL never occurs | Early Disability Only" = "#66A61E",
# "ADL occurs at 4yrs | Early Disability Only" = "#E6AB02",
# "ADL occurs at 8yrs | Early Disability Only" = "#A6761D",
# "ADL occurs at 12yrs | Early Disability Only" = "#666666"
# )
# 2. Legend Labels (New, custom text in the correct order)
# Note: These labels must match the order of names in color_values.
new_labels <- c(
"Never ADL | No Disability",
"4yr ADL | No Disability",
"8yr ADL | No Disability",
"12yr ADL | No Disability",
"Never ADL | Early Disability",
"4yr ADL | Early Disability",
"8yr ADL | Early Disability",
"12yr ADL | Early Disability"
)
# 1. Fit the Kaplan-Meier survival curve
# Assuming 'fit_composite' and 'composite_survival_data' are available.
fit_composite <- survfit(
Surv(months.observed, status) ~ combined_group,
data = composite_survival_data
)
# 2. Generate the static Kaplan-Meier plot using all built-in arguments
km_plot_composite <- ggsurvplot(
fit_composite,
data = composite_survival_data,
pval = TRUE,
conf.int = FALSE,
risk.table = FALSE, # Hide table for cleaner look
title = "Survival Curves by ADL Onset Timing and Baseline Disability Status (8 Groups)",
# --- Color and Label Assignment (CRITICAL) ---
# Use the named color vector to map colors to data levels
palette = c("#1B9E77", "#D95F02", "#7570B3", "#E7298A", "#66A61E", "#E6AB02", "#A6761D", "#666666"), # Matches Plot 2 colors
# Use legend.labs to provide the new, succinct text labels
legend.labs = new_labels,
# --- Legend Position and Title ---
legend.title = "Composite Group:",
legend = "bottom",
# --- Styling ---
xlab = "Time in Months",
break.time.by = 24,
ggtheme = theme_minimal(),
# --- Custom Legend Theme (Handles size and wrapping) ---
# Apply a custom theme using surv_theme to control appearance
surv_theme = theme(
legend.title = element_text(face = "bold", size = 10),
legend.text = element_text(size = 8),
# Use guides to force wrapping (2 rows) and ensure the legend key is shown
legend.key.width = unit(0.5, "cm"),
legend.key.height = unit(0.5, "cm")
)
)
# To force wrapping the legend, we modify the guides of the primary ggplot object.
km_plot_composite$plot <- km_plot_composite$plot +
guides(
color = guide_legend(
nrow = 2, # Wrap to 2 rows
byrow = TRUE,
title.position = "top",
# We explicitly set this to NULL to ensure the key is NOT hidden by any previous settings
# override.aes = list(linetype = 1, linewidth = 1, alpha = 1)
)
)
# Print the final static plot
print(km_plot_composite)
