Creating a survival-swimmer plot in R

Background

Swimmer plots are a useful way to visualise a set of patient data.

Stacey Phillips demonstrates how to create a Swimmer plot here using PROC along with a comprehension description of swimmer plots.

We will use the same dataset to run in R to replicate the following plot. Swimmer Plot in PROC

Importing the required libraries

library(magrittr)
library(stringi)
library(readr)   # Reading in the dataset
library(ggplot2) # Viewing the dataset
library(forcats) # Sorting factors
library(RColorBrewer) # Plot colours
library(dplyr, warn.conflicts=FALSE)   # Manipulating the dataframes
library(purrr, warn.conflicts=FALSE)   # Manipulating dataframe metadata
library(zoo, warn.conflicts=FALSE)     # Filling in  NA values
library(reshape2) # Reformmating dataframes

Importing the dataset

The dataset used can be found here, in plain text.
Ideally this would be in csv format but we can work around this by taking only the lines that start with a number, placing NA values in long stretches of spaces and then using space as the delimiter and removing empty elements.

swimmer_file = "https://blogs.sas.com/content/graphicallyspeaking/files/2014/06/Swimmer_93.txt"
col.names = c("subjectID", "stage", "startTime", "endTime", 
              "isContinued", "responseType", "responseStartTime", "responseEndTime", "Durable")
df <- readr::read_lines(swimmer_file) %>%
  # Split by line recursion (\r\n)
  stringi::stri_split(fixed="\r\n", simplify=TRUE) %>%
  # Take only lines starting with a number (sample id)
  .[grepl("^[0-9]+", .)] %>%
  # Remove spaces from response column
  gsub(pattern="\\sresponse", replacement="_response") %>%
  # Remove spaces from stage column
  gsub(pattern="Stage\\s",  replacement="Stage_") %>%
  # Some lines missing 'Stage' and 'isContinued' column. 
  # Replace any set of 8 or more spaces with ' . '
  gsub(pattern="\\s{8,}", replacement=' . ') %>%
  # Split strings by spaces, do not include empty strings as columns
  stringi::stri_split(fixed=" ", simplify=TRUE, omit_empty=TRUE) %>%
  # Convert to dataframe
  as.data.frame(stringsAsFactors=FALSE) %>%
  # Set the column names
  purrr::set_names(col.names) %>%
  # We need to do some more cleaning up of the dataframe
  # Convert all . to NAs
  dplyr::na_if(".") %>%
  # Fill NAs in Stage column
  dplyr::mutate(stage=zoo::na.locf(stage)) %>%
  # Turn isContinued into boolean
  dplyr::mutate(isContinued=dplyr::if_else(isContinued=="FilledArrow", TRUE, FALSE, missing=FALSE)) %>%
  # Convert stage variable to factor, remove underscore
  dplyr::mutate(stage = as.factor(gsub(pattern="_", replacement=" ", x=stage))) %>%
  # Remove underscore from response types 
  dplyr::mutate(responseType = gsub("_", " ", responseType)) %>%
  # Change Durable from character to numeric
  dplyr::mutate(Durable = as.numeric(Durable)) %>%
  # Change Time variables from character to numeric
  dplyr::mutate_at(vars(dplyr::ends_with("Time")), as.numeric)

Viewing the data.

Let’s have a look at the data using the glimpse tool. We can see that the data frame is ‘tidy’. There is one row for every observation. A tidy dataframe may mean there are multiple entries for a given subject.

df %>% dplyr::glimpse()

## Observations: 15
## Variables: 9
## $ subjectID         <chr> "1", "2", "3", "3", "4", "4", "5", "5", "5",...
## $ stage             <fct> Stage 1, Stage 2, Stage 3, Stage 3, Stage 4,...
## $ startTime         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
## $ endTime           <dbl> 18.5, 17.0, 14.0, 14.0, 13.5, 13.5, 12.5, 12...
## $ isContinued       <lgl> TRUE, FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, T...
## $ responseType      <chr> "Complete response", "Complete response", "P...
## $ responseStartTime <dbl> 6.5, 10.5, 2.5, 6.0, 7.0, 11.5, 3.5, 6.5, 10...
## $ responseEndTime   <dbl> 13.5, 17.0, 3.5, NA, 11.0, NA, 4.5, 8.5, NA,...
## $ Durable           <dbl> -0.25, -0.25, -0.25, -0.25, NA, NA, -0.25, -...

Creating plotting attributes

For the shapes and arrows we require two smaller dataframes.

df.shapes will provide the positions and type of object to place on the plot.
unicode is a list to map shapes to their unicode id.

df.shapes <- df %>%
  # Get just the subject and response time columns
  dplyr::select(subjectID, responseType, responseStartTime) %>%
  # Melt the data frame, so one row per response value.
  reshape2::melt(id.vars=c("subjectID", "responseType"), value.name="time") %>%
  # Remove na values
  dplyr::filter(!is.na(time)) %>%
  # Remove response variable column
  dplyr::select(-variable) %>%
  # Add 'start' to the end of the response type
  dplyr::mutate(responseType=paste(responseType, "start", sep=" "))

# Add the end time for each 
df.shapes %<>%
  dplyr::bind_rows(df %>%
                     dplyr::select(subjectID, endTime, responseEndTime, isContinued) %>%
                     # Place endtime as response endtime if not continuing and responseEndTime is NA
                     dplyr::mutate(responseEndTime=dplyr::if_else(!isContinued & is.na(responseEndTime),
                                                                  endTime, responseEndTime)) %>%
                     dplyr::select(-endTime, -isContinued) %>%
                     # Remove other existing NA responseEndTimes
                     dplyr::filter(!is.na(responseEndTime)) %>%
                     dplyr::mutate(responseType="Response end") %>%
                     dplyr::rename(time=responseEndTime))

# Append on the durable column
df.shapes %<>% 
  dplyr::bind_rows(df %>% 
                      dplyr::select(subjectID, Durable) %>%
                      dplyr::filter(!is.na(Durable)) %>%
                      dplyr::mutate(responseType="Durable") %>%
                      dplyr::rename(time=Durable))
# Add on the arrow sets
df.shapes %<>% 
  dplyr::bind_rows(df %>%
                      dplyr::select(subjectID, endTime, isContinued) %>%
                      dplyr::filter(isContinued) %>%
                      dplyr::select(-isContinued) %>%
                      dplyr::mutate(responseType="Continued Treatment") %>%
                      dplyr::mutate(endTime=endTime+0.25) %>%
                      dplyr::rename(time=endTime))

responseLevels = c("Complete response start", "Partial response start", 
                   "Response end", "Durable", "Continued Treatment")

# Convert responseType to factor and set the levels
df.shapes %<>% 
  dplyr::mutate(responseType = factor(responseType, levels=responseLevels)) %>%
  # Order by response type
  dplyr::arrange(desc(responseType))

Let’s have a look at the shapes data frame.

df.shapes %>% dplyr::glimpse()

## Observations: 45
## Variables: 3
## $ subjectID    <chr> "1", "3", "3", "4", "4", "5", "5", "5", "6", "6",...
## $ responseType <fct> Continued Treatment, Continued Treatment, Continu...
## $ time         <dbl> 18.75, 14.25, 14.25, 13.75, 13.75, 12.75, 12.75, ...

The data frame uses subjectID and time column as the coordinates of where the shape sits on the next plot and the responseType as the type of observation to plot.

Now assign the unicode shapes.

unicode = list(triangle=sprintf('\u25B2'),
               circle=sprintf('\u25CF'),
               square=sprintf('\u25A0'),
               arrow=sprintf('\u2794'))

Plotting the data.

We will use the ggplot2 package to plot the dataframe.

df %>% 
  # Get just the variables we need for the base of the plot
  dplyr::select(subjectID, endTime, stage) %>%
  # Remove duplicate rows
  dplyr::distinct() %>%
  # Order subject ID by numeric value
  dplyr::mutate(subjectID=forcats::fct_reorder(.f=subjectID, .x=as.numeric(subjectID), .desc = TRUE)) %>%
  # Pipe into ggplot
  ggplot(aes(subjectID, endTime)) + # Base axis
    geom_bar(stat="identity", aes(fill=factor(stage))) + # Bar plot. Colour by stage
    geom_point(data=df.shapes, # Use df.shapes to add reponse points
               aes(subjectID, time, colour=responseType, shape=responseType), size=5) +
    coord_flip() + # Flip to horizonal bar plot.
    scale_colour_manual(values=c(RColorBrewer::brewer.pal(3, "Set1")[1:2], # Add colours
                                rep("black", 3))) + # min of brewerpal is three but we only need 2.
    scale_shape_manual(values=c(rep(unicode[["triangle"]], 2), # Add shapes
                                unicode[["circle"]], unicode[["square"]], unicode[["arrow"]])) +
    scale_y_continuous(limits=c(-0.5, 20), breaks=0:20) + # Set time limits
    labs(fill="Disease Stage", colour="Symbol Key", shape="Symbol Key",  # Add labels
         x="Subject ID ", y="Months since diagnosis",
         title="Swimmer Plot",
         caption="Durable defined as subject with six months or more of confirmed response") +
    theme(plot.title = element_text(hjust = 0.5), # Put title in the middle of plot
          plot.caption = element_text(size=7, hjust=0)) # Make caption size smaller

sessionInfo()

## R version 3.4.3 (2017-11-30)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 16299)
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=English_Australia.1252  LC_CTYPE=English_Australia.1252   
## [3] LC_MONETARY=English_Australia.1252 LC_NUMERIC=C                      
## [5] LC_TIME=English_Australia.1252    
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] bindrcpp_0.2.2     reshape2_1.4.3     zoo_1.8-1         
##  [4] purrr_0.2.4        dplyr_0.7.4        RColorBrewer_1.1-2
##  [7] forcats_0.3.0      ggplot2_2.2.1      readr_1.1.1       
## [10] stringi_1.1.7      magrittr_1.5      
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_0.12.16     pillar_1.2.2     compiler_3.4.3   plyr_1.8.4      
##  [5] bindr_0.1.1      tools_3.4.3      digest_0.6.15    evaluate_0.10.1 
##  [9] tibble_1.4.2     gtable_0.2.0     lattice_0.20-35  pkgconfig_2.0.1 
## [13] rlang_0.2.0      curl_3.2         yaml_2.1.19      stringr_1.3.1   
## [17] knitr_1.20       hms_0.4.2        rprojroot_1.3-2  grid_3.4.3      
## [21] glue_1.2.0       R6_2.2.2         rmarkdown_1.9    backports_1.1.2 
## [25] scales_0.5.0     htmltools_0.3.6  assertthat_0.2.0 colorspace_1.3-2
## [29] lazyeval_0.2.1   munsell_0.4.3