Swimmer plots are a useful way to visualise a set of patient data.
Stacey Phillips demonstrates how to create a Swimmer plot here using PROC along with a comprehension description of swimmer plots.
We will use the same dataset to run in R to replicate the following plot.
library(magrittr)
library(stringi)
library(readr) # Reading in the dataset
library(ggplot2) # Viewing the dataset
library(forcats) # Sorting factors
library(RColorBrewer) # Plot colours
library(dplyr, warn.conflicts=FALSE) # Manipulating the dataframes
library(purrr, warn.conflicts=FALSE) # Manipulating dataframe metadata
library(zoo, warn.conflicts=FALSE) # Filling in NA values
library(reshape2) # Reformmating dataframes
The dataset used can be found here, in plain text.
Ideally this would be in csv format but we can work around this by taking only the lines that start with a number, placing NA values in long stretches of spaces and then using space as the delimiter and removing empty elements.
swimmer_file = "https://blogs.sas.com/content/graphicallyspeaking/files/2014/06/Swimmer_93.txt"
col.names = c("subjectID", "stage", "startTime", "endTime",
"isContinued", "responseType", "responseStartTime", "responseEndTime", "Durable")
df <- readr::read_lines(swimmer_file) %>%
# Split by line recursion (\r\n)
stringi::stri_split(fixed="\r\n", simplify=TRUE) %>%
# Take only lines starting with a number (sample id)
.[grepl("^[0-9]+", .)] %>%
# Remove spaces from response column
gsub(pattern="\\sresponse", replacement="_response") %>%
# Remove spaces from stage column
gsub(pattern="Stage\\s", replacement="Stage_") %>%
# Some lines missing 'Stage' and 'isContinued' column.
# Replace any set of 8 or more spaces with ' . '
gsub(pattern="\\s{8,}", replacement=' . ') %>%
# Split strings by spaces, do not include empty strings as columns
stringi::stri_split(fixed=" ", simplify=TRUE, omit_empty=TRUE) %>%
# Convert to dataframe
as.data.frame(stringsAsFactors=FALSE) %>%
# Set the column names
purrr::set_names(col.names) %>%
# We need to do some more cleaning up of the dataframe
# Convert all . to NAs
dplyr::na_if(".") %>%
# Fill NAs in Stage column
dplyr::mutate(stage=zoo::na.locf(stage)) %>%
# Turn isContinued into boolean
dplyr::mutate(isContinued=dplyr::if_else(isContinued=="FilledArrow", TRUE, FALSE, missing=FALSE)) %>%
# Convert stage variable to factor, remove underscore
dplyr::mutate(stage = as.factor(gsub(pattern="_", replacement=" ", x=stage))) %>%
# Remove underscore from response types
dplyr::mutate(responseType = gsub("_", " ", responseType)) %>%
# Change Durable from character to numeric
dplyr::mutate(Durable = as.numeric(Durable)) %>%
# Change Time variables from character to numeric
dplyr::mutate_at(vars(dplyr::ends_with("Time")), as.numeric)
Let’s have a look at the data using the glimpse tool. We can see that the data frame is ‘tidy’. There is one row for every observation. A tidy dataframe may mean there are multiple entries for a given subject.
df %>% dplyr::glimpse()
## Observations: 15
## Variables: 9
## $ subjectID <chr> "1", "2", "3", "3", "4", "4", "5", "5", "5",...
## $ stage <fct> Stage 1, Stage 2, Stage 3, Stage 3, Stage 4,...
## $ startTime <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
## $ endTime <dbl> 18.5, 17.0, 14.0, 14.0, 13.5, 13.5, 12.5, 12...
## $ isContinued <lgl> TRUE, FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, T...
## $ responseType <chr> "Complete response", "Complete response", "P...
## $ responseStartTime <dbl> 6.5, 10.5, 2.5, 6.0, 7.0, 11.5, 3.5, 6.5, 10...
## $ responseEndTime <dbl> 13.5, 17.0, 3.5, NA, 11.0, NA, 4.5, 8.5, NA,...
## $ Durable <dbl> -0.25, -0.25, -0.25, -0.25, NA, NA, -0.25, -...
For the shapes and arrows we require two smaller dataframes.
df.shapes
will provide the positions and type of object to place on the plot.
unicode
is a list to map shapes to their unicode id.
df.shapes <- df %>%
# Get just the subject and response time columns
dplyr::select(subjectID, responseType, responseStartTime) %>%
# Melt the data frame, so one row per response value.
reshape2::melt(id.vars=c("subjectID", "responseType"), value.name="time") %>%
# Remove na values
dplyr::filter(!is.na(time)) %>%
# Remove response variable column
dplyr::select(-variable) %>%
# Add 'start' to the end of the response type
dplyr::mutate(responseType=paste(responseType, "start", sep=" "))
# Add the end time for each
df.shapes %<>%
dplyr::bind_rows(df %>%
dplyr::select(subjectID, endTime, responseEndTime, isContinued) %>%
# Place endtime as response endtime if not continuing and responseEndTime is NA
dplyr::mutate(responseEndTime=dplyr::if_else(!isContinued & is.na(responseEndTime),
endTime, responseEndTime)) %>%
dplyr::select(-endTime, -isContinued) %>%
# Remove other existing NA responseEndTimes
dplyr::filter(!is.na(responseEndTime)) %>%
dplyr::mutate(responseType="Response end") %>%
dplyr::rename(time=responseEndTime))
# Append on the durable column
df.shapes %<>%
dplyr::bind_rows(df %>%
dplyr::select(subjectID, Durable) %>%
dplyr::filter(!is.na(Durable)) %>%
dplyr::mutate(responseType="Durable") %>%
dplyr::rename(time=Durable))
# Add on the arrow sets
df.shapes %<>%
dplyr::bind_rows(df %>%
dplyr::select(subjectID, endTime, isContinued) %>%
dplyr::filter(isContinued) %>%
dplyr::select(-isContinued) %>%
dplyr::mutate(responseType="Continued Treatment") %>%
dplyr::mutate(endTime=endTime+0.25) %>%
dplyr::rename(time=endTime))
responseLevels = c("Complete response start", "Partial response start",
"Response end", "Durable", "Continued Treatment")
# Convert responseType to factor and set the levels
df.shapes %<>%
dplyr::mutate(responseType = factor(responseType, levels=responseLevels)) %>%
# Order by response type
dplyr::arrange(desc(responseType))
Let’s have a look at the shapes data frame.
df.shapes %>% dplyr::glimpse()
## Observations: 45
## Variables: 3
## $ subjectID <chr> "1", "3", "3", "4", "4", "5", "5", "5", "6", "6",...
## $ responseType <fct> Continued Treatment, Continued Treatment, Continu...
## $ time <dbl> 18.75, 14.25, 14.25, 13.75, 13.75, 12.75, 12.75, ...
The data frame uses subjectID
and time
column as the coordinates of where the shape sits on the next plot and the responseType
as the type of observation to plot.
Now assign the unicode shapes.
unicode = list(triangle=sprintf('\u25B2'),
circle=sprintf('\u25CF'),
square=sprintf('\u25A0'),
arrow=sprintf('\u2794'))
We will use the ggplot2 package to plot the dataframe.
df %>%
# Get just the variables we need for the base of the plot
dplyr::select(subjectID, endTime, stage) %>%
# Remove duplicate rows
dplyr::distinct() %>%
# Order subject ID by numeric value
dplyr::mutate(subjectID=forcats::fct_reorder(.f=subjectID, .x=as.numeric(subjectID), .desc = TRUE)) %>%
# Pipe into ggplot
ggplot(aes(subjectID, endTime)) + # Base axis
geom_bar(stat="identity", aes(fill=factor(stage))) + # Bar plot. Colour by stage
geom_point(data=df.shapes, # Use df.shapes to add reponse points
aes(subjectID, time, colour=responseType, shape=responseType), size=5) +
coord_flip() + # Flip to horizonal bar plot.
scale_colour_manual(values=c(RColorBrewer::brewer.pal(3, "Set1")[1:2], # Add colours
rep("black", 3))) + # min of brewerpal is three but we only need 2.
scale_shape_manual(values=c(rep(unicode[["triangle"]], 2), # Add shapes
unicode[["circle"]], unicode[["square"]], unicode[["arrow"]])) +
scale_y_continuous(limits=c(-0.5, 20), breaks=0:20) + # Set time limits
labs(fill="Disease Stage", colour="Symbol Key", shape="Symbol Key", # Add labels
x="Subject ID ", y="Months since diagnosis",
title="Swimmer Plot",
caption="Durable defined as subject with six months or more of confirmed response") +
theme(plot.title = element_text(hjust = 0.5), # Put title in the middle of plot
plot.caption = element_text(size=7, hjust=0)) # Make caption size smaller
sessionInfo()
## R version 3.4.3 (2017-11-30)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 16299)
##
## Matrix products: default
##
## locale:
## [1] LC_COLLATE=English_Australia.1252 LC_CTYPE=English_Australia.1252
## [3] LC_MONETARY=English_Australia.1252 LC_NUMERIC=C
## [5] LC_TIME=English_Australia.1252
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] bindrcpp_0.2.2 reshape2_1.4.3 zoo_1.8-1
## [4] purrr_0.2.4 dplyr_0.7.4 RColorBrewer_1.1-2
## [7] forcats_0.3.0 ggplot2_2.2.1 readr_1.1.1
## [10] stringi_1.1.7 magrittr_1.5
##
## loaded via a namespace (and not attached):
## [1] Rcpp_0.12.16 pillar_1.2.2 compiler_3.4.3 plyr_1.8.4
## [5] bindr_0.1.1 tools_3.4.3 digest_0.6.15 evaluate_0.10.1
## [9] tibble_1.4.2 gtable_0.2.0 lattice_0.20-35 pkgconfig_2.0.1
## [13] rlang_0.2.0 curl_3.2 yaml_2.1.19 stringr_1.3.1
## [17] knitr_1.20 hms_0.4.2 rprojroot_1.3-2 grid_3.4.3
## [21] glue_1.2.0 R6_2.2.2 rmarkdown_1.9 backports_1.1.2
## [25] scales_0.5.0 htmltools_0.3.6 assertthat_0.2.0 colorspace_1.3-2
## [29] lazyeval_0.2.1 munsell_0.4.3