knitr::opts_chunk$set(echo = params$printcode)

Introduction

Task outline

I am an Executive Director Medical Services of Hospital A (denoted in Col A in PCOR File.xlsx) and our hospital has been contributing to Prostate Cancer Outcomes Registry (PCOR) since 2016. Please provide me with the completeness of PSA assessment (denoted in Col C in PCOR File.xlsx) at diagnosis (denoted in Col B in PCOR File.xlsx) in my hospital compared to others? Please display this indicator (PSA assessment completed and documented at diagnosis) in my hospital over time.

Load R packages

library(readr)      #Load data
library(dplyr)      #Process data
library(tidyr)      #Data Wrangling
library(lubridate)  #Dealing with date/time variables
library(ggplot2)    #Graphical plotting
library(plotly)     #Interactive graphical plotting
library(knitr)      #Output package
library(kableExtra) 
library(magrittr)   #R piping function

Load data set

The data set table is comprised of 3 columns
1. Hospital
a. Hospital Name
2. DateofDiagnosis
a. Date of Diagnosis
3. Indicator_X
a. PSA assessment completed and documented at diagnosis
- 1: Yes
- 0: No

Data Source

PSA assessments -Prostate Cancer Registry
(Data for PSA assessments from the Prostate Cancer Outcomes Registry for various hospitals since 2016).
https://www.monash.edu/medicine/sphpm/pcor-vic/home

# Load excel file
pcor <- readxl::read_xlsx("PCOR.xlsx", col_names = TRUE, na = "", trim_ws = TRUE)

Check data set

Read column headers

#List headers
names(pcor)

## [1] "Hospital"        "Dateofdiagnosis" "Indicator_X"

Change headers

#Change column names
colnames(pcor) <- c("Hospital", "Date_of_Diagnosis", "PSA_Assessment_Completed")

Examine data types

# Check class for earch data column
class(pcor$Hospital)

## [1] "character"

class(pcor$Date_of_Diagnosis)

## [1] "POSIXct" "POSIXt"

class(pcor$PSA_Assessment_Completed)

## [1] "numeric"

The data type for ‘Hospital’ is character while ‘Date of Diagnosis’ is date-time and ‘PAC Assessment Completed’ is numeric. But ‘PAC Assessment Completed’ is binary so we should change this to factor.

Change ‘PAC Assessment Completed’ to factor

# Change class of 'PAC Assessment Completed' to factor
pcor$PSA_Assessment_Completed <- factor(pcor$PSA_Assessment_Completed, levels = c(0, 1), labels = c("No", "Yes"))

# Check class of 'PSA Assessment Completed'
class(pcor$PSA_Assessment_Completed)

## [1] "factor"

Summary statistics

summary(pcor)

##    Hospital         Date_of_Diagnosis             PSA_Assessment_Completed
##  Length:8164        Min.   :2016-01-01 00:00:00   No :3358                
##  Class :character   1st Qu.:2016-10-03 00:00:00   Yes:4806                
##  Mode  :character   Median :2017-06-26 00:00:00                           
##                     Mean   :2017-06-28 18:28:13                           
##                     3rd Qu.:2018-03-30 00:00:00                           
##                     Max.   :2018-12-31 00:00:00

Data transformation

Calculate the percent complete for each hospital

# Calculate PSA assessments completed and not completed for each hospital
hospitals_PSA_sum <- pcor %>% 
  group_by(Hospital, PSA_Assessment_Completed) %>%
  summarize(count=n())

# Calculate proportion of PSA assessments ocmpleted for each hospital
hospitals_PSA_sum$Proportion_of_total <- hospitals_PSA_sum$count/nrow(pcor)

# PSA assessments completed for each hospital as percentage of total
hospitals_PSA_sum$Percent_of_total <- hospitals_PSA_sum$count/nrow(pcor)*100

# Percentage of PSA assessments completed for each hospital
PSA_complete_hospital <- hospitals_PSA_sum %>% 
  group_by(Hospital) %>% 
  mutate(Percent_per_hospital = (count/sum(count) * 100))

# Display first 6 lines
head(PSA_complete_hospital)

Transpose data set

# Calculate number of PSA assessments on each day
PSA_num <- pcor %>% 
  group_by(Hospital, Date_of_Diagnosis, PSA_Assessment_Completed) %>% 
  summarize(count=n())

# Calculate cumulative sum of PSA assessments
PSA_num$cumSumTotal <- unlist(tapply(PSA_num$count, PSA_num$Hospital, cumsum))

# Display first 6 lines
head(PSA_num)

Calculate cumulative sum of PCA assessments completed and documented for each hospital

# Create table containing only PSA assessments completed
PSA_num_yes <- PSA_num %>% filter(PSA_Assessment_Completed == 'Yes')

# Calculate cumulative sum for PSA assessments completed
PSA_num_yes$cumSumYes <- unlist(tapply(PSA_num_yes$count, PSA_num_yes$Hospital, cumsum))

# Display first 6 rows
head(PSA_num_yes)

Join tables containing total and completed PSA Assessments.

# Join tables
PSA_percent_complete <- left_join(PSA_num, PSA_num_yes)

# Display first 6 rows
head(PSA_percent_complete)

Calculate the percentage completeness

# Calculate precent PSA assessments completed over time
PSA_percent_complete <- transform(PSA_percent_complete, percentComplete = (cumSumYes/cumSumTotal)*100)

# Display first 6 rows
head(PSA_percent_complete)

Plot data

Plot the percent complete for each hospital

#Create new table containing Hospital, PSA_Assessments_completed and Percent_per_hospital (Percent completed for each hospital)
PSA_complete_hospital_transpose <- PSA_complete_hospital %>% select(Hospital, PSA_Assessment_Completed, Percent_per_hospital)

#Transpose table so that data can be plotted in plotly
PSA_complete_hospital_transpose <- spread(PSA_complete_hospital_transpose, PSA_Assessment_Completed, Percent_per_hospital)

#Create interactive plotly graph
plot_ly(PSA_complete_hospital_transpose, x=~Hospital, y=~Yes, type = "bar", name = "Yes") %>% 
  add_trace(y=~No, name = "No") %>% 
  layout(
         xaxis = list(
           title = "Hospital",
           titlefont = list(
             size = 20,
             color = 'rgb(107, 107, 107)'),
           tickfont = list(
             size = 16,
             color = 'rgb(107, 107, 107)')),
         yaxis = list(
           title = 'Percent (%)',
           titlefont = list(
             size = 20,
             color = 'rgb(107, 107, 107)'),
           tickfont = list(
             size = 16, 
             color = 'rgb(107, 107, 107)')),
         legend = list(bgcolor = 'rgba(255, 255, 255, 0)', bordercolor = 'rgba(255, 255, 255, 0)'),
         barmode = 'group', bargap = 0.15, bargroupgap = 0.1)

Fig. 1 Completeness of PSA assessments for each hospital. Blue bars - Percentage of PSA assessments completed and documented. Orange bars - Percentage of PSA assessments not completed.

Plot PSA assessment completeness for each hospital with time

# Remove rows containing missing values
PSA_percent_complete_noNa <- PSA_percent_complete %>% na.omit()

# Legend title
legendtitle <- list(yref='paper',xref="paper",y=1.05,x=1.1, text="<b>Hospital</b>",showarrow=F)

# Plot PSA completeness for each hospital
plot_ly(PSA_percent_complete_noNa, 
        x=~Date_of_Diagnosis, 
        y=~percentComplete,
        type = 'scatter', 
        mode = 'lines',
        name = ~Hospital,
        color = ~Hospital == 'A',
        colors = c("lightgrey", "red"),
        text = paste("<b>Hospital:</b> ", PSA_percent_complete_noNa$Hospital)) %>% 
  layout(
    annotations=legendtitle,
         xaxis = list(
           title = "",
           tickangle = -45,
           titlefont = list(
             size = 12,
             color = 'rgb(107, 107, 107)'),
           tickfont = list(
             size = 14,
             color = 'rgb(107, 107, 107)')),
         yaxis = list(
           automargin = TRUE,
           title = 'Percent Completed (%)',
           titlefont = list(
             size = 20,
             color = 'rgb(107, 107, 107)'),
           tickfont = list(
             size = 16, 
             color = 'rgb(107, 107, 107)')),
         legend = list(bgcolor = 'rgba(255, 255, 255, 0)', 
                       bordercolor = 'rgba(255, 255, 255, 0)'),
         barmode = 'group', bargap = 0.15, bargroupgap = 0.1)

Fig. 2 Percentage PSA assessments completed and documented at diagnosis. Comparison of hospital A (red line) with other hospitals in the PSA registry.

Task for R Programmer Applicants (with code)

Prostate Cancer Registry

Charles Galea

05/11/2019