Preface

library(dplyr)
library(reshape2)
library(ggplot2)
library(coin) # for permutation test
# ------------------- Dataset import and some cleaning ------------------ 
# read the diagnostic and treatment data and 
diag <- read.csv('./data/Patient_Diagnosis.csv',stringsAsFactors = FALSE)
trt <- read.csv('./data/Patient_Treatment.csv',stringsAsFactors = FALSE)
# Some cleaning
# convert date columns to appropriate format
diag$diagnosis_date <- as.Date(diag$diagnosis_date,"%m/%d/%Y")
trt$treatment_date <- as.Date(trt$treatment_date,"%m/%d/%Y") 
# remove duplicate rows
diag = unique(diag)
trt = unique(trt)

General questions

  1. When presented with a new dataset or database, what steps do you generally take to evaluate it prior to working with it?
  1. Based on the information provided above and the attached dataset, what three questions would you like to understand prior to conducting any analysis of the data?
# join the two datasets into a single data frame
dt <- full_join(trt,diag,by="patient_id") 
summary(dt)
##    patient_id   treatment_date        drug_code         diagnosis_date      
##  Min.   :2038   Min.   :0010-01-20   Length:1208        Min.   :0010-01-09  
##  1st Qu.:2961   1st Qu.:0011-09-17   Class :character   1st Qu.:0011-04-19  
##  Median :4692   Median :0012-06-22   Mode  :character   Median :0012-04-27  
##  Mean   :5093   Mean   :0012-05-07                      Mean   :0012-02-11  
##  3rd Qu.:6877   3rd Qu.:0013-01-23                      3rd Qu.:0012-11-15  
##  Max.   :9489   Max.   :0017-02-20                      Max.   :0013-08-23  
##                 NA's   :2                                                   
##  diagnosis_code   diagnosis        
##  Min.   :153.3   Length:1208       
##  1st Qu.:153.9   Class :character  
##  Median :174.4   Mode  :character  
##  Mean   :168.4                     
##  3rd Qu.:174.8                     
##  Max.   :174.9                     
## 
# for non-numeric columns, what are the distinct values?  
table(dt$drug_code)
## 
##   A   B   C   D 
## 394 394 398  20
table(dt$diagnosis)
## 
## Breast Cancer  Colon Cancer 
##           854           354

From the results, I was able to tell that there are four drugs and two cancer types.

dt[is.na(dt$treatment_date),]
##      patient_id treatment_date drug_code diagnosis_date diagnosis_code
## 1207       4256           <NA>      <NA>     0011-11-07          174.5
## 1208       4256           <NA>      <NA>     0011-11-07          174.8
##          diagnosis
## 1207 Breast Cancer
## 1208 Breast Cancer

Data analysis questions

  1. First, the clinic would like to know the distribution of cancer types across their patients. Please provide the clinic with this information.
# Because the treatment dataset contains several data points for each patient,
# will only use the diagnosis dataset
diag$diagnosis %>% table(exclude=NULL) %>% prop.table()*100
## .
## Breast Cancer  Colon Cancer 
##      68.42105      31.57895
  1. The clinic wants to know how long it takes for patients to start therapy after being diagnosed, which they consider to be helpful in understanding the quality of care for the patient. How long after being diagnosed do patients start treatment?
# First create a column for the # of days passed since the diagosis
dt <- mutate(dt,date_diff = treatment_date-diagnosis_date)
dt$date_diff <- as.numeric(dt$date_diff)
# now find out when the treatment started which is the smallest # of days to any treatment
dt_days <- dt %>% group_by(patient_id,diagnosis) %>% summarize(days_to_txt = min(date_diff))
dt_days_stats <- dt_days %>% group_by(diagnosis) %>% summarize(min = min(days_to_txt,na.rm=TRUE),
                                                               quartile1 = quantile(days_to_txt,0.25,na.rm=TRUE),
                                                               quartile2 = quantile(days_to_txt,0.5,na.rm=TRUE),
                                                               quartile3 = quantile(days_to_txt,0.75,na.rm=TRUE),
                                                               max = max(days_to_txt,na.rm=TRUE),
                                                               missing = sum(is.na(days_to_txt)))
dt_days_stats
## # A tibble: 2 x 7
##   diagnosis       min quartile1 quartile2 quartile3   max missing
##   <chr>         <dbl>     <dbl>     <dbl>     <dbl> <dbl>   <int>
## 1 Breast Cancer    -6      3.5          5         6    20       1
## 2 Colon Cancer      0      2.25         5         7   304       0
ggplot(dt_days,aes(x=diagnosis,y=days_to_txt)) + geom_boxplot()
## Warning: Removed 1 rows containing non-finite values (stat_boxplot).

  1. Which treatment regimens [i.e., drug(s)] do you think would be indicated to be used as first-line of treatment for breast cancer? What about colon cancer?
# Find the smallest days to treatment for each patient, a drug type and diagnosis type
dt_days_drug <- dt %>% 
  group_by(patient_id,diagnosis,drug_code) %>% 
  summarize(days_to_txt = min(date_diff))
ggplot(na.omit(dt_days_drug),aes(x=drug_code,y=days_to_txt)) + 
  geom_boxplot() + facet_grid(diagnosis ~ .,scales="free")

  1. Do the patients taking Regimen A vs. Regimen B as first-line therapy for breast cancer vary in terms of duration of therapy? Please include statistical tests and visualizations, as appropriate.
# Below is a function that returns which drug is the first-line drug 
# given the time points that each drug is administered
find_first_drug <- function(drug,d_diff){
  # first line drug is identified as the drug/s with the smallest treatment day
  prim_raw <-  drug[d_diff == min(d_diff)]
  # the same drug can be selected twice as a primary drug 
  # due to the drug prescribed to multiple types of breast cancer
  # so, remove duplicates in the list
  # sort() is to fix c("B","A") to c("A","B")
  prim <- sort(unique(prim_raw)) 
  return(paste(prim,collapse=","))
}
# Apply that function to the breast cancer subset of the data 
# Also calculate the duration of the treatment (taken from the largest number of days for any treatment)
dt_brca_duration <- dt %>% 
  filter(diagnosis=="Breast Cancer") %>%
  group_by(patient_id) %>%
  summarize(duration = max(date_diff),
            first_drug = find_first_drug(drug_code,date_diff))
# show the distribution of treatment duration under the regimen A vs B
# excluded the patients that 1) received A and B at the same time 2) received C or D
dt_brca_duration_AB <- filter(dt_brca_duration,first_drug %in% c("A","B"))
dt_brca_duration_AB
## # A tibble: 11 x 3
##    patient_id duration first_drug
##         <int>    <dbl> <chr>     
##  1       2120       52 B         
##  2       2238     1001 B         
##  3       2475        0 B         
##  4       2607       94 B         
##  5       2720       87 B         
##  6       2762       79 B         
##  7       3025       83 B         
##  8       7937       80 A         
##  9       7976       82 A         
## 10       8480       83 A         
## 11       8827       90 A
dt_brca_duration_AB %>%  
  ggplot(aes(y = duration,x = first_drug)) + geom_boxplot()

dt[dt$patient_id == "2238",]
##     patient_id treatment_date drug_code diagnosis_date diagnosis_code
## 2         2238     0010-01-21         B     0010-01-21          174.9
## 18        2238     0010-01-31         B     0010-01-21          174.9
## 30        2238     0010-02-10         B     0010-01-21          174.9
## 42        2238     0010-02-20         B     0010-01-21          174.9
## 56        2238     0010-03-02         B     0010-01-21          174.9
## 62        2238     0010-03-12         B     0010-01-21          174.9
## 70        2238     0010-03-22         B     0010-01-21          174.9
## 76        2238     0010-04-01         B     0010-01-21          174.9
## 695       2238     0012-09-18         B     0010-01-21          174.9
## 713       2238     0012-09-28         B     0010-01-21          174.9
## 729       2238     0012-10-08         B     0010-01-21          174.9
## 743       2238     0012-10-18         B     0010-01-21          174.9
##         diagnosis date_diff
## 2   Breast Cancer         0
## 18  Breast Cancer        10
## 30  Breast Cancer        20
## 42  Breast Cancer        30
## 56  Breast Cancer        40
## 62  Breast Cancer        50
## 70  Breast Cancer        60
## 76  Breast Cancer        70
## 695 Breast Cancer       971
## 713 Breast Cancer       981
## 729 Breast Cancer       991
## 743 Breast Cancer      1001
dt_brca_duration_AB_clean <- filter(dt_brca_duration_AB,patient_id != "2238")
dt_brca_duration_AB_clean %>% 
  group_by(first_drug) %>% 
  summarize(n_patients = n(),
            minimum = min(duration),
            maximum = max(duration),
            median = median(duration))
## # A tibble: 2 x 5
##   first_drug n_patients minimum maximum median
##   <chr>           <int>   <dbl>   <dbl>  <dbl>
## 1 A                   4      80      90   82.5
## 2 B                   6       0      94   81
dt_brca_duration_AB_clean %>%  
  ggplot(aes(y = duration,x = first_drug)) + geom_boxplot()

# sample size is too small to use t-test, since the distribution can't be seen as normal
# so, will use permutation test(note that this requires coin package)
independence_test(duration ~ as.factor(first_drug),
                  data = dt_brca_duration_AB_clean)
## 
##  Asymptotic General Independence Test
## 
## data:  duration by as.factor(first_drug) (A, B)
## Z = 0.99088, p-value = 0.3217
## alternative hypothesis: two.sided