library(dplyr)
library(reshape2)
library(ggplot2)
library(coin) # for permutation test
# ------------------- Dataset import and some cleaning ------------------
# Read the diagnosis and treatment tables; text columns stay character.
diag <- read.csv("./data/Patient_Diagnosis.csv", stringsAsFactors = FALSE)
trt <- read.csv("./data/Patient_Treatment.csv", stringsAsFactors = FALSE)
# Some cleaning
# Convert the date columns to Date.
# The recorded outputs further down show years 0010-0017, i.e. the files
# evidently store two-digit years (m/d/yy) that "%Y" read literally as the
# years 10-17. "%y" maps them to 2010-2017 instead. Day differences between
# dates are expected to be unchanged; only the printed calendar years move.
# Recorded outputs elsewhere in this file predate this fix.
diag$diagnosis_date <- as.Date(diag$diagnosis_date, format = "%m/%d/%y")
trt$treatment_date <- as.Date(trt$treatment_date, format = "%m/%d/%y")
# Remove duplicate rows
diag <- unique(diag)
trt <- unique(trt)
# Combine treatments and diagnoses into one data frame keyed on patient_id.
# A full join keeps patients that are present in only one of the two tables.
# NOTE(review): both tables can hold several rows per patient, so each
# treatment row is repeated once per diagnosis row of that patient.
dt <- trt %>%
  full_join(diag, by = "patient_id")
summary(dt)
## patient_id treatment_date drug_code diagnosis_date
## Min. :2038 Min. :0010-01-20 Length:1208 Min. :0010-01-09
## 1st Qu.:2961 1st Qu.:0011-09-17 Class :character 1st Qu.:0011-04-19
## Median :4692 Median :0012-06-22 Mode :character Median :0012-04-27
## Mean :5093 Mean :0012-05-07 Mean :0012-02-11
## 3rd Qu.:6877 3rd Qu.:0013-01-23 3rd Qu.:0012-11-15
## Max. :9489 Max. :0017-02-20 Max. :0013-08-23
## NA's :2
## diagnosis_code diagnosis
## Min. :153.3 Length:1208
## 1st Qu.:153.9 Class :character
## Median :174.4 Mode :character
## Mean :168.4
## 3rd Qu.:174.8
## Max. :174.9
##
# For the non-numeric columns, list the distinct values and their counts.
table(dt[["drug_code"]])
##
## A B C D
## 394 394 398 20
table(dt[["diagnosis"]])
##
## Breast Cancer Colon Cancer
## 854 354
# From the tables above, I was able to tell that there are four drug codes
# (A, B, C and D) and two cancer types (Breast Cancer and Colon Cancer).
# Show the joined rows that carry no treatment date.
dt[which(is.na(dt[["treatment_date"]])), ]
## patient_id treatment_date drug_code diagnosis_date diagnosis_code
## 1207 4256 <NA> <NA> 0011-11-07 174.5
## 1208 4256 <NA> <NA> 0011-11-07 174.8
## diagnosis
## 1207 Breast Cancer
## 1208 Breast Cancer
# The treatment table holds several records per patient, so the cancer-type
# percentages are computed from the diagnosis table alone.
# NOTE(review): the diagnosis table can also hold more than one row per
# patient (patient 4256 above has two diagnosis codes), so these are
# percentages of diagnosis rows rather than of patients - confirm intent.
100 * (diag$diagnosis %>% table(exclude = NULL) %>% prop.table())
## .
## Breast Cancer Colon Cancer
## 68.42105 31.57895
# Days elapsed between the diagnosis and each treatment record, stored as a
# plain number (difference of the two Date columns).
dt <- dt %>%
  mutate(date_diff = as.numeric(treatment_date - diagnosis_date))
# Treatment start = the smallest number of days to any treatment, taken per
# patient and diagnosis.
dt_days <- dt %>%
  group_by(patient_id, diagnosis) %>%
  summarize(days_to_txt = min(date_diff))
# Minimum, quartiles and maximum of days-to-treatment per diagnosis, plus the
# number of missing values (ignored by the other statistics).
dt_days_stats <- dt_days %>%
  group_by(diagnosis) %>%
  summarize(
    min = min(days_to_txt, na.rm = TRUE),
    quartile1 = quantile(days_to_txt, 0.25, na.rm = TRUE),
    quartile2 = quantile(days_to_txt, 0.5, na.rm = TRUE),
    quartile3 = quantile(days_to_txt, 0.75, na.rm = TRUE),
    max = max(days_to_txt, na.rm = TRUE),
    missing = sum(is.na(days_to_txt))
  )
dt_days_stats
## # A tibble: 2 x 7
## diagnosis min quartile1 quartile2 quartile3 max missing
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <int>
## 1 Breast Cancer -6 3.5 5 6 20 1
## 2 Colon Cancer 0 2.25 5 7 304 0
# NOTE(review): the minimum of -6 means at least one treatment is dated
# before its diagnosis - worth checking in the raw data.
dt_days %>%
  ggplot(aes(x = diagnosis, y = days_to_txt)) +
  geom_boxplot()
## Warning: Removed 1 rows containing non-finite values (stat_boxplot).
# Earliest treatment day for every patient / diagnosis / drug combination.
dt_days_drug <- dt %>%
  group_by(patient_id, diagnosis, drug_code) %>%
  summarize(days_to_txt = min(date_diff))
# Boxplot per drug, one panel row per diagnosis with independent scales;
# rows containing missing values are dropped before plotting.
dt_days_drug %>%
  na.omit() %>%
  ggplot(aes(x = drug_code, y = days_to_txt)) +
  geom_boxplot() +
  facet_grid(diagnosis ~ ., scales = "free")
# Identify the first-line drug(s) from the time points at which each drug
# was administered.
#
# Args:
#   drug:   vector of drug codes, one entry per treatment record.
#   d_diff: numeric vector parallel to `drug`: the treatment day of each
#           record (e.g. days since diagnosis).
#
# Returns:
#   A single string with the distinct drug code(s) given on the smallest
#   non-missing treatment day, sorted and comma-separated (e.g. "A,B").
#   Returns "" when there is no non-missing treatment day at all.
find_first_drug <- function(drug, d_diff) {
  # Ignore records without a treatment day: a single NA would otherwise make
  # min() NA and wipe out the result even when valid time points exist.
  known <- !is.na(d_diff)
  if (!any(known)) {
    return("")
  }
  earliest <- min(d_diff[known])
  # The first-line drug is the drug (or drugs) given on the earliest day.
  prim_raw <- drug[known & d_diff == earliest]
  # The same drug can show up more than once on that day (e.g. prescribed for
  # several types of breast cancer), so drop duplicates; sort() turns
  # c("B", "A") into c("A", "B") so the label is order-independent.
  prim <- sort(unique(prim_raw))
  paste(prim, collapse = ",")
}
# For every breast cancer patient, derive
#   - duration:   the largest number of days from diagnosis to any treatment
#   - first_drug: the first-line drug(s) as reported by find_first_drug()
# NOTE(review): duration is counted from the diagnosis date and spans all
# drugs, not only the first-line drug - confirm this is the intended measure.
dt_brca_duration <- dt %>%
  filter(diagnosis == "Breast Cancer") %>%
  group_by(patient_id) %>%
  summarize(
    duration = max(date_diff),
    first_drug = find_first_drug(drug_code, date_diff)
  )
# Compare treatment duration under regimen A vs regimen B only. Patients who
# 1) received A and B at the same time or 2) started on C or D are excluded.
dt_brca_duration_AB <- dt_brca_duration %>%
  filter(first_drug %in% c("A", "B"))
dt_brca_duration_AB
## # A tibble: 11 x 3
## patient_id duration first_drug
## <int> <dbl> <chr>
## 1 2120 52 B
## 2 2238 1001 B
## 3 2475 0 B
## 4 2607 94 B
## 5 2720 87 B
## 6 2762 79 B
## 7 3025 83 B
## 8 7937 80 A
## 9 7976 82 A
## 10 8480 83 A
## 11 8827 90 A
ggplot(dt_brca_duration_AB, aes(x = first_drug, y = duration)) +
  geom_boxplot()
# Patient 2238 stands out with a duration of 1001 days; list all joined
# records of that patient.
dt[dt[["patient_id"]] == "2238", ]
## patient_id treatment_date drug_code diagnosis_date diagnosis_code
## 2 2238 0010-01-21 B 0010-01-21 174.9
## 18 2238 0010-01-31 B 0010-01-21 174.9
## 30 2238 0010-02-10 B 0010-01-21 174.9
## 42 2238 0010-02-20 B 0010-01-21 174.9
## 56 2238 0010-03-02 B 0010-01-21 174.9
## 62 2238 0010-03-12 B 0010-01-21 174.9
## 70 2238 0010-03-22 B 0010-01-21 174.9
## 76 2238 0010-04-01 B 0010-01-21 174.9
## 695 2238 0012-09-18 B 0010-01-21 174.9
## 713 2238 0012-09-28 B 0010-01-21 174.9
## 729 2238 0012-10-08 B 0010-01-21 174.9
## 743 2238 0012-10-18 B 0010-01-21 174.9
## diagnosis date_diff
## 2 Breast Cancer 0
## 18 Breast Cancer 10
## 30 Breast Cancer 20
## 42 Breast Cancer 30
## 56 Breast Cancer 40
## 62 Breast Cancer 50
## 70 Breast Cancer 60
## 76 Breast Cancer 70
## 695 Breast Cancer 971
## 713 Breast Cancer 981
## 729 Breast Cancer 991
## 743 Breast Cancer 1001
# Leave patient 2238 out of the A-vs-B comparison.
# NOTE(review): the listing shows no treatment between day 70 and day 971,
# presumably the reason this patient is treated as an outlier - confirm.
dt_brca_duration_AB_clean <- dt_brca_duration_AB %>%
  filter(patient_id != "2238")
# Patient count and duration range / median per first-line drug.
dt_brca_duration_AB_clean %>%
  group_by(first_drug) %>%
  summarize(
    n_patients = n(),
    minimum = min(duration),
    maximum = max(duration),
    median = median(duration)
  )
## # A tibble: 2 x 5
## first_drug n_patients minimum maximum median
## <chr> <int> <dbl> <dbl> <dbl>
## 1 A 4 80 90 82.5
## 2 B 6 0 94 81
ggplot(dt_brca_duration_AB_clean, aes(x = first_drug, y = duration)) +
  geom_boxplot()
# The sample is too small for a t-test, since the distribution can't be
# assumed to be normal, so a permutation test is used instead (this requires
# the coin package).
# independence_test() falls back to the asymptotic approximation by default,
# which is not a permutation test and is unreliable for so few patients.
# distribution = "exact" requests the exact conditional (permutation)
# distribution, which coin supports for univariate two-sample problems.
independence_test(duration ~ as.factor(first_drug),
                  data = dt_brca_duration_AB_clean,
                  distribution = "exact")
# NOTE: the output recorded below was produced by the earlier run with the
# asymptotic default; re-run to refresh the statistic and p-value.
##
## Asymptotic General Independence Test
##
## data: duration by as.factor(first_drug) (A, B)
## Z = 0.99088, p-value = 0.3217
## alternative hypothesis: two.sided