Libraries

library("rcompanion")
library("data.table")

Load Data

## All Physicians' notes in MIMIC
notes <- fread("~/goals_of_care/external_validation/PHYSICIAN_NOTEEVENTS.csv", header = T, stringsAsFactors = F)
## 
Read 0.0% of 909800 rows
Read 1.1% of 909800 rows
Read 3.3% of 909800 rows
Read 4.4% of 909800 rows
Read 6.6% of 909800 rows
Read 8.8% of 909800 rows
Read 11.0% of 909800 rows
Read 13.2% of 909800 rows
Read 15.4% of 909800 rows
Read 141624 rows and 11 (of 11) columns from 0.986 GB file in 00:00:17
# Admissions, Patients, ICU Stay tables merged
mim <- fread("~/goals_of_care/external_validation/adm_pat_stay_merged.csv", header = T, stringsAsFactors = F)
## 
Read 97.5% of 61532 rows
Read 61532 rows and 33 (of 33) columns from 0.021 GB file in 00:00:03
dat <- merge(notes, mim, by = c("SUBJECT_ID", "HADM_ID"))
cat(length(unique(dat$SUBJECT_ID)), "Potentially Eligible Patients with Physicians' Notes", sep = ' ')
## 7564 Potentially Eligible Patients with Physicians' Notes

Utility Functions

ages() will accept the merged MIMIC-III ADMISSIONS and PATIENTS tables, where both ADMISSIONS.ADMITTIME and PATIENTS.DOB are already converted to numeric format (from Date), and will add AGE to the data.frame, imputing the median age of nonagenarians in MIMIC-III as 91.4.

ages <- function(dat){
    dat$AGE <- (dat$ADMITTIME - dat$DOB)/365
    #Correct for patient ages > 90 by imputing median age (91.4)
    dat[(dat$AGE >= 90),]$AGE <- 91.4
    return(dat)
}

admissionCount() will accept the ADMISSIONS table, and will order by SUBJECT_ID and ADMITTIME to determine the admission sequence numbers for each patient.

admissionCount <- function(dat){
  #order by subject id, then admittime
  dat <- dat[with(dat, order(SUBJECT_ID, ADMITTIME)),]
  
  tmp <- data.frame()
  res <- data.frame()
  for (subj in unique(dat$SUBJECT_ID)){
    tmp <- dat[(dat$SUBJECT_ID == subj),]
    tmp$ADMISSION_NUMBER <- seq.int(nrow(tmp))
    res <- rbind(res, tmp)
  }

  return(res)  
}

clean_dates() will accept a dataframe and convert dates to numeric values (in days).

clean_dates <- function(dat){
  #ADMISSIONS TABLE
  dat$ADMITTIME <- as.numeric(as.Date(dat$ADMITTIME, "%Y-%m-%d %H:%M:%S"))
  dat$DISCHTIME <- as.numeric(as.Date(dat$DISCHTIME, "%Y-%m-%d %H:%M:%S"))
  #PATIENTS TABLE
  dat$DOD <- as.numeric(as.Date(dat$DOD, "%Y-%m-%d %H:%M:%S"))
  dat$DOB <- as.numeric(as.Date(dat$DOB, "%Y-%m-%d %H:%M:%S"))
  #NOTEEVENTS
  dat$CHARTTIME <- as.numeric(as.Date(dat$CHARTTIME, "%Y-%m-%d %H:%M:%S"))
  
  return(dat)
}

plotDat() will plot data.

plotDat <- function(dat, column, bs, mn, xl, yl){
  tmp <- as.matrix(table(dat[[column]], dat[["cohort"]]))
  prop <- prop.table(tmp, margin = 2)#2 for column-wise proportions
  par(mar = c(5.0, 4.0, 4.0, 15), xpd = TRUE)
  barplot(prop, col = cm.colors(length(rownames(prop))), beside = bs,width = 2, main = mn, xlab = xl, ylab = yl)
  legend("topright", inset = c(-0.90,0), fill = cm.colors(length(rownames(prop))), legend=rownames(prop))
}

Initial data generation

dat <- clean_dates(dat)
dat <- ages(dat)
#dat <- admissionCount(dat)

#Days until death value
dat$DAYS_UNTIL_DEATH <-  dat$DOD - dat$ADMITTIME
#Look at distribution
hist(dat$DAYS_UNTIL_DEATH)

summary(dat$DAYS_UNTIL_DEATH)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##     0.0    27.0    76.0   232.6   279.0  1739.0   77494
#Note: NA's occur when patient is still alive

#Time since admission value
dat$TIME_SINCE_ADMIT <- as.numeric(as.Date(dat$CHARTDATE, "%Y-%m-%d")) - dat$ADMITTIME
#Look at distribution
hist(dat$TIME_SINCE_ADMIT)

summary(dat$TIME_SINCE_ADMIT)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    2.00    5.00   11.02   13.00  131.00
#Generate TIME_UNTIL_DISCH variable
dat$TIME_UNTIL_DISCH <- dat$DISCHTIME - dat$ADMITTIME
hist(dat$TIME_UNTIL_DISCH)

summary(dat$TIME_UNTIL_DISCH)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    8.00   15.00   23.98   30.00  169.00

Annotation Set

cat(length(unique(dat$SUBJECT_ID)), "Potentially eligible patients with Physicians' Notes.", sep = ' ')
## 7564 Potentially eligible patients with Physicians' Notes.
##Ensure patients passed away during hospital visit
tmp <- dat[(dat$HOSPITAL_EXPIRE_FLAG == 1),]
cat(length(unique(tmp$SUBJECT_ID)), "Potentially eligible patients who died In hospital.", sep = ' ')
## 886 Potentially eligible patients who died In hospital.
cat((length(unique(dat$SUBJECT_ID)) - length(unique(tmp$SUBJECT_ID))), "Patients omitted due to hospital survival." )
## 6678 Patients omitted due to hospital survival.
##Ensure notes were documented within 48Hrs of admission
tmp_two <- tmp[(tmp$TIME_SINCE_ADMIT <= 2),]
cat(length(unique(tmp_two$SUBJECT_ID)), "Potentially Eligible Patients Who Died In Hospital.", sep = ' ')
## 729 Potentially Eligible Patients Who Died In Hospital.
cat((length(unique(tmp$SUBJECT_ID)) - length(unique(tmp_two$SUBJECT_ID))), "Patients omitted as they did not have physicians' notes documented within the first 48Hrs of admission." )
## 157 Patients omitted as they did not have physicians' notes documented within the first 48Hrs of admission.

Investigate Patients Without Documentation Within 48hrs

#Convert column names to lower
colnames(tmp) <- tolower(colnames(tmp))
#tmp_two contains our documented cohort
colnames(tmp_two) <- tolower(colnames(tmp_two))

#Look into patients who did not have physicians notes documented
check <- tmp[!(tmp$subject_id %in% tmp_two$subject_id),]

#Look at unique patients
length(unique(check$subject_id))
## [1] 157
#Number of observations
nrow(check)
## [1] 9337
#Add cohort information
check$cohort <- rep("No Notes", each = nrow(check))
tmp_two$cohort <- rep("Notes", each = nrow(tmp_two))

#bind data back for statistics
check <- rbind(check, tmp_two)

#Order data
check <- check[with(check, order(subject_id, hadm_id, admittime)),]

Notes Documented/Undocumented within 48Hrs of Admission Statistics(Hospital Admission Level Data)

#Hold the data for later use
test <- check
#Grab first admission
test <- test[!duplicated(test$hadm_id)]
nrow(test)
## [1] 886
table(test$cohort)
## 
## No Notes    Notes 
##      157      729
#Check time since admission, days until death, time until discharge, age, and length of stay
boxplot(test$time_since_admit ~ test$cohort, main = "time since admission\n(until patient note documentation) by cohort")

t.test(test$time_since_admit ~ test$cohort)
## 
##  Welch Two Sample t-test
## 
## data:  test$time_since_admit by test$cohort
## t = 10.679, df = 156.14, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  10.84200 15.76319
## sample estimates:
## mean in group No Notes    mean in group Notes 
##             14.2929936              0.9903978
wilcox.test(test$time_since_admit ~ test$cohort)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  test$time_since_admit by test$cohort
## W = 114450, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
#The difference makes sence here because they were selected as such

boxplot(test$days_until_death ~ test$cohort, 
        main = "days until death by cohort",
        ylab = "days until death",
        xlab = "Cohort")

t.test(test$days_until_death ~ test$cohort)
## 
##  Welch Two Sample t-test
## 
## data:  test$days_until_death by test$cohort
## t = 8.0465, df = 167.68, p-value = 1.484e-13
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  12.69546 20.95057
## sample estimates:
## mean in group No Notes    mean in group Notes 
##              24.643312               7.820302
wilcox.test(test$days_until_death ~ test$cohort)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  test$days_until_death by test$cohort
## W = 94332, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
#Days until death is significant

## LOS
boxplot(test$los ~ test$cohort, 
        main = "ICU Length of Stay (days) by cohort", 
        xlab = "Cohort", 
        ylab = "Length of Stay (days)")

t.test(test$los ~ test$cohort)
## 
##  Welch Two Sample t-test
## 
## data:  test$los by test$cohort
## t = 2.5746, df = 178.26, p-value = 0.01085
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.577409 4.367754
## sample estimates:
## mean in group No Notes    mean in group Notes 
##               7.573550               5.100968
wilcox.test(test$los ~ test$cohort)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  test$los by test$cohort
## W = 64838, p-value = 0.008882
## alternative hypothesis: true location shift is not equal to 0
#Length of stay is significant...

## Admission Type
plotDat(test, "admission_type", bs = F, mn = "Admission Type Proportions by Cohort", xl = "Cohort", yl = "Proportion")

tmp <- table(test$admission_type, test$cohort)
tmp
##            
##             No Notes Notes
##   ELECTIVE        13    10
##   EMERGENCY      139   712
##   URGENT           5     7
chisq.test(tmp)
## 
##  Pearson's Chi-squared test
## 
## data:  tmp
## X-squared = 29.592, df = 2, p-value = 3.752e-07
pairwiseNominalIndependence(
  as.matrix(tmp), 
  fisher = F, gtest = F, chisq = T, method = "fdr")
##             Comparison  p.Chisq p.adj.Chisq
## 1 ELECTIVE : EMERGENCY 2.15e-06    6.45e-06
## 2    ELECTIVE : URGENT 6.32e-01    6.32e-01
## 3   EMERGENCY : URGENT 5.15e-02    7.72e-02
##Elective and Urgent are statistically significant in comparison

## Admission Type
plotDat(test, "admission_location", bs = F, mn = "Admission Location Proportions by Cohort", xl = "Cohort", yl = "Proportion")

tmp <- table(test$admission_location, test$cohort)
tmp
##                            
##                             No Notes Notes
##   CLINIC REFERRAL/PREMATURE       18    13
##   EMERGENCY ROOM ADMIT            75   565
##   PHYS REFERRAL/NORMAL DELI       29    15
##   TRANSFER FROM HOSP/EXTRAM       32   132
##   TRANSFER FROM OTHER HEALT        2     1
##   TRANSFER FROM SKILLED NUR        1     3
chisq.test(tmp)
## 
##  Pearson's Chi-squared test
## 
## data:  tmp
## X-squared = 125.93, df = 5, p-value < 2.2e-16
pairwiseNominalIndependence(
  as.matrix(tmp), 
  fisher = F, gtest = F, chisq = T, method = "fdr")
##                                               Comparison  p.Chisq
## 1       CLINIC REFERRAL/PREMATURE : EMERGENCY ROOM ADMIT 2.10e-12
## 2  CLINIC REFERRAL/PREMATURE : PHYS REFERRAL/NORMAL DELI 6.53e-01
## 3  CLINIC REFERRAL/PREMATURE : TRANSFER FROM HOSP/EXTRAM 1.84e-05
## 4  CLINIC REFERRAL/PREMATURE : TRANSFER FROM OTHER HEALT 1.00e+00
## 5  CLINIC REFERRAL/PREMATURE : TRANSFER FROM SKILLED NUR 4.74e-01
## 6       EMERGENCY ROOM ADMIT : PHYS REFERRAL/NORMAL DELI 2.89e-21
## 7       EMERGENCY ROOM ADMIT : TRANSFER FROM HOSP/EXTRAM 1.27e-02
## 8       EMERGENCY ROOM ADMIT : TRANSFER FROM OTHER HEALT 4.20e-02
## 9       EMERGENCY ROOM ADMIT : TRANSFER FROM SKILLED NUR 9.65e-01
## 10 PHYS REFERRAL/NORMAL DELI : TRANSFER FROM HOSP/EXTRAM 6.02e-09
## 11 PHYS REFERRAL/NORMAL DELI : TRANSFER FROM OTHER HEALT 1.00e+00
## 12 PHYS REFERRAL/NORMAL DELI : TRANSFER FROM SKILLED NUR 2.81e-01
## 13 TRANSFER FROM HOSP/EXTRAM : TRANSFER FROM OTHER HEALT 1.98e-01
## 14 TRANSFER FROM HOSP/EXTRAM : TRANSFER FROM SKILLED NUR 1.00e+00
## 15 TRANSFER FROM OTHER HEALT : TRANSFER FROM SKILLED NUR 7.41e-01
##    p.adj.Chisq
## 1     1.58e-11
## 2     9.80e-01
## 3     6.90e-05
## 4     1.00e+00
## 5     7.90e-01
## 6     4.34e-20
## 7     3.81e-02
## 8     1.05e-01
## 9     1.00e+00
## 10    3.01e-08
## 11    1.00e+00
## 12    5.27e-01
## 13    4.24e-01
## 14    1.00e+00
## 15    1.00e+00
##Very different cohorts

## Discharge Location to show they died in-hospital
table(test$discharge_location, test$cohort)
##               
##                No Notes Notes
##   DEAD/EXPIRED      157   729
##First care unit
plotDat(test, "first_careunit", bs = F, mn = "First Care Unit by Cohort", xl = "Cohort", yl = "Proportion")

tmp <- table(test$first_careunit, test$cohort)
tmp
##        
##         No Notes Notes
##   CCU         18    79
##   CSRU         7    30
##   MICU       104   405
##   SICU        18   132
##   TSICU       10    83
chisq.test(tmp)
## 
##  Pearson's Chi-squared test
## 
## data:  tmp
## X-squared = 9.1136, df = 4, p-value = 0.05832
pairwiseNominalIndependence(
  as.matrix(tmp), 
  fisher = F, gtest = F, chisq = T, method = "fdr")
##      Comparison p.Chisq p.adj.Chisq
## 1    CCU : CSRU  1.0000       1.000
## 2    CCU : MICU  0.7760       1.000
## 3    CCU : SICU  0.2140       0.535
## 4   CCU : TSICU  0.1890       0.535
## 5   CSRU : MICU  0.9930       1.000
## 6   CSRU : SICU  0.4020       0.670
## 7  CSRU : TSICU  0.3380       0.670
## 8   MICU : SICU  0.0266       0.204
## 9  MICU : TSICU  0.0407       0.204
## 10 SICU : TSICU  0.9290       1.000
##TSICU, SICU relative to MICU almost statistically significant

##Last care unit
plotDat(test, "last_careunit", bs = F, mn = "Last Care Unit by Cohort", xl = "Cohort", yl = "Proportion")

tmp <- table(test$last_careunit, test$cohort)
tmp
##        
##         No Notes Notes
##   CCU         18    76
##   CSRU         6    23
##   MICU       106   424
##   SICU        18   130
##   TSICU        9    76
chisq.test(tmp)
## 
##  Pearson's Chi-squared test
## 
## data:  tmp
## X-squared = 8.2975, df = 4, p-value = 0.08127
pairwiseNominalIndependence(
  as.matrix(tmp), 
  fisher = F, gtest = F, chisq = T, method = "fdr")
##      Comparison p.Chisq p.adj.Chisq
## 1    CCU : CSRU  1.0000       1.000
## 2    CCU : MICU  0.9600       1.000
## 3    CCU : SICU  0.1920       0.480
## 4   CCU : TSICU  0.1650       0.480
## 5   CSRU : MICU  1.0000       1.000
## 6   CSRU : SICU  0.3520       0.587
## 7  CSRU : TSICU  0.2840       0.568
## 8   MICU : SICU  0.0393       0.276
## 9  MICU : TSICU  0.0553       0.276
## 10 SICU : TSICU  0.8820       1.000
##TSICU, SICU relative to MICU almost statistically significant


##Dbsource
plotDat(test, "dbsource", bs = F, mn = "DBSource by Cohort\n(Hospital Admission Level)", xl = "Cohort", yl = "Proportion")

tmp <- table(test$dbsource, test$cohort)
tmp
##             
##              No Notes Notes
##   both             10     6
##   carevue           3     2
##   metavision      144   721
chisq.test(tmp)
## 
##  Pearson's Chi-squared test
## 
## data:  tmp
## X-squared = 28.818, df = 2, p-value = 5.523e-07
pairwiseNominalIndependence(
  as.matrix(tmp), 
  fisher = F, gtest = F, chisq = T, method = "fdr")
##             Comparison  p.Chisq p.adj.Chisq
## 1       both : carevue 1.00e+00    1.00e+00
## 2    both : metavision 8.47e-06    2.54e-05
## 3 carevue : metavision 4.76e-02    7.14e-02
#Metavision is associated with documentation of notes...

Patient Level Data

test <- check
#Remove duplicate patients
test <- test[!duplicated(test$subject_id),]

table(test$cohort)
## 
## No Notes    Notes 
##      157      729
##Age
boxplot(test$age ~ test$cohort, 
        main = "age by cohort",
        ylab = "age (years)",
        xlab = "Cohort")

t.test(test$age ~ test$cohort)
## 
##  Welch Two Sample t-test
## 
## data:  test$age by test$cohort
## t = -3.1325, df = 232.13, p-value = 0.001956
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -6.655805 -1.515984
## sample estimates:
## mean in group No Notes    mean in group Notes 
##               67.29659               71.38248
wilcox.test(test$age ~ test$cohort)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  test$age by test$cohort
## W = 46998, p-value = 0.0004355
## alternative hypothesis: true location shift is not equal to 0
#Age is significantly different-- younger patients

##Gender
plotDat(test, "gender", bs = F, mn = "Gender by Cohort", xl = "Cohort", yl = "Proportion")

tmp <- table(test$gender, test$cohort)
tmp
##    
##     No Notes Notes
##   F       78   340
##   M       79   389
prop.test(tmp)
## 
##  2-sample test for equality of proportions with continuity
##  correction
## 
## data:  tmp
## X-squared = 0.36547, df = 1, p-value = 0.5455
## alternative hypothesis: two.sided
## 95 percent confidence interval:
##  -0.03492877  0.07052767
## sample estimates:
##    prop 1    prop 2 
## 0.1866029 0.1688034
#No significant difference in gender


##Insurance
plotDat(test, "insurance", bs = F, mn = "Insurance by Cohort", xl = "Cohort", yl = "Proportion")

tmp <- table(test$insurance, test$cohort)
tmp
##             
##              No Notes Notes
##   Government        3    11
##   Medicaid         11    50
##   Medicare         99   518
##   Private          44   142
##   Self Pay          0     8
chisq.test(tmp)
## 
##  Pearson's Chi-squared test
## 
## data:  tmp
## X-squared = 7.5408, df = 4, p-value = 0.1099
pairwiseNominalIndependence(
  as.matrix(tmp), 
  fisher = F, gtest = F, chisq = T, method = "fdr")
##               Comparison p.Chisq p.adj.Chisq
## 1  Government : Medicaid  1.0000       1.000
## 2  Government : Medicare  0.8620       1.000
## 3   Government : Private  1.0000       1.000
## 4  Government : Self Pay  0.4450       0.767
## 5    Medicaid : Medicare  0.8260       1.000
## 6     Medicaid : Private  0.4600       0.767
## 7    Medicaid : Self Pay  0.4260       0.767
## 8     Medicare : Private  0.0233       0.233
## 9    Medicare : Self Pay  0.4550       0.767
## 10    Private : Self Pay  0.2570       0.767
#No significant difference in insurance

##Ethnicity
##Clean ethnicity
test[(grepl("WHITE|PORTUGUESE", test$ethnicity)),]$ethnicity <- "WHITE" 
test[(grepl("ASIAN", test$ethnicity)),]$ethnicity <- "ASIAN" 
test[(grepl("BLACK", test$ethnicity)),]$ethnicity <- "BLACK" 
test[(grepl("HISPANIC", test$ethnicity)),]$ethnicity <- "HISPANIC"
test[(grepl("MIDDLE|NATIVE|MULTI|DECLINED|UNABLE|OTHER|NOT",test$ethnicity)),]$ethnicity <- "UNKNOWN"

plotDat(test, "ethnicity", bs = F, mn = "Ethnicity by Cohort", xl = "Cohort", yl = "Proportion")

tmp <- table(test$ethnicity, test$cohort)
tmp
##           
##            No Notes Notes
##   ASIAN           3    20
##   BLACK          21    65
##   HISPANIC        2    20
##   UNKNOWN        14    71
##   WHITE         117   553
chisq.test(tmp)
## 
##  Pearson's Chi-squared test
## 
## data:  tmp
## X-squared = 4.2367, df = 4, p-value = 0.3749
pairwiseNominalIndependence(
  as.matrix(tmp), 
  fisher = F, gtest = F, chisq = T, method = "fdr")
##            Comparison p.Chisq p.adj.Chisq
## 1       ASIAN : BLACK   0.376       0.922
## 2    ASIAN : HISPANIC   1.000       1.000
## 3     ASIAN : UNKNOWN   0.938       1.000
## 4       ASIAN : WHITE   0.787       1.000
## 5    BLACK : HISPANIC   0.202       0.907
## 6     BLACK : UNKNOWN   0.272       0.907
## 7       BLACK : WHITE   0.155       0.907
## 8  HISPANIC : UNKNOWN   0.596       0.993
## 9    HISPANIC : WHITE   0.461       0.922
## 10    UNKNOWN : WHITE   0.940       1.000
##No significant difference in ethnicity


##Dbsource
plotDat(test, "dbsource", bs = F, mn = "DBSource by Cohort\n(Patient Level)", xl = "Cohort", yl = "Proportion")

tmp <- table(test$dbsource, test$cohort)
tmp
##             
##              No Notes Notes
##   both             10     6
##   carevue           3     2
##   metavision      144   721
chisq.test(tmp)
## 
##  Pearson's Chi-squared test
## 
## data:  tmp
## X-squared = 28.818, df = 2, p-value = 5.523e-07
pairwiseNominalIndependence(
  as.matrix(tmp), 
  fisher = F, gtest = F, chisq = T, method = "fdr")
##             Comparison  p.Chisq p.adj.Chisq
## 1       both : carevue 1.00e+00    1.00e+00
## 2    both : metavision 8.47e-06    2.54e-05
## 3 carevue : metavision 4.76e-02    7.14e-02
#Metavision associated with documentation of notes