Libraries & Utility Functions

library("tidyverse")
library("tidygraph")
library("ggraph")
library("corrr")
library("rcompanion") #Pairwise nominal

## allDup will return all duplicated values, without dropping any
## i.e. The directionality of duplicated() is removed by
## performing the operation on the vector from both directions
allDup <- function(val){
    duplicated(val) | duplicated(val, fromLast = TRUE)
}

## plotDat
plotDat <- function(dat, column, bs, mn, xl, yl){
  tmp <- as.matrix(table(dat[[column]], dat[["CG_DESCRIPTION"]]))
  prop <- prop.table(tmp, margin = 2)#2 for column-wise proportions
  par(mar = c(5.0, 4.0, 4.0, 15), xpd = TRUE)
  barplot(prop, col = cm.colors(length(rownames(prop))), beside = bs,width = 2, main = mn, xlab = xl, ylab = yl)
  legend("topright", inset = c(-0.90,0), fill = cm.colors(length(rownames(prop))), legend=rownames(prop))
}

Load Data & Initial Cleaning

Load NQF care measure cohort (drawn directly from NOTEEVENTS table), load CAREGIVERS, ADMISSIONS, PATIENTS, and ICUSTAYS for additional data.

## Load Labeled Note Data for NQF Caremeasure Cohort (From NOTEEVENTS table)
dat <- read.csv("~/nqf_caregivers/data/note_labels_over75.csv", header = T, stringsAsFactors = F)

## Remove X and note_name (artifact indexing columns) from dat
dat$X <- NULL
dat$note_name <- NULL

Note: FAM, CIM, LIM, CAR and COD refer to human annotations, and will be dropped, as we are intersted in the .machine annotations from NeuroNER.

dat$FAM <- NULL
dat$CIM <- NULL
dat$CIM_post <- NULL
dat$LIM <- NULL
dat$CAR <- NULL
dat$COD <- NULL
## Load CAREGIVERS Table for join on CGID
cg <- read.csv("~/nqf_caregivers/data/mimic/CAREGIVERS.csv", header = T, stringsAsFactors = F)

## Load ADMISSIONS Table to join on HADM_ID
adm <- read.csv("~/nqf_caregivers/data/mimic/ADMISSIONS.csv", header = T, stringsAsFactors = F)

## Load PATIENTS Table to join on SUBJECT_ID
pat <- read.csv("~/nqf_caregivers/data/mimic/PATIENTS.csv", header = T, stringsAsFactors = F)

## Load ICUSTAYS TAble to join on SUBJECT_ID, HADM_ID
#stays <- read.csv("~/nqf_caregivers/data/mimic/ICUSTAYS.csv", header = T, stringsAsFactors = F)

Duplicate Note Investigation

Some notes in MIMIC are exact duplicates; this includes notes from within our NQF Care Measure Cohort. Previously, they were removed because we were only interested in their unique content. Now we are interested in their relation to the caregivers who documented them, so we will investigate.

## Subset *all* duplicated texts and other data
tmp <- dat[allDup(dat$TEXT),c("CGID",
                              "SUBJECT_ID",
                              "STORETIME",
                              "DESCRIPTION")]

## Investigate the first 25 observations
head(tmp, 25)
##      CGID SUBJECT_ID           STORETIME
## 41  17155        698 2167-12-25 06:00:53
## 42  14327        698 2167-12-25 05:51:05
## 47  15237        711 2185-03-22 14:03:32
## 48  15237        711 2185-03-22 14:05:09
## 71  20952       1332 2121-05-15 23:30:18
## 72  20952       1332 2121-05-16 00:01:44
## 83  20066       1578 2140-12-23 09:30:41
## 84  20066       1578 2140-12-23 09:30:41
## 85  16088       1578 2140-12-23 10:17:03
## 86  16088       1578 2140-12-23 10:17:03
## 87  20066       1578 2140-12-23 17:34:08
## 88  20066       1578 2140-12-23 17:34:08
## 89  14180       1578 2140-12-22 21:47:49
## 90  14180       1578 2140-12-22 21:47:49
## 91  14180       1578 2140-12-22 22:02:32
## 92  14180       1578 2140-12-22 22:02:32
## 93  14180       1578 2140-12-22 21:07:20
## 94  14180       1578 2140-12-22 21:07:20
## 95  16088       1578 2140-12-23 05:37:41
## 96  16088       1578 2140-12-23 05:37:41
## 97  16088       1578 2140-12-23 05:38:26
## 98  16088       1578 2140-12-23 05:38:26
## 99  16088       1578 2140-12-23 05:39:37
## 100 16088       1578 2140-12-23 05:39:37
## 120 19714       2378 2140-11-04 06:28:16
##                                            DESCRIPTION
## 41                           Cardiology Physician Note
## 42                           Cardiology Physician Note
## 47                   Physician Resident Admission Note
## 48                   Physician Resident Admission Note
## 71                   Physician Resident Admission Note
## 72                   Physician Resident Admission Note
## 83  Physician Resident/Attending Admission Note - MICU
## 84  Physician Resident/Attending Admission Note - MICU
## 85                    Physician Resident Progress Note
## 86                    Physician Resident Progress Note
## 87   Physician Resident/Attending Progress Note - MICU
## 88   Physician Resident/Attending Progress Note - MICU
## 89                   Physician Resident Admission Note
## 90                   Physician Resident Admission Note
## 91                   Physician Resident Admission Note
## 92                   Physician Resident Admission Note
## 93                   Physician Resident Admission Note
## 94                   Physician Resident Admission Note
## 95                    Physician Resident Progress Note
## 96                    Physician Resident Progress Note
## 97                    Physician Resident Progress Note
## 98                    Physician Resident Progress Note
## 99                    Physician Resident Progress Note
## 100                   Physician Resident Progress Note
## 120                   Physician Resident Progress Note

We will need to discover the best way to deal with those. Some have different STORETIMEs and CGIDs, others only have different STORETIMEs. Others appear to be properly duplicated entries.

One strategy may be to keep only those duplicate notes with different CGIDs

Merge

In MIMIC-III, ROW_ID is an index used for each table, and DESCRIPTION is also a common variable for each table.

  1. Remove ROW_ID from all tables accept dat (from NOTEEVENTS)
  2. dat to CAREGIVERS on CGID
  3. dat to ADMISSIONS on HADM_ID
  4. dat to PATIENTS on SUBJECT_ID
  5. dat to ICUSTAYS on SUBJECT_ID and HADM_ID
    • (For now we will not )
## Change column name of "DESCRIPTION" to explicitly mention that it describes the note
colnames(dat)[which(colnames(dat) == "DESCRIPTION")] <- "NOTE_DESCRIPTION"

## Change column name of "DESCRIPTION" to explicitly mention that it describes the caregiver
colnames(cg)[which(colnames(cg) == "DESCRIPTION")] <- "CG_DESCRIPTION"

## (1)
cg$ROW_ID <- NULL
adm$ROW_ID <- NULL
pat$ROW_ID <- NULL
#stays$ROW_ID <- NULL

dim(dat)
## [1] 11575    17
## (2)
dat <- merge(dat, cg, by = "CGID")
dim(dat)
## [1] 11575    19
## (3)
dat <- merge(dat, adm, by = "HADM_ID")
dim(dat)
## [1] 11575    36
## This has duplicated SUBJECT_ID
identical(dat$SUBJECT_ID.x, dat$SUBJECT_ID.y)
## [1] TRUE
## Remove one
dat$SUBJECT_ID.y <- NULL

## Rename the other
colnames(dat)[which(colnames(dat) == "SUBJECT_ID.x")] <- "SUBJECT_ID"
dim(dat)
## [1] 11575    35
## (4)
dat <- merge(dat, pat, by = "SUBJECT_ID")
dim(dat)
## [1] 11575    41
## (5)
#dat <- merge(dat, stays, by = c("SUBJECT_ID", "HADM_ID"))
#dim(dat)

## Cleaning
rm(adm, pat)#, cg)#, stays)
gc()
##            used (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells  1859799 99.4    3205452 171.2  3100834 165.7
## Vcells 11559250 88.2   20263605 154.6 16818857 128.4

Note: the merge() method we used is an inner join, which is a matrix manipulation that generates the Cartesian Product of two data matrices. The data frame will expand when joined to ICUSTAYS because a single hospital admission can be associated with a number of ICUSTAYS if the patient is transferred to the floor and back.

Check Variables

colnames(dat)
##  [1] "SUBJECT_ID"           "HADM_ID"              "CGID"                
##  [4] "ROW_ID"               "CHARTDATE"            "CHARTTIME"           
##  [7] "STORETIME"            "CATEGORY"             "NOTE_DESCRIPTION"    
## [10] "ISERROR"              "TEXT"                 "FAM.machine"         
## [13] "CIM.machine"          "LIM.machine"          "CAR.machine"         
## [16] "COD.machine"          "CIM_post.machine"     "LABEL"               
## [19] "CG_DESCRIPTION"       "ADMITTIME"            "DISCHTIME"           
## [22] "DEATHTIME"            "ADMISSION_TYPE"       "ADMISSION_LOCATION"  
## [25] "DISCHARGE_LOCATION"   "INSURANCE"            "LANGUAGE"            
## [28] "RELIGION"             "MARITAL_STATUS"       "ETHNICITY"           
## [31] "EDREGTIME"            "EDOUTTIME"            "DIAGNOSIS"           
## [34] "HOSPITAL_EXPIRE_FLAG" "HAS_CHARTEVENTS_DATA" "GENDER"              
## [37] "DOB"                  "DOD"                  "DOD_HOSP"            
## [40] "DOD_SSN"              "EXPIRE_FLAG"

We will want to convert CHARTDATE, representing the date the which the note was charted, to numeric from YYYY-MM-DD format.

dat$CHARTDATE <- as.numeric(as.Date(dat$CHARTDATE, "%Y-%m-%d"))

Exploration

table(dat$LABEL)
## 
##   1390   9596    eaw HMS MS     MD    Mds    MDs    MDS Med St MedRes 
##     40     10     39     22   4086     16   2569    105     11      1 
##  MedSt     ms     MS     NP     PA    PHD     RD    Res     RF     Rn 
##     45     35    131     27     35     94      1   4075     28      2 
##     RN    RPH    RRT    RTH    Std    STD Studen 
##     73      1      3      1     21     55     49
table(dat$CG_DESCRIPTION)
## 
##             Attending             Dietitian            Pharmacist 
##                  3546                     1                     1 
##             Read Only Resident/Fellow/PA/NP           Respiratory 
##                    70                  7915                     4 
##                    RN 
##                    38
cat("There are", length(unique(dat$SUBJECT_ID)), "unique patients in this cohort.\n")
## There are 1141 unique patients in this cohort.
cat("There are", length(unique(dat$HADM_ID)), "unique hospital admissions associated with this cohort.\n")
## There are 1350 unique hospital admissions associated with this cohort.
cat("There are", length(unique(dat$CGID)), "unique caregivers associated with this cohort.\n")
## There are 501 unique caregivers associated with this cohort.

Subsetting

For now, we will restrict our analysis to the clinicians’ ability to capture measures within the first 48 hours of an ICU admission, including:

  1. Documentation of family meetings FAM
  2. Documentation of I can’t remember which this one is CIM
  3. Documentation of code status limitations LIM
  4. Documentation of patient/family care preferences CAR
  5. Documentation of overall Code Status COD
  6. Documentation of I can’t remember which this one is CIM_post

We can collapse the data frame after subsetting, so we have every vertex, or node, represented in either CGID or SUBJECT_ID, and every edge is a row (or CGID and SUBJECT_ID) relation.

Since the documentation status is binary, we can take the average ussing aggregate(), which will give us the percentage of times, in the first 48 hours, the each concept was documented by the clinician.

tmp <- dat[ ,c("CGID", 
               "SUBJECT_ID",
               "FAM.machine",
               "CIM.machine",
               "LIM.machine",
               "CAR.machine",
               "COD.machine",
               "CIM_post.machine")]

## This aggregation method shows ALL relationships between caregivers and
## patients, as well as the percentage of measures documented in their
## interactions
tmp <- aggregate(cbind(FAM.machine, 
                       CIM.machine, 
                       LIM.machine, 
                       CAR.machine, 
                       COD.machine, 
                       CIM_post.machine) ~ 
                     CGID + 
                     SUBJECT_ID, 
                 data = tmp, 
                 FUN = mean)

head(tmp, 25)
##     CGID SUBJECT_ID FAM.machine CIM.machine LIM.machine CAR.machine
## 1  16076        605   0.7500000   0.7500000   0.0000000   0.7500000
## 2  16802        605   0.3333333   0.3333333   0.0000000   0.3333333
## 3  17192        605   0.3333333   0.6666667   0.0000000   0.6666667
## 4  17209        605   0.3333333   0.0000000   0.0000000   0.0000000
## 5  17280        605   0.0000000   0.0000000   0.0000000   0.0000000
## 6  17650        605   1.0000000   1.0000000   0.0000000   1.0000000
## 7  18917        605   1.0000000   0.0000000   0.0000000   1.0000000
## 8  19064        605   0.7500000   0.0000000   0.0000000   0.2500000
## 9  19248        605   0.0000000   0.0000000   0.0000000   0.0000000
## 10 19692        605   0.5000000   0.0000000   0.0000000   0.0000000
## 11 14327        698   0.0000000   1.0000000   1.0000000   0.0000000
## 12 15499        698   0.0000000   1.0000000   1.0000000   1.0000000
## 13 15688        698   0.0000000   1.0000000   1.0000000   0.0000000
## 14 17155        698   0.0000000   1.0000000   1.0000000   0.0000000
## 15 17802        698   1.0000000   1.0000000   1.0000000   0.0000000
## 16 17866        698   0.0000000   1.0000000   1.0000000   0.0000000
## 17 18452        698   0.0000000   1.0000000   1.0000000   0.0000000
## 18 21491        698   0.6666667   1.0000000   1.0000000   0.6666667
## 19 15237        711   1.0000000   1.0000000   1.0000000   1.0000000
## 20 17331        711   0.1250000   0.7500000   0.7500000   0.0000000
## 21 19006        711   0.4285714   0.5714286   0.5714286   0.5714286
## 22 16415        885   0.0000000   0.0000000   0.0000000   0.0000000
## 23 17866        885   0.0000000   0.0000000   0.0000000   0.0000000
## 24 18452        885   0.0000000   0.0000000   0.0000000   0.0000000
## 25 19862        885   0.0000000   1.0000000   1.0000000   0.0000000
##    COD.machine CIM_post.machine
## 1    1.0000000        0.7500000
## 2    1.0000000        0.3333333
## 3    1.0000000        0.6666667
## 4    0.6666667        0.0000000
## 5    1.0000000        0.0000000
## 6    1.0000000        1.0000000
## 7    1.0000000        1.0000000
## 8    1.0000000        0.2500000
## 9    1.0000000        0.0000000
## 10   0.5000000        0.0000000
## 11   0.0000000        1.0000000
## 12   0.0000000        1.0000000
## 13   0.0000000        1.0000000
## 14   0.0000000        1.0000000
## 15   0.0000000        1.0000000
## 16   0.0000000        1.0000000
## 17   0.0000000        1.0000000
## 18   0.0000000        1.0000000
## 19   1.0000000        1.0000000
## 20   0.2500000        0.7500000
## 21   0.7142857        0.5714286
## 22   1.0000000        0.0000000
## 23   1.0000000        0.0000000
## 24   1.0000000        0.0000000
## 25   1.0000000        1.0000000

Subset For Overall Rates of Documentation

tmp <- dat[ ,c("CGID", 
               "SUBJECT_ID",
               "FAM.machine",
               "CIM.machine",
               "LIM.machine",
               "CAR.machine",
               "COD.machine",
               "CIM_post.machine")]



## This aggregation method shows ALL relationships between caregivers and
## patients, as well as the percentage of measures documented in their
## interactions
tmp <- aggregate(cbind(FAM.machine, 
                       CIM.machine, 
                       LIM.machine, 
                       CAR.machine, 
                       COD.machine, 
                       CIM_post.machine) ~ 
                     CGID + 
                     SUBJECT_ID, 
                 data = tmp, 
                 FUN = mean)

head(tmp)
##    CGID SUBJECT_ID FAM.machine CIM.machine LIM.machine CAR.machine
## 1 16076        605   0.7500000   0.7500000           0   0.7500000
## 2 16802        605   0.3333333   0.3333333           0   0.3333333
## 3 17192        605   0.3333333   0.6666667           0   0.6666667
## 4 17209        605   0.3333333   0.0000000           0   0.0000000
## 5 17280        605   0.0000000   0.0000000           0   0.0000000
## 6 17650        605   1.0000000   1.0000000           0   1.0000000
##   COD.machine CIM_post.machine
## 1   1.0000000        0.7500000
## 2   1.0000000        0.3333333
## 3   1.0000000        0.6666667
## 4   0.6666667        0.0000000
## 5   1.0000000        0.0000000
## 6   1.0000000        1.0000000

Convert to Binary

Now we will convert to binary, such that if the clinician documented any of these concepts during any interaction within 48 hrs of admission, it will be counted as 1, else 0.

## Loop through columns
for (name in colnames(tmp)[grepl(".machine", colnames(tmp))]){
    tmp[[name]] <- ifelse(tmp[[name]] > 0, 1, 0)
}

## Take a look
head(tmp)
##    CGID SUBJECT_ID FAM.machine CIM.machine LIM.machine CAR.machine
## 1 16076        605           1           1           0           1
## 2 16802        605           1           1           0           1
## 3 17192        605           1           1           0           1
## 4 17209        605           1           0           0           0
## 5 17280        605           0           0           0           0
## 6 17650        605           1           1           0           1
##   COD.machine CIM_post.machine
## 1           1                1
## 2           1                1
## 3           1                1
## 4           1                0
## 5           1                0
## 6           1                1
## Merge again to caregivers
tmp <- merge(tmp, cg, by = "CGID")

## Keep only attending and others
tmp <- tmp[(tmp$CG_DESCRIPTION == "Attending" | tmp$CG_DESCRIPTION == "Resident/Fellow/PA/NP"), ]


plotDat(tmp, "FAM.machine", bs = F, mn = "FAM Documentation By Caregiver", xl = "Caregiver Group", yl = "Proportion")

test <- table(tmp$FAM.machine, tmp$CG_DESCRIPTION)
test
##    
##     Attending Resident/Fellow/PA/NP
##   0      1233                  1970
##   1       641                  1244
chisq.test(test)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  test
## X-squared = 10.09, df = 1, p-value = 0.001491
pairwiseNominalIndependence(
  as.matrix(test), 
  fisher = F, gtest = F, chisq = T, method = "fdr")
##   Comparison p.Chisq p.adj.Chisq
## 1      0 : 1 0.00149     0.00149
plotDat(tmp, "CIM.machine", bs = F, mn = "CIM Documentation By Caregiver", xl = "Caregiver Group", yl = "Proportion")

test <- table(tmp$CIM.machine, tmp$CG_DESCRIPTION)
test
##    
##     Attending Resident/Fellow/PA/NP
##   0      1090                  1759
##   1       784                  1455
chisq.test(test)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  test
## X-squared = 5.5301, df = 1, p-value = 0.01869
pairwiseNominalIndependence(
  as.matrix(test), 
  fisher = F, gtest = F, chisq = T, method = "fdr")
##   Comparison p.Chisq p.adj.Chisq
## 1      0 : 1  0.0187      0.0187
plotDat(tmp, "LIM.machine", bs = F, mn = "LIM Documentation By Caregiver", xl = "Caregiver Group", yl = "Proportion")

test <- table(tmp$LIM.machine, tmp$CG_DESCRIPTION)
test
##    
##     Attending Resident/Fellow/PA/NP
##   0      1180                  2046
##   1       694                  1168
chisq.test(test)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  test
## X-squared = 0.21544, df = 1, p-value = 0.6425
pairwiseNominalIndependence(
  as.matrix(test), 
  fisher = F, gtest = F, chisq = T, method = "fdr")
##   Comparison p.Chisq p.adj.Chisq
## 1      0 : 1   0.643       0.643
plotDat(tmp, "CAR.machine", bs = F, mn = "CAR Documentation By Caregiver", xl = "Caregiver Group", yl = "Proportion")

test <- table(tmp$CAR.machine, tmp$CG_DESCRIPTION)
test
##    
##     Attending Resident/Fellow/PA/NP
##   0      1336                  1899
##   1       538                  1315
chisq.test(test)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  test
## X-squared = 75.641, df = 1, p-value < 2.2e-16
pairwiseNominalIndependence(
  as.matrix(test), 
  fisher = F, gtest = F, chisq = T, method = "fdr")
##   Comparison p.Chisq p.adj.Chisq
## 1      0 : 1 3.4e-18     3.4e-18
plotDat(tmp, "COD.machine", bs = F, mn = "COD Documentation By Caregiver", xl = "Caregiver Group", yl = "Proportion")

test <- table(tmp$COD.machine, tmp$CG_DESCRIPTION)
test
##    
##     Attending Resident/Fellow/PA/NP
##   0       623                  1015
##   1      1251                  2199
chisq.test(test)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  test
## X-squared = 1.4259, df = 1, p-value = 0.2324
pairwiseNominalIndependence(
  as.matrix(test), 
  fisher = F, gtest = F, chisq = T, method = "fdr")
##   Comparison p.Chisq p.adj.Chisq
## 1      0 : 1   0.232       0.232
plotDat(tmp, "CIM_post.machine", bs = F, mn = "CIM Documentation By Caregiver", xl = "Caregiver Group", yl = "Proportion")

test <- table(tmp$CIM_post.machine, tmp$CG_DESCRIPTION)
test
##    
##     Attending Resident/Fellow/PA/NP
##   0       978                  1478
##   1       896                  1736
chisq.test(test)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  test
## X-squared = 17.985, df = 1, p-value = 2.227e-05
pairwiseNominalIndependence(
  as.matrix(test), 
  fisher = F, gtest = F, chisq = T, method = "fdr")
##   Comparison  p.Chisq p.adj.Chisq
## 1      0 : 1 2.23e-05    2.23e-05