Introduction

This is a rule-based patient note analysis to capture patient goals of care (GOC) and advanced care planning (ACP) from text at the note level. We will utilize text phrases generated by clinicians who have annotated notes contained in the MIMIC-III Intensive Care Unit database to generate a Bag of Words (BoW) Model.

The BoW model does not utilize grammar or semantic relatedness, as it classifies text without regard to the spacial relation of words making up the text. The model utilizes only the presence or absence of words known to be associated with GOC or ACP (as per the clinicians who generated these rules during text classification).

This BoW model is employed as a means to compare a rule-based method to a deep-learning method which account for grammar, semantic relatedness, and named-entity recognition NeuroNER.

Note: Aggregated data are representative of actual MIMIC data, but to ensure and maintain patient privacy, all strings of characters here that appear to be patient note text are made up by myself as representative cases.

Utility Functions

load_data

load_data() will accept a label, lab, the set identifier (train/valid) set, and the map from file to row_id, mp. It will return the portion of the map particular to the set input as an argument.

load_data <- function(lab, set, mp){
  #Read in data
  tmp <- read.csv(paste("~/goals_of_care/regex_v2/",lab,"_",set,"_processed.csv", sep = ''),
                  header = T, stringsAsFactors = F, quote = "", row.names = NULL)
  
  #Change "filename" to file for consistency
  colnames(tmp)[which(colnames(tmp) == "filename")] <- "file"
  
  #Subset unique filenames
  tmp <- unique(tmp$file)
  
  #Take substring of names to ensure we can map across both train_/valid_ files
  tmp <- substr(tmp, nchar(tmp) - 9, nchar(tmp))
  
  #Subset files in train set from key map
  tmp <- mp[(mp$file %in% tmp),]

  #Return the mapped data
  return(tmp)
}

strict_regex

strict_regex() will accept all phrases kwds, and all note texts, texts, it will utilize grepl() to find direct matches in the text, and will return a list of booleans.

strict_regex <- function(kwds, texts){
  #Create a list to store results
  tmpList <- list()
  #Loop through all keywords
  for (i in 1:length(kwds)){
    #Store results as a logical vector in its respective list entry position
    tmpList[[i]] <- grepl(kwds[i], texts, ignore.case = TRUE)
  }
  #Return list and control to environment
  return(tmpList)
}

to_df()

to_df() will convert the list to a data.frame and apply the rule tokens as column names to indicate what each variable represents.

to_df <- function(domain, rule){
  #Convert list from grepl to data frame
  domain <- as.data.frame(domain)
  #Show column names as phrases
  colnames(domain) <- rule[rule != '']
  #Multiply by 1 for binary numeric
  domain <- domain*1
  return(domain)
}

clean_text

clean_text() accept a string of text, tokens, as well as a boolean, printout. It will remove carriage returns, remove text obfuscations, and convert the text to lowercase. If printout is TRUE, it will print out example text resulting from the removal of the above.

clean_text <- function(tokens, printout){
    #Create a fake patient note phrase as a representative sample
    ex_token <- "Example note:\nThe patient is a 81yo m who was found down in [** location **] on [** date **] by daughter, [** name **].\n Pt was in usual state of health until four days ago, when began to complain to family of heartburn for which the pt was taking tums in addition to his prescribed PPI, without resolution."
  if (printout){
    print(substr(ex_token, 1, 100))
  }
  
  #Remove carriage returns, convert to lower
  tokens <- tolower(gsub('\n', ' ', tokens))
  ex_token <- tolower(gsub('\n', ' ', ex_token))
  if (printout){
    cat("Result after removing carriage returns:\n")
    print(substr(ex_token, 1, 100))
  }
  
  #https://stackoverflow.com/questions/13529360/replace-text-within-parenthesis-in-r
  #Remove obfuscations between '[' and ']'
  tokens <- gsub(" *\\[.*?\\] *", ' ', tokens)
  ex_token <- gsub(" *\\[.*?\\] *", ' ', ex_token)
  if (printout){
    cat("Result after leaving [obfuscation]:\n")
    print(substr(ex_token, 1, 100))
  }
  
  #Keep only words & numeric
  tokens <- gsub("[^[:alnum:][:space:]]", '', tokens)
  ex_token <- gsub("[^[:alnum:][:space:]]", '', ex_token)
  if (printout){
    cat("Result after removing all but alphanumeric and spaces:\n")
    print(substr(ex_token, 1, 100))
  }
  
  #Keep only a single white space
  #https://stackoverflow.com/questions/25707647/merge-multiple-spaces-to-single-space-remove-trailing-leading-spaces
  tokens <- gsub("(?<=[\\s])\\s*|^\\s+|\\s+$", '', tokens, perl=TRUE)
  ex_token <- gsub("(?<=[\\s])\\s*|^\\s+|\\s+$", '', ex_token, perl=TRUE)
  if (printout){
    cat("Result after keeping only single spaces:\n")
    print(substr(ex_token, 1, 100))
  }

  return(tokens)
}

cohort_gen

cohort_gen() will accept the annotation data dat, train_set, and validation_set, and will map cohorts, returning the annotation data frame.

cohort_gen <- function(dat, train_set, valid_set){
  dat$COHORT <- rep('', each = nrow(dat))
  for (i in 1:nrow(dat)){
    if (dat$ROW_ID[i] %in% train_set$row_id){
      dat$COHORT[i] <- "train"
    } else if (dat$ROW_ID[i] %in% valid_set$row_id){
      dat$COHORT[i] <- "validation"
    } else {
      dat$COHORT[i] <- ""
    }
  }
  return(dat)
}

statGen

statGen() will accept the data.frame, hum_lab (human label result), the re_lab (regex label result), the set (train/validation), and the threshold, or the cutoff for tokens from the note found in the bag of words.

statGen <- function(dat, hum_lab, re_lab, set, threshold){
  
  dat <- dat[(dat$COHORT == set),]
  #Vector to hold results
  rVec <- vector()
  #True Positives
  tp <- 0
  #True Negatives
  tn <- 0
  #False Positives
  fp <- 0
  #False Negatives
  fn <- 0
  
  #tmp to hold dat[[lab]][i]
  re_tmp <- as.numeric()
  hum_tmp <- as.numeric()
  for (i in 1:nrow(dat)){
    
    re_tmp <- dat[["bow_score"]][i]
    #Generate boolean, multiply by 1 for numeric
    re_tmp <- (re_tmp >= threshold)*1
    
    hum_tmp <- dat[[hum_lab]][i]
    
    rVec[length(rVec)+1] <- paste("Token: ", dat$TEXT[i], '\n', re_lab, " Assignment: ", re_tmp, '\n', sep = '')
    
    if (is.na(re_tmp) | is.na(hum_tmp)) break
    
    #If both model and human aren't negative
    if (re_tmp != 0 & hum_tmp != 0 & !is.na(hum_tmp)){
      
      #True positive
      tp <- tp + 1
      
    #if human marks negative and model assigns label
    } else if (hum_tmp == 0 & re_tmp != 0 & !is.na(hum_tmp)){
      
      #False positive
      fp <- fp + 1
      
    #if human marks label and model doesn't assign label
    } else if (hum_tmp != 0 & re_tmp == 0 & !is.na(hum_tmp)){
      
      #False negative
      fn <- fn + 1
    
    #if human marks negative and model marks negative
    } else if (re_tmp == 0 & hum_tmp == 0 & !is.na(hum_tmp)){
      
      #True negative
      tn <- tn + 1
      
    } else if (is.null(re_tmp) | is.null(hum_tmp)){
      break
    }
  }
  
  #Hold results in txt to check if necessary
  
  write.csv(rVec, 
            file = paste(set,"_",re_lab,"_OUTPUT_NOTE_LEVEL_06Jan17.txt", sep = ''), 
            quote = F, row.names = F)
  
  tmpFrame <- cbind(tp, tn, fp, fn)
  colnames(tmpFrame) <- c("tp", "tn", "fp", "fn")
  
  return(as.data.frame(tmpFrame))
  
}

modelStats

modelStats() accepts a data.frame with tp, tn, fp, tn values and returns common machine learning metrics.

modelStats <- function(dat){
  dat <- as.data.frame(dat)
  accuracy <- (dat$tp + dat$tn)/(dat$tp + dat$tn + dat$fp + dat$fn)
  precision <- dat$tp/(dat$tp + dat$fp)
  recall <- dat$tp/(dat$tp + dat$fn)
  specificity <- dat$tn/(dat$tn+dat$fp)
  F1 <- 2*(precision*recall)/(precision + recall)
  
  dat$accuracy <- round(accuracy*100, 2)
  dat$precision <- round(precision*100, 2)
  dat$recall <- round(recall*100, 2)
  dat$specificity <- round(specificity*100, 2)
  dat$F1 <- round(F1*100, 2)
  
  return(dat)
}

long_data

long_data() will accept the output of

long_data <- function(dat){
  metrics <- c(dat$accuracy, 
        dat$precision, 
        dat$recall, 
        dat$specificity, 
        dat$F1)
  labs <- c(rep("accuracy", each = nrow(dat)), 
        rep("precision", each = nrow(dat)),
        rep("recall", each = nrow(dat)),
        rep("specificity", each = nrow(dat)),
        rep("F1", each = nrow(dat)))
  
  ind <- rep(1:nrow(dat), 5)
  
  
  res <- as.data.frame(
          cbind(metrics, 
                labs, 
                ind)
          )
  
  colnames(res) <- c("metrics", "labs", "threshold")
  
  #Convert factors to numeric after as.data.frame conversion
  res$metrics <- as.numeric(levels(res$metrics)[res$metrics])
  res$threshold <- as.numeric(levels(res$threshold)[res$threshold])
  
  #Return only defined results
  return(na.omit(res))
  
}

Load Annotation Data

#Read csv
dat <- read.csv("~/goals_of_care/regex_v3/op_annotations_122017.csv", 
                header = T, stringsAsFactors = F)
#Check the data
nrow(dat)
## [1] 1500

Load Map for training/validation sets

row_id_to_file_num.txt contains the key we will use to convert file as output from NeuroNER to row_id from MIMICIII.NOTEEVENTS, which was used as substrate to generate the annotated data set.

#Contains map of files (BRAT format for neuroNER) to row_id (MIMIC NOTEEVENTS)
map <- read.csv("~/goals_of_care/summary_stats/row_id_to_file_num.txt", header = F, stringsAsFactors = F, sep = '\t')
head(map)
##       V1         V2
## 1 407635 text_00000
## 2 345097 text_00001
## 3 371117 text_00002
## 4 401545 text_00003
## 5 393792 text_00004
## 6 382263 text_00005
#Add column names for ease of merging
colnames(map) <- c("row_id", "file")

Load training/validation sets and generate cohort info

train_set <- load_data("CAR", "train", map)

valid_set <- load_data("CAR", "valid", map)

dat <- cohort_gen(dat, train_set, valid_set)

#Show cohort distribution
table(dat$COHORT)
## 
##      train validation 
##       1067        433

Subset Phrases

Note: Only use phrases from the training set.

##Only use phrases from training set
phrases <- dat$Patient.and.Family.Care.Preferences.Text[dat$COHORT == "train"]

##remove empty observations ""
phrases <- phrases[phrases != '']
length(phrases)
## [1] 246
##Remove duplicate observations (keep first)
dat <- dat[!duplicated(dat$TEXT), ]

Clean Phrases & Text

txts <- clean_text(dat$TEXT, TRUE)
## [1] "Example note:\nThe patient is a 81yo m who was found down in [** location **] on [** date **] by daug"
## Result after removing carriage returns:
## [1] "example note: the patient is a 81yo m who was found down in [** location **] on [** date **] by daug"
## Result after leaving [obfuscation]:
## [1] "example note: the patient is a 81yo m who was found down in  on  by daughter, .  pt was in usual sta"
## Result after removing all but alphanumeric and spaces:
## [1] "example note the patient is a 81yo m who was found down in  on  by daughter   pt was in usual state "
## Result after keeping only single spaces:
## [1] "example note the patient is a 81yo m who was found down in on by daughter pt was in usual state of h"
phrases <- clean_text(phrases, FALSE)

Create a Bag of Words

#Create bag of words by splitting phrases on spaces and unlisting result
bow <- unlist(strsplit(phrases, ' '))

#Remove empty entries
bow <- bow[bow != '']

#Tabulate
tab <- table(bow)[rev(order(table(bow)))]

#Plot
par(mai=c(1,2,1,1))
barplot(rev(head(tab, 20)), horiz = T, las = 1, main = "Most Frequent Words in Phrase Dictionary", xlab = "Frequency")

#Use only unique tokens
bow <- unique(bow)
length(bow)
## [1] 705

Run Regex

#Strict regex
system.time(tmp <- strict_regex(bow, txts))
##    user  system elapsed 
##  47.927   0.303  48.702
#Convert list to data.frame
system.time(tmp_two <- to_df(tmp, bow))
##    user  system elapsed 
##   0.192   0.005   0.197

Create Metrics for Analysis

#Create score, apply sum row-wise
tmp_two$bow_score <- apply(tmp_two, 1, sum)

#Add score to dat with annotation data
dat$bow_score <- tmp_two$bow_score

#Show distribution of token counts represented in the BoW
summary(tmp_two$bow_score)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    60.0   150.0   173.0   173.8   199.0   282.0
hist(tmp_two$bow_score, breaks = 25, main = "BoW Score Distribution (all notes)", xlab = "Tokens in Phrase Dictionary", ylab = "Frequency of Notes with BoW Score")

#Create a tmp data.frame with all data and corresponding labels
tmp <- cbind(dat, tmp_two)

#Show distribution by cohort
boxplot(tmp$bow_score ~ tmp$COHORT, main = "BoW Score Distribution by Cohort", xlab = "Set", ylab = "Number of Tokens in BoW Dictionary")

Generate Model Statistics

\[accuracy = \frac{(tp+tn)}{(tp + tn + fp + fn)}\] \[precision = \frac{tp}{(tp+fp)}\] \[recall = \frac{tp}{(tp+fn)}\] \[specificity = \frac{tn}{(tn+fp)}\] \[F_1 = 2 \cdot \frac{precision \cdot recall}{precision + recall}\]

Subset thresholds

thresholds <- as.numeric(levels(factor(tmp_two$bow_score)))
system.time(test <- statGen(dat, "Patient.and.Family.Care.Preferences", "CAR", "train", thresholds[1]))
##    user  system elapsed 
##   0.050   0.021   0.136
for (i in 2:max(thresholds)){
  test <- rbind(test, statGen(dat, "Patient.and.Family.Care.Preferences", "CAR", "train", thresholds[i]))
}
test <- modelStats(test)

test$threshold <- 1:max(thresholds)

head(test, 50)
##     tp tn  fp fn accuracy precision recall specificity    F1 threshold
## 1  111  0 338  0    24.72     24.72 100.00        0.00 39.64         1
## 2  111  0 338  0    24.72     24.72 100.00        0.00 39.64         2
## 3  110  0 338  1    24.50     24.55  99.10        0.00 39.36         3
## 4  110  1 337  1    24.72     24.61  99.10        0.30 39.43         4
## 5  110  2 336  1    24.94     24.66  99.10        0.59 39.50         5
## 6  110  3 335  1    25.17     24.72  99.10        0.89 39.57         6
## 7  110  3 335  1    25.17     24.72  99.10        0.89 39.57         7
## 8  110  4 334  1    25.39     24.77  99.10        1.18 39.64         8
## 9  110  4 334  1    25.39     24.77  99.10        1.18 39.64         9
## 10 110  6 332  1    25.84     24.89  99.10        1.78 39.78        10
## 11 110  6 332  1    25.84     24.89  99.10        1.78 39.78        11
## 12 110  6 332  1    25.84     24.89  99.10        1.78 39.78        12
## 13 110  7 331  1    26.06     24.94  99.10        2.07 39.86        13
## 14 110  8 330  1    26.28     25.00  99.10        2.37 39.93        14
## 15 109  8 330  2    26.06     24.83  98.20        2.37 39.64        15
## 16 109  9 329  2    26.28     24.89  98.20        2.66 39.71        16
## 17 109 11 327  2    26.73     25.00  98.20        3.25 39.85        17
## 18 108 13 325  3    26.95     24.94  97.30        3.85 39.71        18
## 19 108 15 323  3    27.39     25.06  97.30        4.44 39.85        19
## 20 108 17 321  3    27.84     25.17  97.30        5.03 40.00        20
## 21 108 17 321  3    27.84     25.17  97.30        5.03 40.00        21
## 22 108 18 320  3    28.06     25.23  97.30        5.33 40.07        22
## 23 108 20 318  3    28.51     25.35  97.30        5.92 40.22        23
## 24 107 21 317  4    28.51     25.24  96.40        6.21 40.00        24
## 25 107 21 317  4    28.51     25.24  96.40        6.21 40.00        25
## 26 106 22 316  5    28.51     25.12  95.50        6.51 39.77        26
## 27 106 22 316  5    28.51     25.12  95.50        6.51 39.77        27
## 28 106 22 316  5    28.51     25.12  95.50        6.51 39.77        28
## 29 106 23 315  5    28.73     25.18  95.50        6.80 39.85        29
## 30 106 24 314  5    28.95     25.24  95.50        7.10 39.92        30
## 31 106 24 314  5    28.95     25.24  95.50        7.10 39.92        31
## 32 106 25 313  5    29.18     25.30  95.50        7.40 40.00        32
## 33 105 27 311  6    29.40     25.24  94.59        7.99 39.85        33
## 34 105 29 309  6    29.84     25.36  94.59        8.58 40.00        34
## 35 104 29 309  7    29.62     25.18  93.69        8.58 39.69        35
## 36 103 33 305  8    30.29     25.25  92.79        9.76 39.69        36
## 37 103 34 304  8    30.51     25.31  92.79       10.06 39.77        37
## 38 103 37 301  8    31.18     25.50  92.79       10.95 40.00        38
## 39 103 39 299  8    31.63     25.62  92.79       11.54 40.16        39
## 40 103 40 298  8    31.85     25.69  92.79       11.83 40.23        40
## 41 102 42 296  9    32.07     25.63  91.89       12.43 40.08        41
## 42 102 43 295  9    32.29     25.69  91.89       12.72 40.16        42
## 43 101 43 295 10    32.07     25.51  90.99       12.72 39.84        43
## 44 101 45 293 10    32.52     25.63  90.99       13.31 40.00        44
## 45 100 45 293 11    32.29     25.45  90.09       13.31 39.68        45
## 46 100 46 292 11    32.52     25.51  90.09       13.61 39.76        46
## 47 100 49 289 11    33.18     25.71  90.09       14.50 40.00        47
## 48  98 52 286 13    33.41     25.52  88.29       15.38 39.60        48
## 49  98 54 284 13    33.85     25.65  88.29       15.98 39.76        49
## 50  98 60 278 13    35.19     26.06  88.29       17.75 40.25        50

Graph Metrics

system.time(val <- statGen(dat, "Patient.and.Family.Care.Preferences", "CAR", "validation", thresholds[1]))
##    user  system elapsed 
##   0.019   0.002   0.035
for (i in 2:max(thresholds)){
  val <- rbind(val, statGen(dat, "Patient.and.Family.Care.Preferences", "CAR", "validation", thresholds[i]))
}

val <- modelStats(val)

val$threshold <- 1:max(thresholds)

head(val, 50)
##    tp tn  fp fn accuracy precision recall specificity    F1 threshold
## 1  27  0 165  0    14.06     14.06 100.00        0.00 24.66         1
## 2  27  1 164  0    14.58     14.14 100.00        0.61 24.77         2
## 3  27  2 163  0    15.10     14.21 100.00        1.21 24.88         3
## 4  27  3 162  0    15.62     14.29 100.00        1.82 25.00         4
## 5  27  4 161  0    16.15     14.36 100.00        2.42 25.12         5
## 6  27  4 161  0    16.15     14.36 100.00        2.42 25.12         6
## 7  27  6 159  0    17.19     14.52 100.00        3.64 25.35         7
## 8  27  8 157  0    18.23     14.67 100.00        4.85 25.59         8
## 9  27  9 156  0    18.75     14.75 100.00        5.45 25.71         9
## 10 27  9 156  0    18.75     14.75 100.00        5.45 25.71        10
## 11 27 10 155  0    19.27     14.84 100.00        6.06 25.84        11
## 12 27 11 154  0    19.79     14.92 100.00        6.67 25.96        12
## 13 27 12 153  0    20.31     15.00 100.00        7.27 26.09        13
## 14 27 12 153  0    20.31     15.00 100.00        7.27 26.09        14
## 15 27 15 150  0    21.88     15.25 100.00        9.09 26.47        15
## 16 27 16 149  0    22.40     15.34 100.00        9.70 26.60        16
## 17 27 17 148  0    22.92     15.43 100.00       10.30 26.73        17
## 18 27 19 146  0    23.96     15.61 100.00       11.52 27.00        18
## 19 27 19 146  0    23.96     15.61 100.00       11.52 27.00        19
## 20 26 20 145  1    23.96     15.20  96.30       12.12 26.26        20
## 21 26 21 144  1    24.48     15.29  96.30       12.73 26.40        21
## 22 26 22 143  1    25.00     15.38  96.30       13.33 26.53        22
## 23 26 22 143  1    25.00     15.38  96.30       13.33 26.53        23
## 24 26 23 142  1    25.52     15.48  96.30       13.94 26.67        24
## 25 26 25 140  1    26.56     15.66  96.30       15.15 26.94        25
## 26 26 26 139  1    27.08     15.76  96.30       15.76 27.08        26
## 27 26 27 138  1    27.60     15.85  96.30       16.36 27.23        27
## 28 26 29 136  1    28.65     16.05  96.30       17.58 27.51        28
## 29 26 30 135  1    29.17     16.15  96.30       18.18 27.66        29
## 30 26 30 135  1    29.17     16.15  96.30       18.18 27.66        30
## 31 26 31 134  1    29.69     16.25  96.30       18.79 27.81        31
## 32 26 32 133  1    30.21     16.35  96.30       19.39 27.96        32
## 33 26 35 130  1    31.77     16.67  96.30       21.21 28.42        33
## 34 26 37 128  1    32.81     16.88  96.30       22.42 28.73        34
## 35 26 37 128  1    32.81     16.88  96.30       22.42 28.73        35
## 36 26 37 128  1    32.81     16.88  96.30       22.42 28.73        36
## 37 26 37 128  1    32.81     16.88  96.30       22.42 28.73        37
## 38 26 37 128  1    32.81     16.88  96.30       22.42 28.73        38
## 39 26 38 127  1    33.33     16.99  96.30       23.03 28.89        39
## 40 26 38 127  1    33.33     16.99  96.30       23.03 28.89        40
## 41 26 40 125  1    34.38     17.22  96.30       24.24 29.21        41
## 42 26 40 125  1    34.38     17.22  96.30       24.24 29.21        42
## 43 26 43 122  1    35.94     17.57  96.30       26.06 29.71        43
## 44 26 43 122  1    35.94     17.57  96.30       26.06 29.71        44
## 45 26 46 119  1    37.50     17.93  96.30       27.88 30.23        45
## 46 26 49 116  1    39.06     18.31  96.30       29.70 30.77        46
## 47 26 49 116  1    39.06     18.31  96.30       29.70 30.77        47
## 48 26 54 111  1    41.67     18.98  96.30       32.73 31.71        48
## 49 26 57 108  1    43.23     19.40  96.30       34.55 32.30        49
## 50 25 58 107  2    43.23     18.94  92.59       35.15 31.45        50

Data Visualization

library(ggplot2)

Generate long data for ggplot2

train_plot <- long_data(test)
valid_plot <- long_data(val)

Training Set Metrics

ggplot(train_plot, aes(x = threshold, y = metrics, group = labs, shape = labs, linetype = labs )) +
    geom_line(aes(color = labs), size = 1.1) +
    #xlim(0, 200) + 
    #geom_point(aes(color = labs)) +
    labs(title="Plot of Metrics by BoW Token Cutoff Threshold\n(Training Set)", x = "Token Number Cutoff", y = "Value") +
    theme_minimal()

Validation Set Metrics

Note: Rule-based Bag of Words algorithm derived from Training Set phrases as defined by clinicians applied to validation set at varying cutoffs for number of tokens present.

ggplot(valid_plot, aes(x = threshold, y = metrics, group = labs, shape = labs, linetype = labs )) +
    geom_line(aes(color = labs), size = 1.1) +
    #xlim(0, 200) + 
    #geom_point(aes(color = labs)) +
    labs(title="Plot of Metrics by BoW Token Cutoff Threshold\n(Validation Set)", x = "Token Number Cutoff", y = "Value") +
    theme_minimal()