Rule-based NLP Note Level Regex

Load Raw Data

dat <- read.csv("~/Desktop/goals_of_care/cleaned_annotations/op_annotations_112317.csv", header = T, stringsAsFactors = F)

colnames(dat)

##  [1] "X"                                       
##  [2] "ROW_ID"                                  
##  [3] "SUBJECT_ID"                              
##  [4] "HADM_ID"                                 
##  [5] "CATEGORY"                                
##  [6] "DESCRIPTION"                             
##  [7] "TEXT"                                    
##  [8] "COHORT"                                  
##  [9] "Patient.and.Family.Care.Preferences"     
## [10] "Patient.and.Family.Care.Preferences.Text"
## [11] "Communication.with.Family"               
## [12] "Communication.with.Family.Text"          
## [13] "Full.Code.Status"                        
## [14] "Full.Code.Status.Text"                   
## [15] "Code.Status.Limitations"                 
## [16] "Code.Status.Limitations.Text"            
## [17] "Palliative.Care.Team.Involvement"        
## [18] "Palliative.Care.Team.Involvement.Text"   
## [19] "Ambiguous"                               
## [20] "Ambiguous.Text"                          
## [21] "Ambiguous.Comments"                      
## [22] "None"                                    
## [23] "operator"                                
## [24] "STAMP"

nrow(dat)

## [1] 876

length(unique(dat$TEXT))

## [1] 641

Load Rules

#Load phrases
rules <- read.csv("~/Desktop/goals_of_care/regex/rule-based_NLP_domains.csv", header = T, stringsAsFactors = F)

#Rename columns
colnames(rules) <- c("CAR", "FAM", "LIM", "COD", "PAL")

#Show rules
print(rules)

##                               CAR                        FAM
## 1                   goals of care          family discussion
## 2                             goc         family discussions
## 3                  goals for care             family meeting
## 4              goals of treatment       family communication
## 5             goals for treatment  durable power of attorney
## 6                 treatment goals          health care proxy
## 7                care preferences                        hcp
## 8                  extending life   understanding of illness
## 9            comfort-focused care understanding of prognosis
## 10                supportive care                           
## 11                no feeding tube                           
## 12                    no dialysis                           
## 13                no hemodialysis                           
## 14                          no HD                           
## 15                quality of life                           
## 16                     priorities                           
## 17               end of life care                           
## 18                    living will                           
## 19                          molst                           
## 20             advance directives                           
## 21          advance care planning                           
## 22                            acp                           
## 23                        hospice                           
## 24            full code confirmed                           
## 25                  full code d/w                           
## 26            full code discussed                           
## 27             full code verified                           
## 28     would like to be full code                           
## 29         wishes to be full code                           
## 30 would like to remain full code                           
## 31     wishes to remain full code                           
## 32           wish to be full code                           
## 33            remaining full code                           
##                                         LIM       COD                 PAL
## 1                                       dnr full code            pallcare
## 2                                    dnrdni               palliative care
## 3                                       dni                     pall care
## 4                        do not resuscitate           palliative medicine
## 5                        do-not-resuscitate                              
## 6                           do not intubate                              
## 7                           do-not-intubate                              
## 8                             no intubation                              
## 9                        chest compressions                              
## 10                        no defibrillation                              
## 11                                   shocks                              
## 12                            shock therapy                              
## 13                                   no cpr                              
## 14               no endotracheal intubation                              
## 15                 no mechanical intubation                              
## 16                            no ventilator                              
## 17                     no breathing machine                              
## 18                        no breathing tube                              
## 19                    no chest compressions                              
## 20                                      cmo                              
## 21                         comfort measures                              
## 22                             comfort care                              
## 23 limitations of life-sustaining treatment                              
## 24                                     llst                              
## 25                                                                       
## 26                                                                       
## 27                                                                       
## 28                                                                       
## 29                                                                       
## 30                                                                       
## 31                                                                       
## 32                                                                       
## 33

Regex Utility Function

strictRegex() will accept all phrases kwds, and all note texts, texts, it will utilize grepl() to find direct matches in the text, and will return a list of booleans.

strictRegex <- function(kwds, texts){
  #Create a list to store results
  tmpList <- list()
  
  #Loop through all keywords
  for (i in 1:length(kwds)){
    #Store results as a logical vector in its respective list entry position
    tmpList[[i]] <- grepl(kwds[i], texts, ignore.case = TRUE)
  }
  
  #Return list and control to environment
  return(tmpList)
}

Apply Regex

CAR <- strictRegex(rules$CAR[rules$CAR != ''], dat$TEXT)
FAM <- strictRegex(rules$FAM[rules$FAM != ''], dat$TEXT)
LIM <- strictRegex(rules$LIM[rules$LIM != ''], dat$TEXT)
COD <- strictRegex(rules$COD[rules$COD != ''], dat$TEXT)
PAL <- strictRegex(rules$PAL[rules$PAL != ''], dat$TEXT)


to_df <- function(domain, rule){
  #Convert list from grepl to data frame
  domain <- as.data.frame(domain)
  #Show column names as phrases
  colnames(domain) <- rule[rule != '']
  #Multiply by 1 for binary numeric
  domain <- domain*1
  return(domain)
}


CAR <- to_df(CAR, rules$CAR)
FAM <- to_df(FAM, rules$FAM)
LIM <- to_df(LIM, rules$LIM)
COD <- to_df(COD, rules$COD)
PAL <- to_df(PAL, rules$PAL)

phrase_to_domain <- function(dat){
  inc <- vector()
  for (i in 1:nrow(dat)){
    #Collapse phrases by domain, presence of any phrase indicates domain
    inc[i] <- any(dat[i,] == 1)
  }
  #Multiply by one to convert logical to binary numeric
  return(inc*1)
}

res <- cbind(phrase_to_domain(CAR),
             phrase_to_domain(FAM),
             phrase_to_domain(LIM),
             phrase_to_domain(COD),
             phrase_to_domain(PAL))

colnames(res) <- c("RE_CAR", "RE_FAM", "RE_LIM", "RE_COD", "RE_PAL")

res <- as.data.frame(res)

head(res)

##   RE_CAR RE_FAM RE_LIM RE_COD RE_PAL
## 1      0      0      0      1      0
## 2      0      0      0      1      0
## 3      0      0      0      1      0
## 4      0      0      1      0      1
## 5      0      1      0      1      0
## 6      0      0      1      0      0

nrow(res)

## [1] 876

#Merge res and data
res <- cbind(dat, res)

Preprocess

Remove duplicates, keep first observation of the note, as the first observation will have the most information, subsequent annotations will have additional phrases associated with domains.

res <- res[!duplicated(res$TEXT),]

Model Stats

statGen <- function(dat, hum_lab, re_lab, set){
  #Vector to hold results
  rVec <- vector()
  #True Positives
  tp <- 0
  #True Negatives
  tn <- 0
  #False Positives
  fp <- 0
  #False Negatives
  fn <- 0
  
  #tmp to hold dat[[lab]][i]
  re_tmp <- as.character()
  hum_tmp <- as.character()
  for (i in 1:nrow(dat)){
    
    re_tmp <- dat[[re_lab]][i]
    hum_tmp <- dat[[hum_lab]][i]
    
    rVec[length(rVec)+1] <- paste("Token: ", dat$TEXT[i],"\n", re_lab, " Assignment: ", re_tmp, "\n", sep = '')
    
    #If both model and human aren't negative
    if (re_tmp != 0 & hum_tmp != 0 & !is.na(hum_tmp)){
      #True positive
      tp <- tp + 1
      
    #if human marks negative and model assigns label
    } else if (hum_tmp == 0 & re_tmp != 0 & !is.na(hum_tmp)){
      #False positive
      fp <- fp + 1
      
    #if human marks label and model doesn't assign label
    } else if (hum_tmp != 0 & re_tmp == 0 & !is.na(hum_tmp)){
      
      #False negative
      fn <- fn + 1
    
    #if human marks negative and model marks negative
    } else if (re_tmp == 0 & hum_tmp == 0 & !is.na(hum_tmp)){
      
      #True negative
      tn <- tn + 1
    }
  }
  
  #Hold results in txt to check if necessary
  
  write.csv(rVec, 
            file = paste(set,"_",re_lab,"_OUTPUT_NOTE_LEVEL_26Nov17.txt", sep = ''), 
            quote = F, row.names = F)
  
  tmpFrame <- cbind(tp, tn, fp, fn)
  colnames(tmpFrame) <- c("tp", "tn", "fp", "fn")
  return(as.data.frame(tmpFrame))
}

results <- rbind(statGen(res, "Patient.and.Family.Care.Preferences", "RE_CAR", "ALL"),
  statGen(res, "Full.Code.Status", "RE_COD", "ALL"),
  statGen(res, "Communication.with.Family", "RE_FAM", "ALL"),
  statGen(res, "Code.Status.Limitations","RE_LIM", "ALL"),
  statGen(res, "Palliative.Care.Team.Involvement", "RE_PAL", "ALL"))

row.names(results) <- c("RE_CAR", "RE_COD", "RE_FAM", "RE_LIM", "RE_PAL")

Model Stats

\[accuracy = \frac{(tp+tn)}{(tp + tn + fp + fn)}\] \[precision = \frac{tp}{(tp+fp)}\] \[accuracy = \frac{tp}{(tp+fn)}\] \[F_1 = 2 \cdot \frac{(precision*recall)}{precision + recall}\]

modelStats <- function(dat){
  accuracy <- (dat$tp + dat$tn)/(dat$tp + dat$tn + dat$fp + dat$fn)
  precision <- dat$tp/(dat$tp + dat$fp)
  recall <- dat$tp/(dat$tp + dat$fn)
  
  dat$accuracy <- paste(sprintf("%2.2f", round(100*accuracy,2)), '%', sep = '')
  dat$precision <- paste(sprintf("%2.2f", round(100*precision,2)), '%', sep = '')
  dat$recall <- paste(sprintf("%2.2f", round(100*recall,2)), '%', sep = '')
  
  dat$F1 <- round(100*2*(precision * recall)/(precision + recall),2)
  return(dat)
}

results <- modelStats(results)

print(results)

##         tp  tn fp fn accuracy precision recall    F1
## RE_CAR  45 447 56 93   76.76%    44.55% 32.61% 37.66
## RE_COD 272 163 15 48   87.35%    94.77% 85.00% 89.62
## RE_FAM  82 398 73 88   74.88%    52.90% 48.24% 50.46
## RE_LIM 195 300 82 64   77.22%    70.40% 75.29% 72.76
## RE_PAL   9 617 14  1   97.66%    39.13% 90.00% 54.55

Rule-based NLP Note Level Regex

Edward Moseley

November 26, 2017

Load Raw Data

Load Rules

Regex Utility Function

Apply Regex

Preprocess

Model Stats

Model Stats