Rule-Based Regex Algorithm Assessment

Load Rule Data

#Load phrases
rules <- read.csv("rule-based_NLP_domains.csv", header = T, stringsAsFactors = F)

#Rename columns
colnames(rules) <- c("CAR", "FAM", "LIM", "COD", "PAL")

#Check data
head(rules)

##                   CAR                       FAM                LIM
## 1       goals of care         family discussion                dnr
## 2                 goc        family discussions             dnrdni
## 3      goals for care            family meeting                dni
## 4  goals of treatment      family communication do not resuscitate
## 5 goals for treatment durable power of attorney do-not-resuscitate
## 6     treatment goals         health care proxy    do not intubate
##         COD                 PAL
## 1 full code            pallcare
## 2               palliative care
## 3                     pall care
## 4           palliative medicine
## 5                              
## 6

Create library of tokens in each category

For work-level statistics we will split our phrases on spaces to generate individual tokens. We will store them in each respective variable.

#Split strings on ' ',for tokens use unique() for a union
prep <- function(dat){
  return(unique(unlist(strsplit(dat, ' '))))
}

print(CAR <- prep(rules$CAR))

##  [1] "goals"           "of"              "care"           
##  [4] "goc"             "for"             "treatment"      
##  [7] "preferences"     "extending"       "life"           
## [10] "comfort-focused" "supportive"      "no"             
## [13] "feeding"         "tube"            "dialysis"       
## [16] "hemodialysis"    "HD"              "quality"        
## [19] "priorities"      "end"             "living"         
## [22] "will"            "molst"           "advance"        
## [25] "directives"      "planning"        "acp"            
## [28] "hospice"         "full"            "code"           
## [31] "confirmed"       "d/w"             "discussed"      
## [34] "verified"        "would"           "like"           
## [37] "to"              "be"              "wishes"         
## [40] "remain"          "wish"            "remaining"

print(COD <- prep(rules$COD))

## [1] "full" "code"

print(FAM <- prep(rules$FAM))

##  [1] "family"        "discussion"    "discussions"   "meeting"      
##  [5] "communication" "durable"       "power"         "of"           
##  [9] "attorney"      "health"        "care"          "proxy"        
## [13] "hcp"           "understanding" "illness"       "prognosis"

print(LIM <- prep(rules$LIM))

##  [1] "dnr"                "dnrdni"             "dni"               
##  [4] "do"                 "not"                "resuscitate"       
##  [7] "do-not-resuscitate" "intubate"           "do-not-intubate"   
## [10] "no"                 "intubation"         "chest"             
## [13] "compressions"       "defibrillation"     "shocks"            
## [16] "shock"              "therapy"            "cpr"               
## [19] "endotracheal"       "mechanical"         "ventilator"        
## [22] "breathing"          "machine"            "tube"              
## [25] "cmo"                "comfort"            "measures"          
## [28] "care"               "limitations"        "of"                
## [31] "life-sustaining"    "treatment"          "llst"

print(PAL <- prep(rules$PAL))

## [1] "pallcare"   "palliative" "care"       "pall"       "medicine"

Note: because multiple are not unique to each label there are likely to be multiple labels for the same token.

Load Testing/Training Data

Note: testing and training is not meaningful for this rule-based regex method, but the model will be applied to both set.

#Load testing/training sets
test <- read.csv("018_train.txt", header = F, stringsAsFactors = F, sep = ' ', quote = "", row.names = NULL)

train <- read.csv("018_valid.txt", header = F, stringsAsFactors = F, sep = ' ', quote = "", row.names = NULL)
  
#Give colnames
colnames(test) <- c("TOKEN", "FILE", "START", "END", "HUM", "NET")
#Check data
head(test)

##         TOKEN             FILE START END HUM NET
## 1       Chief train_text_00000     0   5   O   O
## 2   Complaint train_text_00000     6  15   O   O
## 3           : train_text_00000    15  16   O   O
## 4 respiratory train_text_00000    17  28   O   O
## 5     failure train_text_00000    29  36   O   O
## 6           I train_text_00000    38  39   O   O

#Give colnames
colnames(train) <- c("TOKEN", "FILE", "START", "END", "HUM", "NET")
#Check data
head(train)

##       TOKEN             FILE START END HUM NET
## 1     Chief valid_text_00380     0   5   O   O
## 2 Complaint valid_text_00380     6  15   O   O
## 3         : valid_text_00380    15  16   O   O
## 4        GI valid_text_00380    17  19   O   O
## 5     bleed valid_text_00380    20  25   O   O
## 6         . valid_text_00380    25  26   O   O

Show method works

test$TOKEN[1:10]

##  [1] "Chief"       "Complaint"   ":"           "respiratory" "failure"    
##  [6] "I"           "saw"         "and"         "examined"    "the"

test$TOKEN[which(head(match(test$TOKEN, PAL), 100000) > 0)]

##  [1] "care"       "care"       "care"       "care"       "care"      
##  [6] "care"       "care"       "palliative" "care"       "care"      
## [11] "care"       "care"       "care"       "care"       "care"      
## [16] "care"       "care"       "care"       "care"       "care"      
## [21] "care"       "palliative" "care"       "care"       "palliative"
## [26] "care"       "care"       "care"       "care"       "care"      
## [31] "care"       "care"       "care"       "care"       "care"      
## [36] "care"       "care"       "care"       "care"       "palliative"
## [41] "care"       "care"       "care"       "care"       "care"      
## [46] "care"       "care"       "care"       "care"       "care"      
## [51] "care"       "care"       "care"       "care"       "care"      
## [56] "care"       "care"       "care"       "care"       "care"      
## [61] "care"       "care"       "care"       "care"       "care"      
## [66] "care"       "care"       "care"       "care"       "care"      
## [71] "palliative" "care"       "care"

head(ifelse(match(test$TOKEN, CAR) > 0, "B-CAR"), 50)

##  [1] NA      NA      NA      NA      NA      NA      NA      NA     
##  [9] NA      NA      NA      NA      NA      NA      NA      NA     
## [17] NA      NA      NA      NA      "B-CAR" NA      NA      "B-CAR"
## [25] NA      NA      NA      NA      NA      NA      NA      NA     
## [33] NA      NA      NA      NA      NA      NA      NA      NA     
## [41] NA      NA      NA      NA      NA      NA      NA      NA     
## [49] NA      NA

Apply Method For Each Label

apply_regex <- function(dat,lab, res){
  return(ifelse((match(tolower(dat$TOKEN), tolower(lab), nomatch = "0") > 0), res, 'O'))
}

test$RE_CAR <- apply_regex(test, CAR, "B-CAR")
test$RE_COD <- apply_regex(test, COD, "B-COD")
test$RE_FAM <- apply_regex(test, FAM, "B-FAM")
test$RE_LIM <- apply_regex(test, LIM, "B-LIM")
test$RE_PAL <- apply_regex(test, PAL, "B-PAL")

#Show sample of data in tail(), no need to show FILE
#tail(test[,c(1,5,6,7,8,9,10,11)], 900)

write.csv(test, file = "018_test_with_rule-based_NLP_res.txt", row.names = F, quote = F)

train$RE_CAR <- apply_regex(train, CAR, "B-CAR")
train$RE_COD <- apply_regex(train, COD, "B-COD")
train$RE_FAM <- apply_regex(train, FAM, "B-FAM")
train$RE_LIM <- apply_regex(train, LIM, "B-LIM")
train$RE_PAL <- apply_regex(train, PAL, "B-PAL")

write.csv(train, file = "018_train_with_rule-based_NLP_res.txt", row.names = F, quote = F)

modStats <- function(dat, lab, set){
  #Vector to hold results
  rVec <- vector()
  #True Positives
  tp <- 0
  #True Negatives
  tn <- 0
  #False Positives
  fp <- 0
  #False Negatives
  fn <- 0
  
  #tmp to hold dat[[lab]][i]
  tmp <- as.character()
  for (i in 1:nrow(dat)){
    tmp <- dat[[lab]][i]
    
    rVec[length(rVec)+1] <- paste("Token: ", dat$TOKEN[i],"\n", lab, " Assignment: ", tmp, "\n", sep = '')
    
    #If both model and human aren't negative
    if (tmp != 'O' & dat$HUM[i] != 'O'){
      #True positive
      tp <- tp + 1
      
    #if human marks negative and model assigns label
    } else if (dat$HUM[i] == 'O' & tmp != 'O'){
      #False positive
      fp <- fp + 1
      
    #if human marks label and model doesn't assign label
    } else if (dat$HUM[i] != 'O' & tmp == 'O'){
      
      #False negative
      fn <- fn + 1
    
    #if human marks negative and model marks negative
    } else if (tmp == 'O' & dat$HUM[i] == 'O'){
      
      #True negative
      tn <- tn + 1
    }
  }
  
  #Hold results in txt to check if necessary
  
  write.csv(rVec, file = paste(set,"_",lab,"_OUTPUT_25Nov17.txt", sep = ''), quote = F, row.names = F)
  
  tmpFrame <- cbind(tp, tn, fp, fn)
  colnames(tmpFrame) <- c("tp", "tn", "fp", "fn")
  return(tmpFrame)
}


results <- rbind(modStats(test, "RE_CAR", "TEST"),
                modStats(test, "RE_COD", "TEST"),
                modStats(test, "RE_FAM", "TEST"),
                modStats(test, "RE_LIM", "TEST"),
                modStats(test, "RE_PAL", "TEST"))

row.names(results) <- c("RE_CAR", "RE_COD", "RE_FAM", "RE_LIM", "RE_PAL")

resultsTrain <- rbind(modStats(train, "RE_CAR", "TRAIN"),
                modStats(train, "RE_COD", "TRAIN"),
                modStats(train, "RE_FAM", "TRAIN"),
                modStats(train, "RE_LIM", "TRAIN"),
                modStats(train, "RE_PAL", "TRAIN"))


row.names(resultsTrain) <- c("RE_CAR", "RE_COD", "RE_FAM", "RE_LIM", "RE_PAL")

#Check structure of results
str(results)

##  num [1:5, 1:4] 1417 905 396 495 63 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : chr [1:5] "RE_CAR" "RE_COD" "RE_FAM" "RE_LIM" ...
##   ..$ : chr [1:4] "tp" "tn" "fp" "fn"

str(resultsTrain)

##  num [1:5, 1:4] 454 358 81 120 10 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : chr [1:5] "RE_CAR" "RE_COD" "RE_FAM" "RE_LIM" ...
##   ..$ : chr [1:4] "tp" "tn" "fp" "fn"

Model Stats

\[accuracy = \frac{(tp+tn)}{(tp + tn + fp + fn)}\] \[precision = \frac{tp}{(tp+fp)}\] \[accuracy = \frac{tp}{(tp+fn)}\] \[F_1 = 2 \cdot \frac{(precision*recall)}{precision + recall}\]

#Convert to data frame to avoid atomic vectors
results <- as.data.frame(results)
resultsTrain <- as.data.frame(resultsTrain)


modelStats <- function(dat){
  dat$accuracy <- (dat$tp + dat$tn)/(dat$tp + dat$tn + dat$fp + dat$fn)
  dat$precision <- dat$tp/(dat$tp + dat$fp)
  dat$recall <- dat$tp/(dat$tp + dat$fn)
  dat$F1 <- 2*(dat$precision * dat$recall)/(dat$precision + dat$recall)
  return(dat)
}

results <- modelStats(results)
resultsTrain <- modelStats(resultsTrain)

print(results)

##          tp     tn    fp   fn  accuracy  precision     recall         F1
## RE_CAR 1417 512706 18045 3376 0.9600014 0.07280855 0.29563947 0.11684189
## RE_COD  905 530564   187 3888 0.9923909 0.82875458 0.18881702 0.30756160
## RE_FAM  396 524363  6388 4397 0.9798616 0.05837264 0.08262049 0.06841151
## RE_LIM  495 519179 11572 4298 0.9703666 0.04102097 0.10327561 0.05871886
## RE_PAL   63 530064   687 4730 0.9898851 0.08400000 0.01314417 0.02273137

print(resultsTrain)

##         tp     tn   fp   fn  accuracy  precision      recall         F1
## RE_CAR 454 176120 6185  835 0.9617635 0.06838379 0.352211016 0.11453078
## RE_COD 358 182210   95  931 0.9944116 0.79028698 0.277734678 0.41102181
## RE_FAM  81 180019 2286 1208 0.9809689 0.03422053 0.062839410 0.04431072
## RE_LIM 120 178372 3933 1169 0.9722104 0.02960770 0.093095423 0.04492699
## RE_PAL  10 182105  200 1279 0.9919442 0.04761905 0.007757952 0.01334223

write.csv(results, file = "rule-based_NLP_test_results.txt", quote = FALSE)
write.csv(resultsTrain, file = "rule-based_NLP_train_results.txt", quote = FALSE)