#Load phrases
rules <- read.csv("rule-based_NLP_domains.csv", header = T, stringsAsFactors = F)
#Rename columns
colnames(rules) <- c("CAR", "FAM", "LIM", "COD", "PAL")
#Check data
head(rules)
## CAR FAM LIM
## 1 goals of care family discussion dnr
## 2 goc family discussions dnrdni
## 3 goals for care family meeting dni
## 4 goals of treatment family communication do not resuscitate
## 5 goals for treatment durable power of attorney do-not-resuscitate
## 6 treatment goals health care proxy do not intubate
## COD PAL
## 1 full code pallcare
## 2 palliative care
## 3 pall care
## 4 palliative medicine
## 5
## 6
For work-level statistics we will split our phrases on spaces to generate individual tokens. We will store them in each respective variable.
#Split strings on ' ',for tokens use unique() for a union
prep <- function(dat){
return(unique(unlist(strsplit(dat, ' '))))
}
print(CAR <- prep(rules$CAR))
## [1] "goals" "of" "care"
## [4] "goc" "for" "treatment"
## [7] "preferences" "extending" "life"
## [10] "comfort-focused" "supportive" "no"
## [13] "feeding" "tube" "dialysis"
## [16] "hemodialysis" "HD" "quality"
## [19] "priorities" "end" "living"
## [22] "will" "molst" "advance"
## [25] "directives" "planning" "acp"
## [28] "hospice" "full" "code"
## [31] "confirmed" "d/w" "discussed"
## [34] "verified" "would" "like"
## [37] "to" "be" "wishes"
## [40] "remain" "wish" "remaining"
print(COD <- prep(rules$COD))
## [1] "full" "code"
print(FAM <- prep(rules$FAM))
## [1] "family" "discussion" "discussions" "meeting"
## [5] "communication" "durable" "power" "of"
## [9] "attorney" "health" "care" "proxy"
## [13] "hcp" "understanding" "illness" "prognosis"
print(LIM <- prep(rules$LIM))
## [1] "dnr" "dnrdni" "dni"
## [4] "do" "not" "resuscitate"
## [7] "do-not-resuscitate" "intubate" "do-not-intubate"
## [10] "no" "intubation" "chest"
## [13] "compressions" "defibrillation" "shocks"
## [16] "shock" "therapy" "cpr"
## [19] "endotracheal" "mechanical" "ventilator"
## [22] "breathing" "machine" "tube"
## [25] "cmo" "comfort" "measures"
## [28] "care" "limitations" "of"
## [31] "life-sustaining" "treatment" "llst"
print(PAL <- prep(rules$PAL))
## [1] "pallcare" "palliative" "care" "pall" "medicine"
Note: because multiple are not unique to each label there are likely to be multiple labels for the same token.
Note: testing and training is not meaningful for this rule-based regex method, but the model will be applied to both set.
#Load testing/training sets
test <- read.csv("018_train.txt", header = F, stringsAsFactors = F, sep = ' ', quote = "", row.names = NULL)
train <- read.csv("018_valid.txt", header = F, stringsAsFactors = F, sep = ' ', quote = "", row.names = NULL)
#Give colnames
colnames(test) <- c("TOKEN", "FILE", "START", "END", "HUM", "NET")
#Check data
head(test)
## TOKEN FILE START END HUM NET
## 1 Chief train_text_00000 0 5 O O
## 2 Complaint train_text_00000 6 15 O O
## 3 : train_text_00000 15 16 O O
## 4 respiratory train_text_00000 17 28 O O
## 5 failure train_text_00000 29 36 O O
## 6 I train_text_00000 38 39 O O
#Give colnames
colnames(train) <- c("TOKEN", "FILE", "START", "END", "HUM", "NET")
#Check data
head(train)
## TOKEN FILE START END HUM NET
## 1 Chief valid_text_00380 0 5 O O
## 2 Complaint valid_text_00380 6 15 O O
## 3 : valid_text_00380 15 16 O O
## 4 GI valid_text_00380 17 19 O O
## 5 bleed valid_text_00380 20 25 O O
## 6 . valid_text_00380 25 26 O O
test$TOKEN[1:10]
## [1] "Chief" "Complaint" ":" "respiratory" "failure"
## [6] "I" "saw" "and" "examined" "the"
test$TOKEN[which(head(match(test$TOKEN, PAL), 100000) > 0)]
## [1] "care" "care" "care" "care" "care"
## [6] "care" "care" "palliative" "care" "care"
## [11] "care" "care" "care" "care" "care"
## [16] "care" "care" "care" "care" "care"
## [21] "care" "palliative" "care" "care" "palliative"
## [26] "care" "care" "care" "care" "care"
## [31] "care" "care" "care" "care" "care"
## [36] "care" "care" "care" "care" "palliative"
## [41] "care" "care" "care" "care" "care"
## [46] "care" "care" "care" "care" "care"
## [51] "care" "care" "care" "care" "care"
## [56] "care" "care" "care" "care" "care"
## [61] "care" "care" "care" "care" "care"
## [66] "care" "care" "care" "care" "care"
## [71] "palliative" "care" "care"
head(ifelse(match(test$TOKEN, CAR) > 0, "B-CAR"), 50)
## [1] NA NA NA NA NA NA NA NA
## [9] NA NA NA NA NA NA NA NA
## [17] NA NA NA NA "B-CAR" NA NA "B-CAR"
## [25] NA NA NA NA NA NA NA NA
## [33] NA NA NA NA NA NA NA NA
## [41] NA NA NA NA NA NA NA NA
## [49] NA NA
apply_regex <- function(dat,lab, res){
return(ifelse((match(tolower(dat$TOKEN), tolower(lab), nomatch = "0") > 0), res, 'O'))
}
test$RE_CAR <- apply_regex(test, CAR, "B-CAR")
test$RE_COD <- apply_regex(test, COD, "B-COD")
test$RE_FAM <- apply_regex(test, FAM, "B-FAM")
test$RE_LIM <- apply_regex(test, LIM, "B-LIM")
test$RE_PAL <- apply_regex(test, PAL, "B-PAL")
#Show sample of data in tail(), no need to show FILE
#tail(test[,c(1,5,6,7,8,9,10,11)], 900)
write.csv(test, file = "018_test_with_rule-based_NLP_res.txt", row.names = F, quote = F)
train$RE_CAR <- apply_regex(train, CAR, "B-CAR")
train$RE_COD <- apply_regex(train, COD, "B-COD")
train$RE_FAM <- apply_regex(train, FAM, "B-FAM")
train$RE_LIM <- apply_regex(train, LIM, "B-LIM")
train$RE_PAL <- apply_regex(train, PAL, "B-PAL")
write.csv(train, file = "018_train_with_rule-based_NLP_res.txt", row.names = F, quote = F)
modStats <- function(dat, lab, set){
#Vector to hold results
rVec <- vector()
#True Positives
tp <- 0
#True Negatives
tn <- 0
#False Positives
fp <- 0
#False Negatives
fn <- 0
#tmp to hold dat[[lab]][i]
tmp <- as.character()
for (i in 1:nrow(dat)){
tmp <- dat[[lab]][i]
rVec[length(rVec)+1] <- paste("Token: ", dat$TOKEN[i],"\n", lab, " Assignment: ", tmp, "\n", sep = '')
#If both model and human aren't negative
if (tmp != 'O' & dat$HUM[i] != 'O'){
#True positive
tp <- tp + 1
#if human marks negative and model assigns label
} else if (dat$HUM[i] == 'O' & tmp != 'O'){
#False positive
fp <- fp + 1
#if human marks label and model doesn't assign label
} else if (dat$HUM[i] != 'O' & tmp == 'O'){
#False negative
fn <- fn + 1
#if human marks negative and model marks negative
} else if (tmp == 'O' & dat$HUM[i] == 'O'){
#True negative
tn <- tn + 1
}
}
#Hold results in txt to check if necessary
write.csv(rVec, file = paste(set,"_",lab,"_OUTPUT_25Nov17.txt", sep = ''), quote = F, row.names = F)
tmpFrame <- cbind(tp, tn, fp, fn)
colnames(tmpFrame) <- c("tp", "tn", "fp", "fn")
return(tmpFrame)
}
results <- rbind(modStats(test, "RE_CAR", "TEST"),
modStats(test, "RE_COD", "TEST"),
modStats(test, "RE_FAM", "TEST"),
modStats(test, "RE_LIM", "TEST"),
modStats(test, "RE_PAL", "TEST"))
row.names(results) <- c("RE_CAR", "RE_COD", "RE_FAM", "RE_LIM", "RE_PAL")
resultsTrain <- rbind(modStats(train, "RE_CAR", "TRAIN"),
modStats(train, "RE_COD", "TRAIN"),
modStats(train, "RE_FAM", "TRAIN"),
modStats(train, "RE_LIM", "TRAIN"),
modStats(train, "RE_PAL", "TRAIN"))
row.names(resultsTrain) <- c("RE_CAR", "RE_COD", "RE_FAM", "RE_LIM", "RE_PAL")
#Check structure of results
str(results)
## num [1:5, 1:4] 1417 905 396 495 63 ...
## - attr(*, "dimnames")=List of 2
## ..$ : chr [1:5] "RE_CAR" "RE_COD" "RE_FAM" "RE_LIM" ...
## ..$ : chr [1:4] "tp" "tn" "fp" "fn"
str(resultsTrain)
## num [1:5, 1:4] 454 358 81 120 10 ...
## - attr(*, "dimnames")=List of 2
## ..$ : chr [1:5] "RE_CAR" "RE_COD" "RE_FAM" "RE_LIM" ...
## ..$ : chr [1:4] "tp" "tn" "fp" "fn"
\[accuracy = \frac{(tp+tn)}{(tp + tn + fp + fn)}\] \[precision = \frac{tp}{(tp+fp)}\] \[accuracy = \frac{tp}{(tp+fn)}\] \[F_1 = 2 \cdot \frac{(precision*recall)}{precision + recall}\]
#Convert to data frame to avoid atomic vectors
results <- as.data.frame(results)
resultsTrain <- as.data.frame(resultsTrain)
modelStats <- function(dat){
dat$accuracy <- (dat$tp + dat$tn)/(dat$tp + dat$tn + dat$fp + dat$fn)
dat$precision <- dat$tp/(dat$tp + dat$fp)
dat$recall <- dat$tp/(dat$tp + dat$fn)
dat$F1 <- 2*(dat$precision * dat$recall)/(dat$precision + dat$recall)
return(dat)
}
results <- modelStats(results)
resultsTrain <- modelStats(resultsTrain)
print(results)
## tp tn fp fn accuracy precision recall F1
## RE_CAR 1417 512706 18045 3376 0.9600014 0.07280855 0.29563947 0.11684189
## RE_COD 905 530564 187 3888 0.9923909 0.82875458 0.18881702 0.30756160
## RE_FAM 396 524363 6388 4397 0.9798616 0.05837264 0.08262049 0.06841151
## RE_LIM 495 519179 11572 4298 0.9703666 0.04102097 0.10327561 0.05871886
## RE_PAL 63 530064 687 4730 0.9898851 0.08400000 0.01314417 0.02273137
print(resultsTrain)
## tp tn fp fn accuracy precision recall F1
## RE_CAR 454 176120 6185 835 0.9617635 0.06838379 0.352211016 0.11453078
## RE_COD 358 182210 95 931 0.9944116 0.79028698 0.277734678 0.41102181
## RE_FAM 81 180019 2286 1208 0.9809689 0.03422053 0.062839410 0.04431072
## RE_LIM 120 178372 3933 1169 0.9722104 0.02960770 0.093095423 0.04492699
## RE_PAL 10 182105 200 1279 0.9919442 0.04761905 0.007757952 0.01334223
write.csv(results, file = "rule-based_NLP_test_results.txt", quote = FALSE)
write.csv(resultsTrain, file = "rule-based_NLP_train_results.txt", quote = FALSE)