dat <- read.csv("~/Desktop/goals_of_care/cleaned_annotations/op_annotations_112317.csv", header = T, stringsAsFactors = F)
colnames(dat)
## [1] "X"
## [2] "ROW_ID"
## [3] "SUBJECT_ID"
## [4] "HADM_ID"
## [5] "CATEGORY"
## [6] "DESCRIPTION"
## [7] "TEXT"
## [8] "COHORT"
## [9] "Patient.and.Family.Care.Preferences"
## [10] "Patient.and.Family.Care.Preferences.Text"
## [11] "Communication.with.Family"
## [12] "Communication.with.Family.Text"
## [13] "Full.Code.Status"
## [14] "Full.Code.Status.Text"
## [15] "Code.Status.Limitations"
## [16] "Code.Status.Limitations.Text"
## [17] "Palliative.Care.Team.Involvement"
## [18] "Palliative.Care.Team.Involvement.Text"
## [19] "Ambiguous"
## [20] "Ambiguous.Text"
## [21] "Ambiguous.Comments"
## [22] "None"
## [23] "operator"
## [24] "STAMP"
nrow(dat)
## [1] 876
length(unique(dat$TEXT))
## [1] 641
#Load phrases
rules <- read.csv("~/Desktop/goals_of_care/regex/rule-based_NLP_domains.csv", header = T, stringsAsFactors = F)
#Rename columns
colnames(rules) <- c("CAR", "FAM", "LIM", "COD", "PAL")
#Show rules
print(rules)
## CAR FAM
## 1 goals of care family discussion
## 2 goc family discussions
## 3 goals for care family meeting
## 4 goals of treatment family communication
## 5 goals for treatment durable power of attorney
## 6 treatment goals health care proxy
## 7 care preferences hcp
## 8 extending life understanding of illness
## 9 comfort-focused care understanding of prognosis
## 10 supportive care
## 11 no feeding tube
## 12 no dialysis
## 13 no hemodialysis
## 14 no HD
## 15 quality of life
## 16 priorities
## 17 end of life care
## 18 living will
## 19 molst
## 20 advance directives
## 21 advance care planning
## 22 acp
## 23 hospice
## 24 full code confirmed
## 25 full code d/w
## 26 full code discussed
## 27 full code verified
## 28 would like to be full code
## 29 wishes to be full code
## 30 would like to remain full code
## 31 wishes to remain full code
## 32 wish to be full code
## 33 remaining full code
## LIM COD PAL
## 1 dnr full code pallcare
## 2 dnrdni palliative care
## 3 dni pall care
## 4 do not resuscitate palliative medicine
## 5 do-not-resuscitate
## 6 do not intubate
## 7 do-not-intubate
## 8 no intubation
## 9 chest compressions
## 10 no defibrillation
## 11 shocks
## 12 shock therapy
## 13 no cpr
## 14 no endotracheal intubation
## 15 no mechanical intubation
## 16 no ventilator
## 17 no breathing machine
## 18 no breathing tube
## 19 no chest compressions
## 20 cmo
## 21 comfort measures
## 22 comfort care
## 23 limitations of life-sustaining treatment
## 24 llst
## 25
## 26
## 27
## 28
## 29
## 30
## 31
## 32
## 33
strictRegex() will accept all phrases kwds, and all note texts, texts, it will utilize grepl() to find direct matches in the text, and will return a list of booleans.
strictRegex <- function(kwds, texts){
#Create a list to store results
tmpList <- list()
#Loop through all keywords
for (i in 1:length(kwds)){
#Store results as a logical vector in its respective list entry position
tmpList[[i]] <- grepl(kwds[i], texts, ignore.case = TRUE)
}
#Return list and control to environment
return(tmpList)
}
CAR <- strictRegex(rules$CAR[rules$CAR != ''], dat$TEXT)
FAM <- strictRegex(rules$FAM[rules$FAM != ''], dat$TEXT)
LIM <- strictRegex(rules$LIM[rules$LIM != ''], dat$TEXT)
COD <- strictRegex(rules$COD[rules$COD != ''], dat$TEXT)
PAL <- strictRegex(rules$PAL[rules$PAL != ''], dat$TEXT)
to_df <- function(domain, rule){
#Convert list from grepl to data frame
domain <- as.data.frame(domain)
#Show column names as phrases
colnames(domain) <- rule[rule != '']
#Multiply by 1 for binary numeric
domain <- domain*1
return(domain)
}
CAR <- to_df(CAR, rules$CAR)
FAM <- to_df(FAM, rules$FAM)
LIM <- to_df(LIM, rules$LIM)
COD <- to_df(COD, rules$COD)
PAL <- to_df(PAL, rules$PAL)
phrase_to_domain <- function(dat){
inc <- vector()
for (i in 1:nrow(dat)){
#Collapse phrases by domain, presence of any phrase indicates domain
inc[i] <- any(dat[i,] == 1)
}
#Multiply by one to convert logical to binary numeric
return(inc*1)
}
res <- cbind(phrase_to_domain(CAR),
phrase_to_domain(FAM),
phrase_to_domain(LIM),
phrase_to_domain(COD),
phrase_to_domain(PAL))
colnames(res) <- c("RE_CAR", "RE_FAM", "RE_LIM", "RE_COD", "RE_PAL")
res <- as.data.frame(res)
head(res)
## RE_CAR RE_FAM RE_LIM RE_COD RE_PAL
## 1 0 0 0 1 0
## 2 0 0 0 1 0
## 3 0 0 0 1 0
## 4 0 0 1 0 1
## 5 0 1 0 1 0
## 6 0 0 1 0 0
nrow(res)
## [1] 876
#Merge res and data
res <- cbind(dat, res)
Remove duplicates, keep first observation of the note, as the first observation will have the most information, subsequent annotations will have additional phrases associated with domains.
res <- res[!duplicated(res$TEXT),]
statGen <- function(dat, hum_lab, re_lab, set){
#Vector to hold results
rVec <- vector()
#True Positives
tp <- 0
#True Negatives
tn <- 0
#False Positives
fp <- 0
#False Negatives
fn <- 0
#tmp to hold dat[[lab]][i]
re_tmp <- as.character()
hum_tmp <- as.character()
for (i in 1:nrow(dat)){
re_tmp <- dat[[re_lab]][i]
hum_tmp <- dat[[hum_lab]][i]
rVec[length(rVec)+1] <- paste("Token: ", dat$TEXT[i],"\n", re_lab, " Assignment: ", re_tmp, "\n", sep = '')
#If both model and human aren't negative
if (re_tmp != 0 & hum_tmp != 0 & !is.na(hum_tmp)){
#True positive
tp <- tp + 1
#if human marks negative and model assigns label
} else if (hum_tmp == 0 & re_tmp != 0 & !is.na(hum_tmp)){
#False positive
fp <- fp + 1
#if human marks label and model doesn't assign label
} else if (hum_tmp != 0 & re_tmp == 0 & !is.na(hum_tmp)){
#False negative
fn <- fn + 1
#if human marks negative and model marks negative
} else if (re_tmp == 0 & hum_tmp == 0 & !is.na(hum_tmp)){
#True negative
tn <- tn + 1
}
}
#Hold results in txt to check if necessary
write.csv(rVec,
file = paste(set,"_",re_lab,"_OUTPUT_NOTE_LEVEL_26Nov17.txt", sep = ''),
quote = F, row.names = F)
tmpFrame <- cbind(tp, tn, fp, fn)
colnames(tmpFrame) <- c("tp", "tn", "fp", "fn")
return(as.data.frame(tmpFrame))
}
results <- rbind(statGen(res, "Patient.and.Family.Care.Preferences", "RE_CAR", "ALL"),
statGen(res, "Full.Code.Status", "RE_COD", "ALL"),
statGen(res, "Communication.with.Family", "RE_FAM", "ALL"),
statGen(res, "Code.Status.Limitations","RE_LIM", "ALL"),
statGen(res, "Palliative.Care.Team.Involvement", "RE_PAL", "ALL"))
row.names(results) <- c("RE_CAR", "RE_COD", "RE_FAM", "RE_LIM", "RE_PAL")
\[accuracy = \frac{(tp+tn)}{(tp + tn + fp + fn)}\] \[precision = \frac{tp}{(tp+fp)}\] \[accuracy = \frac{tp}{(tp+fn)}\] \[F_1 = 2 \cdot \frac{(precision*recall)}{precision + recall}\]
modelStats <- function(dat){
accuracy <- (dat$tp + dat$tn)/(dat$tp + dat$tn + dat$fp + dat$fn)
precision <- dat$tp/(dat$tp + dat$fp)
recall <- dat$tp/(dat$tp + dat$fn)
dat$accuracy <- paste(sprintf("%2.2f", round(100*accuracy,2)), '%', sep = '')
dat$precision <- paste(sprintf("%2.2f", round(100*precision,2)), '%', sep = '')
dat$recall <- paste(sprintf("%2.2f", round(100*recall,2)), '%', sep = '')
dat$F1 <- round(100*2*(precision * recall)/(precision + recall),2)
return(dat)
}
results <- modelStats(results)
print(results)
## tp tn fp fn accuracy precision recall F1
## RE_CAR 45 447 56 93 76.76% 44.55% 32.61% 37.66
## RE_COD 272 163 15 48 87.35% 94.77% 85.00% 89.62
## RE_FAM 82 398 73 88 74.88% 52.90% 48.24% 50.46
## RE_LIM 195 300 82 64 77.22% 70.40% 75.29% 72.76
## RE_PAL 9 617 14 1 97.66% 39.13% 90.00% 54.55