This is a rule-based patient note analysis that captures patient goals of care (GOC) and advance care planning (ACP) from text at the note level. We use text phrases generated by clinicians who annotated notes from the MIMIC-III intensive care unit database to build a Bag of Words (BoW) model.
The BoW model does not use grammar or semantic relatedness; it classifies text without regard to the spatial arrangement of the words that make it up. The model relies only on the presence or absence of words known to be associated with GOC or ACP (as identified by the clinicians who generated these rules during annotation).
This BoW model serves as a rule-based comparator for a deep-learning named-entity recognition method, NeuroNER, which does account for grammar and semantic relatedness.
Note: Aggregated data are representative of actual MIMIC data, but to maintain patient privacy, all strings here that appear to be patient note text were invented by the author as representative cases.
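As a minimal illustration of the presence/absence idea (toy strings only, consistent with the privacy note above; not part of the analysis pipeline):
#Toy example: score a note by the number of dictionary words it contains
dictionary <- c("dnr", "comfort", "hospice")
note <- "pt remains dnr dni, transitioned to comfort measures"
sum(sapply(dictionary, grepl, x = note, ignore.case = TRUE))
#Expected: 2 ("dnr" and "comfort" are present; "hospice" is not)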
library("ggplot2")
library("data.table")
library("DescTools")
library("pscl")
library("PresenceAbsence")
load_data() accepts a label, lab; a set identifier, set (train/valid); and the map from file to row_id, mp. It returns the portion of the map corresponding to the requested set.
load_data <- function(lab, set, mp){
  #Read in data
  tmp <- read.csv(paste("~/goals_of_care/regex_v2/",lab,"_",set,"_processed.csv", sep = ''),
                  header = T, stringsAsFactors = F, quote = "", row.names = NULL)
  #Change "filename" to "file" for consistency
  colnames(tmp)[which(colnames(tmp) == "filename")] <- "file"
  #Subset unique filenames
  tmp <- unique(tmp$file)
  #Take substring of names to ensure we can map across both train_/valid_ files
  tmp <- substr(tmp, nchar(tmp) - 9, nchar(tmp))
  #Subset files in this set from the key map
  tmp <- mp[(mp$file %in% tmp),]
  #Return the mapped data
  return(tmp)
}
strict_regex() accepts the keyword tokens, kwds, and all note texts, texts. It uses grepl() to find direct matches in the texts and returns a list of logical vectors, one per keyword.
strict_regex <- function(kwds, texts){
  #Create a list to store results
  tmpList <- list()
  #Loop through all keywords
  for (i in 1:length(kwds)){
    #Store results as a logical vector in its respective list entry position
    tmpList[[i]] <- grepl(kwds[i], texts, ignore.case = TRUE)
  }
  #Return list and control to environment
  return(tmpList)
}
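A toy illustration of the return shape (hypothetical keywords and texts, not drawn from the data):
#Toy example: two keywords checked against three short texts
toy_kwds <- c("dnr", "comfort")
toy_texts <- c("pt is dnr dni", "family meeting held", "comfort measures only")
strict_regex(toy_kwds, toy_texts)
#Expected: a list of two logical vectors, each of length three
#(TRUE FALSE FALSE for "dnr"; FALSE FALSE TRUE for "comfort")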
to_df() converts the list returned by strict_regex() to a data.frame, applies the rule tokens as column names to indicate what each variable represents, and recodes the logical values as 0/1.
to_df <- function(domain, rule){
  #Convert list from grepl to data frame
  domain <- as.data.frame(domain)
  #Set column names to the rule phrases
  colnames(domain) <- rule[rule != '']
  #Multiply by 1 to convert logical to binary numeric
  domain <- domain*1
  return(domain)
}
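Continuing the toy example above, to_df() turns that list into a binary indicator frame:
#Reuses toy_kwds/toy_texts from the strict_regex() sketch above
to_df(strict_regex(toy_kwds, toy_texts), toy_kwds)
#Expected: a 3 x 2 data frame; column "dnr" is 1,0,0 and column "comfort" is 0,0,1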
clean_text() accepts a character vector of note texts, tokens, as well as a boolean, printout. It removes carriage returns and newlines, removes de-identification obfuscations, strips everything but alphanumeric characters and spaces, collapses repeated whitespace, and converts the text to lowercase. If printout is TRUE, it prints example text after each step.
clean_text <- function(tokens, printout){
  #Create a fake patient note phrase as a representative sample
  ex_token <- "Example note:\nThe patient is a 81yo m who was found down in [** location **] on [** date **] by daughter, [** name **].\n Pt was in usual state of health until four days ago, when began to complain to family of heartburn for which the pt was taking tums in addition to his prescribed PPI, without resolution."
  if (printout){
    print(substr(ex_token, 1, 100))
  }
  #Remove carriage returns and newlines, convert to lower
  tokens <- tolower(gsub('\r', ' ', tokens))
  tokens <- tolower(gsub('\n', ' ', tokens))
  ex_token <- tolower(gsub('\n', ' ', ex_token))
  if (printout){
    cat("Result after removing carriage returns:\n")
    print(substr(ex_token, 1, 100))
  }
  #https://stackoverflow.com/questions/13529360/replace-text-within-parenthesis-in-r
  #Remove de-identification obfuscations between '[' and ']'
  tokens <- gsub(" *\\[.*?\\] *", ' ', tokens)
  ex_token <- gsub(" *\\[.*?\\] *", ' ', ex_token)
  if (printout){
    cat("Result after removing [obfuscation] placeholders:\n")
    print(substr(ex_token, 1, 100))
  }
  #Keep only alphanumeric characters and spaces
  tokens <- gsub("[^[:alnum:][:space:]]", '', tokens)
  ex_token <- gsub("[^[:alnum:][:space:]]", '', ex_token)
  if (printout){
    cat("Result after removing all but alphanumeric and spaces:\n")
    print(substr(ex_token, 1, 100))
  }
  #Collapse runs of whitespace to a single space, trim leading/trailing spaces
  #https://stackoverflow.com/questions/25707647/merge-multiple-spaces-to-single-space-remove-trailing-leading-spaces
  tokens <- gsub("(?<=[\\s])\\s*|^\\s+|\\s+$", '', tokens, perl=TRUE)
  ex_token <- gsub("(?<=[\\s])\\s*|^\\s+|\\s+$", '', ex_token, perl=TRUE)
  if (printout){
    cat("Result after keeping only single spaces:\n")
    print(substr(ex_token, 1, 100))
  }
  return(tokens)
}
bow() accepts a character vector, phrases, splits each phrase on spaces, drops empty tokens, and returns a character vector of tokens.
bow <- function(phrases){
  tmp <- unlist(strsplit(phrases, ' '))
  tmp <- tmp[tmp != '']
  return(tmp)
}
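For example (hypothetical phrases):
bow(c("dnr dni", "comfort measures  only"))
#Expected: "dnr" "dni" "comfort" "measures" "only" (the doubled space yields no empty token)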
plot_bow() accepts a character vector, bows, containing the bag of words, along with a number, n, and a label, lab. It tabulates the tokens and plots the n most frequent words in a barplot whose title contains the label.
plot_bow <- function(bows, n, lab){
  tmp_tab <- table(bows)[rev(order(table(bows)))]
  par(mai=c(1,2,1,1))
  barplot(rev(head(tmp_tab, n)), horiz = T, las = 1,
          main = paste("Most Frequent Words in the ",lab ," Phrase Dictionary", sep = ''),
          xlab = "Frequency")
}
cohort_gen() accepts the annotation data, dat, together with train_set and valid_set, assigns each note to its cohort ("train", "validation", or "" if unassigned), and returns the annotation data frame.
cohort_gen <- function(dat, train_set, valid_set){
  dat$COHORT <- rep('', each = nrow(dat))
  for (i in 1:nrow(dat)){
    if (dat$ROW_ID[i] %in% train_set$row_id){
      dat$COHORT[i] <- "train"
    } else if (dat$ROW_ID[i] %in% valid_set$row_id){
      dat$COHORT[i] <- "validation"
    } else {
      dat$COHORT[i] <- ""
    }
  }
  return(dat)
}
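A vectorized equivalent of the row-wise loop (a sketch, not used by the pipeline) would be:
#Sketch: nested ifelse() replaces the loop inside cohort_gen()
cohort_gen_vec <- function(dat, train_set, valid_set){
  dat$COHORT <- ifelse(dat$ROW_ID %in% train_set$row_id, "train",
                       ifelse(dat$ROW_ID %in% valid_set$row_id, "validation", ""))
  return(dat)
}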
standardize() standardizes numeric data to zero mean and unit standard deviation (a z-score).
standardize <- function(dat){
  return((dat - mean(dat))/sd(dat))
}
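A quick check of the expected behavior:
standardize(c(1, 2, 3))
#Expected: -1 0 1 (mean 2, standard deviation 1)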
model_info <- function(fit){
  #Summary info
  model_sum <- summary(fit)
  #Odds ratios with profile-likelihood confidence intervals
  odds_ratio <- cbind(OR = exp(fit$coef), exp(confint(fit)))
  #Create list for return
  my_list <- list(model_sum, odds_ratio)
  #Name the list elements
  names(my_list) <- c("Model Summary","OR Summary")
  return(my_list)
}
confusion_data <- function(fit){
  #Create variables
  f_val <- vector()
  #Threshold sequence
  threshold <- seq(0.1, 0.9, by = .01)
  for (i in 1:length(threshold)){
    #F1 calculation from confusion matrix
    f_val[i] <- Conf(fit, cutoff = threshold[i])$byclass[9]
  }
  #Confusion matrix at the F1-maximizing cutoff
  cutoff <- threshold[which.max(f_val)]
  conf_mat <- Conf(fit, cutoff = cutoff)
  cat("Maximum F1 is achieved at a cutoff of: ", cutoff, '\n', sep = '')
  #Plot
  layout(matrix(1:2, ncol = 2))
  plot(threshold, f_val, type = "l", main = "Cutoff Based on F1 Score")
  abline(h = max(f_val), v = cutoff, col = "red")
  fourfoldplot(conf_mat$table, main = "Confusion Matrix Plot",
               color = c("red","green"))
  return(conf_mat)
}
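For reference, the F1 value being swept over is the harmonic mean of precision and recall; a small sketch with hypothetical confusion-matrix counts:
#Hypothetical counts: 40 true positives, 10 false positives, 20 false negatives
tp <- 40; fp <- 10; fn <- 20
precision <- tp / (tp + fp)
recall <- tp / (tp + fn)
2 * precision * recall / (precision + recall)
#Expected: about 0.727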
roc_plot <- function(observed, fit, lab){
  df <- data.frame(ID = 1:length(observed), observed = observed,
                   predicted = predict(fit, type = "response"))
  #Maximize PCC (percent correctly classified)
  auc.roc.plot(df, opt.methods = 5, color = TRUE, main = paste(lab, "AUC ROC Plot", sep = ' '))
}
#Read csv
dat <- fread("~/goals_of_care/regex_v3/op_annotations_122017.csv",
header = T, stringsAsFactors = F)
#Load the note(s) with missing data (to be removed)
missing_note <- fread("~/goals_of_care/unmatched_note_result.csv", header = T, stringsAsFactors = F)
nrow(missing_note)
## [1] 1
#Remove note missing data
dat <- dat[!dat$ROW_ID %in% missing_note$ROW_ID, ]
#Make the names easier to deal with:
colnames(dat)[which(colnames(dat) == "Code Status Limitations")] <- "LIM"
colnames(dat)[which(colnames(dat) == "Full Code Status")] <- "COD"
colnames(dat)[which(colnames(dat) == "Patient and Family Care Preferences")] <- "CAR"
colnames(dat)[which(colnames(dat) == "Palliative Care Team Involvement")] <- "PAL"
colnames(dat)[which(colnames(dat) == "Communication with Family")] <- "FAM"
#CIM has no raw annotation column; it is derived below from LIM and CAR
#Check the data
colnames(dat)
## [1] "V1"
## [2] "ROW_ID"
## [3] "SUBJECT_ID"
## [4] "HADM_ID"
## [5] "CATEGORY"
## [6] "DESCRIPTION"
## [7] "TEXT"
## [8] "COHORT"
## [9] "CAR"
## [10] "Patient and Family Care Preferences Text"
## [11] "FAM"
## [12] "Communication with Family Text"
## [13] "COD"
## [14] "Full Code Status Text"
## [15] "LIM"
## [16] "Code Status Limitations Text"
## [17] "PAL"
## [18] "Palliative Care Team Involvement Text"
## [19] "Ambiguous"
## [20] "Ambiguous Text"
## [21] "Ambiguous Comments"
## [22] "None"
## [23] "STAMP"
## [24] "operator"
## [25] "original_filename"
nrow(dat)
## [1] 1496
row_id_to_file_num.txt contains the key used to convert file (as output by NeuroNER) to row_id from MIMIC-III NOTEEVENTS, which served as the source text for generating the annotated data set.
#Contains map of files (BRAT format for neuroNER) to row_id (MIMIC NOTEEVENTS)
map <- fread("~/goals_of_care/summary_stats/row_id_to_file_num.txt", header = F, stringsAsFactors = F, sep = '\t')
head(map)
## V1 V2
## 1: 407635 text_00000
## 2: 345097 text_00001
## 3: 371117 text_00002
## 4: 401545 text_00003
## 5: 393792 text_00004
## 6: 382263 text_00005
#Add column names for ease of merging
colnames(map) <- c("row_id", "file")
#Code Status Limitations
lim_train_set <- load_data("LIM", "train", map)
lim_valid_set <- load_data("LIM", "valid", map)
#Full Code
cod_train_set <- load_data("COD", "train", map)
cod_valid_set <- load_data("COD", "valid", map)
#Patient/Family Care Preferences
car_train_set <- load_data("CAR", "train", map)
car_valid_set <- load_data("CAR", "valid", map)
#Palliative Care Consult
pal_train_set <- load_data("PAL", "train", map)
pal_valid_set <- load_data("PAL", "valid", map)
#Family meeting held
fam_train_set <- load_data("FAM", "train", map)
fam_valid_set <- load_data("FAM", "valid", map)
#dat <- cohort_gen(dat, train_set, valid_set)
lim_dat <- cohort_gen(dat, lim_train_set, lim_valid_set)
cod_dat <- cohort_gen(dat, cod_train_set, cod_valid_set)
car_dat <- cohort_gen(dat, car_train_set, car_valid_set)
pal_dat <- cohort_gen(dat, pal_train_set, pal_valid_set)
fam_dat <- cohort_gen(dat, fam_train_set, fam_valid_set)
#apply train/valid cohort information to dat
if (identical(dat$ROW_ID, lim_dat$ROW_ID)){
dat$COHORT <- lim_dat$COHORT
}
levels(factor(dat$COHORT))
## [1] "train" "validation"
#Create CIM: positive if either LIM or CAR is positive
cim_dat <- lim_dat
cim <- vector()
for (i in 1:nrow(lim_dat)){
  if (lim_dat$LIM[i] == 1 | lim_dat$CAR[i] == 1){
    cim[length(cim)+1] <- 1
  } else {
    cim[length(cim)+1] <- 0
  }
}
#Populate CIM data
cim_dat$CIM <- cim
#Add to dat
dat$CIM <- cim
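Equivalently (a sketch, assuming no missing values in LIM or CAR), the loop above could be replaced with a single vectorized line:
#Sketch: CIM is 1 whenever either LIM or CAR is annotated as 1
cim_alt <- as.integer(lim_dat$LIM == 1 | lim_dat$CAR == 1)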
Note: Only use phrases from the training set.
##Only use phrases from training set
lim_phrases <- lim_dat$`Code Status Limitations Text`[lim_dat$COHORT == "train"]
cod_phrases <- cod_dat$`Full Code Status Text`[cod_dat$COHORT == "train"]
car_phrases <- car_dat$`Patient and Family Care Preferences Text`[car_dat$COHORT == "train"]
pal_phrases <- pal_dat$`Palliative Care Team Involvement Text`[pal_dat$COHORT == "train"]
fam_phrases <- fam_dat$`Communication with Family Text`[fam_dat$COHORT == "train"]
##remove empty observations ""
lim_phrases <- lim_phrases[lim_phrases != '']
cod_phrases <- cod_phrases[cod_phrases != '']
car_phrases <- car_phrases[car_phrases != '']
pal_phrases <- pal_phrases[pal_phrases != '']
fam_phrases <- fam_phrases[fam_phrases != '']
#CIM is a combination of care preferences/code status limitations
cim_phrases <- c(car_phrases, lim_phrases)
##Remove duplicate observations (keep first)
dat <- dat[!duplicated(dat$TEXT), ]
txts <- clean_text(dat$TEXT, TRUE)
## [1] "Example note:\nThe patient is a 81yo m who was found down in [** location **] on [** date **] by daug"
## Result after removing carriage returns:
## [1] "example note: the patient is a 81yo m who was found down in [** location **] on [** date **] by daug"
## Result after removing [obfuscation] placeholders:
## [1] "example note: the patient is a 81yo m who was found down in on by daughter, . pt was in usual sta"
## Result after removing all but alphanumeric and spaces:
## [1] "example note the patient is a 81yo m who was found down in on by daughter pt was in usual state "
## Result after keeping only single spaces:
## [1] "example note the patient is a 81yo m who was found down in on by daughter pt was in usual state of h"
lim_phrases <- clean_text(lim_phrases, FALSE)
cod_phrases <- clean_text(cod_phrases, FALSE)
car_phrases <- clean_text(car_phrases, FALSE)
pal_phrases <- clean_text(pal_phrases, FALSE)
fam_phrases <- clean_text(fam_phrases, FALSE)
#
cim_phrases <- clean_text(cim_phrases, FALSE)
#Create bag of words by splitting phrases on spaces and unlisting result
lim_bow <- bow(lim_phrases)
cod_bow <- bow(cod_phrases)
car_bow <- bow(car_phrases)
pal_bow <- bow(pal_phrases)
fam_bow <- bow(fam_phrases)
#
cim_bow <- bow(cim_phrases)
plot_bow(lim_bow, 20, "LIM")
plot_bow(cod_bow, 20, "COD")
plot_bow(car_bow, 20, "CAR")
plot_bow(pal_bow, 20, "PAL")
plot_bow(fam_bow, 20, "FAM")
plot_bow(cim_bow, 20, "CIM")
#Use only unique tokens
lim_bow <- unique(lim_bow)
cat(length(lim_bow), "Unique Tokens in LIM BoW\n")
## 375 Unique Tokens in LIM BoW
cod_bow <- unique(cod_bow)
cat(length(cod_bow), "Unique Tokens in COD BoW\n")
## 152 Unique Tokens in COD BoW
car_bow <- unique(car_bow)
cat(length(car_bow), "Unique Tokens in CAR BoW\n")
## 705 Unique Tokens in CAR BoW
pal_bow <- unique(pal_bow)
cat(length(pal_bow), "Unique Tokens in PAL BoW\n")
## 113 Unique Tokens in PAL BoW
fam_bow <- unique(fam_bow)
cat(length(fam_bow), "Unique Tokens in FAM BoW\n")
## 639 Unique Tokens in FAM BoW
#CIM
cim_bow <- unique(cim_bow)
cat(length(cim_bow), "Unique Tokens in CIM BoW\n")
## 763 Unique Tokens in CIM BoW
#Strict regex
lim_tmp <- strict_regex(lim_bow, txts)
cod_tmp <- strict_regex(cod_bow, txts)
car_tmp <- strict_regex(car_bow, txts)
pal_tmp <- strict_regex(pal_bow, txts)
fam_tmp <- strict_regex(fam_bow, txts)
#CIM
cim_tmp <- strict_regex(cim_bow, txts)
#Convert list to data.frame
lim_tmp <- to_df(lim_tmp, lim_bow)
cod_tmp <- to_df(cod_tmp, cod_bow)
car_tmp <- to_df(car_tmp, car_bow)
pal_tmp <- to_df(pal_tmp, pal_bow)
fam_tmp <- to_df(fam_tmp, fam_bow)
#CIM
cim_tmp <- to_df(cim_tmp, cim_bow)
#Create score, apply sum row-wise (each column is a token)
lim_tmp$bow_score <- apply(lim_tmp, 1, sum)
cod_tmp$bow_score <- apply(cod_tmp, 1, sum)
car_tmp$bow_score <- apply(car_tmp, 1, sum)
pal_tmp$bow_score <- apply(pal_tmp, 1, sum)
fam_tmp$bow_score <- apply(fam_tmp, 1, sum)
#CIM
cim_tmp$bow_score <- apply(cim_tmp, 1, sum)
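Each bow_score is therefore the count of distinct dictionary tokens that occur at least once in a note; rowSums() over the 0/1 indicator columns gives the same value. A quick consistency check (a sketch, not part of the original pipeline):
#Sketch: bow_score equals the row sum of the token indicator columns
stopifnot(all(rowSums(lim_tmp[, colnames(lim_tmp) != "bow_score"]) == lim_tmp$bow_score))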
#Assign bow_scores to annotated data
dat <- cbind(dat,
lim_tmp$bow_score,
cod_tmp$bow_score,
car_tmp$bow_score,
pal_tmp$bow_score,
fam_tmp$bow_score,
cim_tmp$bow_score)
#Assign column names
colnames(dat)[27:32] <- c("lim_bow_score",
"cod_bow_score",
"car_bow_score",
"pal_bow_score",
"fam_bow_score",
"cim_bow_score")
boxplot(dat$lim_bow_score,
dat$cod_bow_score,
dat$car_bow_score,
dat$pal_bow_score,
dat$fam_bow_score,
dat$cim_bow_score, main = "Bag of Word Score Distribution by Domain\n (LIM) (COD) (CAR) (PAL) (FAM) (CIM)")
summary(dat$lim_bow_score)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 41.0 97.0 112.0 111.7 127.2 175.0
summary(dat$cod_bow_score)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 27.00 51.00 58.00 58.31 67.00 83.00
summary(dat$car_bow_score)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 60.0 150.0 173.0 173.8 199.0 282.0
summary(dat$pal_bow_score)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 19.00 39.00 45.00 44.91 51.00 72.00
summary(dat$fam_bow_score)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 63 144 165 166 190 261
summary(dat$cim_bow_score)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 64.0 159.0 184.0 184.8 212.0 299.0
#Save Data
#write.csv(dat, file = "~/goals_of_care/cleaned_annotations/op_annot_bow_scores_23Jan18.csv", row.names = F)
dat$lim_bow_score <- standardize(dat$lim_bow_score)
dat$cod_bow_score <- standardize(dat$cod_bow_score)
dat$car_bow_score <- standardize(dat$car_bow_score)
dat$pal_bow_score <- standardize(dat$pal_bow_score)
dat$fam_bow_score <- standardize(dat$fam_bow_score)
dat$cim_bow_score <- standardize(dat$cim_bow_score)
boxplot(dat$lim_bow_score,
dat$cod_bow_score,
dat$car_bow_score,
dat$pal_bow_score,
dat$fam_bow_score,
dat$cim_bow_score, main = "Bag of Word Score Standardized Distribution by Domain\n (LIM) (COD) (CAR) (PAL) (FAM) (CIM)")
summary(dat$lim_bow_score)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -3.08100 -0.64210 0.01123 0.00000 0.67550 2.75500
summary(dat$cod_bow_score)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -2.79000 -0.65140 -0.02784 0.00000 0.77390 2.19900
summary(dat$car_bow_score)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -3.08800 -0.64530 -0.02091 0.00000 0.68490 2.93800
summary(dat$pal_bow_score)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -3.05700 -0.69750 0.01032 0.00000 0.71810 3.19500
summary(dat$fam_bow_score)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -3.02300 -0.64550 -0.02908 0.00000 0.70470 2.78900
summary(dat$cim_bow_score)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -3.07100 -0.65520 -0.01947 0.00000 0.69260 2.90500
#Subset general training/testing cohorts
train <- dat[(dat$COHORT == "train"),]
#Need to remove NAs for Harry's annotations
cod_train <- na.omit(train)
valid <- dat[(dat$COHORT == "validation"),]
#Need to remove NAs for Harry's annotations
cod_valid <- na.omit(valid)
#LIM
lim_reg <- glm(LIM ~ lim_bow_score, family = binomial(link = 'logit'), data = train)
#COD
cod_reg <- glm(COD ~ cod_bow_score, family = binomial(link = 'logit'), data = cod_train)
#CAR
car_reg <- glm(CAR ~ car_bow_score, family = binomial(link = 'logit'), data = train)
#PAL
pal_reg <- glm(PAL ~ pal_bow_score, family = binomial(link = 'logit'), data = train)
#FAM
fam_reg <- glm(FAM ~ fam_bow_score, family = binomial(link = 'logit'), data = train)
#CIM
cim_reg <- glm(CIM ~ cim_bow_score, family = binomial(link = 'logit'), data = train)
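Each fit regresses a binary domain label on its standardized BoW score through a logit link, so exp(coefficient) is an odds ratio per one-standard-deviation increase in score, and predicted probabilities come from the inverse logit of the linear predictor. An illustrative sketch (not part of the pipeline):
#Sketch: predicted probability of LIM = 1 for a note one SD above the mean BoW score
plogis(coef(lim_reg)[1] + coef(lim_reg)[2] * 1)
#Equivalent to predict(lim_reg, data.frame(lim_bow_score = 1), type = "response")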
model_info(lim_reg)
## Waiting for profiling to be done...
## $`Model Summary`
##
## Call:
## glm(formula = LIM ~ lim_bow_score, family = binomial(link = "logit"),
## data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.2702 -1.0462 -0.9308 1.2896 1.5524
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.37828 0.09875 -3.831 0.000128 ***
## lim_bow_score 0.22621 0.09949 2.274 0.022988 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 608.78 on 447 degrees of freedom
## Residual deviance: 603.53 on 446 degrees of freedom
## AIC: 607.53
##
## Number of Fisher Scoring iterations: 4
##
##
## $`OR Summary`
## OR 2.5 % 97.5 %
## (Intercept) 0.6850378 0.5635606 0.8302389
## lim_bow_score 1.2538426 1.0331444 1.5270322
confusion_data(lim_reg)
## Maximum F1 is achieved at a cutoff of: 0.56
##
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 261 186
## 1 0 1
##
## Accuracy : 0.5848
## 95% CI : (0.5387, 0.6295)
## No Information Rate : 0.5826
## P-Value [Acc > NIR] : 0.4819
##
## Kappa : 0.0062
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 1.0000
## Specificity : 0.0053
## Pos Pred Value : 0.5839
## Neg Pred Value : 1.0000
## Prevalence : 0.5826
## Detection Rate : 0.5826
## Detection Prevalence : 0.5826
## Balanced Accuracy : 0.5027
## F-val Accuracy : 0.7373
##
## 'Positive' Class : 0
roc_plot(train$LIM, lim_reg, "LIM")
model_info(cod_reg)
## Waiting for profiling to be done...
## $`Model Summary`
##
## Call:
## glm(formula = COD ~ cod_bow_score, family = binomial(link = "logit"),
## data = cod_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.4727 -1.4126 0.9312 0.9542 0.9958
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.55668 0.11166 4.986 6.18e-07 ***
## cod_bow_score 0.05231 0.11246 0.465 0.642
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 466.19 on 355 degrees of freedom
## Residual deviance: 465.98 on 354 degrees of freedom
## AIC: 469.98
##
## Number of Fisher Scoring iterations: 4
##
##
## $`OR Summary`
## OR 2.5 % 97.5 %
## (Intercept) 1.744867 1.4046036 2.176963
## cod_bow_score 1.053708 0.8450834 1.314566
confusion_data(cod_reg)
## Maximum F1 is achieved at a cutoff of: 0.1
##
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 0
## 1 227 129
## 0 0 0
##
## Accuracy : 0.6376
## 95% CI : (0.5865, 0.6859)
## No Information Rate : 0.6376
## P-Value [Acc > NIR] : 0.524
##
## Kappa : 0.0000
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 1.0000
## Specificity : 0.0000
## Pos Pred Value : 0.6376
## Neg Pred Value : NA
## Prevalence : 0.6376
## Detection Rate : 0.6376
## Detection Prevalence : 0.6376
## Balanced Accuracy : 0.5000
## F-val Accuracy : 0.7787
##
## 'Positive' Class : 1
roc_plot(cod_train$COD, cod_reg, "COD")
model_info(car_reg)
## Waiting for profiling to be done...
## $`Model Summary`
##
## Call:
## glm(formula = CAR ~ car_bow_score, family = binomial(link = "logit"),
## data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.0780 -0.7810 -0.6783 -0.4808 2.1240
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.2156 0.1184 -10.266 < 2e-16 ***
## car_bow_score 0.3946 0.1162 3.395 0.000686 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 501.64 on 447 degrees of freedom
## Residual deviance: 489.64 on 446 degrees of freedom
## AIC: 493.64
##
## Number of Fisher Scoring iterations: 4
##
##
## $`OR Summary`
## OR 2.5 % 97.5 %
## (Intercept) 0.2965439 0.2336425 0.3719443
## car_bow_score 1.4837556 1.1849924 1.8705131
confusion_data(car_reg)
## Maximum F1 is achieved at a cutoff of: 0.41
##
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 335 107
## 1 2 4
##
## Accuracy : 0.7567
## 95% CI : (0.7149, 0.7941)
## No Information Rate : 0.7522
## P-Value [Acc > NIR] : 0.4384
##
## Kappa : 0.0441
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9941
## Specificity : 0.0360
## Pos Pred Value : 0.7579
## Neg Pred Value : 0.6667
## Prevalence : 0.7522
## Detection Rate : 0.7478
## Detection Prevalence : 0.7522
## Balanced Accuracy : 0.5151
## F-val Accuracy : 0.8601
##
## 'Positive' Class : 0
roc_plot(train$CAR, car_reg, "CAR")
model_info(pal_reg)
## Waiting for profiling to be done...
## $`Model Summary`
##
## Call:
## glm(formula = PAL ~ pal_bow_score, family = binomial(link = "logit"),
## data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.8100 -0.1999 -0.1110 -0.0615 3.8093
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.3004 0.6935 -7.643 2.12e-14 ***
## pal_bow_score 1.6712 0.4210 3.969 7.20e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 88.154 on 447 degrees of freedom
## Residual deviance: 68.366 on 446 degrees of freedom
## AIC: 72.366
##
## Number of Fisher Scoring iterations: 8
##
##
## $`OR Summary`
## OR 2.5 % 97.5 %
## (Intercept) 0.004989763 0.001009654 0.01586794
## pal_bow_score 5.318489974 2.453397337 13.12882676
confusion_data(pal_reg)
## Maximum F1 is achieved at a cutoff of: 0.28
##
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 439 6
## 1 0 3
##
## Accuracy : 0.9866
## 95% CI : (0.9711, 0.9938)
## No Information Rate : 0.9799
## P-Value [Acc > NIR] : 0.204
##
## Kappa : 0.4949
## Mcnemar's Test P-Value : 0.04123
##
## Sensitivity : 1.0000
## Specificity : 0.3333
## Pos Pred Value : 0.9865
## Neg Pred Value : 1.0000
## Prevalence : 0.9799
## Detection Rate : 0.9799
## Detection Prevalence : 0.9799
## Balanced Accuracy : 0.6667
## F-val Accuracy : 0.9932
##
## 'Positive' Class : 0
roc_plot(train$PAL, pal_reg, "PAL")
model_info(fam_reg)
## Waiting for profiling to be done...
## $`Model Summary`
##
## Call:
## glm(formula = FAM ~ fam_bow_score, family = binomial(link = "logit"),
## data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.3122 -0.8726 -0.7218 1.2892 2.1261
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.9494 0.1119 -8.488 < 2e-16 ***
## fam_bow_score 0.5113 0.1127 4.537 5.72e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 551.67 on 447 degrees of freedom
## Residual deviance: 529.59 on 446 degrees of freedom
## AIC: 533.59
##
## Number of Fisher Scoring iterations: 4
##
##
## $`OR Summary`
## OR 2.5 % 97.5 %
## (Intercept) 0.3869577 0.3092593 0.4797569
## fam_bow_score 1.6674899 1.3420543 2.0893246
confusion_data(fam_reg)
## Maximum F1 is achieved at a cutoff of: 0.58
##
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 311 135
## 1 0 2
##
## Accuracy : 0.6987
## 95% CI : (0.6546, 0.7393)
## No Information Rate : 0.6942
## P-Value [Acc > NIR] : 0.4415
##
## Kappa : 0.0202
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 1.0000
## Specificity : 0.0146
## Pos Pred Value : 0.6973
## Neg Pred Value : 1.0000
## Prevalence : 0.6942
## Detection Rate : 0.6942
## Detection Prevalence : 0.6942
## Balanced Accuracy : 0.5073
## F-val Accuracy : 0.8217
##
## 'Positive' Class : 0
roc_plot(train$FAM, fam_reg, "FAM")
model_info(cim_reg)
## Waiting for profiling to be done...
## $`Model Summary`
##
## Call:
## glm(formula = CIM ~ cim_bow_score, family = binomial(link = "logit"),
## data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.424 -1.180 0.990 1.157 1.394
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.0003259 0.0964159 -0.003 0.9973
## cim_bow_score 0.2127838 0.0975700 2.181 0.0292 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 620.92 on 447 degrees of freedom
## Residual deviance: 616.09 on 446 degrees of freedom
## AIC: 620.09
##
## Number of Fisher Scoring iterations: 4
##
##
## $`OR Summary`
## OR 2.5 % 97.5 %
## (Intercept) 0.9996741 0.827245 1.207559
## cim_bow_score 1.2371171 1.023084 1.500829
confusion_data(cim_reg)
## Maximum F1 is achieved at a cutoff of: 0.1
##
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 0
## 1 228 220
## 0 0 0
##
## Accuracy : 0.5089
## 95% CI : (0.4628, 0.5549)
## No Information Rate : 0.5089
## P-Value [Acc > NIR] : 0.519
##
## Kappa : 0.0000
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 1.0000
## Specificity : 0.0000
## Pos Pred Value : 0.5089
## Neg Pred Value : NA
## Prevalence : 0.5089
## Detection Rate : 0.5089
## Detection Prevalence : 0.5089
## Balanced Accuracy : 0.5000
## F-val Accuracy : 0.6746
##
## 'Positive' Class : 1
roc_plot(train$CIM, cim_reg, "CIM")
#Generate probability values
LIM_PRED <- predict(lim_reg, valid, type="response")
COD_PRED <- predict(cod_reg, valid, type="response")
CAR_PRED <- predict(car_reg, valid, type="response")
PAL_PRED <- predict(pal_reg, valid, type="response")
FAM_PRED <- predict(fam_reg, valid, type="response")
CIM_PRED <- predict(cim_reg, valid, type="response")
#Show distribution of probabilities
boxplot(LIM_PRED, COD_PRED, CAR_PRED, PAL_PRED, FAM_PRED, CIM_PRED, main = "Note Prediction Probability Distributions\n (LIM) (COD) (CAR) (PAL) (FAM) (CIM)")
#Apply predictions given cutoff probability
LIM_PRED <- ifelse(LIM_PRED > 0.56, 1, 0)
COD_PRED <- ifelse(COD_PRED > 0.10, 1, 0)
CAR_PRED <- ifelse(CAR_PRED > 0.38, 1, 0)
PAL_PRED <- ifelse(PAL_PRED > 0.28, 1, 0)
FAM_PRED <- ifelse(FAM_PRED > 0.38, 1, 0)
CIM_PRED <- ifelse(CIM_PRED > 0.49, 1, 0)
dat <- cbind(dat,
LIM_PRED,
COD_PRED,
CAR_PRED,
PAL_PRED,
FAM_PRED,
CIM_PRED)
## Warning in data.table::data.table(...): Item 2 is of size 192 but maximum
## size is 640 (recycled leaving remainder of 64 items)
## Warning in data.table::data.table(...): Item 3 is of size 192 but maximum
## size is 640 (recycled leaving remainder of 64 items)
## Warning in data.table::data.table(...): Item 4 is of size 192 but maximum
## size is 640 (recycled leaving remainder of 64 items)
## Warning in data.table::data.table(...): Item 5 is of size 192 but maximum
## size is 640 (recycled leaving remainder of 64 items)
## Warning in data.table::data.table(...): Item 6 is of size 192 but maximum
## size is 640 (recycled leaving remainder of 64 items)
## Warning in data.table::data.table(...): Item 7 is of size 192 but maximum
## size is 640 (recycled leaving remainder of 64 items)
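The warnings above indicate a length mismatch: the 192 validation-set predictions are being recycled against the 640 rows of dat. A minimal sketch of one way to keep the lengths aligned, assuming the predictions are meant to accompany only the validation cohort (a hypothetical alternative, not the original pipeline):
#Sketch: attach the 192 predictions to the 192-row validation subset instead
valid_preds <- cbind(valid, LIM_PRED, COD_PRED, CAR_PRED, PAL_PRED, FAM_PRED, CIM_PRED)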
ex_val <- fread("~/goals_of_care/external_validation/over_75_cohort_17Jan18.csv", header = T, stringsAsFactors = F)
ex_val <- ex_val[!duplicated(ex_val$TEXT),]
txts <- clean_text(ex_val$TEXT, TRUE)
## [1] "Example note:\nThe patient is a 81yo m who was found down in [** location **] on [** date **] by daug"
## Result after removing carriage returns:
## [1] "example note: the patient is a 81yo m who was found down in [** location **] on [** date **] by daug"
## Result after removing [obfuscation] placeholders:
## [1] "example note: the patient is a 81yo m who was found down in on by daughter, . pt was in usual sta"
## Result after removing all but alphanumeric and spaces:
## [1] "example note the patient is a 81yo m who was found down in on by daughter pt was in usual state "
## Result after keeping only single spaces:
## [1] "example note the patient is a 81yo m who was found down in on by daughter pt was in usual state of h"
#Use only unique tokens
lim_bow <- unique(lim_bow)
cat(length(lim_bow), "Unique Tokens in LIM BoW\n")
## 375 Unique Tokens in LIM BoW
cod_bow <- unique(cod_bow)
cat(length(cod_bow), "Unique Tokens in COD BoW\n")
## 152 Unique Tokens in COD BoW
car_bow <- unique(car_bow)
cat(length(car_bow), "Unique Tokens in CAR BoW\n")
## 705 Unique Tokens in CAR BoW
pal_bow <- unique(pal_bow)
cat(length(pal_bow), "Unique Tokens in PAL BoW\n")
## 113 Unique Tokens in PAL BoW
fam_bow <- unique(fam_bow)
cat(length(fam_bow), "Unique Tokens in FAM BoW\n")
## 639 Unique Tokens in FAM BoW
#CIM
cim_bow <- unique(cim_bow)
cat(length(cim_bow), "Unique Tokens in CIM BoW\n")
## 763 Unique Tokens in CIM BoW
#Strict regex
lim_tmp <- strict_regex(lim_bow, txts)
cod_tmp <- strict_regex(cod_bow, txts)
car_tmp <- strict_regex(car_bow, txts)
pal_tmp <- strict_regex(pal_bow, txts)
fam_tmp <- strict_regex(fam_bow, txts)
#CIM
cim_tmp <- strict_regex(cim_bow, txts)
#Convert list to data.frame
lim_tmp <- to_df(lim_tmp, lim_bow)
cod_tmp <- to_df(cod_tmp, cod_bow)
car_tmp <- to_df(car_tmp, car_bow)
pal_tmp <- to_df(pal_tmp, pal_bow)
fam_tmp <- to_df(fam_tmp, fam_bow)
#CIM
cim_tmp <- to_df(cim_tmp, cim_bow)
#Create score, apply sum row-wise (each column is a token)
lim_tmp$bow_score <- apply(lim_tmp, 1, sum)
cod_tmp$bow_score <- apply(cod_tmp, 1, sum)
car_tmp$bow_score <- apply(car_tmp, 1, sum)
pal_tmp$bow_score <- apply(pal_tmp, 1, sum)
fam_tmp$bow_score <- apply(fam_tmp, 1, sum)
#CIM
cim_tmp$bow_score <- apply(cim_tmp, 1, sum)
#Temporarily hold bow scores for assignment to ex_val set
lim_bow_score <- lim_tmp$bow_score
cod_bow_score <- cod_tmp$bow_score
car_bow_score <- car_tmp$bow_score
pal_bow_score <- pal_tmp$bow_score
fam_bow_score <- fam_tmp$bow_score
cim_bow_score <- cim_tmp$bow_score
#Assign bow_scores to annotated data
ex_val <- cbind(ex_val,
lim_bow_score,
cod_bow_score,
car_bow_score,
pal_bow_score,
fam_bow_score,
cim_bow_score)
boxplot(ex_val$lim_bow_score,
ex_val$cod_bow_score,
ex_val$car_bow_score,
ex_val$pal_bow_score,
ex_val$fam_bow_score,
ex_val$cim_bow_score, main = "Bag of Word Score Distribution by Domain\n (LIM) (COD) (CAR) (PAL) (FAM) (CIM)")
summary(ex_val$lim_bow_score)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4.0 92.0 108.0 107.3 123.0 185.0
summary(ex_val$cod_bow_score)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.00 49.00 58.00 57.08 65.00 91.00
summary(ex_val$car_bow_score)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5.0 144.0 167.0 167.3 191.0 300.0
summary(ex_val$pal_bow_score)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 38.00 44.00 43.53 49.00 76.00
summary(ex_val$fam_bow_score)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 8.0 139.0 160.0 160.5 183.0 277.0
summary(ex_val$cim_bow_score)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5.0 153.0 177.0 177.6 203.0 313.0
ex_val$lim_bow_score <- standardize(ex_val$lim_bow_score)
ex_val$cod_bow_score <- standardize(ex_val$cod_bow_score)
ex_val$car_bow_score <- standardize(ex_val$car_bow_score)
ex_val$pal_bow_score <- standardize(ex_val$pal_bow_score)
ex_val$fam_bow_score <- standardize(ex_val$fam_bow_score)
ex_val$cim_bow_score <- standardize(ex_val$cim_bow_score)
boxplot(ex_val$lim_bow_score,
ex_val$cod_bow_score,
ex_val$car_bow_score,
ex_val$pal_bow_score,
ex_val$fam_bow_score,
ex_val$cim_bow_score,
main = "Bag of Word Score Standardized Distribution by Domain\n (LIM) (COD) (CAR) (PAL) (FAM) (CIM)")
summary(ex_val$lim_bow_score)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -4.59300 -0.67920 0.03233 0.00000 0.69940 3.45700
summary(ex_val$cod_bow_score)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -4.93100 -0.72320 0.08261 0.00000 0.70930 3.03700
summary(ex_val$car_bow_score)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -4.456000 -0.639900 -0.008477 0.000000 0.650400 3.643000
summary(ex_val$pal_bow_score)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -5.09200 -0.66190 0.05649 0.00000 0.65510 3.88800
summary(ex_val$fam_bow_score)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -4.50300 -0.63460 -0.01447 0.00000 0.66470 3.44000
summary(ex_val$cim_bow_score)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -4.45900 -0.63640 -0.01646 0.00000 0.65520 3.49700
Note: the probabilities associated with the highest accuracy (percent correctly classified) are used as cutoffs:
lim_reg: 0.56, cod_reg: 0.10, car_reg: 0.38, pal_reg: 0.28, fam_reg: 0.38, cim_reg: 0.49
#Generate probability values
LIM_PRED <- predict(lim_reg, ex_val, type="response")
COD_PRED <- predict(cod_reg, ex_val, type="response")
CAR_PRED <- predict(car_reg, ex_val, type="response")
PAL_PRED <- predict(pal_reg, ex_val, type="response")
FAM_PRED <- predict(fam_reg, ex_val, type="response")
CIM_PRED <- predict(cim_reg, ex_val, type="response")
boxplot(LIM_PRED, COD_PRED, CAR_PRED, PAL_PRED, FAM_PRED, CIM_PRED, main = "Note Prediction Probability Distributions\n (LIM) (COD) (CAR) (PAL) (FAM) (CIM)")
#Apply predictions given cutoff probability
LIM_PRED <- ifelse(LIM_PRED > 0.56, 1, 0)
COD_PRED <- ifelse(COD_PRED > 0.10, 1, 0)
CAR_PRED <- ifelse(CAR_PRED > 0.38, 1, 0)
PAL_PRED <- ifelse(PAL_PRED > 0.28, 1, 0)
FAM_PRED <- ifelse(FAM_PRED > 0.38, 1, 0)
CIM_PRED <- ifelse(CIM_PRED > 0.49, 1, 0)
ex_val <- cbind(ex_val,
LIM_PRED,
COD_PRED,
CAR_PRED,
PAL_PRED,
FAM_PRED,
CIM_PRED)
#write.csv(ex_val, file = "/Users/Edward/goals_of_care/ex_val_w_BOW_LR_preds.csv", row.names = F)