This is a rule-based patient note analysis to capture patient goals of care (GOC) and advance care planning (ACP) from text at the note level. We will use text phrases generated by clinicians who annotated notes in the MIMIC-III Intensive Care Unit database to build a Bag of Words (BoW) model.
The BoW model does not use grammar or semantic relatedness: it classifies text without regard to the spatial relation of the words making up the text. The model relies only on the presence or absence of words known to be associated with GOC or ACP (as determined by the clinicians who generated these rules during text classification).
This BoW model serves as a rule-based baseline against which to compare a deep-learning method, NeuroNER, which accounts for grammar, semantic relatedness, and named-entity recognition.
Note: Aggregated data are representative of actual MIMIC data, but to preserve patient privacy, all strings of characters here that appear to be patient note text were invented by the author as representative cases.
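As a minimal illustration of the presence/absence idea (the dictionary words and note text below are invented, not from MIMIC):
#Toy dictionary of words assumed to be GOC/ACP-associated
dictionary <- c("dnr", "comfort", "hospice")
note <- "family meeting held pt now dnr dni"
#The note is flagged if any dictionary word appears, regardless of word order
any(strsplit(note, ' ')[[1]] %in% dictionary)  #TRUE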
load_data() will accept a label, lab; the set identifier, set (train/valid); and the map from file to row_id, mp. It will return the portion of the map corresponding to the requested set.
load_data <- function(lab, set, mp){
  #Read in data
  tmp <- read.csv(paste("~/goals_of_care/regex_v2/", lab, "_", set, "_processed.csv", sep = ''),
                  header = T, stringsAsFactors = F, quote = "", row.names = NULL)
  #Change "filename" to "file" for consistency
  colnames(tmp)[which(colnames(tmp) == "filename")] <- "file"
  #Subset unique filenames
  tmp <- unique(tmp$file)
  #Take substring of names to ensure we can map across both train_/valid_ files
  tmp <- substr(tmp, nchar(tmp) - 9, nchar(tmp))
  #Subset files in the requested set from the key map
  tmp <- mp[(mp$file %in% tmp),]
  #Return the mapped data
  return(tmp)
}
strict_regex() will accept all phrases, kwds, and all note texts, texts. It will use grepl() to find direct matches in the text and return a list of logical vectors, one per keyword.
strict_regex <- function(kwds, texts){
  #Create a list to store results
  tmpList <- list()
  #Loop through all keywords
  for (i in seq_along(kwds)){
    #Store results as a logical vector in its respective list entry position
    tmpList[[i]] <- grepl(kwds[i], texts, ignore.case = TRUE)
  }
  #Return list and control to environment
  return(tmpList)
}
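As a small sketch of its behavior (the keywords and note texts below are toy inputs, not MIMIC data):
toy_kwds <- c("dnr", "comfort")
toy_texts <- c("pt confirmed dnr dni", "pt remains full code", "comfort measures only")
toy_res <- strict_regex(toy_kwds, toy_texts)
#toy_res[[1]] is TRUE FALSE FALSE (which texts contain "dnr")
#toy_res[[2]] is FALSE FALSE TRUE (which texts contain "comfort")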
to_df() will convert the list to a data.frame, apply the rule tokens as column names to indicate what each variable represents, and convert the logical values to binary numerics.
to_df <- function(domain, rule){
  #Convert list from grepl to data frame
  domain <- as.data.frame(domain)
  #Show column names as phrases
  colnames(domain) <- rule[rule != '']
  #Multiply by 1 for binary numeric
  domain <- domain*1
  return(domain)
}
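Continuing the toy example above, to_df() yields one binary column per keyword:
to_df(toy_res, toy_kwds)
#  dnr comfort
#1   1       0
#2   0       0
#3   0       1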
clean_text() accepts a character vector of texts, tokens, as well as a boolean, printout. It will remove carriage returns, remove text obfuscations, and convert the text to lowercase. If printout is TRUE, it will print example text after each cleaning step.
clean_text <- function(tokens, printout){
  #Create a fake patient note phrase as a representative sample
  ex_token <- "Example note:\nThe patient is a 81yo m who was found down in [** location **] on [** date **] by daughter, [** name **].\n Pt was in usual state of health until four days ago, when began to complain to family of heartburn for which the pt was taking tums in addition to his prescribed PPI, without resolution."
  if (printout){
    print(substr(ex_token, 1, 100))
  }
  #Remove carriage returns, convert to lower
  tokens <- tolower(gsub('\n', ' ', tokens))
  ex_token <- tolower(gsub('\n', ' ', ex_token))
  if (printout){
    cat("Result after removing carriage returns:\n")
    print(substr(ex_token, 1, 100))
  }
  #https://stackoverflow.com/questions/13529360/replace-text-within-parenthesis-in-r
  #Remove obfuscations between '[' and ']'
  tokens <- gsub(" *\\[.*?\\] *", ' ', tokens)
  ex_token <- gsub(" *\\[.*?\\] *", ' ', ex_token)
  if (printout){
    cat("Result after removing [obfuscation]:\n")
    print(substr(ex_token, 1, 100))
  }
  #Keep only words & numerics
  tokens <- gsub("[^[:alnum:][:space:]]", '', tokens)
  ex_token <- gsub("[^[:alnum:][:space:]]", '', ex_token)
  if (printout){
    cat("Result after removing all but alphanumeric and spaces:\n")
    print(substr(ex_token, 1, 100))
  }
  #Collapse runs of whitespace to a single space, trim leading/trailing spaces
  #https://stackoverflow.com/questions/25707647/merge-multiple-spaces-to-single-space-remove-trailing-leading-spaces
  tokens <- gsub("(?<=[\\s])\\s*|^\\s+|\\s+$", '', tokens, perl=TRUE)
  ex_token <- gsub("(?<=[\\s])\\s*|^\\s+|\\s+$", '', ex_token, perl=TRUE)
  if (printout){
    cat("Result after keeping only single spaces:\n")
    print(substr(ex_token, 1, 100))
  }
  return(tokens)
}
bow() will accept a character vector, phrases; split each phrase on spaces; drop empty tokens; and return a character vector of tokens.
bow <- function(phrases){
  tmp <- unlist(strsplit(phrases, ' '))
  tmp <- tmp[tmp != '']
  return(tmp)
}
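For example, on toy phrases (not MIMIC data):
bow(c("full code confirmed", "dnr  dni"))
## [1] "full"      "code"      "confirmed" "dnr"       "dni"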
plot_bow() will accept a character vector, bows, containing the bag of words, as well as a number, n, and a label, lab. It will tabulate the words and plot the n most frequent in a barplot whose title contains the label.
plot_bow <- function(bows, n, lab){
  tmp_tab <- table(bows)[rev(order(table(bows)))]
  par(mai=c(1,2,1,1))
  barplot(rev(head(tmp_tab, n)), horiz = T, las = 1,
          main = paste("Most Frequent Words in the ", lab, " Phrase Dictionary", sep = ''),
          xlab = "Frequency")
}
cohort_gen() will accept the annotation data, dat, along with train_set and valid_set, and will assign each note to a cohort, returning the annotation data frame.
cohort_gen <- function(dat, train_set, valid_set){
  dat$COHORT <- rep('', nrow(dat))
  for (i in 1:nrow(dat)){
    if (dat$ROW_ID[i] %in% train_set$row_id){
      dat$COHORT[i] <- "train"
    } else if (dat$ROW_ID[i] %in% valid_set$row_id){
      dat$COHORT[i] <- "validation"
    } else {
      dat$COHORT[i] <- ""
    }
  }
  return(dat)
}
statGen() will accept the data.frame, dat; hum_lab (the human label column); re_lab (the regex label name); the set (train/validation); and the threshold, i.e. the minimum number of note tokens found in the bag of words for the model to assign a positive label.
statGen <- function(dat, hum_lab, re_lab, set, threshold){
  dat <- dat[(dat$COHORT == set),]
  #Vector to hold results
  rVec <- vector()
  #True positives
  tp <- 0
  #True negatives
  tn <- 0
  #False positives
  fp <- 0
  #False negatives
  fn <- 0
  #tmp variables to hold the model and human labels for row i
  re_tmp <- as.numeric()
  hum_tmp <- as.numeric()
  for (i in 1:nrow(dat)){
    re_tmp <- dat[["bow_score"]][i]
    #Generate boolean, multiply by 1 for numeric
    re_tmp <- (re_tmp >= threshold)*1
    hum_tmp <- dat[[hum_lab]][i]
    rVec[length(rVec)+1] <- paste("Token: ", dat$TEXT[i], '\n', re_lab, " Assignment: ", re_tmp, '\n', sep = '')
    #Skip rows with a missing model or human label
    if (is.na(re_tmp) | is.na(hum_tmp)) next
    if (re_tmp != 0 & hum_tmp != 0){
      #Both model and human assign the label: true positive
      tp <- tp + 1
    } else if (hum_tmp == 0 & re_tmp != 0){
      #Human marks negative, model assigns the label: false positive
      fp <- fp + 1
    } else if (hum_tmp != 0 & re_tmp == 0){
      #Human marks the label, model does not: false negative
      fn <- fn + 1
    } else if (re_tmp == 0 & hum_tmp == 0){
      #Both mark negative: true negative
      tn <- tn + 1
    }
  }
  #Hold results in a text file to check if necessary
  write.csv(rVec,
            file = paste(set, "_", re_lab, "_OUTPUT_NOTE_LEVEL_06Jan17.txt", sep = ''),
            quote = F, row.names = F)
  tmpFrame <- cbind(tp, tn, fp, fn)
  colnames(tmpFrame) <- c("tp", "tn", "fp", "fn")
  return(as.data.frame(tmpFrame))
}
modelStats() accepts a data.frame with tp, tn, fp, fn values and returns common machine learning metrics.
modelStats <- function(dat){
  dat <- as.data.frame(dat)
  accuracy <- (dat$tp + dat$tn)/(dat$tp + dat$tn + dat$fp + dat$fn)
  precision <- dat$tp/(dat$tp + dat$fp)
  recall <- dat$tp/(dat$tp + dat$fn)
  specificity <- dat$tn/(dat$tn + dat$fp)
  F1 <- 2*(precision*recall)/(precision + recall)
  dat$accuracy <- round(accuracy*100, 2)
  dat$precision <- round(precision*100, 2)
  dat$recall <- round(recall*100, 2)
  dat$specificity <- round(specificity*100, 2)
  dat$F1 <- round(F1*100, 2)
  return(dat)
}
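As a quick worked example of these metrics on hypothetical confusion counts (not results from this analysis):
modelStats(data.frame(tp = 8, tn = 85, fp = 5, fn = 2))
##   tp tn fp fn accuracy precision recall specificity    F1
## 1  8 85  5  2       93     61.54     80       94.44 69.57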
long_data() will accept the output of modelStats() and reshape it into long format (one row per metric per threshold) for plotting.
long_data <- function(dat){
  metrics <- c(dat$accuracy,
               dat$precision,
               dat$recall,
               dat$specificity,
               dat$F1)
  labs <- c(rep("accuracy", each = nrow(dat)),
            rep("precision", each = nrow(dat)),
            rep("recall", each = nrow(dat)),
            rep("specificity", each = nrow(dat)),
            rep("F1", each = nrow(dat)))
  ind <- rep(1:nrow(dat), 5)
  res <- as.data.frame(
    cbind(metrics,
          labs,
          ind)
  )
  colnames(res) <- c("metrics", "labs", "threshold")
  #cbind() coerces everything to character, so convert back to numeric
  #(as.character() first handles both factor and character columns)
  res$metrics <- as.numeric(as.character(res$metrics))
  res$threshold <- as.numeric(as.character(res$threshold))
  #Return only defined results
  return(na.omit(res))
}
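As a small sketch of the reshape (toy metric values, not real results), a one-row modelStats()-style frame becomes five long rows, one per metric:
wide <- data.frame(tp = 8, tn = 85, fp = 5, fn = 2,
                   accuracy = 93, precision = 61.54, recall = 80,
                   specificity = 94.44, F1 = 69.57)
long_data(wide)
##   metrics        labs threshold
## 1   93.00    accuracy         1
## 2   61.54   precision         1
## 3   80.00      recall         1
## 4   94.44 specificity         1
## 5   69.57          F1         1
Note that long_data() derives the threshold column from the row index of its input, so it assumes the rows of the metrics frame are ordered by threshold.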
#Read csv
dat <- read.csv("~/goals_of_care/regex_v3/op_annotations_122017.csv",
header = T, stringsAsFactors = F)
#Check the data
colnames(dat)
## [1] "X"
## [2] "ROW_ID"
## [3] "SUBJECT_ID"
## [4] "HADM_ID"
## [5] "CATEGORY"
## [6] "DESCRIPTION"
## [7] "TEXT"
## [8] "COHORT"
## [9] "Patient.and.Family.Care.Preferences"
## [10] "Patient.and.Family.Care.Preferences.Text"
## [11] "Communication.with.Family"
## [12] "Communication.with.Family.Text"
## [13] "Full.Code.Status"
## [14] "Full.Code.Status.Text"
## [15] "Code.Status.Limitations"
## [16] "Code.Status.Limitations.Text"
## [17] "Palliative.Care.Team.Involvement"
## [18] "Palliative.Care.Team.Involvement.Text"
## [19] "Ambiguous"
## [20] "Ambiguous.Text"
## [21] "Ambiguous.Comments"
## [22] "None"
## [23] "STAMP"
## [24] "operator"
## [25] "original_filename"
nrow(dat)
## [1] 1500
row_id_to_file_num.txt contains the key we will use to convert file (as output from NeuroNER, in BRAT format) to row_id from MIMICIII.NOTEEVENTS, which served as the substrate for generating the annotated data set.
#Contains map of files (BRAT format for neuroNER) to row_id (MIMIC NOTEEVENTS)
map <- read.csv("~/goals_of_care/summary_stats/row_id_to_file_num.txt", header = F, stringsAsFactors = F, sep = '\t')
head(map)
## V1 V2
## 1 407635 text_00000
## 2 345097 text_00001
## 3 371117 text_00002
## 4 401545 text_00003
## 5 393792 text_00004
## 6 382263 text_00005
#Add column names for ease of merging
colnames(map) <- c("row_id", "file")
#Code Status Limitations
lim_train_set <- load_data("LIM", "train", map)
lim_valid_set <- load_data("LIM", "valid", map)
#Full Code
cod_train_set <- load_data("COD", "train", map)
cod_valid_set <- load_data("COD", "valid", map)
#Patient/Family Care Preferences
car_train_set <- load_data("CAR", "train", map)
car_valid_set <- load_data("CAR", "valid", map)
#Palliative Care Consult
pal_train_set <- load_data("PAL", "train", map)
pal_valid_set <- load_data("PAL", "valid", map)
#Family meeting held
fam_train_set <- load_data("FAM", "train", map)
fam_valid_set <- load_data("FAM", "valid", map)
lim_dat <- cohort_gen(dat, lim_train_set, lim_valid_set)
cod_dat <- cohort_gen(dat, cod_train_set, cod_valid_set)
car_dat <- cohort_gen(dat, car_train_set, car_valid_set)
pal_dat <- cohort_gen(dat, pal_train_set, pal_valid_set)
fam_dat <- cohort_gen(dat, fam_train_set, fam_valid_set)
Note: Only use phrases from the training set.
##Only use phrases from training set
lim_phrases <- lim_dat$Code.Status.Limitations.Text[lim_dat$COHORT == "train"]
cod_phrases <- cod_dat$Full.Code.Status.Text[cod_dat$COHORT == "train"]
car_phrases <- car_dat$Patient.and.Family.Care.Preferences.Text[car_dat$COHORT == "train"]
pal_phrases <- pal_dat$Palliative.Care.Team.Involvement.Text[pal_dat$COHORT == "train"]
fam_phrases <- fam_dat$Communication.with.Family.Text[fam_dat$COHORT == "train"]
##remove empty observations ""
lim_phrases <- lim_phrases[lim_phrases != '']
cod_phrases <- cod_phrases[cod_phrases != '']
car_phrases <- car_phrases[car_phrases != '']
pal_phrases <- pal_phrases[pal_phrases != '']
fam_phrases <- fam_phrases[fam_phrases != '']
##Remove duplicate observations (keep first)
dat <- dat[!duplicated(dat$TEXT), ]
txts <- clean_text(dat$TEXT, TRUE)
## [1] "Example note:\nThe patient is a 81yo m who was found down in [** location **] on [** date **] by daug"
## Result after removing carriage returns:
## [1] "example note: the patient is a 81yo m who was found down in [** location **] on [** date **] by daug"
## Result after removing [obfuscation]:
## [1] "example note: the patient is a 81yo m who was found down in on by daughter, . pt was in usual sta"
## Result after removing all but alphanumeric and spaces:
## [1] "example note the patient is a 81yo m who was found down in on by daughter pt was in usual state "
## Result after keeping only single spaces:
## [1] "example note the patient is a 81yo m who was found down in on by daughter pt was in usual state of h"
lim_phrases <- clean_text(lim_phrases, FALSE)
cod_phrases <- clean_text(cod_phrases, FALSE)
car_phrases <- clean_text(car_phrases, FALSE)
pal_phrases <- clean_text(pal_phrases, FALSE)
fam_phrases <- clean_text(fam_phrases, FALSE)
#Create bag of words by splitting phrases on spaces and unlisting result
lim_bow <- bow(lim_phrases)
cod_bow <- bow(cod_phrases)
car_bow <- bow(car_phrases)
pal_bow <- bow(pal_phrases)
fam_bow <- bow(fam_phrases)
plot_bow(lim_bow, 20, "LIM")
plot_bow(cod_bow, 20, "COD")
plot_bow(car_bow, 20, "CAR")
plot_bow(pal_bow, 20, "PAL")
plot_bow(fam_bow, 20, "FAM")
#Use only unique tokens
lim_bow <- unique(lim_bow)
cat(length(lim_bow), "Unique Tokens in LIM BoW\n")
## 377 Unique Tokens in LIM BoW
cod_bow <- unique(cod_bow)
cat(length(cod_bow), "Unique Tokens in COD BoW\n")
## 156 Unique Tokens in COD BoW
car_bow <- unique(car_bow)
cat(length(car_bow), "Unique Tokens in CAR BoW\n")
## 705 Unique Tokens in CAR BoW
pal_bow <- unique(pal_bow)
cat(length(pal_bow), "Unique Tokens in PAL BoW\n")
## 113 Unique Tokens in PAL BoW
fam_bow <- unique(fam_bow)
cat(length(fam_bow), "Unique Tokens in FAM BoW\n")
## 640 Unique Tokens in FAM BoW
#Strict regex
lim_tmp <- strict_regex(lim_bow, txts)
cod_tmp <- strict_regex(cod_bow, txts)
car_tmp <- strict_regex(car_bow, txts)
pal_tmp <- strict_regex(pal_bow, txts)
fam_tmp <- strict_regex(fam_bow, txts)
#Convert list to data.frame
lim_tmp <- to_df(lim_tmp, lim_bow)
cod_tmp <- to_df(cod_tmp, cod_bow)
car_tmp <- to_df(car_tmp, car_bow)
pal_tmp <- to_df(pal_tmp, pal_bow)
fam_tmp <- to_df(fam_tmp, fam_bow)
#Create score, apply sum row-wise
lim_tmp$bow_score <- apply(lim_tmp, 1, sum)
#Add score to dat with annotation data
dat$bow_score <- lim_tmp$bow_score
#Show distribution of token counts represented in the BoW
summary(lim_tmp$bow_score)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 41.0 97.0 112.0 112.3 128.0 175.0
hist(lim_tmp$bow_score, breaks = 25, main = "BoW Score Distribution (all notes)", xlab = "Tokens in Phrase Dictionary", ylab = "Frequency of Notes with BoW Score")
#Create a tmp data.frame with all data and corresponding labels
tmp <- cbind(dat, lim_tmp)
#Show distribution by cohort
boxplot(tmp$bow_score ~ tmp$COHORT, main = "LIM BoW Score Distribution by Cohort", xlab = "Set", ylab = "Number of Tokens in BoW Dictionary")
\[accuracy = \frac{(tp+tn)}{(tp + tn + fp + fn)}\]
\[precision = \frac{tp}{(tp+fp)}\]
\[recall = \frac{tp}{(tp+fn)}\]
\[specificity = \frac{tn}{(tn+fp)}\]
\[F_1 = 2 \cdot \frac{precision \cdot recall}{precision + recall}\]
thresholds <- as.numeric(levels(factor(lim_tmp$bow_score)))
system.time(test <- statGen(dat, "Code.Status.Limitations", "LIM", "train", thresholds[1]))
## user system elapsed
## 0.004 0.000 0.053
for (i in 2:length(thresholds)){
  test <- rbind(test, statGen(dat, "Code.Status.Limitations", "LIM", "train", thresholds[i]))
}
test <- modelStats(test)
test$threshold <- thresholds
head(test, 50)
## tp tn fp fn accuracy precision recall specificity F1 threshold
## 1 0 0 0 0 NaN NaN NaN NaN NaN 1
## 2 0 0 0 0 NaN NaN NaN NaN NaN 2
## 3 0 0 0 0 NaN NaN NaN NaN NaN 3
## ... (rows 4 through 50 are identical: all counts 0, all metrics NaN)
system.time(val <- statGen(dat, "Code.Status.Limitations", "LIM", "validation", thresholds[1]))
## user system elapsed
## 0.002 0.001 0.034
for (i in 2:length(thresholds)){
  val <- rbind(val, statGen(dat, "Code.Status.Limitations", "LIM", "validation", thresholds[i]))
}
val <- modelStats(val)
val$threshold <- thresholds
head(val, 50)
## tp tn fp fn accuracy precision recall specificity F1 threshold
## 1 0 0 0 0 NaN NaN NaN NaN NaN 1
## 2 0 0 0 0 NaN NaN NaN NaN NaN 2
## 3 0 0 0 0 NaN NaN NaN NaN NaN 3
## ... (rows 4 through 50 are identical: all counts 0, all metrics NaN)
library(ggplot2)
train_plot <- long_data(test)
valid_plot <- long_data(val)
ggplot(train_plot, aes(x = threshold, y = metrics, group = labs, shape = labs, linetype = labs)) +
  geom_line(aes(color = labs), size = 1.1) +
  labs(title="Plot of Metrics by BoW Token Cutoff Threshold\n(Training Set)", x = "Token Number Cutoff", y = "Value") +
  theme_minimal()
Note: The rule-based Bag of Words algorithm, derived from training-set phrases defined by clinicians, is applied to the validation set at varying cutoffs for the number of tokens present.
ggplot(valid_plot, aes(x = threshold, y = metrics, group = labs, shape = labs, linetype = labs)) +
  geom_line(aes(color = labs), size = 1.1) +
  labs(title="Plot of Metrics by BoW Token Cutoff Threshold\n(Validation Set)", x = "Token Number Cutoff", y = "Value") +
  theme_minimal()
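A natural follow-up (sketched here as an assumption about how one might use these curves, not part of the original analysis) is to read the F1-maximizing cutoff off the training curve and inspect the validation metrics at that same threshold:
#Token cutoff at which F1 peaks on the training set
f1_train <- train_plot[train_plot$labs == "F1", ]
best_cut <- f1_train$threshold[which.max(f1_train$metrics)]
#Validation metrics at the selected cutoff
valid_plot[valid_plot$threshold == best_cut, ]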