This is a rule-based patient note analysis to capture patient goals of care (GOC) and advance care planning (ACP) from text at the note level. We will utilize text phrases generated by clinicians to generate a Bag of Words (BoW) model.
The BoW model does not utilize grammar or semantic relatedness, as it classifies text without regard to the spatial relation of words. The model utilizes only the presence or absence of words known to be associated with GOC or ACP (as per the clinicians who generated these rules).
This BoW model is employed as a means to compare a rule-based method with a deep-learning method, NeuroNER, which accounts for grammar, semantic relatedness, and named-entity recognition.
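As a minimal sketch of the idea (toy keywords and a made-up note, not the study data), presence/absence matching reduces to simple pattern lookups:
#Toy illustration of presence/absence matching (hypothetical words and note)
goc_words <- c("dnr", "comfort", "palliative")
toy_note <- "family meeting held and patient confirmed dnr with comfort measures only"
sapply(goc_words, grepl, x = toy_note, ignore.case = TRUE)
#Expected: dnr = TRUE, comfort = TRUE, palliative = FALSE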
dat <- read.csv("~/goals_of_care/regex_v3/op_annotations_122017.csv",
                header = T, stringsAsFactors = F)
## ###
## NOTE: Remember to collapse results to remove duplicates
## ###
nrow(dat)
## [1] 1500
#Contains map of files (BRAT format for neuroNER) to row_id (MIMIC NOTEEVENTS)
map <- read.csv("~/goals_of_care/summary_stats/row_id_to_file_num.txt", header = F, stringsAsFactors = F, sep = '\t')
head(map)
## V1 V2
## 1 407635 text_00000
## 2 345097 text_00001
## 3 371117 text_00002
## 4 401545 text_00003
## 5 393792 text_00004
## 6 382263 text_00005
#Add column names for ease of merging
colnames(map) <- c("row_id", "file")
#Pull in training data from one file
train <- read.csv("~/goals_of_care/regex_v2/CAR_train_processed.csv", header = T, stringsAsFactors = F, quote = "", row.names = NULL)
head(train)
## X filename LIM COD CAR PAL FAM LIM.machine COD.machine
## 1 0 train_text_00000 0 0 0 0 0 0 0
## 2 1 train_text_00001 0 0 0 0 0 0 0
## 3 2 train_text_00002 0 0 1 0 0 0 0
## 4 3 train_text_00003 0 0 1 0 0 0 0
## 5 4 train_text_00004 0 0 0 0 0 0 0
## 6 5 train_text_00005 0 0 0 0 0 0 0
## CAR.machine PAL.machine FAM.machine
## 1 0 0 0
## 2 1 0 0
## 3 0 0 0
## 4 1 0 0
## 5 0 0 0
## 6 0 0 0
#Change "filename" to "file" for consistency
colnames(train)[which(colnames(train) == "filename")] <- "file"
#Manipulate names to ensure we can map across train_/valid_ files
train_set <- unique(train$file)
train_set <- gsub("train_", '', train_set)
#Subset files in train set from key map
train_set <- map[(map$file %in% train_set),]
nrow(train_set)
## [1] 449
#Pull in validation data from one file
valid <- read.csv("~/goals_of_care/regex_v2/CAR_valid_processed.csv", header = T, stringsAsFactors = F, quote = "", row.names = NULL)
head(valid)
## X filename LIM COD CAR PAL FAM LIM.machine COD.machine
## 1 0 valid_text_00449 0 0 0 0 0 0 0
## 2 1 valid_text_00450 0 0 0 0 0 0 0
## 3 2 valid_text_00451 0 0 0 0 0 0 0
## 4 3 valid_text_00452 0 0 0 0 0 0 0
## 5 4 valid_text_00453 0 0 0 0 0 0 0
## 6 5 valid_text_00454 0 0 0 0 0 0 0
## CAR.machine PAL.machine FAM.machine
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
#Change "filename" to "file" for consistency
colnames(valid)[which(colnames(valid) == "filename")] <- "file"
#Manipulate names to ensure we can map across train_/valid_ files
valid_set <- unique(valid$file)
valid_set <- gsub("valid_", '', valid_set)
valid_set <- map[(map$file %in% valid_set),]
nrow(valid_set)
## [1] 192
strict_regex() will accept all phrases kwds and all note texts texts; it will utilize grepl() to find direct matches in the text and will return a list of logical vectors (one per keyword).
strict_regex <- function(kwds, texts){
  #Create a list to store results
  tmpList <- list()
  #Loop through all keywords
  for (i in 1:length(kwds)){
    #Store results as a logical vector in its respective list entry position
    tmpList[[i]] <- grepl(kwds[i], texts, ignore.case = TRUE)
  }
  #Return list and control to environment
  return(tmpList)
}
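A quick toy check (hypothetical keywords and notes, for illustration only):
#Toy usage: two keywords against two short notes
toy_kwds <- c("dnr", "intubation")
toy_texts <- c("patient is dnr/dni", "s/p intubation overnight")
strict_regex(toy_kwds, toy_texts)
#Expected: list(c(TRUE, FALSE), c(FALSE, TRUE))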
to_df() will convert the list to a data.frame and apply the rule tokens as column names to indicate what each variable represents.
to_df <- function(domain, rule){
  #Convert list from grepl to data frame
  domain <- as.data.frame(domain)
  #Show column names as phrases
  colnames(domain) <- rule[rule != '']
  #Multiply by 1 for binary numeric
  domain <- domain*1
  return(domain)
}
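Continuing the toy check above, to_df() turns the grepl() list into a binary note-by-keyword table:
#Convert the toy grepl() list to a binary data.frame
to_df(strict_regex(toy_kwds, toy_texts), toy_kwds)
#Expected:
#  dnr intubation
#1   1          0
#2   0          1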
clean_text() will remove newline characters, remove text obfuscations (the bracketed [**...**] placeholders), and convert the text to lowercase.
clean_text <- function(tokens){
  #Remove newline characters, convert to lower
  tokens <- tolower(gsub('\n', '', tokens))
  cat("Result after removing newlines:\n")
  print(substr(tokens[1], 1, 100))
  #https://stackoverflow.com/questions/13529360/replace-text-within-parenthesis-in-r
  #Remove obfuscations between '[' and ']'
  tokens <- gsub(" *\\[.*?\\] *", '', tokens)
  cat("Result after removing [obfuscation]:\n")
  print(substr(tokens[1], 1, 100))
  #Keep only words & numeric
  tokens <- gsub("[^[:alnum:][:space:]]", '', tokens)
  cat("Result after removing all but alphanumeric and spaces:\n")
  print(substr(tokens[1], 1, 100))
  return(tokens)
}
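A toy example (a made-up string in the MIMIC [**...**] obfuscation style):
#Toy usage on a made-up string with a newline and a [**...**] placeholder
clean_text("Code status: DNR/DNI per family [**Name (NI) **]\nmeeting")
#Expected final value (after the cat() progress output): "code status dnrdni per familymeeting"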
cohort_gen() will accept the annotation data dat, train_set, and valid_set, map each note to its cohort, and return the annotated data frame.
cohort_gen <- function(dat, train_set, valid_set){
  dat$COHORT <- rep('', each = nrow(dat))
  for (i in 1:nrow(dat)){
    if (dat$ROW_ID[i] %in% train_set$row_id){
      dat$COHORT[i] <- "train"
    } else if (dat$ROW_ID[i] %in% valid_set$row_id){
      dat$COHORT[i] <- "validation"
    } else {
      dat$COHORT[i] <- ""
    }
  }
  return(dat)
}
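A toy check of the mapping with hypothetical row IDs:
#Toy usage with hypothetical row IDs
toy_dat <- data.frame(ROW_ID = c(1, 2, 3))
toy_train <- data.frame(row_id = 1)
toy_valid <- data.frame(row_id = 2)
cohort_gen(toy_dat, toy_train, toy_valid)$COHORT
#Expected: "train" "validation" ""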
dat <- cohort_gen(dat, train_set, valid_set)
#Show cohort distribution
table(dat$COHORT)
##
## train validation
## 1067 433
statGen() will accept the data.frame dat, hum_lab (the human label column), re_lab (the regex label name), set (train/validation), and threshold, the cutoff on the number of note tokens found in the bag of words.
statGen <- function(dat, hum_lab, re_lab, set, threshold){
  dat <- dat[(dat$COHORT == set),]
  #Vector to hold results
  rVec <- vector()
  #True Positives
  tp <- 0
  #True Negatives
  tn <- 0
  #False Positives
  fp <- 0
  #False Negatives
  fn <- 0
  #tmp to hold dat[[lab]][i]
  re_tmp <- as.numeric()
  hum_tmp <- as.numeric()
  for (i in 1:nrow(dat)){
    re_tmp <- dat[["bow_score"]][i]
    #Generate boolean, multiply by 1 for numeric
    re_tmp <- (re_tmp >= threshold)*1
    hum_tmp <- dat[[hum_lab]][i]
    rVec[length(rVec)+1] <- paste("Token: ", dat$TEXT[i], '\n', re_lab, " Assignment: ", re_tmp, '\n', sep = '')
    if (is.na(re_tmp) | is.na(hum_tmp)) break
    #If both model and human aren't negative
    if (re_tmp != 0 & hum_tmp != 0 & !is.na(hum_tmp)){
      #True positive
      tp <- tp + 1
    #if human marks negative and model assigns label
    } else if (hum_tmp == 0 & re_tmp != 0 & !is.na(hum_tmp)){
      #False positive
      fp <- fp + 1
    #if human marks label and model doesn't assign label
    } else if (hum_tmp != 0 & re_tmp == 0 & !is.na(hum_tmp)){
      #False negative
      fn <- fn + 1
    #if human marks negative and model marks negative
    } else if (re_tmp == 0 & hum_tmp == 0 & !is.na(hum_tmp)){
      #True negative
      tn <- tn + 1
    } else if (is.null(re_tmp) | is.null(hum_tmp)){
      break
    }
  }
  #Hold results in txt to check if necessary
  write.csv(rVec,
            file = paste(set, "_", re_lab, "_OUTPUT_NOTE_LEVEL_25Dec17.txt", sep = ''),
            quote = F, row.names = F)
  tmpFrame <- cbind(tp, tn, fp, fn)
  colnames(tmpFrame) <- c("tp", "tn", "fp", "fn")
  return(as.data.frame(tmpFrame))
}
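For reference, the same confusion counts can be expressed with vectorized comparisons; a minimal sketch (hypothetical name stat_gen_vec, assuming no NA labels and omitting the audit file written above):
#Vectorized sketch of the confusion counts (assumes no NA labels)
stat_gen_vec <- function(dat, hum_lab, set, threshold){
  dat <- dat[dat$COHORT == set, ]
  pred <- (dat$bow_score >= threshold)*1
  truth <- dat[[hum_lab]]
  data.frame(tp = sum(pred != 0 & truth != 0),
             tn = sum(pred == 0 & truth == 0),
             fp = sum(pred != 0 & truth == 0),
             fn = sum(pred == 0 & truth != 0))
}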
modelStats() accepts a data.frame with tp, tn, fp, fn values and returns common machine learning metrics.
modelStats <- function(dat){
  dat <- as.data.frame(dat)
  accuracy <- (dat$tp + dat$tn)/(dat$tp + dat$tn + dat$fp + dat$fn)
  precision <- dat$tp/(dat$tp + dat$fp)
  recall <- dat$tp/(dat$tp + dat$fn)
  specificity <- dat$tn/(dat$tn + dat$fp)
  F1 <- 2*(precision*recall)/(precision + recall)
  dat$accuracy <- round(accuracy*100, 2)
  dat$precision <- round(precision*100, 2)
  dat$recall <- round(recall*100, 2)
  dat$specificity <- round(specificity*100, 2)
  dat$F1 <- round(F1*100, 2)
  return(dat)
}
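As a sanity check, a single row of counts reproduces the metrics by hand; these counts match the first row of the training sweep shown below:
#Worked example: tp = 111, tn = 0, fp = 338, fn = 0
modelStats(data.frame(tp = 111, tn = 0, fp = 338, fn = 0))
#accuracy = 111/449 = 24.72; precision = 111/449 = 24.72
#recall = 111/111 = 100; specificity = 0/338 = 0; F1 = 39.64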
long_data() will accept the output of modelStats() and reshape the per-threshold metrics into long format for plotting.
long_data <- function(dat){
  metrics <- c(dat$accuracy,
               dat$precision,
               dat$recall,
               dat$specificity,
               dat$F1)
  labs <- c(rep("accuracy", each = nrow(dat)),
            rep("precision", each = nrow(dat)),
            rep("recall", each = nrow(dat)),
            rep("specificity", each = nrow(dat)),
            rep("F1", each = nrow(dat)))
  ind <- rep(1:nrow(dat), 5)
  res <- as.data.frame(cbind(metrics, labs, ind))
  colnames(res) <- c("metrics", "labs", "threshold")
  #Convert factors to numeric after as.data.frame conversion
  res$metrics <- as.numeric(levels(res$metrics)[res$metrics])
  res$threshold <- as.numeric(levels(res$threshold)[res$threshold])
  #Return only defined results
  return(na.omit(res))
}
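Note that the cbind()/factor round-trip above relies on the pre-4.0 stringsAsFactors default in R; a minimal equivalent sketch (hypothetical name long_data_v2) that avoids it:
#Equivalent long-format construction without the cbind()/factor round-trip
long_data_v2 <- function(dat){
  labs <- c("accuracy", "precision", "recall", "specificity", "F1")
  res <- data.frame(metrics = unlist(dat[labs], use.names = FALSE),
                    labs = rep(labs, each = nrow(dat)),
                    threshold = rep(1:nrow(dat), times = length(labs)))
  #Return only defined results
  na.omit(res)
}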
Note: Only phrases from the training set are used to build the dictionary, so no validation-set information leaks into the rules.
##Only use phrases from training set
phrases <- dat$Patient.and.Family.Care.Preferences.Text[dat$COHORT == "train"]
head(phrases)
## [1] ""
## [2] ""
## [3] ""
## [4] "Of note, palliative care was consulted on [**11-28**] and during a family\n meeting with PCP, [**Name10 (NameIs) **] attending, onc team, and family, code status\n was changed to DNR/DNI, however treatment was to be continued."
## [5] "CODE STATUS: Confirmed full"
## [6] ""
length(phrases)
## [1] 1067
##remove empty observations ""
phrases <- phrases[phrases != '']
length(phrases)
## [1] 246
head(phrases)
## [1] "Of note, palliative care was consulted on [**11-28**] and during a family\n meeting with PCP, [**Name10 (NameIs) **] attending, onc team, and family, code status\n was changed to DNR/DNI, however treatment was to be continued."
## [2] "CODE STATUS: Confirmed full"
## [3] "#. CODE: DNR/DNI after conversation with her son [**Name (NI) 1139**] [**Telephone/Fax (1) 5354**].\n [**Name2 (NI) **]ke at length about how BiPAP is normally a bridge to intubation but\n pt is DNR/DNI. At this point we are using BiPAP until pt's son can\n speak with her oncologist."
## [4] "# Code: full -confirmed with mother"
## [5] "Code status: Full code"
## [6] "After discussion yesterday with patient\ns family\n and primary oncologist, decision made to focus on comfort care."
##Remove duplicate observations (keep first)
dat <- dat[!duplicated(dat$TEXT), ]
txts <- clean_text(dat$TEXT)
## Result after removing newlines:
## [1] "tsicu hpi: 56 y/o male with recent osh admission for dizziness [**2-17**], subsequently dx w/ esld ["
## Result after removing [obfuscation]:
## [1] "tsicu hpi: 56 y/o male with recent osh admission for dizziness, subsequently dx w/ esld etoh, renal "
## Result after removing all but alphanumeric and spaces:
## [1] "tsicu hpi 56 yo male with recent osh admission for dizziness subsequently dx w esld etoh renal insuf"
phrases <- clean_text(phrases)
## Result after removing newlines:
## [1] "of note, palliative care was consulted on [**11-28**] and during a family meeting with pcp, [**name1"
## Result after removing [obfuscation]:
## [1] "of note, palliative care was consulted on and during a family meeting with pcp, attending, onc team,"
## Result after removing all but alphanumeric and spaces:
## [1] "of note palliative care was consulted on and during a family meeting with pcp attending onc team and"
#Create bag of words by splitting phrases on spaces and unlisting result
bow <- unlist(strsplit(phrases, ' '))
#Remove empty entries
bow <- bow[bow != '']
#Tabulate
tab <- table(bow)[rev(order(table(bow)))]
#Plot
par(mai=c(1,2,1,1))
barplot(rev(head(tab, 20)), horiz = T, las = 1, main = "Most Frequent Words in Phrase Dictionary", xlab = "Frequency")
#Use only unique tokens
bow <- unique(bow)
length(bow)
## [1] 705
#Strict regex
system.time(tmp <- strict_regex(bow, txts))
## user system elapsed
## 49.095 0.239 49.895
#Convert list to data.frame
system.time(tmp_two <- to_df(tmp, bow))
## user system elapsed
## 0.200 0.006 0.206
#Show results
tmp_two[1:4,1:25]
## of note palliative care was consulted on and during a family meeting
## 1 1 0 0 1 1 0 1 1 0 1 0 0
## 2 1 1 0 1 0 0 1 1 0 1 1 0
## 3 1 1 0 1 1 0 1 1 0 1 1 0
## 4 1 1 1 1 1 1 1 1 1 1 1 1
## with pcp attending onc team code status changed to dnrdni however
## 1 1 0 0 1 0 1 1 0 1 0 0
## 2 1 0 0 0 0 1 1 1 1 0 0
## 3 1 0 0 1 0 1 1 1 1 0 0
## 4 1 1 1 1 1 1 1 1 1 1 1
## treatment be
## 1 0 1
## 2 0 1
## 3 0 1
## 4 1 1
#Create score, apply sum row-wise
tmp_two$bow_score <- apply(tmp_two, 1, sum)
#Add score to dat with annotation data
dat$bow_score <- tmp_two$bow_score
#Show distribution of token counts represented in the BoW
summary(tmp_two$bow_score)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 59 148 171 172 197 280
hist(tmp_two$bow_score, breaks = 25, main = "BoW Score Distribution (all notes)", xlab = "Tokens in Phrase Dictionary", ylab = "Frequency of Notes with BoW Score")
#Create a tmp data.frame with all data and corresponding labels
tmp <- cbind(dat, tmp_two)
#Show distribution by cohort
boxplot(tmp$bow_score ~ tmp$COHORT, main = "BoW Score Distribution by Cohort", xlab = "Set", ylab = "Number of Tokens in BoW Dictionary")
Model performance at each cutoff is summarized with the following metrics:
\[accuracy = \frac{tp+tn}{tp+tn+fp+fn}\]
\[precision = \frac{tp}{tp+fp}\]
\[recall = \frac{tp}{tp+fn}\]
\[specificity = \frac{tn}{tn+fp}\]
\[F_1 = 2 \cdot \frac{precision \cdot recall}{precision + recall}\]
thresholds <- as.numeric(levels(factor(tmp_two$bow_score)))
system.time(test <- statGen(dat, "Patient.and.Family.Care.Preferences", "CAR", "train", thresholds[1]))
## user system elapsed
## 0.050 0.018 0.107
for (i in 2:max(thresholds)){
  test <- rbind(test, statGen(dat, "Patient.and.Family.Care.Preferences", "CAR", "train", thresholds[i]))
}
test <- modelStats(test)
test$threshold <- 1:max(thresholds)
head(test, 50)
## tp tn fp fn accuracy precision recall specificity F1 threshold
## 1 111 0 338 0 24.72 24.72 100.00 0.00 39.64 1
## 2 111 0 338 0 24.72 24.72 100.00 0.00 39.64 2
## 3 110 0 338 1 24.50 24.55 99.10 0.00 39.36 3
## 4 110 0 338 1 24.50 24.55 99.10 0.00 39.36 4
## 5 110 1 337 1 24.72 24.61 99.10 0.30 39.43 5
## 6 110 2 336 1 24.94 24.66 99.10 0.59 39.50 6
## 7 110 2 336 1 24.94 24.66 99.10 0.59 39.50 7
## 8 110 3 335 1 25.17 24.72 99.10 0.89 39.57 8
## 9 110 3 335 1 25.17 24.72 99.10 0.89 39.57 9
## 10 110 4 334 1 25.39 24.77 99.10 1.18 39.64 10
## 11 110 5 333 1 25.61 24.83 99.10 1.48 39.71 11
## 12 110 6 332 1 25.84 24.89 99.10 1.78 39.78 12
## 13 110 6 332 1 25.84 24.89 99.10 1.78 39.78 13
## 14 110 7 331 1 26.06 24.94 99.10 2.07 39.86 14
## 15 110 8 330 1 26.28 25.00 99.10 2.37 39.93 15
## 16 110 8 330 1 26.28 25.00 99.10 2.37 39.93 16
## 17 109 9 329 2 26.28 24.89 98.20 2.66 39.71 17
## 18 109 10 328 2 26.50 24.94 98.20 2.96 39.78 18
## 19 108 13 325 3 26.95 24.94 97.30 3.85 39.71 19
## 20 108 15 323 3 27.39 25.06 97.30 4.44 39.85 20
## 21 108 17 321 3 27.84 25.17 97.30 5.03 40.00 21
## 22 108 18 320 3 28.06 25.23 97.30 5.33 40.07 22
## 23 108 20 318 3 28.51 25.35 97.30 5.92 40.22 23
## 24 108 21 317 3 28.73 25.41 97.30 6.21 40.30 24
## 25 107 21 317 4 28.51 25.24 96.40 6.21 40.00 25
## 26 106 22 316 5 28.51 25.12 95.50 6.51 39.77 26
## 27 106 22 316 5 28.51 25.12 95.50 6.51 39.77 27
## 28 106 23 315 5 28.73 25.18 95.50 6.80 39.85 28
## 29 106 23 315 5 28.73 25.18 95.50 6.80 39.85 29
## 30 106 24 314 5 28.95 25.24 95.50 7.10 39.92 30
## 31 106 25 313 5 29.18 25.30 95.50 7.40 40.00 31
## 32 105 27 311 6 29.40 25.24 94.59 7.99 39.85 32
## 33 105 28 310 6 29.62 25.30 94.59 8.28 39.92 33
## 34 104 29 309 7 29.62 25.18 93.69 8.58 39.69 34
## 35 104 33 305 7 30.51 25.43 93.69 9.76 40.00 35
## 36 103 34 304 8 30.51 25.31 92.79 10.06 39.77 36
## 37 103 37 301 8 31.18 25.50 92.79 10.95 40.00 37
## 38 103 39 299 8 31.63 25.62 92.79 11.54 40.16 38
## 39 103 40 298 8 31.85 25.69 92.79 11.83 40.23 39
## 40 103 42 296 8 32.29 25.81 92.79 12.43 40.39 40
## 41 102 43 295 9 32.29 25.69 91.89 12.72 40.16 41
## 42 101 43 295 10 32.07 25.51 90.99 12.72 39.84 42
## 43 101 44 294 10 32.29 25.57 90.99 13.02 39.92 43
## 44 100 45 293 11 32.29 25.45 90.09 13.31 39.68 44
## 45 100 46 292 11 32.52 25.51 90.09 13.61 39.76 45
## 46 100 49 289 11 33.18 25.71 90.09 14.50 40.00 46
## 47 98 52 286 13 33.41 25.52 88.29 15.38 39.60 47
## 48 98 54 284 13 33.85 25.65 88.29 15.98 39.76 48
## 49 98 59 279 13 34.97 25.99 88.29 17.46 40.16 49
## 50 98 63 275 13 35.86 26.27 88.29 18.64 40.50 50
plot(test$F1 ~ test$threshold, type = 'l', main = "F1 by BoW Threshold", xlab = "Threshold", ylab = "F1")
system.time(val <- statGen(dat, "Patient.and.Family.Care.Preferences", "CAR", "validation", thresholds[1]))
## user system elapsed
## 0.020 0.002 0.023
for (i in 2:max(thresholds)){
  val <- rbind(val, statGen(dat, "Patient.and.Family.Care.Preferences", "CAR", "validation", thresholds[i]))
}
val <- modelStats(val)
val$threshold <- 1:max(thresholds)
plot(val$F1 ~ val$threshold, type = 'l', main = "F1 by BoW Threshold", xlab = "Threshold", ylab = "F1")
head(val, 50)
## tp tn fp fn accuracy precision recall specificity F1 threshold
## 1 27 0 165 0 14.06 14.06 100.00 0.00 24.66 1
## 2 27 1 164 0 14.58 14.14 100.00 0.61 24.77 2
## 3 27 1 164 0 14.58 14.14 100.00 0.61 24.77 3
## 4 27 2 163 0 15.10 14.21 100.00 1.21 24.88 4
## 5 27 3 162 0 15.62 14.29 100.00 1.82 25.00 5
## 6 27 3 162 0 15.62 14.29 100.00 1.82 25.00 6
## 7 27 4 161 0 16.15 14.36 100.00 2.42 25.12 7
## 8 27 5 160 0 16.67 14.44 100.00 3.03 25.23 8
## 9 27 6 159 0 17.19 14.52 100.00 3.64 25.35 9
## 10 27 8 157 0 18.23 14.67 100.00 4.85 25.59 10
## 11 27 9 156 0 18.75 14.75 100.00 5.45 25.71 11
## 12 27 10 155 0 19.27 14.84 100.00 6.06 25.84 12
## 13 27 11 154 0 19.79 14.92 100.00 6.67 25.96 13
## 14 27 12 153 0 20.31 15.00 100.00 7.27 26.09 14
## 15 27 13 152 0 20.83 15.08 100.00 7.88 26.21 15
## 16 27 15 150 0 21.88 15.25 100.00 9.09 26.47 16
## 17 27 16 149 0 22.40 15.34 100.00 9.70 26.60 17
## 18 27 17 148 0 22.92 15.43 100.00 10.30 26.73 18
## 19 27 19 146 0 23.96 15.61 100.00 11.52 27.00 19
## 20 27 19 146 0 23.96 15.61 100.00 11.52 27.00 20
## 21 26 20 145 1 23.96 15.20 96.30 12.12 26.26 21
## 22 26 21 144 1 24.48 15.29 96.30 12.73 26.40 22
## 23 26 22 143 1 25.00 15.38 96.30 13.33 26.53 23
## 24 26 23 142 1 25.52 15.48 96.30 13.94 26.67 24
## 25 26 25 140 1 26.56 15.66 96.30 15.15 26.94 25
## 26 26 27 138 1 27.60 15.85 96.30 16.36 27.23 26
## 27 26 28 137 1 28.12 15.95 96.30 16.97 27.37 27
## 28 26 29 136 1 28.65 16.05 96.30 17.58 27.51 28
## 29 26 30 135 1 29.17 16.15 96.30 18.18 27.66 29
## 30 26 30 135 1 29.17 16.15 96.30 18.18 27.66 30
## 31 26 32 133 1 30.21 16.35 96.30 19.39 27.96 31
## 32 26 35 130 1 31.77 16.67 96.30 21.21 28.42 32
## 33 26 37 128 1 32.81 16.88 96.30 22.42 28.73 33
## 34 26 37 128 1 32.81 16.88 96.30 22.42 28.73 34
## 35 26 37 128 1 32.81 16.88 96.30 22.42 28.73 35
## 36 26 37 128 1 32.81 16.88 96.30 22.42 28.73 36
## 37 26 37 128 1 32.81 16.88 96.30 22.42 28.73 37
## 38 26 38 127 1 33.33 16.99 96.30 23.03 28.89 38
## 39 26 38 127 1 33.33 16.99 96.30 23.03 28.89 39
## 40 26 40 125 1 34.38 17.22 96.30 24.24 29.21 40
## 41 26 40 125 1 34.38 17.22 96.30 24.24 29.21 41
## 42 26 43 122 1 35.94 17.57 96.30 26.06 29.71 42
## 43 26 43 122 1 35.94 17.57 96.30 26.06 29.71 43
## 44 26 46 119 1 37.50 17.93 96.30 27.88 30.23 44
## 45 26 48 117 1 38.54 18.18 96.30 29.09 30.59 45
## 46 26 49 116 1 39.06 18.31 96.30 29.70 30.77 46
## 47 26 54 111 1 41.67 18.98 96.30 32.73 31.71 47
## 48 26 57 108 1 43.23 19.40 96.30 34.55 32.30 48
## 49 25 58 107 2 43.23 18.94 92.59 35.15 31.45 49
## 50 25 59 106 2 43.75 19.08 92.59 35.76 31.65 50
library(ggplot2)
#Remove NA for undefined values
train_plot <- long_data(test)
head(train_plot)
## metrics labs threshold
## 1 24.72 accuracy 1
## 2 24.72 accuracy 2
## 3 24.50 accuracy 3
## 4 24.50 accuracy 4
## 5 24.72 accuracy 5
## 6 24.94 accuracy 6
valid_plot <- long_data(val)
head(valid_plot)
## metrics labs threshold
## 1 14.06 accuracy 1
## 2 14.58 accuracy 2
## 3 14.58 accuracy 3
## 4 15.10 accuracy 4
## 5 15.62 accuracy 5
## 6 15.62 accuracy 6
ggplot(train_plot, aes(x = threshold, y = metrics, group = labs, shape = labs, linetype = labs)) +
  geom_line(aes(color = labs), size = 1.1) +
  #xlim(0, 200) +
  #geom_point(aes(color = labs)) +
  labs(title = "Plot of Metrics by BoW Token Cutoff Threshold\n(Training Set)", x = "Token Number Cutoff", y = "Value") +
  theme_minimal()
Note: The rule-based Bag of Words algorithm, derived from training-set phrases as defined by clinicians, is applied to the validation set at varying cutoffs for the number of tokens from the note present in the phrase dictionary.
ggplot(valid_plot, aes(x = threshold, y = metrics, group = labs, shape = labs, linetype = labs)) +
  geom_line(aes(color = labs), size = 1.1) +
  #xlim(0, 200) +
  #geom_point(aes(color = labs)) +
  labs(title = "Plot of Metrics by BoW Token Cutoff Threshold\n(Validation Set)", x = "Token Number Cutoff", y = "Value") +
  theme_minimal()