#Contains map of files (BRAT format for neuroNER) to row_id (MIMIC NOTEEVENTS)
map <- read.csv("~/goals_of_care/summary_stats/row_id_to_file_num.txt", header = F, stringsAsFactors = F, sep = '\t')
head(map)
## V1 V2
## 1 407635 text_00000
## 2 345097 text_00001
## 3 371117 text_00002
## 4 401545 text_00003
## 5 393792 text_00004
## 6 382263 text_00005
colnames(map) <- c("row_id", "file")
#Pull in training data from one file
train <- read.csv("~/goals_of_care/regex_v2/CAR_train_processed.csv", header = T, stringsAsFactors = F, quote = "", row.names = NULL)
head(train)
## X filename LIM COD CAR PAL FAM LIM.machine COD.machine
## 1 0 train_text_00000 0 0 0 0 0 0 0
## 2 1 train_text_00001 0 0 0 0 0 0 0
## 3 2 train_text_00002 0 0 1 0 0 0 0
## 4 3 train_text_00003 0 0 1 0 0 0 0
## 5 4 train_text_00004 0 0 0 0 0 0 0
## 6 5 train_text_00005 0 0 0 0 0 0 0
## CAR.machine PAL.machine FAM.machine
## 1 0 0 0
## 2 1 0 0
## 3 0 0 0
## 4 1 0 0
## 5 0 0 0
## 6 0 0 0
colnames(train)[which(colnames(train) == "filename")] <- "file"
train_set <- unique(train$file)
train_set <- gsub("train_", '', train_set)
train_set <- map[(map$file %in% train_set),]
nrow(train_set)
## [1] 449
#Pull in validation data from one file
valid <- read.csv("~/goals_of_care/regex_v2/CAR_valid_processed.csv", header = T, stringsAsFactors = F, quote = "", row.names = NULL)
head(valid)
## X filename LIM COD CAR PAL FAM LIM.machine COD.machine
## 1 0 valid_text_00449 0 0 0 0 0 0 0
## 2 1 valid_text_00450 0 0 0 0 0 0 0
## 3 2 valid_text_00451 0 0 0 0 0 0 0
## 4 3 valid_text_00452 0 0 0 0 0 0 0
## 5 4 valid_text_00453 0 0 0 0 0 0 0
## 6 5 valid_text_00454 0 0 0 0 0 0 0
## CAR.machine PAL.machine FAM.machine
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
colnames(valid)[which(colnames(valid) == "filename")] <- "file"
valid_set <- unique(valid$file)
valid_set <- gsub("valid_", '', valid_set)
valid_set <- map[(map$file %in% valid_set),]
## ###
## Probably check to make sure they're identical outside of CAR, but safe bet
## ###
nrow(valid_set)
## [1] 192
res <- read.csv("~/goals_of_care/summary_stats/Annotations.csv", header = T, stringsAsFactors = F)
colnames(res) <- tolower(colnames(res))
colnames(res)
## [1] "x"
## [2] "row_id"
## [3] "subject_id"
## [4] "hadm_id"
## [5] "category"
## [6] "description"
## [7] "text"
## [8] "cohort"
## [9] "patient.and.family.care.preferences"
## [10] "patient.and.family.care.preferences.text"
## [11] "communication.with.family"
## [12] "communication.with.family.text"
## [13] "full.code.status"
## [14] "full.code.status.text"
## [15] "code.status.limitations"
## [16] "code.status.limitations.text"
## [17] "palliative.care.team.involvement"
## [18] "palliative.care.team.involvement.text"
## [19] "ambiguous"
## [20] "ambiguous.text"
## [21] "ambiguous.comments"
## [22] "none"
## [23] "operator"
## [24] "stamp"
nrow(res)
## [1] 876
dat <- read.csv("~/goals_of_care/regex/strict_regex_MIMIC_results06Nov17.csv", header = T, stringsAsFactors = F)
colnames(dat) <- tolower(colnames(dat))
colnames(dat)
## [1] "subject_id" "hadm_id" "row_id"
## [4] "chartdate" "charttime" "storetime"
## [7] "category" "description" "cgid"
## [10] "iserror" "text" "admittime"
## [13] "dischtime" "deathtime" "admission_type"
## [16] "admission_location" "discharge_location" "insurance"
## [19] "language" "religion" "marital_status"
## [22] "ethnicity" "edregtime" "edouttime"
## [25] "diagnosis" "hospital_expire_flag" "has_chartevents_data"
## [28] "gender" "dob" "dod"
## [31] "dod_hosp" "dod_ssn" "expire_flag"
## [34] "days_until_death" "chars" "cohort"
## [37] "time_since_admit"
nrow(dat)
## [1] 5339
dat <- merge(res, dat, by = "row_id")
nrow(dat)
## [1] 876
#Duplicated columns in R aren't dropped but given dimensions-- convert to simple name
colnames(dat)[which(colnames(dat) == "text.x")] <- "text"
length(unique(dat$text))
## [1] 641
colnames(dat)
## [1] "row_id"
## [2] "x"
## [3] "subject_id.x"
## [4] "hadm_id.x"
## [5] "category.x"
## [6] "description.x"
## [7] "text"
## [8] "cohort.x"
## [9] "patient.and.family.care.preferences"
## [10] "patient.and.family.care.preferences.text"
## [11] "communication.with.family"
## [12] "communication.with.family.text"
## [13] "full.code.status"
## [14] "full.code.status.text"
## [15] "code.status.limitations"
## [16] "code.status.limitations.text"
## [17] "palliative.care.team.involvement"
## [18] "palliative.care.team.involvement.text"
## [19] "ambiguous"
## [20] "ambiguous.text"
## [21] "ambiguous.comments"
## [22] "none"
## [23] "operator"
## [24] "stamp"
## [25] "subject_id.y"
## [26] "hadm_id.y"
## [27] "chartdate"
## [28] "charttime"
## [29] "storetime"
## [30] "category.y"
## [31] "description.y"
## [32] "cgid"
## [33] "iserror"
## [34] "text.y"
## [35] "admittime"
## [36] "dischtime"
## [37] "deathtime"
## [38] "admission_type"
## [39] "admission_location"
## [40] "discharge_location"
## [41] "insurance"
## [42] "language"
## [43] "religion"
## [44] "marital_status"
## [45] "ethnicity"
## [46] "edregtime"
## [47] "edouttime"
## [48] "diagnosis"
## [49] "hospital_expire_flag"
## [50] "has_chartevents_data"
## [51] "gender"
## [52] "dob"
## [53] "dod"
## [54] "dod_hosp"
## [55] "dod_ssn"
## [56] "expire_flag"
## [57] "days_until_death"
## [58] "chars"
## [59] "cohort.y"
## [60] "time_since_admit"
#Convert dob to numeric
dat$dob <- as.numeric(as.Date(dat$dob, "%Y-%m-%d %H:%M:%S"))
for (i in 1:nrow(dat)){
if (dat$row_id[i] %in% train_set$row_id){
dat$cohort[i] <- "train"
} else if (dat$row_id[i] %in% valid_set$row_id){
dat$cohort[i] <- "validation"
} else {
dat$cohort[i] <- ""
}
}
table(dat$cohort)
##
## train validation
## 626 250
#All notes accounted for
ages <- function(dat){
dat$age <- (dat$admittime - dat$dob)/365
#Correct for patient ages > 90 by imputing median age (91.4)
dat[(dat$age >= 90),]$age <- 91.4
return(dat)
}
dat <- ages(dat)
plotDat <- function(dat, column, bs, mn, xl, yl){
tmp <- as.matrix(table(dat[[column]], dat[["cohort"]]))
prop <- prop.table(tmp, margin = 2)#2 for column-wise proportions
par(mar = c(5.0, 4.0, 4.0, 15), xpd = TRUE)
barplot(prop, col = cm.colors(length(rownames(prop))), beside = bs,width = 2, main = mn, xlab = xl, ylab = yl)
legend("topright", inset = c(-0.90,0), fill = cm.colors(length(rownames(prop))), legend=rownames(prop))
}
#Check values when correcting for duplicates
test <- dat[!duplicated(dat$text),]
nrow(test)
## [1] 641
table(test$cohort)
##
## train validation
## 449 192
#rcompanion for pairWiseNominalIndependence
library("rcompanion")
test <- dat[!duplicated(dat$hadm_id.x),]
nrow(test)
## [1] 403
table(test$cohort)
##
## train validation
## 280 123
## Admission Type
plotDat(test, "admission_type", bs = F, mn = "Admission Type Proportions by Cohort", xl = "Cohort", yl = "Proportion")
tmp <- table(test$admission_type, test$cohort)
tmp
##
## train validation
## ELECTIVE 4 1
## EMERGENCY 272 121
## URGENT 4 1
chisq.test(tmp)
## Warning in chisq.test(tmp): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: tmp
## X-squared = 0.53528, df = 2, p-value = 0.7652
pairwiseNominalIndependence(
as.matrix(tmp),
fisher = F, gtest = F, chisq = T, method = "fdr")
## Warning in chisq.test(Dataz, ...): Chi-squared approximation may be
## incorrect
## Warning in chisq.test(Dataz, ...): Chi-squared approximation may be
## incorrect
## Warning in chisq.test(Dataz, ...): Chi-squared approximation may be
## incorrect
## Comparison p.Chisq p.adj.Chisq
## 1 ELECTIVE : EMERGENCY 0.975 1
## 2 ELECTIVE : URGENT 1.000 1
## 3 EMERGENCY : URGENT 0.975 1
## Admission Location
plotDat(test, "admission_location", bs = F, mn = "Admission Location Proportions by Cohort", xl = "Cohort", yl = "Proportion")
tmp <- table(test$admission_location, test$cohort)
tmp
##
## train validation
## CLINIC REFERRAL/PREMATURE 3 2
## EMERGENCY ROOM ADMIT 213 91
## PHYS REFERRAL/NORMAL DELI 6 1
## TRANSFER FROM HOSP/EXTRAM 57 27
## TRANSFER FROM SKILLED NUR 1 2
chisq.test(tmp)
## Warning in chisq.test(tmp): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: tmp
## X-squared = 3.0838, df = 4, p-value = 0.5439
pairwiseNominalIndependence(
as.matrix(tmp),
fisher = F, gtest = F, chisq = T, method = "fdr")
## Warning in chisq.test(Dataz, ...): Chi-squared approximation may be
## incorrect
## Warning in chisq.test(Dataz, ...): Chi-squared approximation may be
## incorrect
## Warning in chisq.test(Dataz, ...): Chi-squared approximation may be
## incorrect
## Warning in chisq.test(Dataz, ...): Chi-squared approximation may be
## incorrect
## Warning in chisq.test(Dataz, ...): Chi-squared approximation may be
## incorrect
## Warning in chisq.test(Dataz, ...): Chi-squared approximation may be
## incorrect
## Warning in chisq.test(Dataz, ...): Chi-squared approximation may be
## incorrect
## Warning in chisq.test(Dataz, ...): Chi-squared approximation may be
## incorrect
## Warning in chisq.test(Dataz, ...): Chi-squared approximation may be
## incorrect
## Comparison p.Chisq
## 1 CLINIC REFERRAL/PREMATURE : EMERGENCY ROOM ADMIT 1.000
## 2 CLINIC REFERRAL/PREMATURE : PHYS REFERRAL/NORMAL DELI 0.735
## 3 CLINIC REFERRAL/PREMATURE : TRANSFER FROM HOSP/EXTRAM 1.000
## 4 CLINIC REFERRAL/PREMATURE : TRANSFER FROM SKILLED NUR 1.000
## 5 EMERGENCY ROOM ADMIT : PHYS REFERRAL/NORMAL DELI 0.633
## 6 EMERGENCY ROOM ADMIT : TRANSFER FROM HOSP/EXTRAM 0.798
## 7 EMERGENCY ROOM ADMIT : TRANSFER FROM SKILLED NUR 0.455
## 8 PHYS REFERRAL/NORMAL DELI : TRANSFER FROM HOSP/EXTRAM 0.577
## 9 PHYS REFERRAL/NORMAL DELI : TRANSFER FROM SKILLED NUR 0.366
## 10 TRANSFER FROM HOSP/EXTRAM : TRANSFER FROM SKILLED NUR 0.533
## p.adj.Chisq
## 1 1
## 2 1
## 3 1
## 4 1
## 5 1
## 6 1
## 7 1
## 8 1
## 9 1
## 10 1
## Discharge Location
plotDat(test, "discharge_location", bs = T,mn = "Admission Location Proportions by Cohort", xl = "Cohort", yl = "Proportion")
tmp <- table(test$discharge_location, test$cohort)
tmp
##
## train validation
## DEAD/EXPIRED 280 123
chisq.test(tmp)
##
## Chi-squared test for given probabilities
##
## data: tmp
## X-squared = 61.164, df = 1, p-value = 5.252e-15
#Need to re-import dat
test <- dat[!duplicated(dat$subject_id.x),]
length(unique(test$subject_id.x))
## [1] 403
#All age
summary(test$age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 22.87 61.73 73.87 71.01 83.00 91.40
## AGE
boxplot(test$age ~ test$cohort)
t.test(test$age ~ test$cohort)
##
## Welch Two Sample t-test
##
## data: test$age by test$cohort
## t = 0.97578, df = 213.74, p-value = 0.3303
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -1.661378 4.918835
## sample estimates:
## mean in group train mean in group validation
## 71.51172 69.88299
wilcox.test(test$age ~ test$cohort)
##
## Wilcoxon rank sum test with continuity correction
##
## data: test$age by test$cohort
## W = 18014, p-value = 0.4614
## alternative hypothesis: true location shift is not equal to 0
tmp_train <- test[(test$cohort == "train"),]$age
tmp_valid <- test[(test$cohort == "validation"),]$age
summary(tmp_train)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 22.87 61.79 74.38 71.51 82.49 91.40
## All gender
table(test$gender)
##
## F M
## 188 215
table(test$gender)/nrow(test)*100
##
## F M
## 46.65012 53.34988
## GENDER
plotDat(test, "gender", bs = F, mn = "Gender by Cohort", xl = "Cohort", yl = "Proportion")
tmp <- table(test$gender, test$cohort)
tmp
##
## train validation
## F 136 52
## M 144 71
prop.test(tmp)
##
## 2-sample test for equality of proportions with continuity
## correction
##
## data: tmp
## X-squared = 1.1195, df = 1, p-value = 0.29
## alternative hypothesis: two.sided
## 95 percent confidence interval:
## -0.04101645 0.14829007
## sample estimates:
## prop 1 prop 2
## 0.7234043 0.6697674
##Insurance
plotDat(test, "insurance", bs = F, mn = "Insurance by Cohort", xl = "Cohort", yl = "Proportion")
tmp <- table(test$insurance, test$cohort)
tmp
##
## train validation
## Government 3 2
## Medicaid 21 9
## Medicare 196 87
## Private 56 25
## Self Pay 4 0
chisq.test(tmp)
## Warning in chisq.test(tmp): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: tmp
## X-squared = 1.9838, df = 4, p-value = 0.7387
pairwiseNominalIndependence(
as.matrix(tmp),
fisher = F, gtest = F, chisq = T, method = "fdr")
## Warning in chisq.test(Dataz, ...): Chi-squared approximation may be
## incorrect
## Warning in chisq.test(Dataz, ...): Chi-squared approximation may be
## incorrect
## Warning in chisq.test(Dataz, ...): Chi-squared approximation may be
## incorrect
## Warning in chisq.test(Dataz, ...): Chi-squared approximation may be
## incorrect
## Warning in chisq.test(Dataz, ...): Chi-squared approximation may be
## incorrect
## Warning in chisq.test(Dataz, ...): Chi-squared approximation may be
## incorrect
## Warning in chisq.test(Dataz, ...): Chi-squared approximation may be
## incorrect
## Comparison p.Chisq p.adj.Chisq
## 1 Government : Medicaid 1.000 1
## 2 Government : Medicare 1.000 1
## 3 Government : Private 1.000 1
## 4 Government : Self Pay 0.530 1
## 5 Medicaid : Medicare 1.000 1
## 6 Medicaid : Private 1.000 1
## 7 Medicaid : Self Pay 0.500 1
## 8 Medicare : Private 1.000 1
## 9 Medicare : Self Pay 0.435 1
## 10 Private : Self Pay 0.447 1
##Ethnicity
##Clean ethnicity
test[(grepl("WHITE|PORTUGUESE", test$ethnicity)),]$ethnicity <- "WHITE"
test[(grepl("ASIAN", test$ethnicity)),]$ethnicity <- "ASIAN"
test[(grepl("BLACK", test$ethnicity)),]$ethnicity <- "BLACK"
test[(grepl("HISPANIC", test$ethnicity)),]$ethnicity <- "HISPANIC"
test[(grepl("MIDDLE|NATIVE|MULTI|DECLINED|UNABLE|OTHER|NOT",test$ethnicity)),]$ethnicity <- "OTHER"
plotDat(test, "ethnicity", bs = F, mn = "Ethnicity by Cohort", xl = "Cohort", yl = "Proportion")
tmp <- table(test$ethnicity, test$cohort)
tmp
##
## train validation
## ASIAN 10 3
## BLACK 21 11
## HISPANIC 5 7
## OTHER 33 12
## WHITE 211 90
chisq.test(tmp)
## Warning in chisq.test(tmp): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: tmp
## X-squared = 5.3111, df = 4, p-value = 0.2568
pairwiseNominalIndependence(
as.matrix(tmp),
fisher = F, gtest = F, chisq = T, method = "fdr")
## Warning in chisq.test(Dataz, ...): Chi-squared approximation may be
## incorrect
## Warning in chisq.test(Dataz, ...): Chi-squared approximation may be
## incorrect
## Warning in chisq.test(Dataz, ...): Chi-squared approximation may be
## incorrect
## Warning in chisq.test(Dataz, ...): Chi-squared approximation may be
## incorrect
## Warning in chisq.test(Dataz, ...): Chi-squared approximation may be
## incorrect
## Warning in chisq.test(Dataz, ...): Chi-squared approximation may be
## incorrect
## Warning in chisq.test(Dataz, ...): Chi-squared approximation may be
## incorrect
## Comparison p.Chisq p.adj.Chisq
## 1 ASIAN : BLACK 0.6990 0.920
## 2 ASIAN : HISPANIC 0.1650 0.550
## 3 ASIAN : OTHER 1.0000 1.000
## 4 ASIAN : WHITE 0.8280 0.920
## 5 BLACK : HISPANIC 0.2730 0.683
## 6 BLACK : OTHER 0.6340 0.920
## 7 BLACK : WHITE 0.7480 0.920
## 8 HISPANIC : OTHER 0.0849 0.424
## 9 HISPANIC : WHITE 0.0767 0.424
## 10 OTHER : WHITE 0.7880 0.920
## Marital Status
plotDat(test, "marital_status", bs = F, mn = "Marital Status by Cohort", xl = "Cohort", yl = "Proportion")
tmp <- table(test$marital_status, test$cohort)
tmp
##
## train validation
## 25 5
## DIVORCED 19 6
## MARRIED 128 57
## SEPARATED 6 1
## SINGLE 54 32
## UNKNOWN (DEFAULT) 0 1
## WIDOWED 48 21
chisq.test(tmp)
## Warning in chisq.test(tmp): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: tmp
## X-squared = 8.185, df = 6, p-value = 0.2249
pairwiseNominalIndependence(
as.matrix(tmp),
fisher = F, gtest = F, chisq = T, method = "fdr")
## Warning in chisq.test(Dataz, ...): Chi-squared approximation may be
## incorrect
## Warning in chisq.test(Dataz, ...): Chi-squared approximation may be
## incorrect
## Warning in chisq.test(Dataz, ...): Chi-squared approximation may be
## incorrect
## Warning in chisq.test(Dataz, ...): Chi-squared approximation may be
## incorrect
## Warning in chisq.test(Dataz, ...): Chi-squared approximation may be
## incorrect
## Warning in chisq.test(Dataz, ...): Chi-squared approximation may be
## incorrect
## Warning in chisq.test(Dataz, ...): Chi-squared approximation may be
## incorrect
## Warning in chisq.test(Dataz, ...): Chi-squared approximation may be
## incorrect
## Warning in chisq.test(Dataz, ...): Chi-squared approximation may be
## incorrect
## Warning in chisq.test(Dataz, ...): Chi-squared approximation may be
## incorrect
## Warning in chisq.test(Dataz, ...): Chi-squared approximation may be
## incorrect
## Comparison p.Chisq p.adj.Chisq
## 1 : DIVORCED 0.7350 0.908
## 2 : MARRIED 0.1710 0.908
## 3 : SEPARATED 1.0000 1.000
## 4 : SINGLE 0.0641 0.908
## 5 : UNKNOWN (DEFAULT) 0.4300 0.908
## 6 : WIDOWED 0.2370 0.908
## 7 DIVORCED : MARRIED 0.6420 0.908
## 8 DIVORCED : SEPARATED 0.9740 1.000
## 9 DIVORCED : SINGLE 0.3240 0.908
## 10 DIVORCED : UNKNOWN (DEFAULT) 0.5960 0.908
## 11 DIVORCED : WIDOWED 0.7250 0.908
## 12 MARRIED : SEPARATED 0.6060 0.908
## 13 MARRIED : SINGLE 0.3650 0.908
## 14 MARRIED : UNKNOWN (DEFAULT) 0.6840 0.908
## 15 MARRIED : WIDOWED 1.0000 1.000
## 16 SEPARATED : SINGLE 0.4190 0.908
## 17 SEPARATED : UNKNOWN (DEFAULT) 0.5370 0.908
## 18 SEPARATED : WIDOWED 0.6450 0.908
## 19 SINGLE : UNKNOWN (DEFAULT) 0.8020 0.936
## 20 SINGLE : WIDOWED 0.4760 0.908
## 21 UNKNOWN (DEFAULT) : WIDOWED 0.6870 0.908
test <- dat[!duplicated(dat$text),]
#Words
test$words <- sapply(gregexpr("\\W+", unique(test$text)), length) + 1
#Chars
test$chars <- nchar(test$text)
##
boxplot(test$words ~ test$cohort)
t.test(test$words ~ test$cohort)
##
## Welch Two Sample t-test
##
## data: test$words by test$cohort
## t = 6.0222, df = 394.81, p-value = 3.94e-09
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 129.2318 254.5052
## sample estimates:
## mean in group train mean in group validation
## 1058.5768 866.7083
wilcox.test(test$words ~ test$cohort)
##
## Wilcoxon rank sum test with continuity correction
##
## data: test$words by test$cohort
## W = 56340, p-value = 7.128e-10
## alternative hypothesis: true location shift is not equal to 0
## Word Stats
summary(test$words)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 92 731 946 1001 1214 2569
tmp_train <- test[(test$cohort == "train"),]$words
tmp_valid <- test[(test$cohort == "validation"),]$words
summary(tmp_train)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 110 772 1021 1059 1284 2328
summary(tmp_valid)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 92.0 638.8 792.5 866.7 1059.0 2569.0
##Char Stats
boxplot(test$chars ~ test$cohort)
t.test(test$chars ~ test$cohort)
##
## Welch Two Sample t-test
##
## data: test$chars by test$cohort
## t = 5.7033, df = 388.99, p-value = 2.329e-08
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 755.6381 1550.6901
## sample estimates:
## mean in group train mean in group validation
## 6404.154 5250.990
wilcox.test(test$chars ~ test$cohort)
##
## Wilcoxon rank sum test with continuity correction
##
## data: test$chars by test$cohort
## W = 55840, p-value = 3.027e-09
## alternative hypothesis: true location shift is not equal to 0
summary(test$chars)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 558 4369 5656 6059 7430 15840
tmp_train <- test[(test$cohort == "train"),]$chars
tmp_valid <- test[(test$cohort == "validation"),]$chars
summary(tmp_train)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 700 4593 6098 6404 7870 14080
summary(tmp_valid)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 558 3798 4830 5251 6374 15840
#Load phrases
rules <- read.csv("~/goals_of_care/regex/rule-based_NLP_domains.csv", header = T, stringsAsFactors = F)
#Rename columns
colnames(rules) <- c("CAR", "FAM", "LIM", "COD", "PAL")
#Show rules
print(rules)
## CAR FAM
## 1 goals of care family discussion
## 2 goc family discussions
## 3 goals for care family meeting
## 4 goals of treatment family communication
## 5 goals for treatment durable power of attorney
## 6 treatment goals health care proxy
## 7 care preferences hcp
## 8 extending life understanding of illness
## 9 comfort-focused care understanding of prognosis
## 10 supportive care
## 11 no feeding tube
## 12 no dialysis
## 13 no hemodialysis
## 14 no HD
## 15 quality of life
## 16 priorities
## 17 end of life care
## 18 living will
## 19 molst
## 20 advance directives
## 21 advance care planning
## 22 acp
## 23 hospice
## 24 full code confirmed
## 25 full code d/w
## 26 full code discussed
## 27 full code verified
## 28 would like to be full code
## 29 wishes to be full code
## 30 would like to remain full code
## 31 wishes to remain full code
## 32 wish to be full code
## 33 remaining full code
## LIM COD PAL
## 1 dnr full code pallcare
## 2 dnrdni palliative care
## 3 dni pall care
## 4 do not resuscitate palliative medicine
## 5 do-not-resuscitate
## 6 do not intubate
## 7 do-not-intubate
## 8 no intubation
## 9 chest compressions
## 10 no defibrillation
## 11 shocks
## 12 shock therapy
## 13 no cpr
## 14 no endotracheal intubation
## 15 no mechanical intubation
## 16 no ventilator
## 17 no breathing machine
## 18 no breathing tube
## 19 no chest compressions
## 20 cmo
## 21 comfort measures
## 22 comfort care
## 23 limitations of life-sustaining treatment
## 24 llst
## 25
## 26
## 27
## 28
## 29
## 30
## 31
## 32
## 33
res <- dat
#colnames(res)[which(colnames(res) == "patient.and.family.care.preferences")] <- "CAR"
#colnames(res)[which(colnames(res) == "communication.with.family")] <- "FAM"
#colnames(res)[which(colnames(res) == "code.status.limitations")] <- "LIM"
#colnames(res)[which(colnames(res) == "full.code.status")] <- "COD"
#colnames(res)[which(colnames(res) == "palliative.care.team.involvement")] <- "PAL"
strictRegex() will accept all phrases kwds, and all note texts, texts, it will utilize grepl() to find direct matches in the text, and will return a list of booleans.
strictRegex <- function(kwds, texts){
#Create a list to store results
tmpList <- list()
#Loop through all keywords
for (i in 1:length(kwds)){
#Store results as a logical vector in its respective list entry position
tmpList[[i]] <- grepl(kwds[i], texts, ignore.case = TRUE)
}
#Return list and control to environment
return(tmpList)
}
## Preprocess domain dictionaries to remove ''
CAR <- strictRegex(rules$CAR[rules$CAR != ''], res$text)
FAM <- strictRegex(rules$FAM[rules$FAM != ''], res$text)
LIM <- strictRegex(rules$LIM[rules$LIM != ''], res$text)
COD <- strictRegex(rules$COD[rules$COD != ''], res$text)
PAL <- strictRegex(rules$PAL[rules$PAL != ''], res$text)
to_df <- function(domain, rule){
#Convert list from grepl to data frame
domain <- as.data.frame(domain)
#Show column names as phrases
colnames(domain) <- rule[rule != '']
#Multiply by 1 for binary numeric
domain <- domain*1
return(domain)
}
CAR <- to_df(CAR, rules$CAR)
FAM <- to_df(FAM, rules$FAM)
LIM <- to_df(LIM, rules$LIM)
COD <- to_df(COD, rules$COD)
PAL <- to_df(PAL, rules$PAL)
phrase_to_domain <- function(dat){
inc <- vector()
for (i in 1:nrow(dat)){
#Collapse phrases by domain, presence of any phrase indicates domain
inc[i] <- any(dat[i,] == 1)
}
#Multiply by one to convert logical to binary numeric
return(inc*1)
}
res <- cbind(phrase_to_domain(CAR),
phrase_to_domain(FAM),
phrase_to_domain(LIM),
phrase_to_domain(COD),
phrase_to_domain(PAL))
colnames(res) <- c("RE_CAR", "RE_FAM", "RE_LIM", "RE_COD", "RE_PAL")
res <- as.data.frame(res)
head(res)
## RE_CAR RE_FAM RE_LIM RE_COD RE_PAL
## 1 0 0 0 1 0
## 2 0 0 0 1 0
## 3 0 0 0 1 0
## 4 0 0 0 1 0
## 5 0 0 0 1 0
## 6 0 1 0 1 0
nrow(res)
## [1] 876
#Merge res and data
res <- cbind(dat, res)
Remove duplicates, keep first observation of the note, as the first observation will have the most information, subsequent annotations will have additional phrases associated with domains.
res <- res[!duplicated(res$text),]
nrow(res)
## [1] 641
statGen <- function(dat, hum_lab, re_lab, set){
dat <- dat[(dat$cohort == set),]
#Vector to hold results
rVec <- vector()
#True Positives
tp <- 0
#True Negatives
tn <- 0
#False Positives
fp <- 0
#False Negatives
fn <- 0
#tmp to hold dat[[lab]][i]
re_tmp <- as.character()
hum_tmp <- as.character()
for (i in 1:nrow(dat)){
re_tmp <- dat[[re_lab]][i]
hum_tmp <- dat[[hum_lab]][i]
rVec[length(rVec)+1] <- paste("Token: ", dat$TEXT[i],"\n", re_lab, " Assignment: ", re_tmp, "\n", sep = '')
#If both model and human aren't negative
if (re_tmp != 0 & hum_tmp != 0 & !is.na(hum_tmp)){
#True positive
tp <- tp + 1
#if human marks negative and model assigns label
} else if (hum_tmp == 0 & re_tmp != 0 & !is.na(hum_tmp)){
#False positive
fp <- fp + 1
#if human marks label and model doesn't assign label
} else if (hum_tmp != 0 & re_tmp == 0 & !is.na(hum_tmp)){
#False negative
fn <- fn + 1
#if human marks negative and model marks negative
} else if (re_tmp == 0 & hum_tmp == 0 & !is.na(hum_tmp)){
#True negative
tn <- tn + 1
} else if (is.null(re_tmp) | is.null(hum_tmp)){
break
}
}
#Hold results in txt to check if necessary
write.csv(rVec,
file = paste(set,"_",re_lab,"_OUTPUT_NOTE_LEVEL_07Dec17.txt", sep = ''),
quote = F, row.names = F)
tmpFrame <- cbind(tp, tn, fp, fn)
colnames(tmpFrame) <- c("tp", "tn", "fp", "fn")
return(as.data.frame(tmpFrame))
}
train_results <- rbind(statGen(res, "patient.and.family.care.preferences", "RE_CAR", "train"),
statGen(res, "full.code.status", "RE_COD", "train"),
statGen(res, "communication.with.family", "RE_FAM", "train"),
statGen(res, "code.status.limitations","RE_LIM", "train"),
statGen(res, "palliative.care.team.involvement", "RE_PAL", "train"))
row.names(train_results) <- c("RE_CAR", "RE_COD", "RE_FAM", "RE_LIM", "RE_PAL")
validation_results <- rbind(statGen(res, "patient.and.family.care.preferences", "RE_CAR", "validation"),
statGen(res, "full.code.status", "RE_COD", "validation"),
statGen(res, "communication.with.family", "RE_FAM", "validation"),
statGen(res, "code.status.limitations","RE_LIM", "validation"),
statGen(res, "palliative.care.team.involvement", "RE_PAL", "validation"))
row.names(validation_results) <- c("RE_CAR", "RE_COD", "RE_FAM", "RE_LIM", "RE_PAL")
\[accuracy = \frac{(tp+tn)}{(tp + tn + fp + fn)}\] \[precision = \frac{tp}{(tp+fp)}\] \[accuracy = \frac{tp}{(tp+fn)}\] \[specificity = \frac{tn}{(tn+fp)}\] \[F_1 = 2 \cdot \frac{(precision*recall)}{precision + recall}\]
modelStats <- function(dat){
accuracy <- (dat$tp + dat$tn)/(dat$tp + dat$tn + dat$fp + dat$fn)
precision <- dat$tp/(dat$tp + dat$fp)
recall <- dat$tp/(dat$tp + dat$fn)
specificity <- dat$tn/(dat$tn+dat$fp)
F1 <- 2*(precision*recall)/(precision + recall)
#dat$accuracy <- paste(sprintf("%2.2f", round(100*accuracy,3)), '%', sep = '')
#dat$precision <- paste(sprintf("%2.2f", round(100*precision,3)), '%', sep = '')
#dat$recall <- paste(sprintf("%2.2f", round(100*recall,3)), '%', sep = '')
#dat$specificity <- paste(sprintf("%2.2f", round(100*specificity,3)), '%', sep = '')
#dat$F1 <- round(100*2*(precision * recall)/(precision + recall),3)
dat$accuracy <- round(accuracy*100, 2)
dat$precision <- round(precision*100, 2)
dat$recall <- round(recall*100, 2)
dat$specificity <- round(specificity*100, 2)
dat$F1 <- round(F1*100, 2)
return(dat)
}
train_stats <- modelStats(train_results)
write.csv(train_stats, file = "regex_train_stats_07Dec17.csv", row.names = F)
print(train_stats)
## tp tn fp fn accuracy precision recall specificity F1
## RE_CAR 37 292 53 67 73.27 41.11 35.58 84.64 38.14
## RE_COD 192 116 12 36 86.52 94.12 84.21 90.62 88.89
## RE_FAM 69 246 69 65 70.16 50.00 51.49 78.10 50.74
## RE_LIM 144 204 58 43 77.51 71.29 77.01 77.86 74.04
## RE_PAL 10 427 12 0 97.33 45.45 100.00 97.27 62.50
validation_stats <- modelStats(validation_results)
write.csv(validation_stats, file = "regex_validation_stats_07Dec17.csv", row.names = F)
print(validation_stats)
## tp tn fp fn accuracy precision recall specificity F1
## RE_CAR 6 162 5 19 87.50 54.55 24.00 97.01 33.33
## RE_COD 80 46 3 13 88.73 96.39 86.02 93.88 90.91
## RE_FAM 12 156 5 19 87.50 70.59 38.71 96.89 50.00
## RE_LIM 49 96 26 21 75.52 65.33 70.00 78.69 67.59
## RE_PAL 0 190 1 1 98.96 0.00 0.00 99.48 NaN