Load Data
library("data.table")
dat <- fread("~/goals_of_care/external_validation/over_75_cohort_17Jan18.csv", header = T, stringsAsFactors = F)
nrow(dat)
## [1] 11575
#Remove duplicate notes
dat <- dat[!duplicated(dat$TEXT),]
nrow(dat)
## [1] 10250
mim <- fread("~/goals_of_care/external_validation/NOTES_STAYS_ADM_PAT.csv", header = T, stringsAsFactors = F)
##
Read 0.0% of 385794 rows
Read 5.2% of 385794 rows
Read 10.4% of 385794 rows
Read 15.6% of 385794 rows
Read 20.7% of 385794 rows
Read 25.9% of 385794 rows
Read 31.1% of 385794 rows
Read 33.7% of 385794 rows
Read 38.9% of 385794 rows
Read 169621 rows and 44 (of 44) columns from 1.243 GB file in 00:00:17
nrow(mim)
## [1] 169621
test <- merge(dat, mim, by = c("SUBJECT_ID", "HADM_ID", "ROW_ID"))
colnames(test)
## [1] "SUBJECT_ID" "HADM_ID" "ROW_ID"
## [4] "CHARTDATE.x" "CHARTTIME.x" "STORETIME.x"
## [7] "CATEGORY.x" "DESCRIPTION.x" "CGID.x"
## [10] "ISERROR.x" "TEXT.x" "CHARTDATE.y"
## [13] "CHARTTIME.y" "STORETIME.y" "CATEGORY.y"
## [16] "DESCRIPTION.y" "CGID.y" "ISERROR.y"
## [19] "TEXT.y" "ADMITTIME" "DISCHTIME"
## [22] "DEATHTIME" "ADMISSION_TYPE" "ADMISSION_LOCATION"
## [25] "DISCHARGE_LOCATION" "INSURANCE" "LANGUAGE"
## [28] "RELIGION" "MARITAL_STATUS" "ETHNICITY"
## [31] "EDREGTIME" "EDOUTTIME" "DIAGNOSIS"
## [34] "HOSPITAL_EXPIRE_FLAG" "HAS_CHARTEVENTS_DATA" "GENDER"
## [37] "DOB" "DOD" "DOD_HOSP"
## [40] "DOD_SSN" "EXPIRE_FLAG" "ICUSTAY_ID"
## [43] "DBSOURCE" "FIRST_CAREUNIT" "LAST_CAREUNIT"
## [46] "FIRST_WARDID" "LAST_WARDID" "INTIME"
## [49] "OUTTIME" "LOS" "AGE"
## [52] "ADMISSION_NUMBER"
#Order data
test <- test[with(test, order(SUBJECT_ID, HADM_ID, ADMITTIME)),]
Correct columnnames
for (name in colnames((test))){
#If the column name is duplicated
if (grepl(".y", name)){
#Remove duplicate
test[[name]] <- NULL
} else if (grepl(".x", name)){
#Rename original to remove the x dimension
colnames(test)[which(colnames(test) == name)] <- gsub(".x", '', name)
}
}
rm(name)
colnames(test)
## [1] "SUBJECT_ID" "HADM_ID" "ROW_ID"
## [4] "CHARTDATE" "CHARTTIME" "STORETIME"
## [7] "CATEGORY" "DESCRIPTION" "CGID"
## [10] "ISERROR" "TEXT" "ADMITTIME"
## [13] "DISCHTIME" "DEATHTIME" "ADMISSION_TYPE"
## [16] "ADMISSION_LOCATION" "DISCHARGE_LOCATION" "INSURANCE"
## [19] "LANGUAGE" "RELIGION" "MARITAL_STATUS"
## [22] "ETHNICITY" "EDREGTIME" "EDOUTTIME"
## [25] "DIAGNOSIS" "HOSPITAL_EXPIRE_FLAG" "HAS_CHARTEVENTS_DATA"
## [28] "GENDER" "DOB" "DOD"
## [31] "DOD_HOSP" "DOD_SSN" "EXPIRE_FLAG"
## [34] "ICUSTAY_ID" "DBSOURCE" "FIRST_CAREUNIT"
## [37] "LAST_CAREUNIT" "FIRST_WARDID" "LAST_WARDID"
## [40] "INTIME" "OUTTIME" "LOS"
## [43] "AGE" "ADMISSION_NUMBER"
length(unique(test$SUBJECT_ID))
## [1] 1141
length(unique(test$HADM_ID))
## [1] 1350
length(unique(test$TEXT))
## [1] 10250
#Add cohort data
test$COHORT <- rep("external validation", each = nrow(test))
Utility Functions
ages <- function(dat){
dat$AGE <- (dat$ADMITTIME - dat$DOB)/365
#Correct for patient ages > 90 by imputing median age (91.4)
dat[(dat$AGE >= 90),]$AGE <- 91.4
return(dat)
}
plotDat <- function(dat, column, bs, mn, xl, yl){
tmp <- as.matrix(table(dat[[column]]))
prop <- prop.table(tmp)#2 for column-wise proportions
par(mar = c(5.0, 4.0, 4.0, 15), xpd = TRUE)
barplot(prop, col = cm.colors(length(rownames(prop))), beside = bs,width = 2, main = mn, xlab = xl, ylab = yl)
legend("topright", inset = c(-0.90,0), fill = cm.colors(length(rownames(prop))), legend=rownames(prop))
}
clean_text <- function(tokens, printout){
#Create a fake patient note phrase as a representative sample
ex_token <- "Example note:\nThe patient is a 81yo m who was found down in [** location **] on [** date **] by daughter, [** name **].\n Pt was in usual state of health until four days ago, when began to complain to family of heartburn for which the pt was taking tums in addition to his prescribed PPI, without resolution."
if (printout){
print(substr(ex_token, 1, 100))
}
#Remove carriage returns, convert to lower
tokens <- tolower(gsub('\n', ' ', tokens))
tokens <- tolower(gsub('\r', ' ', tokens))
ex_token <- tolower(gsub('\n', ' ', ex_token))
if (printout){
cat("Result after removing carriage returns:\n")
print(substr(ex_token, 1, 100))
}
#https://stackoverflow.com/questions/13529360/replace-text-within-parenthesis-in-r
#Remove obfuscations between '[' and ']'
tokens <- gsub(" *\\[.*?\\] *", ' ', tokens)
ex_token <- gsub(" *\\[.*?\\] *", ' ', ex_token)
if (printout){
cat("Result after leaving [obfuscation]:\n")
print(substr(ex_token, 1, 100))
}
#Keep only words & numeric
tokens <- gsub("[^[:alnum:][:space:]]", '', tokens)
ex_token <- gsub("[^[:alnum:][:space:]]", '', ex_token)
if (printout){
cat("Result after removing all but alphanumeric and spaces:\n")
print(substr(ex_token, 1, 100))
}
#Remove numeric
tokens <- gsub("\\d+", '', tokens)
ex_token <- gsub("\\d+", '', ex_token)
if (printout){
cat("Result after removing all but alphabetic characters and spaces:\n")
print(substr(ex_token, 1, 100))
}
#Keep only a single white space
#https://stackoverflow.com/questions/25707647/merge-multiple-spaces-to-single-space-remove-trailing-leading-spaces
tokens <- gsub("(?<=[\\s])\\s*|^\\s+|\\s+$", '', tokens, perl=TRUE)
ex_token <- gsub("(?<=[\\s])\\s*|^\\s+|\\s+$", '', ex_token, perl=TRUE)
if (printout){
cat("Result after keeping only single spaces:\n")
print(substr(ex_token, 1, 100))
}
return(tokens)
}
plot_bow <- function(bows, n, lab){
tmp_tab <- table(bows)[rev(order(table(bows)))]
par(mai=c(1,2,1,1))
barplot(rev(head(tmp_tab, n)), horiz = T, las = 1,
main = paste("Most Frequent Words in the ",lab ," Set", sep = ''),
xlab = "Frequency")
}
#rcompanion for pairWiseNominalIndependence
library("rcompanion")
Create Ages
test <- ages(test)
#Colnames
colnames(test) <- tolower(colnames(test))
#Store data
tmp_two <- test
Hospital Admission Level Data
#Subset
test <- test[!duplicated(test$hadm_id),]
nrow(test)
## [1] 1350
table(test$cohort)
##
## external validation
## 1350
## Admission Type
plotDat(test, "admission_type", bs = F, mn = "Admission Type Proportions by Cohort", xl = "Cohort", yl = "Proportion")

tmp <- table(test$admission_type, test$cohort)
tmp
##
## external validation
## ELECTIVE 60
## EMERGENCY 1283
## URGENT 7
chisq.test(tmp)
##
## Chi-squared test for given probabilities
##
## data: tmp
## X-squared = 2316.1, df = 2, p-value < 2.2e-16
pairwiseNominalIndependence(
as.matrix(tmp),
fisher = F, gtest = F, chisq = T, method = "fdr")
## Comparison p.Chisq p.adj.Chisq
## 1 ELECTIVE : EMERGENCY 3.44e-244 5.16e-244
## 2 ELECTIVE : URGENT 9.48e-11 9.48e-11
## 3 EMERGENCY : URGENT 1.90e-276 5.70e-276
## Admission Location
plotDat(test, "admission_location", bs = F, mn = "Admission Location Proportions by Cohort", xl = "Cohort", yl = "Proportion")

tmp <- table(test$admission_location, test$cohort)
tmp
##
## external validation
## CLINIC REFERRAL/PREMATURE 32
## EMERGENCY ROOM ADMIT 1067
## PHYS REFERRAL/NORMAL DELI 75
## TRANSFER FROM HOSP/EXTRAM 169
## TRANSFER FROM OTHER HEALT 1
## TRANSFER FROM SKILLED NUR 6
chisq.test(tmp)
##
## Chi-squared test for given probabilities
##
## data: tmp
## X-squared = 3866.6, df = 5, p-value < 2.2e-16
pairwiseNominalIndependence(
as.matrix(tmp),
fisher = F, gtest = F, chisq = T, method = "fdr")
## Comparison p.Chisq
## 1 CLINIC REFERRAL/PREMATURE : EMERGENCY ROOM ADMIT 5.59e-214
## 2 CLINIC REFERRAL/PREMATURE : PHYS REFERRAL/NORMAL DELI 3.23e-05
## 3 CLINIC REFERRAL/PREMATURE : TRANSFER FROM HOSP/EXTRAM 4.32e-22
## 4 CLINIC REFERRAL/PREMATURE : TRANSFER FROM OTHER HEALT 6.80e-08
## 5 CLINIC REFERRAL/PREMATURE : TRANSFER FROM SKILLED NUR 2.47e-05
## 6 EMERGENCY ROOM ADMIT : PHYS REFERRAL/NORMAL DELI 2.08e-189
## 7 EMERGENCY ROOM ADMIT : TRANSFER FROM HOSP/EXTRAM 6.62e-144
## 8 EMERGENCY ROOM ADMIT : TRANSFER FROM OTHER HEALT 2.20e-233
## 9 EMERGENCY ROOM ADMIT : TRANSFER FROM SKILLED NUR 3.75e-230
## 10 PHYS REFERRAL/NORMAL DELI : TRANSFER FROM HOSP/EXTRAM 1.77e-09
## 11 PHYS REFERRAL/NORMAL DELI : TRANSFER FROM OTHER HEALT 2.10e-17
## 12 PHYS REFERRAL/NORMAL DELI : TRANSFER FROM SKILLED NUR 1.77e-14
## 13 TRANSFER FROM HOSP/EXTRAM : TRANSFER FROM OTHER HEALT 5.47e-38
## 14 TRANSFER FROM HOSP/EXTRAM : TRANSFER FROM SKILLED NUR 6.93e-35
## 15 TRANSFER FROM OTHER HEALT : TRANSFER FROM SKILLED NUR 5.88e-02
## p.adj.Chisq
## 1 2.80e-213
## 2 3.46e-05
## 3 8.10e-22
## 4 8.50e-08
## 5 2.85e-05
## 6 7.80e-189
## 7 1.99e-143
## 8 3.30e-232
## 9 2.81e-229
## 10 2.41e-09
## 11 3.50e-17
## 12 2.66e-14
## 13 1.37e-37
## 14 1.48e-34
## 15 5.88e-02
## Discharge Location (Note: Not much info here...)
plotDat(test, "discharge_location", bs = T,mn = "Admission Location Proportions by Cohort", xl = "Cohort", yl = "Proportion")

Patient Level Data
#Subset
test <- tmp_two
test <- test[!duplicated(test$subject_id),]
nrow(test)
## [1] 1141
table(test$cohort)
##
## external validation
## 1141
#All age
boxplot(test$age ~ test$cohort, main = "Age Distribution by Cohort", xlab = "Set", ylab = "Age")

tmp_full <- test$age
summary(tmp_full)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 75.01 80.08 84.25 84.27 88.59 91.40
sd(tmp_full)
## [1] 5.164916
## GENDER
plotDat(test, "gender", bs = F, mn = "Gender by Cohort", xl = "Cohort", yl = "Proportion")

tmp <- table(test$gender, test$cohort)
tmp
##
## external validation
## F 580
## M 561
##Insurance
plotDat(test, "insurance", bs = F, mn = "Insurance by Cohort", xl = "Cohort", yl = "Proportion")

tmp <- table(test$insurance, test$cohort)
tmp
##
## external validation
## Medicaid 7
## Medicare 1084
## Private 50
chisq.test(tmp)
##
## Chi-squared test for given probabilities
##
## data: tmp
## X-squared = 1955.2, df = 2, p-value < 2.2e-16
pairwiseNominalIndependence(
as.matrix(tmp),
fisher = F, gtest = F, chisq = T, method = "fdr")
## Comparison p.Chisq p.adj.Chisq
## 1 Medicaid : Medicare 3.32e-233 9.96e-233
## 2 Medicaid : Private 1.23e-08 1.23e-08
## 3 Medicare : Private 4.83e-207 7.24e-207
##Ethnicity
##Clean ethnicity
test[(grepl("WHITE|PORTUGUESE", test$ethnicity)),]$ethnicity <- "WHITE"
test[(grepl("ASIAN", test$ethnicity)),]$ethnicity <- "ASIAN"
test[(grepl("BLACK", test$ethnicity)),]$ethnicity <- "BLACK"
test[(grepl("HISPANIC", test$ethnicity)),]$ethnicity <- "HISPANIC"
test[(grepl("MIDDLE|NATIVE|MULTI|DECLINED|UNABLE|OTHER|NOT",test$ethnicity)),]$ethnicity <- "OTHER"
plotDat(test, "ethnicity", bs = F, mn = "Ethnicity by Cohort", xl = "Cohort", yl = "Proportion")

tmp <- table(test$ethnicity, test$cohort)
tmp
##
## external validation
## ASIAN 34
## BLACK 76
## HISPANIC 18
## OTHER 73
## WHITE 940
chisq.test(tmp)
##
## Chi-squared test for given probabilities
##
## data: tmp
## X-squared = 2786.2, df = 4, p-value < 2.2e-16
pairwiseNominalIndependence(
as.matrix(tmp),
fisher = F, gtest = F, chisq = T, method = "fdr")
## Comparison p.Chisq p.adj.Chisq
## 1 ASIAN : BLACK 6.21e-05 8.87e-05
## 2 ASIAN : HISPANIC 2.65e-02 2.94e-02
## 3 ASIAN : OTHER 1.63e-04 2.04e-04
## 4 ASIAN : WHITE 2.74e-185 1.37e-184
## 5 BLACK : HISPANIC 2.20e-09 4.40e-09
## 6 BLACK : OTHER 8.06e-01 8.06e-01
## 7 BLACK : WHITE 8.35e-162 2.09e-161
## 8 HISPANIC : OTHER 8.14e-09 1.36e-08
## 9 HISPANIC : WHITE 5.51e-195 5.51e-194
## 10 OTHER : WHITE 2.16e-163 7.20e-163
## Marital Status
## Note: Need to clean marital status
plotDat(test, "marital_status", bs = F, mn = "Marital Status by Cohort", xl = "Cohort", yl = "Proportion")

tmp <- table(test$marital_status, test$cohort)
tmp
##
## external validation
## 52
## DIVORCED 49
## MARRIED 471
## SEPARATED 7
## SINGLE 164
## UNKNOWN (DEFAULT) 3
## WIDOWED 395
chisq.test(tmp)
##
## Chi-squared test for given probabilities
##
## data: tmp
## X-squared = 1373.9, df = 6, p-value < 2.2e-16
pairwiseNominalIndependence(
as.matrix(tmp),
fisher = F, gtest = F, chisq = T, method = "fdr")
## Comparison p.Chisq p.adj.Chisq
## 1 : DIVORCED 7.65e-01 7.65e-01
## 2 : MARRIED 5.57e-75 1.95e-74
## 3 : SEPARATED 4.67e-09 5.77e-09
## 4 : SINGLE 2.52e-14 3.78e-14
## 5 : UNKNOWN (DEFAULT) 3.92e-11 5.49e-11
## 6 : WIDOWED 3.45e-59 9.06e-59
## 7 DIVORCED : MARRIED 1.85e-76 7.77e-76
## 8 DIVORCED : SEPARATED 1.99e-08 2.32e-08
## 9 DIVORCED : SINGLE 3.28e-15 5.30e-15
## 10 DIVORCED : UNKNOWN (DEFAULT) 1.78e-10 2.34e-10
## 11 DIVORCED : WIDOWED 1.37e-60 4.11e-60
## 12 MARRIED : SEPARATED 5.87e-100 6.16e-99
## 13 MARRIED : SINGLE 3.83e-34 8.04e-34
## 14 MARRIED : UNKNOWN (DEFAULT) 1.70e-102 3.57e-101
## 15 MARRIED : WIDOWED 9.81e-03 1.08e-02
## 16 SEPARATED : SINGLE 3.30e-33 6.30e-33
## 17 SEPARATED : UNKNOWN (DEFAULT) 2.06e-01 2.16e-01
## 18 SEPARATED : WIDOWED 1.97e-83 1.03e-82
## 19 SINGLE : UNKNOWN (DEFAULT) 1.26e-35 2.94e-35
## 20 SINGLE : WIDOWED 1.51e-22 2.64e-22
## 21 UNKNOWN (DEFAULT) : WIDOWED 5.87e-86 4.11e-85
Text Level Statistics
test <- tmp_two
test <- test[!duplicated(test$text),]
nrow(test)
## [1] 10250
table(test$cohort)
##
## external validation
## 10250
#Words
test$words <- sapply(gregexpr("\\W+", test$text), length) + 1
#All words
boxplot(test$words ~ test$cohort, main = "Token Distribution by Cohort", xlab = "Set", ylab = "Tokens")

tmp_full <- test$words
summary(tmp_full)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5.0 678.0 862.0 926.8 1115.0 2939.0
sd(tmp_full)
## [1] 380.5242
#Unique Words (word dictionary)
test$cln_txt <- clean_text(test$text, F)
#Total Tokens
dictionary <- unlist(strsplit(test$cln_txt, ' '))
length(dictionary)
## [1] 7134040
#Unique Tokens
length(unique(dictionary))
## [1] 34822
#Most frequent tokens
plot_bow(dictionary, 20, "External Validation")

#Chars
test$chars <- nchar(test$text)
#Total Chars
sum(test$chars)
## [1] 72554625
tmp_full <- test$chars
summary(tmp_full)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 34 5378 6644 7078 8403 22240
sd(tmp_full)
## [1] 2715.056
boxplot(test$chars ~ test$cohort, main = "Word Count Distribution by Cohort", xlab = "Set", ylab = "Word Count")

boxplot(test$chars ~ test$cohort, main = "Character Count Distribution by Cohort", xlab = "Set", ylab = "Character Count")

ICU Level
test <- tmp_two
#Note-level ICU stats
test <- test[!duplicated(test$text),]
nrow(test)
## [1] 10250
table(test$cohort)
##
## external validation
## 10250
plotDat(test, "first_careunit", bs = F, mn = "First Care Unit by Cohort", xl = "Cohort", yl = "Proportion")

tmp <- table(test$first_careunit, test$cohort)
tmp
##
## external validation
## CCU 1563
## CSRU 359
## MICU 6721
## SICU 1004
## TSICU 603
plotDat(test, "last_careunit", bs = F, mn = "Last Care Unit by Cohort", xl = "Cohort", yl = "Proportion")

tmp <- table(test$last_careunit, test$cohort)
tmp
##
## external validation
## CCU 1625
## CSRU 259
## MICU 6710
## SICU 1036
## TSICU 620
#Admission Diagnosis
tmp <- table(test$diagnosis)
tmp_tab <- tmp[rev(order(tmp))]
head(tmp_tab)
##
## PNEUMONIA CONGESTIVE HEART FAILURE SEPSIS
## 1260 411 321
## LOWER GI BLEED ALTERED MENTAL STATUS GASTROINTESTINAL BLEED
## 246 231 227
sum(tmp_tab)
## [1] 10250
par(mai=c(1,2,1,1))
barplot(rev(head(tmp_tab, 5)), horiz = T, las = 1,
main = paste("Most Frequent Diagnoses in the External Validation Set", sep = ''),
xlab = "Frequency")
