Correct columnnames
for (name in colnames((test))){
#If the column name is duplicated
if (grepl(".y", name)){
#Remove duplicate
test[[name]] <- NULL
} else if (grepl(".x", name)){
#Rename original to remove the x dimension
colnames(test)[which(colnames(test) == name)] <- gsub(".x", '', name)
}
}
rm(name)
colnames(test)
## [1] "SUBJECT_ID"
## [2] "HADM_ID"
## [3] "ROW_ID"
## [4] "X"
## [5] "CATEGORY"
## [6] "DESCRIPTION"
## [7] "TEXT"
## [8] "COHORT"
## [9] "Full.Code.Status"
## [10] "Full.Code.Status.Tt"
## [11] "Code.Status.Limitations"
## [12] "Code.Status.Limitations.Tt"
## [13] "Palliative.Care.Team.Involvement"
## [14] "Palliative.Care.Team.Involvement.Tt"
## [15] "Ambiguous"
## [16] "Ambiguous.Tt"
## [17] "Ambiguous.Comments"
## [18] "None"
## [19] "STAMP"
## [20] "operator"
## [21] "original_filename"
## [22] "lim_bow_score"
## [23] "cod_bow_score"
## [24] "car_bow_score"
## [25] "pal_bow_score"
## [26] "fam_bow_score"
## [27] "CHARTDATE"
## [28] "CHARTTIME"
## [29] "STORETIME"
## [30] "CGID"
## [31] "ISERROR"
## [32] "ADMITTIME"
## [33] "DISCHTIME"
## [34] "DEATHTIME"
## [35] "ADMISSION_TYPE"
## [36] "ADMISSION_LOCATION"
## [37] "DISCHARGE_LOCATION"
## [38] "INSURANCE"
## [39] "LANGUAGE"
## [40] "RELIGION"
## [41] "MARITAL_STATUS"
## [42] "ETHNICITY"
## [43] "EDREGTIME"
## [44] "EDOUTTIME"
## [45] "DIAGNOSIS"
## [46] "HOSPITAL_EXPIRE_FLAG"
## [47] "HAS_CHARTEVENTS_DATA"
## [48] "GENDER"
## [49] "DOB"
## [50] "DOD"
## [51] "DOD_HOSP"
## [52] "DOD_SSN"
## [53] "EXPIRE_FLAG"
## [54] "ICUSTAY_ID"
## [55] "DBSOURCE"
## [56] "FIRST_CAREUNIT"
## [57] "LAST_CAREUNIT"
## [58] "FIRST_WARDID"
## [59] "LAST_WARDID"
## [60] "INTIME"
## [61] "OUTTIME"
## [62] "LOS"
## [63] "AGE"
## [64] "ADMISSION_NUMBER"
length(unique(test$SUBJECT_ID))
## [1] 402
length(unique(test$HADM_ID))
## [1] 402
length(unique(test$TEXT))
## [1] 640
Utility Functions
ages <- function(dat){
dat$AGE <- (dat$ADMITTIME - dat$DOB)/365
#Correct for patient ages > 90 by imputing median age (91.4)
dat[(dat$AGE >= 90),]$AGE <- 91.4
return(dat)
}
clean_text <- function(tokens, printout){
#Create a fake patient note phrase as a representative sample
ex_token <- "Example note:\nThe patient is a 81yo m who was found down in [** location **] on [** date **] by daughter, [** name **].\n Pt was in usual state of health until four days ago, when began to complain to family of heartburn for which the pt was taking tums in addition to his prescribed PPI, without resolution."
if (printout){
print(substr(ex_token, 1, 100))
}
#Remove carriage returns, convert to lower
tokens <- tolower(gsub('\n', ' ', tokens))
tokens <- tolower(gsub('\r', ' ', tokens))
ex_token <- tolower(gsub('\n', ' ', ex_token))
if (printout){
cat("Result after removing carriage returns:\n")
print(substr(ex_token, 1, 100))
}
#https://stackoverflow.com/questions/13529360/replace-text-within-parenthesis-in-r
#Remove obfuscations between '[' and ']'
tokens <- gsub(" *\\[.*?\\] *", ' ', tokens)
ex_token <- gsub(" *\\[.*?\\] *", ' ', ex_token)
if (printout){
cat("Result after leaving [obfuscation]:\n")
print(substr(ex_token, 1, 100))
}
#Keep only words & numeric
tokens <- gsub("[^[:alnum:][:space:]]", '', tokens)
ex_token <- gsub("[^[:alnum:][:space:]]", '', ex_token)
if (printout){
cat("Result after removing all but alphanumeric and spaces:\n")
print(substr(ex_token, 1, 100))
}
#Remove numeric
tokens <- gsub("\\d+", '', tokens)
ex_token <- gsub("\\d+", '', ex_token)
if (printout){
cat("Result after removing all but alphabetic characters and spaces:\n")
print(substr(ex_token, 1, 100))
}
#Keep only a single white space
#https://stackoverflow.com/questions/25707647/merge-multiple-spaces-to-single-space-remove-trailing-leading-spaces
tokens <- gsub("(?<=[\\s])\\s*|^\\s+|\\s+$", '', tokens, perl=TRUE)
ex_token <- gsub("(?<=[\\s])\\s*|^\\s+|\\s+$", '', ex_token, perl=TRUE)
if (printout){
cat("Result after keeping only single spaces:\n")
print(substr(ex_token, 1, 100))
}
return(tokens)
}
plot_bow <- function(bows, n, mn){
tmp_tab <- table(bows)[rev(order(table(bows)))]
par(mai=c(1,2,1,1))
barplot(rev(head(tmp_tab, n)), horiz = T, las = 1,
main = mn,
xlab = "Frequency")
}
plotDat <- function(dat, column, bs, mn, xl, yl){
tmp <- as.matrix(table(dat[[column]], dat[["cohort"]]))
prop <- prop.table(tmp, margin = 2)#2 for column-wise proportions
par(mar = c(5.0, 4.0, 4.0, 15), xpd = TRUE)
barplot(prop, col = cm.colors(length(rownames(prop))), beside = bs,width = 2, main = mn, xlab = xl, ylab = yl)
legend("topright", inset = c(-0.90,0), fill = cm.colors(length(rownames(prop))), legend=rownames(prop))
}
#rcompanion for pairWiseNominalIndependence
library("rcompanion")
Create Ages
test <- ages(test)
#Colnames to lowercase
colnames(test) <- tolower(colnames(test))
#Store data
tmp_two <- test
Hospital Admission Level Data
#Subset
test <- test[!duplicated(test$hadm_id),]
nrow(test)
## [1] 402
table(test$cohort)
##
## train validation
## 279 123
## Admission Type
plotDat(test, "admission_type", bs = F, mn = "Admission Type Proportions by Cohort", xl = "Cohort", yl = "Proportion")

tmp <- table(test$admission_type, test$cohort)
tmp
##
## train validation
## ELECTIVE 4 1
## EMERGENCY 271 121
## URGENT 4 1
chisq.test(tmp)
##
## Pearson's Chi-squared test
##
## data: tmp
## X-squared = 0.54231, df = 2, p-value = 0.7625
pairwiseNominalIndependence(
as.matrix(tmp),
fisher = F, gtest = F, chisq = T, method = "fdr")
## Comparison p.Chisq p.adj.Chisq
## 1 ELECTIVE : EMERGENCY 0.972 1
## 2 ELECTIVE : URGENT 1.000 1
## 3 EMERGENCY : URGENT 0.972 1
## Admission Location
plotDat(test, "admission_location", bs = F, mn = "Admission Location Proportions by Cohort", xl = "Cohort", yl = "Proportion")

tmp <- table(test$admission_location, test$cohort)
tmp
##
## train validation
## CLINIC REFERRAL/PREMATURE 3 2
## EMERGENCY ROOM ADMIT 212 91
## PHYS REFERRAL/NORMAL DELI 6 1
## TRANSFER FROM HOSP/EXTRAM 57 27
## TRANSFER FROM SKILLED NUR 1 2
chisq.test(tmp)
##
## Pearson's Chi-squared test
##
## data: tmp
## X-squared = 3.0631, df = 4, p-value = 0.5473
pairwiseNominalIndependence(
as.matrix(tmp),
fisher = F, gtest = F, chisq = T, method = "fdr")
## Comparison p.Chisq
## 1 CLINIC REFERRAL/PREMATURE : EMERGENCY ROOM ADMIT 1.000
## 2 CLINIC REFERRAL/PREMATURE : PHYS REFERRAL/NORMAL DELI 0.735
## 3 CLINIC REFERRAL/PREMATURE : TRANSFER FROM HOSP/EXTRAM 1.000
## 4 CLINIC REFERRAL/PREMATURE : TRANSFER FROM SKILLED NUR 1.000
## 5 EMERGENCY ROOM ADMIT : PHYS REFERRAL/NORMAL DELI 0.629
## 6 EMERGENCY ROOM ADMIT : TRANSFER FROM HOSP/EXTRAM 0.812
## 7 EMERGENCY ROOM ADMIT : TRANSFER FROM SKILLED NUR 0.458
## 8 PHYS REFERRAL/NORMAL DELI : TRANSFER FROM HOSP/EXTRAM 0.577
## 9 PHYS REFERRAL/NORMAL DELI : TRANSFER FROM SKILLED NUR 0.366
## 10 TRANSFER FROM HOSP/EXTRAM : TRANSFER FROM SKILLED NUR 0.533
## p.adj.Chisq
## 1 1
## 2 1
## 3 1
## 4 1
## 5 1
## 6 1
## 7 1
## 8 1
## 9 1
## 10 1
## Discharge Location (Note: Not much info here...)
plotDat(test, "discharge_location", bs = T,mn = "Admission Location Proportions by Cohort", xl = "Cohort", yl = "Proportion")

Patient Level Data
#Subset
test <- tmp_two
test <- test[!duplicated(test$subject_id),]
nrow(test)
## [1] 402
table(test$cohort)
##
## train validation
## 279 123
#All age
boxplot(test$age ~ test$cohort, main = "Age Distribution by Cohort", xlab = "Set", ylab = "Age")

tmp_train <- test[(test$cohort == "train"),]$age
tmp_valid <- test[(test$cohort == "validation"),]$age
tmp_full <- test$age
summary(tmp_train)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 22.87 61.73 74.58 71.51 82.63 91.40
sd(tmp_train)
## [1] 14.40688
summary(tmp_valid)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 23.36 61.06 72.45 69.88 83.30 91.40
sd(tmp_valid)
## [1] 15.86933
summary(tmp_full)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 22.87 61.68 73.92 71.01 83.02 91.40
sd(tmp_full)
## [1] 14.86864
## GENDER
plotDat(test, "gender", bs = F, mn = "Gender by Cohort", xl = "Cohort", yl = "Proportion")

tmp <- table(test$gender, test$cohort)
tmp
##
## train validation
## F 136 52
## M 143 71
##Insurance
plotDat(test, "insurance", bs = F, mn = "Insurance by Cohort", xl = "Cohort", yl = "Proportion")

tmp <- table(test$insurance, test$cohort)
tmp
##
## train validation
## Government 3 2
## Medicaid 21 9
## Medicare 195 87
## Private 56 25
## Self Pay 4 0
chisq.test(tmp)
##
## Pearson's Chi-squared test
##
## data: tmp
## X-squared = 1.988, df = 4, p-value = 0.738
pairwiseNominalIndependence(
as.matrix(tmp),
fisher = F, gtest = F, chisq = T, method = "fdr")
## Comparison p.Chisq p.adj.Chisq
## 1 Government : Medicaid 1.000 1
## 2 Government : Medicare 1.000 1
## 3 Government : Private 1.000 1
## 4 Government : Self Pay 0.530 1
## 5 Medicaid : Medicare 1.000 1
## 6 Medicaid : Private 1.000 1
## 7 Medicaid : Self Pay 0.500 1
## 8 Medicare : Private 1.000 1
## 9 Medicare : Self Pay 0.433 1
## 10 Private : Self Pay 0.447 1
##Ethnicity
##Clean ethnicity
test[(grepl("WHITE|PORTUGUESE", test$ethnicity)),]$ethnicity <- "WHITE"
test[(grepl("ASIAN", test$ethnicity)),]$ethnicity <- "ASIAN"
test[(grepl("BLACK", test$ethnicity)),]$ethnicity <- "BLACK"
test[(grepl("HISPANIC", test$ethnicity)),]$ethnicity <- "HISPANIC"
test[(grepl("MIDDLE|NATIVE|MULTI|DECLINED|UNABLE|OTHER|NOT",test$ethnicity)),]$ethnicity <- "OTHER"
plotDat(test, "ethnicity", bs = F, mn = "Ethnicity by Cohort", xl = "Cohort", yl = "Proportion")

tmp <- table(test$ethnicity, test$cohort)
tmp
##
## train validation
## ASIAN 10 3
## BLACK 21 11
## HISPANIC 5 7
## OTHER 33 12
## WHITE 210 90
chisq.test(tmp)
##
## Pearson's Chi-squared test
##
## data: tmp
## X-squared = 5.2863, df = 4, p-value = 0.2592
pairwiseNominalIndependence(
as.matrix(tmp),
fisher = F, gtest = F, chisq = T, method = "fdr")
## Comparison p.Chisq p.adj.Chisq
## 1 ASIAN : BLACK 0.6990 0.913
## 2 ASIAN : HISPANIC 0.1650 0.550
## 3 ASIAN : OTHER 1.0000 1.000
## 4 ASIAN : WHITE 0.8220 0.913
## 5 BLACK : HISPANIC 0.2730 0.683
## 6 BLACK : OTHER 0.6340 0.913
## 7 BLACK : WHITE 0.7570 0.913
## 8 HISPANIC : OTHER 0.0849 0.424
## 9 HISPANIC : WHITE 0.0782 0.424
## 10 OTHER : WHITE 0.7780 0.913
## Marital Status
## Note: Need to clean marital status
plotDat(test, "marital_status", bs = F, mn = "Marital Status by Cohort", xl = "Cohort", yl = "Proportion")

tmp <- table(test$marital_status, test$cohort)
tmp
##
## train validation
## 25 5
## DIVORCED 19 6
## MARRIED 128 57
## SEPARATED 6 1
## SINGLE 53 32
## UNKNOWN (DEFAULT) 0 1
## WIDOWED 48 21
chisq.test(tmp)
##
## Pearson's Chi-squared test
##
## data: tmp
## X-squared = 8.3935, df = 6, p-value = 0.2107
pairwiseNominalIndependence(
as.matrix(tmp),
fisher = F, gtest = F, chisq = T, method = "fdr")
## Comparison p.Chisq p.adj.Chisq
## 1 : DIVORCED 0.7350 0.908
## 2 : MARRIED 0.1710 0.908
## 3 : SEPARATED 1.0000 1.000
## 4 : SINGLE 0.0591 0.908
## 5 : UNKNOWN (DEFAULT) 0.4300 0.908
## 6 : WIDOWED 0.2370 0.908
## 7 DIVORCED : MARRIED 0.6420 0.908
## 8 DIVORCED : SEPARATED 0.9740 1.000
## 9 DIVORCED : SINGLE 0.3070 0.908
## 10 DIVORCED : UNKNOWN (DEFAULT) 0.5960 0.908
## 11 DIVORCED : WIDOWED 0.7250 0.908
## 12 MARRIED : SEPARATED 0.6060 0.908
## 13 MARRIED : SINGLE 0.3320 0.908
## 14 MARRIED : UNKNOWN (DEFAULT) 0.6840 0.908
## 15 MARRIED : WIDOWED 1.0000 1.000
## 16 SEPARATED : SINGLE 0.4070 0.908
## 17 SEPARATED : UNKNOWN (DEFAULT) 0.5370 0.908
## 18 SEPARATED : WIDOWED 0.6450 0.908
## 19 SINGLE : UNKNOWN (DEFAULT) 0.8100 0.945
## 20 SINGLE : WIDOWED 0.4430 0.908
## 21 UNKNOWN (DEFAULT) : WIDOWED 0.6870 0.908
Text Level Statistics
test <- tmp_two
test <- test[!duplicated(test$text),]
nrow(test)
## [1] 640
table(test$cohort)
##
## train validation
## 448 192
#Words
test$words <- sapply(gregexpr("\\W+", test$text), length) + 1
#All words
boxplot(test$words ~ test$cohort, main = "Token Distribution by Cohort", xlab = "Set", ylab = "Tokens")

tmp_train <- test[(test$cohort == "train"),]$words
tmp_valid <- test[(test$cohort == "validation"),]$words
tmp_full <- test$words
summary(tmp_train)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 110.0 771.2 1020.0 1057.0 1282.0 2328.0
sd(tmp_train)
## [1] 393.7596
summary(tmp_valid)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 92.0 638.8 792.5 866.7 1059.0 2569.0
sd(tmp_valid)
## [1] 358.3574
summary(tmp_full)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 92.0 731.0 945.5 1000.0 1212.0 2569.0
sd(tmp_full)
## [1] 393.0449
#Chars
test$chars <- nchar(test$text)
tmp_train <- test[(test$cohort == "train"),]$chars
tmp_valid <- test[(test$cohort == "validation"),]$chars
tmp_full <- test$chars
summary(tmp_train)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 700 4593 6094 6398 7850 14080
sd(tmp_train)
## [1] 2474.898
summary(tmp_valid)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 558 3798 4833 5252 6374 15840
sd(tmp_valid)
## [1] 2286.812
summary(tmp_full)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 558 4368 5658 6054 7422 15840
sd(tmp_full)
## [1] 2474.669
boxplot(test$chars ~ test$cohort, main = "Word Count Distribution by Cohort", xlab = "Set", ylab = "Word Count")

boxplot(test$chars ~ test$cohort, main = "Character Count Distribution by Cohort", xlab = "Set", ylab = "Character Count")

###
#Unique Words (word dictionary)
test$cln_txt <- clean_text(test$text, F)
#Total Tokens
dictionary <- unlist(strsplit(test$cln_txt, ' '))
#Number of tokens
length(dictionary)
## [1] 472853
#Unique Tokens
length(unique(dictionary))
## [1] 13691
#Most frequent words
plot_bow(dictionary, 20, "Top 20 Most Frequent Tokens")

#Word stats
tmp_train <- test[(test$cohort == "train"),]$cln_txt
tmp_valid <- test[(test$cohort == "validation"),]$cln_txt
tmp_full <- test$cln_txt
summary(tmp_train)
## Length Class Mode
## 448 character character
sd(tmp_train)
## [1] NA
summary(tmp_valid)
## Length Class Mode
## 192 character character
sd(tmp_valid)
## [1] NA
summary(tmp_full)
## Length Class Mode
## 640 character character
sd(tmp_full)
## [1] NA
ICU Level
test <- tmp_two
#Note-level ICU stats
test <- test[!duplicated(test$text),]
nrow(test)
## [1] 640
table(test$cohort)
##
## train validation
## 448 192
plotDat(test, "first_careunit", bs = F, mn = "First Care Unit by Cohort", xl = "Cohort", yl = "Proportion")

tmp <- table(test$first_careunit, test$cohort)
tmp
##
## train validation
## CCU 50 13
## CSRU 14 6
## MICU 313 127
## SICU 38 30
## TSICU 33 16
plotDat(test, "last_careunit", bs = F, mn = "Last Care Unit by Cohort", xl = "Cohort", yl = "Proportion")

tmp <- table(test$last_careunit, test$cohort)
tmp
##
## train validation
## CCU 50 15
## CSRU 7 3
## MICU 320 134
## SICU 40 29
## TSICU 31 11
#Diagnoses
train <- test[(test$cohort == "train"),]$diagnosis
plot_bow(train, 5, "Most Frequent Diagnoses in Train Set")

train <- table(train)
head(train[rev(order(train))])
## train
## PNEUMONIA SEPSIS HYPOTENSION
## 56 28 20
## INTRACRANIAL HEMORRHAGE UPPER GI BLEED ALTERED MENTAL STATUS
## 12 10 10
valid <- test[(test$cohort == "validation"),]$diagnosis
plot_bow(valid, 5, "Most Frequent Diagnoses in Validation Set")

valid <- table(valid)
head(valid[rev(order(valid))])
## valid
## PNEUMONIA
## 24
## SEPSIS
## 17
## URINARY TRACT INFECTION;PYELONEPHRITIS
## 6
## SHORTNESS OF BREATH
## 4
## LOWER GASTROINTESTINAL BLEED
## 4
## ANEMIA
## 4
tmp_full <- test$diagnosis
plot_bow(tmp_full, 5, "Most Frequent Diagnoses in Full Set")

tmp_full <- table(tmp_full)
head(tmp_full[rev(order(tmp_full))])
## tmp_full
## PNEUMONIA
## 80
## SEPSIS
## 45
## HYPOTENSION
## 22
## URINARY TRACT INFECTION;PYELONEPHRITIS
## 14
## INTRACRANIAL HEMORRHAGE
## 14
## UPPER GI BLEED
## 13
Domains
#Use dat for annotation data
colnames(dat)
## [1] "X"
## [2] "ROW_ID"
## [3] "SUBJECT_ID"
## [4] "HADM_ID"
## [5] "CATEGORY"
## [6] "DESCRIPTION"
## [7] "TEXT"
## [8] "COHORT"
## [9] "Patient.and.Family.Care.Preferences"
## [10] "Patient.and.Family.Care.Preferences.Text"
## [11] "Communication.with.Family"
## [12] "Communication.with.Family.Text"
## [13] "Full.Code.Status"
## [14] "Full.Code.Status.Text"
## [15] "Code.Status.Limitations"
## [16] "Code.Status.Limitations.Text"
## [17] "Palliative.Care.Team.Involvement"
## [18] "Palliative.Care.Team.Involvement.Text"
## [19] "Ambiguous"
## [20] "Ambiguous.Text"
## [21] "Ambiguous.Comments"
## [22] "None"
## [23] "STAMP"
## [24] "operator"
## [25] "original_filename"
## [26] "lim_bow_score"
## [27] "cod_bow_score"
## [28] "car_bow_score"
## [29] "pal_bow_score"
## [30] "fam_bow_score"
nrow(dat)
## [1] 641
#Remove odd note
test <- dat[(dat$ROW_ID != tmp_note$ROW_ID),]
nrow(test)
## [1] 640
#Convert column names to lower
colnames(test) <- tolower(colnames(test))
train <- test[(test$cohort == "train"),]
valid <- test[(test$cohort == "validation"),]
full <- test
#Documentation of Patient and Family Care Preferences
#or Code Status Limitations
fam_or_lim <- function(dat){
count <- 0
for (i in 1:nrow(dat)){
#If one and not the other
if (dat$patient.and.family.care.preferences[i] == 1 & dat$code.status.limitations[i] == 1){
#increment
count <- count + 1
#The other way round
} else if (dat$patient.and.family.care.preferences[i] == 0 & dat$code.status.limitations[i] == 1){
count <- count + 1
} else if (dat$patient.and.family.care.preferences[i] == 1 & dat$code.status.limitations[i] == 0){
count <- count + 1
} #Else nothing
}
return(count)
}
fam_or_lim(train)
## [1] 228
paste(round(fam_or_lim(train)/nrow(train)*100, 1), '%', sep = '')
## [1] "50.9%"
fam_or_lim(valid)
## [1] 78
paste(round(fam_or_lim(valid)/nrow(valid)*100, 1), '%', sep = '')
## [1] "40.6%"
fam_or_lim(full)
## [1] 306
paste(round(fam_or_lim(full)/nrow(full)*100, 1), '%', sep = '')
## [1] "47.8%"