Load Data

library("data.table")

dat <- fread("~/goals_of_care/external_validation/over_75_cohort_17Jan18.csv", header = T, stringsAsFactors = F)
nrow(dat)
## [1] 11575
#Remove duplicate notes
dat <- dat[!duplicated(dat$TEXT),]
nrow(dat)
## [1] 10250
mim <- fread("~/goals_of_care/external_validation/NOTES_STAYS_ADM_PAT.csv", header = T, stringsAsFactors = F)
## 
Read 0.0% of 385794 rows
Read 5.2% of 385794 rows
Read 10.4% of 385794 rows
Read 15.6% of 385794 rows
Read 20.7% of 385794 rows
Read 25.9% of 385794 rows
Read 31.1% of 385794 rows
Read 33.7% of 385794 rows
Read 38.9% of 385794 rows
Read 169621 rows and 44 (of 44) columns from 1.243 GB file in 00:00:17
nrow(mim)
## [1] 169621
test <- merge(dat, mim, by = c("SUBJECT_ID", "HADM_ID", "ROW_ID"))
colnames(test)
##  [1] "SUBJECT_ID"           "HADM_ID"              "ROW_ID"              
##  [4] "CHARTDATE.x"          "CHARTTIME.x"          "STORETIME.x"         
##  [7] "CATEGORY.x"           "DESCRIPTION.x"        "CGID.x"              
## [10] "ISERROR.x"            "TEXT.x"               "CHARTDATE.y"         
## [13] "CHARTTIME.y"          "STORETIME.y"          "CATEGORY.y"          
## [16] "DESCRIPTION.y"        "CGID.y"               "ISERROR.y"           
## [19] "TEXT.y"               "ADMITTIME"            "DISCHTIME"           
## [22] "DEATHTIME"            "ADMISSION_TYPE"       "ADMISSION_LOCATION"  
## [25] "DISCHARGE_LOCATION"   "INSURANCE"            "LANGUAGE"            
## [28] "RELIGION"             "MARITAL_STATUS"       "ETHNICITY"           
## [31] "EDREGTIME"            "EDOUTTIME"            "DIAGNOSIS"           
## [34] "HOSPITAL_EXPIRE_FLAG" "HAS_CHARTEVENTS_DATA" "GENDER"              
## [37] "DOB"                  "DOD"                  "DOD_HOSP"            
## [40] "DOD_SSN"              "EXPIRE_FLAG"          "ICUSTAY_ID"          
## [43] "DBSOURCE"             "FIRST_CAREUNIT"       "LAST_CAREUNIT"       
## [46] "FIRST_WARDID"         "LAST_WARDID"          "INTIME"              
## [49] "OUTTIME"              "LOS"                  "AGE"                 
## [52] "ADMISSION_NUMBER"
#Order data
test <- test[with(test, order(SUBJECT_ID, HADM_ID, ADMITTIME)),]

Correct columnnames

for (name in colnames((test))){
  #If the column name is duplicated
  if (grepl(".y", name)){
    #Remove duplicate
    test[[name]] <- NULL
  } else if (grepl(".x", name)){
    #Rename original to remove the x dimension
    colnames(test)[which(colnames(test) == name)] <- gsub(".x", '', name)
  }
}
rm(name)

colnames(test)
##  [1] "SUBJECT_ID"           "HADM_ID"              "ROW_ID"              
##  [4] "CHARTDATE"            "CHARTTIME"            "STORETIME"           
##  [7] "CATEGORY"             "DESCRIPTION"          "CGID"                
## [10] "ISERROR"              "TEXT"                 "ADMITTIME"           
## [13] "DISCHTIME"            "DEATHTIME"            "ADMISSION_TYPE"      
## [16] "ADMISSION_LOCATION"   "DISCHARGE_LOCATION"   "INSURANCE"           
## [19] "LANGUAGE"             "RELIGION"             "MARITAL_STATUS"      
## [22] "ETHNICITY"            "EDREGTIME"            "EDOUTTIME"           
## [25] "DIAGNOSIS"            "HOSPITAL_EXPIRE_FLAG" "HAS_CHARTEVENTS_DATA"
## [28] "GENDER"               "DOB"                  "DOD"                 
## [31] "DOD_HOSP"             "DOD_SSN"              "EXPIRE_FLAG"         
## [34] "ICUSTAY_ID"           "DBSOURCE"             "FIRST_CAREUNIT"      
## [37] "LAST_CAREUNIT"        "FIRST_WARDID"         "LAST_WARDID"         
## [40] "INTIME"               "OUTTIME"              "LOS"                 
## [43] "AGE"                  "ADMISSION_NUMBER"
length(unique(test$SUBJECT_ID))
## [1] 1141
length(unique(test$HADM_ID))
## [1] 1350
length(unique(test$TEXT))
## [1] 10250
#Add cohort data
test$COHORT <- rep("external validation", each = nrow(test))

Utility Functions

ages <- function(dat){
    dat$AGE <- (dat$ADMITTIME - dat$DOB)/365
    #Correct for patient ages > 90 by imputing median age (91.4)
    dat[(dat$AGE >= 90),]$AGE <- 91.4
    return(dat)
}
plotDat <- function(dat, column, bs, mn, xl, yl){
  tmp <- as.matrix(table(dat[[column]]))
  prop <- prop.table(tmp)#2 for column-wise proportions
  par(mar = c(5.0, 4.0, 4.0, 15), xpd = TRUE)
  barplot(prop, col = cm.colors(length(rownames(prop))), beside = bs,width = 2, main = mn, xlab = xl, ylab = yl)
  legend("topright", inset = c(-0.90,0), fill = cm.colors(length(rownames(prop))), legend=rownames(prop))
}
clean_text <- function(tokens, printout){
    #Create a fake patient note phrase as a representative sample
    ex_token <- "Example note:\nThe patient is a 81yo m who was found down in [** location **] on [** date **] by daughter, [** name **].\n Pt was in usual state of health until four days ago, when began to complain to family of heartburn for which the pt was taking tums in addition to his prescribed PPI, without resolution."
  if (printout){
    print(substr(ex_token, 1, 100))
  }
  
  #Remove carriage returns, convert to lower
  tokens <- tolower(gsub('\n', ' ', tokens))
  tokens <- tolower(gsub('\r', ' ', tokens))
  ex_token <- tolower(gsub('\n', ' ', ex_token))
  if (printout){
    cat("Result after removing carriage returns:\n")
    print(substr(ex_token, 1, 100))
  }
  
  #https://stackoverflow.com/questions/13529360/replace-text-within-parenthesis-in-r
  #Remove obfuscations between '[' and ']'
  tokens <- gsub(" *\\[.*?\\] *", ' ', tokens)
  ex_token <- gsub(" *\\[.*?\\] *", ' ', ex_token)
  if (printout){
    cat("Result after leaving [obfuscation]:\n")
    print(substr(ex_token, 1, 100))
  }
  
  #Keep only words & numeric
  tokens <- gsub("[^[:alnum:][:space:]]", '', tokens)
  ex_token <- gsub("[^[:alnum:][:space:]]", '', ex_token)
  if (printout){
    cat("Result after removing all but alphanumeric and spaces:\n")
    print(substr(ex_token, 1, 100))
  }
  
  #Remove numeric
  tokens <- gsub("\\d+", '', tokens)
  ex_token <- gsub("\\d+", '', ex_token)
  if (printout){
    cat("Result after removing all but alphabetic characters and spaces:\n")
    print(substr(ex_token, 1, 100))
  }
  
  #Keep only a single white space
  #https://stackoverflow.com/questions/25707647/merge-multiple-spaces-to-single-space-remove-trailing-leading-spaces
  tokens <- gsub("(?<=[\\s])\\s*|^\\s+|\\s+$", '', tokens, perl=TRUE)
  ex_token <- gsub("(?<=[\\s])\\s*|^\\s+|\\s+$", '', ex_token, perl=TRUE)
  if (printout){
    cat("Result after keeping only single spaces:\n")
    print(substr(ex_token, 1, 100))
  }

  return(tokens)
}
plot_bow <- function(bows, n, lab){
  tmp_tab <- table(bows)[rev(order(table(bows)))]
  par(mai=c(1,2,1,1))
  barplot(rev(head(tmp_tab, n)), horiz = T, las = 1,
          main = paste("Most Frequent Words in the ",lab ," Set", sep = ''),
          xlab = "Frequency")
}
#rcompanion for pairWiseNominalIndependence
library("rcompanion")

Create Ages

test <- ages(test)
#Colnames
colnames(test) <- tolower(colnames(test))
#Store data
tmp_two <- test

Hospital Admission Level Data

#Subset
test <- test[!duplicated(test$hadm_id),]
nrow(test)
## [1] 1350
table(test$cohort)
## 
## external validation 
##                1350
## Admission Type
plotDat(test, "admission_type", bs = F, mn = "Admission Type Proportions by Cohort", xl = "Cohort", yl = "Proportion")

tmp <- table(test$admission_type, test$cohort)
tmp
##            
##             external validation
##   ELECTIVE                   60
##   EMERGENCY                1283
##   URGENT                      7
chisq.test(tmp)
## 
##  Chi-squared test for given probabilities
## 
## data:  tmp
## X-squared = 2316.1, df = 2, p-value < 2.2e-16
pairwiseNominalIndependence(
  as.matrix(tmp), 
  fisher = F, gtest = F, chisq = T, method = "fdr")
##             Comparison   p.Chisq p.adj.Chisq
## 1 ELECTIVE : EMERGENCY 3.44e-244   5.16e-244
## 2    ELECTIVE : URGENT  9.48e-11    9.48e-11
## 3   EMERGENCY : URGENT 1.90e-276   5.70e-276
## Admission Location
plotDat(test, "admission_location", bs = F, mn = "Admission Location Proportions by Cohort", xl = "Cohort", yl = "Proportion")

tmp <- table(test$admission_location, test$cohort)
tmp
##                            
##                             external validation
##   CLINIC REFERRAL/PREMATURE                  32
##   EMERGENCY ROOM ADMIT                     1067
##   PHYS REFERRAL/NORMAL DELI                  75
##   TRANSFER FROM HOSP/EXTRAM                 169
##   TRANSFER FROM OTHER HEALT                   1
##   TRANSFER FROM SKILLED NUR                   6
chisq.test(tmp)
## 
##  Chi-squared test for given probabilities
## 
## data:  tmp
## X-squared = 3866.6, df = 5, p-value < 2.2e-16
pairwiseNominalIndependence(
  as.matrix(tmp), 
  fisher = F, gtest = F, chisq = T, method = "fdr")
##                                               Comparison   p.Chisq
## 1       CLINIC REFERRAL/PREMATURE : EMERGENCY ROOM ADMIT 5.59e-214
## 2  CLINIC REFERRAL/PREMATURE : PHYS REFERRAL/NORMAL DELI  3.23e-05
## 3  CLINIC REFERRAL/PREMATURE : TRANSFER FROM HOSP/EXTRAM  4.32e-22
## 4  CLINIC REFERRAL/PREMATURE : TRANSFER FROM OTHER HEALT  6.80e-08
## 5  CLINIC REFERRAL/PREMATURE : TRANSFER FROM SKILLED NUR  2.47e-05
## 6       EMERGENCY ROOM ADMIT : PHYS REFERRAL/NORMAL DELI 2.08e-189
## 7       EMERGENCY ROOM ADMIT : TRANSFER FROM HOSP/EXTRAM 6.62e-144
## 8       EMERGENCY ROOM ADMIT : TRANSFER FROM OTHER HEALT 2.20e-233
## 9       EMERGENCY ROOM ADMIT : TRANSFER FROM SKILLED NUR 3.75e-230
## 10 PHYS REFERRAL/NORMAL DELI : TRANSFER FROM HOSP/EXTRAM  1.77e-09
## 11 PHYS REFERRAL/NORMAL DELI : TRANSFER FROM OTHER HEALT  2.10e-17
## 12 PHYS REFERRAL/NORMAL DELI : TRANSFER FROM SKILLED NUR  1.77e-14
## 13 TRANSFER FROM HOSP/EXTRAM : TRANSFER FROM OTHER HEALT  5.47e-38
## 14 TRANSFER FROM HOSP/EXTRAM : TRANSFER FROM SKILLED NUR  6.93e-35
## 15 TRANSFER FROM OTHER HEALT : TRANSFER FROM SKILLED NUR  5.88e-02
##    p.adj.Chisq
## 1    2.80e-213
## 2     3.46e-05
## 3     8.10e-22
## 4     8.50e-08
## 5     2.85e-05
## 6    7.80e-189
## 7    1.99e-143
## 8    3.30e-232
## 9    2.81e-229
## 10    2.41e-09
## 11    3.50e-17
## 12    2.66e-14
## 13    1.37e-37
## 14    1.48e-34
## 15    5.88e-02
## Discharge Location (Note: Not much info here...)
plotDat(test, "discharge_location", bs = T,mn = "Admission Location Proportions by Cohort", xl = "Cohort", yl = "Proportion")

Patient Level Data

#Subset
test <- tmp_two
test <- test[!duplicated(test$subject_id),]
nrow(test)
## [1] 1141
table(test$cohort)
## 
## external validation 
##                1141
#All age
boxplot(test$age ~ test$cohort, main = "Age Distribution by Cohort", xlab = "Set", ylab = "Age")

tmp_full <- test$age
summary(tmp_full)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   75.01   80.08   84.25   84.27   88.59   91.40
sd(tmp_full)
## [1] 5.164916
## GENDER
plotDat(test, "gender", bs = F, mn = "Gender by Cohort", xl = "Cohort", yl = "Proportion")

tmp <- table(test$gender, test$cohort)
tmp
##    
##     external validation
##   F                 580
##   M                 561
##Insurance
plotDat(test, "insurance", bs = F, mn = "Insurance by Cohort", xl = "Cohort", yl = "Proportion")

tmp <- table(test$insurance, test$cohort)
tmp
##           
##            external validation
##   Medicaid                   7
##   Medicare                1084
##   Private                   50
chisq.test(tmp)
## 
##  Chi-squared test for given probabilities
## 
## data:  tmp
## X-squared = 1955.2, df = 2, p-value < 2.2e-16
pairwiseNominalIndependence(
  as.matrix(tmp), 
  fisher = F, gtest = F, chisq = T, method = "fdr")
##            Comparison   p.Chisq p.adj.Chisq
## 1 Medicaid : Medicare 3.32e-233   9.96e-233
## 2  Medicaid : Private  1.23e-08    1.23e-08
## 3  Medicare : Private 4.83e-207   7.24e-207
##Ethnicity
##Clean ethnicity
test[(grepl("WHITE|PORTUGUESE", test$ethnicity)),]$ethnicity <- "WHITE" 
test[(grepl("ASIAN", test$ethnicity)),]$ethnicity <- "ASIAN" 
test[(grepl("BLACK", test$ethnicity)),]$ethnicity <- "BLACK" 
test[(grepl("HISPANIC", test$ethnicity)),]$ethnicity <- "HISPANIC"
test[(grepl("MIDDLE|NATIVE|MULTI|DECLINED|UNABLE|OTHER|NOT",test$ethnicity)),]$ethnicity <- "OTHER"

plotDat(test, "ethnicity", bs = F, mn = "Ethnicity by Cohort", xl = "Cohort", yl = "Proportion")

tmp <- table(test$ethnicity, test$cohort)
tmp
##           
##            external validation
##   ASIAN                     34
##   BLACK                     76
##   HISPANIC                  18
##   OTHER                     73
##   WHITE                    940
chisq.test(tmp)
## 
##  Chi-squared test for given probabilities
## 
## data:  tmp
## X-squared = 2786.2, df = 4, p-value < 2.2e-16
pairwiseNominalIndependence(
  as.matrix(tmp), 
  fisher = F, gtest = F, chisq = T, method = "fdr")
##          Comparison   p.Chisq p.adj.Chisq
## 1     ASIAN : BLACK  6.21e-05    8.87e-05
## 2  ASIAN : HISPANIC  2.65e-02    2.94e-02
## 3     ASIAN : OTHER  1.63e-04    2.04e-04
## 4     ASIAN : WHITE 2.74e-185   1.37e-184
## 5  BLACK : HISPANIC  2.20e-09    4.40e-09
## 6     BLACK : OTHER  8.06e-01    8.06e-01
## 7     BLACK : WHITE 8.35e-162   2.09e-161
## 8  HISPANIC : OTHER  8.14e-09    1.36e-08
## 9  HISPANIC : WHITE 5.51e-195   5.51e-194
## 10    OTHER : WHITE 2.16e-163   7.20e-163
## Marital Status

## Note: Need to clean marital status
plotDat(test, "marital_status", bs = F, mn = "Marital Status by Cohort", xl = "Cohort", yl = "Proportion")

tmp <- table(test$marital_status, test$cohort)
tmp
##                    
##                     external validation
##                                      52
##   DIVORCED                           49
##   MARRIED                           471
##   SEPARATED                           7
##   SINGLE                            164
##   UNKNOWN (DEFAULT)                   3
##   WIDOWED                           395
chisq.test(tmp)
## 
##  Chi-squared test for given probabilities
## 
## data:  tmp
## X-squared = 1373.9, df = 6, p-value < 2.2e-16
pairwiseNominalIndependence(
  as.matrix(tmp),
  fisher = F, gtest = F, chisq = T, method = "fdr")
##                       Comparison   p.Chisq p.adj.Chisq
## 1                     : DIVORCED  7.65e-01    7.65e-01
## 2                      : MARRIED  5.57e-75    1.95e-74
## 3                    : SEPARATED  4.67e-09    5.77e-09
## 4                       : SINGLE  2.52e-14    3.78e-14
## 5            : UNKNOWN (DEFAULT)  3.92e-11    5.49e-11
## 6                      : WIDOWED  3.45e-59    9.06e-59
## 7             DIVORCED : MARRIED  1.85e-76    7.77e-76
## 8           DIVORCED : SEPARATED  1.99e-08    2.32e-08
## 9              DIVORCED : SINGLE  3.28e-15    5.30e-15
## 10  DIVORCED : UNKNOWN (DEFAULT)  1.78e-10    2.34e-10
## 11            DIVORCED : WIDOWED  1.37e-60    4.11e-60
## 12           MARRIED : SEPARATED 5.87e-100    6.16e-99
## 13              MARRIED : SINGLE  3.83e-34    8.04e-34
## 14   MARRIED : UNKNOWN (DEFAULT) 1.70e-102   3.57e-101
## 15             MARRIED : WIDOWED  9.81e-03    1.08e-02
## 16            SEPARATED : SINGLE  3.30e-33    6.30e-33
## 17 SEPARATED : UNKNOWN (DEFAULT)  2.06e-01    2.16e-01
## 18           SEPARATED : WIDOWED  1.97e-83    1.03e-82
## 19    SINGLE : UNKNOWN (DEFAULT)  1.26e-35    2.94e-35
## 20              SINGLE : WIDOWED  1.51e-22    2.64e-22
## 21   UNKNOWN (DEFAULT) : WIDOWED  5.87e-86    4.11e-85

Text Level Statistics

test <- tmp_two

test <- test[!duplicated(test$text),]
nrow(test)
## [1] 10250
table(test$cohort)
## 
## external validation 
##               10250
#Words
test$words <- sapply(gregexpr("\\W+", test$text), length) + 1
#All words
boxplot(test$words ~ test$cohort, main = "Token Distribution by Cohort", xlab = "Set", ylab = "Tokens")

tmp_full <- test$words
summary(tmp_full)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     5.0   678.0   862.0   926.8  1115.0  2939.0
sd(tmp_full)
## [1] 380.5242
#Unique Words (word dictionary)
test$cln_txt <- clean_text(test$text, F)

#Total Tokens
dictionary <- unlist(strsplit(test$cln_txt, ' '))
length(dictionary)
## [1] 7134040
#Unique Tokens
length(unique(dictionary))
## [1] 34822
#Most frequent tokens
plot_bow(dictionary, 20, "External Validation")

#Chars
test$chars <- nchar(test$text)

#Total Chars
sum(test$chars)
## [1] 72554625
tmp_full <- test$chars
summary(tmp_full)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##      34    5378    6644    7078    8403   22240
sd(tmp_full)
## [1] 2715.056
boxplot(test$chars ~ test$cohort, main = "Word Count Distribution by Cohort", xlab = "Set", ylab = "Word Count")

boxplot(test$chars ~ test$cohort, main = "Character Count Distribution by Cohort", xlab = "Set", ylab = "Character Count")

ICU Level

test <- tmp_two
#Note-level ICU stats
test <- test[!duplicated(test$text),]
nrow(test)
## [1] 10250
table(test$cohort)
## 
## external validation 
##               10250
plotDat(test, "first_careunit", bs = F, mn = "First Care Unit by Cohort", xl = "Cohort", yl = "Proportion")

tmp <- table(test$first_careunit, test$cohort)
tmp
##        
##         external validation
##   CCU                  1563
##   CSRU                  359
##   MICU                 6721
##   SICU                 1004
##   TSICU                 603
plotDat(test, "last_careunit", bs = F, mn = "Last Care Unit by Cohort", xl = "Cohort", yl = "Proportion")

tmp <- table(test$last_careunit, test$cohort)
tmp
##        
##         external validation
##   CCU                  1625
##   CSRU                  259
##   MICU                 6710
##   SICU                 1036
##   TSICU                 620
#Admission Diagnosis
tmp <- table(test$diagnosis)
tmp_tab <- tmp[rev(order(tmp))]
head(tmp_tab)
## 
##                PNEUMONIA CONGESTIVE HEART FAILURE                   SEPSIS 
##                     1260                      411                      321 
##           LOWER GI BLEED    ALTERED MENTAL STATUS   GASTROINTESTINAL BLEED 
##                      246                      231                      227
sum(tmp_tab)
## [1] 10250
par(mai=c(1,2,1,1))
barplot(rev(head(tmp_tab, 5)), horiz = T, las = 1,
        main = paste("Most Frequent Diagnoses in the External Validation Set", sep = ''),
        xlab = "Frequency")