Load Data

#Contains map of files (BRAT format for neuroNER) to row_id (MIMIC NOTEEVENTS)
map <- read.csv("~/goals_of_care/summary_stats/row_id_to_file_num.txt", header = F, stringsAsFactors = F, sep = '\t')
head(map)
##       V1         V2
## 1 407635 text_00000
## 2 345097 text_00001
## 3 371117 text_00002
## 4 401545 text_00003
## 5 393792 text_00004
## 6 382263 text_00005
colnames(map) <- c("row_id", "file")



train <- read.csv("~/goals_of_care/summary_stats/018_train.txt", header = F, stringsAsFactors = F, sep = ' ', quote = "", row.names = NULL)
head(train)
##            V1               V2 V3 V4 V5 V6
## 1       Chief train_text_00000  0  5  O  O
## 2   Complaint train_text_00000  6 15  O  O
## 3           : train_text_00000 15 16  O  O
## 4 respiratory train_text_00000 17 28  O  O
## 5     failure train_text_00000 29 36  O  O
## 6           I train_text_00000 38 39  O  O
colnames(train) <- tolower(c("TOKEN", "FILE", "START", "END", "HUM", "NET"))


train_set <- unique(train$file)
train_set <- gsub("train_", "", train_set)
length(train_set)
## [1] 380
train_set <- map[(map$file %in% train_set),]
nrow(train_set)
## [1] 380
valid <- read.csv("~/goals_of_care/summary_stats/018_valid.txt", header = F, stringsAsFactors = F, sep = ' ', quote = "", row.names = NULL)
head(valid)
##          V1               V2 V3 V4 V5 V6
## 1     Chief valid_text_00380  0  5  O  O
## 2 Complaint valid_text_00380  6 15  O  O
## 3         : valid_text_00380 15 16  O  O
## 4        GI valid_text_00380 17 19  O  O
## 5     bleed valid_text_00380 20 25  O  O
## 6         . valid_text_00380 25 26  O  O
colnames(valid) <- tolower(c("TOKEN", "FILE", "START", "END", "HUM", "NET"))

valid_set <- unique(valid$file)
valid_set <- gsub("valid_", "", valid_set)
length(valid_set)
## [1] 162
valid_set <- map[(map$file %in% valid_set),]
nrow(valid_set)
## [1] 162
res <- read.csv("~/goals_of_care/summary_stats/Annotations.csv", header = T, stringsAsFactors = F)
colnames(res) <- tolower(colnames(res))
colnames(res)
##  [1] "x"                                       
##  [2] "row_id"                                  
##  [3] "subject_id"                              
##  [4] "hadm_id"                                 
##  [5] "category"                                
##  [6] "description"                             
##  [7] "text"                                    
##  [8] "cohort"                                  
##  [9] "patient.and.family.care.preferences"     
## [10] "patient.and.family.care.preferences.text"
## [11] "communication.with.family"               
## [12] "communication.with.family.text"          
## [13] "full.code.status"                        
## [14] "full.code.status.text"                   
## [15] "code.status.limitations"                 
## [16] "code.status.limitations.text"            
## [17] "palliative.care.team.involvement"        
## [18] "palliative.care.team.involvement.text"   
## [19] "ambiguous"                               
## [20] "ambiguous.text"                          
## [21] "ambiguous.comments"                      
## [22] "none"                                    
## [23] "operator"                                
## [24] "stamp"
nrow(res)
## [1] 876
dat <- read.csv("~/goals_of_care/regex/strict_regex_MIMIC_results06Nov17.csv", header = T, stringsAsFactors = F)
colnames(dat) <- tolower(colnames(dat))
colnames(dat)
##  [1] "subject_id"           "hadm_id"              "row_id"              
##  [4] "chartdate"            "charttime"            "storetime"           
##  [7] "category"             "description"          "cgid"                
## [10] "iserror"              "text"                 "admittime"           
## [13] "dischtime"            "deathtime"            "admission_type"      
## [16] "admission_location"   "discharge_location"   "insurance"           
## [19] "language"             "religion"             "marital_status"      
## [22] "ethnicity"            "edregtime"            "edouttime"           
## [25] "diagnosis"            "hospital_expire_flag" "has_chartevents_data"
## [28] "gender"               "dob"                  "dod"                 
## [31] "dod_hosp"             "dod_ssn"              "expire_flag"         
## [34] "days_until_death"     "chars"                "cohort"              
## [37] "time_since_admit"
nrow(dat)
## [1] 5339
dat <- merge(res, dat, by = "row_id")
nrow(dat)
## [1] 876
#Duplicated columns in R aren't dropped but given dimensions-- convert to simple name
colnames(dat)[which(colnames(dat) == "text.x")] <- "text"
length(unique(dat$text))
## [1] 641
colnames(dat)
##  [1] "row_id"                                  
##  [2] "x"                                       
##  [3] "subject_id.x"                            
##  [4] "hadm_id.x"                               
##  [5] "category.x"                              
##  [6] "description.x"                           
##  [7] "text"                                    
##  [8] "cohort.x"                                
##  [9] "patient.and.family.care.preferences"     
## [10] "patient.and.family.care.preferences.text"
## [11] "communication.with.family"               
## [12] "communication.with.family.text"          
## [13] "full.code.status"                        
## [14] "full.code.status.text"                   
## [15] "code.status.limitations"                 
## [16] "code.status.limitations.text"            
## [17] "palliative.care.team.involvement"        
## [18] "palliative.care.team.involvement.text"   
## [19] "ambiguous"                               
## [20] "ambiguous.text"                          
## [21] "ambiguous.comments"                      
## [22] "none"                                    
## [23] "operator"                                
## [24] "stamp"                                   
## [25] "subject_id.y"                            
## [26] "hadm_id.y"                               
## [27] "chartdate"                               
## [28] "charttime"                               
## [29] "storetime"                               
## [30] "category.y"                              
## [31] "description.y"                           
## [32] "cgid"                                    
## [33] "iserror"                                 
## [34] "text.y"                                  
## [35] "admittime"                               
## [36] "dischtime"                               
## [37] "deathtime"                               
## [38] "admission_type"                          
## [39] "admission_location"                      
## [40] "discharge_location"                      
## [41] "insurance"                               
## [42] "language"                                
## [43] "religion"                                
## [44] "marital_status"                          
## [45] "ethnicity"                               
## [46] "edregtime"                               
## [47] "edouttime"                               
## [48] "diagnosis"                               
## [49] "hospital_expire_flag"                    
## [50] "has_chartevents_data"                    
## [51] "gender"                                  
## [52] "dob"                                     
## [53] "dod"                                     
## [54] "dod_hosp"                                
## [55] "dod_ssn"                                 
## [56] "expire_flag"                             
## [57] "days_until_death"                        
## [58] "chars"                                   
## [59] "cohort.y"                                
## [60] "time_since_admit"
#Convert dob to numeric
dat$dob <- as.numeric(as.Date(dat$dob, "%Y-%m-%d %H:%M:%S"))
#Assign values for train/valid set data
#dat$cohort <- (dat$row_id %in% train_set$row_id), "train", "")
#dat$cohort <- ifelse((dat$row_id %in% valid_set$row_id), "validation", "")

for (i in 1:nrow(dat)){
  if (dat$row_id[i] %in% train_set$row_id){
    dat$cohort[i] <- "train"
  } else if (dat$row_id[i] %in% valid_set$row_id){
    dat$cohort[i] <- "validation"
  } else {
    dat$cohort[i] <- ""
  }
}

table(dat$cohort)
## 
##                 train validation 
##        123        524        229
#123 unnaccounted for-- may not have been used in the training/validation.
#head(dat[dat$cohort == "",])
#Strange, will deal with it later-- for now remove them.
dat <- dat[dat$cohort != "",]
nrow(dat)
## [1] 753

Remove duplicated text (keep first observation)

#Check values when correcting for duplicates
test <- dat[!duplicated(dat$text),]
nrow(test)
## [1] 542
table(test$cohort)
## 
##      train validation 
##        380        162

Utility Functions

ages() will accept a data frame, and using INTIME and DOB will calculate AGE, patients whose ages were over 89 during any visit are masked in MIMIC, so their median age (91.4) will be imputed.

ages <- function(dat){
    dat$age <- (dat$admittime - dat$dob)/365
    #Correct for patient ages > 90 by imputing median age (91.4)
    dat[(dat$age >= 90),]$age <- 91.4
    return(dat)
}

dat <- ages(dat)

plotDat will accept a data frame, a column name (string), a boolean for beside (TRUE)/stacked(FALSE), a main title (string), an xlabel (string), and a ylabel (string), and is a generic function to make proportion plots easier to generate.

plotDat <- function(dat, column, bs, mn, xl, yl){
  tmp <- as.matrix(table(dat[[column]], dat[["cohort"]]))
  prop <- prop.table(tmp, margin = 2)#2 for column-wise proportions
  par(mar = c(5.0, 4.0, 4.0, 15), xpd = TRUE)
  barplot(prop, col = cm.colors(length(rownames(prop))), beside = bs,width = 2, main = mn, xlab = xl, ylab = yl)
  legend("topright", inset = c(-0.90,0), fill = cm.colors(length(rownames(prop))), legend=rownames(prop))
}

Librarie

#rcompanion for pairWiseNominalIndependence
library("rcompanion")

Stats

Hospital Admission Level Data

Remove duplicated TEXT (keeping first) and HADM_ID for hospital admission level data.

test <- dat[!duplicated(dat$hadm_id.x),]
nrow(test)
## [1] 350
table(test$cohort)
## 
##      train validation 
##        243        107
## Admission Type
plotDat(test, "admission_type", bs = F, mn = "Admission Type Proportions by Cohort", xl = "Cohort", yl = "Proportion")

tmp <- table(test$admission_type, test$cohort)
tmp
##            
##             train validation
##   ELECTIVE      3          2
##   EMERGENCY   236        104
##   URGENT        4          1
chisq.test(tmp)
## 
##  Pearson's Chi-squared test
## 
## data:  tmp
## X-squared = 0.47272, df = 2, p-value = 0.7895
pairwiseNominalIndependence(
  as.matrix(tmp), 
  fisher = F, gtest = F, chisq = T, method = "fdr")
##             Comparison p.Chisq p.adj.Chisq
## 1 ELECTIVE : EMERGENCY   1.000           1
## 2    ELECTIVE : URGENT   1.000           1
## 3   EMERGENCY : URGENT   0.983           1
## Admission Location
plotDat(test, "admission_location", bs = F, mn = "Admission Location Proportions by Cohort", xl = "Cohort", yl = "Proportion")

tmp <- table(test$admission_location, test$cohort)
tmp
##                            
##                             train validation
##   CLINIC REFERRAL/PREMATURE     3          0
##   EMERGENCY ROOM ADMIT        186         82
##   PHYS REFERRAL/NORMAL DELI     4          3
##   TRANSFER FROM HOSP/EXTRAM    49         21
##   TRANSFER FROM SKILLED NUR     1          1
chisq.test(tmp)
## 
##  Pearson's Chi-squared test
## 
## data:  tmp
## X-squared = 2.1853, df = 4, p-value = 0.7017
pairwiseNominalIndependence(
  as.matrix(tmp), 
  fisher = F, gtest = F, chisq = T, method = "fdr")
##                                               Comparison p.Chisq
## 1       CLINIC REFERRAL/PREMATURE : EMERGENCY ROOM ADMIT   0.606
## 2  CLINIC REFERRAL/PREMATURE : PHYS REFERRAL/NORMAL DELI   0.547
## 3  CLINIC REFERRAL/PREMATURE : TRANSFER FROM HOSP/EXTRAM   0.636
## 4  CLINIC REFERRAL/PREMATURE : TRANSFER FROM SKILLED NUR   0.819
## 5       EMERGENCY ROOM ADMIT : PHYS REFERRAL/NORMAL DELI   0.780
## 6       EMERGENCY ROOM ADMIT : TRANSFER FROM HOSP/EXTRAM   1.000
## 7       EMERGENCY ROOM ADMIT : TRANSFER FROM SKILLED NUR   1.000
## 8  PHYS REFERRAL/NORMAL DELI : TRANSFER FROM HOSP/EXTRAM   0.785
## 9  PHYS REFERRAL/NORMAL DELI : TRANSFER FROM SKILLED NUR   1.000
## 10 TRANSFER FROM HOSP/EXTRAM : TRANSFER FROM SKILLED NUR   1.000
##    p.adj.Chisq
## 1            1
## 2            1
## 3            1
## 4            1
## 5            1
## 6            1
## 7            1
## 8            1
## 9            1
## 10           1
## Discharge Location
plotDat(test, "discharge_location", bs = T,mn = "Admission Location Proportions by Cohort", xl = "Cohort", yl = "Proportion")

tmp <- table(test$discharge_location, test$cohort)
tmp
##               
##                train validation
##   DEAD/EXPIRED   243        107
chisq.test(tmp)
## 
##  Chi-squared test for given probabilities
## 
## data:  tmp
## X-squared = 52.846, df = 1, p-value = 3.608e-13

Patient Level Data

#Need to re-import dat
test <- dat[!duplicated(dat$subject_id.x),]

length(unique(test$subject_id.x))
## [1] 350
## AGE
boxplot(test$age ~ test$cohort)

t.test(test$age ~ test$cohort)
## 
##  Welch Two Sample t-test
## 
## data:  test$age by test$cohort
## t = 0.982, df = 183.98, p-value = 0.3274
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -1.774373  5.291097
## sample estimates:
##      mean in group train mean in group validation 
##                 71.68969                 69.93133
wilcox.test(test$age ~ test$cohort)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  test$age by test$cohort
## W = 13620, p-value = 0.4777
## alternative hypothesis: true location shift is not equal to 0
tmp_train <- test[(test$cohort == "train"),]$age
tmp_valid <- test[(test$cohort == "validation"),]$age
summary(tmp_train)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   22.87   62.39   74.63   71.69   82.26   91.40
summary(tmp_valid)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   23.36   56.54   72.68   69.93   83.49   91.40
## GENDER
plotDat(test, "gender", bs = F, mn = "Gender by Cohort", xl = "Cohort", yl = "Proportion")

tmp <- table(test$gender, test$cohort)
tmp
##    
##     train validation
##   F   116         50
##   M   127         57
prop.test(tmp)
## 
##  2-sample test for equality of proportions with continuity
##  correction
## 
## data:  tmp
## X-squared = 0.0033357, df = 1, p-value = 0.9539
## alternative hypothesis: two.sided
## 95 percent confidence interval:
##  -0.09376824  0.11092382
## sample estimates:
##    prop 1    prop 2 
## 0.6987952 0.6902174
##Insurance
plotDat(test, "insurance", bs = F, mn = "Insurance by Cohort", xl = "Cohort", yl = "Proportion")

tmp <- table(test$insurance, test$cohort)
tmp
##             
##              train validation
##   Government     3          1
##   Medicaid      18          9
##   Medicare     170         73
##   Private       50         22
##   Self Pay       2          2
chisq.test(tmp)
## 
##  Pearson's Chi-squared test
## 
## data:  tmp
## X-squared = 0.89909, df = 4, p-value = 0.9247
pairwiseNominalIndependence(
  as.matrix(tmp), 
  fisher = F, gtest = F, chisq = T, method = "fdr")
##               Comparison p.Chisq p.adj.Chisq
## 1  Government : Medicaid   1.000           1
## 2  Government : Medicare   1.000           1
## 3   Government : Private   1.000           1
## 4  Government : Self Pay   1.000           1
## 5    Medicaid : Medicare   0.895           1
## 6     Medicaid : Private   0.982           1
## 7    Medicaid : Self Pay   0.928           1
## 8     Medicare : Private   1.000           1
## 9    Medicare : Self Pay   0.754           1
## 10    Private : Self Pay   0.794           1
##Ethnicity
##Clean ethnicity
test[(grepl("WHITE|PORTUGUESE", test$ethnicity)),]$ethnicity <- "WHITE" 
test[(grepl("ASIAN", test$ethnicity)),]$ethnicity <- "ASIAN" 
test[(grepl("BLACK", test$ethnicity)),]$ethnicity <- "BLACK" 
test[(grepl("HISPANIC", test$ethnicity)),]$ethnicity <- "HISPANIC"
test[(grepl("MIDDLE|NATIVE|MULTI|DECLINED|UNABLE|OTHER|NOT",test$ethnicity)),]$ethnicity <- "OTHER"

plotDat(test, "ethnicity", bs = F, mn = "Ethnicity by Cohort", xl = "Cohort", yl = "Proportion")

tmp <- table(test$ethnicity, test$cohort)
tmp
##           
##            train validation
##   ASIAN       10          2
##   BLACK       19         11
##   HISPANIC     4          4
##   OTHER       26         14
##   WHITE      184         76
chisq.test(tmp)
## 
##  Pearson's Chi-squared test
## 
## data:  tmp
## X-squared = 3.6307, df = 4, p-value = 0.4583
pairwiseNominalIndependence(
  as.matrix(tmp), 
  fisher = F, gtest = F, chisq = T, method = "fdr")
##          Comparison p.Chisq p.adj.Chisq
## 1     ASIAN : BLACK   0.370       0.826
## 2  ASIAN : HISPANIC   0.273       0.826
## 3     ASIAN : OTHER   0.395       0.826
## 4     ASIAN : WHITE   0.539       0.826
## 5  BLACK : HISPANIC   0.781       0.868
## 6     BLACK : OTHER   1.000       1.000
## 7     BLACK : WHITE   0.528       0.826
## 8  HISPANIC : OTHER   0.689       0.861
## 9  HISPANIC : WHITE   0.383       0.826
## 10    OTHER : WHITE   0.578       0.826
## MARITAL_STATUS
##Clean marital status
#test[(grepl("", test$marital_status)),]$marital_status <- "UNKNOWN (DEFAULT)" 
#test[(grepl("UNKNOWN (DEFAULT)", test$marital_status)),]$marital_status <- "UNKNOWN"

plotDat(test, "marital_status", bs = F, mn = "Marital Status by Cohort", xl = "Cohort", yl = "Proportion")

tmp <- table(test$marital_status, test$cohort)
tmp
##                    
##                     train validation
##                        17         12
##   DIVORCED             15          8
##   MARRIED             115         46
##   SEPARATED             6          1
##   SINGLE               47         22
##   UNKNOWN (DEFAULT)     0          1
##   WIDOWED              43         17
chisq.test(tmp)
## 
##  Pearson's Chi-squared test
## 
## data:  tmp
## X-squared = 5.4349, df = 6, p-value = 0.4894
pairwiseNominalIndependence(
  as.matrix(tmp),
  fisher = F, gtest = F, chisq = T, method = "fdr")
##                       Comparison p.Chisq p.adj.Chisq
## 1                     : DIVORCED   0.843       0.984
## 2                      : MARRIED   0.246       0.984
## 3                    : SEPARATED   0.368       0.984
## 4                       : SINGLE   0.504       0.984
## 5            : UNKNOWN (DEFAULT)   0.891       0.985
## 6                      : WIDOWED   0.322       0.984
## 7             DIVORCED : MARRIED   0.714       0.984
## 8           DIVORCED : SEPARATED   0.572       0.984
## 9              DIVORCED : SINGLE   1.000       1.000
## 10  DIVORCED : UNKNOWN (DEFAULT)   0.792       0.984
## 11            DIVORCED : WIDOWED   0.760       0.984
## 12           MARRIED : SEPARATED   0.693       0.984
## 13              MARRIED : SINGLE   0.729       0.984
## 14   MARRIED : UNKNOWN (DEFAULT)   0.643       0.984
## 15             MARRIED : WIDOWED   1.000       1.000
## 16            SEPARATED : SINGLE   0.593       0.984
## 17 SEPARATED : UNKNOWN (DEFAULT)   0.537       0.984
## 18           SEPARATED : WIDOWED   0.732       0.984
## 19    SINGLE : UNKNOWN (DEFAULT)   0.713       0.984
## 20              SINGLE : WIDOWED   0.806       0.984
## 21   UNKNOWN (DEFAULT) : WIDOWED   0.651       0.984