#Contains map of files (BRAT format for neuroNER) to row_id (MIMIC NOTEEVENTS)
map <- read.csv("~/goals_of_care/summary_stats/row_id_to_file_num.txt", header = F, stringsAsFactors = F, sep = '\t')
head(map)
##       V1         V2
## 1 407635 text_00000
## 2 345097 text_00001
## 3 371117 text_00002
## 4 401545 text_00003
## 5 393792 text_00004
## 6 382263 text_00005
colnames(map) <- c("row_id", "file")


#Pull in training data from one file
train <- read.csv("~/goals_of_care/regex_v2/CAR_train_processed.csv", header = T, stringsAsFactors = F, quote = "", row.names = NULL)
head(train)
##   X         filename LIM COD CAR PAL FAM LIM.machine COD.machine
## 1 0 train_text_00000   0   0   0   0   0           0           0
## 2 1 train_text_00001   0   0   0   0   0           0           0
## 3 2 train_text_00002   0   0   1   0   0           0           0
## 4 3 train_text_00003   0   0   1   0   0           0           0
## 5 4 train_text_00004   0   0   0   0   0           0           0
## 6 5 train_text_00005   0   0   0   0   0           0           0
##   CAR.machine PAL.machine FAM.machine
## 1           0           0           0
## 2           1           0           0
## 3           0           0           0
## 4           1           0           0
## 5           0           0           0
## 6           0           0           0
colnames(train)[which(colnames(train) == "filename")] <- "file"

train_set <- unique(train$file)
train_set <- gsub("train_", '', train_set)
train_set <- map[(map$file %in% train_set),]

nrow(train_set)
## [1] 449
#Pull in validation data from one file
valid <- read.csv("~/goals_of_care/regex_v2/CAR_valid_processed.csv", header = T, stringsAsFactors = F, quote = "", row.names = NULL)
head(valid)
##   X         filename LIM COD CAR PAL FAM LIM.machine COD.machine
## 1 0 valid_text_00449   0   0   0   0   0           0           0
## 2 1 valid_text_00450   0   0   0   0   0           0           0
## 3 2 valid_text_00451   0   0   0   0   0           0           0
## 4 3 valid_text_00452   0   0   0   0   0           0           0
## 5 4 valid_text_00453   0   0   0   0   0           0           0
## 6 5 valid_text_00454   0   0   0   0   0           0           0
##   CAR.machine PAL.machine FAM.machine
## 1           0           0           0
## 2           0           0           0
## 3           0           0           0
## 4           0           0           0
## 5           0           0           0
## 6           0           0           0
colnames(valid)[which(colnames(valid) == "filename")] <- "file"

valid_set <- unique(valid$file)
valid_set <- gsub("valid_", '', valid_set)
valid_set <- map[(map$file %in% valid_set),]
## ###
## Probably check to make sure they're identical outside of CAR, but safe bet
## ###

nrow(valid_set)
## [1] 192
res <- read.csv("~/goals_of_care/summary_stats/Annotations.csv", header = T, stringsAsFactors = F)
colnames(res) <- tolower(colnames(res))
colnames(res)
##  [1] "x"                                       
##  [2] "row_id"                                  
##  [3] "subject_id"                              
##  [4] "hadm_id"                                 
##  [5] "category"                                
##  [6] "description"                             
##  [7] "text"                                    
##  [8] "cohort"                                  
##  [9] "patient.and.family.care.preferences"     
## [10] "patient.and.family.care.preferences.text"
## [11] "communication.with.family"               
## [12] "communication.with.family.text"          
## [13] "full.code.status"                        
## [14] "full.code.status.text"                   
## [15] "code.status.limitations"                 
## [16] "code.status.limitations.text"            
## [17] "palliative.care.team.involvement"        
## [18] "palliative.care.team.involvement.text"   
## [19] "ambiguous"                               
## [20] "ambiguous.text"                          
## [21] "ambiguous.comments"                      
## [22] "none"                                    
## [23] "operator"                                
## [24] "stamp"
nrow(res)
## [1] 876
dat <- read.csv("~/goals_of_care/regex/strict_regex_MIMIC_results06Nov17.csv", header = T, stringsAsFactors = F)
colnames(dat) <- tolower(colnames(dat))
colnames(dat)
##  [1] "subject_id"           "hadm_id"              "row_id"              
##  [4] "chartdate"            "charttime"            "storetime"           
##  [7] "category"             "description"          "cgid"                
## [10] "iserror"              "text"                 "admittime"           
## [13] "dischtime"            "deathtime"            "admission_type"      
## [16] "admission_location"   "discharge_location"   "insurance"           
## [19] "language"             "religion"             "marital_status"      
## [22] "ethnicity"            "edregtime"            "edouttime"           
## [25] "diagnosis"            "hospital_expire_flag" "has_chartevents_data"
## [28] "gender"               "dob"                  "dod"                 
## [31] "dod_hosp"             "dod_ssn"              "expire_flag"         
## [34] "days_until_death"     "chars"                "cohort"              
## [37] "time_since_admit"
nrow(dat)
## [1] 5339
dat <- merge(res, dat, by = "row_id")
nrow(dat)
## [1] 876
#Duplicated columns in R aren't dropped but given dimensions-- convert to simple name
colnames(dat)[which(colnames(dat) == "text.x")] <- "text"
length(unique(dat$text))
## [1] 641
colnames(dat)
##  [1] "row_id"                                  
##  [2] "x"                                       
##  [3] "subject_id.x"                            
##  [4] "hadm_id.x"                               
##  [5] "category.x"                              
##  [6] "description.x"                           
##  [7] "text"                                    
##  [8] "cohort.x"                                
##  [9] "patient.and.family.care.preferences"     
## [10] "patient.and.family.care.preferences.text"
## [11] "communication.with.family"               
## [12] "communication.with.family.text"          
## [13] "full.code.status"                        
## [14] "full.code.status.text"                   
## [15] "code.status.limitations"                 
## [16] "code.status.limitations.text"            
## [17] "palliative.care.team.involvement"        
## [18] "palliative.care.team.involvement.text"   
## [19] "ambiguous"                               
## [20] "ambiguous.text"                          
## [21] "ambiguous.comments"                      
## [22] "none"                                    
## [23] "operator"                                
## [24] "stamp"                                   
## [25] "subject_id.y"                            
## [26] "hadm_id.y"                               
## [27] "chartdate"                               
## [28] "charttime"                               
## [29] "storetime"                               
## [30] "category.y"                              
## [31] "description.y"                           
## [32] "cgid"                                    
## [33] "iserror"                                 
## [34] "text.y"                                  
## [35] "admittime"                               
## [36] "dischtime"                               
## [37] "deathtime"                               
## [38] "admission_type"                          
## [39] "admission_location"                      
## [40] "discharge_location"                      
## [41] "insurance"                               
## [42] "language"                                
## [43] "religion"                                
## [44] "marital_status"                          
## [45] "ethnicity"                               
## [46] "edregtime"                               
## [47] "edouttime"                               
## [48] "diagnosis"                               
## [49] "hospital_expire_flag"                    
## [50] "has_chartevents_data"                    
## [51] "gender"                                  
## [52] "dob"                                     
## [53] "dod"                                     
## [54] "dod_hosp"                                
## [55] "dod_ssn"                                 
## [56] "expire_flag"                             
## [57] "days_until_death"                        
## [58] "chars"                                   
## [59] "cohort.y"                                
## [60] "time_since_admit"
#Convert dob to numeric
dat$dob <- as.numeric(as.Date(dat$dob, "%Y-%m-%d %H:%M:%S"))
for (i in 1:nrow(dat)){
  if (dat$row_id[i] %in% train_set$row_id){
    dat$cohort[i] <- "train"
  } else if (dat$row_id[i] %in% valid_set$row_id){
    dat$cohort[i] <- "validation"
  } else {
    dat$cohort[i] <- ""
  }
}

table(dat$cohort)
## 
##      train validation 
##        626        250
#All notes accounted for

Utility Functions

ages <- function(dat){
    dat$age <- (dat$admittime - dat$dob)/365
    #Correct for patient ages > 90 by imputing median age (91.4)
    dat[(dat$age >= 90),]$age <- 91.4
    return(dat)
}

dat <- ages(dat)
plotDat <- function(dat, column, bs, mn, xl, yl){
  tmp <- as.matrix(table(dat[[column]], dat[["cohort"]]))
  prop <- prop.table(tmp, margin = 2)#2 for column-wise proportions
  par(mar = c(5.0, 4.0, 4.0, 15), xpd = TRUE)
  barplot(prop, col = cm.colors(length(rownames(prop))), beside = bs,width = 2, main = mn, xlab = xl, ylab = yl)
  legend("topright", inset = c(-0.90,0), fill = cm.colors(length(rownames(prop))), legend=rownames(prop))
}
#Check values when correcting for duplicates
test <- dat[!duplicated(dat$text),]
nrow(test)
## [1] 641
table(test$cohort)
## 
##      train validation 
##        449        192
#rcompanion for pairWiseNominalIndependence
library("rcompanion")

HADM Level Data

test <- dat[!duplicated(dat$hadm_id.x),]
nrow(test)
## [1] 403
table(test$cohort)
## 
##      train validation 
##        280        123
## Admission Type
plotDat(test, "admission_type", bs = F, mn = "Admission Type Proportions by Cohort", xl = "Cohort", yl = "Proportion")

tmp <- table(test$admission_type, test$cohort)
tmp
##            
##             train validation
##   ELECTIVE      4          1
##   EMERGENCY   272        121
##   URGENT        4          1
chisq.test(tmp)
## Warning in chisq.test(tmp): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  tmp
## X-squared = 0.53528, df = 2, p-value = 0.7652
pairwiseNominalIndependence(
  as.matrix(tmp), 
  fisher = F, gtest = F, chisq = T, method = "fdr")
## Warning in chisq.test(Dataz, ...): Chi-squared approximation may be
## incorrect
## Warning in chisq.test(Dataz, ...): Chi-squared approximation may be
## incorrect

## Warning in chisq.test(Dataz, ...): Chi-squared approximation may be
## incorrect
##             Comparison p.Chisq p.adj.Chisq
## 1 ELECTIVE : EMERGENCY   0.975           1
## 2    ELECTIVE : URGENT   1.000           1
## 3   EMERGENCY : URGENT   0.975           1
## Admission Location
plotDat(test, "admission_location", bs = F, mn = "Admission Location Proportions by Cohort", xl = "Cohort", yl = "Proportion")

tmp <- table(test$admission_location, test$cohort)
tmp
##                            
##                             train validation
##   CLINIC REFERRAL/PREMATURE     3          2
##   EMERGENCY ROOM ADMIT        213         91
##   PHYS REFERRAL/NORMAL DELI     6          1
##   TRANSFER FROM HOSP/EXTRAM    57         27
##   TRANSFER FROM SKILLED NUR     1          2
chisq.test(tmp)
## Warning in chisq.test(tmp): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  tmp
## X-squared = 3.0838, df = 4, p-value = 0.5439
pairwiseNominalIndependence(
  as.matrix(tmp), 
  fisher = F, gtest = F, chisq = T, method = "fdr")
## Warning in chisq.test(Dataz, ...): Chi-squared approximation may be
## incorrect

## Warning in chisq.test(Dataz, ...): Chi-squared approximation may be
## incorrect

## Warning in chisq.test(Dataz, ...): Chi-squared approximation may be
## incorrect

## Warning in chisq.test(Dataz, ...): Chi-squared approximation may be
## incorrect

## Warning in chisq.test(Dataz, ...): Chi-squared approximation may be
## incorrect

## Warning in chisq.test(Dataz, ...): Chi-squared approximation may be
## incorrect

## Warning in chisq.test(Dataz, ...): Chi-squared approximation may be
## incorrect

## Warning in chisq.test(Dataz, ...): Chi-squared approximation may be
## incorrect

## Warning in chisq.test(Dataz, ...): Chi-squared approximation may be
## incorrect
##                                               Comparison p.Chisq
## 1       CLINIC REFERRAL/PREMATURE : EMERGENCY ROOM ADMIT   1.000
## 2  CLINIC REFERRAL/PREMATURE : PHYS REFERRAL/NORMAL DELI   0.735
## 3  CLINIC REFERRAL/PREMATURE : TRANSFER FROM HOSP/EXTRAM   1.000
## 4  CLINIC REFERRAL/PREMATURE : TRANSFER FROM SKILLED NUR   1.000
## 5       EMERGENCY ROOM ADMIT : PHYS REFERRAL/NORMAL DELI   0.633
## 6       EMERGENCY ROOM ADMIT : TRANSFER FROM HOSP/EXTRAM   0.798
## 7       EMERGENCY ROOM ADMIT : TRANSFER FROM SKILLED NUR   0.455
## 8  PHYS REFERRAL/NORMAL DELI : TRANSFER FROM HOSP/EXTRAM   0.577
## 9  PHYS REFERRAL/NORMAL DELI : TRANSFER FROM SKILLED NUR   0.366
## 10 TRANSFER FROM HOSP/EXTRAM : TRANSFER FROM SKILLED NUR   0.533
##    p.adj.Chisq
## 1            1
## 2            1
## 3            1
## 4            1
## 5            1
## 6            1
## 7            1
## 8            1
## 9            1
## 10           1
## Discharge Location
plotDat(test, "discharge_location", bs = T,mn = "Admission Location Proportions by Cohort", xl = "Cohort", yl = "Proportion")

tmp <- table(test$discharge_location, test$cohort)
tmp
##               
##                train validation
##   DEAD/EXPIRED   280        123
chisq.test(tmp)
## 
##  Chi-squared test for given probabilities
## 
## data:  tmp
## X-squared = 61.164, df = 1, p-value = 5.252e-15

SUBJ Level Data

#Need to re-import dat
test <- dat[!duplicated(dat$subject_id.x),]

length(unique(test$subject_id.x))
## [1] 403
#All age
summary(test$age)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   22.87   61.73   73.87   71.01   83.00   91.40
## AGE
boxplot(test$age ~ test$cohort)

t.test(test$age ~ test$cohort)
## 
##  Welch Two Sample t-test
## 
## data:  test$age by test$cohort
## t = 0.97578, df = 213.74, p-value = 0.3303
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -1.661378  4.918835
## sample estimates:
##      mean in group train mean in group validation 
##                 71.51172                 69.88299
wilcox.test(test$age ~ test$cohort)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  test$age by test$cohort
## W = 18014, p-value = 0.4614
## alternative hypothesis: true location shift is not equal to 0
tmp_train <- test[(test$cohort == "train"),]$age
tmp_valid <- test[(test$cohort == "validation"),]$age
summary(tmp_train)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   22.87   61.79   74.38   71.51   82.49   91.40
## All gender
table(test$gender)
## 
##   F   M 
## 188 215
table(test$gender)/nrow(test)*100
## 
##        F        M 
## 46.65012 53.34988
## GENDER

plotDat(test, "gender", bs = F, mn = "Gender by Cohort", xl = "Cohort", yl = "Proportion")

tmp <- table(test$gender, test$cohort)
tmp
##    
##     train validation
##   F   136         52
##   M   144         71
prop.test(tmp)
## 
##  2-sample test for equality of proportions with continuity
##  correction
## 
## data:  tmp
## X-squared = 1.1195, df = 1, p-value = 0.29
## alternative hypothesis: two.sided
## 95 percent confidence interval:
##  -0.04101645  0.14829007
## sample estimates:
##    prop 1    prop 2 
## 0.7234043 0.6697674
##Insurance
plotDat(test, "insurance", bs = F, mn = "Insurance by Cohort", xl = "Cohort", yl = "Proportion")

tmp <- table(test$insurance, test$cohort)
tmp
##             
##              train validation
##   Government     3          2
##   Medicaid      21          9
##   Medicare     196         87
##   Private       56         25
##   Self Pay       4          0
chisq.test(tmp)
## Warning in chisq.test(tmp): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  tmp
## X-squared = 1.9838, df = 4, p-value = 0.7387
pairwiseNominalIndependence(
  as.matrix(tmp), 
  fisher = F, gtest = F, chisq = T, method = "fdr")
## Warning in chisq.test(Dataz, ...): Chi-squared approximation may be
## incorrect
## Warning in chisq.test(Dataz, ...): Chi-squared approximation may be
## incorrect

## Warning in chisq.test(Dataz, ...): Chi-squared approximation may be
## incorrect

## Warning in chisq.test(Dataz, ...): Chi-squared approximation may be
## incorrect

## Warning in chisq.test(Dataz, ...): Chi-squared approximation may be
## incorrect

## Warning in chisq.test(Dataz, ...): Chi-squared approximation may be
## incorrect

## Warning in chisq.test(Dataz, ...): Chi-squared approximation may be
## incorrect
##               Comparison p.Chisq p.adj.Chisq
## 1  Government : Medicaid   1.000           1
## 2  Government : Medicare   1.000           1
## 3   Government : Private   1.000           1
## 4  Government : Self Pay   0.530           1
## 5    Medicaid : Medicare   1.000           1
## 6     Medicaid : Private   1.000           1
## 7    Medicaid : Self Pay   0.500           1
## 8     Medicare : Private   1.000           1
## 9    Medicare : Self Pay   0.435           1
## 10    Private : Self Pay   0.447           1
##Ethnicity
##Clean ethnicity
test[(grepl("WHITE|PORTUGUESE", test$ethnicity)),]$ethnicity <- "WHITE" 
test[(grepl("ASIAN", test$ethnicity)),]$ethnicity <- "ASIAN" 
test[(grepl("BLACK", test$ethnicity)),]$ethnicity <- "BLACK" 
test[(grepl("HISPANIC", test$ethnicity)),]$ethnicity <- "HISPANIC"
test[(grepl("MIDDLE|NATIVE|MULTI|DECLINED|UNABLE|OTHER|NOT",test$ethnicity)),]$ethnicity <- "OTHER"

plotDat(test, "ethnicity", bs = F, mn = "Ethnicity by Cohort", xl = "Cohort", yl = "Proportion")

tmp <- table(test$ethnicity, test$cohort)
tmp
##           
##            train validation
##   ASIAN       10          3
##   BLACK       21         11
##   HISPANIC     5          7
##   OTHER       33         12
##   WHITE      211         90
chisq.test(tmp)
## Warning in chisq.test(tmp): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  tmp
## X-squared = 5.3111, df = 4, p-value = 0.2568
pairwiseNominalIndependence(
  as.matrix(tmp), 
  fisher = F, gtest = F, chisq = T, method = "fdr")
## Warning in chisq.test(Dataz, ...): Chi-squared approximation may be
## incorrect

## Warning in chisq.test(Dataz, ...): Chi-squared approximation may be
## incorrect

## Warning in chisq.test(Dataz, ...): Chi-squared approximation may be
## incorrect

## Warning in chisq.test(Dataz, ...): Chi-squared approximation may be
## incorrect

## Warning in chisq.test(Dataz, ...): Chi-squared approximation may be
## incorrect

## Warning in chisq.test(Dataz, ...): Chi-squared approximation may be
## incorrect

## Warning in chisq.test(Dataz, ...): Chi-squared approximation may be
## incorrect
##          Comparison p.Chisq p.adj.Chisq
## 1     ASIAN : BLACK  0.6990       0.920
## 2  ASIAN : HISPANIC  0.1650       0.550
## 3     ASIAN : OTHER  1.0000       1.000
## 4     ASIAN : WHITE  0.8280       0.920
## 5  BLACK : HISPANIC  0.2730       0.683
## 6     BLACK : OTHER  0.6340       0.920
## 7     BLACK : WHITE  0.7480       0.920
## 8  HISPANIC : OTHER  0.0849       0.424
## 9  HISPANIC : WHITE  0.0767       0.424
## 10    OTHER : WHITE  0.7880       0.920
## Marital Status
plotDat(test, "marital_status", bs = F, mn = "Marital Status by Cohort", xl = "Cohort", yl = "Proportion")

tmp <- table(test$marital_status, test$cohort)
tmp
##                    
##                     train validation
##                        25          5
##   DIVORCED             19          6
##   MARRIED             128         57
##   SEPARATED             6          1
##   SINGLE               54         32
##   UNKNOWN (DEFAULT)     0          1
##   WIDOWED              48         21
chisq.test(tmp)
## Warning in chisq.test(tmp): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  tmp
## X-squared = 8.185, df = 6, p-value = 0.2249
pairwiseNominalIndependence(
  as.matrix(tmp),
  fisher = F, gtest = F, chisq = T, method = "fdr")
## Warning in chisq.test(Dataz, ...): Chi-squared approximation may be
## incorrect

## Warning in chisq.test(Dataz, ...): Chi-squared approximation may be
## incorrect

## Warning in chisq.test(Dataz, ...): Chi-squared approximation may be
## incorrect

## Warning in chisq.test(Dataz, ...): Chi-squared approximation may be
## incorrect

## Warning in chisq.test(Dataz, ...): Chi-squared approximation may be
## incorrect

## Warning in chisq.test(Dataz, ...): Chi-squared approximation may be
## incorrect

## Warning in chisq.test(Dataz, ...): Chi-squared approximation may be
## incorrect

## Warning in chisq.test(Dataz, ...): Chi-squared approximation may be
## incorrect

## Warning in chisq.test(Dataz, ...): Chi-squared approximation may be
## incorrect

## Warning in chisq.test(Dataz, ...): Chi-squared approximation may be
## incorrect

## Warning in chisq.test(Dataz, ...): Chi-squared approximation may be
## incorrect
##                       Comparison p.Chisq p.adj.Chisq
## 1                     : DIVORCED  0.7350       0.908
## 2                      : MARRIED  0.1710       0.908
## 3                    : SEPARATED  1.0000       1.000
## 4                       : SINGLE  0.0641       0.908
## 5            : UNKNOWN (DEFAULT)  0.4300       0.908
## 6                      : WIDOWED  0.2370       0.908
## 7             DIVORCED : MARRIED  0.6420       0.908
## 8           DIVORCED : SEPARATED  0.9740       1.000
## 9              DIVORCED : SINGLE  0.3240       0.908
## 10  DIVORCED : UNKNOWN (DEFAULT)  0.5960       0.908
## 11            DIVORCED : WIDOWED  0.7250       0.908
## 12           MARRIED : SEPARATED  0.6060       0.908
## 13              MARRIED : SINGLE  0.3650       0.908
## 14   MARRIED : UNKNOWN (DEFAULT)  0.6840       0.908
## 15             MARRIED : WIDOWED  1.0000       1.000
## 16            SEPARATED : SINGLE  0.4190       0.908
## 17 SEPARATED : UNKNOWN (DEFAULT)  0.5370       0.908
## 18           SEPARATED : WIDOWED  0.6450       0.908
## 19    SINGLE : UNKNOWN (DEFAULT)  0.8020       0.936
## 20              SINGLE : WIDOWED  0.4760       0.908
## 21   UNKNOWN (DEFAULT) : WIDOWED  0.6870       0.908

Text Level Statistics

test <- dat[!duplicated(dat$text),]

#Words
test$words <- sapply(gregexpr("\\W+", unique(test$text)), length) + 1

#Chars
test$chars <- nchar(test$text)

##

boxplot(test$words ~ test$cohort)

t.test(test$words ~ test$cohort)
## 
##  Welch Two Sample t-test
## 
## data:  test$words by test$cohort
## t = 6.0222, df = 394.81, p-value = 3.94e-09
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  129.2318 254.5052
## sample estimates:
##      mean in group train mean in group validation 
##                1058.5768                 866.7083
wilcox.test(test$words ~ test$cohort)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  test$words by test$cohort
## W = 56340, p-value = 7.128e-10
## alternative hypothesis: true location shift is not equal to 0
## Word Stats
summary(test$words)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##      92     731     946    1001    1214    2569
tmp_train <- test[(test$cohort == "train"),]$words
tmp_valid <- test[(test$cohort == "validation"),]$words
summary(tmp_train)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     110     772    1021    1059    1284    2328
summary(tmp_valid)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    92.0   638.8   792.5   866.7  1059.0  2569.0
##Char Stats
boxplot(test$chars ~ test$cohort)

t.test(test$chars ~ test$cohort)
## 
##  Welch Two Sample t-test
## 
## data:  test$chars by test$cohort
## t = 5.7033, df = 388.99, p-value = 2.329e-08
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##   755.6381 1550.6901
## sample estimates:
##      mean in group train mean in group validation 
##                 6404.154                 5250.990
wilcox.test(test$chars ~ test$cohort)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  test$chars by test$cohort
## W = 55840, p-value = 3.027e-09
## alternative hypothesis: true location shift is not equal to 0
summary(test$chars)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     558    4369    5656    6059    7430   15840
tmp_train <- test[(test$cohort == "train"),]$chars
tmp_valid <- test[(test$cohort == "validation"),]$chars
summary(tmp_train)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     700    4593    6098    6404    7870   14080
summary(tmp_valid)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     558    3798    4830    5251    6374   15840

Regex Model

Load Rules

#Load phrases
rules <- read.csv("~/goals_of_care/regex/rule-based_NLP_domains.csv", header = T, stringsAsFactors = F)

#Rename columns
colnames(rules) <- c("CAR", "FAM", "LIM", "COD", "PAL")

#Show rules
print(rules)
##                               CAR                        FAM
## 1                   goals of care          family discussion
## 2                             goc         family discussions
## 3                  goals for care             family meeting
## 4              goals of treatment       family communication
## 5             goals for treatment  durable power of attorney
## 6                 treatment goals          health care proxy
## 7                care preferences                        hcp
## 8                  extending life   understanding of illness
## 9            comfort-focused care understanding of prognosis
## 10                supportive care                           
## 11                no feeding tube                           
## 12                    no dialysis                           
## 13                no hemodialysis                           
## 14                          no HD                           
## 15                quality of life                           
## 16                     priorities                           
## 17               end of life care                           
## 18                    living will                           
## 19                          molst                           
## 20             advance directives                           
## 21          advance care planning                           
## 22                            acp                           
## 23                        hospice                           
## 24            full code confirmed                           
## 25                  full code d/w                           
## 26            full code discussed                           
## 27             full code verified                           
## 28     would like to be full code                           
## 29         wishes to be full code                           
## 30 would like to remain full code                           
## 31     wishes to remain full code                           
## 32           wish to be full code                           
## 33            remaining full code                           
##                                         LIM       COD                 PAL
## 1                                       dnr full code            pallcare
## 2                                    dnrdni               palliative care
## 3                                       dni                     pall care
## 4                        do not resuscitate           palliative medicine
## 5                        do-not-resuscitate                              
## 6                           do not intubate                              
## 7                           do-not-intubate                              
## 8                             no intubation                              
## 9                        chest compressions                              
## 10                        no defibrillation                              
## 11                                   shocks                              
## 12                            shock therapy                              
## 13                                   no cpr                              
## 14               no endotracheal intubation                              
## 15                 no mechanical intubation                              
## 16                            no ventilator                              
## 17                     no breathing machine                              
## 18                        no breathing tube                              
## 19                    no chest compressions                              
## 20                                      cmo                              
## 21                         comfort measures                              
## 22                             comfort care                              
## 23 limitations of life-sustaining treatment                              
## 24                                     llst                              
## 25                                                                       
## 26                                                                       
## 27                                                                       
## 28                                                                       
## 29                                                                       
## 30                                                                       
## 31                                                                       
## 32                                                                       
## 33

Load Data & Format

res <- dat

#colnames(res)[which(colnames(res) == "patient.and.family.care.preferences")] <- "CAR"
#colnames(res)[which(colnames(res) == "communication.with.family")] <- "FAM"
#colnames(res)[which(colnames(res) == "code.status.limitations")] <- "LIM"
#colnames(res)[which(colnames(res) == "full.code.status")] <- "COD"
#colnames(res)[which(colnames(res) == "palliative.care.team.involvement")] <- "PAL"

Regex Utility Function

strictRegex() will accept all phrases kwds, and all note texts, texts, it will utilize grepl() to find direct matches in the text, and will return a list of booleans.

strictRegex <- function(kwds, texts){
  #Create a list to store results
  tmpList <- list()
  
  #Loop through all keywords
  for (i in 1:length(kwds)){
    #Store results as a logical vector in its respective list entry position
    tmpList[[i]] <- grepl(kwds[i], texts, ignore.case = TRUE)
  }
  
  #Return list and control to environment
  return(tmpList)
}

Apply Regex

## Preprocess domain dictionaries to remove ''
CAR <- strictRegex(rules$CAR[rules$CAR != ''], res$text)
FAM <- strictRegex(rules$FAM[rules$FAM != ''], res$text)
LIM <- strictRegex(rules$LIM[rules$LIM != ''], res$text)
COD <- strictRegex(rules$COD[rules$COD != ''], res$text)
PAL <- strictRegex(rules$PAL[rules$PAL != ''], res$text)


to_df <- function(domain, rule){
  #Convert list from grepl to data frame
  domain <- as.data.frame(domain)
  #Show column names as phrases
  colnames(domain) <- rule[rule != '']
  #Multiply by 1 for binary numeric
  domain <- domain*1
  return(domain)
}


CAR <- to_df(CAR, rules$CAR)
FAM <- to_df(FAM, rules$FAM)
LIM <- to_df(LIM, rules$LIM)
COD <- to_df(COD, rules$COD)
PAL <- to_df(PAL, rules$PAL)

phrase_to_domain <- function(dat){
  inc <- vector()
  for (i in 1:nrow(dat)){
    #Collapse phrases by domain, presence of any phrase indicates domain
    inc[i] <- any(dat[i,] == 1)
  }
  #Multiply by one to convert logical to binary numeric
  return(inc*1)
}

res <- cbind(phrase_to_domain(CAR),
             phrase_to_domain(FAM),
             phrase_to_domain(LIM),
             phrase_to_domain(COD),
             phrase_to_domain(PAL))

colnames(res) <- c("RE_CAR", "RE_FAM", "RE_LIM", "RE_COD", "RE_PAL")

res <- as.data.frame(res)

head(res)
##   RE_CAR RE_FAM RE_LIM RE_COD RE_PAL
## 1      0      0      0      1      0
## 2      0      0      0      1      0
## 3      0      0      0      1      0
## 4      0      0      0      1      0
## 5      0      0      0      1      0
## 6      0      1      0      1      0
nrow(res)
## [1] 876
#Merge res and data
res <- cbind(dat, res)

Preprocess

Remove duplicates, keep first observation of the note, as the first observation will have the most information, subsequent annotations will have additional phrases associated with domains.

res <- res[!duplicated(res$text),]
nrow(res)
## [1] 641

Model Stats

statGen <- function(dat, hum_lab, re_lab, set){
  
  dat <- dat[(dat$cohort == set),]
  #Vector to hold results
  rVec <- vector()
  #True Positives
  tp <- 0
  #True Negatives
  tn <- 0
  #False Positives
  fp <- 0
  #False Negatives
  fn <- 0
  
  #tmp to hold dat[[lab]][i]
  re_tmp <- as.character()
  hum_tmp <- as.character()
  for (i in 1:nrow(dat)){
    
    re_tmp <- dat[[re_lab]][i]
    hum_tmp <- dat[[hum_lab]][i]
    
    rVec[length(rVec)+1] <- paste("Token: ", dat$TEXT[i],"\n", re_lab, " Assignment: ", re_tmp, "\n", sep = '')
    
    #If both model and human aren't negative
    if (re_tmp != 0 & hum_tmp != 0 & !is.na(hum_tmp)){
      #True positive
      tp <- tp + 1
      
    #if human marks negative and model assigns label
    } else if (hum_tmp == 0 & re_tmp != 0 & !is.na(hum_tmp)){
      #False positive
      fp <- fp + 1
      
    #if human marks label and model doesn't assign label
    } else if (hum_tmp != 0 & re_tmp == 0 & !is.na(hum_tmp)){
      
      #False negative
      fn <- fn + 1
    
    #if human marks negative and model marks negative
    } else if (re_tmp == 0 & hum_tmp == 0 & !is.na(hum_tmp)){
      
      #True negative
      tn <- tn + 1
    } else if (is.null(re_tmp) | is.null(hum_tmp)){
      break
    }
  }
  
  #Hold results in txt to check if necessary
  
  write.csv(rVec, 
            file = paste(set,"_",re_lab,"_OUTPUT_NOTE_LEVEL_07Dec17.txt", sep = ''), 
            quote = F, row.names = F)
  
  tmpFrame <- cbind(tp, tn, fp, fn)
  colnames(tmpFrame) <- c("tp", "tn", "fp", "fn")
  return(as.data.frame(tmpFrame))
}
train_results <- rbind(statGen(res, "patient.and.family.care.preferences", "RE_CAR", "train"),
  statGen(res, "full.code.status", "RE_COD", "train"),
  statGen(res, "communication.with.family", "RE_FAM", "train"),
  statGen(res, "code.status.limitations","RE_LIM", "train"),
  statGen(res, "palliative.care.team.involvement", "RE_PAL", "train"))

row.names(train_results) <- c("RE_CAR", "RE_COD", "RE_FAM", "RE_LIM", "RE_PAL")
validation_results <- rbind(statGen(res, "patient.and.family.care.preferences", "RE_CAR", "validation"),
  statGen(res, "full.code.status", "RE_COD", "validation"),
  statGen(res, "communication.with.family", "RE_FAM", "validation"),
  statGen(res, "code.status.limitations","RE_LIM", "validation"),
  statGen(res, "palliative.care.team.involvement", "RE_PAL", "validation"))

row.names(validation_results) <- c("RE_CAR", "RE_COD", "RE_FAM", "RE_LIM", "RE_PAL")

Model Stats

\[accuracy = \frac{(tp+tn)}{(tp + tn + fp + fn)}\] \[precision = \frac{tp}{(tp+fp)}\] \[accuracy = \frac{tp}{(tp+fn)}\] \[specificity = \frac{tn}{(tn+fp)}\] \[F_1 = 2 \cdot \frac{(precision*recall)}{precision + recall}\]

modelStats <- function(dat){
  accuracy <- (dat$tp + dat$tn)/(dat$tp + dat$tn + dat$fp + dat$fn)
  precision <- dat$tp/(dat$tp + dat$fp)
  recall <- dat$tp/(dat$tp + dat$fn)
  specificity <- dat$tn/(dat$tn+dat$fp)
  F1 <- 2*(precision*recall)/(precision + recall)
  
  #dat$accuracy <- paste(sprintf("%2.2f", round(100*accuracy,3)), '%', sep = '')
  #dat$precision <- paste(sprintf("%2.2f", round(100*precision,3)), '%', sep = '')
  #dat$recall <- paste(sprintf("%2.2f", round(100*recall,3)), '%', sep = '')
  #dat$specificity <- paste(sprintf("%2.2f", round(100*specificity,3)), '%', sep = '')
  
  #dat$F1 <- round(100*2*(precision * recall)/(precision + recall),3)
  
  dat$accuracy <- round(accuracy*100, 2)
  dat$precision <- round(precision*100, 2)
  dat$recall <- round(recall*100, 2)
  dat$specificity <- round(specificity*100, 2)
  dat$F1 <- round(F1*100, 2)
  
  return(dat)
}

train_stats <- modelStats(train_results)
write.csv(train_stats, file = "regex_train_stats_07Dec17.csv", row.names = F)
print(train_stats)
##         tp  tn fp fn accuracy precision recall specificity    F1
## RE_CAR  37 292 53 67    73.27     41.11  35.58       84.64 38.14
## RE_COD 192 116 12 36    86.52     94.12  84.21       90.62 88.89
## RE_FAM  69 246 69 65    70.16     50.00  51.49       78.10 50.74
## RE_LIM 144 204 58 43    77.51     71.29  77.01       77.86 74.04
## RE_PAL  10 427 12  0    97.33     45.45 100.00       97.27 62.50
validation_stats <- modelStats(validation_results)
write.csv(validation_stats, file = "regex_validation_stats_07Dec17.csv", row.names = F)
print(validation_stats)
##        tp  tn fp fn accuracy precision recall specificity    F1
## RE_CAR  6 162  5 19    87.50     54.55  24.00       97.01 33.33
## RE_COD 80  46  3 13    88.73     96.39  86.02       93.88 90.91
## RE_FAM 12 156  5 19    87.50     70.59  38.71       96.89 50.00
## RE_LIM 49  96 26 21    75.52     65.33  70.00       78.69 67.59
## RE_PAL  0 190  1  1    98.96      0.00   0.00       99.48   NaN