Load Data

library("data.table")

dat <- fread("~/goals_of_care/cleaned_annotations/op_annot_bow_scores18Jan18.csv", header = T, stringsAsFactors = F)

mim <- fread("~/goals_of_care/external_validation/NOTES_STAYS_ADM_PAT.csv", header = T, stringsAsFactors = F)
## 
Read 0.0% of 385794 rows
Read 5.2% of 385794 rows
Read 10.4% of 385794 rows
Read 15.6% of 385794 rows
Read 20.7% of 385794 rows
Read 25.9% of 385794 rows
Read 31.1% of 385794 rows
Read 33.7% of 385794 rows
Read 38.9% of 385794 rows
Read 169621 rows and 44 (of 44) columns from 1.243 GB file in 00:00:19
test <- merge(dat, mim, by = c("SUBJECT_ID", "HADM_ID", "ROW_ID"))
colnames(test)
##  [1] "SUBJECT_ID"                              
##  [2] "HADM_ID"                                 
##  [3] "ROW_ID"                                  
##  [4] "X"                                       
##  [5] "CATEGORY.x"                              
##  [6] "DESCRIPTION.x"                           
##  [7] "TEXT.x"                                  
##  [8] "COHORT"                                  
##  [9] "Patient.and.Family.Care.Preferences"     
## [10] "Patient.and.Family.Care.Preferences.Text"
## [11] "Communication.with.Family"               
## [12] "Communication.with.Family.Text"          
## [13] "Full.Code.Status"                        
## [14] "Full.Code.Status.Text"                   
## [15] "Code.Status.Limitations"                 
## [16] "Code.Status.Limitations.Text"            
## [17] "Palliative.Care.Team.Involvement"        
## [18] "Palliative.Care.Team.Involvement.Text"   
## [19] "Ambiguous"                               
## [20] "Ambiguous.Text"                          
## [21] "Ambiguous.Comments"                      
## [22] "None"                                    
## [23] "STAMP"                                   
## [24] "operator"                                
## [25] "original_filename"                       
## [26] "lim_bow_score"                           
## [27] "cod_bow_score"                           
## [28] "car_bow_score"                           
## [29] "pal_bow_score"                           
## [30] "fam_bow_score"                           
## [31] "CHARTDATE"                               
## [32] "CHARTTIME"                               
## [33] "STORETIME"                               
## [34] "CATEGORY.y"                              
## [35] "DESCRIPTION.y"                           
## [36] "CGID"                                    
## [37] "ISERROR"                                 
## [38] "TEXT.y"                                  
## [39] "ADMITTIME"                               
## [40] "DISCHTIME"                               
## [41] "DEATHTIME"                               
## [42] "ADMISSION_TYPE"                          
## [43] "ADMISSION_LOCATION"                      
## [44] "DISCHARGE_LOCATION"                      
## [45] "INSURANCE"                               
## [46] "LANGUAGE"                                
## [47] "RELIGION"                                
## [48] "MARITAL_STATUS"                          
## [49] "ETHNICITY"                               
## [50] "EDREGTIME"                               
## [51] "EDOUTTIME"                               
## [52] "DIAGNOSIS"                               
## [53] "HOSPITAL_EXPIRE_FLAG"                    
## [54] "HAS_CHARTEVENTS_DATA"                    
## [55] "GENDER"                                  
## [56] "DOB"                                     
## [57] "DOD"                                     
## [58] "DOD_HOSP"                                
## [59] "DOD_SSN"                                 
## [60] "EXPIRE_FLAG"                             
## [61] "ICUSTAY_ID"                              
## [62] "DBSOURCE"                                
## [63] "FIRST_CAREUNIT"                          
## [64] "LAST_CAREUNIT"                           
## [65] "FIRST_WARDID"                            
## [66] "LAST_WARDID"                             
## [67] "INTIME"                                  
## [68] "OUTTIME"                                 
## [69] "LOS"                                     
## [70] "AGE"                                     
## [71] "ADMISSION_NUMBER"
#Order data
test <- test[with(test, order(SUBJECT_ID, HADM_ID, ADMITTIME)),]

#Note: Strange data point
tmp_note <- dat[which(dat$TEXT == setdiff(dat$TEXT, test$TEXT.x)), ]
#write.csv(tmp_note, file = "/Users/Edward/Desktop/unmatched_note_result.csv", row.names = F)

Correct columnnames

for (name in colnames((test))){
  #If the column name is duplicated
  if (grepl(".y", name)){
    #Remove duplicate
    test[[name]] <- NULL
  } else if (grepl(".x", name)){
    #Rename original to remove the x dimension
    colnames(test)[which(colnames(test) == name)] <- gsub(".x", '', name)
  }
}
rm(name)

colnames(test)
##  [1] "SUBJECT_ID"                         
##  [2] "HADM_ID"                            
##  [3] "ROW_ID"                             
##  [4] "X"                                  
##  [5] "CATEGORY"                           
##  [6] "DESCRIPTION"                        
##  [7] "TEXT"                               
##  [8] "COHORT"                             
##  [9] "Full.Code.Status"                   
## [10] "Full.Code.Status.Tt"                
## [11] "Code.Status.Limitations"            
## [12] "Code.Status.Limitations.Tt"         
## [13] "Palliative.Care.Team.Involvement"   
## [14] "Palliative.Care.Team.Involvement.Tt"
## [15] "Ambiguous"                          
## [16] "Ambiguous.Tt"                       
## [17] "Ambiguous.Comments"                 
## [18] "None"                               
## [19] "STAMP"                              
## [20] "operator"                           
## [21] "original_filename"                  
## [22] "lim_bow_score"                      
## [23] "cod_bow_score"                      
## [24] "car_bow_score"                      
## [25] "pal_bow_score"                      
## [26] "fam_bow_score"                      
## [27] "CHARTDATE"                          
## [28] "CHARTTIME"                          
## [29] "STORETIME"                          
## [30] "CGID"                               
## [31] "ISERROR"                            
## [32] "ADMITTIME"                          
## [33] "DISCHTIME"                          
## [34] "DEATHTIME"                          
## [35] "ADMISSION_TYPE"                     
## [36] "ADMISSION_LOCATION"                 
## [37] "DISCHARGE_LOCATION"                 
## [38] "INSURANCE"                          
## [39] "LANGUAGE"                           
## [40] "RELIGION"                           
## [41] "MARITAL_STATUS"                     
## [42] "ETHNICITY"                          
## [43] "EDREGTIME"                          
## [44] "EDOUTTIME"                          
## [45] "DIAGNOSIS"                          
## [46] "HOSPITAL_EXPIRE_FLAG"               
## [47] "HAS_CHARTEVENTS_DATA"               
## [48] "GENDER"                             
## [49] "DOB"                                
## [50] "DOD"                                
## [51] "DOD_HOSP"                           
## [52] "DOD_SSN"                            
## [53] "EXPIRE_FLAG"                        
## [54] "ICUSTAY_ID"                         
## [55] "DBSOURCE"                           
## [56] "FIRST_CAREUNIT"                     
## [57] "LAST_CAREUNIT"                      
## [58] "FIRST_WARDID"                       
## [59] "LAST_WARDID"                        
## [60] "INTIME"                             
## [61] "OUTTIME"                            
## [62] "LOS"                                
## [63] "AGE"                                
## [64] "ADMISSION_NUMBER"
length(unique(test$SUBJECT_ID))
## [1] 402
length(unique(test$HADM_ID))
## [1] 402
length(unique(test$TEXT))
## [1] 640

Utility Functions

ages <- function(dat){
    dat$AGE <- (dat$ADMITTIME - dat$DOB)/365
    #Correct for patient ages > 90 by imputing median age (91.4)
    dat[(dat$AGE >= 90),]$AGE <- 91.4
    return(dat)
}
clean_text <- function(tokens, printout){
    #Create a fake patient note phrase as a representative sample
    ex_token <- "Example note:\nThe patient is a 81yo m who was found down in [** location **] on [** date **] by daughter, [** name **].\n Pt was in usual state of health until four days ago, when began to complain to family of heartburn for which the pt was taking tums in addition to his prescribed PPI, without resolution."
  if (printout){
    print(substr(ex_token, 1, 100))
  }
  
  #Remove carriage returns, convert to lower
  tokens <- tolower(gsub('\n', ' ', tokens))
  tokens <- tolower(gsub('\r', ' ', tokens))
  ex_token <- tolower(gsub('\n', ' ', ex_token))
  if (printout){
    cat("Result after removing carriage returns:\n")
    print(substr(ex_token, 1, 100))
  }
  
  #https://stackoverflow.com/questions/13529360/replace-text-within-parenthesis-in-r
  #Remove obfuscations between '[' and ']'
  tokens <- gsub(" *\\[.*?\\] *", ' ', tokens)
  ex_token <- gsub(" *\\[.*?\\] *", ' ', ex_token)
  if (printout){
    cat("Result after leaving [obfuscation]:\n")
    print(substr(ex_token, 1, 100))
  }
  
  #Keep only words & numeric
  tokens <- gsub("[^[:alnum:][:space:]]", '', tokens)
  ex_token <- gsub("[^[:alnum:][:space:]]", '', ex_token)
  if (printout){
    cat("Result after removing all but alphanumeric and spaces:\n")
    print(substr(ex_token, 1, 100))
  }
  
  #Remove numeric
  tokens <- gsub("\\d+", '', tokens)
  ex_token <- gsub("\\d+", '', ex_token)
  if (printout){
    cat("Result after removing all but alphabetic characters and spaces:\n")
    print(substr(ex_token, 1, 100))
  }
  
  #Keep only a single white space
  #https://stackoverflow.com/questions/25707647/merge-multiple-spaces-to-single-space-remove-trailing-leading-spaces
  tokens <- gsub("(?<=[\\s])\\s*|^\\s+|\\s+$", '', tokens, perl=TRUE)
  ex_token <- gsub("(?<=[\\s])\\s*|^\\s+|\\s+$", '', ex_token, perl=TRUE)
  if (printout){
    cat("Result after keeping only single spaces:\n")
    print(substr(ex_token, 1, 100))
  }

  return(tokens)
}
plot_bow <- function(bows, n, mn){
  tmp_tab <- table(bows)[rev(order(table(bows)))]
  par(mai=c(1,2,1,1))
  barplot(rev(head(tmp_tab, n)), horiz = T, las = 1,
          main = mn,
          xlab = "Frequency")
}
plotDat <- function(dat, column, bs, mn, xl, yl){
  tmp <- as.matrix(table(dat[[column]], dat[["cohort"]]))
  prop <- prop.table(tmp, margin = 2)#2 for column-wise proportions
  par(mar = c(5.0, 4.0, 4.0, 15), xpd = TRUE)
  barplot(prop, col = cm.colors(length(rownames(prop))), beside = bs,width = 2, main = mn, xlab = xl, ylab = yl)
  legend("topright", inset = c(-0.90,0), fill = cm.colors(length(rownames(prop))), legend=rownames(prop))
}
#rcompanion for pairWiseNominalIndependence
library("rcompanion")

Create Ages

test <- ages(test)

#Colnames to lowercase
colnames(test) <- tolower(colnames(test))
#Store data
tmp_two <- test

Hospital Admission Level Data

#Subset
test <- test[!duplicated(test$hadm_id),]
nrow(test)
## [1] 402
table(test$cohort)
## 
##      train validation 
##        279        123
## Admission Type
plotDat(test, "admission_type", bs = F, mn = "Admission Type Proportions by Cohort", xl = "Cohort", yl = "Proportion")

tmp <- table(test$admission_type, test$cohort)
tmp
##            
##             train validation
##   ELECTIVE      4          1
##   EMERGENCY   271        121
##   URGENT        4          1
chisq.test(tmp)
## 
##  Pearson's Chi-squared test
## 
## data:  tmp
## X-squared = 0.54231, df = 2, p-value = 0.7625
pairwiseNominalIndependence(
  as.matrix(tmp), 
  fisher = F, gtest = F, chisq = T, method = "fdr")
##             Comparison p.Chisq p.adj.Chisq
## 1 ELECTIVE : EMERGENCY   0.972           1
## 2    ELECTIVE : URGENT   1.000           1
## 3   EMERGENCY : URGENT   0.972           1
## Admission Location
plotDat(test, "admission_location", bs = F, mn = "Admission Location Proportions by Cohort", xl = "Cohort", yl = "Proportion")

tmp <- table(test$admission_location, test$cohort)
tmp
##                            
##                             train validation
##   CLINIC REFERRAL/PREMATURE     3          2
##   EMERGENCY ROOM ADMIT        212         91
##   PHYS REFERRAL/NORMAL DELI     6          1
##   TRANSFER FROM HOSP/EXTRAM    57         27
##   TRANSFER FROM SKILLED NUR     1          2
chisq.test(tmp)
## 
##  Pearson's Chi-squared test
## 
## data:  tmp
## X-squared = 3.0631, df = 4, p-value = 0.5473
pairwiseNominalIndependence(
  as.matrix(tmp), 
  fisher = F, gtest = F, chisq = T, method = "fdr")
##                                               Comparison p.Chisq
## 1       CLINIC REFERRAL/PREMATURE : EMERGENCY ROOM ADMIT   1.000
## 2  CLINIC REFERRAL/PREMATURE : PHYS REFERRAL/NORMAL DELI   0.735
## 3  CLINIC REFERRAL/PREMATURE : TRANSFER FROM HOSP/EXTRAM   1.000
## 4  CLINIC REFERRAL/PREMATURE : TRANSFER FROM SKILLED NUR   1.000
## 5       EMERGENCY ROOM ADMIT : PHYS REFERRAL/NORMAL DELI   0.629
## 6       EMERGENCY ROOM ADMIT : TRANSFER FROM HOSP/EXTRAM   0.812
## 7       EMERGENCY ROOM ADMIT : TRANSFER FROM SKILLED NUR   0.458
## 8  PHYS REFERRAL/NORMAL DELI : TRANSFER FROM HOSP/EXTRAM   0.577
## 9  PHYS REFERRAL/NORMAL DELI : TRANSFER FROM SKILLED NUR   0.366
## 10 TRANSFER FROM HOSP/EXTRAM : TRANSFER FROM SKILLED NUR   0.533
##    p.adj.Chisq
## 1            1
## 2            1
## 3            1
## 4            1
## 5            1
## 6            1
## 7            1
## 8            1
## 9            1
## 10           1
## Discharge Location (Note: Not much info here...)
plotDat(test, "discharge_location", bs = T,mn = "Admission Location Proportions by Cohort", xl = "Cohort", yl = "Proportion")

Patient Level Data

#Subset
test <- tmp_two
test <- test[!duplicated(test$subject_id),]
nrow(test)
## [1] 402
table(test$cohort)
## 
##      train validation 
##        279        123
#All age
boxplot(test$age ~ test$cohort, main = "Age Distribution by Cohort", xlab = "Set", ylab = "Age")

tmp_train <- test[(test$cohort == "train"),]$age
tmp_valid <- test[(test$cohort == "validation"),]$age
tmp_full <- test$age
summary(tmp_train)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   22.87   61.73   74.58   71.51   82.63   91.40
sd(tmp_train)
## [1] 14.40688
summary(tmp_valid)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   23.36   61.06   72.45   69.88   83.30   91.40
sd(tmp_valid)
## [1] 15.86933
summary(tmp_full)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   22.87   61.68   73.92   71.01   83.02   91.40
sd(tmp_full)
## [1] 14.86864
## GENDER
plotDat(test, "gender", bs = F, mn = "Gender by Cohort", xl = "Cohort", yl = "Proportion")

tmp <- table(test$gender, test$cohort)
tmp
##    
##     train validation
##   F   136         52
##   M   143         71
##Insurance
plotDat(test, "insurance", bs = F, mn = "Insurance by Cohort", xl = "Cohort", yl = "Proportion")

tmp <- table(test$insurance, test$cohort)
tmp
##             
##              train validation
##   Government     3          2
##   Medicaid      21          9
##   Medicare     195         87
##   Private       56         25
##   Self Pay       4          0
chisq.test(tmp)
## 
##  Pearson's Chi-squared test
## 
## data:  tmp
## X-squared = 1.988, df = 4, p-value = 0.738
pairwiseNominalIndependence(
  as.matrix(tmp), 
  fisher = F, gtest = F, chisq = T, method = "fdr")
##               Comparison p.Chisq p.adj.Chisq
## 1  Government : Medicaid   1.000           1
## 2  Government : Medicare   1.000           1
## 3   Government : Private   1.000           1
## 4  Government : Self Pay   0.530           1
## 5    Medicaid : Medicare   1.000           1
## 6     Medicaid : Private   1.000           1
## 7    Medicaid : Self Pay   0.500           1
## 8     Medicare : Private   1.000           1
## 9    Medicare : Self Pay   0.433           1
## 10    Private : Self Pay   0.447           1
##Ethnicity
##Clean ethnicity
test[(grepl("WHITE|PORTUGUESE", test$ethnicity)),]$ethnicity <- "WHITE" 
test[(grepl("ASIAN", test$ethnicity)),]$ethnicity <- "ASIAN" 
test[(grepl("BLACK", test$ethnicity)),]$ethnicity <- "BLACK" 
test[(grepl("HISPANIC", test$ethnicity)),]$ethnicity <- "HISPANIC"
test[(grepl("MIDDLE|NATIVE|MULTI|DECLINED|UNABLE|OTHER|NOT",test$ethnicity)),]$ethnicity <- "OTHER"

plotDat(test, "ethnicity", bs = F, mn = "Ethnicity by Cohort", xl = "Cohort", yl = "Proportion")

tmp <- table(test$ethnicity, test$cohort)
tmp
##           
##            train validation
##   ASIAN       10          3
##   BLACK       21         11
##   HISPANIC     5          7
##   OTHER       33         12
##   WHITE      210         90
chisq.test(tmp)
## 
##  Pearson's Chi-squared test
## 
## data:  tmp
## X-squared = 5.2863, df = 4, p-value = 0.2592
pairwiseNominalIndependence(
  as.matrix(tmp), 
  fisher = F, gtest = F, chisq = T, method = "fdr")
##          Comparison p.Chisq p.adj.Chisq
## 1     ASIAN : BLACK  0.6990       0.913
## 2  ASIAN : HISPANIC  0.1650       0.550
## 3     ASIAN : OTHER  1.0000       1.000
## 4     ASIAN : WHITE  0.8220       0.913
## 5  BLACK : HISPANIC  0.2730       0.683
## 6     BLACK : OTHER  0.6340       0.913
## 7     BLACK : WHITE  0.7570       0.913
## 8  HISPANIC : OTHER  0.0849       0.424
## 9  HISPANIC : WHITE  0.0782       0.424
## 10    OTHER : WHITE  0.7780       0.913
## Marital Status

## Note: Need to clean marital status
plotDat(test, "marital_status", bs = F, mn = "Marital Status by Cohort", xl = "Cohort", yl = "Proportion")

tmp <- table(test$marital_status, test$cohort)
tmp
##                    
##                     train validation
##                        25          5
##   DIVORCED             19          6
##   MARRIED             128         57
##   SEPARATED             6          1
##   SINGLE               53         32
##   UNKNOWN (DEFAULT)     0          1
##   WIDOWED              48         21
chisq.test(tmp)
## 
##  Pearson's Chi-squared test
## 
## data:  tmp
## X-squared = 8.3935, df = 6, p-value = 0.2107
pairwiseNominalIndependence(
  as.matrix(tmp),
  fisher = F, gtest = F, chisq = T, method = "fdr")
##                       Comparison p.Chisq p.adj.Chisq
## 1                     : DIVORCED  0.7350       0.908
## 2                      : MARRIED  0.1710       0.908
## 3                    : SEPARATED  1.0000       1.000
## 4                       : SINGLE  0.0591       0.908
## 5            : UNKNOWN (DEFAULT)  0.4300       0.908
## 6                      : WIDOWED  0.2370       0.908
## 7             DIVORCED : MARRIED  0.6420       0.908
## 8           DIVORCED : SEPARATED  0.9740       1.000
## 9              DIVORCED : SINGLE  0.3070       0.908
## 10  DIVORCED : UNKNOWN (DEFAULT)  0.5960       0.908
## 11            DIVORCED : WIDOWED  0.7250       0.908
## 12           MARRIED : SEPARATED  0.6060       0.908
## 13              MARRIED : SINGLE  0.3320       0.908
## 14   MARRIED : UNKNOWN (DEFAULT)  0.6840       0.908
## 15             MARRIED : WIDOWED  1.0000       1.000
## 16            SEPARATED : SINGLE  0.4070       0.908
## 17 SEPARATED : UNKNOWN (DEFAULT)  0.5370       0.908
## 18           SEPARATED : WIDOWED  0.6450       0.908
## 19    SINGLE : UNKNOWN (DEFAULT)  0.8100       0.945
## 20              SINGLE : WIDOWED  0.4430       0.908
## 21   UNKNOWN (DEFAULT) : WIDOWED  0.6870       0.908

Text Level Statistics

test <- tmp_two

test <- test[!duplicated(test$text),]
nrow(test)
## [1] 640
table(test$cohort)
## 
##      train validation 
##        448        192
#Words
test$words <- sapply(gregexpr("\\W+", test$text), length) + 1
#All words
boxplot(test$words ~ test$cohort, main = "Token Distribution by Cohort", xlab = "Set", ylab = "Tokens")

tmp_train <- test[(test$cohort == "train"),]$words
tmp_valid <- test[(test$cohort == "validation"),]$words
tmp_full <- test$words
summary(tmp_train)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   110.0   771.2  1020.0  1057.0  1282.0  2328.0
sd(tmp_train)
## [1] 393.7596
summary(tmp_valid)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    92.0   638.8   792.5   866.7  1059.0  2569.0
sd(tmp_valid)
## [1] 358.3574
summary(tmp_full)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    92.0   731.0   945.5  1000.0  1212.0  2569.0
sd(tmp_full)
## [1] 393.0449
#Chars
test$chars <- nchar(test$text)
tmp_train <- test[(test$cohort == "train"),]$chars
tmp_valid <- test[(test$cohort == "validation"),]$chars
tmp_full <- test$chars
summary(tmp_train)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     700    4593    6094    6398    7850   14080
sd(tmp_train)
## [1] 2474.898
summary(tmp_valid)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     558    3798    4833    5252    6374   15840
sd(tmp_valid)
## [1] 2286.812
summary(tmp_full)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     558    4368    5658    6054    7422   15840
sd(tmp_full)
## [1] 2474.669
boxplot(test$chars ~ test$cohort, main = "Word Count Distribution by Cohort", xlab = "Set", ylab = "Word Count")

boxplot(test$chars ~ test$cohort, main = "Character Count Distribution by Cohort", xlab = "Set", ylab = "Character Count")

###
#Unique Words (word dictionary)
test$cln_txt <- clean_text(test$text, F)

#Total Tokens
dictionary <- unlist(strsplit(test$cln_txt, ' '))

#Number of tokens
length(dictionary)
## [1] 472853
#Unique Tokens
length(unique(dictionary))
## [1] 13691
#Most frequent words
plot_bow(dictionary, 20, "Top 20 Most Frequent Tokens")

#Word stats
tmp_train <- test[(test$cohort == "train"),]$cln_txt
tmp_valid <- test[(test$cohort == "validation"),]$cln_txt
tmp_full <- test$cln_txt
summary(tmp_train)
##    Length     Class      Mode 
##       448 character character
sd(tmp_train)
## [1] NA
summary(tmp_valid)
##    Length     Class      Mode 
##       192 character character
sd(tmp_valid)
## [1] NA
summary(tmp_full)
##    Length     Class      Mode 
##       640 character character
sd(tmp_full)
## [1] NA

ICU Level

test <- tmp_two
#Note-level ICU stats
test <- test[!duplicated(test$text),]
nrow(test)
## [1] 640
table(test$cohort)
## 
##      train validation 
##        448        192
plotDat(test, "first_careunit", bs = F, mn = "First Care Unit by Cohort", xl = "Cohort", yl = "Proportion")

tmp <- table(test$first_careunit, test$cohort)
tmp
##        
##         train validation
##   CCU      50         13
##   CSRU     14          6
##   MICU    313        127
##   SICU     38         30
##   TSICU    33         16
plotDat(test, "last_careunit", bs = F, mn = "Last Care Unit by Cohort", xl = "Cohort", yl = "Proportion")

tmp <- table(test$last_careunit, test$cohort)
tmp
##        
##         train validation
##   CCU      50         15
##   CSRU      7          3
##   MICU    320        134
##   SICU     40         29
##   TSICU    31         11
#Diagnoses
train <- test[(test$cohort == "train"),]$diagnosis
plot_bow(train, 5, "Most Frequent Diagnoses in Train Set")

train <- table(train)
head(train[rev(order(train))])
## train
##               PNEUMONIA                  SEPSIS             HYPOTENSION 
##                      56                      28                      20 
## INTRACRANIAL HEMORRHAGE          UPPER GI BLEED   ALTERED MENTAL STATUS 
##                      12                      10                      10
valid <- test[(test$cohort == "validation"),]$diagnosis
plot_bow(valid, 5, "Most Frequent Diagnoses in Validation Set")

valid <- table(valid)
head(valid[rev(order(valid))])
## valid
##                              PNEUMONIA 
##                                     24 
##                                 SEPSIS 
##                                     17 
## URINARY TRACT INFECTION;PYELONEPHRITIS 
##                                      6 
##                    SHORTNESS OF BREATH 
##                                      4 
##           LOWER GASTROINTESTINAL BLEED 
##                                      4 
##                                 ANEMIA 
##                                      4
tmp_full <- test$diagnosis
plot_bow(tmp_full, 5, "Most Frequent Diagnoses in Full Set")

tmp_full <- table(tmp_full)
head(tmp_full[rev(order(tmp_full))])
## tmp_full
##                              PNEUMONIA 
##                                     80 
##                                 SEPSIS 
##                                     45 
##                            HYPOTENSION 
##                                     22 
## URINARY TRACT INFECTION;PYELONEPHRITIS 
##                                     14 
##                INTRACRANIAL HEMORRHAGE 
##                                     14 
##                         UPPER GI BLEED 
##                                     13

Domains

#Use dat for annotation data
colnames(dat)
##  [1] "X"                                       
##  [2] "ROW_ID"                                  
##  [3] "SUBJECT_ID"                              
##  [4] "HADM_ID"                                 
##  [5] "CATEGORY"                                
##  [6] "DESCRIPTION"                             
##  [7] "TEXT"                                    
##  [8] "COHORT"                                  
##  [9] "Patient.and.Family.Care.Preferences"     
## [10] "Patient.and.Family.Care.Preferences.Text"
## [11] "Communication.with.Family"               
## [12] "Communication.with.Family.Text"          
## [13] "Full.Code.Status"                        
## [14] "Full.Code.Status.Text"                   
## [15] "Code.Status.Limitations"                 
## [16] "Code.Status.Limitations.Text"            
## [17] "Palliative.Care.Team.Involvement"        
## [18] "Palliative.Care.Team.Involvement.Text"   
## [19] "Ambiguous"                               
## [20] "Ambiguous.Text"                          
## [21] "Ambiguous.Comments"                      
## [22] "None"                                    
## [23] "STAMP"                                   
## [24] "operator"                                
## [25] "original_filename"                       
## [26] "lim_bow_score"                           
## [27] "cod_bow_score"                           
## [28] "car_bow_score"                           
## [29] "pal_bow_score"                           
## [30] "fam_bow_score"
nrow(dat)
## [1] 641
#Remove odd note
test <- dat[(dat$ROW_ID != tmp_note$ROW_ID),]
nrow(test)
## [1] 640
#Convert column names to lower
colnames(test) <- tolower(colnames(test))


train <- test[(test$cohort == "train"),]

valid <- test[(test$cohort == "validation"),]

full <- test
#Documentation of Patient and Family Care Preferences 
#or Code Status Limitations

fam_or_lim <- function(dat){
  count <- 0
  for (i in 1:nrow(dat)){
    #If one and not the other
    if (dat$patient.and.family.care.preferences[i] == 1 & dat$code.status.limitations[i] == 1){
      #increment
      count <- count + 1
      #The other way round
    } else if (dat$patient.and.family.care.preferences[i] == 0 & dat$code.status.limitations[i] == 1){
      count <- count + 1
    } else if (dat$patient.and.family.care.preferences[i] == 1 & dat$code.status.limitations[i] == 0){
      count <- count + 1
    } #Else nothing
  }
  return(count)
}


fam_or_lim(train)
## [1] 228
paste(round(fam_or_lim(train)/nrow(train)*100, 1), '%', sep = '')
## [1] "50.9%"
fam_or_lim(valid)
## [1] 78
paste(round(fam_or_lim(valid)/nrow(valid)*100, 1), '%', sep = '')
## [1] "40.6%"
fam_or_lim(full)
## [1] 306
paste(round(fam_or_lim(full)/nrow(full)*100, 1), '%', sep = '')
## [1] "47.8%"