Frequent Flyer Compiled Analysis

This is a summary data analysis of a corpus of patient notes (Discharge Summaries and Nursing Notes) extracted from the MIMIC-III database. This corpus resulted from the extraction of Frequent Flyer patients, who have had three or more ICU admissions over the course of a single year at any point, in addition to patients not within this cohort in an approximately 80-20 ratio.

These notes were annotated for several different clinical concepts (detailed below) over the period from August, 2015 to October, 2016. Note annotators included two clinical research associates (ETM & JWW), two junior resident physicians (JTW & JFF), and two senior resident physicians (PDT & DAG). Annotation was overseen by an attending physician (LAC).

The corpus of patient notes itself consists of the Subject ID, Hospital Admission ID, Text, Binary Indications, Batch Date, and Operators. These data will be joined to other tables of MIMIC-III v1.4 for the purpose of generating hospital admission-level as well as patient-level summary statistics.

Load Necessary MIMIC-III v1.4 Tables

Load ICUSTAYS, PATIENTS, ADMISSIONS, and NOTEEVENTS flat files which we will use to extract relevant data.

setwd("C:/Users/Edward Moseley/Documents")

stay <- read.csv("ICUSTAYS.csv",
    header = T, stringsAsFactors = F)
pat <- read.csv("PATIENTS.csv",
    header = T, stringsAsFactors = F)
adm <- read.csv("ADMISSIONS.csv",
    header = T, stringsAsFactors = F)
notes <- read.csv("NOTEEVENTS.csv",
    header = T, stringsAsFactors = F)

Utility Functions

ages() will accept a data frame, and using INTIME and DOB will calculate AGE, patients whose ages were over 89 during any visit are masked in MIMIC, so their median age (91.4) will be impuned. Patients aged 10 and younger will be discarded.

ages <- function(dat){
    dat$AGE <- (dat$INTIME - dat$DOB)/365
    #Correct for patient ages > 90 by impuning median age (91.4)
    dat[(dat$AGE >= 90),]$AGE <- 91.4
    #Remove any age < 10
    dat <- dat[(dat$AGE > 10),]
    return(dat)
}

admissionCount() will accept the ADMISSIONS table, and will order by SUBJECT_ID and ADMITTIME to determine the admission sequence numbers for each patient.

admissionCount <- function(dat){
  
  #Convert character object to Date then numeric 
  dat$ADMITTIME <- as.numeric(as.Date(gsub("([^ ]*).*", "\\1", dat$ADMITTIME), "%Y-%m-%d"))

  #order by subject id, then admittime
  dat <- dat[with(dat, order(SUBJECT_ID, ADMITTIME)),]
  
  tmp <- data.frame()
  res <- data.frame()
  for (subj in unique(dat$SUBJECT_ID)){
    tmp <- dat[(dat$SUBJECT_ID == subj),]
    tmp$ADMISSION_NUMBER <- seq.int(nrow(tmp))
    res <- rbind(res, tmp)
  }

  return(res)  
}

plotDat will

plotDat <- function(dat, column, bs, mn, xl, yl){
  tmp <- as.matrix(table(dat[[column]], dat[["cohort"]]))
  prop <- prop.table(tmp, margin = 2)#2 for column-wise proportions
  par(mar = c(5.0, 4.0, 4.0, 15), xpd = TRUE)
  barplot(prop, col = cm.colors(length(rownames(prop))), beside = bs,width = 2, main = mn, xlab = xl, ylab = yl)
  legend("topright", inset = c(-0.90,0), fill = cm.colors(length(rownames(prop))), legend=rownames(prop))
}

ffCheck() will accept the ICUSTAYS, PATIENTS, ADMISSIONS, AND NOTEEVENTS tables, in addition to a boolean which will control the call ages() on the output. It will look at each individual SUBJECT_ID for three or more hospital admissions over the course of a single year, if that condition is satisfied at any point for the patient, all of the patient’s admissions will be subset. ffCheck() can also be used to generate the entire MIMIC data set by setting ff = F

ffCheck <- function(dat, patDat, admDat, noteDat, A, ff){# A = T | F , ff = T | F
  
    admDat <- admissionCount(admDat)
  
    #Convert character object to Date then numeric 
    dat$INTIME <- as.numeric(as.Date(gsub("([^ ]*).*", "\\1", dat$INTIME), "%Y-%m-%d")) 

    dat$OUTTIME <- as.numeric(as.Date(gsub("([^ ]*).*", "\\1", dat$OUTTIME), "%Y-%m-%d"))

    #Drop ROW_ID in admDat
    admDat$ROW_ID <- NULL

    #Drop ROW_ID in patDat 
    patDat$ROW_ID <- NULL 

    #Convert INTIME/DOB/DOD/DOD_HOSP/DOD_SSN to numeric 
    patDat$DOB <- as.numeric(as.Date(gsub("([^ ]*).*", "\\1", patDat$DOB), "%Y-%m-%d")) 

    patDat$DOD <-  as.numeric(as.Date(gsub("([^ ]*).*", "\\1", patDat$DOD), "%Y-%m-%d")) 

    patDat$DOD_HOSP <- as.numeric(as.Date(gsub("([^ ]*).*", "\\1", patDat$DOD_HOSP), "%Y-%m-%d")) 

    patDat$DOD_SSN <- as.numeric(as.Date(gsub("([^ ]*).*", "\\1", patDat$DOD_SSN), "%Y-%m-%d")) 

    #Placeholder 
    #dat$ADMISSION_NUMBER <- rep(1, each = nrow(dat)) 
   
    #Generate some temporary variables for holding stuff 
    tmp <- data.frame() 
    tmpReal <- data.frame() 
    res <- data.frame()

    #for each unique subject in the data set 
    for (subj in unique(dat$SUBJECT_ID)){

      #subset subject data 
      tmp <- dat[(dat$SUBJECT_ID == subj),] 

      #order data by ICU intake 
      tmp <- tmp[(order(tmp$INTIME)),]

      #Store the data 
      tmpReal <- tmp
    
      #initial check to see if there are 3 ICU visits for that subject
      if (ff == TRUE){
        while (nrow(tmp) > 2){
          if (as.numeric(tmp$INTIME[3] - tmp$INTIME[1]) <= 365){
            res <- rbind(res, tmpReal)
            break
          } else {
            tmp <- tmp[-1,]
          }
        }
      } else {
        res <- rbind(res, tmpReal)
      }
    }
    
    res <- merge(res, patDat, by = "SUBJECT_ID")    

    #Drop SUBJECT_ID to avoid duplicates
    admDat$SUBJECT_ID <- NULL

    res <- merge(res, admDat, by = "HADM_ID")
  
    #No need for row_id anymore
    res$ROW_ID <- NULL
    
    #If A is TRUE, calculate ages
    if (A == T){
      res <- ages(res)
    }
  
    #Can drop ROW_ID, SUBJECT_ID, as HADM_ID is what we will join on
    noteDat$ROW_ID <- NULL
    noteDat$SUBJECT_ID <- NULL
  
    res <- merge(res, noteDat, by = "HADM_ID")

    res <- res[with(res, order(SUBJECT_ID, ADMITTIME)),]

    rownames(res) <- 1:nrow(res)

    return(res) 
} 

#check <- ffCheck(stay, pat, adm, notes, T, T)

#write.csv(check, "ffCohortFullM3.csv", row.names = F)

check <- read.csv("ffCohortFullM3.csv", header = T, stringsAsFactors = F)

#dat <- ffCheck(stay, pat, adm, notes, T, F)

#write.csv(dat, file = "boundTablesM3.csv", row.names = FALSE)

#dat is all data, frequent flyer inclusive

dat <- read.csv("boundTablesM3.csv", header = T, stringsAsFactors = F)

Annotation Results Data

We will pull in the annotation results file. Unfortunately, the MIMIC database from which the original note text was extracted (MIMIC2v3) was an intermediate form between MIMIC-II and MIMIC-III, and which no longer exists. This makes precise reconstruction of the original cohort selection impossible, and it is possible that some non-frequent flyers became frequent flyers in the transition from MIMIC2v3 to the recent MIMIC-III v1.4. For this reason, frequent flyer cohort designations will be recalculated and assigned.

## ###
## Results
## ###
noteDat <- read.csv(list.files()[grepl("finalFrequentFlyerResults", list.files())], 
                 header = T, stringsAsFactors = F)

noteDat$COHORT <- ifelse((noteDat$SUBJECT_ID %in% check$SUBJECT_ID), 1, 0)

dis <- noteDat[(grepl("Discharge", noteDat$CATEGORY)),]
nsn <- noteDat[(grepl("Nursing", noteDat$CATEGORY)),]

Selection Process and Annotated Notes

A frequent flyer is defined as a patient who, at any point, had three or more ICU admissions over the course of a single year. Anyone under the age of 10 years will not be considered.

#All of MIMIC
tmpTab <- table(factor(notes$CATEGORY))

paste("There are ", length(unique(adm$SUBJECT_ID))," unique subjects, ", length(unique(adm$HADM_ID)),
      " unique hospital admissions with ", tmpTab["Discharge summary"][[1]], " discharge summaries and ",
      tmpTab["Nursing"][[1]]+tmpTab["Nursing/other"][[1]], " nursing notes.", sep = '')

## [1] "There are 46520 unique subjects, 58976 unique hospital admissions with 59652 discharge summaries and 1046053 nursing notes."

#Remove 10 year olds
tmpTab <- table(factor(dat$CATEGORY))

paste("After removing those younger than 10 years of age, there are ", length(unique(dat$SUBJECT_ID))," unique subjects, ", length(unique(dat$HADM_ID)),
      " unique hospital admissions with ", tmpTab["Discharge summary"][[1]], " discharge summaries and ",
      tmpTab["Nursing"][[1]]+tmpTab["Nursing/other"][[1]], " nursing notes.", sep = '')

## [1] "After removing those younger than 10 years of age, there are 38352 unique subjects, 49533 unique hospital admissions with 58925 discharge summaries and 764780 nursing notes."

#Only frequent flyer cohort
tmpTab <- table(factor(check$CATEGORY))

paste("After selecting only for patients who had >= 3 hospital admissions in a single year, there are ", length(unique(check$SUBJECT_ID))," unique subjects, ", length(unique(check$HADM_ID)),
      " unique hospital admissions with ", tmpTab["Discharge summary"][[1]], " discharge summaries and ",
      tmpTab["Nursing"][[1]]+tmpTab["Nursing/other"][[1]], " nursing notes.", sep = '')

## [1] "After selecting only for patients who had >= 3 hospital admissions in a single year, there are 1951 unique subjects, 7078 unique hospital admissions with 10116 discharge summaries and 208789 nursing notes."

#Only annotated
tmp <- noteDat[(!duplicated(noteDat$TEXT)),]
tmpTab <- table(factor(tmp$CATEGORY))

paste("Within the annotated database, there are ", length(unique(noteDat$SUBJECT_ID))," unique subjects and ", length(unique(noteDat$HADM_ID)),
      " unique hospital admissions with ", tmpTab["Discharge summary"][[1]]+tmpTab["Discharge"][[1]], " unique discharge summaries (", round(nrow(tmp[(grepl("Discharge", tmp$CATEGORY)) & (tmp$COHORT == 1),])/nrow(tmp[(grepl("Discharge", tmp$CATEGORY)),])*100, 2), "% belonging to Frequent Flyers) and ",
      tmpTab["Nursing/Other"][[1]], " unique nursing notes (",round(nrow(tmp[(grepl("Nursing", tmp$CATEGORY)) & (tmp$COHORT == 1),])/nrow(tmp[(grepl("Nursing", tmp$CATEGORY)),])*100, 2), "% belonging to Frequent Flyers)", sep = '')

## [1] "Within the annotated database, there are 905 unique subjects and 2069 unique hospital admissions with 1102 unique discharge summaries (73.96% belonging to Frequent Flyers) and 1000 unique nursing notes (76.6% belonging to Frequent Flyers)"

#rm(adm, stay, pat, notes)
rm(tmp, tmpTab)

Missing Data

We will check to see if there are patients who have been removed in the transition to MIMIC-III v1.4.

tmp <- setdiff(unique(noteDat$SUBJECT_ID), unique(dat$SUBJECT_ID))
paste(length(tmp), "patients who are in the patient note database have been removed from MIMIC-III, and their data will not be considered further for hospital admission level or patient level analysis.")

## [1] "7 patients who are in the patient note database have been removed from MIMIC-III, and their data will not be considered further for hospital admission level or patient level analysis."

Resulting data

There are 46,520 unique subjects, 58,976 unique hospital admissions with 59,652 discharge summaries and 1,046,053 nursing notes in MIMIC-III v1.4. After removing those subjects younger than 10 years of age, there are 38,352 unique subjects, 49,533 unique hospital admissions with 58,925 discharge summaries and 764,780 nursing notes. After selecting only for patients who had >= 3 hospital admissions in a single year, there are 1,951 unique subjects, 7,078 unique hospital admissions with 10,116 discharge summaries and 208,789 nursing notes. Within the annotated database, there are 905 unique subjects and 2069 unique hospital admissions with 1102 unique discharge summaries (73.96% belonging to Frequent Flyers) and 1000 unique nursing notes (76.6% belonging to Frequent Flyers)

Hospital Admission Level Data

Here we will merge the patient note data to MIMIC on HADM_ID for hospital admission level data.

Note: 1 indicates frequent flyer status

## ###
## merge to data by hadm_id for hospital admission level info
## ###

#Drop columns to avoid duplicate variables

#Drop dat$subject_id
dat$SUBJECT_ID <- NULL
#drop dat$category
dat$CATEGORY <- NULL
#drop dat$text
dat$TEXT <- NULL

#Merge note data to dat to get all relevant data
test <- merge(noteDat, dat, by = "HADM_ID")

colnames(test) <- tolower(colnames(test))

#Remove duplicates for hospital admission level info
test <- test[(!duplicated(test$hadm_id)),]


## LOS
boxplot(test$los ~ test$cohort, 
        main = "ICU Length of Stay (days) by cohort (1 = Frequent Flyer)", 
        xlab = "Cohort", 
        ylab = "Length of Stay (days)")

t.test(test$los ~ test$cohort)

## 
##  Welch Two Sample t-test
## 
## data:  test$los by test$cohort
## t = -1.9857, df = 659.82, p-value = 0.04748
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -1.94223347 -0.01089127
## sample estimates:
## mean in group 0 mean in group 1 
##        4.454292        5.430854

wilcox.test(test$los ~ test$cohort)

## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  test$los by test$cohort
## W = 90738, p-value = 0.003568
## alternative hypothesis: true location shift is not equal to 0

## Admission Type
plotDat(test, "admission_type", bs = F, mn = "Admission Type Proportions by Cohort", xl = "Cohort", yl = "Proportion")

tmp <- table(test$admission_type, test$cohort)
tmp

##            
##               0   1
##   ELECTIVE   41  52
##   EMERGENCY 217 708
##   URGENT      6  22

chisq.test(tmp)

## 
##  Pearson's Chi-squared test
## 
## data:  tmp
## X-squared = 19.275, df = 2, p-value = 6.523e-05

pairwiseNominalIndependence(
  as.matrix(tmp), 
  fisher = F, gtest = F, chisq = T, method = "fdr")

##             Comparison  p.Chisq p.adj.Chisq
## 1 ELECTIVE : EMERGENCY 0.000023    0.000069
## 2    ELECTIVE : URGENT 0.052900    0.079400
## 3   EMERGENCY : URGENT 0.981000    0.981000

## Admission Location
plotDat(test, "admission_location", bs = F, mn = "Admission Location Proportions by Cohort", xl = "Cohort", yl = "Proportion")

tmp <- table(test$admission_location, test$cohort)
tmp

##                            
##                               0   1
##   CLINIC REFERRAL/PREMATURE  34  81
##   EMERGENCY ROOM ADMIT      137 516
##   PHYS REFERRAL/NORMAL DELI  48  85
##   TRANSFER FROM HOSP/EXTRAM  45  90
##   TRANSFER FROM OTHER HEALT   0   4
##   TRANSFER FROM SKILLED NUR   0   6

chisq.test(tmp)

## 
##  Pearson's Chi-squared test
## 
## data:  tmp
## X-squared = 23.781, df = 5, p-value = 0.0002392

pairwiseNominalIndependence(
  as.matrix(tmp), 
  fisher = F, gtest = F, chisq = T, method = "fdr")

##                                               Comparison  p.Chisq
## 1       CLINIC REFERRAL/PREMATURE : EMERGENCY ROOM ADMIT 0.055000
## 2  CLINIC REFERRAL/PREMATURE : PHYS REFERRAL/NORMAL DELI 0.340000
## 3  CLINIC REFERRAL/PREMATURE : TRANSFER FROM HOSP/EXTRAM 0.616000
## 4  CLINIC REFERRAL/PREMATURE : TRANSFER FROM OTHER HEALT 0.469000
## 5  CLINIC REFERRAL/PREMATURE : TRANSFER FROM SKILLED NUR 0.269000
## 6       EMERGENCY ROOM ADMIT : PHYS REFERRAL/NORMAL DELI 0.000281
## 7       EMERGENCY ROOM ADMIT : TRANSFER FROM HOSP/EXTRAM 0.002810
## 8       EMERGENCY ROOM ADMIT : TRANSFER FROM OTHER HEALT 0.680000
## 9       EMERGENCY ROOM ADMIT : TRANSFER FROM SKILLED NUR 0.450000
## 10 PHYS REFERRAL/NORMAL DELI : TRANSFER FROM HOSP/EXTRAM 0.730000
## 11 PHYS REFERRAL/NORMAL DELI : TRANSFER FROM OTHER HEALT 0.338000
## 12 PHYS REFERRAL/NORMAL DELI : TRANSFER FROM SKILLED NUR 0.168000
## 13 TRANSFER FROM HOSP/EXTRAM : TRANSFER FROM OTHER HEALT 0.389000
## 14 TRANSFER FROM HOSP/EXTRAM : TRANSFER FROM SKILLED NUR 0.205000
## 15 TRANSFER FROM OTHER HEALT : TRANSFER FROM SKILLED NUR      NaN
##    p.adj.Chisq
## 1      0.25700
## 2      0.59500
## 3      0.71900
## 4      0.59700
## 5      0.59500
## 6      0.00393
## 7      0.01970
## 8      0.73000
## 9      0.59700
## 10     0.73000
## 11     0.59500
## 12     0.57400
## 13     0.59700
## 14     0.57400
## 15         NaN

## Discharge Location
plotDat(test, "discharge_location", bs = T,mn = "Admission Location Proportions by Cohort", xl = "Cohort", yl = "Proportion")

tmp <- table(test$discharge_location, test$cohort)
tmp

##                            
##                               0   1
##   DEAD/EXPIRED               24  44
##   DISC-TRAN CANCER/CHLDRN H  10  21
##   DISCH-TRAN TO PSYCH HOSP    0   4
##   HOME                       67 183
##   HOME HEALTH CARE           73 206
##   HOME WITH HOME IV PROVIDR   1   4
##   HOSPICE-MEDICAL FACILITY    0   2
##   LEFT AGAINST MEDICAL ADVI   2  20
##   LONG TERM CARE HOSPITAL     9  37
##   REHAB/DISTINCT PART HOSP   32 140
##   SHORT TERM HOSPITAL         1   4
##   SNF                        45 117

chisq.test(tmp)

## 
##  Pearson's Chi-squared test
## 
## data:  tmp
## X-squared = 15.464, df = 11, p-value = 0.1622

pairwiseNominalIndependence(
  as.matrix(tmp), 
  fisher = F, gtest = F, chisq = T, method = "fdr")

##                                               Comparison p.Chisq
## 1               DEAD/EXPIRED : DISC-TRAN CANCER/CHLDRN H 0.94700
## 2                DEAD/EXPIRED : DISCH-TRAN TO PSYCH HOSP 0.36300
## 3                                    DEAD/EXPIRED : HOME 0.22100
## 4                        DEAD/EXPIRED : HOME HEALTH CARE 0.17600
## 5               DEAD/EXPIRED : HOME WITH HOME IV PROVIDR 0.83600
## 6                DEAD/EXPIRED : HOSPICE-MEDICAL FACILITY 0.77900
## 7               DEAD/EXPIRED : LEFT AGAINST MEDICAL ADVI 0.03690
## 8                 DEAD/EXPIRED : LONG TERM CARE HOSPITAL 0.10800
## 9                DEAD/EXPIRED : REHAB/DISTINCT PART HOSP 0.00973
## 10                    DEAD/EXPIRED : SHORT TERM HOSPITAL 0.83600
## 11                                    DEAD/EXPIRED : SNF 0.32800
## 12  DISC-TRAN CANCER/CHLDRN H : DISCH-TRAN TO PSYCH HOSP 0.45000
## 13                      DISC-TRAN CANCER/CHLDRN H : HOME 0.66800
## 14          DISC-TRAN CANCER/CHLDRN H : HOME HEALTH CARE 0.60800
## 15 DISC-TRAN CANCER/CHLDRN H : HOME WITH HOME IV PROVIDR 0.97700
## 16  DISC-TRAN CANCER/CHLDRN H : HOSPICE-MEDICAL FACILITY 0.86600
## 17 DISC-TRAN CANCER/CHLDRN H : LEFT AGAINST MEDICAL ADVI 0.09840
## 18   DISC-TRAN CANCER/CHLDRN H : LONG TERM CARE HOSPITAL 0.31900
## 19  DISC-TRAN CANCER/CHLDRN H : REHAB/DISTINCT PART HOSP 0.13700
## 20       DISC-TRAN CANCER/CHLDRN H : SHORT TERM HOSPITAL 0.97700
## 21                       DISC-TRAN CANCER/CHLDRN H : SNF 0.77200
## 22                       DISCH-TRAN TO PSYCH HOSP : HOME 0.52600
## 23           DISCH-TRAN TO PSYCH HOSP : HOME HEALTH CARE 0.54000
## 24  DISCH-TRAN TO PSYCH HOSP : HOME WITH HOME IV PROVIDR 1.00000
## 25   DISCH-TRAN TO PSYCH HOSP : HOSPICE-MEDICAL FACILITY     NaN
## 26  DISCH-TRAN TO PSYCH HOSP : LEFT AGAINST MEDICAL ADVI 1.00000
## 27    DISCH-TRAN TO PSYCH HOSP : LONG TERM CARE HOSPITAL 0.76500
## 28   DISCH-TRAN TO PSYCH HOSP : REHAB/DISTINCT PART HOSP 0.76600
## 29        DISCH-TRAN TO PSYCH HOSP : SHORT TERM HOSPITAL 1.00000
## 30                        DISCH-TRAN TO PSYCH HOSP : SNF 0.50600
## 31                               HOME : HOME HEALTH CARE 0.94700
## 32                      HOME : HOME WITH HOME IV PROVIDR 1.00000
## 33                       HOME : HOSPICE-MEDICAL FACILITY 0.95900
## 34                      HOME : LEFT AGAINST MEDICAL ADVI 0.11500
## 35                        HOME : LONG TERM CARE HOSPITAL 0.39600
## 36                       HOME : REHAB/DISTINCT PART HOSP 0.06650
## 37                            HOME : SHORT TERM HOSPITAL 1.00000
## 38                                            HOME : SNF 0.91700
## 39          HOME HEALTH CARE : HOME WITH HOME IV PROVIDR 1.00000
## 40           HOME HEALTH CARE : HOSPICE-MEDICAL FACILITY 0.97500
## 41          HOME HEALTH CARE : LEFT AGAINST MEDICAL ADVI 0.12700
## 42            HOME HEALTH CARE : LONG TERM CARE HOSPITAL 0.44000
## 43           HOME HEALTH CARE : REHAB/DISTINCT PART HOSP 0.08350
## 44                HOME HEALTH CARE : SHORT TERM HOSPITAL 1.00000
## 45                                HOME HEALTH CARE : SNF 0.79700
## 46  HOME WITH HOME IV PROVIDR : HOSPICE-MEDICAL FACILITY 1.00000
## 47 HOME WITH HOME IV PROVIDR : LEFT AGAINST MEDICAL ADVI 1.00000
## 48   HOME WITH HOME IV PROVIDR : LONG TERM CARE HOSPITAL 1.00000
## 49  HOME WITH HOME IV PROVIDR : REHAB/DISTINCT PART HOSP 1.00000
## 50       HOME WITH HOME IV PROVIDR : SHORT TERM HOSPITAL 1.00000
## 51                       HOME WITH HOME IV PROVIDR : SNF 1.00000
## 52  HOSPICE-MEDICAL FACILITY : LEFT AGAINST MEDICAL ADVI 1.00000
## 53    HOSPICE-MEDICAL FACILITY : LONG TERM CARE HOSPITAL 1.00000
## 54   HOSPICE-MEDICAL FACILITY : REHAB/DISTINCT PART HOSP 1.00000
## 55        HOSPICE-MEDICAL FACILITY : SHORT TERM HOSPITAL 1.00000
## 56                        HOSPICE-MEDICAL FACILITY : SNF 0.93800
## 57   LEFT AGAINST MEDICAL ADVI : LONG TERM CARE HOSPITAL 0.45600
## 58  LEFT AGAINST MEDICAL ADVI : REHAB/DISTINCT PART HOSP 0.41900
## 59       LEFT AGAINST MEDICAL ADVI : SHORT TERM HOSPITAL 1.00000
## 60                       LEFT AGAINST MEDICAL ADVI : SNF 0.10400
## 61    LONG TERM CARE HOSPITAL : REHAB/DISTINCT PART HOSP 1.00000
## 62         LONG TERM CARE HOSPITAL : SHORT TERM HOSPITAL 1.00000
## 63                         LONG TERM CARE HOSPITAL : SNF 0.35200
## 64        REHAB/DISTINCT PART HOSP : SHORT TERM HOSPITAL 1.00000
## 65                        REHAB/DISTINCT PART HOSP : SNF 0.06300
## 66                             SHORT TERM HOSPITAL : SNF 1.00000
##    p.adj.Chisq
## 1        1.000
## 2        1.000
## 3        1.000
## 4        0.953
## 5        1.000
## 6        1.000
## 7        0.810
## 8        0.810
## 9        0.632
## 10       1.000
## 11       1.000
## 12       1.000
## 13       1.000
## 14       1.000
## 15       1.000
## 16       1.000
## 17       0.810
## 18       1.000
## 19       0.810
## 20       1.000
## 21       1.000
## 22       1.000
## 23       1.000
## 24       1.000
## 25         NaN
## 26       1.000
## 27       1.000
## 28       1.000
## 29       1.000
## 30       1.000
## 31       1.000
## 32       1.000
## 33       1.000
## 34       0.810
## 35       1.000
## 36       0.810
## 37       1.000
## 38       1.000
## 39       1.000
## 40       1.000
## 41       0.810
## 42       1.000
## 43       0.810
## 44       1.000
## 45       1.000
## 46       1.000
## 47       1.000
## 48       1.000
## 49       1.000
## 50       1.000
## 51       1.000
## 52       1.000
## 53       1.000
## 54       1.000
## 55       1.000
## 56       1.000
## 57       1.000
## 58       1.000
## 59       1.000
## 60       0.810
## 61       1.000
## 62       1.000
## 63       1.000
## 64       1.000
## 65       0.810
## 66       1.000

Patient Level Data

Here we will merge the patient note data to MIMIC on SUBJECT_ID for patient level data.

#Need to re-import dat
dat <- read.csv("boundTablesM3.csv", header = T, stringsAsFactors = F)

#Merge note data to dat to get all data 
test <- merge(noteDat, dat, by = "SUBJECT_ID")
colnames(test) <- tolower(colnames(test))

#remove duplicates for patient level data
test <- test[(!duplicated(test$subject_id)),]

## AGE
boxplot(test$age ~ test$cohort)

t.test(test$age ~ test$cohort)

## 
##  Welch Two Sample t-test
## 
## data:  test$age by test$cohort
## t = 1.9949, df = 804.39, p-value = 0.04639
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.03514605 4.34916117
## sample estimates:
## mean in group 0 mean in group 1 
##        65.23107        63.03891

wilcox.test(test$age ~ test$cohort)

## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  test$age by test$cohort
## W = 105790, p-value = 0.04804
## alternative hypothesis: true location shift is not equal to 0

## GENDER
plotDat(test, "gender", bs = F, mn = "Gender by Cohort", xl = "Cohort", yl = "Proportion")

tmp <- table(test$gender, test$cohort)
prop.test(tmp)

## 
##  2-sample test for equality of proportions with continuity
##  correction
## 
## data:  tmp
## X-squared = 0.54146, df = 1, p-value = 0.4618
## alternative hypothesis: two.sided
## 95 percent confidence interval:
##  -0.09406183  0.04059508
## sample estimates:
##    prop 1    prop 2 
## 0.4046392 0.4313725

#When to correct for continuity or not?
#https://www.r-bloggers.com/comparison-of-two-proportions-parametric-z-test-and-non-parametric-chi-squared-methods/
#for a low number of test samples, correct = TRUE


##Insurance
plotDat(test, "insurance", bs = F, mn = "Insurance by Cohort", xl = "Cohort", yl = "Proportion")

tmp <- table(test$insurance, test$cohort)
chisq.test(tmp)

## 
##  Pearson's Chi-squared test
## 
## data:  tmp
## X-squared = 19.607, df = 4, p-value = 0.0005971

pairwiseNominalIndependence(
  as.matrix(tmp), 
  fisher = F, gtest = F, chisq = T, method = "fdr")

##               Comparison  p.Chisq p.adj.Chisq
## 1  Government : Medicaid 0.168000     0.33600
## 2  Government : Medicare 0.208000     0.34700
## 3   Government : Private 0.911000     0.91100
## 4  Government : Self Pay 0.465000     0.58100
## 5    Medicaid : Medicare 0.662000     0.73600
## 6     Medicaid : Private 0.009750     0.04880
## 7    Medicaid : Self Pay 0.053300     0.16200
## 8     Medicare : Private 0.000678     0.00678
## 9    Medicare : Self Pay 0.064700     0.16200
## 10    Private : Self Pay 0.253000     0.36100

##Ethnicity
##Clean ethnicity
test[(grepl("WHITE|PORTUGUESE", test$ethnicity)),]$ethnicity <- "WHITE" 
test[(grepl("ASIAN", test$ethnicity)),]$ethnicity <- "ASIAN" 
test[(grepl("BLACK", test$ethnicity)),]$ethnicity <- "BLACK" 
test[(grepl("HISPANIC", test$ethnicity)),]$ethnicity <- "HISPANIC"
test[(grepl("MIDDLE|NATIVE|MULTI|DECLINED|UNABLE|OTHER|NOT",test$ethnicity)),]$ethnicity <- "UNKNOWN"

plotDat(test, "ethnicity", bs = F, mn = "Ethnicity by Cohort", xl = "Cohort", yl = "Proportion")

tmp <- table(test$ethnicity, test$cohort)
chisq.test(tmp)

## 
##  Pearson's Chi-squared test
## 
## data:  tmp
## X-squared = 29.12, df = 4, p-value = 7.392e-06

pairwiseNominalIndependence(
  as.matrix(tmp), 
  fisher = F, gtest = F, chisq = T, method = "fdr")

##            Comparison  p.Chisq p.adj.Chisq
## 1       ASIAN : BLACK 2.55e-02    6.38e-02
## 2    ASIAN : HISPANIC 8.45e-01    9.39e-01
## 3     ASIAN : UNKNOWN 5.95e-01    8.42e-01
## 4       ASIAN : WHITE 6.74e-01    8.42e-01
## 5    BLACK : HISPANIC 5.97e-02    1.19e-01
## 6     BLACK : UNKNOWN 2.26e-07    2.26e-06
## 7       BLACK : WHITE 1.01e-04    5.05e-04
## 8  HISPANIC : UNKNOWN 1.83e-01    3.05e-01
## 9    HISPANIC : WHITE 1.00e+00    1.00e+00
## 10    UNKNOWN : WHITE 3.94e-03    1.31e-02

## MARITAL_STATUS
#plotDat(test, "marital_status", bs = F, mn = "Marital Status by Cohort", xl = "Cohort", yl = "Proportion")
#table(test$marital_status, test$cohort)

## Mortality (in-hospital)
plotDat(test, "hospital_expire_flag", bs = F, mn = "In-Hospital Mortality by Cohort", xl = "Cohort", yl = "Proportion")

table(test$hospital_expire_flag, test$cohort)

##    
##       0   1
##   0 328 470
##   1  49  51

t.test(test$hospital_expire_flag, test$cohort)

## 
##  Welch Two Sample t-test
## 
## data:  test$hospital_expire_flag and test$cohort
## t = -23.991, df = 1522.6, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.5071502 -0.4304890
## sample estimates:
## mean of x mean of y 
## 0.1113586 0.5801782

wilcox.test(test$hospital_expire_flag, test$cohort)

## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  test$hospital_expire_flag and test$cohort
## W = 214170, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0

## Mortality (overall)
plotDat(test, "expire_flag", bs = F, mn = "Overall Mortality by Cohort", xl = "Cohort", yl = "Proportion")

table(test$expire_flag, test$cohort)

##    
##       0   1
##   0 167 138
##   1 210 383

t.test(test$expire_flag, test$cohort)

## 
##  Welch Two Sample t-test
## 
## data:  test$expire_flag and test$cohort
## t = 3.5107, df = 1791, p-value = 0.000458
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.0353860 0.1249704
## sample estimates:
## mean of x mean of y 
## 0.6603563 0.5801782

wilcox.test(test$expire_flag, test$cohort)

## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  test$expire_flag and test$cohort
## W = 435530, p-value = 0.0004659
## alternative hypothesis: true location shift is not equal to 0

Survival

This survival curve will be generated by normalizing all patient’s day 0 to their first hospital admission.

## SURVIVAL
library("survival")

## Warning: package 'survival' was built under R version 3.3.3

library("survminer")

## Warning: package 'survminer' was built under R version 3.3.3

## Loading required package: ggplot2

## Loading required package: ggpubr

## Warning: package 'ggpubr' was built under R version 3.3.3

## Loading required package: magrittr

#if no death, time is length of data collection, else it is length from first admission to death
mimicDuration <- as.Date(c("01-June-2001", "01-October-2012"), format = "%d-%B-%Y")
maxSurv <- as.numeric(mimicDuration[2]-mimicDuration[1])

test$time <- ifelse(is.na(test$dod), maxSurv, test$dod - test$intime)

#Data are right censored
fit <- survfit(Surv(time, expire_flag == 1, type = "right") ~ cohort,
               data = test)


ggsurvplot(fit, data = test, 
    risk.table = T, 
    pval = TRUE,#false? 
    xlim = c(0,3650),#maxSurv), 
    break.time.by = 365)

Word Statistics

colnames(dis) <- tolower(colnames(dis))
colnames(nsn) <- tolower(colnames(nsn))

disWordCount <- sapply(gregexpr("\\W+", unique(dis$text)), length) + 1
nsnWordCount <- sapply(gregexpr("\\W+", unique(nsn$text)), length) + 1

tmp <- as.data.frame(
  cbind(
    append(disWordCount, nsnWordCount), 
    append(rep("Discharges", each = length(disWordCount)), 
           rep("Nursing Notes", each = length(nsnWordCount))))
  )

colnames(tmp) <- c("Word.Count", "Note.Type")

tmp$Word.Count <- as.numeric(as.character(tmp$Word.Count))

print(paste("Discharge Summaries had a median word count of ", summary(disWordCount)[3], " (Q1-Q3: ", summary(disWordCount)[2], '-', summary(disWordCount)[5], ")", sep = ''))

## [1] "Discharge Summaries had a median word count of 1676 (Q1-Q3: 1207-2284)"

print(paste("Nursing Notes had a median word count of ", summary(nsnWordCount)[3], " (Q1-Q3: ", summary(nsnWordCount)[2], '-', summary(nsnWordCount)[5], ")", sep = ''))

## [1] "Nursing Notes had a median word count of 223 (Q1-Q3: 132-329)"

boxplot(tmp$Word.Count~tmp$Note.Type, ylab = "Word Count", xlab = "Note Type", main = "Word Count by Note Type")

Vocabulary Diversity

#disVocab <- strsplit(unique(dis$text), split = ' ')

#Keep only alpha numeric
disVocab <- gsub('[^[:alnum:]]', ' ', unique(dis$text))
#Ditch numeric
disVocab <- gsub('[[:digit:]]+', '', disVocab)

#Split for tabulation
disVocab <- strsplit(disVocab, ' ')

disVocab <- lapply(disVocab, tolower)

cat("Discharge summaries have a vocabulary of", (length(unique(unlist(disVocab))) - 1), "unique words.\n") # -1 for ""

## Discharge summaries have a vocabulary of 21794 unique words.

rm(disVocab)

#Keep only alpha numeric
nsnVocab <- gsub('[^[:alnum:]]', ' ', unique(nsn$text))
#Ditch numeric
nsnVocab <- gsub('[[:digit:]]+', '', nsnVocab)

#Split for tabulation
nsnVocab <- strsplit(nsnVocab, ' ')
#tolower case to avoid duplications
nsnVocab <- lapply(nsnVocab, tolower)

length(unique(unlist(nsnVocab))) - 1 # -1 for ""

## [1] 10130

cat("Nursing Notes have a vocabulary of", (length(unique(unlist(nsnVocab))) - 1), "unique words.\n") # -1 for ""

## Nursing Notes have a vocabulary of 10130 unique words.

rm(nsnVocab)

Data Set Balance

## ###
## opStats accepts a the final note data.frame and an operator
## spits out some stats to console
## ###

opStats <- function(dat, operator, dis){
    dat <- dat[(dat$operator == operator),]
    
    tmp <- as.character()
    
    if (dis){
      tmp <- " unique Discharge Summaries."
    } else {
      tmp <- " unique Nursing Notes."
    }

    paste(operator, " has ", nrow(dat), 
    " observations of ", length(unique(dat$text)), 
    tmp, sep = '')
    #return(dat)
}

opStats(dis, "ETM", T)

## [1] "ETM has 535 observations of 496 unique Discharge Summaries."

opStats(dis, "JTW", T)

## [1] "JTW has 801 observations of 759 unique Discharge Summaries."

opStats(dis, "JFF", T)

## [1] "JFF has 433 observations of 415 unique Discharge Summaries."

opStats(dis, "JWW", T)

## [1] "JWW has 727 observations of 698 unique Discharge Summaries."

opStats(dis, "PAT", T)

## [1] "PAT has 50 observations of 50 unique Discharge Summaries."

opStats(dis, "DAG", T)

## [1] "DAG has 25 observations of 25 unique Discharge Summaries."

opStats(nsn, "ETM", F)

## [1] "ETM has 500 observations of 499 unique Nursing Notes."

opStats(nsn, "JTW", F)

## [1] "JTW has 499 observations of 499 unique Nursing Notes."

opStats(nsn, "JFF", F)

## [1] "JFF has 499 observations of 499 unique Nursing Notes."

opStats(nsn, "JWW", F)

## [1] "JWW has 500 observations of 500 unique Nursing Notes."

#PAT and DAG did not annotate nursing notes
#opStats(nsn, "PAT")
#opStats(nsn, "DAG")

dis <- dis[,c("subject_id", 
                  "batch.id", 
                  "category", 
                  "chart.time", 
                  "real.time", 
                  "cohort", 
                  "hadm_id", 
                  "icu.id", 
                  "note.type", 
                  "text", 
                  "operator",
                  "none", 
                  "advanced.cancer", 
                  "advanced.heart.disease", 
                  "advanced.lung.disease", 
                  "alcohol.abuse", 
                  "chronic.neurological.dystrophies", 
                  "chronic.pain.fibromyalgia", 
                  "dementia", 
                  "depression", 
                  "developmental.delay.retardation", 
                  "non.adherence", 
                  "obesity", 
                  "other.substance.abuse",
                  "schizophrenia.and.other.psychiatric.disorders",
                  "unsure")]

nsn <- nsn[,c("subject_id", 
                  "batch.id", 
                  "category", 
                  "chart.time", 
                  "real.time", 
                  "cohort", 
                  "hadm_id", 
                  "icu.id", 
                  "note.type", 
                  "text", 
                  "operator",
                  "none", 
                  "advanced.cancer", 
                  "advanced.heart.disease", 
                  "advanced.lung.disease", 
                  "alcohol.abuse", 
                  "chronic.neurological.dystrophies", 
                  "chronic.pain.fibromyalgia", 
                  "dementia", 
                  "depression", 
                  "developmental.delay.retardation", 
                  "non.adherence", 
                  "obesity", 
                  "other.substance.abuse",
                  "schizophrenia.and.other.psychiatric.disorders",
                  "unsure")]

percPosBatch <- function(dat, indication, operator, batch, res){

  #Subset the appropriate operator
  dat <- dat[(dat$operator == operator),]

  #Remove duplicate observations (grab the first)
  dat <- dat[(!duplicated(dat$text)),]

  #Subset the appropriate batch
  dat <- dat[(dat$batch.id == batch),]

  #keep this value to generate %'s
  obs <- nrow(dat)

  #Find index of indication so we can have data the long way from the get-go
  indexOne <- which(colnames(dat) == indication)

  #count
  count <- sum(dat[,indexOne])

  #percentage
  percent <- paste(round((count/obs)*100,2),'%', sep = '')
     
  res[nrow(res)+1,] <- c(batch, operator, obs, count, percent, indication)
  
  return(res)
}


posFrame <- data.frame()
for (name in c("Batch", "Operator", "Total.Obs", "Count", "Percent", "Indication")) posFrame[[name]] <- as.numeric()


batches <- levels(factor(dis$batch.id))

inds <- colnames(dis)[which(colnames(dis)=="none"):which(colnames(dis)=="unsure")]

for (bat in batches){
  for (ind in inds){
        posFrame <- percPosBatch(dis, ind, "ETM", bat, posFrame)
    }
}

for (bat in batches){
  for (ind in inds){
        posFrame <- percPosBatch(dis, ind, "JTW", bat, posFrame)
    }
}


for (bat in batches){
  for (ind in inds){
        posFrame <- percPosBatch(dis, ind, "JFF", bat, posFrame)
    }
}

for (bat in batches){
  for (ind in inds){
        posFrame <- percPosBatch(dis, ind, "JWW", bat, posFrame)
    }
}

#To clean:
#posFrame <- posFrame[(posFrame$Percent != "NaN%"),]

percPosTot <- function(dat, indication, res){

  #Remove duplicate observations (grab the first note-- 
  #would it be better to average the annotator's results?)
  dat <- dat[(!duplicated(dat$text)),]

  #keep this value to generate %'s
  obs <- nrow(dat)

  #Find index of indication so we can have data the long way from the get-go
  indexOne <- which(colnames(dat) == indication)

  #count
  count <- sum(dat[,indexOne])

  #percentage
  percent <- paste(round((count/obs)*100,2),'%', sep = '')
     
  res[nrow(res)+1,] <- c(obs, count, percent, indication)
  
  return(res)
}


posTotFrame <- data.frame()
for (name in c("Total.Obs", "Count", "Percent", "Indication")) posTotFrame[[name]] <- as.numeric()

inds <- colnames(dis)[which(colnames(dis)=="none"):which(colnames(dis)=="unsure")]

for (ind in inds){
    posTotFrame <- percPosTot(dis, ind, posTotFrame)
}

disPosTot <- posTotFrame
disPosTot$NoteType <- rep("Discharge", each = nrow(disPosTot))

disPosTot <- disPosTot[order(disPosTot$Indication),]
row.names(disPosTot) <- 1:nrow(disPosTot)
disPosTot

##    Total.Obs Count Percent                                    Indication
## 1       1102    74   6.72%                               advanced.cancer
## 2       1102   172  15.61%                        advanced.heart.disease
## 3       1102    88   7.99%                         advanced.lung.disease
## 4       1102   127  11.52%                                 alcohol.abuse
## 5       1102   187  16.97%              chronic.neurological.dystrophies
## 6       1102   150  13.61%                     chronic.pain.fibromyalgia
## 7       1102    43    3.9%                                      dementia
## 8       1102   267  24.23%                                    depression
## 9       1102    14   1.27%               developmental.delay.retardation
## 10      1102    78   7.08%                                 non.adherence
## 11      1102   315  28.58%                                          none
## 12      1102    72   6.53%                                       obesity
## 13      1102    92   8.35%                         other.substance.abuse
## 14      1102   167  15.15% schizophrenia.and.other.psychiatric.disorders
## 15      1102   150  13.61%                                        unsure
##     NoteType
## 1  Discharge
## 2  Discharge
## 3  Discharge
## 4  Discharge
## 5  Discharge
## 6  Discharge
## 7  Discharge
## 8  Discharge
## 9  Discharge
## 10 Discharge
## 11 Discharge
## 12 Discharge
## 13 Discharge
## 14 Discharge
## 15 Discharge

#To hang on to this
#write.csv(posTotFrame, file = "posDisNotesTotal.csv", row.names = FALSE)

#rm(posTotFrame)

library("ggplot2")
attach(posTotFrame)
ggplot(data=posTotFrame,
       aes(x=Indication, y=as.numeric(Count)), colour = as.factor(NoteType)) +
     geom_bar(stat = "identity", position = "dodge") + coord_flip()

detach(posTotFrame)

## ###
## Nursing Notes
## ###

posFrame <- data.frame()
for (name in c("Batch", "Operator", "Total.Obs", "Count", "Percent", "Indication")) posFrame[[name]] <- as.numeric()


ops <- levels(factor(nsn$operator))

batches <- levels(factor(nsn$batch.id))

inds <- colnames(nsn)[which(colnames(nsn)=="none"):which(colnames(nsn)=="unsure")]


for (bat in batches){
  for (ind in inds){
        posFrame <- percPosBatch(nsn, ind, "ETM", bat, posFrame)
    }
}

for (bat in batches){
  for (ind in inds){
        posFrame <- percPosBatch(nsn, ind, "JTW", bat, posFrame)
    }
}


for (bat in batches){
  for (ind in inds){
        posFrame <- percPosBatch(nsn, ind, "JFF", bat, posFrame)
    }
}

for (bat in batches){
  for (ind in inds){
        posFrame <- percPosBatch(nsn, ind, "JWW", bat, posFrame)
    }
}



posTotFrame <- data.frame()
for (name in c("Total.Obs", "Count", "Percent", "Indication")) posTotFrame[[name]] <- as.numeric()

inds <- colnames(nsn)[which(colnames(nsn)=="none"):which(colnames(nsn)=="unsure")]

for (ind in inds){
    posTotFrame <- percPosTot(nsn, ind, posTotFrame)
}

posTotFrame$NoteType <- rep("Nursing", each = nrow(posTotFrame))

posTotFrame

##    Total.Obs Count Percent                                    Indication
## 1       1000   693   69.3%                                          none
## 2       1000    24    2.4%                               advanced.cancer
## 3       1000    46    4.6%                        advanced.heart.disease
## 4       1000    31    3.1%                         advanced.lung.disease
## 5       1000    47    4.7%                                 alcohol.abuse
## 6       1000    51    5.1%              chronic.neurological.dystrophies
## 7       1000    47    4.7%                     chronic.pain.fibromyalgia
## 8       1000    25    2.5%                                      dementia
## 9       1000    55    5.5%                                    depression
## 10      1000     5    0.5%               developmental.delay.retardation
## 11      1000    48    4.8%                                 non.adherence
## 12      1000    12    1.2%                                       obesity
## 13      1000    22    2.2%                         other.substance.abuse
## 14      1000    33    3.3% schizophrenia.and.other.psychiatric.disorders
## 15      1000    95    9.5%                                        unsure
##    NoteType
## 1   Nursing
## 2   Nursing
## 3   Nursing
## 4   Nursing
## 5   Nursing
## 6   Nursing
## 7   Nursing
## 8   Nursing
## 9   Nursing
## 10  Nursing
## 11  Nursing
## 12  Nursing
## 13  Nursing
## 14  Nursing
## 15  Nursing

#hang on to this
#write.csv(posTotFrame, file = "posNurseNotesTotal.csv", row.names = FALSE)

attach(posTotFrame)
ggplot(data=posTotFrame,
       aes(x=Indication, y=as.numeric(Count)), colour = as.factor(NoteType)) +
     geom_bar(stat = "identity", position = "dodge") + coord_flip()

detach(posTotFrame)

Interoperator Error

Cohen’s Kappa was used to measure interoperator error for discharge summaries (nursing notes to follow). Note: this code needs to be fixed– getting NaN for Kappa and NA values for Operators

kappaCalc <- function(x){
  po <- (x[1,1]+x[2,2])/sum(x) #Observed proportional agreement
  peA <- (sum(x[,1])/sum(x)) #Probability of random agreements
  peB <- (sum(x[1,])/sum(x))
  pe1 <- (sum(x[,2])/sum(x))
  pe2 <- (sum(x[2,])/sum(x))
  prpos <- peA*peB #Probability of random positives
  prneg <- pe1*pe2 #Probability of random negatives
  pre <- prpos+prneg
  k <- (po-pre)/(1-pre) #Kappa calculation
  return(k)
}

## ###
## agreement() accepts a data frame and returns one
## ###

agreement <- function(dat, indication, team, batch, res){
  #Remove arbitrators
  dat <- dat[(dat$operator != "DAG"),]
  dat <- dat[(dat$operator != "PAT"),]
  
  #Subset the appropriate batch
  dat <- dat[(dat$batch.id == batch),]
  
  #Library of unique HADM.IDs
  lib <- unique(dat$hadm_id)
  
  #Find index of indication
  indexOne <- which(colnames(dat) == indication)
  
  #Create table for calculating kappa
  m <- matrix(0, nrow=2, ncol=2)

  opTmp <- as.character()
  
  #Loop through, populating data frame
  for (i in 1:length(lib)){
      #Subset by hadm id
      tmp <- dat[(dat$hadm_id == lib[i]),]
      
      #Check that the operators are not identical and there are two
      if (nrow(tmp) == 2 & (tmp$operator[1] != tmp$operator[2]) & (all(team %in% tmp$operator))){
          
          if(tmp[1,indexOne] == 1 & tmp[2,indexOne] == 1){
            m[1,1] <- m[1,1] + 2
          } else if (tmp[1, indexOne] == 1 & tmp[2, indexOne] == 0){
            m[1,1] <- m[1,1] + 1
            m[2,1] <- m[2,1] + 1 
          } else if (tmp[1, indexOne] == 0 & tmp[2, indexOne] == 1){
            m[1,1] <- m[1,1] + 1
            m[1,2] <- m[1,2] + 1
          } else if (tmp[1, indexOne] == 0 & tmp[2, indexOne] == 0){
            m[2,2] <- m[2,2] + 2
          }
        
          #Hold the operators
          opTmp <- sort(tmp$operator)
      }
      
  }
          
          res[nrow(res)+1,] <- c(batch,paste(opTmp[1],opTmp[2], sep = " & "), kappaCalc(m), indication)
  
  return(res)
}


kappaFrame <- data.frame()
for (name in c("Batch", "Operators", "Kappa", "Indication")) kappaFrame[[name]] <- as.numeric()

inds <- colnames(dis)[which(colnames(dis)=="none"):which(colnames(dis)=="unsure")]

batches <- levels(factor(dis$batch.id))



for (bat in batches){
  for (ind in inds){
        kappaFrame <- agreement(dis, ind, c("ETM", "JTW"), bat, kappaFrame)
    }
}


for (bat in batches){
  for (ind in inds){
        kappaFrame <- agreement(dis, ind, c("JFF", "JWW"), bat, kappaFrame)
    }
}

#Batches to dates
kappaFrame$Batch <- as.Date(kappaFrame$Batch, format = "%d%b%y")

#ejFrame <- kappaFrame[(kappaFrame$Operators == "ETM & JTW"),]
#jjFrame <- kappaFrame[(kappaFrame$Operators == "JFF & JWW"),]

kap <- read.csv(list.files()[grepl("Kappa", list.files())],
              header = T, stringsAsFactors = F)

kap

##    Operators     Kappa                                    Indication
## 1  ETM & JTW 0.8341776                               Advanced.Cancer
## 2  ETM & JTW 0.8199485                        Advanced.Heart.Disease
## 3  ETM & JTW 0.8053511                         Advanced.Lung.Disease
## 4  ETM & JTW 0.8557267                                 Alcohol.Abuse
## 5  ETM & JTW 0.7136757              Chronic.Neurological.Dystrophies
## 6  ETM & JTW 0.8327701                     Chronic.Pain.Fibromyalgia
## 7  ETM & JTW 0.9518131                                      Dementia
## 8  ETM & JTW 0.8534775                                    Depression
## 9  ETM & JTW 1.0000000               Developmental.Delay.Retardation
## 10 ETM & JTW 0.7775651                                 Non.Adherence
## 11 ETM & JTW 0.8379444                                          None
## 12 ETM & JTW 0.9406413                                       Obesity
## 13 ETM & JTW 0.8619174                         Other.Substance.Abuse
## 14 ETM & JTW 0.9081054 Schizophrenia.and.other.Psychiatric.Disorders
## 15 ETM & JTW 0.6266989                                        Unsure
## 16 JFF & JWW 0.8688963                               Advanced.Cancer
## 17 JFF & JWW 0.5611655                        Advanced.Heart.Disease
## 18 JFF & JWW 0.6538844                         Advanced.Lung.Disease
## 19 JFF & JWW 0.8417000                                 Alcohol.Abuse
## 20 JFF & JWW 0.7002536              Chronic.Neurological.Dystrophies
## 21 JFF & JWW 0.7305602                     Chronic.Pain.Fibromyalgia
## 22 JFF & JWW 0.9466168                                      Dementia
## 23 JFF & JWW 0.9450693                                    Depression
## 24 JFF & JWW 0.8685472               Developmental.Delay.Retardation
## 25 JFF & JWW 0.6935544                                 Non.Adherence
## 26 JFF & JWW 0.6699291                                          None
## 27 JFF & JWW 0.9676517                                       Obesity
## 28 JFF & JWW 0.8816716                         Other.Substance.Abuse
## 29 JFF & JWW 0.9396331 Schizophrenia.and.other.Psychiatric.Disorders
## 30 JFF & JWW 0.5053215                                        Unsure

Cooccurence matrices of labels (label correlations)

Define a function to generate these data.

library("reshape2")
library("ggplot2")


comat <- function(dat){
  #Find indices
  inds <- colnames(dat)[which(colnames(dat)=="none"):which(colnames(dat)=="unsure")]
  
  #Subset indications
  dat <- dat[,inds]
  
  #Find correlation, pearson by default, can have method = "pearson", "kendall", or "spearman"
  cormat <- cor(dat, method = "pearson")
  
  get_upper_tri <- function(cormat){
    cormat[lower.tri(cormat)]<- NA
    return(cormat)
  }
  
  #The matrix is symmetric across the diagonal so only one half is necessary
  upper_tri <- get_upper_tri(cormat)
  
  #Remove the diagnal values (they will all be perfectly correlated)
  diag(upper_tri) <- NA

  #Melt for long data
  melted_cormat <- melt(upper_tri, na.rm = TRUE)
  
  colnames(melted_cormat) <- c("Indication.1", "Indication.2", "Value")

  return(melted_cormat)
  
}

Discharge Summaries

melted_cormat <- comat(dis)


ggplot(data = melted_cormat, aes(Indication.1, Indication.2, fill = Value))+
 geom_tile(color = "white") +
 scale_fill_gradient2(low = "blue", high = "red", mid = "white", 
   midpoint = 0, limit = c(-1,1), space = "Lab", 
   name="Pearson\nCorrelation") +
  theme_minimal()+ 
 theme(axis.text.x = element_text(angle = 45, vjust = 1, 
    size = 10, hjust = 1)) + labs(title = "Correlation Matrix of Indications\n (Discharge Summaries)") +
 coord_fixed()

Nursing Notes

melted_cormat <- comat(nsn)


ggplot(data = melted_cormat, aes(Indication.1, Indication.2, fill = Value))+
 geom_tile(color = "white") +
 scale_fill_gradient2(low = "blue", high = "red", mid = "white", 
   midpoint = 0, limit = c(-1,1), space = "Lab", 
   name="Pearson\nCorrelation") +
  theme_minimal()+ 
 theme(axis.text.x = element_text(angle = 45, vjust = 1, 
    size = 10, hjust = 1)) + labs(title = "Correlation Matrix of Indications\n (Nursing Notes)") +
 coord_fixed()