Load data

library("data.table")
# Read the NOTES_STAYS_ADM_PAT extract with data.table::fread().
# TRUE/FALSE are spelled out: T and F are ordinary variables that can be
# reassigned, so relying on them is fragile.
dat <- fread(
  "~/goals_of_care/external_validation/NOTES_STAYS_ADM_PAT.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)
## 
Read 0.0% of 385794 rows
Read 5.2% of 385794 rows
Read 10.4% of 385794 rows
Read 15.6% of 385794 rows
Read 20.7% of 385794 rows
Read 25.9% of 385794 rows
Read 31.1% of 385794 rows
Read 33.7% of 385794 rows
Read 38.9% of 385794 rows
Read 169621 rows and 44 (of 44) columns from 1.243 GB file in 00:00:19
# Derived timing columns, added to `dat` by reference.
# NOTE(review): both subtract ADMITTIME directly, so they are only meaningful
# if DOD, ADMITTIME and the numeric chart date share one unit (presumably
# days since 1970-01-01, which is what as.numeric(as.Date(...)) yields) --
# TODO confirm against the extract.

# Days until death: DOD minus ADMITTIME
dat[, DAYS_UNTIL_DEATH := DOD - ADMITTIME]

# Time since admission: numeric CHARTDATE minus ADMITTIME
dat[, TIME_SINCE_ADMIT :=
  as.numeric(as.Date(CHARTDATE, "%Y-%m-%d")) - ADMITTIME]

Selection Process

# Baseline: number of distinct patients before any exclusion is applied.
uniqueN(dat[["SUBJECT_ID"]])
## [1] 7564
# 7564 potentially eligible patients

# Keep rows with AGE of 75 or more. Rows whose AGE is NA are removed as well,
# because data.table treats NA in a logical row filter as FALSE.
dat <- dat[AGE >= 75]

uniqueN(dat[["SUBJECT_ID"]]) # 5166 patients dropped (7564 -> 2398)
## [1] 2398
# 2398 patients aged 75 or older on admission

# Keep rows where death is recorded 2 or more days (48 hr) after admission.
# NOTE(review): rows with a missing DAYS_UNTIL_DEATH (e.g. no DOD recorded)
# are also removed here, since data.table drops NA in a logical row filter --
# confirm that excluding patients without a date of death is intended.
dat <- dat[DAYS_UNTIL_DEATH >= 2]

uniqueN(dat[["SUBJECT_ID"]]) # 1103 patients dropped (2398 -> 1295)
## [1] 1295
# 1295 patients survived 2 or more days since admission


# Keep only notes whose TIME_SINCE_ADMIT is at most 2. Negative values
# (notes dated before admission) also pass this filter; NA values do not.
dat <- dat[TIME_SINCE_ADMIT <= 2]
uniqueN(dat[["SUBJECT_ID"]]) # 154 patients dropped (1295 -> 1141)
## [1] 1141
# 1141 patients who had notes within two days of admission

# Total note rows remaining after all filters.
dat[, .N]
## [1] 11575
# 11575 notes associated with these patients

# Distinct note texts. This only counts duplicates (1325 of them); `dat`
# itself is not de-duplicated here.
uniqueN(dat[["TEXT"]])
## [1] 10250
# 10250 notes associated with these patients when duplicates are removed

# Restrict the output to the note-level (noteevents) columns, in this order.
notes_out <- dat[, .(
  ROW_ID,
  SUBJECT_ID,
  HADM_ID,
  CHARTDATE,
  CHARTTIME,
  STORETIME,
  CATEGORY,
  DESCRIPTION,
  CGID,
  ISERROR,
  TEXT
)]

# Export step, left disabled:
#write.csv(notes_out, file = "/Users/Edward/Desktop/over_75_cohort_17Jan18.csv", row.names = F)