Libraries

library("tidyverse")
library("tidygraph")
library("ggraph")
library("corrr")
library("rcompanion") #Pairwise nominal test

### For networks:
library("igraph")
library("rgexf")

Utility Functions

allDup

allDup will return all duplicated values, without dropping any. i.e. The directionality of duplicated() is removed by performing the operation on the vector from both directions

allDup <- function(val){
    duplicated(val) | duplicated(val, fromLast = TRUE)
}

plotDat

plotDat() is a convenient plotting function.

plotDat <- function(dat, column, bs, mn, xl, yl){
  tmp <- as.matrix(table(dat[[column]], dat[["CG_DESCRIPTION"]]))
  prop <- prop.table(tmp, margin = 2)#2 for column-wise proportions
  par(mar = c(5.0, 4.0, 4.0, 15), xpd = TRUE)
  barplot(prop, col = cm.colors(length(rownames(prop))), beside = bs,width = 2, main = mn, xlab = xl, ylab = yl)
  legend("topright", inset = c(-0.90,0), fill = cm.colors(length(rownames(prop))), legend=rownames(prop))
}

detach_packages

detach_packages will keep only base R packages, and will remove all other supplementary packages to avoid functional conflicts.

detach_packages <- function(){
    
    basic.packages <- c("package:stats","package:graphics","package:grDevices","package:utils","package:datasets","package:methods","package:base")
    
    package.list <- search()[ifelse(unlist(gregexpr("package:", search())) == 1, TRUE , FALSE)]
    
    package.list <- setdiff(package.list, basic.packages)
    
    if (length(package.list) > 0 )  for (package in package.list) detach(package, character.only=TRUE)
    
}

Load Data & Initial Cleaning

Load NQF care measure cohort (drawn directly from NOTEEVENTS table), load CAREGIVERS, ADMISSIONS, PATIENTS, and ICUSTAYS for additional data.

## Load Labeled Note Data for NQF Caremeasure Cohort (From NOTEEVENTS table)
dat <- read.csv("~/nqf_caregivers/data/note_labels_over75.csv", header = T, stringsAsFactors = F)

## Remove X and note_name (artifact indexing columns) from dat
dat$X <- NULL
dat$note_name <- NULL

Note: FAM, CIM, LIM, CAR and COD refer to human annotations, and will be dropped, as we are intersted in the .machine annotations from NeuroNER.

## Remove manual annotations
dat$FAM <- NULL
dat$CIM <- NULL
dat$CIM_post <- NULL
dat$LIM <- NULL
dat$CAR <- NULL
dat$COD <- NULL

## Load CAREGIVERS Table for join on CGID
cg <- read.csv("~/nqf_caregivers/data/mimic/CAREGIVERS.csv", 
               header = T, stringsAsFactors = F)

## Load ADMISSIONS Table to join on HADM_ID
adm <- read.csv("~/nqf_caregivers/data/mimic/ADMISSIONS.csv", 
                header = T, stringsAsFactors = F)

## Load PATIENTS Table to join on SUBJECT_ID
pat <- read.csv("~/nqf_caregivers/data/mimic/PATIENTS.csv", 
                header = T, stringsAsFactors = F)

## Load ICUSTAYS Table to join on SUBJECT_ID, HADM_ID
stays <- read.csv("~/nqf_caregivers/data/mimic/ICUSTAYS.csv", 
                  header = T, stringsAsFactors = F)

Count Words and Characters

##Word count
dat$WORD_COUNT <- stringr::str_count(dat$TEXT, "\\W+")

## Count characters
dat$NCHAR <- nchar(dat$TEXT)

plot(dat$WORD_COUNT, dat$NCHAR, 
     main = "Character Count as a Function of Word Count",
     xlab = "Word Count",
     ylab = "Character Count")

cor(dat$WORD_COUNT, dat$NCHAR)
## [1] 0.9914678

Unsurprisingly, word count and character count are correlated.

Clean and Merge

In MIMIC-III, ROW_ID is an index used for each table, and DESCRIPTION is also a common variable for each table.

  1. Remove ROW_ID from all tables accept dat (from NOTEEVENTS)
  2. dat to CAREGIVERS on CGID
  3. dat to ADMISSIONS on HADM_ID
  4. dat to PATIENTS on SUBJECT_ID
  5. dat to ICUSTAYS on SUBJECT_ID and HADM_ID
  6. Remove duplicates resulting from the Cartesian Product created during the join.
  7. Restrict analysis to ATTENDING and Resident/Fellow/PA/NP
## Change column name of "DESCRIPTION" to explicitly mention that it describes the note
colnames(dat)[which(colnames(dat) == "DESCRIPTION")] <- "NOTE_DESCRIPTION"

## Change column name of "DESCRIPTION" to explicitly mention that it describes the caregiver
colnames(cg)[which(colnames(cg) == "DESCRIPTION")] <- "CG_DESCRIPTION"

## (1)
cg$ROW_ID <- NULL
adm$ROW_ID <- NULL
pat$ROW_ID <- NULL
stays$ROW_ID <- NULL

dim(dat)
## [1] 11575    19
## (2)
dat <- merge(dat, cg, by = "CGID")
dim(dat)
## [1] 11575    21
## (3)
dat <- merge(dat, adm, by = "HADM_ID")
dim(dat)
## [1] 11575    38
## This has duplicated SUBJECT_ID
identical(dat$SUBJECT_ID.x, dat$SUBJECT_ID.y)
## [1] TRUE
## Remove one
dat$SUBJECT_ID.y <- NULL

## Rename the other
colnames(dat)[which(colnames(dat) == "SUBJECT_ID.x")] <- "SUBJECT_ID"
dim(dat)
## [1] 11575    37
## (4)
dat <- merge(dat, pat, by = "SUBJECT_ID")
dim(dat)
## [1] 11575    43
## (5)
dat <- merge(dat, stays, by = c("SUBJECT_ID", "HADM_ID"))
dim(dat)
## [1] 13369    52
## (6)
dat <- dat[!duplicated(dat), ]
dim(dat)
## [1] 11575    52
## Cleaning Environment
rm(adm, pat, stays)#cg, stays)
gc()
##            used  (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells  1997659 106.7    3886542 207.6  2983941 159.4
## Vcells 12037774  91.9   28511622 217.6 27543236 210.2
## Write
#write.csv(dat, file = "~/nqf_caregivers/data/note_labels_over75_merged12May18.csv", row.names = F)

## (7)
dat <- dat[(dat$CG_DESCRIPTION == "Attending" | 
                dat$CG_DESCRIPTION == "Resident/Fellow/PA/NP"), ]
dim(dat)
## [1] 11461    52
## Write
#write(...)

Note: the merge() method we used is an inner join, which is a matrix manipulation that generates the Cartesian Product of two data matrices. The data frame will expand when joined to ICUSTAYS because a single hospital admission can be associated with a number of ICUSTAYS if the patient is transferred to the floor and back.

Check Variables

colnames(dat)
##  [1] "SUBJECT_ID"           "HADM_ID"              "CGID"                
##  [4] "ROW_ID"               "CHARTDATE"            "CHARTTIME"           
##  [7] "STORETIME"            "CATEGORY"             "NOTE_DESCRIPTION"    
## [10] "ISERROR"              "TEXT"                 "FAM.machine"         
## [13] "CIM.machine"          "LIM.machine"          "CAR.machine"         
## [16] "COD.machine"          "CIM_post.machine"     "WORD_COUNT"          
## [19] "NCHAR"                "LABEL"                "CG_DESCRIPTION"      
## [22] "ADMITTIME"            "DISCHTIME"            "DEATHTIME"           
## [25] "ADMISSION_TYPE"       "ADMISSION_LOCATION"   "DISCHARGE_LOCATION"  
## [28] "INSURANCE"            "LANGUAGE"             "RELIGION"            
## [31] "MARITAL_STATUS"       "ETHNICITY"            "EDREGTIME"           
## [34] "EDOUTTIME"            "DIAGNOSIS"            "HOSPITAL_EXPIRE_FLAG"
## [37] "HAS_CHARTEVENTS_DATA" "GENDER"               "DOB"                 
## [40] "DOD"                  "DOD_HOSP"             "DOD_SSN"             
## [43] "EXPIRE_FLAG"          "ICUSTAY_ID"           "DBSOURCE"            
## [46] "FIRST_CAREUNIT"       "LAST_CAREUNIT"        "FIRST_WARDID"        
## [49] "LAST_WARDID"          "INTIME"               "OUTTIME"             
## [52] "LOS"

Time Conversion

We will want to convert CHARTDATE, representing the date the which the note was charted, to numeric from YYYY-MM-DD format. We will also want ICUSTAYS.INTIME in a similar format. The integer that results will be days since 1970-01-01, per ?as.Date.

## From ``
dat$CHARTDATE <- as.numeric(as.Date(dat$CHARTDATE, "%Y-%m-%d"))

## From ``
dat$STORETIME <- as.numeric(as.Date(dat$STORETIME, "%Y-%m-%d %H:%M:%S"))

## from ICUSTAYS
dat$INTIME <- as.numeric(as.Date(dat$INTIME, "%Y-%m-%d %H:%M:%S"))

SUBJECT_ID and CGID Conversion

SUBJECT_ID and CGID are integer values that may have overlap. To deal with this, we will add a character prefix to their integer values.

dat$SUBJECT_ID <- paste0("SUBJECT_", dat$SUBJECT_ID)
dat$CGID <- paste0("CG_", dat$CGID)

FIRST_CAREUNIT and LAST_CAREUNIT

While still learning Gephi, I will be converting some factors to counts, expanding the data frame out to be more sparse but to contain more variables. We will use xtabs() (Cross- or Pivot-tables) to count the number of occurrences of SUBJECT_ID and CGID pairs meeting in each particular care unit.

## First Care Unit Weights
## Make a cross table of SUBJECT_ID/CGID relative to CAREUNIT
tmp_tab <- xtabs( ~ paste(SUBJECT_ID, CGID) + FIRST_CAREUNIT, data = dat)

## Bind columns containing SUBJECT_ID and CGID to LOC columns
## (after splitting SUBJECT_IDs/CGIDs on spaces) to the cross table
first_care_res <- as.data.frame(cbind(t(matrix(unlist(strsplit(rownames(tmp_tab), ' ')), 
                                               nrow = 2)), tmp_tab), stringsAsFactors = F)
## Correct column names
colnames(first_care_res) <- c("SUBJECT_ID", 
                              "CGID", 
                              "FIRST_CCU", 
                              "FIRST_CSRU", 
                              "FIRST_MICU",
                              "FIRST_SICU",
                              "FIRST_TSICU")
## Remove row names
row.names(first_care_res) <- NULL

## Convert characters to numeric
first_care_res$FIRST_CCU <- as.numeric(first_care_res$FIRST_CCU)
first_care_res$FIRST_CSRU <- as.numeric(first_care_res$FIRST_CSRU)
first_care_res$FIRST_MICU <- as.numeric(first_care_res$FIRST_MICU)
first_care_res$FIRST_SICU <- as.numeric(first_care_res$FIRST_SICU)
first_care_res$FIRST_TSICU <- as.numeric(first_care_res$FIRST_TSICU)

## Look
head(first_care_res)
##      SUBJECT_ID     CGID FIRST_CCU FIRST_CSRU FIRST_MICU FIRST_SICU
## 1 SUBJECT_10134 CG_15165         0          0          1          0
## 2 SUBJECT_10134 CG_16383         0          0          1          0
## 3 SUBJECT_10134 CG_18189         0          0          3          0
## 4 SUBJECT_10134 CG_19458         0          0          4          0
## 5 SUBJECT_10134 CG_19692         0          0          2          0
## 6 SUBJECT_10207 CG_14080         0          0          2          0
##   FIRST_TSICU
## 1           0
## 2           0
## 3           0
## 4           0
## 5           0
## 6           0
## Last Care Unit Weights
## Make a cross table of SUBJECT_ID/CGID relative to CAREUNIT
tmp_tab <- xtabs( ~ paste(SUBJECT_ID, CGID) + LAST_CAREUNIT, data = dat)

## Bind columns containing SUBJECT_ID and CGID to LOC columns
## (after splitting SUBJECT_IDs/CGIDs on spaces) to the cross table
last_care_res <- as.data.frame(cbind(t(matrix(unlist(strsplit(rownames(tmp_tab), ' ')), 
                                               nrow = 2)), tmp_tab), stringsAsFactors = F)
## Correct column names
colnames(last_care_res) <- c("SUBJECT_ID", 
                              "CGID", 
                              "LAST_CCU", 
                              "LAST_CSRU", 
                              "LAST_MICU",
                              "LAST_SICU",
                              "LAST_TSICU")
## Remove row names
row.names(last_care_res) <- NULL

## Clean data types
last_care_res$LAST_CCU <- as.numeric(last_care_res$LAST_CCU)
last_care_res$LAST_CSRU <- as.numeric(last_care_res$LAST_CSRU)
last_care_res$LAST_MICU <- as.numeric(last_care_res$LAST_MICU)
last_care_res$LAST_SICU <- as.numeric(last_care_res$LAST_SICU)
last_care_res$LAST_TSICU <- as.numeric(last_care_res$LAST_TSICU)

## Merge
careunits <- merge(first_care_res, last_care_res, by = c("SUBJECT_ID", "CGID"))

## Clean Environment
rm(first_care_res, last_care_res, tmp_tab)

Duplicate Note/Provider Investigation (INCOMPLETE)

Some notes in MIMIC are exact duplicates; this includes notes from within our NQF Care Measure Cohort. Previously, they were removed because we were only interested in their unique content. Now we are interested in their relation to the caregivers who documented them, so we will investigate.

## Subset *all* duplicated texts and other data
tmp <- dat[allDup(dat$TEXT),c("CGID",
                              "SUBJECT_ID",
                              "CG_DESCRIPTION",
                              "LABEL",
                              "NOTE_DESCRIPTION",
                              "INTIME",
                              "STORETIME")]

## Investigate the first 25 observations
head(tmp, 25)
##         CGID    SUBJECT_ID        CG_DESCRIPTION LABEL
## 19  CG_18917 SUBJECT_10302             Attending   MDs
## 29  CG_18917 SUBJECT_10302             Attending   MDs
## 55  CG_17843 SUBJECT_10820 Resident/Fellow/PA/NP   Res
## 60  CG_18326 SUBJECT_10820 Resident/Fellow/PA/NP   Res
## 63  CG_21245 SUBJECT_10820 Resident/Fellow/PA/NP   Res
## 64  CG_18326 SUBJECT_10820 Resident/Fellow/PA/NP   Res
## 69  CG_18152 SUBJECT_10820 Resident/Fellow/PA/NP   Res
## 80  CG_19165 SUBJECT_10820 Resident/Fellow/PA/NP   Res
## 82  CG_18152 SUBJECT_10820 Resident/Fellow/PA/NP   Res
## 93  CG_19229 SUBJECT_11236 Resident/Fellow/PA/NP   Res
## 94  CG_19229 SUBJECT_11236 Resident/Fellow/PA/NP   Res
## 95  CG_18537 SUBJECT_11236 Resident/Fellow/PA/NP   MDs
## 96  CG_18537 SUBJECT_11236 Resident/Fellow/PA/NP   MDs
## 97  CG_18537 SUBJECT_11236 Resident/Fellow/PA/NP   MDs
## 98  CG_18537 SUBJECT_11236 Resident/Fellow/PA/NP   MDs
## 99  CG_19229 SUBJECT_11236 Resident/Fellow/PA/NP   Res
## 100 CG_19229 SUBJECT_11236 Resident/Fellow/PA/NP   Res
## 105 CG_19229 SUBJECT_11236 Resident/Fellow/PA/NP   Res
## 106 CG_19229 SUBJECT_11236 Resident/Fellow/PA/NP   Res
## 107 CG_19229 SUBJECT_11236 Resident/Fellow/PA/NP   Res
## 108 CG_19229 SUBJECT_11236 Resident/Fellow/PA/NP   Res
## 109 CG_19229 SUBJECT_11236 Resident/Fellow/PA/NP   Res
## 110 CG_19229 SUBJECT_11236 Resident/Fellow/PA/NP   Res
## 113 CG_18537 SUBJECT_11236 Resident/Fellow/PA/NP   MDs
## 114 CG_18537 SUBJECT_11236 Resident/Fellow/PA/NP   MDs
##                       NOTE_DESCRIPTION INTIME STORETIME
## 19  Physician Attending Admission Note  66306     66307
## 29  Physician Attending Admission Note  66306     66307
## 55    Physician Resident Progress Note  73711     73713
## 60   Physician Resident Admission Note  73711     73711
## 63    Physician Resident Progress Note  73711     73713
## 64   Physician Resident Admission Note  73711     73711
## 69    Physician Resident Progress Note  73656     73658
## 80    Physician Resident Progress Note  73656     73658
## 82    Physician Resident Progress Note  73656     73658
## 93    Physician Resident Progress Note  82062     82058
## 94    Physician Resident Progress Note  82057     82058
## 95   Physician Resident Admission Note  82062     82057
## 96   Physician Resident Admission Note  82057     82057
## 97   Physician Resident Admission Note  82062     82057
## 98   Physician Resident Admission Note  82057     82057
## 99    Physician Resident Progress Note  82062     82058
## 100   Physician Resident Progress Note  82057     82058
## 105   Physician Resident Progress Note  82062     82058
## 106   Physician Resident Progress Note  82057     82058
## 107   Physician Resident Progress Note  82062     82058
## 108   Physician Resident Progress Note  82057     82058
## 109   Physician Resident Progress Note  82062     82059
## 110   Physician Resident Progress Note  82057     82059
## 113  Physician Resident Admission Note  82062     82057
## 114  Physician Resident Admission Note  82057     82057

We will need to discover the best way to deal with those. Some have different STORETIMEs and CGIDs, others only have different STORETIMEs. Others appear to be properly duplicated entries.

One strategy may be to keep only those duplicate notes with different CGIDs. Also, look into notes that have attending & residents. Consider asking MIMIC-III DB Admins about how the CG_DESCRIPTION is determined, and which is more accurate (DESCRIPTION vs. LABEL)

Exploration

table(dat$CG_DESCRIPTION)
## 
##             Attending Resident/Fellow/PA/NP 
##                  3546                  7915
table(dat$LABEL)
## 
##   1390   9596    eaw HMS MS     MD    Mds    MDs    MDS Med St MedRes 
##     40     10     39     22   4051     16   2536    105     11      1 
##  MedSt     ms     MS     NP     PA    PHD    Res     RF     RN    Std 
##     45     35    131     27     35     94   4075     28     35     21 
##    STD Studen 
##     55     49
head(table(dat$NOTE_DESCRIPTION)[rev(order(table(dat$NOTE_DESCRIPTION)))], 10)
## 
##                  Physician Resident Progress Note 
##                                              4754 
##                 Physician Resident Admission Note 
##                                              1874 
##                 Physician Attending Progress Note 
##                                              1506 
##                                  Intensivist Note 
##                                              1372 
##         Physician Attending Admission Note - MICU 
##                                               507 
##                Physician Attending Admission Note 
##                                               174 
##                 Physician Surgical Admission Note 
##                                               131 
## Physician Resident/Attending Progress Note - MICU 
##                                               117 
##                                    ICU Note - CVI 
##                                               112 
##                         Cardiology Physician Note 
##                                                82
cat("There are", length(unique(dat$SUBJECT_ID)), "unique patients in this cohort.\n")
## There are 1141 unique patients in this cohort.
cat("There are", length(unique(dat$HADM_ID)), "unique hospital admissions associated with this cohort.\n")
## There are 1350 unique hospital admissions associated with this cohort.
cat("There are", length(unique(dat$CGID)), "unique caregivers associated with this cohort.\n")
## There are 467 unique caregivers associated with this cohort.

Network Analysis

For now, we will restrict our analysis to the clinicians’ ability to capture measures within the first 48 hours of an ICU admission, including:

  1. Documentation of family meetings FAM
  2. Documentation of code status limitations LIM
  3. Documentation of patient/family care preferences CAR
  4. Documentation of code status limitations and documentation of care preferences CIM
  5. Documentation of overall Code Status COD
  6. Documentation of I can’t remember which this one is CIM_post

We can collapse the data frame after subsetting, so we have every vertex, or node, represented in either CGID or SUBJECT_ID, and every edge is a row (or CGID and SUBJECT_ID) relation.

Since the documentation status is binary, we can take the average ussing aggregate(), which will give us the percentage of times, in the first 48 hours, the each concept was documented by the clinician.

## Subset numeric data
tmp <- dat[ ,c("CGID", 
               "SUBJECT_ID",
               "FAM.machine",
               "CIM.machine",
               "LIM.machine",
               "CAR.machine",
               "COD.machine",
               "CIM_post.machine",
               "WORD_COUNT",
               "NCHAR")]

## #######################################################################################
## This aggregation method shows ALL relationships (edges, or vertices) between caregivers 
## and patients, as well as the percentage of measures documented in their interactions
## #######################################################################################
tmp <- aggregate(cbind(FAM.machine, 
                       CIM.machine, 
                       LIM.machine, 
                       CAR.machine, 
                       COD.machine, 
                       CIM_post.machine,
                       WORD_COUNT,
                       NCHAR) ~ 
                     CGID + 
                     SUBJECT_ID, 
                 data = tmp, 
                 FUN = mean) ## Apply the mean to all columns

## Look
dim(tmp)
## [1] 5088   10
head(tmp)
##       CGID    SUBJECT_ID FAM.machine CIM.machine LIM.machine CAR.machine
## 1 CG_15165 SUBJECT_10134   1.0000000         1.0         1.0   1.0000000
## 2 CG_16383 SUBJECT_10134   0.0000000         1.0         1.0   1.0000000
## 3 CG_18189 SUBJECT_10134   0.6666667         1.0         1.0   0.3333333
## 4 CG_19458 SUBJECT_10134   1.0000000         1.0         1.0   1.0000000
## 5 CG_19692 SUBJECT_10134   1.0000000         1.0         1.0   1.0000000
## 6 CG_14080 SUBJECT_10207   0.5000000         0.5         0.5   0.0000000
##   COD.machine CIM_post.machine WORD_COUNT    NCHAR
## 1         0.0              1.0    1243.00 9398.000
## 2         0.0              1.0     926.00 6835.000
## 3         0.0              1.0     839.00 6330.667
## 4         1.0              1.0     688.25 5256.750
## 5         0.5              1.0    1244.00 8997.500
## 6         0.0              0.5     986.50 7320.500

Network

## Nodes will be all unique subjects and caregivers
network_nodes <- c(unique(dat$SUBJECT_ID), unique(dat$CGID))
## Look
head(network_nodes)
## [1] "SUBJECT_10134" "SUBJECT_10207" "SUBJECT_10302" "SUBJECT_10502"
## [5] "SUBJECT_10757" "SUBJECT_10820"
##Generate edges to show relationships
network_edges <- merge(tmp, careunits, by = c("SUBJECT_ID", "CGID"))

## Look
head(network_edges)
##      SUBJECT_ID     CGID FAM.machine CIM.machine LIM.machine CAR.machine
## 1 SUBJECT_10134 CG_15165   1.0000000         1.0         1.0   1.0000000
## 2 SUBJECT_10134 CG_16383   0.0000000         1.0         1.0   1.0000000
## 3 SUBJECT_10134 CG_18189   0.6666667         1.0         1.0   0.3333333
## 4 SUBJECT_10134 CG_19458   1.0000000         1.0         1.0   1.0000000
## 5 SUBJECT_10134 CG_19692   1.0000000         1.0         1.0   1.0000000
## 6 SUBJECT_10207 CG_14080   0.5000000         0.5         0.5   0.0000000
##   COD.machine CIM_post.machine WORD_COUNT    NCHAR FIRST_CCU FIRST_CSRU
## 1         0.0              1.0    1243.00 9398.000         0          0
## 2         0.0              1.0     926.00 6835.000         0          0
## 3         0.0              1.0     839.00 6330.667         0          0
## 4         1.0              1.0     688.25 5256.750         0          0
## 5         0.5              1.0    1244.00 8997.500         0          0
## 6         0.0              0.5     986.50 7320.500         0          0
##   FIRST_MICU FIRST_SICU FIRST_TSICU LAST_CCU LAST_CSRU LAST_MICU LAST_SICU
## 1          1          0           0        0         0         1         0
## 2          1          0           0        0         0         1         0
## 3          3          0           0        0         0         3         0
## 4          4          0           0        0         0         4         0
## 5          2          0           0        0         0         2         0
## 6          2          0           0        0         0         2         0
##   LAST_TSICU
## 1          0
## 2          0
## 3          0
## 4          0
## 5          0
## 6          0
gD <- graph_from_data_frame(d = network_edges, vertices = network_nodes, directed = FALSE)

# Check the attributes
summary(gD)
## IGRAPH 0cee97d UN-- 1608 5088 -- 
## + attr: name (v/c), FAM.machine (e/n), CIM.machine (e/n),
## | LIM.machine (e/n), CAR.machine (e/n), COD.machine (e/n),
## | CIM_post.machine (e/n), WORD_COUNT (e/n), NCHAR (e/n), FIRST_CCU
## | (e/n), FIRST_CSRU (e/n), FIRST_MICU (e/n), FIRST_SICU (e/n),
## | FIRST_TSICU (e/n), LAST_CCU (e/n), LAST_CSRU (e/n), LAST_MICU
## | (e/n), LAST_SICU (e/n), LAST_TSICU (e/n)
## Create node dataframe

nodes_df <- data.frame(ID = c(1:vcount(gD)), NAME = V(gD)$name)

nodes_df$NAME <- as.character(nodes_df$NAME)

## Create a data.frame edges: 1st column = source node ID
##                            2nd column = target node ID
## (If direction is of interest)
edges_df <- as.data.frame(get.edges(gD, c(1:ecount(gD))))

## Create node attributes
nodes_att <- as.data.frame(cbind(network_nodes, 
                                 ifelse(nodes_df$NAME %in% dat$SUBJECT_ID, "PATIENT", "PROVIDER")))

## Correct Column names
colnames(nodes_att) <- c("NODE", "IDENTITY")

## Convert to Character (from factor)
nodes_att$NODE <- as.character(nodes_att$NODE)
nodes_att$IDENTITY <- as.character(nodes_att$IDENTITY)

## Calculate identity in the form of: PATIENTS/ATTENDING/RESIDENTS/etc.
for (i in 1:nrow(nodes_att)){
    if (nodes_df$NAME[i] %in% dat$CGID){
        nodes_att$IDENTITY[i] <- dat[dat$CGID == nodes_df$NAME[i], ]$CG_DESCRIPTION[1]
    }
}

## Create a dataframe with edge attributes, called edges_att
edges_att <- network_edges

## Aggregate again to generate avereage statistics for all CAREGIVERS
tmp <- aggregate(cbind(FAM.machine, 
                       CIM.machine, 
                       LIM.machine, 
                       CAR.machine, 
                       COD.machine, 
                       CIM_post.machine,
                       WORD_COUNT,
                       NCHAR) ~ 
                     CGID,
                 data = edges_att, 
                 FUN = mean)

## Take a look
head(tmp)
##       CGID FAM.machine CIM.machine LIM.machine CAR.machine COD.machine
## 1 CG_14010   0.2000000   0.3600000   0.3600000   0.3200000   0.5333333
## 2 CG_14022   0.0000000   0.0000000   0.0000000   0.0000000   0.0000000
## 3 CG_14037   0.2363636   0.2666667   0.2666667   0.1636364   0.6727273
## 4 CG_14045   0.4362745   0.5490196   0.4705882   0.3725490   0.6862745
## 5 CG_14056   0.2592593   0.5555556   0.4444444   0.4444444   0.5555556
## 6 CG_14080   0.1666667   0.1666667   0.1666667   0.0000000   0.6666667
##   CIM_post.machine WORD_COUNT     NCHAR
## 1        0.3600000   830.5833  6212.387
## 2        0.0000000   436.0000  3791.000
## 3        0.3151515   859.4727  6433.124
## 4        0.5490196  1070.6667  8107.373
## 5        0.6666667  1387.1852 10180.796
## 6        0.1666667  1231.3333  8912.667
dim(tmp)
## [1] 467   9
## Convert for Gephi
nodes_att$WORD_COUNT <- ifelse((as.character(nodes_df$NAME) %in% tmp$CGID),
                               edges_att$WORD_COUNT, 0)

nodes_att$CHAR_COUNT <- ifelse((as.character(nodes_df$NAME) %in% tmp$CGID), 
                               edges_att$NCHAR, 0)

nodes_att$FAM <- ifelse((as.character(nodes_df$NAME) %in% tmp$CGID), edges_att$FAM.machine, 0)

nodes_att$COD <- ifelse((as.character(nodes_df$NAME) %in% tmp$CGID), edges_att$COD.machine, 0)

nodes_att$LIM <- ifelse((as.character(nodes_df$NAME) %in% tmp$CGID), edges_att$LIM.machine, 0)

nodes_att$CAR <- ifelse((as.character(nodes_df$NAME) %in% tmp$CGID), edges_att$CAR.machine, 0)

nodes_att$CIM <- ifelse((as.character(nodes_df$NAME) %in% tmp$CGID), edges_att$CIM.machine, 0)

nodes_att$CIM_post <- ifelse((as.character(nodes_df$NAME) %in% tmp$CGID),
                             edges_att$CIM_post.machine, 0)


###########################################################################################
## ICU NOTE IS MOST FREQUENT COLUMN NAME FOR NOTE DOCUMENTATION
## THINK ABOUT USING FIRST DOCUMENTATION? OTHER METRIC?

edges_att$FIRST_ICU <- colnames(edges_att)[11:15][apply(edges_att[,11:15], 1 , which.max)]

edges_att$LAST_ICU <- colnames(edges_att)[16:20][apply(edges_att[,16:20], 1, which.max)]
###########################################################################################


# Write the network into a gexf (Gephi) file
#write.gexf(nodes = nodes_df, 
#           edges = edges_df, 
#           nodesAtt = nodes_att, 
#           edgesAtt = edges_att, 
#           output = "/Users/kai-ou/Desktop/NQF_cohort_test15May18.gexf")

Character and Word Counts

boxplot(dat$NCHAR ~ dat$CG_DESCRIPTION, 
        main = "Character Count By Clinician",
        xlab = "Clinician",
        ylab = "Character Count")

t.test(dat$NCHAR ~ dat$CG_DESCRIPTION)
## 
##  Welch Two Sample t-test
## 
## data:  dat$NCHAR by dat$CG_DESCRIPTION
## t = -8.1483, df = 6806.6, p-value = 4.357e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -542.8214 -332.2876
## sample estimates:
##             mean in group Attending mean in group Resident/Fellow/PA/NP 
##                            6606.670                            7044.224
wilcox.test(dat$NCHAR ~ dat$CG_DESCRIPTION)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  dat$NCHAR by dat$CG_DESCRIPTION
## W = 12611000, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
boxplot(dat$WORD_COUNT ~ dat$CG_DESCRIPTION, 
        main = "Word Count By Clinician",
        xlab = "Clinician",
        ylab = "Character Count")

t.test(dat$WORD_COUNT ~ dat$CG_DESCRIPTION)
## 
##  Welch Two Sample t-test
## 
## data:  dat$WORD_COUNT by dat$CG_DESCRIPTION
## t = -7.4702, df = 6771.1, p-value = 9.011e-14
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -72.62815 -42.43394
## sample estimates:
##             mean in group Attending mean in group Resident/Fellow/PA/NP 
##                            888.9681                            946.4992
wilcox.test(dat$WORD_COUNT ~ dat$CG_DESCRIPTION)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  dat$WORD_COUNT by dat$CG_DESCRIPTION
## W = 12620000, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0

Overall Rates of Documentation

plotDat(dat, "FAM.machine", bs = F, mn = "FAM Documentation By Caregiver", xl = "Caregiver Group", yl = "Proportion")

test <- table(dat$FAM.machine, dat$CG_DESCRIPTION)
test
##    
##     Attending Resident/Fellow/PA/NP
##   0      2586                  5445
##   1       960                  2470
chisq.test(test)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  test
## X-squared = 19.758, df = 1, p-value = 8.788e-06
pairwiseNominalIndependence(
  as.matrix(test), 
  fisher = F, gtest = F, chisq = T, method = "fdr")
##   Comparison  p.Chisq p.adj.Chisq
## 1      0 : 1 8.79e-06    8.79e-06
plotDat(dat, "CIM.machine", bs = F, mn = "CIM Documentation By Caregiver", xl = "Caregiver Group", yl = "Proportion")

test <- table(dat$CIM.machine, dat$CG_DESCRIPTION)
test
##    
##     Attending Resident/Fellow/PA/NP
##   0      2232                  4624
##   1      1314                  3291
chisq.test(test)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  test
## X-squared = 20.66, df = 1, p-value = 5.486e-06
pairwiseNominalIndependence(
  as.matrix(test), 
  fisher = F, gtest = F, chisq = T, method = "fdr")
##   Comparison  p.Chisq p.adj.Chisq
## 1      0 : 1 5.49e-06    5.49e-06
plotDat(dat, "LIM.machine", bs = F, mn = "LIM Documentation By Caregiver", xl = "Caregiver Group", yl = "Proportion")

test <- table(dat$LIM.machine, dat$CG_DESCRIPTION)
test
##    
##     Attending Resident/Fellow/PA/NP
##   0      2361                  5249
##   1      1185                  2666
chisq.test(test)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  test
## X-squared = 0.065631, df = 1, p-value = 0.7978
pairwiseNominalIndependence(
  as.matrix(test), 
  fisher = F, gtest = F, chisq = T, method = "fdr")
##   Comparison p.Chisq p.adj.Chisq
## 1      0 : 1   0.798       0.798
plotDat(dat, "CAR.machine", bs = F, mn = "CAR Documentation By Caregiver", xl = "Caregiver Group", yl = "Proportion")

test <- table(dat$CAR.machine, dat$CG_DESCRIPTION)
test
##    
##     Attending Resident/Fellow/PA/NP
##   0      2753                  5104
##   1       793                  2811
chisq.test(test)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  test
## X-squared = 195.88, df = 1, p-value < 2.2e-16
pairwiseNominalIndependence(
  as.matrix(test), 
  fisher = F, gtest = F, chisq = T, method = "fdr")
##   Comparison  p.Chisq p.adj.Chisq
## 1      0 : 1 1.66e-44    1.66e-44
plotDat(dat, "COD.machine", bs = F, mn = "COD Documentation By Caregiver", xl = "Caregiver Group", yl = "Proportion")

test <- table(dat$COD.machine, dat$CG_DESCRIPTION)
test
##    
##     Attending Resident/Fellow/PA/NP
##   0      1313                  2782
##   1      2233                  5133
chisq.test(test)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  test
## X-squared = 3.6845, df = 1, p-value = 0.05492
pairwiseNominalIndependence(
  as.matrix(test), 
  fisher = F, gtest = F, chisq = T, method = "fdr")
##   Comparison p.Chisq p.adj.Chisq
## 1      0 : 1  0.0549      0.0549
plotDat(dat, "CIM_post.machine", bs = F, mn = "CIM Documentation By Caregiver", xl = "Caregiver Group", yl = "Proportion")

test <- table(dat$CIM_post.machine, dat$CG_DESCRIPTION)
test
##    
##     Attending Resident/Fellow/PA/NP
##   0      2048                  3995
##   1      1498                  3920
chisq.test(test)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  test
## X-squared = 51.798, df = 1, p-value = 6.151e-13
pairwiseNominalIndependence(
  as.matrix(test), 
  fisher = F, gtest = F, chisq = T, method = "fdr")
##   Comparison  p.Chisq p.adj.Chisq
## 1      0 : 1 6.15e-13    6.15e-13

NOTE: LOOK FOR NOTES WRITTING BY ATTENDING ONLY VS ATTENDING AND RESIDENT