library("tidyverse")
library("tidygraph")
library("ggraph")
library("corrr")
library("rcompanion") #Pairwise nominal test
### For networks:
library("igraph")
library("rgexf")
allDup will return all duplicated values, without dropping any. i.e. The directionality of duplicated() is removed by performing the operation on the vector from both directions
allDup <- function(val){
duplicated(val) | duplicated(val, fromLast = TRUE)
}
plotDat() is a convenient plotting function.
plotDat <- function(dat, column, bs, mn, xl, yl){
tmp <- as.matrix(table(dat[[column]], dat[["CG_DESCRIPTION"]]))
prop <- prop.table(tmp, margin = 2)#2 for column-wise proportions
par(mar = c(5.0, 4.0, 4.0, 15), xpd = TRUE)
barplot(prop, col = cm.colors(length(rownames(prop))), beside = bs,width = 2, main = mn, xlab = xl, ylab = yl)
legend("topright", inset = c(-0.90,0), fill = cm.colors(length(rownames(prop))), legend=rownames(prop))
}
detach_packages will keep only base R packages, and will remove all other supplementary packages to avoid functional conflicts.
detach_packages <- function(){
basic.packages <- c("package:stats","package:graphics","package:grDevices","package:utils","package:datasets","package:methods","package:base")
package.list <- search()[ifelse(unlist(gregexpr("package:", search())) == 1, TRUE , FALSE)]
package.list <- setdiff(package.list, basic.packages)
if (length(package.list) > 0 ) for (package in package.list) detach(package, character.only=TRUE)
}
Load NQF care measure cohort (drawn directly from NOTEEVENTS table), load CAREGIVERS, ADMISSIONS, PATIENTS, and ICUSTAYS for additional data.
## Load Labeled Note Data for NQF Caremeasure Cohort (From NOTEEVENTS table)
dat <- read.csv("~/nqf_caregivers/data/note_labels_over75.csv", header = T, stringsAsFactors = F)
## Remove X and note_name (artifact indexing columns) from dat
dat$X <- NULL
dat$note_name <- NULL
Note: FAM, CIM, LIM, CAR and COD refer to human annotations, and will be dropped, as we are intersted in the .machine annotations from NeuroNER.
## Remove manual annotations
dat$FAM <- NULL
dat$CIM <- NULL
dat$CIM_post <- NULL
dat$LIM <- NULL
dat$CAR <- NULL
dat$COD <- NULL
## Load CAREGIVERS Table for join on CGID
cg <- read.csv("~/nqf_caregivers/data/mimic/CAREGIVERS.csv",
header = T, stringsAsFactors = F)
## Load ADMISSIONS Table to join on HADM_ID
adm <- read.csv("~/nqf_caregivers/data/mimic/ADMISSIONS.csv",
header = T, stringsAsFactors = F)
## Load PATIENTS Table to join on SUBJECT_ID
pat <- read.csv("~/nqf_caregivers/data/mimic/PATIENTS.csv",
header = T, stringsAsFactors = F)
## Load ICUSTAYS Table to join on SUBJECT_ID, HADM_ID
stays <- read.csv("~/nqf_caregivers/data/mimic/ICUSTAYS.csv",
header = T, stringsAsFactors = F)
##Word count
dat$WORD_COUNT <- stringr::str_count(dat$TEXT, "\\W+")
## Count characters
dat$NCHAR <- nchar(dat$TEXT)
plot(dat$WORD_COUNT, dat$NCHAR,
main = "Character Count as a Function of Word Count",
xlab = "Word Count",
ylab = "Character Count")
cor(dat$WORD_COUNT, dat$NCHAR)
## [1] 0.9914678
Unsurprisingly, word count and character count are correlated.
In MIMIC-III, ROW_ID is an index used for each table, and DESCRIPTION is also a common variable for each table.
ROW_ID from all tables accept dat (from NOTEEVENTS)dat to CAREGIVERS on CGIDdat to ADMISSIONS on HADM_IDdat to PATIENTS on SUBJECT_IDdat to ICUSTAYS on SUBJECT_ID and HADM_IDATTENDING and Resident/Fellow/PA/NP## Change column name of "DESCRIPTION" to explicitly mention that it describes the note
colnames(dat)[which(colnames(dat) == "DESCRIPTION")] <- "NOTE_DESCRIPTION"
## Change column name of "DESCRIPTION" to explicitly mention that it describes the caregiver
colnames(cg)[which(colnames(cg) == "DESCRIPTION")] <- "CG_DESCRIPTION"
## (1)
cg$ROW_ID <- NULL
adm$ROW_ID <- NULL
pat$ROW_ID <- NULL
stays$ROW_ID <- NULL
dim(dat)
## [1] 11575 19
## (2)
dat <- merge(dat, cg, by = "CGID")
dim(dat)
## [1] 11575 21
## (3)
dat <- merge(dat, adm, by = "HADM_ID")
dim(dat)
## [1] 11575 38
## This has duplicated SUBJECT_ID
identical(dat$SUBJECT_ID.x, dat$SUBJECT_ID.y)
## [1] TRUE
## Remove one
dat$SUBJECT_ID.y <- NULL
## Rename the other
colnames(dat)[which(colnames(dat) == "SUBJECT_ID.x")] <- "SUBJECT_ID"
dim(dat)
## [1] 11575 37
## (4)
dat <- merge(dat, pat, by = "SUBJECT_ID")
dim(dat)
## [1] 11575 43
## (5)
dat <- merge(dat, stays, by = c("SUBJECT_ID", "HADM_ID"))
dim(dat)
## [1] 13369 52
## (6)
dat <- dat[!duplicated(dat), ]
dim(dat)
## [1] 11575 52
## Cleaning Environment
rm(adm, pat, stays)#cg, stays)
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 1997659 106.7 3886542 207.6 2983941 159.4
## Vcells 12037774 91.9 28511622 217.6 27543236 210.2
## Write
#write.csv(dat, file = "~/nqf_caregivers/data/note_labels_over75_merged12May18.csv", row.names = F)
## (7)
dat <- dat[(dat$CG_DESCRIPTION == "Attending" |
dat$CG_DESCRIPTION == "Resident/Fellow/PA/NP"), ]
dim(dat)
## [1] 11461 52
## Write
#write(...)
Note: the merge() method we used is an inner join, which is a matrix manipulation that generates the Cartesian Product of two data matrices. The data frame will expand when joined to ICUSTAYS because a single hospital admission can be associated with a number of ICUSTAYS if the patient is transferred to the floor and back.
colnames(dat)
## [1] "SUBJECT_ID" "HADM_ID" "CGID"
## [4] "ROW_ID" "CHARTDATE" "CHARTTIME"
## [7] "STORETIME" "CATEGORY" "NOTE_DESCRIPTION"
## [10] "ISERROR" "TEXT" "FAM.machine"
## [13] "CIM.machine" "LIM.machine" "CAR.machine"
## [16] "COD.machine" "CIM_post.machine" "WORD_COUNT"
## [19] "NCHAR" "LABEL" "CG_DESCRIPTION"
## [22] "ADMITTIME" "DISCHTIME" "DEATHTIME"
## [25] "ADMISSION_TYPE" "ADMISSION_LOCATION" "DISCHARGE_LOCATION"
## [28] "INSURANCE" "LANGUAGE" "RELIGION"
## [31] "MARITAL_STATUS" "ETHNICITY" "EDREGTIME"
## [34] "EDOUTTIME" "DIAGNOSIS" "HOSPITAL_EXPIRE_FLAG"
## [37] "HAS_CHARTEVENTS_DATA" "GENDER" "DOB"
## [40] "DOD" "DOD_HOSP" "DOD_SSN"
## [43] "EXPIRE_FLAG" "ICUSTAY_ID" "DBSOURCE"
## [46] "FIRST_CAREUNIT" "LAST_CAREUNIT" "FIRST_WARDID"
## [49] "LAST_WARDID" "INTIME" "OUTTIME"
## [52] "LOS"
We will want to convert CHARTDATE, representing the date the which the note was charted, to numeric from YYYY-MM-DD format. We will also want ICUSTAYS.INTIME in a similar format. The integer that results will be days since 1970-01-01, per ?as.Date.
## From ``
dat$CHARTDATE <- as.numeric(as.Date(dat$CHARTDATE, "%Y-%m-%d"))
## From ``
dat$STORETIME <- as.numeric(as.Date(dat$STORETIME, "%Y-%m-%d %H:%M:%S"))
## from ICUSTAYS
dat$INTIME <- as.numeric(as.Date(dat$INTIME, "%Y-%m-%d %H:%M:%S"))
SUBJECT_ID and CGID are integer values that may have overlap. To deal with this, we will add a character prefix to their integer values.
dat$SUBJECT_ID <- paste0("SUBJECT_", dat$SUBJECT_ID)
dat$CGID <- paste0("CG_", dat$CGID)
While still learning Gephi, I will be converting some factors to counts, expanding the data frame out to be more sparse but to contain more variables. We will use xtabs() (Cross- or Pivot-tables) to count the number of occurrences of SUBJECT_ID and CGID pairs meeting in each particular care unit.
## First Care Unit Weights
## Make a cross table of SUBJECT_ID/CGID relative to CAREUNIT
tmp_tab <- xtabs( ~ paste(SUBJECT_ID, CGID) + FIRST_CAREUNIT, data = dat)
## Bind columns containing SUBJECT_ID and CGID to LOC columns
## (after splitting SUBJECT_IDs/CGIDs on spaces) to the cross table
first_care_res <- as.data.frame(cbind(t(matrix(unlist(strsplit(rownames(tmp_tab), ' ')),
nrow = 2)), tmp_tab), stringsAsFactors = F)
## Correct column names
colnames(first_care_res) <- c("SUBJECT_ID",
"CGID",
"FIRST_CCU",
"FIRST_CSRU",
"FIRST_MICU",
"FIRST_SICU",
"FIRST_TSICU")
## Remove row names
row.names(first_care_res) <- NULL
## Convert characters to numeric
first_care_res$FIRST_CCU <- as.numeric(first_care_res$FIRST_CCU)
first_care_res$FIRST_CSRU <- as.numeric(first_care_res$FIRST_CSRU)
first_care_res$FIRST_MICU <- as.numeric(first_care_res$FIRST_MICU)
first_care_res$FIRST_SICU <- as.numeric(first_care_res$FIRST_SICU)
first_care_res$FIRST_TSICU <- as.numeric(first_care_res$FIRST_TSICU)
## Look
head(first_care_res)
## SUBJECT_ID CGID FIRST_CCU FIRST_CSRU FIRST_MICU FIRST_SICU
## 1 SUBJECT_10134 CG_15165 0 0 1 0
## 2 SUBJECT_10134 CG_16383 0 0 1 0
## 3 SUBJECT_10134 CG_18189 0 0 3 0
## 4 SUBJECT_10134 CG_19458 0 0 4 0
## 5 SUBJECT_10134 CG_19692 0 0 2 0
## 6 SUBJECT_10207 CG_14080 0 0 2 0
## FIRST_TSICU
## 1 0
## 2 0
## 3 0
## 4 0
## 5 0
## 6 0
## Last Care Unit Weights
## Make a cross table of SUBJECT_ID/CGID relative to CAREUNIT
tmp_tab <- xtabs( ~ paste(SUBJECT_ID, CGID) + LAST_CAREUNIT, data = dat)
## Bind columns containing SUBJECT_ID and CGID to LOC columns
## (after splitting SUBJECT_IDs/CGIDs on spaces) to the cross table
last_care_res <- as.data.frame(cbind(t(matrix(unlist(strsplit(rownames(tmp_tab), ' ')),
nrow = 2)), tmp_tab), stringsAsFactors = F)
## Correct column names
colnames(last_care_res) <- c("SUBJECT_ID",
"CGID",
"LAST_CCU",
"LAST_CSRU",
"LAST_MICU",
"LAST_SICU",
"LAST_TSICU")
## Remove row names
row.names(last_care_res) <- NULL
## Clean data types
last_care_res$LAST_CCU <- as.numeric(last_care_res$LAST_CCU)
last_care_res$LAST_CSRU <- as.numeric(last_care_res$LAST_CSRU)
last_care_res$LAST_MICU <- as.numeric(last_care_res$LAST_MICU)
last_care_res$LAST_SICU <- as.numeric(last_care_res$LAST_SICU)
last_care_res$LAST_TSICU <- as.numeric(last_care_res$LAST_TSICU)
## Merge
careunits <- merge(first_care_res, last_care_res, by = c("SUBJECT_ID", "CGID"))
## Clean Environment
rm(first_care_res, last_care_res, tmp_tab)
Some notes in MIMIC are exact duplicates; this includes notes from within our NQF Care Measure Cohort. Previously, they were removed because we were only interested in their unique content. Now we are interested in their relation to the caregivers who documented them, so we will investigate.
## Subset *all* duplicated texts and other data
tmp <- dat[allDup(dat$TEXT),c("CGID",
"SUBJECT_ID",
"CG_DESCRIPTION",
"LABEL",
"NOTE_DESCRIPTION",
"INTIME",
"STORETIME")]
## Investigate the first 25 observations
head(tmp, 25)
## CGID SUBJECT_ID CG_DESCRIPTION LABEL
## 19 CG_18917 SUBJECT_10302 Attending MDs
## 29 CG_18917 SUBJECT_10302 Attending MDs
## 55 CG_17843 SUBJECT_10820 Resident/Fellow/PA/NP Res
## 60 CG_18326 SUBJECT_10820 Resident/Fellow/PA/NP Res
## 63 CG_21245 SUBJECT_10820 Resident/Fellow/PA/NP Res
## 64 CG_18326 SUBJECT_10820 Resident/Fellow/PA/NP Res
## 69 CG_18152 SUBJECT_10820 Resident/Fellow/PA/NP Res
## 80 CG_19165 SUBJECT_10820 Resident/Fellow/PA/NP Res
## 82 CG_18152 SUBJECT_10820 Resident/Fellow/PA/NP Res
## 93 CG_19229 SUBJECT_11236 Resident/Fellow/PA/NP Res
## 94 CG_19229 SUBJECT_11236 Resident/Fellow/PA/NP Res
## 95 CG_18537 SUBJECT_11236 Resident/Fellow/PA/NP MDs
## 96 CG_18537 SUBJECT_11236 Resident/Fellow/PA/NP MDs
## 97 CG_18537 SUBJECT_11236 Resident/Fellow/PA/NP MDs
## 98 CG_18537 SUBJECT_11236 Resident/Fellow/PA/NP MDs
## 99 CG_19229 SUBJECT_11236 Resident/Fellow/PA/NP Res
## 100 CG_19229 SUBJECT_11236 Resident/Fellow/PA/NP Res
## 105 CG_19229 SUBJECT_11236 Resident/Fellow/PA/NP Res
## 106 CG_19229 SUBJECT_11236 Resident/Fellow/PA/NP Res
## 107 CG_19229 SUBJECT_11236 Resident/Fellow/PA/NP Res
## 108 CG_19229 SUBJECT_11236 Resident/Fellow/PA/NP Res
## 109 CG_19229 SUBJECT_11236 Resident/Fellow/PA/NP Res
## 110 CG_19229 SUBJECT_11236 Resident/Fellow/PA/NP Res
## 113 CG_18537 SUBJECT_11236 Resident/Fellow/PA/NP MDs
## 114 CG_18537 SUBJECT_11236 Resident/Fellow/PA/NP MDs
## NOTE_DESCRIPTION INTIME STORETIME
## 19 Physician Attending Admission Note 66306 66307
## 29 Physician Attending Admission Note 66306 66307
## 55 Physician Resident Progress Note 73711 73713
## 60 Physician Resident Admission Note 73711 73711
## 63 Physician Resident Progress Note 73711 73713
## 64 Physician Resident Admission Note 73711 73711
## 69 Physician Resident Progress Note 73656 73658
## 80 Physician Resident Progress Note 73656 73658
## 82 Physician Resident Progress Note 73656 73658
## 93 Physician Resident Progress Note 82062 82058
## 94 Physician Resident Progress Note 82057 82058
## 95 Physician Resident Admission Note 82062 82057
## 96 Physician Resident Admission Note 82057 82057
## 97 Physician Resident Admission Note 82062 82057
## 98 Physician Resident Admission Note 82057 82057
## 99 Physician Resident Progress Note 82062 82058
## 100 Physician Resident Progress Note 82057 82058
## 105 Physician Resident Progress Note 82062 82058
## 106 Physician Resident Progress Note 82057 82058
## 107 Physician Resident Progress Note 82062 82058
## 108 Physician Resident Progress Note 82057 82058
## 109 Physician Resident Progress Note 82062 82059
## 110 Physician Resident Progress Note 82057 82059
## 113 Physician Resident Admission Note 82062 82057
## 114 Physician Resident Admission Note 82057 82057
We will need to discover the best way to deal with those. Some have different STORETIMEs and CGIDs, others only have different STORETIMEs. Others appear to be properly duplicated entries.
One strategy may be to keep only those duplicate notes with different CGIDs. Also, look into notes that have attending & residents. Consider asking MIMIC-III DB Admins about how the CG_DESCRIPTION is determined, and which is more accurate (DESCRIPTION vs. LABEL)
table(dat$CG_DESCRIPTION)
##
## Attending Resident/Fellow/PA/NP
## 3546 7915
table(dat$LABEL)
##
## 1390 9596 eaw HMS MS MD Mds MDs MDS Med St MedRes
## 40 10 39 22 4051 16 2536 105 11 1
## MedSt ms MS NP PA PHD Res RF RN Std
## 45 35 131 27 35 94 4075 28 35 21
## STD Studen
## 55 49
head(table(dat$NOTE_DESCRIPTION)[rev(order(table(dat$NOTE_DESCRIPTION)))], 10)
##
## Physician Resident Progress Note
## 4754
## Physician Resident Admission Note
## 1874
## Physician Attending Progress Note
## 1506
## Intensivist Note
## 1372
## Physician Attending Admission Note - MICU
## 507
## Physician Attending Admission Note
## 174
## Physician Surgical Admission Note
## 131
## Physician Resident/Attending Progress Note - MICU
## 117
## ICU Note - CVI
## 112
## Cardiology Physician Note
## 82
cat("There are", length(unique(dat$SUBJECT_ID)), "unique patients in this cohort.\n")
## There are 1141 unique patients in this cohort.
cat("There are", length(unique(dat$HADM_ID)), "unique hospital admissions associated with this cohort.\n")
## There are 1350 unique hospital admissions associated with this cohort.
cat("There are", length(unique(dat$CGID)), "unique caregivers associated with this cohort.\n")
## There are 467 unique caregivers associated with this cohort.
For now, we will restrict our analysis to the clinicians’ ability to capture measures within the first 48 hours of an ICU admission, including:
FAMLIMCARCIMCODCIM_postWe can collapse the data frame after subsetting, so we have every vertex, or node, represented in either CGID or SUBJECT_ID, and every edge is a row (or CGID and SUBJECT_ID) relation.
Since the documentation status is binary, we can take the average ussing aggregate(), which will give us the percentage of times, in the first 48 hours, the each concept was documented by the clinician.
## Subset numeric data
tmp <- dat[ ,c("CGID",
"SUBJECT_ID",
"FAM.machine",
"CIM.machine",
"LIM.machine",
"CAR.machine",
"COD.machine",
"CIM_post.machine",
"WORD_COUNT",
"NCHAR")]
## #######################################################################################
## This aggregation method shows ALL relationships (edges, or vertices) between caregivers
## and patients, as well as the percentage of measures documented in their interactions
## #######################################################################################
tmp <- aggregate(cbind(FAM.machine,
CIM.machine,
LIM.machine,
CAR.machine,
COD.machine,
CIM_post.machine,
WORD_COUNT,
NCHAR) ~
CGID +
SUBJECT_ID,
data = tmp,
FUN = mean) ## Apply the mean to all columns
## Look
dim(tmp)
## [1] 5088 10
head(tmp)
## CGID SUBJECT_ID FAM.machine CIM.machine LIM.machine CAR.machine
## 1 CG_15165 SUBJECT_10134 1.0000000 1.0 1.0 1.0000000
## 2 CG_16383 SUBJECT_10134 0.0000000 1.0 1.0 1.0000000
## 3 CG_18189 SUBJECT_10134 0.6666667 1.0 1.0 0.3333333
## 4 CG_19458 SUBJECT_10134 1.0000000 1.0 1.0 1.0000000
## 5 CG_19692 SUBJECT_10134 1.0000000 1.0 1.0 1.0000000
## 6 CG_14080 SUBJECT_10207 0.5000000 0.5 0.5 0.0000000
## COD.machine CIM_post.machine WORD_COUNT NCHAR
## 1 0.0 1.0 1243.00 9398.000
## 2 0.0 1.0 926.00 6835.000
## 3 0.0 1.0 839.00 6330.667
## 4 1.0 1.0 688.25 5256.750
## 5 0.5 1.0 1244.00 8997.500
## 6 0.0 0.5 986.50 7320.500
## Nodes will be all unique subjects and caregivers
network_nodes <- c(unique(dat$SUBJECT_ID), unique(dat$CGID))
## Look
head(network_nodes)
## [1] "SUBJECT_10134" "SUBJECT_10207" "SUBJECT_10302" "SUBJECT_10502"
## [5] "SUBJECT_10757" "SUBJECT_10820"
##Generate edges to show relationships
network_edges <- merge(tmp, careunits, by = c("SUBJECT_ID", "CGID"))
## Look
head(network_edges)
## SUBJECT_ID CGID FAM.machine CIM.machine LIM.machine CAR.machine
## 1 SUBJECT_10134 CG_15165 1.0000000 1.0 1.0 1.0000000
## 2 SUBJECT_10134 CG_16383 0.0000000 1.0 1.0 1.0000000
## 3 SUBJECT_10134 CG_18189 0.6666667 1.0 1.0 0.3333333
## 4 SUBJECT_10134 CG_19458 1.0000000 1.0 1.0 1.0000000
## 5 SUBJECT_10134 CG_19692 1.0000000 1.0 1.0 1.0000000
## 6 SUBJECT_10207 CG_14080 0.5000000 0.5 0.5 0.0000000
## COD.machine CIM_post.machine WORD_COUNT NCHAR FIRST_CCU FIRST_CSRU
## 1 0.0 1.0 1243.00 9398.000 0 0
## 2 0.0 1.0 926.00 6835.000 0 0
## 3 0.0 1.0 839.00 6330.667 0 0
## 4 1.0 1.0 688.25 5256.750 0 0
## 5 0.5 1.0 1244.00 8997.500 0 0
## 6 0.0 0.5 986.50 7320.500 0 0
## FIRST_MICU FIRST_SICU FIRST_TSICU LAST_CCU LAST_CSRU LAST_MICU LAST_SICU
## 1 1 0 0 0 0 1 0
## 2 1 0 0 0 0 1 0
## 3 3 0 0 0 0 3 0
## 4 4 0 0 0 0 4 0
## 5 2 0 0 0 0 2 0
## 6 2 0 0 0 0 2 0
## LAST_TSICU
## 1 0
## 2 0
## 3 0
## 4 0
## 5 0
## 6 0
gD <- graph_from_data_frame(d = network_edges, vertices = network_nodes, directed = FALSE)
# Check the attributes
summary(gD)
## IGRAPH 0cee97d UN-- 1608 5088 --
## + attr: name (v/c), FAM.machine (e/n), CIM.machine (e/n),
## | LIM.machine (e/n), CAR.machine (e/n), COD.machine (e/n),
## | CIM_post.machine (e/n), WORD_COUNT (e/n), NCHAR (e/n), FIRST_CCU
## | (e/n), FIRST_CSRU (e/n), FIRST_MICU (e/n), FIRST_SICU (e/n),
## | FIRST_TSICU (e/n), LAST_CCU (e/n), LAST_CSRU (e/n), LAST_MICU
## | (e/n), LAST_SICU (e/n), LAST_TSICU (e/n)
## Create node dataframe
nodes_df <- data.frame(ID = c(1:vcount(gD)), NAME = V(gD)$name)
nodes_df$NAME <- as.character(nodes_df$NAME)
## Create a data.frame edges: 1st column = source node ID
## 2nd column = target node ID
## (If direction is of interest)
edges_df <- as.data.frame(get.edges(gD, c(1:ecount(gD))))
## Create node attributes
nodes_att <- as.data.frame(cbind(network_nodes,
ifelse(nodes_df$NAME %in% dat$SUBJECT_ID, "PATIENT", "PROVIDER")))
## Correct Column names
colnames(nodes_att) <- c("NODE", "IDENTITY")
## Convert to Character (from factor)
nodes_att$NODE <- as.character(nodes_att$NODE)
nodes_att$IDENTITY <- as.character(nodes_att$IDENTITY)
## Calculate identity in the form of: PATIENTS/ATTENDING/RESIDENTS/etc.
for (i in 1:nrow(nodes_att)){
if (nodes_df$NAME[i] %in% dat$CGID){
nodes_att$IDENTITY[i] <- dat[dat$CGID == nodes_df$NAME[i], ]$CG_DESCRIPTION[1]
}
}
## Create a dataframe with edge attributes, called edges_att
edges_att <- network_edges
## Aggregate again to generate avereage statistics for all CAREGIVERS
tmp <- aggregate(cbind(FAM.machine,
CIM.machine,
LIM.machine,
CAR.machine,
COD.machine,
CIM_post.machine,
WORD_COUNT,
NCHAR) ~
CGID,
data = edges_att,
FUN = mean)
## Take a look
head(tmp)
## CGID FAM.machine CIM.machine LIM.machine CAR.machine COD.machine
## 1 CG_14010 0.2000000 0.3600000 0.3600000 0.3200000 0.5333333
## 2 CG_14022 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
## 3 CG_14037 0.2363636 0.2666667 0.2666667 0.1636364 0.6727273
## 4 CG_14045 0.4362745 0.5490196 0.4705882 0.3725490 0.6862745
## 5 CG_14056 0.2592593 0.5555556 0.4444444 0.4444444 0.5555556
## 6 CG_14080 0.1666667 0.1666667 0.1666667 0.0000000 0.6666667
## CIM_post.machine WORD_COUNT NCHAR
## 1 0.3600000 830.5833 6212.387
## 2 0.0000000 436.0000 3791.000
## 3 0.3151515 859.4727 6433.124
## 4 0.5490196 1070.6667 8107.373
## 5 0.6666667 1387.1852 10180.796
## 6 0.1666667 1231.3333 8912.667
dim(tmp)
## [1] 467 9
## Convert for Gephi
nodes_att$WORD_COUNT <- ifelse((as.character(nodes_df$NAME) %in% tmp$CGID),
edges_att$WORD_COUNT, 0)
nodes_att$CHAR_COUNT <- ifelse((as.character(nodes_df$NAME) %in% tmp$CGID),
edges_att$NCHAR, 0)
nodes_att$FAM <- ifelse((as.character(nodes_df$NAME) %in% tmp$CGID), edges_att$FAM.machine, 0)
nodes_att$COD <- ifelse((as.character(nodes_df$NAME) %in% tmp$CGID), edges_att$COD.machine, 0)
nodes_att$LIM <- ifelse((as.character(nodes_df$NAME) %in% tmp$CGID), edges_att$LIM.machine, 0)
nodes_att$CAR <- ifelse((as.character(nodes_df$NAME) %in% tmp$CGID), edges_att$CAR.machine, 0)
nodes_att$CIM <- ifelse((as.character(nodes_df$NAME) %in% tmp$CGID), edges_att$CIM.machine, 0)
nodes_att$CIM_post <- ifelse((as.character(nodes_df$NAME) %in% tmp$CGID),
edges_att$CIM_post.machine, 0)
###########################################################################################
## ICU NOTE IS MOST FREQUENT COLUMN NAME FOR NOTE DOCUMENTATION
## THINK ABOUT USING FIRST DOCUMENTATION? OTHER METRIC?
edges_att$FIRST_ICU <- colnames(edges_att)[11:15][apply(edges_att[,11:15], 1 , which.max)]
edges_att$LAST_ICU <- colnames(edges_att)[16:20][apply(edges_att[,16:20], 1, which.max)]
###########################################################################################
# Write the network into a gexf (Gephi) file
#write.gexf(nodes = nodes_df,
# edges = edges_df,
# nodesAtt = nodes_att,
# edgesAtt = edges_att,
# output = "/Users/kai-ou/Desktop/NQF_cohort_test15May18.gexf")
Here we will be merging to the CAREGIVERS table, while have kept only Attending and Resident/Fellow/PA/NP, this will allow us to investigate trends based on caregivers.
boxplot(dat$NCHAR ~ dat$CG_DESCRIPTION,
main = "Character Count By Clinician",
xlab = "Clinician",
ylab = "Character Count")
t.test(dat$NCHAR ~ dat$CG_DESCRIPTION)
##
## Welch Two Sample t-test
##
## data: dat$NCHAR by dat$CG_DESCRIPTION
## t = -8.1483, df = 6806.6, p-value = 4.357e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -542.8214 -332.2876
## sample estimates:
## mean in group Attending mean in group Resident/Fellow/PA/NP
## 6606.670 7044.224
wilcox.test(dat$NCHAR ~ dat$CG_DESCRIPTION)
##
## Wilcoxon rank sum test with continuity correction
##
## data: dat$NCHAR by dat$CG_DESCRIPTION
## W = 12611000, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
boxplot(dat$WORD_COUNT ~ dat$CG_DESCRIPTION,
main = "Word Count By Clinician",
xlab = "Clinician",
ylab = "Character Count")
t.test(dat$WORD_COUNT ~ dat$CG_DESCRIPTION)
##
## Welch Two Sample t-test
##
## data: dat$WORD_COUNT by dat$CG_DESCRIPTION
## t = -7.4702, df = 6771.1, p-value = 9.011e-14
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -72.62815 -42.43394
## sample estimates:
## mean in group Attending mean in group Resident/Fellow/PA/NP
## 888.9681 946.4992
wilcox.test(dat$WORD_COUNT ~ dat$CG_DESCRIPTION)
##
## Wilcoxon rank sum test with continuity correction
##
## data: dat$WORD_COUNT by dat$CG_DESCRIPTION
## W = 12620000, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
plotDat(dat, "FAM.machine", bs = F, mn = "FAM Documentation By Caregiver", xl = "Caregiver Group", yl = "Proportion")
test <- table(dat$FAM.machine, dat$CG_DESCRIPTION)
test
##
## Attending Resident/Fellow/PA/NP
## 0 2586 5445
## 1 960 2470
chisq.test(test)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: test
## X-squared = 19.758, df = 1, p-value = 8.788e-06
pairwiseNominalIndependence(
as.matrix(test),
fisher = F, gtest = F, chisq = T, method = "fdr")
## Comparison p.Chisq p.adj.Chisq
## 1 0 : 1 8.79e-06 8.79e-06
plotDat(dat, "CIM.machine", bs = F, mn = "CIM Documentation By Caregiver", xl = "Caregiver Group", yl = "Proportion")
test <- table(dat$CIM.machine, dat$CG_DESCRIPTION)
test
##
## Attending Resident/Fellow/PA/NP
## 0 2232 4624
## 1 1314 3291
chisq.test(test)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: test
## X-squared = 20.66, df = 1, p-value = 5.486e-06
pairwiseNominalIndependence(
as.matrix(test),
fisher = F, gtest = F, chisq = T, method = "fdr")
## Comparison p.Chisq p.adj.Chisq
## 1 0 : 1 5.49e-06 5.49e-06
plotDat(dat, "LIM.machine", bs = F, mn = "LIM Documentation By Caregiver", xl = "Caregiver Group", yl = "Proportion")
test <- table(dat$LIM.machine, dat$CG_DESCRIPTION)
test
##
## Attending Resident/Fellow/PA/NP
## 0 2361 5249
## 1 1185 2666
chisq.test(test)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: test
## X-squared = 0.065631, df = 1, p-value = 0.7978
pairwiseNominalIndependence(
as.matrix(test),
fisher = F, gtest = F, chisq = T, method = "fdr")
## Comparison p.Chisq p.adj.Chisq
## 1 0 : 1 0.798 0.798
plotDat(dat, "CAR.machine", bs = F, mn = "CAR Documentation By Caregiver", xl = "Caregiver Group", yl = "Proportion")
test <- table(dat$CAR.machine, dat$CG_DESCRIPTION)
test
##
## Attending Resident/Fellow/PA/NP
## 0 2753 5104
## 1 793 2811
chisq.test(test)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: test
## X-squared = 195.88, df = 1, p-value < 2.2e-16
pairwiseNominalIndependence(
as.matrix(test),
fisher = F, gtest = F, chisq = T, method = "fdr")
## Comparison p.Chisq p.adj.Chisq
## 1 0 : 1 1.66e-44 1.66e-44
plotDat(dat, "COD.machine", bs = F, mn = "COD Documentation By Caregiver", xl = "Caregiver Group", yl = "Proportion")
test <- table(dat$COD.machine, dat$CG_DESCRIPTION)
test
##
## Attending Resident/Fellow/PA/NP
## 0 1313 2782
## 1 2233 5133
chisq.test(test)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: test
## X-squared = 3.6845, df = 1, p-value = 0.05492
pairwiseNominalIndependence(
as.matrix(test),
fisher = F, gtest = F, chisq = T, method = "fdr")
## Comparison p.Chisq p.adj.Chisq
## 1 0 : 1 0.0549 0.0549
plotDat(dat, "CIM_post.machine", bs = F, mn = "CIM Documentation By Caregiver", xl = "Caregiver Group", yl = "Proportion")
test <- table(dat$CIM_post.machine, dat$CG_DESCRIPTION)
test
##
## Attending Resident/Fellow/PA/NP
## 0 2048 3995
## 1 1498 3920
chisq.test(test)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: test
## X-squared = 51.798, df = 1, p-value = 6.151e-13
pairwiseNominalIndependence(
as.matrix(test),
fisher = F, gtest = F, chisq = T, method = "fdr")
## Comparison p.Chisq p.adj.Chisq
## 1 0 : 1 6.15e-13 6.15e-13
NOTE: LOOK FOR NOTES WRITTING BY ATTENDING ONLY VS ATTENDING AND RESIDENT