This is a summary data analysis of a corpus of patient notes (Discharge Summaries and Nursing Notes) extracted from the MIMIC-III database. The corpus was built by extracting notes from Frequent Flyer patients, defined as those who had three or more ICU admissions over the course of a single year at any point, together with patients outside this cohort, in an approximately 80:20 ratio.
These notes were annotated for several different clinical concepts (detailed below) over the period from August 2015 to October 2016. Note annotators included two clinical research associates (ETM & JWW), two junior resident physicians (JTW & JFF), and two senior resident physicians (PDT & DAG). Annotation was overseen by an attending physician (LAC).
The corpus of patient notes itself consists of the Subject ID, Hospital Admission ID, Text, Binary Indications, Batch Date, and Operators. These data will be joined to other MIMIC-III v1.4 tables to generate hospital admission-level as well as patient-level summary statistics.
library("knitr")
library("rmarkdown")
library("rcompanion")
library("survival")
library("survminer")
library("ggplot2")
Load the ICUSTAYS, PATIENTS, ADMISSIONS, and NOTEEVENTS flat files, which we will use to extract the relevant data.
setwd("C:/Users/Edward Moseley/Documents")
stay <- read.csv("ICUSTAYS.csv",
header = T, stringsAsFactors = F)
pat <- read.csv("PATIENTS.csv",
header = T, stringsAsFactors = F)
adm <- read.csv("ADMISSIONS.csv",
header = T, stringsAsFactors = F)
notes <- read.csv("NOTEEVENTS.csv",
header = T, stringsAsFactors = F)
ages() accepts a data frame and uses INTIME and DOB to calculate AGE. Patients whose ages were over 89 during any visit have their dates of birth masked in MIMIC, so the median age of this group (91.4) will be imputed for them. Patients aged 10 and younger will be discarded.
ages <- function(dat){
dat$AGE <- (dat$INTIME - dat$DOB)/365
#Correct for masked ages (> 89) by imputing the group median age (91.4)
dat[(dat$AGE >= 90),]$AGE <- 91.4
#Remove patients aged 10 or younger
dat <- dat[(dat$AGE > 10),]
return(dat)
}
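To illustrate, here is a minimal sanity check of ages() on a hypothetical toy data frame (not MIMIC data); DOB and INTIME are supplied as numeric day counts, which is how ffCheck() below prepares them via as.numeric(as.Date(...)).
#Hypothetical toy data for checking ages(); not drawn from MIMIC
toy <- data.frame(SUBJECT_ID = 1:3,
DOB = as.numeric(as.Date(c("1950-01-01", "1800-01-01", "2010-06-01"))),
INTIME = as.numeric(as.Date(c("2012-01-01", "2012-01-01", "2012-01-01"))))
ages(toy)
#Subject 1 keeps a computed AGE of roughly 62, subject 2 (masked DOB, AGE >= 90) is imputed to 91.4,
#and subject 3 (AGE under 10) is dropped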
admissionCount() accepts the ADMISSIONS table and orders it by SUBJECT_ID and ADMITTIME to determine each patient's admission sequence number.
admissionCount <- function(dat){
#Convert character object to Date then numeric
dat$ADMITTIME <- as.numeric(as.Date(gsub("([^ ]*).*", "\\1", dat$ADMITTIME), "%Y-%m-%d"))
#order by subject id, then admittime
dat <- dat[with(dat, order(SUBJECT_ID, ADMITTIME)),]
tmp <- data.frame()
res <- data.frame()
for (subj in unique(dat$SUBJECT_ID)){
tmp <- dat[(dat$SUBJECT_ID == subj),]
tmp$ADMISSION_NUMBER <- seq.int(nrow(tmp))
res <- rbind(res, tmp)
}
return(res)
}
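For reference, a loop-free sketch of the same per-subject numbering is possible with ave(); this is an alternative (not used below) and assumes dat is already ordered by SUBJECT_ID and ADMITTIME.
#Alternative: within-subject sequence numbers via ave(); relies on the ordering step above
#dat$ADMISSION_NUMBER <- ave(dat$ADMITTIME, dat$SUBJECT_ID, FUN = seq_along)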
plotDat() is a generic helper that makes proportion plots easier to generate. It accepts a data frame, a column name (string), a boolean for beside (TRUE) versus stacked (FALSE) bars, a main title (string), an x-axis label (string), and a y-axis label (string).
plotDat <- function(dat, column, bs, mn, xl, yl){
tmp <- as.matrix(table(dat[[column]], dat[["cohort"]]))
prop <- prop.table(tmp, margin = 2) #2 for column-wise proportions
par(mar = c(5.0, 4.0, 4.0, 15), xpd = TRUE)
barplot(prop, col = cm.colors(length(rownames(prop))), beside = bs, width = 2, main = mn, xlab = xl, ylab = yl)
legend("topright", inset = c(-0.90,0), fill = cm.colors(length(rownames(prop))), legend=rownames(prop))
}
ffCheck() accepts the ICUSTAYS, PATIENTS, ADMISSIONS, and NOTEEVENTS tables, in addition to a boolean (A) which controls whether ages() is called on the output. For each individual SUBJECT_ID it looks for three or more hospital admissions over the course of a single year; if that condition is satisfied at any point, all of the patient's admissions are retained. ffCheck() can also be used to generate the entire MIMIC data set by setting ff = F.
ffCheck <- function(dat, patDat, admDat, noteDat, A, ff){
admDat <- admissionCount(admDat)
#Convert character object to Date then numeric
dat$INTIME <- as.numeric(as.Date(gsub("([^ ]*).*", "\\1", dat$INTIME), "%Y-%m-%d"))
dat$OUTTIME <- as.numeric(as.Date(gsub("([^ ]*).*", "\\1", dat$OUTTIME), "%Y-%m-%d"))
#Drop ROW_ID in admDat
admDat$ROW_ID <- NULL
#Drop ROW_ID in patDat
patDat$ROW_ID <- NULL
#Convert DOB/DOD/DOD_HOSP/DOD_SSN to numeric (INTIME/OUTTIME were converted above)
patDat$DOB <- as.numeric(as.Date(gsub("([^ ]*).*", "\\1", patDat$DOB), "%Y-%m-%d"))
patDat$DOD <- as.numeric(as.Date(gsub("([^ ]*).*", "\\1", patDat$DOD), "%Y-%m-%d"))
patDat$DOD_HOSP <- as.numeric(as.Date(gsub("([^ ]*).*", "\\1", patDat$DOD_HOSP), "%Y-%m-%d"))
patDat$DOD_SSN <- as.numeric(as.Date(gsub("([^ ]*).*", "\\1", patDat$DOD_SSN), "%Y-%m-%d"))
#Placeholder
#dat$ADMISSION_NUMBER <- rep(1, each = nrow(dat))
#Temporary data frames for accumulating results
tmp <- data.frame()
tmpReal <- data.frame()
res <- data.frame()
#for each unique subject in the data set
for (subj in unique(dat$SUBJECT_ID)){
#subset subject data
tmp <- dat[(dat$SUBJECT_ID == subj),]
#order data by ICU intake
tmp <- tmp[(order(tmp$INTIME)),]
#Store the data
tmpReal <- tmp
#Check whether any three consecutive ICU visits fall within a single 365-day window
if (ff == TRUE){
while (nrow(tmp) > 2){
if (as.numeric(tmp$INTIME[3] - tmp$INTIME[1]) <= 365){
res <- rbind(res, tmpReal)
break
} else {
tmp <- tmp[-1,]
}
}
} else {
res <- rbind(res, tmpReal)
}
}
res <- merge(res, patDat, by = "SUBJECT_ID")
#Drop SUBJECT_ID to avoid duplicates
admDat$SUBJECT_ID <- NULL
res <- merge(res, admDat, by = "HADM_ID")
#No need for row_id anymore
res$ROW_ID <- NULL
#If A is TRUE, calculate ages
if (A == T){
res <- ages(res)
}
#Can drop ROW_ID, SUBJECT_ID, as HADM_ID is what we will join on
noteDat$ROW_ID <- NULL
noteDat$SUBJECT_ID <- NULL
res <- merge(res, noteDat, by = "HADM_ID")
res <- res[with(res, order(SUBJECT_ID, ADMITTIME)),]
rownames(res) <- 1:nrow(res)
return(res)
}
#check <- ffCheck(stay, pat, adm, notes, T, T)
#write.csv(check, "ffCohortFullM3.csv", row.names = F)
check <- read.csv("ffCohortFullM3.csv", header = T, stringsAsFactors = F)
#dat <- ffCheck(stay, pat, adm, notes, T, F)
#write.csv(dat, file = "boundTablesM3.csv", row.names = FALSE)
#dat is all data, frequent flyer inclusive
dat <- read.csv("boundTablesM3.csv", header = T, stringsAsFactors = F)
We will pull in the annotation results file. Unfortunately, the MIMIC database from which the original note text was extracted (MIMIC2v3) was an intermediate form between MIMIC-II and MIMIC-III that no longer exists. This makes precise reconstruction of the original cohort selection impossible, and it is possible that some non-frequent flyers became frequent flyers in the transition from MIMIC2v3 to the current MIMIC-III v1.4. For this reason, frequent flyer cohort designations will be recalculated and assigned.
## ###
## Results
## ###
noteDat <- read.csv(list.files()[grepl("finalFrequentFlyerResults", list.files())],
header = T, stringsAsFactors = F)
noteDat$COHORT <- ifelse((noteDat$SUBJECT_ID %in% check$SUBJECT_ID), 1, 0)
dis <- noteDat[(grepl("Discharge", noteDat$CATEGORY)),]
nsn <- noteDat[(grepl("Nursing", noteDat$CATEGORY)),]
A frequent flyer is defined as a patient who, at any point, had three or more ICU admissions over the course of a single year. Patients aged 10 years or younger are not considered.
#All of MIMIC
tmpTab <- table(factor(notes$CATEGORY))
paste("There are ", length(unique(adm$SUBJECT_ID))," unique subjects, ", length(unique(adm$HADM_ID)),
" unique hospital admissions with ", tmpTab["Discharge summary"][[1]], " discharge summaries and ",
tmpTab["Nursing"][[1]]+tmpTab["Nursing/other"][[1]], " nursing notes.", sep = '')
## [1] "There are 46520 unique subjects, 58976 unique hospital admissions with 59652 discharge summaries and 1046053 nursing notes."
#Remove patients aged 10 or younger
tmpTab <- table(factor(dat$CATEGORY))
paste("After removing those younger than 10 years of age, there are ", length(unique(dat$SUBJECT_ID))," unique subjects, ", length(unique(dat$HADM_ID)),
" unique hospital admissions with ", tmpTab["Discharge summary"][[1]], " discharge summaries and ",
tmpTab["Nursing"][[1]]+tmpTab["Nursing/other"][[1]], " nursing notes.", sep = '')
## [1] "After removing those younger than 10 years of age, there are 38352 unique subjects, 49533 unique hospital admissions with 58925 discharge summaries and 764780 nursing notes."
#Only frequent flyer cohort
tmpTab <- table(factor(check$CATEGORY))
paste("After selecting only for patients who had >= 3 hospital admissions in a single year, there are ", length(unique(check$SUBJECT_ID))," unique subjects, ", length(unique(check$HADM_ID)),
" unique hospital admissions with ", tmpTab["Discharge summary"][[1]], " discharge summaries and ",
tmpTab["Nursing"][[1]]+tmpTab["Nursing/other"][[1]], " nursing notes.", sep = '')
## [1] "After selecting only for patients who had >= 3 hospital admissions in a single year, there are 1951 unique subjects, 7078 unique hospital admissions with 10116 discharge summaries and 208789 nursing notes."
#Only annotated
tmp <- noteDat[(!duplicated(noteDat$TEXT)),]
tmpTab <- table(factor(tmp$CATEGORY))
paste("Within the annotated database, there are ", length(unique(noteDat$SUBJECT_ID))," unique subjects and ", length(unique(noteDat$HADM_ID)),
" unique hospital admissions with ", tmpTab["Discharge summary"][[1]]+tmpTab["Discharge"][[1]], " unique discharge summaries (", round(nrow(tmp[(grepl("Discharge", tmp$CATEGORY)) & (tmp$COHORT == 1),])/nrow(tmp[(grepl("Discharge", tmp$CATEGORY)),])*100, 2), "% belonging to Frequent Flyers) and ",
tmpTab["Nursing/Other"][[1]], " unique nursing notes (",round(nrow(tmp[(grepl("Nursing", tmp$CATEGORY)) & (tmp$COHORT == 1),])/nrow(tmp[(grepl("Nursing", tmp$CATEGORY)),])*100, 2), "% belonging to Frequent Flyers)", sep = '')
## [1] "Within the annotated database, there are 905 unique subjects and 2069 unique hospital admissions with 1102 unique discharge summaries (73.96% belonging to Frequent Flyers) and 1000 unique nursing notes (76.6% belonging to Frequent Flyers)"
#rm(adm, stay, pat, notes)
rm(tmp, tmpTab)
We will check to see if there are patients who have been removed in the transition to MIMIC-III v1.4.
tmp <- setdiff(unique(noteDat$SUBJECT_ID), unique(dat$SUBJECT_ID))
paste(length(tmp), "patients who are in the patient note database have been removed from MIMIC-III, and their data will not be considered further for hospital admission level or patient level analysis.")
## [1] "7 patients who are in the patient note database have been removed from MIMIC-III, and their data will not be considered further for hospital admission level or patient level analysis."
There are 46,520 unique subjects, 58,976 unique hospital admissions with 59,652 discharge summaries and 1,046,053 nursing notes in MIMIC-III v1.4. After removing those subjects younger than 10 years of age, there are 38,352 unique subjects, 49,533 unique hospital admissions with 58,925 discharge summaries and 764,780 nursing notes. After selecting only for patients who had >= 3 hospital admissions in a single year, there are 1,951 unique subjects, 7,078 unique hospital admissions with 10,116 discharge summaries and 208,789 nursing notes. Within the annotated database, there are 905 unique subjects and 2,069 unique hospital admissions with 1,102 unique discharge summaries (73.96% belonging to Frequent Flyers) and 1,000 unique nursing notes (76.6% belonging to Frequent Flyers).
Here we will merge the patient note data to MIMIC on HADM_ID for hospital admission level data.
Note: 1 indicates frequent flyer status
## ###
## merge to data by hadm_id for hospital admission level info
## ###
#Drop columns to avoid duplicate variables
#Drop dat$subject_id
dat$SUBJECT_ID <- NULL
#drop dat$category
dat$CATEGORY <- NULL
#drop dat$text
dat$TEXT <- NULL
#Merge note data to dat to get all relevant data
test <- merge(noteDat, dat, by = "HADM_ID")
colnames(test) <- tolower(colnames(test))
#Remove duplicates for hospital admission level info
test <- test[(!duplicated(test$hadm_id)),]
## LOS
boxplot(test$los ~ test$cohort,
main = "ICU Length of Stay (days) by cohort (1 = Frequent Flyer)",
xlab = "Cohort",
ylab = "Length of Stay (days)")
t.test(test$los ~ test$cohort)
##
## Welch Two Sample t-test
##
## data: test$los by test$cohort
## t = -1.9857, df = 659.82, p-value = 0.04748
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -1.94223347 -0.01089127
## sample estimates:
## mean in group 0 mean in group 1
## 4.454292 5.430854
wilcox.test(test$los ~ test$cohort)
##
## Wilcoxon rank sum test with continuity correction
##
## data: test$los by test$cohort
## W = 90738, p-value = 0.003568
## alternative hypothesis: true location shift is not equal to 0
## Admission Type
plotDat(test, "admission_type", bs = F, mn = "Admission Type Proportions by Cohort", xl = "Cohort", yl = "Proportion")
tmp <- table(test$admission_type, test$cohort)
tmp
##
## 0 1
## ELECTIVE 41 52
## EMERGENCY 217 708
## URGENT 6 22
chisq.test(tmp)
##
## Pearson's Chi-squared test
##
## data: tmp
## X-squared = 19.275, df = 2, p-value = 6.523e-05
pairwiseNominalIndependence(
as.matrix(tmp),
fisher = F, gtest = F, chisq = T, method = "fdr")
## Comparison p.Chisq p.adj.Chisq
## 1 ELECTIVE : EMERGENCY 0.000023 0.000069
## 2 ELECTIVE : URGENT 0.052900 0.079400
## 3 EMERGENCY : URGENT 0.981000 0.981000
## Admission Location
plotDat(test, "admission_location", bs = F, mn = "Admission Location Proportions by Cohort", xl = "Cohort", yl = "Proportion")
tmp <- table(test$admission_location, test$cohort)
tmp
##
## 0 1
## CLINIC REFERRAL/PREMATURE 34 81
## EMERGENCY ROOM ADMIT 137 516
## PHYS REFERRAL/NORMAL DELI 48 85
## TRANSFER FROM HOSP/EXTRAM 45 90
## TRANSFER FROM OTHER HEALT 0 4
## TRANSFER FROM SKILLED NUR 0 6
chisq.test(tmp)
##
## Pearson's Chi-squared test
##
## data: tmp
## X-squared = 23.781, df = 5, p-value = 0.0002392
pairwiseNominalIndependence(
as.matrix(tmp),
fisher = F, gtest = F, chisq = T, method = "fdr")
## Comparison p.Chisq
## 1 CLINIC REFERRAL/PREMATURE : EMERGENCY ROOM ADMIT 0.055000
## 2 CLINIC REFERRAL/PREMATURE : PHYS REFERRAL/NORMAL DELI 0.340000
## 3 CLINIC REFERRAL/PREMATURE : TRANSFER FROM HOSP/EXTRAM 0.616000
## 4 CLINIC REFERRAL/PREMATURE : TRANSFER FROM OTHER HEALT 0.469000
## 5 CLINIC REFERRAL/PREMATURE : TRANSFER FROM SKILLED NUR 0.269000
## 6 EMERGENCY ROOM ADMIT : PHYS REFERRAL/NORMAL DELI 0.000281
## 7 EMERGENCY ROOM ADMIT : TRANSFER FROM HOSP/EXTRAM 0.002810
## 8 EMERGENCY ROOM ADMIT : TRANSFER FROM OTHER HEALT 0.680000
## 9 EMERGENCY ROOM ADMIT : TRANSFER FROM SKILLED NUR 0.450000
## 10 PHYS REFERRAL/NORMAL DELI : TRANSFER FROM HOSP/EXTRAM 0.730000
## 11 PHYS REFERRAL/NORMAL DELI : TRANSFER FROM OTHER HEALT 0.338000
## 12 PHYS REFERRAL/NORMAL DELI : TRANSFER FROM SKILLED NUR 0.168000
## 13 TRANSFER FROM HOSP/EXTRAM : TRANSFER FROM OTHER HEALT 0.389000
## 14 TRANSFER FROM HOSP/EXTRAM : TRANSFER FROM SKILLED NUR 0.205000
## 15 TRANSFER FROM OTHER HEALT : TRANSFER FROM SKILLED NUR NaN
## p.adj.Chisq
## 1 0.25700
## 2 0.59500
## 3 0.71900
## 4 0.59700
## 5 0.59500
## 6 0.00393
## 7 0.01970
## 8 0.73000
## 9 0.59700
## 10 0.73000
## 11 0.59500
## 12 0.57400
## 13 0.59700
## 14 0.57400
## 15 NaN
## Discharge Location
plotDat(test, "discharge_location", bs = T,mn = "Admission Location Proportions by Cohort", xl = "Cohort", yl = "Proportion")
tmp <- table(test$discharge_location, test$cohort)
tmp
##
## 0 1
## DEAD/EXPIRED 24 44
## DISC-TRAN CANCER/CHLDRN H 10 21
## DISCH-TRAN TO PSYCH HOSP 0 4
## HOME 67 183
## HOME HEALTH CARE 73 206
## HOME WITH HOME IV PROVIDR 1 4
## HOSPICE-MEDICAL FACILITY 0 2
## LEFT AGAINST MEDICAL ADVI 2 20
## LONG TERM CARE HOSPITAL 9 37
## REHAB/DISTINCT PART HOSP 32 140
## SHORT TERM HOSPITAL 1 4
## SNF 45 117
chisq.test(tmp)
##
## Pearson's Chi-squared test
##
## data: tmp
## X-squared = 15.464, df = 11, p-value = 0.1622
pairwiseNominalIndependence(
as.matrix(tmp),
fisher = F, gtest = F, chisq = T, method = "fdr")
## Comparison p.Chisq
## 1 DEAD/EXPIRED : DISC-TRAN CANCER/CHLDRN H 0.94700
## 2 DEAD/EXPIRED : DISCH-TRAN TO PSYCH HOSP 0.36300
## 3 DEAD/EXPIRED : HOME 0.22100
## 4 DEAD/EXPIRED : HOME HEALTH CARE 0.17600
## 5 DEAD/EXPIRED : HOME WITH HOME IV PROVIDR 0.83600
## 6 DEAD/EXPIRED : HOSPICE-MEDICAL FACILITY 0.77900
## 7 DEAD/EXPIRED : LEFT AGAINST MEDICAL ADVI 0.03690
## 8 DEAD/EXPIRED : LONG TERM CARE HOSPITAL 0.10800
## 9 DEAD/EXPIRED : REHAB/DISTINCT PART HOSP 0.00973
## 10 DEAD/EXPIRED : SHORT TERM HOSPITAL 0.83600
## 11 DEAD/EXPIRED : SNF 0.32800
## 12 DISC-TRAN CANCER/CHLDRN H : DISCH-TRAN TO PSYCH HOSP 0.45000
## 13 DISC-TRAN CANCER/CHLDRN H : HOME 0.66800
## 14 DISC-TRAN CANCER/CHLDRN H : HOME HEALTH CARE 0.60800
## 15 DISC-TRAN CANCER/CHLDRN H : HOME WITH HOME IV PROVIDR 0.97700
## 16 DISC-TRAN CANCER/CHLDRN H : HOSPICE-MEDICAL FACILITY 0.86600
## 17 DISC-TRAN CANCER/CHLDRN H : LEFT AGAINST MEDICAL ADVI 0.09840
## 18 DISC-TRAN CANCER/CHLDRN H : LONG TERM CARE HOSPITAL 0.31900
## 19 DISC-TRAN CANCER/CHLDRN H : REHAB/DISTINCT PART HOSP 0.13700
## 20 DISC-TRAN CANCER/CHLDRN H : SHORT TERM HOSPITAL 0.97700
## 21 DISC-TRAN CANCER/CHLDRN H : SNF 0.77200
## 22 DISCH-TRAN TO PSYCH HOSP : HOME 0.52600
## 23 DISCH-TRAN TO PSYCH HOSP : HOME HEALTH CARE 0.54000
## 24 DISCH-TRAN TO PSYCH HOSP : HOME WITH HOME IV PROVIDR 1.00000
## 25 DISCH-TRAN TO PSYCH HOSP : HOSPICE-MEDICAL FACILITY NaN
## 26 DISCH-TRAN TO PSYCH HOSP : LEFT AGAINST MEDICAL ADVI 1.00000
## 27 DISCH-TRAN TO PSYCH HOSP : LONG TERM CARE HOSPITAL 0.76500
## 28 DISCH-TRAN TO PSYCH HOSP : REHAB/DISTINCT PART HOSP 0.76600
## 29 DISCH-TRAN TO PSYCH HOSP : SHORT TERM HOSPITAL 1.00000
## 30 DISCH-TRAN TO PSYCH HOSP : SNF 0.50600
## 31 HOME : HOME HEALTH CARE 0.94700
## 32 HOME : HOME WITH HOME IV PROVIDR 1.00000
## 33 HOME : HOSPICE-MEDICAL FACILITY 0.95900
## 34 HOME : LEFT AGAINST MEDICAL ADVI 0.11500
## 35 HOME : LONG TERM CARE HOSPITAL 0.39600
## 36 HOME : REHAB/DISTINCT PART HOSP 0.06650
## 37 HOME : SHORT TERM HOSPITAL 1.00000
## 38 HOME : SNF 0.91700
## 39 HOME HEALTH CARE : HOME WITH HOME IV PROVIDR 1.00000
## 40 HOME HEALTH CARE : HOSPICE-MEDICAL FACILITY 0.97500
## 41 HOME HEALTH CARE : LEFT AGAINST MEDICAL ADVI 0.12700
## 42 HOME HEALTH CARE : LONG TERM CARE HOSPITAL 0.44000
## 43 HOME HEALTH CARE : REHAB/DISTINCT PART HOSP 0.08350
## 44 HOME HEALTH CARE : SHORT TERM HOSPITAL 1.00000
## 45 HOME HEALTH CARE : SNF 0.79700
## 46 HOME WITH HOME IV PROVIDR : HOSPICE-MEDICAL FACILITY 1.00000
## 47 HOME WITH HOME IV PROVIDR : LEFT AGAINST MEDICAL ADVI 1.00000
## 48 HOME WITH HOME IV PROVIDR : LONG TERM CARE HOSPITAL 1.00000
## 49 HOME WITH HOME IV PROVIDR : REHAB/DISTINCT PART HOSP 1.00000
## 50 HOME WITH HOME IV PROVIDR : SHORT TERM HOSPITAL 1.00000
## 51 HOME WITH HOME IV PROVIDR : SNF 1.00000
## 52 HOSPICE-MEDICAL FACILITY : LEFT AGAINST MEDICAL ADVI 1.00000
## 53 HOSPICE-MEDICAL FACILITY : LONG TERM CARE HOSPITAL 1.00000
## 54 HOSPICE-MEDICAL FACILITY : REHAB/DISTINCT PART HOSP 1.00000
## 55 HOSPICE-MEDICAL FACILITY : SHORT TERM HOSPITAL 1.00000
## 56 HOSPICE-MEDICAL FACILITY : SNF 0.93800
## 57 LEFT AGAINST MEDICAL ADVI : LONG TERM CARE HOSPITAL 0.45600
## 58 LEFT AGAINST MEDICAL ADVI : REHAB/DISTINCT PART HOSP 0.41900
## 59 LEFT AGAINST MEDICAL ADVI : SHORT TERM HOSPITAL 1.00000
## 60 LEFT AGAINST MEDICAL ADVI : SNF 0.10400
## 61 LONG TERM CARE HOSPITAL : REHAB/DISTINCT PART HOSP 1.00000
## 62 LONG TERM CARE HOSPITAL : SHORT TERM HOSPITAL 1.00000
## 63 LONG TERM CARE HOSPITAL : SNF 0.35200
## 64 REHAB/DISTINCT PART HOSP : SHORT TERM HOSPITAL 1.00000
## 65 REHAB/DISTINCT PART HOSP : SNF 0.06300
## 66 SHORT TERM HOSPITAL : SNF 1.00000
## p.adj.Chisq
## 1 1.000
## 2 1.000
## 3 1.000
## 4 0.953
## 5 1.000
## 6 1.000
## 7 0.810
## 8 0.810
## 9 0.632
## 10 1.000
## 11 1.000
## 12 1.000
## 13 1.000
## 14 1.000
## 15 1.000
## 16 1.000
## 17 0.810
## 18 1.000
## 19 0.810
## 20 1.000
## 21 1.000
## 22 1.000
## 23 1.000
## 24 1.000
## 25 NaN
## 26 1.000
## 27 1.000
## 28 1.000
## 29 1.000
## 30 1.000
## 31 1.000
## 32 1.000
## 33 1.000
## 34 0.810
## 35 1.000
## 36 0.810
## 37 1.000
## 38 1.000
## 39 1.000
## 40 1.000
## 41 0.810
## 42 1.000
## 43 0.810
## 44 1.000
## 45 1.000
## 46 1.000
## 47 1.000
## 48 1.000
## 49 1.000
## 50 1.000
## 51 1.000
## 52 1.000
## 53 1.000
## 54 1.000
## 55 1.000
## 56 1.000
## 57 1.000
## 58 1.000
## 59 1.000
## 60 0.810
## 61 1.000
## 62 1.000
## 63 1.000
## 64 1.000
## 65 0.810
## 66 1.000
Here we will merge the patient note data to MIMIC on SUBJECT_ID for patient level data.
#Need to re-import dat
dat <- read.csv("boundTablesM3.csv", header = T, stringsAsFactors = F)
#Merge note data to dat to get all data
test <- merge(noteDat, dat, by = "SUBJECT_ID")
colnames(test) <- tolower(colnames(test))
#remove duplicates for patient level data
test <- test[(!duplicated(test$subject_id)),]
## AGE
boxplot(test$age ~ test$cohort)
t.test(test$age ~ test$cohort)
##
## Welch Two Sample t-test
##
## data: test$age by test$cohort
## t = 1.9949, df = 804.39, p-value = 0.04639
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.03514605 4.34916117
## sample estimates:
## mean in group 0 mean in group 1
## 65.23107 63.03891
wilcox.test(test$age ~ test$cohort)
##
## Wilcoxon rank sum test with continuity correction
##
## data: test$age by test$cohort
## W = 105790, p-value = 0.04804
## alternative hypothesis: true location shift is not equal to 0
## GENDER
plotDat(test, "gender", bs = F, mn = "Gender by Cohort", xl = "Cohort", yl = "Proportion")
tmp <- table(test$gender, test$cohort)
prop.test(tmp)
##
## 2-sample test for equality of proportions with continuity
## correction
##
## data: tmp
## X-squared = 0.54146, df = 1, p-value = 0.4618
## alternative hypothesis: two.sided
## 95 percent confidence interval:
## -0.09406183 0.04059508
## sample estimates:
## prop 1 prop 2
## 0.4046392 0.4313725
##Insurance
plotDat(test, "insurance", bs = F, mn = "Insurance by Cohort", xl = "Cohort", yl = "Proportion")
tmp <- table(test$insurance, test$cohort)
chisq.test(tmp)
##
## Pearson's Chi-squared test
##
## data: tmp
## X-squared = 19.607, df = 4, p-value = 0.0005971
pairwiseNominalIndependence(
as.matrix(tmp),
fisher = F, gtest = F, chisq = T, method = "fdr")
## Comparison p.Chisq p.adj.Chisq
## 1 Government : Medicaid 0.168000 0.33600
## 2 Government : Medicare 0.208000 0.34700
## 3 Government : Private 0.911000 0.91100
## 4 Government : Self Pay 0.465000 0.58100
## 5 Medicaid : Medicare 0.662000 0.73600
## 6 Medicaid : Private 0.009750 0.04880
## 7 Medicaid : Self Pay 0.053300 0.16200
## 8 Medicare : Private 0.000678 0.00678
## 9 Medicare : Self Pay 0.064700 0.16200
## 10 Private : Self Pay 0.253000 0.36100
##Ethnicity
##Clean ethnicity
test[(grepl("WHITE|PORTUGUESE", test$ethnicity)),]$ethnicity <- "WHITE"
test[(grepl("ASIAN", test$ethnicity)),]$ethnicity <- "ASIAN"
test[(grepl("BLACK", test$ethnicity)),]$ethnicity <- "BLACK"
test[(grepl("HISPANIC", test$ethnicity)),]$ethnicity <- "HISPANIC"
test[(grepl("MIDDLE|NATIVE|MULTI|DECLINED|UNABLE|OTHER|NOT",test$ethnicity)),]$ethnicity <- "UNKNOWN"
plotDat(test, "ethnicity", bs = F, mn = "Ethnicity by Cohort", xl = "Cohort", yl = "Proportion")
tmp <- table(test$ethnicity, test$cohort)
chisq.test(tmp)
##
## Pearson's Chi-squared test
##
## data: tmp
## X-squared = 29.12, df = 4, p-value = 7.392e-06
pairwiseNominalIndependence(
as.matrix(tmp),
fisher = F, gtest = F, chisq = T, method = "fdr")
## Comparison p.Chisq p.adj.Chisq
## 1 ASIAN : BLACK 2.55e-02 6.38e-02
## 2 ASIAN : HISPANIC 8.45e-01 9.39e-01
## 3 ASIAN : UNKNOWN 5.95e-01 8.42e-01
## 4 ASIAN : WHITE 6.74e-01 8.42e-01
## 5 BLACK : HISPANIC 5.97e-02 1.19e-01
## 6 BLACK : UNKNOWN 2.26e-07 2.26e-06
## 7 BLACK : WHITE 1.01e-04 5.05e-04
## 8 HISPANIC : UNKNOWN 1.83e-01 3.05e-01
## 9 HISPANIC : WHITE 1.00e+00 1.00e+00
## 10 UNKNOWN : WHITE 3.94e-03 1.31e-02
## MARITAL_STATUS
#plotDat(test, "marital_status", bs = F, mn = "Marital Status by Cohort", xl = "Cohort", yl = "Proportion")
#table(test$marital_status, test$cohort)
## Mortality (in-hospital)
plotDat(test, "hospital_expire_flag", bs = F, mn = "In-Hospital Mortality by Cohort", xl = "Cohort", yl = "Proportion")
table(test$hospital_expire_flag, test$cohort)
##
## 0 1
## 0 328 470
## 1 49 51
t.test(test$hospital_expire_flag, test$cohort)
##
## Welch Two Sample t-test
##
## data: test$hospital_expire_flag and test$cohort
## t = -23.991, df = 1522.6, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.5071502 -0.4304890
## sample estimates:
## mean of x mean of y
## 0.1113586 0.5801782
wilcox.test(test$hospital_expire_flag, test$cohort)
##
## Wilcoxon rank sum test with continuity correction
##
## data: test$hospital_expire_flag and test$cohort
## W = 214170, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## Mortality (overall)
plotDat(test, "expire_flag", bs = F, mn = "Overall Mortality by Cohort", xl = "Cohort", yl = "Proportion")
table(test$expire_flag, test$cohort)
##
## 0 1
## 0 167 138
## 1 210 383
t.test(test$expire_flag, test$cohort)
##
## Welch Two Sample t-test
##
## data: test$expire_flag and test$cohort
## t = 3.5107, df = 1791, p-value = 0.000458
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.0353860 0.1249704
## sample estimates:
## mean of x mean of y
## 0.6603563 0.5801782
wilcox.test(test$expire_flag, test$cohort)
##
## Wilcoxon rank sum test with continuity correction
##
## data: test$expire_flag and test$cohort
## W = 435530, p-value = 0.0004659
## alternative hypothesis: true location shift is not equal to 0
## Number of hospital admissions
test <- merge(noteDat, dat, by = "SUBJECT_ID")
colnames(test) <- tolower(colnames(test))
#Grab last admission with fromLast (Note: data are ordered by subject_id and chronologically)
test <- test[(!duplicated(test$subject_id, fromLast = TRUE)),]
boxplot(test$admission_number ~ test$cohort,
main = "ICU Admission Number by cohort (1 = Frequent Flyer)",
xlab = "Cohort",
ylab = "Admission Number")
table(test$admission_number, test$cohort)
##
## 0 1
## 1 267 152
## 2 71 97
## 3 30 103
## 4 5 75
## 5 3 36
## 6 1 24
## 7 0 10
## 8 0 6
## 9 0 6
## 10 0 1
## 11 0 2
## 12 0 2
## 13 0 2
## 14 0 1
## 15 0 1
## 17 0 1
## 23 0 1
## 27 0 1
t.test(test$admission_number, test$cohort)
##
## Welch Two Sample t-test
##
## data: test$admission_number and test$cohort
## t = 23.977, df = 983.99, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 1.684991 1.985387
## sample estimates:
## mean of x mean of y
## 2.4153675 0.5801782
wilcox.test(test$admission_number, test$cohort)
##
## Wilcoxon rank sum test with continuity correction
##
## data: test$admission_number and test$cohort
## W = 697250, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
This survival curve will be generated by normalizing all patients' day 0 to their first hospital admission.
## SURVIVAL
#if no death, time is length of data collection, else it is length from first admission to death
mimicDuration <- as.Date(c("01-June-2001", "01-October-2012"), format = "%d-%B-%Y")
maxSurv <- as.numeric(mimicDuration[2]-mimicDuration[1])
test$time <- ifelse(is.na(test$dod), maxSurv, test$dod - test$intime)
#Data are right censored
fit <- survfit(Surv(time, expire_flag == 1, type = "right") ~ cohort,
data = test)
ggsurvplot(fit, data = test,
risk.table = T,
pval = TRUE,#false?
xlim = c(0,3650),#maxSurv),
break.time.by = 365)
colnames(dis) <- tolower(colnames(dis))
colnames(nsn) <- tolower(colnames(nsn))
disWordCount <- sapply(gregexpr("\\W+", unique(dis$text)), length) + 1
nsnWordCount <- sapply(gregexpr("\\W+", unique(nsn$text)), length) + 1
tmp <- as.data.frame(
cbind(
append(disWordCount, nsnWordCount),
append(rep("Discharges", each = length(disWordCount)),
rep("Nursing Notes", each = length(nsnWordCount))))
)
colnames(tmp) <- c("Word.Count", "Note.Type")
tmp$Word.Count <- as.numeric(as.character(tmp$Word.Count))
print(paste("Discharge Summaries had a median word count of ", summary(disWordCount)[3], " (Q1-Q3: ", summary(disWordCount)[2], '-', summary(disWordCount)[5], ")", sep = ''))
## [1] "Discharge Summaries had a median word count of 1675.5 (Q1-Q3: 1207-2283.75)"
print(paste("Nursing Notes had a median word count of ", summary(nsnWordCount)[3], " (Q1-Q3: ", summary(nsnWordCount)[2], '-', summary(nsnWordCount)[5], ")", sep = ''))
## [1] "Nursing Notes had a median word count of 223 (Q1-Q3: 132-329)"
boxplot(tmp$Word.Count~tmp$Note.Type, ylab = "Word Count", xlab = "Note Type", main = "Word Count by Note Type")
#disVocab <- strsplit(unique(dis$text), split = ' ')
#Keep only alpha numeric
disVocab <- gsub('[^[:alnum:]]', ' ', unique(dis$text))
#Ditch numeric
disVocab <- gsub('[[:digit:]]+', '', disVocab)
#Split for tabulation
disVocab <- strsplit(disVocab, ' ')
disVocab <- lapply(disVocab, tolower)
cat("Discharge summaries have a vocabulary of", (length(unique(unlist(disVocab))) - 1), "unique words.\n") # -1 for ""
## Discharge summaries have a vocabulary of 21794 unique words.
rm(disVocab)
#Keep only alpha numeric
nsnVocab <- gsub('[^[:alnum:]]', ' ', unique(nsn$text))
#Ditch numeric
nsnVocab <- gsub('[[:digit:]]+', '', nsnVocab)
#Split for tabulation
nsnVocab <- strsplit(nsnVocab, ' ')
#tolower case to avoid duplications
nsnVocab <- lapply(nsnVocab, tolower)
length(unique(unlist(nsnVocab))) - 1 # -1 for ""
## [1] 10130
cat("Nursing Notes have a vocabulary of", (length(unique(unlist(nsnVocab))) - 1), "unique words.\n") # -1 for ""
## Nursing Notes have a vocabulary of 10130 unique words.
rm(nsnVocab)
## ###
## opStats accepts the final note data.frame and an operator
## spits out some stats to console
## ###
opStats <- function(dat, operator, dis){
dat <- dat[(dat$operator == operator),]
tmp <- as.character()
if (dis){
tmp <- " unique Discharge Summaries."
} else {
tmp <- " unique Nursing Notes."
}
paste(operator, " has ", nrow(dat),
" observations of ", length(unique(dat$text)),
tmp, sep = '')
#return(dat)
}
opStats(dis, "ETM", T)
## [1] "ETM has 535 observations of 496 unique Discharge Summaries."
opStats(dis, "JTW", T)
## [1] "JTW has 801 observations of 759 unique Discharge Summaries."
opStats(dis, "JFF", T)
## [1] "JFF has 433 observations of 415 unique Discharge Summaries."
opStats(dis, "JWW", T)
## [1] "JWW has 727 observations of 698 unique Discharge Summaries."
opStats(dis, "PAT", T)
## [1] "PAT has 50 observations of 50 unique Discharge Summaries."
opStats(dis, "DAG", T)
## [1] "DAG has 25 observations of 25 unique Discharge Summaries."
opStats(nsn, "ETM", F)
## [1] "ETM has 500 observations of 499 unique Nursing Notes."
opStats(nsn, "JTW", F)
## [1] "JTW has 499 observations of 499 unique Nursing Notes."
opStats(nsn, "JFF", F)
## [1] "JFF has 499 observations of 499 unique Nursing Notes."
opStats(nsn, "JWW", F)
## [1] "JWW has 500 observations of 500 unique Nursing Notes."
#PAT and DAG did not annotate nursing notes
#opStats(nsn, "PAT")
#opStats(nsn, "DAG")
dis <- dis[,c("subject_id",
"batch.id",
"category",
"chart.time",
"real.time",
"cohort",
"hadm_id",
"icu.id",
"note.type",
"text",
"operator",
"none",
"advanced.cancer",
"advanced.heart.disease",
"advanced.lung.disease",
"alcohol.abuse",
"chronic.neurological.dystrophies",
"chronic.pain.fibromyalgia",
"dementia",
"depression",
"developmental.delay.retardation",
"non.adherence",
"obesity",
"other.substance.abuse",
"schizophrenia.and.other.psychiatric.disorders",
"unsure")]
nsn <- nsn[,c("subject_id",
"batch.id",
"category",
"chart.time",
"real.time",
"cohort",
"hadm_id",
"icu.id",
"note.type",
"text",
"operator",
"none",
"advanced.cancer",
"advanced.heart.disease",
"advanced.lung.disease",
"alcohol.abuse",
"chronic.neurological.dystrophies",
"chronic.pain.fibromyalgia",
"dementia",
"depression",
"developmental.delay.retardation",
"non.adherence",
"obesity",
"other.substance.abuse",
"schizophrenia.and.other.psychiatric.disorders",
"unsure")]
percPosBatch <- function(dat, indication, operator, batch, res){
#Subset the appropriate operator
dat <- dat[(dat$operator == operator),]
#Remove duplicate observations (grab the first)
dat <- dat[(!duplicated(dat$text)),]
#Subset the appropriate batch
dat <- dat[(dat$batch.id == batch),]
#keep this value to generate %'s
obs <- nrow(dat)
#Find the column index of the indication (data are built in long format from the start)
indexOne <- which(colnames(dat) == indication)
#count
count <- sum(dat[,indexOne])
#percentage
percent <- paste(round((count/obs)*100,2),'%', sep = '')
res[nrow(res)+1,] <- c(batch, operator, obs, count, percent, indication)
return(res)
}
posFrame <- data.frame()
for (name in c("Batch", "Operator", "Total.Obs", "Count", "Percent", "Indication")) posFrame[[name]] <- as.numeric()
batches <- levels(factor(dis$batch.id))
inds <- colnames(dis)[which(colnames(dis)=="none"):which(colnames(dis)=="unsure")]
for (bat in batches){
for (ind in inds){
posFrame <- percPosBatch(dis, ind, "ETM", bat, posFrame)
}
}
for (bat in batches){
for (ind in inds){
posFrame <- percPosBatch(dis, ind, "JTW", bat, posFrame)
}
}
for (bat in batches){
for (ind in inds){
posFrame <- percPosBatch(dis, ind, "JFF", bat, posFrame)
}
}
for (bat in batches){
for (ind in inds){
posFrame <- percPosBatch(dis, ind, "JWW", bat, posFrame)
}
}
#To clean:
#posFrame <- posFrame[(posFrame$Percent != "NaN%"),]
percPosTot <- function(dat, indication, res){
#Remove duplicate observations (grab the first note;
#would it be better to average the annotators' results?)
dat <- dat[(!duplicated(dat$text)),]
#keep this value to generate %'s
obs <- nrow(dat)
#Find the column index of the indication (data are built in long format from the start)
indexOne <- which(colnames(dat) == indication)
#count
count <- sum(dat[,indexOne])
#percentage
percent <- paste(round((count/obs)*100,2),'%', sep = '')
res[nrow(res)+1,] <- c(obs, count, percent, indication)
return(res)
}
posTotFrame <- data.frame()
for (name in c("Total.Obs", "Count", "Percent", "Indication")) posTotFrame[[name]] <- as.numeric()
inds <- colnames(dis)[which(colnames(dis)=="none"):which(colnames(dis)=="unsure")]
for (ind in inds){
posTotFrame <- percPosTot(dis, ind, posTotFrame)
}
disPosTot <- posTotFrame
disPosTot$NoteType <- rep("Discharge", each = nrow(disPosTot))
disPosTot <- disPosTot[order(disPosTot$Indication),]
row.names(disPosTot) <- 1:nrow(disPosTot)
disPosTot
## Total.Obs Count Percent Indication
## 1 1102 74 6.72% advanced.cancer
## 2 1102 172 15.61% advanced.heart.disease
## 3 1102 88 7.99% advanced.lung.disease
## 4 1102 127 11.52% alcohol.abuse
## 5 1102 187 16.97% chronic.neurological.dystrophies
## 6 1102 150 13.61% chronic.pain.fibromyalgia
## 7 1102 43 3.9% dementia
## 8 1102 267 24.23% depression
## 9 1102 14 1.27% developmental.delay.retardation
## 10 1102 78 7.08% non.adherence
## 11 1102 315 28.58% none
## 12 1102 72 6.53% obesity
## 13 1102 92 8.35% other.substance.abuse
## 14 1102 167 15.15% schizophrenia.and.other.psychiatric.disorders
## 15 1102 150 13.61% unsure
## NoteType
## 1 Discharge
## 2 Discharge
## 3 Discharge
## 4 Discharge
## 5 Discharge
## 6 Discharge
## 7 Discharge
## 8 Discharge
## 9 Discharge
## 10 Discharge
## 11 Discharge
## 12 Discharge
## 13 Discharge
## 14 Discharge
## 15 Discharge
#To hang on to this
#write.csv(posTotFrame, file = "posDisNotesTotal.csv", row.names = FALSE)
#rm(posTotFrame)
attach(posTotFrame)
ggplot(data=posTotFrame,
aes(x=Indication, y=as.numeric(Count)), colour = as.factor(NoteType)) +
geom_bar(stat = "identity", position = "dodge") + coord_flip()
detach(posTotFrame)
## ###
## Nursing Notes
## ###
posFrame <- data.frame()
for (name in c("Batch", "Operator", "Total.Obs", "Count", "Percent", "Indication")) posFrame[[name]] <- as.numeric()
ops <- levels(factor(nsn$operator))
batches <- levels(factor(nsn$batch.id))
inds <- colnames(nsn)[which(colnames(nsn)=="none"):which(colnames(nsn)=="unsure")]
for (bat in batches){
for (ind in inds){
posFrame <- percPosBatch(nsn, ind, "ETM", bat, posFrame)
}
}
for (bat in batches){
for (ind in inds){
posFrame <- percPosBatch(nsn, ind, "JTW", bat, posFrame)
}
}
for (bat in batches){
for (ind in inds){
posFrame <- percPosBatch(nsn, ind, "JFF", bat, posFrame)
}
}
for (bat in batches){
for (ind in inds){
posFrame <- percPosBatch(nsn, ind, "JWW", bat, posFrame)
}
}
posTotFrame <- data.frame()
for (name in c("Total.Obs", "Count", "Percent", "Indication")) posTotFrame[[name]] <- as.numeric()
inds <- colnames(nsn)[which(colnames(nsn)=="none"):which(colnames(nsn)=="unsure")]
for (ind in inds){
posTotFrame <- percPosTot(nsn, ind, posTotFrame)
}
posTotFrame$NoteType <- rep("Nursing", each = nrow(posTotFrame))
posTotFrame
## Total.Obs Count Percent Indication
## 1 1000 693 69.3% none
## 2 1000 24 2.4% advanced.cancer
## 3 1000 46 4.6% advanced.heart.disease
## 4 1000 31 3.1% advanced.lung.disease
## 5 1000 47 4.7% alcohol.abuse
## 6 1000 51 5.1% chronic.neurological.dystrophies
## 7 1000 47 4.7% chronic.pain.fibromyalgia
## 8 1000 25 2.5% dementia
## 9 1000 55 5.5% depression
## 10 1000 5 0.5% developmental.delay.retardation
## 11 1000 48 4.8% non.adherence
## 12 1000 12 1.2% obesity
## 13 1000 22 2.2% other.substance.abuse
## 14 1000 33 3.3% schizophrenia.and.other.psychiatric.disorders
## 15 1000 95 9.5% unsure
## NoteType
## 1 Nursing
## 2 Nursing
## 3 Nursing
## 4 Nursing
## 5 Nursing
## 6 Nursing
## 7 Nursing
## 8 Nursing
## 9 Nursing
## 10 Nursing
## 11 Nursing
## 12 Nursing
## 13 Nursing
## 14 Nursing
## 15 Nursing
#hang on to this
#write.csv(posTotFrame, file = "posNurseNotesTotal.csv", row.names = FALSE)
attach(posTotFrame)
ggplot(data=posTotFrame,
aes(x=Indication, y=as.numeric(Count)), colour = as.factor(NoteType)) +
geom_bar(stat = "identity", position = "dodge") + coord_flip()
detach(posTotFrame)
Cohen's Kappa was used to measure inter-operator agreement for discharge summaries (nursing notes to follow). Note: Kappa is undefined (NaN) for batch and indication combinations in which an indication shows no variation (for example, neither annotator marks any note positive), since the expected chance agreement is then 1.
kappaCalc <- function(x){
po <- (x[1,1]+x[2,2])/sum(x) #Observed proportional agreement
peA <- (sum(x[,1])/sum(x)) #Marginal proportion of positives for one annotator
peB <- (sum(x[1,])/sum(x)) #Marginal proportion of positives for the other annotator
pe1 <- (sum(x[,2])/sum(x)) #Marginal proportion of negatives for one annotator
pe2 <- (sum(x[2,])/sum(x)) #Marginal proportion of negatives for the other annotator
prpos <- peA*peB #Probability of random agreement on a positive
prneg <- pe1*pe2 #Probability of random agreement on a negative
pre <- prpos+prneg #Total probability of agreement by chance
k <- (po-pre)/(1-pre) #Kappa calculation
return(k)
}
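As a quick sanity check, kappaCalc() can be applied to a hypothetical 2x2 agreement table (toy counts, not annotation data).
#Toy table: 20 notes marked positive by both annotators, 15 negative by both,
#10 where only the first annotator marks positive, and 5 where only the second does
toyTab <- matrix(c(20, 5, 10, 15), nrow = 2)
kappaCalc(toyTab)
#Observed agreement is 0.7 and expected chance agreement is 0.5, so kappa = (0.7 - 0.5)/(1 - 0.5) = 0.4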
## ###
## agreement() tabulates paired annotations for a team, batch, and indication and appends a kappa row to res
## ###
agreement <- function(dat, indication, team, batch, res){
#Remove arbitrators
dat <- dat[(dat$operator != "DAG"),]
dat <- dat[(dat$operator != "PAT"),]
#Subset the appropriate batch
dat <- dat[(dat$batch.id == batch),]
#Library of unique HADM.IDs
lib <- unique(dat$hadm_id)
#Find index of indication
indexOne <- which(colnames(dat) == indication)
#Create a 2x2 agreement table (rows = first annotator's label, columns = second annotator's label)
m <- matrix(0, nrow=2, ncol=2)
#Hold the operators for this team comparison
opTmp <- sort(team)
#Loop through, populating the agreement table (one count per doubly annotated admission)
for (i in 1:length(lib)){
#Subset by hadm id
tmp <- dat[(dat$hadm_id == lib[i]),]
#Check that there are exactly two annotations, by two different operators, from the requested team
if (nrow(tmp) == 2 & (tmp$operator[1] != tmp$operator[2]) & (all(team %in% tmp$operator))){
if(tmp[1,indexOne] == 1 & tmp[2,indexOne] == 1){
m[1,1] <- m[1,1] + 1
} else if (tmp[1, indexOne] == 1 & tmp[2, indexOne] == 0){
m[1,2] <- m[1,2] + 1
} else if (tmp[1, indexOne] == 0 & tmp[2, indexOne] == 1){
m[2,1] <- m[2,1] + 1
} else if (tmp[1, indexOne] == 0 & tmp[2, indexOne] == 0){
m[2,2] <- m[2,2] + 1
}
}
}
res[nrow(res)+1,] <- c(batch,paste(opTmp[1],opTmp[2], sep = " & "), kappaCalc(m), indication)
return(res)
}
kappaFrame <- data.frame()
for (name in c("Batch", "Operators", "Kappa", "Indication")) kappaFrame[[name]] <- as.numeric()
inds <- colnames(dis)[which(colnames(dis)=="none"):which(colnames(dis)=="unsure")]
batches <- levels(factor(dis$batch.id))
for (bat in batches){
for (ind in inds){
kappaFrame <- agreement(dis, ind, c("ETM", "JTW"), bat, kappaFrame)
}
}
for (bat in batches){
for (ind in inds){
kappaFrame <- agreement(dis, ind, c("JFF", "JWW"), bat, kappaFrame)
}
}
#Batches to dates
kappaFrame$Batch <- as.Date(kappaFrame$Batch, format = "%d%b%y")
#ejFrame <- kappaFrame[(kappaFrame$Operators == "ETM & JTW"),]
#jjFrame <- kappaFrame[(kappaFrame$Operators == "JFF & JWW"),]
kap <- read.csv(list.files()[grepl("Kappa", list.files())],
header = T, stringsAsFactors = F)
kap
## Operators Kappa Indication
## 1 ETM & JTW 0.8341776 Advanced.Cancer
## 2 ETM & JTW 0.8199485 Advanced.Heart.Disease
## 3 ETM & JTW 0.8053511 Advanced.Lung.Disease
## 4 ETM & JTW 0.8557267 Alcohol.Abuse
## 5 ETM & JTW 0.7136757 Chronic.Neurological.Dystrophies
## 6 ETM & JTW 0.8327701 Chronic.Pain.Fibromyalgia
## 7 ETM & JTW 0.9518131 Dementia
## 8 ETM & JTW 0.8534775 Depression
## 9 ETM & JTW 1.0000000 Developmental.Delay.Retardation
## 10 ETM & JTW 0.7775651 Non.Adherence
## 11 ETM & JTW 0.8379444 None
## 12 ETM & JTW 0.9406413 Obesity
## 13 ETM & JTW 0.8619174 Other.Substance.Abuse
## 14 ETM & JTW 0.9081054 Schizophrenia.and.other.Psychiatric.Disorders
## 15 ETM & JTW 0.6266989 Unsure
## 16 JFF & JWW 0.8688963 Advanced.Cancer
## 17 JFF & JWW 0.5611655 Advanced.Heart.Disease
## 18 JFF & JWW 0.6538844 Advanced.Lung.Disease
## 19 JFF & JWW 0.8417000 Alcohol.Abuse
## 20 JFF & JWW 0.7002536 Chronic.Neurological.Dystrophies
## 21 JFF & JWW 0.7305602 Chronic.Pain.Fibromyalgia
## 22 JFF & JWW 0.9466168 Dementia
## 23 JFF & JWW 0.9450693 Depression
## 24 JFF & JWW 0.8685472 Developmental.Delay.Retardation
## 25 JFF & JWW 0.6935544 Non.Adherence
## 26 JFF & JWW 0.6699291 None
## 27 JFF & JWW 0.9676517 Obesity
## 28 JFF & JWW 0.8816716 Other.Substance.Abuse
## 29 JFF & JWW 0.9396331 Schizophrenia.and.other.Psychiatric.Disorders
## 30 JFF & JWW 0.5053215 Unsure
Define a function to generate the indication correlation data used for these plots.
library("reshape2")
library("ggplot2")
comat <- function(dat){
#Find indices
inds <- colnames(dat)[which(colnames(dat)=="none"):which(colnames(dat)=="unsure")]
#Subset indications
dat <- dat[,inds]
#Find correlation, pearson by default, can have method = "pearson", "kendall", or "spearman"
cormat <- cor(dat, method = "pearson")
get_upper_tri <- function(cormat){
cormat[lower.tri(cormat)]<- NA
return(cormat)
}
#The matrix is symmetric across the diagonal so only one half is necessary
upper_tri <- get_upper_tri(cormat)
#Remove the diagonal values (each indication is perfectly correlated with itself)
diag(upper_tri) <- NA
#Melt for long data
melted_cormat <- melt(upper_tri, na.rm = TRUE)
colnames(melted_cormat) <- c("Indication.1", "Indication.2", "Value")
return(melted_cormat)
}
melted_cormat <- comat(dis)
ggplot(data = melted_cormat, aes(Indication.1, Indication.2, fill = Value))+
geom_tile(color = "white") +
scale_fill_gradient2(low = "blue", high = "red", mid = "white",
midpoint = 0, limit = c(-1,1), space = "Lab",
name="Pearson\nCorrelation") +
theme_minimal()+
theme(axis.text.x = element_text(angle = 45, vjust = 1,
size = 10, hjust = 1)) + labs(title = "Correlation Matrix of Indications\n (Discharge Summaries)") +
coord_fixed()
melted_cormat <- comat(nsn)
ggplot(data = melted_cormat, aes(Indication.1, Indication.2, fill = Value))+
geom_tile(color = "white") +
scale_fill_gradient2(low = "blue", high = "red", mid = "white",
midpoint = 0, limit = c(-1,1), space = "Lab",
name="Pearson\nCorrelation") +
theme_minimal()+
theme(axis.text.x = element_text(angle = 45, vjust = 1,
size = 10, hjust = 1)) + labs(title = "Correlation Matrix of Indications\n (Nursing Notes)") +
coord_fixed()