# Uploading the dataset
df <- read.csv( "C:\\Users\\Aya Awad\\OneDrive - Harvard University\\J term 23\\R workshop\\training_v2.csv")
Provide a brief description of the dataset prior to any manipulation. There are a lot of variables, please do not describe each. Instead:
The dataframe contain 186 variables (columns) and 91713 observations (rows).
# Examining datatypes
str(df)
library(cowplot)
# Plotting a histogram and a boxplot of the variable height
a <- ggplot(df, aes(height))+ geom_histogram(alpha=1/2,bins=20,color='black', fill='#6F8FAF')
b <- ggplot(df, aes(y=height)) + geom_boxplot(alpha = 1/2, color='black', fill='#6F8FAF')
plot_grid(a,b, ncol = 2)
# Summary statistics of variable "height"
print(summary(df$height))
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 137.2 162.5 170.1 169.6 177.8 195.6 1334
# Number of missing values
print(sum(is.na(df$height)))
## [1] 1334
The variable is of type numeric. It reports the height of the individuals in cm. It seems to follow a normal distribution, with a mean of 169.6, the minimum value is 137.2, the maximal value is 195.6. No outliers are detected. The number of missing values is 1334.
# Summary statistics of variable "height"
print(summary(df$ethnicity))
## Length Class Mode
## 91713 character character
# Number of missing values
print(sum(is.na(df$ethnicity)))
## [1] 0
# Unique values
df2=df %>% group_by(ethnicity) %>% summarize(n=n())
print(df2)
## # A tibble: 7 × 2
## ethnicity n
## <chr> <int>
## 1 "" 1395
## 2 "African American" 9547
## 3 "Asian" 1129
## 4 "Caucasian" 70684
## 5 "Hispanic" 3796
## 6 "Native American" 788
## 7 "Other/Unknown" 4374
# Count plot of ethnicity
ggplot(df, aes(x = ethnicity)) +
geom_bar(color = "#000000", fill = "#89CFF0", alpha=1/2) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
The variable is of type char, it has 6 unique values (African American, Asian, Caucausian, Hispanic, Native American, other). 1395 observations are missing. The most prevalent ethnicity in the given population is Caucasian followed by African American.
# Bringing the two variables together.
# BoxPlot of Height by ethnicity
df3 <- subset(df, !is.na(height))
df3 %>% group_by(ethnicity) %>% summarize (n=n(), mean_height=mean(height))
## # A tibble: 7 × 3
## ethnicity n mean_height
## <chr> <int> <dbl>
## 1 "" 1242 171.
## 2 "African American" 9409 171.
## 3 "Asian" 1120 162.
## 4 "Caucasian" 69764 170.
## 5 "Hispanic" 3759 166.
## 6 "Native American" 780 168.
## 7 "Other/Unknown" 4305 167.
ggplot(df3, aes(x = ethnicity, y = height)) +
geom_boxplot(alpha=0.5,fill = "#5F9EA0") + theme(axis.text.x = element_text(angle = 45, hjust = 1))+theme_classic()+labs(x='Ethnicity', y='Height')
The variable is numeric integer. No missing values are detected. It contains 147 unique values, meaning patients from 147 different hospitals are included in this population.
# Data type of hospital_id
class(df$hospital_id)
## [1] "integer"
# Missing values
print(sum(is.na(df$hospital_id)))
## [1] 0
# Number of unique values
Number_hospitals <- length(unique(df$hospital_id))
print(Number_hospitals)
## [1] 147
The variable is of type integer. It has no missing values, and 2 value counts: 1==Death (7915 observations), 0== no_death (83798 observations).
# Data type of hospital_id
class(df$hospital_death)
## [1] "integer"
# Missing values
print(sum(is.na(df$hospital_death)))
## [1] 0
# Value counts
table(df$hospital_death)
##
## 0 1
## 83798 7915
# Plotting the values
ggplot(df, aes(hospital_death)) +
geom_bar(stat = "count",alpha=0.5,color='black',fill = "#5F9EA0" )+labs(x='Hospital Death')
The variable is of type numeric. 7947 values are not reported (missing). The values range between (-1) and (0.99), which doesn’t make sense since we are reporting probability (expected values between 0-1). There is 2371 observation with a score < 0.
# Data type of hospital_id
class(df$apache_4a_hospital_death_prob)
# Missing values
print(sum(is.na(df$apache_4a_hospital_death_prob)))
# Value counts
summary(df$apache_4a_hospital_death_prob)
# Filtering for negative values
negative_prob <- df %>% filter(apache_4a_hospital_death_prob<0)
print(negative_prob)
We will now apply some criteria for inclusion or exclusion in our study. a. Exclude any patient without (i.e., missing or ‘-1’ for the prediction) a hospital_death outcome or an apache_4a_hospital_death_prob prediction. b. Include only patients from hospitals (identified by hospital_id) with over 1000 admissions (as defined after 2a.) contributed to the dataset (Hint: There are 24 of them). c. Document the number of admissions now included in the dataset.
# Applying exclusion criteria
# Excluding negative prediction and missing values of 'apache_4a_hospital_death_prob'
df1 <- df %>% filter(apache_4a_hospital_death_prob>=0) %>% drop_na(apache_4a_hospital_death_prob)
# Filtering for hospitals that contributed >1000 admission.
df1 <- df1 %>% group_by(hospital_id) %>% filter(n()>1000)
# Checking the hint given- how many hospitals are included.
length(unique(df1$hospital_id))
# ungrouping the dataframe
df1 <- df1 %>% ungroup()
# Documenting the number of admissions now in the dataset
number_admissions <- nrow(df1)
The number of admissions after applying the exclusion criteria is 44605
Often researchers will want to examine how the mortality rate varies in particular sub-populations. Mortality rate is simply the number of hospital deaths divided by the total number of admissions. Using the subset of data created in 2):
# Total number on in hospital deaths
number_hospital_deaths <- sum(df1$hospital_death==1)
The number of in hospital deaths is 4159
# Overall mortality rate
mort_rate <- number_hospital_deaths/length(df1$hospital_death)
The mortality rate in the given patient population is 0.0932407
# An alternative calculation
mort_rate1 <- mean(df1$hospital_death)
#Create a new column with (0,1) values whether serum lactate test was done or not.
df1$lactate_test <- ifelse(is.na(df1$d1_lactate_max), 0, 1)
mortality_lactate <- df1 %>% group_by(lactate_test=as.factor(lactate_test)) %>%
summarise(n=n(),mort_rate2=mean(hospital_death))
ggplot(mortality_lactate,aes(lactate_test,mort_rate2)) +
geom_point() + labs(x='Was lactate test taken?', y='Mortality Rate')
We can see that patients who had their lactate test taken have a higher mortality rate.
df1$apache_3j_bodysystem <- replace(df1$apache_3j_bodysystem, df1$apache_3j_bodysystem == "", "Unknown")
mort_apache3 <- df1 %>% group_by(apache_3j_bodysystem= as.factor(apache_3j_bodysystem)) %>%
summarise(mort_rate=mean(hospital_death)) %>% arrange() %>% as.data.frame() %>% kable()
print(mort_apache3)
##
##
## |apache_3j_bodysystem | mort_rate|
## |:--------------------|---------:|
## |Cardiovascular | 0.0849199|
## |Gastrointestinal | 0.0788897|
## |Genitourinary | 0.0762332|
## |Gynecological | 0.0000000|
## |Hematological | 0.1140065|
## |Metabolic | 0.0218688|
## |Musculoskeletal/Skin | 0.0590717|
## |Neurological | 0.0842798|
## |Respiratory | 0.1253529|
## |Sepsis | 0.1664507|
## |Trauma | 0.0773601|
## |Unknown | 0.0095238|
# Examining the values of ICU admission source
table(df1$icu_admit_source)
mort_icuadmit <- df1 %>% group_by(icu_admit_source= as.factor(icu_admit_source)) %>%
summarise(mort_rate=mean(hospital_death)) %>% arrange() %>% as.data.frame() %>% kable()
print(mort_icuadmit)
##
##
## |icu_admit_source | mort_rate|
## |:-------------------------|---------:|
## |Accident & Emergency | 0.0984267|
## |Floor | 0.1439737|
## |Operating Room / Recovery | 0.0373090|
## |Other Hospital | 0.1318210|
## |Other ICU | 0.0588235|
df1 %>% group_by(apache_3j_bodysystem) %>%
summarise(mort_rate=mean(hospital_death)) %>% arrange() %>% ggplot(aes(apache_3j_bodysystem,mort_rate)) +
geom_point(color = "#00008B")+ theme(axis.text.x = element_text(angle = 45, hjust = 1)) +labs(x = "APACHE score by Body system", y= 'Mortality Rate')
# A display with confidence interval
# The confidence line interval Code is based on the workshop 11 provided by this class.
df1 %>% group_by(apache_3j_bodysystem) %>%
summarise(mort_rate=mean(hospital_death),n=n(), lb = mort_rate - qnorm(0.975)*sqrt(mort_rate*(1-mort_rate)/n),
ub = mort_rate + qnorm(0.975)*sqrt(mort_rate*(1-mort_rate)/n)) %>% arrange() %>%
ggplot(aes(apache_3j_bodysystem,mort_rate)) + geom_errorbar(aes(ymin=lb,ymax=ub),width=0.2,position=position_dodge(width = .2))+
geom_point(color = "#00008B") + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + labs(x = "APACHE by BodySystem", y= 'Mortality Rate')
df1 %>% group_by(icu_admit_source= as.factor(icu_admit_source)) %>%
summarise(mort_rate=mean(hospital_death)) %>% arrange() %>% ggplot(aes(icu_admit_source,mort_rate)) +
geom_point(color = "#00008B") + theme(axis.text.x = element_text(angle = 45, hjust = 1)) +labs(x = "ICU Admission Source", y= 'Mortality Rate')
mort_hospID <- df1 %>% group_by(hospital_id= as.factor(hospital_id)) %>%
summarise(mort_rate=mean(hospital_death)) %>% arrange(desc(mort_rate)) %>% as.data.frame() %>% kable()
print(mort_hospID)
##
##
## |hospital_id | mort_rate|
## |:-----------|---------:|
## |76 | 0.1485765|
## |100 | 0.1340206|
## |176 | 0.1239389|
## |21 | 0.1230254|
## |204 | 0.1128440|
## |194 | 0.1123338|
## |174 | 0.1069418|
## |89 | 0.1049020|
## |161 | 0.1016608|
## |18 | 0.0984570|
## |19 | 0.0981546|
## |79 | 0.0938053|
## |70 | 0.0862964|
## |196 | 0.0834008|
## |138 | 0.0813076|
## |116 | 0.0794574|
## |157 | 0.0784604|
## |188 | 0.0779848|
## |55 | 0.0716292|
## |62 | 0.0715361|
## |39 | 0.0689115|
## |118 | 0.0676270|
## |185 | 0.0621951|
## |2 | 0.0594855|
df1 %>% group_by(hospital_id= as.factor(hospital_id)) %>%
summarise(mort_rate=mean(hospital_death)) %>% arrange(mort_rate) %>% ggplot(aes(mort_rate)) +
geom_histogram(bins=30,color = "#000000", fill = "#89CFF0", alpha=0.6) + theme(axis.text.x = element_text(angle = 45, hjust = 1)) +labs(x='Mortality Rate')
As mentioned above, the APACHE IVa predictions estimate the probability of death for a patient based on characteristics taken during the first 24 hours of an ICU stay. Hospitals try to monitor their quality-of-care by benchmarking themselves against these predictions. Benchmarking can be done by computing the expected number of deaths for a hospital’s patients, and comparing it to the actual number of deaths observed. This levels the playing field for hospitals who take on the sickest patients, because their simple mortality rates will be higher based on the population of patients they serve, regardless of the quality-of-care they provide. Usually, the observed or actual death counts are divided by the expected number of deaths (often denoted O/E or A/E) to compute a standardized mortality ratio (or SMR). The following function will compute the SMR given a vector of deaths and a vector of probabilities of death.
SMR <- function(deaths,prob) {return(sum(deaths)/sum(prob))}
An SMR of 1 means that the institution observes about as many deaths as would be expected if the institution’s patients were seen by the hospitals used to develop APACHE-IVa. Less than 1, generally corresponds that the hospital observes fewer than expected deaths, and greater than one, the converse. If dat is our data.frame created in question 2), we can compute the overall SMR in this population as:
df1 %>% summarise(SMR=SMR(hospital_death,apache_4a_hospital_death_prob))
## # A tibble: 1 × 1
## SMR
## <dbl>
## 1 0.749
This would suggest that in the hospitals we have included have, on average, about 25% fewer deaths than the population from which APACHE IVa was developed upon. This could be due to a variety of reasons, chief among them being that APACHE IVa was developed a decade ago, and patient care has steadily improved over the intervening time period.
SMR_table <- df1 %>% group_by(hospital_id) %>% summarize(Sample_Size=n(), SMR=SMR(hospital_death,apache_4a_hospital_death_prob)) %>% arrange(SMR) %>% as.data.frame() %>% kable()
print(SMR_table)
##
##
## | hospital_id| Sample_Size| SMR|
## |-----------:|-----------:|---------:|
## | 2| 1244| 0.4755173|
## | 70| 2503| 0.5350905|
## | 185| 1640| 0.5436810|
## | 55| 1424| 0.5449882|
## | 89| 1020| 0.6257676|
## | 157| 1351| 0.6476051|
## | 116| 1032| 0.6515176|
## | 188| 2898| 0.6771333|
## | 18| 1361| 0.6814137|
## | 196| 2470| 0.6939766|
## | 79| 1130| 0.7315895|
## | 118| 4096| 0.7527788|
## | 174| 2132| 0.7777854|
## | 39| 1277| 0.7782101|
## | 62| 1328| 0.7808005|
## | 19| 3739| 0.7886029|
## | 161| 1987| 0.7987347|
## | 204| 1090| 0.8278926|
## | 194| 2181| 0.8416352|
## | 76| 1124| 0.8504354|
## | 100| 1940| 0.8952860|
## | 138| 1193| 0.9507008|
## | 21| 2089| 0.9601375|
## | 176| 2356| 1.0554471|
# Creating a histogram of the SMRs and density plot
df1 %>% group_by(hospital_id) %>% summarize(Sample_Size=n(), SMR=SMR(hospital_death,apache_4a_hospital_death_prob)) %>% arrange(SMR) %>% ggplot(aes(SMR))+geom_histogram(bins=30,color = "#000000", fill = "#89CFF0", alpha=0.6) +labs(x='Standardized Mortality Ratio', y='Count')+ geom_density(color = "#000000", fill = "#F85700", alpha = 0.1)
df1 %>% group_by(hospital_id=as.factor(hospital_id)) %>% summarize(Sample_Size=n(), SMR=SMR(hospital_death,apache_4a_hospital_death_prob),mort_rate=mean(hospital_death)) %>% arrange(SMR) %>% ggplot(aes(x=SMR, y=mort_rate,color=hospital_id))+ geom_point()+ labs(x= 'Standardized Mortality Ratio (SMR)', y='Mortality Rate') +guides(color = guide_legend(title = "Hospital ID"))+ geom_smooth()
Briefly summarize your work. Make sure to note:
The groups of patients at highest and lowest risk of in-hospital death.
If you think if some hospitals provide higher quality-of-care than others, at least in the context we examined here.
In summary, we were able to successfully examine and compare mortality rates among a number of different subpopulations included in the dataset provided. One particular subpopulation that was associated with a higher risk of mortality was patients that received a serum lactate test. Moreover, when mortality rates were compared on the basis of body systems, the three groups of patients at the highest risk of in-hospital death comprised of septic, respiratory and hematological patients. Conversely, the three groups of patients at lowest risk of in-hospital death were gynecological, unknown, and metabolic patients. Upon comparison of mortality rate based on admission source, patients with an ICU admission source of the floor or other hospitals had the highest mortality while patients with an ICU admission source of OR/Recovery or other ICU had the lowest mortality rate.
Upon calculation of standardized mortality rates SMR) for each hospital and comparing these along with the mortality rates (MR) of each hospital, I believe there are hospitals within our dataset that provide higher quality-of-care than others. Specifically, hospitals such as 2 and 18 have substantially lower SMR and MR values than several other hospitals included in the dataset, suggesting that these hospitals are responsible for providing care that has averted larger amounts of expected patient deaths among their patient populations on average.