In today’s post we will be doing some basic EDA (Exploratory Data Analysis) with the synthetic COVD data set provided by https://synthea.mitre.org/downloads. Additional information about the data can be found at https://github.com/synthetichealth/synthea/wiki/CSV-File-Data-Dictionary.
As always, let’s begin with the packages, functions and a little supplemental data…
library(tidyverse)
library(data.table)
library(plotly)
library(leaflet)
library(sf)
library(colorspace)
# Massachusetts County Population Data #https://worldpopulationreview.com/us-counties/states/ma
pop<- data.frame(
stringsAsFactors = FALSE,
COUNTY = c("Middlesex County",
"Worcester County","Suffolk County",
"Essex County","Norfolk County","Bristol County",
"Plymouth County","Hampden County","Barnstable County",
"Hampshire County","Berkshire County",
"Franklin County","Dukes County","Nantucket County"),
Population = c(1614710,830839,
807252,790638,705388,564022,518132,470406,
213413,161355,126348,70963,17352,11327))
death_rate_table<- function(thevar, titleText){
# Plotting function that creates a barplot specifically to compare death rates by a given factor
PID$thevar<- thevar
t0<- PID %>% group_by(thevar,death_status) %>% tally() %>% spread(death_status,n) #%>% print()
t1<- PID %>% group_by(thevar) %>% summarise(Total= n()) %>% left_join(t0) #%>% print()
t1<- t1 %>% mutate(Death_rate= Died/Total, label= paste0(round(Death_rate*100,1),"%"))
ymid <- mean(range(t1$Death_rate))
t1$Ordered<- fct_reorder(t1$thevar, t1$Death_rate)
g<- ggplot(t1, aes(x= Death_rate, y= Ordered, fill= Death_rate)) +
geom_col(color= "gray50", size= 0.25) +
geom_text(aes(label=label,hjust = ifelse(Death_rate < ymid, -0.5, 1.5)), vjust= 0.5, fontface="bold") +
scale_fill_gradient(low= "#7a5195",high= "#f95d6a") +
scale_x_continuous( expand = c(0, 0)) +
theme_bw() +
theme(legend.position = 'none',
plot.margin = margin(0.5, 0.5, 0.5, 0.5, "cm"),
panel.grid.major.y = element_blank(),
panel.border= element_blank(),
axis.ticks.y= element_blank(),
axis.text.y = element_text(size= 10)) +
labs(title= paste("Death Rate by",titleText), x= "\nDeath Rate", y= NULL)
print(g)
return(t1)
}
# Hospitals (A data set of hospitals in Massachusetts - derived from the main set)
hospitals<- readRDS("hospitals.rds")
# Some utility functions
putZeros<- function(OB) {
# Convert NAs in a matrix to zeros
OB<- OB %>% mutate_at(vars(-group_cols()),~replace(.,is.na(.),0))
return(OB)
}
label_cols<- function(sett,label,before="before") {
# Add labels to all but the first column of a data.frame
P<- ncol(sett)
if (before!= "before") {
names(sett)[2:P]<- paste0(names(sett)[2:P],label)
} else {
names(sett)[2:P]<- paste0(label,names(sett)[2:P])
}
return(sett)
}
howmany<- function(thevar) {
# Count how many unique values in a vector
res<- length(unique(thevar))
return(res)
}
Today we are picking up our data set that we cleansed in a previous session. Now, the purpose of EDA is really just to get an overall feel for what the data is doing. In particular, we might ask questions like:
IP<- readRDS("Dev Data 0/IP_0.rds") # 1766943
# Create a patient summary set
PID<- IP %>% select(PID, Status, age, sex, ethnicity_fac, allergy, CITY, COUNTY, ZIP, LAT, LON, PATIENT) %>% distinct() # 9040
# Some enrichment
PID$death_status<- ifelse(PID$Status==0, "Alive", "Died")
PID$sex_fac<- factor(PID$sex, labels= c("Male", "Female"))
PID$age_group<- floor(PID$age/10)*10
# Make some tidy labels for age group
age_labels<- names(table(PID$age_group))
age_labels<- paste(age_labels,c(age_labels[-1],"+"))
age_labels<- ifelse(age_labels %like% "\\+", gsub(" ","",age_labels), gsub(" ","-",age_labels)) %>% print()
[1] "0-10" "10-20" "20-30" "30-40" "40-50" "50-60" "60-70" "70-80" "80-90" "90-100" "100+"
PID$age_fac<- ordered(factor(PID$age_group), labels= age_labels)
table(PID$age_fac)
0-10 10-20 20-30 30-40 40-50 50-60 60-70 70-80 80-90 90-100 100+
950 1128 1239 1145 1190 1357 981 598 269 98 85
# Super basic initial plot of patient locations
ggplot(PID, aes(x= LON, y=LAT, color= death_status))+
geom_point(alpha= 0.3, size= 1) +
theme_void() +
labs(title= "Patient locations before any styling")
We can pull patient encounters from the active period, and look at the date at which each patient is first diagnosed. We can then use the cumsum function to derive the cumulative infection curve.
# Pull out records of patient encounters across the active period
encounters<- IP %>% filter(!type %like% "history") %>% left_join(hospitals)
encounters<- encounters %>% mutate(death_status= ifelse(Status==0,"Alive", "Died"),
COST= ifelse(is.na(TOTALCOST), BASE_COST, TOTALCOST),
age_group= floor(age/10)*10,
age_group= factor(age_group, labels= age_labels),
description= str_to_lower(DESCRIPTION))
encounters<- encounters %>% mutate(COVID= ifelse(description == "covid-19", 1, 0))
# Infection Rates
diags<- encounters %>% filter(COVID==1) %>% group_by(PID) %>% summarise(etime= min(etime))
t0<- diags %>% count(etime) %>% mutate(cumulative= cumsum(n))
plot(t0$etime,t0$cumulative, type="l", main= "Overall Infections", xlab= "Day of the active period",
ylab= "Cumulative Infections")
This curve is not very informative, lets split the infections by County…
# Infection Rates by County
diags<- encounters %>% filter(COVID==1) %>% group_by(PID,COUNTY) %>% summarise(etime= min(etime)) %>% ungroup()
t0<- diags %>% count(etime,COUNTY) %>% spread(COUNTY,n) %>% putZeros()
cummat<- sapply(t0[,2:15], cumsum) %>% data.frame() %>% mutate(etime= t0$etime)
clong<- cummat %>% gather(key, value, -etime) %>% mutate(key= gsub("\\."," ", key))
clong<- left_join(clong, pop, by=c("key"="COUNTY"))
clong<- clong %>% mutate(per_100k = value/Population * 100000,
County= word(key, 1))
g<- ggplot(clong, aes(x= etime, y= per_100k, color= County)) +
geom_line() +
labs(x= "Day of the active period", y= "Infections per 100k population",
title= "Infection Rate by County") +
theme_bw()
ggplotly(g)
Next we explore the total pathway cost (across the one-year active period) for each patient, broken down by age group, and whether or not the patient ultimately died.
# Summarise patient pathways
t0<- encounters %>% group_by(PID, COUNTY, age_group, death_status) %>%
summarise(COST= sum(COST,na.rm = T), length= max(etime, na.rm=T))
# Average cost by age group
t1<- t0 %>% group_by(age_group, death_status) %>% summarise(Average= mean(COST)) %>%
mutate(Label= paste0(round(Average/1000),"k"))
ggplot(t1, aes(x= Average, y= factor(age_group), fill= death_status)) +
geom_col(position= "dodge") +
geom_text(aes(label= Label), position= position_dodge(width= 1), hjust= 1.2, fontface= "bold", color="white") +
labs(title= "Total Cost by Age Group and Death Outcome",
subtitle= "Average Total Cost across the 1-year active period per patient",
x= "Average Overall Cost", y= "Age Group", fill= "Death Outcome") +
scale_fill_manual(values=c("#7a5195","#f95d6a")) +
theme_bw()
Next we try out our generic death rate function across various factors that exist in the data. It seems that Men have a slightly higher risk of death from COVID than women, and that patients who are African American might be at higher risk than White or Asian patients.
# Visual EDA of Death Rate by various factors
death_rate_table(PID$sex_fac, "Sex")
death_rate_table(PID$ethnicity_fac, "Ethnicity")
death_rate_table(PID$age_fac, "Age Group")
death_rate_table(PID$COUNTY, "County")
Because latitude and longitude data has been provided for each patient in the data set, it is an excellent opportunity to try out some geo-spatial tools. Please note that because the data has been synthesized from real data, sometimes the randomness inserted into the data makes the points lay outside the State or outside the land!
I also found a handy shapefile online (link below) with the county boundaries of Massachusetts. We also have a set of Hospitals, who coordinates are also approximate. Originally these plots were done with leaflet, however I was not able to get those to upload to RPubs from my mac.
# Download a shapefile
# http://maps-massgis.opendata.arcgis.com/datasets/aca1bbc8e91e4970b24bc4db8311e6f6_0
csh<- st_read("~/Documents/Synthea/Massachusetts_Counties-shp/Massachusetts_Counties.shp")
Reading layer `Massachusetts_Counties' from data source `/Users/celeste/Documents/Synthea/Massachusetts_Counties-shp/Massachusetts_Counties.shp' using driver `ESRI Shapefile'
Simple feature collection with 14 features and 3 fields
geometry type: MULTIPOLYGON
dimension: XY
bbox: xmin: -73.50821 ymin: 41.23876 xmax: -69.92809 ymax: 42.88679
CRS: 4326
csh %>%
leaflet() %>%
addTiles() %>%
# set color to green and create Wealth Zipcodes group
addPolygons(weight = 1, fillOpacity = .3, color = "green",
label = ~COUNTY,
highlightOptions = highlightOptions(weight = 5, color = "white", bringToFront = FALSE)) %>%
addCircleMarkers(data= hospitals, radius= 2, color= "yellow",
popup = hospitals$HOS_KEY)