In today’s post we will be doing some basic EDA (Exploratory Data Analysis) with the synthetic COVD data set provided by https://synthea.mitre.org/downloads. Additional information about the data can be found at https://github.com/synthetichealth/synthea/wiki/CSV-File-Data-Dictionary.

As always, let’s begin with the packages, functions and a little supplemental data…

library(tidyverse)
library(data.table)
library(plotly)
library(leaflet)
library(sf)
library(colorspace)

# Massachusetts County Population Data #https://worldpopulationreview.com/us-counties/states/ma
pop<- data.frame(
  stringsAsFactors = FALSE,
  COUNTY = c("Middlesex County",
             "Worcester County","Suffolk County",
             "Essex County","Norfolk County","Bristol County",
             "Plymouth County","Hampden County","Barnstable County",
             "Hampshire County","Berkshire County",
             "Franklin County","Dukes County","Nantucket County"),
  Population = c(1614710,830839,
                 807252,790638,705388,564022,518132,470406,
                 213413,161355,126348,70963,17352,11327))


death_rate_table<- function(thevar, titleText){
  # Plotting function that creates a barplot specifically to compare death rates by a given factor
  PID$thevar<- thevar
  t0<- PID %>% group_by(thevar,death_status) %>% tally() %>% spread(death_status,n) #%>% print()
  t1<- PID %>% group_by(thevar) %>% summarise(Total= n()) %>% left_join(t0) #%>% print()
  t1<- t1 %>% mutate(Death_rate= Died/Total, label= paste0(round(Death_rate*100,1),"%"))
  ymid <- mean(range(t1$Death_rate))
  t1$Ordered<- fct_reorder(t1$thevar, t1$Death_rate)
  g<- ggplot(t1, aes(x= Death_rate, y= Ordered, fill= Death_rate)) +
    geom_col(color= "gray50", size= 0.25) +
    geom_text(aes(label=label,hjust = ifelse(Death_rate < ymid, -0.5, 1.5)), vjust= 0.5, fontface="bold") +
    scale_fill_gradient(low=  "#7a5195",high= "#f95d6a") +
    scale_x_continuous( expand = c(0, 0)) +
    theme_bw() +
    theme(legend.position = 'none',
          plot.margin = margin(0.5, 0.5, 0.5, 0.5, "cm"),
          panel.grid.major.y = element_blank(),
          panel.border= element_blank(),
          axis.ticks.y= element_blank(),
          axis.text.y = element_text(size= 10)) +
    labs(title= paste("Death Rate by",titleText), x= "\nDeath Rate", y= NULL)
  print(g)
  
  return(t1)
}

# Hospitals (A data set of hospitals in Massachusetts - derived from the main set)
hospitals<- readRDS("hospitals.rds")

# Some utility functions
putZeros<- function(OB) {
  # Convert NAs in a matrix to zeros
  OB<- OB %>% mutate_at(vars(-group_cols()),~replace(.,is.na(.),0))
  return(OB)
}

label_cols<- function(sett,label,before="before") {
  # Add labels to all but the first column of a data.frame
  P<- ncol(sett)
  if (before!= "before") {
    names(sett)[2:P]<- paste0(names(sett)[2:P],label)
  } else {
    names(sett)[2:P]<- paste0(label,names(sett)[2:P])
  }
  return(sett)
}

howmany<- function(thevar) {
  # Count how many unique values in a vector
  res<- length(unique(thevar))
  return(res)
}

Today we are picking up our data set that we cleansed in a previous session. Now, the purpose of EDA is really just to get an overall feel for what the data is doing. In particular, we might ask questions like:

  1. Where are COVID cases occurring?
  2. Where are COVID deaths occurring?
  3. What do the timing patterns of infections look like?
  4. What are the average costs of COVID treatment?
  5. How do death patterns vary across different parts of the State?
IP<- readRDS("Dev Data 0/IP_0.rds") # 1766943

# Create a patient summary set

PID<- IP %>%  select(PID, Status, age, sex, ethnicity_fac, allergy, CITY, COUNTY, ZIP, LAT, LON, PATIENT) %>% distinct() # 9040

# Some enrichment
PID$death_status<- ifelse(PID$Status==0, "Alive", "Died")
PID$sex_fac<-      factor(PID$sex, labels= c("Male", "Female"))
PID$age_group<-    floor(PID$age/10)*10

# Make some tidy labels for age group
age_labels<- names(table(PID$age_group))
age_labels<- paste(age_labels,c(age_labels[-1],"+")) 
age_labels<- ifelse(age_labels %like% "\\+", gsub(" ","",age_labels), gsub(" ","-",age_labels)) %>% print()
 [1] "0-10"   "10-20"  "20-30"  "30-40"  "40-50"  "50-60"  "60-70"  "70-80"  "80-90"  "90-100" "100+"  
PID$age_fac<- ordered(factor(PID$age_group), labels= age_labels)
table(PID$age_fac)

  0-10  10-20  20-30  30-40  40-50  50-60  60-70  70-80  80-90 90-100   100+ 
   950   1128   1239   1145   1190   1357    981    598    269     98     85 
# Super basic initial plot of patient locations
ggplot(PID, aes(x= LON, y=LAT, color= death_status))+
  geom_point(alpha= 0.3, size= 1) +
  theme_void() +
  labs(title= "Patient locations before any styling")

We can pull patient encounters from the active period, and look at the date at which each patient is first diagnosed. We can then use the cumsum function to derive the cumulative infection curve.

# Pull out records of patient encounters across the active period

encounters<- IP %>% filter(!type %like% "history") %>% left_join(hospitals)

encounters<- encounters %>% mutate(death_status= ifelse(Status==0,"Alive", "Died"),
                                   COST= ifelse(is.na(TOTALCOST), BASE_COST, TOTALCOST),
                                   age_group= floor(age/10)*10,
                                   age_group= factor(age_group, labels= age_labels),
                                   description= str_to_lower(DESCRIPTION))

encounters<- encounters %>% mutate(COVID= ifelse(description == "covid-19", 1, 0))

# Infection Rates

diags<- encounters %>% filter(COVID==1) %>% group_by(PID) %>% summarise(etime= min(etime))
t0<- diags %>% count(etime) %>% mutate(cumulative= cumsum(n)) 
plot(t0$etime,t0$cumulative, type="l", main= "Overall Infections", xlab= "Day of the active period",
     ylab= "Cumulative Infections")

This curve is not very informative, lets split the infections by County…

# Infection Rates by County

diags<-   encounters %>% filter(COVID==1) %>% group_by(PID,COUNTY) %>% summarise(etime= min(etime)) %>% ungroup()
t0<-      diags %>% count(etime,COUNTY) %>% spread(COUNTY,n) %>% putZeros()
cummat<-  sapply(t0[,2:15], cumsum) %>% data.frame() %>%  mutate(etime= t0$etime)
clong<-   cummat %>% gather(key, value, -etime) %>% mutate(key= gsub("\\."," ", key))
clong<-   left_join(clong, pop, by=c("key"="COUNTY"))
clong<-   clong %>% mutate(per_100k = value/Population * 100000,
                           County=    word(key, 1))

g<- ggplot(clong, aes(x= etime, y= per_100k, color= County)) +
  geom_line() +
  labs(x= "Day of the active period", y= "Infections per 100k population",
               title= "Infection Rate by County") +
  theme_bw()

ggplotly(g)

Next we explore the total pathway cost (across the one-year active period) for each patient, broken down by age group, and whether or not the patient ultimately died.

# Summarise patient pathways
t0<- encounters %>% group_by(PID, COUNTY, age_group, death_status) %>% 
                    summarise(COST= sum(COST,na.rm = T), length= max(etime, na.rm=T))

# Average cost by age group
t1<- t0 %>% group_by(age_group, death_status) %>% summarise(Average= mean(COST)) %>%
            mutate(Label= paste0(round(Average/1000),"k"))
ggplot(t1, aes(x= Average, y= factor(age_group), fill= death_status)) +
     geom_col(position= "dodge") +
     geom_text(aes(label= Label), position= position_dodge(width= 1), hjust= 1.2, fontface= "bold", color="white") +
     labs(title= "Total Cost by Age Group and Death Outcome", 
        subtitle= "Average Total Cost across the 1-year active period per patient",
        x= "Average Overall Cost", y= "Age Group", fill= "Death Outcome") +
     scale_fill_manual(values=c("#7a5195","#f95d6a")) +
     theme_bw()

Next we try out our generic death rate function across various factors that exist in the data. It seems that Men have a slightly higher risk of death from COVID than women, and that patients who are African American might be at higher risk than White or Asian patients.

# Visual EDA of Death Rate by various factors
death_rate_table(PID$sex_fac, "Sex")


death_rate_table(PID$ethnicity_fac, "Ethnicity")


death_rate_table(PID$age_fac, "Age Group")


death_rate_table(PID$COUNTY, "County")

Because latitude and longitude data has been provided for each patient in the data set, it is an excellent opportunity to try out some geo-spatial tools. Please note that because the data has been synthesized from real data, sometimes the randomness inserted into the data makes the points lay outside the State or outside the land!

I also found a handy shapefile online (link below) with the county boundaries of Massachusetts. We also have a set of Hospitals, who coordinates are also approximate. Originally these plots were done with leaflet, however I was not able to get those to upload to RPubs from my mac.

# Download a shapefile
# http://maps-massgis.opendata.arcgis.com/datasets/aca1bbc8e91e4970b24bc4db8311e6f6_0

csh<- st_read("~/Documents/Synthea/Massachusetts_Counties-shp/Massachusetts_Counties.shp")
Reading layer `Massachusetts_Counties' from data source `/Users/celeste/Documents/Synthea/Massachusetts_Counties-shp/Massachusetts_Counties.shp' using driver `ESRI Shapefile'
Simple feature collection with 14 features and 3 fields
geometry type:  MULTIPOLYGON
dimension:      XY
bbox:           xmin: -73.50821 ymin: 41.23876 xmax: -69.92809 ymax: 42.88679
CRS:            4326
csh %>%
  leaflet() %>% 
  addTiles() %>% 
  # set color to green and create Wealth Zipcodes group
  addPolygons(weight = 1, fillOpacity = .3, color = "green",
              label = ~COUNTY,
              highlightOptions = highlightOptions(weight = 5, color = "white", bringToFront = FALSE)) %>%
  addCircleMarkers(data= hospitals, radius= 2, color= "yellow",
                 popup = hospitals$HOS_KEY)

Now let’s add the patient points in layers so that we can explore each set separately…


pal <- colorFactor(palette = c("blue", "red"), 
                   levels = c("Alive", "Died"))

massCentral<- c(-71.801299, 42.196323)

died<-  PID %>% filter(death_status=="Died")
alive<- PID %>% filter(death_status== "Alive")

g<- leaflet() %>% 
  addTiles() %>% 
  addCircleMarkers(data= alive, radius= 1, opacity= 0.3, color= ~pal(death_status), group= "Alive") %>%
  addCircleMarkers(data= died, radius=1, opacity= 0.5, color= ~pal(death_status), group= "Died") %>%
  addCircleMarkers(data= hospitals, radius= 2, opacity= 1, color= "yellow", 
                   popup = hospitals$HOS_KEY, group= "Hospitals") %>%
  addLayersControl(overlayGroups = c("Alive", "Died", "Hospitals"))

g

Clearly the population is unevenly spread across the state, so it would be interesting to explore various metrics and how they vary by county.

# Death metrics by county

# First get population size order for later plotting
bySize <-            pop %>% arrange(Population) %>% pull(COUNTY)
encounters$COUNTY <- factor(encounters$COUNTY, levels = bySize)

# Summarise statistics from patient pathways (by patient)
t0<- encounters %>% filter(!is.na(COUNTY)) %>%
       group_by(PID, CITY, COUNTY, age, age_group, death_status, LAT, LON) %>% 
       summarise(COST= sum(COST,na.rm = T)) %>% ungroup()

# Summarise across patients
t1<- t0  %>% group_by(COUNTY) %>% summarise(Patients= howmany(PID),
                                             Average_age= mean(age),
                                             Average_cost= mean(COST))
# Death Rate by COUNTY
t2<- t0 %>% group_by(COUNTY,death_status) %>% summarise(N= howmany(PID)) %>%
             left_join(t1) %>%
             mutate(Percent= N/Patients) %>% 
             select(COUNTY,death_status, Percent) %>%
             spread(death_status, Percent) %>% putZeros() %>% label_cols("Percent_")

tv<- left_join(t1, t2 %>% select(COUNTY, Percent_Died))   
tv<- left_join(tv, pop)

# Calculate some metrics
tv$Patients_per_100k= tv$Patients/tv$Population * 100000
tv$Raw_Deaths<-       tv$Percent_Died * tv$Patients
tv$Deaths_per_100k<-  tv$Raw_Deaths/tv$Population * 100000
tv$COUNTY<-           word(tv$COUNTY,1)
sizes<-               sort(table(encounters$COUNTY))
tv$COUNTY<-           fct_reorder(tv$COUNTY, names(sizes)) %>% str_to_upper(tv$COUNTY) %>% trimws()
argument `locale` should be a single character string; only the first element is used
# Re-shape for plotting
tlong<- tv %>% gather(key, value, -COUNTY) %>% 
                 mutate(Label= value,
                        COUNTY= factor(COUNTY, levels= str_to_upper(word(bySize, 1)))) 
tmid<-  tlong %>% group_by(key) %>% mutate(mid= mean(value))
tlong<- left_join(tlong, tmid)

# Tidy up the labels for plotting
tlong$key<- factor(tlong$key, levels= c("Population","Average_age","Average_cost","Patients",
                                       "Patients_per_100k","Raw_Deaths","Percent_Died","Deaths_per_100k"))
tlong$Label<- with(tlong, ifelse(key=="Average_cost", paste0("$",round(value/1000),"k"), round(value)))
tlong$Label<- with(tlong, ifelse(key=="Population", paste0(round(value/1000),"k"), Label))
tlong$Label<- with(tlong, ifelse(key=="Percent_Died", paste0(round(value*100,1),"%"), Label))
tlong$Label<- with(tlong, ifelse(key=="Deaths_per_100k", round(value,1), Label))

ggplot(tlong, aes(x= value, y= COUNTY, fill= key)) +
  geom_col() +
  geom_text(aes(label=Label,hjust = ifelse(value < mid, -0.3, 1.3)), vjust= 0.5, size= 2.5) +
  facet_wrap(~key, scales= "free_x", ncol= 4, nrow=2) +
  theme_bw() +
  theme(legend.position = "none") +
  ggtitle("Various Metrics by County")

The sf package allows data to be held alongside spatial geometries, which allows us to join various calculated fields into spatial objects. Here is the most basic plotting function provided via sf.

# Native shapefile plotting via sf
csht<- left_join(csh, tv)
Joining, by = "COUNTY"
plot(csht[,c("Deaths_per_100k","Patients_per_100k","Population","Average_age")])

Alternatively we can combine the functionality of sf with our trusty ggplot by making use of geom_sf. We have plotted the county names in the polygon centres via the st_centroid function. Here we have plotted Age, but you could substitute field that can be created at county granularity.

# Combining shapefile + data plotting with ggplot and geom_sf
pp<- st_centroid(csh) %>% st_coordinates()
st_centroid assumes attributes are constant over geometries of xst_centroid does not give correct centroids for longitude/latitude data
counties<- csht$COUNTY

ggplot() +
  geom_sf(data= csht, aes(fill= Average_age)) +
  geom_text(aes(x= pp[,1], y=pp[,2], label= counties), fontface="bold", color = "white")+
  theme_void() +
  theme(panel.background = element_rect(fill= "grey60"))+
  scale_fill_continuous_sequential(palette = "Plasma", rev= FALSE)

This has just been a quick breeze through of some alternatives for basic EDA, making some additional use of geo-spatial elements.

..

..

..

---
title: "Synthea: COVID Exploratory Analysis"
output: html_notebook
author: Cel McCracken
date: "2020-07-12"
---
![](/Users/Celeste/Desktop/Synthea_EDA.png)

In today's post we will be doing some basic EDA (Exploratory Data Analysis) with the synthetic COVD data set provided by [https://synthea.mitre.org/downloads](https://synthea.mitre.org/downloads).  Additional information about the data can be found at [https://github.com/synthetichealth/synthea/wiki/CSV-File-Data-Dictionary](https://github.com/synthetichealth/synthea/wiki/CSV-File-Data-Dictionary).

As always, let's begin with the packages, functions and a little supplemental data...

```{r ,message= FALSE, warning= FALSE}
library(tidyverse)
library(data.table)
library(plotly)
library(leaflet)
library(sf)
library(colorspace)

# Massachusetts County Population Data #https://worldpopulationreview.com/us-counties/states/ma
pop<- data.frame(
  stringsAsFactors = FALSE,
  COUNTY = c("Middlesex County",
             "Worcester County","Suffolk County",
             "Essex County","Norfolk County","Bristol County",
             "Plymouth County","Hampden County","Barnstable County",
             "Hampshire County","Berkshire County",
             "Franklin County","Dukes County","Nantucket County"),
  Population = c(1614710,830839,
                 807252,790638,705388,564022,518132,470406,
                 213413,161355,126348,70963,17352,11327))


death_rate_table<- function(thevar, titleText){
  # Plotting function that creates a barplot specifically to compare death rates by a given factor
  PID$thevar<- thevar
  t0<- PID %>% group_by(thevar,death_status) %>% tally() %>% spread(death_status,n) #%>% print()
  t1<- PID %>% group_by(thevar) %>% summarise(Total= n()) %>% left_join(t0) #%>% print()
  t1<- t1 %>% mutate(Death_rate= Died/Total, label= paste0(round(Death_rate*100,1),"%"))
  ymid <- mean(range(t1$Death_rate))
  t1$Ordered<- fct_reorder(t1$thevar, t1$Death_rate)
  g<- ggplot(t1, aes(x= Death_rate, y= Ordered, fill= Death_rate)) +
    geom_col(color= "gray50", size= 0.25) +
    geom_text(aes(label=label,hjust = ifelse(Death_rate < ymid, -0.5, 1.5)), vjust= 0.5, fontface="bold") +
    scale_fill_gradient(low=  "#7a5195",high= "#f95d6a") +
    scale_x_continuous( expand = c(0, 0)) +
    theme_bw() +
    theme(legend.position = 'none',
          plot.margin = margin(0.5, 0.5, 0.5, 0.5, "cm"),
          panel.grid.major.y = element_blank(),
          panel.border= element_blank(),
          axis.ticks.y= element_blank(),
          axis.text.y = element_text(size= 10)) +
    labs(title= paste("Death Rate by",titleText), x= "\nDeath Rate", y= NULL)
  print(g)
  
  return(t1)
}

# Hospitals (A data set of hospitals in Massachusetts - derived from the main set)
hospitals<- readRDS("hospitals.rds")

# Some utility functions
putZeros<- function(OB) {
  # Convert NAs in a matrix to zeros
  OB<- OB %>% mutate_at(vars(-group_cols()),~replace(.,is.na(.),0))
  return(OB)
}

label_cols<- function(sett,label,before="before") {
  # Add labels to all but the first column of a data.frame
  P<- ncol(sett)
  if (before!= "before") {
    names(sett)[2:P]<- paste0(names(sett)[2:P],label)
  } else {
    names(sett)[2:P]<- paste0(label,names(sett)[2:P])
  }
  return(sett)
}

howmany<- function(thevar) {
  # Count how many unique values in a vector
  res<- length(unique(thevar))
  return(res)
}
```


Today we are picking up our data set that we cleansed in a previous session. Now, the purpose of EDA is really just to get an overall feel for what the data is doing. In particular, we might ask questions like:

1. Where are COVID cases occurring?
2. Where are COVID deaths occurring?
3. What do the timing patterns of infections look like?
4. What are the average costs of COVID treatment?
5. How do death patterns vary across different parts of the State?
```{r}
IP<- readRDS("Dev Data 0/IP_0.rds") # 1766943

# Create a patient summary set

PID<- IP %>%  select(PID, Status, age, sex, ethnicity_fac, allergy, CITY, COUNTY, ZIP, LAT, LON, PATIENT) %>% distinct() # 9040

# Some enrichment
PID$death_status<- ifelse(PID$Status==0, "Alive", "Died")
PID$sex_fac<-      factor(PID$sex, labels= c("Male", "Female"))
PID$age_group<-    floor(PID$age/10)*10

# Make some tidy labels for age group
age_labels<- names(table(PID$age_group))
age_labels<- paste(age_labels,c(age_labels[-1],"+")) 
age_labels<- ifelse(age_labels %like% "\\+", gsub(" ","",age_labels), gsub(" ","-",age_labels)) %>% print()

PID$age_fac<- ordered(factor(PID$age_group), labels= age_labels)
table(PID$age_fac)

# Super basic initial plot of patient locations
ggplot(PID, aes(x= LON, y=LAT, color= death_status))+
  geom_point(alpha= 0.3, size= 1) +
  theme_void() +
  labs(title= "Patient locations before any styling")
```
We can pull patient encounters from the active period, and look at the date at which each patient is first diagnosed. We can then use the `cumsum` function to derive the cumulative infection curve.
```{r, message= FALSE}
# Pull out records of patient encounters across the active period

encounters<- IP %>% filter(!type %like% "history") %>% left_join(hospitals)

encounters<- encounters %>% mutate(death_status= ifelse(Status==0,"Alive", "Died"),
                                   COST= ifelse(is.na(TOTALCOST), BASE_COST, TOTALCOST),
                                   age_group= floor(age/10)*10,
                                   age_group= factor(age_group, labels= age_labels),
                                   description= str_to_lower(DESCRIPTION))

encounters<- encounters %>% mutate(COVID= ifelse(description == "covid-19", 1, 0))

# Infection Rates

diags<- encounters %>% filter(COVID==1) %>% group_by(PID) %>% summarise(etime= min(etime))
t0<- diags %>% count(etime) %>% mutate(cumulative= cumsum(n)) 
plot(t0$etime,t0$cumulative, type="l", main= "Overall Infections", xlab= "Day of the active period",
     ylab= "Cumulative Infections")

```
This curve is not very informative, lets split the infections by County...
```{r, message= FALSE}
# Infection Rates by County

diags<-   encounters %>% filter(COVID==1) %>% group_by(PID,COUNTY) %>% summarise(etime= min(etime)) %>% ungroup()
t0<-      diags %>% count(etime,COUNTY) %>% spread(COUNTY,n) %>% putZeros()
cummat<-  sapply(t0[,2:15], cumsum) %>% data.frame() %>%  mutate(etime= t0$etime)
clong<-   cummat %>% gather(key, value, -etime) %>% mutate(key= gsub("\\."," ", key))
clong<-   left_join(clong, pop, by=c("key"="COUNTY"))
clong<-   clong %>% mutate(per_100k = value/Population * 100000,
                           County=    word(key, 1))

g<- ggplot(clong, aes(x= etime, y= per_100k, color= County)) +
  geom_line() +
  labs(x= "Day of the active period", y= "Infections per 100k population",
               title= "Infection Rate by County") +
  theme_bw()

ggplotly(g)
```
Next we explore the total pathway cost (across the one-year active period) for each patient, broken down by age group, and whether or not the patient ultimately died.
```{r, message= FALSE}
# Summarise patient pathways
t0<- encounters %>% group_by(PID, COUNTY, age_group, death_status) %>% 
                    summarise(COST= sum(COST,na.rm = T), length= max(etime, na.rm=T))

# Average cost by age group
t1<- t0 %>% group_by(age_group, death_status) %>% summarise(Average= mean(COST)) %>%
            mutate(Label= paste0(round(Average/1000),"k"))
ggplot(t1, aes(x= Average, y= factor(age_group), fill= death_status)) +
     geom_col(position= "dodge") +
     geom_text(aes(label= Label), position= position_dodge(width= 1), hjust= 1.2, fontface= "bold", color="white") +
     labs(title= "Total Cost by Age Group and Death Outcome", 
        subtitle= "Average Total Cost across the 1-year active period per patient",
        x= "Average Overall Cost", y= "Age Group", fill= "Death Outcome") +
     scale_fill_manual(values=c("#7a5195","#f95d6a")) +
     theme_bw()
```
Next we try out our generic death rate function across various factors that exist in the data. It seems that Men have a slightly higher risk of death from COVID than women, and that patients who are African American might be at higher risk than White or Asian patients.
```{r, message= FALSE}
# Visual EDA of Death Rate by various factors
death_rate_table(PID$sex_fac, "Sex")

death_rate_table(PID$ethnicity_fac, "Ethnicity")

death_rate_table(PID$age_fac, "Age Group")

death_rate_table(PID$COUNTY, "County")
```
Because latitude and longitude data has been provided for each patient in the data set, it is an excellent opportunity to try out some geo-spatial tools. Please note that because the data has been synthesized from real data, sometimes the randomness inserted into the data makes the points lay outside the State or outside the land!

I also found a handy shapefile online (link below) with the county boundaries of Massachusetts. We also have a set of Hospitals, who coordinates are also approximate.  Originally these plots were done with leaflet, however I was not able to get those to upload to RPubs from my mac.
```{r, message= FALSE}
# Download a shapefile
# http://maps-massgis.opendata.arcgis.com/datasets/aca1bbc8e91e4970b24bc4db8311e6f6_0

csh<- st_read("~/Documents/Synthea/Massachusetts_Counties-shp/Massachusetts_Counties.shp")

csh %>%
  leaflet() %>% 
  addTiles() %>% 
  # set color to green and create Wealth Zipcodes group
  addPolygons(weight = 1, fillOpacity = .3, color = "green",
              label = ~COUNTY,
              highlightOptions = highlightOptions(weight = 5, color = "white", bringToFront = FALSE)) %>%
  addCircleMarkers(data= hospitals, radius= 2, color= "yellow",
                 popup = hospitals$HOS_KEY)
```
Now let's add the patient points in layers so that we can explore each set separately...
```{r, message= FALSE}

pal <- colorFactor(palette = c("blue", "red"), 
                   levels = c("Alive", "Died"))

massCentral<- c(-71.801299, 42.196323)

died<-  PID %>% filter(death_status=="Died")
alive<- PID %>% filter(death_status== "Alive")

g<- leaflet() %>% 
  addTiles() %>% 
  addCircleMarkers(data= alive, radius= 1, opacity= 0.3, color= ~pal(death_status), group= "Alive") %>%
  addCircleMarkers(data= died, radius=1, opacity= 0.5, color= ~pal(death_status), group= "Died") %>%
  addCircleMarkers(data= hospitals, radius= 2, opacity= 1, color= "yellow", 
                   popup = hospitals$HOS_KEY, group= "Hospitals") %>%
  addLayersControl(overlayGroups = c("Alive", "Died", "Hospitals"))

g
```
Clearly the population is unevenly spread across the state, so it would be interesting to explore various metrics and how they vary by county.
```{r, message= FALSE}
# Death metrics by county

# First get population size order for later plotting
bySize <-            pop %>% arrange(Population) %>% pull(COUNTY)
encounters$COUNTY <- factor(encounters$COUNTY, levels = bySize)

# Summarise statistics from patient pathways (by patient)
t0<- encounters %>% filter(!is.na(COUNTY)) %>%
       group_by(PID, CITY, COUNTY, age, age_group, death_status, LAT, LON) %>% 
       summarise(COST= sum(COST,na.rm = T)) %>% ungroup()

# Summarise across patients
t1<- t0  %>% group_by(COUNTY) %>% summarise(Patients= howmany(PID),
                                             Average_age= mean(age),
                                             Average_cost= mean(COST))
# Death Rate by COUNTY
t2<- t0 %>% group_by(COUNTY,death_status) %>% summarise(N= howmany(PID)) %>%
             left_join(t1) %>%
             mutate(Percent= N/Patients) %>% 
             select(COUNTY,death_status, Percent) %>%
             spread(death_status, Percent) %>% putZeros() %>% label_cols("Percent_")

tv<- left_join(t1, t2 %>% select(COUNTY, Percent_Died))   
tv<- left_join(tv, pop)

# Calculate some metrics
tv$Patients_per_100k= tv$Patients/tv$Population * 100000
tv$Raw_Deaths<-       tv$Percent_Died * tv$Patients
tv$Deaths_per_100k<-  tv$Raw_Deaths/tv$Population * 100000
tv$COUNTY<-           word(tv$COUNTY,1)
sizes<-               sort(table(encounters$COUNTY))
tv$COUNTY<-           fct_reorder(tv$COUNTY, names(sizes)) %>% str_to_upper(tv$COUNTY) %>% trimws()

# Re-shape for plotting
tlong<- tv %>% gather(key, value, -COUNTY) %>% 
                 mutate(Label= value,
                        COUNTY= factor(COUNTY, levels= str_to_upper(word(bySize, 1)))) 
tmid<-  tlong %>% group_by(key) %>% mutate(mid= mean(value))
tlong<- left_join(tlong, tmid)

# Tidy up the labels for plotting
tlong$key<- factor(tlong$key, levels= c("Population","Average_age","Average_cost","Patients",
                                       "Patients_per_100k","Raw_Deaths","Percent_Died","Deaths_per_100k"))
tlong$Label<- with(tlong, ifelse(key=="Average_cost", paste0("$",round(value/1000),"k"), round(value)))
tlong$Label<- with(tlong, ifelse(key=="Population", paste0(round(value/1000),"k"), Label))
tlong$Label<- with(tlong, ifelse(key=="Percent_Died", paste0(round(value*100,1),"%"), Label))
tlong$Label<- with(tlong, ifelse(key=="Deaths_per_100k", round(value,1), Label))

ggplot(tlong, aes(x= value, y= COUNTY, fill= key)) +
  geom_col() +
  geom_text(aes(label=Label,hjust = ifelse(value < mid, -0.3, 1.3)), vjust= 0.5, size= 2.5) +
  facet_wrap(~key, scales= "free_x", ncol= 4, nrow=2) +
  theme_bw() +
  theme(legend.position = "none") +
  ggtitle("Various Metrics by County")
```
The `sf` package allows data to be held alongside spatial geometries, which allows us to join various calculated fields into spatial objects.  Here is the most basic plotting function provided via `sf`.
```{r}
# Native shapefile plotting via sf
csht<- left_join(csh, tv)
plot(csht[,c("Deaths_per_100k","Patients_per_100k","Population","Average_age")])
```
Alternatively we can combine the functionality of `sf` with our trusty `ggplot` by making use of `geom_sf`. We have plotted the county names in the polygon centres via the `st_centroid` function. Here we have plotted Age, but you could substitute field that can be created at county granularity.
```{r, message= FALSE}
# Combining shapefile + data plotting with ggplot and geom_sf
pp<- st_centroid(csh) %>% st_coordinates()
counties<- csht$COUNTY

ggplot() +
  geom_sf(data= csht, aes(fill= Average_age)) +
  geom_text(aes(x= pp[,1], y=pp[,2], label= counties), fontface="bold", color = "white")+
  theme_void() +
  theme(panel.background = element_rect(fill= "grey60"))+
  scale_fill_continuous_sequential(palette = "Plasma", rev= FALSE)
```
This has just been a quick breeze through of some alternatives for basic EDA, making some additional use of geo-spatial elements.

..


..

..
