Intro

suppressPackageStartupMessages({

library(tidyverse)
library(ggplot2)
library(dplyr)
library(plotly)

})

Load data

conditions <- read_csv("/Users/nnthieu/SyntheaData/SyntheaCovid19/conditions.csv")
## Rows: 1143900 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (3): PATIENT, ENCOUNTER, DESCRIPTION
## dbl  (1): CODE
## date (2): START, STOP
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
patients <- read_csv("/Users/nnthieu/SyntheaData/SyntheaCovid19/patients.csv")
## Rows: 124150 Columns: 25
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (19): Id, SSN, DRIVERS, PASSPORT, PREFIX, FIRST, LAST, SUFFIX, MAIDEN, ...
## dbl   (4): LAT, LON, HEALTHCARE_EXPENSES, HEALTHCARE_COVERAGE
## date  (2): BIRTHDATE, DEATHDATE
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
observations <- read_csv("/Users/nnthieu/SyntheaData/SyntheaCovid19/observations.csv")
## Rows: 16219969 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (7): PATIENT, ENCOUNTER, CODE, DESCRIPTION, VALUE, UNITS, TYPE
## date (1): DATE
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Define patients with COVID-19

covid_patient_ids <- conditions %>%
  filter(grepl("COVID-19", DESCRIPTION)) %>%
  pull(PATIENT)
length(covid_patient_ids)
## [1] 179205
covid_conditions <- conditions %>%
  filter(PATIENT %in% covid_patient_ids) 
head(covid_conditions)
## # A tibble: 6 × 6
##   START      STOP       PATIENT                     ENCOUNTER   CODE DESCRIPTION
##   <date>     <date>     <chr>                       <chr>      <dbl> <chr>      
## 1 2019-02-15 2019-08-01 1ff7f10f-a204-4bb1-aa72-dd… d317dacb… 6.54e7 Otitis med…
## 2 2019-10-30 2020-01-30 1ff7f10f-a204-4bb1-aa72-dd… 759cfd9c… 6.54e7 Otitis med…
## 3 2020-03-01 2020-03-30 1ff7f10f-a204-4bb1-aa72-dd… 60584c6a… 3.87e8 Fever (fin…
## 4 2020-03-01 2020-03-01 1ff7f10f-a204-4bb1-aa72-dd… 60584c6a… 8.41e8 Suspected …
## 5 2020-03-01 2020-03-30 1ff7f10f-a204-4bb1-aa72-dd… 60584c6a… 8.41e8 COVID-19   
## 6 2020-02-12 2020-02-26 9bcf6ed5-d808-44af-98a0-7d… 5d9dfe80… 4.45e7 Sprain of …

Define covid date

covid_dates <- conditions %>%
  filter(grepl("COVID-19", DESCRIPTION)) %>%
  rename(covid_date = START) %>%
  mutate(covid_date = as.Date(covid_date)) %>%
  group_by(PATIENT) %>%
  summarise(covid_date = min(covid_date, na.rm = TRUE), .groups = "drop")

covid_dates_uni <- covid_dates %>%
  group_by(PATIENT) %>%
  slice_min(covid_date, with_ties = FALSE) %>%
  ungroup()

Covid patients

covid_patients <- patients %>% filter(Id %in% covid_patient_ids) %>%
left_join(covid_dates_uni, by = c("Id" = "PATIENT")) 
head(covid_patients)
## # A tibble: 6 × 26
##   Id      BIRTHDATE  DEATHDATE SSN    DRIVERS PASSPORT PREFIX FIRST LAST  SUFFIX
##   <chr>   <date>     <date>    <chr>  <chr>   <chr>    <chr>  <chr> <chr> <chr> 
## 1 1ff7f1… 2017-08-24 NA        999-6… <NA>    <NA>     <NA>   Jaci… Kris… <NA>  
## 2 9bcf6e… 2016-08-01 NA        999-1… <NA>    <NA>     <NA>   Alva… Kraj… <NA>  
## 3 5163c5… 2004-01-09 NA        999-7… S99999… <NA>     <NA>   Jimm… Harr… <NA>  
## 4 cc3c80… 1996-11-15 NA        999-6… S99924… X995294… Mr.    Greg… Auer… <NA>  
## 5 bd1c4f… 2019-06-12 NA        999-8… <NA>    <NA>     <NA>   Kary… Muel… <NA>  
## 6 4dc153… 1992-06-30 NA        999-2… S99973… X535779… Mr.    Jays… Fade… <NA>  
## # ℹ 16 more variables: MAIDEN <chr>, MARITAL <chr>, RACE <chr>,
## #   ETHNICITY <chr>, GENDER <chr>, BIRTHPLACE <chr>, ADDRESS <chr>, CITY <chr>,
## #   STATE <chr>, COUNTY <chr>, ZIP <chr>, LAT <dbl>, LON <dbl>,
## #   HEALTHCARE_EXPENSES <dbl>, HEALTHCARE_COVERAGE <dbl>, covid_date <date>

Deceased covid patients

deceased_patients <- patients %>%
  filter(Id %in% covid_patient_ids, !is.na(DEATHDATE)) %>%
  select(Id, DEATHDATE)

Lab values

deceased_patients <- patients %>%
  filter(Id %in% covid_patient_ids, !is.na(DEATHDATE)) %>%
  select(Id, DEATHDATE)

covid_lab_values <- observations %>%
  filter(PATIENT %in% covid_patient_ids) %>%
  filter(CODE %in% c("731-0", "26881-3")) %>%
  select(DATE, PATIENT, CODE, DESCRIPTION, VALUE, UNITS) %>%
  left_join(
    covid_dates %>%
      group_by(PATIENT) %>%
      slice_min(covid_date, with_ties = FALSE) %>%
      ungroup(),
    by = "PATIENT"
  ) %>%
  mutate(
    DATE = as.Date(DATE),
    covid_date = as.Date(covid_date),
    Days = as.integer(DATE - covid_date)
  ) %>%
  left_join(
    deceased_patients %>%
      distinct(Id, .keep_all = TRUE),
    by = c("PATIENT" = "Id")
  ) %>%
  mutate(survivor = is.na(DEATHDATE))

head(covid_lab_values, 5)
## # A tibble: 5 × 10
##   DATE       PATIENT   CODE  DESCRIPTION VALUE UNITS covid_date  Days DEATHDATE
##   <date>     <chr>     <chr> <chr>       <chr> <chr> <date>     <int> <date>   
## 1 2020-02-19 bd1c4ffc… 731-0 Lymphocyte… 1.1   10*3… 2020-02-19     0 NA       
## 2 2020-02-20 bd1c4ffc… 731-0 Lymphocyte… 1.0   10*3… 2020-02-19     1 NA       
## 3 2020-02-21 bd1c4ffc… 731-0 Lymphocyte… 1.1   10*3… 2020-02-19     2 NA       
## 4 2020-02-22 bd1c4ffc… 731-0 Lymphocyte… 1.0   10*3… 2020-02-19     3 NA       
## 5 2020-02-23 bd1c4ffc… 731-0 Lymphocyte… 1.0   10*3… 2020-02-19     4 NA       
## # ℹ 1 more variable: survivor <lgl>
# Summarise patient counts by city
city_summary <- covid_patients |>
  filter(!is.na(CITY), CITY != "") |>
  group_by(CITY) |>
  summarise(
    patient_count = n(),
    avg_lat = mean(LAT, na.rm = TRUE),
    avg_lon = mean(LON, na.rm = TRUE),
    .groups = "drop"
  ) |>
  arrange(desc(patient_count))

top_10_cities <- city_summary %>%
  slice_max(patient_count, n = 10)

Visualization

Top 10 cities with most patients

ggplot(top_10_cities, aes(x = reorder(CITY, patient_count), y = patient_count)) +
      geom_col(fill = "#2c7fb8") +
      geom_text(
        aes(label = patient_count),
        hjust = 1.1,
        color = "white",
        size = 3.5
      ) +
      coord_flip() +
      labs(x = "City", y = "Patient Count") +
      theme_minimal() +
      theme(axis.text.y = element_text(size = 10))

Map of COVID-19 patients by city

# Load required libraries
library(dplyr)
library(ggplot2)
library(maps)  # for map data
## 
## Attaching package: 'maps'
## The following object is masked from 'package:purrr':
## 
##     map
# Filter Massachusetts patients and sample 50%
covid_ma <- covid_patients %>%
  filter(STATE == "Massachusetts") 

# Count patients by CITY
city_counts <- covid_ma %>%
  group_by(CITY) %>%
  summarise(Patient_Count = n(), .groups = "drop")

# Get average latitude and longitude by CITY
city_coords <- covid_ma %>%
  group_by(CITY) %>%
  summarise(
    LAT = mean(LAT, na.rm = TRUE),
    LON = mean(LON, na.rm = TRUE),
    .groups = "drop"
  )

# Merge counts and coordinates
map_data <- left_join(city_counts, city_coords, by = "CITY")

# Set coordinate limits for Massachusetts
# Approximate bounding box: long -73.5 to -69.8, lat 41 to 43
ggplot(map_data, aes(x = LON, y = LAT)) +
  borders("state", regions = "massachusetts", fill = "gray95") +
  geom_point(aes(size = Patient_Count), color = "red", alpha = 0.7) +
  coord_quickmap(xlim = c(-73.5, -69.8), ylim = c(41, 43)) +
  labs(title = "COVID Patient Counts by City in Massachusetts",
    x = "Longitude", y = "Latitude"
  ) +
  theme_minimal()

Boxplot of lab values by days from COVID diagnosis

library(ggplot2)

ggplot(covid_lab_values, aes(x = factor(Days), y = as.numeric(VALUE), fill = survivor)) +
  geom_boxplot(outlier.shape = NA, alpha = 0.7, position = position_dodge()) +
  facet_wrap(~ DESCRIPTION, scales = "free_y") +
  scale_fill_manual(
    values = c("FALSE" = "#e41a1c", "TRUE" = "#377eb8"),
    labels = c("Deceased", "Survivor")
  ) +
  labs(
    title = "Lab Values by Days from COVID Diagnosis",
    x = "Days Since COVID Diagnosis",
    y = "Lab Value",
    fill = "Survival Status"
  ) +
  theme_minimal(base_size = 13) +
  theme(axis.text.x = element_text(angle = 45, vjust = 0.5, size = 8))