suppressPackageStartupMessages({
library(tidyverse)
library(ggplot2)
library(dplyr)
library(plotly)
})
conditions <- read_csv("/Users/nnthieu/SyntheaData/SyntheaCovid19/conditions.csv")
## Rows: 1143900 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): PATIENT, ENCOUNTER, DESCRIPTION
## dbl (1): CODE
## date (2): START, STOP
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
patients <- read_csv("/Users/nnthieu/SyntheaData/SyntheaCovid19/patients.csv")
## Rows: 124150 Columns: 25
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (19): Id, SSN, DRIVERS, PASSPORT, PREFIX, FIRST, LAST, SUFFIX, MAIDEN, ...
## dbl (4): LAT, LON, HEALTHCARE_EXPENSES, HEALTHCARE_COVERAGE
## date (2): BIRTHDATE, DEATHDATE
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
observations <- read_csv("/Users/nnthieu/SyntheaData/SyntheaCovid19/observations.csv")
## Rows: 16219969 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): PATIENT, ENCOUNTER, CODE, DESCRIPTION, VALUE, UNITS, TYPE
## date (1): DATE
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
covid_patient_ids <- conditions %>%
filter(grepl("COVID-19", DESCRIPTION)) %>%
pull(PATIENT)
length(covid_patient_ids)
## [1] 179205
covid_conditions <- conditions %>%
filter(PATIENT %in% covid_patient_ids)
head(covid_conditions)
## # A tibble: 6 × 6
## START STOP PATIENT ENCOUNTER CODE DESCRIPTION
## <date> <date> <chr> <chr> <dbl> <chr>
## 1 2019-02-15 2019-08-01 1ff7f10f-a204-4bb1-aa72-dd… d317dacb… 6.54e7 Otitis med…
## 2 2019-10-30 2020-01-30 1ff7f10f-a204-4bb1-aa72-dd… 759cfd9c… 6.54e7 Otitis med…
## 3 2020-03-01 2020-03-30 1ff7f10f-a204-4bb1-aa72-dd… 60584c6a… 3.87e8 Fever (fin…
## 4 2020-03-01 2020-03-01 1ff7f10f-a204-4bb1-aa72-dd… 60584c6a… 8.41e8 Suspected …
## 5 2020-03-01 2020-03-30 1ff7f10f-a204-4bb1-aa72-dd… 60584c6a… 8.41e8 COVID-19
## 6 2020-02-12 2020-02-26 9bcf6ed5-d808-44af-98a0-7d… 5d9dfe80… 4.45e7 Sprain of …
covid_dates <- conditions %>%
filter(grepl("COVID-19", DESCRIPTION)) %>%
rename(covid_date = START) %>%
mutate(covid_date = as.Date(covid_date)) %>%
group_by(PATIENT) %>%
summarise(covid_date = min(covid_date, na.rm = TRUE), .groups = "drop")
covid_dates_uni <- covid_dates %>%
group_by(PATIENT) %>%
slice_min(covid_date, with_ties = FALSE) %>%
ungroup()
covid_patients <- patients %>% filter(Id %in% covid_patient_ids) %>%
left_join(covid_dates_uni, by = c("Id" = "PATIENT"))
head(covid_patients)
## # A tibble: 6 × 26
## Id BIRTHDATE DEATHDATE SSN DRIVERS PASSPORT PREFIX FIRST LAST SUFFIX
## <chr> <date> <date> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 1ff7f1… 2017-08-24 NA 999-6… <NA> <NA> <NA> Jaci… Kris… <NA>
## 2 9bcf6e… 2016-08-01 NA 999-1… <NA> <NA> <NA> Alva… Kraj… <NA>
## 3 5163c5… 2004-01-09 NA 999-7… S99999… <NA> <NA> Jimm… Harr… <NA>
## 4 cc3c80… 1996-11-15 NA 999-6… S99924… X995294… Mr. Greg… Auer… <NA>
## 5 bd1c4f… 2019-06-12 NA 999-8… <NA> <NA> <NA> Kary… Muel… <NA>
## 6 4dc153… 1992-06-30 NA 999-2… S99973… X535779… Mr. Jays… Fade… <NA>
## # ℹ 16 more variables: MAIDEN <chr>, MARITAL <chr>, RACE <chr>,
## # ETHNICITY <chr>, GENDER <chr>, BIRTHPLACE <chr>, ADDRESS <chr>, CITY <chr>,
## # STATE <chr>, COUNTY <chr>, ZIP <chr>, LAT <dbl>, LON <dbl>,
## # HEALTHCARE_EXPENSES <dbl>, HEALTHCARE_COVERAGE <dbl>, covid_date <date>
deceased_patients <- patients %>%
filter(Id %in% covid_patient_ids, !is.na(DEATHDATE)) %>%
select(Id, DEATHDATE)
deceased_patients <- patients %>%
filter(Id %in% covid_patient_ids, !is.na(DEATHDATE)) %>%
select(Id, DEATHDATE)
covid_lab_values <- observations %>%
filter(PATIENT %in% covid_patient_ids) %>%
filter(CODE %in% c("731-0", "26881-3")) %>%
select(DATE, PATIENT, CODE, DESCRIPTION, VALUE, UNITS) %>%
left_join(
covid_dates %>%
group_by(PATIENT) %>%
slice_min(covid_date, with_ties = FALSE) %>%
ungroup(),
by = "PATIENT"
) %>%
mutate(
DATE = as.Date(DATE),
covid_date = as.Date(covid_date),
Days = as.integer(DATE - covid_date)
) %>%
left_join(
deceased_patients %>%
distinct(Id, .keep_all = TRUE),
by = c("PATIENT" = "Id")
) %>%
mutate(survivor = is.na(DEATHDATE))
head(covid_lab_values, 5)
## # A tibble: 5 × 10
## DATE PATIENT CODE DESCRIPTION VALUE UNITS covid_date Days DEATHDATE
## <date> <chr> <chr> <chr> <chr> <chr> <date> <int> <date>
## 1 2020-02-19 bd1c4ffc… 731-0 Lymphocyte… 1.1 10*3… 2020-02-19 0 NA
## 2 2020-02-20 bd1c4ffc… 731-0 Lymphocyte… 1.0 10*3… 2020-02-19 1 NA
## 3 2020-02-21 bd1c4ffc… 731-0 Lymphocyte… 1.1 10*3… 2020-02-19 2 NA
## 4 2020-02-22 bd1c4ffc… 731-0 Lymphocyte… 1.0 10*3… 2020-02-19 3 NA
## 5 2020-02-23 bd1c4ffc… 731-0 Lymphocyte… 1.0 10*3… 2020-02-19 4 NA
## # ℹ 1 more variable: survivor <lgl>
# Summarise patient counts by city
city_summary <- covid_patients |>
filter(!is.na(CITY), CITY != "") |>
group_by(CITY) |>
summarise(
patient_count = n(),
avg_lat = mean(LAT, na.rm = TRUE),
avg_lon = mean(LON, na.rm = TRUE),
.groups = "drop"
) |>
arrange(desc(patient_count))
top_10_cities <- city_summary %>%
slice_max(patient_count, n = 10)
Top 10 cities with most patients
ggplot(top_10_cities, aes(x = reorder(CITY, patient_count), y = patient_count)) +
geom_col(fill = "#2c7fb8") +
geom_text(
aes(label = patient_count),
hjust = 1.1,
color = "white",
size = 3.5
) +
coord_flip() +
labs(x = "City", y = "Patient Count") +
theme_minimal() +
theme(axis.text.y = element_text(size = 10))
# Load required libraries
library(dplyr)
library(ggplot2)
library(maps) # for map data
##
## Attaching package: 'maps'
## The following object is masked from 'package:purrr':
##
## map
# Filter Massachusetts patients and sample 50%
covid_ma <- covid_patients %>%
filter(STATE == "Massachusetts")
# Count patients by CITY
city_counts <- covid_ma %>%
group_by(CITY) %>%
summarise(Patient_Count = n(), .groups = "drop")
# Get average latitude and longitude by CITY
city_coords <- covid_ma %>%
group_by(CITY) %>%
summarise(
LAT = mean(LAT, na.rm = TRUE),
LON = mean(LON, na.rm = TRUE),
.groups = "drop"
)
# Merge counts and coordinates
map_data <- left_join(city_counts, city_coords, by = "CITY")
# Set coordinate limits for Massachusetts
# Approximate bounding box: long -73.5 to -69.8, lat 41 to 43
ggplot(map_data, aes(x = LON, y = LAT)) +
borders("state", regions = "massachusetts", fill = "gray95") +
geom_point(aes(size = Patient_Count), color = "red", alpha = 0.7) +
coord_quickmap(xlim = c(-73.5, -69.8), ylim = c(41, 43)) +
labs(title = "COVID Patient Counts by City in Massachusetts",
x = "Longitude", y = "Latitude"
) +
theme_minimal()
library(ggplot2)
ggplot(covid_lab_values, aes(x = factor(Days), y = as.numeric(VALUE), fill = survivor)) +
geom_boxplot(outlier.shape = NA, alpha = 0.7, position = position_dodge()) +
facet_wrap(~ DESCRIPTION, scales = "free_y") +
scale_fill_manual(
values = c("FALSE" = "#e41a1c", "TRUE" = "#377eb8"),
labels = c("Deceased", "Survivor")
) +
labs(
title = "Lab Values by Days from COVID Diagnosis",
x = "Days Since COVID Diagnosis",
y = "Lab Value",
fill = "Survival Status"
) +
theme_minimal(base_size = 13) +
theme(axis.text.x = element_text(angle = 45, vjust = 0.5, size = 8))