Packages install

library(tidyverse)
library(plotly)
library(ggpol)
library(maps)
library(lubridate)
library(ggalluvial)
library(GGally)

Import dataset

patientinfo <- read.csv("C:/Users/admin/OneDrive/Desktop/Data Visualization & Dashboarding with R coursera/Course 5/archive/PatientInfo.csv")
region <- read.csv("C:/Users/admin/OneDrive/Desktop/Data Visualization & Dashboarding with R coursera/Course 5/archive/region.csv")
patientinfo[patientinfo == ""] <- NA

patientinfo <- tibble(patientinfo)
head(patientinfo)
## # A tibble: 6 × 14
##   patient_id sex    age   country province city       infection_case infected_by
##        <dbl> <chr>  <chr> <chr>   <chr>    <chr>      <chr>          <chr>      
## 1 1000000001 male   50s   Korea   Seoul    Gangseo-gu overseas infl… <NA>       
## 2 1000000002 male   30s   Korea   Seoul    Jungnang-… overseas infl… <NA>       
## 3 1000000003 male   50s   Korea   Seoul    Jongno-gu  contact with … 2002000001 
## 4 1000000004 male   20s   Korea   Seoul    Mapo-gu    overseas infl… <NA>       
## 5 1000000005 female 20s   Korea   Seoul    Seongbuk-… contact with … 1000000002 
## 6 1000000006 female 50s   Korea   Seoul    Jongno-gu  contact with … 1000000003 
## # ℹ 6 more variables: contact_number <chr>, symptom_onset_date <chr>,
## #   confirmed_date <chr>, released_date <chr>, deceased_date <chr>, state <chr>

Introduction

The following is an attempt to analyze and visualize COVID-19 statistics in Korea, 2020 Jan to Jun. Data originates from Kaggle.

Aim:

  • to understand the epidemiology across different cities, different provinces across Korea

Plot 1: Bubble Map of Patients of COVID-19

  • showing the patient counts of cities in South Korea
  • this plot is interactive
world <- map_data("world")

sk <- world %>%
  filter(region == "South Korea")

city_count <- patientinfo %>%
  count(city, patient_id = "city_count")

mybreaks <- c(5, 10, 25, 50, 75, 100, 125, 150, 200, 250, 300, 500)

city_count_with_coords <- city_count %>%
  left_join(region %>% select(city, latitude, longitude), by = "city")


city_count_with_coords <- city_count_with_coords %>%
  arrange(n) %>%
  mutate(city = factor(city, unique(city))) %>%
  mutate(mytext = paste(
    "City: ", city, "\n",
    "Patient Count: ", n,
    sep = ""
  ))

plot3 <- ggplot() +
  geom_polygon(data = sk, fill = "grey", aes(x=long, y = lat, group = group)) + 
  coord_fixed(1.3) + geom_jitter(data = city_count_with_coords, aes(x=longitude, y = latitude, size = n, text = mytext, colour = n), alpha = 0.8) + 
  scale_color_viridis_c(
    option = "magma", trans = "log",
    breaks = mybreaks, name = "Number of Patients"
  ) +
  theme_void() +
  guides(colour = guide_legend()) +
  scale_size_continuous(
    name = "Number of Patients", 
    range = c(1, 12), breaks = mybreaks
  ) +
  scale_alpha_continuous(
    name = "Number of Patients", 
    range = c(0.2, .9), breaks = mybreaks
  )
plot3 <- ggplotly(plot3, tooltip = "text")
plot3

Plot 2: Grouped Bar Plot

  • Comparing the total number of cases that take places in different provinces, and their age_groups

  • differentiate the cases with 4 age groups: 0-10s = children, 20s-30s = young adults, 40-60s = middle aged, 60s+ = elderly

age_groups <- patientinfo %>%
  mutate(age_group = case_when(
    age %in% c("0s", "10s") ~ "children",
    age %in% c("20s", "30s") ~ "young adults",
    age %in% c("40s", "50s") ~ "middle age",
    age %in% c("60s", "70s", "80s", "90s", "100s") ~ "elderly"
  )) %>%
  filter(!is.na(age_group))

age_groups$age_group <- factor(age_groups$age_group)
age_groups$age_group <- fct_relevel(age_groups$age_group, "children", "young adults", "middle age", "elderly")
#plot 1
ggplot(age_groups, aes(x=province, fill = age_group)) + 
  geom_bar(position = "dodge") + 
  coord_flip() + 
  scale_y_continuous(breaks = c(0,100, 200, 300, 400, 500, 600, 700, 800, 900, 1000)) + 
  labs(
    title = "The Provinces that the Cases located in and their Age Groups",
    y = "The Number of COVID-19 Cases Accumulative",
    x = "Province"
    ) +
  scale_fill_brewer(type = "div") +
  theme_bw()

Plot 3: Boxplot and jitter

  • Comparing the disease duration across different age groups. Symptom_onset_date, confirmed_date (diagnosis), released_date (recovery) and deceased_date (death) -> Duration of disease calculated from symptom onset date to recovery / death
  • Outliers marked as grey
patientinfo_date <- age_groups %>%
  filter(!is.na(symptom_onset_date)) %>%
  filter(!is.na(confirmed_date))

patientinfo_date <- patientinfo_date %>%
  mutate(
    symptom_onset_date = ymd(symptom_onset_date),
    released_date = ymd(released_date),
    deceased_date = ymd(deceased_date)
  )

patientinfo_date <- patientinfo_date %>%
  mutate(
    outcome = case_when(
      !is.na(deceased_date) ~ "deceased",
      !is.na(released_date) ~ "recovered",
      TRUE ~ NA_character_
    ),
    duration = case_when(
      outcome == "deceased" ~ as.numeric(deceased_date - symptom_onset_date),
      outcome == "recovered" ~ as.numeric(released_date - symptom_onset_date),
      TRUE ~ NA_real_
    )
  ) %>%
  filter(!is.na(duration))  # Keep only patients with known durations



patientinfo_date$age <- factor(patientinfo_date$age)
patientinfo_date$age <- fct_relevel(patientinfo_date$age, "0s", "10s", "20s", "30s", "40s", "50s", "60s", "70s", "80s", "90s", "100s")


plot2 <- ggplot(patientinfo_date, aes(x = age, y = duration, color = outcome)) +
  geom_boxjitter(position = "dodge", errorbar.draw = TRUE, outlier.colour = "grey") +
  labs(
    title = "Duration from symptom onset to outcome by age group",
    y = "Days needed for recovery"
  )
plot2

Plot4: Alluvial Plot showing epidemiology

  • what are the causes of infection in each age groups?
patientinfo_date$state <- as.factor(patientinfo_date$state)

age_infectioncase_state <-patientinfo_date %>%
  select(age_group, infection_case) %>%
  count(age_group, infection_case, name = "freq") %>%
  filter(!is.na(infection_case)) %>%
  filter(!infection_case == "etc")

age_infectioncase_state <- age_infectioncase_state %>%
  mutate(infection_case = as.factor(infection_case))

age_infectioncase_state <- age_infectioncase_state %>%
  mutate(infection_case_grouped = infection_case) %>%
  mutate(
    infection_case_grouped = case_when(
      str_detect(tolower(infection_case), "church") ~ "Church-related",
      str_detect(tolower(infection_case), "hospital") ~ "Hospital",
      str_detect(tolower(infection_case), "gym") ~ "Fitness center",
      TRUE ~ infection_case  # keep as is if no match
    )
  )

age_infectioncase_state$infection_case <- fct_reorder(age_infectioncase_state$infection_case, age_infectioncase_state$infection_case_grouped)

ggplot(age_infectioncase_state,
       aes(axis1 = age_group,
           axis2 = infection_case_grouped,
           y = freq)) +
  geom_alluvium(aes(fill = infection_case_grouped), width = 0.15) +
  geom_stratum(width = 0.15, fill = "grey90", color = "black") +

  geom_label(stat = "stratum", aes(label = after_stat(stratum)),
             size = 3,
             label.padding = unit(0.15, "lines"),
             label.r = unit(0.15, "lines")) +

  scale_x_discrete(limits = c("Age Group", "Grouped Case"),
                   expand = c(.1, .1)) +

  scale_fill_brewer(palette = "Set2") +

  labs(title = "Flow of Infection Cases by Age Group",
       y = "Number of Patients",
       x = NULL) +

  theme_minimal() + theme(
    legend.title = element_text(size = 9),
    legend.text = element_text(size = 7),
    legend.key.size = unit(0.4, "cm"),
    legend.spacing.y = unit(0.2, "cm")
  )

Plot 5: Stacked area plot showing changes of infection cases along time

using confirmed cases

weekly_counts <- patientinfo_date %>%
  mutate(date = as.Date(confirmed_date),
         week = floor_date(date, unit = "week", week_start = 7)) %>%
  group_by(week, age_group) %>%
  count(name = "count_of_confirmed_cases")

ggplot(weekly_counts, aes(x = week, y = count_of_confirmed_cases, fill = age_group)) +
  geom_area() +
  scale_x_date(
    breaks = function(x) seq(min(x), max(x), by = "2 weeks"), 
    date_labels = "W%U" 
  ) +
  labs(title = "Weekly Infections by Age groups in Korea, Jan 2020 to Jun 2020", x = "Week", y = "Confirmed Cases") +
  theme_minimal() + scale_y_continuous(breaks = c(0, 10, 20, 30, 40, 50, 60,70)) + scale_fill_brewer()

Plot 6: Stacked Barchart, divided by age and sex.

  • duration categorized as within 2 weeks (14d), within a month (30d), within 2 months (60d), and more than 2 months (60+)
plotsix <- patientinfo_date %>%
  select(duration, sex, age) %>%
  mutate(duration_gp = case_when(
    duration <= 14 ~ "within 2 weeks", 
    duration <= 30 ~ "within a month", 
    duration <= 60 ~ "within 2 months",
    duration > 60 ~ "more than 2 months",
    TRUE ~ NA_character_
  ))

duration_gp <- as.factor(plotsix$duration_gp)
plotsix$duration_gp <- fct_relevel(duration_gp, "within 2 weeks", "within a month", "within 2 months", "more than 2 months")

plotsixcount <- plotsix %>%
  group_by(sex, age, duration_gp) %>%
  summarize(
    countofduration = n()
  )
plotsixcount <- plotsixcount %>%
  mutate(count_mirrored = ifelse(sex == "female", -1* countofduration, countofduration))

ggplot(plotsixcount, aes(x = age, y = count_mirrored, fill = duration_gp)) +
  geom_bar(position = "stack", stat = "identity", color = "black") +
  coord_flip() +
  geom_hline(yintercept = 0, color = "black") +
  theme(
    axis.title.y = element_blank(),
    legend.position = "top",
    panel.grid.major.y = element_blank()
  ) + scale_fill_brewer() + theme_bw() + scale_y_continuous(breaks = c(-30, -25, -20, -15, -10, -5, 0, 5, 10, 15, 20, 30, 35)) + annotate("text", y=-25, x = 9, label = "female") + annotate("text", y=15, x = 9, label = "male")

Plot 7: Heat map showing the relationship between city statistics and count of patients

plot7dat <- region %>%
  left_join(city_count, by = "city") %>%
  filter(!is.na(n))

plot7dat <- plot7dat %>%
  select(elderly_population_ratio, elderly_alone_ratio, nursing_home_count, university_count, n)

ggcorr(plot7dat,label = TRUE, label_size = 3, size = 3, hjust = 0.75, layout.exp = 1)

Plot 8: Distribution of disease duration

ggplot(patientinfo_date, aes(x=duration)) + geom_histogram(aes(y = ..density..), binwidth=3, fill="#69b3a2", color="#e9ecef") + scale_x_continuous(breaks = seq(0, 70, by =5)) + labs(title = "Distribution of disease duration") +
  stat_function(fun = dnorm, 
                args = list(mean = mean(patientinfo_date$duration, na.rm = TRUE), 
                            sd = sd(patientinfo_date$duration, na.rm = TRUE)), 
                color = "black", linewidth = 1) + theme_classic()