library(tidyverse)
library(plotly)
library(ggpol)
library(maps)
library(lubridate)
library(ggalluvial)
library(GGally)
patientinfo <- read.csv("C:/Users/admin/OneDrive/Desktop/Data Visualization & Dashboarding with R coursera/Course 5/archive/PatientInfo.csv")
region <- read.csv("C:/Users/admin/OneDrive/Desktop/Data Visualization & Dashboarding with R coursera/Course 5/archive/region.csv")
patientinfo[patientinfo == ""] <- NA
patientinfo <- tibble(patientinfo)
head(patientinfo)
## # A tibble: 6 × 14
## patient_id sex age country province city infection_case infected_by
## <dbl> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 1000000001 male 50s Korea Seoul Gangseo-gu overseas infl… <NA>
## 2 1000000002 male 30s Korea Seoul Jungnang-… overseas infl… <NA>
## 3 1000000003 male 50s Korea Seoul Jongno-gu contact with … 2002000001
## 4 1000000004 male 20s Korea Seoul Mapo-gu overseas infl… <NA>
## 5 1000000005 female 20s Korea Seoul Seongbuk-… contact with … 1000000002
## 6 1000000006 female 50s Korea Seoul Jongno-gu contact with … 1000000003
## # ℹ 6 more variables: contact_number <chr>, symptom_onset_date <chr>,
## # confirmed_date <chr>, released_date <chr>, deceased_date <chr>, state <chr>
The following is an attempt to analyze and visualize COVID-19 statistics in Korea, 2020 Jan to Jun. Data originates from Kaggle.
world <- map_data("world")
sk <- world %>%
filter(region == "South Korea")
city_count <- patientinfo %>%
count(city, patient_id = "city_count")
mybreaks <- c(5, 10, 25, 50, 75, 100, 125, 150, 200, 250, 300, 500)
city_count_with_coords <- city_count %>%
left_join(region %>% select(city, latitude, longitude), by = "city")
city_count_with_coords <- city_count_with_coords %>%
arrange(n) %>%
mutate(city = factor(city, unique(city))) %>%
mutate(mytext = paste(
"City: ", city, "\n",
"Patient Count: ", n,
sep = ""
))
plot3 <- ggplot() +
geom_polygon(data = sk, fill = "grey", aes(x=long, y = lat, group = group)) +
coord_fixed(1.3) + geom_jitter(data = city_count_with_coords, aes(x=longitude, y = latitude, size = n, text = mytext, colour = n), alpha = 0.8) +
scale_color_viridis_c(
option = "magma", trans = "log",
breaks = mybreaks, name = "Number of Patients"
) +
theme_void() +
guides(colour = guide_legend()) +
scale_size_continuous(
name = "Number of Patients",
range = c(1, 12), breaks = mybreaks
) +
scale_alpha_continuous(
name = "Number of Patients",
range = c(0.2, .9), breaks = mybreaks
)
plot3 <- ggplotly(plot3, tooltip = "text")
plot3
Comparing the total number of cases that take places in different provinces, and their age_groups
differentiate the cases with 4 age groups: 0-10s = children, 20s-30s = young adults, 40-60s = middle aged, 60s+ = elderly
age_groups <- patientinfo %>%
mutate(age_group = case_when(
age %in% c("0s", "10s") ~ "children",
age %in% c("20s", "30s") ~ "young adults",
age %in% c("40s", "50s") ~ "middle age",
age %in% c("60s", "70s", "80s", "90s", "100s") ~ "elderly"
)) %>%
filter(!is.na(age_group))
age_groups$age_group <- factor(age_groups$age_group)
age_groups$age_group <- fct_relevel(age_groups$age_group, "children", "young adults", "middle age", "elderly")
#plot 1
ggplot(age_groups, aes(x=province, fill = age_group)) +
geom_bar(position = "dodge") +
coord_flip() +
scale_y_continuous(breaks = c(0,100, 200, 300, 400, 500, 600, 700, 800, 900, 1000)) +
labs(
title = "The Provinces that the Cases located in and their Age Groups",
y = "The Number of COVID-19 Cases Accumulative",
x = "Province"
) +
scale_fill_brewer(type = "div") +
theme_bw()
patientinfo_date <- age_groups %>%
filter(!is.na(symptom_onset_date)) %>%
filter(!is.na(confirmed_date))
patientinfo_date <- patientinfo_date %>%
mutate(
symptom_onset_date = ymd(symptom_onset_date),
released_date = ymd(released_date),
deceased_date = ymd(deceased_date)
)
patientinfo_date <- patientinfo_date %>%
mutate(
outcome = case_when(
!is.na(deceased_date) ~ "deceased",
!is.na(released_date) ~ "recovered",
TRUE ~ NA_character_
),
duration = case_when(
outcome == "deceased" ~ as.numeric(deceased_date - symptom_onset_date),
outcome == "recovered" ~ as.numeric(released_date - symptom_onset_date),
TRUE ~ NA_real_
)
) %>%
filter(!is.na(duration)) # Keep only patients with known durations
patientinfo_date$age <- factor(patientinfo_date$age)
patientinfo_date$age <- fct_relevel(patientinfo_date$age, "0s", "10s", "20s", "30s", "40s", "50s", "60s", "70s", "80s", "90s", "100s")
plot2 <- ggplot(patientinfo_date, aes(x = age, y = duration, color = outcome)) +
geom_boxjitter(position = "dodge", errorbar.draw = TRUE, outlier.colour = "grey") +
labs(
title = "Duration from symptom onset to outcome by age group",
y = "Days needed for recovery"
)
plot2
patientinfo_date$state <- as.factor(patientinfo_date$state)
age_infectioncase_state <-patientinfo_date %>%
select(age_group, infection_case) %>%
count(age_group, infection_case, name = "freq") %>%
filter(!is.na(infection_case)) %>%
filter(!infection_case == "etc")
age_infectioncase_state <- age_infectioncase_state %>%
mutate(infection_case = as.factor(infection_case))
age_infectioncase_state <- age_infectioncase_state %>%
mutate(infection_case_grouped = infection_case) %>%
mutate(
infection_case_grouped = case_when(
str_detect(tolower(infection_case), "church") ~ "Church-related",
str_detect(tolower(infection_case), "hospital") ~ "Hospital",
str_detect(tolower(infection_case), "gym") ~ "Fitness center",
TRUE ~ infection_case # keep as is if no match
)
)
age_infectioncase_state$infection_case <- fct_reorder(age_infectioncase_state$infection_case, age_infectioncase_state$infection_case_grouped)
ggplot(age_infectioncase_state,
aes(axis1 = age_group,
axis2 = infection_case_grouped,
y = freq)) +
geom_alluvium(aes(fill = infection_case_grouped), width = 0.15) +
geom_stratum(width = 0.15, fill = "grey90", color = "black") +
geom_label(stat = "stratum", aes(label = after_stat(stratum)),
size = 3,
label.padding = unit(0.15, "lines"),
label.r = unit(0.15, "lines")) +
scale_x_discrete(limits = c("Age Group", "Grouped Case"),
expand = c(.1, .1)) +
scale_fill_brewer(palette = "Set2") +
labs(title = "Flow of Infection Cases by Age Group",
y = "Number of Patients",
x = NULL) +
theme_minimal() + theme(
legend.title = element_text(size = 9),
legend.text = element_text(size = 7),
legend.key.size = unit(0.4, "cm"),
legend.spacing.y = unit(0.2, "cm")
)
using confirmed cases
weekly_counts <- patientinfo_date %>%
mutate(date = as.Date(confirmed_date),
week = floor_date(date, unit = "week", week_start = 7)) %>%
group_by(week, age_group) %>%
count(name = "count_of_confirmed_cases")
ggplot(weekly_counts, aes(x = week, y = count_of_confirmed_cases, fill = age_group)) +
geom_area() +
scale_x_date(
breaks = function(x) seq(min(x), max(x), by = "2 weeks"),
date_labels = "W%U"
) +
labs(title = "Weekly Infections by Age groups in Korea, Jan 2020 to Jun 2020", x = "Week", y = "Confirmed Cases") +
theme_minimal() + scale_y_continuous(breaks = c(0, 10, 20, 30, 40, 50, 60,70)) + scale_fill_brewer()
plotsix <- patientinfo_date %>%
select(duration, sex, age) %>%
mutate(duration_gp = case_when(
duration <= 14 ~ "within 2 weeks",
duration <= 30 ~ "within a month",
duration <= 60 ~ "within 2 months",
duration > 60 ~ "more than 2 months",
TRUE ~ NA_character_
))
duration_gp <- as.factor(plotsix$duration_gp)
plotsix$duration_gp <- fct_relevel(duration_gp, "within 2 weeks", "within a month", "within 2 months", "more than 2 months")
plotsixcount <- plotsix %>%
group_by(sex, age, duration_gp) %>%
summarize(
countofduration = n()
)
plotsixcount <- plotsixcount %>%
mutate(count_mirrored = ifelse(sex == "female", -1* countofduration, countofduration))
ggplot(plotsixcount, aes(x = age, y = count_mirrored, fill = duration_gp)) +
geom_bar(position = "stack", stat = "identity", color = "black") +
coord_flip() +
geom_hline(yintercept = 0, color = "black") +
theme(
axis.title.y = element_blank(),
legend.position = "top",
panel.grid.major.y = element_blank()
) + scale_fill_brewer() + theme_bw() + scale_y_continuous(breaks = c(-30, -25, -20, -15, -10, -5, 0, 5, 10, 15, 20, 30, 35)) + annotate("text", y=-25, x = 9, label = "female") + annotate("text", y=15, x = 9, label = "male")
plot7dat <- region %>%
left_join(city_count, by = "city") %>%
filter(!is.na(n))
plot7dat <- plot7dat %>%
select(elderly_population_ratio, elderly_alone_ratio, nursing_home_count, university_count, n)
ggcorr(plot7dat,label = TRUE, label_size = 3, size = 3, hjust = 0.75, layout.exp = 1)
ggplot(patientinfo_date, aes(x=duration)) + geom_histogram(aes(y = ..density..), binwidth=3, fill="#69b3a2", color="#e9ecef") + scale_x_continuous(breaks = seq(0, 70, by =5)) + labs(title = "Distribution of disease duration") +
stat_function(fun = dnorm,
args = list(mean = mean(patientinfo_date$duration, na.rm = TRUE),
sd = sd(patientinfo_date$duration, na.rm = TRUE)),
color = "black", linewidth = 1) + theme_classic()