# some color settings for aesthetics
# you can just replace scale_fill_manual() with scale_fill_brewer(palette = "Paired")
library(ggplot2)
library(RColorBrewer)
colors <- 11
mycolors <- colorRampPalette(brewer.pal(9, "YlGnBu"))(colors)
mycolors <- rev(mycolors)
We are performing some exploratory analysis with the Dengue data from
CDC.
Let’s download the data first, which can also be download locally then read it afterward.
if(!file.exists("Dengue.csv")){
download.file(url = "https://od.cdc.gov.tw/eic/Dengue_Daily_last12m_EN.csv",
destfile="Dengue.csv",method='curl')}
data <- read.csv('Dengue.csv')
We can know that data is in dataframe format, within
26764 observations and 30 variables, and some of the columns such as
date, sex, and age group, are all in character format, which needs some
modification later on.
# str(data)
# head(data)
Since it’s a time series data, what we can do first is to plot the case counts by day, for the x-axis We will be using Date Onset column, and for the count data, since that each row is per case, we may first group by the data by date then sum up the total counts per day! Let’s fix the format we’re using first. 1. clean up the formats 2. group up by date 3. sum by the group to get daily counts 4. squareroot the counts and plot it
library(dplyr)
data %>% mutate(across(starts_with("Date"), ~as.Date(., format="%Y/%m/%d")),
Sex = as.factor(Sex)) -> data
# Completely same as below
# data$Date_Onset <- as.Date(data$Date_Onset)
# data$Date_Confirmation <- as.Date(data$Date_Confirmation)
# data$Date_Notification <- as.Date(data$Date_Notification)
# data$Sex <- as.factor(data$Sex)
data %>% group_by(Date_Onset) %>% summarise(count = n()) -> df_counts
df_counts
## # A tibble: 257 × 2
## Date_Onset count
## <date> <int>
## 1 2023-01-01 1
## 2 2023-01-03 1
## 3 2023-01-14 1
## 4 2023-01-15 1
## 5 2023-01-29 1
## 6 2023-02-02 3
## 7 2023-02-11 1
## 8 2023-02-18 1
## 9 2023-03-01 2
## 10 2023-03-07 1
## # ℹ 247 more rows
We can see in the df_counts data, there are 2 columns,
Date_Onset and the count of that day. Let’s further on plot it in
ggplot2 ! Remember to squareroot it first befor
plotting.
library(ggplot2)
df_counts %>% mutate(count_sqrt = sqrt(count)) %>%
ggplot(aes(Date_Onset, count_sqrt)) + geom_col(fill = mycolors[1]) +
xlab("Date of Onset") + ylab("Squareroot of Counts") + theme_classic()
We can see 2 obvious peaks in the plot.
Since the data include cases all over Taiwan, but dengue actually only had outbreaks in only few cities, for these cities which are more severe than others all called Hotspot. Let’s find out the Hotspot and plot it by these places!!
data %>% group_by(County_living) %>% summarise(count = n()) -> city_count
head(city_count)
## # A tibble: 6 × 2
## County_living count
## <chr> <int>
## 1 Changhua County 25
## 2 Chiayi City 42
## 3 Chiayi County 242
## 4 Hsinchu City 28
## 5 Hsinchu County 25
## 6 Hualien County 3
city_count data have the accumulated cases by the
cities. We have to sort it and find the Hotspot!!
(city_count %>% arrange(desc(count)) %>% head(4) -> top4_county)
## # A tibble: 4 × 2
## County_living count
## <chr> <int>
## 1 Tainan City 21553
## 2 Kaohsiung City 3229
## 3 Yunlin County 740
## 4 Pingtung County 412
So the Hotspot will be Tainan City, Kaohsiung City, Yunlin County,
Pingtung County, we will plot it by these place so on! There are few
things to know about it, County_living and
County_infected have different meanings, the former
includes all administrative region and the latter have more
epidemiological meanings in it, and the None coded in
County_infected means the entry of a case
had due to some reasons are not recorded, the knowing
to the data codebook could be a crucial factor for us to interpret the
results.
# (unique(data$County_living) -> a)
# (unique(data$County_infected) -> b)
# intersect(a,b)
To achieve our goal to plot the cases by hotspot, we have to recode
the column County_living and County_infected
into hotspot region and others region. Second, since the outbreak
started in the middle of the year, we’ll use data from June 2023.
counties_to_consider <- top4_county$County_living
data %>% mutate(
Hotspot = if_else(County_living %in% counties_to_consider, County_living, "Others"),
Hotspot_infected = if_else(County_infected %in% c(counties_to_consider,"None"),County_infected, "Others")
) %>% filter(Date_Onset > "2023-06-01") -> data_hotspot
data_hotspot %>% group_by(Hotspot, Date_Onset) %>% summarise(count = n()) -> df_counts
head(df_counts)
## # A tibble: 6 × 3
## # Groups: Hotspot [1]
## Hotspot Date_Onset count
## <chr> <date> <int>
## 1 Kaohsiung City 2023-06-21 2
## 2 Kaohsiung City 2023-06-22 1
## 3 Kaohsiung City 2023-06-28 1
## 4 Kaohsiung City 2023-06-30 3
## 5 Kaohsiung City 2023-07-01 1
## 6 Kaohsiung City 2023-07-03 1
we can see that df_counts generated by hotspot region,
case count and its date of onset. Finally plot it out!
df_counts$Hotspot <- factor(df_counts$Hotspot, levels = c(counties_to_consider, "Others"))
df_counts %>% mutate(count_sqrt = sqrt(count)) %>%
ggplot(aes(Date_Onset, count_sqrt, fill = Hotspot)) +
geom_col() + facet_wrap(~Hotspot) + theme_classic() +
xlab("Date of Onset") + ylab("Squareroot of Counts") +
scale_fill_manual(values = mycolors)
We will be using the concept of County_living and
County_infected previously mentioned here. The plot will
first group by the Hotspot then filled with
Hotspot_infected, which can provide us a point view from
both administratively and epidemiologicaly. Especially in Tainan’s plot,
we can see that the gov decided to give up recording the epidemiological
infected place.
data %>%
mutate(across(starts_with("Date"), ~as.Date(., format="%Y/%m/%d")),
Sex = as.factor(Sex),
Hotspot = if_else(County_living %in% counties_to_consider, County_living, "Others"),
Hotspot_infected = if_else(County_infected %in% c(counties_to_consider, "None"), County_infected, "Others")
) %>%
filter(Date_Onset > "2023-06-01") -> df
df %>% group_by(Hotspot, Date_Onset) %>%
summarise(count = n(), .groups='drop') -> df_counts
df %>%
ggplot(aes(Date_Onset, fill=Hotspot_infected, color=Hotspot_infected)) +
geom_bar() + facet_wrap(~Hotspot, ncol=3) + theme_classic()
Let’s do everything at once!
# find the top 10
data %>% filter(County_living =='Tainan City') %>%
group_by(Township_living) %>%
summarise(count = n(), .groups = 'drop') %>%
arrange(desc(count)) %>% head(10) -> top10_tainan
# keep the top 10 into vector
(towns_to_consider <- top10_tainan$Township_living)
## [1] "East Dist." "Yongkang Dist." "Annan Dist."
## [4] "South Dist." "North Dist." "West Central Dist."
## [7] "Rende Dist." "Anping Dist." "Guanmiao Dist."
## [10] "Guiren Dist."
# keep the district's data
data %>%
mutate(town_Hotspot = if_else(Township_living %in% towns_to_consider, Township_living, "Others")) %>%
filter(Date_Onset > "2023-06-01", County_living == "Tainan City") -> tainan_town_hotspot
# counts per dist and days
tainan_town_hotspot %>% group_by(town_Hotspot, Date_Onset) %>% summarise(count = n()) -> df_counts
head(df_counts)
## # A tibble: 6 × 3
## # Groups: town_Hotspot [1]
## town_Hotspot Date_Onset count
## <chr> <date> <int>
## 1 Annan Dist. 2023-06-17 1
## 2 Annan Dist. 2023-06-25 1
## 3 Annan Dist. 2023-06-26 1
## 4 Annan Dist. 2023-06-28 1
## 5 Annan Dist. 2023-07-02 2
## 6 Annan Dist. 2023-07-05 1
# levels from the most to least
df_counts$town_Hotspot <- factor(df_counts$town_Hotspot, levels = c(towns_to_consider, "Others"))
df_counts %>% mutate(count_sqrt = sqrt(count)) %>%
ggplot(aes(Date_Onset, count_sqrt, fill = town_Hotspot)) +
geom_col() + scale_fill_manual(values = mycolors) +
facet_wrap(~town_Hotspot) + theme_classic() +
xlab("Date of Onset") + ylab("Squareroot of Counts")
df_counts %>% mutate(count_sqrt = sqrt(count)) %>%
ggplot(aes(Date_Onset, count_sqrt, fill = town_Hotspot)) +
geom_col() + scale_fill_manual(values = mycolors) +
theme_classic() + xlab("Date of Onset") + ylab("Squareroot of Counts")
To play and plot on the maps, we need to use this 2 packages, please download it if you haven’t!
library(sf)
library(geojsonio)
so the data we’re using, is originally from open data from National Land Surveying and Mapping Center , the file is so far(entry date : 2024/01/16) only provided in the Chinese version of the website, go grab it if the google link doesn’t work! (link : https://maps.nlsc.gov.tw/)
# Download the file by the link first
# https://drive.google.com/file/d/1ONN4kg7b3oZP02qgyEBQYN6PfDAMth0O/view?usp=sharing
# make sure it's in your working place
#list.files()
# untar the file to the specified folder
extracted_files <- "D:\\grad\\New folder\\extracted_files\\"
dir.create(extracted_files, showWarnings = FALSE)
untar("D:\\grad\\New folder\\TOWN_MOI_1120317.tar.gz",exdir = extracted_files)
# read the data
d <- read_sf(dsn = extracted_files, layer = "TOWN_MOI_1120317")
# class(d)
# str(d)
# head(d)
We’re just going to look at the data from Tainan, so filter it out and color the Tainan city on the plot.
d %>% mutate(isTainan = if_else(COUNTYNAME=='臺南市', T, F)) %>% # new col which identify if it's from Tainan
ggplot(aes(fill = isTainan)) + # fill the color with the identifier
geom_sf() + # plot the geographical plot
coord_sf(xlim = c(119.9, 122.1), ylim = c(22, 25.2)) + # reframe it for clarity
theme_bw()+ # aesthetic
scale_fill_brewer("YlGnBu")
We can see we’ve just generate a plot with colored Tainan City! We’ve
encountered a problem that in the geographical data d,
their town’s english column TOWNENG are different from the
filtered data towns_to_consider originally from the dengue
data we were previously using, so let’s fix that first.
towns_to_consider
## [1] "East Dist." "Yongkang Dist." "Annan Dist."
## [4] "South Dist." "North Dist." "West Central Dist."
## [7] "Rende Dist." "Anping Dist." "Guanmiao Dist."
## [10] "Guiren Dist."
towns_to_consider_2 <- gsub("\\.", "rict", towns_to_consider)
towns_to_consider_2
## [1] "East District" "Yongkang District" "Annan District"
## [4] "South District" "North District" "West Central District"
## [7] "Rende District" "Anping District" "Guanmiao District"
## [10] "Guiren District"
d %>% filter(COUNTYNAME=="臺南市") %>%
mutate(town_selected = if_else(TOWNENG %in% towns_to_consider_2, TOWNENG, "Others"),
town_selected = factor(town_selected, levels=c(towns_to_consider_2, "Others"))) %>%
ggplot(aes(fill=town_selected)) +
scale_fill_manual(values = mycolors) + theme_void() +
geom_sf() + coord_sf(xlim = c(120.02, 120.67), ylim = c(22.88, 23.42))
Our ultimate goal is to calculate the Rt with Cori method, first recall the definition for Cori method and decide our workflow, In Cori method, we get the cases first then calculate the expected number of its’ primary infections by generation time function. \[ R_t = \frac{{\text{{new secondary infections}}_t}}{{\text{{primary infections}}}} \]
Our workflow will be something like this: 1. get the scale and shape
parameters 2. form the g(t) with par and
pgamma() 3. calculate the expected number of primary
infections(Rt’s denominator) 4. generate the R(t) 5. plot it out
We will continuosly ise the data from dengue data from tainan, and
it’s just copy paste from what we’ve done previously, using 2 different
filter Imported != "Y" and
Date_Onset > "2023-05-01", and adding day
column calculated from the first index case’s date.
data %>% filter(County_living =='Tainan City') %>%
group_by(Township_living) %>%
summarise(count = n(), .groups = 'drop') %>%
arrange(desc(count)) %>% mutate(rank = 1:n()) %>% head(10) -> top10_tainan
# keep the top 10 into vector
towns_to_consider <- top10_tainan$Township_living
# keep the district's data
data %>%
mutate(town_Hotspot = if_else(Township_living %in% towns_to_consider, Township_living, "Others")) %>%
filter(Imported != "Y", Date_Onset > "2023-05-01", County_living == "Tainan City") -> tainan
# counts per dist and days
tainan%>% group_by(Date_Onset) %>% summarise(count = n()) %>% mutate(day = as.integer(Date_Onset - min(Date_Onset) + 1))-> df_tainan_counts
head(df_tainan_counts)
## # A tibble: 6 × 3
## Date_Onset count day
## <date> <int> <int>
## 1 2023-06-03 2 1
## 2 2023-06-04 1 2
## 3 2023-06-08 1 6
## 4 2023-06-09 2 7
## 5 2023-06-11 1 9
## 6 2023-06-12 1 10
But in order to calculate the number of expected infectors on time t,
we need to generate a dataframe with daily counts. So,
what we’re going to do is as below.
and follows the codes…
library(tidyr)
expand.grid(day = min(df_tainan_counts$day):max(df_tainan_counts$day)) %>%
left_join(df_tainan_counts, by = join_by(day)) %>%
select(-Date_Onset) %>%
replace_na(list(count = 0)) -> df
head(df)
## day count
## 1 1 2
## 2 2 1
## 3 3 0
## 4 4 0
## 5 5 0
## 6 6 1
#plot(df$count, xlab = "day", ylab = "top 10 tainan district's count")
# so as the and this is the epi curve for top10 Tainan district.
So as prof said, generation time will be calculate or retrieved from outside resources, usually CDC will pay extra attention on the beginning of the outbreak, and build up the g(t) curve for the disease, and observe which distribution does it fit better, and then publish a paper recommending which distribution to use with the generation time function’s parameter shape and scale, for our case, we adopted the research result from Bacallado, et al. 2020.
(ref link : https://www.eurosurveillance.org/content/10.2807/1560-7917.ES.2020.25.29.2001381# )
Our goal in this part, is to made up the g(t) function from the known parameters by Cori method.
# generation time
gt_shape = 3.44
gt_scale = 5.12
time <- seq(1,60)
cdf_val <- pgamma(time, shape = gt_shape, scale = gt_scale)
cdf_val2 <- pgamma(time-1, shape = gt_shape, scale = gt_scale)
plot(time,cdf_val,type = "l", col = "steelblue", lwd = 2, xlab = "time", ylab = "Gamma Distribution CDF")
lines(time,cdf_val2, col = "pink", lwd = 2)
Recall the Cori method, we found secondary infectees at time t first.
\[ R_t = \frac{{\text{{new secondary infections}}_t}}{{\text{{primary infections}}}} \]
pgamma(t,shape = gt_shape, scale = gt_scale) means the
CDF up to time t
pgamma(t-1,shape = gt_shape,scale = gt_scale means the
CDF up to time t - 1 In order to calculate the
PMF of the time interval(t-1 -> t), we subtract this 2
values.
So the generation_time(t) will take t as
input and calculate the PMF of the time interval(t-1 ->
t).
Since the area under the PMF curve is equal to 1, the
sum(GTs) will be near to 1.
GTs means the probability of the generation time at time t.
# in Cori et al, we need to discretize this gamma distribution
# pgamma at time t - pgamma at time t-1
generation_time = function(t){
pgamma(t,shape = gt_shape, scale = gt_scale)-pgamma(t-1,shape = gt_shape,scale = gt_scale)
}
GTs <- sapply(1:60, generation_time)
sum(GTs)
## [1] 0.9986903
plot(GTs, type = "b",xlab = "the length of generation time", ylab = "probability")
abline(v = which.max(GTs), col =2, lwd = 2)
For instance, in the top 10 tainan districts’ hotspot data, the highest probabilty for the generation time is around 13 days.
plot(time,cdf_val/max(cdf_val),type = "l", col = "steelblue", lwd = 2, xlab = "time", ylab = "Gamma Distribution",ylim = c(0,1))
par(new = TRUE)
plot(time,GTs/max(GTs), type = "l", col = "pink", lwd =2,xlab = "", ylab = "" , ylim = c(0,1))
legend("topright", legend = c("Normalized CDF", "Normalized PMF"), col = c("steelblue", "pink"), lwd = 2,cex = 0.75)
and these are just some plot of the PMF and CDF of the gamma distribution, which could help you to have a better knowing.
# tainan district's gt by the amount of their time
GTs = sapply(1:nrow(df), generation_time)
#for each day: sum(n[day-h] * GTs[h]) = sum(n[h] * GTs[day-h]), h=1..day-1
res = c(NA)
# because we can only calc Rt from day 2
for (day in 2:nrow(df)) {
res_ = 0.0
for (h in 1:(day-1))
res_ = res_ + df$count[h] * GTs[day-h]
# equivalently as a one-line formula
# res_ = head(df$count, day-1) %*% tail(rev(GTs), day-1)
res = c(res, res_)
}
df %>% mutate(conv = res) %>%
mutate(Rt = count / conv) %>%
filter(conv >0)-> df
df %>%
ggplot(aes(day)) +
geom_col(aes(y=count / max(count) * 5), alpha=.25) +
geom_point(aes(y=Rt, color = "Rt"),shape = 19, size = 2, alpha = 0.4) +
coord_cartesian(ylim=c(0,5)) + theme_classic() + ylab("cases count") +
labs(color = "") +
scale_color_manual(values = "steelblue", name = "") +
theme(legend.position ="right")
If you’re still confuse about how the for loop summation, why not try another point of view?
not done yet…..
# knitr::include_graphics()
# res_ = head(df$count, day-1) %*% tail(rev(GTs), day-1)
We call it step though…
Rt_step = 7
day <- 8
(day - 1) %/% Rt_step + 1
## [1] 2
# %/% integral division operator
df %>%
mutate(Rt_index = ((day - 1) %/% Rt_step) + 1) -> df_tainan_counts_
df_tainan_counts_ %>%
group_by(Rt_index) %>%
summarise(Rt = sum(count) / sum(conv)) -> df_tmp
df_tmp %>% right_join(df_tainan_counts_ %>% select(-Rt),by = join_by(Rt_index)) -> df_tainan_piecewise_constant
head(df_tainan_piecewise_constant)
## # A tibble: 6 × 5
## Rt_index Rt day count conv
## <dbl> <dbl> <int> <int> <dbl>
## 1 1 22.9 2 1 0.000584
## 2 1 22.9 3 0 0.00516
## 3 1 22.9 4 0 0.0160
## 4 1 22.9 5 0 0.0319
## 5 1 22.9 6 1 0.0506
## 6 1 22.9 7 2 0.0703
df_tainan_piecewise_constant %>%
ggplot(aes(day)) +
geom_col(aes(y = count / max(count) * 5), alpha=.25) +
geom_point(aes(y=Rt,color = "Rt")) +
coord_cartesian(ylim=c(0,5)) + ylab("cases count") +
scale_color_manual(values = "steelblue", name = "") +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) +
theme_classic()