#载入相关宏包
library(EpiStats)
library(lubridate)
library(tidyverse)
library(magrittr)
setwd("D:/food_borne_outbreaks/outbreak")
#读入数据
os <- read.csv("oswego1.csv")
## 数据规整
oswego1 <- os %>%
mutate(
sex = factor(
sex,
levels = c('M', 'F'),
labels = c('男性', '女性')
),
onsetdate = paste0("1940/", onsetdate) %>% as.Date("%Y/%m/%d"),
onsettime = paste0(onsettime %/% 100, ":", onsettime %% 100),
onset_date_time = parse_date_time(paste0(onsetdate, " ", onsettime), "Ymd HM"),
timesupper = paste0("1940/04/18", " ", timesupper %/% 100, ":", timesupper %% 100),
timesupper = parse_date_time(timesupper, "Ymd HM")
) %>%
dplyr::select(age, sex, ill, timesupper, onset_date_time, everything())
1940年4月19日,纽约州Oswego县Lycoming村的地方卫生官员向位于Syracuse的区卫生官员报告了一起急性胃肠道疾病爆发。正在接受流行病学培训的A. M. Rubin医生被派到现场进行调查。现将本起疫情调查处理总结报告如下。
奥斯威戈( Oswego)是美国纽约州奥斯威戈县的一座城市,是奥斯威戈县的县城。2010年人口普査时人口为18142人。奥斯威戈位于组约州北部中心的安大略湖边,并将其称为“纽约州中部港口城市”。奥斯威戈市分别与西部,南部和东部的奥斯威戈镇,米奈托和Scribal以及北部的安大略湖相连接。奥斯威戈赛道是一个全国知名的汽车赛车设施。纽约州立大学奥斯威戈分校就位于城外湖泊边。在蒙大拿州,俄勒网州,伊利诺伊州和堪萨斯州有奥斯威戈的同名社区。
sample <- os %>%
nrow()
cases <- os %>%
summarise(cases = sum(ill))
1940年4月19日,纽约市Oswego县Lycoming村的卫生人员向Syracuse地区卫生官员报告了一起急性胃肠疾病暴发。到达现场后,调查组从当地卫生人员那里了解到所有已知患病的人都出席过前一晚上即4月18日举行的教堂晚餐。没有参加教堂晚餐的家属则未发病。调查的焦点就集中在教堂晚餐。对已知出席晚餐的80人中的75人进行了调查,收集了临床症状、症状出现时间,以及在教堂晚餐上所吃食物等信息。在目前发现的聚餐者中,有46人报告发病,罹患率为61.33%(75/46)。
1940年4月18日参加的教堂晚餐者出现腹泻或呕吐,伴或不伴发热等症状者。
index <- oswego1 %>%
dplyr::filter(onset_date_time == as_datetime(min(onset_date_time, na.rm = T))) %>%
mutate(index = paste0("某某某",",", sex,",", age,"岁,", "发病时间为", onset_date_time))
某某某,男性,8岁,发病时间为1940-04-18 15:00:00。
index1 <- oswego1 %>%
dplyr::filter(onset_date_time == as_datetime(min(onset_date_time, na.rm = T)))
index2 <- oswego1 %>%
dplyr::filter(onset_date_time == as_datetime(max(onset_date_time, na.rm = T)))
最早病例发病时间为1940-04-18 15:00:00,末例病例发病时间为1940-04-19 10:30:00。疫情发病曲线如下图。
oswego1 %>%
filter(ill == "TRUE") %>%
mutate(sex = as.character(sex)) %>%
ggplot(aes(onset_date_time, fill = sex)) +
geom_histogram(binwidth = 3600, color = 'grey50')+
scale_x_datetime(date_breaks = "2 hour",
labels = scales::label_date_short(format = c("%Y年", "%m月", "%d日", "%H:%M"))
)+
scale_y_continuous(breaks = seq(1, 10), expand = c(0, 0))+
labs(
title = "1940年4月18-19日纽约Oswego县胃肠道疾病发病曲线",
caption = "数据来源:FETP/CDC",
x = "发病时间",
y = "病例数",
fill = ""
)+
theme_classic()
按不同时间间隔生成发病曲线如下图。
## 按不同的时间间隔制作流行曲线
#平均潜伏期为4小时
incubation <- 4 * 3600
#时间间隔离分别设置为平均潜伏期的1/8 ~ 1/3
binwidth <- 1/8:3
pic <- vector("list", length(binwidth))
for (i in seq_along(binwidth)) {
pic[[i]] <- oswego1 %>%
filter(ill == "TRUE") %>%
mutate(sex = as.character(sex)) %>%
ggplot(aes(onset_date_time)) +
geom_histogram(binwidth = binwidth[i] * incubation, color = 'grey50')+
scale_x_datetime(date_breaks = "4 hour",
labels = scales::label_date_short(format = c("%Y年", "%m月", "%d日", "%H:%M"))
)+
scale_y_continuous(breaks = seq(1, 20), expand = c(0, 0))+
labs(
title = "Oswego县胃肠道疾病发病曲线",
subtitle = paste0("(以时间间隔的", round(binwidth[i],2), "倍即", round(binwidth[i] * incubation/3600, 2), "小时计)"),
x = "发病时间",
y = "病例数",
fill = ""
)+
theme_classic()+
theme(
axis.title = element_blank(),
axis.text = element_blank(),
axis.ticks = element_blank()
)
}
library(patchwork)
pic[[1]] + pic[[4]] + pic[[5]] + pic[[6]]
set.seed(2020)
oswego_map <- oswego1 %>%
mutate(
lon = runif(seq_along(ill), -76.5555, -76.2222),
lat = runif(seq_along(ill), 43.3333, 43.4444)
)
oswego_counties <- map_data("county", "new york,oswego") %>%
dplyr::select(lon = long, lat, group, id = subregion)
ggplot(oswego_counties, aes(lon, lat)) +
geom_polygon(fill = "white", colour = "grey50") +
geom_point(data = oswego_map, aes(x = lon, y = lat), color = "red")+
coord_quickmap()
#性别分布
sex <- oswego1 %>%
filter(ill == "TRUE") %>%
group_by(sex) %>%
summarise(n = n()) %>%
mutate(sum = sum(n),
percent = n / sum * 100) %>%
mutate(sex1 = paste0(sex,n, "例"))
m <- sex %>%
filter(sex == "男性")
f <- sex %>% filter(sex == "女性")
46例病人中男性16例、女性30例,性别比为0.53。性别分布图如下。
pie(sex$n,
caption = "Oswego教堂胃肠道疾病病例性别分布",
col = c("red", "blue"),
labels = paste0(sex$sex, "," , round(sex$percent, 2), "%"))
ages <- oswego1 %>%
filter(ill == "TRUE") %>%
summarise(
min = min(age, na.rm = T),
max = max(age, na.rm = T),
mean = mean(age, na.rm = T) %>% round(.,2),
median = median(age, na.rm = T),
)
ageg <- oswego1 %>%
filter(ill == "TRUE") %>%
mutate(agegroup = cut(age,
breaks = c(seq(0, 80, 10), Inf),
labels = paste0(seq(0, 80, 10), "岁~"),
right = F)) %>%
group_by(agegroup) %>%
tally() %>%
mutate(ageg = paste0(agegroup, "组",n, "例"))
病例中年龄最小3岁、最大77岁,均数为39.26岁,中位数为38.5岁。具体为:0岁~组4例, 10岁~组9例, 20岁~组3例, 30岁~组7例, 40岁~组4例, 50岁~组9例, 60岁~组6例, 70岁~组4例。年龄分布图如下。
# #年龄分布
oswego1 %>%
filter(ill == "TRUE") %>%
ggplot(aes(age, fill = sex))+
geom_density(alpha = .5)+
theme_classic()+
scale_y_continuous(expand = c(0,0))
oswego1 %>%
filter(ill == "TRUE") %>%
mutate(agegroup = cut(age,
breaks = c(seq(0, 80, 10), Inf),
labels = paste0(seq(0, 80, 10), "岁~"),
right = F)) %>%
group_by(agegroup, sex) %>%
tally() %>%
ggplot(aes(n, agegroup, fill = sex))+
geom_col()+
theme_classic()+
scale_x_continuous(expand = c(0,0))
### 病例潜伏期计算
cubation <- oswego1 %>%
filter( !is.na(timesupper) & !is.na(onset_date_time)) %>%
mutate(cub = onset_date_time - timesupper) %>%
dplyr::select(age, sex, cub)
### 潜伏期分布
cubation %>%
ggplot(aes(as.numeric(cub), fill = sex))+
geom_histogram(binwidth = 1)+
labs(
title = "Oswego一教堂晚餐胃肠道病例潜伏期分布图",
x = "潜伏期(小时)",
y = "",
fill = ""
)+
scale_y_continuous(expand = c(0,0))+
theme_classic()
# 平均潜伏期估计
cubs <- cubation %>%
summarise(`最短潜伏期` = min(cub),
`最长潜伏期` = max(cub),
`极差` = max(cub) - min (cub),
`潜伏期均期` = mean(cub),
`潜伏期中位数` = median(cub)) %>%
as_tibble()
因病例均与晚餐有关,以晚餐为暴露因素,做潜伏期分析如下。
#暴露与发病时间序列图
oswego1 %>%
pivot_longer(c(timesupper, onset_date_time),
names_to = "type",
values_to = "time") %>%
ggplot(aes(time, fill = type)) +
geom_histogram(binwidth = 3600, color = 'grey50', position = position_dodge())+
scale_x_datetime(date_breaks = "2 hour",
labels = scales::label_date_short(format = c("%Y年", "%m月", "%d日", "%H:%M"))
)+
labs(
title = "1940年4月18-19日纽约Oswego县胃肠道疾病暴露与发病时序图",
caption = "数据来源:FETP/CDC",
x = "暴露/发病时间",
y = "人数",
fill = "")+
scale_fill_discrete(
limits = c("timesupper", "onset_date_time" ),
labels = c("暴露", "发病"))+
scale_y_continuous(breaks = seq(1, 15), expand = c(0, 0))+
theme_classic()
该疫情的最短潜伏期为3小时,最长潜伏期为7小时,平均潜伏期为4小时。潜伏期分布如下图。
cubation %>%
ggplot(aes(as.numeric(cub), fill = sex))+
geom_histogram(binwidth = 1)+
theme_classic()+
scale_y_continuous(expand = c(0,0))
# 暴露与发病时间点线图
oswego1 %>%
# filter(!is.na(timesupper), !is.na(onset_date_time)) %>%
rownames_to_column() %>%
ggplot(aes(onset_date_time, fct_reorder(rowname, onset_date_time - timesupper), color = sex))+
geom_point(data = oswego1 %>% rownames_to_column(),
aes(timesupper, fct_reorder(rowname, onset_date_time - timesupper), color = sex),shape = 19, size = 2, show.legend = TRUE)+
geom_point(data = oswego1 %>% rownames_to_column(),
aes(onset_date_time, fct_reorder(rowname, onset_date_time - timesupper), color = sex), shape = 21,size = 2)+
# ggplot(aes(onset_date_time, fct_reorder(rowname, timesupper), color = sex))+
# geom_point(data = oswego1 %>% rownames_to_column(),
# aes(timesupper, fct_reorder(rowname, timesupper), color = sex),shape = 19, size = 2, show.legend = TRUE)+
# geom_point(data = oswego1 %>% rownames_to_column(),
# aes(onset_date_time, fct_reorder(rowname, timesupper), color = sex), shape = 21,size = 2)+
geom_linerange(aes(xmin = timesupper, xmax = onset_date_time)) +
scale_x_datetime(date_breaks = "2 hour",
labels = scales::label_date_short(format = c("%Y年", "%m月", "%d日", "%H:%M"))
)+
theme_classic()+
theme(
axis.text.y = element_blank(),
axis.ticks.y = element_blank(),
axis.title.y = element_blank()
)
## 关系点阵图
oswego1 %>%
rownames_to_column() %>%
dplyr::select(rowname, ill, bakedham:fruitsalad) %>%
filter(ill == TRUE) %>%
pivot_longer(2:last_col(),
names_to = "var",
values_to = "value") %>%
mutate(value = as.numeric(value))%>%
ggplot(aes(var, rowname, color = factor(value) ))+
scale_color_manual(values = c("white", "red", "gray"))+
geom_point()+
coord_flip()
## 相关分析
oswego1 %>%
dplyr::select(ill, bakedham:fruitsalad) %>%
cor(use = "complete.obs", method = "kendall") %>%
rstatix::cor_plot(label = TRUE,
font.label = list(
size = .5,
style = "bold",
color = "white" ),
type = "upper")
根据发病曲线可见,该疫情呈现单峰特征,大部分病例集中在6小时左右发病。此种形态符合同源一次暴露疫情特征。
所有病例均与4月18日教堂晚餐有关联。教堂晚餐可能是暴露源头所在。
病人发病时间集中在一个最长潜伏期,发病高峰在晚餐后的一个平均潜伏期内。
发病与晚餐各菜肴的关联分析初步发现,发病与部分菜肴有一定统计学关联。
经初步调查分析,该疫情系一次同源暴露产生的聚集性疫情,与1940年4月18日Oswego一教堂的晚餐有关。
以病例为病例组,以未发病的作为对照,分别以晚餐菜肴食品作为暴露因素进行病例对照研究。
各暴露因素的比值比见下表
or <- CCTable(
oswego1,
"ill",
exposure = c(
"bakedham",
"spinach",
"mashedpota",
"cabbagesal",
"jello",
"rolls",
"brownbread",
"milk",
"coffee",
"water",
"cakes",
"vanilla",
"chocolate",
"fruitsalad"
),
exact = T,
sort = "pvalue"
) ##c(pvalue, rr, ar)
ortable <- or$df %>%
janitor::clean_names() %>%
rownames_to_column() %>%
mutate(across(!rowname, as.character)) %>%
mutate(across(!rowname, as.numeric)) %>%
as_tibble()
ortable %>%
as_tibble() %>%
mutate(OR = paste0(or, "(95%CI ", ci_ll, "-", ci_ul, ")"), .after = percent_2) %>%
dplyr::select(!c(or:ci_ul, 5:7)) %>%
flextable::flextable() %>%
flextable::autofit()
rowname | tot_cases | exposed | percent | OR | p_fisher |
vanilla | 46 | 43 | 93.5 | 23.45(95%CI 5.22-138.87) | 0.000 |
chocolate | 45 | 25 | 55.6 | 0.4(95%CI 0.12-1.23) | 0.089 |
cakes | 46 | 27 | 58.7 | 1.75(95%CI 0.62-4.97) | 0.342 |
jello | 46 | 16 | 34.8 | 1.68(95%CI 0.53-5.64) | 0.442 |
water | 46 | 13 | 28.3 | 0.64(95%CI 0.22-1.96) | 0.450 |
rolls | 46 | 21 | 45.6 | 0.68(95%CI 0.24-1.92) | 0.482 |
brownbread | 46 | 18 | 39.1 | 1.43(95%CI 0.48-4.38) | 0.622 |
milk | 46 | 2 | 4.3 | 0.61(95%CI 0.04-8.99) | 0.638 |
cabbagesal | 46 | 18 | 39.1 | 1.22(95%CI 0.42-3.65) | 0.808 |
bakedham | 46 | 29 | 63.0 | 1.2(95%CI 0.42-3.45) | 0.809 |
spinach | 46 | 26 | 56.5 | 0.92(95%CI 0.32-2.59) | 1.000 |
mashedpota | 46 | 23 | 50.0 | 1(95%CI 0.35-2.84) | 1.000 |
coffee | 46 | 19 | 41.3 | 1(95%CI 0.35-2.87) | 1.000 |
fruitsalad | 46 | 4 | 8.7 | 1.29(95%CI 0.17-15.08) | 1.000 |
ortable %>%
ggplot(aes(or, fct_reorder(rowname, or), color = p_fisher < 0.05))+
geom_pointrange(aes(xmin = ci_ll, xmax = ci_ul))+
scale_x_continuous(breaks = c(seq(-100, 1000, 10),1))+
geom_vline(xintercept = 1, color = "red", show.legend = T)+
scale_color_manual(values = c("grey50", "purple"))+
theme_classic()
vanilla
食物归因分析ar <- CC(oswego1, "ill", "vanilla", exact = T, full = T)
arv <- ar$df2 %>%
rownames_to_column() %>%
mutate(across(!rowname, as.character)) %>%
mutate(across(!rowname, as.numeric))
arv %>%
as_tibble() %>%
flextable::flextable() %>%
flextable::autofit()
rowname | Point estimate | 95%CI-ll | 95%CI-ul |
Odds ratio | 23.45 | 5.22 | 138.87 |
Attr. frac. ex. | 0.96 | 0.81 | 0.99 |
Attr. frac. pop | 0.89 | ||
chi2(1) | 27.22 | ||
Pr>chi2 | 0.00 | ||
Fisher p-value | 0.00 |
由上可见本起疫情中vanilla
食物的暴露比值比为23.45(95%可信区间为5.22-138.87),即病例中该食物的暴露率是非病例组的23.45倍。该食物的归因分值为96%。
参加晚餐的人员分别以是否食用某种食物为暴露因素分为暴露组和对照组,作回顾性队列研究。
各暴露因素的相对危险度见下表
rr <-
CSTable(
oswego1,
"ill",
exposure = c(
"bakedham",
"spinach",
"mashedpota",
"cabbagesal",
"jello",
"rolls",
"brownbread",
"milk",
"coffee",
"water",
"cakes",
"vanilla",
"chocolate",
"fruitsalad"
),
full = TRUE,
exact = T
)
rrtable <- rr$df %>%
janitor::clean_names() %>%
rownames_to_column() %>%
mutate(across(!rowname, as.character)) %>%
mutate(across(!rowname, as.numeric)) %>%
as_tibble()
rrtable %>%
as_tibble() %>%
mutate(RR = paste0(rr, "(95%CI ", ci_ll, "-", ci_ul, ")"), .after = ar_percent_2) %>%
dplyr::select(!c(rr:ci_ul, 5:7)) %>%
flextable::flextable() %>%
flextable::autofit()
rowname | tot_exp | exp_cases | ar_percent | RR | p_fisher |
vanilla | 54 | 43 | 80 | 5.57(95%CI 1.94-16.03) | 0.000 |
chocolate | 47 | 25 | 53 | 0.72(95%CI 0.51-1.02) | 0.089 |
cakes | 40 | 27 | 68 | 1.24(95%CI 0.86-1.8) | 0.342 |
jello | 23 | 16 | 70 | 1.21(95%CI 0.84-1.72) | 0.442 |
water | 24 | 13 | 54 | 0.84(95%CI 0.55-1.27) | 0.450 |
rolls | 37 | 21 | 57 | 0.86(95%CI 0.6-1.24) | 0.482 |
brownbread | 27 | 18 | 67 | 1.14(95%CI 0.8-1.64) | 0.622 |
milk | 4 | 2 | 50 | 0.81(95%CI 0.3-2.19) | 0.638 |
cabbagesal | 28 | 18 | 64 | 1.08(95%CI 0.75-1.55) | 0.808 |
bakedham | 46 | 29 | 63 | 1.08(95%CI 0.74-1.57) | 0.809 |
spinach | 43 | 26 | 60 | 0.97(95%CI 0.67-1.39) | 1.000 |
mashedpota | 37 | 23 | 62 | 1(95%CI 0.7-1.43) | 1.000 |
coffee | 31 | 19 | 61 | 1(95%CI 0.69-1.44) | 1.000 |
fruitsalad | 6 | 4 | 67 | 1.1(95%CI 0.6-1.99) | 1.000 |
rrtable %>%
ggplot(aes(rr, fct_reorder(rowname, rr), color = p_fisher < 0.05))+
geom_pointrange(aes(xmin = ci_ll, xmax = ci_ul))+
scale_x_continuous(breaks = c(seq(0, 100, 5),1))+
geom_vline(xintercept = 1, color = "red", show.legend = T)+
scale_color_manual(values = c("grey50", "purple"))+
theme_classic()
vanilla
食物归因分析ar1 <- CS(oswego1, "ill", "vanilla", exact = T, full = T)
arv1 <- ar1$df2 %>%
rownames_to_column() %>%
mutate(across(!rowname, as.character)) %>%
mutate(across(!rowname, as.double))
arv1 %>%
as_tibble() %>%
flextable::flextable() %>%
flextable::autofit()
rowname | Point estimate | 95%CI ll | 95%CI ul |
Risk difference | 0.65 | 0.47 | 0.84 |
Risk ratio | 5.57 | 1.94 | 16.03 |
Attr. frac. ex. | 0.82 | 0.48 | 0.94 |
Attr. frac. pop | 0.77 | ||
chi2(1) | 27.22 | ||
Pr>chi2 | 0.00 | ||
Fisher p.value | 0.00 |
由上可见本起疫情中vanilla
食物的相对危险度为5.57(95%可信区间为1.94-16.03),即暴露该食物者的发病率是未暴露者的5.57倍。该食物的归因分值为82%。
ageg <- os %>%
dplyr::mutate(across(where(is.logical), as.numeric)) %>%
dplyr::mutate(agegroup = cut(age,
breaks = c(seq(0, 80, 10), Inf),
labels = paste0(seq(0, 80, 10), "岁~"),
right = F)) %>%
dplyr::mutate(sex = ifelse(sex == "F", 0, 1))
logit_fit <- glm(ill ~ sex + agegroup + bakedham +
spinach + mashedpota + cabbagesal +
jello + rolls + brownbread + milk +
coffee + water + cakes + vanilla +
chocolate + fruitsalad,
family = binomial(),
data = ageg)
rr <- cbind(coef(logit_fit), confint.default(logit_fit)) %>%
#计算RR值及其可信区间
exp() %>%
#提取并合并概率值
cbind(broom::tidy(logit_fit)$p.value) %>%
as.data.frame() %>%
rownames_to_column() %>%
dplyr::mutate(across(where(is.double), ~ round(.x, 2)))
colnames(rr) <- c('items', 'RR', 'lowerCI', 'upperCI', 'p.value')
rr %>%
flextable::flextable() %>%
flextable::color(i = ~ p.value < 0.05, color = "red") %>%
flextable::autofit()
items | RR | lowerCI | upperCI | p.value |
(Intercept) | 7.2e-01 | 0.02 | 31.2 | 0.87 |
sex | 2.5e-01 | 0.04 | 1.4 | 0.12 |
agegroup10岁~ | 2.3e-01 | 0.01 | 4.4 | 0.33 |
agegroup20岁~ | 7.0e-02 | 0.00 | 3.5 | 0.18 |
agegroup30岁~ | 3.8e-01 | 0.01 | 19.1 | 0.63 |
agegroup40岁~ | 7.3e+00 | 0.06 | 846.0 | 0.41 |
agegroup50岁~ | 2.4e+00 | 0.02 | 262.9 | 0.72 |
agegroup60岁~ | 1.4e-01 | 0.00 | 6.0 | 0.31 |
agegroup70岁~ | 7.3e+06 | 0.00 | Inf | 0.99 |
bakedham | 5.5e+00 | 0.09 | 333.1 | 0.41 |
spinach | 2.2e-01 | 0.01 | 7.5 | 0.40 |
mashedpota | 8.4e-01 | 0.09 | 8.3 | 0.88 |
cabbagesal | 1.5e+00 | 0.11 | 20.1 | 0.77 |
jello | 5.6e-01 | 0.04 | 8.6 | 0.68 |
rolls | 4.5e-01 | 0.01 | 17.6 | 0.67 |
brownbread | 2.3e+00 | 0.21 | 23.9 | 0.50 |
milk | 1.1e-01 | 0.00 | 44.5 | 0.47 |
coffee | 3.8e-01 | 0.01 | 9.9 | 0.56 |
water | 1.8e+00 | 0.15 | 21.3 | 0.64 |
cakes | 1.9e+00 | 0.25 | 14.6 | 0.53 |
vanilla | 5.0e+01 | 6.31 | 402.7 | 0.00 |
chocolate | 5.2e-01 | 0.08 | 3.5 | 0.50 |
fruitsalad | 4.0e-01 | 0.01 | 17.6 | 0.63 |
rr %>%
filter(!items %in% c("agegroup40岁~", "agegroup50岁~", "agegroup60岁~", "agegroup70岁~")) %>%
ggplot(aes(RR, items, color = p.value < 0.05))+
geom_pointrange(aes(xmin = lowerCI, xmax = upperCI))+
geom_vline(xintercept = 1L, lty = 5)+
scale_x_continuous(breaks = 1)+
scale_color_manual(values = c("grey50", "purple"))+
theme_classic()
综合上述分析,可以认定该疫情的风险因素为4月18日教堂晚餐供应的vanilla食物。
根据病例发病特征、现场流行病学调查和分析流行病学研究结果,可确定此次疫情为一起教堂晚餐中供应的vanilla食物引起的胃肠道聚集性疫情。