Installing the required packages if not already installed:
> required <- c("dplyr", "magrittr", "purrr", "rlang", "stringr")
> to_install <- setdiff(required, row.names(installed.packages()))
> if (length(to_install)) install.packages(to_install)
Loading magrittr:
> library(magrittr)
> dates_origin_excel <- "1899-12-30"
> format_dates <- function(df, x, y) {
+ df %>%
+ split(factor(is.character(.[[x]]), levels = c(TRUE, FALSE))) %>%
+ purrr::map2(list(function(z) dplyr::mutate(z, !!y := dplyr::if_else(grepl("/", !!rlang::sym(x)),
+ as.Date(!!rlang::sym(x), "%d/%m/%Y"),
+ as.Date(as.numeric(!!rlang::sym(x)), origin = dates_origin_excel))),
+ function(z) dplyr::mutate(z, !!y := as.Date(!!rlang::sym(x)))), function(z, f) f(z)) %>%
+ dplyr::bind_rows()
+ }
> pertussis <- readxl::read_excel("Pertussi 300519.xlsx") %>%
+ format_dates("Date of taking sample", "sample") %>%
+ format_dates("Date of admission", "admission") %>%
+ format_dates("Date of out hospital", "discharge") %>%
+ format_dates("Date of onset", "onset") %>%
+ format_dates("Date of bird", "dob") %>%
+ dplyr::mutate(inpatient = `Inpatient/ outpatient` == "inpatient",
+ sex = dplyr::recode(Sex, M = "male", F = "female"),
+ nb_vaccine = dplyr::na_if(`number of vacxin`, "No information"),
+ ventilation = dplyr::na_if(vatenlation, "No information") == "Yes",
+ ecmo = dplyr::na_if(ecmo, "No information") == "Yes",
+ rrt = dplyr::na_if(`Lọc máu`, "No information") == "Yes",
+ oxygen = dplyr::na_if(oxy, "No information") == "Yes",
+ other_hosp = dplyr::na_if(`already stayed in satellite hospital`, "No information") == "Yes",
+ outcome = dplyr::recode(dplyr::na_if(`state of discharge`, "18-27/12"),
+ `out hospital` = "home",
+ `tranfer other hospital` = "other_hospital",
+ `orther department` = "other_department"),
+ source_nhp = as.character(`Soure of infection`) == "1") %>%
+ dplyr::rename(id = ID,
+ province = Province) %>%
+ dplyr::select(id, dob, sex, nb_vaccine, inpatient, province, other_hosp, onset,
+ admission, sample, discharge, outcome, ventilation, oxygen, source_nhp, ecmo, rrt) %>%
+ dplyr::mutate_at(c("id", "nb_vaccine"), as.integer)
A provinces names dictionary:
> dictionary <- matrix(c(
+ "ba ninh" , "Bac Ninh",
+ "bacninh" , "Bac Ninh",
+ "bac can" , "Bac Kan",
+ "hung yeb" , "Hung Yen",
+ "huong yen" , "Hung Yen",
+ "quanh ninh", "Quang Ninh",
+ "Thị Nội" , "Ha Noi",
+ "vĩnh phúc" , "Vinh Phuc",
+ "vĩnh yen" , "Vinh Yen",
+ "van co" , "Phu Tho",
+ "Xã Nguyên" , "Thai Nguyen",
+ "Xã Ninh" , "Bac Ninh"), ncol = 2, byrow = TRUE)
Hash table:
> hash <- rbind(dictionary, matrix(rep(setdiff(pertussis$province, dictionary[, 1]), 2), ncol = 2)) %>%
+ as.data.frame(stringsAsFactors = FALSE) %>%
+ setNames(c("from", "to")) %$%
+ setNames(to, from) %>%
+ na.exclude()
Cleaning provinces’ names:
> pertussis %<>%
+ dplyr::mutate(province = stringr::str_to_title(stringr::str_squish(stringr::str_trim(hash[province])))) %>%
+ unique()
Removing duplicated IDs:
> for (dupl in as.integer(names(which(table(pertussis$id) > 1)))) {
+ del <- which(pertussis$id == dupl)
+ pertussis <- pertussis[-del[2:length(del)], ]
+ }
Provinces’ names have mutliple spelling. Will likely be the same for the districts and communes. We fix provinces names and delete district and communes names.
I delete the dionogis of admission since I’m not able to understand it.
rrt: renal replacement therapy. ecmo: extracorporeal membrane oxygenation. Used in more severe cases than rrt.
> pertussis[pertussis$id == 15038222, "sample"] <- as.Date("2015-02-10")
> pertussis[pertussis$id == 160295471, "sample"] <- as.Date("2016-06-17")
> pertussis[pertussis$id == 160285356, "sample"] <- as.Date("2016-06-09")
> pertussis[pertussis$id == 160042229, "sample"] <- as.Date("2016-02-24")
> pertussis[pertussis$id == 189656313, "discharge"] <- as.Date("2018-10-12")
> pertussis[pertussis$id == 180570796, "discharge"] <- as.Date("2018-10-19")
> pertussis[pertussis$id == 180503731, "discharge"] <- as.Date("2018-10-22")
> pertussis[pertussis$id == 180402549, "discharge"] <- as.Date("2018-09-11")
> pertussis[pertussis$id == 160295471, "discharge"] <- as.Date("2016-06-22")
> pertussis[pertussis$id == 160295471, "admission"] <- as.Date("2016-06-17")
> pertussis[pertussis$id == 160050235, "admission"] <- as.Date("2016-02-21")
Checking that there is only one line per ID:
> unique(names(table(table(pertussis$id)))) == "1"
[1] TRUE
The date variables are
> (date_var <- names(which(sapply(pertussis, class) == "Date")))
[1] "dob" "onset" "admission" "sample" "discharge"
Checking the ranges of dates:
> for(i in date_var) print(range(pertussis[[i]], na.rm = TRUE))
[1] "1963-01-01" "2018-11-27"
[1] "2014-12-09" "2018-06-17"
[1] "2014-12-30" "2019-01-02"
[1] "2014-12-31" "2018-12-28"
[1] "2015-01-12" "2019-01-12"
Checking the sexes:
> all(na.exclude(pertussis$sex) %in% c("male", "female"))
[1] TRUE
Checking the number of vaccine shots:
> all(na.exclude(pertussis$nb_vaccine) %in% 0:4)
[1] TRUE
The provinces names:
> sort(unique(pertussis$province))
[1] "Bac Giang" "Bac Kan" "Bac Ninh" "Cao Bang" "Dak Lak" "Dien Bien" "Dong Nai" "Gia Lai" "Ha Giang"
[10] "Ha Nam" "Ha Noi" "Ha Tinh" "Hai Duong" "Hai Phong" "Ho Chi Minh" "Hoa Binh" "Hue" "Hung Yen"
[19] "Lai Chau" "Lang Son" "Lao Cai" "Nam Dinh" "Nghe An" "Ninh Binh" "Phu Tho" "Quang Binh" "Quang Ngai"
[28] "Quang Ninh" "Quang Tri" "Soc Trang" "Son La" "Thai Binh" "Thai Nguyen" "Thanh Hoa" "Tuyen Quang" "Vinh Phuc"
[37] "Vinh Yen" "Yen Bai"
Inconsistencies in the dates (if TRUE means there is no inconsistency):
> pertussis %>%
+ dplyr::filter(discharge < dob | discharge < onset | discharge < admission | discharge < sample |
+ dob > onset | dob > admission | dob > sample | dob > discharge) %>%
+ nrow() %>%
+ `<`(1)
[1] TRUE
Checking time ranges:
> date1 <- as.Date("2014-12-31") - 30
> date2 <- as.Date("2018-12-31") + 30
> pertussis %>%
+ dplyr::filter(onset < date1 | onset > date2 |
+ admission < date1 | admission > date2 |
+ sample < date1 | sample > date2 |
+ discharge < date1 | discharge > date2) %>%
+ nrow() %>%
+ `<`(1)
[1] TRUE
Time differences histogram:
> pertussis %$%
+ c(onset - admission,
+ onset - sample,
+ onset - discharge,
+ admission - sample,
+ admission - discharge,
+ sample - discharge) %>%
+ na.exclude() %>%
+ abs() %>%
+ as.integer() %>%
+ hist(0:4000, xlim = c(0, 100), col = "grey", main = NA,
+ xlab = "time difference (in days)",
+ ylab = "number of time differences")
Time differences above 3 months:
> time_diff <- 93
> pertussis %>%
+ dplyr::filter(abs(onset - admission) > time_diff |
+ abs(onset - sample) > time_diff |
+ abs(onset - discharge) > time_diff |
+ abs(admission - sample) > time_diff |
+ abs(admission - discharge) > time_diff |
+ abs(sample - discharge) > time_diff) %>%
+ dplyr::select(id, onset, admission, sample, discharge)
# A tibble: 3 x 5
id onset admission sample discharge
<int> <date> <date> <date> <date>
1 180174911 2018-03-15 2018-04-05 2018-04-05 2018-07-10
2 180234710 NA 2018-08-20 2018-11-28 2018-12-06
3 180377754 NA 2018-11-03 2018-08-02 2018-11-08
Checking outcomes:
> all(na.exclude(pertussis$outcome) %in% c("home", "death", "other_hospital", "other_department"))
[1] TRUE
Number of missing values for each of the date variables:
> pertussis %>%
+ dplyr::select(onset:discharge) %>%
+ sapply(function(x) sum(is.na(x)))
onset admission sample discharge
2212 1901 6 1916
How many cases with no dates at all:
> pertussis %>%
+ dplyr::filter(is.na(onset) & is.na(admission) & is.na(sample) & is.na(discharge)) %>%
+ nrow()
[1] 1
> pertussis %>%
+ dplyr::select(onset, admission) %>%
+ na.exclude() %>%
+ dplyr::mutate(onset2admission = as.integer(admission - onset)) %$%
+ hist(onset2admission, col = "grey",
+ xlab = "time of admission after onset (in days)",
+ ylab = "number of cases")
> pertussis %>%
+ dplyr::select(onset, sample) %>%
+ na.exclude() %>%
+ dplyr::mutate(onset2sample = as.integer(sample - onset)) %$%
+ hist(onset2sample, col = "grey",
+ xlab = "time of sample after onset (in days)",
+ ylab = "number of cases")
> pertussis %>%
+ dplyr::select(onset, discharge) %>%
+ na.exclude() %>%
+ dplyr::mutate(onset2discharge = as.integer(discharge - onset)) %$%
+ hist(onset2discharge, col = "grey",
+ xlab = "time of discharge after onset (in days)",
+ ylab = "number of cases")
> pertussis %>%
+ dplyr::select(admission, sample) %>%
+ na.exclude() %>%
+ dplyr::mutate(admission2sample = as.integer(sample - admission)) %$%
+ hist(admission2sample, col = "grey",
+ xlab = "time of sample after admission (in days)",
+ ylab = "number of cases")
> pertussis %>%
+ dplyr::select(admission, discharge) %>%
+ na.exclude() %>%
+ dplyr::mutate(admission2discharge = as.integer(discharge - admission)) %$%
+ hist(admission2discharge, col = "grey",
+ xlab = "time of discharge after admission (in days)",
+ ylab = "number of cases")
Mean number of days between onset and sample:
> onset2sample <- pertussis %>%
+ dplyr::select(onset, sample) %>%
+ na.exclude() %>%
+ dplyr::mutate(onset2sample = as.integer(sample - onset)) %$%
+ mean(onset2sample) %>%
+ round()
Mean number of days between onset and admission:
> onset2admission <- pertussis %>%
+ dplyr::select(onset, admission) %>%
+ na.exclude() %>%
+ dplyr::mutate(onset2admission = as.integer(admission - onset)) %$%
+ mean(onset2admission) %>%
+ round()
Generating composite onset variable onset2:
> pertussis2 <- pertussis %>%
+ dplyr::filter(!(is.na(onset) & is.na(admission) & is.na(sample) & is.na(discharge))) %>%
+ dplyr::mutate(onset2 = as.Date(ifelse(is.na(onset),
+ ifelse(is.na(sample),
+ admission - onset2admission,
+ sample - onset2sample),
+ onset), origin = "1970-01-01"),
+ year = lubridate::year(onset2),
+ month = lubridate::month(onset2),
+ week = lubridate::epiweek(onset2)) %>%
+ dplyr::filter(year > 2014) %>%
+ dplyr::mutate(age = as.numeric((onset2 - dob) / 365),
+ age = ifelse(age < 0, 0, age),
+ baby = age <= 1)
> pertussis2 %>%
+ dplyr::group_by(year) %>%
+ dplyr::tally() %>%
+ dplyr::ungroup() %>%
+ dplyr::arrange(year)
# A tibble: 4 x 2
year n
<dbl> <int>
1 2015 545
2 2016 566
3 2017 1010
4 2018 963
Very seasonal: by month:
> pertussis2 %>%
+ dplyr::group_by(year, month) %>%
+ dplyr::tally() %>%
+ dplyr::ungroup() %>%
+ dplyr::arrange(year, month) %$%
+ barplot(n, col = year, ylab = "monthly incidence")
By week:
> pertussis2 %>%
+ dplyr::group_by(year, week) %>%
+ dplyr::tally() %>%
+ dplyr::ungroup() %>%
+ dplyr::arrange(year, week) %$%
+ barplot(n, col = year, border = year, ylab = "weekly incidence")
> with(pertussis2, table(inpatient, year))
year
inpatient 2015 2016 2017 2018
FALSE 241 321 659 679
TRUE 304 245 351 284
> with(pertussis2, fisher.test(table(inpatient, year), simulate.p.value = TRUE))
Fisher's Exact Test for Count Data with simulated p-value (based on 2000 replicates)
data: table(inpatient, year)
p-value = 0.0004998
alternative hypothesis: two.sided
> with(pertussis2, chisq.test(table(inpatient, year)))
Pearson's Chi-squared test
data: table(inpatient, year)
X-squared = 113.31, df = 3, p-value < 2.2e-16
On a by-year comparison, the proportion of inpatient decreases as the number of cases increases:
> pertussis2 %>%
+ dplyr::group_by(year) %>%
+ dplyr::summarise(n = dplyr::n(), inpatient = sum(inpatient)) %>%
+ dplyr::ungroup() %$%
+ plot(n, inpatient / n, ylim = 0:1, pch = 19, col = "red",
+ xlab = "number of patients", ylab = "proportion of inpatients")
Binomial model:
> pertussis2 %>%
+ dplyr::group_by(year) %>%
+ dplyr::tally() %>%
+ dplyr::ungroup() %>%
+ dplyr::right_join(pertussis2, "year") %$%
+ glm(inpatient ~ n, binomial, .) %>%
+ summary()
Call:
glm(formula = inpatient ~ n, family = binomial, data = .)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.1716 -0.8976 -0.8693 1.1979 1.5206
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 0.8826439 0.1501538 5.878 4.15e-09 ***
n -0.0016445 0.0001777 -9.252 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 4107.6 on 3083 degrees of freedom
Residual deviance: 4021.5 on 3082 degrees of freedom
AIC: 4025.5
Number of Fisher Scoring iterations: 4
On a by-month comparison, the proportion of inpatient decreases as the number of cases increases:
> pertussis2 %>%
+ dplyr::group_by(year, month) %>%
+ dplyr::summarise(n = dplyr::n(), inpatient = sum(inpatient)) %>%
+ dplyr::ungroup() %$%
+ plot(n, inpatient / n, ylim = 0:1, pch = 19, col = "red",
+ xlab = "number of patients", ylab = "proportion of inpatients")
Binomial model:
> pertussis2 %>%
+ dplyr::group_by(year, month) %>%
+ dplyr::tally() %>%
+ dplyr::ungroup() %>%
+ dplyr::right_join(pertussis2, c("year", "month")) %$%
+ glm(inpatient ~ n, binomial, .) %>%
+ summary()
Call:
glm(formula = inpatient ~ n, family = binomial, data = .)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.2310 -0.9897 -0.8968 1.2966 1.6126
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 0.195013 0.106519 1.831 0.0671 .
n -0.008719 0.001317 -6.621 3.57e-11 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 4107.6 on 3083 degrees of freedom
Residual deviance: 4062.9 on 3082 degrees of freedom
AIC: 4066.9
Number of Fisher Scoring iterations: 4
Comparing the time series of inpatients and outpatients:
> pertussis2 %>%
+ dplyr::filter(inpatient) %>%
+ dplyr::group_by(year) %>%
+ dplyr::tally() %>%
+ dplyr::ungroup()
# A tibble: 4 x 2
year n
<dbl> <int>
1 2015 304
2 2016 245
3 2017 351
4 2018 284
Both inpatients and outpatients are seasonal but outpatients have an increasing trends that inpatients do not have:
> pertussis2 %>%
+ dplyr::filter(inpatient) %>%
+ dplyr::group_by(year, month) %>%
+ dplyr::tally() %>%
+ dplyr::ungroup() %>%
+ dplyr::arrange(year, month) %$%
+ barplot(n, col = year, ylab = "monthly incidence")
> pertussis2 %>%
+ dplyr::filter(!inpatient) %>%
+ dplyr::group_by(year, month) %>%
+ dplyr::tally() %>%
+ dplyr::ungroup() %>%
+ dplyr::arrange(year, month) %$%
+ barplot(n, col = year, ylab = "monthly incidence")
There are significantly more males than females
> pertussis %$%
+ table(sex) %>%
+ as.matrix() %>%
+ t() %>%
+ as.data.frame() %$%
+ prop.test(male, male + female)
1-sample proportions test with continuity correction
data: male out of male + female, null probability 0.5
X-squared = 7.75, df = 1, p-value = 0.005371
alternative hypothesis: true p is not equal to 0.5
95 percent confidence interval:
0.5074009 0.5428588
sample estimates:
p
0.5251613
The proportion of males is not different between the inpatients and outpatients:
> pertussis %$%
+ table(sex, inpatient) %>%
+ fisher.test()
Fisher's Exact Test for Count Data
data: .
p-value = 0.4601
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
0.8168011 1.0971472
sample estimates:
odds ratio
0.9466715
> pertussis %$%
+ table(province) %>%
+ sort(TRUE) %T>% print() %>%
+ barplot()
province
Ha Noi Hung Yen Nam Dinh Bac Ninh Phu Tho Hai Duong Nghe An Ha Nam Vinh Phuc Bac Giang Hai Phong
1053 190 176 170 159 158 123 116 107 91 90
Thai Nguyen Hoa Binh Thanh Hoa Ninh Binh Thai Binh Yen Bai Son La Quang Ninh Tuyen Quang Lao Cai Ha Tinh
74 64 63 59 51 51 50 44 41 39 38
Lang Son Cao Bang Ha Giang Bac Kan Dien Bien Lai Chau Ho Chi Minh Hue Vinh Yen Dak Lak Dong Nai
33 12 11 10 7 5 3 2 2 1 1
Gia Lai Quang Binh Quang Ngai Quang Tri Soc Trang
1 1 1 1 1
As many inpatients from Hanoi than from outside Hanoi:
> pertussis2 %>%
+ dplyr::mutate(hanoi = province == "Ha Noi") %$%
+ table(inpatient, hanoi) %T>% print() %>%
+ fisher.test()
hanoi
inpatient FALSE TRUE
FALSE 1264 635
TRUE 769 415
Fisher's Exact Test for Count Data
data: .
p-value = 0.3689
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
0.9190995 1.2550247
sample estimates:
odds ratio
1.074199
Age is missing for only 8 patients:
> sum(is.na(pertussis$dob))
[1] 8
> range(pertussis2$age, na.rm = TRUE)
[1] 0.00000 54.21918
> hist(pertussis2$age, 0:ceiling(max(pertussis2$age, na.rm = TRUE)),
+ col = "grey", main = NA, xlab = "age (years)", ylab = "number of cases")
The proportion of babies (age < 1 year) decreases as the number of patients increases, on a yearly basis:
> pertussis2 %>%
+ dplyr::group_by(year) %>%
+ dplyr::summarise(n = dplyr::n(), nbabies = sum(baby, na.rm = TRUE)) %>%
+ dplyr::ungroup() %$%
+ plot(n, nbabies / n, ylim = 0:1, pch = 19, col = "red",
+ xlab = "number of patients", ylab = "proportion of babies")
Binomial model:
> pertussis2 %>%
+ dplyr::group_by(year) %>%
+ dplyr::tally() %>%
+ dplyr::ungroup() %>%
+ dplyr::right_join(pertussis2, "year") %$%
+ glm(baby ~ n, binomial, .) %>%
+ summary()
Call:
glm(formula = baby ~ n, family = binomial, data = .)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.6281 -1.2016 0.7859 1.1534 1.1992
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 2.2681115 0.1647948 13.76 <2e-16 ***
n -0.0022964 0.0001877 -12.23 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 4180.0 on 3075 degrees of freedom
Residual deviance: 4021.3 on 3074 degrees of freedom
(8 observations deleted due to missingness)
AIC: 4025.3
Number of Fisher Scoring iterations: 4
Same on a monthly basis:
> pertussis2 %>%
+ dplyr::group_by(year, month) %>%
+ dplyr::summarise(n = dplyr::n(), nbabies = sum(baby, na.rm = TRUE)) %>%
+ dplyr::ungroup() %$%
+ plot(n, nbabies / n, ylim = 0:1, pch = 19, col = "red",
+ xlab = "number of patients", ylab = "proportion of babies")
Binomial model:
> pertussis2 %>%
+ dplyr::group_by(year, month) %>%
+ dplyr::tally() %>%
+ dplyr::ungroup() %>%
+ dplyr::right_join(pertussis2, c("year", "month")) %$%
+ glm(baby ~ n, binomial, .) %>%
+ summary()
Call:
glm(formula = baby ~ n, family = binomial, data = .)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.5147 -1.2838 0.9438 1.0425 1.2104
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 0.894703 0.107422 8.329 < 2e-16 ***
n -0.007200 0.001287 -5.594 2.22e-08 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 4180.0 on 3075 degrees of freedom
Residual deviance: 4148.3 on 3074 degrees of freedom
(8 observations deleted due to missingness)
AIC: 4152.3
Number of Fisher Scoring iterations: 4
Sex ratio in the same among babies and non-babies:
> with(pertussis2, fisher.test(table(sex, baby)))
Fisher's Exact Test for Count Data
data: table(sex, baby)
p-value = 0.3052
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
0.9334485 1.2503841
sample estimates:
odds ratio
1.080354
Many more babies among inpatients than outpatients:
> with(pertussis2, fisher.test(table(inpatient, baby)))
Fisher's Exact Test for Count Data
data: table(inpatient, baby)
p-value < 2.2e-16
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
5.923130 8.558158
sample estimates:
odds ratio
7.107206
10 non-children patients:
> pertussis2 %>%
+ dplyr::filter(age > 16) %>%
+ dplyr::select(id, sex, year, age)
# A tibble: 10 x 4
id sex year age
<int> <chr> <dbl> <dbl>
1 176548872 female 2017 24.0
2 186413232 female 2018 25.2
3 185655665 female 2018 26.0
4 182135465 female 2018 28.8
5 186577879 female 2018 30.7
6 173244545 female 2017 32.3
7 186565987 male 2018 33.1
8 179879874 male 2017 33.5
9 186565486 female 2018 33.8
10 170047563 female 2017 54.2
The proxies of severity:
> pertussis2 %>%
+ dplyr::select(oxygen, ecmo, rrt, ventilation) %>%
+ lapply(table)
$oxygen
FALSE TRUE
999 186
$ecmo
FALSE TRUE
883 6
$rrt
FALSE TRUE
876 12
$ventilation
FALSE TRUE
1001 184
The level of severity is higher among patients from outside Hanoi than from patients from Hanoi:
> pertussis2 %>%
+ dplyr::filter(inpatient) %>%
+ dplyr::mutate(hanoi = province == "Ha Noi",
+ severe = oxygen) %$%
+ table(severe, hanoi) %T>% print() %>%
+ fisher.test()
hanoi
severe FALSE TRUE
FALSE 614 384
TRUE 155 31
Fisher's Exact Test for Count Data
data: .
p-value = 2.97e-09
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
0.2058894 0.4844087
sample estimates:
odds ratio
0.3200783
However we compute the severity variable:
> pertussis2 %>%
+ dplyr::filter(inpatient) %>%
+ dplyr::mutate(hanoi = province == "Ha Noi",
+ severe = oxygen | ecmo | ventilation | rrt) %$%
+ table(severe, hanoi) %T>% print() %>%
+ fisher.test()
hanoi
severe FALSE TRUE
FALSE 372 264
TRUE 272 72
Fisher's Exact Test for Count Data
data: .
p-value = 4.368e-11
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
0.2712074 0.5099506
sample estimates:
odds ratio
0.3733574
Seasonality on severity:
> pertussis2 %>%
+ dplyr::filter(oxygen) %>%
+ dplyr::group_by(year, month) %>%
+ dplyr::tally() %>%
+ dplyr::ungroup() %>%
+ dplyr::arrange(year, month) %$%
+ barplot(n, col = year, ylab = "monthly number of severe cases")
The proportion of severe cases tends to decrease when the incidence increases:
> pertussis2 %>%
+ dplyr::group_by(year, month) %>%
+ dplyr::tally() %>%
+ dplyr::ungroup() %>%
+ dplyr::right_join(dplyr::mutate(pertussis2, severe = oxygen | ecmo | ventilation | rrt),
+ c("year", "month")) %$%
+ glm(severe ~ n, binomial, .) %>%
+ summary()
Call:
glm(formula = severe ~ n, family = binomial, data = .)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.1006 -0.9610 -0.8589 1.3769 1.6450
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -0.128463 0.175497 -0.732 0.46417
n -0.006856 0.002310 -2.968 0.00299 **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 1271.1 on 980 degrees of freedom
Residual deviance: 1262.1 on 979 degrees of freedom
(2103 observations deleted due to missingness)
AIC: 1266.1
Number of Fisher Scoring iterations: 4
However, this is not the case anymore when we consider oxygen use only as a mesure of severity:
> pertussis2 %>%
+ dplyr::group_by(year, month) %>%
+ dplyr::tally() %>%
+ dplyr::ungroup() %>%
+ dplyr::right_join(pertussis2, c("year", "month")) %$%
+ glm(oxygen ~ n, binomial, .) %>%
+ summary()
Call:
glm(formula = oxygen ~ n, family = binomial, data = .)
Deviance Residuals:
Min 1Q Median 3Q Max
-0.5855 -0.5848 -0.5843 -0.5837 1.9261
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -1.677e+00 2.186e-01 -7.668 1.75e-14 ***
n -6.155e-05 2.785e-03 -0.022 0.982
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 1030 on 1184 degrees of freedom
Residual deviance: 1030 on 1183 degrees of freedom
(1899 observations deleted due to missingness)
AIC: 1034
Number of Fisher Scoring iterations: 3
There are more severe cases among babies than non babies:
> pertussis2 %$%
+ table(baby, oxygen) %T>% print() %>%
+ fisher.test()
oxygen
baby FALSE TRUE
FALSE 180 11
TRUE 819 175
Fisher's Exact Test for Count Data
data: .
p-value = 1.068e-05
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
1.853508 7.280370
sample estimates:
odds ratio
3.494158
Severity does not depends on gender:
> pertussis2 %$%
+ table(sex, oxygen) %T>% print() %>%
+ fisher.test()
oxygen
sex FALSE TRUE
female 478 95
male 521 91
Fisher's Exact Test for Count Data
data: .
p-value = 0.4254
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
0.6344169 1.2171247
sample estimates:
odds ratio
0.8789333