Summary

Packages

Installing the required packages if not already installed:

> required <- c("dplyr", "magrittr", "purrr", "rlang", "stringr")
> to_install <- setdiff(required, row.names(installed.packages()))
> if (length(to_install)) install.packages(to_install)

Loading magrittr:

> library(magrittr)

Utilitary functions and general parameters

> dates_origin_excel <- "1899-12-30"
> format_dates <- function(df, x, y) {
+   df %>% 
+     split(factor(is.character(.[[x]]), levels = c(TRUE, FALSE))) %>% 
+     purrr::map2(list(function(z) dplyr::mutate(z, !!y := dplyr::if_else(grepl("/", !!rlang::sym(x)),
+                                                                         as.Date(!!rlang::sym(x), "%d/%m/%Y"),
+                                                                         as.Date(as.numeric(!!rlang::sym(x)), origin = dates_origin_excel))),
+                      function(z) dplyr::mutate(z, !!y := as.Date(!!rlang::sym(x)))), function(z, f) f(z)) %>% 
+     dplyr::bind_rows()
+ }

Reading and cleaning the data

> pertussis <- readxl::read_excel("Pertussi 300519.xlsx") %>% 
+   format_dates("Date of taking sample", "sample") %>% 
+   format_dates("Date of admission", "admission") %>% 
+   format_dates("Date of out hospital", "discharge") %>% 
+   format_dates("Date of onset", "onset") %>% 
+   format_dates("Date of bird", "dob") %>% 
+   dplyr::mutate(inpatient   = `Inpatient/ outpatient` == "inpatient",
+                 sex         = dplyr::recode(Sex, M = "male", F = "female"),
+                 nb_vaccine  = dplyr::na_if(`number of vacxin`, "No information"),
+                 ventilation = dplyr::na_if(vatenlation, "No information") == "Yes",
+                 ecmo        = dplyr::na_if(ecmo, "No information") == "Yes",
+                 rrt         = dplyr::na_if(`Lọc máu`, "No information") == "Yes",
+                 oxygen      = dplyr::na_if(oxy, "No information") == "Yes",
+                 other_hosp  = dplyr::na_if(`already stayed in satellite hospital`, "No information") == "Yes",
+                 outcome     = dplyr::recode(dplyr::na_if(`state of discharge`, "18-27/12"),
+                                             `out hospital`           = "home",
+                                             `tranfer other hospital` = "other_hospital",
+                                             `orther department`      = "other_department"),
+                 source_nhp  = as.character(`Soure of infection`) == "1") %>%
+   dplyr::rename(id          = ID,
+                 province    = Province) %>% 
+   dplyr::select(id, dob, sex, nb_vaccine, inpatient, province, other_hosp, onset,
+                 admission, sample, discharge, outcome, ventilation, oxygen, source_nhp, ecmo, rrt) %>% 
+   dplyr::mutate_at(c("id", "nb_vaccine"), as.integer)

A provinces names dictionary:

> dictionary <- matrix(c(
+   "ba ninh"   , "Bac Ninh",
+   "bacninh"   , "Bac Ninh",
+   "bac can"   , "Bac Kan",
+   "hung yeb"  , "Hung Yen",
+   "huong yen" , "Hung Yen",
+   "quanh ninh", "Quang Ninh",
+   "Thị Nội"   , "Ha Noi",
+   "vĩnh phúc" , "Vinh Phuc",
+   "vĩnh yen"  , "Vinh Yen",
+   "van co"    , "Phu Tho",
+   "Xã Nguyên" , "Thai Nguyen",
+   "Xã Ninh"   , "Bac Ninh"), ncol = 2, byrow = TRUE)

Hash table:

> hash <- rbind(dictionary, matrix(rep(setdiff(pertussis$province, dictionary[, 1]), 2), ncol = 2)) %>% 
+   as.data.frame(stringsAsFactors = FALSE) %>% 
+   setNames(c("from", "to")) %$%
+   setNames(to, from) %>%
+   na.exclude()

Cleaning provinces’ names:

> pertussis %<>% 
+   dplyr::mutate(province = stringr::str_to_title(stringr::str_squish(stringr::str_trim(hash[province])))) %>% 
+   unique()

Removing duplicated IDs:

> for (dupl in as.integer(names(which(table(pertussis$id) > 1)))) {
+   del <- which(pertussis$id == dupl)
+   pertussis <- pertussis[-del[2:length(del)], ]
+ }

Provinces’ names have mutliple spelling. Will likely be the same for the districts and communes. We fix provinces names and delete district and communes names.

I delete the dionogis of admission since I’m not able to understand it.

rrt: renal replacement therapy. ecmo: extracorporeal membrane oxygenation. Used in more severe cases than rrt.

Manual fixes

> pertussis[pertussis$id == 15038222,  "sample"]    <- as.Date("2015-02-10")
> pertussis[pertussis$id == 160295471, "sample"]    <- as.Date("2016-06-17")
> pertussis[pertussis$id == 160285356, "sample"]    <- as.Date("2016-06-09")
> pertussis[pertussis$id == 160042229, "sample"]    <- as.Date("2016-02-24")
> pertussis[pertussis$id == 189656313, "discharge"] <- as.Date("2018-10-12")
> pertussis[pertussis$id == 180570796, "discharge"] <- as.Date("2018-10-19")
> pertussis[pertussis$id == 180503731, "discharge"] <- as.Date("2018-10-22")
> pertussis[pertussis$id == 180402549, "discharge"] <- as.Date("2018-09-11")
> pertussis[pertussis$id == 160295471, "discharge"] <- as.Date("2016-06-22")
> pertussis[pertussis$id == 160295471, "admission"] <- as.Date("2016-06-17")
> pertussis[pertussis$id == 160050235, "admission"] <- as.Date("2016-02-21")

Checking the data

Checking that there is only one line per ID:

> unique(names(table(table(pertussis$id)))) == "1"
[1] TRUE

The date variables are

> (date_var <- names(which(sapply(pertussis, class) == "Date")))
[1] "dob"       "onset"     "admission" "sample"    "discharge"

Checking the ranges of dates:

> for(i in date_var) print(range(pertussis[[i]], na.rm = TRUE))
[1] "1963-01-01" "2018-11-27"
[1] "2014-12-09" "2018-06-17"
[1] "2014-12-30" "2019-01-02"
[1] "2014-12-31" "2018-12-28"
[1] "2015-01-12" "2019-01-12"

Checking the sexes:

> all(na.exclude(pertussis$sex) %in% c("male", "female"))
[1] TRUE

Checking the number of vaccine shots:

> all(na.exclude(pertussis$nb_vaccine) %in% 0:4)
[1] TRUE

The provinces names:

> sort(unique(pertussis$province))
 [1] "Bac Giang"   "Bac Kan"     "Bac Ninh"    "Cao Bang"    "Dak Lak"     "Dien Bien"   "Dong Nai"    "Gia Lai"     "Ha Giang"   
[10] "Ha Nam"      "Ha Noi"      "Ha Tinh"     "Hai Duong"   "Hai Phong"   "Ho Chi Minh" "Hoa Binh"    "Hue"         "Hung Yen"   
[19] "Lai Chau"    "Lang Son"    "Lao Cai"     "Nam Dinh"    "Nghe An"     "Ninh Binh"   "Phu Tho"     "Quang Binh"  "Quang Ngai" 
[28] "Quang Ninh"  "Quang Tri"   "Soc Trang"   "Son La"      "Thai Binh"   "Thai Nguyen" "Thanh Hoa"   "Tuyen Quang" "Vinh Phuc"  
[37] "Vinh Yen"    "Yen Bai"    

Inconsistencies in the dates (if TRUE means there is no inconsistency):

> pertussis %>% 
+   dplyr::filter(discharge < dob | discharge < onset | discharge < admission | discharge < sample |
+                 dob > onset | dob > admission | dob > sample | dob > discharge) %>% 
+   nrow() %>% 
+   `<`(1)
[1] TRUE

Checking time ranges:

> date1 <- as.Date("2014-12-31") - 30
> date2 <- as.Date("2018-12-31") + 30
> pertussis %>% 
+   dplyr::filter(onset     < date1 | onset     > date2 |
+                 admission < date1 | admission > date2 |
+                 sample    < date1 | sample    > date2 |
+                 discharge < date1 | discharge > date2) %>% 
+   nrow() %>% 
+   `<`(1)
[1] TRUE

Time differences histogram:

> pertussis %$%
+   c(onset - admission,
+     onset - sample,
+     onset - discharge,
+     admission - sample,
+     admission - discharge,
+     sample - discharge) %>% 
+   na.exclude() %>% 
+   abs() %>% 
+   as.integer() %>% 
+   hist(0:4000, xlim = c(0, 100), col = "grey", main = NA,
+        xlab = "time difference (in days)",
+        ylab = "number of time differences")

Time differences above 3 months:

> time_diff <- 93
> pertussis %>% 
+   dplyr::filter(abs(onset - admission)     > time_diff |
+                 abs(onset - sample)        > time_diff |
+                 abs(onset - discharge)     > time_diff |
+                 abs(admission - sample)    > time_diff |
+                 abs(admission - discharge) > time_diff |
+                 abs(sample - discharge)    > time_diff) %>% 
+   dplyr::select(id, onset, admission, sample, discharge)
# A tibble: 3 x 5
         id onset      admission  sample     discharge 
      <int> <date>     <date>     <date>     <date>    
1 180174911 2018-03-15 2018-04-05 2018-04-05 2018-07-10
2 180234710 NA         2018-08-20 2018-11-28 2018-12-06
3 180377754 NA         2018-11-03 2018-08-02 2018-11-08

Checking outcomes:

> all(na.exclude(pertussis$outcome) %in% c("home", "death", "other_hospital", "other_department"))
[1] TRUE

Aggregating in time

Number of missing values for each of the date variables:

> pertussis %>% 
+   dplyr::select(onset:discharge) %>% 
+   sapply(function(x) sum(is.na(x)))
    onset admission    sample discharge 
     2212      1901         6      1916 

How many cases with no dates at all:

> pertussis %>% 
+   dplyr::filter(is.na(onset) & is.na(admission) & is.na(sample) & is.na(discharge)) %>% 
+   nrow()
[1] 1

A bunch of histograms

> pertussis %>% 
+   dplyr::select(onset, admission) %>% 
+   na.exclude() %>% 
+   dplyr::mutate(onset2admission = as.integer(admission - onset)) %$%
+   hist(onset2admission, col = "grey",
+        xlab = "time of admission after onset (in days)",
+        ylab = "number of cases")

> pertussis %>% 
+   dplyr::select(onset, sample) %>% 
+   na.exclude() %>% 
+   dplyr::mutate(onset2sample = as.integer(sample - onset)) %$%
+   hist(onset2sample, col = "grey",
+        xlab = "time of sample after onset (in days)",
+        ylab = "number of cases")

> pertussis %>% 
+   dplyr::select(onset, discharge) %>% 
+   na.exclude() %>% 
+   dplyr::mutate(onset2discharge = as.integer(discharge - onset)) %$%
+   hist(onset2discharge, col = "grey",
+        xlab = "time of discharge after onset (in days)",
+        ylab = "number of cases")

> pertussis %>% 
+   dplyr::select(admission, sample) %>% 
+   na.exclude() %>% 
+   dplyr::mutate(admission2sample = as.integer(sample - admission)) %$%
+   hist(admission2sample, col = "grey",
+        xlab = "time of sample after admission (in days)",
+        ylab = "number of cases")

> pertussis %>% 
+   dplyr::select(admission, discharge) %>% 
+   na.exclude() %>% 
+   dplyr::mutate(admission2discharge = as.integer(discharge - admission)) %$%
+   hist(admission2discharge, col = "grey",
+        xlab = "time of discharge after admission (in days)",
+        ylab = "number of cases")

Fixing dates

Mean number of days between onset and sample:

> onset2sample <- pertussis %>% 
+   dplyr::select(onset, sample) %>% 
+   na.exclude() %>% 
+   dplyr::mutate(onset2sample = as.integer(sample - onset)) %$%
+   mean(onset2sample) %>% 
+   round()

Mean number of days between onset and admission:

> onset2admission <- pertussis %>% 
+   dplyr::select(onset, admission) %>% 
+   na.exclude() %>% 
+   dplyr::mutate(onset2admission = as.integer(admission - onset)) %$%
+   mean(onset2admission) %>% 
+   round()

Generating composite onset variable onset2:

> pertussis2 <- pertussis %>% 
+   dplyr::filter(!(is.na(onset) & is.na(admission) & is.na(sample) & is.na(discharge))) %>% 
+   dplyr::mutate(onset2 = as.Date(ifelse(is.na(onset),
+                                         ifelse(is.na(sample),
+                                                admission - onset2admission,
+                                                sample - onset2sample),
+                                         onset), origin = "1970-01-01"),
+                 year = lubridate::year(onset2),
+                 month = lubridate::month(onset2),
+                 week = lubridate::epiweek(onset2)) %>% 
+   dplyr::filter(year > 2014) %>% 
+   dplyr::mutate(age  = as.numeric((onset2 - dob) / 365),
+                 age  = ifelse(age < 0, 0, age),
+                 baby = age <= 1)
> pertussis2 %>% 
+   dplyr::group_by(year) %>% 
+   dplyr::tally() %>% 
+   dplyr::ungroup() %>% 
+   dplyr::arrange(year)
# A tibble: 4 x 2
   year     n
  <dbl> <int>
1  2015   545
2  2016   566
3  2017  1010
4  2018   963

Very seasonal: by month:

> pertussis2 %>% 
+   dplyr::group_by(year, month) %>% 
+   dplyr::tally() %>% 
+   dplyr::ungroup() %>% 
+   dplyr::arrange(year, month) %$% 
+   barplot(n, col = year, ylab = "monthly incidence")

By week:

> pertussis2 %>%
+   dplyr::group_by(year, week) %>% 
+   dplyr::tally() %>% 
+   dplyr::ungroup() %>% 
+   dplyr::arrange(year, week) %$% 
+   barplot(n, col = year, border = year, ylab = "weekly incidence")

Inpatients / outpatients

> with(pertussis2, table(inpatient, year))
         year
inpatient 2015 2016 2017 2018
    FALSE  241  321  659  679
    TRUE   304  245  351  284
> with(pertussis2, fisher.test(table(inpatient, year), simulate.p.value = TRUE))

    Fisher's Exact Test for Count Data with simulated p-value (based on 2000 replicates)

data:  table(inpatient, year)
p-value = 0.0004998
alternative hypothesis: two.sided
> with(pertussis2, chisq.test(table(inpatient, year)))

    Pearson's Chi-squared test

data:  table(inpatient, year)
X-squared = 113.31, df = 3, p-value < 2.2e-16

On a by-year comparison, the proportion of inpatient decreases as the number of cases increases:

> pertussis2 %>% 
+   dplyr::group_by(year) %>% 
+   dplyr::summarise(n = dplyr::n(), inpatient = sum(inpatient)) %>% 
+   dplyr::ungroup() %$%
+   plot(n, inpatient / n, ylim = 0:1, pch = 19, col = "red",
+        xlab = "number of patients", ylab = "proportion of inpatients")

Binomial model:

> pertussis2 %>% 
+   dplyr::group_by(year) %>% 
+   dplyr::tally() %>% 
+   dplyr::ungroup() %>% 
+   dplyr::right_join(pertussis2, "year") %$% 
+   glm(inpatient ~ n, binomial, .) %>% 
+   summary()

Call:
glm(formula = inpatient ~ n, family = binomial, data = .)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.1716  -0.8976  -0.8693   1.1979   1.5206  

Coefficients:
              Estimate Std. Error z value Pr(>|z|)    
(Intercept)  0.8826439  0.1501538   5.878 4.15e-09 ***
n           -0.0016445  0.0001777  -9.252  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 4107.6  on 3083  degrees of freedom
Residual deviance: 4021.5  on 3082  degrees of freedom
AIC: 4025.5

Number of Fisher Scoring iterations: 4

On a by-month comparison, the proportion of inpatient decreases as the number of cases increases:

> pertussis2 %>% 
+   dplyr::group_by(year, month) %>% 
+   dplyr::summarise(n = dplyr::n(), inpatient = sum(inpatient)) %>% 
+   dplyr::ungroup() %$%
+   plot(n, inpatient / n, ylim = 0:1, pch = 19, col = "red",
+        xlab = "number of patients", ylab = "proportion of inpatients")

Binomial model:

> pertussis2 %>% 
+   dplyr::group_by(year, month) %>% 
+   dplyr::tally() %>% 
+   dplyr::ungroup() %>% 
+   dplyr::right_join(pertussis2, c("year", "month")) %$% 
+   glm(inpatient ~ n, binomial, .) %>% 
+   summary()

Call:
glm(formula = inpatient ~ n, family = binomial, data = .)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.2310  -0.9897  -0.8968   1.2966   1.6126  

Coefficients:
             Estimate Std. Error z value Pr(>|z|)    
(Intercept)  0.195013   0.106519   1.831   0.0671 .  
n           -0.008719   0.001317  -6.621 3.57e-11 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 4107.6  on 3083  degrees of freedom
Residual deviance: 4062.9  on 3082  degrees of freedom
AIC: 4066.9

Number of Fisher Scoring iterations: 4

Comparing the time series of inpatients and outpatients:

> pertussis2 %>% 
+   dplyr::filter(inpatient) %>% 
+   dplyr::group_by(year) %>% 
+   dplyr::tally() %>% 
+   dplyr::ungroup()
# A tibble: 4 x 2
   year     n
  <dbl> <int>
1  2015   304
2  2016   245
3  2017   351
4  2018   284

Both inpatients and outpatients are seasonal but outpatients have an increasing trends that inpatients do not have:

> pertussis2 %>% 
+   dplyr::filter(inpatient) %>% 
+   dplyr::group_by(year, month) %>% 
+   dplyr::tally() %>% 
+   dplyr::ungroup() %>% 
+   dplyr::arrange(year, month) %$% 
+   barplot(n, col = year, ylab = "monthly incidence")

> pertussis2 %>% 
+   dplyr::filter(!inpatient) %>% 
+   dplyr::group_by(year, month) %>% 
+   dplyr::tally() %>% 
+   dplyr::ungroup() %>% 
+   dplyr::arrange(year, month) %$% 
+   barplot(n, col = year, ylab = "monthly incidence")

Sex

There are significantly more males than females

> pertussis %$%
+   table(sex) %>% 
+   as.matrix() %>% 
+   t() %>% 
+   as.data.frame() %$%
+   prop.test(male, male + female)

    1-sample proportions test with continuity correction

data:  male out of male + female, null probability 0.5
X-squared = 7.75, df = 1, p-value = 0.005371
alternative hypothesis: true p is not equal to 0.5
95 percent confidence interval:
 0.5074009 0.5428588
sample estimates:
        p 
0.5251613 

The proportion of males is not different between the inpatients and outpatients:

> pertussis %$%
+   table(sex, inpatient) %>% 
+   fisher.test()

    Fisher's Exact Test for Count Data

data:  .
p-value = 0.4601
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
 0.8168011 1.0971472
sample estimates:
odds ratio 
 0.9466715 

Provinces

> pertussis %$% 
+   table(province) %>% 
+   sort(TRUE) %T>% print() %>% 
+   barplot()
province
     Ha Noi    Hung Yen    Nam Dinh    Bac Ninh     Phu Tho   Hai Duong     Nghe An      Ha Nam   Vinh Phuc   Bac Giang   Hai Phong 
       1053         190         176         170         159         158         123         116         107          91          90 
Thai Nguyen    Hoa Binh   Thanh Hoa   Ninh Binh   Thai Binh     Yen Bai      Son La  Quang Ninh Tuyen Quang     Lao Cai     Ha Tinh 
         74          64          63          59          51          51          50          44          41          39          38 
   Lang Son    Cao Bang    Ha Giang     Bac Kan   Dien Bien    Lai Chau Ho Chi Minh         Hue    Vinh Yen     Dak Lak    Dong Nai 
         33          12          11          10           7           5           3           2           2           1           1 
    Gia Lai  Quang Binh  Quang Ngai   Quang Tri   Soc Trang 
          1           1           1           1           1 

As many inpatients from Hanoi than from outside Hanoi:

> pertussis2 %>%
+   dplyr::mutate(hanoi  = province == "Ha Noi") %$% 
+   table(inpatient, hanoi) %T>% print() %>% 
+   fisher.test()
         hanoi
inpatient FALSE TRUE
    FALSE  1264  635
    TRUE    769  415

    Fisher's Exact Test for Count Data

data:  .
p-value = 0.3689
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
 0.9190995 1.2550247
sample estimates:
odds ratio 
  1.074199 

Age

Age is missing for only 8 patients:

> sum(is.na(pertussis$dob))
[1] 8
> range(pertussis2$age, na.rm = TRUE)
[1]  0.00000 54.21918
> hist(pertussis2$age, 0:ceiling(max(pertussis2$age, na.rm = TRUE)),
+      col = "grey", main = NA, xlab = "age (years)", ylab = "number of cases")

The proportion of babies (age < 1 year) decreases as the number of patients increases, on a yearly basis:

> pertussis2 %>% 
+   dplyr::group_by(year) %>% 
+   dplyr::summarise(n = dplyr::n(), nbabies = sum(baby, na.rm = TRUE)) %>% 
+   dplyr::ungroup() %$%
+   plot(n, nbabies / n, ylim = 0:1, pch = 19, col = "red",
+        xlab = "number of patients", ylab = "proportion of babies")

Binomial model:

> pertussis2 %>% 
+   dplyr::group_by(year) %>% 
+   dplyr::tally() %>% 
+   dplyr::ungroup() %>% 
+   dplyr::right_join(pertussis2, "year") %$% 
+   glm(baby ~ n, binomial, .) %>% 
+   summary()

Call:
glm(formula = baby ~ n, family = binomial, data = .)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.6281  -1.2016   0.7859   1.1534   1.1992  

Coefficients:
              Estimate Std. Error z value Pr(>|z|)    
(Intercept)  2.2681115  0.1647948   13.76   <2e-16 ***
n           -0.0022964  0.0001877  -12.23   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 4180.0  on 3075  degrees of freedom
Residual deviance: 4021.3  on 3074  degrees of freedom
  (8 observations deleted due to missingness)
AIC: 4025.3

Number of Fisher Scoring iterations: 4

Same on a monthly basis:

> pertussis2 %>% 
+   dplyr::group_by(year, month) %>% 
+   dplyr::summarise(n = dplyr::n(), nbabies = sum(baby, na.rm = TRUE)) %>% 
+   dplyr::ungroup() %$%
+   plot(n, nbabies / n, ylim = 0:1, pch = 19, col = "red",
+        xlab = "number of patients", ylab = "proportion of babies")

Binomial model:

> pertussis2 %>% 
+   dplyr::group_by(year, month) %>% 
+   dplyr::tally() %>% 
+   dplyr::ungroup() %>% 
+   dplyr::right_join(pertussis2, c("year", "month")) %$% 
+   glm(baby ~ n, binomial, .) %>% 
+   summary()

Call:
glm(formula = baby ~ n, family = binomial, data = .)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.5147  -1.2838   0.9438   1.0425   1.2104  

Coefficients:
             Estimate Std. Error z value Pr(>|z|)    
(Intercept)  0.894703   0.107422   8.329  < 2e-16 ***
n           -0.007200   0.001287  -5.594 2.22e-08 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 4180.0  on 3075  degrees of freedom
Residual deviance: 4148.3  on 3074  degrees of freedom
  (8 observations deleted due to missingness)
AIC: 4152.3

Number of Fisher Scoring iterations: 4

Sex ratio in the same among babies and non-babies:

> with(pertussis2, fisher.test(table(sex, baby)))

    Fisher's Exact Test for Count Data

data:  table(sex, baby)
p-value = 0.3052
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
 0.9334485 1.2503841
sample estimates:
odds ratio 
  1.080354 

Many more babies among inpatients than outpatients:

> with(pertussis2, fisher.test(table(inpatient, baby)))

    Fisher's Exact Test for Count Data

data:  table(inpatient, baby)
p-value < 2.2e-16
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
 5.923130 8.558158
sample estimates:
odds ratio 
  7.107206 

10 non-children patients:

> pertussis2 %>%
+   dplyr::filter(age > 16) %>%
+   dplyr::select(id, sex, year, age)
# A tibble: 10 x 4
          id sex     year   age
       <int> <chr>  <dbl> <dbl>
 1 176548872 female  2017  24.0
 2 186413232 female  2018  25.2
 3 185655665 female  2018  26.0
 4 182135465 female  2018  28.8
 5 186577879 female  2018  30.7
 6 173244545 female  2017  32.3
 7 186565987 male    2018  33.1
 8 179879874 male    2017  33.5
 9 186565486 female  2018  33.8
10 170047563 female  2017  54.2

Severity

The proxies of severity:

> pertussis2 %>% 
+   dplyr::select(oxygen, ecmo, rrt, ventilation) %>% 
+   lapply(table)
$oxygen

FALSE  TRUE 
  999   186 

$ecmo

FALSE  TRUE 
  883     6 

$rrt

FALSE  TRUE 
  876    12 

$ventilation

FALSE  TRUE 
 1001   184 

The level of severity is higher among patients from outside Hanoi than from patients from Hanoi:

> pertussis2 %>%
+   dplyr::filter(inpatient) %>% 
+   dplyr::mutate(hanoi  = province == "Ha Noi",
+                 severe = oxygen) %$% 
+   table(severe, hanoi) %T>% print() %>% 
+   fisher.test()
       hanoi
severe  FALSE TRUE
  FALSE   614  384
  TRUE    155   31

    Fisher's Exact Test for Count Data

data:  .
p-value = 2.97e-09
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
 0.2058894 0.4844087
sample estimates:
odds ratio 
 0.3200783 

However we compute the severity variable:

> pertussis2 %>%
+   dplyr::filter(inpatient) %>% 
+   dplyr::mutate(hanoi  = province == "Ha Noi",
+                 severe = oxygen | ecmo | ventilation | rrt) %$% 
+   table(severe, hanoi) %T>% print() %>% 
+   fisher.test()
       hanoi
severe  FALSE TRUE
  FALSE   372  264
  TRUE    272   72

    Fisher's Exact Test for Count Data

data:  .
p-value = 4.368e-11
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
 0.2712074 0.5099506
sample estimates:
odds ratio 
 0.3733574 

Seasonality on severity:

> pertussis2 %>% 
+   dplyr::filter(oxygen) %>% 
+   dplyr::group_by(year, month) %>% 
+   dplyr::tally() %>% 
+   dplyr::ungroup() %>% 
+   dplyr::arrange(year, month) %$%
+   barplot(n, col = year, ylab = "monthly number of severe cases")

The proportion of severe cases tends to decrease when the incidence increases:

> pertussis2 %>%
+   dplyr::group_by(year, month) %>% 
+   dplyr::tally() %>% 
+   dplyr::ungroup() %>% 
+   dplyr::right_join(dplyr::mutate(pertussis2, severe = oxygen | ecmo | ventilation | rrt),
+                     c("year", "month")) %$% 
+   glm(severe ~ n, binomial, .) %>% 
+   summary()

Call:
glm(formula = severe ~ n, family = binomial, data = .)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.1006  -0.9610  -0.8589   1.3769   1.6450  

Coefficients:
             Estimate Std. Error z value Pr(>|z|)   
(Intercept) -0.128463   0.175497  -0.732  0.46417   
n           -0.006856   0.002310  -2.968  0.00299 **
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 1271.1  on 980  degrees of freedom
Residual deviance: 1262.1  on 979  degrees of freedom
  (2103 observations deleted due to missingness)
AIC: 1266.1

Number of Fisher Scoring iterations: 4

However, this is not the case anymore when we consider oxygen use only as a mesure of severity:

> pertussis2 %>% 
+   dplyr::group_by(year, month) %>% 
+   dplyr::tally() %>% 
+   dplyr::ungroup() %>% 
+   dplyr::right_join(pertussis2, c("year", "month")) %$% 
+   glm(oxygen ~ n, binomial, .) %>% 
+   summary()

Call:
glm(formula = oxygen ~ n, family = binomial, data = .)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-0.5855  -0.5848  -0.5843  -0.5837   1.9261  

Coefficients:
              Estimate Std. Error z value Pr(>|z|)    
(Intercept) -1.677e+00  2.186e-01  -7.668 1.75e-14 ***
n           -6.155e-05  2.785e-03  -0.022    0.982    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 1030  on 1184  degrees of freedom
Residual deviance: 1030  on 1183  degrees of freedom
  (1899 observations deleted due to missingness)
AIC: 1034

Number of Fisher Scoring iterations: 3

There are more severe cases among babies than non babies:

> pertussis2 %$% 
+   table(baby, oxygen) %T>% print() %>% 
+   fisher.test()
       oxygen
baby    FALSE TRUE
  FALSE   180   11
  TRUE    819  175

    Fisher's Exact Test for Count Data

data:  .
p-value = 1.068e-05
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
 1.853508 7.280370
sample estimates:
odds ratio 
  3.494158 

Severity does not depends on gender:

> pertussis2 %$% 
+   table(sex, oxygen) %T>% print() %>% 
+   fisher.test()
        oxygen
sex      FALSE TRUE
  female   478   95
  male     521   91

    Fisher's Exact Test for Count Data

data:  .
p-value = 0.4254
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
 0.6344169 1.2171247
sample estimates:
odds ratio 
 0.8789333