Preambule

The analysis detailed here requires the presence of the following data files in a directory named data in the current working directory:

> dir("data")
[1] "abbreviations.txt"                             
[2] "AMR_pathogens_221217.xlsx"                     
[3] "Scoring_pathogens_DrThu_Juan_Niwat_221217.xlsx"
[4] "Use_Antimicrobial_by_week_221217.xlsx"         

External packages

If not installed:

> install.packages(c("magrittr", "readxl", "naivebayes", "dplyr", "devtools",
+                    "vioplot", "magrittr", "tidyr", "purrr"))
> devtools::install_github("choisy/mcutils")

Loading required packages for interactive use:

> library(vioplot)    # vioplot
> library(tidyr)      # gather
> library(mcutils)    # ovv
> library(magrittr)   # %>%, %$%, %<>%
> library(readxl)     # read_excel
> library(naivebayes) # naive_bayes, predict.naive_bayes
> library(dplyr)      # filter, funs, group_by, left_join, matches, mutate,
>                     # mutate_at, starts_with, right_join, select, summarize,
>                     # summarise_all, ungroup

Utilitary functions

> data.frame2 <- function(...) data.frame(..., stringsAsFactors = FALSE) 

Functions for reading the data

There are 2 types of data collected from the farms: symptoms surveillance and antibiotics usage. There are 2 types of data from the literature and expert opinion: antibiotic resistance and diseases symptoms.

Disease symptoms from surveillance

The function reshape takes 4 vectors cycle, farmcode, min and max and returns a data frame of 3 columns with the sequence from min to max (by step of 1) for the given cycles and farms.

> reshape <- function(cycle, farmcode, min, max) {
+   require(purrr) # map2
+   map2(min, max, seq) %>% 
+     map2(farmcode, ., data.frame2) %>% 
+     map2(cycle, ., data.frame2) %>% 
+     do.call(rbind, .)
+ }

The above function reshape is used exclusively by the function add_missing_rows that works on a data frame:

> add_missing_rows <- function(df) {
+   require(magrittr) # %$%, %>%  
+   require(dplyr)    # group_by, summarise, left_join
+   by <- c("CYCLE", "FARMCODE", "WEEK")
+   df %>%
+     group_by(CYCLE, FARMCODE) %>%
+     summarise(min = min(WEEK), max = max(WEEK)) %$%
+     reshape(CYCLE, FARMCODE, min, max) %>% 
+     setNames(by) %>% 
+     left_join(df, by)
+ }

(there is indeed week 4 for cycle 2, farm 88 that is missing). The following function uses the previously described functions add_missing_rows and reads a file of symptoms:

> read_symptoms <- function(file) {
+   require(dplyr)
+   file %>% 
+     readxl::read_excel() %>% 
+     select(FARMCODE, CYCLE, WEEK, starts_with("S")) %>% # deletes useless columns
+     filter(!is.na(FARMCODE)) %>% # removes empty lines
+     mutate_if(is.numeric, as.integer) %>%
+     add_missing_rows() %>% # (there is still one missing row...)
+     mutate_at(vars(starts_with("S")), funs(. > 0)) # recodes symptoms
+ }

Antimicrobials usage from surveillance

The following function reads AMU data:

> read_amu <- function(file) {
+   require(dplyr)
+   file %>% 
+     readxl::read_excel() %>% 
+     rename(WEEK  = WEEKNO,
+            CYCLE = FLOCKSEQUENCE) %>% 
+     mutate_at(vars(matches("^C|W")), as.integer)
+ }

Reading surveillance data altogether

The following function extract the vector of antibiotics names used for a given cycle, farm and week. df is the data frame that contains the antimicrobial column in addition to the cycle, farm and week columns.

> extract_ab <- function(cycle, farm, week, df) {
+   require(dplyr)
+   df %>%
+     filter(CYCLE == cycle, FARMCODE == farm, WEEK == week) %>%
+     select(ANTIMICROBIAL) %>%
+     unlist() %>% 
+     unname()
+ }

The following function uses read_symptoms, read_amu and extract_ab and returns the surveillance data:

> read_surveillance <- function(symptoms_file, amu_file) {
+   require(dplyr)
+   symptoms <- read_symptoms(symptoms_file)
+   amu <- read_amu(amu_file)
+   select(amu, -ANTIMICROBIAL) %>%
+     unique() %>%
+     mutate(antimicrobial = pmap(list(CYCLE, FARMCODE, WEEK), extract_ab, amu),
+ # correcting some antimicrobials' names:
+            antimicrobial = lapply(antimicrobial,
+                                   function(x) sub("OXTETRACYCLINE",
+                                                   "OXYTETRACYCLINE", x)),
+            antimicrobial = lapply(antimicrobial,
+                                   function(x) sub("SULFADIMETHOXIN",
+                                                   "SULFADIMETHOXINE", x))) %>%
+ # joining with symptoms before returning:
+     right_join(symptoms, by = c("CYCLE", "FARMCODE", "WEEK"))
+ }

Etiologic causes of diseases from literature and expert opinion

This function reads the data linking symptoms to etiology from literature and experts opinions:

> read_etiology <- function(file) {
+   file %>% readxl::read_excel() %>% 
+     mutate_at(vars(matches("^S_|S\\.")), funs(!is.na(.))) %>% 
+     mutate(old   = Age %in% c("All ages", "Older"),
+            young = Age %in% c("All ages", "Young")) %>% 
+     select(-`High mortality`, -X__1, -Age)
+ }

Antimicrobial resistance from the literature and expert opinion

The function replace_na replaces each missing value of a vector by the last non-missing value that precedes it:

> replace_na <- function(x) {
+   for(i in 2:length(x))
+     if(is.na(x[i])) x[i] <- x[i - 1]
+   x
+ }

This function is used exclusively by the following read_amr function:

> .read_amr <- function(file) {
+   file %>%
+     readxl::read_excel() %>% 
+     rename(class = X__1, drug = X__2) %>%
+     select(-starts_with("X__")) %>% 
+     mutate(class = replace_na(class))
+ }

This function reads the AMR data and adds information about viruses:

> read_amr <- function(file, viruses = NULL) {
+   file %>%
+     .read_amr() %>%
+     cbind(setNames(data.frame(matrix(rep(100, length(viruses)), 1, length(viruses))), viruses)) %>% 
+ # correting some parasites and drugs names:
+     rename(`Ornithobacterium rhinotracheale` = `Ornitobacterium rhinotracheale`) %>% 
+     mutate(drug = sub("SULPHAMETHOXAZOLE", "SULFAMETHOXAZOLE", drug),
+            drug = sub("SULPHATHIAZOLE",    "SULFATHIAZOLE",    drug))
+ }

Reading the data

The data files:

> symptoms_file <- "data/Disasease_in_flocks_221217_cBao.xlsx"
> amu_file <- "data/Use_Antimicrobial_by_week_221217.xlsx"
> etiology_file <- "data/Scoring_pathogens_DrThu_Juan_Niwat_221217.xlsx"
> amr_file <- "data/AMR_pathogens_221217.xlsx"

Surveillance data, including symptoms and AMU:

> surveillance <- read_surveillance(symptoms_file, amu_file)

Data on etiologic causes:

> etiology <- read_etiology(etiology_file)

AMR data:

> amr <- read_amr(amr_file,
+                 grep("virus|coccidiosis", etiology$Pathogen, TRUE, value = TRUE))

Reading the abbreviations of pathogens and drugs’ names:

> abbreviations <- "data/abbreviations.txt" %>% 
+   read.csv(FALSE, stringsAsFactors = FALSE) %$%
+   setNames(V2, V1)

Exploration of data sets

Funtions to generate diseases episodes

The function fill_gaps takes a vector of logicals (or zeros and ones) and replaces all the 1 0 1 sequences by 1 1 1:

> fill_gaps <- function(x) {
+   as.integer(x) %>%         # because easier to deal with 1-character encoding
+     paste(collapse = "") %>% 
+     gsub("101", "111", .) %>% 
+     gsub("NA", "x", .) %>%  # need to replace potential NA by 1-character...
+     strsplit("") %>%        # ... because of the strsplit that follows
+     unlist(FALSE) %>%       # because strsplit returns a list
+     sub("x", NA, .) %>%     # back to NA
+     as.numeric() %>% 
+     as.logical()
+ }

The function episodes identifies the different disease episodes. It would transform 1 1 1 0 0 1 1 0 0 0 1 0 into 1 1 1 0 0 2 2 0 0 0 3 0:

> episodes <- function(x) {
+   x <- rle(x)
+   sel <- which(x$values > 0)
+   x$values[sel] <- seq_along(sel)
+   inverse.rle(x)
+ }

The following function uses the two above-defined functions fill_gaps and episodes to generate an episode column on a surveillance data frame. There is one fillgaps option that specifies whether 1 0 1 sequences of weeks should be transformed into 1 1 1.

> make_episodes <- function(df, fillgaps = TRUE) {
+   require(dplyr)
+   if (fillgaps) f <- fill_gaps else f <- I
+   df %>%
+     mutate(disease = select(., starts_with("S")) %>% # computing...
+                        rowSums() %>%                 # ... the presence...
+                        `>`(., 0)) %>%                # ... of disease
+     group_by(CYCLE, FARMCODE) %>% 
+     mutate(disease = f(disease), # optionally transforms "1 0 1" into "1 1 1"
+            episode = episodes(disease)) %>% # generates diseases episodes
+     group_by(episode, add = TRUE) %>% 
+     mutate_if(is.logical, any) %>% # merge the symptoms for each episod
+     ungroup() %>% 
+     select(-disease) %>% 
+     mutate_at(vars(starts_with("S"), episode),  # removes symptoms where there...
+               funs(ifelse(episode > 0, ., NA))) # ... is no disease episode
+ }

Looking at surveillance data

> (cycles <- surveillance %>%
+   make_episodes(fillgaps = TRUE) %>% 
+   group_by(CYCLE) %>%
+   select(CYCLE, FARMCODE) %>%
+   unique() %>%
+   ungroup() %>%
+   split(.$CYCLE) %>%
+   lapply(`[[`, 2))
$`1`
 [1] "75-001" "75-002" "75-003" "75-004" "75-005" "75-006" "75-007" "75-008"
 [9] "75-009" "75-010" "75-011" "75-012" "75-013" "75-014" "75-015" "75-016"
[17] "75-018" "75-019" "75-020" "75-021" "75-022" "75-023" "75-026" "75-027"
[25] "75-028" "75-029" "75-030" "75-031" "75-032" "75-033" "75-034" "75-035"
[33] "75-036" "75-037" "75-038" "75-039" "75-040" "75-041" "75-042" "75-043"
[41] "75-044" "75-045" "75-046" "75-047" "75-048" "75-049" "75-050" "75-051"
[49] "75-052" "75-053" "75-061" "75-062" "75-063" "75-064" "75-065" "75-066"
[57] "75-067" "75-068" "75-069" "75-070" "75-071" "75-072" "75-073" "75-074"
[65] "75-075" "75-076" "75-077" "75-078" "75-079" "75-080" "75-081" "75-082"
[73] "75-083" "75-085" "75-086" "75-087" "75-088" "75-089" "75-090" "75-091"
[81] "75-092" "75-093" "75-094" "75-095" "75-096" "75-097" "75-098" "75-099"
[89] "75-100" "75-101" "75-102" "75-103" "75-104" "75-105" "75-106"

$`2`
 [1] "75-001" "75-002" "75-004" "75-005" "75-006" "75-008" "75-011" "75-012"
 [9] "75-014" "75-015" "75-019" "75-020" "75-021" "75-022" "75-025" "75-026"
[17] "75-027" "75-028" "75-029" "75-030" "75-031" "75-033" "75-034" "75-036"
[25] "75-039" "75-040" "75-041" "75-044" "75-046" "75-047" "75-049" "75-062"
[33] "75-064" "75-065" "75-070" "75-072" "75-073" "75-074" "75-079" "75-080"
[41] "75-081" "75-086" "75-087" "75-088" "75-091" "75-095"

$`3`
[1] "75-012" "75-021" "75-027" "75-029" "75-047" "75-070" "75-080"
> sapply(cycles, length)
 1  2  3 
95 46  7 

All farms from cycle 3 are in cycle 2:

> all(cycles[[3]] %in% cycles[[2]])
[1] TRUE

But one farm from cyle 2 is not in cycle 1:

> all(cycles[[2]] %in% cycles[[1]])
[1] FALSE
> cycles[[2]][!cycles[[2]] %in% cycles[[1]]]
[1] "75-025"

Antiobiotics usage from surveillance

Note that in some farms, some weeks, there is no antibiotic use, whereas in other weeks and farms, there is 1 or more than 1 antibiotic used at the same time:

> amu_file %>% 
+   read_amu() %$%
+   table(FARMCODE, WEEK) %>%
+   ovv()
        1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 20 21
 75-001 4 0 2 2 4 0 0 1 1  1  2  0  0  0  0  2  0  0  0  0
 75-002 4 2 0 2 3 1 0 0 1  0  0  0  0  0  0  0  0  0  0  0
 75-003 0 2 2 2 0 0 0 0 0  0  0  0  0  0  0  0  0  0  0  0
 75-004 4 2 2 4 2 1 0 0 1  1  0  1  0  0  0  0  0  0  0  0
 75-005 4 2 0 2 2 2 2 0 0  3  0  2  0  0  0  0  0  0  0  0
 75-006 4 1 0 1 4 2 2 3 0  4  2  2  3  3  0  0  0  0  0  0
      . . . . . . . . . .  .  .  .  .  .  .  .  .  .  .  .
      . . . . . . . . . .  .  .  .  .  .  .  .  .  .  .  .
      . . . . . . . . . .  .  .  .  .  .  .  .  .  .  .  .
 75-101 3 2 0 1 0 3 3 0 0  0  0  0  0  0  0  0  0  0  0  0
 75-102 3 0 3 0 0 0 3 0 0  2  2  1  0  0  0  0  0  0  0  0
 75-103 2 3 2 2 0 0 2 0 2  2  2  0  0  0  0  0  0  0  0  0
 75-104 2 4 4 5 2 2 4 5 4  3  4  2  4  2  2  0  0  0  0  0
 75-105 2 0 0 0 0 0 0 3 2  0  0  0  0  0  0  0  0  0  0  0
 75-106 2 0 0 0 2 4 1 0 0  0  0  0  0  0  0  0  0  0  0  0

Plotting surveillance data

The function survey_episodes puts in shape a surveillance object (for a given cycle) for easier visualization of both the surveyed weeks and the weeks for which there is a disease episode. 0 means absence of survey, 1 means survey but absence of disease episode and 2 means survey and disease episode.

> survey_episodes <- function(df) {
+ # utilitary function -----------------------------------------------------------
+   make_farm_week_table <- function(df) {
+     df %$%
+       table(FARMCODE, WEEK) %>% 
+       unclass() %>% 
+       as.data.frame()
+   }
+   surveyed_weeks <- make_farm_week_table(df)
+   df %>% filter(episode > 0) %>% 
+     make_farm_week_table() %>% 
+     mutate(FARMCODE = rownames(.)) %>% 
+     right_join(data.frame2(FARMCODE = rownames(surveyed_weeks)), "FARMCODE") %>% 
+     select(-FARMCODE) %>% 
+     t() %>% 
+     as.data.frame() %>% 
+     mutate(., WEEK = rownames(.)) %>% 
+     right_join(data.frame2(WEEK = colnames(surveyed_weeks)), "WEEK") %>%
+     select(-WEEK) %>% 
+     t() %>%
+     as.data.frame() %>% 
+     mutate_all(funs(replace(., is.na(.), 0))) %>%
+     `+`(surveyed_weeks, .)
+ }

The following function sorts the farms of an output of the survey_episodes function according to the duration of their cycle:

> sort_episodes <- function(df) {
+   df[names(sort(rowSums(df > 0))), ]
+ }

Among the arguments of the following function, df is an output of the function survey_episodes (for one given cycle):

> plot_survey_episode <- function(df, sort = TRUE, with_amu = FALSE,
+                                 col_no_survey = "grey",
+                                 col_survey_no_dis = "blue",
+                                 col_survey_disease = "red",
+                                 amu_col = RColorBrewer::brewer.pal(7, "YlOrRd")) {
+   require(magrittr)
+ # a tuned image function ---------------------------------------------------------
+   image2 <- function(z) {
+     image(1:nrow(z), 1:ncol(z), z,
+           xlab = "week", ylab = "farm",
+           col = c(col_no_survey, col_survey_no_dis, col_survey_disease))
+     abline(v = 1:nrow(z) - .5, col = "white", lwd = .5)
+     abline(h = 1:ncol(z) - .5, col = "white", lwd = .5)
+     box(bty = "o")
+     z # output equal to input to make it pipe compliant
+   }
+ # a function to retrieve farms codes ---------------------------------------------
+   get_farms_codes <- function(df) setNames(1:ncol(df), colnames(df))
+ # optionally sorting the data ----------------------------------------------------
+   if (sort) optionally_sort <- sort_episodes else optionally_sort <- I
+ # plotting the survey and disease episodes ---------------------------------------
+   farms <- # codes of the farms for output
+     df %>% 
+     survey_episodes() %>% 
+     optionally_sort() %>% 
+     t() %>%
+     image2() %>% 
+     get_farms_codes()
+ # adding AMU ---------------------------------------------------------------------
+   if (with_amu) {
+     df %>% 
+       mutate(nb_ab = sapply(antimicrobial, length)) %>% 
+       filter(nb_ab > 0) %$% 
+       points(WEEK, farms[FARMCODE], pch = 19, col = amu_col[nb_ab])
+   }
+ # returning output ---------------------------------------------------------------
+   farms
+ }

The following function calculates the prevalence of disease by week and across farms. It takes an output of the function survey_episodes (for one given cycle) as input argument.

> binom_test <- function(df, ci = .95) {
+   df %>%
+     (function(y)
+        Map(function(x, n)
+              unlist(binom.test(x, n, conf.level = ci)[c("estimate", "conf.int")]),
+            colSums(y > 1), colSums(y > 0))) %>%
+     data.frame
+ }

The following function uses the binom_test function to plot the prevalence of disease by week and across farms, with confidence interval.

> plot_nb_episodes <- function(df) {
+   xlim <- range(df$WEEK) + c(-.5, .5)
+   df %<>% survey_episodes()
+   prevalences <- binom_test(df)
+   xval <- as.numeric(names(df)) %>%
+     (function(x) as.vector(t(cbind(x - .5, x + .5))))
+   plot(NA, xlim = xlim, ylim = c(0, 1), type = "n", xaxs = "i", yaxs = "i",
+        axes = FALSE, ann = FALSE)
+   abline(h = seq(0, 1, .1), col = "lightgrey", lwd = .75)
+   abline(h = seq(0, 1, .2), col = "darkgrey")
+   abline(v = seq(.5, 26.5, 1), col = "lightgrey")
+   polygon(c(xval, rev(xval)),
+           c(rep(prevalences[2, ], each = 2), rep(rev(prevalences[3, ]), each = 2)),
+           border = NA, col = adjustcolor("lightblue", .6))
+   lines(xval, rep(prevalences[1, ], each = 2), col = "darkblue")
+   axis(3)
+   mtext("week", 3, 1.5)
+   axis(4)
+   mtext("prevalence", 4, 1.5)
+   box(bty = "o")
+ }

The following function puts the previous plot_survey_episode and plot_nb_episodes together:

> plot_epis_amu <- function(df, sort = TRUE, with_amu = FALSE, top = .25,
+                           col_no_survey = "grey",
+                           col_survey_no_dis = "blue",
+                           col_survey_disease = "red",
+                           amu_col = RColorBrewer::brewer.pal(7, "YlOrRd")) {
+ # parameters of the layout of the plot -----------------------------------------
+   npar <- par("plt")
+   npar[3] <- .075
+   npar[4] <- 1 - npar[3]
+   npar[2] <- 1 - npar[1]
+   y4 <- npar[4]
+   y3 <- y4 - top * diff(npar[3:4])
+   npar[4] <- y3
+ # bottom subfigure -------------------------------------------------------------
+   opar <- par(plt = npar)
+   farms <- plot_survey_episode(df, sort, with_amu, col_no_survey,
+                                col_survey_no_dis, col_survey_disease, amu_col)
+ # top subfigure ----------------------------------------------------------------
+   npar[3] <- y3
+   npar[4] <- y4
+   par(plt = npar, new = TRUE)
+   plot_nb_episodes(df)
+   par(opar)
+ # the output -------------------------------------------------------------------
+   farms
+ }

Let’s see that for the 3 cycles:

> surveillance %>%
+   make_episodes(fillgaps = TRUE) %>%
+   filter(CYCLE == 1) %>%
+   plot_epis_amu(sort = TRUE, with_amu = FALSE)

75-037 75-007 75-061 75-089 75-092 75-034 75-036 75-053 75-106 75-085 75-042 
     1      2      3      4      5      6      7      8      9     10     11 
75-075 75-101 75-012 75-003 75-074 75-019 75-031 75-033 75-011 75-015 75-040 
    12     13     14     15     16     17     18     19     20     21     22 
75-091 75-093 75-095 75-099 75-013 75-014 75-023 75-030 75-032 75-048 75-052 
    23     24     25     26     27     28     29     30     31     32     33 
75-065 75-076 75-078 75-079 75-081 75-002 75-004 75-006 75-008 75-020 75-028 
    34     35     36     37     38     39     40     41     42     43     44 
75-043 75-051 75-068 75-071 75-072 75-073 75-077 75-088 75-094 75-100 75-005 
    45     46     47     48     49     50     51     52     53     54     55 
75-010 75-026 75-038 75-039 75-044 75-046 75-064 75-066 75-082 75-087 75-098 
    56     57     58     59     60     61     62     63     64     65     66 
75-102 75-104 75-021 75-027 75-041 75-045 75-070 75-097 75-103 75-009 75-016 
    67     68     69     70     71     72     73     74     75     76     77 
75-022 75-047 75-050 75-067 75-069 75-086 75-090 75-049 75-063 75-096 75-018 
    78     79     80     81     82     83     84     85     86     87     88 
75-062 75-029 75-080 75-083 75-105 75-035 75-001 
    89     90     91     92     93     94     95 
> surveillance %>%
+   make_episodes(fillgaps = TRUE) %>%
+   filter(CYCLE == 2) %>%
+   plot_epis_amu(sort = TRUE, with_amu = FALSE)

75-049 75-026 75-065 75-040 75-088 75-036 75-046 75-034 75-044 75-015 75-039 
     1      2      3      4      5      6      7      8      9     10     11 
75-041 75-072 75-095 75-019 75-028 75-033 75-074 75-079 75-008 75-011 75-031 
    12     13     14     15     16     17     18     19     20     21     22 
75-064 75-081 75-086 75-012 75-070 75-073 75-004 75-020 75-022 75-027 75-029 
    23     24     25     26     27     28     29     30     31     32     33 
75-047 75-087 75-021 75-025 75-030 75-080 75-091 75-002 75-006 75-062 75-001 
    34     35     36     37     38     39     40     41     42     43     44 
75-005 75-014 
    45     46 
> surveillance %>%
+   make_episodes(fillgaps = TRUE) %>%
+   filter(CYCLE == 3) %>%
+   plot_epis_amu(sort = TRUE, with_amu = FALSE)

75-029 75-070 75-047 75-080 75-027 75-012 75-021 
     1      2      3      4      5      6      7 

We can also put all the cycles together, in which case we need to somehow rename the farms so that it not only accounts for the farm code, but also for the cycle number:

> surveillance %>%
+   mutate(FARMCODE = paste(CYCLE, FARMCODE, sep = "-")) %>%
+   make_episodes(fillgaps = TRUE) %>%
+   plot_epis_amu(sort = TRUE, with_amu = FALSE)

1-75-037 1-75-007 1-75-061 1-75-089 1-75-092 2-75-049 3-75-029 3-75-070 
       1        2        3        4        5        6        7        8 
1-75-034 1-75-036 1-75-053 1-75-106 2-75-026 2-75-065 1-75-085 2-75-040 
       9       10       11       12       13       14       15       16 
2-75-088 3-75-047 3-75-080 1-75-042 1-75-075 1-75-101 2-75-036 3-75-027 
      17       18       19       20       21       22       23       24 
1-75-012 2-75-046 2-75-034 2-75-044 1-75-003 1-75-074 1-75-019 1-75-031 
      25       26       27       28       29       30       31       32 
1-75-033 2-75-015 2-75-039 2-75-041 2-75-072 2-75-095 3-75-012 1-75-011 
      33       34       35       36       37       38       39       40 
1-75-015 1-75-040 1-75-091 1-75-093 1-75-095 1-75-099 1-75-013 1-75-014 
      41       42       43       44       45       46       47       48 
1-75-023 1-75-030 1-75-032 1-75-048 1-75-052 1-75-065 1-75-076 1-75-078 
      49       50       51       52       53       54       55       56 
1-75-079 1-75-081 2-75-019 2-75-028 2-75-033 2-75-074 2-75-079 1-75-002 
      57       58       59       60       61       62       63       64 
1-75-004 1-75-006 1-75-008 1-75-020 1-75-028 1-75-043 1-75-051 1-75-068 
      65       66       67       68       69       70       71       72 
1-75-071 1-75-072 1-75-073 1-75-077 1-75-088 1-75-094 1-75-100 2-75-008 
      73       74       75       76       77       78       79       80 
2-75-011 2-75-031 2-75-064 2-75-081 2-75-086 1-75-005 1-75-010 1-75-026 
      81       82       83       84       85       86       87       88 
1-75-038 1-75-039 1-75-044 1-75-046 1-75-064 1-75-066 1-75-082 1-75-087 
      89       90       91       92       93       94       95       96 
1-75-098 1-75-102 1-75-104 2-75-012 2-75-070 2-75-073 1-75-021 1-75-027 
      97       98       99      100      101      102      103      104 
1-75-041 1-75-045 1-75-070 1-75-097 1-75-103 2-75-004 2-75-020 2-75-022 
     105      106      107      108      109      110      111      112 
2-75-027 2-75-029 2-75-047 2-75-087 1-75-009 1-75-016 1-75-022 1-75-047 
     113      114      115      116      117      118      119      120 
1-75-050 1-75-067 1-75-069 1-75-086 1-75-090 2-75-021 2-75-025 2-75-030 
     121      122      123      124      125      126      127      128 
2-75-080 2-75-091 1-75-049 1-75-063 1-75-096 2-75-002 2-75-006 1-75-018 
     129      130      131      132      133      134      135      136 
1-75-062 2-75-062 3-75-021 1-75-029 1-75-080 1-75-083 1-75-105 2-75-001 
     137      138      139      140      141      142      143      144 
1-75-035 2-75-005 2-75-014 1-75-001 
     145      146      147      148 

A naive Bayes model to infer pathogens from symptoms

The following function uses model object and data newdata to make predictions (we reversed the order of the arguments compared to predict in order to make it pipe compliant):

> predict2 <- function(newdata, object) {
+   require(dplyr) # %>%, mutate, select, mutate_all, funs
+   newdata %>% 
+     predict(object, ., type = "prob") %>% 
+     cbind(select(newdata, episode), .) %>%
+     mutate_all(funs(ifelse(episode < 1, NA, .))) %>% 
+     cbind(select(newdata, CYCLE, FARMCODE, WEEK, antimicrobial), .)
+ }

Let’s retrieve the predicted probabilities based on the sets of symptoms recorded in surveillance. First, let’s define a naive Bayes model based on etiology that contains both data from the literature (variables of the formula) and prior information from experts:

> model <- etiology %>%
+   mutate(score = select(., starts_with("Score")) %>% rowMeans) %>% # averaging the scores
+   naive_bayes(Pathogen ~ S_Respiratory + S.Diarrhoea + S.CNS + S.Malaise +
+                          S.Leglesions + S.Suddendeath + old + young, ., .$score, 1)

Then we can combine this model and (pre-processed) surveillance data to make predictions:

> predictions <-
+   surveillance %>% 
+   mutate(old   = WEEK > 6, # using a cut-off of 6 w to separate youngs from old
+          young = !old) %>% 
+   make_episodes(fillgaps = TRUE) %>% 
+   predict2(model)

Using the model’s predictions

A pathogen table

Let’s start by defining a number of utilitary functions:

> summary_by <- function(x, name) {
+   x %>% 
+     colMeans() %>% 
+     `*`(100) %>% 
+     (function(x) lapply(list(names, I), function(f) f(x))) %>%
+     as.data.frame(stringsAsFactors = FALSE) %>%
+     setNames(c("Pathogen", name))
+ }

The above function is used exclusively by the following 2 functions. The function by_week computes the probability of presence of pathogens, by week of disease episode:

> by_week <- function(pred) {
+   pred %>%
+     select(-CYCLE, -FARMCODE, -WEEK, -antimicrobial, -episode) %>%
+     na.exclude() %>%
+     summary_by("proba_week")
+ }

The following function computes the probability of presence of pathogens, by disease episode:

> by_episode <- function(pred) {
+   pred %>%
+     select(-WEEK, -antimicrobial) %>%
+     na.exclude() %>%
+     unique() %>% 
+     select(-CYCLE, -FARMCODE, -episode) %>%
+     summary_by("proba_episode")
+ }

Let’s use the 2 above functions to generate the table of pathogens:

> table2 <- lapply(c(by_week, by_episode), function(f) f(predictions)) %>% 
+   do.call(merge, .) %>% 
+   merge(select(etiology, Pathogen, starts_with("Score"))) %>% 
+   arrange(desc(proba_week)) %>% 
+   mutate_at(-1, as.numeric) %>% 
+   mutate(x      = select(., starts_with("Score")) %>% rowMeans(),
+          y      = select(., starts_with("proba")) %>% rowMeans(),
+          ratio0 = x / y,
+          ratio  = ifelse(ratio0 > 1, ratio0, 1 / ratio0))
> ovv(table2)
                                          Pathogen proba_week proba_episode
  1      Infectious Bursal Disease virus (Gumboro)    14.7527       16.2418
  2                          Gallibacterium anatis    10.9674         7.277
  3                          Avian metapneumovirus     9.1017       10.5088
  4                   Erysipelothrix rhusiopathiae     8.6873        8.0736
  5                            Salmonella pullorum     6.1916        6.8166
  6                       Mycoplasma gallisepticum      5.935        6.8525
  .                                              .          .             .
  .                                              .          .             .
  .                                              .          .             .
 20                  Avian encephalomielitis virus     0.7148        0.9139
 21                    Avibacterium paragallinarum     0.6246        0.8119
 22                               Pseudomonas spp.     0.5528        0.5283
 23                     Eimeria spp. (coccidiosis)     0.5399        0.6635
 24 Pasteurella multocida (chronic pasteurellosis)      0.516        0.5226
 25        Highly Pathogenic Avian Influenza virus     0.3428        0.2079
 Score.DrThu Score.Juan Score.Niwat       x       y  ratio0   ratio
     10.0189    10.3686      9.7561 10.0478 15.4973  0.6484  1.5423
      0.4554     0.9426      0.9756  0.7912  9.1222  0.0867 11.5295
      0.4554     1.8852      1.9512  1.4306  9.8052  0.1459  6.8539
      0.4554     1.8852      0.4878  0.9428  8.3805  0.1125  8.8889
      6.3756     1.8852      4.3902   4.217  6.5041  0.6484  1.5423
      9.1081     5.6556      7.4146  7.3928  6.3938  1.1562  1.1562
           .          .           .       .       .       .       .
           .          .           .       .       .       .       .
           .          .           .       .       .       .       .
      0.4554     0.9426      1.9512  1.1164  0.8144  1.3709  1.3709
      3.6432     1.8852      3.9024  3.1436  0.7182  4.3769  4.3769
      0.9108     1.8852      0.4878  1.0946  0.5405   2.025   2.025
      3.7707     2.8278       4.878  3.8255  0.6017  6.3576  6.3576
      0.4554     1.8852       0.878  1.0729  0.5193  2.0661  2.0661
      9.1081    13.1963     11.7073 11.3372  0.2754 41.1699 41.1699
> lim <-
+   table2 %>%
+   select(proba_week, proba_episode) %>% 
+   range() %>% 
+   map2(c(floor, ceiling), function(x, f) f(x)) %>% 
+   unlist()
> 
> opar <- par(pty = "s")
> 
> with(table2, plot(proba_week, proba_episode, pch = 21, pty = "s", bty = "o",
+                   bg = adjustcolor("red", .3), xlim = lim, ylim = lim,
+                   xlab = "average score of presence per week",
+                   ylab = "average score of presence per episode"))
> 
> abline(0, 1)
> abline(0, 1.5, lty = 2)
> abline(0, 2 / 3, lty = 2)
> 
> table2 %>% 
+   filter(proba_week > 7) %$%
+   text(proba_week, proba_episode, abbreviations[Pathogen], pos = 4,
+        font = ifelse(grepl("\\.", abbreviations[Pathogen]), 3, 1))

> par(opar)
> with(table2, cor.test(proba_week, proba_episode))

    Pearson's product-moment correlation

data:  proba_week and proba_episode
t = 15.393, df = 23, p-value = 1.329e-13
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 0.8985951 0.9801202
sample estimates:
      cor 
0.9547351 

Looking at the correlations between the scores of the three experts:

> table2 %>% 
+   select(starts_with("Score")) %>% 
+   setNames(paste("expert", 1:3)) %>% 
+   mcgraph::paircor(digits = 2, cex = 1.5, pch = 21, bty_upper = "o",
+                    bty_lower = "o", bg = adjustcolor("red", .3),
+                    axeslim = c(0, 14), cex.labels = 1.5, cex.axis = 1.5,
+                    axes_lower = TRUE)

Let’s compare the average prior score with the average posterior score:

> opar <- par(pty = "s")
> 
> table2 %$% 
+   plot(x, y, pty = "s", bty = "o", col = "white",
+        xlim = c(0, max(x, y)),
+        ylim = c(0, max(x, y)),
+        xlab = "average prior score from experts",
+        ylab = "average posterior score from naive Bayes model")
> 
> abline(0, 1)
> for(i in c(.5, 2)) abline(0, i, lty = 2)
> for(i in c(3:10, 2:5 * 10)) {
+   abline(0, i, col = "lightgrey", lwd = .5)
+   abline(0, 1 / i, col = "lightgrey", lwd = .5)
+ }
> 
> table2 %$% points(x, y, pch = 21, bg = adjustcolor("red", .3)) 
> 
> table2 %>% 
+   filter(ratio > 2.1) %>% 
+   mutate(pos  = ifelse(Pathogen == "Eimeria spp. (coccidiosis)", 1,
+                        ifelse(Pathogen == "Avibacterium paragallinarum" , 3, 4)),
+          font = ifelse(Pathogen %in% c("Highly Pathogenic Avian Influenza virus",
+                                        "Infectious Bronchitis Virus",
+                                        "Avian metapneumovirus"), 1, 3)) %$% 
+   text(x, y, abbreviations[as.character(Pathogen)], .5, pos, font = font, cex = .75)

> 
> par(opar)
> with(table2, cor.test(x, y))

    Pearson's product-moment correlation

data:  x and y
t = 0.85005, df = 23, p-value = 0.4041
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 -0.2369441  0.5329078
sample estimates:
      cor 
0.1745267 

Let’s see another to compare the prior and posterior scores of presence of the pathogens. For that, we need a number of utilitary functions. First a tuned version of the graphics::barplot function:

> barplot2 <- function(df, width = 1, space = NULL, ...) {
+   if (is.null(space)) space <- .2
+   df %>%
+     select(prior, posterior) %>%
+     as.matrix() %>%
+     barplot(width, space, ...)
+   df %$%
+     segments(width + space, c(0, cumsum(prior)),
+              width + 2 * space, c(0, cumsum(posterior)), lwd = .5)
+   df
+ }

The following function calculate of the mid values of a vector:

> midval <- function(x) x %>% 
+   cumsum() %>%
+   cbind(c(0, .[-length(.)]), .) %>%
+   rowMeans()

The following function is a tuned version of the graphics::text function:

> text2 <- function(df, var, x, threshold = 0, ...) {
+   sel <- df[[var]] > threshold
+   textval <- abbreviations[df$Pathogen]
+   fontval <- ifelse(grepl("\\.", textval), 3, 1)
+   text(x, midval(df[[var]])[sel], textval[sel], font = fontval[sel])
+   invisible(df)
+ }

Let’s use the 3 previous functions for the following plot:

> width <- 1
> space <- .2
> table2 %>%
+   mutate(prior     = select(., starts_with("Score")) %>% rowMeans,
+          posterior = select(., starts_with("proba")) %>% rowMeans) %>%
+   arrange(desc(prior)) %T>% 
+   barplot2(width, space, ylab = "score") %>% 
+   text2("prior", (width + 2 * space) / 2, 4, cex = .9) %>% 
+   text2("posterior", (3 * width + 4 * space) / 2, 4, cex = .9)

Diseases episodes durations

Let’s now investigate the durations of the diseases episodes and see whether there are any difference between the pathogens. First, we need to removing the episodes that start or end a cycle:

> not_sick <- function(x, pattern) x %>%
+   as.character() %>%
+   paste(collapse = "") %>%
+   grepl(pattern, .)

The above function is used exclusively by the following function that is itself used exclusively by the function after:

> .not_sick_df <- function(df, pattern, name) df %>%
+   survey_episodes() %>%
+   t() %>%
+   as.data.frame() %>%
+   sapply(not_sick, pattern) %>%
+   data.frame2(names(.), .) %>% 
+   setNames(c("FARMCODE", name))
> not_sick_df <- function(pattern, name, df) df %>%
+   split(.$CYCLE) %>%
+   lapply(.not_sick_df, pattern, name) %>% 
+   do.call(function(...) bind_rows(..., .id = "CYCLE"), .) %>%
+   mutate(CYCLE = as.numeric(CYCLE))

We can now compute the durations of the diseases episodes for each pathogen, without excluding the episodes that start or end a cycle:

> epi_duration <- predictions %>%
+   group_by(CYCLE, FARMCODE, episode) %>%
+   na.exclude() %>%
+   tally() %>% 
+   ungroup()

and excluding them:

> epi_duration2 <- predictions %>%
+   map2(c("^1", "10"), c("startnotsick", "endnotsick"), not_sick_df, .) %>% 
+   do.call(merge, .) %>% 
+   filter(startnotsick, endnotsick) %>% 
+   select(-startnotsick, -endnotsick) %>%
+   left_join(epi_duration, c("CYCLE", "FARMCODE")) %>% 
+   na.exclude()

We can compare the distributions of the two computed durations in order to see there is a significant censoring effect:

> hist2 <- function(x, to = NULL, ...) {
+   if (is.null(to)) to <- max(x + 1)
+   hist(x, seq(0, to), xlab = "duration of episode",
+        ylab = "number of episodes", main = NA, ...)
+ }
> fct <- function(x, ...) hist2(x, 22, ylim = c(0, 110), ...)
> hist_all <- fct(epi_duration$n, col = "grey")
> par(new = TRUE)
> hist_sbs <- fct(epi_duration2$n, col = "red")

The two distributions are not significantly different:

> list(hist_all, hist_sbs) %>%
+   sapply(`[[`, "counts") %>%
+   fisher.test(simulate.p.value = TRUE, B = 1e7)

    Fisher's Exact Test for Count Data with simulated p-value (based on
    1e+07 replicates)

data:  .
p-value = 0.6241
alternative hypothesis: two.sided

Let’s now see whether there are differences in episodes durations between the various pathogens. For that we first need to tune a bit the vioplot::vioplot function:

> vioplot2 <- function(df, range=1.5, horizontal=FALSE, col="magenta",
+                      border="black", lty=1, lwd=1, rectCol="black",
+                      colMed="white", pchMed=19, at, add=FALSE, wex=1,
+                      drawRect=TRUE) {
+   if (missing(at)) at <- 1:length(df)
+   do.call(function(...) vioplot(df[, 1], ..., range = range,
+                                 horizontal = horizontal, col = col,
+                                 border = border, lty = lty, lwd = lwd,
+                                 rectCol = rectCol, colMed = colMed,
+                                 pchMed = pchMed, at = at, add = add, wex = wex,
+                                 drawRect = drawRect), df[, -1])
+ }

The following function arranges the columns of a data frame according to some criterion computed by the f function passed as an argument:

> arrange_columns <- function(df, f, ...) {
+   df[, order(apply(df, 2, f, ...))]
+ }

Let’s compute a processed version of the prediction table:

> predictions3 <- predictions %>%
+   select(-WEEK, -antimicrobial) %>%
+   na.exclude() %>%
+   unique() %>%
+   right_join(epi_duration, c("CYCLE", "FARMCODE", "episode")) %>%
+   mutate_at(vars(-one_of(c("CYCLE", "FARMCODE", "episode", "n"))), funs(. * n)) %>% 
+   select(-CYCLE, -FARMCODE, -episode, -n)

Let’s now see whether the medians (or mean, or min, or max) of these distributions are correlated with the probability of presence per episode. If yes, it means that the pathogens do not differ in terms of duration of infection. If not, it means that they do.

A linear model:

> model <- predictions3 %>% 
+   gather("x", "y") %>%
+   mutate(x = log(with(table2, setNames(proba_episode, Pathogen))[x]),
+          y = log(y)) %>%
+   lm(y ~ x, .)

The plot:

> # the plot frame ---------------------------------------------------------------
> plot(NA, xlim = c(-1, 3), ylim = c(-8, 2), axes = FALSE,
+      xlab = "average score of presence per episode",
+      ylab = "durations of infection episodes")
> 
> # the violin plots -------------------------------------------------------------
> table2 %>% 
+   arrange(proba_episode) %>% 
+   select(Pathogen) %>% 
+   unlist() %>% 
+   `[`(predictions3, .) %>% 
+   mutate_all(log) %>%
+   vioplot2(at = log(sort(table2$proba_episode)), col = adjustcolor("grey", .25),
+            drawRect = FALSE, wex = .5, add = T,
+            rectCol = adjustcolor("black", .25))
> 
> 
> # the confidence and prediction intervals --------------------------------------
> xs <- seq(-1, 3, le = 100)
> fct <- function(int, ltype) {
+   pred <- predict(model, data.frame(x = xs), interval = int)
+   with(as.data.frame(pred), {
+     lines(xs, lwr, lty = ltype)
+     lines(xs, upr, lty = ltype)
+   })
+ }
> abline(model)
> fct("confidence", 2)
> fct("prediction", 3)
> 
> # the log-scale axes -----------------------------------------------------------
> par(new = TRUE)
> plot(NA, xlim = exp(c(-1, 3)), ylim = exp(c(-8, 2)), log = "xy", ann = FALSE)

The probability of resistance for each pathogen

For each pathogen, we want to compute the probability of treatment failure (i.e. of using the incorrect treatment). The analysis will necessarily be by week. This is an NA returned value used by the function compute_resistance that follows after:

> naval <-
+   predictions %>%
+   select(-CYCLE, -FARMCODE, -WEEK, -antimicrobial, -episode) %>%
+   names() %>% 
+   setNames(rep(NA, length(.)), .)

The following function computes resistance level for each pathogen and the used set of antibiotics for a given week:

> 
> compute_resistance <- function(x) {
+   if (is.null(x)) return(naval)
+   amr %>%
+     filter(drug %in% x) %>%
+     select(-class, -drug) %>%
+     apply(2, min)
+ }

The following function allows to correct for the fact that there is not chlortetracycline information in the amr data set. We thus consider that it is a simple tetracycline:

> correction <- function(x) {
+   if (is.null(x)) return(NULL)
+   sub("CHLORTETRACYCLINE", "TETRACYCLINE", x)
+ }

Let’s use the 2 previous functions to compute the levels of resistance:

> resistance <- predictions$antimicrobial %>% 
+   lapply(correction) %>%
+   lapply(compute_resistance) %>% 
+   do.call(bind_rows, .)

The following function allows to separate the columns of a data frame into two sets of data frames, according to a pattern to match in the columns names:

> separate_columns <- function(df, match) {
+   select(df, ends_with(match)) %>% 
+     setNames(., sub(match, "", names(.)))
+ }

A utilitary function that applies 2 different functions to a list of 2 lists:

> f1 <- function(df) sweep(df, 2, apply(df, 2, sum), "/")
> f2 <- function(df, var) df[, var]
> f3 <- function(lst) list(f1(lst[[1]]), f2(lst[[2]], names(lst[[1]]))) 

The following function uses the 2 previous functions to compute the levels of resistance:

> proba_treat_failure <- function(pred, res) {
+   cbind(setNames(pred, paste0(names(pred), "_p")),
+         setNames(res,  paste0(names(res),  "_r"))) %>% 
+     filter(!sapply(.$antimicrobial_p, is.null), !is.na(episode_p)) %>%
+     select(-CYCLE_p, -FARMCODE_p, -WEEK_p, -antimicrobial_p, -episode_p) %>%
+     (function(df) lapply(c("_p", "_r"),
+                          function(match) separate_columns(df, match))) %>% 
+     f3() %>% 
+     do.call(`*`, .) %>% 
+     apply(2, sum)
+ }

And it gives:

> proba_treat_failure(predictions, resistance) %>% head()
               Avian encephalomielitis virus 
                                   100.00000 
                       Avian metapneumovirus 
                                   100.00000 
                 Avibacterium paragallinarum 
                                    25.16986 
                       Chicken Anaemia Virus 
                                   100.00000 
                          Chlamydia psittaci 
                                    33.41604 
Clostridium perfringens (Necrotic enteritis) 
                                    37.70307 

The following function computes the probability of use of antimicrobials:

> proba_use_ab <- function(df) {
+   df %>%
+     filter(!is.na(episode)) %>%
+     mutate(antimicrobial = !sapply(antimicrobial, is.null)) %>%
+     select(-CYCLE, -FARMCODE, -WEEK, -episode) %>%
+     mutate_if(is.numeric, funs(antimicrobial * . / sum(.))) %>%
+     select(-antimicrobial) %>%
+     apply(2, sum)
+ }

And it gives:

> proba_use_ab(predictions) %>% head()
               Avian encephalomielitis virus 
                                   0.4769973 
                       Avian metapneumovirus 
                                   0.4104427 
                 Avibacterium paragallinarum 
                                   0.4066906 
                       Chicken Anaemia Virus 
                                   0.4161991 
                          Chlamydia psittaci 
                                   0.4377418 
Clostridium perfringens (Necrotic enteritis) 
                                   0.4079402 

Let’s include the two previous functions into a pipe that gathers the prevalence, the probability of antimicrobial use, the probability of treatment failure and the the combined probability of treatment failure for all the pathogens:

> a <- proba_treat_failure(predictions, resistance)
> b <- proba_use_ab(predictions)
> 
> table3 <- list(select(table2, Pathogen, proba_week),
+                data.frame2(Pathogen = names(b), proba_use_ab = 100 * b),
+                data.frame2(Pathogen = names(a), proba_treat_failure = a)) %>% 
+   Reduce(function(x, y) merge(x, y, all = TRUE), ., accumulate = FALSE) %>% 
+   arrange(desc(proba_week)) %>% 
+   mutate(proba_tot = 100 * (proba_treat_failure / 100) *
+                            (proba_use_ab / 100) *
+                            (proba_week / 100))

Adding the total level of resistance

> bind_rows(table3, setNames(data.frame2("total", NA, NA, NA, sum(table3$proba_tot)), names(table3))) %>% ovv()
                                          Pathogen proba_week proba_use_ab
  1      Infectious Bursal Disease virus (Gumboro)    14.7527      45.0123
  2                          Gallibacterium anatis    10.9674      44.4418
  3                          Avian metapneumovirus     9.1017      41.0443
  4                   Erysipelothrix rhusiopathiae     8.6873       40.613
  5                            Salmonella pullorum     6.1916      45.0123
  6                       Mycoplasma gallisepticum      5.935      41.0443
  .                                              .          .            .
  .                                              .          .            .
  .                                              .          .            .
 21                    Avibacterium paragallinarum     0.6246      40.6691
 22                               Pseudomonas spp.     0.5528       46.224
 23                     Eimeria spp. (coccidiosis)     0.5399      44.2778
 24 Pasteurella multocida (chronic pasteurellosis)      0.516      44.7388
 25        Highly Pathogenic Avian Influenza virus     0.3428       45.482
 26                                          total       <NA>         <NA>
 proba_treat_failure proba_tot
                 100    6.6405
             37.2064    1.8135
                 100    3.7357
             42.0195    1.4825
               6.789    0.1892
             51.6126    1.2573
                   .         .
                   .         .
                   .         .
             25.1699    0.0639
             18.7814     0.048
                 100    0.2391
               4.272    0.0099
                 100    0.1559
                <NA>   26.2162

It means that whenever we use an antimicrobial during a disease episode, there about 26 % chance of treatment failure. Let’s end this section by looking at the share of viruses in this treatment failure:

> (virus_share <- table3 %>%
+   filter(grepl("irus", Pathogen)) %>%
+   select(proba_tot) %>%
+   sum())
[1] 18.3178

The percentage of treatement failure due to viruses:

> 100 * virus_share / sum(table3$proba_tot)
[1] 69.87206

The proba per week for viruses:

> table3 %>%
+   filter(grepl("irus", Pathogen)) %>%
+   select(proba_week) %>%
+   sum()
[1] 41.96493

Let’s look at the relationship between the prevalence and resistance of pathogens:

> with(table3, plot(proba_week, proba_treat_failure, ylim = c(0, 100), pch = 21,
+                   xlab = "probability of presence", xlim = c(0, 15),
+                   ylab = "resistance", bg = adjustcolor("red", .3)))
> 
> table3 %>%
+   filter(proba_week > 7) %>%
+   mutate(pos = ifelse(grepl("Erysipelothrix", Pathogen), 3, 1)) %$% 
+   text(proba_week, proba_treat_failure, abbreviations[Pathogen], pos = pos,
+        font = ifelse(grepl("\\.", abbreviations[Pathogen]), 3, 1))

The probability of resistance for each drug

For each antibiotic, let’s compute the probability of usage per week. First, the probability of usage per week of episode:

> table4 <- amr$drug %>% 
+   sapply(function(x) sapply(predictions$antimicrobial, function(y) x %in% y)) %>% 
+   cbind(select(predictions, episode)) %>% 
+   split(., is.na(.$episode)) %>% 
+   lapply(function(x) select(x, -episode) %>% colMeans) %>% 
+   data.frame() %>% 
+   transmute(drug     = rownames(.),
+             no_dis_w = FALSE.,
+             dis_week = TRUE.)

The ratio of the number of week with disease over the number of weeks without diseases:

> ratio <- predictions %>%
+   select(episode) %>% 
+   is.na() %>% 
+   table() %>% 
+   as.list() %>% 
+   do.call(`/`, .)

A linear model and its confidence intervals:

> model <- with(table4, lm(dis_week ~ no_dis_w))
> xs <- seq(-.02, .22,, le = 100)
> pred95 <- model %>% 
+   predict(data.frame(no_dis_w = xs), interval = "confidence") %>% 
+   as.data.frame()
> pred99 <- model %>% 
+   predict(data.frame(no_dis_w = xs), interval = "confidence", level = .99) %>% 
+   as.data.frame()

The plot:

> # the fram of the plot ---------------------------------------------------------
> with(table4, plot(dis_week ~ no_dis_w, type = "n",
+                   xlab = "probability of use during a week without disease",
+                   ylab = "probability of use during a week with disease"))
> 
> # the linear model prediction with confidence intervals ------------------------
> polygon(c(xs, rev(xs)), c(pred99$lwr, rev(pred99$upr)),
+         col = "lightgrey", border = NA)
> polygon(c(xs, rev(xs)), c(pred95$lwr, rev(pred95$upr)),
+         col = "grey", border = NA)
> lines(xs, pred95$fit)
> abline(0, ratio, lty = 2) # the expectation
> 
> # adding the data points -------------------------------------------------------
> with(table4, points(dis_week ~ no_dis_w, pch = 21, bg = adjustcolor("red", .3)))
> 
> # the points labels ------------------------------------------------------------
> coeff <- coef(model)
> table4 %>% 
+   mutate(pos = ifelse(dis_week > coeff[1] + coeff[2] * no_dis_w, 2, 4),
+          pos = ifelse(no_dis_w > .19, 2, pos),
+          pos = ifelse(drug %in% c("NEOMYCIN"), 3, pos),
+          pos = ifelse(drug %in% c("AMOXICILLIN", "TRIMETHOPRIM"), 2, pos),
+          pos = ifelse(drug %in% c("SULFAMETHOXAZOLE"), 4, pos)) %>% 
+   filter(no_dis_w > .02) %$%
+   text(no_dis_w, dis_week, abbreviations[drug], pos = pos)

In the following temporary data frame we put next to each other the antimicrobial usage (boolean) for each week and each drug and the probability of presence, for each week and each pathogen. Note that each row of the tmp data frame is a week and that for each week of this data frame there is both a disease episode and the use of antimicriobial(s).

> tmp <- amr$drug %>% 
+   sapply(function(x) sapply(predictions$antimicrobial, function(y) x %in% y)) %>% 
+   cbind(select(predictions, -CYCLE, -FARMCODE, -WEEK)) %>% 
+   filter(!sapply(antimicrobial, is.null), !is.na(episode)) %>% 
+   select(-antimicrobial, -episode)

The following function compute the level of resistance for a given drug in a given week. This level of resistance depends on the assemblage of the different pathogens during the week in question. Note that we need to rename the drugs because the dplyr::select function does not manage variables names that contain space.

> amr2 <- amr
> amr2$drug <- sub(" ", "_", amr2$drug)
> resistance_for_drug <- function(x) {
+   amr2 %>% 
+     filter(drug == x) %>%
+     select(-class, -drug) %>%
+     unlist() %>%
+     data.frame2(Pathogen = names(.), proba = .)
+ } 

The following function uses the tmp temporary data frame and the above function and computes the resistance level of a given drug (for all weeks). Note that we here too have to deal with the space in some of the variables’ names.

> names(tmp) <- sub("OXOLINIC ACID", "OXOLINIC_ACID", names(tmp))
> drug_resistance <- function(drug) {
+   tmp %>% 
+     filter_(drug) %>% 
+     select_if(is.numeric) %>% 
+     colSums() %>% 
+     (function(x) x / sum(x)) %>% 
+     data.frame2(Pathogen = names(.), proba = .) %>% 
+     left_join(resistance_for_drug(drug), "Pathogen") %$%
+     sum(proba.x * proba.y)
+ }

which gives:

> (a <- sapply(amr2$drug, drug_resistance))
           AMOXICILLIN             AMPICILLIN             CEPHALEXIN 
              66.15594               65.30821                    NaN 
            CEFOTAXIME              APRAMYCIN             GENTAMICIN 
                   NaN               67.74840               69.87261 
         SPECTINOMYCIN           STREPTOMYCIN               NEOMYCIN 
              74.95046               71.98947               70.14567 
          ERYTHROMYCIN                TYLOSIN             LINCOMYCIN 
              72.71759               84.90127               79.92009 
            SPIRAMYCIN              JOSAMYCIN             TILMICOSIN 
              84.38410               77.49651               76.45433 
       CHLORAMPHENICOL            FLORFENICOL          THIAMPHENICOL 
              56.02207               53.17762               55.97457 
          TETRACYCLINE        OXYTETRACYCLINE            DOXYCYCLINE 
                   NaN               70.61575               70.39626 
 SULFACHLOROPYRIDAZINE       SULFADIMETHOXINE          SULFADIMIDINE 
                   NaN               73.60987               73.72130 
         SULFAGUANIDIN SULFAMETHOXYPYRIDAZINE           SULFADIAZINE 
              80.41504               76.26193               80.41504 
        SULFAMETHAZINE       SULFAMETHOXAZOLE          SULFATHIAZOLE 
                   NaN               76.44810               69.11633 
          TRIMETHOPRIM          CIPROFLOXACIN           ENROFLOXACIN 
              69.25256               59.06172               52.75907 
            FLUMEQUINE            NORFLOXACIN          OXOLINIC_ACID 
              53.01580               57.29017               59.77029 
             ENRAMYCIN               COLISTIN 
                   NaN               81.27459 

Note that there are 6 drugs that are just never used during disease episodes:

> sapply(names(a[is.nan(a)]), function(x) sum(tmp[[x]]))
           CEPHALEXIN            CEFOTAXIME          TETRACYCLINE 
                    0                     0                     0 
SULFACHLOROPYRIDAZINE        SULFAMETHAZINE             ENRAMYCIN 
                    0                     0                     0 

Adding the resistance level to table4:

> table4b <- table4 %>% 
+   left_join(data.frame2(drug = sub("_", " ", names(a)), resistance = a), "drug") %>% 
+   arrange(desc(no_dis_w + dis_week))

Which gives:

> ovv(table4b)
                     drug no_dis_w dis_week resistance
  1              COLISTIN   0.2044   0.0848    81.2746
  2       OXYTETRACYCLINE   0.1607   0.0711    70.6158
  3               TYLOSIN   0.0797   0.0332    84.9013
  4           DOXYCYCLINE   0.0591   0.0269    70.3963
  5            GENTAMICIN   0.0514   0.0155    69.8726
  6           AMOXICILLIN   0.0463   0.0143    66.1559
  .                     .        .        .          .
  .                     .        .        .          .
  .                     .        .        .          .
 33            CEPHALEXIN        0    6e-04        NaN
 34            CEFOTAXIME        0    6e-04        NaN
 35        SULFAMETHAZINE        0    6e-04        NaN
 36             ENRAMYCIN        0    6e-04        NaN
 37          TETRACYCLINE        0        0        NaN
 38 SULFACHLOROPYRIDAZINE        0        0        NaN

Plotting the level of resistance of each antibiotic as a function of the probability of being used.

> with(table4b, plot(no_dis_w + dis_week, resistance,
+      xlab = "probability of usage", pch = 21, bg = adjustcolor("red", .3)))
> 
> table4b %>% 
+   filter(no_dis_w + dis_week > .1 | resistance < 65) %>%
+   mutate(pos = ifelse(resistance > 60, 2, 4),
+          pos = ifelse(abbreviations[drug] %in% c("oxa", "ffc"), 3, pos),
+          pos = ifelse(abbreviations[drug] %in% c("flu"), 2, pos),
+          pos = ifelse(abbreviations[drug] %in% c("cam"), 1, pos)) %$% 
+   text(no_dis_w + dis_week, resistance, abbreviations[drug], pos = pos)

It shows that the more a drug is used, the more it tends to be resistant. It also shows that there is a group of 8 antimicrobials that could certainly be used more advantageously. To end this section, let’s look at what would be the best antibiotic use if antimicriobials were still to be basically used at random. The following function takes a vector a pathogens’ prevalence and a vector of antimicrobials usage (expressed in probability) and use the amr data frame to draw a heatmap showing for what drug and pathogen combination resistance level is the highest.

> heat_map <- function(path_p, drug_p) {
+   path_p <- sort(path_p, TRUE)
+   drug_p <- sort(drug_p)
+   path_names <- names(path_p)
+   drug_names <- names(drug_p)
+ # the colors -------------------------------------------------------------------
+   col <- amr %>%
+     right_join(data.frame2(drug = drug_names), "drug") %>% 
+     select(path_names) %>% 
+     t() %>%
+     as.vector() %>% 
+     cut(c(-.1, seq(10, 100, 10))) %>% 
+     as.numeric() %>% 
+     `[`(rev(heat.colors(10)), .)
+ # the coordinates --------------------------------------------------------------
+   x_path <- c(0, cumsum(path_p))
+   y_drug <- c(0, cumsum(drug_p))
+   coords <- expand.grid(seq_along(path_names), seq_along(drug_names))
+ # the heat map -----------------------------------------------------------------
+   opar <- par(pty = "s")
+   on.exit(par(opar))
+   plot(NA, xlim = c(0, sum(path_p)), ylim = c(0, sum(drug_p)), axes = FALSE,
+      xlab = "pathogens", ylab = "drugs", xaxs = "i", yaxs = "i")
+   with(coords, rect(x_path[Var1],     y_drug[Var2],
+                     x_path[Var1 + 1], y_drug[Var2 + 1], col = col, border = NA))
+   abline(v = cumsum(path_p), lwd = .5)
+   abline(h = cumsum(drug_p), lwd = .5)
+   box(bty = "o")
+ }

And it gives:

> heat_map(with(table3, setNames(proba_week, Pathogen)),
+          with(table4, setNames(no_dis_w + dis_week, drug)))

The same heatmap, without viruses and parasite:

> heat_map(with(filter(table3, round(proba_treat_failure, 6) < 100),
+               setNames(proba_week, Pathogen)),
+          with(table4, setNames(no_dis_w + dis_week, drug)))

Let’s compute the resistance of the community: