The analysis detailed here requires the presence of the following data files in a directory named data in the current working directory:
> dir("data")
[1] "abbreviations.txt"
[2] "AMR_pathogens_221217.xlsx"
[3] "Scoring_pathogens_DrThu_Juan_Niwat_221217.xlsx"
[4] "Use_Antimicrobial_by_week_221217.xlsx"
If not installed:
> install.packages(c("magrittr", "readxl", "naivebayes", "dplyr", "devtools",
+ "vioplot", "magrittr", "tidyr", "purrr"))
> devtools::install_github("choisy/mcutils")
Loading required packages for interactive use:
> library(vioplot) # vioplot
> library(tidyr) # gather
> library(mcutils) # ovv
> library(magrittr) # %>%, %$%, %<>%
> library(readxl) # read_excel
> library(naivebayes) # naive_bayes, predict.naive_bayes
> library(dplyr) # filter, funs, group_by, left_join, matches, mutate,
> # mutate_at, starts_with, right_join, select, summarize,
> # summarise_all, ungroup
> data.frame2 <- function(...) data.frame(..., stringsAsFactors = FALSE)
There are 2 types of data collected from the farms: symptoms surveillance and antibiotics usage. There are 2 types of data from the literature and expert opinion: antibiotic resistance and diseases symptoms.
The function reshape takes 4 vectors cycle, farmcode, min and max and returns a data frame of 3 columns with the sequence from min to max (by step of 1) for the given cycles and farms.
> reshape <- function(cycle, farmcode, min, max) {
+ require(purrr) # map2
+ map2(min, max, seq) %>%
+ map2(farmcode, ., data.frame2) %>%
+ map2(cycle, ., data.frame2) %>%
+ do.call(rbind, .)
+ }
The above function reshape is used exclusively by the function add_missing_rows that works on a data frame:
> add_missing_rows <- function(df) {
+ require(magrittr) # %$%, %>%
+ require(dplyr) # group_by, summarise, left_join
+ by <- c("CYCLE", "FARMCODE", "WEEK")
+ df %>%
+ group_by(CYCLE, FARMCODE) %>%
+ summarise(min = min(WEEK), max = max(WEEK)) %$%
+ reshape(CYCLE, FARMCODE, min, max) %>%
+ setNames(by) %>%
+ left_join(df, by)
+ }
(there is indeed week 4 for cycle 2, farm 88 that is missing). The following function uses the previously described functions add_missing_rows and reads a file of symptoms:
> read_symptoms <- function(file) {
+ require(dplyr)
+ file %>%
+ readxl::read_excel() %>%
+ select(FARMCODE, CYCLE, WEEK, starts_with("S")) %>% # deletes useless columns
+ filter(!is.na(FARMCODE)) %>% # removes empty lines
+ mutate_if(is.numeric, as.integer) %>%
+ add_missing_rows() %>% # (there is still one missing row...)
+ mutate_at(vars(starts_with("S")), funs(. > 0)) # recodes symptoms
+ }
The following function reads AMU data:
> read_amu <- function(file) {
+ require(dplyr)
+ file %>%
+ readxl::read_excel() %>%
+ rename(WEEK = WEEKNO,
+ CYCLE = FLOCKSEQUENCE) %>%
+ mutate_at(vars(matches("^C|W")), as.integer)
+ }
The following function extract the vector of antibiotics names used for a given cycle, farm and week. df is the data frame that contains the antimicrobial column in addition to the cycle, farm and week columns.
> extract_ab <- function(cycle, farm, week, df) {
+ require(dplyr)
+ df %>%
+ filter(CYCLE == cycle, FARMCODE == farm, WEEK == week) %>%
+ select(ANTIMICROBIAL) %>%
+ unlist() %>%
+ unname()
+ }
The following function uses read_symptoms, read_amu and extract_ab and returns the surveillance data:
> read_surveillance <- function(symptoms_file, amu_file) {
+ require(dplyr)
+ symptoms <- read_symptoms(symptoms_file)
+ amu <- read_amu(amu_file)
+ select(amu, -ANTIMICROBIAL) %>%
+ unique() %>%
+ mutate(antimicrobial = pmap(list(CYCLE, FARMCODE, WEEK), extract_ab, amu),
+ # correcting some antimicrobials' names:
+ antimicrobial = lapply(antimicrobial,
+ function(x) sub("OXTETRACYCLINE",
+ "OXYTETRACYCLINE", x)),
+ antimicrobial = lapply(antimicrobial,
+ function(x) sub("SULFADIMETHOXIN",
+ "SULFADIMETHOXINE", x))) %>%
+ # joining with symptoms before returning:
+ right_join(symptoms, by = c("CYCLE", "FARMCODE", "WEEK"))
+ }
This function reads the data linking symptoms to etiology from literature and experts opinions:
> read_etiology <- function(file) {
+ file %>% readxl::read_excel() %>%
+ mutate_at(vars(matches("^S_|S\\.")), funs(!is.na(.))) %>%
+ mutate(old = Age %in% c("All ages", "Older"),
+ young = Age %in% c("All ages", "Young")) %>%
+ select(-`High mortality`, -X__1, -Age)
+ }
The function replace_na replaces each missing value of a vector by the last non-missing value that precedes it:
> replace_na <- function(x) {
+ for(i in 2:length(x))
+ if(is.na(x[i])) x[i] <- x[i - 1]
+ x
+ }
This function is used exclusively by the following read_amr function:
> .read_amr <- function(file) {
+ file %>%
+ readxl::read_excel() %>%
+ rename(class = X__1, drug = X__2) %>%
+ select(-starts_with("X__")) %>%
+ mutate(class = replace_na(class))
+ }
This function reads the AMR data and adds information about viruses:
> read_amr <- function(file, viruses = NULL) {
+ file %>%
+ .read_amr() %>%
+ cbind(setNames(data.frame(matrix(rep(100, length(viruses)), 1, length(viruses))), viruses)) %>%
+ # correting some parasites and drugs names:
+ rename(`Ornithobacterium rhinotracheale` = `Ornitobacterium rhinotracheale`) %>%
+ mutate(drug = sub("SULPHAMETHOXAZOLE", "SULFAMETHOXAZOLE", drug),
+ drug = sub("SULPHATHIAZOLE", "SULFATHIAZOLE", drug))
+ }
The data files:
> symptoms_file <- "data/Disasease_in_flocks_221217_cBao.xlsx"
> amu_file <- "data/Use_Antimicrobial_by_week_221217.xlsx"
> etiology_file <- "data/Scoring_pathogens_DrThu_Juan_Niwat_221217.xlsx"
> amr_file <- "data/AMR_pathogens_221217.xlsx"
Surveillance data, including symptoms and AMU:
> surveillance <- read_surveillance(symptoms_file, amu_file)
Data on etiologic causes:
> etiology <- read_etiology(etiology_file)
AMR data:
> amr <- read_amr(amr_file,
+ grep("virus|coccidiosis", etiology$Pathogen, TRUE, value = TRUE))
Reading the abbreviations of pathogens and drugs’ names:
> abbreviations <- "data/abbreviations.txt" %>%
+ read.csv(FALSE, stringsAsFactors = FALSE) %$%
+ setNames(V2, V1)
The function fill_gaps takes a vector of logicals (or zeros and ones) and replaces all the 1 0 1 sequences by 1 1 1:
> fill_gaps <- function(x) {
+ as.integer(x) %>% # because easier to deal with 1-character encoding
+ paste(collapse = "") %>%
+ gsub("101", "111", .) %>%
+ gsub("NA", "x", .) %>% # need to replace potential NA by 1-character...
+ strsplit("") %>% # ... because of the strsplit that follows
+ unlist(FALSE) %>% # because strsplit returns a list
+ sub("x", NA, .) %>% # back to NA
+ as.numeric() %>%
+ as.logical()
+ }
The function episodes identifies the different disease episodes. It would transform 1 1 1 0 0 1 1 0 0 0 1 0 into 1 1 1 0 0 2 2 0 0 0 3 0:
> episodes <- function(x) {
+ x <- rle(x)
+ sel <- which(x$values > 0)
+ x$values[sel] <- seq_along(sel)
+ inverse.rle(x)
+ }
The following function uses the two above-defined functions fill_gaps and episodes to generate an episode column on a surveillance data frame. There is one fillgaps option that specifies whether 1 0 1 sequences of weeks should be transformed into 1 1 1.
> make_episodes <- function(df, fillgaps = TRUE) {
+ require(dplyr)
+ if (fillgaps) f <- fill_gaps else f <- I
+ df %>%
+ mutate(disease = select(., starts_with("S")) %>% # computing...
+ rowSums() %>% # ... the presence...
+ `>`(., 0)) %>% # ... of disease
+ group_by(CYCLE, FARMCODE) %>%
+ mutate(disease = f(disease), # optionally transforms "1 0 1" into "1 1 1"
+ episode = episodes(disease)) %>% # generates diseases episodes
+ group_by(episode, add = TRUE) %>%
+ mutate_if(is.logical, any) %>% # merge the symptoms for each episod
+ ungroup() %>%
+ select(-disease) %>%
+ mutate_at(vars(starts_with("S"), episode), # removes symptoms where there...
+ funs(ifelse(episode > 0, ., NA))) # ... is no disease episode
+ }
> (cycles <- surveillance %>%
+ make_episodes(fillgaps = TRUE) %>%
+ group_by(CYCLE) %>%
+ select(CYCLE, FARMCODE) %>%
+ unique() %>%
+ ungroup() %>%
+ split(.$CYCLE) %>%
+ lapply(`[[`, 2))
$`1`
[1] "75-001" "75-002" "75-003" "75-004" "75-005" "75-006" "75-007" "75-008"
[9] "75-009" "75-010" "75-011" "75-012" "75-013" "75-014" "75-015" "75-016"
[17] "75-018" "75-019" "75-020" "75-021" "75-022" "75-023" "75-026" "75-027"
[25] "75-028" "75-029" "75-030" "75-031" "75-032" "75-033" "75-034" "75-035"
[33] "75-036" "75-037" "75-038" "75-039" "75-040" "75-041" "75-042" "75-043"
[41] "75-044" "75-045" "75-046" "75-047" "75-048" "75-049" "75-050" "75-051"
[49] "75-052" "75-053" "75-061" "75-062" "75-063" "75-064" "75-065" "75-066"
[57] "75-067" "75-068" "75-069" "75-070" "75-071" "75-072" "75-073" "75-074"
[65] "75-075" "75-076" "75-077" "75-078" "75-079" "75-080" "75-081" "75-082"
[73] "75-083" "75-085" "75-086" "75-087" "75-088" "75-089" "75-090" "75-091"
[81] "75-092" "75-093" "75-094" "75-095" "75-096" "75-097" "75-098" "75-099"
[89] "75-100" "75-101" "75-102" "75-103" "75-104" "75-105" "75-106"
$`2`
[1] "75-001" "75-002" "75-004" "75-005" "75-006" "75-008" "75-011" "75-012"
[9] "75-014" "75-015" "75-019" "75-020" "75-021" "75-022" "75-025" "75-026"
[17] "75-027" "75-028" "75-029" "75-030" "75-031" "75-033" "75-034" "75-036"
[25] "75-039" "75-040" "75-041" "75-044" "75-046" "75-047" "75-049" "75-062"
[33] "75-064" "75-065" "75-070" "75-072" "75-073" "75-074" "75-079" "75-080"
[41] "75-081" "75-086" "75-087" "75-088" "75-091" "75-095"
$`3`
[1] "75-012" "75-021" "75-027" "75-029" "75-047" "75-070" "75-080"
> sapply(cycles, length)
1 2 3
95 46 7
All farms from cycle 3 are in cycle 2:
> all(cycles[[3]] %in% cycles[[2]])
[1] TRUE
But one farm from cyle 2 is not in cycle 1:
> all(cycles[[2]] %in% cycles[[1]])
[1] FALSE
> cycles[[2]][!cycles[[2]] %in% cycles[[1]]]
[1] "75-025"
Note that in some farms, some weeks, there is no antibiotic use, whereas in other weeks and farms, there is 1 or more than 1 antibiotic used at the same time:
> amu_file %>%
+ read_amu() %$%
+ table(FARMCODE, WEEK) %>%
+ ovv()
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 20 21
75-001 4 0 2 2 4 0 0 1 1 1 2 0 0 0 0 2 0 0 0 0
75-002 4 2 0 2 3 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0
75-003 0 2 2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
75-004 4 2 2 4 2 1 0 0 1 1 0 1 0 0 0 0 0 0 0 0
75-005 4 2 0 2 2 2 2 0 0 3 0 2 0 0 0 0 0 0 0 0
75-006 4 1 0 1 4 2 2 3 0 4 2 2 3 3 0 0 0 0 0 0
. . . . . . . . . . . . . . . . . . . . .
. . . . . . . . . . . . . . . . . . . . .
. . . . . . . . . . . . . . . . . . . . .
75-101 3 2 0 1 0 3 3 0 0 0 0 0 0 0 0 0 0 0 0 0
75-102 3 0 3 0 0 0 3 0 0 2 2 1 0 0 0 0 0 0 0 0
75-103 2 3 2 2 0 0 2 0 2 2 2 0 0 0 0 0 0 0 0 0
75-104 2 4 4 5 2 2 4 5 4 3 4 2 4 2 2 0 0 0 0 0
75-105 2 0 0 0 0 0 0 3 2 0 0 0 0 0 0 0 0 0 0 0
75-106 2 0 0 0 2 4 1 0 0 0 0 0 0 0 0 0 0 0 0 0
The function survey_episodes puts in shape a surveillance object (for a given cycle) for easier visualization of both the surveyed weeks and the weeks for which there is a disease episode. 0 means absence of survey, 1 means survey but absence of disease episode and 2 means survey and disease episode.
> survey_episodes <- function(df) {
+ # utilitary function -----------------------------------------------------------
+ make_farm_week_table <- function(df) {
+ df %$%
+ table(FARMCODE, WEEK) %>%
+ unclass() %>%
+ as.data.frame()
+ }
+ surveyed_weeks <- make_farm_week_table(df)
+ df %>% filter(episode > 0) %>%
+ make_farm_week_table() %>%
+ mutate(FARMCODE = rownames(.)) %>%
+ right_join(data.frame2(FARMCODE = rownames(surveyed_weeks)), "FARMCODE") %>%
+ select(-FARMCODE) %>%
+ t() %>%
+ as.data.frame() %>%
+ mutate(., WEEK = rownames(.)) %>%
+ right_join(data.frame2(WEEK = colnames(surveyed_weeks)), "WEEK") %>%
+ select(-WEEK) %>%
+ t() %>%
+ as.data.frame() %>%
+ mutate_all(funs(replace(., is.na(.), 0))) %>%
+ `+`(surveyed_weeks, .)
+ }
The following function sorts the farms of an output of the survey_episodes function according to the duration of their cycle:
> sort_episodes <- function(df) {
+ df[names(sort(rowSums(df > 0))), ]
+ }
Among the arguments of the following function, df is an output of the function survey_episodes (for one given cycle):
> plot_survey_episode <- function(df, sort = TRUE, with_amu = FALSE,
+ col_no_survey = "grey",
+ col_survey_no_dis = "blue",
+ col_survey_disease = "red",
+ amu_col = RColorBrewer::brewer.pal(7, "YlOrRd")) {
+ require(magrittr)
+ # a tuned image function ---------------------------------------------------------
+ image2 <- function(z) {
+ image(1:nrow(z), 1:ncol(z), z,
+ xlab = "week", ylab = "farm",
+ col = c(col_no_survey, col_survey_no_dis, col_survey_disease))
+ abline(v = 1:nrow(z) - .5, col = "white", lwd = .5)
+ abline(h = 1:ncol(z) - .5, col = "white", lwd = .5)
+ box(bty = "o")
+ z # output equal to input to make it pipe compliant
+ }
+ # a function to retrieve farms codes ---------------------------------------------
+ get_farms_codes <- function(df) setNames(1:ncol(df), colnames(df))
+ # optionally sorting the data ----------------------------------------------------
+ if (sort) optionally_sort <- sort_episodes else optionally_sort <- I
+ # plotting the survey and disease episodes ---------------------------------------
+ farms <- # codes of the farms for output
+ df %>%
+ survey_episodes() %>%
+ optionally_sort() %>%
+ t() %>%
+ image2() %>%
+ get_farms_codes()
+ # adding AMU ---------------------------------------------------------------------
+ if (with_amu) {
+ df %>%
+ mutate(nb_ab = sapply(antimicrobial, length)) %>%
+ filter(nb_ab > 0) %$%
+ points(WEEK, farms[FARMCODE], pch = 19, col = amu_col[nb_ab])
+ }
+ # returning output ---------------------------------------------------------------
+ farms
+ }
The following function calculates the prevalence of disease by week and across farms. It takes an output of the function survey_episodes (for one given cycle) as input argument.
> binom_test <- function(df, ci = .95) {
+ df %>%
+ (function(y)
+ Map(function(x, n)
+ unlist(binom.test(x, n, conf.level = ci)[c("estimate", "conf.int")]),
+ colSums(y > 1), colSums(y > 0))) %>%
+ data.frame
+ }
The following function uses the binom_test function to plot the prevalence of disease by week and across farms, with confidence interval.
> plot_nb_episodes <- function(df) {
+ xlim <- range(df$WEEK) + c(-.5, .5)
+ df %<>% survey_episodes()
+ prevalences <- binom_test(df)
+ xval <- as.numeric(names(df)) %>%
+ (function(x) as.vector(t(cbind(x - .5, x + .5))))
+ plot(NA, xlim = xlim, ylim = c(0, 1), type = "n", xaxs = "i", yaxs = "i",
+ axes = FALSE, ann = FALSE)
+ abline(h = seq(0, 1, .1), col = "lightgrey", lwd = .75)
+ abline(h = seq(0, 1, .2), col = "darkgrey")
+ abline(v = seq(.5, 26.5, 1), col = "lightgrey")
+ polygon(c(xval, rev(xval)),
+ c(rep(prevalences[2, ], each = 2), rep(rev(prevalences[3, ]), each = 2)),
+ border = NA, col = adjustcolor("lightblue", .6))
+ lines(xval, rep(prevalences[1, ], each = 2), col = "darkblue")
+ axis(3)
+ mtext("week", 3, 1.5)
+ axis(4)
+ mtext("prevalence", 4, 1.5)
+ box(bty = "o")
+ }
The following function puts the previous plot_survey_episode and plot_nb_episodes together:
> plot_epis_amu <- function(df, sort = TRUE, with_amu = FALSE, top = .25,
+ col_no_survey = "grey",
+ col_survey_no_dis = "blue",
+ col_survey_disease = "red",
+ amu_col = RColorBrewer::brewer.pal(7, "YlOrRd")) {
+ # parameters of the layout of the plot -----------------------------------------
+ npar <- par("plt")
+ npar[3] <- .075
+ npar[4] <- 1 - npar[3]
+ npar[2] <- 1 - npar[1]
+ y4 <- npar[4]
+ y3 <- y4 - top * diff(npar[3:4])
+ npar[4] <- y3
+ # bottom subfigure -------------------------------------------------------------
+ opar <- par(plt = npar)
+ farms <- plot_survey_episode(df, sort, with_amu, col_no_survey,
+ col_survey_no_dis, col_survey_disease, amu_col)
+ # top subfigure ----------------------------------------------------------------
+ npar[3] <- y3
+ npar[4] <- y4
+ par(plt = npar, new = TRUE)
+ plot_nb_episodes(df)
+ par(opar)
+ # the output -------------------------------------------------------------------
+ farms
+ }
Let’s see that for the 3 cycles:
> surveillance %>%
+ make_episodes(fillgaps = TRUE) %>%
+ filter(CYCLE == 1) %>%
+ plot_epis_amu(sort = TRUE, with_amu = FALSE)
75-037 75-007 75-061 75-089 75-092 75-034 75-036 75-053 75-106 75-085 75-042
1 2 3 4 5 6 7 8 9 10 11
75-075 75-101 75-012 75-003 75-074 75-019 75-031 75-033 75-011 75-015 75-040
12 13 14 15 16 17 18 19 20 21 22
75-091 75-093 75-095 75-099 75-013 75-014 75-023 75-030 75-032 75-048 75-052
23 24 25 26 27 28 29 30 31 32 33
75-065 75-076 75-078 75-079 75-081 75-002 75-004 75-006 75-008 75-020 75-028
34 35 36 37 38 39 40 41 42 43 44
75-043 75-051 75-068 75-071 75-072 75-073 75-077 75-088 75-094 75-100 75-005
45 46 47 48 49 50 51 52 53 54 55
75-010 75-026 75-038 75-039 75-044 75-046 75-064 75-066 75-082 75-087 75-098
56 57 58 59 60 61 62 63 64 65 66
75-102 75-104 75-021 75-027 75-041 75-045 75-070 75-097 75-103 75-009 75-016
67 68 69 70 71 72 73 74 75 76 77
75-022 75-047 75-050 75-067 75-069 75-086 75-090 75-049 75-063 75-096 75-018
78 79 80 81 82 83 84 85 86 87 88
75-062 75-029 75-080 75-083 75-105 75-035 75-001
89 90 91 92 93 94 95
> surveillance %>%
+ make_episodes(fillgaps = TRUE) %>%
+ filter(CYCLE == 2) %>%
+ plot_epis_amu(sort = TRUE, with_amu = FALSE)
75-049 75-026 75-065 75-040 75-088 75-036 75-046 75-034 75-044 75-015 75-039
1 2 3 4 5 6 7 8 9 10 11
75-041 75-072 75-095 75-019 75-028 75-033 75-074 75-079 75-008 75-011 75-031
12 13 14 15 16 17 18 19 20 21 22
75-064 75-081 75-086 75-012 75-070 75-073 75-004 75-020 75-022 75-027 75-029
23 24 25 26 27 28 29 30 31 32 33
75-047 75-087 75-021 75-025 75-030 75-080 75-091 75-002 75-006 75-062 75-001
34 35 36 37 38 39 40 41 42 43 44
75-005 75-014
45 46
> surveillance %>%
+ make_episodes(fillgaps = TRUE) %>%
+ filter(CYCLE == 3) %>%
+ plot_epis_amu(sort = TRUE, with_amu = FALSE)
75-029 75-070 75-047 75-080 75-027 75-012 75-021
1 2 3 4 5 6 7
We can also put all the cycles together, in which case we need to somehow rename the farms so that it not only accounts for the farm code, but also for the cycle number:
> surveillance %>%
+ mutate(FARMCODE = paste(CYCLE, FARMCODE, sep = "-")) %>%
+ make_episodes(fillgaps = TRUE) %>%
+ plot_epis_amu(sort = TRUE, with_amu = FALSE)
1-75-037 1-75-007 1-75-061 1-75-089 1-75-092 2-75-049 3-75-029 3-75-070
1 2 3 4 5 6 7 8
1-75-034 1-75-036 1-75-053 1-75-106 2-75-026 2-75-065 1-75-085 2-75-040
9 10 11 12 13 14 15 16
2-75-088 3-75-047 3-75-080 1-75-042 1-75-075 1-75-101 2-75-036 3-75-027
17 18 19 20 21 22 23 24
1-75-012 2-75-046 2-75-034 2-75-044 1-75-003 1-75-074 1-75-019 1-75-031
25 26 27 28 29 30 31 32
1-75-033 2-75-015 2-75-039 2-75-041 2-75-072 2-75-095 3-75-012 1-75-011
33 34 35 36 37 38 39 40
1-75-015 1-75-040 1-75-091 1-75-093 1-75-095 1-75-099 1-75-013 1-75-014
41 42 43 44 45 46 47 48
1-75-023 1-75-030 1-75-032 1-75-048 1-75-052 1-75-065 1-75-076 1-75-078
49 50 51 52 53 54 55 56
1-75-079 1-75-081 2-75-019 2-75-028 2-75-033 2-75-074 2-75-079 1-75-002
57 58 59 60 61 62 63 64
1-75-004 1-75-006 1-75-008 1-75-020 1-75-028 1-75-043 1-75-051 1-75-068
65 66 67 68 69 70 71 72
1-75-071 1-75-072 1-75-073 1-75-077 1-75-088 1-75-094 1-75-100 2-75-008
73 74 75 76 77 78 79 80
2-75-011 2-75-031 2-75-064 2-75-081 2-75-086 1-75-005 1-75-010 1-75-026
81 82 83 84 85 86 87 88
1-75-038 1-75-039 1-75-044 1-75-046 1-75-064 1-75-066 1-75-082 1-75-087
89 90 91 92 93 94 95 96
1-75-098 1-75-102 1-75-104 2-75-012 2-75-070 2-75-073 1-75-021 1-75-027
97 98 99 100 101 102 103 104
1-75-041 1-75-045 1-75-070 1-75-097 1-75-103 2-75-004 2-75-020 2-75-022
105 106 107 108 109 110 111 112
2-75-027 2-75-029 2-75-047 2-75-087 1-75-009 1-75-016 1-75-022 1-75-047
113 114 115 116 117 118 119 120
1-75-050 1-75-067 1-75-069 1-75-086 1-75-090 2-75-021 2-75-025 2-75-030
121 122 123 124 125 126 127 128
2-75-080 2-75-091 1-75-049 1-75-063 1-75-096 2-75-002 2-75-006 1-75-018
129 130 131 132 133 134 135 136
1-75-062 2-75-062 3-75-021 1-75-029 1-75-080 1-75-083 1-75-105 2-75-001
137 138 139 140 141 142 143 144
1-75-035 2-75-005 2-75-014 1-75-001
145 146 147 148
The following function uses model object and data newdata to make predictions (we reversed the order of the arguments compared to predict in order to make it pipe compliant):
> predict2 <- function(newdata, object) {
+ require(dplyr) # %>%, mutate, select, mutate_all, funs
+ newdata %>%
+ predict(object, ., type = "prob") %>%
+ cbind(select(newdata, episode), .) %>%
+ mutate_all(funs(ifelse(episode < 1, NA, .))) %>%
+ cbind(select(newdata, CYCLE, FARMCODE, WEEK, antimicrobial), .)
+ }
Let’s retrieve the predicted probabilities based on the sets of symptoms recorded in surveillance. First, let’s define a naive Bayes model based on etiology that contains both data from the literature (variables of the formula) and prior information from experts:
> model <- etiology %>%
+ mutate(score = select(., starts_with("Score")) %>% rowMeans) %>% # averaging the scores
+ naive_bayes(Pathogen ~ S_Respiratory + S.Diarrhoea + S.CNS + S.Malaise +
+ S.Leglesions + S.Suddendeath + old + young, ., .$score, 1)
Then we can combine this model and (pre-processed) surveillance data to make predictions:
> predictions <-
+ surveillance %>%
+ mutate(old = WEEK > 6, # using a cut-off of 6 w to separate youngs from old
+ young = !old) %>%
+ make_episodes(fillgaps = TRUE) %>%
+ predict2(model)
Let’s start by defining a number of utilitary functions:
> summary_by <- function(x, name) {
+ x %>%
+ colMeans() %>%
+ `*`(100) %>%
+ (function(x) lapply(list(names, I), function(f) f(x))) %>%
+ as.data.frame(stringsAsFactors = FALSE) %>%
+ setNames(c("Pathogen", name))
+ }
The above function is used exclusively by the following 2 functions. The function by_week computes the probability of presence of pathogens, by week of disease episode:
> by_week <- function(pred) {
+ pred %>%
+ select(-CYCLE, -FARMCODE, -WEEK, -antimicrobial, -episode) %>%
+ na.exclude() %>%
+ summary_by("proba_week")
+ }
The following function computes the probability of presence of pathogens, by disease episode:
> by_episode <- function(pred) {
+ pred %>%
+ select(-WEEK, -antimicrobial) %>%
+ na.exclude() %>%
+ unique() %>%
+ select(-CYCLE, -FARMCODE, -episode) %>%
+ summary_by("proba_episode")
+ }
Let’s use the 2 above functions to generate the table of pathogens:
> table2 <- lapply(c(by_week, by_episode), function(f) f(predictions)) %>%
+ do.call(merge, .) %>%
+ merge(select(etiology, Pathogen, starts_with("Score"))) %>%
+ arrange(desc(proba_week)) %>%
+ mutate_at(-1, as.numeric) %>%
+ mutate(x = select(., starts_with("Score")) %>% rowMeans(),
+ y = select(., starts_with("proba")) %>% rowMeans(),
+ ratio0 = x / y,
+ ratio = ifelse(ratio0 > 1, ratio0, 1 / ratio0))
> ovv(table2)
Pathogen proba_week proba_episode
1 Infectious Bursal Disease virus (Gumboro) 14.7527 16.2418
2 Gallibacterium anatis 10.9674 7.277
3 Avian metapneumovirus 9.1017 10.5088
4 Erysipelothrix rhusiopathiae 8.6873 8.0736
5 Salmonella pullorum 6.1916 6.8166
6 Mycoplasma gallisepticum 5.935 6.8525
. . . .
. . . .
. . . .
20 Avian encephalomielitis virus 0.7148 0.9139
21 Avibacterium paragallinarum 0.6246 0.8119
22 Pseudomonas spp. 0.5528 0.5283
23 Eimeria spp. (coccidiosis) 0.5399 0.6635
24 Pasteurella multocida (chronic pasteurellosis) 0.516 0.5226
25 Highly Pathogenic Avian Influenza virus 0.3428 0.2079
Score.DrThu Score.Juan Score.Niwat x y ratio0 ratio
10.0189 10.3686 9.7561 10.0478 15.4973 0.6484 1.5423
0.4554 0.9426 0.9756 0.7912 9.1222 0.0867 11.5295
0.4554 1.8852 1.9512 1.4306 9.8052 0.1459 6.8539
0.4554 1.8852 0.4878 0.9428 8.3805 0.1125 8.8889
6.3756 1.8852 4.3902 4.217 6.5041 0.6484 1.5423
9.1081 5.6556 7.4146 7.3928 6.3938 1.1562 1.1562
. . . . . . .
. . . . . . .
. . . . . . .
0.4554 0.9426 1.9512 1.1164 0.8144 1.3709 1.3709
3.6432 1.8852 3.9024 3.1436 0.7182 4.3769 4.3769
0.9108 1.8852 0.4878 1.0946 0.5405 2.025 2.025
3.7707 2.8278 4.878 3.8255 0.6017 6.3576 6.3576
0.4554 1.8852 0.878 1.0729 0.5193 2.0661 2.0661
9.1081 13.1963 11.7073 11.3372 0.2754 41.1699 41.1699
> lim <-
+ table2 %>%
+ select(proba_week, proba_episode) %>%
+ range() %>%
+ map2(c(floor, ceiling), function(x, f) f(x)) %>%
+ unlist()
>
> opar <- par(pty = "s")
>
> with(table2, plot(proba_week, proba_episode, pch = 21, pty = "s", bty = "o",
+ bg = adjustcolor("red", .3), xlim = lim, ylim = lim,
+ xlab = "average score of presence per week",
+ ylab = "average score of presence per episode"))
>
> abline(0, 1)
> abline(0, 1.5, lty = 2)
> abline(0, 2 / 3, lty = 2)
>
> table2 %>%
+ filter(proba_week > 7) %$%
+ text(proba_week, proba_episode, abbreviations[Pathogen], pos = 4,
+ font = ifelse(grepl("\\.", abbreviations[Pathogen]), 3, 1))
> par(opar)
> with(table2, cor.test(proba_week, proba_episode))
Pearson's product-moment correlation
data: proba_week and proba_episode
t = 15.393, df = 23, p-value = 1.329e-13
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.8985951 0.9801202
sample estimates:
cor
0.9547351
Looking at the correlations between the scores of the three experts:
> table2 %>%
+ select(starts_with("Score")) %>%
+ setNames(paste("expert", 1:3)) %>%
+ mcgraph::paircor(digits = 2, cex = 1.5, pch = 21, bty_upper = "o",
+ bty_lower = "o", bg = adjustcolor("red", .3),
+ axeslim = c(0, 14), cex.labels = 1.5, cex.axis = 1.5,
+ axes_lower = TRUE)
Let’s compare the average prior score with the average posterior score:
> opar <- par(pty = "s")
>
> table2 %$%
+ plot(x, y, pty = "s", bty = "o", col = "white",
+ xlim = c(0, max(x, y)),
+ ylim = c(0, max(x, y)),
+ xlab = "average prior score from experts",
+ ylab = "average posterior score from naive Bayes model")
>
> abline(0, 1)
> for(i in c(.5, 2)) abline(0, i, lty = 2)
> for(i in c(3:10, 2:5 * 10)) {
+ abline(0, i, col = "lightgrey", lwd = .5)
+ abline(0, 1 / i, col = "lightgrey", lwd = .5)
+ }
>
> table2 %$% points(x, y, pch = 21, bg = adjustcolor("red", .3))
>
> table2 %>%
+ filter(ratio > 2.1) %>%
+ mutate(pos = ifelse(Pathogen == "Eimeria spp. (coccidiosis)", 1,
+ ifelse(Pathogen == "Avibacterium paragallinarum" , 3, 4)),
+ font = ifelse(Pathogen %in% c("Highly Pathogenic Avian Influenza virus",
+ "Infectious Bronchitis Virus",
+ "Avian metapneumovirus"), 1, 3)) %$%
+ text(x, y, abbreviations[as.character(Pathogen)], .5, pos, font = font, cex = .75)
>
> par(opar)
> with(table2, cor.test(x, y))
Pearson's product-moment correlation
data: x and y
t = 0.85005, df = 23, p-value = 0.4041
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
-0.2369441 0.5329078
sample estimates:
cor
0.1745267
Let’s see another to compare the prior and posterior scores of presence of the pathogens. For that, we need a number of utilitary functions. First a tuned version of the graphics::barplot function:
> barplot2 <- function(df, width = 1, space = NULL, ...) {
+ if (is.null(space)) space <- .2
+ df %>%
+ select(prior, posterior) %>%
+ as.matrix() %>%
+ barplot(width, space, ...)
+ df %$%
+ segments(width + space, c(0, cumsum(prior)),
+ width + 2 * space, c(0, cumsum(posterior)), lwd = .5)
+ df
+ }
The following function calculate of the mid values of a vector:
> midval <- function(x) x %>%
+ cumsum() %>%
+ cbind(c(0, .[-length(.)]), .) %>%
+ rowMeans()
The following function is a tuned version of the graphics::text function:
> text2 <- function(df, var, x, threshold = 0, ...) {
+ sel <- df[[var]] > threshold
+ textval <- abbreviations[df$Pathogen]
+ fontval <- ifelse(grepl("\\.", textval), 3, 1)
+ text(x, midval(df[[var]])[sel], textval[sel], font = fontval[sel])
+ invisible(df)
+ }
Let’s use the 3 previous functions for the following plot:
> width <- 1
> space <- .2
> table2 %>%
+ mutate(prior = select(., starts_with("Score")) %>% rowMeans,
+ posterior = select(., starts_with("proba")) %>% rowMeans) %>%
+ arrange(desc(prior)) %T>%
+ barplot2(width, space, ylab = "score") %>%
+ text2("prior", (width + 2 * space) / 2, 4, cex = .9) %>%
+ text2("posterior", (3 * width + 4 * space) / 2, 4, cex = .9)
Let’s now investigate the durations of the diseases episodes and see whether there are any difference between the pathogens. First, we need to removing the episodes that start or end a cycle:
> not_sick <- function(x, pattern) x %>%
+ as.character() %>%
+ paste(collapse = "") %>%
+ grepl(pattern, .)
The above function is used exclusively by the following function that is itself used exclusively by the function after:
> .not_sick_df <- function(df, pattern, name) df %>%
+ survey_episodes() %>%
+ t() %>%
+ as.data.frame() %>%
+ sapply(not_sick, pattern) %>%
+ data.frame2(names(.), .) %>%
+ setNames(c("FARMCODE", name))
> not_sick_df <- function(pattern, name, df) df %>%
+ split(.$CYCLE) %>%
+ lapply(.not_sick_df, pattern, name) %>%
+ do.call(function(...) bind_rows(..., .id = "CYCLE"), .) %>%
+ mutate(CYCLE = as.numeric(CYCLE))
We can now compute the durations of the diseases episodes for each pathogen, without excluding the episodes that start or end a cycle:
> epi_duration <- predictions %>%
+ group_by(CYCLE, FARMCODE, episode) %>%
+ na.exclude() %>%
+ tally() %>%
+ ungroup()
and excluding them:
> epi_duration2 <- predictions %>%
+ map2(c("^1", "10"), c("startnotsick", "endnotsick"), not_sick_df, .) %>%
+ do.call(merge, .) %>%
+ filter(startnotsick, endnotsick) %>%
+ select(-startnotsick, -endnotsick) %>%
+ left_join(epi_duration, c("CYCLE", "FARMCODE")) %>%
+ na.exclude()
We can compare the distributions of the two computed durations in order to see there is a significant censoring effect:
> hist2 <- function(x, to = NULL, ...) {
+ if (is.null(to)) to <- max(x + 1)
+ hist(x, seq(0, to), xlab = "duration of episode",
+ ylab = "number of episodes", main = NA, ...)
+ }
> fct <- function(x, ...) hist2(x, 22, ylim = c(0, 110), ...)
> hist_all <- fct(epi_duration$n, col = "grey")
> par(new = TRUE)
> hist_sbs <- fct(epi_duration2$n, col = "red")
The two distributions are not significantly different:
> list(hist_all, hist_sbs) %>%
+ sapply(`[[`, "counts") %>%
+ fisher.test(simulate.p.value = TRUE, B = 1e7)
Fisher's Exact Test for Count Data with simulated p-value (based on
1e+07 replicates)
data: .
p-value = 0.6241
alternative hypothesis: two.sided
Let’s now see whether there are differences in episodes durations between the various pathogens. For that we first need to tune a bit the vioplot::vioplot function:
> vioplot2 <- function(df, range=1.5, horizontal=FALSE, col="magenta",
+ border="black", lty=1, lwd=1, rectCol="black",
+ colMed="white", pchMed=19, at, add=FALSE, wex=1,
+ drawRect=TRUE) {
+ if (missing(at)) at <- 1:length(df)
+ do.call(function(...) vioplot(df[, 1], ..., range = range,
+ horizontal = horizontal, col = col,
+ border = border, lty = lty, lwd = lwd,
+ rectCol = rectCol, colMed = colMed,
+ pchMed = pchMed, at = at, add = add, wex = wex,
+ drawRect = drawRect), df[, -1])
+ }
The following function arranges the columns of a data frame according to some criterion computed by the f function passed as an argument:
> arrange_columns <- function(df, f, ...) {
+ df[, order(apply(df, 2, f, ...))]
+ }
Let’s compute a processed version of the prediction table:
> predictions3 <- predictions %>%
+ select(-WEEK, -antimicrobial) %>%
+ na.exclude() %>%
+ unique() %>%
+ right_join(epi_duration, c("CYCLE", "FARMCODE", "episode")) %>%
+ mutate_at(vars(-one_of(c("CYCLE", "FARMCODE", "episode", "n"))), funs(. * n)) %>%
+ select(-CYCLE, -FARMCODE, -episode, -n)
Let’s now see whether the medians (or mean, or min, or max) of these distributions are correlated with the probability of presence per episode. If yes, it means that the pathogens do not differ in terms of duration of infection. If not, it means that they do.
A linear model:
> model <- predictions3 %>%
+ gather("x", "y") %>%
+ mutate(x = log(with(table2, setNames(proba_episode, Pathogen))[x]),
+ y = log(y)) %>%
+ lm(y ~ x, .)
The plot:
> # the plot frame ---------------------------------------------------------------
> plot(NA, xlim = c(-1, 3), ylim = c(-8, 2), axes = FALSE,
+ xlab = "average score of presence per episode",
+ ylab = "durations of infection episodes")
>
> # the violin plots -------------------------------------------------------------
> table2 %>%
+ arrange(proba_episode) %>%
+ select(Pathogen) %>%
+ unlist() %>%
+ `[`(predictions3, .) %>%
+ mutate_all(log) %>%
+ vioplot2(at = log(sort(table2$proba_episode)), col = adjustcolor("grey", .25),
+ drawRect = FALSE, wex = .5, add = T,
+ rectCol = adjustcolor("black", .25))
>
>
> # the confidence and prediction intervals --------------------------------------
> xs <- seq(-1, 3, le = 100)
> fct <- function(int, ltype) {
+ pred <- predict(model, data.frame(x = xs), interval = int)
+ with(as.data.frame(pred), {
+ lines(xs, lwr, lty = ltype)
+ lines(xs, upr, lty = ltype)
+ })
+ }
> abline(model)
> fct("confidence", 2)
> fct("prediction", 3)
>
> # the log-scale axes -----------------------------------------------------------
> par(new = TRUE)
> plot(NA, xlim = exp(c(-1, 3)), ylim = exp(c(-8, 2)), log = "xy", ann = FALSE)
For each pathogen, we want to compute the probability of treatment failure (i.e. of using the incorrect treatment). The analysis will necessarily be by week. This is an NA returned value used by the function compute_resistance that follows after:
> naval <-
+ predictions %>%
+ select(-CYCLE, -FARMCODE, -WEEK, -antimicrobial, -episode) %>%
+ names() %>%
+ setNames(rep(NA, length(.)), .)
The following function computes resistance level for each pathogen and the used set of antibiotics for a given week:
>
> compute_resistance <- function(x) {
+ if (is.null(x)) return(naval)
+ amr %>%
+ filter(drug %in% x) %>%
+ select(-class, -drug) %>%
+ apply(2, min)
+ }
The following function allows to correct for the fact that there is not chlortetracycline information in the amr data set. We thus consider that it is a simple tetracycline:
> correction <- function(x) {
+ if (is.null(x)) return(NULL)
+ sub("CHLORTETRACYCLINE", "TETRACYCLINE", x)
+ }
Let’s use the 2 previous functions to compute the levels of resistance:
> resistance <- predictions$antimicrobial %>%
+ lapply(correction) %>%
+ lapply(compute_resistance) %>%
+ do.call(bind_rows, .)
The following function allows to separate the columns of a data frame into two sets of data frames, according to a pattern to match in the columns names:
> separate_columns <- function(df, match) {
+ select(df, ends_with(match)) %>%
+ setNames(., sub(match, "", names(.)))
+ }
A utilitary function that applies 2 different functions to a list of 2 lists:
> f1 <- function(df) sweep(df, 2, apply(df, 2, sum), "/")
> f2 <- function(df, var) df[, var]
> f3 <- function(lst) list(f1(lst[[1]]), f2(lst[[2]], names(lst[[1]])))
The following function uses the 2 previous functions to compute the levels of resistance:
> proba_treat_failure <- function(pred, res) {
+ cbind(setNames(pred, paste0(names(pred), "_p")),
+ setNames(res, paste0(names(res), "_r"))) %>%
+ filter(!sapply(.$antimicrobial_p, is.null), !is.na(episode_p)) %>%
+ select(-CYCLE_p, -FARMCODE_p, -WEEK_p, -antimicrobial_p, -episode_p) %>%
+ (function(df) lapply(c("_p", "_r"),
+ function(match) separate_columns(df, match))) %>%
+ f3() %>%
+ do.call(`*`, .) %>%
+ apply(2, sum)
+ }
And it gives:
> proba_treat_failure(predictions, resistance) %>% head()
Avian encephalomielitis virus
100.00000
Avian metapneumovirus
100.00000
Avibacterium paragallinarum
25.16986
Chicken Anaemia Virus
100.00000
Chlamydia psittaci
33.41604
Clostridium perfringens (Necrotic enteritis)
37.70307
The following function computes the probability of use of antimicrobials:
> proba_use_ab <- function(df) {
+ df %>%
+ filter(!is.na(episode)) %>%
+ mutate(antimicrobial = !sapply(antimicrobial, is.null)) %>%
+ select(-CYCLE, -FARMCODE, -WEEK, -episode) %>%
+ mutate_if(is.numeric, funs(antimicrobial * . / sum(.))) %>%
+ select(-antimicrobial) %>%
+ apply(2, sum)
+ }
And it gives:
> proba_use_ab(predictions) %>% head()
Avian encephalomielitis virus
0.4769973
Avian metapneumovirus
0.4104427
Avibacterium paragallinarum
0.4066906
Chicken Anaemia Virus
0.4161991
Chlamydia psittaci
0.4377418
Clostridium perfringens (Necrotic enteritis)
0.4079402
Let’s include the two previous functions into a pipe that gathers the prevalence, the probability of antimicrobial use, the probability of treatment failure and the the combined probability of treatment failure for all the pathogens:
> a <- proba_treat_failure(predictions, resistance)
> b <- proba_use_ab(predictions)
>
> table3 <- list(select(table2, Pathogen, proba_week),
+ data.frame2(Pathogen = names(b), proba_use_ab = 100 * b),
+ data.frame2(Pathogen = names(a), proba_treat_failure = a)) %>%
+ Reduce(function(x, y) merge(x, y, all = TRUE), ., accumulate = FALSE) %>%
+ arrange(desc(proba_week)) %>%
+ mutate(proba_tot = 100 * (proba_treat_failure / 100) *
+ (proba_use_ab / 100) *
+ (proba_week / 100))
Adding the total level of resistance
> bind_rows(table3, setNames(data.frame2("total", NA, NA, NA, sum(table3$proba_tot)), names(table3))) %>% ovv()
Pathogen proba_week proba_use_ab
1 Infectious Bursal Disease virus (Gumboro) 14.7527 45.0123
2 Gallibacterium anatis 10.9674 44.4418
3 Avian metapneumovirus 9.1017 41.0443
4 Erysipelothrix rhusiopathiae 8.6873 40.613
5 Salmonella pullorum 6.1916 45.0123
6 Mycoplasma gallisepticum 5.935 41.0443
. . . .
. . . .
. . . .
21 Avibacterium paragallinarum 0.6246 40.6691
22 Pseudomonas spp. 0.5528 46.224
23 Eimeria spp. (coccidiosis) 0.5399 44.2778
24 Pasteurella multocida (chronic pasteurellosis) 0.516 44.7388
25 Highly Pathogenic Avian Influenza virus 0.3428 45.482
26 total <NA> <NA>
proba_treat_failure proba_tot
100 6.6405
37.2064 1.8135
100 3.7357
42.0195 1.4825
6.789 0.1892
51.6126 1.2573
. .
. .
. .
25.1699 0.0639
18.7814 0.048
100 0.2391
4.272 0.0099
100 0.1559
<NA> 26.2162
It means that whenever we use an antimicrobial during a disease episode, there about 26 % chance of treatment failure. Let’s end this section by looking at the share of viruses in this treatment failure:
> (virus_share <- table3 %>%
+ filter(grepl("irus", Pathogen)) %>%
+ select(proba_tot) %>%
+ sum())
[1] 18.3178
The percentage of treatement failure due to viruses:
> 100 * virus_share / sum(table3$proba_tot)
[1] 69.87206
The proba per week for viruses:
> table3 %>%
+ filter(grepl("irus", Pathogen)) %>%
+ select(proba_week) %>%
+ sum()
[1] 41.96493
Let’s look at the relationship between the prevalence and resistance of pathogens:
> with(table3, plot(proba_week, proba_treat_failure, ylim = c(0, 100), pch = 21,
+ xlab = "probability of presence", xlim = c(0, 15),
+ ylab = "resistance", bg = adjustcolor("red", .3)))
>
> table3 %>%
+ filter(proba_week > 7) %>%
+ mutate(pos = ifelse(grepl("Erysipelothrix", Pathogen), 3, 1)) %$%
+ text(proba_week, proba_treat_failure, abbreviations[Pathogen], pos = pos,
+ font = ifelse(grepl("\\.", abbreviations[Pathogen]), 3, 1))
For each antibiotic, let’s compute the probability of usage per week. First, the probability of usage per week of episode:
> table4 <- amr$drug %>%
+ sapply(function(x) sapply(predictions$antimicrobial, function(y) x %in% y)) %>%
+ cbind(select(predictions, episode)) %>%
+ split(., is.na(.$episode)) %>%
+ lapply(function(x) select(x, -episode) %>% colMeans) %>%
+ data.frame() %>%
+ transmute(drug = rownames(.),
+ no_dis_w = FALSE.,
+ dis_week = TRUE.)
The ratio of the number of week with disease over the number of weeks without diseases:
> ratio <- predictions %>%
+ select(episode) %>%
+ is.na() %>%
+ table() %>%
+ as.list() %>%
+ do.call(`/`, .)
A linear model and its confidence intervals:
> model <- with(table4, lm(dis_week ~ no_dis_w))
> xs <- seq(-.02, .22,, le = 100)
> pred95 <- model %>%
+ predict(data.frame(no_dis_w = xs), interval = "confidence") %>%
+ as.data.frame()
> pred99 <- model %>%
+ predict(data.frame(no_dis_w = xs), interval = "confidence", level = .99) %>%
+ as.data.frame()
The plot:
> # the fram of the plot ---------------------------------------------------------
> with(table4, plot(dis_week ~ no_dis_w, type = "n",
+ xlab = "probability of use during a week without disease",
+ ylab = "probability of use during a week with disease"))
>
> # the linear model prediction with confidence intervals ------------------------
> polygon(c(xs, rev(xs)), c(pred99$lwr, rev(pred99$upr)),
+ col = "lightgrey", border = NA)
> polygon(c(xs, rev(xs)), c(pred95$lwr, rev(pred95$upr)),
+ col = "grey", border = NA)
> lines(xs, pred95$fit)
> abline(0, ratio, lty = 2) # the expectation
>
> # adding the data points -------------------------------------------------------
> with(table4, points(dis_week ~ no_dis_w, pch = 21, bg = adjustcolor("red", .3)))
>
> # the points labels ------------------------------------------------------------
> coeff <- coef(model)
> table4 %>%
+ mutate(pos = ifelse(dis_week > coeff[1] + coeff[2] * no_dis_w, 2, 4),
+ pos = ifelse(no_dis_w > .19, 2, pos),
+ pos = ifelse(drug %in% c("NEOMYCIN"), 3, pos),
+ pos = ifelse(drug %in% c("AMOXICILLIN", "TRIMETHOPRIM"), 2, pos),
+ pos = ifelse(drug %in% c("SULFAMETHOXAZOLE"), 4, pos)) %>%
+ filter(no_dis_w > .02) %$%
+ text(no_dis_w, dis_week, abbreviations[drug], pos = pos)
In the following temporary data frame we put next to each other the antimicrobial usage (boolean) for each week and each drug and the probability of presence, for each week and each pathogen. Note that each row of the tmp data frame is a week and that for each week of this data frame there is both a disease episode and the use of antimicriobial(s).
> tmp <- amr$drug %>%
+ sapply(function(x) sapply(predictions$antimicrobial, function(y) x %in% y)) %>%
+ cbind(select(predictions, -CYCLE, -FARMCODE, -WEEK)) %>%
+ filter(!sapply(antimicrobial, is.null), !is.na(episode)) %>%
+ select(-antimicrobial, -episode)
The following function compute the level of resistance for a given drug in a given week. This level of resistance depends on the assemblage of the different pathogens during the week in question. Note that we need to rename the drugs because the dplyr::select function does not manage variables names that contain space.
> amr2 <- amr
> amr2$drug <- sub(" ", "_", amr2$drug)
> resistance_for_drug <- function(x) {
+ amr2 %>%
+ filter(drug == x) %>%
+ select(-class, -drug) %>%
+ unlist() %>%
+ data.frame2(Pathogen = names(.), proba = .)
+ }
The following function uses the tmp temporary data frame and the above function and computes the resistance level of a given drug (for all weeks). Note that we here too have to deal with the space in some of the variables’ names.
> names(tmp) <- sub("OXOLINIC ACID", "OXOLINIC_ACID", names(tmp))
> drug_resistance <- function(drug) {
+ tmp %>%
+ filter_(drug) %>%
+ select_if(is.numeric) %>%
+ colSums() %>%
+ (function(x) x / sum(x)) %>%
+ data.frame2(Pathogen = names(.), proba = .) %>%
+ left_join(resistance_for_drug(drug), "Pathogen") %$%
+ sum(proba.x * proba.y)
+ }
which gives:
> (a <- sapply(amr2$drug, drug_resistance))
AMOXICILLIN AMPICILLIN CEPHALEXIN
66.15594 65.30821 NaN
CEFOTAXIME APRAMYCIN GENTAMICIN
NaN 67.74840 69.87261
SPECTINOMYCIN STREPTOMYCIN NEOMYCIN
74.95046 71.98947 70.14567
ERYTHROMYCIN TYLOSIN LINCOMYCIN
72.71759 84.90127 79.92009
SPIRAMYCIN JOSAMYCIN TILMICOSIN
84.38410 77.49651 76.45433
CHLORAMPHENICOL FLORFENICOL THIAMPHENICOL
56.02207 53.17762 55.97457
TETRACYCLINE OXYTETRACYCLINE DOXYCYCLINE
NaN 70.61575 70.39626
SULFACHLOROPYRIDAZINE SULFADIMETHOXINE SULFADIMIDINE
NaN 73.60987 73.72130
SULFAGUANIDIN SULFAMETHOXYPYRIDAZINE SULFADIAZINE
80.41504 76.26193 80.41504
SULFAMETHAZINE SULFAMETHOXAZOLE SULFATHIAZOLE
NaN 76.44810 69.11633
TRIMETHOPRIM CIPROFLOXACIN ENROFLOXACIN
69.25256 59.06172 52.75907
FLUMEQUINE NORFLOXACIN OXOLINIC_ACID
53.01580 57.29017 59.77029
ENRAMYCIN COLISTIN
NaN 81.27459
Note that there are 6 drugs that are just never used during disease episodes:
> sapply(names(a[is.nan(a)]), function(x) sum(tmp[[x]]))
CEPHALEXIN CEFOTAXIME TETRACYCLINE
0 0 0
SULFACHLOROPYRIDAZINE SULFAMETHAZINE ENRAMYCIN
0 0 0
Adding the resistance level to table4:
> table4b <- table4 %>%
+ left_join(data.frame2(drug = sub("_", " ", names(a)), resistance = a), "drug") %>%
+ arrange(desc(no_dis_w + dis_week))
Which gives:
> ovv(table4b)
drug no_dis_w dis_week resistance
1 COLISTIN 0.2044 0.0848 81.2746
2 OXYTETRACYCLINE 0.1607 0.0711 70.6158
3 TYLOSIN 0.0797 0.0332 84.9013
4 DOXYCYCLINE 0.0591 0.0269 70.3963
5 GENTAMICIN 0.0514 0.0155 69.8726
6 AMOXICILLIN 0.0463 0.0143 66.1559
. . . . .
. . . . .
. . . . .
33 CEPHALEXIN 0 6e-04 NaN
34 CEFOTAXIME 0 6e-04 NaN
35 SULFAMETHAZINE 0 6e-04 NaN
36 ENRAMYCIN 0 6e-04 NaN
37 TETRACYCLINE 0 0 NaN
38 SULFACHLOROPYRIDAZINE 0 0 NaN
Plotting the level of resistance of each antibiotic as a function of the probability of being used.
> with(table4b, plot(no_dis_w + dis_week, resistance,
+ xlab = "probability of usage", pch = 21, bg = adjustcolor("red", .3)))
>
> table4b %>%
+ filter(no_dis_w + dis_week > .1 | resistance < 65) %>%
+ mutate(pos = ifelse(resistance > 60, 2, 4),
+ pos = ifelse(abbreviations[drug] %in% c("oxa", "ffc"), 3, pos),
+ pos = ifelse(abbreviations[drug] %in% c("flu"), 2, pos),
+ pos = ifelse(abbreviations[drug] %in% c("cam"), 1, pos)) %$%
+ text(no_dis_w + dis_week, resistance, abbreviations[drug], pos = pos)
It shows that the more a drug is used, the more it tends to be resistant. It also shows that there is a group of 8 antimicrobials that could certainly be used more advantageously. To end this section, let’s look at what would be the best antibiotic use if antimicriobials were still to be basically used at random. The following function takes a vector a pathogens’ prevalence and a vector of antimicrobials usage (expressed in probability) and use the amr data frame to draw a heatmap showing for what drug and pathogen combination resistance level is the highest.
> heat_map <- function(path_p, drug_p) {
+ path_p <- sort(path_p, TRUE)
+ drug_p <- sort(drug_p)
+ path_names <- names(path_p)
+ drug_names <- names(drug_p)
+ # the colors -------------------------------------------------------------------
+ col <- amr %>%
+ right_join(data.frame2(drug = drug_names), "drug") %>%
+ select(path_names) %>%
+ t() %>%
+ as.vector() %>%
+ cut(c(-.1, seq(10, 100, 10))) %>%
+ as.numeric() %>%
+ `[`(rev(heat.colors(10)), .)
+ # the coordinates --------------------------------------------------------------
+ x_path <- c(0, cumsum(path_p))
+ y_drug <- c(0, cumsum(drug_p))
+ coords <- expand.grid(seq_along(path_names), seq_along(drug_names))
+ # the heat map -----------------------------------------------------------------
+ opar <- par(pty = "s")
+ on.exit(par(opar))
+ plot(NA, xlim = c(0, sum(path_p)), ylim = c(0, sum(drug_p)), axes = FALSE,
+ xlab = "pathogens", ylab = "drugs", xaxs = "i", yaxs = "i")
+ with(coords, rect(x_path[Var1], y_drug[Var2],
+ x_path[Var1 + 1], y_drug[Var2 + 1], col = col, border = NA))
+ abline(v = cumsum(path_p), lwd = .5)
+ abline(h = cumsum(drug_p), lwd = .5)
+ box(bty = "o")
+ }
And it gives:
> heat_map(with(table3, setNames(proba_week, Pathogen)),
+ with(table4, setNames(no_dis_w + dis_week, drug)))
The same heatmap, without viruses and parasite:
> heat_map(with(filter(table3, round(proba_treat_failure, 6) < 100),
+ setNames(proba_week, Pathogen)),
+ with(table4, setNames(no_dis_w + dis_week, drug)))
Let’s compute the resistance of the community: