Generating the data from the CliRes database (and other data sets)

Packages
Utilitary funtion
Additional data
- IU to mg conversion
- Age-weight relationship
ViParc data
- Drug codes from general information data
- The surveillance data
Merging into one single data set

Packages

Installing the required packages:

> required <- c("dplyr", "grDevices", "magrittr", "mgcv", "purrr", "readxl", "tibble", "tidyr")
> to_install <- which(! required %in% row.names(installed.packages()))
> if (length(to_install) > 0) install.packages(to_install)

Loading magrittr:

> library(magrittr)

Utilitary funtion

> lapply2 <- function(X, FUN, ...) setNames(lapply(X, FUN, ...), X)

Additional data

IU to mg conversion

The ESVAC conversion factors are available from the table 2 of page 62 of the OIE report. A copy-paste of this table without headers is available here. We can read this file and format it to compute the following IU-to-mg convertion vector:

> if (! file.exists("esvac.txt"))
+   download.file("https://raw.githubusercontent.com/viparc/prophylactic/master/data/esvac.txt", "esvac.txt")

> esvac <- readLines("esvac.txt") %>% 
+   sub("^#.*", "", .) %>% 
+   sub(" *", "", .) %>% 
+   {.[. != ""]} %>% 
+   matrix(ncol = 4, byrow = TRUE) %>% 
+   as.data.frame(stringsAsFactors = FALSE) %>% 
+   setNames(c("ab_vet_med", "ab_oie", "IU.mg", "mg.IU")) %>% 
+   dplyr::filter(!grepl("methane", ab_vet_med)) %>% 
+   dplyr::transmute(ab_oie = sub(" .*$", "", tolower(ab_oie)),
+                    mg.IU  = as.numeric(mg.IU)) %>% 
+   dplyr::bind_rows(data.frame(ab_oie = "josamycin",
+                               mg.IU  = 0.001, stringsAsFactors = FALSE)) %$%
+   setNames(mg.IU, ab_oie)

Note that we got rid off one of the colistin data and that we manually added the data for josamycin that was absent from the table.

Age-weight relationship

Data on chicken weights as a function of age were collected for a previous study. The data are here. We will use these data for the current study:

> age_weight_data <- "https://www.dropbox.com/s/mbbejp9d8an1l4h/Supplementary_Data_Frontiers.xlsx?dl=0"
> if (! file.exists("age_weight.xlsx"))
+   download.file(sub("dl=0", "raw=1", age_weight_data), "age_weight.xlsx")

> age_weight <- readxl::read_excel("age_weight.xlsx") %>% 
+   tidyr::gather("week", "weight.kg", -FlockID, -Chicken) %>% 
+   na.exclude() %>% 
+   dplyr::mutate(week = sub("Week ", "", week)) %>% 
+   dplyr::mutate_at(c("FlockID", "Chicken", "week"), as.integer)

ViParc data

The ViParc data are in 2 excel files, each with many tabs. One of these files contains general information and the one contains the surveillance data:

> general_data <- "https://www.dropbox.com/s/gt6k1m781537vd2/16-4-2019-_18ZN_Category_V1_Data.xls?dl=0"
> surveillance_data <- "https://www.dropbox.com/s/7gjt77eltpm4ygw/16-4-2019-_18ZN_V1_Data.xls?dl=0"

Let’s download the excel files:

> if (! file.exists("general_data.xls"))
+   download.file(sub("dl=0", "raw=1", general_data), "general_data.xls")
> if (! file.exists("surveillance_data.xls"))
+   download.file(sub("dl=0", "raw=1", surveillance_data), "surveillance_data.xls")

For antimicrobial, disease and chicken data in the surveillance file, we need the following pairs of tabs respectively, that we have to merge:

> antimicrobial <- c("ANTIMICROBIAL", "ANTIMICROBIAL_GridEvent")
> disease <- c("DISEASE", "DISEASE_GridEvent")
> chicken <- c("MID_INOUT", "MID_INOUT_GridEvent")

Drug codes from general information data

Here we’ll merge with the esvac dataframe that we generated above.

> drug_codes <- c("ANTIMICROBIAL", "ANTIMICROBIAL_GridAnti") %>% 
+   lapply(readxl::read_excel, path = "general_data.xls") %>% 
+   purrr::invoke(dplyr::right_join, ., "ANTIMICROBIAL_SEQ") %>% 
+   dplyr::select(CODE, ANTINAME1, CONCENTRATION, GRIDUNIT, GRIDPACK, GRIDPACKUNIT) %>%
+   dplyr::filter(! ANTINAME1 %in% c("alicin", "axit oxolinic", "iodo-hydroxyquinoline", "metronidazol", "nystatin")) %>% 
+   dplyr::mutate(ANTINAME1     = dplyr::recode(ANTINAME1, sunfadimethoxine  = "sulfadimethoxine",
+                                                          sunphamethoxazole = "sulphamethoxazole"),
+                 GRIDUNIT      = dplyr::na_if(GRIDUNIT, "NA"),
+                 GRIDUNIT      = dplyr::na_if(GRIDUNIT, "na"),
+                 GRIDUNIT      = dplyr::recode(GRIDUNIT, UI = "IU", mg = "MG"),
+                 GRIDPACKUNIT  = dplyr::na_if(GRIDPACKUNIT, "na"),
+                 GRIDPACKUNIT  = dplyr::na_if(GRIDPACKUNIT, "VI"),
+                 GRIDPACKUNIT  = dplyr::recode(GRIDPACKUNIT, g  = "G", ml = "G", ML = "G"),
+                 CONCENTRATION = dplyr::na_if(CONCENTRATION, "NA"),
+                 CONCENTRATION = dplyr::na_if(CONCENTRATION, "na"),
+                 CONCENTRATION = dplyr::recode(CONCENTRATION, S = "500"), # this will ultimately be corrected in CliRes
+                 CONCENTRATION = as.numeric(CONCENTRATION),
+                 CONCENTRATION = ifelse(GRIDUNIT == "IU", CONCENTRATION * esvac[ANTINAME1], CONCENTRATION),
+                 GRIDUNIT      = ifelse(GRIDUNIT == "IU", "MG", GRIDUNIT), # HAS TO BE HERE!!!
+                 CONCENTRATION = ifelse(GRIDUNIT == "MG", CONCENTRATION / 1000, CONCENTRATION),
+                 GRIDUNIT      = ifelse(GRIDUNIT == "MG", "G", GRIDUNIT), # HAS TO BE HERE!!!
+                 GRIDPACK      = dplyr::na_if(GRIDPACK, "na"),
+                 GRIDPACK      = as.numeric(GRIDPACK),
+                 GRIDPACK      = ifelse(GRIDPACKUNIT == "KG", 1000 * GRIDPACK, GRIDPACK),
+                 GRIDPACKUNIT  = ifelse(GRIDPACKUNIT == "KG", "G", GRIDPACKUNIT), # HAS TO BE HERE!!! 
+                 proportion    = CONCENTRATION / GRIDPACK)

The surveillance data

> surveillance <- c(antimicrobial, disease, chicken, "SAMPLE") %>%
+   lapply2(readxl::read_excel, path = "surveillance_data.xls")

Sampling dates

> samplesdata <- surveillance$SAMPLE %>% 
+   dplyr::select(USUBJID, FLOCKSEQUENCE, SAMPLINGDATE, SAMPLINGVISIT) %>% 
+   unique()

Here we have to break the pipeline because we need to use the samplesdata at two distincts points in the section that follows:

> samples <- samplesdata %>%
+   dplyr::filter(SAMPLINGVISIT == "S") %>%
+   dplyr::select(-SAMPLINGVISIT) %>%
+   dplyr::rename(start = SAMPLINGDATE) %>%
+   dplyr::right_join(samplesdata, c("USUBJID", "FLOCKSEQUENCE")) %>%
+   dplyr::mutate(WEEK     = as.numeric(floor((SAMPLINGDATE - start) / (60 * 60 * 24 * 7)) + 1),
+                 sampling = TRUE) %>% 
+   dplyr::select(-start, -SAMPLINGDATE, -SAMPLINGVISIT)

Clinical signs

> raw_symptoms <- surveillance[disease] %>% 
+   unname() %>% 
+   purrr::invoke(dplyr::left_join, ., c("USUBJID", "DISEASE_SEQ")) %>%
+   dplyr::select(USUBJID, FLOCKSEQUENCE, WEEK, RESPIRATORY, DIARRHOEA, CNS, MALAISE, LEGLESIONS, SUDDENDEATH)

There are two corrections that cannot be done in CliRes and that we have to do here. We also have to break the pipeline because we uses the raw_symptoms dataframe that we just created at two distinct points in the section that follow:

> symptoms <- raw_symptoms %>% 
+ # correction 1: 2 weeks 8 and no week 9 for cycle 2 of farm 8. These 2 weeks are identical
+   dplyr::filter(USUBJID == "75-008", FLOCKSEQUENCE == "02", WEEK == 8) %>% 
+   dplyr::mutate(WEEK = 8:9) %>%
+   dplyr::bind_rows(dplyr::filter(raw_symptoms, !(USUBJID == "75-008" & FLOCKSEQUENCE == "02" & WEEK == 8))) %>% 
+ # correction 2: week 19 instead of 10 for cycle 6 of farm 21:
+   dplyr::mutate(WEEK = ifelse(USUBJID == "75-021" & FLOCKSEQUENCE == "06" & WEEK == 19, 10, WEEK))

Antimicrobial usage

Here we’ll merge with the drug_codes dataframe that we generated above.

> amu <- surveillance[antimicrobial] %>% 
+   unname() %>% 
+   purrr::invoke(dplyr::left_join, ., c("USUBJID", "ANTIMICROBIAL_SEQ")) %>% 
+   dplyr::select(USUBJID, FLOCKSEQUENCE, WEEKNO, CODE, AMOUTUSED, PACKAGEUNIT) %>%
+   na.exclude() %>% ### DO WE NEED THIS???
+   dplyr::mutate(PACKAGEUNIT = dplyr::na_if(PACKAGEUNIT, "TAB")) %>% 
+   dplyr::left_join(drug_codes, "CODE") %>%
+   tidyr::replace_na(list(ANTINAME1 = "unknown")) %>% # because some antibiotic names are unknown
+   dplyr::mutate(presence    = 1,
+                 antibiotic  = proportion * AMOUTUSED,
+                 antibiotic2 = antibiotic,
+                 antiname2   = paste0(ANTINAME1, "_g"),
+                 antiname3   = paste0(ANTINAME1, "_g.kg"),
+                 ANTINAME1   = paste0(ANTINAME1, "_use")) %>%
+   dplyr::select(USUBJID, FLOCKSEQUENCE, WEEKNO, ANTINAME1, presence, antiname2, antibiotic, antiname3, antibiotic2) %>% 
+   tibble::rowid_to_column() %>% # spread require unique row identifiers
+   tidyr::spread(ANTINAME1, presence) %>%
+   tidyr::spread(antiname2, antibiotic) %>%
+   tidyr::spread(antiname3, antibiotic2) %>%
+   dplyr::select(-rowid) %>% # we don't need rowid anymore
+   replace(is.na(.), 0) %>%
+   dplyr::group_by(USUBJID, FLOCKSEQUENCE, WEEKNO) %>%
+   dplyr::summarise_all(~sum(.)) %>%
+   dplyr::mutate_at(dplyr::vars(dplyr::ends_with("_use", FALSE)), function(x) x > 0)

Chicken data

> chickdata <- surveillance[chicken] %>% 
+   unname() %>% 
+   purrr::invoke(dplyr::left_join, ., c("USUBJID", "MID_INOUT_SEQ")) %>% 
+   dplyr::select(USUBJID, FLOCKSEQUENCE, WEEK, CHICKENTOTAL) %>% 
+   unique() %>% 
+   dplyr::mutate_at("CHICKENTOTAL", as.integer)

Here we have to break the pipeline because chickdata is used at two distinct points in the part below:

> chickenn <- chickdata %>%
+   dplyr::group_by(USUBJID, FLOCKSEQUENCE) %>%
+   dplyr::summarize(completed = min(CHICKENTOTAL) < 1) %>%
+   dplyr::ungroup() %>% 
+   dplyr::right_join(chickdata, c("USUBJID", "FLOCKSEQUENCE"))

Now, we will extrapolate the weight data in order to cover the age range of the symptoms dataframe. First let’s extract the vector of weeks from symptoms:

> # weeks <- as.integer(1:max(symptoms$WEEK))
> weeks <- 1:max(symptoms$WEEK)

Then, let’s do the extrapolation, using a GAM with a gamma distribution of errors and a log link:

> chickendata <- age_weight %>% 
+   mgcv::gam(weight.kg ~ s(week), Gamma(link = log), .) %>% 
+   mgcv::predict.gam(data.frame(week = weeks), "response") %>% 
+   list(weeks) %>% 
+   as.data.frame() %>% 
+   setNames(c("individual_weight_kg", "WEEK")) %>% 
+   dplyr::right_join(chickenn, "WEEK") %>% 
+   dplyr::mutate(farm_weight_kg = individual_weight_kg * CHICKENTOTAL)

Merging into one single data set

Let’s now merge the symptoms, samples, amu and chickendata dataframes that we created above. For that, we will use the symptoms dataframe as the reference. It means that we want to check that

there is no missing weeks in the symptoms dataframe.
there is no weeks in amu or chickendata that are not in symptoms

Note that, for now, we don’t check samples because, as a matter of fact there are some issues here, as outlined below.

Checking that there is no missing weeks in symptoms:

> symptoms %>%
+   dplyr::group_by(USUBJID, FLOCKSEQUENCE) %>%
+   dplyr::arrange(WEEK) %>%
+   dplyr::summarise(a = length(unique(diff(WEEK)))) %>%
+   dplyr::ungroup() %>%
+   dplyr::filter(a > 1) %>% 
+   nrow() %>% 
+   `<`(1)
[1] TRUE

Checking that there is no week defined in amu but not in symptoms:

> dplyr::anti_join(amu, symptoms, c("USUBJID", "FLOCKSEQUENCE", "WEEKNO" = "WEEK")) %>% 
+   nrow() %>% 
+   `<`(1)
[1] TRUE

Checking that there is no week defined in chickenn but not in viparc:

> dplyr::anti_join(chickendata, symptoms, c("USUBJID", "FLOCKSEQUENCE", "WEEK")) %>% 
+     nrow() %>% 
+   `<`(1)
[1] TRUE

Checking that there is no week defined in chickenn but not in viparc:

> dplyr::anti_join(samples, symptoms, c("USUBJID", "FLOCKSEQUENCE", "WEEK")) %>% 
+     nrow() %>% 
+   `<`(1)
[1] FALSE

We are ready for the merging and writing to disk. Note that the amu dataframe contains only weeks for which there was some AMU. For that reason, we will have to perform some NA replacement after the merging of symptoms and amu.

> viparc <- symptoms %>% 
+   dplyr::left_join(amu, c("USUBJID", "FLOCKSEQUENCE", "WEEK" = "WEEKNO")) %>% 
+   dplyr::mutate_at(dplyr::vars(dplyr::ends_with("_use")), . %>% ifelse(is.na(.), FALSE, .)) %>% 
+   dplyr::mutate_at(dplyr::vars(dplyr::ends_with("_g")), . %>% ifelse(is.na(.), 0, .)) %>% 
+   dplyr::mutate_at(dplyr::vars(dplyr::ends_with("_g.kg")), . %>% ifelse(is.na(.), 0, .)) %>% 
+   dplyr::left_join(chickendata, c("USUBJID", "FLOCKSEQUENCE", "WEEK")) %>% 
+   dplyr::mutate_at(dplyr::vars(dplyr::ends_with("_g.kg")), ~ . / farm_weight_kg) %>% 
+   dplyr::left_join(samples, c("USUBJID", "FLOCKSEQUENCE", "WEEK")) %>% 
+   dplyr::mutate_at(dplyr::vars(FLOCKSEQUENCE, WEEK), as.integer) %>% 
+   dplyr::arrange(USUBJID, FLOCKSEQUENCE, WEEK)

Some of the completed flocks do not have 3 samplings:

> (tmp <- viparc %>% 
+   dplyr::select(USUBJID, FLOCKSEQUENCE, completed, sampling) %>% 
+   dplyr::group_by(USUBJID, FLOCKSEQUENCE) %>%
+   dplyr::summarise(completed = mean(completed), sampling = sum(sampling, na.rm = T)) %>% 
+   dplyr::ungroup() %>% 
+   dplyr::filter(completed > 0, sampling < 3))
# A tibble: 31 x 4
   USUBJID FLOCKSEQUENCE completed sampling
   <chr>           <int>     <dbl>    <int>
 1 75-003              1         1        2
 2 75-012              1         1        2
 3 75-012              4         1        2
 4 75-015              2         1        2
 5 75-016              2         1        2
 6 75-019              2         1        0
 7 75-022              2         1        2
 8 75-022              4         1        2
 9 75-029              3         1        0
10 75-029              4         1        2
# … with 21 more rows

These are likely to be cycles shorter than expected, as suggested by the following figure:

> hist2 <- function(x, ...) hist(x, breaks = 0:30, ...)
> 
> viparc %>%
+   dplyr::group_by(USUBJID, FLOCKSEQUENCE) %>%
+   dplyr::tally() %>%
+   dplyr::ungroup() %$%
+   hist2(n, col = "grey", xlab = "duration (weeks)", ylab = "number of flocks", main = NA)
> 
> tmp %>% 
+   dplyr::filter(sampling == 1) %$% 
+   purrr::map2(USUBJID,
+               FLOCKSEQUENCE,
+               function(x, y)
+                 dplyr::filter(viparc, USUBJID == x, FLOCKSEQUENCE == y) %>%
+                 nrow()
+   ) %>% 
+   unlist() %>% 
+   hist2(col = adjustcolor("red", .5), add = TRUE)
> 
> tmp %>% 
+   dplyr::filter(sampling == 2) %$% 
+   purrr::map2(USUBJID,
+               FLOCKSEQUENCE,
+               function(x, y)
+                 dplyr::filter(viparc, USUBJID == x, FLOCKSEQUENCE == y) %>%
+                 nrow()
+   ) %>% 
+   unlist() %>% 
+   hist2(col = adjustcolor("blue", .5), add = TRUE)
> 
> legend("left", c("1 sampling only", "2 samplings only"), fill = adjustcolor(c("red", "blue"), .5), bty = "n")

Writing to disk anyway:

> if (!dir.exists("data")) dir.create("data")
> write.csv(viparc, "data/viparc.csv", FALSE, row.names = FALSE)