Food poisoning Outbreak investigation using R: A case study approach

#An outbreak of gastroenteritis in Stegen, Germany, June 1998
#case study from https://www.reconlearn.org/post/stegen.html
#In this case study, we will take you through the analysis of this epidemic using R. This will be the occasion to illustrate more generally useful practices for data analysis using R, including how to:
#read and import data from Excel
#explore data using tables and summaries
#clean data
#make graphics to describe the data
#test which food items were associated with disease
#plot a very basic spatial overview of cases


#####load libraries####
#to find path of the file
require(here)|| install.packages(here)

## Loading required package: here

## here() starts at C:/Users/Gurpreet/Desktop/OUTBREAK INV/R/Food poisoning case study

## [1] TRUE

#to read excel sheets into R
library(readxl)
#to build epicurves
library(incidence)
#to clean labels in spreadsheet
library(epitrix)
#to clean data, plot graphs
library(tidyverse)

## -- Attaching packages ----------------------------------------- tidyverse 1.2.1 --

## v ggplot2 3.1.1       v purrr   0.3.2  
## v tibble  2.1.1       v dplyr   0.8.0.1
## v tidyr   0.8.3       v stringr 1.4.0  
## v readr   1.3.1       v forcats 0.4.0

## -- Conflicts -------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

#to calculate risk ratios
library(epitools)
#to read and work on shape files
library(sf)

## Linking to GEOS 3.6.1, GDAL 2.2.3, PROJ 4.9.3

#to demonstrate interactive maps
library(leaflet)

####loading excel sheet and cleaning data####

#set working directory
setwd("C:/Users/Gurpreet/Desktop/OUTBREAK INV/R/Food poisoning case study")
#Load data
data = read_excel("stegen_raw.xlsx")
#To know dimensions (number of rows and columns)
dim(data)

## [1] 291  23

#To look at column labels
names(data)

##  [1] "Unique key"  "ill"         "Date-onset"  "SEX"         "Age"        
##  [6] "tiramisu"    "tportion"    "wmousse"     "dmousse"     "mousse"     
## [11] "mportion"    "Beer"        "redjelly"    "Fruit salad" "tomato"     
## [16] "mince"       "salmon"      "horseradish" "chickenwin"  "roastbeef"  
## [21] "PORK"        "latitude"    "longitude"

#To see summary
summary(data)

##    Unique key         ill         Date-onset             SEX        
##  Min.   :  1.0   Min.   :0.000   Length:291         Min.   :0.0000  
##  1st Qu.: 73.5   1st Qu.:0.000   Class :character   1st Qu.:0.0000  
##  Median :146.0   Median :0.000   Mode  :character   Median :1.0000  
##  Mean   :146.0   Mean   :0.354                      Mean   :0.5223  
##  3rd Qu.:218.5   3rd Qu.:1.000                      3rd Qu.:1.0000  
##  Max.   :291.0   Max.   :1.000                      Max.   :1.0000  
##                                                                     
##       Age           tiramisu         tportion         wmousse      
##  Min.   :12.00   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:18.00   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :20.00   Median :0.0000   Median :0.0000   Median :0.0000  
##  Mean   :26.66   Mean   :0.4231   Mean   :0.6678   Mean   :0.2599  
##  3rd Qu.:27.00   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   :80.00   Max.   :1.0000   Max.   :3.0000   Max.   :1.0000  
##  NA's   :8       NA's   :5        NA's   :5        NA's   :14      
##     dmousse           mousse          mportion           Beer       
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.0000   Median :0.0000   Median :0.0000   Median :0.0000  
##  Mean   :0.3937   Mean   :0.4256   Mean   :0.6523   Mean   :0.3911  
##  3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :3.0000   Max.   :1.0000  
##  NA's   :4        NA's   :2        NA's   :12       NA's   :20      
##     redjelly       Fruit salad        tomato           mince      
##  Min.   :0.0000   Min.   :0.000   Min.   :0.0000   Min.   :0.000  
##  1st Qu.:0.0000   1st Qu.:0.000   1st Qu.:0.0000   1st Qu.:0.000  
##  Median :0.0000   Median :0.000   Median :0.0000   Median :0.000  
##  Mean   :0.2715   Mean   :0.244   Mean   :0.2852   Mean   :0.299  
##  3rd Qu.:1.0000   3rd Qu.:0.000   3rd Qu.:1.0000   3rd Qu.:1.000  
##  Max.   :1.0000   Max.   :1.000   Max.   :1.0000   Max.   :1.000  
##                                                                   
##      salmon        horseradish       chickenwin       roastbeef      
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.00000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.00000  
##  Median :0.0000   Median :0.0000   Median :0.0000   Median :0.00000  
##  Mean   :0.4811   Mean   :0.3093   Mean   :0.2887   Mean   :0.09966  
##  3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:0.00000  
##  Max.   :9.0000   Max.   :9.0000   Max.   :1.0000   Max.   :1.00000  
##                                                                      
##       PORK           latitude       longitude    
##  Min.   :0.0000   Min.   :47.98   Min.   :7.952  
##  1st Qu.:0.0000   1st Qu.:47.98   1st Qu.:7.959  
##  Median :0.0000   Median :47.98   Median :7.961  
##  Mean   :0.4742   Mean   :47.98   Mean   :7.962  
##  3rd Qu.:1.0000   3rd Qu.:47.98   3rd Qu.:7.965  
##  Max.   :9.0000   Max.   :47.99   Max.   :7.974  
##                   NA's   :160     NA's   :160

#to see structure of dataset
str(data)

## Classes 'tbl_df', 'tbl' and 'data.frame':    291 obs. of  23 variables:
##  $ Unique key : num  210 12 288 186 20 148 201 106 272 50 ...
##  $ ill        : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ Date-onset : chr  "1998-06-27" "1998-06-27" "1998-06-27" "1998-06-27" ...
##  $ SEX        : num  1 0 1 0 1 0 0 0 1 0 ...
##  $ Age        : num  18 57 56 17 19 16 19 19 40 53 ...
##  $ tiramisu   : num  1 1 0 1 1 1 1 1 1 1 ...
##  $ tportion   : num  3 1 0 1 2 2 3 2 2 1 ...
##  $ wmousse    : num  0 0 0 1 0 1 0 1 1 1 ...
##  $ dmousse    : num  1 1 0 0 0 1 1 1 1 1 ...
##  $ mousse     : num  1 1 0 1 0 1 1 1 1 1 ...
##  $ mportion   : num  1 1 0 NA 0 1 1 1 2 1 ...
##  $ Beer       : num  0 0 0 0 1 0 0 0 1 0 ...
##  $ redjelly   : num  0 0 0 1 0 0 0 1 0 1 ...
##  $ Fruit salad: num  0 1 0 0 0 1 1 1 0 0 ...
##  $ tomato     : num  0 0 1 0 0 0 0 0 1 0 ...
##  $ mince      : num  0 1 1 0 0 1 0 0 0 0 ...
##  $ salmon     : num  0 1 1 9 0 1 0 0 1 1 ...
##  $ horseradish: num  0 1 0 0 0 0 0 1 0 1 ...
##  $ chickenwin : num  0 0 0 0 0 1 0 1 0 1 ...
##  $ roastbeef  : num  0 0 0 0 0 0 0 0 1 0 ...
##  $ PORK       : num  1 0 0 9 0 0 0 0 0 0 ...
##  $ latitude   : num  48 48 48 48 48 ...
##  $ longitude  : num  7.96 7.96 7.96 7.96 7.97 ...

#Now, there are issues in the dataset: 
#1. the label names are messy, they are not standard
#2. Variable types are wrong..Unique key should be character, dateofonset should be date, illness and sex should be factor
#3. Labels are ambigous.. sex should be male/female, illness should be ill/healthy
#4. biniary variables such as consumption of pork, salmon and horseradish have max of 9??
#5. there are missing values

#Its very tempting to reopen excel and clean the dataset. However, 
#NOTE: It is almost always quicker and more reliable to clean data using R

#make a copy of the old dataset before further action
data_old = data
#To generate standard lable names
new_labels = clean_labels(names(data))
#To check the new labels
new_labels

##  [1] "unique_key"  "ill"         "date_onset"  "sex"         "age"        
##  [6] "tiramisu"    "tportion"    "wmousse"     "dmousse"     "mousse"     
## [11] "mportion"    "beer"        "redjelly"    "fruit_salad" "tomato"     
## [16] "mince"       "salmon"      "horseradish" "chickenwin"  "roastbeef"  
## [21] "pork"        "latitude"    "longitude"

#To change labels in the dataset
names(data) = new_labels
#to make unique key as character
data$unique_key = as.character(data$unique_key)
#to make sex as factor variable
data$sex = factor(data$sex)
#to make illness as factor variable
data$ill = factor(data$ill)
#to make date of onset as date
data$date_onset = as.Date(data$date_onset)
#to recode sex as male and female
data$sex  = recode_factor(data$sex, "0" = "male", "1" = "female")
#to recode illness as ill and healthy
data$ill = recode_factor(data$ill, "0" = "healthy", "1" = "ill")
#biniary variables with 9 as a value can be due to faulty data entry..
#to clean data, we will consider them as missing values
#to convert 9 as missing value
data$pork [data$pork == 9] = NA
data$salmon [ data$salmon == 9] = NA
data$horseradish [data$horseradish == 9] = NA
#to confirm that 9 has been converted, lets use table function and see
table(data$pork)

## 
##   0   1 
## 169 120

table(data$salmon)

## 
##   0   1 
## 183 104

table(data$horseradish)

## 
##   0   1 
## 217  72

#Once data has been cleaned, its a good idea to save the cleaned file
#write.csv(data, file = "clean data")

####Performing descriptive analytics####

#Tabular presentation of data
#to know age distribution
summary(data$age)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   12.00   18.00   20.00   26.66   27.00   80.00       8

#To know sex distribution
summary(data$sex)

##   male female 
##    139    152

#To know age distribution by gender
#use function tapply(vector, index, function)
tapply(data$age, data$sex, FUN = "summary")

## $male
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   12.00   17.50   19.00   26.38   28.00   80.00       4 
## 
## $female
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   13.00   18.00   20.00   26.93   24.50   65.00       4

#Graphical presentation of data using ggplot2 package
#example: to see age distribution as per sex
ggplot(data) + geom_histogram(aes(x = age, fill = sex), binwidth = 10)

## Warning: Removed 8 rows containing non-finite values (stat_bin).

#to see time distribution (making epidemic curve)
epi <- incidence(data$date_onset)

## 160 missing observations were removed.

plot(epi)

#there are observations beyond single incubation period plotted.. so to make epidemic curve 
#of only ill cases
epi_cases <- incidence(data$date_onset, group = data$ill)

## 160 missing observations were removed.

as.data.frame(epi_cases)

##         dates healthy ill
## 1  1998-06-26       1   0
## 2  1998-06-27       8  48
## 3  1998-06-28       5  46
## 4  1998-06-29       2   8
## 5  1998-06-30       3   0
## 6  1998-07-01       3   0
## 7  1998-07-02       3   0
## 8  1998-07-03       0   0
## 9  1998-07-04       1   0
## 10 1998-07-05       1   0
## 11 1998-07-06       1   0
## 12 1998-07-07       0   0
## 13 1998-07-08       0   0
## 14 1998-07-09       1   0

plot(epi_cases, color = c("healthy" = "blue", "ill" = "red"))

#age and gender distribution of the cases
ggplot(data) + 
  geom_histogram(aes(x = age, fill = ill), binwidth = 1) +
  scale_fill_manual("Illness", values = c("healthy" = "blue", "ill" = "red")) +
  facet_wrap(~sex, ncol = 1) + # stratify the sex into a single column of panels
  labs(title = "Cases by age and gender") + 
  theme_light()

## Warning: Removed 8 rows containing non-finite values (stat_bin).

####looking for the most likely food item which led to disease####
#reading clean data
clean_data = read.csv("clean data")
#to see data head 
head(clean_data)

##   X unique_key ill date_onset    sex age tiramisu tportion wmousse dmousse
## 1 1        210 ill 1998-06-27 female  18        1        3       0       1
## 2 2         12 ill 1998-06-27   male  57        1        1       0       1
## 3 3        288 ill 1998-06-27 female  56        0        0       0       0
## 4 4        186 ill 1998-06-27   male  17        1        1       1       0
## 5 5         20 ill 1998-06-27 female  19        1        2       0       0
## 6 6        148 ill 1998-06-27   male  16        1        2       1       1
##   mousse mportion beer redjelly fruit_salad tomato mince salmon
## 1      1        1    0        0           0      0     0      0
## 2      1        1    0        0           1      0     1      1
## 3      0        0    0        0           0      1     1      1
## 4      1       NA    0        1           0      0     0     NA
## 5      0        0    1        0           0      0     0      0
## 6      1        1    0        0           1      0     1      1
##   horseradish chickenwin roastbeef pork latitude longitude
## 1           0          0         0    1 47.97743  7.962910
## 2           1          0         0    0 47.98074  7.957386
## 3           0          0         0    0 47.98150  7.959177
## 4           0          0         0   NA 47.98255  7.958295
## 5           0          0         0    0 47.98497  7.971787
## 6           0          1         0    0 47.98440  7.959775

#to see names
names(clean_data)

##  [1] "X"           "unique_key"  "ill"         "date_onset"  "sex"        
##  [6] "age"         "tiramisu"    "tportion"    "wmousse"     "dmousse"    
## [11] "mousse"      "mportion"    "beer"        "redjelly"    "fruit_salad"
## [16] "tomato"      "mince"       "salmon"      "horseradish" "chickenwin" 
## [21] "roastbeef"   "pork"        "latitude"    "longitude"

#isolate the variables to test
#in this case we need food items from 7 to 22 excluding tportion and mportion which are not biniary
#creating a subset
food = c('tiramisu', 'wmousse', 'dmousse', 'mousse', 'beer', 'redjelly',
         'fruit_salad', 'tomato', 'mince', 'salmon', 'horseradish',
         'chickenwin', 'roastbeef', 'pork')
food

##  [1] "tiramisu"    "wmousse"     "dmousse"     "mousse"      "beer"       
##  [6] "redjelly"    "fruit_salad" "tomato"      "mince"       "salmon"     
## [11] "horseradish" "chickenwin"  "roastbeef"   "pork"

#to see the properties of vector subset created
head(clean_data[food])

##   tiramisu wmousse dmousse mousse beer redjelly fruit_salad tomato mince
## 1        1       0       1      1    0        0           0      0     0
## 2        1       0       1      1    0        0           1      0     1
## 3        0       0       0      0    0        0           0      1     1
## 4        1       1       0      1    0        1           0      0     0
## 5        1       0       0      0    1        0           0      0     0
## 6        1       1       1      1    0        0           1      0     1
##   salmon horseradish chickenwin roastbeef pork
## 1      0           0          0         0    1
## 2      1           1          0         0    0
## 3      1           0          0         0    0
## 4     NA           0          0         0   NA
## 5      0           0          0         0    0
## 6      1           0          1         0    0

#to calculate risk ratio.. first create a table and then calculate measures
#first step.. creating table
pork_table = epitable(clean_data$pork, clean_data$ill)
pork_table

##          Outcome
## Predictor healthy ill
##         0     115  54
##         1      72  48

#second step.. calculating risk ratio
#calculating risk ratio
pork_rr = riskratio(pork_table, correct = TRUE, method = "wald")
# calculating CI of RR  for pork consumption
pork_est_ci <- pork_rr$measure[2, ]
# calculating p value using chi square test
pork_p <- pork_rr$p.value[2, "fisher.exact"]
# calculating CI for RR as well as p value in a single line
res <- data.frame(estimate = pork_est_ci["estimate"],
                  lower    = pork_est_ci["lower"],
                  upper    = pork_est_ci["upper"],
                  p.value  = pork_p
)
res

##          estimate     lower    upper   p.value
## estimate 1.251852 0.9176849 1.707703 0.1708777

#hurray! we have found risk estimates for a single item.. now we have two options
#option 01.. calcualte risk estimates for each food item separately
#option 02.. create a program and get risk estimates for each food item
#.. i choose option 02
#so, to create a program, we need ingrediets.. in this case food item is exposure ingridient and being ill or not is outcome ingredient
single_risk_ratio = function(exposure, outcome) { # ingredients defined here
  et  = epitable(exposure, outcome) # ingredients used here
  rr  = riskratio(et)
  estimate = rr$measure[2, ]
  res = data.frame(estimate = estimate["estimate"],
                    lower    = estimate["lower"],
                    upper    = estimate["upper"],
                    p.value  = rr$p.value[2, "fisher.exact"]
  )
  return(res) # return the data frame
}
#now, RR for pork
pork_rr = single_risk_ratio(exposure = clean_data$pork, outcome = clean_data$ill)
pork_rr

##          estimate     lower    upper   p.value
## estimate 1.251852 0.9176849 1.707703 0.1708777

#RR for fruit
fruit_rr = single_risk_ratio(exposure = clean_data$fruit_salad, outcome = clean_data$ill)
fruit_rr

##          estimate    lower    upper      p.value
## estimate 2.500618 1.886773 3.314171 9.998203e-09

#Great, we can calculate risk ratios for each item now
#still, we can have errors, hence, we can use command to calculate for all in one go
all_rr <- lapply(clean_data[food], FUN = single_risk_ratio, outcome = clean_data$ill)
head(all_rr)

## $tiramisu
##          estimate    lower    upper      p.value
## estimate 18.31169 8.814202 38.04291 1.794084e-41
## 
## $wmousse
##          estimate    lower    upper      p.value
## estimate 2.847222 2.128267 3.809049 5.825494e-11
## 
## $dmousse
##          estimate    lower    upper      p.value
## estimate 4.501021 3.086945 6.562862 1.167009e-19
## 
## $mousse
##          estimate    lower    upper      p.value
## estimate 4.968958 3.299403 7.483336 1.257341e-20
## 
## $beer
##           estimate     lower   upper    p.value
## estimate 0.6767842 0.4757688 0.96273 0.02806394
## 
## $redjelly
##          estimate    lower    upper      p.value
## estimate  2.08206 1.555917 2.786123 4.415074e-06

#now, we can reframe the result into a dataframe
all_food_df <- bind_rows(all_rr, .id = "exposure")
all_food_df

##       exposure   estimate     lower     upper      p.value
## 1     tiramisu 18.3116883 8.8142022 38.042913 1.794084e-41
## 2      wmousse  2.8472222 2.1282671  3.809049 5.825494e-11
## 3      dmousse  4.5010211 3.0869446  6.562862 1.167009e-19
## 4       mousse  4.9689579 3.2994031  7.483336 1.257341e-20
## 5         beer  0.6767842 0.4757688  0.962730 2.806394e-02
## 6     redjelly  2.0820602 1.5559166  2.786123 4.415074e-06
## 7  fruit_salad  2.5006177 1.8867735  3.314171 9.998203e-09
## 8       tomato  1.2898653 0.9379601  1.773799 1.368934e-01
## 9        mince  1.0568237 0.7571858  1.475036 7.893882e-01
## 10      salmon  1.0334249 0.7452531  1.433026 8.976422e-01
## 11 horseradish  1.2557870 0.9008457  1.750578 2.026013e-01
## 12  chickenwin  1.1617347 0.8376217  1.611261 4.176598e-01
## 13   roastbeef  0.7607985 0.4129094  1.401795 4.172932e-01
## 14        pork  1.2518519 0.9176849  1.707703 1.708777e-01

#to arrange it into descending order as per risk
all_food_df <- arrange(all_food_df, desc(estimate))
all_food_df

##       exposure   estimate     lower     upper      p.value
## 1     tiramisu 18.3116883 8.8142022 38.042913 1.794084e-41
## 2       mousse  4.9689579 3.2994031  7.483336 1.257341e-20
## 3      dmousse  4.5010211 3.0869446  6.562862 1.167009e-19
## 4      wmousse  2.8472222 2.1282671  3.809049 5.825494e-11
## 5  fruit_salad  2.5006177 1.8867735  3.314171 9.998203e-09
## 6     redjelly  2.0820602 1.5559166  2.786123 4.415074e-06
## 7       tomato  1.2898653 0.9379601  1.773799 1.368934e-01
## 8  horseradish  1.2557870 0.9008457  1.750578 2.026013e-01
## 9         pork  1.2518519 0.9176849  1.707703 1.708777e-01
## 10  chickenwin  1.1617347 0.8376217  1.611261 4.176598e-01
## 11       mince  1.0568237 0.7571858  1.475036 7.893882e-01
## 12      salmon  1.0334249 0.7452531  1.433026 8.976422e-01
## 13   roastbeef  0.7607985 0.4129094  1.401795 4.172932e-01
## 14        beer  0.6767842 0.4757688  0.962730 2.806394e-02

#now, since we have results in tabular form, it will be great to have them in graphical form for better understanding
# first, make sure the exposures are factored in the right order
all_food_df$exposure <- factor(all_food_df$exposure, unique(all_food_df$exposure))
# plot
p <- ggplot(all_food_df, aes(x = estimate, y = exposure, color = p.value)) +
  geom_point() +
  geom_errorbarh(aes(xmin = lower, xmax = upper)) +
  geom_vline(xintercept = 1, linetype = 2) + 
  scale_x_log10() + 
  scale_color_viridis_c(trans = "log10") + 
  labs(x = "Risk Ratio (log scale)", 
       y = "Exposure",
       title = "Risk Ratio for gastroenteritis in Stegen, Germany")
p

####spatial overview####
#reading the shp file
setwd("C:/Users/Gurpreet/Desktop/OUTBREAK INV/R/Food poisoning case study/stegen-map")
shp <- read_sf("stegen_households.shp")
#plotting map
ggplot(clean_data) +
  geom_sf(data = shp) +
  geom_point(aes(x = longitude, y = latitude, color = ill)) + 
  scale_color_manual("Illness", values = c("healthy" = "blue", "ill" = "red"))

## Warning: Removed 160 rows containing missing values (geom_point).

##interactive maps
stegen_sub <- clean_data[!is.na(clean_data$longitude), ]
# create the map
lmap <- leaflet()
# add open street map tiles
lmap <- addTiles(lmap)
# set the coordinates for Stegen
lmap <- setView(lmap, lng = 7.963, lat = 47.982, zoom = 15)
# Add the shapefile
lmap <- addPolygons(lmap, data = st_transform(shp, '+proj=longlat +ellps=GRS80'))

## Warning: sf layer has inconsistent datum (+proj=longlat +ellps=GRS80 +no_defs).
## Need '+proj=longlat +datum=WGS84'

# Add the cases
lmap <- addCircleMarkers(lmap, 
                         label = ~ill, 
                         color = ~ifelse(ill == "healthy", "blue", "red"), 
                         stroke = FALSE,
                         fillOpacity = 0.8,
                         data = stegen_sub)

## Assuming "longitude" and "latitude" are longitude and latitude, respectively

# show the map
lmap

####acknowledgement####
#This case study was first designed by Alain Moren and Gilles Desve, EPIET. It is based on an real outbreak investigation conducted by Anja Hauri, RKI, Berlin, 1998.

Food poisoning Outbreak investigation using R: A case study approach

Dr Gurpreet Singh, Dr Biju Soman

26/08/2019