#An outbreak of gastroenteritis in Stegen, Germany, June 1998
#case study from https://www.reconlearn.org/post/stegen.html
#In this case study, we will take you through the analysis of this epidemic using R. This will be the occasion to illustrate more generally useful practices for data analysis using R, including how to:
#read and import data from Excel
#explore data using tables and summaries
#clean data
#make graphics to describe the data
#test which food items were associated with disease
#plot a very basic spatial overview of cases
#####load libraries####
#to find path of the file
require(here)|| install.packages(here)
## Loading required package: here
## here() starts at C:/Users/Gurpreet/Desktop/OUTBREAK INV/R/Food poisoning case study
## [1] TRUE
#to read excel sheets into R
library(readxl)
#to build epicurves
library(incidence)
#to clean labels in spreadsheet
library(epitrix)
#to clean data, plot graphs
library(tidyverse)
## -- Attaching packages ----------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.1.1 v purrr 0.3.2
## v tibble 2.1.1 v dplyr 0.8.0.1
## v tidyr 0.8.3 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## -- Conflicts -------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
#to calculate risk ratios
library(epitools)
#to read and work on shape files
library(sf)
## Linking to GEOS 3.6.1, GDAL 2.2.3, PROJ 4.9.3
#to demonstrate interactive maps
library(leaflet)
####loading excel sheet and cleaning data####
#set working directory
setwd("C:/Users/Gurpreet/Desktop/OUTBREAK INV/R/Food poisoning case study")
#Load data
data = read_excel("stegen_raw.xlsx")
#To know dimensions (number of rows and columns)
dim(data)
## [1] 291 23
#To look at column labels
names(data)
## [1] "Unique key" "ill" "Date-onset" "SEX" "Age"
## [6] "tiramisu" "tportion" "wmousse" "dmousse" "mousse"
## [11] "mportion" "Beer" "redjelly" "Fruit salad" "tomato"
## [16] "mince" "salmon" "horseradish" "chickenwin" "roastbeef"
## [21] "PORK" "latitude" "longitude"
#To see summary
summary(data)
## Unique key ill Date-onset SEX
## Min. : 1.0 Min. :0.000 Length:291 Min. :0.0000
## 1st Qu.: 73.5 1st Qu.:0.000 Class :character 1st Qu.:0.0000
## Median :146.0 Median :0.000 Mode :character Median :1.0000
## Mean :146.0 Mean :0.354 Mean :0.5223
## 3rd Qu.:218.5 3rd Qu.:1.000 3rd Qu.:1.0000
## Max. :291.0 Max. :1.000 Max. :1.0000
##
## Age tiramisu tportion wmousse
## Min. :12.00 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:18.00 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :20.00 Median :0.0000 Median :0.0000 Median :0.0000
## Mean :26.66 Mean :0.4231 Mean :0.6678 Mean :0.2599
## 3rd Qu.:27.00 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :80.00 Max. :1.0000 Max. :3.0000 Max. :1.0000
## NA's :8 NA's :5 NA's :5 NA's :14
## dmousse mousse mportion Beer
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median :0.0000 Median :0.0000 Median :0.0000
## Mean :0.3937 Mean :0.4256 Mean :0.6523 Mean :0.3911
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000 Max. :3.0000 Max. :1.0000
## NA's :4 NA's :2 NA's :12 NA's :20
## redjelly Fruit salad tomato mince
## Min. :0.0000 Min. :0.000 Min. :0.0000 Min. :0.000
## 1st Qu.:0.0000 1st Qu.:0.000 1st Qu.:0.0000 1st Qu.:0.000
## Median :0.0000 Median :0.000 Median :0.0000 Median :0.000
## Mean :0.2715 Mean :0.244 Mean :0.2852 Mean :0.299
## 3rd Qu.:1.0000 3rd Qu.:0.000 3rd Qu.:1.0000 3rd Qu.:1.000
## Max. :1.0000 Max. :1.000 Max. :1.0000 Max. :1.000
##
## salmon horseradish chickenwin roastbeef
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.00000
## Median :0.0000 Median :0.0000 Median :0.0000 Median :0.00000
## Mean :0.4811 Mean :0.3093 Mean :0.2887 Mean :0.09966
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:0.00000
## Max. :9.0000 Max. :9.0000 Max. :1.0000 Max. :1.00000
##
## PORK latitude longitude
## Min. :0.0000 Min. :47.98 Min. :7.952
## 1st Qu.:0.0000 1st Qu.:47.98 1st Qu.:7.959
## Median :0.0000 Median :47.98 Median :7.961
## Mean :0.4742 Mean :47.98 Mean :7.962
## 3rd Qu.:1.0000 3rd Qu.:47.98 3rd Qu.:7.965
## Max. :9.0000 Max. :47.99 Max. :7.974
## NA's :160 NA's :160
#to see structure of dataset
str(data)
## Classes 'tbl_df', 'tbl' and 'data.frame': 291 obs. of 23 variables:
## $ Unique key : num 210 12 288 186 20 148 201 106 272 50 ...
## $ ill : num 1 1 1 1 1 1 1 1 1 1 ...
## $ Date-onset : chr "1998-06-27" "1998-06-27" "1998-06-27" "1998-06-27" ...
## $ SEX : num 1 0 1 0 1 0 0 0 1 0 ...
## $ Age : num 18 57 56 17 19 16 19 19 40 53 ...
## $ tiramisu : num 1 1 0 1 1 1 1 1 1 1 ...
## $ tportion : num 3 1 0 1 2 2 3 2 2 1 ...
## $ wmousse : num 0 0 0 1 0 1 0 1 1 1 ...
## $ dmousse : num 1 1 0 0 0 1 1 1 1 1 ...
## $ mousse : num 1 1 0 1 0 1 1 1 1 1 ...
## $ mportion : num 1 1 0 NA 0 1 1 1 2 1 ...
## $ Beer : num 0 0 0 0 1 0 0 0 1 0 ...
## $ redjelly : num 0 0 0 1 0 0 0 1 0 1 ...
## $ Fruit salad: num 0 1 0 0 0 1 1 1 0 0 ...
## $ tomato : num 0 0 1 0 0 0 0 0 1 0 ...
## $ mince : num 0 1 1 0 0 1 0 0 0 0 ...
## $ salmon : num 0 1 1 9 0 1 0 0 1 1 ...
## $ horseradish: num 0 1 0 0 0 0 0 1 0 1 ...
## $ chickenwin : num 0 0 0 0 0 1 0 1 0 1 ...
## $ roastbeef : num 0 0 0 0 0 0 0 0 1 0 ...
## $ PORK : num 1 0 0 9 0 0 0 0 0 0 ...
## $ latitude : num 48 48 48 48 48 ...
## $ longitude : num 7.96 7.96 7.96 7.96 7.97 ...
#Now, there are issues in the dataset:
#1. the label names are messy, they are not standard
#2. Variable types are wrong..Unique key should be character, dateofonset should be date, illness and sex should be factor
#3. Labels are ambigous.. sex should be male/female, illness should be ill/healthy
#4. biniary variables such as consumption of pork, salmon and horseradish have max of 9??
#5. there are missing values
#Its very tempting to reopen excel and clean the dataset. However,
#NOTE: It is almost always quicker and more reliable to clean data using R
#make a copy of the old dataset before further action
data_old = data
#To generate standard lable names
new_labels = clean_labels(names(data))
#To check the new labels
new_labels
## [1] "unique_key" "ill" "date_onset" "sex" "age"
## [6] "tiramisu" "tportion" "wmousse" "dmousse" "mousse"
## [11] "mportion" "beer" "redjelly" "fruit_salad" "tomato"
## [16] "mince" "salmon" "horseradish" "chickenwin" "roastbeef"
## [21] "pork" "latitude" "longitude"
#To change labels in the dataset
names(data) = new_labels
#to make unique key as character
data$unique_key = as.character(data$unique_key)
#to make sex as factor variable
data$sex = factor(data$sex)
#to make illness as factor variable
data$ill = factor(data$ill)
#to make date of onset as date
data$date_onset = as.Date(data$date_onset)
#to recode sex as male and female
data$sex = recode_factor(data$sex, "0" = "male", "1" = "female")
#to recode illness as ill and healthy
data$ill = recode_factor(data$ill, "0" = "healthy", "1" = "ill")
#biniary variables with 9 as a value can be due to faulty data entry..
#to clean data, we will consider them as missing values
#to convert 9 as missing value
data$pork [data$pork == 9] = NA
data$salmon [ data$salmon == 9] = NA
data$horseradish [data$horseradish == 9] = NA
#to confirm that 9 has been converted, lets use table function and see
table(data$pork)
##
## 0 1
## 169 120
table(data$salmon)
##
## 0 1
## 183 104
table(data$horseradish)
##
## 0 1
## 217 72
#Once data has been cleaned, its a good idea to save the cleaned file
#write.csv(data, file = "clean data")
####Performing descriptive analytics####
#Tabular presentation of data
#to know age distribution
summary(data$age)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 12.00 18.00 20.00 26.66 27.00 80.00 8
#To know sex distribution
summary(data$sex)
## male female
## 139 152
#To know age distribution by gender
#use function tapply(vector, index, function)
tapply(data$age, data$sex, FUN = "summary")
## $male
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 12.00 17.50 19.00 26.38 28.00 80.00 4
##
## $female
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 13.00 18.00 20.00 26.93 24.50 65.00 4
#Graphical presentation of data using ggplot2 package
#example: to see age distribution as per sex
ggplot(data) + geom_histogram(aes(x = age, fill = sex), binwidth = 10)
## Warning: Removed 8 rows containing non-finite values (stat_bin).

#to see time distribution (making epidemic curve)
epi <- incidence(data$date_onset)
## 160 missing observations were removed.
plot(epi)

#there are observations beyond single incubation period plotted.. so to make epidemic curve
#of only ill cases
epi_cases <- incidence(data$date_onset, group = data$ill)
## 160 missing observations were removed.
as.data.frame(epi_cases)
## dates healthy ill
## 1 1998-06-26 1 0
## 2 1998-06-27 8 48
## 3 1998-06-28 5 46
## 4 1998-06-29 2 8
## 5 1998-06-30 3 0
## 6 1998-07-01 3 0
## 7 1998-07-02 3 0
## 8 1998-07-03 0 0
## 9 1998-07-04 1 0
## 10 1998-07-05 1 0
## 11 1998-07-06 1 0
## 12 1998-07-07 0 0
## 13 1998-07-08 0 0
## 14 1998-07-09 1 0
plot(epi_cases, color = c("healthy" = "blue", "ill" = "red"))

#age and gender distribution of the cases
ggplot(data) +
geom_histogram(aes(x = age, fill = ill), binwidth = 1) +
scale_fill_manual("Illness", values = c("healthy" = "blue", "ill" = "red")) +
facet_wrap(~sex, ncol = 1) + # stratify the sex into a single column of panels
labs(title = "Cases by age and gender") +
theme_light()
## Warning: Removed 8 rows containing non-finite values (stat_bin).

####looking for the most likely food item which led to disease####
#reading clean data
clean_data = read.csv("clean data")
#to see data head
head(clean_data)
## X unique_key ill date_onset sex age tiramisu tportion wmousse dmousse
## 1 1 210 ill 1998-06-27 female 18 1 3 0 1
## 2 2 12 ill 1998-06-27 male 57 1 1 0 1
## 3 3 288 ill 1998-06-27 female 56 0 0 0 0
## 4 4 186 ill 1998-06-27 male 17 1 1 1 0
## 5 5 20 ill 1998-06-27 female 19 1 2 0 0
## 6 6 148 ill 1998-06-27 male 16 1 2 1 1
## mousse mportion beer redjelly fruit_salad tomato mince salmon
## 1 1 1 0 0 0 0 0 0
## 2 1 1 0 0 1 0 1 1
## 3 0 0 0 0 0 1 1 1
## 4 1 NA 0 1 0 0 0 NA
## 5 0 0 1 0 0 0 0 0
## 6 1 1 0 0 1 0 1 1
## horseradish chickenwin roastbeef pork latitude longitude
## 1 0 0 0 1 47.97743 7.962910
## 2 1 0 0 0 47.98074 7.957386
## 3 0 0 0 0 47.98150 7.959177
## 4 0 0 0 NA 47.98255 7.958295
## 5 0 0 0 0 47.98497 7.971787
## 6 0 1 0 0 47.98440 7.959775
#to see names
names(clean_data)
## [1] "X" "unique_key" "ill" "date_onset" "sex"
## [6] "age" "tiramisu" "tportion" "wmousse" "dmousse"
## [11] "mousse" "mportion" "beer" "redjelly" "fruit_salad"
## [16] "tomato" "mince" "salmon" "horseradish" "chickenwin"
## [21] "roastbeef" "pork" "latitude" "longitude"
#isolate the variables to test
#in this case we need food items from 7 to 22 excluding tportion and mportion which are not biniary
#creating a subset
food = c('tiramisu', 'wmousse', 'dmousse', 'mousse', 'beer', 'redjelly',
'fruit_salad', 'tomato', 'mince', 'salmon', 'horseradish',
'chickenwin', 'roastbeef', 'pork')
food
## [1] "tiramisu" "wmousse" "dmousse" "mousse" "beer"
## [6] "redjelly" "fruit_salad" "tomato" "mince" "salmon"
## [11] "horseradish" "chickenwin" "roastbeef" "pork"
#to see the properties of vector subset created
head(clean_data[food])
## tiramisu wmousse dmousse mousse beer redjelly fruit_salad tomato mince
## 1 1 0 1 1 0 0 0 0 0
## 2 1 0 1 1 0 0 1 0 1
## 3 0 0 0 0 0 0 0 1 1
## 4 1 1 0 1 0 1 0 0 0
## 5 1 0 0 0 1 0 0 0 0
## 6 1 1 1 1 0 0 1 0 1
## salmon horseradish chickenwin roastbeef pork
## 1 0 0 0 0 1
## 2 1 1 0 0 0
## 3 1 0 0 0 0
## 4 NA 0 0 0 NA
## 5 0 0 0 0 0
## 6 1 0 1 0 0
#to calculate risk ratio.. first create a table and then calculate measures
#first step.. creating table
pork_table = epitable(clean_data$pork, clean_data$ill)
pork_table
## Outcome
## Predictor healthy ill
## 0 115 54
## 1 72 48
#second step.. calculating risk ratio
#calculating risk ratio
pork_rr = riskratio(pork_table, correct = TRUE, method = "wald")
# calculating CI of RR for pork consumption
pork_est_ci <- pork_rr$measure[2, ]
# calculating p value using chi square test
pork_p <- pork_rr$p.value[2, "fisher.exact"]
# calculating CI for RR as well as p value in a single line
res <- data.frame(estimate = pork_est_ci["estimate"],
lower = pork_est_ci["lower"],
upper = pork_est_ci["upper"],
p.value = pork_p
)
res
## estimate lower upper p.value
## estimate 1.251852 0.9176849 1.707703 0.1708777
#hurray! we have found risk estimates for a single item.. now we have two options
#option 01.. calcualte risk estimates for each food item separately
#option 02.. create a program and get risk estimates for each food item
#.. i choose option 02
#so, to create a program, we need ingrediets.. in this case food item is exposure ingridient and being ill or not is outcome ingredient
single_risk_ratio = function(exposure, outcome) { # ingredients defined here
et = epitable(exposure, outcome) # ingredients used here
rr = riskratio(et)
estimate = rr$measure[2, ]
res = data.frame(estimate = estimate["estimate"],
lower = estimate["lower"],
upper = estimate["upper"],
p.value = rr$p.value[2, "fisher.exact"]
)
return(res) # return the data frame
}
#now, RR for pork
pork_rr = single_risk_ratio(exposure = clean_data$pork, outcome = clean_data$ill)
pork_rr
## estimate lower upper p.value
## estimate 1.251852 0.9176849 1.707703 0.1708777
#RR for fruit
fruit_rr = single_risk_ratio(exposure = clean_data$fruit_salad, outcome = clean_data$ill)
fruit_rr
## estimate lower upper p.value
## estimate 2.500618 1.886773 3.314171 9.998203e-09
#Great, we can calculate risk ratios for each item now
#still, we can have errors, hence, we can use command to calculate for all in one go
all_rr <- lapply(clean_data[food], FUN = single_risk_ratio, outcome = clean_data$ill)
head(all_rr)
## $tiramisu
## estimate lower upper p.value
## estimate 18.31169 8.814202 38.04291 1.794084e-41
##
## $wmousse
## estimate lower upper p.value
## estimate 2.847222 2.128267 3.809049 5.825494e-11
##
## $dmousse
## estimate lower upper p.value
## estimate 4.501021 3.086945 6.562862 1.167009e-19
##
## $mousse
## estimate lower upper p.value
## estimate 4.968958 3.299403 7.483336 1.257341e-20
##
## $beer
## estimate lower upper p.value
## estimate 0.6767842 0.4757688 0.96273 0.02806394
##
## $redjelly
## estimate lower upper p.value
## estimate 2.08206 1.555917 2.786123 4.415074e-06
#now, we can reframe the result into a dataframe
all_food_df <- bind_rows(all_rr, .id = "exposure")
all_food_df
## exposure estimate lower upper p.value
## 1 tiramisu 18.3116883 8.8142022 38.042913 1.794084e-41
## 2 wmousse 2.8472222 2.1282671 3.809049 5.825494e-11
## 3 dmousse 4.5010211 3.0869446 6.562862 1.167009e-19
## 4 mousse 4.9689579 3.2994031 7.483336 1.257341e-20
## 5 beer 0.6767842 0.4757688 0.962730 2.806394e-02
## 6 redjelly 2.0820602 1.5559166 2.786123 4.415074e-06
## 7 fruit_salad 2.5006177 1.8867735 3.314171 9.998203e-09
## 8 tomato 1.2898653 0.9379601 1.773799 1.368934e-01
## 9 mince 1.0568237 0.7571858 1.475036 7.893882e-01
## 10 salmon 1.0334249 0.7452531 1.433026 8.976422e-01
## 11 horseradish 1.2557870 0.9008457 1.750578 2.026013e-01
## 12 chickenwin 1.1617347 0.8376217 1.611261 4.176598e-01
## 13 roastbeef 0.7607985 0.4129094 1.401795 4.172932e-01
## 14 pork 1.2518519 0.9176849 1.707703 1.708777e-01
#to arrange it into descending order as per risk
all_food_df <- arrange(all_food_df, desc(estimate))
all_food_df
## exposure estimate lower upper p.value
## 1 tiramisu 18.3116883 8.8142022 38.042913 1.794084e-41
## 2 mousse 4.9689579 3.2994031 7.483336 1.257341e-20
## 3 dmousse 4.5010211 3.0869446 6.562862 1.167009e-19
## 4 wmousse 2.8472222 2.1282671 3.809049 5.825494e-11
## 5 fruit_salad 2.5006177 1.8867735 3.314171 9.998203e-09
## 6 redjelly 2.0820602 1.5559166 2.786123 4.415074e-06
## 7 tomato 1.2898653 0.9379601 1.773799 1.368934e-01
## 8 horseradish 1.2557870 0.9008457 1.750578 2.026013e-01
## 9 pork 1.2518519 0.9176849 1.707703 1.708777e-01
## 10 chickenwin 1.1617347 0.8376217 1.611261 4.176598e-01
## 11 mince 1.0568237 0.7571858 1.475036 7.893882e-01
## 12 salmon 1.0334249 0.7452531 1.433026 8.976422e-01
## 13 roastbeef 0.7607985 0.4129094 1.401795 4.172932e-01
## 14 beer 0.6767842 0.4757688 0.962730 2.806394e-02
#now, since we have results in tabular form, it will be great to have them in graphical form for better understanding
# first, make sure the exposures are factored in the right order
all_food_df$exposure <- factor(all_food_df$exposure, unique(all_food_df$exposure))
# plot
p <- ggplot(all_food_df, aes(x = estimate, y = exposure, color = p.value)) +
geom_point() +
geom_errorbarh(aes(xmin = lower, xmax = upper)) +
geom_vline(xintercept = 1, linetype = 2) +
scale_x_log10() +
scale_color_viridis_c(trans = "log10") +
labs(x = "Risk Ratio (log scale)",
y = "Exposure",
title = "Risk Ratio for gastroenteritis in Stegen, Germany")
p

####spatial overview####
#reading the shp file
setwd("C:/Users/Gurpreet/Desktop/OUTBREAK INV/R/Food poisoning case study/stegen-map")
shp <- read_sf("stegen_households.shp")
#plotting map
ggplot(clean_data) +
geom_sf(data = shp) +
geom_point(aes(x = longitude, y = latitude, color = ill)) +
scale_color_manual("Illness", values = c("healthy" = "blue", "ill" = "red"))
## Warning: Removed 160 rows containing missing values (geom_point).

##interactive maps
stegen_sub <- clean_data[!is.na(clean_data$longitude), ]
# create the map
lmap <- leaflet()
# add open street map tiles
lmap <- addTiles(lmap)
# set the coordinates for Stegen
lmap <- setView(lmap, lng = 7.963, lat = 47.982, zoom = 15)
# Add the shapefile
lmap <- addPolygons(lmap, data = st_transform(shp, '+proj=longlat +ellps=GRS80'))
## Warning: sf layer has inconsistent datum (+proj=longlat +ellps=GRS80 +no_defs).
## Need '+proj=longlat +datum=WGS84'
# Add the cases
lmap <- addCircleMarkers(lmap,
label = ~ill,
color = ~ifelse(ill == "healthy", "blue", "red"),
stroke = FALSE,
fillOpacity = 0.8,
data = stegen_sub)
## Assuming "longitude" and "latitude" are longitude and latitude, respectively
# show the map
lmap
####acknowledgement####
#This case study was first designed by Alain Moren and Gilles Desve, EPIET. It is based on an real outbreak investigation conducted by Anja Hauri, RKI, Berlin, 1998.