-The code use several datasets EMPRES-I (FAO), ECDC Surveillance Atlas ECDC, Global Health observatory (WHO). Adagio. -Please feel free to analyse the code and run from your terminal (there is a folder with the code and data on: DG1 Disease occurrence > datasources > Spill-over > sources > spillover.Rmd)
# Spill over analysis
# Update: 17/01/2024
library(tidyverse)
## Warning: package 'forcats' was built under R version 4.3.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)
library(stringr)
library(readr)
library(data.table)
##
## Attaching package: 'data.table'
##
## The following objects are masked from 'package:lubridate':
##
## hour, isoweek, mday, minute, month, quarter, second, wday, week,
## yday, year
##
## The following objects are masked from 'package:dplyr':
##
## between, first, last
##
## The following object is masked from 'package:purrr':
##
## transpose
library(plotly)
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
library(DT)
library(arsenal)
## Warning: package 'arsenal' was built under R version 4.3.3
##
## Attaching package: 'arsenal'
##
## The following object is masked from 'package:lubridate':
##
## is.Date
setwd("C:/Users/alfredo.acosta/SVA/LiRA_consortium - Documents/WG1 Disease occurrence/datasources/Spill-over/sources/sc2")
# EGran
# soe <- read.csv(file="ECDC_surveillance_data_Echinococcosis.27.11.24.csv")
soe <- read.csv(file="ECDC_surveillance_data_Echinococcosis.granulosus.27.11.24.csv")
# Lyme
soe1 <- read.csv(file="ECDC_surveillance_data_Lyme_Neuroborreliosis.27.11.24.csv")
# Qfever
soe2 <- read.csv(file="ECDC_surveillance_data_Q_fever.27.11.24.csv")
# TBE
soe3 <- read.csv(file="ECDC_surveillance_data_Tick-borne_encephalitis.27.11.24.csv")
# Influenza
soe4 <- read.csv(file="ECDC_surveillance_data_Influenza.27.11.24.csv")
soe <- rbind(soe, soe1, soe2, soe3)
rm(soe1, soe2, soe3, soe4)
soe1 <- soe %>%
group_by(HealthTopic, RegionName,Time) %>%
summarise(cases=sum(as.numeric(NumValue))) %>%
filter(cases != 0)
## `summarise()` has grouped output by 'HealthTopic', 'RegionName'. You can
## override using the `.groups` argument.
colnames(soe1) <- c("disease", "country", "time", "cases")
# Number of human cases
soe %>%
filter(!str_detect(RegionName, "EU")) %>%
group_by(HealthTopic) %>%
summarise(cases=sum(NumValue)) %>%
filter(cases != 0)
## # A tibble: 4 × 2
## HealthTopic cases
## <chr> <dbl>
## 1 Echinococcosis 7772
## 2 Lyme Neuroborreliosis 7220
## 3 Q fever 15673
## 4 Tick-borne encephalitis 34633
# Number of countries
soe %>%
filter(!str_detect(RegionName, "EU")) %>%
group_by(HealthTopic) %>%
summarise(countries=length(unique(RegionName)))
## # A tibble: 4 × 2
## HealthTopic countries
## <chr> <int>
## 1 Echinococcosis 30
## 2 Lyme Neuroborreliosis 19
## 3 Q fever 30
## 4 Tick-borne encephalitis 30
table(soe$Time)
##
## 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022
## 31 61 59 60 60 87 89 91 92 93 94 108 114 110 113 117
## 2023
## 118
table(soe$Time, soe$HealthTopic)
##
## Echinococcosis Lyme Neuroborreliosis Q fever Tick-borne encephalitis
## 2007 31 0 0 0
## 2008 31 0 30 0
## 2009 30 0 29 0
## 2010 30 0 30 0
## 2011 30 0 30 0
## 2012 31 0 31 25
## 2013 32 0 31 26
## 2014 32 0 31 28
## 2015 32 0 32 28
## 2016 31 0 33 29
## 2017 32 0 33 29
## 2018 31 14 33 30
## 2019 32 19 33 30
## 2020 31 18 31 30
## 2021 31 19 32 31
## 2022 31 21 32 33
## 2023 32 21 32 33
# Reading ADAGIO database
adg <- read.csv(file= "Outbreaks and cases with labels.csv")
str(adg)
## 'data.frame': 3670 obs. of 9 variables:
## $ Disease : chr "African swine fever virus" "African swine fever virus" "African swine fever virus" "African swine fever virus" ...
## $ FunctionalGroup: chr "Wildlife amplification" "Wildlife amplification" "Wildlife amplification" "Wildlife amplification" ...
## $ Species : chr "Wild Boar" "Wild Boar" "Wild Boar" "Wild Boar" ...
## $ Country : chr "Poland" "Poland" "Poland" "Poland" ...
## $ NUTS2 : chr "Warmińsko-Mazurskie" "Warmińsko-Mazurskie" "Warmińsko-Mazurskie" "Warmińsko-Mazurskie" ...
## $ Year : int 2019 2019 2019 2018 2019 2019 2019 2019 2019 2019 ...
## $ Month : chr "November" "October" "February" "November" ...
## $ SumCases : int 168 97 102 93 38 71 105 130 53 58 ...
## $ SumOutbreaks : int 122 87 60 50 37 49 62 87 45 42 ...
library(readxl)
adg_pop <- read_excel("FAO human population.xlsx")
# Create matrix
db <- expand.grid(unique(adg_pop$Country), c(2003:2023), unique(soe1$disease))
colnames(db) <- c("country", "time", "disease")
# first index to fill the human cases db & soe1
index <- match(paste(db$country, db$time, db$disease), paste(soe1$country, soe1$time, soe1$disease))
db$human_c <- soe1$cases[index]
# second index to pass the FAO Adagio human population
index <- match(paste(db$country, db$time), paste(adg_pop$Country, adg_pop$Year))
db$human_pop <- adg_pop$`Human population by country.Population in thousands`[index]
db$human_pop <- db$human_pop*1000
db$human_cases_hab <- db$human_c/db$human_pop*100000
unregions <- read.csv("all.csv")
unregions$name[unregions$name == "United States of America"] <- "United States"
unregions$name[unregions$name == "United Kingdom of Great Britain and Northern Ireland"] <- "United Kingdom"
str(unregions)
## 'data.frame': 249 obs. of 11 variables:
## $ name : chr "Afghanistan" "Ã…land Islands" "Albania" "Algeria" ...
## $ alpha.2 : chr "AF" "AX" "AL" "DZ" ...
## $ alpha.3 : chr "AFG" "ALA" "ALB" "DZA" ...
## $ country.code : int 4 248 8 12 16 20 24 660 10 28 ...
## $ iso_3166.2 : chr "ISO 3166-2:AF" "ISO 3166-2:AX" "ISO 3166-2:AL" "ISO 3166-2:DZ" ...
## $ region : chr "Asia" "Europe" "Europe" "Africa" ...
## $ sub.region : chr "Southern Asia" "Northern Europe" "Southern Europe" "Northern Africa" ...
## $ intermediate.region : chr "" "" "" "" ...
## $ region.code : int 142 150 150 2 9 150 2 19 NA 19 ...
## $ sub.region.code : int 34 154 39 15 61 39 202 419 NA 419 ...
## $ intermediate.region.code: int NA NA NA NA NA NA 17 29 NA 29 ...
str(db)
## 'data.frame': 20160 obs. of 6 variables:
## $ country : Factor w/ 240 levels "Afghanistan",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ time : int 2003 2003 2003 2003 2003 2003 2003 2003 2003 2003 ...
## $ disease : Factor w/ 4 levels "Echinococcosis",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ human_c : num NA NA NA NA NA NA NA NA NA NA ...
## $ human_pop : num 22644000 3092000 32055000 57000 73000 ...
## $ human_cases_hab: num NA NA NA NA NA NA NA NA NA NA ...
## - attr(*, "out.attrs")=List of 2
## ..$ dim : int [1:3] 240 21 4
## ..$ dimnames:List of 3
## .. ..$ Var1: chr [1:240] "Var1=Afghanistan" "Var1=Albania" "Var1=Algeria" "Var1=American Samoa" ...
## .. ..$ Var2: chr [1:21] "Var2=2003" "Var2=2004" "Var2=2005" "Var2=2006" ...
## .. ..$ Var3: chr [1:4] "Var3=Echinococcosis" "Var3=Lyme Neuroborreliosis" "Var3=Q fever" "Var3=Tick-borne encephalitis"
db$country <- as.character(db$country)
db <- data.frame(db)
db$unr <- unregions$sub.region[match(db$country, unregions$name)]
db %>%
filter(time > 2003) %>%
filter(!is.na(human_cases_hab)) %>%
ggplot(aes(time, human_cases_hab, group=country))+
geom_point()

db %>%
filter(time > 2003) %>%
filter(!is.na(human_cases_hab)) %>%
ggplot(aes(time, human_cases_hab, group=country))+
geom_point()+
scale_y_log10()+
facet_wrap(vars(disease), ncol = 1)
# General plots by country
# List of diseases
diseases <- unique(db$disease)
# Function to generate plots for a given disease
generate_plots <- function(disease_name) {
# Filter data for the specific disease
disease_data <- db %>%
filter(time > 2003, disease == disease_name, !is.na(human_cases_hab))
# Check if disease_data is empty
if (nrow(disease_data) == 0) {
return(list(plot1 = NULL, plot2 = NULL, plot3 = NULL))
}
# Plot 1: Boxplot per country
plot1 <- disease_data %>%
ggplot(aes(human_cases_hab, color = country)) +
geom_boxplot() +
scale_x_log10() +
facet_wrap(vars(country), ncol = 1) +
theme_minimal() +
theme(legend.position = "none") +
xlab(paste(disease_name, "human cases / 100.000 habitants")) +
ylab(NULL) +
guides(y = "none")
# Plot 2: Boxplot grouped by year
plot2 <- disease_data %>%
ggplot(aes(human_cases_hab, time, group = time)) +
geom_boxplot() +
scale_x_log10() +
theme_minimal() +
theme(legend.position = "none") +
xlab(paste(disease_name, "human cases / 100.000 habitants (boxplot grouped by year)")) +
ylab(NULL)
# Plot 3: Time-series plot
plot3 <- disease_data %>%
ggplot(aes(time, human_cases_hab, color = country)) +
geom_point() +
geom_line(size = 1) +
scale_y_log10() +
facet_wrap(vars(country), ncol = 3) +
theme_minimal() +
theme(legend.position = "none") +
xlab(paste(disease_name, "human cases / 100.000 habitants (by year-country)")) +
ylab(NULL)
# Return a list of plots
list(plot1 = plot1, plot2 = plot2, plot3 = plot3)
}
# Generate all plots for all diseases
all_plots <- map(diseases, ~ generate_plots(.x))
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
names(all_plots) <- diseases
all_plots
## $Echinococcosis
## $Echinococcosis$plot1
##
## $Echinococcosis$plot2
##
## $Echinococcosis$plot3
## `geom_line()`: Each group consists of only one observation.
## ℹ Do you need to adjust the group aesthetic?
## `geom_line()`: Each group consists of only one observation.
## ℹ Do you need to adjust the group aesthetic?
##
##
## $`Lyme Neuroborreliosis`
## $`Lyme Neuroborreliosis`$plot1
##
## $`Lyme Neuroborreliosis`$plot2
##
## $`Lyme Neuroborreliosis`$plot3
##
##
## $`Q fever`
## $`Q fever`$plot1
##
## $`Q fever`$plot2
##
## $`Q fever`$plot3
## `geom_line()`: Each group consists of only one observation.
## ℹ Do you need to adjust the group aesthetic?
##
##
## $`Tick-borne encephalitis`
## $`Tick-borne encephalitis`$plot1
##
## $`Tick-borne encephalitis`$plot2
##
## $`Tick-borne encephalitis`$plot3
## `geom_line()`: Each group consists of only one observation.
## ℹ Do you need to adjust the group aesthetic?
## `geom_line()`: Each group consists of only one observation.
## ℹ Do you need to adjust the group aesthetic?
# Access plots for specific diseases
# Qfever_plots <- all_plots[["Q fever"]]
# EGran_plots <- all_plots[["Echinococcosis"]]
# Lyme_plots <- all_plots[["Lyme Neuroborreliosis"]]
# TBE_plots <- all_plots[["Tick-borne encephalitis"]]
# Display specific plots
# Qfever_plots$plot1
# Qfever_plots$plot2
# Qfever_plots$plot3
# EGran_plots$plot1
# Lyme_plots$plot2
# TBE_plots$plot3
db %>%
filter(time > 2003) %>%
# filter(disease == "Rabies") %>%
# filter(unr != "Western Asia") %>%
filter(!is.na(human_cases_hab)) %>%
ggplot(aes(human_cases_hab, time, group=time, color=disease))+
geom_boxplot()+
scale_x_log10()+
facet_wrap(vars(disease), ncol = 1)+
theme_minimal() +
theme(legend.position="none") +
ylab(NULL)+
xlab("human cases/100.000 habitants (log scaled) country rates")
db %>%
filter(time > 2003 & !is.na(human_cases_hab & !is.na(disease))) %>%
ggplot(aes(human_cases_hab, unr, group=unr, color=disease))+
geom_boxplot()+
scale_x_log10()+
facet_wrap(vars(disease), scales="free", ncol = 1)+
theme_minimal() +
theme(legend.position="none") +
ylab(NULL)+
xlab("human cases/100.000 habitants (log scaled) country rates ")
level <- db %>%
filter(time > 2003) %>%
# filter(!is.na(unr)) %>%
filter(!is.na(human_cases_hab)) %>%
group_by(disease) %>%
mutate(name=cut(human_cases_hab, breaks = 3, labels = c("low","medium","high"))) %>%
mutate(ranges=cut(human_cases_hab, breaks = 3)) %>%
select(-unr)
level2 <- level %>%
group_by(disease,name,ranges) %>%
dplyr::summarize(mean_point=median(human_cases_hab),
human_cases_hab_unr=(sum(human_c)/sum(human_pop*100000)),
mean2=mean(human_cases_hab_unr))
## `summarise()` has grouped output by 'disease', 'name'. You can override using
## the `.groups` argument.
diseases <- unique(db$disease)
generate_plots_unr <- function(disease_name) {
disease_data <- db %>%
filter(time > 2003, disease == disease_name, !is.na(human_cases_hab))
#boxplot by unr
p1 <- disease_data %>%
ggplot(aes(human_cases_hab, color=unr))+
geom_boxplot()+
scale_x_log10()+
facet_wrap(vars(unr), ncol = 1)+
theme_minimal() +
theme(legend.position="none")+
xlab(paste(disease_name, "human cases / 100.000 habitants"))+
ylab(NULL) +
guides(y="none")
# boxplot by year
p2 <- disease_data %>%
ggplot(aes(human_cases_hab, time, group=time))+
geom_boxplot()+
scale_x_log10()+
theme_minimal() +
theme(legend.position="none")+
xlab(paste(disease_name, "human cases / 100.000 habitants"))+
ylab(NULL)
# Time series plot
p3 <- disease_data %>%
ggplot(aes(time, human_cases_hab, color=unr))+
geom_point()+
geom_line(size=1)+
scale_y_log10()+
facet_wrap(vars(unr), ncol = 3)+
theme_minimal() +
theme(legend.position="none") +
ylab(NULL)+
xlab(paste(disease_name, "human cases / 100.000 habitants (by year-unr)"))
# Return a list or plots
list(p1 = p1, p2 = p2, p3 = p3)
}
# Generate all plots for all diseases
all_plots2 <- map(diseases, generate_plots_unr)
all_plots2
## [[1]]
## [[1]]$p1
##
## [[1]]$p2
##
## [[1]]$p3
## `geom_line()`: Each group consists of only one observation.
## ℹ Do you need to adjust the group aesthetic?
##
##
## [[2]]
## [[2]]$p1
##
## [[2]]$p2
##
## [[2]]$p3
##
##
## [[3]]
## [[3]]$p1
##
## [[3]]$p2
##
## [[3]]$p3
##
##
## [[4]]
## [[4]]$p1
##
## [[4]]$p2
##
## [[4]]$p3
# Access plots for a specific disease, e.g., "Q fever"
# Qfever_plots_unr<- all_plots2[[which(diseases == "Q fever")]]
# EGran_plots_unr <- all_plots2[[which(diseases == "Echinococcosis")]]
# Lyme_plots_unr <- all_plots2[[which(diseases == "Lyme Neuroborreliosis")]]
# TBE_plots_unr <- all_plots2[[which(diseases == "Tick-borne encephalitis")]]
UNRegion_db_unr <- db %>%
group_by(disease, unr) %>%
filter(time > 2003) %>%
filter(unr != "Western Asia") %>%
filter(!is.na(human_cases_hab)) %>%
summarise(human_cases_hab_unr=(sum(human_c)/sum(human_pop)*100000)) %>%
mutate(name=cut(human_cases_hab_unr, breaks = 3, labels = c("low","medium","high"))) %>%
mutate(range=cut(human_cases_hab_unr, breaks = 3))
## `summarise()` has grouped output by 'disease'. You can override using the
## `.groups` argument.
UNRegion_db_unr
## # A tibble: 16 × 5
## # Groups: disease [4]
## disease unr human_cases_hab_unr name range
## <fct> <chr> <dbl> <fct> <fct>
## 1 Echinococcosis Eastern Europe 0.429 high (0.303,0.…
## 2 Echinococcosis Northern Europe 0.0521 low (0.0517,0…
## 3 Echinococcosis Southern Europe 0.0720 low (0.0517,0…
## 4 Echinococcosis Western Europe 0.0916 low (0.0517,0…
## 5 Lyme Neuroborreliosis Eastern Europe 0.921 high (0.751,1.…
## 6 Lyme Neuroborreliosis Northern Europe 1.07 high (0.751,1.…
## 7 Lyme Neuroborreliosis Southern Europe 0.117 low (0.116,0.…
## 8 Lyme Neuroborreliosis Western Europe 0.243 low (0.116,0.…
## 9 Q fever Eastern Europe 0.181 medium (0.119,0.…
## 10 Q fever Northern Europe 0.0532 low (0.053,0.…
## 11 Q fever Southern Europe 0.250 high (0.184,0.…
## 12 Q fever Western Europe 0.227 high (0.184,0.…
## 13 Tick-borne encephalitis Eastern Europe 1.22 medium (0.943,1.…
## 14 Tick-borne encephalitis Northern Europe 2.27 high (1.61,2.2…
## 15 Tick-borne encephalitis Southern Europe 0.279 low (0.277,0.…
## 16 Tick-borne encephalitis Western Europe 0.346 low (0.277,0.…
UNRegion_db <- db %>%
group_by(disease) %>%
filter(time > 2003) %>%
filter(unr != "Western Asia") %>%
filter(!is.na(human_cases_hab)) %>%
summarise(human_cases_hab_unr=(sum(human_c)/sum(human_pop)*100000)) %>%
mutate(name=cut(human_cases_hab_unr, breaks = 3, labels = c("low","medium","high"))) %>%
mutate(range=cut(human_cases_hab_unr, breaks = 3))
UNRegion_db
## # A tibble: 4 × 4
## disease human_cases_hab_unr name range
## <fct> <dbl> <fct> <fct>
## 1 Echinococcosis 0.151 low (0.151,0.367]
## 2 Lyme Neuroborreliosis 0.692 high (0.583,0.799]
## 3 Q fever 0.192 low (0.151,0.367]
## 4 Tick-borne encephalitis 0.799 high (0.583,0.799]
db %>%
filter(time > 2003) %>%
filter(!is.na(human_cases_hab)) %>%
ggplot(aes(human_cases_hab, color=disease, group=unr))+
geom_boxplot()+
scale_x_log10()+
facet_wrap(vars(disease, unr))+
theme_minimal() +
theme(legend.position="none") +
ylab(NULL)+
xlab("human cases / 100.000 habitants (agregated by UNRegions and years)")
lira_table_sp <- db %>%
filter(time > 2003) %>%
# filter(unr != "Western Asia") %>%
filter(!is.na(human_cases_hab)) %>%
group_by(disease, UNregions=unr) %>%
summarise(
human_cases=sum(human_c),
human_pop=sum(human_pop, na.rm = TRUE),
human_rate_median=round(median(human_cases_hab), 3),
CI05 = round(quantile(human_cases_hab, probs = 0.025, na.rm = TRUE), 3),
CI95 = round(quantile(human_cases_hab, probs = 0.975, na.rm = TRUE) ,3)) %>%
mutate(level=cut(human_rate_median, breaks = 3, labels = c("low","medium","high"))) %>%
arrange(disease, desc(human_rate_median), UNregions)
## `summarise()` has grouped output by 'disease'. You can override using the
## `.groups` argument.
datatable(lira_table_sp)
# lira_table_sp
# write.csv(lira_table_sp, file="lira_table_sp.csv")
#Graph
library(ggplot2)
# Visualization
ggplot(lira_table_sp, aes(x = reorder(UNregions, -human_rate_median), y = human_rate_median, color = disease)) +
geom_point(size = 3, position = position_dodge(width = 0.1)) + # Points for median
geom_errorbar(aes(ymin = CI05, ymax = CI95), width = 0.2, position = position_dodge(width = 0.5)) + # Error bars for CI
facet_wrap(vars(disease), scales = "free_y") + # Separate panels for each disease
coord_flip() + # Flip axes for better readability
theme_minimal() +
labs(
title = "Median Human Cases and 95% CI by Disease and UN Region",
x = "UN Region",
y = "Human Cases / 100,000 habitants",
color = "Disease"
) +
theme(
legend.position = "top",
axis.text.x = element_text(angle = 45, hjust = 1)
)
ggplot(lira_table_sp, aes(x = level, y = human_rate_median, fill = level)) +
geom_boxplot(outlier.shape = NA, alpha = 0.8) + # Boxplot for distribution
geom_jitter(aes(color = level), width = 0.2, size = 2, alpha = 0.6) + # Add points for individual medians
facet_wrap(vars(disease), scales = "free_y") + # Separate panels for each disease
theme_minimal() +
labs(
title = "Distribution of Human Rate Median by Level (High, Medium, Low)",
x = "Level",
y = "Human Cases / 100,000 habitants",
fill = "Level",
color = "Level"
) +
theme(
legend.position = "top",
axis.text.x = element_text(angle = 45, hjust = 1),
panel.spacing = unit(1, "lines")
)
# Credits Acosta, Alfredo PhD1.
SVA1: SVA http://www.sva.se/.