# Install / load needed packages
#install.packages(c("outbreaks", "tidyverse", "lubridate"))
#library(outbreaks)
#library(tidyverse)
#library(lubridate)
# Read the Sierra Leone Ebola case data from a CSV file; the relative path is
# resolved against the current working directory.
EBOLA <- read.csv(file = "ebola_sierra_leone.csv")
# Preview the first six records
head(EBOLA, n = 6L)
## id age sex status date_of_onset date_of_sample district
## 1 92 6 M confirmed 2014-06-10 2014-06-15 Kailahun
## 2 51 46 F confirmed 2014-05-30 2014-06-04 Kailahun
## 3 230 NA M confirmed 2014-06-26 2014-06-30 Kenema
## 4 139 25 F confirmed 2014-06-13 2014-06-18 Kailahun
## 5 8 8 F confirmed 2014-05-22 2014-05-27 Kailahun
## 6 215 49 M confirmed 2014-06-24 2014-06-29 Kailahun
# Column names and storage types (note: both date columns arrive as character)
str(object = EBOLA)
## 'data.frame': 200 obs. of 7 variables:
## $ id : int 92 51 230 139 8 215 189 115 218 159 ...
## $ age : num 6 46 NA 25 8 49 13 50 35 38 ...
## $ sex : chr "M" "F" "M" "F" ...
## $ status : chr "confirmed" "confirmed" "confirmed" "confirmed" ...
## $ date_of_onset : chr "2014-06-10" "2014-05-30" "2014-06-26" "2014-06-13" ...
## $ date_of_sample: chr "2014-06-15" "2014-06-04" "2014-06-30" "2014-06-18" ...
## $ district : chr "Kailahun" "Kailahun" "Kenema" "Kailahun" ...
# Per-column summary statistics
summary(object = EBOLA)
## id age sex status
## Min. : 1.00 Min. : 1.80 Length:200 Length:200
## 1st Qu.: 62.75 1st Qu.:20.00 Class :character Class :character
## Median :131.50 Median :35.00 Mode :character Mode :character
## Mean :136.72 Mean :33.85
## 3rd Qu.:208.25 3rd Qu.:45.00
## Max. :285.00 Max. :80.00
## NA's :4
## date_of_onset date_of_sample district
## Length:200 Length:200 Length:200
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
# Count how many cases by status
library(ggplot2)
library(tidyverse)
# One row per status value with its number of cases in column `n`
count(EBOLA, status)
## status n
## 1 confirmed 182
## 2 suspected 18
# Histogram of patient ages (30 bins); records without an age are dropped
# before plotting.
EBOLA %>%
  drop_na(age) %>%
  ggplot(mapping = aes(x = age)) +
  geom_histogram(bins = 30, fill = "steelblue", color = "white") +
  ggtitle("Age Distribution of Ebola Cases (Sierra Leone, 2014)") +
  xlab("Age") +
  ylab("Count")
# Bar chart of the number of cases per sex; bar heights come straight from
# the pre-computed tally, and each sex gets its own fill colour.
EBOLA %>%
  count(sex) %>%
  ggplot(mapping = aes(x = sex, y = n, fill = sex)) +
  geom_col() +
  labs(
    title = "Ebola Cases by Sex",
    x = "Sex",
    y = "Number of Cases"
  )
# Case tally per district, largest first, limited to the ten biggest
EBOLA %>%
  count(district, sort = TRUE) %>%
  head(n = 10L)
## district n
## 1 Kailahun 155
## 2 Kenema 34
## 3 Western Urban 4
## 4 Bo 2
## 5 Kono 2
## 6 Port Loko 2
## 7 Kambia 1
# Convert the character date columns (printed above in YYYY-MM-DD form) into
# Date columns named `onset` and `sample`; the original columns are kept.
# `as_date()` is namespace-qualified because `library(lubridate)` is commented
# out at the top of this script: the bare name only resolves when the
# installed tidyverse is recent enough to attach lubridate itself.
EBOLA1 <- EBOLA %>%
  mutate(
    onset = lubridate::as_date(date_of_onset),
    sample = lubridate::as_date(date_of_sample)
  )
# Count NA values in the converted onset column
sum(is.na(EBOLA1[["onset"]]))
## [1] 0
# Same count for the converted sampling-date column
sum(is.na(EBOLA1[["sample"]]))
## [1] 0
# Add `delay_sample`: the interval from symptom onset to sampling, expressed
# as a plain number of days.
ebola <- EBOLA1 %>%
  mutate(
    delay_sample = as.numeric(difftime(sample, onset, units = "days"))
  )
# Distribution of the delay (min, quartiles, mean, max)
summary(ebola[["delay_sample"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 5.0 5.0 5.6 5.0 22.0
# Here are some visualizations to explore the epidemic curve and spatial (by-district) trends.
# Epidemic curve: number of cases by onset date.
# The tally only contains dates on which at least one case had its onset, so
# the calendar is completed with zero-count days between the first and last
# onset. Without this the line would run straight across the gaps and suggest
# cases on days that had none.
ebola %>%
  filter(!is.na(onset)) %>%
  count(onset, name = "daily_cases") %>%
  complete(
    onset = seq(min(onset), max(onset), by = "day"),
    fill = list(daily_cases = 0L)
  ) %>%
  ggplot(aes(x = onset, y = daily_cases)) +
  geom_line(color = "firebrick") +
  geom_point() +
  labs(
    title = "Ebola Epidemic Curve in Sierra Leone (2014-2015)",
    x = "Date of Symptom Onset",
    y = "Daily New Cases"
  ) +
  theme_minimal()
# Cumulative cases over time.
# After sorting by onset date, each record's row number is the running total
# of cases up to and including that record.
ebola %>%
  filter(!is.na(onset)) %>%
  arrange(onset) %>%
  mutate(cum_cases = row_number()) %>%
  ggplot(aes(x = onset, y = cum_cases)) +
  # Line thickness is set with `linewidth`; using `size` for lines has been
  # deprecated since ggplot2 3.4.0.
  geom_line(color = "darkgreen", linewidth = 1) +
  labs(
    title = "Cumulative Ebola Cases Over Time",
    x = "Date of Symptom Onset",
    y = "Cumulative Number of Cases"
  ) +
  theme_minimal()
# Horizontal boxplots of the onset-to-sampling delay per district, ordered by
# median delay and restricted to districts with more than 50 usable records.
# NOTE(review): going by the district tally printed earlier in this script,
# only Kailahun (155 cases) clears the 50-case cutoff, so this plot likely
# shows a single box -- confirm the cutoff is intended.
ebola %>%
  drop_na(delay_sample, district) %>%
  add_count(district, name = "district_cases") %>%
  filter(district_cases > 50) %>%
  ggplot(
    mapping = aes(
      x = reorder(district, delay_sample, FUN = median),
      y = delay_sample
    )
  ) +
  geom_boxplot(fill = "skyblue") +
  coord_flip() +
  labs(
    title = "Delay from Onset to Sampling by District",
    x = "District",
    y = "Delay (days)"
  )
# Overall mean and median of the onset-to-sampling delay, ignoring NAs
mean_delay <- mean(ebola[["delay_sample"]], na.rm = TRUE)
median_delay <- median(ebola[["delay_sample"]], na.rm = TRUE)
# Print both values: mean first, then median
mean_delay
median_delay
## [1] 5.6
## [1] 5
# Compare the sampling delay between status groups: mean, median and number
# of records per status, computed on records with a known delay.
ebola %>%
  drop_na(delay_sample) %>%
  group_by(status) %>%
  summarise(
    mean_delay = mean(delay_sample),
    median_delay = median(delay_sample),
    count = n()
  )
## # A tibble: 2 × 4
## status mean_delay median_delay count
## <chr> <dbl> <dbl> <int>
## 1 confirmed 5.67 5 182
## 2 suspected 4.89 5 18
# Scatter plot of sampling delay against age with a LOESS trend line, using
# only records where both age and delay are known.
ebola %>%
  drop_na(age, delay_sample) %>%
  ggplot(mapping = aes(x = age, y = delay_sample)) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "loess", color = "blue") +
  ggtitle("Sampling Delay vs Age") +
  xlab("Age") +
  ylab("Delay (days)")
# The epidemic curve shows a clear rise in new cases starting mid-2014, peaking around [TODO: insert the peak date read from the epidemic-curve plot], then gradually declining.
# This indicates how the outbreak progressed and when most transmissions likely occurred.
# The cumulative curve helps us understand the total burden of disease. A steep rise suggests rapid spread; flattening suggests that control measures might have started working.
# The delay between onset and sampling (mean = 5.6 days, median = 5 days) indicates how long, on average, people waited before being tested.
# Confirmed cases had a slightly longer mean delay than suspected ones (5.67 vs. 4.89 days; the median is 5 days in both groups), which could imply differences in how quickly patients are identified or prioritized. With only 18 suspected cases, this difference should be interpreted with caution.
# Delay might vary by district — some districts might be slower, indicating weaker surveillance or access issues.
# The relationship between age and delay (if any) might hint at whether certain age groups faced more barriers.
# Longer delays between onset and sampling can reduce the effectiveness of contact tracing and isolation, contributing to spread.
# Districts with longer delays might need targeted interventions (better lab accessibility, faster reporting, more resources).
# Understanding the shape and timing of the epidemic is crucial for evaluating response efforts and planning for future outbreaks.