DATA ANALYSIS SAMPLE

1. INSTALLING LIBRARIES

# Install / load needed packages
#install.packages(c("outbreaks", "tidyverse", "lubridate"))
#library(outbreaks)
#library(tidyverse)
#library(lubridate)

2. IMPORTING THE DATA

# Read the Sierra Leone Ebola line list from CSV into a data frame

EBOLA <- read.csv(file = "ebola_sierra_leone.csv")

# Preview the first six records
head(EBOLA, n = 6L)

##    id age sex    status date_of_onset date_of_sample district
## 1  92   6   M confirmed    2014-06-10     2014-06-15 Kailahun
## 2  51  46   F confirmed    2014-05-30     2014-06-04 Kailahun
## 3 230  NA   M confirmed    2014-06-26     2014-06-30   Kenema
## 4 139  25   F confirmed    2014-06-13     2014-06-18 Kailahun
## 5   8   8   F confirmed    2014-05-22     2014-05-27 Kailahun
## 6 215  49   M confirmed    2014-06-24     2014-06-29 Kailahun

# Inspect the structure: number of rows, column names, types and sample values
str(EBOLA)

## 'data.frame':    200 obs. of  7 variables:
##  $ id            : int  92 51 230 139 8 215 189 115 218 159 ...
##  $ age           : num  6 46 NA 25 8 49 13 50 35 38 ...
##  $ sex           : chr  "M" "F" "M" "F" ...
##  $ status        : chr  "confirmed" "confirmed" "confirmed" "confirmed" ...
##  $ date_of_onset : chr  "2014-06-10" "2014-05-30" "2014-06-26" "2014-06-13" ...
##  $ date_of_sample: chr  "2014-06-15" "2014-06-04" "2014-06-30" "2014-06-18" ...
##  $ district      : chr  "Kailahun" "Kailahun" "Kenema" "Kailahun" ...

# Per-column summary statistics (for numeric columns this also reports NA counts)
summary(EBOLA)

##        id              age            sex               status         
##  Min.   :  1.00   Min.   : 1.80   Length:200         Length:200        
##  1st Qu.: 62.75   1st Qu.:20.00   Class :character   Class :character  
##  Median :131.50   Median :35.00   Mode  :character   Mode  :character  
##  Mean   :136.72   Mean   :33.85                                        
##  3rd Qu.:208.25   3rd Qu.:45.00                                        
##  Max.   :285.00   Max.   :80.00                                        
##                   NA's   :4                                            
##  date_of_onset      date_of_sample       district        
##  Length:200         Length:200         Length:200        
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##

3. Exploratory Data Analysis (EDA)

# Attach the plotting and data-wrangling packages used from here on
library(ggplot2)
library(tidyverse)

# Tabulate the number of cases in each status category
count(EBOLA, status)

##      status   n
## 1 confirmed 182
## 2 suspected  18

# Histogram of patient ages; records with a missing age are dropped first
ggplot(data = drop_na(EBOLA, age), mapping = aes(x = age)) +
  geom_histogram(bins = 30, fill = "steelblue", color = "white") +
  labs(
    title = "Age Distribution of Ebola Cases (Sierra Leone, 2014)",
    x = "Age",
    y = "Count"
  )

# Bar chart of the number of cases per sex
# (geom_col() draws bars whose heights are the pre-computed counts in `n`)
ggplot(data = count(EBOLA, sex), mapping = aes(x = sex, y = n, fill = sex)) +
  geom_col() +
  labs(
    title = "Ebola Cases by Sex",
    x = "Sex",
    y = "Number of Cases"
  )

# Case counts per district, largest first, limited to the top 10 districts
EBOLA %>%
  count(district, sort = TRUE) %>%
  head(n = 10)

##        district   n
## 1      Kailahun 155
## 2        Kenema  34
## 3 Western Urban   4
## 4            Bo   2
## 5          Kono   2
## 6     Port Loko   2
## 7        Kambia   1

4. Data Cleaning / Preprocessing

# Parse the onset and sample date strings into Date columns.
# as_date() is namespace-qualified because the visible code never attaches
# lubridate itself; an unqualified call only resolves when library(tidyverse)
# attaches lubridate, which it does from tidyverse 2.0.0 onwards.
EBOLA1 <- EBOLA %>%
  mutate(
    onset = lubridate::as_date(date_of_onset),
    sample = lubridate::as_date(date_of_sample)
  )

# Count the missing values in each parsed date column
sum(is.na(EBOLA1[["onset"]]))

## [1] 0

sum(is.na(EBOLA1[["sample"]]))

## [1] 0

# Derive the onset-to-sampling delay as a plain number of days
ebola <- mutate(
  EBOLA1,
  delay_sample = as.numeric(difftime(sample, onset, units = "days"))
)

# Distribution summary (min, quartiles, mean, max) of the delay
summary(ebola[["delay_sample"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0     5.0     5.0     5.6     5.0    22.0

5. Data Visualization

Here are some visualizations to explore the epidemic curve and spatial (by district) trends.

# Epidemic curve: number of cases per symptom-onset date.
# Dates inside the observed range that have no recorded onset are added with
# a count of zero, so the line does not interpolate across gaps and suggest
# cases on days that had none.
ebola %>%
  filter(!is.na(onset)) %>%
  count(onset, name = "daily_cases") %>%
  complete(
    onset = seq(min(onset), max(onset), by = "day"),
    fill = list(daily_cases = 0L)
  ) %>%
  ggplot(aes(x = onset, y = daily_cases)) +
    geom_line(color = "firebrick") +
    geom_point() +
    labs(title = "Ebola Epidemic Curve in Sierra Leone (2014-2015)",
         x = "Date of Symptom Onset", y = "Daily New Cases") +
    theme_minimal()

# Cumulative cases over time: cases are ordered by onset date and numbered,
# so each row's number is the running total of cases up to that row.
ebola %>%
  filter(!is.na(onset)) %>%
  arrange(onset) %>%
  mutate(cum_cases = row_number()) %>%
  ggplot(aes(x = onset, y = cum_cases)) +
    # `linewidth` replaces `size` for lines, which ggplot2 3.4.0 deprecated
    geom_line(color = "darkgreen", linewidth = 1) +
    labs(title = "Cumulative Ebola Cases Over Time",
         x = "Date of Symptom Onset", y = "Cumulative Number of Cases") +
    theme_minimal()

# Boxplot of the onset-to-sampling delay per district, restricted to districts
# with more than 50 cases that have both a delay and a district recorded
ebola %>%
  drop_na(delay_sample, district) %>%
  group_by(district) %>%
  filter(n() > 50) %>%
  ungroup() %>%
  ggplot(
    aes(
      x = reorder(district, delay_sample, FUN = median),  # order by median delay
      y = delay_sample
    )
  ) +
  geom_boxplot(fill = "skyblue") +
  coord_flip() +
  labs(
    title = "Delay from Onset to Sampling by District",
    x = "District",
    y = "Delay (days)"
  )

6. Simple Statistical Analysis

# Mean and median onset-to-sampling delay, ignoring missing values
mean_delay <- mean(ebola[["delay_sample"]], na.rm = TRUE)
median_delay <- median(ebola[["delay_sample"]], na.rm = TRUE)

# Print both values
mean_delay
median_delay

## [1] 5.6

## [1] 5

# Compare the sampling delay between case-status groups:
# mean, median and number of cases per status
ebola %>%
  drop_na(delay_sample) %>%
  group_by(status) %>%
  summarise(
    mean_delay = mean(delay_sample),
    median_delay = median(delay_sample),
    count = n()
  )

## # A tibble: 2 × 4
##   status    mean_delay median_delay count
##   <chr>          <dbl>        <dbl> <int>
## 1 confirmed       5.67            5   182
## 2 suspected       4.89            5    18

# Scatter plot of sampling delay against age with a loess trend line,
# using only records where both values are present
ggplot(
  data = drop_na(ebola, age, delay_sample),
  mapping = aes(x = age, y = delay_sample)
) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "loess", color = "blue") +
  labs(
    title = "Sampling Delay vs Age",
    x = "Age",
    y = "Delay (days)"
  )

7. Interpretation / Analysis

Epidemic Curve

The epidemic curve shows a clear rise in new cases starting mid-2014, peaking around [TODO: insert the peak onset date read from the epidemic-curve plot], then gradually declining.

This indicates how the outbreak progressed and when most transmissions likely occurred.

Cumulative Cases

The cumulative curve helps us understand the total burden of disease. A steep rise suggests rapid spread; flattening means control measures might have started working.

Delay Analysis

The average delay between onset and sampling (mean = 5.6 days, median = 5 days) suggests how long on average people waited before being tested.

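In this sample, confirmed cases had a slightly longer mean delay than suspected ones (5.67 vs. 4.89 days), although both groups share a median of 5 days and only 18 cases are suspected. A consistent difference in either direction could imply differences in how quickly patients are identified or prioritized.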

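Delay might vary by district — some districts might be slower, indicating weaker surveillance or access issues. In this sample, however, only Kailahun (155 cases) passes the more-than-50-cases filter used for the boxplot, so the plot cannot show a comparison between districts.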

The relationship between age and delay (if any) might hint whether certain age groups faced more barriers.

Public Health Implications

Longer delays between onset and sampling can reduce the effectiveness of contact tracing and isolation, contributing to spread.

Districts with longer delays might need targeted interventions (better lab accessibility, faster reporting, more resources).

Understanding the shape and timing of the epidemic is crucial for evaluating response efforts and planning for future outbreaks.

DATA ANALYSIS SAMPLE - EBOLA

Leonessa Adobas

2025-11-17