# Load necessary libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(janitor)
## 
## Attaching package: 'janitor'
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
#Load the data
df <- read.csv("Data/Epi_Task_Data.csv") %>% clean_names()
head(df)
##   year epi_week  county age_group ili_percentage population
## 1 2023        1 Nairobi    0-4yrs            6.1       3452
## 2 2024        1 Nairobi    0-4yrs            3.2       3658
## 3 2023        2 Nairobi    0-4yrs            4.5       3043
## 4 2024        2 Nairobi    0-4yrs            4.9       2765
## 5 2023        3 Nairobi    0-4yrs            4.6       6975
## 6 2024        3 Nairobi    0-4yrs            3.8        609
#Descriptive analysis (mean ILI percentage per county per year)
mean_ili <- df %>%
  group_by(county, year) %>%
  summarise(mean_ili = mean(ili_percentage, na.rm = TRUE)) %>%
  tidyr::pivot_wider(names_from = year, values_from = mean_ili)
## `summarise()` has grouped output by 'county'. You can override using the
## `.groups` argument.
print(mean_ili)
## # A tibble: 7 × 3
## # Groups:   county [7]
##   county   `2023` `2024`
##   <chr>     <dbl>  <dbl>
## 1 Kakamega   4.06   4.54
## 2 Kiambu     3.90   3.70
## 3 Kisumu     4.07   3.72
## 4 Machakos   3.38   3.98
## 5 Mombasa    3.90   3.99
## 6 Nairobi    4.22   3.99
## 7 Nakuru     3.78   4.53
#Plot weekly ILI trend
weekly_trend <- df %>%
  group_by(epi_week, county) %>%
  summarise(mean_ili = mean(ili_percentage, na.rm = TRUE))
## `summarise()` has grouped output by 'epi_week'. You can override using the
## `.groups` argument.
ggplot(weekly_trend, aes(x = epi_week, y = mean_ili, color = county)) +
  geom_line() +
  labs(title = "Weekly ILI Trend by County",
       x = "Epidemiological Week",
       y = "Mean ILI Percentage") +
  facet_wrap(~ county) +
  theme_minimal()

#Compute incidence rate per 100,000 population
selected <- df %>% filter(county %in% c("Nairobi", "Kakamega", "Kisumu"))

selected <- selected %>%
  mutate(ili_cases = (ili_percentage / 100) * population)

incidence <- selected %>%
  group_by(county) %>%
  summarise(total_cases = sum(ili_cases, na.rm = TRUE),
            total_pop = sum(population)) %>%
  mutate(incidence_per_100k = (total_cases / total_pop) * 1e5)

print(incidence)
## # A tibble: 3 × 4
##   county   total_cases total_pop incidence_per_100k
##   <chr>          <dbl>     <int>              <dbl>
## 1 Kakamega       9268.    228303              4060.
## 2 Kisumu         7573.    196715              3850.
## 3 Nairobi        6814.    164774              4135.
# One-way ANOVA
subset_data <- df %>% filter(county %in% c("Nairobi", "Kakamega", "Kisumu"))
anova_result <- aov(ili_percentage ~ county, data = subset_data)
summary(anova_result)
##              Df Sum Sq Mean Sq F value Pr(>F)
## county        2   3.28   1.641   0.868  0.422
## Residuals   117 221.16   1.890
#check pairwise differences
TukeyHSD(anova_result)
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = ili_percentage ~ county, data = subset_data)
## 
## $county
##                    diff        lwr       upr     p adj
## Kisumu-Kakamega  -0.405 -1.1348068 0.3248068 0.3884043
## Nairobi-Kakamega -0.195 -0.9248068 0.5348068 0.8015505
## Nairobi-Kisumu    0.210 -0.5198068 0.9398068 0.7738045