###### PROLOG ########


# PROLOG   ###
# PROJECT: Epidemiology Program Manager (Epidemiologist III) Position                   
# PURPOSE: Skill Assessment                                      
# DIR:     C:\Users\kesha\Downloads                    
# DATA:    Data - In-Basket A.xlsx                         
# AUTHOR:  Dr. Keshav Kumar                                            
# CREATED: JUNE 23, 2025                                           
# LATEST:  JUNE 24, 2025                                     
# NOTES:   N/A
# PROLOG   ### 


# libraries 


library(magrittr) # for pipes
library(table1) # for descriptive statistics 
library(tidyverse) # for tidy code
library(sessioninfo) # for session_info at bottom
library(details) # for session_info at bottom
library(ggthemes) # for tufte theme
library(ggrepel) # for text plotting
library(patchwork) # for combining plots
library(readxl) # for reading xlxs files
library(kableExtra) # for table headers formatting
library(knitr) # for tables
library(sjPlot) # for model tables 
# plot theme
theme_set(theme_tufte())  # but might not carry over in chunks

# Okabe-Ito colorblind-friendly color palette:
# https://jfly.uni-koeln.de/color/

oi_pal <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", 
     "#0072B2", "#D55E00", "#CC79A7", "#999999")


###### DATA MGMT #####

# original dataset
df <- read_xlsx("C:/Users/kesha/Downloads/Data - In-Basket A.xlsx", sheet = "Case Line List Disease X")
df_y <- read_excel("C:/Users/kesha/Downloads/Data - In-Basket A.xlsx", sheet = "Disease Y", col_names = FALSE)
dict <- read_excel("C:/Users/kesha/Downloads/Data - In-Basket A.xlsx", sheet = "Data Dictionary")

# Clean and rename columns
colnames(df) <- str_trim(colnames(df))
dict <- dict %>% mutate(Column = str_trim(Column))
rename_map <- setNames(dict$Description, dict$Column)
df <- df %>% rename(any_of(rename_map)) %>%
  janitor::clean_names()

# Convert date columns and calculate age
df <- df %>%
  mutate(
    Age = round(as.numeric(difftime(`onset_date`, `dob`, units = "days")) / 365.25, 2),
    Year = year(`onset_date`)
  )

Descriptive Summary

# Descriptive Summary
label(df$case_status) <-"Case classification (confirmed, probable, suspect )"
label(df$sex) <-"Gender of case"
label(df$hospitalized) <-"Was the patient hospitalized for the disease"
label(df$ethnicity) <- "Ethnicity of case"
label(df$race) <- "Race of case"
label(df$died) <- "Did the patient die from the disease"
label(df$lab) <- "Laboratory test done"
label(df$Age) <- "Age of Patient at the time of Onset of Disease"
label(df$sx) <- "Symptoms"

desc<- table1(
  ~case_status + sex + hospitalized + died + race + ethnicity + lab +
    sx + Age,
  data= df,
  render.continous="median(IQR)",
  caption = "Descriptive Statistics of Disease X",
)

kable(desc)
Overall
(N=202)
Case classification (confirmed, probable, suspect )
Confirmed 140 (69.3%)
Probable 42 (20.8%)
Suspect 20 (9.9%)
Gender of case
Female 81 (40.1%)
Male 118 (58.4%)
Unknown 3 (1.5%)
Was the patient hospitalized for the disease
No 5 (2.5%)
Unknown 58 (28.7%)
Yes 139 (68.8%)
Did the patient die from the disease
No 94 (46.5%)
Unknown 92 (45.5%)
Yes 16 (7.9%)
Race of case
Asian 1 (0.5%)
Black/African American 20 (9.9%)
Native American 2 (1.0%)
Other 2 (1.0%)
Unknown 73 (36.1%)
White 104 (51.5%)
Ethnicity of case
Hispanic 29 (14.4%)
Non-Hispanic 74 (36.6%)
Unknown 99 (49.0%)
Laboratory test done
Ab + 62 (30.7%)
Cx 8 (4.0%)
CX 1 (0.5%)
PCR + 131 (64.9%)
Symptoms
No 22 (10.9%)
Yes 180 (89.1%)
Age of Patient at the time of Onset of Disease
Mean (SD) 52.5 (20.3)
Median [Min, Max] 57.1 [0.980, 90.4]
# Round age to nearest whole number
df$age_rounded <- round(df$Age)


# Frequency table for Year
year_count <- table(df$Year)
kable(year_count, caption = "Case Count by Year")%>%
  kable_classic(full_width = F, html_font = "Cambria")
Case Count by Year
Var1 Freq
2018 36
2019 46
2020 72
2021 48
# Percentage table for Year
year_perc <- prop.table(table(df$Year)) * 100
kable(year_perc, caption = "Percentage of Cases by Year", digits = 2)%>%
  kable_classic(full_width = F, html_font = "Cambria")
Percentage of Cases by Year
Var1 Freq
2018 17.82
2019 22.77
2020 35.64
2021 23.76

Descriptive Graphs

# Histogram of age

ggplot(df, aes(x = Age)) +
  geom_histogram(binwidth = 5, fill = "steelblue", color = "white") +
   stat_bin(binwidth = 5, geom = "text", aes(label = ..count..), vjust = -0.5) +
  labs(title = "Distribution of Age at Onset",
       x = "Age (Years)", y = "Number of Cases") +
  theme_minimal()+
  theme(
    plot.title = element_text(size = 16, face = "bold"))

# Bar Plot of Case counts by Year
ggplot(df, aes(x = factor(Year))) +
  geom_bar(fill = "orange", color = "black") +
  geom_text(stat = "count", aes(label = ..count..), vjust = -0.5) +
  labs(title = "Number of Disease X Cases by Year",
       x = "Year", y = "Number of Cases") +
  theme_minimal()+
  theme(
    plot.title = element_text(size = 16, face = "bold"))

# Boxplot of Age by Year
ggplot(df, aes(x = factor(Year), y = Age)) +
  geom_boxplot(fill = "lightgreen") +
  labs(title = "Age Distribution of Cases by Year",
       x = "Year", y = "Age at Onset") +
  theme_minimal()+
  theme(
    plot.title = element_text(size = 16, face = "bold"))

# Line Plot: Trend of Confirmed Cases Over Years
df %>%
  filter(case_status == "Confirmed") %>%
  count(Year) %>%
  ggplot(aes(x = Year, y = n)) +
  geom_line(group = 1, color = "darkblue") +
  geom_point(size = 3) +
  geom_text(aes(label = n), vjust = -0.5) +
  labs(title = "Trend of Confirmed Cases (Disease X)",
       x = "Year", y = "Number of Confirmed Cases") +
  theme_minimal()+
  theme(
    plot.title = element_text(size = 16, face = "bold"))

Q1.a- Confirmed Cases Count in 2020

# Confirmed Cases in 2020

confirmed_2020 <- df %>%
  filter(`case_status` == "Confirmed", year(onset_date) == 2020)

kable(count(confirmed_2020), caption= "Confirmed cases in year 2020")%>%
  kable_classic(full_width = F, html_font = "Cambria")
Confirmed cases in year 2020
n
52

Q1.b- Percentage of Confirmed & Probable Cases

#Percentage of Confirmed & Probable Cases with Known Hospitalization

# Filter confirmed and probable cases
confirmed_probable <- df %>%
  filter(case_status %in% c("Confirmed", "Probable"))

# Filter those with known hospitalization (Yes or No)
known_hosp <- confirmed_probable %>%
  filter(hospitalized %in% c("Yes", "No"))

# Calculate percentage of known hospitalizations
percent_known <- nrow(known_hosp) / nrow(confirmed_probable) * 100

kable(round(percent_known, 2), caption="Percentage of Confirmed & Probable Cases with Known Hospitalization")%>%
  kable_classic(full_width = F, html_font = "Cambria")
Percentage of Confirmed & Probable Cases with Known Hospitalization
x
76.92

Q1.c- Female to Male Ratio

# What is the female to male ratio among confirmed cases?
gender_ratio <- df %>%
  filter(case_status == "Confirmed") %>%
  count(sex) %>%
  pivot_wider(names_from = sex, values_from = n, values_fill = 0)

# Female to male ratio
ratio_female_male <- round(gender_ratio$Female / gender_ratio$Male, 4)

kable(gender_ratio, caption="Female to Male number among confirmed cases")%>%
  kable_classic(full_width = F, html_font = "Cambria")
Female to Male number among confirmed cases
Female Male Unknown
52 87 1
kable(ratio_female_male, caption="Female to Male ratio among confirmed cases")%>%
  kable_classic(full_width = F, html_font = "Cambria")
Female to Male ratio among confirmed cases
x
0.5977

Q1.d- Disease X Trend from 2018-20 is Increasing

#Is disease X increasing or decreasing from 2018 to 2020?

cases <- df %>%
  filter(case_status == "Confirmed", Year %in% 2018:2020) %>%
  count(Year)

kable(cases, caption="Disease X count from 2018 to 2020 is Increasing")%>%
  kable_classic(full_width = F, html_font = "Cambria")
Disease X count from 2018 to 2020 is Increasing
Year n
2018 25
2019 30
2020 52

Q1.e- Average & Median age of Confirmed and Probable cases

# What is the average and median age of confirmed + probable cases?
age_stats <- df %>%
  filter(case_status %in% c("Confirmed", "Probable")) %>%
  summarise(
    average_age = round(mean(Age, na.rm = TRUE), 2),
    median_age = round(median(Age, na.rm = TRUE), 2)
  )

kable(age_stats, caption=" Average and Median age of Confirmed and Probable cases")%>%
  kable_classic(full_width = F, html_font = "Cambria")
Average and Median age of Confirmed and Probable cases
average_age median_age
53.69 58.81

Q1.f- Disease X Seasonality

Yes, Disease X appears to be seasonal. Based on the data, most cases cluster around late spring to early fall (June to September), which suggests a summer seasonality—possibly linked to environmental or vector exposure patterns common in warmer months.

# Is Disease X seasonal? Why or why not?
# Prepare data
df_season <- df %>%
  filter(case_status %in% c("Confirmed", "Probable")) %>%
  mutate(
    month_onset = month(onset_date, label = TRUE, abbr = FALSE)
  ) %>%
  count(Year, month_onset)

# Reshape for wide-format table (months as rows, years as columns)
seasonality_wide <- df_season %>%
  pivot_wider(names_from = Year, values_from = n, values_fill = 0) %>%
  mutate(
    Total_cases = rowSums(select(., `2018`:`2021`))
  ) %>%
  arrange(match(month_onset, month.name))

kable(seasonality_wide, caption="Disease X Seasonality")%>%
  kable_classic(full_width = F, html_font = "Cambria")
Disease X Seasonality
month_onset 2018 2019 2020 2021 Total_cases
January 2 1 6 3 12
February 2 2 1 1 6
March 1 4 6 3 14
April 2 2 3 2 9
May 2 1 6 4 13
June 3 4 5 6 18
July 5 10 7 3 25
August 9 5 10 6 30
September 3 5 7 5 20
October 2 2 7 4 15
November 0 2 4 4 10
December 2 3 4 1 10
df %>%
  filter(case_status %in% c("Confirmed", "Probable")) %>%
  mutate(month = month(onset_date, label = TRUE)) %>%
  count(month) %>%
  ggplot(aes(x = month, y = n)) +
  geom_col(fill = "tomato") +
  geom_text(aes(label = n), vjust = -0.5) +
  labs(title = "Monthly Distribution of Disease X Cases",
       x = "Month", y = "Number of Cases") +
  theme_minimal()+
  theme(
    plot.title = element_text(size = 16, face = "bold"))

# Faceted bar plot
ggplot(df_season, aes(x = month_onset, y = n)) +
  geom_col(fill = "steelblue") +
  facet_wrap(~ Year, ncol = 2) +
  geom_text(aes(label = n), vjust = -0.5) +
  labs(
    title = "Monthly Distribution of Disease X Cases by Year",
    x = "Month",
    y = "Number of Cases"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 16, face = "bold"),
    strip.text = element_text(size = 12, face = "bold"),
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

Q1.g- Incidence of Disease X from 2018-21

# Incidence of cases (combined confirmed and probable) of disease X from 2018 to 2021

pop <- 559139

# Ensure lowercase and dates
df_i <- df %>%
  mutate(
    case_id = as.character(case_id),
    case_status = str_to_lower(case_status),
    died = str_to_lower(died),
    onset_date = as.Date(onset_date),
    dob = as.Date(dob),
    age = as.numeric(difftime(onset_date, dob, units = "days")) / 365.25,
    Year = year(onset_date)
  )

#keeping only deduplicated cases (unique case id)
df_unique <- df_i %>%
  filter(case_status %in% c("confirmed", "probable")) %>%
  distinct(case_id, .keep_all = TRUE)

# Filter years 2018–2020
df_1821 <- df_unique %>%
  filter(Year %in% 2018:2021)

# Duplicate cases based on case_id (all entries, not deduplicated)
duplicate_cases <- df_i %>%
  filter(case_status %in% c("confirmed", "probable", "suspect")) %>%
  group_by(case_id) %>%
  filter(n() > 1) %>%
  arrange(case_id)

# View duplicate cases
kable(duplicate_cases, caption = "Duplicate Case IDs in Dataset") %>%
  kable_classic(full_width = F, html_font = "Cambria")
Duplicate Case IDs in Dataset
condition form_rvd nbs_status case_status case_id dob sex race ethnicity county hsr onset_date lab sx hospitalized died risk_factors travel Age Year age_rounded age
NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
:——— :——– :———- :———– :——- :— :— :—- :——— :—— :— :———- :— :– :———— :—- :———— :—— —: —-: ———–: —:
# Death cases among deduplicated data from 2018–2021
death_cases <- df_1821 %>%
  filter(died == "yes")

# Deaths per year for adjustment
death_counts <- df_1821 %>%
  filter(died == "yes") %>%
  count(Year, name = "Deaths")

# View death cases
kable(death_counts, caption = "Deceased Cases (2018–2021)") %>%
  kable_classic(full_width = F, html_font = "Cambria")
Deceased Cases (2018–2021)
Year Deaths
2018 2
2019 4
2020 5
2021 4
# Count new cases
incidence_summary <- df_1821 %>%
  count(Year, name = "New_Cases") %>%
  left_join(death_counts, by = "Year") %>%
  mutate(
    Deaths = replace_na(Deaths, 0),
    Adjusted_Pop = pop - Deaths,
    Person_Time_Years = Adjusted_Pop,  # Assuming each person contributes 1 year
    incidence_per_100k = round((New_Cases / Person_Time_Years) * 100000, 2)
  )

# View final incidence table
kable(incidence_summary, caption = "Adjusted Incidence of Disease X cases from 2018 to 2020") %>%
  kable_classic(full_width = F, html_font = "Cambria")%>%
  add_footnote(
    label = "* Assumes each individual contributes 1 full person-year unless deceased during that year.",
    notation = "symbol"
  )
Adjusted Incidence of Disease X cases from 2018 to 2020
Year New_Cases Deaths Adjusted_Pop Person_Time_Years incidence_per_100k
2018 33 2 559137 559137 5.90
2019 41 4 559135 559135 7.33
2020 66 5 559134 559134 11.80
2021 42 4 559135 559135 7.51
Assumes each individual contributes 1 full person-year unless deceased during that year.
# Plot incidence
ggplot(incidence_summary, aes(x = factor(Year), y = incidence_per_100k)) +
  geom_col(fill = "dodgerblue") +
  geom_text(aes(label = incidence_per_100k), vjust = -0.5) +
  labs(
    title = "Incidence of Disease X (Confirmed + Probable)",
    subtitle = "Per 100,000 population (2018–2021)",
    x = "Year",
    y = "Incidence Rate"
  ) +
  theme_minimal()+
  theme(
    plot.title = element_text(size = 16, face = "bold"))

Q2- Incidence Rates & Counts of Disease Y

# Case counts and incidence rates of Disease Y
# Since data already loaded on top as df_y using it to further clean
#  Extract case data from top 4 rows
df_y_cases <- df_y[1:5, ]%>%
  mutate(across(everything(), as.character))
colnames(df_y_cases) <- c("County", "2015", "2016", "2017", "2018", "2019", "2020", "2021")

#  Create population table manually
population_tbl <- tribble(
  ~County,      ~Population,
  "Jefferson",   150000,
  "Washington",  500000,
  "Lincoln",      95000,
  "Jackson",      40000
)

# Reshape and join population data
df_y_long <- df_y_cases %>%
  pivot_longer(cols = -County, names_to = "Year", values_to = "Case_Count") %>%
  mutate(
    Year = as.numeric(Year),
    Case_Count = as.numeric(Case_Count)
  ) %>%
  left_join(population_tbl, by = "County") %>%
  mutate(
    Person_Time_Years = Population,  # Assume each person contributes 1 year
    Incidence_per_100k = round((Case_Count / Population) * 100000, 2)
  )

## Remove rows where County is "Disease Y" or Case_Count is NA
df_y_long <- df_y_long %>%
  filter(County != "Disease Y", !is.na(Case_Count))

# View final result
kable(df_y_long, caption="Counts and Incidence Rate for Disease Y")%>%
  kable_classic(full_width = F, html_font = "Cambria") %>%
  add_footnote(
    label = "* Assumes each individual contributes 1 full person-year annually for the population denominator.",
    notation = "symbol"
  )
Counts and Incidence Rate for Disease Y
County Year Case_Count Population Person_Time_Years Incidence_per_100k
Jefferson 2015 120 150000 150000 80.00
Jefferson 2016 140 150000 150000 93.33
Jefferson 2017 100 150000 150000 66.67
Jefferson 2018 98 150000 150000 65.33
Jefferson 2019 45 150000 150000 30.00
Jefferson 2020 140 150000 150000 93.33
Jefferson 2021 162 150000 150000 108.00
Washington 2015 183 500000 500000 36.60
Washington 2016 349 500000 500000 69.80
Washington 2017 438 500000 500000 87.60
Washington 2018 522 500000 500000 104.40
Washington 2019 414 500000 500000 82.80
Washington 2020 348 500000 500000 69.60
Washington 2021 412 500000 500000 82.40
Lincoln 2015 148 95000 95000 155.79
Lincoln 2016 91 95000 95000 95.79
Lincoln 2017 94 95000 95000 98.95
Lincoln 2018 134 95000 95000 141.05
Lincoln 2019 97 95000 95000 102.11
Lincoln 2020 101 95000 95000 106.32
Lincoln 2021 140 95000 95000 147.37
Jackson 2015 50 40000 40000 125.00
Jackson 2016 74 40000 40000 185.00
Jackson 2017 80 40000 40000 200.00
Jackson 2018 104 40000 40000 260.00
Jackson 2019 118 40000 40000 295.00
Jackson 2020 96 40000 40000 240.00
Jackson 2021 132 40000 40000 330.00
Assumes each individual contributes 1 full person-year annually for the population denominator.
# Bar Plot: Case Counts by County over Time
ggplot(df_y_long, aes(x = factor(Year), y = Case_Count, fill = County)) +
  geom_col(position = position_dodge(width = 0.9)) +
  geom_text(
    aes(label = Case_Count),
    position = position_dodge(width = 0.9),
    vjust = -0.5,
    size = 3
  ) +
  labs(
    title = "Disease Y Case Counts by County and Year",
    x = "Year",
    y = "Number of Cases"
  ) +
  theme_minimal()+
  theme(
    plot.title = element_text(size = 16, face = "bold"))

# Line Plot: Incidence Rates by County
ggplot(df_y_long, aes(x = Year, y = Incidence_per_100k, color = County)) +
  geom_line(size = 1) +
  geom_point() +
  geom_text(
    aes(label = Incidence_per_100k),
    vjust = -0.5,
    size = 3
  ) +
  labs(
    title = "Incidence Rate of Disease Y by County (Per 100,000)",
    x = "Year",
    y = "Incidence per 100k"
  ) +
  theme_minimal()+
  theme(
    plot.title = element_text(size = 16, face = "bold"))

Newsletter Summary (for school nurses):

Understanding Disease Y Risk: Why Incidence Rates Matter

From 2015 to 2021, reported case counts of Disease Y varied substantially across counties. While Washington County consistently reported the highest number of total cases, its incidence rates were moderate due to its large population base of 500,000. For example, in 2018, Washington had 522 cases (incidence: 104.40 per 100,000), whereas Jackson County, with only 132 cases in 2021, had a much higher incidence of 330.00 per 100,000 due to a smaller population of 40,000.

This means that although Washington had more cases, Jackson residents were at a higher individual risk of contracting Disease Y. Similarly, Lincoln County showed a consistently elevated risk, peaking at 155.79 per 100,000 in 2015.

These findings underscore why school nurses—and public health professionals in general—should not rely solely on raw case counts. Incidence rates, which adjust for population size, provide a clearer picture of community-level risk. This approach helps prioritize health messaging, prevention efforts, and resource allocation, especially in high-risk counties like Jackson and Lincoln.

Note: Incidence rates were calculated assuming each individual contributes 1 full person-year annually, based on standard epidemiological methodology.