###### PROLOG ########


# PROLOG   ###
# PROJECT: Epidemiology Skill Assessment           
# PURPOSE: Skill Assessment                                      
# DIR:     C:\Users\kesha\Downloads\Skill Assessment                      
# DATA:    Data - In-Basket A.xlsx                         
# AUTHOR:  Dr. Keshav Kumar                                            
# CREATED: JULY 11, 2025                                           
# LATEST:  JULY 11, 2025                                     
# NOTES:   N/A
# PROLOG   ### 


# libraries 


library(magrittr) # for pipes
library(table1) # for descriptive statistics 
library(tidyverse) # for tidy code
library(sessioninfo) # for session_info at bottom
library(details) # for session_info at bottom
library(ggthemes) # for tufte theme
library(ggrepel) # for text plotting
library(patchwork) # for combining plots
library(readxl) # for reading xlxs files
library(kableExtra) # for table headers formatting
library(knitr) # for tables
library(sjPlot) # for model tables 
# plot theme
theme_set(theme_tufte())  # but might not carry over in chunks

# Okabe-Ito colorblind-friendly color palette:
# https://jfly.uni-koeln.de/color/

oi_pal <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", 
     "#0072B2", "#D55E00", "#CC79A7", "#999999")


###### DATA MGMT #####

# original dataset
df <- read_xlsx("C:/Users/kesha/Downloads/Skill Assessment/Data - In-Basket A.xlsx", sheet = "Case Line List Disease X")
df_y <- read_excel("C:/Users/kesha/Downloads/Skill Assessment/Data - In-Basket A.xlsx", sheet = "Disease Y", col_names = FALSE)
dict <- read_excel("C:/Users/kesha/Downloads/Skill Assessment/Data - In-Basket A.xlsx", sheet = "Data Dictionary")

# Clean and rename columns
colnames(df) <- str_trim(colnames(df))
dict <- dict %>% mutate(Column = str_trim(Column))
rename_map <- setNames(dict$Description, dict$Column)
df <- df %>% rename(any_of(rename_map)) %>%
  janitor::clean_names()

# Convert date columns and calculate age
df <- df %>%
  mutate(
    Age = round(as.numeric(difftime(`onset_date`, `dob`, units = "days")) / 365.25, 2),
    Year = year(`onset_date`)
  )

Descriptive Summary

# Descriptive Summary
label(df$case_status) <-"Case classification (confirmed, probable, suspect )"
label(df$sex) <-"Gender of case"
label(df$hospitalized) <-"Was the patient hospitalized for the disease"
label(df$ethnicity) <- "Ethnicity of case"
label(df$race) <- "Race of case"
label(df$died) <- "Did the patient die from the disease"
label(df$lab) <- "Laboratory test done"
label(df$Age) <- "Age of Patient at the time of Onset of Disease"
label(df$sx) <- "Symptoms"

table1(
  ~sex + hospitalized + died + race + ethnicity + lab +
    sx + Age |case_status,
  data= df,,render.missing=NULL,
  render.continous="median(IQR)",
  caption = "Descriptive Statistics of Disease X",
)
Descriptive Statistics of Disease X
Confirmed
(N=140)
Probable
(N=42)
Suspect
(N=20)
Overall
(N=202)
Gender of case
Female 52 (37.1%) 18 (42.9%) 11 (55.0%) 81 (40.1%)
Male 87 (62.1%) 22 (52.4%) 9 (45.0%) 118 (58.4%)
Unknown 1 (0.7%) 2 (4.8%) 0 (0%) 3 (1.5%)
Was the patient hospitalized for the disease
No 2 (1.4%) 1 (2.4%) 2 (10.0%) 5 (2.5%)
Unknown 4 (2.9%) 38 (90.5%) 16 (80.0%) 58 (28.7%)
Yes 134 (95.7%) 3 (7.1%) 2 (10.0%) 139 (68.8%)
Did the patient die from the disease
No 90 (64.3%) 3 (7.1%) 1 (5.0%) 94 (46.5%)
Unknown 35 (25.0%) 39 (92.9%) 18 (90.0%) 92 (45.5%)
Yes 15 (10.7%) 0 (0%) 1 (5.0%) 16 (7.9%)
Race of case
Asian 1 (0.7%) 0 (0%) 0 (0%) 1 (0.5%)
Black/African American 18 (12.9%) 1 (2.4%) 1 (5.0%) 20 (9.9%)
Native American 2 (1.4%) 0 (0%) 0 (0%) 2 (1.0%)
Other 2 (1.4%) 0 (0%) 0 (0%) 2 (1.0%)
Unknown 17 (12.1%) 39 (92.9%) 17 (85.0%) 73 (36.1%)
White 100 (71.4%) 2 (4.8%) 2 (10.0%) 104 (51.5%)
Ethnicity of case
Hispanic 27 (19.3%) 2 (4.8%) 0 (0%) 29 (14.4%)
Non-Hispanic 70 (50.0%) 2 (4.8%) 2 (10.0%) 74 (36.6%)
Unknown 43 (30.7%) 38 (90.5%) 18 (90.0%) 99 (49.0%)
Laboratory test done
Cx 8 (5.7%) 0 (0%) 0 (0%) 8 (4.0%)
CX 1 (0.7%) 0 (0%) 0 (0%) 1 (0.5%)
PCR + 131 (93.6%) 0 (0%) 0 (0%) 131 (64.9%)
Ab + 0 (0%) 42 (100%) 20 (100%) 62 (30.7%)
Symptoms
No 2 (1.4%) 0 (0%) 20 (100%) 22 (10.9%)
Yes 138 (98.6%) 42 (100%) 0 (0%) 180 (89.1%)
Age of Patient at the time of Onset of Disease
Mean (SD) 54.7 (19.5) 50.2 (22.7) 42.1 (17.4) 52.5 (20.3)
Median [Min, Max] 59.3 [0.980, 90.4] 56.3 [2.75, 87.2] 40.9 [13.3, 74.4] 57.1 [0.980, 90.4]
# Round age to nearest whole number
df$age_rounded <- round(df$Age)


# Frequency table for Year
year_count <- table(df$Year)
kable(year_count, caption = "Case Count by Year")%>%
  kable_classic(full_width = F, html_font = "Cambria")
Case Count by Year
Var1 Freq
2018 36
2019 46
2020 72
2021 48
# Percentage table for Year
year_perc <- prop.table(table(df$Year)) * 100
kable(year_perc, caption = "Percentage of Cases by Year", digits = 2)%>%
  kable_classic(full_width = F, html_font = "Cambria")
Percentage of Cases by Year
Var1 Freq
2018 17.82
2019 22.77
2020 35.64
2021 23.76

The descriptive table stratified by case status (confirmed, probable, suspect) reveals important demographic and clinical patterns. The majority of confirmed cases were male (62.1%), with a median age of 59.3 years, indicating a greater burden among older adult males. Hospitalization was nearly universal among confirmed cases (95.7%), suggesting a high severity profile. However, hospitalization status was unknown for 90.5% of probable and 80% of suspect cases, indicating incomplete case investigation. Mortality was reported in 10.7% of confirmed cases, with the death status missing in nearly half of all records, limiting the reliability of fatality estimates. Race and ethnicity data were also largely missing, particularly among probable and suspect cases, which hinders equity analysis. Most confirmed cases were PCR-positive, whereas all probable and suspect cases were antibody-positive, highlighting diagnostic differences based on case classification.

Descriptive Graphs

# Histogram of age

ggplot(df, aes(x = Age)) +
  geom_histogram(binwidth = 5, fill = "steelblue", color = "white") +
   stat_bin(binwidth = 5, geom = "text", aes(label = ..count..), vjust = -0.5) +
  labs(title = "Distribution of Age at Onset",
       x = "Age (Years)", y = "Number of Cases") +
  theme_minimal()+
  theme(
    plot.title = element_text(size = 16, face = "bold"))

The age distribution shows a unimodal pattern skewed toward older adults, with most cases clustered between ages 40 and 70. This supports earlier findings that older populations are disproportionately affected and may warrant targeted prevention strategies.

# Bar Plot of Case counts by Year
ggplot(df, aes(x = factor(Year))) +
  geom_bar(fill = "orange", color = "black") +
  geom_text(stat = "count", aes(label = ..count..), vjust = -0.5) +
  labs(title = "Number of Disease X Cases by Year",
       x = "Year", y = "Number of Cases") +
  theme_minimal()+
  theme(
    plot.title = element_text(size = 16, face = "bold"))

The bar chart reinforces the tabular findings, showing the sharpest rise in case counts in 2020, which could correspond to an outbreak or improvement in surveillance and testing capacity.

# Boxplot of Age by Year
ggplot(df, aes(x = factor(Year), y = Age)) +
  geom_boxplot(fill = "lightgreen") +
  labs(title = "Age Distribution of Cases by Year",
       x = "Year", y = "Age at Onset") +
  theme_minimal()+
  theme(
    plot.title = element_text(size = 16, face = "bold"))

Age distributions remained relatively stable year-over-year, though the interquartile range was narrower in 2021, suggesting slightly more concentrated age groups being affected. No significant shift in median age was observed.

# Line Plot: Trend of Confirmed Cases Over Years
df %>%
  filter(case_status == "Confirmed") %>%
  count(Year) %>%
  ggplot(aes(x = Year, y = n)) +
  geom_line(group = 1, color = "darkblue") +
  geom_point(size = 3) +
  geom_text(aes(label = n), vjust = -0.5) +
  labs(title = "Trend of Confirmed Cases (Disease X)",
       x = "Year", y = "Number of Confirmed Cases") +
  theme_minimal()+
  theme(
    plot.title = element_text(size = 16, face = "bold"))

A steady increase in confirmed cases was observed from 2018 (25) to 2020 (52), before decreasing to 48 in 2021. This trend supports a possible outbreak curve with a peak in 2020.

Q1.a- Confirmed Cases Count in 2020

# Confirmed Cases in 2020

confirmed_2020 <- df %>%
  filter(`case_status` == "Confirmed", year(onset_date) == 2020)

kable(count(confirmed_2020), caption= "Confirmed cases in year 2020")%>%
  kable_classic(full_width = F, html_font = "Cambria")
Confirmed cases in year 2020
n
52

There were 52 confirmed cases of Disease X in 2020, making it the peak year within the observation period. This aligns with other indications of increased transmission or enhanced surveillance that year.

Q1.b- Percentage of Confirmed & Probable Cases

#Percentage of Confirmed & Probable Cases with Known Hospitalization

# Filter confirmed and probable cases
confirmed_probable <- df %>%
  filter(case_status %in% c("Confirmed", "Probable"))

# Filter those with known hospitalization (Yes or No)
known_hosp <- confirmed_probable %>%
  filter(hospitalized %in% c("Yes", "No"))

# Calculate percentage of known hospitalizations
percent_known <- nrow(known_hosp) / nrow(confirmed_probable) * 100

kable(round(percent_known, 2), caption="Percentage of Confirmed & Probable Cases with Known Hospitalization")%>%
  kable_classic(full_width = F, html_font = "Cambria")
Percentage of Confirmed & Probable Cases with Known Hospitalization
x
76.92

Among confirmed and probable cases, 76.92% had known hospitalization status (Yes/No). This reflects moderate data completeness for key clinical outcome fields, but efforts to reduce unknowns are needed for accurate severity assessments.

Q1.c- Female to Male Ratio

# What is the female to male ratio among confirmed cases?
gender_ratio <- df %>%
  filter(case_status == "Confirmed") %>%
  count(sex) %>%
  pivot_wider(names_from = sex, values_from = n, values_fill = 0)

# Female to male ratio
ratio_female_male <- round(gender_ratio$Female / gender_ratio$Male, 4)

kable(gender_ratio, caption="Female to Male number among confirmed cases")%>%
  kable_classic(full_width = F, html_font = "Cambria")
Female to Male number among confirmed cases
Female Male Unknown
52 87 1
kable(ratio_female_male, caption="Female to Male ratio among confirmed cases")%>%
  kable_classic(full_width = F, html_font = "Cambria")
Female to Male ratio among confirmed cases
x
0.5977

Among confirmed cases, the female-to-male ratio was 0.60, indicating males were disproportionately affected. This gender disparity could reflect differential exposure, occupation-related risks, or health-seeking behavior.

Q1.d- Disease X Trend from 2018-20 is Increasing

#Is disease X increasing or decreasing from 2018 to 2020?

cases <- df %>%
  filter(case_status == "Confirmed", Year %in% 2018:2020) %>%
  count(Year)

kable(cases, caption="Disease X count from 2018 to 2020 is Increasing")%>%
  kable_classic(full_width = F, html_font = "Cambria")
Disease X count from 2018 to 2020 is Increasing
Year n
2018 25
2019 30
2020 52

Confirmed case counts rose steadily from 25 in 2018 to 52 in 2020, consistent with an increasing trend in either disease transmission or surveillance intensity. Public health interventions may have been warranted post-2019.

Q1.e- Average & Median age of Confirmed and Probable cases

# What is the average and median age of confirmed + probable cases?
age_stats <- df %>%
  filter(case_status %in% c("Confirmed", "Probable")) %>%
  summarise(
    average_age = round(mean(Age, na.rm = TRUE), 2),
    median_age = round(median(Age, na.rm = TRUE), 2)
  )

kable(age_stats, caption=" Average and Median age of Confirmed and Probable cases")%>%
  kable_classic(full_width = F, html_font = "Cambria")
Average and Median age of Confirmed and Probable cases
average_age median_age
53.69 58.81

Among confirmed and probable cases, the mean age was 53.7 years, and the median was 58.8 years. These figures confirm a concentration of cases in middle-aged and older adults, reinforcing the need for age-targeted interventions.

Q1.f- Disease X Seasonality

# Is Disease X seasonal? Why or why not?
# Prepare data
df_season <- df %>%
  filter(case_status %in% c("Confirmed", "Probable")) %>%
  mutate(
    month_onset = month(onset_date, label = TRUE, abbr = FALSE)
  ) %>%
  count(Year, month_onset)

# Reshape for wide-format table (months as rows, years as columns)
seasonality_wide <- df_season %>%
  pivot_wider(names_from = Year, values_from = n, values_fill = 0) %>%
  mutate(
    Total_cases = rowSums(select(., `2018`:`2021`))
  ) %>%
  arrange(match(month_onset, month.name))

kable(seasonality_wide, caption="Disease X Seasonality")%>%
  kable_classic(full_width = F, html_font = "Cambria")
Disease X Seasonality
month_onset 2018 2019 2020 2021 Total_cases
January 2 1 6 3 12
February 2 2 1 1 6
March 1 4 6 3 14
April 2 2 3 2 9
May 2 1 6 4 13
June 3 4 5 6 18
July 5 10 7 3 25
August 9 5 10 6 30
September 3 5 7 5 20
October 2 2 7 4 15
November 0 2 4 4 10
December 2 3 4 1 10
df %>%
  filter(case_status %in% c("Confirmed", "Probable")) %>%
  mutate(month = month(onset_date, label = TRUE)) %>%
  count(month) %>%
  ggplot(aes(x = month, y = n)) +
  geom_col(fill = "tomato") +
  geom_text(aes(label = n), vjust = -0.5) +
  labs(title = "Monthly Distribution of Disease X Cases",
       x = "Month", y = "Number of Cases") +
  theme_minimal()+
  theme(
    plot.title = element_text(size = 16, face = "bold"))

Disease X exhibited clear summer seasonality, with cases peaking between June and September. This pattern was consistent across multiple years and suggests an environmental or vector-related exposure pathway. Public health messaging and control efforts should be intensified during these months.

# Faceted bar plot
ggplot(df_season, aes(x = month_onset, y = n)) +
  geom_col(fill = "steelblue") +
  facet_wrap(~ Year, ncol = 2) +
  geom_text(aes(label = n), vjust = -0.5) +
  labs(
    title = "Monthly Distribution of Disease X Cases by Year",
    x = "Month",
    y = "Number of Cases"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 16, face = "bold"),
    strip.text = element_text(size = 12, face = "bold"),
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

Yes, Disease X appears to be seasonal. Based on the data, most cases cluster around late spring to early fall (June to September), which suggests a summer seasonality—possibly linked to environmental or vector exposure patterns common in warmer months.

Q1.g- Incidence of Disease X from 2018-21

# Incidence of cases (combined confirmed and probable) of disease X from 2018 to 2021

pop <- 559139

# Ensure lowercase and dates
df_i <- df %>%
  mutate(
    case_id = as.character(case_id),
    case_status = str_to_lower(case_status),
    died = str_to_lower(died),
    onset_date = as.Date(onset_date),
    dob = as.Date(dob),
    age = as.numeric(difftime(onset_date, dob, units = "days")) / 365.25,
    Year = year(onset_date)
  )

#keeping only deduplicated cases (unique case id)
df_unique <- df_i %>%
  filter(case_status %in% c("confirmed", "probable")) %>%
  distinct(case_id, .keep_all = TRUE)

# Filter years 2018–2020
df_1821 <- df_unique %>%
  filter(Year %in% 2018:2021)

# Duplicate cases based on case_id (all entries, not deduplicated)
duplicate_cases <- df_i %>%
  filter(case_status %in% c("confirmed", "probable", "suspect")) %>%
  group_by(case_id) %>%
  filter(n() > 1) %>%
  arrange(case_id)

# View duplicate cases
kable(duplicate_cases, caption = "Duplicate Case IDs in Dataset") %>%
  kable_classic(full_width = F, html_font = "Cambria")
Duplicate Case IDs in Dataset
condition form_rvd nbs_status case_status case_id dob sex race ethnicity county hsr onset_date lab sx hospitalized died risk_factors travel Age Year age_rounded age
NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
:——— :——– :———- :———– :——- :— :— :—- :——— :—— :— :———- :— :– :———— :—- :———— :—— —: —-: ———–: —:

There is no any duplicate data row present.

# Death cases among deduplicated data from 2018–2021
death_cases <- df_1821 %>%
  filter(died == "yes")

# Deaths per year for adjustment
death_counts <- df_1821 %>%
  filter(died == "yes") %>%
  count(Year, name = "Deaths")

# View death cases
kable(death_counts, caption = "Deceased Cases (2018–2021)") %>%
  kable_classic(full_width = F, html_font = "Cambria")
Deceased Cases (2018–2021)
Year Deaths
2018 2
2019 4
2020 5
2021 4
# Count new cases
incidence_summary <- df_1821 %>%
  count(Year, name = "New_Cases") %>%
  left_join(death_counts, by = "Year") %>%
  mutate(
    Deaths = replace_na(Deaths, 0),
    Adjusted_Pop = pop - Deaths,
    Person_Time_Years = Adjusted_Pop,  # Assuming each person contributes 1 year
    incidence_per_100k = round((New_Cases / Person_Time_Years) * 100000, 2)
  )

# View final incidence table
kable(incidence_summary, caption = "Adjusted Incidence of Disease X cases from 2018 to 2020") %>%
  kable_classic(full_width = F, html_font = "Cambria")%>%
  add_footnote(
    label = "* Assumes each individual contributes 1 full person-year unless deceased during that year.",
    notation = "symbol"
  )
Adjusted Incidence of Disease X cases from 2018 to 2020
Year New_Cases Deaths Adjusted_Pop Person_Time_Years incidence_per_100k
2018 33 2 559137 559137 5.90
2019 41 4 559135 559135 7.33
2020 66 5 559134 559134 11.80
2021 42 4 559135 559135 7.51
Assumes each individual contributes 1 full person-year unless deceased during that year.
# Plot incidence
ggplot(incidence_summary, aes(x = factor(Year), y = incidence_per_100k)) +
  geom_col(fill = "dodgerblue") +
  geom_text(aes(label = incidence_per_100k), vjust = -0.5) +
  labs(
    title = "Incidence of Disease X (Confirmed + Probable)",
    subtitle = "Per 100,000 population (2018–2021)",
    x = "Year",
    y = "Incidence Rate"
  ) +
  theme_minimal()+
  theme(
    plot.title = element_text(size = 16, face = "bold"))

Adjusted incidence rates per 100,000 population showed a peak in 2020 (11.8/100,000), followed by a decline in 2021 (7.5/100,000). These rates were adjusted for deaths to reflect true person-time at risk. The increasing trend until 2020 highlights a period of elevated risk or improved detection, while the 2021 decline may suggest successful containment or reporting fatigue.

Q2- Incidence Rates & Counts of Disease Y

# Case counts and incidence rates of Disease Y
# Since data already loaded on top as df_y using it to further clean
#  Extract case data from top 4 rows
df_y_cases <- df_y[1:5, ]%>%
  mutate(across(everything(), as.character))
colnames(df_y_cases) <- c("County", "2015", "2016", "2017", "2018", "2019", "2020", "2021")

#  Create population table manually
population_tbl <- tribble(
  ~County,      ~Population,
  "Jefferson",   150000,
  "Washington",  500000,
  "Lincoln",      95000,
  "Jackson",      40000
)

# Reshape and join population data
df_y_long <- df_y_cases %>%
  pivot_longer(cols = -County, names_to = "Year", values_to = "Case_Count") %>%
  mutate(
    Year = as.numeric(Year),
    Case_Count = as.numeric(Case_Count)
  ) %>%
  left_join(population_tbl, by = "County") %>%
  mutate(
    Person_Time_Years = Population,  # Assume each person contributes 1 year
    Incidence_per_100k = round((Case_Count / Population) * 100000, 2)
  )

## Remove rows where County is "Disease Y" or Case_Count is NA
df_y_long <- df_y_long %>%
  filter(County != "Disease Y", !is.na(Case_Count))

# View final result
kable(df_y_long, caption="Counts and Incidence Rate for Disease Y")%>%
  kable_classic(full_width = F, html_font = "Cambria") %>%
  add_footnote(
    label = "* Assumes each individual contributes 1 full person-year annually for the population denominator.",
    notation = "symbol"
  )
Counts and Incidence Rate for Disease Y
County Year Case_Count Population Person_Time_Years Incidence_per_100k
Jefferson 2015 120 150000 150000 80.00
Jefferson 2016 140 150000 150000 93.33
Jefferson 2017 100 150000 150000 66.67
Jefferson 2018 98 150000 150000 65.33
Jefferson 2019 45 150000 150000 30.00
Jefferson 2020 140 150000 150000 93.33
Jefferson 2021 162 150000 150000 108.00
Washington 2015 183 500000 500000 36.60
Washington 2016 349 500000 500000 69.80
Washington 2017 438 500000 500000 87.60
Washington 2018 522 500000 500000 104.40
Washington 2019 414 500000 500000 82.80
Washington 2020 348 500000 500000 69.60
Washington 2021 412 500000 500000 82.40
Lincoln 2015 148 95000 95000 155.79
Lincoln 2016 91 95000 95000 95.79
Lincoln 2017 94 95000 95000 98.95
Lincoln 2018 134 95000 95000 141.05
Lincoln 2019 97 95000 95000 102.11
Lincoln 2020 101 95000 95000 106.32
Lincoln 2021 140 95000 95000 147.37
Jackson 2015 50 40000 40000 125.00
Jackson 2016 74 40000 40000 185.00
Jackson 2017 80 40000 40000 200.00
Jackson 2018 104 40000 40000 260.00
Jackson 2019 118 40000 40000 295.00
Jackson 2020 96 40000 40000 240.00
Jackson 2021 132 40000 40000 330.00
Assumes each individual contributes 1 full person-year annually for the population denominator.
# Bar Plot: Case Counts by County over Time
ggplot(df_y_long, aes(x = factor(Year), y = Case_Count, fill = County)) +
  geom_col(position = position_dodge(width = 0.9)) +
  geom_text(
    aes(label = Case_Count),
    position = position_dodge(width = 0.9),
    vjust = -0.5,
    size = 3
  ) +
  labs(
    title = "Disease Y Case Counts by County and Year",
    x = "Year",
    y = "Number of Cases"
  ) +
  theme_minimal()+
  theme(
    plot.title = element_text(size = 16, face = "bold"))

Disease Y incidence varied significantly across counties and years:

Jackson County consistently reported the highest incidence, peaking at 330/100,000 in 2021, suggesting an urgent need for investigation and control.

Lincoln and Washington Counties maintained moderately high but more stable incidence rates.

Jefferson County showed more fluctuation, with a resurgence in 2021. These variations suggest differing risk profiles, reporting capacity, or intervention effectiveness and highlight the importance of local-level surveillance and response.

# Line Plot: Incidence Rates by County
ggplot(df_y_long, aes(x = Year, y = Incidence_per_100k, color = County)) +
  geom_line(size = 1) +
  geom_point() +
  geom_text(
    aes(label = Incidence_per_100k),
    vjust = -0.5,
    size = 3
  ) +
  labs(
    title = "Incidence Rate of Disease Y by County (Per 100,000)",
    x = "Year",
    y = "Incidence per 100k"
  ) +
  theme_minimal()+
  theme(
    plot.title = element_text(size = 16, face = "bold"))

Newsletter Summary (for school nurses):

Understanding Disease Y Risk: Why Incidence Rates Matter

From 2015 to 2021, reported case counts of Disease Y varied substantially across counties. While Washington County consistently reported the highest number of total cases, its incidence rates were moderate due to its large population base of 500,000. For example, in 2018, Washington had 522 cases (incidence: 104.40 per 100,000), whereas Jackson County, with only 132 cases in 2021, had a much higher incidence of 330.00 per 100,000 due to a smaller population of 40,000.

This means that although Washington had more cases, Jackson residents were at a higher individual risk of contracting Disease Y. Similarly, Lincoln County showed a consistently elevated risk, peaking at 155.79 per 100,000 in 2015.

These findings underscore why school nurses—and public health professionals in general—should not rely solely on raw case counts. Incidence rates, which adjust for population size, provide a clearer picture of community-level risk. This approach helps prioritize health messaging, prevention efforts, and resource allocation, especially in high-risk counties like Jackson and Lincoln.

Note: Incidence rates were calculated assuming each individual contributes 1 full person-year annually, based on standard epidemiological methodology.