Epidemiology Skill Assessment

###### PROLOG ########


# PROLOG   ###
# PROJECT: Epidemiology Skill Assessment           
# PURPOSE: Skill Assessment                                      
# DIR:     C:\Users\kesha\Downloads\Skill Assessment                      
# DATA:    Data - In-Basket A.xlsx                         
# AUTHOR:  Dr. Keshav Kumar                                            
# CREATED: JULY 11, 2025                                           
# LATEST:  JULY 11, 2025                                     
# NOTES:   N/A
# PROLOG   ### 


# libraries 


library(magrittr) # for pipes
library(table1) # for descriptive statistics 
library(tidyverse) # for tidy code
library(sessioninfo) # for session_info at bottom
library(details) # for session_info at bottom
library(ggthemes) # for tufte theme
library(ggrepel) # for text plotting
library(patchwork) # for combining plots
library(readxl) # for reading xlxs files
library(kableExtra) # for table headers formatting
library(knitr) # for tables
library(sjPlot) # for model tables 
# plot theme
theme_set(theme_tufte())  # but might not carry over in chunks

# Okabe-Ito colorblind-friendly color palette:
# https://jfly.uni-koeln.de/color/

oi_pal <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", 
     "#0072B2", "#D55E00", "#CC79A7", "#999999")


###### DATA MGMT #####

# original dataset
df <- read_xlsx("C:/Users/kesha/Downloads/Skill Assessment/Data - In-Basket A.xlsx", sheet = "Case Line List Disease X")
df_y <- read_excel("C:/Users/kesha/Downloads/Skill Assessment/Data - In-Basket A.xlsx", sheet = "Disease Y", col_names = FALSE)
dict <- read_excel("C:/Users/kesha/Downloads/Skill Assessment/Data - In-Basket A.xlsx", sheet = "Data Dictionary")

# Clean and rename columns
colnames(df) <- str_trim(colnames(df))
dict <- dict %>% mutate(Column = str_trim(Column))
rename_map <- setNames(dict$Description, dict$Column)
df <- df %>% rename(any_of(rename_map)) %>%
  janitor::clean_names()

# Convert date columns and calculate age
df <- df %>%
  mutate(
    Age = round(as.numeric(difftime(`onset_date`, `dob`, units = "days")) / 365.25, 2),
    Year = year(`onset_date`)
  )

Descriptive Summary

# Descriptive Summary
label(df$case_status) <-"Case classification (confirmed, probable, suspect )"
label(df$sex) <-"Gender of case"
label(df$hospitalized) <-"Was the patient hospitalized for the disease"
label(df$ethnicity) <- "Ethnicity of case"
label(df$race) <- "Race of case"
label(df$died) <- "Did the patient die from the disease"
label(df$lab) <- "Laboratory test done"
label(df$Age) <- "Age of Patient at the time of Onset of Disease"
label(df$sx) <- "Symptoms"

table1(
  ~sex + hospitalized + died + race + ethnicity + lab +
    sx + Age |case_status,
  data= df,,render.missing=NULL,
  render.continous="median(IQR)",
  caption = "Descriptive Statistics of Disease X",
)

Descriptive Statistics of Disease X
	Confirmed (N=140)	Probable (N=42)	Suspect (N=20)	Overall (N=202)
Gender of case
Female	52 (37.1%)	18 (42.9%)	11 (55.0%)	81 (40.1%)
Male	87 (62.1%)	22 (52.4%)	9 (45.0%)	118 (58.4%)
Unknown	1 (0.7%)	2 (4.8%)	0 (0%)	3 (1.5%)
Was the patient hospitalized for the disease
No	2 (1.4%)	1 (2.4%)	2 (10.0%)	5 (2.5%)
Unknown	4 (2.9%)	38 (90.5%)	16 (80.0%)	58 (28.7%)
Yes	134 (95.7%)	3 (7.1%)	2 (10.0%)	139 (68.8%)
Did the patient die from the disease
No	90 (64.3%)	3 (7.1%)	1 (5.0%)	94 (46.5%)
Unknown	35 (25.0%)	39 (92.9%)	18 (90.0%)	92 (45.5%)
Yes	15 (10.7%)	0 (0%)	1 (5.0%)	16 (7.9%)
Race of case
Asian	1 (0.7%)	0 (0%)	0 (0%)	1 (0.5%)
Black/African American	18 (12.9%)	1 (2.4%)	1 (5.0%)	20 (9.9%)
Native American	2 (1.4%)	0 (0%)	0 (0%)	2 (1.0%)
Other	2 (1.4%)	0 (0%)	0 (0%)	2 (1.0%)
Unknown	17 (12.1%)	39 (92.9%)	17 (85.0%)	73 (36.1%)
White	100 (71.4%)	2 (4.8%)	2 (10.0%)	104 (51.5%)
Ethnicity of case
Hispanic	27 (19.3%)	2 (4.8%)	0 (0%)	29 (14.4%)
Non-Hispanic	70 (50.0%)	2 (4.8%)	2 (10.0%)	74 (36.6%)
Unknown	43 (30.7%)	38 (90.5%)	18 (90.0%)	99 (49.0%)
Laboratory test done
Cx	8 (5.7%)	0 (0%)	0 (0%)	8 (4.0%)
CX	1 (0.7%)	0 (0%)	0 (0%)	1 (0.5%)
PCR +	131 (93.6%)	0 (0%)	0 (0%)	131 (64.9%)
Ab +	0 (0%)	42 (100%)	20 (100%)	62 (30.7%)
Symptoms
No	2 (1.4%)	0 (0%)	20 (100%)	22 (10.9%)
Yes	138 (98.6%)	42 (100%)	0 (0%)	180 (89.1%)
Age of Patient at the time of Onset of Disease
Mean (SD)	54.7 (19.5)	50.2 (22.7)	42.1 (17.4)	52.5 (20.3)
Median [Min, Max]	59.3 [0.980, 90.4]	56.3 [2.75, 87.2]	40.9 [13.3, 74.4]	57.1 [0.980, 90.4]

# Round age to nearest whole number
df$age_rounded <- round(df$Age)


# Frequency table for Year
year_count <- table(df$Year)
kable(year_count, caption = "Case Count by Year")%>%
  kable_classic(full_width = F, html_font = "Cambria")

Case Count by Year
Var1	Freq
2018	36
2019	46
2020	72
2021	48

# Percentage table for Year
year_perc <- prop.table(table(df$Year)) * 100
kable(year_perc, caption = "Percentage of Cases by Year", digits = 2)%>%
  kable_classic(full_width = F, html_font = "Cambria")

Percentage of Cases by Year
Var1	Freq
2018	17.82
2019	22.77
2020	35.64
2021	23.76

The descriptive table stratified by case status (confirmed, probable, suspect) reveals important demographic and clinical patterns. The majority of confirmed cases were male (62.1%), with a median age of 59.3 years, indicating a greater burden among older adult males. Hospitalization was nearly universal among confirmed cases (95.7%), suggesting a high severity profile. However, hospitalization status was unknown for 90.5% of probable and 80% of suspect cases, indicating incomplete case investigation. Mortality was reported in 10.7% of confirmed cases, with the death status missing in nearly half of all records, limiting the reliability of fatality estimates. Race and ethnicity data were also largely missing, particularly among probable and suspect cases, which hinders equity analysis. Most confirmed cases were PCR-positive, whereas all probable and suspect cases were antibody-positive, highlighting diagnostic differences based on case classification.

Descriptive Graphs

# Histogram of age

ggplot(df, aes(x = Age)) +
  geom_histogram(binwidth = 5, fill = "steelblue", color = "white") +
   stat_bin(binwidth = 5, geom = "text", aes(label = ..count..), vjust = -0.5) +
  labs(title = "Distribution of Age at Onset",
       x = "Age (Years)", y = "Number of Cases") +
  theme_minimal()+
  theme(
    plot.title = element_text(size = 16, face = "bold"))

The age distribution shows a unimodal pattern skewed toward older adults, with most cases clustered between ages 40 and 70. This supports earlier findings that older populations are disproportionately affected and may warrant targeted prevention strategies.

# Bar Plot of Case counts by Year
ggplot(df, aes(x = factor(Year))) +
  geom_bar(fill = "orange", color = "black") +
  geom_text(stat = "count", aes(label = ..count..), vjust = -0.5) +
  labs(title = "Number of Disease X Cases by Year",
       x = "Year", y = "Number of Cases") +
  theme_minimal()+
  theme(
    plot.title = element_text(size = 16, face = "bold"))

The bar chart reinforces the tabular findings, showing the sharpest rise in case counts in 2020, which could correspond to an outbreak or improvement in surveillance and testing capacity.

# Boxplot of Age by Year
ggplot(df, aes(x = factor(Year), y = Age)) +
  geom_boxplot(fill = "lightgreen") +
  labs(title = "Age Distribution of Cases by Year",
       x = "Year", y = "Age at Onset") +
  theme_minimal()+
  theme(
    plot.title = element_text(size = 16, face = "bold"))

Age distributions remained relatively stable year-over-year, though the interquartile range was narrower in 2021, suggesting slightly more concentrated age groups being affected. No significant shift in median age was observed.

# Line Plot: Trend of Confirmed Cases Over Years
df %>%
  filter(case_status == "Confirmed") %>%
  count(Year) %>%
  ggplot(aes(x = Year, y = n)) +
  geom_line(group = 1, color = "darkblue") +
  geom_point(size = 3) +
  geom_text(aes(label = n), vjust = -0.5) +
  labs(title = "Trend of Confirmed Cases (Disease X)",
       x = "Year", y = "Number of Confirmed Cases") +
  theme_minimal()+
  theme(
    plot.title = element_text(size = 16, face = "bold"))

A steady increase in confirmed cases was observed from 2018 (25) to 2020 (52), before decreasing to 48 in 2021. This trend supports a possible outbreak curve with a peak in 2020.

Q1.a- Confirmed Cases Count in 2020

# Confirmed Cases in 2020

confirmed_2020 <- df %>%
  filter(`case_status` == "Confirmed", year(onset_date) == 2020)

kable(count(confirmed_2020), caption= "Confirmed cases in year 2020")%>%
  kable_classic(full_width = F, html_font = "Cambria")

Confirmed cases in year 2020
n
52

There were 52 confirmed cases of Disease X in 2020, making it the peak year within the observation period. This aligns with other indications of increased transmission or enhanced surveillance that year.

Q1.b- Percentage of Confirmed & Probable Cases

#Percentage of Confirmed & Probable Cases with Known Hospitalization

# Filter confirmed and probable cases
confirmed_probable <- df %>%
  filter(case_status %in% c("Confirmed", "Probable"))

# Filter those with known hospitalization (Yes or No)
known_hosp <- confirmed_probable %>%
  filter(hospitalized %in% c("Yes", "No"))

# Calculate percentage of known hospitalizations
percent_known <- nrow(known_hosp) / nrow(confirmed_probable) * 100

kable(round(percent_known, 2), caption="Percentage of Confirmed & Probable Cases with Known Hospitalization")%>%
  kable_classic(full_width = F, html_font = "Cambria")

Percentage of Confirmed & Probable Cases with Known Hospitalization
x
76.92

Among confirmed and probable cases, 76.92% had known hospitalization status (Yes/No). This reflects moderate data completeness for key clinical outcome fields, but efforts to reduce unknowns are needed for accurate severity assessments.

Q1.c- Female to Male Ratio

# What is the female to male ratio among confirmed cases?
gender_ratio <- df %>%
  filter(case_status == "Confirmed") %>%
  count(sex) %>%
  pivot_wider(names_from = sex, values_from = n, values_fill = 0)

# Female to male ratio
ratio_female_male <- round(gender_ratio$Female / gender_ratio$Male, 4)

kable(gender_ratio, caption="Female to Male number among confirmed cases")%>%
  kable_classic(full_width = F, html_font = "Cambria")

Female to Male number among confirmed cases
Female	Male	Unknown
52	87	1

kable(ratio_female_male, caption="Female to Male ratio among confirmed cases")%>%
  kable_classic(full_width = F, html_font = "Cambria")

Female to Male ratio among confirmed cases
x
0.5977

Among confirmed cases, the female-to-male ratio was 0.60, indicating males were disproportionately affected. This gender disparity could reflect differential exposure, occupation-related risks, or health-seeking behavior.

Q1.d- Disease X Trend from 2018-20 is Increasing

#Is disease X increasing or decreasing from 2018 to 2020?

cases <- df %>%
  filter(case_status == "Confirmed", Year %in% 2018:2020) %>%
  count(Year)

kable(cases, caption="Disease X count from 2018 to 2020 is Increasing")%>%
  kable_classic(full_width = F, html_font = "Cambria")

Disease X count from 2018 to 2020 is Increasing
Year	n
2018	25
2019	30
2020	52

Confirmed case counts rose steadily from 25 in 2018 to 52 in 2020, consistent with an increasing trend in either disease transmission or surveillance intensity. Public health interventions may have been warranted post-2019.

Q1.e- Average & Median age of Confirmed and Probable cases

# What is the average and median age of confirmed + probable cases?
age_stats <- df %>%
  filter(case_status %in% c("Confirmed", "Probable")) %>%
  summarise(
    average_age = round(mean(Age, na.rm = TRUE), 2),
    median_age = round(median(Age, na.rm = TRUE), 2)
  )

kable(age_stats, caption=" Average and Median age of Confirmed and Probable cases")%>%
  kable_classic(full_width = F, html_font = "Cambria")

Average and Median age of Confirmed and Probable cases
average_age	median_age
53.69	58.81

Among confirmed and probable cases, the mean age was 53.7 years, and the median was 58.8 years. These figures confirm a concentration of cases in middle-aged and older adults, reinforcing the need for age-targeted interventions.

Q1.f- Disease X Seasonality

# Is Disease X seasonal? Why or why not?
# Prepare data
df_season <- df %>%
  filter(case_status %in% c("Confirmed", "Probable")) %>%
  mutate(
    month_onset = month(onset_date, label = TRUE, abbr = FALSE)
  ) %>%
  count(Year, month_onset)

# Reshape for wide-format table (months as rows, years as columns)
seasonality_wide <- df_season %>%
  pivot_wider(names_from = Year, values_from = n, values_fill = 0) %>%
  mutate(
    Total_cases = rowSums(select(., `2018`:`2021`))
  ) %>%
  arrange(match(month_onset, month.name))

kable(seasonality_wide, caption="Disease X Seasonality")%>%
  kable_classic(full_width = F, html_font = "Cambria")

Disease X Seasonality
month_onset	2018	2019	2020	2021	Total_cases
January	2	1	6	3	12
February	2	2	1	1	6
March	1	4	6	3	14
April	2	2	3	2	9
May	2	1	6	4	13
June	3	4	5	6	18
July	5	10	7	3	25
August	9	5	10	6	30
September	3	5	7	5	20
October	2	2	7	4	15
November	0	2	4	4	10
December	2	3	4	1	10

df %>%
  filter(case_status %in% c("Confirmed", "Probable")) %>%
  mutate(month = month(onset_date, label = TRUE)) %>%
  count(month) %>%
  ggplot(aes(x = month, y = n)) +
  geom_col(fill = "tomato") +
  geom_text(aes(label = n), vjust = -0.5) +
  labs(title = "Monthly Distribution of Disease X Cases",
       x = "Month", y = "Number of Cases") +
  theme_minimal()+
  theme(
    plot.title = element_text(size = 16, face = "bold"))

Disease X exhibited clear summer seasonality, with cases peaking between June and September. This pattern was consistent across multiple years and suggests an environmental or vector-related exposure pathway. Public health messaging and control efforts should be intensified during these months.

# Faceted bar plot
ggplot(df_season, aes(x = month_onset, y = n)) +
  geom_col(fill = "steelblue") +
  facet_wrap(~ Year, ncol = 2) +
  geom_text(aes(label = n), vjust = -0.5) +
  labs(
    title = "Monthly Distribution of Disease X Cases by Year",
    x = "Month",
    y = "Number of Cases"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 16, face = "bold"),
    strip.text = element_text(size = 12, face = "bold"),
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

Yes, Disease X appears to be seasonal. Based on the data, most cases cluster around late spring to early fall (June to September), which suggests a summer seasonality—possibly linked to environmental or vector exposure patterns common in warmer months.

Q1.g- Incidence of Disease X from 2018-21

# Incidence of cases (combined confirmed and probable) of disease X from 2018 to 2021

pop <- 559139

# Ensure lowercase and dates
df_i <- df %>%
  mutate(
    case_id = as.character(case_id),
    case_status = str_to_lower(case_status),
    died = str_to_lower(died),
    onset_date = as.Date(onset_date),
    dob = as.Date(dob),
    age = as.numeric(difftime(onset_date, dob, units = "days")) / 365.25,
    Year = year(onset_date)
  )

#keeping only deduplicated cases (unique case id)
df_unique <- df_i %>%
  filter(case_status %in% c("confirmed", "probable")) %>%
  distinct(case_id, .keep_all = TRUE)

# Filter years 2018–2020
df_1821 <- df_unique %>%
  filter(Year %in% 2018:2021)

# Duplicate cases based on case_id (all entries, not deduplicated)
duplicate_cases <- df_i %>%
  filter(case_status %in% c("confirmed", "probable", "suspect")) %>%
  group_by(case_id) %>%
  filter(n() > 1) %>%
  arrange(case_id)

# View duplicate cases
kable(duplicate_cases, caption = "Duplicate Case IDs in Dataset") %>%
  kable_classic(full_width = F, html_font = "Cambria")

Duplicate Case IDs in Dataset
condition	form_rvd	nbs_status	case_status	case_id	dob	sex	race	ethnicity	county	hsr	onset_date	lab	sx	hospitalized	died	risk_factors	travel	Age	Year	age_rounded	age
NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA
:———	:——–	:———-	:———–	:——-	:—	:—	:—-	:———	:——	:—	:———-	:—	:–	:————	:—-	:————	:——	—:	—-:	———–:	—:

There is no any duplicate data row present.

# Death cases among deduplicated data from 2018–2021
death_cases <- df_1821 %>%
  filter(died == "yes")

# Deaths per year for adjustment
death_counts <- df_1821 %>%
  filter(died == "yes") %>%
  count(Year, name = "Deaths")

# View death cases
kable(death_counts, caption = "Deceased Cases (2018–2021)") %>%
  kable_classic(full_width = F, html_font = "Cambria")

Deceased Cases (2018–2021)
Year	Deaths
2018	2
2019	4
2020	5
2021	4

# Count new cases
incidence_summary <- df_1821 %>%
  count(Year, name = "New_Cases") %>%
  left_join(death_counts, by = "Year") %>%
  mutate(
    Deaths = replace_na(Deaths, 0),
    Adjusted_Pop = pop - Deaths,
    Person_Time_Years = Adjusted_Pop,  # Assuming each person contributes 1 year
    incidence_per_100k = round((New_Cases / Person_Time_Years) * 100000, 2)
  )

# View final incidence table
kable(incidence_summary, caption = "Adjusted Incidence of Disease X cases from 2018 to 2020") %>%
  kable_classic(full_width = F, html_font = "Cambria")%>%
  add_footnote(
    label = "* Assumes each individual contributes 1 full person-year unless deceased during that year.",
    notation = "symbol"
  )

Adjusted Incidence of Disease X cases from 2018 to 2020
Year	New_Cases	Deaths	Adjusted_Pop	Person_Time_Years	incidence_per_100k
2018	33	2	559137	559137	5.90
2019	41	4	559135	559135	7.33
2020	66	5	559134	559134	11.80
2021	42	4	559135	559135	7.51
Assumes each individual contributes 1 full person-year unless deceased during that year.

# Plot incidence
ggplot(incidence_summary, aes(x = factor(Year), y = incidence_per_100k)) +
  geom_col(fill = "dodgerblue") +
  geom_text(aes(label = incidence_per_100k), vjust = -0.5) +
  labs(
    title = "Incidence of Disease X (Confirmed + Probable)",
    subtitle = "Per 100,000 population (2018–2021)",
    x = "Year",
    y = "Incidence Rate"
  ) +
  theme_minimal()+
  theme(
    plot.title = element_text(size = 16, face = "bold"))

Adjusted incidence rates per 100,000 population showed a peak in 2020 (11.8/100,000), followed by a decline in 2021 (7.5/100,000). These rates were adjusted for deaths to reflect true person-time at risk. The increasing trend until 2020 highlights a period of elevated risk or improved detection, while the 2021 decline may suggest successful containment or reporting fatigue.

Q2- Incidence Rates & Counts of Disease Y

# Case counts and incidence rates of Disease Y
# Since data already loaded on top as df_y using it to further clean
#  Extract case data from top 4 rows
df_y_cases <- df_y[1:5, ]%>%
  mutate(across(everything(), as.character))
colnames(df_y_cases) <- c("County", "2015", "2016", "2017", "2018", "2019", "2020", "2021")

#  Create population table manually
population_tbl <- tribble(
  ~County,      ~Population,
  "Jefferson",   150000,
  "Washington",  500000,
  "Lincoln",      95000,
  "Jackson",      40000
)

# Reshape and join population data
df_y_long <- df_y_cases %>%
  pivot_longer(cols = -County, names_to = "Year", values_to = "Case_Count") %>%
  mutate(
    Year = as.numeric(Year),
    Case_Count = as.numeric(Case_Count)
  ) %>%
  left_join(population_tbl, by = "County") %>%
  mutate(
    Person_Time_Years = Population,  # Assume each person contributes 1 year
    Incidence_per_100k = round((Case_Count / Population) * 100000, 2)
  )

## Remove rows where County is "Disease Y" or Case_Count is NA
df_y_long <- df_y_long %>%
  filter(County != "Disease Y", !is.na(Case_Count))

# View final result
kable(df_y_long, caption="Counts and Incidence Rate for Disease Y")%>%
  kable_classic(full_width = F, html_font = "Cambria") %>%
  add_footnote(
    label = "* Assumes each individual contributes 1 full person-year annually for the population denominator.",
    notation = "symbol"
  )

Counts and Incidence Rate for Disease Y
County	Year	Case_Count	Population	Person_Time_Years	Incidence_per_100k
Jefferson	2015	120	150000	150000	80.00
Jefferson	2016	140	150000	150000	93.33
Jefferson	2017	100	150000	150000	66.67
Jefferson	2018	98	150000	150000	65.33
Jefferson	2019	45	150000	150000	30.00
Jefferson	2020	140	150000	150000	93.33
Jefferson	2021	162	150000	150000	108.00
Washington	2015	183	500000	500000	36.60
Washington	2016	349	500000	500000	69.80
Washington	2017	438	500000	500000	87.60
Washington	2018	522	500000	500000	104.40
Washington	2019	414	500000	500000	82.80
Washington	2020	348	500000	500000	69.60
Washington	2021	412	500000	500000	82.40
Lincoln	2015	148	95000	95000	155.79
Lincoln	2016	91	95000	95000	95.79
Lincoln	2017	94	95000	95000	98.95
Lincoln	2018	134	95000	95000	141.05
Lincoln	2019	97	95000	95000	102.11
Lincoln	2020	101	95000	95000	106.32
Lincoln	2021	140	95000	95000	147.37
Jackson	2015	50	40000	40000	125.00
Jackson	2016	74	40000	40000	185.00
Jackson	2017	80	40000	40000	200.00
Jackson	2018	104	40000	40000	260.00
Jackson	2019	118	40000	40000	295.00
Jackson	2020	96	40000	40000	240.00
Jackson	2021	132	40000	40000	330.00
Assumes each individual contributes 1 full person-year annually for the population denominator.

# Bar Plot: Case Counts by County over Time
ggplot(df_y_long, aes(x = factor(Year), y = Case_Count, fill = County)) +
  geom_col(position = position_dodge(width = 0.9)) +
  geom_text(
    aes(label = Case_Count),
    position = position_dodge(width = 0.9),
    vjust = -0.5,
    size = 3
  ) +
  labs(
    title = "Disease Y Case Counts by County and Year",
    x = "Year",
    y = "Number of Cases"
  ) +
  theme_minimal()+
  theme(
    plot.title = element_text(size = 16, face = "bold"))

Disease Y incidence varied significantly across counties and years:

Jackson County consistently reported the highest incidence, peaking at 330/100,000 in 2021, suggesting an urgent need for investigation and control.

Lincoln and Washington Counties maintained moderately high but more stable incidence rates.

Jefferson County showed more fluctuation, with a resurgence in 2021. These variations suggest differing risk profiles, reporting capacity, or intervention effectiveness and highlight the importance of local-level surveillance and response.

# Line Plot: Incidence Rates by County
ggplot(df_y_long, aes(x = Year, y = Incidence_per_100k, color = County)) +
  geom_line(size = 1) +
  geom_point() +
  geom_text(
    aes(label = Incidence_per_100k),
    vjust = -0.5,
    size = 3
  ) +
  labs(
    title = "Incidence Rate of Disease Y by County (Per 100,000)",
    x = "Year",
    y = "Incidence per 100k"
  ) +
  theme_minimal()+
  theme(
    plot.title = element_text(size = 16, face = "bold"))