Epidemiology Program Manager (Epidemiologist III) Position

###### PROLOG ########


# PROLOG   ###
# PROJECT: Epidemiology Program Manager (Epidemiologist III) Position                   
# PURPOSE: Skill Assessment                                      
# DIR:     C:\Users\kesha\Downloads                    
# DATA:    Data - In-Basket A.xlsx                         
# AUTHOR:  Dr. Keshav Kumar                                            
# CREATED: JUNE 23, 2025                                           
# LATEST:  JUNE 24, 2025                                     
# NOTES:   N/A
# PROLOG   ### 


# libraries 


library(magrittr) # for pipes
library(table1) # for descriptive statistics 
library(tidyverse) # for tidy code
library(sessioninfo) # for session_info at bottom
library(details) # for session_info at bottom
library(ggthemes) # for tufte theme
library(ggrepel) # for text plotting
library(patchwork) # for combining plots
library(readxl) # for reading xlxs files
library(kableExtra) # for table headers formatting
library(knitr) # for tables
library(sjPlot) # for model tables 
# plot theme
theme_set(theme_tufte())  # but might not carry over in chunks

# Okabe-Ito colorblind-friendly color palette:
# https://jfly.uni-koeln.de/color/

oi_pal <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", 
     "#0072B2", "#D55E00", "#CC79A7", "#999999")


###### DATA MGMT #####

# original dataset
df <- read_xlsx("C:/Users/kesha/Downloads/Data - In-Basket A.xlsx", sheet = "Case Line List Disease X")
df_y <- read_excel("C:/Users/kesha/Downloads/Data - In-Basket A.xlsx", sheet = "Disease Y", col_names = FALSE)
dict <- read_excel("C:/Users/kesha/Downloads/Data - In-Basket A.xlsx", sheet = "Data Dictionary")

# Clean and rename columns
colnames(df) <- str_trim(colnames(df))
dict <- dict %>% mutate(Column = str_trim(Column))
rename_map <- setNames(dict$Description, dict$Column)
df <- df %>% rename(any_of(rename_map)) %>%
  janitor::clean_names()

# Convert date columns and calculate age
df <- df %>%
  mutate(
    Age = round(as.numeric(difftime(`onset_date`, `dob`, units = "days")) / 365.25, 2),
    Year = year(`onset_date`)
  )

Descriptive Summary

# Descriptive Summary
label(df$case_status) <-"Case classification (confirmed, probable, suspect )"
label(df$sex) <-"Gender of case"
label(df$hospitalized) <-"Was the patient hospitalized for the disease"
label(df$ethnicity) <- "Ethnicity of case"
label(df$race) <- "Race of case"
label(df$died) <- "Did the patient die from the disease"
label(df$lab) <- "Laboratory test done"
label(df$Age) <- "Age of Patient at the time of Onset of Disease"
label(df$sx) <- "Symptoms"

desc<- table1(
  ~case_status + sex + hospitalized + died + race + ethnicity + lab +
    sx + Age,
  data= df,
  render.continous="median(IQR)",
  caption = "Descriptive Statistics of Disease X",
)

kable(desc)

	Overall
	(N=202)
Case classification (confirmed, probable, suspect )
Confirmed	140 (69.3%)
Probable	42 (20.8%)
Suspect	20 (9.9%)
Gender of case
Female	81 (40.1%)
Male	118 (58.4%)
Unknown	3 (1.5%)
Was the patient hospitalized for the disease
No	5 (2.5%)
Unknown	58 (28.7%)
Yes	139 (68.8%)
Did the patient die from the disease
No	94 (46.5%)
Unknown	92 (45.5%)
Yes	16 (7.9%)
Race of case
Asian	1 (0.5%)
Black/African American	20 (9.9%)
Native American	2 (1.0%)
Other	2 (1.0%)
Unknown	73 (36.1%)
White	104 (51.5%)
Ethnicity of case
Hispanic	29 (14.4%)
Non-Hispanic	74 (36.6%)
Unknown	99 (49.0%)
Laboratory test done
Ab +	62 (30.7%)
Cx	8 (4.0%)
CX	1 (0.5%)
PCR +	131 (64.9%)
Symptoms
No	22 (10.9%)
Yes	180 (89.1%)
Age of Patient at the time of Onset of Disease
Mean (SD)	52.5 (20.3)
Median [Min, Max]	57.1 [0.980, 90.4]

# Round age to nearest whole number
df$age_rounded <- round(df$Age)


# Frequency table for Year
year_count <- table(df$Year)
kable(year_count, caption = "Case Count by Year")%>%
  kable_classic(full_width = F, html_font = "Cambria")

Case Count by Year
Var1	Freq
2018	36
2019	46
2020	72
2021	48

# Percentage table for Year
year_perc <- prop.table(table(df$Year)) * 100
kable(year_perc, caption = "Percentage of Cases by Year", digits = 2)%>%
  kable_classic(full_width = F, html_font = "Cambria")

Percentage of Cases by Year
Var1	Freq
2018	17.82
2019	22.77
2020	35.64
2021	23.76

Descriptive Graphs

# Histogram of age

ggplot(df, aes(x = Age)) +
  geom_histogram(binwidth = 5, fill = "steelblue", color = "white") +
   stat_bin(binwidth = 5, geom = "text", aes(label = ..count..), vjust = -0.5) +
  labs(title = "Distribution of Age at Onset",
       x = "Age (Years)", y = "Number of Cases") +
  theme_minimal()+
  theme(
    plot.title = element_text(size = 16, face = "bold"))

# Bar Plot of Case counts by Year
ggplot(df, aes(x = factor(Year))) +
  geom_bar(fill = "orange", color = "black") +
  geom_text(stat = "count", aes(label = ..count..), vjust = -0.5) +
  labs(title = "Number of Disease X Cases by Year",
       x = "Year", y = "Number of Cases") +
  theme_minimal()+
  theme(
    plot.title = element_text(size = 16, face = "bold"))

# Boxplot of Age by Year
ggplot(df, aes(x = factor(Year), y = Age)) +
  geom_boxplot(fill = "lightgreen") +
  labs(title = "Age Distribution of Cases by Year",
       x = "Year", y = "Age at Onset") +
  theme_minimal()+
  theme(
    plot.title = element_text(size = 16, face = "bold"))

# Line Plot: Trend of Confirmed Cases Over Years
df %>%
  filter(case_status == "Confirmed") %>%
  count(Year) %>%
  ggplot(aes(x = Year, y = n)) +
  geom_line(group = 1, color = "darkblue") +
  geom_point(size = 3) +
  geom_text(aes(label = n), vjust = -0.5) +
  labs(title = "Trend of Confirmed Cases (Disease X)",
       x = "Year", y = "Number of Confirmed Cases") +
  theme_minimal()+
  theme(
    plot.title = element_text(size = 16, face = "bold"))

Q1.a- Confirmed Cases Count in 2020

# Confirmed Cases in 2020

confirmed_2020 <- df %>%
  filter(`case_status` == "Confirmed", year(onset_date) == 2020)

kable(count(confirmed_2020), caption= "Confirmed cases in year 2020")%>%
  kable_classic(full_width = F, html_font = "Cambria")

Confirmed cases in year 2020
n
52

Q1.b- Percentage of Confirmed & Probable Cases

#Percentage of Confirmed & Probable Cases with Known Hospitalization

# Filter confirmed and probable cases
confirmed_probable <- df %>%
  filter(case_status %in% c("Confirmed", "Probable"))

# Filter those with known hospitalization (Yes or No)
known_hosp <- confirmed_probable %>%
  filter(hospitalized %in% c("Yes", "No"))

# Calculate percentage of known hospitalizations
percent_known <- nrow(known_hosp) / nrow(confirmed_probable) * 100

kable(round(percent_known, 2), caption="Percentage of Confirmed & Probable Cases with Known Hospitalization")%>%
  kable_classic(full_width = F, html_font = "Cambria")

Percentage of Confirmed & Probable Cases with Known Hospitalization
x
76.92

Q1.c- Female to Male Ratio

# What is the female to male ratio among confirmed cases?
gender_ratio <- df %>%
  filter(case_status == "Confirmed") %>%
  count(sex) %>%
  pivot_wider(names_from = sex, values_from = n, values_fill = 0)

# Female to male ratio
ratio_female_male <- round(gender_ratio$Female / gender_ratio$Male, 4)

kable(gender_ratio, caption="Female to Male number among confirmed cases")%>%
  kable_classic(full_width = F, html_font = "Cambria")

Female to Male number among confirmed cases
Female	Male	Unknown
52	87	1

kable(ratio_female_male, caption="Female to Male ratio among confirmed cases")%>%
  kable_classic(full_width = F, html_font = "Cambria")

Female to Male ratio among confirmed cases
x
0.5977

Q1.d- Disease X Trend from 2018-20 is Increasing

#Is disease X increasing or decreasing from 2018 to 2020?

cases <- df %>%
  filter(case_status == "Confirmed", Year %in% 2018:2020) %>%
  count(Year)

kable(cases, caption="Disease X count from 2018 to 2020 is Increasing")%>%
  kable_classic(full_width = F, html_font = "Cambria")

Disease X count from 2018 to 2020 is Increasing
Year	n
2018	25
2019	30
2020	52

Q1.e- Average & Median age of Confirmed and Probable cases

# What is the average and median age of confirmed + probable cases?
age_stats <- df %>%
  filter(case_status %in% c("Confirmed", "Probable")) %>%
  summarise(
    average_age = round(mean(Age, na.rm = TRUE), 2),
    median_age = round(median(Age, na.rm = TRUE), 2)
  )

kable(age_stats, caption=" Average and Median age of Confirmed and Probable cases")%>%
  kable_classic(full_width = F, html_font = "Cambria")

Average and Median age of Confirmed and Probable cases
average_age	median_age
53.69	58.81

Q1.f- Disease X Seasonality

Yes, Disease X appears to be seasonal. Based on the data, most cases cluster around late spring to early fall (June to September), which suggests a summer seasonality—possibly linked to environmental or vector exposure patterns common in warmer months.

# Is Disease X seasonal? Why or why not?
# Prepare data
df_season <- df %>%
  filter(case_status %in% c("Confirmed", "Probable")) %>%
  mutate(
    month_onset = month(onset_date, label = TRUE, abbr = FALSE)
  ) %>%
  count(Year, month_onset)

# Reshape for wide-format table (months as rows, years as columns)
seasonality_wide <- df_season %>%
  pivot_wider(names_from = Year, values_from = n, values_fill = 0) %>%
  mutate(
    Total_cases = rowSums(select(., `2018`:`2021`))
  ) %>%
  arrange(match(month_onset, month.name))

kable(seasonality_wide, caption="Disease X Seasonality")%>%
  kable_classic(full_width = F, html_font = "Cambria")

Disease X Seasonality
month_onset	2018	2019	2020	2021	Total_cases
January	2	1	6	3	12
February	2	2	1	1	6
March	1	4	6	3	14
April	2	2	3	2	9
May	2	1	6	4	13
June	3	4	5	6	18
July	5	10	7	3	25
August	9	5	10	6	30
September	3	5	7	5	20
October	2	2	7	4	15
November	0	2	4	4	10
December	2	3	4	1	10

df %>%
  filter(case_status %in% c("Confirmed", "Probable")) %>%
  mutate(month = month(onset_date, label = TRUE)) %>%
  count(month) %>%
  ggplot(aes(x = month, y = n)) +
  geom_col(fill = "tomato") +
  geom_text(aes(label = n), vjust = -0.5) +
  labs(title = "Monthly Distribution of Disease X Cases",
       x = "Month", y = "Number of Cases") +
  theme_minimal()+
  theme(
    plot.title = element_text(size = 16, face = "bold"))

# Faceted bar plot
ggplot(df_season, aes(x = month_onset, y = n)) +
  geom_col(fill = "steelblue") +
  facet_wrap(~ Year, ncol = 2) +
  geom_text(aes(label = n), vjust = -0.5) +
  labs(
    title = "Monthly Distribution of Disease X Cases by Year",
    x = "Month",
    y = "Number of Cases"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 16, face = "bold"),
    strip.text = element_text(size = 12, face = "bold"),
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

Q1.g- Incidence of Disease X from 2018-21

# Incidence of cases (combined confirmed and probable) of disease X from 2018 to 2021

pop <- 559139

# Ensure lowercase and dates
df_i <- df %>%
  mutate(
    case_id = as.character(case_id),
    case_status = str_to_lower(case_status),
    died = str_to_lower(died),
    onset_date = as.Date(onset_date),
    dob = as.Date(dob),
    age = as.numeric(difftime(onset_date, dob, units = "days")) / 365.25,
    Year = year(onset_date)
  )

#keeping only deduplicated cases (unique case id)
df_unique <- df_i %>%
  filter(case_status %in% c("confirmed", "probable")) %>%
  distinct(case_id, .keep_all = TRUE)

# Filter years 2018–2020
df_1821 <- df_unique %>%
  filter(Year %in% 2018:2021)

# Duplicate cases based on case_id (all entries, not deduplicated)
duplicate_cases <- df_i %>%
  filter(case_status %in% c("confirmed", "probable", "suspect")) %>%
  group_by(case_id) %>%
  filter(n() > 1) %>%
  arrange(case_id)

# View duplicate cases
kable(duplicate_cases, caption = "Duplicate Case IDs in Dataset") %>%
  kable_classic(full_width = F, html_font = "Cambria")

Duplicate Case IDs in Dataset
condition	form_rvd	nbs_status	case_status	case_id	dob	sex	race	ethnicity	county	hsr	onset_date	lab	sx	hospitalized	died	risk_factors	travel	Age	Year	age_rounded	age
NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA
:———	:——–	:———-	:———–	:——-	:—	:—	:—-	:———	:——	:—	:———-	:—	:–	:————	:—-	:————	:——	—:	—-:	———–:	—:

# Death cases among deduplicated data from 2018–2021
death_cases <- df_1821 %>%
  filter(died == "yes")

# Deaths per year for adjustment
death_counts <- df_1821 %>%
  filter(died == "yes") %>%
  count(Year, name = "Deaths")

# View death cases
kable(death_counts, caption = "Deceased Cases (2018–2021)") %>%
  kable_classic(full_width = F, html_font = "Cambria")

Deceased Cases (2018–2021)
Year	Deaths
2018	2
2019	4
2020	5
2021	4

# Count new cases
incidence_summary <- df_1821 %>%
  count(Year, name = "New_Cases") %>%
  left_join(death_counts, by = "Year") %>%
  mutate(
    Deaths = replace_na(Deaths, 0),
    Adjusted_Pop = pop - Deaths,
    Person_Time_Years = Adjusted_Pop,  # Assuming each person contributes 1 year
    incidence_per_100k = round((New_Cases / Person_Time_Years) * 100000, 2)
  )

# View final incidence table
kable(incidence_summary, caption = "Adjusted Incidence of Disease X cases from 2018 to 2020") %>%
  kable_classic(full_width = F, html_font = "Cambria")%>%
  add_footnote(
    label = "* Assumes each individual contributes 1 full person-year unless deceased during that year.",
    notation = "symbol"
  )

Adjusted Incidence of Disease X cases from 2018 to 2020
Year	New_Cases	Deaths	Adjusted_Pop	Person_Time_Years	incidence_per_100k
2018	33	2	559137	559137	5.90
2019	41	4	559135	559135	7.33
2020	66	5	559134	559134	11.80
2021	42	4	559135	559135	7.51
Assumes each individual contributes 1 full person-year unless deceased during that year.

# Plot incidence
ggplot(incidence_summary, aes(x = factor(Year), y = incidence_per_100k)) +
  geom_col(fill = "dodgerblue") +
  geom_text(aes(label = incidence_per_100k), vjust = -0.5) +
  labs(
    title = "Incidence of Disease X (Confirmed + Probable)",
    subtitle = "Per 100,000 population (2018–2021)",
    x = "Year",
    y = "Incidence Rate"
  ) +
  theme_minimal()+
  theme(
    plot.title = element_text(size = 16, face = "bold"))

Q2- Incidence Rates & Counts of Disease Y

# Case counts and incidence rates of Disease Y
# Since data already loaded on top as df_y using it to further clean
#  Extract case data from top 4 rows
df_y_cases <- df_y[1:5, ]%>%
  mutate(across(everything(), as.character))
colnames(df_y_cases) <- c("County", "2015", "2016", "2017", "2018", "2019", "2020", "2021")

#  Create population table manually
population_tbl <- tribble(
  ~County,      ~Population,
  "Jefferson",   150000,
  "Washington",  500000,
  "Lincoln",      95000,
  "Jackson",      40000
)

# Reshape and join population data
df_y_long <- df_y_cases %>%
  pivot_longer(cols = -County, names_to = "Year", values_to = "Case_Count") %>%
  mutate(
    Year = as.numeric(Year),
    Case_Count = as.numeric(Case_Count)
  ) %>%
  left_join(population_tbl, by = "County") %>%
  mutate(
    Person_Time_Years = Population,  # Assume each person contributes 1 year
    Incidence_per_100k = round((Case_Count / Population) * 100000, 2)
  )

## Remove rows where County is "Disease Y" or Case_Count is NA
df_y_long <- df_y_long %>%
  filter(County != "Disease Y", !is.na(Case_Count))

# View final result
kable(df_y_long, caption="Counts and Incidence Rate for Disease Y")%>%
  kable_classic(full_width = F, html_font = "Cambria") %>%
  add_footnote(
    label = "* Assumes each individual contributes 1 full person-year annually for the population denominator.",
    notation = "symbol"
  )

Counts and Incidence Rate for Disease Y
County	Year	Case_Count	Population	Person_Time_Years	Incidence_per_100k
Jefferson	2015	120	150000	150000	80.00
Jefferson	2016	140	150000	150000	93.33
Jefferson	2017	100	150000	150000	66.67
Jefferson	2018	98	150000	150000	65.33
Jefferson	2019	45	150000	150000	30.00
Jefferson	2020	140	150000	150000	93.33
Jefferson	2021	162	150000	150000	108.00
Washington	2015	183	500000	500000	36.60
Washington	2016	349	500000	500000	69.80
Washington	2017	438	500000	500000	87.60
Washington	2018	522	500000	500000	104.40
Washington	2019	414	500000	500000	82.80
Washington	2020	348	500000	500000	69.60
Washington	2021	412	500000	500000	82.40
Lincoln	2015	148	95000	95000	155.79
Lincoln	2016	91	95000	95000	95.79
Lincoln	2017	94	95000	95000	98.95
Lincoln	2018	134	95000	95000	141.05
Lincoln	2019	97	95000	95000	102.11
Lincoln	2020	101	95000	95000	106.32
Lincoln	2021	140	95000	95000	147.37
Jackson	2015	50	40000	40000	125.00
Jackson	2016	74	40000	40000	185.00
Jackson	2017	80	40000	40000	200.00
Jackson	2018	104	40000	40000	260.00
Jackson	2019	118	40000	40000	295.00
Jackson	2020	96	40000	40000	240.00
Jackson	2021	132	40000	40000	330.00
Assumes each individual contributes 1 full person-year annually for the population denominator.

# Bar Plot: Case Counts by County over Time
ggplot(df_y_long, aes(x = factor(Year), y = Case_Count, fill = County)) +
  geom_col(position = position_dodge(width = 0.9)) +
  geom_text(
    aes(label = Case_Count),
    position = position_dodge(width = 0.9),
    vjust = -0.5,
    size = 3
  ) +
  labs(
    title = "Disease Y Case Counts by County and Year",
    x = "Year",
    y = "Number of Cases"
  ) +
  theme_minimal()+
  theme(
    plot.title = element_text(size = 16, face = "bold"))

# Line Plot: Incidence Rates by County
ggplot(df_y_long, aes(x = Year, y = Incidence_per_100k, color = County)) +
  geom_line(size = 1) +
  geom_point() +
  geom_text(
    aes(label = Incidence_per_100k),
    vjust = -0.5,
    size = 3
  ) +
  labs(
    title = "Incidence Rate of Disease Y by County (Per 100,000)",
    x = "Year",
    y = "Incidence per 100k"
  ) +
  theme_minimal()+
  theme(
    plot.title = element_text(size = 16, face = "bold"))

Epidemiology Program Manager (Epidemiologist III) Position - Skill Assessment

Dr. Keshav Kumar

24 June, 2025

Descriptive Summary

Descriptive Graphs

Q1.a- Confirmed Cases Count in 2020

Q1.b- Percentage of Confirmed & Probable Cases

Q1.c- Female to Male Ratio

Q1.d- Disease X Trend from 2018-20 is Increasing

Q1.e- Average & Median age of Confirmed and Probable cases

Q1.f- Disease X Seasonality

Q1.g- Incidence of Disease X from 2018-21

Q2- Incidence Rates & Counts of Disease Y

Epidemiology Program Manager (Epidemiologist III) Position - Skill Assessment

Dr. Keshav Kumar

24 June, 2025

Descriptive Summary

Descriptive Graphs

Q1.a- Confirmed Cases Count in 2020

Q1.b- Percentage of Confirmed & Probable Cases

Q1.c- Female to Male Ratio

Q1.d- Disease X Trend from 2018-20 is Increasing

Q1.e- Average & Median age of Confirmed and Probable cases

Q1.f- Disease X Seasonality

Q1.g- Incidence of Disease X from 2018-21

Q2- Incidence Rates & Counts of Disease Y

Newsletter Summary (for school nurses):