###### PROLOG ########


# PROLOG   ###
# PROJECT: Epidemiology Program Manager (Epidemiologist III) Position                   
# PURPOSE: Skill Assessment                                      
# DIR:     C:\Users\kesha\Downloads                    
# DATA:    Data - In-Basket A.xlsx                         
# AUTHOR:  Dr. Keshav Kumar                                            
# CREATED: JUNE 23, 2025                                           
# LATEST:  JUNE 23, 2025                                     
# NOTES:   N/A
# PROLOG   ### 


# libraries 


library(magrittr) # for pipes
library(table1) # for descriptive statistics 
library(tidyverse) # for tidy code
library(sessioninfo) # for session_info at bottom
library(details) # for session_info at bottom
library(ggthemes) # for tufte theme
library(ggrepel) # for text plotting
library(patchwork) # for combining plots
library(readxl) # for reading xlxs files
library(lme4) # for linear mixed models
library(knitr) # for tables
library(sjPlot) # for model tables 
# plot theme
theme_set(theme_tufte())  # but might not carry over in chunks

# Okabe-Ito colorblind-friendly color palette:
# https://jfly.uni-koeln.de/color/

oi_pal <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", 
     "#0072B2", "#D55E00", "#CC79A7", "#999999")


###### DATA MGMT #####

# original dataset
df <- read_xlsx("C:/Users/kesha/Downloads/Data - In-Basket A.xlsx", sheet = "Case Line List Disease X")
df_y <- read_excel("C:/Users/kesha/Downloads/Data - In-Basket A.xlsx", sheet = "Disease Y", col_names = FALSE)
dict <- read_excel("C:/Users/kesha/Downloads/Data - In-Basket A.xlsx", sheet = "Data Dictionary")

# Clean and rename columns
colnames(df) <- str_trim(colnames(df))
dict <- dict %>% mutate(Column = str_trim(Column))
rename_map <- setNames(dict$Description, dict$Column)
df <- df %>% rename(any_of(rename_map)) %>%
  janitor::clean_names()

# Convert date columns and calculate age
df <- df %>%
  mutate(
    Age = as.numeric(difftime(`onset_date`, `dob`, units = "days")) / 365.25,
    Year = year(`onset_date`)
  )

Descriptive Summary

# Descriptive Summary
desc<- table1(
  ~case_status + sex + hospitalized + died + race + ethnicity + lab +
    sx + Age,
  data= df,
  render.continous="median(IQR)",
  caption = "Descriptive Statistics of Disease X"
)

kable(desc)
Overall
(N=202)
case_status
Confirmed 140 (69.3%)
Probable 42 (20.8%)
Suspect 20 (9.9%)
sex
Female 81 (40.1%)
Male 118 (58.4%)
Unknown 3 (1.5%)
hospitalized
No 5 (2.5%)
Unknown 58 (28.7%)
Yes 139 (68.8%)
died
No 94 (46.5%)
Unknown 92 (45.5%)
Yes 16 (7.9%)
race
Asian 1 (0.5%)
Black/African American 20 (9.9%)
Native American 2 (1.0%)
Other 2 (1.0%)
Unknown 73 (36.1%)
White 104 (51.5%)
ethnicity
Hispanic 29 (14.4%)
Non-Hispanic 74 (36.6%)
Unknown 99 (49.0%)
lab
Ab + 62 (30.7%)
Cx 8 (4.0%)
CX 1 (0.5%)
PCR + 131 (64.9%)
sx
No 22 (10.9%)
Yes 180 (89.1%)
Age
Mean (SD) 52.5 (20.3)
Median [Min, Max] 57.1 [0.977, 90.4]
# Round age to nearest whole number
df$age_rounded <- round(df$Age)


# Frequency table for Year
year_count <- table(df$Year)
kable(year_count)
Var1 Freq
2018 36
2019 46
2020 72
2021 48
# Percentage table for Year
year_perc <- prop.table(table(df$Year)) * 100
kable(year_perc)
Var1 Freq
2018 17.82178
2019 22.77228
2020 35.64356
2021 23.76238

Descriptive Graphs

# Histogram of age

ggplot(df, aes(x = Age)) +
  geom_histogram(binwidth = 5, fill = "steelblue", color = "white") +
   stat_bin(binwidth = 5, geom = "text", aes(label = ..count..), vjust = -0.5) +
  labs(title = "Distribution of Age at Onset",
       x = "Age (Years)", y = "Number of Cases") +
  theme_minimal()

# Bar Plot of Case counts by Year
ggplot(df, aes(x = factor(Year))) +
  geom_bar(fill = "orange", color = "black") +
  geom_text(stat = "count", aes(label = ..count..), vjust = -0.5) +
  labs(title = "Number of Disease X Cases by Year",
       x = "Year", y = "Number of Cases") +
  theme_minimal()

# Boxplot of Age by Year
ggplot(df, aes(x = factor(Year), y = Age)) +
  geom_boxplot(fill = "lightgreen") +
  labs(title = "Age Distribution of Cases by Year",
       x = "Year", y = "Age at Onset") +
  theme_minimal()

# Line Plot: Trend of Confirmed Cases Over Years
df %>%
  filter(case_status == "Confirmed") %>%
  count(Year) %>%
  ggplot(aes(x = Year, y = n)) +
  geom_line(group = 1, color = "darkblue") +
  geom_point(size = 3) +
  geom_text(aes(label = n), vjust = -0.5) +
  labs(title = "Trend of Confirmed Cases (Disease X)",
       x = "Year", y = "Number of Confirmed Cases") +
  theme_minimal()

Confirmed Cases in 2020

# Confirmed Cases in 2020

confirmed_2020 <- df %>%
  filter(`case_status` == "Confirmed", year(onset_date) == 2020)

kable(confirmed_2020)
condition form_rvd nbs_status case_status case_id dob sex race ethnicity county hsr onset_date lab sx hospitalized died risk_factors travel Age Year age_rounded
Disease X No Pending Confirmed 3909145 1947-10-30 Male Unknown Non-Hispanic Bell R7 2020-10-01 PCR + Yes Yes Unknown unknown Rockport 72.922656 2020 73
Disease X Yes Approved Confirmed 3933593 1950-11-27 Male White Non-Hispanic Tarrant R2_3 2020-11-15 PCR + Yes Yes No unknown Mexico 69.968515 2020 70
Disease X Yes Approved Confirmed 4100280 1956-08-31 Male Black/African American Non-Hispanic Tarrant R2_3 2020-08-31 PCR + Yes Yes No ESRD, HIV/AIDS, immunosuppressed None 64.000000 2020 64
Disease X Yes Approved Confirmed 4187620 1950-01-03 Male White Non-Hispanic Harris R6_5 2020-12-05 Cx Yes Yes Yes none Unknown 70.921287 2020 71
Disease X Yes Approved Confirmed 4204794 1993-01-15 Male White Non-Hispanic Coryell R7 2020-10-25 PCR + Yes Yes Unknown diabetes, immunosuppressed El Paso 27.775496 2020 28
Disease X Yes Approved Confirmed 4212631 1978-10-19 Male White Hispanic Caldwell R7 2020-03-20 PCR + Yes Yes Unknown cancer, smoker, immunosuppressed None 41.418207 2020 41
Disease X Yes Approved Confirmed 4213141 1961-04-18 Male Unknown Unknown Dallas R2_3 2020-05-18 PCR + Yes Yes Unknown unknown Unknown 59.082820 2020 59
Disease X Yes Approved Confirmed 4229885 1964-08-09 Male White Unknown Dallas R2_3 2020-03-12 PCR + Yes Yes No Chronuc lung diseases, smoker, HIV?AIDS None 55.589322 2020 56
Disease X Yes Approved Confirmed 4254761 1943-09-27 Male Black/African American Non-Hispanic Travis R7 2020-12-18 PCR + Yes Yes Unknown unknown Tyler 77.226557 2020 77
Disease X Yes Approved Confirmed 4278160 1950-07-12 Female White Hispanic Bexar R8 2020-05-19 PCR + Yes Yes No corticosteriods, immunosuppressed None 69.853525 2020 70
Disease X Yes Approved Confirmed 4287521 1969-08-28 Male White Hispanic Travis R7 2020-10-25 PCR + Yes Yes No HIV Mexico 51.159480 2020 51
Disease X Yes Approved Confirmed 4294888 1950-12-20 Female Native American Unknown Harris R6_5 2020-12-28 PCR + Yes Yes No Chronic lung disease, diabetes, corticosteriods None 70.023272 2020 70
Disease X Yes Approved Confirmed 4297657 2019-04-14 Female White Non-Hispanic Bexar R8 2020-07-21 PCR + Yes Yes Yes none None 1.270363 2020 1
Disease X Yes Approved Confirmed 4341166 1981-10-17 Male White Non-Hispanic Williamson R7 2020-01-18 PCR + Yes Yes No immunosuppressed None 38.253251 2020 38
Disease X Yes Approved Confirmed 4377136 1944-09-16 Male White Non-Hispanic Travis R7 2020-09-24 PCR + Yes Yes No Cortocpsteriods None 76.021903 2020 76
Disease X Yes Approved Confirmed 4448346 1952-08-02 Male White Non-Hispanic Tarrant R2_3 2020-12-06 PCR + Yes Yes No cancer, immunosuppressed None 68.344969 2020 68
Disease X Yes Pending Confirmed 4577796 1996-07-24 Male Unknown Unknown Bell R7 2020-08-05 PCR + Yes Yes Unknown unknown San Antonio 24.032854 2020 24
Disease X Yes Approved Confirmed 4651412 1967-04-30 Female White Unknown Dallas R2_3 2020-06-16 PCR + Yes Yes Yes smoker None 53.130732 2020 53
Disease X No Entered Confirmed 4742323 1965-03-17 Female White Non-Hispanic Travis R7 2020-01-07 Cx Yes Unknown Unknown unknown Corpus Christi 54.809035 2020 55
Disease X Yes Approved Confirmed 4779266 1971-05-10 Male White Non-Hispanic Dallas R2_3 2020-07-02 PCR + Yes Yes No chronic lung disease, smoker, diabetes, corticosteriods None 49.147160 2020 49
Disease X Yes Approved Confirmed 4800426 1957-11-02 Female Unknown Unknown Travis R7 2020-08-17 PCR + Yes Yes Unknown smoker Unknown 62.789870 2020 63
Disease X Yes Approved Confirmed 4816461 1949-10-28 Male White Non-Hispanic Bexar R8 2020-09-29 PCR + Yes Yes No unknown Unknown 70.921287 2020 71
Disease X Yes Approved Confirmed 4841807 1955-08-23 Male White Non-Hispanic Harris R6_5 2020-01-28 Cx Yes Yes No ESRD, corticosteriods, immunosuppressed None 64.432580 2020 64
Disease X Yes Approved Confirmed 4884572 1946-06-16 Male White Non-Hispanic Travis R7 2020-11-23 PCR + Yes Yes No chronic lung disease, ESRD None 74.439425 2020 74
Disease X Yes Approved Confirmed 4896006 1953-02-24 Female White Unknown Dallas R2_3 2020-08-31 PCR + Yes Yes No smoker, HTN None 67.515400 2020 68
Disease X Yes Approved Confirmed 4950746 1934-12-25 Male White Hispanic Bexar R8 2020-06-18 PCR + Yes Yes Yes chronic lung disease None 85.481177 2020 85
Disease X No Approved Confirmed 4970545 1947-10-20 Male Unknown Unknown Harris R6_5 2020-10-10 PCR + Yes Unknown Unknown unknown Unknown 72.974675 2020 73
Disease X Yes Approved Confirmed 4973689 1965-07-17 Male Unknown Unknown Dallas R2_3 2020-09-07 PCR + Yes Yes No diabetes None 55.143053 2020 55
Disease X Yes Approved Confirmed 5001088 1956-08-27 Female White Hispanic Travis R7 2020-07-08 PCR + Yes Yes No Diabetes, ESRD None 63.863107 2020 64
Disease X Yes Approved Confirmed 5082971 1952-01-17 Male White Hispanic Bexar R8 2020-04-16 PCR + Yes Yes No none None 68.246407 2020 68
Disease X Yes Approved Confirmed 5300544 1973-01-22 Male Black/African American Non-Hispanic Dallas R2_3 2020-09-17 Cx Yes Yes No ESRD, Smoker None 47.652293 2020 48
Disease X Yes Approved Confirmed 5322226 1949-08-08 Male Black/African American Non-Hispanic Harris R6_5 2020-03-16 PCR + Yes Yes No Diabetes, ESRD, smoker Unknown 70.603696 2020 71
Disease X Yes Approved Confirmed 5440837 1947-03-03 Male Black/African American Unknown Dallas R2_3 2020-05-19 PCR + Yes Yes No smoker None 73.212868 2020 73
Disease X Yes Approved Confirmed 5468553 1967-10-14 Male Unknown Hispanic Hays R7 2020-06-18 PCR + Yes Yes No diabetes, smoker None 52.678987 2020 53
Disease X Yes Approved Confirmed 5484340 1968-12-07 Male White Unknown McLennan R7 2020-06-10 PCR + Yes Yes No smoker, HCV None 51.507187 2020 52
Disease X No Approved Confirmed 5486335 1952-03-04 Male White Hispanic Travis R7 2020-08-16 Cx Yes Yes No ESRD Nevada 68.451745 2020 68
Disease X Yes Approved Confirmed 5486820 1948-04-15 Male White Non-Hispanic Tarrant R2_3 2020-01-23 PCR + Yes Yes No cancer, chronic lung disease, immunospurpessed None 71.772758 2020 72
Disease X Yes Approved Confirmed 5561144 1967-04-27 Male Black/African American Non-Hispanic Dallas R2_3 2020-06-13 PCR + Yes Yes No diabetes, smoker None 53.130732 2020 53
Disease X No Entered Confirmed 5613448 1984-05-20 Male Unknown Unknown Brazos R7 2020-08-05 PCR + Yes Yes Unknown unknown Unknown 36.210814 2020 36
Disease X Yes Approved Confirmed 5638597 1963-05-29 Male White Non-Hispanic Coryell R7 2020-07-01 PCR + Yes Yes Unknown cancer, diabetes, immunosuppressed None 57.092402 2020 57
Disease X Yes Approved Confirmed 5645308 1954-11-04 Female White Hispanic Travis R7 2020-05-19 PCR + Yes Yes No diabetee None 65.538672 2020 66
Disease X Yes Approved Confirmed 5721484 1945-08-28 Male Black/African American Non-Hispanic McLennan R7 2020-01-16 PCR + Yes Yes No lung disease, Diabetes, ESRD Louisanna 74.384668 2020 74
Disease X Yes Approved Confirmed 5947265 1956-10-17 Female White Hispanic Williamson R7 2020-01-03 PCR + Yes Yes Unknown diabetes Mexico 63.211499 2020 63
Disease X Yes Approved Confirmed 5958676 1955-09-06 Male White Non-Hispanic Bell R7 2020-08-12 PCR + Yes Yes Unknown ESRD Unknown 64.933607 2020 65
Disease X Yes Approved Confirmed 6018975 1960-09-02 Male Unknown Hispanic Dallas R2_3 2020-07-06 Cx Yes Yes No leukemia None 59.841205 2020 60
Disease X Yes Approved Confirmed 6070570 1968-03-17 Female White Unknown Dallas R2_3 2020-09-28 PCR + Yes Yes Unknown cancer, diabetes, smoker None 52.533881 2020 53
Disease X Yes Approved Confirmed 6098024 1951-10-20 Male White Non-Hispanic McLennan R7 2020-09-05 PCR + Yes Yes No unknown Amarillo 68.878850 2020 69
Disease X Yes Approved Confirmed 6166237 1952-11-16 Female White Non-Hispanic Tarrant R2_3 2020-03-26 PCR + Yes Yes No unknown Wisconsin and Florida 67.356605 2020 67
Disease X Yes Approved Confirmed 6192630 1948-09-28 Male White Non-Hispanic Harris R6_5 2020-10-14 PCR + Yes Yes Yes cancer, cortcicosteroids, smoker None 72.043806 2020 72
Disease X Yes Approved Confirmed 6213323 2004-12-11 Female White Non-Hispanic Limestone R7 2020-08-31 PCR + Yes Yes No unknown None 15.720739 2020 16
Disease X Yes Approved Confirmed 6221784 1998-06-13 Male White Non-Hispanic Coryell R7 2020-03-30 PCR + Yes Yes Unknown Diabetes, smoker Houston 21.796030 2020 22
Disease X Yes Approved Confirmed 6238081 1945-06-02 Male Asian Non-Hispanic Harris R6_5 2020-03-31 PCR + Yes Yes Unknown cancer, smoker None 74.828200 2020 75

% Confirmed & Probable Cases

#% of Confirmed & Probable Cases with Known Hospitalization

known_hosp <- df %>%
  filter(`case_status` %in% c("Confirmed", "Probable")) %>%
  filter(hospitalized %in% c("Yes", "No"))

percent_known <- nrow(known_hosp) / 
  nrow(filter(df, `case_status` %in% c("Confirmed", "Probable"))) * 100

kable(round(percent_known, 2))
x
76.92

female to male ratio

# What is the female to male ratio among confirmed cases?
gender_ratio <- df %>%
  filter(case_status == "Confirmed") %>%
  count(sex) %>%
  pivot_wider(names_from = sex, values_from = n, values_fill = 0)

# Female to male ratio
ratio_female_male <- round(gender_ratio$Female / gender_ratio$Male, 2)

kable(ratio_female_male)
x
0.6

Disease X increase or decrease

#Is disease X increasing or decreasing from 2018 to 2020?

cases <- df %>%
  filter(case_status == "Confirmed", Year %in% 2018:2020) %>%
  count(Year)

kable(cases)
Year n
2018 25
2019 30
2020 52

Average & median age(confirmed + probable)

# What is the average and median age of confirmed + probable cases?
age_stats <- df %>%
  filter(case_status %in% c("Confirmed", "Probable")) %>%
  summarise(
    average_age = round(mean(Age, na.rm = TRUE), 1),
    median_age = round(median(Age, na.rm = TRUE), 1)
  )

kable(age_stats)
average_age median_age
53.7 58.8

Disease X seasonal or not

# Is Disease X seasonal? Why or why not?
# Prepare data
df_season <- df %>%
  filter(case_status %in% c("Confirmed", "Probable")) %>%
  mutate(
    month_onset = month(onset_date, label = TRUE, abbr = FALSE)
  ) %>%
  count(Year, month_onset)

# Reshape for wide-format table (months as rows, years as columns)
seasonality_wide <- df_season %>%
  pivot_wider(names_from = Year, values_from = n, values_fill = 0) %>%
  arrange(match(month_onset, month.name))

kable(seasonality_wide)
month_onset 2018 2019 2020 2021
January 2 1 6 3
February 2 2 1 1
March 1 4 6 3
April 2 2 3 2
May 2 1 6 4
June 3 4 5 6
July 5 10 7 3
August 9 5 10 6
September 3 5 7 5
October 2 2 7 4
November 0 2 4 4
December 2 3 4 1
df %>%
  filter(case_status %in% c("Confirmed", "Probable")) %>%
  mutate(month = month(onset_date, label = TRUE)) %>%
  count(month) %>%
  ggplot(aes(x = month, y = n)) +
  geom_col(fill = "tomato") +
  labs(title = "Monthly Distribution of Disease X Cases",
       x = "Month", y = "Number of Cases") +
  theme_minimal()

# Faceted bar plot
ggplot(df_season, aes(x = month_onset, y = n)) +
  geom_col(fill = "steelblue") +
  facet_wrap(~ Year, ncol = 2) +
  labs(
    title = "Monthly Distribution of Disease X Cases by Year",
    x = "Month",
    y = "Number of Cases"
  ) +
  theme_minimal() +
  theme(
    strip.text = element_text(size = 12, face = "bold"),
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

Incidence of Disease X

# Incidence of cases (combined confirmed and probable) of disease X from 2018 to 2021
pop <- 559139

# Filter and calculate incidence
incidence_summary <- df %>%
  filter(case_status %in% c("Confirmed", "Probable"), Year %in% 2018:2021) %>%
  count(Year) %>%
  mutate(incidence_per_100k = round((n / pop) * 100000, 2))

# View the frequency table
kable(incidence_summary)
Year n incidence_per_100k
2018 33 5.90
2019 41 7.33
2020 66 11.80
2021 42 7.51
# Plot incidence
ggplot(incidence_summary, aes(x = factor(Year), y = incidence_per_100k)) +
  geom_col(fill = "dodgerblue") +
  geom_text(aes(label = incidence_per_100k), vjust = -0.5) +
  labs(
    title = "Incidence of Disease X (Confirmed + Probable)",
    subtitle = "Per 100,000 population (2018–2021)",
    x = "Year",
    y = "Incidence Rate"
  ) +
  theme_minimal()

Counts & incidence rates of Disease Y

# Case counts and incidence rates of Disease Y
# Load and clean Disease Y data
# Step 2: Extract case data from top 4 rows
df_y_cases <- df_y[1:4, ]%>%
  mutate(across(everything(), as.character))
#colnames(df_y_cases) <- c("County", as.character(2015:2021)) 
colnames(df_y_cases) <- c("County", "2015", "2016", "2017", "2018", "2019", "2020", "2021")

# Step 3: Create population table manually
population_tbl <- tribble(
  ~County,      ~Population,
  "Jefferson",   150000,
  "Washington",  500000,
  "Lincoln",      95000,
  "Jackson",      40000
)

# Step 4: Reshape and join population data
df_y_long <- df_y_cases %>%
  pivot_longer(cols = -County, names_to = "Year", values_to = "Case_Count") %>%
  mutate(
    Year = as.numeric(Year),
    Case_Count = as.numeric(Case_Count)
  ) %>%
  left_join(population_tbl, by = "County") %>%
  mutate(
    Incidence_per_100k = round((Case_Count / Population) * 100000, 2)
  )

# Step 5: View final result
kable(df_y_long)
County Year Case_Count Population Incidence_per_100k
Disease Y 2015 2015 NA NA
Disease Y 2016 2016 NA NA
Disease Y 2017 2017 NA NA
Disease Y 2018 2018 NA NA
Disease Y 2019 2019 NA NA
Disease Y 2020 2020 NA NA
Disease Y 2021 2021 NA NA
Jefferson 2015 120 150000 80.00
Jefferson 2016 140 150000 93.33
Jefferson 2017 100 150000 66.67
Jefferson 2018 98 150000 65.33
Jefferson 2019 45 150000 30.00
Jefferson 2020 140 150000 93.33
Jefferson 2021 162 150000 108.00
Washington 2015 183 500000 36.60
Washington 2016 349 500000 69.80
Washington 2017 438 500000 87.60
Washington 2018 522 500000 104.40
Washington 2019 414 500000 82.80
Washington 2020 348 500000 69.60
Washington 2021 412 500000 82.40
Lincoln 2015 148 95000 155.79
Lincoln 2016 91 95000 95.79
Lincoln 2017 94 95000 98.95
Lincoln 2018 134 95000 141.05
Lincoln 2019 97 95000 102.11
Lincoln 2020 101 95000 106.32
Lincoln 2021 140 95000 147.37
## Remove rows where County is "Disease Y" or Case_Count is NA
df_y_long <- df_y_long %>%
  filter(County != "Disease Y", !is.na(Case_Count))


# Bar Plot: Case Counts by County over Time
ggplot(df_y_long, aes(x = factor(Year), y = Case_Count, fill = County)) +
  geom_col(position = position_dodge(width = 0.9)) +
  geom_text(
    aes(label = Case_Count),
    position = position_dodge(width = 0.9),
    vjust = -0.5,
    size = 3
  ) +
  labs(
    title = "Disease Y Case Counts by County and Year",
    x = "Year",
    y = "Number of Cases"
  ) +
  theme_minimal()

# Line Plot: Incidence Rates by County
ggplot(df_y_long, aes(x = Year, y = Incidence_per_100k, color = County)) +
  geom_line(size = 1) +
  geom_point() +
  geom_text(
    aes(label = Incidence_per_100k),
    vjust = -0.5,
    size = 3
  ) +
  labs(
    title = "Incidence Rate of Disease Y by County (Per 100,000)",
    x = "Year",
    y = "Incidence per 100k"
  ) +
  theme_minimal()