###### PROLOG ########
# PROLOG ###
# PROJECT: Epidemiology Program Manager (Epidemiologist III) Position
# PURPOSE: Skill Assessment
# DIR: C:\Users\kesha\Downloads
# DATA: Data - In-Basket A.xlsx
# AUTHOR: Dr. Keshav Kumar
# CREATED: JUNE 23, 2025
# LATEST: JUNE 23, 2025
# NOTES: N/A
# PROLOG ###
# libraries
library(magrittr) # for pipes
library(table1) # for descriptive statistics
library(tidyverse) # for tidy code
library(sessioninfo) # for session_info at bottom
library(details) # for session_info at bottom
library(ggthemes) # for tufte theme
library(ggrepel) # for text plotting
library(patchwork) # for combining plots
library(readxl) # for reading xlxs files
library(lme4) # for linear mixed models
library(knitr) # for tables
library(sjPlot) # for model tables
# plot theme
theme_set(theme_tufte()) # but might not carry over in chunks
# Okabe-Ito colorblind-friendly color palette:
# https://jfly.uni-koeln.de/color/
oi_pal <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442",
"#0072B2", "#D55E00", "#CC79A7", "#999999")
###### DATA MGMT #####
# original dataset
df <- read_xlsx("C:/Users/kesha/Downloads/Data - In-Basket A.xlsx", sheet = "Case Line List Disease X")
df_y <- read_excel("C:/Users/kesha/Downloads/Data - In-Basket A.xlsx", sheet = "Disease Y", col_names = FALSE)
dict <- read_excel("C:/Users/kesha/Downloads/Data - In-Basket A.xlsx", sheet = "Data Dictionary")
# Clean and rename columns
colnames(df) <- str_trim(colnames(df))
dict <- dict %>% mutate(Column = str_trim(Column))
rename_map <- setNames(dict$Description, dict$Column)
df <- df %>% rename(any_of(rename_map)) %>%
janitor::clean_names()
# Convert date columns and calculate age
df <- df %>%
mutate(
Age = as.numeric(difftime(`onset_date`, `dob`, units = "days")) / 365.25,
Year = year(`onset_date`)
)
Descriptive Summary
# Descriptive Summary
desc<- table1(
~case_status + sex + hospitalized + died + race + ethnicity + lab +
sx + Age,
data= df,
render.continous="median(IQR)",
caption = "Descriptive Statistics of Disease X"
)
kable(desc)
|
(N=202) |
case_status |
|
Confirmed |
140 (69.3%) |
Probable |
42 (20.8%) |
Suspect |
20 (9.9%) |
sex |
|
Female |
81 (40.1%) |
Male |
118 (58.4%) |
Unknown |
3 (1.5%) |
hospitalized |
|
No |
5 (2.5%) |
Unknown |
58 (28.7%) |
Yes |
139 (68.8%) |
died |
|
No |
94 (46.5%) |
Unknown |
92 (45.5%) |
Yes |
16 (7.9%) |
race |
|
Asian |
1 (0.5%) |
Black/African American |
20 (9.9%) |
Native American |
2 (1.0%) |
Other |
2 (1.0%) |
Unknown |
73 (36.1%) |
White |
104 (51.5%) |
ethnicity |
|
Hispanic |
29 (14.4%) |
Non-Hispanic |
74 (36.6%) |
Unknown |
99 (49.0%) |
lab |
|
Ab + |
62 (30.7%) |
Cx |
8 (4.0%) |
CX |
1 (0.5%) |
PCR + |
131 (64.9%) |
sx |
|
No |
22 (10.9%) |
Yes |
180 (89.1%) |
Age |
|
Mean (SD) |
52.5 (20.3) |
Median [Min, Max] |
57.1 [0.977, 90.4] |
# Round age to nearest whole number
df$age_rounded <- round(df$Age)
# Frequency table for Year
year_count <- table(df$Year)
kable(year_count)
2018 |
36 |
2019 |
46 |
2020 |
72 |
2021 |
48 |
# Percentage table for Year
year_perc <- prop.table(table(df$Year)) * 100
kable(year_perc)
2018 |
17.82178 |
2019 |
22.77228 |
2020 |
35.64356 |
2021 |
23.76238 |
Descriptive Graphs
# Histogram of age
ggplot(df, aes(x = Age)) +
geom_histogram(binwidth = 5, fill = "steelblue", color = "white") +
stat_bin(binwidth = 5, geom = "text", aes(label = ..count..), vjust = -0.5) +
labs(title = "Distribution of Age at Onset",
x = "Age (Years)", y = "Number of Cases") +
theme_minimal()

# Bar Plot of Case counts by Year
ggplot(df, aes(x = factor(Year))) +
geom_bar(fill = "orange", color = "black") +
geom_text(stat = "count", aes(label = ..count..), vjust = -0.5) +
labs(title = "Number of Disease X Cases by Year",
x = "Year", y = "Number of Cases") +
theme_minimal()

# Boxplot of Age by Year
ggplot(df, aes(x = factor(Year), y = Age)) +
geom_boxplot(fill = "lightgreen") +
labs(title = "Age Distribution of Cases by Year",
x = "Year", y = "Age at Onset") +
theme_minimal()

# Line Plot: Trend of Confirmed Cases Over Years
df %>%
filter(case_status == "Confirmed") %>%
count(Year) %>%
ggplot(aes(x = Year, y = n)) +
geom_line(group = 1, color = "darkblue") +
geom_point(size = 3) +
geom_text(aes(label = n), vjust = -0.5) +
labs(title = "Trend of Confirmed Cases (Disease X)",
x = "Year", y = "Number of Confirmed Cases") +
theme_minimal()

Confirmed Cases in 2020
# Confirmed Cases in 2020
confirmed_2020 <- df %>%
filter(`case_status` == "Confirmed", year(onset_date) == 2020)
kable(confirmed_2020)
Disease X |
No |
Pending |
Confirmed |
3909145 |
1947-10-30 |
Male |
Unknown |
Non-Hispanic |
Bell |
R7 |
2020-10-01 |
PCR + |
Yes |
Yes |
Unknown |
unknown |
Rockport |
72.922656 |
2020 |
73 |
Disease X |
Yes |
Approved |
Confirmed |
3933593 |
1950-11-27 |
Male |
White |
Non-Hispanic |
Tarrant |
R2_3 |
2020-11-15 |
PCR + |
Yes |
Yes |
No |
unknown |
Mexico |
69.968515 |
2020 |
70 |
Disease X |
Yes |
Approved |
Confirmed |
4100280 |
1956-08-31 |
Male |
Black/African American |
Non-Hispanic |
Tarrant |
R2_3 |
2020-08-31 |
PCR + |
Yes |
Yes |
No |
ESRD, HIV/AIDS, immunosuppressed |
None |
64.000000 |
2020 |
64 |
Disease X |
Yes |
Approved |
Confirmed |
4187620 |
1950-01-03 |
Male |
White |
Non-Hispanic |
Harris |
R6_5 |
2020-12-05 |
Cx |
Yes |
Yes |
Yes |
none |
Unknown |
70.921287 |
2020 |
71 |
Disease X |
Yes |
Approved |
Confirmed |
4204794 |
1993-01-15 |
Male |
White |
Non-Hispanic |
Coryell |
R7 |
2020-10-25 |
PCR + |
Yes |
Yes |
Unknown |
diabetes, immunosuppressed |
El Paso |
27.775496 |
2020 |
28 |
Disease X |
Yes |
Approved |
Confirmed |
4212631 |
1978-10-19 |
Male |
White |
Hispanic |
Caldwell |
R7 |
2020-03-20 |
PCR + |
Yes |
Yes |
Unknown |
cancer, smoker, immunosuppressed |
None |
41.418207 |
2020 |
41 |
Disease X |
Yes |
Approved |
Confirmed |
4213141 |
1961-04-18 |
Male |
Unknown |
Unknown |
Dallas |
R2_3 |
2020-05-18 |
PCR + |
Yes |
Yes |
Unknown |
unknown |
Unknown |
59.082820 |
2020 |
59 |
Disease X |
Yes |
Approved |
Confirmed |
4229885 |
1964-08-09 |
Male |
White |
Unknown |
Dallas |
R2_3 |
2020-03-12 |
PCR + |
Yes |
Yes |
No |
Chronuc lung diseases, smoker, HIV?AIDS |
None |
55.589322 |
2020 |
56 |
Disease X |
Yes |
Approved |
Confirmed |
4254761 |
1943-09-27 |
Male |
Black/African American |
Non-Hispanic |
Travis |
R7 |
2020-12-18 |
PCR + |
Yes |
Yes |
Unknown |
unknown |
Tyler |
77.226557 |
2020 |
77 |
Disease X |
Yes |
Approved |
Confirmed |
4278160 |
1950-07-12 |
Female |
White |
Hispanic |
Bexar |
R8 |
2020-05-19 |
PCR + |
Yes |
Yes |
No |
corticosteriods, immunosuppressed |
None |
69.853525 |
2020 |
70 |
Disease X |
Yes |
Approved |
Confirmed |
4287521 |
1969-08-28 |
Male |
White |
Hispanic |
Travis |
R7 |
2020-10-25 |
PCR + |
Yes |
Yes |
No |
HIV |
Mexico |
51.159480 |
2020 |
51 |
Disease X |
Yes |
Approved |
Confirmed |
4294888 |
1950-12-20 |
Female |
Native American |
Unknown |
Harris |
R6_5 |
2020-12-28 |
PCR + |
Yes |
Yes |
No |
Chronic lung disease, diabetes, corticosteriods |
None |
70.023272 |
2020 |
70 |
Disease X |
Yes |
Approved |
Confirmed |
4297657 |
2019-04-14 |
Female |
White |
Non-Hispanic |
Bexar |
R8 |
2020-07-21 |
PCR + |
Yes |
Yes |
Yes |
none |
None |
1.270363 |
2020 |
1 |
Disease X |
Yes |
Approved |
Confirmed |
4341166 |
1981-10-17 |
Male |
White |
Non-Hispanic |
Williamson |
R7 |
2020-01-18 |
PCR + |
Yes |
Yes |
No |
immunosuppressed |
None |
38.253251 |
2020 |
38 |
Disease X |
Yes |
Approved |
Confirmed |
4377136 |
1944-09-16 |
Male |
White |
Non-Hispanic |
Travis |
R7 |
2020-09-24 |
PCR + |
Yes |
Yes |
No |
Cortocpsteriods |
None |
76.021903 |
2020 |
76 |
Disease X |
Yes |
Approved |
Confirmed |
4448346 |
1952-08-02 |
Male |
White |
Non-Hispanic |
Tarrant |
R2_3 |
2020-12-06 |
PCR + |
Yes |
Yes |
No |
cancer, immunosuppressed |
None |
68.344969 |
2020 |
68 |
Disease X |
Yes |
Pending |
Confirmed |
4577796 |
1996-07-24 |
Male |
Unknown |
Unknown |
Bell |
R7 |
2020-08-05 |
PCR + |
Yes |
Yes |
Unknown |
unknown |
San Antonio |
24.032854 |
2020 |
24 |
Disease X |
Yes |
Approved |
Confirmed |
4651412 |
1967-04-30 |
Female |
White |
Unknown |
Dallas |
R2_3 |
2020-06-16 |
PCR + |
Yes |
Yes |
Yes |
smoker |
None |
53.130732 |
2020 |
53 |
Disease X |
No |
Entered |
Confirmed |
4742323 |
1965-03-17 |
Female |
White |
Non-Hispanic |
Travis |
R7 |
2020-01-07 |
Cx |
Yes |
Unknown |
Unknown |
unknown |
Corpus Christi |
54.809035 |
2020 |
55 |
Disease X |
Yes |
Approved |
Confirmed |
4779266 |
1971-05-10 |
Male |
White |
Non-Hispanic |
Dallas |
R2_3 |
2020-07-02 |
PCR + |
Yes |
Yes |
No |
chronic lung disease, smoker, diabetes,
corticosteriods |
None |
49.147160 |
2020 |
49 |
Disease X |
Yes |
Approved |
Confirmed |
4800426 |
1957-11-02 |
Female |
Unknown |
Unknown |
Travis |
R7 |
2020-08-17 |
PCR + |
Yes |
Yes |
Unknown |
smoker |
Unknown |
62.789870 |
2020 |
63 |
Disease X |
Yes |
Approved |
Confirmed |
4816461 |
1949-10-28 |
Male |
White |
Non-Hispanic |
Bexar |
R8 |
2020-09-29 |
PCR + |
Yes |
Yes |
No |
unknown |
Unknown |
70.921287 |
2020 |
71 |
Disease X |
Yes |
Approved |
Confirmed |
4841807 |
1955-08-23 |
Male |
White |
Non-Hispanic |
Harris |
R6_5 |
2020-01-28 |
Cx |
Yes |
Yes |
No |
ESRD, corticosteriods, immunosuppressed |
None |
64.432580 |
2020 |
64 |
Disease X |
Yes |
Approved |
Confirmed |
4884572 |
1946-06-16 |
Male |
White |
Non-Hispanic |
Travis |
R7 |
2020-11-23 |
PCR + |
Yes |
Yes |
No |
chronic lung disease, ESRD |
None |
74.439425 |
2020 |
74 |
Disease X |
Yes |
Approved |
Confirmed |
4896006 |
1953-02-24 |
Female |
White |
Unknown |
Dallas |
R2_3 |
2020-08-31 |
PCR + |
Yes |
Yes |
No |
smoker, HTN |
None |
67.515400 |
2020 |
68 |
Disease X |
Yes |
Approved |
Confirmed |
4950746 |
1934-12-25 |
Male |
White |
Hispanic |
Bexar |
R8 |
2020-06-18 |
PCR + |
Yes |
Yes |
Yes |
chronic lung disease |
None |
85.481177 |
2020 |
85 |
Disease X |
No |
Approved |
Confirmed |
4970545 |
1947-10-20 |
Male |
Unknown |
Unknown |
Harris |
R6_5 |
2020-10-10 |
PCR + |
Yes |
Unknown |
Unknown |
unknown |
Unknown |
72.974675 |
2020 |
73 |
Disease X |
Yes |
Approved |
Confirmed |
4973689 |
1965-07-17 |
Male |
Unknown |
Unknown |
Dallas |
R2_3 |
2020-09-07 |
PCR + |
Yes |
Yes |
No |
diabetes |
None |
55.143053 |
2020 |
55 |
Disease X |
Yes |
Approved |
Confirmed |
5001088 |
1956-08-27 |
Female |
White |
Hispanic |
Travis |
R7 |
2020-07-08 |
PCR + |
Yes |
Yes |
No |
Diabetes, ESRD |
None |
63.863107 |
2020 |
64 |
Disease X |
Yes |
Approved |
Confirmed |
5082971 |
1952-01-17 |
Male |
White |
Hispanic |
Bexar |
R8 |
2020-04-16 |
PCR + |
Yes |
Yes |
No |
none |
None |
68.246407 |
2020 |
68 |
Disease X |
Yes |
Approved |
Confirmed |
5300544 |
1973-01-22 |
Male |
Black/African American |
Non-Hispanic |
Dallas |
R2_3 |
2020-09-17 |
Cx |
Yes |
Yes |
No |
ESRD, Smoker |
None |
47.652293 |
2020 |
48 |
Disease X |
Yes |
Approved |
Confirmed |
5322226 |
1949-08-08 |
Male |
Black/African American |
Non-Hispanic |
Harris |
R6_5 |
2020-03-16 |
PCR + |
Yes |
Yes |
No |
Diabetes, ESRD, smoker |
Unknown |
70.603696 |
2020 |
71 |
Disease X |
Yes |
Approved |
Confirmed |
5440837 |
1947-03-03 |
Male |
Black/African American |
Unknown |
Dallas |
R2_3 |
2020-05-19 |
PCR + |
Yes |
Yes |
No |
smoker |
None |
73.212868 |
2020 |
73 |
Disease X |
Yes |
Approved |
Confirmed |
5468553 |
1967-10-14 |
Male |
Unknown |
Hispanic |
Hays |
R7 |
2020-06-18 |
PCR + |
Yes |
Yes |
No |
diabetes, smoker |
None |
52.678987 |
2020 |
53 |
Disease X |
Yes |
Approved |
Confirmed |
5484340 |
1968-12-07 |
Male |
White |
Unknown |
McLennan |
R7 |
2020-06-10 |
PCR + |
Yes |
Yes |
No |
smoker, HCV |
None |
51.507187 |
2020 |
52 |
Disease X |
No |
Approved |
Confirmed |
5486335 |
1952-03-04 |
Male |
White |
Hispanic |
Travis |
R7 |
2020-08-16 |
Cx |
Yes |
Yes |
No |
ESRD |
Nevada |
68.451745 |
2020 |
68 |
Disease X |
Yes |
Approved |
Confirmed |
5486820 |
1948-04-15 |
Male |
White |
Non-Hispanic |
Tarrant |
R2_3 |
2020-01-23 |
PCR + |
Yes |
Yes |
No |
cancer, chronic lung disease, immunospurpessed |
None |
71.772758 |
2020 |
72 |
Disease X |
Yes |
Approved |
Confirmed |
5561144 |
1967-04-27 |
Male |
Black/African American |
Non-Hispanic |
Dallas |
R2_3 |
2020-06-13 |
PCR + |
Yes |
Yes |
No |
diabetes, smoker |
None |
53.130732 |
2020 |
53 |
Disease X |
No |
Entered |
Confirmed |
5613448 |
1984-05-20 |
Male |
Unknown |
Unknown |
Brazos |
R7 |
2020-08-05 |
PCR + |
Yes |
Yes |
Unknown |
unknown |
Unknown |
36.210814 |
2020 |
36 |
Disease X |
Yes |
Approved |
Confirmed |
5638597 |
1963-05-29 |
Male |
White |
Non-Hispanic |
Coryell |
R7 |
2020-07-01 |
PCR + |
Yes |
Yes |
Unknown |
cancer, diabetes, immunosuppressed |
None |
57.092402 |
2020 |
57 |
Disease X |
Yes |
Approved |
Confirmed |
5645308 |
1954-11-04 |
Female |
White |
Hispanic |
Travis |
R7 |
2020-05-19 |
PCR + |
Yes |
Yes |
No |
diabetee |
None |
65.538672 |
2020 |
66 |
Disease X |
Yes |
Approved |
Confirmed |
5721484 |
1945-08-28 |
Male |
Black/African American |
Non-Hispanic |
McLennan |
R7 |
2020-01-16 |
PCR + |
Yes |
Yes |
No |
lung disease, Diabetes, ESRD |
Louisanna |
74.384668 |
2020 |
74 |
Disease X |
Yes |
Approved |
Confirmed |
5947265 |
1956-10-17 |
Female |
White |
Hispanic |
Williamson |
R7 |
2020-01-03 |
PCR + |
Yes |
Yes |
Unknown |
diabetes |
Mexico |
63.211499 |
2020 |
63 |
Disease X |
Yes |
Approved |
Confirmed |
5958676 |
1955-09-06 |
Male |
White |
Non-Hispanic |
Bell |
R7 |
2020-08-12 |
PCR + |
Yes |
Yes |
Unknown |
ESRD |
Unknown |
64.933607 |
2020 |
65 |
Disease X |
Yes |
Approved |
Confirmed |
6018975 |
1960-09-02 |
Male |
Unknown |
Hispanic |
Dallas |
R2_3 |
2020-07-06 |
Cx |
Yes |
Yes |
No |
leukemia |
None |
59.841205 |
2020 |
60 |
Disease X |
Yes |
Approved |
Confirmed |
6070570 |
1968-03-17 |
Female |
White |
Unknown |
Dallas |
R2_3 |
2020-09-28 |
PCR + |
Yes |
Yes |
Unknown |
cancer, diabetes, smoker |
None |
52.533881 |
2020 |
53 |
Disease X |
Yes |
Approved |
Confirmed |
6098024 |
1951-10-20 |
Male |
White |
Non-Hispanic |
McLennan |
R7 |
2020-09-05 |
PCR + |
Yes |
Yes |
No |
unknown |
Amarillo |
68.878850 |
2020 |
69 |
Disease X |
Yes |
Approved |
Confirmed |
6166237 |
1952-11-16 |
Female |
White |
Non-Hispanic |
Tarrant |
R2_3 |
2020-03-26 |
PCR + |
Yes |
Yes |
No |
unknown |
Wisconsin and Florida |
67.356605 |
2020 |
67 |
Disease X |
Yes |
Approved |
Confirmed |
6192630 |
1948-09-28 |
Male |
White |
Non-Hispanic |
Harris |
R6_5 |
2020-10-14 |
PCR + |
Yes |
Yes |
Yes |
cancer, cortcicosteroids, smoker |
None |
72.043806 |
2020 |
72 |
Disease X |
Yes |
Approved |
Confirmed |
6213323 |
2004-12-11 |
Female |
White |
Non-Hispanic |
Limestone |
R7 |
2020-08-31 |
PCR + |
Yes |
Yes |
No |
unknown |
None |
15.720739 |
2020 |
16 |
Disease X |
Yes |
Approved |
Confirmed |
6221784 |
1998-06-13 |
Male |
White |
Non-Hispanic |
Coryell |
R7 |
2020-03-30 |
PCR + |
Yes |
Yes |
Unknown |
Diabetes, smoker |
Houston |
21.796030 |
2020 |
22 |
Disease X |
Yes |
Approved |
Confirmed |
6238081 |
1945-06-02 |
Male |
Asian |
Non-Hispanic |
Harris |
R6_5 |
2020-03-31 |
PCR + |
Yes |
Yes |
Unknown |
cancer, smoker |
None |
74.828200 |
2020 |
75 |
% Confirmed & Probable Cases
#% of Confirmed & Probable Cases with Known Hospitalization
known_hosp <- df %>%
filter(`case_status` %in% c("Confirmed", "Probable")) %>%
filter(hospitalized %in% c("Yes", "No"))
percent_known <- nrow(known_hosp) /
nrow(filter(df, `case_status` %in% c("Confirmed", "Probable"))) * 100
kable(round(percent_known, 2))
female to male ratio
# What is the female to male ratio among confirmed cases?
gender_ratio <- df %>%
filter(case_status == "Confirmed") %>%
count(sex) %>%
pivot_wider(names_from = sex, values_from = n, values_fill = 0)
# Female to male ratio
ratio_female_male <- round(gender_ratio$Female / gender_ratio$Male, 2)
kable(ratio_female_male)
Disease X increase or decrease
#Is disease X increasing or decreasing from 2018 to 2020?
cases <- df %>%
filter(case_status == "Confirmed", Year %in% 2018:2020) %>%
count(Year)
kable(cases)
Disease X seasonal or not
# Is Disease X seasonal? Why or why not?
# Prepare data
df_season <- df %>%
filter(case_status %in% c("Confirmed", "Probable")) %>%
mutate(
month_onset = month(onset_date, label = TRUE, abbr = FALSE)
) %>%
count(Year, month_onset)
# Reshape for wide-format table (months as rows, years as columns)
seasonality_wide <- df_season %>%
pivot_wider(names_from = Year, values_from = n, values_fill = 0) %>%
arrange(match(month_onset, month.name))
kable(seasonality_wide)
January |
2 |
1 |
6 |
3 |
February |
2 |
2 |
1 |
1 |
March |
1 |
4 |
6 |
3 |
April |
2 |
2 |
3 |
2 |
May |
2 |
1 |
6 |
4 |
June |
3 |
4 |
5 |
6 |
July |
5 |
10 |
7 |
3 |
August |
9 |
5 |
10 |
6 |
September |
3 |
5 |
7 |
5 |
October |
2 |
2 |
7 |
4 |
November |
0 |
2 |
4 |
4 |
December |
2 |
3 |
4 |
1 |
df %>%
filter(case_status %in% c("Confirmed", "Probable")) %>%
mutate(month = month(onset_date, label = TRUE)) %>%
count(month) %>%
ggplot(aes(x = month, y = n)) +
geom_col(fill = "tomato") +
labs(title = "Monthly Distribution of Disease X Cases",
x = "Month", y = "Number of Cases") +
theme_minimal()

# Faceted bar plot
ggplot(df_season, aes(x = month_onset, y = n)) +
geom_col(fill = "steelblue") +
facet_wrap(~ Year, ncol = 2) +
labs(
title = "Monthly Distribution of Disease X Cases by Year",
x = "Month",
y = "Number of Cases"
) +
theme_minimal() +
theme(
strip.text = element_text(size = 12, face = "bold"),
axis.text.x = element_text(angle = 45, hjust = 1)
)

Incidence of Disease X
# Incidence of cases (combined confirmed and probable) of disease X from 2018 to 2021
pop <- 559139
# Filter and calculate incidence
incidence_summary <- df %>%
filter(case_status %in% c("Confirmed", "Probable"), Year %in% 2018:2021) %>%
count(Year) %>%
mutate(incidence_per_100k = round((n / pop) * 100000, 2))
# View the frequency table
kable(incidence_summary)
2018 |
33 |
5.90 |
2019 |
41 |
7.33 |
2020 |
66 |
11.80 |
2021 |
42 |
7.51 |
# Plot incidence
ggplot(incidence_summary, aes(x = factor(Year), y = incidence_per_100k)) +
geom_col(fill = "dodgerblue") +
geom_text(aes(label = incidence_per_100k), vjust = -0.5) +
labs(
title = "Incidence of Disease X (Confirmed + Probable)",
subtitle = "Per 100,000 population (2018–2021)",
x = "Year",
y = "Incidence Rate"
) +
theme_minimal()

Counts & incidence rates of Disease Y
# Case counts and incidence rates of Disease Y
# Load and clean Disease Y data
# Step 2: Extract case data from top 4 rows
df_y_cases <- df_y[1:4, ]%>%
mutate(across(everything(), as.character))
#colnames(df_y_cases) <- c("County", as.character(2015:2021))
colnames(df_y_cases) <- c("County", "2015", "2016", "2017", "2018", "2019", "2020", "2021")
# Step 3: Create population table manually
population_tbl <- tribble(
~County, ~Population,
"Jefferson", 150000,
"Washington", 500000,
"Lincoln", 95000,
"Jackson", 40000
)
# Step 4: Reshape and join population data
df_y_long <- df_y_cases %>%
pivot_longer(cols = -County, names_to = "Year", values_to = "Case_Count") %>%
mutate(
Year = as.numeric(Year),
Case_Count = as.numeric(Case_Count)
) %>%
left_join(population_tbl, by = "County") %>%
mutate(
Incidence_per_100k = round((Case_Count / Population) * 100000, 2)
)
# Step 5: View final result
kable(df_y_long)
Disease Y |
2015 |
2015 |
NA |
NA |
Disease Y |
2016 |
2016 |
NA |
NA |
Disease Y |
2017 |
2017 |
NA |
NA |
Disease Y |
2018 |
2018 |
NA |
NA |
Disease Y |
2019 |
2019 |
NA |
NA |
Disease Y |
2020 |
2020 |
NA |
NA |
Disease Y |
2021 |
2021 |
NA |
NA |
Jefferson |
2015 |
120 |
150000 |
80.00 |
Jefferson |
2016 |
140 |
150000 |
93.33 |
Jefferson |
2017 |
100 |
150000 |
66.67 |
Jefferson |
2018 |
98 |
150000 |
65.33 |
Jefferson |
2019 |
45 |
150000 |
30.00 |
Jefferson |
2020 |
140 |
150000 |
93.33 |
Jefferson |
2021 |
162 |
150000 |
108.00 |
Washington |
2015 |
183 |
500000 |
36.60 |
Washington |
2016 |
349 |
500000 |
69.80 |
Washington |
2017 |
438 |
500000 |
87.60 |
Washington |
2018 |
522 |
500000 |
104.40 |
Washington |
2019 |
414 |
500000 |
82.80 |
Washington |
2020 |
348 |
500000 |
69.60 |
Washington |
2021 |
412 |
500000 |
82.40 |
Lincoln |
2015 |
148 |
95000 |
155.79 |
Lincoln |
2016 |
91 |
95000 |
95.79 |
Lincoln |
2017 |
94 |
95000 |
98.95 |
Lincoln |
2018 |
134 |
95000 |
141.05 |
Lincoln |
2019 |
97 |
95000 |
102.11 |
Lincoln |
2020 |
101 |
95000 |
106.32 |
Lincoln |
2021 |
140 |
95000 |
147.37 |
## Remove rows where County is "Disease Y" or Case_Count is NA
df_y_long <- df_y_long %>%
filter(County != "Disease Y", !is.na(Case_Count))
# Bar Plot: Case Counts by County over Time
ggplot(df_y_long, aes(x = factor(Year), y = Case_Count, fill = County)) +
geom_col(position = position_dodge(width = 0.9)) +
geom_text(
aes(label = Case_Count),
position = position_dodge(width = 0.9),
vjust = -0.5,
size = 3
) +
labs(
title = "Disease Y Case Counts by County and Year",
x = "Year",
y = "Number of Cases"
) +
theme_minimal()

# Line Plot: Incidence Rates by County
ggplot(df_y_long, aes(x = Year, y = Incidence_per_100k, color = County)) +
geom_line(size = 1) +
geom_point() +
geom_text(
aes(label = Incidence_per_100k),
vjust = -0.5,
size = 3
) +
labs(
title = "Incidence Rate of Disease Y by County (Per 100,000)",
x = "Year",
y = "Incidence per 100k"
) +
theme_minimal()
