Purpose
This report performs early-stage exploration for:
clean_data_20260225* (close to raw, includes unmatched health zones)
hz_naming_convention.csv (reference naming/mapping file)
Goals:
confirm file provenance and loadability
understand data structure and variables
assess missingness and data quality
propose practical cleaning next steps
Setup and File Discovery
Summary:
Code
knitr:: opts_chunk$ set (echo = TRUE , warning = FALSE , message = FALSE )
data_dir <- "/rds/general/user/acp25/home/MIMIC/Clean_data/data"
# helper: null coalesce
` %||% ` <- function (x, y) if (is.null (x) || length (x) == 0 || (length (x) == 1 && is.na (x))) y else x
# helper: detect delimiter from first line
detect_delim <- function (path) {
first_line <- readLines (path, n = 1 , warn = FALSE )
if (length (first_line) == 0 ) {
return ("," )
}
counts <- c (
comma = lengths (regmatches (first_line, gregexpr ("," , first_line, fixed = TRUE ))),
tab = lengths (regmatches (first_line, gregexpr (" \t " , first_line, fixed = TRUE ))),
pipe = lengths (regmatches (first_line, gregexpr ("|" , first_line, fixed = TRUE ))),
semicolon = lengths (regmatches (first_line, gregexpr (";" , first_line, fixed = TRUE )))
)
names (counts)[which.max (counts)] |>
switch (comma = "," ,
tab = " \t " ,
pipe = "|" ,
semicolon = ";" ,
","
)
}
# helper: robust reader
read_table_safely <- function (path) {
delim <- detect_delim (path)
if (requireNamespace ("readr" , quietly = TRUE )) {
readr:: read_delim (
file = path, delim = delim, show_col_types = FALSE ,
guess_max = 100000 , progress = FALSE
)
} else {
utils:: read.csv (path, sep = delim, stringsAsFactors = FALSE , check.names = FALSE )
}
}
all_files <- list.files (data_dir, full.names = TRUE )
basename (all_files)
[1] "clean_data_20260225.csv"
[2] "clean_df_stage2.csv"
[3] "Combined_data.csv"
[4] "hz_naming_convention.csv"
[5] "idsr_clean.csv"
[6] "idsr_target_clean.csv"
[7] "Kinshasa_DRC_Cholera_Anticipatory_Action_Tracking (1).xlsx"
Code
# locate target files
clean_candidates <- list.files (
data_dir,
pattern = "^clean_data_20260225( \\ ..+)?$" ,
full.names = TRUE
)
hz_candidates <- list.files (
data_dir,
pattern = "^hz_naming_convention \\ .csv$" ,
full.names = TRUE ,
ignore.case = TRUE
)
if (length (clean_candidates) == 0 ) {
stop ("No file matching '^clean_data_20260225( \\\\ ..+)?$' found in data_dir." )
}
if (length (hz_candidates) == 0 ) {
stop ("No file matching 'hz_naming_convention.csv' found in data_dir." )
}
clean_path <- clean_candidates[1 ]
hz_path <- hz_candidates[1 ]
cat ("Selected raw-like dataset:" , clean_path, " \n " )
Selected raw-like dataset: /rds/general/user/acp25/home/MIMIC/Clean_data/data/clean_data_20260225.csv
Code
cat ("Selected HZ naming file:" , hz_path, " \n " )
Selected HZ naming file: /rds/general/user/acp25/home/MIMIC/Clean_data/data/hz_naming_convention.csv
Code
clean_df <- read_table_safely (clean_path)
hz_df <- read_table_safely (hz_path)
cat ("clean_data_20260225 rows:" , nrow (clean_df), " cols:" , ncol (clean_df), " \n " )
clean_data_20260225 rows: 46073 cols: 9
Code
cat ("hz_naming_convention rows:" , nrow (hz_df), " cols:" , ncol (hz_df), " \n " )
hz_naming_convention rows: 633 cols: 3
Cleaning and saving for all future analyses
The DEBUTSEM column contains three inconsistent date formats:
M/D/YYYY (majority) — e.g. 3/13/2023, 12/3/2012
D-Mon-YY (minority, older data) — e.g. 26-Jul-10, 4-Feb-08
YYYY-MM-DD (ISO format) — e.g. 2021-06-12, 2025-04-28
We standardise these into a single Date column and store the result in clean_data, which is used for all subsequent analysis.
Step 2: Remove rows with impossible dates
Ten rows contain dates that are clearly data-entry errors — either in the far future (up to 2102) or before the surveillance period (1989). These are removed because they cannot correspond to real epidemiological observations and would distort any time-based analysis.
Code
n_before <- nrow (clean_data)
clean_data <- clean_data %>%
filter (
is.na (DEBUTSEM) |
(DEBUTSEM >= as.Date ("2000-01-01" ) & DEBUTSEM <= Sys.Date ())
)
n_removed <- n_before - nrow (clean_data)
cat ("Removed" , n_removed, "rows with impossible dates (before 2000 or in the future). \n " )
Removed 10 rows with impossible dates (before 2000 or in the future).
Code
cat ("Rows remaining:" , nrow (clean_data), " \n " )
Step 3: Year-mismatch audit
Some rows have a mismatch between the year extracted from DEBUTSEM and the YEAR column. Most of these are benign: epidemiological week 1 or 52/53 can straddle a year boundary (e.g. 2007-12-31 assigned to YEAR = 2008, week 1). A smaller number are genuine errors where the date and year disagree by more than one calendar year or appear in mid-year weeks. Genuine errors are removed.
Code
clean_data <- clean_data %>%
mutate (date_year = year (DEBUTSEM))
mismatch <- clean_data %>%
filter (! is.na (DEBUTSEM) & date_year != YEAR) %>%
mutate (
year_diff = abs (date_year - YEAR),
is_straddling = year_diff == 1 & NUMSEM %in% c (1 , 52 , 53 )
)
n_straddling <- sum (mismatch$ is_straddling)
n_genuine <- sum (! mismatch$ is_straddling)
cat ("Total year mismatches:" , nrow (mismatch), " \n " )
Total year mismatches: 741
Code
cat (" Straddling year boundary (benign):" , n_straddling, " \n " )
Straddling year boundary (benign): 626
Code
cat (" Genuine date/year errors:" , n_genuine, " \n " )
Genuine date/year errors: 115
Code
if (n_genuine > 0 ) {
cat (" \n Genuine error rows (to be removed): \n " )
mismatch %>%
filter (! is_straddling) %>%
select (DEBUTSEM, YEAR, date_year, NUMSEM, PROV, HZ) %>%
print (n = 20 )
}
Genuine error rows (to be removed):
# A tibble: 115 × 6
DEBUTSEM YEAR date_year NUMSEM PROV HZ
<date> <dbl> <dbl> <dbl> <chr> <chr>
1 2015-06-05 2017 2015 23 KONGO-CENTRAL KIMPESE
2 2015-06-05 2017 2015 23 NORD-KIVU BINZA
3 2019-01-27 2020 2019 5 TANGANYIKA KALEMIE
4 2015-06-05 2017 2015 23 HAUT-LOMAMI KINKONDJA
5 2015-06-05 2017 2015 23 ITURI NYARAMBE
6 2019-01-13 2020 2019 3 TANGANYIKA KALEMIE
7 2019-01-20 2020 2019 4 TANGANYIKA KALEMIE
8 2015-06-05 2017 2015 23 TANGANYIKA KALEMIE
9 2024-01-13 2025 2024 3 LUALABA FUNGURUME
10 2019-03-02 2020 2019 6 TANGANYIKA KALEMIE
11 2019-12-09 2016 2019 37 TANGANYIKA KALEMIE
12 2015-06-05 2017 2015 23 NORD-KIVU GOMA
13 2019-01-13 2020 2019 3 TANGANYIKA MOBA
14 2024-01-06 2025 2024 2 LUALABA FUNGURUME
15 2024-01-27 2025 2024 5 LUALABA FUNGURUME
16 2015-06-05 2017 2015 23 KINSHASA KOKOLO
17 2015-06-05 2017 2015 23 NORD-KIVU RUTSHURU
18 2015-06-05 2017 2015 23 TANGANYIKA NYEMBA
19 2019-01-20 2020 2019 4 TANGANYIKA MOBA
20 2019-01-20 2020 2019 4 TANGANYIKA NYEMBA
# ℹ 95 more rows
Code
# remove genuine year-mismatch errors using row indices
n_before_mismatch <- nrow (clean_data)
clean_data <- clean_data %>%
mutate (.row_id = row_number ()) %>%
filter (! .row_id %in% (
clean_data %>%
mutate (.row_id = row_number ()) %>%
filter (! is.na (DEBUTSEM) & date_year != YEAR) %>%
mutate (
year_diff = abs (date_year - YEAR),
is_straddling = year_diff == 1 & NUMSEM %in% c (1 , 52 , 53 )
) %>%
filter (! is_straddling) %>%
pull (.row_id)
)) %>%
select (- .row_id)
cat ("Removed" , n_before_mismatch - nrow (clean_data), "genuine year-mismatch rows. \n " )
Removed 115 genuine year-mismatch rows.
Code
cat ("Rows remaining:" , nrow (clean_data), " \n " )
Code
# drop the helper column
clean_data <- clean_data %>% select (- date_year)
Note on year mismatches: Of the 741 rows where the parsed date year differs from YEAR, 626 are benign week-1/52/53 boundary cases and 115 are genuine errors (date and year disagree by > 1 year or occur in mid-year weeks). The genuine errors have been removed.
Step 4: Epidemiological week completeness
Incomplete years:
2017: missing week 52
2020: missing weeks 48–52 (5 weeks missing at year end)
All other years (2006–2016, 2018–2019, 2021–2025) have complete weeks 1–52.
Step 5: Column-name standardisation and whitespace trimming
Code
clean_names_basic <- function (nm) {
nm <- trimws (nm)
nm <- tolower (nm)
nm <- gsub ("[^a-z0-9]+" , "_" , nm)
nm <- gsub ("^_|_$" , "" , nm)
make.unique (nm, sep = "_" )
}
clean_df_stage1 <- clean_data
names (clean_df_stage1) <- clean_names_basic (names (clean_df_stage1))
# normalise character column values: trim, lowercase, replace special chars with _
char_cols <- names (clean_df_stage1)[sapply (clean_df_stage1, is.character)]
for (v in char_cols) {
clean_df_stage1[[v]] <- trimws (clean_df_stage1[[v]])
clean_df_stage1[[v]] <- tolower (clean_df_stage1[[v]])
clean_df_stage1[[v]] <- gsub ("[^a-z0-9]+" , "_" , clean_df_stage1[[v]])
clean_df_stage1[[v]] <- gsub ("^_|_$" , "" , clean_df_stage1[[v]])
clean_df_stage1[[v]][clean_df_stage1[[v]] == "" ] <- NA
}
cat ("Stage-1 cleaned object created: clean_df_stage1 \n " )
Stage-1 cleaned object created: clean_df_stage1
Code
cat ("Column names + character values standardised to lower_snake_case. \n " )
Column names + character values standardised to lower_snake_case.
1. Variable Inventory
1.1 Variable Name Descriptions
PROV
Province name (character). Values include NORD-KIVU, SUD-KIVU, etc.
NUMSEM
Epidemiological week number (numeric), from 1 to 53.
DEBUTSEM
Week start date (Date, after cleaning). Range spans 2008–2025.
TOTALCAS
Total reported cases for the week (numeric), from 0 upward.
TOTALDECES
Total reported deaths for the week (numeric), from 0 upward.
YEAR
Calendar year (numeric), from 2008 to 2025.
MALADIE
Disease name (character), e.g. CHOLERA.
HZ
Health zone name (character), raw name before standardisation.
key
Health zone key identifier (character), used for joining with hz_naming_convention.
1.2 Variable Profile
Code
profile_df <- function (df, max_example_chars = 40 ) {
n <- nrow (df)
out <- data.frame (
variable = names (df),
class = sapply (df, function (x) paste (class (x), collapse = ", " )),
n_missing = sapply (df, function (x) sum (is.na (x) | x == "" )),
n_unique = sapply (df, function (x) length (unique (x[! is.na (x) & x != "" ]))),
example_value = sapply (df, function (x) {
vals <- unique (x[! is.na (x) & x != "" ])
if (length (vals) == 0 ) {
return (NA_character_ )
}
ex <- as.character (vals[1 ])
substr (ex, 1 , max_example_chars)
}),
stringsAsFactors = FALSE
)
out$ pct_missing <- round (100 * out$ n_missing / max (n, 1 ), 2 )
out[order (- out$ pct_missing), c ("variable" , "class" , "n_missing" , "pct_missing" , "n_unique" , "example_value" )]
}
clean_profile <- profile_df (clean_df_stage1)
hz_profile <- profile_df (hz_df)
knitr:: kable (head (clean_profile, 40 ), caption = "All variables (clean_df_stage1) sorted by missingness" )
All variables (clean_df_stage1) sorted by missingness
key
key
character
46
0.10
428
goma
totalcas
totalcas
numeric
13
0.03
308
1177
totaldeces
totaldeces
numeric
16
0.03
23
0
prov
prov
character
0
0.00
31
nord_kivu
numsem
numsem
numeric
0
0.00
53
11
year
year
numeric
0
0.00
20
2023
maladie
maladie
character
0
0.00
1
cholera
hz
hz
character
0
0.00
428
goma
debutsem
debutsem
Date
NA
NA
1
NA
Code
knitr:: kable (hz_profile, caption = "Variables in hz_naming_convention" )
Variables in hz_naming_convention
value
value
character
498
78.67
89
BAFWAGBOGBO
pcode
pcode
character
17
2.69
516
CD5307ZS01
key
key
character
0
0.00
632
ABA
2. Exploratory Data Analysis (EDA)
2.1 Numeric Variables
Code
num_vars <- names (clean_df_stage1)[sapply (clean_df_stage1, is.numeric)]
if (length (num_vars) > 0 ) {
num_summary <- data.frame (
variable = num_vars,
mean = sapply (clean_df_stage1[num_vars], function (x) round (mean (x, na.rm = TRUE ), 3 )),
sd = sapply (clean_df_stage1[num_vars], function (x) round (sd (x, na.rm = TRUE ), 3 )),
min = sapply (clean_df_stage1[num_vars], function (x) round (min (x, na.rm = TRUE ), 3 )),
p50 = sapply (clean_df_stage1[num_vars], function (x) round (stats:: median (x, na.rm = TRUE ), 3 )),
max = sapply (clean_df_stage1[num_vars], function (x) round (max (x, na.rm = TRUE ), 3 ))
)
knitr:: kable (head (num_summary, 30 ), caption = "Numeric summary (first 30 numeric variables)" )
} else {
cat ("No numeric variables detected. \n " )
}
Numeric summary (first 30 numeric variables)
numsem
numsem
26.353
15.096
1
26
53
totalcas
totalcas
12.628
28.325
0
4
1177
totaldeces
totaldeces
0.246
1.028
0
0
98
year
year
2016.125
6.234
2006
2016
2025
Code
num_vars <- names (clean_df_stage1)[sapply (clean_df_stage1, is.numeric)]
if (length (num_vars) > 0 ) {
plot_vars <- head (num_vars, 6 )
old_par <- par (no.readonly = TRUE )
on.exit (par (old_par), add = TRUE )
par (mfrow = c (2 , 3 ))
for (v in plot_vars) {
hist (clean_df_stage1[[v]],
main = v, xlab = v, col = "steelblue" , border = "white"
)
}
} else {
plot.new ()
text (0.5 , 0.5 , "No numeric variables available" )
}
2.2 Categorical Variables
Code
cat_vars <- names (clean_df_stage1)[sapply (clean_df_stage1, function (x) is.character (x) || is.factor (x))]
if (length (cat_vars) > 0 ) {
cat ("Categorical variable count:" , length (cat_vars), " \n " )
for (v in cat_vars) {
cat (" \n ---" , v, "--- \n " )
print (sort (table (clean_df_stage1[[v]], useNA = "ifany" ), decreasing = TRUE ))
}
} else {
cat ("No character/factor variables detected. \n " )
}
Categorical variable count: 4
--- prov ---
sud_kivu kinshasa nord_kivu katanga
8778 6471 6244 4812
sankuru tanganyika haut_katanga haut_lomami
3022 2567 2376 1979
equateur orientale kasai_oriental tshopo
1748 1266 1078 920
maniema kongo_central mongala mai_ndombe
792 532 476 408
kasai lualaba ituri lomami
400 400 391 372
bandundu bas_congo kwilu nord_ubangi
245 215 200 178
bas_uele tshuapa kasai_central kwango
21 17 16 11
sud_ubangi kasai_occidental haut_uele
8 3 2
--- maladie ---
cholera
45948
--- hz ---
kalemie nyemba fizi uvira
969 945 943 928
kirotshe goma minova karisimbi
919 918 847 744
moba katana nundu kinkondja
703 697 647 575
mweso malemba bukama ruzizi
566 561 555 551
kadutu rutshuru masisi kabondo_dianda
530 529 498 452
kabare miti_murhesa mukanga kansimba
438 434 423 403
butumba ibanda lemera kabalo
393 390 390 344
limete nyiragongo bagira_kasha kalehe
343 342 309 291
lusambo kenya idjwi kongolo
274 272 271 269
kingabwa mulongo mutwanga tchomia
268 268 257 257
kampemba maluku_i manono bena_dibele
255 254 252 243
pania_mutombo katuba kilwa kisanga
243 225 218 216
pinga bolobo ankoro nsele
216 213 212 210
pweto bunyakiri kole kikula
209 205 204 202
barumbu gombe fungurume lubumbashi
198 198 197 197
mufunga_sampwe kokolo masina_ii kingasani
195 192 192 190
lukolela masina_i makanza yumbi
188 188 186 186
kisenso kalamu_i binza_meteo lwamba
184 182 181 181
matete nyunzu binza_ozone bumbu
181 180 179 178
ndjili kikimi ngiri_ngiri wangata
177 175 175 175
kalamu_ii katako_kombe birambizo ototo
174 173 172 172
tshudi_loto tshumbe vanga_kete bandalungwa
172 172 172 171
dikungu djalo_djeka lodja lomela
171 171 171 171
minga mont_ngafula_i omendjadi wembo_nyama
171 171 171 171
selembao kinshasa ngaba kintambo
170 169 169 167
makala mushenge kimbanseke nyarambe
167 166 165 165
mont_ngafula_ii biyela lingwala kasa_vubu
163 162 162 161
lemba kindu rwanguba lulenge_kimbi
161 160 160 153
nyangezi nyantende mumbunda lubunga
153 152 151 146
kapolowe bolenge angumu bunkeya
143 142 140 136
bibanga maluku_ii kalambayi_kabanga alunguli
135 132 131 130
lukafu mbandaka katoyi kyondo
129 129 128 127
likasi lilanga_bobangi bunia kamalondo
123 120 119 118
kibirizi mampoko mulumba isangi
118 118 118 116
rwashi walikale binza lufungula
115 113 112 111
kafubu kasenga makiso_kisangani moanda
109 108 106 104
yakusu muya kailo lisala
104 102 101 101
kimpese yahuma ngandajika tshamilemba
99 98 97 97
irebu kipushi bonzola lubudi
94 93 90 89
mokala gethy kasansa kitutu
87 85 85 84
manika kiyambi yamaluka lukelenge
82 79 78 77
vangu basoko binga kambove
76 75 75 75
tshilenge jiba kamituga kibua
73 69 69 69
kwamouth panda pimu dibindi
69 69 69 68
kunda lualaba mushie yalimbongo
68 68 67 67
bambo lowa mahagi bikoro
66 66 66 65
dekese drodro shabunda boso_mondanda
65 64 63 62
matadi bumba logo ubundu
62 61 59 59
bosondjo lita opala dilala
58 58 57 56
kabambare nsona_mpangu bipemba mbulula
55 55 54 53
kashobwe kayna kimpangu kitona
52 52 52 52
sakania citenge mukumbi police
52 51 51 51
tshishimbi yahisuli boma_bungu linga
51 51 50 50
wanierukula diulu rethy mikope
50 49 49 47
yamongili ferekeni bandundu kimbi_lulenge
46 45 44 44
kwilu_ngongo nioki boma lotumbe
44 44 43 42
ilebo ipamu kitenge basankusu
41 41 41 40
bulape lubutu yabaondo bili
40 40 40 39
bolomba kasongo lubilanji gombe_matadi
39 39 39 38
nizi djombo itebero bomongo
38 36 36 35
ingende kowe mwana ntondo
35 35 35 35
kalonge kansele nzaba iboko
34 34 34 33
miabi mitwaba monieka fataki
32 32 32 31
nzanza lubero mubumbano kampene
31 30 30 29
rwampara alimbongo rimba yambuku
29 28 28 28
walungu lukula mpokolo punia
27 25 25 25
tshopo kamina kaniola kabondo
25 24 24 23
kibombo pangi bosobe mangobo
23 23 22 22
saramabila sia aketi bokoro
22 22 21 21
lolanga_mampoko kilela_balanda kamango kanzenze
21 20 18 18
kisantu lolo mongbalu kuimba
18 18 18 17
nia_nia abuzi bosobolo bulu
17 15 15 15
gbadolite kabeya_kamuanga samba businga
15 15 15 14
karawa vuhovi wapinda wasolo
14 14 14 14
yakoma yaleko loko masereka
14 14 13 13
mobayi_mbongo mulungu ndage tshela
13 13 13 13
boga cilundu oshwe boko
12 11 11 10
kanda_kanda kangu lingomo mawuya
10 10 10 10
minembwe adi basali boko_kivulu
10 9 9 9
ariwara banga_lubaka boso_manzi kabinda
8 8 8 8
kabongo kilo butembo kakenge
7 7 6 6
katwa kizu laybo mwenga
6 6 6 6
nyakunde obokote oicha tshikapa
6 6 6 6
ikela itombwe komanda mwene_ditu
5 5 5 5
tembo banalia kalima kamonia
5 4 4 4
kaniama luambo lusangi mangala
4 4 4 4
mbaya musienene ntandembelo befale
4 4 4 3
beni bongandanga djuma inongo
3 3 3 3
kanzala kaziba kitangwa luebo
3 3 3 3
mandima masa mbanza_ngungu monga
3 3 3 3
monkoto ngidinga tunda vaku
3 3 3 3
aru bagata bobozo boende
2 2 2 2
bokungu boma_mangbetu doruma inga
2 2 2 2
kalole kambala kamwesha kikongo
2 2 2 2
kinda kiwa koshibanda luozi
2 2 2 2
luputa mweka niangara nyanga
2 2 2 2
songa wema aungba bafwagbogbo
2 2 1 1
bafwasende bagira baka biringi
1 1 1 1
bokonzi budjala damas dibaya
1 1 1 1
djolu faradje gemena gungu
1 1 1 1
hauts_plateaux idiofa kalenda kalonda
1 1 1 1
kalonda_est kalonda_ouest katende kayamba
1 1 1 1
kenge kibunzi kinkonzi libenge
1 1 1 1
likati lulingu makoro makota
1 1 1 1
mambasa mangembo mikalayi moanza
1 1 1 1
mondombe mutena mutshatsha ndjoko_mpunda
1 1 1 1
opienge panzi titule vanga
1 1 1 1
--- key ---
kalemie nyemba fizi uvira
969 945 943 928
kirotshe goma minova karisimbi
919 918 847 744
moba katana nundu kinkondja
703 697 647 575
mweso malemba bukama ruzizi
566 561 555 551
kadutu rutshuru masisi kabondo_dianda
530 529 498 452
kabare miti_murhesa mukanga kansimba
438 434 423 403
butumba ibanda lemera kabalo
393 390 390 344
limete nyiragongo bagira_kasha kalehe
343 342 309 291
lusambo kenya idjwi kongolo
274 272 271 269
kingabwa mulongo mutwanga tchomia
268 268 257 257
kampemba maluku_i manono bena_dibele
255 254 252 243
pania_mutombo katuba kilwa kisanga
243 225 218 216
pinga bolobo ankoro nsele
216 213 212 210
pweto bunyakiri kole kikula
209 205 204 202
barumbu gombe fungurume lubumbashi
198 198 197 197
mufunga_sampwe kokolo masina_ii kingasani
195 192 192 190
lukolela masina_i makanza yumbi
188 188 186 186
kisenso kalamu_i binza_meteo lwamba
184 182 181 181
matete nyunzu binza_ozone bumbu
181 180 179 178
ndjili kikimi ngiri_ngiri wangata
177 175 175 175
kalamu_ii birambizo ototo tshudi_loto
174 172 172 172
tshumbe vanga_kete bandalungwa dikungu
172 172 171 171
djalo_djeka lodja lomela minga
171 171 171 171
mont_ngafula_i omendjadi wembo_nyama selembao
171 171 171 170
kinshasa ngaba kintambo makala
169 169 167 167
mushenge kimbanseke nyarambe mont_ngafula_ii
166 165 165 163
biyela lingwala kasa_vubu lemba
162 162 161 161
kindu rwanguba lulenge_kimbi nyangezi
160 160 153 153
nyantende mumbunda lubunga kapolowe
152 151 146 143
bolenge angumu bunkeya bibanga
142 140 136 135
maluku_ii kalambayi_kabanga alunguli lukafu
132 131 130 129
mbandaka katoyi katako_kombe kyondo
129 128 127 127
likasi lilanga_bobangi bunia kamalondo
123 120 119 118
kibirizi mampoko mulumba isangi
118 118 118 116
rwashi walikale binza lufungula
115 113 112 111
kafubu kasenga makiso_kisangani moanda
109 108 106 104
yakusu muya kailo lisala
104 102 101 101
kimpese yahuma ngandajika tshamilemba
99 98 97 97
irebu kipushi bonzola lubudi
94 93 90 89
mokala gethy kasansa kitutu
87 85 85 84
manika kiyambi yamaluka lukelenge
82 79 78 77
vangu basoko binga kambove
76 75 75 75
tshilenge jiba kamituga kibua
73 69 69 69
kwamouth panda pimu dibindi
69 69 69 68
kunda lualaba mushie yalimbongo
68 68 67 67
bambo lowa mahagi bikoro
66 66 66 65
dekese drodro shabunda boso_mondanda
65 64 63 62
matadi bumba logo ubundu
62 61 59 59
bosondjo lita opala dilala
58 58 57 56
kabambare nsona_mpangu bipemba mbulula
55 55 54 53
kashobwe kayna kimpangu kitona
52 52 52 52
sakania citenge mukumbi police
52 51 51 51
tshishimbi yahisuli boma_bungu linga
51 51 50 50
wanierukula diulu rethy mikope
50 49 49 47
yamongili <NA> ferekeni bandundu
46 46 45 44
kimbi_lulenge kwilu_ngongo nioki boma
44 44 44 43
lotumbe ilebo ipamu kitenge
42 41 41 41
basankusu bulape lubutu yabaondo
40 40 40 40
bili bolomba kasongo lubilanji
39 39 39 39
gombe_matadi nizi djombo itebero
38 38 36 36
bomongo ingende kowe mwana
35 35 35 35
ntondo kalonge kansele nzaba
35 34 34 34
iboko miabi mitwaba monieka
33 32 32 32
fataki nzanza lubero mubumbano
31 31 30 30
kampene rwampara alimbongo rimba
29 29 28 28
yambuku walungu lukula mpokolo
28 27 25 25
punia tshopo kamina kaniola
25 25 24 24
kabondo kibombo pangi bosobe
23 23 23 22
mangobo saramabila sia aketi
22 22 22 21
bokoro lolanga_mampoko kilela_balanda kamango
21 21 20 18
kanzenze kisantu lolo mongbalu
18 18 18 18
kuimba nia_nia abuzi bosobolo
17 17 15 15
bulu gbadolite kabeya_kamuanga samba
15 15 15 15
businga karawa vuhovi wapinda
14 14 14 14
wasolo yakoma yaleko loko
14 14 14 13
masereka mobayi_mbongo mulungu ndage
13 13 13 13
tshela boga cilundu oshwe
13 12 11 11
boko kanda_kanda kangu lingomo
10 10 10 10
mawuya minembwe adi basali
10 10 9 9
boko_kivulu ariwara banga_lubaka boso_manzi
9 8 8 8
kabinda kabongo kilo butembo
8 7 7 6
kakenge katwa kizu laybo
6 6 6 6
mwenga nyakunde obokote oicha
6 6 6 6
tshikapa ikela itombwe komanda
6 5 5 5
mwene_ditu tembo banalia kalima
5 5 4 4
kamonia kaniama luambo lusangi
4 4 4 4
mangala mbaya musienene ntandembelo
4 4 4 4
befale beni bongandanga djuma
3 3 3 3
inongo kanzala kaziba kitangwa
3 3 3 3
luebo mandima masa mbanza_ngungu
3 3 3 3
monga monkoto ngidinga tunda
3 3 3 3
vaku aru bagata bobozo
3 2 2 2
boende bokungu boma_mangbetu doruma
2 2 2 2
inga kalole kambala kamwesha
2 2 2 2
kikongo kinda kiwa koshibanda
2 2 2 2
luozi luputa mweka niangara
2 2 2 2
nyanga songa wema aungba
2 2 2 1
bafwagbogbo bafwasende bagira baka
1 1 1 1
biringi bokonzi budjala damas
1 1 1 1
dibaya djolu faradje gemena
1 1 1 1
gungu hauts_plateaux idiofa kalenda
1 1 1 1
kalonda kalonda_est kalonda_ouest katende
1 1 1 1
kayamba kenge kibunzi kinkonzi
1 1 1 1
libenge likati lulingu makoro
1 1 1 1
makota mambasa mangembo mikalayi
1 1 1 1
moanza mondombe mutena mutshatsha
1 1 1 1
ndjoko_mpunda opienge panzi titule
1 1 1 1
vanga
1
Code
library (ggplot2)
cat_vars <- names (clean_df_stage1)[sapply (clean_df_stage1, function (x) is.character (x) || is.factor (x))]
if (length (cat_vars) > 0 ) {
plot_list <- list ()
for (v in cat_vars) {
freq <- as.data.frame (table (clean_df_stage1[[v]], useNA = "ifany" ),
stringsAsFactors = FALSE
)
names (freq) <- c ("category" , "count" )
freq$ category[is.na (freq$ category)] <- "(NA)"
# keep top 8 categories, lump the rest
if (nrow (freq) > 15 ) {
freq <- freq[order (- freq$ count), ]
top <- freq[1 : 15 , ]
other <- data.frame (
category = "Other" ,
count = sum (freq$ count[16 : nrow (freq)])
)
freq <- rbind (top, other)
}
freq$ fraction <- freq$ count / sum (freq$ count)
freq$ ymax <- cumsum (freq$ fraction)
freq$ ymin <- c (0 , head (freq$ ymax, - 1 ))
freq$ label_pos <- (freq$ ymax + freq$ ymin) / 2
freq$ label <- paste0 (
freq$ category, " \n (n=" ,
formatC (freq$ count, format = "d" , big.mark = "," ), ")"
)
p <- ggplot (freq, aes (
ymax = ymax, ymin = ymin, xmax = 4 , xmin = 2.5 ,
fill = category
)) +
geom_rect () +
geom_text (aes (x = 3.25 , y = label_pos, label = label), size = 2.5 ) +
coord_polar (theta = "y" ) +
xlim (c (1 , 4 )) +
theme_void () +
theme (legend.position = "none" ) +
labs (title = v)
plot_list[[v]] <- p
}
if (requireNamespace ("patchwork" , quietly = TRUE )) {
library (patchwork)
wrap_plots (plot_list, ncol = min (length (plot_list), 3 ))
} else {
for (p in plot_list) print (p)
}
} else {
cat ("No categorical variables to plot. \n " )
}
3. Exploring data set B: idsr_target_clean.csv
This dataset contains weekly cholera surveillance data at the health-zone level, with population denominators and pre-computed incidence rates. It will be joined with clean_df_stage1 in a later step.
3.1 Load and inspect
Code
idsr_path <- file.path (data_dir, "idsr_target_clean.csv" )
idsr_df <- read_table_safely (idsr_path)
cat ("idsr_target_clean rows:" , nrow (idsr_df), " cols:" , ncol (idsr_df), " \n " )
idsr_target_clean rows: 416004 cols: 11
Code
cat ("Column names:" , paste (names (idsr_df), collapse = ", " ), " \n " )
Column names: hz_code, year, week, cases, deaths, adm1_fr, hz, date, population, adm1_pcode, cases_pop
Code
spc_tbl_ [416,004 × 11] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
$ hz_code : chr [1:416004] "CD6101ZS01" "CD6101ZS01" "CD6101ZS01" "CD6101ZS01" ...
$ year : num [1:416004] 2006 2006 2006 2006 2006 ...
$ week : num [1:416004] 1 2 3 4 5 6 7 8 9 10 ...
$ cases : num [1:416004] 56 14 22 21 32 31 46 103 48 54 ...
$ deaths : num [1:416004] 0 0 0 0 0 0 0 1 0 0 ...
$ adm1_fr : chr [1:416004] "nord_kivu" "nord_kivu" "nord_kivu" "nord_kivu" ...
$ hz : chr [1:416004] "goma" "goma" "goma" "goma" ...
$ date : Date[1:416004], format: "2006-01-02" "2006-01-09" ...
$ population: num [1:416004] 296096 296096 296096 296096 296096 ...
$ adm1_pcode: chr [1:416004] "CD61" "CD61" "CD61" "CD61" ...
$ cases_pop : num [1:416004] 1.89e-04 4.73e-05 7.43e-05 7.09e-05 1.08e-04 ...
- attr(*, "spec")=
.. cols(
.. hz_code = col_character(),
.. year = col_double(),
.. week = col_double(),
.. cases = col_double(),
.. deaths = col_double(),
.. adm1_fr = col_character(),
.. hz = col_character(),
.. date = col_date(format = ""),
.. population = col_double(),
.. adm1_pcode = col_character(),
.. cases_pop = col_double()
.. )
- attr(*, "problems")=<externalptr>
3.2 Variable descriptions
hz_code
Health-zone geocode (character, e.g. CD6101ZS01). 400 unique values.
year
Calendar year (numeric), 2006–2025 (20 years).
week
Epidemiological week number (numeric), 1–52.
cases
Reported cholera cases for the week (numeric), 0–1 177.
deaths
Reported cholera deaths for the week (numeric), 0–98.
adm1_fr
Province name in French, lower-snake-case (character). 26 unique provinces.
hz
Health-zone name, lower-snake-case (character). 400 unique values.
date
Week start date (Date class, ISO YYYY-MM-DD). Range 2006-01-02 to 2025-12-22.
population
Health-zone population (numeric). Static per health zone (does not vary across years).
adm1_pcode
Province geocode (character, e.g. CD61). 26 unique values.
cases_pop
Pre-computed incidence rate = cases / population. Verified to match exactly.
3.3 Variable profile
Code
idsr_profile <- profile_df (idsr_df)
knitr:: kable (idsr_profile, caption = "All variables (idsr_target_clean) sorted by missingness" )
All variables (idsr_target_clean) sorted by missingness
hz_code
hz_code
character
0
0
400
CD6101ZS01
year
year
numeric
0
0
20
2006
week
week
numeric
0
0
52
1
cases
cases
numeric
0
0
308
56
deaths
deaths
numeric
0
0
23
0
adm1_fr
adm1_fr
character
0
0
26
nord_kivu
hz
hz
character
0
0
400
goma
population
population
numeric
0
0
400
296096
adm1_pcode
adm1_pcode
character
0
0
26
CD61
cases_pop
cases_pop
numeric
0
0
7537
0.000189127850426889
date
date
Date
NA
NA
1
NA
3.4 Missingness
Code
cat ("Zero missing values across all" , ncol (idsr_df), "columns. \n " )
Zero missing values across all 11 columns.
Code
Code
sapply (idsr_df, function (x) sum (is.na (x)))
hz_code year week cases deaths adm1_fr hz
0 0 0 0 0 0 0
date population adm1_pcode cases_pop
0 0 0 0
3.5 Date quality
Code
cat ("Date class:" , paste (class (idsr_df$ date), collapse = ", " ), " \n " )
Code
cat ("Date range:" , as.character (min (idsr_df$ date)), "to" , as.character (max (idsr_df$ date)), " \n " )
Date range: 2006-01-02 to 2025-12-22
Code
cat ("Future dates (after today):" , sum (idsr_df$ date > Sys.Date ()), " \n " )
Future dates (after today): 0
Code
cat ("Dates before 2000:" , sum (idsr_df$ date < as.Date ("2000-01-01" )), " \n " )
All dates are already parsed as Date class, in ISO format, with no impossible values.
3.6 Year/date consistency
Code
idsr_df <- idsr_df %>%
mutate (date_year = year (date))
idsr_mismatch <- idsr_df %>%
filter (date_year != year)
cat ("Year mismatches (date year != year column):" , nrow (idsr_mismatch), " \n " )
Year mismatches (date year != year column): 3200
All 3 200 mismatches are benign week-1 boundary cases (the week-start date falls in late December of the prior year while year is correctly assigned to the following epidemiological year).
Code
# verify: all mismatches are week 1, year differs by exactly 1
cat (
"All mismatches are week 1:" ,
all (idsr_mismatch$ week == 1 ), " \n "
)
All mismatches are week 1: TRUE
Code
cat (
"All year diffs are exactly 1:" ,
all (abs (idsr_mismatch$ date_year - idsr_mismatch$ year) == 1 ), " \n "
)
All year diffs are exactly 1: TRUE
Code
idsr_df <- idsr_df %>% select (- date_year)
3.7 Epidemiological week completeness
Code
cat ("Week range:" , min (idsr_df$ week), "to" , max (idsr_df$ week), " \n " )
Code
cat ("Unique weeks:" , n_distinct (idsr_df$ week), " \n\n " )
Code
# check completeness per year
idsr_week_summary <- idsr_df %>%
group_by (year) %>%
summarise (
min_week = min (week),
max_week = max (week),
n_unique_weeks = n_distinct (week),
.groups = "drop"
)
knitr:: kable (idsr_week_summary, caption = "Epi week coverage per year" )
Epi week coverage per year
2006
1
52
52
2007
1
52
52
2008
1
52
52
2009
1
52
52
2010
1
52
52
2011
1
52
52
2012
1
52
52
2013
1
52
52
2014
1
52
52
2015
1
52
52
2016
1
52
52
2017
1
52
52
2018
1
52
52
2019
1
52
52
2020
1
52
52
2021
1
52
52
2022
1
52
52
2023
1
52
52
2024
1
52
52
2025
1
52
52
Every year has complete weeks 1–52. No year has week 53 (unlike clean_df_stage1 where 2009 and 2015 have week 53).
3.8 Rows per health zone
Code
rows_per_hz <- idsr_df %>% count (hz_code, name = "n_rows" )
cat ("Expected rows per HZ: 20 years * 52 weeks =" , 20 * 52 , " \n " )
Expected rows per HZ: 20 years * 52 weeks = 1040
Code
cat ("HZs with exactly 1040 rows:" , sum (rows_per_hz$ n_rows == 1040 ), "of" , nrow (rows_per_hz), " \n " )
HZs with exactly 1040 rows: 399 of 400
Code
outliers <- rows_per_hz %>% filter (n_rows != 1040 )
if (nrow (outliers) > 0 ) {
cat (" \n HZs with unexpected row counts: \n " )
outliers <- outliers %>%
left_join (idsr_df %>% distinct (hz_code, hz), by = "hz_code" )
print (outliers)
}
HZs with unexpected row counts:
# A tibble: 1 × 3
hz_code n_rows hz
<chr> <int> <chr>
1 CD9207ZS03 1044 mushenge
3.9 Duplicate rows
Code
dups <- idsr_df %>%
group_by (hz_code, year, week) %>%
filter (n () > 1 ) %>%
ungroup ()
cat ("Duplicate (hz_code, year, week) rows:" , nrow (dups), " \n\n " )
Duplicate (hz_code, year, week) rows: 8
Code
if (nrow (dups) > 0 ) {
cat ("These duplicates all belong to hz_code CD9207ZS03 (mushenge): \n " )
dups %>%
select (hz_code, hz, year, week, cases, deaths, date) %>%
arrange (year, week) %>%
print (n = 20 )
}
These duplicates all belong to hz_code CD9207ZS03 (mushenge):
# A tibble: 8 × 7
hz_code hz year week cases deaths date
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <date>
1 CD9207ZS03 mushenge 2018 33 24 6 2018-08-13
2 CD9207ZS03 mushenge 2018 33 6 2 2018-08-13
3 CD9207ZS03 mushenge 2018 34 11 0 2018-08-20
4 CD9207ZS03 mushenge 2018 34 5 0 2018-08-20
5 CD9207ZS03 mushenge 2018 35 15 1 2018-08-27
6 CD9207ZS03 mushenge 2018 35 13 0 2018-08-27
7 CD9207ZS03 mushenge 2022 41 1 1 2022-10-10
8 CD9207ZS03 mushenge 2022 41 0 0 2022-10-10
There are 8 duplicate rows (4 duplicated week-keys), all in health zone mushenge (CD9207ZS03): weeks 33–35 of 2018 and week 41 of 2022. The duplicate pairs have different case/death counts, suggesting double reporting rather than exact duplicates.
3.10 Population
Code
cat (
"Population is static per health zone (does not vary by year):" ,
idsr_df %>%
group_by (hz_code) %>%
summarise (n_pop = n_distinct (population), .groups = "drop" ) %>%
pull (n_pop) %>% max () == 1 ,
" \n\n "
)
Population is static per health zone (does not vary by year): TRUE
Code
pop_summary <- idsr_df %>%
distinct (hz_code, hz, population) %>%
arrange (population)
cat ("Smallest HZ population: \n " )
Code
print (head (pop_summary, 5 ))
# A tibble: 5 × 3
hz_code hz population
<chr> <chr> <dbl>
1 CD4502ZS02 wema 10393
2 CD7302ZS01 baka 38249
3 CD4103ZS01 irebu 41282
4 CD7101ZS06 kowe 43539
5 CD9101ZS01 bobozo 53057
Code
cat (" \n Largest HZ population: \n " )
Code
print (tail (pop_summary, 5 ))
# A tibble: 5 × 3
hz_code hz population
<chr> <chr> <dbl>
1 CD7105ZS04 manika 563966
2 CD1000ZS06 bumbu 567311
3 CD1000ZS17 kisenso 592927
4 CD6101ZS02 karisimbi 723230
5 CD6103ZS01 katoyi 744355
Code
cat (" \n Population range:" , min (pop_summary$ population), "to" , max (pop_summary$ population), " \n " )
Population range: 10393 to 744355
Note: Population is a single static value per health zone — it does not change across the 20-year span. This may be a limitation for long-term incidence calculations.
3.11 Cases and deaths
Code
Code
cat ("Range:" , min (idsr_df$ cases), "to" , max (idsr_df$ cases), " \n " )
Code
cat ("Negative values:" , sum (idsr_df$ cases < 0 ), " \n " )
Code
cat (
"Zero-case weeks:" , sum (idsr_df$ cases == 0 ),
sprintf ("(%.1f%%)" , 100 * mean (idsr_df$ cases == 0 )), " \n "
)
Zero-case weeks: 380560 (91.5%)
Code
cat ("Mean:" , round (mean (idsr_df$ cases), 3 ), " Median:" , median (idsr_df$ cases), " \n " )
Code
cat (" \n === Deaths === \n " )
Code
cat ("Range:" , min (idsr_df$ deaths), "to" , max (idsr_df$ deaths), " \n " )
Code
cat ("Negative values:" , sum (idsr_df$ deaths < 0 ), " \n " )
Code
cat (
"Zero-death weeks:" , sum (idsr_df$ deaths == 0 ),
sprintf ("(%.1f%%)" , 100 * mean (idsr_df$ deaths == 0 )), " \n "
)
Zero-death weeks: 410417 (98.7%)
Code
cat ("Mean:" , round (mean (idsr_df$ deaths), 3 ), " Median:" , median (idsr_df$ deaths), " \n " )
Code
cat (" \n === Deaths > Cases? === \n " )
Code
deaths_gt_cases <- idsr_df %>% filter (deaths > cases)
cat ("Rows where deaths > cases:" , nrow (deaths_gt_cases), " \n " )
Rows where deaths > cases: 5
Code
if (nrow (deaths_gt_cases) > 0 ) {
print (head (deaths_gt_cases %>% select (hz_code, hz, year, week, cases, deaths), 10 ))
}
# A tibble: 5 × 6
hz_code hz year week cases deaths
<chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 CD6111ZS05 rutshuru 2007 18 0 1
2 CD7305ZS01 kinkondja 2023 47 9 11
3 CD7306ZS03 kabondo_dianda 2016 24 0 9
4 CD6311ZS02 kunda 2025 31 0 5
5 CD6202ZS01 kabare 2022 48 0 1
3.12 Numeric distributions
Code
idsr_num_vars <- names (idsr_df)[sapply (idsr_df, is.numeric)]
idsr_num_summary <- data.frame (
variable = idsr_num_vars,
mean = sapply (idsr_df[idsr_num_vars], function (x) round (mean (x, na.rm = TRUE ), 3 )),
sd = sapply (idsr_df[idsr_num_vars], function (x) round (sd (x, na.rm = TRUE ), 3 )),
min = sapply (idsr_df[idsr_num_vars], function (x) round (min (x, na.rm = TRUE ), 3 )),
p50 = sapply (idsr_df[idsr_num_vars], function (x) round (median (x, na.rm = TRUE ), 3 )),
max = sapply (idsr_df[idsr_num_vars], function (x) round (max (x, na.rm = TRUE ), 3 ))
)
knitr:: kable (idsr_num_summary, caption = "Numeric summary (idsr_target_clean)" )
Numeric summary (idsr_target_clean)
year
year
2015.500
5.766
2006
2016
2025.000
week
week
26.500
15.008
1
27
52.000
cases
cases
1.349
10.122
0
0
1177.000
deaths
deaths
0.026
0.342
0
0
98.000
population
population
218963.584
113084.783
10393
187233
744355.000
cases_pop
cases_pop
0.000
0.000
0
0
0.004
Code
old_par <- par (no.readonly = TRUE )
on.exit (par (old_par), add = TRUE )
par (mfrow = c (2 , 3 ))
for (v in idsr_num_vars) {
hist (idsr_df[[v]],
main = v, xlab = v, col = "steelblue" , border = "white"
)
}
3.13 Province and health-zone naming vs clean_df_stage1
Code
idsr_hz_names <- sort (unique (idsr_df$ hz))
clean_hz_names <- sort (unique (clean_df_stage1$ hz))
cat ("Unique HZs in idsr:" , length (idsr_hz_names), " \n " )
Code
cat ("Unique HZs in clean_df_stage1:" , n_distinct (clean_hz_names), " \n " )
Unique HZs in clean_df_stage1: 428
Code
cat ("In idsr but NOT in clean_df_stage1:" , length (setdiff (idsr_hz_names, clean_hz_names)), " \n " )
In idsr but NOT in clean_df_stage1: 0
Code
cat ("In clean_df_stage1 but NOT in idsr:" , length (setdiff (clean_hz_names, idsr_hz_names)), " \n\n " )
In clean_df_stage1 but NOT in idsr: 28
Code
extra_in_clean <- setdiff (clean_hz_names, idsr_hz_names)
if (length (extra_in_clean) > 0 ) {
cat ("HZ names in clean_df_stage1 but missing from idsr: \n " )
print (extra_in_clean)
}
HZ names in clean_df_stage1 but missing from idsr:
[1] "bagira_kasha" "binza_meteo" "binza_ozone"
[4] "bipemba" "boma_bungu" "boso_mondanda"
[7] "citenge" "kabeya_kamuanga" "kakenge"
[10] "kalambayi_kabanga" "kalonda" "kilela_balanda"
[13] "kiwa" "kuimba" "lufungula"
[16] "lulenge_kimbi" "makiso_kisangani" "malemba"
[19] "mampoko" "mobayi_mbongo" "mongbalu"
[22] "mont_ngafula_i" "mont_ngafula_ii" "mufunga_sampwe"
[25] "mwene_ditu" "ndjoko_mpunda" "vanga_kete"
[28] "wembo_nyama"
Code
cat (" \n === Province comparison === \n " )
=== Province comparison ===
Code
idsr_prov <- sort (unique (idsr_df$ adm1_fr))
clean_prov <- sort (unique (clean_df_stage1$ prov))
cat ("In idsr but NOT in clean_df_stage1:" , paste (setdiff (idsr_prov, clean_prov), collapse = ", " ), " \n " )
In idsr but NOT in clean_df_stage1:
Code
cat ("In clean_df_stage1 but NOT in idsr:" , paste (setdiff (clean_prov, idsr_prov), collapse = ", " ), " \n " )
In clean_df_stage1 but NOT in idsr: bandundu, bas_congo, kasai_occidental, katanga, orientale
Note on province differences: clean_df_stage1 contains 5 legacy province names (bandundu, bas_congo, kasai_occidental, katanga, orientale) that were split/renamed in the 2015 DRC administrative reform. These do not appear in idsr_target_clean, which uses only the current 26-province scheme.
3.14 Summary of issues found
8 duplicate rows (4 week-keys) in mushenge
Low
Different case/death values — needs decision on which to keep or whether to sum
Population is static across all 20 years
Medium
Single denominator per HZ regardless of year — may distort incidence trends
No week 53 in any year
Low
2009 and 2015 have week 53 in clean_df_stage1 but not here — week-53 data will not match
28 HZ names in clean_df_stage1 absent from idsr
Medium
These health zones have no population denominator in idsr_target_clean
5 legacy province names in clean_df_stage1
Medium
Pre-2015 province codes need mapping to current 26-province scheme for a clean join
4. Cleaning data set B
4.1 Map legacy province names in clean_df_stage1
The DRC’s 2015 administrative reform split 5 legacy provinces into the current 26-province scheme. We remap the prov column in clean_df_stage1 using a two-step approach:
Lookup via idsr_target_clean: For each health zone (hz) that has a legacy province name, we look up its current province in idsr_df (which uses the modern 26-province names). This successfully maps 138 of 147 legacy HZ/province combinations.
Manual assignment from hz_naming_convention geocodes: For the remaining 9 HZs not found in idsr_df, we use the pcode prefix in hz_naming_convention.csv to identify the correct province. Two HZs (kalonda and kiwa, totalling 3 rows) have no geocode and are left unmapped.
Code
# prov and hz values are already lower_snake_case from Step 5
# Create hz_norm alias for joining (hz values are already normalised)
clean_df_stage1 <- clean_df_stage1 %>%
mutate (hz_norm = hz)
# Build HZ -> current province lookup from idsr
idsr_hz_prov <- idsr_df %>%
distinct (hz, adm1_fr)
legacy_provs <- c ("bandundu" , "bas_congo" , "kasai_occidental" , "katanga" , "orientale" )
# Step 1: lookup from idsr
clean_df_stage1 <- clean_df_stage1 %>%
left_join (idsr_hz_prov, by = c ("hz_norm" = "hz" ), suffix = c ("" , "_idsr" ))
# Step 2: manual mapping for the 9 unmapped HZs (from hz_naming_convention pcodes)
manual_map <- tribble (
~ hz_norm, ~ manual_prov,
"boma_bungu" , "kongo_central" ,
"kuimba" , "kongo_central" ,
"kilela_balanda" , "haut_katanga" ,
"malemba" , "haut_lomami" ,
"mufunga_sampwe" , "haut_katanga" ,
"makiso_kisangani" , "tshopo" ,
"mongbalu" , "ituri" ,
"bambo" , "nord_kivu" ,
"nyantende" , "sud_kivu"
)
clean_df_stage1 <- clean_df_stage1 %>%
left_join (manual_map, by = "hz_norm" )
# Apply remapping: only overwrite prov for rows with legacy names
clean_df_stage1 <- clean_df_stage1 %>%
mutate (
prov = case_when (
prov %in% legacy_provs & ! is.na (adm1_fr) ~ adm1_fr,
prov %in% legacy_provs & ! is.na (manual_prov) ~ manual_prov,
TRUE ~ prov
)
) %>%
select (- adm1_fr, - manual_prov, - hz_norm)
# Report
n_still_legacy <- sum (clean_df_stage1$ prov %in% legacy_provs)
cat ("Rows still with legacy province name:" , n_still_legacy, " \n " )
Rows still with legacy province name: 3
Code
cat (" kalonda (kasai_occidental, 2008 wk 34): 1 row — no geocode available \n " )
kalonda (kasai_occidental, 2008 wk 34): 1 row — no geocode available
Code
cat (" kiwa (katanga, 2006 wks 45 & 50): 2 rows — no geocode available \n " )
kiwa (katanga, 2006 wks 45 & 50): 2 rows — no geocode available
Code
cat ("Unique provinces after remapping:" , n_distinct (clean_df_stage1$ prov), " \n " )
Unique provinces after remapping: 28
Code
cat (sort (unique (clean_df_stage1$ prov)), sep = " \n " )
bas_uele
equateur
haut_katanga
haut_lomami
haut_uele
ituri
kasai
kasai_central
kasai_occidental
kasai_oriental
katanga
kinshasa
kongo_central
kwango
kwilu
lomami
lualaba
mai_ndombe
maniema
mongala
nord_kivu
nord_ubangi
sankuru
sud_kivu
sud_ubangi
tanganyika
tshopo
tshuapa
4.2 Resolve duplicates in idsr_df
The 8 duplicate rows (4 week-keys) in health zone mushenge (CD9207ZS03) have different case/death counts, suggesting double reporting. We aggregate by summing cases and deaths for duplicated keys and recompute cases_pop.
Code
n_before_idsr <- nrow (idsr_df)
idsr_clean <- idsr_df %>%
group_by (hz_code, year, week, adm1_fr, hz, date, population, adm1_pcode) %>%
summarise (
cases = sum (cases),
deaths = sum (deaths),
.groups = "drop"
) %>%
mutate (cases_pop = cases / population)
n_after_idsr <- nrow (idsr_clean)
cat ("idsr rows before:" , n_before_idsr, " \n " )
Code
cat ("idsr rows after dedup:" , n_after_idsr, " \n " )
idsr rows after dedup: 416000
Code
cat ("Rows removed:" , n_before_idsr - n_after_idsr, " \n " )
Code
# verify no duplicates remain
cat (
"Remaining duplicates:" ,
idsr_clean %>%
group_by (hz_code, year, week) %>%
filter (n () > 1 ) %>%
nrow (),
" \n "
)
4.3 Build clean_df_stage2
Code
clean_df_stage2 <- clean_df_stage1
cat ("clean_df_stage2 created. \n " )
Code
cat ("Rows:" , nrow (clean_df_stage2), " \n " )
Code
cat ("Columns:" , ncol (clean_df_stage2), " \n " )
Code
cat ("Column names:" , paste (names (clean_df_stage2), collapse = ", " ), " \n " )
Column names: prov, numsem, debutsem, totalcas, totaldeces, year, maladie, hz, key
4.4 Summary of all cleaning decisions, where clean_data_20260225 and idsr_target_clean are still separate datasets, saved as new cleaned data objects “clean_df_stage2” and “idsr_clean”
Code
cleaning_summary <- data.frame (
Dataset = c (
"clean_data_20260225" ,
"clean_data_20260225" ,
"clean_data_20260225" ,
"clean_data_20260225" ,
"clean_data_20260225" ,
"idsr_target_clean" ,
"idsr_target_clean" ,
"idsr_target_clean" ,
"idsr_target_clean" ,
"idsr_target_clean" ,
"idsr_target_clean" ,
"clean_data_20260225"
),
Issue = c (
"Three inconsistent date formats (M/D/YYYY, D-Mon-YY, YYYY-MM-DD)" ,
"10 rows with impossible dates (before 2000 or far future up to 2102)" ,
"741 year mismatches between DEBUTSEM and YEAR column" ,
"Incomplete epi weeks: 2017 missing week 52; 2020 missing weeks 48-52" ,
"Column names and character values inconsistent (mixed case, special characters)" ,
"No date format issues; no missing data" ,
"3,200 year/date mismatches (all benign week-1 boundary cases)" ,
"No week 53 in any year (unlike clean_df_stage1 where 2009 and 2015 have week 53)" ,
"8 duplicate rows (4 week-keys) in mushenge (CD9207ZS03)" ,
"Population is static per HZ (single value across all 20 years)" ,
"28 HZ names in clean_df_stage1 absent from idsr" ,
"5 legacy province names from pre-2015 administrative scheme"
),
Action = c (
"Standardised all three formats to a single Date column via lubridate" ,
"Removed (data-entry errors: dates ranged from 1989 to 2102)" ,
"Removed 115 genuine errors (date/year differ by >1 year or mid-year week mismatch); retained 626 benign week-1/52/53 boundary cases" ,
"Left as-is (likely reflects genuine gaps in surveillance reporting)" ,
"Column names and all character values lowercased to lower_snake_case, special characters replaced with underscores, whitespace trimmed" ,
"No action needed" ,
"No action needed (verified: all are week 1, year differs by exactly 1)" ,
"No action taken; week-53 rows in clean_df_stage1 will not match on join" ,
"Summed cases and deaths for duplicate week-keys to consolidate into single rows" ,
"Not updated with year-varying population data yet (noted for future work)" ,
"No action taken (these 28 HZs have no population denominator in idsr)" ,
"Remapped to current 26-province names using a two-step method: (1) looked up each HZ in idsr_target_clean to find its current province (138 of 147 HZ/province combos matched); (2) for the remaining 9 HZs, used pcode prefixes in hz_naming_convention.csv to identify the correct province (7 more mapped, plus 2 cross-province corrections: bambo->nord_kivu, nyantende->sud_kivu). 2 HZs (kalonda, kiwa; 3 rows total) have no geocode and remain unmapped."
),
stringsAsFactors = FALSE
)
knitr:: kable (cleaning_summary, caption = "Summary of cleaning decisions for clean_df_stage2" )
Summary of cleaning decisions for clean_df_stage2
clean_data_20260225
Three inconsistent date formats (M/D/YYYY, D-Mon-YY, YYYY-MM-DD)
Standardised all three formats to a single Date column via lubridate
clean_data_20260225
10 rows with impossible dates (before 2000 or far future up to 2102)
Removed (data-entry errors: dates ranged from 1989 to 2102)
clean_data_20260225
741 year mismatches between DEBUTSEM and YEAR column
Removed 115 genuine errors (date/year differ by >1 year or mid-year week mismatch); retained 626 benign week-1/52/53 boundary cases
clean_data_20260225
Incomplete epi weeks: 2017 missing week 52; 2020 missing weeks 48-52
Left as-is (likely reflects genuine gaps in surveillance reporting)
clean_data_20260225
Column names and character values inconsistent (mixed case, special characters)
Column names and all character values lowercased to lower_snake_case, special characters replaced with underscores, whitespace trimmed
idsr_target_clean
No date format issues; no missing data
No action needed
idsr_target_clean
3,200 year/date mismatches (all benign week-1 boundary cases)
No action needed (verified: all are week 1, year differs by exactly 1)
idsr_target_clean
No week 53 in any year (unlike clean_df_stage1 where 2009 and 2015 have week 53)
No action taken; week-53 rows in clean_df_stage1 will not match on join
idsr_target_clean
8 duplicate rows (4 week-keys) in mushenge (CD9207ZS03)
Summed cases and deaths for duplicate week-keys to consolidate into single rows
idsr_target_clean
Population is static per HZ (single value across all 20 years)
Not updated with year-varying population data yet (noted for future work)
idsr_target_clean
28 HZ names in clean_df_stage1 absent from idsr
No action taken (these 28 HZs have no population denominator in idsr)
clean_data_20260225
5 legacy province names from pre-2015 administrative scheme
Remapped to current 26-province names using a two-step method: (1) looked up each HZ in idsr_target_clean to find its current province (138 of 147 HZ/province combos matched); (2) for the remaining 9 HZs, used pcode prefixes in hz_naming_convention.csv to identify the correct province (7 more mapped, plus 2 cross-province corrections: bambo->nord_kivu, nyantende->sud_kivu). 2 HZs (kalonda, kiwa; 3 rows total) have no geocode and remain unmapped.
5. Combining clean_df_stage2 and idsr_clean into Combined_data
We perform a full join on (hz, year, week) so that:
All 416,000 rows from idsr_clean are retained (the complete HZ-year-week panel with zero-case weeks and population denominators).
All 45,948 rows from clean_df_stage2 are retained, including the 28 HZs that do not appear in idsr_clean (these will have NA for population and other idsr-only columns).
5.2 Reconcile overlapping columns
Both datasets contain cases, deaths, province, and date columns. We coalesce these: prefer clean_df_stage2 values where available, fall back to idsr_clean.
Code
Combined_data <- Combined_data %>%
mutate (
# cases/deaths: prefer clean_df_stage2, fall back to idsr
cases = coalesce (totalcas, cases),
deaths = coalesce (totaldeces, deaths),
# province: prefer clean_df_stage2 (has remapped legacy names)
prov = coalesce (prov_clean, prov_idsr),
# date: prefer idsr (always well-formed ISO), fall back to clean
date = coalesce (date, debutsem),
# maladie: always cholera in this dataset
maladie = "cholera" ,
# key: fill from hz where missing
key = coalesce (key, hz)
) %>%
select (
hz_code, prov, adm1_pcode,
hz, year, numsem, date,
cases, deaths,
population, cases_pop,
maladie, key
)
cat ("=== After reconciliation === \n " )
=== After reconciliation ===
Code
cat ("Rows:" , nrow (Combined_data), " \n " )
Code
cat ("Columns:" , ncol (Combined_data), " \n " )
Code
cat ("Column names:" , paste (names (Combined_data), collapse = ", " ), " \n " )
Column names: hz_code, prov, adm1_pcode, hz, year, numsem, date, cases, deaths, population, cases_pop, maladie, key
5.3 Join diagnostics
Code
# Rows from idsr only (zero-case weeks not in clean_df_stage2)
n_idsr_only <- Combined_data %>%
filter (is.na (maladie)) %>%
nrow ()
# Rows from clean_df_stage2 only (the 28 HZs not in idsr)
n_clean_only <- Combined_data %>%
filter (is.na (hz_code)) %>%
nrow ()
# Rows matched in both
n_both <- nrow (Combined_data) - n_idsr_only - n_clean_only
cat ("Rows from idsr_clean only (zero-case weeks):" , n_idsr_only, " \n " )
Rows from idsr_clean only (zero-case weeks): 0
Code
cat ("Rows from clean_df_stage2 only (28 extra HZs):" , n_clean_only, " \n " )
Rows from clean_df_stage2 only (28 extra HZs): 3098
Code
cat ("Rows matched in both datasets:" , n_both, " \n " )
Rows matched in both datasets: 416000
Code
cat ("Total:" , nrow (Combined_data), " \n\n " )
Code
cat ("=== Missingness after join === \n " )
=== Missingness after join ===
Code
for (nm in names (Combined_data)) {
n_na <- sum (is.na (Combined_data[[nm]]))
if (n_na > 0 ) {
cat (sprintf (
" %-12s %d NA (%.1f%%) \n " , nm, n_na,
100 * n_na / nrow (Combined_data)
))
}
}
hz_code 3098 NA (0.7%)
adm1_pcode 3098 NA (0.7%)
date 2 NA (0.0%)
cases 1 NA (0.0%)
deaths 1 NA (0.0%)
population 3098 NA (0.7%)
cases_pop 3098 NA (0.7%)
5.4 Save all three datasets
Code
out_dir <- "/rds/general/user/acp25/home/MIMIC/Clean_data/data"
readr:: write_csv (
clean_df_stage2,
file.path (out_dir, "clean_df_stage2.csv" )
)
readr:: write_csv (
idsr_clean,
file.path (out_dir, "idsr_clean.csv" )
)
readr:: write_csv (
Combined_data,
file.path (out_dir, "Combined_data.csv" )
)
cat ("Saved: \n " )
Code
cat (" clean_df_stage2.csv -" , nrow (clean_df_stage2), "rows \n " )
clean_df_stage2.csv - 45948 rows
Code
cat (" idsr_clean.csv -" , nrow (idsr_clean), "rows \n " )
idsr_clean.csv - 416000 rows
Code
cat (" Combined_data.csv -" , nrow (Combined_data), "rows \n " )
Combined_data.csv - 419098 rows
5.5 Dataset dimensions and structure
Code
cat ("Rows:" , nrow (Combined_data), " \n " )
Code
cat ("Columns:" , ncol (Combined_data), " \n\n " )
Code
tibble [419,098 × 13] (S3: tbl_df/tbl/data.frame)
$ hz_code : chr [1:419098] "CD1000ZS01" "CD1000ZS01" "CD1000ZS01" "CD1000ZS01" ...
$ prov : chr [1:419098] "kinshasa" "kinshasa" "kinshasa" "kinshasa" ...
$ adm1_pcode: chr [1:419098] "CD10" "CD10" "CD10" "CD10" ...
$ hz : chr [1:419098] "bandalungwa" "bandalungwa" "bandalungwa" "bandalungwa" ...
$ year : num [1:419098] 2006 2006 2006 2006 2006 ...
$ numsem : num [1:419098] 1 2 3 4 5 6 7 8 9 10 ...
$ date : Date[1:419098], format: "2006-01-02" "2006-01-09" ...
$ cases : num [1:419098] 0 0 0 0 0 0 0 0 0 0 ...
$ deaths : num [1:419098] 0 0 0 0 0 0 0 0 0 0 ...
$ population: num [1:419098] 248790 248790 248790 248790 248790 ...
$ cases_pop : num [1:419098] 0 0 0 0 0 0 0 0 0 0 ...
$ maladie : chr [1:419098] "cholera" "cholera" "cholera" "cholera" ...
$ key : chr [1:419098] "bandalungwa" "bandalungwa" "bandalungwa" "bandalungwa" ...
5.6 Variable profile
Code
combined_profile <- profile_df (Combined_data)
knitr:: kable (combined_profile, caption = "Variable profile (Combined_data)" )
Variable profile (Combined_data)
hz_code
hz_code
character
3098
0.74
400
CD1000ZS01
adm1_pcode
adm1_pcode
character
3098
0.74
26
CD10
population
population
numeric
3098
0.74
400
248790
cases_pop
cases_pop
numeric
3098
0.74
7536
0
prov
prov
character
0
0.00
28
kinshasa
hz
hz
character
0
0.00
428
bandalungwa
year
year
numeric
0
0.00
20
2006
numsem
numsem
numeric
0
0.00
53
1
cases
cases
numeric
1
0.00
308
0
deaths
deaths
numeric
1
0.00
23
0
maladie
maladie
character
0
0.00
1
cholera
key
key
character
0
0.00
428
bandalungwa
date
date
Date
NA
NA
1
NA
5.7 Numeric variables
Code
num_vars_cb <- names (Combined_data)[sapply (Combined_data, is.numeric)]
if (length (num_vars_cb) > 0 ) {
num_summary_cb <- data.frame (
variable = num_vars_cb,
mean = sapply (Combined_data[num_vars_cb], function (x) round (mean (x, na.rm = TRUE ), 3 )),
sd = sapply (Combined_data[num_vars_cb], function (x) round (sd (x, na.rm = TRUE ), 3 )),
min = sapply (Combined_data[num_vars_cb], function (x) round (min (x, na.rm = TRUE ), 3 )),
p25 = sapply (Combined_data[num_vars_cb], function (x) round (quantile (x, 0.25 , na.rm = TRUE ), 3 )),
p50 = sapply (Combined_data[num_vars_cb], function (x) round (median (x, na.rm = TRUE ), 3 )),
p75 = sapply (Combined_data[num_vars_cb], function (x) round (quantile (x, 0.75 , na.rm = TRUE ), 3 )),
max = sapply (Combined_data[num_vars_cb], function (x) round (max (x, na.rm = TRUE ), 3 ))
)
knitr:: kable (num_summary_cb, caption = "Numeric summary (Combined_data)" )
} else {
cat ("No numeric variables detected. \n " )
}
Numeric summary (Combined_data)
year
year
2015.505
5.772
2006
2011
2016.0
2021.0
2025.000
numsem
numsem
26.509
15.013
1
14
27.0
40.0
53.000
cases
cases
1.387
10.177
0
0
0.0
0.0
1177.000
deaths
deaths
0.027
0.349
0
0
0.0
0.0
98.000
population
population
218964.030
113085.235
10393
141247
187930.5
285597.8
744355.000
cases_pop
cases_pop
0.000
0.000
0
0
0.0
0.0
0.004
Code
if (length (num_vars_cb) > 0 ) {
plot_vars_cb <- head (num_vars_cb, 6 )
old_par <- par (no.readonly = TRUE )
on.exit (par (old_par), add = TRUE )
par (mfrow = c (2 , 3 ))
for (v in plot_vars_cb) {
hist (Combined_data[[v]],
main = v, xlab = v, col = "steelblue" , border = "white"
)
}
}
5.8 Categorical variables
Code
cat_vars_cb <- names (Combined_data)[sapply (Combined_data, is.character)]
cat ("Categorical variables:" , length (cat_vars_cb), " \n\n " )
Code
for (v in cat_vars_cb) {
n_unique <- n_distinct (Combined_data[[v]], na.rm = TRUE )
n_na <- sum (is.na (Combined_data[[v]]))
cat (sprintf ("--- %s (%d unique, %d NA) --- \n " , v, n_unique, n_na))
tbl <- sort (table (Combined_data[[v]], useNA = "ifany" ), decreasing = TRUE )
print (head (tbl, 15 ))
if (length (tbl) > 15 ) cat (" ... and" , length (tbl) - 15 , "more levels \n " )
cat (" \n " )
}
--- hz_code (400 unique, 3098 NA) ---
<NA> CD1000ZS01 CD1000ZS02 CD1000ZS05 CD1000ZS06 CD1000ZS07 CD1000ZS08
3098 1040 1040 1040 1040 1040 1040
CD1000ZS09 CD1000ZS10 CD1000ZS11 CD1000ZS12 CD1000ZS13 CD1000ZS14 CD1000ZS15
1040 1040 1040 1040 1040 1040 1040
CD1000ZS16
1040
... and 386 more levels
--- prov (28 unique, 0 NA) ---
sud_kivu ituri kinshasa nord_kivu haut_katanga
35843 33303 33045 31208 27214
kongo_central tshopo equateur maniema kasai_oriental
26067 21943 19021 18722 16762
kasai haut_lomami sankuru kwilu tanganyika
16647 16168 14903 12480 11450
... and 13 more levels
--- adm1_pcode (26 unique, 3098 NA) ---
CD62 CD54 CD10 CD61 CD71 CD20 CD51 CD41 CD63 CD82 CD92 CD73 CD83
35360 33280 32240 31200 27040 26000 21840 18720 18720 16640 16640 15600 14560
CD32 CD44
12480 11440
... and 12 more levels
--- hz (428 unique, 0 NA) ---
bukama fizi kabare kadutu kalemie katana
1042 1042 1042 1042 1042 1042
kirotshe masisi minova miti_murhesa moba mutwanga
1042 1042 1042 1042 1042 1042
mweso nundu nyemba
1042 1042 1042
... and 413 more levels
--- maladie (1 unique, 0 NA) ---
cholera
419098
--- key (428 unique, 0 NA) ---
bukama fizi kabare kadutu kalemie katana
1042 1042 1042 1042 1042 1042
kirotshe masisi minova miti_murhesa moba mutwanga
1042 1042 1042 1042 1042 1042
mweso nundu nyemba
1042 1042 1042
... and 413 more levels
Code
if (length (cat_vars_cb) > 0 ) {
plot_list_cb <- list ()
for (v in cat_vars_cb) {
freq <- as.data.frame (table (Combined_data[[v]], useNA = "ifany" ),
stringsAsFactors = FALSE
)
names (freq) <- c ("category" , "count" )
freq$ category[is.na (freq$ category)] <- "(NA)"
if (nrow (freq) > 15 ) {
freq <- freq[order (- freq$ count), ]
top <- freq[1 : 15 , ]
other <- data.frame (
category = "Other" ,
count = sum (freq$ count[16 : nrow (freq)])
)
freq <- rbind (top, other)
}
freq$ fraction <- freq$ count / sum (freq$ count)
freq$ ymax <- cumsum (freq$ fraction)
freq$ ymin <- c (0 , head (freq$ ymax, - 1 ))
freq$ label_pos <- (freq$ ymax + freq$ ymin) / 2
freq$ label <- paste0 (
freq$ category, " \n (n=" ,
formatC (freq$ count, format = "d" , big.mark = "," ), ")"
)
p <- ggplot (freq, aes (
ymax = ymax, ymin = ymin, xmax = 4 , xmin = 2.5 ,
fill = category
)) +
geom_rect () +
geom_text (aes (x = 3.25 , y = label_pos, label = label), size = 2.5 ) +
coord_polar (theta = "y" ) +
xlim (c (1 , 4 )) +
theme_void () +
theme (legend.position = "none" ) +
labs (title = v)
plot_list_cb[[v]] <- p
}
if (requireNamespace ("patchwork" , quietly = TRUE )) {
wrap_plots (plot_list_cb, ncol = min (length (plot_list_cb), 3 ))
} else {
for (p in plot_list_cb) print (p)
}
}
5.9 Date and temporal overview
Code
Code
cat (" Class:" , paste (class (Combined_data$ date), collapse = ", " ), " \n " )
Code
cat (
" Range:" , as.character (min (Combined_data$ date, na.rm = TRUE )),
"to" , as.character (max (Combined_data$ date, na.rm = TRUE )), " \n "
)
Range: 2005-12-26 to 2025-12-30
Code
cat (" NAs:" , sum (is.na (Combined_data$ date)), "of" , nrow (Combined_data), " \n\n " )
Code
Code
print (table (Combined_data$ year))
2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018
21056 21106 20867 20873 20863 20885 20936 20863 20901 21008 20900 20969 21025
2019 2020 2021 2022 2023 2024 2025
21003 20913 20855 20891 21009 20979 21196
Code
cat (" \n Epi week column (numsem): \n " )
Epi week column (numsem):
Code
cat (
" Range:" , min (Combined_data$ numsem, na.rm = TRUE ),
"to" , max (Combined_data$ numsem, na.rm = TRUE ), " \n "
)
Code
cat (" Unique weeks:" , n_distinct (Combined_data$ numsem), " \n " )
6. Descriptive plots
6.1 National epidemic curve (weekly)
Code
national_weekly <- Combined_data %>%
group_by (year, numsem) %>%
summarise (
cases = sum (cases, na.rm = TRUE ),
deaths = sum (deaths, na.rm = TRUE ),
.groups = "drop"
) %>%
arrange (year, numsem) %>%
mutate (week_index = row_number ())
ggplot (national_weekly, aes (x = week_index, y = cases)) +
geom_col (fill = "#2c7fb8" , width = 1 ) +
scale_x_continuous (
breaks = national_weekly %>%
filter (numsem == 1 ) %>%
pull (week_index),
labels = national_weekly %>%
filter (numsem == 1 ) %>%
pull (year)
) +
scale_y_continuous (labels = scales:: comma) +
labs (
x = "Year" , y = "Reported cases" ,
title = "National cholera epidemic curve (weekly)"
) +
theme_minimal () +
theme (axis.text.x = element_text (angle = 45 , hjust = 1 ))
6.2 National epidemic curve — cases and deaths
Code
ggplot (national_weekly, aes (x = week_index)) +
geom_col (aes (y = cases), fill = "#2c7fb8" , width = 1 , alpha = 0.8 ) +
geom_col (aes (y = deaths), fill = "#d95f02" , width = 1 , alpha = 0.9 ) +
scale_x_continuous (
breaks = national_weekly %>%
filter (numsem == 1 ) %>%
pull (week_index),
labels = national_weekly %>%
filter (numsem == 1 ) %>%
pull (year)
) +
scale_y_continuous (labels = scales:: comma) +
labs (
x = "Year" , y = "Count" ,
title = "National cholera — weekly cases (blue) and deaths (orange)"
) +
theme_minimal () +
theme (axis.text.x = element_text (angle = 45 , hjust = 1 ))
6.3 Annual epidemic curves (faceted by year)
Code
annual_weekly <- Combined_data %>%
group_by (year, numsem) %>%
summarise (cases = sum (cases, na.rm = TRUE ), .groups = "drop" )
ggplot (annual_weekly, aes (x = numsem, y = cases)) +
geom_col (fill = "#2c7fb8" , width = 1 ) +
facet_wrap (~ year, ncol = 3 , scales = "free_y" ) +
scale_y_continuous (labels = scales:: comma) +
labs (
x = "Epidemiological week" , y = "Reported cases" ,
title = "Cholera epidemic curve by year"
) +
theme_minimal () +
theme (strip.text = element_text (face = "bold" ))
6.4 Annual case and death totals
Code
annual_totals <- Combined_data %>%
group_by (year) %>%
summarise (
cases = sum (cases, na.rm = TRUE ),
deaths = sum (deaths, na.rm = TRUE ),
.groups = "drop"
) %>%
tidyr:: pivot_longer (
cols = c (cases, deaths), names_to = "measure" ,
values_to = "count"
)
ggplot (annual_totals, aes (x = factor (year), y = count, fill = measure)) +
geom_col (position = "dodge" ) +
scale_fill_manual (values = c (cases = "#2c7fb8" , deaths = "#d95f02" )) +
scale_y_continuous (labels = scales:: comma) +
labs (
x = "Year" , y = "Count" , fill = NULL ,
title = "Annual cholera cases and deaths"
) +
theme_minimal () +
theme (axis.text.x = element_text (angle = 45 , hjust = 1 ))
6.5 Cases by province (total)
Code
prov_cases <- Combined_data %>%
filter (! is.na (prov)) %>%
group_by (prov) %>%
summarise (
cases = sum (cases, na.rm = TRUE ),
deaths = sum (deaths, na.rm = TRUE ),
.groups = "drop"
) %>%
arrange (desc (cases))
ggplot (prov_cases, aes (x = reorder (prov, cases), y = cases)) +
geom_col (fill = "#2c7fb8" ) +
coord_flip () +
scale_y_continuous (labels = scales:: comma) +
labs (
x = NULL , y = "Total reported cases" ,
title = "Total cholera cases by province (all years)"
) +
theme_minimal ()
6.6 Deaths by province (total)
Code
ggplot (prov_cases, aes (x = reorder (prov, deaths), y = deaths)) +
geom_col (fill = "#d95f02" ) +
coord_flip () +
scale_y_continuous (labels = scales:: comma) +
labs (
x = NULL , y = "Total reported deaths" ,
title = "Total cholera deaths by province (all years)"
) +
theme_minimal ()
6.7 Top 20 health zones by case count
Code
hz_cases <- Combined_data %>%
filter (! is.na (hz)) %>%
group_by (hz) %>%
summarise (cases = sum (cases, na.rm = TRUE ), .groups = "drop" ) %>%
arrange (desc (cases)) %>%
slice_head (n = 20 )
ggplot (hz_cases, aes (x = reorder (hz, cases), y = cases)) +
geom_col (fill = "#2c7fb8" ) +
coord_flip () +
scale_y_continuous (labels = scales:: comma) +
labs (
x = NULL , y = "Total reported cases" ,
title = "Top 20 health zones by cholera cases (all years)"
) +
theme_minimal ()
6.8 Province-level epidemic curves (faceted)
Code
prov_annual <- Combined_data %>%
filter (! is.na (prov)) %>%
group_by (prov, year) %>%
summarise (cases = sum (cases, na.rm = TRUE ), .groups = "drop" )
ggplot (prov_annual, aes (x = year, y = cases)) +
geom_col (fill = "#2c7fb8" ) +
facet_wrap (~ prov, ncol = 4 , scales = "free_y" ) +
scale_y_continuous (labels = scales:: comma) +
labs (
x = "Year" , y = "Cases" ,
title = "Cholera cases by province and year"
) +
theme_minimal () +
theme (
strip.text = element_text (face = "bold" , size = 7 ),
axis.text.x = element_text (angle = 45 , hjust = 1 , size = 6 )
)
6.9 Case fatality proportion by province
Code
cfr_prov <- Combined_data %>%
filter (! is.na (prov)) %>%
group_by (prov) %>%
summarise (
cases = sum (cases, na.rm = TRUE ),
deaths = sum (deaths, na.rm = TRUE ),
.groups = "drop"
) %>%
filter (cases > 0 ) %>%
mutate (cfr = deaths / cases * 100 )
ggplot (cfr_prov, aes (x = reorder (prov, cfr), y = cfr)) +
geom_col (fill = "#7570b3" ) +
coord_flip () +
labs (
x = NULL , y = "Case fatality proportion (%)" ,
title = "Cholera case fatality proportion by province (all years)"
) +
theme_minimal ()
6.10 Seasonality — average cases by epidemiological week
Code
seasonality <- Combined_data %>%
group_by (numsem) %>%
summarise (
mean_cases = mean (cases, na.rm = TRUE ),
median_cases = median (cases, na.rm = TRUE ),
.groups = "drop"
)
ggplot (seasonality, aes (x = numsem)) +
geom_line (aes (y = mean_cases), colour = "#2c7fb8" , linewidth = 0.8 ) +
geom_line (aes (y = median_cases),
colour = "#d95f02" , linewidth = 0.8 ,
linetype = "dashed"
) +
labs (
x = "Epidemiological week" , y = "Cases per HZ-week" ,
title = "Cholera seasonality — mean (blue) and median (orange) cases by epi week"
) +
theme_minimal ()