Data Pack Familiarisation and Initial Cleaning (clean_data_20260225 and idsr_target_clean)

Author

acp25

Published

April 9, 2026

Purpose

This report performs early-stage exploration for:

  1. clean_data_20260225* (close to raw, includes unmatched health zones)
  2. hz_naming_convention.csv (reference naming/mapping file)

Goals:

  • confirm file provenance and loadability
  • understand data structure and variables
  • assess missingness and data quality
  • propose practical cleaning next steps

Setup and File Discovery

Summary:

Code
knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)

data_dir <- "/rds/general/user/acp25/home/MIMIC/Clean_data/data"

# helper: null coalesce
`%||%` <- function(x, y) if (is.null(x) || length(x) == 0 || (length(x) == 1 && is.na(x))) y else x

# helper: detect delimiter from first line
detect_delim <- function(path) {
    first_line <- readLines(path, n = 1, warn = FALSE)
    if (length(first_line) == 0) {
        return(",")
    }
    counts <- c(
        comma = lengths(regmatches(first_line, gregexpr(",", first_line, fixed = TRUE))),
        tab = lengths(regmatches(first_line, gregexpr("\t", first_line, fixed = TRUE))),
        pipe = lengths(regmatches(first_line, gregexpr("|", first_line, fixed = TRUE))),
        semicolon = lengths(regmatches(first_line, gregexpr(";", first_line, fixed = TRUE)))
    )
    names(counts)[which.max(counts)] |>
        switch(comma = ",",
            tab = "\t",
            pipe = "|",
            semicolon = ";",
            ","
        )
}

# helper: robust reader
read_table_safely <- function(path) {
    delim <- detect_delim(path)
    if (requireNamespace("readr", quietly = TRUE)) {
        readr::read_delim(
            file = path, delim = delim, show_col_types = FALSE,
            guess_max = 100000, progress = FALSE
        )
    } else {
        utils::read.csv(path, sep = delim, stringsAsFactors = FALSE, check.names = FALSE)
    }
}

all_files <- list.files(data_dir, full.names = TRUE)
basename(all_files)
[1] "clean_data_20260225.csv"                                   
[2] "clean_df_stage2.csv"                                       
[3] "Combined_data.csv"                                         
[4] "hz_naming_convention.csv"                                  
[5] "idsr_clean.csv"                                            
[6] "idsr_target_clean.csv"                                     
[7] "Kinshasa_DRC_Cholera_Anticipatory_Action_Tracking (1).xlsx"
Code
# locate target files
clean_candidates <- list.files(
    data_dir,
    pattern = "^clean_data_20260225(\\..+)?$",
    full.names = TRUE
)

hz_candidates <- list.files(
    data_dir,
    pattern = "^hz_naming_convention\\.csv$",
    full.names = TRUE,
    ignore.case = TRUE
)

if (length(clean_candidates) == 0) {
    stop("No file matching '^clean_data_20260225(\\\\..+)?$' found in data_dir.")
}
if (length(hz_candidates) == 0) {
    stop("No file matching 'hz_naming_convention.csv' found in data_dir.")
}

clean_path <- clean_candidates[1]
hz_path <- hz_candidates[1]

cat("Selected raw-like dataset:", clean_path, "\n")
Selected raw-like dataset: /rds/general/user/acp25/home/MIMIC/Clean_data/data/clean_data_20260225.csv 
Code
cat("Selected HZ naming file:", hz_path, "\n")
Selected HZ naming file: /rds/general/user/acp25/home/MIMIC/Clean_data/data/hz_naming_convention.csv 
Code
clean_df <- read_table_safely(clean_path)
hz_df <- read_table_safely(hz_path)

cat("clean_data_20260225 rows:", nrow(clean_df), " cols:", ncol(clean_df), "\n")
clean_data_20260225 rows: 46073  cols: 9 
Code
cat("hz_naming_convention rows:", nrow(hz_df), " cols:", ncol(hz_df), "\n")
hz_naming_convention rows: 633  cols: 3 

Cleaning and saving for all future analyses

The DEBUTSEM column contains three inconsistent date formats:

  • M/D/YYYY (majority) — e.g. 3/13/2023, 12/3/2012
  • D-Mon-YY (minority, older data) — e.g. 26-Jul-10, 4-Feb-08
  • YYYY-MM-DD (ISO format) — e.g. 2021-06-12, 2025-04-28

We standardise these into a single Date column and store the result in clean_data, which is used for all subsequent analysis.

Step 1: Parse all date formats

Code
library(dplyr)
library(lubridate)

clean_data <- clean_df %>%
    mutate(
        DEBUTSEM = case_when(
            # D-Mon-YY format (e.g. 26-Jul-10)
            grepl("^\\d{1,2}-[A-Za-z]{3}-\\d{2}$", DEBUTSEM) ~
                format(dmy(DEBUTSEM), "%m/%d/%Y"),
            # YYYY-MM-DD ISO format (e.g. 2021-06-12)
            grepl("^\\d{4}-\\d{2}-\\d{2}$", DEBUTSEM) ~
                format(ymd(DEBUTSEM), "%m/%d/%Y"),
            TRUE ~ DEBUTSEM
        ),
        DEBUTSEM = mdy(DEBUTSEM)
    )

cat("DEBUTSEM class:", class(clean_data$DEBUTSEM), "\n")
DEBUTSEM class: Date 
Code
cat(
    "Date range:", as.character(min(clean_data$DEBUTSEM, na.rm = TRUE)),
    "to", as.character(max(clean_data$DEBUTSEM, na.rm = TRUE)), "\n"
)
Date range: 1989-06-27 to 2102-06-06 
Code
cat("NAs after parsing:", sum(is.na(clean_data$DEBUTSEM)), "of", nrow(clean_data), "\n")
NAs after parsing: 49 of 46073 

Step 2: Remove rows with impossible dates

Ten rows contain dates that are clearly data-entry errors — either in the far future (up to 2102) or before the surveillance period (1989). These are removed because they cannot correspond to real epidemiological observations and would distort any time-based analysis.

Code
n_before <- nrow(clean_data)

clean_data <- clean_data %>%
    filter(
        is.na(DEBUTSEM) |
            (DEBUTSEM >= as.Date("2000-01-01") & DEBUTSEM <= Sys.Date())
    )

n_removed <- n_before - nrow(clean_data)
cat("Removed", n_removed, "rows with impossible dates (before 2000 or in the future).\n")
Removed 10 rows with impossible dates (before 2000 or in the future).
Code
cat("Rows remaining:", nrow(clean_data), "\n")
Rows remaining: 46063 

Step 3: Year-mismatch audit

Some rows have a mismatch between the year extracted from DEBUTSEM and the YEAR column. Most of these are benign: epidemiological week 1 or 52/53 can straddle a year boundary (e.g. 2007-12-31 assigned to YEAR = 2008, week 1). A smaller number are genuine errors where the date and year disagree by more than one calendar year or appear in mid-year weeks. Genuine errors are removed.

Code
clean_data <- clean_data %>%
    mutate(date_year = year(DEBUTSEM))

mismatch <- clean_data %>%
    filter(!is.na(DEBUTSEM) & date_year != YEAR) %>%
    mutate(
        year_diff = abs(date_year - YEAR),
        is_straddling = year_diff == 1 & NUMSEM %in% c(1, 52, 53)
    )

n_straddling <- sum(mismatch$is_straddling)
n_genuine <- sum(!mismatch$is_straddling)

cat("Total year mismatches:", nrow(mismatch), "\n")
Total year mismatches: 741 
Code
cat("  Straddling year boundary (benign):", n_straddling, "\n")
  Straddling year boundary (benign): 626 
Code
cat("  Genuine date/year errors:", n_genuine, "\n")
  Genuine date/year errors: 115 
Code
if (n_genuine > 0) {
    cat("\nGenuine error rows (to be removed):\n")
    mismatch %>%
        filter(!is_straddling) %>%
        select(DEBUTSEM, YEAR, date_year, NUMSEM, PROV, HZ) %>%
        print(n = 20)
}

Genuine error rows (to be removed):
# A tibble: 115 × 6
   DEBUTSEM    YEAR date_year NUMSEM PROV          HZ       
   <date>     <dbl>     <dbl>  <dbl> <chr>         <chr>    
 1 2015-06-05  2017      2015     23 KONGO-CENTRAL KIMPESE  
 2 2015-06-05  2017      2015     23 NORD-KIVU     BINZA    
 3 2019-01-27  2020      2019      5 TANGANYIKA    KALEMIE  
 4 2015-06-05  2017      2015     23 HAUT-LOMAMI   KINKONDJA
 5 2015-06-05  2017      2015     23 ITURI         NYARAMBE 
 6 2019-01-13  2020      2019      3 TANGANYIKA    KALEMIE  
 7 2019-01-20  2020      2019      4 TANGANYIKA    KALEMIE  
 8 2015-06-05  2017      2015     23 TANGANYIKA    KALEMIE  
 9 2024-01-13  2025      2024      3 LUALABA       FUNGURUME
10 2019-03-02  2020      2019      6 TANGANYIKA    KALEMIE  
11 2019-12-09  2016      2019     37 TANGANYIKA    KALEMIE  
12 2015-06-05  2017      2015     23 NORD-KIVU     GOMA     
13 2019-01-13  2020      2019      3 TANGANYIKA    MOBA     
14 2024-01-06  2025      2024      2 LUALABA       FUNGURUME
15 2024-01-27  2025      2024      5 LUALABA       FUNGURUME
16 2015-06-05  2017      2015     23 KINSHASA      KOKOLO   
17 2015-06-05  2017      2015     23 NORD-KIVU     RUTSHURU 
18 2015-06-05  2017      2015     23 TANGANYIKA    NYEMBA   
19 2019-01-20  2020      2019      4 TANGANYIKA    MOBA     
20 2019-01-20  2020      2019      4 TANGANYIKA    NYEMBA   
# ℹ 95 more rows
Code
# remove genuine year-mismatch errors using row indices
n_before_mismatch <- nrow(clean_data)

clean_data <- clean_data %>%
    mutate(.row_id = row_number()) %>%
    filter(!.row_id %in% (
        clean_data %>%
            mutate(.row_id = row_number()) %>%
            filter(!is.na(DEBUTSEM) & date_year != YEAR) %>%
            mutate(
                year_diff = abs(date_year - YEAR),
                is_straddling = year_diff == 1 & NUMSEM %in% c(1, 52, 53)
            ) %>%
            filter(!is_straddling) %>%
            pull(.row_id)
    )) %>%
    select(-.row_id)

cat("Removed", n_before_mismatch - nrow(clean_data), "genuine year-mismatch rows.\n")
Removed 115 genuine year-mismatch rows.
Code
cat("Rows remaining:", nrow(clean_data), "\n")
Rows remaining: 45948 
Code
# drop the helper column
clean_data <- clean_data %>% select(-date_year)

Note on year mismatches: Of the 741 rows where the parsed date year differs from YEAR, 626 are benign week-1/52/53 boundary cases and 115 are genuine errors (date and year disagree by > 1 year or occur in mid-year weeks). The genuine errors have been removed.

Step 4: Epidemiological week completeness

Incomplete years:

  • 2017: missing week 52
  • 2020: missing weeks 48–52 (5 weeks missing at year end)

All other years (2006–2016, 2018–2019, 2021–2025) have complete weeks 1–52.

Step 5: Column-name standardisation and whitespace trimming

Code
clean_names_basic <- function(nm) {
    nm <- trimws(nm)
    nm <- tolower(nm)
    nm <- gsub("[^a-z0-9]+", "_", nm)
    nm <- gsub("^_|_$", "", nm)
    make.unique(nm, sep = "_")
}

clean_df_stage1 <- clean_data
names(clean_df_stage1) <- clean_names_basic(names(clean_df_stage1))

# normalise character column values: trim, lowercase, replace special chars with _
char_cols <- names(clean_df_stage1)[sapply(clean_df_stage1, is.character)]
for (v in char_cols) {
    clean_df_stage1[[v]] <- trimws(clean_df_stage1[[v]])
    clean_df_stage1[[v]] <- tolower(clean_df_stage1[[v]])
    clean_df_stage1[[v]] <- gsub("[^a-z0-9]+", "_", clean_df_stage1[[v]])
    clean_df_stage1[[v]] <- gsub("^_|_$", "", clean_df_stage1[[v]])
    clean_df_stage1[[v]][clean_df_stage1[[v]] == ""] <- NA
}

cat("Stage-1 cleaned object created: clean_df_stage1\n")
Stage-1 cleaned object created: clean_df_stage1
Code
cat("Column names + character values standardised to lower_snake_case.\n")
Column names + character values standardised to lower_snake_case.

1. Variable Inventory

1.1 Variable Name Descriptions

Variable Description
PROV Province name (character). Values include NORD-KIVU, SUD-KIVU, etc.
NUMSEM Epidemiological week number (numeric), from 1 to 53.
DEBUTSEM Week start date (Date, after cleaning). Range spans 2008–2025.
TOTALCAS Total reported cases for the week (numeric), from 0 upward.
TOTALDECES Total reported deaths for the week (numeric), from 0 upward.
YEAR Calendar year (numeric), from 2008 to 2025.
MALADIE Disease name (character), e.g. CHOLERA.
HZ Health zone name (character), raw name before standardisation.
key Health zone key identifier (character), used for joining with hz_naming_convention.

1.2 Variable Profile

Code
profile_df <- function(df, max_example_chars = 40) {
    n <- nrow(df)
    out <- data.frame(
        variable = names(df),
        class = sapply(df, function(x) paste(class(x), collapse = ", ")),
        n_missing = sapply(df, function(x) sum(is.na(x) | x == "")),
        n_unique = sapply(df, function(x) length(unique(x[!is.na(x) & x != ""]))),
        example_value = sapply(df, function(x) {
            vals <- unique(x[!is.na(x) & x != ""])
            if (length(vals) == 0) {
                return(NA_character_)
            }
            ex <- as.character(vals[1])
            substr(ex, 1, max_example_chars)
        }),
        stringsAsFactors = FALSE
    )
    out$pct_missing <- round(100 * out$n_missing / max(n, 1), 2)
    out[order(-out$pct_missing), c("variable", "class", "n_missing", "pct_missing", "n_unique", "example_value")]
}

clean_profile <- profile_df(clean_df_stage1)
hz_profile <- profile_df(hz_df)

knitr::kable(head(clean_profile, 40), caption = "All variables (clean_df_stage1) sorted by missingness")
All variables (clean_df_stage1) sorted by missingness
variable class n_missing pct_missing n_unique example_value
key key character 46 0.10 428 goma
totalcas totalcas numeric 13 0.03 308 1177
totaldeces totaldeces numeric 16 0.03 23 0
prov prov character 0 0.00 31 nord_kivu
numsem numsem numeric 0 0.00 53 11
year year numeric 0 0.00 20 2023
maladie maladie character 0 0.00 1 cholera
hz hz character 0 0.00 428 goma
debutsem debutsem Date NA NA 1 NA
Code
knitr::kable(hz_profile, caption = "Variables in hz_naming_convention")
Variables in hz_naming_convention
variable class n_missing pct_missing n_unique example_value
value value character 498 78.67 89 BAFWAGBOGBO
pcode pcode character 17 2.69 516 CD5307ZS01
key key character 0 0.00 632 ABA

2. Exploratory Data Analysis (EDA)

2.1 Numeric Variables

Code
num_vars <- names(clean_df_stage1)[sapply(clean_df_stage1, is.numeric)]

if (length(num_vars) > 0) {
    num_summary <- data.frame(
        variable = num_vars,
        mean = sapply(clean_df_stage1[num_vars], function(x) round(mean(x, na.rm = TRUE), 3)),
        sd = sapply(clean_df_stage1[num_vars], function(x) round(sd(x, na.rm = TRUE), 3)),
        min = sapply(clean_df_stage1[num_vars], function(x) round(min(x, na.rm = TRUE), 3)),
        p50 = sapply(clean_df_stage1[num_vars], function(x) round(stats::median(x, na.rm = TRUE), 3)),
        max = sapply(clean_df_stage1[num_vars], function(x) round(max(x, na.rm = TRUE), 3))
    )
    knitr::kable(head(num_summary, 30), caption = "Numeric summary (first 30 numeric variables)")
} else {
    cat("No numeric variables detected.\n")
}
Numeric summary (first 30 numeric variables)
variable mean sd min p50 max
numsem numsem 26.353 15.096 1 26 53
totalcas totalcas 12.628 28.325 0 4 1177
totaldeces totaldeces 0.246 1.028 0 0 98
year year 2016.125 6.234 2006 2016 2025
Code
num_vars <- names(clean_df_stage1)[sapply(clean_df_stage1, is.numeric)]

if (length(num_vars) > 0) {
    plot_vars <- head(num_vars, 6)
    old_par <- par(no.readonly = TRUE)
    on.exit(par(old_par), add = TRUE)
    par(mfrow = c(2, 3))

    for (v in plot_vars) {
        hist(clean_df_stage1[[v]],
            main = v, xlab = v, col = "steelblue", border = "white"
        )
    }
} else {
    plot.new()
    text(0.5, 0.5, "No numeric variables available")
}

Distributions of first up to 6 numeric variables

2.2 Categorical Variables

Code
cat_vars <- names(clean_df_stage1)[sapply(clean_df_stage1, function(x) is.character(x) || is.factor(x))]

if (length(cat_vars) > 0) {
    cat("Categorical variable count:", length(cat_vars), "\n")
    for (v in cat_vars) {
        cat("\n---", v, "---\n")
        print(sort(table(clean_df_stage1[[v]], useNA = "ifany"), decreasing = TRUE))
    }
} else {
    cat("No character/factor variables detected.\n")
}
Categorical variable count: 4 

--- prov ---

        sud_kivu         kinshasa        nord_kivu          katanga 
            8778             6471             6244             4812 
         sankuru       tanganyika     haut_katanga      haut_lomami 
            3022             2567             2376             1979 
        equateur        orientale   kasai_oriental           tshopo 
            1748             1266             1078              920 
         maniema    kongo_central          mongala       mai_ndombe 
             792              532              476              408 
           kasai          lualaba            ituri           lomami 
             400              400              391              372 
        bandundu        bas_congo            kwilu      nord_ubangi 
             245              215              200              178 
        bas_uele          tshuapa    kasai_central           kwango 
              21               17               16               11 
      sud_ubangi kasai_occidental        haut_uele 
               8                3                2 

--- maladie ---
cholera 
  45948 

--- hz ---

          kalemie            nyemba              fizi             uvira 
              969               945               943               928 
         kirotshe              goma            minova         karisimbi 
              919               918               847               744 
             moba            katana             nundu         kinkondja 
              703               697               647               575 
            mweso           malemba            bukama            ruzizi 
              566               561               555               551 
           kadutu          rutshuru            masisi    kabondo_dianda 
              530               529               498               452 
           kabare      miti_murhesa           mukanga          kansimba 
              438               434               423               403 
          butumba            ibanda            lemera            kabalo 
              393               390               390               344 
           limete        nyiragongo      bagira_kasha            kalehe 
              343               342               309               291 
          lusambo             kenya             idjwi           kongolo 
              274               272               271               269 
         kingabwa           mulongo          mutwanga           tchomia 
              268               268               257               257 
         kampemba          maluku_i            manono       bena_dibele 
              255               254               252               243 
    pania_mutombo            katuba             kilwa           kisanga 
              243               225               218               216 
            pinga            bolobo            ankoro             nsele 
              216               213               212               210 
            pweto         bunyakiri              kole            kikula 
              209               205               204               202 
          barumbu             gombe         fungurume        lubumbashi 
              198               198               197               197 
   mufunga_sampwe            kokolo         masina_ii         kingasani 
              195               192               192               190 
         lukolela          masina_i           makanza             yumbi 
              188               188               186               186 
          kisenso          kalamu_i       binza_meteo            lwamba 
              184               182               181               181 
           matete            nyunzu       binza_ozone             bumbu 
              181               180               179               178 
           ndjili            kikimi       ngiri_ngiri           wangata 
              177               175               175               175 
        kalamu_ii      katako_kombe         birambizo             ototo 
              174               173               172               172 
      tshudi_loto           tshumbe        vanga_kete       bandalungwa 
              172               172               172               171 
          dikungu       djalo_djeka             lodja            lomela 
              171               171               171               171 
            minga    mont_ngafula_i         omendjadi       wembo_nyama 
              171               171               171               171 
         selembao          kinshasa             ngaba          kintambo 
              170               169               169               167 
           makala          mushenge        kimbanseke          nyarambe 
              167               166               165               165 
  mont_ngafula_ii            biyela          lingwala         kasa_vubu 
              163               162               162               161 
            lemba             kindu          rwanguba     lulenge_kimbi 
              161               160               160               153 
         nyangezi         nyantende          mumbunda           lubunga 
              153               152               151               146 
         kapolowe           bolenge            angumu           bunkeya 
              143               142               140               136 
          bibanga         maluku_ii kalambayi_kabanga          alunguli 
              135               132               131               130 
           lukafu          mbandaka            katoyi            kyondo 
              129               129               128               127 
           likasi   lilanga_bobangi             bunia         kamalondo 
              123               120               119               118 
         kibirizi           mampoko           mulumba            isangi 
              118               118               118               116 
           rwashi          walikale             binza         lufungula 
              115               113               112               111 
           kafubu           kasenga  makiso_kisangani            moanda 
              109               108               106               104 
           yakusu              muya             kailo            lisala 
              104               102               101               101 
          kimpese            yahuma        ngandajika       tshamilemba 
               99                98                97                97 
            irebu           kipushi           bonzola            lubudi 
               94                93                90                89 
           mokala             gethy           kasansa            kitutu 
               87                85                85                84 
           manika           kiyambi          yamaluka         lukelenge 
               82                79                78                77 
            vangu            basoko             binga           kambove 
               76                75                75                75 
        tshilenge              jiba          kamituga             kibua 
               73                69                69                69 
         kwamouth             panda              pimu           dibindi 
               69                69                69                68 
            kunda           lualaba            mushie        yalimbongo 
               68                68                67                67 
            bambo              lowa            mahagi            bikoro 
               66                66                66                65 
           dekese            drodro          shabunda     boso_mondanda 
               65                64                63                62 
           matadi             bumba              logo            ubundu 
               62                61                59                59 
         bosondjo              lita             opala            dilala 
               58                58                57                56 
        kabambare      nsona_mpangu           bipemba           mbulula 
               55                55                54                53 
         kashobwe             kayna          kimpangu            kitona 
               52                52                52                52 
          sakania           citenge           mukumbi            police 
               52                51                51                51 
       tshishimbi          yahisuli        boma_bungu             linga 
               51                51                50                50 
      wanierukula             diulu             rethy            mikope 
               50                49                49                47 
        yamongili          ferekeni          bandundu     kimbi_lulenge 
               46                45                44                44 
     kwilu_ngongo             nioki              boma           lotumbe 
               44                44                43                42 
            ilebo             ipamu           kitenge         basankusu 
               41                41                41                40 
           bulape            lubutu          yabaondo              bili 
               40                40                40                39 
          bolomba           kasongo         lubilanji      gombe_matadi 
               39                39                39                38 
             nizi            djombo           itebero           bomongo 
               38                36                36                35 
          ingende              kowe             mwana            ntondo 
               35                35                35                35 
          kalonge           kansele             nzaba             iboko 
               34                34                34                33 
            miabi           mitwaba           monieka            fataki 
               32                32                32                31 
           nzanza            lubero         mubumbano           kampene 
               31                30                30                29 
         rwampara         alimbongo             rimba           yambuku 
               29                28                28                28 
          walungu            lukula           mpokolo             punia 
               27                25                25                25 
           tshopo            kamina           kaniola           kabondo 
               25                24                24                23 
          kibombo             pangi            bosobe           mangobo 
               23                23                22                22 
       saramabila               sia             aketi            bokoro 
               22                22                21                21 
  lolanga_mampoko    kilela_balanda           kamango          kanzenze 
               21                20                18                18 
          kisantu              lolo          mongbalu            kuimba 
               18                18                18                17 
          nia_nia             abuzi          bosobolo              bulu 
               17                15                15                15 
        gbadolite   kabeya_kamuanga             samba           businga 
               15                15                15                14 
           karawa            vuhovi           wapinda            wasolo 
               14                14                14                14 
           yakoma            yaleko              loko          masereka 
               14                14                13                13 
    mobayi_mbongo           mulungu             ndage            tshela 
               13                13                13                13 
             boga           cilundu             oshwe              boko 
               12                11                11                10 
      kanda_kanda             kangu           lingomo            mawuya 
               10                10                10                10 
         minembwe               adi            basali       boko_kivulu 
               10                 9                 9                 9 
          ariwara      banga_lubaka        boso_manzi           kabinda 
                8                 8                 8                 8 
          kabongo              kilo           butembo           kakenge 
                7                 7                 6                 6 
            katwa              kizu             laybo            mwenga 
                6                 6                 6                 6 
         nyakunde           obokote             oicha          tshikapa 
                6                 6                 6                 6 
            ikela           itombwe           komanda        mwene_ditu 
                5                 5                 5                 5 
            tembo           banalia            kalima           kamonia 
                5                 4                 4                 4 
          kaniama            luambo           lusangi           mangala 
                4                 4                 4                 4 
            mbaya         musienene       ntandembelo            befale 
                4                 4                 4                 3 
             beni       bongandanga             djuma            inongo 
                3                 3                 3                 3 
          kanzala            kaziba          kitangwa             luebo 
                3                 3                 3                 3 
          mandima              masa     mbanza_ngungu             monga 
                3                 3                 3                 3 
          monkoto          ngidinga             tunda              vaku 
                3                 3                 3                 3 
              aru            bagata            bobozo            boende 
                2                 2                 2                 2 
          bokungu     boma_mangbetu            doruma              inga 
                2                 2                 2                 2 
           kalole           kambala          kamwesha           kikongo 
                2                 2                 2                 2 
            kinda              kiwa        koshibanda             luozi 
                2                 2                 2                 2 
           luputa             mweka          niangara            nyanga 
                2                 2                 2                 2 
            songa              wema            aungba       bafwagbogbo 
                2                 2                 1                 1 
       bafwasende            bagira              baka           biringi 
                1                 1                 1                 1 
          bokonzi           budjala             damas            dibaya 
                1                 1                 1                 1 
            djolu           faradje            gemena             gungu 
                1                 1                 1                 1 
   hauts_plateaux            idiofa           kalenda           kalonda 
                1                 1                 1                 1 
      kalonda_est     kalonda_ouest           katende           kayamba 
                1                 1                 1                 1 
            kenge           kibunzi          kinkonzi           libenge 
                1                 1                 1                 1 
           likati           lulingu            makoro            makota 
                1                 1                 1                 1 
          mambasa          mangembo          mikalayi            moanza 
                1                 1                 1                 1 
         mondombe            mutena        mutshatsha     ndjoko_mpunda 
                1                 1                 1                 1 
          opienge             panzi            titule             vanga 
                1                 1                 1                 1 

--- key ---

          kalemie            nyemba              fizi             uvira 
              969               945               943               928 
         kirotshe              goma            minova         karisimbi 
              919               918               847               744 
             moba            katana             nundu         kinkondja 
              703               697               647               575 
            mweso           malemba            bukama            ruzizi 
              566               561               555               551 
           kadutu          rutshuru            masisi    kabondo_dianda 
              530               529               498               452 
           kabare      miti_murhesa           mukanga          kansimba 
              438               434               423               403 
          butumba            ibanda            lemera            kabalo 
              393               390               390               344 
           limete        nyiragongo      bagira_kasha            kalehe 
              343               342               309               291 
          lusambo             kenya             idjwi           kongolo 
              274               272               271               269 
         kingabwa           mulongo          mutwanga           tchomia 
              268               268               257               257 
         kampemba          maluku_i            manono       bena_dibele 
              255               254               252               243 
    pania_mutombo            katuba             kilwa           kisanga 
              243               225               218               216 
            pinga            bolobo            ankoro             nsele 
              216               213               212               210 
            pweto         bunyakiri              kole            kikula 
              209               205               204               202 
          barumbu             gombe         fungurume        lubumbashi 
              198               198               197               197 
   mufunga_sampwe            kokolo         masina_ii         kingasani 
              195               192               192               190 
         lukolela          masina_i           makanza             yumbi 
              188               188               186               186 
          kisenso          kalamu_i       binza_meteo            lwamba 
              184               182               181               181 
           matete            nyunzu       binza_ozone             bumbu 
              181               180               179               178 
           ndjili            kikimi       ngiri_ngiri           wangata 
              177               175               175               175 
        kalamu_ii         birambizo             ototo       tshudi_loto 
              174               172               172               172 
          tshumbe        vanga_kete       bandalungwa           dikungu 
              172               172               171               171 
      djalo_djeka             lodja            lomela             minga 
              171               171               171               171 
   mont_ngafula_i         omendjadi       wembo_nyama          selembao 
              171               171               171               170 
         kinshasa             ngaba          kintambo            makala 
              169               169               167               167 
         mushenge        kimbanseke          nyarambe   mont_ngafula_ii 
              166               165               165               163 
           biyela          lingwala         kasa_vubu             lemba 
              162               162               161               161 
            kindu          rwanguba     lulenge_kimbi          nyangezi 
              160               160               153               153 
        nyantende          mumbunda           lubunga          kapolowe 
              152               151               146               143 
          bolenge            angumu           bunkeya           bibanga 
              142               140               136               135 
        maluku_ii kalambayi_kabanga          alunguli            lukafu 
              132               131               130               129 
         mbandaka            katoyi      katako_kombe            kyondo 
              129               128               127               127 
           likasi   lilanga_bobangi             bunia         kamalondo 
              123               120               119               118 
         kibirizi           mampoko           mulumba            isangi 
              118               118               118               116 
           rwashi          walikale             binza         lufungula 
              115               113               112               111 
           kafubu           kasenga  makiso_kisangani            moanda 
              109               108               106               104 
           yakusu              muya             kailo            lisala 
              104               102               101               101 
          kimpese            yahuma        ngandajika       tshamilemba 
               99                98                97                97 
            irebu           kipushi           bonzola            lubudi 
               94                93                90                89 
           mokala             gethy           kasansa            kitutu 
               87                85                85                84 
           manika           kiyambi          yamaluka         lukelenge 
               82                79                78                77 
            vangu            basoko             binga           kambove 
               76                75                75                75 
        tshilenge              jiba          kamituga             kibua 
               73                69                69                69 
         kwamouth             panda              pimu           dibindi 
               69                69                69                68 
            kunda           lualaba            mushie        yalimbongo 
               68                68                67                67 
            bambo              lowa            mahagi            bikoro 
               66                66                66                65 
           dekese            drodro          shabunda     boso_mondanda 
               65                64                63                62 
           matadi             bumba              logo            ubundu 
               62                61                59                59 
         bosondjo              lita             opala            dilala 
               58                58                57                56 
        kabambare      nsona_mpangu           bipemba           mbulula 
               55                55                54                53 
         kashobwe             kayna          kimpangu            kitona 
               52                52                52                52 
          sakania           citenge           mukumbi            police 
               52                51                51                51 
       tshishimbi          yahisuli        boma_bungu             linga 
               51                51                50                50 
      wanierukula             diulu             rethy            mikope 
               50                49                49                47 
        yamongili              <NA>          ferekeni          bandundu 
               46                46                45                44 
    kimbi_lulenge      kwilu_ngongo             nioki              boma 
               44                44                44                43 
          lotumbe             ilebo             ipamu           kitenge 
               42                41                41                41 
        basankusu            bulape            lubutu          yabaondo 
               40                40                40                40 
             bili           bolomba           kasongo         lubilanji 
               39                39                39                39 
     gombe_matadi              nizi            djombo           itebero 
               38                38                36                36 
          bomongo           ingende              kowe             mwana 
               35                35                35                35 
           ntondo           kalonge           kansele             nzaba 
               35                34                34                34 
            iboko             miabi           mitwaba           monieka 
               33                32                32                32 
           fataki            nzanza            lubero         mubumbano 
               31                31                30                30 
          kampene          rwampara         alimbongo             rimba 
               29                29                28                28 
          yambuku           walungu            lukula           mpokolo 
               28                27                25                25 
            punia            tshopo            kamina           kaniola 
               25                25                24                24 
          kabondo           kibombo             pangi            bosobe 
               23                23                23                22 
          mangobo        saramabila               sia             aketi 
               22                22                22                21 
           bokoro   lolanga_mampoko    kilela_balanda           kamango 
               21                21                20                18 
         kanzenze           kisantu              lolo          mongbalu 
               18                18                18                18 
           kuimba           nia_nia             abuzi          bosobolo 
               17                17                15                15 
             bulu         gbadolite   kabeya_kamuanga             samba 
               15                15                15                15 
          businga            karawa            vuhovi           wapinda 
               14                14                14                14 
           wasolo            yakoma            yaleko              loko 
               14                14                14                13 
         masereka     mobayi_mbongo           mulungu             ndage 
               13                13                13                13 
           tshela              boga           cilundu             oshwe 
               13                12                11                11 
             boko       kanda_kanda             kangu           lingomo 
               10                10                10                10 
           mawuya          minembwe               adi            basali 
               10                10                 9                 9 
      boko_kivulu           ariwara      banga_lubaka        boso_manzi 
                9                 8                 8                 8 
          kabinda           kabongo              kilo           butembo 
                8                 7                 7                 6 
          kakenge             katwa              kizu             laybo 
                6                 6                 6                 6 
           mwenga          nyakunde           obokote             oicha 
                6                 6                 6                 6 
         tshikapa             ikela           itombwe           komanda 
                6                 5                 5                 5 
       mwene_ditu             tembo           banalia            kalima 
                5                 5                 4                 4 
          kamonia           kaniama            luambo           lusangi 
                4                 4                 4                 4 
          mangala             mbaya         musienene       ntandembelo 
                4                 4                 4                 4 
           befale              beni       bongandanga             djuma 
                3                 3                 3                 3 
           inongo           kanzala            kaziba          kitangwa 
                3                 3                 3                 3 
            luebo           mandima              masa     mbanza_ngungu 
                3                 3                 3                 3 
            monga           monkoto          ngidinga             tunda 
                3                 3                 3                 3 
             vaku               aru            bagata            bobozo 
                3                 2                 2                 2 
           boende           bokungu     boma_mangbetu            doruma 
                2                 2                 2                 2 
             inga            kalole           kambala          kamwesha 
                2                 2                 2                 2 
          kikongo             kinda              kiwa        koshibanda 
                2                 2                 2                 2 
            luozi            luputa             mweka          niangara 
                2                 2                 2                 2 
           nyanga             songa              wema            aungba 
                2                 2                 2                 1 
      bafwagbogbo        bafwasende            bagira              baka 
                1                 1                 1                 1 
          biringi           bokonzi           budjala             damas 
                1                 1                 1                 1 
           dibaya             djolu           faradje            gemena 
                1                 1                 1                 1 
            gungu    hauts_plateaux            idiofa           kalenda 
                1                 1                 1                 1 
          kalonda       kalonda_est     kalonda_ouest           katende 
                1                 1                 1                 1 
          kayamba             kenge           kibunzi          kinkonzi 
                1                 1                 1                 1 
          libenge            likati           lulingu            makoro 
                1                 1                 1                 1 
           makota           mambasa          mangembo          mikalayi 
                1                 1                 1                 1 
           moanza          mondombe            mutena        mutshatsha 
                1                 1                 1                 1 
    ndjoko_mpunda           opienge             panzi            titule 
                1                 1                 1                 1 
            vanga 
                1 
Code
library(ggplot2)

cat_vars <- names(clean_df_stage1)[sapply(clean_df_stage1, function(x) is.character(x) || is.factor(x))]

if (length(cat_vars) > 0) {
    plot_list <- list()

    for (v in cat_vars) {
        freq <- as.data.frame(table(clean_df_stage1[[v]], useNA = "ifany"),
            stringsAsFactors = FALSE
        )
        names(freq) <- c("category", "count")
        freq$category[is.na(freq$category)] <- "(NA)"

        # keep top 8 categories, lump the rest
        if (nrow(freq) > 15) {
            freq <- freq[order(-freq$count), ]
            top <- freq[1:15, ]
            other <- data.frame(
                category = "Other",
                count = sum(freq$count[16:nrow(freq)])
            )
            freq <- rbind(top, other)
        }

        freq$fraction <- freq$count / sum(freq$count)
        freq$ymax <- cumsum(freq$fraction)
        freq$ymin <- c(0, head(freq$ymax, -1))
        freq$label_pos <- (freq$ymax + freq$ymin) / 2
        freq$label <- paste0(
            freq$category, "\n(n=",
            formatC(freq$count, format = "d", big.mark = ","), ")"
        )

        p <- ggplot(freq, aes(
            ymax = ymax, ymin = ymin, xmax = 4, xmin = 2.5,
            fill = category
        )) +
            geom_rect() +
            geom_text(aes(x = 3.25, y = label_pos, label = label), size = 2.5) +
            coord_polar(theta = "y") +
            xlim(c(1, 4)) +
            theme_void() +
            theme(legend.position = "none") +
            labs(title = v)

        plot_list[[v]] <- p
    }

    if (requireNamespace("patchwork", quietly = TRUE)) {
        library(patchwork)
        wrap_plots(plot_list, ncol = min(length(plot_list), 3))
    } else {
        for (p in plot_list) print(p)
    }
} else {
    cat("No categorical variables to plot.\n")
}

Donut plots for categorical variables

Donut plots for categorical variables

Donut plots for categorical variables

Donut plots for categorical variables

3. Exploring data set B: idsr_target_clean.csv

This dataset contains weekly cholera surveillance data at the health-zone level, with population denominators and pre-computed incidence rates. It will be joined with clean_df_stage1 in a later step.

3.1 Load and inspect

Code
idsr_path <- file.path(data_dir, "idsr_target_clean.csv")
idsr_df <- read_table_safely(idsr_path)

cat("idsr_target_clean rows:", nrow(idsr_df), " cols:", ncol(idsr_df), "\n")
idsr_target_clean rows: 416004  cols: 11 
Code
cat("Column names:", paste(names(idsr_df), collapse = ", "), "\n")
Column names: hz_code, year, week, cases, deaths, adm1_fr, hz, date, population, adm1_pcode, cases_pop 
Code
str(idsr_df)
spc_tbl_ [416,004 × 11] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
 $ hz_code   : chr [1:416004] "CD6101ZS01" "CD6101ZS01" "CD6101ZS01" "CD6101ZS01" ...
 $ year      : num [1:416004] 2006 2006 2006 2006 2006 ...
 $ week      : num [1:416004] 1 2 3 4 5 6 7 8 9 10 ...
 $ cases     : num [1:416004] 56 14 22 21 32 31 46 103 48 54 ...
 $ deaths    : num [1:416004] 0 0 0 0 0 0 0 1 0 0 ...
 $ adm1_fr   : chr [1:416004] "nord_kivu" "nord_kivu" "nord_kivu" "nord_kivu" ...
 $ hz        : chr [1:416004] "goma" "goma" "goma" "goma" ...
 $ date      : Date[1:416004], format: "2006-01-02" "2006-01-09" ...
 $ population: num [1:416004] 296096 296096 296096 296096 296096 ...
 $ adm1_pcode: chr [1:416004] "CD61" "CD61" "CD61" "CD61" ...
 $ cases_pop : num [1:416004] 1.89e-04 4.73e-05 7.43e-05 7.09e-05 1.08e-04 ...
 - attr(*, "spec")=
  .. cols(
  ..   hz_code = col_character(),
  ..   year = col_double(),
  ..   week = col_double(),
  ..   cases = col_double(),
  ..   deaths = col_double(),
  ..   adm1_fr = col_character(),
  ..   hz = col_character(),
  ..   date = col_date(format = ""),
  ..   population = col_double(),
  ..   adm1_pcode = col_character(),
  ..   cases_pop = col_double()
  .. )
 - attr(*, "problems")=<externalptr> 

3.2 Variable descriptions

Variable Description
hz_code Health-zone geocode (character, e.g. CD6101ZS01). 400 unique values.
year Calendar year (numeric), 2006–2025 (20 years).
week Epidemiological week number (numeric), 1–52.
cases Reported cholera cases for the week (numeric), 0–1 177.
deaths Reported cholera deaths for the week (numeric), 0–98.
adm1_fr Province name in French, lower-snake-case (character). 26 unique provinces.
hz Health-zone name, lower-snake-case (character). 400 unique values.
date Week start date (Date class, ISO YYYY-MM-DD). Range 2006-01-02 to 2025-12-22.
population Health-zone population (numeric). Static per health zone (does not vary across years).
adm1_pcode Province geocode (character, e.g. CD61). 26 unique values.
cases_pop Pre-computed incidence rate = cases / population. Verified to match exactly.

3.3 Variable profile

Code
idsr_profile <- profile_df(idsr_df)
knitr::kable(idsr_profile, caption = "All variables (idsr_target_clean) sorted by missingness")
All variables (idsr_target_clean) sorted by missingness
variable class n_missing pct_missing n_unique example_value
hz_code hz_code character 0 0 400 CD6101ZS01
year year numeric 0 0 20 2006
week week numeric 0 0 52 1
cases cases numeric 0 0 308 56
deaths deaths numeric 0 0 23 0
adm1_fr adm1_fr character 0 0 26 nord_kivu
hz hz character 0 0 400 goma
population population numeric 0 0 400 296096
adm1_pcode adm1_pcode character 0 0 26 CD61
cases_pop cases_pop numeric 0 0 7537 0.000189127850426889
date date Date NA NA 1 NA

3.4 Missingness

Code
cat("Zero missing values across all", ncol(idsr_df), "columns.\n")
Zero missing values across all 11 columns.
Code
cat("Verification:\n")
Verification:
Code
sapply(idsr_df, function(x) sum(is.na(x)))
   hz_code       year       week      cases     deaths    adm1_fr         hz 
         0          0          0          0          0          0          0 
      date population adm1_pcode  cases_pop 
         0          0          0          0 

3.5 Date quality

Code
cat("Date class:", paste(class(idsr_df$date), collapse = ", "), "\n")
Date class: Date 
Code
cat("Date range:", as.character(min(idsr_df$date)), "to", as.character(max(idsr_df$date)), "\n")
Date range: 2006-01-02 to 2025-12-22 
Code
cat("Future dates (after today):", sum(idsr_df$date > Sys.Date()), "\n")
Future dates (after today): 0 
Code
cat("Dates before 2000:", sum(idsr_df$date < as.Date("2000-01-01")), "\n")
Dates before 2000: 0 

All dates are already parsed as Date class, in ISO format, with no impossible values.

3.6 Year/date consistency

Code
idsr_df <- idsr_df %>%
    mutate(date_year = year(date))

idsr_mismatch <- idsr_df %>%
    filter(date_year != year)

cat("Year mismatches (date year != year column):", nrow(idsr_mismatch), "\n")
Year mismatches (date year != year column): 3200 

All 3 200 mismatches are benign week-1 boundary cases (the week-start date falls in late December of the prior year while year is correctly assigned to the following epidemiological year).

Code
# verify: all mismatches are week 1, year differs by exactly 1
cat(
    "All mismatches are week 1:",
    all(idsr_mismatch$week == 1), "\n"
)
All mismatches are week 1: TRUE 
Code
cat(
    "All year diffs are exactly 1:",
    all(abs(idsr_mismatch$date_year - idsr_mismatch$year) == 1), "\n"
)
All year diffs are exactly 1: TRUE 
Code
idsr_df <- idsr_df %>% select(-date_year)

3.7 Epidemiological week completeness

Code
cat("Week range:", min(idsr_df$week), "to", max(idsr_df$week), "\n")
Week range: 1 to 52 
Code
cat("Unique weeks:", n_distinct(idsr_df$week), "\n\n")
Unique weeks: 52 
Code
# check completeness per year
idsr_week_summary <- idsr_df %>%
    group_by(year) %>%
    summarise(
        min_week = min(week),
        max_week = max(week),
        n_unique_weeks = n_distinct(week),
        .groups = "drop"
    )
knitr::kable(idsr_week_summary, caption = "Epi week coverage per year")
Epi week coverage per year
year min_week max_week n_unique_weeks
2006 1 52 52
2007 1 52 52
2008 1 52 52
2009 1 52 52
2010 1 52 52
2011 1 52 52
2012 1 52 52
2013 1 52 52
2014 1 52 52
2015 1 52 52
2016 1 52 52
2017 1 52 52
2018 1 52 52
2019 1 52 52
2020 1 52 52
2021 1 52 52
2022 1 52 52
2023 1 52 52
2024 1 52 52
2025 1 52 52

Every year has complete weeks 1–52. No year has week 53 (unlike clean_df_stage1 where 2009 and 2015 have week 53).

3.8 Rows per health zone

Code
rows_per_hz <- idsr_df %>% count(hz_code, name = "n_rows")
cat("Expected rows per HZ: 20 years * 52 weeks =", 20 * 52, "\n")
Expected rows per HZ: 20 years * 52 weeks = 1040 
Code
cat("HZs with exactly 1040 rows:", sum(rows_per_hz$n_rows == 1040), "of", nrow(rows_per_hz), "\n")
HZs with exactly 1040 rows: 399 of 400 
Code
outliers <- rows_per_hz %>% filter(n_rows != 1040)
if (nrow(outliers) > 0) {
    cat("\nHZs with unexpected row counts:\n")
    outliers <- outliers %>%
        left_join(idsr_df %>% distinct(hz_code, hz), by = "hz_code")
    print(outliers)
}

HZs with unexpected row counts:
# A tibble: 1 × 3
  hz_code    n_rows hz      
  <chr>       <int> <chr>   
1 CD9207ZS03   1044 mushenge

3.9 Duplicate rows

Code
dups <- idsr_df %>%
    group_by(hz_code, year, week) %>%
    filter(n() > 1) %>%
    ungroup()

cat("Duplicate (hz_code, year, week) rows:", nrow(dups), "\n\n")
Duplicate (hz_code, year, week) rows: 8 
Code
if (nrow(dups) > 0) {
    cat("These duplicates all belong to hz_code CD9207ZS03 (mushenge):\n")
    dups %>%
        select(hz_code, hz, year, week, cases, deaths, date) %>%
        arrange(year, week) %>%
        print(n = 20)
}
These duplicates all belong to hz_code CD9207ZS03 (mushenge):
# A tibble: 8 × 7
  hz_code    hz        year  week cases deaths date      
  <chr>      <chr>    <dbl> <dbl> <dbl>  <dbl> <date>    
1 CD9207ZS03 mushenge  2018    33    24      6 2018-08-13
2 CD9207ZS03 mushenge  2018    33     6      2 2018-08-13
3 CD9207ZS03 mushenge  2018    34    11      0 2018-08-20
4 CD9207ZS03 mushenge  2018    34     5      0 2018-08-20
5 CD9207ZS03 mushenge  2018    35    15      1 2018-08-27
6 CD9207ZS03 mushenge  2018    35    13      0 2018-08-27
7 CD9207ZS03 mushenge  2022    41     1      1 2022-10-10
8 CD9207ZS03 mushenge  2022    41     0      0 2022-10-10

There are 8 duplicate rows (4 duplicated week-keys), all in health zone mushenge (CD9207ZS03): weeks 33–35 of 2018 and week 41 of 2022. The duplicate pairs have different case/death counts, suggesting double reporting rather than exact duplicates.

3.10 Population

Code
cat(
    "Population is static per health zone (does not vary by year):",
    idsr_df %>%
        group_by(hz_code) %>%
        summarise(n_pop = n_distinct(population), .groups = "drop") %>%
        pull(n_pop) %>% max() == 1,
    "\n\n"
)
Population is static per health zone (does not vary by year): TRUE 
Code
pop_summary <- idsr_df %>%
    distinct(hz_code, hz, population) %>%
    arrange(population)

cat("Smallest HZ population:\n")
Smallest HZ population:
Code
print(head(pop_summary, 5))
# A tibble: 5 × 3
  hz_code    hz     population
  <chr>      <chr>       <dbl>
1 CD4502ZS02 wema        10393
2 CD7302ZS01 baka        38249
3 CD4103ZS01 irebu       41282
4 CD7101ZS06 kowe        43539
5 CD9101ZS01 bobozo      53057
Code
cat("\nLargest HZ population:\n")

Largest HZ population:
Code
print(tail(pop_summary, 5))
# A tibble: 5 × 3
  hz_code    hz        population
  <chr>      <chr>          <dbl>
1 CD7105ZS04 manika        563966
2 CD1000ZS06 bumbu         567311
3 CD1000ZS17 kisenso       592927
4 CD6101ZS02 karisimbi     723230
5 CD6103ZS01 katoyi        744355
Code
cat("\nPopulation range:", min(pop_summary$population), "to", max(pop_summary$population), "\n")

Population range: 10393 to 744355 

Note: Population is a single static value per health zone — it does not change across the 20-year span. This may be a limitation for long-term incidence calculations.

3.11 Cases and deaths

Code
cat("=== Cases ===\n")
=== Cases ===
Code
cat("Range:", min(idsr_df$cases), "to", max(idsr_df$cases), "\n")
Range: 0 to 1177 
Code
cat("Negative values:", sum(idsr_df$cases < 0), "\n")
Negative values: 0 
Code
cat(
    "Zero-case weeks:", sum(idsr_df$cases == 0),
    sprintf("(%.1f%%)", 100 * mean(idsr_df$cases == 0)), "\n"
)
Zero-case weeks: 380560 (91.5%) 
Code
cat("Mean:", round(mean(idsr_df$cases), 3), " Median:", median(idsr_df$cases), "\n")
Mean: 1.349  Median: 0 
Code
cat("\n=== Deaths ===\n")

=== Deaths ===
Code
cat("Range:", min(idsr_df$deaths), "to", max(idsr_df$deaths), "\n")
Range: 0 to 98 
Code
cat("Negative values:", sum(idsr_df$deaths < 0), "\n")
Negative values: 0 
Code
cat(
    "Zero-death weeks:", sum(idsr_df$deaths == 0),
    sprintf("(%.1f%%)", 100 * mean(idsr_df$deaths == 0)), "\n"
)
Zero-death weeks: 410417 (98.7%) 
Code
cat("Mean:", round(mean(idsr_df$deaths), 3), " Median:", median(idsr_df$deaths), "\n")
Mean: 0.026  Median: 0 
Code
cat("\n=== Deaths > Cases? ===\n")

=== Deaths > Cases? ===
Code
deaths_gt_cases <- idsr_df %>% filter(deaths > cases)
cat("Rows where deaths > cases:", nrow(deaths_gt_cases), "\n")
Rows where deaths > cases: 5 
Code
if (nrow(deaths_gt_cases) > 0) {
    print(head(deaths_gt_cases %>% select(hz_code, hz, year, week, cases, deaths), 10))
}
# A tibble: 5 × 6
  hz_code    hz              year  week cases deaths
  <chr>      <chr>          <dbl> <dbl> <dbl>  <dbl>
1 CD6111ZS05 rutshuru        2007    18     0      1
2 CD7305ZS01 kinkondja       2023    47     9     11
3 CD7306ZS03 kabondo_dianda  2016    24     0      9
4 CD6311ZS02 kunda           2025    31     0      5
5 CD6202ZS01 kabare          2022    48     0      1

3.12 Numeric distributions

Code
idsr_num_vars <- names(idsr_df)[sapply(idsr_df, is.numeric)]
idsr_num_summary <- data.frame(
    variable = idsr_num_vars,
    mean = sapply(idsr_df[idsr_num_vars], function(x) round(mean(x, na.rm = TRUE), 3)),
    sd = sapply(idsr_df[idsr_num_vars], function(x) round(sd(x, na.rm = TRUE), 3)),
    min = sapply(idsr_df[idsr_num_vars], function(x) round(min(x, na.rm = TRUE), 3)),
    p50 = sapply(idsr_df[idsr_num_vars], function(x) round(median(x, na.rm = TRUE), 3)),
    max = sapply(idsr_df[idsr_num_vars], function(x) round(max(x, na.rm = TRUE), 3))
)
knitr::kable(idsr_num_summary, caption = "Numeric summary (idsr_target_clean)")
Numeric summary (idsr_target_clean)
variable mean sd min p50 max
year year 2015.500 5.766 2006 2016 2025.000
week week 26.500 15.008 1 27 52.000
cases cases 1.349 10.122 0 0 1177.000
deaths deaths 0.026 0.342 0 0 98.000
population population 218963.584 113084.783 10393 187233 744355.000
cases_pop cases_pop 0.000 0.000 0 0 0.004
Code
old_par <- par(no.readonly = TRUE)
on.exit(par(old_par), add = TRUE)
par(mfrow = c(2, 3))

for (v in idsr_num_vars) {
    hist(idsr_df[[v]],
        main = v, xlab = v, col = "steelblue", border = "white"
    )
}

Distributions of numeric variables in idsr_target_clean

3.13 Province and health-zone naming vs clean_df_stage1

Code
idsr_hz_names <- sort(unique(idsr_df$hz))
clean_hz_names <- sort(unique(clean_df_stage1$hz))

cat("Unique HZs in idsr:", length(idsr_hz_names), "\n")
Unique HZs in idsr: 400 
Code
cat("Unique HZs in clean_df_stage1:", n_distinct(clean_hz_names), "\n")
Unique HZs in clean_df_stage1: 428 
Code
cat("In idsr but NOT in clean_df_stage1:", length(setdiff(idsr_hz_names, clean_hz_names)), "\n")
In idsr but NOT in clean_df_stage1: 0 
Code
cat("In clean_df_stage1 but NOT in idsr:", length(setdiff(clean_hz_names, idsr_hz_names)), "\n\n")
In clean_df_stage1 but NOT in idsr: 28 
Code
extra_in_clean <- setdiff(clean_hz_names, idsr_hz_names)
if (length(extra_in_clean) > 0) {
    cat("HZ names in clean_df_stage1 but missing from idsr:\n")
    print(extra_in_clean)
}
HZ names in clean_df_stage1 but missing from idsr:
 [1] "bagira_kasha"      "binza_meteo"       "binza_ozone"      
 [4] "bipemba"           "boma_bungu"        "boso_mondanda"    
 [7] "citenge"           "kabeya_kamuanga"   "kakenge"          
[10] "kalambayi_kabanga" "kalonda"           "kilela_balanda"   
[13] "kiwa"              "kuimba"            "lufungula"        
[16] "lulenge_kimbi"     "makiso_kisangani"  "malemba"          
[19] "mampoko"           "mobayi_mbongo"     "mongbalu"         
[22] "mont_ngafula_i"    "mont_ngafula_ii"   "mufunga_sampwe"   
[25] "mwene_ditu"        "ndjoko_mpunda"     "vanga_kete"       
[28] "wembo_nyama"      
Code
cat("\n=== Province comparison ===\n")

=== Province comparison ===
Code
idsr_prov <- sort(unique(idsr_df$adm1_fr))
clean_prov <- sort(unique(clean_df_stage1$prov))

cat("In idsr but NOT in clean_df_stage1:", paste(setdiff(idsr_prov, clean_prov), collapse = ", "), "\n")
In idsr but NOT in clean_df_stage1:  
Code
cat("In clean_df_stage1 but NOT in idsr:", paste(setdiff(clean_prov, idsr_prov), collapse = ", "), "\n")
In clean_df_stage1 but NOT in idsr: bandundu, bas_congo, kasai_occidental, katanga, orientale 

Note on province differences: clean_df_stage1 contains 5 legacy province names (bandundu, bas_congo, kasai_occidental, katanga, orientale) that were split/renamed in the 2015 DRC administrative reform. These do not appear in idsr_target_clean, which uses only the current 26-province scheme.

3.14 Summary of issues found

Issue Severity Detail
8 duplicate rows (4 week-keys) in mushenge Low Different case/death values — needs decision on which to keep or whether to sum
Population is static across all 20 years Medium Single denominator per HZ regardless of year — may distort incidence trends
No week 53 in any year Low 2009 and 2015 have week 53 in clean_df_stage1 but not here — week-53 data will not match
28 HZ names in clean_df_stage1 absent from idsr Medium These health zones have no population denominator in idsr_target_clean
5 legacy province names in clean_df_stage1 Medium Pre-2015 province codes need mapping to current 26-province scheme for a clean join

4. Cleaning data set B

4.1 Map legacy province names in clean_df_stage1

The DRC’s 2015 administrative reform split 5 legacy provinces into the current 26-province scheme. We remap the prov column in clean_df_stage1 using a two-step approach:

  1. Lookup via idsr_target_clean: For each health zone (hz) that has a legacy province name, we look up its current province in idsr_df (which uses the modern 26-province names). This successfully maps 138 of 147 legacy HZ/province combinations.
  2. Manual assignment from hz_naming_convention geocodes: For the remaining 9 HZs not found in idsr_df, we use the pcode prefix in hz_naming_convention.csv to identify the correct province. Two HZs (kalonda and kiwa, totalling 3 rows) have no geocode and are left unmapped.
Code
# prov and hz values are already lower_snake_case from Step 5
# Create hz_norm alias for joining (hz values are already normalised)
clean_df_stage1 <- clean_df_stage1 %>%
    mutate(hz_norm = hz)

# Build HZ -> current province lookup from idsr
idsr_hz_prov <- idsr_df %>%
    distinct(hz, adm1_fr)

legacy_provs <- c("bandundu", "bas_congo", "kasai_occidental", "katanga", "orientale")

# Step 1: lookup from idsr
clean_df_stage1 <- clean_df_stage1 %>%
    left_join(idsr_hz_prov, by = c("hz_norm" = "hz"), suffix = c("", "_idsr"))

# Step 2: manual mapping for the 9 unmapped HZs (from hz_naming_convention pcodes)
manual_map <- tribble(
    ~hz_norm,            ~manual_prov,
    "boma_bungu",        "kongo_central",
    "kuimba",            "kongo_central",
    "kilela_balanda",    "haut_katanga",
    "malemba",           "haut_lomami",
    "mufunga_sampwe",    "haut_katanga",
    "makiso_kisangani",  "tshopo",
    "mongbalu",          "ituri",
    "bambo",             "nord_kivu",
    "nyantende",         "sud_kivu"
)

clean_df_stage1 <- clean_df_stage1 %>%
    left_join(manual_map, by = "hz_norm")

# Apply remapping: only overwrite prov for rows with legacy names
clean_df_stage1 <- clean_df_stage1 %>%
    mutate(
        prov = case_when(
            prov %in% legacy_provs & !is.na(adm1_fr) ~ adm1_fr,
            prov %in% legacy_provs & !is.na(manual_prov) ~ manual_prov,
            TRUE ~ prov
        )
    ) %>%
    select(-adm1_fr, -manual_prov, -hz_norm)

# Report
n_still_legacy <- sum(clean_df_stage1$prov %in% legacy_provs)
cat("Rows still with legacy province name:", n_still_legacy, "\n")
Rows still with legacy province name: 3 
Code
cat("  kalonda (kasai_occidental, 2008 wk 34): 1 row — no geocode available\n")
  kalonda (kasai_occidental, 2008 wk 34): 1 row — no geocode available
Code
cat("  kiwa (katanga, 2006 wks 45 & 50): 2 rows — no geocode available\n")
  kiwa (katanga, 2006 wks 45 & 50): 2 rows — no geocode available
Code
cat("Unique provinces after remapping:", n_distinct(clean_df_stage1$prov), "\n")
Unique provinces after remapping: 28 
Code
cat(sort(unique(clean_df_stage1$prov)), sep = "\n")
bas_uele
equateur
haut_katanga
haut_lomami
haut_uele
ituri
kasai
kasai_central
kasai_occidental
kasai_oriental
katanga
kinshasa
kongo_central
kwango
kwilu
lomami
lualaba
mai_ndombe
maniema
mongala
nord_kivu
nord_ubangi
sankuru
sud_kivu
sud_ubangi
tanganyika
tshopo
tshuapa

4.2 Resolve duplicates in idsr_df

The 8 duplicate rows (4 week-keys) in health zone mushenge (CD9207ZS03) have different case/death counts, suggesting double reporting. We aggregate by summing cases and deaths for duplicated keys and recompute cases_pop.

Code
n_before_idsr <- nrow(idsr_df)

idsr_clean <- idsr_df %>%
    group_by(hz_code, year, week, adm1_fr, hz, date, population, adm1_pcode) %>%
    summarise(
        cases = sum(cases),
        deaths = sum(deaths),
        .groups = "drop"
    ) %>%
    mutate(cases_pop = cases / population)

n_after_idsr <- nrow(idsr_clean)

cat("idsr rows before:", n_before_idsr, "\n")
idsr rows before: 416004 
Code
cat("idsr rows after dedup:", n_after_idsr, "\n")
idsr rows after dedup: 416000 
Code
cat("Rows removed:", n_before_idsr - n_after_idsr, "\n")
Rows removed: 4 
Code
# verify no duplicates remain
cat(
    "Remaining duplicates:",
    idsr_clean %>%
        group_by(hz_code, year, week) %>%
        filter(n() > 1) %>%
        nrow(),
    "\n"
)
Remaining duplicates: 0 

4.3 Build clean_df_stage2

Code
clean_df_stage2 <- clean_df_stage1

cat("clean_df_stage2 created.\n")
clean_df_stage2 created.
Code
cat("Rows:", nrow(clean_df_stage2), "\n")
Rows: 45948 
Code
cat("Columns:", ncol(clean_df_stage2), "\n")
Columns: 9 
Code
cat("Column names:", paste(names(clean_df_stage2), collapse = ", "), "\n")
Column names: prov, numsem, debutsem, totalcas, totaldeces, year, maladie, hz, key 

4.4 Summary of all cleaning decisions, where clean_data_20260225 and idsr_target_clean are still separate datasets, saved as new cleaned data objects “clean_df_stage2” and “idsr_clean”

Code
cleaning_summary <- data.frame(
    Dataset = c(
        "clean_data_20260225",
        "clean_data_20260225",
        "clean_data_20260225",
        "clean_data_20260225",
        "clean_data_20260225",
        "idsr_target_clean",
        "idsr_target_clean",
        "idsr_target_clean",
        "idsr_target_clean",
        "idsr_target_clean",
        "idsr_target_clean",
        "clean_data_20260225"
    ),
    Issue = c(
        "Three inconsistent date formats (M/D/YYYY, D-Mon-YY, YYYY-MM-DD)",
        "10 rows with impossible dates (before 2000 or far future up to 2102)",
        "741 year mismatches between DEBUTSEM and YEAR column",
        "Incomplete epi weeks: 2017 missing week 52; 2020 missing weeks 48-52",
        "Column names and character values inconsistent (mixed case, special characters)",
        "No date format issues; no missing data",
        "3,200 year/date mismatches (all benign week-1 boundary cases)",
        "No week 53 in any year (unlike clean_df_stage1 where 2009 and 2015 have week 53)",
        "8 duplicate rows (4 week-keys) in mushenge (CD9207ZS03)",
        "Population is static per HZ (single value across all 20 years)",
        "28 HZ names in clean_df_stage1 absent from idsr",
        "5 legacy province names from pre-2015 administrative scheme"
    ),
    Action = c(
        "Standardised all three formats to a single Date column via lubridate",
        "Removed (data-entry errors: dates ranged from 1989 to 2102)",
        "Removed 115 genuine errors (date/year differ by >1 year or mid-year week mismatch); retained 626 benign week-1/52/53 boundary cases",
        "Left as-is (likely reflects genuine gaps in surveillance reporting)",
        "Column names and all character values lowercased to lower_snake_case, special characters replaced with underscores, whitespace trimmed",
        "No action needed",
        "No action needed (verified: all are week 1, year differs by exactly 1)",
        "No action taken; week-53 rows in clean_df_stage1 will not match on join",
        "Summed cases and deaths for duplicate week-keys to consolidate into single rows",
        "Not updated with year-varying population data yet (noted for future work)",
        "No action taken (these 28 HZs have no population denominator in idsr)",
        "Remapped to current 26-province names using a two-step method: (1) looked up each HZ in idsr_target_clean to find its current province (138 of 147 HZ/province combos matched); (2) for the remaining 9 HZs, used pcode prefixes in hz_naming_convention.csv to identify the correct province (7 more mapped, plus 2 cross-province corrections: bambo->nord_kivu, nyantende->sud_kivu). 2 HZs (kalonda, kiwa; 3 rows total) have no geocode and remain unmapped."
    ),
    stringsAsFactors = FALSE
)

knitr::kable(cleaning_summary, caption = "Summary of cleaning decisions for clean_df_stage2")
Summary of cleaning decisions for clean_df_stage2
Dataset Issue Action
clean_data_20260225 Three inconsistent date formats (M/D/YYYY, D-Mon-YY, YYYY-MM-DD) Standardised all three formats to a single Date column via lubridate
clean_data_20260225 10 rows with impossible dates (before 2000 or far future up to 2102) Removed (data-entry errors: dates ranged from 1989 to 2102)
clean_data_20260225 741 year mismatches between DEBUTSEM and YEAR column Removed 115 genuine errors (date/year differ by >1 year or mid-year week mismatch); retained 626 benign week-1/52/53 boundary cases
clean_data_20260225 Incomplete epi weeks: 2017 missing week 52; 2020 missing weeks 48-52 Left as-is (likely reflects genuine gaps in surveillance reporting)
clean_data_20260225 Column names and character values inconsistent (mixed case, special characters) Column names and all character values lowercased to lower_snake_case, special characters replaced with underscores, whitespace trimmed
idsr_target_clean No date format issues; no missing data No action needed
idsr_target_clean 3,200 year/date mismatches (all benign week-1 boundary cases) No action needed (verified: all are week 1, year differs by exactly 1)
idsr_target_clean No week 53 in any year (unlike clean_df_stage1 where 2009 and 2015 have week 53) No action taken; week-53 rows in clean_df_stage1 will not match on join
idsr_target_clean 8 duplicate rows (4 week-keys) in mushenge (CD9207ZS03) Summed cases and deaths for duplicate week-keys to consolidate into single rows
idsr_target_clean Population is static per HZ (single value across all 20 years) Not updated with year-varying population data yet (noted for future work)
idsr_target_clean 28 HZ names in clean_df_stage1 absent from idsr No action taken (these 28 HZs have no population denominator in idsr)
clean_data_20260225 5 legacy province names from pre-2015 administrative scheme Remapped to current 26-province names using a two-step method: (1) looked up each HZ in idsr_target_clean to find its current province (138 of 147 HZ/province combos matched); (2) for the remaining 9 HZs, used pcode prefixes in hz_naming_convention.csv to identify the correct province (7 more mapped, plus 2 cross-province corrections: bambo->nord_kivu, nyantende->sud_kivu). 2 HZs (kalonda, kiwa; 3 rows total) have no geocode and remain unmapped.

5. Combining clean_df_stage2 and idsr_clean into Combined_data

We perform a full join on (hz, year, week) so that:

  • All 416,000 rows from idsr_clean are retained (the complete HZ-year-week panel with zero-case weeks and population denominators).
  • All 45,948 rows from clean_df_stage2 are retained, including the 28 HZs that do not appear in idsr_clean (these will have NA for population and other idsr-only columns).

5.1 Align column names and perform the full join

Code
# Rename idsr_clean columns to align with clean_df_stage2
idsr_for_join <- idsr_clean %>%
    rename(
        numsem = week,
        prov = adm1_fr
    )

# Full join on hz, year, numsem
Combined_data <- full_join(
    idsr_for_join,
    clean_df_stage2,
    by = c("hz", "year", "numsem"),
    suffix = c("_idsr", "_clean")
)

cat("=== Join result ===\n")
=== Join result ===
Code
cat("Rows:", nrow(Combined_data), "\n")
Rows: 419098 
Code
cat("Columns:", ncol(Combined_data), "\n")
Columns: 17 
Code
cat("Column names:", paste(names(Combined_data), collapse = ", "), "\n")
Column names: hz_code, year, numsem, prov_idsr, hz, date, population, adm1_pcode, cases, deaths, cases_pop, prov_clean, debutsem, totalcas, totaldeces, maladie, key 

5.2 Reconcile overlapping columns

Both datasets contain cases, deaths, province, and date columns. We coalesce these: prefer clean_df_stage2 values where available, fall back to idsr_clean.

Code
Combined_data <- Combined_data %>%
    mutate(
        # cases/deaths: prefer clean_df_stage2, fall back to idsr
        cases = coalesce(totalcas, cases),
        deaths = coalesce(totaldeces, deaths),
        # province: prefer clean_df_stage2 (has remapped legacy names)
        prov = coalesce(prov_clean, prov_idsr),
        # date: prefer idsr (always well-formed ISO), fall back to clean
        date = coalesce(date, debutsem),
        # maladie: always cholera in this dataset
        maladie = "cholera",
        # key: fill from hz where missing
        key = coalesce(key, hz)
    ) %>%
    select(
        hz_code, prov, adm1_pcode,
        hz, year, numsem, date,
        cases, deaths,
        population, cases_pop,
        maladie, key
    )

cat("=== After reconciliation ===\n")
=== After reconciliation ===
Code
cat("Rows:", nrow(Combined_data), "\n")
Rows: 419098 
Code
cat("Columns:", ncol(Combined_data), "\n")
Columns: 13 
Code
cat("Column names:", paste(names(Combined_data), collapse = ", "), "\n")
Column names: hz_code, prov, adm1_pcode, hz, year, numsem, date, cases, deaths, population, cases_pop, maladie, key 

5.3 Join diagnostics

Code
# Rows from idsr only (zero-case weeks not in clean_df_stage2)
n_idsr_only <- Combined_data %>%
    filter(is.na(maladie)) %>%
    nrow()

# Rows from clean_df_stage2 only (the 28 HZs not in idsr)
n_clean_only <- Combined_data %>%
    filter(is.na(hz_code)) %>%
    nrow()

# Rows matched in both
n_both <- nrow(Combined_data) - n_idsr_only - n_clean_only

cat("Rows from idsr_clean only (zero-case weeks):", n_idsr_only, "\n")
Rows from idsr_clean only (zero-case weeks): 0 
Code
cat("Rows from clean_df_stage2 only (28 extra HZs):", n_clean_only, "\n")
Rows from clean_df_stage2 only (28 extra HZs): 3098 
Code
cat("Rows matched in both datasets:", n_both, "\n")
Rows matched in both datasets: 416000 
Code
cat("Total:", nrow(Combined_data), "\n\n")
Total: 419098 
Code
cat("=== Missingness after join ===\n")
=== Missingness after join ===
Code
for (nm in names(Combined_data)) {
    n_na <- sum(is.na(Combined_data[[nm]]))
    if (n_na > 0) {
        cat(sprintf(
            "  %-12s  %d NA (%.1f%%)\n", nm, n_na,
            100 * n_na / nrow(Combined_data)
        ))
    }
}
  hz_code       3098 NA (0.7%)
  adm1_pcode    3098 NA (0.7%)
  date          2 NA (0.0%)
  cases         1 NA (0.0%)
  deaths        1 NA (0.0%)
  population    3098 NA (0.7%)
  cases_pop     3098 NA (0.7%)

5.4 Save all three datasets

Code
out_dir <- "/rds/general/user/acp25/home/MIMIC/Clean_data/data"

readr::write_csv(
    clean_df_stage2,
    file.path(out_dir, "clean_df_stage2.csv")
)
readr::write_csv(
    idsr_clean,
    file.path(out_dir, "idsr_clean.csv")
)
readr::write_csv(
    Combined_data,
    file.path(out_dir, "Combined_data.csv")
)

cat("Saved:\n")
Saved:
Code
cat("  clean_df_stage2.csv  -", nrow(clean_df_stage2), "rows\n")
  clean_df_stage2.csv  - 45948 rows
Code
cat("  idsr_clean.csv       -", nrow(idsr_clean), "rows\n")
  idsr_clean.csv       - 416000 rows
Code
cat("  Combined_data.csv    -", nrow(Combined_data), "rows\n")
  Combined_data.csv    - 419098 rows

5.5 Dataset dimensions and structure

Code
cat("Rows:", nrow(Combined_data), "\n")
Rows: 419098 
Code
cat("Columns:", ncol(Combined_data), "\n\n")
Columns: 13 
Code
str(Combined_data)
tibble [419,098 × 13] (S3: tbl_df/tbl/data.frame)
 $ hz_code   : chr [1:419098] "CD1000ZS01" "CD1000ZS01" "CD1000ZS01" "CD1000ZS01" ...
 $ prov      : chr [1:419098] "kinshasa" "kinshasa" "kinshasa" "kinshasa" ...
 $ adm1_pcode: chr [1:419098] "CD10" "CD10" "CD10" "CD10" ...
 $ hz        : chr [1:419098] "bandalungwa" "bandalungwa" "bandalungwa" "bandalungwa" ...
 $ year      : num [1:419098] 2006 2006 2006 2006 2006 ...
 $ numsem    : num [1:419098] 1 2 3 4 5 6 7 8 9 10 ...
 $ date      : Date[1:419098], format: "2006-01-02" "2006-01-09" ...
 $ cases     : num [1:419098] 0 0 0 0 0 0 0 0 0 0 ...
 $ deaths    : num [1:419098] 0 0 0 0 0 0 0 0 0 0 ...
 $ population: num [1:419098] 248790 248790 248790 248790 248790 ...
 $ cases_pop : num [1:419098] 0 0 0 0 0 0 0 0 0 0 ...
 $ maladie   : chr [1:419098] "cholera" "cholera" "cholera" "cholera" ...
 $ key       : chr [1:419098] "bandalungwa" "bandalungwa" "bandalungwa" "bandalungwa" ...

5.6 Variable profile

Code
combined_profile <- profile_df(Combined_data)
knitr::kable(combined_profile, caption = "Variable profile (Combined_data)")
Variable profile (Combined_data)
variable class n_missing pct_missing n_unique example_value
hz_code hz_code character 3098 0.74 400 CD1000ZS01
adm1_pcode adm1_pcode character 3098 0.74 26 CD10
population population numeric 3098 0.74 400 248790
cases_pop cases_pop numeric 3098 0.74 7536 0
prov prov character 0 0.00 28 kinshasa
hz hz character 0 0.00 428 bandalungwa
year year numeric 0 0.00 20 2006
numsem numsem numeric 0 0.00 53 1
cases cases numeric 1 0.00 308 0
deaths deaths numeric 1 0.00 23 0
maladie maladie character 0 0.00 1 cholera
key key character 0 0.00 428 bandalungwa
date date Date NA NA 1 NA

5.7 Numeric variables

Code
num_vars_cb <- names(Combined_data)[sapply(Combined_data, is.numeric)]

if (length(num_vars_cb) > 0) {
    num_summary_cb <- data.frame(
        variable = num_vars_cb,
        mean = sapply(Combined_data[num_vars_cb], function(x) round(mean(x, na.rm = TRUE), 3)),
        sd = sapply(Combined_data[num_vars_cb], function(x) round(sd(x, na.rm = TRUE), 3)),
        min = sapply(Combined_data[num_vars_cb], function(x) round(min(x, na.rm = TRUE), 3)),
        p25 = sapply(Combined_data[num_vars_cb], function(x) round(quantile(x, 0.25, na.rm = TRUE), 3)),
        p50 = sapply(Combined_data[num_vars_cb], function(x) round(median(x, na.rm = TRUE), 3)),
        p75 = sapply(Combined_data[num_vars_cb], function(x) round(quantile(x, 0.75, na.rm = TRUE), 3)),
        max = sapply(Combined_data[num_vars_cb], function(x) round(max(x, na.rm = TRUE), 3))
    )
    knitr::kable(num_summary_cb, caption = "Numeric summary (Combined_data)")
} else {
    cat("No numeric variables detected.\n")
}
Numeric summary (Combined_data)
variable mean sd min p25 p50 p75 max
year year 2015.505 5.772 2006 2011 2016.0 2021.0 2025.000
numsem numsem 26.509 15.013 1 14 27.0 40.0 53.000
cases cases 1.387 10.177 0 0 0.0 0.0 1177.000
deaths deaths 0.027 0.349 0 0 0.0 0.0 98.000
population population 218964.030 113085.235 10393 141247 187930.5 285597.8 744355.000
cases_pop cases_pop 0.000 0.000 0 0 0.0 0.0 0.004
Code
if (length(num_vars_cb) > 0) {
    plot_vars_cb <- head(num_vars_cb, 6)
    old_par <- par(no.readonly = TRUE)
    on.exit(par(old_par), add = TRUE)
    par(mfrow = c(2, 3))

    for (v in plot_vars_cb) {
        hist(Combined_data[[v]],
            main = v, xlab = v, col = "steelblue", border = "white"
        )
    }
}

Distributions of numeric variables in Combined_data

5.8 Categorical variables

Code
cat_vars_cb <- names(Combined_data)[sapply(Combined_data, is.character)]

cat("Categorical variables:", length(cat_vars_cb), "\n\n")
Categorical variables: 6 
Code
for (v in cat_vars_cb) {
    n_unique <- n_distinct(Combined_data[[v]], na.rm = TRUE)
    n_na <- sum(is.na(Combined_data[[v]]))
    cat(sprintf("--- %s (%d unique, %d NA) ---\n", v, n_unique, n_na))
    tbl <- sort(table(Combined_data[[v]], useNA = "ifany"), decreasing = TRUE)
    print(head(tbl, 15))
    if (length(tbl) > 15) cat("  ... and", length(tbl) - 15, "more levels\n")
    cat("\n")
}
--- hz_code (400 unique, 3098 NA) ---

      <NA> CD1000ZS01 CD1000ZS02 CD1000ZS05 CD1000ZS06 CD1000ZS07 CD1000ZS08 
      3098       1040       1040       1040       1040       1040       1040 
CD1000ZS09 CD1000ZS10 CD1000ZS11 CD1000ZS12 CD1000ZS13 CD1000ZS14 CD1000ZS15 
      1040       1040       1040       1040       1040       1040       1040 
CD1000ZS16 
      1040 
  ... and 386 more levels

--- prov (28 unique, 0 NA) ---

      sud_kivu          ituri       kinshasa      nord_kivu   haut_katanga 
         35843          33303          33045          31208          27214 
 kongo_central         tshopo       equateur        maniema kasai_oriental 
         26067          21943          19021          18722          16762 
         kasai    haut_lomami        sankuru          kwilu     tanganyika 
         16647          16168          14903          12480          11450 
  ... and 13 more levels

--- adm1_pcode (26 unique, 3098 NA) ---

 CD62  CD54  CD10  CD61  CD71  CD20  CD51  CD41  CD63  CD82  CD92  CD73  CD83 
35360 33280 32240 31200 27040 26000 21840 18720 18720 16640 16640 15600 14560 
 CD32  CD44 
12480 11440 
  ... and 12 more levels

--- hz (428 unique, 0 NA) ---

      bukama         fizi       kabare       kadutu      kalemie       katana 
        1042         1042         1042         1042         1042         1042 
    kirotshe       masisi       minova miti_murhesa         moba     mutwanga 
        1042         1042         1042         1042         1042         1042 
       mweso        nundu       nyemba 
        1042         1042         1042 
  ... and 413 more levels

--- maladie (1 unique, 0 NA) ---
cholera 
 419098 

--- key (428 unique, 0 NA) ---

      bukama         fizi       kabare       kadutu      kalemie       katana 
        1042         1042         1042         1042         1042         1042 
    kirotshe       masisi       minova miti_murhesa         moba     mutwanga 
        1042         1042         1042         1042         1042         1042 
       mweso        nundu       nyemba 
        1042         1042         1042 
  ... and 413 more levels
Code
if (length(cat_vars_cb) > 0) {
    plot_list_cb <- list()

    for (v in cat_vars_cb) {
        freq <- as.data.frame(table(Combined_data[[v]], useNA = "ifany"),
            stringsAsFactors = FALSE
        )
        names(freq) <- c("category", "count")
        freq$category[is.na(freq$category)] <- "(NA)"

        if (nrow(freq) > 15) {
            freq <- freq[order(-freq$count), ]
            top <- freq[1:15, ]
            other <- data.frame(
                category = "Other",
                count = sum(freq$count[16:nrow(freq)])
            )
            freq <- rbind(top, other)
        }

        freq$fraction <- freq$count / sum(freq$count)
        freq$ymax <- cumsum(freq$fraction)
        freq$ymin <- c(0, head(freq$ymax, -1))
        freq$label_pos <- (freq$ymax + freq$ymin) / 2
        freq$label <- paste0(
            freq$category, "\n(n=",
            formatC(freq$count, format = "d", big.mark = ","), ")"
        )

        p <- ggplot(freq, aes(
            ymax = ymax, ymin = ymin, xmax = 4, xmin = 2.5,
            fill = category
        )) +
            geom_rect() +
            geom_text(aes(x = 3.25, y = label_pos, label = label), size = 2.5) +
            coord_polar(theta = "y") +
            xlim(c(1, 4)) +
            theme_void() +
            theme(legend.position = "none") +
            labs(title = v)

        plot_list_cb[[v]] <- p
    }

    if (requireNamespace("patchwork", quietly = TRUE)) {
        wrap_plots(plot_list_cb, ncol = min(length(plot_list_cb), 3))
    } else {
        for (p in plot_list_cb) print(p)
    }
}

Donut plots for categorical variables in Combined_data

Donut plots for categorical variables in Combined_data

Donut plots for categorical variables in Combined_data

Donut plots for categorical variables in Combined_data

Donut plots for categorical variables in Combined_data

Donut plots for categorical variables in Combined_data

5.9 Date and temporal overview

Code
cat("Date column:\n")
Date column:
Code
cat("  Class:", paste(class(Combined_data$date), collapse = ", "), "\n")
  Class: Date 
Code
cat(
    "  Range:", as.character(min(Combined_data$date, na.rm = TRUE)),
    "to", as.character(max(Combined_data$date, na.rm = TRUE)), "\n"
)
  Range: 2005-12-26 to 2025-12-30 
Code
cat("  NAs:", sum(is.na(Combined_data$date)), "of", nrow(Combined_data), "\n\n")
  NAs: 2 of 419098 
Code
cat("Year column:\n")
Year column:
Code
print(table(Combined_data$year))

 2006  2007  2008  2009  2010  2011  2012  2013  2014  2015  2016  2017  2018 
21056 21106 20867 20873 20863 20885 20936 20863 20901 21008 20900 20969 21025 
 2019  2020  2021  2022  2023  2024  2025 
21003 20913 20855 20891 21009 20979 21196 
Code
cat("\nEpi week column (numsem):\n")

Epi week column (numsem):
Code
cat(
    "  Range:", min(Combined_data$numsem, na.rm = TRUE),
    "to", max(Combined_data$numsem, na.rm = TRUE), "\n"
)
  Range: 1 to 53 
Code
cat("  Unique weeks:", n_distinct(Combined_data$numsem), "\n")
  Unique weeks: 53 

6. Descriptive plots

6.1 National epidemic curve (weekly)

Code
national_weekly <- Combined_data %>%
    group_by(year, numsem) %>%
    summarise(
        cases = sum(cases, na.rm = TRUE),
        deaths = sum(deaths, na.rm = TRUE),
        .groups = "drop"
    ) %>%
    arrange(year, numsem) %>%
    mutate(week_index = row_number())

ggplot(national_weekly, aes(x = week_index, y = cases)) +
    geom_col(fill = "#2c7fb8", width = 1) +
    scale_x_continuous(
        breaks = national_weekly %>%
            filter(numsem == 1) %>%
            pull(week_index),
        labels = national_weekly %>%
            filter(numsem == 1) %>%
            pull(year)
    ) +
    scale_y_continuous(labels = scales::comma) +
    labs(
        x = "Year", y = "Reported cases",
        title = "National cholera epidemic curve (weekly)"
    ) +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))

National cholera epidemic curve — weekly reported cases

6.2 National epidemic curve — cases and deaths

Code
ggplot(national_weekly, aes(x = week_index)) +
    geom_col(aes(y = cases), fill = "#2c7fb8", width = 1, alpha = 0.8) +
    geom_col(aes(y = deaths), fill = "#d95f02", width = 1, alpha = 0.9) +
    scale_x_continuous(
        breaks = national_weekly %>%
            filter(numsem == 1) %>%
            pull(week_index),
        labels = national_weekly %>%
            filter(numsem == 1) %>%
            pull(year)
    ) +
    scale_y_continuous(labels = scales::comma) +
    labs(
        x = "Year", y = "Count",
        title = "National cholera — weekly cases (blue) and deaths (orange)"
    ) +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))

National cholera epidemic curve — weekly cases and deaths

6.3 Annual epidemic curves (faceted by year)

Code
annual_weekly <- Combined_data %>%
    group_by(year, numsem) %>%
    summarise(cases = sum(cases, na.rm = TRUE), .groups = "drop")

ggplot(annual_weekly, aes(x = numsem, y = cases)) +
    geom_col(fill = "#2c7fb8", width = 1) +
    facet_wrap(~year, ncol = 3, scales = "free_y") +
    scale_y_continuous(labels = scales::comma) +
    labs(
        x = "Epidemiological week", y = "Reported cases",
        title = "Cholera epidemic curve by year"
    ) +
    theme_minimal() +
    theme(strip.text = element_text(face = "bold"))

Cholera epidemic curves faceted by year

6.4 Annual case and death totals

Code
annual_totals <- Combined_data %>%
    group_by(year) %>%
    summarise(
        cases = sum(cases, na.rm = TRUE),
        deaths = sum(deaths, na.rm = TRUE),
        .groups = "drop"
    ) %>%
    tidyr::pivot_longer(
        cols = c(cases, deaths), names_to = "measure",
        values_to = "count"
    )

ggplot(annual_totals, aes(x = factor(year), y = count, fill = measure)) +
    geom_col(position = "dodge") +
    scale_fill_manual(values = c(cases = "#2c7fb8", deaths = "#d95f02")) +
    scale_y_continuous(labels = scales::comma) +
    labs(
        x = "Year", y = "Count", fill = NULL,
        title = "Annual cholera cases and deaths"
    ) +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))

Annual cholera case and death totals

6.5 Cases by province (total)

Code
prov_cases <- Combined_data %>%
    filter(!is.na(prov)) %>%
    group_by(prov) %>%
    summarise(
        cases = sum(cases, na.rm = TRUE),
        deaths = sum(deaths, na.rm = TRUE),
        .groups = "drop"
    ) %>%
    arrange(desc(cases))

ggplot(prov_cases, aes(x = reorder(prov, cases), y = cases)) +
    geom_col(fill = "#2c7fb8") +
    coord_flip() +
    scale_y_continuous(labels = scales::comma) +
    labs(
        x = NULL, y = "Total reported cases",
        title = "Total cholera cases by province (all years)"
    ) +
    theme_minimal()

Total cholera cases by province

6.6 Deaths by province (total)

Code
ggplot(prov_cases, aes(x = reorder(prov, deaths), y = deaths)) +
    geom_col(fill = "#d95f02") +
    coord_flip() +
    scale_y_continuous(labels = scales::comma) +
    labs(
        x = NULL, y = "Total reported deaths",
        title = "Total cholera deaths by province (all years)"
    ) +
    theme_minimal()

Total cholera deaths by province

6.7 Top 20 health zones by case count

Code
hz_cases <- Combined_data %>%
    filter(!is.na(hz)) %>%
    group_by(hz) %>%
    summarise(cases = sum(cases, na.rm = TRUE), .groups = "drop") %>%
    arrange(desc(cases)) %>%
    slice_head(n = 20)

ggplot(hz_cases, aes(x = reorder(hz, cases), y = cases)) +
    geom_col(fill = "#2c7fb8") +
    coord_flip() +
    scale_y_continuous(labels = scales::comma) +
    labs(
        x = NULL, y = "Total reported cases",
        title = "Top 20 health zones by cholera cases (all years)"
    ) +
    theme_minimal()

Top 20 health zones by total cholera cases

6.8 Province-level epidemic curves (faceted)

Code
prov_annual <- Combined_data %>%
    filter(!is.na(prov)) %>%
    group_by(prov, year) %>%
    summarise(cases = sum(cases, na.rm = TRUE), .groups = "drop")

ggplot(prov_annual, aes(x = year, y = cases)) +
    geom_col(fill = "#2c7fb8") +
    facet_wrap(~prov, ncol = 4, scales = "free_y") +
    scale_y_continuous(labels = scales::comma) +
    labs(
        x = "Year", y = "Cases",
        title = "Cholera cases by province and year"
    ) +
    theme_minimal() +
    theme(
        strip.text = element_text(face = "bold", size = 7),
        axis.text.x = element_text(angle = 45, hjust = 1, size = 6)
    )

Provincial cholera epidemic curves (annual totals)

6.9 Case fatality proportion by province

Code
cfr_prov <- Combined_data %>%
    filter(!is.na(prov)) %>%
    group_by(prov) %>%
    summarise(
        cases = sum(cases, na.rm = TRUE),
        deaths = sum(deaths, na.rm = TRUE),
        .groups = "drop"
    ) %>%
    filter(cases > 0) %>%
    mutate(cfr = deaths / cases * 100)

ggplot(cfr_prov, aes(x = reorder(prov, cfr), y = cfr)) +
    geom_col(fill = "#7570b3") +
    coord_flip() +
    labs(
        x = NULL, y = "Case fatality proportion (%)",
        title = "Cholera case fatality proportion by province (all years)"
    ) +
    theme_minimal()

Case fatality proportion by province

6.10 Seasonality — average cases by epidemiological week

Code
seasonality <- Combined_data %>%
    group_by(numsem) %>%
    summarise(
        mean_cases = mean(cases, na.rm = TRUE),
        median_cases = median(cases, na.rm = TRUE),
        .groups = "drop"
    )

ggplot(seasonality, aes(x = numsem)) +
    geom_line(aes(y = mean_cases), colour = "#2c7fb8", linewidth = 0.8) +
    geom_line(aes(y = median_cases),
        colour = "#d95f02", linewidth = 0.8,
        linetype = "dashed"
    ) +
    labs(
        x = "Epidemiological week", y = "Cases per HZ-week",
        title = "Cholera seasonality — mean (blue) and median (orange) cases by epi week"
    ) +
    theme_minimal()

Average weekly cholera cases across all years (seasonality)