Objective

  1. Loading packages;
  2. R: object-oriented programming
  3. Cleaning data and core functions
pacman::p_load(
  rio,        # importing data  
  here,       # relative file pathways  
  janitor,    # data cleaning and tables
  lubridate,  # working with dates
  matchmaker, # dictionary-based cleaning
  epikit,     # age_categories() function
  tidyverse,   # data management and visualization
  dplyr
)
## 
## epikit installed
## also installing the dependency 'readxl'
## 
## tidyverse installed

Data Management

1. Clean data and core function

1.1 Cleaning pipeline

In a cleaning pipeline the order of the steps is important. Cleaning steps might include: 1. Importing of data; 2. Column names cleaned or changed; 3. De-duplication; 4. Column creation and transformation (e.g. re-coding or standardising values); 5. Rows filtered or added.

1.3 Import

# import
linelist_raw <- import("linelist_raw.xlsx")
## New names:
## • `` -> `...28`
# review
head(linelist_raw, 5)
##   case_id generation infection date date onset  hosp date date_of_outcome
## 1  5fe599          4     2014-05-08 2014-05-13 2014-05-15            <NA>
## 2  8689b7          4           <NA> 2014-05-13 2014-05-14      2014-05-18
## 3  11f8ea          2           <NA> 2014-05-16 2014-05-18      2014-05-30
## 4  b8812a          3     2014-05-04 2014-05-18 2014-05-20            <NA>
## 5  893f25          3     2014-05-18 2014-05-21 2014-05-22      2014-05-29
##   outcome gender                             hospital       lon      lat
## 1    <NA>      m                                Other -13.21574 8.468973
## 2 Recover      f                                 <NA> -13.21523 8.451719
## 3 Recover      m St. Mark's Maternity Hospital (SMMH) -13.21291 8.464817
## 4    <NA>      f                        Port Hospital -13.23637 8.475476
## 5 Recover      m                    Military Hospital -13.22286 8.460824
##   infector source age age_unit row_num wt_kg ht_cm ct_blood fever chills cough
## 1   f547d6  other   2    years       1    27    48       22    no     no   yes
## 2     <NA>   <NA>   3    years       2    25    59       22  <NA>   <NA>  <NA>
## 3     <NA>   <NA>  56    years       3    91   238       21  <NA>   <NA>  <NA>
## 4   f90f5f  other  18    years       4    41   135       23    no     no    no
## 5   11f8ea  other   3    years       5    36    71       23    no     no   yes
##   aches vomit temp time_admission merged_header ...28
## 1    no   yes 36.8           <NA>             a     b
## 2  <NA>  <NA> 36.9          09:36             a     b
## 3  <NA>  <NA> 36.9          16:48             a     b
## 4    no    no 36.8          11:22             a     b
## 5    no   yes 36.9          12:60             a     b
# skim
skimr::skim(linelist_raw)
Data summary
Name linelist_raw
Number of rows 6611
Number of columns 28
_______________________
Column type frequency:
character 17
numeric 8
POSIXct 3
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
case_id 137 0.98 6 6 0 5888 0
date onset 293 0.96 10 10 0 580 0
outcome 1500 0.77 5 7 0 2 0
gender 324 0.95 1 1 0 2 0
hospital 1512 0.77 5 36 0 13 0
infector 2323 0.65 6 6 0 2697 0
source 2323 0.65 5 7 0 2 0
age 107 0.98 1 2 0 75 0
age_unit 7 1.00 5 6 0 2 0
fever 258 0.96 2 3 0 2 0
chills 258 0.96 2 3 0 2 0
cough 258 0.96 2 3 0 2 0
aches 258 0.96 2 3 0 2 0
vomit 258 0.96 2 3 0 2 0
time_admission 844 0.87 5 5 0 1091 0
merged_header 0 1.00 1 1 0 1 0
…28 0 1.00 1 1 0 1 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
generation 7 1.00 16.60 5.71 0.00 13.00 16.00 20.00 37.00 ▁▆▇▂▁
lon 7 1.00 -13.23 0.02 -13.27 -13.25 -13.23 -13.22 -13.21 ▅▃▃▅▇
lat 7 1.00 8.47 0.01 8.45 8.46 8.47 8.48 8.49 ▅▇▇▇▆
row_num 0 1.00 3240.91 1857.83 1.00 1647.50 3241.00 4836.50 6481.00 ▇▇▇▇▇
wt_kg 7 1.00 52.69 18.59 -11.00 41.00 54.00 66.00 111.00 ▁▃▇▅▁
ht_cm 7 1.00 125.25 49.57 4.00 91.00 130.00 159.00 295.00 ▂▅▇▂▁
ct_blood 7 1.00 21.26 1.67 16.00 20.00 22.00 22.00 26.00 ▁▃▇▃▁
temp 158 0.98 38.60 0.95 35.20 38.30 38.80 39.20 40.80 ▁▂▂▇▁

Variable type: POSIXct

skim_variable n_missing complete_rate min max median n_unique
infection date 2322 0.65 2012-04-09 2015-04-27 2014-10-04 538
hosp date 7 1.00 2012-04-20 2015-04-30 2014-10-15 570
date_of_outcome 1068 0.84 2012-05-14 2015-06-04 2014-10-26 575

1.4 Column names: cleaning & rename

Cleaning column names

# review column names 
names(linelist_raw)   # colum 28,  "...28" 
##  [1] "case_id"         "generation"      "infection date"  "date onset"     
##  [5] "hosp date"       "date_of_outcome" "outcome"         "gender"         
##  [9] "hospital"        "lon"             "lat"             "infector"       
## [13] "source"          "age"             "age_unit"        "row_num"        
## [17] "wt_kg"           "ht_cm"           "ct_blood"        "fever"          
## [21] "chills"          "cough"           "aches"           "vomit"          
## [25] "temp"            "time_admission"  "merged_header"   "...28"
# Automatic cleaning 
linelist <- linelist_raw %>% 
  janitor::clean_names()

# see the new column names
names(linelist)      # column 28,  "x28" , the three dot dot dot is removed 
##  [1] "case_id"         "generation"      "infection_date"  "date_onset"     
##  [5] "hosp_date"       "date_of_outcome" "outcome"         "gender"         
##  [9] "hospital"        "lon"             "lat"             "infector"       
## [13] "source"          "age"             "age_unit"        "row_num"        
## [17] "wt_kg"           "ht_cm"           "ct_blood"        "fever"          
## [21] "chills"          "cough"           "aches"           "vomit"          
## [25] "temp"            "time_admission"  "merged_header"   "x28"

Manunal re-name column names

linelist <- linelist_raw %>%
    
    # standardize column name syntax
    janitor::clean_names() %>% 
    
    # manually re-name columns
           # NEW name             # OLD name
    dplyr::rename(date_infection       = infection_date,
           date_hospitalisation = hosp_date,
           date_outcome         = date_of_outcome)


linelist_raw %>% 
  select(# NEW name             # OLD name
         date_infection       = `infection date`,    # rename and KEEP ONLY these columns
         date_hospitalisation = `hosp date`)

1.5 Column names: select or re-order

names(linelist_raw)
##  [1] "case_id"         "generation"      "infection date"  "date onset"     
##  [5] "hosp date"       "date_of_outcome" "outcome"         "gender"         
##  [9] "hospital"        "lon"             "lat"             "infector"       
## [13] "source"          "age"             "age_unit"        "row_num"        
## [17] "wt_kg"           "ht_cm"           "ct_blood"        "fever"          
## [21] "chills"          "cough"           "aches"           "vomit"          
## [25] "temp"            "time_admission"  "merged_header"   "...28"
# linelist dataset is piped through select() command, and names() prints just the column names
linelist %>% 
  select(case_id, date_onset, date_hospitalisation, fever) %>% 
  names()  # display the column names
## [1] "case_id"              "date_onset"           "date_hospitalisation"
## [4] "fever"
# move date_onset and date_hospitalisation to beginning
linelist %>% 
  select(date_onset, date_hospitalisation, everything()) %>% 
  names()
##  [1] "date_onset"           "date_hospitalisation" "case_id"             
##  [4] "generation"           "date_infection"       "date_outcome"        
##  [7] "outcome"              "gender"               "hospital"            
## [10] "lon"                  "lat"                  "infector"            
## [13] "source"               "age"                  "age_unit"            
## [16] "row_num"              "wt_kg"                "ht_cm"               
## [19] "ct_blood"             "fever"                "chills"              
## [22] "cough"                "aches"                "vomit"               
## [25] "temp"                 "time_admission"       "merged_header"       
## [28] "x28"

1.6 Others

# select columns that are class Numeric
linelist %>% 
  select(where(is.numeric)) %>% 
  names()
## [1] "generation" "lon"        "lat"        "row_num"    "wt_kg"     
## [6] "ht_cm"      "ct_blood"   "temp"
# select columns containing certain characters
linelist %>% 
  select(contains("date")) %>% 
  names()
## [1] "date_infection"       "date_onset"           "date_hospitalisation"
## [4] "date_outcome"
# searched for multiple character matches
linelist %>% 
  select(matches("onset|hosp|fev")) %>%   # note the OR symbol "|"
  names()
## [1] "date_onset"           "date_hospitalisation" "hospital"            
## [4] "fever"
# any_of 
linelist %>% 
  select(any_of(c("date_onset", "village_origin", "village_detection", "village_residence", "village_travel"))) %>% 
  names()
## [1] "date_onset"

1.7 Remove column

linelist %>% 
  select(-c(date_onset, fever:vomit)) %>% # remove date_onset and all columns from fever to vomit
  names()
##  [1] "case_id"              "generation"           "date_infection"      
##  [4] "date_hospitalisation" "date_outcome"         "outcome"             
##  [7] "gender"               "hospital"             "lon"                 
## [10] "lat"                  "infector"             "source"              
## [13] "age"                  "age_unit"             "row_num"             
## [16] "wt_kg"                "ht_cm"                "ct_blood"            
## [19] "temp"                 "time_admission"       "merged_header"       
## [22] "x28"
linelist$date_onset <- NULL   # deletes column with base R syntax 
# Adding to pipe chain 

linelist <- linelist_raw %>%
    
    # standardize column name syntax
    janitor::clean_names() %>% 
    
    # manually re-name columns
           # NEW name             # OLD name
    dplyr::rename(date_infection       = infection_date,
           date_hospitalisation = hosp_date,
           date_outcome         = date_of_outcome) %>% 

    # remove column
    select(-c(row_num, merged_header, x28)) %>% 

    # Re-order column with everything()
  
    # duplication
    dplyr::distinct() # removes rows that are 100% duplicates

2. Column creation and transformation

2.1 Create new column

linelist <- linelist %>% 
  mutate(bmi = wt_kg / (ht_cm/100)^2)

new_col_demo <- linelist %>%                       
  mutate(
    new_var_dup    = case_id,             # new column = duplicate/copy another existing column
    new_var_static = 7,                   # new column = all values the same
    new_var_static = new_var_static + 5,  # you can overwrite a column, and it can be a calculation using other variables
    new_var_paste  = stringr::str_glue("{hospital} on ({date_hospitalisation})") # new column = pasting together values from other columns
    ) %>% 
  select(case_id, hospital, date_hospitalisation, contains("new"))        # show only new columns, for demonstration purposes

2.2 Convert class & grpup column

# Convert class  
class(linelist$age)
linelist <- linelist %>% 
  mutate(age = as.numeric(age))

# Group
linelist %>% 
  mutate(age_norm = age / mean(age, na.rm=T))  # age normalized to mean of ALL rows

linelist %>% 
  group_by(hospital) %>% 
  mutate(age_norm = age / mean(age, na.rm=T))  # age normalized to mean of hospital group

2.3 Transform multiple columns

Using across()

# selected columns 
linelist <- linelist %>% 
  mutate(across(.cols = c(temp, ht_cm, wt_kg), .fns = as.character))

# all columns
linelist <- linelist %>% 
  mutate(across(.cols = everything(), .fns = as.character))


linelist <- linelist %>% 
  mutate(across(.col = contains('date'), .fns = as.character))

linelist <- linelist %>% 
  mutate(across(.cols = where(is.POSIXct), .fns = as.Date))

2.4 Recode values

Using dplyr::recode() function

# specific value
linelist <- linelist %>% 
  mutate(date_onset = dplyr::recode(date_onset, "2014-14-15" = "2014-04-15"))

# recode multiple values
table(linelist$hospital, useNA = "always")  # print table of all unique values, including missing  
## 
##                      Central Hopital                     Central Hospital 
##                                   11                                  457 
##                           Hospital A                           Hospital B 
##                                  290                                  289 
##                     Military Hopital                    Military Hospital 
##                                   32                                  798 
##                     Mitylira Hopital                    Mitylira Hospital 
##                                    1                                   79 
##                                Other                         Port Hopital 
##                                  907                                   48 
##                        Port Hospital St. Mark's Maternity Hospital (SMMH) 
##                                 1756                                  417 
##   St. Marks Maternity Hopital (SMMH)                                 <NA> 
##                                   11                                 1512
# fix incorrect values                   # old value       # new value
linelist <- linelist %>% 
  mutate(hospital = dplyr::recode(hospital,
                     # for reference: OLD = NEW
                      "Mitylira Hopital"  = "Military Hospital",
                      "Mitylira Hospital" = "Military Hospital",
                      "Military Hopital"  = "Military Hospital",
                      "Port Hopital"      = "Port Hospital",
                      "Central Hopital"   = "Central Hospital",
                      "other"             = "Other",
                      "St. Marks Maternity Hopital (SMMH)" = "St. Mark's Maternity Hospital (SMMH)"
                      ))

By logic: 1. Using simple logic: replace(), ifelse(), if_else(); 2. Using case_when().

# use replace() is changing just one value in one row, using an unique row identifier
# mutate(col_to_change = replace(col_to_change, criteria for rows, new value)).

linelist <- linelist %>% 
  mutate(gender = replace(gender, case_id == "2195", "Female"))

# Using base R
linelist$gender[linelist$case_id == "2195"] <- "Female"


# Using ifelse()
linelist <- linelist %>% 
  mutate(source_known = ifelse(!is.na(source),"known","unknown")) # the column source_known is defined. Its value in a given row is set to “known” if the row’s value in column source is not missing. If the value in source is missing, then the value in source_known is set to “unknown”.


# Case_when()

class(linelist$age)
## [1] "character"
linelist$age <- as.numeric(linelist$age)

linelist <- linelist %>% 
  mutate(age_years = dplyr::case_when(
       age_unit == "years"  ~ age,       # if age unit is years
       age_unit == "months" ~ age/12,    # if age unit is months, divide age by 12
       is.na(age_unit)      ~ age))      # if age unit is missing, assume years
                                         # any other circumstance, assign NA (missing)

linelist <- linelist %>% 
 mutate(age_years = dplyr::case_when(
          age_unit == "years" ~ age,
          age_unit == "months" ~ age/12,
          is.na(age_unit) ~ age,
          TRUE ~ NA_real_))


linelist <- linelist %>% 
     mutate(case_status = dplyr::case_when(
          
          # if patient had lab test and it is positive,
          # then they are marked as a confirmed case 
          ct_blood < 20                   ~ "Confirmed",
          
          # given that a patient does not have a positive lab result,
          # if patient has a "source" (epidemiological link) AND has fever, 
          # then they are marked as a suspect case
          !is.na(source) & fever == "yes" ~ "Suspect",
          
          # any other patient not addressed above 
          # is marked for follow up
          TRUE                            ~ "To investigate"))

Missing value

# replace_na() for class character 
class(linelist$hospital)
## [1] "character"
linelist <- linelist %>% 
  mutate(hospital = tidyr::replace_na(hospital, "Missing"))

# fct_explicit_na() for class factor
linelist <- linelist %>% 
  mutate(hospital = forcats::fct_explicit_na(hospital))
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `hospital = forcats::fct_explicit_na(hospital)`.
## Caused by warning:
## ! `fct_explicit_na()` was deprecated in forcats 1.0.0.
## ℹ Please use `fct_na_value_to_level()` instead.
# na_if() Convert specific value to NA
linelist <- linelist %>% 
  mutate(hospital = dplyr::na_if(hospital, 'Missing')) # convert value named 'missing' to 'NA'


# Note: na_if() cannot be used for logic criteria (e.g. “all values > 99”) - use replace() or case_when() for this:
linelist <- linelist %>% 
  mutate(temp = replace(temp, temp > 40, NA)) # convert temp above 40 to NA

linelist <- linelist %>% 
  mutate(date_onset = replace(date_onset, date_onset > as.Date("2000-01-01"), NA))

2.5 Cleaning Dictionary

For more details, please visit here

cleaning_dict <- import("cleaning_dict.csv")

linelist <- linelist %>% 
  matchmaker::match_df(
    dictionary = cleaning_dict, 
    from = "from",
    to = "to",
    by = "col"
  )

2.6 Numeric categories

Creating categories from numerical columns by using the list of packages: age_categories(), cut(), case_when(), and quantile(), ntile()

Let’s using age_categories():

# Check the class of variables 
class(linelist$age_years)
## [1] "numeric"
# Review distribution to make appropriate cut-points
hist(linelist$age_years)

summary(linelist$age_years, na.rm = T) # Yes, remove NA 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    0.00    6.00   13.00   16.04   23.00   84.00     107
# age_categories() 

# Simple example
################
pacman::p_load(epikit)                    # load package

linelist <- linelist %>% 
  mutate(
    age_cat = age_categories(             # create new column
      age_years,                            # numeric column to make groups from
      breakers = c(0, 5, 10, 15, 20,        # break points
                   30, 40, 50, 60, 70)))

# show table
table(linelist$age_cat, useNA = "always")
linelist_2 <- linelist %>% 
  mutate(
    age_cat = age_categories(
      age_years,
      breakers = c(0, 6, 11, 16, 21, 31, 41, 51, 61, 71)
    )
  )

table(linelist_2$age_cat, useNA = "always")
# With ceiling set to TRUE
linelist <- linelist %>% 
  mutate(
    age_cat = age_categories(
      age_years, 
      breakers = c(0, 5, 10, 15, 20, 30, 40, 50, 60, 70),
      ceiling = TRUE)) # 70 is ceiling, all above become NA

# show table
table(linelist$age_cat, useNA = "always")
linelist <- linelist %>% 
  mutate(
    age_cat = age_categories(
      age_years, 
      lower = 0,
      upper = 100,
      by = 10))

linelist <- linelist %>% 
  mutate(
    age_cat = age_categories(
      age_years, 
      breaker = seq(0, 85, 5))) # age categories: 0 to 85 by 5s

# show table
table(linelist$age_cat, useNA = "always")

How about cut():

# using Cut() 
linelist <- linelist %>% 
  mutate(
    age_cat = cut(
      age_years,
      breaks = c(0, 5, 10, 15, 20, 30, 50, 70, 100),
      include.lowest = TRUE  # include 0 in the lowest group 
    ))

table(linelist$age_cat, useNA = "always")
## 
##    [0,5]   (5,10]  (10,15]  (15,20]  (20,30]  (30,50]  (50,70] (70,100] 
##     1469     1195     1040      770     1149      778       94        6 
##     <NA> 
##      107
# Cross tabulation of the numeric and category 
head(table("Numeric Values" = linelist$age_years,
      "Categories" = linelist$age_cat,
      useNA = "always"),10)
##                     Categories
## Numeric Values       [0,5] (5,10] (10,15] (15,20] (20,30] (30,50] (50,70]
##   0                    136      0       0       0       0       0       0
##   0.0833333333333333     1      0       0       0       0       0       0
##   0.25                   2      0       0       0       0       0       0
##   0.333333333333333      6      0       0       0       0       0       0
##   0.416666666666667      1      0       0       0       0       0       0
##   0.5                    6      0       0       0       0       0       0
##   0.583333333333333      3      0       0       0       0       0       0
##   0.666666666666667      3      0       0       0       0       0       0
##   0.75                   3      0       0       0       0       0       0
##   0.833333333333333      1      0       0       0       0       0       0
##                     Categories
## Numeric Values       (70,100] <NA>
##   0                         0    0
##   0.0833333333333333        0    0
##   0.25                      0    0
##   0.333333333333333         0    0
##   0.416666666666667         0    0
##   0.5                       0    0
##   0.583333333333333         0    0
##   0.666666666666667         0    0
##   0.75                      0    0
##   0.833333333333333         0    0
# Relabeling NA values
linelist <- linelist %>% 
  mutate(age_cat = cut(
    age_years,
    breaks = c(0, 5, 10, 15, 20, 30, 50, 70, 100),
    right = FALSE,
    include.lowest = TRUE,
    labels = c("0-4", "5-9", "10-14", "15-19", "20-29", "30-49",
               "50-69", "70-100")),
    
    # make missing values explicit
    age_cat = fct_explicit_na(
      age_cat,
      na_level = "Missing age"
    ))

table(linelist$age_cat, useNA = "always")

2.7 Filter row

# Filter out missing values 
linelist %>% 
  tidyr::drop_na(case_id, age_years) # drop rows with missing values for case_id or age_years

# Filter by row number
# View first 100 rows
linelist %>% head(100)     # or use tail() to see the n last rows

# Show row 5 only
linelist %>% filter(row_number() == 5)

# View rows 2 through 20, and three specific columns
linelist %>% filter(row_number() %in% 2:20) %>% select(date_onset, outcome, age)
# Complex filter

hist(linelist$date_onset, breaks = 50)

Can we just filter by date_onset to rows after June 2013? Caution! Applying the code filter(date_onset > as.Date(“2013-06-01”))) would remove any rows in the later epidemic with a missing date of onset!

DANGER: Filtering to greater than (>) or less than (<) a date or number can remove any rows with missing values (NA)! This is because NA is treated as infinitely large and small.

table(Hospital  = linelist$hospital,                     # hospital name
      YearOnset = lubridate::year(linelist$date_onset),  # year of date_onset
      useNA     = "always")                              # show missing values
linelist <- linelist %>% 
  # keep rows where onset is after 1 June 2013 OR where onset is missing and it was a hospital OTHER than Hospital A or B
  filter(date_onset > as.Date("2013-06-01") | (is.na(date_onset) & !hospital %in% c("Hospital A", "Hospital B")))

nrow(linelist)

table(Hospital = linelist$hospital,
      Yearonset = lubridate::year(linelist$date_onset),
      useNA = "always")

Standalone

# dataframe <- filter(dataframe, condition(s) for rows to keep)
linelist <- filter(linelist, !is.na(case_id))

linelist <- linelist[!is.na(case_id), ] # or subset using base R 

Quickly review records

View(linelist)

View(linelist %>% 
       filter(case_id %in% c("11f8ea", "76b97a", "47a5f5")) %>% 
       select(date_onset, date_hospitalisation)
     )

2.8 Row-wise calculations

dplyr::row-wise perform a calculation within a row.

linelist %>%
  dplyr::rowwise() %>% 
  mutate(num_symptoms = sum(c(fever, chills, cough, aches, vomit) == "yes")) %>% 
           ungroup() %>% 
  select(fever, chills, cough, aches, vomit, num_symptoms) # for display
linelist %>% 
  dplyr::rowwise() %>% 
  mutate(num_NA_dates = sum(is.na(c_across(contains("date"))))) %>% 
  ungroup() %>% 
  select(num_NA_dates, contains("date"))

Get the latest or most recent date

linelist %>% 
  dplyr::rowwise() %>% 
  mutate(latest_date = max(c_across(contains("date")), na.rm = T)) %>%  # what happens if not specify na.rm = T? 
  ungroup() %>% 
  select(latest_date, contains("date"))