Data Management

1. Clean data and core function

1.1 Cleaning pipeline

In a cleaning pipeline the order of the steps is important. Cleaning steps might include: 1. Importing of data; 2. Column names cleaned or changed; 3. De-duplication; 4. Column creation and transformation (e.g. re-coding or standardising values); 5. Rows filtered or added.

1.3 Import

# import
linelist_raw <- import("linelist_raw.xlsx")

## New names:
## • `` -> `...28`

# review
head(linelist_raw, 5)

##   case_id generation infection date date onset  hosp date date_of_outcome
## 1  5fe599          4     2014-05-08 2014-05-13 2014-05-15            <NA>
## 2  8689b7          4           <NA> 2014-05-13 2014-05-14      2014-05-18
## 3  11f8ea          2           <NA> 2014-05-16 2014-05-18      2014-05-30
## 4  b8812a          3     2014-05-04 2014-05-18 2014-05-20            <NA>
## 5  893f25          3     2014-05-18 2014-05-21 2014-05-22      2014-05-29
##   outcome gender                             hospital       lon      lat
## 1    <NA>      m                                Other -13.21574 8.468973
## 2 Recover      f                                 <NA> -13.21523 8.451719
## 3 Recover      m St. Mark's Maternity Hospital (SMMH) -13.21291 8.464817
## 4    <NA>      f                        Port Hospital -13.23637 8.475476
## 5 Recover      m                    Military Hospital -13.22286 8.460824
##   infector source age age_unit row_num wt_kg ht_cm ct_blood fever chills cough
## 1   f547d6  other   2    years       1    27    48       22    no     no   yes
## 2     <NA>   <NA>   3    years       2    25    59       22  <NA>   <NA>  <NA>
## 3     <NA>   <NA>  56    years       3    91   238       21  <NA>   <NA>  <NA>
## 4   f90f5f  other  18    years       4    41   135       23    no     no    no
## 5   11f8ea  other   3    years       5    36    71       23    no     no   yes
##   aches vomit temp time_admission merged_header ...28
## 1    no   yes 36.8           <NA>             a     b
## 2  <NA>  <NA> 36.9          09:36             a     b
## 3  <NA>  <NA> 36.9          16:48             a     b
## 4    no    no 36.8          11:22             a     b
## 5    no   yes 36.9          12:60             a     b

# skim
skimr::skim(linelist_raw)

Data summary
Name	linelist_raw
Number of rows	6611
Number of columns	28
_______________________
Column type frequency:
character	17
numeric	8
POSIXct	3
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	n_unique
case_id	137	0.98	6	6	5888
date onset	293	0.96	10	10	580
outcome	1500	0.77	5	7	2
gender	324	0.95	1	1	2
hospital	1512	0.77	5	36	13
infector	2323	0.65	6	6	2697
source	2323	0.65	5	7	2
age	107	0.98	1	2	75
age_unit	7	1.00	5	6	2
fever	258	0.96	2	3	2
chills	258	0.96	2	3	2
cough	258	0.96	2	3	2
aches	258	0.96	2	3	2
vomit	258	0.96	2	3	2
time_admission	844	0.87	5	5	1091
merged_header	0	1.00	1	1	1
…28	0	1.00	1	1	1

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
generation	7	1.00	16.60	5.71	0.00	13.00	16.00	20.00	37.00	▁▆▇▂▁
lon	7	1.00	-13.23	0.02	-13.27	-13.25	-13.23	-13.22	-13.21	▅▃▃▅▇
lat	7	1.00	8.47	0.01	8.45	8.46	8.47	8.48	8.49	▅▇▇▇▆
row_num	0	1.00	3240.91	1857.83	1.00	1647.50	3241.00	4836.50	6481.00	▇▇▇▇▇
wt_kg	7	1.00	52.69	18.59	-11.00	41.00	54.00	66.00	111.00	▁▃▇▅▁
ht_cm	7	1.00	125.25	49.57	4.00	91.00	130.00	159.00	295.00	▂▅▇▂▁
ct_blood	7	1.00	21.26	1.67	16.00	20.00	22.00	22.00	26.00	▁▃▇▃▁
temp	158	0.98	38.60	0.95	35.20	38.30	38.80	39.20	40.80	▁▂▂▇▁

Variable type: POSIXct

skim_variable	n_missing	complete_rate	min	max	median	n_unique
infection date	2322	0.65	2012-04-09	2015-04-27	2014-10-04	538
hosp date	7	1.00	2012-04-20	2015-04-30	2014-10-15	570
date_of_outcome	1068	0.84	2012-05-14	2015-06-04	2014-10-26	575

1.4 Column names: cleaning & rename

Cleaning column names

# review column names 
names(linelist_raw)   # colum 28,  "...28"

##  [1] "case_id"         "generation"      "infection date"  "date onset"     
##  [5] "hosp date"       "date_of_outcome" "outcome"         "gender"         
##  [9] "hospital"        "lon"             "lat"             "infector"       
## [13] "source"          "age"             "age_unit"        "row_num"        
## [17] "wt_kg"           "ht_cm"           "ct_blood"        "fever"          
## [21] "chills"          "cough"           "aches"           "vomit"          
## [25] "temp"            "time_admission"  "merged_header"   "...28"

# Automatic cleaning 
linelist <- linelist_raw %>% 
  janitor::clean_names()

# see the new column names
names(linelist)      # column 28,  "x28" , the three dot dot dot is removed

##  [1] "case_id"         "generation"      "infection_date"  "date_onset"     
##  [5] "hosp_date"       "date_of_outcome" "outcome"         "gender"         
##  [9] "hospital"        "lon"             "lat"             "infector"       
## [13] "source"          "age"             "age_unit"        "row_num"        
## [17] "wt_kg"           "ht_cm"           "ct_blood"        "fever"          
## [21] "chills"          "cough"           "aches"           "vomit"          
## [25] "temp"            "time_admission"  "merged_header"   "x28"

Manunal re-name column names

linelist <- linelist_raw %>%
    
    # standardize column name syntax
    janitor::clean_names() %>% 
    
    # manually re-name columns
           # NEW name             # OLD name
    dplyr::rename(date_infection       = infection_date,
           date_hospitalisation = hosp_date,
           date_outcome         = date_of_outcome)


linelist_raw %>% 
  select(# NEW name             # OLD name
         date_infection       = `infection date`,    # rename and KEEP ONLY these columns
         date_hospitalisation = `hosp date`)

1.5 Column names: select or re-order

names(linelist_raw)

##  [1] "case_id"         "generation"      "infection date"  "date onset"     
##  [5] "hosp date"       "date_of_outcome" "outcome"         "gender"         
##  [9] "hospital"        "lon"             "lat"             "infector"       
## [13] "source"          "age"             "age_unit"        "row_num"        
## [17] "wt_kg"           "ht_cm"           "ct_blood"        "fever"          
## [21] "chills"          "cough"           "aches"           "vomit"          
## [25] "temp"            "time_admission"  "merged_header"   "...28"

# linelist dataset is piped through select() command, and names() prints just the column names
linelist %>% 
  select(case_id, date_onset, date_hospitalisation, fever) %>% 
  names()  # display the column names

## [1] "case_id"              "date_onset"           "date_hospitalisation"
## [4] "fever"

# move date_onset and date_hospitalisation to beginning
linelist %>% 
  select(date_onset, date_hospitalisation, everything()) %>% 
  names()

##  [1] "date_onset"           "date_hospitalisation" "case_id"             
##  [4] "generation"           "date_infection"       "date_outcome"        
##  [7] "outcome"              "gender"               "hospital"            
## [10] "lon"                  "lat"                  "infector"            
## [13] "source"               "age"                  "age_unit"            
## [16] "row_num"              "wt_kg"                "ht_cm"               
## [19] "ct_blood"             "fever"                "chills"              
## [22] "cough"                "aches"                "vomit"               
## [25] "temp"                 "time_admission"       "merged_header"       
## [28] "x28"

1.6 Others

# select columns that are class Numeric
linelist %>% 
  select(where(is.numeric)) %>% 
  names()

## [1] "generation" "lon"        "lat"        "row_num"    "wt_kg"     
## [6] "ht_cm"      "ct_blood"   "temp"

# select columns containing certain characters
linelist %>% 
  select(contains("date")) %>% 
  names()

## [1] "date_infection"       "date_onset"           "date_hospitalisation"
## [4] "date_outcome"

# searched for multiple character matches
linelist %>% 
  select(matches("onset|hosp|fev")) %>%   # note the OR symbol "|"
  names()

## [1] "date_onset"           "date_hospitalisation" "hospital"            
## [4] "fever"

# any_of 
linelist %>% 
  select(any_of(c("date_onset", "village_origin", "village_detection", "village_residence", "village_travel"))) %>% 
  names()

## [1] "date_onset"

1.7 Remove column

linelist %>% 
  select(-c(date_onset, fever:vomit)) %>% # remove date_onset and all columns from fever to vomit
  names()

##  [1] "case_id"              "generation"           "date_infection"      
##  [4] "date_hospitalisation" "date_outcome"         "outcome"             
##  [7] "gender"               "hospital"             "lon"                 
## [10] "lat"                  "infector"             "source"              
## [13] "age"                  "age_unit"             "row_num"             
## [16] "wt_kg"                "ht_cm"                "ct_blood"            
## [19] "temp"                 "time_admission"       "merged_header"       
## [22] "x28"

linelist$date_onset <- NULL   # deletes column with base R syntax

# Adding to pipe chain 

linelist <- linelist_raw %>%
    
    # standardize column name syntax
    janitor::clean_names() %>% 
    
    # manually re-name columns
           # NEW name             # OLD name
    dplyr::rename(date_infection       = infection_date,
           date_hospitalisation = hosp_date,
           date_outcome         = date_of_outcome) %>% 

    # remove column
    select(-c(row_num, merged_header, x28)) %>% 

    # Re-order column with everything()
  
    # duplication
    dplyr::distinct() # removes rows that are 100% duplicates

2. Column creation and transformation

2.1 Create new column

linelist <- linelist %>% 
  mutate(bmi = wt_kg / (ht_cm/100)^2)

new_col_demo <- linelist %>%                       
  mutate(
    new_var_dup    = case_id,             # new column = duplicate/copy another existing column
    new_var_static = 7,                   # new column = all values the same
    new_var_static = new_var_static + 5,  # you can overwrite a column, and it can be a calculation using other variables
    new_var_paste  = stringr::str_glue("{hospital} on ({date_hospitalisation})") # new column = pasting together values from other columns
    ) %>% 
  select(case_id, hospital, date_hospitalisation, contains("new"))        # show only new columns, for demonstration purposes

2.2 Convert class & grpup column

# Convert class  
class(linelist$age)
linelist <- linelist %>% 
  mutate(age = as.numeric(age))

# Group
linelist %>% 
  mutate(age_norm = age / mean(age, na.rm=T))  # age normalized to mean of ALL rows

linelist %>% 
  group_by(hospital) %>% 
  mutate(age_norm = age / mean(age, na.rm=T))  # age normalized to mean of hospital group

2.3 Transform multiple columns

Using across()

# selected columns 
linelist <- linelist %>% 
  mutate(across(.cols = c(temp, ht_cm, wt_kg), .fns = as.character))

# all columns
linelist <- linelist %>% 
  mutate(across(.cols = everything(), .fns = as.character))


linelist <- linelist %>% 
  mutate(across(.col = contains('date'), .fns = as.character))

linelist <- linelist %>% 
  mutate(across(.cols = where(is.POSIXct), .fns = as.Date))

2.4 Recode values

Using dplyr::recode() function

# specific value
linelist <- linelist %>% 
  mutate(date_onset = dplyr::recode(date_onset, "2014-14-15" = "2014-04-15"))

# recode multiple values
table(linelist$hospital, useNA = "always")  # print table of all unique values, including missing

## 
##                      Central Hopital                     Central Hospital 
##                                   11                                  457 
##                           Hospital A                           Hospital B 
##                                  290                                  289 
##                     Military Hopital                    Military Hospital 
##                                   32                                  798 
##                     Mitylira Hopital                    Mitylira Hospital 
##                                    1                                   79 
##                                Other                         Port Hopital 
##                                  907                                   48 
##                        Port Hospital St. Mark's Maternity Hospital (SMMH) 
##                                 1756                                  417 
##   St. Marks Maternity Hopital (SMMH)                                 <NA> 
##                                   11                                 1512

# fix incorrect values                   # old value       # new value
linelist <- linelist %>% 
  mutate(hospital = dplyr::recode(hospital,
                     # for reference: OLD = NEW
                      "Mitylira Hopital"  = "Military Hospital",
                      "Mitylira Hospital" = "Military Hospital",
                      "Military Hopital"  = "Military Hospital",
                      "Port Hopital"      = "Port Hospital",
                      "Central Hopital"   = "Central Hospital",
                      "other"             = "Other",
                      "St. Marks Maternity Hopital (SMMH)" = "St. Mark's Maternity Hospital (SMMH)"
                      ))

By logic: 1. Using simple logic: replace(), ifelse(), if_else(); 2. Using case_when().

# use replace() is changing just one value in one row, using an unique row identifier
# mutate(col_to_change = replace(col_to_change, criteria for rows, new value)).

linelist <- linelist %>% 
  mutate(gender = replace(gender, case_id == "2195", "Female"))

# Using base R
linelist$gender[linelist$case_id == "2195"] <- "Female"


# Using ifelse()
linelist <- linelist %>% 
  mutate(source_known = ifelse(!is.na(source),"known","unknown")) # the column source_known is defined. Its value in a given row is set to “known” if the row’s value in column source is not missing. If the value in source is missing, then the value in source_known is set to “unknown”.


# Case_when()

class(linelist$age)

## [1] "character"

linelist$age <- as.numeric(linelist$age)

linelist <- linelist %>% 
  mutate(age_years = dplyr::case_when(
       age_unit == "years"  ~ age,       # if age unit is years
       age_unit == "months" ~ age/12,    # if age unit is months, divide age by 12
       is.na(age_unit)      ~ age))      # if age unit is missing, assume years
                                         # any other circumstance, assign NA (missing)

linelist <- linelist %>% 
 mutate(age_years = dplyr::case_when(
          age_unit == "years" ~ age,
          age_unit == "months" ~ age/12,
          is.na(age_unit) ~ age,
          TRUE ~ NA_real_))


linelist <- linelist %>% 
     mutate(case_status = dplyr::case_when(
          
          # if patient had lab test and it is positive,
          # then they are marked as a confirmed case 
          ct_blood < 20                   ~ "Confirmed",
          
          # given that a patient does not have a positive lab result,
          # if patient has a "source" (epidemiological link) AND has fever, 
          # then they are marked as a suspect case
          !is.na(source) & fever == "yes" ~ "Suspect",
          
          # any other patient not addressed above 
          # is marked for follow up
          TRUE                            ~ "To investigate"))

Missing value

# replace_na() for class character 
class(linelist$hospital)

## [1] "character"

linelist <- linelist %>% 
  mutate(hospital = tidyr::replace_na(hospital, "Missing"))

# fct_explicit_na() for class factor
linelist <- linelist %>% 
  mutate(hospital = forcats::fct_explicit_na(hospital))

## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `hospital = forcats::fct_explicit_na(hospital)`.
## Caused by warning:
## ! `fct_explicit_na()` was deprecated in forcats 1.0.0.
## ℹ Please use `fct_na_value_to_level()` instead.

# na_if() Convert specific value to NA
linelist <- linelist %>% 
  mutate(hospital = dplyr::na_if(hospital, 'Missing')) # convert value named 'missing' to 'NA'


# Note: na_if() cannot be used for logic criteria (e.g. “all values > 99”) - use replace() or case_when() for this:
linelist <- linelist %>% 
  mutate(temp = replace(temp, temp > 40, NA)) # convert temp above 40 to NA

linelist <- linelist %>% 
  mutate(date_onset = replace(date_onset, date_onset > as.Date("2000-01-01"), NA))

2.5 Cleaning Dictionary

For more details, please visit here

cleaning_dict <- import("cleaning_dict.csv")

linelist <- linelist %>% 
  matchmaker::match_df(
    dictionary = cleaning_dict, 
    from = "from",
    to = "to",
    by = "col"
  )

2.6 Numeric categories

Creating categories from numerical columns by using the list of packages: age_categories(), cut(), case_when(), and quantile(), ntile()

Let’s using age_categories():

# Check the class of variables 
class(linelist$age_years)

## [1] "numeric"

# Review distribution to make appropriate cut-points
hist(linelist$age_years)

summary(linelist$age_years, na.rm = T) # Yes, remove NA

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    0.00    6.00   13.00   16.04   23.00   84.00     107

# age_categories() 

# Simple example
################
pacman::p_load(epikit)                    # load package

linelist <- linelist %>% 
  mutate(
    age_cat = age_categories(             # create new column
      age_years,                            # numeric column to make groups from
      breakers = c(0, 5, 10, 15, 20,        # break points
                   30, 40, 50, 60, 70)))

# show table
table(linelist$age_cat, useNA = "always")

linelist_2 <- linelist %>% 
  mutate(
    age_cat = age_categories(
      age_years,
      breakers = c(0, 6, 11, 16, 21, 31, 41, 51, 61, 71)
    )
  )

table(linelist_2$age_cat, useNA = "always")

# With ceiling set to TRUE
linelist <- linelist %>% 
  mutate(
    age_cat = age_categories(
      age_years, 
      breakers = c(0, 5, 10, 15, 20, 30, 40, 50, 60, 70),
      ceiling = TRUE)) # 70 is ceiling, all above become NA

# show table
table(linelist$age_cat, useNA = "always")

linelist <- linelist %>% 
  mutate(
    age_cat = age_categories(
      age_years, 
      lower = 0,
      upper = 100,
      by = 10))

linelist <- linelist %>% 
  mutate(
    age_cat = age_categories(
      age_years, 
      breaker = seq(0, 85, 5))) # age categories: 0 to 85 by 5s

# show table
table(linelist$age_cat, useNA = "always")

How about cut():

# using Cut() 
linelist <- linelist %>% 
  mutate(
    age_cat = cut(
      age_years,
      breaks = c(0, 5, 10, 15, 20, 30, 50, 70, 100),
      include.lowest = TRUE  # include 0 in the lowest group 
    ))

table(linelist$age_cat, useNA = "always")

## 
##    [0,5]   (5,10]  (10,15]  (15,20]  (20,30]  (30,50]  (50,70] (70,100] 
##     1469     1195     1040      770     1149      778       94        6 
##     <NA> 
##      107

# Cross tabulation of the numeric and category 
head(table("Numeric Values" = linelist$age_years,
      "Categories" = linelist$age_cat,
      useNA = "always"),10)

##                     Categories
## Numeric Values       [0,5] (5,10] (10,15] (15,20] (20,30] (30,50] (50,70]
##   0                    136      0       0       0       0       0       0
##   0.0833333333333333     1      0       0       0       0       0       0
##   0.25                   2      0       0       0       0       0       0
##   0.333333333333333      6      0       0       0       0       0       0
##   0.416666666666667      1      0       0       0       0       0       0
##   0.5                    6      0       0       0       0       0       0
##   0.583333333333333      3      0       0       0       0       0       0
##   0.666666666666667      3      0       0       0       0       0       0
##   0.75                   3      0       0       0       0       0       0
##   0.833333333333333      1      0       0       0       0       0       0
##                     Categories
## Numeric Values       (70,100] <NA>
##   0                         0    0
##   0.0833333333333333        0    0
##   0.25                      0    0
##   0.333333333333333         0    0
##   0.416666666666667         0    0
##   0.5                       0    0
##   0.583333333333333         0    0
##   0.666666666666667         0    0
##   0.75                      0    0
##   0.833333333333333         0    0

# Relabeling NA values
linelist <- linelist %>% 
  mutate(age_cat = cut(
    age_years,
    breaks = c(0, 5, 10, 15, 20, 30, 50, 70, 100),
    right = FALSE,
    include.lowest = TRUE,
    labels = c("0-4", "5-9", "10-14", "15-19", "20-29", "30-49",
               "50-69", "70-100")),
    
    # make missing values explicit
    age_cat = fct_explicit_na(
      age_cat,
      na_level = "Missing age"
    ))

table(linelist$age_cat, useNA = "always")

2.7 Filter row

# Filter out missing values 
linelist %>% 
  tidyr::drop_na(case_id, age_years) # drop rows with missing values for case_id or age_years

# Filter by row number
# View first 100 rows
linelist %>% head(100)     # or use tail() to see the n last rows

# Show row 5 only
linelist %>% filter(row_number() == 5)

# View rows 2 through 20, and three specific columns
linelist %>% filter(row_number() %in% 2:20) %>% select(date_onset, outcome, age)

# Complex filter

hist(linelist$date_onset, breaks = 50)

Can we just filter by date_onset to rows after June 2013? Caution! Applying the code filter(date_onset > as.Date(“2013-06-01”))) would remove any rows in the later epidemic with a missing date of onset!

DANGER: Filtering to greater than (>) or less than (<) a date or number can remove any rows with missing values (NA)! This is because NA is treated as infinitely large and small.

table(Hospital  = linelist$hospital,                     # hospital name
      YearOnset = lubridate::year(linelist$date_onset),  # year of date_onset
      useNA     = "always")                              # show missing values

linelist <- linelist %>% 
  # keep rows where onset is after 1 June 2013 OR where onset is missing and it was a hospital OTHER than Hospital A or B
  filter(date_onset > as.Date("2013-06-01") | (is.na(date_onset) & !hospital %in% c("Hospital A", "Hospital B")))

nrow(linelist)

table(Hospital = linelist$hospital,
      Yearonset = lubridate::year(linelist$date_onset),
      useNA = "always")

Standalone

# dataframe <- filter(dataframe, condition(s) for rows to keep)
linelist <- filter(linelist, !is.na(case_id))

linelist <- linelist[!is.na(case_id), ] # or subset using base R

Quickly review records

View(linelist)

View(linelist %>% 
       filter(case_id %in% c("11f8ea", "76b97a", "47a5f5")) %>% 
       select(date_onset, date_hospitalisation)
     )

2.8 Row-wise calculations

dplyr::row-wise perform a calculation within a row.

linelist %>%
  dplyr::rowwise() %>% 
  mutate(num_symptoms = sum(c(fever, chills, cough, aches, vomit) == "yes")) %>% 
           ungroup() %>% 
  select(fever, chills, cough, aches, vomit, num_symptoms) # for display

linelist %>% 
  dplyr::rowwise() %>% 
  mutate(num_NA_dates = sum(is.na(c_across(contains("date"))))) %>% 
  ungroup() %>% 
  select(num_NA_dates, contains("date"))

Get the latest or most recent date

linelist %>% 
  dplyr::rowwise() %>% 
  mutate(latest_date = max(c_across(contains("date")), na.rm = T)) %>%  # what happens if not specify na.rm = T? 
  ungroup() %>% 
  select(latest_date, contains("date"))

Data management with R (1)

2023-04-15

Objective