pacman::p_load(
rio, # importing data
here, # relative file pathways
janitor, # data cleaning and tables
lubridate, # working with dates
matchmaker, # dictionary-based cleaning
epikit, # age_categories() function
tidyverse, # data management and visualization
dplyr
)
##
## epikit installed
## also installing the dependency 'readxl'
##
## tidyverse installed
In a cleaning pipeline the order of the steps is important. Cleaning steps might include: 1. Importing of data; 2. Column names cleaned or changed; 3. De-duplication; 4. Column creation and transformation (e.g. re-coding or standardising values); 5. Rows filtered or added.
# import
linelist_raw <- import("linelist_raw.xlsx")
## New names:
## • `` -> `...28`
# review
head(linelist_raw, 5)
## case_id generation infection date date onset hosp date date_of_outcome
## 1 5fe599 4 2014-05-08 2014-05-13 2014-05-15 <NA>
## 2 8689b7 4 <NA> 2014-05-13 2014-05-14 2014-05-18
## 3 11f8ea 2 <NA> 2014-05-16 2014-05-18 2014-05-30
## 4 b8812a 3 2014-05-04 2014-05-18 2014-05-20 <NA>
## 5 893f25 3 2014-05-18 2014-05-21 2014-05-22 2014-05-29
## outcome gender hospital lon lat
## 1 <NA> m Other -13.21574 8.468973
## 2 Recover f <NA> -13.21523 8.451719
## 3 Recover m St. Mark's Maternity Hospital (SMMH) -13.21291 8.464817
## 4 <NA> f Port Hospital -13.23637 8.475476
## 5 Recover m Military Hospital -13.22286 8.460824
## infector source age age_unit row_num wt_kg ht_cm ct_blood fever chills cough
## 1 f547d6 other 2 years 1 27 48 22 no no yes
## 2 <NA> <NA> 3 years 2 25 59 22 <NA> <NA> <NA>
## 3 <NA> <NA> 56 years 3 91 238 21 <NA> <NA> <NA>
## 4 f90f5f other 18 years 4 41 135 23 no no no
## 5 11f8ea other 3 years 5 36 71 23 no no yes
## aches vomit temp time_admission merged_header ...28
## 1 no yes 36.8 <NA> a b
## 2 <NA> <NA> 36.9 09:36 a b
## 3 <NA> <NA> 36.9 16:48 a b
## 4 no no 36.8 11:22 a b
## 5 no yes 36.9 12:60 a b
# skim
skimr::skim(linelist_raw)
| Name | linelist_raw |
| Number of rows | 6611 |
| Number of columns | 28 |
| _______________________ | |
| Column type frequency: | |
| character | 17 |
| numeric | 8 |
| POSIXct | 3 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| case_id | 137 | 0.98 | 6 | 6 | 0 | 5888 | 0 |
| date onset | 293 | 0.96 | 10 | 10 | 0 | 580 | 0 |
| outcome | 1500 | 0.77 | 5 | 7 | 0 | 2 | 0 |
| gender | 324 | 0.95 | 1 | 1 | 0 | 2 | 0 |
| hospital | 1512 | 0.77 | 5 | 36 | 0 | 13 | 0 |
| infector | 2323 | 0.65 | 6 | 6 | 0 | 2697 | 0 |
| source | 2323 | 0.65 | 5 | 7 | 0 | 2 | 0 |
| age | 107 | 0.98 | 1 | 2 | 0 | 75 | 0 |
| age_unit | 7 | 1.00 | 5 | 6 | 0 | 2 | 0 |
| fever | 258 | 0.96 | 2 | 3 | 0 | 2 | 0 |
| chills | 258 | 0.96 | 2 | 3 | 0 | 2 | 0 |
| cough | 258 | 0.96 | 2 | 3 | 0 | 2 | 0 |
| aches | 258 | 0.96 | 2 | 3 | 0 | 2 | 0 |
| vomit | 258 | 0.96 | 2 | 3 | 0 | 2 | 0 |
| time_admission | 844 | 0.87 | 5 | 5 | 0 | 1091 | 0 |
| merged_header | 0 | 1.00 | 1 | 1 | 0 | 1 | 0 |
| …28 | 0 | 1.00 | 1 | 1 | 0 | 1 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| generation | 7 | 1.00 | 16.60 | 5.71 | 0.00 | 13.00 | 16.00 | 20.00 | 37.00 | ▁▆▇▂▁ |
| lon | 7 | 1.00 | -13.23 | 0.02 | -13.27 | -13.25 | -13.23 | -13.22 | -13.21 | ▅▃▃▅▇ |
| lat | 7 | 1.00 | 8.47 | 0.01 | 8.45 | 8.46 | 8.47 | 8.48 | 8.49 | ▅▇▇▇▆ |
| row_num | 0 | 1.00 | 3240.91 | 1857.83 | 1.00 | 1647.50 | 3241.00 | 4836.50 | 6481.00 | ▇▇▇▇▇ |
| wt_kg | 7 | 1.00 | 52.69 | 18.59 | -11.00 | 41.00 | 54.00 | 66.00 | 111.00 | ▁▃▇▅▁ |
| ht_cm | 7 | 1.00 | 125.25 | 49.57 | 4.00 | 91.00 | 130.00 | 159.00 | 295.00 | ▂▅▇▂▁ |
| ct_blood | 7 | 1.00 | 21.26 | 1.67 | 16.00 | 20.00 | 22.00 | 22.00 | 26.00 | ▁▃▇▃▁ |
| temp | 158 | 0.98 | 38.60 | 0.95 | 35.20 | 38.30 | 38.80 | 39.20 | 40.80 | ▁▂▂▇▁ |
Variable type: POSIXct
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| infection date | 2322 | 0.65 | 2012-04-09 | 2015-04-27 | 2014-10-04 | 538 |
| hosp date | 7 | 1.00 | 2012-04-20 | 2015-04-30 | 2014-10-15 | 570 |
| date_of_outcome | 1068 | 0.84 | 2012-05-14 | 2015-06-04 | 2014-10-26 | 575 |
Cleaning column names
# review column names
names(linelist_raw) # colum 28, "...28"
## [1] "case_id" "generation" "infection date" "date onset"
## [5] "hosp date" "date_of_outcome" "outcome" "gender"
## [9] "hospital" "lon" "lat" "infector"
## [13] "source" "age" "age_unit" "row_num"
## [17] "wt_kg" "ht_cm" "ct_blood" "fever"
## [21] "chills" "cough" "aches" "vomit"
## [25] "temp" "time_admission" "merged_header" "...28"
# Automatic cleaning
linelist <- linelist_raw %>%
janitor::clean_names()
# see the new column names
names(linelist) # column 28, "x28" , the three dot dot dot is removed
## [1] "case_id" "generation" "infection_date" "date_onset"
## [5] "hosp_date" "date_of_outcome" "outcome" "gender"
## [9] "hospital" "lon" "lat" "infector"
## [13] "source" "age" "age_unit" "row_num"
## [17] "wt_kg" "ht_cm" "ct_blood" "fever"
## [21] "chills" "cough" "aches" "vomit"
## [25] "temp" "time_admission" "merged_header" "x28"
Manunal re-name column names
linelist <- linelist_raw %>%
# standardize column name syntax
janitor::clean_names() %>%
# manually re-name columns
# NEW name # OLD name
dplyr::rename(date_infection = infection_date,
date_hospitalisation = hosp_date,
date_outcome = date_of_outcome)
linelist_raw %>%
select(# NEW name # OLD name
date_infection = `infection date`, # rename and KEEP ONLY these columns
date_hospitalisation = `hosp date`)
names(linelist_raw)
## [1] "case_id" "generation" "infection date" "date onset"
## [5] "hosp date" "date_of_outcome" "outcome" "gender"
## [9] "hospital" "lon" "lat" "infector"
## [13] "source" "age" "age_unit" "row_num"
## [17] "wt_kg" "ht_cm" "ct_blood" "fever"
## [21] "chills" "cough" "aches" "vomit"
## [25] "temp" "time_admission" "merged_header" "...28"
# linelist dataset is piped through select() command, and names() prints just the column names
linelist %>%
select(case_id, date_onset, date_hospitalisation, fever) %>%
names() # display the column names
## [1] "case_id" "date_onset" "date_hospitalisation"
## [4] "fever"
# move date_onset and date_hospitalisation to beginning
linelist %>%
select(date_onset, date_hospitalisation, everything()) %>%
names()
## [1] "date_onset" "date_hospitalisation" "case_id"
## [4] "generation" "date_infection" "date_outcome"
## [7] "outcome" "gender" "hospital"
## [10] "lon" "lat" "infector"
## [13] "source" "age" "age_unit"
## [16] "row_num" "wt_kg" "ht_cm"
## [19] "ct_blood" "fever" "chills"
## [22] "cough" "aches" "vomit"
## [25] "temp" "time_admission" "merged_header"
## [28] "x28"
# select columns that are class Numeric
linelist %>%
select(where(is.numeric)) %>%
names()
## [1] "generation" "lon" "lat" "row_num" "wt_kg"
## [6] "ht_cm" "ct_blood" "temp"
# select columns containing certain characters
linelist %>%
select(contains("date")) %>%
names()
## [1] "date_infection" "date_onset" "date_hospitalisation"
## [4] "date_outcome"
# searched for multiple character matches
linelist %>%
select(matches("onset|hosp|fev")) %>% # note the OR symbol "|"
names()
## [1] "date_onset" "date_hospitalisation" "hospital"
## [4] "fever"
# any_of
linelist %>%
select(any_of(c("date_onset", "village_origin", "village_detection", "village_residence", "village_travel"))) %>%
names()
## [1] "date_onset"
linelist %>%
select(-c(date_onset, fever:vomit)) %>% # remove date_onset and all columns from fever to vomit
names()
## [1] "case_id" "generation" "date_infection"
## [4] "date_hospitalisation" "date_outcome" "outcome"
## [7] "gender" "hospital" "lon"
## [10] "lat" "infector" "source"
## [13] "age" "age_unit" "row_num"
## [16] "wt_kg" "ht_cm" "ct_blood"
## [19] "temp" "time_admission" "merged_header"
## [22] "x28"
linelist$date_onset <- NULL # deletes column with base R syntax
# Adding to pipe chain
linelist <- linelist_raw %>%
# standardize column name syntax
janitor::clean_names() %>%
# manually re-name columns
# NEW name # OLD name
dplyr::rename(date_infection = infection_date,
date_hospitalisation = hosp_date,
date_outcome = date_of_outcome) %>%
# remove column
select(-c(row_num, merged_header, x28)) %>%
# Re-order column with everything()
# duplication
dplyr::distinct() # removes rows that are 100% duplicates
linelist <- linelist %>%
mutate(bmi = wt_kg / (ht_cm/100)^2)
new_col_demo <- linelist %>%
mutate(
new_var_dup = case_id, # new column = duplicate/copy another existing column
new_var_static = 7, # new column = all values the same
new_var_static = new_var_static + 5, # you can overwrite a column, and it can be a calculation using other variables
new_var_paste = stringr::str_glue("{hospital} on ({date_hospitalisation})") # new column = pasting together values from other columns
) %>%
select(case_id, hospital, date_hospitalisation, contains("new")) # show only new columns, for demonstration purposes
# Convert class
class(linelist$age)
linelist <- linelist %>%
mutate(age = as.numeric(age))
# Group
linelist %>%
mutate(age_norm = age / mean(age, na.rm=T)) # age normalized to mean of ALL rows
linelist %>%
group_by(hospital) %>%
mutate(age_norm = age / mean(age, na.rm=T)) # age normalized to mean of hospital group
Using across()
# selected columns
linelist <- linelist %>%
mutate(across(.cols = c(temp, ht_cm, wt_kg), .fns = as.character))
# all columns
linelist <- linelist %>%
mutate(across(.cols = everything(), .fns = as.character))
linelist <- linelist %>%
mutate(across(.col = contains('date'), .fns = as.character))
linelist <- linelist %>%
mutate(across(.cols = where(is.POSIXct), .fns = as.Date))
Using dplyr::recode() function
# specific value
linelist <- linelist %>%
mutate(date_onset = dplyr::recode(date_onset, "2014-14-15" = "2014-04-15"))
# recode multiple values
table(linelist$hospital, useNA = "always") # print table of all unique values, including missing
##
## Central Hopital Central Hospital
## 11 457
## Hospital A Hospital B
## 290 289
## Military Hopital Military Hospital
## 32 798
## Mitylira Hopital Mitylira Hospital
## 1 79
## Other Port Hopital
## 907 48
## Port Hospital St. Mark's Maternity Hospital (SMMH)
## 1756 417
## St. Marks Maternity Hopital (SMMH) <NA>
## 11 1512
# fix incorrect values # old value # new value
linelist <- linelist %>%
mutate(hospital = dplyr::recode(hospital,
# for reference: OLD = NEW
"Mitylira Hopital" = "Military Hospital",
"Mitylira Hospital" = "Military Hospital",
"Military Hopital" = "Military Hospital",
"Port Hopital" = "Port Hospital",
"Central Hopital" = "Central Hospital",
"other" = "Other",
"St. Marks Maternity Hopital (SMMH)" = "St. Mark's Maternity Hospital (SMMH)"
))
By logic: 1. Using simple logic: replace(), ifelse(), if_else(); 2. Using case_when().
# use replace() is changing just one value in one row, using an unique row identifier
# mutate(col_to_change = replace(col_to_change, criteria for rows, new value)).
linelist <- linelist %>%
mutate(gender = replace(gender, case_id == "2195", "Female"))
# Using base R
linelist$gender[linelist$case_id == "2195"] <- "Female"
# Using ifelse()
linelist <- linelist %>%
mutate(source_known = ifelse(!is.na(source),"known","unknown")) # the column source_known is defined. Its value in a given row is set to “known” if the row’s value in column source is not missing. If the value in source is missing, then the value in source_known is set to “unknown”.
# Case_when()
class(linelist$age)
## [1] "character"
linelist$age <- as.numeric(linelist$age)
linelist <- linelist %>%
mutate(age_years = dplyr::case_when(
age_unit == "years" ~ age, # if age unit is years
age_unit == "months" ~ age/12, # if age unit is months, divide age by 12
is.na(age_unit) ~ age)) # if age unit is missing, assume years
# any other circumstance, assign NA (missing)
linelist <- linelist %>%
mutate(age_years = dplyr::case_when(
age_unit == "years" ~ age,
age_unit == "months" ~ age/12,
is.na(age_unit) ~ age,
TRUE ~ NA_real_))
linelist <- linelist %>%
mutate(case_status = dplyr::case_when(
# if patient had lab test and it is positive,
# then they are marked as a confirmed case
ct_blood < 20 ~ "Confirmed",
# given that a patient does not have a positive lab result,
# if patient has a "source" (epidemiological link) AND has fever,
# then they are marked as a suspect case
!is.na(source) & fever == "yes" ~ "Suspect",
# any other patient not addressed above
# is marked for follow up
TRUE ~ "To investigate"))
Missing value
# replace_na() for class character
class(linelist$hospital)
## [1] "character"
linelist <- linelist %>%
mutate(hospital = tidyr::replace_na(hospital, "Missing"))
# fct_explicit_na() for class factor
linelist <- linelist %>%
mutate(hospital = forcats::fct_explicit_na(hospital))
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `hospital = forcats::fct_explicit_na(hospital)`.
## Caused by warning:
## ! `fct_explicit_na()` was deprecated in forcats 1.0.0.
## ℹ Please use `fct_na_value_to_level()` instead.
# na_if() Convert specific value to NA
linelist <- linelist %>%
mutate(hospital = dplyr::na_if(hospital, 'Missing')) # convert value named 'missing' to 'NA'
# Note: na_if() cannot be used for logic criteria (e.g. “all values > 99”) - use replace() or case_when() for this:
linelist <- linelist %>%
mutate(temp = replace(temp, temp > 40, NA)) # convert temp above 40 to NA
linelist <- linelist %>%
mutate(date_onset = replace(date_onset, date_onset > as.Date("2000-01-01"), NA))
For more details, please visit here
cleaning_dict <- import("cleaning_dict.csv")
linelist <- linelist %>%
matchmaker::match_df(
dictionary = cleaning_dict,
from = "from",
to = "to",
by = "col"
)
Creating categories from numerical columns by using the list of packages: age_categories(), cut(), case_when(), and quantile(), ntile()
Let’s using age_categories():
# Check the class of variables
class(linelist$age_years)
## [1] "numeric"
# Review distribution to make appropriate cut-points
hist(linelist$age_years)
summary(linelist$age_years, na.rm = T) # Yes, remove NA
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.00 6.00 13.00 16.04 23.00 84.00 107
# age_categories()
# Simple example
################
pacman::p_load(epikit) # load package
linelist <- linelist %>%
mutate(
age_cat = age_categories( # create new column
age_years, # numeric column to make groups from
breakers = c(0, 5, 10, 15, 20, # break points
30, 40, 50, 60, 70)))
# show table
table(linelist$age_cat, useNA = "always")
linelist_2 <- linelist %>%
mutate(
age_cat = age_categories(
age_years,
breakers = c(0, 6, 11, 16, 21, 31, 41, 51, 61, 71)
)
)
table(linelist_2$age_cat, useNA = "always")
# With ceiling set to TRUE
linelist <- linelist %>%
mutate(
age_cat = age_categories(
age_years,
breakers = c(0, 5, 10, 15, 20, 30, 40, 50, 60, 70),
ceiling = TRUE)) # 70 is ceiling, all above become NA
# show table
table(linelist$age_cat, useNA = "always")
linelist <- linelist %>%
mutate(
age_cat = age_categories(
age_years,
lower = 0,
upper = 100,
by = 10))
linelist <- linelist %>%
mutate(
age_cat = age_categories(
age_years,
breaker = seq(0, 85, 5))) # age categories: 0 to 85 by 5s
# show table
table(linelist$age_cat, useNA = "always")
How about cut():
# using Cut()
linelist <- linelist %>%
mutate(
age_cat = cut(
age_years,
breaks = c(0, 5, 10, 15, 20, 30, 50, 70, 100),
include.lowest = TRUE # include 0 in the lowest group
))
table(linelist$age_cat, useNA = "always")
##
## [0,5] (5,10] (10,15] (15,20] (20,30] (30,50] (50,70] (70,100]
## 1469 1195 1040 770 1149 778 94 6
## <NA>
## 107
# Cross tabulation of the numeric and category
head(table("Numeric Values" = linelist$age_years,
"Categories" = linelist$age_cat,
useNA = "always"),10)
## Categories
## Numeric Values [0,5] (5,10] (10,15] (15,20] (20,30] (30,50] (50,70]
## 0 136 0 0 0 0 0 0
## 0.0833333333333333 1 0 0 0 0 0 0
## 0.25 2 0 0 0 0 0 0
## 0.333333333333333 6 0 0 0 0 0 0
## 0.416666666666667 1 0 0 0 0 0 0
## 0.5 6 0 0 0 0 0 0
## 0.583333333333333 3 0 0 0 0 0 0
## 0.666666666666667 3 0 0 0 0 0 0
## 0.75 3 0 0 0 0 0 0
## 0.833333333333333 1 0 0 0 0 0 0
## Categories
## Numeric Values (70,100] <NA>
## 0 0 0
## 0.0833333333333333 0 0
## 0.25 0 0
## 0.333333333333333 0 0
## 0.416666666666667 0 0
## 0.5 0 0
## 0.583333333333333 0 0
## 0.666666666666667 0 0
## 0.75 0 0
## 0.833333333333333 0 0
# Relabeling NA values
linelist <- linelist %>%
mutate(age_cat = cut(
age_years,
breaks = c(0, 5, 10, 15, 20, 30, 50, 70, 100),
right = FALSE,
include.lowest = TRUE,
labels = c("0-4", "5-9", "10-14", "15-19", "20-29", "30-49",
"50-69", "70-100")),
# make missing values explicit
age_cat = fct_explicit_na(
age_cat,
na_level = "Missing age"
))
table(linelist$age_cat, useNA = "always")
# Filter out missing values
linelist %>%
tidyr::drop_na(case_id, age_years) # drop rows with missing values for case_id or age_years
# Filter by row number
# View first 100 rows
linelist %>% head(100) # or use tail() to see the n last rows
# Show row 5 only
linelist %>% filter(row_number() == 5)
# View rows 2 through 20, and three specific columns
linelist %>% filter(row_number() %in% 2:20) %>% select(date_onset, outcome, age)
# Complex filter
hist(linelist$date_onset, breaks = 50)
Can we just filter by date_onset to rows after June 2013? Caution! Applying the code filter(date_onset > as.Date(“2013-06-01”))) would remove any rows in the later epidemic with a missing date of onset!
DANGER: Filtering to greater than (>) or less than (<) a date or number can remove any rows with missing values (NA)! This is because NA is treated as infinitely large and small.
table(Hospital = linelist$hospital, # hospital name
YearOnset = lubridate::year(linelist$date_onset), # year of date_onset
useNA = "always") # show missing values
linelist <- linelist %>%
# keep rows where onset is after 1 June 2013 OR where onset is missing and it was a hospital OTHER than Hospital A or B
filter(date_onset > as.Date("2013-06-01") | (is.na(date_onset) & !hospital %in% c("Hospital A", "Hospital B")))
nrow(linelist)
table(Hospital = linelist$hospital,
Yearonset = lubridate::year(linelist$date_onset),
useNA = "always")
Standalone
# dataframe <- filter(dataframe, condition(s) for rows to keep)
linelist <- filter(linelist, !is.na(case_id))
linelist <- linelist[!is.na(case_id), ] # or subset using base R
Quickly review records
View(linelist)
View(linelist %>%
filter(case_id %in% c("11f8ea", "76b97a", "47a5f5")) %>%
select(date_onset, date_hospitalisation)
)
dplyr::row-wise perform a calculation within a row.
linelist %>%
dplyr::rowwise() %>%
mutate(num_symptoms = sum(c(fever, chills, cough, aches, vomit) == "yes")) %>%
ungroup() %>%
select(fever, chills, cough, aches, vomit, num_symptoms) # for display
linelist %>%
dplyr::rowwise() %>%
mutate(num_NA_dates = sum(is.na(c_across(contains("date"))))) %>%
ungroup() %>%
select(num_NA_dates, contains("date"))
Get the latest or most recent date
linelist %>%
dplyr::rowwise() %>%
mutate(latest_date = max(c_across(contains("date")), na.rm = T)) %>% # what happens if not specify na.rm = T?
ungroup() %>%
select(latest_date, contains("date"))