Objective

Use of the stringr package to evaluate and handle character values (“strings”):

  1. Combine, order, split, arrange - str_c(), str_glue(), str_order(), str_split();
  2. Clean and standardise;
  3. Evaluate and extract by position - str_length(), str_sub(), word();
  4. Patterns.

Loading packages

pacman::p_load(
  rio,        # importing data  
  here,       # relative file pathways  
  janitor,    # data cleaning and tables
  lubridate,  # working with dates
  matchmaker, # dictionary-based cleaning
  epikit,     # age_categories() function
  tidyverse,  # data management and visualization
  parsedate,  # has function to "guess" messy dates
  aweek,      # another option for converting dates to weeks, and weeks to dates
  zoo,        # additional date/time functions
  stringr,    # many functions for handling strings
  tools       # alternative for converting to title case
)

Import data

# import case linelist 
linelist <- import(here("data", "linelist_cleaned.rds"))

Unite, Split, Arrange

  1. Using str_c(), str_glue(), and unite() to combine strings;
  2. Using str_order() to arrange strings;
  3. Using str_split() and separate() to split strings.

Combine strings

# With no separator, the inputs are joined directly
str_c("String1", "String2", "String3")
## [1] "String1String2String3"
# sep is inserted between the inputs
str_c("String1", "String2", "String3", sep = ", ")
## [1] "String1, String2, String3"
first_names <- c("abdul", "fahruk", "janice") 
last_names  <- c("hussein", "akinleye", "okeke")

# sep joins each first name to the matching last name;
# collapse then joins the resulting pairs into one single string
str_c(first_names, last_names, sep = " ", collapse = "; ")
## [1] "abdul hussein; fahruk akinleye; janice okeke"

The cat() function can be used to concatenate several objects and print the result. Use \n within a string to force a new line.

cat(str_c(first_names, last_names, sep = " ", collapse = ";\n"))
## abdul hussein;
## fahruk akinleye;
## janice okeke

Dynamic strings - str_glue

# How to write a str_glue() template:
#  - All content goes between double quotation marks: str_glue("")
#  - Dynamic code or references to pre-defined values go inside curly brackets {}
#    within the double quotation marks; one str_glue() call may contain many of them.
#  - To use quotes inside the template, use single quotes within the surrounding
#    double quotes (as done for the date format in the call below).
#  - Tip: use \n to force a new line.
#  - Tip: use format() to adjust the date display and Sys.Date() for the current date.
str_glue("Data include {nrow(linelist)} cases and are current to {format(Sys.Date(), '%d %b %Y')}.")
## Data include 5888 cases and are current to 17 Apr 2023.

An alternative format is to use placeholders within the brackets and define the code in separate arguments at the end of the str_glue() function, as below. This can improve code readability if the text is long.

# Placeholder form of str_glue(): the template only names each dynamic value,
# and the values are supplied as named arguments after the template.
str_glue("Linelist as of {current_date}.\nLast case hospitalized on {last_hospital}.\n{n_missing_onset} cases are missing date of onset and not shown",
         current_date = format(Sys.Date(), "%d %b %Y"),
         # na.rm = TRUE so that missing hospitalisation dates do not turn max() into NA
         last_hospital = format(as.Date(max(linelist$date_hospitalisation, na.rm = TRUE)), "%d %b %Y"),
         # summing the logical vector counts the rows with a missing onset date
         n_missing_onset = sum(is.na(linelist$date_onset))
         )
## Linelist as of 17 Apr 2023.
## Last case hospitalized on 30 Apr 2015.
## 0 cases are missing date of onset and not shown

Pulling from a data frame

# make case data frame
case_table <- data.frame(
  zone        = c("Zone 1", "Zone 2", "Zone 3", "Zone 4", "Zone 5"),
  new_cases   = c(3, 0, 7, 0, 15),
  total_cases = c(40, 4, 25, 10, 103)
  )

# using str_glue_data()
case_table %>% 
  str_glue_data("{zone}: {new_cases} ({total_cases} total cases)")
## Zone 1: 3 (40 total cases)
## Zone 2: 0 (4 total cases)
## Zone 3: 7 (25 total cases)
## Zone 4: 0 (10 total cases)
## Zone 5: 15 (103 total cases)

Data frame to one line

# Pair each zone with its new-case count ("Zone 1 = 3"), then collapse all
# pairs into a single comma-separated string
str_c(case_table$zone, case_table$new_cases, sep = " = ", collapse = ", ")
## [1] "Zone 1 = 3, Zone 2 = 0, Zone 3 = 7, Zone 4 = 0, Zone 5 = 15"
# Same summary with a leading label; the inner str_c() builds the collapsed
# string and the outer str_c() prepends the label
str_c("New cases: ", str_c(case_table$zone, case_table$new_cases, sep = " = ", collapse = ", "))
## [1] "New cases: Zone 1 = 3, Zone 2 = 0, Zone 3 = 7, Zone 4 = 0, Zone 5 = 15"

Unite columns

# Example data: one row per case, with all symptoms held in a single
# comma-separated character column
df <- data.frame(
  case_ID = c(1:6),
  symptoms  = c("jaundice, fever, chills",     # patient 1
                "chills, aches, pains",        # patient 2 
                "fever",                       # patient 3
                "vomiting, diarrhoea",         # patient 4
                "bleeding from gums, fever",   # patient 5
                "rapid pulse, headache"),      # patient 6
  outcome = c("Recover", "Death", "Death", "Recover", "Recover", "Recover")
  )

# Split `symptoms` into (up to) three new columns.
# sep = ", " splits only at the comma delimiter. Without it, separate() splits at
# every run of non-alphanumeric characters, which also breaks multi-word symptoms
# such as "bleeding from gums" apart at the spaces.
# extra = "merge" keeps any surplus pieces in the last column instead of dropping them.
df_split <- separate(df, symptoms, into = c("sym_1", "sym_2", "sym_3"), sep = ", ", extra = "merge")
## Warning: Expected 3 pieces. Missing pieces filled with `NA` in 4 rows [3, 4, 5, 6].

# Re-combine the split columns into one, dropping the inputs
t <- df_split %>% 
  unite(
    col = "all_symptoms",         # name of the new united column
    c("sym_1", "sym_2", "sym_3"), # columns to unite
    sep = ", ",                   # separator to use in united column
    remove = TRUE,                # if TRUE, removes input cols from the data frame
    na.rm = TRUE                  # if TRUE, missing values are removed before uniting
  )

# Same union, but the input columns are kept alongside the united column
t2 <- df_split %>% 
  unite(
    col = "all_symptoms",
    c("sym_1", "sym_2", "sym_3"),
    sep = ", ",
    remove = FALSE,               # keep sym_1, sym_2 and sym_3 in the data frame
    na.rm = TRUE
  )

Split

To split a string based on a pattern, use str_split(). It evaluates the string(s) and returns a list of character vectors consisting of the newly-split values.

# Split one string at every comma; the result is a list with one element
str_split(string = "jaundice, fewer, chills",
          pattern = ",") 
## [[1]]
## [1] "jaundice" " fewer"   " chills"
pt1_symptoms <- str_split("jaundice, fever, chills", ",")
pt1_symptoms[[1]][2]  # extracts 2nd value from 1st (and only) element of the list
## [1] " fever"
symptoms <- c("jaundice, fever, chills", 
              "chills, aches, pains",
              "fever",
              "vomiting, diarrhoea",
              "bleeding from gums, fever",
              "rapid pulse, headache")

str_split(symptoms, ",") # returns a list with one character vector per input string
## [[1]]
## [1] "jaundice" " fever"   " chills" 
## 
## [[2]]
## [1] "chills" " aches" " pains"
## 
## [[3]]
## [1] "fever"
## 
## [[4]]
## [1] "vomiting"   " diarrhoea"
## 
## [[5]]
## [1] "bleeding from gums" " fever"            
## 
## [[6]]
## [1] "rapid pulse" " headache"
str_split(symptoms, ",", simplify = TRUE) # opt 1: simplify = TRUE returns a character matrix; shorter rows are padded with ""
##      [,1]                 [,2]         [,3]     
## [1,] "jaundice"           " fever"     " chills"
## [2,] "chills"             " aches"     " pains" 
## [3,] "fever"              ""           ""       
## [4,] "vomiting"           " diarrhoea" ""       
## [5,] "bleeding from gums" " fever"     ""       
## [6,] "rapid pulse"        " headache"  ""
str_split_fixed(symptoms, ",", n = 2) # opt 2: character matrix with exactly n columns; anything after the first split stays in the last column
##      [,1]                 [,2]            
## [1,] "jaundice"           " fever, chills"
## [2,] "chills"             " aches, pains" 
## [3,] "fever"              ""              
## [4,] "vomiting"           " diarrhoea"    
## [5,] "bleeding from gums" " fever"        
## [6,] "rapid pulse"        " headache"

Split columns

# The example data frame, before splitting
df
##   case_ID                  symptoms outcome
## 1       1   jaundice, fever, chills Recover
## 2       2      chills, aches, pains   Death
## 3       3                     fever   Death
## 4       4       vomiting, diarrhoea Recover
## 5       5 bleeding from gums, fever Recover
## 6       6     rapid pulse, headache Recover
# extra = "merge": only two columns are created, but a third symptom is merged
# into the second new column, so all of the data are preserved
df %>% separate(symptoms, into = c("sym_1", "sym_2"), sep = ",", extra = "merge")
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 1 rows [3].
##   case_ID              sym_1          sym_2 outcome
## 1       1           jaundice  fever, chills Recover
## 2       2             chills   aches, pains   Death
## 3       3              fever           <NA>   Death
## 4       4           vomiting      diarrhoea Recover
## 5       5 bleeding from gums          fever Recover
## 6       6        rapid pulse       headache Recover
# without `extra`, the third symptoms are lost (discarded with a warning)
df %>% 
  separate(symptoms, into = c("sym_1", "sym_2"), sep=",")
## Warning: Expected 2 pieces. Additional pieces discarded in 2 rows [1, 2].
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 1 rows [3].
##   case_ID              sym_1      sym_2 outcome
## 1       1           jaundice      fever Recover
## 2       2             chills      aches   Death
## 3       3              fever       <NA>   Death
## 4       4           vomiting  diarrhoea Recover
## 5       5 bleeding from gums      fever Recover
## 6       6        rapid pulse   headache Recover
# extra = "drop": additional pieces are discarded without a warning.
# Rows with too few pieces are still reported, because `fill` keeps its default.
df %>% separate(symptoms, into = c("sym_1", "sym_2"), sep = ",", extra = "drop")
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 1 rows [3].
##   case_ID              sym_1      sym_2 outcome
## 1       1           jaundice      fever Recover
## 2       2             chills      aches   Death
## 3       3              fever       <NA>   Death
## 4       4           vomiting  diarrhoea Recover
## 5       5 bleeding from gums      fever Recover
## 6       6        rapid pulse   headache Recover

Clean and Standardise

Change case

str_to_upper("California")
## [1] "CALIFORNIA"
str_to_lower("California")
## [1] "california"
str_to_title("go to the US state of California")
## [1] "Go To The Us State Of California"
tools::toTitleCase("This is the US state of california")
## [1] "This is the US State of California"
str_to_sentence("the patient must be transported")
## [1] "The patient must be transported"

Pad length

Use str_pad() to add characters to a string, to a minimum length.

ICD_codes <- c("R10.13",
               "R10.819",
               "R17")

str_pad(ICD_codes, 7, "right")
## [1] "R10.13 " "R10.819" "R17    "
str_pad(ICD_codes, 7, "right", pad = ".")
## [1] "R10.13." "R10.819" "R17...."
str_pad(ICD_codes, 8, "right", pad = "0")
## [1] "R10.1300" "R10.8190" "R1700000"

Truncate (delete, shorten string)

original <- "Symptom onset on 4/3/2020 with vomiting"
str_trunc(original, 10, "center")
## [1] "Symp...ing"

Standardise length

Use str_trunc() to shorten strings and str_pad() to lengthen them, so that all values end up with the same length.

# Same three codes as in the padding example, of lengths 6, 7 and 3
ICD_codes   <- c("R10.13",
                 "R10.819",
                 "R17")

# Step 1: set the maximum length to 6. A value that is too long is cut and
# ends in "...", so "R10.819" becomes "R10..." (see output below)
ICD_codes_2 <- str_trunc(ICD_codes, 6) # truncate 
ICD_codes_2
## [1] "R10.13" "R10..." "R17"
# Step 2: pad on the right so the short value is expanded to the same length of 6
# NOTE(review): `IDC_code_3` looks like a typo for `ICD_codes_3` - confirm before renaming
IDC_code_3 <- str_pad(ICD_codes_2, 6, "right")
IDC_code_3
## [1] "R10.13" "R10..." "R17   "

Removing leading/ trailing whitespace

Use str_trim() to remove spaces, newlines (\n) or tabs (\t) from the sides of a string. Add "right", "left", or "both" to the command to specify which side to trim (e.g. str_trim(x, "right")).

# ID numbers with excess spaces on right
IDs <- c("provA_1852  ", # two excess spaces
         "provA_2345",   # zero excess spaces
         "provA_9460 ")  # one excess space

# With no side given, str_trim() trims both sides; these IDs only have
# excess spaces on the right, so only those are removed
str_trim(IDs)
## [1] "provA_1852" "provA_2345" "provA_9460"
# Trimming the left side only leaves the trailing spaces in place
str_trim(IDs, "left")
## [1] "provA_1852  " "provA_2345"   "provA_9460 "
# Explicitly trimming both sides gives the same result as the first call
str_trim(IDs, "both")
## [1] "provA_1852" "provA_2345" "provA_9460"

Remove repeated whitespace within

str_squish(" Pt requires    IV saline\n")
## [1] "Pt requires IV saline"

Handle by position

Extract by character position

# start and end third from left (3rd letter from left)
str_sub("pneumonia", 3, 3)
## [1] "e"
# 0 is not present
str_sub("pneumonia", 0, 0)
## [1] ""
# 6th from left, to the 1st from right
str_sub("pneumonia", 6, -1)
## [1] "onia"
# 5th from right, to the 2nd from right
str_sub("pneumonia", -5, -2)
## [1] "moni"
# 4th from left to a position outside the string
str_sub("pneumonia", 4, 15)
## [1] "umonia"

Extract by word position

# strings to evaluate
chief_complaints <- c("I just got out of the hospital 2 days ago, but still can barely breathe.",
                      "My stomach hurts",
                      "Severe ear pain")

word(chief_complaints, start = 1, end = 3, sep = " ")
## [1] "I just got"       "My stomach hurts" "Severe ear pain"

Replace by character position

# str_sub() can also be used on the left of an assignment to overwrite
# the characters at the given positions
# NOTE(review): this variable shares its name with the word() function called in
# the previous example; consider another name to avoid confusion
word <- "pneumonia"
str_sub(word, 3, 4) <- "XX"
word
## [1] "pnXXmonia"
words <- c("pneumonia", "tubercolosis", "HIV")
# convert the third and fourth characters to X in every element;
# "HIV" has only three characters, so the replacement extends it to "HIXX"
str_sub(words, 3, 4) <- "XX"
words
## [1] "pnXXmonia"    "tuXXrcolosis" "HIXX"

Evaluate length

str_length("abc")
## [1] 3

Patterns

Detect the presence or absence of a pattern within a string.

str_detect(string = "primary school teacher", pattern = "teach")
## [1] TRUE
# a vector/column of occupations 
occupations <- c("field laborer",
                 "university professor",
                 "primary school teacher & tutor",
                 "tutor",
                 "nurse at regional hospital",
                 "lineworker at Amberdeen Fish Factory",
                 "physican",
                 "cardiologist",
                 "office worker",
                 "food service")
print(occupations)
##  [1] "field laborer"                       
##  [2] "university professor"                
##  [3] "primary school teacher & tutor"      
##  [4] "tutor"                               
##  [5] "nurse at regional hospital"          
##  [6] "lineworker at Amberdeen Fish Factory"
##  [7] "physican"                            
##  [8] "cardiologist"                        
##  [9] "office worker"                       
## [10] "food service"
str_detect(occupations, "teach")
##  [1] FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
sum(str_detect(occupations, "teach"))
## [1] 1
sum(str_detect(string = occupations,
               pattern = "teach|professor|tutor"))
## [1] 3

To ignore case/capitalization, wrap the pattern within regex(), and within regex() add the argument ignore_case = TRUE (or T as shorthand).

# regex(ignore_case = TRUE) makes the match case-insensitive, so "teacher"
# matches "Teacher". TRUE is spelled out because T is an ordinary variable
# that can be reassigned.
str_detect(string = "Teacher", pattern = regex("teacher", ignore_case = TRUE))
## [1] TRUE

Convert comma (,) to period (.)

# Numbers written with "." as thousands separator and "," as decimal mark
lengths <- c("2.454,56", "1,2", "6.096,5")

# Order matters: the thousands periods are removed first, and only then are
# the decimal commas turned into periods
as.numeric(gsub(pattern = ",",                # find commas     
                replacement = ".",            # replace with periods
                x = gsub("\\.", "", lengths)  # vector with other periods removed (periods escaped)
                )
           )                                  # convert outcome to numeric
## [1] 2454.56    1.20 6096.50

Replace all

outcome <- c("Karl: dead",
            "Samantha: dead",
            "Marco: not dead",
            "NA")

str_replace_all(string = outcome, pattern = "dead", replacement = "deceased")
## [1] "Karl: deceased"      "Samantha: deceased"  "Marco: not deceased"
## [4] "NA"

Detect within logic

# `occupations` is a stand-alone character vector, so it is first placed in its
# own data frame: mutate() needs one value per row, and the vector does not have
# the same length as the number of rows in `df`.
occupation_df <- data.frame(occupations = occupations)

# Version 1: any occupation containing a search term is flagged as an educator
occupation_df <- occupation_df %>% 
  mutate(is_educator = case_when(
    # term search within occupation, not case sensitive
    str_detect(occupations,
               regex("teach|prof|tutor|university",
                     ignore_case = TRUE))              ~ "Educator",
    # all others
    TRUE                                               ~ "Not an educator"))

# Version 2: the occupation must meet 2 criteria to be assigned "Educator":
# it must have a search term AND NOT any exclusion term
occupation_df <- occupation_df %>% 
  # value in new column is_educator is based on conditional logic
  mutate(is_educator = case_when(
    
    # Must have a search term
    str_detect(occupations,
               regex("teach|prof|tutor|university", ignore_case = TRUE)) &
    
    # AND must NOT have an exclusion term (negate = TRUE inverts the match)
    str_detect(occupations,
               regex("admin", ignore_case = TRUE),
               negate = TRUE)                       ~ "Educator",
    
    # All rows not meeting above criteria
    TRUE                                            ~ "Not an educator"))

Extract a match

occupations <- c("field laborer",
                 "university professor",
                 "primary school teacher & tutor",
                 "tutor",
                 "nurse at regional hospital",
                 "lineworker at Amberdeen Fish Factory",
                 "physican",
                 "cardiologist",
                 "office worker",
                 "food service",
                 "tutor teacher",
                 "professors")
print(occupations)
##  [1] "field laborer"                       
##  [2] "university professor"                
##  [3] "primary school teacher & tutor"      
##  [4] "tutor"                               
##  [5] "nurse at regional hospital"          
##  [6] "lineworker at Amberdeen Fish Factory"
##  [7] "physican"                            
##  [8] "cardiologist"                        
##  [9] "office worker"                       
## [10] "food service"                        
## [11] "tutor teacher"                       
## [12] "professors"
str_extract_all(occupations, "teach|prof|tutor")
## [[1]]
## character(0)
## 
## [[2]]
## [1] "prof"
## 
## [[3]]
## [1] "teach" "tutor"
## 
## [[4]]
## [1] "tutor"
## 
## [[5]]
## character(0)
## 
## [[6]]
## character(0)
## 
## [[7]]
## character(0)
## 
## [[8]]
## character(0)
## 
## [[9]]
## character(0)
## 
## [[10]]
## character(0)
## 
## [[11]]
## [1] "tutor" "teach"
## 
## [[12]]
## [1] "prof"
occupations <- c("field laborer",
                 "university professor",
                 "primary school teacher & tutor",
                 "tutor",
                 "nurse at regional hospital",
                 "lineworker at Amberdeen Fish Factory",
                 "physican",
                 "cardiologist",
                 "office worker",
                 "food service",
                 "tutor teacher",
                 "professors")
print(occupations)
##  [1] "field laborer"                       
##  [2] "university professor"                
##  [3] "primary school teacher & tutor"      
##  [4] "tutor"                               
##  [5] "nurse at regional hospital"          
##  [6] "lineworker at Amberdeen Fish Factory"
##  [7] "physican"                            
##  [8] "cardiologist"                        
##  [9] "office worker"                       
## [10] "food service"                        
## [11] "tutor teacher"                       
## [12] "professors"
str_extract(occupations, "teach|prof|tutor") # str_extract() extracts only the first match in each evaluated string, producing a character vector with one element for each evaluated string. It returns NA where there was no match.
##  [1] NA      "prof"  "teach" "tutor" NA      NA      NA      NA      NA     
## [10] NA      "tutor" "prof"

Subset and count

str_subset(occupations, "teach|prof|tutor")
## [1] "university professor"           "primary school teacher & tutor"
## [3] "tutor"                          "tutor teacher"                 
## [5] "professors"
str_count(occupations, regex("teach|prof|tutor"))
##  [1] 0 1 2 1 0 0 0 0 0 0 2 1

References

  1. The Epidemiologist R
  2. Working with strings with stringr::