# Using the stringr package to evaluate and manipulate
# character values ("strings").
pacman::p_load(
rio, # importing data
here, # relative file pathways
janitor, # data cleaning and tables
lubridate, # working with dates
matchmaker, # dictionary-based cleaning
epikit, # age_categories() function
tidyverse, # data management and visualization
parsedate, # has function to "guess" messy dates
aweek, # another option for converting dates to weeks, and weeks to dates
zoo, # additional date/time functions
stringr, # many functions for handling strings
tools # alternative for converting to title case
)
# Read the cleaned case linelist. The file location is assembled from its
# path components with here::here() and the file is loaded with rio::import().
linelist <- rio::import(
  here::here("data", "linelist_cleaned.rds")
)
# Join scalar strings end to end (no separator is supplied).
str_c(
  "String1",
  "String2",
  "String3"
)
## [1] "String1String2String3"

# Same pieces, this time with `sep` placed between them.
str_c(
  "String1",
  "String2",
  "String3",
  sep = ", "
)
## [1] "String1, String2, String3"

# Two parallel vectors: first_names[i] belongs with last_names[i].
first_names <- c("abdul", "fahruk", "janice")
last_names <- c("hussein", "akinleye", "okeke")

# `sep` joins the vectors element by element; `collapse` then joins the
# resulting full names into a single string.
str_c(
  first_names,
  last_names,
  sep = " ",
  collapse = "; "
)
## [1] "abdul hussein; fahruk akinleye; janice okeke"
# The cat() function prints its arguments to the console and can be used to
# display a concatenated string. Use \n within a string to force a newline.
# Collapse the full names with ";\n" between them, then print through cat()
# so that each embedded \n is rendered as a real line break.
cat(
  str_c(
    first_names,
    last_names,
    sep = " ",
    collapse = ";\n"
  )
)
## abdul hussein;
## fahruk akinleye;
## janice okeke
# Dynamic strings: str_glue()
# Build a sentence with live values: every {} expression is evaluated and its
# result is inserted into the text. The two string pieces below are joined by
# str_glue() into a single template.
str_glue(
  "Data include {nrow(linelist)} cases ",
  "and are current to {format(Sys.Date(), '%d %b %Y')}."
)
## Data include 5888 cases and are current to 17 Apr 2023.
# How to write a str_glue() template:
# - The whole template goes between double quotation marks: str_glue("").
# - Dynamic code or references to existing objects go inside curly brackets {}
#   within those quotation marks; one template may contain many {} pairs.
# - When the code inside {} needs quotes of its own (e.g. a date format),
#   use single quotes so they do not end the surrounding double-quoted string.
# - Tip: use \n inside the template to force a new line.
# - Tip: use format() to control how a date is displayed and Sys.Date() to
#   obtain the current date.
# An alternative is to put short placeholder names inside the curly brackets
# and define the code for each placeholder as a separate named argument at the
# end of the str_glue() call, as below. This can improve readability when the
# text is long.
# Template with named placeholders; each placeholder is defined as a named
# argument after the template string.
#   current_date    - today's date, formatted as e.g. "17 Apr 2023"
#   last_hospital   - latest non-missing hospitalisation date, same format
#   n_missing_onset - number of linelist rows with a missing onset date
str_glue(
  "Linelist as of {current_date}.\nLast case hospitalized on {last_hospital}.\n{n_missing_onset} cases are missing date of onset and not shown",
  current_date = format(Sys.Date(), "%d %b %Y"),
  # na.rm = TRUE (spelled out: `T` is an ordinary variable that can be
  # reassigned) so missing dates do not turn the maximum into NA
  last_hospital = format(
    as.Date(max(linelist$date_hospitalisation, na.rm = TRUE)),
    "%d %b %Y"
  ),
  n_missing_onset = nrow(linelist %>% filter(is.na(date_onset)))
)
## Linelist as of 17 Apr 2023.
## Last case hospitalized on 30 Apr 2015.
## 0 cases are missing date of onset and not shown
# Pulling values from a data frame: str_glue_data()
# Example summary table: one row per zone with its new and total case counts.
case_table <- data.frame(
  zone        = c("Zone 1", "Zone 2", "Zone 3", "Zone 4", "Zone 5"),
  new_cases   = c(3, 0, 7, 0, 15),
  total_cases = c(40, 4, 25, 10, 103)
)

# str_glue_data() resolves the {} names against the columns of the data frame
# passed as its first argument and returns one string per row.
str_glue_data(
  case_table,
  "{zone}: {new_cases} ({total_cases} total cases)"
)
## Zone 1: 3 (40 total cases)
## Zone 2: 0 (4 total cases)
## Zone 3: 7 (25 total cases)
## Zone 4: 0 (10 total cases)
## Zone 5: 15 (103 total cases)
# Collapsing data frame columns into one line of text
# Pair each zone with its new-case count ("Zone 1 = 3") via `sep`, then fold
# all pairs into a single comma-separated string via `collapse`.
str_c(case_table$zone, case_table$new_cases, sep = " = ", collapse = ", ")
## [1] "Zone 1 = 3, Zone 2 = 0, Zone 3 = 7, Zone 4 = 0, Zone 5 = 15"

# Same summary, prefixed with a label (label spelling corrected from
# "New casese: ").
str_c(
  "New cases: ",
  str_c(case_table$zone, case_table$new_cases, sep = " = ", collapse = ", ")
)
## [1] "New cases: Zone 1 = 3, Zone 2 = 0, Zone 3 = 7, Zone 4 = 0, Zone 5 = 15"
# Example patient table: each patient's symptoms are stored as one
# comma-separated string, alongside the patient's outcome.
df <- data.frame(
  case_ID = 1:6,
  symptoms = c(
    "jaundice, fever, chills",   # patient 1
    "chills, aches, pains",      # patient 2
    "fever",                     # patient 3
    "vomiting, diarrhoea",       # patient 4
    "bleeding from gums, fever", # patient 5
    "rapid pulse, headache"      # patient 6
  ),
  outcome = c("Recover", "Death", "Death", "Recover", "Recover", "Recover")
)
# Split the comma-separated `symptoms` column into up to three new columns.
#   df                       - data frame to operate on
#   symptoms                 - column to separate
#   into                     - names of the new columns
#   sep = ", "               - split only at comma-plus-space. Without an
#                              explicit `sep`, separate() splits at every run
#                              of non-alphanumeric characters, so multi-word
#                              symptoms such as "bleeding from gums" would be
#                              broken apart at their spaces.
#   extra = "merge"          - pieces beyond the third stay in the last column
df_split <- separate(
  df,
  symptoms,
  into = c("sym_1", "sym_2", "sym_3"),
  sep = ", ",
  extra = "merge"
)
## Warning: Expected 3 pieces. Missing pieces filled with `NA` in 4 rows [3, 4, 5, 6].
# Recombine the three split symptom columns into one text column and drop the
# input columns from the result.
# NOTE(review): `t` is also the name of base R's transpose function; a more
# descriptive name would avoid confusion - confirm nothing else relies on `t`.
t <- unite(
  df_split,
  col = "all_symptoms",  # name of the new united column
  sym_1, sym_2, sym_3,   # columns to unite
  sep = ", ",            # separator placed between the united values
  remove = TRUE,         # drop the input columns from the data frame
  na.rm = TRUE           # skip missing values instead of pasting "NA"
)

# Same recombination, but the input columns are kept next to the new one.
# NOTE(review): the new column is named "all_symtoms" here versus
# "all_symptoms" above - looks like a typo; confirm before renaming.
t2 <- unite(
  df_split,
  col = "all_symtoms",
  sym_1, sym_2, sym_3,
  sep = ", ",
  remove = FALSE,        # keep the input columns in the data frame
  na.rm = TRUE
)
# To split a string based on a pattern, use str_split(). It evaluates the
# string(s) and returns a list of character vectors, one per input string,
# each holding the newly split values.
# Split one string at each comma. The result is a list with one element per
# input string; the pieces keep the space that followed each comma.
# (Example text corrected from "fewer" to "fever" to match the data used in
# the rest of this section.)
str_split(
  string = "jaundice, fever, chills",
  pattern = ","
)
## [[1]]
## [1] "jaundice" " fever" " chills"

# Store the split result, then index into the list to pull out one piece.
pt1_symptoms <- str_split("jaundice, fever, chills", ",")
pt1_symptoms[[1]][2] # 2nd value from the 1st (and only) element of the list
## [1] " fever"
# One comma-separated symptom string per patient.
symptoms <- c(
  "jaundice, fever, chills",
  "chills, aches, pains",
  "fever",
  "vomiting, diarrhoea",
  "bleeding from gums, fever",
  "rapid pulse, headache"
)

# With a vector input, str_split() returns a list holding one character
# vector per input string (not a matrix).
str_split(string = symptoms, pattern = ",")
## [[1]]
## [1] "jaundice" " fever" " chills"
##
## [[2]]
## [1] "chills" " aches" " pains"
##
## [[3]]
## [1] "fever"
##
## [[4]]
## [1] "vomiting" " diarrhoea"
##
## [[5]]
## [1] "bleeding from gums" " fever"
##
## [[6]]
## [1] "rapid pulse" " headache"
# Option 1 for building data frame columns: simplify = TRUE returns a
# character matrix (one row per input string) with "" in unused cells.
str_split(
  string = symptoms,
  pattern = ",",
  simplify = TRUE
)
## [,1] [,2] [,3]
## [1,] "jaundice" " fever" " chills"
## [2,] "chills" " aches" " pains"
## [3,] "fever" "" ""
## [4,] "vomiting" " diarrhoea" ""
## [5,] "bleeding from gums" " fever" ""
## [6,] "rapid pulse" " headache" ""
# Option 2: str_split_fixed() returns a character matrix directly; the number
# of columns must be supplied as `n`, and any pieces beyond `n` stay unsplit
# in the last column.
str_split_fixed(
  string = symptoms,
  pattern = ",",
  n = 2
)
## [,1] [,2]
## [1,] "jaundice" " fever, chills"
## [2,] "chills" " aches, pains"
## [3,] "fever" ""
## [4,] "vomiting" " diarrhoea"
## [5,] "bleeding from gums" " fever"
## [6,] "rapid pulse" " headache"
# Display the patient data frame before the separate() examples that follow.
print(df)
## case_ID symptoms outcome
## 1 1 jaundice, fever, chills Recover
## 2 2 chills, aches, pains Death
## 3 3 fever Death
## 4 4 vomiting, diarrhoea Recover
## 5 5 bleeding from gums, fever Recover
## 6 6 rapid pulse, headache Recover
# Keep all the data while creating only two columns: with extra = "merge",
# any pieces beyond the second (here the third symptom) remain, unsplit, in
# the second new column.
separate(
  df,
  col = symptoms,
  into = c("sym_1", "sym_2"),
  sep = ",",
  extra = "merge"
)
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 1 rows [3].
## case_ID sym_1 sym_2 outcome
## 1 1 jaundice fever, chills Recover
## 2 2 chills aches, pains Death
## 3 3 fever <NA> Death
## 4 4 vomiting diarrhoea Recover
## 5 5 bleeding from gums fever Recover
## 6 6 rapid pulse headache Recover
# Without `extra`, pieces beyond the second are discarded (a warning reports
# the affected rows), so the third symptoms are lost.
separate(
  df,
  col = symptoms,
  into = c("sym_1", "sym_2"),
  sep = ","
)
## Warning: Expected 2 pieces. Additional pieces discarded in 2 rows [1, 2].
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 1 rows [3].
## case_ID sym_1 sym_2 outcome
## 1 1 jaundice fever Recover
## 2 2 chills aches Death
## 3 3 fever <NA> Death
## 4 4 vomiting diarrhoea Recover
## 5 5 bleeding from gums fever Recover
## 6 6 rapid pulse headache Recover
# extra = "warn" discards the additional pieces and emits a warning about it
# (see the recorded output below). To discard them silently, tidyr offers
# extra = "drop" instead.
df %>%
  separate(
    col = symptoms,
    into = c("sym_1", "sym_2"),
    sep = ",",
    extra = "warn"
  )
## Warning: Expected 2 pieces. Additional pieces discarded in 2 rows [1, 2].
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 1 rows [3].
## case_ID sym_1 sym_2 outcome
## 1 1 jaundice fever Recover
## 2 2 chills aches Death
## 3 3 fever <NA> Death
## 4 4 vomiting diarrhoea Recover
## 5 5 bleeding from gums fever Recover
## 6 6 rapid pulse headache Recover
# Convert every letter to upper case.
str_to_upper(string = "California")
## [1] "CALIFORNIA"

# Convert every letter to lower case.
str_to_lower(string = "California")
## [1] "california"

# Capitalise the first letter of every word; note that "US" becomes "Us".
str_to_title(string = "go to the US state of California")
## [1] "Go To The Us State Of California"

# tools::toTitleCase() as an alternative: in this example small words stay
# lower case and the acronym "US" is preserved.
tools::toTitleCase(text = "This is the US state of california")
## [1] "This is the US State of California"

# Capitalise only the first letter of the sentence.
str_to_sentence(string = "the patient must be transported")
## [1] "The patient must be transported"
# Use str_pad() to add characters to a string until it reaches a minimum length.
# Example codes of differing lengths (6, 7 and 3 characters).
ICD_codes <- c(
  "R10.13",
  "R10.819",
  "R17"
)

# Pad on the right to a minimum width of 7 using the default pad character.
str_pad(string = ICD_codes, width = 7, side = "right")
## [1] "R10.13 " "R10.819" "R17 "

# Same, padding with periods instead.
str_pad(string = ICD_codes, width = 7, side = "right", pad = ".")
## [1] "R10.13." "R10.819" "R17...."

# Pad on the right with zeros to a minimum width of 8.
str_pad(string = ICD_codes, width = 8, side = "right", pad = "0")
## [1] "R10.1300" "R10.8190" "R1700000"
# Shorten a long string to at most 10 characters, removing text from the
# centre; the removed part is marked with "...".
original <- "Symptom onset on 4/3/2020 with vomiting"
str_trunc(
  string = original,
  width = 10,
  side = "center"
)
## [1] "Symp...ing"
# Use str_trunc() to shorten strings and str_pad() to lengthen them, so that
# all values end up with the same width.
# Example codes of differing lengths.
ICD_codes <- c(
  "R10.13",
  "R10.819",
  "R17"
)

# Step 1: cut any code longer than 6 characters down to width 6; the cut is
# marked with "...", as the output shows.
ICD_codes_2 <- str_trunc(string = ICD_codes, width = 6)
ICD_codes_2
## [1] "R10.13" "R10..." "R17"

# Step 2: pad the shorter codes on the right so every value has width 6.
# NOTE(review): this object is named IDC_code_3 while its inputs are named
# ICD_codes* - possibly a typo; name kept as-is in case it is used elsewhere.
IDC_code_3 <- str_pad(string = ICD_codes_2, width = 6, side = "right")
IDC_code_3
## [1] "R10.13" "R10..." "R17 "
# Use str_trim() to remove spaces, newlines (\n), or tabs (\t) from the
# sides of a string. Add "right", "left", or "both" to the command
# to specify which side to trim (e.g. str_trim(x, "right")).
# Example IDs, some of which carry excess trailing whitespace.
IDs <- c(
  "provA_1852 ", # excess trailing whitespace
  "provA_2345",  # no excess whitespace
  "provA_9460 "  # excess trailing whitespace
)

# No side given: this call relies on str_trim()'s default side.
str_trim(string = IDs)
## [1] "provA_1852" "provA_2345" "provA_9460"

# Trim the left side only; the trailing whitespace is untouched.
str_trim(string = IDs, side = "left")
## [1] "provA_1852 " "provA_2345" "provA_9460 "

# Trim both sides explicitly.
str_trim(string = IDs, side = "both")
## [1] "provA_1852" "provA_2345" "provA_9460"

# str_squish() cleans up whitespace throughout the string; here the leading
# space and the trailing newline are removed.
str_squish(string = " Pt requires IV saline\n")
## [1] "Pt requires IV saline"
# start and end both at position 3: returns the 3rd letter from the left
str_sub(string = "pneumonia", start = 3, end = 3)
## [1] "e"

# there is no position 0, so an empty string is returned
str_sub(string = "pneumonia", start = 0, end = 0)
## [1] ""

# negative positions count from the right: 6th from left to 1st from right
str_sub(string = "pneumonia", start = 6, end = -1)
## [1] "onia"

# 5th from the right through 2nd from the right
str_sub(string = "pneumonia", start = -5, end = -2)
## [1] "moni"

# an end position beyond the string simply stops at the last character
str_sub(string = "pneumonia", start = 4, end = 15)
## [1] "umonia"
# Free-text complaints to extract words from.
chief_complaints <- c(
  "I just got out of the hospital 2 days ago, but still can barely breathe.",
  "My stomach hurts",
  "Severe ear pain"
)

# Keep the 1st through 3rd space-separated word of each string.
word(
  string = chief_complaints,
  start = 1,
  end = 3,
  sep = " "
)
## [1] "I just got" "My stomach hurts" "Severe ear pain"
# The assignment form of str_sub() overwrites part of a string in place:
# here characters 3 to 4 are replaced with "XX".
word <- "pneumonia"
str_sub(string = word, start = 3, end = 4) <- "XX"
word
## [1] "pnXXmonia"

# The same replacement applied to every element of a vector.
words <- c("pneumonia", "tubercolosis", "HIV")
# convert the third and fourth characters to X
str_sub(string = words, start = 3, end = 4) <- "XX"
words
## [1] "pnXXmonia" "tuXXrcolosis" "HIXX"
# Number of characters in a string.
str_length(string = "abc")
## [1] 3

# Does the string contain the pattern anywhere? Returns TRUE or FALSE.
str_detect(
  string = "primary school teacher",
  pattern = "teach"
)
## [1] TRUE
# Free-text occupations, standing in for a vector or data frame column.
occupations <- c(
  "field laborer",
  "university professor",
  "primary school teacher & tutor",
  "tutor",
  "nurse at regional hospital",
  "lineworker at Amberdeen Fish Factory",
  "physican",
  "cardiologist",
  "office worker",
  "food service"
)

# Show the values that the searches below are run against.
print(x = occupations)
## [1] "field laborer"
## [2] "university professor"
## [3] "primary school teacher & tutor"
## [4] "tutor"
## [5] "nurse at regional hospital"
## [6] "lineworker at Amberdeen Fish Factory"
## [7] "physican"
## [8] "cardiologist"
## [9] "office worker"
## [10] "food service"
# One TRUE/FALSE per occupation: does it contain "teach"?
str_detect(string = occupations, pattern = "teach")
## [1] FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE

# Summing the logical vector counts the matching occupations.
sum(str_detect(string = occupations, pattern = "teach"))
## [1] 1

# "|" inside the pattern means OR: count occupations matching any of the terms.
sum(
  str_detect(
    string = occupations,
    pattern = "teach|professor|tutor"
  )
)
## [1] 3
# To ignore case/capitalization, wrap the pattern within regex() and add the
# argument ignore_case = TRUE inside regex(). (T also works as shorthand, but
# TRUE is safer because T can be reassigned.)
# Case-insensitive match: the pattern is wrapped in regex() with
# ignore_case = TRUE so that "teacher" also matches "Teacher".
# TRUE is spelled out because `T` is an ordinary variable that can be
# reassigned.
str_detect(
  string = "Teacher",
  pattern = regex("teacher", ignore_case = TRUE)
)
## [1] TRUE
# Numbers written with "." as the thousands separator and "," as the decimal
# mark. (The original assigned this vector twice; the duplicate is removed.)
lengths <- c("2.454,56", "1,2", "6.096,5")

# Convert to numeric in two passes:
#   inner gsub - remove the thousands separators ("\\." escapes the period so
#                it is matched literally)
#   outer gsub - replace the decimal comma with a period
# as.numeric() then converts the cleaned strings.
as.numeric(
  gsub(
    pattern = ",",
    replacement = ".",
    x = gsub("\\.", "", lengths)
  )
)
## [1] 2454.56 1.20 6096.50
# Example free-text outcomes; the last value is the literal text "NA".
outcome <- c(
  "Karl: dead",
  "Samantha: dead",
  "Marco: not dead",
  "NA"
)

# Replace every occurrence of "dead" with "deceased"; strings without a match
# are returned unchanged.
str_replace_all(
  string = outcome,
  pattern = "dead",
  replacement = "deceased"
)
## [1] "Karl: deceased" "Samantha: deceased" "Marco: not deceased"
## [4] "NA"
# Add an `is_educator` column: "Educator" when the occupation contains any of
# the search terms (case-insensitive), "Not an educator" otherwise.
# NOTE(review): `occupations` is not a column of `df` in the code visible
# here, so mutate() would pick up the standalone `occupations` vector, whose
# length differs from nrow(df) - confirm `df` has an `occupations` column
# before running this.
df <- mutate(
  df,
  is_educator = case_when(
    # search terms, matched without regard to case
    str_detect(
      string = occupations,
      pattern = regex("teach|prof|tutor|university", ignore_case = TRUE)
    ) ~ "Educator",
    # everything else
    TRUE ~ "Not an educator"
  )
)
# Add an `is_educator` column based on two criteria. A row is labelled
# "Educator" only if its occupation contains a search term AND does not
# contain an exclusion term; every other row is "Not an educator".
# Fixes relative to the original: the second str_detect() call was missing its
# closing parenthesis and the first case_when() formula was missing its
# trailing comma, so the code did not parse; `T` is replaced by `TRUE`.
# NOTE(review): `occupations` is not a column of `df` in the code visible
# here - confirm `df` has an `occupations` column before running this.
df <- df %>%
  # value in new column is_educator is based on conditional logic
  mutate(is_educator = case_when(
    # must have a search term ...
    str_detect(
      occupations,
      regex("teach|prof|tutor|university", ignore_case = TRUE)
    ) &
      # ... AND must NOT have an exclusion term (negate = TRUE inverts the match)
      str_detect(
        occupations,
        regex("admin", ignore_case = TRUE),
        negate = TRUE
      ) ~ "Educator",
    # all rows not meeting the above criteria
    TRUE ~ "Not an educator"
  ))
# Extended occupation list: the earlier ten values plus two more entries
# ("tutor teacher", "professors") for the extraction examples.
occupations <- c(
  "field laborer",
  "university professor",
  "primary school teacher & tutor",
  "tutor",
  "nurse at regional hospital",
  "lineworker at Amberdeen Fish Factory",
  "physican",
  "cardiologist",
  "office worker",
  "food service",
  "tutor teacher",
  "professors"
)

# Show the values that the extraction examples are run against.
print(x = occupations)
## [1] "field laborer"
## [2] "university professor"
## [3] "primary school teacher & tutor"
## [4] "tutor"
## [5] "nurse at regional hospital"
## [6] "lineworker at Amberdeen Fish Factory"
## [7] "physican"
## [8] "cardiologist"
## [9] "office worker"
## [10] "food service"
## [11] "tutor teacher"
## [12] "professors"
# Return every match found in each string: the result is a list with one
# character vector per occupation, and character(0) where nothing matched.
str_extract_all(
  string = occupations,
  pattern = "teach|prof|tutor"
)
## [[1]]
## character(0)
##
## [[2]]
## [1] "prof"
##
## [[3]]
## [1] "teach" "tutor"
##
## [[4]]
## [1] "tutor"
##
## [[5]]
## character(0)
##
## [[6]]
## character(0)
##
## [[7]]
## character(0)
##
## [[8]]
## character(0)
##
## [[9]]
## character(0)
##
## [[10]]
## character(0)
##
## [[11]]
## [1] "tutor" "teach"
##
## [[12]]
## [1] "prof"
# The same twelve occupations again, restated so the examples below can be
# read on their own.
occupations <- c(
  "field laborer",
  "university professor",
  "primary school teacher & tutor",
  "tutor",
  "nurse at regional hospital",
  "lineworker at Amberdeen Fish Factory",
  "physican",
  "cardiologist",
  "office worker",
  "food service",
  "tutor teacher",
  "professors"
)

# Show the values being searched.
print(x = occupations)
## [1] "field laborer"
## [2] "university professor"
## [3] "primary school teacher & tutor"
## [4] "tutor"
## [5] "nurse at regional hospital"
## [6] "lineworker at Amberdeen Fish Factory"
## [7] "physican"
## [8] "cardiologist"
## [9] "office worker"
## [10] "food service"
## [11] "tutor teacher"
## [12] "professors"
# str_extract() keeps only the first match in each evaluated string and
# returns a character vector with one element per string; it returns NA where
# there was no match.
str_extract(
  string = occupations,
  pattern = "teach|prof|tutor"
)
## [1] NA "prof" "teach" "tutor" NA NA NA NA NA
## [10] NA "tutor" "prof"

# str_subset() returns the full strings that contain a match.
str_subset(
  string = occupations,
  pattern = "teach|prof|tutor"
)
## [1] "university professor" "primary school teacher & tutor"
## [3] "tutor" "tutor teacher"
## [5] "professors"

# str_count() returns the number of matches found in each string.
str_count(
  string = occupations,
  pattern = regex("teach|prof|tutor")
)
## [1] 0 1 2 1 0 0 0 0 0 0 2 1