stringR

From Ewen Harrison - Sarah Elliot - The University of Edinburgh

[https://media.ed.ac.uk/media/HealthyR+demoA+stringR/1_2ksr312b]

Day 07 of HealthyR demo

stringR

library(tidyverse)
library(stringr) # already part of tidyverse
library(lubridate)
# Simulate a 50-patient admissions dataset for the stringr examples.
# `medication` is deliberately messy free text: mixed case, misspellings,
# assorted separators and missing values. No seed is set here, so the randomly
# generated columns differ on every run.
df <- tibble(
  subjid = 1:50,
  age = round(runif(50, 5, 100)),
  # sample() treats `prob` as weights, so they need not sum to exactly 1
  sex = sample(
    c("Male", "Female"),
    size = 50, replace = TRUE, prob = c(0.49, 0.50)
  ),
  country = sample(
    c("England", "Scotland", "Wales", "N. Ireland"),
    size = 50, replace = TRUE, prob = c(0.7, 0.16, 0.09, 0.05)
  ),
  # Admission dates drawn from every calendar day in the range
  adm_date = sample(
    seq(ymd("2019-01-01"), ymd("2022-10-01"), by = "day"),
    size = 50, replace = TRUE
  ),
  heart_rate = round(runif(50, 45, 150)),
  oxy_sat = round(runif(50, 90, 100)),
  test_pos = sample(
    c("Yes", "No", "Unknown"),
    size = 50, replace = TRUE, prob = c(0.33, 0.60, 0.07)
  ),
  # Fixed free-text entries, one per patient
  medication = c(
    "Patient received 500mg x2 paracetamol",
    "Morphine; Amoxicillin",
    "Paracetamol 500 mg x 2",
    "Insulin",
    "paracetimol",
    "meropenem, diazapam, paracetamol",
    "Paracetamol when needed",
    "Citalopram & paractamol",
    "Ibuprofen",
    "adenosine/paracetamol",
    "patient to take 2 tablets paracetamol 500mg every 4 hours",
    "heparin, amoxicillin",
    "cephalexin",
    "paracetamol for pain",
    NA,
    "Insulin",
    "Asprin",
    "Codine",
    "patient received paracetimol at 4am",
    "500mg x2 Ibuprofen & 500mg x2 paracetamol",
    "Cephalexin, Sertraline, Atenolol",
    "paracetomol",
    "Insulin, folic acid",
    "Multivitamins",
    "paracetamol/ibuprofin",
    "patient started taking parcetamol on day 2 of admission",
    "Glucose",
    "paracetamol for pain",
    "Insulin&atenolol",
    "paracetamol",
    "diazipam",
    "paracetamol 500mg x2 at 7am",
    "Lactulose",
    "Lactulose and multivitamins",
    "unknown",
    "11:00 paracetamol, 13:00 ibuprofen, 15:00 paracetimol",
    "Insulin",
    "meropenem then paracetaml",
    "unknown",
    "Saline solution",
    "10am Insulin",
    "Asprin",
    "Lorazepam",
    NA,
    "Laculose&Paracetamol",
    NA,
    "Insulin",
    "glucose",
    "Citalopram & paractamol",
    "16.30 paracetamol 500mg"
  )
)
df
## # A tibble: 50 × 9
##    subjid   age sex    country adm_date   heart_rate oxy_sat test_pos medication
##     <int> <dbl> <chr>  <chr>   <date>          <dbl>   <dbl> <chr>    <chr>     
##  1      1    12 Male   England 2022-02-11         62      92 No       Patient r…
##  2      2    65 Male   Wales   2019-01-20        115      94 No       Morphine;…
##  3      3    38 Male   Scotla… 2021-07-05         81     100 Yes      Paracetam…
##  4      4    78 Male   England 2021-06-09        141      90 No       Insulin   
##  5      5     9 Male   Scotla… 2020-01-06         68      92 No       paracetim…
##  6      6    77 Female England 2022-07-09        133      95 No       meropenem…
##  7      7     7 Female Wales   2019-07-17        133      90 No       Paracetam…
##  8      8    47 Male   England 2020-04-24        113      96 Unknown  Citalopra…
##  9      9    76 Female England 2021-04-23        131      95 Yes      Ibuprofen 
## 10     10    84 Female England 2020-03-09         57      98 No       adenosine…
## # ℹ 40 more rows

Stringr to change case

# Case conversion with stringr. Each step overwrites `medication`, so only the
# final lower-casing persists; the upper- and title-case steps are shown for
# demonstration. The column is referenced by its bare name so mutate() reads it
# from the piped-in data (data masking) instead of reaching back to the global
# `df`, which would ignore any grouping or earlier pipeline step.
df <- df %>%
  mutate(medication = str_to_upper(medication))

df <- df %>%
  mutate(medication = str_to_title(medication))

df <- df %>%
  mutate(medication = str_to_lower(medication))

Extracting information

# Number of characters in each medication entry (missing entries stay NA)
df$medication %>% str_length()
##  [1] 37 21 22  7 11 32 23 23  9 21 57 20 10 20 NA  7  6  6 35 41 32 11 19 13 21
## [26] 55  7 20 16 11  8 27  9 27  7 53  7 25  7 15 12  6  9 NA 20 NA  7  7 23 23
# How many times "insulin" occurs within each entry
df$medication %>% str_count("insulin")
##  [1]  0  0  0  1  0  0  0  0  0  0  0  0  0  0 NA  1  0  0  0  0  0  0  1  0  0
## [26]  0  0  0  1  0  0  0  0  0  0  0  1  0  0  0  1  0  0 NA  0 NA  1  0  0  0
# Positions of the entries that contain "insulin"
df$medication %>% str_which("insulin")
## [1]  4 16 23 29 37 41 47
# The matching entries themselves
df$medication %>% str_subset("insulin")
## [1] "insulin"             "insulin"             "insulin, folic acid"
## [4] "insulin&atenolol"    "insulin"             "10am insulin"       
## [7] "insulin"
# Keep only the rows whose medication entry contains "insulin".
# `medication` is referenced by its bare name so filter() evaluates it against
# the piped-in data rather than the global `df`.
df %>%
  filter(str_detect(medication, pattern = "insulin"))
## # A tibble: 7 × 9
##   subjid   age sex    country adm_date   heart_rate oxy_sat test_pos medication 
##    <int> <dbl> <chr>  <chr>   <date>          <dbl>   <dbl> <chr>    <chr>      
## 1      4    78 Male   England 2021-06-09        141      90 No       insulin    
## 2     16    94 Female England 2019-01-14        129      99 Yes      insulin    
## 3     23    76 Female England 2019-01-08         59      99 No       insulin, f…
## 4     29    28 Male   England 2020-10-26        117     100 No       insulin&at…
## 5     37    40 Female England 2019-09-25         58      98 Yes      insulin    
## 6     41    23 Female England 2019-04-26         99      99 Yes      10am insul…
## 7     47    33 Male   England 2019-09-13         73     100 No       insulin
# Add an `insulin` column holding the number of "insulin" matches per entry.
# The bare column name keeps the computation inside mutate()'s data mask.
df <- df %>%
  mutate(insulin = str_count(medication, "insulin"))
df
## # A tibble: 50 × 10
##    subjid   age sex    country adm_date   heart_rate oxy_sat test_pos medication
##     <int> <dbl> <chr>  <chr>   <date>          <dbl>   <dbl> <chr>    <chr>     
##  1      1    12 Male   England 2022-02-11         62      92 No       patient r…
##  2      2    65 Male   Wales   2019-01-20        115      94 No       morphine;…
##  3      3    38 Male   Scotla… 2021-07-05         81     100 Yes      paracetam…
##  4      4    78 Male   England 2021-06-09        141      90 No       insulin   
##  5      5     9 Male   Scotla… 2020-01-06         68      92 No       paracetim…
##  6      6    77 Female England 2022-07-09        133      95 No       meropenem…
##  7      7     7 Female Wales   2019-07-17        133      90 No       paracetam…
##  8      8    47 Male   England 2020-04-24        113      96 Unknown  citalopra…
##  9      9    76 Female England 2021-04-23        131      95 Yes      ibuprofen 
## 10     10    84 Female England 2020-03-09         57      98 No       adenosine…
## # ℹ 40 more rows
## # ℹ 1 more variable: insulin <int>

Paracetamol

Hint: the medication column contains several misspellings of paracetamol: paracetimol, paracetomol, paractamol, parcetamol and paracetaml.

# Number of characters in each medication entry (missing entries stay NA)
df$medication %>% str_length()
##  [1] 37 21 22  7 11 32 23 23  9 21 57 20 10 20 NA  7  6  6 35 41 32 11 19 13 21
## [26] 55  7 20 16 11  8 27  9 27  7 53  7 25  7 15 12  6  9 NA 20 NA  7  7 23 23
# How many times "par" followed by "a" or "c" occurs within each entry;
# this loose pattern also catches the misspelt variants
df$medication %>% str_count("par[ac]")
##  [1]  1  0  1  0  1  1  1  1  0  1  1  0  0  1 NA  0  0  0  1  1  0  1  0  0  1
## [26]  1  0  1  0  1  0  1  0  0  0  2  0  1  0  0  0  0  0 NA  1 NA  0  0  1  1
# Positions of the entries that match the pattern
df$medication %>% str_which("par[ac]")
##  [1]  1  3  5  6  7  8 10 11 14 19 20 22 25 26 28 30 32 36 38 45 49 50
# The matching entries themselves
df$medication %>% str_subset("par[ac]")
##  [1] "patient received 500mg x2 paracetamol"                    
##  [2] "paracetamol 500 mg x 2"                                   
##  [3] "paracetimol"                                              
##  [4] "meropenem, diazapam, paracetamol"                         
##  [5] "paracetamol when needed"                                  
##  [6] "citalopram & paractamol"                                  
##  [7] "adenosine/paracetamol"                                    
##  [8] "patient to take 2 tablets paracetamol 500mg every 4 hours"
##  [9] "paracetamol for pain"                                     
## [10] "patient received paracetimol at 4am"                      
## [11] "500mg x2 ibuprofen & 500mg x2 paracetamol"                
## [12] "paracetomol"                                              
## [13] "paracetamol/ibuprofin"                                    
## [14] "patient started taking parcetamol on day 2 of admission"  
## [15] "paracetamol for pain"                                     
## [16] "paracetamol"                                              
## [17] "paracetamol 500mg x2 at 7am"                              
## [18] "11:00 paracetamol, 13:00 ibuprofen, 15:00 paracetimol"    
## [19] "meropenem then paracetaml"                                
## [20] "laculose&paracetamol"                                     
## [21] "citalopram & paractamol"                                  
## [22] "16.30 paracetamol 500mg"
# Keep only the rows whose medication entry matches the loose paracetamol
# pattern. `medication` is referenced by its bare name so filter() evaluates it
# against the piped-in data rather than the global `df`.
df %>%
  filter(str_detect(medication, pattern = "par[ac]"))
## # A tibble: 22 × 10
##    subjid   age sex    country adm_date   heart_rate oxy_sat test_pos medication
##     <int> <dbl> <chr>  <chr>   <date>          <dbl>   <dbl> <chr>    <chr>     
##  1      1    12 Male   England 2022-02-11         62      92 No       patient r…
##  2      3    38 Male   Scotla… 2021-07-05         81     100 Yes      paracetam…
##  3      5     9 Male   Scotla… 2020-01-06         68      92 No       paracetim…
##  4      6    77 Female England 2022-07-09        133      95 No       meropenem…
##  5      7     7 Female Wales   2019-07-17        133      90 No       paracetam…
##  6      8    47 Male   England 2020-04-24        113      96 Unknown  citalopra…
##  7     10    84 Female England 2020-03-09         57      98 No       adenosine…
##  8     11    37 Male   England 2019-11-19         51      99 Yes      patient t…
##  9     14    71 Female England 2020-02-08         50      96 Yes      paracetam…
## 10     19    64 Female England 2020-12-06         94      96 No       patient r…
## # ℹ 12 more rows
## # ℹ 1 more variable: insulin <int>
# Add a `paracetamol` column holding the number of "par[ac]" matches per entry.
# The result goes into its own column: writing it to `insulin` would silently
# overwrite the insulin counts computed earlier. The bare column name keeps the
# computation inside mutate()'s data mask.
df <- df %>%
  mutate(paracetamol = str_count(medication, "par[ac]"))
df
## # A tibble: 50 × 11
##    subjid   age sex    country adm_date   heart_rate oxy_sat test_pos medication
##     <int> <dbl> <chr>  <chr>   <date>          <dbl>   <dbl> <chr>    <chr>     
##  1      1    12 Male   England 2022-02-11         62      92 No       patient r…
##  2      2    65 Male   Wales   2019-01-20        115      94 No       morphine;…
##  3      3    38 Male   Scotla… 2021-07-05         81     100 Yes      paracetam…
##  4      4    78 Male   England 2021-06-09        141      90 No       insulin   
##  5      5     9 Male   Scotla… 2020-01-06         68      92 No       paracetim…
##  6      6    77 Female England 2022-07-09        133      95 No       meropenem…
##  7      7     7 Female Wales   2019-07-17        133      90 No       paracetam…
##  8      8    47 Male   England 2020-04-24        113      96 Unknown  citalopra…
##  9      9    76 Female England 2021-04-23        131      95 Yes      ibuprofen 
## 10     10    84 Female England 2020-03-09         57      98 No       adenosine…
## # ℹ 40 more rows
## # ℹ 2 more variables: insulin <int>, paracetamol <int>

Pattern matching with regular expressions

parac[e]*t[iao]m[o]*l

str_replace

# Print the lower-cased medication column before any spelling is corrected
df %>% pull(medication)
##  [1] "patient received 500mg x2 paracetamol"                    
##  [2] "morphine; amoxicillin"                                    
##  [3] "paracetamol 500 mg x 2"                                   
##  [4] "insulin"                                                  
##  [5] "paracetimol"                                              
##  [6] "meropenem, diazapam, paracetamol"                         
##  [7] "paracetamol when needed"                                  
##  [8] "citalopram & paractamol"                                  
##  [9] "ibuprofen"                                                
## [10] "adenosine/paracetamol"                                    
## [11] "patient to take 2 tablets paracetamol 500mg every 4 hours"
## [12] "heparin, amoxicillin"                                     
## [13] "cephalexin"                                               
## [14] "paracetamol for pain"                                     
## [15] NA                                                         
## [16] "insulin"                                                  
## [17] "asprin"                                                   
## [18] "codine"                                                   
## [19] "patient received paracetimol at 4am"                      
## [20] "500mg x2 ibuprofen & 500mg x2 paracetamol"                
## [21] "cephalexin, sertraline, atenolol"                         
## [22] "paracetomol"                                              
## [23] "insulin, folic acid"                                      
## [24] "multivitamins"                                            
## [25] "paracetamol/ibuprofin"                                    
## [26] "patient started taking parcetamol on day 2 of admission"  
## [27] "glucose"                                                  
## [28] "paracetamol for pain"                                     
## [29] "insulin&atenolol"                                         
## [30] "paracetamol"                                              
## [31] "diazipam"                                                 
## [32] "paracetamol 500mg x2 at 7am"                              
## [33] "lactulose"                                                
## [34] "lactulose and multivitamins"                              
## [35] "unknown"                                                  
## [36] "11:00 paracetamol, 13:00 ibuprofen, 15:00 paracetimol"    
## [37] "insulin"                                                  
## [38] "meropenem then paracetaml"                                
## [39] "unknown"                                                  
## [40] "saline solution"                                          
## [41] "10am insulin"                                             
## [42] "asprin"                                                   
## [43] "lorazepam"                                                
## [44] NA                                                         
## [45] "laculose&paracetamol"                                     
## [46] NA                                                         
## [47] "insulin"                                                  
## [48] "glucose"                                                  
## [49] "citalopram & paractamol"                                  
## [50] "16.30 paracetamol 500mg"
# Standardise misspellings of paracetamol. In the pattern, `[a]*` and `[e]*`
# make the "a" after "par" and the "e" after "c" optional (catching
# "parcetamol" and "paractamol"), `[iao]` accepts any of the three vowels seen
# before "m", and `[o]*` makes the final "o" optional ("paracetaml").
# str_replace() only rewrites the FIRST match in each string, so entry 36 keeps
# its second misspelling.
str_replace(df$medication, "par[a]*c[e]*t[iao]m[o]*l", "paracetamol")
##  [1] "patient received 500mg x2 paracetamol"                    
##  [2] "morphine; amoxicillin"                                    
##  [3] "paracetamol 500 mg x 2"                                   
##  [4] "insulin"                                                  
##  [5] "paracetamol"                                              
##  [6] "meropenem, diazapam, paracetamol"                         
##  [7] "paracetamol when needed"                                  
##  [8] "citalopram & paracetamol"                                 
##  [9] "ibuprofen"                                                
## [10] "adenosine/paracetamol"                                    
## [11] "patient to take 2 tablets paracetamol 500mg every 4 hours"
## [12] "heparin, amoxicillin"                                     
## [13] "cephalexin"                                               
## [14] "paracetamol for pain"                                     
## [15] NA                                                         
## [16] "insulin"                                                  
## [17] "asprin"                                                   
## [18] "codine"                                                   
## [19] "patient received paracetamol at 4am"                      
## [20] "500mg x2 ibuprofen & 500mg x2 paracetamol"                
## [21] "cephalexin, sertraline, atenolol"                         
## [22] "paracetamol"                                              
## [23] "insulin, folic acid"                                      
## [24] "multivitamins"                                            
## [25] "paracetamol/ibuprofin"                                    
## [26] "patient started taking paracetamol on day 2 of admission" 
## [27] "glucose"                                                  
## [28] "paracetamol for pain"                                     
## [29] "insulin&atenolol"                                         
## [30] "paracetamol"                                              
## [31] "diazipam"                                                 
## [32] "paracetamol 500mg x2 at 7am"                              
## [33] "lactulose"                                                
## [34] "lactulose and multivitamins"                              
## [35] "unknown"                                                  
## [36] "11:00 paracetamol, 13:00 ibuprofen, 15:00 paracetimol"    
## [37] "insulin"                                                  
## [38] "meropenem then paracetamol"                               
## [39] "unknown"                                                  
## [40] "saline solution"                                          
## [41] "10am insulin"                                             
## [42] "asprin"                                                   
## [43] "lorazepam"                                                
## [44] NA                                                         
## [45] "laculose&paracetamol"                                     
## [46] NA                                                         
## [47] "insulin"                                                  
## [48] "glucose"                                                  
## [49] "citalopram & paracetamol"                                 
## [50] "16.30 paracetamol 500mg"
# str_replace_all() rewrites EVERY match in each string, so both spellings in
# entry 36 are standardised. The `[a]*` after "par" makes that "a" optional so
# that "parcetamol" (entry 26) is corrected as well.
str_replace_all(df$medication, "par[a]*c[e]*t[iao]m[o]*l", "paracetamol")
##  [1] "patient received 500mg x2 paracetamol"                    
##  [2] "morphine; amoxicillin"                                    
##  [3] "paracetamol 500 mg x 2"                                   
##  [4] "insulin"                                                  
##  [5] "paracetamol"                                              
##  [6] "meropenem, diazapam, paracetamol"                         
##  [7] "paracetamol when needed"                                  
##  [8] "citalopram & paracetamol"                                 
##  [9] "ibuprofen"                                                
## [10] "adenosine/paracetamol"                                    
## [11] "patient to take 2 tablets paracetamol 500mg every 4 hours"
## [12] "heparin, amoxicillin"                                     
## [13] "cephalexin"                                               
## [14] "paracetamol for pain"                                     
## [15] NA                                                         
## [16] "insulin"                                                  
## [17] "asprin"                                                   
## [18] "codine"                                                   
## [19] "patient received paracetamol at 4am"                      
## [20] "500mg x2 ibuprofen & 500mg x2 paracetamol"                
## [21] "cephalexin, sertraline, atenolol"                         
## [22] "paracetamol"                                              
## [23] "insulin, folic acid"                                      
## [24] "multivitamins"                                            
## [25] "paracetamol/ibuprofin"                                    
## [26] "patient started taking paracetamol on day 2 of admission" 
## [27] "glucose"                                                  
## [28] "paracetamol for pain"                                     
## [29] "insulin&atenolol"                                         
## [30] "paracetamol"                                              
## [31] "diazipam"                                                 
## [32] "paracetamol 500mg x2 at 7am"                              
## [33] "lactulose"                                                
## [34] "lactulose and multivitamins"                              
## [35] "unknown"                                                  
## [36] "11:00 paracetamol, 13:00 ibuprofen, 15:00 paracetamol"    
## [37] "insulin"                                                  
## [38] "meropenem then paracetamol"                               
## [39] "unknown"                                                  
## [40] "saline solution"                                          
## [41] "10am insulin"                                             
## [42] "asprin"                                                   
## [43] "lorazepam"                                                
## [44] NA                                                         
## [45] "laculose&paracetamol"                                     
## [46] NA                                                         
## [47] "insulin"                                                  
## [48] "glucose"                                                  
## [49] "citalopram & paracetamol"                                 
## [50] "16.30 paracetamol 500mg"
# Persist the cleaned spelling back into `medication`.
# The pattern makes the "a" after "par" optional (`[a]*`) so that "parcetamol"
# is corrected along with the other misspellings; requiring the literal prefix
# "parac" would leave that variant untouched. The column is referenced by its
# bare name so mutate() reads it from the piped-in data.
df <- df %>%
  mutate(
    medication = str_replace_all(
      medication,
      pattern = "par[a]*c[e]*t[iao]m[o]*l",
      replacement = "paracetamol"
    )
  )