[https://media.ed.ac.uk/media/HealthyR+demoA+stringR/1_2ksr312b]
Day 07 of HealthyR demo
stringR
library(tidyverse)
library(stringr) # already part of tidyverse
library(lubridate)
# Build a simulated 50-patient dataset.
# The demographic and vital-sign columns are drawn at random (no seed is set,
# so the values differ on every run); the free-text medication column is fixed
# and deliberately messy (mixed case, misspellings, NAs) for the string demos.
df <- tibble(
  subjid = seq_len(50),
  age = round(runif(n = 50, min = 5, max = 100)),
  sex = sample(
    x = c("Male", "Female"),
    size = 50,
    replace = TRUE,
    prob = c(0.49, 0.50)
  ),
  country = sample(
    x = c("England", "Scotland", "Wales", "N. Ireland"),
    size = 50,
    replace = TRUE,
    prob = c(0.7, 0.16, 0.09, 0.05)
  ),
  adm_date = sample(
    x = seq(ymd("2019-01-01"), ymd("2022-10-01"), by = "day"),
    size = 50,
    replace = TRUE
  ),
  heart_rate = round(runif(n = 50, min = 45, max = 150)),
  oxy_sat = round(runif(n = 50, min = 90, max = 100)),
  test_pos = sample(
    x = c("Yes", "No", "Unknown"),
    size = 50,
    replace = TRUE,
    prob = c(0.33, 0.60, 0.07)
  ),
  medication = c(
    "Patient received 500mg x2 paracetamol",
    "Morphine; Amoxicillin",
    "Paracetamol 500 mg x 2",
    "Insulin",
    "paracetimol",
    "meropenem, diazapam, paracetamol",
    "Paracetamol when needed",
    "Citalopram & paractamol",
    "Ibuprofen",
    "adenosine/paracetamol",
    "patient to take 2 tablets paracetamol 500mg every 4 hours",
    "heparin, amoxicillin",
    "cephalexin",
    "paracetamol for pain",
    NA,
    "Insulin",
    "Asprin",
    "Codine",
    "patient received paracetimol at 4am",
    "500mg x2 Ibuprofen & 500mg x2 paracetamol",
    "Cephalexin, Sertraline, Atenolol",
    "paracetomol",
    "Insulin, folic acid",
    "Multivitamins",
    "paracetamol/ibuprofin",
    "patient started taking parcetamol on day 2 of admission",
    "Glucose",
    "paracetamol for pain",
    "Insulin&atenolol",
    "paracetamol",
    "diazipam",
    "paracetamol 500mg x2 at 7am",
    "Lactulose",
    "Lactulose and multivitamins",
    "unknown",
    "11:00 paracetamol, 13:00 ibuprofen, 15:00 paracetimol",
    "Insulin",
    "meropenem then paracetaml",
    "unknown",
    "Saline solution",
    "10am Insulin",
    "Asprin",
    "Lorazepam",
    NA,
    "Laculose&Paracetamol",
    NA,
    "Insulin",
    "glucose",
    "Citalopram & paractamol",
    "16.30 paracetamol 500mg"
  )
)
# Print the tibble to inspect the first rows
df
## # A tibble: 50 × 9
## subjid age sex country adm_date heart_rate oxy_sat test_pos medication
## <int> <dbl> <chr> <chr> <date> <dbl> <dbl> <chr> <chr>
## 1 1 12 Male England 2022-02-11 62 92 No Patient r…
## 2 2 65 Male Wales 2019-01-20 115 94 No Morphine;…
## 3 3 38 Male Scotla… 2021-07-05 81 100 Yes Paracetam…
## 4 4 78 Male England 2021-06-09 141 90 No Insulin
## 5 5 9 Male Scotla… 2020-01-06 68 92 No paracetim…
## 6 6 77 Female England 2022-07-09 133 95 No meropenem…
## 7 7 7 Female Wales 2019-07-17 133 90 No Paracetam…
## 8 8 47 Male England 2020-04-24 113 96 Unknown Citalopra…
## 9 9 76 Female England 2021-04-23 131 95 Yes Ibuprofen
## 10 10 84 Female England 2020-03-09 57 98 No adenosine…
## # ℹ 40 more rows
# Case conversion demo: upper case, then title case, then lower case.
# Each step overwrites `medication`, so the column ends up lower case, which
# is what the lower-case patterns used further down rely on.
# The column is referenced by its bare name inside mutate() (data masking)
# rather than via df$..., so each step works on the data flowing through the
# pipe instead of reaching back to the global `df`.
df <- df %>%
  mutate(medication = str_to_upper(medication))
df <- df %>%
  mutate(medication = str_to_title(medication))
df <- df %>%
  mutate(medication = str_to_lower(medication))
# Number of characters in each medication entry
df %>%
  pull(medication) %>%
  str_length()
## [1] 37 21 22 7 11 32 23 23 9 21 57 20 10 20 NA 7 6 6 35 41 32 11 19 13 21
## [26] 55 7 20 16 11 8 27 9 27 7 53 7 25 7 15 12 6 9 NA 20 NA 7 7 23 23
# How many times does "insulin" occur in each medication entry?
str_count(df[["medication"]], "insulin")
## [1] 0 0 0 1 0 0 0 0 0 0 0 0 0 0 NA 1 0 0 0 0 0 0 1 0 0
## [26] 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 1 0 0 NA 0 NA 1 0 0 0
# Positions of the entries that mention "insulin" ...
df %>%
  pull(medication) %>%
  str_which("insulin")
## [1] 4 16 23 29 37 41 47
# ... and the matching entries themselves
df %>%
  pull(medication) %>%
  str_subset("insulin")
## [1] "insulin" "insulin" "insulin, folic acid"
## [4] "insulin&atenolol" "insulin" "10am insulin"
## [7] "insulin"
# Presence or absence of a pattern: keep only the rows whose medication text
# contains "insulin". The column is referenced by bare name so the test is
# evaluated on the piped data, not on the global `df`.
df %>%
  filter(str_detect(medication, pattern = "insulin"))
## # A tibble: 7 × 9
## subjid age sex country adm_date heart_rate oxy_sat test_pos medication
## <int> <dbl> <chr> <chr> <date> <dbl> <dbl> <chr> <chr>
## 1 4 78 Male England 2021-06-09 141 90 No insulin
## 2 16 94 Female England 2019-01-14 129 99 Yes insulin
## 3 23 76 Female England 2019-01-08 59 99 No insulin, f…
## 4 29 28 Male England 2020-10-26 117 100 No insulin&at…
## 5 37 40 Female England 2019-09-25 58 98 Yes insulin
## 6 41 23 Female England 2019-04-26 99 99 Yes 10am insul…
## 7 47 33 Male England 2019-09-13 73 100 No insulin
# Store the per-patient number of "insulin" mentions in a new `insulin`
# column (bare column name inside mutate(), not df$...), then print the result.
df <- df %>%
  mutate(insulin = str_count(medication, "insulin"))
df
## # A tibble: 50 × 10
## subjid age sex country adm_date heart_rate oxy_sat test_pos medication
## <int> <dbl> <chr> <chr> <date> <dbl> <dbl> <chr> <chr>
## 1 1 12 Male England 2022-02-11 62 92 No patient r…
## 2 2 65 Male Wales 2019-01-20 115 94 No morphine;…
## 3 3 38 Male Scotla… 2021-07-05 81 100 Yes paracetam…
## 4 4 78 Male England 2021-06-09 141 90 No insulin
## 5 5 9 Male Scotla… 2020-01-06 68 92 No paracetim…
## 6 6 77 Female England 2022-07-09 133 95 No meropenem…
## 7 7 7 Female Wales 2019-07-17 133 90 No paracetam…
## 8 8 47 Male England 2020-04-24 113 96 Unknown citalopra…
## 9 9 76 Female England 2021-04-23 131 95 Yes ibuprofen
## 10 10 84 Female England 2020-03-09 57 98 No adenosine…
## # ℹ 40 more rows
## # ℹ 1 more variable: insulin <int>
Hint: the medication column contains several misspellings of paracetamol, e.g. paracetimol, paracetomol, paractamol, parcetamol and paracetaml.
# Character count of every medication entry
str_length(df[["medication"]])
## [1] 37 21 22 7 11 32 23 23 9 21 57 20 10 20 NA 7 6 6 35 41 32 11 19 13 21
## [26] 55 7 20 16 11 8 27 9 27 7 53 7 25 7 15 12 6 9 NA 20 NA 7 7 23 23
# Occurrences per entry of "par" followed by "a" or "c"
# (catches paracetamol and its misspellings)
df %>%
  pull(medication) %>%
  str_count("par[ac]")
## [1] 1 0 1 0 1 1 1 1 0 1 1 0 0 1 NA 0 0 0 1 1 0 1 0 0 1
## [26] 1 0 1 0 1 0 1 0 0 0 2 0 1 0 0 0 0 0 NA 1 NA 0 0 1 1
# Positions of the entries matching "par[ac]" ...
df %>%
  pull(medication) %>%
  str_which("par[ac]")
## [1] 1 3 5 6 7 8 10 11 14 19 20 22 25 26 28 30 32 36 38 45 49 50
# ... and the matching entries themselves
df %>%
  pull(medication) %>%
  str_subset("par[ac]")
## [1] "patient received 500mg x2 paracetamol"
## [2] "paracetamol 500 mg x 2"
## [3] "paracetimol"
## [4] "meropenem, diazapam, paracetamol"
## [5] "paracetamol when needed"
## [6] "citalopram & paractamol"
## [7] "adenosine/paracetamol"
## [8] "patient to take 2 tablets paracetamol 500mg every 4 hours"
## [9] "paracetamol for pain"
## [10] "patient received paracetimol at 4am"
## [11] "500mg x2 ibuprofen & 500mg x2 paracetamol"
## [12] "paracetomol"
## [13] "paracetamol/ibuprofin"
## [14] "patient started taking parcetamol on day 2 of admission"
## [15] "paracetamol for pain"
## [16] "paracetamol"
## [17] "paracetamol 500mg x2 at 7am"
## [18] "11:00 paracetamol, 13:00 ibuprofen, 15:00 paracetimol"
## [19] "meropenem then paracetaml"
## [20] "laculose&paracetamol"
## [21] "citalopram & paractamol"
## [22] "16.30 paracetamol 500mg"
# Presence or absence of a pattern: keep only the rows whose medication text
# matches "par[ac]". The column is referenced by bare name so the test is
# evaluated on the piped data, not on the global `df`.
df %>%
  filter(str_detect(medication, pattern = "par[ac]"))
## # A tibble: 22 × 10
## subjid age sex country adm_date heart_rate oxy_sat test_pos medication
## <int> <dbl> <chr> <chr> <date> <dbl> <dbl> <chr> <chr>
## 1 1 12 Male England 2022-02-11 62 92 No patient r…
## 2 3 38 Male Scotla… 2021-07-05 81 100 Yes paracetam…
## 3 5 9 Male Scotla… 2020-01-06 68 92 No paracetim…
## 4 6 77 Female England 2022-07-09 133 95 No meropenem…
## 5 7 7 Female Wales 2019-07-17 133 90 No paracetam…
## 6 8 47 Male England 2020-04-24 113 96 Unknown citalopra…
## 7 10 84 Female England 2020-03-09 57 98 No adenosine…
## 8 11 37 Male England 2019-11-19 51 99 Yes patient t…
## 9 14 71 Female England 2020-02-08 50 96 Yes paracetam…
## 10 19 64 Female England 2020-12-06 94 96 No patient r…
## # ℹ 12 more rows
## # ℹ 1 more variable: insulin <int>
# Store the per-patient number of "par[ac]" matches in its own `paracetamol`
# column. Writing it to `insulin` would overwrite the insulin counts created
# earlier with paracetamol counts.
df <- df %>%
  mutate(paracetamol = str_count(medication, "par[ac]"))
df
## # A tibble: 50 × 11
## subjid age sex country adm_date heart_rate oxy_sat test_pos medication
## <int> <dbl> <chr> <chr> <date> <dbl> <dbl> <chr> <chr>
## 1 1 12 Male England 2022-02-11 62 92 No patient r…
## 2 2 65 Male Wales 2019-01-20 115 94 No morphine;…
## 3 3 38 Male Scotla… 2021-07-05 81 100 Yes paracetam…
## 4 4 78 Male England 2021-06-09 141 90 No insulin
## 5 5 9 Male Scotla… 2020-01-06 68 92 No paracetim…
## 6 6 77 Female England 2022-07-09 133 95 No meropenem…
## 7 7 7 Female Wales 2019-07-17 133 90 No paracetam…
## 8 8 47 Male England 2020-04-24 113 96 Unknown citalopra…
## 9 9 76 Female England 2021-04-23 131 95 Yes ibuprofen
## 10 10 84 Female England 2020-03-09 57 98 No adenosine…
## # ℹ 40 more rows
## # ℹ 2 more variables: insulin <int>, paracetamol <int>
# Show the full medication column as a character vector
pull(df, medication)
## [1] "patient received 500mg x2 paracetamol"
## [2] "morphine; amoxicillin"
## [3] "paracetamol 500 mg x 2"
## [4] "insulin"
## [5] "paracetimol"
## [6] "meropenem, diazapam, paracetamol"
## [7] "paracetamol when needed"
## [8] "citalopram & paractamol"
## [9] "ibuprofen"
## [10] "adenosine/paracetamol"
## [11] "patient to take 2 tablets paracetamol 500mg every 4 hours"
## [12] "heparin, amoxicillin"
## [13] "cephalexin"
## [14] "paracetamol for pain"
## [15] NA
## [16] "insulin"
## [17] "asprin"
## [18] "codine"
## [19] "patient received paracetimol at 4am"
## [20] "500mg x2 ibuprofen & 500mg x2 paracetamol"
## [21] "cephalexin, sertraline, atenolol"
## [22] "paracetomol"
## [23] "insulin, folic acid"
## [24] "multivitamins"
## [25] "paracetamol/ibuprofin"
## [26] "patient started taking parcetamol on day 2 of admission"
## [27] "glucose"
## [28] "paracetamol for pain"
## [29] "insulin&atenolol"
## [30] "paracetamol"
## [31] "diazipam"
## [32] "paracetamol 500mg x2 at 7am"
## [33] "lactulose"
## [34] "lactulose and multivitamins"
## [35] "unknown"
## [36] "11:00 paracetamol, 13:00 ibuprofen, 15:00 paracetimol"
## [37] "insulin"
## [38] "meropenem then paracetaml"
## [39] "unknown"
## [40] "saline solution"
## [41] "10am insulin"
## [42] "asprin"
## [43] "lorazepam"
## [44] NA
## [45] "laculose&paracetamol"
## [46] NA
## [47] "insulin"
## [48] "glucose"
## [49] "citalopram & paractamol"
## [50] "16.30 paracetamol 500mg"
# Replace the FIRST paracetamol misspelling in each entry.
# The pattern allows the second "a", the "e" and the "o" to be missing and the
# vowel after "t" to be i/a/o, so it covers paracetimol, paracetomol,
# paractamol, parcetamol and paracetaml. Only the first match per string is
# replaced, so entry 36 keeps its second misspelling.
str_replace(df$medication, "para*ce*t[iao]mo*l", "paracetamol")
## [1] "patient received 500mg x2 paracetamol"
## [2] "morphine; amoxicillin"
## [3] "paracetamol 500 mg x 2"
## [4] "insulin"
## [5] "paracetamol"
## [6] "meropenem, diazapam, paracetamol"
## [7] "paracetamol when needed"
## [8] "citalopram & paracetamol"
## [9] "ibuprofen"
## [10] "adenosine/paracetamol"
## [11] "patient to take 2 tablets paracetamol 500mg every 4 hours"
## [12] "heparin, amoxicillin"
## [13] "cephalexin"
## [14] "paracetamol for pain"
## [15] NA
## [16] "insulin"
## [17] "asprin"
## [18] "codine"
## [19] "patient received paracetamol at 4am"
## [20] "500mg x2 ibuprofen & 500mg x2 paracetamol"
## [21] "cephalexin, sertraline, atenolol"
## [22] "paracetamol"
## [23] "insulin, folic acid"
## [24] "multivitamins"
## [25] "paracetamol/ibuprofin"
## [26] "patient started taking paracetamol on day 2 of admission"
## [27] "glucose"
## [28] "paracetamol for pain"
## [29] "insulin&atenolol"
## [30] "paracetamol"
## [31] "diazipam"
## [32] "paracetamol 500mg x2 at 7am"
## [33] "lactulose"
## [34] "lactulose and multivitamins"
## [35] "unknown"
## [36] "11:00 paracetamol, 13:00 ibuprofen, 15:00 paracetimol"
## [37] "insulin"
## [38] "meropenem then paracetamol"
## [39] "unknown"
## [40] "saline solution"
## [41] "10am insulin"
## [42] "asprin"
## [43] "lorazepam"
## [44] NA
## [45] "laculose&paracetamol"
## [46] NA
## [47] "insulin"
## [48] "glucose"
## [49] "citalopram & paracetamol"
## [50] "16.30 paracetamol 500mg"
# Replace EVERY paracetamol misspelling in each entry.
# The pattern allows the second "a", the "e" and the "o" to be missing and the
# vowel after "t" to be i/a/o, so it covers paracetimol, paracetomol,
# paractamol, parcetamol and paracetaml. Unlike str_replace(), all matches in
# a string are replaced, so both spellings in entry 36 are corrected.
str_replace_all(df$medication, "para*ce*t[iao]mo*l", "paracetamol")
## [1] "patient received 500mg x2 paracetamol"
## [2] "morphine; amoxicillin"
## [3] "paracetamol 500 mg x 2"
## [4] "insulin"
## [5] "paracetamol"
## [6] "meropenem, diazapam, paracetamol"
## [7] "paracetamol when needed"
## [8] "citalopram & paracetamol"
## [9] "ibuprofen"
## [10] "adenosine/paracetamol"
## [11] "patient to take 2 tablets paracetamol 500mg every 4 hours"
## [12] "heparin, amoxicillin"
## [13] "cephalexin"
## [14] "paracetamol for pain"
## [15] NA
## [16] "insulin"
## [17] "asprin"
## [18] "codine"
## [19] "patient received paracetamol at 4am"
## [20] "500mg x2 ibuprofen & 500mg x2 paracetamol"
## [21] "cephalexin, sertraline, atenolol"
## [22] "paracetamol"
## [23] "insulin, folic acid"
## [24] "multivitamins"
## [25] "paracetamol/ibuprofin"
## [26] "patient started taking paracetamol on day 2 of admission"
## [27] "glucose"
## [28] "paracetamol for pain"
## [29] "insulin&atenolol"
## [30] "paracetamol"
## [31] "diazipam"
## [32] "paracetamol 500mg x2 at 7am"
## [33] "lactulose"
## [34] "lactulose and multivitamins"
## [35] "unknown"
## [36] "11:00 paracetamol, 13:00 ibuprofen, 15:00 paracetamol"
## [37] "insulin"
## [38] "meropenem then paracetamol"
## [39] "unknown"
## [40] "saline solution"
## [41] "10am insulin"
## [42] "asprin"
## [43] "lorazepam"
## [44] NA
## [45] "laculose&paracetamol"
## [46] NA
## [47] "insulin"
## [48] "glucose"
## [49] "citalopram & paracetamol"
## [50] "16.30 paracetamol 500mg"
# Standardise every paracetamol misspelling in the stored medication column.
# The pattern allows the second "a", the "e" and the "o" to be missing and the
# vowel after "t" to be i/a/o, so "parcetamol" is corrected as well as
# paracetimol, paracetomol, paractamol and paracetaml.
# The column is referenced by bare name inside mutate() rather than df$....
df <- df %>%
  mutate(
    medication = str_replace_all(
      medication,
      pattern = "para*ce*t[iao]mo*l",
      replacement = "paracetamol"
    )
  )