library('tidyverse')
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.6 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.4 ✓ stringr 1.4.0
## ✓ readr 2.1.1 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
1. Basic
library(rebus)
##
## Attaching package: 'rebus'
## The following object is masked from 'package:stringr':
##
## regex
## The following object is masked from 'package:ggplot2':
##
## alpha
# Practice strings reused by the anchor and wildcard examples
x <- c(
  "cat",
  "coat",
  "scotland",
  "tic toc"
)
# Show the rebus END constant
print(END)
## <regex> $
# Strings that begin with "c"
x %>% str_view(START %R% "c")
# Strings that begin with "co"
x %>% str_view(START %R% "co")
# Any single character followed by a "t"
x %>% str_view(ANY_CHAR %R% "t")
# Any two characters in a row
x %>% str_view(ANY_CHAR %R% ANY_CHAR)
# Whole string is exactly three characters: anchored at both ends
x %>% str_view(exactly(ANY_CHAR %R% ANY_CHAR %R% ANY_CHAR))
2. Combine with stringr functions
library(babynames)
# Keep only the 2014 rows; the filtered table is stored back under `babynames`
babynames <- babynames %>%
  filter(year == 2014)
# Name column for the rows with sex "M"
boy_names <- babynames %>%
  filter(sex == "M") %>%
  pull(name)
# Name column for the rows with sex "F"
girl_names <- babynames %>%
  filter(sex == "F") %>%
  pull(name)
# A "q" followed by any one character
pattern <- "q" %R% ANY_CHAR
# Preview the pattern on a few hand-picked names
c("Quentin", "Kaliq", "Jacques", "Jacqes") %>%
  str_view(pattern)
# Keep the boy names that contain the pattern and count them
names_with_q <- boy_names %>% str_subset(pattern)
length(names_with_q)
## [1] 96
# Pull out the piece of each name that matches the pattern
part_with_q <- boy_names %>% str_extract(pattern)
# Tally the extracted pieces
table(part_with_q)
## part_with_q
## qa qe qi qm qo qu
## 1 1 2 2 1 89
# Number of matches per name, to see whether any name matches more than once
count_of_q <- boy_names %>% str_count(pattern)
# Tally the per-name match counts
table(count_of_q)
## count_of_q
## 0 1
## 13951 96
# Flag each entry of boy_names that contains the pattern
with_q <- boy_names %>% str_detect(pattern)
# Share of the entries in boy_names that contain the pattern
# (a fraction of names; birth counts are not used here)
mean(with_q)
## [1] 0.006834199
3. or()
# Alternation over the two full spellings
whole_names <- or("Jeffrey", "Geoffrey")
boy_names %>% str_view(whole_names, match = TRUE)
# Same two spellings, with the shared "ffrey" ending factored out
common_ending <- or("Je", "Geo") %R% "ffrey"
print(common_ending)
## <regex> (?:Je|Geo)ffrey
boy_names %>% str_view(common_ending, match = TRUE)
# Allow several alternative endings after the "ff"
by_parts <- or("Je", "Geo") %R%
  "ff" %R%
  or("ry", "ery", "rey", "erey")
print(by_parts)
## <regex> (?:Je|Geo)ff(?:ry|ery|rey|erey)
boy_names %>% str_view(by_parts, match = TRUE)
# Match names that start with Cath or Kath.
# START anchors the alternation to the beginning of the name; without it the
# pattern would also match "Cath"/"Kath" appearing later in a name.
ckath <- START %R% or("C", "K") %R% "ath"
str_view(girl_names, pattern = ckath, match = TRUE)
4. char_class()
# Character class holding the lower- and upper-case vowels
vowels <- char_class("aeiouAEIOU")
# Show the regex the class expands to
print(vowels)
## <regex> [aeiouAEIOU]
# Show the practice strings again, then highlight vowels with str_view()
print(x)
## [1] "cat" "coat" "scotland" "tic toc"
x %>% str_view(vowels) # highlights only the first vowel per string
# str_view_all() highlights every vowel instead
x %>% str_view_all(vowels)
# Vowel count for each boy name
num_vowels <- boy_names %>% str_count(vowels)
# Character count for each boy name
name_length <- boy_names %>% str_length()
# Average number of vowels per name
mean(num_vowels)
## [1] 2.385563
# Average per-name proportion of characters that are vowels
mean(num_vowels / name_length)
## [1] 0.4000596
5. exactly & one_or_more
# Same vowel character class as in the previous section
vowels <- char_class("aeiouAEIOU")
# Names made up only of vowels: one or more vowels anchored at both ends
boy_names %>%
  str_view(START %R% one_or_more(vowels) %R% END, match = TRUE)
# Negated class: any character other than the listed vowels
not_vowels <- negated_char_class("aeiouAEIOU")
# Names with no vowels: one or more non-vowels anchored at both ends
boy_names %>%
  str_view(START %R% one_or_more(not_vowels) %R% END, match = TRUE)
6. Shortcuts
# Three digits in a row
three_digits <- DGT %R% DGT %R% DGT
# Sample contact strings used to try the patterns out
contact <- c(
  "Call me at 555-555-0191",
  "123 Main St",
  "(555) 555 0191",
  "Phone: 555.555.0191 Mobile: 555.555.0192"
)
contact %>% str_view_all(three_digits)
# Characters that may sit between the digit groups
separator <- char_class("-.() ")
# Highlight every separator character in the sample strings
contact %>% str_view_all(separator)
# Components of the phone pattern
three_digits <- DGT %R% DGT %R% DGT
four_digits <- three_digits %R% DGT
separator <- char_class("-.() ")
# Phone pattern: an optional opening parenthesis, three digits, any number of
# separator characters, three digits, any number of separator characters,
# then four digits
phone_pattern <- optional(OPEN_PAREN) %R%
  three_digits %R%
  zero_or_more(separator) %R%
  three_digits %R%
  zero_or_more(separator) %R%
  four_digits
# Highlight every phone-number match in the sample contact strings
str_view_all(contact, pattern = phone_pattern)
7. capture()
# Three capture groups: the part before "@", the part between "@" and ".",
# and the part after "."
email <- capture(one_or_more(WRD)) %R%
  "@" %R%
  capture(one_or_more(WRD)) %R%
  DOT %R%
  capture(one_or_more(WRD))
print(email)
## <regex> ([\w]+)@([\w]+)\.([\w]+)
# Sample addresses; view what the pattern matches in each
hero_contacts <- c(
  "(wolverine@xmen.com)",
  "wonderwoman@justiceleague.org",
  "thor@avengers.com"
)
hero_contacts %>% str_view(email)
# Matrix with the full match in column 1 and one column per capture group
email_parts <- hero_contacts %>% str_match(email)
print(email_parts)
## [,1] [,2] [,3] [,4]
## [1,] "wolverine@xmen.com" "wolverine" "xmen" "com"
## [2,] "wonderwoman@justiceleague.org" "wonderwoman" "justiceleague" "org"
## [3,] "thor@avengers.com" "thor" "avengers" "com"
# Column 3 holds the second capture group (the host)
host <- email_parts[, 3]
print(host)
## [1] "xmen" "justiceleague" "avengers"
8. Backreferences
# One lower-case letter, then the same letter twice more via REF1
repeated_three_times <- capture(LOWER) %R%
  REF1 %R%
  REF1
# Show the boy names that match
boy_names %>% str_view(repeated_three_times, match = TRUE)
# Two lower-case letters captured as one group, then that pair again
pair_of_repeated <- capture(LOWER %R% LOWER) %R%
  REF1
# Show the boy names that match
boy_names %>% str_view(pair_of_repeated, match = TRUE)
# Two separately captured letters, followed by the same two in reverse order
pair_that_reverses <- capture(LOWER) %R%
  capture(LOWER) %R%
  REF2 %R%
  REF1
# Show the boy names that match
boy_names %>% str_view(pair_that_reverses, match = TRUE)