library('tidyverse')
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.6     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.4     ✓ stringr 1.4.0
## ✓ readr   2.1.1     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

# 1. Basics ----

library(rebus)
## 
## Attaching package: 'rebus'
## The following object is masked from 'package:stringr':
## 
##     regex
## The following object is masked from 'package:ggplot2':
## 
##     alpha
# Practice vector used by the examples in this section
x <- c(
  "cat",
  "coat",
  "scotland",
  "tic toc"
)
# Show what the END anchor constant looks like as a regex
print(END)
## <regex> $
# Strings that begin with "c"
x %>% str_view(pattern = START %R% "c")
# Strings that begin with "co"
x %>% str_view(pattern = START %R% "co")
# Any single character immediately followed by "t"
x %>% str_view(pattern = ANY_CHAR %R% "t")
# Any two consecutive characters
x %>% str_view(pattern = ANY_CHAR %R% ANY_CHAR)
# Whole string is exactly three characters long (anchored at both ends)
x %>% str_view(pattern = exactly(ANY_CHAR %R% ANY_CHAR %R% ANY_CHAR))

# 2. Combine with stringr functions ----

library(babynames)
# Restrict the data to 2014. This assigns a global `babynames` that shadows
# the dataset of the same name attached by library(babynames).
babynames <- babynames %>% filter(year == 2014)
# Name vectors for the rows recorded as male / female
boy_names <- babynames %>%
  filter(sex == "M") %>%
  pull(name)
girl_names <- babynames %>%
  filter(sex == "F") %>%
  pull(name)

# A lowercase "q" followed by any one character
pattern <- "q" %R% ANY_CHAR
# Preview the pattern on a few hand-picked names
c("Quentin", "Kaliq", "Jacques", "Jacqes") %>% str_view(pattern)
# How many entries of boy_names contain the pattern?
names_with_q <- boy_names %>% str_subset(pattern)
length(names_with_q)
## [1] 96
# First matching piece of each name (NA where the pattern is absent)
part_with_q <- boy_names %>% str_extract(pattern)
# Tabulate the matched pieces
table(part_with_q)
## part_with_q
## qa qe qi qm qo qu 
##  1  1  2  2  1 89
# Number of matches per name, to see whether any name matches more than once
count_of_q <- boy_names %>% str_count(pattern)
# Tabulate the per-name match counts
table(count_of_q)
## count_of_q
##     0     1 
## 13951    96
# Logical flag per entry of boy_names: does it contain the pattern?
with_q <- boy_names %>% str_detect(pattern)
# Share of the entries in boy_names that contain the pattern. This is a
# fraction of names, not of babies: no birth counts are used here.
mean(with_q)
## [1] 0.006834199

# 3. or() ----

# Alternation over the two complete spellings
whole_names <- or("Jeffrey", "Geoffrey")
boy_names %>% str_view(pattern = whole_names, match = TRUE)
# Same two spellings, alternating only over the differing prefix
common_ending <- or("Je", "Geo") %R% "ffrey"
print(common_ending)
## <regex> (?:Je|Geo)ffrey
boy_names %>% str_view(pattern = common_ending, match = TRUE)
# Alternate prefixes combined with several alternate endings
by_parts <- or("Je", "Geo") %R%
  "ff" %R%
  or("ry", "ery", "rey", "erey")
print(by_parts)
## <regex> (?:Je|Geo)ff(?:ry|ery|rey|erey)
boy_names %>% str_view(pattern = by_parts, match = TRUE)
# Match names that start with Cath or Kath.
# START anchors the alternation to the beginning of the string; without it
# the pattern would also match "Cath"/"Kath" appearing later in a name.
ckath <- START %R% or("C", "K") %R% "ath"
str_view(girl_names, pattern = ckath, match = TRUE)

# 4. char_class() ----

# Character class matching any single vowel, lower or upper case
vowels <- char_class("aeiouAEIOU")

# Show the regex that was built
print(vowels)
## <regex> [aeiouAEIOU]
# Reminder of the practice strings
print(x)
## [1] "cat"      "coat"     "scotland" "tic toc"
# str_view() highlights only the first vowel in each string
x %>% str_view(vowels)
# str_view_all() highlights every vowel in each string
x %>% str_view_all(vowels)
# Vowel count for each entry of boy_names
num_vowels <- boy_names %>% str_count(vowels)
# Character count for each entry of boy_names
name_length <- boy_names %>% str_length()
# Average number of vowels per name
mean(num_vowels)
## [1] 2.385563
# Average of the per-name vowel fraction (vowels divided by name length)
mean(num_vowels / name_length)
## [1] 0.4000596

# 5. exactly() & one_or_more() ----

# Same vowel class as in the previous section
vowels <- char_class("aeiouAEIOU")

# Names made up entirely of vowels: one or more vowels, anchored at both ends
boy_names %>%
  str_view(
    pattern = vowels %>% one_or_more() %>% exactly(),
    match = TRUE
  )
# Negated class: any single character that is not one of the listed vowels
not_vowels <- negated_char_class("aeiouAEIOU")

# Names containing none of the listed vowels
boy_names %>%
  str_view(
    pattern = not_vowels %>% one_or_more() %>% exactly(),
    match = TRUE
  )

# 6. Shortcuts ----

# Three consecutive digits, built from the DGT shortcut
three_digits <- DGT %R% DGT %R% DGT

# Sample contact strings with phone numbers in several formats
contact <- c(
  "Call me at 555-555-0191",
  "123 Main St",
  "(555) 555 0191",
  "Phone: 555.555.0191 Mobile: 555.555.0192"
)
# Highlight every run of three digits
contact %>% str_view_all(pattern = three_digits)
# One separator character: hyphen, dot, parenthesis or space
separator <- char_class("-.() ")
# Highlight every separator character
contact %>% str_view_all(pattern = separator)
# Building blocks for the phone number pattern
three_digits <- DGT %R% DGT %R% DGT
four_digits <- three_digits %R% DGT
separator <- char_class("-.() ")

# Phone pattern: optional "(", three digits, separators, three digits,
# separators, four digits. The repeated separator run is named inside
# local() so no extra global variable is created.
phone_pattern <- local({
  gap <- zero_or_more(separator)
  optional(OPEN_PAREN) %R%
    three_digits %R%
    gap %R%
    three_digits %R%
    gap %R%
    four_digits
})

# Highlight every phone number found in the contact strings
contact %>% str_view_all(pattern = phone_pattern)
# `three_digits`, `four_digits`, `separator` and `phone_pattern` were being
# rebuilt here with definitions identical to the ones directly above. The
# copy-pasted reassignments are dropped and the existing pattern is reused.

# Test it
str_view_all(contact, pattern = phone_pattern)

# 7. capture() ----

# Email pattern with three capture groups: the word characters before "@",
# those between "@" and ".", and those after ".". The shared capture group
# is named inside local() so it does not leak into the global environment.
email <- local({
  word <- capture(one_or_more(WRD))
  word %R% "@" %R% word %R% DOT %R% word
})
print(email)
## <regex> ([\w]+)@([\w]+)\.([\w]+)
# Sample contacts; view what the pattern matches in each
hero_contacts <- c(
  "(wolverine@xmen.com)",
  "wonderwoman@justiceleague.org",
  "thor@avengers.com"
)
hero_contacts %>% str_view(email)
# Matrix of matches: column 1 is the full match, columns 2-4 the captures
email_parts <- hero_contacts %>% str_match(pattern = email)
print(email_parts)
##      [,1]                            [,2]          [,3]            [,4] 
## [1,] "wolverine@xmen.com"            "wolverine"   "xmen"          "com"
## [2,] "wonderwoman@justiceleague.org" "wonderwoman" "justiceleague" "org"
## [3,] "thor@avengers.com"             "thor"        "avengers"      "com"
# The host is the second capture group, i.e. the third column
host <- email_parts[, 3]
print(host)
## [1] "xmen"          "justiceleague" "avengers"

# 8. Backreferences ----

# One captured lowercase letter followed by two back-references to it,
# i.e. the same letter three times in a row
repeated_three_times <- capture(LOWER) %R% REF1 %R% REF1
# Show the names that match
boy_names %>% str_view(pattern = repeated_three_times, match = TRUE)
# A captured two-letter sequence immediately followed by the same sequence
pair_of_repeated <- capture(LOWER %R% LOWER) %R% REF1
# Show the names that match
boy_names %>% str_view(pattern = pair_of_repeated, match = TRUE)
# Two separately captured letters followed by the same two in reverse order
pair_that_reverses <- local({
  letter <- capture(LOWER)
  letter %R% letter %R% REF2 %R% REF1
})
# Show the names that match
boy_names %>% str_view(pattern = pair_that_reverses, match = TRUE)