library(tidyverse)
An Evening with Regex
Setup
1. College majors
Find all college majors in the data that contain the words “DATA” or “STATISTICS” in the name of the major.
# Read in the data
df <- read.csv("majors-list.csv")

# Only need one column
majors <- df |>
  select(Major)

# Extract all majors that contain DATA or STATISTICS.
# The \\b word boundaries restrict the match to whole words, so a name that
# merely contains those letters inside a longer word is not returned.
majors |>
  filter(str_detect(Major, "\\b(DATA|STATISTICS)\\b"))
Major
1 MANAGEMENT INFORMATION SYSTEMS AND STATISTICS
2 COMPUTER PROGRAMMING AND DATA PROCESSING
3 STATISTICS AND DECISION SCIENCE
2. Proper fruits
Convert a messy string of fruits into a useful character vector!
Note: I interpreted this question to mean “Given this raw string, use regex to convert it into a usable character vector”, so I’m recreating the string as authentically as I can, and then breaking it down into the character vector.
# Create the fruit string!
# This reproduces the console printout of a character vector, including the
# [n] index markers and the escaped double quotes around each fruit.
fruit_string <- "[1] \"bell pepper\" \"bilberry\" \"blackberry\"
\"blood orange\"[5] \"blueberry\" \"cantaloupe\" \"chili pepper\"
\"cloudberry\" [9] \"elderberry\" \"lime\" \"lychee\"
\"mulberry\" [13] \"olive\" \"salal berry\""

# Reduce the string to each fruit
# Match words with optional spaces between literal \" bookends:
#   (?<=\")        lookbehind: the match must be preceded by a double quote
#   \\w+(\\s\\w+)? one word, optionally followed by whitespace and a second word
#   (?=\")         lookahead: the match must be followed by a double quote
# The lookarounds keep the quotes themselves out of the extracted text.
pattern <- "(?<=\")(\\b\\w+(\\s\\w+)?\\b)(?=\")"

# Extract the fruits
# str_extract_all() returns a list with one element per input string; [[1]]
# pulls out the character vector of matches for our single string, so no
# further unlist() step is needed.
fruits <- str_extract_all(fruit_string, pattern)[[1]]

print(fruits)
[1] "bell pepper" "bilberry" "blackberry" "blood orange" "blueberry"
[6] "cantaloupe" "chili pepper" "cloudberry" "elderberry" "lime"
[11] "lychee" "mulberry" "olive" "salal berry"
3. Expressions
Explain what each regex expression is doing in plain language.
3.1 Expression 1
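As written in an R string with single backslashes, the full expression matches any character besides a line break followed by two U+0001 control characters. R reads \1 inside a string literal as an octal escape for the character with code 1, not as a literal backslash followed by a 1. If the backslashes were doubled ((.)\\1\\1), the regex would instead use backreferences and match the same character three times in a row.
(.)\1\1
- `(.)`: Establish capture group 1; capture any character except a line break
- `\1`: Match the control character U+0001 (written as the octal escape \1)
- `\1`: Match the control character U+0001 again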
# In an R string literal, "\1" is an octal escape for the control character
# U+0001 (a backreference would have to be written "\\1"). The pattern is
# therefore "any character followed by two U+0001 characters", and the test
# string is built the same way so that it matches.
pattern_1 <- "(.)\1\1"
string_1 <- "a\1\1"
str_view(string_1, pattern_1)
# Suppressed output so file could be rendered
3.2 Expression 2
The full expression matches a double quote, any two characters (captured as groups 1 and 2), the text of capture group 2, the text of capture group 1, then a double quote
"(.)(.)\\2\\1"
- `"`: Match a literal double quote
- `(.)`: Establish capture group 1; capture any character except a line break
- `(.)`: Establish capture group 2; capture any character except a line break
- `\\2`: Match the same text as capture group 2
- `\\1`: Match the same text as capture group 1
- `"`: Match a literal double quote
# Single quotes delimit the R strings so the regex and the test strings can
# contain literal double quotes. "\\2" and "\\1" are backreferences to the
# second and first captured characters, so the quoted text must read
# x y y x.
pattern_2 <- '"(.)(.)\\2\\1"'
string_2 <- c('"aaaa"', '"abba"')
str_view(string_2, pattern_2)
[1] │ <"aaaa">
[2] │ <"abba">
3.3 Expression 3
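As written in an R string with a single backslash, the full expression matches any two characters followed by one U+0001 control character. R reads \1 inside a string literal as an octal escape for the character with code 1, not as a literal backslash followed by a 1. If the backslash were doubled ((..)\\1), the regex would instead match a pair of characters immediately repeated, such as "abab".
(..)\1
- `(..)`: Establish capture group 1; capture any two characters except line breaks
- `\1`: Match the control character U+0001 (written as the octal escape \1)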
# In an R string literal, '\1' is an octal escape for the control character
# U+0001 (a backreference would have to be written '\\1'). The pattern is
# therefore "any two characters followed by one U+0001 character", and the
# test strings end with that same control character so that they match.
pattern_3 <- '(..)\1'
string_3 <- c('aa\1', 'X!\1')
str_view(string_3, pattern_3)
# Had to suppress output so file could be rendered
3.4 Expression 4
The full expression matches a double quote, any character as capture group 1, any character, the text of capture group 1, any character, the text of capture group 1, then a double quote
"(.).\\1.\\1"
- `"`: Match a literal double quote
- `(.)`: Establish capture group 1 as any character except a line break
- `.`: Match any character except a line break
- `\\1`: Match the same text as capture group 1
- `.`: Match any character except a line break
- `\\1`: Match the same text as capture group 1
- `"`: Match a literal double quote
# Single quotes delimit the R strings so the regex and the test strings can
# contain literal double quotes. The captured first character must reappear
# in the third and fifth positions, with any character in between.
pattern_4 <- '"(.).\\1.\\1"'
string_4 <- c('"aXa!a"', '"2Z2$2"')
str_view(string_4, pattern_4)
[1] │ <"aXa!a">
[2] │ <"2Z2$2">
3.5 Expression 5
This expression matches a literal double quote, any three characters (captured as groups 1, 2, and 3), any character any number of times, the text of capture groups 3, 2, and 1 in that order, and a literal double quote.
"(.)(.)(.).*\\3\\2\\1"
# Three captured characters, then any run of characters (possibly empty),
# then the three captured characters again in reverse order, all wrapped in
# literal double quotes.
pattern_5 <- '"(.)(.)(.).*\\3\\2\\1"'
string_5 <- c('"ABCXXXXXXXXCBA"', '"AAAXAAA"')
str_view(string_5, pattern_5)
[1] │ <"ABCXXXXXXXXCBA">
[2] │ <"AAAXAAA">
4. More Expressions
Create regular expressions to match words.
# Start and end with the same character
# (\\w) captures the first word character and \\1 requires it again right
# before the closing word boundary. Because both consume a character, only
# words of two or more characters can match.
start_end_pattern <- "\\b(\\w)\\w*\\1\\b"
start_end_words <- c("racecar", "radar", "abba")
str_view(start_end_words, start_end_pattern)
[1] │ <racecar>
[2] │ <radar>
[3] │ <abba>
# Contain repeated pairs of letters
# (\\w)\\1 matches one word character immediately followed by itself. The
# capture uses \\w rather than . so that doubled punctuation or spaces
# between word characters (e.g. "a--b") are not reported as a word with a
# doubled letter.
repeat_letters_pattern <- "\\b\\w*(\\w)\\1\\w*\\b"
repeat_letters_words <- c(
  "abba",
  "apple",
  "pattern",
  "banana" # should not be captured
)
str_view(repeat_letters_words, repeat_letters_pattern)
[1] │ <abba>
[2] │ <apple>
[3] │ <pattern>