Solution
library(tidyverse)

# Majors list published by FiveThirtyEight (read over the network).
df_p1 <- read_csv('https://raw.githubusercontent.com/fivethirtyeight/data/master/college-majors/majors-list.csv')

# Regular expression shared by the column check and the first extraction below.
majors_pattern <- '(DATA|STATISTICS)'

# Checking to see which columns contain the pattern. Every column is tested on
# its own values: handing the whole data frame to str_detect() only works
# because each column gets coerced to a single deparsed string, which stringi
# flags with an "argument is not an atomic vector; coercing" warning.
has_match <- map_lgl(
  df_p1,
  function(col) any(str_detect(as.character(col), majors_pattern), na.rm = TRUE)
)
unname(has_match)
## [1] FALSE  TRUE FALSE
str_subset(df_p1[[2]], pattern = majors_pattern)
## [1] "MANAGEMENT INFORMATION SYSTEMS AND STATISTICS"
## [2] "COMPUTER PROGRAMMING AND DATA PROCESSING"
## [3] "STATISTICS AND DECISION SCIENCE"
# In a single line of code: all columns are flattened into one vector first,
# so no column index is needed and several matching columns are handled too.
str_subset(unlist(df_p1, use.names = FALSE), pattern = '(DATA|STATISTICS)')
## [1] "MANAGEMENT INFORMATION SYSTEMS AND STATISTICS"
## [2] "COMPUTER PROGRAMMING AND DATA PROCESSING"
## [3] "STATISTICS AND DECISION SCIENCE"
## [1] "bell pepper" "bilberry" "blackberry" "blood orange"
##
## [5] "blueberry" "cantaloupe" "chili pepper" "cloudberry"
##
## [9] "elderberry" "lime" "lychee" "mulberry"
##
## [13] "olive" "salal berry"
## c("bell pepper", "bilberry", "blackberry", "blood orange", "blueberry", "cantaloupe", "chili pepper", "cloudberry", "elderberry", "lime", "lychee", "mulberry", "olive", "salal berry")
Solution
# Printed form of the character vector, kept verbatim as one multi-line string.
input_v <- '[1] "bell pepper" "bilberry" "blackberry" "blood orange"
[5] "blueberry" "cantaloupe" "chili pepper" "cloudberry"
[9] "elderberry" "lime" "lychee" "mulberry"
[13] "olive" "salal berry"'
# Extract every double-quoted item. The quantifier must be lazy (*? rather
# than *) so the match stops at the nearest closing quote and each item ends up
# as its own element of the single vector held in list_items.
list_items <- input_v %>%
  str_extract_all(pattern = '\\".*?\\"')
# Join the quoted items with commas and wrap them in a c(...) call.
items <- list_items[[1]] %>%
  str_c(collapse = ', ')
str_glue('c({items})')
## c("bell pepper", "bilberry", "blackberry", "blood orange", "blueberry", "cantaloupe", "chili pepper", "cloudberry", "elderberry", "lime", "lychee", "mulberry", "olive", "salal berry")
(.)\1\1: Any character that appears 3 times in a row.
"(.)(.)\\2\\1": Any 2 characters that repeat immediately in the reverse order.
(..)\1: Any 2 characters that repeat immediately in the same order.
"(.).\\1.\\1": Any single character that repeats 2 more times, with each repetition following one other variable character.
"(.)(.)(.).*\\3\\2\\1": Any 3 characters that repeat in the reverse order after any number (even 0) of variable characters.
I assume the term “words” refers to actual words and not to arbitrary sequences of characters. Additionally, I cannot find any way to do case-insensitive backreferencing, so I am using only lowercase letters.
# Words that start and end with the same lowercase letter; a single-letter
# word also qualifies (second alternative with the optional backreference).
same_ends_pattern <- '(^|\\s)([a-z])(([a-z]+\\2(\\s|$))|\\2?(\\s|$))'
str_subset(string = c('lol', ' madam', 'cat'), pattern = same_ends_pattern)
## [1] "lol" " madam"

# Words containing a pair of lowercase letters that occurs again later in the
# same word.
repeated_pair_pattern <- '(^|\\s)[a-z]*([a-z][a-z])[a-z]*\\2[a-z]*(\\s|$)'
str_subset(string = c('tomato', ' mississippi ', 'what'), pattern = repeated_pair_pattern)
## [1] "tomato" " mississippi "

# Words in which one lowercase letter appears in at least three places.
triple_letter_pattern <- '(^|\\s)[a-z]*([a-z])[a-z]*\\2[a-z]*\\2[a-z]*(\\s|$)'
str_subset(string = c('applepie', ' monsoon ', 'panda'), pattern = triple_letter_pattern)
## [1] "applepie" " monsoon "