library(stringr) # more consistent than base r
# GitHub's regular file page wraps the data in formatting; the link behind the
# "Raw" button on that page serves the plain CSV, which is what is read here.
data_file <- "https://raw.githubusercontent.com/fivethirtyeight/data/master/college-majors/majors-list.csv"
majors <- read.csv(file = data_file)

# Display the majors whose name contains "DATA" or "STATISTICS".
str_view(string = majors$Major, pattern = "DATA|STATISTICS", match = TRUE)
Into a format like this: c(“bell pepper”, “bilberry”, “blackberry”, “blood orange”, “blueberry”, “cantaloupe”, “chili pepper”, “cloudberry”, “elderberry”, “lime”, “lychee”, “mulberry”, “olive”, “salal berry”)
The question asks us to convert the raw data into R code.
# Fruit names that should end up inside the generated c(...) call.
vector_data <- c(
  "bell pepper", "bilberry", "blackberry", "blood orange", "blueberry",
  "cantaloupe", "chili pepper", "cloudberry", "elderberry", "lime",
  "lychee", "mulberry", "olive", "salal berry"
)

# Three equivalent spellings of the one-character string holding a double
# quote; only `double_quote` is used below.
dbl_quot <- '\"'
double_quote <- "\""
double_quote <- '"'

# Wrap each element in double quotes and join the pieces with ", ".
# (Without the quoting step this would be str_c(vector_data, collapse = ", ").)
data_string <- str_c(
  double_quote, vector_data, double_quote,
  collapse = ", "
)

# Surround the joined list with c( ) to form the constructor call.
code_string <- str_c("c(", data_string, ")")

# writeLines() emits the raw characters, so the embedded quotes appear
# without escape characters.
writeLines(code_string)
## c("bell pepper", "bilberry", "blackberry", "blood orange", "blueberry", "cantaloupe", "chili pepper", "cloudberry", "elderberry", "lime", "lychee", "mulberry", "olive", "salal berry")
(.)\1\1
What's inside the parentheses is a “captured” match. In this case the dot means any character.
The backreference \1 then requires the next character to be identical to that captured match.
I.e. (.)\1\1 will match the same character repeated 3 times consecutively (e.g. “aaa”).
# (.)\1\1 : one captured character followed by two more copies of itself,
# i.e. the same character three times in a row.
str <- c("BBAAAX", "bBaAAxx", "zzza", "aaaa", "...", "abaaaaaaax")
str_view(string = str, pattern = "(.)\\1\\1")
(.)(.)\2\1
\2 matches the same text that was captured by the second pair of parentheses,
thus \2\1 matches captured group #2 followed by captured group #1, i.e. two characters followed by the same two characters in reverse order (e.g. “abba”).
# (.)(.)\2\1 : two captured characters followed by the same two characters
# in reverse order (e.g. "abba").
str <- c("BAAX", "bBaAAxx", "zzza", "abba", "...", "aaaax")
str_view(string = str, pattern = "(.)(.)\\2\\1")
(..)\1
Any 2 characters followed by the same 2 characters (e.g. “abab”).
# (..)\1 : any two characters immediately followed by the same two characters
# (e.g. "BABA" in "BABAX").
# The backslash must be doubled inside an R string literal: a single "\1" is
# parsed by R as the octal escape for the control character U+0001, so the
# regex engine would never receive a backreference.
str <- c("BABAX", "bBaAAxx", "zzza", "abba", "...", "axaaaxa")
str_view(str, "(..)\\1")
(.).\1.\1
Any character followed by any character, then the first character again, then any character, then the first character once more (e.g. “abaca”).
# (.).\1.\1 : a captured character, any character, the captured character
# again, any character, and the captured character a third time
# (e.g. "Z4Z5Z" in "Z4Z5Z6za" and "XaXxX" in "XaXxXaaaxa").
str <- c("BABAX", "bBaAAxx", "Z4Z5Z6za", "abba", "...", "XaXxXaaaxa")
str_view(str, "(.).\\1.\\1")
"(.)(.)(.).*\3\2\1" Any 3 characters followed by any number of characters we dont care about, followed by
the first 3 characters reversed. The asterisk means 0 or more, so the “any number of”
characters in the middle could be zero.
# (.)(.)(.).*\3\2\1 : three captured characters, then zero or more arbitrary
# characters, then the three captured characters in reverse order.
str <- c(
  "BAZAZZA", "OBAZZABOO", "Z4Z5Z6za",
  "abbba", "...", "XaXxXaaaXaXxa"
)
str_view(string = str, pattern = "(.)(.)(.).*\\3\\2\\1")
# Strings that start and end with the same character.
# ^(.) captures the first character; the optional group (.*\1)? requires the
# string to end with that character again, while still letting a
# one-character string (whose first character is also its last) match.
str <- c(
  "BAZAZZA", "OBAZZABOO", "Z4Z5Z6za",
  "abbba", "...", "XaXxXaaaXaXxa", "a"
)
str_view(str, "^(.)(.*\\1)?$")
Contain a repeated pair of letters (e.g. “church” contains “ch” repeated twice.)
# (..).*\1 : a captured pair of characters that occurs again later in the
# string, with anything (or nothing) in between. Note that `.` accepts any
# character, not only letters.
str <- c("BBAAABBX", "bBaAAxx", "zzza", "aaaa", "church")
str_view(string = str, pattern = "(..).*\\1")
Contain one letter repeated in at least three places (e.g. “eleven” contains three “e”s.)
# (.).*\1.*\1 : a captured character that appears at least two more times
# later in the string, with anything (or nothing) between the occurrences.
str <- c("BANANA", "bBaAAxx", "zzza", "aaNOa", "eleven", "church")
str_view(string = str, pattern = "(.).*\\1.*\\1")