1

Provide code that identifies the majors that contain either “DATA” or “STATISTICS”

# Fetch the FiveThirtyEight majors list and parse the downloaded CSV text.
majors_df <- read.csv(
  text = getURL('https://raw.githubusercontent.com/fivethirtyeight/data/master/college-majors/majors-list.csv')
)
# Preview the first rows of the parsed data frame.
head(majors_df)
##   FOD1P                                 Major                  Major_Category
## 1  1100                   GENERAL AGRICULTURE Agriculture & Natural Resources
## 2  1101 AGRICULTURE PRODUCTION AND MANAGEMENT Agriculture & Natural Resources
## 3  1102                AGRICULTURAL ECONOMICS Agriculture & Natural Resources
## 4  1103                       ANIMAL SCIENCES Agriculture & Natural Resources
## 5  1104                          FOOD SCIENCE Agriculture & Natural Resources
## 6  1105            PLANT SCIENCE AND AGRONOMY Agriculture & Natural Resources
# Collect the distinct major names in sorted order.
# levels() only works when read.csv() produced a factor (the default before
# R 4.0); on R >= 4.0 the column is a plain character vector and levels()
# returns NULL, which would make the filter below return nothing. Coercing to
# character and taking sort(unique()) yields the same values in both cases.
majors <- sort(unique(as.character(majors_df$Major)))
# Case-insensitive search for either keyword anywhere in the major name.
match_data_stats <- str_detect(
  majors,
  regex("DATA|STATISTICS", ignore_case = TRUE)
)
majors[match_data_stats]
## [1] "COMPUTER PROGRAMMING AND DATA PROCESSING"     
## [2] "MANAGEMENT INFORMATION SYSTEMS AND STATISTICS"
## [3] "STATISTICS AND DECISION SCIENCE"

2

Write code that transforms the data below:

# Raw console printout of a character vector, kept as a single string.
# The four printed rows are separated by a blank line (hence sep = "\n\n");
# the trailing spaces on rows two and three are the column padding from the
# original printout and are part of the string.
log_output <- paste(
  '[1] "bell pepper"  "bilberry"     "blackberry"   "blood orange"',
  '[5] "blueberry"    "cantaloupe"   "chili pepper" "cloudberry"  ',
  '[9] "elderberry"   "lime"         "lychee"       "mulberry"    ',
  '[13] "olive"        "salal berry"',
  sep = "\n\n"
)

Into a format like this:

c(“bell pepper”, “bilberry”, “blackberry”, “blood orange”, “blueberry”, “cantaloupe”, “chili pepper”, “cloudberry”, “elderberry”, “lime”, “lychee”, “mulberry”, “olive”, “salal berry”)

# Lazily match every double-quoted token in the printout (quotes included).
i1 <- str_extract_all(log_output, regex('"(.*?)"'))
# Flatten the result into a plain character vector.
i2 <- unlist(i1)
# Remove the double-quote characters from each extracted token.
final <- str_replace_all(i2, '\\\"', '')
final
##  [1] "bell pepper"  "bilberry"     "blackberry"   "blood orange" "blueberry"   
##  [6] "cantaloupe"   "chili pepper" "cloudberry"   "elderberry"   "lime"        
## [11] "lychee"       "mulberry"     "olive"        "salal berry"

3

Describe, in words, what these expressions will match:

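(.)\1\1 <- As a regular expression, this matches any character repeated three times in a row, e.g. 'bbb'. Typed as the R string "(.)\1\1" (single backslashes), however, R reads each \1 as an escape for the control character U+0001, so the pattern instead matches any character followed by two U+0001 characters, e.g. the R string 'b\1\1' (anychar-U+0001-U+0001)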
# With single backslashes, R parses "\1" as the control character U+0001, so
# this pattern is: any one character followed by two U+0001 characters.
rexpr <- "(.)\1\1"
# Candidates mixing plain text, literal "\\1" text, and embedded U+0001.
test_str <- c(
  "...",
  "bbb",
  "(.)\1\1",
  "(.)(.)(.)",
  "(.)\1\1(.)\1\1(.)\1\1",
  "(.)\\1\\1(.)\\1\\1(.)\\1\\1",
  "(.)\1\1(.)\1\1(.)\1\1(.)\1\1(.)\1\1",
  ")\1\1)\1\1)\1\1",
  ".)\1\1.)\1\1.)\1\1",
  ".\1\1.\1\1.\1\1",
  "b\1\1b\1\1b\1\1"
)
# Show where the pattern matches in each candidate.
str_view_all(test_str, rexpr)
"(.)(.)\\2\\1" <- Matches any 4-character palindrome (two characters followed by the same two characters in reverse order) inside double quotes, e.g. '"!**!"' ("group1-group2-group2-group1")
# The double quotes are literal characters in this pattern: an opening quote,
# two captured characters, the same two in reverse order, a closing quote.
rexpr <- '"(.)(.)\\2\\1"'
test_str <- c(
  'abba', 'abab', '"abba"', '"baba"', 'baba', '"aaaaabbbbbbbbabab"',
  '"aabbab"', 'a"abba"b', '"!**!"', "'*!!*'", '"!*8*!"', '"1221"'
)
# Show where the pattern matches in each candidate.
str_view_all(test_str, rexpr)
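(..)\1 <- As a regular expression, this matches any 2 characters immediately followed by the same 2 characters, e.g. 'abab'. Typed as the R string '(..)\1' (single backslash), however, R reads \1 as an escape for the control character U+0001, so the pattern instead matches any 2 characters followed by one U+0001 character, e.g. the R string '/3\1' (anychar-anychar-U+0001)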
# With a single backslash, R parses '\1' as the control character U+0001, so
# this pattern is: any two characters followed by one U+0001 character.
rexpr <- '(..)\1'
test_str <- c(
  'bb',
  'b2\1',
  '/3\1',
  '\\\\\\1',
  '\\\\\1',
  'efg\\\\\1abcd',
  'abcdefsgz\1fasdf'
)
# Show where the pattern matches in each candidate.
str_view_all(test_str, rexpr)
"(.).\\1.\\1" <- matches a 5-character sequence inside double quotes in which the same character appears in positions 1, 3 and 5, with any character in positions 2 and 4, e.g. '"b*b!b"' ("group1-anychar-group1-anychar-group1")
# Literal double quotes around: a captured character, any character, the
# captured character again, any character, and the captured character again.
rexpr <- '"(.).\\1.\\1"'
test_str <- c(
  'b*b!b', '"b*b!b"', '"b*b!c"', '"b*bb!b"', '"b**b!b"', 'aaaa"b*b!b"bbb'
)
# Show where the pattern matches in each candidate.
str_view_all(test_str, rexpr)
"(.)(.)(.).*\\3\\2\\1" <- matches three characters, then 0 or more of any characters, then the same three characters in reverse order, all inside double quotes, e.g. '"123ab321"' ("group1-group2-group3-0 or more anychar-group3-group2-group1")
# Literal double quotes around: three captured characters, any run of
# characters (possibly empty), then the three captures in reverse order.
rexpr <- '"(.)(.)(.).*\\3\\2\\1"'
test_str <- c(
  '123aaaaa321', '"123aaaaa321"', '"123321"', '"123ab321"',
  'aaa"123ab321"bbb'
)
# Show where the pattern matches in each candidate.
str_view_all(test_str, rexpr)

4

Construct regular expressions to match words that: (assuming the expression must match the whole word)

Start and end with the same character.
rexpr -> ^(.).*\\1$
# Anchored pattern: capture the first character, allow anything in between,
# and require the captured character again at the very end.
# NOTE(review): the capture and the backreference occupy separate positions,
# so a one-character string does not match.
rexpr <- '^(.).*\\1$'
test_str <- c(
  '123aaaaa321', '"123aaaaa321"', '123ab32', '1cxboiasdlk1', 'zaesfasZ',
  'zadsfsfz"'
)
# Show where the pattern matches in each candidate.
str_view_all(test_str, rexpr)

Contain a repeated pair of letters (e.g. “church” contains “ch” repeated twice.) rexpr -> (..).*\1

# Anchored pattern: somewhere in the string a captured pair of characters
# must occur again later; the leading and trailing .* let the match span the
# whole string.
rexpr <- '^.*(..).*\\1.*$'
test_str <- c(
  '123aaaaa321', '"123aaaaa321"', '123aaa32', '123aaaa32', '123abab',
  '12xabyabz12', '21xabyabz12'
)
# Show where the pattern matches in each candidate.
str_view_all(test_str, rexpr)

Contain one letter repeated in at least three places (e.g. “eleven” contains three “e”s.) rexpr -> .*(.)(.*\1.*){2,}

# Capture one character, then require at least two further occurrences of it,
# each allowed to be separated by any run of characters.
rexpr <- '.*(.)(.*\\1.*){2,}'
test_str <- c(
  'eleven', 'tweleve', 'a123aa32', '123aa32',
  'abcdef3 ghijkl3mnopqr3stuv3wxyz3'
)
# Show where the pattern matches in each candidate.
str_view_all(test_str, rexpr)