Exercise 1

Using the 173 majors listed in fivethirtyeight.com’s College Majors dataset (https://fivethirtyeight.com/features/the-economic-guide-to-picking-a-college-major/), provide code that identifies the majors that contain either “DATA” or “STATISTICS”.

# Download the raw majors list from FiveThirtyEight's GitHub repository and
# parse the CSV text into a data frame.
x <- getURL("https://raw.githubusercontent.com/fivethirtyeight/data/master/college-majors/majors-list.csv")
collegeMajors <- read.csv(text = x)

# Keep the rows whose Major contains "DATA" or "STATISTICS" and show them in an
# interactive table. Inside filter() the column is referenced by its bare name
# (data masking) instead of collegeMajors$Major, so the predicate is always
# evaluated against the data frame that is being piped in.
collegeMajors %>%
  filter(str_detect(Major, pattern = "DATA|STATISTICS")) %>%
  reactable()

Exercise 2

Write code that transforms the data below:

[1] “bell pepper” “bilberry” “blackberry” “blood orange”

[5] “blueberry” “cantaloupe” “chili pepper” “cloudberry”

[9] “elderberry” “lime” “lychee” “mulberry”

[13] “olive” “salal berry”

Into a format like this:

c(“bell pepper”, “bilberry”, “blackberry”, “blood orange”, “blueberry”, “cantaloupe”, “chili pepper”, “cloudberry”, “elderberry”, “lime”, “lychee”, “mulberry”, “olive”, “salal berry”)

# Console printout of the fruit vector, pasted verbatim (index markers and
# quotes included).
fruitsStringRaw <- "[1] \"bell pepper\"  \"bilberry\"     \"blackberry\"   \"blood orange\"

[5] \"blueberry\"    \"cantaloupe\"   \"chili pepper\" \"cloudberry\"  

[9] \"elderberry\"   \"lime\"         \"lychee\"       \"mulberry\"    

[13] \"olive\"        \"salal berry\""

# Each fruit name is a run of lowercase letters, optionally followed by one
# whitespace character and a second run of lowercase letters. The index markers
# and quotes contain no lowercase letters, so they are never matched.
# The POSIX class is written in its bracketed form, [[:space:]], which matches
# the same text here and is also valid outside stringr's regex engine.
fruitsStringRaw <- unlist(
  str_extract_all(fruitsStringRaw, pattern = "[a-z]+[[:space:]]?[a-z]*")
)
fruitsStringRaw
##  [1] "bell pepper"  "bilberry"     "blackberry"   "blood orange" "blueberry"   
##  [6] "cantaloupe"   "chili pepper" "cloudberry"   "elderberry"   "lime"        
## [11] "lychee"       "mulberry"     "olive"        "salal berry"

Exercise 3

Describe, in words, what these expressions will match:

# Escaped backreferences: one character followed by two more copies of itself.
testWords <- c("aaa", "aba", "111", "212", "abcedfg")
testWords[str_detect(testWords, pattern = "(.)\\1\\1")]
## [1] "aaa" "111"

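As written in the exercise, with single backslashes, the R string "(.)\1\1" contains no backreference: R reads \1 as the octal escape for the ASCII SOH control character (\001). The resulting pattern therefore matches any one character (other than a newline) followed by two SOH characters. With the backslashes doubled, as in the code above, "(.)\\1\\1" matches any character repeated three times in a row.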

# Here \1 is left unescaped, so R stores it as the control character \001 in
# both the test words and the pattern.
testWords <- c("aaa\1\1", "aba\1\1", "111", "212", "abcedfg")
testWords %>% str_subset(pattern = "(.)\1\1")
## [1] "aaa\001\001" "aba\001\001"
# Literal double quotes enclosing two characters followed by the same two
# characters in reverse order.
testWords <- c(
  "\"abba\"", "aaba", "1221", "\"1221\"", "abcedfg", "123\"abba\"abc"
)
str_subset(string = testWords, pattern = "\"(.)(.)\\2\\1\"")
## [1] "\"abba\""       "\"1221\""       "123\"abba\"abc"
# Escaped backreference: any two characters immediately followed by the same
# two characters.
testWords <- c("abab", "aaba", "1212", "abcabc", "abcedfg", "adedeb")
testWords[str_detect(testWords, pattern = "(..)\\1")]
## [1] "abab"   "1212"   "adedeb"

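As written in the exercise, with a single backslash, the R string "(..)\1" matches any two characters (other than newlines) followed by one ASCII SOH character, because R reads \1 as the octal escape \001. With the backslash doubled, as in the code above, "(..)\\1" matches any pair of characters that is immediately repeated, such as "abab".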

# Here \1 is left unescaped, so the pattern is two characters followed by the
# control character \001.
testWords <- c("ab\1", "aa\1ba", "1212\1", "abcabc", "abcedfg", "adedeb")
testWords %>% str_subset(pattern = "(..)\1")
## [1] "ab\001"   "aa\001ba" "1212\001"
# Literal double quotes enclosing five characters in which the first, third
# and fifth are the same.
testWords <- c(
  "\"abaca\"", "\"12151\"", "1212", "abcabc", "abcedfg", "adedeb"
)
str_subset(string = testWords, pattern = "\"(.).\\1.\\1\"")
## [1] "\"abaca\"" "\"12151\""
# Literal double quotes enclosing three characters, then anything (possibly
# nothing), then the same three characters in reverse order.
testWords <- c(
  "\"123abcdejdhfj321\"",
  "\"123321\"",
  "\"abcanythingcba\"",
  "1212",
  "abcabc",
  "abcedfg",
  "adedeb"
)
testWords[str_detect(testWords, pattern = "\"(.)(.)(.).*\\3\\2\\1\"")]
## [1] "\"123abcdejdhfj321\"" "\"123321\""           "\"abcanythingcba\""

Exercise 4

^(.).*\\1$|^(.)$

# Strings that start and end with the same character; the second alternative
# covers strings that are a single character long.
testWords <- c(
  "aasdfjdskfjksdla", "b1238219839129b", "asdfkasdjfkas",
  "abcdef1234", "123451", "dfkdsafi21", "a"
)
testWords %>% str_subset(pattern = "^(.).*\\1$|^(.)$")
## [1] "aasdfjdskfjksdla" "b1238219839129b"  "123451"           "a"
# Strings that contain a pair of letters which appears again later in the
# string (for example "ch" in "church"). Digits do not count as letters.
testWords <- c(
  "church", "aasbdfsdbfaashadbfhaa", "12abcdef12",
  "213456", "123451", "dfkdsafi21", "a"
)
# The POSIX class is written in its bracketed form, [[:alpha:]], which matches
# the same text here and is also valid outside stringr's regex engine.
str_subset(testWords, pattern = "([[:alpha:]][[:alpha:]]).*\\1")
## [1] "church"                "aasbdfsdbfaashadbfhaa"
# Strings in which one letter occurs in at least three places (for example
# "e" in "eleven"). Digits do not count as letters.
testWords <- c(
  "eleven", "seventeen", "123123123", "abcabcabc",
  "dfkdsafi21", "a", "eee", "111"
)
# The POSIX class is written in its bracketed form, [[:alpha:]], which matches
# the same text here and is also valid outside stringr's regex engine.
str_subset(testWords, pattern = "([[:alpha:]]).*\\1.*\\1")
## [1] "eleven"    "seventeen" "abcabcabc" "eee"