R> name
[1] "Moe Szyslak" "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
[4] "Ned Flanders" "Simpson, Homer" "Dr. Julius Hibbert"
# Raw phone-book text: phone numbers and names run together without separators.
rawData <- "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"
# A name is any run of two or more letters, periods, commas or spaces; the
# digits, hyphens and parentheses of the phone numbers are not in the character
# class, so they split the text into one match per person.
# rawData is a single string, so the first (only) list element holds all matches.
names <- str_extract_all(string = rawData, pattern = "[[:alpha:]., ]{2,}")[[1]]
names
## [1] "Moe Szyslak" "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders" "Simpson, Homer" "Dr. Julius Hibbert"
# Remove titles (e.g. "Rev.", "Dr.") from the names.
# A title is taken to be two or more word characters followed by a period, so a
# single-letter initial such as "C." is left untouched. The leading '\\b' is not
# strictly necessary here, but it anchors the match at a word boundary, which is
# useful when the title is preceded by another word in a sentence. The trailing
# '\\s*' also consumes the whitespace after the title, so the cleaned name does
# not start with a stray blank.
firstLastName <- str_replace_all(names, "\\b\\w{2,}\\.\\s*", "")
firstLastName
## [1] "Moe Szyslak" "Burns, C. Montgomery" "Timothy Lovejoy"
## [4] "Ned Flanders" "Simpson, Homer" "Julius Hibbert"
# Tabulate each extracted name next to a flag telling whether it carries a
# title, i.e. two or more word characters ending in a period at a word boundary.
data.frame(
  names = names,
  hasTitle = str_detect(string = names, pattern = "\\b\\w{2,}?\\.")
)
## names hasTitle
## 1 Moe Szyslak FALSE
## 2 Burns, C. Montgomery FALSE
## 3 Rev. Timothy Lovejoy TRUE
## 4 Ned Flanders FALSE
## 5 Simpson, Homer FALSE
## 6 Dr. Julius Hibbert TRUE
# Flag names that contain a second (middle) name, written as a single-letter
# initial followed by a period (e.g. the "C." in "Burns, C. Montgomery").
# Requiring exactly one letter right after the word boundary keeps titles such
# as "Rev." and "Dr." from being counted as second names.
data.frame(names = firstLastName, hasSecondName = str_detect(names, "\\b[[:alpha:]]\\."))
## names hasSecondName
## 1 Moe Szyslak FALSE
## 2 Burns, C. Montgomery TRUE
## 3 Timothy Lovejoy FALSE
## 4 Ned Flanders FALSE
## 5 Simpson, Homer FALSE
## 6 Julius Hibbert FALSE
# Pattern: one or more digits immediately followed by a literal dollar sign.
matcher <- "[0-9]+\\$"
values <- c("The shoe worths $100", "ab988uy$", "1234", "5000$")
data.frame(
  value = values,
  matched = str_detect(string = values, pattern = matcher)
)
## value matched
## 1 The shoe worths $100 FALSE
## 2 ab988uy$ FALSE
## 3 1234 FALSE
## 4 5000$ TRUE
# Pattern: a whole word made of one to four lower-case letters.
matcher <- "\\b[a-z]{1,4}\\b"
values <- c("The shoe worths $100", "My Name Is Subzero", "My name Is subzero", "5000$")
data.frame(
  value = values,
  matched = str_detect(string = values, pattern = matcher)
)
## value matched
## 1 The shoe worths $100 TRUE
## 2 My Name Is Subzero FALSE
## 3 My name Is subzero TRUE
## 4 5000$ FALSE
# Pattern: any string that ends in the literal suffix ".txt".
matcher <- ".*?\\.txt$"
values <- c(
  "chess_players_ratings.txt",
  "chess_players_ratings.pdf",
  "Chess_PLAYERS_raTings.txt",
  "Chess_PlAYeRS.tx"
)
data.frame(
  value = values,
  matched = str_detect(string = values, pattern = matcher)
)
## value matched
## 1 chess_players_ratings.txt TRUE
## 2 chess_players_ratings.pdf FALSE
## 3 Chess_PLAYERS_raTings.txt TRUE
## 4 Chess_PlAYeRS.tx FALSE
# Pattern: two digits, a slash, two digits, a slash, four digits
# (a date-like dd/dd/dddd shape).
matcher <- "\\d{2}/\\d{2}/\\d{4}"
values <- c("3/50", "20/50/5504", "99/100-33/50", "11/22/4840")
data.frame(
  value = values,
  matched = str_detect(string = values, pattern = matcher)
)
## value matched
## 1 3/50 FALSE
## 2 20/50/5504 TRUE
## 3 99/100-33/50 FALSE
## 4 11/22/4840 TRUE
# Pattern: an opening tag, some content, and a closing tag whose name is the
# same text captured from the opening tag (back-reference \\1).
matcher <- "<(.+?)>.+?</\\1>"
values <- c(
  "<a>click here to sign up</a>",
  "sps.cuny.edu",
  "<button> a classic button</button>",
  "https://www.amazon.com"
)
data.frame(
  value = values,
  matched = str_detect(string = values, pattern = matcher)
)
## value matched
## 1 <a>click here to sign up</a> TRUE
## 2 sps.cuny.edu FALSE
## 3 <button> a classic button</button> TRUE
## 4 https://www.amazon.com FALSE
Crack it with R and regular expressions.

##### Hint: Some of the characters are more revealing than others! The code snippet is also available in the materials at www.r-datacollection.com.
# On close inspection, the hidden message is spelled out by the upper-case
# letters (plus punctuation) scattered among lower-case letters and digits.
nerdMs <- "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0Tanwo
Uwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigO
d6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5
fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr"
# Step 1: strip every lower-case letter, digit and line break.
upperOnly <- str_replace_all(nerdMs, "([a-z])|([0-9])|\n", "")
# Step 2: the periods that remain separate the words, so turn them into spaces.
superNerd <- str_replace_all(upperOnly, "\\.", " ")
superNerd
## [1] "CONGRATULATIONS YOU ARE A SUPERNERD!"