library(stringr)
# Introductory example: phone numbers and names run together in one string.
raw.data <- "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"

# A name is any run of two or more letters, periods, commas or spaces.
# str_extract_all() returns a list, so flatten it to a character vector;
# the surrounding parentheses print the result.
(name <- unlist(
  str_extract_all(string = raw.data, pattern = "[[:alpha:]., ]{2,}")
))
## [1] "Moe Szyslak" "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders" "Simpson, Homer" "Dr. Julius Hibbert"
First I remove titles and middle initials by identifying any word followed by “.” and a space, and replacing it with the empty string “”
# Strip every "<word>. " token (titles such as "Rev. " and initials such as
# "C. "); the surrounding parentheses print the cleaned vector.
(replace <- str_replace_all(
  string = name,
  pattern = "\\w+\\. ",
  replacement = ""
))
## [1] "Moe Szyslak" "Burns, Montgomery" "Timothy Lovejoy"
## [4] "Ned Flanders" "Simpson, Homer" "Julius Hibbert"
Then I reorder the names written as “last name, first name” by backreferencing the two captured words in reverse order, giving “first name last name”
# Capture the word before and the word after ", " and emit them swapped
# (\\2 then \\1); names without a comma are left untouched.
(first_last_name <- str_replace_all(
  string = replace,
  pattern = "(\\w+)\\, (\\w+)",
  replacement = "\\2 \\1"
))
## [1] "Moe Szyslak" "Montgomery Burns" "Timothy Lovejoy"
## [4] "Ned Flanders" "Homer Simpson" "Julius Hibbert"
A title is flagged TRUE when the name contains a word of 2 or more characters followed by “.” and a space (e.g. “Rev.” or “Dr.”)
# Flag names that contain a word of at least two characters followed by
# ". " -- the two-character minimum keeps single-letter initials out.
title <- str_detect(string = name, pattern = "\\w{2,}\\. ")

# Show each name next to its title flag.
data.frame(name = name, title = title)
## name title
## 1 Moe Szyslak FALSE
## 2 Burns, C. Montgomery FALSE
## 3 Rev. Timothy Lovejoy TRUE
## 4 Ned Flanders FALSE
## 5 Simpson, Homer FALSE
## 6 Dr. Julius Hibbert TRUE
A second name (middle initial) is flagged TRUE when the name contains a single uppercase letter followed by “.” and a space (e.g. “C.”)
# Flag names that contain a middle initial: exactly one uppercase letter
# followed by ". ".
# The leading \\b requires the letter to start a word. Without it the last
# letter of an all-caps abbreviation (e.g. "DR. " or "JR. ") would be
# mistaken for an initial.
second_name <- str_detect(string = name, pattern = "\\b[A-Z]\\. ")

# Show each name next to its initial flag.
data.frame(name = name, second_name = second_name)
## name second_name
## 1 Moe Szyslak FALSE
## 2 Burns, C. Montgomery TRUE
## 3 Rev. Timothy Lovejoy FALSE
## 4 Ned Flanders FALSE
## 5 Simpson, Homer FALSE
## 6 Dr. Julius Hibbert FALSE
Extracts one or more digits from 0 to 9, followed by “$”
# One or more digits immediately followed by a literal dollar sign.
str_extract_all(
  string = "4324dfa94324$9842w34$",
  pattern = "[0-9]+\\$"
)
## [[1]]
## [1] "94324$" "34$"
Extracts every whole word made up of 1 to 4 lowercase letters; longer words and words containing uppercase letters (“hello”, “Mia”) are skipped
# Whole words (delimited by \\b on both sides) of 1 to 4 lowercase letters.
str_extract_all(
  string = c("hello", "my name is Mia"),
  pattern = "\\b[a-z]{1,4}\\b"
)
## [[1]]
## character(0)
##
## [[2]]
## [1] "my" "name" "is"
Extracts strings that end in “.txt” (i.e. text-file names); the part before the extension may be of any length, including empty
# Any (possibly empty) prefix followed by a literal ".txt" at end of string.
str_extract_all(
  string = c("lab1.txt", "hw#.txt", ".txt"),
  pattern = ".*?\\.txt$"
)
## [[1]]
## [1] "lab1.txt"
##
## [[2]]
## [1] "hw#.txt"
##
## [[3]]
## [1] ".txt"
Extracts digits in 2-2-4 format: xx/xx/xxxx (Date)
# Dates written as two digits / two digits / four digits; only the
# zero-padded, four-digit-year form matches.
str_extract_all(
  string = c("2/17/2019", "02/17/2019", "2/17/19"),
  pattern = "\\d{2}/\\d{2}/\\d{4}"
)
## [[1]]
## character(0)
##
## [[2]]
## [1] "02/17/2019"
##
## [[3]]
## character(0)
Extracts an HTML element in the format `<tag>text</tag>`, i.e. an opening tag, some text, and the matching closing tag (HTML)
# An opening tag, some content, and a closing tag whose name is the same
# as the captured opening tag name (backreference \\1).
str_extract_all(
  string = c("<html>", "<title>Page Title</title>", "</head>"),
  pattern = "<(.+?)>.+?</\\1>"
)
## [[1]]
## character(0)
##
## [[2]]
## [1] "<title>Page Title</title>"
##
## [[3]]
## character(0)
Hint: Some of the characters are more revealing than others! The code snippet is also available in the materials at www.r-datacollection.com.
clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0Tanwo Uwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigO d6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5 fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr
# The scrambled message, kept as a single string whose four segments are
# separated by newlines.
code <- paste(
  "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0Tanwo",
  "Uwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigO",
  "d6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5",
  "fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr",
  sep = "\n"
)
I found the message hidden in the code by extracting all the uppercase letters, together with the “.” characters that act as separators between the words
# Pull out every run of uppercase letters and periods; the surrounding
# parentheses print the resulting list.
(upper <- str_extract_all(string = code, pattern = "[[:upper:].]+"))
## [[1]]
## [1] "C" "O" "N" "G" "R" "A" "T" "U" "L" "AT" "I" "O" "N" "S"
## [15] "." "Y" "O" "U" "." "A" "R" "E" "." "A" ".S" "U" "P" "E"
## [29] "R" "N" "E" "R" "D"
Make the message look nicer by unlisting, pasting the letters together, and replacing every “.” with a space (“ ”)
# Flatten the extracted pieces, glue them into one string, then turn each
# period into a space to separate the words.
str_replace_all(
  string = paste0(unlist(upper), collapse = ""),
  pattern = "[\\.]",
  replacement = " "
)
## [1] "CONGRATULATIONS YOU ARE A SUPERNERD"