library(stringr)
# Phone-book text in which phone numbers and names run together.
raw.data <- "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"
# A name is any run of two or more letters, periods, commas or spaces.
# raw.data is a single string, so the first (only) list element holds
# every match.
name <- str_extract_all(raw.data, "[[:alpha:]., ]{2,}")[[1]]
name
[1] "Moe Szyslak" "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
[4] "Ned Flanders" "Simpson, Homer" "Dr. Julius Hibbert"
### create a function to extract last names
# Returns the last name found in each element of `list`.
#
# Arguments:
#   list  Character vector of full names, written either "Last, First ..."
#         or "[Title] First [Middle] Last".
# Value:
#   Character vector the same length as `list`; NA where nothing matches.
#
# Limitation: a trailing suffix that ends in a period (e.g. "Jr.") is not
# recognised, and such a name without a comma yields NA.
get_last <- function(list) {
  # The surname is the word directly before a comma when there is one;
  # otherwise it is the final word of the string. Anchoring on the end of
  # the string (instead of on "a word boundary followed by a space") means
  # a preceding middle name, initial or title such as "J." or "Dr." can no
  # longer hide the surname or be returned in its place.
  str_extract(list, "[[:alpha:]]+(?=,)|[[:alpha:]]+(?=\\s*$)")
}
### create a function to extract first names
# Returns the first name found in each element of `list`.
#
# The inner call grabs the leftmost piece of the name that looks like
#   - a word followed by a space,
#   - a word that follows ". ", or
#   - a word of at least two letters that follows ", ";
# the outer call then keeps only the letters of that piece, dropping the
# surrounding punctuation and spaces. Elements with no match give NA.
get_first <- function(list) {
  str_extract(
    str_extract(list, "[[:alpha:]]{1,} |\\. [[:alpha:]]{1,}|\\, [[:alpha:]]{2,}"),
    "[[:alpha:]]{1,}"
  )
}
# Run the helper functions to build the data frame. stringsAsFactors = FALSE
# keeps the name columns as character vectors on R versions before 4.0.
simpsons <- data.frame(
  first = get_first(name),
  last = get_last(name),
  stringsAsFactors = FALSE
)
simpsons$fullname <- paste(simpsons$first, simpsons$last)
# Logical vector: does the name carry a title? The period is escaped so that
# only the literal abbreviations "Dr." and "Rev." count; an unescaped "."
# would also accept names such as "Drew" or "Reva".
simpsons$title <- str_detect(name, "\\b(Dr|Rev)\\.")
# Logical vector: does the name contain a second name, written as a single
# initial followed by a period with a space on either side (e.g. " C. ")?
simpsons$secondname <- str_detect(name, " [[:alpha:]]\\. ")
# Final product
simpsons
first last fullname title secondname
1 Moe Szyslak Moe Szyslak FALSE FALSE
2 Montgomery Burns Montgomery Burns FALSE TRUE
3 Timothy Lovejoy Timothy Lovejoy TRUE FALSE
4 Ned Flanders Ned Flanders FALSE FALSE
5 Homer Simpson Homer Simpson FALSE FALSE
6 Julius Hibbert Julius Hibbert TRUE FALSE
Describe the types of strings that conform to the following regular expressions and construct an example that is matched by the regular expression.
This regex will return a string of one or more numbers followed by a ‘$’ sign.
# (a) [0-9]+\\$
# One or more digits directly followed by a literal dollar sign.
a <- "342$a66$.2.2$a$"
str_extract_all(a, "[0-9]+\\$")[[1]]
[1] "342$" "66$" "2$"
This regex will return a string of one to four lowercase alpha characters so long as they are bounded by non-word characters. Note that digits are considered to be word characters, thus the “two” does not show up because the “2” is not a boundary, and is not [a-z].
# (b) \\b[a-z]{1,4}\\b
# A whole word made of one to four lowercase letters.
b <- "u-one 2two.three FOUR%four$;etc"
str_extract_all(b, "\\b[a-z]{1,4}\\b")[[1]]
[1] "u" "one" "four" "etc"
This regex will return any string that ends with ‘.txt’. If it does not end with .txt, it will not return anything.
# (c) .*?\\.txt$
# Any string whose last four characters are ".txt"; the match starts at the
# first character of the string.
c <- "332!34_2.txt else.png dark.txt"
str_extract_all(c, ".*?\\.txt$")[[1]]
[1] "332!34_2.txt else.png dark.txt"
This regex will return any string in the form ‘dd/dd/dddd’ where ‘d’ is a digit. This is a common form for dates.
# (d) \\d{2}/\\d{2}/\\d{4}
# Two digits, a slash, two digits, a slash, then four digits.
d <- "09/10/2016 10/12/2014 2015/01/02"
str_extract_all(d, "\\d{2}/\\d{2}/\\d{4}")[[1]]
[1] "09/10/2016" "10/12/2014"
This regex uses backreferencing to return any string that starts with a <text> and ends with </text>. This would be a good way to search through html or xml.
# (e) <(.+?)>.+?</\\1>
# An opening tag, some content, and a closing tag that repeats the tag name
# captured by group 1.
e <- "<div>sample text</div> <ol><li>thingone</li><li>thingtwo</li></ol>"
str_extract_all(e, "<(.+?)>.+?</\\1>")[[1]]
[1] "<div>sample text</div>"
[2] "<ol><li>thingone</li><li>thingtwo</li></ol>"
The following code hides a secret message. Crack it with R and regular expressions. Hint: Some of the characters are more revealing than others! The code snippet is also available in the materials at www.r-datacollection.com.
encrypted <- "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0TanwoUwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigOd6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr"
## Remove every digit and lowercase letter; what is left (uppercase letters
## and punctuation) is the message. A plain character class is used because
## "|" inside square brackets is a literal pipe, not an alternation.
cracked <- str_replace_all(encrypted, "[a-z0-9]", "")
## The periods stand in for the spaces between words.
cracked <- str_replace_all(cracked, fixed("."), " ")
cracked
[1] "CONGRATULATIONS YOU ARE A SUPERNERD!"