library(stringr)
# Raw text in which names and phone numbers run together without separators.
raw.data <- "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"
# A name is any run of two or more letters, periods, commas or spaces; the
# digits, hyphens and parentheses of the phone numbers break the runs apart.
# Only one input string is supplied, so the first list element holds every match.
name <- str_extract_all(raw.data, "[[:alpha:]., ]{2,}")[[1]]
name
[1] "Moe Szyslak" "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
[4] "Ned Flanders" "Simpson, Homer" "Dr. Julius Hibbert"
### Function to extract last names
# Takes a character vector of names (`list`) and returns a character vector of
# the same length holding the surname found in each element.
get_last <- function(list) {
  # Leftmost match of either a run of letters immediately followed by a comma
  # ("Last, First" form) or a space that sits right after a word character and
  # is followed by at least two letters (the word after a first name).
  surname_chunk <- str_extract(
    list,
    '[[:alpha:]]{1,}\\,|\\b [[:alpha:]]{2,}'
  )
  # Keep only the letters, dropping the trailing comma or leading space.
  str_extract(surname_chunk, "[[:alpha:]]{1,}")
}
### Function to extract first names
# Takes a character vector of names (`list`) and returns a character vector of
# the same length holding the first name found in each element.
get_first <- function(list) {
  # Leftmost match of one of three shapes:
  #   letters followed by a space        -> "First Last"
  #   ". " followed by letters           -> name after a title or an initial
  #   ", " followed by two or more letters -> "Last, First"
  given_chunk <- str_extract(
    list,
    '[[:alpha:]]{1,} |\\. [[:alpha:]]{1,}|\\, [[:alpha:]]{2,}'
  )
  # Keep only the letters, dropping the surrounding punctuation and spaces.
  str_extract(given_chunk, "[[:alpha:]]{1,}")
}
# Create df
# One row per entry of `name`: the extracted first and last names plus the
# recombined "First Last" form.
allNameInDF <- data.frame(
  first = get_first(name),
  last = get_last(name)
)
allNameInDF$fullname <- paste(allNameInDF$first, allNameInDF$last)
# Find if title exists
# The dots are escaped so that only a literal "Dr." or "Rev." counts; an
# unescaped "." matches any character and would also flag e.g. "Drake".
allNameInDF$title <- str_detect(name, "Dr\\.|Rev\\.")
# Find if Middle name exists
# Detected as a middle initial: a single letter followed by a period, with a
# space on either side. Uses the bracketed [[:alpha:]] class like the rest of
# the file.
allNameInDF$MiddleName <- str_detect(name, " [[:alpha:]]\\. ")
# Print DF
allNameInDF
first last fullname title MiddleName
1 Moe Szyslak Moe Szyslak FALSE FALSE
2 Montgomery Burns Montgomery Burns FALSE TRUE
3 Timothy Lovejoy Timothy Lovejoy TRUE FALSE
4 Ned Flanders Ned Flanders FALSE FALSE
5 Homer Simpson Homer Simpson FALSE FALSE
6 Julius Hibbert Julius Hibbert TRUE FALSE
Describe the types of strings that conform to the following regular expressions and construct an example that is matched by the regular expression.
This regex matches one or more consecutive digits immediately followed by a literal ‘$’ sign.
# (a) [0-9]+\\$
# Sample text with several digit runs; only those directly followed by "$" match.
a <- "892$xp09$.2*.2$oa$"
str_extract_all(a, '[0-9]+\\$')[[1]]
[1] "892$" "09$" "2$"
This regex matches a run of one to four lowercase letters that stands alone as a whole word, i.e. it is delimited on both sides by a word boundary (\b). Digits and uppercase letters are word characters, so “two” is not matched (the preceding “3” means there is no boundary before the “t”, and “3” is not in [a-z]), and neither is the “r” in “Er”. “three” is skipped because it has five letters.
# (b) \\b[a-z]{1,4}\\b
# Sample text mixing short lowercase words with digits, uppercase and punctuation.
b <- "u-one 3two.three FOUR%four$;etc U pp Er"
str_extract_all(b, '\\b[a-z]{1,4}\\b')[[1]]
[1] "u" "one" "four" "etc" "pp"
This regex matches an entire string that ends with ‘.txt’; the ‘$’ anchors the match to the end of the string. If the string does not end with ‘.txt’, nothing is returned.
# (c) .*?\\.txt$
# `c` ends with ".txt"; `c2` contains ".txt" only in the middle.
c <- "This$$is Ending with .txt"
c2 <- "this .txt is not ending with txt"
# Negative case: the `$` anchor requires ".txt" at the very end, so `c2`
# yields no match and the result is character(0).
unlist(str_extract_all(c2, '.*?\\.txt$'))
# Positive case: the whole string is returned because it ends with ".txt".
unlist(str_extract_all(c, '.*?\\.txt$'))
[1] "This$$is Ending with .txt"
This regex will return any string in the form ‘dd/dd/dddd’ where ‘d’ is a digit. This is a common form for dates.
# (d) \\d{2}/\\d{2}/\\d{4}
# Two dates in dd/dd/dddd form and one in dddd/dd/dd form, which must not match.
d <- "09/10/2016 10/12/2014 2015/01/02"
str_extract_all(d, '\\d{2}/\\d{2}/\\d{4}')[[1]]
[1] "09/10/2016" "10/12/2014"
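This regex matches an opening tag <text>, followed by one or more characters, up to the first closing tag </text> whose name is identical to the opening one; the backreference \1 repeats whatever the first group captured.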
# (e) <(.+?)>.+?</\\1>
# HTML fragment with two top-level elements; the nested <li> items fall inside
# the <ol> match.
e <- "<div>HTML tags </div> <ol><li>This is LI Tag</li><li>Again LI Tag</li></ol>"
str_extract_all(e, '<(.+?)>.+?</\\1>')[[1]]
[1] "<div>HTML tags </div>"
[2] "<ol><li>This is LI Tag</li><li>Again LI Tag</li></ol>"