# Raw input: one string in which names and phone numbers run together with
# no separator between records.
raw.data <- "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"
library(stringr)
# Extract every run of two or more letters, periods, commas, or spaces from
# the raw string. str_extract_all() returns a list with one character vector
# per input string, so the names end up in name[[1]].
name <- str_extract_all(string = raw.data, pattern = "[[:alpha:]., ]{2,}")
name
## [[1]]
## [1] "Moe Szyslak" "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders" "Simpson, Homer" "Dr. Julius Hibbert"
# Normalise names to "first_name last_name" order and split them into parts.
#
# A name containing a comma is treated as "last_name, first_name" and is
# swapped to "first_name last_name"; a name without a comma is left in its
# original order. The first abbreviated token (letters followed by ". ",
# e.g. a title such as "Rev. " or an initial such as "C. ") is then removed
# and the remainder is split on single spaces.
#
# full_name: character vector of names (a single name is the usual case).
# Returns:   a list with one character vector of name parts per input name,
#            e.g. reverse("Burns, C. Montgomery") gives
#            list(c("Montgomery", "Burns")).
reverse <- function(full_name) {
  # Swap "last, first" -> "first last". Whitespace after the comma is
  # optional, so "Burns,Montgomery" is handled as well. Anything following a
  # second comma is discarded. Names without a comma do not match the
  # pattern and pass through unchanged, which also keeps the function
  # vectorised over full_name.
  fl_name <- str_replace(full_name, "^([^,]*),\\s*([^,]*).*$", "\\2 \\1")

  # Remove the first title or initial (one or more letters, a period, and a
  # space), then break the name into its space-separated parts.
  fl_name <- str_replace(fl_name, "[[:alpha:]]+\\. ", "")
  str_split(fl_name, " ")
}
# Run reverse() over each extracted name. reverse() itself returns a list,
# so every element of Simpson_names is a one-element list holding the
# character vector of name parts.
Simpson_names <- lapply(X = name[[1]], FUN = reverse)
Simpson_names
## [[1]]
## [[1]][[1]]
## [1] "Moe" "Szyslak"
##
##
## [[2]]
## [[2]][[1]]
## [1] "Montgomery" "Burns"
##
##
## [[3]]
## [[3]][[1]]
## [1] "Timothy" "Lovejoy"
##
##
## [[4]]
## [[4]][[1]]
## [1] "Ned" "Flanders"
##
##
## [[5]]
## [[5]][[1]]
## [1] "Homer" "Simpson"
##
##
## [[6]]
## [[6]][[1]]
## [1] "Julius" "Hibbert"
# Report, for each name, whether it contains a title: two or more letters
# immediately followed by a period (e.g. "Rev." or "Dr.").
#
# full_name: character vector of names.
# Returns:   logical vector of the same length; TRUE where a title is found.
has_title <- function(full_name) {
  title_match <- str_extract(full_name, "[[:alpha:]]{2,}\\.")
  # str_extract() gives NA wherever the pattern did not match.
  !is.na(title_match)
}
name
## [[1]]
## [1] "Moe Szyslak" "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders" "Simpson, Homer" "Dr. Julius Hibbert"
# name is a list holding one character vector, so lapply() hands the whole
# vector of names to has_title() in a single call.
lapply(X = name, FUN = has_title)
## [[1]]
## [1] FALSE FALSE TRUE FALSE FALSE TRUE
# Report, for each name, whether it contains an abbreviated second name:
# a single letter that starts a word and is immediately followed by a period
# (e.g. the "C." in "Burns, C. Montgomery").
#
# full_name: character vector of names.
# Returns:   logical vector of the same length; TRUE where an initial is found.
has_secname <- function(full_name) {
  # The word boundary keeps the last letter of a longer title ("Rev.") from
  # being mistaken for an initial.
  initial_match <- str_extract(full_name, "\\b[[:alpha:]]\\.")
  # str_extract() gives NA wherever the pattern did not match.
  !is.na(initial_match)
}
name
## [[1]]
## [1] "Moe Szyslak" "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders" "Simpson, Homer" "Dr. Julius Hibbert"
# name is a list holding one character vector, so lapply() hands the whole
# vector of names to has_secname() in a single call.
lapply(X = name, FUN = has_secname)
## [[1]]
## [1] FALSE TRUE FALSE FALSE FALSE FALSE
# One or more digits followed by a literal dollar sign.
str_detect(string = "12345$", pattern = "[0-9]+\\$")
## [1] TRUE
# A lowercase word between 1 and 4 letters long, delimited by word boundaries.
str_detect(string = "asgf", pattern = "\\b[a-z]{1,4}\\b") # TRUE
## [1] TRUE
# Zero or more characters of any kind (matched lazily), followed by a literal "." and "txt" at the very end of the string; in other words, the name of a text file ending in .txt.
str_detect(string = "jad34.txt", pattern = ".*?\\.txt$") # TRUE
## [1] TRUE
# Two digits, a slash, two digits, a slash, and four digits, which matches a date such as 09/16/2018.
str_detect(string = "01/01/2018", pattern = "\\d{2}/\\d{2}/\\d{4}")
## [1] TRUE
# An opening tag, at least one character of content, and a closing tag whose
# name repeats the opening tag's name via the \1 backreference.
str_detect(
  string = "<html>I really love regex!!!!!!!!!!!</html>",
  pattern = "<(.+?)>.+?</\\1>"
)
## [1] TRUE