3-1
# Raw input: six names with phone numbers wedged between them.
raw.data <- "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"
# A name is a run of at least two letters, commas, periods, or spaces.
regex <- "([a-zA-Z,. ]){2,}"
# raw.data is a single string, so every match sits in the first list element.
name <- str_extract_all(raw.data, regex)[[1]]
name
## [1] "Moe Szyslak" "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders" "Simpson, Homer" "Dr. Julius Hibbert"
str(name)
## chr [1:6] "Moe Szyslak" "Burns, C. Montgomery" "Rev. Timothy Lovejoy" ...
# Drop every "word. " prefix, which removes titles and middle initials alike.
data.name <- str_replace_all(name, "(\\w+)\\.\\s", "")
data.name
## [1] "Moe Szyslak" "Burns, Montgomery" "Timothy Lovejoy"
## [4] "Ned Flanders" "Simpson, Homer" "Julius Hibbert"
# Turn "Last, First" into "First Last"; names without a comma are unchanged.
data.name <- gsub("(\\w+)\\,\\s(.*)", "\\2 \\1", data.name)
# Split at the first space only, then label the two resulting columns.
df <- setNames(
  data.frame(str_split_fixed(data.name, " ", 2)),
  c("First Name", "Last Name")
)
df
## First Name Last Name
## 1 Moe Szyslak
## 2 Montgomery Burns
## 3 Timothy Lovejoy
## 4 Ned Flanders
## 5 Homer Simpson
## 6 Julius Hibbert
Checking whether the names have a title
# Flag the names that carry a title. A title is taken to be a whole word of
# two or more letters followed by a period ("Rev.", "Dr."). The leading \b
# anchors the match at the start of the word, so a longer title is matched in
# full rather than only by its last letters, and single-letter initials such
# as "C." are too short to match.
str_detect(name, "\\b[A-Za-z]{2,}\\.")
## [1] FALSE FALSE TRUE FALSE FALSE TRUE
# We can see that the 3rd and last names contain titles
# str_extract() already returns a character vector (NA where nothing matches),
# so no unlist() is needed.
str_extract(name, "\\b[A-Za-z]{2,}\\.")
## [1] NA NA "Rev." NA NA "Dr."
Check whether a character has a second (middle) name
# A title would be counted as an extra name, so remove it first. The pattern
# matches a whole word of two or more letters followed by a period; the
# leading \b keeps it from clipping only the tail of a longer title. The space
# after a removed title is left in place, hence the leading blanks below.
cleaned_names <- str_remove_all(name, "\\b[A-Za-z]{2,}\\.")
cleaned_names
## [1] "Moe Szyslak" "Burns, C. Montgomery" " Timothy Lovejoy"
## [4] "Ned Flanders" "Simpson, Homer" " Julius Hibbert"
# Count the words in each name: more than two words means a middle name is
# present. Counting \w+ runs ignores whitespace, so no trimming is required.
str_count(cleaned_names, "\\w+") > 2
## [1] FALSE TRUE FALSE FALSE FALSE FALSE
## Only C. Montgomery Burns has a middle name