library(stringr)
# Raw text in which person names are interleaved with phone numbers.
raw.data <- "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson,Homer5553642Dr. Julius Hibbert"
# A name is a run of at least two letters, periods, commas or spaces.
# str_extract_all() returns one list element per input string; there is a
# single input string here, so its matches are taken directly.
name <- str_extract_all(raw.data, "[[:alpha:]., ]{2,}")[[1]]
name
## [1] "Moe Szyslak" "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders" "Simpson,Homer" "Dr. Julius Hibbert"
# Split every extracted name into title / first / middle / last components.
# The result vectors are sized from `name` (not a hard-coded count) and typed
# as character up front, so a component that never occurs still yields a
# character column instead of a logical one.
title <- rep_len(NA_character_, length(name))
first <- rep_len(NA_character_, length(name))
middle <- rep_len(NA_character_, length(name))
last <- rep_len(NA_character_, length(name))
# Per-name token lists; the entries are overwritten inside the loop.
splitName <- str_split(name, ",")
splitFirst <- str_split(first, " ")
# seq_along() keeps the loop from running at all when `name` is empty
# (1:length(name) would iterate over c(1, 0)).
for (i in seq_along(name)) {
  if (str_detect(name[i], ",")) {
    # "Last, First" form: the part before the comma is the last name.
    splitName[[i]] <- str_split(name[i], ",")[[1]]
    last[i] <- splitName[[i]][1]
    # Trim the white space left behind by a ", " separator.
    first[i] <- str_trim(splitName[[i]][2])
    if (str_detect(first[i], "\\.")) {
      # The given names contain an abbreviation (e.g. "C. Montgomery"): the
      # leading token is stored as the middle name and the next token as the
      # first name. NOTE(review): this assumes the abbreviated part comes
      # first -- confirm for other inputs.
      splitFirst[[i]] <- str_split(first[i], " ")[[1]]
      middle[i] <- splitFirst[[i]][1]
      first[i] <- splitFirst[[i]][2]
    }
  } else if (str_detect(name[i], "[[:alpha:]]{2,3}\\.")) {
    # A two- or three-letter abbreviation such as "Rev." or "Dr." is present:
    # the space-separated tokens are taken as title, first name, last name.
    splitName[[i]] <- str_split(name[i], " ")[[1]]
    title[i] <- splitName[[i]][1]
    first[i] <- splitName[[i]][2]
    last[i] <- splitName[[i]][3]
  } else {
    # Plain "First Last": split at the space.
    splitName[[i]] <- str_split(name[i], " ")[[1]]
    first[i] <- splitName[[i]][1]
    last[i] <- splitName[[i]][2]
  }
}
title
## [1] NA NA "Rev." NA NA "Dr."
first
## [1] "Moe" "Montgomery" "Timothy" "Ned" "Homer"
## [6] "Julius"
middle
## [1] NA "C." NA NA NA NA
last
## [1] "Szyslak" "Burns" "Lovejoy" "Flanders" "Simpson" "Hibbert"
# Assemble the parsed components into a data frame, one row per name.
# Building it directly from the vectors sizes it to the data instead of to a
# hard-coded 6 x 4 placeholder matrix; stringsAsFactors = FALSE keeps the
# columns as character on R versions before 4.0 as well.
newName <- data.frame(
  title = title,
  first = first,
  middle = middle,
  last = last,
  stringsAsFactors = FALSE
)
newName
## title first middle last
## 1 <NA> Moe <NA> Szyslak
## 2 <NA> Montgomery C. Burns
## 3 Rev. Timothy <NA> Lovejoy
## 4 <NA> Ned <NA> Flanders
## 5 <NA> Homer <NA> Simpson
## 6 Dr. Julius <NA> Hibbert
# Flag which records carry a title and which carry a middle name.
# Missing values have to be tested with is.na(): comparing against the string
# 'NA' returns NA (not FALSE) for every missing entry, so the result could not
# be used as a clean TRUE/FALSE indicator.
hasTitle <- !is.na(newName$title)
hasTitle
## [1] FALSE FALSE TRUE FALSE FALSE TRUE
hasMiddle <- !is.na(newName$middle)
hasMiddle
## [1] FALSE TRUE FALSE FALSE FALSE FALSE
# One or more digits immediately followed by a literal dollar sign ($):
# `[0-9]+` requires at least one digit; `\\$` is an escaped, literal dollar
# sign rather than the end-of-string anchor.
pattern <- "[0-9]+\\$"
string <- c("numbers", "123123$a", "123a123$")
str_detect(string = string, pattern = pattern)
## [1] FALSE TRUE TRUE
str_extract(string = string, pattern = pattern)
## [1] NA "123123$" "123$"
# A standalone word of one to four lowercase letters (delimited by word boundaries):
# `\\b` marks a word boundary on each side, so the one to four lowercase
# letters may not touch another letter or a digit.
pattern <- "\\b[a-z]{1,4}\\b"
string <- c("AbCd", "abcd5", "abcd")
str_detect(string = string, pattern = pattern)
## [1] FALSE FALSE TRUE
str_extract(string = string, pattern = pattern)
## [1] NA NA "abcd"
# Zero or more characters followed by '.txt' at the very end of the string:
# `.*?` lazily matches any (possibly empty) prefix; `\\.txt$` demands a
# literal ".txt" as the last characters of the string.
pattern <- ".*?\\.txt$"
string <- c(".txt ", "abc.txt$", "anyTh1nG$.txt")
str_detect(string = string, pattern = pattern)
## [1] FALSE FALSE TRUE
str_extract(string = string, pattern = pattern)
## [1] NA NA "anyTh1nG$.txt"
# A date-like string: two digits, a slash, two digits, a slash, then four digits (e.g. 12/31/2017):
# Two digits, "/", two digits, "/", four digits; a two-digit year or missing
# slashes do not match.
pattern <- "\\d{2}/\\d{2}/\\d{4}"
string <- c("12312017", "12/31/17", "12/31/2017")
str_detect(string = string, pattern = pattern)
## [1] FALSE FALSE TRUE
str_extract(string = string, pattern = pattern)
## [1] NA NA "12/31/2017"
# A string with one or more characters inside <>, followed by one or more characters, and then a closing tag that repeats the first expression through a backreference, preceded by a slash (e.g. <tag>text</tag>):
# `(.+?)` captures the tag name; `\\1` requires the closing tag to repeat that
# exact name after "</", with at least one character between the two tags.
pattern <- "<(.+?)>.+?</\\1>"
string <- c("<abc>", "<abc> </def>", "<abc>anyTh1nG</abc>")
str_detect(string = string, pattern = pattern)
## [1] FALSE FALSE TRUE
str_extract(string = string, pattern = pattern)
## [1] NA NA "<abc>anyTh1nG</abc>"