library(stringr)
# Raw text in which phone numbers and names run together with no delimiter.
# It is assembled from one piece per record so each line stays readable; the
# concatenated value is identical to writing it as a single literal.
raw.data <- paste0(
  "555-1239Moe Szyslak",
  "(636) 555-0113Burns, C. Montgomery",
  "555-6542Rev. Timothy Lovejoy",
  "555 8904Ned Flanders",
  "636-555-3226Simpson, Homer",
  "5553642Dr. Julius Hibbert"
)
raw.data
## [1] "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"
# A name is a run of two or more letters, periods, commas or spaces. Digits,
# hyphens and parentheses end a run, and the {2,} minimum skips the single
# spaces that occur inside some of the phone numbers.
# raw.data is a single string, so the first list element holds every match.
name <- str_extract_all(raw.data, "[[:alpha:]., ]{2,}")[[1]]
name
## [1] "Moe Szyslak" "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders" "Simpson, Homer" "Dr. Julius Hibbert"
# Names written as "Last, First" contain a comma. Flag them so they can be
# separated from the names that are already in "First Last" order.
notconfflag <- str_detect(string = name, pattern = ",")
# Non-conforming names: the ones that contain a comma.
notconfnames <- name[notconfflag]
notconfnames
## [1] "Burns, C. Montgomery" "Simpson, Homer"
# Conforming names: everything that was not flagged.
confnames <- name[!notconfflag]
confnames
## [1] "Moe Szyslak" "Rev. Timothy Lovejoy" "Ned Flanders"
## [4] "Dr. Julius Hibbert"
# Reorder the "Last, First" names into "First Last".
# Each name is split at its first comma only (n = 2), so the pieces always
# alternate: last name, first name, last name, first name, ...
# Position, not the presence of whitespace, decides which piece is which.
# A surname that contains a space, or a first name written without a space
# after the comma, is therefore still classified correctly.
splitnames <- unlist(str_split(notconfnames, ",", n = 2))
# Even positions hold the first-name pieces, odd positions the last names.
firstnamesflag <- seq_along(splitnames) %% 2 == 0
firstnames <- str_trim(splitnames[firstnamesflag])
lastnames <- str_trim(splitnames[!firstnamesflag])
fixednames <- str_c(firstnames, " ", lastnames)
# Conforming names first, then the reordered ones.
finalnames <- c(confnames, fixednames)
finalnames
## [1] "Moe Szyslak" "Rev. Timothy Lovejoy" "Ned Flanders"
## [4] "Dr. Julius Hibbert" "C. Montgomery Burns" "Homer Simpson"
# Flag names that carry a title: two or more letters directly followed by a
# period (e.g. "Rev.", "Dr."). The POSIX class is written in its bracketed
# form, [[:alpha:]], and the unused capture groups are dropped.
hastitleflag <- str_detect(finalnames, "[[:alpha:]]{2,}\\.")
hastitle <- finalnames[hastitleflag]
hastitle
## [1] "Rev. Timothy Lovejoy" "Dr. Julius Hibbert"
# Flag names that include a second name given as an initial: a single
# capital letter followed by a period. The leading \\b requires the capital
# to start a word, so the last letter of an all-caps abbreviation such as
# "DR." is not mistaken for an initial.
has2ndflag <- str_detect(finalnames, "\\b[A-Z]\\.")
has2ndname <- finalnames[has2ndflag]
has2ndname
## [1] "C. Montgomery Burns"
# Sample text with the "$" sign placed both before and after amounts.
ex_dollar <- "Price: $50, Tax: $5, Total: 55$, Fees$"
ex_dollar
## [1] "Price: $50, Tax: $5, Total: 55$, Fees$"
# Keep only the amounts where one or more digits are immediately followed by
# a literal "$". "$50" and "$5" have the sign in front, and "Fees$" has no
# digits before it, so none of those match.
dollar_tot <- str_extract_all(string = ex_dollar, pattern = "[0-9]+\\$")
dollar_tot
## [[1]]
## [1] "55$"
# Sample tokens: lowercase words of various lengths, an uppercase word,
# a number, and tokens that mix letters with digits.
ex_lowerc <- "abcd abc ABCD abcdefg 1234 abcd1 xy xy2z zyxw lmnopq"
ex_lowerc
## [1] "abcd abc ABCD abcdefg 1234 abcd1 xy xy2z zyxw lmnopq"
# Whole words made of one to four lowercase letters. The word boundaries on
# both sides reject longer words and tokens such as "abcd1" or "xy2z", where
# a digit sits next to the letters.
lowerc <- str_extract_all(string = ex_lowerc, pattern = "\\b[a-z]{1,4}\\b")
lowerc
## [[1]]
## [1] "abcd" "abc" "xy" "zyxw"
# Sample file names with assorted extensions.
ex_files <- c("File1.doc", "File2.xls", "File3.txt", "File4.ppt", "File5.txt")
ex_files
## [1] "File1.doc" "File2.xls" "File3.txt" "File4.ppt" "File5.txt"
# Keep the file names that end in ".txt" and have at least one character
# before the extension. str_subset() filters the vector directly and returns
# the matching elements unchanged, so the result does not depend on the
# pattern happening to span each whole string; NA entries are dropped.
txtfiles <- str_subset(ex_files, ".+?\\.txt$")
txtfiles
## [1] "File3.txt" "File5.txt"
# Sample text holding three dates, the last one with a two-digit year.
ex_date <- "IssueDate 01/01/2018, ExpDate 12/31/2020, OtherDate 04/02/18"
ex_date
## [1] "IssueDate 01/01/2018, ExpDate 12/31/2020, OtherDate 04/02/18"
# Two digits, slash, two digits, slash, four digits. "04/02/18" is skipped
# because its year has only two digits.
dates <- str_extract_all(string = ex_date, pattern = "\\d{2}/\\d{2}/\\d{4}")
dates
## [[1]]
## [1] "01/01/2018" "12/31/2020"
# Sample text in which the same <...> tag appears three times.
ex_backref <- "<1234> abcde <1234> xyz<1234>"
ex_backref
## [1] "<1234> abcde <1234> xyz<1234>"
# Capture a tag, then match lazily up to the next occurrence of that same
# tag (backreference \\1). The first two tags form one match; the search
# resumes after it, where only a single tag remains, so nothing else matches.
backref <- str_extract_all(string = ex_backref, pattern = "(<.+?>).+?\\1")
backref
## [[1]]
## [1] "<1234> abcde <1234>"