library(stringr)
## Warning: package 'stringr' was built under R version 3.4.3
library(XML)
## Warning: package 'XML' was built under R version 3.4.3
library(RCurl)
## Warning: package 'RCurl' was built under R version 3.4.3
## Loading required package: bitops
library(tau)
## Warning: package 'tau' was built under R version 3.4.3
# Raw phone-book text: phone numbers and names run together without any
# delimiter. Built piecewise (one entry per line) for readability; the pieces
# concatenate to a single string.
raw.data <- paste0(
  "555-1239Moe Szyslak",
  "(636) 555-0113Burns, C. Montgomery",
  "555-6542Rev. Timothy Lovejoy",
  "555 8904Ned Flanders",
  "636-555-3226Simpson, Homer",
  "5553642Dr. Julius Hibbert"
)
raw.data
## [1] "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"
# Pull the names out of the raw text: every run of two or more letters,
# periods, commas or spaces is treated as one name.
name.data <- unlist(
  str_extract_all(string = raw.data, pattern = "[[:alpha:]., ]{2,}")
)
# The names come in three layouts, each handled separately below.
# Layout 1: surname first ("Surname, Given ..."), recognised by the comma.
sc <- surname.case <- str_subset(string = name.data, pattern = "\\,+")
# First name: the final run of letters in the entry.
sc.first <- matrix(
  unlist(str_extract_all(string = sc, pattern = "\\b[:alpha:]+$"))
)
# Last name: the leading run of letters, which stops before the comma.
sc.last <- matrix(
  unlist(str_extract_all(string = sc, pattern = "^[:alpha:]+[^\\, ]"))
)
# One row per name: first name in column 1, last name in column 2.
sc.cleaned <- cbind(sc.first, sc.last)
# Layout 2: entries that start with a title such as "Rev." or "Dr.".
# The title is dropped from the cleaned name but kept in tc.title for later.
tc <- title.case <- str_subset(string = name.data, pattern = "\\w{2,3}\\.")
# First name: the run of letters that has whitespace on both sides.
tc.first <- matrix(
  unlist(str_extract_all(string = tc, pattern = "(?<=\\s)[:alpha:]+(?=\\s)"))
)
# Last name: the final run of letters in the entry.
tc.last <- matrix(
  unlist(str_extract_all(string = tc, pattern = "[:alpha:]+$"))
)
# Title: up to three word characters followed by a period at the very start.
tc.title <- matrix(
  unlist(str_extract_all(string = tc, pattern = "^\\w{1,3}\\."))
)
tc.cleaned <- cbind(tc.first, tc.last)
# Layout 3: entries already in conventional "firstname lastname" order,
# i.e. exactly two words separated by a single whitespace character.
firstlast.case <- str_subset(name.data, "^\\w+\\s\\w+$")
fc <- firstlast.case
# Split each entry on the space and lay the pieces out one name per row.
# Only the column count is fixed; the row count follows however many names
# matched instead of being hard-coded to two.
fc.cleaned <- matrix(unlist(str_split(fc, "[ ]")), ncol = 2, byrow = TRUE)
# Stack the three layouts (surname-first, titled, conventional) into one
# two-column table of first and last names.
name.data.cleaned <- rbind(sc.cleaned, tc.cleaned, fc.cleaned)
colnames(name.data.cleaned) <- c("first name", "last name")
# data.frame() rewrites the space in each column name as a dot (see output).
data.frame(name.data.cleaned)
## first.name last.name
## 1 Montgomery Burns
## 2 Homer Simpson
## 3 Timothy Lovejoy
## 4 Julius Hibbert
## 5 Moe Szyslak
## 6 Ned Flanders
# Titles were already captured in tc.title while cleaning the title-prefixed
# names (see 3.1 above); label the column and display them.
colnames(tc.title) <- c("Title")
tc.title
## Title
## [1,] "Rev."
## [2,] "Dr."
# Entries containing an initial: a single letter at a word boundary that is
# immediately followed by a period.
ic <- initial.case <- str_subset(
  string = name.data,
  pattern = "\\b[:alpha:]{1}\\."
)
# Extract just the initial (letter plus period) from those entries.
ic.initial <- matrix(
  unlist(str_extract_all(string = ic, pattern = "\\b[:alpha:]{1}\\."))
)
colnames(ic.initial) <- c("Initial")
ic.initial
## Initial
## [1,] "C."
One or more digits followed by a literal dollar sign (the backslash escapes the `$`, so it is not the end-of-string anchor).
# Positive: a long run of digits directly followed by "$".
test4.1positive1 <- "06793028$"
str_detect(string = test4.1positive1, pattern = "[0-9]+\\$")
## [1] TRUE
# Positive: a short run of digits directly followed by "$".
test4.1positive2 <- "067$"
str_detect(string = test4.1positive2, pattern = "[0-9]+\\$")
## [1] TRUE
# Negative: digits only, no "$" after them.
test4.1negative1 <- "06793028"
str_detect(string = test4.1negative1, pattern = "[0-9]+\\$")
## [1] FALSE
One to four lowercase letters comprising a whole word, i.e. bounded by a word boundary on each side (in the tests below the word is the entirety of the string).
# Positive: a two-letter lowercase word.
test4.2positive1 <- "ab"
str_detect(string = test4.2positive1, pattern = "\\b[a-z]{1,4}\\b")
## [1] TRUE
# Positive: a four-letter lowercase word (the upper limit).
test4.2positive2 <- "acbz"
str_detect(string = test4.2positive2, pattern = "\\b[a-z]{1,4}\\b")
## [1] TRUE
# Negative: an uppercase letter is outside [a-z].
test4.2negative1 <- "A"
str_detect(string = test4.2negative1, pattern = "\\b[a-z]{1,4}\\b")
## [1] FALSE
# Negative: six letters in a row leave no word boundary within four letters.
test4.2negative2 <- "abcxyz"
str_detect(string = test4.2negative2, pattern = "\\b[a-z]{1,4}\\b")
## [1] FALSE
# Pattern under test: any (possibly empty) text followed by a literal ".txt"
# at the very end of the string.
# NOTE(review): these variables reuse the test4.2* names from the previous
# exercise and overwrite them.
# Positive: an ordinary file name ending in ".txt".
test4.2positive1 <- "applesauce.txt"
str_detect(string = test4.2positive1, pattern = ".*?\\.txt$")
## [1] TRUE
# Positive: nothing at all in front of the extension.
test4.2positive2 <- ".txt"
str_detect(string = test4.2positive2, pattern = ".*?\\.txt$")
## [1] TRUE
# Negative: ".txt" is present but is not the end of the string.
test4.2negative1 <- "applesauce.txt.rmd"
str_detect(string = test4.2negative1, pattern = ".*?\\.txt$")
## [1] FALSE
# Negative: the match is case-sensitive.
test4.2negative2 <- ".TXT"
str_detect(string = test4.2negative2, pattern = ".*?\\.txt$")
## [1] FALSE
Two digits followed by a forward slash, then two more digits followed by another forward slash, then four digits. (Like mm/dd/yyyy, but without maxima for months and days)
# Positive: a well-formed date with forward slashes.
test4.2positive1 <- c("12/07/1941")
str_detect(test4.2positive1, "\\d{2}/\\d{2}/\\d{4}")
## [1] TRUE
# Positive: the pattern checks only the digit counts, not calendar validity.
test4.2positive2 <- c("20/50/0000")
str_detect(test4.2positive2, "\\d{2}/\\d{2}/\\d{4}")
## [1] TRUE
# Negative: backslashes instead of forward slashes. The backslashes are
# doubled so the string really contains them; written singly, R reads "\31"
# and "\1" as octal character escapes and the date is silently mangled.
test4.2negative1 <- c("12\\31\\1999")
str_detect(test4.2negative1, "\\d{2}/\\d{2}/\\d{4}")
## [1] FALSE
# Negative: one-digit day and two-digit year do not satisfy the digit counts.
test4.2negative2 <- c("12/7/41")
str_detect(test4.2negative2, "\\d{2}/\\d{2}/\\d{4}")
## [1] FALSE
One or more characters (captured as a group), preceded by a left-angle bracket and followed by a right-angle bracket. Optionally followed by one character. Then a left-angle bracket and a forward slash, the same characters again (a backreference to the captured group), and a right-angle bracket. (Like a matching pair of opening and closing HTML/XML tags with at most one character between them.)
# Positive: matching opening and closing tags with one character between.
test4.2positive1 <- "<hidden agenda>-</hidden agenda>"
str_detect(string = test4.2positive1, pattern = "<(.+?)>.?</\\1>")
## [1] TRUE
# Positive: the tag name is a single space and nothing sits between the tags.
test4.2positive2 <- "< ></ >"
str_detect(string = test4.2positive2, pattern = "<(.+?)>.?</\\1>")
## [1] TRUE
# Negative: the closing tag's text differs from the opening tag's text.
test4.2negative1 <- "<hidden agenda></hiddenagenda>"
str_detect(string = test4.2negative1, pattern = "<(.+?)>.?</\\1>")
## [1] FALSE