R> name
[1] "Moe Szyslak" "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
[4] "Ned Flanders" "Simpson, Homer" "Dr. Julius Hibbert"
library(stringr)
# A difficult example: names and phone numbers fused into a single string
raw.data <- "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5543642Dr. Julius Hibbert"
# Pull out every run of two or more letters, periods, commas or blanks.
# raw.data is one string, so the first (only) list element holds all matches.
name <- str_extract_all(raw.data, "[[:alpha:]., ]{2,}")[[1]]
name
## [1] "Moe Szyslak" "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders" "Simpson, Homer" "Dr. Julius Hibbert"
We first remove any titles and initials from the names. A few names are written last name first, followed by a comma. We therefore extract the first names and the last names separately, strip any padding whitespace, and then paste the two vectors together into a single character vector of names in "firstname lastname" order, separated by a space.
# Remove the titles Dr. / Rev. and the middle initial of "Burns, C. Montgomery":
# every run of letters immediately followed by a period is dropped.
# str_replace_all() is used so a name carrying both a title and an initial
# would lose both, not just the first one.
name.notitle <- str_replace_all(name, pattern = "[[:alpha:]]{1,}[.]", replacement = "")
# Extract the first name.
# "Last, First" form: the word after the comma; otherwise: the first word
# that is followed by a blank.
firstname <- ifelse(
  str_detect(name.notitle, ","),
  str_extract(name.notitle, ", +[[:alpha:]]{2,}"),
  str_extract(name.notitle, " ?[[:alpha:]]{2,} ")
)
firstname <- str_replace(firstname, pattern = ", +", replacement = "")
firstname <- str_trim(firstname)
# Extract the last name.
# "Last, First" form: the word before the comma; otherwise: the final word.
lastname <- ifelse(
  str_detect(name.notitle, ","),
  str_extract(name.notitle, "[[:alpha:]]{2,}, +"),
  str_extract(name.notitle, "\\s[[:alpha:]]{2,}$")
)
# Drop the comma and ALL surrounding whitespace. Replacing only the first
# match of ",| +" removed the comma but left the blanks behind it, which
# produced names such as "Montgomery Burns " with trailing spaces.
lastname <- str_trim(str_replace_all(lastname, pattern = ",", replacement = ""))
# Concatenate first and last name to form the ordered names vector
orderedname <- paste(firstname, lastname, sep = " ")
# Print the ordered names
orderedname
## [1] "Moe Szyslak" "Montgomery Burns" "Timothy Lovejoy"
## [4] "Ned Flanders" "Homer Simpson" "Julius Hibbert"
We match the titles that are likely to appear in the name vector, which yields a logical vector indicating which names carry a title. Only Rev. Timothy Lovejoy and Dr. Julius Hibbert return TRUE.
# Flag the names that carry a title. The alternation is grouped so that the
# trailing period is required for both alternatives; in "Rev|Dr[.]" the period
# belongs only to "Dr", so any name merely containing "Rev" would be flagged.
name.hastitle <- str_detect(name, pattern = "(Rev|Dr)[.]")
name.hastitle
## [1] FALSE FALSE TRUE FALSE FALSE TRUE
We start by removing only the titles from the name vector and then count the words in each string. Any string that contains more than a first name and a last name, i.e. more than 2 words, is one that has a second name.
Only Burns, C. Montgomery has a second name (the initial C.) and hence returns TRUE.
# Strip the titles together with their trailing period. The alternation is
# grouped so that "Rev." is removed whole instead of leaving a stray "." behind.
name.notitle <- str_replace(name, pattern = "(Rev|Dr)[.]", replacement = "")
# A name has a second name when more than two words remain. The count is
# compared numerically: matching the digit "3" in the count coerced to text
# would miss names with four or more words and would match counts like 13.
hassecondname <- str_count(name.notitle, "\\w+") > 2
# Print the logical vector that marks the names having a second name
hassecondname
## [1] FALSE TRUE FALSE FALSE FALSE FALSE
This pattern matches one or more consecutive digits immediately followed by a literal $ sign. The match may occur at the start, at the end, or in the middle of the string.
# Example pattern: [0-9]+\\$ (one or more digits, then a literal dollar sign)
digitstring1 <- str_extract(
  c("7812098$", "kjj989898ww$", "$243464", "9090$09090"),
  "[0-9]+\\$"
)
digitstring1
## [1] "7812098$" NA NA "9090$"
# Same pattern on a second set of inputs
digitstring2 <- str_extract(
  c("7812098$", "kjj989898ww$", "$243464"),
  "[0-9]+\\$"
)
digitstring2
## [1] "7812098$" NA NA
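This pattern fetches a standalone word made of at least 1 and at most 4 lowercase letters. \\b is a word boundary, so the word may be delimited by blanks, by punctuation, or by the start or end of the string, and the delimiters themselves are not part of the match.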
# Words of one to four lowercase letters, delimited by word boundaries
evalstring1 <- str_extract(
  c(" 7812098$", " k ", " ASDFG ", " asdfg ", " asdf "),
  "\\b[a-z]{1,4}\\b"
)
evalstring1
## [1] NA "k" NA NA "asdf"
This pattern fetches strings that end with ‘.txt’
# Strings whose final characters are ".txt" (the period is matched literally)
evalstring2 <- str_extract(
  c(" 7812098$", " kjj98989txt$", " ASDFG.txt ", " asdfg.txt", " asdf "),
  ".*?\\.txt$"
)
evalstring2
## [1] NA NA NA " asdfg.txt" NA
This pattern fetches a string of digits in a date-like format: exactly 2 digits, a forward slash, exactly 2 digits, another forward slash, and then exactly 4 digits. It does not validate the ranges of the digits, so values such as 99/99/9999 also match.
# Test inputs for the date-like pattern. The fourth element is meant to hold a
# literal backslash in place of the second slash, so it is written "\\";
# spelled as "\2010", R parses "\201" as an octal escape and embeds the
# byte 0x81 in the string instead of a backslash.
evalstring3 <- c("781/1/121", "10/10/2010", "12/2/2010 ", "12/12\\2010", " asdf ", "2/12/2010 ")
# Two digits, slash, two digits, slash, four digits
evalstring3 <- str_extract(evalstring3, "\\d{2}/\\d{2}/\\d{4}")
evalstring3
## [1] NA "10/10/2010" NA NA NA
## [6] NA
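This pattern matches a well-formed markup element: an opening tag, at least one character of content, and a closing tag whose name is identical to the text captured from the opening tag (enforced by the backreference \\1).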
# Opening tag, some content, and a closing tag repeating the captured tag text
evalstring4 <- str_extract(
  c("<tag> hello/>", "<tag>capture</tag>", "<tag tagme /tag>"),
  "<(.+?)>.+?</\\1>"
)
evalstring4
## [1] NA "<tag>capture</tag>" NA
clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0Tanwo Uwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigO d6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5 fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr
Observing the string, we find that some of the characters in between are capitalized. To check whether they form a hidden, meaningful message, we extract all the uppercase characters.
# The scrambled text, kept as one string (line breaks included)
message <- "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0Tanwo
Uwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigO
d6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5
fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr"
# Keep only the uppercase letters; the result is a list with one element
message <- str_extract_all(message, "[[:upper:]]")
# Extracted hidden message
message
## [[1]]
## [1] "C" "O" "N" "G" "R" "A" "T" "U" "L" "A" "T" "I" "O" "N" "S" "Y" "O"
## [18] "U" "A" "R" "E" "A" "S" "U" "P" "E" "R" "N" "E" "R" "D"