Load stringr package and the data
library(stringr)
# Raw text in which phone numbers and names are run together with no
# separators; it is the only input used by the name-extraction steps.
raw.data <-"555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"
Extract the names from the raw data
# Pull out every run of two or more letters, commas, periods or spaces; the
# digits, hyphens and parentheses of the phone numbers act as separators.
# raw.data is a single string, so the first (only) list element holds all hits.
name <- str_extract_all(raw.data, pattern = "[[:alpha:],. ]{2,}")[[1]]
name
## [1] "Moe Szyslak" "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders" "Simpson, Homer" "Dr. Julius Hibbert"
Step 1. Remove any words end with a period. This will remove all titles and second names
# Delete each "word + period + space" token (titles such as "Rev. " and
# initials such as "C. ") by replacing it with the empty string.
name1 <- str_replace_all(string = name, pattern = "\\w+\\. ", replacement = "")
name1
## [1] "Moe Szyslak" "Burns, Montgomery" "Timothy Lovejoy"
## [4] "Ned Flanders" "Simpson, Homer" "Julius Hibbert"
Step 2. Create a function that split a name with comma and flip it so it’s first_name first
# Flip names written as "Last, First" into "First Last" order.
#
# Args:
#   string: Character vector of names.
#
# Returns:
#   A character vector of the same length as `string`. Elements containing
#   ", " are rebuilt as "<second piece> <first piece>"; elements without ", "
#   (including NA) are returned unchanged.
correctName <- function(string) {
  # NA entries are treated as "nothing to flip" rather than propagating NA
  # into the logical index.
  has_comma <- !is.na(string) & str_detect(string, pattern = ", ")
  pieces <- str_split(string[has_comma], pattern = ", ")
  # Only the first two pieces are used, matching the single-name behaviour
  # this function has always had.
  string[has_comma] <- vapply(
    pieces,
    function(p) str_c(p[2], p[1], sep = " "),
    character(1)
  )
  string
}
Step 3. Use a for-loop to go thru all the names applying the function.
# Flip only the entries written as "Last, First". which() yields just the
# matching positions, so an empty vector or NA entries are skipped; iterating
# over 1:length(name1) would visit 1 and 0 for an empty vector and stop with
# an error on NA.
for (i in which(str_detect(name1, pattern = ", "))) {
  name1[i] <- correctName(name1[i])
}
name1
## [1] "Moe Szyslak" "Montgomery Burns" "Timothy Lovejoy"
## [4] "Ned Flanders" "Homer Simpson" "Julius Hibbert"
# Flag the names that carry a title. The periods are escaped so they match a
# literal "." only; unescaped, "Dr." would also match e.g. "Dre" in "Drew".
title <- str_detect(name, pattern = "Rev\\.|Dr\\.")
data.frame(name, title)
## name title
## 1 Moe Szyslak FALSE
## 2 Burns, C. Montgomery FALSE
## 3 Rev. Timothy Lovejoy TRUE
## 4 Ned Flanders FALSE
## 5 Simpson, Homer FALSE
## 6 Dr. Julius Hibbert TRUE
Alternatively, below code will catch other titles as well, detecting pattern of words with 2 to 3 letters ending with a period.
# \b pins the match to the start of a word, so only words that are 2-3
# characters long in total qualify; without it the last 2-3 characters of any
# longer word followed by ". " would match too.
str_detect(name, pattern = "\\b\\w{2,3}\\. ")
## [1] FALSE FALSE TRUE FALSE FALSE TRUE
# A second name shows up as a single-character initial followed by a period.
# \b marks the start of the initial and ( |$) its end, so the initial is also
# found at the very beginning or end of a name, not only between two spaces.
second_name <- str_detect(name, pattern = "\\b\\w\\.( |$)")
data.frame(name, second_name)
## name second_name
## 1 Moe Szyslak FALSE
## 2 Burns, C. Montgomery TRUE
## 3 Rev. Timothy Lovejoy FALSE
## 4 Ned Flanders FALSE
## 5 Simpson, Homer FALSE
## 6 Dr. Julius Hibbert FALSE
Numbers 0 thru 9 match one or more times ending with dollar sign $
# One or more digits followed by an escaped, i.e. literal, dollar sign.
example4a <- "90210$"
str_detect(string = example4a, pattern = "[0-9]+\\$")
## [1] TRUE
A word that is 1 to 4 lower case English letters in length
# \b on both sides restricts the match to a whole word of 1-4 lower-case
# letters.
example4b <- "test"
str_detect(string = example4b, pattern = "\\b[a-z]{1,4}\\b")
## [1] TRUE
A .txt file named with or without any characters.
# The lazy ".*?" may match nothing at all, so a bare ".txt" qualifies as well.
example4c <- c("testing1.txt", ".txt")
str_detect(string = example4c, pattern = ".*?\\.txt$")
## [1] TRUE TRUE
Date of birth formatted mm/dd/yyyy. Althought the query doesn’t have to be for DOB.
# Only the digit layout is checked, so "12/34/5678" is accepted too.
example4d <- c("07/02/2013", "12/34/5678")
str_detect(string = example4d, pattern = "\\d{2}/\\d{2}/\\d{4}")
## [1] TRUE TRUE
This is for HTML code search. A string surrounded by <> brackets, followed by string of any length more than 1, followed by back referencing the same string but added a slash /
# \1 must repeat whatever the first group captured, i.e. the text inside the
# opening brackets.
example4e <- c("<bold> Any string </bold>", "<646718>3</646718>")
str_detect(string = example4e, pattern = "<(.+?)>.+?</\\1>")
## [1] TRUE TRUE
Here’s the string where there’s a message hidden:
# Scrambled string that contains the hidden message.
# NOTE(review): this variable shares its name with base::message(); consider
# whether a more specific name would be clearer.
message <- "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0TanwoUwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigOd6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr"
I noticed there are four types of characters in this string:
My first plan was to remove these types one by one from the string to see what’s the result.
# One character class per type of character in the message: digits,
# lower-case letters, upper-case letters, and the punctuation "." / "!".
func <- c("[0-9]", "[a-z]", "[A-Z]", "[[.]!]")
# Four patterns are supplied for one string, so four results come back, each
# with a single character type deleted.
str_replace_all(string = message, pattern = func, replacement = "")
## [1] "clcopCowzmstcdwnkigOvdicpNuggvhrynGjuwczihqrfpRxsAjdwpnTanwoUwisdijLjkpfATIdrcocbtyczjatOaootjtNjnecSfek.rwYwwojigOdvrfUrbz.bkAnbhzgvRizEcrop.wAgnb.SqoUfPaotfbwEmktsRzqefynNdtkcfEgmcRgxonhDk!gr"
## [2] "C10877ON92G8R5A50TU7L803AT5I307O553N364S.11YO6U2.2A4R905E.A.SU65P17E2463R95896N594E9054R5D!"
## [3] "clcopow1zmstc0d87wnkig7vdicpuggvhryn92juwczi8hqrfpxs5j5dwpn0anwowisdij7j8kpf035dr3coc0bt7yczjataootj55t3j3ne6c4fek.r1w1wwojigd6vrfrbz2.2bknbhzgv49i05zcrop.wgnb.qo65fa1otfb7wm24k6t3s9zqe5fy89n6d5t9kc4f905gmc4gxo5nhk!gr"
## [4] "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0TanwoUwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfekr1w1YwwojigOd6vrfUrbz22bkAnbhzgv4R9i05zEcropwAgnbSqoU65fPa1otfb7wEm24k6t3sR9zqe5fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDkgr"
Didn’t look good.
My second try was to remove combination of two of the four types from the string, then combination of three types. And check to see what’s the result.
Here I use combn function to generate combinations, for example:
# Enumerate every unordered pair of patterns; each column is one pair.
combn(x = func, m = 2)
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] "[0-9]" "[0-9]" "[0-9]" "[a-z]" "[a-z]" "[A-Z]"
## [2,] "[a-z]" "[A-Z]" "[[.]!]" "[A-Z]" "[[.]!]" "[[.]!]"
Here, it generates a matrix, where each column is a combination of the types.
I then wrote a double for-loop and used str_c to combine the types, and removed the combined types from the string message.
# Remove every pair of character types from the message and print each result.
combo <- combn(func, 2)
for (j in seq_len(ncol(combo))) {
  # Join the patterns of this combination with the regex alternation "|".
  # Collapsing the whole column replaces the hand-written inner loop over
  # 2:nrow, which would index out of bounds for single-pattern combinations.
  new_func <- str_c(combo[, j], collapse = "|")
  # Delete everything that matches the combined pattern.
  new_message <- str_replace_all(message, new_func, replacement = "")
  print(c(new_func, new_message))
}
## [1] "[0-9]|[a-z]"
## [2] "CONGRATULATIONS.YOU.ARE.A.SUPERNERD!"
## [1] "[0-9]|[A-Z]"
## [2] "clcopowzmstcdwnkigvdicpuggvhrynjuwczihqrfpxsjdwpnanwowisdijjkpfdrcocbtyczjataootjtjnecfek.rwwwojigdvrfrbz.bknbhzgvizcrop.wgnb.qofaotfbwmktszqefyndtkcfgmcgxonhk!gr"
## [1] "[0-9]|[[.]!]"
## [2] "clcopCowzmstcdwnkigOvdicpNuggvhrynGjuwczihqrfpRxsAjdwpnTanwoUwisdijLjkpfATIdrcocbtyczjatOaootjtNjnecSfekrwYwwojigOdvrfUrbzbkAnbhzgvRizEcropwAgnbSqoUfPaotfbwEmktsRzqefynNdtkcfEgmcRgxonhDkgr"
## [1] "[a-z]|[A-Z]"
## [2] "1087792855078035307553364.1162.24905..651724639589659490545!"
## [1] "[a-z]|[[.]!]"
## [2] "C10877ON92G8R5A50TU7L803AT5I307O553N364S11YO6U22A4R905EASU65P17E2463R95896N594E9054R5D"
## [1] "[A-Z]|[[.]!]"
## [2] "clcopow1zmstc0d87wnkig7vdicpuggvhryn92juwczi8hqrfpxs5j5dwpn0anwowisdij7j8kpf035dr3coc0bt7yczjataootj55t3j3ne6c4fekr1w1wwojigd6vrfrbz22bknbhzgv49i05zcropwgnbqo65fa1otfb7wm24k6t3s9zqe5fy89n6d5t9kc4f905gmc4gxo5nhkgr"
I found the message! After removing the two types “[0-9]|[a-z]”, the message appears to be “Congratulations you are a supernerd”.
I continued on to remove combination of three types, using the same method, just to see what happens.
# Remove every triple of character types from the message and print each
# result.
combo <- combn(func, 3)
for (j in seq_len(ncol(combo))) {
  # Join the patterns of this combination with the regex alternation "|".
  # Collapsing the whole column replaces the hand-written inner loop over
  # 2:nrow, which would index out of bounds for single-pattern combinations.
  new_func <- str_c(combo[, j], collapse = "|")
  # Delete everything that matches the combined pattern.
  new_message <- str_replace_all(message, new_func, replacement = "")
  print(c(new_func, new_message))
}
## [1] "[0-9]|[a-z]|[A-Z]" "....!"
## [1] "[0-9]|[a-z]|[[.]!]" "CONGRATULATIONSYOUAREASUPERNERD"
## [1] "[0-9]|[A-Z]|[[.]!]"
## [2] "clcopowzmstcdwnkigvdicpuggvhrynjuwczihqrfpxsjdwpnanwowisdijjkpfdrcocbtyczjataootjtjnecfekrwwwojigdvrfrbzbknbhzgvizcropwgnbqofaotfbwmktszqefyndtkcfgmcgxonhkgr"
## [1] "[a-z]|[A-Z]|[[.]!]"
## [2] "1087792855078035307553364116224905651724639589659490545"
As you can see, after removing “[0-9]|[a-z]|[[.]!]”, the message still can be seen. This is because the message was hidden as upper case in the strings.