library(stringr)
library(knitr)
Introductory Example
# Raw input: digit sequences (they look like phone numbers) and names run
# together with no delimiter. The literal is split into one fragment per
# "number + name" record purely for readability; paste0() joins the
# fragments back into the single original string.
raw.data <- paste0(
  "555-1239Moe Szyslak",
  "(636) 555-0113Burns, C. Montgomery",
  "555-6542Rev. Timothy Lovejoy",
  "555 8904Ned Flanders",
  "636-555-3226Simpson, Homer",
  "5553642Dr. Julius Hibbert"
)
Extract the names from the raw data. The vector `name` stores the extracted names.
# Pull out every run of two or more letters, periods, commas, or spaces.
# Digits, hyphens, and parentheses are not in the character class, so they
# act as separators between the extracted runs. str_extract_all() returns a
# list with one element per input string; unlist() flattens it to a plain
# character vector.
name <- unlist(
  str_extract_all(string = raw.data, pattern = "[[:alpha:]., ]{2,}")
)
name
## [1] "Moe Szyslak" "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders" "Simpson, Homer" "Dr. Julius Hibbert"
Remove the title or middle initial from each element of the `name` vector by replacing it with an empty string.
# Drop the first one- to three-letter abbreviation that is followed by a
# period and a whitespace character (e.g. "Rev. ", "Dr. ", "C. ").
# str_replace() only touches the first match in each element.
remove_title_middle <- str_replace(
  string = name,
  pattern = "([[:alpha:]]{1,3}\\.\\s)",
  replacement = ""
)
Rearrange the vector so that all elements conform to the standard first_name last_name.
Reverse the first and last name where necessary
# Turn "Last, First" into "First Last": the two capture groups are the
# words on either side of the comma, and the replacement emits them in
# swapped order. Elements without a "word, word" sequence are left as-is.
firstName_lastName <- str_replace(
  string = remove_title_middle,
  pattern = "([[:alpha:]]+), ([[:alpha:]]+)",
  replacement = "\\2 \\1"
)
# Show the original and the reordered names side by side.
kable(
  x = list(data.frame(name, firstName_lastName)),
  caption = "Reorder \"name\" so it conform to the standard first and last name format."
)
|
As we can see, all the names now follow the standard format: first name, then last name.
# TRUE for elements containing two or three letters followed by a period
# and a space (e.g. "Rev. ", "Dr. "). A single-letter initial such as "C. "
# does not satisfy the {2,3} quantifier.
title_char <- str_detect(
  string = name,
  pattern = "[[:alpha:]]{2,3}\\. "
)
# Tabulate each name next to its title flag.
kable(
  x = list(data.frame(name, title_char)),
  caption = "Detect if names in \"name\" has a title."
)
|
# TRUE for elements containing a space followed by at least one letter or
# comma.
# NOTE(review): this holds for any name made of more than one word, so it
# does not single out names with a middle name -- confirm the intent.
second_name <- str_detect(
  string = name,
  pattern = " [[:alpha:],]{1,}"
)
# Tabulate each name next to its second-name flag.
kable(
  x = list(data.frame(name, second_name)),
  caption = "Detect if names in \"name\" has a second name."
)
|
All the names in the `name` vector have a second name; therefore, the resulting table shows TRUE for all the inputs.
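The regular expression `[0-9]+\$` is matched by one or more digits followed by a dollar sign: `[0-9]` matches any single digit, the `+` sign means that the preceding item is matched one or more times, and `\$` is an escaped, literal `$`, so the match must end with a dollar sign.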
Example:
# Sample inputs. The last element is written as the string "27", which is
# exactly what c() produces when a bare 27 is mixed with character values.
str_1 <- c("123$", "abc$", "-1098$", "test$", "27")
# Collect every "digits followed by a literal $" match into one flat
# character vector; inputs without a match contribute nothing.
test_1 <- unlist(
  str_extract_all(string = str_1, pattern = "[0-9]+\\$")
)
test_1
## [1] "123$" "1098$"
To detect if the above regular expression works:
# One logical per input: TRUE when the element contains one or more digits
# immediately followed by a literal dollar sign. This overwrites the
# extraction result previously stored in test_1.
test_1 <- str_detect(
  string = str_1,
  pattern = "[0-9]+\\$"
)
test_1
## [1] TRUE FALSE TRUE FALSE FALSE
The regular expression `\b[a-z]{1,4}\b` is matched by a word made up only of lowercase letters in the range a to z. The quantifier `{1,4}` asks for all instances where such a letter appears at least once, but at most four times in a row, and `\b` indicates the word edges.
# Sample inputs: three short lowercase words and one that is too long.
str_2 <- c("aa", "bzdg", "bnewyork", "jeny")
# Keep only whole words (delimited by \b) made of one to four lowercase
# letters; "bnewyork" has eight letters and therefore yields no match.
test_2 <- unlist(
  str_extract_all(string = str_2, pattern = "\\b[a-z]{1,4}\\b")
)
test_2
## [1] "aa" "bzdg" "jeny"
To detect if the above regular expression works:
# One logical per input: TRUE when the element contains a whole word of one
# to four lowercase letters. This overwrites the extraction result
# previously stored in test_2.
test_2 <- str_detect(
  string = str_2,
  pattern = "\\b[a-z]{1,4}\\b"
)
test_2
## [1] TRUE TRUE FALSE TRUE
This regular expression is matched by any string that ends with `.txt`.
# Sample inputs: two that end in ".txt" and two that do not.
str_3 <- c("aa.txt", "bzdg", "$bnewyork", "sneha.txt")
# ".*?" lazily consumes any leading characters, "\\." is a literal period,
# and "$" anchors the match at the end of the string, so only elements
# ending in ".txt" are returned (in full).
test_3 <- unlist(
  str_extract_all(string = str_3, pattern = ".*?\\.txt$")
)
test_3
## [1] "aa.txt" "sneha.txt"
To detect if the above regular expression works:
# One logical per input: TRUE when the element ends with ".txt".
# The pattern is the same ".txt" expression used for the extraction of
# str_3 (lazy prefix, literal period, end-of-string anchor), so the
# detection result lines up with what was extracted. This overwrites the
# extraction result previously stored in test_3.
test_3 <- str_detect(
  string = str_3,
  pattern = ".*?\\.txt$"
)
test_3
## [1] TRUE FALSE FALSE TRUE
This regular expression is matched by a digit pattern of the form dd/dd/dddd, such as a date.
# Sample inputs: two well-formed dd/dd/dddd values, one with a hyphen in
# place of the first slash, and one unrelated string.
str_4 <- c("12/12/2019", "11-22/2018", "11/22/2018", "sneha.txt")
# Two digits, a slash, two digits, a slash, then four digits.
test_4 <- unlist(
  str_extract_all(string = str_4, pattern = "\\d{2}/\\d{2}/\\d{4}")
)
test_4
## [1] "12/12/2019" "11/22/2018"
To detect if the above regular expression works:
# One logical per input: TRUE when the element contains a dd/dd/dddd digit
# pattern. The pattern is the same expression used for the extraction of
# str_4 (two digits, slash, two digits, slash, four digits), so the
# detection result lines up with what was extracted. This overwrites the
# extraction result previously stored in test_4.
test_4 <- str_detect(
  string = str_4,
  pattern = "\\d{2}/\\d{2}/\\d{4}"
)
test_4
## [1] TRUE FALSE TRUE FALSE
This regular expression is matched by a pattern with matching opening and closing tags, like HTML tags.
# Sample inputs: two with a matching open/close tag pair, one whose second
# tag is not a closing tag, and one with an empty opening tag.
str_5 <- c(
  "<b> qwerty </b>",
  "<h1>priya <h1>",
  "<>test</tag>",
  "<tag> helloworld </tag>"
)
# "<(.+?)>" captures the tag name, ".+?" lazily consumes the content, and
# "</\\1>" requires a closing tag whose name equals the captured one.
test_5 <- unlist(
  str_extract_all(string = str_5, pattern = "<(.+?)>.+?</\\1>")
)
test_5
## [1] "<b> qwerty </b>" "<tag> helloworld </tag>"
To detect if the above regular expression works:
# One logical per input: TRUE when the element contains an opening tag,
# some content, and a closing tag with the same name (enforced by the \\1
# back-reference). This overwrites the extraction result previously stored
# in test_5.
test_5 <- str_detect(
  string = str_5,
  pattern = "<(.+?)>.+?</\\1>"
)
test_5
## [1] TRUE FALSE FALSE TRUE
# Scrambled text: lowercase letters, digits, and punctuation with uppercase
# letters scattered through it.
str_test <- "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0TanwoUwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigOd6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr"
# First attempt: every single lowercase letter, in order of appearance.
str_extract_all(string = str_test, pattern = "[a-z]")
## [[1]]
## [1] "c" "l" "c" "o" "p" "o" "w" "z" "m" "s" "t" "c" "d" "w" "n" "k" "i"
## [18] "g" "v" "d" "i" "c" "p" "u" "g" "g" "v" "h" "r" "y" "n" "j" "u" "w"
## [35] "c" "z" "i" "h" "q" "r" "f" "p" "x" "s" "j" "d" "w" "p" "n" "a" "n"
## [52] "w" "o" "w" "i" "s" "d" "i" "j" "j" "k" "p" "f" "d" "r" "c" "o" "c"
## [69] "b" "t" "y" "c" "z" "j" "a" "t" "a" "o" "o" "t" "j" "t" "j" "n" "e"
## [86] "c" "f" "e" "k" "r" "w" "w" "w" "o" "j" "i" "g" "d" "v" "r" "f" "r"
## [103] "b" "z" "b" "k" "n" "b" "h" "z" "g" "v" "i" "z" "c" "r" "o" "p" "w"
## [120] "g" "n" "b" "q" "o" "f" "a" "o" "t" "f" "b" "w" "m" "k" "t" "s" "z"
## [137] "q" "e" "f" "y" "n" "d" "t" "k" "c" "f" "g" "m" "c" "g" "x" "o" "n"
## [154] "h" "k" "g" "r"
# Second attempt: every single uppercase letter, in order of appearance.
# Read in sequence, these letters spell out the hidden message.
str_extract_all(string = str_test, pattern = "[A-Z]")
## [[1]]
## [1] "C" "O" "N" "G" "R" "A" "T" "U" "L" "A" "T" "I" "O" "N" "S" "Y" "O"
## [18] "U" "A" "R" "E" "A" "S" "U" "P" "E" "R" "N" "E" "R" "D"
The hidden message is revealed: “CONGRATULATIONS YOU ARE A SUPER NERD”.