In the example below we extract the names and phone numbers from the string. Let's break the problem down into its components. [:alpha:] - This is the alpha class, which covers all alphabetic characters, both lower and upper case. [[:alpha:]., ] - This states that we want the entire character class plus periods, commas and spaces. [[:alpha:]., ]{2,} - This adds a quantifier so that the contents of the character class must be matched at least twice in a row. \(?(\d{3})?\)? - This is how we gather the three-digit area code: \( and \) match literal parentheses, \d{3} means we are looking for a three-digit combination, and each ? marks the preceding piece as optional, since not every phone number has an area code or parentheses around it. (-| )? - This part looks for a dash or a space after the area code; since not all phone numbers contain one, the question mark makes it optional as well. \d{3} - This piece represents the 3 required digits that come after the area code. (-| )? - Once again this represents a dash or space that may or may not come before the last group of digits; note that it allows at most one separator character, so a number written with both a space and a dash (such as "555 -6542") is not matched. \d{4} - This represents the required last 4 digits. \\ - Inside an R string every backslash of the regular expression has to be doubled (for example "\\d{3}"), because the backslash is also R's own escape character; it is str_extract_all() that returns every match, which we need because there are multiple phone numbers in the string.
library(stringr)
# Raw text in which phone numbers and names are run together without any
# delimiter between the records.
raw.data <- "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555 -6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"

# Names: runs of two or more letters, periods, commas or spaces.
name <- unlist(str_extract_all(raw.data, "[[:alpha:]., ]{2,}"))

# Phone numbers. The pattern reads left to right as:
#   (\\(?\\d{3}\\)?[- ]*)?  optional area code, optionally parenthesised,
#                           followed by any run of dashes/spaces
#   \\d{3}                  three required digits
#   [- ]*                   any run of dashes/spaces; a single optional
#                           separator would miss "555 -6542", which has both
#   \\d{4}                  four required digits
phone <- unlist(str_extract_all(
  raw.data,
  "(\\(?\\d{3}\\)?[- ]*)?\\d{3}[- ]*\\d{4}"
))
name
## [1] "Moe Szyslak" "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders" "Simpson, Homer" "Dr. Julius Hibbert"
phone
## [1] "555-1239" "(636) 555-0113" "555 -6542" "555 8904"
## [5] "636-555-3226" "5553642"
"[.,] [[:alpha:]]{2,}$|[[:alpha:]]{2,} " - This states that a first name is either the last word of the string when that word is preceded by a period or comma and a space, or a word of two or more letters that is followed by a space. #Last Name
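"[[:alpha:]]{2,}\\." - This matches two or more letters immediately followed by a literal period (the period is escaped so that it is not read as "any character"), i.e. an abbreviated title such as "Rev." or "Dr.".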
# Pass 1: grab every first name together with the delimiter that marks it.
#   "[.,] <letters>$"  the last word, when preceded by ". " or ", "
#   "<letters> "       a word of two or more letters followed by a space
firstNames_1 <- unlist(
  str_extract_all(
    string = name,
    pattern = "[.,] [[:alpha:]]{2,}$|[[:alpha:]]{2,} "
  )
)
# Pass 2: drop the delimiters and keep only the run of letters.
firstNames <- unlist(
  str_extract_all(string = firstNames_1, pattern = "[[:alpha:]]{2,}")
)
firstNames
## [1] "Moe" "Montgomery" "Timothy" "Ned" "Homer"
## [6] "Julius"
# Pass 1: grab every last name together with the context that identifies it.
#   "[^.,] <letters>$"  the last word, when the character before the space is
#                       neither a period nor a comma ("First Last" layout)
#   "<letters>, "       a word directly followed by a comma ("Last, First")
# The negated set is written as a plain [^.,]; the nested form [^[.,]] relies
# on the regex engine supporting set nesting and is easy to misread.
lastNames_1 <- unlist(
  str_extract_all(name, "[^.,] [[:alpha:]]{2,}$|[[:alpha:]]{2,}, ")
)
# Pass 2: strip the context characters, keeping only the run of letters.
lastNames <- unlist(str_extract_all(lastNames_1, "[[:alpha:]]{2,}"))
lastNames
## [1] "Szyslak" "Burns" "Lovejoy" "Flanders" "Simpson" "Hibbert"
# An abbreviated title is two or more letters directly followed by a period.
titles <- unlist(
  str_extract_all(string = name, pattern = "[[:alpha:]]{2,}\\.")
)
titles
## [1] "Rev." "Dr."
# Same pattern, but reporting per name whether a title is present.
logic_title <- str_detect(string = name, pattern = "[[:alpha:]]{2,}\\.")
logic_title
## [1] FALSE FALSE TRUE FALSE FALSE TRUE
# Flags names that contain a second-name initial: a single letter that comes
# after a space and is followed either by a period (" C.") or by another
# space (" C "). The earlier pattern also carried a middle alternative with a
# missing opening bracket (":alpha:]{1}"), which could only match that literal
# text and therefore never contributed a match; it has been dropped.
logicalSecondName <- str_detect(name, " [[:alpha:]][.]| [[:alpha:]] ")
logicalSecondName
## [1] FALSE TRUE FALSE FALSE FALSE FALSE
# Scrambled text that is probed below with one POSIX character class at a time.
secret_msg <- "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0TanwoUwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigOd6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr"

# Every extraction keeps the characters of one class plus the literal ".",
# "!" and " " that are listed next to the class inside the bracket set.
digit <- unlist(str_extract_all(string = secret_msg, pattern = "[[:digit:].! ]"))
lower <- unlist(str_extract_all(string = secret_msg, pattern = "[[:lower:].! ]"))
upper <- unlist(str_extract_all(string = secret_msg, pattern = "[[:upper:].! ]"))
alpha <- unlist(str_extract_all(string = secret_msg, pattern = "[[:alpha:].! ]"))
alnum <- unlist(str_extract_all(string = secret_msg, pattern = "[[:alnum:].! ]"))
punct <- unlist(str_extract_all(string = secret_msg, pattern = "[[:punct:].! ]"))
graph <- unlist(str_extract_all(string = secret_msg, pattern = "[[:graph:].! ]"))
blank <- unlist(str_extract_all(string = secret_msg, pattern = "[[:blank:].! ]"))
space <- unlist(str_extract_all(string = secret_msg, pattern = "[[:space:].! ]"))
# NOTE: this character vector shares its name with base::print(); calls such
# as print(x) still resolve to the function, but the name is easy to confuse.
print <- unlist(str_extract_all(string = secret_msg, pattern = "[[:print:].! ]"))
# Auto-print of `digit`. The "." and "!" entries are present because the
# extraction set lists them literally next to [:digit:].
digit
## [1] "1" "0" "8" "7" "7" "9" "2" "8" "5" "5" "0" "7" "8" "0" "3" "5" "3"
## [18] "0" "7" "5" "5" "3" "3" "6" "4" "." "1" "1" "6" "2" "." "2" "4" "9"
## [35] "0" "5" "." "." "6" "5" "1" "7" "2" "4" "6" "3" "9" "5" "8" "9" "6"
## [52] "5" "9" "4" "9" "0" "5" "4" "5" "!"
# Auto-print of `lower`: the lower-case letters plus the literal "." and "!"
# that the extraction set also lists.
lower
## [1] "c" "l" "c" "o" "p" "o" "w" "z" "m" "s" "t" "c" "d" "w" "n" "k" "i"
## [18] "g" "v" "d" "i" "c" "p" "u" "g" "g" "v" "h" "r" "y" "n" "j" "u" "w"
## [35] "c" "z" "i" "h" "q" "r" "f" "p" "x" "s" "j" "d" "w" "p" "n" "a" "n"
## [52] "w" "o" "w" "i" "s" "d" "i" "j" "j" "k" "p" "f" "d" "r" "c" "o" "c"
## [69] "b" "t" "y" "c" "z" "j" "a" "t" "a" "o" "o" "t" "j" "t" "j" "n" "e"
## [86] "c" "f" "e" "k" "." "r" "w" "w" "w" "o" "j" "i" "g" "d" "v" "r" "f"
## [103] "r" "b" "z" "." "b" "k" "n" "b" "h" "z" "g" "v" "i" "z" "c" "r" "o"
## [120] "p" "." "w" "g" "n" "b" "." "q" "o" "f" "a" "o" "t" "f" "b" "w" "m"
## [137] "k" "t" "s" "z" "q" "e" "f" "y" "n" "d" "t" "k" "c" "f" "g" "m" "c"
## [154] "g" "x" "o" "n" "h" "k" "!" "g" "r"
# Auto-print of `upper`: the upper-case letters plus the literal "." and "!".
# Read in order, these characters form the hidden message.
upper
## [1] "C" "O" "N" "G" "R" "A" "T" "U" "L" "A" "T" "I" "O" "N" "S" "." "Y"
## [18] "O" "U" "." "A" "R" "E" "." "A" "." "S" "U" "P" "E" "R" "N" "E" "R"
## [35] "D" "!"
# Auto-print of `alpha`: letters of either case plus the literal "." and "!"
# that the extraction set also lists; digits are left out.
alpha
## [1] "c" "l" "c" "o" "p" "C" "o" "w" "z" "m" "s" "t" "c" "d" "w" "n" "k"
## [18] "i" "g" "O" "v" "d" "i" "c" "p" "N" "u" "g" "g" "v" "h" "r" "y" "n"
## [35] "G" "j" "u" "w" "c" "z" "i" "h" "q" "r" "f" "p" "R" "x" "s" "A" "j"
## [52] "d" "w" "p" "n" "T" "a" "n" "w" "o" "U" "w" "i" "s" "d" "i" "j" "L"
## [69] "j" "k" "p" "f" "A" "T" "I" "d" "r" "c" "o" "c" "b" "t" "y" "c" "z"
## [86] "j" "a" "t" "O" "a" "o" "o" "t" "j" "t" "N" "j" "n" "e" "c" "S" "f"
## [103] "e" "k" "." "r" "w" "Y" "w" "w" "o" "j" "i" "g" "O" "d" "v" "r" "f"
## [120] "U" "r" "b" "z" "." "b" "k" "A" "n" "b" "h" "z" "g" "v" "R" "i" "z"
## [137] "E" "c" "r" "o" "p" "." "w" "A" "g" "n" "b" "." "S" "q" "o" "U" "f"
## [154] "P" "a" "o" "t" "f" "b" "w" "E" "m" "k" "t" "s" "R" "z" "q" "e" "f"
## [171] "y" "n" "N" "d" "t" "k" "c" "f" "E" "g" "m" "c" "R" "g" "x" "o" "n"
## [188] "h" "D" "k" "!" "g" "r"
# Auto-print of `alnum`: letters and digits plus the literal "." and "!"
# that the extraction set also lists.
alnum
## [1] "c" "l" "c" "o" "p" "C" "o" "w" "1" "z" "m" "s" "t" "c" "0" "d" "8"
## [18] "7" "w" "n" "k" "i" "g" "7" "O" "v" "d" "i" "c" "p" "N" "u" "g" "g"
## [35] "v" "h" "r" "y" "n" "9" "2" "G" "j" "u" "w" "c" "z" "i" "8" "h" "q"
## [52] "r" "f" "p" "R" "x" "s" "5" "A" "j" "5" "d" "w" "p" "n" "0" "T" "a"
## [69] "n" "w" "o" "U" "w" "i" "s" "d" "i" "j" "7" "L" "j" "8" "k" "p" "f"
## [86] "0" "3" "A" "T" "5" "I" "d" "r" "3" "c" "o" "c" "0" "b" "t" "7" "y"
## [103] "c" "z" "j" "a" "t" "O" "a" "o" "o" "t" "j" "5" "5" "t" "3" "N" "j"
## [120] "3" "n" "e" "6" "c" "4" "S" "f" "e" "k" "." "r" "1" "w" "1" "Y" "w"
## [137] "w" "o" "j" "i" "g" "O" "d" "6" "v" "r" "f" "U" "r" "b" "z" "2" "."
## [154] "2" "b" "k" "A" "n" "b" "h" "z" "g" "v" "4" "R" "9" "i" "0" "5" "z"
## [171] "E" "c" "r" "o" "p" "." "w" "A" "g" "n" "b" "." "S" "q" "o" "U" "6"
## [188] "5" "f" "P" "a" "1" "o" "t" "f" "b" "7" "w" "E" "m" "2" "4" "k" "6"
## [205] "t" "3" "s" "R" "9" "z" "q" "e" "5" "f" "y" "8" "9" "n" "6" "N" "d"
## [222] "5" "t" "9" "k" "c" "4" "f" "E" "9" "0" "5" "g" "m" "c" "4" "R" "g"
## [239] "x" "o" "5" "n" "h" "D" "k" "!" "g" "r"
# Auto-print of `punct`: the only characters returned are the four periods
# and the exclamation mark.
punct
## [1] "." "." "." "." "!"
# Auto-print of `graph`: every character matched by [:graph:] or by the
# literal ".", "!" and " " listed in the extraction set.
graph
## [1] "c" "l" "c" "o" "p" "C" "o" "w" "1" "z" "m" "s" "t" "c" "0" "d" "8"
## [18] "7" "w" "n" "k" "i" "g" "7" "O" "v" "d" "i" "c" "p" "N" "u" "g" "g"
## [35] "v" "h" "r" "y" "n" "9" "2" "G" "j" "u" "w" "c" "z" "i" "8" "h" "q"
## [52] "r" "f" "p" "R" "x" "s" "5" "A" "j" "5" "d" "w" "p" "n" "0" "T" "a"
## [69] "n" "w" "o" "U" "w" "i" "s" "d" "i" "j" "7" "L" "j" "8" "k" "p" "f"
## [86] "0" "3" "A" "T" "5" "I" "d" "r" "3" "c" "o" "c" "0" "b" "t" "7" "y"
## [103] "c" "z" "j" "a" "t" "O" "a" "o" "o" "t" "j" "5" "5" "t" "3" "N" "j"
## [120] "3" "n" "e" "6" "c" "4" "S" "f" "e" "k" "." "r" "1" "w" "1" "Y" "w"
## [137] "w" "o" "j" "i" "g" "O" "d" "6" "v" "r" "f" "U" "r" "b" "z" "2" "."
## [154] "2" "b" "k" "A" "n" "b" "h" "z" "g" "v" "4" "R" "9" "i" "0" "5" "z"
## [171] "E" "c" "r" "o" "p" "." "w" "A" "g" "n" "b" "." "S" "q" "o" "U" "6"
## [188] "5" "f" "P" "a" "1" "o" "t" "f" "b" "7" "w" "E" "m" "2" "4" "k" "6"
## [205] "t" "3" "s" "R" "9" "z" "q" "e" "5" "f" "y" "8" "9" "n" "6" "N" "d"
## [222] "5" "t" "9" "k" "c" "4" "f" "E" "9" "0" "5" "g" "m" "c" "4" "R" "g"
## [239] "x" "o" "5" "n" "h" "D" "k" "!" "g" "r"
# Auto-print of `blank`: no blank characters were returned; the five results
# are the literal "." and "!" that the extraction set also lists.
blank
## [1] "." "." "." "." "!"
# Auto-print of `space`: no whitespace characters were returned; the five
# results are the literal "." and "!" that the extraction set also lists.
space
## [1] "." "." "." "." "!"
# Auto-print of the character vector named `print` (not a call to the
# print() function): every character matched by [:print:] or by the literal
# ".", "!" and " " listed in the extraction set.
print
## [1] "c" "l" "c" "o" "p" "C" "o" "w" "1" "z" "m" "s" "t" "c" "0" "d" "8"
## [18] "7" "w" "n" "k" "i" "g" "7" "O" "v" "d" "i" "c" "p" "N" "u" "g" "g"
## [35] "v" "h" "r" "y" "n" "9" "2" "G" "j" "u" "w" "c" "z" "i" "8" "h" "q"
## [52] "r" "f" "p" "R" "x" "s" "5" "A" "j" "5" "d" "w" "p" "n" "0" "T" "a"
## [69] "n" "w" "o" "U" "w" "i" "s" "d" "i" "j" "7" "L" "j" "8" "k" "p" "f"
## [86] "0" "3" "A" "T" "5" "I" "d" "r" "3" "c" "o" "c" "0" "b" "t" "7" "y"
## [103] "c" "z" "j" "a" "t" "O" "a" "o" "o" "t" "j" "5" "5" "t" "3" "N" "j"
## [120] "3" "n" "e" "6" "c" "4" "S" "f" "e" "k" "." "r" "1" "w" "1" "Y" "w"
## [137] "w" "o" "j" "i" "g" "O" "d" "6" "v" "r" "f" "U" "r" "b" "z" "2" "."
## [154] "2" "b" "k" "A" "n" "b" "h" "z" "g" "v" "4" "R" "9" "i" "0" "5" "z"
## [171] "E" "c" "r" "o" "p" "." "w" "A" "g" "n" "b" "." "S" "q" "o" "U" "6"
## [188] "5" "f" "P" "a" "1" "o" "t" "f" "b" "7" "w" "E" "m" "2" "4" "k" "6"
## [205] "t" "3" "s" "R" "9" "z" "q" "e" "5" "f" "y" "8" "9" "n" "6" "N" "d"
## [222] "5" "t" "9" "k" "c" "4" "f" "E" "9" "0" "5" "g" "m" "c" "4" "R" "g"
## [239] "x" "o" "5" "n" "h" "D" "k" "!" "g" "r"
# Join the extracted upper-case letters and punctuation marks, in order,
# into a single string with nothing between them.
shortened <- paste0(upper, collapse = "")
shortened
## [1] "CONGRATULATIONS.YOU.ARE.A.SUPERNERD!"