The purpose of this assignment is to gain experience extracting meaningful segments of text/numbers out of large strings

In the example below we extract the names and phone numbers from the string. Let's break the problem down into its components:

- `[:alpha:]` - the alpha class, which matches all alphabetic characters, both lower and upper case.
- `[[:alpha:]., ]` - a character class that contains the entire alpha class and additionally periods, commas and spaces.
- `[[:alpha:]., ]{2,}` - adds a quantifier so that the character class has to match at least twice in a row.
- `\(?(\d{3})?\)?` - this is how we gather the three-digit area code: `\(` and `\)` are escaped, literal parentheses, `\d{3}` looks for a run of three digits, and each `?` marks its piece as optional, since not every phone number has an area code (or parentheses around it).
- `(-| )?` - looks for a dash or a space after the area code; because not all phone numbers contain one, the question mark makes this piece optional as well.
- `\d{3}` - the three required digits that come after the area code.
- `(-| )?` - once again an optional dash or space, which may or may not come before the last group of digits.
- `\d{4}` - the required last four digits.
- `\\` - inside the R string every backslash is written twice, because R itself consumes one level of escaping before the regular-expression engine sees the pattern. Every match (not just the first) is returned because we call `str_extract_all`, which matters here since the string contains multiple phone numbers.

library(stringr)
# Raw text in which the names and phone numbers are run together.
raw.data <- "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555 -6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"

# A name is a run of at least two letters, periods, commas or spaces.
name <- unlist(str_extract_all(raw.data, "[[:alpha:]., ]{2,}"))

# A phone number is an optional area code (optionally parenthesised), then
# three digits, then four digits. The separators use `[- ]*` rather than a
# single optional `(-| )?` so that "555 -6542", which contains both a space
# and a dash, is extracted as well. The separator that follows the area code
# lives inside the optional area-code group, so a match always starts with
# "(" or a digit and never with a stray space or dash.
phone <- unlist(
  str_extract_all(raw.data, "(\\(?\\d{3}\\)?[- ]*)?\\d{3}[- ]*\\d{4}")
)

name
## [1] "Moe Szyslak"          "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders"         "Simpson, Homer"       "Dr. Julius Hibbert"
phone
## [1] "555-1239"       "(636) 555-0113" "555 -6542"      "555 8904"      
## [5] "636-555-3226"   "5553642"

3

First Name, Last Name, Title, Second Name

First Name

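`[.,] [[:alpha:]]{2,}$|[[:alpha:]]{2,} ` - the first name is either the last word of the string when that word comes right after a period or comma and a space (as in "Simpson, Homer"), or a word of at least two letters that is followed by a space (as in "Moe Szyslak"). A second pass with `[[:alpha:]]{2,}` then strips the surrounding punctuation and spaces.

Last Name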

title

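`[[:alpha:]]{2,}\.` - matches a run of at least two letters that is immediately followed by a literal period, which is how titles such as "Rev." and "Dr." are written. The two-letter minimum keeps a single initial such as "C." from being counted as a title.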

Second Name

# Pass 1: capture each first name together with its neighbouring characters.
# A candidate is either the final word of the string preceded by ". " or ", ",
# or a word of two or more letters that is followed by a space.
firstNames_1 <- unlist(
  str_extract_all(
    string = name,
    pattern = "[.,] [[:alpha:]]{2,}$|[[:alpha:]]{2,} "
  )
)

# Pass 2: keep only the letters of each candidate.
firstNames <- unlist(
  str_extract_all(string = firstNames_1, pattern = "[[:alpha:]]{2,}")
)
firstNames
## [1] "Moe"        "Montgomery" "Timothy"    "Ned"        "Homer"     
## [6] "Julius"
# Pass 1: capture each last name together with its neighbouring characters.
# A candidate is either the final word of the string when the character in
# front of the preceding space is not "." or "," (e.g. "e Szyslak"), or a word
# that is immediately followed by ", " (e.g. "Burns, ").
# The negated class is written as the plain `[^.,]`; the earlier nested form
# `[^[.,]]` depended on the engine's nested-set parsing and read like a typo.
lastNames_1 <- unlist(
  str_extract_all(name, "[^.,] [[:alpha:]]{2,}$|[[:alpha:]]{2,}, ")
)

# Pass 2: keep only runs of two or more letters, which drops the single
# leading character and the trailing ", " picked up in pass 1.
lastNames <- unlist(str_extract_all(lastNames_1, "[[:alpha:]]{2,}"))
lastNames
## [1] "Szyslak"  "Burns"    "Lovejoy"  "Flanders" "Simpson"  "Hibbert"
# A title is a run of two or more letters directly followed by a period.
title_pattern <- "[[:alpha:]]{2,}\\."

# The titles themselves ...
titles <- unlist(str_extract_all(string = name, pattern = title_pattern))
titles
## [1] "Rev." "Dr."

# ... and, per name, whether it carries a title at all.
logic_title <- str_detect(string = name, pattern = title_pattern)
logic_title
## [1] FALSE FALSE  TRUE FALSE FALSE  TRUE
# Flag names that contain a second name given as a single-letter initial:
# a space, exactly one letter, and then either a period (" C.") or another
# space (" C "). The POSIX class is written in its bracketed form
# `[[:alpha:]]`, and the malformed `:alpha:]{1}` alternative, which could only
# ever match that literal text, has been removed.
logicalSecondName <- str_detect(name, " [[:alpha:]]\\.| [[:alpha:]] ")
logicalSecondName
## [1] FALSE  TRUE FALSE FALSE FALSE FALSE

4

At least one digit between 0 and 9 followed by a $

Any optional pattern that starts with any character including space and ends with .txt

Any 2 digits followed by a forward slash then any 2 digits followed by a forward slash and then any 4 digit number

Any pattern that matches a html/xml markup with open and closing tags

# Exercise: one or more digits (0-9) directly followed by a literal "$".
string_1 <- c(
  "01$",
  "580493$",
  "73837$doesthiswork",
  "yesitdoes1$564445",
  "1$234"
)
str_detect(string = string_1, pattern = "[0-9]+\\$")
## [1] TRUE TRUE TRUE TRUE TRUE
# Exercise: a whole word (delimited by word boundaries) made of one to four
# lower-case letters a-z.
string_2 <- c(
  "a",
  "ab",
  "abc",
  "abcd",
  "abcz"
)
str_detect(string = string_2, pattern = "\\b[a-z]{1,4}\\b")
## [1] TRUE TRUE TRUE TRUE TRUE
# Exercise: any (possibly empty) run of characters that ends in ".txt".
string_3 <- c(
  "corey.txt",
  ".txt",
  "dsdbsdbds .txt",
  "555.txt"
)
str_detect(string = string_3, pattern = ".*?\\.txt$")
## [1] TRUE TRUE TRUE TRUE
# Exercise: two digits, a forward slash, two digits, a forward slash, and
# then four digits (for example a dd/mm/yyyy or mm/dd/yyyy date).
string_4 <- c(
  "09/16/2018",
  "04/11/1994"
)
str_detect(string = string_4, pattern = "\\d{2}/\\d{2}/\\d{4}")
## [1] TRUE TRUE
# Exercise: HTML/XML markup with an opening tag, some content, and a closing
# tag whose name repeats the opening tag (via the \1 back-reference).
string_5 <- "<html>R IS FUN</html>"
str_detect(string = string_5, pattern = "<(.+?)>.+?</\\1>")
## [1] TRUE

9

extra credit

# The scrambled text that hides the extra-credit message.
secret_msg <- "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0TanwoUwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigOd6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr"

# Pull the message apart one POSIX character class at a time. Every class is
# widened with a literal ".", "!" and space so that punctuation is kept in
# each result alongside the characters of the class itself.
digit <- unlist(str_extract_all(string = secret_msg, pattern = "[[:digit:].! ]"))
lower <- unlist(str_extract_all(string = secret_msg, pattern = "[[:lower:].! ]"))
upper <- unlist(str_extract_all(string = secret_msg, pattern = "[[:upper:].! ]"))
alpha <- unlist(str_extract_all(string = secret_msg, pattern = "[[:alpha:].! ]"))
alnum <- unlist(str_extract_all(string = secret_msg, pattern = "[[:alnum:].! ]"))
punct <- unlist(str_extract_all(string = secret_msg, pattern = "[[:punct:].! ]"))
graph <- unlist(str_extract_all(string = secret_msg, pattern = "[[:graph:].! ]"))
blank <- unlist(str_extract_all(string = secret_msg, pattern = "[[:blank:].! ]"))
space <- unlist(str_extract_all(string = secret_msg, pattern = "[[:space:].! ]"))
# NOTE: this character vector shares its name with base::print(); calls such
# as print(x) still resolve to the function.
print <- unlist(str_extract_all(string = secret_msg, pattern = "[[:print:].! ]"))

# Echo each extracted vector; the `##` lines are the printed results.

# Digits, plus the "." and "!" characters that were added to the class.
digit
##  [1] "1" "0" "8" "7" "7" "9" "2" "8" "5" "5" "0" "7" "8" "0" "3" "5" "3"
## [18] "0" "7" "5" "5" "3" "3" "6" "4" "." "1" "1" "6" "2" "." "2" "4" "9"
## [35] "0" "5" "." "." "6" "5" "1" "7" "2" "4" "6" "3" "9" "5" "8" "9" "6"
## [52] "5" "9" "4" "9" "0" "5" "4" "5" "!"
# Lower-case letters plus the added punctuation; no readable message here.
lower
##   [1] "c" "l" "c" "o" "p" "o" "w" "z" "m" "s" "t" "c" "d" "w" "n" "k" "i"
##  [18] "g" "v" "d" "i" "c" "p" "u" "g" "g" "v" "h" "r" "y" "n" "j" "u" "w"
##  [35] "c" "z" "i" "h" "q" "r" "f" "p" "x" "s" "j" "d" "w" "p" "n" "a" "n"
##  [52] "w" "o" "w" "i" "s" "d" "i" "j" "j" "k" "p" "f" "d" "r" "c" "o" "c"
##  [69] "b" "t" "y" "c" "z" "j" "a" "t" "a" "o" "o" "t" "j" "t" "j" "n" "e"
##  [86] "c" "f" "e" "k" "." "r" "w" "w" "w" "o" "j" "i" "g" "d" "v" "r" "f"
## [103] "r" "b" "z" "." "b" "k" "n" "b" "h" "z" "g" "v" "i" "z" "c" "r" "o"
## [120] "p" "." "w" "g" "n" "b" "." "q" "o" "f" "a" "o" "t" "f" "b" "w" "m"
## [137] "k" "t" "s" "z" "q" "e" "f" "y" "n" "d" "t" "k" "c" "f" "g" "m" "c"
## [154] "g" "x" "o" "n" "h" "k" "!" "g" "r"
# Upper-case letters plus the added punctuation: read in order, these
# characters spell out the hidden message.
upper
##  [1] "C" "O" "N" "G" "R" "A" "T" "U" "L" "A" "T" "I" "O" "N" "S" "." "Y"
## [18] "O" "U" "." "A" "R" "E" "." "A" "." "S" "U" "P" "E" "R" "N" "E" "R"
## [35] "D" "!"
# All letters (both cases) plus the added punctuation.
alpha
##   [1] "c" "l" "c" "o" "p" "C" "o" "w" "z" "m" "s" "t" "c" "d" "w" "n" "k"
##  [18] "i" "g" "O" "v" "d" "i" "c" "p" "N" "u" "g" "g" "v" "h" "r" "y" "n"
##  [35] "G" "j" "u" "w" "c" "z" "i" "h" "q" "r" "f" "p" "R" "x" "s" "A" "j"
##  [52] "d" "w" "p" "n" "T" "a" "n" "w" "o" "U" "w" "i" "s" "d" "i" "j" "L"
##  [69] "j" "k" "p" "f" "A" "T" "I" "d" "r" "c" "o" "c" "b" "t" "y" "c" "z"
##  [86] "j" "a" "t" "O" "a" "o" "o" "t" "j" "t" "N" "j" "n" "e" "c" "S" "f"
## [103] "e" "k" "." "r" "w" "Y" "w" "w" "o" "j" "i" "g" "O" "d" "v" "r" "f"
## [120] "U" "r" "b" "z" "." "b" "k" "A" "n" "b" "h" "z" "g" "v" "R" "i" "z"
## [137] "E" "c" "r" "o" "p" "." "w" "A" "g" "n" "b" "." "S" "q" "o" "U" "f"
## [154] "P" "a" "o" "t" "f" "b" "w" "E" "m" "k" "t" "s" "R" "z" "q" "e" "f"
## [171] "y" "n" "N" "d" "t" "k" "c" "f" "E" "g" "m" "c" "R" "g" "x" "o" "n"
## [188] "h" "D" "k" "!" "g" "r"
# Letters and digits plus the added punctuation.
alnum
##   [1] "c" "l" "c" "o" "p" "C" "o" "w" "1" "z" "m" "s" "t" "c" "0" "d" "8"
##  [18] "7" "w" "n" "k" "i" "g" "7" "O" "v" "d" "i" "c" "p" "N" "u" "g" "g"
##  [35] "v" "h" "r" "y" "n" "9" "2" "G" "j" "u" "w" "c" "z" "i" "8" "h" "q"
##  [52] "r" "f" "p" "R" "x" "s" "5" "A" "j" "5" "d" "w" "p" "n" "0" "T" "a"
##  [69] "n" "w" "o" "U" "w" "i" "s" "d" "i" "j" "7" "L" "j" "8" "k" "p" "f"
##  [86] "0" "3" "A" "T" "5" "I" "d" "r" "3" "c" "o" "c" "0" "b" "t" "7" "y"
## [103] "c" "z" "j" "a" "t" "O" "a" "o" "o" "t" "j" "5" "5" "t" "3" "N" "j"
## [120] "3" "n" "e" "6" "c" "4" "S" "f" "e" "k" "." "r" "1" "w" "1" "Y" "w"
## [137] "w" "o" "j" "i" "g" "O" "d" "6" "v" "r" "f" "U" "r" "b" "z" "2" "."
## [154] "2" "b" "k" "A" "n" "b" "h" "z" "g" "v" "4" "R" "9" "i" "0" "5" "z"
## [171] "E" "c" "r" "o" "p" "." "w" "A" "g" "n" "b" "." "S" "q" "o" "U" "6"
## [188] "5" "f" "P" "a" "1" "o" "t" "f" "b" "7" "w" "E" "m" "2" "4" "k" "6"
## [205] "t" "3" "s" "R" "9" "z" "q" "e" "5" "f" "y" "8" "9" "n" "6" "N" "d"
## [222] "5" "t" "9" "k" "c" "4" "f" "E" "9" "0" "5" "g" "m" "c" "4" "R" "g"
## [239] "x" "o" "5" "n" "h" "D" "k" "!" "g" "r"
# Punctuation only: the four periods and the exclamation mark.
punct
## [1] "." "." "." "." "!"
# Every visible character; the printed result is the same as for `alnum`.
graph
##   [1] "c" "l" "c" "o" "p" "C" "o" "w" "1" "z" "m" "s" "t" "c" "0" "d" "8"
##  [18] "7" "w" "n" "k" "i" "g" "7" "O" "v" "d" "i" "c" "p" "N" "u" "g" "g"
##  [35] "v" "h" "r" "y" "n" "9" "2" "G" "j" "u" "w" "c" "z" "i" "8" "h" "q"
##  [52] "r" "f" "p" "R" "x" "s" "5" "A" "j" "5" "d" "w" "p" "n" "0" "T" "a"
##  [69] "n" "w" "o" "U" "w" "i" "s" "d" "i" "j" "7" "L" "j" "8" "k" "p" "f"
##  [86] "0" "3" "A" "T" "5" "I" "d" "r" "3" "c" "o" "c" "0" "b" "t" "7" "y"
## [103] "c" "z" "j" "a" "t" "O" "a" "o" "o" "t" "j" "5" "5" "t" "3" "N" "j"
## [120] "3" "n" "e" "6" "c" "4" "S" "f" "e" "k" "." "r" "1" "w" "1" "Y" "w"
## [137] "w" "o" "j" "i" "g" "O" "d" "6" "v" "r" "f" "U" "r" "b" "z" "2" "."
## [154] "2" "b" "k" "A" "n" "b" "h" "z" "g" "v" "4" "R" "9" "i" "0" "5" "z"
## [171] "E" "c" "r" "o" "p" "." "w" "A" "g" "n" "b" "." "S" "q" "o" "U" "6"
## [188] "5" "f" "P" "a" "1" "o" "t" "f" "b" "7" "w" "E" "m" "2" "4" "k" "6"
## [205] "t" "3" "s" "R" "9" "z" "q" "e" "5" "f" "y" "8" "9" "n" "6" "N" "d"
## [222] "5" "t" "9" "k" "c" "4" "f" "E" "9" "0" "5" "g" "m" "c" "4" "R" "g"
## [239] "x" "o" "5" "n" "h" "D" "k" "!" "g" "r"
# `blank` and `space` print the same five characters as `punct`: nothing
# else in the message matched, only the "." and "!" added to each class.
blank
## [1] "." "." "." "." "!"
space
## [1] "." "." "." "." "!"
# Every printable character; again the same result as `alnum` and `graph`.
print
##   [1] "c" "l" "c" "o" "p" "C" "o" "w" "1" "z" "m" "s" "t" "c" "0" "d" "8"
##  [18] "7" "w" "n" "k" "i" "g" "7" "O" "v" "d" "i" "c" "p" "N" "u" "g" "g"
##  [35] "v" "h" "r" "y" "n" "9" "2" "G" "j" "u" "w" "c" "z" "i" "8" "h" "q"
##  [52] "r" "f" "p" "R" "x" "s" "5" "A" "j" "5" "d" "w" "p" "n" "0" "T" "a"
##  [69] "n" "w" "o" "U" "w" "i" "s" "d" "i" "j" "7" "L" "j" "8" "k" "p" "f"
##  [86] "0" "3" "A" "T" "5" "I" "d" "r" "3" "c" "o" "c" "0" "b" "t" "7" "y"
## [103] "c" "z" "j" "a" "t" "O" "a" "o" "o" "t" "j" "5" "5" "t" "3" "N" "j"
## [120] "3" "n" "e" "6" "c" "4" "S" "f" "e" "k" "." "r" "1" "w" "1" "Y" "w"
## [137] "w" "o" "j" "i" "g" "O" "d" "6" "v" "r" "f" "U" "r" "b" "z" "2" "."
## [154] "2" "b" "k" "A" "n" "b" "h" "z" "g" "v" "4" "R" "9" "i" "0" "5" "z"
## [171] "E" "c" "r" "o" "p" "." "w" "A" "g" "n" "b" "." "S" "q" "o" "U" "6"
## [188] "5" "f" "P" "a" "1" "o" "t" "f" "b" "7" "w" "E" "m" "2" "4" "k" "6"
## [205] "t" "3" "s" "R" "9" "z" "q" "e" "5" "f" "y" "8" "9" "n" "6" "N" "d"
## [222] "5" "t" "9" "k" "c" "4" "f" "E" "9" "0" "5" "g" "m" "c" "4" "R" "g"
## [239] "x" "o" "5" "n" "h" "D" "k" "!" "g" "r"
# Glue the upper-case characters (and retained punctuation) back together
# into a single string to reveal the hidden message.
shortened <- paste0(upper, collapse = "")

shortened
## [1] "CONGRATULATIONS.YOU.ARE.A.SUPERNERD!"