# Raw text in which the names are interleaved with phone numbers
raw.data <- "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"
# A name is a run of two or more letters, periods, commas, or spaces.
# [[:alpha:]] is used rather than the range A-z, because A-z also covers the
# punctuation characters [ \ ] ^ _ ` that sit between "Z" and "a" in ASCII.
names <- unlist(str_extract_all(raw.data, "[[:alpha:]., ]{2,}"))
print(names)
## [1] "Moe Szyslak" "Burns, C. Montgomery" "Rev. Timothy Lovejoy" "Ned Flanders" "Simpson, Homer" "Dr. Julius Hibbert"
# Get the first names
# Step 1: capture the first name together with the delimiter that identifies
# it. Either it closes the string right after a "." or "," (as in
# "Simpson, Homer"), or it is a word of 2+ letters followed by a space.
# [[:alpha:]] replaces the A-z range, which also matched [ \ ] ^ _ `.
firstNames <- unlist(str_extract_all(
  names,
  "[.,] [[:alpha:]]{2,}$|[[:alpha:]]{2,} "
))
# Step 2: drop the delimiter and keep only the letters
firstNames <- unlist(str_extract_all(firstNames, "[[:alpha:]]{2,}"))
print(firstNames)
## [1] "Moe" "Montgomery" "Timothy" "Ned" "Homer" "Julius"
# Get the last names
# Step 1: capture the last name together with the delimiter that identifies
# it. Either it closes the string and the character before the preceding
# space is not "." or "," (as in "Moe Szyslak"), or it is a word of 2+
# letters followed by ", " (as in "Burns, C. Montgomery").
# [[:alpha:]] replaces the A-z range, which also matched [ \ ] ^ _ `.
lastNames <- unlist(str_extract_all(
  names,
  "[^.,] [[:alpha:]]{2,}$|[[:alpha:]]{2,}, "
))
# Step 2: drop the delimiter; the single leading character captured by the
# first alternative is discarded because it is shorter than two letters
lastNames <- unlist(str_extract_all(lastNames, "[[:alpha:]]{2,}"))
print(lastNames)
## [1] "Szyslak" "Burns" "Lovejoy" "Flanders" "Simpson" "Hibbert"
# Construct a data frame and display the frame
# (data.frame() turns the space in each column name into a period)
data.frame("First Name" = firstNames, "Last Name" = lastNames)
## First.Name Last.Name
## 1 Moe Szyslak
## 2 Montgomery Burns
## 3 Timothy Lovejoy
## 4 Ned Flanders
## 5 Homer Simpson
## 6 Julius Hibbert
# Get the titles
# A title is a word of two or more letters followed by a period; the
# single-letter initial "C." therefore does not count as a title.
titlePattern <- "[[:alpha:]]{2,}\\."
titles <- unlist(str_extract_all(names, titlePattern))
print(titles)
## [1] "Rev." "Dr."
# Test every name against the title pattern itself. Passing the extracted
# `titles` vector as the pattern would pair the 2 titles with the 6 names by
# recycling, which is only correct by coincidence of row order, and would
# treat the "." in each title as a regex wildcard.
data.frame("Name" = names, "Title Exists" = str_detect(names, titlePattern))
## Name Title.Exists
## 1 Moe Szyslak FALSE
## 2 Burns, C. Montgomery FALSE
## 3 Rev. Timothy Lovejoy TRUE
## 4 Ned Flanders FALSE
## 5 Simpson, Homer FALSE
## 6 Dr. Julius Hibbert TRUE
# Find the names that contain a second name
# A second name is recognized as a single-letter initial (optionally followed
# by a period), a space, and then a word of one or more letters (optionally
# followed by a period), e.g. "C. Montgomery".
secondNamePattern <- "[[:alpha:]]\\.? [[:alpha:]]+\\.?"
# The leading space ensures the initial is a standalone single letter rather
# than the last letter of a longer word.
spacedSecondNamePattern <- paste0(" ", secondNamePattern)
secondNames <- unlist(str_extract_all(names, spacedSecondNamePattern))
# Remove the leading space that was only needed to anchor the match
secondNames <- unlist(str_extract_all(secondNames, secondNamePattern))
print(secondNames)
## [1] "C. Montgomery"
# Test every name against the pattern itself instead of the extracted text,
# so the check still works when zero or several second names are found and
# the "." in an extracted name is never interpreted as a regex wildcard.
data.frame(
  "Name" = names,
  "Second Name Exists" = str_detect(names, spacedSecondNamePattern)
)
## Name Second.Name.Exists
## 1 Moe Szyslak FALSE
## 2 Burns, C. Montgomery TRUE
## 3 Rev. Timothy Lovejoy FALSE
## 4 Ned Flanders FALSE
## 5 Simpson, Homer FALSE
## 6 Dr. Julius Hibbert FALSE
Answer: At least one digit betwen 0 and 9 followed by a dollar sign
# Sample inputs: only "0012$" has digits directly followed by a "$"
string4a <- c(
  "", "01\\", "0012$", "02020", "738372", "34384782347832", "34"
)
str_detect(string = string4a, pattern = "[0-9]+\\$")
## [1] FALSE FALSE TRUE FALSE FALSE FALSE FALSE
Answer: Between 1 to 4 occurrences of any lower case letter a thru z that performs whole word only search
# Sample inputs: lowercase words of length 1 to 5, plus digit-only strings
string4b <- c(
  "a", "bc", "def", "ghij", "klmno", "02020", "34384782347832", "34"
)
str_detect(string = string4b, pattern = "\\b[a-z]{1,4}\\b")
## [1] TRUE TRUE TRUE TRUE FALSE FALSE FALSE FALSE
Answer: Any optional pattern that starts with any character including space and ends with .txt.
# Sample inputs: strings with and without a trailing ".txt"
string4c <- c(
  "a", "bc", "def", "ghij.", "klmn.txt", "*.txt", "pqr874238743 .txt", ".txt"
)
str_detect(string = string4c, pattern = ".*?\\.txt$")
## [1] FALSE FALSE FALSE FALSE TRUE TRUE TRUE TRUE
Answer: Any 2 digits followed by a forward slash then any 2 digits followed by a forward slash and then any 4 digits number. This pattern can be used to check dates but it doesn’t verify dates.
# Sample inputs: date-like strings with varying digit counts, plus plain numbers
string4d <- c(
  "1/1/2016", "02/02/2016", "20/20/0000", "1/1/16", "1/13/293393",
  "34384782347832", "34"
)
str_detect(string = string4d, pattern = "\\d{2}/\\d{2}/\\d{4}")
## [1] FALSE TRUE TRUE FALSE FALSE FALSE FALSE
Answer: Any pattern that matches a html/xml markup with open and closing tags.
# Sample inputs: a properly closed tag, an unclosed tag, and plain numbers
string4e <- c(
  "<html>Hello World</html>", "<html>Hello World<html>",
  "34384782347832", "34"
)
str_detect(string = string4e, pattern = "<(.+?)>.+?</\\1>")
## [1] TRUE FALSE FALSE FALSE
clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0Tanwo Uwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigO d6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5 fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr
# Encoded text: the hidden message is scattered among filler characters
code <- "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0TanwoUwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigOd6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr"
print(code)
## [1] "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0TanwoUwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigOd6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr"
# Try each of the textbook's R regular-expression character classes to find
# the one that pulls a meaningful clue out of the secret message:
#   [:digit:], [:lower:], [:upper:], [:alpha:], [:alnum:],
#   [:punct:], [:graph:], [:blank:], [:space:], [:print:]
# [:upper:] is the one that yields readable text. The pattern below keeps the
# upper-case letters and additionally admits ".", "!" and the space character.
x <- unlist(str_extract_all(string = code, pattern = "[[:upper:].! ]"))
print(x)
## [1] "C" "O" "N" "G" "R" "A" "T" "U" "L" "A" "T" "I" "O" "N" "S" "." "Y" "O" "U" "." "A" "R" "E" "." "A" "." "S" "U" "P" "E" "R" "N" "E" "R" "D" "!"
# Join the extracted characters into one string
y <- paste0(x, collapse = "")
print(y)
## [1] "CONGRATULATIONS.YOU.ARE.A.SUPERNERD!"
# Swap every period for a space to reveal the secret message
z <- str_replace_all(string = y, pattern = "[\\.]", replacement = " ")
print(z)
## [1] "CONGRATULATIONS YOU ARE A SUPERNERD!"