3. Copy the introductory example. The vector “name” stores the extracted names.

# Input for the exercise: one string in which names are interleaved with
# digit/punctuation sequences.
raw.data <- "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"

library(stringr)

# A name is any run of two or more letters, periods, commas or spaces.
# raw.data is a single string, so the first (and only) element of the list
# returned by str_extract_all() already holds every match.
name <- str_extract_all(raw.data, "[[:alpha:]., ]{2,}")[[1]]
name
## [1] "Moe Szyslak"          "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders"         "Simpson, Homer"       "Dr. Julius Hibbert"

(a) Use the tools of this chapter to rearrange the vector so that all elements conform to the standard first_name last_name.

Extract first names:

# First names, collected separately for the two layouts found in `name`.
# Entries written "First Last": a run of two or more letters followed by a
# space. The pattern includes that space, so every match keeps a trailing
# blank (see the printed result); trim it with trimws() before joining or
# comparing these values.
firstName <- unlist(str_extract_all(name, "[[:alpha:]]{2,} "))
firstName
## [1] "Moe "     "Timothy " "Ned "     "Julius "
# Entries written "Last, First": comma, space, then letters and periods.
# The class is [[:alpha:].] only - a "+" inside brackets is a literal plus
# sign, not a quantifier, and a plain greedy "+" replaces the possessive
# "{1,}+".
first <- unlist(str_extract_all(name, ", [[:alpha:].]+"))
first
## [1] ", C."    ", Homer"
# Drop the leading ", " so that only the name itself remains.
FirstName <- sub("^, ", "", first)
FirstName
## [1] "C."    "Homer"
# All first names: the "First Last" entries come first, followed by the
# "Last, First" entries (not the original order of `name`).
firstNames <- c(firstName, FirstName)
firstNames
## [1] "Moe "     "Timothy " "Ned "     "Julius "  "C."       "Homer"

Extract last names:

# Last names, collected separately for the two layouts found in `name`.
# Entries written "First Last": a space that directly follows a word
# character, plus everything up to the last word boundary. Entries in
# "Last, First" form do not match because no space follows a word character.
lastName <- unlist(str_extract_all(name, "\\b .+\\b"))
lastName
## [1] " Szyslak"  " Lovejoy"  " Flanders" " Hibbert"
# Strip the leading space. The replacement is an explicit empty string; the
# pattern has no capture group, so a "\\1" backreference would only work by
# accident.
lastName1 <- sub("^ ", "", lastName)
lastName1
## [1] "Szyslak"  "Lovejoy"  "Flanders" "Hibbert"
# Entries written "Last, First": two or more letters directly before a comma.
last <- unlist(str_extract_all(name, "[[:alpha:]]{2,},"))
last
## [1] "Burns,"   "Simpson,"
# Strip the trailing comma (again with an explicit empty replacement).
LastName <- sub(",$", "", last)
LastName
## [1] "Burns"   "Simpson"
# All last names: the "First Last" entries come first, followed by the
# "Last, First" entries.
lastNames <- c(lastName1, LastName)
lastNames
## [1] "Szyslak"  "Lovejoy"  "Flanders" "Hibbert"  "Burns"    "Simpson"

Create a data frame with first names and last names:

# One row per person, first and last name in separate columns.
# trimws() removes the trailing blank that some first names carry
# ("Moe " -> "Moe") so the stored values compare and join cleanly. The column
# names are written in their final syntactic form instead of relying on
# data.frame() to turn "First Name" into First.Name.
df <- data.frame(
  First.Name = trimws(firstNames),
  Last.Name = trimws(lastNames)
)
knitr::kable(df)
First.Name Last.Name
Moe Szyslak
Timothy Lovejoy
Ned Flanders
Julius Hibbert
C. Burns
Homer Simpson

Create a vector of first and last names:

# Join first and last names with exactly one space. Some first names still
# end in a blank ("Moe "), so they are trimmed before pasting; otherwise the
# result would contain a doubled space ("Moe  Szyslak").
namesVector <- paste(trimws(firstNames), lastNames)
namesVector
## [1] "Moe Szyslak"     "Timothy Lovejoy" "Ned Flanders"    "Julius Hibbert"
## [5] "C. Burns"        "Homer Simpson"

(b) Construct a logical vector indicating whether a character has a title (i.e., Rev. and Dr.).

# A title is two or more letters followed by a period ("Rev.", "Dr.");
# the {2,} lower bound keeps one-letter initials such as "C." from matching.
# [[:alpha:]] is used instead of the range A-z, which also covers the
# non-letter ASCII characters that sit between "Z" and "a".
# The same pattern drives both the extraction and the logical vector.
title_pattern <- "[[:alpha:]]{2,}\\."
prefix <- str_extract_all(name, title_pattern)
prefix
## [[1]]
## character(0)
## 
## [[2]]
## character(0)
## 
## [[3]]
## [1] "Rev."
## 
## [[4]]
## character(0)
## 
## [[5]]
## character(0)
## 
## [[6]]
## [1] "Dr."
# TRUE for every element of `name` that contains a title.
title <- str_detect(name, title_pattern)
title
## [1] FALSE FALSE  TRUE FALSE FALSE  TRUE

(c) Construct a logical vector indicating whether a character has a second name.

# Shape of a second name in "Last, X. Second" entries: comma, whitespace, one
# word character as initial, period, whitespace, then two or more letters.
second_name_pattern <- "\\,\\s\\w\\.\\s[[:alpha:]]{2,}"
# Show what the pattern captures for each element of `name`.
middleName <- str_extract_all(name, second_name_pattern)
middleName
## [[1]]
## character(0)
## 
## [[2]]
## [1] ", C. Montgomery"
## 
## [[3]]
## character(0)
## 
## [[4]]
## character(0)
## 
## [[5]]
## character(0)
## 
## [[6]]
## character(0)
# TRUE for every element of `name` in which the pattern is found.
secondName <- str_detect(name, second_name_pattern)
secondName
## [1] FALSE  TRUE FALSE FALSE FALSE FALSE

4. Describe the types of strings that conform to the following regular expressions and construct an example that is matched by the regular expression.

**(a) [0-9]+\\$**
    This expression matches one or more digits immediately followed by a literal dollar sign ($).
# Sample text mixing digit runs that end in "$" with runs that do not.
example1 <- "35iu68903$64678$t8bhj$246"
# One or more digits followed by an escaped (literal) dollar sign.
dollar_pattern <- "[0-9]+\\$"
str_extract_all(example1, dollar_pattern)
## [[1]]
## [1] "68903$" "64678$"
**(b) \\b[a-z]{1,4}\\b**
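    This expression matches standalone words of 1 to 4 lowercase letters, i.e. runs of lowercase letters with a word boundary on both sides; a run that touches a digit or another word character (as in "kl123") is not matched.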
# Candidates: four short lowercase words and one word with digits attached.
example2 <- c("abc", "de", "f", "ghij", "kl123")
# One to four lowercase letters enclosed by word boundaries.
short_word_pattern <- "\\b[a-z]{1,4}\\b"
str_extract_all(example2, short_word_pattern)
## [[1]]
## [1] "abc"
## 
## [[2]]
## [1] "de"
## 
## [[3]]
## [1] "f"
## 
## [[4]]
## [1] "ghij"
## 
## [[5]]
## character(0)
**(c) .*?\\.txt$**
    This expression returns strings that end with ".txt"
# Candidates: a .txt file name, a .doc file name, and the bare text "txt".
example3 <- c("abc.txt", "abc.doc", "txt")
# Any characters (lazily) followed by a literal ".txt" at the end of the string.
txt_suffix_pattern <- ".*?\\.txt$"
str_extract_all(example3, txt_suffix_pattern)
## [[1]]
## [1] "abc.txt"
## 
## [[2]]
## character(0)
## 
## [[3]]
## character(0)
**(d) \\d{2}/\\d{2}/\\d{4}**
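    This expression matches date-like patterns: two digits, a forward slash, two digits, a forward slash, and four digits (e.g. dd/mm/yyyy or mm/dd/yyyy). It only checks the shape, not whether the date is valid.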
    
# Candidates: two fully padded dates plus three strings with a different shape.
example3 <- c("02/11/2019", "2/11/19", "04/02/2018", "1789/12", "12387689")
# Two digits, slash, two digits, slash, four digits.
date_pattern <- "\\d{2}/\\d{2}/\\d{4}"
str_extract_all(example3, date_pattern)
## [[1]]
## [1] "02/11/2019"
## 
## [[2]]
## character(0)
## 
## [[3]]
## [1] "04/02/2018"
## 
## [[4]]
## character(0)
## 
## [[5]]
## character(0)
**(e) <(.+?)>.+?</\\1>**
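    This expression matches an HTML/XML-style element: an opening tag <name>, at least one character of content, and a closing tag </name> whose name is identical to the opening one (enforced by the backreference \\1).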
# Candidates: a complete element, plain text, and a lone opening tag.
example4 <- c("<p>Hello world</p>", "Hello", "<Hi>")
# Opening tag (name captured), content, closing tag that repeats the name.
tag_pair_pattern <- "<(.+?)>.+?</\\1>"
str_extract_all(example4, tag_pair_pattern)
## [[1]]
## [1] "<p>Hello world</p>"
## 
## [[2]]
## character(0)
## 
## [[3]]
## character(0)

9. The following code hides a secret message. Crack it with R and regular expressions. Hint: Some of the characters are more revealing than others! The code snippet is also available in the materials at www.r-datacollection.com.

clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0TanwoUwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigOd6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr
# The scrambled text, held as a single string.
message <- "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0TanwoUwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigOd6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr"

# Drop the digits: collect every single character that is not a digit.
# The input is one string, so the first (and only) list element holds all
# matches; `message` becomes a vector of one-character strings.
message <- str_extract_all(message, "[^[:digit:]]")[[1]]
message
##   [1] "c" "l" "c" "o" "p" "C" "o" "w" "z" "m" "s" "t" "c" "d" "w" "n" "k"
##  [18] "i" "g" "O" "v" "d" "i" "c" "p" "N" "u" "g" "g" "v" "h" "r" "y" "n"
##  [35] "G" "j" "u" "w" "c" "z" "i" "h" "q" "r" "f" "p" "R" "x" "s" "A" "j"
##  [52] "d" "w" "p" "n" "T" "a" "n" "w" "o" "U" "w" "i" "s" "d" "i" "j" "L"
##  [69] "j" "k" "p" "f" "A" "T" "I" "d" "r" "c" "o" "c" "b" "t" "y" "c" "z"
##  [86] "j" "a" "t" "O" "a" "o" "o" "t" "j" "t" "N" "j" "n" "e" "c" "S" "f"
## [103] "e" "k" "." "r" "w" "Y" "w" "w" "o" "j" "i" "g" "O" "d" "v" "r" "f"
## [120] "U" "r" "b" "z" "." "b" "k" "A" "n" "b" "h" "z" "g" "v" "R" "i" "z"
## [137] "E" "c" "r" "o" "p" "." "w" "A" "g" "n" "b" "." "S" "q" "o" "U" "f"
## [154] "P" "a" "o" "t" "f" "b" "w" "E" "m" "k" "t" "s" "R" "z" "q" "e" "f"
## [171] "y" "n" "N" "d" "t" "k" "c" "f" "E" "g" "m" "c" "R" "g" "x" "o" "n"
## [188] "h" "D" "k" "!" "g" "r"
# Keep only the characters that are not lower-case letters. str_extract_all()
# returns one list entry per element of `message`; unlist() flattens them and
# the entries without a match disappear.
non_lower <- str_extract_all(message, "[^[:lower:]]")
message1 <- unlist(non_lower)
message1
##  [1] "C" "O" "N" "G" "R" "A" "T" "U" "L" "A" "T" "I" "O" "N" "S" "." "Y"
## [18] "O" "U" "." "A" "R" "E" "." "A" "." "S" "U" "P" "E" "R" "N" "E" "R"
## [35] "D" "!"
# Glue the surviving characters back into a single string.
message2 <- paste0(message1, collapse = "")
message2
## [1] "CONGRATULATIONS.YOU.ARE.A.SUPERNERD!"
# The periods separate the words: swap each one for a space.
message3 <- str_replace_all(message2, "\\.", " ")
message3
## [1] "CONGRATULATIONS YOU ARE A SUPERNERD!"