3. Copy the introductory example. The vector name stores the extracted names.

library(stringr)
# Unformatted phone-book string: names and phone numbers run together
# without separators.
raw.data <- "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson,Homer5553642Dr. Julius Hibbert"
# A name is any run of two or more letters, periods, commas, or spaces.
# str_extract_all() returns a list with one element per input string; there
# is exactly one input string here, so take that element directly.
name <- str_extract_all(string = raw.data, pattern = "[[:alpha:]., ]{2,}")[[1]]
name
## [1] "Moe Szyslak"          "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders"         "Simpson,Homer"        "Dr. Julius Hibbert"

3(a) Use the tools of this chapter to rearrange the vector so that all elements conform to the standardfirst_name last_name.

# Parse each element of `name` into title / first / middle / last parts.
# Three layouts are recognised, checked in this order:
#   1. "Last, First" / "Last,First" / "Last, M. First": split at the comma
#      and swap the two halves.
#   2. "Title. First Last": contains a 2-3 letter abbreviation followed by
#      a period.
#   3. "First Last": everything else, split at the space.

# create vectors to later fill in as a dataframe; they are sized from `name`
# (not a fixed count) and typed as character so their type does not depend
# on which branches below happen to run
n_names <- length(name)
title <- rep_len(NA_character_, n_names)
first <- rep_len(NA_character_, n_names)
middle <- rep_len(NA_character_, n_names)
last <- rep_len(NA_character_, n_names)
# scratch lists holding the most recent split for each element
splitName <- str_split(name, ",")
splitFirst <- str_split(first, " ")

# modify names and store values in vectors created above
# seq_along() yields an empty sequence for an empty `name`, so the loop body
# is skipped instead of running for i = 1 and i = 0
for (i in seq_along(name)) {
  if (str_detect(name[i], ",")) {
    # for names with comma, split and exchange position for last and first name
    splitName[[i]] <- str_split(name[i], ",")[[1]]
    last[i] <- splitName[[i]][1]
    # remove excess white spaces in first name
    first[i] <- str_trim(splitName[[i]][2])
    if (str_detect(first[i], "\\.")) {
      # for first name that includes middle, separate middle:
      # the abbreviated part comes first ("C. Montgomery")
      splitFirst[[i]] <- str_split(first[i], " ")[[1]]
      middle[i] <- splitFirst[[i]][1]
      first[i] <- splitFirst[[i]][2]
    }
  } else if (str_detect(name[i], "[[:alpha:]]{2,3}\\.")) {
    # for names with titles, separate titles
    splitName[[i]] <- str_split(name[i], " ")[[1]]
    title[i] <- splitName[[i]][1]
    first[i] <- splitName[[i]][2]
    last[i] <- splitName[[i]][3]
  } else {
    # split all others at space
    splitName[[i]] <- str_split(name[i], " ")[[1]]
    first[i] <- splitName[[i]][1]
    last[i] <- splitName[[i]][2]
  }
}
title
## [1] NA     NA     "Rev." NA     NA     "Dr."
first
## [1] "Moe"        "Montgomery" "Timothy"    "Ned"        "Homer"     
## [6] "Julius"
middle
## [1] NA   "C." NA   NA   NA   NA
last
## [1] "Szyslak"  "Burns"    "Lovejoy"  "Flanders" "Simpson"  "Hibbert"
# create dataframe directly from the component vectors, so the number of
# rows follows the vectors instead of a fixed-size placeholder.
# stringsAsFactors = FALSE keeps the columns as character on R < 4.0 too.
newName <- data.frame(
  title = title,
  first = first,
  middle = middle,
  last = last,
  stringsAsFactors = FALSE
)
newName
##   title      first middle     last
## 1  <NA>        Moe   <NA>  Szyslak
## 2  <NA> Montgomery     C.    Burns
## 3  Rev.    Timothy   <NA>  Lovejoy
## 4  <NA>        Ned   <NA> Flanders
## 5  <NA>      Homer   <NA>  Simpson
## 6   Dr.     Julius   <NA>  Hibbert

3(b) Construct a logical vector indicating whether a character has a title (i.e., Rev. and Dr.).

# TRUE where a title is present, FALSE otherwise. Missing titles are stored
# as NA, and any comparison involving NA yields NA rather than FALSE, so
# missingness is tested with is.na() to get a proper logical vector.
hasTitle <- !is.na(newName$title)
hasTitle
## [1] FALSE FALSE  TRUE FALSE FALSE  TRUE

3(c) Construct a logical vector indicating whether a character has a second name.

# TRUE where a second (middle) name is present, FALSE otherwise. Missing
# middle names are stored as NA, and any comparison involving NA yields NA
# rather than FALSE, so missingness is tested with is.na().
hasMiddle <- !is.na(newName$middle)
hasMiddle
## [1] FALSE  TRUE FALSE FALSE FALSE FALSE

4. Describe the types of strings that conform to the following regular expressions and construct an example that is matched by the regular expression.

4(a)

any number of digits with $ at the end:

# One or more digits immediately followed by a literal dollar sign.
pattern <- "[0-9]+\\$"
# Candidates: no digits at all; digits + "$" followed by more text;
# digits + "$" at the very end (preceded by other characters).
string <- c("numbers", "123123$a", "123a123$")
# Which candidates contain a match?
str_detect(string = string, pattern = pattern)
## [1] FALSE  TRUE  TRUE
# First matching substring of each candidate (NA when there is none).
str_extract(string = string, pattern = pattern)
## [1] NA        "123123$" "123$"

4(b)

4-letter word:

# A run of one to four lowercase letters with a word boundary on each side.
pattern <- "\\b[a-z]{1,4}\\b"
# Candidates: mixed case; lowercase letters followed directly by a digit
# (no word boundary after the letters); four lowercase letters on their own.
string <- c("AbCd", "abcd5", "abcd")
# Which candidates contain a match?
str_detect(string = string, pattern = pattern)
## [1] FALSE FALSE  TRUE
# First matching substring of each candidate (NA when there is none).
str_extract(string = string, pattern = pattern)
## [1] NA     NA     "abcd"

4(c)

optional any number of characters ended by ‘.txt’:

# Any characters (possibly none), then a literal ".txt" at the end of the
# string.
pattern <- ".*?\\.txt$"
# Candidates: ".txt" followed by a trailing space; ".txt" followed by a
# literal "$" character; arbitrary characters ending in ".txt".
string <- c(".txt ", "abc.txt$", "anyTh1nG$.txt")
# Which candidates contain a match?
str_detect(string = string, pattern = pattern)
## [1] FALSE FALSE  TRUE
# First matching substring of each candidate (NA when there is none).
str_extract(string = string, pattern = pattern)
## [1] NA              NA              "anyTh1nG$.txt"

4(d)

two digits/two digits/4 digits

# Two digits, "/", two digits, "/", four digits.
pattern <- "\\d{2}/\\d{2}/\\d{4}"
# Candidates: digits without slashes; slashes but only a two-digit final
# group; the full two/two/four layout.
string <- c("12312017", "12/31/17", "12/31/2017")
# Which candidates contain a match?
str_detect(string = string, pattern = pattern)
## [1] FALSE FALSE  TRUE
# First matching substring of each candidate (NA when there is none).
str_extract(string = string, pattern = pattern)
## [1] NA           NA           "12/31/2017"

4(e)

string with any number if characters inside <> followed by any number of characters and a backreference to the first expression but including a slash </>

# An opening tag <...>, at least one character of content, then a closing
# tag </...> whose name is the same text captured from the opening tag.
pattern <- "<(.+?)>.+?</\\1>"
# Candidates: opening tag only; closing tag with a different name;
# matching opening and closing tags around some content.
string <- c("<abc>", "<abc> </def>", "<abc>anyTh1nG</abc>")
# Which candidates contain a match?
str_detect(string = string, pattern = pattern)
## [1] FALSE FALSE  TRUE
# First matching substring of each candidate (NA when there is none).
str_extract(string = string, pattern = pattern)
## [1] NA                    NA                    "<abc>anyTh1nG</abc>"