library(stringr)
# Raw text in which character names are interleaved with phone numbers.
raw.data <- "555-1239Moe Szyslak (636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"
# Extraction pattern given by Munzert et al.: a name is a run of two or more
# letters, periods, commas or spaces. Only one string is searched, so the
# matches are the first (and only) element of the list str_extract_all() returns.
# Note: this variable shadows base::names(); calls such as names(x) still
# resolve to the function.
names <- str_extract_all(raw.data, "[[:alpha:]., ]{2,}")[[1]]
# Build "first_name last_name" strings from the raw extracted names.
#
# Step 1: remove titles such as "Rev." and "Dr.".
# Pattern: a title is two or three letters immediately followed by a period,
# so a one-letter initial like "C." is left untouched. str_replace() removes
# only the first such match in each name.
names2 <- str_replace(names, "[:alpha:]{2,3}\\.", "")
# Step 2: switch the order for names written as "last_name, first_name".
# Pattern: that notation always contains a comma. All such entries are located
# at once; each is split at the comma and rebuilt as "<second piece> <first
# piece>". which() drops NA entries and an empty selection is a no-op, so this
# also works when the vector is empty or contains missing values.
comma_idx <- which(str_detect(names2, ","))
names2[comma_idx] <- vapply(
  str_split(names2[comma_idx], ","),
  function(pieces) str_c(pieces[2], " ", pieces[1]),
  character(1)
)
# Step 3: strip the leading/trailing blanks left behind by the steps above.
firstname_lastname <- str_trim(names2, side = "both")
firstname_lastname
## [1] "Moe Szyslak" "C. Montgomery Burns" "Timothy Lovejoy"
## [4] "Ned Flanders" "Homer Simpson" "Julius Hibbert"
# Flag which characters carry a title: TRUE when the raw extracted name
# contains two or three letters followed by a period (e.g. "Rev.", "Dr.").
# The logical vector is labelled with the cleaned-up names so the printout
# shows who each flag belongs to.
title.names <- setNames(
  str_detect(string = names, pattern = "[:alpha:]{2,3}\\."),
  firstname_lastname
)
title.names
## Moe Szyslak C. Montgomery Burns Timothy Lovejoy
## FALSE FALSE TRUE
## Ned Flanders Homer Simpson Julius Hibbert
## FALSE FALSE TRUE
# Flag which characters have a second (middle) name.
# Pattern: after splitting the cleaned name on single blanks, a character with
# a second name yields more than two pieces. lengths() counts the pieces of
# every split result and always returns an integer vector, whatever the input.
second.name <- lengths(str_split(firstname_lastname, " ")) > 2
# Label each flag with the name it belongs to.
names(second.name) <- firstname_lastname
second.name
## Moe Szyslak C. Montgomery Burns Timothy Lovejoy
## FALSE TRUE FALSE
## Ned Flanders Homer Simpson Julius Hibbert
## FALSE FALSE FALSE
# (a) One or more digits immediately followed by a literal dollar sign.
# The "$" is escaped so that it is matched as a character rather than acting
# as the end-of-string anchor.
pattern.a <- "[0-9]+\\$"
example.a <- "4235$"
str_extract(string = example.a, pattern = pattern.a)
## [1] "4235$"
# (b) A standalone word made of one to four lowercase letters.
# In the R string, "\\b" becomes the regex token \b, which is a word boundary
# (a zero-width position between a word character and a non-word character or
# the string edge) -- it is NOT a literal letter "b". The example matches
# because "barb" as a whole is a four-letter lowercase word; the same pattern
# would not match inside a longer word such as "barbecue".
pattern.b <- "\\b[a-z]{1,4}\\b"
example.b <- "barb"
str_extract(example.b, pattern.b)
## [1] "barb"
# (c) Any run of characters (possibly empty) followed by a literal ".txt",
# where ".txt" must be the very end of the string because of the "$" anchor.
# The dot before "txt" is escaped so it only matches an actual period.
pattern.c <- ".*\\.txt$"
example.c <- "anything.txt"
str_extract(string = example.c, pattern = pattern.c)
## [1] "anything.txt"
# (d) Two digits, a slash, two digits, a slash, then four digits -- the shape
# of a date written with a two-digit day and month and a four-digit year.
# The pattern is not anchored, so it can match anywhere inside a string.
pattern.d <- "\\d{2}/\\d{2}/\\d{4}"
example.d <- "07/14/1984"
str_extract(string = example.d, pattern = pattern.d)
## [1] "07/14/1984"