library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.4.4
## Warning: package 'tibble' was built under R version 3.4.3
## Warning: package 'tidyr' was built under R version 3.4.4
## Warning: package 'readr' was built under R version 3.4.3
## Warning: package 'purrr' was built under R version 3.4.4
## Warning: package 'dplyr' was built under R version 3.4.4
## Warning: package 'stringr' was built under R version 3.4.4
## Warning: package 'forcats' was built under R version 3.4.4
# raw.data: phone numbers and character names concatenated into one string
raw.data <- "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"
# name.transform1: keep every run of two or more letters, dots, commas and
# spaces -- i.e. the names, with the digits and phone punctuation dropped
name.transform1 <- unlist(
  str_extract_all(raw.data, pattern = "[[:alpha:]., ]{2,}")
)
# name.transform2: names without titles, split into two character columns.
# To see which entries carry a title, run e.g.
#   which(str_detect(name.transform1, "Rev\\."))
#   which(str_detect(name.transform1, "Dr\\."))
#
# Steps:
#   1. drop a leading "Rev. " / "Dr. "
#   2. split each name once, on ", " or on a single space
#   3. store the two pieces as a data.frame
# str_split_fixed() returns a character matrix, so the pieces never become
# factors and no plyr::ldply() row-binding is required. (A name with no
# separator would get "" rather than NA in the second column.)
name.transform2 <- name.transform1 %>%
  str_replace_all(pattern = "(Rev\\. )|(Dr\\. )", replacement = "") %>%
  str_split_fixed(pattern = ", | ", n = 2) %>%
  as.data.frame(stringsAsFactors = FALSE)
# The columns are named by position: for "Last, First" entries they are still
# in the wrong order at this point.
names(name.transform2) <- c("first", "last")
## Warning: package 'bindrcpp' was built under R version 3.4.4
# name.transform3: put first and last name in the correct order.
# Entries written as "Last, First" are recognised by the comma in the original
# string, so the swap follows the data instead of hard-coded row numbers.
# name.transform1 and name.transform2 hold the same names in the same order.
name.transform3 <- name.transform2
is_last_first <- str_detect(name.transform1, pattern = ",")
name.transform3$first[is_last_first] <- name.transform2$last[is_last_first]
name.transform3$last[is_last_first] <- name.transform2$first[is_last_first]
# Build the title column; "na" marks characters without a title.
# The dot is escaped so that it matches a literal "." and not any character.
title <- rep("na", nrow(name.transform3))
title[str_detect(name.transform1, pattern = "Rev\\.")] <- "Rev."
title[str_detect(name.transform1, pattern = "Dr\\.")] <- "Dr."
# Attach the titles and show them as the leading column.
name.transform3 <- name.transform3 %>%
  dplyr::mutate(title = title) %>%
  dplyr::select(title, first, last)
# final data.frame
name.transform3
A little fun fact: although “C. Montgomery” is treated as the first name in the data.frame above, “Montgomery” is actually a middle name. “C.” is the first name, and it stands for “Charles”.
# name.transform4: add the full display name and a flag for "has a title".
# The title is prepended only when one exists; pasting an empty title in front
# would leave a stray leading space in `full` for untitled characters.
name.transform4 <- name.transform3 %>%
  dplyr::mutate(
    full = ifelse(title == "na",
                  paste(first, last),
                  paste(title, first, last)),
    # TRUE when the full name contains "Rev." or "Dr."
    title_flag = str_detect(full, pattern = "(Rev\\.|Dr\\.)")
  )
name.transform4
# name.transform5: flag characters whose `first` field holds more than one
# whitespace-separated token (an initial plus a further given name).
name.transform5 <- dplyr::mutate(
  name.transform4,
  # "\\S+" matches each run of non-space characters; more than one run
  # means there is a second name in the field
  middle_flag = str_count(first, pattern = "\\S+") > 1
)
name.transform5
I am guessing the question is asking which character has a middle name. In this case, that’s Mr. Burns.
# 1. [0-9]+\\$
# Matches a run of one or more digits immediately followed by a literal "$"
# (the backslashes escape the dollar sign so it is not read as end-of-string).
# "44$" matches; "4j$" does not, because "j" sits between the digit
# and the "$".
unlist(
  str_extract_all(c("44$", "4j$"), pattern = "[0-9]+\\$")
)
## [1] "44$"
# 2. \\b[a-z]{1,4}\\b
# Matches a complete word -- word boundary on both sides -- made of one to
# four lower-case letters. Longer words ("goodbye", "heyheyhey"), upper-case
# words and letters glued to digits ("88lu") are not matched.
unlist(
  str_extract_all(
    c("good", "not so good", "hi goodbye",
      "heyheyhey", "NOT SO GOOD", "88lu"),
    pattern = "\\b[a-z]{1,4}\\b"
  )
)
## [1] "good" "not" "so" "good" "hi"
# 3. .*?\\.txt$
# Matches a string that ends in a literal ".txt".
# The lazy ".*?" in front allows any prefix, including none at all, so the
# bare string ".txt" also matches. ".txt" in the middle of a string or
# "_txt" without the dot does not.
unlist(
  str_extract_all(
    c("not so good.txt", ".txt",
      "homework.txt.csv", "assignment_txt"),
    pattern = ".*?\\.txt$"
  )
)
## [1] "not so good.txt" ".txt"
# 4. \\d{2}/\\d{2}/\\d{4}
# Matches a date-like group: two digits, a slash, two digits, a slash and
# four digits.
# The pattern is not anchored, so surrounding digits are simply left out of
# the match: "1111/88/999999999" still yields "11/88/9999".
# A non-digit inside a group ("h9/...", "17a") prevents a match.
unlist(
  str_extract_all(
    c("haha 11/12/2001 oh yes", "06/14/9999", "1111/88/999999999",
      "h9/88/9800", "randombirthdaywishes!:-)", "11/17a/2018"),
    pattern = "\\d{2}/\\d{2}/\\d{4}"
  )
)
## [1] "11/12/2001" "06/14/9999" "11/88/9999"
# 5. <(.+?)>.+?</\\1>
# Matches an HTML-like element: an opening tag, some content and a closing
# tag carrying the same name, e.g.
#   <name>content</name>
# "(.+?)" captures the tag name and "\\1" is a backreference that requires
# the closing tag to repeat it. ".+?" is a lazy "one or more characters":
# it takes as few characters as possible but at least one, so an empty tag
# name ("<>") or empty content does not match.
unlist(
  str_extract_all(
    c("<hihi>dllm</hihi>", "<hihi> </hihi>", "< > </ >",
      "<hi>dllm</hihi>", "</hihi> </hihi>", "<> </>"),
    pattern = "<(.+?)>.+?</\\1>"
  )
)
## [1] "<hihi>dllm</hihi>" "<hihi> </hihi>" "< > </ >"
secret_code <- "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0TanwoUwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigOd6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr"
# Decode the message: keep only the upper-case letters and punctuation marks
# hidden among the lower-case letters and digits, then print them separated
# by spaces.
# One character class covers both groups. Inside [...] a "|" is not an "or";
# it would only add a literal pipe character to the set.
secret_code %>%
  str_extract_all(pattern = "[A-Z[:punct:]]") %>%
  unlist() %>%
  cat()
## C O N G R A T U L A T I O N S . Y O U . A R E . A . S U P E R N E R D !