1- Arrange the vector "simp" so that all elements conform to the standard format: first name followed by last name.
# Raw input: one string in which each phone number is immediately followed by
# a name. It is assembled from one piece per record so each line stays short;
# paste0() joins the pieces with no separator.
simp <- paste0(
  "555-1239Moe Szyslak",
  "(636) 555-0113Burns, C. Montgomery",
  "555-6542Rev. Timothy Lovejoy",
  "555 8904Ned Flanders",
  "636-555-3226Simpson, Homer",
  "5553642Dr. Julius Hibbert"
)
Extracting full names
# Pull every run of two or more letters, periods, commas, or spaces out of
# `simp`. Digits, hyphens, and parentheses are not in the character class, so
# the phone numbers separate one name from the next.
# `simpnames_orig` keeps the untouched extraction; `simpnames` is the working
# copy that is reformatted afterwards.
simpnames <- simpnames_orig <- unlist(
  str_extract_all(string = simp, pattern = "[[:alpha:]., ]{2,}")
)
simpnames_orig
## [1] "Moe Szyslak" "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders" "Simpson, Homer" "Dr. Julius Hibbert"
Changing the format to first_name last_name
# Rewrite every element of `simpnames` as "first_name last_name".
#   * Entries written "Last, First" are reordered around the ", " separator.
#   * All other entries lose a leading title: two or more letters followed by
#     a period and a space (e.g. "Rev. ", "Dr. ").
# Both steps are vectorised, so an empty `simpnames` needs no special case
# (a `1:length(x)` loop would iterate over c(1, 0) instead).
has_comma <- str_detect(simpnames, ",") %in% TRUE  # NA counts as "no comma"

# "\\2 \\1" swaps the two captured halves; entries with a comma that do not
# fit the "Last, First" shape are left unchanged rather than turned into NA.
simpnames[has_comma] <- str_replace(
  simpnames[has_comma], "^(.+), (.+)$", "\\2 \\1"
)

# The title pattern is anchored at the start of the string and handed to
# str_replace() directly, so only a genuine leading title is removed and the
# "." is matched literally.
simpnames[!has_comma] <- str_replace(
  simpnames[!has_comma], "^[[:alpha:]]{2,}\\. ", ""
)
simpnames
## [1] "Moe Szyslak" "C. Montgomery Burns" "Timothy Lovejoy"
## [4] "Ned Flanders" "Homer Simpson" "Julius Hibbert"
2- Construct a logical vector indicating whether a character has a title (i.e., Rev. or Dr.).
# TRUE where the originally extracted name begins with a title: at least two
# letters immediately followed by a period and a space.
str_detect(string = simpnames_orig, pattern = "^[[:alpha:]]{2,}+\\. ")
## [1] FALSE FALSE TRUE FALSE FALSE TRUE
3- Construct a logical vector indicating whether a character has a second name.
# In this data a second name shows up only as an initial: a single letter that
# starts a word (word boundary before it) and is followed by a period.
# Titles such as "Rev." and "Dr." have two or more letters before the period,
# so they are not matched. Using one boundary-based pattern avoids depending
# on a space being present next to the initial.
second_name_pattern <- "\\b[[:alpha:]]\\."
str_detect(simpnames_orig, second_name_pattern)
## [1] FALSE TRUE FALSE FALSE FALSE FALSE
str_detect(simpnames, second_name_pattern)
## [1] FALSE TRUE FALSE FALSE FALSE FALSE
4- Describe the types of strings that conform to the following regular expressions and construct an example that is matched by each regular expression.
(a)[0-9]+\$ At least one digit followed by the dollar sign:
# Sample text: a bare "$" (no digits before it) plus several amounts written
# with the dollar sign after the number.
x <- "$ dollar sign 5$ 1000000$ 1,000$"
# Only digits directly in front of "$" are captured, so the comma in "1,000$"
# cuts that match down to "000$".
str_extract_all(string = x, pattern = "[0-9]+\\$")[[1]]
## [1] "5$" "1000000$" "000$"
(b)\b[a-z]{1,4}\b A word with a sequence of at least 1 and at most 4 lowercase letters will be matched.
# Sample text mixing a capitalised word, words split by punctuation, a word
# longer than four letters, and a word with trailing digits.
w <- "France, fra.nce, one, two, three, fo.ur, fra,nce98, a test"
# Keep every stand-alone run of one to four lowercase letters; "\\b" requires
# a word boundary on both sides of the run.
str_extract_all(string = w, pattern = "\\b[a-z]{1,4}\\b")[[1]]
## [1] "fra" "nce" "one" "two" "fo" "ur" "fra" "a" "test"
(c) .*?\.txt$ Strings ending with .txt and any characters that come before that in the string, if there is any, will be matched.
# Candidate strings: three that end in ".txt" and one that has a trailing "."
# after ".txt", which therefore must not match.
f <- c("test.txt", "test test test$ .txt", " .txt", "test.txt.")
# Use the lazy quantifier ".*?" exactly as written in the exercise. Because the
# pattern is anchored with "$", each match still runs from the start of the
# string to the final ".txt".
unlist(str_extract_all(f, ".*?\\.txt$"))
## [1] "test.txt" "test test test$ .txt" " .txt"
(d) \d{2}/\d{2}/\d{4} digits in the following format will be matched: xx/xx/xxxx
# Sample dates: only entries with two digits, "/", two digits, "/", four
# digits should match; hyphenated or single-digit forms should not.
d <- "33/22/1111, 10-11-1991, 7/4/1776, 07/04/1776"
# The pattern checks digit counts only, so the impossible date "33/22/1111"
# is matched as well.
str_extract_all(string = d, pattern = "\\d{2}/\\d{2}/\\d{4}")[[1]]
## [1] "33/22/1111" "07/04/1776"
(e) <(.+?)>.+?</\1> An opening tag with at least one character between the angle brackets, followed by at least one character of content, and then a closing tag "</…>" whose name is identical to that of the opening tag (the backreference \1).
# Sample markup: "<test>m</t>" has a closing tag that differs from its opening
# tag, while "<html>test</html>" is a properly paired element.
# NOTE(review): the variable name `t` is also the name of base R's transpose
# function; kept as-is because later code may refer to it.
t <- "test test<test>m</t> <html>test</html>"
# "\\1" forces the closing tag to repeat the text captured from the opening
# tag, so only the properly paired element is returned.
str_extract_all(string = t, pattern = "<(.+?)>.+?</\\1>")[[1]]
## [1] "<html>test</html>"