Copy the introductory example. The vector `name` stores the extracted names.
R> name
[1] “Moe Szyslak” “Burns, C. Montgomery” “Rev. Timothy Lovejoy”
[4] “Ned Flanders” “Simpson, Homer” “Dr. Julius Hibbert”
(a) Use the tools of this chapter to rearrange the vector so that all elements conform to the standard first_name last_name.
library(stringr)
# Character names in three layouts: "First Last", "Last, First" and
# "Title First Last".
names <- c(
  "Moe Szyslak", "Burns, C. Montgomery", "Rev. Timothy Lovejoy",
  "Ned Flanders", "Simpson, Homer", "Dr. Julius Hibbert"
)
names
## [1] "Moe Szyslak" "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders" "Simpson, Homer" "Dr. Julius Hibbert"
# First name: either a word followed by whitespace ("Moe ", "Timothy ") or the
# final word of the string when it is preceded by ". " or ", "
# (". Montgomery", ", Homer").
# str_extract() returns exactly one value (or NA) per element, so the result
# stays aligned with `names`; unlist(str_extract_all()) changes length as soon
# as an element has zero or several matches, which would pair first and last
# names of different people.
first_name <- str_extract(names, "[[:alpha:]]+\\s|[.,] [[:alpha:]]+$")
# Keep only the letters, dropping the punctuation and whitespace around them.
first_name <- str_extract(first_name, "[[:alpha:]]+")
first_name
## [1] "Moe" "Montgomery" "Timothy" "Ned" "Homer"
## [6] "Julius"
# Last name: the word directly before a comma ("Burns,") or, when there is no
# such word, the final word of the string.
last_name <- str_extract(names, "([[:alpha:]]+?[,]|[[:alpha:]]+$)")
# Strip the trailing comma that the first alternative leaves behind.
last_name <- str_extract(last_name, "[^,]+")
last_name
## [1] "Szyslak" "Burns" "Lovejoy" "Flanders" "Simpson" "Hibbert"
# Re-assemble every element as "first_name last_name".
firstlast <- str_c(first_name, " ", last_name)
firstlast
## [1] "Moe Szyslak" "Montgomery Burns" "Timothy Lovejoy"
## [4] "Ned Flanders" "Homer Simpson" "Julius Hibbert"
(b) Construct a logical vector indicating whether a character has a title (i.e., Rev. and Dr.).
# TRUE when a name starts with an abbreviated title such as "Rev." or "Dr.":
# two or more letters at the start of the string, followed by a literal period.
# The period is escaped as \\. instead of being written as ['.'], because
# inside a character class the quotes are literal and would also match an
# apostrophe (e.g. "O'Brien"). Requiring at least two letters keeps a leading
# initial such as "C." from being counted as a title.
title <- str_detect(names, "^[[:alpha:]]{2,}\\.")
title
## [1] FALSE FALSE TRUE FALSE FALSE TRUE
(c) Construct a logical vector indicating whether a character has a second name.
# TRUE when a name contains a second name given as an initial: a single
# upper-case letter standing on its own (word boundary before it) followed by
# a literal period, e.g. the "C." in "Burns, C. Montgomery".
# The period is escaped as \\. instead of being written as ['.'], because
# inside a character class the quotes are literal and would also match an
# apostrophe (e.g. "O'Brien").
second_name <- str_detect(names, "\\b[[:upper:]]\\.")
second_name
## [1] FALSE TRUE FALSE FALSE FALSE FALSE
Describe the types of strings that conform to the following regular expressions and construct an example that is matched by the regular expression.
(a) [0-9]+\$
# Pattern: one or more digits immediately followed by a literal dollar sign.
s4a <- c(
  "56424895314$",
  "534565645656",
  "dsdf542$65201",
  "a442367",
  "b542346y",
  "abc123$0$ qw567$$"
)
unlist(str_extract_all(string = s4a, pattern = "[0-9]+\\$"))
## [1] "56424895314$" "542$" "123$" "0$"
## [5] "567$"
(b) \b[a-z]{1,4}\b
# Pattern: a standalone word of one to four lower-case letters. The \b anchors
# require a word boundary on both sides, so letters glued to digits or to
# upper-case letters are not matched.
s4b <- c(
  "ahdb32ja", "a", "ab", "abc", "aBc",
  "abcd", "123abcd", "ab89@0", "371$12"
)
unlist(str_extract_all(string = s4b, pattern = "\\b[a-z]{1,4}\\b"))
## [1] "a" "ab" "abc" "abcd"
(c) .*?\.txt$
# Pattern: any run of characters (newlines excluded, possibly empty) followed
# by the literal text ".txt" at the very end of the string.
s4c <- c(
  "filename.txt", "file name.txt", "284@$.!.txt", "284@$.!.",
  "2file name.txt", ".txt", "abc", "2.txt"
)
unlist(str_extract_all(string = s4c, pattern = ".*?\\.txt$"))
## [1] "filename.txt" "file name.txt" "284@$.!.txt" "2file name.txt"
## [5] ".txt" "2.txt"
(d) \d{2}/\d{2}/\d{4}
# Pattern: a date-like token of the form xx/xx/xxxx, where every x is a digit:
# two digits, a slash, two digits, a slash, four digits. The digits are not
# range-checked.
# The last test string uses backslashes as separators and must spell them as
# "\\": an unescaped \22 or \111 inside an R string literal is read as an
# octal character escape, which silently turned the intended value into the
# five-character string "33\x12I1".
s4d <- c("42", "33/35/99", "01/18/2199", "33\\22\\1111")
unlist(str_extract_all(s4d, "\\d{2}/\\d{2}/\\d{4}"))
## [1] "01/18/2199"
(e) <(.+?)>.+?</\1>
# Pattern: an opening tag <s1>, at least one character of content, and a
# closing tag </s1> whose name repeats the opening one (backreference \1).
# s1 is one or more arbitrary characters, matched lazily. Strings of this
# shape typically are HTML/XML-style tags.
s5e <- c(
  "abcd",
  "4sdf asdf",
  "2",
  " ",
  "abc xyz /abc",
  "<html> Hello World 1 </html>",
  "<a> b </c>",
  "<a> b </a>",
  "<b> </b>",
  "<abc 123> b@2 </abc 123>"
)
str_extract_all(string = s5e, pattern = "<(.+?)>.+?</\\1>")
## [[1]]
## character(0)
##
## [[2]]
## character(0)
##
## [[3]]
## character(0)
##
## [[4]]
## character(0)
##
## [[5]]
## character(0)
##
## [[6]]
## [1] "<html> Hello World 1 </html>"
##
## [[7]]
## character(0)
##
## [[8]]
## [1] "<a> b </a>"
##
## [[9]]
## [1] "<b> </b>"
##
## [[10]]
## [1] "<abc 123> b@2 </abc 123>"
The following code hides a secret message. Crack it with R and regular expressions. Hint: Some of the characters are more revealing than others! The code snippet is also available in the materials at www.r-datacollection.com.
clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0Tanwo Uwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigO d6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5 fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr
# Cipher text from the exercise, with the line-break spaces removed. The "o"
# that had been dropped from "...jatOaootj55..." is restored so the literal
# matches the exercise's string; only lower-case filler is affected, so the
# decoded message is unchanged.
unknowntext <- "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0TanwoUwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigOd6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr"
# Keep only the revealing characters: upper-case letters and punctuation.
# This drops the lower-case letters and the digits.
knowntext <- unlist(str_extract_all(unknowntext, "[[:upper:]]|[[:punct:]]"))
knowntext
## [1] "C" "O" "N" "G" "R" "A" "T" "U" "L" "A" "T" "I" "O" "N" "S" "." "Y"
## [18] "O" "U" "." "A" "R" "E" "." "A" "." "S" "U" "P" "E" "R" "N" "E" "R"
## [35] "D" "!"
# Combine the single characters into one string.
knowntext <- str_c(knowntext, collapse = "")
knowntext
## [1] "CONGRATULATIONS.YOU.ARE.A.SUPERNERD!"
# Split the string on the "." characters so that each word is its own element.
knowntext <- unlist(str_split(knowntext, "\\."))
knowntext
## [1] "CONGRATULATIONS" "YOU" "ARE" "A"
## [5] "SUPERNERD!"