# stringr supplies the str_* regex helpers used in this script.
library(stringr)
# Phone-book text in which names and phone numbers run together.
raw.data <- "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"
# A name is any run of at least two letters, periods, commas or spaces.
# raw.data is a single string, so the first (only) list element holds them all.
name <- str_extract_all(raw.data, "[[:alpha:]., ]{2,}")[[1]]
name
## [1] "Moe Szyslak" "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders" "Simpson, Homer" "Dr. Julius Hibbert"
(a) Use the tools of this chapter to rearrange the vector so that all elements conform to the standard first_name last_name.
Extract first names:
# First names, part 1: entries written as "First Last".
# A first name is a run of two or more letters followed by a space. Titles
# ("Rev.", "Dr.") end in a period and surnames in "Last, First" entries end in
# a comma, so neither is picked up. The space that is part of the match is
# trimmed off so it does not leak into later output.
firstName <- str_trim(unlist(str_extract_all(name, "[[:alpha:]]{2,} ")))
firstName
## [1] "Moe" "Timothy" "Ned" "Julius"
# First names, part 2: entries written as "Last, First".
# Take the comma, the space and the run of letters/periods that follows.
first <- unlist(str_extract_all(name, ", [[:alpha:].]+"))
first
## [1] ", C." ", Homer"
# Drop the leading ", " so only the name itself is left.
FirstName <- sub("^, ", "", first)
FirstName
## [1] "C." "Homer"
# Combine both groups. The result is ordered by name format ("First Last"
# entries first, then "Last, First" entries), not by position in `name`.
firstNames <- c(firstName, FirstName)
firstNames
## [1] "Moe" "Timothy" "Ned" "Julius" "C." "Homer"
Extract last names:
# Last names, part 1: entries written as "First Last".
# "\\b " only matches a space that directly follows a word character, so the
# spaces after "," or "." (as in "Burns, C." or "Rev. ") are skipped and the
# match starts at the space between first and last name.
lastName <- unlist(str_extract_all(name, "\\b .+\\b"))
lastName
## [1] " Szyslak" " Lovejoy" " Flanders" " Hibbert"
# Remove the leading space; an empty replacement simply deletes the match.
lastName1 <- sub("^ ", "", lastName)
lastName1
## [1] "Szyslak" "Lovejoy" "Flanders" "Hibbert"
# Last names, part 2: entries written as "Last, First".
# The surname is the run of two or more letters directly before the comma.
last <- unlist(str_extract_all(name, "[[:alpha:]]{2,},"))
last
## [1] "Burns," "Simpson,"
# Delete the comma that was matched along with the surname.
LastName <- sub(",", "", last, fixed = TRUE)
LastName
## [1] "Burns" "Simpson"
# Combine both groups: "First Last" entries first, then "Last, First" entries.
lastNames <- c(lastName1, LastName)
lastNames
## [1] "Szyslak" "Lovejoy" "Flanders" "Hibbert" "Burns" "Simpson"
Create a data frame with first names and last names:
# Put first and last names side by side. check.names = TRUE (the default,
# spelled out here) turns the headers into syntactic names, which is why the
# rendered table shows First.Name / Last.Name.
df <- data.frame(
  "First Name" = firstNames,
  "Last Name" = lastNames,
  check.names = TRUE
)
knitr::kable(df)
| First.Name | Last.Name |
|---|---|
| Moe | Szyslak |
| Timothy | Lovejoy |
| Ned | Flanders |
| Julius | Hibbert |
| C. | Burns |
| Homer | Simpson |
Create a vector of first and last names:
# Join each first name to its last name with a single separating space.
namesVector <- paste(firstNames, lastNames, sep = " ")
namesVector
## [1] "Moe Szyslak" "Timothy Lovejoy" "Ned Flanders"
## [4] "Julius Hibbert" "C. Burns" "Homer Simpson"
(b) Construct a logical vector indicating whether a character has a title (i.e., Rev. and Dr.).
# A title is two or more letters directly followed by a period ("Rev.",
# "Dr."); a one-letter initial such as "C." is too short to qualify.
# [[:alpha:]] is used rather than an A-z range, which would also admit the
# punctuation characters that sit between "Z" and "a".
title_pattern <- "[[:alpha:]]{2,}\\."
# Show which title, if any, each name carries (one list element per name).
prefix <- str_extract_all(name, title_pattern)
prefix
## [[1]]
## character(0)
##
## [[2]]
## character(0)
##
## [[3]]
## [1] "Rev."
##
## [[4]]
## character(0)
##
## [[5]]
## character(0)
##
## [[6]]
## [1] "Dr."
# Logical vector: TRUE where the name contains a title.
title <- str_detect(name, title_pattern)
title
## [1] FALSE FALSE TRUE FALSE FALSE TRUE
(c) Construct a logical vector indicating whether a character has a second name.
# Pattern for a second name: comma, whitespace, one word character, period,
# whitespace, then two or more letters -- e.g. ", C. Montgomery".
# NOTE(review): only this "Last, X. Second" layout is recognised.
second_name_pattern <- "\\,\\s\\w\\.\\s[[:alpha:]]{2,}"
# Show the matched text per name (one list element per name).
middleName <- str_extract_all(string = name, pattern = second_name_pattern)
middleName
## [[1]]
## character(0)
##
## [[2]]
## [1] ", C. Montgomery"
##
## [[3]]
## character(0)
##
## [[4]]
## character(0)
##
## [[5]]
## character(0)
##
## [[6]]
## character(0)
# Logical vector: TRUE where the name contains a second name.
secondName <- str_detect(string = name, pattern = second_name_pattern)
secondName
## [1] FALSE TRUE FALSE FALSE FALSE FALSE
**(a) [0-9]+\\$**
This expression matches one or more digits immediately followed by a literal dollar sign ($).
# Test string: only the digit runs that end in "$" should be extracted.
example1 <- "35iu68903$64678$t8bhj$246"
str_extract_all(string = example1, pattern = "[0-9]+\\$")
## [[1]]
## [1] "68903$" "64678$"
**(b) \\b[a-z]{1,4}\\b**
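This expression matches whole words made up of one to four lowercase letters; the word boundaries (\\b) on both sides mean that letters attached to digits, as in "kl123", do not match.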
# Test strings: four short lowercase words and one with digits attached.
example2 <- c("abc", "de", "f", "ghij", "kl123")
str_extract_all(string = example2, pattern = "\\b[a-z]{1,4}\\b")
## [[1]]
## [1] "abc"
##
## [[2]]
## [1] "de"
##
## [[3]]
## [1] "f"
##
## [[4]]
## [1] "ghij"
##
## [[5]]
## character(0)
**(c) .*?\\.txt$**
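This expression matches strings that end in ".txt"; the $ anchors the match to the end of the string, so a different extension or a bare "txt" without the period does not match.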
# Test strings: one .txt file name, one other extension, and a bare "txt".
example3 <- c("abc.txt", "abc.doc", "txt")
str_extract_all(string = example3, pattern = ".*?\\.txt$")
## [[1]]
## [1] "abc.txt"
##
## [[2]]
## character(0)
##
## [[3]]
## character(0)
**(d) \\d{2}/\\d{2}/\\d{4}**
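This expression matches dates written as two digits, a forward slash, two digits, a forward slash, and four digits (for example 02/11/2019). It checks only the layout, not whether the digits form a valid date.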
# Test strings for the date pattern. They get their own variable so the
# vector used for the ".txt" example is not overwritten.
example_dates <- c("02/11/2019", "2/11/19", "04/02/2018", "1789/12", "12387689")
str_extract_all(example_dates, "\\d{2}/\\d{2}/\\d{4}")
## [[1]]
## [1] "02/11/2019"
##
## [[2]]
## character(0)
##
## [[3]]
## [1] "04/02/2018"
##
## [[4]]
## character(0)
##
## [[5]]
## character(0)
**(e) <(.+?)>.+?</\\1>**
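This expression matches an HTML-style element: an opening tag, at least one character of content, and a closing tag whose name repeats the opening tag's name (enforced by the backreference \\1).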
# Test strings: a complete element, plain text, and a lone opening tag.
example4 <- c("<p>Hello world</p>", "Hello", "<Hi>")
str_extract_all(string = example4, pattern = "<(.+?)>.+?</\\1>")
## [[1]]
## [1] "<p>Hello world</p>"
##
## [[2]]
## character(0)
##
## [[3]]
## character(0)
clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0TanwoUwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigOd6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr
# Scrambled text to be decoded in the steps below.
message <- "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0TanwoUwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigOd6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr"
# Keep every character that is not a digit. The input is a single string, so
# the first (only) list element is the vector of single characters.
message <- str_extract_all(message, "[^[:digit:]]")[[1]]
message
## [1] "c" "l" "c" "o" "p" "C" "o" "w" "z" "m" "s" "t" "c" "d" "w" "n" "k"
## [18] "i" "g" "O" "v" "d" "i" "c" "p" "N" "u" "g" "g" "v" "h" "r" "y" "n"
## [35] "G" "j" "u" "w" "c" "z" "i" "h" "q" "r" "f" "p" "R" "x" "s" "A" "j"
## [52] "d" "w" "p" "n" "T" "a" "n" "w" "o" "U" "w" "i" "s" "d" "i" "j" "L"
## [69] "j" "k" "p" "f" "A" "T" "I" "d" "r" "c" "o" "c" "b" "t" "y" "c" "z"
## [86] "j" "a" "t" "O" "a" "o" "o" "t" "j" "t" "N" "j" "n" "e" "c" "S" "f"
## [103] "e" "k" "." "r" "w" "Y" "w" "w" "o" "j" "i" "g" "O" "d" "v" "r" "f"
## [120] "U" "r" "b" "z" "." "b" "k" "A" "n" "b" "h" "z" "g" "v" "R" "i" "z"
## [137] "E" "c" "r" "o" "p" "." "w" "A" "g" "n" "b" "." "S" "q" "o" "U" "f"
## [154] "P" "a" "o" "t" "f" "b" "w" "E" "m" "k" "t" "s" "R" "z" "q" "e" "f"
## [171] "y" "n" "N" "d" "t" "k" "c" "f" "E" "g" "m" "c" "R" "g" "x" "o" "n"
## [188] "h" "D" "k" "!" "g" "r"
# Drop the lowercase letters: every element of `message` is one character,
# so keeping the elements that match "not lowercase" leaves only uppercase
# letters and punctuation.
message1 <- message[str_detect(message, "[^[:lower:]]")]
message1
## [1] "C" "O" "N" "G" "R" "A" "T" "U" "L" "A" "T" "I" "O" "N" "S" "." "Y"
## [18] "O" "U" "." "A" "R" "E" "." "A" "." "S" "U" "P" "E" "R" "N" "E" "R"
## [35] "D" "!"
# Glue the single characters back together into one string.
message2 <- paste0(message1, collapse = "")
message2
## [1] "CONGRATULATIONS.YOU.ARE.A.SUPERNERD!"
# Turn the periods that separate the words into spaces. str_replace_all()
# already returns a character vector, so no unlist() is needed.
message3 <- str_replace_all(message2, "\\.", " ")
message3
## [1] "CONGRATULATIONS YOU ARE A SUPERNERD!"