library(stringr)
# Unstructured phone-book text: names and phone numbers run together with no
# delimiter. Built from one piece per name/number pair to keep lines short.
raw.data <- paste0(
  "555-1239Moe Szyslak",
  "(636) 555-0113Burns, C. Montgomery",
  "555-6542Rev. Timothy Lovejoy",
  "555 8904Ned Flanders",
  "636-555-3226Simpson, Homer",
  "5553642Dr. Julius Hibbert"
)
3. Copy the introductory example. The vector name stores the extracted names.
# Extract every run of two or more letters, periods, commas, or spaces.
# raw.data is a single string, so the first (only) list element holds all hits.
name <- str_extract_all(
  string = raw.data,
  pattern = "[[:alpha:]., ]{2,}"
)[[1]]
3.1 Use the tools of this chapter to rearrange the vector so that all of the elements conform to the standard first_name last_name.
#' Convert names to the standard "first_name last_name" form.
#'
#' Removes a leading two- or three-letter title ending in a period (e.g.
#' "Rev.", "Dr."), drops single-letter initials that follow whitespace (e.g.
#' the "C." in "Burns, C. Montgomery"), and turns "last_name, first_name"
#' into "first_name last_name". Names are split on structure (title, comma)
#' rather than on word length, so short names such as "Ned" or "Li" survive.
#'
#' @param x A character vector of names; each element is handled independently.
#' @return A character vector the same length as `x`.
first_last <- function(x) {
  cleaned <- str_trim(x)

  # Strip a leading title: two or three letters followed by a period.
  cleaned <- str_replace(cleaned, "^[[:alpha:]]{2,3}\\.\\s*", "")

  # Drop single-letter initials ("C.") that come after whitespace, so a
  # leading initial such as "J. Smith" is left alone.
  cleaned <- str_replace_all(cleaned, "\\s+[[:alpha:]]\\.(?=\\s|$)", "")

  # "Last, First" -> "First Last"; entries without a comma are not matched
  # and pass through unchanged.
  cleaned <- str_replace(cleaned, "^([^,]*),\\s*(.*)$", "\\2 \\1")

  # Collapse any whitespace left behind by the removals above.
  str_trim(str_replace_all(cleaned, "\\s+", " "))
}
# Standardise each extracted name. vapply() pins the result to one string per
# name, so the result is always a plain, unnamed character vector -- even when
# `name` is empty, where sapply() would return a list instead.
standardNameFormat <- vapply(name, first_last, character(1), USE.NAMES = FALSE)
standardNameFormat
## [1] "Moe Szyslak" "Montgomery Burns" "Timothy Lovejoy"
## [4] "Ned Flanders" "Homer Simpson" "Julius Hibbert"
3.2 Construct a logical vector indicating whether a character has a title (i.e., Rev. or Dr.).
# TRUE where an entry opens with two or three letters followed by a period.
hasTitle <- str_detect(string = name, pattern = "^[:alpha:]{2,3}\\.")
print(name)
## [1] "Moe Szyslak" "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders" "Simpson, Homer" "Dr. Julius Hibbert"
print(hasTitle)
## [1] FALSE FALSE TRUE FALSE FALSE TRUE
3.3 Construct a logical vector indicating whether a character has a second name.
# TRUE where an entry contains whitespace, a single letter and a period,
# whitespace again, and then at least one more letter (e.g. " C. Montgomery").
hasSecondName <- str_detect(
  string = name,
  pattern = "[:space:][:alpha:]{1}\\.[:space:][:alpha:]+"
)
print(name)
## [1] "Moe Szyslak" "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders" "Simpson, Homer" "Dr. Julius Hibbert"
print(hasSecondName)
## [1] FALSE TRUE FALSE FALSE FALSE FALSE
4.1 [0-9]+\$
One or more digits followed by a literal dollar sign (the backslash escapes `$`, which would otherwise anchor the end of the string).
# Digits followed by a literal "$" (backslash doubled inside the R string).
str_detect(string = "1234$", pattern = "[0-9]+\\$")
## [1] TRUE
4.2 \b[a-z]{1,4}
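A word boundary followed by one to four lowercase letters, i.e. the first one to four lowercase letters of a word. `\b` marks the edge of a word, not the beginning of a line, so the match may occur anywhere in the string.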
# Words of one to four lowercase letters each satisfy the pattern.
str_detect(string = "a", pattern = "\\b[a-z]{1,4}")
## [1] TRUE
str_detect(string = "ab", pattern = "\\b[a-z]{1,4}")
## [1] TRUE
str_detect(string = "abc", pattern = "\\b[a-z]{1,4}")
## [1] TRUE
str_detect(string = "abcd", pattern = "\\b[a-z]{1,4}")
## [1] TRUE
4.3 .*?\.txt$
Any sequence of characters (possibly empty, matched lazily) followed by a literal “.txt” at the end of the string, e.g. a file name with the extension .txt.
# Both strings end in a literal ".txt", with or without a space before it.
str_detect(string = "abc .txt", pattern = ".*?\\.txt$")
## [1] TRUE
str_detect(string = "abc.txt", pattern = ".*?\\.txt$")
## [1] TRUE
4.4 \d{2}/\d{2}/\d{4}
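Two digits, a slash, two digits, a slash, and four digits, e.g. a date written as mm/dd/yyyy. The pattern checks only the shape, not whether the month and day are valid.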
# A date-shaped string: 2 digits / 2 digits / 4 digits.
str_detect(string = "01/01/2016", pattern = "\\d{2}/\\d{2}/\\d{4}")
## [1] TRUE
4.5 <(.+?)>.+</\1>
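This matches an opening HTML/XML tag, at least one character of content, and a closing tag whose name is identical to the opening tag's (enforced by the backreference \1). Because `.+` requires at least one character, an empty element such as <foo></foo> does not match.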
# The closing tag must repeat the captured opening tag name (\1).
str_detect(string = "<foo>bar</foo>", pattern = "<(.+?)>.+</\\1>")
## [1] TRUE