#rm(list=ls()) #clear everything
library(stringr)
# Raw phone-book text: phone numbers and names run together with no
# delimiters. One fragment per record; paste0() joins them without separators.
raw.data <- paste0(
  "555-1239Moe Szyslak",
  "(636) 555-0113Burns, C. Montgomery",
  "555-6542Rev. Timothy Lovejoy",
  "555 8904Ned Flanders",
  "636-555-3226Simpson, Homer",
  "5553642Dr. Julius Hibbert"
)
Load the names into a character vector.
# Every run of two or more letters, periods, commas or spaces is a name;
# digits, hyphens and parentheses (the phone numbers) break the runs.
name <- unlist(
  str_extract_all(string = raw.data, pattern = "[[:alpha:]., ]{2,}")
)
Remove titles and initials, then trim the surrounding whitespace.
# Strip titles and initials: one to three letters followed by a period
# (e.g. "Rev.", "Dr.", "C."). gsub() removes every such token, so a name that
# carries both a title and an initial loses both rather than only the first.
# str_squish() then trims the ends and collapses the double space that is left
# behind when a token is removed from the middle of a name.
name2 <- str_squish(gsub("[[:alpha:]]{1,3}\\.", "", name))
name2
## [1] "Moe Szyslak" "Burns, Montgomery" "Timothy Lovejoy"
## [4] "Ned Flanders" "Simpson, Homer" "Julius Hibbert"
Replace instances of “last name, first name” with “first name last name” by using backreferences.
# Swap "Last, First" into "First Last": group 1 captures the word before the
# comma, group 2 the first word after it, and the replacement emits them in
# reverse order. Names without a comma do not match and pass through as-is.
name3 <- sub(
  pattern = "(\\w+),\\s+(\\w+)",
  replacement = "\\2 \\1",
  x = name2
)
name3
## [1] "Moe Szyslak" "Montgomery Burns" "Timothy Lovejoy"
## [4] "Ned Flanders" "Homer Simpson" "Julius Hibbert"
# A title is "Rev" or "Dr" at the start of a word, followed by a period.
# Grouping the alternation makes the period mandatory for both titles, so a
# bare "Rev" inside a name such as "Revere" is not mistaken for a title.
title_pattern <- "\\b(Rev|Dr)\\."

# TRUE for every name that carries a title.
nameTitle <- str_detect(name, title_pattern)
nameTitle
## [1] FALSE FALSE TRUE FALSE FALSE TRUE

# Count the words that remain once the title (including its period) is removed.
namecount <- str_count(
  str_trim(str_replace(name, title_pattern, "")),
  "\\w+"
)
# Three or more remaining words means the person has a second name. The
# comparison is numeric: matching the digit "3" as text would miss counts of
# four or more and wrongly flag counts such as 13 or 30.
secndname <- namecount >= 3
secndname
## [1] FALSE TRUE FALSE FALSE FALSE FALSE
4. Describe the types of strings that conform to the following regular expressions, and construct an example that is matched by each regular expression.
A string of one or more digits followed by a literal dollar sign ($).
# One or more digits immediately followed by a literal dollar sign; the
# backslashes escape "$", which would otherwise anchor the end of the string.
pattern <- "[0-9]+\\$"
ex <- "1234$ ab$a 1$ 00012324"
str_extract_all(string = ex, pattern = pattern)
## [[1]]
## [1] "1234$" "1$"
Matches any whole word (as delimited by the word boundaries \b) written entirely in lowercase letters. The words matched are between 1 and 4 characters long.
# Whole words of one to four lowercase letters. "How" and "I" contain an
# uppercase letter and "things" is too long, so none of them is returned.
ex1 <- "How do I know the things"
str_extract_all(string = ex1, pattern = "\\b[a-z]{1,4}\\b")
## [[1]]
## [1] "do" "know" "the"
# Any characters (matched lazily) up to a literal ".txt" that ends the string.
# Matching starts at the first character, so everything up to the final
# ".txt" is returned as a single match.
ex2 <- "filename.dat something.txt"
str_extract_all(string = ex2, pattern = ".*?\\.txt$")
## [[1]]
## [1] "filename.dat something.txt"
Any value made up of two digits, a slash, two digits, a slash, and four digits, for example a date such as 12/25/1978.
# Two digits, a slash, two digits, a slash, four digits. Only the digit counts
# are checked, not calendar validity, so "99/99/1978" is matched as well.
date1 <- "12/25/1978 2017/16/12 99/99/1978"
str_extract_all(string = date1, pattern = "\\d{2}/\\d{2}/\\d{4}")
## [[1]]
## [1] "12/25/1978" "99/99/1978"
# An opening tag, some content, and the closing tag with the same name: the
# tag name is captured in group 1 and reused via the backreference \1.
# "<html>" and "<head>" have no closing tag here, so only the title matches.
ex3 <- "<html> <head> <title>CUNY - Self Service</title>"
str_extract_all(string = ex3, pattern = "<(.+?)>.+?</\\1>")
## [[1]]
## [1] "<title>CUNY - Self Service</title>"
Extract all the uppercase letters (and periods) from the message and concatenate them into a single string.
# Scrambled message: the hidden text is spelled by its uppercase letters, with
# periods acting as word separators. The literal spans several lines.
x <- "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0Tanwo
Uwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigO
d6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5
fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr"
# Pull out each uppercase letter or period, then glue them into one string.
str_c(
  unlist(str_extract_all(string = x, pattern = "[[:upper:].]")),
  collapse = ""
)
## [1] "CONGRATULATIONS.YOU.ARE.A.SUPERNERD"