library(stringr)
## Warning: package 'stringr' was built under R version 3.5.2
# Raw text in which names and phone numbers are run together without
# separators.
raw.data <- "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"
# A name is any run of two or more letters, periods, commas, or spaces.
# The input is a single string, so the first list element holds every match.
name <- str_extract_all(raw.data, pattern = "[[:alpha:]., ]{2,}")[[1]]
name
## [1] "Moe Szyslak" "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders" "Simpson, Homer" "Dr. Julius Hibbert"
# A phone number is an optional three-digit area code (optionally wrapped in
# parentheses), an optional dash or space, three digits, another optional
# dash or space, and four digits.
phone <- str_extract_all(
  raw.data,
  pattern = "\\(?(\\d{3})?\\)?(-| )?\\d{3}(-| )?\\d{4}"
)[[1]]
phone
## [1] "555-1239" "(636) 555-0113" "555-6542" "555 8904"
## [5] "636-555-3226" "5553642"
Copy the introductory example.
name
## [1] "Moe Szyslak" "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders" "Simpson, Homer" "Dr. Julius Hibbert"
# Pick out the names written as "Lastname, Firstname": a letter directly
# followed by a comma marks that layout.
last_first <- str_subset(name, pattern = "[[:alpha:]],")
last_first
## [1] "Burns, C. Montgomery" "Simpson, Homer"
# Capture the text on each side of the comma and emit the two groups in
# reverse order through backreferences.
first_last <- str_replace(
  last_first,
  pattern = "([^,]+),\\s([^,]+)",
  replacement = "\\2 \\1"
)
first_last
## [1] "C. Montgomery Burns" "Homer Simpson"
# Write the reordered names back into the positions they came from.
name <- replace(name, str_detect(name, pattern = "[[:alpha:]],"), first_last)
# Flag names that carry a title: two or three letters immediately followed by
# a literal period (e.g. "Rev.", "Dr.").
title <- str_detect(string = name, pattern = "[[:alpha:]]{2,3}[.]")
title
## [1] FALSE FALSE TRUE FALSE FALSE TRUE
name[title]
## [1] "Rev. Timothy Lovejoy" "Dr. Julius Hibbert"
# Assumption: "second name" means either a middle name or a last name.
# Strip a leading title (two or three letters, a period, and a space) once and
# reuse the result for both checks below. The pattern is anchored to the start
# of the string and its class admits letters only, in line with the title
# detector "[[:alpha:]]{2,3}[.]". Periods are deliberately excluded from the
# class so that dotted initials such as "A.J. " are not stripped as if they
# were a title.
name_no_title <- str_replace(name, "^[[:alpha:]]{2,3}\\. ", "")
# Any remaining whitespace means at least two words, hence at least two names.
second_name <- str_detect(name_no_title, "\\s")
second_name
## [1] TRUE TRUE TRUE TRUE TRUE TRUE
name[second_name]
## [1] "Moe Szyslak" "C. Montgomery Burns" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders" "Homer Simpson" "Dr. Julius Hibbert"
# If "second name" means a middle name instead, require two whitespace
# characters, i.e. at least three words once the title is removed.
middle_initial <- str_detect(name_no_title, "\\s.*\\s")
middle_initial
## [1] FALSE TRUE FALSE FALSE FALSE FALSE
name[middle_initial]
## [1] "C. Montgomery Burns"
Describe the types of strings that conform to the following regular expressions and construct an example that is matched by the regular expression.
NOTE: I’ve included examples that return false in order to better exemplify the limits of the pattern.
[0-9]+\$
1 or more digits followed by a dollar sign.
# Only strings containing at least one digit directly followed by a literal
# "$" match; a leading "$" or a missing "$" does not.
answer_4a <- c(
  "$1000",
  "1000",
  "100$",
  "1000$"
)
str_detect(string = answer_4a, pattern = "[0-9]+\\$")
## [1] FALSE FALSE TRUE TRUE
\b[a-z]{1,4}\b
A word boundary followed by 1 to 4 lowercase letters followed by another word boundary, i.e. a standalone word of one to four lowercase letters.
# Every standalone all-lowercase word of one to four letters is extracted;
# capitalised or longer words are skipped.
answer_4b <- "This is a full sentence with several one to four letter words"
str_extract_all(string = answer_4b, pattern = "\\b[a-z]{1,4}\\b")
## [[1]]
## [1] "is" "a" "full" "with" "one" "to" "four"
.*?\.txt$
Zero or more of any character (matched lazily, i.e. as few as possible), followed by a literal period and ‘txt’ at the very end of the string.
# The pattern is anchored at the end, so ".txt" must be the final four
# characters; a ".txt" in the middle of a string does not match.
answer_4c <- c(
  "filename.txt",
  ".txt",
  "This sentence doesn't end with a .txt extension"
)
str_detect(string = answer_4c, pattern = ".*?\\.txt$")
## [1] TRUE TRUE FALSE
\d{2}/\d{2}/\d{4}
2 digits followed by a forward slash followed by 2 digits followed by a forward slash followed by 4 digits
A common date notation.
# The pattern checks digit counts only, so an impossible date such as
# "99/99/9999" still matches while a single-digit day or month does not.
answer_4d <- c(
  "04/04/2019",
  "12/23/2011",
  "99/99/9999",
  "1/2/3432"
)
str_detect(string = answer_4d, pattern = "\\d{2}/\\d{2}/\\d{4}")
## [1] TRUE TRUE TRUE FALSE
<(.+?)>.+?</\1>
A less-than sign followed by one or more characters (matched lazily), a greater-than sign, one or more further characters (matched lazily), then a less-than sign, a forward slash, the text captured by the first group, and a final greater-than sign.
I.e.: This will extract HTML tags together with their contents.
# The backreference \1 forces the closing tag to repeat the opening tag's
# name, so each match is one complete element with its contents.
str_extract_all(
  string = "This is HTML markup code, with <strong>bolded text</strong> as well as <em>italics</em>",
  pattern = "<(.+?)>.+?</\\1>"
)
## [[1]]
## [1] "<strong>bolded text</strong>" "<em>italics</em>"
9. The following code hides a secret message. Crack it with R and regular expressions. Hint: Some of the characters are more revealing than others! The code snippet is also available in the materials at www.r-datacollection.com.
# Scrambled text from the exercise.
secretcode <- "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0TanwoUwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigOd6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr"
# Deleting every lowercase letter and every digit leaves only the uppercase
# letters and punctuation, which spell out the message.
str_remove_all(secretcode, pattern = "[[:lower:]]|[[:digit:]]")
## [1] "CONGRATULATIONS.YOU.ARE.A.SUPERNERD!"